From c8bffa87641cefc8911710204362b4c461113e9b Mon Sep 17 00:00:00 2001
From: LinlyBoi
Date: Tue, 28 Mar 2023 13:38:43 +0200
Subject: [PATCH] MAJOR gunner refactor (sorry mark)

---
Review note: year_interval() returns (lower, upper), i.e. the later of the
two first years followed by the earlier of the two last years, because
intersect_by_year() and trisect_by_year() read interval[0] as the lower
bound and interval[1] as the upper bound.
This copy of the patch carries no blank context lines, so the hunk line
counts below may not match the lines listed; regenerate the patch from the
commit before applying it.

 py_scripts/gunner.py | 68 ++++++++++++++------------------------------
 1 file changed, 21 insertions(+), 47 deletions(-)

diff --git a/py_scripts/gunner.py b/py_scripts/gunner.py
index 44b1773..ad454ad 100644
--- a/py_scripts/gunner.py
+++ b/py_scripts/gunner.py
@@ -7,78 +7,52 @@ import mining_hq
 # Sharing the dataset variables
 # Games' data
-global games_dat
-# Sales in NA
-global game_sales_NA
-global game_sales_NA_dur
-global game_sales_NA_pre
-global game_sales_NA_pos
-# Sales Globally
-global game_sales_GLO
-
-# Crime Data
-# Crime Recorded in The US
-global crime_US
-# Crime Recorded in Canada
-global crime_CA
 # Loading Datasets
 games_merged = mining_hq.games_merged_dat
-crime_CA = pd.read_excel('datasets/crime/clean_crime_canada_dataset.xlsx')
-crime_US = pd.read_csv('datasets/crime/report.csv')
+victim1 = pd.read_excel("datasets/crime/clean_crime_canada_dataset.xlsx")
+victim2 = pd.read_csv("datasets/crime/report.csv")
 # Printing information regarding datasets
 print("Game Datasets' Info:\n")
 games_merged.info()
 print("Crime Datasets' Info:\n")
-crime_US.info()
-crime_CA.info()
+victim2.info()
+victim1.info()
-# Printing First n values (index start: 0)
-print("Game Sale Data:\n", games_merged.head(5))
-
-print("US Crime Data:\n", crime_US.head(5))
-print("CA Crime Data:\n", crime_CA.head(5))
 # Regarding the Games.xls dataset:
 # Coercing the non-numeric values will result in NaN
 # thus allowing easier removal through `.notnull()`
-games_merged['Score'] = pd.to_numeric(games_merged['Score'], errors = 'coerce')
-games_merged = games_merged[games_merged['Score'].notnull()]
-
-print("Game Scores (Cleaned):\n", games_merged.head())
-games_merged.info()
 # Regarding the vgsales-12-4-2019 dataset
 # Considering we will be using a US (probs CA too) crime datasets
 # It wouldn't be that useful to have other columns regarding other regions
-NA_col_list = ['PAL_Sales', 'JP_Sales', 'Other_Sales', 'Global_Sales', 'User_Score', 'GameName', 'Review', '']
-GLO_col_list = ['PAL_Sales', 'JP_Sales', 'Other_Sales', 'NA_Sales', 'User_Score', 'GameName', 'Review', '']
+def drop_kick(col_list, dataframe):
+    return dataframe.drop(columns=col_list, axis=1)
-game_sales_NA = games_merged.drop(columns = NA_col_list, axis = 1)
-game_sales_GLO = games_merged.drop(columns = GLO_col_list, axis = 1)
-
-print(f"Game Sales for NA:\n{game_sales_NA.head(5)} \nWith minimum year being: {game_sales_NA['Year'].min()}")
-print(f"Game Sales Globally:\n{game_sales_GLO.head(5)}\nWith minimum year being: {game_sales_GLO['Year'].min()}")
 # Getting the range of years which both datasets share
-crime_year_min = max(crime_US['report_year'].min(), crime_CA['year'].min())
-crime_year_max = min(crime_US['report_year'].max(), crime_CA['year'].max())
+def year_interval(victim1, victim2, col1, col2):
+    return (
+        max(victim2[col2].min(), victim1[col1].min()),  # interval[0]: lower bound
+        min(victim2[col2].max(), victim1[col1].max()),  # interval[1]: upper bound
+    )
-crime_CA = crime_CA[(crime_CA['year'] >= crime_year_min) & (crime_CA['year'] <= crime_year_max)]
-crime_US = crime_US[(crime_US['report_year'] >= crime_year_min) & (crime_US['report_year'] <= crime_year_max)]
+def intersect_by_year(victim1, victim2, col1, col2):
+    interval = year_interval(victim1, victim2, col1, col2)
+    victim1 = victim1[(victim1[col1] >= interval[0]) & (victim1[col1] <= interval[1])]
+    victim2 = victim2[(victim2[col2] >= interval[0]) & (victim2[col2] <= interval[1])]
+    return (victim1, victim2)
 # Updating the NA game dataset to fit with the time ranges
-game_sales_NA_dur = game_sales_NA[(game_sales_NA['Year'] >= crime_year_min) & (game_sales_NA['Year'] <= crime_year_max)]
-
-game_sales_NA_pre = game_sales_NA[game_sales_NA['Year'] < crime_year_min]
-
-game_sales_NA_pos = game_sales_NA[game_sales_NA['Year'] > crime_year_max]
-
-print(f"Game Sales for NA:\n{game_sales_NA.head(10)}\nWith minimum year being: {game_sales_NA['Year'].min()}")
-print(f"Game Sales Globally:\n{game_sales_GLO.head(10)}\nWith minimum year being: {game_sales_GLO['Year'].min()}")
+def trisect_by_year(victim1, col, interval):
+    victim1_pre = victim1[victim1[col] < interval[0]]
+    victim1_dur = victim1[(victim1[col] <= interval[1]) & (victim1[col] >= interval[0])]
+    victim1_pos = victim1[victim1[col] > interval[1]]
+    return (victim1_pre, victim1_dur, victim1_pos)