diff --git a/py_scripts/engineer.py b/py_scripts/engineer.py index 2570c46..9016ffb 100644 --- a/py_scripts/engineer.py +++ b/py_scripts/engineer.py @@ -1,7 +1,14 @@ # Visualisations for Data +import matplotlib.pyplot as plt import seaborn as sns import gunner +from numpy import count_nonzero + +sns.set() +plt.xticks(rotation=90) games_vis = gunner.game_sales_NA -sns.relplot(data = games_vis, x = "Year", y = "NA_Sales") +games_fig = sns.barplot(data = games_vis, x = games_vis["Year"], y = games_vis["NA_Sales"], estimator = count_nonzero) + +plt.show() diff --git a/py_scripts/gunner.py b/py_scripts/gunner.py index ac02f63..fb00f12 100644 --- a/py_scripts/gunner.py +++ b/py_scripts/gunner.py @@ -11,20 +11,34 @@ global games_dat global game_sales_NA # Sales Globally global game_sales_GLO +# Crime Recorded in The US +global crime_US +# Crime Recorded in Canada +global crime_CA # Loading Datasets -game_sales2019_dat = pd.read_csv('datasets/videogames/vgsales-12-4-2019-short.csv') +game_sales_dat = pd.read_csv('datasets/videogames/vgsales-12-4-2019-short.csv') games_dat = pd.read_csv('datasets/videogames/Games.xls') +crime_CA = pd.read_excel('datasets/crime/clean_crime_canada_dataset.xlsx') +crime_US = pd.read_csv('datasets/crime/report.csv') + # Printing information regarding datasets -print("Data Sets' Info:\n") -game_sales2019_dat.info() +print("Game Datasets' Info:\n") +game_sales_dat.info() games_dat.info() +print("Crime Datasets' Info:\n") +crime_US.info() +crime_CA.info() + # Printing First n values (index start: 0) -print("Game Sale Data:\n", game_sales2019_dat.head(10)) +print("Game Sale Data:\n", game_sales_dat.head(10)) print("Game Scores:\n", games_dat.head(10)) +print("US Crime Data:\n", crime_US.head(10)) +print("CA Crime Data:\n", crime_CA.head(10)) + # Regarding the Games.xls dataset: # Coercing the non-numeric values will result in NaN # thus allowing easier removal through `.notnull()` @@ -41,8 +55,15 @@ games_dat.info() NA_col_list = ['PAL_Sales', 'JP_Sales', 'Other_Sales', 'Global_Sales'] GLO_col_list = ['PAL_Sales', 'JP_Sales', 'Other_Sales', 'NA_Sales'] -game_sales_NA = game_sales2019_dat.drop(columns = NA_col_list, axis = 1) -game_sales_GLO = game_sales2019_dat.drop(columns = GLO_col_list, axis = 1) +game_sales_NA = game_sales_dat.drop(columns = NA_col_list, axis = 1) +game_sales_GLO = game_sales_dat.drop(columns = GLO_col_list, axis = 1) -print("Game Sales for NA:\n", game_sales_NA.head(10)) -print("Game Sales Globally:\n", game_sales_GLO.head(10)) +print(f"Game Sales for NA:\n{game_sales_NA.head(10)} \nWith minimum year being: {game_sales_NA['Year'].min()}") +print(f"Game Sales Globally:\n{game_sales_GLO.head(10)}\nWith minimum year being: {game_sales_GLO['Year'].min()}") + +# Getting the range of years which both datasets share +crime_year_min = max(crime_US['report_year'].min(), crime_CA['year'].min()) +crime_year_max = min(crime_US['report_year'].max(), crime_CA['year'].max()) + +crime_CA = crime_CA[(crime_CA['year'] >= crime_year_min) & (crime_CA['year'] <= crime_year_max)] +crime_US = crime_US[(crime_US['report_year'] >= crime_year_min) & (crime_US['report_year'] <= crime_year_max)]