diff --git a/py_scripts/engineer.py b/py_scripts/engineer.py index ab22516..f3c089d 100644 --- a/py_scripts/engineer.py +++ b/py_scripts/engineer.py @@ -5,19 +5,30 @@ import mining_hq from numpy import count_nonzero sns.set() -plt.xticks(rotation = 90) games_pre = mining_hq.games_sales_split_pre games_dur = mining_hq.games_sales_split_dur games_pos = mining_hq.games_sales_split_pos -games_fig_pre = sns.barplot(data = games_pre, x = "Year", y = "NA_Sales", estimator = count_nonzero) +crime_US = mining_hq.crime_US_intersect +crime_CA = mining_hq.crime_CA_intersect + +plt.xticks(rotation = 90) +games_fig_pre = sns.histplot(data = games_pre, x = "Year", palette = sns.color_palette("flare"), kde = True) plt.show() plt.xticks(rotation = 90) -games_fig_dur = sns.barplot(data = games_dur, x = "Year", y = "NA_Sales", estimator = count_nonzero) +games_fig2_pre = sns.histplot(data = games_pre, x = "Year", hue = "Genre", multiple = "stack", shrink = 0.65) plt.show() plt.xticks(rotation = 90) -games_fig_pos = sns.barplot(data = games_pos, x = "Year", y = "NA_Sales", estimator = count_nonzero) +games_fig_dur = sns.barplot(data = games_dur, x = "Year", y = "NA_Sales") +plt.show() + +plt.xticks(rotation = 90) +games_fig_pos = sns.barplot(data = games_pos, x = "Year", y = "NA_Sales") +plt.show() + +plt.xticks(rotation = 90) +crime_CA_fig = sns.barplot(data = crime_CA, x = "year", y = "incidents") plt.show() diff --git a/py_scripts/mining_hq.py b/py_scripts/mining_hq.py index 9ccbff3..b18449b 100644 --- a/py_scripts/mining_hq.py +++ b/py_scripts/mining_hq.py @@ -4,8 +4,7 @@ import pandas as pd import scout import numpy as np import seaborn as sns -import digger -import gunner +import digger, gunner # Instantiating globals to be used in other files global games_merged_dat @@ -16,12 +15,16 @@ global games_sales_split_pos games_review = pd.read_csv("datasets/videogames/Games.xls") games_sales = pd.read_csv("datasets/videogames/vgsales-12-4-2019-short.csv") +print(games_review.count()) 
+print(games_sales.count()) + games_review_phase1 = digger.slice_column(games_review, "GameName", "Review") games_review_final = digger.slice_column(games_review, "GameName", "(Import)") games_merged_dat = digger.write_joined_df(games_sales, games_review_final) # Acquisition of Merged dataset +print(games_merged_dat.count()) games_merged_dat.to_csv("datasets/videogames/games_merged.csv") # Loading Crime Datasets @@ -29,12 +32,19 @@ crime_CA = pd.read_excel("datasets/crime/clean_crime_canada_dataset.xlsx") crime_US = pd.read_csv("datasets/crime/report.csv") +print(crime_US.isnull()) +print(crime_CA.isnull()) + year_interval = gunner.year_interval(crime_US, crime_CA, "report_year", "year") -print(year_interval[0], year_interval[1]) +year_max = year_interval[0] +year_min = year_interval[1] crime_intersect = gunner.intersect_by_year(crime_US, crime_CA, "report_year", "year") +crime_US_intersect = crime_intersect[0] +crime_CA_intersect = crime_intersect[1] + NA_col_list = [ "PAL_Sales", "JP_Sales", @@ -60,6 +70,10 @@ games_merged_dat = gunner.drop_kick(NA_col_list, games_merged_dat) sale_tri_split = gunner.trisect_by_year(games_merged_dat, "Year", year_interval) +games_sales_split_pre = sale_tri_split[0] +games_sales_split_dur = sale_tri_split[1] +games_sales_split_pos = sale_tri_split[2] + # Displaying Acquired Data print("Acquired Datasets:\n") print(sale_tri_split[0].head(5), sale_tri_split[1].head(5), sale_tri_split[2].head(5)) @@ -69,6 +83,26 @@ sale_tri_split[0].info() sale_tri_split[1].info() sale_tri_split[2].info() + +print("Dataset Info:\n") +games_sales_split_pre.info() +games_sales_split_dur.info() +games_sales_split_pos.info() + +print(games_sales_split_dur.describe()) + +print( + games_sales_split_pre.head(5), + games_sales_split_dur.head(5), + games_sales_split_pos.head(5), +) +# Required to use binning for cleaning, idk +# https://towardsdatascience.com/data-preprocessing-with-python-pandas-part-5-binning-c5bd5fd1b950 + +# Also need to transform using 
Z-score (normal distr go brrrr lmao), or min-max + +# Need similarity and dissimilarity, scipy time + # Load merged gammas gammas = pd.read_csv("datasets/videogames/games_merged.csv")