From 4605f3cc9365a5f443bfdf5ba4af1d182cf2450e Mon Sep 17 00:00:00 2001 From: Supermjork Date: Tue, 28 Mar 2023 17:02:20 +0200 Subject: [PATCH 1/3] Put names to variables --- py_scripts/mining_hq.py | 31 ++++++++++++++++++++++++------- 1 file changed, 24 insertions(+), 7 deletions(-) diff --git a/py_scripts/mining_hq.py b/py_scripts/mining_hq.py index 0c628de..c0d74fa 100644 --- a/py_scripts/mining_hq.py +++ b/py_scripts/mining_hq.py @@ -15,12 +15,16 @@ global games_sales_split_pos games_review = pd.read_csv("datasets/videogames/Games.xls") games_sales = pd.read_csv("datasets/videogames/vgsales-12-4-2019-short.csv") +print(games_review.isnull()) +print(games_sales.isnull()) + games_review_phase1 = digger.slice_column(games_review, "GameName", "Review") games_review_final = digger.slice_column(games_review, "GameName", "(Import)") games_merged_dat = digger.write_joined_df(games_sales, games_review_final) # Acquisition of Merged dataset +print(games_merged_dat.isnull()) games_merged_dat.to_csv("datasets/videogames/games_merged.csv") # Loading Crime Datasets @@ -28,12 +32,19 @@ crime_CA = pd.read_excel('datasets/crime/clean_crime_canada_dataset.xlsx') crime_US = pd.read_csv('datasets/crime/report.csv') +print(crime_US.isnull()) +print(crime_CA.isnull()) + year_interval = gunner.year_interval(crime_US, crime_CA, "report_year", "year") -print(year_interval[0], year_interval[1]) +year_max = year_interval[0] +year_min = year_interval[1] crime_intersect = gunner.intersect_by_year(crime_US, crime_CA, "report_year", "year") +crime_US_intersect = crime_intersect[0] +crime_CA_intersect = crime_intersect[1] + NA_col_list = [ "PAL_Sales", "JP_Sales", @@ -59,13 +70,19 @@ games_merged_dat = gunner.drop_kick(NA_col_list, games_merged_dat) sale_tri_split = gunner.trisect_by_year(games_merged_dat, 'Year', year_interval) +game_sales_split_pre = sale_tri_split[0] +game_sales_split_dur = sale_tri_split[1] +game_sales_split_pos = sale_tri_split[2] + # Displaying Acquired Data print("Acquired Datasets:\n") -print(sale_tri_split[0].head(5), -sale_tri_split[1].head(5), -sale_tri_split[2].head(5)) +print(game_sales_split_pre.head(5), +game_sales_split_dur.head(5), +game_sales_split_pos.head(5)) print("Dataset Info:\n") -sale_tri_split[0].info() -sale_tri_split[1].info() -sale_tri_split[2].info() +game_sales_split_pre.info() +game_sales_split_dur.info() +game_sales_split_pos.info() + +print(game_sales_split_dur.describe()) From ca748eb57eb966952a5b27680d7fea0ecccd4260 Mon Sep 17 00:00:00 2001 From: Supermjork Date: Tue, 28 Mar 2023 17:18:36 +0200 Subject: [PATCH 2/3] Added midlife crisis in main --- py_scripts/mining_hq.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/py_scripts/mining_hq.py b/py_scripts/mining_hq.py index c0d74fa..c9a8baa 100644 --- a/py_scripts/mining_hq.py +++ b/py_scripts/mining_hq.py @@ -15,8 +15,8 @@ global games_sales_split_pos games_review = pd.read_csv("datasets/videogames/Games.xls") games_sales = pd.read_csv("datasets/videogames/vgsales-12-4-2019-short.csv") -print(games_review.isnull()) -print(games_sales.isnull()) +print(games_review.count()) +print(games_sales.count()) games_review_phase1 = digger.slice_column(games_review, "GameName", "Review") games_review_final = digger.slice_column(games_review, "GameName", "(Import)") @@ -86,3 +86,10 @@ game_sales_split_dur.info() game_sales_split_pos.info() print(game_sales_split_dur.describe()) + +# Required to use binning for cleaning, idk +# https://towardsdatascience.com/data-preprocessing-with-python-pandas-part-5-binning-c5bd5fd1b950 + +# Also need to transform using Z-score (normal distr go brrrr lmao), or min-max + +# Need similarity and dissimialrity, scipy time From 300ce67b60c7f078f23a42746904dfb9e765664e Mon Sep 17 00:00:00 2001 From: Supermjork Date: Tue, 28 Mar 2023 20:41:46 +0200 Subject: [PATCH 3/3] R visualisation better --- py_scripts/engineer.py | 19 +++++++++++++++---- py_scripts/mining_hq.py | 25 ++++++++++++------------- 2 files changed, 27 insertions(+), 17 deletions(-) diff --git a/py_scripts/engineer.py b/py_scripts/engineer.py index ab22516..f3c089d 100644 --- a/py_scripts/engineer.py +++ b/py_scripts/engineer.py @@ -5,19 +5,30 @@ import mining_hq from numpy import count_nonzero sns.set() -plt.xticks(rotation = 90) games_pre = mining_hq.games_sales_split_pre games_dur = mining_hq.games_sales_split_dur games_pos = mining_hq.games_sales_split_pos -games_fig_pre = sns.barplot(data = games_pre, x = "Year", y = "NA_Sales", estimator = count_nonzero) +crime_US = mining_hq.crime_US_intersect +crime_CA = mining_hq.crime_CA_intersect + +plt.xticks(rotation = 90) +games_fig_pre = sns.histplot(data = games_pre, x = "Year", palette = sns.color_palette("flare"), kde = True) plt.show() plt.xticks(rotation = 90) -games_fig_dur = sns.barplot(data = games_dur, x = "Year", y = "NA_Sales", estimator = count_nonzero) +games_fig2_pre = sns.histplot(data = games_pre, x = "Year", hue = "Genre", multiple = "stack", shrink = 0.65) plt.show() plt.xticks(rotation = 90) -games_fig_pos = sns.barplot(data = games_pos, x = "Year", y = "NA_Sales", estimator = count_nonzero) +games_fig_dur = sns.barplot(data = games_dur, x = "Year", y = "NA_Sales") +plt.show() + +plt.xticks(rotation = 90) +games_fig_pos = sns.barplot(data = games_pos, x = "Year", y = "NA_Sales") +plt.show() + +plt.xticks(rotation = 90) +crime_CA_fig = sns.barplot(data = crime_CA, x = "year", y = "incidents") plt.show() diff --git a/py_scripts/mining_hq.py b/py_scripts/mining_hq.py index c9a8baa..1e64b33 100644 --- a/py_scripts/mining_hq.py +++ b/py_scripts/mining_hq.py @@ -3,8 +3,7 @@ import pandas as pd import numpy as np import seaborn as sns -import digger -import gunner +import digger, gunner # Instantiating globals to be used in other files global games_merged_dat @@ -24,7 +23,7 @@ games_review_final = digger.slice_column(games_review, "GameName", "(Import)") games_merged_dat = digger.write_joined_df(games_sales, games_review_final) # Acquisition of Merged dataset -print(games_merged_dat.isnull()) +print(games_merged_dat.count()) games_merged_dat.to_csv("datasets/videogames/games_merged.csv") # Loading Crime Datasets @@ -70,22 +69,22 @@ games_merged_dat = gunner.drop_kick(NA_col_list, games_merged_dat) sale_tri_split = gunner.trisect_by_year(games_merged_dat, 'Year', year_interval) -game_sales_split_pre = sale_tri_split[0] -game_sales_split_dur = sale_tri_split[1] -game_sales_split_pos = sale_tri_split[2] +games_sales_split_pre = sale_tri_split[0] +games_sales_split_dur = sale_tri_split[1] +games_sales_split_pos = sale_tri_split[2] # Displaying Acquired Data print("Acquired Datasets:\n") -print(game_sales_split_pre.head(5), -game_sales_split_dur.head(5), -game_sales_split_pos.head(5)) +print(games_sales_split_pre.head(5), +games_sales_split_dur.head(5), +games_sales_split_pos.head(5)) print("Dataset Info:\n") -game_sales_split_pre.info() -game_sales_split_dur.info() -game_sales_split_pos.info() +games_sales_split_pre.info() +games_sales_split_dur.info() +games_sales_split_pos.info() -print(game_sales_split_dur.describe()) +print(games_sales_split_dur.describe()) # Required to use binning for cleaning, idk # https://towardsdatascience.com/data-preprocessing-with-python-pandas-part-5-binning-c5bd5fd1b950