Merge pull request #5 from LinlyBoi/regresso-espresso
Regresso espresso APPROVED
This commit is contained in:
11
.gitignore
vendored
11
.gitignore
vendored
@@ -127,5 +127,16 @@ dmypy.json
|
|||||||
|
|
||||||
# Pyre type checker
|
# Pyre type checker
|
||||||
.pyre/
|
.pyre/
|
||||||
|
|
||||||
# vscode settings conflic shuns
|
# vscode settings conflic shuns
|
||||||
.vscode/
|
.vscode/
|
||||||
|
datasets/videogames/merged_games.xlsx
|
||||||
|
datasets/videogames/games_merged.csv
|
||||||
|
datasets/videogames/games_cleanish.csv
|
||||||
|
|
||||||
|
jupyter-notes/merged_games.csv
|
||||||
|
|
||||||
|
output.csv
|
||||||
|
output.xlsx
|
||||||
|
|
||||||
|
.gitignore
|
||||||
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
@@ -11592,7 +11592,7 @@ Rank,Name,Platform,Year,Genre,Publisher,NA_Sales,EU_Sales,JP_Sales,Other_Sales,G
|
|||||||
11592,Minecraft: Story Mode,PC,2015,Adventure,Mojang,0.02,0.05,0,0.01,0.08
|
11592,Minecraft: Story Mode,PC,2015,Adventure,Mojang,0.02,0.05,0,0.01,0.08
|
||||||
11593,P.N.03 - Product Number Three,GC,2003,Shooter,Capcom,0.06,0.02,0,0,0.08
|
11593,P.N.03 - Product Number Three,GC,2003,Shooter,Capcom,0.06,0.02,0,0,0.08
|
||||||
11594,Samurai Warriors 2,X360,2006,Action,Tecmo Koei,0.07,0.01,0,0.01,0.08
|
11594,Samurai Warriors 2,X360,2006,Action,Tecmo Koei,0.07,0.01,0,0.01,0.08
|
||||||
11595,Boku no Natsuyasumi 3: Hokkoku Hen: Chiisana Boku no Dai Sougen??PS3,2007,Adventure,Sony Computer Entertainment,0,0,0.08,0,0.08,
|
11595,Boku no Natsuyasumi 3: Hokkoku Hen: Chiisana Boku no Dai Sougen??PS3,X360,2007,Sony Computer Entertainment,0,0,0.08,0,0.08,
|
||||||
11596,Ape Escape: Pumped & Primed,PS2,2004,Misc,Sony Computer Entertainment,0.04,0.03,0,0.01,0.08
|
11596,Ape Escape: Pumped & Primed,PS2,2004,Misc,Sony Computer Entertainment,0.04,0.03,0,0.01,0.08
|
||||||
11597,Alien Syndrome,PSP,2007,Role-Playing,Sega,0.07,0,0,0.01,0.08
|
11597,Alien Syndrome,PSP,2007,Role-Playing,Sega,0.07,0,0,0.01,0.08
|
||||||
11598,Monster Trucks DS,DS,2005,Racing,Majesco Entertainment,0.07,0,0,0.01,0.08
|
11598,Monster Trucks DS,DS,2005,Racing,Majesco Entertainment,0.07,0,0,0.01,0.08
|
||||||
@@ -13537,7 +13537,7 @@ Rank,Name,Platform,Year,Genre,Publisher,NA_Sales,EU_Sales,JP_Sales,Other_Sales,G
|
|||||||
13537,Top Gun,DS,2006,Simulation,505 Games,0.04,0,0,0,0.04
|
13537,Top Gun,DS,2006,Simulation,505 Games,0.04,0,0,0,0.04
|
||||||
13538,Gem Smashers,3DS,2011,Platform,Crave Entertainment,0.04,0,0,0,0.04
|
13538,Gem Smashers,3DS,2011,Platform,Crave Entertainment,0.04,0,0,0,0.04
|
||||||
13539,Split/Second,PC,2010,Racing,Disney Interactive Studios,0,0.04,0,0.01,0.04
|
13539,Split/Second,PC,2010,Racing,Disney Interactive Studios,0,0.04,0,0.01,0.04
|
||||||
13540,B's-LOG Party??PSP,2010,Adventure,Idea Factory,0,0,0.04,0,0.04,
|
13540,B's-LOG Party??PSP,2010,2010,Idea Factory,0,0,0.04,0,0.04,
|
||||||
13541,King Arthur,GC,2004,Action,Konami Digital Entertainment,0.03,0.01,0,0,0.04
|
13541,King Arthur,GC,2004,Action,Konami Digital Entertainment,0.03,0.01,0,0,0.04
|
||||||
13542,Chicken Shoot,GBA,2005,Action,Zoo Digital Publishing,0.03,0.01,0,0,0.04
|
13542,Chicken Shoot,GBA,2005,Action,Zoo Digital Publishing,0.03,0.01,0,0,0.04
|
||||||
13543,Dai Senryaku VII: Modern Military Tactics,XB,2003,Strategy,Kool Kizz,0.03,0.01,0,0,0.04
|
13543,Dai Senryaku VII: Modern Military Tactics,XB,2003,Strategy,Kool Kizz,0.03,0.01,0,0,0.04
|
||||||
|
|||||||
|
Can't render this file because it is too large.
|
@@ -31,7 +31,7 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": null,
|
"execution_count": 36,
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
@@ -52,9 +52,22 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": null,
|
"execution_count": 37,
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
" GameName\n",
|
||||||
|
"0 Baldur's Gate 3 Early Access Review\n",
|
||||||
|
"1 Control: Ultimate Edition Cloud Version Review\n",
|
||||||
|
"2 Doom Eternal: The Ancient Gods Part 1 Review\n",
|
||||||
|
"3 Watch Dogs: Legion Review\n",
|
||||||
|
"4 Ring Of Pain Review\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
"source": [
|
"source": [
|
||||||
"# Unclean showcase\n",
|
"# Unclean showcase\n",
|
||||||
"unclean = df1\n",
|
"unclean = df1\n",
|
||||||
@@ -108,7 +121,7 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": null,
|
"execution_count": 5,
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
@@ -119,23 +132,118 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": null,
|
"execution_count": 33,
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
" Rank Name Genre ESRB_Rating \\\n",
|
||||||
|
"0 1.0 Wii Sports Sports E \n",
|
||||||
|
"1 2.0 Super Mario Bros. Platform NaN \n",
|
||||||
|
"2 3.0 Mario Kart Wii Racing E \n",
|
||||||
|
"3 4.0 PlayerUnknown's Battlegrounds Shooter NaN \n",
|
||||||
|
"4 5.0 Wii Sports Resort Sports E \n",
|
||||||
|
"5 6.0 Pokemon Red / Green / Blue Version Role-Playing E \n",
|
||||||
|
"6 7.0 New Super Mario Bros. Platform E \n",
|
||||||
|
"7 8.0 Tetris Puzzle E \n",
|
||||||
|
"8 9.0 New Super Mario Bros. Wii Platform E \n",
|
||||||
|
"9 10.0 Minecraft Misc NaN \n",
|
||||||
|
"10 11.0 Duck Hunt Shooter NaN \n",
|
||||||
|
"11 12.0 Wii Play Misc E \n",
|
||||||
|
"12 13.0 Kinect Adventures! Party E \n",
|
||||||
|
"13 14.0 Nintendogs Simulation E \n",
|
||||||
|
"14 15.0 Mario Kart DS Racing E \n",
|
||||||
|
"15 16.0 Pokemon Gold / Silver Version Role-Playing E \n",
|
||||||
|
"16 17.0 Wii Fit Sports E \n",
|
||||||
|
"17 18.0 Wii Fit Plus Sports E \n",
|
||||||
|
"18 19.0 Super Mario World Platform E \n",
|
||||||
|
"19 20.0 Grand Theft Auto V Action M \n",
|
||||||
|
"\n",
|
||||||
|
" Platform Publisher Developer Critic_Score \\\n",
|
||||||
|
"0 Wii Nintendo Nintendo EAD 7.7 \n",
|
||||||
|
"1 NES Nintendo Nintendo EAD 10.0 \n",
|
||||||
|
"2 Wii Nintendo Nintendo EAD 8.2 \n",
|
||||||
|
"3 PC PUBG Corporation PUBG Corporation NaN \n",
|
||||||
|
"4 Wii Nintendo Nintendo EAD 8.0 \n",
|
||||||
|
"5 GB Nintendo Game Freak 9.4 \n",
|
||||||
|
"6 DS Nintendo Nintendo EAD 9.1 \n",
|
||||||
|
"7 GB Nintendo Bullet Proof Software NaN \n",
|
||||||
|
"8 Wii Nintendo Nintendo EAD 8.6 \n",
|
||||||
|
"9 PC Mojang Mojang AB 10.0 \n",
|
||||||
|
"10 NES Nintendo Nintendo R&D1 NaN \n",
|
||||||
|
"11 Wii Nintendo Nintendo EAD 5.9 \n",
|
||||||
|
"12 X360 Microsoft Game Studios Good Science Studio 6.7 \n",
|
||||||
|
"13 DS Nintendo Nintendo EAD 8.4 \n",
|
||||||
|
"14 DS Nintendo Nintendo EAD 9.1 \n",
|
||||||
|
"15 GB Nintendo Game Freak 9.2 \n",
|
||||||
|
"16 Wii Nintendo Nintendo EAD 7.9 \n",
|
||||||
|
"17 Wii Nintendo Nintendo EAD 8.0 \n",
|
||||||
|
"18 SNES Nintendo Nintendo EAD 8.5 \n",
|
||||||
|
"19 PS3 Rockstar Games Rockstar North 9.4 \n",
|
||||||
|
"\n",
|
||||||
|
" User_Score Total_Shipped Global_Sales NA_Sales PAL_Sales JP_Sales \\\n",
|
||||||
|
"0 NaN 82.86 NaN NaN NaN NaN \n",
|
||||||
|
"1 NaN 40.24 NaN NaN NaN NaN \n",
|
||||||
|
"2 9.1 37.14 NaN NaN NaN NaN \n",
|
||||||
|
"3 NaN 36.60 NaN NaN NaN NaN \n",
|
||||||
|
"4 8.8 33.09 NaN NaN NaN NaN \n",
|
||||||
|
"5 NaN 31.38 NaN NaN NaN NaN \n",
|
||||||
|
"6 8.1 30.80 NaN NaN NaN NaN \n",
|
||||||
|
"7 NaN 30.26 NaN NaN NaN NaN \n",
|
||||||
|
"8 9.2 30.22 NaN NaN NaN NaN \n",
|
||||||
|
"9 NaN 30.01 NaN NaN NaN NaN \n",
|
||||||
|
"10 NaN 28.31 NaN NaN NaN NaN \n",
|
||||||
|
"11 4.5 28.02 NaN NaN NaN NaN \n",
|
||||||
|
"12 NaN 24.00 NaN NaN NaN NaN \n",
|
||||||
|
"13 NaN 23.96 NaN NaN NaN NaN \n",
|
||||||
|
"14 9.4 23.60 NaN NaN NaN NaN \n",
|
||||||
|
"15 NaN 23.10 NaN NaN NaN NaN \n",
|
||||||
|
"16 NaN 22.67 NaN NaN NaN NaN \n",
|
||||||
|
"17 NaN 21.13 NaN NaN NaN NaN \n",
|
||||||
|
"18 NaN 20.61 NaN NaN NaN NaN \n",
|
||||||
|
"19 NaN NaN 20.32 6.37 9.85 0.99 \n",
|
||||||
|
"\n",
|
||||||
|
" Other_Sales Year Unnamed: 0 Console Review Score \n",
|
||||||
|
"0 NaN 2006.0 NaN NaN NaN NaN \n",
|
||||||
|
"1 NaN 1985.0 NaN NaN NaN NaN \n",
|
||||||
|
"2 NaN 2008.0 NaN NaN NaN NaN \n",
|
||||||
|
"3 NaN 2017.0 NaN NaN NaN NaN \n",
|
||||||
|
"4 NaN 2009.0 NaN NaN NaN NaN \n",
|
||||||
|
"5 NaN 1998.0 NaN NaN NaN NaN \n",
|
||||||
|
"6 NaN 2006.0 NaN NaN NaN NaN \n",
|
||||||
|
"7 NaN 1989.0 NaN NaN NaN NaN \n",
|
||||||
|
"8 NaN 2009.0 NaN NaN NaN NaN \n",
|
||||||
|
"9 NaN 2010.0 NaN NaN NaN NaN \n",
|
||||||
|
"10 NaN 1985.0 NaN NaN NaN NaN \n",
|
||||||
|
"11 NaN 2007.0 NaN NaN NaN NaN \n",
|
||||||
|
"12 NaN 2010.0 NaN NaN NaN NaN \n",
|
||||||
|
"13 NaN 2005.0 NaN NaN NaN NaN \n",
|
||||||
|
"14 NaN 2005.0 NaN NaN NaN NaN \n",
|
||||||
|
"15 NaN 2000.0 NaN NaN NaN NaN \n",
|
||||||
|
"16 NaN 2008.0 NaN NaN NaN NaN \n",
|
||||||
|
"17 NaN 2009.0 NaN NaN NaN NaN \n",
|
||||||
|
"18 NaN 1991.0 NaN NaN NaN NaN \n",
|
||||||
|
"19 3.12 2013.0 NaN NaN NaN NaN \n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
"source": [
|
"source": [
|
||||||
"# merged = pd.merge(df1,df2, how='inner', sort=True) DOES NOT WORK\n",
|
"# merged = pd.merge(df1,df2, how='inner', sort=True) DOES NOT WORK\n",
|
||||||
"# print(merged.head(10))\n",
|
"# print(merged.head(10))\n",
|
||||||
"merged = df2.join(df1, lsuffix='merged') #Good\n",
|
"columns = df1.columns\n",
|
||||||
"print(merged.head(10))"
|
"merged = pd.concat([df2,df1], sort=False, ignore_index=True) #Good\n",
|
||||||
|
"print(merged.head(20))"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": null,
|
"execution_count": 34,
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"merged2.to_csv('merged_games.csv')"
|
"merged.to_csv('merged_games.csv')"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
|
|||||||
@@ -1,35 +1,13 @@
|
|||||||
# Getting and combining data
|
# Getting and combining data
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
import numpy as np
|
import numpy as np
|
||||||
|
import scout
|
||||||
# reading the data
|
from numpy import ndarray
|
||||||
# -> MAKE SURE OF THE DATA FRAMES NAMES PEFORE YOU RUN IT
|
from scipy.stats import binned_statistic
|
||||||
|
|
||||||
|
|
||||||
df1 = pd.read_csv("output_6th_df.csv")
|
# Defining useful Functions to be used later
|
||||||
df2 = pd.read_csv("vgsales-12-4-2019-short.csv")
|
def slice_column(input_df, column, expression=" "):
|
||||||
|
|
||||||
|
|
||||||
# ----------------------------------------------------------
|
|
||||||
|
|
||||||
|
|
||||||
# print(pf1.head)
|
|
||||||
# print(pf2.head)
|
|
||||||
|
|
||||||
# ---------------------------------------------------------
|
|
||||||
|
|
||||||
|
|
||||||
# merging
|
|
||||||
|
|
||||||
combined_df = df1.merge(df2, left_on="Name", right_on="Name", how="left")
|
|
||||||
print(combined_df)
|
|
||||||
combined_df.to_csv("output_final_df.csv")
|
|
||||||
df = combined_df
|
|
||||||
|
|
||||||
# ---------------------------------------------------------
|
|
||||||
|
|
||||||
|
|
||||||
def slice_column(input_df, output_df, column, expression=" "):
|
|
||||||
unclean = input_df[column].to_list()
|
unclean = input_df[column].to_list()
|
||||||
clean = list()
|
clean = list()
|
||||||
for record in unclean:
|
for record in unclean:
|
||||||
@@ -38,9 +16,37 @@ def slice_column(input_df, output_df, column, expression=" "):
|
|||||||
|
|
||||||
input_df = input_df.drop(columns=[column])
|
input_df = input_df.drop(columns=[column])
|
||||||
input_df[column] = clean
|
input_df[column] = clean
|
||||||
input_df.to_csv(output_df)
|
return input_df
|
||||||
|
|
||||||
|
|
||||||
def write_joined_df(left, right, output_file, lsuf="new_key"):
|
def write_joined_df(left, right, lsuf="new_key"):
|
||||||
merged = left.join(right, lsuffix=lsuf)
|
merged = pd.concat([left, right], sort=False, ignore_index=True) # Good
|
||||||
merged.to_csv(output_file)
|
return merged
|
||||||
|
|
||||||
|
|
||||||
|
def slam_dunk(dataset, column, labels):
    """Bin a numeric column into ``len(labels)`` equal-width bins.

    Two columns are added:

    * ``"bin_" + column`` -- the entry of ``labels`` for the bin each value
      falls into (``pd.cut`` with ``include_lowest=True``);
    * ``"bin_value"`` -- the midpoint of the equal-width bin the value falls
      into, looked up with ``set_to_median``.

    Between the two steps the frame is passed through
    ``scout.cure_depression``.

    Args:
        dataset: DataFrame holding ``column``. The label column is assigned
            on this object, so the caller's frame is modified.
        column: name of the numeric column to bin.
        labels: one label per bin; its length sets the number of bins.

    Returns:
        The frame returned by ``scout.cure_depression`` with both new
        columns present.
    """
    min_value = dataset[column].min()
    max_value = dataset[column].max()
    print("min: ", min_value, " max: ", max_value)
    bins = np.linspace(min_value, max_value, len(labels) + 1)

    dunked_column = "bin_" + column
    dataset[dunked_column] = pd.cut(
        dataset[column], bins=bins, labels=labels, include_lowest=True
    )

    dataset = scout.cure_depression(dataset)

    x_data = np.arange(0, len(dataset))
    y_data = dataset[column]
    # Only the bin edges are needed; the per-bin statistic and the bin
    # numbers returned by binned_statistic are not used.
    _, bin_edges, _ = binned_statistic(
        y_data, x_data, statistic='median', bins=len(labels)
    )
    # closed="both" keeps the column minimum inside the first bin, matching
    # include_lowest=True above. With right-closed intervals the minimum
    # matched no interval and got no bin value. set_to_median returns the
    # first matching interval, so a value sitting on an interior edge still
    # lands in the lower bin.
    bin_intervals = pd.IntervalIndex.from_arrays(
        bin_edges[:-1], bin_edges[1:], closed="both"
    )
    dataset['bin_value'] = dataset[column].apply(
        lambda x: set_to_median(x, bin_intervals)
    )

    return dataset
|
||||||
|
|
||||||
|
def set_to_median(x, bin_intervals):
    """Return the midpoint of the first interval that contains ``x``.

    Despite the name, no median is computed: the value returned is
    ``interval.mid`` of the matching interval.

    Args:
        x: value to look up.
        bin_intervals: iterable of interval objects supporting ``x in
            interval`` and exposing ``.mid`` (e.g. a ``pd.IntervalIndex``).

    Returns:
        The midpoint of the first containing interval, or ``None`` when no
        interval contains ``x``.
    """
    return next(
        (interval.mid for interval in bin_intervals if x in interval),
        None,
    )
|
||||||
@@ -1,23 +1,38 @@
|
|||||||
# Visualisations for Data
|
# Visualisations for Data
|
||||||
import matplotlib.pyplot as plt
|
import matplotlib.pyplot as plt
|
||||||
import seaborn as sns
|
import seaborn as sns
|
||||||
import gunner
|
import mining_hq
|
||||||
from numpy import count_nonzero
|
from numpy import count_nonzero
|
||||||
|
|
||||||
sns.set()
|
games_pre = mining_hq.games_sales_split_pre
|
||||||
|
games_dur = mining_hq.games_sales_split_dur
|
||||||
|
games_pos = mining_hq.games_sales_split_pos
|
||||||
|
|
||||||
|
crime_US = mining_hq.crime_US_intersect
|
||||||
|
crime_CA = mining_hq.crime_CA_intersect
|
||||||
|
|
||||||
|
custom_params = {"axes.spines.right": False, "axes.spines.top": False}
|
||||||
|
|
||||||
|
sns.set_theme(style = 'ticks', rc = custom_params)
|
||||||
|
|
||||||
plt.xticks(rotation = 90)
|
plt.xticks(rotation = 90)
|
||||||
|
games_fig_pre = sns.histplot(data = games_pre, x = "Year", palette = sns.color_palette("flare"), kde = True)
|
||||||
games_pre = gunner.game_sales_NA_pre
|
|
||||||
games_dur = gunner.game_sales_NA_dur
|
|
||||||
games_pos = gunner.game_sales_NA_pos
|
|
||||||
|
|
||||||
games_fig_pre = sns.barplot(data = games_pre, x = "Year", y = "NA_Sales", estimator = count_nonzero)
|
|
||||||
plt.show()
|
plt.show()
|
||||||
|
|
||||||
plt.xticks(rotation = 90)
|
plt.xticks(rotation = 90)
|
||||||
games_fig_dur = sns.barplot(data = games_dur, x = "Year", y = "NA_Sales", estimator = count_nonzero)
|
games_fig2_pre = sns.histplot(data = games_pre, x = "Year", hue = "Genre", multiple = "stack", shrink = 0.65)
|
||||||
plt.show()
|
plt.show()
|
||||||
|
|
||||||
plt.xticks(rotation = 90)
|
plt.xticks(rotation = 90)
|
||||||
games_fig_pos = sns.barplot(data = games_pos, x = "Year", y = "NA_Sales", estimator = count_nonzero)
|
games_fig_dur = sns.barplot(data = games_dur, x = "Year", y = "NA_Sales")
|
||||||
|
plt.xlabel("Years")
|
||||||
|
plt.ylabel("Sales in North America (Canada, USA)")
|
||||||
|
plt.show()
|
||||||
|
|
||||||
|
plt.xticks(rotation = 90)
|
||||||
|
games_fig_pos = sns.barplot(data = games_pos, x = "Year", y = "NA_Sales")
|
||||||
|
plt.show()
|
||||||
|
|
||||||
|
plt.xticks(rotation = 90)
|
||||||
|
crime_CA_fig = sns.barplot(data = crime_CA, x = "year", y = "incidents", estimator=count_nonzero)
|
||||||
plt.show()
|
plt.show()
|
||||||
|
|||||||
@@ -4,83 +4,28 @@
|
|||||||
import pandas as pd
|
import pandas as pd
|
||||||
import numpy as np
|
import numpy as np
|
||||||
|
|
||||||
# Sharing the dataset variables
|
def drop_kick(col_list, dataframe):
|
||||||
# Games' data
|
return dataframe.drop(columns=col_list, axis=1)
|
||||||
global games_dat
|
|
||||||
# Sales in NA
|
|
||||||
global game_sales_NA
|
|
||||||
global game_sales_NA_dur
|
|
||||||
global game_sales_NA_pre
|
|
||||||
global game_sales_NA_pos
|
|
||||||
# Sales Globally
|
|
||||||
global game_sales_GLO
|
|
||||||
|
|
||||||
# Crime Data
|
|
||||||
# Crime Recorded in The US
|
|
||||||
global crime_US
|
|
||||||
# Crime Recorded in Canada
|
|
||||||
global crime_CA
|
|
||||||
|
|
||||||
# Loading Datasets
|
|
||||||
game_sales_dat = pd.read_csv('datasets/videogames/vgsales-12-4-2019-short.csv')
|
|
||||||
games_dat = pd.read_csv('datasets/videogames/Games.xls')
|
|
||||||
|
|
||||||
crime_CA = pd.read_excel('datasets/crime/clean_crime_canada_dataset.xlsx')
|
|
||||||
crime_US = pd.read_csv('datasets/crime/report.csv')
|
|
||||||
|
|
||||||
# Printing information regarding datasets
|
|
||||||
print("Game Datasets' Info:\n")
|
|
||||||
game_sales_dat.info()
|
|
||||||
games_dat.info()
|
|
||||||
|
|
||||||
print("Crime Datasets' Info:\n")
|
|
||||||
crime_US.info()
|
|
||||||
crime_CA.info()
|
|
||||||
|
|
||||||
# Printing First n values (index start: 0)
|
|
||||||
print("Game Sale Data:\n", game_sales_dat.head(10))
|
|
||||||
print("Game Scores:\n", games_dat.head(10))
|
|
||||||
|
|
||||||
print("US Crime Data:\n", crime_US.head(10))
|
|
||||||
print("CA Crime Data:\n", crime_CA.head(10))
|
|
||||||
|
|
||||||
# Regarding the Games.xls dataset:
|
|
||||||
# Coercing the non-numeric values will result in NaN
|
|
||||||
# thus allowing easier removal through `.notnull()`
|
|
||||||
games_dat['Score'] = pd.to_numeric(games_dat['Score'], errors = 'coerce')
|
|
||||||
|
|
||||||
games_dat = games_dat[games_dat['Score'].notnull()]
|
|
||||||
|
|
||||||
print("Game Scores (Cleaned):\n", games_dat.head())
|
|
||||||
games_dat.info()
|
|
||||||
|
|
||||||
# Regarding the vgsales-12-4-2019 dataset
|
|
||||||
# Considering we will be using a US (probs CA too) crime datasets
|
|
||||||
# It wouldn't be that useful to have other columns regarding other regions
|
|
||||||
NA_col_list = ['PAL_Sales', 'JP_Sales', 'Other_Sales', 'Global_Sales']
|
|
||||||
GLO_col_list = ['PAL_Sales', 'JP_Sales', 'Other_Sales', 'NA_Sales']
|
|
||||||
|
|
||||||
game_sales_NA = game_sales_dat.drop(columns = NA_col_list, axis = 1)
|
|
||||||
game_sales_GLO = game_sales_dat.drop(columns = GLO_col_list, axis = 1)
|
|
||||||
|
|
||||||
print(f"Game Sales for NA:\n{game_sales_NA.head(10)} \nWith minimum year being: {game_sales_NA['Year'].min()}")
|
|
||||||
print(f"Game Sales Globally:\n{game_sales_GLO.head(10)}\nWith minimum year being: {game_sales_GLO['Year'].min()}")
|
|
||||||
|
|
||||||
# Getting the range of years which both datasets share
|
# Getting the range of years which both datasets share
|
||||||
crime_year_min = max(crime_US['report_year'].min(), crime_CA['year'].min())
|
def year_interval(victim1, victim2, col1, col2):
|
||||||
crime_year_max = min(crime_US['report_year'].max(), crime_CA['year'].max())
|
return (
|
||||||
|
max(victim2[col2].min(), victim1[col1].min()),
|
||||||
|
min(victim2[col2].max(), victim1[col1].max()),
|
||||||
|
)
|
||||||
|
|
||||||
crime_CA = crime_CA[(crime_CA['year'] >= crime_year_min) & (crime_CA['year'] <= crime_year_max)]
|
|
||||||
crime_US = crime_US[(crime_US['report_year'] >= crime_year_min) & (crime_US['report_year'] <= crime_year_max)]
|
|
||||||
|
|
||||||
|
def intersect_by_year(victim1, victim2, col1, col2):
    """Restrict two frames to the range of years they have in common.

    The common range comes from ``year_interval``; both bounds are
    inclusive.

    Args:
        victim1: first DataFrame.
        victim2: second DataFrame.
        col1: name of the year column in ``victim1``.
        col2: name of the year column in ``victim2``.

    Returns:
        A ``(victim1, victim2)`` tuple of the filtered frames. The inputs
        themselves are not modified.
    """
    first_year, last_year = year_interval(victim1, victim2, col1, col2)
    # Series.between is inclusive on both ends by default, i.e. the same
    # test as (>= first_year) & (<= last_year).
    victim1 = victim1[victim1[col1].between(first_year, last_year)]
    victim2 = victim2[victim2[col2].between(first_year, last_year)]
    return (victim1, victim2)
|
||||||
|
|
||||||
|
|
||||||
# Updating the NA game dataset to fit with the time ranges
|
# Updating the NA game dataset to fit with the time ranges
|
||||||
game_sales_NA_dur = game_sales_NA[(game_sales_NA['Year'] >= crime_year_min) & (game_sales_NA['Year'] <= crime_year_max)]
|
def trisect_by_year(victim1, col, interval):
|
||||||
|
victim1_pre = victim1[victim1[col] < interval[0]]
|
||||||
game_sales_NA_pre = game_sales_NA[game_sales_NA['Year'] < crime_year_min]
|
victim1_dur = victim1[(victim1[col] <= interval[1]) & (victim1[col] >= interval[0])]
|
||||||
|
victim1_pos = victim1[victim1[col] > interval[1]]
|
||||||
game_sales_NA_pos = game_sales_NA[game_sales_NA['Year'] > crime_year_max]
|
return (victim1_pre, victim1_dur, victim1_pos)
|
||||||
|
|
||||||
print(f"Game Sales for NA:\n{game_sales_NA.head(10)}\nWith minimum year being: {game_sales_NA['Year'].min()}")
|
|
||||||
print(f"Game Sales Globally:\n{game_sales_GLO.head(10)}\nWith minimum year being: {game_sales_GLO['Year'].min()}")
|
|
||||||
|
|||||||
@@ -3,3 +3,117 @@
|
|||||||
import pandas as pd
|
import pandas as pd
|
||||||
import numpy as np
|
import numpy as np
|
||||||
import seaborn as sns
|
import seaborn as sns
|
||||||
|
# containment breach
|
||||||
|
import scipy as scp
|
||||||
|
import digger, gunner, scout
|
||||||
|
|
||||||
|
# Pipeline script: loads the game and crime datasets, merges and splits the
# game data by year, then bins, normalises and samples it.
#
# The names assigned below (games_merged_dat, games_sales_split_pre/dur/pos,
# crime_US_intersect, crime_CA_intersect, ...) are ordinary module-level
# variables. `global` statements are no-ops at module scope, so none are
# needed for other modules to read these as attributes of this module.

# NOTE(review): Games.xls is read with read_csv -- presumably the file is
# CSV text despite its extension; confirm.
games_review = pd.read_csv("datasets/videogames/Games.xls")
games_sales = scout.cure_depression(
    pd.read_csv("datasets/videogames/vgsales-12-4-2019-short.csv")
)

print(games_review.count())
print(games_sales.count())

# Two clean-up passes over GameName. The second pass has to start from the
# result of the first; starting again from games_review threw the "Review"
# pass away.
games_review_phase1 = digger.slice_column(games_review, "GameName", "Review")
games_review_final = digger.slice_column(
    games_review_phase1, "GameName", "(Import)"
)

games_merged_dat = digger.write_joined_df(games_sales, games_review_final)

# Acquisition of Merged dataset
print(games_merged_dat.count())

# Loading Crime Datasets
crime_CA = pd.read_excel("datasets/crime/clean_crime_canada_dataset.xlsx")

crime_US = pd.read_csv("datasets/crime/report.csv")

print(crime_US.isnull())
print(crime_CA.isnull())

year_interval = gunner.year_interval(crime_US, crime_CA, "report_year", "year")

# year_interval is (first shared year, last shared year): position 0 is the
# lower bound and position 1 the upper bound.
year_min = year_interval[0]
year_max = year_interval[1]

crime_intersect = gunner.intersect_by_year(
    crime_US, crime_CA, "report_year", "year"
)

crime_US_intersect = crime_intersect[0]
crime_CA_intersect = crime_intersect[1]

NA_col_list = [
    "JP_Sales",
    "Other_Sales",
    "Global_Sales",
    "PAL_Sales",
    "GameName",
    "Review",
    "Console",
    "Score",
]
GLO_col_list = [
    "JP_Sales",
    "Other_Sales",
    "NA_Sales",
    "PAL_Sales",
    "GameName",
    "Review",
    "Console",
    "Score",
]

# Splitting crime datasets
# Collecting Split-Up Datasets
games_merged_dat = gunner.drop_kick(NA_col_list, games_merged_dat)
games_merged_dat.to_csv("datasets/videogames/games_merged.csv", index=False)

sale_tri_split = gunner.trisect_by_year(games_merged_dat, "Year", year_interval)

games_sales_split_pre = sale_tri_split[0]
games_sales_split_dur = sale_tri_split[1]
games_sales_split_pos = sale_tri_split[2]

# Displaying Acquired Data
print("Dataset Info:\n")
games_sales_split_pre.info()
games_sales_split_dur.info()
games_sales_split_pos.info()

print("Yer forsaken Statistical Description:\n", games_sales_split_dur.describe())

print(
    games_sales_split_pre.head(5),
    games_sales_split_dur.head(5),
    games_sales_split_pos.head(5),
)

# Reload the merged games file written above and bin Critic_Score.
# Binning reference:
# https://towardsdatascience.com/data-preprocessing-with-python-pandas-part-5-binning-c5bd5fd1b950
gammas = pd.read_csv("datasets/videogames/games_merged.csv")
labels = ["smol", "epik", "larg"]
gammas = digger.slam_dunk(gammas, "Critic_Score", labels=labels)
# gammas = gammas[gammas["Genre"].isna() == False]
# gammas = scout.cure_depression(gammas)

# Normalise Critic_Score with a z-score transform.
gammas['Critic_Score_Norm'] = scout.scaling_zscore(gammas, 'Critic_Score')
print(gammas['Critic_Score_Norm'].head(10))

# Saving all into a file
gammas.to_csv("datasets/videogames/games_cleanish.csv", index=False)

# Similarity / dissimilarity on a random sample.
# Selecting 5 random rows (without replacement, so at least 5 rows are needed)
chosen_idx = np.random.choice(len(gammas), replace = False, size = 5)
sample_rows = gammas.iloc[chosen_idx]
print(sample_rows.head())

scout.dissimilarity(sample_rows)
|
||||||
|
|||||||
@@ -1 +1,67 @@
|
|||||||
# Regression/Prediction (Totally gonna do later trust bro)
|
# Regression/Prediction (Totally gonna do later trust bro)
|
||||||
|
from sklearn.linear_model import LinearRegression
|
||||||
|
from sklearn.impute import SimpleImputer
|
||||||
|
from sklearn import preprocessing
|
||||||
|
from scipy.spatial import distance
|
||||||
|
import scipy.stats as stats
|
||||||
|
import numpy as np
|
||||||
|
import pandas as pd
|
||||||
|
|
||||||
|
|
||||||
|
def cure_depression(dataset):
    """Fill gaps in every numeric column by linear interpolation.

    Only columns with a numeric dtype are touched. Interpolation runs
    forward, so missing values that precede the first valid value of a
    column stay missing.

    Args:
        dataset: DataFrame to clean. It is modified in place.

    Returns:
        The same ``dataset`` object, for call chaining.
    """
    numeric_columns = dataset.select_dtypes(include=np.number).columns
    if numeric_columns.empty:
        # Nothing to interpolate.
        return dataset

    # The keyword is `limit_direction`; the previous spelling
    # `limit_direct` is not a pandas option.
    dataset[numeric_columns] = dataset[numeric_columns].interpolate(
        method="linear", limit_direction="forward"
    )
    return dataset
|
||||||
|
|
||||||
|
|
||||||
|
# Fill a numeric column's missing entries with a linear-regression fit (work in progress)
|
||||||
|
def regression_expression(dataset, column, missing_value):
    """Predict the missing entries of ``column`` with a linear regression.

    Only the numeric columns of ``dataset`` are used. Rows whose ``column``
    value is known form the training set, with all other numeric columns as
    features. Rows whose value is missing are then predicted.

    A value counts as missing when it is null, equal to ``missing_value``,
    or equal to 0. The 0 case keeps the marker the earlier implementation
    hard-coded.

    The result is printed and returned; ``dataset`` itself is not modified.
    ``LinearRegression`` does not accept NaN, so the remaining numeric
    columns must be gap-free for the fit to succeed.

    Args:
        dataset: source DataFrame.
        column: name of the numeric column to fill.
        missing_value: extra marker value to treat as missing.

    Returns:
        A DataFrame holding the numeric columns of the rows that were
        missing, with ``column`` replaced by the predicted values. It is
        empty when nothing was missing.
    """
    numeric = dataset.select_dtypes(include=np.number)
    target = numeric[column]
    is_missing = target.isnull() | (target == 0) | (target == missing_value)

    # The target must not be among the features, at fit or at predict time.
    features = numeric.drop(columns=[column])

    # .copy() so that writing the predictions does not touch a slice of
    # `dataset`.
    testdf = numeric[is_missing].copy()

    if not testdf.empty:
        lr = LinearRegression()
        lr.fit(features[~is_missing], target[~is_missing])
        testdf[column] = lr.predict(features[is_missing])

    print(testdf.head(30))
    return testdf
|
||||||
|
|
||||||
|
# https://scikit-learn.org/stable/modules/preprocessing.html#preprocessing
|
||||||
|
# That helps ^
|
||||||
|
# This boi should work, idk i'm implementing blindly
|
||||||
|
def scaling_zscore(dataframe, col):
    """Return the z-scores of one column.

    Missing values are left out of the mean and standard deviation
    (``nan_policy="omit"``).

    Args:
        dataframe: DataFrame holding ``col``.
        col: name of the numeric column to standardise.

    Returns:
        The result of ``scipy.stats.zscore`` for that column, one value per
        row.
    """
    return stats.zscore(dataframe[col], axis=0, nan_policy="omit")
|
||||||
|
|
||||||
|
def dissimilarity(row_arr):
    """Print and return the pairwise Euclidean distances between rows.

    Only numeric columns are compared. A ``Rank`` column is left out when
    it is present.

    Args:
        row_arr: DataFrame whose rows are the entries to compare.

    Returns:
        A square ``numpy`` array in which element ``[i, j]`` is the
        Euclidean distance between row ``i`` and row ``j``.
    """
    numeric = row_arr.select_dtypes(include=np.number)
    # errors="ignore": frames without a Rank column are accepted as well.
    numeric = numeric.drop(columns="Rank", errors="ignore")

    count = len(numeric)
    matrix = np.zeros((count, count))

    # One header cell per row, rather than a fixed five.
    print(" |" + "".join(f" Entry {k} |" for k in range(1, count + 1)))
    for i in range(count):
        print("Entry ", i + 1, " | ", end="")
        for j in range(count):
            matrix[i, j] = distance.euclidean(numeric.iloc[i], numeric.iloc[j])
            print(" {:#.6g} |".format(matrix[i, j]), end="")
        print("\n")

    return matrix
|
||||||
|
|
||||||
|
def scaling_range(datashitter, col):
    """Min-max scale one column.

    The scaler is fitted on the non-missing values of ``col`` and then
    applied to the whole column.

    Args:
        datashitter: DataFrame holding ``col``.
        col: name of the numeric column to scale.

    Returns:
        A 1-D ``numpy`` array with one scaled value per row.
    """
    # Double brackets keep the selection 2-D, which the scaler requires.
    values = datashitter[[col]]
    minmax_scaler = preprocessing.MinMaxScaler()
    # Fit on rows where the value is present (the previous mask selected
    # the missing rows instead).
    minmax_scaler.fit(values[values[col].notna()])
    return minmax_scaler.transform(values).ravel()
|
||||||
|
|||||||
Reference in New Issue
Block a user