Merge pull request #5 from LinlyBoi/regresso-espresso

Regresso espresso APPROVED
This commit is contained in:
Linly
2023-03-30 20:24:10 +02:00
committed by GitHub
10 changed files with 390 additions and 70720 deletions

11
.gitignore vendored
View File

@@ -127,5 +127,16 @@ dmypy.json
# Pyre type checker # Pyre type checker
.pyre/ .pyre/
# vscode settings conflic shuns # vscode settings conflic shuns
.vscode/ .vscode/
datasets/videogames/merged_games.xlsx
datasets/videogames/games_merged.csv
datasets/videogames/games_cleanish.csv
jupyter-notes/merged_games.csv
output.csv
output.xlsx
.gitignore

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@@ -11592,7 +11592,7 @@ Rank,Name,Platform,Year,Genre,Publisher,NA_Sales,EU_Sales,JP_Sales,Other_Sales,G
11592,Minecraft: Story Mode,PC,2015,Adventure,Mojang,0.02,0.05,0,0.01,0.08 11592,Minecraft: Story Mode,PC,2015,Adventure,Mojang,0.02,0.05,0,0.01,0.08
11593,P.N.03 - Product Number Three,GC,2003,Shooter,Capcom,0.06,0.02,0,0,0.08 11593,P.N.03 - Product Number Three,GC,2003,Shooter,Capcom,0.06,0.02,0,0,0.08
11594,Samurai Warriors 2,X360,2006,Action,Tecmo Koei,0.07,0.01,0,0.01,0.08 11594,Samurai Warriors 2,X360,2006,Action,Tecmo Koei,0.07,0.01,0,0.01,0.08
11595,Boku no Natsuyasumi 3: Hokkoku Hen: Chiisana Boku no Dai Sougen??PS3,2007,Adventure,Sony Computer Entertainment,0,0,0.08,0,0.08, 11595,Boku no Natsuyasumi 3: Hokkoku Hen: Chiisana Boku no Dai Sougen??PS3,X360,2007,Sony Computer Entertainment,0,0,0.08,0,0.08,
11596,Ape Escape: Pumped & Primed,PS2,2004,Misc,Sony Computer Entertainment,0.04,0.03,0,0.01,0.08 11596,Ape Escape: Pumped & Primed,PS2,2004,Misc,Sony Computer Entertainment,0.04,0.03,0,0.01,0.08
11597,Alien Syndrome,PSP,2007,Role-Playing,Sega,0.07,0,0,0.01,0.08 11597,Alien Syndrome,PSP,2007,Role-Playing,Sega,0.07,0,0,0.01,0.08
11598,Monster Trucks DS,DS,2005,Racing,Majesco Entertainment,0.07,0,0,0.01,0.08 11598,Monster Trucks DS,DS,2005,Racing,Majesco Entertainment,0.07,0,0,0.01,0.08
@@ -13537,7 +13537,7 @@ Rank,Name,Platform,Year,Genre,Publisher,NA_Sales,EU_Sales,JP_Sales,Other_Sales,G
13537,Top Gun,DS,2006,Simulation,505 Games,0.04,0,0,0,0.04 13537,Top Gun,DS,2006,Simulation,505 Games,0.04,0,0,0,0.04
13538,Gem Smashers,3DS,2011,Platform,Crave Entertainment,0.04,0,0,0,0.04 13538,Gem Smashers,3DS,2011,Platform,Crave Entertainment,0.04,0,0,0,0.04
13539,Split/Second,PC,2010,Racing,Disney Interactive Studios,0,0.04,0,0.01,0.04 13539,Split/Second,PC,2010,Racing,Disney Interactive Studios,0,0.04,0,0.01,0.04
13540,B's-LOG Party??PSP,2010,Adventure,Idea Factory,0,0,0.04,0,0.04, 13540,B's-LOG Party??PSP,2010,2010,Idea Factory,0,0,0.04,0,0.04,
13541,King Arthur,GC,2004,Action,Konami Digital Entertainment,0.03,0.01,0,0,0.04 13541,King Arthur,GC,2004,Action,Konami Digital Entertainment,0.03,0.01,0,0,0.04
13542,Chicken Shoot,GBA,2005,Action,Zoo Digital Publishing,0.03,0.01,0,0,0.04 13542,Chicken Shoot,GBA,2005,Action,Zoo Digital Publishing,0.03,0.01,0,0,0.04
13543,Dai Senryaku VII: Modern Military Tactics,XB,2003,Strategy,Kool Kizz,0.03,0.01,0,0,0.04 13543,Dai Senryaku VII: Modern Military Tactics,XB,2003,Strategy,Kool Kizz,0.03,0.01,0,0,0.04
Can't render this file because it is too large.

View File

@@ -31,7 +31,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": 36,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
@@ -52,9 +52,22 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": 37,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" GameName\n",
"0 Baldur's Gate 3 Early Access Review\n",
"1 Control: Ultimate Edition Cloud Version Review\n",
"2 Doom Eternal: The Ancient Gods Part 1 Review\n",
"3 Watch Dogs: Legion Review\n",
"4 Ring Of Pain Review\n"
]
}
],
"source": [ "source": [
"# Unclean showcase\n", "# Unclean showcase\n",
"unclean = df1\n", "unclean = df1\n",
@@ -108,7 +121,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": 5,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
@@ -119,23 +132,118 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": 33,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" Rank Name Genre ESRB_Rating \\\n",
"0 1.0 Wii Sports Sports E \n",
"1 2.0 Super Mario Bros. Platform NaN \n",
"2 3.0 Mario Kart Wii Racing E \n",
"3 4.0 PlayerUnknown's Battlegrounds Shooter NaN \n",
"4 5.0 Wii Sports Resort Sports E \n",
"5 6.0 Pokemon Red / Green / Blue Version Role-Playing E \n",
"6 7.0 New Super Mario Bros. Platform E \n",
"7 8.0 Tetris Puzzle E \n",
"8 9.0 New Super Mario Bros. Wii Platform E \n",
"9 10.0 Minecraft Misc NaN \n",
"10 11.0 Duck Hunt Shooter NaN \n",
"11 12.0 Wii Play Misc E \n",
"12 13.0 Kinect Adventures! Party E \n",
"13 14.0 Nintendogs Simulation E \n",
"14 15.0 Mario Kart DS Racing E \n",
"15 16.0 Pokemon Gold / Silver Version Role-Playing E \n",
"16 17.0 Wii Fit Sports E \n",
"17 18.0 Wii Fit Plus Sports E \n",
"18 19.0 Super Mario World Platform E \n",
"19 20.0 Grand Theft Auto V Action M \n",
"\n",
" Platform Publisher Developer Critic_Score \\\n",
"0 Wii Nintendo Nintendo EAD 7.7 \n",
"1 NES Nintendo Nintendo EAD 10.0 \n",
"2 Wii Nintendo Nintendo EAD 8.2 \n",
"3 PC PUBG Corporation PUBG Corporation NaN \n",
"4 Wii Nintendo Nintendo EAD 8.0 \n",
"5 GB Nintendo Game Freak 9.4 \n",
"6 DS Nintendo Nintendo EAD 9.1 \n",
"7 GB Nintendo Bullet Proof Software NaN \n",
"8 Wii Nintendo Nintendo EAD 8.6 \n",
"9 PC Mojang Mojang AB 10.0 \n",
"10 NES Nintendo Nintendo R&D1 NaN \n",
"11 Wii Nintendo Nintendo EAD 5.9 \n",
"12 X360 Microsoft Game Studios Good Science Studio 6.7 \n",
"13 DS Nintendo Nintendo EAD 8.4 \n",
"14 DS Nintendo Nintendo EAD 9.1 \n",
"15 GB Nintendo Game Freak 9.2 \n",
"16 Wii Nintendo Nintendo EAD 7.9 \n",
"17 Wii Nintendo Nintendo EAD 8.0 \n",
"18 SNES Nintendo Nintendo EAD 8.5 \n",
"19 PS3 Rockstar Games Rockstar North 9.4 \n",
"\n",
" User_Score Total_Shipped Global_Sales NA_Sales PAL_Sales JP_Sales \\\n",
"0 NaN 82.86 NaN NaN NaN NaN \n",
"1 NaN 40.24 NaN NaN NaN NaN \n",
"2 9.1 37.14 NaN NaN NaN NaN \n",
"3 NaN 36.60 NaN NaN NaN NaN \n",
"4 8.8 33.09 NaN NaN NaN NaN \n",
"5 NaN 31.38 NaN NaN NaN NaN \n",
"6 8.1 30.80 NaN NaN NaN NaN \n",
"7 NaN 30.26 NaN NaN NaN NaN \n",
"8 9.2 30.22 NaN NaN NaN NaN \n",
"9 NaN 30.01 NaN NaN NaN NaN \n",
"10 NaN 28.31 NaN NaN NaN NaN \n",
"11 4.5 28.02 NaN NaN NaN NaN \n",
"12 NaN 24.00 NaN NaN NaN NaN \n",
"13 NaN 23.96 NaN NaN NaN NaN \n",
"14 9.4 23.60 NaN NaN NaN NaN \n",
"15 NaN 23.10 NaN NaN NaN NaN \n",
"16 NaN 22.67 NaN NaN NaN NaN \n",
"17 NaN 21.13 NaN NaN NaN NaN \n",
"18 NaN 20.61 NaN NaN NaN NaN \n",
"19 NaN NaN 20.32 6.37 9.85 0.99 \n",
"\n",
" Other_Sales Year Unnamed: 0 Console Review Score \n",
"0 NaN 2006.0 NaN NaN NaN NaN \n",
"1 NaN 1985.0 NaN NaN NaN NaN \n",
"2 NaN 2008.0 NaN NaN NaN NaN \n",
"3 NaN 2017.0 NaN NaN NaN NaN \n",
"4 NaN 2009.0 NaN NaN NaN NaN \n",
"5 NaN 1998.0 NaN NaN NaN NaN \n",
"6 NaN 2006.0 NaN NaN NaN NaN \n",
"7 NaN 1989.0 NaN NaN NaN NaN \n",
"8 NaN 2009.0 NaN NaN NaN NaN \n",
"9 NaN 2010.0 NaN NaN NaN NaN \n",
"10 NaN 1985.0 NaN NaN NaN NaN \n",
"11 NaN 2007.0 NaN NaN NaN NaN \n",
"12 NaN 2010.0 NaN NaN NaN NaN \n",
"13 NaN 2005.0 NaN NaN NaN NaN \n",
"14 NaN 2005.0 NaN NaN NaN NaN \n",
"15 NaN 2000.0 NaN NaN NaN NaN \n",
"16 NaN 2008.0 NaN NaN NaN NaN \n",
"17 NaN 2009.0 NaN NaN NaN NaN \n",
"18 NaN 1991.0 NaN NaN NaN NaN \n",
"19 3.12 2013.0 NaN NaN NaN NaN \n"
]
}
],
"source": [ "source": [
"# merged = pd.merge(df1,df2, how='inner', sort=True) DOES NOT WORK\n", "# merged = pd.merge(df1,df2, how='inner', sort=True) DOES NOT WORK\n",
"# print(merged.head(10))\n", "# print(merged.head(10))\n",
"merged = df2.join(df1, lsuffix='merged') #Good\n", "columns = df1.columns\n",
"print(merged.head(10))" "merged = pd.concat([df2,df1], sort=False, ignore_index=True) #Good\n",
"print(merged.head(20))"
] ]
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": 34,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
"merged2.to_csv('merged_games.csv')" "merged.to_csv('merged_games.csv')"
] ]
}, },
{ {

View File

@@ -1,35 +1,13 @@
# Getting and combining data # Getting and combining data
import pandas as pd import pandas as pd
import numpy as np import numpy as np
import scout
# reading the data from numpy import ndarray
# -> MAKE SURE OF THE DATA FRAMES NAMES PEFORE YOU RUN IT from scipy.stats import binned_statistic
df1 = pd.read_csv("output_6th_df.csv") # Defining useful Functions to be used later
df2 = pd.read_csv("vgsales-12-4-2019-short.csv") def slice_column(input_df, column, expression=" "):
# ----------------------------------------------------------
# print(pf1.head)
# print(pf2.head)
# ---------------------------------------------------------
# merging
combined_df = df1.merge(df2, left_on="Name", right_on="Name", how="left")
print(combined_df)
combined_df.to_csv("output_final_df.csv")
df = combined_df
# ---------------------------------------------------------
def slice_column(input_df, output_df, column, expression=" "):
unclean = input_df[column].to_list() unclean = input_df[column].to_list()
clean = list() clean = list()
for record in unclean: for record in unclean:
@@ -38,9 +16,37 @@ def slice_column(input_df, output_df, column, expression=" "):
input_df = input_df.drop(columns=[column]) input_df = input_df.drop(columns=[column])
input_df[column] = clean input_df[column] = clean
input_df.to_csv(output_df) return input_df
def write_joined_df(left, right, lsuf="new_key"):
    """Stack ``right`` underneath ``left`` and return the combined frame.

    The rows of both frames are concatenated and the index is renumbered
    from zero. Columns are not sorted.

    Args:
        left: Frame whose rows come first.
        right: Frame whose rows are appended after ``left``.
        lsuf: Accepted but not used by this implementation.

    Returns:
        The concatenated DataFrame.
    """
    frames = [left, right]
    return pd.concat(frames, sort=False, ignore_index=True)
def slam_dunk(dataset, column, labels):
    """Bin ``column`` into equal-width labelled bins and add a per-bin value.

    Two columns are added to ``dataset``:

    * ``"bin_" + column``: the label of the equal-width bin each value
      falls into (one bin per entry of ``labels``).
    * ``"bin_value"``: the midpoint of the bin interval containing the
      value, or ``None`` when the value lies in no interval.

    Between the two steps the frame is passed through
    ``scout.cure_depression``.

    Args:
        dataset: DataFrame holding ``column``; it is modified in place.
        column: Name of the numeric column to bin.
        labels: Bin labels; their count determines the number of bins.

    Returns:
        The frame returned by ``scout.cure_depression`` with the two extra
        columns.
    """
    min_value = dataset[column].min()
    max_value = dataset[column].max()
    print("min: ", min_value, " max: ", max_value)

    # One equal-width bin per label.
    bins = np.linspace(min_value, max_value, len(labels) + 1)
    dunked_column = "bin_" + column
    dataset[dunked_column] = pd.cut(
        dataset[column], bins=bins, labels=labels, include_lowest=True
    )

    dataset = scout.cure_depression(dataset)

    x_data = np.arange(0, len(dataset))
    y_data = dataset[column]
    # Only the bin edges are needed; the per-bin statistic is discarded.
    _, bin_edges, _ = binned_statistic(
        y_data, x_data, statistic="median", bins=len(labels)
    )

    # Closed on both sides so the column minimum (the first left edge) is
    # covered, matching include_lowest=True above. set_to_median returns
    # the first matching interval, so a value sitting on an interior edge
    # still maps to the lower bin, exactly as with right-closed intervals.
    bin_intervals = pd.IntervalIndex.from_arrays(
        bin_edges[:-1], bin_edges[1:], closed="both"
    )
    dataset["bin_value"] = dataset[column].apply(
        lambda x: set_to_median(x, bin_intervals)
    )
    return dataset
def set_to_median(x, bin_intervals):
    """Return the midpoint of the first interval that contains ``x``.

    Args:
        x: Value to look up.
        bin_intervals: Iterable of interval objects supporting ``in`` and
            exposing a ``mid`` attribute.

    Returns:
        The ``mid`` of the first matching interval, or ``None`` when no
        interval contains ``x``.
    """
    matches = (candidate.mid for candidate in bin_intervals if x in candidate)
    return next(matches, None)

View File

@@ -1,23 +1,38 @@
# Visualisations for Data # Visualisations for Data
import matplotlib.pyplot as plt import matplotlib.pyplot as plt
import seaborn as sns import seaborn as sns
import gunner import mining_hq
from numpy import count_nonzero from numpy import count_nonzero
# Datasets prepared by mining_hq.
games_pre, games_dur, games_pos = (
    mining_hq.games_sales_split_pre,
    mining_hq.games_sales_split_dur,
    mining_hq.games_sales_split_pos,
)
crime_US, crime_CA = (
    mining_hq.crime_US_intersect,
    mining_hq.crime_CA_intersect,
)

# Global plot style: "ticks" theme without the top and right spines.
custom_params = {"axes.spines.right": False, "axes.spines.top": False}
sns.set_theme(style="ticks", rc=custom_params)

# Figure 1: distribution of games per year (pre split), with a KDE overlay.
plt.xticks(rotation=90)
games_fig_pre = sns.histplot(
    data=games_pre,
    x="Year",
    palette=sns.color_palette("flare"),
    kde=True,
)
plt.show()

# Figure 2: same distribution, stacked by genre.
plt.xticks(rotation=90)
games_fig2_pre = sns.histplot(
    data=games_pre,
    x="Year",
    hue="Genre",
    multiple="stack",
    shrink=0.65,
)
plt.show()

# Figure 3: NA sales per year for the "dur" split.
plt.xticks(rotation=90)
games_fig_dur = sns.barplot(data=games_dur, x="Year", y="NA_Sales")
plt.xlabel("Years")
plt.ylabel("Sales in North America (Canada, USA)")
plt.show()

# Figure 4: NA sales per year for the "pos" split.
plt.xticks(rotation=90)
games_fig_pos = sns.barplot(data=games_pos, x="Year", y="NA_Sales")
plt.show()

# Figure 5: count of non-zero "incidents" values per year (Canada).
plt.xticks(rotation=90)
crime_CA_fig = sns.barplot(
    data=crime_CA,
    x="year",
    y="incidents",
    estimator=count_nonzero,
)
plt.show()

View File

@@ -4,83 +4,28 @@
import pandas as pd import pandas as pd
import numpy as np import numpy as np
def drop_kick(col_list, dataframe):
    """Return a copy of ``dataframe`` without the columns in ``col_list``.

    Args:
        col_list: Column labels to remove.
        dataframe: Source DataFrame; it is not modified.

    Returns:
        A new DataFrame lacking the listed columns.
    """
    trimmed = dataframe.drop(columns=col_list)
    return trimmed
global games_dat
# Sales in NA
global game_sales_NA
global game_sales_NA_dur
global game_sales_NA_pre
global game_sales_NA_pos
# Sales Globally
global game_sales_GLO
# Crime Data
# Crime Recorded in The US
global crime_US
# Crime Recorded in Canada
global crime_CA
# Loading Datasets
game_sales_dat = pd.read_csv('datasets/videogames/vgsales-12-4-2019-short.csv')
games_dat = pd.read_csv('datasets/videogames/Games.xls')
crime_CA = pd.read_excel('datasets/crime/clean_crime_canada_dataset.xlsx')
crime_US = pd.read_csv('datasets/crime/report.csv')
# Printing information regarding datasets
print("Game Datasets' Info:\n")
game_sales_dat.info()
games_dat.info()
print("Crime Datasets' Info:\n")
crime_US.info()
crime_CA.info()
# Printing First n values (index start: 0)
print("Game Sale Data:\n", game_sales_dat.head(10))
print("Game Scores:\n", games_dat.head(10))
print("US Crime Data:\n", crime_US.head(10))
print("CA Crime Data:\n", crime_CA.head(10))
# Regarding the Games.xls dataset:
# Coercing the non-numeric values will result in NaN
# thus allowing easier removal through `.notnull()`
games_dat['Score'] = pd.to_numeric(games_dat['Score'], errors = 'coerce')
games_dat = games_dat[games_dat['Score'].notnull()]
print("Game Scores (Cleaned):\n", games_dat.head())
games_dat.info()
# Regarding the vgsales-12-4-2019 dataset
# Considering we will be using a US (probs CA too) crime datasets
# It wouldn't be that useful to have other columns regarding other regions
NA_col_list = ['PAL_Sales', 'JP_Sales', 'Other_Sales', 'Global_Sales']
GLO_col_list = ['PAL_Sales', 'JP_Sales', 'Other_Sales', 'NA_Sales']
game_sales_NA = game_sales_dat.drop(columns = NA_col_list, axis = 1)
game_sales_GLO = game_sales_dat.drop(columns = GLO_col_list, axis = 1)
print(f"Game Sales for NA:\n{game_sales_NA.head(10)} \nWith minimum year being: {game_sales_NA['Year'].min()}")
print(f"Game Sales Globally:\n{game_sales_GLO.head(10)}\nWith minimum year being: {game_sales_GLO['Year'].min()}")
# Getting the range of years which both datasets share # Getting the range of years which both datasets share
def year_interval(victim1, victim2, col1, col2):
    """Return the range of years covered by both frames.

    Args:
        victim1: First DataFrame.
        victim2: Second DataFrame.
        col1: Year column of ``victim1``.
        col2: Year column of ``victim2``.

    Returns:
        A ``(start, end)`` tuple: the later of the two minimum years and
        the earlier of the two maximum years.
    """
    years1 = victim1[col1]
    years2 = victim2[col2]
    start = max(years2.min(), years1.min())
    end = min(years2.max(), years1.max())
    return (start, end)
crime_CA = crime_CA[(crime_CA['year'] >= crime_year_min) & (crime_CA['year'] <= crime_year_max)]
crime_US = crime_US[(crime_US['report_year'] >= crime_year_min) & (crime_US['report_year'] <= crime_year_max)]
def intersect_by_year(victim1, victim2, col1, col2):
    """Restrict both frames to the years they have in common.

    Args:
        victim1: First DataFrame.
        victim2: Second DataFrame.
        col1: Year column of ``victim1``.
        col2: Year column of ``victim2``.

    Returns:
        A ``(victim1_subset, victim2_subset)`` tuple holding only the rows
        whose year lies inside the interval from ``year_interval``
        (both ends inclusive).
    """
    lower, upper = year_interval(victim1, victim2, col1, col2)
    keep1 = victim1[col1].between(lower, upper)
    keep2 = victim2[col2].between(lower, upper)
    return (victim1[keep1], victim2[keep2])
# Updating the NA game dataset to fit with the time ranges # Updating the NA game dataset to fit with the time ranges
def trisect_by_year(victim1, col, interval):
    """Split a frame into rows before, inside and after a year interval.

    Args:
        victim1: DataFrame to split.
        col: Name of the year column.
        interval: Indexable pair ``(start, end)``; both ends belong to the
            middle part.

    Returns:
        A ``(before, during, after)`` tuple of DataFrames. Rows whose year
        compares false against every bound (e.g. NaN) appear in none.
    """
    start, end = interval[0], interval[1]
    years = victim1[col]
    before = victim1[years < start]
    during = victim1[(years <= end) & (years >= start)]
    after = victim1[years > end]
    return (before, during, after)
print(f"Game Sales for NA:\n{game_sales_NA.head(10)}\nWith minimum year being: {game_sales_NA['Year'].min()}")
print(f"Game Sales Globally:\n{game_sales_GLO.head(10)}\nWith minimum year being: {game_sales_GLO['Year'].min()}")

View File

@@ -3,3 +3,117 @@
import pandas as pd import pandas as pd
import numpy as np import numpy as np
import seaborn as sns import seaborn as sns
# containment breach
import scipy as scp
import digger, gunner, scout
# Module-level results (games_merged_dat, games_sales_split_pre/dur/pos,
# crime_US_intersect, crime_CA_intersect) are read by other modules as
# attributes of this one, so their names must stay stable. Names assigned at
# module scope are already global; no ``global`` statement is required.

games_review = pd.read_csv("datasets/videogames/Games.xls")
games_sales = scout.cure_depression(
    pd.read_csv("datasets/videogames/vgsales-12-4-2019-short.csv")
)

print(games_review.count())
print(games_sales.count())

# Clean the review titles in two chained passes. The second pass must start
# from the result of the first, otherwise the first clean-up is thrown away.
# NOTE(review): presumably slice_column strips the given expression from the
# column's values -- confirm against digger.slice_column.
games_review_phase1 = digger.slice_column(games_review, "GameName", "Review")
games_review_final = digger.slice_column(
    games_review_phase1, "GameName", "(Import)"
)

games_merged_dat = digger.write_joined_df(games_sales, games_review_final)

# Acquisition of Merged dataset
print(games_merged_dat.count())

# Loading Crime Datasets
crime_CA = pd.read_excel("datasets/crime/clean_crime_canada_dataset.xlsx")
crime_US = pd.read_csv("datasets/crime/report.csv")

print(crime_US.isnull())
print(crime_CA.isnull())

# gunner.year_interval returns (lower bound, upper bound) of the shared years.
year_interval = gunner.year_interval(crime_US, crime_CA, "report_year", "year")
year_min = year_interval[0]
year_max = year_interval[1]

crime_intersect = gunner.intersect_by_year(
    crime_US, crime_CA, "report_year", "year"
)
crime_US_intersect = crime_intersect[0]
crime_CA_intersect = crime_intersect[1]

# Columns removed to keep only the North-America sales view.
NA_col_list = [
    "JP_Sales",
    "Other_Sales",
    "Global_Sales",
    "PAL_Sales",
    "GameName",
    "Review",
    "Console",
    "Score",
]
# Counterpart list for a global-sales view (not used below).
GLO_col_list = [
    "JP_Sales",
    "Other_Sales",
    "NA_Sales",
    "PAL_Sales",
    "GameName",
    "Review",
    "Console",
    "Score",
]

# Collecting Split-Up Datasets
games_merged_dat = gunner.drop_kick(NA_col_list, games_merged_dat)
games_merged_dat.to_csv("datasets/videogames/games_merged.csv", index=False)

sale_tri_split = gunner.trisect_by_year(games_merged_dat, "Year", year_interval)
games_sales_split_pre = sale_tri_split[0]
games_sales_split_dur = sale_tri_split[1]
games_sales_split_pos = sale_tri_split[2]

# Displaying Acquired Data
print("Dataset Info:\n")
games_sales_split_pre.info()
games_sales_split_dur.info()
games_sales_split_pos.info()

print("Yer forsaken Statistical Description:\n", games_sales_split_dur.describe())
print(
    games_sales_split_pre.head(5),
    games_sales_split_dur.head(5),
    games_sales_split_pos.head(5),
)

# Reload the merged games and clean them by binning.
# https://towardsdatascience.com/data-preprocessing-with-python-pandas-part-5-binning-c5bd5fd1b950
gammas = pd.read_csv("datasets/videogames/games_merged.csv")
labels = ["smol", "epik", "larg"]
gammas = digger.slam_dunk(gammas, "Critic_Score", labels=labels)

# Normalise the critic score with a z-score transform.
gammas['Critic_Score_Norm'] = scout.scaling_zscore(gammas, 'Critic_Score')
print(gammas['Critic_Score_Norm'].head(10))

# Saving all into a file
gammas.to_csv("datasets/videogames/games_cleanish.csv", index=False)

# Similarity / dissimilarity on 5 randomly selected rows.
chosen_idx = np.random.choice(len(gammas), replace=False, size=5)
sample_rows = gammas.iloc[chosen_idx]
print(sample_rows.head())
scout.dissimilarity(sample_rows)

View File

@@ -1 +1,67 @@
# Regression/Prediction (Totally gonna do later trust bro) # Regression/Prediction (Totally gonna do later trust bro)
from sklearn.linear_model import LinearRegression
from sklearn.impute import SimpleImputer
from sklearn import preprocessing
from scipy.spatial import distance
import scipy.stats as stats
import numpy as np
import pandas as pd
def cure_depression(dataset):
    """Fill gaps in every numeric column by forward linear interpolation.

    Non-numeric columns are left untouched. The frame is modified in place
    and also returned so calls can be chained.

    Args:
        dataset: DataFrame to repair.

    Returns:
        The same DataFrame object with its numeric columns interpolated.
    """
    numeric_columns = dataset.select_dtypes(include=np.number).columns
    if len(numeric_columns) == 0:
        # Nothing to interpolate.
        return dataset

    # ``limit_direction`` is the pandas keyword; the earlier ``limit_direct``
    # spelling is not an interpolate() option.
    dataset[numeric_columns] = dataset[numeric_columns].interpolate(
        method="linear", limit_direction="forward"
    )
    return dataset
def regression_expression(dataset, column, missing_value):
    """Predict the missing values of ``column`` with a linear regression.

    The model is trained on the rows where ``column`` is present and
    non-zero, using every other numeric column as a feature, and then
    predicts ``column`` for the rows where it is null.

    Args:
        dataset: DataFrame containing ``column`` and the feature columns.
        column: Name of the numeric column to predict.
        missing_value: Currently unused; kept so existing call sites keep
            working.

    Returns:
        A DataFrame of the numeric columns for the rows that had a null
        ``column``, with ``column`` replaced by the predictions. It is empty
        when nothing was missing. ``dataset`` itself is not modified.
    """
    lr = LinearRegression()
    numeric = dataset.select_dtypes(include=np.number)
    target = numeric[column]

    # Rows to fill in: the target is missing.
    testdf = numeric[target.isnull()].copy()
    # Rows to learn from: the target is present and non-zero.
    traindf = numeric[target.notnull() & (target != 0)]

    if testdf.empty:
        # Nothing to predict; an estimator cannot predict on zero rows.
        print(testdf.head(30))
        return testdf

    # NOTE(review): rows with NaN in a feature column are not handled here
    # and will make the estimator raise -- confirm the inputs are imputed
    # beforehand (e.g. via cure_depression).
    y = traindf[column]
    features = traindf.drop(columns=[column])
    lr.fit(features, y)

    # Predict from exactly the feature columns the model was fitted on.
    testdf[column] = lr.predict(testdf.drop(columns=[column]))
    print(testdf.head(30))
    return testdf
# https://scikit-learn.org/stable/modules/preprocessing.html#preprocessing
# That helps ^
# This boi should work, idk i'm implementing blindly
def scaling_zscore(dataframe, col):
    """Return the z-scores of one column, leaving NaN entries out of the statistics.

    Args:
        dataframe: DataFrame holding the column.
        col: Name of the column to standardise.

    Returns:
        The z-scored values as produced by ``scipy.stats.zscore`` with
        ``nan_policy="omit"``.
    """
    values = dataframe[col]
    return stats.zscore(values, axis=0, nan_policy="omit")
def dissimilarity(row_arr):
    """Print the pairwise Euclidean distance matrix of the given rows.

    Only numeric columns take part in the distance. A ``Rank`` column, if
    present, is left out (presumably because it is an ordinal identifier
    rather than a feature -- confirm). The header is sized to the number of
    rows passed in.

    Args:
        row_arr: DataFrame whose rows are compared with each other.

    Returns:
        None. The matrix is written to standard output.
    """
    numeric_rows = row_arr.select_dtypes(include=np.number)
    numeric_rows = numeric_rows.drop(columns="Rank", errors="ignore")
    vectors = numeric_rows.to_numpy()

    # One header cell per compared row.
    header_cells = "".join(
        " Entry {} |".format(number) for number in range(1, len(vectors) + 1)
    )
    print(" |" + header_cells)

    for i, left in enumerate(vectors):
        print("Entry " , i + 1, " | ", end="")
        for right in vectors:
            eucDist = distance.euclidean(left, right)
            print(" {:#.6g} |".format(eucDist), end="")
        print("\n")
def scaling_range(datashitter, col):
    """Min-max scale one column with ``MinMaxScaler``'s default range.

    The scaler is fitted on the non-null values of ``col`` only and then
    applied to the whole column.

    Args:
        datashitter: DataFrame holding the column.
        col: Name of the numeric column to scale.

    Returns:
        A 1-D NumPy array with one scaled value per row of the frame.
    """
    # MinMaxScaler expects 2-D input, so keep the column as a one-column frame.
    column_frame = datashitter[[col]]
    nonnull = column_frame[col].notna()

    minmax_scaler = preprocessing.MinMaxScaler()
    minmax_scaler.fit(column_frame[nonnull])
    return minmax_scaler.transform(column_frame).ravel()