Euclidean Distances n stuff

This commit is contained in:
2023-03-30 17:09:38 +02:00
parent eb4861ecc9
commit 172bcf3c23
2 changed files with 35 additions and 11 deletions

View File

@@ -1,10 +1,11 @@
# Instantiating Main Python Script File # Instantiating Main Python Script File
# Collects stuff from the rest of the scripts # Collects stuff from the rest of the scripts
import pandas as pd import pandas as pd
import scout
import numpy as np import numpy as np
import seaborn as sns import seaborn as sns
import digger, gunner # containment breach
import scipy as scp
import digger, gunner, scout
# Instantiating globals to be used in other files # Instantiating globals to be used in other files
global games_merged_dat global games_merged_dat
@@ -48,6 +49,7 @@ NA_col_list = [
"JP_Sales", "JP_Sales",
"Other_Sales", "Other_Sales",
"Global_Sales", "Global_Sales",
"PAL_Sales",
"GameName", "GameName",
"Review", "Review",
"Console", "Console",
@@ -57,6 +59,7 @@ GLO_col_list = [
"JP_Sales", "JP_Sales",
"Other_Sales", "Other_Sales",
"NA_Sales", "NA_Sales",
"PAL_Sales",
"GameName", "GameName",
"Review", "Review",
"Console", "Console",
@@ -96,18 +99,30 @@ print(
games_sales_split_dur.head(5), games_sales_split_dur.head(5),
games_sales_split_pos.head(5), games_sales_split_pos.head(5),
) )
# Required to use binning for cleaning, idk
# https://towardsdatascience.com/data-preprocessing-with-python-pandas-part-5-binning-c5bd5fd1b950
# Also need to transform using Z-score (normal distr go brrrr lmao), or min-max
# Need similarity and dissimilarity, scipy time
# Load merged gammas # Load merged gammas
# Required to use binning for cleaning, idk (DONE LESGOOOOOOOOOOOOOOOOOOOO)
# https://towardsdatascience.com/data-preprocessing-with-python-pandas-part-5-binning-c5bd5fd1b950
gammas = pd.read_csv("datasets/videogames/games_merged.csv") gammas = pd.read_csv("datasets/videogames/games_merged.csv")
labels = ["smol", "epik", "larg"] labels = ["smol", "epik", "larg"]
gammas = digger.slam_dunk(gammas, "Critic_Score", labels=labels) gammas = digger.slam_dunk(gammas, "Critic_Score", labels=labels)
# gammas = gammas[gammas["Genre"].isna() == False] # gammas = gammas[gammas["Genre"].isna() == False]
# gammas = scout.cure_depression(gammas) # gammas = scout.cure_depression(gammas)
# Also need to transform using Z-score (normal distr go brrrr lmao), or min-max
# ah, scheiße
# nvm, done, kekW
gammas['Critic_Score'] = scout.scaling_zscore(gammas, 'Critic_Score')
print(gammas['Critic_Score'].head(10))
# Saving all into a file
gammas.to_csv("output.csv", index=False) gammas.to_csv("output.csv", index=False)
# Need similarity and dissimilarity, scipy time
# Selecting 5 random rows
chosen_idx = np.random.choice(len(gammas), replace = False, size = 5)
sample_rows = gammas.iloc[chosen_idx]
print(sample_rows.head())
scout.dissimilarity(sample_rows.select_dtypes(include = np.number))

View File

@@ -2,6 +2,8 @@
from sklearn.linear_model import LinearRegression from sklearn.linear_model import LinearRegression
from sklearn.impute import SimpleImputer from sklearn.impute import SimpleImputer
from sklearn import preprocessing from sklearn import preprocessing
from scipy.spatial import distance
import scipy.stats as stats
import numpy as np import numpy as np
import pandas as pd import pandas as pd
@@ -43,9 +45,16 @@ def regression_expression(dataset, column, missing_value):
# https://scikit-learn.org/stable/modules/preprocessing.html#preprocessing # https://scikit-learn.org/stable/modules/preprocessing.html#preprocessing
# That helps ^ # That helps ^
# This boi should work, idk i'm implementing blindly # This boi should work, idk i'm implementing blindly
def scaling_zscore(dataframe, col):
    """Return the z-score standardisation of one column.

    Delegates to ``scipy.stats.zscore`` along axis 0 with
    ``nan_policy="omit"``, so missing values are left out of the mean and
    standard deviation instead of turning every result into NaN.

    Args:
        dataframe: Object indexable by ``col`` (the caller passes a pandas
            DataFrame).
        col: Label of the column to standardise.

    Returns:
        The standardised values, one per input row, as returned by
        ``scipy.stats.zscore``.
        NOTE(review): the concrete container type (ndarray vs. Series) is
        whatever the installed scipy version returns -- confirm if callers
        start relying on it.
    """
    # The earlier StandardScaler-based version fitted on a 1-D column, which
    # scikit-learn rejects (it expects 2-D input); scipy handles 1-D directly.
    return stats.zscore(dataframe[col], axis=0, nan_policy="omit")
def dissimilarity(row_arr):
    """Print the pairwise Euclidean distance between every pair of rows.

    For each row ``i`` a ``"| "`` line is printed, followed by one
    ``Dissim {i}{j}: {distance} |`` line per row ``j`` (including ``j == i``),
    and then a blank separator.

    Args:
        row_arr: Object supporting ``len()`` and positional ``.iloc[...]``
            row access (the caller passes a numeric-only pandas DataFrame).

    Returns:
        None. The distances are only written to standard output.
    """
    n_rows = len(row_arr)
    # range() is required here: iterating directly over the int returned by
    # len() raises "TypeError: 'int' object is not iterable".
    for i in range(n_rows):
        print("| ")
        for j in range(n_rows):
            euc_dist = distance.euclidean(row_arr.iloc[i], row_arr.iloc[j])
            print(f"Dissim {i}{j}: {euc_dist} |")
        print("\n")
def scaling_range(datashitter, col): def scaling_range(datashitter, col):
nonnull = datashitter[col].isna() nonnull = datashitter[col].isna()