Euclidean Distances n stuff
This commit is contained in:
@@ -1,10 +1,11 @@
|
|||||||
# Instantiating Main Python Script File
|
# Instantiating Main Python Script File
|
||||||
# Collects stuff from the rest of the scripts
|
# Collects stuff from the rest of the scripts
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
import scout
|
|
||||||
import numpy as np
|
import numpy as np
|
||||||
import seaborn as sns
|
import seaborn as sns
|
||||||
import digger, gunner
|
# containment breach
|
||||||
|
import scipy as scp
|
||||||
|
import digger, gunner, scout
|
||||||
|
|
||||||
# Instantiating globals to be used in other files
|
# Instantiating globals to be used in other files
|
||||||
global games_merged_dat
|
global games_merged_dat
|
||||||
@@ -48,6 +49,7 @@ NA_col_list = [
|
|||||||
"JP_Sales",
|
"JP_Sales",
|
||||||
"Other_Sales",
|
"Other_Sales",
|
||||||
"Global_Sales",
|
"Global_Sales",
|
||||||
|
"PAL_Sales",
|
||||||
"GameName",
|
"GameName",
|
||||||
"Review",
|
"Review",
|
||||||
"Console",
|
"Console",
|
||||||
@@ -57,6 +59,7 @@ GLO_col_list = [
|
|||||||
"JP_Sales",
|
"JP_Sales",
|
||||||
"Other_Sales",
|
"Other_Sales",
|
||||||
"NA_Sales",
|
"NA_Sales",
|
||||||
|
"PAL_Sales",
|
||||||
"GameName",
|
"GameName",
|
||||||
"Review",
|
"Review",
|
||||||
"Console",
|
"Console",
|
||||||
@@ -96,18 +99,30 @@ print(
|
|||||||
games_sales_split_dur.head(5),
|
games_sales_split_dur.head(5),
|
||||||
games_sales_split_pos.head(5),
|
games_sales_split_pos.head(5),
|
||||||
)
|
)
|
||||||
# Required to use binning for cleaning, idk
|
|
||||||
# https://towardsdatascience.com/data-preprocessing-with-python-pandas-part-5-binning-c5bd5fd1b950
|
|
||||||
|
|
||||||
# Also need to transform using Z-score (normal distr go brrrr lmao), or min-max
|
|
||||||
|
|
||||||
# Need similarity and dissimilarity, scipy time
|
|
||||||
|
|
||||||
# Load merged gammas
|
# Load merged gammas
|
||||||
|
|
||||||
|
# Required to use binning for cleaning, idk (DONE LESGOOOOOOOOOOOOOOOOOOOO)
|
||||||
|
# https://towardsdatascience.com/data-preprocessing-with-python-pandas-part-5-binning-c5bd5fd1b950
|
||||||
gammas = pd.read_csv("datasets/videogames/games_merged.csv")
|
gammas = pd.read_csv("datasets/videogames/games_merged.csv")
|
||||||
labels = ["smol", "epik", "larg"]
|
labels = ["smol", "epik", "larg"]
|
||||||
gammas = digger.slam_dunk(gammas, "Critic_Score", labels=labels)
|
gammas = digger.slam_dunk(gammas, "Critic_Score", labels=labels)
|
||||||
# gammas = gammas[gammas["Genre"].isna() == False]
|
# gammas = gammas[gammas["Genre"].isna() == False]
|
||||||
# gammas = scout.cure_depression(gammas)
|
# gammas = scout.cure_depression(gammas)
|
||||||
|
|
||||||
|
# Also need to transform using Z-score (normal distr go brrrr lmao), or min-max
|
||||||
|
# ah, scheiße
|
||||||
|
# nvm, done, kekW
|
||||||
|
gammas['Critic_Score'] = scout.scaling_zscore(gammas, 'Critic_Score')
|
||||||
|
print(gammas['Critic_Score'].head(10))
|
||||||
|
|
||||||
|
# Saving all into a file
|
||||||
gammas.to_csv("output.csv", index=False)
|
gammas.to_csv("output.csv", index=False)
|
||||||
|
|
||||||
|
# Need similarity and dissimilarity, scipy time
|
||||||
|
# Selecting 5 random rows
|
||||||
|
chosen_idx = np.random.choice(len(gammas), replace = False, size = 5)
|
||||||
|
sample_rows = gammas.iloc[chosen_idx]
|
||||||
|
print(sample_rows.head())
|
||||||
|
|
||||||
|
scout.dissimilarity(sample_rows.select_dtypes(include = np.number))
|
||||||
|
|||||||
@@ -2,6 +2,8 @@
|
|||||||
from sklearn.linear_model import LinearRegression
|
from sklearn.linear_model import LinearRegression
|
||||||
from sklearn.impute import SimpleImputer
|
from sklearn.impute import SimpleImputer
|
||||||
from sklearn import preprocessing
|
from sklearn import preprocessing
|
||||||
|
from scipy.spatial import distance
|
||||||
|
import scipy.stats as stats
|
||||||
import numpy as np
|
import numpy as np
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
|
|
||||||
@@ -43,9 +45,16 @@ def regression_expression(dataset, column, missing_value):
|
|||||||
# https://scikit-learn.org/stable/modules/preprocessing.html#preprocessing
|
# https://scikit-learn.org/stable/modules/preprocessing.html#preprocessing
|
||||||
# That helps ^
|
# That helps ^
|
||||||
# This boi should work, idk i'm implementing blindly
|
# This boi should work, idk i'm implementing blindly
|
||||||
def scaling_zscore(dataframe, col):
    """Return the z-scores of a single column, ignoring missing values.

    Args:
        dataframe: Table to read from; it is indexed as ``dataframe[col]``.
        col: Label of the column to standardise.

    Returns:
        The output of ``scipy.stats.zscore`` for the selected column, with
        one value per input row in the original row order.
    """
    # nan_policy="omit" tells scipy to leave NaN entries out of the mean and
    # standard-deviation computation instead of letting one missing value
    # turn the whole result into NaN.
    return stats.zscore(dataframe[col], axis=0, nan_policy="omit")
|
|
||||||
|
def dissimilarity(row_arr):
    """Print the Euclidean distance between every ordered pair of rows.

    Args:
        row_arr: Table of numeric rows supporting ``len()`` and positional
            ``.iloc[...]`` access (presumably a pandas DataFrame restricted
            to numeric columns -- confirm against the caller).

    Returns:
        None. For each row ``i`` a ``"| "`` line is printed, then one
        ``"Dissim ij: <distance> |"`` line per row ``j``, then a blank
        separator.
    """
    # len() returns an int, which cannot be iterated directly; pull the rows
    # out by position once so both loops can walk over them.
    rows = [row_arr.iloc[pos] for pos in range(len(row_arr))]

    for i, row_i in enumerate(rows):
        print("| ")
        for j, row_j in enumerate(rows):
            euc_dist = distance.euclidean(row_i, row_j)
            print(f"Dissim {i}{j}: {euc_dist} |")
        print("\n")
|
||||||
|
|
||||||
def scaling_range(datashitter, col):
|
def scaling_range(datashitter, col):
|
||||||
nonnull = datashitter[col].isna()
|
nonnull = datashitter[col].isna()
|
||||||
|
|||||||
Reference in New Issue
Block a user