I have depression

2023-03-30 10:53:53 +02:00
parent cab64843da
commit 572c0175bf
4 changed files with 34 additions and 17 deletions
--- a/py_scripts/digger.py
+++ b/py_scripts/digger.py
@@ -21,13 +21,15 @@ def write_joined_df(left, right, lsuf="new_key"):
    return merged


-def slam_dunk(dataset, column, size, labels):
+def slam_dunk(dataset, column, labels):
+    """Bin a numeric column into equal-width, labelled intervals.
+
+    The bin edges span the column's minimum to its maximum, with one bin
+    per entry in ``labels``. The result is stored in a new column named
+    ``"bin_" + column``; ``dataset`` is modified in place.
+
+    Args:
+        dataset: DataFrame containing ``column``.
+        column: Name of the numeric column to bin.
+        labels: Bin labels, lowest bin first.
+
+    Returns:
+        The same ``dataset`` object, now including the binned column.
+    """
    min_value = dataset[column].min()
    max_value = dataset[column].max()
-    bins = np.linspace(min_value, max_value, size)
+    print("min: ", min_value, " max: ", max_value)
+    # N labels need N + 1 edges, so the bin count always matches the labels.
+    bins = np.linspace(min_value, max_value, len(labels) + 1)

    dunked_column = "bin_" + column
    dataset[dunked_column] = pd.cut(
        dataset[column], bins=bins, labels=labels, include_lowest=True
    )
-    return dataset[dunked_column]
+    return dataset
--- a/py_scripts/engineer.py
+++ b/py_scripts/engineer.py
@@ -22,13 +22,13 @@ games_fig2_pre = sns.histplot(data = games_pre, x = "Year", hue = "Genre", multi
 plt.show()

 plt.xticks(rotation = 90)
-games_fig_dur = sns.barplot(data = games_dur, x = "Year", y = "NA_Sales")
+games_fig_dur = sns.barplot(data = games_dur, x = "Year", y = "NA_Sales", estimator=count_nonzero)
 plt.show()

 plt.xticks(rotation = 90)
-games_fig_pos = sns.barplot(data = games_pos, x = "Year", y = "NA_Sales")
+games_fig_pos = sns.barplot(data = games_pos, x = "Year", y = "NA_Sales", estimator=count_nonzero)
 plt.show()

 plt.xticks(rotation = 90)
-crime_CA_fig = sns.barplot(data = crime_CA, x = "year", y = "incidents")
+crime_CA_fig = sns.barplot(data = crime_CA, x = "year", y = "incidents", estimator=count_nonzero)
 plt.show()
--- a/py_scripts/mining_hq.py
+++ b/py_scripts/mining_hq.py
@@ -25,7 +25,6 @@ games_merged_dat = digger.write_joined_df(games_sales, games_review_final)

 # Acquisition of Merged dataset
 print(games_merged_dat.count())
-games_merged_dat.to_csv("datasets/videogames/games_merged.csv", index=False)

 # Loading Crime Datasets
 crime_CA = pd.read_excel("datasets/crime/clean_crime_canada_dataset.xlsx")
@@ -50,23 +49,26 @@ NA_col_list = [
    "JP_Sales",
    "Other_Sales",
    "Global_Sales",
-    "User_Score",
    "GameName",
    "Review",
+    "Console",
+    "Score",
 ]
 GLO_col_list = [
    "PAL_Sales",
    "JP_Sales",
    "Other_Sales",
    "NA_Sales",
-    "User_Score",
    "GameName",
    "Review",
+    "Console",
+    "Score",
 ]

 # Splitting crime datasets
 # Collecting Split-Up Datasets
 games_merged_dat = gunner.drop_kick(NA_col_list, games_merged_dat)
+games_merged_dat.to_csv("datasets/videogames/games_merged.csv", index=False)

 sale_tri_split = gunner.trisect_by_year(games_merged_dat, "Year", year_interval)

@@ -89,7 +91,7 @@ games_sales_split_pre.info()
 games_sales_split_dur.info()
 games_sales_split_pos.info()

-print(games_sales_split_dur.describe())
+print("Yer forsaken Statistical Description:\n", games_sales_split_dur.describe())

 print(
    games_sales_split_pre.head(5),
@@ -105,10 +107,9 @@ print(

 # Load merged gammas

-gammas = pd.read_excel("datasets/videogames/merged_games.xlsx")
+gammas = pd.read_csv("datasets/videogames/games_merged.csv")
 labels = ["smol", "epik", "larg"]
-gammas["User_Score"] = digger.slam_dunk(gammas, "User_Score", 3, labels=labels)
-gammas = gammas[gammas["Genre"].isna() == False]
-gammas = scout.cure_depression(gammas)
+gammas = digger.slam_dunk(gammas, "Critic_Score", labels=labels)
+# gammas = gammas[gammas["Genre"].isna() == False]
+# gammas = scout.cure_depression(gammas)
 gammas.to_csv("output.csv", index=False)
-scout.regression_expression(gammas, "Global_Sales", 0)
--- a/py_scripts/scout.py
+++ b/py_scripts/scout.py
@@ -1,8 +1,9 @@
 # Regression/Prediction (Totally gonna do later trust bro)
 from sklearn.linear_model import LinearRegression
-
 from sklearn.impute import SimpleImputer
+from sklearn import preprocessing
 import numpy as np
+import pandas as pd


 def cure_depression(dataset):
@@ -16,6 +17,7 @@ def cure_depression(dataset):
    return dataset


+# Fits a LinearRegression on the numeric columns and prints the predicted `column`.
 def regression_expression(dataset, column, missing_value):
    lr = LinearRegression()
    numeric = dataset.select_dtypes(include=np.number)
@@ -32,9 +34,21 @@ def regression_expression(dataset, column, missing_value):
    y = traindf[column]
    traindf.drop(column, axis=1, inplace=True)
    lr.fit(traindf, y)
-    testdf.drop(column, axis=1, inplace=True)
    pred = lr.predict(testdf)
    # can't put this in data set directly because length no match
    # join testdf and traindf to form dataset perhaps??
    testdf[column] = pred
    print(testdf.head(30))
+
+# Reference: https://scikit-learn.org/stable/modules/preprocessing.html#preprocessing
+# The scaling helpers below follow that guide.
+# NOTE: these helpers have not been tested yet.
+def scaling_zscore(datashitter, col):
+    """Standardize the selected column(s) to zero mean and unit variance.
+
+    Args:
+        datashitter: DataFrame holding the data to scale.
+        col: A single column label, or a list-like of column labels.
+
+    Returns:
+        A 2-D NumPy array of z-scores, one column per selected label.
+    """
+    # scikit-learn scalers only accept 2-D input; indexing with a bare label
+    # yields a 1-D Series and makes ``fit`` raise ValueError.
+    columns = list(col) if pd.api.types.is_list_like(col) else [col]
+    return preprocessing.StandardScaler().fit_transform(datashitter[columns])
+
+def scaling_range(datashitter, col):
+    """Rescale the selected column(s) to the [0, 1] range (min-max scaling).
+
+    Args:
+        datashitter: DataFrame holding the data to scale.
+        col: A single column label, or a list-like of column labels.
+
+    Returns:
+        A 2-D NumPy array of scaled values, one column per selected label.
+        Missing values stay missing in the output.
+    """
+    # Select a 2-D frame: scikit-learn scalers reject a 1-D Series.
+    columns = list(col) if pd.api.types.is_list_like(col) else [col]
+    # MinMaxScaler disregards NaNs while fitting and keeps them when
+    # transforming, so no explicit non-null mask is needed.
+    return preprocessing.MinMaxScaler().fit_transform(datashitter[columns])