From eb4861ecc9bac0a909370c926dcd13226a7b3d05 Mon Sep 17 00:00:00 2001 From: Supermjork Date: Thu, 30 Mar 2023 16:19:43 +0200 Subject: [PATCH] Committing binning --- py_scripts/digger.py | 7 +++---- py_scripts/engineer.py | 4 ++-- py_scripts/mining_hq.py | 2 +- 3 files changed, 6 insertions(+), 7 deletions(-) diff --git a/py_scripts/digger.py b/py_scripts/digger.py index 6be548f..17b46fe 100644 --- a/py_scripts/digger.py +++ b/py_scripts/digger.py @@ -1,6 +1,7 @@ # Getting and combining data import pandas as pd import numpy as np +import scout from numpy import ndarray from scipy.stats import binned_statistic @@ -24,7 +25,6 @@ def write_joined_df(left, right, lsuf="new_key"): def slam_dunk(dataset, column, labels): - dataset[column] = dataset[column].fillna(0, inplace = True) min_value = dataset[column].min() max_value = dataset[column].max() print("min: ", min_value, " max: ", max_value) @@ -36,14 +36,13 @@ def slam_dunk(dataset, column, labels): dataset[column], bins=bins, labels=labels, include_lowest=True ) - # filling column with means - dataset[column] = dataset[column].interpolate(method = "linear", limit_direction = "backward", limit = 1) + dataset = scout.cure_depression(dataset) x_data = np.arange(0, len(dataset)) y_data = dataset[column] x_bins, bin_edges, misc = binned_statistic(y_data, x_data, statistic='median', bins=len(labels)) bin_intervals = pd.IntervalIndex.from_arrays(bin_edges[:-1], bin_edges[1:]) - dataset['PooShi'] = dataset[column].apply(lambda x: set_to_median(x, bin_intervals)) + dataset['bin_value'] = dataset[column].apply(lambda x: set_to_median(x, bin_intervals)) return dataset diff --git a/py_scripts/engineer.py b/py_scripts/engineer.py index ceda338..a6d3796 100644 --- a/py_scripts/engineer.py +++ b/py_scripts/engineer.py @@ -22,11 +22,11 @@ games_fig2_pre = sns.histplot(data = games_pre, x = "Year", hue = "Genre", multi plt.show() plt.xticks(rotation = 90) -games_fig_dur = sns.barplot(data = games_dur, x = "Year", y = "NA_Sales", estimator=count_nonzero) +games_fig_dur = sns.barplot(data = games_dur, x = "Year", y = "NA_Sales") plt.show() plt.xticks(rotation = 90) -games_fig_pos = sns.barplot(data = games_pos, x = "Year", y = "NA_Sales", estimator=count_nonzero) +games_fig_pos = sns.barplot(data = games_pos, x = "Year", y = "NA_Sales") plt.show() plt.xticks(rotation = 90) diff --git a/py_scripts/mining_hq.py b/py_scripts/mining_hq.py index 53b8e85..4f85c98 100644 --- a/py_scripts/mining_hq.py +++ b/py_scripts/mining_hq.py @@ -13,7 +13,7 @@ global games_sales_split_dur global games_sales_split_pos games_review = pd.read_csv("datasets/videogames/Games.xls") -games_sales = pd.read_csv("datasets/videogames/vgsales.csv") +games_sales = scout.cure_depression(pd.read_csv("datasets/videogames/vgsales-12-4-2019-short.csv")) print(games_review.count()) print(games_sales.count())