Committing binning

This commit is contained in:
2023-03-30 16:19:43 +02:00
parent 17b3e05b41
commit eb4861ecc9
3 changed files with 6 additions and 7 deletions

View File

@@ -1,6 +1,7 @@
# Getting and combining data # Getting and combining data
import pandas as pd import pandas as pd
import numpy as np import numpy as np
import scout
from numpy import ndarray from numpy import ndarray
from scipy.stats import binned_statistic from scipy.stats import binned_statistic
@@ -24,7 +25,6 @@ def write_joined_df(left, right, lsuf="new_key"):
def slam_dunk(dataset, column, labels): def slam_dunk(dataset, column, labels):
dataset[column] = dataset[column].fillna(0, inplace = True)
min_value = dataset[column].min() min_value = dataset[column].min()
max_value = dataset[column].max() max_value = dataset[column].max()
print("min: ", min_value, " max: ", max_value) print("min: ", min_value, " max: ", max_value)
@@ -36,14 +36,13 @@ def slam_dunk(dataset, column, labels):
dataset[column], bins=bins, labels=labels, include_lowest=True dataset[column], bins=bins, labels=labels, include_lowest=True
) )
# filling column with means dataset = scout.cure_depression(dataset)
dataset[column] = dataset[column].interpolate(method = "linear", limit_direction = "backward", limit = 1)
x_data = np.arange(0, len(dataset)) x_data = np.arange(0, len(dataset))
y_data = dataset[column] y_data = dataset[column]
x_bins, bin_edges, misc = binned_statistic(y_data, x_data, statistic='median', bins=len(labels)) x_bins, bin_edges, misc = binned_statistic(y_data, x_data, statistic='median', bins=len(labels))
bin_intervals = pd.IntervalIndex.from_arrays(bin_edges[:-1], bin_edges[1:]) bin_intervals = pd.IntervalIndex.from_arrays(bin_edges[:-1], bin_edges[1:])
dataset['PooShi'] = dataset[column].apply(lambda x: set_to_median(x, bin_intervals)) dataset['bin_value'] = dataset[column].apply(lambda x: set_to_median(x, bin_intervals))
return dataset return dataset

View File

@@ -22,11 +22,11 @@ games_fig2_pre = sns.histplot(data = games_pre, x = "Year", hue = "Genre", multi
plt.show() plt.show()
plt.xticks(rotation = 90) plt.xticks(rotation = 90)
games_fig_dur = sns.barplot(data = games_dur, x = "Year", y = "NA_Sales", estimator=count_nonzero) games_fig_dur = sns.barplot(data = games_dur, x = "Year", y = "NA_Sales")
plt.show() plt.show()
plt.xticks(rotation = 90) plt.xticks(rotation = 90)
games_fig_pos = sns.barplot(data = games_pos, x = "Year", y = "NA_Sales", estimator=count_nonzero) games_fig_pos = sns.barplot(data = games_pos, x = "Year", y = "NA_Sales")
plt.show() plt.show()
plt.xticks(rotation = 90) plt.xticks(rotation = 90)

View File

@@ -13,7 +13,7 @@ global games_sales_split_dur
global games_sales_split_pos global games_sales_split_pos
games_review = pd.read_csv("datasets/videogames/Games.xls") games_review = pd.read_csv("datasets/videogames/Games.xls")
games_sales = pd.read_csv("datasets/videogames/vgsales.csv") games_sales = scout.cure_depression(pd.read_csv("datasets/videogames/vgsales-12-4-2019-short.csv"))
print(games_review.count()) print(games_review.count())
print(games_sales.count()) print(games_sales.count())