Committing binning
This commit is contained in:
@@ -1,6 +1,7 @@
|
|||||||
# Getting and combining data
|
# Getting and combining data
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
import numpy as np
|
import numpy as np
|
||||||
|
import scout
|
||||||
from numpy import ndarray
|
from numpy import ndarray
|
||||||
from scipy.stats import binned_statistic
|
from scipy.stats import binned_statistic
|
||||||
|
|
||||||
@@ -24,7 +25,6 @@ def write_joined_df(left, right, lsuf="new_key"):
|
|||||||
|
|
||||||
|
|
||||||
def slam_dunk(dataset, column, labels):
|
def slam_dunk(dataset, column, labels):
|
||||||
dataset[column] = dataset[column].fillna(0, inplace = True)
|
|
||||||
min_value = dataset[column].min()
|
min_value = dataset[column].min()
|
||||||
max_value = dataset[column].max()
|
max_value = dataset[column].max()
|
||||||
print("min: ", min_value, " max: ", max_value)
|
print("min: ", min_value, " max: ", max_value)
|
||||||
@@ -36,14 +36,13 @@ def slam_dunk(dataset, column, labels):
|
|||||||
dataset[column], bins=bins, labels=labels, include_lowest=True
|
dataset[column], bins=bins, labels=labels, include_lowest=True
|
||||||
)
|
)
|
||||||
|
|
||||||
# filling column with means
|
dataset = scout.cure_depression(dataset)
|
||||||
dataset[column] = dataset[column].interpolate(method = "linear", limit_direction = "backward", limit = 1)
|
|
||||||
|
|
||||||
x_data = np.arange(0, len(dataset))
|
x_data = np.arange(0, len(dataset))
|
||||||
y_data = dataset[column]
|
y_data = dataset[column]
|
||||||
x_bins, bin_edges, misc = binned_statistic(y_data, x_data, statistic='median', bins=len(labels))
|
x_bins, bin_edges, misc = binned_statistic(y_data, x_data, statistic='median', bins=len(labels))
|
||||||
bin_intervals = pd.IntervalIndex.from_arrays(bin_edges[:-1], bin_edges[1:])
|
bin_intervals = pd.IntervalIndex.from_arrays(bin_edges[:-1], bin_edges[1:])
|
||||||
dataset['PooShi'] = dataset[column].apply(lambda x: set_to_median(x, bin_intervals))
|
dataset['bin_value'] = dataset[column].apply(lambda x: set_to_median(x, bin_intervals))
|
||||||
|
|
||||||
return dataset
|
return dataset
|
||||||
|
|
||||||
|
|||||||
@@ -22,11 +22,11 @@ games_fig2_pre = sns.histplot(data = games_pre, x = "Year", hue = "Genre", multi
|
|||||||
plt.show()
|
plt.show()
|
||||||
|
|
||||||
plt.xticks(rotation = 90)
|
plt.xticks(rotation = 90)
|
||||||
games_fig_dur = sns.barplot(data = games_dur, x = "Year", y = "NA_Sales", estimator=count_nonzero)
|
games_fig_dur = sns.barplot(data = games_dur, x = "Year", y = "NA_Sales")
|
||||||
plt.show()
|
plt.show()
|
||||||
|
|
||||||
plt.xticks(rotation = 90)
|
plt.xticks(rotation = 90)
|
||||||
games_fig_pos = sns.barplot(data = games_pos, x = "Year", y = "NA_Sales", estimator=count_nonzero)
|
games_fig_pos = sns.barplot(data = games_pos, x = "Year", y = "NA_Sales")
|
||||||
plt.show()
|
plt.show()
|
||||||
|
|
||||||
plt.xticks(rotation = 90)
|
plt.xticks(rotation = 90)
|
||||||
|
|||||||
@@ -13,7 +13,7 @@ global games_sales_split_dur
|
|||||||
global games_sales_split_pos
|
global games_sales_split_pos
|
||||||
|
|
||||||
games_review = pd.read_csv("datasets/videogames/Games.xls")
|
games_review = pd.read_csv("datasets/videogames/Games.xls")
|
||||||
games_sales = pd.read_csv("datasets/videogames/vgsales.csv")
|
games_sales = scout.cure_depression(pd.read_csv("datasets/videogames/vgsales-12-4-2019-short.csv"))
|
||||||
|
|
||||||
print(games_review.count())
|
print(games_review.count())
|
||||||
print(games_sales.count())
|
print(games_sales.count())
|
||||||
|
|||||||
Reference in New Issue
Block a user