new folder who dis
This commit is contained in:
57
dwarves/digger.py
Normal file
57
dwarves/digger.py
Normal file
@@ -0,0 +1,57 @@
|
||||
# Getting and combining data
|
||||
import pandas as pd
|
||||
import numpy as np
|
||||
from dwarves import scout
|
||||
from numpy import ndarray
|
||||
from scipy.stats import binned_statistic
|
||||
|
||||
|
||||
# Defining useful Functions to be used later
|
||||
def slice_column(input_df, column, expression=" "):
    """Truncate every string in ``column`` at the first ``expression``.

    Each string cell is split on ``expression`` and only the part before the
    first occurrence is kept. Cells that are not strings (for example missing
    values) are passed through unchanged instead of raising.

    Args:
        input_df: DataFrame containing ``column``. It is not modified.
        column: Name of the column to clean.
        expression: Separator marking where each value is cut off.

    Returns:
        A new DataFrame in which ``column`` has been dropped and re-added
        with the cleaned values, so it ends up as the last column.
    """
    clean = [
        record.split(expression)[0] if isinstance(record, str) else record
        for record in input_df[column].to_list()
    ]

    # drop() returns a new frame, so the caller's DataFrame stays untouched.
    input_df = input_df.drop(columns=[column])
    input_df[column] = clean
    return input_df
|
||||
|
||||
|
||||
def write_joined_df(left, right, lsuf="new_key"):
    """Stack ``right`` underneath ``left`` and renumber the rows.

    Columns present in only one of the frames are kept (missing cells become
    NaN) and the column order is not sorted.

    Args:
        left: First DataFrame.
        right: DataFrame appended after ``left``.
        lsuf: Not used by the body; kept so existing callers keep working.

    Returns:
        The concatenated DataFrame with a fresh 0..n-1 index.
    """
    frames = [left, right]
    return pd.concat(frames, sort=False, ignore_index=True)
|
||||
|
||||
|
||||
def slam_dunk(dataset, column, labels):
    """Bin a numeric column into equal-width bins and attach the results.

    Two columns are added:

    * ``"bin_" + column``: the label of the equal-width bin each value falls
      into (computed with ``pd.cut`` on the values as passed in).
    * ``"bin_value"``: the midpoint of the bin each value falls into
      (computed after ``scout.cure_depression`` has been applied).

    Args:
        dataset: DataFrame holding ``column``. The label column is added to
            this object in place.
        column: Name of the numeric column to bin.
        labels: One label per bin; its length sets the number of bins.

    Returns:
        The DataFrame returned by ``scout.cure_depression`` with both new
        columns present.
    """
    min_value = dataset[column].min()
    max_value = dataset[column].max()
    print("min: ", min_value, " max: ", max_value)

    # Equal-width edges: one more edge than there are labels.
    bins = np.linspace(min_value, max_value, len(labels) + 1)

    dunked_column = "bin_" + column
    dataset[dunked_column] = pd.cut(
        dataset[column], bins=bins, labels=labels, include_lowest=True
    )

    # NOTE(review): labels are assigned before this step, so rows whose value
    # is only filled in here keep an empty label - confirm that is intended.
    dataset = scout.cure_depression(dataset)

    x_data = np.arange(0, len(dataset))
    y_data = dataset[column]
    # Only the bin edges are needed; the per-bin statistic is discarded.
    _, bin_edges, _ = binned_statistic(
        y_data, x_data, statistic="median", bins=len(labels)
    )

    # closed="both" keeps the column minimum inside the first interval; with
    # the default right-closed intervals it matched no bin and got no value.
    # set_to_median returns the first match, so shared interior edges still
    # resolve to the lower bin exactly as before.
    bin_intervals = pd.IntervalIndex.from_arrays(
        bin_edges[:-1], bin_edges[1:], closed="both"
    )
    dataset["bin_value"] = dataset[column].apply(
        lambda x: set_to_median(x, bin_intervals)
    )

    return dataset
|
||||
|
||||
|
||||
def set_to_median(x, bin_intervals):
    """Return the midpoint of the first interval that contains ``x``.

    Args:
        x: Value to look up.
        bin_intervals: Iterable of interval objects supporting ``in`` and
            exposing a ``mid`` attribute.

    Returns:
        ``interval.mid`` of the first matching interval, or ``None`` when no
        interval contains ``x``.
    """
    matches = (interval.mid for interval in bin_intervals if x in interval)
    return next(matches, None)
|
||||
60
dwarves/engineer.py
Normal file
60
dwarves/engineer.py
Normal file
@@ -0,0 +1,60 @@
|
||||
# Visualisations for Data
|
||||
import matplotlib.pyplot as plt
|
||||
import seaborn as sns
|
||||
import pandas as pd
|
||||
import mining_hq
|
||||
from numpy import count_nonzero
|
||||
|
||||
# Datasets prepared at import time by mining_hq.
games_pre, games_dur, games_pos = (
    mining_hq.games_sales_split_pre,
    mining_hq.games_sales_split_dur,
    mining_hq.games_sales_split_pos,
)

crime_US, crime_CA = (
    mining_hq.crime_US_intersect,
    mining_hq.crime_CA_intersect,
)

# Shared look for every figure: "ticks" style without top/right spines.
custom_params = {"axes.spines.right": False, "axes.spines.top": False}
sns.set_theme(style="ticks", rc=custom_params)

# --- Games released before the shared crime-data interval -------------------
# Each plot below follows the same sequence: request vertical x tick labels,
# draw, then show the figure.
plt.xticks(rotation=90)
games_fig_pre = sns.histplot(
    data=games_pre, x="Year", palette=sns.color_palette("flare"), kde=True
)
plt.show()

plt.xticks(rotation=90)
games_fig2_pre = sns.histplot(
    data=games_pre, x="Year", hue="Genre", multiple="stack", kde=True
)
plt.show()

# --- Games released during the interval -------------------------------------
plt.xticks(rotation=90)
games_fig_dur = sns.histplot(data=games_dur, x="Year", kde=True)
plt.show()

plt.xticks(rotation=90)
games_fig2_dur = sns.histplot(
    data=games_dur, x="Year", hue="Genre", multiple="stack", kde=True
)
plt.show()

# NOTE(review): jointplot/relplot build their own figure, so the preceding
# plt.xticks call presumably lands on a separate, empty figure - confirm.
plt.xticks(rotation=90)
games_fig3_dur = sns.jointplot(data=games_dur, x="Year")
plt.show()

# --- Games released after the interval --------------------------------------
plt.xticks(rotation=90)
games_fig_pos = sns.histplot(data=games_pos, x="Year")
plt.show()

# --- Crime reports per year --------------------------------------------------
plt.xticks(rotation=90)
crime_CA_fig = sns.histplot(data=crime_CA, x="year")
plt.show()

plt.xticks(rotation=90)
crime_US_fig = sns.histplot(data=crime_US, x="report_year")
plt.show()

# --- Games versus violent crime ----------------------------------------------
# NOTE(review): this assignment aligns the two frames on their row index, not
# on the year columns, and writes into the frame owned by mining_hq - confirm
# both are intended.
games_dur["Violent_US"] = crime_US["violent_crimes"]
games_dur["NA_Sales"] = games_dur["NA_Sales"].multiply(1000)

plt.xticks(rotation=90)
games_violence_US = sns.relplot(data=games_dur, x="NA_Sales", y="Violent_US")
plt.show()

plt.xticks(rotation=90)
games_crime_dur = sns.jointplot(data=games_dur, x="Year", y="Violent_US")
plt.show()
|
||||
31
dwarves/gunner.py
Normal file
31
dwarves/gunner.py
Normal file
@@ -0,0 +1,31 @@
|
||||
# Cleaning of datasets
|
||||
# Somewhat main in the beninging
|
||||
|
||||
import pandas as pd
|
||||
import numpy as np
|
||||
|
||||
def drop_kick(col_list, dataframe):
    """Return a copy of ``dataframe`` without the columns in ``col_list``.

    Args:
        col_list: Column label or list of column labels to remove.
        dataframe: Source DataFrame; it is left unchanged.

    Returns:
        A new DataFrame lacking the requested columns.
    """
    # ``columns=`` already selects the column axis, so no ``axis`` is needed.
    trimmed = dataframe.drop(columns=col_list)
    return trimmed
|
||||
|
||||
|
||||
# Getting the range of years which both datasets share
|
||||
def year_interval(victim1, victim2, col1, col2):
    """Compute the range of values covered by both columns.

    Args:
        victim1: First DataFrame.
        victim2: Second DataFrame.
        col1: Column of ``victim1`` to inspect.
        col2: Column of ``victim2`` to inspect.

    Returns:
        A ``(start, end)`` tuple: the larger of the two column minima and the
        smaller of the two column maxima.
    """
    second_years = victim2[col2]
    first_years = victim1[col1]

    overlap_start = max(second_years.min(), first_years.min())
    overlap_end = min(second_years.max(), first_years.max())
    return overlap_start, overlap_end
|
||||
|
||||
|
||||
def intersect_by_year(victim1, victim2, col1, col2):
    """Restrict both frames to the range their two columns have in common.

    Args:
        victim1: First DataFrame.
        victim2: Second DataFrame.
        col1: Column of ``victim1`` to filter on.
        col2: Column of ``victim2`` to filter on.

    Returns:
        ``(victim1_rows, victim2_rows)`` containing only the rows whose value
        lies inside the interval from ``year_interval`` (both ends included).
    """
    bounds = year_interval(victim1, victim2, col1, col2)
    start, end = bounds[0], bounds[1]

    first_in_range = (victim1[col1] >= start) & (victim1[col1] <= end)
    first_overlap = victim1[first_in_range]

    second_in_range = (victim2[col2] >= start) & (victim2[col2] <= end)
    second_overlap = victim2[second_in_range]

    return first_overlap, second_overlap
|
||||
|
||||
|
||||
# Updating the NA game dataset to fit with the time ranges
|
||||
def trisect_by_year(victim1, col, interval):
    """Split a frame into rows before, inside and after an interval.

    Args:
        victim1: DataFrame to split.
        col: Column compared against the interval.
        interval: Indexable pair ``(start, end)``; both ends belong to the
            middle part.

    Returns:
        ``(before, during, after)`` DataFrames.
    """
    start, end = interval[0], interval[1]

    before = victim1[victim1[col] < start]
    inside = (victim1[col] <= end) & (victim1[col] >= start)
    during = victim1[inside]
    after = victim1[victim1[col] > end]

    return before, during, after
|
||||
119
dwarves/mining_hq.py
Normal file
119
dwarves/mining_hq.py
Normal file
@@ -0,0 +1,119 @@
|
||||
# Instantiating Main Python Script File
|
||||
# Collects stuff from the rest of the scripts
|
||||
import pandas as pd
|
||||
import numpy as np
|
||||
import seaborn as sns
|
||||
# containment breach
|
||||
import scipy as scp
|
||||
import digger, gunner, scout
|
||||
|
||||
# Instantiating globals to be used in other files
|
||||
# The names assigned below (games_merged_dat, games_sales_split_pre/dur/pos,
# crime_US_intersect, crime_CA_intersect, ...) are plain module attributes
# that other scripts read after importing this module. ``global`` statements
# have no effect at module scope, so none are needed.

# NOTE(review): Games.xls is parsed with read_csv; presumably the file holds
# CSV text despite its extension - confirm.
games_review = pd.read_csv("datasets/videogames/Games.xls")
games_sales = scout.cure_depression(
    pd.read_csv("datasets/videogames/vgsales-12-4-2019-short.csv")
)

print(games_review.count())
print(games_sales.count())

# Clean the game names in two passes: cut at "Review", then cut the result at
# "(Import)". The second pass must start from the first pass's output,
# otherwise the first pass is thrown away.
games_review_phase1 = digger.slice_column(games_review, "GameName", "Review")
games_review_final = digger.slice_column(
    games_review_phase1, "GameName", "(Import)"
)

games_merged_dat = digger.write_joined_df(games_sales, games_review_final)

# Acquisition of merged dataset
print(games_merged_dat.count())

# Loading crime datasets
crime_CA = pd.read_excel("datasets/crime/clean_crime_canada_dataset.xlsx")

crime_US = pd.read_csv("datasets/crime/report.csv")

print(crime_US.isnull())
print(crime_CA.isnull())

year_interval = gunner.year_interval(crime_US, crime_CA, "report_year", "year")

# NOTE: gunner.year_interval returns (largest minimum, smallest maximum), so
# ``year_max`` holds the start of the shared range and ``year_min`` its end.
# The names are kept as they are because other modules may read them.
year_max = year_interval[0]
year_min = year_interval[1]

crime_intersect = gunner.intersect_by_year(crime_US, crime_CA, "report_year", "year")

crime_US_intersect = crime_intersect[0]
crime_CA_intersect = crime_intersect[1]

# Columns removed to keep only the North-American sales figures.
NA_col_list = [
    "JP_Sales",
    "Other_Sales",
    "Global_Sales",
    "PAL_Sales",
    "GameName",
    "Review",
    "Console",
    "Score",
]
# Counterpart list that keeps Global_Sales instead; not used in this script.
GLO_col_list = [
    "JP_Sales",
    "Other_Sales",
    "NA_Sales",
    "PAL_Sales",
    "GameName",
    "Review",
    "Console",
    "Score",
]

# Trim the merged games data, persist it, then split it around the crime
# year interval.
games_merged_dat = gunner.drop_kick(NA_col_list, games_merged_dat)
games_merged_dat.to_csv("datasets/videogames/games_merged.csv", index=False)

sale_tri_split = gunner.trisect_by_year(games_merged_dat, "Year", year_interval)

games_sales_split_pre = sale_tri_split[0]
games_sales_split_dur = sale_tri_split[1]
games_sales_split_pos = sale_tri_split[2]

# Displaying acquired data
print("Dataset Info:\n")
games_sales_split_pre.info()
games_sales_split_dur.info()
games_sales_split_pos.info()

print("Yer forsaken Statistical Description:\n", games_sales_split_dur.describe())

print(
    games_sales_split_pre.head(5),
    games_sales_split_dur.head(5),
    games_sales_split_pos.head(5),
)

# Reload the merged games file and bin the critic scores.
# Binning approach follows:
# https://towardsdatascience.com/data-preprocessing-with-python-pandas-part-5-binning-c5bd5fd1b950
gammas = pd.read_csv("datasets/videogames/games_merged.csv")
labels = ["smol", "epik", "larg"]
gammas = digger.slam_dunk(gammas, "Critic_Score", labels=labels)

# Standardise the critic scores with a z-score transform.
gammas['Critic_Score_Norm'] = scout.scaling_zscore(gammas, 'Critic_Score')
print(gammas['Critic_Score_Norm'].head(10))

# Saving everything into a file
gammas.to_csv("datasets/videogames/games_cleanish.csv", index=False)

# Similarity / dissimilarity: pick 5 distinct random rows and print their
# pairwise distances.
chosen_idx = np.random.choice(len(gammas), replace=False, size=5)
sample_rows = gammas.iloc[chosen_idx]
print(sample_rows.head())

scout.dissimilarity(sample_rows)
|
||||
67
dwarves/scout.py
Normal file
67
dwarves/scout.py
Normal file
@@ -0,0 +1,67 @@
|
||||
# Regression/Prediction (Totally gonna do later trust bro)
|
||||
from sklearn.linear_model import LinearRegression
|
||||
from sklearn.impute import SimpleImputer
|
||||
from sklearn import preprocessing
|
||||
from scipy.spatial import distance
|
||||
import scipy.stats as stats
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
|
||||
|
||||
def cure_depression(dataset):
    """Fill gaps in the numeric columns by linear interpolation.

    Only columns with a numeric dtype are touched; every other column is left
    as it is. The frame is modified in place and also returned.

    Args:
        dataset: DataFrame to repair.

    Returns:
        The same DataFrame object, with missing numeric values interpolated
        in the forward direction.
    """
    numeric_columns = dataset.select_dtypes(include=np.number).columns

    # Nothing to interpolate; calling interpolate() on a frame without any
    # numeric column can raise in pandas, so return early.
    if numeric_columns.empty:
        return dataset

    # The keyword is ``limit_direction``; the earlier misspelling was silently
    # swallowed as an extra keyword argument.
    dataset[numeric_columns] = dataset[numeric_columns].interpolate(
        method="linear", limit_direction="forward"
    )
    return dataset
|
||||
|
||||
|
||||
# Linear-regression helper for predicting a numeric column (work in progress)
|
||||
def regression_expression(dataset, column, missing_value):
    """Predict the missing values of ``column`` with a linear regression.

    The model is trained on the rows where ``column`` is present and
    non-zero, using every other numeric column as a feature, and is then
    applied to the rows where ``column`` is null. The predicted rows are
    printed; nothing is written back into ``dataset``.

    Previously the prediction rows were selected with the same filter as the
    training rows and still contained the target column, so ``predict``
    always failed with a feature mismatch.

    Args:
        dataset: DataFrame holding ``column`` and the feature columns.
        column: Name of the numeric column to predict.
        missing_value: Currently unused; kept so existing calls keep working.

    Returns:
        None.
    """
    numeric = dataset.select_dtypes(include=np.number)
    is_missing = numeric[column].isnull()

    # Rows to predict: the target is null. Copy so the predictions can be
    # stored without writing into a slice of ``numeric``.
    testdf = numeric[is_missing].copy()

    # Rows to learn from: the target is present and non-zero.
    traindf = numeric[~is_missing]
    traindf = traindf[traindf[column] != 0]

    if testdf.empty:
        # Nothing to predict; the estimator rejects empty input.
        print(testdf.head(30))
        return

    y = traindf[column]
    features = traindf.drop(columns=column)

    # NOTE(review): presumably the feature columns have already been cleaned
    # of NaN (e.g. via cure_depression) - confirm, the estimator needs that.
    lr = LinearRegression()
    lr.fit(features, y)

    # Predict from the same feature columns the model was fitted on.
    testdf[column] = lr.predict(testdf.drop(columns=column))
    print(testdf.head(30))
|
||||
|
||||
# https://scikit-learn.org/stable/modules/preprocessing.html#preprocessing
|
||||
# That helps ^
|
||||
# This boi should work, idk i'm implementing blindly
|
||||
def scaling_zscore(dataframe, col):
    """Standardise one column with a z-score transform.

    Args:
        dataframe: DataFrame holding the column.
        col: Name of the column to standardise.

    Returns:
        The result of ``scipy.stats.zscore`` for the column, with NaN values
        left out of the statistics (``nan_policy="omit"``).
    """
    column_values = dataframe[col]
    return stats.zscore(column_values, axis=0, nan_policy="omit")
|
||||
|
||||
def dissimilarity(row_arr):
    """Print the pairwise Euclidean distance table for the given rows.

    Only numeric columns take part in the distance; a ``Rank`` column, when
    present, is left out. The header is sized to the number of rows, so the
    table stays consistent for any sample size.

    Args:
        row_arr: DataFrame whose rows are compared with each other.

    Returns:
        None. The table is written to standard output.
    """
    features = row_arr.select_dtypes(include=np.number)
    # errors="ignore": a frame without a Rank column simply has nothing to drop.
    features = features.drop("Rank", axis=1, errors="ignore")

    # Materialise the rows once instead of re-indexing inside the nested loop.
    vectors = [features.iloc[k] for k in range(len(features))]

    header_cells = "".join(
        " Entry {} |".format(k) for k in range(1, len(vectors) + 1)
    )
    print(" |" + header_cells)

    for i, left in enumerate(vectors, start=1):
        print("Entry ", i, " | ", end="")
        for right in vectors:
            eucDist = distance.euclidean(left, right)
            print(" {:#.6g} |".format(eucDist), end="")
        print("\n")
|
||||
|
||||
def scaling_range(datashitter, col):
    """Rescale one column to the [0, 1] range with min-max scaling.

    The scaler is fitted on the non-null values of ``col`` only and then
    applied to the whole column. Previously it was fitted on the rows where
    ``col`` is null, across every column of the frame, and then asked to
    transform a 1-D Series, which the scaler rejects.

    Args:
        datashitter: DataFrame holding the column.
        col: Name of the numeric column to rescale.

    Returns:
        A 1-D NumPy array with one scaled value per row of the input.
    """
    # The scaler works on 2-D input, so keep the column as a one-column frame.
    values = datashitter[[col]]
    known_values = values[values[col].notna()]

    minmax_scaler = preprocessing.MinMaxScaler()
    minmax_scaler.fit(known_values)

    # Flatten the (n_rows, 1) result so it can be assigned back as a column.
    return minmax_scaler.transform(values).ravel()
|
||||
Reference in New Issue
Block a user