Merge pull request #6 from LinlyBoi/finishing-touches

Finishing touches
This commit is contained in:
Linly
2023-03-31 18:02:02 +02:00
committed by GitHub
9 changed files with 3863 additions and 2876 deletions

File diff suppressed because it is too large Load Diff

946
dwarves/Mining_HQ.ipynb Normal file

File diff suppressed because one or more lines are too long

View File

@@ -40,13 +40,18 @@ def slam_dunk(dataset, column, labels):
x_data = np.arange(0, len(dataset))
y_data = dataset[column]
x_bins, bin_edges, misc = binned_statistic(y_data, x_data, statistic='median', bins=len(labels))
x_bins, bin_edges, misc = binned_statistic(
y_data, x_data, statistic="median", bins=len(labels)
)
bin_intervals = pd.IntervalIndex.from_arrays(bin_edges[:-1], bin_edges[1:])
dataset['bin_value'] = dataset[column].apply(lambda x: set_to_median(x, bin_intervals))
dataset["bin_value"] = dataset[column].apply(
lambda x: set_to_median(x, bin_intervals)
)
return dataset
def set_to_median(x, bin_intervals):
    """Return the midpoint of the bin interval that contains *x*.

    Parameters
    ----------
    x : scalar
        Value to place into a bin.
    bin_intervals : iterable of pandas.Interval
        Candidate bins, e.g. a ``pd.IntervalIndex``; the first interval
        that contains *x* wins.

    Returns
    -------
    The ``mid`` of the matching interval, or ``None`` when *x* lies
    outside every interval.

    Notes
    -----
    Half-open intervals exclude one outer edge (right-closed bins exclude
    the lowest edge, left-closed bins the highest), so a value sitting
    exactly on that edge would match nothing. Such a value is assigned to
    the outermost bin it touches instead of being dropped.
    """
    first = None
    last = None
    for interval in bin_intervals:
        if first is None:
            first = interval
        last = interval
        if x in interval:
            return interval.mid

    # No interval matched: fall back to the outer edges so the extreme
    # value of the binned data still lands in a bin.
    if first is not None:
        if x == first.left:
            return first.mid
        if x == last.right:
            return last.mid
    return None

62
dwarves/engineer.py Normal file
View File

@@ -0,0 +1,62 @@
# Visualisations for Data
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import mining_hq
from numpy import count_nonzero
# Script body: draws a series of seaborn figures from the data frames
# prepared in mining_hq, showing each one with plt.show() before moving on.

# The three games-sales splits and the two crime tables exposed by mining_hq.
games_pre = mining_hq.games_sales_split_pre
games_dur = mining_hq.games_sales_split_dur
games_pos = mining_hq.games_sales_split_pos
crime_US = mining_hq.crime_US_intersect
crime_CA = mining_hq.crime_CA_intersect
# Global seaborn theme: 'ticks' style with the right and top spines hidden.
custom_params = {"axes.spines.right": False, "axes.spines.top": False}
sns.set_theme(style = 'ticks', rc = custom_params)
# Every figure below is preceded by a 90-degree x tick label rotation.
# NOTE(review): plt.xticks runs before the plot exists, so it acts on
# whatever the current axes is at that moment — confirm the rotation
# actually reaches the figure that gets shown.
plt.xticks(rotation = 90)
# Histogram of "Year" with a KDE curve for the first games split.
# NOTE(review): palette is passed without hue; seaborn may ignore it — confirm.
games_fig_pre = sns.histplot(data = games_pre, x = "Year", palette = sns.color_palette("flare"), kde = True)
plt.show()
plt.xticks(rotation = 90)
# Same split, bars stacked per "Genre".
games_fig2_pre = sns.histplot(data = games_pre, x = "Year", hue = "Genre", multiple = "stack", kde = True)
plt.show()
plt.xticks(rotation = 90)
# Histogram of "Year" with a KDE curve for the second games split.
games_fig_dur = sns.histplot(data = games_dur, x = "Year", kde = True)
plt.show()
plt.xticks(rotation = 90)
# Second split, bars stacked per "Genre".
games_fig2_dur = sns.histplot(data = games_dur, x = "Year", hue = "Genre", multiple = "stack", kde = True)
plt.show()
plt.xticks(rotation = 90)
# Joint plot of the second split given only the "Year" column (no y).
games_fig3_dur = sns.jointplot(data = games_dur, x = "Year")
plt.show()
plt.xticks(rotation = 90)
# Plain histogram of "Year" for the third games split.
games_fig_pos = sns.histplot(data = games_pos, x = "Year")
plt.show()
plt.xticks(rotation = 90)
# Bar plot of 'incident_sum' per "year" for the Canadian crime table.
crime_CA_fig = sns.barplot(data = crime_CA, x = "year", y = 'incident_sum')
plt.show()
plt.xticks(rotation = 90)
# Bar plot of "sum_violence" per "report_year" for the US crime table.
crime_US_fig = sns.barplot(data = crime_US, x = "report_year", y = "sum_violence")
plt.show()
# Attach the US 'violent_crimes' column to the second games split and scale
# NA_Sales by 1000. games_dur is the same object mining_hq holds, so both
# assignments modify that shared frame.
# NOTE(review): the column assignment pairs rows by index label — verify the
# two frames share an index; the unit behind the x1000 factor is not visible
# here — confirm.
games_dur['Violent_US'] = crime_US['violent_crimes']
games_dur['NA_Sales'] = games_dur['NA_Sales'].multiply(1000)
plt.xticks(rotation = 90)
# Relational plot of NA_Sales against Violent_US.
games_violence_US = sns.relplot(data = games_dur, x = 'NA_Sales', y = 'Violent_US')
# Close figure number 1 before showing.
# NOTE(review): presumably this discards the figure opened by the preceding
# plt.xticks call, since relplot draws on its own figure — confirm.
plt.close(1)
plt.show()
plt.xticks(rotation = 90)
# Joint plot of "Year" against Violent_US for the second games split.
games_crime_dur = sns.jointplot(data = games_dur, x = "Year", y = 'Violent_US')
# Same figure-1 cleanup as above (see NOTE on the previous plt.close call's
# purpose: presumed, not confirmed).
plt.close(1)
plt.show()

View File

@@ -2,10 +2,9 @@
# Collects stuff from the rest of the scripts
import pandas as pd
import numpy as np
import seaborn as sns
# containment breach
import scipy as scp
import digger, gunner, scout
import gunner, digger, gunner, scout
# Instantiating globals to be used in other files
global games_merged_dat
@@ -32,8 +31,8 @@ crime_CA = pd.read_excel("datasets/crime/clean_crime_canada_dataset.xlsx")
# Load the US crime report.
crime_US = pd.read_csv("datasets/crime/report.csv")
# Print the full boolean null masks of both crime tables.
print(crime_US.isnull())
print(crime_CA.isnull())
# Print the number of missing values per column. sum() is needed here:
# the mask returned by isnull() contains no NaN itself, so count() on it
# would report the row count for every column instead of the null count.
print(crime_US.isnull().sum())
print(crime_CA.isnull().sum())
# Year interval derived by gunner from the year columns of the two tables.
year_interval = gunner.year_interval(crime_US, crime_CA, "report_year", "year")
@@ -117,3 +116,4 @@ sample_rows = gammas.iloc[chosen_idx]
print(sample_rows.head())
scout.dissimilarity(sample_rows)
scout.similarity(sample_rows)

View File

@@ -52,7 +52,7 @@ def dissimilarity(row_arr):
row_arr = row_arr.select_dtypes(include = np.number)
row_arr = row_arr.drop('Rank', axis = 1)
print(" | Entry 1 | Entry 2 | Entry 3 | Entry 4 | Entry 5 |")
print(" Dissim | Entry 1 | Entry 2 | Entry 3 | Entry 4 | Entry 5 |")
for i in range(len(row_arr)):
print("Entry " , i + 1, " | ", end = "")
for j in range(len(row_arr)):
@@ -60,6 +60,18 @@ def dissimilarity(row_arr):
print(" {:#.6g} |".format(eucDist), end = "")
print("\n")
def similarity(row_arr):
    """Print a pairwise cosine-similarity table for the rows of *row_arr*.

    Only numeric columns are compared, and the 'Rank' column is dropped
    before comparing. Each cell is ``1 - distance.cosine(row_i, row_j)``
    printed with six significant digits.

    Parameters
    ----------
    row_arr : pandas.DataFrame
        Rows to compare; must contain a numeric 'Rank' column (dropping a
        missing column raises ``KeyError``).

    Returns
    -------
    None. The table is written to standard output.
    """
    numeric = row_arr.select_dtypes(include = np.number)
    numeric = numeric.drop('Rank', axis = 1)
    count = len(numeric)

    # Build the header from the actual number of rows so its columns line
    # up with the cells printed below; for five rows this is the original
    # fixed header text.
    header = "Similarity|" + "".join(
        " Entry {} |".format(k) for k in range(1, count + 1)
    )
    print(header)

    for i in range(count):
        print("Entry ", i + 1, " | ", end = "")
        current = numeric.iloc[i]
        for j in range(count):
            sim = 1 - distance.cosine(current, numeric.iloc[j])
            print(" {:#.6g} |".format(sim), end = "")
        # Ends the row and leaves one blank line between rows.
        print("\n")
def scaling_range(datashitter, col):
nonnull = datashitter[col].isna()
minmax_scaler = preprocessing.MinMaxScaler()

View File

@@ -1,38 +0,0 @@
# Visualisations for Data
import matplotlib.pyplot as plt
import seaborn as sns
import mining_hq
from numpy import count_nonzero
# Script body: draws a series of seaborn figures from the data frames
# prepared in mining_hq, showing each one with plt.show() before moving on.

# The three games-sales splits and the two crime tables exposed by mining_hq.
games_pre = mining_hq.games_sales_split_pre
games_dur = mining_hq.games_sales_split_dur
games_pos = mining_hq.games_sales_split_pos
crime_US = mining_hq.crime_US_intersect
crime_CA = mining_hq.crime_CA_intersect
# Global seaborn theme: 'ticks' style with the right and top spines hidden.
custom_params = {"axes.spines.right": False, "axes.spines.top": False}
sns.set_theme(style = 'ticks', rc = custom_params)
# Every figure below is preceded by a 90-degree x tick label rotation.
# NOTE(review): plt.xticks runs before the plot exists — confirm the rotation
# reaches the figure that gets shown.
plt.xticks(rotation = 90)
# Histogram of "Year" with a KDE curve for the first games split.
# NOTE(review): palette is passed without hue; seaborn may ignore it — confirm.
games_fig_pre = sns.histplot(data = games_pre, x = "Year", palette = sns.color_palette("flare"), kde = True)
plt.show()
plt.xticks(rotation = 90)
# Same split, bars stacked per "Genre" and shrunk to 0.65 of the bin width.
games_fig2_pre = sns.histplot(data = games_pre, x = "Year", hue = "Genre", multiple = "stack", shrink = 0.65)
plt.show()
plt.xticks(rotation = 90)
# Bar plot of "NA_Sales" per "Year" for the second games split, with
# explicit axis labels.
games_fig_dur = sns.barplot(data = games_dur, x = "Year", y = "NA_Sales")
plt.xlabel("Years")
plt.ylabel("Sales in North America (Canada, USA)")
plt.show()
plt.xticks(rotation = 90)
# Bar plot of "NA_Sales" per "Year" for the third games split.
games_fig_pos = sns.barplot(data = games_pos, x = "Year", y = "NA_Sales")
plt.show()
plt.xticks(rotation = 90)
# Bar plot per "year" for the Canadian crime table; bar height is the number
# of non-zero "incidents" values (count_nonzero is used as the estimator).
crime_CA_fig = sns.barplot(data = crime_CA, x = "year", y = "incidents", estimator=count_nonzero)
plt.show()