Merge pull request #6 from LinlyBoi/finishing-touches
Finishing touches
This commit is contained in:
Binary file not shown.
File diff suppressed because it is too large
Load Diff
946
dwarves/Mining_HQ.ipynb
Normal file
946
dwarves/Mining_HQ.ipynb
Normal file
File diff suppressed because one or more lines are too long
@@ -40,13 +40,18 @@ def slam_dunk(dataset, column, labels):
|
|||||||
|
|
||||||
x_data = np.arange(0, len(dataset))
|
x_data = np.arange(0, len(dataset))
|
||||||
y_data = dataset[column]
|
y_data = dataset[column]
|
||||||
x_bins, bin_edges, misc = binned_statistic(y_data, x_data, statistic='median', bins=len(labels))
|
x_bins, bin_edges, misc = binned_statistic(
|
||||||
|
y_data, x_data, statistic="median", bins=len(labels)
|
||||||
|
)
|
||||||
bin_intervals = pd.IntervalIndex.from_arrays(bin_edges[:-1], bin_edges[1:])
|
bin_intervals = pd.IntervalIndex.from_arrays(bin_edges[:-1], bin_edges[1:])
|
||||||
dataset['bin_value'] = dataset[column].apply(lambda x: set_to_median(x, bin_intervals))
|
dataset["bin_value"] = dataset[column].apply(
|
||||||
|
lambda x: set_to_median(x, bin_intervals)
|
||||||
|
)
|
||||||
|
|
||||||
return dataset
|
return dataset
|
||||||
|
|
||||||
|
|
||||||
def set_to_median(x, bin_intervals):
|
def set_to_median(x, bin_intervals):
|
||||||
for interval in bin_intervals:
|
for interval in bin_intervals:
|
||||||
if x in interval:
|
if x in interval:
|
||||||
return interval.mid
|
return interval.mid
|
||||||
62
dwarves/engineer.py
Normal file
62
dwarves/engineer.py
Normal file
@@ -0,0 +1,62 @@
|
|||||||
|
# Visualisations for Data
|
||||||
|
import matplotlib.pyplot as plt
|
||||||
|
import seaborn as sns
|
||||||
|
import pandas as pd
|
||||||
|
import mining_hq
|
||||||
|
from numpy import count_nonzero
|
||||||
|
|
||||||
|
# Data frames prepared by mining_hq: game sales split into three periods
# (pre / dur / pos) and the two crime tables.
games_pre = mining_hq.games_sales_split_pre
games_dur = mining_hq.games_sales_split_dur
games_pos = mining_hq.games_sales_split_pos

crime_US = mining_hq.crime_US_intersect
crime_CA = mining_hq.crime_CA_intersect

# Hide the top and right spines on every plot.
custom_params = {"axes.spines.right": False, "axes.spines.top": False}

sns.set_theme(style="ticks", rc=custom_params)

# --- Axes-level plots -------------------------------------------------------
# histplot/barplot draw on the current axes, so the rotation requested just
# before each call lands on the axes that is then drawn on and shown.

plt.xticks(rotation=90)
# NOTE(review): no `hue` is given here, so `palette` presumably has no visible
# effect -- confirm against the seaborn version in use.
games_fig_pre = sns.histplot(
    data=games_pre, x="Year", palette=sns.color_palette("flare"), kde=True
)
plt.show()

plt.xticks(rotation=90)
games_fig2_pre = sns.histplot(
    data=games_pre, x="Year", hue="Genre", multiple="stack", kde=True
)
plt.show()

plt.xticks(rotation=90)
games_fig_dur = sns.histplot(data=games_dur, x="Year", kde=True)
plt.show()

plt.xticks(rotation=90)
games_fig2_dur = sns.histplot(
    data=games_dur, x="Year", hue="Genre", multiple="stack", kde=True
)
plt.show()

# --- Figure-level plot ------------------------------------------------------
# jointplot builds its own figure. Calling plt.xticks() beforehand would create
# a separate empty figure (shown as a blank window) and leave the joint axes
# unrotated, so the rotation is applied to the grid's joint axes instead.
games_fig3_dur = sns.jointplot(data=games_dur, x="Year")
games_fig3_dur.ax_joint.tick_params(axis="x", labelrotation=90)
plt.show()

plt.xticks(rotation=90)
games_fig_pos = sns.histplot(data=games_pos, x="Year")
plt.show()

plt.xticks(rotation=90)
crime_CA_fig = sns.barplot(data=crime_CA, x="year", y="incident_sum")
plt.show()

plt.xticks(rotation=90)
crime_US_fig = sns.barplot(data=crime_US, x="report_year", y="sum_violence")
plt.show()

# Attach the US violent-crime column to the "dur" sales frame and rescale
# NA_Sales by 1000.
# NOTE(review): column assignment aligns on index labels -- confirm that
# games_dur and crime_US share an index, otherwise unmatched rows become NaN.
games_dur["Violent_US"] = crime_US["violent_crimes"]
games_dur["NA_Sales"] = games_dur["NA_Sales"].multiply(1000)

# relplot is figure-level as well: rotate the labels on its own axes rather
# than creating (and then closing) a throwaway figure with plt.xticks().
games_violence_US = sns.relplot(data=games_dur, x="NA_Sales", y="Violent_US")
games_violence_US.ax.tick_params(axis="x", labelrotation=90)
plt.show()

games_crime_dur = sns.jointplot(data=games_dur, x="Year", y="Violent_US")
games_crime_dur.ax_joint.tick_params(axis="x", labelrotation=90)
plt.show()
|
||||||
@@ -2,10 +2,9 @@
|
|||||||
# Collects stuff from the rest of the scripts
|
# Collects stuff from the rest of the scripts
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
import numpy as np
|
import numpy as np
|
||||||
import seaborn as sns
|
|
||||||
# containment breach
|
# containment breach
|
||||||
import scipy as scp
|
import scipy as scp
|
||||||
import digger, gunner, scout
|
import gunner, digger, gunner, scout
|
||||||
|
|
||||||
# Instantiating globals to be used in other files
|
# Instantiating globals to be used in other files
|
||||||
global games_merged_dat
|
global games_merged_dat
|
||||||
@@ -32,8 +31,8 @@ crime_CA = pd.read_excel("datasets/crime/clean_crime_canada_dataset.xlsx")
|
|||||||
|
|
||||||
crime_US = pd.read_csv("datasets/crime/report.csv")
|
crime_US = pd.read_csv("datasets/crime/report.csv")
|
||||||
|
|
||||||
# Report the number of missing values per column for both crime tables.
# `.sum()` adds up the True cells of the boolean mask; `.count()` would count
# the non-NA cells of the mask itself, which is always the full row count.
print(crime_US.isnull().sum())
print(crime_CA.isnull().sum())
|
||||||
|
|
||||||
year_interval = gunner.year_interval(crime_US, crime_CA, "report_year", "year")
|
year_interval = gunner.year_interval(crime_US, crime_CA, "report_year", "year")
|
||||||
|
|
||||||
@@ -117,3 +116,4 @@ sample_rows = gammas.iloc[chosen_idx]
|
|||||||
print(sample_rows.head())
|
print(sample_rows.head())
|
||||||
|
|
||||||
scout.dissimilarity(sample_rows)
|
scout.dissimilarity(sample_rows)
|
||||||
|
scout.similarity(sample_rows)
|
||||||
@@ -52,7 +52,7 @@ def dissimilarity(row_arr):
|
|||||||
row_arr = row_arr.select_dtypes(include = np.number)
|
row_arr = row_arr.select_dtypes(include = np.number)
|
||||||
row_arr = row_arr.drop('Rank', axis = 1)
|
row_arr = row_arr.drop('Rank', axis = 1)
|
||||||
|
|
||||||
print(" | Entry 1 | Entry 2 | Entry 3 | Entry 4 | Entry 5 |")
|
print(" Dissim | Entry 1 | Entry 2 | Entry 3 | Entry 4 | Entry 5 |")
|
||||||
for i in range(len(row_arr)):
|
for i in range(len(row_arr)):
|
||||||
print("Entry " , i + 1, " | ", end = "")
|
print("Entry " , i + 1, " | ", end = "")
|
||||||
for j in range(len(row_arr)):
|
for j in range(len(row_arr)):
|
||||||
@@ -60,6 +60,18 @@ def dissimilarity(row_arr):
|
|||||||
print(" {:#.6g} |".format(eucDist), end = "")
|
print(" {:#.6g} |".format(eucDist), end = "")
|
||||||
print("\n")
|
print("\n")
|
||||||
|
|
||||||
|
def similarity(row_arr):
    """Print a pairwise cosine-similarity table for the rows of *row_arr*.

    Only numeric columns are compared, and the 'Rank' column is dropped
    before comparison. Each cell is ``1 - distance.cosine(row_i, row_j)``.

    Args:
        row_arr: DataFrame with one entry per row; it must contain a numeric
            'Rank' column (dropping it raises ``KeyError`` otherwise).

    Returns:
        None. The table is written to standard output.
    """
    row_arr = row_arr.select_dtypes(include=np.number)
    row_arr = row_arr.drop('Rank', axis=1)

    # Work on plain row vectors so each row is extracted once instead of
    # being looked up with .iloc inside the nested loop.
    vectors = row_arr.to_numpy()

    # One header cell per row actually present (previously fixed at five).
    header_cells = "".join(
        " Entry {} |".format(number) for number in range(1, len(vectors) + 1)
    )
    print("Similarity|" + header_cells)

    for number, left in enumerate(vectors, start=1):
        print("Entry ", number, " | ", end="")
        for right in vectors:
            sim = 1 - distance.cosine(left, right)
            print(" {:#.6g} |".format(sim), end="")
        print("\n")
|
||||||
|
|
||||||
def scaling_range(datashitter, col):
|
def scaling_range(datashitter, col):
|
||||||
nonnull = datashitter[col].isna()
|
nonnull = datashitter[col].isna()
|
||||||
minmax_scaler = preprocessing.MinMaxScaler()
|
minmax_scaler = preprocessing.MinMaxScaler()
|
||||||
@@ -1,38 +0,0 @@
|
|||||||
# Visualisations for Data
|
|
||||||
import matplotlib.pyplot as plt
|
|
||||||
import seaborn as sns
|
|
||||||
import mining_hq
|
|
||||||
from numpy import count_nonzero
|
|
||||||
|
|
||||||
games_pre = mining_hq.games_sales_split_pre
|
|
||||||
games_dur = mining_hq.games_sales_split_dur
|
|
||||||
games_pos = mining_hq.games_sales_split_pos
|
|
||||||
|
|
||||||
crime_US = mining_hq.crime_US_intersect
|
|
||||||
crime_CA = mining_hq.crime_CA_intersect
|
|
||||||
|
|
||||||
custom_params = {"axes.spines.right": False, "axes.spines.top": False}
|
|
||||||
|
|
||||||
sns.set_theme(style = 'ticks', rc = custom_params)
|
|
||||||
|
|
||||||
plt.xticks(rotation = 90)
|
|
||||||
games_fig_pre = sns.histplot(data = games_pre, x = "Year", palette = sns.color_palette("flare"), kde = True)
|
|
||||||
plt.show()
|
|
||||||
|
|
||||||
plt.xticks(rotation = 90)
|
|
||||||
games_fig2_pre = sns.histplot(data = games_pre, x = "Year", hue = "Genre", multiple = "stack", shrink = 0.65)
|
|
||||||
plt.show()
|
|
||||||
|
|
||||||
plt.xticks(rotation = 90)
|
|
||||||
games_fig_dur = sns.barplot(data = games_dur, x = "Year", y = "NA_Sales")
|
|
||||||
plt.xlabel("Years")
|
|
||||||
plt.ylabel("Sales in North America (Canada, USA)")
|
|
||||||
plt.show()
|
|
||||||
|
|
||||||
plt.xticks(rotation = 90)
|
|
||||||
games_fig_pos = sns.barplot(data = games_pos, x = "Year", y = "NA_Sales")
|
|
||||||
plt.show()
|
|
||||||
|
|
||||||
plt.xticks(rotation = 90)
|
|
||||||
crime_CA_fig = sns.barplot(data = crime_CA, x = "year", y = "incidents", estimator=count_nonzero)
|
|
||||||
plt.show()
|
|
||||||
Reference in New Issue
Block a user