Merge pull request #4 from LinlyBoi/cleanup

Cleanup into main
This commit is contained in:
Mjørk
2023-03-28 10:30:40 +02:00
committed by GitHub
9 changed files with 17834 additions and 6 deletions

Binary file not shown.

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@@ -11,6 +11,125 @@
"Docs:\n",
"- https://pandas.pydata.org/docs/getting_started/index.html#getting-started"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"## Cleaning Game/Score/Rating Dataset\n",
"Error found: Game Names had Reviews attached to them"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"# Game Datasets"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"# The file picker has to be imported before it is called below\n",
"from tkinter.filedialog import askopenfilename\n",
"\n",
"# Let the user choose the games CSV, then load it into df1\n",
"filename = askopenfilename()\n",
"df1 = pd.read_csv(filename)"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"\n",
"### Cleaning: Removing the word review and anything after it"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Unclean showcase\n",
"unclean = df1\n",
"#limit this output 3 rows pls\n",
"print(unclean[['GameName']].head(5))\n"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" Name\n",
"0 Baldur's Gate 3 Early Access \n",
"1 Control: Ultimate Edition Cloud Version \n",
"2 Doom Eternal: The Ancient Gods Part 1 \n",
"3 Watch Dogs: Legion \n",
"4 Ring Of Pain \n",
"5 Pikmin 3 Deluxe \n",
"6 Ghostrunner \n",
"7 Disc Room \n",
"8 NHL 21 \n",
"9 Noita \n"
]
}
],
"source": [
"# Cleaning: keep only the text before the first 'Review' in each game name\n",
"nuke = df1['GameName'].to_list()\n",
"nuke2 = [orphan.split('Review')[0] for orphan in nuke]\n",
"\n",
"# Write back the cleaned names (nuke2), not the raw list (nuke), so that df1\n",
"# itself is cleaned and the later df1.to_csv(...) export contains them\n",
"df1['GameName'] = nuke2\n",
"\n",
"nuke_frame = pd.DataFrame(nuke2)\n",
"\n",
"# Same data with the cleaned names stored under 'Name' instead of 'GameName'\n",
"clean = df1.drop(columns=['GameName'])\n",
"clean['Name'] = nuke2\n",
"\n",
"# Preview the first 5 cleaned names\n",
"print(clean[['Name']].head(5))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# CSV output\n",
"df1.to_csv('cleaned_games.csv')"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"# Integrating Game Sales with the previous cleaned data set"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
@@ -20,8 +139,16 @@
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"version": "3.10.2"
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.9"
},
"orig_nbformat": 4
},

View File

@@ -1 +1,41 @@
# Getting and combining data
# Getting and combining data
import pandas as pd
import numpy as np
# Load both inputs.
# -> Double-check the two CSV file names below BEFORE running this script.
df1 = pd.read_csv("output_6th_df.csv")
df2 = pd.read_csv("vgsales-12-4-2019-short.csv")

# Left join on "Name": every row of df1 is kept, and the columns of df2 are
# attached wherever the name matches.
combined_df = pd.merge(df1, df2, how="left", left_on="Name", right_on="Name")
print(combined_df)

# `df` is a second name for the merged frame; the CSV is written through it.
df = combined_df
df.to_csv("output_final_df.csv")
def slice_column(input_df, output_df, column, expression=" "):
    """Cut every string in ``column`` at the first ``expression`` and save as CSV.

    Args:
        input_df: DataFrame holding the column to clean. It is not modified.
        output_df: Destination handed straight to ``DataFrame.to_csv``.
        column: Name of the column whose values are truncated.
        expression: Separator; only the text before its first occurrence is kept.

    Returns:
        A new DataFrame with the cleaned column moved to the last position
        (the same layout that is written to ``output_df``).
    """
    # Only strings can be split. Anything else (e.g. NaN produced by a left
    # merge) is passed through untouched instead of raising AttributeError.
    clean = [
        record.split(expression)[0] if isinstance(record, str) else record
        for record in input_df[column].to_list()
    ]
    # drop() returns a new frame, so the caller's DataFrame stays as it was;
    # re-adding the column afterwards places it at the end.
    result = input_df.drop(columns=[column])
    result[column] = clean
    result.to_csv(output_df)
    return result

View File

@@ -1 +1,23 @@
# Visualisations for Data
# Visualisations for Data
import matplotlib.pyplot as plt
import seaborn as sns
import gunner
from numpy import count_nonzero
# Apply seaborn's default theme to the figures drawn below
sns.set()
# Ask pyplot to rotate the x tick labels by 90 degrees
# NOTE(review): this runs before the bar chart is drawn - confirm the rotation
# actually reaches the chart's labels
plt.xticks(rotation = 90)
# Sales frames read from the `gunner` module; judging by the names they hold
# North-American sales before / during / after some reference period
# - TODO confirm against gunner
games_pre = gunner.game_sales_NA_pre
games_dur = gunner.game_sales_NA_dur
games_pos = gunner.game_sales_NA_pos
# One bar per "Year"; the estimator is count_nonzero, so a bar's height is the
# number of non-zero "NA_Sales" values for that year
games_fig_pre = sns.barplot(data = games_pre, x = "Year", y = "NA_Sales", estimator = count_nonzero)
# Display the first figure before starting the next one
plt.show()
plt.xticks(rotation = 90)
# Same chart for the "dur" frame
games_fig_dur = sns.barplot(data = games_dur, x = "Year", y = "NA_Sales", estimator = count_nonzero)
plt.show()
plt.xticks(rotation = 90)
# Same chart for the "pos" frame
games_fig_pos = sns.barplot(data = games_pos, x = "Year", y = "NA_Sales", estimator = count_nonzero)
plt.show()

View File

@@ -1,3 +1,86 @@
# Cleaning of datasets
# Somewhat main in the beginning
import pandas as pd
import pandas as pd
import numpy as np
# Sharing the dataset variables
# NOTE: a `global` statement at module level has no effect in Python - every
# name assigned at module level is already global. The statements below only
# serve as a list of the dataset names this script works with.
# Games' data
global games_dat
# Sales in NA
global game_sales_NA
# "dur" / "pre" / "pos" variants of the NA sales data
global game_sales_NA_dur
global game_sales_NA_pre
global game_sales_NA_pos
# Sales Globally
global game_sales_GLO
# Crime Data
# Crime Recorded in The US
global crime_US
# Crime Recorded in Canada
global crime_CA
# Loading Datasets
game_sales_dat = pd.read_csv('datasets/videogames/vgsales-12-4-2019-short.csv')
# NOTE(review): this file has an .xls extension but is parsed as CSV - confirm
# its real format
games_dat = pd.read_csv('datasets/videogames/Games.xls')
crime_CA = pd.read_excel('datasets/crime/clean_crime_canada_dataset.xlsx')
crime_US = pd.read_csv('datasets/crime/report.csv')

# Column / dtype summary of every dataset, one heading per dataset family
for _heading, _frames in (
    ("Game Datasets' Info:\n", (game_sales_dat, games_dat)),
    ("Crime Datasets' Info:\n", (crime_US, crime_CA)),
):
    print(_heading)
    for _frame in _frames:
        _frame.info()

# First 10 rows of every dataset (row index starts at 0)
for _label, _frame in (
    ("Game Sale Data:\n", game_sales_dat),
    ("Game Scores:\n", games_dat),
    ("US Crime Data:\n", crime_US),
    ("CA Crime Data:\n", crime_CA),
):
    print(_label, _frame.head(10))

# Remove the loop helpers so the module exposes the same names as before
del _heading, _frames, _label, _frame
# Games.xls dataset:
# converting 'Score' with errors='coerce' turns every non-numeric entry into
# NaN, which makes those rows easy to filter out afterwards.
_numeric_scores = pd.to_numeric(games_dat['Score'], errors='coerce')
games_dat['Score'] = _numeric_scores
# Keep only the rows whose score survived the conversion
games_dat = games_dat[_numeric_scores.notnull()]
del _numeric_scores
print("Game Scores (Cleaned):\n", games_dat.head())
games_dat.info()
# vgsales-12-4-2019 dataset:
# the crime data is for the US (and probably CA), so sales columns for the
# other regions are not needed.
_other_regions = ['PAL_Sales', 'JP_Sales', 'Other_Sales']
# Columns to remove for the NA view and for the global view respectively
NA_col_list = _other_regions + ['Global_Sales']
GLO_col_list = _other_regions + ['NA_Sales']
del _other_regions
game_sales_NA = game_sales_dat.drop(columns=NA_col_list)
game_sales_GLO = game_sales_dat.drop(columns=GLO_col_list)
print(f"Game Sales for NA:\n{game_sales_NA.head(10)} \nWith minimum year being: {game_sales_NA['Year'].min()}")
print(f"Game Sales Globally:\n{game_sales_GLO.head(10)}\nWith minimum year being: {game_sales_GLO['Year'].min()}")
# Span of years covered by BOTH crime datasets
_us_years = crime_US['report_year']
_ca_years = crime_CA['year']
crime_year_min = max(_us_years.min(), _ca_years.min())
crime_year_max = min(_us_years.max(), _ca_years.max())
# Trim each crime dataset to that shared span (both ends inclusive)
crime_CA = crime_CA[_ca_years.between(crime_year_min, crime_year_max)]
crime_US = crime_US[_us_years.between(crime_year_min, crime_year_max)]

# Split the NA sales by year: inside the shared span, before it, after it
_sale_years = game_sales_NA['Year']
game_sales_NA_dur = game_sales_NA[_sale_years.between(crime_year_min, crime_year_max)]
game_sales_NA_pre = game_sales_NA[_sale_years < crime_year_min]
game_sales_NA_pos = game_sales_NA[_sale_years > crime_year_max]
del _us_years, _ca_years, _sale_years

# NOTE(review): these print the unfiltered NA / global frames again, not the
# dur/pre/pos splits created above - confirm that is intended
print(f"Game Sales for NA:\n{game_sales_NA.head(10)}\nWith minimum year being: {game_sales_NA['Year'].min()}")
print(f"Game Sales Globally:\n{game_sales_GLO.head(10)}\nWith minimum year being: {game_sales_GLO['Year'].min()}")

View File

@@ -2,4 +2,4 @@
# Collects stuff from the rest of the scripts
import pandas as pd
import numpy as np
import seaborn as sns
import seaborn as sns

View File

@@ -1 +1 @@
# Regression/Prediction (Totally gonna do later trust bro)
# Regression/Prediction (Totally gonna do later trust bro)