Merge pull request #4 from LinlyBoi/cleanup

Cleanup into main
This commit is contained in:
Mjørk
2023-03-28 10:30:40 +02:00
committed by GitHub
9 changed files with 17834 additions and 6 deletions

Binary file not shown.

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@@ -11,6 +11,125 @@
"Docs:\n", "Docs:\n",
"- https://pandas.pydata.org/docs/getting_started/index.html#getting-started" "- https://pandas.pydata.org/docs/getting_started/index.html#getting-started"
] ]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"## Cleaning Game/Score/Rating Dataset\n",
"Error found: game names had the word 'Review' (and any text following it) appended to them."
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"# Game Datasets"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from tkinter.filedialog import askopenfilename\n",
"\n",
"import pandas as pd\n",
"\n",
"# The import has to come before the call, otherwise a fresh kernel raises NameError.\n",
"# Opens a file-picker dialog; choose the game/score/rating CSV to clean.\n",
"filename = askopenfilename()\n",
"df1 = pd.read_csv(filename)"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"\n",
"### Cleaning: Removing the word review and anything after it"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Unclean showcase\n",
"unclean = df1\n",
"#limit this output 3 rows pls\n",
"print(unclean[['GameName']].head(5))\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Cleaning: keep only the part of each game name before the word 'Review'.\n",
"# The vectorised .str accessor leaves missing names as NaN instead of raising.\n",
"clean_names = df1['GameName'].str.split('Review').str[0]\n",
"\n",
"# Write the *cleaned* names back so that exporting df1 saves cleaned data\n",
"# (previously the untouched list was re-assigned here, making this a no-op).\n",
"df1['GameName'] = clean_names\n",
"\n",
"# Same data with the cleaned column exposed under the name 'Name'.\n",
"clean = df1.drop(columns=['GameName'])\n",
"clean['Name'] = clean_names\n",
"\n",
"print(clean[['Name']].head(5))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# CSV output\n",
"df1.to_csv('cleaned_games.csv')"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"# Integrating Game Sales with the previous cleaned data set"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
} }
], ],
"metadata": { "metadata": {
@@ -20,8 +139,16 @@
"name": "python3" "name": "python3"
}, },
"language_info": { "language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python", "name": "python",
"version": "3.10.2" "nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.9"
}, },
"orig_nbformat": 4 "orig_nbformat": 4
}, },

View File

@@ -1 +1,41 @@
# Getting and combining data # Getting and combining data
import pandas as pd
import numpy as np
# Reading the data
# -> Make sure the CSV file names below match the files on disk before running.
df1 = pd.read_csv("output_6th_df.csv")
df2 = pd.read_csv("vgsales-12-4-2019-short.csv")
# ----------------------------------------------------------
# Optional preview of both inputs (uncomment to inspect):
# print(df1.head())
# print(df2.head())
# ---------------------------------------------------------
# Merging
# Left join on "Name": every row of df1 is kept; columns from df2 are filled
# in where a row with the same "Name" exists there.
combined_df = df1.merge(df2, left_on="Name", right_on="Name", how="left")
# Prints the merged frame to stdout (pandas truncates long frames in its repr).
print(combined_df)
# to_csv is called with defaults, so the DataFrame index is written as well.
combined_df.to_csv("output_final_df.csv")
# `df` is a second name for the same object as combined_df, not a copy.
df = combined_df
# ---------------------------------------------------------
def slice_column(input_df, output_df, column, expression=" "):
    """Truncate every value of ``column`` at ``expression`` and save the result as CSV.

    Each string value is split on ``expression`` and only the text before the
    first occurrence is kept. The column is dropped and re-added, so it becomes
    the last column of the written frame. The caller's ``input_df`` is left
    untouched because ``drop`` returns a new frame.

    Args:
        input_df: DataFrame containing the column to clean.
        output_df: Destination handed straight to ``DataFrame.to_csv``
            (despite the name, this is where the CSV goes, not a DataFrame).
        column: Name of the column whose values are truncated.
        expression: Separator; it and everything after it are discarded.

    Returns:
        The cleaned DataFrame that was written to ``output_df``.
    """
    # Non-string entries (e.g. NaN from missing data) have no .split(); pass
    # them through unchanged instead of crashing the whole clean-up.
    truncated = [
        record.split(expression)[0] if isinstance(record, str) else record
        for record in input_df[column].to_list()
    ]
    cleaned_df = input_df.drop(columns=[column])
    cleaned_df[column] = truncated
    cleaned_df.to_csv(output_df)
    return cleaned_df

View File

@@ -1 +1,23 @@
# Visualisations for Data # Visualisations for Data
import matplotlib.pyplot as plt
import seaborn as sns
import gunner
from numpy import count_nonzero
sns.set()


def _plot_na_sales_count(games):
    """Draw and show a per-year bar plot counting non-zero ``NA_Sales`` entries.

    Args:
        games: DataFrame with ``Year`` and ``NA_Sales`` columns.

    Returns:
        The matplotlib Axes returned by ``sns.barplot``.
    """
    ax = sns.barplot(data=games, x="Year", y="NA_Sales", estimator=count_nonzero)
    # Rotate the labels on the axes that now hold the bars, rather than on an
    # implicit empty figure created before anything was drawn.
    plt.xticks(rotation=90)
    plt.show()
    return ax


# Sales subsets prepared by the gunner module: pre / dur / pos
# (presumably before, during and after the crime-data year range — confirm in gunner).
games_pre = gunner.game_sales_NA_pre
games_dur = gunner.game_sales_NA_dur
games_pos = gunner.game_sales_NA_pos

games_fig_pre = _plot_na_sales_count(games_pre)
games_fig_dur = _plot_na_sales_count(games_dur)
games_fig_pos = _plot_na_sales_count(games_pos)

View File

@@ -1,3 +1,86 @@
# Cleaning of datasets # Cleaning of datasets
# Somewhat main in the beninging # Somewhat main in the beninging
import pandas as pd
import pandas as pd
import numpy as np
# Sharing the dataset variables
# NOTE: `global` at module level has no effect — names assigned at the top
# level of a module are already module globals. These statements only serve
# as a list of the names this script assigns at module level.
# Games' data
global games_dat
# Sales in NA
global game_sales_NA
global game_sales_NA_dur
global game_sales_NA_pre
global game_sales_NA_pos
# Sales Globally
global game_sales_GLO
# Crime Data
# Crime Recorded in The US
global crime_US
# Crime Recorded in Canada
global crime_CA
# Loading Datasets
# Paths are relative, so the script must be run from the directory that
# contains `datasets/`.
game_sales_dat = pd.read_csv('datasets/videogames/vgsales-12-4-2019-short.csv')
# NOTE(review): the file has an .xls extension but is parsed with read_csv;
# presumably it is really CSV text — confirm, otherwise pd.read_excel is needed.
games_dat = pd.read_csv('datasets/videogames/Games.xls')
crime_CA = pd.read_excel('datasets/crime/clean_crime_canada_dataset.xlsx')
crime_US = pd.read_csv('datasets/crime/report.csv')
# Printing information regarding datasets
# DataFrame.info() prints its summary itself, so no print() wrapper is needed.
print("Game Datasets' Info:\n")
game_sales_dat.info()
games_dat.info()
print("Crime Datasets' Info:\n")
crime_US.info()
crime_CA.info()
# Printing the first 10 rows of each dataset (row index starts at 0)
print("Game Sale Data:\n", game_sales_dat.head(10))
print("Game Scores:\n", games_dat.head(10))
print("US Crime Data:\n", crime_US.head(10))
print("CA Crime Data:\n", crime_CA.head(10))
# Regarding the Games.xls dataset:
# Coercing the non-numeric values will result in NaN,
# thus allowing easier removal through `.notnull()`
games_dat['Score'] = pd.to_numeric(games_dat['Score'], errors = 'coerce')
# Keep only the rows whose Score parsed as a number.
games_dat = games_dat[games_dat['Score'].notnull()]
print("Game Scores (Cleaned):\n", games_dat.head())
games_dat.info()
# Regarding the vgsales-12-4-2019 dataset:
# Since the crime datasets cover the US (and Canada), sales columns for
# other regions are dropped.
# Columns to remove for the North-America view (keeps NA_Sales).
NA_col_list = ['PAL_Sales', 'JP_Sales', 'Other_Sales', 'Global_Sales']
# Columns to remove for the global view (keeps Global_Sales).
GLO_col_list = ['PAL_Sales', 'JP_Sales', 'Other_Sales', 'NA_Sales']
# NOTE(review): `axis = 1` looks redundant alongside `columns=` — confirm against the pandas docs.
game_sales_NA = game_sales_dat.drop(columns = NA_col_list, axis = 1)
game_sales_GLO = game_sales_dat.drop(columns = GLO_col_list, axis = 1)
print(f"Game Sales for NA:\n{game_sales_NA.head(10)} \nWith minimum year being: {game_sales_NA['Year'].min()}")
print(f"Game Sales Globally:\n{game_sales_GLO.head(10)}\nWith minimum year being: {game_sales_GLO['Year'].min()}")
# Getting the range of years which both crime datasets share:
# the later of the two first years and the earlier of the two last years.
crime_year_min = max(crime_US['report_year'].min(), crime_CA['year'].min())
crime_year_max = min(crime_US['report_year'].max(), crime_CA['year'].max())
# Restrict both crime datasets to that shared range (bounds inclusive).
crime_CA = crime_CA[(crime_CA['year'] >= crime_year_min) & (crime_CA['year'] <= crime_year_max)]
crime_US = crime_US[(crime_US['report_year'] >= crime_year_min) & (crime_US['report_year'] <= crime_year_max)]
# Splitting the NA game dataset relative to the shared crime year range:
#   _dur: inside the range (inclusive), _pre: before it, _pos: after it.
game_sales_NA_dur = game_sales_NA[(game_sales_NA['Year'] >= crime_year_min) & (game_sales_NA['Year'] <= crime_year_max)]
game_sales_NA_pre = game_sales_NA[game_sales_NA['Year'] < crime_year_min]
game_sales_NA_pos = game_sales_NA[game_sales_NA['Year'] > crime_year_max]
# NOTE(review): game_sales_NA and game_sales_GLO have not been reassigned since
# they were first printed, so these two prints repeat that earlier output;
# printing the _dur/_pre/_pos subsets may have been intended — confirm.
print(f"Game Sales for NA:\n{game_sales_NA.head(10)}\nWith minimum year being: {game_sales_NA['Year'].min()}")
print(f"Game Sales Globally:\n{game_sales_GLO.head(10)}\nWith minimum year being: {game_sales_GLO['Year'].min()}")

View File

@@ -2,4 +2,4 @@
# Collects stuff from the rest of the scripts # Collects stuff from the rest of the scripts
import pandas as pd import pandas as pd
import numpy as np import numpy as np
import seaborn as sns import seaborn as sns

View File

@@ -1 +1 @@
# Regression/Prediction (Totally gonna do later trust bro) # Regression/Prediction (Totally gonna do later trust bro)