Files
Mining-Away/Mining_HQ.ipynb
2023-03-31 13:14:42 +02:00

460 lines
11 KiB
Plaintext

{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Import Modules\n",
"The dwarves!"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"from dwarves import digger, gunner, scout \n",
"import pandas as pd\n"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"# Pre-Processing\n",
"This section demonstrates how the acquired datasets were cleaned."
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"## Game Datasets"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" Rank Name Platform Year Genre Publisher \\\n",
"0 1.0 Wii Sports Wii 2006.0 Sports Nintendo \n",
"1 2.0 Super Mario Bros. NES 1985.0 Platform Nintendo \n",
"2 3.0 Mario Kart Wii Wii 2008.0 Racing Nintendo \n",
"3 4.0 Wii Sports Resort Wii 2009.0 Sports Nintendo \n",
"4 5.0 Pokemon Red/Pokemon Blue GB 1996.0 Role-Playing Nintendo \n",
"\n",
" NA_Sales EU_Sales JP_Sales Other_Sales Global_Sales Console Review \\\n",
"0 41.49 29.02 3.77 8.46 82.74 NaN NaN \n",
"1 29.08 3.58 6.81 0.77 40.24 NaN NaN \n",
"2 15.85 12.88 3.79 3.31 35.82 NaN NaN \n",
"3 15.75 11.01 3.28 2.96 33.00 NaN NaN \n",
"4 11.27 8.89 10.22 1.00 31.37 NaN NaN \n",
"\n",
" Score GameName \n",
"0 NaN NaN \n",
"1 NaN NaN \n",
"2 NaN NaN \n",
"3 NaN NaN \n",
"4 NaN NaN \n"
]
}
],
"source": [
"\n",
"# Load the raw review and sales tables.\n",
"# NOTE(review): Games.xls is read with read_csv - confirm the file really is delimited text.\n",
"games_review = pd.read_csv(\"datasets/videogames/Games.xls\")\n",
"games_sales = pd.read_csv(\"datasets/videogames/vgsales.csv\")\n",
"\n",
"# Run digger.slice_column over GameName in two chained passes. The second pass now starts\n",
"# from the result of the first; previously it re-read the raw frame, so the first pass\n",
"# (games_review_phase1) was computed and then thrown away.\n",
"games_review_phase1 = digger.slice_column(games_review, \"GameName\", \"Review\")\n",
"games_review_final = digger.slice_column(games_review_phase1, \"GameName\", \"(Import)\")\n",
"\n",
"# Join the sales table with the processed reviews and preview the merged dataset.\n",
"games_merged_dat = digger.write_joined_df(games_sales, games_review_final)\n",
"print(games_merged_dat.head(5))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"\n",
"# NOTE(review): games_merged.csv is written by a later cell of this notebook - confirm the\n",
"# file already exists on disk before running the notebook top to bottom.\n",
"labels = [\"smol\", \"epik\", \"larg\"]\n",
"\n",
"# Feed the merged CSV through digger.slam_dunk on Critic_Score with the three labels above,\n",
"# then write the result next to the input file.\n",
"gammas = digger.slam_dunk(\n",
"    pd.read_csv(\"datasets/videogames/games_merged.csv\"),\n",
"    \"Critic_Score\",\n",
"    labels=labels,\n",
")\n",
"gammas.to_csv(\"datasets/videogames/games_output.csv\", index=False)"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"## Crime Datasets"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"\n",
"# Raw crime tables: Canada from an Excel workbook, the US from a CSV report.\n",
"crime_CA = pd.read_excel(\"datasets/crime/clean_crime_canada_dataset.xlsx\")\n",
"crime_US = pd.read_csv(\"datasets/crime/report.csv\")\n",
"\n",
"# Column names later handed to gunner.drop_kick together with the merged games frame.\n",
"NA_col_list = [\"JP_Sales\", \"Other_Sales\", \"Global_Sales\"]\n",
"NA_col_list += [\"GameName\", \"Review\", \"Console\", \"Score\"]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"\n",
"# Report the number of missing values per column instead of printing the full\n",
"# row-by-row boolean mask of each frame.\n",
"print(crime_US.isnull().sum())\n",
"print(crime_CA.isnull().sum())\n",
"\n",
"year_interval = gunner.year_interval(crime_US, crime_CA, \"report_year\", \"year\")\n",
"\n",
"# NOTE(review): element 0 is bound to year_max and element 1 to year_min - confirm this\n",
"# matches the order gunner.year_interval returns.\n",
"year_max = year_interval[0]\n",
"year_min = year_interval[1]\n",
"\n",
"# Restrict both crime tables via gunner.intersect_by_year; element 0 is kept as the US\n",
"# frame and element 1 as the Canadian frame.\n",
"crime_intersect = gunner.intersect_by_year(crime_US, crime_CA, \"report_year\", \"year\")\n",
"\n",
"crime_US_intersect = crime_intersect[0]\n",
"crime_CA_intersect = crime_intersect[1]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"\n",
"# Pass the merged frame through gunner.drop_kick with NA_col_list (presumably dropping\n",
"# those columns - confirm) and persist the result.\n",
"# Re-running this cell feeds the already-processed frame to drop_kick again.\n",
"games_merged_dat = gunner.drop_kick(NA_col_list, games_merged_dat)\n",
"games_merged_dat.to_csv(\"datasets/videogames/games_merged.csv\", index=False)\n",
"\n",
"# Split the games by Year into three frames using the crime year interval.\n",
"sale_tri_split = gunner.trisect_by_year(games_merged_dat, \"Year\", year_interval)\n",
"games_sales_split_pre, games_sales_split_dur, games_sales_split_pos = (\n",
"    sale_tri_split[0],\n",
"    sale_tri_split[1],\n",
"    sale_tri_split[2],\n",
")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"\n",
"# games_sales_split_pre / _dur / _pos are the three elements of sale_tri_split, so each\n",
"# frame is summarised once here instead of once per alias.\n",
"print(\"Acquired Datasets:\\n\")\n",
"print(\n",
"    games_sales_split_pre.head(5),\n",
"    games_sales_split_dur.head(5),\n",
"    games_sales_split_pos.head(5),\n",
")\n",
"\n",
"print(\"Dataset Info:\\n\")\n",
"for games_split in (games_sales_split_pre, games_sales_split_dur, games_sales_split_pos):\n",
"    games_split.info()\n",
"\n",
"print(\"Yer forsaken Statistical Description:\\n\", games_sales_split_dur.describe())"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Plotting\n",
"Engineer.py steps up to the job"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"## Importing and setting up environment"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"\n",
"import matplotlib.pyplot as plt\n",
"import seaborn as sns\n",
"import pandas as pd\n",
"import mining_hq\n",
"from numpy import count_nonzero\n",
"\n",
"# Year-split game sales frames as exposed by the mining_hq module.\n",
"games_pre, games_dur, games_pos = (\n",
"    mining_hq.games_sales_split_pre,\n",
"    mining_hq.games_sales_split_dur,\n",
"    mining_hq.games_sales_split_pos,\n",
")\n",
"\n",
"# Year-intersected crime frames; note this rebinds the names crime_US and crime_CA.\n",
"crime_US, crime_CA = mining_hq.crime_US_intersect, mining_hq.crime_CA_intersect\n",
"\n",
"# Plot theme: \"ticks\" style with the right and top spines switched off.\n",
"custom_params = {\"axes.spines.right\": False, \"axes.spines.top\": False}\n",
"sns.set_theme(style=\"ticks\", rc=custom_params)"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"## Game Sales pre 2000s"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### General Plot"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"\n",
"# No hue variable is assigned, so seaborn would ignore `palette`; take a single colour\n",
"# from the \"flare\" palette and pass it as `color` so the intended colouring is applied.\n",
"flare_color = sns.color_palette(\"flare\")[0]\n",
"\n",
"plt.xticks(rotation=90)\n",
"games_fig_pre = sns.histplot(data=games_pre, x=\"Year\", color=flare_color, kde=True)\n",
"plt.show()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### According to Genre"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"\n",
"# Histogram of the Year column of games_pre, stacked by Genre, with a KDE overlay.\n",
"plt.xticks(rotation=90)\n",
"games_fig2_pre = sns.histplot(\n",
"    data=games_pre,\n",
"    x=\"Year\",\n",
"    hue=\"Genre\",\n",
"    multiple=\"stack\",\n",
"    kde=True,\n",
")\n",
"plt.show()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Game Sales during 2000s"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### General Plot"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"\n",
"# Histogram of the Year column of games_dur with a KDE overlay.\n",
"plt.xticks(rotation=90)\n",
"games_fig_dur = sns.histplot(\n",
"    data=games_dur,\n",
"    x=\"Year\",\n",
"    kde=True,\n",
")\n",
"plt.show()\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### According to Genre"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"\n",
"# Histogram of the Year column of games_dur, stacked by Genre, with a KDE overlay.\n",
"plt.xticks(rotation=90)\n",
"games_fig2_dur = sns.histplot(\n",
"    data=games_dur,\n",
"    x=\"Year\",\n",
"    hue=\"Genre\",\n",
"    multiple=\"stack\",\n",
"    kde=True,\n",
")\n",
"plt.show()"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"## Crime in Canada and US"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"\n",
"# Histogram of the \"year\" column of the Canadian crime frame.\n",
"plt.xticks(rotation=90)\n",
"crime_CA_fig = sns.histplot(\n",
"    data=crime_CA,\n",
"    x=\"year\",\n",
")\n",
"plt.show()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"\n",
"# Histogram of the \"report_year\" column of the US crime frame.\n",
"plt.xticks(rotation=90)\n",
"crime_US_fig = sns.histplot(\n",
"    data=crime_US,\n",
"    x=\"report_year\",\n",
")\n",
"plt.show()"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"### Plotting Crime in both CA and US together"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"\n",
"# Draw both countries on one set of axes. Previously this cell repeated the US-only\n",
"# histogram, so the Canadian data never appeared in the combined plot.\n",
"plt.xticks(rotation=90)\n",
"crime_both_fig = sns.histplot(data=crime_US, x=\"report_year\", color=\"tab:blue\", label=\"US\")\n",
"sns.histplot(data=crime_CA, x=\"year\", color=\"tab:orange\", label=\"Canada\", ax=crime_both_fig)\n",
"\n",
"# The two frames name their year column differently; use one shared axis label.\n",
"crime_both_fig.set_xlabel(\"year\")\n",
"crime_both_fig.legend()\n",
"plt.show()"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"## Integrating the violent-crimes column into the game sales data\n",
"- Note: sales are multiplied by 1000 because the dataset stores them in a scaled-down unit by default.\n",
"- The combined data is plotted in the sections below."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Attach US violent-crime counts by matching on year. A plain column assignment aligns on\n",
"# index labels, which would pair each game row with whatever crime row happens to carry the\n",
"# same label rather than with the crime figures for that game's year.\n",
"# NOTE(review): rows sharing a report_year are summed - confirm yearly totals are intended.\n",
"violent_by_year = crime_US.groupby(\"report_year\")[\"violent_crimes\"].sum()\n",
"\n",
"# Build a new frame instead of writing into games_dur in place, so the frame it was taken\n",
"# from is not modified. Re-running this cell still multiplies NA_Sales by 1000 again.\n",
"games_dur = games_dur.assign(\n",
"    Violent_US=games_dur[\"Year\"].map(violent_by_year),\n",
"    NA_Sales=games_dur[\"NA_Sales\"].multiply(1000),\n",
")"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"### Plotting Sales against Crime with Relplot"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# relplot is figure-level and creates its own figure, so the tick rotation is applied to\n",
"# the grid's axes after plotting; calling plt.xticks first only touched a stray empty figure.\n",
"games_violence_US = sns.relplot(data=games_dur, x=\"NA_Sales\", y=\"Violent_US\")\n",
"games_violence_US.ax.tick_params(axis=\"x\", labelrotation=90)\n",
"plt.show()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Joint Plot and Histograms"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# jointplot is figure-level and creates its own figure, so rotate the x tick labels on the\n",
"# joint axes after plotting; calling plt.xticks first only touched a stray empty figure.\n",
"games_crime_dur = sns.jointplot(data=games_dur, x=\"Year\", y=\"Violent_US\")\n",
"games_crime_dur.ax_joint.tick_params(axis=\"x\", labelrotation=90)\n",
"plt.show()"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.9"
},
"orig_nbformat": 4
},
"nbformat": 4,
"nbformat_minor": 2
}