diff --git a/Mining_HQ.ipynb b/Mining_HQ.ipynb new file mode 100644 index 0000000..baccfe4 --- /dev/null +++ b/Mining_HQ.ipynb @@ -0,0 +1,453 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Import Modules\n", + "The dwarves!" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from dwarves import digger, gunner, scout\n", + "import pandas as pd\n" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Pre-Processing\n", + "This section demonstrates how the acquired datasets were cleaned." + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Game Datasets" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "games_review = pd.read_csv(\"datasets/videogames/Games.xls\")\n", + "games_sales = pd.read_csv(\"datasets/videogames/vgsales.csv\")\n", + "\n", + "# Chain the two slice_column passes so the \"(Import)\" pass runs on the output of the \"Review\" pass\n", + "games_review_phase1 = digger.slice_column(games_review, \"GameName\", \"Review\")\n", + "games_review_final = digger.slice_column(games_review_phase1, \"GameName\", \"(Import)\")\n", + "\n", + "games_merged_dat = digger.write_joined_df(games_sales, games_review_final)\n", + "\n", + "# Acquisition of Merged dataset\n", + "games_merged_dat.head(5)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "# NOTE(review): games_merged.csv is also written further down, after drop_kick - confirm the file exists before this cell on a fresh run\n", + "gammas = pd.read_csv(\"datasets/videogames/games_merged.csv\")\n", + "labels = [\"smol\", \"epik\", \"larg\"]\n", + "gammas = digger.slam_dunk(gammas, \"Critic_Score\", labels=labels)\n", + "gammas.to_csv(\"datasets/videogames/games_output.csv\", index=False)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Crime Datasets" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "crime_US = 
pd.read_csv(\"datasets/crime/report.csv\")\n", + "\n", + "NA_col_list = [\n", + " \"JP_Sales\",\n", + " \"Other_Sales\",\n", + " \"Global_Sales\",\n", + " \"GameName\",\n", + " \"Review\",\n", + " \"Console\",\n", + " \"Score\",\n", + "]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "# Per-column null counts are readable; the full boolean frames are not\n", + "print(crime_US.isnull().sum())\n", + "print(crime_CA.isnull().sum())\n", + "\n", + "year_interval = gunner.year_interval(crime_US, crime_CA, \"report_year\", \"year\")\n", + "\n", + "year_max = year_interval[0]\n", + "year_min = year_interval[1]\n", + "\n", + "crime_intersect = gunner.intersect_by_year(crime_US, crime_CA, \"report_year\", \"year\")\n", + "\n", + "crime_US_intersect = crime_intersect[0]\n", + "crime_CA_intersect = crime_intersect[1]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "games_merged_dat = gunner.drop_kick(NA_col_list, games_merged_dat)\n", + "games_merged_dat.to_csv(\"datasets/videogames/games_merged.csv\", index=False)\n", + "\n", + "sale_tri_split = gunner.trisect_by_year(games_merged_dat, \"Year\", year_interval)\n", + "\n", + "games_sales_split_pre = sale_tri_split[0]\n", + "games_sales_split_dur = sale_tri_split[1]\n", + "games_sales_split_pos = sale_tri_split[2]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "# games_sales_split_pre/dur/pos are the three elements of sale_tri_split, so each frame is inspected once\n", + "print(\"Acquired Datasets:\\n\")\n", + "print(\n", + "    games_sales_split_pre.head(5),\n", + "    games_sales_split_dur.head(5),\n", + "    games_sales_split_pos.head(5),\n", + ")\n", + "\n", + "print(\"Dataset Info:\\n\")\n", + "games_sales_split_pre.info()\n", + "games_sales_split_dur.info()\n", + "games_sales_split_pos.info()\n", + "\n", + "print(\"Yer forsaken Statistical Description:\\n\", games_sales_split_dur.describe())" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Plotting\n", + "Engineer.py steps up to the job" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Importing and setting up environment" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "import matplotlib.pyplot as plt\n", + "import seaborn as sns\n", + "import pandas as pd\n", + "import mining_hq\n", + "from numpy import count_nonzero\n", + "\n", + "games_pre = mining_hq.games_sales_split_pre\n", + "games_dur = mining_hq.games_sales_split_dur\n", + "games_pos = mining_hq.games_sales_split_pos\n", + "\n", + "crime_US = mining_hq.crime_US_intersect\n", + "crime_CA = mining_hq.crime_CA_intersect\n", + "\n", + "custom_params = {\"axes.spines.right\": False, \"axes.spines.top\": False}\n", + "\n", + "sns.set_theme(style = 'ticks', rc = custom_params)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Game Sales pre 2000s" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### General Plot" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "plt.xticks(rotation = 90)\n", + "# palette only takes effect together with hue; use a single colour from the same palette instead\n", + "games_fig_pre = sns.histplot(data = games_pre, x = \"Year\", color = sns.color_palette(\"flare\")[0], kde = True)\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### According to Genre" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "plt.xticks(rotation = 90)\n", + "games_fig2_pre = sns.histplot(data = games_pre, x = \"Year\", hue = \"Genre\", multiple = \"stack\", kde = True)\n", + "plt.show()" + ] + }, + { + 
"cell_type": "markdown", + "metadata": {}, + "source": [ + "## Game Sales during 2000s" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### General Plot" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "plt.xticks(rotation = 90)\n", + "games_fig_dur = sns.histplot(data = games_dur, x = \"Year\", kde = True)\n", + "plt.show()\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### According to Genre" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "plt.xticks(rotation = 90)\n", + "games_fig2_dur = sns.histplot(data = games_dur, x = \"Year\", hue = \"Genre\", multiple = \"stack\", kde = True)\n", + "plt.show()" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Crime in Canada and US" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "plt.xticks(rotation = 90)\n", + "crime_CA_fig = sns.histplot(data = crime_CA, x = \"year\")\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "plt.xticks(rotation = 90)\n", + "crime_US_fig = sns.histplot(data = crime_US, x = \"report_year\")\n", + "plt.show()" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Plotting Crime in both CA and US together" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "# Overlay both countries on one axes so the yearly distributions can be compared directly\n", + "crime_both_fig, crime_both_ax = plt.subplots()\n", + "sns.histplot(data = crime_CA, x = \"year\", color = \"tab:blue\", label = \"Canada\", ax = crime_both_ax)\n", + "sns.histplot(data = crime_US, x = \"report_year\", color = \"tab:orange\", label = \"US\", ax = crime_both_ax)\n", + "crime_both_ax.set_xlabel(\"Year\")\n", + "crime_both_ax.tick_params(axis = \"x\", labelrotation = 90)\n", + "crime_both_ax.legend()\n", + "plt.show()" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Integrating the violent crimes column into game sales\n", + "- Note: `NA_Sales` is multiplied by 1000 because the dataset stores sales in reduced units\n", + "- The combined frame is used by the plots in the following sections" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Build a new frame instead of mutating games_dur, so re-running this cell never rescales NA_Sales twice\n", + "# NOTE(review): Violent_US is aligned to games_dur by row index, not by year - confirm this is intended\n", + "games_dur_crime = games_dur.assign(\n", + "    Violent_US = crime_US['violent_crimes'],\n", + "    NA_Sales = games_dur['NA_Sales'].multiply(1000),\n", + ")" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Plotting Sales against Crime with Relplot" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "games_violence_US = sns.relplot(data = games_dur_crime, x = 'NA_Sales', y = 'Violent_US')\n", + "# relplot creates its own figure, so rotate the ticks on the axes of the grid it returns\n", + "games_violence_US.ax.tick_params(axis = 'x', labelrotation = 90)\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Joined Plot and Histograms" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "games_crime_dur = sns.jointplot(data = games_dur_crime, x = \"Year\", y = 'Violent_US')\n", + "# jointplot creates its own figure, so rotate the ticks on the joint axes of the grid it returns\n", + "games_crime_dur.ax_joint.tick_params(axis = 'x', labelrotation = 90)\n", + "plt.show()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.9" + }, + "orig_nbformat": 4 + }, + "nbformat": 4, + "nbformat_minor": 2 +}