kmeans + naive Bayes

2023-05-15 19:11:38 +03:00
parent 65d268a902
commit 701c3c6a87
1 changed files with 155 additions and 4 deletions
--- a/dwarves/Mining_HQ.ipynb
+++ b/dwarves/Mining_HQ.ipynb
@@ -10,7 +10,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 8,
+   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
@@ -914,11 +914,162 @@
    "games_crime_dur = sns.jointplot(data = games_dur, x = \"Year\", y = 'Violent_US')\n",
    "plt.close(1)"
   ]
+  },
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
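+    "# K-means\n",
+    "Runs the k-means algorithm on each video game's critic score, user score, and total units shipped, prints the silhouette score of the clustering, and stores each game's cluster label in the `Kmean_Labels` column."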
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 75,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "0.2907407242071878\n"
+     ]
+    },
+    {
+     "data": {
+      "text/plain": [
+       "0        2\n",
+       "1        2\n",
+       "2        2\n",
+       "3        2\n",
+       "4        2\n",
+       "        ..\n",
+       "23596    1\n",
+       "23597    1\n",
+       "23598    1\n",
+       "23599    1\n",
+       "23600    1\n",
+       "Name: Kmean_Labels, Length: 23601, dtype: int32"
+      ]
+     },
+     "execution_count": 75,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "from sklearn.cluster import KMeans\n",
+    "from sklearn import metrics\n",
+    "\n",
+    "# Clustering features, named once so that fitting and scoring are\n",
+    "# guaranteed to look at exactly the same columns.\n",
+    "kmeans_columns = [\"Critic_Score\", \"User_Score\", \"Total_Shipped\"]\n",
+    "\n",
+    "# Fixed random_state keeps the cluster assignment reproducible across runs.\n",
+    "gammas_train_kmeans = KMeans(n_clusters=10, random_state=420, n_init=\"auto\").fit(\n",
+    "    gammas[kmeans_columns]\n",
+    ")\n",
+    "gammas_labels = gammas_train_kmeans.labels_\n",
+    "\n",
+    "# Silhouette score of the fitted labels on the same features (Euclidean distance).\n",
+    "silh_score = metrics.silhouette_score(\n",
+    "    gammas[kmeans_columns],\n",
+    "    gammas_labels,\n",
+    "    metric=\"euclidean\",\n",
+    ")\n",
+    "print(silh_score)\n",
+    "\n",
+    "# Attach each row's cluster id to the frame, then display the new column.\n",
+    "gammas[\"Kmean_Labels\"] = gammas_labels\n",
+    "gammas[\"Kmean_Labels\"]"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Splitting"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 76,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from sklearn.model_selection import train_test_split\n",
+    "\n",
+    "# NOTE(review): `gammas` is presumably loaded from\n",
+    "# ../datasets/videogames/games_cleanish.csv in an earlier cell - confirm.\n",
+    "# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.\n",
+    "gammas_train, gammas_test = train_test_split(\n",
+    "    gammas,\n",
+    "    test_size=0.20,\n",
+    "    random_state=69,\n",
+    ")"
+   ]
+  },
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Naive Bayes\n",
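+    "Trains a Gaussian Naive Bayes classifier on the training split to predict each game's k-means cluster label from its critic score, user score, and total units shipped, then counts how many games in the test split are mislabeled."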
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 78,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Number of mislabeled points out of a total 4721 points : 302\n"
+     ]
+    }
+   ],
+   "source": [
+    "from sklearn.naive_bayes import GaussianNB\n",
+    "\n",
+    "# Features the classifier sees; the target is the `Kmean_Labels` column.\n",
+    "silly_columns = [\"Critic_Score\", \"User_Score\", \"Total_Shipped\"]\n",
+    "\n",
+    "# Fit on the training split only, then predict labels for the held-out rows.\n",
+    "gnb = GaussianNB()\n",
+    "gnb.fit(X=gammas_train[silly_columns], y=gammas_train[\"Kmean_Labels\"])\n",
+    "prediction = gnb.predict(gammas_test[silly_columns])\n",
+    "y_test = gammas_test[\"Kmean_Labels\"]\n",
+    "\n",
+    "# A bare `len(...)` in the middle of a cell displays nothing, so check the\n",
+    "# lengths explicitly: one prediction per held-out row.\n",
+    "assert len(prediction) == len(y_test)\n",
+    "\n",
+    "print(\"Number of mislabeled points out of a total %d points : %d\"\n",
+    "   % (gammas_test.shape[0], (y_test != prediction).sum()))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 46,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Number of mislabeled points out of a total 75 points : 4\n"
+     ]
+    },
+    {
+     "data": {
+      "text/plain": [
+       "array([2, 1, 0, 2, 0, 2, 0, 1, 1, 1, 2, 1, 1, 1, 1, 0, 1, 1, 0, 0, 2, 1,\n",
+       "       0, 0, 2, 0, 0, 1, 1, 0, 2, 1, 0, 2, 2, 1, 0, 1, 1, 1, 2, 0, 2, 0,\n",
+       "       0, 1, 2, 2, 2, 2, 1, 2, 1, 1, 2, 2, 2, 2, 1, 2, 1, 0, 2, 1, 1, 1,\n",
+       "       1, 2, 0, 0, 2, 1, 0, 0, 1])"
+      ]
+     },
+     "execution_count": 46,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "from sklearn.datasets import load_iris\n",
+    "\n",
+    "# Sanity check of GaussianNB on the iris data set. Every name is prefixed with\n",
+    "# `iris_` so this cell does not overwrite `gnb` / `y_test` from the video game\n",
+    "# classifier cell, whatever order the cells are executed in.\n",
+    "iris_X, iris_y = load_iris(return_X_y=True)\n",
+    "iris_X_train, iris_X_test, iris_y_train, iris_y_test = train_test_split(\n",
+    "    iris_X, iris_y, test_size=0.5, random_state=0\n",
+    ")\n",
+    "iris_gnb = GaussianNB()\n",
+    "iris_y_pred = iris_gnb.fit(iris_X_train, iris_y_train).predict(iris_X_test)\n",
+    "print(\"Number of mislabeled points out of a total %d points : %d\"\n",
+    " % (iris_X_test.shape[0], (iris_y_test != iris_y_pred).sum()))\n",
+    "iris_y_test"
+   ]
  }
 ],
 "metadata": {
  "kernelspec": {
-   "display_name": "Python 3",
+   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
@@ -932,12 +1083,12 @@
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
-   "version": "3.11.0"
+   "version": "3.10.9"
  },
  "orig_nbformat": 4,
  "vscode": {
   "interpreter": {
-    "hash": "c261aea317cc0286b3b3261fbba9abdec21eaa57589985bb7a274bf54d6cc0a7"
+    "hash": "70ce2434745d4d40671ff71d794558676bf30253e5dd946148d83d754be8251d"
   }
  }
 },