kmeans + naive Bayes

2023-05-15 19:11:38 +03:00
parent 65d268a902
commit 701c3c6a87
1 changed files with 155 additions and 4 deletions
--- a/dwarves/Mining_HQ.ipynb
+++ b/dwarves/Mining_HQ.ipynb
@@ -10,7 +10,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 8,
+   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
@@ -914,11 +914,162 @@
    "games_crime_dur = sns.jointplot(data = games_dur, x = \"Year\", y = 'Violent_US')\n",
    "plt.close(1)"
   ]
  },
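  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# K-means clustering\n",
    "Runs k-means (10 clusters) on the critic score, user score and total units shipped of each video game, prints the silhouette score, and stores the cluster label of every row in a `Kmean_Labels` column."
   ]
  },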
  {
   "cell_type": "code",
   "execution_count": 75,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0.2907407242071878\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "0        2\n",
       "1        2\n",
       "2        2\n",
       "3        2\n",
       "4        2\n",
       "        ..\n",
       "23596    1\n",
       "23597    1\n",
       "23598    1\n",
       "23599    1\n",
       "23600    1\n",
       "Name: Kmean_Labels, Length: 23601, dtype: int32"
      ]
     },
     "execution_count": 75,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from sklearn import metrics\n",
    "from sklearn.cluster import KMeans\n",
    "\n",
    "# Features the clustering runs on; listed once so the fit and the\n",
    "# silhouette score below are guaranteed to see the same columns.\n",
    "kmeans_columns = [\"Critic_Score\", \"User_Score\", \"Total_Shipped\"]\n",
    "kmeans_features = gammas[kmeans_columns]\n",
    "\n",
    "# Fixed random_state keeps the cluster assignment reproducible across runs.\n",
    "gammas_train_kmeans = KMeans(n_clusters=10, random_state=420, n_init=\"auto\").fit(\n",
    "    kmeans_features\n",
    ")\n",
    "gammas_labels = gammas_train_kmeans.labels_\n",
    "\n",
    "# Silhouette score of the resulting clustering (Euclidean distance).\n",
    "silh_score = metrics.silhouette_score(\n",
    "    kmeans_features,\n",
    "    gammas_labels,\n",
    "    metric=\"euclidean\",\n",
    ")\n",
    "print(silh_score)\n",
    "\n",
    "# Store the cluster id of every row on the frame (adds/overwrites the column in place).\n",
    "gammas[\"Kmean_Labels\"] = gammas_labels\n",
    "gammas[\"Kmean_Labels\"]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Train/test split\n",
    "Holds out 20% of the labelled games as a test set."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 76,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.model_selection import train_test_split\n",
    "\n",
    "# NOTE: `gammas` must already be loaded by an earlier cell\n",
    "# (presumably from ../datasets/videogames/games_cleanish.csv -- confirm).\n",
    "# 80/20 split; the fixed seed keeps the partition identical between runs.\n",
    "gammas_train, gammas_test = train_test_split(\n",
    "    gammas,\n",
    "    test_size=0.20,\n",
    "    random_state=69,\n",
    ")"
   ]
  },
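  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Naive Bayes\n",
    "Trains a Gaussian Naive Bayes classifier on the training split of the video game data set to predict each game's K-means cluster label, then counts the mislabeled games in the test split."
   ]
  },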
  {
   "cell_type": "code",
   "execution_count": 78,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Number of mislabeled points out of a total 4721 points : 302\n"
     ]
    }
   ],
   "source": [
    "from sklearn.naive_bayes import GaussianNB\n",
    "\n",
    "# NOTE: the target is the K-means cluster id, and K-means was fitted on the\n",
    "# full frame before the train/test split, so this count measures how well\n",
    "# Gaussian Naive Bayes reproduces the clustering on held-out rows.\n",
    "silly_columns = [\"Critic_Score\", \"User_Score\", \"Total_Shipped\"]\n",
    "\n",
    "# Fit on the training split only, then predict the held-out rows.\n",
    "gnb = GaussianNB()\n",
    "gnb.fit(X=gammas_train[silly_columns], y=gammas_train[\"Kmean_Labels\"])\n",
    "prediction = gnb.predict(gammas_test[silly_columns])\n",
    "\n",
    "# Compare predictions against the stored cluster labels of the test rows.\n",
    "y_test = gammas_test[\"Kmean_Labels\"]\n",
    "print(\n",
    "    \"Number of mislabeled points out of a total %d points : %d\"\n",
    "    % (gammas_test.shape[0], (y_test != prediction).sum())\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 46,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Number of mislabeled points out of a total 75 points : 4\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "array([2, 1, 0, 2, 0, 2, 0, 1, 1, 1, 2, 1, 1, 1, 1, 0, 1, 1, 0, 0, 2, 1,\n",
       "       0, 0, 2, 0, 0, 1, 1, 0, 2, 1, 0, 2, 2, 1, 0, 1, 1, 1, 2, 0, 2, 0,\n",
       "       0, 1, 2, 2, 2, 2, 1, 2, 1, 1, 2, 2, 2, 2, 1, 2, 1, 0, 2, 1, 1, 1,\n",
       "       1, 2, 0, 0, 2, 1, 0, 0, 1])"
      ]
     },
     "execution_count": 46,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from sklearn.datasets import load_iris\n",
    "\n",
    "# Same GaussianNB workflow on the iris dataset, split 50/50.\n",
    "# NOTE: this cell rebinds `gnb` and `y_test`, which the video game\n",
    "# Naive Bayes cell also assigns; it relies on `train_test_split` and\n",
    "# `GaussianNB` having been imported by earlier cells.\n",
    "X, y = load_iris(return_X_y=True)\n",
    "X_train, X_test, y_train, y_test = train_test_split(\n",
    "    X, y, test_size=0.5, random_state=0\n",
    ")\n",
    "\n",
    "gnb = GaussianNB()\n",
    "gnb.fit(X_train, y_train)\n",
    "y_pred = gnb.predict(X_test)\n",
    "\n",
    "print(\n",
    "    \"Number of mislabeled points out of a total %d points : %d\"\n",
    "    % (X_test.shape[0], (y_test != y_pred).sum())\n",
    ")\n",
    "y_test"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
-   "display_name": "Python 3",
+   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
@@ -932,12 +1083,12 @@
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
-   "version": "3.11.0"
+   "version": "3.10.9"
  },
  "orig_nbformat": 4,
  "vscode": {
   "interpreter": {
-    "hash": "c261aea317cc0286b3b3261fbba9abdec21eaa57589985bb7a274bf54d6cc0a7"
+    "hash": "70ce2434745d4d40671ff71d794558676bf30253e5dd946148d83d754be8251d"
   }
  }
 },