diff --git a/dwarves/Mining_HQ.ipynb b/dwarves/Mining_HQ.ipynb index ec476b9..b48e203 100644 --- a/dwarves/Mining_HQ.ipynb +++ b/dwarves/Mining_HQ.ipynb @@ -10,7 +10,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 3, "metadata": {}, "outputs": [], "source": [ @@ -914,11 +914,162 @@ "games_crime_dur = sns.jointplot(data = games_dur, x = \"Year\", y = 'Violent_US')\n", "plt.close(1)" ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# K-means\n", + "This runs the k-means algorithm (10 clusters) on the critic scores, user scores, and total units shipped for video games, then prints the silhouette score of the resulting clustering." + ] + }, + { + "cell_type": "code", + "execution_count": 75, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "0.2907407242071878\n" + ] + }, + { + "data": { + "text/plain": [ + "0 2\n", + "1 2\n", + "2 2\n", + "3 2\n", + "4 2\n", + " ..\n", + "23596 1\n", + "23597 1\n", + "23598 1\n", + "23599 1\n", + "23600 1\n", + "Name: Kmean_Labels, Length: 23601, dtype: int32" + ] + }, + "execution_count": 75, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from sklearn.cluster import KMeans\n", + "from sklearn import metrics\n", + "gammas_train_kmeans = KMeans(n_clusters=10, random_state=420, n_init=\"auto\").fit(\n", + " gammas[[\"Critic_Score\", \"User_Score\", \"Total_Shipped\"]]\n", + ")\n", + "gammas_labels = gammas_train_kmeans.labels_\n", + "\n", + "silh_score = metrics.silhouette_score(\n", + " gammas[[\"Critic_Score\", \"User_Score\", \"Total_Shipped\"]],\n", + " gammas_labels,\n", + " metric=\"euclidean\",\n", + ")\n", + "print(silh_score)\n", + "gammas[\"Kmean_Labels\"] = gammas_labels\n", + "gammas[\"Kmean_Labels\"]\n", + "# print(gammas_train.head())" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Splitting" + ] + }, + { + "cell_type": "code", + "execution_count": 76, + "metadata": {}, + "outputs": [], + "source": [ + "from 
sklearn.model_selection import train_test_split\n", + "# gammas = pd.read_csv(\"../datasets/videogames/games_cleanish.csv\")\n", + "gammas_train, gammas_test = train_test_split(gammas, test_size=0.20, random_state=69)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Naive Bayes\n", + "Trains a Gaussian Naive Bayes classifier on the video game training split to predict each game's k-means cluster label, then counts the mislabeled points in the test split." + ] + }, + { + "cell_type": "code", + "execution_count": 78, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Number of mislabeled points out of a total 4721 points : 302\n" + ] + } + ], + "source": [ + "from sklearn.naive_bayes import GaussianNB\n", + "gnb = GaussianNB()\n", + "silly_columns = [\"Critic_Score\", \"User_Score\", \"Total_Shipped\"]\n", + "prediction = gnb.fit(X=gammas_train[silly_columns], y=gammas_train[\"Kmean_Labels\"]).predict(gammas_test[silly_columns])\n", + "len(prediction)\n", + "y_test = gammas_test[\"Kmean_Labels\"]\n", + "len(y_test)\n", + "print(\"Number of mislabeled points out of a total %d points : %d\"\n", + " % (gammas_test.shape[0], (y_test != prediction).sum()))\n" + ] + }, + { + "cell_type": "code", + "execution_count": 46, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Number of mislabeled points out of a total 75 points : 4\n" + ] + }, + { + "data": { + "text/plain": [ + "array([2, 1, 0, 2, 0, 2, 0, 1, 1, 1, 2, 1, 1, 1, 1, 0, 1, 1, 0, 0, 2, 1,\n", + " 0, 0, 2, 0, 0, 1, 1, 0, 2, 1, 0, 2, 2, 1, 0, 1, 1, 1, 2, 0, 2, 0,\n", + " 0, 1, 2, 2, 2, 2, 1, 2, 1, 1, 2, 2, 2, 2, 1, 2, 1, 0, 2, 1, 1, 1,\n", + " 1, 2, 0, 0, 2, 1, 0, 0, 1])" + ] + }, + "execution_count": 46, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from sklearn.datasets import load_iris\n", + "X, y = load_iris(return_X_y=True)\n", + "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=0)\n", + "gnb = GaussianNB()\n", + "y_pred = 
gnb.fit(X_train, y_train).predict(X_test)\n", + "print(\"Number of mislabeled points out of a total %d points : %d\"\n", + " % (X_test.shape[0], (y_test != y_pred).sum()))\n", + "y_test" + ] } ], "metadata": { "kernelspec": { - "display_name": "Python 3", + "display_name": "Python 3", "language": "python", "name": "python3" }, @@ -932,12 +1083,12 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.11.0" + "version": "3.10.9" }, "orig_nbformat": 4, "vscode": { "interpreter": { - "hash": "c261aea317cc0286b3b3261fbba9abdec21eaa57589985bb7a274bf54d6cc0a7" + "hash": "70ce2434745d4d40671ff71d794558676bf30253e5dd946148d83d754be8251d" } } },