diff --git a/.gitignore b/.gitignore index fe6b5f3..c274c99 100644 --- a/.gitignore +++ b/.gitignore @@ -139,4 +139,6 @@ jupyter-notes/merged_games.csv output.csv output.xlsx -.gitignore \ No newline at end of file +.gitignore +datasets/videogames/games_train.csv +datasets/videogames/games_test.csv diff --git a/dwarves/mining_hq.py b/dwarves/mining_hq.py index 59e3483..3bb4edd 100644 --- a/dwarves/mining_hq.py +++ b/dwarves/mining_hq.py @@ -2,8 +2,10 @@ # Collects stuff from the rest of the scripts import pandas as pd import numpy as np + # containment breach import scipy as scp +from sklearn.model_selection import train_test_split import gunner, digger, gunner, scout # Instantiating globals to be used in other files @@ -13,7 +15,9 @@ global games_sales_split_dur global games_sales_split_pos games_review = pd.read_csv("datasets/videogames/Games.xls") -games_sales = scout.cure_depression(pd.read_csv("datasets/videogames/vgsales-12-4-2019-short.csv")) +games_sales = scout.cure_depression( + pd.read_csv("datasets/videogames/vgsales-12-4-2019-short.csv") +) print(games_review.count()) print(games_sales.count()) @@ -23,6 +27,7 @@ games_review_final = digger.slice_column(games_review, "GameName", "(Import)") games_merged_dat = digger.write_joined_df(games_sales, games_review_final) + # Acquisition of Merged dataset print(games_merged_dat.count()) @@ -103,15 +108,20 @@ gammas = digger.slam_dunk(gammas, "Critic_Score", labels=labels) # Also need to transform using Z-score (normal distr go brrrr lmao), or min-max # ah, scheiße # nvm, done, kekW -gammas['Critic_Score_Norm'] = scout.scaling_zscore(gammas, 'Critic_Score') -print(gammas['Critic_Score_Norm'].head(10)) +gammas["Critic_Score_Norm"] = scout.scaling_zscore(gammas, "Critic_Score") +print(gammas["Critic_Score_Norm"].head(10)) # Saving all into a file +gammas = gammas.dropna(how="any", axis=0) # nuke them empty poopers gammas.to_csv("datasets/videogames/games_cleanish.csv", index=False) +# split the data set +gammas_train, gammas_test = train_test_split(gammas, test_size=0.20, random_state=69) +gammas_train.to_csv("datasets/videogames/games_train.csv", index=False) +gammas_test.to_csv("datasets/videogames/games_test.csv", index=False) # Need similarity and dissimialrity, scipy time # Selecting 5 random rows -chosen_idx = np.random.choice(len(gammas), replace = False, size = 5) +chosen_idx = np.random.choice(len(gammas), replace=False, size=5) sample_rows = gammas.iloc[chosen_idx] print(sample_rows.head())