diff --git a/.gitignore b/.gitignore index 08e1aae..fe6b5f3 100644 --- a/.gitignore +++ b/.gitignore @@ -127,12 +127,16 @@ dmypy.json # Pyre type checker .pyre/ + # vscode settings conflic shuns .vscode/ -jupyter-notes/merged_games.csv -datasets/videogames/games_merged.csv -output.csv -.gitignore -output.xlsx -.gitignore datasets/videogames/merged_games.xlsx +datasets/videogames/games_merged.csv +datasets/videogames/games_cleanish.csv + +jupyter-notes/merged_games.csv + +output.csv +output.xlsx + +.gitignore \ No newline at end of file diff --git a/py_scripts/mining_hq.py b/py_scripts/mining_hq.py index a21390d..1b552ce 100644 --- a/py_scripts/mining_hq.py +++ b/py_scripts/mining_hq.py @@ -108,7 +108,7 @@ gammas['Critic_Score_Norm'] = scout.scaling_zscore(gammas, 'Critic_Score') print(gammas['Critic_Score_Norm'].head(10)) # Saving all into a file -gammas.to_csv("output.csv", index=False) +gammas.to_csv("datasets/videogames/games_cleanish.csv", index=False) # Need similarity and dissimialrity, scipy time # Selecting 5 random rows @@ -116,4 +116,4 @@ chosen_idx = np.random.choice(len(gammas), replace = False, size = 5) sample_rows = gammas.iloc[chosen_idx] print(sample_rows.head()) -# scout.dissimilarity(sample_rows.select_dtypes(include = np.number)) +scout.dissimilarity(sample_rows) diff --git a/py_scripts/scout.py b/py_scripts/scout.py index e630ebe..0d396e9 100644 --- a/py_scripts/scout.py +++ b/py_scripts/scout.py @@ -49,11 +49,15 @@ def scaling_zscore(dataframe, col): return stats.zscore(dataframe[col],axis = 0, nan_policy= "omit") def dissimilarity(row_arr): - for i in len(row_arr): - print("| ") - for j in len(row_arr): + row_arr = row_arr.select_dtypes(include = np.number) + row_arr = row_arr.drop('Rank', axis = 1) + + print(" | Entry 1 | Entry 2 | Entry 3 | Entry 4 | Entry 5 |") + for i in range(len(row_arr)): + print("Entry " , i + 1, " | ", end = "") + for j in range(len(row_arr)): eucDist = distance.euclidean(row_arr.iloc[i], row_arr.iloc[j]) - print(f"Dissim {i}{j}: {eucDist} |") + print(" {:#.6g} |".format(eucDist), end = "") print("\n") def scaling_range(datashitter, col):