Last active
October 24, 2022 02:16
-
-
Save wangkuiyi/edb8870d30dc6b639bb3ae1384e8aa12 to your computer and use it in GitHub Desktop.
The marginal and joint distributions of the MovieLens-100K data
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from collections import Counter
from typing import Dict, List, Tuple

import matplotlib.pyplot as plt
import pandas
import seaborn as sns
def freq(df: pandas.DataFrame, column_name: str) -> Dict[int, int]:
    """Count how many times each value occurs in a data frame column.

    Args:
        df: Data frame holding the column. Its index may be anything; rows
            are visited in order rather than looked up by label.
        column_name: Name of the column to count.

    Returns:
        A dict mapping each distinct value to its number of occurrences.
        Keys appear in the order the values are first seen in the column.
    """
    # Iterate over the values themselves instead of indexing with
    # ``df[column_name][i]``: that form is a label lookup and raises KeyError
    # whenever the index is not exactly 0..n-1 (e.g. after filtering rows).
    # Counter keeps first-seen insertion order, like the hand-written loop.
    return dict(Counter(df[column_name]))
def rank(freq: Dict[int, int]) -> Tuple[List[int], List[int]]:
    """Order words by descending count and pair them with their ranks.

    Args:
        freq: Mapping from word to its occurrence count.

    Returns:
        Two parallel lists ``(words, ranks)``: the words sorted from most to
        least frequent, and the ranks ``0, 1, 2, ...`` of the same length.
        Words with equal counts keep the order in which they appear in
        ``freq``.
    """
    # Sorting on the negated count gives descending order while the stable
    # sort keeps tied words in their original dict order.
    ordered_words = sorted(freq, key=lambda word: -freq[word])
    return ordered_words, list(range(len(ordered_words)))
def reword(df: pandas.DataFrame, column_name: str) -> None:
    """Relabel a column's values in place with their frequency ranks.

    The most frequent value becomes 0, the next most frequent becomes 1, and
    so on, using ``freq`` to count the values and ``rank`` to order them.

    Args:
        df: Data frame to modify; ``df[column_name]`` is overwritten.
        column_name: Name of the column to relabel.
    """
    ws, rs = rank(freq(df, column_name))
    # Translate through a value -> rank dict: one lookup per row, and each
    # original value is mapped exactly once, so it does not matter that the
    # original ids and the assigned ranks share the same integer range.
    # Every value in the column is a key because freq() counted all of them.
    df[column_name] = df[column_name].map(dict(zip(ws, rs)))
# Read the ratings file: tab-separated values with no header row.
# c.f. https://stackoverflow.com/a/34094058/724872
raw = pandas.read_csv("u.data", sep='\t', header=None)
raw.columns = ['user', 'item', 'rate', 'timestamp']

# Keep only the positive ratings (rate >= 3), discard the columns that are no
# longer needed, and renumber the rows from zero. The renumbering has to come
# after the loc selection, which leaves gaps in the row labels.
df = (
    raw.loc[raw['rate'] >= 3]
    .drop(columns=['rate', 'timestamp'])
    .reset_index(drop=True)
)

# Replace user and item ids by their frequency ranks (0 = most frequent).
reword(df, 'user')
reword(df, 'item')
df.to_csv('u.data.reword')

# Plot the joint density of the reworded ids, with the marginals on the axes.
sns.jointplot(x=df["user"], y=df["item"], kind='kde')
plt.savefig('/tmp/a.png')
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Observations