Created
March 14, 2019 21:33
-
-
Save victorkohler/716a07620d5648524e7e227c0f60d32a to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import tensorflow as tf
import pandas as pd
import numpy as np
import scipy.sparse as sp
from tqdm import tqdm

#---------------------------
# LOAD AND PREPARE THE DATA
#---------------------------
#
# Reads the play-count TSV, encodes users and artists as dense integer ids,
# and builds a (users x artists) sparse matrix of play counts. Module-level
# names produced: df, item_lookup, users, artists, plays, rows, cols,
# data_sparse, uids, iids.

# Load the dataframe from a tab separated file.
# NOTE(review): read_csv treats the first line as a header by default, and the
# columns are renamed below. If the file has no header row, its first record
# is silently consumed here -- confirm against the data and pass header=None
# if so.
df = pd.read_csv('data/usersha1-artmbid-artname-plays.tsv', sep='\t')

# Drop the second column, then name the three remaining columns.
df = df.drop(df.columns[1], axis=1)
df.columns = ['user', 'artist', 'plays']

# Drop any rows with missing values
df = df.dropna()

# Drop any rows with 0 plays. This must happen BEFORE the ids are assigned:
# the matrix shape below is derived from the number of distinct ids that
# remain, so the codes have to be dense (0..n-1) over the surviving rows.
# Encoding first and filtering afterwards can leave a code >= n and make the
# sparse-matrix construction fail with an out-of-bounds index.
# .copy() gives an independent frame so the column assignments below do not
# write into a slice of the unfiltered data.
df = df.loc[df.plays != 0].copy()

# Convert user and artist names into numerical IDs
df['user_id'] = df['user'].astype("category").cat.codes
df['artist_id'] = df['artist'].astype("category").cat.codes

# Create a lookup frame so we can get the artist
# names back in readable form later. The ids are stored as strings here.
item_lookup = df[['artist_id', 'artist']].drop_duplicates()
item_lookup = item_lookup.assign(artist_id=item_lookup.artist_id.astype(str))

# We drop our old user and artist columns
df = df.drop(['user', 'artist'], axis=1)

# Create lists of all users, artists and plays
users = list(np.sort(df.user_id.unique()))
artists = list(np.sort(df.artist_id.unique()))
plays = list(df.plays)

# Get the rows and columns for our new matrix. Matrix coordinates are
# integers, so keep them integral instead of round-tripping through float.
rows = df.user_id.astype(np.int64)
cols = df.artist_id.astype(np.int64)

# Construct a sparse matrix for our users and items containing number of plays
data_sparse = sp.csr_matrix((plays, (rows, cols)), shape=(len(users), len(artists)))

# Get the coordinates of the non-zero entries as parallel arrays of user ids
# and item ids. Note that our lists have the same length,
# as each user id repeats one time for each played artist.
uids, iids = data_sparse.nonzero()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment