Skip to content

Instantly share code, notes, and snippets.

@victorkohler
Created March 14, 2019 21:33
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save victorkohler/716a07620d5648524e7e227c0f60d32a to your computer and use it in GitHub Desktop.
Save victorkohler/716a07620d5648524e7e227c0f60d32a to your computer and use it in GitHub Desktop.
import tensorflow as tf
import pandas as pd
import numpy as np
import scipy.sparse as sp
from tqdm import tqdm
#---------------------------
# LOAD AND PREPARE THE DATA
#---------------------------
# Load the dataframe from a tab separated file.
# header=None makes pandas treat the first line as data. The column names
# are assigned by hand below, so without it the first record would be
# consumed as a header and then overwritten, silently losing one row.
# NOTE(review): this assumes the TSV has no header line -- confirm.
df = pd.read_csv('data/usersha1-artmbid-artname-plays.tsv', sep='\t', header=None)
# Drop the second column, which is not used by this script.
# NOTE(review): the file name suggests it holds the artist MBID -- confirm.
df = df.drop(df.columns[1], axis=1)
# Add column names to the three remaining columns
df.columns = ['user', 'artist', 'plays']
# Drop any rows with missing values
df = df.dropna()
# Drop any rows with 0 plays.
# This happens BEFORE the numerical IDs are assigned: category codes are
# numbered 0..n-1 over the values present at encoding time, so filtering
# first guarantees that every remaining user/artist id is contiguous and
# strictly smaller than the number of unique users/artists. Filtering
# afterwards could leave gaps (a user or artist with only zero-play rows),
# and ids would then exceed a size derived from the unique-id count.
# .copy() gives us an independent frame so the column assignments below
# do not write into a view of the unfiltered data.
df = df.loc[df.plays != 0].copy()
# Convert user and artist names into numerical IDs
df['user_id'] = df['user'].astype("category").cat.codes
df['artist_id'] = df['artist'].astype("category").cat.codes
# Create a lookup frame so we can get the artist
# names back in readable form later.
# The artist_id column of the lookup frame is stored as strings.
item_lookup = df[['artist_id', 'artist']].drop_duplicates()
item_lookup['artist_id'] = item_lookup.artist_id.astype(str)
# We drop our old user and artist columns; only the numerical
# ids and the play counts are kept.
df = df.drop(['user', 'artist'], axis=1)
# Create lists of all users, artists and plays
users = list(np.sort(df.user_id.unique()))
artists = list(np.sort(df.artist_id.unique()))
plays = list(df.plays)
# Get the rows and columns for our new matrix.
# Matrix indices must be integers, so the ids are kept as integers instead
# of being round-tripped through float.
rows = df.user_id.astype(np.int64)
cols = df.artist_id.astype(np.int64)
# Construct a sparse matrix for our users and items containing number of plays.
# NOTE: the shape is derived from the number of unique ids, so it assumes
# the ids in df are contiguous starting at 0; an id >= len(users) or
# >= len(artists) would make the constructor raise.
data_sparse = sp.csr_matrix((plays, (rows, cols)), shape=(len(users), len(artists)))
# Get the values of our matrix as a list of user ids
# and item ids. Note that our lists have the same length
# as each user id repeats one time for each played artist.
uids, iids = data_sparse.nonzero()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment