Skip to content

Instantly share code, notes, and snippets.

@jamesthomson
Last active March 27, 2016 23:40
  • Star 0 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
Star You must be signed in to star a gist
Save jamesthomson/35f65a6f6e857bc54ec9 to your computer and use it in GitHub Desktop.
aws version of the lastfm recommendations in spark
#in terminal, connect to the master node
ssh hadoop@ec2-xx-xx-xxx-xxx.compute-1.amazonaws.com -i ~/aws_key_pair.pem
#then fire up spark
MASTER=yarn-client /home/hadoop/spark/bin/pyspark
# Load the raw Last.fm listens file from S3 and split every line on tabs.
# Judging by the file name the columns are: user sha1, artist mbid,
# artist name, plays -- TODO confirm against the actual data.
lines = sc.textFile('s3n://jthomson/lastfm_listens/listens/usersha1-artmbid-artname-plays.tsv')
data = lines.map(lambda l: l.split('\t'))

# Keep field 0 (user) and field 2 (artist name); every row gets a constant
# weight of 1, i.e. the play count column is not used.
ratings = data.map(lambda d: (d[0], d[2], 1))

# Lookup tables of (original string key, unique integer id). The integer
# ids are what gets fed to ALS further down.
users_lkp = ratings.map(lambda s: s[0]).distinct().zipWithUniqueId()
items_lkp = ratings.map(lambda s: s[1]).distinct().zipWithUniqueId()

# NOTE: the lambdas below index into their argument instead of using
# tuple-parameter unpacking (`lambda (u, a, r): ...`), which was removed in
# Python 3 (PEP 3113). Indexing behaves identically on Python 2.

# Swap the artist name for its integer id:
#   (user, artist, r) -> (artist, (user, r))
#   join items_lkp    -> (artist, ((user, r), artist_id))
#   reshape           -> (user, artist_id, r)
repArtist = (
    ratings
    .map(lambda t: (t[1], (t[0], t[2])))
    .join(items_lkp)
    .map(lambda kv: (kv[1][0][0], kv[1][1], kv[1][0][1]))
)

# Swap the user hash for its integer id:
#   (user, artist_id, r) -> (user, (artist_id, r))
#   join users_lkp       -> (user, ((artist_id, r), user_id))
#   reshape              -> (user_id, artist_id, r)
repUser = (
    repArtist
    .map(lambda t: (t[0], (t[1], t[2])))
    .join(users_lkp)
    .map(lambda kv: (kv[1][1], kv[1][0][0], kv[1][0][1]))
)
from pyspark.mllib.recommendation import ALS, MatrixFactorizationModel, Rating
# ALS hyper-parameters: number of latent factors and number of iterations.
rank = 20
numIterations = 10

# Fit an implicit-feedback matrix factorisation on the
# (user_id, artist_id, weight) triples. The fourth positional argument of
# ALS.trainImplicit is `lambda_`, so it is spelled out by keyword here.
model = ALS.trainImplicit(
    repUser,
    rank,
    iterations=numIterations,
    lambda_=0.01,
)
#create recs for specific users
#find some shuggie otis fans
ratings.filter(lambda x: x[1] == 'shuggie otis').top(10)

#pick one at random and find user id
target_user = 'fd3c74ac50f8ffc0089caa3cad8bc7a5997af48e'
users_lkp.filter(lambda x: x[0] == target_user).collect()

# Resolve the integer id from the lookup table rather than hard-coding it:
# ids handed out by zipWithUniqueId depend on how the RDD is partitioned, so
# a literal id can point at a different user on another run or cluster.
# first() raises ValueError if the user is not present in users_lkp.
target_user_id = users_lkp.filter(lambda x: x[0] == target_user).first()[1]

#have a look at what they listened to
ratings.filter(lambda x: x[0] == target_user).map(lambda x: x[1]).collect()

#generate top 5 predictions
# Pair the chosen user id with every artist id so each pair gets scored.
# (Index-based lambdas replace `lambda (a, i): ...`, which is a SyntaxError
# on Python 3 -- see PEP 3113.)
userArtist = items_lkp.map(lambda kv: (target_user_id, kv[1]))

# Score every (user_id, artist_id) pair and map artist ids back to names:
#   prediction r      -> (r[1], r[2])        fields 1 and 2 of each prediction
#   join (id, artist) -> (artist_id, (score, artist))
#   reshape           -> (artist, score)
userPred = (
    model.predictAll(userArtist)
    .map(lambda r: (r[1], r[2]))
    .join(items_lkp.map(lambda kv: (kv[1], kv[0])))
    .map(lambda kv: (kv[1][1], kv[1][0]))
)

# Five highest-scoring artists (negated key gives descending order).
userPred.takeOrdered(5, key=lambda x: -x[1])
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment