Skip to content

Instantly share code, notes, and snippets.

@markharwood
Created November 7, 2016 14:51
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save markharwood/f67a8532f0acba8dcc3fba07541b0933 to your computer and use it in GitHub Desktop.
Save markharwood/f67a8532f0acba8dcc3fba07541b0933 to your computer and use it in GitHub Desktop.
Index LastFM data for use in Elastic Graph V5+
import io
import sys

from elasticsearch import Elasticsearch
from elasticsearch import helpers
# Python 2 only: force UTF-8 as the implicit str<->unicode codec.
# reload(sys) is needed there to make sys.setdefaultencoding reachable again.
# Python 3 has no builtin reload() and already defaults to UTF-8, so the
# guard lets the script start on either major version.
if sys.version_info[0] == 2:
    reload(sys)  # noqa: F821 - builtin on Python 2 only
    sys.setdefaultencoding('utf8')
indexName = "lastfmusersv5"
es = Elasticsearch()

# Drop any previous copy of the index; 400/404 responses are ignored so a
# missing index does not abort the run.
es.indices.delete(index=indexName, ignore=[400, 404])

# Single shard, no replicas. The three listed fields of the "user" type are
# mapped as keyword; every other field is left out of the explicit mapping.
indexSettings = {
    "settings": {"number_of_shards": 1, "number_of_replicas": 0},
    "mappings": {
        "user": {
            "properties": {
                fieldName: {"type": "keyword"}
                for fieldName in ("artists", "gender", "country")
            },
        },
    },
}
es.indices.create(index=indexName, body=indexSettings)
# See http://mtg.upf.edu/node/1671 for data
ratingsFilename = "lastfm-dataset-360K/usersha1-artmbid-artname-plays.tsv"
profilesFilename = "lastfm-dataset-360K/usersha1-profile.tsv"

# Number of queued documents that triggers a bulk request.
bulkSize = 5000

actions = []       # bulk actions waiting to be sent
rowNum = 0         # ratings lines read so far
lastUser = None    # user whose ratings rows are currently being collected
artists = []       # artist names collected for lastUser
numProfiles = 0    # profile rows read after the first one


def _flushActions():
    """Send every queued action to Elasticsearch, then empty the queue."""
    if actions:
        helpers.bulk(es, actions)
        del actions[:]


def _queueUser(userId, profile, userArtists):
    """Queue one index action for a user.

    ``profile`` is a tab-split profile row whose columns are read as
    user, gender, age, country, join date. Gender and age are only stored
    when the column is non-empty.
    """
    doc = {"user": userId}
    if profile[1]:
        doc["gender"] = profile[1]
    if profile[2]:
        doc["age"] = int(profile[2])
    doc["numArtists"] = len(userArtists)
    doc["country"] = profile[3].strip()
    # Last column of the line: strip it so the line's newline is not indexed.
    doc["joindate"] = profile[4].strip()
    doc["artists"] = userArtists
    actions.append({
        "_index": indexName,
        '_op_type': 'index',
        "_type": "user",
        "_source": doc
    })


# The two files are walked in step (a merge join on the user id in column 0).
# NOTE(review): this relies on both files being sorted by user id - confirm
# against the dataset.
# newline="\n" keeps line splitting on "\n" only, as binary-mode iteration did.
with io.open(profilesFilename, encoding="utf-8", newline="\n") as profiles, \
        io.open(ratingsFilename, encoding="utf-8", newline="\n") as csvfile:
    # Current profile row; [""] when the profiles file is empty, which can
    # never equal a real user id.
    prof = next(profiles, "").split("\t")

    for line in csvfile:
        rowNum += 1
        row = line.split("\t")
        if len(row) < 4:
            # Blank or truncated line: report it instead of crashing on row[3].
            print("skipping malformed row %s" % rowNum)
            continue
        user = row[0]
        artistName = row[2]
        plays = int(row[3])

        if user != lastUser:
            # A new user starts here, so the previous one is complete.
            if lastUser == prof[0]:
                _queueUser(lastUser, prof, artists)
                # Flush bulk indexing action if necessary
                if len(actions) >= bulkSize:
                    _flushActions()
                    print(rowNum)
            # Advance the profiles until they catch up with the new user.
            while user > prof[0]:
                profRow = next(profiles, None)
                if profRow is None:
                    # Profiles exhausted before reaching this user.
                    print("missing profile %s %s %s"
                          % (user, rowNum, numProfiles))
                    break
                prof = profRow.split("\t")
                numProfiles += 1
            artists = []
            lastUser = user

        # Artists with a single play are left out of the user's list.
        if plays > 1:
            artists.append(artistName)

    # The final user has no following row to trigger the branch above, so
    # queue it here; otherwise the last user in the file is never indexed.
    if lastUser is not None and lastUser == prof[0]:
        _queueUser(lastUser, prof, artists)

# Send whatever is left in the queue.
_flushActions()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment