# markharwood/IndexLastFmUsers.py Secret

## IndexLastFmUsers.py
from elasticsearch import helpers
from elasticsearch import Elasticsearch
import sys
# Python 2 only: reloading ``sys`` makes ``sys.setdefaultencoding`` callable
# again so the interpreter-wide default string encoding can be set to UTF-8.
# NOTE(review): presumably needed for non-ASCII text in the dataset -- confirm
# before porting; neither call exists in Python 3.
reload(sys)
sys.setdefaultencoding('utf8')

# Name of the index that receives the "user" documents.
indexName = "lastfmusers"
es = Elasticsearch()


def _analyzedWithRawField():
    """Return the mapping of an analyzed string with a not_analyzed ``raw`` sub-field."""
    return {
        "type": "string",
        "index": "analyzed",
        "fields": {
            "raw": {"type": "string", "index": "not_analyzed"},
        },
    }


# Always start from a clean index: drop any existing copy (400/404 responses
# from the delete call are ignored) and recreate it with explicit mappings.
es.indices.delete(index=indexName, ignore=[400, 404])
indexSettings = {
    "settings": {"number_of_shards": 1, "number_of_replicas": 0},
    "mappings": {
        "user": {
            "properties": {
                "artists": _analyzedWithRawField(),
                "gender": {"type": "string", "index": "not_analyzed"},
                "country": _analyzedWithRawField(),
            },
        },
    },
}
es.indices.create(index=indexName, body=indexSettings)
actions = []

# See http://mtg.upf.edu/node/1671 for data
ratingsFilename = "lastfm-dataset-360K/usersha1-artmbid-artname-plays.tsv"
profilesFilename = "lastfm-dataset-360K/usersha1-profile.tsv"
rowNum = 0
lastUser = ""
prof = []
artists = []
numProfiles = 0
endNow = False


def _queueUserDoc(userId, profile, userArtists):
    """Queue one user document for bulk indexing.

    userId      -- user id taken from the ratings file
    profile     -- the matching profile row split on tabs:
                   [id, gender, age, country, join date]
    userArtists -- artist names collected for this user

    The pending actions are sent to Elasticsearch (and the current ratings
    row number is printed as a progress marker) once 5000 have accumulated.
    """
    doc = {}
    doc["user"] = userId
    # Gender and age are only stored when the profile column is non-empty.
    if profile[1]:
        doc["gender"] = profile[1]
    if profile[2]:
        doc["age"] = int(profile[2])
    doc["numArtists"] = len(userArtists)
    doc["country"] = profile[3].strip()
    # The join date is the last column of the row, so it still carries the
    # line ending; strip it rather than indexing the newline.
    doc["joindate"] = profile[4].strip()
    doc["artists"] = userArtists
    actions.append({
        "_index": indexName,
        '_op_type': 'index',
        "_type": "user",
        "_source": doc
    })
    # Flush bulk indexing action if necessary
    if len(actions) >= 5000:
        helpers.bulk(es, actions)
        del actions[:]
        print(rowNum)


# Walk the ratings file and the profiles file in step. Consecutive ratings rows
# with the same user id are folded into one document; the profile stream is
# only ever advanced forwards (``user > prof[0]``), so this relies on both
# files being ordered by user id.
with open(profilesFilename, 'rb') as profiles:
    with open(ratingsFilename, 'rb') as csvfile:
        for line in csvfile:
            row = line.split("\t")
            rowNum += 1
            user = row[0]
            artistName = row[2]
            plays = int(row[3])
            if rowNum == 1:
                prof = next(profiles).split("\t")
            if rowNum == 1 or lastUser != user:
                # A new user starts on this row, so the previous user (if any)
                # is complete; index it when its profile was found.
                if rowNum > 1 and lastUser == prof[0]:
                    _queueUserDoc(lastUser, prof, artists)
                # Skip profiles that sort before the new user. This also runs
                # for the very first row so the first user is aligned too.
                while user > prof[0]:
                    try:
                        profRow = next(profiles)
                    except StopIteration:
                        endNow = True
                        print("missing profile %s %s %s"
                              % (user, rowNum, numProfiles))
                        break
                    prof = profRow.split("\t")
                    numProfiles += 1
                artists = []
                lastUser = user
                if endNow:
                    # No profiles left: no later user can be matched, so
                    # there is nothing more to index.
                    break
            # Only artists played more than once are recorded.
            if plays > 1:
                artists.append(artistName)

# A user is only queued when the next user's first row is seen, so the final
# user of the ratings file is still pending once the loop has finished.
if rowNum > 0 and not endNow and lastUser == prof[0]:
    _queueUserDoc(lastUser, prof, artists)

# Send whatever is left over from the last partial batch.
if len(actions) > 0:
    helpers.bulk(es, actions)
    del actions[:]