Skip to content

Instantly share code, notes, and snippets.

@mindsocket
Created March 19, 2012 02:41
Show Gist options
  • Save mindsocket/2091580 to your computer and use it in GitHub Desktop.
Save mindsocket/2091580 to your computer and use it in GitHub Desktop.
Use beets' acoustid plugin, musicbrainz lookup and lastfm api to determine playcounts for a collection
#!/home/roger/.virtualenvs/lastfm/bin/python -u
"""
File: lastmatchwithnums.py
Author: Roger Barnes
A simple program for using acoustid to fingerprint and look up metadata (esp. play counts)
for MP3 files via lastfm. Usage:
$ python lastmatchwithnums.py [folder] ...
All mp3s in all folders (recursive) will be fingerprinted with the beets
acoustid plugin, then looked up (if possible) from last.fm for play and
listener counts. Data is written into redis.
Finally, the data can be extracted and reported. Feed into a symlink thusly:
$ ./lastmatchwithnums.py | xargs -d '\n' -Ixxx ln -sf xxx ~/audiolinks/
Requirements, all pip installable (+ some dependent system packages, YMMV):
beets
redis
pyacoustid
Technology note: The generator-pipeline-driven programming style in Matcher was inspired by
David Beazley's presentations on generators. - http://www.dabeaz.com/generators/
TODO - multiprocessing?
"""
import sys
import os
import pylast
from beets.autotag import mb
from redis import Redis
from beetsplug import chroma
from pprint import pprint
# This API key is specifically for this script.
# http://last.fm/api/account
# It is passed to pylast.LastFMNetwork when a Matcher is constructed.
API_KEY = 'faf408096c145277a0e01e712ae4a5f2'
# pylast error types that Matcher._lastfmlookup treats as a failed lookup:
# they are caught around the track lookup and around the play/listener
# count calls instead of being allowed to abort the whole run.
PYLAST_EXCEPTIONS = (
pylast.WSError,
pylast.MalformedResponseError,
pylast.NetworkError,
)
import fnmatch
def gen_find(filepat, top):
    """Yield the path of every file under ``top`` whose name matches ``filepat``.

    ``top`` is walked recursively with ``os.walk``; ``filepat`` is a
    shell-style pattern (for example ``"*.mp3"``) applied to the bare file
    name with ``fnmatch.filter``.  Each yielded path is the walked directory
    joined with the file name, so it is relative whenever ``top`` is.
    Nothing is yielded when ``top`` does not exist.
    """
    for path, _dirs, filelist in os.walk(top):
        for name in fnmatch.filter(filelist, filepat):
            yield os.path.join(path, name)
class Matcher(object):
    """Fingerprint MP3 files and record last.fm play statistics in redis.

    Redis keys written by this class:

    * ``lastfmpaths`` -- set of every ``lastfm:<path>`` key seen so far.
    * ``lastfmdirs`` -- set of the directory part of those keys.
    * ``lastfmdir:<dir>`` -- set of the file keys below one directory.
    * ``lastfm:<path>`` -- hash holding ``id``, ``artist``, ``title``,
      ``playcount`` and ``listener_count`` for one file.
    * ``mbid_for:<path>`` -- cached fingerprint result for one file.
    * ``matchrec:<id>`` -- cached ``id``/``artist``/``title`` hash.
    * ``lastfmscore`` and ``lastfmscore:<dir>`` -- sorted sets that are
      rebuilt every time a report is produced.
    """

    # Stored in place of an id when nothing matched.  Passing None to redis
    # is not portable: a client that stringifies values stores the truthy
    # text "None", and newer redis-py releases refuse None outright.
    NO_MATCH = "NOMATCH"
    # Cached fingerprint values that mean "no match".  "None" is accepted
    # as well so entries written before the sentinel existed are not
    # mistaken for real ids.
    _NO_MATCH_VALUES = (NO_MATCH, "None")

    def __init__(self, **kwargs):
        """Connect to redis and last.fm.

        Required keyword arguments: ``redis_host``, ``redis_port`` and
        ``redis_db``; the port and db are converted with ``int``.  A missing
        argument raises ``KeyError``.
        """
        self.redis = Redis(
            host=kwargs['redis_host'],
            port=int(kwargs['redis_port']),
            db=int(kwargs['redis_db']),
        )
        self.network = pylast.LastFMNetwork(api_key=API_KEY)

    @staticmethod
    def _path_from_key(key):
        """Return the file path stored in a ``lastfm:<path>`` key.

        Only the first colon is treated as the separator, so a path that
        itself contains ``:`` is returned whole.
        """
        return key.split(':', 1)[1]

    def processpath(self, path):
        """Process every ``*.mp3`` below ``path`` and return how many were recorded.

        The stages below are lazy generators chained together; nothing runs
        until ``sum`` drains the last one.
        """
        mp3files = gen_find("*.mp3", path)
        fullpaths = self._getfullpath(mp3files)
        # This short circuits anything already in redis
        newpaths = (p for p in fullpaths if self._addpath(p))
        matches = self._getmatches(newpaths)
        matchrecs = self._getmatchrec(matches)
        return sum(self._lastfmlookup(matchrecs))

    def _getfullpath(self, files):
        """Yield each path in ``files`` with ``~`` expanded and made absolute."""
        for relfile in files:
            yield os.path.abspath(os.path.expanduser(relfile))

    def _addpath(self, path):
        """Register ``path`` in redis; return a truthy value only if it is new.

        This one is not a generator.  The directory index sets are updated
        on every call; the return value is the result of adding the file key
        to ``lastfmpaths``.  When the key was already present, the stored
        hash is reported on stderr.
        """
        key = "lastfm:" + path
        dirkey = os.path.dirname(key)
        self.redis.sadd("lastfmdirs", dirkey)
        self.redis.sadd("lastfmdir:" + dirkey, key)
        result = self.redis.sadd("lastfmpaths", key)
        if not result:
            sys.stderr.write("%s exists %s\n" % (path, self.redis.hgetall(key)))
        return result

    def _getmatches(self, files):
        """Yield ``(path, matchid)`` for every file that has a fingerprint match.

        The result for each path is cached under ``mbid_for:<path>``.  Files
        with no match, or whose fingerprinting raised, are cached as
        ``NO_MATCH`` and skipped.
        """
        for path in files:
            cache_key = "mbid_for:" + path
            matchid = self.redis.get(cache_key)
            if not matchid:
                try:
                    match = chroma.acoustid_match(path)
                    # NOTE(review): the first element of the result is
                    # used as the id -- confirm against the installed
                    # beets chroma plugin.
                    matchid = match[0] if match else None
                except (EOFError, AttributeError, IOError) as err:
                    sys.stderr.write("ERROR matching %s %s\n" % (path, err))
                    matchid = None
                self.redis.set(cache_key, matchid if matchid else self.NO_MATCH)
            if not matchid or matchid in self._NO_MATCH_VALUES:
                continue
            yield (path, matchid)

    def _getmatchrec(self, pairs):
        """Yield ``(path, matchrec)`` with ``id``, ``artist`` and ``title``.

        Records are cached under ``matchrec:<id>``.  On a cache miss the
        track is fetched with ``mb.track_for_id``; if the result has no
        ``artist``/``title`` attributes the pair is dropped.  A pair with a
        falsy id is passed on with an empty record.
        """
        for path, matchid in pairs:
            if not matchid:
                yield (path, {})
                continue
            cache_key = "matchrec:" + matchid
            matchrec = self.redis.hgetall(cache_key)
            if not matchrec:
                mbtrack = mb.track_for_id(matchid)
                try:
                    matchrec = {
                        'id': matchid,
                        'artist': mbtrack.artist,
                        'title': mbtrack.title,
                    }
                except AttributeError:
                    # Presumably the lookup returned nothing usable.
                    continue
                self.redis.hmset(cache_key, matchrec)
            yield (path, matchrec)

    def _lastfmlookup(self, matchrecs):
        """Store last.fm counts for each record; yield ``1`` per file recorded.

        The track is looked up by id first and, if that raises one of
        ``PYLAST_EXCEPTIONS``, by artist and title.  Files for which neither
        lookup is possible, or whose counts cannot be fetched, are skipped
        without yielding.  Records without an ``id`` are stored with the id
        ``NOMATCH``.
        """
        for path, matchrec in matchrecs:
            key = "lastfm:" + path
            if 'id' in matchrec:
                try:
                    track = self.network.get_track_by_mbid(matchrec['id'])
                except PYLAST_EXCEPTIONS:
                    if 'artist' in matchrec and 'title' in matchrec:
                        track = self.network.get_track(
                            matchrec['artist'], matchrec['title'])
                    else:
                        continue
                # artist/title are treated as optional just above, so only
                # copy the fields that are actually present.
                for field in ('id', 'artist', 'title'):
                    if field in matchrec:
                        self.redis.hset(key, field, matchrec[field])
                try:
                    self.redis.hset(key, 'playcount', track.get_playcount())
                    self.redis.hset(
                        key, 'listener_count', track.get_listener_count())
                except PYLAST_EXCEPTIONS:
                    continue
            else:
                self.redis.hset(key, 'id', "NOMATCH")
            sys.stderr.write("%s new %s\n" % (path, self.redis.hgetall(key)))
            yield 1

    def _printsorted(self, identifier, allkeys, calc=lambda play,listen:float(play) / float(listen)):
        """Rebuild the sorted set ``identifier`` and return it, highest score first.

        Despite the name nothing is printed here.  Only keys whose hash has
        both ``playcount`` and ``listener_count`` and a playcount above 1000
        are scored.  ``calc(playcount, listener_count)`` produces the score
        (plays per listener by default); a record whose score would divide
        by zero is left out.  Returns a list of ``(key, score)`` pairs.
        """
        self.redis.delete(identifier)
        for key in allkeys:
            rec = self.redis.hgetall(key)
            if 'playcount' not in rec or 'listener_count' not in rec:
                continue
            if int(rec['playcount']) <= 1000:
                continue
            try:
                score = calc(rec['playcount'], rec['listener_count'])
            except ZeroDivisionError:
                continue
            # NOTE(review): (name, member, score) is the argument order of
            # the legacy redis-py 2.x ``Redis`` client -- confirm against
            # the installed redis-py before upgrading it.
            self.redis.zadd(identifier, key, score)
        scored = self.redis.zrange(identifier, 0, -1, withscores=True)
        scored.reverse()
        return scored

    def printsortedresults(self):
        """Print the path of every known file scoring above 5, best first."""
        allkeys = self.redis.smembers("lastfmpaths")
        for key, score in self._printsorted("lastfmscore", allkeys):
            if score > 5:
                print(self._path_from_key(key))

    def printsortedresultsbydir(self):
        """Print, for each known directory, its two files with the most plays."""
        for path in self.redis.smembers("lastfmdirs"):
            allkeys = self.redis.smembers("lastfmdir:" + path)
            scored = self._printsorted(
                "lastfmscore:" + path, allkeys,
                calc=lambda play, listen: float(play))
            for key, _score in scored[:2]:
                print(self._path_from_key(key))
if __name__ == '__main__':
    # Every command-line argument is a folder to scan recursively for mp3s.
    args = sys.argv[1:]
    matcher = Matcher(redis_host='localhost', redis_port=6379, redis_db=0)
    # Count of files recorded across all folders; only used by the
    # optional progress line below.
    total = 0
    for patharg in args:
        total += matcher.processpath(patharg)
    #print "Processed", total, "files"
    # The report goes to stdout (diagnostics go to stderr), so the output
    # can be piped straight into xargs as shown in the module docstring.
    matcher.printsortedresultsbydir()
    #matcher.printsortedresults()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment