Skip to content

Instantly share code, notes, and snippets.

Last active Dec 14, 2018
What would you like to do?
Whoosh matching of jams dump to MSD
import whoosh
import whoosh.fields
import whoosh.index
import whoosh.analysis
import whoosh.qparser
# Path to a whoosh index of the Million Song Dataset; the script opens it with
# whoosh.index.open_dir.
# You'll need to create one of these if you don't have one already.
# NOTE(review): the original comment ended with "as an example see", but the
# reference that followed is missing here -- restore the link if known.
MSD_INDEX = '/home/craffel/projects/midi-dataset-cqt/data/msd/index/'
# Path to the jams.tsv file.  Its first row is treated as a header and
# skipped; columns 2 and 3 of every other row are used as artist and title.
# NOTE(review): the original comment ended with "get it from", but the source
# location that followed is missing here -- restore it if known.
JAMS_TSV_PATH = 'archive/jams.tsv'
# Path to the output match TSV file; one "<jam column 0>\t<MSD track_id>"
# line is written per match.
OUTPUT_TSV = 'jam_to_msd.tsv'
def search(searcher, schema, artist, title, threshold=26):
    ''' Find matches with a score above a certain threshold in a whoosh index.

    :parameters:
        - searcher : whoosh searcher
            Searcher opened on the index to query (e.g. the object returned
            by ``index.searcher()``).  It is only queried here, never closed.
        - schema : whoosh schema
            Schema of that index.  It is used to build query parsers for the
            ``artist`` and ``title`` fields.
        - artist : str or unicode
            Artist name to search for.  Byte strings are decoded as UTF-8.
        - title : str or unicode
            Song title to search for.  Byte strings are decoded as UTF-8.
        - threshold : float
            A result must have a score strictly higher than this to be
            returned.

    :returns:
        - matches : list of list
            One ``[track_id, artist, title]`` entry (taken from the stored
            fields of the hit) per result scoring above ``threshold``; an
            empty list when nothing qualifies.
    '''
    # Convert arguments to unicode.  ``bytes`` is ``str`` on Python 2, so
    # this decodes exactly the inputs the old ``type(x) != unicode`` check
    # handled, and it also runs unchanged on Python 3.
    if isinstance(artist, bytes):
        artist = artist.decode('utf-8')
    if isinstance(title, bytes):
        title = title.decode('utf-8')
    # Construct a query parser for each searched field of the whoosh index
    arparser = whoosh.qparser.QueryParser('artist', schema)
    tiparser = whoosh.qparser.QueryParser('title', schema)
    # A hit has to match both the artist query and the title query
    q = whoosh.query.And([arparser.parse(artist), tiparser.parse(title)])
    # Get whoosh results
    results = searcher.search(q)
    # Keep only the results with a score above the provided threshold; when
    # there are no results at all this is simply an empty list
    return [[r['track_id'], r['artist'], r['title']] for r in results
            if r.score > threshold]
if __name__ == '__main__':
    # io.open decodes/encodes text the same way on Python 2 and Python 3, so
    # every field is already unicode by the time it is formatted below
    # (formatting raw byte strings into a unicode template fails on
    # non-ASCII names under Python 2).
    import io

    # Load in the jams tsv file
    with io.open(JAMS_TSV_PATH, encoding='utf-8') as f:
        # Parse each line in the file
        jams = [line.strip().split('\t') for line in f]
    # Remove header row
    jams = jams[1:]
    # Load in the whoosh index
    msd_index = whoosh.index.open_dir(MSD_INDEX)
    # Open the output tsv file for writing; newline='\n' keeps line endings
    # untranslated, as the former binary mode did
    with io.open(OUTPUT_TSV, 'w', encoding='utf-8',
                 newline='\n') as jam_to_msd_tsv:
        # Open a searcher object from the index
        with msd_index.searcher() as searcher:
            # Match each jam entry
            for jam in jams:
                # A failure on one entry (short row, unparsable query, ...)
                # is reported and must not abort the remaining entries
                try:
                    # Extract artist and title from the jam entry
                    artist, title = jam[2:4]
                    # Match this artist and title against the MSD
                    results = search(
                        searcher, msd_index.schema, artist, title)
                    # Write out each result to the output TSV file
                    for result in results:
                        jam_to_msd_tsv.write(
                            u'{}\t{}\n'.format(jam[0], result[0]))
                        print(u"{}: {} - {} -> {} - {}".format(
                            result[0], result[1], result[2], artist, title))
                except Exception as e:
                    print("Error: {}, jam={}".format(e, jam))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment