@craffel
Last active April 24, 2016 23:34
Scripts used for generating the clean MIDI subset, as used in https://github.com/craffel/midi-dataset
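This gist collects several separate scripts, shown back to back below: a script that resolves each MIDI file's candidate [artist, title] pairs against the Echo Nest and Freebase, a script that copies the matched files into data/clean_midi/mid/ and writes a file list, a notebook-style script that cleans up the raw 'Clean MIDIs' collection and builds the candidate artist/title dictionary, and the normalize_names module the first two scripts import. Roughly, the cleanup notebook appears to run first, followed by the resolution and copying scripts.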
# Resolve each MIDI file's candidate [artist, title] pairs against the Echo
# Nest and Freebase, and save the successful matches to a new pickle file.
import os
os.chdir('..')
import sys
sys.path.append(os.getcwd())
import normalize_names
import pickle

with open('data/Clean MIDIs-md5_to_artist_title.pickle') as f:
    md5_to_artist_title = pickle.load(f)
with open('data/Clean MIDIs-md5_to_path.pickle') as f:
    md5_to_path = pickle.load(f)

md5_to_freebase_artist_title = {}

for n, md5 in enumerate(md5_to_path):
    artists_titles = md5_to_artist_title[md5]
    artists = [artist_title[0] for artist_title in artists_titles]
    titles = [artist_title[1] for artist_title in artists_titles]
    for n, title in enumerate(titles):
        # Some titles have " l" appended to the end which trips up freebase
        if title[-2:] == ' l':
            titles[n] = title[:-2]
    print artists, titles
    resolved_artists = normalize_names.echonest_normalize_artist(artists)
    if resolved_artists is not None:
        resolved_artist, resolved_title = \
            normalize_names.freebase_normalize_title(resolved_artists, titles)
        if resolved_artist is not None and resolved_title is not None:
            md5_to_freebase_artist_title[md5] = [resolved_artist,
                                                 resolved_title]
            print '\t', resolved_artist, '-', resolved_title
    print

with open('data/Clean MIDIs-md5_to_freebase_artist_title.pickle', 'wb') as f:
    pickle.dump(md5_to_freebase_artist_title, f)
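For reference, here is a minimal sketch of the dictionary layouts the script above reads and writes; the hash and names are made-up placeholder values, not entries from the dataset.

# Hypothetical example values, for illustration only.
md5_to_artist_title = {
    'd41d8cd98f00b204e9800998ecf8427e': [['the beatles', 'yesterday'],
                                         ['beatles', 'yesterday 2']],
}
md5_to_path = {
    'd41d8cd98f00b204e9800998ecf8427e': 'Clean MIDIs/The Beatles/Yesterday.mid',
}
# What the script writes out: one resolved [artist, title] pair per md5.
md5_to_freebase_artist_title = {
    'd41d8cd98f00b204e9800998ecf8427e': ['The Beatles', 'Yesterday'],
}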
# Copy each resolved MIDI file into data/clean_midi/mid/<artist>/<title>.mid
# and write a tab-separated file list of the copies.
import os
os.chdir('..')
import sys
sys.path.append(os.getcwd())
import pickle
import csv
import shutil
import normalize_names


def safe_copy(old_path, new_path):
    '''
    Copies a file, but if the destination exists it appends a number.
    '''
    if not os.path.exists(new_path):
        shutil.copy(old_path, new_path)
    else:
        n = 1
        while os.path.exists((os.path.splitext(new_path)[0] +
                              '.{}.mid'.format(n))):
            n += 1
        new_path = os.path.splitext(new_path)[0] + '.{}.mid'.format(n)
        shutil.copy(old_path, new_path)
    return new_path


if not os.path.exists('data/clean_midi/mid'):
    os.makedirs('data/clean_midi/mid')

with open('data/Clean MIDIs-md5_to_freebase_artist_title.pickle') as f:
    md5_to_artist_title = pickle.load(f)
with open('data/Clean MIDIs-md5_to_path.pickle') as f:
    md5_to_path = pickle.load(f)

with open('file_lists/clean_midi.txt', 'wb') as f:
    writer = csv.writer(f, delimiter='\t')
    for n, (md5, artist_title) in enumerate(md5_to_artist_title.items()):
        artist = normalize_names.clean(artist_title[0]).replace('/', ' ')
        title = normalize_names.clean(artist_title[1]).replace('/', ' ')
        original_path = os.path.join('data', md5_to_path[md5])
        if not os.path.exists(original_path):
            print "{} not found".format(original_path)
            continue
        if not os.path.exists(os.path.join('data/clean_midi/mid', artist)):
            os.makedirs(os.path.join('data/clean_midi/mid', artist))
        # Truncate very long titles to keep the filename length manageable
        output_path = os.path.join('data/clean_midi/mid', artist,
                                   title[:247] + '.mid')
        output_path = safe_copy(original_path, output_path)
        writer.writerow([n, artist, title, md5,
                         output_path.replace('data/clean_midi/mid/', '')])
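A small sketch, mirroring the writerow call above, of how the resulting file list could be read back; the columns are tab-separated: index, artist, title, md5, and path relative to data/clean_midi/mid/.

import csv

# Read back the tab-separated file list written by the script above
with open('file_lists/clean_midi.txt') as f:
    for row in csv.reader(f, delimiter='\t'):
        index, artist, title, md5, midi_path = row
        print artist, '-', title, '->', midi_path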
# -*- coding: utf-8 -*-
# <nbformat>3.0</nbformat>

# <codecell>

# Notebook-style script for cleaning up the raw 'Clean MIDIs' collection:
# it normalizes filenames, removes junk and duplicate files, and builds the
# md5 -> [[artist, title]] candidate dictionary.
import os
import sys
import numpy as np
import hashlib
import pickle

# <codecell>

path = '../data/Clean MIDIs/'

# <codecell>

def split_all_extensions(f):
    '''
    Returns a filename with all extensions removed
    '''
    while os.path.splitext(f)[1] != '':
        f = os.path.splitext(f)[0]
    return f

# <codecell>

def safe_rename(old_path, new_path):
    '''
    Moves a file, but if the destination exists it appends a number to the
    filename.
    '''
    if not os.path.exists(new_path):
        os.renames(old_path, new_path)
    else:
        n = 1
        new_path = (split_all_extensions(new_path) +
                    os.path.splitext(new_path)[1])
        while os.path.exists(os.path.splitext(new_path)[0] +
                             '.{}.mid'.format(n)):
            n += 1
        new_path = os.path.splitext(new_path)[0] + '.{}.mid'.format(n)
        os.renames(old_path, new_path)

# <codecell>

def convert_camelCase(string):
    '''
    Replaces any camelCase with camel Case
    '''
    lowers = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
              'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']
    uppers = [s.upper() for s in lowers]
    camel_case_spots = np.flatnonzero(
        np.array([b in lowers and c in uppers
                  for b, c in zip(string[:-1], string[1:])]))
    if camel_case_spots.shape == (0,):
        return None
    shift = 1
    for n in camel_case_spots:
        string = string[:n + shift] + ' ' + string[n + shift:]
        shift += 1
    return string
# <codecell>

# Remove small and non-midi files, and rename .kar to .mid
for root, subdirectories, files in os.walk(path):
    for f in files:
        if os.path.splitext(f)[1].lower() == '.kar':
            os.rename(os.path.join(root, f),
                      os.path.join(root, os.path.splitext(f)[0] + '.mid'))
        elif os.path.splitext(f)[1].lower() != '.mid':
            os.remove(os.path.join(root, f))
        elif os.path.getsize(os.path.join(root, f)) < 2000:
            os.remove(os.path.join(root, f))

# <codecell>

# Flatten subdirectories
for root, subdirectories, files in os.walk(path):
    for f in files:
        if len(os.path.join(root, f).split('/')) > 5:
            new_path = '/'.join(os.path.join(root, f).split('/')[:4] + [f])
            safe_rename(os.path.join(root, f), new_path)

# <codecell>

# Remove empty subdirectories
for root, subdirectories, files in os.walk(path):
    for subdirectory in subdirectories:
        contents = os.listdir(os.path.join(root, subdirectory))
        if contents == [] or contents == ['.DS_Store']:
            # os.rmdir only removes empty directories, so delete a stray
            # .DS_Store first
            if contents == ['.DS_Store']:
                os.remove(os.path.join(root, subdirectory, '.DS_Store'))
            os.rmdir(os.path.join(root, subdirectory))
# <codecell>

# Remove duplicates
md5dict = {}
for root, subdirectories, files in os.walk(path):
    for f in files:
        md5 = hashlib.md5(open(os.path.join(root, f)).read())
        md5 = md5.hexdigest()
        if md5 in md5dict:
            os.remove(os.path.join(root, f))
            md5dict[md5] += [os.path.join(root, f)]
        else:
            md5dict[md5] = [os.path.join(root, f)]

# <codecell>

# Convert CamelCase to Camel Case in subdirectories
for root, subdirectories, files in os.walk(path):
    for subdirectory in subdirectories:
        if convert_camelCase(subdirectory) is not None:
            safe_rename(os.path.join(root, subdirectory),
                        os.path.join(root, convert_camelCase(subdirectory)))

# <codecell>

# Convert CamelCase to Camel Case in files
for root, subdirectories, files in os.walk(path):
    for f in files:
        if convert_camelCase(f) is not None:
            safe_rename(os.path.join(root, f),
                        os.path.join(root, convert_camelCase(f)))

# <codecell>

# Replace _ and - with space
for root, subdirectories, files in os.walk(path):
    for f in files:
        if f.find('_') > -1 or f.find('-') > -1:
            safe_rename(os.path.join(root, f),
                        os.path.join(root, f.replace('_', ' ').replace('-', ' ')))

# <codecell>

# Remove files which were just artist names (oops)
for root, subdirectories, files in os.walk(path):
    for f in files:
        if f[:4] == '.mid':
            os.remove(os.path.join(root, f))

# <codecell>

# Replace . with space
for root, subdirectories, files in os.walk(path):
    for f in files:
        title = os.path.splitext(f)[0]
        if title.find('.') > -1:
            safe_rename(os.path.join(root, f),
                        os.path.join(root, title.replace('.', ' ') + '.mid'))

# <codecell>

# Change duplicate numbering with space to period (yesterday 7.mid -> yesterday.7.mid)
for root, subdirectories, files in os.walk(path):
    for f in files:
        title = os.path.splitext(f)[0]
        while (len(title) > 2 and title[-2] == " " and
               title[-1] in [str(n) for n in xrange(10)]):
            title = title[:-2]
        if title != os.path.splitext(f)[0]:
            safe_rename(os.path.join(root, f),
                        os.path.join(root, title + '.mid'))

# <codecell>

# Flatten all directories
for root, subdirectories, files in os.walk(path):
    for f in files:
        if len(os.path.join(root, f).split('/')) > 4:
            start = '/'.join(os.path.join(root, f).split('/')[:2])
            end = '/'.join(os.path.join(root, f).split('/')[-2:])
            new_path = os.path.join(start, end)
            safe_rename(os.path.join(root, f), new_path)

# <codecell>

# Remove artist name from track title
for root, subdirectories, files in os.walk(path):
    for f in files:
        artist = os.path.split(root)[1]
        if artist in f:
            safe_rename(os.path.join(root, f),
                        os.path.join(root, f.replace(artist, '').lstrip()))
            f = f.replace(artist, '').lstrip()
        for word in artist.split(' '):
            if len(word) > 3 and word in f:
                safe_rename(os.path.join(root, f),
                            os.path.join(root, f.replace(word, '').lstrip()))
                f = f.replace(word, '').lstrip()

# <codecell>

# Strip spaces at the beginning and end of filenames
for root, subdirectories, files in os.walk(path):
    for f in files:
        new_f = os.path.splitext(f)[0].lstrip().rstrip() + os.path.splitext(f)[1]
        if new_f != f:
            safe_rename(os.path.join(root, f), os.path.join(root, new_f))

# <codecell>

# Remove multiple spaces
for root, subdirectories, files in os.walk(path):
    for f in files:
        new_f = f
        while new_f.find('  ') > -1:
            new_f = new_f.replace('  ', ' ')
        if new_f != f:
            safe_rename(os.path.join(root, f), os.path.join(root, new_f))
# <codecell>

def normalize_string(string):
    '''
    Make it lowercase and unicode
    '''
    return unicode(string.lower(), encoding='utf-8')

# <codecell>

# Make the md5->[[artist, title]] dict
md5_to_artist_title = {}
md5_to_paths = pickle.load(open('../data/Clean MIDIs-md5_to_paths.pickle'))
for root, subdirectories, files in os.walk(path):
    for f in files:
        if '.mid' not in f:
            continue
        md5 = hashlib.md5(open(os.path.join(root, f)).read())
        md5 = md5.hexdigest()
        title = split_all_extensions(f)
        artist = os.path.split(root)[1]
        title = normalize_string(title)
        artist = normalize_string(artist)
        md5_to_artist_title[md5] = [[artist, title]]
        for some_path in md5_to_paths[md5]:
            rem, title = os.path.split(some_path)
            title = os.path.splitext(title)[0]
            artist = os.path.split(rem)[1]
            if convert_camelCase(title) is not None:
                title = convert_camelCase(title)
            if convert_camelCase(artist) is not None:
                artist = convert_camelCase(artist)
            title = title.replace("_", " ").replace("-", " ")
            artist = artist.replace("_", " ").replace("-", " ")
            if (len(title) > 2 and title[-2] == " " and
                    title[-1] in [str(n) for n in xrange(10)]):
                title = title[:-2]
            if artist in title:
                title = title.replace(artist, "")
            for word in artist.split(' '):
                if len(word) > 3 and word in title:
                    title = title.replace(word, "")
            while title.find('  ') > -1:
                title = title.replace('  ', ' ')
            while artist.find('  ') > -1:
                artist = artist.replace('  ', ' ')
            artist = artist.lstrip().rstrip()
            title = title.lstrip().rstrip()
            title = normalize_string(title)
            artist = normalize_string(artist)
            if [artist, title] not in md5_to_artist_title[md5]:
                md5_to_artist_title[md5] += [[artist, title]]
# <codecell>

if __name__ == '__main__':
    import whoosh_search
    index = whoosh_search.get_whoosh_index('../data/cal500/index/')
    searcher = index.searcher()
    match_list = []
    for root, subdirectories, files in os.walk(path):
        for f in files:
            # Skip non-MIDI files
            if '.mid' not in f.lower():
                continue
            title = split_all_extensions(f)
            artist = os.path.split(root)[1]
            results = whoosh_search.search(searcher, index.schema, artist, title)
            for result in results:
                match_list += [[os.path.join(artist, f),
                                "{}-{}.mp3".format(result[1].replace(' ', '_'),
                                                   result[2].replace(' ', '_'))]]
    searcher.close()
    pickle.dump(match_list,
                open('../data/Clean MIDIs-path_to_cal500_path.pickle', 'w'))
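A quick illustration, not part of the original notebook, of what the filename helpers defined above do; the inputs are made-up examples.

# Strip every extension, not just the last one
print split_all_extensions('Yesterday.1.mid')  # -> 'Yesterday'
# Insert a space at each lowercase-to-uppercase transition
print convert_camelCase('BohemianRhapsody')    # -> 'Bohemian Rhapsody'
# Returns None when there is no camelCase to split
print convert_camelCase('already spaced')      # -> None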
# normalize_names module: utilities for resolving artist names via the Echo
# Nest API and song titles via the Freebase search API.
import json
import urllib
import unicodedata
import pyen
import collections

FREEBASE_KEY = open('.freebase_key').read()
ECHONEST_KEY = open('.echonest_key').read()
FREEBASE_URL = 'https://www.googleapis.com/freebase/v1/search?'


def clean(string):
    '''
    Removes non-ascii characters from a string in a semi-smart way

    :parameters:
        - string : str or unicode
            String to clean

    :returns:
        - clean_string : str
            ASCII string
    '''
    # unicodedata requires unicode type as input
    if type(string) == str:
        string = unicode(string, 'utf-8', 'ignore')
    # unicodedata tries to convert special characters to nearest ascii
    # encode converts to ascii, ignoring encoding errors
    return unicodedata.normalize('NFKD', string).encode('ascii', 'ignore')
def echonest_normalize_artist(artists):
    '''
    Normalize artist names using echonest

    :parameters:
        - artists : str or list of str
            Query artist name or list of potential artist names

    :returns:
        - artists : list of str
            Unique list of matching artists
    '''
    # Allow strings/unicode to be passed instead of list
    if type(artists) == str or type(artists) == unicode:
        artists = [artists]
    # Keep track of artists that echonest reports as matching
    matched_artists = []
    # pyen makes querying echonest easy
    en = pyen.Pyen(api_key=ECHONEST_KEY)
    for query_artist in artists:
        # Allow for http query failures
        success = False
        while not success:
            try:
                response = en.get('artist/search',
                                  name=clean(query_artist),
                                  results=5,
                                  fuzzy_match='true')
            # Skip any errors
            except pyen.PyenException as e:
                print e.message, e.args
                continue
            success = True
        # If any artists were found, add them to the list
        if len(response['artists']) > 0:
            for matched_artist in response['artists']:
                matched_artists.append(matched_artist['name'])
    # No matches = return None
    if len(matched_artists) == 0:
        return None
    # Get unique items from the list
    matched_artists = list(collections.OrderedDict.fromkeys(matched_artists))
    return matched_artists
def freebase_normalize_title(artists, titles):
    '''
    Normalize a song title using freebase

    :parameters:
        - artists : str or list of str
            Query artist name or list of potential artist names
        - titles : str or list of str
            Query title or list of potential song titles

    :returns:
        - artist : str or NoneType
            Freebase's chosen artist from the supplied `artists` list
            or None if no match
        - title : str or NoneType
            Freebase's purported title or None if no match
    '''
    def title_match(artist, title, old_correction=False):
        ''' Match a song title with some artist using freebase '''
        # Ask freebase for music recordings with the supplied artist
        filter_str = '(all type:/music/recording /music/recording/artist:"{}")'
        params = {'query': clean(title),
                  # Remove quotes, they mess up the query
                  'filter': filter_str.format(clean(artist).replace('"', '')),
                  # Only return one match
                  'limit': 1,
                  'key': FREEBASE_KEY,
                  # Allow for spelling mistakes
                  'spell': 'always'}
        url = FREEBASE_URL + urllib.urlencode(params)
        # Continually try http queries until a successful one
        success = False
        while not success:
            try:
                response = json.loads(urllib.urlopen(url).read())
            except Exception as e:
                print e.message, e.args
                continue
            # A successful query should always have a 'result' key
            if 'result' in response:
                success = True
            else:
                print 'result not in response: {}'.format(response)
        # Given a result, get the name
        if len(response['result']) > 0:
            return response['result'][0]['name']
        # For spelling corrections, re-try the query with the correction
        if 'correction' in response:
            # But only do it once
            if old_correction:
                return None
            else:
                return title_match(artist,
                                   response['correction'][0],
                                   True)
        return None
    # Allow for string args
    if type(artists) == str or type(artists) == unicode:
        artists = [artists]
    if type(titles) == str or type(titles) == unicode:
        titles = [titles]
    # Try all combinations of supplied artists and titles
    for query_artist in artists:
        for query_title in titles:
            title = title_match(query_artist, query_title)
            if title is not None:
                return query_artist, title
    return None, None
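A minimal usage sketch for the module above, assuming it is saved as normalize_names.py (it is imported under that name by the earlier scripts) and that valid .echonest_key and .freebase_key files exist in the working directory; the artist and title strings are made-up examples.

import normalize_names

# Gather candidate canonical artist names from the Echo Nest
candidate_artists = normalize_names.echonest_normalize_artist(['the beatles'])
if candidate_artists is not None:
    # Ask Freebase which artist/title combination matches a known recording
    artist, title = normalize_names.freebase_normalize_title(candidate_artists,
                                                             ['yesterday'])
    if artist is not None and title is not None:
        print artist, '-', title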