Created
September 13, 2012 07:29
-
-
Save mikemccabe/3712588 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import json | |
import urllib | |
import collections | |
def get_iterable(x):
    """Return *x* as something safe to loop over element by element.

    Tuples and lists are returned unchanged (the same object).  Every other
    value, including a plain string, is wrapped in a one-element tuple so
    that a string is treated as a single value instead of being iterated
    character by character.
    """
    if isinstance(x, (tuple, list)):
        return x
    return (x,)
def iter_contains_prefix(iter, prefix):
    """Return True if at least one string in *iter* starts with *prefix*.

    Iteration stops at the first match.  An empty *iter* gives False.
    """
    return any(candidate.startswith(prefix) for candidate in iter)
def get_url(url):
    """Fetch *url* and return the raw response body.

    The response is closed even if reading it raises.  On Python 3 the body
    is returned as bytes; on Python 2 it is a str.

    Raises whatever the underlying ``urlopen`` / ``read`` call raises
    (e.g. IOError / URLError on network failure).
    """
    from contextlib import closing
    try:
        from urllib.request import urlopen  # Python 3
    except ImportError:
        from urllib import urlopen  # Python 2

    # closing() is used because the Python 2 response object is not a
    # context manager on its own.
    with closing(urlopen(url)) as response:
        return response.read()
def get_meta(item_iter):
    """Yield the decoded metadata record for each identifier in *item_iter*.

    For every identifier the JSON document at
    ``http://archive.org/metadata/<identifier>`` is fetched with ``get_url``
    and parsed.  Records are produced lazily, one fetch per item consumed.
    A running count is printed to stdout after every 100th item as a
    progress indicator.
    """
    for count, item in enumerate(item_iter, 1):
        if count % 100 == 0:
            # Progress marker; printed before the 100th, 200th, ... fetch.
            print(count)
        raw = get_url("http://archive.org/metadata/" + item)
        yield json.loads(raw)
def item_iterator(filename='items'):
    """Yield item identifiers read from *filename*, one per line.

    Each line is stripped of surrounding whitespace.  Lines that are empty
    after stripping are skipped, since an empty identifier cannot name an
    item.  The file is closed when the generator is exhausted or discarded.

    filename -- path of the identifier list; defaults to ``'items'`` in the
                current working directory.
    """
    with open(filename) as source:
        for line in source:
            identifier = line.strip()
            if identifier:
                yield identifier
# Lazy pipeline: `ies` produces item identifiers, `ems` turns each one into
# its parsed metadata record.  Nothing is read or fetched until `ems` is
# iterated.
ies = item_iterator()
ems = get_meta(item_iter=ies)
# Values of a file entry's 'format' field that are treated as audio.
_AUDIO_FORMATS = ('VBR MP3', 'AIFF', '24bit Flac', 'Flac',
                  'Apple Lossless Audio', 'Advanced Audio Coding')


def file_has_audio(file):
    """Return True if *file* is an original file in a known audio format.

    *file* is a mapping describing one file of an item.  It qualifies when
    its 'source' is 'original' and its 'format' is one of _AUDIO_FORMATS.
    A missing 'source' or 'format' key makes the result False rather than
    raising.
    """
    return (file.get('source') == 'original'
            and file.get('format') in _AUDIO_FORMATS)
def _item_has_acoustid(md):
    """Return True if any original audio file of *md* has an acoustid URN.

    Only files accepted by file_has_audio() are inspected.  A record with no
    'files' entry is treated as having no audio files.
    """
    return any(
        iter_contains_prefix(
            get_iterable(audio_file.get('external-identifier', [])),
            'urn:acoustid')
        for audio_file in md.get('files', [])
        if file_has_audio(audio_file)
    )


def _item_has_mb_release(md):
    """Return True if the item itself carries an mb_release URN."""
    # NOTE(review): the original looked only at a top-level
    # 'external_identifier' key.  File entries use the hyphenated
    # 'external-identifier', and the item's own fields (e.g. 'identifier')
    # are read from md['metadata'], so that location is checked first;
    # the original lookup is kept as a fallback.  Confirm against the
    # metadata API's response layout.
    mdexts = md.get('metadata', {}).get('external-identifier')
    if mdexts is None:
        mdexts = md.get('external_identifier', [])
    return iter_contains_prefix(get_iterable(mdexts), 'urn:mb_release')


# Report every item that has an acoustid on at least one audio file but no
# mb_release identifier.  The acoustid flag is computed afresh for each
# item, so it can neither be undefined nor carried over from a previous
# item when an item has no audio files.
for md in ems:
    if _item_has_acoustid(md) and not _item_has_mb_release(md):
        print(md['metadata']['identifier'] + u' has accoustid but no mb_release')
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment