-
-
Save jjjake/3764671 to your computer and use it in GitHub Desktop.
Check if an item has an AcoustID or MusicBrainz release: ./check_for_acoustid.py {item}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
""" Check if an item on archive.org has an acoustid. | |
Usage: | |
./check_for_acoustid.py {item} | |
Usage with GNU Parallel: | |
cat itemlist.txt | parallel --max-procs=8 --group './check_for_acoustid.py {}' | |
""" | |
import sys | |
import json | |
import urllib | |
def get_iterable(x):
    """Return *x* unchanged if it is already a tuple or list; otherwise
    wrap it in a 1-tuple so callers can always iterate over it."""
    if isinstance(x, (tuple, list)):
        return x
    return (x,)
def iter_contains_prefix(iter, prefix):
    """Return True if any string in *iter* starts with *prefix*.

    Replaces the hand-rolled loop with the builtin ``any``.  The
    parameter name shadows the builtin ``iter``; it is kept unchanged
    for backward compatibility with any keyword callers.
    """
    return any(s.startswith(prefix) for s in iter)
def get_url(url):
    """Fetch *url* (Python 2 urllib) and return the response body.

    Fix: the original closed the handle only on the success path; a
    ``read()`` failure leaked the connection.  try/finally guarantees
    the close.
    """
    f = urllib.urlopen(url)
    try:
        return f.read()
    finally:
        f.close()
def get_meta(item):
    """Fetch the archive.org /metadata document for *item* and yield the
    decoded JSON dict (a single-element generator)."""
    raw = get_url("http://archive.org/metadata/" + item)
    yield json.loads(raw)
def item_iterator():
    """Yield whitespace-stripped item identifiers, one per line, from the
    local 'items' file.

    Fixes: the original assigned an unused ``filename`` local and never
    closed the file handle; ``with`` closes it when the generator is
    exhausted or discarded.
    """
    with open('items') as f:
        for line in f:
            yield line.strip()
def file_has_audio(file):
    """Return True if *file* (one entry of the metadata 'files' list) is
    an original upload in a recognized audio format.

    Fixes: the original indexed ``file['format']`` directly, raising
    KeyError for entries with no format key, and returned an implicit
    None in the false case.  ``.get()`` rejects key-less entries and the
    expression always yields a real bool (truthiness is unchanged for
    existing callers).
    """
    return (file.get('source') == 'original' and
            file.get('format') in ('VBR MP3', 'AIFF', '24bit Flac', 'Flac',
                                   'Apple Lossless Audio',
                                   'Advanced Audio Coding'))
item = sys.argv[1] | |
for metadata in get_meta(item): | |
files = metadata.get('files') | |
if not files: | |
continue | |
for file in files: | |
if file_has_audio(file): | |
fexts = file.get('external-identifier', []) | |
found_accoustid = iter_contains_prefix(get_iterable(fexts), | |
'urn:acoustid') | |
if not found_accoustid: | |
print metadata['metadata']['identifier'] | |
break |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# This demonstrates doing multiple metadata fetches in parallel.
# It seems to be fast enough that the json decoding cost becomes
# a significant proportion of the execution time.
# It requires gevent; see http://www.gevent.org/intro.html#installation
# To make this do something useful, modify do_work().
import requests
import gevent
from gevent import monkey
# Patch blocking stdlib calls (sockets etc.) so requests cooperates with
# gevent's greenlets instead of blocking the whole process.
monkey.patch_all()
from gevent import queue as g_queue
# Bounded joinable queue of (id, index) work items: put() blocks once
# 1000 entries are pending, giving backpressure against the producer.
input_queue = g_queue.JoinableQueue(1000)
def queue_ids(ids, start_index=0, count=0):
    """Enqueue (stripped_id, original_index) pairs from *ids* onto
    input_queue.

    Entries before *start_index* are skipped; at most *count* entries
    are enqueued, with count == 0 meaning "no limit".

    Bug fixed: the original set ``limit_index = start_index + count``
    and treated any nonzero value as a limit, so calling with
    start_index > 0 and the default count == 0 broke out of the loop
    immediately and enqueued nothing.
    """
    limit_index = start_index + count if count else None
    for i, id in enumerate(ids):
        if i < start_index:
            continue
        if limit_index is not None and i >= limit_index:
            break
        input_queue.put((id.strip(), i))
def worker():
    """Greenlet body: pull (id, index) pairs off input_queue forever.

    task_done() runs in a finally block so input_queue.join() can
    complete even when do_work raises.
    """
    while True:
        item_id, index = input_queue.get()
        try:
            do_work(item_id, index)
        finally:
            input_queue.task_done()
# Do work -> | |
#______________________________________________________________________________ | |
def get_iterable(x):
    """Return *x* unchanged if it is already a tuple or list; otherwise
    wrap it in a 1-tuple so callers can always iterate over it."""
    if isinstance(x, (tuple, list)):
        return x
    return (x,)
def iter_contains_prefix(iter, prefix):
    """Return True if any string in *iter* starts with *prefix*.

    Replaces the hand-rolled loop with the builtin ``any``.  The
    parameter name shadows the builtin ``iter``; it is kept unchanged
    for backward compatibility with any keyword callers.
    """
    return any(s.startswith(prefix) for s in iter)
def get_meta(item):
    """Fetch the archive.org /metadata document for *item* and yield the
    decoded JSON dict (a single-element generator).

    Fix: ``Response.json`` was an attribute/property only in
    pre-1.0 requests; on every modern requests release it is a method,
    so the original ``yield r.json`` handed callers the bound method
    object instead of the parsed document, and the subsequent
    ``metadata.get('files')`` failed.
    """
    r = requests.get("http://archive.org/metadata/" + item)
    yield r.json()
def item_iterator():
    """Yield whitespace-stripped item identifiers, one per line, from the
    local 'items' file.

    Fixes: the original assigned an unused ``filename`` local and never
    closed the file handle; ``with`` closes it when the generator is
    exhausted or discarded.
    """
    with open('items') as f:
        for line in f:
            yield line.strip()
def file_has_audio(file):
    """Return True if *file* (one entry of the metadata 'files' list) is
    an original upload in a recognized audio format.

    Fixes: the original indexed ``file['format']`` directly, raising
    KeyError for entries with no format key, and returned an implicit
    None in the false case.  ``.get()`` rejects key-less entries and the
    expression always yields a real bool (truthiness is unchanged for
    existing callers).
    """
    return (file.get('source') == 'original' and
            file.get('format') in ('VBR MP3', 'AIFF', '24bit Flac', 'Flac',
                                   'Apple Lossless Audio',
                                   'Advanced Audio Coding'))
def do_work(item, i): | |
for metadata in get_meta(item): | |
files = metadata.get('files') | |
if not files: | |
continue | |
for file in files: | |
if file_has_audio(file): | |
fexts = file.get('external-identifier', []) | |
found_accoustid = iter_contains_prefix(get_iterable(fexts), | |
'urn:acoustid') | |
if not found_accoustid: | |
print metadata['metadata']['identifier'] | |
break | |
#______________________________________________________________________________
# Spawn a fixed pool of worker greenlets; each blocks on input_queue.get()
# until the producer below enqueues ids.
for i in range(20): # 20 seems like a reasonable number - don't go nuts!
    gevent.spawn(worker)
#queue_ids(open('ids.txt'), 0, 1000); # just do 1000 lines, starting with 1rst
# NOTE(review): the file handle passed to queue_ids is never closed.
queue_ids(open('ids.txt'))
# Block until every queued id has been task_done()'d by a worker.
input_queue.join()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment