Skip to content

Instantly share code, notes, and snippets.

@jjjake
Forked from mikemccabe/gist:3712588
Created September 22, 2012 00:37
Show Gist options
  • Save jjjake/3764671 to your computer and use it in GitHub Desktop.
Save jjjake/3764671 to your computer and use it in GitHub Desktop.
Check if an item has an acoustid or MusicBrainz release: ./check_for_acoustid.py {item}
#!/usr/bin/env python
""" Check if an item on archive.org has an acoustid.
Usage:
./check_for_acoustid.py {item}
Usage with GNU Parallel:
cat itemlist.txt | parallel --max-procs=8 --group './check_for_acoustid.py {}'
"""
import sys
import json
import urllib
def get_iterable(x):
    """Wrap *x* in a 1-tuple unless it is already a tuple or list."""
    if isinstance(x, (tuple, list)):
        return x
    return (x,)
def iter_contains_prefix(iter, prefix):
    """Return True if any string in *iter* starts with *prefix*, else False."""
    return any(s.startswith(prefix) for s in iter)
def get_url(url):
    """Fetch *url* (Python 2 urllib) and return the raw response body."""
    handle = urllib.urlopen(url)
    try:
        # Read the full body; the handle is closed even if read() raises.
        return handle.read()
    finally:
        handle.close()
def get_meta(item):
    """Yield the decoded archive.org metadata document for *item*."""
    raw = get_url("http://archive.org/metadata/" + item)
    yield json.loads(raw)
def item_iterator():
    """Yield archive.org identifiers, one per line of the local 'items' file.

    Each line is stripped of surrounding whitespace (including the newline).

    Fixes: the original opened the hard-coded string "items" while ignoring
    the `filename` variable it had just defined, and never closed the file
    handle; a `with` block guarantees closure.
    """
    filename = 'items'
    with open(filename) as f:
        for line in f:
            yield line.strip()
def file_has_audio(file):
    """Return True if *file* (an archive.org file-metadata dict) is an
    original upload in a recognized audio format, else False.

    Fixes: the original returned None implicitly on the negative path
    (explicit False keeps the same truthiness contract), and indexed
    file['format'] directly, raising KeyError for entries with no
    'format' key; .get() makes those entries simply non-audio.
    """
    return (file.get('source') == 'original' and
            file.get('format') in ('VBR MP3', 'AIFF', '24bit Flac', 'Flac',
                                   'Apple Lossless Audio',
                                   'Advanced Audio Coding'))
# Script entry point: the item identifier is the sole command-line argument.
item = sys.argv[1]
# get_meta yields exactly one decoded metadata document per item.
for metadata in get_meta(item):
    files = metadata.get('files')
    if not files:
        # No file listing in the metadata; nothing to inspect.
        continue
    for file in files:
        if file_has_audio(file):
            # 'external-identifier' may be a single string or a list;
            # get_iterable() below normalizes both cases.
            fexts = file.get('external-identifier', [])
            found_accoustid = iter_contains_prefix(get_iterable(fexts),
                                                   'urn:acoustid')
            if not found_accoustid:
                # Python 2 print statement: emit identifiers that LACK an acoustid.
                print metadata['metadata']['identifier']
            # Only the first original audio file is examined per item.
            break
# This demonstrates doing multiple metadata fetches in parallel.
# It seems to be fast enough that the json decoding cost becomes
# a significant proportion of the execution time.
# It requires gevent; see http://www.gevent.org/intro.html#installation
# To make this do something useful, modify do_work().
# Third-party HTTP client used by the parallel variant below.
import requests
import gevent
from gevent import monkey
# Patch blocking stdlib primitives so greenlets overlap on network I/O.
monkey.patch_all()
from gevent import queue as g_queue

# Bounded work queue shared between the producer (queue_ids) and the worker
# greenlets; put() blocks once 1000 entries are pending, throttling the reader.
input_queue = g_queue.JoinableQueue(1000)
def queue_ids(ids, start_index=0, count=0):
    """Enqueue (stripped_id, index) pairs from *ids* onto input_queue.

    Entries before start_index are skipped; when count is non-zero, at
    most entries with index below start_index + count are queued.
    """
    stop = start_index + count
    for index, raw_id in enumerate(ids):
        if index < start_index:
            continue
        # stop == 0 means "no limit" (only possible when both args are 0).
        if stop != 0 and index >= stop:
            break
        input_queue.put((raw_id.strip(), index))
def worker():
    """Greenlet body: drain input_queue forever, one (id, index) at a time.

    task_done() runs even when do_work raises, so input_queue.join()
    cannot stall on a failed item.
    """
    while True:
        item_id, index = input_queue.get()
        try:
            do_work(item_id, index)
        finally:
            input_queue.task_done()
# Do work ->
#______________________________________________________________________________
def get_iterable(x):
    """Return *x* unchanged when it is a tuple or list; otherwise a 1-tuple."""
    return x if isinstance(x, (tuple, list)) else (x,)
def iter_contains_prefix(iter, prefix):
    """Return True if any string in *iter* starts with *prefix*, else False."""
    for candidate in iter:
        if candidate.startswith(prefix):
            return True
    return False
def get_meta(item):
    """Yield the decoded archive.org metadata document for *item*.

    Fix: in requests >= 1.0, Response.json is a *method*; the original
    `yield r.json` handed callers the bound method object, so
    metadata.get('files') in do_work() would fail. Calling it yields
    the parsed dict. (Pre-1.0 requests, contemporary with this gist,
    exposed it as a property — hence the original spelling.)
    """
    r = requests.get("http://archive.org/metadata/" + item)
    yield r.json()
def item_iterator():
    """Yield archive.org identifiers, one per line of the local 'items' file.

    Each line is stripped of surrounding whitespace (including the newline).

    Fixes: the original opened the hard-coded string "items" while ignoring
    the `filename` variable it had just defined, and never closed the file
    handle; a `with` block guarantees closure.
    """
    filename = 'items'
    with open(filename) as f:
        for line in f:
            yield line.strip()
def file_has_audio(file):
    """Return True if *file* (an archive.org file-metadata dict) is an
    original upload in a recognized audio format, else False.

    Fixes: the original returned None implicitly on the negative path
    (explicit False keeps the same truthiness contract), and indexed
    file['format'] directly, raising KeyError for entries with no
    'format' key; .get() makes those entries simply non-audio.
    """
    return (file.get('source') == 'original' and
            file.get('format') in ('VBR MP3', 'AIFF', '24bit Flac', 'Flac',
                                   'Apple Lossless Audio',
                                   'Advanced Audio Coding'))
def do_work(item, i):
for metadata in get_meta(item):
files = metadata.get('files')
if not files:
continue
for file in files:
if file_has_audio(file):
fexts = file.get('external-identifier', [])
found_accoustid = iter_contains_prefix(get_iterable(fexts),
'urn:acoustid')
if not found_accoustid:
print metadata['metadata']['identifier']
break
#______________________________________________________________________________
# Driver: spawn a fixed pool of worker greenlets, feed every id from
# ids.txt through the shared queue, then wait for all work to complete.
for i in range(20):  # 20 seems like a reasonable number - don't go nuts!
    gevent.spawn(worker)

#queue_ids(open('ids.txt'), 0, 1000); # just do 1000 lines, starting with the first
queue_ids(open('ids.txt'))
# Block until every queued (id, index) pair has been task_done()'d.
input_queue.join()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment