Skip to content

Instantly share code, notes, and snippets.

@jjjake
Forked from mikemccabe/gist:3712588
Created September 22, 2012 00:37
Show Gist options
  • Save jjjake/3764671 to your computer and use it in GitHub Desktop.
Save jjjake/3764671 to your computer and use it in GitHub Desktop.
Check if an item has an acoustid or MusicBrainz release: ./check_for_acoustid.py {item}
#!/usr/bin/env python
""" Check if an item on archive.org has an acoustid.
Usage:
./check_for_acoustid.py {item}
Usage with GNU Parallel:
cat itemlist.txt | parallel --max-procs=8 --group './check_for_acoustid.py {}'
"""
import sys
import json
import urllib
def get_iterable(x):
    """Wrap *x* in a 1-tuple unless it is already a tuple or list."""
    if isinstance(x, (tuple, list)):
        return x
    return (x,)
def iter_contains_prefix(iter, prefix):
    """Return True if any string in *iter* starts with *prefix*, else False."""
    return any(s.startswith(prefix) for s in iter)
def get_url(url):
    """Fetch *url* (Python 2 urllib) and return the raw response body."""
    handle = urllib.urlopen(url)
    try:
        # Read the full body; the handle is closed even if read() raises.
        return handle.read()
    finally:
        handle.close()
def get_meta(item):
    """Yield the decoded archive.org metadata document for *item*."""
    raw = get_url("http://archive.org/metadata/" + item)
    yield json.loads(raw)
def item_iterator():
    """Yield archive.org identifiers, one per line of the local 'items' file.

    Each line is stripped of surrounding whitespace (including the newline).

    Fixes: the original opened the hard-coded string "items" while ignoring
    the `filename` variable it had just defined, and never closed the file
    handle; a `with` block guarantees closure.
    """
    filename = 'items'
    with open(filename) as f:
        for line in f:
            yield line.strip()
def file_has_audio(file):
    """Return True if *file* (an archive.org file-metadata dict) is an
    original upload in a recognized audio format, else False.

    Fixes: the original returned None implicitly on the negative path
    (explicit False keeps the same truthiness contract), and indexed
    file['format'] directly, raising KeyError for entries with no
    'format' key; .get() makes those entries simply non-audio.
    """
    return (file.get('source') == 'original' and
            file.get('format') in ('VBR MP3', 'AIFF', '24bit Flac', 'Flac',
                                   'Apple Lossless Audio',
                                   'Advanced Audio Coding'))
# Script entry point: the item identifier is the sole command-line argument.
item = sys.argv[1]
# get_meta yields exactly one decoded metadata document per item.
for metadata in get_meta(item):
    files = metadata.get('files')
    if not files:
        # No file listing in the metadata; nothing to inspect.
        continue
    for file in files:
        if file_has_audio(file):
            # 'external-identifier' may be a single string or a list;
            # get_iterable() below normalizes both cases.
            fexts = file.get('external-identifier', [])
            found_accoustid = iter_contains_prefix(get_iterable(fexts),
                                                   'urn:acoustid')
            if not found_accoustid:
                # Python 2 print statement: emit identifiers that LACK an acoustid.
                print metadata['metadata']['identifier']
            # Only the first original audio file is examined per item.
            break
# This demonstrates doing multiple metadata fetches in parallel.
# It seems to be fast enough that the json decoding cost becomes
# a significant proportion of the execution time.
# It requires gevent; see http://www.gevent.org/intro.html#installation
# To make this do something useful, modify do_work().
# Third-party HTTP client used by the parallel variant below.
import requests
import gevent
from gevent import monkey
# Patch blocking stdlib primitives so greenlets overlap on network I/O.
monkey.patch_all()
from gevent import queue as g_queue

# Bounded work queue shared between the producer (queue_ids) and the worker
# greenlets; put() blocks once 1000 entries are pending, throttling the reader.
input_queue = g_queue.JoinableQueue(1000)
def queue_ids(ids, start_index=0, count=0):
    """Enqueue (stripped_id, index) pairs from *ids* onto input_queue.

    Entries before start_index are skipped; when count is non-zero, at
    most entries with index below start_index + count are queued.
    """
    stop = start_index + count
    for index, raw_id in enumerate(ids):
        if index < start_index:
            continue
        # stop == 0 means "no limit" (only possible when both args are 0).
        if stop != 0 and index >= stop:
            break
        input_queue.put((raw_id.strip(), index))
def worker():
    """Greenlet body: drain input_queue forever, one (id, index) at a time.

    task_done() runs even when do_work raises, so input_queue.join()
    cannot stall on a failed item.
    """
    while True:
        item_id, index = input_queue.get()
        try:
            do_work(item_id, index)
        finally:
            input_queue.task_done()
# Do work ->
#______________________________________________________________________________
def get_iterable(x):
    """Return *x* unchanged when it is a tuple or list; otherwise a 1-tuple."""
    return x if isinstance(x, (tuple, list)) else (x,)
def iter_contains_prefix(iter, prefix):
    """Return True if any string in *iter* starts with *prefix*, else False."""
    for candidate in iter:
        if candidate.startswith(prefix):
            return True
    return False
def get_meta(item):
    """Yield the decoded archive.org metadata document for *item*.

    Fix: in requests >= 1.0, Response.json is a *method*; the original
    `yield r.json` handed callers the bound method object, so
    metadata.get('files') in do_work() would fail. Calling it yields
    the parsed dict. (Pre-1.0 requests, contemporary with this gist,
    exposed it as a property — hence the original spelling.)
    """
    r = requests.get("http://archive.org/metadata/" + item)
    yield r.json()
def item_iterator():
    """Yield archive.org identifiers, one per line of the local 'items' file.

    Each line is stripped of surrounding whitespace (including the newline).

    Fixes: the original opened the hard-coded string "items" while ignoring
    the `filename` variable it had just defined, and never closed the file
    handle; a `with` block guarantees closure.
    """
    filename = 'items'
    with open(filename) as f:
        for line in f:
            yield line.strip()
def file_has_audio(file):
    """Return True if *file* (an archive.org file-metadata dict) is an
    original upload in a recognized audio format, else False.

    Fixes: the original returned None implicitly on the negative path
    (explicit False keeps the same truthiness contract), and indexed
    file['format'] directly, raising KeyError for entries with no
    'format' key; .get() makes those entries simply non-audio.
    """
    return (file.get('source') == 'original' and
            file.get('format') in ('VBR MP3', 'AIFF', '24bit Flac', 'Flac',
                                   'Apple Lossless Audio',
                                   'Advanced Audio Coding'))
def do_work(item, i):
for metadata in get_meta(item):
files = metadata.get('files')
if not files:
continue
for file in files:
if file_has_audio(file):
fexts = file.get('external-identifier', [])
found_accoustid = iter_contains_prefix(get_iterable(fexts),
'urn:acoustid')
if not found_accoustid:
print metadata['metadata']['identifier']
break
#______________________________________________________________________________
# Driver: spawn a fixed pool of worker greenlets, feed every id from
# ids.txt through the shared queue, then wait for all work to complete.
for i in range(20):  # 20 seems like a reasonable number - don't go nuts!
    gevent.spawn(worker)

#queue_ids(open('ids.txt'), 0, 1000); # just do 1000 lines, starting with the first
queue_ids(open('ids.txt'))
# Block until every queued (id, index) pair has been task_done()'d.
input_queue.join()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment