Skip to content

Instantly share code, notes, and snippets.

@jjjake
Created October 19, 2012 03:42
Show Gist options
  • Save jjjake/3916104 to your computer and use it in GitHub Desktop.
Save jjjake/3916104 to your computer and use it in GitHub Desktop.
#!/usr/bin/env python
#
# Given a file listing archive.org item identifiers, record every item in
# which every original audio file has an "acoustid" but at least one of them
# lacks a "mb_recording_id".
#
import logging
import os
import sys
from datetime import datetime

import ujson

# parallel_md_get.py available here: https://gist.github.com/3784845
from parallel_md_get import metadata_record_iterator
## Logging!
# UTC date stamp; used in the log file name here and in the results file
# name further down.
date = datetime.utcnow().strftime("%Y-%m-%d")
# Build the log name from the script's base name without its extension.
# str.strip('.py') is not a suffix removal: it strips any leading/trailing
# '.', 'p' or 'y' characters, and it would also keep directory components
# of __file__ in the name.
script_name = os.path.splitext(os.path.basename(__file__))[0]
log_filename = "logs/%s-%s.log" % (script_name, date)
logging_format = "%(asctime)s\t%(levelname)s\t%(message)s"
# NOTE: the "logs/" directory must already exist; it is not created here.
logging.basicConfig(filename=log_filename, level=logging.INFO,
                    format=logging_format)
#______________________________________________________________________________
def get_iterable(x):
    """Return *x* unchanged if it is a tuple or list, else wrap it in a 1-tuple.

    Lets callers iterate uniformly over a metadata field that may hold either
    a single value or a sequence of values.
    """
    if isinstance(x, (tuple, list)):
        return x
    return (x,)
def iter_contains_prefix(iter, prefix):
    """Return True if any string in *iter* starts with *prefix*.

    Strings containing the substring 'unknown' are ignored, so they never
    count as a match. Returns False for an empty iterable.
    """
    return any(
        item.startswith(prefix) and 'unknown' not in item
        for item in iter
    )
#______________________________________________________________________________
def file_has_audio(file):
    """Return True if *file* is an original file in a recognised audio format.

    *file* is a dict-like file record. It counts as audio only when its
    'source' is 'original' and its 'format' is one of the formats listed
    below. A record with no 'source' or no 'format' key is treated as
    non-audio rather than raising KeyError, and the result is always a bool.
    """
    audio_formats = ('VBR MP3', 'AIFF', '24bit Flac', 'Flac',
                     'Apple Lossless Audio', 'Advanced Audio Coding')
    return (file.get('source') == 'original' and
            file.get('format') in audio_formats)
def all_files_have_extrnal_id(files, external_id):
    """Return True if every audio file in *files* has the given external id.

    A file is considered only when file_has_audio() accepts it. Such a file
    passes when one of its 'external-identifier' values starts with
    'urn:' + external_id and does not contain 'unknown' (see
    iter_contains_prefix). The result is True when *files* contains no audio
    files at all.
    """
    return all(
        iter_contains_prefix(
            # The field may hold a single string or a list of strings.
            get_iterable(entry.get('external-identifier', [])),
            'urn:' + external_id,
        )
        for entry in files
        if file_has_audio(entry)
    )
#______________________________________________________________________________
# Items that pass the filter are appended to this file. The name is kept
# as-is even though the listed items DO have an acoustid for every audio
# file (what they lack is a complete set of mb_recording_ids).
output_filename = 'db/items_dont_have_acoustid-%s.txt' % date

# sys.argv[1] is the path to the identifiers file; the context manager makes
# sure it is closed once every record has been processed.
with open(sys.argv[1]) as ids:
    # Each result is unpacked as (i, identifier, raw JSON metadata) --
    # presumably an index plus the item id; confirm against parallel_md_get.
    results = metadata_record_iterator(ids, workers=20)
    for i, item_id, md_json in results:
        try:
            metadata = ujson.loads(md_json)
            files = metadata.get('files')
            logging.info("retrieved and parsed metadata:\t%s", item_id)
            if not files:
                logging.warning("item has no files!\t%s", item_id)
                continue
            has_all_acoustids = all_files_have_extrnal_id(
                files, external_id='acoustid')
            if has_all_acoustids and \
               not all_files_have_extrnal_id(files, 'mb_recording_id'):
                # Re-opened in append mode per match so results reach disk
                # as the run progresses.
                with open(output_filename, 'a') as f:
                    f.write("%s\n" % item_id)
        except Exception as e:
            # Best-effort per item: log the failure with the item id and
            # carry on with the next record instead of aborting the run.
            logging.error("%s\t%s", e, item_id)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment