Created
October 19, 2012 03:42
-
-
Save jjjake/3916104 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python
#
# Provided a list of identifiers for items on archive.org, return all items
# that have an "acoustid" for every original audio file, but NOT a
# "mb_recording_id".
#
import logging
import os
import sys
from datetime import datetime

import ujson

# parallel_md_get.py available here: https://gist.github.com/3784845
from parallel_md_get import metadata_record_iterator
## Logging!
# One log file per script per UTC day, written under ./logs/.
date = datetime.utcnow().strftime("%Y-%m-%d")
# str.strip('.py') removes any leading/trailing '.', 'p' or 'y' characters
# (not the ".py" suffix) and keeps directory components of __file__, so take
# the stem of the basename instead.
script_name = os.path.splitext(os.path.basename(__file__))[0]
log_filename = "logs/%s-%s.log" % (script_name, date)
# basicConfig(filename=...) opens the log file right away, so make sure the
# target directory exists first.
if not os.path.isdir("logs"):
    os.makedirs("logs")
logging_format = "%(asctime)s\t%(levelname)s\t%(message)s"
logging.basicConfig(filename=log_filename, level=logging.INFO,
                    format=logging_format)
#______________________________________________________________________________ | |
def get_iterable(x):
    """Return ``x`` as-is if it is a tuple or list, else wrap it in a 1-tuple.

    This lets a caller loop over a value that may be either a single item
    or a sequence of items. Anything that is not a tuple or list (including
    strings and ``None``) is treated as a single item.
    """
    if isinstance(x, (tuple, list)):
        return x
    return (x,)
def iter_contains_prefix(iter, prefix):
    """Return True if any string in ``iter`` starts with ``prefix``.

    Strings containing the substring ``'unknown'`` are never counted as a
    match, even when they start with ``prefix``. An empty iterable yields
    False.
    """
    return any(
        candidate.startswith(prefix) and 'unknown' not in candidate
        for candidate in iter
    )
#______________________________________________________________________________ | |
def file_has_audio(file):
    """Return True if ``file`` is an original file in a recognised audio format.

    ``file`` is a dict describing one file. It counts as audio only when its
    ``source`` is ``'original'`` and its ``format`` is one of the formats
    listed below; files with any other source are ignored.

    A missing ``format`` key is treated as "not audio" instead of raising
    ``KeyError``, and the result is always a real bool (never ``None``).
    """
    audio_formats = ('VBR MP3', 'AIFF', '24bit Flac', 'Flac',
                     'Apple Lossless Audio', 'Advanced Audio Coding')
    return (file.get('source') == 'original' and
            file.get('format') in audio_formats)
def all_files_have_extrnal_id(files, external_id):
    """Return True if every audio file in ``files`` has the given external id.

    For each entry accepted by ``file_has_audio``, its ``external-identifier``
    value (a single string or a list of strings) must contain at least one
    entry starting with ``'urn:' + external_id`` that does not contain
    ``'unknown'``. Non-audio entries are skipped, so a list with no audio
    files yields True.
    """
    for entry in files:
        if not file_has_audio(entry):
            continue
        raw_identifiers = entry.get('external-identifier', [])
        urn_prefix = 'urn:' + external_id
        if not iter_contains_prefix(get_iterable(raw_identifiers), urn_prefix):
            return False
    return True
#______________________________________________________________________________ | |
# Read item identifiers from the file named on the command line (presumably
# one identifier per line -- confirm against parallel_md_get), fetch each
# item's metadata with 20 workers, and record every item whose original audio
# files all have an acoustid but do not all have an mb_recording_id.
with open(sys.argv[1]) as ids:
    results = metadata_record_iterator(ids, workers=20)
    for _, identifier, md_json in results:
        try:
            metadata = ujson.loads(md_json)
            files = metadata.get('files')
            logging.info("retrieved and parsed metadata:\t%s", identifier)
            if not files:
                logging.warning("item has no files!\t%s", identifier)
                continue
            if all_files_have_extrnal_id(files, external_id='acoustid') and \
                    not all_files_have_extrnal_id(files, 'mb_recording_id'):
                # NOTE: despite the file name, the items written here DO have
                # acoustids; what they lack is a complete set of
                # mb_recording_ids. The name is kept so existing output
                # locations stay stable.
                output_filename = 'db/items_dont_have_acoustid-%s.txt' % date
                with open(output_filename, 'a') as f:
                    f.write("%s\n" % identifier)
        except Exception as e:
            # Best-effort batch job: log the failure with the offending
            # identifier and move on to the next item.
            logging.error("%s\t%s", e, identifier)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment