Created
October 24, 2012 00:16
-
-
Save jjjake/3942889 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
# | |
# Find out the most used metadata fields on archive.org | |
# | |
import cPickle as pickle
import logging
import os
import sys
from datetime import datetime

import ujson as json

# parallel_md_get.py available here: https://gist.github.com/3784845
from parallel_md_get import metadata_record_iterator
## Logging!
# One log file per script per UTC day: logs/<script name>-<YYYY-MM-DD>.log
date = datetime.utcnow().strftime("%Y-%m-%d")
# str.strip('.py') removes any leading/trailing '.', 'p' and 'y' characters
# (and keeps directory components), so derive the script name with os.path.
script_name = os.path.splitext(os.path.basename(__file__))[0]
log_filename = "logs/%s-%s.log" % (script_name, date)
# basicConfig cannot open the log file when its directory does not exist.
log_directory = os.path.dirname(log_filename)
if not os.path.isdir(log_directory):
    os.makedirs(log_directory)
logging_format = "%(asctime)s\t%(levelname)s\t%(message)s"
logging.basicConfig(filename=log_filename, level=logging.INFO,
                    format=logging_format)
#______________________________________________________________________________ | |
def count_keys(key_dictionary, keys):
    """Increment the counter in key_dictionary for every key in keys.

    key_dictionary maps a key to the number of times it has been seen; it
    is updated in place and also returned. keys is an iterable of hashable
    keys to count.

    Counting is best-effort: if iterating or counting raises, the error is
    logged and the dictionary is returned with whatever was counted so far.
    """
    try:
        # Iterate the argument, not a module-level variable of the caller.
        for key in keys:
            key_dictionary[key] = key_dictionary.get(key, 0) + 1
    except Exception as e:
        # Pass both values as a tuple; "fmt % e, x" applies % to e alone.
        logging.error('%s\t%s' % (e, keys))
    return key_dictionary
def write_dictionary_to_file(dictionary, filename='ia_meta_key_counter.txt'):
    """Pickle dictionary into filename, replacing any existing file.

    The file is opened in binary mode; its content is a pickle stream,
    not text, regardless of the extension of filename.
    """
    with open(filename, 'wb') as pickle_file:
        pickle.dump(dictionary, pickle_file)
#______________________________________________________________________________ | |
# Usage: <script> ID_FILE
# ID_FILE is opened and handed to metadata_record_iterator, which yields
# (index, identifier, metadata JSON string) tuples. The metadata keys of
# every record are tallied and the tally is pickled once at the end.
key_dictionary = {}
# Keep the file open for the whole loop in case the iterator reads lazily.
with open(sys.argv[1]) as ids:
    results = metadata_record_iterator(ids, workers=20)
    for i, raw_id, md_json in results:
        identifier = raw_id.strip('()').strip()
        logging.info('%s\t%s' % (i, identifier))
        try:
            ia_json = json.loads(md_json)
        except Exception as e:
            # Format with a tuple; "fmt % e, x" raised TypeError here.
            logging.error('%s\t%s' % (e, identifier))
            continue
        ia_metadata = ia_json.get('metadata')
        if ia_metadata is None:
            logging.warning('item has no metadata!\t' + identifier)
            continue
        metadata_keys = ia_metadata.keys()
        count_keys(key_dictionary, metadata_keys)
# Write the accumulator itself so an input with no usable records still
# produces an (empty) result instead of a NameError.
write_dictionary_to_file(key_dictionary)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment