Skip to content

Instantly share code, notes, and snippets.

@jjjake
Created October 24, 2012 00:16
Show Gist options
  • Save jjjake/3942889 to your computer and use it in GitHub Desktop.
Save jjjake/3942889 to your computer and use it in GitHub Desktop.
#!/usr/bin/env python
#
# Find out the most used metadata fields on archive.org
#
import logging
import os
import sys
from datetime import datetime

import cPickle as pickle
import ujson as json

# parallel_md_get.py available here: https://gist.github.com/3784845
from parallel_md_get import metadata_record_iterator
## Logging!
# Log to one file per script per UTC day: logs/<script name>-<YYYY-MM-DD>.log
# NOTE: nothing in this script creates the "logs" directory.
date = datetime.utcnow().strftime("%Y-%m-%d")
# str.strip('.py') removes any of the characters '.', 'p' and 'y' from BOTH
# ends of the string (e.g. "python_copy.py" -> "thon_co"), so take the file
# name without its extension explicitly instead.
script_name = os.path.splitext(os.path.basename(__file__))[0]
log_filename = "logs/%s-%s.log" % (script_name, date)
logging_format = "%(asctime)s\t%(levelname)s\t%(message)s"
logging.basicConfig(filename=log_filename, level=logging.INFO,
                    format=logging_format)
#______________________________________________________________________________
def count_keys(key_dictionary, keys):
    """Increment the occurrence count of every key in *keys*.

    Args:
        key_dictionary: dict mapping a key to the number of times it has been
            seen so far. It is updated in place.
        keys: iterable of hashable keys to count; each occurrence adds one.

    Returns:
        The same *key_dictionary* object that was passed in. On failure the
        error is logged and the dictionary is returned with whatever counts
        were applied before the error occurred.
    """
    try:
        for key in keys:
            # A missing key and a stored None both count as "never seen".
            key_dictionary[key] = (key_dictionary.get(key) or 0) + 1
    except Exception as e:
        # Best effort: one malformed record must not abort a long batch run.
        logging.error('count_keys failed\t%s', e)
    return key_dictionary
def write_dictionary_to_file(dictionary, filename='ia_meta_key_counter.txt'):
    """Serialize *dictionary* with pickle into *filename*.

    The file is opened in binary write mode, so existing content at that path
    is replaced. Returns None.
    """
    pickle_file = open(filename, 'wb')
    try:
        pickle.dump(dictionary, pickle_file)
    finally:
        # Close the handle whether or not pickling succeeded.
        pickle_file.close()
#______________________________________________________________________________
# Driver: read item identifiers from the file named on the command line, fetch
# each item's metadata record, tally how often every metadata field name
# occurs, and pickle the tally once all records have been processed.
if len(sys.argv) < 2:
    sys.exit('usage: %s IDENTIFIER_FILE' % sys.argv[0])

key_dictionary = {}
with open(sys.argv[1]) as ids:
    results = metadata_record_iterator(ids, workers=20)
    for i, identifier, md_json in results:
        identifier = identifier.strip('()').strip()
        logging.info('%s\t%s', i, identifier)
        try:
            ia_json = json.loads(md_json)
        except Exception as e:
            # Skip records that cannot be parsed; keep the batch running.
            logging.error('%s\t%s', e, identifier)
            continue
        ia_metadata = ia_json.get('metadata')
        if ia_metadata is None:
            logging.warning('item has no metadata!\t%s', identifier)
            continue
        metadata_keys = ia_metadata.keys()
        key_dictionary = count_keys(key_dictionary, metadata_keys)

# Always write the tally, even if it is empty because no record was usable.
write_dictionary_to_file(key_dictionary)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment