Skip to content

Instantly share code, notes, and snippets.

@EricIO
Created March 2, 2015 19:24
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save EricIO/2606a3d45d522ebd6e02 to your computer and use it in GitHub Desktop.
Save EricIO/2606a3d45d522ebd6e02 to your computer and use it in GitHub Desktop.
#-*- coding: utf-8 -*-
import sys
import pprint
import internetarchive
def _search_collection(collection_name):
    """Look up a collection on the Internet Archive.

    Runs a ``collection:<collection_name>`` search through the
    internetarchive library. Returns the resulting Search object, or
    None when the search reports that no items were found.
    """
    query = 'collection:{}'.format(collection_name)
    search = internetarchive.search_items(query)
    # A search that found zero items is reported to the caller as None.
    return None if search.num_found == 0 else search
def _get_item_data(item):
    """Summarise the original files of one search result.

    Fetches the item named by ``item``'s 'identifier' entry and returns a
    dict with three keys: 'identifier' (the fetched item's identifier),
    'files' (the file entries whose 'source' is 'original') and 'size'
    (the sum of those entries' 'size' values as an int; an entry without
    a 'size' counts as 0).
    """
    archive_item = internetarchive.get_item(item.get('identifier'))
    originals = []
    byte_count = 0
    for entry in archive_item.files:
        # Only files marked as originals are counted.
        if entry['source'] != 'original':
            continue
        originals.append(entry)
        byte_count += int(entry.get('size', 0))
    return dict(
        identifier=archive_item.identifier,
        files=originals,
        size=byte_count,
    )
if __name__ == '__main__':
    # Command-line entry point: for the collection named by the first
    # argument, print the number and total size of the original files of
    # each item, followed by the total for the whole collection.
    if len(sys.argv) < 2:
        print('Missing parameter: collection name')
        sys.exit(-1)

    collection = sys.argv[1]
    print('Getting data for the collection {}'.format(collection))
    collection_data = _search_collection(collection)

    # _search_collection returns None when the search finds no items.
    # The check must look at the search result, not the collection name:
    # the name comes from argv and is normally truthy, so testing it would
    # let None reach the loop below and fail with a TypeError.
    if collection_data is None:
        print('No collection {} found'.format(collection))
        sys.exit(-1)

    results = []
    # Note the internetarchive library does a http request for each item so this
    # could take some time.
    for item in collection_data:
        print('Proccessing item {0}\n'.format(item['identifier']))
        results.append(_get_item_data(item))

    # Print the per-item report while accumulating the collection total.
    collection_original_size = 0
    for item in results:
        collection_original_size += int(item['size'])
        print('Item {}:'.format(item['identifier']))
        print('\tTotal original files: {}'.format(len(item['files'])))
        print('\tTotal size: {}'.format(item.get('size')))
    print('Total collection original file size: {}'.format(collection_original_size))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment