Created
March 2, 2015 19:24
-
-
Save EricIO/2606a3d45d522ebd6e02 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#-*- coding: utf-8 -*- | |
import sys | |
import pprint | |
import internetarchive | |
def _search_collection(collection_name):
    """Look up a collection on the Internet Archive by name.

    Runs an ``internetarchive.search_items`` query of the form
    ``collection:<collection_name>``.

    Returns the search object produced by the library, or ``None`` when
    the search reports zero matching items (``num_found == 0``).
    """
    query = 'collection:{}'.format(collection_name)
    search = internetarchive.search_items(query)
    # Guard clause: an empty result set is reported to the caller as None.
    if search.num_found == 0:
        return None
    return search
def _get_item_data(item):
    """Collect the original files of a single archive item.

    ``item`` is a search result exposing ``get('identifier')``; the full
    item is fetched with ``internetarchive.get_item``.

    Returns a dict with three keys:
      * ``'identifier'`` - the identifier of the fetched item,
      * ``'files'``      - the file entries whose ``'source'`` is ``'original'``,
      * ``'size'``       - the sum of the ``'size'`` fields of those entries
                           (an entry without a ``'size'`` counts as 0).
    """
    archive_item = internetarchive.get_item(item.get('identifier'))

    originals = []
    size_sum = 0
    for entry in archive_item.files:
        # Skip anything that is not an original upload.
        if entry['source'] != 'original':
            continue
        originals.append(entry)
        size_sum += int(entry.get('size', 0))

    return {
        'identifier': archive_item.identifier,
        'files': originals,
        'size': size_sum,
    }
if __name__ == '__main__':
    # Script entry point.
    # Usage: pass the collection name as the first command-line argument.
    # Prints, for every item in the collection, the number of original files
    # and their summed size, followed by the total for the whole collection.
    if len(sys.argv) < 2:
        print('Missing parameter: collection name')
        sys.exit(-1)

    collection = sys.argv[1]
    print('Getting data for the collection {}'.format(collection))
    collection_data = _search_collection(collection)

    # _search_collection returns None when nothing was found. The check has
    # to be made on the search result rather than on the collection name
    # string, otherwise an unknown collection falls through and the loop
    # below fails trying to iterate over None.
    if collection_data is None:
        print('No collection {} found'.format(collection))
        sys.exit(-1)

    results = []
    # Note the internetarchive library does a http request for each item so this
    # could take some time.
    for item in collection_data:
        print('Proccessing item {0}\n'.format(item['identifier']))
        results.append(_get_item_data(item))

    # Second pass: report per-item figures and accumulate the grand total.
    collection_original_size = 0
    for item in results:
        collection_original_size += int(item['size'])
        print('Item {}:'.format(item['identifier']))
        print('\tTotal original files: {}'.format(len(item['files'])))
        print('\tTotal size: {}'.format(item.get('size')))
    print('Total collection original file size: {}'.format(collection_original_size))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment