Skip to content

Instantly share code, notes, and snippets.

@mstevenson
Last active November 25, 2022 00:47
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save mstevenson/8794ae4e7b23d7f5181e69bb3c9b6756 to your computer and use it in GitHub Desktop.
Save mstevenson/8794ae4e7b23d7f5181e69bb3c9b6756 to your computer and use it in GitHub Desktop.
Download all images from an Internet Archive collection and write the caption to a text file
import internetarchive as ia
from pathlib import Path
import argparse
import time
# Settings passed as ``config=`` to the internetarchive calls in this file.
# NOTE(review): general.secure=False presumably makes the library use plain
# HTTP instead of HTTPS -- confirm against the internetarchive docs.
config = {'general': {'secure': False}}
def download_collection(collection_name, output_dir):
    """Download every item found in an Internet Archive collection.

    Runs the search query ``collection:<collection_name>`` and hands each
    result's identifier to ``download_item``. Files end up in
    ``<output_dir>/<collection_name>``.

    Args:
        collection_name: Collection identifier; used in the search query and
            as the name of the target sub-directory.
        output_dir: Base output directory. It is created, together with any
            missing parents, if it does not exist yet.
    """
    search = ia.search_items(f'collection:{collection_name}', config=config)

    target_dir = Path(output_dir) / collection_name
    # parents=True: without it a not-yet-existing output_dir raises
    # FileNotFoundError before anything is downloaded.
    target_dir.mkdir(parents=True, exist_ok=True)

    for result in search:
        download_item(result['identifier'], target_dir)
        # Short pause between items (presumably to go easy on the server).
        time.sleep(0.1)
def download_item(item_id, dir):
    """Download the first JPEG of an item and write a caption file next to it.

    The caption is the item's ``description`` metadata when that is a
    non-empty string, otherwise its ``title``. It is written as UTF-8 to a
    ``.txt`` file that shares the image's base name inside ``dir``.

    Only the first file whose format is JPEG is handled; the function returns
    right after it. A message is printed when the item is skipped because its
    title is a list, when the download raises, and when no JPEG is found.

    Args:
        item_id: Internet Archive item identifier.
        dir: Destination directory (``pathlib.Path`` or ``str``). The name
            shadows the builtin but is kept so existing callers keep working.
    """
    item = ia.get_item(item_id, config=config)
    meta = item.metadata

    # .get() instead of [] so an item without a title does not raise KeyError
    # and abort the caller's loop over the whole collection.
    title = meta.get('title')
    if isinstance(title, list):
        # List-valued titles are still skipped, but no longer silently.
        print(f'Skipping {item_id}: title is a list')
        return

    # Decide the caption text up front so the file is never opened and then
    # left empty because of a failing write.
    description = meta.get('description')
    if isinstance(description, str) and description:
        caption_text = description
    else:
        caption_text = title if isinstance(title, str) else ''

    wanted_formats = ('JPEG',)
    for entry in item.files:
        if entry.get('format') not in wanted_formats:
            continue

        filename = entry['name']
        try:
            item.download(
                files=filename,
                formats=entry['format'],
                destdir=dir,
                no_directory=True,
                verbose=True,
                ignore_existing=True,
                retries=2,
            )
        except Exception as e:
            # Best effort: report the failure and give up on this item only.
            print(f'Download failed: {e}')
            return

        caption = Path(dir) / Path(filename).with_suffix('.txt')
        with open(caption, 'w', encoding='utf-8') as f:
            f.write(caption_text)
        return

    print(f'no image for {item_id}')
if __name__ == '__main__':
    # Script entry point: read the two positional arguments, announce the
    # job, then start the collection download.
    cli = argparse.ArgumentParser()
    for arg_name, arg_help in (
        ('collection', 'Collection name'),
        ('output_dir', 'Output directory'),
    ):
        cli.add_argument(arg_name, help=arg_help)
    opts = cli.parse_args()

    print(f'Downloading collection {opts.collection} to {opts.output_dir}')
    download_collection(opts.collection, opts.output_dir)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment