Skip to content

Instantly share code, notes, and snippets.

@brienna
Created October 9, 2018 01:59
Show Gist options
  • Save brienna/4ff6b0a807f8942d1d0a584848a2d35f to your computer and use it in GitHub Desktop.
Save brienna/4ff6b0a807f8942d1d0a584848a2d35f to your computer and use it in GitHub Desktop.
Download astro-ph source files
import json
from datetime import datetime
def begin_download():
    """Download every tar listed under ``src/`` in the ``arxiv`` bucket.

    Pages through the bucket listing with the ``list_objects_v2`` paginator
    (requester-pays request) and passes each key ending in ``.tar`` to
    ``download_file``. Prints a start message and, when the listing is
    exhausted, the number of keys that were handed to ``download_file``.

    Relies on two names defined outside this function: ``s3resource``
    (presumably a boto3 S3 resource -- confirm against its definition) and
    ``download_file``.
    """
    print('Beginning tar download & extraction...')

    # Create a reusable Paginator
    paginator = s3resource.meta.client.get_paginator('list_objects_v2')

    # Create a PageIterator from the Paginator
    page_iterator = paginator.paginate(
        Bucket='arxiv',
        RequestPayer='requester',
        Prefix='src/',
    )

    # Download and extract tars. Count only the keys actually handed to
    # download_file, rather than every listed object minus one.
    num_tars = 0
    for page in page_iterator:
        # Per the boto3 docs, list_objects_v2 omits 'Contents' when a page
        # holds no keys, so fall back to an empty list instead of raising.
        for obj in page.get('Contents', []):
            key = obj['Key']
            # Non-tar objects under the prefix are skipped.
            if key.endswith('.tar'):
                download_file(key)
                num_tars += 1
    print('Processed ' + str(num_tars) + ' tars')
if __name__ == '__main__':
    # Runs only when the script is called on the command line (not when it
    # is imported): begin tar download & extraction.
    begin_download()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment