Skip to content

Instantly share code, notes, and snippets.

@brienna
Created October 9, 2018 01:58
Show Gist options
  • Save brienna/b42390c28dc31d9197a47dab14f2e1f1 to your computer and use it in GitHub Desktop.
Save brienna/b42390c28dc31d9197a47dab14f2e1f1 to your computer and use it in GitHub Desktop.
Explore the manifest file
from bs4 import BeautifulSoup
def explore_metadata(manifest_path='src/arXiv_src_manifest.xml'):
    """Print summary statistics for the arXiv source-bucket manifest.

    Parses the manifest XML and prints, in order: the manifest's
    top-level ``timestamp`` value, the number of ``file`` elements, and
    the sum of all ``size`` elements expressed in decimal gigabytes.

    Args:
        manifest_path: Path to the manifest XML file. Defaults to the
            location this script has always read from.

    Raises:
        OSError: If the manifest file cannot be opened.
        ValueError: If a ``size`` element does not contain an integer.
    """
    print('\narxiv bucket metadata:')

    # Binary mode hands raw bytes to the parser so that the encoding
    # declared inside the XML document is used, not the platform default.
    with open(manifest_path, 'rb') as manifest:
        soup = BeautifulSoup(manifest, 'xml')

    # Last time the manifest was edited: only the direct child of the
    # root element is considered (recursive=False).
    timestamp = soup.arXivSRC.find('timestamp', recursive=False).string
    print('Manifest was last edited on ' + timestamp)

    # Number of files listed in the bucket.
    num_files = len(soup.find_all('file'))
    print('arxiv bucket contains ' + str(num_files) + ' tars')

    # Total size: every <size> element holds an integer, summed here and
    # reported in units of 10**9.
    total_size = sum(int(size.string) for size in soup.find_all('size'))
    print('Total size: ' + str(total_size/1000000000) + ' GB')
    print('')
# Entry point: runs only when the script is executed from the command
# line, not when it is imported as a module.
if __name__ == '__main__':
    # Print a summary of the bucket metadata.
    explore_metadata()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment