@zaemyung
Created June 28, 2023 19:45
Download arXiv dump files from the S3 bucket, latest files first.
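The 'arxiv' S3 bucket is requester-pays: every request passes RequestPayer='requester' and the downloader, not arXiv, is billed for the transfer. AWS credentials are expected in the environment, e.g. via a .env file loaded by python-dotenv at the top of the script.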
import os
from datetime import datetime
from pathlib import Path

# pip install boto3 python-dotenv beautifulsoup4
import boto3
import dotenv
from bs4 import BeautifulSoup

dotenv.load_dotenv()
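# boto3 picks up AWS credentials (AWS_ACCESS_KEY_ID / AWS_SECRET_ACCESS_KEY)
# from the environment, so load_dotenv() lets them live in a local .env file.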

def calc_used_up_bytes():
    """Total bytes already downloaded under ./arxiv_chunks."""
    root_directory = Path('./arxiv_chunks')
    return sum(f.stat().st_size for f in root_directory.glob('**/*') if f.is_file())

if __name__ == '__main__':
    s3 = boto3.resource(
        's3',                    # the AWS resource we want to use
        region_name='us-east-1', # same region the arxiv bucket is in
    )  # (resource handle is unused below; only the client is needed)
    client = boto3.client('s3')

    # One-off helpers: list the bucket and fetch the manifests
    # (requester-pays, hence ExtraArgs={'RequestPayer': 'requester'}):
    # result = client.list_objects(Bucket='arxiv', RequestPayer='requester')
    # for o in result['Contents']:
    #     print(o)
    # client.download_file('arxiv', 'pdf/arXiv_pdf_manifest.xml', 'arXiv_pdf_manifest.xml', ExtraArgs={'RequestPayer': 'requester'})
    # client.download_file('arxiv', 'src/arXiv_src_manifest.xml', 'arXiv_src_manifest.xml', ExtraArgs={'RequestPayer': 'requester'})
    with open('arXiv_src_manifest.xml', 'r') as f:
        soup = BeautifulSoup(f, features='xml')

    files = soup.find_all('file')
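    # Each <file> entry carries the fields read in the loop below; an
    # illustrative (hypothetical) entry, values made up:
    # <file>
    #   <filename>src/arXiv_src_1012_001.tar</filename>
    #   <num_items>365</num_items>
    #   <size>524288000</size>
    #   <timestamp>2010-12-23 00:13:59</timestamp>
    #   <content_md5sum>d41d8cd98f00b204e9800998ecf8427e</content_md5sum>
    # </file>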
    chunks = []
    total_number_of_items = 0
    total_size = 0
    for i, file in enumerate(files):
        filename = file.find('filename').text.strip()
        num_items = int(file.find('num_items').text.strip())
        size = int(file.find('size').text.strip())
        timestamp = file.find('timestamp').text.strip()
        content_md5sum = file.find('content_md5sum').text.strip()
        chunks.append({
            'content_md5sum': content_md5sum,
            'filename': filename,
            'num_items': num_items,
            'size': size,
            # e.g. 2010-12-23 00:13:59
            'timestamp': datetime.strptime(timestamp, '%Y-%m-%d %H:%M:%S'),
        })
        total_number_of_items += num_items
        total_size += size
        print(i, filename, num_items, size, timestamp)

    # newest chunks first
    chunks.sort(key=lambda x: x['timestamp'], reverse=True)
    print(f'number of chunks: {len(chunks)}\n'
          f'total number of items: {total_number_of_items}\n'
          f'total size in TiB: {total_size / (2**40):.2f}')
    used_up_bytes = calc_used_up_bytes()
    # print(used_up_bytes / (2**30))
    max_bytes = 100 * (2**30)  # stop once ~100 GiB is on disk

    # Downloaded up to i = 194. Need to start from i = 195
    print('downloading...')
    for i, chunk in enumerate(chunks):
        file_size = chunk['size']
        if used_up_bytes + file_size >= max_bytes:
            break
        print(i)
        filename = chunk['filename']
        out_path = os.path.join('arxiv_chunks', os.path.basename(filename))
        if os.path.isfile(out_path):
            print(f'{out_path} already exists! Skipped.')
            continue
        client.download_file('arxiv', filename, out_path,
                             ExtraArgs={'RequestPayer': 'requester'})
        used_up_bytes += file_size
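
Possible extension: verify each downloaded tar against the manifest. A minimal sketch, assuming content_md5sum is the md5 of the tar file itself (the manifest may also expose a separate per-file md5sum field; swap fields accordingly if content_md5sum covers the tar's contents instead):

import hashlib

def file_md5(path, chunk_size=8 * 2**20):
    # Stream the tar through md5 so multi-GB files never sit in memory.
    digest = hashlib.md5()
    with open(path, 'rb') as f:
        for block in iter(lambda: f.read(chunk_size), b''):
            digest.update(block)
    return digest.hexdigest()

# After client.download_file(...) above:
# if file_md5(out_path) != chunk['content_md5sum']:
#     print(f'checksum mismatch for {out_path}')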