@kylemcdonald
Created January 17, 2023 08:51
Python script for downloading public CPRA documents.
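The script takes a single argument, the NextRequest request ID (for example 18-1282), and saves the files into a folder named after that ID, mirroring the folder and subfolder names reported by NextRequest. It depends on the third-party packages requests, urllib3, tqdm, and ratelimit.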
import requests
import json
from itertools import count
import os
import sys
import urllib3
from multiprocessing.dummy import Pool
from tqdm import tqdm
from ratelimit import limits, sleep_and_retry
if len(sys.argv) != 2:
    print('Usage: python download-cpra.py 18-1282')
    sys.exit()
request_id = sys.argv[1]
output_folder = request_id
lacity_parallel = 8
lacity_ratelimit = 5 # requests per second (the actual limit may be 300 per minute)
s3_parallel = 12
s3_ratelimit = 10 # requests per second
print(f'downloading document metadata for {request_id}')
all_documents = []
for page_number in count():
    url = f'https://lacity.nextrequest.com/client/request_documents?request_id={request_id}&page_number={page_number}'
    headers = {
        'accept': 'application/json'
    }
    response = requests.get(url, headers=headers)
    if response.status_code != 200:
        raise ValueError(f'error {response.status_code} for url {url}')
    response = response.json()
    documents_count = response['total_documents_count']
    documents = response['documents']
    all_documents.extend(documents)
    if len(documents) > 0:
        # print a running count of the documents collected so far
        print(len(all_documents), end=' ')
        sys.stdout.flush()
    if len(documents) < 25:
        # pages hold up to 25 documents, so a short page is the last page
        if len(all_documents) > 0:
            print('done')
        break
if len(all_documents) == 0:
    print('no documents available for', request_id)
    sys.exit()
download_urls = []
download_paths = []
for e in all_documents:
    document_id = e['id']
    url = f'https://lacity.nextrequest.com/client/documents/download?document_id={document_id}&request_id={request_id}'
    download_urls.append(url)
    # mirror the folder structure reported by NextRequest under the output folder
    path = os.path.join(output_folder, e['folder_name'], e['subfolder_name'], e['title'])
    download_paths.append(path)
print('requesting s3 urls...')
jobs = download_urls
job_count = len(jobs)
pbar = tqdm(total=job_count, leave=True)
def get_url(url):
    # fetch a URL and return the raw response body
    connection_pool = urllib3.PoolManager()
    response = connection_pool.urlopen('GET', url)
    if response.status != 200:
        raise ValueError(f'error {response.status} for url {url}')
    return response.data
@sleep_and_retry
@limits(calls=lacity_ratelimit, period=1)
def job(cur_job):
    # the download endpoint returns JSON containing the S3 URL for the document
    response = json.loads(get_url(cur_job))['url']
    pbar.update(1)
    return response
with Pool(lacity_parallel) as pool:
    s3_urls = pool.map(job, jobs)
print('done')
print('downloading files...')
jobs = list(zip(s3_urls, download_paths))[:100] # note: this slice limits the run to the first 100 files
job_count = len(jobs)
pbar = tqdm(total=job_count, leave=True)
def safe_download_url(url, path):
    connection_pool = urllib3.PoolManager()
    try:
        # skip the download if the file already exists with the size reported by the server
        if os.path.exists(path):
            exists_size = os.path.getsize(path)
            response = connection_pool.request('HEAD', url)
            true_size = int(response.headers['Content-Length'])
            if exists_size == true_size:
                return 'exists'
    except Exception:
        pass
    # make sure the destination folder exists, then download the file
    os.makedirs(os.path.dirname(path), exist_ok=True)
    response = connection_pool.urlopen('GET', url)
    with open(path, 'wb') as f:
        f.write(response.data)
    return 'downloaded'
@sleep_and_retry
@limits(calls=s3_ratelimit, period=1)
def job(cur_job):
    response = safe_download_url(*cur_job)
    pbar.update(1)
    return response
with Pool(s3_parallel) as pool:
    results = pool.map(job, jobs)
print('done')
for url, path, status in zip(s3_urls, download_paths, results):
    if status != 'downloaded':
        print(status, path)