Python script for bulk-downloading public CPRA (California Public Records Act) documents from the City of Los Angeles NextRequest portal.
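Usage: pass the NextRequest request ID as the only argument, for example: python download-cpra.py 18-1282. Downloaded files are written to a folder named after the request ID, mirroring the folder/subfolder structure reported by the portal.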
import requests
import json
from itertools import count
import os
import sys
import urllib3
from multiprocessing.dummy import Pool  # thread pool with the multiprocessing API
from tqdm import tqdm
from ratelimit import limits, sleep_and_retry
if len(sys.argv) != 2:
    print('Usage: python download-cpra.py 18-1282')
    sys.exit(1)

request_id = sys.argv[1]
output_folder = request_id

lacity_parallel = 8
lacity_ratelimit = 5  # requests per second (the portal's actual limit may be 300 per minute)
s3_parallel = 12
s3_ratelimit = 10  # requests per second
print(f'downloading document metadata for {request_id}')
all_documents = []
for page_number in count():
    url = f'https://lacity.nextrequest.com/client/request_documents?request_id={request_id}&page_number={page_number}'
    headers = {
        'accept': 'application/json'
    }
    response = requests.get(url, headers=headers)
    if response.status_code != 200:
        sys.exit(f'error {response.status_code} for page {page_number}')
    response = response.json()
    documents_count = response['total_documents_count']  # total reported by the API (not needed for pagination)
    documents = response['documents']
    all_documents.extend(documents)
    if len(documents) > 0:
        print(len(all_documents), end=' ')
        sys.stdout.flush()
    if len(documents) < 25:  # the API returns at most 25 documents per page
        if len(all_documents) > 0:
            print('done')
        break

if len(all_documents) == 0:
    print('no documents available for', request_id)
    sys.exit()
download_urls = []
download_paths = []
for e in all_documents:
    document_id = e['id']
    url = f'https://lacity.nextrequest.com/client/documents/download?document_id={document_id}&request_id={request_id}'
    download_urls.append(url)
    # mirror the folder/subfolder structure reported by the portal
    path = os.path.join(output_folder, e['folder_name'], e['subfolder_name'], e['title'])
    download_paths.append(path)
print('requesting s3 urls...')
jobs = download_urls
job_count = len(jobs)
pbar = tqdm(total=job_count, leave=True)

http = urllib3.PoolManager()  # shared, thread-safe connection pool

def get_url(url):
    response = http.urlopen('GET', url)
    if response.status != 200:
        raise ValueError(f'error {response.status} for url {url}')
    return response.data

@sleep_and_retry
@limits(calls=lacity_ratelimit, period=1)
def url_job(cur_job):
    # the portal replies with JSON containing a signed S3 url for the document
    s3_url = json.loads(get_url(cur_job))['url']
    pbar.update(1)
    return s3_url

with Pool(lacity_parallel) as pool:
    s3_urls = pool.map(url_job, jobs)
pbar.close()
print('done')
print('downloading files...')
jobs = list(zip(s3_urls, download_paths))
job_count = len(jobs)
pbar = tqdm(total=job_count, leave=True)
def safe_download_url(url, path):
    # skip the download if a local copy already exists with the expected size
    try:
        exists_size = os.path.getsize(path)
        response = http.request('HEAD', url)
        true_size = int(response.headers['Content-Length'])
        if exists_size == true_size:
            return 'exists'
    except (OSError, KeyError, ValueError):
        # no local copy (or the size check failed): make sure the folder exists
        os.makedirs(os.path.dirname(path), exist_ok=True)
    response = http.urlopen('GET', url)
    if response.status != 200:
        raise ValueError(f'error {response.status} for url {url}')
    with open(path, 'wb') as f:
        f.write(response.data)
    return 'downloaded'
@sleep_and_retry
@limits(calls=s3_ratelimit, period=1)
def download_job(cur_job):
    status = safe_download_url(*cur_job)
    pbar.update(1)
    return status

with Pool(s3_parallel) as pool:
    results = pool.map(download_job, jobs)
pbar.close()
print('done')

# report anything that was skipped or already present
for url, path, status in zip(s3_urls, download_paths, results):
    if status != 'downloaded':
        print(status, path)
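Note: the script depends on the third-party packages requests, urllib3, tqdm, and ratelimit. Assuming a standard Python 3 environment, they can be installed with: pip install requests urllib3 tqdm ratelimit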