-
-
Save arvestad/fbf3a99eb2cc52f26ae048294dcabdee to your computer and use it in GitHub Desktop.
UniProt API downloader code
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import argparse | |
import re | |
import requests | |
from requests.adapters import HTTPAdapter, Retry | |
import sys | |
# Extracts the URL of the rel="next" entry from an HTTP Link header value.
re_next_link = re.compile(r'<(.+)>; rel="next"')

# Retry policy for the shared session: up to five attempts with a backoff
# between them whenever the server answers with one of the listed 5xx codes.
retries = Retry(
    total=5,
    backoff_factor=0.25,
    status_forcelist=[500, 502, 503, 504],
)

# One session for the whole run; the retry policy is attached to HTTPS only.
session = requests.Session()
session.mount("https://", HTTPAdapter(max_retries=retries))

# Records per page assumed by the progress counter in get_batch.
# NOTE(review): presumably should equal the `size` parameter of the URL — confirm.
pagesize = 500
def get_next_link(headers):
    """Return the URL of the next page advertised in a ``Link`` header.

    Args:
        headers: Mapping of response headers (e.g. ``response.headers``).

    Returns:
        The URL found between ``<`` and ``>`` of the entry tagged
        ``rel="next"``, or ``None`` when there is no ``Link`` header or no
        such entry.
    """
    if "Link" not in headers:
        return None
    # A Link header may carry several comma-separated entries. Search the
    # whole value (not only its start) and keep the capture inside a single
    # pair of angle brackets so it cannot span across entries.
    found = re.search(r'<([^<>]+)>; rel="next"', headers["Link"])
    return found.group(1) if found else None
def get_batch(batch_url, timeout=(10, 120)):
    """Yield every page of a paginated query, following ``rel="next"`` links.

    Starting at ``batch_url``, each page is fetched through the module-level
    ``session`` and the following URL is taken from ``get_next_link``; the
    loop ends when no next link is reported. Progress is written to stderr:
    a dot per page, and a running record count on every 50th page.

    Args:
        batch_url: URL of the first page to fetch.
        timeout: ``(connect, read)`` timeout in seconds handed to
            ``session.get`` so that a stalled connection raises instead of
            blocking forever.

    Yields:
        ``(response, total)`` tuples, where ``total`` is the value of the
        ``x-total-results`` response header, or ``None`` if the server did
        not send it.

    Raises:
        requests.HTTPError: If a page comes back with an error status
            (via ``raise_for_status``).
        requests.RequestException: On connection problems or timeouts.
    """
    batch_no = 1
    while batch_url:
        response = session.get(batch_url, timeout=timeout)
        response.raise_for_status()
        # The total is only informational, so a missing header must not
        # abort the download.
        total = response.headers.get("x-total-results")
        if batch_no % 50 == 0:
            shown_total = "unknown" if total is None else total
            print(f'{batch_no * pagesize} out of {shown_total} ', file=sys.stderr, flush=True)
        else:
            print('.', file=sys.stderr, end='', flush=True)
        yield response, total
        batch_url = get_next_link(response.headers)
        batch_no += 1
    # Terminate the line of progress dots.
    print('', file=sys.stderr)
def main(url, outfile):
    """Download all pages behind ``url`` and write them out line by line.

    Args:
        url: First-page URL, passed to ``get_batch``.
        outfile: Path of the file to write, or a falsy value to write to
            standard output.

    Raises:
        SystemExit: If anything goes wrong while downloading or writing; a
            short notice is printed to stderr and the exception text becomes
            the exit message.
    """
    # Open the file as UTF-8 explicitly: the locale default encoding differs
    # between platforms and may be unable to represent non-ASCII characters
    # present in the downloaded text.
    oh = open(outfile, 'w', encoding='utf-8') if outfile else sys.stdout
    try:
        for batch, _total in get_batch(url):
            for line in batch.text.splitlines():
                print(line, file=oh)
    except Exception as e:
        print(f'An error occured when reading data from {url}', file=sys.stderr)
        sys.exit(str(e))
    finally:
        # Only close what this function opened; never close sys.stdout.
        if outfile:
            oh.close()
def prepare_args():
    """Build the command-line parser for the downloader.

    Returns:
        An ``argparse.ArgumentParser`` with a required positional ``url``
        and an optional ``-o``/``--outputfile`` argument.
    """
    parser = argparse.ArgumentParser(
        description='Download UniProt data in chunks, to comply with REST API policies.',
    )
    parser.add_argument('url', help='UniProt REST API URL')
    parser.add_argument(
        '-o',
        '--outputfile',
        help='Where to put the downloaded file',
    )
    return parser
# Script entry point: parse the command line and run the download.
if __name__ == '__main__':
    cli_args = prepare_args().parse_args()
    main(cli_args.url, cli_args.outputfile)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment