Skip to content

Instantly share code, notes, and snippets.

@mccalluc
Last active February 27, 2018 17:29
Show Gist options
  • Save mccalluc/c21d15b3b1784a671fc3d0092c380a03 to your computer and use it in GitHub Desktop.
Save mccalluc/c21d15b3b1784a671fc3d0092c380a03 to your computer and use it in GitHub Desktop.
Download files from Princeton BII, zip, and move to S3
#!/usr/bin/env python
import argparse
import requests
import re
import shutil
from urllib.parse import urlencode
from tempfile import mkdtemp
import os
import urllib3
import zipfile
# from boto.s3.key import Key
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
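# TODO: the requests below pass "verify=False" because the BII server's certificate fails with "certificate verify failed"; re-enable verification once the certificate is fixed.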
def arg_parser():
    """Build the command-line parser for this script.

    Returns:
        argparse.ArgumentParser: a parser with one required string option,
        ``--study_id``.
    """
    parser = argparse.ArgumentParser(
        description='Download files from BII, zip, and move to S3')
    parser.add_argument('--study_id', type=str, required=True)
    # TODO: enable once the S3 upload step is implemented.
    # parser.add_argument('--bucket', type=str, required=True)
    return parser
def isa_urls(id):
    """Return the absolute URLs of the ``.txt`` files listed for a BII study.

    The study's front page is fetched, the first link containing
    ``submission_repo`` is followed, and every ``href`` ending in ``.txt``
    on that page is resolved against the repository URL.

    Args:
        id: study identifier, sent as the ``studyId`` query parameter.

    Returns:
        list[str]: one absolute URL per ``.txt`` link found (possibly empty).

    Raises:
        requests.HTTPError: if either page responds with an error status.
        ValueError: if the front page has no ``submission_repo`` link.
    """
    # Local import: keeps this function self-contained, since the file's
    # import section only brings in ``urlencode``.
    from urllib.parse import urljoin

    query = urlencode({'studyId': id})
    front_url = 'https://pentacon-bii.princeton.edu/study.seam?{}'.format(query)
    # verify=False: the server's certificate currently fails verification.
    front_response = requests.get(front_url, verify=False)
    front_response.raise_for_status()

    repo_match = re.search(
        r'href="([^"]+submission_repo[^"]+)', front_response.text)
    if repo_match is None:
        # Fail with a clear message instead of AttributeError on None.
        raise ValueError(
            'No submission_repo link found on {}'.format(front_url))
    # Resolving against the front page also accepts a relative href; an
    # absolute href is returned unchanged.
    repo_url = urljoin(front_url, repo_match.group(1))
    print(repo_url)

    repo_response = requests.get(repo_url, verify=False)
    repo_response.raise_for_status()
    isa_paths = re.findall(r'href="([^"]+\.txt)"', repo_response.text)
    print(isa_paths)

    # urljoin replaces the last path segment unless the base ends in '/',
    # so force a trailing slash to treat the repository URL as a directory.
    base_url = repo_url if repo_url.endswith('/') else repo_url + '/'
    return [urljoin(base_url, path) for path in isa_paths]
def download(urls):
    """Download each URL into a fresh temporary directory.

    Each file is named after the last ``/``-separated component of its URL,
    so two URLs with the same final component overwrite each other.

    Args:
        urls: iterable of URL strings to fetch.

    Returns:
        str: path of the temporary directory holding the downloaded files.
        The directory is created even when ``urls`` is empty and is not
        removed by this function.

    Raises:
        requests.HTTPError: if any URL responds with an error status.
    """
    target_dir = mkdtemp()
    for url in urls:
        local_filename = os.path.join(target_dir, url.split('/')[-1])
        # verify=False: the server's certificate currently fails verification.
        # The ``with`` releases the streamed connection even if writing fails.
        with requests.get(url, stream=True, verify=False) as response:
            # Do not save an HTTP error page as if it were the data file.
            response.raise_for_status()
            with open(local_filename, 'wb') as f:
                # iter_content undoes gzip/deflate transfer encoding, which
                # copying ``response.raw`` directly would not.
                for chunk in response.iter_content(chunk_size=64 * 1024):
                    f.write(chunk)
    return target_dir
def zip(id, dir):
    """Bundle the entries of ``dir`` into ``<id>.zip`` inside that directory.

    Entries are stored under their bare names (no directory prefix). The
    process working directory is left untouched, and an existing archive
    of the same name is overwritten rather than embedded in the new one.

    Args:
        id: study identifier, used as the archive's base name.
        dir: directory whose entries are archived; may be relative.

    Returns:
        str: ``os.path.join(dir, '<id>.zip')``, the path of the archive.
    """
    zip_filename = '{}.zip'.format(id)
    zip_path = os.path.join(dir, zip_filename)
    # List before creating the archive and skip a leftover archive from a
    # previous run; sorting makes the member order deterministic.
    names = sorted(
        name for name in os.listdir(dir) if name != zip_filename)
    with zipfile.ZipFile(zip_path, mode='w') as z:
        for name in names:
            # arcname keeps member names flat without needing os.chdir().
            z.write(os.path.join(dir, name), arcname=name)
    return zip_path
# def upload(filename, bucket):
# Key(bucket=bucket)
def main(args):
    """Find, download, and zip the files for one study, then print the path.

    Args:
        args: parsed command-line arguments; only ``args.study_id`` is read.
    """
    # Local names avoid shadowing the ``id`` and ``dir`` builtins.
    study_id = args.study_id
    urls = isa_urls(study_id)
    download_dir = download(urls)
    zip_filename = zip(study_id, download_dir)
    print(zip_filename)
# Script entry point: parse the command line and run the pipeline.
if __name__ == '__main__':
    parser = arg_parser()
    args = parser.parse_args()
    main(args)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment