#! /usr/bin/env python3
"""Downloads pdfs from David Silver's publications directory."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import os
import re
import sys
import time
import urllib.error
import urllib.request

# Default configuration; url, output_dir and wait_seconds can be
# overridden from the command line below.
FLAGS = {
    'url' : 'http://www0.cs.ucl.ac.uk/staff/d.silver/web/Publications_files/',
    'output_dir' : 'papers/',
    'pdf_regex' : re.compile(r'([A-Za-z0-9\-_]+?\.pdf)'),
    'wait_seconds' : 60.0
}

def main():
    # Show the effective configuration after any command line overrides.
    print(FLAGS)

    # Create the output directory on first use.
    if not os.path.exists(FLAGS['output_dir']):
        print('Creating {} directory'.format(FLAGS['output_dir']))
        os.makedirs(FLAGS['output_dir'])

    # Fetch the publications index page; str() on the raw bytes is enough
    # for the ASCII-only filename regex.
    print('Reading {} for papers.'.format(FLAGS['url']))
    with urllib.request.urlopen(FLAGS['url']) as in_url:
        html_data = str(in_url.read())

    # Collect the unique pdf filenames referenced by the page.
    pdfs = set()
    for pdf in FLAGS['pdf_regex'].findall(html_data):
        pdfs.add(pdf)
    pdfs = sorted(pdfs)

    for pdf in pdfs:
        pdf_path = FLAGS['output_dir'] + pdf

        # Skip papers that have already been downloaded.
        if os.path.exists(pdf_path):
            print('Skipping {} as it already exists.'.format(pdf))
            continue

        pdf_url = FLAGS['url'] + pdf
        print('Downloading {}'.format(pdf_url))

        try:
            with urllib.request.urlopen(pdf_url) as in_url:
                pdf_data = in_url.read()
        except urllib.error.HTTPError as e:
            print('Failed download with error {}'.format(e))
            continue

        print('Writing {}'.format(pdf_path))
        with open(pdf_path, 'wb') as out_file:
            out_file.write(pdf_data)

        # Be polite to the server by pausing between downloads.
        if FLAGS['wait_seconds'] > 0.0:
            print('Sleeping for {} seconds'.format(FLAGS['wait_seconds']))
            time.sleep(FLAGS['wait_seconds'])

    return 0

if __name__ == '__main__':
    import argparse

    parser = argparse.ArgumentParser()

    parser.add_argument(
        '--url', type=str, metavar='url',
        default=FLAGS['url'],
        help='Publications directory.'
    )
    parser.add_argument(
        '--output_dir', type=str, metavar='dir',
        default=FLAGS['output_dir'],
        help='Where to store the downloaded papers.'
    )
    parser.add_argument(
        '--wait_seconds', type=float, metavar='s',
        default=FLAGS['wait_seconds'],
        help='How many seconds to wait between each download.'
    )

    # Fold the parsed arguments back into FLAGS so main() sees the overrides.
    ARGS = parser.parse_args()
    FLAGS.update(ARGS.__dict__)

    sys.exit(main())