Downloads (and renames by title) ArXiv papers linked to in a Google Doc converted to HTML.
"""Downloads ArXiv papers linked to in a Google Doc converted to HTML.
Notes:
------
* tested on OS X with Python 3
* Requires arxiv (`pip install arxiv`)
* Names of PDFs will be the papers' titles on ArXiv (with some
slight formatting changes).
"""
# For compatibility (but use Python 3 anyways)
from __future__ import print_function
try:
    from urllib import urlretrieve  # Python 2
except ImportError:
    from urllib.request import urlretrieve  # Python 3
# Standard Library Dependencies
import re
import os
# Non-Standard 3rd Party Dependencies
import arxiv # pip install arxiv
file_containing_links = 'AndysReadingList.html'
get_authors = True
# Extract all href values from `file_containing_links`
href_tag_pattern = re.compile(r'href="([^\s]*)"')
with open(file_containing_links, 'r') as f:
    string_containing_links = f.read()
links = href_tag_pattern.findall(string_containing_links)
# clean the (tracking?) junk google leaves around the url
links = [x.replace('https://www.google.com/url?q=','') for x in links]
links = [x.split('&sa=')[0] for x in links]
# find all arXiv links (matches new-style IDs only, e.g. 1706.03762)
pattern = re.compile(r"arxiv\.org/\w\w\w/(\d\d\d\d\.\d\d\d\d\d)")
non_arxiv_links = []
arxiv_ids = []
for l in links:
    try:
        arxiv_ids.append(pattern.findall(l)[0])
    except IndexError:
        non_arxiv_links.append(l)
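# For example (illustrative link, not taken from the actual reading list),
# a link exported as
#   https://www.google.com/url?q=https://arxiv.org/abs/1706.03762&sa=D&ust=...
# is cleaned above to https://arxiv.org/abs/1706.03762 and yields the
# ID '1706.03762'.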
def download(obj, dirname=None, prepend_id=False, reformat=True):
    """Downloads and renames the ArXiv paper referred to by `obj`."""
    if dirname is None:
        dirname = os.getcwd()
    try:
        title = obj['title']
        assert title
        if reformat:
            # replace tabs and endlines with spaces
            title = title.replace('\n', ' ').replace('\t', ' ')
            # remove duplicate spaces
            title = ' '.join(filter(None, title.split(' ')))
            # replace colons with hyphens
            title = title.replace(':', '-')
        if prepend_id:
            title = obj['arxiv_url'].split('/')[-1] + '-' + title
        filename = os.path.join(dirname, title + '.pdf')
        if os.path.exists(filename):
            print("Already exists: {}".format(filename))
        else:
            print(filename[:-4])
            urlretrieve(obj['pdf_url'], filename)
        return True
    except Exception as e:
        print("Something went wrong...\n{}\n".format(e))
        return False
# get the corresponding papers from the arXiv API
papers = [arxiv.query(id_list=[x])[0] for x in set(arxiv_ids)]
for k, paper in enumerate(papers):
    print("Downloading paper {}/{} : ".format(k + 1, len(papers)), end='')
    download(paper)
# report non-arXiv links
if non_arxiv_links:
    print('\nThe following links were not downloaded (i.e. are not from arXiv):')
    for x in non_arxiv_links:
        print(x)
if get_authors:
    # make a dictionary mapping each author to a list of their papers
    authors = list(set([x for p in papers for x in p.authors]))
    authors = dict([(x, []) for x in authors])
    for paper in papers:
        for a in paper.authors:
            authors[a].append(paper)

    # sort so the most prevalent authors are listed last
    authors = dict(sorted(list(authors.items()), key=lambda x: len(x[1])))

    # write results to file
    with open('authors.txt', 'w') as f:
        for x in authors:
            f.write(x + '\n')
            for k, p in enumerate(authors[x]):
                f.write("\t%s. " % (k + 1) + p.title + '\n')
            f.write('\n')
    print("\nSee `authors.txt` for a list of authors sorted by prevalence.\n")