"""Downloads ArXiv papers linked to in a Google Doc converted to HTML. | |
Notes: | |
------ | |
* tested on OS X with Python 3 | |
* Requires arxiv (`pip install arxiv`) | |
* Names of PDFs will be the papers' titles on ArXiv (with some | |
slight formatting changes). | |
""" | |
# For compatibility (but use Python 3 anyway)
from __future__ import print_function
try:
    from urllib import urlretrieve  # Python 2
except ImportError:
    from urllib.request import urlretrieve  # Python 3

# Standard Library Dependencies
import re
import os

# Non-Standard 3rd Party Dependencies
import arxiv  # pip install arxiv
file_containing_links = 'AndysReadingList.html'
get_authors = True

# Extract all href values from `file_containing_links`
href_tag_pattern = re.compile(r'href="([^\s]*)"')
with open(file_containing_links, 'r') as f:
    string_containing_links = f.read()
links = href_tag_pattern.findall(string_containing_links)

# clean the (tracking?) junk google leaves around the url
links = [x.replace('https://www.google.com/url?q=', '') for x in links]
links = [x.split('&sa=')[0] for x in links]
# find all arXiv links (new-style ids have a 4- or 5-digit suffix)
pattern = re.compile(r"arxiv\.org/\w{3}/(\d{4}\.\d{4,5})")
non_arxiv_links = []
arxiv_ids = []
for l in links:
    try:
        arxiv_ids.append(pattern.findall(l)[0])
    except IndexError:
        non_arxiv_links.append(l)
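# Illustration of the extraction above: a link like
#   https://arxiv.org/abs/1706.03762
# yields the id '1706.03762'; links that don't match (e.g. blog posts) end up
# in `non_arxiv_links` and are reported at the end of the script.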
def download(obj, dirname=None, prepend_id=False, reformat=True):
    """Downloads and renames the ArXiv paper referred to by `obj`."""
    if dirname is None:
        dirname = os.getcwd()
    try:
        title = obj['title']
        assert title
        if reformat:
            # replace tabs and endlines with spaces
            title = title.replace('\n', ' ').replace('\t', ' ')
            # remove duplicate spaces
            title = ' '.join(filter(None, title.split(' ')))
            # replace colons with hyphens
            title = title.replace(':', '-')
        if prepend_id:
            title = obj['arxiv_url'].split('/')[-1] + '-' + title
        filename = os.path.join(dirname, title + '.pdf')
        if os.path.exists(filename):
            print("Already exists: {}".format(filename))
        else:
            print(filename[:-4])
            urlretrieve(obj['pdf_url'], filename)
        return True
    except Exception as e:
        print("Something went wrong...\n{}\n".format(e))
        return False
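# Example of download()'s output (hypothetical values): a paper titled
# "Attention Is All You Need" is saved as "Attention Is All You Need.pdf" in
# the working directory, or, with prepend_id=True, as something like
# "1706.03762v5-Attention Is All You Need.pdf".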
# get the corresponding papers
papers = [arxiv.query(id_list=[x])[0] for x in set(arxiv_ids)]
for k, paper in enumerate(papers):
    print("Downloading paper {}/{} : ".format(k + 1, len(papers)), end='')
    download(paper)

# report non-arXiv links
if non_arxiv_links:
    print('\nThe following links were not downloaded (i.e. are not from arXiv):')
    for x in non_arxiv_links:
        print(x)
if get_authors:
    # map each author to the list of their papers
    authors = list(set([x for p in papers for x in p.authors]))
    authors = dict([(x, []) for x in authors])
    for paper in papers:
        for a in paper.authors:
            authors[a].append(paper)

    # sort so most prevalent authors are listed last
    authors = dict(sorted(list(authors.items()), key=lambda x: len(x[1])))

    # write results to file
    with open('authors.txt', 'w') as f:
        for x in authors:
            f.write(x + '\n')
            for k, p in enumerate(authors[x]):
                f.write("\t%s. " % (k + 1) + p.title + '\n')
            f.write('\n')
    print("\nSee `authors.txt` for list of authors sorted by prevalence.\n")