Last active
December 11, 2015 19:29
-
-
Save TomAugspurger/4649234 to your computer and use it in GitHub Desktop.
Fetching BibTeX files for a paper repository from ideas.repec.org. It doesn't have much in the way of error-checking or validation yet; that comes next.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
Want to take a file and fetch the BibTeX info from ideas. | |
""" | |
import re | |
import sys | |
import requests | |
import pathlib | |
from bs4 import BeautifulSoup | |
base_url = 'http://ideas.repec.org/cgi-bin/htsearch' | |
follow_up = [] | |
# TODO just call c.run() in init. Or do all the components, set to self, call. | |
class FetchCitation(object):
    """Fetch the BibTeX entry for a paper from ideas.repec.org.

    A filename of the form ``Author1 & Author2 (2010) - Title.pdf`` is
    parsed into authors and title, searched on RePEc, and the BibTeX
    entry for the best (first) hit is appended to *dest*.
    """

    def __init__(self, path, dest):
        """
        Parameters
        ----------
        path : pathlib.PosixPath or str
            Filename to parse for authors and title.
        dest : file object
            Open file to which the BibTeX entry is appended.
        """
        if isinstance(path, pathlib.PosixPath):
            self.path = path
        else:
            try:
                self.path = pathlib.PosixPath(path)
            except TypeError as e:
                # Chain the original failure instead of a bare `raise OSError`.
                raise OSError('could not build a path from %r' % (path,)) from e
        self.dest = dest
        # Must exist *before* get_info(), which records unparseable paths here
        # (original assigned it after, causing an AttributeError on failure).
        self.miss = []
        self.authors, self.paper = self.get_info(self.path)

    def get_info(self, path):
        """Parse a filename into its authors and paper title.

        Returns
        -------
        (authors, paper) : (list of str, str)

        Raises
        ------
        ValueError
            If the filename does not contain exactly one ``(year)`` marker;
            the path is appended to ``self.miss`` first.
        """
        year_pat = re.compile(r' \([\d-]{0,4}\)')  # e.g. " (2010)"
        sep_authors = re.compile(r' & |, ')
        try:
            all_authors, paper = year_pat.split(self.path.as_posix())
        except ValueError:
            # Original caught OSError (never raised here) and then hit a
            # NameError on return; record the miss and propagate instead.
            print('Missed %s' % path)
            self.miss.append(path)
            raise
        paper = paper.lstrip(' - ')
        self.paper = paper
        authors = []
        for author in sep_authors.split(all_authors):
            # Original `strip('& ' or 'and ')` only ever stripped "& ";
            # handle a leading "and " explicitly, as was intended.
            author = author.strip('& ')
            if author.startswith('and '):
                author = author[len('and '):]
            authors.append(author)
        self.authors = authors
        return (authors, paper)

    def get_paper_match(self, links):
        """Filter *links* down to RePEc paper-result URLs. : [] -> [].

        A link whose string form starts with the RePEc paper prefix is kept;
        the leading ``<a href="`` and trailing ``">`` are sliced off so only
        the bare URL remains.
        """
        # Dots escaped: the original pattern let '.' match any character.
        pat = re.compile(r'<a href="http://ideas\.repec\.org/p/.+')
        matches = []
        for link in links:
            m = pat.match(str(link))
            if m:
                # Drop '<a href="' (9 chars) and the trailing '">'.
                matches.append(m.group()[9:-2])
        self.matches = matches
        return matches

    @staticmethod
    def _strip_pdf(query):
        """Remove a trailing '.pdf' from *query*.

        The original used ``rstrip('.pdf')``, which strips the *characters*
        '.', 'p', 'd', 'f' from the end — not the suffix.
        """
        return query[:-len('.pdf')] if query.endswith('.pdf') else query

    def _search(self, payload, all=False):
        """Run one search request; return the best hit, or every hit if *all*.

        Raises IndexError when the results page contains no paper links.
        """
        r = requests.get(base_url, params=payload)
        soup = BeautifulSoup(r.content, 'html.parser')  # explicit parser
        paper_urls = self.get_paper_match(soup.findAll('a'))
        if all:
            return paper_urls
        return paper_urls[0]  # IndexError here means "no results"

    def search(self, all=False):
        """Search RePEc for the paper and return the best result URL.

        Tries the full author list first, then each of the first two authors
        individually, before giving up.

        Raises
        ------
        IndexError
            When no query produces any results.
        """
        payload = {'cmd': 'Search', 'form': 'extended', 'm': 'all',
                   'ps': '10', 'fmt': 'long', 'wm': 'wrd', 'sp': '1',
                   'sy': '1', 'wf': '4BFF', 'dt': 'range', 'db': '', 'de': ''}
        queries = [' '.join([' '.join(self.authors), self.paper])]
        # Fallback queries; the original tried authors[0] then authors[1].
        queries.extend('%s %s' % (author, self.paper)
                       for author in self.authors[:2])
        for query in queries:
            payload['q'] = self._strip_pdf(query)
            try:
                # BUG FIX: the original never forwarded `all` to _search.
                best = self._search(payload, all=all)
            except IndexError:  # no results for this query; try the next
                continue
            self.best = best
            return best
        raise IndexError("No Results Found.")

    def google_search(self, authors, paper):
        # Placeholder for a fallback search engine; intentionally a no-op.
        pass

    def get_bib(self, paper_page):
        """Take a RePEc paper page URL and return its BibTeX entry as a string.

        Follows the page's 'BibTeX' link; when that link is absent, falls back
        to POSTing the page's handle/ref form values to the refs CGI.
        """
        r = requests.get(paper_page)
        soup = BeautifulSoup(r.content, 'html.parser')
        links = soup.findAll('a')
        mask = [x.text == 'BibTeX' for x in links]
        try:
            bib = links[mask.index(True)]  # BS object with just the end of url.
            r2 = requests.get('http://ideas.repec.org' + bib.attrs['href'])
        except ValueError:  # No 'BibTeX' link: alternate page formatting.
            p1 = re.search(r'handle" value="[\w*:]*', r.text).group()
            handle = p1.split('="')[-1]
            p2 = re.search(r'"ref" value="\w* \w*', r.text).group()
            ref = p2.split('="')[-1]
            d = {'handle': handle,
                 'ref': ref,
                 'output': '2'}  # 2 is BibTeX I think.
            url = 'http://ideas.repec.org/cgi-bin/refs.cgi'
            r2 = requests.post(url=url, data=d)
        self.bib = r2.text
        # BUG FIX: was r2.content (bytes), which a text-mode file rejects.
        return r2.text

    def write_bib(self, bib, f):
        """Append *bib* to the (possibly closed) file *f*, then close it.

        On any write failure the entry is kept in the module-level
        ``follow_up`` list for manual handling.
        """
        if f.closed:
            f = open(f.name, 'a')  # Makes assumption on location.
        try:
            f.write(bib)
            print('Success')
        except Exception:  # narrowed from bare except; still best-effort
            print('Fail')
            follow_up.append(bib)
        finally:
            f.close()

    def run(self):
        """Compose all the methods: search, fetch BibTeX, append to dest."""
        self.write_bib(self.get_bib(self.search()), self.dest)
if __name__ == '__main__':
    full_path = pathlib.PosixPath(sys.argv[1])
    # The search only needs the filename, not the directory part.
    file_path = pathlib.PosixPath(full_path.parts[-1])
    # Optional second argument overrides the default destination .bib file.
    dest_name = '/Users/tom/citations.bib' if len(sys.argv) == 2 else sys.argv[2]
    dest = open(dest_name, 'a')
    try:
        cite = FetchCitation(file_path, dest)
        cite.run()
        # BUG FIX: original was print('...') % path — applying % to
        # print()'s None return and raising a TypeError.
        print('Success for %s' % file_path.as_posix())
    except Exception:  # narrowed from bare except:
        # Record the failure so the paper can be retried later; `log`
        # no longer shadows the destination file handle.
        with open('/Users/tom/Desktop/follow_up.txt', 'a') as log:
            log.write(full_path.as_posix() + '\n')
        print('Failed on %s' % full_path.as_posix())
        sys.exit(1)
    finally:
        # write_bib normally closes dest, but make sure it is closed even
        # when FetchCitation raises before writing.
        if not dest.closed:
            dest.close()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment