Skip to content

Instantly share code, notes, and snippets.

@TomAugspurger
Last active December 11, 2015 19:29
Show Gist options
  • Save TomAugspurger/4649234 to your computer and use it in GitHub Desktop.
Save TomAugspurger/4649234 to your computer and use it in GitHub Desktop.
Fetches BibTeX entries for a paper repository from ideas.repec.org. It doesn't yet have much in the way of error-checking or validation; that comes next.
"""
Want to take a file and fetch the BibTeX info from ideas.
"""
import re
import sys
import requests
import pathlib
from bs4 import BeautifulSoup
base_url = 'http://ideas.repec.org/cgi-bin/htsearch'
follow_up = []
# TODO just call c.run() in init. Or do all the components, set to self, call.
class FetchCitation(object):
    """Get citations via RePEc (ideas.repec.org).

    The source filename is expected to look like
    ``'Author One & Author Two (2010) - Paper Title.pdf'``.
    """

    def __init__(self, path, dest):
        """
        Parameters
        ----------
        path : str or pathlib.PosixPath
            Filename encoding the authors and the paper title.
        dest : file object
            Open file that fetched BibTeX entries are appended to.

        Raises
        ------
        OSError
            If *path* cannot be turned into a ``pathlib.PosixPath``.
        """
        if isinstance(path, pathlib.PosixPath):
            self.path = path
        else:
            try:
                self.path = pathlib.PosixPath(path)
            except (TypeError, ValueError) as e:
                # BUG FIX: was a bare except re-raised as a message-less
                # OSError; keep OSError for callers but add context.
                raise OSError('Invalid path: %r' % (path,)) from e
        self.dest = dest
        # BUG FIX: self.miss must exist *before* get_info runs, because
        # get_info appends to it when parsing fails.
        self.miss = []
        self.authors, self.paper = self.get_info(self.path)

    def get_info(self, path):
        """Parse *path* into ``(authors, paper_title)``.

        Returns
        -------
        tuple of (list of str, str)

        Raises
        ------
        ValueError
            If the filename lacks the ``' (year)'`` separator; the path
            is also recorded in ``self.miss``.
        """
        year_pat = re.compile(r' \([\d-]{0,4}\)')
        sep_authors = re.compile(r' & |, ')
        try:
            # e.g. 'A & B (2010) - Title.pdf' -> ('A & B', ' - Title.pdf')
            all_authors, paper = re.split(year_pat, path.as_posix())
        except ValueError:
            # BUG FIX: the original caught OSError, which re.split never
            # raises, then crashed on undefined names afterwards.
            print('Missed %s' % path)
            self.miss.append(path)
            raise
        paper = paper.lstrip(' - ')
        # NOTE: the original wrote strip('& ' or 'and '), which always
        # evaluates to strip('& '); that is the behavior kept here.
        authors = [author.strip('& ') for author in sep_authors.split(all_authors)]
        return authors, paper

    def get_paper_match(self, links):
        """From a set of links, keep those matching a RePEc paper result.

        Paper results live under ``http://ideas.repec.org/p/...``.
        : [] -> [] (the result is also stored on ``self.matches``).
        """
        # BUG FIX: the original sliced the match with [9:-2], which only
        # strips '<a href="' correctly when the tag has no body text;
        # capture the href with a regex group instead.
        paper_pat = re.compile(r'<a href="(http://ideas\.repec\.org/p/[^"]+)"')
        matches = []
        for link in links:
            m = paper_pat.match(str(link))
            if m:
                matches.append(m.group(1))
        self.matches = matches
        return matches

    def search(self, all=False):
        """Send the search and 'parse' the results.

        Tries the full author list first, then each of the first two
        authors individually. Returns the first paper URL found (or the
        whole list when ``all`` is true).

        Raises
        ------
        IndexError
            If no query returns any paper results.
        """
        payload = {'cmd': 'Search', 'form': 'extended', 'm': 'all', 'ps': '10',
                   'fmt': 'long', 'wm': 'wrd', 'sp': '1', 'sy': '1', 'wf': '4BFF',
                   'dt': 'range', 'db': '', 'de': ''}

        def _strip_pdf(q):
            # BUG FIX: rstrip('.pdf') strips any trailing '.', 'p', 'd' or
            # 'f' characters (mangling e.g. 'offered'); drop the suffix only.
            return q[:-len('.pdf')] if q.endswith('.pdf') else q

        def _search(payload, all=False):
            r = requests.get(base_url, params=payload)
            soup = BeautifulSoup(r.content)
            paper_urls = self.get_paper_match(soup.findAll('a'))
            best = paper_urls[0]  # IndexError here signals "no results"
            return paper_urls if all else best

        # Replaces the original's three copy-pasted nested try/except
        # blocks with a single fallback loop over candidate queries.
        queries = [' '.join(self.authors)] + self.authors[:2]
        for who in queries:
            payload['q'] = _strip_pdf(' '.join([who, self.paper]))
            try:
                best = _search(payload, all)
            except IndexError:  # No results for this query; try the next.
                continue
            self.best = best
            return best
        raise IndexError("No Results Found.")

    def google_search(self, authors, paper):
        """Placeholder for a Google-based fallback search (not implemented)."""
        pass
        # raise IndexError("No Results Found.")

    def get_bib(self, paper_page):
        """Takes a RePEc page and finds the BibTeX entry and returns a string.

        Tries the page's 'BibTeX' link first; falls back to posting the
        paper's handle/ref to the refs.cgi exporter for the alternate
        page layout.
        """
        r = requests.get(paper_page)
        soup = BeautifulSoup(r.content)
        links = soup.findAll('a')
        mask = [x.text == 'BibTeX' for x in links]
        try:
            bib_link = links[mask.index(True)]  # ValueError if no BibTeX link.
            r2 = requests.get('http://ideas.repec.org' + bib_link.attrs['href'])
        except ValueError:  # Alternate formatting.
            p1 = re.search(r'handle" value="[\w*:]*', r.text).group()
            handle = p1.split('="')[-1]
            p2 = re.search(r'"ref" value="\w* \w*', r.text).group()
            ref = p2.split('="')[-1]
            d = {'handle': handle,
                 'ref': ref,
                 'output': '2'}  # presumably 2 selects BibTeX output — TODO confirm
            url = 'http://ideas.repec.org/cgi-bin/refs.cgi'
            r2 = requests.post(url=url, data=d)
        self.bib = r2.text
        # BUG FIX: return text, not bytes — write_bib appends to a file
        # opened in text mode, and bytes made every write fail silently.
        return r2.text

    def write_bib(self, bib, f):
        """Appends the BibTeX string *bib* to open file *f*, then closes it."""
        if f.closed:
            f = open(f.name, 'a')  # Makes assumption on location.
        try:
            f.write(bib)
            print('Success')
        except (TypeError, ValueError, OSError):
            # Narrowed from a bare except; keep the entry for follow-up.
            print('Fail')
            follow_up.append(bib)
        finally:
            f.close()

    def run(self):
        """
        Compose all the methods to get the citation.
        """
        self.write_bib(self.get_bib(self.search()), self.dest)
if __name__ == '__main__':
    # Usage: script.py <path-to-paper> [<bibliography-file>]
    full_path = pathlib.PosixPath(sys.argv[1])
    file_path = pathlib.PosixPath(full_path.parts[-1])  # filename component only
    dest_name = sys.argv[2] if len(sys.argv) > 2 else '/Users/tom/citations.bib'
    dest = open(dest_name, 'a')
    try:
        cite = FetchCitation(file_path, dest)
        cite.run()
        # BUG FIX: the original wrote print('Success for %s') % x, which
        # applies '%' to print's return value (None), raising TypeError on
        # every success and routing it into the failure branch below.
        print('Success for %s' % file_path.as_posix())
    except Exception:
        # Narrowed from a bare except. Distinct name 'log' so the
        # destination file handle isn't shadowed (the original reused 'f').
        with open('/Users/tom/Desktop/follow_up.txt', 'a') as log:
            log.write(full_path.as_posix() + '\n')
        print('Failed on %s' % full_path.as_posix())
        sys.exit(1)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment