Last active
December 11, 2015 19:29
-
-
Save TomAugspurger/4649234 to your computer and use it in GitHub Desktop.
Fetching BibTeX files for a paper repository from ideas.repec.org. It doesn't have much in the way of error-checking or validation yet; that comes next.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
Want to take a file and fetch the BibTeX info from ideas. | |
""" | |
import re | |
import sys | |
import requests | |
import pathlib | |
from bs4 import BeautifulSoup | |
base_url = 'http://ideas.repec.org/cgi-bin/htsearch' | |
follow_up = [] | |
# TODO just call c.run() in init. Or do all the components, set to self, call. | |
class FetchCitation(object):
    """Fetch the BibTeX entry for a paper from ideas.repec.org.

    A filename of the form ``Author1 & Author2 (2010) - Title.pdf`` is
    parsed into authors and title, searched on RePEc, and the BibTeX
    entry for the best (first) hit is appended to *dest*.
    """

    def __init__(self, path, dest):
        """
        Parameters
        ----------
        path : pathlib.PosixPath or str
            Filename to parse for authors and title.
        dest : file object
            Open file to which the BibTeX entry is appended.
        """
        if isinstance(path, pathlib.PosixPath):
            self.path = path
        else:
            try:
                self.path = pathlib.PosixPath(path)
            except TypeError as e:
                # Chain the original failure instead of a bare `raise OSError`.
                raise OSError('could not build a path from %r' % (path,)) from e
        self.dest = dest
        # Must exist *before* get_info(), which records unparseable paths here
        # (original assigned it after, causing an AttributeError on failure).
        self.miss = []
        self.authors, self.paper = self.get_info(self.path)

    def get_info(self, path):
        """Parse a filename into its authors and paper title.

        Returns
        -------
        (authors, paper) : (list of str, str)

        Raises
        ------
        ValueError
            If the filename does not contain exactly one ``(year)`` marker;
            the path is appended to ``self.miss`` first.
        """
        year_pat = re.compile(r' \([\d-]{0,4}\)')  # e.g. " (2010)"
        sep_authors = re.compile(r' & |, ')
        try:
            all_authors, paper = year_pat.split(self.path.as_posix())
        except ValueError:
            # Original caught OSError (never raised here) and then hit a
            # NameError on return; record the miss and propagate instead.
            print('Missed %s' % path)
            self.miss.append(path)
            raise
        paper = paper.lstrip(' - ')
        self.paper = paper
        authors = []
        for author in sep_authors.split(all_authors):
            # Original `strip('& ' or 'and ')` only ever stripped "& ";
            # handle a leading "and " explicitly, as was intended.
            author = author.strip('& ')
            if author.startswith('and '):
                author = author[len('and '):]
            authors.append(author)
        self.authors = authors
        return (authors, paper)

    def get_paper_match(self, links):
        """Filter *links* down to RePEc paper-result URLs. : [] -> [].

        A link whose string form starts with the RePEc paper prefix is kept;
        the leading ``<a href="`` and trailing ``">`` are sliced off so only
        the bare URL remains.
        """
        # Dots escaped: the original pattern let '.' match any character.
        pat = re.compile(r'<a href="http://ideas\.repec\.org/p/.+')
        matches = []
        for link in links:
            m = pat.match(str(link))
            if m:
                # Drop '<a href="' (9 chars) and the trailing '">'.
                matches.append(m.group()[9:-2])
        self.matches = matches
        return matches

    @staticmethod
    def _strip_pdf(query):
        """Remove a trailing '.pdf' from *query*.

        The original used ``rstrip('.pdf')``, which strips the *characters*
        '.', 'p', 'd', 'f' from the end — not the suffix.
        """
        return query[:-len('.pdf')] if query.endswith('.pdf') else query

    def _search(self, payload, all=False):
        """Run one search request; return the best hit, or every hit if *all*.

        Raises IndexError when the results page contains no paper links.
        """
        r = requests.get(base_url, params=payload)
        soup = BeautifulSoup(r.content, 'html.parser')  # explicit parser
        paper_urls = self.get_paper_match(soup.findAll('a'))
        if all:
            return paper_urls
        return paper_urls[0]  # IndexError here means "no results"

    def search(self, all=False):
        """Search RePEc for the paper and return the best result URL.

        Tries the full author list first, then each of the first two authors
        individually, before giving up.

        Raises
        ------
        IndexError
            When no query produces any results.
        """
        payload = {'cmd': 'Search', 'form': 'extended', 'm': 'all',
                   'ps': '10', 'fmt': 'long', 'wm': 'wrd', 'sp': '1',
                   'sy': '1', 'wf': '4BFF', 'dt': 'range', 'db': '', 'de': ''}
        queries = [' '.join([' '.join(self.authors), self.paper])]
        # Fallback queries; the original tried authors[0] then authors[1].
        queries.extend('%s %s' % (author, self.paper)
                       for author in self.authors[:2])
        for query in queries:
            payload['q'] = self._strip_pdf(query)
            try:
                # BUG FIX: the original never forwarded `all` to _search.
                best = self._search(payload, all=all)
            except IndexError:  # no results for this query; try the next
                continue
            self.best = best
            return best
        raise IndexError("No Results Found.")

    def google_search(self, authors, paper):
        # Placeholder for a fallback search engine; intentionally a no-op.
        pass

    def get_bib(self, paper_page):
        """Take a RePEc paper page URL and return its BibTeX entry as a string.

        Follows the page's 'BibTeX' link; when that link is absent, falls back
        to POSTing the page's handle/ref form values to the refs CGI.
        """
        r = requests.get(paper_page)
        soup = BeautifulSoup(r.content, 'html.parser')
        links = soup.findAll('a')
        mask = [x.text == 'BibTeX' for x in links]
        try:
            bib = links[mask.index(True)]  # BS object with just the end of url.
            r2 = requests.get('http://ideas.repec.org' + bib.attrs['href'])
        except ValueError:  # No 'BibTeX' link: alternate page formatting.
            p1 = re.search(r'handle" value="[\w*:]*', r.text).group()
            handle = p1.split('="')[-1]
            p2 = re.search(r'"ref" value="\w* \w*', r.text).group()
            ref = p2.split('="')[-1]
            d = {'handle': handle,
                 'ref': ref,
                 'output': '2'}  # 2 is BibTeX I think.
            url = 'http://ideas.repec.org/cgi-bin/refs.cgi'
            r2 = requests.post(url=url, data=d)
        self.bib = r2.text
        # BUG FIX: was r2.content (bytes), which a text-mode file rejects.
        return r2.text

    def write_bib(self, bib, f):
        """Append *bib* to the (possibly closed) file *f*, then close it.

        On any write failure the entry is kept in the module-level
        ``follow_up`` list for manual handling.
        """
        if f.closed:
            f = open(f.name, 'a')  # Makes assumption on location.
        try:
            f.write(bib)
            print('Success')
        except Exception:  # narrowed from bare except; still best-effort
            print('Fail')
            follow_up.append(bib)
        finally:
            f.close()

    def run(self):
        """Compose all the methods: search, fetch BibTeX, append to dest."""
        self.write_bib(self.get_bib(self.search()), self.dest)
if __name__ == '__main__':
    full_path = pathlib.PosixPath(sys.argv[1])
    # The search only needs the filename, not the directory part.
    file_path = pathlib.PosixPath(full_path.parts[-1])
    # Optional second argument overrides the default destination .bib file.
    dest_name = '/Users/tom/citations.bib' if len(sys.argv) == 2 else sys.argv[2]
    dest = open(dest_name, 'a')
    try:
        cite = FetchCitation(file_path, dest)
        cite.run()
        # BUG FIX: original was print('...') % path — applying % to
        # print()'s None return and raising a TypeError.
        print('Success for %s' % file_path.as_posix())
    except Exception:  # narrowed from bare except:
        # Record the failure so the paper can be retried later; `log`
        # no longer shadows the destination file handle.
        with open('/Users/tom/Desktop/follow_up.txt', 'a') as log:
            log.write(full_path.as_posix() + '\n')
        print('Failed on %s' % full_path.as_posix())
        sys.exit(1)
    finally:
        # write_bib normally closes dest, but make sure it is closed even
        # when FetchCitation raises before writing.
        if not dest.closed:
            dest.close()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment