Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
Script to rename papers from ACL Anthology to 'author year title.pdf'
"""Script to rename papers from ACL Anthology to 'author year title.pdf'
Given PDF files from the ACL Anthology (http://aclweb.org/anthology/) in the
current directory, this script downloads the BibTeX entry for each paper and
extracts the author, year, and title to suggest a more descriptive file name.
Before: N04-1016.pdf
After: Lapata & Keller 2004 The Web as a Baseline: Evaluating the Perform[...]
Usage:
$ python3 aclrename.py >/tmp/rename.sh
$ # do post-editing on /tmp/rename.sh
$ bash /tmp/rename.sh
"""
import re
import sys
import glob
import time
import requests
# Sample BibTeX entry of the kind this script parses. Kept for reference;
# it is not referenced by the code visible in this file.
EXAMPLE = '''@inproceedings{lapata-keller:2004:HLTNAACL,
author = {Lapata, Mirella and Keller, Frank},
title = {The Web as a Baseline: Evaluating the Performance of \
Unsupervised Web-based Models for a Range of NLP Tasks},
booktitle = {HLT-NAACL 2004: Main Proceedings },
editor = {Susan Dumais, Daniel Marcu and Salim Roukos},
year = 2004,
month = {May 2 - May 7},
address = {Boston, Massachusetts, USA},
publisher = {Association for Computational Linguistics},
pages = {121--128}
}'''
# File names of ACL Anthology papers, e.g. 'N04-1016.pdf'.
#   group(1): full paper ID              ('N04-1016')
#   group(2): leading letter + 2 digits  ('N04')
#   group(3): leading letter             ('N')
ACLPAPER = re.compile(r'^((([JPNECDQWKHRT])\d{2})-\d{4})\.pdf$')
# A single 'key = value' line of a BibTeX entry.
#   group(1): field name
#   group(2): value when it is written inside braces, else None
#   group(3): value when it is written without braces, else None
BIBLINE = re.compile(r'^\s*(\S+)\s*=\s*(?:\{(.*)\}|(.*)),?\s*$')
# URL of a paper's BibTeX file; filled with ACLPAPER groups (3, 2, 1), e.g.
# http://aclweb.org/anthology/N/N04/N04-1016.bib
# NOTE(review): the anthology may have moved hosts since this was written;
# confirm this URL layout still resolves.
URLTEMPLATE = 'http://aclweb.org/anthology/%s/%s/%s.bib'
# Matches strings made up only of the capitals A-Z and non-word characters
# (spaces, punctuation); used to detect names/titles written in all caps.
ALLCAPS = re.compile(r'^[A-Z\W]+$')
def main():
    """Suggest new filenames for all ACL papers in current directory.

    For every ``*.pdf`` file in the current directory whose name matches
    ``ACLPAPER``, the corresponding BibTeX file is downloaded and parsed,
    and a shell ``mv`` command renaming the file to
    ``'<author> <year> <title>.pdf'`` is printed on stdout. Both file names
    in the command are shell-quoted, so titles containing quotes, ``$`` or
    backticks cannot break (or inject into) the generated script.

    Progress and error messages go to stderr. A file whose BibTeX entry
    cannot be downloaded, decoded or parsed is reported and skipped; the
    remaining files are still processed.
    """
    # Local import: only needed here, to quote the generated shell command.
    import shlex

    for filename in glob.glob('*.pdf'):
        match = ACLPAPER.match(filename)
        if not match:
            continue
        # Pause half a second before each request.
        time.sleep(0.5)
        url = URLTEMPLATE % (
            match.group(3), match.group(2), match.group(1))
        try:
            # Without a timeout a stalled connection would hang forever.
            bib = requests.get(url, timeout=30)
        except requests.RequestException as err:
            print('could not get bib (%s): %s' % (err, filename),
                  file=sys.stderr)
            continue
        if bib.status_code != 200:
            print('could not get bib (%s): %s' % (
                bib.status_code, filename), file=sys.stderr)
            continue
        try:
            # UnicodeDecodeError is a subclass of ValueError, so a file that
            # is not valid UTF-8 is handled here as well.
            author, year, title = parsebib(bib.content.decode('utf8'))
        except ValueError as err:
            print('could not parse bib (%s): %s' % (err, filename),
                  file=sys.stderr)
            continue
        newfilename = '%s %s %s.pdf' % (author, year, title)
        # Slashes and backslashes are removed so the new name cannot be
        # interpreted as a path.
        newfilename = newfilename.replace('/', '').replace('\\', '')
        print('mv %s %s' % (shlex.quote(filename), shlex.quote(newfilename)))
        print('SUCCESS:', filename, file=sys.stderr)
def parsebib(bib):
    """Parse a bibtex string and return (author, year, title).

    The entry is read line by line; every line matching ``BIBLINE``
    (``key = {value}`` or ``key = value``) contributes one field. A value
    that continues on a following line is therefore cut off at the end of
    its first line.

    Returns a tuple of strings ``(author, year, title)``:
      * author: last name of the first author, followed by ``' & <second>'``
        for exactly two authors or ``' et al.'`` for more than two;
      * year: the ``year`` field as written;
      * title: the first 120 characters of the ``title`` field with braces
        removed.
    Author and title are converted to title case when written in all caps.

    Raises:
        ValueError: if the author, year or title field is missing or empty.
            The raw entry and the parsed fields are printed to stderr first.
    """
    data = {}
    for line in bib.splitlines():
        bibmatch = BIBLINE.match(line)
        if bibmatch is None:
            continue
        # Exactly one of the two value groups takes part in a match. Test
        # for None explicitly: an empty braced value ('key = {}') yields ''
        # for group 2 and must not fall through to group 3, which is None.
        value = bibmatch.group(2)
        if value is None:
            value = bibmatch.group(3)
        data[bibmatch.group(1).lower()] = value.strip('{},')
    # A field that is present but empty is as unusable as a missing one.
    missing = [key for key in ('author', 'year', 'title')
               if not data.get(key, '').strip()]
    if missing:
        print(bib, file=sys.stderr)
        print(data, file=sys.stderr)
        raise ValueError('missing bibtex field(s): %s' % ', '.join(missing))
    year = data['year']
    title = data['title'][:120].replace('{', '').replace('}', '')
    authors = data['author']
    author = lastname(authors)
    separators = authors.count(' and ')
    if separators > 1:  # et al
        author += ' et al.'
    elif separators == 1:  # A & B
        author += ' & ' + lastname(authors.split(' and ')[1])
    if ALLCAPS.match(author):
        author = author.title()
    if ALLCAPS.match(title):
        title = title.title()
    # FIXME: handle accents
    return author, year, title
def lastname(name):
    """Return the last name of the first person in a BibTeX name string.

    ``name`` may hold a single name or several names joined by ``' and '``;
    only the first one is considered. Both ``'Last, First'`` and
    ``'First Last'`` forms are understood. Returns an empty string when
    ``name`` is empty or contains only whitespace.
    """
    # Isolate the first person before looking for a comma; otherwise a comma
    # belonging to a later author ('A B and C, D') would be misread as the
    # 'Last, First' separator of the first one.
    first = name.split(' and ')[0].strip()
    if not first:
        return ''
    if ',' in first:
        return first[:first.index(',')].strip()
    return first.split()[-1]
# Run the renamer only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment