# andreasvc/aclrename.py

## aclrename.py
"""Script to rename papers from ACL Anthology to 'author year title.pdf'

Given PDF files from the ACL anthology http://aclweb.org/anthology/
downloads bibtex file and extracts author, year, title
to suggest more descriptive names.

Before: N04-1016.pdf
After: Lapata & Keller 2004 The Web as a Baseline: Evaluating the Perform[...]

Usage:
$ python3 aclrename.py >/tmp/rename.sh
$ # do post-editing on /tmp/rename.sh
$ bash /tmp/rename.sh
"""
import re
import sys
import glob
import time
import requests

# Sample bibtex entry for the paper used as the example in the module
# docstring (N04-1016). Presumably kept as a reference for the line format
# that parsebib() has to handle; it is not used by the code shown here.
EXAMPLE = '''@inproceedings{lapata-keller:2004:HLTNAACL,
  author    = {Lapata, Mirella  and  Keller, Frank},
  title     = {The Web as a Baseline: Evaluating the Performance of \
Unsupervised Web-based Models for a Range of NLP Tasks},
  booktitle = {HLT-NAACL 2004: Main Proceedings },
  editor = {Susan Dumais, Daniel Marcu and Salim Roukos},
  year      = 2004,
  month     = {May 2 - May 7},
  address   = {Boston, Massachusetts, USA},
  publisher = {Association for Computational Linguistics},
  pages     = {121--128}
}'''

# Filename of an ACL Anthology paper, e.g. 'N04-1016.pdf'.
# group(1) = full ID ('N04-1016'), group(2) = letter plus two digits ('N04'),
# group(3) = the leading letter ('N').
ACLPAPER = re.compile(r'^((([JPNECDQWKHRT])\d{2})-\d{4})\.pdf$')
# A single 'key = {value},' or 'key = value,' line of a bibtex entry.
# group(1) = key, group(2) = text between the outermost braces,
# group(3) = unbraced value; being greedy, group(3) can still include the
# trailing comma.
BIBLINE = re.compile(r'^\s*(\S+)\s*=\s*(?:\{(.*)\}|(.*)),?\s*$')
# Filled in with (leading letter, letter plus two digits, full ID), e.g.
# http://aclweb.org/anthology/N/N04/N04-1016.bib
URLTEMPLATE = 'http://aclweb.org/anthology/%s/%s/%s.bib'
# Matches a non-empty string made up solely of the letters A-Z and non-word
# characters (spaces, punctuation).
ALLCAPS = re.compile(r'^[A-Z\W]+$')


def main():
	"""Suggest new filenames for all ACL papers in current directory.

	For every '*.pdf' file in the current directory whose name matches
	ACLPAPER, the bibtex entry is downloaded and an ``mv`` command renaming
	the file to 'author year title.pdf' is printed on stdout. Progress and
	error messages go to stderr, so stdout can be redirected to a script.
	Files whose entry cannot be downloaded or parsed are reported and
	skipped.
	"""
	import shlex  # local import: only needed to quote the shell commands

	for filename in glob.glob('*.pdf'):
		match = ACLPAPER.match(filename)
		if match is None:
			continue
		# Pause before each request (presumably to avoid hammering the
		# server).
		time.sleep(0.5)
		url = URLTEMPLATE % (match.group(3), match.group(2), match.group(1))
		try:
			# Without a timeout a stalled connection would hang forever.
			bib = requests.get(url, timeout=30)
		except requests.RequestException as err:
			print('could not get bib (%s): %s' % (err, filename),
					file=sys.stderr)
			continue
		if bib.status_code != 200:
			print('could not get bib (%s): %s' % (
					bib.status_code, filename), file=sys.stderr)
			continue
		try:
			# UnicodeDecodeError is a subclass of ValueError, so a badly
			# encoded response is handled here as well.
			author, year, title = parsebib(bib.content.decode('utf8'))
		except ValueError as err:
			print('could not parse bib (%s): %s' % (err, filename),
					file=sys.stderr)
			continue
		newfilename = '%s %s %s.pdf' % (author, year, title)
		newfilename = newfilename.replace('/', '').replace('\\', '')
		# The new name is built from downloaded text; quote it so that
		# characters such as ", $ or ` cannot break, or inject commands
		# into, the generated shell script.
		print('mv %s %s' % (shlex.quote(filename), shlex.quote(newfilename)))
		print('SUCCESS:', filename, file=sys.stderr)


def parsebib(bib):
	"""Parse a bibtex string and return (author, year, title).

	Only fields written on a single line, as matched by BIBLINE, are
	recognised; all other lines are ignored.

	:param bib: text of one bibtex entry.
	:returns: tuple of strings ``(author, year, title)``. ``author`` is the
		last name of the first author, followed by ' & ' and the second
		author's last name when there are two authors, or by ' et al.' when
		there are more. ``title`` is the first 120 characters of the title
		field with braces removed. An author or title matched by ALLCAPS is
		converted to title case.
	:raises ValueError: if the author, year or title field is missing or
		empty; the entry and the parsed fields are printed to stderr first.
	"""
	data = {}
	for line in bib.splitlines():
		bibmatch = BIBLINE.match(line)
		if bibmatch is None:
			continue
		braced, bare = bibmatch.group(2), bibmatch.group(3)
		# Compare with None instead of relying on truthiness: an empty
		# braced value ('key = {}') is '' and must not fall through to the
		# other group, which is None in that case.
		value = braced if braced is not None else bare
		# Whitespace is stripped together with braces and commas, otherwise
		# trailing blanks would shield a trailing comma from being removed.
		data[bibmatch.group(1).lower()] = value.strip('{}, \t')
	missing = [key for key in ('author', 'year', 'title')
			if not data.get(key)]
	if missing:
		print(bib, file=sys.stderr)
		print(data, file=sys.stderr)
		raise ValueError(
				'missing bibtex field(s): %s' % ', '.join(missing))
	year = data['year']
	title = data['title'][:120].replace('{', '').replace('}', '')
	# Split the authors first, so that each name is looked at on its own
	# and a comma in a later name cannot affect the first one.
	authors = data['author'].split(' and ')
	author = lastname(authors[0])
	if len(authors) > 2:  # et al
		author += ' et al.'
	elif len(authors) == 2:  # A & B
		author += ' & ' + lastname(authors[1])
	if ALLCAPS.match(author):
		author = author.title()
	if ALLCAPS.match(title):
		title = title.title()
	# FIXME: handle accents
	return author, year, title


def lastname(name):
	"""Return the last name of the first author in a bibtex author string.

	Both the 'Last, First' and the 'First Last' notation are understood.
	When several authors are joined with ' and ', only the first one is
	considered.

	:param name: one author name, or several names separated by ' and '.
	:returns: the last name, or '' if the string contains no name.
	"""
	# Isolate the first author before looking for a comma; otherwise a
	# comma belonging to a later author ('A B and C, D') would be taken as
	# the 'Last, First' separator of the first one.
	first = name.split(' and ')[0].strip()
	if ',' in first:
		return first[:first.index(',')].strip()
	words = first.split()
	return words[-1] if words else ''

# Run the renaming only when executed as a script, not when imported.
if __name__ == '__main__':
	main()
	"""Script to rename papers from ACL Anthology to 'author year title.pdf'

	Given PDF files from the ACL anthology http://aclweb.org/anthology/
	downloads bibtex file and extracts author, year, title
	to suggest more descriptive names.

	Before: N04-1016.pdf
	After: Lapata & Keller 2004 The Web as a Baseline: Evaluating the Perform[...]

	Usage:
	$ python3 aclrename.py >/tmp/rename.sh
	$ # do post-editing on /tmp/rename.sh
	$ bash /tmp/rename.sh
	"""
	import re
	import sys
	import glob
	import time
	import requests

	# Sample bibtex entry for the paper used as the example in the docstring
	# (N04-1016). Presumably kept as a reference for the line format that
	# parsebib() has to handle; it is not used by the code shown here.
	EXAMPLE = '''@inproceedings{lapata-keller:2004:HLTNAACL,
	author = {Lapata, Mirella and Keller, Frank},
	title = {The Web as a Baseline: Evaluating the Performance of \
	Unsupervised Web-based Models for a Range of NLP Tasks},
	booktitle = {HLT-NAACL 2004: Main Proceedings },
	editor = {Susan Dumais, Daniel Marcu and Salim Roukos},
	year = 2004,
	month = {May 2 - May 7},
	address = {Boston, Massachusetts, USA},
	publisher = {Association for Computational Linguistics},
	pages = {121--128}
	}'''

	# Filename of an ACL Anthology paper, e.g. 'N04-1016.pdf'.
	# group(1) = full ID ('N04-1016'), group(2) = letter plus two digits
	# ('N04'), group(3) = the leading letter ('N').
	ACLPAPER = re.compile(r'^((([JPNECDQWKHRT])\d{2})-\d{4})\.pdf$')
	# A single 'key = {value},' or 'key = value,' line of a bibtex entry.
	# group(1) = key, group(2) = text between the outermost braces,
	# group(3) = unbraced value (may still include the trailing comma).
	# The '*' quantifiers and the unescaped '|' are required: without them
	# the pattern only accepts single-character values followed by a
	# literal '|', so no ordinary bibtex line matches.
	BIBLINE = re.compile(r'^\s*(\S+)\s*=\s*(?:\{(.*)\}|(.*)),?\s*$')
	# Filled in with (leading letter, letter plus two digits, full ID), e.g.
	# http://aclweb.org/anthology/N/N04/N04-1016.bib
	URLTEMPLATE = 'http://aclweb.org/anthology/%s/%s/%s.bib'
	# Matches a non-empty string made up solely of the letters A-Z and
	# non-word characters (spaces, punctuation).
	ALLCAPS = re.compile(r'^[A-Z\W]+$')


	def main():
		"""Suggest new filenames for all ACL papers in current directory.

		For every '*.pdf' file in the current directory whose name matches
		ACLPAPER, the bibtex entry is downloaded and an ``mv`` command
		renaming the file to 'author year title.pdf' is printed on stdout.
		Progress and error messages go to stderr, so stdout can be
		redirected to a script. Files whose entry cannot be downloaded or
		parsed are reported and skipped.
		"""
		import shlex  # local import: only needed to quote shell commands

		for filename in glob.glob('*.pdf'):
			match = ACLPAPER.match(filename)
			if match is None:
				continue
			# Pause before each request (presumably to avoid hammering the
			# server).
			time.sleep(0.5)
			url = URLTEMPLATE % (
					match.group(3), match.group(2), match.group(1))
			try:
				# Without a timeout a stalled connection would hang forever.
				bib = requests.get(url, timeout=30)
			except requests.RequestException as err:
				print('could not get bib (%s): %s' % (err, filename),
						file=sys.stderr)
				continue
			if bib.status_code != 200:
				print('could not get bib (%s): %s' % (
						bib.status_code, filename), file=sys.stderr)
				continue
			try:
				# UnicodeDecodeError is a subclass of ValueError, so a badly
				# encoded response is handled here as well.
				author, year, title = parsebib(bib.content.decode('utf8'))
			except ValueError as err:
				print('could not parse bib (%s): %s' % (err, filename),
						file=sys.stderr)
				continue
			newfilename = '%s %s %s.pdf' % (author, year, title)
			newfilename = newfilename.replace('/', '').replace('\\', '')
			# The new name is built from downloaded text; quote it so that
			# characters such as ", $ or ` cannot break, or inject commands
			# into, the generated shell script.
			print('mv %s %s' % (
					shlex.quote(filename), shlex.quote(newfilename)))
			print('SUCCESS:', filename, file=sys.stderr)


	def parsebib(bib):
		"""Parse a bibtex string and return (author, year, title).

		Only fields written on a single line, as matched by BIBLINE, are
		recognised; all other lines are ignored.

		:param bib: text of one bibtex entry.
		:returns: tuple of strings ``(author, year, title)``. ``author`` is
			the last name of the first author, followed by ' & ' and the
			second author's last name when there are two authors, or by
			' et al.' when there are more. ``title`` is the first 120
			characters of the title field with braces removed. An author or
			title matched by ALLCAPS is converted to title case.
		:raises ValueError: if the author, year or title field is missing
			or empty; the entry and the parsed fields are printed to stderr
			first.
		"""
		data = {}
		for line in bib.splitlines():
			bibmatch = BIBLINE.match(line)
			if bibmatch is None:
				continue
			braced, bare = bibmatch.group(2), bibmatch.group(3)
			# Compare with None instead of relying on truthiness: an empty
			# braced value ('key = {}') is '' and must not fall through to
			# the other group, which is None in that case.
			value = braced if braced is not None else bare
			# Whitespace is stripped together with braces and commas,
			# otherwise trailing blanks would shield a trailing comma.
			data[bibmatch.group(1).lower()] = value.strip('{}, \t')
		missing = [key for key in ('author', 'year', 'title')
				if not data.get(key)]
		if missing:
			print(bib, file=sys.stderr)
			print(data, file=sys.stderr)
			raise ValueError(
					'missing bibtex field(s): %s' % ', '.join(missing))
		year = data['year']
		title = data['title'][:120].replace('{', '').replace('}', '')
		# Split the authors first, so that each name is looked at on its
		# own and a comma in a later name cannot affect the first one.
		authors = data['author'].split(' and ')
		author = lastname(authors[0])
		if len(authors) > 2:  # et al
			author += ' et al.'
		elif len(authors) == 2:  # A & B
			author += ' & ' + lastname(authors[1])
		if ALLCAPS.match(author):
			author = author.title()
		if ALLCAPS.match(title):
			title = title.title()
		# FIXME: handle accents
		return author, year, title


	def lastname(name):
		"""Return the last name of the first author in a bibtex author string.

		Both the 'Last, First' and the 'First Last' notation are understood.
		When several authors are joined with ' and ', only the first one is
		considered.

		:param name: one author name, or several names separated by ' and '.
		:returns: the last name, or '' if the string contains no name.
		"""
		# Isolate the first author before looking for a comma; otherwise a
		# comma belonging to a later author ('A B and C, D') would be taken
		# as the 'Last, First' separator of the first one.
		first = name.split(' and ')[0].strip()
		if ',' in first:
			return first[:first.index(',')].strip()
		words = first.split()
		return words[-1] if words else ''

	# Run the renaming only when executed as a script, not when imported.
	if __name__ == '__main__':
		main()