Skip to content

Instantly share code, notes, and snippets.

@GuyAglionby
Last active December 23, 2019 15:07
Show Gist options
  • Save GuyAglionby/a9d184ddec0253fe8d49c49acd88bbe5 to your computer and use it in GitHub Desktop.
Save GuyAglionby/a9d184ddec0253fe8d49c49acd88bbe5 to your computer and use it in GitHub Desktop.
Downloads ACL PDFs, extracts abstracts, and puts them into the XML used to build the ACL Anthology (cf. https://github.com/acl-org/acl-anthology/issues/714)
import pdfquery
import re
from lxml import etree as ET
import urllib.request
import urllib.error
# English word list used to decide whether a hyphen at a line break is a real
# hyphen or just a line-wrapping artefact:
# https://raw.githubusercontent.com/dwyl/english-words/master/words_alpha.txt
words_file = 'words_alpha.txt'
# Extra terms added to the vocabulary on top of the downloaded word list.
extra_words = ('embeddings', 'softmax', 'state-of-the-art', 'sequence-to-sequence')
# Picks the first purely alphabetic run out of a candidate word, which drops
# surrounding punctuation before the vocabulary lookup.
word_re = re.compile('[A-Za-z]+')
acl_paper_url_stub = 'https://www.aclweb.org/anthology/'
acl_xml_file = '/Users/guyaglionby/Documents/projects/acl-anthology/data/xml/W18.xml'
# Only volumes whose id is in this set are processed (ids are strings in the XML).
volume_ids = {str(x) for x in range(51, 66)}
# Typographic ligature code points (Unicode Alphabetic Presentation Forms)
# mapped to their plain-letter spelling; the word list only contains plain
# ASCII spellings, so ligatures must be expanded before any lookup.
ligature_table = str.maketrans({
    '\ufb00': 'ff',
    '\ufb01': 'fi',
    '\ufb02': 'fl',
    '\ufb03': 'ffi',
    '\ufb04': 'ffl',
})


def expand_ligatures(text):
    """Return *text* with typographic ligatures replaced by plain letters."""
    return text.translate(ligature_table)


def join_abstract_lines(lines, vocabulary, paper_id=''):
    """Join the physical lines of an abstract into a single string.

    Lines are joined with a single space.  When a line ends with ``-`` the
    fragment before the hyphen is glued to the first word of the next line;
    if the first alphabetic run of the glued word (lower-cased) is in
    *vocabulary* the hyphen is dropped, otherwise it is kept.  Either way no
    space is inserted between the two lines.

    Args:
        lines: stripped text lines, in reading order.
        vocabulary: container of known lower-case words.
        paper_id: label used as a prefix in the progress messages.

    Returns:
        The joined text with leading/trailing whitespace removed.
    """
    lines = [expand_ligatures(line) for line in lines]
    parts = []
    for i, line in enumerate(lines):
        # A trailing hyphen on the very last line has no continuation to
        # merge with, so it is treated like any ordinary line.
        if not line.endswith('-') or i + 1 >= len(lines):
            parts.append(line + ' ')
            continue
        this_line_end_word = line.split(' ')[-1][:-1]
        next_line_word = lines[i + 1].split(' ')[0]
        match = word_re.search(this_line_end_word + next_line_word)
        if match is not None and match[0].lower() in vocabulary:
            print(f'{paper_id}: Concatenating: "{this_line_end_word}{next_line_word}"')
            parts.append(line[:-1])
        else:
            print(f'{paper_id}: Hyphenating "{this_line_end_word}-{next_line_word}"')
            parts.append(line)
    return ''.join(parts).strip()


def add_abstract(paper, vocabulary):
    """Download the PDF for *paper* and store its abstract on the element.

    Args:
        paper: a ``<paper>`` element whose ``<url>`` child holds the paper id.
        vocabulary: container of known lower-case words, see
            ``join_abstract_lines``.

    Returns:
        True if an ``<abstract>`` child was created or updated, else False.
    """
    url = paper.find('url')
    if url is None or not url.text:
        # Without a paper id there is no PDF URL to build.
        return False
    paper_id = url.text
    print(f'\n\n\n{paper_id}')
    pdf_url = acl_paper_url_stub + paper_id + '.pdf'
    try:
        # With no target filename, urlretrieve downloads to a temporary file.
        pdf_path, _ = urllib.request.urlretrieve(pdf_url)
    except urllib.error.HTTPError:
        print('URL not found for this paper')
        return False
    try:
        pdf = pdfquery.PDFQuery(pdf_path)
        pdf.load(0)  # page 0 only
    except Exception:
        # Best effort: any failure while parsing one PDF skips that paper.
        # Catching Exception (not a bare except) keeps Ctrl-C working.
        print('Parser error in file')
        return False
    # Take the first text box that follows the element matching "Abstract".
    abstract_elements = pdf.tree.xpath('//*/LTTextLineHorizontal[LTTextBoxHorizontal[contains(text(), "Abstract")]]/following::LTTextBoxHorizontal')
    if len(abstract_elements) == 0:
        return False
    line_elements = abstract_elements[0].findall('LTTextLineHorizontal')
    # Elements without text are skipped instead of crashing on None.strip().
    abstract_lines = [x.text.strip() for x in line_elements if x.text and x.text.strip()]
    joined_abstract = join_abstract_lines(abstract_lines, vocabulary, paper_id)
    # The extracted box may run into the heading of the following section.
    if joined_abstract.endswith(' Introduction'):
        joined_abstract = joined_abstract[:-len(' Introduction')]
    if not joined_abstract:
        # Nothing usable was extracted; leave any existing abstract untouched.
        return False
    abstract = paper.find('abstract')
    if abstract is None:
        abstract = ET.SubElement(paper, 'abstract')
    abstract.text = joined_abstract
    return True


def main():
    """Add abstracts to every paper of the selected volumes and rewrite the XML."""
    with open(words_file, 'r', encoding='utf-8') as f:
        words = {x.strip() for x in f}
    words.update(extra_words)

    acl_tree = ET.parse(acl_xml_file)
    acl_xml = acl_tree.getroot()
    for volume in acl_xml.findall('volume'):
        if volume.attrib.get('id') not in volume_ids:
            continue
        print('Volume', volume.attrib['id'])
        for paper in volume.findall('paper'):
            try:
                add_abstract(paper, words)
            finally:
                # Remove the temporary file left behind by urlretrieve so
                # downloads do not pile up over a long run.
                urllib.request.urlcleanup()
    # The input file is overwritten in place.
    acl_tree.write(acl_xml_file, encoding='utf-8', xml_declaration=True, pretty_print=True)


if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment