# GuyAglionby/fix-ws-abstracts.py

## fix-ws-abstracts.py
import pdfquery
import re
from lxml import etree as ET
import urllib.request
import urllib.error

# https://raw.githubusercontent.com/dwyl/english-words/master/words_alpha.txt
words_file = 'words_alpha.txt'
# Terms added on top of the downloaded word list.
extra_words = ('embeddings', 'softmax', 'state-of-the-art', 'sequence-to-sequence')

# A run of ASCII letters; `word_re.match` yields the letters that open a string.
word_re = re.compile('[A-Za-z]+')
# The run of ASCII letters that closes a string.
word_tail_re = re.compile(r'[A-Za-z]+\Z')

# The "fi" (U+FB01) and "fl" (U+FB02) ligatures, spelled out as plain letters.
ligatures = str.maketrans({'\ufb01': 'fi', '\ufb02': 'fl'})

acl_paper_url_stub = 'https://www.aclweb.org/anthology/'

acl_xml_file = '/Users/guyaglionby/Documents/projects/acl-anthology/data/xml/W18.xml'

# Only volumes whose `id` attribute is one of '51' ... '65' are processed.
volume_ids = {str(x) for x in range(51, 66)}

# Selects the text boxes that follow the element containing the word "Abstract".
abstract_xpath = '//*/LTTextLineHorizontal[LTTextBoxHorizontal[contains(text(), "Abstract")]]/following::LTTextBoxHorizontal'


def load_words(path=words_file, extras=extra_words):
    """Return the vocabulary used to decide whether a split word is one word.

    The vocabulary is every stripped line of *path* plus the entries of
    *extras*. Candidates are lower-cased before they are looked up in it.
    """
    with open(path, 'r', encoding='utf-8') as f:
        vocabulary = {entry.strip() for entry in f}
    vocabulary.update(extras)
    return vocabulary


def join_abstract_lines(lines, vocabulary, paper_id=''):
    """Join the extracted lines of an abstract into a single string.

    Ligatures are replaced by plain letters. A line that does not end in a
    hyphen is followed by a space. For a line that ends in a hyphen, the
    letters directly before the hyphen and the letters that open the next
    line are glued together; if that word is in *vocabulary* the hyphen was
    only a line-break hyphen and is dropped, otherwise it is kept. A trailing
    " Introduction" (the next section heading) is removed from the result.

    *paper_id* is only used to label the progress messages that are printed.
    """
    lines = [line.translate(ligatures) for line in lines]
    pieces = []
    for i, line in enumerate(lines):
        if not line.endswith('-'):
            pieces.append(line + ' ')
            continue
        if i + 1 >= len(lines):
            # A hyphen on the very last line has nothing to be joined with;
            # keep the text exactly as it was extracted.
            pieces.append(line)
            continue

        following = lines[i + 1]
        this_line_end_word = line.split(' ')[-1][:-1]
        next_line_word = following.split(' ')[0]

        # Only the fragment that straddles the line break is looked up, so
        # that a compound such as "state-of-the-" / "art" is judged on
        # "theart" (not a word: keep the hyphen) instead of on "state".
        before = word_tail_re.search(line[:-1])
        after = word_re.match(following)
        is_one_word = (
            before is not None
            and after is not None
            and (before[0] + after[0]).lower() in vocabulary
        )
        if is_one_word:
            print(f'{paper_id}: Concatenating: "{this_line_end_word}{next_line_word}"')
            pieces.append(line[:-1])
        else:
            print(f'{paper_id}: Hyphenating "{this_line_end_word}-{next_line_word}"')
            pieces.append(line)

    joined_abstract = ''.join(pieces).strip()
    if joined_abstract.endswith(' Introduction'):
        joined_abstract = joined_abstract[:-len(' Introduction')]
    return joined_abstract


def fetch_abstract_lines(pdf_url):
    """Download *pdf_url* and return the stripped lines of its abstract box.

    Only the first page is loaded. Returns ``None`` when the download fails
    with an HTTP error, when the PDF cannot be parsed, or when the XPath
    query for the abstract finds nothing.
    """
    try:
        try:
            pdf_path, _ = urllib.request.urlretrieve(pdf_url)
        except urllib.error.HTTPError:
            print('URL not found for this paper')
            return None
        try:
            pdf = pdfquery.PDFQuery(pdf_path)
            pdf.load(0)
        except Exception:
            # `Exception`, not a bare `except`, so Ctrl-C still stops the run.
            print('Parser error in file')
            return None
        boxes = pdf.tree.xpath(abstract_xpath)
        if not boxes:
            return None
        line_elements = boxes[0].findall('LTTextLineHorizontal')
        # An element without text has `.text is None`.
        return [(element.text or '').strip() for element in line_elements]
    finally:
        # Delete the temporary file(s) urlretrieve created for this paper.
        urllib.request.urlcleanup()


def main():
    """Fill in the <abstract> of every paper in the selected volumes.

    Parses `acl_xml_file`, and for each paper of each selected volume
    downloads its PDF, extracts the abstract text and stores it in the
    paper's <abstract> element (created when missing). The XML file is
    rewritten in place once all volumes have been processed.
    """
    words = load_words()
    acl_tree = ET.parse(acl_xml_file)
    acl_xml = acl_tree.getroot()

    for volume in acl_xml.findall('volume'):
        volume_id = volume.get('id')
        if volume_id not in volume_ids:
            continue
        print('Volume', volume_id)
        for paper in volume.findall('paper'):
            url = paper.find('url')
            if url is None or not url.text:
                print('\n\n\nPaper has no <url>; skipping')
                continue
            paper_id = url.text
            print(f'\n\n\n{paper_id}')

            abstract_lines = fetch_abstract_lines(acl_paper_url_stub + paper_id + '.pdf')
            if abstract_lines is None:
                continue
            joined_abstract = join_abstract_lines(abstract_lines, words, paper_id)
            if not joined_abstract:
                # Nothing was extracted; leave any existing abstract untouched.
                continue

            abstract = paper.find('abstract')
            if abstract is None:
                abstract = ET.SubElement(paper, 'abstract')
            abstract.text = joined_abstract

    acl_tree.write(acl_xml_file, encoding='utf-8', xml_declaration=True, pretty_print=True)


if __name__ == '__main__':
    main()
	# NOTE(review): from here to the end of the file the text looks like a second copy of the
	# script earlier in this file (same imports, same processing steps), but with all nesting
	# flattened: every line carries a single leading tab, so the bodies of `with`, `for`, `if`,
	# `try` and `else` are not indented deeper than their headers. Python cannot parse it in
	# this form. Presumably a copy/paste or export artefact -- TODO confirm and remove.
	import pdfquery
	import re
	from lxml import etree as ET
	import urllib.request
	import urllib.error

	# https://raw.githubusercontent.com/dwyl/english-words/master/words_alpha.txt
	# Builds a set from the stripped lines of words_alpha.txt and adds four extra terms.
	with open('words_alpha.txt', 'r') as f:
	words = set([x.strip() for x in f])
	words.add('embeddings')
	words.add('softmax')
	words.add('state-of-the-art')
	words.add('sequence-to-sequence')

	# Matches a run of ASCII letters.
	word_re = re.compile('[A-Za-z]+')

	# Prefix to which a paper id and '.pdf' are appended to form the download URL.
	acl_paper_url_stub = 'https://www.aclweb.org/anthology/'

	# XML file that is parsed here and written back on the last line.
	acl_xml_file = '/Users/guyaglionby/Documents/projects/acl-anthology/data/xml/W18.xml'
	acl_tree = ET.parse(acl_xml_file)
	acl_xml = acl_tree.getroot()

	# Volume ids '51' ... '65' (as strings); other volumes are skipped.
	volume_ids = list([str(x) for x in range(51,66)])
	volume_trees = acl_xml.findall('volume')

	for volume in volume_trees:
	if volume.attrib['id'] not in volume_ids:
	continue
	print('Volume', volume.attrib['id'])
	for paper in volume.findall('paper'):
	# The text of the paper's <url> child is used as the paper id.
	paper_id = paper.find('url').text
	print(f'\n\n\n{paper_id}')
	pdf_url = acl_paper_url_stub + paper_id + '.pdf'
	# Download the PDF to a temporary file; HTTP errors skip the paper.
	try:
	f, _ = urllib.request.urlretrieve(pdf_url)
	except urllib.error.HTTPError:
	print('URL not found for this paper')
	continue
	# Load only page index 0 of the PDF; any error skips the paper.
	try:
	pdf = pdfquery.PDFQuery(f)
	pdf.load(0)
	except:
	print('Parser error in file')
	continue
	# Text boxes that follow the element containing "Abstract"; only the first one is used.
	abstract_elements = pdf.tree.xpath('//*/LTTextLineHorizontal[LTTextBoxHorizontal[contains(text(), "Abstract")]]/following::LTTextBoxHorizontal')
	if len(abstract_elements) == 0:
	continue
	abstract_elements = abstract_elements[0].findall('LTTextLineHorizontal')
	abstract_lines = [x.text.strip() for x in abstract_elements]
	joined_abstract = ''
	for i, line in enumerate(abstract_lines):
	# Replace the "fi" and "fl" ligature characters with plain letters.
	line = line.replace('ﬁ', 'fi')
	line = line.replace('ﬂ', 'fl')
	if line.endswith('-'):
	# Last word of this line (without the hyphen) and first word of the next line.
	# NOTE(review): `abstract_lines[i + 1]` has no bounds check for the last line.
	this_line_end_word = line.split(' ')[-1][:-1]
	next_line_word = abstract_lines[i + 1].split(' ')[0].replace('ﬁ', 'fi').replace('ﬂ', 'fl')
	concat_word = this_line_end_word + next_line_word
	# `search` returns the first run of letters found in the concatenation.
	concat_word = word_re.search(concat_word)
	if concat_word is not None and concat_word[0].lower() in words:
	print(f'{paper_id}: Concatenating: "{this_line_end_word}{next_line_word}"')
	# Drop the hyphen so the two halves join into one word.
	joined_abstract += line[:-1]
	else:
	print(f'{paper_id}: Hyphenating "{this_line_end_word}-{next_line_word}"')
	# Keep the hyphen; no space is added before the next line.
	joined_abstract += line
	else:
	joined_abstract += line + ' '
	# Reuse the paper's <abstract> element, or create it when absent.
	abstract = paper.find('abstract')
	if abstract is None:
	abstract = ET.SubElement(paper, 'abstract')
	joined_abstract = joined_abstract.strip()
	# Cut a trailing " Introduction" off the end of the joined text.
	if joined_abstract.endswith(' Introduction'):
	joined_abstract = joined_abstract[:-len(' Introduction')]
	abstract.text = joined_abstract

	# Write the modified tree back over the input XML file.
	acl_tree.write(acl_xml_file, encoding='utf-8', xml_declaration=True, pretty_print=True)