Skip to content

Instantly share code, notes, and snippets.

@aryamccarthy
Created March 30, 2019 19:37
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save aryamccarthy/18faf5056343cca9a9fed82d95c80bff to your computer and use it in GitHub Desktop.
Save aryamccarthy/18faf5056343cca9a9fed82d95c80bff to your computer and use it in GitHub Desktop.
parse that TACL metadata
#! /usr/bin/env python3
"""
Convert MIT Press XML files for TACL to Anthology XML.
"""
import logging
import xml.etree.ElementTree as etree
from pathlib import Path
from typing import List, Optional, Tuple
__version__ = '0.1'

# Module-level logger.  When the file is executed directly, name the logger
# after the script file (its stem) instead of the uninformative "__main__".
# The comparison literal must be exactly '__main__' (no stray whitespace),
# otherwise the test is always true and the fallback below is never used.
log = logging.getLogger(__name__ if __name__ != '__main__'
                        else Path(__file__).stem)
def parse_args():
    """Parse command line arguments.

    Returns:
        An ``argparse.Namespace`` with:

        * ``tacl_year_root`` -- the FOLDER argument as an absolute ``Path``.
        * ``outfile`` -- the value given to ``--outfile``/``-o`` (a string),
          or the binary stdout stream when the option is omitted.
        * ``verbose`` -- a ``logging`` level: ``DEBUG`` with ``-v``,
          ``WARNING`` with ``-q``, ``INFO`` otherwise.
    """
    import argparse
    # Imported locally: the --outfile default needs it, and the module only
    # imports sys inside its ``__main__`` guard.
    import sys

    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument('tacl_year_root', metavar='FOLDER', type=Path)
    parser.add_argument('--outfile', '-o', default=sys.stdout.buffer,
                        help='Output XML file (default stdout)')

    # -v and -q share one destination, so they cannot be combined.
    verbosity = parser.add_mutually_exclusive_group()
    verbosity.add_argument('-v', '--verbose', action='store_const',
                           const=logging.DEBUG, default=logging.INFO)
    verbosity.add_argument('-q', '--quiet', dest='verbose',
                           action='store_const', const=logging.WARNING)

    parser.add_argument('--version', action='version',
                        version=f'%(prog)s v{__version__}')

    args = parser.parse_args()
    args.tacl_year_root = args.tacl_year_root.resolve()  # Get absolute path.
    # outfile is deliberately left as given: ElementTree.write() accepts
    # either a file name or a file object opened for binary writing.
    return args
def get_volume_info(xml: Path) -> etree.Element:
    """Build the ``<paper>`` entry that describes the volume as a whole.

    Nothing is read from the file itself; the volume number and year are
    taken from the dot-separated parts of the file stem (the last part is
    the volume, the second part is the year).

    Args:
        xml: Path to the volume-level XML file.

    Returns:
        A ``<paper id="1000">`` element containing ``<title>`` and
        ``<year>`` children.

    Raises:
        IndexError: if the file stem has fewer than two dot-separated parts.
    """
    log.info("Getting volume info from %s", xml)
    # So far, their XML for the volume doesn't play nicely with xml.etree.
    # Thus, we hack: everything is derived from the file name.
    stem_parts = xml.stem.split(".")
    volume_text = stem_parts[-1]
    year_text = stem_parts[1]

    paper = etree.Element('paper')
    paper.attrib['id'] = "1000"  # hard-code because there's only one collection.

    title_text = "Transactions of the Association for Computational Linguistics"
    title = etree.SubElement(paper, 'title')
    title.text = "{}, Volume {}".format(title_text, volume_text)

    year = etree.SubElement(paper, 'year')
    year.text = year_text
    return paper
def get_paperid(xml: Path) -> str:
    """Derive the Anthology paper id from an article XML file name.

    The file stem must end in four ASCII digits.  The id is ``"1"`` followed
    by the last three of them, because TACL is always QXX-1YYY.

    Args:
        xml: Path to the article XML file.

    Returns:
        The four-character paper id, e.g. ``"1042"``.

    Raises:
        ValueError: if the stem does not end in four ASCII digits.  An
            explicit exception is used rather than ``assert`` so the check
            still runs under ``python -O``.
    """
    basename = xml.stem
    suffix = basename[-4:]
    if len(suffix) < 4 or any(ch not in "0123456789" for ch in suffix):
        raise ValueError(
            "Expected a file name ending in four digits, got {!r}".format(basename))
    return "1" + suffix[-3:]  # TACL is always QXX-1YYY.
def get_title(xml_front_node: etree.Element) -> str:
    """Return the article title from a ``<front>`` element.

    The title is looked up at ``article-meta/title-group/article-title``.
    All text inside ``<article-title>`` is concatenated, including text
    nested in inline markup; reading only ``.text`` would stop at the first
    child element and silently truncate such titles.

    Args:
        xml_front_node: The article's ``<front>`` element.

    Returns:
        The full title text (not whitespace-stripped).

    Raises:
        AttributeError: if any element on the lookup path is missing.
    """
    article_meta = xml_front_node.find('article-meta')
    title_group = article_meta.find('title-group')
    article_title = title_group.find('article-title')
    return "".join(article_title.itertext())
def get_year(xml_front_node: etree.Element) -> str:
    """Return the publication year text from a ``<front>`` element.

    Descends ``article-meta`` -> ``pub-date`` -> ``year``, taking the first
    matching child at each level, and returns that element's text.
    """
    node = xml_front_node
    for tag in ('article-meta', 'pub-date', 'year'):
        node = node.find(tag)
    return node.text
def get_abstract(xml_front_node: etree.Element) -> str:
    """Return the abstract of an article as plain text.

    All text found anywhere inside ``article-meta/abstract`` is concatenated
    in document order, then leading and trailing whitespace is removed.
    """
    abstract_node = xml_front_node.find('article-meta').find('abstract')
    pieces = list(abstract_node.itertext())
    return "".join(pieces).strip()
def get_authors(xml_front_node: etree.Element) -> List[Tuple[str, str]]:
    """Collect the authors listed in a ``<front>`` element.

    Every ``<contrib>`` directly under the first
    ``article-meta/contrib-group`` contributes one entry, read from its
    ``<string-name>`` child.

    Returns:
        A list of ``(given_names, surname)`` text pairs in document order.
    """
    contributors = (xml_front_node
                    .find('article-meta')
                    .find('contrib-group')
                    .findall('contrib'))
    name_nodes = (contrib.find('string-name') for contrib in contributors)
    return [
        (name.find('given-names').text, name.find('surname').text)
        for name in name_nodes
    ]
def get_pages(xml_front_node: etree.Element) -> Tuple[str, str]:
    """Return the ``(first page, last page)`` texts of an article.

    Both values come from the ``<fpage>`` and ``<lpage>`` children of
    ``<article-meta>``.
    """
    article_meta = xml_front_node.find('article-meta')
    first, last = (article_meta.find(tag) for tag in ('fpage', 'lpage'))
    return first.text, last.text
def process_xml(xml: Path) -> Optional[etree.Element]:
    """Build an Anthology ``<paper>`` element from one article XML file.

    The element's ``id`` attribute is derived from the file name.  Its
    children are appended in this order: ``<title>``, one ``<author>`` (with
    ``<first>`` and ``<last>``) per author, ``<year>``, ``<abstract>`` and
    ``<pages>``.

    Args:
        xml: Path to the article's XML file; its root must contain a
            ``<front>`` element.

    Returns:
        The populated ``<paper>`` element.  This implementation always
        returns an element; the ``Optional`` annotation is kept because the
        script's main loop checks the result for ``None``.
    """
    # Use the module logger (not the root logger) like the rest of the file.
    log.info("Reading %s", xml)

    paper = etree.Element('paper')
    paper.attrib['id'] = get_paperid(xml)

    front = etree.parse(xml).getroot().find('front')

    etree.SubElement(paper, 'title').text = get_title(front)

    for given_names, surname in get_authors(front):
        author = etree.SubElement(paper, 'author')
        etree.SubElement(author, 'first').text = given_names
        etree.SubElement(author, 'last').text = surname

    etree.SubElement(paper, 'year').text = get_year(front)
    etree.SubElement(paper, 'abstract').text = get_abstract(front)

    pages = etree.SubElement(paper, 'pages')
    pages.text = "–".join(get_pages(front))  # en-dash, not hyphen!
    return paper
if __name__ == '__main__':
    # Script entry point: convert every article under the given year folder
    # into one Anthology <volume> document and write it to the output.
    import sys
    if sys.version_info < (3, 6):
        sys.stderr.write("Python >=3.6 required.\n")
        sys.exit(1)
    args = parse_args()
    logging.basicConfig(level=args.verbose)

    root_name = args.tacl_year_root.stem
    prefix = "Q" if "tacl" in root_name else "J"  # J for CL, Q for TACL.
    year_suffix = root_name.split(".")[1][-2:]  # Feels hacky, too.
    volume_id = prefix + year_suffix

    volume = etree.Element('volume')
    volume.attrib['id'] = volume_id

    # The first file matching the volume-level pattern describes the volume.
    volume_files = list(args.tacl_year_root.glob("tacl.20*.*/tacl.20*.*.xml"))
    if not volume_files:
        # Fail with a readable message instead of a bare IndexError.
        log.error("No volume XML (tacl.20*.*/tacl.20*.*.xml) found under %s",
                  args.tacl_year_root)
        sys.exit(1)
    volume.append(get_volume_info(volume_files[0]))

    for xml in sorted(args.tacl_year_root.glob("tacl_a_*/*.xml")):
        # Report progress through the logger: stdout is the default
        # destination of the XML itself and must not receive other output.
        log.debug("%s", xml)
        pdf = xml.with_suffix(".pdf")
        if not pdf.is_file():
            # Reported, but the paper's metadata is still converted.
            log.error("Missing pdf for %s", xml.name)
        papernode = process_xml(xml)
        if papernode is None:
            continue
        url = etree.Element('url')
        url.text = "http://www.aclweb.org/anthology/{}-{}".format(
            volume_id, papernode.attrib['id'])
        papernode.append(url)
        volume.append(papernode)

    # Hand-rolled pretty printing: set text/tail whitespace so each element
    # is serialized on its own line.
    for paper in volume:
        for field in paper:
            field.tail = '\n '
        if len(paper):
            paper.text = '\n '
            paper[-1].tail = '\n '
        paper.tail = '\n\n '
    if len(volume):
        volume.text = '\n '
        volume[-1].tail = '\n'
    volume.tail = '\n'

    et = etree.ElementTree(volume)
    et.write(args.outfile, encoding="UTF-8", xml_declaration=True)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment