Skip to content

Instantly share code, notes, and snippets.

@mjpost
Created March 4, 2021 16:21
Show Gist options
  • Save mjpost/52e2e84639727009137a32ab1e1ad5a6 to your computer and use it in GitHub Desktop.
Save mjpost/52e2e84639727009137a32ab1e1ad5a6 to your computer and use it in GitHub Desktop.
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
#
# Copyright 2019--2021 Matt Post <post@cs.jhu.edu>
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Utility script I change all the time for finding papers in the Anthology and doing
quick fixes on them. This version looks for paper titles that have "[In French]",
removes it, and inserts the appropriate language tag:
<language>fra</language>
It then writes out the XML file.
"""
import argparse
import os
import re
import readline
import shutil
import sys
import lxml.etree as etree
from collections import defaultdict, OrderedDict
from datetime import datetime
from normalize_anth import normalize
from anthology.utils import (
make_simple_element,
build_anthology_id,
deconstruct_anthology_id,
indent,
compute_hash_from_file,
)
from anthology.index import AnthologyIndex
from anthology.people import PersonName
from anthology.bibtex import read_bibtex
from anthology.venues import VenueIndex
from itertools import chain
from typing import Dict, Any
def main(args):
    """Strip the "[In French]" marker from paper titles and tag the language.

    Each file in ``args.files`` is parsed, every ``<paper>`` whose ``<title>``
    contains the marker has the marker removed and a ``<language>fra</language>``
    child added, and the file is then re-indented and written back in place.

    Args:
        args: Parsed command-line arguments; ``args.files`` is an iterable of
            paths to XML files.
    """
    search_text = " [In <fixed-case>F</fixed-case>rench]"

    for collection_file in args.files:
        root_node = etree.parse(collection_file).getroot()

        for paper in root_node.findall(".//paper"):
            title = paper.find("./title")
            if title is None:
                # Nothing to inspect for papers without a title.
                continue

            # The marker contains inline markup, so it can never appear whole in
            # ``title.text`` (which ends at the first child element). Match
            # against the serialized element instead.
            title_xml = etree.tostring(title, encoding="unicode", with_tail=False)
            if search_text not in title_xml:
                continue

            # Rebuild the title from the edited markup and swap it in, keeping
            # whatever whitespace/text followed the original element.
            new_title = etree.fromstring(title_xml.replace(search_text, ""))
            new_title.tail = title.tail
            paper.replace(title, new_title)

            # Avoid a duplicate tag if the paper already declares a language.
            if paper.find("./language") is None:
                make_simple_element("language", "fra", parent=paper)

        tree = etree.ElementTree(root_node)
        indent(root_node)
        tree.write(
            collection_file, encoding="UTF-8", xml_declaration=True, with_tail=True
        )
if __name__ == "__main__":
    # Command-line entry point: expects one or more XML file paths.
    cli = argparse.ArgumentParser()
    cli.add_argument("files", help="List of XML files.", nargs="+")
    main(cli.parse_args())
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment