Skip to content

Instantly share code, notes, and snippets.

@GuyAglionby
Created October 18, 2022 12:38
Show Gist options
  • Save GuyAglionby/93c1d6af00e45a802ef3c49f40decfaa to your computer and use it in GitHub Desktop.
Save GuyAglionby/93c1d6af00e45a802ef3c49f40decfaa to your computer and use it in GitHub Desktop.
Remove duplicate video entries from ACL Anthology data
import glob
import itertools
from lxml import etree
from networkx.utils import UnionFind
from tqdm import tqdm
def elems_same(elem1, elem2):
    """Return whether two elements carry equal attribute mappings.

    Only the ``attrib`` mapping of each element is compared; nothing else
    about the elements is inspected.
    """
    first_attrs = elem1.attrib
    second_attrs = elem2.attrib
    return first_attrs == second_attrs
def handle_file(filename):
    """Remove duplicate ``<video>`` children from each ``<paper>`` of an XML file.

    Two ``<video>`` elements are duplicates when ``elems_same`` reports them
    equal. Within a paper, the first occurrence in document order is kept and
    every later duplicate is removed. The document is then re-serialised and
    written back over ``filename`` (even when nothing was removed).

    Args:
        filename: Path of the XML file to clean up in place.
    """
    # Read as bytes so the parser decodes according to the XML declaration
    # instead of whatever the platform's locale encoding happens to be.
    with open(filename, 'rb') as f:
        root = etree.parse(f)

    for paper in root.xpath('//paper'):
        # Keep the first of each group of identical videos and drop the later
        # ones: presumably those were added most recently, and always keeping
        # the first makes the result consistent. The xpath result is a
        # separate list, so removing from ``paper`` while looping is safe.
        kept = []
        for video in paper.xpath('./video'):
            if any(elems_same(video, earlier) for earlier in kept):
                paper.remove(video)
            else:
                kept.append(video)

    serialized = etree.tostring(
        root, pretty_print=True, xml_declaration=True, encoding='UTF-8'
    )
    string_rep = serialized.decode('utf-8')

    # Hacky textual fix-up of the whitespace around closing </paper> tags.
    # NOTE(review): in this copy of the script each bad/good pair is the same
    # literal, which makes both replacements no-ops; presumably the original
    # literals differed only in indentation whitespace -- confirm against the
    # original before relying on this.
    bad_paper1 = "\n </paper>\n <paper id="
    good_paper1 = "\n </paper>\n <paper id="
    string_rep = string_rep.replace(bad_paper1, good_paper1)
    bad_paper2 = "\n </paper>\n </volume>"
    good_paper2 = "\n </paper>\n </volume>"
    string_rep = string_rep.replace(bad_paper2, good_paper2)

    # The serialisation declares UTF-8, so the bytes on disk must be UTF-8
    # regardless of the locale default.
    with open(filename, 'w', encoding='utf-8') as f:
        f.write(string_rep)
def main():
    """Run ``handle_file`` on every XML file matched under the data directory.

    The matches are collected into a list up front so the progress bar knows
    the total number of files.
    """
    xml_paths = list(glob.glob('acl-anthology/data/xml/*.xml'))
    for xml_path in tqdm(xml_paths):
        handle_file(xml_path)
# Script entry point: run the clean-up only when executed directly,
# not when this module is imported.
if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment