Skip to content

Instantly share code, notes, and snippets.

@pulsejet
Created May 31, 2018 13:47
Show Gist options
  • Save pulsejet/d3194ac75414f19f418badbfae7c5c2a to your computer and use it in GitHub Desktop.
Save pulsejet/d3194ac75414f19f418badbfae7c5c2a to your computer and use it in GitHub Desktop.
Gets the diffs directly from XML
# NOTE: Currently this prints the diff for the second data element and exits
import nltk
import xml.etree.ElementTree as ET
import copy
import difflib
# Parse the input document once, at module load; the path is resolved
# relative to the current working directory.
tree = ET.parse('xmlsdev.xml')
# Root element of the parsed document; the main loop iterates over its
# 'sentence' descendants.
root = tree.getroot()
def custom_tokenize(sentence):
    """Return the word tokens of an XML element's text content.

    The text of *sentence* and of all its descendants (including tails) is
    concatenated in document order, every '-' is surrounded with spaces,
    and the result is passed to ``nltk.word_tokenize``.

    Args:
        sentence: An ElementTree element (anything exposing ``itertext()``).

    Returns:
        Whatever ``nltk.word_tokenize`` returns for the prepared string.
    """
    flat_text = "".join(sentence.itertext())
    # Pad hyphens with spaces before handing the text to the tokenizer.
    padded_text = flat_text.replace('-', ' - ')
    return nltk.word_tokenize(padded_text)
def remove_all_nodes(parent, childname):
    """Empty every direct child of *parent* that matches *childname*.

    Matching elements are not detached from the tree. Each one is cleared
    (attributes, text and sub-elements dropped) while its tail text is kept,
    so the text that follows the element in the parent is preserved.
    Only direct children are considered; deeper descendants are untouched.

    Args:
        parent: ElementTree element whose children are inspected.
        childname: Tag name / path passed to ``parent.findall``.
    """
    matches = parent.findall(childname)
    for node in matches:
        # Element.clear() also wipes the tail, so save it and put it back.
        trailing_text = node.tail
        node.clear()
        node.tail = trailing_text
# For every 'sentence' element, build two token streams and diff them:
#   * `sentence` with its direct 'ins' children blanked out
#     (presumably the text before correction), and
#   * a deep copy with its direct 'del' children blanked out
#     (presumably the text after correction).
# NOTE: this is still a debugging scaffold -- only a newline per sentence is
# written to 'corrected', and the script terminates right after printing the
# diff of the second sentence (index 1).
with open('corrected', 'w', encoding='utf-8') as file:
    for i, sentence in enumerate(root.iter('sentence')):
        # Copy first: remove_all_nodes mutates the element in place.
        sentence_c = copy.deepcopy(sentence)
        remove_all_nodes(sentence, 'ins')
        remove_all_nodes(sentence_c, 'del')

        s1 = custom_tokenize(sentence)
        s2 = custom_tokenize(sentence_c)
        # ndiff returns a lazy generator; it is only consumed when printed.
        diff = difflib.ndiff(s1, s2)

        if i == 1:
            print('\n'.join(diff))
            # Raise SystemExit directly instead of calling exit(): exit() is
            # an interactive helper injected by the `site` module and is not
            # guaranteed to exist (e.g. under `python -S`). The `with` block
            # still closes the output file while the exception propagates.
            raise SystemExit

        file.write('\n')

        # Progress indicator every 100 sentences.
        if i % 100 == 0:
            print(i)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment