Skip to content

Instantly share code, notes, and snippets.

@pulsejet
Created May 31, 2018 15:34
Show Gist options
  • Save pulsejet/8f37877556eddc7131bd84fbe40af631 to your computer and use it in GitHub Desktop.
Save pulsejet/8f37877556eddc7131bd84fbe40af631 to your computer and use it in GitHub Desktop.
import nltk
import xml.etree.ElementTree as ET
import copy
import difflib
# Load the input document once at import time; `root` is what the sentence
# loop at the bottom of the script iterates over.
tree = ET.ElementTree(file='xmlsdev.xml')
root = tree.getroot()
def xml_stringify(node):
    """Return the concatenated text content of *node*.

    All text found inside the element and its descendants is joined in
    document order; tags and attributes are dropped.

    Args:
        node: An ``xml.etree.ElementTree.Element`` (anything exposing
            ``itertext()``).

    Returns:
        The inner text as a single ``str`` (empty if there is none).
    """
    return "".join(node.itertext())
def custom_tokenize(sentence):
    """Tokenize the text content of an XML *sentence* element.

    The element is flattened to plain text with ``xml_stringify`` and every
    hyphen is padded with spaces before the text is handed to
    ``nltk.word_tokenize``.

    Args:
        sentence: The XML element whose text should be tokenized.

    Returns:
        Whatever ``nltk.word_tokenize`` returns for the prepared text.
    """
    text = xml_stringify(sentence)
    # Surround '-' with spaces before tokenizing (presumably so that
    # hyphenated words are split into separate tokens).
    spaced = text.replace('-', ' - ')
    return nltk.word_tokenize(spaced)
def remove_all_nodes(parent, childname):
    """Blank out every direct child of *parent* matching *childname*.

    Despite the name, matching elements are not detached from the tree:
    each one is reset with ``Element.clear()``, which drops its text,
    attributes and subelements. The text that follows the element (its
    ``tail``) is saved and restored so the surrounding text of *parent* is
    left intact.

    Only direct children are considered, because ``findall`` is called with
    a bare tag name; matches nested deeper in the tree are left untouched.

    Args:
        parent: Element whose children are inspected; modified in place.
        childname: Tag name (or ``findall`` path) selecting the children.

    Returns:
        None.
    """
    for child in parent.findall(childname):
        # clear() also wipes .tail, so keep it and put it back afterwards.
        tail = child.tail
        child.clear()
        child.tail = tail
def diff_to_xml(diff):
    """Convert each line of *diff* with ``diff_line_to_xml``.

    Args:
        diff: Iterable of diff lines (the script passes the generator
            returned by ``difflib.ndiff``).

    Returns:
        A list with one entry per input line, in order. Entries may be
        ``None`` for lines that ``diff_line_to_xml`` does not translate, so
        callers are expected to filter falsy values.
    """
    return [diff_line_to_xml(x) for x in diff]
def diff_line_to_xml(line):
    """Translate one diff line into its output label.

    The first character of *line* selects the result; the payload is taken
    from index 2 onwards, i.e. after a two-character prefix such as
    ``'- '`` or ``'+ '``.

    Args:
        line: A single diff line.

    Returns:
        ``'c'`` for a line starting with a space,
        ``'<del>payload</del>'`` for a line starting with ``'-'``,
        ``'<ins>payload</ins>'`` for a line starting with ``'+'``,
        and ``None`` for anything else (for example ``'?'`` lines or an
        empty string).
    """
    # Slice rather than index so an empty string yields '' instead of
    # raising IndexError.
    marker = line[:1]
    if marker == ' ':
        return 'c'
    if marker == '-':
        return '<del>' + line[2:] + '</del>'
    if marker == '+':
        return '<ins>' + line[2:] + '</ins>'
    # NOTE(review): the payload is not XML-escaped; confirm whether the
    # consumer of the output needs '<' / '&' in tokens to be escaped.
    return None
# For every <sentence> element, diff the tokens of two variants of the
# sentence and write the resulting labels to 'corrected', one label per
# line, with each sentence's labels terminated by a newline.
with open('corrected', 'w', encoding='utf-8') as file:
    for i, sentence in enumerate(root.iter('sentence')):
        # Copy before mutating: the original keeps its <del> content and
        # has <ins> blanked, the copy keeps <ins> and has <del> blanked.
        sentence_c = copy.deepcopy(sentence)
        remove_all_nodes(sentence, 'ins')
        remove_all_nodes(sentence_c, 'del')
        s1 = custom_tokenize(sentence)
        s2 = custom_tokenize(sentence_c)
        diff = difflib.ndiff(s1, s2)
        # diff_to_xml yields None for lines it does not translate (such as
        # ndiff's '?' hint lines); the filter drops those.
        file.write('\n'.join([x for x in diff_to_xml(diff) if x]))
        file.write('\n')
        # Progress indicator: print the index of every 100th sentence.
        if i % 100 == 0:
            print(i)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment