Skip to content

Instantly share code, notes, and snippets.

@PonteIneptique
Created February 6, 2019 15:24
Show Gist options
  • Save PonteIneptique/5ffa287420f87d8a441518b008164c64 to your computer and use it in GitHub Desktop.
Save PonteIneptique/5ffa287420f87d8a441518b008164c64 to your computer and use it in GitHub Desktop.
Attempt at a small function for the lxml parser that fixes ill-formed XML when possible
from lxml import etree as ET
import re
# Message reported by the parser when a closing tag does not match the
# innermost open element. Groups: open tag, its line, closing tag, its line,
# and the column reported for the closing tag.
_TAG_MISMATCH = re.compile(
    r"^Opening and ending tag mismatch: "
    r"([a-zA-Z\-_0-9]+) line ([0-9]+) and "
    r"([a-zA-Z\-_0-9]+), line ([0-9]+), column ([0-9]+)"
)


def _is_balanced(xml_string, tag):
    """Tell whether *tag* has as many opening as closing tags in *xml_string*."""
    name = re.escape(tag)
    # The name must be followed by whitespace or ">" so that "<name" does not
    # also count "<names>"; self-closing "<name/>" is skipped because it
    # never needs a closing tag. This is a textual heuristic, not a parse.
    opened = re.findall(r"<%s(?:\s[^<>]*)?(?<!/)>" % name, xml_string)
    closed = re.findall(r"</%s\s*>" % name, xml_string)
    return len(opened) == len(closed)


def _repair_mismatch(xml_string, message):
    """Apply one repair for the tag mismatch described by *message*.

    Return the repaired string, or ``None`` when *message* is not a tag
    mismatch or the reported position does not hold the expected closing tag.
    """
    found = _TAG_MISMATCH.match(message)
    if found is None:
        return None
    tag1, _, tag2, line2, col2 = found.groups()

    lines = xml_string.split("\n")
    line_index = int(line2) - 1
    if not 0 <= line_index < len(lines):
        return None
    line = lines[line_index]

    # The reported column is treated as the 1-based position right after the
    # ">" of the closing tag, so the tag occupies line[start:end].
    end = int(col2) - 1
    if end <= 0:
        return None
    start = line.rfind("</", 0, end)
    closing_tag = r"</%s\s*>" % re.escape(tag2)
    if start < 0 or re.fullmatch(closing_tag, line[start:end]) is None:
        # Splicing at a position that does not hold the closing tag would
        # corrupt the document, so give up instead.
        return None

    if _is_balanced(xml_string, tag1):
        # tag1 is closed elsewhere: the closing tag2 is the stray one, drop it
        lines[line_index] = line[:start] + line[end:]
    else:
        # tag1 is never closed: close it just before the closing tag2
        lines[line_index] = line[:start] + "</" + tag1 + ">" + line[start:]
    return "\n".join(lines)


def fix_xml(xml_string: str) -> str:
    """Try to repair mismatched tags in an ill-formed XML string.

    The string is parsed strictly; each time the parser reports an
    "Opening and ending tag mismatch" error, the offending closing tag is
    either removed (when the open element is balanced elsewhere in the
    document) or preceded by the missing closing tag of the open element.
    Parsing is retried until it succeeds.

    The repaired string is printed, followed by a ``----`` separator line,
    before being returned.

    :param xml_string: XML that is faulty
    :return: xml that should not be faulty
    :raises lxml.etree.XMLSyntaxError: when the parser error is not a tag
        mismatch, or when the mismatch cannot be located in the text
    """
    new_xml_string = xml_string
    while True:
        try:
            ET.fromstring(new_xml_string)
        except ET.XMLSyntaxError as error:
            repaired = _repair_mismatch(new_xml_string, str(error))
            if repaired is None:
                # Nothing this function knows how to fix: surface the
                # parser error unchanged.
                raise
            new_xml_string = repaired
        else:
            break
    print(new_xml_string)
    print("----")
    return new_xml_string
# Sample ill-formed fragments run through fix_xml, in order. Each literal is
# opened with four quote characters: the first three start the triple-quoted
# string and the fourth is part of the data, as is the "... " prefix on the
# last line of each sample.
_SAMPLES = (
    # An opened <name> that is never closed
    "<fragment>" + """"
<lb n="1"/><name><expan><abbr>Sex</abbr><ex>to</ex></expan>Vervicio
<lb n="2"/>Modestino</name> et <name>Verv
<lb n="4"/>
... """ + "</fragment>",
    # A closing </name> without an opening one, then an unclosed <name>
    "<fragment>" + """"
<lb n="2"/>Modestino</name> et <name>Verv
... """ + "</fragment>",
    # Several closing tags whose opening tags are missing
    "<fragment>" + """"
Sex</abbr><ex>to</ex></expan>Vervicio
<lb n="2"/>Modestino</name> et <name>Verv
<lb break="no" n="3"/>iciae Modestinae</name>
<lb n="4"/>
... """ + "</fragment>",
)

for _sample in _SAMPLES:
    fix_xml(_sample)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment