-
-
Save dalelane/a0514b2e283a882d9ef3 to your computer and use it in GitHub Desktop.
##########################################################################
#
# xmldiff
#
# Simple utility script to enable a diff of two XML files in a way
#  that ignores the order of attributes and elements.
#
# Dale Lane (email@dalelane.co.uk)
#  6 Oct 2014
#
##########################################################################
#
# Overview
#  The approach is to sort both files by attribute and element, and
#   then reuse an existing diff implementation on the sorted files.
#
# Arguments
#  <diffcommand> the command that should be run to diff the sorted files
#  <filename1>   the first XML file to diff
#  <filename2>   the second XML file to diff
#
# Background
#  http://dalelane.co.uk/blog/?p=3225
#
##########################################################################
import os, sys, subprocess, platform | |
import lxml.etree as le | |
from operator import attrgetter | |
# | |
# Check required arguments | |
if len(sys.argv) != 4:
    # The script needs exactly three arguments after its own name:
    # the diff command and the two XML files to compare.
    print("Usage: python xmldiff.py <diffcommand> <filename1> <filename2>")
    # Use sys.exit() rather than quit(): quit() is a helper installed by the
    # site module for interactive use, and it would report success (status 0)
    # for what is a usage error.
    sys.exit(1)
# | |
# Prepares the location of the temporary file that will be created by xmldiff | |
def createFileObj(prefix, name):
    """Describe an input XML file and the temporary sorted copy made for it.

    Arguments:
        prefix: label embedded in the temporary file name so that the two
            inputs get distinct temporary files even when they share a
            base name.
        name: path of the XML file, as given on the command line.

    Returns:
        A dict with two keys:
            "filename"    - absolute path of the original file
            "tmpfilename" - ".<prefix>.<basename of name>"; a relative name,
                            so it is resolved against the current working
                            directory when it is opened.
    """
    return {
        "filename": os.path.abspath(name),
        "tmpfilename": "." + prefix + "." + os.path.basename(name),
    }
# | |
# Function to sort XML elements by id | |
# (where the elements have an 'id' attribute that can be cast to an int) | |
def sortbyid(elem):
    """Return the sort key derived from an element's 'id' attribute.

    Arguments:
        elem: any object exposing get(name), e.g. an XML element.

    Returns:
        The 'id' attribute converted to an int, or 0 when the attribute is
        missing, empty, or cannot be parsed as an integer.
    """
    # Named raw_id rather than id so the builtin id() is not shadowed.
    raw_id = elem.get('id')
    if not raw_id:
        return 0
    try:
        return int(raw_id)
    except ValueError:
        # Non-numeric ids all share the neutral key 0, which leaves their
        # relative order to the (stable) sort.
        return 0
# | |
# Function to sort XML elements by their text contents | |
def sortbytext(elem):
    """Return the sort key derived from an element's text contents.

    Arguments:
        elem: any object exposing a .text attribute, e.g. an XML element.

    Returns:
        The element's text, or '' when the element has no text or its text
        consists only of whitespace.
    """
    text = elem.text
    # Whitespace-only text (typically the indentation before the first child
    # element) is not content: sortElements discards it when copying an
    # element. Treat it as empty here too, so that formatting differences
    # cannot take precedence over the sort by id.
    if not text or text.isspace():
        return ''
    return text
# | |
# Function to sort XML attributes alphabetically by key
#  The original item is left unmodified, and its attributes are
#  copied to the provided sorteditem
def sortAttrs(item, sorteditem):
    """Copy the attributes of item onto sorteditem in alphabetical key order.

    Arguments:
        item: source element; it is only read, never modified.
        sorteditem: destination element; each attribute is set on it in
            sorted-key order.
    """
    for key in sorted(item.keys()):
        sorteditem.set(key, item.get(key))
# | |
# Function to sort XML elements | |
# The sorted elements will be added as children of the provided newroot | |
# This is a recursive function, and will be called on each of the children | |
# of items. | |
def sortElements(items, newroot):
    """Append sorted copies of items (and their descendants) to newroot.

    The sort order is:
      1. by XML element name;
      2. for elements with the same name, by their text contents;
      3. for elements with the same name and text, by the integer value of
         their 'id' attribute;
      4. elements that tie on all three keep their original relative order
         (the sort is stable).

    Each copy carries the tag, the attributes (sorted by key), the text
    (unless it is missing or whitespace-only) and, recursively, the sorted
    children of the original. The originals are not modified.

    Arguments:
        items: list of sibling elements to sort.
        newroot: element that receives the sorted copies as children.
    """
    # Comments and processing instructions are nodes whose .tag is not a
    # string. They cannot be recreated with le.Element() and cannot be
    # ordered against string tags, so they are left out of the sorted copy.
    elements = [item for item in items if isinstance(item.tag, str)]

    # A single sort on a (name, text, id) tuple gives the same order as
    # three successive stable sorts by id, then text, then name.
    elements.sort(key=lambda item: (item.tag, sortbytext(item), sortbyid(item)))

    for item in elements:
        # Create a new item to represent the sorted version of this item,
        # and copy the tag name and (non-whitespace) text contents
        newitem = le.Element(item.tag)
        if item.text and not item.text.isspace():
            newitem.text = item.text

        # Copy the attributes (sorted by key) to the new item
        sortAttrs(item, newitem)

        # Copy the children of item (sorted) to the new item
        sortElements(list(item), newitem)

        # Append this sorted item to the sorted root
        newroot.append(newitem)
# | |
# Function to sort the provided XML file | |
# fileobj.filename will be left untouched | |
# A new sorted copy of it will be created at fileobj.tmpfilename | |
def sortFile(fileobj):
    """Write a sorted copy of an XML file to its temporary location.

    Arguments:
        fileobj: dict with the keys "filename" (the XML file to read, left
            untouched) and "tmpfilename" (where the sorted copy is written),
            as built by createFileObj.
    """
    # Comments and processing instructions are dropped at parse time; they
    # play no part in the comparison and are not regular elements.
    parser = le.XMLParser(remove_comments=True, remove_pis=True)

    # Read the input as bytes so that the XML parser, not Python's text
    # layer, decodes it using the file's own BOM / encoding declaration.
    with open(fileobj['filename'], 'rb') as original:
        # parse the XML file and get a pointer to the top
        xmldoc = le.parse(original, parser=parser)
    xmlroot = xmldoc.getroot()

    # create a new XML element that will be the top of
    #  the sorted copy of the XML file
    newxmlroot = le.Element(xmlroot.tag)

    # create the sorted copy of the XML file
    sortAttrs(xmlroot, newxmlroot)
    sortElements(list(xmlroot), newxmlroot)

    # write the sorted XML file to the temp file
    newtree = le.ElementTree(newxmlroot)
    with open(fileobj['tmpfilename'], 'wb') as newfile:
        newtree.write(newfile, pretty_print=True)
# | |
# sort each of the specified files | |
# shlex.quote is the Python 3 name; pipes.quote is the Python 2 equivalent.
try:
    from shlex import quote as _shellquote
except ImportError:
    from pipes import quote as _shellquote

filefrom = createFileObj("from", sys.argv[2])
fileto = createFileObj("to", sys.argv[3])

try:
    # sort each of the specified files
    sortFile(filefrom)
    sortFile(fileto)

    #
    # invoke the requested diff command to compare the two sorted files
    if platform.system() == "Windows":
        # NOTE(review): the temporary file names are passed unquoted here, so
        #  names containing spaces will be split by cmd - confirm and quote.
        sp = subprocess.Popen([ "cmd", "/c", sys.argv[1] + " " + filefrom['tmpfilename'] + " " + fileto['tmpfilename'] ])
        sp.communicate()
    else:
        # The diff command itself is passed through verbatim (it may carry
        #  its own options), but the file paths are quoted so that spaces or
        #  shell metacharacters in them are not interpreted by bash.
        command = " ".join([
            sys.argv[1],
            _shellquote(os.path.abspath(filefrom['tmpfilename'])),
            _shellquote(os.path.abspath(fileto['tmpfilename'])),
        ])
        sp = subprocess.Popen([ "/bin/bash", "-i", "-c", command ])
        sp.communicate()
finally:
    #
    # cleanup - delete the temporary sorted files after the diff terminates,
    #  including when sorting or the diff command failed part-way through
    for fileobj in (filefrom, fileto):
        if os.path.exists(fileobj['tmpfilename']):
            os.remove(fileobj['tmpfilename'])
This breaks if there are any comments in the XML. I added the following check in sortElements:
    # Once sorted, we sort each of the items
    for item in items:
        if item.tag is le.Comment:
            continue
I am relatively new to using the terminal on the Mac. When I try your command line instruction python xmldiff.py diffmerge FileA.xml FileB.xml it returns the error "bash: diffmerge: command not found". What am I doing wrong?
I've made this into a repo here: https://github.com/allanlewis/xmldiff
Is there any Git Client that can support this function?
I need Xml Ignore order function to work when I am comparing my XML through tortoisegit or any client.
@dalelane what would be the license for this code?
it is not working for following set of files
1.xml
2
rnc1
na0
1
ssn
1
0x21a7
1
rnc1
na0
1
ssn
1
0x21a7
3
rnc1
na0
1
ssn
1
0x21a7
2.xml
2
rnc1
na0
1
ssn
1
0x21a7
3
rnc1
na0
1
ssn
1
0x21a7
1
rnc1
na0
1
ssn
1
0x21a7
Even though the files are the same and only the input order of load-sharing-item has changed, it still shows a difference:
$ python Diff_xml.py diff 1.xml 2.xml
14c14
< 1
<load-sharing-id>3</load-sharing-id>
23c23
< 3
<load-sharing-id>1</load-sharing-id>
following error happens to me:
lxml.etree.XMLSyntaxError: Start tag expected, '<' not found, line 1, column 1
from line 126
even though the file starts with a '<'
Great code, thanks! Would you mind adding a license to it? I would love to use this, but don't want to if it's not licensed for re-use!
@dalelane what would be the license for this code? See http://choosealicense.com/ for help.
I am getting this error:
Traceback (most recent call last):
File "xmldiff.py", line 146, in <module>
sortFile(filefrom)
File "xmldiff.py", line 135, in sortFile
sortElements(list(xmlroot), newxmlroot)
File "xmldiff.py", line 105, in sortElements
newitem = le.Element(item.tag)
File "src/lxml/etree.pyx", line 2996, in lxml.etree.Element
File "src/lxml/apihelpers.pxi", line 95, in lxml.etree._makeElement
File "src/lxml/apihelpers.pxi", line 1584, in lxml.etree._getNsTag
File "src/lxml/apihelpers.pxi", line 1602, in lxml.etree.__getNsTag
File "src/lxml/apihelpers.pxi", line 1472, in lxml.etree._utf8
TypeError: Argument must be bytes or unicode, got 'cython_function_or_method'
@harryyuanfeng you may have comments in your XML.
You can ignore them by changing the "sortFile" function:
    parser = le.XMLParser(remove_comments=True)
    # parse the XML file and get a pointer to the top
    xmldoc = le.parse(original, parser=parser)
This works nicely; on OS X, however, the Popen command should be as below:
    sp = subprocess.Popen([ "open", "-a", sys.argv[1] , "--args" , os.path.abspath(filefrom['tmpfilename']) , os.path.abspath(fileto['tmpfilename']) ])