Skip to content

Instantly share code, notes, and snippets.

@longhotsummer
Created December 11, 2019 08:34
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save longhotsummer/08d1d2e9eb5494638986cc288ab91157 to your computer and use it in GitHub Desktop.
Save longhotsummer/08d1d2e9eb5494638986cc288ab91157 to your computer and use it in GitHub Desktop.
Tests xmldiff on some basic AKN HTML
from xmldiff import main, formatting
import lxml.etree
import lxml.html
# XSLT 1.0 stylesheet that rewrites markup in the
# "http://namespaces.shoobx.com/diff" namespace (prefix "diff") as follows:
#   * @diff:insert-formatting -> a class attribute with the value "ins"
#   * @diff:insert            -> a temporary "classx" attribute holding "ins "
#                                followed by the owning element's class value
#   * @diff:delete            -> a temporary "classx" attribute holding "del "
#                                followed by the owning element's class value
#   * <diff:delete> elements  -> <del> wrapping the processed children
#   * <diff:insert> elements  -> <ins> wrapping the processed children
#   * every other attribute and node is copied through unchanged
# The "classx" attributes are renamed to "class" in HTMLFormatter.render.
XSLT = u'''<?xml version="1.0"?>
<xsl:stylesheet version="1.0"
xmlns:diff="http://namespaces.shoobx.com/diff"
xmlns:xsl="http://www.w3.org/1999/XSL/Transform">
<xsl:template match="@diff:insert-formatting">
<xsl:attribute name="class">
<xsl:value-of select="'ins'"/>
</xsl:attribute>
</xsl:template>
<xsl:template match="@diff:insert">
<xsl:attribute name="classx">
<xsl:value-of select="'ins '"/>
<xsl:value-of select="../@class"/>
</xsl:attribute>
</xsl:template>
<xsl:template match="@diff:delete">
<xsl:attribute name="classx">
<xsl:value-of select="'del '"/>
<xsl:value-of select="../@class"/>
</xsl:attribute>
</xsl:template>
<xsl:template match="diff:delete">
<del><xsl:apply-templates /></del>
</xsl:template>
<xsl:template match="diff:insert">
<ins><xsl:apply-templates /></ins>
</xsl:template>
<xsl:template match="@* | node()">
<xsl:copy>
<xsl:apply-templates select="@* | node()"/>
</xsl:copy>
</xsl:template>
</xsl:stylesheet>'''
# The stylesheet source parsed into an lxml element tree at import time; it is
# compiled into a callable transform with lxml.etree.XSLT where it is used.
XSLT_TEMPLATE = lxml.etree.fromstring(XSLT)
class HTMLFormatter(formatting.XMLFormatter):
    """Formatter that runs the diff tree through ``XSLT_TEMPLATE`` first.

    The transformed tree is then handed to the parent class's ``render``.
    """

    def render(self, result):
        """Apply the stylesheet to ``result`` and render the outcome.

        Args:
            result: the tree to render; it is passed directly to the
                compiled XSLT transform.

        Returns:
            Whatever the parent class's ``render`` returns for the
            transformed tree.
        """
        stylesheet = lxml.etree.XSLT(XSLT_TEMPLATE)
        html_tree = stylesheet(result)

        # The stylesheet writes the combined class list into a temporary
        # "classx" attribute (per the original author's note, XSLT would not
        # let the value be added onto the existing attribute directly), so
        # the attribute is renamed to "class" here.
        for element in html_tree.xpath('//*[@classx]'):
            combined = element.attrib.pop('classx')
            element.set('class', combined)

        return super(HTMLFormatter, self).render(html_tree)
# Sample markup, original revision: section 6 with a list introduction and
# three list items numbered (a), (b) and (c).
old_s = '''
<section class="akn-section" id="section-6" data-id="section-6"><h3>6. Power to control production, sale, etc., of drugs to which Part II applies</h3>
<section class="akn-paragraph akn--no-indent" id="section-6.paragraph0" data-id="section-6.paragraph0"><span class="akn-content"><span class="akn-blockList" id="section-6.paragraph0.list0" data-id="section-6.paragraph0.list0"><span class="akn-listIntroduction">The Minister may by regulations-</span><span class="akn-item" id="section-6.paragraph0.list0.a" data-id="section-6.paragraph0.list0.a"><span class="akn-num">(a)</span><span class="akn-p">provide for controlling or restricting the production, possession, sale and distribution of drugs to which this Part applies;</span></span><span class="akn-item" id="section-6.paragraph0.list0.b" data-id="section-6.paragraph0.list0.b"><span class="akn-num">(b)</span><span class="akn-p">provide for prohibiting the production, possession, sale or distribution of any drug to which this Part applies except by persons licensed or otherwise authorised in that behalf by the Minister, and the cultivation of plants from which such drugs are derived;</span></span><span class="akn-item" id="section-6.paragraph0.list0.c" data-id="section-6.paragraph0.list0.c"><span class="akn-num">(c)</span><span class="akn-p">prescribe measures to be taken for the eradication of plants, to which regulations made under paragraph (b) apply, found to be growing wild.</span></span></span></span></section></section>
'''
# Sample markup, revised: item (b) is renumbered to (aa) (its id and data-id
# change to match), the word "really" is inserted into item (c), and a
# trailing "and some closeout." span is added after the list.
new_s = '''
<section class="akn-section" id="section-6" data-id="section-6"><h3>6. Power to control production, sale, etc., of drugs to which Part II applies</h3>
<section class="akn-paragraph akn--no-indent" id="section-6.paragraph0" data-id="section-6.paragraph0"><span class="akn-content"><span class="akn-blockList" id="section-6.paragraph0.list0" data-id="section-6.paragraph0.list0"><span class="akn-listIntroduction">The Minister may by regulations-</span><span class="akn-item" id="section-6.paragraph0.list0.a" data-id="section-6.paragraph0.list0.a"><span class="akn-num">(a)</span><span class="akn-p">provide for controlling or restricting the production, possession, sale and distribution of drugs to which this Part applies;</span></span><span class="akn-item" id="section-6.paragraph0.list0.aa" data-id="section-6.paragraph0.list0.aa"><span class="akn-num">(aa)</span><span class="akn-p">provide for prohibiting the production, possession, sale or distribution of any drug to which this Part applies except by persons licensed or otherwise authorised in that behalf by the Minister, and the cultivation of plants from which such drugs are derived;</span></span><span class="akn-item" id="section-6.paragraph0.list0.c" data-id="section-6.paragraph0.list0.c"><span class="akn-num">(c)</span><span class="akn-p">prescribe measures to be really taken for the eradication of plants, to which regulations made under paragraph (b) apply, found to be growing wild.</span></span></span><span class="akn-p">and some closeout.</span></span></section></section>
'''
# Parse both revisions into lxml element trees.
old_html = lxml.etree.fromstring(old_s)
new_html = lxml.etree.fromstring(new_s)

# Remove the identifier attributes before diffing. In the sample data they
# change whenever an item is renumbered (e.g. "list0.b" -> "list0.aa"), so
# leaving them in would presumably add attribute-only changes to the output.
# Each attribute is removed independently with a default, so an element that
# carries only one of the two no longer raises KeyError or gets skipped.
for html in [old_html, new_html]:
    for node in html.xpath('//*[@id or @data-id]'):
        node.attrib.pop('id', None)
        node.attrib.pop('data-id', None)

# Only the HTML formatter is used; the plain XMLFormatter that used to be
# created here was overwritten before anything read it.
formatter = HTMLFormatter(normalize=formatting.WS_NONE, pretty_print=True)
diff = main.diff_trees(old_html, new_html, formatter=formatter, diff_options={
    'F': 0.75,
    'uniqueattrs': [],
    'ratio_mode': 'fast',
})
print(diff)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment