Skip to content

Instantly share code, notes, and snippets.

@frederik-elwert
Created January 2, 2015 20:54
Show Gist options
  • Save frederik-elwert/fef31d94b3ef4589a983 to your computer and use it in GitHub Desktop.
Save frederik-elwert/fef31d94b3ef4589a983 to your computer and use it in GitHub Desktop.
Flat to hierarchical XML
Display the source blob
Display the rendered blob
Raw
{
"metadata": {
"name": ""
},
"nbformat": 3,
"nbformat_minor": 0,
"worksheets": [
{
"cells": [
{
"cell_type": "code",
"collapsed": false,
"input": [
"from lxml import etree\n",
"\n",
"origtree = etree.XML('''\n",
"<sp xml:id=\"sp-0001\" who=\"#Egeon_Err\">\n",
"<speaker xml:id=\"spk-0001\">\n",
"<w xml:id=\"w0000410\">EGEON</w>\n",
"</speaker>\n",
"<ab xml:id=\"ab-0001\">\n",
"<lb xml:id=\"lb-00009\"/>\n",
"<milestone unit=\"ftln\" xml:id=\"ftln-0001\" n=\"1.1.1\" ana=\"#verse\"\n",
"corresp=\"#w0000420 #p0000430 #c0000440 #w0000450 #p0000460 #c0000470\n",
"#w0000480 #c0000490 #w0000500 #c0000510 #w0000520 #c0000530 #w0000540\n",
"#p0000550\"/>\n",
"<w xml:id=\"w0000420\" n=\"1.1.1\">Proceed</w>\n",
"<pc xml:id=\"p0000430\" n=\"1.1.1\">,</pc>\n",
"<c xml:id=\"c0000440\" n=\"1.1.1\"> </c>\n",
"<w xml:id=\"w0000450\" n=\"1.1.1\">Solinus</w>\n",
"<pc xml:id=\"p0000460\" n=\"1.1.1\">,</pc>\n",
"<c xml:id=\"c0000470\" n=\"1.1.1\"> </c>\n",
"<w xml:id=\"w0000480\" n=\"1.1.1\">to</w>\n",
"<c xml:id=\"c0000490\" n=\"1.1.1\"> </c>\n",
"<w xml:id=\"w0000500\" n=\"1.1.1\">procure</w>\n",
"<c xml:id=\"c0000510\" n=\"1.1.1\"> </c>\n",
"<w xml:id=\"w0000520\" n=\"1.1.1\">my</w>\n",
"<c xml:id=\"c0000530\" n=\"1.1.1\"> </c>\n",
"<w xml:id=\"w0000540\" n=\"1.1.1\">fall</w>\n",
"<pc xml:id=\"p0000550\" n=\"1.1.1\">,</pc>\n",
"<lb xml:id=\"lb-00010\"/>\n",
"<milestone unit=\"ftln\" xml:id=\"ftln-0002\" n=\"1.1.2\" ana=\"#verse\"\n",
"corresp=\"#w0000560 #c0000570 #w0000580 #c0000590 #w0000600 #c0000610\n",
"#w0000620 #c0000630 #w0000640 #c0000650 #w0000660 #c0000670 #w0000680\n",
"#c0000690 #w0000700 #c0000710 #w0000720 #c0000730 #w0000740 #p0000750\"/>\n",
"<w xml:id=\"w0000560\" n=\"1.1.2\">And</w>\n",
"<c xml:id=\"c0000570\" n=\"1.1.2\"> </c>\n",
"<w xml:id=\"w0000580\" n=\"1.1.2\">by</w>\n",
"<c xml:id=\"c0000590\" n=\"1.1.2\"> </c>\n",
"<w xml:id=\"w0000600\" n=\"1.1.2\">the</w>\n",
"<c xml:id=\"c0000610\" n=\"1.1.2\"> </c>\n",
"<w xml:id=\"w0000620\" n=\"1.1.2\">doom</w>\n",
"<c xml:id=\"c0000630\" n=\"1.1.2\"> </c>\n",
"<w xml:id=\"w0000640\" n=\"1.1.2\">of</w>\n",
"<c xml:id=\"c0000650\" n=\"1.1.2\"> </c>\n",
"<w xml:id=\"w0000660\" n=\"1.1.2\">death</w>\n",
"<c xml:id=\"c0000670\" n=\"1.1.2\"> </c>\n",
"<w xml:id=\"w0000680\" n=\"1.1.2\">end</w>\n",
"<c xml:id=\"c0000690\" n=\"1.1.2\"> </c>\n",
"<w xml:id=\"w0000700\" n=\"1.1.2\">woes</w>\n",
"<c xml:id=\"c0000710\" n=\"1.1.2\"> </c>\n",
"<w xml:id=\"w0000720\" n=\"1.1.2\">and</w>\n",
"<c xml:id=\"c0000730\" n=\"1.1.2\"> </c>\n",
"<w xml:id=\"w0000740\" n=\"1.1.2\">all</w>\n",
"<pc xml:id=\"p0000750\" n=\"1.1.2\">.</pc>\n",
"</ab>\n",
"</sp>\n",
"''', parser=etree.XMLParser(remove_blank_text=True))"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 1
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"from copy import deepcopy\n",
"tree = deepcopy(origtree)"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 2
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"for sp in tree.xpath('//sp'):\n",
"    ab = sp.xpath('ab')[0]  # Assumes only one ab per sp.\n",
"    for milestone in ab.xpath('milestone'):\n",
"        line = etree.SubElement(sp, 'l')\n",
"        corr_ids = [id_.lstrip('#') for id_ in milestone.get('corresp').split()]\n",
"        for id_ in corr_ids:\n",
"            elem = sp.xpath('id($cid)', cid=id_)[0]\n",
"            line.append(elem)\n",
"    sp.remove(ab)\n",
"print(etree.tostring(tree, pretty_print=True, encoding='unicode'))"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"<sp xml:id=\"sp-0001\" who=\"#Egeon_Err\">\n",
"  <speaker xml:id=\"spk-0001\">\n",
"    <w xml:id=\"w0000410\">EGEON</w>\n",
"  </speaker>\n",
"  <l>\n",
"    <w xml:id=\"w0000420\" n=\"1.1.1\">Proceed</w>\n",
"    <pc xml:id=\"p0000430\" n=\"1.1.1\">,</pc>\n",
"    <c xml:id=\"c0000440\" n=\"1.1.1\"> </c>\n",
"    <w xml:id=\"w0000450\" n=\"1.1.1\">Solinus</w>\n",
"    <pc xml:id=\"p0000460\" n=\"1.1.1\">,</pc>\n",
"    <c xml:id=\"c0000470\" n=\"1.1.1\"> </c>\n",
"    <w xml:id=\"w0000480\" n=\"1.1.1\">to</w>\n",
"    <c xml:id=\"c0000490\" n=\"1.1.1\"> </c>\n",
"    <w xml:id=\"w0000500\" n=\"1.1.1\">procure</w>\n",
"    <c xml:id=\"c0000510\" n=\"1.1.1\"> </c>\n",
"    <w xml:id=\"w0000520\" n=\"1.1.1\">my</w>\n",
"    <c xml:id=\"c0000530\" n=\"1.1.1\"> </c>\n",
"    <w xml:id=\"w0000540\" n=\"1.1.1\">fall</w>\n",
"    <pc xml:id=\"p0000550\" n=\"1.1.1\">,</pc>\n",
"  </l>\n",
"  <l>\n",
"    <w xml:id=\"w0000560\" n=\"1.1.2\">And</w>\n",
"    <c xml:id=\"c0000570\" n=\"1.1.2\"> </c>\n",
"    <w xml:id=\"w0000580\" n=\"1.1.2\">by</w>\n",
"    <c xml:id=\"c0000590\" n=\"1.1.2\"> </c>\n",
"    <w xml:id=\"w0000600\" n=\"1.1.2\">the</w>\n",
"    <c xml:id=\"c0000610\" n=\"1.1.2\"> </c>\n",
"    <w xml:id=\"w0000620\" n=\"1.1.2\">doom</w>\n",
"    <c xml:id=\"c0000630\" n=\"1.1.2\"> </c>\n",
"    <w xml:id=\"w0000640\" n=\"1.1.2\">of</w>\n",
"    <c xml:id=\"c0000650\" n=\"1.1.2\"> </c>\n",
"    <w xml:id=\"w0000660\" n=\"1.1.2\">death</w>\n",
"    <c xml:id=\"c0000670\" n=\"1.1.2\"> </c>\n",
"    <w xml:id=\"w0000680\" n=\"1.1.2\">end</w>\n",
"    <c xml:id=\"c0000690\" n=\"1.1.2\"> </c>\n",
"    <w xml:id=\"w0000700\" n=\"1.1.2\">woes</w>\n",
"    <c xml:id=\"c0000710\" n=\"1.1.2\"> </c>\n",
"    <w xml:id=\"w0000720\" n=\"1.1.2\">and</w>\n",
"    <c xml:id=\"c0000730\" n=\"1.1.2\"> </c>\n",
"    <w xml:id=\"w0000740\" n=\"1.1.2\">all</w>\n",
"    <pc xml:id=\"p0000750\" n=\"1.1.2\">.</pc>\n",
"  </l>\n",
"</sp>\n",
"\n"
]
}
],
"prompt_number": 3
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"from copy import deepcopy\n",
"tree = deepcopy(origtree)"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 4
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"for sp in tree.xpath('//sp'):\n",
"    ab = sp.xpath('ab')[0]  # Assumes only one ab per sp.\n",
"    for milestone in ab.xpath('milestone'):\n",
"        line = etree.SubElement(sp, 'l')\n",
"        elem = milestone.getnext()\n",
"        while True:\n",
"            if elem is None or elem.tag == 'milestone':\n",
"                break\n",
"            line.append(deepcopy(elem))  # Can't append directly, would break getnext().\n",
"            elem = elem.getnext()\n",
"    sp.remove(ab)\n",
"print(etree.tostring(tree, pretty_print=True, encoding='unicode'))"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"<sp xml:id=\"sp-0001\" who=\"#Egeon_Err\">\n",
"  <speaker xml:id=\"spk-0001\">\n",
"    <w xml:id=\"w0000410\">EGEON</w>\n",
"  </speaker>\n",
"  <l>\n",
"    <w xml:id=\"w0000420\" n=\"1.1.1\">Proceed</w>\n",
"    <pc xml:id=\"p0000430\" n=\"1.1.1\">,</pc>\n",
"    <c xml:id=\"c0000440\" n=\"1.1.1\"> </c>\n",
"    <w xml:id=\"w0000450\" n=\"1.1.1\">Solinus</w>\n",
"    <pc xml:id=\"p0000460\" n=\"1.1.1\">,</pc>\n",
"    <c xml:id=\"c0000470\" n=\"1.1.1\"> </c>\n",
"    <w xml:id=\"w0000480\" n=\"1.1.1\">to</w>\n",
"    <c xml:id=\"c0000490\" n=\"1.1.1\"> </c>\n",
"    <w xml:id=\"w0000500\" n=\"1.1.1\">procure</w>\n",
"    <c xml:id=\"c0000510\" n=\"1.1.1\"> </c>\n",
"    <w xml:id=\"w0000520\" n=\"1.1.1\">my</w>\n",
"    <c xml:id=\"c0000530\" n=\"1.1.1\"> </c>\n",
"    <w xml:id=\"w0000540\" n=\"1.1.1\">fall</w>\n",
"    <pc xml:id=\"p0000550\" n=\"1.1.1\">,</pc>\n",
"    <lb xml:id=\"lb-00010\"/>\n",
"  </l>\n",
"  <l>\n",
"    <w xml:id=\"w0000560\" n=\"1.1.2\">And</w>\n",
"    <c xml:id=\"c0000570\" n=\"1.1.2\"> </c>\n",
"    <w xml:id=\"w0000580\" n=\"1.1.2\">by</w>\n",
"    <c xml:id=\"c0000590\" n=\"1.1.2\"> </c>\n",
"    <w xml:id=\"w0000600\" n=\"1.1.2\">the</w>\n",
"    <c xml:id=\"c0000610\" n=\"1.1.2\"> </c>\n",
"    <w xml:id=\"w0000620\" n=\"1.1.2\">doom</w>\n",
"    <c xml:id=\"c0000630\" n=\"1.1.2\"> </c>\n",
"    <w xml:id=\"w0000640\" n=\"1.1.2\">of</w>\n",
"    <c xml:id=\"c0000650\" n=\"1.1.2\"> </c>\n",
"    <w xml:id=\"w0000660\" n=\"1.1.2\">death</w>\n",
"    <c xml:id=\"c0000670\" n=\"1.1.2\"> </c>\n",
"    <w xml:id=\"w0000680\" n=\"1.1.2\">end</w>\n",
"    <c xml:id=\"c0000690\" n=\"1.1.2\"> </c>\n",
"    <w xml:id=\"w0000700\" n=\"1.1.2\">woes</w>\n",
"    <c xml:id=\"c0000710\" n=\"1.1.2\"> </c>\n",
"    <w xml:id=\"w0000720\" n=\"1.1.2\">and</w>\n",
"    <c xml:id=\"c0000730\" n=\"1.1.2\"> </c>\n",
"    <w xml:id=\"w0000740\" n=\"1.1.2\">all</w>\n",
"    <pc xml:id=\"p0000750\" n=\"1.1.2\">.</pc>\n",
"  </l>\n",
"</sp>\n",
"\n"
]
}
],
"prompt_number": 5
}
],
"metadata": {}
}
]
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment