Skip to content

Instantly share code, notes, and snippets.

@astynax
Created August 20, 2013 13:14
Show Gist options
  • Save astynax/6281224 to your computer and use it in GitHub Desktop.
Save astynax/6281224 to your computer and use it in GitHub Desktop.
XML batteries
Display the source blob
Display the rendered blob
Raw
{
"metadata": {
"name": "XML"
},
"nbformat": 3,
"nbformat_minor": 0,
"worksheets": [
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": "# \u0411\u0430\u0442\u0430\u0440\u0435\u0439\u043a\u0438 XML \u0434\u043b\u044f Python"
},
{
"cell_type": "code",
"collapsed": false,
"input": "## big.svg [64\u041a\u0431]\nfbig = '/home/rinat/big.svg'\n## mini.xml [18\u041a\u0431]\nfmini = '/home/rinat/mini.xml'\n\n\ndef process(node):\n pass",
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 93
},
{
"cell_type": "markdown",
"metadata": {},
"source": "## minidom\n\n * \u043f\u043e\u0441\u0442\u0430\u0432\u043b\u044f\u0435\u0442\u0441\u044f \u0432 \u043a\u043e\u0440\u043e\u0431\u043a\u0435 \u0441 2.0\n * \u043c\u0435\u0434\u043b\u0435\u043d\u043d\u044b\u0439\n * \u043f\u0440\u043e\u0436\u043e\u0440\u043b\u0438\u0432\u044b\u0439\n * javastyle\n "
},
{
"cell_type": "code",
"collapsed": false,
"input": "from xml.dom import minidom\nimport timeit\n\ndef iterate_children(parent):\n child = parent.firstChild\n while child != None:\n yield child\n child = child.nextSibling\n\n#mini\n%timeit with open(fmini,'r') as xmlfile: minidom.parse(xmlfile)\n%timeit with open(fmini,'r') as xmlfile: iterate_children(minidom.parse(xmlfile))\n \n#big\n%timeit with open(fbig,'r') as xmlfile: minidom.parse(xmlfile)\n%timeit with open(fbig,'r') as xmlfile: iterate_children(minidom.parse(xmlfile))\n",
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": "100 loops, best of 3: 7.41 ms per loop\n100 loops, best of 3: 7.37 ms per loop"
},
{
"output_type": "stream",
"stream": "stdout",
"text": "\n100 loops, best of 3: 11.7 ms per loop"
},
{
"output_type": "stream",
"stream": "stdout",
"text": "\n100 loops, best of 3: 11.7 ms per loop"
},
{
"output_type": "stream",
"stream": "stdout",
"text": "\n"
}
],
"prompt_number": 88
},
{
"cell_type": "markdown",
"metadata": {},
"source": "## ElementTree\n\n * \u043f\u043e\u0441\u0442\u0430\u0432\u043b\u044f\u0435\u0442\u0441\u044f \u0432 \u043a\u043e\u0440\u043e\u0431\u043a\u0435 \u0441 2.5\n * \u0435\u0441\u0442\u044c \u0432\u0435\u0440\u0441\u0438\u0438 \u0438 \u0434\u043b\u044f \u0431\u043e\u043b\u0435\u0435 \u0440\u0430\u043d\u043d\u0435\u0433\u043e python\n * \u0441\u043e\u0431\u044b\u0442\u0438\u0439\u043d\u0430\u044f \u043e\u0431\u0440\u0430\u0431\u043e\u0442\u043a\u0430 (iterparse)\n * \u0435\u0441\u0442\u044c \u0421\u0418\u0448\u043d\u0430\u044f \u0440\u0435\u0430\u043b\u0438\u0437\u0430\u0446\u0438\u044f _cElementTree_\n"
},
{
"cell_type": "code",
"collapsed": false,
"input": "from xml.etree.ElementTree import ElementTree\n\n#mini\n%timeit with open(fmini,'r') as xmlfile: ElementTree().parse(xmlfile)\n%timeit with open(fmini,'r') as xmlfile: [process(node) for node in ElementTree().parse(xmlfile).iter()]\n \n#big\n%timeit with open(fbig,'r') as xmlfile: ElementTree().parse(xmlfile)\n%timeit with open(fbig,'r') as xmlfile: [process(node) for node in ElementTree().parse(xmlfile).iter()]",
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": "100 loops, best of 3: 7.83 ms per loop\n100 loops, best of 3: 8.3 ms per loop"
},
{
"output_type": "stream",
"stream": "stdout",
"text": "\n"
}
],
"prompt_number": 66
},
{
"cell_type": "heading",
"level": 2,
"metadata": {},
"source": "cElementTree"
},
{
"cell_type": "code",
"collapsed": false,
"input": "from xml.etree.cElementTree import ElementTree as cElementTree\n\n## mini.xml\n%timeit with open(fmini,'r') as xmlfile: cElementTree().parse(xmlfile)\n%timeit with open(fbig,'r') as xmlfile: [process(node) for node in cElementTree().parse(xmlfile).iter()]\n \n## big.xml\n%timeit with open(fbig,'r') as xmlfile: cElementTree().parse(xmlfile)\n%timeit with open(fbig,'r') as xmlfile: [process(node) for node in cElementTree().parse(xmlfile).iter()]",
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": "1000 loops, best of 3: 885 \u00b5s per loop\n1000 loops, best of 3: 1.33 ms per loop"
},
{
"output_type": "stream",
"stream": "stdout",
"text": "\n1000 loops, best of 3: 1.08 ms per loop"
},
{
"output_type": "stream",
"stream": "stdout",
"text": "\n1000 loops, best of 3: 1.48 ms per loop"
},
{
"output_type": "stream",
"stream": "stdout",
"text": "\n"
}
],
"prompt_number": 69
},
{
"cell_type": "heading",
"level": 2,
"metadata": {},
"source": "BeautifulSoup"
},
{
"cell_type": "code",
"collapsed": false,
"input": "from BeautifulSoup import BeautifulStoneSoup\n%timeit with open('/home/rinat/mini.xml','r') as xmlfile: BeautifulStoneSoup(xmlfile)\n%timeit with open('/home/rinat/mini.xml','r') as xmlfile: [process(node) for node in BeautifulStoneSoup(xmlfile).findAll()]",
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": "10 loops, best of 3: 42.3 ms per loop\n10 loops, best of 3: 49.5 ms per loop"
},
{
"output_type": "stream",
"stream": "stdout",
"text": "\n"
}
],
"prompt_number": 60
},
{
"cell_type": "heading",
"level": 2,
"metadata": {},
"source": "sax"
},
{
"cell_type": "code",
"collapsed": false,
"input": "import xml.sax\n\nclass Handler(xml.sax.ContentHandler):\n def startElement(self, name, attrs):\n pass\n \nparser = xml.sax.make_parser()\n\n#mini\n%timeit with open(fmini,\"r\") as xmlfile: parser.parse(xmlfile) \nparser.setContentHandler(Handler())\n%timeit with open(fmini,\"r\") as xmlfile: parser.parse(xmlfile)\n\n#big\n%timeit with open(fbig,\"r\") as xmlfile: parser.parse(xmlfile) \nparser.setContentHandler(Handler())\n%timeit with open(fbig,\"r\") as xmlfile: parser.parse(xmlfile)",
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": "100 loops, best of 3: 1.93 ms per loop\n100 loops, best of 3: 1.98 ms per loop"
},
{
"output_type": "stream",
"stream": "stdout",
"text": "\n100 loops, best of 3: 2.07 ms per loop"
},
{
"output_type": "stream",
"stream": "stdout",
"text": "\n100 loops, best of 3: 2.06 ms per loop"
},
{
"output_type": "stream",
"stream": "stdout",
"text": "\n"
}
],
"prompt_number": 89
},
{
"cell_type": "markdown",
"metadata": {},
"source": "## lxml\n\n * \u0441\u0430\u043c\u044b\u0439 \u0431\u044b\u0441\u0442\u0440\u044b\u0439 \u043d\u0430 \u0434\u0430\u043d\u043d\u044b\u0439 \u043c\u043e\u043c\u0435\u043d\u0442\n * \u0441\u043e\u0431\u044b\u0442\u0438\u0439\u043d\u044b\u0439 \u043f\u0430\u0440\u0441\u0435\u0440\n * \u043f\u043e\u0434\u0434\u0435\u0440\u0436\u043a\u0430 xslt\n * \u043e\u0431\u0435\u0440\u0442\u043a\u0430 \u043d\u0430\u0434 libxml2 \u0438 libxslt\n * \u043f\u0430\u0440\u0441\u0438\u043d\u0433 \u0434\u043e\u0440\u043e\u0436\u0435, \u0447\u0435\u043c cElementTree, \u0438\u0431\u043e \u0441\u043e\u0431\u0438\u0440\u0430\u0435\u0442\u0441\u044f \u0431\u043e\u043b\u044c\u0448\u0435 \u043c\u0435\u0442\u0430\u0434\u0430\u043d\u043d\u044b\u0445 (\u0441\u0441\u044b\u043b\u043a\u0438 \u043d\u0430 \u0440\u043e\u0434\u0438\u0442\u0435\u043b\u044f, \u043d\u0430\u043f\u0440\u0438\u043c\u0435\u0440)\n * \u0434\u043b\u044f \u0431\u043e\u043b\u044c\u0448\u0438\u0445 \u0444\u0430\u0439\u043b\u043e\u0432 \u043d\u0430\u0441\u0442\u043e\u044f\u0442\u0435\u043b\u044c\u043d\u043e \u0440\u0435\u043a\u043e\u043c\u0435\u043d\u0434\u0443\u0435\u0442\u0441\u044f \n - \u043a\u043b\u0430\u0441\u0441 Target\n - iterparse"
},
{
"cell_type": "code",
"collapsed": false,
"input": "from lxml import etree\n\n## mini\n%timeit with open(fmini,'r') as xmlfile: etree.parse(xmlfile)\n%timeit with open(fmini,'r') as xmlfile: [process(node) for node in etree.parse(xmlfile).iter()]\n \n## big\n%timeit with open(fbig,'r') as xmlfile: etree.parse(xmlfile)\n%timeit with open(fbig,'r') as xmlfile: [process(node) for node in etree.parse(xmlfile).iter()]",
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": "*"
},
{
"cell_type": "code",
"collapsed": false,
"input": "",
"language": "python",
"metadata": {},
"outputs": []
}
],
"metadata": {}
}
]
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment