Created
August 20, 2013 13:14
-
-
Save astynax/6281224 to your computer and use it in GitHub Desktop.
XML batteries
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"metadata": { | |
"name": "XML" | |
}, | |
"nbformat": 3, | |
"nbformat_minor": 0, | |
"worksheets": [ | |
{ | |
"cells": [ | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": "#\u0411\u0430\u0442\u0430\u0440\u0435\u0439\u043a\u0438 XML \u0434\u043b\u044f Python" | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": "## big.svg [64\u041a\u0431]\nfbig = '/home/rinat/big.svg'\n## mini.xml [18\u041a\u0431]\nfmini = '/home/rinat/mini.xml'\n\n\ndef process(node):\n pass", | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 93 | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": "##minidom\n\n * \u043f\u043e\u0441\u0442\u0430\u0432\u043b\u044f\u0435\u0442\u0441\u044f \u0432 \u043a\u043e\u0440\u043e\u0431\u043a\u0435 \u0441 2.0\n * \u043c\u0435\u0434\u043b\u0435\u043d\u043d\u044b\u0439\n * \u043f\u0440\u043e\u0436\u043e\u0440\u043b\u0438\u0432\u044b\u0439\n * javastyle\n " | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": "from xml.dom import minidom\nimport timeit\n\ndef iterate_children(parent):\n child = parent.firstChild\n while child != None:\n yield child\n child = child.nextSibling\n\n#mini\n%timeit with open(fmini,'r') as xmlfile: minidom.parse(xmlfile)\n%timeit with open(fmini,'r') as xmlfile: iterate_children(minidom.parse(xmlfile))\n \n#big\n%timeit with open(fbig,'r') as xmlfile: minidom.parse(xmlfile)\n%timeit with open(fbig,'r') as xmlfile: iterate_children(minidom.parse(xmlfile))\n", | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": "100 loops, best of 3: 7.41 ms per loop\n100 loops, best of 3: 7.37 ms per loop" | |
}, | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": "\n100 loops, best of 3: 11.7 ms per loop" | |
}, | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": "\n100 loops, best of 3: 11.7 ms per loop" | |
}, | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": "\n" | |
} | |
], | |
"prompt_number": 88 | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": "##ElementTree\n\n * \u043f\u043e\u0441\u0442\u0430\u0432\u043b\u044f\u0435\u0442\u0441\u044f \u0432 \u043a\u043e\u0440\u043e\u0431\u043a\u0435 \u0441 2.4\n * \u0435\u0441\u0442\u044c \u0432\u0435\u0440\u0441\u0438\u0438 \u0438 \u0434\u043b\u044f \u0431\u043e\u043b\u0435\u0435 \u0440\u0430\u043d\u043d\u0435\u0433\u043e python\n * \u0441\u043e\u0431\u044b\u0442\u0438\u0439\u043d\u0430\u044f \u043e\u0431\u0440\u0430\u0431\u043e\u0442\u043a\u0430 (intparse)\n * \u0435\u0441\u0442\u044c \u0421\u0418\u0448\u043d\u0430\u044f \u0440\u0435\u0430\u043b\u0438\u0437\u0430\u0446\u0438\u044f _cElementTree_\n" | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": "from xml.etree.ElementTree import ElementTree\n\n#mini\n%timeit with open(fmini,'r') as xmlfile: ElementTree().parse(xmlfile)\n%timeit with open(fmini,'r') as xmlfile: [process(node) for node in ElementTree().parse(xmlfile).iter()]\n \n#big\n%timeit with open(fbig,'r') as xmlfile: ElementTree().parse(xmlfile)\n%timeit with open(fbig,'r') as xmlfile: [process(node) for node in ElementTree().parse(xmlfile).iter()]", | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": "100 loops, best of 3: 7.83 ms per loop\n100 loops, best of 3: 8.3 ms per loop" | |
}, | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": "\n" | |
} | |
], | |
"prompt_number": 66 | |
}, | |
{ | |
"cell_type": "heading", | |
"level": 2, | |
"metadata": {}, | |
"source": "cElementTree" | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": "from xml.etree.cElementTree import ElementTree as cElementTree\n\n## mini.xml\n%timeit with open(fmini,'r') as xmlfile: cElementTree().parse(xmlfile)\n%timeit with open(fbig,'r') as xmlfile: [process(node) for node in cElementTree().parse(xmlfile).iter()]\n \n## big.xml\n%timeit with open(fbig,'r') as xmlfile: cElementTree().parse(xmlfile)\n%timeit with open(fbig,'r') as xmlfile: [process(node) for node in cElementTree().parse(xmlfile).iter()]", | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": "1000 loops, best of 3: 885 \u00b5s per loop\n1000 loops, best of 3: 1.33 ms per loop" | |
}, | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": "\n1000 loops, best of 3: 1.08 ms per loop" | |
}, | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": "\n1000 loops, best of 3: 1.48 ms per loop" | |
}, | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": "\n" | |
} | |
], | |
"prompt_number": 69 | |
}, | |
{ | |
"cell_type": "heading", | |
"level": 2, | |
"metadata": {}, | |
"source": "BeautifulSoup" | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": "from BeautifulSoup import BeautifulStoneSoup\n%timeit with open('/home/rinat/mini.xml','r') as xmlfile: BeautifulStoneSoup(xmlfile)\n%timeit with open('/home/rinat/mini.xml','r') as xmlfile: [process(node) for node in BeautifulStoneSoup(xmlfile).findAll()]", | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": "10 loops, best of 3: 42.3 ms per loop\n10 loops, best of 3: 49.5 ms per loop" | |
}, | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": "\n" | |
} | |
], | |
"prompt_number": 60 | |
}, | |
{ | |
"cell_type": "heading", | |
"level": 2, | |
"metadata": {}, | |
"source": "sax" | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": "import xml.sax\n\nclass Handler(xml.sax.ContentHandler):\n def startElement(self, name, attrs):\n pass\n \nparser = xml.sax.make_parser()\n\n#mini\n%timeit with open(fmini,\"r\") as xmlfile: parser.parse(xmlfile) \nparser.setContentHandler(Handler())\n%timeit with open(fmini,\"r\") as xmlfile: parser.parse(xmlfile)\n\n#big\n%timeit with open(fbig,\"r\") as xmlfile: parser.parse(xmlfile) \nparser.setContentHandler(Handler())\n%timeit with open(fbig,\"r\") as xmlfile: parser.parse(xmlfile)", | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": "100 loops, best of 3: 1.93 ms per loop\n100 loops, best of 3: 1.98 ms per loop" | |
}, | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": "\n100 loops, best of 3: 2.07 ms per loop" | |
}, | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": "\n100 loops, best of 3: 2.06 ms per loop" | |
}, | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": "\n" | |
} | |
], | |
"prompt_number": 89 | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": "##lxml\n\n * \u0441\u0430\u043c\u044b\u0439 \u0431\u044b\u0441\u0442\u0440\u044b\u0439 \u043d\u0430 \u0434\u044b\u043d\u043d\u044b\u0439 \u043c\u043e\u043c\u0435\u043d\u0442\n * \u0441\u043e\u0431\u044b\u0442\u0438\u0439\u043d\u044b\u0439 \u043f\u0430\u0440\u0441\u0435\u0440\n * \u043f\u043e\u0434\u0434\u0435\u0440\u0436\u043a\u0430 xslt\n * \u043e\u0431\u0435\u0440\u043a\u0442\u0430 \u043d\u0430\u0434 libxml2 \u0438 linxslt\n * \u043f\u0430\u0440\u0441\u0438\u043d\u0433 \u0434\u043e\u0440\u043e\u0436\u0435 \u0447\u0435\u043c cElementTree, \u0438\u0431\u043e \u0441\u043e\u0431\u0438\u0440\u0430\u0435\u0442\u0441\u044f \u0431\u043e\u043b\u044c\u0448\u0435 \u043c\u0435\u0442\u0430\u0434\u0430\u043d\u043d\u044b\u0445 (\u0441\u0441\u044b\u043b\u043a\u0438 \u043d\u0430 \u0440\u043e\u0434\u0438\u0442\u0435\u043b\u044f, \u043d\u0430\u043f\u0440\u0438\u043c\u0435\u0440)\n * \u0434\u043b\u044f \u0431\u043e\u043b\u044c\u0448\u0438\u0445 \u0444\u0430\u0439\u043b\u043e \u043d\u0430\u0441\u0442\u043e\u044f\u0442\u0435\u043b\u044c\u043d\u043e \u0440\u0435\u043a\u043e\u043c\u0435\u043d\u0434\u0443\u0435\u0442\u0441\u044f \n - \u043a\u043b\u0430\u0441\u0441 Target\n - iterparse" | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": "from lxml import etree\n\n## mini\n%timeit with open(fmini,'r') as xmlfile: etree.parse(xmlfile)\n%timeit with open(fmini,'r') as xmlfile: [process(node) for node in etree.parse(xmlfile).iter()]\n \n## big\n%timeit with open(fbig,'r') as xmlfile: etree.parse(xmlfile)\n%timeit with open(fbig,'r') as xmlfile: [process(node) for node in etree.parse(xmlfile).iter()]", | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": "*" | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": "", | |
"language": "python", | |
"metadata": {}, | |
"outputs": [] | |
} | |
], | |
"metadata": {} | |
} | |
] | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment