Last active
June 8, 2016 11:36
-
-
Save breyten/1fb4fe875b68b8797bb3b624f3654fa4 to your computer and use it in GitHub Desktop.
Parsing big pretty-printed XML files
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import sys | |
import os | |
import re | |
import codecs | |
from lxml import etree | |
def parse_big_xml(filename, item_tag=u'item'):
    """Yield one parsed element per ``<item_tag>`` block of a pretty-printed XML file.

    The file is read line by line as UTF-8, so only the lines of the item
    currently being collected are held in memory.

    An item starts on a line that, once surrounding whitespace is stripped,
    is exactly ``<item_tag>`` and ends on a line that is exactly
    ``</item_tag>``.  Opening tags that carry attributes, or items written
    on a single line, are not recognised.

    :param filename: path of the XML file to read.
    :param item_tag: tag name of the elements to extract.
    :returns: generator of the values returned by ``etree.fromstring`` for
        each collected item.
    """
    open_tag = u'<%s>' % (item_tag,)
    close_tag = u'</%s>' % (item_tag,)

    with codecs.open(filename, 'r', 'utf-8') as in_file:
        # ``None`` means "not inside an item"; lines outside items are
        # dropped instead of being accumulated.
        chunks = None
        for line in in_file:
            # Lines keep their newline and pretty-print indentation, so
            # compare the stripped text against the tag markers.
            marker = line.strip()
            if marker == open_tag:
                chunks = [line]
            elif chunks is not None:
                chunks.append(line)
                if marker == close_tag:
                    yield etree.fromstring(u''.join(chunks))
                    chunks = None
if __name__ == '__main__':
    # Usage: script.py FILENAME [ITEM_TAG]
    # ITEM_TAG is optional; when omitted, parse_big_xml's default is used.
    if len(sys.argv) < 2:
        sys.exit('usage: %s FILENAME [ITEM_TAG]' % (sys.argv[0],))
    # argv[1:3] forwards the filename and, if given, the tag name.
    for item in parse_big_xml(*sys.argv[1:3]):
        # Parenthesised single-argument print works on Python 2 and 3.
        print(item)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment