# breyten/parse_big_xml.py

## parse_big_xml.py
import sys
import os
import re
import codecs

from lxml import etree

def parse_big_xml(filename, item_tag=u'item', parse_item=None):
    """Lazily yield one parsed element per ``<item_tag>`` section of a file.

    The file is read line by line as UTF-8, so only the text of the item
    currently being collected is held in memory. An item starts at a line
    that consists solely of ``<item_tag>`` and ends at the next line that
    consists solely of ``</item_tag>`` (surrounding whitespace and the line
    terminator are ignored). Opening tags that carry attributes, or tags
    that share a line with other content, are not recognised.

    Args:
        filename: Path of the XML file to read.
        item_tag: Name of the element that delimits one item.
        parse_item: Callable that turns the collected item text into the
            yielded value. Defaults to ``etree.fromstring``.

    Yields:
        The result of ``parse_item`` for each complete item, in file order.
    """
    if parse_item is None:
        parse_item = etree.fromstring

    open_tag = u'<%s>' % (item_tag,)
    close_tag = u'</%s>' % (item_tag,)

    # ``chunk`` is None while we are outside an item, so the preamble, the
    # text between items and the trailer are never buffered.
    chunk = None
    with codecs.open(filename, 'r', 'utf-8') as in_file:
        for line in in_file:
            # Lines keep their terminator (and any indentation), so compare
            # the stripped text against the tag markers.
            marker = line.strip()
            if marker == open_tag:
                # A new opening tag always restarts collection.
                chunk = [line]
            elif chunk is not None:
                chunk.append(line)
                if marker == close_tag:
                    yield parse_item(u''.join(chunk))
                    chunk = None

if __name__ == '__main__':
    # Command-line use: parse_big_xml.py FILE [ITEM_TAG]
    # Prints every parsed item of FILE, one per line. ITEM_TAG is optional
    # and falls back to the default of parse_big_xml when omitted.
    if len(sys.argv) < 2:
        sys.exit('usage: %s FILE [ITEM_TAG]' % (sys.argv[0],))
    for item in parse_big_xml(*sys.argv[1:3]):
        # Single-argument call form behaves the same on Python 2 and 3.
        print(item)