@flavioamieiro
Created March 21, 2013 21:53
Extracts article text from the Wikipedia XML dump
from lxml import etree

FILENAME = 'data/ptwiki-20130306-pages-meta-current.xml'
# Namespace prefix used by the MediaWiki export format in this dump.
NS = '{http://www.mediawiki.org/xml/export-0.8/}'


def get_parser():
    """Yield (element, title, text) for every article page in the dump."""
    for event, element in etree.iterparse(FILENAME, events=('end',)):
        if element.tag.endswith('page'):
            namespace_tag = element.find(NS + 'ns')
            # Namespace '0' holds the actual articles (as opposed to talk
            # pages, templates, categories and so on).
            if namespace_tag.text == '0':
                title_tag = element.find(NS + 'title')
                text_tag = element.find(NS + 'revision').find(NS + 'text')
                title, text = title_tag.text, text_tag.text
                yield element, title, text
            # We clear pages once they have been processed. Other elements
            # are still kept in memory, but pages are the biggest part of
            # the XML, so memory usage should stay reasonable (see the note
            # after the script for a way to free even more).
            element.clear()


with open('data/index.txt', 'w') as index:
    parser = get_parser()
    i = 0
    for element, title, text in parser:
        i += 1
        doc_id = '{:015d}'.format(i)
        # One index line per document: zero-padded id, then the title.
        current_document = '{} - {}\n'.format(doc_id, title.encode('utf-8'))
        index.write(current_document)
        # Each article file holds the title on the first line, then the
        # raw wikitext (UTF-8 encoded bytes; this is Python 2 style I/O).
        with open('data/articles/{}.txt'.format(doc_id), 'w') as fp:
            fp.write('{}\n'.format(title.encode('utf-8')))
            fp.write('{}\n'.format(text.encode('utf-8')))
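element.clear() empties each page after it is processed, but the cleared elements themselves stay attached to the root, and references to earlier siblings keep accumulating as iterparse builds the tree. A widely used complement to this pattern is to also delete the already-processed preceding siblings from the parent. This is a minimal sketch, not part of the original gist:

for event, element in etree.iterparse(FILENAME, events=('end',)):
    if element.tag.endswith('page'):
        # ... process the page as above ...
        element.clear()
        # Drop references to siblings that were already handled, so the
        # partial tree built so far can actually be garbage collected.
        while element.getprevious() is not None:
            del element.getparent()[0]

With both clearing steps in place, memory usage should stay roughly flat no matter how large the dump file is.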