@flavioamieiro
Created March 21, 2013 21:53
Extracts article text from the Wikipedia XML dump
from lxml import etree

FILENAME = 'data/ptwiki-20130306-pages-meta-current.xml'
# Namespace prefix used by the MediaWiki export format in this dump.
NS = '{http://www.mediawiki.org/xml/export-0.8/}'


def get_parser():
    """Yield (element, title, text) for every article page in the dump."""
    for event, element in etree.iterparse(FILENAME, events=('end',)):
        if element.tag.endswith('page'):
            namespace_tag = element.find(NS + 'ns')
            # Namespace '0' holds the actual articles (as opposed to talk
            # pages, templates, categories and so on).
            if namespace_tag.text == '0':
                title_tag = element.find(NS + 'title')
                text_tag = element.find(NS + 'revision').find(NS + 'text')
                title, text = title_tag.text, text_tag.text
                yield element, title, text
            # We clear pages once they have been processed. Other elements
            # are still kept in memory, but pages are the biggest part of
            # the XML, so memory usage should stay reasonable (see the note
            # after the script for a way to free even more).
            element.clear()


with open('data/index.txt', 'w') as index:
    parser = get_parser()
    i = 0
    for element, title, text in parser:
        i += 1
        doc_id = '{:015d}'.format(i)
        # One index line per document: zero-padded id, then the title.
        current_document = '{} - {}\n'.format(doc_id, title.encode('utf-8'))
        index.write(current_document)
        # Each article file holds the title on the first line, then the
        # raw wikitext (UTF-8 encoded bytes; this is Python 2 style I/O).
        with open('data/articles/{}.txt'.format(doc_id), 'w') as fp:
            fp.write('{}\n'.format(title.encode('utf-8')))
            fp.write('{}\n'.format(text.encode('utf-8')))
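element.clear() empties each page after it is processed, but the cleared elements themselves stay attached to the root, and references to earlier siblings keep accumulating as iterparse builds the tree. A widely used complement to this pattern is to also delete the already-processed preceding siblings from the parent. This is a minimal sketch, not part of the original gist:

for event, element in etree.iterparse(FILENAME, events=('end',)):
    if element.tag.endswith('page'):
        # ... process the page as above ...
        element.clear()
        # Drop references to siblings that were already handled, so the
        # partial tree built so far can actually be garbage collected.
        while element.getprevious() is not None:
            del element.getparent()[0]

With both clearing steps in place, memory usage should stay roughly flat no matter how large the dump file is.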