Skip to content

Instantly share code, notes, and snippets.

@jirivrany
Created December 4, 2014 07:10
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save jirivrany/77d4f250f773e81064dc to your computer and use it in GitHub Desktop.
Save jirivrany/77d4f250f773e81064dc to your computer and use it in GitHub Desktop.
# -*- coding: UTF-8 -*-
"""
Parse large XML files with the ElementTree iterparse (streaming) parser.
"""
import xml.etree.ElementTree as etree
import bz2
def fix_tag(ns, nsmap, tag):
    """Return *tag* qualified the way ElementTree spells it: ``{uri}tag``.

    ns    -- namespace prefix to look up ('' selects the default namespace).
    nsmap -- dict mapping namespace prefixes to namespace URIs.
    tag   -- local (unqualified) tag name.

    When the default namespace is requested but the document declares none
    (or declares it empty), the bare tag is returned, because ElementTree
    leaves such tags unqualified. An unknown non-default prefix raises
    KeyError.
    """
    if ns == '' and not nsmap.get(''):
        return tag
    # Look up the prefix that was actually requested, not always the default.
    return '{{{}}}{}'.format(nsmap[ns], tag)
def parse_dump(xml_fn):
    """Yield the title text of every <page> element in a bz2-compressed XML file.

    xml_fn -- path of the bz2-compressed XML file to read.

    The file is parsed incrementally with ``iterparse``. Namespace
    declarations are collected as they are seen, and the ``page`` / ``title``
    tag names are resolved against the default namespace through ``fix_tag``.
    Each finished page is released before parsing continues so that memory
    use does not grow with the number of pages.

    Yields the text of the page's <title> child, or None when the page has no
    <title> child or the title is empty.
    """
    with bz2.BZ2File(xml_fn, 'r') as fr:
        nsmap = {}
        root = None
        tags = None  # cached (page_tag, title_tag); reset when nsmap changes
        events = ('start', 'end', 'start-ns')
        for event, elem in etree.iterparse(fr, events=events):
            if event == 'start-ns':
                prefix, uri = elem
                nsmap[prefix] = uri
                tags = None
                continue
            if event == 'start':
                # The first 'start' event is the document root; keep it so
                # finished pages can be detached from it later.
                if root is None:
                    root = elem
                continue
            # event == 'end'
            if tags is None:
                tags = (fix_tag('', nsmap, 'page'),
                        fix_tag('', nsmap, 'title'))
            page_tag, title_tag = tags
            if elem.tag != page_tag:
                continue
            title_elem = elem.find(title_tag)
            yield title_elem.text if title_elem is not None else None
            # Clearing only the page leaves an empty element attached to the
            # root for every page processed; clear the root as well.
            elem.clear()
            root.clear()
if __name__ == '__main__':
    # Demo: print the titles of the first 10 pages of the dump.
    fname = 'cswiki-latest-pages-articles.xml.bz2'
    for idx, title in enumerate(parse_dump(fname), 1):
        # Parenthesised single-argument form runs on both Python 2 and 3.
        print(title)
        if idx >= 10:
            break
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment