Created
November 27, 2012 04:06
-
-
Save atdt/4152290 to your computer and use it in GitHub Desktop.
Convert dump to line-separated JSON, suitable for BigQuery
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
[
  {
    "name": "content",
    "type": "string"
  },
  {
    "name": "ns",
    "type": "integer"
  },
  {
    "name": "rev",
    "type": "integer"
  },
  {
    "name": "id",
    "type": "integer"
  },
  {
    "name": "title",
    "type": "string"
  }
]
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
# -*- coding: utf-8 -*- | |
import bz2 | |
import itertools | |
import json | |
import StringIO | |
import os | |
from xml.sax.saxutils import unescape | |
# Path of the bzip2-compressed XML dump that iter_pages() reads by default.
DUMP = 'data/enwiki-latest-pages-articles.xml.bz2'
# Path of the line-separated JSON file written by the script.
DEST = '/mnt/jdump/enwiki-nov-2012.json'


def iter_pages(dump_path=DUMP):
    """Yield one dict per page found in a bzip2-compressed XML dump.

    The dump is scanned line by line rather than with an XML parser, so
    elements are recognised purely by their exact leading indentation:
    four spaces for the page-level ``<title>``, ``<ns>`` and ``<id>``
    lines, six spaces for the revision-level ``<id>`` and ``<text>`` lines.
    Deeper ``<id>`` lines (more indentation) match neither prefix and are
    ignored.
    NOTE(review): the prefix widths are taken from the slice offsets used
    below (11, 8, 8, 10 and 33 characters) -- confirm against the dump.

    Args:
        dump_path: Path of the ``.bz2`` dump to read. Defaults to ``DUMP``.

    Yields:
        dict with keys ``id``, ``title``, ``ns``, ``rev`` and ``content``.
        ``title`` and ``content`` are XML-unescaped text; the other three
        are ints. A field is ``None`` if no matching line was seen before
        the first text element; after that, a field missing from a page
        keeps the value of the previous page.

    A page is emitted only when a line ending in ``</text>`` is reached, so
    pages whose text element never produces such a line are skipped.
    """
    # Bind the metadata up front so a text element that arrives before any
    # metadata line yields None values instead of raising UnboundLocalError.
    title = ns = page_id = rev = None
    capturing = False
    chunks = []  # pieces of the text element currently being collected

    with bz2.BZ2File(dump_path, 'r') as dump:
        for raw in dump:
            # BZ2File yields bytes; decode once here so every comparison
            # and unescape() below operates on text.
            line = raw.decode('utf-8')

            if line.startswith('    <title>'):
                title = unescape(line[11:-9])
                continue
            elif line.startswith('    <ns>'):
                ns = int(line[8:-6])
                continue
            elif line.startswith('    <id>'):
                # Page id: direct child of <page>.
                page_id = int(line[8:-6])
                continue
            elif line.startswith('      <id>'):
                # Revision id: one nesting level deeper than the page id.
                rev = int(line[10:-6])
                continue
            elif line.startswith('      <text xml:space="preserve">'):
                capturing = True
                # Drop the opening tag; the rest of the line is content and
                # may also hold the closing tag, handled just below.
                line = line[33:]

            if line.endswith('</text>\n'):
                chunks.append(line[:-8])
                capturing = False
                content = unescape(''.join(chunks))
                chunks = []
                yield dict(id=page_id, title=title, ns=ns, rev=rev,
                           content=content)
                continue

            if capturing:
                chunks.append(line)
# Write every page yielded by iter_pages() to DEST, one JSON object per line.
with open(DEST, 'wb') as fp:
    for count, page in enumerate(iter_pages()):
        # Report progress on every thousandth page, starting with page 0.
        if not count % 1000:
            print('%d pages completed' % count)
        # check_circular=None is falsy, so the encoder skips its
        # circular-reference bookkeeping for these flat dicts.
        fp.write(json.dumps(page, check_circular=None) + '\n')
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment