Skip to content

Instantly share code, notes, and snippets.

@atdt
Created November 27, 2012 04:06
Show Gist options
  • Save atdt/4152290 to your computer and use it in GitHub Desktop.
Convert dump to line-separated JSON, suitable for BigQuery
[
{
"name": "content",
"type": "string"
},
{
"name": "ns",
"type": "integer"
},
{
"name": "rev",
"type": "integer"
},
{
"name": "id",
"type": "integer"
},
{
"name": "title",
"type": "string"
}
]
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import bz2
import itertools
import json
import StringIO
import os
from xml.sax.saxutils import unescape
# Input: bzip2-compressed MediaWiki XML dump (English Wikipedia articles).
DUMP = 'data/enwiki-latest-pages-articles.xml.bz2'
# Output: one JSON object per line ("line-separated JSON" for BigQuery import).
DEST = '/mnt/jdump/enwiki-nov-2012.json'
def iter_pages():
    """Yield one dict per <page> element of the bzipped MediaWiki XML dump.

    This is a deliberately cheap line-oriented scan, not a real XML parse:
    it keys off the dump's fixed indentation.  Page-level tags (<title>,
    <ns>, the page <id>) are indented 4 spaces; the revision <id> and the
    <text> tag are indented 6 spaces — the slice offsets below (11, 8, 10,
    33) are the lengths of those indented opening tags.

    Yields:
        dict with keys ``id``, ``title``, ``ns``, ``rev``, ``content``
        (matching the BigQuery schema declared alongside this script).
    """
    with bz2.BZ2File(DUMP, 'r') as dump:
        capturing = False
        buf = StringIO.StringIO()
        for line in dump:
            if line.startswith('    <title>'):
                # len('    <title>') == 11; drop trailing '</title>\n' (9 chars).
                title = unescape(line[11:-9])
            if line.startswith('    <ns>'):
                ns = int(line[8:-6])
                continue
            elif line.startswith('    <id>'):
                # Page id: 4-space indent.  (Renamed from `id` to avoid
                # shadowing the builtin; the yielded key is still 'id'.)
                page_id = int(line[8:-6])
                continue
            elif line.startswith('      <id>'):
                # Revision id: 6-space indent, nested inside <revision>.
                # NOTE(review): this prefix must differ from the page-id
                # prefix above, otherwise the branch is unreachable.
                rev = int(line[10:-6])
                continue
            elif line.startswith('      <text xml:space="preserve">'):
                capturing = True
                line = line[33:]  # strip indent + opening <text> tag
            if line.endswith('</text>\n'):
                # Closing tag: flush the accumulated article body and emit
                # the page record, then start a fresh buffer.
                buf.write(line[:-8])
                capturing = False
                content = unescape(buf.getvalue())
                buf.close()
                buf = StringIO.StringIO()
                yield dict(id=page_id, title=title, ns=ns, rev=rev,
                           content=content)
                continue
            if capturing:
                buf.write(line)
# Stream every page out as newline-delimited JSON, reporting progress
# once per thousand pages.
with open(DEST, 'wb') as out:
    done = 0
    for page in iter_pages():
        if done % 1000 == 0:
            print('%d pages completed' % done)
        json.dump(page, out, check_circular=None)
        out.write('\n')
        done += 1
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment