atdt/dump-schema.json

## dump-schema.json
[
  {
    "name": "content",
    "type": "string"
  },
  {
    "name": "ns",
    "type": "integer"
  },
  {
    "name": "rev",
    "type": "integer"
  },
  {
    "name": "id",
    "type": "integer"
  },
  {
    "name": "title",
    "type": "string"
  }
]

## jdump.py
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import bz2
import itertools
import json
import StringIO
import os

from xml.sax.saxutils import unescape

# Path of the bz2-compressed XML dump that iter_pages() opens and scans.
DUMP = 'data/enwiki-latest-pages-articles.xml.bz2'
# Output path; the script below writes one JSON object per line to it.
DEST = '/mnt/jdump/enwiki-nov-2012.json'


def _unescape_xml(text):
    """Return *text* with all five predefined XML entities decoded.

    ``xml.sax.saxutils.unescape`` only decodes ``&amp;``, ``&lt;`` and
    ``&gt;`` unless given extra entities, so ``&quot;`` and ``&apos;`` are
    passed explicitly; otherwise quoted text keeps the raw entity strings.
    """
    return unescape(text, {'&quot;': '"', '&apos;': "'"})


def _parse_pages(lines):
    """Yield one dict for every ``</text>``-terminated element in *lines*.

    *lines* is any iterable of newline-terminated strings.  The input is not
    parsed as XML: elements are recognised by their exact leading
    indentation, which is what tells the 4-space ``<id>`` (stored as ``id``)
    apart from the 6-space ``<id>`` (stored as ``rev``).
    """
    text_open = '      <text xml:space="preserve">'
    text_close = '</text>\n'
    capturing = False
    parts = []
    for line in lines:
        if line.startswith('    <title>'):
            # Strip the 11-char opening prefix and the 9-char '</title>\n'.
            title = _unescape_xml(line[11:-9])
        if line.startswith('    <ns>'):
            ns = int(line[8:-6])
            continue
        elif line.startswith('    <id>'):
            page_id = int(line[8:-6])
            continue
        elif line.startswith('      <id>'):
            rev = int(line[10:-6])
            continue
        elif line.startswith(text_open):
            capturing = True
            # Keep only what follows the opening tag; the remainder may
            # already contain the closing tag (single-line text).
            line = line[len(text_open):]
        if line.endswith(text_close):
            parts.append(line[:-len(text_close)])
            capturing = False
            # Unescape once over the whole body rather than per line.
            content = _unescape_xml(''.join(parts))
            parts = []
            yield dict(id=page_id, title=title, ns=ns, rev=rev,
                       content=content)
            continue
        if capturing:
            parts.append(line)


def iter_pages():
    """Yield a dict per page from the bz2-compressed dump at ``DUMP``.

    Each dict has the keys ``id``, ``ns`` and ``rev`` (ints) and ``title``
    and ``content`` (strings with XML entities decoded).  The dump is
    streamed line by line, so only one page body is buffered at a time.
    """
    with bz2.BZ2File(DUMP, 'r') as dump:
        for page in _parse_pages(dump):
            yield page


# Stream every page from iter_pages() into DEST as newline-delimited JSON,
# printing a progress line each time the running index is a multiple of 1000
# (which includes index 0, before the first page is written).
with open(DEST, 'wb') as out:
    for done, record in enumerate(iter_pages()):
        if not done % 1000:
            print('%d pages completed' % done)
        out.write(json.dumps(record, check_circular=None))
        out.write('\n')
	[
	{
	"name": "content",
	"type": "string"
	},
	{
	"name": "ns",
	"type": "integer"
	},
	{
	"name": "rev",
	"type": "integer"
	},
	{
	"name": "id",
	"type": "integer"
	},
	{
	"name": "title",
	"type": "string"
	}
	]
	#!/usr/bin/env python
	# -*- coding: utf-8 -*-
	import bz2
	import itertools
	import json
	import StringIO
	import os

	from xml.sax.saxutils import unescape

	# Path of the bz2-compressed XML dump that iter_pages() opens and scans.
	DUMP = 'data/enwiki-latest-pages-articles.xml.bz2'
	# Output path; the script below writes one JSON object per line to it.
	DEST = '/mnt/jdump/enwiki-nov-2012.json'


	def _unescape_xml(text):
	    """Return *text* with all five predefined XML entities decoded.

	    ``xml.sax.saxutils.unescape`` only decodes ``&amp;``, ``&lt;`` and
	    ``&gt;`` unless given extra entities, so ``&quot;`` and ``&apos;`` are
	    passed explicitly; otherwise quoted text keeps the raw entity strings.
	    """
	    return unescape(text, {'&quot;': '"', '&apos;': "'"})


	def _parse_pages(lines):
	    """Yield one dict for every ``</text>``-terminated element in *lines*.

	    *lines* is any iterable of newline-terminated strings.  The input is
	    not parsed as XML: elements are recognised by their exact leading
	    indentation, which is what tells the 4-space ``<id>`` (stored as
	    ``id``) apart from the 6-space ``<id>`` (stored as ``rev``).
	    """
	    # The prefix literals carry their full run of leading spaces; the
	    # slice offsets below equal the prefix lengths, so collapsing that
	    # whitespace would break both the match and the slicing.
	    text_open = '      <text xml:space="preserve">'
	    text_close = '</text>\n'
	    capturing = False
	    parts = []
	    for line in lines:
	        if line.startswith('    <title>'):
	            # Strip the 11-char opening prefix and the 9-char '</title>\n'.
	            title = _unescape_xml(line[11:-9])
	        if line.startswith('    <ns>'):
	            ns = int(line[8:-6])
	            continue
	        elif line.startswith('    <id>'):
	            page_id = int(line[8:-6])
	            continue
	        elif line.startswith('      <id>'):
	            rev = int(line[10:-6])
	            continue
	        elif line.startswith(text_open):
	            capturing = True
	            # Keep only what follows the opening tag; the remainder may
	            # already contain the closing tag (single-line text).
	            line = line[len(text_open):]
	        if line.endswith(text_close):
	            parts.append(line[:-len(text_close)])
	            capturing = False
	            # Unescape once over the whole body rather than per line.
	            content = _unescape_xml(''.join(parts))
	            parts = []
	            yield dict(id=page_id, title=title, ns=ns, rev=rev,
	                       content=content)
	            continue
	        if capturing:
	            parts.append(line)


	def iter_pages():
	    """Yield a dict per page from the bz2-compressed dump at ``DUMP``.

	    Each dict has the keys ``id``, ``ns`` and ``rev`` (ints) and ``title``
	    and ``content`` (strings with XML entities decoded).  The dump is
	    streamed line by line, so only one page body is buffered at a time.
	    """
	    with bz2.BZ2File(DUMP, 'r') as dump:
	        for page in _parse_pages(dump):
	            yield page


	# Stream every page from iter_pages() into DEST as newline-delimited JSON,
	# printing a progress line each time the running index is a multiple of
	# 1000 (which includes index 0, before the first page is written).
	with open(DEST, 'wb') as out:
	    for done, record in enumerate(iter_pages()):
	        if not done % 1000:
	            print('%d pages completed' % done)
	        out.write(json.dumps(record, check_circular=None))
	        out.write('\n')