@gunshor
Forked from c2nes/dumpjson.py
Created February 21, 2014 23:10
#!/usr/bin/env python
import os
import sys
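
# Convert a MediaWiki "pages-meta-current" XML dump into a JSON array with one
# {"title", "timestamp", "text"} object per <page> element, using plain string
# matching instead of an XML parser. The paths below are specific to the
# original author's machine; adjust SOURCE_FILE and OUTPUT_FILE to point at
# your own dump and output location.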
SOURCE_FILE = "/home/cmthunes/enwiki-20120211-pages-meta-current.xml"
OUTPUT_FILE = "/mnt/reports/cmthunes/enwiki-20120211.json"
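
# Return the substring of s that sits between start_delim and end_delim, or
# None if either delimiter is missing or they appear out of order.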
def extract_with_delims(s, start_delim, end_delim):
    start_index = s.find(start_delim)
    if start_index == -1:
        return None

    end_index = s.find(end_delim)
    if end_index == -1:
        return None

    if end_index <= start_index:
        return None

    start_index += len(start_delim)
    return s[start_index:end_index]
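
# Pull the title, timestamp, and article text out of one <page> block. Any
# field that cannot be found comes back as None.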
def extract_data(part):
    title = extract_with_delims(part, "<title>", "</title>")
    timestamp = extract_with_delims(part, "<timestamp>", "</timestamp>")
    text = extract_with_delims(part, "<text xml:space=\"preserve\">", "</text>")

    return (title, timestamp, text)
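
# Minimal JSON string escaping: backslash, forward slash, double quote, and
# the common control characters \n, \r, \t. Other control characters and
# non-ASCII bytes are passed through unchanged.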
def json_encode_string(s):
    s = s.replace("\\", "\\\\")
    s = s.replace("/", "\\/")
    s = s.replace('"', '\\"')
    s = s.replace("\n", "\\n")
    s = s.replace("\r", "\\r")
    s = s.replace("\t", "\\t")

    return '"' + s + '"'
def split_records():
    enwikisource = open(SOURCE_FILE)

    text_buffer = ""
    start_index = 0
    end_index = 0

    while True:
        chunk = enwikisource.read(16 * 1024 * 1024)
        if chunk:
            text_buffer += chunk

        start_index = 0
        end_index = 0
        while True:
            start_index = text_buffer.find("<page>", start_index)

            # No pages in the buffer, continue loading data
            if start_index == -1:
                break

            end_index = text_buffer.find("</page>", end_index)

            # No complete page in buffer
            if end_index == -1:
                break

            yield text_buffer[start_index:end_index + len("</page>")]

            start_index = end_index + len("</page>")
            end_index = start_index

        # No more data
        if chunk == "":
            break

        if start_index == -1:
            text_buffer = ""
        else:
            text_buffer = text_buffer[start_index:]
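
# Main driver: refuse to overwrite an existing output file, then stream every
# page into a single JSON array, skipping pages where any field could not be
# extracted. A running page count is printed to stdout as progress.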
if os.path.exists(OUTPUT_FILE):
    print "Output file already exists. Remove it first so your existing data is not destroyed."
    sys.exit(1)
json_file = open(OUTPUT_FILE, "w")
template = '{"title": %s, "timestamp": %s, "text": %s}'

i = 0
written = 0
json_file.write("[\n")
try:
    for page in split_records():
        i += 1
        sys.stdout.write("\r%d" % (i,))
        sys.stdout.flush()

        title, timestamp, text = extract_data(page)
        if None in (title, timestamp, text):
            continue

        # Write a comma only between records so the file stays valid JSON
        # (no trailing comma before the closing bracket).
        if written > 0:
            json_file.write(",\n")
        written += 1

        json_file.write(template % tuple(map(json_encode_string,
                                             (title, timestamp, text))))
finally:
    json_file.write("\n]\n")
    json_file.close()

print
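
A quick way to sanity-check the result is to load it back with the standard json module. This is only a sketch: it assumes the run completed and that the converted file is small enough to fit in memory, which will not be true for a full English Wikipedia dump.

import json

with open("/mnt/reports/cmthunes/enwiki-20120211.json") as f:
    pages = json.load(f)

print(pages[0]["title"])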