Skip to content

Instantly share code, notes, and snippets.

@kuenishi
Last active December 14, 2015 02:39
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save kuenishi/5015277 to your computer and use it in GitHub Desktop.
Save kuenishi/5015277 to your computer and use it in GitHub Desktop.
import bz2
from xml.dom.minidom import parse, parseString
from xml.dom import Node
import riak
import msgpack
import sys
# Shared Riak client: HTTP transport on port 8098 (no host is passed, so the
# client library's default host applies).
client = riak.RiakClient(port=8098, transport_class=riak.RiakHttpTransport)
# Name of the bucket that do_put() stores every page into.
bucket_name = 'jawiki'
def do_put(key, value):
    """Store one parsed page in the Riak bucket named by ``bucket_name``.

    Args:
        key: Page title used as the object key.  Normally text, but
            ``node2obj`` turns all-digit element text into ``int``, so a
            numeric title arrives here as a number.
        value: Page mapping stored as the object's data; ``value['id']`` is
            also written to the ``id_int`` secondary index.

    Raises:
        KeyError: If ``value`` is a dict without an ``'id'`` entry.
    """
    bucket = client.bucket(bucket_name)
    # Numeric titles were coerced to int upstream and have no ``encode``
    # method; turn them back into text instead of raising AttributeError.
    if not hasattr(key, 'encode'):
        key = str(key)
    # unicode-escape rewrites non-ASCII characters as ASCII escape sequences.
    riak_object = bucket.new(key.encode('unicode-escape'), data=value)
    riak_object.add_index('id_int', value['id'])
    riak_object.store()
def maybe_string(string):
    """Return ``string`` without surrounding whitespace, or ``None`` if empty.

    Args:
        string: Text to clean up (anything with a ``strip()`` method).

    Returns:
        The stripped text, or ``None`` when nothing but whitespace was given.
    """
    stripped = string.strip()
    return stripped if stripped else None
# Dispatch table: DOM node type -> converter that takes the node and returns
# its plain-Python representation.
switch = {
    # Nodes that may have children are converted recursively by node2obj().
    Node.ELEMENT_NODE: lambda x: node2obj(x),
    Node.ATTRIBUTE_NODE: lambda x: node2obj(x),
    # Text is stripped; whitespace-only text becomes None.
    Node.TEXT_NODE: lambda x: maybe_string(x.nodeValue),
    # Return the character data, not the DOM node itself, so the converted
    # structure contains only plain values.
    Node.CDATA_SECTION_NODE: lambda x: x.nodeValue,
    Node.ENTITY_NODE: lambda x: node2obj(x),
    # The remaining node types are mapped to None.
    Node.PROCESSING_INSTRUCTION_NODE: lambda x: None,
    Node.COMMENT_NODE: lambda x: None,
    Node.DOCUMENT_NODE: lambda x: None,
    Node.DOCUMENT_TYPE_NODE: lambda x: None,
    Node.NOTATION_NODE: lambda x: None,
}
def node2obj(page_dom):
    """Convert the children of a DOM node into a plain Python value.

    Each child is converted with the handler that ``switch`` holds for its
    node type and stored under the child's ``nodeName``; a later child with
    the same name replaces an earlier one.

    Args:
        page_dom: DOM node whose ``childNodes`` are converted.

    Returns:
        * ``None`` if the children are text only and that text is blank;
        * an ``int`` if the children are text only and ``int()`` accepts it;
        * the stripped text if the children are any other text only;
        * otherwise a dict of converted children, with a blank ``'#text'``
          entry removed.

    Raises:
        KeyError: If a child has a node type that is missing from ``switch``.
    """
    page = {}
    for node in page_dom.childNodes:
        page[node.nodeName] = switch[node.nodeType](node)

    has_text = '#text' in page
    if has_text and len(page) == 1:
        text = page['#text']
        if text is None:
            return None
        # Only a failed numeric parse falls back to the raw text; anything
        # else (e.g. KeyboardInterrupt) must propagate.
        try:
            return int(text)
        except ValueError:
            return text
    if has_text and page['#text'] is None:
        # Drop the whitespace that merely separates child elements.
        del page['#text']
    return page
# Script body: store every <page> element of the bzip2-compressed XML dump
# named on the command line via do_put().
if len(sys.argv) < 2:
    sys.exit('usage: %s <dump.xml.bz2>' % sys.argv[0])
# The context manager closes the dump even if parsing or a store fails.
with bz2.BZ2File(sys.argv[1]) as f:
    # NOTE: parse() builds the whole document in memory before the loop runs.
    for page in parse(f).getElementsByTagName('page'):
        data = node2obj(page)
        print(data['id'], data['title'])
        do_put(data['title'], data)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment