Skip to content

Instantly share code, notes, and snippets.

@kuenishi
Last active December 14, 2015 03:18
Show Gist options
  • Save kuenishi/5019649 to your computer and use it in GitHub Desktop.
Save kuenishi/5019649 to your computer and use it in GitHub Desktop.
import bz2
from lxml import etree
import riak
import msgpack
import sys
# Riak connection shared by do_put(); uses the HTTP transport on port 8098.
client = riak.RiakClient(
    port=8098,
    transport_class=riak.RiakHttpTransport,
)
# Name of the bucket that do_put() writes into.
bucket_name = 'jawiki'
def do_put(key, value):
    """Store ``value`` in the ``bucket_name`` bucket under ``key``.

    The key is written in its ``unicode-escape`` encoded form, and the
    stored object gets an ``id_int`` index entry taken from ``value['id']``.
    """
    target_bucket = client.bucket(bucket_name)
    escaped_key = key.encode('unicode-escape')
    record = target_bucket.new(escaped_key, data=value)
    record.add_index('id_int', value['id'])
    record.store()
def maybe_string(string):
    """Return ``string`` without surrounding whitespace, or ``None``.

    ``None`` is returned when nothing is left after stripping.
    """
    return string.strip() or None
def element2obj(element):
    """Convert an XML element into a ``(name, value)`` pair.

    ``name`` is the element's tag with any ``{namespace}`` prefix removed.
    ``value`` depends on the element:

    * an element that has text yields that text, converted to ``int`` when
      ``int()`` accepts it (children of such an element are not visited);
    * any other element yields a dict mapping each child's name to the
      child's converted value.  When several children share a name, the
      last one wins.

    NOTE(review): because every numeric-looking text becomes an ``int``,
    a text such as ``"007"`` comes back as ``7`` -- confirm that is wanted.
    """
    # Qualified tags are spelled "{namespace-uri}local"; keep only the local
    # part rather than assuming the namespace is exactly 42 characters long.
    name = element.tag.rpartition('}')[2]

    text = element.text
    if text is not None:
        try:
            return (name, int(text))
        except ValueError:
            # Not a number: keep the text unchanged.
            return (name, text)

    # Child names always come from a tag, so they are already strings and
    # can be used as dict keys without any conversion.
    children = {}
    for child in element.iterchildren():
        child_name, child_value = element2obj(child)
        children[child_name] = child_value
    return (name, children)
# Read the bzip2-compressed MediaWiki export named on the command line and
# store each <page> element in Riak, keyed by the page title.
with bz2.BZ2File(sys.argv[1]) as f:
    # ``with`` closes the dump even when parsing or a store fails midway.
    for _, element in etree.iterparse(
            f,
            remove_blank_text=True,
            tag='{http://www.mediawiki.org/xml/export-0.8/}page'):
        (_, data) = element2obj(element)
        key = data['title']
        if isinstance(key, int):
            # element2obj() returns numeric-looking titles as ints; the key
            # must be the title's own digits, not the name of the int type.
            key = str(key)
        print(">%s" % key.encode('utf8'))
        do_put(key, data)
        # Empty the processed page element before moving on to the next one.
        element.clear()
import bz2
from lxml import etree
import riak
import msgpack
import sys
# Largest value of an unsigned 64-bit integer.  element2obj() keeps any
# number above this limit as text instead of converting it to an int.
# (The previous ``2*64-1`` evaluated to 127, not 2**64 - 1.)
sysmaxuint = 2 ** 64 - 1
# Riak connection shared by do_put(); uses the HTTP transport on port 8098.
client = riak.RiakClient(
    port=8098,
    transport_class=riak.RiakHttpTransport,
)
# Name of the bucket that do_put() writes into.
bucket_name = 'jawiki'
def do_put(key, value):
    """Store ``value`` in the ``bucket_name`` bucket as a msgpack blob.

    ``value`` is serialized with ``msgpack.dumps`` and written as a binary
    object with content type ``application/msgpack``.  The key is written
    in its ``unicode-escape`` encoded form, and the object gets an
    ``id_int`` index entry taken from ``value['id']``.
    """
    target_bucket = client.bucket(bucket_name)
    escaped_key = key.encode('unicode-escape')
    payload = msgpack.dumps(value)
    record = target_bucket.new_binary(
        escaped_key,
        data=payload,
        content_type='application/msgpack',
    )
    record.add_index('id_int', value['id'])
    record.store()
def maybe_string(string):
    """Strip ``string`` and return the result, or ``None`` if it is empty."""
    trimmed = string.strip()
    return trimmed if trimmed else None
def element2obj(element):
    """Convert an XML element into a ``(name, value)`` pair.

    ``name`` is the element's tag with any ``{namespace}`` prefix removed.
    ``value`` depends on the element:

    * an element that has text yields that text, converted to ``int`` when
      ``int()`` accepts it and the number is within the range that can be
      serialized (children of such an element are not visited);
    * any other element yields a dict mapping each child's name to the
      child's converted value.  When several children share a name, the
      last one wins.

    NOTE(review): because every numeric-looking text becomes an ``int``,
    a text such as ``"007"`` comes back as ``7`` -- confirm that is wanted.
    """
    # Qualified tags are spelled "{namespace-uri}local"; keep only the local
    # part rather than assuming the namespace is exactly 42 characters long.
    name = element.tag.rpartition('}')[2]

    text = element.text
    if text is not None:
        try:
            number = int(text)
        except ValueError:
            # Not a number: keep the text unchanged.
            return (name, text)
        # The value is later packed with msgpack, whose integers span
        # -2**63 .. 2**64-1; anything outside that range stays as text.
        if number > sysmaxuint or number < -(2 ** 63):
            return (name, text)
        return (name, number)

    # Child names always come from a tag, so they are already strings and
    # can be used as dict keys without any conversion.
    children = {}
    for child in element.iterchildren():
        child_name, child_value = element2obj(child)
        children[child_name] = child_value
    return (name, children)
# Read the bzip2-compressed MediaWiki export named on the command line and
# store each <page> element in Riak, keyed by the page title.
with bz2.BZ2File(sys.argv[1]) as f:
    # ``with`` closes the dump even when parsing or a store fails midway.
    for _, element in etree.iterparse(
            f,
            remove_blank_text=True,
            tag='{http://www.mediawiki.org/xml/export-0.8/}page'):
        (_, data) = element2obj(element)
        key = data['title']
        if isinstance(key, (int, long)):
            # element2obj() returns numeric-looking titles as numbers; the
            # key has to be text before it can be encoded.
            key = str(key)
        print(">%s" % key.encode('utf8'))
        do_put(key, data)
        # Empty the processed page element before moving on to the next one.
        element.clear()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment