Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
-- Load relation `a` from 'test' through HBaseStorage, requesting only the
-- 'data:column_1' column (presumably 'test' is the HBase table name and
-- 'data' the column family -- confirm against the table being read).
a = LOAD 'test' USING org.apache.pig.backend.hadoop.hbase.HBaseStorage('data:column_1');
-- Write the relation to 'out' via PigStorage with ',' as its delimiter argument.
STORE a INTO 'out' USING PigStorage(',');
import random
import sys
from org.apache.hadoop.hbase import HBaseConfiguration, HTableDescriptor, HColumnDescriptor
from org.apache.hadoop.hbase.client import HBaseAdmin, HTable, Put
def setup_table(name, *families):
    """Create a fresh table called `name` and return an HTable opened on it.

    If a table with that name already exists it is disabled and deleted
    first, so any existing contents are discarded.

    name     -- table name
    families -- column family names; each one is registered on the table
                descriptor as '<family>:'
    """
    conf = HBaseConfiguration()
    descriptor = HTableDescriptor(name)
    for family in families:
        descriptor.addFamily(HColumnDescriptor('%s:' % family))

    admin = HBaseAdmin(conf)
    already_there = admin.tableExists(name)
    if already_there:
        # A table has to be disabled before it can be deleted.
        admin.disableTable(name)
        admin.deleteTable(name)
    admin.createTable(descriptor)

    return HTable(conf, name)
def put_row(table, row, **families):
    """Build and return a Put for `row`; nothing is sent to HBase here.

    table    -- accepted for call compatibility but not used by this function
    row      -- row key handed to the Put constructor
    families -- keyword arguments mapping a family name to a dict of
                {qualifier: value}; every pair is added to the Put via
                add(family, qualifier, value)
    """
    put = Put(row)
    for family, cells in families.items():
        for qualifier, value in cells.items():
            put.add(family, qualifier, value)
    return put
def data(cols):
    """Return a dict with `cols` generated column entries.

    Keys are 'column_0' .. 'column_<cols-1>'. The value stored under
    'column_<i>' is 'value_<i>_<r>', where r is a random integer drawn
    from 0..5 inclusive.
    """
    generated = {}
    for index in range(cols):
        generated['column_%d' % index] = 'value_%d_%d' % (index, random.randint(0, 5))
    return generated
def rows(count, batch, **families):
    """Yield lists of Puts for row keys 'row_0' .. 'row_<count-1>'.

    Each yielded list holds at most `batch` Puts built by put_row with the
    given families. The final list is shorter when `count` is not a
    multiple of `batch`, so exactly `count` rows are produced in total.

    A progress line with the running total is printed when the generator
    is resumed after a batch, i.e. once the consumer has finished with it.

    count    -- total number of rows to generate
    batch    -- maximum number of rows per yielded list; used as the xrange
                step, so 0 raises ValueError
    families -- keyword arguments forwarded unchanged to put_row
    """
    for start in xrange(0, count, batch):
        # Clamp the last batch so no more than `count` rows are generated.
        stop = min(start + batch, count)
        # put_row does not use its table argument, so pass None instead of
        # relying on a module-level `t` that only exists when this file is
        # run as a script.
        yield [put_row(None, 'row_%s' % r, **families) for r in xrange(start, stop)]
        print('%s rows put' % stop)
# Cells written to the 'meta' family of every generated row.
META = {'a': 'b'}

if __name__ == '__main__':
    # Command line: TABLE_NAME COLS ROW_COUNT BATCH_SIZE
    # Recreates the table, then writes ROW_COUNT rows in batches of BATCH_SIZE.
    name = sys.argv[1]
    cols, n, b = map(int, sys.argv[2:5])
    t = setup_table(name, 'meta', 'data')
    # The 'data' cells are generated once and reused for every row.
    payload = data(cols)
    for puts in rows(n, b, meta=META, data=payload):
        t.put(puts)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment