Skip to content

Instantly share code, notes, and snippets.

@jason-s
Last active August 29, 2015 14:13
Show Gist options
  • Save jason-s/e0f4cb0fff1e2ab2fb20 to your computer and use it in GitHub Desktop.
Save jason-s/e0f4cb0fff1e2ab2fb20 to your computer and use it in GitHub Desktop.
import numpy as np
import tables as pt
import subprocess
import random
import re
import os
import zlib
class H5fileContext(object):
    '''Context manager that creates an HDF5 file holding a single group.

    On entry the file ``filename`` is opened with mode ``'w'`` and a group
    named ``groupname`` is created directly under the file's root; on exit
    the file is flushed and closed.  The open file is available as
    ``self.hfile`` and the group as ``self.g``.
    '''

    def __init__(self, filename, groupname):
        self.filename = filename
        self.groupname = groupname

    def open(self):
        '''Open the file for writing and create the group under its root.'''
        self.hfile = pt.open_file(self.filename, 'w')
        try:
            # create_group is the PEP 8 spelling that matches the
            # open_file / create_carray calls used elsewhere in this class.
            self.g = self.hfile.create_group(self.hfile.root, self.groupname)
        except Exception:
            # Do not leave a half-initialised file handle open.
            self.hfile.close()
            raise

    def close(self):
        '''Flush pending data and close the file.'''
        try:
            self.hfile.flush()
        finally:
            # Close even when the flush fails so the handle is not leaked.
            self.hfile.close()

    def recordString(self, nodename, s, complevel=5, complib='zlib', shuffle=True):
        '''creates a CArray object in an HDF5 file
        that represents a unicode string

        The string ``s`` is encoded as UTF-8 and stored as a one-dimensional
        array of unsigned bytes named ``nodename`` inside the group, using
        the given compression filter settings.  Returns the CArray node.
        '''
        # frombuffer is the supported way to view raw bytes as an array
        # (fromstring is deprecated for binary input).
        array_of_bytes = np.frombuffer(s.encode('utf-8'), dtype=np.uint8)
        atom = pt.UInt8Atom()  # @UndefinedVariable
        filters = pt.Filters(complevel=complevel, complib=complib, shuffle=shuffle)
        ca = self.hfile.create_carray(self.g, nodename, atom,
                                      shape=(len(array_of_bytes),),
                                      filters=filters)
        ca[:] = array_of_bytes
        return ca

    def recordGroupAttribute(self, attrname, s, complevel=5):
        '''Store ``s``, zlib-compressed, as attribute ``attrname`` of the group.

        Text (unicode) input is encoded as UTF-8 first, mirroring
        recordString; byte strings are compressed unchanged.
        '''
        if isinstance(s, type(u'')):
            # zlib works on bytes; type(u'') is the text type on both
            # Python 2 (unicode) and Python 3 (str).
            s = s.encode('utf-8')
        setattr(self.g._v_attrs, attrname, zlib.compress(s, complevel))

    def __enter__(self):
        self.open()
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.close()
# Sample text for the experiments below: explore_filesizes() stores it
# verbatim as a payload and also passes it to randpoem() as the word source.
# The text is runtime data (its exact bytes determine the stored sizes), so
# it must not be reformatted or "corrected".
jabberwocky = '''
`Twas brillig, and the slithy toves
Did gyre and gimble in the wabe:
All mimsy were the borogoves,
And the mome raths outgrabe.
"Beware the Jabberwock, my son!
The jaws that bite, the claws that catch!
Beware the Jubjub bird, and shun
The frumious Bandersnatch!"
He took his vorpal sword in hand:
Long time the manxome foe he sought --
So rested he by the Tumtum tree,
And stood awhile in thought.
And, as in uffish thought he stood,
The Jabberwock, with eyes of flame,
Came whiffling through the tulgey wood,
And burbled as it came!
One, two! One, two! And through and through
The vorpal blade went snicker-snack!
He left it dead, and with its head
He went galumphing back.
"And, has thou slain the Jabberwock?
Come to my arms, my beamish boy!
O frabjous day! Callooh! Callay!"
He chortled in his joy.
'Twas brillig, and the slithy toves
Did gyre and gimble in the wabe;
All mimsy were the borogoves,
And the mome raths outgrabe.
'''
def randpoem(source, seed, n):
    '''Yield lines of pseudo-random text built from the words of ``source``.

    The vocabulary is the set of distinct lower-cased words in ``source``
    (split on runs of non-word characters).  A private random generator
    seeded with ``seed`` then performs ``n - 1`` steps.  With probability
    0.1 a step ends the current line: the accumulated words are yielded as
    one capitalized, space-joined line followed by a randomly chosen ending
    (possibly empty), or an empty string is yielded if no words have
    accumulated.  Otherwise a random vocabulary word is appended to the
    current line.

    Words still pending when the steps run out are not yielded.  Nothing is
    yielded for ``n <= 1``.  An empty vocabulary raises IndexError from
    ``random.choice`` as soon as a word is needed.
    '''
    rng = random.Random()
    rng.seed(seed)
    # Sort the vocabulary: the iteration order of a set of strings is not
    # guaranteed to be the same from one interpreter run to the next, which
    # would make the output differ between runs even for a fixed seed.
    words = sorted(set(word.lower()
                       for word in re.split(r'\W+', source) if word))
    endings = ['', '.', ';', ',', '?', '!']
    line = []
    for _ in range(1, n):
        if rng.random() >= 0.1:
            line.append(rng.choice(words))
        elif line:
            yield ' '.join(line).capitalize() + rng.choice(endings)
            line = []
        else:
            yield ""
def popgroup(stack, n, kvnext):
    '''Push ``kvnext`` at nesting level ``n``, folding deeper entries first.

    ``stack`` is a list of ``(level, (key, value))`` pairs and is modified
    in place.  While the top entry is deeper than ``n`` it is popped and
    collected.  When the entry underneath has a smaller level than the one
    just popped, it becomes the parent of everything collected so far:

    * a parent whose key is ``'Attribute'`` gets the key ``'Attribute:'``
      followed by the whitespace-separated words of its value joined with
      ``':'``, and its value becomes the dict of collected children;
    * any other parent keeps its key and its value becomes the pair
      ``(original value, dict of collected children)``.

    Finally ``(n, kvnext)`` is appended.  Returns None.
    '''
    depth, entry = stack[-1]
    children = []
    while depth > n:
        child_depth = depth
        stack.pop()
        children.append(entry)
        depth, entry = stack[-1]
        if depth >= child_depth:
            # Still among siblings of the popped entry; keep collecting.
            continue
        name = entry[0]
        if name == 'Attribute':
            name = name + ':' + ':'.join(entry[1].split())
            entry = (name, dict(children))
        else:
            entry = (name, (entry[1], dict(children)))
        stack[-1] = (depth, entry)
        children = []
    stack.append((n, kvnext))
def parse_h5ls(out):
    '''Parse indented ``key: value`` text into nested entries.

    The first line of ``out`` is skipped.  Every remaining line that
    matches the pattern (optional leading whitespace, an identifier-like
    key, an optional colon, then the rest of the line) is handed to
    popgroup() with the width of its leading whitespace as the nesting
    level; lines that do not match are ignored.  A final popgroup() call at
    level 0 folds whatever is still nested.

    Returns the list of ``(key, value)`` entries left on the stack after
    the initial sentinel.
    '''
    line_pattern = re.compile(r'^(\s*)([^\W\d][\w-]*):?\s*(.*)')
    # Sentinel at level -1 so popgroup() always finds an entry to stop at.
    stack = [(-1, ('a','b'))]
    for text in out.splitlines()[1:]:
        match = line_pattern.search(text)
        if not match:
            continue
        indent = match.group(1)
        key = match.group(2)
        value = match.group(3)
        depth = len(indent) if indent is not None else 0
        popgroup(stack, depth, (key, value))
    # Fold the trailing nested entries, then discard the placeholder
    # that the call pushed.
    popgroup(stack, 0, None)
    stack.pop()
    return [entry[1] for entry in stack[1:]]
def analyze(filename, groupname, base_overhead=0, doprint=True):
    '''Summarise the storage used by the items of one group of an HDF5 file.

    Runs ``h5ls -v <filename>/<groupname>``, parses its output with
    parse_h5ls(), and sums the first two numbers of each item's
    ``Storage`` entry.

    ``base_overhead`` and ``doprint`` are accepted but not used here.

    Returns the tuple ``(n, tlogical, talloc, totsz)``: the number of
    parsed items, the sum of the first Storage number, the sum of the
    second Storage number, and the size of the file on disk in bytes.

    Raises subprocess.CalledProcessError if h5ls exits with an error, and
    KeyError if a parsed item has no ``Storage`` entry.
    '''
    # universal_newlines=True makes check_output return text rather than
    # bytes on Python 3, which the str regex in parse_h5ls() requires.
    out = subprocess.check_output(
        ['h5ls', '-v', filename + '/' + groupname],
        universal_newlines=True)
    data = parse_h5ls(out)
    tlogical = 0
    talloc = 0
    for entry in data:
        # The Storage value is split on ', ' and the leading token of each
        # part is taken; presumably it looks like
        # "966 logical bytes, 619 allocated bytes, ..." -- verify against
        # the installed h5ls.
        fields = entry[1][1]['Storage'].split(', ')
        counts = [field.split()[0] for field in fields]
        tlogical += int(counts[0])
        talloc += int(counts[1])
    totsz = os.path.getsize(filename)
    return (len(data), tlogical, talloc, totsz)
def explore_filesizes(rootdir):
    '''Write a series of HDF5 test files under ``rootdir`` and print their sizes.

    Each task writes one file ``<rootdir>/<filebase>.h5`` containing a
    group named ``data`` filled either with string arrays
    (H5fileContext.recordString) or with compressed group attributes
    (H5fileContext.recordGroupAttribute).  After a file is closed it is
    measured with analyze() and one summary line is printed.  The overhead
    of the first task (the empty file) is the baseline against which the
    overhead of every later file is reported.

    ``rootdir`` is created if it does not exist yet.
    '''
    def task_file_store_string(filebase, n, s, shuffle=True):
        # Task: store string s as n separate arrays boing0..boing<n-1>.
        def func(f):
            for k in range(n):
                f.recordString('boing%d' % k, s, shuffle=shuffle)
        func.filebase = filebase
        return func

    def task_file_attr_string(filebase, n, s):
        # Task: store string s as n separate group attributes.
        def func(f):
            for k in range(n):
                f.recordGroupAttribute('boing%d' % k, s)
        func.filebase = filebase
        return func

    R100 = '\n'.join(randpoem(jabberwocky, 123, 100))
    R1000 = '\n'.join(randpoem(jabberwocky, 123, 1000))
    R10000 = '\n'.join(randpoem(jabberwocky, 123, 10000))

    tasks = [
        task_file_store_string('empty', 0, '')
    ]
    tasks += [task_file_store_string('jabberwocky%02d' % n, n, jabberwocky)
              for n in [1, 2, 5, 10, 20]]
    tasks += [task_file_attr_string('jabberwocky_attr%02d' % n, n, jabberwocky)
              for n in [1, 2, 5, 10, 20]]
    tasks += [task_file_store_string('jabberwocky%02d_noshuffle' % n, n,
                                     jabberwocky, shuffle=False)
              for n in [1, 10]]
    tasks += [
        task_file_store_string('jabberwocky01repeat10', 1, jabberwocky*10),
        task_file_store_string('jabberwockyR100', 1, R100),
        task_file_store_string('jabberwockyR1000', 1, R1000),
        task_file_store_string('jabberwockyR10000', 1, R10000),
        task_file_store_string('jabberwockyR100x2', 2, R100),
        task_file_store_string('jabberwockyR1000x2', 2, R1000),
        task_file_store_string('jabberwockyR10000x2', 2, R10000),
        # The 10-copy runs get their own file names; reusing the "x2" names
        # would overwrite the 2-copy files and mislabel the printed results.
        task_file_store_string('jabberwockyR100x10', 10, R100),
        task_file_store_string('jabberwockyR1000x10', 10, R1000),
        task_file_store_string('jabberwockyR10000x10', 10, R10000),
    ]

    if not os.path.isdir(rootdir):
        os.makedirs(rootdir)

    baseline_overhead = None
    for task in tasks:
        filename = os.path.join(rootdir, '%s.h5' % task.filebase)
        groupname = 'data'
        with H5fileContext(filename, groupname) as f:
            task(f)
        # Measure only after the with-block has closed the file.
        (n, tlogical, talloc, totsz) = analyze(filename, groupname)
        overhead = (totsz-talloc)
        if baseline_overhead is None:
            baseline_overhead = overhead
            baseline_str = str(baseline_overhead)
        # Single parenthesised argument: valid on both Python 2 and 3.
        print('logical: %8d alloc: %8d disk: %8d ovhd: %8d+%s items: %2d -- %s' % (
            tlogical, talloc, totsz, overhead-baseline_overhead, baseline_str, n, filename))
# Script entry point: run the file-size experiment when executed directly.
if __name__ == '__main__':
    output_dir = 'c:/tmp/hdf5'
    explore_filesizes(output_dir)
logical: 0 alloc: 0 disk: 2232 ovhd: 0+2232 items: 0 -- c:/tmp/hdf5\empty.h5
logical: 966 alloc: 619 disk: 5643 ovhd: 2792+2232 items: 1 -- c:/tmp/hdf5\jabberwocky01.h5
logical: 1932 alloc: 1238 disk: 8702 ovhd: 5232+2232 items: 2 -- c:/tmp/hdf5\jabberwocky02.h5
logical: 4830 alloc: 3095 disk: 17879 ovhd: 12552+2232 items: 5 -- c:/tmp/hdf5\jabberwocky05.h5
logical: 9660 alloc: 6190 disk: 33678 ovhd: 25256+2232 items: 10 -- c:/tmp/hdf5\jabberwocky10.h5
[warning: C:\app\python\anaconda\lib\site-packages\tables\attributeset.py:391: UnicodeWarning: Unicode equal comparison failed to convert both arguments to Unicode - interpreting them as being unequal]
logical: 19320 alloc: 12380 disk: 65252 ovhd: 50640+2232 items: 20 -- c:/tmp/hdf5\jabberwocky20.h5
logical: 0 alloc: 0 disk: 2760 ovhd: 528+2232 items: 0 -- c:/tmp/hdf5\jabberwocky_attr01.h5
logical: 0 alloc: 0 disk: 3288 ovhd: 1056+2232 items: 0 -- c:/tmp/hdf5\jabberwocky_attr02.h5
logical: 0 alloc: 0 disk: 4872 ovhd: 2640+2232 items: 0 -- c:/tmp/hdf5\jabberwocky_attr05.h5
logical: 0 alloc: 0 disk: 7512 ovhd: 5280+2232 items: 0 -- c:/tmp/hdf5\jabberwocky_attr10.h5
logical: 0 alloc: 0 disk: 12792 ovhd: 10560+2232 items: 0 -- c:/tmp/hdf5\jabberwocky_attr20.h5
logical: 966 alloc: 619 disk: 5619 ovhd: 2768+2232 items: 1 -- c:/tmp/hdf5\jabberwocky01_noshuffle.h5
logical: 9660 alloc: 6190 disk: 33390 ovhd: 24968+2232 items: 10 -- c:/tmp/hdf5\jabberwocky10_noshuffle.h5
logical: 9660 alloc: 685 disk: 5709 ovhd: 2792+2232 items: 1 -- c:/tmp/hdf5\jabberwocky01repeat10.h5
logical: 489 alloc: 398 disk: 5422 ovhd: 2792+2232 items: 1 -- c:/tmp/hdf5\jabberwockyR100.h5
logical: 5051 alloc: 2124 disk: 7148 ovhd: 2792+2232 items: 1 -- c:/tmp/hdf5\jabberwockyR1000.h5
logical: 52917 alloc: 16425 disk: 21449 ovhd: 2792+2232 items: 1 -- c:/tmp/hdf5\jabberwockyR10000.h5
logical: 978 alloc: 796 disk: 8260 ovhd: 5232+2232 items: 2 -- c:/tmp/hdf5\jabberwockyR100x2.h5
logical: 10102 alloc: 4248 disk: 11712 ovhd: 5232+2232 items: 2 -- c:/tmp/hdf5\jabberwockyR1000x2.h5
logical: 105834 alloc: 32850 disk: 40314 ovhd: 5232+2232 items: 2 -- c:/tmp/hdf5\jabberwockyR10000x2.h5
logical: 4890 alloc: 3980 disk: 31468 ovhd: 25256+2232 items: 10 -- c:/tmp/hdf5\jabberwockyR100x2.h5
logical: 50510 alloc: 21240 disk: 48728 ovhd: 25256+2232 items: 10 -- c:/tmp/hdf5\jabberwockyR1000x2.h5
logical: 529170 alloc: 164250 disk: 191738 ovhd: 25256+2232 items: 10 -- c:/tmp/hdf5\jabberwockyR10000x2.h5
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment