Last active
August 29, 2015 14:13
-
-
Save jason-s/e0f4cb0fff1e2ab2fb20 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import numpy as np | |
import tables as pt | |
import subprocess | |
import random | |
import re | |
import os | |
import zlib | |
class H5fileContext(object):
    '''Context manager that creates an HDF5 file holding a single group.

    Entering the context opens ``filename`` with PyTables in ``'w'`` mode and
    creates the group ``groupname`` directly under the file root; leaving it
    flushes and closes the file.  The open file is kept in ``self.hfile`` and
    the group in ``self.g``.
    '''

    def __init__(self, filename, groupname):
        self.filename = filename
        self.groupname = groupname

    def open(self):
        '''Open the file for writing and create the group under its root.'''
        self.hfile = pt.open_file(self.filename, 'w')
        try:
            # create_group is the PyTables 3.x spelling, matching the
            # open_file / create_carray calls used by the rest of this class.
            self.g = self.hfile.create_group(self.hfile.root, self.groupname)
        except Exception:
            # Do not leave the file handle open when the group cannot be made.
            self.hfile.close()
            raise

    def close(self):
        '''Flush pending writes and close the file.'''
        self.hfile.flush()
        self.hfile.close()

    def recordString(self, nodename, s, complevel=5, complib='zlib', shuffle=True):
        '''creates a CArray object in an HDF5 file
        that represents a unicode string

        ``s`` is stored as its UTF-8 bytes (a byte string is stored as-is) in
        a one-dimensional uint8 CArray named ``nodename`` inside the group.
        ``complevel``, ``complib`` and ``shuffle`` are passed to
        ``pt.Filters``.  Returns the CArray that was created.
        '''
        raw = s if isinstance(s, bytes) else s.encode('utf-8')
        # frombuffer replaces the deprecated binary mode of np.fromstring.
        array_of_bytes = np.frombuffer(raw, dtype=np.uint8)
        atom = pt.UInt8Atom()           # @UndefinedVariable
        filters = pt.Filters(complevel=complevel, complib=complib, shuffle=shuffle)
        ca = self.hfile.create_carray(self.g, nodename, atom,
                                      shape=(len(array_of_bytes),),
                                      filters=filters)
        ca[:] = array_of_bytes
        return ca

    def recordGroupAttribute(self, attrname, s, complevel=5):
        '''Store ``s``, zlib-compressed, as attribute ``attrname`` of the group.

        Text is encoded as UTF-8 first (the same encoding recordString uses);
        a byte string is compressed unchanged.  ``complevel`` is the zlib
        compression level.
        '''
        raw = s if isinstance(s, bytes) else s.encode('utf-8')
        setattr(self.g._v_attrs, attrname, zlib.compress(raw, complevel))

    def __enter__(self):
        self.open()
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        # Always close; returning None lets any exception propagate.
        self.close()
# Sample text used throughout the experiment: explore_filesizes stores it
# directly as a payload and randpoem draws its vocabulary from it.  The
# literal is data written into the HDF5 files, so it is kept verbatim.
jabberwocky = '''
`Twas brillig, and the slithy toves
Did gyre and gimble in the wabe:
All mimsy were the borogoves,
And the mome raths outgrabe.
"Beware the Jabberwock, my son!
The jaws that bite, the claws that catch!
Beware the Jubjub bird, and shun
The frumious Bandersnatch!"
He took his vorpal sword in hand:
Long time the manxome foe he sought --
So rested he by the Tumtum tree,
And stood awhile in thought.
And, as in uffish thought he stood,
The Jabberwock, with eyes of flame,
Came whiffling through the tulgey wood,
And burbled as it came!
One, two! One, two! And through and through
The vorpal blade went snicker-snack!
He left it dead, and with its head
He went galumphing back.
"And, has thou slain the Jabberwock?
Come to my arms, my beamish boy!
O frabjous day! Callooh! Callay!"
He chortled in his joy.
'Twas brillig, and the slithy toves
Did gyre and gimble in the wabe;
All mimsy were the borogoves,
And the mome raths outgrabe.
'''
def randpoem(source, seed, n):
    '''Generate lines of pseudo-random text built from the words of source.

    source -- text whose distinct lower-cased words form the vocabulary
    seed   -- seed for a private random.Random instance
    n      -- step budget; n - 1 steps are performed (none when n <= 1)

    Each step either appends a random word to the current line (90% of the
    time) or ends the line.  Ending a non-empty line yields it capitalized
    with a random punctuation mark (possibly none) appended; ending an empty
    line yields "".  Words still pending when the steps run out are not
    yielded.

    Raises IndexError, on the first word pick, if source contains no words.
    '''
    r = random.Random()
    r.seed(seed)
    # Sorting makes the vocabulary order independent of set iteration order
    # (which varies with string hashing), so a given seed always produces
    # the same text.
    words = sorted(set(word.lower() for word in re.split(r'\W+', source) if word))
    line = []
    for _ in range(n - 1):
        if r.random() < 0.1:
            if line:
                yield ' '.join(line).capitalize()+(r.choice(['','.',';',',','?','!']))
                line = []
            else:
                yield ""
        else:
            line.append(r.choice(words))
def popgroup(stack, n, kvnext):
    '''Fold stack entries indented deeper than n, then push (n, kvnext).

    stack is a list of (indent, (key, value)) tuples and is modified in
    place.  Entries are popped from the top while their indent exceeds n.
    Each time the entry uncovered by a pop has a smaller indent than the one
    just removed, the entries collected so far become its children:

      * an 'Attribute' entry becomes
        ('Attribute:<value words joined by ":">', {children})
      * any other entry becomes (key, (value, {children}))

    The caller must keep a sentinel entry at the bottom whose indent is not
    greater than n, otherwise the stack runs empty and IndexError is raised.
    '''
    depth, entry = stack[-1]
    children = []
    while depth > n:
        child_depth = depth
        children.append(entry)
        stack.pop()
        depth, entry = stack[-1]
        if depth >= child_depth:
            # The uncovered entry is a sibling; keep collecting.
            continue
        # The uncovered entry is the parent of everything collected so far.
        name = entry[0]
        if name == 'Attribute':
            name = name + ':' + ':'.join(entry[1].split())
            entry = (name, dict(children))
        else:
            entry = (name, (entry[1], dict(children)))
        stack[-1] = (depth, entry)
        children = []
    stack.append((n, kvnext))
def parse_h5ls(out):
    '''Parse indented "key: value" output (as printed by ``h5ls -v``).

    out -- the captured output, as text or as bytes (bytes are decoded as
           UTF-8 with undecodable bytes replaced)

    The first line of out is skipped.  Every following line that starts with
    optional whitespace and an identifier contributes a (key, value) pair
    whose nesting level is its leading-whitespace width; lines that do not
    match are ignored.  Nesting is folded by popgroup.

    Returns the list of top-level (key, value) pairs, in input order.
    '''
    # subprocess.check_output returns bytes on Python 3, which the str
    # pattern below cannot search.  On Python 2 bytes is str, so the input
    # is left untouched there.
    if isinstance(out, bytes) and not isinstance(out, str):
        out = out.decode('utf-8', 'replace')
    r = re.compile(r'^(\s*)([^\W\d][\w-]*):?\s*(.*)')
    # Sentinel with indent -1 so popgroup never empties the stack.
    stack = [(-1, ('a','b'))]
    for line in out.splitlines()[1:]:
        m = r.search(line)
        if m:
            popgroup(stack, len(m.group(1)), (m.group(2), m.group(3)))
    # Fold whatever is still nested, then drop the placeholder popgroup pushed.
    popgroup(stack, 0, None)
    stack.pop()
    return [item[1] for item in stack[1:]]
def analyze(filename, groupname, base_overhead=0, doprint=True):
    '''Total the storage figures h5ls reports for one group of an HDF5 file.

    Runs ``h5ls -v <filename>/<groupname>``, parses the listing with
    parse_h5ls and, for every listed item, reads the first two numbers of
    its 'Storage' entry.

    base_overhead and doprint are accepted but not used.

    Returns (item count, sum of first Storage numbers, sum of second Storage
    numbers, size of the file on disk in bytes).
    '''
    listing = subprocess.check_output(['h5ls','-v',filename+'/'+groupname])
    entries = parse_h5ls(listing)
    logical_total = 0
    allocated_total = 0
    for entry in entries:
        # e.g. "966 logical bytes, 619 allocated bytes, ..." -> leading
        # token of each comma-separated field.
        fields = entry[1][1]['Storage'].split(', ')
        counts = [field.split()[0] for field in fields]
        logical_total += int(counts[0])
        allocated_total += int(counts[1])
    disk_size = os.path.getsize(filename)
    return (len(entries), logical_total, allocated_total, disk_size)
def explore_filesizes(rootdir):
    '''Write a series of small HDF5 files and print a size report for each.

    rootdir -- directory receiving one '<name>.h5' file per task; it is
               created if it does not exist

    Every task fills the 'data' group of a fresh file, either with compressed
    CArray nodes or with zlib-compressed group attributes.  The file is then
    measured with analyze() and one line is printed showing logical size,
    allocated size, size on disk, overhead (disk - allocated) relative to the
    first task's overhead, the number of items, and the file name.

    The first task writes an empty group, so its overhead is the baseline.
    '''
    def task_file_store_string(filebase, n, s, shuffle=True):
        # Task: store s in n CArray nodes named boing0 .. boing<n-1>.
        def func(f):
            for k in range(n):
                f.recordString('boing%d' % k, s, shuffle=shuffle)
        func.filebase = filebase
        return func

    def task_file_attr_string(filebase, n, s):
        # Task: store s in n group attributes named boing0 .. boing<n-1>.
        def func(f):
            for k in range(n):
                f.recordGroupAttribute('boing%d' % k, s)
        func.filebase = filebase
        return func

    R100 = '\n'.join(randpoem(jabberwocky, 123, 100))
    R1000 = '\n'.join(randpoem(jabberwocky, 123, 1000))
    R10000 = '\n'.join(randpoem(jabberwocky, 123, 10000))
    tasks = [
        task_file_store_string('empty', 0, '')
    ]
    tasks += [task_file_store_string('jabberwocky%02d' % n, n, jabberwocky) for n in [1,2,5,10,20]]
    tasks += [task_file_attr_string('jabberwocky_attr%02d' % n, n, jabberwocky) for n in [1,2,5,10,20]]
    tasks += [task_file_store_string('jabberwocky%02d_noshuffle' % n, n, jabberwocky, shuffle=False) for n in [1,10]]
    tasks += [
        task_file_store_string('jabberwocky01repeat10', 1, jabberwocky*10),
        task_file_store_string('jabberwockyR100',1,R100),
        task_file_store_string('jabberwockyR1000',1,R1000),
        task_file_store_string('jabberwockyR10000',1,R10000),
        task_file_store_string('jabberwockyR100x2',2,R100),
        task_file_store_string('jabberwockyR1000x2',2,R1000),
        task_file_store_string('jabberwockyR10000x2',2,R10000),
        # The 10-node variants need their own file names; reusing the 'x2'
        # names would overwrite the 2-node files written just above.
        task_file_store_string('jabberwockyR100x10',10,R100),
        task_file_store_string('jabberwockyR1000x10',10,R1000),
        task_file_store_string('jabberwockyR10000x10',10,R10000),
    ]
    if not os.path.isdir(rootdir):
        os.makedirs(rootdir)
    baseline_overhead = None
    for task in tasks:
        filename = os.path.join(rootdir,'%s.h5' % task.filebase)
        groupname = 'data'
        with H5fileContext(filename,groupname) as f:
            task(f)
        # Measure only after the with-block has closed the file.
        (n, tlogical, talloc, totsz) = analyze(filename, groupname)
        overhead = (totsz-talloc)
        if baseline_overhead is None:
            baseline_overhead = overhead
            baseline_str = str(baseline_overhead)
        # Single parenthesized argument: prints the same on Python 2 and 3.
        print('logical: %8d alloc: %8d disk: %8d ovhd: %8d+%s items: %2d -- %s' % (
            tlogical, talloc, totsz, overhead-baseline_overhead, baseline_str, n, filename))
if __name__ == '__main__':
    # The output directory may be given as the first command-line argument;
    # without one the original hard-coded location is used.
    import sys
    explore_filesizes(sys.argv[1] if len(sys.argv) > 1 else 'c:/tmp/hdf5')
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
logical: 0 alloc: 0 disk: 2232 ovhd: 0+2232 items: 0 -- c:/tmp/hdf5\empty.h5 | |
logical: 966 alloc: 619 disk: 5643 ovhd: 2792+2232 items: 1 -- c:/tmp/hdf5\jabberwocky01.h5 | |
logical: 1932 alloc: 1238 disk: 8702 ovhd: 5232+2232 items: 2 -- c:/tmp/hdf5\jabberwocky02.h5 | |
logical: 4830 alloc: 3095 disk: 17879 ovhd: 12552+2232 items: 5 -- c:/tmp/hdf5\jabberwocky05.h5 | |
logical: 9660 alloc: 6190 disk: 33678 ovhd: 25256+2232 items: 10 -- c:/tmp/hdf5\jabberwocky10.h5 | |
[warning: C:\app\python\anaconda\lib\site-packages\tables\attributeset.py:391: UnicodeWarning: Unicode equal comparison failed to convert both arguments to Unicode - interpreting them as being unequal] | |
logical: 19320 alloc: 12380 disk: 65252 ovhd: 50640+2232 items: 20 -- c:/tmp/hdf5\jabberwocky20.h5 | |
logical: 0 alloc: 0 disk: 2760 ovhd: 528+2232 items: 0 -- c:/tmp/hdf5\jabberwocky_attr01.h5 | |
logical: 0 alloc: 0 disk: 3288 ovhd: 1056+2232 items: 0 -- c:/tmp/hdf5\jabberwocky_attr02.h5 | |
logical: 0 alloc: 0 disk: 4872 ovhd: 2640+2232 items: 0 -- c:/tmp/hdf5\jabberwocky_attr05.h5 | |
logical: 0 alloc: 0 disk: 7512 ovhd: 5280+2232 items: 0 -- c:/tmp/hdf5\jabberwocky_attr10.h5 | |
logical: 0 alloc: 0 disk: 12792 ovhd: 10560+2232 items: 0 -- c:/tmp/hdf5\jabberwocky_attr20.h5 | |
logical: 966 alloc: 619 disk: 5619 ovhd: 2768+2232 items: 1 -- c:/tmp/hdf5\jabberwocky01_noshuffle.h5 | |
logical: 9660 alloc: 6190 disk: 33390 ovhd: 24968+2232 items: 10 -- c:/tmp/hdf5\jabberwocky10_noshuffle.h5 | |
logical: 9660 alloc: 685 disk: 5709 ovhd: 2792+2232 items: 1 -- c:/tmp/hdf5\jabberwocky01repeat10.h5 | |
logical: 489 alloc: 398 disk: 5422 ovhd: 2792+2232 items: 1 -- c:/tmp/hdf5\jabberwockyR100.h5 | |
logical: 5051 alloc: 2124 disk: 7148 ovhd: 2792+2232 items: 1 -- c:/tmp/hdf5\jabberwockyR1000.h5 | |
logical: 52917 alloc: 16425 disk: 21449 ovhd: 2792+2232 items: 1 -- c:/tmp/hdf5\jabberwockyR10000.h5 | |
logical: 978 alloc: 796 disk: 8260 ovhd: 5232+2232 items: 2 -- c:/tmp/hdf5\jabberwockyR100x2.h5 | |
logical: 10102 alloc: 4248 disk: 11712 ovhd: 5232+2232 items: 2 -- c:/tmp/hdf5\jabberwockyR1000x2.h5 | |
logical: 105834 alloc: 32850 disk: 40314 ovhd: 5232+2232 items: 2 -- c:/tmp/hdf5\jabberwockyR10000x2.h5 | |
logical: 4890 alloc: 3980 disk: 31468 ovhd: 25256+2232 items: 10 -- c:/tmp/hdf5\jabberwockyR100x2.h5 | |
logical: 50510 alloc: 21240 disk: 48728 ovhd: 25256+2232 items: 10 -- c:/tmp/hdf5\jabberwockyR1000x2.h5 | |
logical: 529170 alloc: 164250 disk: 191738 ovhd: 25256+2232 items: 10 -- c:/tmp/hdf5\jabberwockyR10000x2.h5 |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment