Skip to content

Instantly share code, notes, and snippets.

@molpopgen
Last active October 5, 2017 19:42
Show Gist options
  • Save molpopgen/574a77a658f97fe1414ffbeef80c4a9e to your computer and use it in GitHub Desktop.
Save molpopgen/574a77a658f97fe1414ffbeef80c4a9e to your computer and use it in GitHub Desktop.
WIP for fwdpy11 <-> PyTables interaction
# Test code writing fwdpy11
# pop data to hdf5 via PyTables.
# GOAL IS COMPRESSION!!!!
# Instead of using ragged arrays
# (not compressable), we use carrays,
# and simply concacenate gamete data.
# In longer-term, this would be
# abstracted into some sort of GameteTable
# class that interacts with an hdf5 object.
# This totally shows how it is easy to get
# a lot of copy/pasted code. There is a lot
# of room for abstraction...
import tests.quick_pops
import tables
import numpy as np
import pickle
import lzma
import fwdpy11
import collections
import sys
COMPLEVEL = 5
def _fillTable(t, d):
"""
There is a bug in PyTables w.r.to direct
insertion from a structured array. The workaround
is to manually enter all data.
https://github.com/PyTables/PyTables/issues/643
:param t: a tables.Table
:param d: a NumPy record array
"""
keys = d.dtype.names
for datum in d:
row = t.row
for i, key in enumerate(keys):
row[key] = datum[i]
row.append()
def _merge_record_arrays(arrays):
"""
Merges numpy record arrays
:param arrays: tuple of record arrays
"""
import numpy.lib.recfunctions as rfn
return rfn.merge_arrays(arrays,
flatten=True, usemask=False)
def _write_mutations(h5f, group, tablename, mutation_container,
uint_container,
uint_container_dtype_name,
filters=None):
marray = _merge_record_arrays((np.array(mutation_container.array()),
np.array(uint_container,
dtype=[(uint_container_dtype_name,
np.uint32)])))
return h5f.create_table(group, tablename, obj=marray, filters=filters)
def _rebuild_mutations(mutation_table):
mutations = fwdpy11.MutationContainer()
uintvec = fwdpy11.VectorUint32()
field = None
valid_fields = ['mcounts', 'ftime']
for f in valid_fields:
if any(i == f for i in mutation_table.dtype.names) is True:
field = f
for record in mutation_table.iterrows():
uintvec.append(record[field])
# Things are out-of-order here
# b/c of how C++ representation
# to get into numpy array
# packs the struct nicely
m = fwdpy11.Mutation(
record['pos'], record['s'], record['h'],
record['g'], record['label'])
mutations.append(m)
if field == 'mcount':
rvtype = collections.namedtuple(
'MutationTableData', ['mutations', 'mcounts'])
rv = rvtype(mutations, uintvec)
else:
rvtype = collections.namedtuple(
'FixationTableData', ['fixations', 'fixation_times'])
rv = rvtype(mutations, uintvec)
return rv
def _write_gamete_mutation_keys(h5f, group, name1,
name2, gametes, attribute_name, filters):
u32atom = tables.UInt32Atom()
# Write length of each gametes neutral muts container
n_array = h5f.create_carray(group, name1, u32atom,
(len(gametes),), filters=filters)
for i in range(len(gametes)):
n_array[i] = len(getattr(gametes[i], attribute_name))
n_array.flush()
# Write the neutral gamete data to the file
nneutral = sum([len(getattr(i, attribute_name)) for i in gametes])
neutral_array = h5f.create_carray(group, name2, u32atom,
(nneutral,), filters=filters)
dummy = 0
for i in gametes:
neutral_array[dummy:(dummy + len(getattr(i, attribute_name)))
] = np.array(getattr(i, attribute_name))
dummy += len(getattr(i, attribute_name))
if dummy != nneutral:
raise RuntimeError("inconsistent internal state")
neutral_array.flush()
def _write_gametes(h5f, group, gametes, filters):
# Write number of occurrences of each gamete to file
u32atom = tables.UInt32Atom()
n_array = h5f.create_carray(group, 'gamete_n', u32atom,
(len(gametes),), filters=filters)
for i in range(len(gametes)):
n_array[i] = gametes[i].n
n_array.flush()
_write_gamete_mutation_keys(
h5f,
group, 'gamete_num_neutral',
'gamete_mutations', gametes, 'mutations', filters)
_write_gamete_mutation_keys(
h5f,
group, 'gamete_num_selected',
'gamete_smutations', gametes, 'smutations', filters)
def _read_gamete_num_mutations(group, name):
if hasattr(group, name) is False:
raise ValueError("path does not exist")
n = getattr(group, name)
rv = fwdpy11.VectorUint32()
for i in n:
rv.append(i)
return rv
def _read_gametes(group):
nneutral_per_gam = _read_gamete_num_mutations(
group, 'gamete_num_neutral')
nselected_per_gam = _read_gamete_num_mutations(
group, 'gamete_num_selected')
gamete_n = _read_gamete_num_mutations(group, 'gamete_n')
if len(set([len(nneutral_per_gam),
len(nselected_per_gam),
len(gamete_n)])) != 1:
raise RuntimeError("inconsistent container lengths")
gametes = fwdpy11.GameteContainer()
ndata = getattr(group, 'gamete_mutations')
sdata = getattr(group, 'gamete_smutations')
ndummy, sdummy = 0, 0
for neut, sel, n in zip(nneutral_per_gam, nselected_per_gam, gamete_n):
tn = fwdpy11.VectorUint32()
ts = fwdpy11.VectorUint32()
for i in ndata[ndummy:(ndummy + neut)]:
tn.append(i)
for i in sdata[sdummy:(sdummy + sel)]:
ts.append(i)
gametes.append(fwdpy11.Gamete((n, tn, ts)))
ndummy += neut
sdummy += sel
assert(ndummy == len(ndata))
assert(sdummy == len(sdata))
return gametes
def _write_slocus_diploids(h5f, group, diploids, filters):
g = h5f.create_group(group, 'diploids')
t = h5f.create_table(g, 'traits', obj=np.array(diploids.trait_array()))
t.flush()
t = h5f.create_table(g, 'keys', obj=np.array(diploids.key_array()))
t.flush()
def _read_slocus_diploids(path):
trait_path = getattr(path, 'traits')
key_path = getattr(path, 'keys')
# "typedef"
Diploid = fwdpy11.SingleLocusDiploid
if len(trait_path) == len(key_path):
rv = fwdpy11.DiploidContainer()
for i, j in zip(enumerate(trait_path.iterrows()), key_path):
rv.append(Diploid.create(j['first'],
j['second'],
i[0], i[1]['g'],
i[1]['e'], i[1]['w']
))
return rv
def _write_generation(h5f, group, g, filters):
carray = h5f.create_carray(group, 'generation', obj=g,
filters=filters)
carray.flush()
PopHDF = collections.namedtuple('PopHDF', ['poptype', 'pathname'])
def h5write_pop(pop, h5f, group, filters=None):
# Does the group exist?
if hasattr(h5f, group) is False:
group = h5f.create_group(h5f.root, group, filters=filters)
if isinstance(pop, fwdpy11.SlocusPop):
_write_generation(h5f, group, np.array(
[pop.N] * 1, dtype=np.uint32), filters)
_write_slocus_diploids(h5f, group, pop.diploids, filters)
gamete_group = h5f.create_group(group, 'gametes')
_write_gametes(h5f, gamete_group, pop.gametes, filters)
mtab = _write_mutations(h5f,
group, 'mutations',
pop.mutations, pop.mcounts,
'mcounts',
filters=filters)
mtab.flush()
ftab = _write_mutations(h5f, group, 'fixations',
pop.fixations, pop.fixation_times,
'ftime',
filters=filters)
ftab.flush()
return PopHDF(pop.__class__, group._v_pathname)
else:
raise NotImplementedError("unsupported population type")
def h5read_pop(h5f, pophdf):
path = h5f.get_node(h5f.root, pophdf.pathname)
mut_root = h5f.get_node(path, 'mutations')
mutations, mcounts = _rebuild_mutations(mut_root)
fix_root = h5f.get_node(path, 'fixations')
fixations, fixation_times = _rebuild_mutations(fix_root)
dip_root = h5f.get_node(path, 'diploids')
diploids = _read_slocus_diploids(dip_root)
gam_root = h5f.get_node(path, 'gametes')
gametes = _read_gametes(gam_root)
return pophdf.poptype.create_with_fixations(
diploids, gametes, mutations, fixations, fixation_times, 0)
if __name__ == "__main__":
pop = tests.quick_pops.quick_nonneutral_slocus(N=1000, simlen=10000)
h5f = tables.open_file("population.h5", 'w',
filters=tables.Filters(complevel=COMPLEVEL,
complib='zlib'))
global_filters = tables.Filters(complevel=COMPLEVEL, complib='zlib')
pw = h5write_pop(pop, h5f, 'inner_group', global_filters)
h5f.close()
h5f = tables.open_file("population.h5", 'r')
pop2 = h5read_pop(h5f, pw)
h5f.close()
assert(pop == pop2)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment