
@wasade
Last active August 15, 2016 17:37
Block decomposition method for UniFrac + rapid subsetting of BIOM direct from disk + joblib + balanced parentheses tree
import numpy as np
import bp
import biom
import skbio
import sys
import joblib
import h5py


# Lazy, disk-backed view of a BIOM HDF5 table: only the requested samples are
# materialized in memory when indexed.
class biom_counts_wrapper(object):
    def __init__(self, table_f):
        self.table_f = table_f
        with h5py.File(self.table_f, 'r') as table_h5grp:
            self.ids = np.asarray([i.decode('ascii') for i in table_h5grp['sample/ids'][:]])
            self.otu_ids = np.asarray([i.decode('ascii') for i in table_h5grp['observation/ids'][:]])

    def __getitem__(self, indices_to_take):
        with h5py.File(self.table_f, 'r') as table_h5grp:
            table = biom.Table.from_hdf5(table_h5grp,
                                         ids=self.ids[indices_to_take],
                                         subset_with_metadata=False)
        # need to wrap as it returns a matrix type
        return np.array(table.matrix_data.todense().astype(int).T)

    def __len__(self):
        return len(self.ids)


# Thin tree adapter over a balanced parentheses (bp) tree; the newick file is
# only parsed on first use.
class monkey:
    def __init__(self, t_fp):
        self.fp = t_fp
        self.bp = None

    # needed for skbio.diversity._util._vectorize_counts_and_tree
    def to_array(self, **kwargs):
        if self.bp is None:
            raise AttributeError("should not happen")
        return bp.to_skbio_treearray(self.bp)

    # punch through
    def shear(self, tips):
        if self.bp is None:
            with open(self.fp) as f:
                self.bp = bp.parse_newick(f.read())
        new_bp = self.bp.shear(set(tips)).collapse()
        new_obj = self.__class__(self.fp)
        new_obj.bp = new_bp
        return new_obj


tree_full = monkey(sys.argv[1])
biom_full = biom_counts_wrapper(sys.argv[2])


def mapper(func, kw_gen):
    # fan the per-block computations out over joblib workers
    with joblib.Parallel(n_jobs=int(sys.argv[3])) as parallel:
        results = parallel(joblib.delayed(func)(**kw) for kw in kw_gen)
    return results


if __name__ == '__main__':
    from skbio.diversity._block import block_beta_diversity
    from skbio.diversity import beta_diversity

    if sys.argv[4] == 'block':
        result = block_beta_diversity('unweighted_unifrac', biom_full,
                                      ids=biom_full.ids,
                                      otu_ids=biom_full.otu_ids, tree=tree_full,
                                      validate=False, k=64, map_f=mapper)
    elif sys.argv[4] == 'normal':
        result = beta_diversity('unweighted_unifrac',
                                biom_full[np.arange(len(biom_full.ids))],
                                ids=biom_full.ids,
                                otu_ids=biom_full.otu_ids,
                                tree=tree_full.shear(biom_full.otu_ids),
                                validate=False)
    else:
        # validate: the block and normal paths should agree
        bkres = block_beta_diversity('unweighted_unifrac', biom_full,
                                     ids=biom_full.ids,
                                     otu_ids=biom_full.otu_ids, tree=tree_full,
                                     validate=False, k=64, map_f=mapper)
        normal = beta_diversity('unweighted_unifrac',
                                biom_full[np.arange(len(biom_full.ids))],
                                ids=biom_full.ids,
                                otu_ids=biom_full.otu_ids,
                                tree=tree_full.shear(biom_full.otu_ids),
                                validate=False)

        import numpy.testing as npt
        npt.assert_almost_equal(bkres.data, normal.data)

wasade commented Jul 25, 2016

...likely possible to avoid reparsing the tree on each individual block, but this works right now. At 4 cores, it appears to be 2x faster than skbio by itself when benchmarking on 2048 random AG samples (Greengenes 97%). The compute gain isn't necessarily impressive, but the memory gain is: a 10x reduction, so anticipate a (relatively) small footprint even with large trees.
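To make the memory behavior concrete, here is a small illustrative use of the table wrapper above (the sample count of 64 is arbitrary and assumes the table has at least that many samples): each indexing call re-opens the HDF5 file and materializes only the requested samples as a dense samples-by-OTUs counts array, so the full table never needs to live in memory at once.

    # illustrative only: pull just the first 64 samples' counts off disk
    first_block = biom_full[np.arange(64)]
    print(first_block.shape)  # (64, number of OTUs in the table)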

Note: k=64 means 64 samples per block. This seems to be the sweet spot from previous UniFrac benchmarking, where 64 samples was a good trade-off between the time to construct the counts array and the average per-pairwise calculation. It appears to hold here as well, where k=32 and k=128 are both slower. This could easily depend on the size of the tree and the richness of the samples, so k=64 may be overfit to AG data.
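For anyone who wants to check that on their own data, a minimal timing sketch along these lines should do it (this is not part of the gist; it assumes block_beta_diversity is imported as in the script and reuses biom_full, tree_full, and mapper defined above, and the candidate k values are just examples):

    import time

    # time the block decomposition at a few block sizes
    for k in (32, 64, 128):
        start = time.time()
        block_beta_diversity('unweighted_unifrac', biom_full,
                             ids=biom_full.ids,
                             otu_ids=biom_full.otu_ids, tree=tree_full,
                             validate=False, k=k, map_f=mapper)
        print(k, time.time() - start)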
