
@wasade
Last active August 15, 2016 17:37
Block decomposition method for UniFrac + rapid subsetting of BIOM direct from disk + joblib + balanced parentheses tree
import numpy as np
import bp
import biom
import skbio
import sys
import joblib
import h5py


# Lazy, disk-backed view of a BIOM HDF5 table: only the requested samples are
# materialized in memory when indexed.
class biom_counts_wrapper(object):
    def __init__(self, table_f):
        self.table_f = table_f
        with h5py.File(self.table_f, 'r') as table_h5grp:
            self.ids = np.asarray([i.decode('ascii') for i in table_h5grp['sample/ids'][:]])
            self.otu_ids = np.asarray([i.decode('ascii') for i in table_h5grp['observation/ids'][:]])

    def __getitem__(self, indices_to_take):
        with h5py.File(self.table_f, 'r') as table_h5grp:
            table = biom.Table.from_hdf5(table_h5grp,
                                         ids=self.ids[indices_to_take],
                                         subset_with_metadata=False)
        # need to wrap as it returns a matrix type
        return np.array(table.matrix_data.todense().astype(int).T)

    def __len__(self):
        return len(self.ids)


# Thin tree adapter over a balanced parentheses (bp) tree; the newick file is
# only parsed on first use.
class monkey:
    def __init__(self, t_fp):
        self.fp = t_fp
        self.bp = None

    # needed for skbio.diversity._util._vectorize_counts_and_tree
    def to_array(self, **kwargs):
        if self.bp is None:
            raise AttributeError("should not happen")
        return bp.to_skbio_treearray(self.bp)

    # punch through
    def shear(self, tips):
        if self.bp is None:
            with open(self.fp) as f:
                self.bp = bp.parse_newick(f.read())
        new_bp = self.bp.shear(set(tips)).collapse()
        new_obj = self.__class__(self.fp)
        new_obj.bp = new_bp
        return new_obj


tree_full = monkey(sys.argv[1])
biom_full = biom_counts_wrapper(sys.argv[2])


def mapper(func, kw_gen):
    # fan the per-block computations out over joblib workers
    with joblib.Parallel(n_jobs=int(sys.argv[3])) as parallel:
        results = parallel(joblib.delayed(func)(**kw) for kw in kw_gen)
    return results


if __name__ == '__main__':
    from skbio.diversity._block import block_beta_diversity
    from skbio.diversity import beta_diversity

    if sys.argv[4] == 'block':
        result = block_beta_diversity('unweighted_unifrac', biom_full,
                                      ids=biom_full.ids,
                                      otu_ids=biom_full.otu_ids, tree=tree_full,
                                      validate=False, k=64, map_f=mapper)
    elif sys.argv[4] == 'normal':
        result = beta_diversity('unweighted_unifrac',
                                biom_full[np.arange(len(biom_full.ids))],
                                ids=biom_full.ids,
                                otu_ids=biom_full.otu_ids,
                                tree=tree_full.shear(biom_full.otu_ids),
                                validate=False)
    else:
        # validate: the block and normal paths should agree
        bkres = block_beta_diversity('unweighted_unifrac', biom_full,
                                     ids=biom_full.ids,
                                     otu_ids=biom_full.otu_ids, tree=tree_full,
                                     validate=False, k=64, map_f=mapper)
        normal = beta_diversity('unweighted_unifrac',
                                biom_full[np.arange(len(biom_full.ids))],
                                ids=biom_full.ids,
                                otu_ids=biom_full.otu_ids,
                                tree=tree_full.shear(biom_full.otu_ids),
                                validate=False)

        import numpy.testing as npt
        npt.assert_almost_equal(bkres.data, normal.data)

wasade commented Jul 25, 2016

...likely possible to avoid reparsing the tree on each individual block, but this works right now. At 4 cores, it appears to be 2x faster than skbio by itself when benchmarking on 2048 random AG samples (Greengenes 97%). The compute gain isn't necessarily impressive, but the memory gain is: a 10x reduction, so anticipate a (relatively) small footprint even with large trees.
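To make the memory behavior concrete, here is a small illustrative use of the table wrapper above (the sample count of 64 is arbitrary and assumes the table has at least that many samples): each indexing call re-opens the HDF5 file and materializes only the requested samples as a dense samples-by-OTUs counts array, so the full table never needs to live in memory at once.

    # illustrative only: pull just the first 64 samples' counts off disk
    first_block = biom_full[np.arange(64)]
    print(first_block.shape)  # (64, number of OTUs in the table)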

Note: k=64 means 64 samples per block. This seems to be the sweet spot from previous UniFrac benchmarking, where 64 samples was a good trade-off between the time to construct the counts array and the average per-pairwise calculation. It appears to hold here as well, where k=32 and k=128 are both slower. This could easily depend on the size of the tree and the richness of the samples, so k=64 may be overfit to AG data.
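For anyone who wants to check that on their own data, a minimal timing sketch along these lines should do it (this is not part of the gist; it assumes block_beta_diversity is imported as in the script and reuses biom_full, tree_full, and mapper defined above, and the candidate k values are just examples):

    import time

    # time the block decomposition at a few block sizes
    for k in (32, 64, 128):
        start = time.time()
        block_beta_diversity('unweighted_unifrac', biom_full,
                             ids=biom_full.ids,
                             otu_ids=biom_full.otu_ids, tree=tree_full,
                             validate=False, k=k, map_f=mapper)
        print(k, time.time() - start)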
