Created
March 9, 2023 15:21
-
-
Save ivirshup/c56439fad2f2d768ec753b6e50ee22ab to your computer and use it in GitHub Desktop.
Simple out-of-core (OOC) concatenation for AnnData
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import zarr | |
import pandas as pd, numpy as np | |
from anndata.experimental import read_elem, write_elem | |
from anndata._core.sparse_dataset import SparseDataset | |
def _df_index(df: zarr.Group) -> np.ndarray:
    """Load the index of an on-disk dataframe group.

    The group's ``"_index"`` attribute names the child element that holds
    the index; that element is read with ``read_elem`` and returned.
    """
    index_elem = df[df.attrs["_index"]]
    return read_elem(index_elem)
def _check_encoding(elem, key: str, expected: str, what: str) -> None:
    """Raise ``ValueError`` unless ``elem.attrs[key]`` equals ``expected``."""
    found = elem.attrs.get(key)
    if found != expected:
        raise ValueError(f"{what}: expected {key}={expected!r}, found {found!r}")


def _write_index_only_dataframe(parent, name: str, index_key: str, index) -> None:
    """Create dataframe group ``name`` under ``parent`` holding only an index."""
    frame = parent.create_group(name)
    frame.attrs.update({
        "_index": index_key,
        "column-order": [],
        "encoding-type": "dataframe",
        "encoding-version": "0.2.0",
    })
    write_elem(frame, index_key, index)


def concatenate_anndatas(groups: list[zarr.Group], output_group: zarr.Group):
    """Concatenate on-disk anndata groups along obs into ``output_group``.

    Only three things are written to the output: the var index (taken from
    the first group), the concatenated obs index, and ``X``. The ``obs`` and
    ``var`` dataframes are written with an empty ``column-order``, i.e.
    without any columns.

    Parameters
    ----------
    groups
        Non-empty list of zarr groups, each encoded as ``"anndata"`` with an
        ``X`` element encoded as ``"csr_matrix"`` version ``"0.1.0"``. All
        groups must have identical var names.
    output_group
        Zarr group that receives the result. ``var``, ``obs`` and ``X`` are
        created inside it.

    Raises
    ------
    ValueError
        If ``groups`` is empty, if any group or its ``X`` has an unexpected
        encoding, or if var names differ between groups.
    """
    if not groups:
        raise ValueError("groups must contain at least one anndata group")

    # Explicit raises instead of `assert`: assertions are stripped under
    # `python -O`, which would silently disable input validation.
    for position, group in enumerate(groups):
        _check_encoding(group, "encoding-type", "anndata", f"groups[{position}]")

    # All var_names must be equal
    first = groups[0]
    var_names = _df_index(first["var"])
    var_names_key = first["var"].attrs["_index"]
    for g in groups[1:]:
        if not np.array_equal(var_names, _df_index(g["var"])):
            raise ValueError("var_names must be equal")

    # Validate X
    for position, g in enumerate(groups):
        label = f"groups[{position}]['X']"
        _check_encoding(g["X"], "encoding-type", "csr_matrix", label)
        _check_encoding(g["X"], "encoding-version", "0.1.0", label)

    output_group.attrs.update({"encoding-type": "anndata", "encoding-version": "0.1.0"})

    # Write var names
    _write_index_only_dataframe(output_group, "var", var_names_key, var_names)

    # Write obs names; the index key of the first group is reused for the output.
    obs_names_key = first["obs"].attrs["_index"]
    obs_names = np.concatenate([_df_index(g["obs"]) for g in groups])
    _write_index_only_dataframe(output_group, "obs", obs_names_key, obs_names)

    # Write X: seed the output with the first matrix, then append the others
    # one group at a time (presumably so the full matrix is never held in
    # memory at once -- confirm against SparseDataset.append).
    write_elem(output_group, "X", SparseDataset(first["X"]))
    X = SparseDataset(output_group["X"])
    for g in groups[1:]:
        X.append(SparseDataset(g["X"]))
from scipy import sparse | |
import pandas as pd | |
import numpy as np | |
import anndata as ad | |
from anndata.experimental import read_elem, write_elem | |
from anndata.tests.helpers import assert_equal | |
import zarr | |
def test_concatenate_ondisk():
    """Check that on-disk concatenation matches ``ad.concat`` in memory.

    Builds three random sparse AnnData objects that share var names, writes
    each to its own zarr group, concatenates the groups with
    ``concatenate_anndatas``, and compares the result read back from the
    output group with ``ad.concat`` of the in-memory objects.
    """
    # Seeded generator so a failure can be reproduced.
    rng = np.random.default_rng(0)
    n_vars = 20
    var_names = pd.Index([f"gene_{i}" for i in range(n_vars)], name="genes")

    adatas = []
    obs_offset = 0
    for _ in range(3):
        # Inclusive upper bound, matching the deprecated
        # np.random.random_integers(30, 50) this replaces.
        n_obs = int(rng.integers(30, 50, endpoint=True))
        obs_names = pd.Index(
            [f"cell_{j}" for j in range(obs_offset, obs_offset + n_obs)],
            name="cells",
        )
        adatas.append(
            ad.AnnData(
                X=sparse.random(
                    n_obs, n_vars, density=0.2, format="csr", random_state=rng
                ),
                obs=pd.DataFrame(index=obs_names),
                var=pd.DataFrame(index=var_names),
            )
        )
        # Keep obs names unique across the generated objects.
        obs_offset += n_obs

    groups = []
    for adata in adatas:
        group = zarr.group()
        write_elem(group, "/", adata)
        groups.append(group)

    output_group = zarr.group()
    concatenate_anndatas(groups, output_group)

    result = read_elem(output_group)
    expected = ad.concat(adatas)
    assert_equal(expected, result)
# Run the round-trip check only when this file is executed directly, so that
# merely importing it does not trigger the test.
if __name__ == "__main__":
    test_concatenate_ondisk()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment