Skip to content

Instantly share code, notes, and snippets.

@ivirshup
Created March 9, 2023 15:21
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save ivirshup/c56439fad2f2d768ec753b6e50ee22ab to your computer and use it in GitHub Desktop.
Save ivirshup/c56439fad2f2d768ec753b6e50ee22ab to your computer and use it in GitHub Desktop.
Simple out-of-core (OOC) concatenation for anndata
import zarr
import pandas as pd, numpy as np
from anndata.experimental import read_elem, write_elem
from anndata._core.sparse_dataset import SparseDataset
def _df_index(df: zarr.Group) -> np.ndarray:
    """Read the index values of a dataframe stored as a zarr group.

    The group's ``_index`` attribute names the child element that holds the
    index; that element is loaded with ``read_elem`` and returned.
    """
    return read_elem(df[df.attrs["_index"]])
def _write_index_only_dataframe(
    parent: zarr.Group, name: str, index_key: str, index: np.ndarray
) -> None:
    """Create ``parent[name]`` as a column-less dataframe holding only ``index``."""
    df = parent.create_group(name)
    df.attrs.update({
        "_index": index_key,
        "column-order": [],
        "encoding-type": "dataframe",
        "encoding-version": "0.2.0",
    })
    write_elem(df, index_key, index)


def concatenate_anndatas(groups: list[zarr.Group], output_group: zarr.Group):
    """Concatenate on-disk AnnData groups into ``output_group``.

    Only three elements are written to the output: the var index (taken from
    the first group), the obs index (the obs indices of all groups joined in
    order) and ``X`` (the first group's ``X`` with every following group's
    ``X`` appended to it). The written ``obs`` and ``var`` dataframes have no
    columns.

    Parameters
    ----------
    groups
        Non-empty list of zarr groups. Each must have ``encoding-type``
        ``"anndata"``, an ``X`` encoded as ``"csr_matrix"`` version
        ``"0.1.0"``, and a var index equal to that of the first group.
    output_group
        Zarr group that receives the concatenated result.

    Raises
    ------
    ValueError
        If ``groups`` is empty, a group is not encoded as anndata, the var
        indices differ, or an ``X`` does not have the expected encoding.
    """
    # Without this guard an empty list fails later with a bare IndexError.
    if not groups:
        raise ValueError("groups must contain at least one anndata group")

    # Validation raises explicitly instead of using ``assert`` so the checks
    # still run when Python is started with -O.
    for pos, group in enumerate(groups):
        encoding_type = group.attrs.get("encoding-type")
        if encoding_type != "anndata":
            raise ValueError(
                f"groups[{pos}] must have encoding-type 'anndata', "
                f"got {encoding_type!r}"
            )

    # All var_names must be equal
    first = groups[0]
    var_names = _df_index(first["var"])
    var_names_key = first["var"].attrs["_index"]
    for group in groups[1:]:
        if not np.array_equal(var_names, _df_index(group["var"])):
            raise ValueError("var_names must be equal")

    # Validate X
    for pos, group in enumerate(groups):
        x_attrs = group["X"].attrs
        x_type = x_attrs.get("encoding-type")
        if x_type != "csr_matrix":
            raise ValueError(
                f"groups[{pos}]['X'] must have encoding-type 'csr_matrix', "
                f"got {x_type!r}"
            )
        x_version = x_attrs.get("encoding-version")
        if x_version != "0.1.0":
            raise ValueError(
                f"groups[{pos}]['X'] must have encoding-version '0.1.0', "
                f"got {x_version!r}"
            )

    output_group.attrs.update({"encoding-type": "anndata", "encoding-version": "0.1.0"})

    # Write var names
    _write_index_only_dataframe(output_group, "var", var_names_key, var_names)

    # Write obs names; the index key name is taken from the first group.
    obs_names_key = first["obs"].attrs["_index"]
    obs_names = np.concatenate([_df_index(group["obs"]) for group in groups])
    _write_index_only_dataframe(output_group, "obs", obs_names_key, obs_names)

    # Write X: seed the output with the first matrix, then append the others
    # to the on-disk dataset one at a time.
    write_elem(output_group, "X", SparseDataset(first["X"]))
    X = SparseDataset(output_group["X"])
    for group in groups[1:]:
        X.append(SparseDataset(group["X"]))
from scipy import sparse
import pandas as pd
import numpy as np
import anndata as ad
from anndata.experimental import read_elem, write_elem
from anndata.tests.helpers import assert_equal
import zarr
def test_concatenate_ondisk():
    """Check ``concatenate_anndatas`` against in-memory ``ad.concat``.

    Builds three AnnData objects with random CSR ``X`` matrices, shared var
    names and consecutive ``cell_<k>`` obs names, writes each to its own zarr
    group, concatenates the groups on disk, and asserts that reading the
    output back equals ``ad.concat`` of the in-memory objects.
    """
    n_vars = 20
    n_adatas = 3
    # Seeded generator so a failure can be reproduced exactly.
    rng = np.random.default_rng(0)
    var_names = pd.Index([f"gene_{i}" for i in range(n_vars)], name="genes")

    adatas = []
    obs_offset = 0
    for _ in range(n_adatas):
        # Inclusive range [30, 50], as with the deprecated
        # ``np.random.random_integers(30, 50)``.
        n_obs = int(rng.integers(30, 50, endpoint=True))
        obs_names = pd.Index(
            [f"cell_{j}" for j in range(obs_offset, obs_offset + n_obs)],
            name="cells",
        )
        adata = ad.AnnData(
            X=sparse.random(
                n_obs, n_vars, density=0.2, format="csr", random_state=rng
            ),
            obs=pd.DataFrame(index=obs_names),
            var=pd.DataFrame(index=var_names),
        )
        obs_offset += n_obs
        adatas.append(adata)

    groups = []
    for adata in adatas:
        group = zarr.group()
        write_elem(group, "/", adata)
        groups.append(group)

    output_group = zarr.group()
    concatenate_anndatas(groups, output_group)

    result = read_elem(output_group)
    expected = ad.concat(adatas)
    assert_equal(expected, result)
# Module-level call with no ``__main__`` guard: the check executes whenever
# this file is run or imported.
test_concatenate_ondisk()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment