Skip to content

Instantly share code, notes, and snippets.

@timothymillar
Last active April 10, 2024 09:12
Show Gist options
  • Save timothymillar/2bb2a4521daf948d76808d60eccb390a to your computer and use it in GitHub Desktop.
Save timothymillar/2bb2a4521daf948d76808d60eccb390a to your computer and use it in GitHub Desktop.
Example of manually calculating kinship in chunks for a large pedigree
# %%
# # Rough chunked kinship with sgkit
#
# This is a rough example of calculating kinship in chunks with sgkit.
# It is not well optimised and can be very slow in practice.
# It will also be less effective in deep pedigrees with many genertions
# because calculating each chunk of the kinship matrix will require
# calculating the kinship matrix of the sample in that chun *and* their
# ancestors.
#
# %%
import pandas as pd
import numpy as np
import xarray as xr
import sgkit as sg
# %%
# Simulate a pedigree with 10 equal sized generations.
# This is a fairly shallow pedigree which makes it easier to work with.
# Deeper pedigrees are possible, but may require smaller chunk sizes.
np.random.seed(0)
n_generations = 10
n_samples_per_generation = 1000
n_samples = n_generations * n_samples_per_generation
parent_id = np.full((n_samples, 2), -1, int)
for gen in range(1, n_generations):
offset = gen * n_samples_per_generation
p = np.random.randint(n_samples_per_generation, size=(n_samples_per_generation, 2))
p += (gen - 1) * n_samples_per_generation
parent_id[offset: offset + n_samples_per_generation] = p
# %%
# Create and sgkit style pedigree dataset and assign coordinates.
parent_id = np.where(parent_id < 0, '.', parent_id.astype('U'))
sample_id = np.arange(n_samples).astype('U')
ped = xr.Dataset()
ped['sample_id'] = 'samples', sample_id
ped['parent_id'] = ['samples', 'parents'], parent_id
ped = ped.assign_coords(samples=sample_id)
# %%
# Loop through sub-pedigrees.
# In each iteration we combine two subsets of sample ids.
# Next we extract the combined subset of samples and their ancestors.
# We can then calculate kinship for this sub-pedigree.
groups = np.split(sample_id, 10) # use more (smaller) groups to minimise the memory requirement.
chunks = []
for i in range(len(groups)):
for j in range(i):
# combine two group of target samples
targets = np.concatenate([groups[j], groups[i]])
# pull out the sub-pedigree of these samples and their ancestors
sub = sg.pedigree_sel(ped, samples=targets, ancestor_depth=n_samples)
# calculate kinship for the sub-pedigree
sub = sg.pedigree_kinship(sub).assign_coords(
samples_0=sub.samples.values,
samples_1=sub.samples.values,
)
# (optionally) slice out the kinship for the target samples without their ancestors
targets_kinship = sub.stat_pedigree_kinship.loc[targets, targets]
# append the sub-pedigree kinship to our list of chunks
chunks.append(targets_kinship)
# %%
# The resulting list of chunks can be computed one at a time.
# Note that some pairs of samples will be present in multiple chunks,
# so you may need to deduplicate them in practice.
chunks[0]
chunks[0].compute()
# %%
# We can identify pairs of samples with high kinship in each chunk
chunk = chunks[-1].compute()
idx = chunk >= 0.25
x, y = np.where(idx)
pairs = pd.DataFrame({
'sample_0': chunk.samples_0.values[x],
'sample_1': chunk.samples_1.values[y],
'kinship': chunk.values[x, y]
})
# %%
# we can inspect the pedigree for a pair of related samples
pair = pairs.loc[2].values[0:2]
sg.display_pedigree(sg.pedigree_sel(ped, samples=pair, ancestor_depth=n_samples))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment