Last active
April 10, 2024 09:12
-
-
Save timothymillar/2bb2a4521daf948d76808d60eccb390a to your computer and use it in GitHub Desktop.
Example of manually calculating kinship in chunks for a large pedigree
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# %% | |
# # Rough chunked kinship with sgkit | |
# | |
# This is a rough example of calculating kinship in chunks with sgkit. | |
# It is not well optimised and can be very slow in practice. | |
# It will also be less effective in deep pedigrees with many genertions | |
# because calculating each chunk of the kinship matrix will require | |
# calculating the kinship matrix of the sample in that chun *and* their | |
# ancestors. | |
# | |
# %% | |
import pandas as pd | |
import numpy as np | |
import xarray as xr | |
import sgkit as sg | |
# %% | |
# Simulate a pedigree with 10 equal sized generations. | |
# This is a fairly shallow pedigree which makes it easier to work with. | |
# Deeper pedigrees are possible, but may require smaller chunk sizes. | |
np.random.seed(0) | |
n_generations = 10 | |
n_samples_per_generation = 1000 | |
n_samples = n_generations * n_samples_per_generation | |
parent_id = np.full((n_samples, 2), -1, int) | |
for gen in range(1, n_generations): | |
offset = gen * n_samples_per_generation | |
p = np.random.randint(n_samples_per_generation, size=(n_samples_per_generation, 2)) | |
p += (gen - 1) * n_samples_per_generation | |
parent_id[offset: offset + n_samples_per_generation] = p | |
# %% | |
# Create and sgkit style pedigree dataset and assign coordinates. | |
parent_id = np.where(parent_id < 0, '.', parent_id.astype('U')) | |
sample_id = np.arange(n_samples).astype('U') | |
ped = xr.Dataset() | |
ped['sample_id'] = 'samples', sample_id | |
ped['parent_id'] = ['samples', 'parents'], parent_id | |
ped = ped.assign_coords(samples=sample_id) | |
# %% | |
# Loop through sub-pedigrees. | |
# In each iteration we combine two subsets of sample ids. | |
# Next we extract the combined subset of samples and their ancestors. | |
# We can then calculate kinship for this sub-pedigree. | |
groups = np.split(sample_id, 10) # use more (smaller) groups to minimise the memory requirement. | |
chunks = [] | |
for i in range(len(groups)): | |
for j in range(i): | |
# combine two group of target samples | |
targets = np.concatenate([groups[j], groups[i]]) | |
# pull out the sub-pedigree of these samples and their ancestors | |
sub = sg.pedigree_sel(ped, samples=targets, ancestor_depth=n_samples) | |
# calculate kinship for the sub-pedigree | |
sub = sg.pedigree_kinship(sub).assign_coords( | |
samples_0=sub.samples.values, | |
samples_1=sub.samples.values, | |
) | |
# (optionally) slice out the kinship for the target samples without their ancestors | |
targets_kinship = sub.stat_pedigree_kinship.loc[targets, targets] | |
# append the sub-pedigree kinship to our list of chunks | |
chunks.append(targets_kinship) | |
# %% | |
# The resulting list of chunks can be computed one at a time. | |
# Note that some pairs of samples will be present in multiple chunks, | |
# so you may need to deduplicate them in practice. | |
chunks[0] | |
chunks[0].compute() | |
# %% | |
# We can identify pairs of samples with high kinship in each chunk | |
chunk = chunks[-1].compute() | |
idx = chunk >= 0.25 | |
x, y = np.where(idx) | |
pairs = pd.DataFrame({ | |
'sample_0': chunk.samples_0.values[x], | |
'sample_1': chunk.samples_1.values[y], | |
'kinship': chunk.values[x, y] | |
}) | |
# %% | |
# we can inspect the pedigree for a pair of related samples | |
pair = pairs.loc[2].values[0:2] | |
sg.display_pedigree(sg.pedigree_sel(ped, samples=pair, ancestor_depth=n_samples)) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment