timothymillar/rough_chunked_kinship.py

## rough_chunked_kinship.py
# %%
# # Rough chunked kinship with sgkit
#
# This is a rough example of calculating kinship in chunks with sgkit.
# It is not well optimised and can be very slow in practice.
# It will also be less effective in deep pedigrees with many genertions
# because calculating each chunk of the kinship matrix will require
# calculating the kinship matrix of the sample in that chun *and* their
# ancestors.
#

# %%
import pandas as pd
import numpy as np
import xarray as xr
import sgkit as sg


# %%
# Simulate a pedigree with 10 equal sized generations.
# This is a fairly shallow pedigree which makes it easier to work with.
# Deeper pedigrees are possible, but may require smaller chunk sizes.
np.random.seed(0)
n_generations = 10
n_samples_per_generation = 1000
n_samples = n_generations * n_samples_per_generation
parent_id = np.full((n_samples, 2), -1, int)
for gen in range(1, n_generations):
    offset = gen * n_samples_per_generation
    p = np.random.randint(n_samples_per_generation, size=(n_samples_per_generation, 2))
    p += (gen - 1) * n_samples_per_generation
    parent_id[offset: offset + n_samples_per_generation] = p


# %%
# Create and sgkit style pedigree dataset and assign coordinates.
parent_id = np.where(parent_id < 0, '.', parent_id.astype('U'))
sample_id = np.arange(n_samples).astype('U')
ped = xr.Dataset()
ped['sample_id'] = 'samples', sample_id
ped['parent_id'] = ['samples', 'parents'], parent_id
ped = ped.assign_coords(samples=sample_id)


# %%
# Loop through sub-pedigrees.
# In each iteration we combine two subsets of sample ids.
# Next we extract the combined subset of samples and their ancestors.
# We can then calculate kinship for this sub-pedigree.
groups = np.split(sample_id, 10)  # use more (smaller) groups to minimise the memory requirement.
chunks = []
for i in range(len(groups)):
    for j in range(i):
        # combine two group of target samples
        targets = np.concatenate([groups[j], groups[i]])
        # pull out the sub-pedigree of these samples and their ancestors
        sub = sg.pedigree_sel(ped, samples=targets, ancestor_depth=n_samples)
        # calculate kinship for the sub-pedigree
        sub = sg.pedigree_kinship(sub).assign_coords(
            samples_0=sub.samples.values,
            samples_1=sub.samples.values,
        )
        # (optionally) slice out the kinship for the target samples without their ancestors
        targets_kinship = sub.stat_pedigree_kinship.loc[targets, targets]
        # append the sub-pedigree kinship to our list of chunks
        chunks.append(targets_kinship)


# %%
# The resulting list of chunks can be computed one at a time.
# Note that some pairs of samples will be present in multiple chunks,
# so you may need to deduplicate them in practice.
chunks[0]
chunks[0].compute()


# %%
# We can identify pairs of samples with high kinship in each chunk
chunk = chunks[-1].compute()
idx = chunk >= 0.25
x, y = np.where(idx)
pairs = pd.DataFrame({
    'sample_0': chunk.samples_0.values[x],
    'sample_1': chunk.samples_1.values[y],
    'kinship': chunk.values[x, y]
})


# %%
# we can inspect the pedigree for a pair of related samples
pair = pairs.loc[2].values[0:2]
sg.display_pedigree(sg.pedigree_sel(ped, samples=pair, ancestor_depth=n_samples))
	# %%
	# # Rough chunked kinship with sgkit
	#
	# This is a rough example of calculating kinship in chunks with sgkit.
	# It is not well optimised and can be very slow in practice.
	# It will also be less effective in deep pedigrees with many genertions
	# because calculating each chunk of the kinship matrix will require
	# calculating the kinship matrix of the sample in that chun and their
	# ancestors.
	#

	# %%
	import pandas as pd
	import numpy as np
	import xarray as xr
	import sgkit as sg


	# %%
	# Simulate a pedigree with 10 equal sized generations.
	# This is a fairly shallow pedigree which makes it easier to work with.
	# Deeper pedigrees are possible, but may require smaller chunk sizes.
	np.random.seed(0)
	n_generations = 10
	n_samples_per_generation = 1000
	n_samples = n_generations * n_samples_per_generation
	parent_id = np.full((n_samples, 2), -1, int)
	for gen in range(1, n_generations):
	offset = gen * n_samples_per_generation
	p = np.random.randint(n_samples_per_generation, size=(n_samples_per_generation, 2))
	p += (gen - 1) * n_samples_per_generation
	parent_id[offset: offset + n_samples_per_generation] = p


	# %%
	# Create and sgkit style pedigree dataset and assign coordinates.
	parent_id = np.where(parent_id < 0, '.', parent_id.astype('U'))
	sample_id = np.arange(n_samples).astype('U')
	ped = xr.Dataset()
	ped['sample_id'] = 'samples', sample_id
	ped['parent_id'] = ['samples', 'parents'], parent_id
	ped = ped.assign_coords(samples=sample_id)


	# %%
	# Loop through sub-pedigrees.
	# In each iteration we combine two subsets of sample ids.
	# Next we extract the combined subset of samples and their ancestors.
	# We can then calculate kinship for this sub-pedigree.
	groups = np.split(sample_id, 10) # use more (smaller) groups to minimise the memory requirement.
	chunks = []
	for i in range(len(groups)):
	for j in range(i):
	# combine two group of target samples
	targets = np.concatenate([groups[j], groups[i]])
	# pull out the sub-pedigree of these samples and their ancestors
	sub = sg.pedigree_sel(ped, samples=targets, ancestor_depth=n_samples)
	# calculate kinship for the sub-pedigree
	sub = sg.pedigree_kinship(sub).assign_coords(
	samples_0=sub.samples.values,
	samples_1=sub.samples.values,
	)
	# (optionally) slice out the kinship for the target samples without their ancestors
	targets_kinship = sub.stat_pedigree_kinship.loc[targets, targets]
	# append the sub-pedigree kinship to our list of chunks
	chunks.append(targets_kinship)


	# %%
	# The resulting list of chunks can be computed one at a time.
	# Note that some pairs of samples will be present in multiple chunks,
	# so you may need to deduplicate them in practice.
	chunks[0]
	chunks[0].compute()


	# %%
	# We can identify pairs of samples with high kinship in each chunk
	chunk = chunks[-1].compute()
	idx = chunk >= 0.25
	x, y = np.where(idx)
	pairs = pd.DataFrame({
	'sample_0': chunk.samples_0.values[x],
	'sample_1': chunk.samples_1.values[y],
	'kinship': chunk.values[x, y]
	})


	# %%
	# we can inspect the pedigree for a pair of related samples
	pair = pairs.loc[2].values[0:2]
	sg.display_pedigree(sg.pedigree_sel(ped, samples=pair, ancestor_depth=n_samples))