Skip to content

Instantly share code, notes, and snippets.

@quasiben
Last active April 6, 2024 15:52
Show Gist options
  • Save quasiben/f7f3099b3b250101d15dd4b3a2fda26e to your computer and use it in GitHub Desktop.
import cupy
import numpy as np
import xarray as xr
import dask.array as da
from dask.array import stats
import fsspec
# --- Simulation parameters ---
n = 10000   # Number of variants (i.e. genomic locations)
m = 100000  # Number of individuals (i.e. people)
c = 3       # Number of covariates (i.e. confounders)
# Representative settings for single (small) UK Biobank chromosome:
# n, m, c = 141910, 365941, 25

path = f"file://sim_ds_{n}_{m}_{c}.zarr"
fs = fsspec.filesystem('file')
if not fs.exists(path):
    # Simulate genotype-like data with a known sparse signal:
    # BL is all zeros except the first entry, so only the first variant
    # contributes to the outcome Y (plus covariate effects and small noise).
    rs = da.random.RandomState(0)
    XL, BL = rs.randint(0, 128, size=(n, m), chunks=(5216, 5792)), da.array([1] + [0] * (m - 1))
    XC, BC = rs.normal(size=(m, c)), rs.normal(size=(c,))
    Y = (XL * BL).sum(axis=0) + XC @ BC + rs.normal(scale=.001, size=m)
    ds = xr.Dataset(dict(
        # This is a proxy for discretized allele dosages (between 0 and 2)
        XL=(('variants', 'samples'), (2 * XL / 127).astype('f2')),
        # This value represents covariates for samples, e.g. age, sex, ancestry, etc.
        XC=(('samples', 'covariates'), XC.astype('f4')),
        # This is the outcome on which all variant data will be regressed separately
        Y=(('samples', 'outcomes'), Y[:, np.newaxis].astype('f4')),
    ))
    print(f'Saving simulated data to {path}')
    ds.to_zarr(fsspec.get_mapper(path), mode='w', consolidated=True)
# Re-open from disk so downstream work always reads the persisted dataset
ds = xr.open_zarr(fsspec.get_mapper(path), consolidated=True)
def gwas(XL, XC, Y):
    """Run per-variant OLS regressions of ``Y`` on each column of ``XL``.

    Parameters
    ----------
    XL : dask.array (samples, variants), cupy-backed
        Loop covariates (variant dosages); each column is regressed separately.
    XC : dask.array (samples, covariates), cupy-backed
        Core covariates shared by all regressions (an intercept is added here).
    Y : dask.array (samples, outcomes), cupy-backed
        Outcomes to regress on each variant.

    Returns
    -------
    xr.Dataset with ``beta`` and ``pval`` arrays of shape (variants, outcomes),
    converted back to host (numpy) blocks.
    """
    # Add intercept column; created from a cupy array so it lives on-GPU
    # like the other inputs
    ones = da.ones_like(cupy.ones((XC.shape[0], 1)))
    XC = da.concatenate([ones, XC], axis=1)
    # Rechunk along short axes so lstsq/matmul see them as single chunks
    XC = XC.rechunk((None, -1))
    Y = Y.rechunk((None, -1))
    dof = Y.shape[0] - XC.shape[1] - 1
    # Apply orthogonal projection to eliminate core covariates from both
    # the loop covariates and the outcomes (Frisch-Waugh-Lovell)
    XLP = XL - XC @ da.linalg.lstsq(XC, XL)[0]
    YP = Y - XC @ da.linalg.lstsq(XC, Y)[0]
    # Estimate coefficients for each loop covariate
    XLPS = (XLP ** 2).sum(axis=0, keepdims=True).T
    B = (XLP.T @ YP) / XLPS
    # Compute residuals for each loop covariate and outcome separately
    YR = YP[:, np.newaxis, :] - XLP[..., np.newaxis] * B[np.newaxis, ...]
    RSS = (YR ** 2).sum(axis=0)
    # Get t-statistics for coefficient estimates and match to p-values.
    # T is moved to host first because the survival function runs on CPU.
    T = B / np.sqrt(RSS / dof / XLPS)
    T = T.map_blocks(cupy.asnumpy)
    P = da.map_blocks(
        lambda t: 2 * stats.distributions.t.sf(np.abs(t), dof), T, dtype="float64"
    )
    B = B.map_blocks(cupy.asnumpy)
    return xr.Dataset(dict(
        beta=(('variants', 'outcomes'), B),
        pval=(('variants', 'outcomes'), P)
    ))
# Define the GWAS regressions (lazy; nothing is computed yet)
dsr = gwas(
    # Note: This (the largest) array needs to be rechunked due to scalability
    # issues with da.matmul, specifically https://github.com/dask/dask/pull/6924.
    # See here for more details:
    # https://github.com/pystatgen/sgkit/issues/390#issuecomment-730660134
    ds.XL.data.rechunk((652, 5792)).T.astype('f4').map_blocks(cupy.asarray),
    ds.XC.data.map_blocks(cupy.asarray),
    ds.Y.data.map_blocks(cupy.asarray)
)
# Compute and save betas/p-values
output_path = f"file://sim_res-optimization-gpu_{n}_{m}_{c}.zarr"
# NOTE(review): the save below is commented out, so the print statement is
# misleading — nothing is written until this line is re-enabled.
#dsr.to_zarr(fsspec.get_mapper(output_path), mode='w', consolidated=True)
print(f'Results saved to {output_path}')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment