Skip to content

Instantly share code, notes, and snippets.

@quasiben
Last active April 6, 2024 15:52
Show Gist options
  • Save quasiben/f7f3099b3b250101d15dd4b3a2fda26e to your computer and use it in GitHub Desktop.
import cupy
import numpy as np
import xarray as xr
import dask.array as da
from dask.array import stats
import fsspec
# --- Simulation parameters ---
n = 10000   # Number of variants (i.e. genomic locations)
m = 100000  # Number of individuals (i.e. people)
c = 3       # Number of covariates (i.e. confounders)
# Representative settings for single (small) UK Biobank chromosome:
# n, m, c = 141910, 365941, 25

path = f"file://sim_ds_{n}_{m}_{c}.zarr"
fs = fsspec.filesystem('file')
if not fs.exists(path):
    # Simulate genotype-like data with a known sparse signal:
    # BL is all zeros except the first entry, so only the first variant
    # contributes to the outcome Y (plus covariate effects and small noise).
    rs = da.random.RandomState(0)
    XL, BL = rs.randint(0, 128, size=(n, m), chunks=(5216, 5792)), da.array([1] + [0] * (m - 1))
    XC, BC = rs.normal(size=(m, c)), rs.normal(size=(c,))
    Y = (XL * BL).sum(axis=0) + XC @ BC + rs.normal(scale=.001, size=m)
    ds = xr.Dataset(dict(
        # This is a proxy for discretized allele dosages (between 0 and 2)
        XL=(('variants', 'samples'), (2 * XL / 127).astype('f2')),
        # This value represents covariates for samples, e.g. age, sex, ancestry, etc.
        XC=(('samples', 'covariates'), XC.astype('f4')),
        # This is the outcome on which all variant data will be regressed separately
        Y=(('samples', 'outcomes'), Y[:, np.newaxis].astype('f4')),
    ))
    print(f'Saving simulated data to {path}')
    ds.to_zarr(fsspec.get_mapper(path), mode='w', consolidated=True)
# Re-open from disk so downstream work always reads the persisted dataset
ds = xr.open_zarr(fsspec.get_mapper(path), consolidated=True)
def gwas(XL, XC, Y):
    """Run per-variant OLS regressions of ``Y`` on each column of ``XL``.

    Parameters
    ----------
    XL : dask.array (samples, variants), cupy-backed
        Loop covariates (variant dosages); each column is regressed separately.
    XC : dask.array (samples, covariates), cupy-backed
        Core covariates shared by all regressions (an intercept is added here).
    Y : dask.array (samples, outcomes), cupy-backed
        Outcomes to regress on each variant.

    Returns
    -------
    xr.Dataset with ``beta`` and ``pval`` arrays of shape (variants, outcomes),
    converted back to host (numpy) blocks.
    """
    # Add intercept column; created from a cupy array so it lives on-GPU
    # like the other inputs
    ones = da.ones_like(cupy.ones((XC.shape[0], 1)))
    XC = da.concatenate([ones, XC], axis=1)
    # Rechunk along short axes so lstsq/matmul see them as single chunks
    XC = XC.rechunk((None, -1))
    Y = Y.rechunk((None, -1))
    dof = Y.shape[0] - XC.shape[1] - 1
    # Apply orthogonal projection to eliminate core covariates from both
    # the loop covariates and the outcomes (Frisch-Waugh-Lovell)
    XLP = XL - XC @ da.linalg.lstsq(XC, XL)[0]
    YP = Y - XC @ da.linalg.lstsq(XC, Y)[0]
    # Estimate coefficients for each loop covariate
    XLPS = (XLP ** 2).sum(axis=0, keepdims=True).T
    B = (XLP.T @ YP) / XLPS
    # Compute residuals for each loop covariate and outcome separately
    YR = YP[:, np.newaxis, :] - XLP[..., np.newaxis] * B[np.newaxis, ...]
    RSS = (YR ** 2).sum(axis=0)
    # Get t-statistics for coefficient estimates and match to p-values.
    # T is moved to host first because the survival function runs on CPU.
    T = B / np.sqrt(RSS / dof / XLPS)
    T = T.map_blocks(cupy.asnumpy)
    P = da.map_blocks(
        lambda t: 2 * stats.distributions.t.sf(np.abs(t), dof), T, dtype="float64"
    )
    B = B.map_blocks(cupy.asnumpy)
    return xr.Dataset(dict(
        beta=(('variants', 'outcomes'), B),
        pval=(('variants', 'outcomes'), P)
    ))
# Define the GWAS regressions (lazy; nothing is computed yet)
dsr = gwas(
    # Note: This (the largest) array needs to be rechunked due to scalability
    # issues with da.matmul, specifically https://github.com/dask/dask/pull/6924.
    # See here for more details:
    # https://github.com/pystatgen/sgkit/issues/390#issuecomment-730660134
    ds.XL.data.rechunk((652, 5792)).T.astype('f4').map_blocks(cupy.asarray),
    ds.XC.data.map_blocks(cupy.asarray),
    ds.Y.data.map_blocks(cupy.asarray)
)
# Compute and save betas/p-values
output_path = f"file://sim_res-optimization-gpu_{n}_{m}_{c}.zarr"
# NOTE(review): the save below is commented out, so the print statement is
# misleading — nothing is written until this line is re-enabled.
#dsr.to_zarr(fsspec.get_mapper(output_path), mode='w', consolidated=True)
print(f'Results saved to {output_path}')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment