Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
Rice 3K analysis 1: genetic variants are not uniformly distributed
-- Rice 3K analysis: per-sample z-score of variant-call counts in 1 Mb bins.
--
-- Step 1 (ind):    count calls per sample / reference sequence / 1 Mb bin.
-- Step 2 (pop):    mean and standard deviation of those counts per bin,
--                  taken across samples.
-- Step 3 (zscore): z = (sample count - bin mean) / bin standard deviation.
--
-- Output columns, in order: sample, ind_n, z, ref, bin, pop_mu, pop_sigma.
-- z is NULL when a bin's standard deviation is NULL or 0 (for example a bin
-- seen in a single sample, or a bin where every sample has the same count).
WITH
ind AS (
    -- One row per sample / reference / bin. Each (call, alternate allele)
    -- pair contributes 1 to n, so a call on a record with several
    -- non-'<*>' alternates is counted once per alternate.
    SELECT
        call.name AS sample,
        v.reference_name AS ref,
        -- Integer division truncates, so bin k covers [k Mb, (k + 1) Mb).
        -- CAST(start_position / 1000000 AS INT64) would round to nearest
        -- instead and shift every bin edge by 0.5 Mb.
        DIV(v.start_position, 1000000) AS bin,
        COUNT(call.name) AS n
    FROM `bigquery-public-data.genomics_rice.Rice3K_DeepVariant_Os_Nipponbare_Reference_IRGSP_1_0` AS v
    CROSS JOIN UNNEST(v.call) AS call
    CROSS JOIN UNNEST(v.alternate_bases) AS alt
    -- NOTE(review): '<*>' is presumably the gVCF "no specific alternate"
    -- placeholder emitted for non-variant records; confirm against the
    -- table's documentation.
    WHERE alt.alt != '<*>'
    GROUP BY sample, ref, bin
),
pop AS (
    -- Per-bin statistics over all samples that have a row in ind.
    -- Samples with no qualifying call in a bin have no row there, so they
    -- are not included as zeros in the mean or standard deviation.
    SELECT
        ref,
        bin,
        AVG(n) AS pop_mu,
        STDDEV(n) AS pop_sigma
    FROM ind
    GROUP BY ref, bin
),
zscore AS (
    -- Z-score of each sample's bin count against the population of that bin.
    SELECT
        ind.sample,
        ind.n AS ind_n,
        -- SAFE_DIVIDE yields NULL instead of failing the whole query when
        -- pop_sigma is 0.
        SAFE_DIVIDE(ind.n - pop.pop_mu, pop.pop_sigma) AS z,
        pop.ref,
        pop.bin,
        pop.pop_mu,
        pop.pop_sigma
    FROM pop
    INNER JOIN ind
        ON ind.ref = pop.ref
        AND ind.bin = pop.bin
)
-- To surface the strongest outliers first, order by ABS(z) DESC instead.
SELECT
    sample,
    ind_n,
    z,
    ref,
    bin,
    pop_mu,
    pop_sigma
FROM zscore
ORDER BY sample, ref, bin
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
You can’t perform that action at this time.