Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save allenday/c8d52f06b68ef6c291d6309c2469b61b to your computer and use it in GitHub Desktop.
Save allenday/c8d52f06b68ef6c291d6309c2469b61b to your computer and use it in GitHub Desktop.
Rice 3K analysis 1: genetic variants are not uniformly distributed
-- Per-sample variant density along the genome, scored against the population.
--
-- Output: one row per (sample, ref, bin) with the sample's count for that bin,
-- the population mean / standard deviation for the same bin, and the resulting
-- Z-score. Rows are ordered by sample, ref, bin; to surface the strongest
-- outliers first, order by ABS(z) DESC instead.
WITH
ind AS (
    -- Count variant calls for each sample / reference sequence / bin.
    -- Each call is counted once per alternate allele that is not the '<*>'
    -- placeholder, because both repeated fields are flattened below.
    -- NOTE(review): a sample with no qualifying call in a bin produces no row
    -- here, so it does not contribute a zero to the population statistics
    -- computed in `pop` -- confirm that this is intended.
    SELECT
        call.name AS sample,
        reference_name AS ref,
        -- Bin = start_position divided by 1,000,000, rounded DOWN (1 Mb bins,
        -- assuming start_position is expressed in base pairs).
        -- FLOOR is required: a bare CAST(x / 1000000 AS INT64) rounds to the
        -- nearest integer, which shifts every bin by half its width and
        -- leaves bin 0 only half as wide as the others.
        CAST(FLOOR(start_position / 1000000) AS INT64) AS bin,
        COUNT(call.name) AS n
    FROM `bigquery-public-data.genomics_rice.Rice3K_DeepVariant_Os_Nipponbare_Reference_IRGSP_1_0`
    -- Correlated joins: flatten the repeated `call` and `alternate_bases`
    -- fields of each row. The alias `call` deliberately shadows the column.
    CROSS JOIN UNNEST(call) AS call
    CROSS JOIN UNNEST(alternate_bases) AS alt
    WHERE alt.alt != '<*>'
    GROUP BY sample, ref, bin
),
pop AS (
    -- Population statistics for each ref/bin tuple, taken over all samples
    -- that have a row in `ind` for that tuple.
    SELECT
        ref,
        bin,
        AVG(n) AS pop_mu,
        STDDEV(n) AS pop_sigma
    FROM ind
    GROUP BY ref, bin
),
zscore AS (
    -- Z-score of each individual's bin count vs. the population average of
    -- the same bin.
    SELECT
        ind.sample,
        ind.n AS ind_n,
        -- SAFE_DIVIDE yields NULL instead of aborting the whole query with a
        -- division-by-zero error when every sample in a bin has the same
        -- count (pop_sigma = 0). A NULL pop_sigma also yields NULL.
        SAFE_DIVIDE(ind.n - pop.pop_mu, pop.pop_sigma) AS z,
        pop.ref,
        pop.bin,
        pop.pop_mu,
        pop.pop_sigma
    FROM pop
    INNER JOIN ind
        ON ind.ref = pop.ref
        AND ind.bin = pop.bin
)
SELECT
    sample,
    ind_n,
    z,
    ref,
    bin,
    pop_mu,
    pop_sigma
FROM zscore
ORDER BY sample, ref, bin
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment