Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
Rice3K analysis 2: some specific regions are under selective pressure
#standardSQL
--
-- The following query computes the Hardy-Weinberg equilibrium for variants.
--
WITH variants AS (
SELECT reference_name, start_position, end_position, reference_bases, alt,
SUM(HOM_REF) AS HOM_REF,
SUM(HOM_ALT) AS HOM_ALT,
SUM(HET) AS HET
FROM (
SELECT
reference_name,
start_position,
end_position,
reference_bases,
alt.alt,
-- Within each variant count the number of HOM_REF/HOM_ALT/HET samples.
(SELECT SUM(CAST((SELECT LOGICAL_AND(gt = 0) FROM UNNEST(call.genotype) gt) AS INT64)) FROM v.call) AS HOM_REF,
(SELECT SUM(CAST((SELECT LOGICAL_AND(gt = 1) FROM UNNEST(call.genotype) gt) AS INT64)) FROM v.call) AS HOM_ALT,
(SELECT SUM(CAST((SELECT LOGICAL_OR(gt = 0) AND LOGICAL_OR(gt = 1) FROM UNNEST(call.genotype) gt) AS INT64)) FROM v.call) AS HET
FROM
`bigquery-public-data.genomics_rice.Rice3K_DeepVariant_Os_Nipponbare_Reference_IRGSP_1_0` v
JOIN UNNEST(alternate_bases) AS alt
WHERE
--reference_name = 'Chr9' AND start_position = 4691919 AND
# Only include biallelic snps.
reference_bases IN ('A','C','G','T')
AND ARRAY_LENGTH(alternate_bases) = 1
AND alt.alt IN ('A','C','G','T')
)
AS x
GROUP BY reference_name, start_position, end_position, reference_bases, alt
),
observations AS (
SELECT
reference_name,
start_position,
reference_bases,
alt,
HOM_REF AS OBS_HOM1,
HET AS OBS_HET,
HOM_ALT AS OBS_HOM2,
HOM_REF + HET + HOM_ALT AS SAMPLE_COUNT,
3024 - (HOM_REF + HET + HOM_ALT) AS MISSING_COUNT
FROM variants
),
expectations AS (
SELECT
reference_name,
start_position,
reference_bases,
alt,
OBS_HOM1,
OBS_HET,
OBS_HOM2,
# we use F as an inbreeding coefficient.
# to revert to standard Hardy-Weiberg F=0, but for selfing/inbred rice population we need to use a higher value 0.95
# Expected AA
# p^2
# ((COUNT(AA) + (COUNT(Aa)/2) / SAMPLE_COUNT) ^ 2) * SAMPLE_COUNT
POW((MISSING_COUNT + OBS_HOM1 + OBS_HET/2) / (MISSING_COUNT + SAMPLE_COUNT), 2) * (SAMPLE_COUNT + MISSING_COUNT)
+ 0.95 * ((MISSING_COUNT + OBS_HOM1 + OBS_HET/2) / (MISSING_COUNT + SAMPLE_COUNT)) * ((OBS_HOM2 + OBS_HET/2) / (SAMPLE_COUNT)) * (MISSING_COUNT + SAMPLE_COUNT)
AS E_HOM1,
# Expected Aa
# 2pq
# 2 * (COUNT(AA) + (COUNT(Aa)/2) / SAMPLE_COUNT) * (COUNT(aa) + (COUNT(Aa)/2) / SAMPLE_COUNT) * SAMPLE_COUNT
(1.0 - 0.95) * 2
* ((MISSING_COUNT + OBS_HOM1 + OBS_HET/2) / (MISSING_COUNT + SAMPLE_COUNT))
* ((OBS_HOM2 + (OBS_HET/2)) / SAMPLE_COUNT) * SAMPLE_COUNT AS E_HET,
# Expected aa
# q^2
# (COUNT(aa) + (COUNT(Aa)/2) / SAMPLE_COUNT) ^ 2 * SAMPLE_COUNT
POW((OBS_HOM2 + (OBS_HET/2)) / SAMPLE_COUNT, 2) * SAMPLE_COUNT
+ 0.95
* ((MISSING_COUNT + OBS_HOM1 + OBS_HET/2) / (MISSING_COUNT + SAMPLE_COUNT))
* ((OBS_HOM2 + (OBS_HET/2)) / SAMPLE_COUNT) * SAMPLE_COUNT
AS E_HOM2
FROM observations
WHERE SAMPLE_COUNT > 0
),
stats AS (
SELECT
reference_name,
start_position,
reference_bases,
alt,
OBS_HOM1,
OBS_HET,
OBS_HOM2,
E_HOM1,
E_HET,
E_HOM2,
# Chi Squared Calculation
# SUM(((Observed - Expected)^2) / Expected )
ROUND((POW(OBS_HOM1 - E_HOM1, 2) / E_HOM1)
+ (POW(OBS_HET - E_HET, 2) / E_HET)
+ (POW(OBS_HOM2 - E_HOM2, 2) / E_HOM2), 6)
AS ChiSq,
# Determine if Chi Sq value is significant
# The chi-squared score corresponding to a nominal P-value of 0.05
# for a table with 2 degrees of freedom is 5.991.
IF((POW(OBS_HOM1 - E_HOM1, 2) / E_HOM1)
+ (POW(OBS_HET - E_HET, 2) / E_HET)
+ (POW(OBS_HOM2 - E_HOM2, 2) / E_HOM2)
> 5.991, TRUE, FALSE) AS PVALUE_SIG
FROM expectations
WHERE
E_HOM1 > 0 AND E_HET > 0 AND E_HOM2 > 0
-- Optionally add a clause here to constrain the results.
)
SELECT pt.ref,pt.bin,pt.n AS t,pf.n AS f,pt.n/(pt.n+pf.n) AS pct_t,pf.n/(pt.n+pf.n) AS pct_f
FROM
(SELECT reference_name AS ref, CAST(start_position/10000 AS INT64) AS bin, COUNT(1) AS n
FROM stats
WHERE PVALUE_SIG IS TRUE
GROUP BY ref, bin
) AS pt,
(SELECT reference_name AS ref, CAST(start_position/10000 AS INT64) AS bin, COUNT(1) AS n
FROM stats
WHERE PVALUE_SIG IS FALSE
GROUP BY ref, bin
) AS pf
WHERE pt.ref = pf.ref AND pt.bin = pf.bin
ORDER BY ref, bin
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
You can’t perform that action at this time.