Skip to content

Instantly share code, notes, and snippets.

@allenday
Created March 6, 2019 10:52
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save allenday/422c983a13c110985c1a375fb7447016 to your computer and use it in GitHub Desktop.
Save allenday/422c983a13c110985c1a375fb7447016 to your computer and use it in GitHub Desktop.
Rice3K analysis 2: some specific regions are under selective pressure
#standardSQL
--
-- The following query computes the Hardy-Weinberg equilibrium for variants.
--
WITH variants AS (
SELECT reference_name, start_position, end_position, reference_bases, alt,
SUM(HOM_REF) AS HOM_REF,
SUM(HOM_ALT) AS HOM_ALT,
SUM(HET) AS HET
FROM (
SELECT
reference_name,
start_position,
end_position,
reference_bases,
alt.alt,
-- Within each variant count the number of HOM_REF/HOM_ALT/HET samples.
(SELECT SUM(CAST((SELECT LOGICAL_AND(gt = 0) FROM UNNEST(call.genotype) gt) AS INT64)) FROM v.call) AS HOM_REF,
(SELECT SUM(CAST((SELECT LOGICAL_AND(gt = 1) FROM UNNEST(call.genotype) gt) AS INT64)) FROM v.call) AS HOM_ALT,
(SELECT SUM(CAST((SELECT LOGICAL_OR(gt = 0) AND LOGICAL_OR(gt = 1) FROM UNNEST(call.genotype) gt) AS INT64)) FROM v.call) AS HET
FROM
`bigquery-public-data.genomics_rice.Rice3K_DeepVariant_Os_Nipponbare_Reference_IRGSP_1_0` v
JOIN UNNEST(alternate_bases) AS alt
WHERE
--reference_name = 'Chr9' AND start_position = 4691919 AND
# Only include biallelic snps.
reference_bases IN ('A','C','G','T')
AND ARRAY_LENGTH(alternate_bases) = 1
AND alt.alt IN ('A','C','G','T')
)
AS x
GROUP BY reference_name, start_position, end_position, reference_bases, alt
),
observations AS (
SELECT
reference_name,
start_position,
reference_bases,
alt,
HOM_REF AS OBS_HOM1,
HET AS OBS_HET,
HOM_ALT AS OBS_HOM2,
HOM_REF + HET + HOM_ALT AS SAMPLE_COUNT,
3024 - (HOM_REF + HET + HOM_ALT) AS MISSING_COUNT
FROM variants
),
expectations AS (
SELECT
reference_name,
start_position,
reference_bases,
alt,
OBS_HOM1,
OBS_HET,
OBS_HOM2,
# we use F as an inbreeding coefficient.
# to revert to standard Hardy-Weiberg F=0, but for selfing/inbred rice population we need to use a higher value 0.95
# Expected AA
# p^2
# ((COUNT(AA) + (COUNT(Aa)/2) / SAMPLE_COUNT) ^ 2) * SAMPLE_COUNT
POW((MISSING_COUNT + OBS_HOM1 + OBS_HET/2) / (MISSING_COUNT + SAMPLE_COUNT), 2) * (SAMPLE_COUNT + MISSING_COUNT)
+ 0.95 * ((MISSING_COUNT + OBS_HOM1 + OBS_HET/2) / (MISSING_COUNT + SAMPLE_COUNT)) * ((OBS_HOM2 + OBS_HET/2) / (SAMPLE_COUNT)) * (MISSING_COUNT + SAMPLE_COUNT)
AS E_HOM1,
# Expected Aa
# 2pq
# 2 * (COUNT(AA) + (COUNT(Aa)/2) / SAMPLE_COUNT) * (COUNT(aa) + (COUNT(Aa)/2) / SAMPLE_COUNT) * SAMPLE_COUNT
(1.0 - 0.95) * 2
* ((MISSING_COUNT + OBS_HOM1 + OBS_HET/2) / (MISSING_COUNT + SAMPLE_COUNT))
* ((OBS_HOM2 + (OBS_HET/2)) / SAMPLE_COUNT) * SAMPLE_COUNT AS E_HET,
# Expected aa
# q^2
# (COUNT(aa) + (COUNT(Aa)/2) / SAMPLE_COUNT) ^ 2 * SAMPLE_COUNT
POW((OBS_HOM2 + (OBS_HET/2)) / SAMPLE_COUNT, 2) * SAMPLE_COUNT
+ 0.95
* ((MISSING_COUNT + OBS_HOM1 + OBS_HET/2) / (MISSING_COUNT + SAMPLE_COUNT))
* ((OBS_HOM2 + (OBS_HET/2)) / SAMPLE_COUNT) * SAMPLE_COUNT
AS E_HOM2
FROM observations
WHERE SAMPLE_COUNT > 0
),
stats AS (
SELECT
reference_name,
start_position,
reference_bases,
alt,
OBS_HOM1,
OBS_HET,
OBS_HOM2,
E_HOM1,
E_HET,
E_HOM2,
# Chi Squared Calculation
# SUM(((Observed - Expected)^2) / Expected )
ROUND((POW(OBS_HOM1 - E_HOM1, 2) / E_HOM1)
+ (POW(OBS_HET - E_HET, 2) / E_HET)
+ (POW(OBS_HOM2 - E_HOM2, 2) / E_HOM2), 6)
AS ChiSq,
# Determine if Chi Sq value is significant
# The chi-squared score corresponding to a nominal P-value of 0.05
# for a table with 2 degrees of freedom is 5.991.
IF((POW(OBS_HOM1 - E_HOM1, 2) / E_HOM1)
+ (POW(OBS_HET - E_HET, 2) / E_HET)
+ (POW(OBS_HOM2 - E_HOM2, 2) / E_HOM2)
> 5.991, TRUE, FALSE) AS PVALUE_SIG
FROM expectations
WHERE
E_HOM1 > 0 AND E_HET > 0 AND E_HOM2 > 0
-- Optionally add a clause here to constrain the results.
)
SELECT pt.ref,pt.bin,pt.n AS t,pf.n AS f,pt.n/(pt.n+pf.n) AS pct_t,pf.n/(pt.n+pf.n) AS pct_f
FROM
(SELECT reference_name AS ref, CAST(start_position/10000 AS INT64) AS bin, COUNT(1) AS n
FROM stats
WHERE PVALUE_SIG IS TRUE
GROUP BY ref, bin
) AS pt,
(SELECT reference_name AS ref, CAST(start_position/10000 AS INT64) AS bin, COUNT(1) AS n
FROM stats
WHERE PVALUE_SIG IS FALSE
GROUP BY ref, bin
) AS pf
WHERE pt.ref = pf.ref AND pt.bin = pf.bin
ORDER BY ref, bin
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment