-
-
Save cqgd/f63916fa4ba2c8a0b060344302592335 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
library(data.table) | |
library(ggplot2) | |
#Load sample QC data | |
sqc = fread("ukbb/qc/ukb_sqc_v2.txt") | |
#Check PCs | |
subset = sample(1:nrow(sqc),size=1e3) | |
ggplot(sqc[subset]) + | |
geom_point(aes(x=PC3,y=PC4, col=factor(in.white.British.ancestry.subset))) | |
#Filter PC outliers | |
PCs = paste0("PC",1:6) | |
is_outlier = function(x,sds=7){ | |
abs(mean(x)-x) >= sds*sd(x) | |
} | |
out = sqc[,lapply(.SD,is_outlier),.SDcols=PCs] | |
sqc[,PCoutlier := rowSums(out)>=1] | |
#Prepare link between sample QC and phenotype data, add eid's | |
link_path = c("ukbb/links/ukb1062_cal_chr1_v2_s488366.fam") | |
link = fread(link_path) | |
nrow(link)==nrow(sqc) | |
sqc[,eid:=link[,1]] | |
#Add phenotype data (includes ethnicity) | |
pheno = fread("ukbb/pheno/9343/ukb9343.csv") | |
included_ethnicities = c(1,1001,1002) #See https://biobank.ctsu.ox.ac.uk/crystal/coding.cgi?id=1001 | |
pheno[,`21000-0.0` %in% included_ethnicities] | |
sqcpheno = merge(sqc, pheno[,.(eid,`21000-0.0`)], by="eid", all.x=TRUE, all.y=FALSE) | |
#Filter by conditions listed at https://github.com/Nealelab/UK_Biobank_GWAS#imputed-v3-sample-qc | |
conditions = list(all = expression(rep(TRUE,.N)), | |
used.in.pca = expression(used.in.pca.calculation == 1), | |
sex.aneuploidy = expression(putative.sex.chromosome.aneuploidy == 0), | |
pc.outlier = expression(PCoutlier == FALSE), | |
ethnicity.included = expression(`21000-0.0` %in% included_ethnicities), | |
consent = expression(eid>0)) | |
condition_count = function(data,conditions){ | |
pass_alone = numeric(length(conditions)) | |
pass_cumulative = numeric(length(conditions)) | |
cumulative = TRUE | |
for(i in seq_along(conditions)){ | |
alone = data[,eval(conditions[[i]])] | |
pass_alone[i] = sum(alone) | |
cumulative = cumulative & alone | |
pass_cumulative[i] = sum(cumulative) | |
} | |
return(data.table(filter=names(conditions), pass_alone, pass_cumulative)) | |
} | |
qc_overview = condition_count(sqcpheno, conditions) | |
print(qc_overview) | |
# filter pass_alone pass_cumulative | |
# 1: all 488377 488377 | |
# 2: used.in.pca 407219 407219 | |
# 3: sex.aneuploidy 487725 406825 | |
# 4: pc.outlier 477101 396439 | |
# 5: ethnicity.included 444409 364965 | |
# 6: consent 488366 364965 | |
#Differs from "QCed sample count: 361194 samples" |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment