Skip to content

Instantly share code, notes, and snippets.

@cqgd
Last active August 29, 2018 17:28
Show Gist options
  • Save cqgd/f63916fa4ba2c8a0b060344302592335 to your computer and use it in GitHub Desktop.
Save cqgd/f63916fa4ba2c8a0b060344302592335 to your computer and use it in GitHub Desktop.
library(data.table)
library(ggplot2)
#Load sample QC data
sqc = fread("ukbb/qc/ukb_sqc_v2.txt")
#Check PCs
subset = sample(1:nrow(sqc),size=1e3)
ggplot(sqc[subset]) +
geom_point(aes(x=PC3,y=PC4, col=factor(in.white.British.ancestry.subset)))
#Filter PC outliers
PCs = paste0("PC",1:6)
is_outlier = function(x,sds=7){
abs(mean(x)-x) >= sds*sd(x)
}
out = sqc[,lapply(.SD,is_outlier),.SDcols=PCs]
sqc[,PCoutlier := rowSums(out)>=1]
#Prepare link between sample QC and phenotype data, add eid's
link_path = c("ukbb/links/ukb1062_cal_chr1_v2_s488366.fam")
link = fread(link_path)
nrow(link)==nrow(sqc)
sqc[,eid:=link[,1]]
#Add phenotype data (includes ethnicity)
pheno = fread("ukbb/pheno/9343/ukb9343.csv")
included_ethnicities = c(1,1001,1002) #See https://biobank.ctsu.ox.ac.uk/crystal/coding.cgi?id=1001
pheno[,`21000-0.0` %in% included_ethnicities]
sqcpheno = merge(sqc, pheno[,.(eid,`21000-0.0`)], by="eid", all.x=TRUE, all.y=FALSE)
#Filter by conditions listed at https://github.com/Nealelab/UK_Biobank_GWAS#imputed-v3-sample-qc
conditions = list(all = expression(rep(TRUE,.N)),
used.in.pca = expression(used.in.pca.calculation == 1),
sex.aneuploidy = expression(putative.sex.chromosome.aneuploidy == 0),
pc.outlier = expression(PCoutlier == FALSE),
ethnicity.included = expression(`21000-0.0` %in% included_ethnicities),
consent = expression(eid>0))
condition_count = function(data,conditions){
pass_alone = numeric(length(conditions))
pass_cumulative = numeric(length(conditions))
cumulative = TRUE
for(i in seq_along(conditions)){
alone = data[,eval(conditions[[i]])]
pass_alone[i] = sum(alone)
cumulative = cumulative & alone
pass_cumulative[i] = sum(cumulative)
}
return(data.table(filter=names(conditions), pass_alone, pass_cumulative))
}
qc_overview = condition_count(sqcpheno, conditions)
print(qc_overview)
# filter pass_alone pass_cumulative
# 1: all 488377 488377
# 2: used.in.pca 407219 407219
# 3: sex.aneuploidy 487725 406825
# 4: pc.outlier 477101 396439
# 5: ethnicity.included 444409 364965
# 6: consent 488366 364965
#Differs from "QCed sample count: 361194 samples"
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment