Skip to content

Instantly share code, notes, and snippets.

@johnbowes
Last active January 21, 2016 15:20
Show Gist options
  • Save johnbowes/b254e642b8329a580d18 to your computer and use it in GitHub Desktop.
Save johnbowes/b254e642b8329a580d18 to your computer and use it in GitHub Desktop.
QC process for Illumina Human Core Exome array
#!/bin/bash
module load apps/binapps/anaconda/2.1.0
module load apps/gcc/R/3.1.0
# clean data threshold variables.
SNP_MISS_THRESH=0.02 # threshold for SNP missing rate
SNP_MAF_THRESH=0.01 # threshold for minor allele
SNP_HWE_THRESH=1e-3 # threshold for HWE
SNP_HWE_GROUP='ALL' # define within which group to calculate HWE
IND_MISS_THRESH=0.02 # threhsold for individual mising rate
IND_HET_SD=3 # standard deviations from heterozygosity mean for upper/lower bound
# clean data file variables.
NAME='bstop'
BED_UPDATE="data/${NAME}/source/ps_bstop_batch1_final"
SNP_EXCLUDE="data/${NAME}/clean/exclusions/snp"
IND_EXCLUDE="data/${NAME}/clean/exclusions/ind"
SNP_QC="data/${NAME}/clean/snp/${NAME}"
IND_QC="data/${NAME}/clean/ind/${NAME}"
IBD_QC="data/${NAME}/clean/ibd/${NAME}"
SNP_DUP="data/${NAME}/clean/snp_duplicates/${NAME}"
BED_QC="data/${NAME}/clean/dataset/${NAME}_qc"
# hapmap PCA variables
IND_PCA="data/${NAME}/clean/pca/${NAME}"
####
# 1. Exclude unplaced (CHR=0), Y and mitochondrial SNPs
####
gawk '{if($1==0 || $1==24 || $1==26) print $2}' ${BED_UPDATE}.bim > ${SNP_EXCLUDE}/fail_snp_non-autosomal.list
# concatenate lists
cat ${SNP_EXCLUDE}/*.list | sort -u > ${SNP_EXCLUDE}.exclusions
####
# 2. Perform SNP QC.
####
# create jobs list.
JOBS="--bfile ${BED_UPDATE} --exclude ${SNP_EXCLUDE}.exclusions --hardy --out ${SNP_QC} --silent\n
--bfile ${BED_UPDATE} --exclude ${SNP_EXCLUDE}.exclusions --freq --out ${SNP_QC} --silent\n
--bfile ${BED_UPDATE} --exclude ${SNP_EXCLUDE}.exclusions --missing --out ${SNP_QC} --silent"
# run jobs in parallel.
echo -e $JOBS | xargs -L 1 -P 3 ./bin/plink2
# create exclusion lists
sed '1 d' ${SNP_QC}.lmiss | gawk -v threshold=${SNP_MISS_THRESH} '$5 > threshold {print $2}' > ${SNP_EXCLUDE}/fail_snp_call_rate.list
sed '1 d' ${SNP_QC}.frq | gawk -v threshold=${SNP_MAF_THRESH} '$5 < threshold {print $2}' > ${SNP_EXCLUDE}/fail_snp_freq.list
sed '1 d' ${SNP_QC}.hwe | grep ${SNP_HWE_GROUP} | gawk -v threshold=${SNP_HWE_THRESH} '$9 < threshold {print $2}' > ${SNP_EXCLUDE}/fail_snp_hwe.list
# concatenate lists
cat ${SNP_EXCLUDE}/*.list | sort -u > ${SNP_EXCLUDE}.exclusions
###
# 3. SNP duplicates based on chromosome and base position
###
./bin/plink2 \
--bfile ${BED_UPDATE} \
--exclude ${SNP_EXCLUDE}.exclusions \
--missing \
--out $SNP_DUP
python data/${NAME}/clean/scripts/positional_duplicates.py ${SNP_DUP}.lmiss ${BED_UPDATE}.bim ${SNP_EXCLUDE}/fail_snp_duplicates.list
# concatenate lists
cat ${SNP_EXCLUDE}/*.list | sort -u > ${SNP_EXCLUDE}.exclusions
####
# 4. Perform sample call rate and heterozygosity QC.
####
# create jobs list.
JOBS="--bfile ${BED_UPDATE} --exclude ${SNP_EXCLUDE}.exclusions --missing --out ${IND_QC} --silent --noweb\n
--bfile ${BED_UPDATE} --exclude ${SNP_EXCLUDE}.exclusions --het --out ${IND_QC} --silent --noweb"
# run jobs in parallel.
echo -e $JOBS | xargs -L 1 -P 2 ./bin/plink2
# run .r to create summary table and plots.
Rscript data/${NAME}/clean/scripts/clean_data.R ${IND_QC} ${IND_EXCLUDE} ${IND_MISS_THRESH} ${IND_HET_SD} &> data/${NAME}/clean/ind/sample_quality.log
# concatenate lists.
cat ${IND_EXCLUDE}/*.list | sort -u > ${IND_EXCLUDE}.exclusions
####
# 5. IBD.
####
# generate clean dataset
./bin/plink2 \
--bfile ${BED_UPDATE} \
--remove ${IND_EXCLUDE}.exclusions \
--exclude ${SNP_EXCLUDE}.exclusions \
--make-bed \
--out ${IBD_QC}
./bin/king_1.9 -b ${IBD_QC}.bed --bysample --prefix ${IBD_QC}
./bin/king_1.9 -b ${IBD_QC}.bed --kinship --related --degree 2 --prefix ${IBD_QC}
python data/${NAME}/clean/scripts/king.py --prefix ${IBD_QC} --out ${IND_EXCLUDE}
# concatenate sample qc lists
cat ${IND_EXCLUDE}/*.list | sort -u > ${IND_EXCLUDE}.exclusions
####
# 6. PCA using SNPweights.
####
# convert to EIGENSTRAT format
cat > ${IND_PCA}.par << EOL
genotypename: ${IND_PCA}.bed
snpname: ${IND_PCA}.bim
indivname: ${IND_PCA}.fam
outputformat: EIGENSTRAT
genotypeoutname: ${IND_PCA}.geno
snpoutname: ${IND_PCA}.snp
indivoutname: ${IND_PCA}.ind
familynames: NO
EOL
./bin/plink2 --bfile ${BED_UPDATE} --remove ${IND_EXCLUDE}.exclusions --exclude ${SNP_EXCLUDE}.exclusions --make-bed --out ${IND_PCA}
bin/convertf -p ${IND_PCA}.par
# run SNPweights
cat > ${IND_PCA}.sw.par << EOL
geno: ${IND_PCA}.geno
snp: ${IND_PCA}.snp
ind: ${IND_PCA}.ind
snpwt: bin/snpwt.CO
predpcoutput: ${IND_PCA}.predpc
EOL
bin/inferanc -p ${IND_PCA}.sw.par
sed -i '1iID STATUS SNPs PC1 PC2 EUR AFR ASN' ${IND_PCA}.predpc
#Rscript clean_data/scripts/snpweights.R ${IND_PCA} ${IND_EXCLUDE}
# create list of exclusions
gawk '{if($4 < 0.0085 && $5 < 0.075) print $1}' ${IND_PCA}.predpc > ${IND_PCA}.fail
grep -w -f ${IND_PCA}.fail ${IND_PCA}.fam | gawk '{print $1,$2}' > ${IND_EXCLUDE}/fail_pca.list
# concatenate sample qc lists
cat ${IND_EXCLUDE}/*.list | sort -u > ${IND_EXCLUDE}.exclusions
####
# 7. Sex-check
####
./bin/plink2 \
--bfile ${BED_UPDATE} \
--remove ${IND_EXCLUDE}.exclusions \
--exclude ${SNP_EXCLUDE}.exclusions \
--check-sex \
--out $IND_QC
# create list of exclusions
gawk '{if($5 ~/PROBLEM/ && $3 > 0) print $1,$2}' ${IND_QC}.sexcheck > ${IND_EXCLUDE}/fail_check_sex.list
# concatenate sample qc lists
cat ${IND_EXCLUDE}/*.list | sort -u > ${IND_EXCLUDE}.exclusions
####
# 8. Create QC'd dataset
####
./bin/plink2 \
--bfile ${BED_UPDATE} \
--remove ${IND_EXCLUDE}.exclusions \
--exclude ${SNP_EXCLUDE}.exclusions \
--make-bed \
--out ${BED_QC}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment