Last active
January 21, 2016 15:20
-
-
Save johnbowes/b254e642b8329a580d18 to your computer and use it in GitHub Desktop.
QC process for Illumina Human Core Exome array
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash | |
module load apps/binapps/anaconda/2.1.0 | |
module load apps/gcc/R/3.1.0 | |
# clean data threshold variables. | |
SNP_MISS_THRESH=0.02 # threshold for SNP missing rate | |
SNP_MAF_THRESH=0.01 # threshold for minor allele | |
SNP_HWE_THRESH=1e-3 # threshold for HWE | |
SNP_HWE_GROUP='ALL' # define within which group to calculate HWE | |
IND_MISS_THRESH=0.02 # threhsold for individual mising rate | |
IND_HET_SD=3 # standard deviations from heterozygosity mean for upper/lower bound | |
# clean data file variables. | |
NAME='bstop' | |
BED_UPDATE="data/${NAME}/source/ps_bstop_batch1_final" | |
SNP_EXCLUDE="data/${NAME}/clean/exclusions/snp" | |
IND_EXCLUDE="data/${NAME}/clean/exclusions/ind" | |
SNP_QC="data/${NAME}/clean/snp/${NAME}" | |
IND_QC="data/${NAME}/clean/ind/${NAME}" | |
IBD_QC="data/${NAME}/clean/ibd/${NAME}" | |
SNP_DUP="data/${NAME}/clean/snp_duplicates/${NAME}" | |
BED_QC="data/${NAME}/clean/dataset/${NAME}_qc" | |
# hapmap PCA variables | |
IND_PCA="data/${NAME}/clean/pca/${NAME}" | |
#### | |
# 1. Exclude unplaced (CHR=0), Y and mitochondrial SNPs | |
#### | |
gawk '{if($1==0 || $1==24 || $1==26) print $2}' ${BED_UPDATE}.bim > ${SNP_EXCLUDE}/fail_snp_non-autosomal.list | |
# concatenate lists | |
cat ${SNP_EXCLUDE}/*.list | sort -u > ${SNP_EXCLUDE}.exclusions | |
#### | |
# 2. Perform SNP QC. | |
#### | |
# create jobs list. | |
JOBS="--bfile ${BED_UPDATE} --exclude ${SNP_EXCLUDE}.exclusions --hardy --out ${SNP_QC} --silent\n | |
--bfile ${BED_UPDATE} --exclude ${SNP_EXCLUDE}.exclusions --freq --out ${SNP_QC} --silent\n | |
--bfile ${BED_UPDATE} --exclude ${SNP_EXCLUDE}.exclusions --missing --out ${SNP_QC} --silent" | |
# run jobs in parallel. | |
echo -e $JOBS | xargs -L 1 -P 3 ./bin/plink2 | |
# create exclusion lists | |
sed '1 d' ${SNP_QC}.lmiss | gawk -v threshold=${SNP_MISS_THRESH} '$5 > threshold {print $2}' > ${SNP_EXCLUDE}/fail_snp_call_rate.list | |
sed '1 d' ${SNP_QC}.frq | gawk -v threshold=${SNP_MAF_THRESH} '$5 < threshold {print $2}' > ${SNP_EXCLUDE}/fail_snp_freq.list | |
sed '1 d' ${SNP_QC}.hwe | grep ${SNP_HWE_GROUP} | gawk -v threshold=${SNP_HWE_THRESH} '$9 < threshold {print $2}' > ${SNP_EXCLUDE}/fail_snp_hwe.list | |
# concatenate lists | |
cat ${SNP_EXCLUDE}/*.list | sort -u > ${SNP_EXCLUDE}.exclusions | |
### | |
# 3. SNP duplicates based on chromosome and base position | |
### | |
./bin/plink2 \ | |
--bfile ${BED_UPDATE} \ | |
--exclude ${SNP_EXCLUDE}.exclusions \ | |
--missing \ | |
--out $SNP_DUP | |
python data/${NAME}/clean/scripts/positional_duplicates.py ${SNP_DUP}.lmiss ${BED_UPDATE}.bim ${SNP_EXCLUDE}/fail_snp_duplicates.list | |
# concatenate lists | |
cat ${SNP_EXCLUDE}/*.list | sort -u > ${SNP_EXCLUDE}.exclusions | |
#### | |
# 4. Perform sample call rate and heterozygosity QC. | |
#### | |
# create jobs list. | |
JOBS="--bfile ${BED_UPDATE} --exclude ${SNP_EXCLUDE}.exclusions --missing --out ${IND_QC} --silent --noweb\n | |
--bfile ${BED_UPDATE} --exclude ${SNP_EXCLUDE}.exclusions --het --out ${IND_QC} --silent --noweb" | |
# run jobs in parallel. | |
echo -e $JOBS | xargs -L 1 -P 2 ./bin/plink2 | |
# run .r to create summary table and plots. | |
Rscript data/${NAME}/clean/scripts/clean_data.R ${IND_QC} ${IND_EXCLUDE} ${IND_MISS_THRESH} ${IND_HET_SD} &> data/${NAME}/clean/ind/sample_quality.log | |
# concatenate lists. | |
cat ${IND_EXCLUDE}/*.list | sort -u > ${IND_EXCLUDE}.exclusions | |
#### | |
# 5. IBD. | |
#### | |
# generate clean dataset | |
./bin/plink2 \ | |
--bfile ${BED_UPDATE} \ | |
--remove ${IND_EXCLUDE}.exclusions \ | |
--exclude ${SNP_EXCLUDE}.exclusions \ | |
--make-bed \ | |
--out ${IBD_QC} | |
./bin/king_1.9 -b ${IBD_QC}.bed --bysample --prefix ${IBD_QC} | |
./bin/king_1.9 -b ${IBD_QC}.bed --kinship --related --degree 2 --prefix ${IBD_QC} | |
python data/${NAME}/clean/scripts/king.py --prefix ${IBD_QC} --out ${IND_EXCLUDE} | |
# concatenate sample qc lists | |
cat ${IND_EXCLUDE}/*.list | sort -u > ${IND_EXCLUDE}.exclusions | |
#### | |
# 6. PCA using SNPweights. | |
#### | |
# convert to EIGENSTRAT format | |
cat > ${IND_PCA}.par << EOL | |
genotypename: ${IND_PCA}.bed | |
snpname: ${IND_PCA}.bim | |
indivname: ${IND_PCA}.fam | |
outputformat: EIGENSTRAT | |
genotypeoutname: ${IND_PCA}.geno | |
snpoutname: ${IND_PCA}.snp | |
indivoutname: ${IND_PCA}.ind | |
familynames: NO | |
EOL | |
./bin/plink2 --bfile ${BED_UPDATE} --remove ${IND_EXCLUDE}.exclusions --exclude ${SNP_EXCLUDE}.exclusions --make-bed --out ${IND_PCA} | |
bin/convertf -p ${IND_PCA}.par | |
# run SNPweights | |
cat > ${IND_PCA}.sw.par << EOL | |
geno: ${IND_PCA}.geno | |
snp: ${IND_PCA}.snp | |
ind: ${IND_PCA}.ind | |
snpwt: bin/snpwt.CO | |
predpcoutput: ${IND_PCA}.predpc | |
EOL | |
bin/inferanc -p ${IND_PCA}.sw.par | |
sed -i '1iID STATUS SNPs PC1 PC2 EUR AFR ASN' ${IND_PCA}.predpc | |
#Rscript clean_data/scripts/snpweights.R ${IND_PCA} ${IND_EXCLUDE} | |
# create list of exclusions | |
gawk '{if($4 < 0.0085 && $5 < 0.075) print $1}' ${IND_PCA}.predpc > ${IND_PCA}.fail | |
grep -w -f ${IND_PCA}.fail ${IND_PCA}.fam | gawk '{print $1,$2}' > ${IND_EXCLUDE}/fail_pca.list | |
# concatenate sample qc lists | |
cat ${IND_EXCLUDE}/*.list | sort -u > ${IND_EXCLUDE}.exclusions | |
#### | |
# 7. Sex-check | |
#### | |
./bin/plink2 \ | |
--bfile ${BED_UPDATE} \ | |
--remove ${IND_EXCLUDE}.exclusions \ | |
--exclude ${SNP_EXCLUDE}.exclusions \ | |
--check-sex \ | |
--out $IND_QC | |
# create list of exclusions | |
gawk '{if($5 ~/PROBLEM/ && $3 > 0) print $1,$2}' ${IND_QC}.sexcheck > ${IND_EXCLUDE}/fail_check_sex.list | |
# concatenate sample qc lists | |
cat ${IND_EXCLUDE}/*.list | sort -u > ${IND_EXCLUDE}.exclusions | |
#### | |
# 8. Create QC'd dataset | |
#### | |
./bin/plink2 \ | |
--bfile ${BED_UPDATE} \ | |
--remove ${IND_EXCLUDE}.exclusions \ | |
--exclude ${SNP_EXCLUDE}.exclusions \ | |
--make-bed \ | |
--out ${BED_QC} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment