Skip to content

Instantly share code, notes, and snippets.

@dinovski
Last active October 19, 2018 07:32
Show Gist options
  • Save dinovski/69a208fbf0dd1e4c0080fc89967eacd8 to your computer and use it in GitHub Desktop.
Save dinovski/69a208fbf0dd1e4c0080fc89967eacd8 to your computer and use it in GitHub Desktop.
imputation, phasing, and file wrangling for local ancestry inference with RFMix
#!/bin/bash
## 1. Imputation of WGS samples with Impute2 (http://mathgen.stats.ox.ac.uk/impute/impute_v2.html)
## and phasing with Shapeit (http://www.shapeit.fr/)
## 2. File wrangling for local ancestry inference with RFMix (https://github.com/slowkoni/rfmix)
##
## NOTE: 'PopPhased' directory included with RFMix MUST be local to working directory ie. ${ODIR}
## reference data was obtained from http://genetics.med.harvard.edu/reichlab/Reich_Lab/Datasets.html
## Bin paths
PYTHON=
PLINK=
SHAPEIT=
RFMIX=
IDIR=/path/to/plink/files
ODIR=/out/path
NAME=jnomes.hgdp
REF_DIR=/path/to/ref/samps
## sample VCFs must be converted to plink format and merged with reference data
${PLINK} --vcf ${VCF} --biallelic-only strict --make-bed --out ${IDIR}/jnomes
${PLINK} --bfile ${IDIR}/jnomes --bmerge ${IDIR}/hgdp --make-bed --out ${IDIR}/${NAME}
## split dataset by chr prior to phasing
for chr in $(seq 1 22); do
${PLINK} --bfile ${IDIR}/${NAME} \
--chr $chr \
--make-bed \
--out ${ODIR}/${NAME}.chr$chr
done
## remove problem snps
## Prior to phasing, individuals or sites with more than 5% missing genotypes and failing strand alignment are excluded.
## Monomorphic or singleton markers should be removed, as they are not informative for phasing
for chr in $(seq 1 22); do
${SHAPEIT} -check --input-bed ${IDIR}/${NAME}.chr${chr}.bed ${IDIR}/${NAME}.chr${chr}.bim ${IDIR}/${NAME}.chr${chr}.fam \
-M ${REF_DIR}/chr${chr}.genetic_map.txt \
--input-ref ${REF_DIR}/chr${chr}.haplotypes.gz \
${REF_DIR}/chr${chr}.legend.gz \
${REF_DIR}/ALL_1000G_phase1integrated_v3.sample \
--output-log ${ODIR}/check${chr} 2>&1 ${ODIR}/logfile.txt ;
done
## phase
for chr in $(seq 1 22); do
${SHAPEIT} --input-bed ${IDIR}/${NAME}.chr${chr}.bed ${IDIR}/${NAME}.chr${chr}.bim ${IDIR}/${NAME}.chr${chr}.fam \
-M ${REF_DIR}/chr${chr}.genetic_map.txt \
--input-ref ${REF_DIR}/chr${chr}.haplotypes.gz \
${REF_DIR}/chr${chr}.legend.gz \
${REF_DIR}/ALL_1000G_phase1integrated_v3.sample \
--exclude-snp check${chr}.snp.strand.exclude \
-O ${ODIR}/output${chr} \
--output-log ${ODIR}/real${chr} 2>&1 > ${ODIR}/logfile${chr}.txt ;
done
## Modify haps files to select populations of interest to compare to admixed (ie. input) samples
## split 'haps' files (output from phasing with Shapeit)
## person in the nth row has columns (n-1)*2 + 1 and (n-1)*2 + 2
SAMP_FILE=${ODIR}/output.revised.pops.SAMPLE
sed 1,2d ${SAMP_FILE} | tr ' ' '\t' | cat -n - | awk '($2 ~ "GEN")'
## TO DO: automate column detection and separation of admixed samples
## admixed sample column info
## 275 GEN15-59-D GEN15-59-D 0 0 0 0 -9 = col 549,550
## 276 GEN15-60-D GEN15-60-D 0 0 0 0 -9 = col 551,552
## 277 GEN15-61-D GEN15-61-D 0 0 0 0 -9 = col 553,554
## 278 GEN15-63-D GEN15-63-D 0 0 0 0 -9 = col 555,556
for chr in $(seq 1 22); do
cat ${ODIR}/output${chr}.haps | tr ' ' '\t' | cut -f 1-5 | tr '\t' ' ' > ${ODIR}/output${chr}.haps.header
done
## remove admixed samples from haps files
for chr in $(seq 1 22); do
cat ${ODIR}/output${chr}.haps | tr ' ' '\t' | cut -f 1-5 --complement | cut -f 549-556 --complement | tr '\t' ' ' | \
paste ${ODIR}/output${chr}.haps.header - | tr '\t' ' ' > ${ODIR}/ref${chr}.haps
done
## keep only admixed samples in haps files
for chr in $(seq 1 22); do
cat ${ODIR}/output${chr}.haps | tr ' ' '\t' | cut -f 1-5 --complement | cut -f 549-556 | tr '\t' ' ' | \
paste ${ODIR}/output${chr}.haps.header - | tr '\t' ' ' > ${ODIR}/admixed${chr}.haps
done
## sample files
sed -n 1,2p ${SAMP_FILE} > ${ODIR}/sample.header
## admixed samples
cat ${SAMP_FILE} | tr ' ' '\t' | awk '($2 ~ "GEN")' | tr '\t' ' ' | cat ${ODIR}/sample.header - > ${ODIR}/admixed.SAMPLE
## reference samples
cat ${SAMP_FILE} | tr ' ' '\t' | awk '($2 !~ "GEN")' | tr '\t' ' ' > ${ODIR}/refonly.SAMPLE
## edit ref.haps file to include specific populations
## BEGIN narrow-cut-haps.sh
for chr in $(seq 1 22); do
SAMP_FILE=${ODIR}/output.revised.pops.SAMPLE
HAPS=${ODIR}/ref${chr}.haps
## print a list of columns (1-based) to keep
COLS=$(awk 'BEGIN {
## List of populations to keep (from first column of sample file)
list["Druze" ] = 1
list["French"] = 1
list["Italian" ] = 1
list["Palestinian" ] = 1
list["Russian" ] = 1
list["Adygei" ] = 1
# Keep the first 5 columns of the HAPS file
print 1
print 2
print 3
print 4
print 5
}
$1 in list {
# for each population to keep, print the corresponding columns in the HAPS file.
# 1. skip the first 5 columns
# 2. assume te input file (in "sample" format) has two header lines,
# subtract 3 from the line number (NR) to make column number zero-based
# 3. multiply by 2 (as there are two fields for each item in he HAPS file)
# 4. add 6 => 5 fields + 1 to make it 1-based again.
# add 6+1 for the next field number of the pair.
print (NR-3)*2+6
print (NR-3)*2+6+1
}' "$SAMP_FILE")
## Combine columns into a comma-separated list
COLS=$(echo "$COLS" | paste -d, -s)
cut -d' ' --fields "$COLS" "$HAPS" > ${ODIR}/rfmix-narrow${chr}.haps
done
## END
## edit sample file to include ref samples by pop in modified .haps file
## wide cut (ie, more populations)
cat ${SAMP_FILE} | tr ' ' '\t' | awk '($1 == "Adygei" || $1 =="Druze" || $1 == "French" || $1 == "Han" || \
$1 == "Italian" || $1 == "Luhya" || $1 == "Orcadian" || $1 == "Palestinian" || $1 == "Russian" || $1 == "Yoruba")' | \
tr '\t' ' ' | cat {ODIR}/sample.header - > ${ODIR}/ref.wide.SAMPLE
## narrow cut
cat ${SAMP_FILE} | tr ' ' '\t' | awk '($1 =="Druze" || $1 == "French" || $1 == "Italian" || \
$1 == "Palestinian" || $1 == "Russian" || $1 == "Adygei")' | tr '\t' ' ' | cat {ODIR}/sample.header - > {ODIR}/ref.narrow.SAMPLE
## edit pop labels so that there are 4 classes:
## levant = druze/palestinian
## sw_eur = french/italian
## e_eur = russian
## khazar = adygei
cat {ODIR}/ref.narrow.SAMPLE | tr ' ' '\t' | awk '{ if ($1 == "Druze" || $1 == "Palestinian") {$1 = "levant"} ; print $0}' | \
awk '{if ($1 == "French" || $1 == "Italian") {$1 = "sw_eur"} ; print $0}' | \
awk '{ if ($1 == "Russian") {$1 = "e_eur"} ; print $0}' | \
awk '{ if ($1 == "Adygei") {$1 = "khazar"} ; print $0}' > {ODIR}/ref.narrow.mod.SAMPLE
## 'keep' files=single sample per line
cat {ODIR}/admixed.SAMPLE | tr ' ' '\t' | cut -f 2 | sed 1,2d > {ODIR}/admixed.keep
#cat {ODIR}/ref.wide.SAMPLE | sed 1,2d | tr ' ' '\t' | cut -f 2 > {ODIR}/ref.wide.keep
cat {ODIR}/ref.narrow.mod.SAMPLE | sed 1,2d | tr ' ' '\t' | cut -f 2 > {ODIR}/ref.narrow.keep
## shapeit to rfmix by chr: narrow pops
## rfmix-narrow${chr}.haps are modified ref${chr}.haps files
for chr in $(seq 1 22); do
${PYTHON} ${RFMIX}/shapeit2rfmix.py \
--shapeit_hap_ref {ODIR}/rfmix-narrow${chr}.haps \
--shapeit_hap_admixed {ODIR}/admixed${chr}.haps \
--shapeit_sample_ref {ODIR}/ref.narrow.mod.SAMPLE \
--shapeit_sample_admixed {ODIR}/admixed.SAMPLE \
--ref_keep {ODIR}/ref.narrow.keep \
--admixed_keep {ODIR}/admixed.keep \
--chr ${chr} \
--genetic_map ${REF_DIR}/chr${chr}.genetic_map.txt \
--out {ODIR}/${NAME}_${chr}_narrow
done
## shapeit to rfmix output files must end in "chr${chr}.map" etc.; need to rename due to redundant chr names
for chr in $(seq 1 22); do
mv ${ODIR}/${NAME}_${chr}_narrow_chr${chr}.snp_locations ${ODIR}/${NAME}.narrow.chr${chr}.snp_locations
done
for chr in $(seq 1 22); do
mv ${ODIR}/${NAME}_${chr}_narrow_chr${chr}.map ${ODIR}/${NAME}.narrow.chr${chr}.map
done
for chr in $(seq 1 22); do
mv ${ODIR}/${NAME}_${chr}_narrow_chr${chr}.alleles ${ODIR}/${NAME}.narrow.chr${chr}.alleles
done
## edit classes files to assign index to each ref population
## this must be done if ref and admixed samples were phased together
## --sample argument includes all samples
## This 'keep' file is different: ie. first 2 columns of sample files
## Sample files are not sample files...they are output from shapeit to rfmix
sed 1,2d ${ODIR}/ref.narrow.mod.SAMPLE | tr ' ' '\t' | cut -f 1,2 | tr '\t' ' ' > ${ODIR}/ref.narrow.keep
## there must be a keep file for each individual population
awk '{print $0 >> $1 ".keep"}' ${ODIR}/ref.narrow.keep
for chr in $(seq 1 22); do
${PYTHON} ${RFMIX}/classes.py \
--ref ${ODIR}/e_eur.keep,${ODIR}/khazar.keep,${ODIR}/levant.keep,${ODIR}/sw_eur.keep \
--sample ${ODIR}/${NAME}_${chr}_narrow.sample \
--out ${ODIR}/${NAME}_${chr}.narrow.mod.classes
done
## RUN RFMIX: 2 EM iterations, 50 generations; min node size 5 (min num ref haplotypes per tree node, default=1); bootstrap 0 (for admixed)
## must run with 'PopPhased' dir local
## node size 5 recommended if ref sample sizes are skewed > 2:1 or if doing EM or if related samples (to reduce variance)
#RFMIX_OPTS='-e 2 -n 5 -w 0.2 -G 50 -s 0'
## rerun RFMix with larger window; OMNI array has ~600K SNPs and ~300K overlap with jnomes so we only
## included about 20 per window before with -w 0.2 (cM); try going up to 1cM
RFMIX_OPTS='-e 2 -n 5 -w 1 -G 50 -s 0'
for chr in $(seq 1 22); do
${PYTHON} ${RFMIX}/RunRFMix.py ${RFMIX_OPTS} \
--num-threads 4 \
--use-reference-panels-in-EM \
--forward-backward \
PopPhased \
${ODIR}/${NAME}.narrow.chr${chr}.alleles \
${ODIR}/${NAME}_${chr}.narrow.mod.classes \
${ODIR}/${NAME}.narrow.chr${chr}.snp_locations \
-o ${ODIR}/${NAME}_chr${chr}.rfmix
done
## Collapse output
## multiple viterbi files indexed by EM iteration = 1 row per snp and 1 col per admixed haplotype
## forward-backward = each haplotype gets 1 col per ancestry (posterior probability of that ancestry
## at that SNP in that haplotype, for as many ref pops used)
## corrected phasings = same format as alleles file (1 row per snp, 1 col per haplotype) but only contains admixed haplotypes
## collapse output into bed files and generate karyogram plots
## use ${ODIR}/${NAME}.revised.pops.SAMPLE for pop IDs
## pop labels need to be in order of "rfmix populations", ie. order of ref pop classes
## determined the class IDs by comparing a *narrow.mod.classes file with ref.narrow.mod.SAMPLE:
## 0=ashkenazi (ie. admixed samples)
## 1=e_eur
## 2=khazar
## 3=levant
## 4=sw_eur
## with fbk; default threshold (0.9)
SAMP_ID=GEN15-59-D ## admixed sample of interest
for chr in $(seq 1 22); do
${PYTHON} ${RFMIX}/collapse_ancestry_mod.py \
--rfmix ${ODIR}/${NAME}_chr${chr}.rfmix.2.Viterbi.txt \
--snp_locations ${ODIR}/${NAME}.narrow.${chr}.snp_locations \
--fbk ${ODIR}/${NAME}_${chr}.rfmix.2.ForwardBackward.txt \
--ind ${SAMP_ID} \
--ind_info ${ODIR}/${NAME}_narrow.sample \
--pop_labels e_eur,khazar,levant,sw_eur \
--out ${SAMP_ID}
done
## note: forward backward is same format as Viterbi except instead of 1 col per
## ancestry, each haplotype gets 1 col per ancestry
## value is the posterior probability of that ancestry at that SNP in that haplotype
## Forward-Backward gives marginal probability for each individual state, Viterbi
## gives probability of the most likely sequence of states
## no fbk
for chr in $(seq 1 22); do
${PYTHON} ${RFMIX}/collapse_ancestry_mod.py \
--rfmix ${ODIR}/${NAME}_chr${chr}.rfmix.2.Viterbi.txt \
--snp_locations ${ODIR}/${NAME}.narrow.chr${chr}.snp_locations \
--ind ${SAMP_ID} \
--ind_info ${ODIR}/${NAME}_narrow.sample \
--pop_labels e_eur,khazar,levant,sw_eur \
--out ${ODIR}/${SAMP_ID}_nofbk
done
## plot karyogram
IND='${SAMP_ID}_nofbk'; ${PYTHON} ${RFMIX}/plot_karyogram.py \
--bed_a ${IND}_A.bed \
--bed_b ${IND}_B.bed \
--pop_order e_eur,khazar,levant,sw_eur \
--centromeres ${RFMIX}/centromeres_hg19.bed \
--ind ${IND} \
--out ${IND}.png
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment