-
-
Save yassineS/fe2712ad52d76460b927e3f391ea51f6 to your computer and use it in GitHub Desktop.
#!/bin/bash | |
# Author: Yassine Souilmi <yassine.souilmi@adelaide.edu.au> | |
set -u | |
# module load BCFtools | |
bed=$1 # bed file name | |
ref = $2 # link to the reference fasta file | |
# Create a polarised VCF for each population: | |
awk '{print $1}' ${bed}.fam | sort -u | while read pop | |
do | |
echo "${pop}" > pop.txt | |
mkdir -pv ${bed}_popSplit | |
plink --bfile ${bed} \ | |
--keep-fam pop.txt \ | |
--recode vcf-iid bgz \ | |
--keep-allele-order \ | |
--out ${bed}_popSplit/${pop}_${bed} | |
tabix -p vcf ${bed}_popSplit/${pop}_${bed}.vcf.gz | |
bcftools norm \ | |
-t $(seq 1 22 | tr -s '\n' ',' | sed 's/22,/22/') \ | |
-O z -o ${bed}_popSplit/${pop}_${bed}_fixedRefAllele.vcf.gz \ | |
-c s \ | |
-f ~/fastdir/Refs/gatk-bundle/b37/human_g1k_v37_decoy.fasta \ | |
${bed}_popSplit/${pop}_${bed}.vcf.gz | |
tabix -p vcf ${bed}_popSplit/${pop}_${bed}_fixedRefAllele.vcf.gz | |
done | |
# Create SFS files | |
mkdir -pv ${bed}_popSplit_SF2_inputs | |
for chr in {1..22} | |
do | |
python ~/bin/vcf2SF.py \ | |
-g \ | |
-v ${bed}_popSplit/${pop}_${bed}_fixedRefAllele.vcf.gz \ | |
-c ${chr} \ | |
-o ${bed}_popSplit_SF2_inputs/${pop}_${bed}_${chr}.sfs | |
done | |
# Running SweepFinder2 | |
wd=$PWD | |
awk '{print $1}' ${bed}.fam | sort -u | while read pop | |
for chr in {1..22} | |
do | |
mkdir -pv ${pop}_sf2_wd/${chr} | |
cd ${pop}_sf2_wd/${chr}/ #Making sure all jobs run in a dedicated folder | |
SweepFinder2 -sg 1000 ${wd}/${bed}_popSplit_SF2_inputs/${pop}_${bed}_${chr}.sfs ./${pop}_${chr}.SF2out | |
done # for loop (chromosomes) | |
done # while loop (populations) |
# File created by Arun Durvasula: https://gist.github.com/arundurvasula/30727385d4605fb26770 | |
# Modified by Yassine Souilmi in May 2018 | |
import gzip | |
import csv | |
import argparse | |
import sys | |
parser = argparse.ArgumentParser(description="script to convert an all sites vcf to sweepfinder format. FASTA description will be the sample name in the VCF header.Only does one chromosome/region at a time.") | |
parser.add_argument("-v", "--vcf", action="store", required=True, help="Input VCF file. Should be a multisample vcf, though it should theoretically work with a single sample.") | |
parser.add_argument("-o", "--out", action="store", required=True, help="Output filename") | |
parser.add_argument("-c", "--chromosome", action="store", required=True, help="Chromosome to output. Should be something in the first column of the vcf.") | |
parser.add_argument("-g", "--gzip", action="store_true", required=False, help="Set if the VCF is gzipped.") | |
parser.add_argument("-p", "--phased", action="store_true", required=False, help="Speficy if the genotypes are phased.") | |
args = parser.parse_args() | |
vcf_in = args.vcf | |
out_name = args.out | |
out_chr = args.chromosome | |
sample_names = [] | |
sample_seqs = [] | |
# Printing output information | |
print "Generating SFS" | |
print "**************" | |
print "From: ",vcf_in | |
print "For chrmosome: ",out_chr | |
if args.phased: | |
sep = "|" | |
print "The input VCF is phased" | |
else: | |
sep = "/" | |
print "The input VCF is not phased" | |
print "Writing the output to: ",out_name | |
print "**************" | |
if args.gzip: | |
opener = gzip.open | |
else: | |
opener = open | |
sweep_out = open(out_name, 'w') | |
sweep_out.write("position\tx\tn\tfolded\n") # write header | |
folded = "1" | |
with opener(vcf_in, 'r') as tsvin: | |
tsvin = csv.reader(tsvin, delimiter='\t') | |
for row in tsvin: | |
if any('##' in strings for strings in row): | |
continue | |
if any('#CHROM' in strings for strings in row): | |
sample_names = row[9:] | |
chrom,pos,id,ref,alt,qual,filter,info,format=row[0:9] | |
haplotypes = row[9:] | |
location = pos | |
if chrom == out_chr: | |
alt_list = alt.split(",") | |
x = 0 | |
n = 0 | |
for index,haplotype in enumerate(haplotypes): | |
if haplotype.split(sep)[0] != '.': | |
n = n + 1 # count up the non missing calls | |
if haplotype.split(sep)[0] != "1" and haplotype.split(sep)[0] != ".": | |
x = x + 1 # count reference alleles. | |
if x != 0 and x != n: | |
sweep_out.write(location+"\t"+str(x)+"\t"+str(n)+"\t"+folded+"\n") | |
else: | |
continue | |
print "Done!" | |
print "**************" |
Thank you for your feedback. The value "1" in the script is contradictory to the comment, so you aren't wrong. However, the allele you are counting (reference or alt) shouldn't make a difference to the shape of your SFS if it is folded (i.e. not polarized by ancestral allele). So you can either use the script as-is or change it to "0" as you described.
Just for sake of demonstration, I ran a test on a small-ish dataset where I know I got some outliers with the version of the script above counting ref alleles, and one with a modified version where I'm counting the alt alleles. Here's how the values in general compare, the x-axis represent the former (counting ref) and the y-axis represents the modified version:
The plot above shows consistency in values, especially in high CLR values which is what determines your outliers. However, it isn't very informative about the genomic distribution of the values. So I also plotted the values in manhattan plot to show that the genomic distribution is the same, lightblue represents the default script and lightgrey the suggested modification:
Your question is very relevant and good for you for spotting my comment error. Hope this was informative.
Hi, Yassine
I hoped to get insight into how I assign my VCFs to their populations.
"Admixture has obscured signals of historical hard sweeps in humans." Uses Plink1.9 to assign the populations. Here, with this new edition of the bash script, where do I assign my population groups?
Should there be one bed file for each genome?
Or one bed file per population?
Thanks,
Michael
Hi Michael,
The plink step loops over the populations and creates one vcf for each of the populations you are looping over. The awk '{print $1}' ${bed}.fam | sort -u | while read pop
step extracts the population labels from the .fam
file, and creates the vcfs in question.
Cheers,
Yassine
Hi, there. I am new to population genetics and I am confused by the script
vcf2SF.py
.I am confused about the 73th row, which count the derived allele. Because I read from the manual the "0" means the site is the same with column "REF", "1" means the site is the same with the first one of the column "ALT". So I feel like the expression show be
if haplotype.split(sep)[0] != "0" and haplotype.split(sep)[0] != ".":
.I'll appreciate that, if you could answer my questions. Do please my rudeness if I said something wrong.