yassineS/run_sweepfinder2_byPop_byChrom.sh

## run_sweepfinder2_byPop_byChrom.sh
#!/bin/bash
# Author: Yassine Souilmi <yassine.souilmi@adelaide.edu.au>

set -u

# module load BCFtools

bed=$1 # bed file name
ref = $2 # link to the reference fasta file

# Create a polarised VCF for each population:
awk '{print $1}' ${bed}.fam | sort -u | while read pop
do
	echo "${pop}" > pop.txt
	mkdir -pv ${bed}_popSplit

	plink --bfile ${bed} \
	--keep-fam pop.txt \
	--recode vcf-iid bgz \
	--keep-allele-order \
	--out ${bed}_popSplit/${pop}_${bed}

	tabix -p vcf ${bed}_popSplit/${pop}_${bed}.vcf.gz

	bcftools norm \
	-t $(seq 1 22 | tr -s '\n' ',' | sed 's/22,/22/') \
	-O z -o ${bed}_popSplit/${pop}_${bed}_fixedRefAllele.vcf.gz \
	-c s \
	-f ~/fastdir/Refs/gatk-bundle/b37/human_g1k_v37_decoy.fasta \
	${bed}_popSplit/${pop}_${bed}.vcf.gz

	tabix -p vcf ${bed}_popSplit/${pop}_${bed}_fixedRefAllele.vcf.gz

done

# Create SFS files
mkdir -pv ${bed}_popSplit_SF2_inputs

for chr in {1..22}
do
	python ~/bin/vcf2SF.py \
		-g \
		-v ${bed}_popSplit/${pop}_${bed}_fixedRefAllele.vcf.gz \
		-c ${chr} \
		-o ${bed}_popSplit_SF2_inputs/${pop}_${bed}_${chr}.sfs
done


# Running SweepFinder2
wd=$PWD

awk '{print $1}' ${bed}.fam | sort -u | while read pop
	for chr in {1..22}
	do
    	mkdir -pv ${pop}_sf2_wd/${chr}
    	cd ${pop}_sf2_wd/${chr}/ #Making sure all jobs run in a dedicated folder

    	SweepFinder2 -sg 1000 ${wd}/${bed}_popSplit_SF2_inputs/${pop}_${bed}_${chr}.sfs ./${pop}_${chr}.SF2out

	done # for loop (chromosomes)
done # while loop (populations)

## vcf2SF.py
# File created by Arun Durvasula: https://gist.github.com/arundurvasula/30727385d4605fb26770
# Modified by Yassine Souilmi in May 2018


import gzip
import csv
import argparse
import sys

parser = argparse.ArgumentParser(description="script to convert an all sites vcf to sweepfinder format. FASTA description will be the sample name in the VCF header.Only does one chromosome/region at a time.")
parser.add_argument("-v", "--vcf", action="store", required=True, help="Input VCF file. Should be a multisample vcf, though it should theoretically work with a single sample.")
parser.add_argument("-o", "--out", action="store", required=True, help="Output filename")
parser.add_argument("-c", "--chromosome", action="store", required=True, help="Chromosome to output. Should be something in the first column of the vcf.")
parser.add_argument("-g", "--gzip", action="store_true", required=False, help="Set if the VCF is gzipped.")
parser.add_argument("-p", "--phased", action="store_true", required=False, help="Speficy if the genotypes are phased.")
args = parser.parse_args()

vcf_in = args.vcf
out_name = args.out
out_chr = args.chromosome

sample_names = []
sample_seqs = []

# Printing output information
print "Generating SFS"
print "**************"
print "From: ",vcf_in
print "For chrmosome: ",out_chr

if args.phased:
	sep = "|"
	print "The input VCF is phased"
else:
	sep =  "/"
	print "The input VCF is not phased"

print "Writing the output to: ",out_name
print "**************"

if args.gzip:
    opener = gzip.open
else:
    opener = open

sweep_out = open(out_name, 'w')
sweep_out.write("position\tx\tn\tfolded\n") # write header
folded = "1"

with opener(vcf_in, 'r') as tsvin:
    tsvin = csv.reader(tsvin, delimiter='\t')

    for row in tsvin:

        if any('##' in strings for strings in row):
            continue
        if any('#CHROM' in strings for strings in row):
            sample_names = row[9:]

        chrom,pos,id,ref,alt,qual,filter,info,format=row[0:9]
        haplotypes = row[9:]
        location = pos

    	if chrom == out_chr:
        	alt_list = alt.split(",")
        	x = 0
        	n = 0
        	for index,haplotype in enumerate(haplotypes):
            		if haplotype.split(sep)[0] != '.':
                		n = n + 1 # count up the non missing calls
            		if haplotype.split(sep)[0] != "1" and haplotype.split(sep)[0] != ".":
                		x = x + 1 # count reference alleles.

        	if x != 0 and x != n:
           		sweep_out.write(location+"\t"+str(x)+"\t"+str(n)+"\t"+folded+"\n")
       	else:
             	continue

print "Done!"
print "**************"
	#!/bin/bash
	# Author: Yassine Souilmi <yassine.souilmi@adelaide.edu.au>

	set -u

	# module load BCFtools

	bed=$1 # bed file name
	ref = $2 # link to the reference fasta file

	# Create a polarised VCF for each population:
	awk '{print $1}' ${bed}.fam \| sort -u \| while read pop
	do
	echo "${pop}" > pop.txt
	mkdir -pv ${bed}_popSplit

	plink --bfile ${bed} \
	--keep-fam pop.txt \
	--recode vcf-iid bgz \
	--keep-allele-order \
	--out ${bed}_popSplit/${pop}_${bed}

	tabix -p vcf ${bed}_popSplit/${pop}_${bed}.vcf.gz

	bcftools norm \
	-t $(seq 1 22 \| tr -s '\n' ',' \| sed 's/22,/22/') \
	-O z -o ${bed}_popSplit/${pop}_${bed}_fixedRefAllele.vcf.gz \
	-c s \
	-f ~/fastdir/Refs/gatk-bundle/b37/human_g1k_v37_decoy.fasta \
	${bed}_popSplit/${pop}_${bed}.vcf.gz

	tabix -p vcf ${bed}_popSplit/${pop}_${bed}_fixedRefAllele.vcf.gz

	done

	# Create SFS files
	mkdir -pv ${bed}_popSplit_SF2_inputs

	for chr in {1..22}
	do
	python ~/bin/vcf2SF.py \
	-g \
	-v ${bed}_popSplit/${pop}_${bed}_fixedRefAllele.vcf.gz \
	-c ${chr} \
	-o ${bed}_popSplit_SF2_inputs/${pop}_${bed}_${chr}.sfs
	done


	# Running SweepFinder2
	wd=$PWD

	awk '{print $1}' ${bed}.fam \| sort -u \| while read pop
	for chr in {1..22}
	do
	mkdir -pv ${pop}_sf2_wd/${chr}
	cd ${pop}_sf2_wd/${chr}/ #Making sure all jobs run in a dedicated folder

	SweepFinder2 -sg 1000 ${wd}/${bed}_popSplit_SF2_inputs/${pop}_${bed}_${chr}.sfs ./${pop}_${chr}.SF2out

	done # for loop (chromosomes)
	done # while loop (populations)
	# File created by Arun Durvasula: https://gist.github.com/arundurvasula/30727385d4605fb26770
	# Modified by Yassine Souilmi in May 2018




	import gzip
	import csv
	import argparse
	import sys

	parser = argparse.ArgumentParser(description="script to convert an all sites vcf to sweepfinder format. FASTA description will be the sample name in the VCF header.Only does one chromosome/region at a time.")
	parser.add_argument("-v", "--vcf", action="store", required=True, help="Input VCF file. Should be a multisample vcf, though it should theoretically work with a single sample.")
	parser.add_argument("-o", "--out", action="store", required=True, help="Output filename")
	parser.add_argument("-c", "--chromosome", action="store", required=True, help="Chromosome to output. Should be something in the first column of the vcf.")
	parser.add_argument("-g", "--gzip", action="store_true", required=False, help="Set if the VCF is gzipped.")
	parser.add_argument("-p", "--phased", action="store_true", required=False, help="Speficy if the genotypes are phased.")
	args = parser.parse_args()

	vcf_in = args.vcf
	out_name = args.out
	out_chr = args.chromosome

	sample_names = []
	sample_seqs = []

	# Printing output information
	print "Generating SFS"
	print "**************"
	print "From: ",vcf_in
	print "For chrmosome: ",out_chr

	if args.phased:
	sep = "\|"
	print "The input VCF is phased"
	else:
	sep = "/"
	print "The input VCF is not phased"

	print "Writing the output to: ",out_name
	print "**************"

	if args.gzip:
	opener = gzip.open
	else:
	opener = open

	sweep_out = open(out_name, 'w')
	sweep_out.write("position\tx\tn\tfolded\n") # write header
	folded = "1"

	with opener(vcf_in, 'r') as tsvin:
	tsvin = csv.reader(tsvin, delimiter='\t')

	for row in tsvin:

	if any('##' in strings for strings in row):
	continue
	if any('#CHROM' in strings for strings in row):
	sample_names = row[9:]

	chrom,pos,id,ref,alt,qual,filter,info,format=row[0:9]
	haplotypes = row[9:]
	location = pos

	if chrom == out_chr:
	alt_list = alt.split(",")
	x = 0
	n = 0
	for index,haplotype in enumerate(haplotypes):
	if haplotype.split(sep)[0] != '.':
	n = n + 1 # count up the non missing calls
	if haplotype.split(sep)[0] != "1" and haplotype.split(sep)[0] != ".":
	x = x + 1 # count reference alleles.

	if x != 0 and x != n:
	sweep_out.write(location+"\t"+str(x)+"\t"+str(n)+"\t"+folded+"\n")
	else:
	continue

	print "Done!"
	print "**************"