muppetjones/kSNP3_cluster.sh

## kSNP3_cluster.sh
#!/bin/tcsh
#v3.91
########################################################################

# WHERE ARE ALL THE kSNP SCRIPTS?
# IF YOU INSTALLED kSNP ANYWHERE OTHER THAN /usr/local THEN YOU MUST MODIFY THIS TO POINT TO THE DIRECTORY WHERE YOU HAVE INSTALLED kSNP SCRIPTS
# The kSNP helper programs used below (jellyfish, mummer, parsimonator, ...) are all invoked as $kSNP/<program>.
set kSNP=/usr/local/kSNP3
#set kSNP=/g/g15/shea/kSNP3_Source

########################################################################


# Example:  kSNP3  -in example.fasta.list -outdir Test.out -k 13 -annotate example.annotate_list -min_frac 0.7


# With no arguments at all, print the option summary and stop.
# The -vcf help text names the -annotate file, which is the list the reference genome is taken from.
if  ($#argv == 0)  then
    printf "Usage: kSNP3 <options>\
Options are the following: \
 -k <kmer_length> # required\n\
 -in <input_fastaFile_list>  # required file listing  full path location of each genome and the genome name, one line per genome, tab delimited between full path to genome fasta file in column 1 and genome name in column 2. This format allows multi-read,multi-chromosome, and multi-contig genomes, each genome in separate fasta. If multiple chromosomes are listed as separate fasta entries in a single genome file, positions and annotations are found for each gi number \n\
 -outdir <output_directory> # required \n\
 -annotate <annotate_list> # optional file listing genome names for which to find positions and annotate SNPs, names match column 2 of the -in file. \n\
 -SNPs_all <path to SNPs all file> # optional, if given then it uses existing SNPs instead of searching for new ones, and adds new genomes to the existing analysis. Assumes only the new genomes are listed in the -in file.\n\
 -core # optional, if present calculate core SNPs and core SNP parsimony tree\n\
 -ML # optional, if present calculate Maximum Likelihood tree\n\
 -min_frac <minimum_fraction_genomes_with_locus> # optional to create a parsimony tree based only on SNP loci that occur in at least this fraction of genomes, for example -min_frac 0.5 \n\
 -genbank <genbank.gbk> # optional file for SNP annotation\n\
 -CPU <num_CPU> # optional, number of CPU's to use, if not specified it uses all available\n\
 -NJ # optional, calculate a neighbor joining tree\n\
 -vcf # optional, create a vcf file using the first genome specified in the -annotate file as the reference genome\n\
 -all_annotations  # optional, annotate each locus exhaustively with all the annotations in any of the annotated genomes. Without this option it only provides the first annotation it comes to for a given locus, checking in the order genomes are listed in the -annotate file.\n"
    exit
endif

# Defaults for the optional settings; the command-line parser below may override them.
# DEBUG is read at the very end: a value of 1 or more keeps TemporaryFilesToDelete.
set DEBUG = 0
set all_annotations = 0
set annotate_list = "nonexistent_file"
set genbankFile = "nonexistent_file"

# Directory the script was launched from; relative input paths are resolved against it.
set thisDir = `pwd`

echo "Location of kSNP scripts: "
echo "$kSNP"
echo "The home directory is $thisDir"

# Parse the command line.
# Boolean flags only set a marker variable.  Flags that take a value shift once to reach
# the value; the shift at the bottom of the loop then discards whatever is in position 1
# (the flag itself for boolean flags, the value for valued flags).
# An unrecognised argument is reported as-is and skipped by the bottom shift alone, so the
# argument that follows it is still parsed normally.
while ($#argv > 0)
    switch ($argv[1])
        case -vcf:
            set vcf=1
            breaksw
        case -NJ:
            set nj=1
            breaksw
        case -ML:
            set ML=1
            breaksw
        case -core:
            set core=1
            breaksw
        case -k:
            shift
            set k=$argv[1]
            breaksw
        case -in:
            shift
            set fasta_list="$argv[1]"
            breaksw
        case -outdir:
            shift
            set dir="$argv[1]"
            breaksw
        case -annotate:
            shift
            set annotate_list="$argv[1]"
            breaksw
        case -all_annotations:
            set all_annotations=1
            breaksw
        case -min_frac:
            shift
            set min_fraction_with_locus="$argv[1]"
            breaksw
        case -genbank:
            shift
            set genbankFile="$argv[1]"
            breaksw
        case -CPU:
            shift
            set num_cpus=$argv[1]
            breaksw
        case -SNPs_all:
            shift
            set SNPs_all="$argv[1]"
            breaksw
        default:
            printf "Unknown parameter %s\n" "$argv[1]"
            breaksw
    endsw
    shift
end


# Pass every path-like option through the add_paths3 helper together with the launch
# directory, store the result back in the same variable and echo it.
# NOTE(review): add_paths3 presumably turns relative paths into absolute ones so they stay
# valid after the later cd into the output directory -- confirm against the helper.
if ($?dir) then
    set dir = `$kSNP/add_paths3 "$dir" "$thisDir"`
    printf "dir $dir\n"
endif

if ($?fasta_list) then
    set fasta_list = `$kSNP/add_paths3 "$fasta_list" "$thisDir"`
    printf "fasta_list $fasta_list\n"
endif


# annotate_list and genbankFile are given the placeholder value nonexistent_file before
# the command line is parsed, so these two tests are true even when the options are omitted.
if ($?annotate_list) then
    set annotate_list = `$kSNP/add_paths3 "$annotate_list" "$thisDir"`
    printf "annotate_list $annotate_list\n"
endif

if ($?genbankFile) then
    set genbankFile = `$kSNP/add_paths3 "$genbankFile" "$thisDir"`
    printf "genbankFile $genbankFile\n"
endif

# SNPs_all has no default, so this runs only when -SNPs_all was given.
if ($?SNPs_all) then
    set SNPs_all = `$kSNP/add_paths3 "$SNPs_all" "$thisDir"`
    printf "SNPs_all $SNPs_all\n"
endif

# -in, -outdir and -k are documented as required.  Check them here, before they are first
# expanded, so a missing one gives a readable message instead of an Undefined variable abort.
if (! $?fasta_list || ! $?dir || ! $?k) then
    printf "ERROR: -in, -outdir and -k are all required. Run kSNP3 without arguments for usage.\n"
    exit 1
endif

echo "Starting kSNP"
date
# Start time in epoch seconds; used at the end to report the elapsed hours.
set startseconds=`date +%s`

# Echo the effective settings so they end up in the run log.
echo "input fasta_list: $fasta_list"
echo "output directory: $dir"
echo "k=$k"
echo "annotate_list file: $annotate_list"
if ($all_annotations == 1) then
    echo "Report all annotations."
else
    echo "Report minimal annotations."
endif

if ( $?min_fraction_with_locus ) then
    echo "min_fraction_with_locus: $min_fraction_with_locus"
endif

# Only mention the genbank file when it really exists on disk.
if ($?genbankFile  ) then
    if ( -e "$genbankFile") then
        echo "Genbank file for annotations (and any from NCBI with gi number which are automatically downloaded): $genbankFile"
    endif
endif

# Work out how many processes to run unless -CPU already supplied a number.
if (! $?num_cpus) then
    # get the operating system
    set OS=`uname`
    echo "The operating system is $OS"

    if ("$OS" == 'Darwin') then
	# Pipe the hardware report straight into awk; no scratch file is left in the working directory.
	set num_cpus=`/usr/sbin/system_profiler SPHardwareDataType | awk '/Total Number of Cores/ {print $5}'`
	echo "There are $num_cpus CPUs"
	##@ num_cpus=$num_proc
    else
	set num_cpus=`cat /proc/cpuinfo | grep processor | wc -l`
    endif

    # An empty probe result would turn the numeric test below into a syntax error,
    # so map it to 0 and let the fallback handle it.
    if ("$num_cpus" == "") set num_cpus=0
    if ($num_cpus < 1 ) then
	set num_cpus=8
	echo "Could not figure out the number of CPUs, will run 8 processes"
    endif

endif
echo "Number CPUs: $num_cpus"

# Check the fasta genome files to be sure line endings are Unix and fix them if they are not.
# LE2Unix is run on a copy of the list made in the launch directory.
cp -f "$fasta_list" fasta_list
$kSNP/LE2Unix fasta_list
# First check genome names. Prints to STDERR if duplicate names, STDOUT list of genome names parsed for kSNP. Use names corresponding to (none, any or all of) these in the $annotate_list file.
#echo  "Sequence names used for kSNP:"
#$kSNP/genome_names3  "$fasta"

# Everything from here on runs inside the output directory.
if (! -e "$dir") mkdir "$dir"
cd "$dir"

# Local working copies of the two lists; a missing annotate list becomes an empty file.
if ( -e "$annotate_list") then
    cp -f "$annotate_list" annotate_list
else
    touch annotate_list
endif
cp -f "$fasta_list" fasta_list

# DOS and old-Mac line endings to Unix: CRLF first, then any remaining lone CR.
foreach list_copy (annotate_list fasta_list)
    perl -i -pe 's/\015\012/\012/g; s/\015/\012/g' $list_copy
end


echo "Finished genomes for finding SNP positions:"
cat annotate_list
echo ""


# Make lookup table of genome names and fsplit# files, and create fsplit# files by merging entries of multi-contig/multi-read input genomes.
# Line N+1 of fasta_list (path in column 1, name in column 2, tab separated) becomes fsplitN,
# and fileName2genomeName records the fsplitN to genome name mapping.
set count=0
# Count records with awk NR rather than wc -l: wc -l counts newline characters, so a list whose
# last line has no trailing newline would lose its final genome.
set num_seqs=`awk 'END {print NR}' fasta_list`
echo "Number of input sequences: $num_seqs "
printf "" >! fileName2genomeName
while ($count < $num_seqs)
    set name=`awk -F'\011' -v c="$count" 'FNR==c+1 {print $2}' fasta_list`
    set file=`awk -F'\011' -v c="$count" 'FNR==c+1 {print $1}' fasta_list`
    # Names and paths are passed as printf arguments, never as the format, so a percent sign in them is harmless.
    printf "%s\t%s\t%s\n" "$count" "$name" "$file"
    $kSNP/merge_fasta_reads3  "$file" >! fsplit$count
    printf "fsplit%s\t%s\n" "$count" "$name" >> fileName2genomeName
    @ count ++
end

# Enumerate the k-mers of every genome into kmers_all.<fsplit>.  jellyfish is used for k up to 31
# and sa for longer k.  A genome whose kmers_all file already exists and is non-empty is skipped.
if ($k <= 31) then
    # jellyfish can do forward and reverse complement counts at same time, only the canonical direction (first in sorted list) kmer is listed, but counts are for both directions
    date
    echo "Running jellyfish to find k-mers"
    foreach genome_split (fsplit*[0-9])
	if (! -s kmers_all.$genome_split) then
	    echo "$genome_split"
	    $kSNP/jellyfish count -C -o Jelly.$genome_split -m $k -s 1000000000  -t $num_cpus  $genome_split
	    # jellyfish may write several Jelly.<fsplit>_* pieces; dump them all, then sort once.
	    printf "" >! unsortedkmers.$genome_split
	    foreach jelly_part (Jelly."$genome_split"_*)
		$kSNP/jellyfish dump -c $jelly_part  >> unsortedkmers."$genome_split"
	    end
	    sort unsortedkmers.$genome_split >! kmers_all.$genome_split
	    rm -f unsortedkmers.$genome_split
	endif
    end
    echo "Finished running jellyfish"
else
    echo "Running sa to find k-mers"
    date
    foreach genome_split (fsplit*[0-9])
	if (! -s kmers_all.$genome_split) then
	    $kSNP/sa $genome_split $k 0
	    $kSNP/rc_kmer_freqs3 $genome_split.counts >! kmers_all.$genome_split
	    rm -f $genome_split.counts
	endif
    end
    echo "Finished running sa"
    date
endif

# Per genome: write the count column of kmers_all.<fsplit> to freq.<fsplit>, ask get_quantile3 for
# the cutoff, and keep in kmers.<fsplit> only the k-mers whose count is at least that cutoff.
echo "Removing kmers that occur less than freq=average of median and mean kmer frequency for that genome."
date
foreach genome_split (fsplit*[0-9])
    awk '{print $2}' kmers_all.$genome_split > ! freq.$genome_split
    set coverage_cutoff=`$kSNP/get_quantile3 freq.$genome_split`
    echo "minimum kmer coverage for $genome_split is $coverage_cutoff"
    awk -v m=$coverage_cutoff '$2>=m' kmers_all.$genome_split >! kmers.$genome_split
end
date
# The per-genome frequency files were only needed to compute the cutoffs.
rm freq.*

# For each genome, work inside its own Dir.<fsplit> directory: split the filtered k-mers into
# subset files with subset_mers3, then queue one delete_allele_conflicts3 job per *.mers subset
# and run the queue through parallel_commands3.
echo "Removing conflicting kmers from each genome with conflicting alleles"
date
foreach genome_split (fsplit*)
    echo $genome_split
    mkdir Dir.$genome_split
    cd Dir.$genome_split
    $kSNP/subset_mers3 ../kmers.$genome_split
    printf "" >! cmds_remove_conflicting
    foreach mer_file (*.mers)
	echo "$kSNP/delete_allele_conflicts3 $mer_file" >> cmds_remove_conflicting
    end
    $kSNP/parallel_commands3 $num_cpus cmds_remove_conflicting
    cd ..
end
echo "Finished removing conflicting kmers"
date


# For every subset name listed by subset_mer_list3, queue a sort -m -u that merges that subset's
# conflictsDeleted file from all Dir.* directories into one file named after the subset,
# then run the queue through parallel_commands3.
echo "Merged sorted kmer files and remove duplicates"
date
$kSNP/subset_mer_list3 > ! mer_list
printf "" >! cmds_sort
foreach mer_subset (`cat mer_list`)
    echo "sort  -m  -u Dir.*/$mer_subset.conflictsDeleted  > $mer_subset" >> cmds_sort
end
$kSNP/parallel_commands3 $num_cpus cmds_sort
echo "Finished merging kmers across genomes"
date

################################ NEW
# Produce SNP_loci.<subset>.fasta for every k-mer subset, in one of two ways:
#   -SNPs_all given : do not look for new SNPs, reuse the loci from that file
#                     (this is how genomes are added to an existing analysis)
#   otherwise       : discover SNP loci from the merged k-mer subsets
if ($?SNPs_all) then
    printf "Using existing SNPs from $SNPs_all file\n"
    date
    $kSNP/subset_SNPs_all3 "$SNPs_all"
    foreach mer_subset (`cat mer_list`)
	# Subsets with an empty or missing .SNPs_all file get no fasta query file.
	if (-s $mer_subset.SNPs_all) then
	    $kSNP/SNPs2fastaQuery3 $mer_subset.SNPs_all >! SNP_loci.$mer_subset.fasta
	endif
    end
else
    printf "Discovering new SNPs\n\n"
    date

    echo "Finding kmers with multiple allele variants"
    printf "" >! cmds_pick_snps
    foreach mer_subset (`cat mer_list`)
	echo "$kSNP/pick_snps_from_kmer_genome_counts3 $mer_subset > SNP_loci.$mer_subset.fasta" >> cmds_pick_snps
    end
    $kSNP/parallel_commands3 $num_cpus cmds_pick_snps
    echo "Finished finding kmers with multiple allele variants"
endif

 # Find which genome has which allele variant, by comparing the SNP_loci and Dir.$f/$subset.conflictsDeleted  foreach genome
date
echo "Finding allele in each genome"
# Queue one find_allele3 job per (genome, subset) pair, run them all in parallel, then
# concatenate the per-subset results of each genome into Dir.<fsplit>/SNPs.
printf "" >! cmds_find_allele
foreach genome_split (fsplit*)
    foreach mer_subset (`cat mer_list`)
	echo "$kSNP/find_allele3 SNP_loci.$mer_subset.fasta  Dir.$genome_split/$mer_subset.conflictsDeleted $genome_split > Dir.$genome_split/SNPs.$mer_subset" >> cmds_find_allele
    end
end
$kSNP/parallel_commands3 $num_cpus cmds_find_allele
foreach genome_split (fsplit*)
    cat Dir.$genome_split/SNPs.*.mers >! Dir.$genome_split/SNPs
end

# Run mummer to find the position of each SNP in the finished genomes. Do not do this for unassembled draft genomes or merged raw read genomes, since positional information is not informative.
# Genomes are looked up by an exact string match on the genome-name column.  A word-wise grep of
# the whole line also matched names that occur inside a file path or as a word within another
# name (K12 versus K12.v2), and it treated the name as a regular expression.

if (-s annotate_list) then
    echo "Finding SNP positions in finished genomes using mummer."
    date
    printf "" >! cmds_mummer
    printf "" >! cmds_parse_mummer
    foreach genome (`cat annotate_list`)
	# fsplit id of this genome; empty when the name is not one of the input genomes.
	set f=`awk -F'\011' -v g="$genome" '($2 "") == (g "") {print $1}' fileName2genomeName`
	if ($#f > 0 ) then
	    # Path of the genome fasta; surrounding blanks in the name column are ignored.
	    set file=`awk -F'\011' -v g="$genome" '{n = $2; gsub(/^[ \t]+|[ \t]+$/, "", n)} (n "") == (g "") {print $1}' fasta_list`
	    printf "genome: $genome  in Dir.$f\n"
	    # Turn the SNP table into fasta records named <column1>_<column2> for mummer.
	    awk -F'\011' '{print ">" $1 "_" $2 "\n" $3 }' Dir.$f/SNPs >! Dir.$f/SNPs.fasta
	    printf "$kSNP/mummer -maxmatch -l $k -b -c  Dir.$f/SNPs.fasta "'"'"$file"'"'" > Dir.$f/mummer.out\n" >> cmds_mummer
	    printf "$kSNP/parse_mummer4kSNP3  Dir.$f/mummer.out  > Dir.$f/SNP.positions\n" >> cmds_parse_mummer
	endif
    end
    # All mummer runs must finish before their outputs are parsed.
    $kSNP/parallel_commands3 $num_cpus cmds_mummer
    $kSNP/parallel_commands3 $num_cpus cmds_parse_mummer
    date
    echo "Finished finding SNP positions in finished genomes using mummer."
endif

# concatenate SNP files for each genome into one and sort it, and number the loci
# Result: the file SNPs_all, written by rename_from_table3 at the end of this section.
echo "Concatenate results for each genome and sort by locus to create  SNPs_all_labelLoci"
date
printf "" >! all_SNPs_unsorted
foreach f (fsplit*)
    # Only handle fsplit files that have an entry in the lookup table.
    set test=`grep -w  $f fileName2genomeName | wc -l`
    if ($test > 0 ) then
	set genome=`grep -w  $f fileName2genomeName | awk '{print $2}'`
	printf "genome: $genome  in Dir.$f\n"
	if (-s Dir.$f/SNP.positions) then
	    # Positions are known: columns 1-3 of SNP.positions, then the fsplit id, then column 4.
	    awk -F'\011' -v f=$f '{print $1 "\t" $2 "\t" $3  "\t" f "\t" $4}' Dir.$f/SNP.positions  >> all_SNPs_unsorted
	    #cat Dir.$f/SNP.positions >> all_SNPs_unsorted
	else
	    # No positions: x is written in the third column and the genome name in the fourth.
	    awk -v genome=$genome '{print  $1 "\t" $2 "\tx\t" genome "\t" }' Dir.$f/SNPs >> all_SNPs_unsorted
	endif
    endif
end
if (  $?SNPs_all ) then
    # use existing SNP numbering
    # Columns 2-7 of the supplied SNPs_all file are appended (its first column is dropped).
    awk -F'\011'  '{print $2 "\t" $3 "\t" $4  "\t" $5 "\t" $6 "\t" $7}' "$SNPs_all" >> all_SNPs_unsorted
endif
sort -u all_SNPs_unsorted >! all_SNPs_sorted
$kSNP/number_SNPs_all3 all_SNPs_sorted
# NOTE(review): rename_from_table3 presumably replaces the fsplit ids with genome names using
# fileName2genomeName -- confirm against the helper.
$kSNP/rename_from_table3 all_SNPs_sorted_labelLoci fileName2genomeName SNPs_all

# Set the reference genome for the vcf file to be the first finished genome; if that is empty, set it to be the first genome in the input fasta list.
# The fallback tests the value rather than whether the variable exists: a set from a command that
# printed nothing still defines the variable, so an existence test would never trigger it.
set ref_genome=""
if (-s annotate_list) then
    set ref_genome=`head -1 annotate_list`
endif
if ("$ref_genome" == "") then
    set ref_genome=`head -1 fileName2genomeName | awk '{print $2}'`
endif

if ($?vcf ) then
    $kSNP/parse_SNPs2VCF3 SNPs_all VCF.$ref_genome.vcf  $ref_genome
endif

echo "Finished finding SNPs"
date


# You can delete this Directory if everything works, but it is useful for debugging in case the run fails
# Start from an empty directory.  -f keeps rm quiet when no earlier run left one behind.
rm -rf TemporaryFilesToDelete
mkdir TemporaryFilesToDelete
mv -f Dir.* TemporaryFilesToDelete/.
# The mummer command files exist only when an annotate list was supplied.
if (-e cmds_mummer) then
    mv -f cmds_mummer TemporaryFilesToDelete/.
    mv -f cmds_parse_mummer TemporaryFilesToDelete/.
endif
mv -f  *.mers TemporaryFilesToDelete/.
mv -f Jelly.* TemporaryFilesToDelete/.
mv -f SNP_loci.*.mers.fasta TemporaryFilesToDelete/.
mv -f kmers*  TemporaryFilesToDelete/.
mv -f fsplit* TemporaryFilesToDelete/.
mv -f  all_SNPs_unsorted  TemporaryFilesToDelete/.
mv -f  all_SNPs_sorted* TemporaryFilesToDelete/.
mv -f mer_list TemporaryFilesToDelete/.
mv -f *.mers.SNPs_all TemporaryFilesToDelete/.


##probes_from_SNPs_all_kmers $probe_prefix_label

## Create a SNP matrix and fasta, for inputting to PHYLIP, FastTreeMP or other tools like SplitsTree
$kSNP/SNPs_all_2_fasta_matrix3 SNPs_all SNPs_all_matrix.fasta SNPs_all_matrix

# tree_list1 and tree_list2 name the trees that later steps iterate over; the parsimony tree is always built.
printf "parsimony\n" >! tree_list1
printf "parsimony\n" >! tree_list2

############### Make tree using SNP matrix
echo "Building parsimony tree"

# Build parsimony tree
$kSNP/parsimonator -s SNPs_all_matrix -n SNPs_all -N 100 -p 1234

# get all the best scoring trees
# The score is read from field 6 and the tree file name from field 14 of the Parsimony tree lines of the info file.
set best_parsimony_tree_score=`grep "Parsimony tree" RAxML_info.SNPs_all | sort -k6 -n | head -1 | awk '{print $6}'`
set best_parsimony_trees=`grep "Parsimony tree" RAxML_info.SNPs_all | awk -v score=$best_parsimony_tree_score '$6==score {print $14}'`
set Num_best_parsimony_trees=`grep "Parsimony tree" RAxML_info.SNPs_all | awk -v score=$best_parsimony_tree_score '$6==score {print $14}' | wc -l | awk '{print $1}'`
printf "Number of most parsimonious trees from SNPs_all: $Num_best_parsimony_trees\n"
printf "Score of those trees: $best_parsimony_tree_score\n"

# Standard input is tied to /dev/null so that cat cannot sit waiting on the terminal when no tree file was found.
cat $best_parsimony_trees < /dev/null >! intree

# Get majority consensus tree
# -f: the consense output files do not exist yet on the first pass.
rm -f outfile outtree
#PHYLIP consense was the only tool i found  that forced resolution of every branch. FastTree to give it branch lengths will crash if some nodes have splits to >2 children. But you need to modify seq.h and phylip.h before compiling consense to allow longer names so they do not get truncated
echo "Y\n" | $kSNP/consense

# Give it branch lengths, optimized for the consensus parsimony tree.
$kSNP/force_binary_tree outtree outtree.resolved
$kSNP/FastTreeMP -nt -pseudo   -nome -mllen -gamma -gtr -intree outtree.resolved SNPs_all_matrix.fasta >! tree.parsimony.tre
mv RAxML* TemporaryFilesToDelete/.

## Build parsimony tree from SNPs_in_majority"$min_fraction_with_locus"
# Runs only when -min_frac was given.  Same recipe as the tree from all SNPs, applied to the
# SNPs_in_majority<fraction> file, followed by placing those SNPs on the nodes of the new tree.
if ($?min_fraction_with_locus) then
    printf "Getting SNPs_in_majority$min_fraction_with_locus and building tree\n"
    $kSNP/core_SNPs3 SNPs_all fileName2genomeName $min_fraction_with_locus
    $kSNP/SNPs_all_2_fasta_matrix3 SNPs_in_majority"$min_fraction_with_locus"  SNPs_in_majority"$min_fraction_with_locus"_matrix.fasta SNPs_in_majority"$min_fraction_with_locus"_matrix

    # Build parsimony tree
    $kSNP/parsimonator -s SNPs_in_majority"$min_fraction_with_locus"_matrix -n SNPs_majority"$min_fraction_with_locus" -N 100 -p 1234

    # get all the best scoring trees
    set best_parsimony_tree_score=`grep "Parsimony tree" RAxML_info.SNPs_majority"$min_fraction_with_locus" | sort -k6 -n | head -1 | awk '{print $6}'`
    set best_parsimony_trees=`grep "Parsimony tree" RAxML_info.SNPs_majority"$min_fraction_with_locus" | awk -v score=$best_parsimony_tree_score '$6==score {print $14}'`
    set Num_best_parsimony_trees=`grep "Parsimony tree" RAxML_info.SNPs_majority"$min_fraction_with_locus" | awk -v score=$best_parsimony_tree_score '$6==score {print $14}' | wc -l | awk '{print $1}'`
    printf "Number of most parsimonious trees for SNPs_in_majority$min_fraction_with_locus : $Num_best_parsimony_trees\n"
    printf "Score of those trees: $best_parsimony_tree_score\n"

    # Standard input is tied to /dev/null so that cat cannot sit waiting on the terminal when no tree file was found.
    cat $best_parsimony_trees < /dev/null >! intree

    # Get majority consensus tree
    # -f: stay quiet if an earlier consense run left no output files behind.
    rm -f outfile outtree
    #Find consensus parsimony tree
    echo "Y\n" | $kSNP/consense

    # Give it branch lengths, optimized for the consensus parsimony tree.
    $kSNP/force_binary_tree outtree outtree.resolved
    $kSNP/FastTreeMP -nt -pseudo   -nome -mllen -gamma -gtr -intree outtree.resolved SNPs_in_majority"$min_fraction_with_locus"_matrix.fasta  >! tree.majority"$min_fraction_with_locus".tre
    mv RAxML* TemporaryFilesToDelete/.

    # Uncomment the following line to build ML majority tree, and write over the parsimony majority tree just built
    #$kSNP/FastTreeMP  -nt -pseudo  -gamma   -gtr SNPs_in_majority"$min_fraction_with_locus"_matrix.fasta  >!  tree.majority"$min_fraction_with_locus".tre

    # Single-pass loop; it only binds the tree label to t for the commands below.
    foreach t (  majority"$min_fraction_with_locus" )
	$kSNP/label_tree_nodes3 tree.$t.tre   > ! tree_nodeLabel.$t.tre
	$kSNP/tree_nodes3 tree_nodeLabel."$t".tre  nodes.$t
	if (-s tree_nodeLabel.$t.tre ) then
	    echo "Placing SNPs on nodes $t tree"
	    $kSNP/SNPs2nodes-new3 SNPs_in_majority"$min_fraction_with_locus"  nodes.$t.perlhash tree_nodeLabel.$t.tre  Node_SNP_counts.$t
	    # Tag the fixed-name report files with the tree label so the next tree does not overwrite them.
	    if (-e COUNT_Homoplastic_SNPs) then
		mv COUNT_Homoplastic_SNPs COUNT_Homoplastic_SNPs.$t
	    endif
	    if (-e ClusterInfo) then
		mv ClusterInfo ClusterInfo.$t
	    endif
	    if (-e Homoplasy_groups) then
		mv Homoplasy_groups Homoplasy_groups.$t
	    endif
	    date
	    echo "Finished placing SNPs on nodes $t tree"
	    # Table of per-tip SNP counts: the node lines that report exactly one target.
	    printf "name_on_tree\tSNP_counts\n" >! tip_SNP_counts.$t
	    grep "node: " Node_SNP_counts.$t | grep -w "NumberTargets: 1" | awk '{print $2 "\011" $6}' >> tip_SNP_counts.$t

	    # Prefer the rerooted tree when one was written.
	    if (-s tree_nodeLabel.$t.tre.rerooted) then
		rm -f tree_nodeLabel.$t.tre
		mv -f tree_nodeLabel.$t.tre.rerooted tree_nodeLabel.$t.tre
	    endif

	    #rm_node_names_from_tree tree_nodeLabel.$t.tre tree.$t.tre # do not overwrite tree.$t.tre anymore since we want the support values in original file.

	    $kSNP/labelTree_AlleleCount-new3  tree_nodeLabel.$t.tre Node_SNP_counts.$t tree_tipAlleleCounts.$t.tre tree_AlleleCounts.$t.tre 0
	    $kSNP/labelTree_AlleleCount-new3  tree_nodeLabel.$t.tre Node_SNP_counts.$t tree_tipAlleleCounts.$t.NodeLabel.tre tree_AlleleCounts.$t.NodeLabel.tre 1

	endif
    end
endif

##Building parsimony tree from only the core SNPs
# Runs only when -core was given.  core_SNPs3 has already been run when -min_frac was supplied;
# otherwise it is run here with a fraction of 0.5.
if ($?core) then
    printf "Getting core SNPs\n"
    if (! $?min_fraction_with_locus) then
	$kSNP/core_SNPs3 SNPs_all fileName2genomeName 0.5
    endif
    $kSNP/SNPs_all_2_fasta_matrix3 core_SNPs core_SNPs_matrix.fasta core_SNPs_matrix

    # Build parsimony tree
    $kSNP/parsimonator -s core_SNPs_matrix  -n SNPs_core -N 100 -p 1234

    # get all the best scoring trees
    set best_parsimony_tree_score=`grep "Parsimony tree" RAxML_info.SNPs_core | sort -k6 -n | head -1 | awk '{print $6}'`
    set best_parsimony_trees=`grep "Parsimony tree" RAxML_info.SNPs_core | awk -v score=$best_parsimony_tree_score '$6==score {print $14}'`

    set Num_best_parsimony_trees=`grep "Parsimony tree" RAxML_info.SNPs_core | awk -v score=$best_parsimony_tree_score '$6==score {print $14}' | wc -l | awk '{print $1}'`

    printf "Number of most parsimonious trees for SNPs_core : $Num_best_parsimony_trees\n"
    printf "Score of those trees: $best_parsimony_tree_score\n"

    # Standard input is tied to /dev/null so that cat cannot sit waiting on the terminal when no tree file was found.
    cat $best_parsimony_trees < /dev/null >! intree

    # Get majority consensus tree
    # -f: stay quiet if an earlier consense run left no output files behind.
    rm -f outfile outtree
    #Find consensus parsimony tree
    echo "Y\n" | $kSNP/consense

    # Give it branch lengths, optimized for the consensus parsimony tree.
    $kSNP/force_binary_tree outtree outtree.resolved
    $kSNP/FastTreeMP -nt -pseudo   -nome -mllen -gamma -gtr -intree outtree.resolved core_SNPs_matrix.fasta  >! tree.core.tre
    mv RAxML* TemporaryFilesToDelete/.

    # Uncomment the following line to build ML core tree, and write over the parsimony core tree just built
    #  $kSNP/FastTreeMP -nt  -gamma   -gtr core_SNPs_matrix.fasta  >!  tree.core.tre

    # SNPs are placed on the tree only when there is at least one core SNP.
    if (-s core_SNPs) then
	# Single-pass loop; it only binds the tree label to t for the commands below.
	foreach t (  core )
	    $kSNP/label_tree_nodes3 tree.$t.tre   > ! tree_nodeLabel.$t.tre
	    $kSNP/tree_nodes3 tree_nodeLabel."$t".tre  nodes.$t
	    if (-s tree_nodeLabel.$t.tre ) then
		echo "Placing SNPs on nodes $t tree"
		$kSNP/SNPs2nodes-new3 core_SNPs nodes.$t.perlhash tree_nodeLabel.$t.tre  Node_SNP_counts.$t
		# Tag the fixed-name report files with the tree label so the next tree does not overwrite them.
		if (-e COUNT_Homoplastic_SNPs) then
		    mv COUNT_Homoplastic_SNPs COUNT_Homoplastic_SNPs.$t
		endif
		if (-e ClusterInfo) then
		    mv ClusterInfo ClusterInfo.$t
		endif
		if (-e Homoplasy_groups) then
		    mv Homoplasy_groups Homoplasy_groups.$t
		endif
		date
		echo "Finished placing SNPs on nodes $t tree"
		echo ""
		# Table of per-tip SNP counts: the node lines that report exactly one target.
		printf "name_on_tree\tSNP_counts\n" >! tip_SNP_counts.$t
		grep "node: " Node_SNP_counts.$t | grep -w "NumberTargets: 1" | awk '{print $2 "\011" $6}' >> tip_SNP_counts.$t

		# Prefer the rerooted tree when one was written.
		if (-s tree_nodeLabel.$t.tre.rerooted) then
		    rm -f tree_nodeLabel.$t.tre
		    mv -f tree_nodeLabel.$t.tre.rerooted tree_nodeLabel.$t.tre
		endif

		#rm_node_names_from_tree tree_nodeLabel.$t.tre tree.$t.tre # do not overwrite tree.$t.tre anymore since we want the support values in original file.

		$kSNP/labelTree_AlleleCount-new3  tree_nodeLabel.$t.tre Node_SNP_counts.$t tree_tipAlleleCounts.$t.tre tree_AlleleCounts.$t.tre 0
		$kSNP/labelTree_AlleleCount-new3  tree_nodeLabel.$t.tre Node_SNP_counts.$t tree_tipAlleleCounts.$t.NodeLabel.tre tree_AlleleCounts.$t.NodeLabel.tre 1

	    endif
	end
    endif
endif

## Optional trees from all SNPs.  Each tree that gets built is also registered in both
## tree lists so that the later node-labelling steps pick it up.

# Maximum likelihood tree with FastTree (-ML)
if ($?ML) then
    $kSNP/FastTreeMP  -nt -pseudo  -gamma -gtr SNPs_all_matrix.fasta >!  tree.ML.tre
    foreach tree_list (tree_list1 tree_list2)
	printf "ML\n" >> $tree_list
    end
endif


# Neighbor joining tree (-NJ)
if ($?nj) then
    echo "Building NJ tree"
    date
    # NOTE:  This next line can take a long time if there are million+ SNP loci and 100+ genomes. SNP_matrix2dist_matrix does loops, so it is slow, should be parallelized.  Probably should try the PHYLIP program, although scores might be different since i count them as somewhat closer if they share a locus but not the allele than if they do not even share the locus. But since NJ SNP trees are not accurate anyway, i am not inclined to spend anymore time since no one should use this option.
    $kSNP/SNP_matrix2dist_matrix3 SNPs_all_matrix >! NJ.dist.matrix
    $kSNP/distance_tree3 >! tree.NJ.tre
    echo "Finished building NJ tree"
    foreach tree_list (tree_list1 tree_list2)
	printf "NJ\n" >> $tree_list
    end
    date
endif

#######################

$kSNP/find_unresolved_clusters3 tree.parsimony.tre >! unresolved_clusters

date
echo "Finding nodes"

# For every tree named in tree_list1 that was actually written: label its nodes, list them,
# and place the SNPs from SNPs_all on them.
foreach tree_kind ( `cat tree_list1` )
    if (-s tree."$tree_kind".tre) then
	$kSNP/label_tree_nodes3 tree.$tree_kind.tre   > ! tree_nodeLabel.$tree_kind.tre
	$kSNP/tree_nodes3 tree_nodeLabel."$tree_kind".tre  nodes.$tree_kind
	echo "Placing SNPs on nodes $tree_kind tree"
	$kSNP/SNPs2nodes-new3 SNPs_all nodes.$tree_kind.perlhash tree_nodeLabel.$tree_kind.tre  Node_SNP_counts.$tree_kind
	# These report files have fixed names; tag each with the tree label so the next
	# tree in the list does not overwrite it.
	foreach report (COUNT_Homoplastic_SNPs ClusterInfo Homoplasy_groups)
	    if (-e $report) mv $report $report.$tree_kind
	end
	date
	echo "Finished placing SNPs on nodes $tree_kind tree"
	echo ""
    endif
end


# Relabel trees with SNP counts at nodes
foreach tree_kind (  `cat tree_list1` )
    if (-s tree."$tree_kind".tre) then

	# Per-tip SNP counts: a header line, then the node lines that report exactly one target.
	printf "name_on_tree\tSNP_counts\n" >! tip_SNP_counts.$tree_kind
	grep "node: " Node_SNP_counts.$tree_kind | grep -w "NumberTargets: 1" | awk '{print $2 "\011" $6}' >> tip_SNP_counts.$tree_kind

	# When a rerooted tree was written, it replaces the node-labelled tree.
	if (-s tree_nodeLabel.$tree_kind.tre.rerooted) then
	    rm -f tree_nodeLabel.$tree_kind.tre
	    mv -f tree_nodeLabel.$tree_kind.tre.rerooted tree_nodeLabel.$tree_kind.tre
	endif

	#rm_node_names_from_tree tree_nodeLabel.$t.tre tree.$t.tre # do not overwrite tree.$t.tre anymore since we want the support values in original file.

	# Two labelled versions are written: last argument 0 and last argument 1 (the NodeLabel files).
	$kSNP/labelTree_AlleleCount-new3  tree_nodeLabel.$tree_kind.tre Node_SNP_counts.$tree_kind tree_tipAlleleCounts.$tree_kind.tre tree_AlleleCounts.$tree_kind.tre 0
	$kSNP/labelTree_AlleleCount-new3  tree_nodeLabel.$tree_kind.tre Node_SNP_counts.$tree_kind tree_tipAlleleCounts.$tree_kind.NodeLabel.tre tree_AlleleCounts.$tree_kind.NodeLabel.tre 1
    endif
end

# Intermediate node files go to the scratch directory.
mv -f nodes.* TemporaryFilesToDelete/.
mv -f tree_tipAlleleCounts.*.NodeLabel.tre TemporaryFilesToDelete/.
mv -f tree_nodeLabel.* TemporaryFilesToDelete/.

########
# find proteins where SNPs land, codons, amino acids, and identify nonsynonymous SNPs
echo "Annotating SNPs."
date

# Only get genbank file and annotate if there is positional information for some genomes, ie. annotate_list is not empty
if (-s annotate_list) then

    # Get whole genome annotations from genbank, unfortunately you have to get the whole genbank file with sequence data, since the much smaller feature table does not have mature peptides making viral annotation useless with polyproteins only.
    set count=0
    printf "" >! headers.annotate_list
    foreach genome (`cat annotate_list`)
	# Look the genome up by an exact string match on the name column (surrounding blanks ignored).
	# A word-wise grep of the whole line also matched names that occur inside a file path or
	# inside another genome name, which could return several paths at once.
	set file=`awk -F'\011' -v g="$genome" '{n = $2; gsub(/^[ \t]+|[ \t]+$/, "", n)} (n "") == (g "") {print $1}' fasta_list`
	if ($#file > 0 ) then
	    # The path is a printf argument, not the format, so a percent sign in it is harmless.
	    printf "%s\n" "$file"
	    $kSNP/get_genbank_file3 "$file" genbank_from_NCBI.gbk.$count
	    # Record the fasta headers of this genome, each prefixed with the genome name.
	    fgrep ">" "$file" | sed -e "s/^>/>$genome /" >> headers.annotate_list
	    @ count ++
	endif
    end
    cat genbank_from_NCBI.gbk.* | grep -v BioProject  >! genbank_from_NCBI.gbk
    rm genbank_from_NCBI.gbk.*

    # The user genbank file is passed quoted, matching the quoted existence test, so a path with blanks stays one argument.
    if (-e "$genbankFile" ) then
	$kSNP/annotate_SNPs_from_genbankFiles3   -all $all_annotations  "$genbankFile"
    else
	$kSNP/annotate_SNPs_from_genbankFiles3   -all $all_annotations
    endif

    # One-line summary: distinct locus numbers (column 1 of SNP_annotations) per category.
    printf "Num_NotAnnotatedRegion\tAnnotatedNotProtein\tNum_NonSynon\tNum_Synon\tNS/S\tNSfractionOfAnnotated\tNumLoci\tNum_InAnnotatedGenomes\tNum_NotInAnnotatedGenome\n" >! Annotation_summary
    set i=SNP_annotations
    set num_notInAnnotatedGenome=`grep  NotInAnnotatedGenome $i |  awk '  {print $1}' | sort -u | wc -l | awk '{print $1}'`
    set num_UnAnnRegion=`grep  UnannotatedRegion $i |  awk '  {print $1}' | sort -u | wc -l | awk '{print $1}'`
    set num_AnnNotProtein=`grep  NotProteinCoding $i |  awk '  {print $1}' | sort -u | wc -l | awk '{print $1}'`

    # NS_total counts the loci with a positive third column; S_total is the rest of the loci found in annotated genomes.
    set NS_total=`grep -v LocusNum $i |  awk ' $3>0 {print $1}' | sort -u | wc -l | awk '{print $1}'`
    set Num_loci=`grep -v LocusNum $i |  awk '{print $1}' | sort -u | wc -l | awk '{print $1}'`
    set Num_loci_in_annotated=`grep -v LocusNum $i | grep -v  NotInAnnotatedGenome |  awk '{print $1}' | sort -u | wc -l | awk '{print $1}'`
    set S_total=`perl -e "print ($Num_loci_in_annotated-$NS_total)"`
    # Both ratios are reported as inf when their denominator is not positive.
    if ($S_total > 0) then
	set NS_Sratio=`perl -e "print $NS_total/$S_total"`
    else
	set NS_Sratio="inf"
    endif
    if ($Num_loci_in_annotated > 0) then
	set NSfraction_overall=`perl -e "print $NS_total/$Num_loci_in_annotated"`
    else
	set NSfraction_overall="inf"
    endif

    printf "$num_UnAnnRegion\t$num_AnnNotProtein\t$NS_total\t$S_total\t$NS_Sratio\t$NSfraction_overall\t$Num_loci\t$Num_loci_in_annotated\t$num_notInAnnotatedGenome\n"  >> Annotation_summary


    $kSNP/parse_protein_annotation_counts3 SNP_annotations >!  Protein_Annotation_counts

    echo "Finished SNP annotation."
endif

echo "Finished running kSNP"
date
# Elapsed wall-clock time in hours, from the epoch seconds recorded at start-up.
set stop_seconds=`date +%s`
set hours_taken=`perl -e "print (($stop_seconds-$startseconds)/60/60)"`
echo "Elapsed time for kSNP in hours: $hours_taken"


# Sweep the remaining bookkeeping files into the scratch directory.
mv cmds* TemporaryFilesToDelete/.
foreach tree_list (tree_list1 tree_list2)
    mv $tree_list TemporaryFilesToDelete/.
end
mv -f fileName2genomeName TemporaryFilesToDelete/.
rm intree outtree outfile

# Delete the scratch directory only when the key outputs all exist and are non-empty and
# DEBUG is below 1; otherwise it is kept for troubleshooting.
if (-s SNPs_all && -s tree.parsimony.tre && -s tree_AlleleCounts.parsimony.tre && -s unresolved_clusters && -s COUNT_SNPs && $DEBUG<1) then
    rm -r TemporaryFilesToDelete
endif

exit

##########################################################################################################
##########################################################################################################

# HRE finder is not updated to work with kSNP3.  Use kSNP2 if you want to use HREfinder.
#  In case you want to run HREFinder  at http://sourceforge.net/projects/hrefinder/
# set your path to the hreFinder code  "set hre=/path/to/hreFinder"
# and comment out the exit line above.
# YOU MUST HAVE ALL THE GENOMES IN THE -p annotate_list LIST. FOR HREFINDER YOU NEED POSITIONAL
# INFORMATION FOR ALL OF THEM, EVEN THE DRAFT GENOMES THAT ARE ASSEMBLED INTO A FEW LARGE CONTIGS.
# If some draft genomes are in a lot of contigs, it is recommended that
# you remove those and rerun kSNP before attempting hreFinder.
# Don't run hreFinder with genomes that are raw unassembled reads.


###### Run hreFinder to predict series of SNPs likely to have been involved in homologous recombination events

# NOTE: this section is never reached as the script stands, because an unconditional exit
# precedes it.  It is kept as a template for running hreFinder.
# Location of the hreFinder code.
set hre=/usr/gapps/kpath/hreFinder


if (-s SNPs_all) then

# Set reference genome for vcf file to the be first finished genome, if this is empty, then set it to be the first genome in the input fasta file.
if (-s annotate_list) then
    set ref_genome=`head -1 annotate_list`
endif
if !($?ref_genome) then
    set ref_genome=`head -1 fileName2genomeName | awk '{print $2}'`
endif


# One HRE.<tree> working directory per tree named in tree_list2.
foreach tree (`cat tree_list2`)
mkdir  HRE.$tree
cd HRE.$tree
$hre/run_config.py ../tree.$tree.tre    ../fastainput ../SNPs_all $ref_genome

# Summarise the hreFinder output files hreSNPs and hre_from_to_c.
echo ""
echo $tree
echo "Number of SNPs involved in HRE events:"
awk '$1!="" {print $1}' hreSNPs | sort -u | wc -l
echo "Number of HRE events:"
# Sums column 5 over all lines except the HRE_events header line.
grep -v HRE_events hre_from_to_c | awk ' total=total+$5 {} END {print total}'
echo "Number of HRE events from outside tree:"
grep -v HRE_events hre_from_to_c | grep outside | awk ' total=total+$5 {} END {print total}'


cd ..
end

endif

exit


# Set up standing db of genbank files so you don't have to go online to annotate SNPs
# NOTE: this section is never reached as the script stands, because it follows an unconditional exit.
# It builds GenbankFiles/<domain>/ from the NCBI genomes FTP archives.
mkdir GenbankFiles
cd GenbankFiles
foreach domain ( Viruses Bacteria )
mkdir $domain
cd $domain
foreach type (gbk)
# Download and unpack in a scratch Temp directory, then lift the files one level up.
mkdir Temp
cd Temp
wget "ftp://ftp.ncbi.nih.gov/genomes/$domain/all.$type.tar.gz"
tar -xvzf all.$type.tar.gz
rm *.tar.gz
mv */*  ..
# Still inside Temp here: this clears the now empty unpacked directories.
rm -r *
cd ..
rm -r Temp
end
cd ..
end

# get gbk files for plasmids
foreach domain ( Plasmids)
mkdir $domain
cd $domain
foreach type (gbk)
wget "ftp://ftp.ncbi.nih.gov/genomes/$domain/plasmids.all.$type.tar.gz"
tar -xvzf plasmids.all.$type.tar.gz
# NOTE(review): assumes the archive unpacks under am/ftp-genomes/Plasmids/<type> -- confirm against a current archive.
mv am/ftp-genomes/Plasmids/$type/* .
rm -r am
rm *.tar.gz
end
cd ..
end