jelber2/pilon-runs-1-2.sh

## pilon-runs-1-2.sh
#! /bin/bash

set -e

# installing fasta-splitter.pl
## wget http://kirill-kryukov.com/study/tools/fasta-splitter/files/fasta-splitter-0.2.6.zip
## unzip fasta-splitter-0.2.6.zip

# assumes initial genome to be error-corrected by pilon is called
## genome.pilon-0.fasta

# set variables
# RAM in gigabytes
RAM=350
interleavedReadsPath=/genetics/pacbio/pilon-vs-racon/chr20.pbjelly.fastq.gz
workDir=/genetics/pacbio/pilon-vs-racon/pilon
fastaSplitterPath=/genetics/pacbio/fasta-splitter.pl
samtoolsPath=samtools
bwaPath=bwa
pilonPath=/opt/pilon/pilon-1.22.jar
seqtkPath=/opt/seqtk/seqtk
# number of cores on computer
numThreads=75

# start of script
x=0
while [ $x -lt 2 ]
do
  y=$(($x+1))

  ### a get the contig lengths with ${samtoolsPath} faidx
  cd ${workDir}
  ${samtoolsPath} faidx genome.pilon-${x}.fasta

  ### b split genome into 39 parts with fasta-splitter.pl
  perl ${fastaSplitterPath} --n-parts 39 genome.pilon-${x}.fasta --out-dir pilon-${y}/
  cd ${workDir}/pilon-${y}

  ### c create a sequence from 01,02,...,39
  seq -w 1 39 > samples

  ### d get the contig names
  while read i;do grep ">" genome.pilon-${x}.part-${i}.fasta|perl -pe "s/>//g" > targetlist${i}; done < samples

  ### e get the contig lengths
  cut -f 1-2 ../genome.pilon-${x}.fasta.fai > pilon.chromosome.lengths.txt

  ### f get the names of the contigs that are in split files where there are less than 10 contigs
  cp samples samples-to-split

  ### g write script to make target lists for pilon
  ###   The script below breaks up the task into ${numThreads} pieces
  while read i
  do
    lines="$(wc -l targetlist${i}|cut -d " " -f 1)"
    while read line;
    do
      chrlength="$(grep -P "^$line\t" pilon.chromosome.lengths.txt | cut -f 2)"
      numberofsplits="$(perl -w -e "use POSIX; print ceil(${numThreads}/$lines)")"
      numberofsplits2=$(($numberofsplits - 1))
      splitlength="$(perl -w -e "use POSIX; print ceil($chrlength/$numberofsplits)")"
      splits=1
      while [[ $splits -lt $numberofsplits ]]
      do
        if [[ $splits -eq 1 ]]
        then
          splitlength3="$(($splitlength + 1))"
          stopbase=$(($splitlength * ($splits + 1)))
          echo "${line}:1-${splitlength}" >> targetlist-${i}
          echo "${line}:${splitlength3}-${stopbase}" >> targetlist-${i}
          ((splits++))
        elif [[ $splits -eq $numberofsplits2 ]]
        then
          startbase=$((($splitlength * ($splits)) + 1))
          echo "${line}:${startbase}-${chrlength}" >> targetlist-${i}
          ((splits++))
        else
          stopbase=$(($splitlength * ($splits + 1)))
          startbase=$((($splitlength * $splits) + 1))
          echo "${line}:${startbase}-${stopbase}" >> targetlist-${i}
          ((splits++))
        fi
      done
    done < targetlist${i}
  done < samples-to-split

  ### h rename targetlist-01 to targetlist01
  while read i;do
  mv -f targetlist-${i} targetlist${i}
  done < samples-to-split

  ### i make a list of the targetlists
  while read i;do
  ls targetlist${i} >> targetlists
  done < samples

  ### j bwa index, bwa mem, sam2bam, samtools sort, samtools index
  cd ${workDir}
  ${bwaPath} index -a bwtsw genome.pilon-${x}.fasta > bwa-index.log 2>&1
  ${bwaPath} mem -p -M -t ${numThreads} genome.pilon-${x}.fasta \
  ${interleavedReadsPath} 2> bwa.log |\
  ${samtoolsPath} view -@${numThreads} -F 4 -Sb - |${samtoolsPath} sort -@${numThreads} - > reads-mapped-to-genome.pilon-${x}.fasta.bam
  ${samtoolsPath} index -@${numThreads} reads-mapped-to-genome.pilon-${x}.fasta.bam

  ### k run pilon (takes about 1-2 days)
  cd ${workDir}/pilon-${y}
  while read i;do
  java -Xmx${RAM}g -jar ${pilonPath} \
  --genome ../genome.pilon-${x}.fasta \
  --frags ../reads-mapped-to-genome.pilon-${x}.fasta.bam \
  --output ${i}-pilon \
  --targets ${i} \
  --diploid \
  --changes \
  --fix bases --threads ${numThreads} > ${i}-pilon.log 2>&1
  done < targetlists

  ### l combine output files
  #### i get rid of extra headers in each output file
  while read i;do
  ${seqtkPath} seq -l0 targetlist${i}-pilon.fasta | awk '!seen[$1]++'|${seqtkPath} seq -l80 > targetlist${i}-pilon.fasta2
  done < samples

  #### ii combine the output files
  cat $(find ./ -name "targetlist*-pilon.fasta2" | sort -V) > pilon.fasta

  #### iii change output file name and remove "_pilon" from end of contig/scaffold names
  ${seqtkPath} randbase pilon.fasta | ${seqtkPath} seq -U > ../genome.pilon-${y}.fasta
  perl -pi -e "s/_pilon//g" ../genome.pilon-${y}.fasta

  ### m increase loop value
  ((x++))
done
	#! /bin/bash

	set -e

	# installing fasta-splitter.pl
	## wget http://kirill-kryukov.com/study/tools/fasta-splitter/files/fasta-splitter-0.2.6.zip
	## unzip fasta-splitter-0.2.6.zip

	# assumes initial genome to be error-corrected by pilon is called
	## genome.pilon-0.fasta

	# set variables
	# RAM in gigabytes
	RAM=350
	interleavedReadsPath=/genetics/pacbio/pilon-vs-racon/chr20.pbjelly.fastq.gz
	workDir=/genetics/pacbio/pilon-vs-racon/pilon
	fastaSplitterPath=/genetics/pacbio/fasta-splitter.pl
	samtoolsPath=samtools
	bwaPath=bwa
	pilonPath=/opt/pilon/pilon-1.22.jar
	seqtkPath=/opt/seqtk/seqtk
	# number of cores on computer
	numThreads=75

	# start of script
	x=0
	while [ $x -lt 2 ]
	do
	y=$(($x+1))

	### a get the contig lengths with ${samtoolsPath} faidx
	cd ${workDir}
	${samtoolsPath} faidx genome.pilon-${x}.fasta

	### b split genome into 39 parts with fasta-splitter.pl
	perl ${fastaSplitterPath} --n-parts 39 genome.pilon-${x}.fasta --out-dir pilon-${y}/
	cd ${workDir}/pilon-${y}

	### c create a sequence from 01,02,...,39
	seq -w 1 39 > samples

	### d get the contig names
	while read i;do grep ">" genome.pilon-${x}.part-${i}.fasta\|perl -pe "s/>//g" > targetlist${i}; done < samples

	### e get the contig lengths
	cut -f 1-2 ../genome.pilon-${x}.fasta.fai > pilon.chromosome.lengths.txt

	### f get the names of the contigs that are in split files where there are less than 10 contigs
	cp samples samples-to-split

	### g write script to make target lists for pilon
	### The script below breaks up the task into ${numThreads} pieces
	while read i
	do
	lines="$(wc -l targetlist${i}\|cut -d " " -f 1)"
	while read line;
	do
	chrlength="$(grep -P "^$line\t" pilon.chromosome.lengths.txt \| cut -f 2)"
	numberofsplits="$(perl -w -e "use POSIX; print ceil(${numThreads}/$lines)")"
	numberofsplits2=$(($numberofsplits - 1))
	splitlength="$(perl -w -e "use POSIX; print ceil($chrlength/$numberofsplits)")"
	splits=1
	while [[ $splits -lt $numberofsplits ]]
	do
	if [[ $splits -eq 1 ]]
	then
	splitlength3="$(($splitlength + 1))"
	stopbase=$(($splitlength * ($splits + 1)))
	echo "${line}:1-${splitlength}" >> targetlist-${i}
	echo "${line}:${splitlength3}-${stopbase}" >> targetlist-${i}
	((splits++))
	elif [[ $splits -eq $numberofsplits2 ]]
	then
	startbase=$((($splitlength * ($splits)) + 1))
	echo "${line}:${startbase}-${chrlength}" >> targetlist-${i}
	((splits++))
	else
	stopbase=$(($splitlength * ($splits + 1)))
	startbase=$((($splitlength * $splits) + 1))
	echo "${line}:${startbase}-${stopbase}" >> targetlist-${i}
	((splits++))
	fi
	done
	done < targetlist${i}
	done < samples-to-split

	### h rename targetlist-01 to targetlist01
	while read i;do
	mv -f targetlist-${i} targetlist${i}
	done < samples-to-split

	### i make a list of the targetlists
	while read i;do
	ls targetlist${i} >> targetlists
	done < samples

	### j bwa index, bwa mem, sam2bam, samtools sort, samtools index
	cd ${workDir}
	${bwaPath} index -a bwtsw genome.pilon-${x}.fasta > bwa-index.log 2>&1
	${bwaPath} mem -p -M -t ${numThreads} genome.pilon-${x}.fasta \
	${interleavedReadsPath} 2> bwa.log \|\
	${samtoolsPath} view -@${numThreads} -F 4 -Sb - \|${samtoolsPath} sort -@${numThreads} - > reads-mapped-to-genome.pilon-${x}.fasta.bam
	${samtoolsPath} index -@${numThreads} reads-mapped-to-genome.pilon-${x}.fasta.bam

	### k run pilon (takes about 1-2 days)
	cd ${workDir}/pilon-${y}
	while read i;do
	java -Xmx${RAM}g -jar ${pilonPath} \
	--genome ../genome.pilon-${x}.fasta \
	--frags ../reads-mapped-to-genome.pilon-${x}.fasta.bam \
	--output ${i}-pilon \
	--targets ${i} \
	--diploid \
	--changes \
	--fix bases --threads ${numThreads} > ${i}-pilon.log 2>&1
	done < targetlists

	### l combine output files
	#### i get rid of extra headers in each output file
	while read i;do
	${seqtkPath} seq -l0 targetlist${i}-pilon.fasta \| awk '!seen[$1]++'\|${seqtkPath} seq -l80 > targetlist${i}-pilon.fasta2
	done < samples

	#### ii combine the output files
	cat $(find ./ -name "targetlist*-pilon.fasta2" \| sort -V) > pilon.fasta

	#### iii change output file name and remove "_pilon" from end of contig/scaffold names
	${seqtkPath} randbase pilon.fasta \| ${seqtkPath} seq -U > ../genome.pilon-${y}.fasta
	perl -pi -e "s/_pilon//g" ../genome.pilon-${y}.fasta

	### m increase loop value
	((x++))
	done