jelber2/Running-PBJelly-example-with-indel-correction-with-BBMap-no-Pilon.txt

## Running-PBJelly-example-with-indel-correction-with-BBMap-no-Pilon.txt
# goes along with http://seqanswers.com/forums/showthread.php?p=220925#post220925
#
# assumes you have PBJelly, blasr, tabix, bcftools, samtools installed
# below I am using a machine with 70 cores on a single node, adjust to the number of cores to your machine
# The scripts below are obviously not designed for use with a cluster, but can be modified
#
#########################
# STEP 1 Combine the FASTQ files and remove the originals to save space
#########################
## first combine files and delete the originals to save space
WorkDir=/genetics/elbers/pacbio

cd ${WorkDir}
cat ?.subreads.fastq > pacbio-genome-pbjelly.fastq
rm ?.subreads.fastq

## second add fake quality values (from Q0=! to Q30=> in Sanger encoding) for pbjelly to work
cat pacbio-genome-pbjelly.fastq | tr "!" ">" > pacbio-genome-pbjelly-fqual.fastq

#########################
# STEP 2 Create pbjelly-config-assemble.xml
#########################
## use nano to create the file then paste everything between the lines starting with #### and ending with ####
nano pbjelly-config-assemble.xml
####pbjelly-config-assemble.xml####
<jellyProtocol>
    <reference>/genetics/pacbio/genome.fasta</reference>
    <outputDir>/genetics/pacbio/</outputDir>
    <cluster>
        <command notes="For single node, multi-core machines" >${CMD} ${JOBNAME} 2> ${STDERR} 1> ${STDOUT} &amp;</command>
        <nJobs>70</nJobs>
    </cluster>
    <blasr>-minMatch 8 -minPctIdentity 70 -bestn 1 -nCandidates 10 -maxScore -500 -nproc 1 -noSplitSubreads</blasr>
    <input baseDir="/genetics/pacbio/">
        <job>pacbio-genome-pbjelly-fqual.fastq</job>
    </input>
</jellyProtocol>
####pbjelly-config-assemble.xml####

#########################
# STEP 3 Create fake qualities for reference to fill in gaps for
#########################
/opt/PBSuite_15.8.24/bin/fakeQuals.py genome.fasta genome.qual

#########################
# STEP 4 Summarize the assembly and reads
#########################
## first Summarize the assembly
/opt/PBSuite_15.8.24/bin/summarizeAssembly.py genome.fasta > genome.fasta.summary

## second Summarize the reads
/opt/PBSuite_15.8.24/bin/readSummary.py pbjelly-config-assemble.xml > pacbio-genome-pbjelly-fqual.fastq.summary

#########################
# STEP 5 Setup pbjelly
#########################
## note - this step takes about 1 hour
## note2 - pbjelly steps are submitted in the background, but even though it says step is
## finished, it might not be, you need to see if there are still pbjelly processes running
## for example "ps aux|grep -v "root"|less -S" to see if you still see PBJellySuite or
## blasr or Setup.py or Extraction.py or Support.py or Assemble.py or Out.py
/opt/PBSuite_15.8.24/bin/Jelly.py setup pbjelly-config-assemble.xml

#########################
# STEP 6 Map FASTQ reads to genome.fasta reference
#########################
/opt/PBSuite_15.8.24/bin/Jelly.py mapping pbjelly-config-mapping.xml

#########################
# STEP 7 Summarize mapping results
#########################
/opt/PBSuite_15.8.24/bin/Jelly.py support pbjelly-config-assemble.xml -x "--minMapq=50"

#########################
# STEP 8 Extract reads that appear to bridge gaps
#########################
/opt/PBSuite_15.8.24/bin/Jelly.py extraction pbjelly-config-assemble.xml

#########################
# STEP 9 Assemble reads to fill in gaps
#########################
/opt/PBSuite_15.8.24/bin/Jelly.py assembly pbjelly-config-assemble.xml

#########################
# STEP 10 Produce output with gaps filled in
#########################
/opt/PBSuite_15.8.24/bin/Jelly.py output pbjelly-config-assemble.xml

#########################
# STEP 11 Convert het bases to hom and convert lower to uppercase
#########################
## see https://github.com/lh3/seqtk for a link to download seqtk
/opt/seqtk/seqtk randbase jelly.out.fasta |/opt/seqtk/seqtk seq -U > ../jelly.out.upper.fasta

#########################
# STEP 12 Run a variant caller or use Pilon (twice)
# to correct indels in jelly.out.fasta PacBio filled in Gaps using Illumina reads
#########################
## personally, whether you use Pilon or a Variant Caller (such as callvariants.sh from BBTools), I found that mapping with BBMap with the vslow=t option had better indel correction for simulated data
## for real data, using the steps below outperformed Pilon in terms of better RNA-Seq mapping rates
##
## presumably you can use your 10x reads to correct indels?
## calling this paired-end read file as 10x-illumina-quality-controlled-and-trimmed-reads.fq.gz in steps below

#########################
# STEP 13 Error correct with BBMap once
#########################
cd /genetics/user
# download latest version of BBTools
wget https://sourceforge.net/projects/bbmap/files/BBMap_38.25.tar.gz
tar xzf BBMap_38.25.tar.gz
cd bbmap
# compile jni components
cd jni
# add to ~/.bashrc     export JAVA_HOME="/usr/lib/jvm/java-1.8.0-openjdk-amd64" or where ever your jvm machine is installed
# then source ~/.bashrc
make -f makefile.linux

cd ${WorkDir}

# map reads with bbmap 38.25 in vslow mode using jni
/genetics/user/bbmap/bbmap.sh -Xmx350g -eoom usejni=t pigz=t threads=75 vslow=t ref=jelly.out.upper.fasta in=10x-illumina-quality-controlled-and-trimmed-reads.fq.gz \
out=10x-illumina-quality-controlled-and-trimmed-reads-mapped-to-jelly.out.upper.fasta-using-bbmap.bam overwrite=t 2> bbmap.log &

# callvariants 38.25
/genetics/user/bbmap/callvariants.sh -Xmx350g threads=75 ref=jelly.out.upper.fasta in=10x-illumina-quality-controlled-and-trimmed-reads-mapped-to-jelly.out.upper.fasta-using-bbmap.sorted.bam \
ploidy=2 vcf=10x-illumina-quality-controlled-and-trimmed-reads-mapped-to-jelly.out.upper.fasta-using-bbmap.vcf minscore=20.0 overwrite=t 2> callvariants.log &

# use bgzip and tabix to compress then index vcf file
bgzip -f 10x-illumina-quality-controlled-and-trimmed-reads-mapped-to-jelly.out.upper.fasta-using-bbmap.vcf
tabix -p vcf 10x-illumina-quality-controlled-and-trimmed-reads-mapped-to-jelly.out.upper.fasta-using-bbmap.vcf.gz

# use bcftools to get consensus sequence
bcftools consensus -f jelly.out.upper.fasta 10x-illumina-quality-controlled-and-trimmed-reads-mapped-to-jelly.out.upper.fasta-using-bbmap.vcf.gz -o jelly.out.upper.bbmap-1.fasta 2> bcftools.log &
	# goes along with http://seqanswers.com/forums/showthread.php?p=220925#post220925
	#
	# assumes you have PBJelly, blasr, tabix, bcftools, samtools installed
	# below I am using a machine with 70 cores on a single node, adjust to the number of cores to your machine
	# The scripts below are obviously not designed for use with a cluster, but can be modified
	#
	#########################
	# STEP 1 Combine the FASTQ files and remove the originals to save space
	#########################
	## first combine files and delete the originals to save space
	WorkDir=/genetics/elbers/pacbio

	cd ${WorkDir}
	cat ?.subreads.fastq > pacbio-genome-pbjelly.fastq
	rm ?.subreads.fastq

	## second add fake quality values (from Q0=! to Q30=> in Sanger encoding) for pbjelly to work
	cat pacbio-genome-pbjelly.fastq \| tr "!" ">" > pacbio-genome-pbjelly-fqual.fastq

	#########################
	# STEP 2 Create pbjelly-config-assemble.xml
	#########################
	## use nano to create the file then paste everything between the lines starting with #### and ending with ####
	nano pbjelly-config-assemble.xml
	####pbjelly-config-assemble.xml####
	<jellyProtocol>
	<reference>/genetics/pacbio/genome.fasta</reference>
	<outputDir>/genetics/pacbio/</outputDir>
	<cluster>
	<command notes="For single node, multi-core machines" >${CMD} ${JOBNAME} 2> ${STDERR} 1> ${STDOUT} &</command>
	<nJobs>70</nJobs>
	</cluster>
	<blasr>-minMatch 8 -minPctIdentity 70 -bestn 1 -nCandidates 10 -maxScore -500 -nproc 1 -noSplitSubreads</blasr>
	<input baseDir="/genetics/pacbio/">
	<job>pacbio-genome-pbjelly-fqual.fastq</job>
	</input>
	</jellyProtocol>
	####pbjelly-config-assemble.xml####

	#########################
	# STEP 3 Create fake qualities for reference to fill in gaps for
	#########################
	/opt/PBSuite_15.8.24/bin/fakeQuals.py genome.fasta genome.qual

	#########################
	# STEP 4 Summarize the assembly and reads
	#########################
	## first Summarize the assembly
	/opt/PBSuite_15.8.24/bin/summarizeAssembly.py genome.fasta > genome.fasta.summary

	## second Summarize the reads
	/opt/PBSuite_15.8.24/bin/readSummary.py pbjelly-config-assemble.xml > pacbio-genome-pbjelly-fqual.fastq.summary

	#########################
	# STEP 5 Setup pbjelly
	#########################
	## note - this step takes about 1 hour
	## note2 - pbjelly steps are submitted in the background, but even though it says step is
	## finished, it might not be, you need to see if there are still pbjelly processes running
	## for example "ps aux\|grep -v "root"\|less -S" to see if you still see PBJellySuite or
	## blasr or Setup.py or Extraction.py or Support.py or Assemble.py or Out.py
	/opt/PBSuite_15.8.24/bin/Jelly.py setup pbjelly-config-assemble.xml

	#########################
	# STEP 6 Map FASTQ reads to genome.fasta reference
	#########################
	/opt/PBSuite_15.8.24/bin/Jelly.py mapping pbjelly-config-mapping.xml

	#########################
	# STEP 7 Summarize mapping results
	#########################
	/opt/PBSuite_15.8.24/bin/Jelly.py support pbjelly-config-assemble.xml -x "--minMapq=50"

	#########################
	# STEP 8 Extract reads that appear to bridge gaps
	#########################
	/opt/PBSuite_15.8.24/bin/Jelly.py extraction pbjelly-config-assemble.xml

	#########################
	# STEP 9 Assemble reads to fill in gaps
	#########################
	/opt/PBSuite_15.8.24/bin/Jelly.py assembly pbjelly-config-assemble.xml

	#########################
	# STEP 10 Produce output with gaps filled in
	#########################
	/opt/PBSuite_15.8.24/bin/Jelly.py output pbjelly-config-assemble.xml

	#########################
	# STEP 11 Convert het bases to hom and convert lower to uppercase
	#########################
	## see https://github.com/lh3/seqtk for a link to download seqtk
	/opt/seqtk/seqtk randbase jelly.out.fasta \|/opt/seqtk/seqtk seq -U > ../jelly.out.upper.fasta

	#########################
	# STEP 12 Run a variant caller or use Pilon (twice)
	# to correct indels in jelly.out.fasta PacBio filled in Gaps using Illumina reads
	#########################
	## personally, whether you use Pilon or a Variant Caller (such as callvariants.sh from BBTools), I found that mapping with BBMap with the vslow=t option had better indel correction for simulated data
	## for real data, using the steps below outperformed Pilon in terms of better RNA-Seq mapping rates
	##
	## presumably you can use your 10x reads to correct indels?
	## calling this paired-end read file as 10x-illumina-quality-controlled-and-trimmed-reads.fq.gz in steps below

	#########################
	# STEP 13 Error correct with BBMap once
	#########################
	cd /genetics/user
	# download latest version of BBTools
	wget https://sourceforge.net/projects/bbmap/files/BBMap_38.25.tar.gz
	tar xzf BBMap_38.25.tar.gz
	cd bbmap
	# compile jni components
	cd jni
	# add to ~/.bashrc export JAVA_HOME="/usr/lib/jvm/java-1.8.0-openjdk-amd64" or where ever your jvm machine is installed
	# then source ~/.bashrc
	make -f makefile.linux

	cd ${WorkDir}

	# map reads with bbmap 38.25 in vslow mode using jni
	/genetics/user/bbmap/bbmap.sh -Xmx350g -eoom usejni=t pigz=t threads=75 vslow=t ref=jelly.out.upper.fasta in=10x-illumina-quality-controlled-and-trimmed-reads.fq.gz \
	out=10x-illumina-quality-controlled-and-trimmed-reads-mapped-to-jelly.out.upper.fasta-using-bbmap.bam overwrite=t 2> bbmap.log &

	# callvariants 38.25
	/genetics/user/bbmap/callvariants.sh -Xmx350g threads=75 ref=jelly.out.upper.fasta in=10x-illumina-quality-controlled-and-trimmed-reads-mapped-to-jelly.out.upper.fasta-using-bbmap.sorted.bam \
	ploidy=2 vcf=10x-illumina-quality-controlled-and-trimmed-reads-mapped-to-jelly.out.upper.fasta-using-bbmap.vcf minscore=20.0 overwrite=t 2> callvariants.log &

	# use bgzip and tabix to compress then index vcf file
	bgzip -f 10x-illumina-quality-controlled-and-trimmed-reads-mapped-to-jelly.out.upper.fasta-using-bbmap.vcf
	tabix -p vcf 10x-illumina-quality-controlled-and-trimmed-reads-mapped-to-jelly.out.upper.fasta-using-bbmap.vcf.gz

	# use bcftools to get consensus sequence
	bcftools consensus -f jelly.out.upper.fasta 10x-illumina-quality-controlled-and-trimmed-reads-mapped-to-jelly.out.upper.fasta-using-bbmap.vcf.gz -o jelly.out.upper.bbmap-1.fasta 2> bcftools.log &