jasongallant/resequenceandsnp.sh

## resequenceandsnp.sh
#WORKFLOW FOR GENOME RESEQUENCING:

#CONCATENATE CASAVA OUTPUT
# Annoying first step, Casava 1.8 splits files into groups of 4,000,000 reads, requiring you to stitch it back together...

cat *P1_GA1_*R1* > Limenitis_Pool_P1_GA1_R1.fastq.gz
cat *P1_GA1_*R2* > Limenitis_Pool_P1_GA1_R2.fastq.gz

#repeat for each group of files per each individual pool/sample

# SAMPLE CLEANUP
Custom script used.  Will include later

#BOWTIE2 ALIGNMENT TO WNTA CONTIG
# Next, each sample is submitted (separated pool) to cluster, using bowtie's local alignment parameters (very fast), singletons from quality trimming, and including the proper information about insert size (0-1000bp)...
qsub -l h_rt=08:00:00 -V -N Limenitis_Pool_P1_GA1.sam -b y $BT2_HOME/bowtie2 -x wnt_contig --fast-local -I 0 -X 1000 -1 Limenitis_Pool_P1_GA1_R1.adapter-trimmed.fastq -2 Limenitis_Pool_P1_GA1_R2.adapter-trimmed.fastq -U Limenitis_Pool_P1_GA1_R1.adapter-trimmed.read-singleton.fastq -S Limenitis_Pool_P1_GA1.sam

#CONVERT TO BAM
#Of course, none of the downstream programs play well with the text-version of the SAM alignment file, so you must convert to BAM file...
qsub -V -b y "samtools view -bS Limenitis_Pool_P1_GA1.sam > Limenitis_Pool_P1_GA1.bam"

#SORT BAM FILES
#In order to perform operations, BAM files must be ordered relative each other...
qsub -V -b y samtools sort Limenitis_Pool_P1_GA17.bam Limenitis_Pool_P1_GA17.bam.sorted

#ADD READ GROUPS
#To use GATK's downstream tools, you need to add the required metadata to the file, which is not automatically kept track of.  Picard's "AddOrReplaceReadGroups.jar" does a reasonably snappy job of this...
qsub -V -b y java -jar AddOrReplaceReadGroups.jar INPUT=Limenitis_Pool_P1_GA1.bam.sorted.bam OUTPUT=Limenitis_Pool_P1_GA1.final.bam ID=CDHP7ACXX.LANE3 PL=ILLUMINA PU=CDHP7ACXX LB=Pool_P1_GA1 SM=GA1

#REORDER BAM FILES
#For reasons that escape me presently, the files need to be reordered now before they can be merged... This can be accomplished using Picard's "ReorderSam.jar"
qsub -V -b y java -jar ReorderSam.jar INPUT=Limenitis_Pool_P1_GA1.final.bam OUTPUT=Limenitis_Pool_P1_GA1.final.reordered.bam REFERENCE=wntacontig.fa

#MERGE BAM FILES BY SAMPLE
#BAM files can then be merged in whichever way you'd like.  I think that you can merge them all (though it would take a while and probably crash IGV downstream).  I've tended to keep them as separate files per sample, though with the ReadGroups now added this should be unnecessary:
#there is a little trick here for our particular cluster system: the default temporary directory is size limited, and the job will fail if this directory runs out of space.  Using the  "-Djava.io.tmpdir=" flag tells java to use another location for the temporary file that has no size limitations.
qsub -V -b y java -Djava.io.tmpdir=/someotherplace/tmp -jar MergeSamFiles.jar INPUT=Limenitis_Pool_P1_GA1.final.reordered.bam INPUT=Limenitis_Pool_P2_GA1.final.reordered.bam OUTPUT=GA1_merged.bam

#Call SNPs with GATK
#GATK recommends all sorts of preprocessing.  For now I didn't do any, but I probably should...
java -jar GenomeAnalysisTK.jar -R wntacontig.fa -T UnifiedGenotyper --num_threads 1 --multiallelic --output_mode EMIT_ALL_SITES \
-I GA1_merged.bam \
# add -I flags for each of your samples (merged BAM files from above).

#Call SNPS with Samtools pileup
# Add all of your merged Bamfiles before the pipe.  This will allow you to perform a simple binary association test for each SNP and a phenotype.  Simply create a textile "listofsamples" and order it by your phenotype of interest.  Tell the script the count of the samples in the first group ("-1 11").
samtools mpileup -uf wntacontig.fa GA1_merged.bam | bcftools view -vcs listofsamples -1 11 - > samtools_mpileup_withassoc.vcf
	#WORKFLOW FOR GENOME RESEQUENCING:

	#CONCATENATE CASAVA OUTPUT
	# Annoying first step, Casava 1.8 splits files into groups of 4,000,000 reads, requiring you to stitch it back together...

	cat P1_GA1_R1* > Limenitis_Pool_P1_GA1_R1.fastq.gz
	cat P1_GA1_R2* > Limenitis_Pool_P1_GA1_R2.fastq.gz

	#repeat for each group of files per each individual pool/sample

	# SAMPLE CLEANUP
	Custom script used. Will include later

	#BOWTIE2 ALIGNMENT TO WNTA CONTIG
	# Next, each sample is submitted (separated pool) to cluster, using bowtie's local alignment parameters (very fast), singletons from quality trimming, and including the proper information about insert size (0-1000bp)...
	qsub -l h_rt=08:00:00 -V -N Limenitis_Pool_P1_GA1.sam -b y $BT2_HOME/bowtie2 -x wnt_contig --fast-local -I 0 -X 1000 -1 Limenitis_Pool_P1_GA1_R1.adapter-trimmed.fastq -2 Limenitis_Pool_P1_GA1_R2.adapter-trimmed.fastq -U Limenitis_Pool_P1_GA1_R1.adapter-trimmed.read-singleton.fastq -S Limenitis_Pool_P1_GA1.sam

	#CONVERT TO BAM
	#Of course, none of the downstream programs play well with the text-version of the SAM alignment file, so you must convert to BAM file...
	qsub -V -b y "samtools view -bS Limenitis_Pool_P1_GA1.sam > Limenitis_Pool_P1_GA1.bam"

	#SORT BAM FILES
	#In order to perform operations, BAM files must be ordered relative each other...
	qsub -V -b y samtools sort Limenitis_Pool_P1_GA17.bam Limenitis_Pool_P1_GA17.bam.sorted

	#ADD READ GROUPS
	#To use GATK's downstream tools, you need to add the required metadata to the file, which is not automatically kept track of. Picard's "AddOrReplaceReadGroups.jar" does a reasonably snappy job of this...
	qsub -V -b y java -jar AddOrReplaceReadGroups.jar INPUT=Limenitis_Pool_P1_GA1.bam.sorted.bam OUTPUT=Limenitis_Pool_P1_GA1.final.bam ID=CDHP7ACXX.LANE3 PL=ILLUMINA PU=CDHP7ACXX LB=Pool_P1_GA1 SM=GA1

	#REORDER BAM FILES
	#For reasons that escape me presently, the files need to be reordered now before they can be merged... This can be accomplished using Picard's "ReorderSam.jar"
	qsub -V -b y java -jar ReorderSam.jar INPUT=Limenitis_Pool_P1_GA1.final.bam OUTPUT=Limenitis_Pool_P1_GA1.final.reordered.bam REFERENCE=wntacontig.fa

	#MERGE BAM FILES BY SAMPLE
	#BAM files can then be merged in whichever way you'd like. I think that you can merge them all (though it would take a while and probably crash IGV downstream). I've tended to keep them as separate files per sample, though with the ReadGroups now added this should be unnecessary:
	#there is a little trick here for our particular cluster system: the default temporary directory is size limited, and the job will fail if this directory runs out of space. Using the "-Djava.io.tmpdir=" flag tells java to use another location for the temporary file that has no size limitations.
	qsub -V -b y java -Djava.io.tmpdir=/someotherplace/tmp -jar MergeSamFiles.jar INPUT=Limenitis_Pool_P1_GA1.final.reordered.bam INPUT=Limenitis_Pool_P2_GA1.final.reordered.bam OUTPUT=GA1_merged.bam

	#Call SNPs with GATK
	#GATK recommends all sorts of preprocessing. For now I didn't do any, but I probably should...
	java -jar GenomeAnalysisTK.jar -R wntacontig.fa -T UnifiedGenotyper --num_threads 1 --multiallelic --output_mode EMIT_ALL_SITES \
	-I GA1_merged.bam \
	# add -I flags for each of your samples (merged BAM files from above).

	#Call SNPS with Samtools pileup
	# Add all of your merged Bamfiles before the pipe. This will allow you to perform a simple binary association test for each SNP and a phenotype. Simply create a textile "listofsamples" and order it by your phenotype of interest. Tell the script the count of the samples in the first group ("-1 11").
	samtools mpileup -uf wntacontig.fa GA1_merged.bam \| bcftools view -vcs listofsamples -1 11 - > samtools_mpileup_withassoc.vcf