caryan/variant_calling.jl

## variant_calling.jl
using Glob
using DataFrames

const PICARD = "/home/cryan/Downloads/picard-tools-2.1.0/picard.jar"
const GATK = "/home/cryan/Downloads/GATK/GenomeAnalysisTK.jar"

function prepare_reference(ref)
	#index the reference for bwa
	run(`bwa index $ref.fasta`)
	#sort the reference TODO: does it need to be sorted? .fai file seems minimal
	run(`samtools faidx $ref.fasta`)
	#create sequence dictionary (required for IndelRealigner)

	run(`java -jar $PICARD CreateSequenceDictionary REFERENCE=$ref.fasta OUTPUT=$ref.dict`)
end

function call_variants(ref, reads)
	#align reads to reference and call variants
	#assumes that reads are in $reads_R1_001.fastq.gz and $reads_R2_001.fastq.gz

	#align reads using bwa
	run(pipeline(`bwa mem $ref.fasta $(reads)_R1_001.fastq.gz $(reads)_R2_001.fastq.gz`, "$reads.aln.sam"))

	#Sort SAM file, add fake ReadGroup IDs (otherwise will fail at GATK RealignerTargetCreator below) and convert to bam
	strain = match(r"AZPAE\d{5}", reads).match
	run(`java -jar $PICARD AddOrReplaceReadGroups I=$reads.aln.sam O=$reads.sorted.bam
			RGLB=kos1 RGPL=illumina RGPU=unit1 RGSM=$strain SORT_ORDER=coordinate`)

	# mark duplicates
	run(`java -jar $PICARD MarkDuplicates I=$reads.sorted.bam O=$reads.dedup.bam METRICS_FILE=metrics.txt`)

	# build bam index (requires bam file sorted in coordinate order)
	run(`java -jar $PICARD BuildBamIndex I=$reads.dedup.bam`)

	#Create realignment targets for IndelRealigner below
	run(`java -jar $GATK -T RealignerTargetCreator -R $ref.fasta -I $reads.dedup.bam -o $reads.targetintervals.list`)

	# Indel realignment
	#needs	a reference dictionary (ref.dict for ref.fasta)
	run(`java -jar $GATK -T IndelRealigner -R $ref.fasta -I $reads.dedup.bam
			-targetIntervals $reads.targetintervals.list -o $reads.realigned.bam`)

	#Call variants (takes ~10 minutes to run on a pair of 150M fastq.gz file )
	run(`java -jar $GATK -T HaplotypeCaller -R $ref.fasta -I $reads.realigned.bam
			-ploidy 1 -o $reads.vcf`)
end

function call_variants_directory(dir, ref)
	#Loop over call_variants for multiple strain reads in a directory

	prepare_reference(joinpath(dir,ref))

	#extract the first part before the R1 or R2 and remove duplicates
	fastq_files = glob("*.fastq.gz", dir)
	strains = map(x -> match(r"(AZPAE\d{5}.*)_R[12]_001.fastq.gz", x).captures[1], fastq_files) |> unique

	@parallel for strain in strains
		call_variants(joinpath(dir,ref), joinpath(dir,strain))
	end

end

function gatk_variants_table_to_clc(df_gatk::DataFrame)
		#Helper function to convert a variant table from GATK to CLC style
		df = DataFrame(Ref_Pos_=Int[], Type=AbstractString[], Length=Int[], Reference=AbstractString[], Allele=AbstractString[])

		for idx = 1:size(df_gatk,1)
				#Check for contiguous SNV to turn into MNV
				if (idx > 1) && ((df[end, :Type] == "SNV") || (df[end, :Type] == "MNV")) &&
						(df_gatk[idx, :POS] === df[end,:Ref_Pos_]+df[end,:Length]) &&
						(length(df_gatk[idx, :REF]) === 1) && (length(df_gatk[idx, :ALT]) === 1)
						df[end, :Type] = "MNV"
						df[end, :Length] += 1
						df[end, :Reference] *= df_gatk[idx, :REF]
						df[end, :Allele] *= df_gatk[idx, :ALT]

				#Look for insertion
				elseif (length(df_gatk[idx, :REF]) === 1) && (length(df_gatk[idx, :ALT]) > 1)
						push!(df, [df_gatk[idx, :POS]+1, "Insertion", length(df_gatk[idx, :ALT]) - 1, "-", df_gatk[idx, :ALT][2:end]] )

				#Look for deletion
				elseif (length(df_gatk[idx, :REF]) > 1) && (length(df_gatk[idx, :ALT]) === 1)
						push!(df, [df_gatk[idx, :POS]+1, "Deletion", length(df_gatk[idx, :REF]) - 1, df_gatk[idx, :REF][2:end], "-"] )

				#Otherwise push on the SNP
				else
						push!(df, [df_gatk[idx, :POS], "SNV", 1, df_gatk[idx, :REF], df_gatk[idx, :ALT]])
				end

		end
		return df
end

function consolidate_variant_calls(dir)
	#consolidate variant calls into a single dataframe for comparative analysis

	#Grab all the variant call files and convert to tsv tables
	vcf_files = glob(joinpath(dir, "*.vcf"))
	ref_file = joinpath(dir, "PAO1_reference.fasta")
	for f in vcf_files
		base_name = splitext(f)[1]
		run(`java -jar $GATK -T VariantsToTable -R $ref_file -V $f
			-F POS -F REF -F ALT -F QUAL -o $base_name.variants.tsv`)
	end

	#Load into data frames, convert to CLC style
	dfs = []
	tsv_files = glob(joinpath(dir, "*.variants.tsv"))
	for f in tsv_files
		df = readtable(f)
		strain_name = match(r"(AZPAE\d{5})", f).captures[1]
		df = gatk_variants_table_to_clc(df)
		rename!(df, :Allele, Symbol(strain_name))
		# writetable(f[1:end-3] * "clc.tsv", df)
		push!(dfs, df)
	end

	#join to a single data frame
	main_df = dfs[1]
	for ct = 2:length(dfs)
		# main_df = join(main_df, dfs[ct], on=[:Ref_Pos_, :Type, :Length, :Reference], kind=:outer)
		#work around [this FIXME](https://github.com/JuliaStats/DataFrames.jl/blob/039432c62a7d01285e93f91a2363eb5caa86036c/src/abstractdataframe/join.jl#L97)
		main_df = join(main_df, dfs[ct], on=[:Ref_Pos_, :Type, :Length], kind=:outer)
		#check the references are the same
		for ct = 1:size(main_df,1)
			if !isna(main_df[ct, :Reference]) && !isna(main_df[ct, :Reference_1])
				if(((main_df[ct,:Reference] != main_df[ct,:Reference_1])))
					error("Rows don't match at $ct")
				end
			end
			if isna(main_df[ct, :Reference])
				main_df[ct, :Reference] = main_df[ct, :Reference_1]
			end
		end
		delete!(main_df, :Reference_1)
	end
	sort!(main_df, cols=:Ref_Pos_)

	writetable(joinpath(dir, "variant_summary.tsv"), main_df)

	return main_df
end
	using Glob
	using DataFrames

	const PICARD = "/home/cryan/Downloads/picard-tools-2.1.0/picard.jar"
	const GATK = "/home/cryan/Downloads/GATK/GenomeAnalysisTK.jar"

	function prepare_reference(ref)
	#index the reference for bwa
	run(`bwa index $ref.fasta`)
	#sort the reference TODO: does it need to be sorted? .fai file seems minimal
	run(`samtools faidx $ref.fasta`)
	#create sequence dictionary (required for IndelRealigner)

	run(`java -jar $PICARD CreateSequenceDictionary REFERENCE=$ref.fasta OUTPUT=$ref.dict`)
	end

	function call_variants(ref, reads)
	#align reads to reference and call variants
	#assumes that reads are in $reads_R1_001.fastq.gz and $reads_R2_001.fastq.gz

	#align reads using bwa
	run(pipeline(`bwa mem $ref.fasta $(reads)_R1_001.fastq.gz $(reads)_R2_001.fastq.gz`, "$reads.aln.sam"))

	#Sort SAM file, add fake ReadGroup IDs (otherwise will fail at GATK RealignerTargetCreator below) and convert to bam
	strain = match(r"AZPAE\d{5}", reads).match
	run(`java -jar $PICARD AddOrReplaceReadGroups I=$reads.aln.sam O=$reads.sorted.bam
	RGLB=kos1 RGPL=illumina RGPU=unit1 RGSM=$strain SORT_ORDER=coordinate`)

	# mark duplicates
	run(`java -jar $PICARD MarkDuplicates I=$reads.sorted.bam O=$reads.dedup.bam METRICS_FILE=metrics.txt`)

	# build bam index (requires bam file sorted in coordinate order)
	run(`java -jar $PICARD BuildBamIndex I=$reads.dedup.bam`)

	#Create realignment targets for IndelRealigner below
	run(`java -jar $GATK -T RealignerTargetCreator -R $ref.fasta -I $reads.dedup.bam -o $reads.targetintervals.list`)

	# Indel realignment
	#needs a reference dictionary (ref.dict for ref.fasta)
	run(`java -jar $GATK -T IndelRealigner -R $ref.fasta -I $reads.dedup.bam
	-targetIntervals $reads.targetintervals.list -o $reads.realigned.bam`)

	#Call variants (takes ~10 minutes to run on a pair of 150M fastq.gz file )
	run(`java -jar $GATK -T HaplotypeCaller -R $ref.fasta -I $reads.realigned.bam
	-ploidy 1 -o $reads.vcf`)
	end

	function call_variants_directory(dir, ref)
	#Loop over call_variants for multiple strain reads in a directory

	prepare_reference(joinpath(dir,ref))

	#extract the first part before the R1 or R2 and remove duplicates
	fastq_files = glob("*.fastq.gz", dir)
	strains = map(x -> match(r"(AZPAE\d{5}.*)_R[12]_001.fastq.gz", x).captures[1], fastq_files) \|> unique

	@parallel for strain in strains
	call_variants(joinpath(dir,ref), joinpath(dir,strain))
	end

	end

	function gatk_variants_table_to_clc(df_gatk::DataFrame)
	#Helper function to convert a variant table from GATK to CLC style
	df = DataFrame(Ref_Pos_=Int[], Type=AbstractString[], Length=Int[], Reference=AbstractString[], Allele=AbstractString[])

	for idx = 1:size(df_gatk,1)
	#Check for contiguous SNV to turn into MNV
	if (idx > 1) && ((df[end, :Type] == "SNV") \|\| (df[end, :Type] == "MNV")) &&
	(df_gatk[idx, :POS] === df[end,:Ref_Pos_]+df[end,:Length]) &&
	(length(df_gatk[idx, :REF]) === 1) && (length(df_gatk[idx, :ALT]) === 1)
	df[end, :Type] = "MNV"
	df[end, :Length] += 1
	df[end, :Reference] *= df_gatk[idx, :REF]
	df[end, :Allele] *= df_gatk[idx, :ALT]

	#Look for insertion
	elseif (length(df_gatk[idx, :REF]) === 1) && (length(df_gatk[idx, :ALT]) > 1)
	push!(df, [df_gatk[idx, :POS]+1, "Insertion", length(df_gatk[idx, :ALT]) - 1, "-", df_gatk[idx, :ALT][2:end]] )

	#Look for deletion
	elseif (length(df_gatk[idx, :REF]) > 1) && (length(df_gatk[idx, :ALT]) === 1)
	push!(df, [df_gatk[idx, :POS]+1, "Deletion", length(df_gatk[idx, :REF]) - 1, df_gatk[idx, :REF][2:end], "-"] )

	#Otherwise push on the SNP
	else
	push!(df, [df_gatk[idx, :POS], "SNV", 1, df_gatk[idx, :REF], df_gatk[idx, :ALT]])
	end

	end
	return df
	end

	function consolidate_variant_calls(dir)
	#consolidate variant calls into a single dataframe for comparative analysis

	#Grab all the variant call files and convert to tsv tables
	vcf_files = glob(joinpath(dir, "*.vcf"))
	ref_file = joinpath(dir, "PAO1_reference.fasta")
	for f in vcf_files
	base_name = splitext(f)[1]
	run(`java -jar $GATK -T VariantsToTable -R $ref_file -V $f
	-F POS -F REF -F ALT -F QUAL -o $base_name.variants.tsv`)
	end

	#Load into data frames, convert to CLC style
	dfs = []
	tsv_files = glob(joinpath(dir, "*.variants.tsv"))
	for f in tsv_files
	df = readtable(f)
	strain_name = match(r"(AZPAE\d{5})", f).captures[1]
	df = gatk_variants_table_to_clc(df)
	rename!(df, :Allele, Symbol(strain_name))
	# writetable(f[1:end-3] * "clc.tsv", df)
	push!(dfs, df)
	end

	#join to a single data frame
	main_df = dfs[1]
	for ct = 2:length(dfs)
	# main_df = join(main_df, dfs[ct], on=[:Ref_Pos_, :Type, :Length, :Reference], kind=:outer)
	#work around [this FIXME](https://github.com/JuliaStats/DataFrames.jl/blob/039432c62a7d01285e93f91a2363eb5caa86036c/src/abstractdataframe/join.jl#L97)
	main_df = join(main_df, dfs[ct], on=[:Ref_Pos_, :Type, :Length], kind=:outer)
	#check the references are the same
	for ct = 1:size(main_df,1)
	if !isna(main_df[ct, :Reference]) && !isna(main_df[ct, :Reference_1])
	if(((main_df[ct,:Reference] != main_df[ct,:Reference_1])))
	error("Rows don't match at $ct")
	end
	end
	if isna(main_df[ct, :Reference])
	main_df[ct, :Reference] = main_df[ct, :Reference_1]
	end
	end
	delete!(main_df, :Reference_1)
	end
	sort!(main_df, cols=:Ref_Pos_)

	writetable(joinpath(dir, "variant_summary.tsv"), main_df)

	return main_df
	end