Makes an animated plot of read-quality and read-length density contours over time for Nanopore Dorado-basecalled, indexed BAM files
# New in revision 2: multithreaded support, up to twice as fast as the single-threaded revision 1
# invoke with:
# julia --threads 16 Plot_Animated_Nanopore_Quality_x_Length_2D_Contours.jl --input_file test.bam --fps 0.7 --output_file test.gif
# Benefits from many threads (up to about 16; beyond 16 threads, runtime saturates in the cores-vs-seconds plot below)
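# Note: the input BAM must be indexed first (e.g. samtools index test.bam),
# since the .bai index is read to preallocate the DataFrame.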
using XAM
using ArgParse
using DataFrames
using Dates
using Statistics
using Base.MathConstants
using StatsBase
using StatsPlots
using KernelDensity
using Base.Threads
function parse_commandline()
s = ArgParseSettings()
@add_arg_table s begin
"--input_file", "-i"
help = "Path to the input BAM file"
required = true
"--fps", "-f"
help = "Frames per second for the animation (try maybe 0.35)"
arg_type = Float64
required = true
"--output_file", "-o"
help = "Output file name for the animation (try output.gif)"
required = true
end
return parse_args(s)
end
function get_BAM_records(input_file::String)
println()
println("Reading BAM Index for preallocating df size.")
input_file_index = string(input_file, ".bai")
total_records = try
XAM.BAM.BAI(input_file_index).n_no_coors
catch
println("Please run samtools index first, as $input_file_index is not available in the same directory as $input_file.")
println()
rethrow()
end
println("Total records in the BAM file: $total_records")
df = DataFrame(col1=Vector{Int}(undef, total_records), col2=Vector{Any}(undef, total_records), col3=Vector{Int}(undef, total_records))
reader = open(BAM.Reader, input_file)
record = BAM.Record()
counter = Atomic{Int}(0)
reader_lock = ReentrantLock()
println()
println("Reading BAM records for adding to df.")
Threads.@threads for i in 1:total_records
local_record = BAM.Record()
lock(reader_lock)
read!(reader, local_record)
unlock(reader_lock)
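# Per-read average quality: convert each Phred score to an error probability,
# average those probabilities, then convert back to Phred,
# i.e. Q_avg = -10*log10(mean(10 .^ (-Q ./ 10))), rounded to the nearest integer.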
quals = BAM.quality(local_record)
df.col1[i] = Int(round(-10*log10(sum(q->10^(q/-10), quals)/length(quals)), RoundNearestTiesUp))
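# The read start time is taken from the record's values; it is assumed here to be
# the 7th entry (Dorado's start-time tag), an ISO-8601 string parsed in process_BAM_records().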
df.col2[i] = values(local_record)[7]
df.col3[i] = BAM.seqlength(local_record)
atomic_add!(counter, 1)
print("\rRead $(counter[]) BAM records.")
flush(stdout)
end
println()
println("Done reading BAM records for adding to df.")
close(reader)
return df
end
function process_BAM_records(df::DataFrame)
println("Begin plotting.")
# Read the data
ez1_orig = df
# Convert the second column to DateTime
ez1_orig.col2 = [DateTime(split(dt, "+")[1], "yyyy-mm-ddTHH:MM:SS.sss") for dt in ez1_orig.col2]
# Convert the first datetime to the start of the day
start_time = floor(ez1_orig.col2[1], Day)
# Create a sequence of hourly intervals from the start of the first day to the end of the last day
intervals = start_time:Hour(1):floor(maximum(ez1_orig.col2), Day) + Day(1) - Minute(1)
# Create a new column named interval in ez1_orig to store the assigned intervals
ez1_orig.interval = similar(ez1_orig.col2, DateTime)
# Assign each datetime in ez1_orig.Column2 to the nearest interval
for i in 1:nrow(ez1_orig)
index = searchsorted(intervals, ez1_orig.col2[i]).stop
if index == 0
ez1_orig.interval[i] = intervals[1]
elseif index > 1 && (ez1_orig.col2[i] - intervals[index-1]) < (intervals[index] - ez1_orig.col2[i])
ez1_orig.interval[i] = intervals[index-1]
else
ez1_orig.interval[i] = intervals[index]
end
end
sort!(ez1_orig, :interval)
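# Work with log10(read length) so the density contours span 10 bp to 1 Mb
# (the y-axis ticks are relabeled accordingly in animate_density!).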
ez1_orig.col3 = log10.(ez1_orig.col3)
# Create an empty plot
plt = plot(xlim = (0, 40), ylim = (0,6),
title = "Hourly Hourly Read Quality by Length", xlabel = "Average Read Quality Phred Score", ylabel = "Read Length Bases",
size=(800,600), legend=true)
# For each hourly interval, compute a 2D kernel density estimate of quality vs. log10 read length
result = combine(groupby(ez1_orig, :interval)) do subdf
x = subdf.col1
y = subdf.col3
xmin, xmax = 0, 40
ymin, ymax = 0, 6 # log10(read length) axis range; adjust based on your data
# Perform 2D KDE
kde_data = kde((x, y); boundary=((xmin, xmax), (ymin, ymax)), npoints=(100, 100))
return kde_data
end
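# Note: `combine` stores the KDE object returned by the do block in an auto-named
# column (:x1), which animate_density! accesses below as row.x1.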
# Create the animation
anim = @animate for i in 1:nrow(result)
animate_density!(plt, result, i, ez1_orig)
end
end
# Define the animation function
function animate_density!(plt, result, frame, ez1_orig)
interval = result.interval[frame]
row = result[frame, :]
plot(plt, row.x1,
title = "\"Hourly\" Avg Read Qual by Length, Interval $interval",
xlims = (0, 40), ylims = (0, 6), xlabel = "Average Read Quality Phred Score",
ylabel = "Read Length Bases", yticks=([1, 2, 3, 4, 5, 6], ["10", "100", "1000", "10000", "100000", "1000000"]))
end
function main()
parsed_args = parse_commandline()
input_file = parsed_args["input_file"]
fps = parsed_args["fps"]
output_file = parsed_args["output_file"]
df = get_BAM_records(input_file)
anim = process_BAM_records(df)
# Save the animation with the specified output file name
gif(anim, output_file, fps=fps, show_msg=false)
end
main()
jelber2 commented May 15, 2024

[Figure: cores-vs-seconds (runtime vs. number of cores)]

Can use 1-16 cores; beyond that, adding cores leads to performance saturation.

jelber2 commented May 15, 2024

This is what the same data looks like side-by-side after error-correcting the SUP Nanopore reads with Herro (https://github.com/lbcb-sci/herro), error-correcting with k=19 with brutal rewrite (https://github.com/natir/br), two rounds of overlap-based error correction with peregrine-2021 (https://github.com/cschin/peregrine-2021), and error-correcting with k=21 with DeChat (https://github.com/LuoGroup2023/DeChat) (not using the hifiasm step).
[Figure: WGS_HG002_EZ1_10kb_SUP (side-by-side read quality by length comparison)]
Phred scores for the SUP_Herro-pg_asm2x-dechat reads were estimated by aligning these HG002 reads to hg002v1.0.1.fasta (https://github.com/marbl/HG002) and then running BBTools/BBMap's calctruequality.sh followed by bbduk.sh recalibrate (https://jgi.doe.gov/data-and-tools/software-tools/bbtools/bb-tools-user-guide/).
