Skip to content

Instantly share code, notes, and snippets.

@jelber2
Last active March 26, 2024 10:04
Show Gist options
  • Save jelber2/7163185d5e3f6ff81f8bf4620e0a3740 to your computer and use it in GitHub Desktop.
Save jelber2/7163185d5e3f6ff81f8bf4620e0a3740 to your computer and use it in GitHub Desktop.
Plot changes in read length distribution of Nanopore Dorado called bases
using XAM
using ArgParse
using DataFrames
using Dates
using Plots
using StatsBase
using Plots.PlotMeasures
function parse_commandline()
s = ArgParseSettings()
@add_arg_table s begin
"--input_file", "-i"
help = "Path to the input BAM file"
required = true
"--read_lengths", "-r"
help = "Read lengths interval size (try maybe 5000)"
arg_type = Int
required = true
"--fps", "-f"
help = "Frames per second for the animation (try maybe 5)"
arg_type = Int
required = true
"--xlims_max", "-x"
help = "Maximum value for xlims (try maybe 200000)"
arg_type = Int
required = true
"--output_file", "-o"
help = "Output file name for the animation (try output.gif)"
required = true
"--log10"
help = "Use log10 scale for the plot"
action = :store_true
end
return parse_args(s)
end
# Function to find the length of the shortest value in the running sum
function findShortestInRunningSum(values)
# Sort the values in descending order
sorted_values = sort(values, rev=true)
# Calculate the total sum
total_sum = sum(sorted_values)
# Calculate half of the total sum
half_sum = total_sum / 2
# Initialize running sum and the length of the shortest value
running_sum = 0
shortest_value = Inf
# Iterate through the sorted values
for value in sorted_values
running_sum += value
# If the running sum is equal to or greater than half of the total sum
if running_sum >= half_sum
# Update the shortest length if the current value's length is shorter
if value < shortest_value
shortest_value = value
end
break
end
end
return shortest_value
end
function get_BAM_records(input_file::String)
# Initialize an empty DataFrame with the desired column names
df = DataFrame(Column1 = Int[], Column2 = Any[])
reader = open(BAM.Reader, input_file)
record = BAM.Record()
counter = 0
println()
println("Reading BAM records.")
while !eof(reader)
empty!(record)
read!(reader, record)
# Append each record's BAM.seqlength(record) and values(record)[7] to the DataFrame
push!(df, [BAM.seqlength(record), values(record)[7]])
counter += 1
# Update the counter display
print("\rRead $counter BAM records.")
flush(stdout)
end
println("Done reading BAM records.")
println()
close(reader)
return df
end
function process_BAM_records(df::DataFrame, read_lengths_interval_size::Int, xlims_max::Int, use_log10=false)
println("Begin plotting.")
# Read the data
ez1_orig = df
# Convert the second column to DateTime
ez1_orig.Column2 = [DateTime(split(dt, "+")[1], "yyyy-mm-ddTHH:MM:SS.sss") for dt in ez1_orig.Column2]
# Convert the first datetime to the start of the day
start_time = floor(ez1_orig.Column2[1], Day)
# Create a sequence of times from the start of the day to the end of the day, with intervals of 15 minutes
intervals = start_time:Hour(1):floor(maximum(ez1_orig.Column2), Day) + Day(1) - Minute(1)
# Create a new column named interval in ez1_orig to store the assigned intervals
ez1_orig.interval = similar(ez1_orig.Column2, DateTime)
# Assign each datetime in ez1_orig.Column2 to the nearest interval
for i in 1:nrow(ez1_orig)
index = searchsorted(intervals, ez1_orig.Column2[i]).stop
if index == 0
ez1_orig.interval[i] = intervals[1]
elseif index > 1 && (ez1_orig.Column2[i] - intervals[index-1]) < (intervals[index] - ez1_orig.Column2[i])
ez1_orig.interval[i] = intervals[index-1]
else
ez1_orig.interval[i] = intervals[index]
end
end
# Initialize an empty DataFrame to store the results
results = DataFrame()
sort!(ez1_orig, :interval)
j = 0
for hour in unique(ez1_orig.interval)
test = []
filtered_df = ez1_orig[ez1_orig.interval .== hour, :]
read_lengths = 0:read_lengths_interval_size:ceil(maximum(ez1_orig.Column1)/read_lengths_interval_size)*read_lengths_interval_size
for i in 1:length(read_lengths)-1
range_start = read_lengths[i]
range_end = read_lengths[i+1]
sum_in_range = sum(filtered_df.Column1[(filtered_df.Column1 .> range_start) .& (filtered_df.Column1 .<= range_end)])
push!(test, sum_in_range)
end
test3 = DataFrame(
number_of_bases = test,
read_length = read_lengths[1:end-1],
hour = fill(hour, length(test))
)
if isempty(results)
results = test3
else
j += 1
sum_row = results.number_of_bases[j:j+nrow(test3)-1] + test3.number_of_bases
j += nrow(test3) - 1
test3.number_of_bases .= sum_row
append!(results, test3, promote=true)
end
end
sort!(results, :hour)
# Create the animation
anim = @animate for hour in unique(results.hour)
filtered_results = results[results.hour .== hour, :]
create_plot(filtered_results, results, ez1_orig, xlims_max, use_log10)
end
return anim
println("Done plotting.")
end
# Create a plot for each frame
function create_plot(filtered_results::DataFrame, results::DataFrame, ez1_orig::DataFrame, xlims_max::Int, use_log10=false)
if use_log10
p = plot(log10.(filtered_results.read_length), filtered_results.number_of_bases, group=filtered_results.hour, seriestype=:bar,
xlabel="Log10 Read length", ylabel="Number of Bases",
title="$(round(sum(ez1_orig.Column1/1000000000), digits=2)) Gbp output, red line N50=$(round(findShortestInRunningSum(ez1_orig.Column1)/1000, digits=2)) Kbp",
xlims=(log10(minimum(filtered_results.read_length)), log10(xlims_max)), legend=false, ylims=(0, maximum(results.number_of_bases)))
else
p = plot(filtered_results.read_length, filtered_results.number_of_bases, group=filtered_results.hour, seriestype=:bar,
xlabel="Read length", ylabel="Number of Bases",
title="$(round(sum(ez1_orig.Column1/1000000000), digits=2)) Gbp output, red line N50=$(round(findShortestInRunningSum(ez1_orig.Column1)/1000, digits=2)) Kbp",
xlims=(0, xlims_max), legend=false, ylims=(0, maximum(results.number_of_bases)))
end
if use_log10
vline!([log10(findShortestInRunningSum(ez1_orig.Column1))], color=:red, linestyle=:dash, linewidth=1)
else
vline!([findShortestInRunningSum(ez1_orig.Column1)], color=:red, linestyle=:dash, linewidth=1)
end
return p
end
function main()
parsed_args = parse_commandline()
input_file = parsed_args["input_file"]
read_lengths_interval_size = parsed_args["read_lengths"]
fps = parsed_args["fps"]
xlims_max = parsed_args["xlims_max"]
output_file = parsed_args["output_file"]
use_log10=parsed_args["log10"]
df = get_BAM_records(input_file)
anim = process_BAM_records(df, read_lengths_interval_size, xlims_max, use_log10)
# Save the animation with the specified output file name
gif(anim, output_file, fps=fps)
end
main()
@jelber2
Copy link
Author

jelber2 commented Mar 18, 2024

WGS_HG002_Monarch_all_1st_SUP

@jelber2
Copy link
Author

jelber2 commented Mar 26, 2024

WGS_HG002_Monarch_all_1st_SUP2

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment