Skip to content

Instantly share code, notes, and snippets.

@jelber2
Last active July 15, 2024 09:46
Show Gist options
  • Save jelber2/4053b3c10a495c331e5420c54b9b1d6c to your computer and use it in GitHub Desktop.
Save jelber2/4053b3c10a495c331e5420c54b9b1d6c to your computer and use it in GitHub Desktop.
Takes reads processed by shredBAM.jl and then makes hi-c pairs
# revision 2, now supports multi-core processing
# Basic idea is to take a long read alignment file that has been shredded with shredBAM.jl, then
# output hi-c-like read-pairs
#
# So for example,
# ----------------------- 1long read
# 1r1 2r1 3r1 3r2 2r2 1r2
# results in 1read1, 1read2
# 2read1, 2read2
# 3read1, 3read2
# in an interleaved FASTA file ready, hopefully for BWA-MEM or BWA-MEM2
# pretty slow at the moment
# see https://gist.github.com/jelber2/47129820373474a768dacabc0e686fdc for running shredBAM.jl, which is also pretty slow
#
# note, you must name-sort the BAM file !!FIRST!!: samtools sort -@ <threads> -n input.bam > input.name.sorted.bam
#
# Usage
# julia long-reads-shredded-2-hi-c-pairs.jl hg002.herro.Q30.sam1.3.noSoftClip.22.shredBAM.jl.150.name.sorted.bam makepairs.fasta &
#
#
using XAM
using DataFrames
using DataFramesMeta
using ArgParse
using Base.Threads
"""
    parse_commandline()

Define and parse the script's command-line interface: a required input BAM
path and a required output FASTA path. Returns the `Dict` from `parse_args`.
"""
function parse_commandline()
    settings = ArgParseSettings()
    @add_arg_table settings begin
        "input_file"
            help = "Path to the input BAM file"
            required = true
        "output_file"
            help = "Path to the output FASTA file"
            required = true
    end
    return parse_args(settings)
end
"""
    make_pairs(input_file::String, output_file::String)

Read a name-sorted BAM of shredded long-read alignments and write Hi-C-like
read pairs to `output_file` as interleaved FASTA.

Shreds of one parent read are assumed to be named `<parent>_<i>` and, after
name sorting, to appear in order along the parent read. Shred `i` is paired
with shred `n - i + 1` (outermost-in, per the header diagram); when a parent
has an odd number of shreds the unpaired middle shred is skipped.
"""
function make_pairs(input_file::String, output_file::String)
    # --- pass 1: collect mapped primary alignments --------------------------
    reads = DataFrame(read_id=String[], sequence=String[])
    reader = open(BAM.Reader, input_file)
    record = BAM.Record()
    nrecords = 0  # plain Int: this loop is single-threaded, no Atomic needed
    while !eof(reader)
        empty!(record)
        read!(reader, record)
        if BAM.ismapped(record) && BAM.isprimaryalignment(record)
            # push a whole row at once so the two columns can never desync
            push!(reads, (read_id=BAM.tempname(record),
                          sequence=string(BAM.sequence(record))))
        end
        nrecords += 1
        print("\rRead $(nrecords) BAM records.")
        flush(stdout)
    end
    close(reader)
    println("")
    # strip the trailing "_<shred index>" to recover the parent read name
    transform!(reads, :read_id => ByRow(x -> replace(x, r"_\d+$" => "")) => :read_id_new)
    # Group once up front instead of re-filtering the whole table for every
    # parent read: the old `reads[reads.read_id_new .== id, :]` scan was
    # O(n^2) and fully serialized behind a lock.
    groups = groupby(reads, :read_id_new)
    out = open(output_file, "w")
    # One buffer per thread; safe under :static scheduling, where each
    # iteration stays pinned to the thread it started on.
    buffers = [IOBuffer() for _ in 1:nthreads()]
    counter = Atomic{Int}(0)  # global pair id shared across all threads
    Threads.@threads :static for g in 1:length(groups)
        local_buffer = buffers[threadid()]
        shreds = groups[g]
        n_rows = size(shreds, 1)
        # Pair outermost shreds inward: (1, n), (2, n-1), ... For an odd
        # count, n ÷ 2 leaves the middle shred unpaired and it is dropped.
        # (Previously the odd branch paired i with n_rows - i, which shifted
        # every pairing by one and dropped the LAST shred instead.)
        for i in 1:(n_rows ÷ 2)
            # atomic_add! returns the OLD value, so +1 is this pair's id;
            # no lock needed — the atomic op is already race-free.
            local_counter = atomic_add!(counter, 1) + 1
            write(local_buffer, ">seq_$(local_counter)_1\n")
            write(local_buffer, shreds[i, :sequence], "\n")
            write(local_buffer, ">seq_$(local_counter)_2\n")
            write(local_buffer, shreds[n_rows - i + 1, :sequence], "\n")
            print("\rWrote $(counter[]) FASTA record pairs.")
            flush(stdout)
        end
    end
    # flush every per-thread buffer to disk in one pass
    for buffer in buffers
        write(out, take!(buffer))
    end
    close(out)
    return nothing
end
"""
    main()

Entry point: parse the CLI arguments and run the pairing pipeline.
"""
function main()
    args = parse_commandline()
    make_pairs(args["input_file"], args["output_file"])
end

main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment