Skip to content

Instantly share code, notes, and snippets.

@jelber2
Last active July 15, 2024 09:46
Show Gist options
  • Save jelber2/4053b3c10a495c331e5420c54b9b1d6c to your computer and use it in GitHub Desktop.
Save jelber2/4053b3c10a495c331e5420c54b9b1d6c to your computer and use it in GitHub Desktop.
Takes reads processed by shredBAM.jl and then makes hi-c pairs
# revision 2, now supports multi-core processing
# Basic idea is to take a long read alignment file that has been shredded with shredBAM.jl, then
# output hi-c-like read-pairs
#
# So for example,
# ----------------------- 1long read
# 1r1 2r1 3r1 3r2 2r2 1r2
# results in 1read1, 1read2
# 2read1, 2read2
# 3read1, 3read2
# in an interleaved FASTA file ready, hopefully for BWA-MEM or BWA-MEM2
# pretty slow at the moment
# see https://gist.github.com/jelber2/47129820373474a768dacabc0e686fdc for running shredBAM.jl, which is also pretty slow
#
# note, you must name-sort the BAM file !!FIRST!!: samtools sort -@ <threads> -n input.bam > input.name.sorted.bam
#
# Usage
# julia long-reads-shredded-2-hi-c-pairs.jl hg002.herro.Q30.sam1.3.noSoftClip.22.shredBAM.jl.150.name.sorted.bam makepairs.fasta &
#
#
using XAM
using DataFrames
using DataFramesMeta
using ArgParse
using Base.Threads
"""
    parse_commandline()

Define and parse the script's command-line interface: a required input BAM
path and a required output FASTA path. Returns the `Dict` from `parse_args`.
"""
function parse_commandline()
    settings = ArgParseSettings()
    @add_arg_table settings begin
        "input_file"
            help = "Path to the input BAM file"
            required = true
        "output_file"
            help = "Path to the output FASTA file"
            required = true
    end
    return parse_args(settings)
end
"""
    make_pairs(input_file::String, output_file::String)

Read a name-sorted BAM of shredded long-read alignments and write Hi-C-like
read pairs to `output_file` as interleaved FASTA.

Shreds of one parent read are assumed to be named `<parent>_<i>` and, after
name sorting, to appear in order along the parent read. Shred `i` is paired
with shred `n - i + 1` (outermost-in, per the header diagram); when a parent
has an odd number of shreds the unpaired middle shred is skipped.
"""
function make_pairs(input_file::String, output_file::String)
    # --- pass 1: collect mapped primary alignments --------------------------
    reads = DataFrame(read_id=String[], sequence=String[])
    reader = open(BAM.Reader, input_file)
    record = BAM.Record()
    nrecords = 0  # plain Int: this loop is single-threaded, no Atomic needed
    while !eof(reader)
        empty!(record)
        read!(reader, record)
        if BAM.ismapped(record) && BAM.isprimaryalignment(record)
            # push a whole row at once so the two columns can never desync
            push!(reads, (read_id=BAM.tempname(record),
                          sequence=string(BAM.sequence(record))))
        end
        nrecords += 1
        print("\rRead $(nrecords) BAM records.")
        flush(stdout)
    end
    close(reader)
    println("")
    # strip the trailing "_<shred index>" to recover the parent read name
    transform!(reads, :read_id => ByRow(x -> replace(x, r"_\d+$" => "")) => :read_id_new)
    # Group once up front instead of re-filtering the whole table for every
    # parent read: the old `reads[reads.read_id_new .== id, :]` scan was
    # O(n^2) and fully serialized behind a lock.
    groups = groupby(reads, :read_id_new)
    out = open(output_file, "w")
    # One buffer per thread; safe under :static scheduling, where each
    # iteration stays pinned to the thread it started on.
    buffers = [IOBuffer() for _ in 1:nthreads()]
    counter = Atomic{Int}(0)  # global pair id shared across all threads
    Threads.@threads :static for g in 1:length(groups)
        local_buffer = buffers[threadid()]
        shreds = groups[g]
        n_rows = size(shreds, 1)
        # Pair outermost shreds inward: (1, n), (2, n-1), ... For an odd
        # count, n ÷ 2 leaves the middle shred unpaired and it is dropped.
        # (Previously the odd branch paired i with n_rows - i, which shifted
        # every pairing by one and dropped the LAST shred instead.)
        for i in 1:(n_rows ÷ 2)
            # atomic_add! returns the OLD value, so +1 is this pair's id;
            # no lock needed — the atomic op is already race-free.
            local_counter = atomic_add!(counter, 1) + 1
            write(local_buffer, ">seq_$(local_counter)_1\n")
            write(local_buffer, shreds[i, :sequence], "\n")
            write(local_buffer, ">seq_$(local_counter)_2\n")
            write(local_buffer, shreds[n_rows - i + 1, :sequence], "\n")
            print("\rWrote $(counter[]) FASTA record pairs.")
            flush(stdout)
        end
    end
    # flush every per-thread buffer to disk in one pass
    for buffer in buffers
        write(out, take!(buffer))
    end
    close(out)
    return nothing
end
"""
    main()

Entry point: parse the CLI arguments and run the pairing pipeline.
"""
function main()
    args = parse_commandline()
    make_pairs(args["input_file"], args["output_file"])
end

main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment