Last active
March 7, 2024 09:45
Stitch FASTA reads shredded with shred.sh from BBTools back together
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# tested on Julialang v.1.10.2, DataStructures v0.18.16, and FASTX v2.1.4 | |
# Usage | |
# $ julia stitch-fasta.jl chr20.herro.fasta.Q30.recal.shred.fasta > chr20.herro.fasta.Q30.recal.fasta | |
# | |
import Pkg; Pkg.add("FASTX") | |
import Pkg; Pkg.add("DataStructures") | |
using DataStructures | |
using FASTX | |
function process_fasta_file(filename::String) | |
reader = FASTAReader(open(filename)) | |
sequences = Dict{String, String}() | |
sequences2 = Dict{String, String}() | |
for record in reader | |
# Get identifier | |
main_id = string(identifier(record)) | |
seq = sequence(record) | |
sequences[main_id] = seq | |
end | |
close(reader) | |
sorted_pairs = OrderedDict(sort(collect(pairs(sequences)))) | |
for (id, seq) in sorted_pairs | |
main_id = split(id, '_')[1] | |
# Aggregate sequences by the main identifier | |
if haskey(sequences2, main_id) | |
sequences2[main_id] *= seq | |
else | |
sequences2[main_id] = seq | |
end | |
end | |
# Print the aggregated sequences | |
for (id, seq) in sequences2 | |
println(">$id") | |
println(seq) | |
end | |
end | |
# Check if a filename is passed as a command line argument | |
if length(ARGS) < 1 | |
println("Usage: julia ", PROGRAM_FILE, " <filename.fasta>") | |
return | |
end | |
# Example usage with command line argument | |
process_fasta_file(ARGS[1]) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment