Skip to content

Instantly share code, notes, and snippets.

@epaule
Last active March 17, 2023 16:55
Show Gist options
  • Save epaule/ea93cb6bdf625e5293f5713b90fe1da9 to your computer and use it in GitHub Desktop.
Save epaule/ea93cb6bdf625e5293f5713b90fe1da9 to your computer and use it in GitHub Desktop.
Rough parser for an ASG contamination file
#!/bin/env crystal
require "option_parser"
# Defaults for the command-line options; both can be overridden by flags.
# `phylum` is used as a regular expression, so alternatives are separated by `|`.
phylum = "Arthropoda|insect"
dir = "20230226_qqAmaFero1.20230225.haplotigs.fa_asg_cobiont_check_run/collected_tables/"

# Parse the command line, updating `phylum` and `dir` in place.
OptionParser.parse do |parser|
  # The usage line names the flags that actually exist (--phylum / --directory).
  parser.banner = "Usage: filter_merged --phylum xyz --directory <dir>"
  parser.on("-p PHYLUM", "--phylum=PHYLUM",
    "Specifies the phylum(s) of the host separated by | [default=#{phylum}]") { |p| phylum = p }
  parser.on("-d DIR", "--directory=DIR", "merged ASG directory[default=#{dir}]") { |d| dir = d }
  parser.on("-h", "--help", "Show this help") do
    puts parser
    exit
  end
  # Unknown flag: report it, show the usage and fail.
  parser.invalid_option do |flag|
    STDERR.puts "ERROR: #{flag} is not a valid option."
    STDERR.puts parser
    exit(1)
  end
  # Flag given without its value (e.g. a trailing `-p`): same treatment,
  # instead of letting the parser raise an unhandled exception.
  parser.missing_option do |flag|
    STDERR.puts "ERROR: #{flag} requires a value."
    STDERR.puts parser
    exit(1)
  end
end
# For every vecscreen result file in `dir`, print the lines mentioning
# "VecScreen" under a banner, followed by one empty line. Files without
# such lines produce no output.
Dir.glob("#{dir}/*vecscreen_contamination").each do |report|
  adaptor_lines = [] of String
  File.each_line(report) do |entry|
    next unless /VecScreen/.match(entry)
    adaptor_lines << entry
  end
  unless adaptor_lines.empty?
    puts "========== EUKARYOTE ADAPTOR SCREEN =========="
    adaptor_lines.each do |entry|
      puts entry
    end
    # Blank separator line after the block of hits.
    puts ""
  end
end
# Compile the host pattern once instead of once per input line.
host_pattern = Regex.new(phylum)

# For every merged contamination table in `dir`, group the first column of
# each remaining row by the value of its last column and print one
# "REMOVE" line per entry under a "#<last column>" heading.
# Skipped rows: the header, blank lines, rows matching the host pattern
# anywhere on the line, and rows whose last column contains "no-hit".
Dir.glob("#{dir}/*contamination_check_merged_table.csv").each do |file|
  categories = {} of String => Array(String)
  header_seen = false
  File.each_line(file) do |line|
    # The first line of each table is the column header.
    unless header_seen
      header_seen = true
      next
    end
    # A blank line would otherwise create an empty "#" category.
    next if line.empty?
    next if host_pattern.match(line)
    columns = line.split(',')
    classifier = columns[-1]
    next if /no-hit/.match(classifier)
    (categories[classifier] ||= [] of String) << columns[0]
  end
  categories.each do |classifier, scaffolds|
    puts "##{classifier}"
    scaffolds.each do |scaffold|
      puts "REMOVE\t#{scaffold}"
    end
    puts ""
  end
end
#!/bin/env ruby
require "optionparser"
# Defaults for the command-line options; both can be overridden by flags.
# `phylum` is used as a regular expression, so alternatives are separated by `|`.
phylum = "Arthropoda|insect"
dir = "20230226_qqAmaFero1.20230225.haplotigs.fa_asg_cobiont_check_run/collected_tables/"

# Build the option parser; the handlers update `phylum` and `dir` in place.
option_parser = OptionParser.new do |parser|
  # The usage line names the flags that actually exist (--phylum / --directory).
  parser.banner = "Usage: filter_merged --phylum xyz --directory <dir>"
  parser.on("-p PHYLUM", "--phylum=PHYLUM",
            "Specifies the phylum(s) of the host separated by | [default=#{phylum}]") { |p| phylum = p }
  # The help text must interpolate `dir`: no `file` variable exists at this
  # point, and referencing one raises NameError before any option is parsed.
  parser.on("-d DIR", "--directory=DIR", "merged ASG directory [default=#{dir}]") { |d| dir = d }
  parser.on("-h", "--help", "Show this help") do
    puts parser
    exit
  end
end

# Report bad or incomplete options with the usage text and a non-zero exit
# status instead of an uncaught exception and backtrace.
begin
  option_parser.parse!
rescue OptionParser::ParseError => e
  warn "ERROR: #{e.message}"
  warn option_parser.help
  exit(1)
end
# For every vecscreen result file in `dir`, print the lines mentioning
# "VecScreen" under a banner, followed by one empty line. Files without
# such lines produce no output.
Dir.glob("#{dir}/*vecscreen_contamination").each do |file|
  # File.foreach closes the file when iteration finishes; a bare
  # File.open(file).each_line leaves the handle open until GC.
  hits = File.foreach(file).grep(/VecScreen/)
  next if hits.empty?

  puts "========== EUKARYOTE ADAPTOR SCREEN =========="
  puts hits
  # Blank separator line after the block of hits.
  puts ""
end
# Compile the host pattern once instead of once per input line.
host_pattern = Regexp.new(phylum)

# For every merged contamination table in `dir`, group the first column of
# each remaining row by the value of its last column and print one
# "REMOVE" line per entry under a "#<last column>" heading.
# Skipped rows: the header, blank lines, rows matching the host pattern
# anywhere on the line, and rows whose last column contains "no-hit".
Dir.glob("#{dir}/*contamination_check_merged_table.csv").each do |file|
  categories = Hash.new { |hash, key| hash[key] = [] }

  # File.foreach closes the file when done. `chomp: true` strips the line
  # terminator so the last column is a clean key: otherwise a final line
  # without a newline (or CRLF input) lands in a separate category.
  File.foreach(file, chomp: true).with_index do |line, index|
    next if index.zero? # column header
    next if line.empty? # a blank line would create an empty "#" category
    next if host_pattern.match?(line)

    columns = line.split(',')
    next if columns.empty? # line consisting only of commas

    classifier = columns.last
    next if classifier.match?(/no-hit/)

    categories[classifier] << columns.first
  end

  categories.each do |classifier, scaffolds|
    puts "##{classifier}"
    scaffolds.each { |scaffold| puts "REMOVE\t#{scaffold}" }
    puts ""
  end
end
@epaule
Copy link
Author

epaule commented Nov 21, 2022

It skips all lines that match the host phyla and those whose classifier is no-hit, then pretty-prints the remaining scaffolds grouped by classifier.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment