Skip to content

Instantly share code, notes, and snippets.

@epaule
Last active November 16, 2023 11:38
Show Gist options
  • Save epaule/90dec329b6186e5e4bfb6327c0c109bd to your computer and use it in GitHub Desktop.
Save epaule/90dec329b6186e5e4bfb6327c0c109bd to your computer and use it in GitHub Desktop.
#!/usr/bin/env ruby
# frozen_string_literal: true
# add > signs to the nearest gaps in a TPF based on AGP breaks
# usage: ./mark_scaffolds.rb -a AGP -t TPF -o TPF2
require 'optparse'
# get the gaps from the AGP as Hash of Arrays [seq_id] => [1,2,3]
def parse_agp(file)
gaps = {}
cmd = "grep -v '#' #{file} | grep -v proximity_ligation | sort -k6,6 -nk7"
lines = {}
IO.popen(cmd) do |pipe|
pipe.each_line do |line|
lines[line.split[5]] ||= []
lines[line.split[5]] << line
end
end
lines.each do |k, v|
next unless v.size > 1
v[1..-1].each do |l|
gaps[k] ||= []
gaps[k] << l.split[6].to_i
end
end
gaps
end
# get the texel size from the AGP
def get_texel_size(file)
texel_size = 0
File.new(file).each_line do |l|
if (m = %r{(\d+)\.\d+ bp/texel}.match(l))
texel_size = m[1].to_i
end
end
texel_size
end
# get tpf gaps and lines
# returns Hash of Arrays for the gaps [seq_id] => [1,2,3]
def parse_tpf(file)
t_gaps = {}
previous_line = ''
File.new(file).each_line do |l|
if l.include?('GAP')
(id, _, stop) = previous_line.split[1].split(/[:-]/)
t_gaps[id] ||= []
t_gaps[id] << stop.to_i
end
previous_line = l
end
t_gaps
end
# match up the two gaps and return a Hash of Arrays for the ones that need marking
def match_gaps(a_gaps, t_gaps, texel)
ma_gaps = {}
new_gaps = {}
a_gaps.each do |k, v|
unless t_gaps.key?(k) # for cases where the target sequence has no existing gaps
new_gaps[k] = v
v.each { |gap| warn "WARNING: will create new gap for AGP: #{k} (#{gap})" }
next
end
v.each do |pos|
p = t_gaps[k].min_by { |x| (pos - x).abs }
# if out by more than the windows, add it to the new gaps
if (pos - p).abs > texel
warn "WARNING: for #{k} AGP: #{pos} => TPF: #{p} (distance #{(pos - p).abs} > #{texel}) => will create new gap"
new_gaps[k] ||= []
new_gaps[k] << pos
else
ma_gaps[k] ||= []
ma_gaps[k] << p
end
end
end
[ma_gaps, new_gaps]
end
agp = ''
tpf = ''
out = ''
window_size = 8 # inherited from Alan
OptionParser.new do |parser|
parser.banner = 'Usage: mark_scaffolds.rb --agp PRETEXT_AGP --tpf SCAFFOLDS_TPF --out MODIFIED_TPF [--window 7]'
parser.on('-a', '--agp Pretext_AGP') { |a| agp = a }
parser.on('-t', '--tpf SCAFFOLDS_TPF') { |t| tpf = t }
parser.on('-o', '--out MODIFIED_TPF') { |t| out = t }
parser.on('-w', '--window TEXELS') { |t| window_size = t.to_i }
parser.on('-h', '--help') do
puts parser
exit
end
end.parse!
agp_gaps = parse_agp(agp)
tpf_gaps = parse_tpf(tpf)
marked_gaps, novel_gaps = match_gaps(agp_gaps, tpf_gaps, get_texel_size(agp) * window_size)
# if the gap matches a marked gap, add a >
previous_line = ''
out_file = File.new(out, 'w')
File.readlines(tpf).each do |l|
if l.include?('GAP')
unless l.include?('>') # skip already marked bits
(id, _, stop) = previous_line.split[1].split(/[:-]/)
out_file.print '>' if marked_gaps.key?(id) && marked_gaps[id].include?(stop.to_i)
end
out_file.print l
else
c = l.split
(id, start, stop) = c[1].split(/[:-]/)
start = start.to_i
stop = stop.to_i
new = []
new = novel_gaps[id].grep(start..stop) if novel_gaps.key?(id)
if new.size >= 1 # if one or more of the novel gaps are inside a segment
lines = []
[start, new, stop + 1].flatten.each_cons(2) do |a, b|
lines << "?\t#{id}:#{a}-#{b - 1}\t#{c[2]}\t#{c[3]}\n"
end.to_a
out_file.print(lines.join(">GAP\tTYPE-2\t200\n"))
else
out_file.print l
end
end
previous_line = l
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment