Last active
November 16, 2023 11:38
-
-
Save epaule/90dec329b6186e5e4bfb6327c0c109bd to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env ruby | |
# frozen_string_literal: true | |
# add > signs to the nearest gaps in a TPF based on AGP breaks | |
# usage: ./mark_scaffolds.rb -a AGP -t TPF -o TPF2 | |
require 'optparse' | |
# get the gaps from the AGP as Hash of Arrays [seq_id] => [1,2,3] | |
def parse_agp(file) | |
gaps = {} | |
cmd = "grep -v '#' #{file} | grep -v proximity_ligation | sort -k6,6 -nk7" | |
lines = {} | |
IO.popen(cmd) do |pipe| | |
pipe.each_line do |line| | |
lines[line.split[5]] ||= [] | |
lines[line.split[5]] << line | |
end | |
end | |
lines.each do |k, v| | |
next unless v.size > 1 | |
v[1..-1].each do |l| | |
gaps[k] ||= [] | |
gaps[k] << l.split[6].to_i | |
end | |
end | |
gaps | |
end | |
# get the texel size from the AGP | |
def get_texel_size(file) | |
texel_size = 0 | |
File.new(file).each_line do |l| | |
if (m = %r{(\d+)\.\d+ bp/texel}.match(l)) | |
texel_size = m[1].to_i | |
end | |
end | |
texel_size | |
end | |
# get tpf gaps and lines | |
# returns Hash of Arrays for the gaps [seq_id] => [1,2,3] | |
def parse_tpf(file) | |
t_gaps = {} | |
previous_line = '' | |
File.new(file).each_line do |l| | |
if l.include?('GAP') | |
(id, _, stop) = previous_line.split[1].split(/[:-]/) | |
t_gaps[id] ||= [] | |
t_gaps[id] << stop.to_i | |
end | |
previous_line = l | |
end | |
t_gaps | |
end | |
# match up the two gaps and return a Hash of Arrays for the ones that need marking | |
def match_gaps(a_gaps, t_gaps, texel) | |
ma_gaps = {} | |
new_gaps = {} | |
a_gaps.each do |k, v| | |
unless t_gaps.key?(k) # for cases where the target sequence has no existing gaps | |
new_gaps[k] = v | |
v.each { |gap| warn "WARNING: will create new gap for AGP: #{k} (#{gap})" } | |
next | |
end | |
v.each do |pos| | |
p = t_gaps[k].min_by { |x| (pos - x).abs } | |
# if out by more than the windows, add it to the new gaps | |
if (pos - p).abs > texel | |
warn "WARNING: for #{k} AGP: #{pos} => TPF: #{p} (distance #{(pos - p).abs} > #{texel}) => will create new gap" | |
new_gaps[k] ||= [] | |
new_gaps[k] << pos | |
else | |
ma_gaps[k] ||= [] | |
ma_gaps[k] << p | |
end | |
end | |
end | |
[ma_gaps, new_gaps] | |
end | |
agp = '' | |
tpf = '' | |
out = '' | |
window_size = 8 # inherited from Alan | |
OptionParser.new do |parser| | |
parser.banner = 'Usage: mark_scaffolds.rb --agp PRETEXT_AGP --tpf SCAFFOLDS_TPF --out MODIFIED_TPF [--window 7]' | |
parser.on('-a', '--agp Pretext_AGP') { |a| agp = a } | |
parser.on('-t', '--tpf SCAFFOLDS_TPF') { |t| tpf = t } | |
parser.on('-o', '--out MODIFIED_TPF') { |t| out = t } | |
parser.on('-w', '--window TEXELS') { |t| window_size = t.to_i } | |
parser.on('-h', '--help') do | |
puts parser | |
exit | |
end | |
end.parse! | |
agp_gaps = parse_agp(agp) | |
tpf_gaps = parse_tpf(tpf) | |
marked_gaps, novel_gaps = match_gaps(agp_gaps, tpf_gaps, get_texel_size(agp) * window_size) | |
# if the gap matches a marked gap, add a > | |
previous_line = '' | |
out_file = File.new(out, 'w') | |
File.readlines(tpf).each do |l| | |
if l.include?('GAP') | |
unless l.include?('>') # skip already marked bits | |
(id, _, stop) = previous_line.split[1].split(/[:-]/) | |
out_file.print '>' if marked_gaps.key?(id) && marked_gaps[id].include?(stop.to_i) | |
end | |
out_file.print l | |
else | |
c = l.split | |
(id, start, stop) = c[1].split(/[:-]/) | |
start = start.to_i | |
stop = stop.to_i | |
new = [] | |
new = novel_gaps[id].grep(start..stop) if novel_gaps.key?(id) | |
if new.size >= 1 # if one or more of the novel gaps are inside a segment | |
lines = [] | |
[start, new, stop + 1].flatten.each_cons(2) do |a, b| | |
lines << "?\t#{id}:#{a}-#{b - 1}\t#{c[2]}\t#{c[3]}\n" | |
end.to_a | |
out_file.print(lines.join(">GAP\tTYPE-2\t200\n")) | |
else | |
out_file.print l | |
end | |
end | |
previous_line = l | |
end |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment