Created
January 26, 2018 16:59
-
-
Save lmrodriguezr/f2a40415a0ee6b396dd27d7c111f97d0 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env ruby | |
class MultiGeneBlast | |
attr :file, :query, :genes, :hits | |
def initialize(file) | |
@file = file | |
@genes = MultiGeneBlast::Set.new | |
@hits = MultiGeneBlast::Set.new | |
load_from_file! | |
end | |
def load_from_file! | |
File.open(file, "r") do |fh| | |
load_header(fh) | |
load_hits(fh) | |
load_details(fh) | |
end | |
end | |
private | |
def load_header(fh) | |
while !fh.eof? | |
ln = fh.gets.chomp | |
case ln | |
when '', /\ATable of genes, .*:/ # Do nothing | |
when /\ASignificant hits:/ ; break | |
when /\AClusterBlast scores for (.*)/ | |
@query = $1 | |
else | |
@genes << MultiGeneBlast::Gene.new(ln) | |
end | |
end | |
end | |
def load_hits(fh) | |
while !fh.eof? | |
case fh.gets.chomp | |
when '' # Do nothing | |
when /\ADetails:/ ; break | |
when /\A(\d+)\. (\S+)\s+(.*)\./ | |
@hits << MultiGeneBlast::Hit.new($1, $2, $3) | |
end | |
end | |
end | |
def load_details(fh) | |
while !fh.eof? | |
hit = load_details_header(fh) | |
load_details_genes(fh, hit) | |
load_details_hits(fh, hit) | |
end | |
end | |
def load_details_header(fh) | |
h = nil | |
while !fh.eof? | |
case fh.gets.chomp | |
when '', '>>' # Do nothing | |
when /\ATable of genes, .*:/ ; break | |
when /\A(\d+)\. (.*)/ | |
h = @hits[$1.to_i] | |
raise "Bad name: got #{$2}, expected #{h.gene}" if h.gene!=$2 | |
when /\ASource: (.*)\./ | |
raise "Bad source: got #{$1}, expected #{h.source}" if h.source!=$1 | |
when /\ANumber .* hits to this cluster: (\d+)/ | |
h.proteins_with_hits = $1.to_i | |
when /\AMultiGeneBlast score: ([\d\.]+)/ | |
h.score = $1.to_f | |
when /\ACumulative Blast bit score: ([\d\.]+)/ | |
h.cumulative_score = $1.to_f | |
end | |
end | |
h | |
end | |
def load_details_genes(fh, hit) | |
while !fh.eof? | |
ln = fh.gets.chomp | |
case ln | |
when '' # Do nothing | |
when /\ATable of Blast hits .*:/ ; break | |
else | |
hit.genes << MultiGeneBlast::Gene.new(ln) | |
end | |
end | |
end | |
def load_details_hits(fh, hit) | |
while !fh.eof? | |
ln = fh.gets.chomp | |
case ln | |
when '' # Do nothing | |
when '>>' ; break | |
else | |
hit.blast_hits << MultiGeneBlast::BlastHit.new(ln) | |
end | |
end | |
end | |
end | |
class MultiGeneBlast::Gene | |
attr :name, :loc_start, :loc_end, :strand, :annotation, :locus_tag | |
alias index name | |
def initialize(str) | |
s = str.split(/\s+/) | |
[1,2].each{ |i| s[i] = s[i].to_i } | |
@name, @loc_start, @loc_end, @strand, @annotation, @locus_tag = *s | |
end | |
end | |
class MultiGeneBlast::Hit | |
attr :rank, :gene, :source | |
attr_accessor :proteins_with_hits, :score, :cumulative_score | |
attr_accessor :genes, :blast_hits | |
alias index rank | |
def initialize(rank, gene, source) | |
@rank, @gene, @source = rank.to_i, gene, source | |
@genes = MultiGeneBlast::Set.new | |
@blast_hits = MultiGeneBlast::Set.new | |
end | |
end | |
class MultiGeneBlast::BlastHit | |
attr :query, :subject, :identity, :score, :coverage, :evalue | |
alias index subject | |
def initialize(str) | |
s = str.split(/\s+/) | |
(2..5).each{ |i| s[i] = s[i].to_f } | |
@query, @subject, @identity, @score, @coverage, @evalue = *s | |
end | |
end | |
class MultiGeneBlast::Set < Hash | |
def <<(v) self[v.index]=v ; end | |
end | |
# Main: | |
file = ARGV.shift | |
unless file | |
$stderr.puts "Usage: #{__FILE__} MultiGeneBlast.txt" | |
exit 0 | |
end | |
mgb = MultiGeneBlast.new(file) | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment