Last active
December 14, 2016 15:07
-
-
Save gisleDK/6ef145f7989bc28ccfb2 to your computer and use it in GitHub Desktop.
Extract flanking genes and genes of interest as well as finding co-occurrences. Extracts round-robin, as in correctly extracts flanking genes near start/stop of linear genome
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env ruby2.2 | |
require 'pp' | |
require 'optparse' | |
ARGV << "-h" if ARGV.empty? | |
cmd_init = File.basename($0) + " " + ARGV.join(" ") | |
options = {} | |
OptionParser.new do |opts| | |
opts.banner = "Usage: #{File.basename(__FILE__)} [options]" | |
opts.on("-h", "--help", "Display this screen" ) do | |
$stderr.puts opts | |
exit | |
end | |
opts.on("-l", "--goi_file <file>", String, "List containing genes of interest") do |o| | |
options[:goi_file] = o | |
end | |
opts.on("-g", "--domtblout_file <file>", String, "ACCESSION_PROTEINID_UNIQID domtblout file to process") do |o| | |
options[:domtblout_file] = o | |
end | |
opts.on("-f", "--flanking_genes <int>", Integer, "Number of flanking genes to output (default 5)") do |o| | |
options[:flank] = o | |
end | |
opts.on("-o", "--output cooccurence <file>", String, "cooccurence fraction of domains output file") do |o| | |
options[:cooccur_file] = o | |
end | |
end.parse! | |
options[:flank] ||= 5 | |
class Correlations | |
attr_writer :dataset | |
attr_accessor :total_appearance_count | |
attr_accessor :seen_together_count | |
def initialize(dataset) | |
self.dataset = dataset | |
self.total_appearance_count = Hash.new(0) | |
self.seen_together_count = Hash.new(0) | |
end | |
def count_appearance(list) | |
list.each do |item| | |
@total_appearance_count[item] += 1 | |
end | |
end | |
def count_appearance_of_pair(pair) | |
pair.sort! | |
@seen_together_count[pair] += 1 | |
end | |
def count_dataset | |
@dataset.each do |list| | |
count_appearance(list) | |
pairs = list.combination(2).to_a | |
pairs.each do |pair| | |
count_appearance_of_pair(pair) | |
end | |
end | |
end | |
def correlate(options) | |
puts "Missing cooccurrence output filename" if options[:cooccur_file] == nil | |
exit if options[:cooccur_file] == nil | |
count_dataset | |
correlations = [] | |
@total_appearance_count.each do |item, count| | |
@seen_together_count.each do |pair, pair_count| | |
if pair.include?(item) | |
correlation = pair_count | |
comparison_item = (pair - [item])[0] | |
correlations << Correlation.new(item, comparison_item, correlation) | |
end | |
end | |
end | |
File.open(options[:cooccur_file], "w+") do |i| | |
i.puts(correlations.sort) | |
end | |
end | |
end | |
class Correlation | |
attr_accessor :from | |
attr_accessor :to | |
attr_accessor :correlation | |
def initialize(from, to, correlation) | |
self.from, self.to, self.correlation = from, to, correlation | |
end | |
def <=>(other) | |
s = "#{self.from}#{self.correlation - 100}" | |
o = "#{other.from}#{other.correlation - 100}" | |
s <=> o | |
end | |
def correlation_fraction | |
sprintf('%.1f', correlation ) | |
end | |
def to_s | |
"#{from}\t#{to}\t#{correlation_fraction}" | |
end | |
end | |
data_struct = Hash.new { |hash, key| hash[key] = {} } | |
File.open(options[:domtblout_file]) do |ios| | |
ios.each_line do |line| | |
line.chomp! | |
if line[0] == '#' | |
else | |
fields = line.split("\s") | |
id = fields[0] | |
cid = id.split('_')[0,2].join("_") | |
number = fields.first.split("_").last.split("D").last.to_i | |
domain = fields[1] | |
data_struct[cid][number] = [id, domain] | |
end | |
end | |
end | |
goi_list = [] | |
File.open(options[:goi_file]) do |ios| | |
ios.each_line do |line| | |
line.chomp! | |
if line[0] == '#' | |
else | |
goi_list << line.split("\t").first | |
end | |
end | |
end | |
flank_list = [] | |
cooccurrence = [] | |
goi_list.each do |goi| | |
fields = goi.split("_") | |
cid = fields[0,2].join("_") | |
index = fields.last.split("D").last.to_i | |
dat = data_struct.sort | |
if data_struct.has_key? cid | |
occurence = [] | |
flank_list_copy = [] | |
missing = [] | |
(index - options[:flank]).upto(index + options[:flank]).each do |i| | |
flank_list << data_struct[cid][i] | |
flank_list_copy << data_struct[cid][i] | |
if data_struct[cid][i] == nil | |
missing << (i - index) | |
end | |
end | |
if missing.any? | |
missing.count | |
i = missing.count { |x| x > 0} | |
flank_list = (data_struct[cid].values.first(i)+flank_list).rotate(5) | |
j = missing.count { |x| x < 0} | |
flank_list = (data_struct[cid].values.last(j)+flank_list) | |
end | |
flank_list_copy.each do |tuple| | |
if tuple | |
occurence << tuple.last | |
occurence.uniq! | |
end | |
end | |
cooccurrence << occurence | |
end | |
end | |
correlations = Correlations.new(cooccurrence) | |
correlations.correlate(options) | |
flank_list.each do |tuple| | |
if tuple | |
puts tuple.join("\t") | |
end | |
end |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment