Skip to content

Instantly share code, notes, and snippets.

@sld
Last active December 17, 2015 18:19
Show Gist options
  • Save sld/5652006 to your computer and use it in GitHub Desktop.
Save sld/5652006 to your computer and use it in GitHub Desktop.
ROSALIND 20 # Ruby
require 'set'
require_relative 'graph_and_ant_colony' # https://gist.github.com/sld/2711080
class String
def indexes letter
(0 .. self.length - 1).find_all { |i| self[i,1] == letter }
end
end
class Array
def swap ind1, ind2
tmp = self[ind1]
self[ind1] = self[ind2]
self[ind2] = tmp
end
end
# Task names starts by " Task: TASK_NAME"
# Example: "# Task: Computing GC Content "
module Rosalind20
def count_dna_nucliotides( dna_string )
{'A' => dna_string.count('A'), 'C' => dna_string.count('C'), 'G' => dna_string.count('G'), 'T' => dna_string.count('T')}
end
def dna_to_rna( dna_string )
dna_string.tr "T", "U"
end
def reverse_complement( dna_string )
dna_string.reverse.tr("A","Z").tr("C","Q").tr("T","A").tr("G","C").tr("Q","G").tr("Z","T")
end
def parse_fasta_file(filename)
fasta_pairs = {}
fasta_id = nil
File.open(filename).each_line do |line|
if line[0] == '>'
fasta_id = line.chomp
fasta_id[0] = ""
fasta_pairs[fasta_id] = ""
else
fasta_pairs[fasta_id] += line.chomp
end
end
return fasta_pairs
end
def parse_adjacency_list_file filename
graph_data = {:nodes_count => nil, :adjacency_list => []}
File.open(filename).each_line do |line|
splitted_line = line.split
if splitted_line.count == 2
graph_data[:adjacency_list] << [splitted_line[0].to_i, splitted_line[1].to_i]
else
graph_data[:nodes_count] = splitted_line.first.to_i
end
end
return graph_data
end
def parse_set_file filename
lines = File.open(filename).lines.to_a
universum_set = (1..lines[0].to_i).to_a
lines[1].delete! "{}"
lines[2].delete! "{}"
set1 = lines[1].split(",").map{|e| e.to_i}
set2 = lines[2].split(",").map{|e| e.to_i}
return [universum_set.to_set, set1.to_set, set2.to_set]
end
def calc_gc(dna_string)
100.0 * ( dna_string.count('G') + dna_string.count('C') ) / dna_string.length
end
# Task: Computing GC Content
def find_max_gc( fasta_pairs )
max_fasta_pair = fasta_pairs.max_by{ |fasta_id, dna_string| calc_gc(dna_string) }
return [max_fasta_pair[0], calc_gc(max_fasta_pair[1])]
end
def get_substring_data dna_string, substring_length, substring_ind
start_ind = substring_ind
end_ind = start_ind + substring_length
{:string => dna_string[start_ind...end_ind], :start_ind => start_ind + 1}
end
# Task: Finding a Motif in DNA
def get_substring_start_indexes( dna_string, substring )
indexes = []
legal_substring_count = dna_string.length - substring.length
for i in 0...legal_substring_count
substring_data = get_substring_data(dna_string, substring.length, i)
substring_data[:string] == substring ? indexes << substring_data[:start_ind] : nil
end
return indexes
end
# Task: A Brief Introduction to Graph Theory
def make_directed_graph fasta_pairs, k
graph = []
fasta_pairs.each do |top_fasta_id, top_dna_string|
fasta_pairs.each do |fasta_id, dna_string|
if top_dna_string[-k..-1] == dna_string[0...k] && top_fasta_id != fasta_id
graph << [top_fasta_id, fasta_id]
end
end
end
return graph
end
# ("AGACCTGCCG", "CCTGCCGGAA") => CCTGCCG
def find_start_substring( src_str, dst_str )
start_substring = ""
indexes = dst_str.indexes(src_str[-1]).reverse
indexes.each do |ind|
if src_str[-1-ind..-1] == dst_str[0..ind]
start_substring = dst_str[0..ind]
break
end
end
return start_substring
end
def make_overlap_graph(fasta_pairs)
overlap_graph = []
strings_arr = fasta_pairs.values
for i in 0...strings_arr.length
for j in i+1...strings_arr.length
if strings_arr[i][strings_arr[j]] || strings_arr[j][strings_arr[i]]
next
else
start_substring = find_start_substring( strings_arr[i], strings_arr[j] )
end_substring = find_start_substring( strings_arr[j], strings_arr[i] )
overlap_graph << {:src => strings_arr[i], :dst => strings_arr[j], :to => start_substring, :from => end_substring}
end
end
end
return overlap_graph
end
def get_edges_from_overlap_graph overlap_graph
nodes = Set.new
edges = Set.new
overlap_graph.each do |edge|
node_src = nodes.find{|e| e.name == edge[:src]} || Node.new(edge[:src])
node_dst = nodes.find{|e| e.name == edge[:dst]} || Node.new(edge[:dst])
nodes << node_dst
nodes << node_src
if edge[:to].length > 0
new_edge = Edge.new(node_src, node_dst, -edge[:to].length)
edges << new_edge if !edges.find{|e| e == new_edge}
end
if edge[:from].length > 0
new_edge = Edge.new(node_dst, node_src, -edge[:from].length)
edges << new_edge if !edges.find{|e| e == new_edge}
end
end
return edges
end
def get_superstring_from_route route
superstring = ""
route.edges.each_with_index do |edge, i|
src_str = edge.from.name
dst_str = edge.to.name
substr_weight = edge.weight
if i == 0
superstring = src_str.clone
superstring[substr_weight..-1] = dst_str
else
superstring[substr_weight..-1] = dst_str
end
end
return superstring
end
def check_superstring superstring, test_strings
test_strings.each do|str|
return "Uncorrect superstring!" if !superstring[str]
end
return "Good superstring!"
end
# Task: Genome Assembly as Shortest Superstring
def get_best_route(overlap_graph, need_print = false)
good_edges = overlap_graph.get_more_than_half_weight_edges
good_graph = Graph.new good_edges, :bidirectional => false, :sort_edges => true
superstring = get_superstring_from_route(good_graph)
p superstring.length
p superstring if need_print
end
# Task: Counting Point Mutations
def hamming_distance str1, str2
distance = 0
for i in 0...str1.length
if str1[i] != str2[i]
distance += 1
end
end
return distance
end
def get_corrects dna_strings
appears = {}
for i in 0...dna_strings.length
for j in i+1...dna_strings.length
appears[dna_strings[i]] ||= 0
if dna_strings[i] == dna_strings[j] || reverse_complement(dna_strings[i]) == dna_strings[j]
appears[dna_strings[i]] += 1
end
end
end
appears.find_all{|str, appears_count| appears_count > 0}.collect{|arr| arr[0]}
end
def get_corrections(incorrects, corrects)
corrections = []
incorrects.each do |incorrect|
corrects.each do |correct|
ham_dist = hamming_distance(incorrect, correct)
if ham_dist == 1
corrections << [incorrect, correct]
break
else
correct_complement = reverse_complement(correct)
compl_ham_dist = hamming_distance(incorrect, correct_complement)
if compl_ham_dist == 1
corrections << [incorrect, correct_complement]
break
end
end
end
end
return corrections
end
# Task: Error Correction in Reads
def get_corrections_from_strings(dna_strings)
corrects = get_corrects(dna_strings)
incorrects = dna_strings - corrects
corrections = get_corrections(incorrects, corrects)
corrections.each{ |(incorrect, correction)| puts "#{incorrect}->#{correction}" }
end
def codon_table
codon_table_arr = "UUU F CUU L AUU I GUU V
UUC F CUC L AUC I GUC V
UUA L CUA L AUA I GUA V
UUG L CUG L AUG M GUG V
UCU S CCU P ACU T GCU A
UCC S CCC P ACC T GCC A
UCA S CCA P ACA T GCA A
UCG S CCG P ACG T GCG A
UAU Y CAU H AAU N GAU D
UAC Y CAC H AAC N GAC D
UAA Stop CAA Q AAA K GAA E
UAG Stop CAG Q AAG K GAG E
UGU C CGU R AGU S GGU G
UGC C CGC R AGC S GGC G
UGA Stop CGA R AGA R GGA G
UGG W CGG R AGG R GGG G".split.each_slice(2).to_a
codon_table_hash = Hash[codon_table_arr]
codon_table_hash["UAA"] = ""
codon_table_hash["UAG"] = ""
codon_table_hash["UGA"] = ""
return codon_table_hash
end
# Task: Translating RNA into Protein
def rna_into_protein rna_string
rna_string.chars.each_slice(3).map(&:join).map{|str3| codon_table[str3]}.join
end
def permutations(array, permutations_array, ind, permutation_ind)
if permutation_ind == array.length-1
return permutations_array
else
for i in ind...array.length
new_arr = array.clone
new_arr.swap(ind, i)
permutations_array << new_arr
permutations(new_arr, permutations_array, ind+1, permutation_ind+1)
end
end
return permutations_array.uniq
end
# Task: RNA Splicing
def rna_without_introns dna_string, introns
introns.each{ |intron| dna_string[intron] = "" }
rna_into_protein( dna_to_rna( dna_string ) )
end
# Task: Enumerating Gene Orders
def permutations_of_length length
array = (1..length).to_a
total_length = array.inject(1){|s,e| s*e}
permutations = permutations(array, [], 0, 0)
file = File.new 'perm', 'w'
file.puts "#{total_length}"
permutations.each{|perm| file.puts perm.join(" ")}
end
# Task: Enumerating k-mers Lexicographically
def k_mers(array, k)
array.repeated_permutation(k).each{|sub_arr| puts sub_arr.join}
end
# Task: k-Mer Composition
def composition_4mer( dna_string )
cnt = 4
permutations = Hash[['A', 'C', 'G', 'T'].repeated_permutation(cnt).map{|e| [e.join, 0]}]
for i in 0..(dna_string.length - cnt)
permutations[dna_string[i...i+cnt]] += 1 if permutations[dna_string[i...i+cnt]]
end
return permutations.values
end
# Task: Inferring mRNA from Protein
def protein_to_rna_possible_count( protein_string, mod = 1000000 )
rna_freq_table = {}
codon_table.values.each do |elem|
rna_freq_table[elem] ||= 0
rna_freq_table[elem] += 1
end
possible_count = 1
protein_string.each_char do |char|
possible_count = rna_freq_table[char] * possible_count
end
return ( ( possible_count * rna_freq_table[""] ) % mod )
end
# Task: Mendel's First Law
# k - dominant homo; m - hetero; n - recessive homo
def dominant_probability( k, m, n)
population = k + m + n
(k / population) * ((k-1) / (population - 1) + m / (population - 1) + n / (population - 1)) +
(m / population) * (k / (population - 1) + 0.75*(m-1) / (population - 1) + 0.5*n / (population - 1)) +
(n / population) * (k / (population - 1) + 0.5*m / (population - 1))
end
# Task: Completing a Tree
def min_edges_to_be_tree adjacency_list, nodes_count
nodes_groups = Array.new( nodes_count )
(1..nodes_count).to_a.each{ |i| nodes_groups[i-1] = [i].to_set }
adjacency_list.each do |(node_from, node_to)|
has_group = false
for i in 0...nodes_groups.length
node_group = nodes_groups[i]
if node_group.include?(node_from) || node_group.include?(node_to)
has_group = true
node_group << node_from
node_group << node_to
end
end
nodes_groups << [node_from, node_to].to_set if !has_group
end
(1..nodes_count).to_a.each do |node|
to_merge_groups = []
nodes_groups.each_with_index do |node_group, i|
to_merge_groups << i if node_group.include?(node)
end
merged_group = [].to_set
to_merge_groups.each{|ind| merged_group += nodes_groups[ind]; nodes_groups[ind] = nil}
nodes_groups.compact!
nodes_groups << merged_group
end
return (nodes_groups.count - 1)
end
# Task: Counting Subsets
def subsets_count n
2**n % 1000000
end
# Task: Introduction to Set Operations
def set_operations set1, set2, universum_set
{:union => set1 + set2, :intersection => set1 & set2, :difference1 => set1 - set2, :difference2 => set2 - set1,
:complement1 => universum_set - set1, :complement2 => universum_set - set2}
end
# Task: Constructing a De Bruijn Graph
def make_de_bruijn dna_strings
for i in 0...dna_strings.length
dna_strings << reverse_complement(dna_strings[i])
end
dna_strings.sort!
dna_strings = dna_strings.to_set
k = dna_strings.first.length - 1
edges = Set.new
nodes = {}
dna_strings.each do |dna_str|
nodes[dna_str[0...k]] ||= Node.new dna_str[0...k]
nodes[dna_str[1...k+1]] ||= Node.new dna_str[1...k+1]
edges << Edge.new( nodes[dna_str[0...k]], nodes[dna_str[1...k+1]], 0)
end
return edges
end
end
include Rosalind20
dna_strings = %w(
TGAT
CATG
TCAT
ATGC
CATC
CATC
)
edges = make_de_bruijn( dna_strings )
edges.each {|edge| puts edge.to_s(:with_brackets => true)}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment