Skip to content

Instantly share code, notes, and snippets.

@kejadlen
Forked from roshank/gist:994419
Created May 27, 2011 00:40
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save kejadlen/994422 to your computer and use it in GitHub Desktop.
Save kejadlen/994422 to your computer and use it in GitHub Desktop.
require 'logger'
require 'net/http'
require 'open-uri'
#Roshan Khan
#CS 590B
#HW4 - HMM
class Fasta
attr_accessor :dna, :html
def initialize(code)
p "Retrieving Fasta for #{code}"
@html = File.open(code).read
@dna = html.sub(/\A[^\n]*\n/, '')
@dna.gsub!(/[^ACGT]/, 'T')
p "Fasta retrieved, length #{@dna.length}"
end
end
class ViterbiPath
STATES = ['STANDARD', 'CG_RICH']
OBS = ['A', 'C', 'G', 'T']
START = {'STANDARD' , 0.9999, 'CG_RICH' , 0.0001}
TRANS = {'STANDARD' , {'STANDARD' , 0.9999, 'CG_RICH' , 0.0001} , 'CG_RICH' , {'STANDARD' , 0.01, 'CG_RICH' , 0.99} }
#TRANS = {'STANDARD' , {'STANDARD' , 0.6, 'CG_RICH' , 0.4} , 'CG_RICH' , {'STANDARD' , 0.2, 'CG_RICH' , 0.8} }
EMIT = {'STANDARD' , { 'A' , 0.25, 'C' , 0.25, 'G' , 0.25, 'T',0.25 }, 'CG_RICH' , {'A' , 0.2, 'C' , 0.3, 'G' , 0.3, 'T',0.2} }
# Initalize data structure for holding calculations. States x Dna array
def initialize (dna)
@dna = dna
end
def traceback
@path = []
last_parent = ""
# Find highest value in the last node, to see where to start trace
if (@vit[@vit.length-1]['STANDARD'] > @vit[@vit.length-1]['CG_RICH'])
p "Probability of this path is " + @vit[@vit.length-1]['STANDARD'].to_s
@path << @vit[@vit.length-1]['STANDARDparent']
lastparent = @vit[@vit.length-1]['STANDARDparent']
else
p "Probability of this path is " + @vit[@vit.length-1]['CG_RICH'].to_s
@path << @vit[@vit.length-1]['CG_RICHparent']
lastparent = @vit[@vit.length-1]['CG_RICHparent']
end
# Now count back from last node - 1
count = @vit.length - 2
while (count >= 0)
# New path is viterbi sequence position, and look up parent value
@path << @vit[count][lastparent+'parent']
lastparent = @vit[count][lastparent+'parent']
count -= 1
end
@path = @path.reverse
end
def train
p "Initiating training"
tc = { "STANDARD", {"STANDARD", 0.0, "CG_RICH", 0.0}, "CG_RICH", {"STANDARD", 0.0, "CG_RICH", 0.0} }
ec = {'STANDARD' , { 'A' , 0.0, 'C' , 0.0, 'G' , 0.0, 'T',0.0 }, 'CG_RICH' , {'A' , 0.0, 'C' , 0.0, 'G' , 0.0, 'T',0.0} }
cur = @path[1]
for i in (2...@path.length)
tc[cur][@path[i]] += 1
ec[cur][@dna[i].chr] += 1
cur = @path[i]
end
standard_total = tc["STANDARD"]["STANDARD"] + tc["STANDARD"]["CG_RICH"]
cg_total = tc["CG_RICH"]["CG_RICH"] + tc["CG_RICH"]["STANDARD"]
TRANS["STANDARD"]["STANDARD"] = tc["STANDARD"]["STANDARD"] / standard_total
TRANS["STANDARD"]["CG_RICH"] = tc["STANDARD"]["CG_RICH"] / standard_total
TRANS["CG_RICH"]["STANDARD"] = tc["CG_RICH"]["STANDARD"] / cg_total
TRANS["CG_RICH"]["CG_RICH"] = tc["CG_RICH"]["CG_RICH"] / cg_total
s_total = ec["STANDARD"]["A"] + ec["STANDARD"]["C"] + ec["STANDARD"]["G"] + ec["STANDARD"]["T"]
g_total = ec["CG_RICH"]["A"] + ec["CG_RICH"]["C"] + ec["CG_RICH"]["G"] + ec["CG_RICH"]["T"]
EMIT["STANDARD"]["A"] = ec["STANDARD"]["A"] / s_total
EMIT["STANDARD"]["C"] = ec["STANDARD"]["C"] / s_total
EMIT["STANDARD"]["G"] = ec["STANDARD"]["G"] / s_total
EMIT["STANDARD"]["T"] = ec["STANDARD"]["T"] / s_total
EMIT["CG_RICH"]["A"] = ec["CG_RICH"]["A"] / g_total
EMIT["CG_RICH"]["C"] = ec["CG_RICH"]["C"] / g_total
EMIT["CG_RICH"]["G"] = ec["CG_RICH"]["G"] / g_total
EMIT["CG_RICH"]["T"] = ec["CG_RICH"]["T"] / g_total
p "New parameters are"
p "TRANSITION"
p TRANS
p "EMISSION"
p EMIT
end
# 1->1 = 1->1 / 1->1+1->2
# 1->2 = 1->2 / 1->1+1->2
# 2->1 = 2->1/ 2->1 + 2->2
# 2->2 = 2->2 / 2->2 + 2->2
# same thing for distribution
def print_cg_areas
in_cg = false
start_node = 0
len = 0
hits = 0
@path.each_with_index do |state, index|
if (state == "CG_RICH")
if (in_cg == true)
len+=1
else
in_cg = true
start_node = index
len=1
end
else
if (in_cg == true)
hits += 1
# FINISHED A CG_RICH AREA - PRINT THIS
p "CG_Rich area from #{start_node} to #{start_node + len} which is #{len} bp long."
odds = @vit[index]['CG_RICH']
end
in_cg = false
len = 0
end
end
p "Total number of hits in this sequence is #{hits}"
end
def longest_rich_area
in_cg = false
start_node = 0
longest_start = 0
longest_len = 0
len = 0
@path.each_with_index do |state, index|
if (state == "CG_RICH")
if (in_cg == true)
len+=1
else
in_cg = true
start_node = index
len=1
end
else
in_cg = false
len = 0
end
if (longest_len < len)
longest_len = len
longest_start = start_node
end
end
p longest_len
p longest_start
#p @path[longest_start...longest_start+longest_len]
end
#TODO: Change this function name
# Calculate state values across all nodes in the chain
def predict()
@vit = Array.new()
@vit << {}
for state in STATES
@vit[0][state] = Math.log(START[state] * EMIT[state][@dna[0].chr])
end
start = Time.now
for t in (1...@dna.length-1)
@vit << {}
if (t % 100000 == 0)
p "Completed #{t} in #{Time.now - start} seconds"
end
for y in STATES
values = []
for y0 in STATES
values << [@vit[t-1][y0] + Math.log(TRANS[y0][y] * EMIT[y][@dna[t].chr]), y0]
end
@vit[t][y] = values.max[0]
@vit[t][y + 'parent'] = values.max[1]
end
end
end
end
if __FILE__ == $0
f = Fasta.new(ARGV[0])
vp = ViterbiPath.new(f.dna)
(0...10).each do |i|
p "---------------------------- ITERATION #{i+1} ----------------------------"
vp.predict
vp.traceback
vp.print_cg_areas
vp.train
end
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment