kejadlen/gist:994422

## gistfile1.rb
require 'logger'
require 'net/http'
require 'open-uri'

#Roshan Khan
#CS 590B
#HW4 - HMM

class Fasta
  attr_accessor :dna, :html

  def initialize(code)
    p "Retrieving Fasta for #{code}"
    @html = File.open(code).read
    @dna = html.sub(/\A[^\n]*\n/, '')
    @dna.gsub!(/[^ACGT]/, 'T')
    p "Fasta retrieved, length #{@dna.length}"
  end
end

class ViterbiPath

  STATES = ['STANDARD', 'CG_RICH']
  OBS = ['A', 'C', 'G', 'T']
  START =  {'STANDARD' , 0.9999, 'CG_RICH' , 0.0001}
  TRANS =  {'STANDARD' , {'STANDARD' , 0.9999, 'CG_RICH' , 0.0001} , 'CG_RICH' , {'STANDARD' , 0.01, 'CG_RICH' , 0.99} }
  #TRANS =  {'STANDARD' , {'STANDARD' , 0.6, 'CG_RICH' , 0.4} , 'CG_RICH' , {'STANDARD' , 0.2, 'CG_RICH' , 0.8} }
  EMIT =   {'STANDARD' , { 'A' , 0.25, 'C' , 0.25, 'G' , 0.25, 'T',0.25 }, 'CG_RICH' ,  {'A' , 0.2, 'C' , 0.3, 'G' , 0.3, 'T',0.2} }


  # Initalize data structure for holding calculations. States x Dna array
  def initialize (dna)
    @dna = dna
  end

  def traceback
    @path = []
    last_parent = ""
    # Find highest value in the last node, to see where to start trace
    if (@vit[@vit.length-1]['STANDARD'] > @vit[@vit.length-1]['CG_RICH'])
      p "Probability of this path is " + @vit[@vit.length-1]['STANDARD'].to_s
      @path << @vit[@vit.length-1]['STANDARDparent']
      lastparent = @vit[@vit.length-1]['STANDARDparent']
    else
      p "Probability of this path is " + @vit[@vit.length-1]['CG_RICH'].to_s
      @path << @vit[@vit.length-1]['CG_RICHparent']
      lastparent = @vit[@vit.length-1]['CG_RICHparent']
    end


    # Now count back from last node - 1
    count = @vit.length - 2
    while (count >= 0)
      # New path is viterbi sequence position, and look up parent value
      @path << @vit[count][lastparent+'parent']
      lastparent = @vit[count][lastparent+'parent']
      count -= 1
    end
    @path = @path.reverse
  end

  def train
    p "Initiating training"

    tc = { "STANDARD", {"STANDARD", 0.0, "CG_RICH", 0.0}, "CG_RICH", {"STANDARD", 0.0, "CG_RICH", 0.0} }
    ec = {'STANDARD' , { 'A' , 0.0, 'C' , 0.0, 'G' , 0.0, 'T',0.0 }, 'CG_RICH' ,  {'A' , 0.0, 'C' , 0.0, 'G' , 0.0, 'T',0.0} }

    cur = @path[1]

    for i in (2...@path.length)
      tc[cur][@path[i]] += 1
      ec[cur][@dna[i].chr] += 1
      cur = @path[i]
    end

    standard_total = tc["STANDARD"]["STANDARD"] + tc["STANDARD"]["CG_RICH"]
    cg_total = tc["CG_RICH"]["CG_RICH"] + tc["CG_RICH"]["STANDARD"]

    TRANS["STANDARD"]["STANDARD"] = tc["STANDARD"]["STANDARD"] / standard_total
    TRANS["STANDARD"]["CG_RICH"] = tc["STANDARD"]["CG_RICH"] / standard_total
    TRANS["CG_RICH"]["STANDARD"] = tc["CG_RICH"]["STANDARD"] / cg_total
    TRANS["CG_RICH"]["CG_RICH"] = tc["CG_RICH"]["CG_RICH"] / cg_total

    s_total = ec["STANDARD"]["A"] + ec["STANDARD"]["C"] + ec["STANDARD"]["G"] + ec["STANDARD"]["T"]
    g_total = ec["CG_RICH"]["A"] + ec["CG_RICH"]["C"] + ec["CG_RICH"]["G"] + ec["CG_RICH"]["T"]

    EMIT["STANDARD"]["A"] = ec["STANDARD"]["A"] / s_total
    EMIT["STANDARD"]["C"] = ec["STANDARD"]["C"] / s_total
    EMIT["STANDARD"]["G"] = ec["STANDARD"]["G"] / s_total
    EMIT["STANDARD"]["T"] = ec["STANDARD"]["T"] / s_total

    EMIT["CG_RICH"]["A"] = ec["CG_RICH"]["A"] / g_total
    EMIT["CG_RICH"]["C"] = ec["CG_RICH"]["C"] / g_total
    EMIT["CG_RICH"]["G"] = ec["CG_RICH"]["G"] / g_total
    EMIT["CG_RICH"]["T"] = ec["CG_RICH"]["T"] / g_total

    p "New parameters are"
    p "TRANSITION"
    p TRANS
    p "EMISSION"
    p EMIT
  end

  # 1->1 = 1->1 / 1->1+1->2
  # 1->2 = 1->2 / 1->1+1->2
  # 2->1 = 2->1/ 2->1 + 2->2
  # 2->2 = 2->2 / 2->2 + 2->2
  # same thing for distribution

  def print_cg_areas
    in_cg = false
    start_node = 0
    len = 0
    hits = 0
    @path.each_with_index do |state, index|
      if (state == "CG_RICH")
        if (in_cg == true)
          len+=1
        else
          in_cg = true
          start_node = index
          len=1
        end
      else
        if (in_cg == true)
          hits += 1
          # FINISHED A CG_RICH AREA - PRINT THIS
          p "CG_Rich area from #{start_node} to #{start_node + len} which is #{len} bp long."
          odds = @vit[index]['CG_RICH']
        end
        in_cg = false
        len = 0
      end
    end
    p "Total number of hits in this sequence is #{hits}"
  end

  def longest_rich_area
    in_cg = false
    start_node = 0
    longest_start = 0
    longest_len = 0
    len = 0
    @path.each_with_index do |state, index|
      if (state == "CG_RICH")
        if (in_cg == true)
          len+=1
        else
          in_cg = true
          start_node = index
          len=1
        end
      else
        in_cg = false
        len = 0
      end

      if (longest_len < len)
        longest_len = len
        longest_start = start_node
      end
    end

    p longest_len
    p longest_start
    #p @path[longest_start...longest_start+longest_len]
  end

  #TODO: Change this function name
  # Calculate state values across all nodes in the chain
  def predict()
    @vit = Array.new()
    @vit << {}
    for state in STATES
      @vit[0][state] = Math.log(START[state] * EMIT[state][@dna[0].chr])
    end
    start = Time.now
    for t in (1...@dna.length-1)
      @vit << {}
      if (t % 100000 == 0)
        p "Completed #{t} in #{Time.now - start} seconds"
      end
      for y in STATES
        values = []
        for y0 in STATES
          values << [@vit[t-1][y0] + Math.log(TRANS[y0][y] * EMIT[y][@dna[t].chr]), y0]
        end
        @vit[t][y] = values.max[0]
        @vit[t][y + 'parent'] = values.max[1]
      end
    end
  end
end

if __FILE__ == $0
  f = Fasta.new(ARGV[0])
  vp = ViterbiPath.new(f.dna)
  (0...10).each do |i|
    p "---------------------------- ITERATION #{i+1} ----------------------------"
    vp.predict
    vp.traceback
    vp.print_cg_areas
    vp.train
  end
end
	require 'logger'
	require 'net/http'
	require 'open-uri'

	#Roshan Khan
	#CS 590B
	#HW4 - HMM

	class Fasta
	attr_accessor :dna, :html

	def initialize(code)
	p "Retrieving Fasta for #{code}"
	@html = File.open(code).read
	@dna = html.sub(/\A[^\n]*\n/, '')
	@dna.gsub!(/[^ACGT]/, 'T')
	p "Fasta retrieved, length #{@dna.length}"
	end
	end

	class ViterbiPath

	STATES = ['STANDARD', 'CG_RICH']
	OBS = ['A', 'C', 'G', 'T']
	START = {'STANDARD' , 0.9999, 'CG_RICH' , 0.0001}
	TRANS = {'STANDARD' , {'STANDARD' , 0.9999, 'CG_RICH' , 0.0001} , 'CG_RICH' , {'STANDARD' , 0.01, 'CG_RICH' , 0.99} }
	#TRANS = {'STANDARD' , {'STANDARD' , 0.6, 'CG_RICH' , 0.4} , 'CG_RICH' , {'STANDARD' , 0.2, 'CG_RICH' , 0.8} }
	EMIT = {'STANDARD' , { 'A' , 0.25, 'C' , 0.25, 'G' , 0.25, 'T',0.25 }, 'CG_RICH' , {'A' , 0.2, 'C' , 0.3, 'G' , 0.3, 'T',0.2} }


	# Initalize data structure for holding calculations. States x Dna array
	def initialize (dna)
	@dna = dna
	end

	def traceback
	@path = []
	last_parent = ""
	# Find highest value in the last node, to see where to start trace
	if (@vit[@vit.length-1]['STANDARD'] > @vit[@vit.length-1]['CG_RICH'])
	p "Probability of this path is " + @vit[@vit.length-1]['STANDARD'].to_s
	@path << @vit[@vit.length-1]['STANDARDparent']
	lastparent = @vit[@vit.length-1]['STANDARDparent']
	else
	p "Probability of this path is " + @vit[@vit.length-1]['CG_RICH'].to_s
	@path << @vit[@vit.length-1]['CG_RICHparent']
	lastparent = @vit[@vit.length-1]['CG_RICHparent']
	end


	# Now count back from last node - 1
	count = @vit.length - 2
	while (count >= 0)
	# New path is viterbi sequence position, and look up parent value
	@path << @vit[count][lastparent+'parent']
	lastparent = @vit[count][lastparent+'parent']
	count -= 1
	end
	@path = @path.reverse
	end

	def train
	p "Initiating training"

	tc = { "STANDARD", {"STANDARD", 0.0, "CG_RICH", 0.0}, "CG_RICH", {"STANDARD", 0.0, "CG_RICH", 0.0} }
	ec = {'STANDARD' , { 'A' , 0.0, 'C' , 0.0, 'G' , 0.0, 'T',0.0 }, 'CG_RICH' , {'A' , 0.0, 'C' , 0.0, 'G' , 0.0, 'T',0.0} }

	cur = @path[1]

	for i in (2...@path.length)
	tc[cur][@path[i]] += 1
	ec[cur][@dna[i].chr] += 1
	cur = @path[i]
	end

	standard_total = tc["STANDARD"]["STANDARD"] + tc["STANDARD"]["CG_RICH"]
	cg_total = tc["CG_RICH"]["CG_RICH"] + tc["CG_RICH"]["STANDARD"]

	TRANS["STANDARD"]["STANDARD"] = tc["STANDARD"]["STANDARD"] / standard_total
	TRANS["STANDARD"]["CG_RICH"] = tc["STANDARD"]["CG_RICH"] / standard_total
	TRANS["CG_RICH"]["STANDARD"] = tc["CG_RICH"]["STANDARD"] / cg_total
	TRANS["CG_RICH"]["CG_RICH"] = tc["CG_RICH"]["CG_RICH"] / cg_total

	s_total = ec["STANDARD"]["A"] + ec["STANDARD"]["C"] + ec["STANDARD"]["G"] + ec["STANDARD"]["T"]
	g_total = ec["CG_RICH"]["A"] + ec["CG_RICH"]["C"] + ec["CG_RICH"]["G"] + ec["CG_RICH"]["T"]

	EMIT["STANDARD"]["A"] = ec["STANDARD"]["A"] / s_total
	EMIT["STANDARD"]["C"] = ec["STANDARD"]["C"] / s_total
	EMIT["STANDARD"]["G"] = ec["STANDARD"]["G"] / s_total
	EMIT["STANDARD"]["T"] = ec["STANDARD"]["T"] / s_total

	EMIT["CG_RICH"]["A"] = ec["CG_RICH"]["A"] / g_total
	EMIT["CG_RICH"]["C"] = ec["CG_RICH"]["C"] / g_total
	EMIT["CG_RICH"]["G"] = ec["CG_RICH"]["G"] / g_total
	EMIT["CG_RICH"]["T"] = ec["CG_RICH"]["T"] / g_total

	p "New parameters are"
	p "TRANSITION"
	p TRANS
	p "EMISSION"
	p EMIT
	end

	# 1->1 = 1->1 / 1->1+1->2
	# 1->2 = 1->2 / 1->1+1->2
	# 2->1 = 2->1/ 2->1 + 2->2
	# 2->2 = 2->2 / 2->2 + 2->2
	# same thing for distribution

	def print_cg_areas
	in_cg = false
	start_node = 0
	len = 0
	hits = 0
	@path.each_with_index do \|state, index\|
	if (state == "CG_RICH")
	if (in_cg == true)
	len+=1
	else
	in_cg = true
	start_node = index
	len=1
	end
	else
	if (in_cg == true)
	hits += 1
	# FINISHED A CG_RICH AREA - PRINT THIS
	p "CG_Rich area from #{start_node} to #{start_node + len} which is #{len} bp long."
	odds = @vit[index]['CG_RICH']
	end
	in_cg = false
	len = 0
	end
	end
	p "Total number of hits in this sequence is #{hits}"
	end

	def longest_rich_area
	in_cg = false
	start_node = 0
	longest_start = 0
	longest_len = 0
	len = 0
	@path.each_with_index do \|state, index\|
	if (state == "CG_RICH")
	if (in_cg == true)
	len+=1
	else
	in_cg = true
	start_node = index
	len=1
	end
	else
	in_cg = false
	len = 0
	end

	if (longest_len < len)
	longest_len = len
	longest_start = start_node
	end
	end

	p longest_len
	p longest_start
	#p @path[longest_start...longest_start+longest_len]
	end

	#TODO: Change this function name
	# Calculate state values across all nodes in the chain
	def predict()
	@vit = Array.new()
	@vit << {}
	for state in STATES
	@vit[0][state] = Math.log(START[state] * EMIT[state][@dna[0].chr])
	end
	start = Time.now
	for t in (1...@dna.length-1)
	@vit << {}
	if (t % 100000 == 0)
	p "Completed #{t} in #{Time.now - start} seconds"
	end
	for y in STATES
	values = []
	for y0 in STATES
	values << [@vit[t-1][y0] + Math.log(TRANS[y0][y] * EMIT[y][@dna[t].chr]), y0]
	end
	@vit[t][y] = values.max[0]
	@vit[t][y + 'parent'] = values.max[1]
	end
	end
	end
	end

	if __FILE__ == $0
	f = Fasta.new(ARGV[0])
	vp = ViterbiPath.new(f.dna)
	(0...10).each do \|i\|
	p "---------------------------- ITERATION #{i+1} ----------------------------"
	vp.predict
	vp.traceback
	vp.print_cg_areas
	vp.train
	end
	end