MaxGabriel/molecules.rb

## molecules.rb
#!/usr/bin/env ruby

require 'csv'
require 'set'

## HA (hyaluronic acid) molecule merging script
## Written by Max Tagher, licensed to the Public Domain under Creative Commons Zero 1.0

### Overview
# This script runs a simulation of HA molecules merging with one another.
# Given initial conditions, it will simulate a percentage of them merging with other molecules to create larger chains.

# The algorithm used is
# Step 1. Generation
# Generate NUMBER_OF_MOLECULES, each with a length calculated from MOLECULE_AVG_LENGTH and MOLECULE_STD_DEV, assuming a normal (gaussian) distribution.
# For each unit of the molecule, there will be a FAB_PERCENT chance that it will be a fab that can connect to other fabs, lengthening the chain.

# Step 2. Connection
# For each fab, there is a (AGGREGATION_PERCENT / 2.0) chance that it will connect to another fab.
# If this dice roll succeeds and there is an open fab on the molecule, there is a (AGGREGATION_PERCENT / 2.0) * CLOSENESS_FACTOR chance that the molecule connects to a fab on itself.

# Step 3. Output
# After merging the HA molecules, for each HA molecule the total length and number of fabs is recorded in a CSV. Output looks like
# Molecule Length, Fabs
# 54, 2
# 53, 0
# 115, 6
# (etc.)


### Running this script
# Install the Ruby programming language. Then run `ruby molecules.rb` from the directory the script is in.
# If you have access to a Mac or Linux machine you should maybe use that; it will have much better Ruby support, but Windows should still work.
# If you have trouble running the script, it could potentially be modified to run in a browser tool like tryruby.org. It would need to be upload the output CSV to somewhere online, which complicates things a little.


## Variables

# The filename to print results to. This can be a path as well (e.g. /tmp/molecules.csv).
CSV_OUTPUT_FILENAME = "molecules.csv"

NUMBER_OF_MOLECULES = 40000 # In practice this will be something like 1K to 10K
MOLECULE_AVG_LENGTH = 500 # Number of units in the molecule, not daltons. In practice will be ~500 units long. This value is given by the molecule vendor.
MOLECULE_STD_DEV = 10 # Standard Deviation in the length of the molecule. This value is given by the molecule vendor.


# The percent of the units that will be a fab, allowing them to connect to other molecules.
# Should be a value between 0 and 1. In practice will be between 2 and 7%
FAB_PERCENT = 0.07


# Each unit of HA can have a fab on it
  # Number of filled spots over number of total spots
# Aggregation % reaction by which dots find other dots
   # User input ranging from 0-100%, divided in two because each starting dot finds an ending dot
  # (Percent of material that ends up connected)

# The total percent of fabs that will end up connected to another fab.
# A value between 0 and 1. In practice, this will be between 5 and 10%
AGGREGATION_PERCENT = 0.10

# The factor by which a fab is more likely to merge with a fab on its own molecule (e.g. Being 5x more likely to merge with itself rather than a different molecule)
CLOSENESS_FACTOR = 5

# If true, the script will print each molecule, with a "." for empty, "◦" for disconnected fab, and "•" for a connected fab.
# Note: this doesn't take into account merging strings together.
PRETTY_PRINT_OUTPUT = false

# If true, print additional information as the script is run
DEBUG = false

# Potential issues:
# Right now after applying the closeness factor, and going on to run the regular aggregation percent calc, the siblings of the initial pair are still considered.

# Algorithm
# 1. pick a random fab
# 2. chance that that fab merges with itself
# 3. chance that that fab merges with another fab

class Fab
  attr_accessor :connected_fab
  attr_accessor :acid_string

  # todo add initializer

  def initialize(acid_string)
    self.acid_string = acid_string
  end

  def connected?
    return !disconnected?
  end

  def disconnected?
    return connected_fab.nil?
  end

  def connect!(other_fab)
    self.connected_fab = other_fab
    other_fab.connected_fab = self
  end

  def siblings
    acid_string.fabs.select { |f2| f2 != self }
  end

end

class AcidString
  # attr_accessor :length
  # attr_accessor :unit_array
  attr_accessor :fabs
  attr_accessor :length

  def initialize(length, fab_percent)
    # unit_array = []

    fab_count = (fab_percent * length).round
    fabs = Array.new(fab_count)

    fab_count.times do |n|
      fab = Fab.new(self)
      fabs[n] = fab
    end


    # length.times do # Can elimiate this overhead by just calculating fab_percent * length and making that many fabs.

    #   if Kernel.rand < fab_percent
    #     fab = Fab.new(self)
    #     unit_array << fab
    #     fabs << fab
    #   elsif DEBUG # Don't bother adding nils if not in debug mode, we're only adding for pretty printing.
    #     unit_array << nil
    #   end

    # end
    # self.unit_array = unit_array
    self.fabs = fabs
    self.length = length
  end

  def to_dot_string
    s = ""
    unit_array.each do |unit|
      if unit.nil?
        s << "."
      elsif unit.is_a? Fab
        s << (unit.connected? ? "•" : "◦")
      end
    end
    s
  end

end

class Script

  def self.run

    strings = Array.new(NUMBER_OF_MOLECULES)

    rng = RandomGaussian.new(MOLECULE_AVG_LENGTH, MOLECULE_STD_DEV)

    puts "Generating molecules..." if DEBUG

    fab_count = 0

    NUMBER_OF_MOLECULES.times do |i|
      length = rng.rand.round
      as = AcidString.new(length, FAB_PERCENT)
      fab_count += as.fabs.length
      strings[i] = as
    end


    all_fabs = Array.new(fab_count)
    current_index = 0


    strings.each do |s|
      s.fabs.each do |f|
        all_fabs[current_index] = f
        current_index += 1
      end
      # all_fabs += s.fabs
    end

    # fab_count = all_fabs.length
    puts "Done generating molecules. Total fabs: #{fab_count}" if DEBUG

    chance_to_run_connect_operation = AGGREGATION_PERCENT / 2.0
    chance_to_connect_with_self = chance_to_run_connect_operation * CLOSENESS_FACTOR

    # Make array of fabs
    # sample with index for the random fabs
    # Remove by index from the array
    # Run this (total fabs * FAB_PERCENT times)

    number_of_connection_operations_to_make = 0
    fab_count.times do
      number_of_connection_operations_to_make += 1 if Kernel.rand < chance_to_run_connect_operation
    end

    while number_of_connection_operations_to_make > 0
      fab1_index = rand(0..(all_fabs.length - 1))
      fab1 = all_fabs[fab1_index]

      # This fab is either already connected or will be at the end of this loop, so remove it.
      all_fabs.delete_at(fab1_index)

      # The fab may have already been connected by sibling pairing which doesn't remove the sibling fab
      # If so just move onto the next fab.
      if fab1.connected?
        next
      end

      if Kernel.rand < chance_to_connect_with_self && (siblings = fab1.siblings.select { |f2| f2.disconnected? }; siblings.length > 0)
        # puts "combining with self" if DEBUG
        fab1.connect!(siblings.sample)
      else
        # puts "combining with random fab" if DEBUG
        fab2_index = nil
        fab2 = nil


        while fab2.nil?
          # puts "Finding a random fab"
          rand_fab_index = rand(0..(all_fabs.length - 1))
          rand_fab = all_fabs[rand_fab_index]

          if rand_fab.connected?
            all_fabs.delete_at(rand_fab_index)
          else
            fab2_index = rand_fab_index
            fab2 = rand_fab
          end
        end

        fab1.connect!(fab2)
        all_fabs.delete_at(fab2_index)
      end

      number_of_connection_operations_to_make -= 1
      puts "Connection operations remaining #{number_of_connection_operations_to_make}..." if DEBUG && number_of_connection_operations_to_make % 1000 == 0
    end

    puts "Done making connections" if DEBUG

    string_set = strings.to_set
    grouped = []

    puts "Running for #{string_set.count} molecules..." if DEBUG

    while !string_set.empty?
      puts "Remaining molecules: #{string_set.count}" if DEBUG && string_set.count % 1000 == 0
      connected = find_connected_pairs_deep(string_set.first)
      string_set.subtract(connected)
      grouped << connected
    end

    puts "Generating CSV" if DEBUG

    CSV.open(CSV_OUTPUT_FILENAME, "wb", {:headers => ["Molecule Length", "Fabs"], :write_headers => true}) do |csv|
      grouped.each do |strings|
        total_length = strings.inject(0) { |sum, as| sum + as.length }
        fab_length = strings.inject(0) { |sum, as| sum + as.fabs.length }
        csv << [total_length, fab_length]
      end
    end

    if PRETTY_PRINT_OUTPUT
      strings.each do |s|
        puts s.to_dot_string
      end
    end

    nil

  end

  # Given an AcidString, find all the other AcidStrings it's connected to recursively, and return all the AcidStrings in a Set.
  def self.find_connected_pairs_deep(acid_string)
    to_add = Set.new([acid_string])
    final = Set.new


    while to_add.length > 0
      as1 = to_add.first
      acid_string.fabs.each do |fab|
        if fab.connected_fab
          as2 = fab.connected_fab.acid_string
          to_add << as2 unless final.include?(as2) || to_add.include?(as2)
        end
      end
      to_add.delete(as1)
      final << as1
    end

    return final
  end


  # def self.find_connected_pairs_deep(acid_string)
  #   s = Set.new
  #   self.find_connected_pairs(acid_string, s)
  #   return s
  # end

  # private

  # def self.find_connected_pairs(acid_string, set)
  #   return if set.include?(acid_string)
  #   set << acid_string


  #   return set
  # end


end


# Taken from http://stackoverflow.com/a/6178290/1176156
# (Public domain license)
class RandomGaussian
  def initialize(mean, stddev, rand_helper = lambda { Kernel.rand })
    @rand_helper = rand_helper
    @mean = mean
    @stddev = stddev
    @valid = false
    @next = 0
  end

  def rand
    if @valid then
      @valid = false
      return @next
    else
      @valid = true
      x, y = self.class.gaussian(@mean, @stddev, @rand_helper)
      @next = y
      return x
    end
  end

  private
  def self.gaussian(mean, stddev, rand)
    theta = 2 * Math::PI * rand.call
    rho = Math.sqrt(-2 * Math.log(1 - rand.call))
    scale = stddev * rho
    x = mean + scale * Math.cos(theta)
    y = mean + scale * Math.sin(theta)
    return x, y
  end
end

Script.run

# if __FILE__ == $0
#   Script.run
# end
	#!/usr/bin/env ruby

	require 'csv'
	require 'set'

	## HA (hyaluronic acid) molecule merging script
	## Written by Max Tagher, licensed to the Public Domain under Creative Commons Zero 1.0

	### Overview
	# This script runs a simulation of HA molecules merging with one another.
	# Given initial conditions, it will simulate a percentage of them merging with other molecules to create larger chains.

	# The algorithm used is
	# Step 1. Generation
	# Generate NUMBER_OF_MOLECULES, each with a length calculated from MOLECULE_AVG_LENGTH and MOLECULE_STD_DEV, assuming a normal (gaussian) distribution.
	# For each unit of the molecule, there will be a FAB_PERCENT chance that it will be a fab that can connect to other fabs, lengthening the chain.

	# Step 2. Connection
	# For each fab, there is a (AGGREGATION_PERCENT / 2.0) chance that it will connect to another fab.
	# If this dice roll succeeds and there is an open fab on the molecule, there is a (AGGREGATION_PERCENT / 2.0) * CLOSENESS_FACTOR chance that the molecule connects to a fab on itself.

	# Step 3. Output
	# After merging the HA molecules, for each HA molecule the total length and number of fabs is recorded in a CSV. Output looks like
	# Molecule Length, Fabs
	# 54, 2
	# 53, 0
	# 115, 6
	# (etc.)



	### Running this script
	# Install the Ruby programming language. Then run `ruby molecules.rb` from the directory the script is in.
	# If you have access to a Mac or Linux machine you should maybe use that; it will have much better Ruby support, but Windows should still work.
	# If you have trouble running the script, it could potentially be modified to run in a browser tool like tryruby.org. It would need to be upload the output CSV to somewhere online, which complicates things a little.


	## Variables

	# The filename to print results to. This can be a path as well (e.g. /tmp/molecules.csv).
	CSV_OUTPUT_FILENAME = "molecules.csv"

	NUMBER_OF_MOLECULES = 40000 # In practice this will be something like 1K to 10K
	MOLECULE_AVG_LENGTH = 500 # Number of units in the molecule, not daltons. In practice will be ~500 units long. This value is given by the molecule vendor.
	MOLECULE_STD_DEV = 10 # Standard Deviation in the length of the molecule. This value is given by the molecule vendor.



	# The percent of the units that will be a fab, allowing them to connect to other molecules.
	# Should be a value between 0 and 1. In practice will be between 2 and 7%
	FAB_PERCENT = 0.07


	# Each unit of HA can have a fab on it
	# Number of filled spots over number of total spots
	# Aggregation % reaction by which dots find other dots
	# User input ranging from 0-100%, divided in two because each starting dot finds an ending dot
	# (Percent of material that ends up connected)

	# The total percent of fabs that will end up connected to another fab.
	# A value between 0 and 1. In practice, this will be between 5 and 10%
	AGGREGATION_PERCENT = 0.10

	# The factor by which a fab is more likely to merge with a fab on its own molecule (e.g. Being 5x more likely to merge with itself rather than a different molecule)
	CLOSENESS_FACTOR = 5

	# If true, the script will print each molecule, with a "." for empty, "◦" for disconnected fab, and "•" for a connected fab.
	# Note: this doesn't take into account merging strings together.
	PRETTY_PRINT_OUTPUT = false

	# If true, print additional information as the script is run
	DEBUG = false

	# Potential issues:
	# Right now after applying the closeness factor, and going on to run the regular aggregation percent calc, the siblings of the initial pair are still considered.

	# Algorithm
	# 1. pick a random fab
	# 2. chance that that fab merges with itself
	# 3. chance that that fab merges with another fab

	class Fab
	attr_accessor :connected_fab
	attr_accessor :acid_string

	# todo add initializer

	def initialize(acid_string)
	self.acid_string = acid_string
	end

	def connected?
	return !disconnected?
	end

	def disconnected?
	return connected_fab.nil?
	end

	def connect!(other_fab)
	self.connected_fab = other_fab
	other_fab.connected_fab = self
	end

	def siblings
	acid_string.fabs.select { \|f2\| f2 != self }
	end

	end

	class AcidString
	# attr_accessor :length
	# attr_accessor :unit_array
	attr_accessor :fabs
	attr_accessor :length

	def initialize(length, fab_percent)
	# unit_array = []

	fab_count = (fab_percent * length).round
	fabs = Array.new(fab_count)

	fab_count.times do \|n\|
	fab = Fab.new(self)
	fabs[n] = fab
	end


	# length.times do # Can elimiate this overhead by just calculating fab_percent * length and making that many fabs.

	# if Kernel.rand < fab_percent
	# fab = Fab.new(self)
	# unit_array << fab
	# fabs << fab
	# elsif DEBUG # Don't bother adding nils if not in debug mode, we're only adding for pretty printing.
	# unit_array << nil
	# end

	# end
	# self.unit_array = unit_array
	self.fabs = fabs
	self.length = length
	end

	def to_dot_string
	s = ""
	unit_array.each do \|unit\|
	if unit.nil?
	s << "."
	elsif unit.is_a? Fab
	s << (unit.connected? ? "•" : "◦")
	end
	end
	s
	end

	end

	class Script

	def self.run

	strings = Array.new(NUMBER_OF_MOLECULES)

	rng = RandomGaussian.new(MOLECULE_AVG_LENGTH, MOLECULE_STD_DEV)

	puts "Generating molecules..." if DEBUG

	fab_count = 0

	NUMBER_OF_MOLECULES.times do \|i\|
	length = rng.rand.round
	as = AcidString.new(length, FAB_PERCENT)
	fab_count += as.fabs.length
	strings[i] = as
	end



	all_fabs = Array.new(fab_count)
	current_index = 0


	strings.each do \|s\|
	s.fabs.each do \|f\|
	all_fabs[current_index] = f
	current_index += 1
	end
	# all_fabs += s.fabs
	end

	# fab_count = all_fabs.length
	puts "Done generating molecules. Total fabs: #{fab_count}" if DEBUG

	chance_to_run_connect_operation = AGGREGATION_PERCENT / 2.0
	chance_to_connect_with_self = chance_to_run_connect_operation * CLOSENESS_FACTOR

	# Make array of fabs
	# sample with index for the random fabs
	# Remove by index from the array
	# Run this (total fabs * FAB_PERCENT times)

	number_of_connection_operations_to_make = 0
	fab_count.times do
	number_of_connection_operations_to_make += 1 if Kernel.rand < chance_to_run_connect_operation
	end

	while number_of_connection_operations_to_make > 0
	fab1_index = rand(0..(all_fabs.length - 1))
	fab1 = all_fabs[fab1_index]

	# This fab is either already connected or will be at the end of this loop, so remove it.
	all_fabs.delete_at(fab1_index)

	# The fab may have already been connected by sibling pairing which doesn't remove the sibling fab
	# If so just move onto the next fab.
	if fab1.connected?
	next
	end

	if Kernel.rand < chance_to_connect_with_self && (siblings = fab1.siblings.select { \|f2\| f2.disconnected? }; siblings.length > 0)
	# puts "combining with self" if DEBUG
	fab1.connect!(siblings.sample)
	else
	# puts "combining with random fab" if DEBUG
	fab2_index = nil
	fab2 = nil


	while fab2.nil?
	# puts "Finding a random fab"
	rand_fab_index = rand(0..(all_fabs.length - 1))
	rand_fab = all_fabs[rand_fab_index]

	if rand_fab.connected?
	all_fabs.delete_at(rand_fab_index)
	else
	fab2_index = rand_fab_index
	fab2 = rand_fab
	end
	end

	fab1.connect!(fab2)
	all_fabs.delete_at(fab2_index)
	end

	number_of_connection_operations_to_make -= 1
	puts "Connection operations remaining #{number_of_connection_operations_to_make}..." if DEBUG && number_of_connection_operations_to_make % 1000 == 0
	end

	puts "Done making connections" if DEBUG

	string_set = strings.to_set
	grouped = []

	puts "Running for #{string_set.count} molecules..." if DEBUG

	while !string_set.empty?
	puts "Remaining molecules: #{string_set.count}" if DEBUG && string_set.count % 1000 == 0
	connected = find_connected_pairs_deep(string_set.first)
	string_set.subtract(connected)
	grouped << connected
	end

	puts "Generating CSV" if DEBUG

	CSV.open(CSV_OUTPUT_FILENAME, "wb", {:headers => ["Molecule Length", "Fabs"], :write_headers => true}) do \|csv\|
	grouped.each do \|strings\|
	total_length = strings.inject(0) { \|sum, as\| sum + as.length }
	fab_length = strings.inject(0) { \|sum, as\| sum + as.fabs.length }
	csv << [total_length, fab_length]
	end
	end

	if PRETTY_PRINT_OUTPUT
	strings.each do \|s\|
	puts s.to_dot_string
	end
	end

	nil

	end

	# Given an AcidString, find all the other AcidStrings it's connected to recursively, and return all the AcidStrings in a Set.
	def self.find_connected_pairs_deep(acid_string)
	to_add = Set.new([acid_string])
	final = Set.new


	while to_add.length > 0
	as1 = to_add.first
	acid_string.fabs.each do \|fab\|
	if fab.connected_fab
	as2 = fab.connected_fab.acid_string
	to_add << as2 unless final.include?(as2) \|\| to_add.include?(as2)
	end
	end
	to_add.delete(as1)
	final << as1
	end

	return final
	end


	# def self.find_connected_pairs_deep(acid_string)
	# s = Set.new
	# self.find_connected_pairs(acid_string, s)
	# return s
	# end

	# private

	# def self.find_connected_pairs(acid_string, set)
	# return if set.include?(acid_string)
	# set << acid_string


	# return set
	# end


	end



	# Taken from http://stackoverflow.com/a/6178290/1176156
	# (Public domain license)
	class RandomGaussian
	def initialize(mean, stddev, rand_helper = lambda { Kernel.rand })
	@rand_helper = rand_helper
	@mean = mean
	@stddev = stddev
	@valid = false
	@next = 0
	end

	def rand
	if @valid then
	@valid = false
	return @next
	else
	@valid = true
	x, y = self.class.gaussian(@mean, @stddev, @rand_helper)
	@next = y
	return x
	end
	end

	private
	def self.gaussian(mean, stddev, rand)
	theta = 2 * Math::PI * rand.call
	rho = Math.sqrt(-2 * Math.log(1 - rand.call))
	scale = stddev * rho
	x = mean + scale * Math.cos(theta)
	y = mean + scale * Math.sin(theta)
	return x, y
	end
	end

	Script.run

	# if __FILE__ == $0
	# Script.run
	# end