briandoll/correlations.rb

## correlations.rb
# Computes, for every term in a dataset, how often each other term appears
# alongside it.
#
# The dataset is an array of lists of terms. For a term A and a term B that
# were seen together at least once, the correlation of A with B is
#   (number of lists containing both A and B) / (number of lists containing A)
class Correlations

  # the dataset in question: an array of lists of terms
  attr_writer :dataset

  # Count the number of instances an individual term appears
  #
  # Example:
  # {"Ruby" => 3, "PHP" => 2, "C++" => 2}
  attr_accessor :total_appearance_count

  # Count the number of instances a given pair was seen together.
  # Keys are two-element arrays in sorted order.
  #
  # Example:
  # {["PHP", "Ruby"] => 2, ["C", "PHP"] => 1}
  attr_accessor :seen_together_count

  def initialize(dataset)
    self.dataset = dataset
    reset_counts
  end

  # Mark the appearance of each element in the list.
  # Returns the list.
  def count_appearance(list)
    list.each { |item| @total_appearance_count[item] += 1 }
  end

  # Mark the appearance of a given pair and return its new count.
  #
  # The counter is keyed on a sorted copy so that ["Ruby", "C"] and
  # ["C", "Ruby"] share one entry. A copy (rather than sort!) is used so the
  # caller's array is not reordered and the hash key cannot be mutated later.
  def count_appearance_of_pair(pair)
    @seen_together_count[pair.sort] += 1
  end

  # Count up the dataset.
  #
  # Both counters are reset first, so calling this (or #correlate) more than
  # once does not inflate the counts. A nil dataset is treated as empty.
  def count_dataset
    reset_counts
    (@dataset || []).each do |list|
      # A term repeated inside one list still counts as a single appearance;
      # otherwise it would be paired with itself and ratios could exceed 100%.
      terms = list.uniq
      count_appearance(terms)
      terms.combination(2) { |pair| count_appearance_of_pair(pair) }
    end
  end

  # Generate the correlations.
  #
  # print - when true (the default), each correlation is written to stdout.
  #
  # Returns the sorted array of Correlation objects whether or not they were
  # printed.
  def correlate(print = true)
    puts "Errrrrr no dataset?" if @dataset.nil? || @dataset.empty?
    count_dataset
    correlations = build_correlations
    # print_correlations returns the sorted array it iterated over
    return print_correlations(correlations) if print
    correlations.sort
  end

  # Write the correlations to stdout in sorted order, one per line.
  # Returns the sorted array.
  def print_correlations(correlations)
    correlations.sort.each { |c| puts c.to_s }
  end

  private

  # Start both counters from scratch; missing keys read as 0.
  def reset_counts
    self.total_appearance_count = Hash.new(0)
    self.seen_together_count = Hash.new(0)
  end

  # Build one Correlation per (term, partner) direction of every counted pair.
  def build_correlations
    correlations = []
    @total_appearance_count.each do |item, count|
      @seen_together_count.each do |pair, pair_count|
        next unless pair.include?(item)
        partner = (pair - [item]).first
        # to_f forces float division so the ratio is not truncated to 0 or 1
        correlations << Correlation.new(item, partner, pair_count / count.to_f)
      end
    end
    correlations
  end

end

# A directed relationship between two elements: how strongly 'to' is
# associated with 'from'.
class Correlation

  # the primary element
  attr_accessor :from

  # the possibly correlated element
  attr_accessor :to

  # the relationship 'from' to 'to', as a ratio (1.0 is rendered as 100.0%)
  attr_accessor :correlation

  def initialize(from, to, correlation)
    self.from, self.to, self.correlation = from, to, correlation
  end

  # sort by 'from' alphabetically, then on descending correlation
  #
  # The name and the number are compared separately. Comparing them as one
  # concatenated string lets the "-" of the numeric part be compared against
  # the next character of a longer name, which put "C#" and "C++" ahead of "C".
  def <=>(other)
    by_name = from.to_s <=> other.from.to_s
    return by_name unless by_name.zero?
    # operands swapped so that the higher correlation sorts first
    other.correlation <=> correlation
  end

  # format as percentage with one decimal precision
  def correlation_percentage
    sprintf('%.1f', correlation * 100)
  end

  # tell the story
  def to_s
    "#{from} is #{correlation_percentage}% correlated with #{to}"
  end
end

### Input
# sample = [
#   ["Ruby", "PHP", "C", "JavaScript"],
#   ["Ruby", "C"],
#   ["Ruby", "C", "Forth"],
#   ["C", "JavaScript"],
#   ["PHP", "JavaScript"],
#   ["Ruby", "C"],
# ]

# ## Run it
# correlations = Correlations.new(sample)
# correlations.correlate

### Produces:
# C is 80.0% correlated with Ruby
# C is 40.0% correlated with JavaScript
# C is 20.0% correlated with Forth
# C is 20.0% correlated with PHP
# Forth is 100.0% correlated with Ruby
# Forth is 100.0% correlated with C
# JavaScript is 66.7% correlated with PHP
# JavaScript is 66.7% correlated with C
# JavaScript is 33.3% correlated with Ruby
# PHP is 100.0% correlated with JavaScript
# PHP is 50.0% correlated with Ruby
# PHP is 50.0% correlated with C
# Ruby is 100.0% correlated with C
# Ruby is 25.0% correlated with JavaScript
# Ruby is 25.0% correlated with PHP
# Ruby is 25.0% correlated with Forth
	# Computes, for every term in a dataset, how often each other term appears
	# alongside it.
	#
	# The dataset is an array of lists of terms. For a term A and a term B that
	# were seen together at least once, the correlation of A with B is
	#   (number of lists containing both A and B) / (number of lists containing A)
	class Correlations

	  # the dataset in question: an array of lists of terms
	  attr_writer :dataset

	  # Count the number of instances an individual term appears
	  #
	  # Example:
	  # {"Ruby" => 3, "PHP" => 2, "C++" => 2}
	  attr_accessor :total_appearance_count

	  # Count the number of instances a given pair was seen together.
	  # Keys are two-element arrays in sorted order.
	  #
	  # Example:
	  # {["PHP", "Ruby"] => 2, ["C", "PHP"] => 1}
	  attr_accessor :seen_together_count

	  def initialize(dataset)
	    self.dataset = dataset
	    reset_counts
	  end

	  # Mark the appearance of each element in the list.
	  # Returns the list.
	  def count_appearance(list)
	    list.each { |item| @total_appearance_count[item] += 1 }
	  end

	  # Mark the appearance of a given pair and return its new count.
	  #
	  # The counter is keyed on a sorted copy so that ["Ruby", "C"] and
	  # ["C", "Ruby"] share one entry. A copy (rather than sort!) is used so the
	  # caller's array is not reordered and the hash key cannot be mutated later.
	  def count_appearance_of_pair(pair)
	    @seen_together_count[pair.sort] += 1
	  end

	  # Count up the dataset.
	  #
	  # Both counters are reset first, so calling this (or #correlate) more than
	  # once does not inflate the counts. A nil dataset is treated as empty.
	  def count_dataset
	    reset_counts
	    (@dataset || []).each do |list|
	      # A term repeated inside one list still counts as a single appearance;
	      # otherwise it would be paired with itself and ratios could exceed 100%.
	      terms = list.uniq
	      count_appearance(terms)
	      terms.combination(2) { |pair| count_appearance_of_pair(pair) }
	    end
	  end

	  # Generate the correlations.
	  #
	  # print - when true (the default), each correlation is written to stdout.
	  #
	  # Returns the sorted array of Correlation objects whether or not they were
	  # printed.
	  def correlate(print = true)
	    puts "Errrrrr no dataset?" if @dataset.nil? || @dataset.empty?
	    count_dataset
	    correlations = build_correlations
	    # print_correlations returns the sorted array it iterated over
	    return print_correlations(correlations) if print
	    correlations.sort
	  end

	  # Write the correlations to stdout in sorted order, one per line.
	  # Returns the sorted array.
	  def print_correlations(correlations)
	    correlations.sort.each { |c| puts c.to_s }
	  end

	  private

	  # Start both counters from scratch; missing keys read as 0.
	  def reset_counts
	    self.total_appearance_count = Hash.new(0)
	    self.seen_together_count = Hash.new(0)
	  end

	  # Build one Correlation per (term, partner) direction of every counted pair.
	  def build_correlations
	    correlations = []
	    @total_appearance_count.each do |item, count|
	      @seen_together_count.each do |pair, pair_count|
	        next unless pair.include?(item)
	        partner = (pair - [item]).first
	        # to_f forces float division so the ratio is not truncated to 0 or 1
	        correlations << Correlation.new(item, partner, pair_count / count.to_f)
	      end
	    end
	    correlations
	  end

	end

	# A directed relationship between two elements: how strongly 'to' is
	# associated with 'from'.
	class Correlation

	  # the primary element
	  attr_accessor :from

	  # the possibly correlated element
	  attr_accessor :to

	  # the relationship 'from' to 'to', as a ratio (1.0 is rendered as 100.0%)
	  attr_accessor :correlation

	  def initialize(from, to, correlation)
	    self.from, self.to, self.correlation = from, to, correlation
	  end

	  # sort by 'from' alphabetically, then on descending correlation
	  #
	  # The name and the number are compared separately. Comparing them as one
	  # concatenated string lets the "-" of the numeric part be compared against
	  # the next character of a longer name, which put "C#" and "C++" ahead of "C".
	  def <=>(other)
	    by_name = from.to_s <=> other.from.to_s
	    return by_name unless by_name.zero?
	    # operands swapped so that the higher correlation sorts first
	    other.correlation <=> correlation
	  end

	  # format as percentage with one decimal precision
	  def correlation_percentage
	    sprintf('%.1f', correlation * 100)
	  end

	  # tell the story
	  def to_s
	    "#{from} is #{correlation_percentage}% correlated with #{to}"
	  end
	end

	### Input
	# sample = [
	# ["Ruby", "PHP", "C", "JavaScript"],
	# ["Ruby", "C"],
	# ["Ruby", "C", "Forth"],
	# ["C", "JavaScript"],
	# ["PHP", "JavaScript"],
	# ["Ruby", "C"],
	# ]

	# ## Run it
	# correlations = Correlations.new(sample)
	# correlations.correlate

	### Produces:
	# C is 80.0% correlated with Ruby
	# C is 40.0% correlated with JavaScript
	# C is 20.0% correlated with Forth
	# C is 20.0% correlated with PHP
	# Forth is 100.0% correlated with Ruby
	# Forth is 100.0% correlated with C
	# JavaScript is 66.7% correlated with PHP
	# JavaScript is 66.7% correlated with C
	# JavaScript is 33.3% correlated with Ruby
	# PHP is 100.0% correlated with JavaScript
	# PHP is 50.0% correlated with Ruby
	# PHP is 50.0% correlated with C
	# Ruby is 100.0% correlated with C
	# Ruby is 25.0% correlated with JavaScript
	# Ruby is 25.0% correlated with PHP
	# Ruby is 25.0% correlated with Forth