Created
March 15, 2012 05:38
-
-
Save briandoll/2042201 to your computer and use it in GitHub Desktop.
Correlation between terms in an array of arrays
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Computes pairwise co-occurrence ratios between the terms of a dataset.
#
# The dataset is an array of arrays (rows) of terms. For every pair of terms
# that was seen together in at least one row, two Correlation objects are
# produced (one per direction): the fraction of the rows containing the
# first term that also contain the second.
class Correlations
  # the dataset in question (an array of arrays of terms)
  attr_writer :dataset

  # Count the number of instances an individual term appears
  #
  # Example:
  #   {"Ruby" => 3, "PHP" => 2, "C++" => 2}
  attr_accessor :total_appearance_count

  # Count the number of instances a given pair was seen together
  #
  # Example:
  #   {["PHP", "Ruby"] => 2, ["C", "PHP"] => 1}
  attr_accessor :seen_together_count

  # dataset - an array of arrays of terms; nil is treated as "no dataset".
  def initialize(dataset)
    self.dataset = dataset
    reset_counts
  end

  # Mark the appearance of each element in the list
  def count_appearance(list)
    list.each { |item| @total_appearance_count[item] += 1 }
  end

  # Mark the appearance of a given pair.
  #
  # The pair is stored under its sorted form so that ["a", "b"] and
  # ["b", "a"] share one counter. A sorted copy is used as the key, so the
  # caller's array is left untouched.
  def count_appearance_of_pair(pair)
    @seen_together_count[pair.sort] += 1
  end

  # Count up the dataset.
  #
  # Counters are reset first, so calling this (or #correlate) repeatedly
  # yields the same totals instead of accumulating them.
  def count_dataset
    reset_counts
    (@dataset || []).each do |list|
      # A term repeated within one row still counts as one appearance;
      # otherwise it would be paired with itself and could push a
      # correlation above 100%.
      terms = list.uniq
      count_appearance(terms)
      terms.combination(2) { |pair| count_appearance_of_pair(pair) }
    end
  end

  # Generate the correlations.
  #
  # print - when true (the default) each correlation is also written to
  #         stdout via #print_correlations.
  #
  # Returns the sorted array of Correlation objects whether or not they
  # were printed. A missing or empty dataset prints a notice and returns
  # an empty array.
  def correlate(print = true)
    if @dataset.nil? || @dataset.empty?
      puts "Errrrrr no dataset?"
      return []
    end

    count_dataset

    # Each pair seen together yields one correlation per direction, so a
    # single pass over the pair counts is sufficient.
    correlations = []
    @seen_together_count.each do |(first, second), together|
      correlations << build_correlation(first, second, together)
      correlations << build_correlation(second, first, together)
    end

    return print_correlations(correlations) if print

    correlations.sort
  end

  # Print the correlations in sorted order, one per line.
  #
  # Returns the sorted array.
  def print_correlations(correlations)
    correlations.sort.each { |c| puts c.to_s }
  end

  private

  # Start both counters from zero.
  def reset_counts
    self.total_appearance_count = Hash.new(0)
    self.seen_together_count = Hash.new(0)
  end

  # Build the correlation from `item` to `other`: the share of `item`'s
  # appearances in which `other` was present too.
  def build_correlation(item, other, together)
    ratio = together / @total_appearance_count[item].to_f
    Correlation.new(item, other, ratio)
  end
end
# A directed relationship between two terms: how strongly `from` is
# correlated with `to`, expressed as a fraction (1.0 == 100%).
class Correlation
  # the primary element
  attr_accessor :from

  # the possibly correlated element
  attr_accessor :to

  # the relationship 'from' to 'to'
  attr_accessor :correlation

  def initialize(from, to, correlation)
    @from = from
    @to = to
    @correlation = correlation
  end

  # Sort by 'from' alphabetically, then on descending correlation.
  #
  # The name and the number are compared separately. Comparing a single
  # interpolated string lets the number's leading "-" take part in the name
  # comparison, which places names such as "C++" or "C#" ahead of "C".
  #
  # Returns -1, 0 or 1, or nil when `other` does not expose `from` and
  # `correlation`.
  def <=>(other)
    return nil unless other.respond_to?(:from) && other.respond_to?(:correlation)

    by_name = from.to_s <=> other.from.to_s
    return by_name unless by_name.zero?

    # operands swapped on purpose: a higher correlation sorts first
    other.correlation <=> correlation
  end

  # format as percentage with one decimal precision
  def correlation_percentage
    format('%.1f', correlation * 100)
  end

  # tell the story
  def to_s
    "#{from} is #{correlation_percentage}% correlated with #{to}"
  end
end
### Input | |
# sample = [ | |
# ["Ruby", "PHP", "C", "JavaScript"], | |
# ["Ruby", "C"], | |
# ["Ruby", "C", "Forth"], | |
# ["C", "JavaScript"], | |
# ["PHP", "JavaScript"], | |
# ["Ruby", "C"], | |
# ] | |
# ## Run it | |
# correlations = Correlations.new(sample) | |
# correlations.correlate | |
### Produces: | |
# C is 80.0% correlated with Ruby | |
# C is 40.0% correlated with JavaScript | |
# C is 20.0% correlated with Forth | |
# C is 20.0% correlated with PHP | |
# Forth is 100.0% correlated with Ruby | |
# Forth is 100.0% correlated with C | |
# JavaScript is 66.7% correlated with PHP | |
# JavaScript is 66.7% correlated with C | |
# JavaScript is 33.3% correlated with Ruby | |
# PHP is 100.0% correlated with JavaScript | |
# PHP is 50.0% correlated with Ruby | |
# PHP is 50.0% correlated with C | |
# Ruby is 100.0% correlated with C | |
# Ruby is 25.0% correlated with JavaScript | |
# Ruby is 25.0% correlated with PHP | |
# Ruby is 25.0% correlated with Forth |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment