# envp/goodnes_of_fit.rb

## goodnes_of_fit.rb
module GoodnessOfFit
  # All goodness of fit helpers go here

  ## Pearson's chi squared test for goodness of fit
  # == References
  # * http://stattrek.com/chi-square-test/goodness-of-fit.aspx?Tutorial=AP
  # * https://en.wikipedia.org/wiki/Degrees_of_freedom_(statistics)#Residuals
  #
  # The null hypothesis is always that the +candidate+ does belong to
  # the +target+ distribution.

  module PearsonChiSquared
    # Partitions the real line into +parts+ equal-width intervals spanning
    # +lower+..+upper+, plus one unbounded tail on each side
    # (an ad-hoc technique).
    #
    # lower:: lower bound of the finite range
    # upper:: upper bound of the finite range
    # parts:: number of equal-width intervals between +lower+ and +upper+
    #
    # Returns an Array of <tt>parts + 2</tt> two-element <tt>[low, high]</tt>
    # arrays: <tt>[-Infinity, lower]</tt>, the +parts+ inner intervals in
    # ascending order, and <tt>[upper, Infinity]</tt>. Inner bounds are
    # Floats because the width is computed with float division.
    def partitions(lower, upper, parts)
      chunk_size = (upper - lower) / parts.to_f

      # Build each [low, high] pair directly; adjacent intervals share a bound.
      chunks = (0...parts).map do |p|
        [lower + chunk_size * p, lower + chunk_size * (p + 1)]
      end

      [[-Float::INFINITY, lower], *chunks, [upper, Float::INFINITY]]
    end

    # Runs a Pearson chi squared goodness-of-fit check of samples drawn from
    # +candidate+ against the +target+ distribution.
    #
    # candidate::    object responding to +call+ (no arguments); each call
    #                yields one sample value
    # target::       object responding to +cdf(x)+ with a single argument;
    #                it is also called with -Infinity and +Infinity
    # significance:: threshold compared against the value returned by
    #                <tt>ChiSquared.chi2cdf</tt> (default 0.05)
    # sample_size::  number of samples drawn from +candidate+ (default 100)
    # bins::         number of equal-width intervals between the sample
    #                minimum and maximum (default 10)
    #
    # Returns +true+ when <tt>ChiSquared.chi2cdf(chi_sq, bins - 1)</tt> is
    # greater than +significance+, +false+ otherwise.
    #
    # Relies on <tt>Array#elems_between(low, high)</tt> and on
    # <tt>ChiSquared.chi2cdf</tt>; neither is core Ruby, so both are
    # presumably provided elsewhere in the project.
    def pearson_chi_squared(candidate:, target:, significance: 0.05, sample_size: 100, bins: 10)
      # Degrees of freedom
      # NOTE(review): partitions yields bins + 2 classes (two tails included),
      # so "classes - 1" would be bins + 1 -- confirm which is intended.
      freedoms = bins - 1

      # Collect sample data and the classes to divide the support into.
      # The sample is sorted, so first/last are its minimum/maximum.
      sample = Array.new(sample_size) { candidate.call }.sort
      groups = partitions(sample.first, sample.last, bins)

      # TODO: normalize the data to the target distribution defaults /
      # standard here (using the sample mean and variance) so that the CDF
      # can later be calculated with just 1 argument.

      # Group data into bins. map (not each_with_index) is required so that
      # the hashes built in the block are what gets collected.
      # NOTE(review): neighbouring intervals share a bound; whether a value on
      # a bound is counted once or twice depends on elems_between -- verify.
      binned = groups.map do |limits|
        { limits: limits, elems: sample.elems_between(*limits) }
      end

      # Find expected count for each bin and use that to determine the deviation
      chi_sq = binned.map { |bin|
        low, high = bin[:limits].first, bin[:limits].last
        exp_count = sample_size * (target.cdf(high) - target.cdf(low))
        obs_count = bin[:elems].count
        ((exp_count - obs_count)**2) / exp_count
      }.reduce(:+)

      # A trusted library is expected to provide this method.
      # NOTE(review): the original note said that for 9 degrees of freedom and
      # 0.05 significance the statistic must be lower than 3.33. That value is
      # the lower 5% quantile of chi squared(9); the usual upper-tail critical
      # value is 16.92. If chi2cdf is a plain CDF, the p-value would be
      # 1 - chi2cdf(...) -- confirm against the library before relying on this.
      alpha = ChiSquared.chi2cdf(chi_sq, freedoms)

      alpha > significance
    end
  end
end
	module GoodnessOfFit
	  # All goodness of fit helpers go here

	  ## Pearson's chi squared test for goodness of fit
	  # == References
	  # * http://stattrek.com/chi-square-test/goodness-of-fit.aspx?Tutorial=AP
	  # * https://en.wikipedia.org/wiki/Degrees_of_freedom_(statistics)#Residuals
	  #
	  # The null hypothesis is always that the +candidate+ does belong to
	  # the +target+ distribution.

	  module PearsonChiSquared
	    # Partitions the real line into +parts+ equal-width intervals spanning
	    # +lower+..+upper+, plus one unbounded tail on each side
	    # (an ad-hoc technique).
	    #
	    # lower:: lower bound of the finite range
	    # upper:: upper bound of the finite range
	    # parts:: number of equal-width intervals between +lower+ and +upper+
	    #
	    # Returns an Array of <tt>parts + 2</tt> two-element <tt>[low, high]</tt>
	    # arrays: <tt>[-Infinity, lower]</tt>, the +parts+ inner intervals in
	    # ascending order, and <tt>[upper, Infinity]</tt>. Inner bounds are
	    # Floats because the width is computed with float division.
	    def partitions(lower, upper, parts)
	      chunk_size = (upper - lower) / parts.to_f

	      # Build each [low, high] pair directly; adjacent intervals share a bound.
	      chunks = (0...parts).map do |p|
	        [lower + chunk_size * p, lower + chunk_size * (p + 1)]
	      end

	      [[-Float::INFINITY, lower], *chunks, [upper, Float::INFINITY]]
	    end

	    # Runs a Pearson chi squared goodness-of-fit check of samples drawn from
	    # +candidate+ against the +target+ distribution.
	    #
	    # candidate::    object responding to +call+ (no arguments); each call
	    #                yields one sample value
	    # target::       object responding to +cdf(x)+ with a single argument;
	    #                it is also called with -Infinity and +Infinity
	    # significance:: threshold compared against the value returned by
	    #                <tt>ChiSquared.chi2cdf</tt> (default 0.05)
	    # sample_size::  number of samples drawn from +candidate+ (default 100)
	    # bins::         number of equal-width intervals between the sample
	    #                minimum and maximum (default 10)
	    #
	    # Returns +true+ when <tt>ChiSquared.chi2cdf(chi_sq, bins - 1)</tt> is
	    # greater than +significance+, +false+ otherwise.
	    #
	    # Relies on <tt>Array#elems_between(low, high)</tt> and on
	    # <tt>ChiSquared.chi2cdf</tt>; neither is core Ruby, so both are
	    # presumably provided elsewhere in the project.
	    def pearson_chi_squared(candidate:, target:, significance: 0.05, sample_size: 100, bins: 10)
	      # Degrees of freedom
	      # NOTE(review): partitions yields bins + 2 classes (two tails included),
	      # so "classes - 1" would be bins + 1 -- confirm which is intended.
	      freedoms = bins - 1

	      # Collect sample data and the classes to divide the support into.
	      # The sample is sorted, so first/last are its minimum/maximum.
	      sample = Array.new(sample_size) { candidate.call }.sort
	      groups = partitions(sample.first, sample.last, bins)

	      # TODO: normalize the data to the target distribution defaults /
	      # standard here (using the sample mean and variance) so that the CDF
	      # can later be calculated with just 1 argument.

	      # Group data into bins. map (not each_with_index) is required so that
	      # the hashes built in the block are what gets collected.
	      # NOTE(review): neighbouring intervals share a bound; whether a value on
	      # a bound is counted once or twice depends on elems_between -- verify.
	      binned = groups.map do |limits|
	        { limits: limits, elems: sample.elems_between(*limits) }
	      end

	      # Find expected count for each bin and use that to determine the deviation
	      chi_sq = binned.map { |bin|
	        low, high = bin[:limits].first, bin[:limits].last
	        exp_count = sample_size * (target.cdf(high) - target.cdf(low))
	        obs_count = bin[:elems].count
	        ((exp_count - obs_count)**2) / exp_count
	      }.reduce(:+)

	      # A trusted library is expected to provide this method.
	      # NOTE(review): the original note said that for 9 degrees of freedom and
	      # 0.05 significance the statistic must be lower than 3.33. That value is
	      # the lower 5% quantile of chi squared(9); the usual upper-tail critical
	      # value is 16.92. If chi2cdf is a plain CDF, the p-value would be
	      # 1 - chi2cdf(...) -- confirm against the library before relying on this.
	      alpha = ChiSquared.chi2cdf(chi_sq, freedoms)

	      alpha > significance
	    end
	  end
	end