# This script implements a very basic PCA (Principal Component Analysis).
# PCA is often used as a statistical method on data sets to reduce
# dimensionality / complexity. This is done by finding new features which
# maximize variance. A PCA can reduce the data down to any number of
# dimensions, but loses information in the process. However, most of the
# time, dimensionality can be reduced without any significant loss.
#
# https://en.wikipedia.org/wiki/Principal_component_analysis

# Since we do not want to implement an equation solver (for calculating
# eigenvectors and eigenvalues), we use Ruby's standard 'matrix' library.
require "matrix"

# Ruby's standard 'matrix' library does not support normalization,
# standardization, covariance and so on, out of the box. Thus, we extend
# the Matrix class by reopening it.
class Matrix
  # Standardizes all matrix values. Also often referred to as 'Z-Score'.
  # This helps when dealing with features on different scales. It is
  # calculated as:
  #
  #   Z-Score = (x - u) / o
  #
  # where x is the value, u is the mean and o is the standard deviation.
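  #
  # As a quick worked example (using the first feature of the demo dataset
  # below, values 2, 4, 6, 5, 7): the mean is 4.8 and the sample standard
  # deviation is roughly 1.92, so the value 2 standardizes to
  # (2 - 4.8) / 1.92 ≈ -1.46.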
  def standardize
    # Get the mean for each column
    means = column_means

    # Get the standard deviation for each column
    standard_deviations = column_standard_deviations

    # Iterate through all values and calculate the standardized value
    values = row_size.times.map do |yi|
      column_size.times.map do |xi|
        (self[yi, xi] - means[xi]) / standard_deviations[xi]
      end
    end

    # Return new standardized matrix
    new_matrix(values)
  end

  # This is a rather naive implementation to compute the covariance matrix.
  # Faster computation methods exist and could be added in the future.
  # In a covariance matrix, a positive value means that two features are
  # correlated (increasing one increases the other), while a negative value
  # means that two features are inversely correlated (increasing one
  # decreases the other). The result is always a dxd matrix where d is the
  # number of columns/features. The values are computed like this:
  #
  #   Covariance[x, y] = 1/n * sum(i) { (xi - xm) * (yi - ym) }
  #
  # where n is the number of rows, xi is the i-th value of the feature x,
  # yi is the i-th value of the feature y, and xm and ym are the means of
  # those features.
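  #
  # As a quick worked example (using the two features of the demo dataset
  # below, x = [2, 4, 6, 5, 7] and y = [3, 4, 5, 7, 8]): the means are 4.8
  # and 5.4, the products of the deviations sum to 13.4, and so
  # Covariance[x, y] = 13.4 / 5 = 2.68 on the raw (unstandardized) values.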
  def covariance
    # Get the mean for each column
    means = column_means

    # Calculate covariance for each feature combination
    values = column_size.times.map do |yi|
      column_size.times.map do |xi|
        x = column(xi) - Vector.elements([means[xi]] * row_size)
        y = column(yi) - Vector.elements([means[yi]] * row_size)
        x.inner_product(y) / row_size.to_f
      end
    end

    # Return new covariance matrix
    new_matrix(values)
  end

  private

  def column_means
    column_vectors.map { |v| v.sum.to_f / row_size }
  end

  def column_standard_deviations
    column_vectors.map { |v| standard_deviation(v) }
  end

  # Calculates the standard deviation. Please note that NumPy uses a
  # ddof=0 parameter, which by default computes the variance as
  # (sum / size) and not as (sum / (size - 1)). MATLAB, however, normalizes
  # by size - 1, as does this implementation.
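  #
  # For example, for the values [2, 4, 6, 5, 7] the squared deviations from
  # the mean (4.8) sum to 14.8, so dividing by size gives 14.8 / 5 = 2.96,
  # while dividing by (size - 1), as done here, gives 14.8 / 4 = 3.7.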
  def standard_deviation(vector)
    # Use to_f to avoid integer division when the input values are Integers
    mean = vector.sum.to_f / vector.size
    sum = vector.sum { |v| (v - mean) ** 2 }
    variance = sum / (vector.size - 1)
    Math.sqrt(variance)
  end
end

class PCA
  def initialize(data)
    # Init matrix
    @data = Matrix.rows(data)

    # For a PCA, the first step is usually to standardize the data.
    # This is done because otherwise one feature could 'outscale' others.
    @data = @data.standardize

    # Using the standardized matrix, we can now compute the covariance matrix
    # to get the correlations between all features
    covariance_matrix = @data.covariance

    # Compute eigenvectors and eigenvalues from the covariance matrix.
    # Eigenvectors with the highest eigenvalues carry the most information.
    # Ruby already sorts the vectors by eigenvalues in ascending order, thus
    # we can later drop eigenvectors with low eigenvalues.
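    # Note (illustrative sketch): Matrix#eigensystem destructures into the
    # eigenvector matrix (eigenvectors as columns), a diagonal matrix with
    # the eigenvalues on its diagonal, and the inverse of the eigenvector
    # matrix. E.g. Matrix[[2.0, 1.0], [1.0, 2.0]].eigensystem yields
    # eigenvalues of roughly 1.0 and 3.0 on the diagonal (ascending).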
    @eigenvectors, @eigenvalues, _ = covariance_matrix.eigensystem
  end

  def transform(number_of_components:)
    # Guard that number_of_components does not exceed the number of original
    # features. We cannot keep more features than we have ;)
    if number_of_components > @eigenvalues.column_size
      raise "Number of components has to be <= #{@eigenvalues.column_size}."
    end

    # Combine the N best eigenvectors as 'new features' into a new matrix.
    # Since they are already sorted in ascending order, we can simply
    # take the last N (number_of_components) vectors and reverse them.
    # This is also the step where the information loss happens.
    eigenvector_subset = Matrix.columns(
      @eigenvectors.column_vectors.reverse.first(number_of_components)
    )

    # Transform the original data by multiplying the new eigenvector subset
    # onto the original dataset. Return the transformed dataset.
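    # Shape sketch (illustrative): with the 5x2 demo dataset below reduced to
    # one component, eigenvector_subset.transpose is 1x2 and @data.transpose
    # is 2x5, so the product is 1x5 and the final transposed result is 5x1,
    # i.e. one value per original entry.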
    (eigenvector_subset.transpose * @data.transpose).transpose
  end

  def inspect
    # Since each eigenvalue is proportional to the variance along its
    # component, we can calculate the relevance of each component by
    # normalizing its eigenvalue with the sum of all eigenvalues. We
    # accumulate these ratios so that we know the impact that any number
    # of components has on the transformation.
    (1..@data.column_size).each do |index|
      # Pick the eigenvalues of the N most important components (each column
      # of the diagonal eigenvalue matrix holds a single eigenvalue)
      vectors = @eigenvalues.column_vectors.reverse.first(index)

      # Calculate the cumulated percentage of variance explained by those
      # N components
      percentage = vectors.sum do |vector|
        vector.sum * 100 / @eigenvalues.sum
      end

      # Print information
      puts "Keeping #{index} component(s): #{percentage.round(2)}%"
    end
  end
end
SebastianPoell commented Apr 15, 2022

Use it like this:

# Init PCA. Rows are entries, cols are features.
# E.g. here a dataset with 5 entries and 2 features.
pca = PCA.new([[2, 3], [4, 4], [6, 5], [5, 7], [7, 8]])

# Inspect PCA to get a sense of how important components are
pca.inspect

# Print transformed dataset for a given number of components
puts pca.transform(number_of_components: 1)
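
For this example dataset, inspect should report that the first component alone explains roughly 92% of the variance, so reducing the two features to a single component loses little information.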
