ept/dataloss.rb

## dataloss.rb
# Parameters:
prob_nodefail = 0.001     # Probability of a single node failing
replication_factor = 3    # Number of copies of each partition (r)
partitions_per_node = 256 # Number of partitions per node
max_nodes = 10000         # Maximum number of nodes to consider

# (n - r)! / n! == r! / (n choose r)
# Intuitively: the fraction of the n! possible permutations of n nodes that
# results in the replicas of one particular partition to be mapped to three
# particular nodes, in a particular order.
partition_fract = 1.0

# Consider all possible cluster sizes n up to max_nodes
(replication_factor .. max_nodes).each do |nodes|

  # The probability that at least one partition is lost at this cluster size
  # (added up cumulatively in the inner loop)
  prob_dataloss = 0.0

  # f! * (n - r)! / ((f - r)! * n!) == (f choose r) / (n choose r)
  # Intuitively: the probability that all replicas of one particular partition
  # are lost, given that f nodes are faulty.
  prob_partitionloss = partition_fract

  # Binomial coefficient (n choose f): the number of different ways of choosing
  # f faults among n nodes (uses arbitrary-precision integer arithmetic)
  binomial_coeff = 1

  # Consider all the possible numbers of faulty nodes f (from 1 to all nodes)
  (1 .. nodes).each do |faults|

    # n choose f
    binomial_coeff = binomial_coeff * (nodes - faults + 1) / faults

    # Use binomial distribution to calculate probability of having exactly this
    # number of faults. Calculate in logs, because otherwise the binomial
    # coefficient overflows the double-precision floating point type.
    prob_faults = Math.exp(Math.log(binomial_coeff) +
                           faults * Math.log(prob_nodefail) +
                           (nodes - faults) * Math.log(1 - prob_nodefail))

    if faults >= replication_factor
      # p(0 partitions lost | f faults) =
      #     p(one particular partition not lost | f faults) ^ num_partitions
      prob_none_lost = (1.0 - prob_partitionloss) ** (nodes * partitions_per_node)

      # p(>= 1 partition lost AND f faults) =
      #     p(f faults) * (1 - p(0 partitions lost | f faults))
      prob_dataloss += prob_faults * (1.0 - prob_none_lost)

      # f! * (n - r)! / ((f - r)! * n!)
      prob_partitionloss *= (faults + 1.0) / (faults - replication_factor + 1.0)
    end
  end

  # Output probability that >= 1 partition is lost when you have n nodes
  puts "#{nodes},#{prob_dataloss}"

  # (n - r)! / n!
  partition_fract *= (nodes - replication_factor + 1.0) / (nodes + 1.0)
end
	# Parameters:
	prob_nodefail = 0.001 # Probability of a single node failing
	replication_factor = 3 # Number of copies of each partition (r)
	partitions_per_node = 256 # Number of partitions per node
	max_nodes = 10000 # Maximum number of nodes to consider

	# (n - r)! / n! == r! / (n choose r)
	# Intuitively: the fraction of the n! possible permutations of n nodes that
	# results in the replicas of one particular partition to be mapped to three
	# particular nodes, in a particular order.
	partition_fract = 1.0

	# Consider all possible cluster sizes n up to max_nodes
	(replication_factor .. max_nodes).each do \|nodes\|

	# The probability that at least one partition is lost at this cluster size
	# (added up cumulatively in the inner loop)
	prob_dataloss = 0.0

	# f! * (n - r)! / ((f - r)! * n!) == (f choose r) / (n choose r)
	# Intuitively: the probability that all replicas of one particular partition
	# are lost, given that f nodes are faulty.
	prob_partitionloss = partition_fract

	# Binomial coefficient (n choose f): the number of different ways of choosing
	# f faults among n nodes (uses arbitrary-precision integer arithmetic)
	binomial_coeff = 1

	# Consider all the possible numbers of faulty nodes f (from 1 to all nodes)
	(1 .. nodes).each do \|faults\|

	# n choose f
	binomial_coeff = binomial_coeff * (nodes - faults + 1) / faults

	# Use binomial distribution to calculate probability of having exactly this
	# number of faults. Calculate in logs, because otherwise the binomial
	# coefficient overflows the double-precision floating point type.
	prob_faults = Math.exp(Math.log(binomial_coeff) +
	faults * Math.log(prob_nodefail) +
	(nodes - faults) * Math.log(1 - prob_nodefail))

	if faults >= replication_factor
	# p(0 partitions lost \| f faults) =
	# p(one particular partition not lost \| f faults) ^ num_partitions
	prob_none_lost = (1.0 - prob_partitionloss) ** (nodes * partitions_per_node)

	# p(>= 1 partition lost AND f faults) =
	# p(f faults) * (1 - p(0 partitions lost \| f faults))
	prob_dataloss += prob_faults * (1.0 - prob_none_lost)

	# f! * (n - r)! / ((f - r)! * n!)
	prob_partitionloss *= (faults + 1.0) / (faults - replication_factor + 1.0)
	end
	end

	# Output probability that >= 1 partition is lost when you have n nodes
	puts "#{nodes},#{prob_dataloss}"

	# (n - r)! / n!
	partition_fract *= (nodes - replication_factor + 1.0) / (nodes + 1.0)
	end