# drewlanenga/surprise.py

## surprise.py
def surprise( mapcount ):
    """
      calculate the surprise value for a given mapcount.

      the more uneven the distribution of counts across the keys,
      the higher the surprise value.  a perfectly even distribution
      scores 0.0.

      for example, a good field to use for a coverage score
      might have a surprise value less than 0.5 or 0.6.

      mapcount: dict-like object mapping each distinct value to a
                numeric count; only mapcount.values() is read.

      returns:  the surprise value as a float.  0.0 is returned when
                there is nothing to measure (no entries, or counts
                that sum to zero) and when every count is identical.
    """
    values = list(mapcount.values())
    k = float(len(values))
    n = float(sum(values))

    # nothing to measure: no categories, or no observations at all.
    # both cases used to die with a ZeroDivisionError further down.
    if k == 0 or n == 0:
        return 0.0

    # identical counts are a perfectly uniform distribution.  sd(p) is
    # zero there (and undefined for a single count), so the exponent
    # below cannot be computed.  0.0 is what the formula tends to as
    # the spread shrinks: for non-negative counts every difference is
    # below 1 while the exponent grows without bound.
    if min(values) == max(values):
        return 0.0

    # if the distribution were uniform, we would expect this value
    expected = 1 / k

    # get the proportion for each value
    p = [ v / n for v in values ]

    # distinct counts can still collapse to equal proportions in
    # floating point; treat that like the uniform case above
    spread = sd(p)
    if spread == 0:
        return 0.0

    # calculate the exponent (this determines the penalty)
    y = 1 / (spread * (k ** 2))

    # the absolute value of the difference for the expected
    p_diff = [ abs(x - expected) for x in p ]

    # calculate the actual value
    value = sum( [ ( (diff ** y) ) for diff in p_diff ] ) / k

    return value


def sd( vector ):
    """
      sample standard deviation of a sequence of numbers, i.e. the
      square root of the summed squared deviations from the mean
      divided by (length - 1).

      raises ZeroDivisionError when vector has fewer than two items.
    """
    import math

    count = float(len(vector))
    center = sum(vector) / count

    # n - 1 in the denominator: this is the sample, not population, form
    variance = sum( (item - center) ** 2 for item in vector ) / (count - 1)

    return math.sqrt(variance)


# sample runs.  the trailing comments are the values recorded by the
# original author (Python 2 output; Python 3 prints more digits).
# print(...) with a single argument behaves the same on Python 2 and 3,
# whereas the bare print statement is a SyntaxError on Python 3.
print(surprise({"value1": 100, "value2": 50, "value3": 40000}))    #  0.846230952166
print(surprise({"value1": 100, "value2": 50, "value3": 400}))      #  0.638978305914
print(surprise({"value1": 100, "value2": 50, "value3": 40}))       #  0.255465721389
print(surprise({"value1": 100, "value2": 50, "value3": 75}))       #  0.0740740740741
	def surprise( mapcount ):
		"""
		calculate the surprise value for a given mapcount.

		the more uneven the distribution of counts across the keys,
		the higher the surprise value.  a perfectly even distribution
		scores 0.0.

		for example, a good field to use for a coverage score
		might have a surprise value less than 0.5 or 0.6.

		mapcount: dict-like object mapping each distinct value to a
		          numeric count; only mapcount.values() is read.

		returns:  the surprise value as a float.  0.0 is returned when
		          there is nothing to measure (no entries, or counts
		          that sum to zero) and when every count is identical.
		"""
		values = list(mapcount.values())
		k = float(len(values))
		n = float(sum(values))

		# nothing to measure: no categories, or no observations at all.
		# both cases used to die with a ZeroDivisionError further down.
		if k == 0 or n == 0:
			return 0.0

		# identical counts are a perfectly uniform distribution.  sd(p) is
		# zero there (and undefined for a single count), so the exponent
		# below cannot be computed.  0.0 is what the formula tends to as
		# the spread shrinks: for non-negative counts every difference is
		# below 1 while the exponent grows without bound.
		if min(values) == max(values):
			return 0.0

		# if the distribution were uniform, we would expect this value
		expected = 1 / k

		# get the proportion for each value
		p = [ v / n for v in values ]

		# distinct counts can still collapse to equal proportions in
		# floating point; treat that like the uniform case above
		spread = sd(p)
		if spread == 0:
			return 0.0

		# calculate the exponent (this determines the penalty)
		y = 1 / (spread * (k ** 2))

		# the absolute value of the difference for the expected
		p_diff = [ abs(x - expected) for x in p ]

		# calculate the actual value
		value = sum( [ ( (diff ** y) ) for diff in p_diff ] ) / k

		return value


	def sd( vector ):
		"""
		sample standard deviation of a sequence of numbers: the square
		root of the summed squared deviations from the mean divided by
		(length - 1).

		raises ZeroDivisionError when vector has fewer than two items.
		"""
		from math import sqrt

		n = float(len(vector))
		mean = sum(vector) / n
		# n - 1 in the denominator: sample, not population, form
		stdv = sqrt( sum([ (x - mean) ** 2 for x in vector ]) / (n - 1) )

		return stdv


	# sample runs.  the trailing comments are the values recorded by the
	# original author (Python 2 output; Python 3 prints more digits).
	# print(...) with a single argument behaves the same on Python 2 and 3,
	# whereas the bare print statement is a SyntaxError on Python 3.
	print(surprise({"value1": 100, "value2": 50, "value3": 40000})) # 0.846230952166
	print(surprise({"value1": 100, "value2": 50, "value3": 400})) # 0.638978305914
	print(surprise({"value1": 100, "value2": 50, "value3": 40})) # 0.255465721389
	print(surprise({"value1": 100, "value2": 50, "value3": 75})) # 0.0740740740741