dylan-evans/statgen.py

## statgen.py
#!/usr/bin/python
# Generate a (roughly normal) sample population for a given mean & standard deviation.
# This was intended for use modelling hypothetical demographics in a sim game.

"""\
Demonstration of the StatGen class

This script demonstrates the StatGen class which generates a normalized sample
from a mean and standard deviation. The algorithm can be adjusted, tweaked and
broken in many different ways which can provide some interesting results, but
the current form provides an almost bell curve sample. Well it's more of a
volcano than a bell curve, use your imagination!

If matplotlib is available this script displays a histogram representation, this
is recommended.

The purpose of the algorithm was to provide sample data for a game
"""

from math import sqrt
from optparse import OptionParser
from random import random


class StatGen(object):
    """\
    Statistic generation class.

    This class generates a statistical population based on a given mean,
    standard deviation and sample size. While the result is not intended to
    be perfect the resulting sample should be more or less normalized which
    is achieved by using both a floating mean and standard deviation, and while
    i came up with the maths from scratch i have difficulty explaining it.
    """
    limit = 2.6 # Theoretical limit in standard deviations

    def __init__(self, weight, mean, std_dev, verbose=False):
        self.verbose = verbose
        self.weight = weight + 1
        self.orig_mean = self.mean = mean
        self.std_dev = std_dev
        self.toggle = 1
        self.next(); self.next()

    def next(self):
        """Get the next value"""
        # Create the offset from the current std_dev
        # The offset is the actual limit of the value
        offset = self.limit * self.std_dev
        val = offset * random() # Calculate the raw value
        #val = (sqrt(random())-self.limit*self.orig_std_dev)*(self.std_dev)

        # Decide whether the value should be greater or lesser than the mean.
        # This method simply alternates
        if self.toggle:
            self.toggle = 0
            val += self.mean
        else:
            self.toggle = 1
            val = self.mean - val
        # Alternate method, probably a better sample but slower
        #if random() > 0.5:
        #  val += self.mean
        #else:
        #  val = self.mean - val

        if self.weight > 1:
            # Recalculate the standard deviation
            diff = (val - self.mean) ** 2
            inter = (self.std_dev ** 2) * self.weight - diff
            self.weight -= 1
            try:
                self.std_dev = sqrt(inter / self.weight)
            except:
                pass
            # Calculate the new mean
            mean = (self.mean / self.weight) * (self.weight + 1) - (val / self.weight)
            self.mean = (mean - self.mean) / 2
            val += self.orig_mean # Align the value by adding the original mean
            if self.verbose:
                print "value: %f new mean: %f" % (val, self.mean)

        return val

if __name__ == '__main__':
    # Test program portion

    # Parse command line
    parser = OptionParser()
    parser.add_option('-p', '--population', action='store', type='int', default=1000)
    # I am aware that population is technically incorrect, however it does start
    # with a 'P' giving an advantage over 'sample' and 'size'. See -s
    parser.add_option('-m', '--mean', action='store', type='float', default=0.5)
    parser.add_option('-s', '--stddev', action='store', type='float', dest='std_dev', default=0.2)
    parser.add_option('-v', '--verbose', action='store_true', default=False)
    parser.add_option('-M', '--no-matplotlib', action='store_true', default=False)
    (options, args) = parser.parse_args()

    if options.no_matplotlib:
        plt = False
    else:
        try:
            import matplotlib.pyplot as plt
            #from pylab import hist
        except:
            print "matplotlib not available"
            plt = False

    g = StatGen(options.population, options.mean, options.std_dev, options.verbose)

    values = []
    errors = []
    rmean = 0
    for i in range(0, options.population):
        # Generate the sample and add it to the values list
        val = g.next()
        if val > 1 or val < 0:
            #Out of bounds values, there are usually a couple
            errors.append(val)
        values.append(val)
        rmean += val

    print "Summary:"

    rmean = rmean / options.population
    diff = rmean - options.mean
    print "Mean: %f / %f :: %f %f%% difference" % (rmean, options.mean, diff, (diff / rmean) * 100)
    adev = 0
    for val in values:
        adev += (val - options.mean) ** 2

    adev = sqrt(adev / options.population)
    print "Deviation: %f / %f :: %f %f%% difference" % (adev, options.std_dev,
    adev - options.std_dev,
    ((adev-options.std_dev)/options.std_dev)*100)

    if options.verbose:
        print "Errors:"
        for v in errors:
            print v
    if plt: #show histogram
        # Display graph
        #h = hist(values, bins=200)

        plt.hist(values, bins=options.population*0.2)
        plt.title("Mean result: %f Difference: %f %f%%" % (rmean, diff, (diff / rmean) * 100))
        plt.show()
	#!/usr/bin/python
	# Generate a (roughly normal) sample population for a given mean & standard deviation.
	# This was intended for use modelling hypothetical demographics in a sim game.

	"""\
	Demonstration of the StatGen class

	This script demonstrates the StatGen class which generates a normalized sample
	from a mean and standard deviation. The algorithm can be adjusted, tweaked and
	broken in many different ways which can provide some interesting results, but
	the current form provides an almost bell curve sample. Well it's more of a
	volcano than a bell curve, use your imagination!

	If matplotlib is available this script displays a histogram representation, this
	is recommended.

	The purpose of the algorithm was to provide sample data for a game
	"""

	from math import sqrt
	from optparse import OptionParser
	from random import random


	class StatGen(object):
	"""\
	Statistic generation class.

	This class generates a statistical population based on a given mean,
	standard deviation and sample size. While the result is not intended to
	be perfect the resulting sample should be more or less normalized which
	is achieved by using both a floating mean and standard deviation, and while
	i came up with the maths from scratch i have difficulty explaining it.
	"""
	limit = 2.6 # Theoretical limit in standard deviations

	def __init__(self, weight, mean, std_dev, verbose=False):
	self.verbose = verbose
	self.weight = weight + 1
	self.orig_mean = self.mean = mean
	self.std_dev = std_dev
	self.toggle = 1
	self.next(); self.next()

	def next(self):
	"""Get the next value"""
	# Create the offset from the current std_dev
	# The offset is the actual limit of the value
	offset = self.limit * self.std_dev
	val = offset * random() # Calculate the raw value
	#val = (sqrt(random())-self.limitself.orig_std_dev)(self.std_dev)

	# Decide whether the value should be greater or lesser than the mean.
	# This method simply alternates
	if self.toggle:
	self.toggle = 0
	val += self.mean
	else:
	self.toggle = 1
	val = self.mean - val
	# Alternate method, probably a better sample but slower
	#if random() > 0.5:
	# val += self.mean
	#else:
	# val = self.mean - val

	if self.weight > 1:
	# Recalculate the standard deviation
	diff = (val - self.mean) ** 2
	inter = (self.std_dev ** 2) * self.weight - diff
	self.weight -= 1
	try:
	self.std_dev = sqrt(inter / self.weight)
	except:
	pass
	# Calculate the new mean
	mean = (self.mean / self.weight) * (self.weight + 1) - (val / self.weight)
	self.mean = (mean - self.mean) / 2
	val += self.orig_mean # Align the value by adding the original mean
	if self.verbose:
	print "value: %f new mean: %f" % (val, self.mean)

	return val

	if __name__ == '__main__':
	# Test program portion

	# Parse command line
	parser = OptionParser()
	parser.add_option('-p', '--population', action='store', type='int', default=1000)
	# I am aware that population is technically incorrect, however it does start
	# with a 'P' giving an advantage over 'sample' and 'size'. See -s
	parser.add_option('-m', '--mean', action='store', type='float', default=0.5)
	parser.add_option('-s', '--stddev', action='store', type='float', dest='std_dev', default=0.2)
	parser.add_option('-v', '--verbose', action='store_true', default=False)
	parser.add_option('-M', '--no-matplotlib', action='store_true', default=False)
	(options, args) = parser.parse_args()

	if options.no_matplotlib:
	plt = False
	else:
	try:
	import matplotlib.pyplot as plt
	#from pylab import hist
	except:
	print "matplotlib not available"
	plt = False

	g = StatGen(options.population, options.mean, options.std_dev, options.verbose)

	values = []
	errors = []
	rmean = 0
	for i in range(0, options.population):
	# Generate the sample and add it to the values list
	val = g.next()
	if val > 1 or val < 0:
	#Out of bounds values, there are usually a couple
	errors.append(val)
	values.append(val)
	rmean += val

	print "Summary:"

	rmean = rmean / options.population
	diff = rmean - options.mean
	print "Mean: %f / %f :: %f %f%% difference" % (rmean, options.mean, diff, (diff / rmean) * 100)
	adev = 0
	for val in values:
	adev += (val - options.mean) ** 2

	adev = sqrt(adev / options.population)
	print "Deviation: %f / %f :: %f %f%% difference" % (adev, options.std_dev,
	adev - options.std_dev,
	((adev-options.std_dev)/options.std_dev)*100)

	if options.verbose:
	print "Errors:"
	for v in errors:
	print v
	if plt: #show histogram
	# Display graph
	#h = hist(values, bins=200)

	plt.hist(values, bins=options.population*0.2)
	plt.title("Mean result: %f Difference: %f %f%%" % (rmean, diff, (diff / rmean) * 100))
	plt.show()