treviesweets/gist:dfde205260c75d10f16b

## gistfile1.py
### finding the best fit line for data with a random error ###

#
# a simplification of the example given at http://acodersmusings.blogspot.co.uk/2009/07/curve-fitting-with-pyevolve.html
#
# using a genetic algorithm to infer the parameters of a model - the simplest form possible, a straight line fit.
#

from pyevolve import G1DList, GSimpleGA, Selectors, Scaling, DBAdapters
import numpy
import matplotlib.pyplot as plot

#
# Create Random Data
#

N = 100
x = numpy.random.rand(N)
err = numpy.random.normal(0, 3, N) # some noise

# set the actual parameters
m = 10.0
b = 5.0

# a simple linear function
def linear(x, m, b):
    y = m * x + b
    return y

# create some noisy test data
yt = linear(x, m, b) + err
# wrap up the sample points
sample_points = [x, yt]

# create a generator function that tests the goodness of fit for each chromosome
def generate_obj_fcn(sample_points):
    # the objective function just calculates the square error
    def obj_fcn(chromosome):
        """ calculate the squared error of this chromosomes' parameters
        """

        # unpack the m (slope) + b (intercept) from the chromosome
        m_p = chromosome[0] / 100.0 # hack to allow decimals
        b_p = chromosome[1] / 100.0

        # unpack the sample points
        x = sample_points[0]
        yt = sample_points[1]

        # calculate the predicted y's
        yp = linear(x, m_p, b_p)

        # calculate the square error for each point
        deltas =  ( yp - yt ) ** 2

        # calculate the score
        # note the '-' the ga looks for the highest score
        score = -sum(deltas)
        return score
    return obj_fcn
#
# Create the genome
#

genome = G1DList.G1DList(2) # set up a list with 2 points 1 for m and 1 for b
genome.evaluator.set(generate_obj_fcn(sample_points))
genome.setParams(rangemin = 0.0, rangemax = 20.0 * 100.0) # hack to allow decimals

#
# Set up the engine
#

ga = GSimpleGA.GSimpleGA(genome)
ga.setPopulationSize(1000) # set the population to 1000

# set a selector
ga.selector.set(Selectors.GRouletteWheel) # set a selector - there are many in the pyevolve package
#ga.selector.set(Selectors.GTournamentSelector) # doesn't seem to make much difference here

#
# Change the scaling method
#

pop = ga.getPopulation()
pop.scaleMethod.set(Scaling.SigmaTruncScaling) # don't get this...

#
# Run the ga
#

ga.evolve() # freq_stats=10)

# pick the m and the b
mg = ga.bestIndividual()[0] / 100.0 # back to decimals
bg = ga.bestIndividual()[1] / 100.0

#
# do a numpy least square fit - a few less lines
#

x_in = numpy.vstack([x, numpy.ones(len(x))]).T
mn, bn = numpy.linalg.lstsq(x_in, yt)[0]

print '\nPyevolve - m : ', mg, ' b: ', bg
print 'Numpy    - m : ', mn, ' b: ', bn

#
# plot the two results
#

fig = plot.figure(figsize = (12, 6))
plot.scatter(x, yt, color = 'r')
plot.plot(x, linear(x, mn, bn), 'b--', label = 'numpy least sq')
plot.plot(x, linear(x, mg, bg), 'c--', label = 'ga least sq')
plot.title('Comparing Approaches')
plot.legend()
	### finding the best fit line for data with a random error ###

	#
	# a simplification of the example given at http://acodersmusings.blogspot.co.uk/2009/07/curve-fitting-with-pyevolve.html
	#
	# using a genetic algorithm to infer the parameters of a model - the simplest form possible, a straight line fit.
	#

	from pyevolve import G1DList, GSimpleGA, Selectors, Scaling, DBAdapters
	import numpy
	import matplotlib.pyplot as plot

	#
	# Create Random Data
	#

	N = 100
	x = numpy.random.rand(N)
	err = numpy.random.normal(0, 3, N) # some noise

	# set the actual parameters
	m = 10.0
	b = 5.0

	# a simple linear function
	def linear(x, m, b):
	y = m * x + b
	return y

	# create some noisy test data
	yt = linear(x, m, b) + err
	# wrap up the sample points
	sample_points = [x, yt]

	# create a generator function that tests the goodness of fit for each chromosome
	def generate_obj_fcn(sample_points):
	# the objective function just calculates the square error
	def obj_fcn(chromosome):
	""" calculate the squared error of this chromosomes' parameters
	"""

	# unpack the m (slope) + b (intercept) from the chromosome
	m_p = chromosome[0] / 100.0 # hack to allow decimals
	b_p = chromosome[1] / 100.0

	# unpack the sample points
	x = sample_points[0]
	yt = sample_points[1]

	# calculate the predicted y's
	yp = linear(x, m_p, b_p)

	# calculate the square error for each point
	deltas = ( yp - yt ) ** 2

	# calculate the score
	# note the '-' the ga looks for the highest score
	score = -sum(deltas)
	return score
	return obj_fcn
	#
	# Create the genome
	#

	genome = G1DList.G1DList(2) # set up a list with 2 points 1 for m and 1 for b
	genome.evaluator.set(generate_obj_fcn(sample_points))
	genome.setParams(rangemin = 0.0, rangemax = 20.0 * 100.0) # hack to allow decimals

	#
	# Set up the engine
	#

	ga = GSimpleGA.GSimpleGA(genome)
	ga.setPopulationSize(1000) # set the population to 1000

	# set a selector
	ga.selector.set(Selectors.GRouletteWheel) # set a selector - there are many in the pyevolve package
	#ga.selector.set(Selectors.GTournamentSelector) # doesn't seem to make much difference here

	#
	# Change the scaling method
	#

	pop = ga.getPopulation()
	pop.scaleMethod.set(Scaling.SigmaTruncScaling) # don't get this...

	#
	# Run the ga
	#

	ga.evolve() # freq_stats=10)

	# pick the m and the b
	mg = ga.bestIndividual()[0] / 100.0 # back to decimals
	bg = ga.bestIndividual()[1] / 100.0

	#
	# do a numpy least square fit - a few less lines
	#

	x_in = numpy.vstack([x, numpy.ones(len(x))]).T
	mn, bn = numpy.linalg.lstsq(x_in, yt)[0]

	print '\nPyevolve - m : ', mg, ' b: ', bg
	print 'Numpy - m : ', mn, ' b: ', bn

	#
	# plot the two results
	#

	fig = plot.figure(figsize = (12, 6))
	plot.scatter(x, yt, color = 'r')
	plot.plot(x, linear(x, mn, bn), 'b--', label = 'numpy least sq')
	plot.plot(x, linear(x, mg, bg), 'c--', label = 'ga least sq')
	plot.title('Comparing Approaches')
	plot.legend()