Skip to content

Instantly share code, notes, and snippets.

@foolnotion
Last active January 30, 2020 02:18
Show Gist options
  • Save foolnotion/5a99ae8c76c27de614320ac272ebe010 to your computer and use it in GitHub Desktop.
Save foolnotion/5a99ae8c76c27de614320ac272ebe010 to your computer and use it in GitHub Desktop.
# This file is part of EAP.
#
# EAP is free software: you can redistribute it and/or modify
# it under the terms of the GNU Lesser General Public License as
# published by the Free Software Foundation, either version 3 of
# the License, or (at your option) any later version.
#
# EAP is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with EAP. If not, see <http://www.gnu.org/licenses/>.
import operator
import math
import random
import warnings # suppress some warnings related to invalid values
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
import multiprocessing
import timeit
from deap import algorithms
from deap import base
from deap import creator
from deap import tools
from deap import gp
def evalSymbReg(individual, pset, X_train, y_train):
    """Score a GP individual by the R^2 of its predictions on the training set.

    Parameters
    ----------
    individual
        Expression tree to evaluate; compiled with ``gp.compile``.
    pset
        Primitive set used to compile ``individual``.
    X_train
        Iterable of input rows; each row is unpacked into the positional
        arguments of the compiled expression.
    y_train
        Target values handed to ``r2_score`` together with the predictions.

    Returns
    -------
    tuple
        One-element tuple ``(fitness,)``. The fitness is 0 when the
        predictions contain an infinity, when no finite prediction exists,
        or when the resulting score is not finite.
    """
    # Transform the tree expression in a callable function
    func = gp.compile(expr=individual, pset=pset)

    # Evolved expressions routinely divide by zero or take the log of a
    # negative number; silence the resulting NumPy warnings.
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        y_pred = np.array([func(*x) for x in X_train])

        min_ = np.nanmin(y_pred)
        max_ = np.nanmax(y_pred)
        if not (np.isfinite(min_) and np.isfinite(max_)):
            return 0,

        # Halve before adding: (min_ + max_) / 2 overflows to inf when both
        # bounds are huge, and an infinite fill value would make r2_score
        # reject the predictions.
        mid_ = min_ / 2 + max_ / 2
        # Replace the remaining non-finite predictions by the mid-range value.
        np.nan_to_num(y_pred, copy=False, nan=mid_, posinf=mid_, neginf=mid_)
        fit = r2_score(y_train, y_pred)

    if not np.isfinite(fit):
        fit = 0
    return fit,
# Load the benchmark data: every column but the last is an input feature,
# the last column is the regression target.
df = pd.read_csv('./data/Poly-10.csv', sep=',')
X = df.iloc[:,:-1].to_numpy()
y = df.iloc[:,-1].to_numpy()
# Keep half of the rows for training; the fixed random_state makes the split
# repeatable between runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=1234)
_, cols = X_train.shape
# Primitive set: one input argument per feature column, NumPy functions as
# operators, and a random constant drawn by np.random.uniform(-1.0, 1.0).
pset = gp.PrimitiveSet("MAIN", cols)
pset.addPrimitive(np.add, 2, name="vadd")
pset.addPrimitive(np.subtract, 2, name="vsub")
pset.addPrimitive(np.multiply, 2, name="vmul")
pset.addPrimitive(np.divide, 2, name="vdiv")
pset.addPrimitive(np.negative, 1, name="vneg")
pset.addPrimitive(np.cos, 1, name="vcos")
pset.addPrimitive(np.sin, 1, name="vsin")
pset.addPrimitive(np.exp, 1, name="vexp")
pset.addPrimitive(np.log, 1, name="vlog")
pset.addEphemeralConstant("rand101", lambda: np.random.uniform(-1.0, 1.0))
# NOTE(review): the weight is +1.0, which by DEAP's convention means the
# fitness is maximized, so the name "FitnessMin" is misleading.
creator.create("FitnessMin", base.Fitness, weights=(1.0,))
creator.create("Individual", gp.PrimitiveTree, fitness=creator.FitnessMin)
# Size caps handed to the staticLimit decorators created below; maxHeight is
# also the upper height bound for the initial trees.
maxHeight = 10
maxLength = 50
toolbox = base.Toolbox()
toolbox.register("expr", gp.genHalfAndHalf, pset=pset, min_=1, max_=maxHeight)
toolbox.register("individual", tools.initIterate, creator.Individual, toolbox.expr)
toolbox.register("population", tools.initRepeat, list, toolbox.individual)
# The training data is bound here so the evaluator only needs the individual.
toolbox.register("evaluate", evalSymbReg, pset=pset, X_train=X_train, y_train=y_train)
toolbox.register("select", tools.selTournament, tournsize=5)
# Decorators that cap tree height and node count; they are applied to the
# "mate" and "mutate" operators further down.
limitHeight = gp.staticLimit(operator.attrgetter('height'), maxHeight)
limitLength = gp.staticLimit(len, maxLength)
# Pool of mutation operators that mutOperator picks from at random.
mutOperators = [ gp.mutUniform ]
def mutOperator(*args, **kwargs):
    """Delegate to a mutation operator picked at random from ``mutOperators``.

    All positional and keyword arguments are forwarded unchanged, and the
    chosen operator's return value is returned as is.
    """
    return np.random.choice(mutOperators)(*args, **kwargs)
# Crossover operator, wrapped by the height and length limit decorators.
toolbox.register("mate", gp.cxOnePoint)
toolbox.decorate("mate", limitHeight)
toolbox.decorate("mate", limitLength)
# Mutation: expr_mut generates the replacement subtrees (height 0 to 2) that
# are handed to the operator chosen by mutOperator; the same two limit
# decorators are applied to the result.
toolbox.register("expr_mut", gp.genFull, min_=0, max_=2)
toolbox.register('mutate', mutOperator, expr=toolbox.expr_mut, pset=pset)
toolbox.decorate("mutate", limitHeight)
toolbox.decorate("mutate", limitLength)
def main():
    """Run the symbolic regression evolution and return its results.

    Evaluation is distributed over a ``multiprocessing.Pool`` that is
    registered as the toolbox's ``map`` for the duration of the run.

    Returns
    -------
    tuple
        ``(pop, stats, hof)``: the final population, the statistics object
        and the hall of fame holding the single best individual.
    """
    np.seterr(all='ignore')
    # Seed both generators: DEAP draws from ``random``, while the mutation
    # choice and the ephemeral constants draw from ``np.random``.
    random.seed(318)
    np.random.seed(318)

    pool = multiprocessing.Pool()
    toolbox.register("map", pool.map)
    try:
        pop = toolbox.population(n=1000)
        hof = tools.HallOfFame(1)
        stats = tools.Statistics(lambda ind: ind.fitness.values)
        stats.register("avg", np.nanmean)
        stats.register("std", np.nanstd)
        stats.register("min", np.nanmin)
        stats.register("max", np.nanmax)
        algorithms.eaSimple(pop, toolbox, cxpb=1, mutpb=0.25, ngen=100, stats=stats, halloffame=hof)
    finally:
        # Put the serial map back so the toolbox never points at a closed
        # pool, then release the worker processes.
        toolbox.register("map", map)
        pool.close()
        pool.join()
    return pop, stats, hof


if __name__ == "__main__":
    print(timeit.timeit(stmt=main, number=1))
@DMTSource
Copy link

DMTSource commented Dec 30, 2019

Here is a big update to the code; I believe the gist was at Revision #3 when this was posted. More info and discussion here:
https://mail.google.com/mail/u/0/#inbox/FMfcgxwGCbBGsFnpFGwRTTTjnrVCHPTk

import operator
import math
import random
import warnings # suppress some warnings related to invalid values

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
import multiprocessing
import timeit

from deap import algorithms
from deap import base
from deap import creator
from deap import tools
from deap import gp

# load data
# The benchmark file can be used instead of the synthetic data below:
#   df = pd.read_csv('./data/Poly-10.csv', sep=',')
#
# Synthetic stand-in data. It is drawn from a private, seeded generator so
# that every import of this module builds the same table: worker processes
# that re-import the module (the "spawn" start method) would otherwise
# evaluate individuals against different data than the parent process.
_data_rng = np.random.RandomState(1234)
df = pd.DataFrame(np.sin(_data_rng.random_sample((10000, 10))), columns=list('ABCDEFGHIJ'))
# Every column but the last is an input feature; the last one is the target.
X = df.iloc[:,:-1].to_numpy()
y = df.iloc[:,-1].to_numpy()

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=1234)
_, cols = X_train.shape


def evalSymbReg(individual, pset):
    """Score a GP individual by the R^2 of its predictions on the training set.

    The module-level ``X_train`` and ``y_train`` arrays are used as data.

    Parameters
    ----------
    individual
        Expression tree to evaluate; compiled with ``gp.compile``.
    pset
        Primitive set used to compile ``individual``.

    Returns
    -------
    tuple
        One-element tuple ``(fitness,)`` holding a plain float in
        ``[-1000, 1]``. The penalty value -1000 is returned when the
        predictions contain an infinity, when no finite prediction exists,
        or when the resulting score is not finite.
    """
    # Transform the tree expression in a callable function
    func = gp.compile(expr=individual, pset=pset)

    with warnings.catch_warnings():  # comment out when debugging
        warnings.simplefilter("ignore")  # comment out when debugging
        y_pred = np.array([func(*x) for x in X_train])

        min_ = np.nanmin(y_pred)
        max_ = np.nanmax(y_pred)
        if not (np.isfinite(min_) and np.isfinite(max_)):
            return -1000.,

        # Halve before adding: (min_ + max_) / 2 overflows to inf when both
        # bounds are huge, and an infinite fill value would make r2_score
        # reject the predictions.
        mid_ = min_ / 2 + max_ / 2
        # Manual nan_to_num so that older NumPy releases are supported.
        y_pred[~np.isfinite(y_pred)] = mid_

        fit = r2_score(y_train, y_pred)

    # Return the penalty directly. Assigning the tuple ``-1000.,`` to ``fit``
    # and clipping it afterwards produced an array wrapped in a tuple instead
    # of a scalar fitness.
    if not np.isfinite(fit):
        return -1000.,

    return min(max(float(fit), -1000.), 1.),



# build the primitive set: one input argument per feature column plus the NumPy operators
pset = gp.PrimitiveSet("MAIN", cols)
pset.addPrimitive(np.add, 2, name="vadd")
pset.addPrimitive(np.subtract, 2, name="vsub")
pset.addPrimitive(np.multiply, 2, name="vmul")
pset.addPrimitive(np.divide, 2, name="vdiv")
pset.addPrimitive(np.negative, 1, name="vneg")
pset.addPrimitive(np.cos, 1, name="vcos")
pset.addPrimitive(np.sin, 1, name="vsin")
pset.addPrimitive(np.exp, 1, name="vexp")
pset.addPrimitive(np.log, 1, name="vlog")
#pset.addEphemeralConstant("rand101", lambda: np.random.uniform(-1.0, 1.0)) #may be unable to pickle...

# NOTE(review): the weight is +1.0, which by DEAP's convention means the
# fitness is maximized, so the name "FitnessMin" is misleading.
creator.create("FitnessMin", base.Fitness, weights=(1.0,))
creator.create("Individual", gp.PrimitiveTree, fitness=creator.FitnessMin)

# Size caps for the staticLimit decorators created below; maxHeight is also
# the upper height bound for the initial trees.
maxHeight = 10
maxLength = 50

toolbox = base.Toolbox()
toolbox.register("expr", gp.genHalfAndHalf, pset=pset, min_=1, max_=maxHeight)
toolbox.register("individual", tools.initIterate, creator.Individual, toolbox.expr)
toolbox.register("population", tools.initRepeat, list, toolbox.individual)
toolbox.register("evaluate", evalSymbReg, pset=pset)
toolbox.register("select", tools.selTournament, tournsize=5)

# Decorators that cap tree height and node count (note: these names hold
# decorator functions, not the numeric limits themselves).
limitHeight = gp.staticLimit(operator.attrgetter('height'), maxHeight)
limitLength = gp.staticLimit(len, maxLength)

# Two fully configured mutators; mutOperator picks one of them at random.
toolbox.register("expr_mut", gp.genFull, min_=0, max_=2)
toolbox.register('mutUniform', gp.mutUniform,   expr=toolbox.expr_mut, pset=pset)
# NOTE(review): the ephemeral constant above is commented out, so
# mutEphemeral presumably has nothing to mutate in this setup -- confirm.
toolbox.register('mutEphemeral', gp.mutEphemeral, mode='all')

mutOperators = [ toolbox.mutUniform, toolbox.mutEphemeral ]

def mutOperator(*args, **kwargs):
    """Delegate to a mutation operator picked at random from ``mutOperators``.

    All positional and keyword arguments are forwarded unchanged, and the
    chosen operator's return value is returned as is.
    """
    selected_mutator = np.random.choice(mutOperators)
    mutated = selected_mutator(*args, **kwargs)
    return mutated
               

toolbox.register("mate", gp.cxOnePoint)
toolbox.register('mutate', mutOperator)


# Apply the size limits to both variation operators. limitHeight and
# limitLength are already staticLimit decorators (built from maxHeight and
# maxLength), so they are applied directly; passing them as ``max_value`` to
# a second staticLimit call would compare tree sizes against a function.
toolbox.decorate("mate", limitHeight)
toolbox.decorate("mutate", limitHeight)
toolbox.decorate("mate", limitLength)
toolbox.decorate("mutate", limitLength)
        
    
def main():
    """Run the symbolic regression evolution and return its results.

    Evaluation is distributed over a ``multiprocessing.Pool`` that is
    registered as the toolbox's ``map`` for the duration of the run. The
    best individual is printed once the run has finished.

    Returns
    -------
    tuple
        ``(pop, stats, hof)``: the final population, the statistics object
        and the hall of fame holding the single best individual.
    """
    np.seterr(all='ignore')
    # Seed both generators: DEAP draws from ``random``, while mutOperator
    # chooses the mutation through ``np.random``.
    random.seed(318)
    np.random.seed(318)

    pool = multiprocessing.Pool()
    toolbox.register("map", pool.map)
    try:
        pop = toolbox.population(n=1000)
        hof = tools.HallOfFame(1)
        stats = tools.Statistics(lambda ind: ind.fitness.values)
        stats.register("avg", np.mean) #using nanmean will hide nans that ARE in the pop, not ideal
        stats.register("std", np.std)
        stats.register("min", np.min)
        stats.register("max", np.max)

        algorithms.eaSimple(pop, toolbox, cxpb=0.5, mutpb=0.25, ngen=5, stats=stats, halloffame=hof)
    finally:
        # Put the serial map back so the toolbox never points at a closed
        # pool, then release the worker processes.
        toolbox.register("map", map)
        pool.close()
        pool.join()

    print("\nBest Hof:\n%s"%hof[0])

    return pop, stats, hof



if __name__ == "__main__":
    print("\nTime To Evo: %0.2f"%timeit.timeit(stmt=main, number=1))

gen	nevals	avg     	std    	min  	max       
0  	1000  	-340.019	454.311	-1000	-0.0414717
1  	648   	-42.3892	184.116	-1000	-0.0414717
2  	627   	-17.5815	121.44 	-1000	-0.0414717
3  	603   	-20.5456	133.662	-1000	-0.0414717
4  	629   	-20.1901	132.79 	-1000	-0.0236097
5  	613   	-24.1563	146.697	-1000	-0.00993625

Best Hof:
vneg(vlog(vneg(vneg(vcos(vcos(vsin(vcos(vdiv(ARG0, ARG0)))))))))

Time To Evo: 54.19
[Finished in 54.7s]

@rbbgit
Copy link

rbbgit commented Jan 23, 2020

Hi,
Can you perhaps share your csv file?
Thanks in advance.

@foolnotion
Copy link
Author

You can find my data and latest code here: https://github.com/foolnotion/deap-symreg

@rbbgit
Copy link

rbbgit commented Jan 29, 2020

Thanks again. A suggestion/question: before going from one generation to the next, wouldn't it help to "simplify" the trees (i.e., the individuals)? For example, if we have a node "-" with leaves "x" and "x", we can simply replace that subtree with a "0" (another example: x/x). This should also help to avoid reaching Python's limit on the tree depth. Any ideas on how to do it?

@foolnotion
Copy link
Author

For DEAP expressions I would look at SymPy, but from a genetic programming perspective simplification/pruning is not great. As in nature, genotypes include a lot of redundancy (cryptic genetic variation and other mechanisms for robustness/buffering). This actually increases the potential to evolve, which GP calls evolvability. Simplification would cancel this effect, as it would offer fewer targets for mutation or crossover to create some adaptive change.

deap gp support is not quite mature imho. recombination operators (crossover, mutation) should be aware of tree and depth limits and prevent the generation of offspring individuals exceeding them.

@DMTSource
Copy link

I agree with foolnotion about evolvability and the desire to keep 'dead genes' around, as complexity can pay off down the line. The above code uses length and depth limits on the operators in order to manage this growth over many generations:

# Numeric caps on tree height and node count.
limitHeight = 15
limitLength = 100

# Wrap crossover and mutation with a height limit first and a length limit
# second, in the same order as four explicit decorate calls would.
for size_key, size_cap in ((operator.attrgetter("height"), limitHeight), (len, limitLength)):
    for operator_name in ("mate", "mutate"):
        toolbox.decorate(operator_name, gp.staticLimit(key=size_key, max_value=size_cap))

Additionally, there are many other methods (so many white papers to read) that you can use alongside this to manage size, if individual size starts to cause problems many generations into the evolution.

One example is using a double tournament:

# Settings for the double tournament; see the selDoubleTournament
# documentation for the meaning of each one.
double_tournament_settings = {
    "fitness_size": 7,
    "parsimony_size": 1.4,
    "fitness_first": True,
}
toolbox.register("select", tools.selDoubleTournament, **double_tournament_settings)

https://deap.readthedocs.io/en/master/api/tools.html#deap.tools.selDoubleTournament

Another brute force method may be more... messy: have your mutation operator randomly choose between the normal mutation and something like a shrink operator, with the shrink probability growing as the size approaches some defined limit that we don't necessarily want to converge to. This encourages evolvability up to a point that we happen to know already.
deap.gp.mutShrink
https://deap.readthedocs.io/en/master/api/tools.html#deap.gp.mutShrink

Here is an example of that...but this is sorta tricky to really tune so be wary of slapping it into your code:

# Size settings read by mixed_mutation: the tree length at which shrinking
# starts to become possible, and the length used to scale its probability.
target_mean_length = 30
limitLength = 65

# The two sub-operators mixed_mutation chooses between.
toolbox.register("expr_mut", gp.genFull, min_=0, max_=2)
toolbox.register("mutUniform", gp.mutUniform, expr=toolbox.expr_mut, pset=pset)
toolbox.register("mutShrink", gp.mutShrink)
def mixed_mutation(ind):
    """Mutate ``ind`` with either a shrink or a uniform mutation.

    The probability of shrinking is
    ``0.5 * (len(ind) - target_mean_length) / limitLength``. It is zero or
    negative (so shrinking never happens) for trees no longer than
    ``target_mean_length`` and grows linearly with tree length above that.
    With ``target_mean_length = 30`` and ``limitLength = 65``:

        length       10      20      30     40     50     60     65
        probability  -0.154  -0.077  0.0    0.077  0.154  0.231  0.269

    Returns a one-element tuple holding the mutated individual.
    """
    excess_length = len(ind) - target_mean_length
    shrink_probability = 0.5 * (float(excess_length) / float(limitLength))
    if random.random() < shrink_probability:
        mutate = toolbox.mutShrink
    else:
        mutate = toolbox.mutUniform
    mutant, = mutate(ind)
    return mutant,
toolbox.register("mutate", mixed_mutation)

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment