@riccamastellone
Last active November 23, 2017 15:54
XGBoost parameter search with evolutionary algorithm
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# XGBoost parameters | Evolutionary search\n",
"Evolutionary algorithms (EA) are a subset of evolutionary computation, a generic population-based metaheuristic optimisation algorithm. An EA uses mechanisms inspired by biological evolution, such as reproduction, mutation, recombination, and selection. Candidate solutions to the optimisation problem play the role of individuals in a population, and the fitness function determines the quality of the solutions.\n",
"\n",
"\n",
"We shall use EA to perform optimal parameter search for XGBoost\n",
"\n",
"The algorithm proceeds with first creating an initial random population of parameter values. The instances are scored using XGBoost k-fold CV. \n",
"\n",
"Next, a new generation of population is created as follows:\n",
"- A small proportion of elite (i.e. top scoring) individuals is carried forward directly to the new population\n",
"- The rest of the population is filled with randomly created individuals, by:\n",
" - randomly picking two parents from the top performing individuals of the last population (e.g. top 50%)\n",
" - combine the 'genes' (parameter values) randomly to create a new individual that inherits 50% of the genes from each parent.\n",
" - with a small probability, we mutate some gene's value\n",
" \n",
"- The new population is evaluated, and the loop continues until convergence, or until a predefined number of generations has been reached. "
]
},
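{
"cell_type": "markdown",
"metadata": {},
"source": [
"Before the full implementation, here is a minimal, self-contained sketch of the uniform crossover and mutation steps on two hypothetical parents. The parameter names and ranges here are illustrative only; the real search space is defined further down."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# Minimal sketch of uniform crossover + mutation (illustrative parameters only)\n",
"import random\n",
"\n",
"parent_a = {'depth': 8, 'eta': 0.7}\n",
"parent_b = {'depth': 12, 'eta': 0.9}\n",
"\n",
"# Uniform crossover: each gene is inherited from either parent with equal probability\n",
"child = {k: (parent_a if random.random() > 0.5 else parent_b)[k] for k in parent_a}\n",
"\n",
"# Mutation: with a small probability, perturb one randomly chosen gene\n",
"if random.random() < 0.2:\n",
"    gene = random.choice(list(child))\n",
"    child[gene] += random.uniform(-0.05, 0.05) if gene == 'eta' else random.randint(-2, 2)\n",
"print(child)"
]
},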
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/usr/local/lib/python2.7/site-packages/sklearn/cross_validation.py:44: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. This module will be removed in 0.20.\n",
" \"This module will be removed in 0.20.\", DeprecationWarning)\n"
]
}
],
"source": [
"import pandas as pd\n",
"import numpy as np\n",
"import xgboost as xgb\n",
"from sklearn.cross_validation import train_test_split\n",
"from random import randint\n",
"import random"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"## PARAMETERS\n",
"\n",
"popSize = 50 # Population size, set from 20 to 100\n",
"eliteSize = 0.1 # Percentage of elite instances to be ratained \n",
"nGeneration=2; # Number of generations\n",
"\n",
"k_folds = 5 # Number of folds for cross-validation\n",
"paramList=['depth','nRound','eta','gamma','min_child_weight','lamda','alpha','colsample_bytree','subsample','fitness']\n",
"target_column = 'CUSTOMER_COUNT'\n",
"csv_path = 'DS/Methodology2/ModelCleanPrediction.csv'"
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"## SCORING FUNCTIONS\n",
"\n",
"def rmspe(y, yhat):\n",
" return np.sqrt(np.mean((yhat/y-1) ** 2))\n",
"\n",
"def rmspe_xg(yhat, y):\n",
" y = y.get_label()\n",
" yhat = yhat\n",
" return \"rmspe\", rmspe(y,yhat)\n",
"\n",
"def RMSPE_objective(predts, dtrain):\n",
" labels = dtrain.get_label()\n",
" grad = -1/labels+predts/(labels**2)\n",
" grad[labels==0]=0\n",
" hess = 1/(labels**2)\n",
" hess[labels==0]=0\n",
" return grad, hess "
]
},
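{
"cell_type": "markdown",
"metadata": {},
"source": [
"As a quick sanity check, `rmspe` can be evaluated on a toy example: predicting 10% above the truth everywhere should give an RMSPE of exactly 0.1."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# Sanity check: a uniform 10% over-prediction gives RMSPE = 0.1\n",
"y_true = np.array([100.0, 200.0, 50.0])\n",
"y_pred = 1.1 * y_true\n",
"print(rmspe(y_true, y_pred)) # expected: 0.1"
]
},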
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# DATA IMPORT\n",
"data = pd.read_csv(csv_path, index_col=0)\n",
"\n",
"# XGBOOST MATRIX\n",
"y = data.pop(target_column)\n",
"dtrain = xgb.DMatrix(data, y)"
]
},
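{
"cell_type": "markdown",
"metadata": {},
"source": [
"The CSV path above is specific to the original environment. If the file is unavailable, the optional cell below builds a small synthetic regression dataset as a stand-in (the column names and sizes are arbitrary); the only requirement is a strictly positive target, since RMSPE divides by `y`."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# Optional stand-in data for experimentation\n",
"use_synthetic = False # set to True if the CSV above is unavailable\n",
"if use_synthetic:\n",
"    rng = np.random.RandomState(0)\n",
"    data = pd.DataFrame(rng.rand(500, 5), columns=['f%d' % j for j in range(5)])\n",
"    y = pd.Series(10.0 + 5.0*data['f0'] + rng.rand(500)) # strictly positive target\n",
"    dtrain = xgb.DMatrix(data, y)"
]
},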
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# Create an initial population\n",
"population=pd.DataFrame(np.zeros(shape=(popSize,len(paramList))),columns = paramList)\n",
"population.depth=[randint(5,15) for p in range(0,popSize)]\n",
"population.nRound=[randint(100,500) for p in range(0,popSize)] # n_boosting_rounds\n",
"population.eta=[random.uniform(0.6, 1) for p in range(0,popSize)] # learning_rate\n",
"population.gamma=[random.uniform(0.01, 0.03) for p in range(0,popSize)]\n",
"population.min_child_weight=[randint(10,100) for p in range(0,popSize)]\n",
"population.lamda =[random.uniform(0.1,1) for p in range(0,popSize)]\n",
"population.alpha =[random.uniform(0.1, 1) for p in range(0,popSize)]\n",
"population.colsample_bytree=[random.uniform(0.7, 1) for p in range(0,popSize)]\n",
"population.subsample=[random.uniform(0.7, 1) for p in range(0,popSize)]\n",
"population.fitness=[random.uniform(100, 100) for p in range(0,popSize)] # score\n",
"\n",
"# Create a new population based on an existing one\n",
"def createNewPopulation(population, eliteSize=0.1, mutation_rate=0.2):\n",
" population.sort_values(by='fitness',ascending=1,inplace=True)\n",
" population.reset_index(drop=True,inplace=True)\n",
" popSize=population.shape[0]\n",
" nElite=int(round(eliteSize*popSize))\n",
" \n",
" new_population=population.copy(deep=True);\n",
" # Form a new population from the top 50% instances\n",
" for i in range(nElite,popSize):\n",
" # Get two random parents\n",
" p1=randint(nElite,int(popSize/2))\n",
" p2=randint(nElite,int(popSize/2))\n",
" \n",
" for attr in list(new_population.columns.values):\n",
" if(random.uniform(0,1)>0.5 ):\n",
" new_population.ix[i,attr]=population.ix[p1,attr]\n",
" else:\n",
" new_population.ix[i,attr]=population.ix[p2,attr]\n",
"\n",
" # Generating some mutations\n",
" # \n",
" # A list of if/else if is horrible, but Python doesn't like switchs\n",
" # https://docs.python.org/3/tutorial/controlflow.html#if-statements \n",
" if(random.uniform(0,1)<mutation_rate ):\n",
" attr=list(new_population.columns.values)[randint(0,8)]\n",
" if(attr=='depth'):\n",
" new_population.ix[i,attr]= max(3,new_population.ix[i,attr]+randint(-2,2))\n",
" elif(attr=='nRound'):\n",
" new_population.ix[i,attr]= max(10,new_population.ix[i,attr]+randint(-50,50))\n",
" elif(attr=='eta'):\n",
" new_population.ix[i,attr]= max(0.1,new_population.ix[i,attr]+random.uniform(-0.05,0.05))\n",
" elif(attr=='gamma'):\n",
" new_population.ix[i,attr]= max(0.1,new_population.ix[i,attr]+random.uniform(-0.005,0.005))\n",
" elif(attr=='min_child_weight'):\n",
" new_population.ix[i,attr]= max(0,new_population.ix[i,attr]+randint(-2,2) ) \n",
" elif(attr=='lamda'):\n",
" new_population.ix[i,attr]= max(0.1,new_population.ix[i,attr]+random.uniform(-0.05,0.05)) \n",
" elif(attr=='alpha'):\n",
" new_population.ix[i,attr]= max(0.1,new_population.ix[i,attr]+random.uniform(-0.05,0.05)) \n",
" elif(attr=='colsample_bytree'):\n",
" new_population.ix[i,attr]= min(1,max(0.6,new_population.ix[i,attr]+random.uniform(-0.05,0.05)))\n",
" elif(attr=='subsample'):\n",
" new_population.ix[i,attr]= min(1,max(0.6,new_population.ix[i,attr]+random.uniform(-0.05,0.05))) \n",
" return new_population\n",
"\n",
"# Score each instance using k-fold CV\n",
"def testInstance(population,i,dtrain):\n",
" params = {\"objective\": \"reg:linear\",\n",
" \"eta\": population.eta[i],\n",
" \"max_depth\": population.depth[i],\n",
" \"subsample\": population.subsample[i],\n",
" \"colsample_bytree\": population.colsample_bytree[i],\n",
" \"num_boost_round\":int(population.nRound[i]),\n",
" \"lambda\":population.lamda[i],\n",
" \"alpha\":population.alpha[i],\n",
" \"gamma\":population.gamma[i],\n",
" \"min_child_weight\":population.min_child_weight[i],\n",
" \"silent\": 1,\n",
" } \n",
" history = xgb.cv(\n",
" params,\n",
" dtrain, \n",
" early_stopping_rounds=30,\n",
" num_boost_round =int(population.nRound[i]),\n",
" nfold=k_folds, # Number of CV folds\n",
" feval=rmspe_xg, # Custom evaluation metric\n",
" obj=RMSPE_objective,\n",
" maximize=False # the lower the evaluation score the better (default aleady False)\n",
" )\n",
" return history[\"test-rmspe-mean\"].iloc[-1]\n",
"\n",
"\n",
"# Main loop of the Evolutionary Algorithm: \n",
"# Populations are created and avaluated.\n",
"for run in range(nGeneration):\n",
" print(\"Generation %d\\n\" %run)\n",
" population = createNewPopulation(population,eliteSize=0.1,mutation_rate=0.2)\n",
" for i in range(popSize):\n",
" print (\"Testing instance %d \"%i)\n",
" population.ix[i,'fitness']=testInstance(population,i,dtrain)\n",
" print (\"> Fitness %f \\n \" % population.fitness[i])\n",
" population.sort_values(by='fitness',ascending=1,inplace=True)\n",
" population.reset_index(drop=True,inplace=True)\n",
" print(\"Generation %d Best fitness (%d-fold RMSPE CV): %f\" %(run, k_folds, population.fitness[0])) \n",
" \n",
" \n",
"# Output best parameters\n",
"population.iloc[0]"
]
},
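{
"cell_type": "markdown",
"metadata": {},
"source": [
"Finally, the best parameter set found by the search can be used to fit a single model on the full training data. This is a sketch using `xgb.train` with the same custom objective and evaluation metric as above."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# Train a final model on all the data with the best parameters found\n",
"best = population.iloc[0]\n",
"final_params = {\"objective\": \"reg:linear\",\n",
"                \"eta\": best['eta'],\n",
"                \"max_depth\": int(best['depth']),\n",
"                \"subsample\": best['subsample'],\n",
"                \"colsample_bytree\": best['colsample_bytree'],\n",
"                \"lambda\": best['lamda'],\n",
"                \"alpha\": best['alpha'],\n",
"                \"gamma\": best['gamma'],\n",
"                \"min_child_weight\": best['min_child_weight'],\n",
"                \"silent\": 1}\n",
"final_model = xgb.train(final_params, dtrain,\n",
"                        num_boost_round=int(best['nRound']),\n",
"                        obj=RMSPE_objective, feval=rmspe_xg)"
]
}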
],
"metadata": {
"kernelspec": {
"display_name": "Python 2",
"language": "python",
"name": "python2"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.13"
}
},
"nbformat": 4,
"nbformat_minor": 2
}