rhydomako/2016_09_16_BRL.ipynb

## 2016_09_16_BRL.ipynb
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Bayesian Rule List classifier and the Titanic dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 175,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "import matplotlib.pylab as plt\n",
    "import seaborn as sns\n",
    "\n",
    "from scipy.special import gammaln\n",
    "from scipy.stats import poisson, beta\n",
    "from collections import defaultdict, Counter, namedtuple\n",
    "from fim import fpgrowth #this is PyFIM, available from http://www.borgelt.net/pyfim.html\n",
    "from bitarray import bitarray\n",
    "\n",
    "%matplotlib inline"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Bayesian Rule List classifier, based on https://arxiv.org/abs/1511.01644 (reference code at http://lethalletham.com/BRL_supplement_code.zip):"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 449,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "class RuleListFitter(object):\n",
    "    \"\"\"\n",
    "    Bayesian Rule List\n",
    "    \n",
    "    References:\n",
    "    https://arxiv.org/abs/1511.01644\n",
    "    https://arxiv.org/abs/1602.08610\n",
    "    \"\"\"\n",
    "    \n",
    "    def __init__(self, lambda_=2, eta=2, niter=10000, chains=3, warmup=None, thinning=1,\n",
    "                 minsupport=10, maxlhs=2, alpha=(1.,1.), verbose=False):\n",
    "        self.lambda_ = lambda_\n",
    "        self.eta = eta\n",
    "        self.niter = niter\n",
    "        self.chains = chains\n",
    "        if warmup >= 0:\n",
    "            self.warmup = warmup\n",
    "        else:\n",
    "            self.warmup = self.niter//2\n",
    "        self.thinning = thinning\n",
    "        self.minsupport = minsupport\n",
    "        self.maxlhs = maxlhs\n",
    "        self.alpha = alpha\n",
    "        self.verbose = verbose\n",
    "        self.rules = None\n",
    "        self.jinit = None\n",
    "        self.estimate = None\n",
    "        self.RuleMetadata = namedtuple(\"RuleMetadata\",\n",
    "                                       \"rulelist N log_posterior n_rules cardinality_sum theta theta_low theta_high\")\n",
    "    \n",
    "    def __repr__(self):\n",
    "        return \"\"\"<RuleListFitter lambda_=%r, eta=%r, niter=%r, chains=%r, warmup=%r, thinning=%r, \n",
    "minsupport=%r, maxlhs=%r, alpha=%r, verbose=%r>\n",
    "               \"\"\" % (self.lambda_,self.eta,self.niter,self.chains, self.warmup, self.thinning,\n",
    "                      self.minsupport, self.maxlhs, self.alpha, self.verbose)\n",
    "    \n",
    "    def __str__(self):\n",
    "        return self.tostring()\n",
    "    \n",
    "    def tostring(self):\n",
    "        str_ = ''\n",
    "        if self.estimate:\n",
    "            for i, rule in enumerate(self.estimate.rulelist):\n",
    "                if i==0:\n",
    "                    str_ += 'IF (' + \" AND \".join([str(r) for r in rule]) + \") THEN \"\n",
    "                elif i==len(self.estimate.rulelist)-1:\n",
    "                    str_ += 'ELSE '\n",
    "                else:\n",
    "                    str_ += 'ELSE IF (' + \" AND \".join([str(r) for r in rule]) + \") THEN \"\n",
    "                str_ += \"survival probability %0.2f (%0.2f-%0.2f)\" % (self.estimate.theta[i],\n",
    "                                                                       self.estimate.theta_low[i],\n",
    "                                                                       self.estimate.theta_high[i])\n",
    "                str_ += '\\n'\n",
    "            str_ += \"(log posterior: %0.3f)\" % self.estimate.log_posterior\n",
    "        else:\n",
    "            str_ = \"<Untrained RuleList>\"\n",
    "        return str_\n",
    "\n",
    "    def rule_metadata(self, rulelist):\n",
    "        n_rules = len(rulelist)-1\n",
    "        cardinality_sum = np.sum([len(rule) for rule in rulelist if rule != ()])\n",
    "        N = np.array(self.captured(rulelist))\n",
    "        log_posterior = self.log_posterior(rulelist,N)\n",
    "        theta = (N[:,0] + self.alpha[0])/np.array([np.sum(n + self.alpha) for n in N])\n",
    "        theta_low, theta_high = beta.interval(0.95,N[:,0] + self.alpha[0],N[:,1] + self.alpha[1])\n",
    "        return self.RuleMetadata(rulelist, N, log_posterior, n_rules, cardinality_sum, theta, theta_low, theta_high)\n",
    "     \n",
    "    def fit(self, X, y):\n",
    "\n",
    "        def mine_rules(data, minsupport, maxlhs):\n",
    "            # TODO::Make column-values unique\n",
    "            fp_rules = fpgrowth(\n",
    "                data,\n",
    "                supp=minsupport,\n",
    "                zmax=maxlhs\n",
    "            )\n",
    "            return set([x[0] for x in fp_rules if np.nan not in x[0]])\n",
    "\n",
    "        data_neg = X[y == 0] #negative rows\n",
    "        data_pos = X[y == 1] #predictor rows\n",
    "        self.positive_rows = bitarray((y==1).tolist())\n",
    "        self.negative_rows = ~self.positive_rows\n",
    "        \n",
    "        self.rules = mine_rules(data_pos, self.minsupport, self.maxlhs) \\\n",
    "                   | mine_rules(data_neg, self.minsupport, self.maxlhs) \\\n",
    "                   | {()} #null rule\n",
    "            \n",
    "        if self.verbose:\n",
    "            print \"Number of rows:\", len(X)\n",
    "            print \"Mined association rules:\", len(self.rules)\n",
    "        \n",
    "        self.jinit = { rule:self.calculate_bitarray(X, rule) for rule in self.rules }\n",
    "        \n",
    "        self.pmf_lambda = { i:poisson.logpmf(i, self.lambda_) for i in range(len(self.rules)+2) }\n",
    "        self.pmf_eta = { i:poisson.logpmf(i, self.eta) for i in range(1,self.maxlhs+1) }\n",
    "        self.normalization_eta = poisson.cdf(self.maxlhs,self.eta) - poisson.pmf(0,self.eta)\n",
    "\n",
    "        self.trace = []\n",
    "        self.log_posteriors = {}\n",
    "        \n",
    "        for chain in xrange(self.chains): #run chains serially\n",
    "            if self.verbose:\n",
    "                print \"Chain:\", chain\n",
    "            chain_trace = []\n",
    "            chain_log_posteriors = {}\n",
    "        \n",
    "            rulelist = self.initialize_rulelist() #generate a seed rulelist\n",
    "            chain_log_posteriors[rulelist.__str__()] = [self.rule_metadata(rulelist), 0]\n",
    "\n",
    "            for i in xrange(self.niter):\n",
    "                rulelist_star, logQ = self.mutate_rulelist(rulelist) # generate proposal\n",
    "\n",
    "                if rulelist_star.__str__() not in chain_log_posteriors:\n",
    "                    chain_log_posteriors[rulelist_star.__str__()] = [self.rule_metadata(rulelist_star),0]\n",
    "\n",
    "                # M-H coefficient\n",
    "                r = min(1, np.exp(  chain_log_posteriors[rulelist_star.__str__()][0].log_posterior\n",
    "                                  - chain_log_posteriors[rulelist.__str__()][0].log_posterior\n",
    "                                  + logQ))\n",
    "\n",
    "                if np.random.uniform() < r:\n",
    "                    rulelist = rulelist_star #accept proposal\n",
    "\n",
    "                if (i >= self.warmup) and (i % self.thinning == 0): #warmup and thinning (if any)\n",
    "                    chain_log_posteriors[rulelist.__str__()][1] += 1\n",
    "                    chain_trace.append(rulelist)\n",
    "                    \n",
    "            #merge individual chain into global trace/metadata\n",
    "            self.trace += chain_trace\n",
    "            for key, value in chain_log_posteriors.items():\n",
    "                if key not in self.log_posteriors:\n",
    "                    self.log_posteriors[key] = value\n",
    "                else:\n",
    "                    self.log_posteriors[key][1] += value[1]\n",
    "                    \n",
    "        self.estimate = self.point_estimate()\n",
    "        \n",
    "        return self\n",
    "   \n",
    "    def calculate_bitarray(self, X, rule):\n",
    "        rule_set = set(rule)\n",
    "        return bitarray([rule_set <= set(x) for x in X])\n",
    "\n",
    "    def initialize_rulelist(self): #self.rules, self.lambda_, self.eta, self.verbose\n",
    "        # Sample a decision list length m \\sim p(m|λ)\n",
    "        m = np.Inf\n",
    "        while m >= len(self.rules): # Initial list can be zero as long as we add into it later\n",
    "            m = poisson.rvs(self.lambda_)\n",
    "            \n",
    "        avaliable_rules = self.rules.copy()\n",
    "        d = []\n",
    "        for _ in range(m):\n",
    "            #Sample the cardinality of antecedent aj in d as cj \\sim p(cj|c<j,A,η).\n",
    "            avaliable_cardinalities = Counter([len(r) for r in avaliable_rules if len(r)>0]).keys()\n",
    "            c = 0\n",
    "            while c==0 or c>max(avaliable_cardinalities) or c not in avaliable_cardinalities:\n",
    "                c = poisson.rvs(self.eta)        \n",
    "\n",
    "            #Sample aj of cardinality cj from p(aj|a<j,cj,A).\n",
    "            rules = [r for r in avaliable_rules if len(r) == c]\n",
    "            rule = rules[np.random.randint(len(rules))]\n",
    "            \n",
    "            avaliable_rules = avaliable_rules - set([rule])\n",
    "            d.append(rule)\n",
    "            \n",
    "        d.append(()) #null rule\n",
    "        \n",
    "        if self.verbose:\n",
    "            print \"Initial rulelist (m=%d):\" % m,\n",
    "            print d\n",
    "\n",
    "        return d\n",
    "\n",
    "    \n",
    "    def captured(self, rulelist): #self.jinit\n",
    "        jcaptured = {}\n",
    "        \n",
    "        captured = ~self.jinit[()]\n",
    "        N = []\n",
    "        \n",
    "        for rule in rulelist:\n",
    "            #j.captures ← j.init ∨ ¬captured\n",
    "            #captured ← j.captures ∧ captured\n",
    "            jcaptured[rule] = self.jinit[rule] & (~captured)\n",
    "            captured = jcaptured[rule] ^ captured\n",
    "            N.append([(jcaptured[rule] & self.positive_rows).count(),(jcaptured[rule] & self.negative_rows).count()])\n",
    "        \n",
    "        return N\n",
    "\n",
    "            \n",
    "    def log_posterior(self, rulelist, N):\n",
    "        log_likelihood = self.log_likelihood(N)\n",
    "        log_prior = self.log_prior(rulelist)\n",
    "        return log_likelihood + log_prior\n",
    "    \n",
    "    \n",
    "    def log_likelihood(self, N):\n",
    "        \"\"\"\n",
    "        p(y|x, d, α) =\n",
    "        \\prod_{j=1}^{m} \\frac{\\gamma(N_{j,+}+α_0)\\gamma(N_{j,-}+α_1)}\n",
    "                             {\\sum(\\gamma(\\gamma(N_{j,+}+\\gamma(N_{j,-}+α_0+α_1))}\n",
    "        \"\"\"\n",
    "        numerator = gammaln(N+self.alpha)\n",
    "        denomerator = gammaln(np.sum(N+self.alpha,axis=1))\n",
    "        return np.sum(numerator) - np.sum(denomerator)\n",
    "    \n",
    "    \n",
    "    def log_prior(self, rulelist): #self.rules, self.pmf_lambda, self.pmf_eta, self.normalization_eta\n",
    "        \"\"\"\n",
    "        p(d|A,λ,η) = p(m|A,λ) \\prod_{j=1}^{m} p(cj|c<j,A,η) p(aj|a<j,cj,A)\n",
    "        \"\"\"\n",
    "                \n",
    "        all_rules = set([r for r in self.rules if len(r)>0])\n",
    "        all_cardinalities = set(Counter([len(r) for r in all_rules]).keys())\n",
    "        \n",
    "        rulelist = [rule for rule in rulelist if len(rule)>0]\n",
    "        log_prior = self.pmf_lambda[len(rulelist)]\n",
    "\n",
    "        remaining_rules = all_rules.copy()\n",
    "        for rule in rulelist:\n",
    "            \n",
    "            cardinalities = Counter([len(r) for r in remaining_rules])            \n",
    "            remaining_cardinalities = set(cardinalities.keys())\n",
    "            eliminated_cardinalities = all_cardinalities - remaining_cardinalities\n",
    "\n",
    "            log_prior += self.pmf_eta[len(rule)]\n",
    "            log_prior -= np.log(self.normalization_eta - sum([self.pmf_eta[l] for l in eliminated_cardinalities]))\n",
    "            log_prior -= np.log(cardinalities[len(rule)])\n",
    "            \n",
    "            remaining_rules = remaining_rules - set([rule])\n",
    "\n",
    "        return log_prior\n",
    "    \n",
    "\n",
    "    def mutate_rulelist(self, rulelist): #self.rules\n",
    "\n",
    "        rulelist = [rule for rule in rulelist if len(rule)>0]\n",
    "        \n",
    "        # calculate the PMF distribution for the available mutation paths\n",
    "        if len(rulelist) == 0: # No rules in the rulelist yet -- can only insert new rule\n",
    "            path_probabilites = [0, 1., 0]\n",
    "        elif len(rulelist) == 1: # Only one rule -- can only insert and remove\n",
    "            path_probabilites = [0, 0.5, 0.5]\n",
    "        elif len(rulelist) == len(self.rules) - 1: # List have every possible rule, can only swap or remove\n",
    "            path_probabilites = [0.5, 0, 0.5]\n",
    "        elif len(rulelist) == len(self.rules) - 2: # Only one rule remaining, so the inverse probabilites have a correction\n",
    "            path_probabilites = [1./3, 1./3, 1./3]\n",
    "        else: # All paths possible\n",
    "            path_probabilites = [1./3, 1./3, 1./3]\n",
    "            \n",
    "        mutation = np.random.choice(['swap','insert','remove'], p=path_probabilites)\n",
    "\n",
    "        # Q(d|d*)    p(d|d*,swap*)p(swap*) + p(d|d*,insert*)p(insert*) + p(d|d*,remove*)p(remove*)\n",
    "        # ------- =  -----------------------------------------------------------------------------\n",
    "        # Q(d*|d)       p(d*|d,swap)p(swap) + p(d*|d,insert)p(insert) + p(d*|d,remove)p(remove)\n",
    "        \n",
    "        if mutation == 'swap':\n",
    "            Q = 1.0\n",
    "        elif mutation == 'insert':\n",
    "            if len(rulelist) == 0:\n",
    "                # After an insert, we can only get back to an empty list via a remove\n",
    "                # But there are two possible operations from the d* state (insert, or remove), so p(remove*)=0.5\n",
    "                \n",
    "                # Q(d|d*)     0 + 0*0.5 + (1/|d*|)*0.5           \n",
    "                # ------- =   --------------------       =  0.5*(float(len(self.rules)-1-len(rulelist)))\n",
    "                # Q(d*|d)     0 + 0 + 1/((|a|-|d|)(|d|+1))*1.0 \n",
    "                Q = (0.5)*(float(len(self.rules)-1-len(rulelist)))\n",
    "            elif len(rulelist) == 1:\n",
    "                # Q(d|d*)    0*(1/3) + 0*(1/3) + (1/|d*|)*(1/3)\n",
    "                # ------- =  ----------------------------------        = (2/3)*(float(len(self.rules)-1-len(rulelist)))\n",
    "                # Q(d*|d)    0 + 1/((|a|-|d|)(|d|+1))*(1/2) + 0*(1/2)\n",
    "                Q = (2./3)*(float(len(self.rules)-1-len(rulelist)))\n",
    "            elif len(rulelist) == len(self.rules) - 2:\n",
    "                # Q(d|d*)    0*(1/2) + 0*0 + (1/|d*|)*(1/2)\n",
    "                # ------- =  ----------------------------------        = (3/2)*(float(len(self.rules)-1-len(rulelist)))\n",
    "                # Q(d*|d)    0*(1/3) + 1/((|a|-|d|)(|d|+1))*(1/3) + 0*(1/3)\n",
    "                Q = (3./2)*(float(len(self.rules)-1-len(rulelist)))\n",
    "            else:\n",
    "                # Q(d|d*)    0*(1/3) + 0*(1/3) + (1/|d*|)*(1/3)\n",
    "                # ------- =  ----------------------------------        = (1)*(float(len(self.rules)-1-len(rulelist)))\n",
    "                # Q(d*|d)    0*(1/3) + 1/((|a|-|d|)(|d|+1))*(1/3) + 0*(1/3)\n",
    "                Q = (1.)*(float(len(self.rules)-1-len(rulelist)))\n",
    "        elif mutation == 'remove':\n",
    "            if len(rulelist) == 1:\n",
    "                # Q(d|d*)    0*(0) + 1/((|a|-|d*|)(|d*|+1))*(1) + 0*(0)      \n",
    "                # ------- =  ----------------------------------        = (2)/(float(len(self.rules)-len(rulelist)))\n",
    "                # Q(d*|d)    0 + 0*(1/2) + 1/|d|*(1/2)\n",
    "                Q = (2)/(float(len(self.rules)-1.-len(rulelist)-1.))\n",
    "            elif len(rulelist) == len(self.rules) - 1:\n",
    "                # Q(d|d*)    0*(1/3) + 1/((|a|-|d*|)(|d*|+1))*(1/3) + 0*(1./3)\n",
    "                # ------- =  ----------------------------------        = (2./3)/(float(len(self.rules)-len(rulelist)))\n",
    "                # Q(d*|d)    0*(1/2) + 0*(0) + 1/|d|*(1/2)\n",
    "                Q = (2./3)/(float(len(self.rules)-1.-len(rulelist)-1.))                \n",
    "            elif len(rulelist) == len(self.rules) - 2:\n",
    "                # Q(d|d*)    0*(1/3) + 1/((|a|-|d*|)(|d*|+1))*(1/3) + 0*(1./3)\n",
    "                # ------- =  ----------------------------------        = (1.)/(float(len(self.rules)-len(rulelist)))\n",
    "                # Q(d*|d)    0*(1/3) + 0*(1/3) + 1/|d|*(1/3)\n",
    "                Q = (1.)/(float(len(self.rules)-1.-len(rulelist)-1.))                \n",
    "            else:\n",
    "                # Q(d|d*)    0*(1/3) + 1/((|a|-|d*|)(|d*|+1))*(1/3) + 0*(1./3)\n",
    "                # ------- =  ----------------------------------        = (1.)/(float(len(self.rules)-len(rulelist)))\n",
    "                # Q(d*|d)    0*(1/3) + 0*(1/3) + 1/|d|*(1/3)\n",
    "                Q = (1.)/(float(len(self.rules)-1.-len(rulelist)-1.))                \n",
    "        else:\n",
    "            raise\n",
    "\n",
    "        # perform the mutation\n",
    "        if mutation == 'swap':\n",
    "            a,b = np.random.permutation(range(len(rulelist)))[:2]\n",
    "            rulelist[a], rulelist[b] = rulelist[b], rulelist[a]\n",
    "        elif mutation == 'insert':\n",
    "            try:\n",
    "                new_rules = list(set(self.rules) - set(rulelist) - set([()]))\n",
    "                new_rule = new_rules[np.random.randint(len(new_rules))]\n",
    "            except:\n",
    "                print rulelist\n",
    "                print list(set(self.rules) - set(rulelist) - set([()]))\n",
    "            rulelist.insert(np.random.randint(len(rulelist)+1), new_rule)\n",
    "        elif mutation == 'remove':\n",
    "            rulelist.pop(np.random.randint(len(rulelist)))\n",
    "        else:\n",
    "            raise\n",
    "            \n",
    "        rulelist.append(())\n",
    "        return rulelist, np.log(Q)\n",
    "    \n",
    "    def point_estimate(self): #self.log_posteriors, self.verbose\n",
    "        if len(self.log_posteriors) == 0:\n",
    "            return None\n",
    "                \n",
    "        #find the average rule length and width\n",
    "        lengths = 0.\n",
    "        widths = 0.\n",
    "        n = 0\n",
    "        n_rules = 0\n",
    "        for rulelist in self.log_posteriors.values():\n",
    "            lengths += rulelist[0].n_rules * rulelist[1]\n",
    "            widths += rulelist[0].cardinality_sum\n",
    "            n += rulelist[1]\n",
    "            n_rules += rulelist[0].n_rules\n",
    "            \n",
    "        avg_length = lengths/n\n",
    "        avg_width = widths/n_rules\n",
    "            \n",
    "        if self.verbose:\n",
    "            print \"Posterior average length:\", avg_length\n",
    "            print \"Posterior average width:\", avg_width\n",
    "            \n",
    "        #filter for only rulelists around the average\n",
    "        min_length = int(np.floor(avg_length))\n",
    "        min_width  = int(np.floor(avg_width))\n",
    "        max_length = int(np.ceil(avg_length))\n",
    "        max_width  = int(np.ceil(avg_width))\n",
    "        \n",
    "        keys = []\n",
    "        posteriors = []\n",
    "        for key,rulelist in self.log_posteriors.items():\n",
    "            metadata = rulelist[0]\n",
    "            try:\n",
    "                avg_cardinality = float(metadata.cardinality_sum)/float(metadata.n_rules)\n",
    "            except:\n",
    "                print metadata\n",
    "                continue\n",
    "            if metadata.n_rules >= min_length and metadata.n_rules <= max_length \\\n",
    "                    and avg_cardinality >= min_width and avg_cardinality <= max_width:\n",
    "                keys.append(key)\n",
    "                posteriors.append(metadata.log_posterior)\n",
    "                \n",
    "        #return rulelist with maximum posterior value\n",
    "        max_key = keys[np.argmax(posteriors)]\n",
    "        return self.log_posteriors[max_key][0]\n",
    "    \n",
    "    def predict_proba(self, X):\n",
    "        jinit = { rule:self.calculate_bitarray(X, rule) for rule in self.rules }\n",
    "        \n",
    "        captured = ~jinit[()]\n",
    "        predictions = -1.*np.zeros(len(X))\n",
    "        \n",
    "        for i, rule in enumerate(self.estimate.rulelist):\n",
    "            #j.captures ← j.init ∨ ¬captured\n",
    "            #captured ← j.captures ∧ captured\n",
    "            jcaptured = jinit[rule] & (~captured)\n",
    "            captured = jcaptured ^ captured\n",
    "\n",
    "            predictions[np.array(jcaptured.tolist())] = self.estimate.theta[i]\n",
    "            \n",
    "        return predictions\n",
    "    \n",
    "    def predict(self, X):\n",
    "        return (self.predict_proba(X)>=0.5).astype(int)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Load the training data:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 407,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "data = pd.read_csv('train.csv')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Bin the age-column, and re-label class values (the rulelist model requires discrete variables)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 408,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "data['binned_age'] = pd.cut(data['Age'], [0,18,np.Inf], labels=['child','adult'])\n",
    "data['cabin_class'] = pd.cut(data['Pclass'], [0,1,2,3], labels=['1st class', '2nd class', '3rd class'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 409,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "X = data[['cabin_class','Sex','binned_age']].values\n",
    "y = data['Survived'].values"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Fit the rulelist model:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 450,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "rl = RuleListFitter(lambda_=3, eta=2, chains=3, niter=50000, maxlhs=3, verbose=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 451,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Number of rows: 891\n",
      "Mined association rules: 33\n",
      "Chain: 0\n",
      "Initial rulelist (m=4): [('adult', 'male'), ('female',), ('male',), ('2nd class', 'adult', 'female'), ()]\n",
      "Chain: 1\n",
      "Initial rulelist (m=3): [('3rd class', 'adult'), ('2nd class', 'male'), ('1st class', 'adult', 'female'), ()]\n",
      "Chain: 2\n",
      "Initial rulelist (m=1): [('adult', 'male'), ()]\n",
      "Posterior average length: 5.62808\n",
      "Posterior average width: 1.85391685397\n",
      "CPU times: user 2min 14s, sys: 7.06 s, total: 2min 21s\n",
      "Wall time: 2min 35s\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "<RuleListFitter lambda_=3, eta=2, niter=50000, chains=3, warmup=25000, thinning=1, \n",
       "minsupport=10, maxlhs=3, alpha=(1.0, 1.0), verbose=True>\n",
       "               "
      ]
     },
     "execution_count": 451,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "%%time\n",
    "rl.fit(X, y)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Here is the rulelist:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 452,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "IF (3rd class AND female) THEN survival probability 0.50 (0.42-0.58)\n",
      "ELSE IF (female) THEN survival probability 0.94 (0.90-0.97)\n",
      "ELSE IF (child AND 3rd class) THEN survival probability 0.23 (0.13-0.35)\n",
      "ELSE IF (child) THEN survival probability 0.64 (0.43-0.82)\n",
      "ELSE IF (1st class) THEN survival probability 0.35 (0.27-0.44)\n",
      "ELSE survival probability 0.12 (0.09-0.15)\n",
      "(log posterior: -420.154)\n"
     ]
    }
   ],
   "source": [
    "print rl"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Make predictions on the test set using this rulelist:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 443,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "test_data = pd.read_csv('test.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 444,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "test_data['binned_age'] = pd.cut(test_data['Age'], [0,18,np.Inf], labels=['child','adult'])\n",
    "test_data['cabin_class'] = pd.cut(test_data['Pclass'], [0,1,2,3], labels=['1st class', '2nd class', '3rd class'])\n",
    "\n",
    "X_test = test_data[['cabin_class','Sex','binned_age']].values"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 445,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([ 0.11508951,  0.5       ,  0.11508951,  0.11508951,  0.5       ,\n",
       "        0.22641509,  0.5       ,  0.11508951,  0.5       ,  0.11508951])"
      ]
     },
     "execution_count": 445,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "rl.predict_proba(X_test)[:10]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 446,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([0, 1, 0, 0, 1, 0, 1, 0, 1, 0])"
      ]
     },
     "execution_count": 446,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "predictions = rl.predict(X_test)\n",
    "predictions[:10]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 447,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "predictions_out = pd.Series(predictions, index=test_data.PassengerId, name=\"Survived\")\n",
    "predictions_out.to_csv(\"BRL_predictions.csv\", header=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 448,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "PassengerId,Survived\r\n",
      "892,0\r\n",
      "893,1\r\n",
      "894,0\r\n",
      "895,0\r\n",
      "896,1\r\n",
      "897,0\r\n",
      "898,1\r\n",
      "899,0\r\n",
      "900,1\r\n"
     ]
    }
   ],
   "source": [
    "!head BRL_predictions.csv"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "This gives an accuracy of 0.76555 on Kaggle, which is the same as with my previous logistic regression model. However, the interprebility of the rulelist is very useful and easy to understand."
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 2",
   "language": "python",
   "name": "python2"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.11"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}