maheshakya/Decision_tree_imp.ipynb

## Decision_tree_imp.ipynb
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import numpy as np"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 50,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# Required subroutines\n",
    "\n",
    "def get_list_diff(a, b):\n",
    "    \"\"\"Returns difference between 2 lists\"\"\"\n",
    "    return list(set(a) - set(b))\n",
    "\n",
    "def get_distinct_values(col):\n",
    "    \"\"\"Returns distinct values in an attribute and\n",
    "    their counts.\n",
    "    \"\"\"\n",
    "    return np.unique(col, return_counts=True)\n",
    "\n",
    "def calc_entropy(a):\n",
    "    \"\"\"Calculates entropy of a single value of an attribute.\"\"\"\n",
    "    total = a.shape[0]\n",
    "    entropy = 0\n",
    "    values, counts = get_distinct_values(a)\n",
    "    for count in counts:\n",
    "        prob = count/float(total)\n",
    "        entropy += (-1)*prob*np.log2(prob)\n",
    "    return entropy\n",
    "\n",
    "def calc_expected_entropy(a):\n",
    "    \"\"\"Calculates expected entropy of an attribute.\"\"\"\n",
    "    total = a.shape[0]\n",
    "    expected_entropy = 0\n",
    "    values, counts = get_distinct_values(a[:, 0])\n",
    "    for value, count in zip(values, counts):        \n",
    "        value_entropy = calc_entropy(a[np.where(a[:, 0] == value),\n",
    "                                       1][0])\n",
    "        expected_entropy += (count/float(total))*value_entropy\n",
    "    return expected_entropy\n",
    "\n",
    "def calc_information_gain(a):\n",
    "    \"\"\"Calculate information gain of an attribute.\"\"\"\n",
    "    return calc_entropy(a[:,1]) - calc_expected_entropy(a)\n",
    "\n",
    "def pick_attribute(features, labels, attributes):\n",
    "    \"\"\"Picks the best attribute based on the information\n",
    "    gain.\n",
    "    \"\"\"\n",
    "    att = attributes[0]\n",
    "    max_info_gain = 0\n",
    "    for i in attributes:\n",
    "        info_gain = calc_information_gain(np.hstack((features[:, i].reshape(features.shape[0], 1),\n",
    "                                                    labels.reshape(labels.shape[0],\n",
    "                                                                   1))))\n",
    "        if info_gain > max_info_gain:\n",
    "            max_info_gain = info_gain\n",
    "            att = i        \n",
    "    return att"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Tree structure\n",
    "\n",
    "class TreeNode(object):\n",
    "    \"\"\"Class that stores information of a decision tree node\"\"\"\n",
    "    def __init__(self, value=None, split_attribute=None, children=None, depth=None):\n",
    "        self.value = value\n",
    "        # If leaf, split_attribute will be label\n",
    "        self.split_attribute = split_attribute\n",
    "        # If leaf, children will an emptly list\n",
    "        self.children = children\n",
    "    \n",
    "    def set_split_attribute(self, split_attribute):\n",
    "        self.split_attribute = split_attribute\n",
    "        \n",
    "    def set_value(self, value):\n",
    "        self.value = value\n",
    "        \n",
    "    def set_children(self, children):\n",
    "        self.children = children"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 123,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# Build tree\n",
    "        \n",
    "def build_tree(features, labels, attributes, all_features, depth = 30):\n",
    "    \"\"\"Builds decition tree with the feature vectors and labels\"\"\"\n",
    "    # If all examples have same label\n",
    "    if (np.unique(labels).shape[0] == 1 or depth == 0):\n",
    "        return TreeNode(split_attribute=labels[0])\n",
    "    \n",
    "    # Create root node\n",
    "    root_node = TreeNode()\n",
    "    children = []\n",
    "    # Pick the best attribute to divide upon\n",
    "    best_att = pick_attribute(features, labels, attributes)\n",
    "    root_node.set_split_attribute(best_att)\n",
    "    \n",
    "    # Iterate over each distinct value of the selected attribute\n",
    "    for value in np.unique(all_features[:, best_att]):\n",
    "        subset_indices = np.where(features[:, best_att] == value)\n",
    "        if subset_indices[0].shape[0] == 0 or len(attributes) == 1:\n",
    "            # If no examples, add a leaf node with most frequent label\n",
    "            u, indices = np.unique(labels, return_inverse=True)            \n",
    "            label = u[np.argmax(np.bincount(indices))]\n",
    "            child = TreeNode(value=value, split_attribute=label)\n",
    "        else:\n",
    "            # Else recurse with subset of examples\n",
    "            child = build_tree(features[subset_indices, :][0],\n",
    "                               labels[subset_indices],\n",
    "                               get_list_diff(attributes, [best_att]),\n",
    "                               all_features,\n",
    "                               depth-1)\n",
    "        child.set_value(value)\n",
    "        children.append(child)\n",
    "    root_node.set_children(children)        \n",
    "    return root_node"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 63,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def print_tree(tree, s):\n",
    "    \"\"\"Print decision tree\"\"\"\n",
    "    if tree.children == None:\n",
    "        print s, tree.value, \",\", tree.split_attribute\n",
    "        return \n",
    "    else:\n",
    "        print s, tree.value, \",\", tree.split_attribute\n",
    "        s += \"    \"\n",
    "        for child in tree.children:\n",
    "            print_tree(child, s)\n",
    "\n",
    "def predict(tree, feature_vector):\n",
    "    \"\"\"Predict label for a given feature vector\"\"\"\n",
    "    if tree.children == None:\n",
    "        return tree.split_attribute\n",
    "    else:\n",
    "        feature_value = feature_vector[tree.split_attribute]\n",
    "        for child in tree.children:\n",
    "            # (Needed to put int(), otherwise comparison does not work!)\n",
    "#             if int(feature_value) == int(child.value):\n",
    "            if feature_value == child.value:\n",
    "                return predict(child, feature_vector)\n",
    "        return None"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 140,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# File reading\n",
    "def read_file(file_path):\n",
    "    \"\"\"Reads a text file and returns it as a ndarray.\"\"\"\n",
    "    return np.genfromtxt(file_path, delimiter=',', dtype='|S21')\n",
    "\n",
    "# Evaluation\n",
    "def calc_accuracy(tree, test_set):\n",
    "    \"\"\"Calcuates classification accuracy of the decision tree with\n",
    "    a test data set.\n",
    "    \"\"\"\n",
    "    number_of_features = test_set.shape[1] - 1\n",
    "    correct_count = 0\n",
    "    total = test_set.shape[0]\n",
    "    \n",
    "    for i in range(total):\n",
    "        pred = predict(tree, test_set[i, :number_of_features])\n",
    "        if pred == test_set[i, number_of_features]:\n",
    "            correct_count += 1\n",
    "    return correct_count/float(total)\n",
    "\n",
    "# k-fold cross validation\n",
    "def k_fold_cross_validate(tree, datasets, depth, n_features):\n",
    "    \"\"\"Perferms k-fold cross validation and returns average accuracy.\"\"\"    \n",
    "    average_accuracy = 0\n",
    "    for i in range(len(datasets)):\n",
    "        train = np.empty((1, datasets[i].shape[1]), dtype='|S16')\n",
    "        for j in range(dataset):\n",
    "            if (j!=i):\n",
    "                train = np.vstack((train, datasets[j]))\n",
    "        train = train[1:, :]\n",
    "        tree = build_tree(train[:, :n_features], train[:, n_features],\n",
    "                          np.arange(n_features), train[:, :n_features], depth)        \n",
    "        average_accuracy += calc_accuracy(tree, datasets[i])\n",
    "    average_accuracy /= float(len(datasets))\n",
    "    return average_accuracy\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 142,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[array([['x', 's', 'y', ..., 'n', 'g', 'e'],\n",
       "        ['b', 's', 'w', ..., 'n', 'm', 'e'],\n",
       "        ['x', 'y', 'w', ..., 's', 'u', 'p'],\n",
       "        ..., \n",
       "        ['f', 'f', 'g', ..., 's', 'g', 'e'],\n",
       "        ['f', 's', 'n', ..., 'v', 'g', 'p'],\n",
       "        ['x', 'y', 'y', ..., 'n', 'm', 'e']], \n",
       "       dtype='|S21'), array([['f', 's', 'g', ..., 'a', 'g', 'e'],\n",
       "        ['x', 's', 'w', ..., 's', 'g', 'p'],\n",
       "        ['x', 'f', 'g', ..., 's', 'g', 'e'],\n",
       "        ..., \n",
       "        ['f', 'f', 'w', ..., 'a', 'g', 'e'],\n",
       "        ['f', 'f', 'g', ..., 'v', 'd', 'e'],\n",
       "        ['x', 'f', 'n', ..., 'v', 'd', 'e']], \n",
       "       dtype='|S21'), array([['x', 's', 'w', ..., 'a', 'g', 'e'],\n",
       "        ['x', 's', 'n', ..., 'a', 'g', 'e'],\n",
       "        ['x', 'y', 'n', ..., 'v', 'd', 'e'],\n",
       "        ..., \n",
       "        ['f', 'f', 'n', ..., 's', 'g', 'e'],\n",
       "        ['x', 'f', 'e', ..., 'y', 'd', 'e'],\n",
       "        ['x', 'y', 'g', ..., 'y', 'd', 'e']], \n",
       "       dtype='|S21'), array([['x', 'y', 'e', ..., 'y', 'd', 'e'],\n",
       "        ['f', 'f', 'n', ..., 'v', 'd', 'e'],\n",
       "        ['f', 'f', 'g', ..., 'v', 'd', 'e'],\n",
       "        ..., \n",
       "        ['f', 'f', 'g', ..., 'v', 'd', 'e'],\n",
       "        ['f', 'y', 'n', ..., 'y', 'd', 'e'],\n",
       "        ['f', 'y', 'e', ..., 'v', 'd', 'e']], \n",
       "       dtype='|S21'), array([['f', 'y', 'n', ..., 'v', 'd', 'e'],\n",
       "        ['x', 'f', 'g', ..., 'y', 'd', 'e'],\n",
       "        ['f', 'y', 'n', ..., 'y', 'd', 'e'],\n",
       "        ..., \n",
       "        ['f', 'f', 'g', ..., 'v', 'd', 'p'],\n",
       "        ['f', 'y', 'g', ..., 'y', 'p', 'p'],\n",
       "        ['x', 'f', 'y', ..., 'v', 'd', 'p']], \n",
       "       dtype='|S21'), array([['x', 'y', 'y', ..., 'y', 'p', 'p'],\n",
       "        ['f', 'f', 'y', ..., 'v', 'd', 'p'],\n",
       "        ['x', 'y', 'g', ..., 'y', 'd', 'p'],\n",
       "        ..., \n",
       "        ['f', 's', 'n', ..., 'y', 'd', 'e'],\n",
       "        ['b', 'y', 'n', ..., 'y', 'p', 'e'],\n",
       "        ['x', 'y', 'n', ..., 'y', 'p', 'e']], \n",
       "       dtype='|S21')]"
      ]
     },
     "execution_count": 142,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "datasets = []\n",
    "k = 6\n",
    "for i in range(k):\n",
    "    file_name = 'datasets/SettingA/CVSplits/training_0' + str(i) + '.data'\n",
    "    datasets.append(read_file(file_name))\n",
    "datasets"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 117,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(3822, 23) (1822, 23)\n"
     ]
    }
   ],
   "source": [
    "train_data = read_file('datasets/SettingA/training.data')\n",
    "test_data = read_file('datasets/SettingA/CVSplits/training_00.data')\n",
    "print train_data.shape, test_data.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 126,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      " None , 4\n",
      "     a , e\n",
      "     c , p\n",
      "     f , p\n",
      "     l , e\n",
      "     m , p\n",
      "     n , 2\n",
      "         b , p\n",
      "         c , e\n",
      "         e , e\n",
      "         g , e\n",
      "         n , e\n",
      "         p , 0\n",
      "             b , p\n",
      "             c , p\n",
      "             f , 8\n",
      "                 g , p\n",
      "                 h , p\n",
      "                 k , p\n",
      "                 n , p\n",
      "                 p , p\n",
      "                 r , p\n",
      "                 u , p\n",
      "                 w , 1\n",
      "                     f , e\n",
      "                     g , e\n",
      "                     s , p\n",
      "                     y , p\n",
      "                 y , p\n",
      "             k , p\n",
      "             s , p\n",
      "             x , e\n",
      "         w , 3\n",
      "             f , e\n",
      "             t , p\n",
      "         y , p\n",
      "     p , p\n"
     ]
    }
   ],
   "source": [
    "tree = build_tree(train_data[:, :15], train_data[:, 22], list(np.arange(15)), train_data[:, :15], 5)\n",
    "print_tree(tree, \"\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 119,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "1.0"
      ]
     },
     "execution_count": 119,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "calc_accuracy(tree, test_data)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 103,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array(['b', 'c', 'e', 'g', 'n', 'p', 'w', 'y'], \n",
       "      dtype='|S21')"
      ]
     },
     "execution_count": 103,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "np.unique(train_data[:, 2])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 100,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(0.96419241338405048, 0.38027468867166203)"
      ]
     },
     "execution_count": 100,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "calc_entropy(train_data[:, 22]), calc_expected_entropy(np.hstack((train_data[:, 19].reshape(train_data.shape[0], 1),\n",
    "                                                    train_data[:, 22].reshape(train_data[:, 22].shape[0],\n",
    "                                                                   1))))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 104,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "a = 'a'\n",
    "b = 'b'\n",
    "test = np.array([[1, 1, 2, 3, 3, 3, 2, 1, 1, 3, 1, 2, 2, 3],\n",
    "                 [1, 1, 1, 2, 3, 3, 3, 2, 3, 2, 2, 2, 1, 2],\n",
    "                 [1, 1, 1, 1, 2, 2, 2, 1, 2, 2, 2, 1, 2, 1],\n",
    "                 [1, 2, 1, 1, 1, 2, 2, 1, 1, 1, 2, 2, 1, 2],\n",
    "                 [a, a, b, b, b, a, b, a, b, b, b, b, b, a]]).T"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 122,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "4\n"
     ]
    }
   ],
   "source": [
    "a = 5\n",
    "print a-1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 44,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[0, 1, 2, 3]\n",
      "[1, 2, 3]\n",
      "[1, 2, 3]\n"
     ]
    }
   ],
   "source": [
    "tree = build_tree(test[:, :4], test[:, 4], [0, 1, 2, 3])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 45,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      " None , 0\n",
      "     1 , 2\n",
      "         1 , a\n",
      "         2 , b\n",
      "     2 , b\n",
      "     3 , 3\n",
      "         1 , b\n",
      "         2 , a\n"
     ]
    }
   ],
   "source": [
    "print_tree(tree, \"\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 46,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'b'"
      ]
     },
     "execution_count": 46,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "predict(tree, np.array([2, 1, 1, 1]))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array(['a', 'a', 'b', 'b', 'b', 'a', 'b', 'a', 'b', 'b', 'b', 'b', 'b', 'a'], \n",
       "      dtype='|S21')"
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "test[:, 4]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 133,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0"
      ]
     },
     "execution_count": 133,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "pick_attribute(test[:, :4], test[:, 4], [0, 1, 2, 3])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 164,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[<__main__.TreeNode at 0x7fdc8e94acd0>, <__main__.TreeNode at 0x7fdc8e94aa10>]"
      ]
     },
     "execution_count": 164,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "tree.children[0].children"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 150,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "a = np.array([-1, 3, 3, 3, 5, 4, 2, 1, -7])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 132,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "a = np.empty((1, 5), dtype='|S16')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 139,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([['a', 'b', 'c', 'd', 'e'],\n",
       "       ['a', 'b', 'c', 'd', 'e']], \n",
       "      dtype='|S16')"
      ]
     },
     "execution_count": 139,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "np.vstack((a, [['a', 'b', 'c', 'd', 'e'], ['a', 'b', 'c', 'd', 'e']]))[1:, :]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 131,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[1, 3, 4, 8]"
      ]
     },
     "execution_count": 131,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "a"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 117,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "celltoolbar": "Raw Cell Format",
  "kernelspec": {
   "display_name": "Python 2",
   "language": "python",
   "name": "python2"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 1
}
	{
	"cells": [
	{
	"cell_type": "code",
	"execution_count": 1,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": [
	"import numpy as np"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 50,
	"metadata": {
	"collapsed": false
	},
	"outputs": [],
	"source": [
	"# Required subroutines\n",
	"\n",
	"def get_list_diff(a, b):\n",
	" \"\"\"Returns difference between 2 lists\"\"\"\n",
	" return list(set(a) - set(b))\n",
	"\n",
	"def get_distinct_values(col):\n",
	" \"\"\"Returns distinct values in an attribute and\n",
	" their counts.\n",
	" \"\"\"\n",
	" return np.unique(col, return_counts=True)\n",
	"\n",
	"def calc_entropy(a):\n",
	" \"\"\"Calculates entropy of a single value of an attribute.\"\"\"\n",
	" total = a.shape[0]\n",
	" entropy = 0\n",
	" values, counts = get_distinct_values(a)\n",
	" for count in counts:\n",
	" prob = count/float(total)\n",
	" entropy += (-1)probnp.log2(prob)\n",
	" return entropy\n",
	"\n",
	"def calc_expected_entropy(a):\n",
	" \"\"\"Calculates expected entropy of an attribute.\"\"\"\n",
	" total = a.shape[0]\n",
	" expected_entropy = 0\n",
	" values, counts = get_distinct_values(a[:, 0])\n",
	" for value, count in zip(values, counts): \n",
	" value_entropy = calc_entropy(a[np.where(a[:, 0] == value),\n",
	" 1][0])\n",
	" expected_entropy += (count/float(total))*value_entropy\n",
	" return expected_entropy\n",
	"\n",
	"def calc_information_gain(a):\n",
	" \"\"\"Calculate information gain of an attribute.\"\"\"\n",
	" return calc_entropy(a[:,1]) - calc_expected_entropy(a)\n",
	"\n",
	"def pick_attribute(features, labels, attributes):\n",
	" \"\"\"Picks the best attribute based on the information\n",
	" gain.\n",
	" \"\"\"\n",
	" att = attributes[0]\n",
	" max_info_gain = 0\n",
	" for i in attributes:\n",
	" info_gain = calc_information_gain(np.hstack((features[:, i].reshape(features.shape[0], 1),\n",
	" labels.reshape(labels.shape[0],\n",
	" 1))))\n",
	" if info_gain > max_info_gain:\n",
	" max_info_gain = info_gain\n",
	" att = i \n",
	" return att"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 3,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": [
	"# Tree structure\n",
	"\n",
	"class TreeNode(object):\n",
	" \"\"\"Class that stores information of a decision tree node\"\"\"\n",
	" def __init__(self, value=None, split_attribute=None, children=None, depth=None):\n",
	" self.value = value\n",
	" # If leaf, split_attribute will be label\n",
	" self.split_attribute = split_attribute\n",
	" # If leaf, children will an emptly list\n",
	" self.children = children\n",
	" \n",
	" def set_split_attribute(self, split_attribute):\n",
	" self.split_attribute = split_attribute\n",
	" \n",
	" def set_value(self, value):\n",
	" self.value = value\n",
	" \n",
	" def set_children(self, children):\n",
	" self.children = children"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 123,
	"metadata": {
	"collapsed": false
	},
	"outputs": [],
	"source": [
	"# Build tree\n",
	" \n",
	"def build_tree(features, labels, attributes, all_features, depth = 30):\n",
	" \"\"\"Builds decition tree with the feature vectors and labels\"\"\"\n",
	" # If all examples have same label\n",
	" if (np.unique(labels).shape[0] == 1 or depth == 0):\n",
	" return TreeNode(split_attribute=labels[0])\n",
	" \n",
	" # Create root node\n",
	" root_node = TreeNode()\n",
	" children = []\n",
	" # Pick the best attribute to divide upon\n",
	" best_att = pick_attribute(features, labels, attributes)\n",
	" root_node.set_split_attribute(best_att)\n",
	" \n",
	" # Iterate over each distinct value of the selected attribute\n",
	" for value in np.unique(all_features[:, best_att]):\n",
	" subset_indices = np.where(features[:, best_att] == value)\n",
	" if subset_indices[0].shape[0] == 0 or len(attributes) == 1:\n",
	" # If no examples, add a leaf node with most frequent label\n",
	" u, indices = np.unique(labels, return_inverse=True) \n",
	" label = u[np.argmax(np.bincount(indices))]\n",
	" child = TreeNode(value=value, split_attribute=label)\n",
	" else:\n",
	" # Else recurse with subset of examples\n",
	" child = build_tree(features[subset_indices, :][0],\n",
	" labels[subset_indices],\n",
	" get_list_diff(attributes, [best_att]),\n",
	" all_features,\n",
	" depth-1)\n",
	" child.set_value(value)\n",
	" children.append(child)\n",
	" root_node.set_children(children) \n",
	" return root_node"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 63,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": [
	"def print_tree(tree, s):\n",
	" \"\"\"Print decision tree\"\"\"\n",
	" if tree.children == None:\n",
	" print s, tree.value, \",\", tree.split_attribute\n",
	" return \n",
	" else:\n",
	" print s, tree.value, \",\", tree.split_attribute\n",
	" s += \" \"\n",
	" for child in tree.children:\n",
	" print_tree(child, s)\n",
	"\n",
	"def predict(tree, feature_vector):\n",
	" \"\"\"Predict label for a given feature vector\"\"\"\n",
	" if tree.children == None:\n",
	" return tree.split_attribute\n",
	" else:\n",
	" feature_value = feature_vector[tree.split_attribute]\n",
	" for child in tree.children:\n",
	" # (Needed to put int(), otherwise comparison does not work!)\n",
	"# if int(feature_value) == int(child.value):\n",
	" if feature_value == child.value:\n",
	" return predict(child, feature_vector)\n",
	" return None"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 140,
	"metadata": {
	"collapsed": false
	},
	"outputs": [],
	"source": [
	"# File reading\n",
	"def read_file(file_path):\n",
	" \"\"\"Reads a text file and returns it as a ndarray.\"\"\"\n",
	" return np.genfromtxt(file_path, delimiter=',', dtype='\|S21')\n",
	"\n",
	"# Evaluation\n",
	"def calc_accuracy(tree, test_set):\n",
	" \"\"\"Calcuates classification accuracy of the decision tree with\n",
	" a test data set.\n",
	" \"\"\"\n",
	" number_of_features = test_set.shape[1] - 1\n",
	" correct_count = 0\n",
	" total = test_set.shape[0]\n",
	" \n",
	" for i in range(total):\n",
	" pred = predict(tree, test_set[i, :number_of_features])\n",
	" if pred == test_set[i, number_of_features]:\n",
	" correct_count += 1\n",
	" return correct_count/float(total)\n",
	"\n",
	"# k-fold cross validation\n",
	"def k_fold_cross_validate(tree, datasets, depth, n_features):\n",
	" \"\"\"Perferms k-fold cross validation and returns average accuracy.\"\"\" \n",
	" average_accuracy = 0\n",
	" for i in range(len(datasets)):\n",
	" train = np.empty((1, datasets[i].shape[1]), dtype='\|S16')\n",
	" for j in range(dataset):\n",
	" if (j!=i):\n",
	" train = np.vstack((train, datasets[j]))\n",
	" train = train[1:, :]\n",
	" tree = build_tree(train[:, :n_features], train[:, n_features],\n",
	" np.arange(n_features), train[:, :n_features], depth) \n",
	" average_accuracy += calc_accuracy(tree, datasets[i])\n",
	" average_accuracy /= float(len(datasets))\n",
	" return average_accuracy\n"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 142,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"[array([['x', 's', 'y', ..., 'n', 'g', 'e'],\n",
	" ['b', 's', 'w', ..., 'n', 'm', 'e'],\n",
	" ['x', 'y', 'w', ..., 's', 'u', 'p'],\n",
	" ..., \n",
	" ['f', 'f', 'g', ..., 's', 'g', 'e'],\n",
	" ['f', 's', 'n', ..., 'v', 'g', 'p'],\n",
	" ['x', 'y', 'y', ..., 'n', 'm', 'e']], \n",
	" dtype='\|S21'), array([['f', 's', 'g', ..., 'a', 'g', 'e'],\n",
	" ['x', 's', 'w', ..., 's', 'g', 'p'],\n",
	" ['x', 'f', 'g', ..., 's', 'g', 'e'],\n",
	" ..., \n",
	" ['f', 'f', 'w', ..., 'a', 'g', 'e'],\n",
	" ['f', 'f', 'g', ..., 'v', 'd', 'e'],\n",
	" ['x', 'f', 'n', ..., 'v', 'd', 'e']], \n",
	" dtype='\|S21'), array([['x', 's', 'w', ..., 'a', 'g', 'e'],\n",
	" ['x', 's', 'n', ..., 'a', 'g', 'e'],\n",
	" ['x', 'y', 'n', ..., 'v', 'd', 'e'],\n",
	" ..., \n",
	" ['f', 'f', 'n', ..., 's', 'g', 'e'],\n",
	" ['x', 'f', 'e', ..., 'y', 'd', 'e'],\n",
	" ['x', 'y', 'g', ..., 'y', 'd', 'e']], \n",
	" dtype='\|S21'), array([['x', 'y', 'e', ..., 'y', 'd', 'e'],\n",
	" ['f', 'f', 'n', ..., 'v', 'd', 'e'],\n",
	" ['f', 'f', 'g', ..., 'v', 'd', 'e'],\n",
	" ..., \n",
	" ['f', 'f', 'g', ..., 'v', 'd', 'e'],\n",
	" ['f', 'y', 'n', ..., 'y', 'd', 'e'],\n",
	" ['f', 'y', 'e', ..., 'v', 'd', 'e']], \n",
	" dtype='\|S21'), array([['f', 'y', 'n', ..., 'v', 'd', 'e'],\n",
	" ['x', 'f', 'g', ..., 'y', 'd', 'e'],\n",
	" ['f', 'y', 'n', ..., 'y', 'd', 'e'],\n",
	" ..., \n",
	" ['f', 'f', 'g', ..., 'v', 'd', 'p'],\n",
	" ['f', 'y', 'g', ..., 'y', 'p', 'p'],\n",
	" ['x', 'f', 'y', ..., 'v', 'd', 'p']], \n",
	" dtype='\|S21'), array([['x', 'y', 'y', ..., 'y', 'p', 'p'],\n",
	" ['f', 'f', 'y', ..., 'v', 'd', 'p'],\n",
	" ['x', 'y', 'g', ..., 'y', 'd', 'p'],\n",
	" ..., \n",
	" ['f', 's', 'n', ..., 'y', 'd', 'e'],\n",
	" ['b', 'y', 'n', ..., 'y', 'p', 'e'],\n",
	" ['x', 'y', 'n', ..., 'y', 'p', 'e']], \n",
	" dtype='\|S21')]"
	]
	},
	"execution_count": 142,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"datasets = []\n",
	"k = 6\n",
	"for i in range(k):\n",
	" file_name = 'datasets/SettingA/CVSplits/training_0' + str(i) + '.data'\n",
	" datasets.append(read_file(file_name))\n",
	"datasets"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 117,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"(3822, 23) (1822, 23)\n"
	]
	}
	],
	"source": [
	"train_data = read_file('datasets/SettingA/training.data')\n",
	"test_data = read_file('datasets/SettingA/CVSplits/training_00.data')\n",
	"print train_data.shape, test_data.shape"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 126,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	" None , 4\n",
	" a , e\n",
	" c , p\n",
	" f , p\n",
	" l , e\n",
	" m , p\n",
	" n , 2\n",
	" b , p\n",
	" c , e\n",
	" e , e\n",
	" g , e\n",
	" n , e\n",
	" p , 0\n",
	" b , p\n",
	" c , p\n",
	" f , 8\n",
	" g , p\n",
	" h , p\n",
	" k , p\n",
	" n , p\n",
	" p , p\n",
	" r , p\n",
	" u , p\n",
	" w , 1\n",
	" f , e\n",
	" g , e\n",
	" s , p\n",
	" y , p\n",
	" y , p\n",
	" k , p\n",
	" s , p\n",
	" x , e\n",
	" w , 3\n",
	" f , e\n",
	" t , p\n",
	" y , p\n",
	" p , p\n"
	]
	}
	],
	"source": [
	"tree = build_tree(train_data[:, :15], train_data[:, 22], list(np.arange(15)), train_data[:, :15], 5)\n",
	"print_tree(tree, \"\")"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 119,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"1.0"
	]
	},
	"execution_count": 119,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"calc_accuracy(tree, test_data)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 103,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"array(['b', 'c', 'e', 'g', 'n', 'p', 'w', 'y'], \n",
	" dtype='\|S21')"
	]
	},
	"execution_count": 103,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"np.unique(train_data[:, 2])"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 100,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"(0.96419241338405048, 0.38027468867166203)"
	]
	},
	"execution_count": 100,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"calc_entropy(train_data[:, 22]), calc_expected_entropy(np.hstack((train_data[:, 19].reshape(train_data.shape[0], 1),\n",
	" train_data[:, 22].reshape(train_data[:, 22].shape[0],\n",
	" 1))))"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 104,
	"metadata": {
	"collapsed": false
	},
	"outputs": [],
	"source": [
	"a = 'a'\n",
	"b = 'b'\n",
	"test = np.array([[1, 1, 2, 3, 3, 3, 2, 1, 1, 3, 1, 2, 2, 3],\n",
	" [1, 1, 1, 2, 3, 3, 3, 2, 3, 2, 2, 2, 1, 2],\n",
	" [1, 1, 1, 1, 2, 2, 2, 1, 2, 2, 2, 1, 2, 1],\n",
	" [1, 2, 1, 1, 1, 2, 2, 1, 1, 1, 2, 2, 1, 2],\n",
	" [a, a, b, b, b, a, b, a, b, b, b, b, b, a]]).T"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 122,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"4\n"
	]
	}
	],
	"source": [
	"a = 5\n",
	"print a-1"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 44,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"[0, 1, 2, 3]\n",
	"[1, 2, 3]\n",
	"[1, 2, 3]\n"
	]
	}
	],
	"source": [
	"tree = build_tree(test[:, :4], test[:, 4], [0, 1, 2, 3])"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 45,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	" None , 0\n",
	" 1 , 2\n",
	" 1 , a\n",
	" 2 , b\n",
	" 2 , b\n",
	" 3 , 3\n",
	" 1 , b\n",
	" 2 , a\n"
	]
	}
	],
	"source": [
	"print_tree(tree, \"\")"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 46,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"'b'"
	]
	},
	"execution_count": 46,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"predict(tree, np.array([2, 1, 1, 1]))"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 14,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"array(['a', 'a', 'b', 'b', 'b', 'a', 'b', 'a', 'b', 'b', 'b', 'b', 'b', 'a'], \n",
	" dtype='\|S21')"
	]
	},
	"execution_count": 14,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"test[:, 4]"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 133,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"0"
	]
	},
	"execution_count": 133,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"pick_attribute(test[:, :4], test[:, 4], [0, 1, 2, 3])"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 164,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"[<__main__.TreeNode at 0x7fdc8e94acd0>, <__main__.TreeNode at 0x7fdc8e94aa10>]"
	]
	},
	"execution_count": 164,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"tree.children[0].children"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 150,
	"metadata": {
	"collapsed": false
	},
	"outputs": [],
	"source": [
	"a = np.array([-1, 3, 3, 3, 5, 4, 2, 1, -7])"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 132,
	"metadata": {
	"collapsed": false
	},
	"outputs": [],
	"source": [
	"a = np.empty((1, 5), dtype='\|S16')"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 139,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"array([['a', 'b', 'c', 'd', 'e'],\n",
	" ['a', 'b', 'c', 'd', 'e']], \n",
	" dtype='\|S16')"
	]
	},
	"execution_count": 139,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"np.vstack((a, [['a', 'b', 'c', 'd', 'e'], ['a', 'b', 'c', 'd', 'e']]))[1:, :]"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 131,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"[1, 3, 4, 8]"
	]
	},
	"execution_count": 131,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"a"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 117,
	"metadata": {
	"collapsed": false
	},
	"outputs": [],
	"source": []
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {
	"collapsed": false
	},
	"outputs": [],
	"source": []
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {
	"collapsed": false
	},
	"outputs": [],
	"source": []
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": []
	}
	],
	"metadata": {
	"celltoolbar": "Raw Cell Format",
	"kernelspec": {
	"display_name": "Python 2",
	"language": "python",
	"name": "python2"
	},
	"language_info": {
	"codemirror_mode": {
	"name": "ipython",
	"version": 2
	},
	"file_extension": ".py",
	"mimetype": "text/x-python",
	"name": "python",
	"nbconvert_exporter": "python",
	"pygments_lexer": "ipython2",
	"version": "2.7.12"
	}
	},
	"nbformat": 4,
	"nbformat_minor": 1
	}