Last active
September 12, 2016 15:50
-
-
Save maheshakya/c6c3011c476672d0f2c21855a278e27d to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"execution_count": 1, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"import numpy as np" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 50, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"# Required subroutines\n", | |
"\n", | |
"def get_list_diff(a, b):\n", | |
" \"\"\"Returns difference between 2 lists\"\"\"\n", | |
" return list(set(a) - set(b))\n", | |
"\n", | |
"def get_distinct_values(col):\n", | |
" \"\"\"Returns distinct values in an attribute and\n", | |
" their counts.\n", | |
" \"\"\"\n", | |
" return np.unique(col, return_counts=True)\n", | |
"\n", | |
"def calc_entropy(a):\n", | |
" \"\"\"Calculates entropy of a single value of an attribute.\"\"\"\n", | |
" total = a.shape[0]\n", | |
" entropy = 0\n", | |
" values, counts = get_distinct_values(a)\n", | |
" for count in counts:\n", | |
" prob = count/float(total)\n", | |
" entropy += (-1)*prob*np.log2(prob)\n", | |
" return entropy\n", | |
"\n", | |
"def calc_expected_entropy(a):\n", | |
" \"\"\"Calculates expected entropy of an attribute.\"\"\"\n", | |
" total = a.shape[0]\n", | |
" expected_entropy = 0\n", | |
" values, counts = get_distinct_values(a[:, 0])\n", | |
" for value, count in zip(values, counts): \n", | |
" value_entropy = calc_entropy(a[np.where(a[:, 0] == value),\n", | |
" 1][0])\n", | |
" expected_entropy += (count/float(total))*value_entropy\n", | |
" return expected_entropy\n", | |
"\n", | |
"def calc_information_gain(a):\n", | |
" \"\"\"Calculate information gain of an attribute.\"\"\"\n", | |
" return calc_entropy(a[:,1]) - calc_expected_entropy(a)\n", | |
"\n", | |
"def pick_attribute(features, labels, attributes):\n", | |
" \"\"\"Picks the best attribute based on the information\n", | |
" gain.\n", | |
" \"\"\"\n", | |
" att = attributes[0]\n", | |
" max_info_gain = 0\n", | |
" for i in attributes:\n", | |
" info_gain = calc_information_gain(np.hstack((features[:, i].reshape(features.shape[0], 1),\n", | |
" labels.reshape(labels.shape[0],\n", | |
" 1))))\n", | |
" if info_gain > max_info_gain:\n", | |
" max_info_gain = info_gain\n", | |
" att = i \n", | |
" return att" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 3, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"# Tree structure\n", | |
"\n", | |
"class TreeNode(object):\n", | |
" \"\"\"Class that stores information of a decision tree node\"\"\"\n", | |
" def __init__(self, value=None, split_attribute=None, children=None, depth=None):\n", | |
" self.value = value\n", | |
" # If leaf, split_attribute will be label\n", | |
" self.split_attribute = split_attribute\n", | |
" # If leaf, children will an emptly list\n", | |
" self.children = children\n", | |
" \n", | |
" def set_split_attribute(self, split_attribute):\n", | |
" self.split_attribute = split_attribute\n", | |
" \n", | |
" def set_value(self, value):\n", | |
" self.value = value\n", | |
" \n", | |
" def set_children(self, children):\n", | |
" self.children = children" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 123, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"# Build tree\n", | |
" \n", | |
"def build_tree(features, labels, attributes, all_features, depth = 30):\n", | |
" \"\"\"Builds decition tree with the feature vectors and labels\"\"\"\n", | |
" # If all examples have same label\n", | |
" if (np.unique(labels).shape[0] == 1 or depth == 0):\n", | |
" return TreeNode(split_attribute=labels[0])\n", | |
" \n", | |
" # Create root node\n", | |
" root_node = TreeNode()\n", | |
" children = []\n", | |
" # Pick the best attribute to divide upon\n", | |
" best_att = pick_attribute(features, labels, attributes)\n", | |
" root_node.set_split_attribute(best_att)\n", | |
" \n", | |
" # Iterate over each distinct value of the selected attribute\n", | |
" for value in np.unique(all_features[:, best_att]):\n", | |
" subset_indices = np.where(features[:, best_att] == value)\n", | |
" if subset_indices[0].shape[0] == 0 or len(attributes) == 1:\n", | |
" # If no examples, add a leaf node with most frequent label\n", | |
" u, indices = np.unique(labels, return_inverse=True) \n", | |
" label = u[np.argmax(np.bincount(indices))]\n", | |
" child = TreeNode(value=value, split_attribute=label)\n", | |
" else:\n", | |
" # Else recurse with subset of examples\n", | |
" child = build_tree(features[subset_indices, :][0],\n", | |
" labels[subset_indices],\n", | |
" get_list_diff(attributes, [best_att]),\n", | |
" all_features,\n", | |
" depth-1)\n", | |
" child.set_value(value)\n", | |
" children.append(child)\n", | |
" root_node.set_children(children) \n", | |
" return root_node" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 63, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"def print_tree(tree, s):\n", | |
" \"\"\"Print decision tree\"\"\"\n", | |
" if tree.children == None:\n", | |
" print s, tree.value, \",\", tree.split_attribute\n", | |
" return \n", | |
" else:\n", | |
" print s, tree.value, \",\", tree.split_attribute\n", | |
" s += \" \"\n", | |
" for child in tree.children:\n", | |
" print_tree(child, s)\n", | |
"\n", | |
"def predict(tree, feature_vector):\n", | |
" \"\"\"Predict label for a given feature vector\"\"\"\n", | |
" if tree.children == None:\n", | |
" return tree.split_attribute\n", | |
" else:\n", | |
" feature_value = feature_vector[tree.split_attribute]\n", | |
" for child in tree.children:\n", | |
" # (Needed to put int(), otherwise comparison does not work!)\n", | |
"# if int(feature_value) == int(child.value):\n", | |
" if feature_value == child.value:\n", | |
" return predict(child, feature_vector)\n", | |
" return None" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 140, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"# File reading\n", | |
"def read_file(file_path):\n", | |
" \"\"\"Reads a text file and returns it as a ndarray.\"\"\"\n", | |
" return np.genfromtxt(file_path, delimiter=',', dtype='|S21')\n", | |
"\n", | |
"# Evaluation\n", | |
"def calc_accuracy(tree, test_set):\n", | |
" \"\"\"Calcuates classification accuracy of the decision tree with\n", | |
" a test data set.\n", | |
" \"\"\"\n", | |
" number_of_features = test_set.shape[1] - 1\n", | |
" correct_count = 0\n", | |
" total = test_set.shape[0]\n", | |
" \n", | |
" for i in range(total):\n", | |
" pred = predict(tree, test_set[i, :number_of_features])\n", | |
" if pred == test_set[i, number_of_features]:\n", | |
" correct_count += 1\n", | |
" return correct_count/float(total)\n", | |
"\n", | |
"# k-fold cross validation\n", | |
"def k_fold_cross_validate(tree, datasets, depth, n_features):\n", | |
" \"\"\"Perferms k-fold cross validation and returns average accuracy.\"\"\" \n", | |
" average_accuracy = 0\n", | |
" for i in range(len(datasets)):\n", | |
" train = np.empty((1, datasets[i].shape[1]), dtype='|S16')\n", | |
" for j in range(dataset):\n", | |
" if (j!=i):\n", | |
" train = np.vstack((train, datasets[j]))\n", | |
" train = train[1:, :]\n", | |
" tree = build_tree(train[:, :n_features], train[:, n_features],\n", | |
" np.arange(n_features), train[:, :n_features], depth) \n", | |
" average_accuracy += calc_accuracy(tree, datasets[i])\n", | |
" average_accuracy /= float(len(datasets))\n", | |
" return average_accuracy\n" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 142, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"[array([['x', 's', 'y', ..., 'n', 'g', 'e'],\n", | |
" ['b', 's', 'w', ..., 'n', 'm', 'e'],\n", | |
" ['x', 'y', 'w', ..., 's', 'u', 'p'],\n", | |
" ..., \n", | |
" ['f', 'f', 'g', ..., 's', 'g', 'e'],\n", | |
" ['f', 's', 'n', ..., 'v', 'g', 'p'],\n", | |
" ['x', 'y', 'y', ..., 'n', 'm', 'e']], \n", | |
" dtype='|S21'), array([['f', 's', 'g', ..., 'a', 'g', 'e'],\n", | |
" ['x', 's', 'w', ..., 's', 'g', 'p'],\n", | |
" ['x', 'f', 'g', ..., 's', 'g', 'e'],\n", | |
" ..., \n", | |
" ['f', 'f', 'w', ..., 'a', 'g', 'e'],\n", | |
" ['f', 'f', 'g', ..., 'v', 'd', 'e'],\n", | |
" ['x', 'f', 'n', ..., 'v', 'd', 'e']], \n", | |
" dtype='|S21'), array([['x', 's', 'w', ..., 'a', 'g', 'e'],\n", | |
" ['x', 's', 'n', ..., 'a', 'g', 'e'],\n", | |
" ['x', 'y', 'n', ..., 'v', 'd', 'e'],\n", | |
" ..., \n", | |
" ['f', 'f', 'n', ..., 's', 'g', 'e'],\n", | |
" ['x', 'f', 'e', ..., 'y', 'd', 'e'],\n", | |
" ['x', 'y', 'g', ..., 'y', 'd', 'e']], \n", | |
" dtype='|S21'), array([['x', 'y', 'e', ..., 'y', 'd', 'e'],\n", | |
" ['f', 'f', 'n', ..., 'v', 'd', 'e'],\n", | |
" ['f', 'f', 'g', ..., 'v', 'd', 'e'],\n", | |
" ..., \n", | |
" ['f', 'f', 'g', ..., 'v', 'd', 'e'],\n", | |
" ['f', 'y', 'n', ..., 'y', 'd', 'e'],\n", | |
" ['f', 'y', 'e', ..., 'v', 'd', 'e']], \n", | |
" dtype='|S21'), array([['f', 'y', 'n', ..., 'v', 'd', 'e'],\n", | |
" ['x', 'f', 'g', ..., 'y', 'd', 'e'],\n", | |
" ['f', 'y', 'n', ..., 'y', 'd', 'e'],\n", | |
" ..., \n", | |
" ['f', 'f', 'g', ..., 'v', 'd', 'p'],\n", | |
" ['f', 'y', 'g', ..., 'y', 'p', 'p'],\n", | |
" ['x', 'f', 'y', ..., 'v', 'd', 'p']], \n", | |
" dtype='|S21'), array([['x', 'y', 'y', ..., 'y', 'p', 'p'],\n", | |
" ['f', 'f', 'y', ..., 'v', 'd', 'p'],\n", | |
" ['x', 'y', 'g', ..., 'y', 'd', 'p'],\n", | |
" ..., \n", | |
" ['f', 's', 'n', ..., 'y', 'd', 'e'],\n", | |
" ['b', 'y', 'n', ..., 'y', 'p', 'e'],\n", | |
" ['x', 'y', 'n', ..., 'y', 'p', 'e']], \n", | |
" dtype='|S21')]" | |
] | |
}, | |
"execution_count": 142, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"datasets = []\n", | |
"k = 6\n", | |
"for i in range(k):\n", | |
" file_name = 'datasets/SettingA/CVSplits/training_0' + str(i) + '.data'\n", | |
" datasets.append(read_file(file_name))\n", | |
"datasets" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 117, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"(3822, 23) (1822, 23)\n" | |
] | |
} | |
], | |
"source": [ | |
"train_data = read_file('datasets/SettingA/training.data')\n", | |
"test_data = read_file('datasets/SettingA/CVSplits/training_00.data')\n", | |
"print train_data.shape, test_data.shape" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 126, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
" None , 4\n", | |
" a , e\n", | |
" c , p\n", | |
" f , p\n", | |
" l , e\n", | |
" m , p\n", | |
" n , 2\n", | |
" b , p\n", | |
" c , e\n", | |
" e , e\n", | |
" g , e\n", | |
" n , e\n", | |
" p , 0\n", | |
" b , p\n", | |
" c , p\n", | |
" f , 8\n", | |
" g , p\n", | |
" h , p\n", | |
" k , p\n", | |
" n , p\n", | |
" p , p\n", | |
" r , p\n", | |
" u , p\n", | |
" w , 1\n", | |
" f , e\n", | |
" g , e\n", | |
" s , p\n", | |
" y , p\n", | |
" y , p\n", | |
" k , p\n", | |
" s , p\n", | |
" x , e\n", | |
" w , 3\n", | |
" f , e\n", | |
" t , p\n", | |
" y , p\n", | |
" p , p\n" | |
] | |
} | |
], | |
"source": [ | |
"tree = build_tree(train_data[:, :15], train_data[:, 22], list(np.arange(15)), train_data[:, :15], 5)\n", | |
"print_tree(tree, \"\")" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 119, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"1.0" | |
] | |
}, | |
"execution_count": 119, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"calc_accuracy(tree, test_data)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 103, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"array(['b', 'c', 'e', 'g', 'n', 'p', 'w', 'y'], \n", | |
" dtype='|S21')" | |
] | |
}, | |
"execution_count": 103, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"np.unique(train_data[:, 2])" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 100, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"(0.96419241338405048, 0.38027468867166203)" | |
] | |
}, | |
"execution_count": 100, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"calc_entropy(train_data[:, 22]), calc_expected_entropy(np.hstack((train_data[:, 19].reshape(train_data.shape[0], 1),\n", | |
" train_data[:, 22].reshape(train_data[:, 22].shape[0],\n", | |
" 1))))" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 104, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"a = 'a'\n", | |
"b = 'b'\n", | |
"test = np.array([[1, 1, 2, 3, 3, 3, 2, 1, 1, 3, 1, 2, 2, 3],\n", | |
" [1, 1, 1, 2, 3, 3, 3, 2, 3, 2, 2, 2, 1, 2],\n", | |
" [1, 1, 1, 1, 2, 2, 2, 1, 2, 2, 2, 1, 2, 1],\n", | |
" [1, 2, 1, 1, 1, 2, 2, 1, 1, 1, 2, 2, 1, 2],\n", | |
" [a, a, b, b, b, a, b, a, b, b, b, b, b, a]]).T" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 122, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"4\n" | |
] | |
} | |
], | |
"source": [ | |
"a = 5\n", | |
"print a-1" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 44, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"[0, 1, 2, 3]\n", | |
"[1, 2, 3]\n", | |
"[1, 2, 3]\n" | |
] | |
} | |
], | |
"source": [ | |
"tree = build_tree(test[:, :4], test[:, 4], [0, 1, 2, 3])" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 45, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
" None , 0\n", | |
" 1 , 2\n", | |
" 1 , a\n", | |
" 2 , b\n", | |
" 2 , b\n", | |
" 3 , 3\n", | |
" 1 , b\n", | |
" 2 , a\n" | |
] | |
} | |
], | |
"source": [ | |
"print_tree(tree, \"\")" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 46, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"'b'" | |
] | |
}, | |
"execution_count": 46, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"predict(tree, np.array([2, 1, 1, 1]))" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 14, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"array(['a', 'a', 'b', 'b', 'b', 'a', 'b', 'a', 'b', 'b', 'b', 'b', 'b', 'a'], \n", | |
" dtype='|S21')" | |
] | |
}, | |
"execution_count": 14, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"test[:, 4]" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 133, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"0" | |
] | |
}, | |
"execution_count": 133, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"pick_attribute(test[:, :4], test[:, 4], [0, 1, 2, 3])" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 164, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"[<__main__.TreeNode at 0x7fdc8e94acd0>, <__main__.TreeNode at 0x7fdc8e94aa10>]" | |
] | |
}, | |
"execution_count": 164, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"tree.children[0].children" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 150, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"a = np.array([-1, 3, 3, 3, 5, 4, 2, 1, -7])" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 132, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"a = np.empty((1, 5), dtype='|S16')" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 139, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"array([['a', 'b', 'c', 'd', 'e'],\n", | |
" ['a', 'b', 'c', 'd', 'e']], \n", | |
" dtype='|S16')" | |
] | |
}, | |
"execution_count": 139, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"np.vstack((a, [['a', 'b', 'c', 'd', 'e'], ['a', 'b', 'c', 'd', 'e']]))[1:, :]" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 131, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"[1, 3, 4, 8]" | |
] | |
}, | |
"execution_count": 131, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"a" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 117, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [] | |
} | |
], | |
"metadata": { | |
"celltoolbar": "Raw Cell Format", | |
"kernelspec": { | |
"display_name": "Python 2", | |
"language": "python", | |
"name": "python2" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 2 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython2", | |
"version": "2.7.12" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 1 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment