Skip to content

Instantly share code, notes, and snippets.

@maheshakya
Last active September 12, 2016 15:50
Show Gist options
  • Save maheshakya/c6c3011c476672d0f2c21855a278e27d to your computer and use it in GitHub Desktop.
Save maheshakya/c6c3011c476672d0f2c21855a278e27d to your computer and use it in GitHub Desktop.
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"import numpy as np"
]
},
{
"cell_type": "code",
"execution_count": 50,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# Required subroutines\n",
"\n",
"def get_list_diff(a, b):\n",
" \"\"\"Returns difference between 2 lists\"\"\"\n",
" return list(set(a) - set(b))\n",
"\n",
"def get_distinct_values(col):\n",
" \"\"\"Returns distinct values in an attribute and\n",
" their counts.\n",
" \"\"\"\n",
" return np.unique(col, return_counts=True)\n",
"\n",
"def calc_entropy(a):\n",
" \"\"\"Calculates entropy of a single value of an attribute.\"\"\"\n",
" total = a.shape[0]\n",
" entropy = 0\n",
" values, counts = get_distinct_values(a)\n",
" for count in counts:\n",
" prob = count/float(total)\n",
" entropy += (-1)*prob*np.log2(prob)\n",
" return entropy\n",
"\n",
"def calc_expected_entropy(a):\n",
" \"\"\"Calculates expected entropy of an attribute.\"\"\"\n",
" total = a.shape[0]\n",
" expected_entropy = 0\n",
" values, counts = get_distinct_values(a[:, 0])\n",
" for value, count in zip(values, counts): \n",
" value_entropy = calc_entropy(a[np.where(a[:, 0] == value),\n",
" 1][0])\n",
" expected_entropy += (count/float(total))*value_entropy\n",
" return expected_entropy\n",
"\n",
"def calc_information_gain(a):\n",
" \"\"\"Calculate information gain of an attribute.\"\"\"\n",
" return calc_entropy(a[:,1]) - calc_expected_entropy(a)\n",
"\n",
"def pick_attribute(features, labels, attributes):\n",
" \"\"\"Picks the best attribute based on the information\n",
" gain.\n",
" \"\"\"\n",
" att = attributes[0]\n",
" max_info_gain = 0\n",
" for i in attributes:\n",
" info_gain = calc_information_gain(np.hstack((features[:, i].reshape(features.shape[0], 1),\n",
" labels.reshape(labels.shape[0],\n",
" 1))))\n",
" if info_gain > max_info_gain:\n",
" max_info_gain = info_gain\n",
" att = i \n",
" return att"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# Tree structure\n",
"\n",
"class TreeNode(object):\n",
" \"\"\"Class that stores information of a decision tree node\"\"\"\n",
" def __init__(self, value=None, split_attribute=None, children=None, depth=None):\n",
" self.value = value\n",
" # If leaf, split_attribute will be label\n",
" self.split_attribute = split_attribute\n",
" # If leaf, children will an emptly list\n",
" self.children = children\n",
" \n",
" def set_split_attribute(self, split_attribute):\n",
" self.split_attribute = split_attribute\n",
" \n",
" def set_value(self, value):\n",
" self.value = value\n",
" \n",
" def set_children(self, children):\n",
" self.children = children"
]
},
{
"cell_type": "code",
"execution_count": 123,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# Build tree\n",
" \n",
"def build_tree(features, labels, attributes, all_features, depth = 30):\n",
" \"\"\"Builds decition tree with the feature vectors and labels\"\"\"\n",
" # If all examples have same label\n",
" if (np.unique(labels).shape[0] == 1 or depth == 0):\n",
" return TreeNode(split_attribute=labels[0])\n",
" \n",
" # Create root node\n",
" root_node = TreeNode()\n",
" children = []\n",
" # Pick the best attribute to divide upon\n",
" best_att = pick_attribute(features, labels, attributes)\n",
" root_node.set_split_attribute(best_att)\n",
" \n",
" # Iterate over each distinct value of the selected attribute\n",
" for value in np.unique(all_features[:, best_att]):\n",
" subset_indices = np.where(features[:, best_att] == value)\n",
" if subset_indices[0].shape[0] == 0 or len(attributes) == 1:\n",
" # If no examples, add a leaf node with most frequent label\n",
" u, indices = np.unique(labels, return_inverse=True) \n",
" label = u[np.argmax(np.bincount(indices))]\n",
" child = TreeNode(value=value, split_attribute=label)\n",
" else:\n",
" # Else recurse with subset of examples\n",
" child = build_tree(features[subset_indices, :][0],\n",
" labels[subset_indices],\n",
" get_list_diff(attributes, [best_att]),\n",
" all_features,\n",
" depth-1)\n",
" child.set_value(value)\n",
" children.append(child)\n",
" root_node.set_children(children) \n",
" return root_node"
]
},
{
"cell_type": "code",
"execution_count": 63,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"def print_tree(tree, s):\n",
" \"\"\"Print decision tree\"\"\"\n",
" if tree.children == None:\n",
" print s, tree.value, \",\", tree.split_attribute\n",
" return \n",
" else:\n",
" print s, tree.value, \",\", tree.split_attribute\n",
" s += \" \"\n",
" for child in tree.children:\n",
" print_tree(child, s)\n",
"\n",
"def predict(tree, feature_vector):\n",
" \"\"\"Predict label for a given feature vector\"\"\"\n",
" if tree.children == None:\n",
" return tree.split_attribute\n",
" else:\n",
" feature_value = feature_vector[tree.split_attribute]\n",
" for child in tree.children:\n",
" # (Needed to put int(), otherwise comparison does not work!)\n",
"# if int(feature_value) == int(child.value):\n",
" if feature_value == child.value:\n",
" return predict(child, feature_vector)\n",
" return None"
]
},
{
"cell_type": "code",
"execution_count": 140,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# File reading\n",
"def read_file(file_path):\n",
" \"\"\"Reads a text file and returns it as a ndarray.\"\"\"\n",
" return np.genfromtxt(file_path, delimiter=',', dtype='|S21')\n",
"\n",
"# Evaluation\n",
"def calc_accuracy(tree, test_set):\n",
" \"\"\"Calcuates classification accuracy of the decision tree with\n",
" a test data set.\n",
" \"\"\"\n",
" number_of_features = test_set.shape[1] - 1\n",
" correct_count = 0\n",
" total = test_set.shape[0]\n",
" \n",
" for i in range(total):\n",
" pred = predict(tree, test_set[i, :number_of_features])\n",
" if pred == test_set[i, number_of_features]:\n",
" correct_count += 1\n",
" return correct_count/float(total)\n",
"\n",
"# k-fold cross validation\n",
"def k_fold_cross_validate(tree, datasets, depth, n_features):\n",
" \"\"\"Perferms k-fold cross validation and returns average accuracy.\"\"\" \n",
" average_accuracy = 0\n",
" for i in range(len(datasets)):\n",
" train = np.empty((1, datasets[i].shape[1]), dtype='|S16')\n",
" for j in range(dataset):\n",
" if (j!=i):\n",
" train = np.vstack((train, datasets[j]))\n",
" train = train[1:, :]\n",
" tree = build_tree(train[:, :n_features], train[:, n_features],\n",
" np.arange(n_features), train[:, :n_features], depth) \n",
" average_accuracy += calc_accuracy(tree, datasets[i])\n",
" average_accuracy /= float(len(datasets))\n",
" return average_accuracy\n"
]
},
{
"cell_type": "code",
"execution_count": 142,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"[array([['x', 's', 'y', ..., 'n', 'g', 'e'],\n",
" ['b', 's', 'w', ..., 'n', 'm', 'e'],\n",
" ['x', 'y', 'w', ..., 's', 'u', 'p'],\n",
" ..., \n",
" ['f', 'f', 'g', ..., 's', 'g', 'e'],\n",
" ['f', 's', 'n', ..., 'v', 'g', 'p'],\n",
" ['x', 'y', 'y', ..., 'n', 'm', 'e']], \n",
" dtype='|S21'), array([['f', 's', 'g', ..., 'a', 'g', 'e'],\n",
" ['x', 's', 'w', ..., 's', 'g', 'p'],\n",
" ['x', 'f', 'g', ..., 's', 'g', 'e'],\n",
" ..., \n",
" ['f', 'f', 'w', ..., 'a', 'g', 'e'],\n",
" ['f', 'f', 'g', ..., 'v', 'd', 'e'],\n",
" ['x', 'f', 'n', ..., 'v', 'd', 'e']], \n",
" dtype='|S21'), array([['x', 's', 'w', ..., 'a', 'g', 'e'],\n",
" ['x', 's', 'n', ..., 'a', 'g', 'e'],\n",
" ['x', 'y', 'n', ..., 'v', 'd', 'e'],\n",
" ..., \n",
" ['f', 'f', 'n', ..., 's', 'g', 'e'],\n",
" ['x', 'f', 'e', ..., 'y', 'd', 'e'],\n",
" ['x', 'y', 'g', ..., 'y', 'd', 'e']], \n",
" dtype='|S21'), array([['x', 'y', 'e', ..., 'y', 'd', 'e'],\n",
" ['f', 'f', 'n', ..., 'v', 'd', 'e'],\n",
" ['f', 'f', 'g', ..., 'v', 'd', 'e'],\n",
" ..., \n",
" ['f', 'f', 'g', ..., 'v', 'd', 'e'],\n",
" ['f', 'y', 'n', ..., 'y', 'd', 'e'],\n",
" ['f', 'y', 'e', ..., 'v', 'd', 'e']], \n",
" dtype='|S21'), array([['f', 'y', 'n', ..., 'v', 'd', 'e'],\n",
" ['x', 'f', 'g', ..., 'y', 'd', 'e'],\n",
" ['f', 'y', 'n', ..., 'y', 'd', 'e'],\n",
" ..., \n",
" ['f', 'f', 'g', ..., 'v', 'd', 'p'],\n",
" ['f', 'y', 'g', ..., 'y', 'p', 'p'],\n",
" ['x', 'f', 'y', ..., 'v', 'd', 'p']], \n",
" dtype='|S21'), array([['x', 'y', 'y', ..., 'y', 'p', 'p'],\n",
" ['f', 'f', 'y', ..., 'v', 'd', 'p'],\n",
" ['x', 'y', 'g', ..., 'y', 'd', 'p'],\n",
" ..., \n",
" ['f', 's', 'n', ..., 'y', 'd', 'e'],\n",
" ['b', 'y', 'n', ..., 'y', 'p', 'e'],\n",
" ['x', 'y', 'n', ..., 'y', 'p', 'e']], \n",
" dtype='|S21')]"
]
},
"execution_count": 142,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"datasets = []\n",
"k = 6\n",
"for i in range(k):\n",
" file_name = 'datasets/SettingA/CVSplits/training_0' + str(i) + '.data'\n",
" datasets.append(read_file(file_name))\n",
"datasets"
]
},
{
"cell_type": "code",
"execution_count": 117,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"(3822, 23) (1822, 23)\n"
]
}
],
"source": [
"train_data = read_file('datasets/SettingA/training.data')\n",
"test_data = read_file('datasets/SettingA/CVSplits/training_00.data')\n",
"print train_data.shape, test_data.shape"
]
},
{
"cell_type": "code",
"execution_count": 126,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" None , 4\n",
" a , e\n",
" c , p\n",
" f , p\n",
" l , e\n",
" m , p\n",
" n , 2\n",
" b , p\n",
" c , e\n",
" e , e\n",
" g , e\n",
" n , e\n",
" p , 0\n",
" b , p\n",
" c , p\n",
" f , 8\n",
" g , p\n",
" h , p\n",
" k , p\n",
" n , p\n",
" p , p\n",
" r , p\n",
" u , p\n",
" w , 1\n",
" f , e\n",
" g , e\n",
" s , p\n",
" y , p\n",
" y , p\n",
" k , p\n",
" s , p\n",
" x , e\n",
" w , 3\n",
" f , e\n",
" t , p\n",
" y , p\n",
" p , p\n"
]
}
],
"source": [
"tree = build_tree(train_data[:, :15], train_data[:, 22], list(np.arange(15)), train_data[:, :15], 5)\n",
"print_tree(tree, \"\")"
]
},
{
"cell_type": "code",
"execution_count": 119,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"1.0"
]
},
"execution_count": 119,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"calc_accuracy(tree, test_data)"
]
},
{
"cell_type": "code",
"execution_count": 103,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"array(['b', 'c', 'e', 'g', 'n', 'p', 'w', 'y'], \n",
" dtype='|S21')"
]
},
"execution_count": 103,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"np.unique(train_data[:, 2])"
]
},
{
"cell_type": "code",
"execution_count": 100,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"(0.96419241338405048, 0.38027468867166203)"
]
},
"execution_count": 100,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"calc_entropy(train_data[:, 22]), calc_expected_entropy(np.hstack((train_data[:, 19].reshape(train_data.shape[0], 1),\n",
" train_data[:, 22].reshape(train_data[:, 22].shape[0],\n",
" 1))))"
]
},
{
"cell_type": "code",
"execution_count": 104,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"a = 'a'\n",
"b = 'b'\n",
"test = np.array([[1, 1, 2, 3, 3, 3, 2, 1, 1, 3, 1, 2, 2, 3],\n",
" [1, 1, 1, 2, 3, 3, 3, 2, 3, 2, 2, 2, 1, 2],\n",
" [1, 1, 1, 1, 2, 2, 2, 1, 2, 2, 2, 1, 2, 1],\n",
" [1, 2, 1, 1, 1, 2, 2, 1, 1, 1, 2, 2, 1, 2],\n",
" [a, a, b, b, b, a, b, a, b, b, b, b, b, a]]).T"
]
},
{
"cell_type": "code",
"execution_count": 122,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"4\n"
]
}
],
"source": [
"a = 5\n",
"print a-1"
]
},
{
"cell_type": "code",
"execution_count": 44,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[0, 1, 2, 3]\n",
"[1, 2, 3]\n",
"[1, 2, 3]\n"
]
}
],
"source": [
"tree = build_tree(test[:, :4], test[:, 4], [0, 1, 2, 3])"
]
},
{
"cell_type": "code",
"execution_count": 45,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" None , 0\n",
" 1 , 2\n",
" 1 , a\n",
" 2 , b\n",
" 2 , b\n",
" 3 , 3\n",
" 1 , b\n",
" 2 , a\n"
]
}
],
"source": [
"print_tree(tree, \"\")"
]
},
{
"cell_type": "code",
"execution_count": 46,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"'b'"
]
},
"execution_count": 46,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"predict(tree, np.array([2, 1, 1, 1]))"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"array(['a', 'a', 'b', 'b', 'b', 'a', 'b', 'a', 'b', 'b', 'b', 'b', 'b', 'a'], \n",
" dtype='|S21')"
]
},
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"test[:, 4]"
]
},
{
"cell_type": "code",
"execution_count": 133,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"0"
]
},
"execution_count": 133,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"pick_attribute(test[:, :4], test[:, 4], [0, 1, 2, 3])"
]
},
{
"cell_type": "code",
"execution_count": 164,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"[<__main__.TreeNode at 0x7fdc8e94acd0>, <__main__.TreeNode at 0x7fdc8e94aa10>]"
]
},
"execution_count": 164,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"tree.children[0].children"
]
},
{
"cell_type": "code",
"execution_count": 150,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"a = np.array([-1, 3, 3, 3, 5, 4, 2, 1, -7])"
]
},
{
"cell_type": "code",
"execution_count": 132,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"a = np.empty((1, 5), dtype='|S16')"
]
},
{
"cell_type": "code",
"execution_count": 139,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"array([['a', 'b', 'c', 'd', 'e'],\n",
" ['a', 'b', 'c', 'd', 'e']], \n",
" dtype='|S16')"
]
},
"execution_count": 139,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"np.vstack((a, [['a', 'b', 'c', 'd', 'e'], ['a', 'b', 'c', 'd', 'e']]))[1:, :]"
]
},
{
"cell_type": "code",
"execution_count": 131,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"[1, 3, 4, 8]"
]
},
"execution_count": 131,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"a"
]
},
{
"cell_type": "code",
"execution_count": 117,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": []
}
],
"metadata": {
"celltoolbar": "Raw Cell Format",
"kernelspec": {
"display_name": "Python 2",
"language": "python",
"name": "python2"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.12"
}
},
"nbformat": 4,
"nbformat_minor": 1
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment