Skip to content

Instantly share code, notes, and snippets.

@alexland
Created May 30, 2014 21:41
Show Gist options
  • Save alexland/f361e7d2f1e4e696daad to your computer and use it in GitHub Desktop.
Save alexland/f361e7d2f1e4e696daad to your computer and use it in GitHub Desktop.
Display the source blob
Display the rendered blob
Raw
{
"metadata": {
"name": "",
"signature": "sha256:b36f80df3bf1f1bc15998bd0e858e404cad16214a1468d0cd028664e9545fdbe"
},
"nbformat": 3,
"nbformat_minor": 0,
"worksheets": [
{
"cells": [
{
"cell_type": "code",
"collapsed": false,
"input": [
"# imports, configs, etc.\n",
"\n",
"import os\n",
"import sys\n",
"import re\n",
"import csv as CSV\n",
"from copy import deepcopy\n",
"import collections as CL\n",
"import itertools as IT\n",
"\n",
"import warnings\n",
"warnings.filterwarnings(\"ignore\")\n",
"\n",
"import numpy as NP\n",
"from scipy import linalg as LA\n",
"from IPython.display import display\n",
"from sympy.interactive import printing\n",
"import sympy as SYM\n",
"from sympy import Matrix as MAT\n",
"from sympy.mpmath import *\n",
"printing.init_printing()\n",
"\n",
"from IPython.external import mathjax; mathjax.install_mathjax()\n",
"\n",
"%matplotlib inline\n",
"from matplotlib import pyplot as PLT\n",
"from matplotlib import cm as CM\n",
"from mpl_toolkits import axes_grid1 as AG\n",
"from mpl_toolkits.mplot3d import Axes3D as AX\n",
"\n",
"NP.set_printoptions(precision=3, suppress=True)\n",
"PLT.rcParams['figure.figsize'] = (8.0, 7.0)\n",
"\n",
"%config InlineBackend.figure_format = 'svg' \n",
"\n",
"my_font_config = {'family' : 'sans-serif',\n",
" 'color' : '#2A52BE',\n",
" 'weight' : 'normal',\n",
" 'size' : 14,\n",
" }\n",
"\n",
"from nltk.corpus import stopwords\n",
"\n",
"DATA_DIR = \"~/data\"\n",
"DATA_DIR = os.path.expanduser(DATA_DIR)\n",
"PROJ_DIR = os.path.join(DATA_DIR, \"mobile-apps\")\n",
"\n"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"/Users/dougybarbo/.ipython/nbextensions/mathjax/MathJax.js already exists\n"
]
}
],
"prompt_number": 94
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"print(len(stopwords.words('english')))"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"127\n"
]
}
],
"prompt_number": 95
},
{
"cell_type": "heading",
"level": 3,
"metadata": {},
"source": [
"I. Data Processing"
]
},
{
"cell_type": "heading",
"level": 4,
"metadata": {},
"source": [
"_In sum, the following data procesing workflow transforms the supplied raw data into data suitable for input to a machine learning algorithm. In particular, this workflow begins with a sequence of data instances, each is a raw \"bag of words\"; this sequence is transformed into a structured \"incidence matrix\" in which the data instances are represented by the rows. Each column represents an attribute or feature, which in this case is the presence or absence of a given term._"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"labels_file = os.path.join(PROJ_DIR, \"class_labels.txt\")\n",
"data_file = os.path.join(PROJ_DIR, \"data.txt\")"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 96
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"SW = stopwords.words('english')\n",
"\n",
"import re\n",
"ptn_nwc = \"[!#$%&'*+/=?`{|}~^.-]\"\n",
"ptn_nwc_obj = re.compile(ptn_nwc, re.MULTILINE)\n",
"\n",
"# open the two files\n",
"# read them in\n",
"# normalize: lower case the text \n",
"# remove end-of-line whitespace\n",
"# remove punctuation\n",
"# tokenize the lines\n",
"\n",
"with open(data_file, mode='r', encoding='utf-8') as fh:\n",
" d = ( ptn_nwc_obj.sub('', line.strip().lower()).split() \n",
" for line in fh.readlines() )\n",
"\n",
"with open(labels_file, mode='r', encoding='utf-8') as fh:\n",
" l = [ int(line.strip()) for line in fh.readlines() ] \n",
" \n",
"# remove 'stop words' (using the NLTK set) &\n",
"# remove words comprised of three letters or fewer\n",
"d = (filter(lambda v: (v not in SW) & (len(v) > 4), line) for line in d)\n",
"d = deepcopy([list(line) for line in d])\n",
"\n",
"# remove frequent terms common to all mobile apps:\n",
"# (generated by scraping the app summaries \n",
"# from AppData's 1000 most popular mobiles apps)\n",
"DOMAIN_STOP_WORDS = ['android', 'free', 'iphone', 'twitter', 'download', \n",
" 'feature', 'features', 'applications', 'application', \n",
" 'user', 'users', 'version', 'versions', 'facebook', \n",
" 'phone', 'available', 'using', 'information', 'provide',\n",
" 'include', 'every', 'device', 'mobile', 'friend',\n",
" 'different', 'please', 'simple', 'email', 'share', 'follow',\n",
" 'great', 'screen', 'provide', 'acces', 'first', 'sound', 'video',]\n",
" \n",
"d = (filter(lambda v: (v not in DOMAIN_STOP_WORDS), line) for line in d)\n",
"\n",
"# normalize: simple word stemming\n",
"def stem(word):\n",
" if word.endswith('s'):\n",
" return word[:-1]\n",
" else:\n",
" return word\n",
"\n",
"d = (list(map(stem, line)) for line in d)\n",
"\n",
"d1 = deepcopy([list(line) for line in d])\n",
"\n",
"lx = NP.array([len(line) for line in d1])\n",
"\n",
"# ~ 75 lines have 10 words or fewer\n",
"idx = lx > 10\n",
"sum(-idx)\n",
"\n",
"# so (temporarily) filter lines having 10 words or fewer &\n",
"# filter their corresponding class labels\n",
"\n",
"idx = idx.tolist()\n",
"d = IT.compress(d1, idx)\n",
"l = IT.compress(l, idx)"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 97
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# partition the data & class labels into class I and class 0\n",
"\n",
"d = deepcopy([list(line) for line in d])\n",
"l = deepcopy([line for line in l])\n",
"\n",
"assert len(d) == len(l)\n",
"\n",
"# shuffle both containers\n",
"idx = NP.random.permutation(NP.arange(len(d)))\n",
"d, l = NP.array(d), NP.array(l, dtype='int8')\n",
"d, l = d[idx], l[idx]\n",
"\n",
"idx1 = l==1\n",
"idx0 = l==0\n",
"d1, l1 = d[idx1], l[idx1]\n",
"d0, l0 = d[idx0], l[idx0] \n",
"\n",
"L = NP.array(l)\n",
"\n",
"assert d1.size == l1.size\n",
"assert d0.size == l0.size"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 98
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"q1 = NP.array([len(line) for line in d1])\n",
"q0 = NP.array([len(line) for line in d0])\n",
"\n",
"print(round(q0.mean(), 2))\n",
"print(round(q1.mean(), 2))"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"81.87\n",
"84.53\n"
]
}
],
"prompt_number": 99
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# look at the data by class\n",
"w1 = [ word for line in d1 for word in line ]\n",
"w0 = [ word for line in d0 for word in line ]"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 100
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"import collections as CL\n",
"\n",
"words1 = CL.defaultdict(int)\n",
"words0 = CL.defaultdict(int)\n",
"\n",
"for word in w1:\n",
" words1[word] += 1\n",
"\n",
"for word in w0:\n",
" words0[word] += 1"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 101
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"w1_freq = sorted(zip(words1.values(), words1.keys()), reverse=True)\n",
"w0_freq = sorted(zip(words0.values(), words0.keys()), reverse=True)"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 102
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"v1 = [t[1] for t in w1_freq[:100]]\n",
"v0 = [t[1] for t in w0_freq[:100]]\n",
"v1.extend(v0)"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 103
},
{
"cell_type": "heading",
"level": 3,
"metadata": {},
"source": [
"II. Constructing the Feature Vector"
]
},
{
"cell_type": "heading",
"level": 4,
"metadata": {},
"source": [
"from the 50 most common terms in each class"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"def build_feature_vector(data, feature_vector):\n",
" \"\"\"\n",
" returns: a structured 2D data array comprised of in which\n",
" each column encodes one discrete feature; each row\n",
" represents one data instance\n",
" pass in: \n",
" (i) the data: a nested list in which each list is one data instance,\n",
" or 'bag of words';\n",
" (ii) a template feature vector: a list of terms, comprising a subset\n",
" of the population whose frequency will be counted to to supply\n",
" the values comprising each feature vector \n",
" this fn transforms a sequence of word bags (each bag is a python list)\n",
" into a structured 1D NumPy array of features\n",
" \"\"\"\n",
" fv = set(feature_vector)\n",
" # maps each most-frequent term to an offset in feature vector\n",
" term_vector_lut = { t:i for i, t in enumerate(fv) }\n",
" # remove all words from each line not in the feature_vector\n",
" d = (filter(lambda q: q in fv, line) for line in data)\n",
" d = deepcopy([list(line) for line in d])\n",
" # initialize the empty 2D NumPy array returned \n",
" m, n = len(d), len(term_vector_lut)\n",
" D = NP.zeros((m, n))\n",
" dx = CL.defaultdict(int)\n",
" c = 0\n",
" for line in d:\n",
" new_row = NP.zeros(len(fv))\n",
" for w in line:\n",
" idx = term_vector_lut[w]\n",
" new_row[idx] += 1\n",
" D[c,:] = new_row\n",
" c += 1\n",
" return D"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 104
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"v1 = [t[1] for t in w1_freq[:50]]\n",
"v0 = [t[1] for t in w0_freq[:50]]\n",
"\n",
"v1.extend(v0)"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 105
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"D = build_feature_vector(d, v1)"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 106
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"%timeit build_feature_vector(d, v1)"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"10 loops, best of 3: 177 ms per loop\n"
]
}
],
"prompt_number": 107
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# are any attributes empty?\n",
"# (if so, remove this feature--no predictive value and will caluse \n",
"# division by 0 when i attempt to mean center the data\n",
"\n",
"feature_val_sum = D.sum(axis=0)\n",
"assert feature_val_sum.min() > 0"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 108
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"r1 = D[0,:]\n",
"v = D[:50,:12]\n",
"# print(v)\n",
"print(D.shape)"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"(2701, 86)\n"
]
}
],
"prompt_number": 109
},
{
"cell_type": "heading",
"level": 4,
"metadata": {},
"source": [
"save the structured data to disk"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"NP.unique(L)\n",
"idx1 = L==1\n",
"idx0 = L==0\n",
"\n",
"D1 = D[idx1,]\n",
"D0 = D[idx0,]\n",
"\n"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 110
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"dfs = os.path.join(PROJ_DIR, 'data_structured.csv')\n",
"\n",
"with open(dfs, 'w', encoding='utf-8') as fh:\n",
" writer = CSV.writer(fh, delimiter=',', quotechar='|', \n",
" quoting=CSV.QUOTE_MINIMAL)\n",
" writer.writerows(D.tolist())"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 111
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"def persist_structured_data(data, file_path):\n",
" \"\"\"\n",
" returns: nothing, creates a file called 'data_structured.csv'\n",
" in the file_path passed in\n",
" pass in: \n",
" (i) 2D NumPy array\n",
" (ii) unix absolute file path\n",
" \"\"\"\n",
" dfs = os.path.join(file_path, 'data_structured.csv')\n",
" with open(dfs, 'w', encoding='utf-8') as fh:\n",
" writer = CSV.writer(fh, delimiter=',', quotechar='|', quoting=CSV.QUOTE_MINIMAL)\n",
" writer.writerows(D.tolist())"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 112
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"persist_structured_data(D, PROJ_DIR)"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 113
},
{
"cell_type": "heading",
"level": 4,
"metadata": {},
"source": [
"some simple analysis of the data to assess the general suitability of this data set for use in building a classifier"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"import warnings\n",
"warnings.filterwarnings('ignore', r\"object.__format__ with a non-empty format string is deprecated\")"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 114
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"a1 = [ (w, c) for c, w in w1_freq[:25] ]\n",
"a0 = [ (w, c) for c, w in w0_freq[:25] ]\n",
"a10 = zip(a1, a0)\n",
"H1 = '{0} most frequent terms by class'.format(len(a1))\n",
"h2a = 'class I'\n",
"h2b = 'class 0'\n",
"ula, ulb = 15 * '_', 15 * '_'\n",
"print(\"{0:^50}\\n\".format(H1))\n",
"print(\"{0:^20}\\t{1:^30}\".format(h2a, h2b))\n",
"print(\"{0:30}\\t{1:35}\".format(ula, ulb))\n",
"\n",
"for itm in a10:\n",
" print(\"{0} {1:32} {2}\".format(' ', itm[0][0], itm[1][0]))"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
" 25 most frequent terms by class \n",
"\n",
" class I \t class 0 \n",
"_______________ \t_______________ \n",
" workout player\n",
" exercise world\n",
" training level\n",
" fitnes game\n",
" weight friend\n",
" muscle score\n",
" track wallpaper\n",
" calorie bible\n",
" health track\n",
" program sport\n",
" daily church\n",
" timer acces\n",
" level support\n",
" routine search\n",
" trainer point\n",
" running photo\n",
" personal football\n",
" distance right\n",
" heart music\n",
" minute favorite\n",
" support update\n",
" result league\n",
" progres speed\n",
" allow including\n",
" sport number\n"
]
}
],
"prompt_number": 115
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# how many terms appear in both classes?\n",
"c1_terms = { w[0] for w in a1 }\n",
"c0_terms = { w[0] for w in a0 }\n",
"\n",
"print(len(c1_terms & c0_terms))\n",
"print(\"terms in both classes: \\n\\n{0}\".format(c1_terms & c0_terms))"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"4\n",
"terms in both classes: \n",
"\n",
"{'support', 'level', 'sport', 'track'}\n"
]
}
],
"prompt_number": 116
},
{
"cell_type": "heading",
"level": 4,
"metadata": {},
"source": [
"frequencies of all terms in the feature vector by class"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"idx1 = L==1\n",
"idx1 = idx1.squeeze()\n",
"idx0 = L==0\n",
"idx0 = idx0.squeeze()\n",
"\n",
"d1 = D[idx1,]\n",
"d0 = D[idx0,]\n",
"s1 = d1.sum(axis=0)\n",
"s0 = d0.sum(axis=0)"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 117
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"fig = PLT.figure(figsize=(8, 6))\n",
"ttl = \"class I vs II frequencies across the Term Vector\"\n",
"ax = fig.add_subplot(111, xticks=[])\n",
"ax.plot(s0, color='#FF7E00', lw=0.7, ms=None)\n",
"ax.plot(s1, color='#2E5894', lw=0.7, ms=None)\n",
"fig.text(.5, .88, ttl, ha='center', va='top', color='#062A78', fontsize=12)\n",
"ax.grid(True)"
],
"language": "python",
"metadata": {},
"outputs": [
{
"metadata": {},
"output_type": "display_data",
"svg": [
"<?xml version=\"1.0\" encoding=\"utf-8\" standalone=\"no\"?>\n",
"<!DOCTYPE svg PUBLIC \"-//W3C//DTD SVG 1.1//EN\"\n",
" \"http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd\">\n",
"<!-- Created with matplotlib (http://matplotlib.org/) -->\n",
"<svg height=\"358pt\" version=\"1.1\" viewBox=\"0 0 488 358\" width=\"488pt\" xmlns=\"http://www.w3.org/2000/svg\" xmlns:xlink=\"http://www.w3.org/1999/xlink\">\n",
" <defs>\n",
" <style type=\"text/css\">\n",
"*{stroke-linecap:butt;stroke-linejoin:round;}\n",
" </style>\n",
" </defs>\n",
" <g id=\"figure_1\">\n",
" <g id=\"patch_1\">\n",
" <path d=\"\n",
"M0 358.878\n",
"L488.853 358.878\n",
"L488.853 0\n",
"L0 0\n",
"z\n",
"\" style=\"fill:#ffffff;\"/>\n",
" </g>\n",
" <g id=\"axes_1\">\n",
" <g id=\"patch_2\">\n",
" <path d=\"\n",
"M35.2531 346.839\n",
"L481.653 346.839\n",
"L481.653 12.0391\n",
"L35.2531 12.0391\n",
"z\n",
"\" style=\"fill:#ffffff;\"/>\n",
" </g>\n",
" <g id=\"line2d_1\">\n",
" <path clip-path=\"url(#p0bf5753ecd)\" d=\"\n",
"M35.2531 336.393\n",
"L40.5049 290.057\n",
"L45.7567 323.939\n",
"L51.0084 327.287\n",
"L56.2602 335.188\n",
"L61.5119 342.821\n",
"L66.7637 338.402\n",
"L72.0155 342.688\n",
"L77.2672 345.232\n",
"L82.519 330.635\n",
"L87.7708 339.875\n",
"L93.0225 326.215\n",
"L98.2743 345.366\n",
"L103.526 322.733\n",
"L108.778 312.689\n",
"L114.03 339.34\n",
"L119.281 323.001\n",
"L124.533 344.964\n",
"L129.785 320.859\n",
"L135.037 344.027\n",
"L140.288 332.644\n",
"L145.54 323.403\n",
"L150.792 320.457\n",
"L156.044 338.268\n",
"L161.295 314.966\n",
"L166.547 315.234\n",
"L171.799 323.403\n",
"L177.051 292.601\n",
"L182.303 346.303\n",
"L187.554 320.859\n",
"L192.806 318.046\n",
"L198.058 341.214\n",
"L203.31 310.949\n",
"L208.561 320.725\n",
"L213.813 320.457\n",
"L219.065 321.26\n",
"L224.317 324.608\n",
"L229.568 309.208\n",
"L234.82 320.323\n",
"L240.072 342.821\n",
"L245.324 311.484\n",
"L250.575 313.895\n",
"L255.827 338.804\n",
"L261.079 331.572\n",
"L266.331 322.6\n",
"L271.583 330.367\n",
"L276.834 281.084\n",
"L282.086 314.297\n",
"L287.338 309.341\n",
"L292.59 325.948\n",
"L297.841 330.501\n",
"L303.093 326.483\n",
"L308.345 335.188\n",
"L313.597 315.904\n",
"L318.848 342.42\n",
"L324.1 304.922\n",
"L329.352 336.929\n",
"L334.604 342.42\n",
"L339.855 313.761\n",
"L345.107 339.473\n",
"L350.359 326.483\n",
"L355.611 290.057\n",
"L360.863 338.67\n",
"L366.114 326.483\n",
"L371.366 322.733\n",
"L376.618 339.34\n",
"L381.87 322.064\n",
"L387.121 321.662\n",
"L392.373 324.207\n",
"L397.625 341.081\n",
"L402.877 295.949\n",
"L408.128 302.378\n",
"L413.38 328.492\n",
"L418.632 342.42\n",
"L423.884 295.28\n",
"L429.135 309.475\n",
"L434.387 323.135\n",
"L439.639 326.483\n",
"L444.891 321.796\n",
"L450.143 339.072\n",
"L455.394 344.964\n",
"L460.646 343.625\n",
"L465.898 326.215\n",
"L471.15 325.948\n",
"L476.401 321.26\n",
"L481.653 327.153\" style=\"fill:none;stroke:#ff7e00;stroke-linecap:square;stroke-width:0.7;\"/>\n",
" </g>\n",
" <g id=\"line2d_2\">\n",
" <path clip-path=\"url(#p0bf5753ecd)\" d=\"\n",
"M35.2531 310.413\n",
"L40.5049 299.164\n",
"L45.7567 331.572\n",
"L51.0084 308.136\n",
"L56.2602 318.448\n",
"L61.5119 321.662\n",
"L66.7637 301.708\n",
"L72.0155 206.223\n",
"L77.2672 312.288\n",
"L82.519 321.796\n",
"L87.7708 275.861\n",
"L93.0225 307.065\n",
"L98.2743 299.297\n",
"L103.526 327.153\n",
"L108.778 327.153\n",
"L114.03 307.199\n",
"L119.281 346.705\n",
"L124.533 300.905\n",
"L129.785 346.437\n",
"L135.037 318.984\n",
"L140.288 320.189\n",
"L145.54 218.008\n",
"L150.792 316.573\n",
"L156.044 302.512\n",
"L161.295 325.948\n",
"L166.547 310.011\n",
"L171.799 329.429\n",
"L177.051 344.027\n",
"L182.303 253.497\n",
"L187.554 334.518\n",
"L192.806 327.287\n",
"L198.058 270.906\n",
"L203.31 318.85\n",
"L208.561 322.867\n",
"L213.813 330.501\n",
"L219.065 326.081\n",
"L224.317 322.198\n",
"L229.568 257.514\n",
"L234.82 346.303\n",
"L240.072 39.8944\n",
"L245.324 306.261\n",
"L250.575 332.376\n",
"L255.827 321.662\n",
"L261.079 282.424\n",
"L266.331 310.011\n",
"L271.583 321.126\n",
"L276.834 340.679\n",
"L282.086 346.169\n",
"L287.338 309.341\n",
"L292.59 331.17\n",
"L297.841 322.198\n",
"L303.093 337.599\n",
"L308.345 304.386\n",
"L313.597 331.17\n",
"L318.848 316.707\n",
"L324.1 346.705\n",
"L329.352 198.857\n",
"L334.604 321.26\n",
"L339.855 326.081\n",
"L345.107 304.922\n",
"L350.359 322.867\n",
"L355.611 327.956\n",
"L360.863 104.444\n",
"L366.114 330.635\n",
"L371.366 333.045\n",
"L376.618 275.326\n",
"L381.87 336.259\n",
"L387.121 331.572\n",
"L392.373 337.197\n",
"L397.625 288.182\n",
"L402.877 339.072\n",
"L408.128 342.42\n",
"L413.38 312.422\n",
"L418.632 319.385\n",
"L423.884 313.493\n",
"L429.135 346.839\n",
"L434.387 326.617\n",
"L439.639 319.252\n",
"L444.891 326.885\n",
"L450.143 305.19\n",
"L455.394 316.707\n",
"L460.646 313.895\n",
"L465.898 332.242\n",
"L471.15 334.652\n",
"L476.401 319.252\n",
"L481.653 315.368\" style=\"fill:none;stroke:#2e5894;stroke-linecap:square;stroke-width:0.7;\"/>\n",
" </g>\n",
" <g id=\"matplotlib.axis_1\"/>\n",
" <g id=\"matplotlib.axis_2\">\n",
" <g id=\"ytick_1\">\n",
" <g id=\"line2d_3\">\n",
" <path clip-path=\"url(#p0bf5753ecd)\" d=\"\n",
"M35.2531 346.839\n",
"L481.653 346.839\" style=\"fill:none;stroke:#000000;stroke-dasharray:1.000000,3.000000;stroke-dashoffset:0.0;stroke-width:0.5;\"/>\n",
" </g>\n",
" <g id=\"line2d_4\">\n",
" <defs>\n",
" <path d=\"\n",
"M0 0\n",
"L4 0\" id=\"md7965d1ba0\" style=\"stroke:#000000;stroke-width:0.5;\"/>\n",
" </defs>\n",
" <g>\n",
" <use style=\"stroke:#000000;stroke-width:0.5;\" x=\"35.253125\" xlink:href=\"#md7965d1ba0\" y=\"346.8390625\"/>\n",
" </g>\n",
" </g>\n",
" <g id=\"line2d_5\">\n",
" <defs>\n",
" <path d=\"\n",
"M0 0\n",
"L-4 0\" id=\"md9a1c1a7cd\" style=\"stroke:#000000;stroke-width:0.5;\"/>\n",
" </defs>\n",
" <g>\n",
" <use style=\"stroke:#000000;stroke-width:0.5;\" x=\"481.653125\" xlink:href=\"#md9a1c1a7cd\" y=\"346.8390625\"/>\n",
" </g>\n",
" </g>\n",
" <g id=\"text_1\">\n",
" <!-- 0 -->\n",
" <defs>\n",
" <path d=\"\n",
"M31.7812 66.4062\n",
"Q24.1719 66.4062 20.3281 58.9062\n",
"Q16.5 51.4219 16.5 36.375\n",
"Q16.5 21.3906 20.3281 13.8906\n",
"Q24.1719 6.39062 31.7812 6.39062\n",
"Q39.4531 6.39062 43.2812 13.8906\n",
"Q47.125 21.3906 47.125 36.375\n",
"Q47.125 51.4219 43.2812 58.9062\n",
"Q39.4531 66.4062 31.7812 66.4062\n",
"M31.7812 74.2188\n",
"Q44.0469 74.2188 50.5156 64.5156\n",
"Q56.9844 54.8281 56.9844 36.375\n",
"Q56.9844 17.9688 50.5156 8.26562\n",
"Q44.0469 -1.42188 31.7812 -1.42188\n",
"Q19.5312 -1.42188 13.0625 8.26562\n",
"Q6.59375 17.9688 6.59375 36.375\n",
"Q6.59375 54.8281 13.0625 64.5156\n",
"Q19.5312 74.2188 31.7812 74.2188\" id=\"BitstreamVeraSans-Roman-30\"/>\n",
" </defs>\n",
" <g transform=\"translate(26.2140625 349.5984375)scale(0.1 -0.1)\">\n",
" <use xlink:href=\"#BitstreamVeraSans-Roman-30\"/>\n",
" </g>\n",
" </g>\n",
" </g>\n",
" <g id=\"ytick_2\">\n",
" <g id=\"line2d_6\">\n",
" <path clip-path=\"url(#p0bf5753ecd)\" d=\"\n",
"M35.2531 279.879\n",
"L481.653 279.879\" style=\"fill:none;stroke:#000000;stroke-dasharray:1.000000,3.000000;stroke-dashoffset:0.0;stroke-width:0.5;\"/>\n",
" </g>\n",
" <g id=\"line2d_7\">\n",
" <g>\n",
" <use style=\"stroke:#000000;stroke-width:0.5;\" x=\"35.253125\" xlink:href=\"#md7965d1ba0\" y=\"279.8790625\"/>\n",
" </g>\n",
" </g>\n",
" <g id=\"line2d_8\">\n",
" <g>\n",
" <use style=\"stroke:#000000;stroke-width:0.5;\" x=\"481.653125\" xlink:href=\"#md9a1c1a7cd\" y=\"279.8790625\"/>\n",
" </g>\n",
" </g>\n",
" <g id=\"text_2\">\n",
" <!-- 500 -->\n",
" <defs>\n",
" <path d=\"\n",
"M10.7969 72.9062\n",
"L49.5156 72.9062\n",
"L49.5156 64.5938\n",
"L19.8281 64.5938\n",
"L19.8281 46.7344\n",
"Q21.9688 47.4688 24.1094 47.8281\n",
"Q26.2656 48.1875 28.4219 48.1875\n",
"Q40.625 48.1875 47.75 41.5\n",
"Q54.8906 34.8125 54.8906 23.3906\n",
"Q54.8906 11.625 47.5625 5.09375\n",
"Q40.2344 -1.42188 26.9062 -1.42188\n",
"Q22.3125 -1.42188 17.5469 -0.640625\n",
"Q12.7969 0.140625 7.71875 1.70312\n",
"L7.71875 11.625\n",
"Q12.1094 9.23438 16.7969 8.0625\n",
"Q21.4844 6.89062 26.7031 6.89062\n",
"Q35.1562 6.89062 40.0781 11.3281\n",
"Q45.0156 15.7656 45.0156 23.3906\n",
"Q45.0156 31 40.0781 35.4375\n",
"Q35.1562 39.8906 26.7031 39.8906\n",
"Q22.75 39.8906 18.8125 39.0156\n",
"Q14.8906 38.1406 10.7969 36.2812\n",
"z\n",
"\" id=\"BitstreamVeraSans-Roman-35\"/>\n",
" </defs>\n",
" <g transform=\"translate(13.6015625 282.6384375)scale(0.1 -0.1)\">\n",
" <use xlink:href=\"#BitstreamVeraSans-Roman-35\"/>\n",
" <use x=\"63.623046875\" xlink:href=\"#BitstreamVeraSans-Roman-30\"/>\n",
" <use x=\"127.24609375\" xlink:href=\"#BitstreamVeraSans-Roman-30\"/>\n",
" </g>\n",
" </g>\n",
" </g>\n",
" <g id=\"ytick_3\">\n",
" <g id=\"line2d_9\">\n",
" <path clip-path=\"url(#p0bf5753ecd)\" d=\"\n",
"M35.2531 212.919\n",
"L481.653 212.919\" style=\"fill:none;stroke:#000000;stroke-dasharray:1.000000,3.000000;stroke-dashoffset:0.0;stroke-width:0.5;\"/>\n",
" </g>\n",
" <g id=\"line2d_10\">\n",
" <g>\n",
" <use style=\"stroke:#000000;stroke-width:0.5;\" x=\"35.253125\" xlink:href=\"#md7965d1ba0\" y=\"212.9190625\"/>\n",
" </g>\n",
" </g>\n",
" <g id=\"line2d_11\">\n",
" <g>\n",
" <use style=\"stroke:#000000;stroke-width:0.5;\" x=\"481.653125\" xlink:href=\"#md9a1c1a7cd\" y=\"212.9190625\"/>\n",
" </g>\n",
" </g>\n",
" <g id=\"text_3\">\n",
" <!-- 1000 -->\n",
" <defs>\n",
" <path d=\"\n",
"M12.4062 8.29688\n",
"L28.5156 8.29688\n",
"L28.5156 63.9219\n",
"L10.9844 60.4062\n",
"L10.9844 69.3906\n",
"L28.4219 72.9062\n",
"L38.2812 72.9062\n",
"L38.2812 8.29688\n",
"L54.3906 8.29688\n",
"L54.3906 0\n",
"L12.4062 0\n",
"z\n",
"\" id=\"BitstreamVeraSans-Roman-31\"/>\n",
" </defs>\n",
" <g transform=\"translate(7.565625 215.6784375)scale(0.1 -0.1)\">\n",
" <use xlink:href=\"#BitstreamVeraSans-Roman-31\"/>\n",
" <use x=\"63.623046875\" xlink:href=\"#BitstreamVeraSans-Roman-30\"/>\n",
" <use x=\"127.24609375\" xlink:href=\"#BitstreamVeraSans-Roman-30\"/>\n",
" <use x=\"190.869140625\" xlink:href=\"#BitstreamVeraSans-Roman-30\"/>\n",
" </g>\n",
" </g>\n",
" </g>\n",
" <g id=\"ytick_4\">\n",
" <g id=\"line2d_12\">\n",
" <path clip-path=\"url(#p0bf5753ecd)\" d=\"\n",
"M35.2531 145.959\n",
"L481.653 145.959\" style=\"fill:none;stroke:#000000;stroke-dasharray:1.000000,3.000000;stroke-dashoffset:0.0;stroke-width:0.5;\"/>\n",
" </g>\n",
" <g id=\"line2d_13\">\n",
" <g>\n",
" <use style=\"stroke:#000000;stroke-width:0.5;\" x=\"35.253125\" xlink:href=\"#md7965d1ba0\" y=\"145.9590625\"/>\n",
" </g>\n",
" </g>\n",
" <g id=\"line2d_14\">\n",
" <g>\n",
" <use style=\"stroke:#000000;stroke-width:0.5;\" x=\"481.653125\" xlink:href=\"#md9a1c1a7cd\" y=\"145.9590625\"/>\n",
" </g>\n",
" </g>\n",
" <g id=\"text_4\">\n",
" <!-- 1500 -->\n",
" <g transform=\"translate(7.565625 148.7184375)scale(0.1 -0.1)\">\n",
" <use xlink:href=\"#BitstreamVeraSans-Roman-31\"/>\n",
" <use x=\"63.623046875\" xlink:href=\"#BitstreamVeraSans-Roman-35\"/>\n",
" <use x=\"127.24609375\" xlink:href=\"#BitstreamVeraSans-Roman-30\"/>\n",
" <use x=\"190.869140625\" xlink:href=\"#BitstreamVeraSans-Roman-30\"/>\n",
" </g>\n",
" </g>\n",
" </g>\n",
" <g id=\"ytick_5\">\n",
" <g id=\"line2d_15\">\n",
" <path clip-path=\"url(#p0bf5753ecd)\" d=\"\n",
"M35.2531 78.9991\n",
"L481.653 78.9991\" style=\"fill:none;stroke:#000000;stroke-dasharray:1.000000,3.000000;stroke-dashoffset:0.0;stroke-width:0.5;\"/>\n",
" </g>\n",
" <g id=\"line2d_16\">\n",
" <g>\n",
" <use style=\"stroke:#000000;stroke-width:0.5;\" x=\"35.253125\" xlink:href=\"#md7965d1ba0\" y=\"78.9990625\"/>\n",
" </g>\n",
" </g>\n",
" <g id=\"line2d_17\">\n",
" <g>\n",
" <use style=\"stroke:#000000;stroke-width:0.5;\" x=\"481.653125\" xlink:href=\"#md9a1c1a7cd\" y=\"78.9990625\"/>\n",
" </g>\n",
" </g>\n",
" <g id=\"text_5\">\n",
" <!-- 2000 -->\n",
" <defs>\n",
" <path d=\"\n",
"M19.1875 8.29688\n",
"L53.6094 8.29688\n",
"L53.6094 0\n",
"L7.32812 0\n",
"L7.32812 8.29688\n",
"Q12.9375 14.1094 22.625 23.8906\n",
"Q32.3281 33.6875 34.8125 36.5312\n",
"Q39.5469 41.8438 41.4219 45.5312\n",
"Q43.3125 49.2188 43.3125 52.7812\n",
"Q43.3125 58.5938 39.2344 62.25\n",
"Q35.1562 65.9219 28.6094 65.9219\n",
"Q23.9688 65.9219 18.8125 64.3125\n",
"Q13.6719 62.7031 7.8125 59.4219\n",
"L7.8125 69.3906\n",
"Q13.7656 71.7812 18.9375 73\n",
"Q24.125 74.2188 28.4219 74.2188\n",
"Q39.75 74.2188 46.4844 68.5469\n",
"Q53.2188 62.8906 53.2188 53.4219\n",
"Q53.2188 48.9219 51.5312 44.8906\n",
"Q49.8594 40.875 45.4062 35.4062\n",
"Q44.1875 33.9844 37.6406 27.2188\n",
"Q31.1094 20.4531 19.1875 8.29688\" id=\"BitstreamVeraSans-Roman-32\"/>\n",
" </defs>\n",
" <g transform=\"translate(7.2 81.7584375)scale(0.1 -0.1)\">\n",
" <use xlink:href=\"#BitstreamVeraSans-Roman-32\"/>\n",
" <use x=\"63.623046875\" xlink:href=\"#BitstreamVeraSans-Roman-30\"/>\n",
" <use x=\"127.24609375\" xlink:href=\"#BitstreamVeraSans-Roman-30\"/>\n",
" <use x=\"190.869140625\" xlink:href=\"#BitstreamVeraSans-Roman-30\"/>\n",
" </g>\n",
" </g>\n",
" </g>\n",
" <g id=\"ytick_6\">\n",
" <g id=\"line2d_18\">\n",
" <path clip-path=\"url(#p0bf5753ecd)\" d=\"\n",
"M35.2531 12.0391\n",
"L481.653 12.0391\" style=\"fill:none;stroke:#000000;stroke-dasharray:1.000000,3.000000;stroke-dashoffset:0.0;stroke-width:0.5;\"/>\n",
" </g>\n",
" <g id=\"line2d_19\">\n",
" <g>\n",
" <use style=\"stroke:#000000;stroke-width:0.5;\" x=\"35.253125\" xlink:href=\"#md7965d1ba0\" y=\"12.0390625\"/>\n",
" </g>\n",
" </g>\n",
" <g id=\"line2d_20\">\n",
" <g>\n",
" <use style=\"stroke:#000000;stroke-width:0.5;\" x=\"481.653125\" xlink:href=\"#md9a1c1a7cd\" y=\"12.0390625\"/>\n",
" </g>\n",
" </g>\n",
" <g id=\"text_6\">\n",
" <!-- 2500 -->\n",
" <g transform=\"translate(7.2 14.7984375)scale(0.1 -0.1)\">\n",
" <use xlink:href=\"#BitstreamVeraSans-Roman-32\"/>\n",
" <use x=\"63.623046875\" xlink:href=\"#BitstreamVeraSans-Roman-35\"/>\n",
" <use x=\"127.24609375\" xlink:href=\"#BitstreamVeraSans-Roman-30\"/>\n",
" <use x=\"190.869140625\" xlink:href=\"#BitstreamVeraSans-Roman-30\"/>\n",
" </g>\n",
" </g>\n",
" </g>\n",
" </g>\n",
" <g id=\"patch_3\">\n",
" <path d=\"\n",
"M35.2531 346.839\n",
"L35.2531 12.0391\" style=\"fill:none;stroke:#000000;\"/>\n",
" </g>\n",
" <g id=\"patch_4\">\n",
" <path d=\"\n",
"M481.653 346.839\n",
"L481.653 12.0391\" style=\"fill:none;stroke:#000000;\"/>\n",
" </g>\n",
" <g id=\"patch_5\">\n",
" <path d=\"\n",
"M35.2531 346.839\n",
"L481.653 346.839\" style=\"fill:none;stroke:#000000;\"/>\n",
" </g>\n",
" <g id=\"patch_6\">\n",
" <path d=\"\n",
"M35.2531 12.0391\n",
"L481.653 12.0391\" style=\"fill:none;stroke:#000000;\"/>\n",
" </g>\n",
" </g>\n",
" <g id=\"text_7\">\n",
" <!-- class I vs II frequencies across the Term Vector -->\n",
" <defs>\n",
" <path d=\"\n",
"M9.42188 75.9844\n",
"L18.4062 75.9844\n",
"L18.4062 0\n",
"L9.42188 0\n",
"z\n",
"\" id=\"BitstreamVeraSans-Roman-6c\"/>\n",
" <path d=\"\n",
"M34.2812 27.4844\n",
"Q23.3906 27.4844 19.1875 25\n",
"Q14.9844 22.5156 14.9844 16.5\n",
"Q14.9844 11.7188 18.1406 8.90625\n",
"Q21.2969 6.10938 26.7031 6.10938\n",
"Q34.1875 6.10938 38.7031 11.4062\n",
"Q43.2188 16.7031 43.2188 25.4844\n",
"L43.2188 27.4844\n",
"z\n",
"\n",
"M52.2031 31.2031\n",
"L52.2031 0\n",
"L43.2188 0\n",
"L43.2188 8.29688\n",
"Q40.1406 3.32812 35.5469 0.953125\n",
"Q30.9531 -1.42188 24.3125 -1.42188\n",
"Q15.9219 -1.42188 10.9531 3.29688\n",
"Q6 8.01562 6 15.9219\n",
"Q6 25.1406 12.1719 29.8281\n",
"Q18.3594 34.5156 30.6094 34.5156\n",
"L43.2188 34.5156\n",
"L43.2188 35.4062\n",
"Q43.2188 41.6094 39.1406 45\n",
"Q35.0625 48.3906 27.6875 48.3906\n",
"Q23 48.3906 18.5469 47.2656\n",
"Q14.1094 46.1406 10.0156 43.8906\n",
"L10.0156 52.2031\n",
"Q14.9375 54.1094 19.5781 55.0469\n",
"Q24.2188 56 28.6094 56\n",
"Q40.4844 56 46.3438 49.8438\n",
"Q52.2031 43.7031 52.2031 31.2031\" id=\"BitstreamVeraSans-Roman-61\"/>\n",
" <path id=\"BitstreamVeraSans-Roman-20\"/>\n",
" <path d=\"\n",
"M52 44.1875\n",
"Q55.375 50.25 60.0625 53.125\n",
"Q64.75 56 71.0938 56\n",
"Q79.6406 56 84.2812 50.0156\n",
"Q88.9219 44.0469 88.9219 33.0156\n",
"L88.9219 0\n",
"L79.8906 0\n",
"L79.8906 32.7188\n",
"Q79.8906 40.5781 77.0938 44.375\n",
"Q74.3125 48.1875 68.6094 48.1875\n",
"Q61.625 48.1875 57.5625 43.5469\n",
"Q53.5156 38.9219 53.5156 30.9062\n",
"L53.5156 0\n",
"L44.4844 0\n",
"L44.4844 32.7188\n",
"Q44.4844 40.625 41.7031 44.4062\n",
"Q38.9219 48.1875 33.1094 48.1875\n",
"Q26.2188 48.1875 22.1562 43.5312\n",
"Q18.1094 38.875 18.1094 30.9062\n",
"L18.1094 0\n",
"L9.07812 0\n",
"L9.07812 54.6875\n",
"L18.1094 54.6875\n",
"L18.1094 46.1875\n",
"Q21.1875 51.2188 25.4844 53.6094\n",
"Q29.7812 56 35.6875 56\n",
"Q41.6562 56 45.8281 52.9688\n",
"Q50 49.9531 52 44.1875\" id=\"BitstreamVeraSans-Roman-6d\"/>\n",
" <path d=\"\n",
"M8.5 21.5781\n",
"L8.5 54.6875\n",
"L17.4844 54.6875\n",
"L17.4844 21.9219\n",
"Q17.4844 14.1562 20.5 10.2656\n",
"Q23.5312 6.39062 29.5938 6.39062\n",
"Q36.8594 6.39062 41.0781 11.0312\n",
"Q45.3125 15.6719 45.3125 23.6875\n",
"L45.3125 54.6875\n",
"L54.2969 54.6875\n",
"L54.2969 0\n",
"L45.3125 0\n",
"L45.3125 8.40625\n",
"Q42.0469 3.42188 37.7188 1\n",
"Q33.4062 -1.42188 27.6875 -1.42188\n",
"Q18.2656 -1.42188 13.375 4.4375\n",
"Q8.5 10.2969 8.5 21.5781\" id=\"BitstreamVeraSans-Roman-75\"/>\n",
" <path d=\"\n",
"M56.2031 29.5938\n",
"L56.2031 25.2031\n",
"L14.8906 25.2031\n",
"Q15.4844 15.9219 20.4844 11.0625\n",
"Q25.4844 6.20312 34.4219 6.20312\n",
"Q39.5938 6.20312 44.4531 7.46875\n",
"Q49.3125 8.73438 54.1094 11.2812\n",
"L54.1094 2.78125\n",
"Q49.2656 0.734375 44.1875 -0.34375\n",
"Q39.1094 -1.42188 33.8906 -1.42188\n",
"Q20.7969 -1.42188 13.1562 6.1875\n",
"Q5.51562 13.8125 5.51562 26.8125\n",
"Q5.51562 40.2344 12.7656 48.1094\n",
"Q20.0156 56 32.3281 56\n",
"Q43.3594 56 49.7812 48.8906\n",
"Q56.2031 41.7969 56.2031 29.5938\n",
"M47.2188 32.2344\n",
"Q47.125 39.5938 43.0938 43.9844\n",
"Q39.0625 48.3906 32.4219 48.3906\n",
"Q24.9062 48.3906 20.3906 44.1406\n",
"Q15.875 39.8906 15.1875 32.1719\n",
"z\n",
"\" id=\"BitstreamVeraSans-Roman-65\"/>\n",
" <path d=\"\n",
"M48.7812 52.5938\n",
"L48.7812 44.1875\n",
"Q44.9688 46.2969 41.1406 47.3438\n",
"Q37.3125 48.3906 33.4062 48.3906\n",
"Q24.6562 48.3906 19.8125 42.8438\n",
"Q14.9844 37.3125 14.9844 27.2969\n",
"Q14.9844 17.2812 19.8125 11.7344\n",
"Q24.6562 6.20312 33.4062 6.20312\n",
"Q37.3125 6.20312 41.1406 7.25\n",
"Q44.9688 8.29688 48.7812 10.4062\n",
"L48.7812 2.09375\n",
"Q45.0156 0.34375 40.9844 -0.53125\n",
"Q36.9688 -1.42188 32.4219 -1.42188\n",
"Q20.0625 -1.42188 12.7812 6.34375\n",
"Q5.51562 14.1094 5.51562 27.2969\n",
"Q5.51562 40.6719 12.8594 48.3281\n",
"Q20.2188 56 33.0156 56\n",
"Q37.1562 56 41.1094 55.1406\n",
"Q45.0625 54.2969 48.7812 52.5938\" id=\"BitstreamVeraSans-Roman-63\"/>\n",
" <path d=\"\n",
"M9.8125 72.9062\n",
"L19.6719 72.9062\n",
"L19.6719 0\n",
"L9.8125 0\n",
"z\n",
"\" id=\"BitstreamVeraSans-Roman-49\"/>\n",
" <path d=\"\n",
"M2.98438 54.6875\n",
"L12.5 54.6875\n",
"L29.5938 8.79688\n",
"L46.6875 54.6875\n",
"L56.2031 54.6875\n",
"L35.6875 0\n",
"L23.4844 0\n",
"z\n",
"\" id=\"BitstreamVeraSans-Roman-76\"/>\n",
" <path d=\"\n",
"M37.1094 75.9844\n",
"L37.1094 68.5\n",
"L28.5156 68.5\n",
"Q23.6875 68.5 21.7969 66.5469\n",
"Q19.9219 64.5938 19.9219 59.5156\n",
"L19.9219 54.6875\n",
"L34.7188 54.6875\n",
"L34.7188 47.7031\n",
"L19.9219 47.7031\n",
"L19.9219 0\n",
"L10.8906 0\n",
"L10.8906 47.7031\n",
"L2.29688 47.7031\n",
"L2.29688 54.6875\n",
"L10.8906 54.6875\n",
"L10.8906 58.5\n",
"Q10.8906 67.625 15.1406 71.7969\n",
"Q19.3906 75.9844 28.6094 75.9844\n",
"z\n",
"\" id=\"BitstreamVeraSans-Roman-66\"/>\n",
" <path d=\"\n",
"M-0.296875 72.9062\n",
"L61.375 72.9062\n",
"L61.375 64.5938\n",
"L35.5 64.5938\n",
"L35.5 0\n",
"L25.5938 0\n",
"L25.5938 64.5938\n",
"L-0.296875 64.5938\n",
"z\n",
"\" id=\"BitstreamVeraSans-Roman-54\"/>\n",
" <path d=\"\n",
"M18.3125 70.2188\n",
"L18.3125 54.6875\n",
"L36.8125 54.6875\n",
"L36.8125 47.7031\n",
"L18.3125 47.7031\n",
"L18.3125 18.0156\n",
"Q18.3125 11.3281 20.1406 9.42188\n",
"Q21.9688 7.51562 27.5938 7.51562\n",
"L36.8125 7.51562\n",
"L36.8125 0\n",
"L27.5938 0\n",
"Q17.1875 0 13.2344 3.875\n",
"Q9.28125 7.76562 9.28125 18.0156\n",
"L9.28125 47.7031\n",
"L2.6875 47.7031\n",
"L2.6875 54.6875\n",
"L9.28125 54.6875\n",
"L9.28125 70.2188\n",
"z\n",
"\" id=\"BitstreamVeraSans-Roman-74\"/>\n",
" <path d=\"\n",
"M54.8906 33.0156\n",
"L54.8906 0\n",
"L45.9062 0\n",
"L45.9062 32.7188\n",
"Q45.9062 40.4844 42.875 44.3281\n",
"Q39.8438 48.1875 33.7969 48.1875\n",
"Q26.5156 48.1875 22.3125 43.5469\n",
"Q18.1094 38.9219 18.1094 30.9062\n",
"L18.1094 0\n",
"L9.07812 0\n",
"L9.07812 54.6875\n",
"L18.1094 54.6875\n",
"L18.1094 46.1875\n",
"Q21.3438 51.125 25.7031 53.5625\n",
"Q30.0781 56 35.7969 56\n",
"Q45.2188 56 50.0469 50.1719\n",
"Q54.8906 44.3438 54.8906 33.0156\" id=\"BitstreamVeraSans-Roman-6e\"/>\n",
" <path d=\"\n",
"M41.1094 46.2969\n",
"Q39.5938 47.1719 37.8125 47.5781\n",
"Q36.0312 48 33.8906 48\n",
"Q26.2656 48 22.1875 43.0469\n",
"Q18.1094 38.0938 18.1094 28.8125\n",
"L18.1094 0\n",
"L9.07812 0\n",
"L9.07812 54.6875\n",
"L18.1094 54.6875\n",
"L18.1094 46.1875\n",
"Q20.9531 51.1719 25.4844 53.5781\n",
"Q30.0312 56 36.5312 56\n",
"Q37.4531 56 38.5781 55.875\n",
"Q39.7031 55.7656 41.0625 55.5156\n",
"z\n",
"\" id=\"BitstreamVeraSans-Roman-72\"/>\n",
" <path d=\"\n",
"M54.8906 33.0156\n",
"L54.8906 0\n",
"L45.9062 0\n",
"L45.9062 32.7188\n",
"Q45.9062 40.4844 42.875 44.3281\n",
"Q39.8438 48.1875 33.7969 48.1875\n",
"Q26.5156 48.1875 22.3125 43.5469\n",
"Q18.1094 38.9219 18.1094 30.9062\n",
"L18.1094 0\n",
"L9.07812 0\n",
"L9.07812 75.9844\n",
"L18.1094 75.9844\n",
"L18.1094 46.1875\n",
"Q21.3438 51.125 25.7031 53.5625\n",
"Q30.0781 56 35.7969 56\n",
"Q45.2188 56 50.0469 50.1719\n",
"Q54.8906 44.3438 54.8906 33.0156\" id=\"BitstreamVeraSans-Roman-68\"/>\n",
" <path d=\"\n",
"M28.6094 0\n",
"L0.78125 72.9062\n",
"L11.0781 72.9062\n",
"L34.1875 11.5312\n",
"L57.3281 72.9062\n",
"L67.5781 72.9062\n",
"L39.7969 0\n",
"z\n",
"\" id=\"BitstreamVeraSans-Roman-56\"/>\n",
" <path d=\"\n",
"M14.7969 27.2969\n",
"Q14.7969 17.3906 18.875 11.75\n",
"Q22.9531 6.10938 30.0781 6.10938\n",
"Q37.2031 6.10938 41.2969 11.75\n",
"Q45.4062 17.3906 45.4062 27.2969\n",
"Q45.4062 37.2031 41.2969 42.8438\n",
"Q37.2031 48.4844 30.0781 48.4844\n",
"Q22.9531 48.4844 18.875 42.8438\n",
"Q14.7969 37.2031 14.7969 27.2969\n",
"M45.4062 8.20312\n",
"Q42.5781 3.32812 38.25 0.953125\n",
"Q33.9375 -1.42188 27.875 -1.42188\n",
"Q17.9688 -1.42188 11.7344 6.48438\n",
"Q5.51562 14.4062 5.51562 27.2969\n",
"Q5.51562 40.1875 11.7344 48.0938\n",
"Q17.9688 56 27.875 56\n",
"Q33.9375 56 38.25 53.625\n",
"Q42.5781 51.2656 45.4062 46.3906\n",
"L45.4062 54.6875\n",
"L54.3906 54.6875\n",
"L54.3906 -20.7969\n",
"L45.4062 -20.7969\n",
"z\n",
"\" id=\"BitstreamVeraSans-Roman-71\"/>\n",
" <path d=\"\n",
"M30.6094 48.3906\n",
"Q23.3906 48.3906 19.1875 42.75\n",
"Q14.9844 37.1094 14.9844 27.2969\n",
"Q14.9844 17.4844 19.1562 11.8438\n",
"Q23.3438 6.20312 30.6094 6.20312\n",
"Q37.7969 6.20312 41.9844 11.8594\n",
"Q46.1875 17.5312 46.1875 27.2969\n",
"Q46.1875 37.0156 41.9844 42.7031\n",
"Q37.7969 48.3906 30.6094 48.3906\n",
"M30.6094 56\n",
"Q42.3281 56 49.0156 48.375\n",
"Q55.7188 40.7656 55.7188 27.2969\n",
"Q55.7188 13.875 49.0156 6.21875\n",
"Q42.3281 -1.42188 30.6094 -1.42188\n",
"Q18.8438 -1.42188 12.1719 6.21875\n",
"Q5.51562 13.875 5.51562 27.2969\n",
"Q5.51562 40.7656 12.1719 48.375\n",
"Q18.8438 56 30.6094 56\" id=\"BitstreamVeraSans-Roman-6f\"/>\n",
" <path d=\"\n",
"M9.42188 54.6875\n",
"L18.4062 54.6875\n",
"L18.4062 0\n",
"L9.42188 0\n",
"z\n",
"\n",
"M9.42188 75.9844\n",
"L18.4062 75.9844\n",
"L18.4062 64.5938\n",
"L9.42188 64.5938\n",
"z\n",
"\" id=\"BitstreamVeraSans-Roman-69\"/>\n",
" <path d=\"\n",
"M44.2812 53.0781\n",
"L44.2812 44.5781\n",
"Q40.4844 46.5312 36.375 47.5\n",
"Q32.2812 48.4844 27.875 48.4844\n",
"Q21.1875 48.4844 17.8438 46.4375\n",
"Q14.5 44.3906 14.5 40.2812\n",
"Q14.5 37.1562 16.8906 35.375\n",
"Q19.2812 33.5938 26.5156 31.9844\n",
"L29.5938 31.2969\n",
"Q39.1562 29.25 43.1875 25.5156\n",
"Q47.2188 21.7812 47.2188 15.0938\n",
"Q47.2188 7.46875 41.1875 3.01562\n",
"Q35.1562 -1.42188 24.6094 -1.42188\n",
"Q20.2188 -1.42188 15.4531 -0.5625\n",
"Q10.6875 0.296875 5.42188 2\n",
"L5.42188 11.2812\n",
"Q10.4062 8.6875 15.2344 7.39062\n",
"Q20.0625 6.10938 24.8125 6.10938\n",
"Q31.1562 6.10938 34.5625 8.28125\n",
"Q37.9844 10.4531 37.9844 14.4062\n",
"Q37.9844 18.0625 35.5156 20.0156\n",
"Q33.0625 21.9688 24.7031 23.7812\n",
"L21.5781 24.5156\n",
"Q13.2344 26.2656 9.51562 29.9062\n",
"Q5.8125 33.5469 5.8125 39.8906\n",
"Q5.8125 47.6094 11.2812 51.7969\n",
"Q16.75 56 26.8125 56\n",
"Q31.7812 56 36.1719 55.2656\n",
"Q40.5781 54.5469 44.2812 53.0781\" id=\"BitstreamVeraSans-Roman-73\"/>\n",
" </defs>\n",
" <g style=\"fill:#062a78;\" transform=\"translate(111.3528125 29.7971875)scale(0.12 -0.12)\">\n",
" <use xlink:href=\"#BitstreamVeraSans-Roman-63\"/>\n",
" <use x=\"54.98046875\" xlink:href=\"#BitstreamVeraSans-Roman-6c\"/>\n",
" <use x=\"82.763671875\" xlink:href=\"#BitstreamVeraSans-Roman-61\"/>\n",
" <use x=\"144.04296875\" xlink:href=\"#BitstreamVeraSans-Roman-73\"/>\n",
" <use x=\"196.142578125\" xlink:href=\"#BitstreamVeraSans-Roman-73\"/>\n",
" <use x=\"248.2421875\" xlink:href=\"#BitstreamVeraSans-Roman-20\"/>\n",
" <use x=\"280.029296875\" xlink:href=\"#BitstreamVeraSans-Roman-49\"/>\n",
" <use x=\"309.521484375\" xlink:href=\"#BitstreamVeraSans-Roman-20\"/>\n",
" <use x=\"341.30859375\" xlink:href=\"#BitstreamVeraSans-Roman-76\"/>\n",
" <use x=\"400.48828125\" xlink:href=\"#BitstreamVeraSans-Roman-73\"/>\n",
" <use x=\"452.587890625\" xlink:href=\"#BitstreamVeraSans-Roman-20\"/>\n",
" <use x=\"484.375\" xlink:href=\"#BitstreamVeraSans-Roman-49\"/>\n",
" <use x=\"513.8671875\" xlink:href=\"#BitstreamVeraSans-Roman-49\"/>\n",
" <use x=\"543.359375\" xlink:href=\"#BitstreamVeraSans-Roman-20\"/>\n",
" <use x=\"575.146484375\" xlink:href=\"#BitstreamVeraSans-Roman-66\"/>\n",
" <use x=\"610.3515625\" xlink:href=\"#BitstreamVeraSans-Roman-72\"/>\n",
" <use x=\"649.21484375\" xlink:href=\"#BitstreamVeraSans-Roman-65\"/>\n",
" <use x=\"710.73828125\" xlink:href=\"#BitstreamVeraSans-Roman-71\"/>\n",
" <use x=\"774.21484375\" xlink:href=\"#BitstreamVeraSans-Roman-75\"/>\n",
" <use x=\"837.59375\" xlink:href=\"#BitstreamVeraSans-Roman-65\"/>\n",
" <use x=\"899.1171875\" xlink:href=\"#BitstreamVeraSans-Roman-6e\"/>\n",
" <use x=\"962.49609375\" xlink:href=\"#BitstreamVeraSans-Roman-63\"/>\n",
" <use x=\"1017.4765625\" xlink:href=\"#BitstreamVeraSans-Roman-69\"/>\n",
" <use x=\"1045.259765625\" xlink:href=\"#BitstreamVeraSans-Roman-65\"/>\n",
" <use x=\"1106.783203125\" xlink:href=\"#BitstreamVeraSans-Roman-73\"/>\n",
" <use x=\"1158.8828125\" xlink:href=\"#BitstreamVeraSans-Roman-20\"/>\n",
" <use x=\"1190.669921875\" xlink:href=\"#BitstreamVeraSans-Roman-61\"/>\n",
" <use x=\"1251.94921875\" xlink:href=\"#BitstreamVeraSans-Roman-63\"/>\n",
" <use x=\"1306.9296875\" xlink:href=\"#BitstreamVeraSans-Roman-72\"/>\n",
" <use x=\"1345.79296875\" xlink:href=\"#BitstreamVeraSans-Roman-6f\"/>\n",
" <use x=\"1406.974609375\" xlink:href=\"#BitstreamVeraSans-Roman-73\"/>\n",
" <use x=\"1459.07421875\" xlink:href=\"#BitstreamVeraSans-Roman-73\"/>\n",
" <use x=\"1511.173828125\" xlink:href=\"#BitstreamVeraSans-Roman-20\"/>\n",
" <use x=\"1542.9609375\" xlink:href=\"#BitstreamVeraSans-Roman-74\"/>\n",
" <use x=\"1582.169921875\" xlink:href=\"#BitstreamVeraSans-Roman-68\"/>\n",
" <use x=\"1645.548828125\" xlink:href=\"#BitstreamVeraSans-Roman-65\"/>\n",
" <use x=\"1707.072265625\" xlink:href=\"#BitstreamVeraSans-Roman-20\"/>\n",
" <use x=\"1738.859375\" xlink:href=\"#BitstreamVeraSans-Roman-54\"/>\n",
" <use x=\"1782.943359375\" xlink:href=\"#BitstreamVeraSans-Roman-65\"/>\n",
" <use x=\"1844.466796875\" xlink:href=\"#BitstreamVeraSans-Roman-72\"/>\n",
" <use x=\"1883.830078125\" xlink:href=\"#BitstreamVeraSans-Roman-6d\"/>\n",
" <use x=\"1981.2421875\" xlink:href=\"#BitstreamVeraSans-Roman-20\"/>\n",
" <use x=\"2013.029296875\" xlink:href=\"#BitstreamVeraSans-Roman-56\"/>\n",
" <use x=\"2073.6875\" xlink:href=\"#BitstreamVeraSans-Roman-65\"/>\n",
" <use x=\"2135.2109375\" xlink:href=\"#BitstreamVeraSans-Roman-63\"/>\n",
" <use x=\"2190.19140625\" xlink:href=\"#BitstreamVeraSans-Roman-74\"/>\n",
" <use x=\"2229.400390625\" xlink:href=\"#BitstreamVeraSans-Roman-6f\"/>\n",
" <use x=\"2290.58203125\" xlink:href=\"#BitstreamVeraSans-Roman-72\"/>\n",
" </g>\n",
" </g>\n",
" </g>\n",
" <defs>\n",
" <clipPath id=\"p0bf5753ecd\">\n",
" <rect height=\"334.8\" width=\"446.4\" x=\"35.253125\" y=\"12.0390625\"/>\n",
" </clipPath>\n",
" </defs>\n",
"</svg>\n"
],
"text": [
"<matplotlib.figure.Figure at 0x10790b0f0>"
]
}
],
"prompt_number": 122
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# check for degeneracy in the transformed data matrix:"
],
"language": "python",
"metadata": {},
"outputs": []
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# by calculating the covariance matrix of the data matrix (matrix whose rows \n",
"# is comprised of feature vectors--just first 25 most frequent terms)\n",
"D1 = D[:,:20]\n",
"C = NP.corrcoef(D1, rowvar=0)\n",
"C.shape\n",
"\n",
"# a correctly computed covariance matrix will have '1's down the main diagonal &\n",
"# have a shape of n x n (from the original m x n array\n",
"dg = C.diagonal()\n",
"\n",
"assert NP.trace(C) == dg.size\n",
"assert C.shape == (D1.shape[1], D1.shape[1])\n",
"\n",
"NP.set_printoptions(precision=2, suppress=True, linewidth=130)\n",
"from pprint import pprint\n",
"print(C)\n",
"\n",
"\n",
"# fig = PLT.figure(figsize=(8, 6))\n",
"# ax = fig.add_subplot(111, xticks=[], yticks=[])\n",
"# ax.imshow(C, cmap=CM.Greys, interpolation='nearest')\n",
"\n",
"# the covariance matrix, rendered below as a 'heatmap' indicates minimal feature covariance\n",
"# this is good news for the data in its current form, but it also indicates that\n",
"# PCA might not be a useful pre-processing technique for this data"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"[[ 1. 0. 0.02 0.04 0.01 0.03 0.14 0.08 0.09 -0.01 0.1 0.11 0.04 0.05 0.02 0.16 -0.02 0.04 -0.02 -0. ]\n",
" [ 0. 1. 0.06 -0.01 0.09 0.07 0.01 0.11 0.11 0.02 0.04 0.07 0.03 -0.01 -0.04 0.06 -0.03 0.03 0.16 0.05]\n",
" [ 0.02 0.06 1. 0.05 0.1 -0. -0. 0. 0.04 -0.02 -0. 0.01 -0.01 -0.01 -0.01 0.03 -0.02 -0.02 0.01 -0. ]\n",
" [ 0.04 -0.01 0.05 1. 0.04 0.06 0.08 0.03 0.02 -0.03 0.04 0.05 0.09 0.04 0.04 0.12 -0.01 0.03 -0.04 -0.02]\n",
" [ 0.01 0.09 0.1 0.04 1. 0.02 0.03 0.17 0.18 0.05 0.08 0.08 0.04 -0.02 -0.03 0.06 -0.02 0.06 -0.01 0.05]\n",
" [ 0.03 0.07 -0. 0.06 0.02 1. 0.02 0.03 0.01 0.01 0.06 0.02 0.02 -0.01 -0.01 0.06 -0. 0.02 -0.01 0.02]\n",
" [ 0.14 0.01 -0. 0.08 0.03 0.02 1. 0.05 -0.01 -0.02 0.18 0.01 0.01 0.03 -0.01 0.15 -0.01 0.05 -0.03 -0.02]\n",
" [ 0.08 0.11 0. 0.03 0.17 0.03 0.05 1. 0.32 0.08 0.18 0.13 0.12 0.02 -0.02 0.23 -0.03 0.27 -0.04 0. ]\n",
" [ 0.09 0.11 0.04 0.02 0.18 0.01 -0.01 0.32 1. 0.03 0.07 0.05 0.13 -0.02 0. 0.11 -0.02 0.1 -0.02 -0. ]\n",
" [-0.01 0.02 -0.02 -0.03 0.05 0.01 -0.02 0.08 0.03 1. 0.07 0.03 0.05 0.05 0.05 0. -0.02 0.04 -0.03 -0.01]\n",
" [ 0.1 0.04 -0. 0.04 0.08 0.06 0.18 0.18 0.07 0.07 1. 0.07 0.06 0.06 0.04 0.14 -0.02 0.1 -0.03 0.01]\n",
" [ 0.11 0.07 0.01 0.05 0.08 0.02 0.01 0.13 0.05 0.03 0.07 1. 0.01 0.02 0.06 0.12 -0.02 0.06 -0.03 0.01]\n",
" [ 0.04 0.03 -0.01 0.09 0.04 0.02 0.01 0.12 0.13 0.05 0.06 0.01 1. -0.03 0.01 0.18 -0.02 0.17 -0.02 -0.01]\n",
" [ 0.05 -0.01 -0.01 0.04 -0.02 -0.01 0.03 0.02 -0.02 0.05 0.06 0.02 -0.03 1. 0.16 -0.03 0.02 -0.02 -0.02 -0.01]\n",
" [ 0.02 -0.04 -0.01 0.04 -0.03 -0.01 -0.01 -0.02 0. 0.05 0.04 0.06 0.01 0.16 1. 0.02 0.02 -0.02 0.03 -0.01]\n",
" [ 0.16 0.06 0.03 0.12 0.06 0.06 0.15 0.23 0.11 0. 0.14 0.12 0.18 -0.03 0.02 1. -0.03 0.12 -0.01 -0.01]\n",
" [-0.02 -0.03 -0.02 -0.01 -0.02 -0. -0.01 -0.03 -0.02 -0.02 -0.02 -0.02 -0.02 0.02 0.02 -0.03 1. -0.02 -0.01 -0.01]\n",
" [ 0.04 0.03 -0.02 0.03 0.06 0.02 0.05 0.27 0.1 0.04 0.1 0.06 0.17 -0.02 -0.02 0.12 -0.02 1. -0.03 -0.02]\n",
" [-0.02 0.16 0.01 -0.04 -0.01 -0.01 -0.03 -0.04 -0.02 -0.03 -0.03 -0.03 -0.02 -0.02 0.03 -0.01 -0.01 -0.03 1. -0.01]\n",
" [-0. 0.05 -0. -0.02 0.05 0.02 -0.02 0. -0. -0.01 0.01 0.01 -0.01 -0.01 -0.01 -0.01 -0.01 -0.02 -0.01 1. ]]\n"
]
}
],
"prompt_number": 133
},
{
"cell_type": "heading",
"level": 3,
"metadata": {},
"source": [
"A Few Utility Functions"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"def standardize(data):\n",
" \"\"\"\n",
" mean centers the data & scales it to unit variance\n",
" \"\"\"\n",
" data_mean = data.mean(axis=0)\n",
" data_std = data.std(axis=0)\n",
" data -= data_mean\n",
" data /= data_std\n",
" return data"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 134
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"def partition_data(data, class_labels, train_test_ratio=.9):\n",
" \"\"\"\n",
" returns: data & class labels, split into training and test groups,\n",
" as 2 x 2-tuples; \n",
" these 2 containers are suitable to pass to scikit-learn classifier\n",
" objects\n",
" to call their 'fit' method, pass in *tr; \n",
" for 'predict', pass in te[0];\n",
" for 'score' pass in *te\n",
" pass in: \n",
" data, 2D NumPy array\n",
" class labels, 1D NumPy array\n",
" train:test ratio: 0 < f < 1, default is 0.9\n",
" call this function bound to two variables, \n",
" eg, train, test = partition_data()\n",
" \"\"\"\n",
" # create a vector that holds the row indices\n",
" NP.random.seed(0)\n",
" idx = NP.random.permutation(data.shape[0])\n",
" # now order both data and class labels arrays by idx\n",
" D = data[idx,]\n",
" L = class_labels[idx]\n",
" # allocate the data to test & train partitions according to\n",
" # the train_test_ratio passed in\n",
" q = int(NP.ceil(train_test_ratio * D.shape[0]))\n",
" D_tr = D[:q,:]\n",
" D_te = D[q:,:]\n",
" L_tr = L[:q]\n",
" L_te = L[q:]\n",
" assert D_tr.shape[0] + D_te.shape[0] == D.shape[0]\n",
" assert L_tr.shape[0] + L_te.shape[0] == L.shape[0]\n",
" # 1D array required by scikit-learn\n",
" L_tr, L_te = NP.squeeze(L_tr), NP.squeeze(L_te)\n",
" return (D_tr, L_tr), (D_te, L_te)"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 135
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"def create_confmat(actual, predicted, prettyprint=1):\n",
" \"\"\"\n",
" returns: confusion matrix displayed by \n",
" pass in: 2 x 1D NumPy arrays\n",
" \"\"\"\n",
" from sympy import Matrix as MAT\n",
" a, p = NP.squeeze(actual), NP.squeeze(predicted)\n",
" idx0, idx1 = a==0, a==1\n",
" x0, y0 = a[idx0], p[idx0]\n",
" x1, y1 = a[idx1], p[idx1]\n",
" c00 = NP.where((a==0) & (a==p))[0].size\n",
" c11 = NP.where((a==1) & (a==p))[0].size\n",
" c01 = NP.where((a==0) & (a!=p))[0].size\n",
" c10 = NP.where((a==1) & (a!=p))[0].size\n",
" CM = NP.zeros((2, 2))\n",
" CM[0,0] = c00\n",
" CM[1,1] = c11\n",
" CM[0,1] = c01\n",
" CM[1,0] = c10\n",
" if prettyprint:\n",
" return MAT(CM)\n",
" else:\n",
" return CM"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 136
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"def fraction_correct(actual, predicted):\n",
" \"\"\"\n",
" returns: correctly classified instances as a decimal fraction\n",
" pass in: two 1D arrays comprised of class labels represented as integers\n",
" 'actual' is the result returned fro the call to the classifier object's\n",
" 'predict' method (passing in the unlabeled testing data)\n",
" \"\"\"\n",
" actual, predicted = NP.squeeze(actual), NP.squeeze(predicted)\n",
" fc = (actual.size - NP.abs(actual - predicted).sum()) / actual.size\n",
" return round(fc, 3)"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 137
},
{
"cell_type": "heading",
"level": 3,
"metadata": {},
"source": [
"III. Prepare the Data for Input to a Classifier"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# mean center the data & standardize to unit variance\n",
"D = standardize(D)\n",
"\n",
"# some assertion fixtures:\n",
"mx = D.mean(axis=0)\n",
"ms = NP.zeros(D.shape[1])\n",
"vx = D.var(axis=0)\n",
"vs = NP.ones(D.shape[1])\n",
"\n",
"# assertions\n",
"NP.testing.assert_array_almost_equal(mx, ms, decimal=4)\n",
"NP.testing.assert_array_almost_equal(vx, vs, decimal=4)"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 138
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# shuffle the data\n",
"L = L.reshape(-1, 1)\n",
"DL = NP.hstack((D, L))\n",
"idx = NP.random.permutation(NP.arange(D.shape[0]))\n",
"DL = DL[idx,]\n",
"\n",
"D, L = NP.hsplit(DL, [-1])"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 139
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# partition the data into training & test sets\n",
"tr, te = partition_data(D, L)"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 140
},
{
"cell_type": "heading",
"level": 3,
"metadata": {},
"source": [
"IV. Build the Classifiers"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"from sklearn import svm as SVM\n",
"from sklearn.svm import SVC\n",
"from sklearn import linear_model as LM\n",
"from sklearn.grid_search import GridSearchCV\n",
"from sklearn.cross_validation import train_test_split\n",
"from sklearn.metrics import roc_curve as ROC\n",
"from sklearn.metrics import auc as AUC\n",
"from sklearn.metrics import classification_report"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 141
},
{
"cell_type": "heading",
"level": 3,
"metadata": {},
"source": [
"Logistic Regression"
]
},
{
"cell_type": "heading",
"level": 4,
"metadata": {},
"source": [
"instantiate the logistic regressor (actually a classifier)"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"lr = LM.LogisticRegression(C=.1, penalty='l1', tol=1e-6)"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 142
},
{
"cell_type": "heading",
"level": 4,
"metadata": {},
"source": [
"train this classifier on the labeled training data"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"lr.fit(*tr)"
],
"language": "python",
"metadata": {},
"outputs": [
{
"metadata": {},
"output_type": "pyout",
"prompt_number": 143,
"text": [
"LogisticRegression(C=0.1, class_weight=None, dual=False, fit_intercept=True,\n",
" intercept_scaling=1, penalty='l1', random_state=None, tol=1e-06)"
]
}
],
"prompt_number": 143
},
{
"cell_type": "heading",
"level": 4,
"metadata": {},
"source": [
"use the trained classifier to predict the class of the unlabled training data"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"lr_pred = lr.predict(te[0])\n",
"\n",
"st = '(logistic regression) fraction of testing instances correctly predicted: '\n",
"print(\"{0}{1}\".format(st, fraction_correct(lr_pred, te[1])))"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"(logistic regression) fraction of testing instances correctly predicted: 0.856\n"
]
}
],
"prompt_number": 144
},
{
"cell_type": "heading",
"level": 3,
"metadata": {},
"source": [
"support vector machine"
]
},
{
"cell_type": "heading",
"level": 4,
"metadata": {},
"source": [
"instantiate the support vector machine classifier"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
" svc = SVM.NuSVC(\n",
" nu=0.3, # lower bound on % data as support vectors\n",
" kernel='poly', # begin w/ polynomial kernel\n",
" degree=2, # simplest polynomial \n",
" gamma=0.0, # only relevant for other kernel types (eg, rbf); ignored otherwise\n",
" coef0=1000, # degree & cofe0 are the hyperparamaters for polynomial kernel\n",
" shrinking=True, \n",
" probability=True, # need to set this flag to 'True' for ROC calculation\n",
" tol=0.0005, # convergence criterion \n",
" cache_size=200, \n",
" verbose=True, # sends to terminal, not ipython nb!\n",
" max_iter=-1, # no iteration count threshold\n",
" random_state=None\n",
" )"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 145
},
{
"cell_type": "heading",
"level": 4,
"metadata": {},
"source": [
"train the svm classifier on the labeled training data"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"import warnings\n",
"wstr = \"\"\"using a non-integer number instead of an integer will result in an error in the future\"\"\"\n",
"warnings.filterwarnings('ignore', wstr)\n",
"\n",
"svc.fit(*tr)"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"[LibSVM]"
]
},
{
"metadata": {},
"output_type": "pyout",
"prompt_number": 146,
"text": [
"NuSVC(cache_size=200, coef0=1000, degree=2, gamma=0.0, kernel='poly',\n",
" max_iter=-1, nu=0.3, probability=True, random_state=None,\n",
" shrinking=True, tol=0.0005, verbose=True)"
]
}
],
"prompt_number": 146
},
{
"cell_type": "heading",
"level": 4,
"metadata": {},
"source": [
"use the trained classifier to predict the class of the unlabled training data"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"svc_pred = svc.predict(te[0])\n",
"\n",
"st = '(svm) fraction of testing instances correctly predicted: '\n",
"print(\"{0}{1}\".format(st, fraction_correct(svc_pred, te[1])))"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"(svm) fraction of testing instances correctly predicted: 0.844\n"
]
}
],
"prompt_number": 147
},
{
"cell_type": "heading",
"level": 3,
"metadata": {},
"source": [
"Evaluate Classifier Performance"
]
},
{
"cell_type": "heading",
"level": 4,
"metadata": {},
"source": [
"ROC"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# because we have a binary classification problem, \n",
"# we can use ROC to evaluate the quality of these models\n",
"\n",
"#logistic regression\n",
"pred_prob_lr = lr.predict_proba(te[0])\n",
"false_pos_rate_lr, true_pos_rate_lr, thresholds_lr = ROC(te[1], pred_prob_lr[:,1])\n",
"roc_auc_lr = AUC(false_pos_rate_lr, true_pos_rate_lr)\n",
"print(\"Logisitc Regression, area under the curve: {0:>9.3f}\".format(roc_auc_lr))\n",
"\n",
"# svm\n",
"pred_prob_svm = svc.predict_proba(te[0])\n",
"false_pos_rate_svm, true_pos_rate_svm, thresholds_svm = ROC(te[1], pred_prob_svm[:,1])\n",
"roc_auc_svm = AUC(false_pos_rate_svm, true_pos_rate_svm)\n",
"print(\"SVM, area under the curve: {0:>25.3f}\".format(roc_auc_svm))"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"Logisitc Regression, area under the curve: 0.929\n",
"SVM, area under the curve: 0.933\n"
]
}
],
"prompt_number": 148
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# plot the ROC curves for each classifier\n",
"\n",
"fpr_lr, tpr_lr = false_pos_rate_lr, true_pos_rate_lr\n",
"fpr_svm, tpr_svm = false_pos_rate_svm, true_pos_rate_svm\n",
"fig = PLT.figure(figsize=(8, 6))\n",
"ax1 = fig.add_subplot(111)\n",
"ax1.plot(fpr_lr, tpr_lr, color='#FF7F49', lw=1.5) # logistic regression ROC curve is orange\n",
"ax1.plot(fpr_svm, tpr_svm, color='#0D98BA', lw=1.5) # svm ROC curve is blue\n",
"ax1.plot([0, 1], [0, 1], 'k--')\n",
"fig.text(.5, .88, ttl, ha='center', va='top', color='#062A78', fontsize=12)\n",
"PLT.xlim([0., 1.])\n",
"PLT.ylim([0., 1.])\n",
"PLT.xlabel('false positive rate')\n",
"PLT.ylabel('true positive rate')\n",
"ax1.grid(True)\n"
],
"language": "python",
"metadata": {},
"outputs": [
{
"metadata": {},
"output_type": "display_data",
"svg": [
"<?xml version=\"1.0\" encoding=\"utf-8\" standalone=\"no\"?>\n",
"<!DOCTYPE svg PUBLIC \"-//W3C//DTD SVG 1.1//EN\"\n",
" \"http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd\">\n",
"<!-- Created with matplotlib (http://matplotlib.org/) -->\n",
"<svg height=\"382pt\" version=\"1.1\" viewBox=\"0 0 501 382\" width=\"501pt\" xmlns=\"http://www.w3.org/2000/svg\" xmlns:xlink=\"http://www.w3.org/1999/xlink\">\n",
" <defs>\n",
" <style type=\"text/css\">\n",
"*{stroke-linecap:butt;stroke-linejoin:round;}\n",
" </style>\n",
" </defs>\n",
" <g id=\"figure_1\">\n",
" <g id=\"patch_1\">\n",
" <path d=\"\n",
"M0 382.395\n",
"L501.231 382.395\n",
"L501.231 0\n",
"L0 0\n",
"z\n",
"\" style=\"fill:#ffffff;\"/>\n",
" </g>\n",
" <g id=\"axes_1\">\n",
" <g id=\"patch_2\">\n",
" <path d=\"\n",
"M40.5609 346.839\n",
"L486.961 346.839\n",
"L486.961 12.0391\n",
"L40.5609 12.0391\n",
"z\n",
"\" style=\"fill:#ffffff;\"/>\n",
" </g>\n",
" <g id=\"line2d_1\">\n",
" <path clip-path=\"url(#pa76d04f499)\" d=\"\n",
"M40.5609 344.43\n",
"L40.5609 240.859\n",
"L43.9686 240.859\n",
"L43.9686 137.288\n",
"L47.3762 137.288\n",
"L47.3762 130.062\n",
"L50.7838 130.062\n",
"L50.7838 122.836\n",
"L54.1915 122.836\n",
"L54.1915 113.202\n",
"L57.5991 113.202\n",
"L57.5991 108.384\n",
"L61.0067 108.384\n",
"L61.0067 101.158\n",
"L64.4144 101.158\n",
"L64.4144 98.7499\n",
"L67.822 98.7499\n",
"L67.822 86.7067\n",
"L71.2296 86.7067\n",
"L71.2296 81.8894\n",
"L74.6373 81.8894\n",
"L74.6373 74.6635\n",
"L81.4525 74.6635\n",
"L81.4525 72.2549\n",
"L88.2678 72.2549\n",
"L132.567 57.8031\n",
"L132.567 55.3945\n",
"L135.975 55.3945\n",
"L135.975 52.9858\n",
"L146.198 52.9858\n",
"L146.198 43.3513\n",
"L156.42 43.3513\n",
"L156.42 40.9427\n",
"L187.089 40.9427\n",
"L187.089 36.1254\n",
"L197.312 36.1254\n",
"L197.312 33.7168\n",
"L200.72 33.7168\n",
"L200.72 31.3081\n",
"L245.019 31.3081\n",
"L245.019 28.8995\n",
"L258.649 28.8995\n",
"L258.649 26.4909\n",
"L265.465 26.4909\n",
"L265.465 24.0822\n",
"L272.28 24.0822\n",
"L272.28 21.6736\n",
"L299.541 21.6736\n",
"L299.541 19.265\n",
"L313.172 19.265\n",
"L313.172 16.8563\n",
"L316.579 16.8563\n",
"L316.579 12.0391\n",
"L486.961 12.0391\n",
"L486.961 12.0391\" style=\"fill:none;stroke:#ff7f49;stroke-linecap:square;stroke-width:1.5;\"/>\n",
" </g>\n",
" <g id=\"line2d_2\">\n",
" <path clip-path=\"url(#pa76d04f499)\" d=\"\n",
"M40.5609 332.387\n",
"L40.5609 238.451\n",
"L43.9686 238.451\n",
"L43.9686 216.773\n",
"L47.3762 216.773\n",
"L47.3762 134.879\n",
"L50.7838 134.879\n",
"L50.7838 125.245\n",
"L54.1915 125.245\n",
"L54.1915 105.976\n",
"L57.5991 105.976\n",
"L57.5991 101.158\n",
"L71.2296 101.158\n",
"L71.2296 91.524\n",
"L74.6373 91.524\n",
"L74.6373 89.1153\n",
"L78.0449 81.8894\n",
"L81.4525 81.8894\n",
"L81.4525 79.4808\n",
"L84.8602 79.4808\n",
"L84.8602 72.2549\n",
"L88.2678 72.2549\n",
"L88.2678 69.8463\n",
"L91.6754 69.8463\n",
"L91.6754 67.4376\n",
"L95.0831 67.4376\n",
"L95.0831 65.029\n",
"L112.121 65.029\n",
"L112.121 62.6204\n",
"L142.79 48.1686\n",
"L142.79 38.534\n",
"L146.198 38.534\n",
"L146.198 36.1254\n",
"L149.605 36.1254\n",
"L149.605 33.7168\n",
"L156.42 33.7168\n",
"L156.42 31.3081\n",
"L210.943 31.3081\n",
"L210.943 28.8995\n",
"L258.649 28.8995\n",
"L258.649 24.0822\n",
"L268.872 24.0822\n",
"L268.872 21.6736\n",
"L275.688 21.6736\n",
"L275.688 19.265\n",
"L282.503 19.265\n",
"L282.503 14.4477\n",
"L292.726 14.4477\n",
"L292.726 12.0391\n",
"L486.961 12.0391\n",
"L486.961 12.0391\" style=\"fill:none;stroke:#0d98ba;stroke-linecap:square;stroke-width:1.5;\"/>\n",
" </g>\n",
" <g id=\"line2d_3\">\n",
" <path clip-path=\"url(#pa76d04f499)\" d=\"\n",
"M40.5609 346.839\n",
"L486.961 12.0391\" style=\"fill:none;stroke:#000000;stroke-dasharray:6.000000,6.000000;stroke-dashoffset:0.0;\"/>\n",
" </g>\n",
" <g id=\"matplotlib.axis_1\">\n",
" <g id=\"xtick_1\">\n",
" <g id=\"line2d_4\">\n",
" <path clip-path=\"url(#pa76d04f499)\" d=\"\n",
"M40.5609 346.839\n",
"L40.5609 12.0391\" style=\"fill:none;stroke:#000000;stroke-dasharray:1.000000,3.000000;stroke-dashoffset:0.0;stroke-width:0.5;\"/>\n",
" </g>\n",
" <g id=\"line2d_5\">\n",
" <defs>\n",
" <path d=\"\n",
"M0 0\n",
"L0 -4\" id=\"mc7db9fdffb\" style=\"stroke:#000000;stroke-width:0.5;\"/>\n",
" </defs>\n",
" <g>\n",
" <use style=\"stroke:#000000;stroke-width:0.5;\" x=\"40.5609375\" xlink:href=\"#mc7db9fdffb\" y=\"346.8390625\"/>\n",
" </g>\n",
" </g>\n",
" <g id=\"line2d_6\">\n",
" <defs>\n",
" <path d=\"\n",
"M0 0\n",
"L0 4\" id=\"m5a7d422ac3\" style=\"stroke:#000000;stroke-width:0.5;\"/>\n",
" </defs>\n",
" <g>\n",
" <use style=\"stroke:#000000;stroke-width:0.5;\" x=\"40.5609375\" xlink:href=\"#m5a7d422ac3\" y=\"12.0390625\"/>\n",
" </g>\n",
" </g>\n",
" <g id=\"text_1\">\n",
" <!-- 0.0 -->\n",
" <defs>\n",
" <path d=\"\n",
"M10.6875 12.4062\n",
"L21 12.4062\n",
"L21 0\n",
"L10.6875 0\n",
"z\n",
"\" id=\"BitstreamVeraSans-Roman-2e\"/>\n",
" <path d=\"\n",
"M31.7812 66.4062\n",
"Q24.1719 66.4062 20.3281 58.9062\n",
"Q16.5 51.4219 16.5 36.375\n",
"Q16.5 21.3906 20.3281 13.8906\n",
"Q24.1719 6.39062 31.7812 6.39062\n",
"Q39.4531 6.39062 43.2812 13.8906\n",
"Q47.125 21.3906 47.125 36.375\n",
"Q47.125 51.4219 43.2812 58.9062\n",
"Q39.4531 66.4062 31.7812 66.4062\n",
"M31.7812 74.2188\n",
"Q44.0469 74.2188 50.5156 64.5156\n",
"Q56.9844 54.8281 56.9844 36.375\n",
"Q56.9844 17.9688 50.5156 8.26562\n",
"Q44.0469 -1.42188 31.7812 -1.42188\n",
"Q19.5312 -1.42188 13.0625 8.26562\n",
"Q6.59375 17.9688 6.59375 36.375\n",
"Q6.59375 54.8281 13.0625 64.5156\n",
"Q19.5312 74.2188 31.7812 74.2188\" id=\"BitstreamVeraSans-Roman-30\"/>\n",
" </defs>\n",
" <g transform=\"translate(33.27109375 358.4375)scale(0.1 -0.1)\">\n",
" <use xlink:href=\"#BitstreamVeraSans-Roman-30\"/>\n",
" <use x=\"63.623046875\" xlink:href=\"#BitstreamVeraSans-Roman-2e\"/>\n",
" <use x=\"95.41015625\" xlink:href=\"#BitstreamVeraSans-Roman-30\"/>\n",
" </g>\n",
" </g>\n",
" </g>\n",
" <g id=\"xtick_2\">\n",
" <g id=\"line2d_7\">\n",
" <path clip-path=\"url(#pa76d04f499)\" d=\"\n",
"M129.841 346.839\n",
"L129.841 12.0391\" style=\"fill:none;stroke:#000000;stroke-dasharray:1.000000,3.000000;stroke-dashoffset:0.0;stroke-width:0.5;\"/>\n",
" </g>\n",
" <g id=\"line2d_8\">\n",
" <g>\n",
" <use style=\"stroke:#000000;stroke-width:0.5;\" x=\"129.8409375\" xlink:href=\"#mc7db9fdffb\" y=\"346.8390625\"/>\n",
" </g>\n",
" </g>\n",
" <g id=\"line2d_9\">\n",
" <g>\n",
" <use style=\"stroke:#000000;stroke-width:0.5;\" x=\"129.8409375\" xlink:href=\"#m5a7d422ac3\" y=\"12.0390625\"/>\n",
" </g>\n",
" </g>\n",
" <g id=\"text_2\">\n",
" <!-- 0.2 -->\n",
" <defs>\n",
" <path d=\"\n",
"M19.1875 8.29688\n",
"L53.6094 8.29688\n",
"L53.6094 0\n",
"L7.32812 0\n",
"L7.32812 8.29688\n",
"Q12.9375 14.1094 22.625 23.8906\n",
"Q32.3281 33.6875 34.8125 36.5312\n",
"Q39.5469 41.8438 41.4219 45.5312\n",
"Q43.3125 49.2188 43.3125 52.7812\n",
"Q43.3125 58.5938 39.2344 62.25\n",
"Q35.1562 65.9219 28.6094 65.9219\n",
"Q23.9688 65.9219 18.8125 64.3125\n",
"Q13.6719 62.7031 7.8125 59.4219\n",
"L7.8125 69.3906\n",
"Q13.7656 71.7812 18.9375 73\n",
"Q24.125 74.2188 28.4219 74.2188\n",
"Q39.75 74.2188 46.4844 68.5469\n",
"Q53.2188 62.8906 53.2188 53.4219\n",
"Q53.2188 48.9219 51.5312 44.8906\n",
"Q49.8594 40.875 45.4062 35.4062\n",
"Q44.1875 33.9844 37.6406 27.2188\n",
"Q31.1094 20.4531 19.1875 8.29688\" id=\"BitstreamVeraSans-Roman-32\"/>\n",
" </defs>\n",
" <g transform=\"translate(122.71984375 358.4375)scale(0.1 -0.1)\">\n",
" <use xlink:href=\"#BitstreamVeraSans-Roman-30\"/>\n",
" <use x=\"63.623046875\" xlink:href=\"#BitstreamVeraSans-Roman-2e\"/>\n",
" <use x=\"95.41015625\" xlink:href=\"#BitstreamVeraSans-Roman-32\"/>\n",
" </g>\n",
" </g>\n",
" </g>\n",
" <g id=\"xtick_3\">\n",
" <g id=\"line2d_10\">\n",
" <path clip-path=\"url(#pa76d04f499)\" d=\"\n",
"M219.121 346.839\n",
"L219.121 12.0391\" style=\"fill:none;stroke:#000000;stroke-dasharray:1.000000,3.000000;stroke-dashoffset:0.0;stroke-width:0.5;\"/>\n",
" </g>\n",
" <g id=\"line2d_11\">\n",
" <g>\n",
" <use style=\"stroke:#000000;stroke-width:0.5;\" x=\"219.1209375\" xlink:href=\"#mc7db9fdffb\" y=\"346.8390625\"/>\n",
" </g>\n",
" </g>\n",
" <g id=\"line2d_12\">\n",
" <g>\n",
" <use style=\"stroke:#000000;stroke-width:0.5;\" x=\"219.1209375\" xlink:href=\"#m5a7d422ac3\" y=\"12.0390625\"/>\n",
" </g>\n",
" </g>\n",
" <g id=\"text_3\">\n",
" <!-- 0.4 -->\n",
" <defs>\n",
" <path d=\"\n",
"M37.7969 64.3125\n",
"L12.8906 25.3906\n",
"L37.7969 25.3906\n",
"z\n",
"\n",
"M35.2031 72.9062\n",
"L47.6094 72.9062\n",
"L47.6094 25.3906\n",
"L58.0156 25.3906\n",
"L58.0156 17.1875\n",
"L47.6094 17.1875\n",
"L47.6094 0\n",
"L37.7969 0\n",
"L37.7969 17.1875\n",
"L4.89062 17.1875\n",
"L4.89062 26.7031\n",
"z\n",
"\" id=\"BitstreamVeraSans-Roman-34\"/>\n",
" </defs>\n",
" <g transform=\"translate(211.77953125 358.4375)scale(0.1 -0.1)\">\n",
" <use xlink:href=\"#BitstreamVeraSans-Roman-30\"/>\n",
" <use x=\"63.623046875\" xlink:href=\"#BitstreamVeraSans-Roman-2e\"/>\n",
" <use x=\"95.41015625\" xlink:href=\"#BitstreamVeraSans-Roman-34\"/>\n",
" </g>\n",
" </g>\n",
" </g>\n",
" <g id=\"xtick_4\">\n",
" <g id=\"line2d_13\">\n",
" <path clip-path=\"url(#pa76d04f499)\" d=\"\n",
"M308.401 346.839\n",
"L308.401 12.0391\" style=\"fill:none;stroke:#000000;stroke-dasharray:1.000000,3.000000;stroke-dashoffset:0.0;stroke-width:0.5;\"/>\n",
" </g>\n",
" <g id=\"line2d_14\">\n",
" <g>\n",
" <use style=\"stroke:#000000;stroke-width:0.5;\" x=\"308.4009375\" xlink:href=\"#mc7db9fdffb\" y=\"346.8390625\"/>\n",
" </g>\n",
" </g>\n",
" <g id=\"line2d_15\">\n",
" <g>\n",
" <use style=\"stroke:#000000;stroke-width:0.5;\" x=\"308.4009375\" xlink:href=\"#m5a7d422ac3\" y=\"12.0390625\"/>\n",
" </g>\n",
" </g>\n",
" <g id=\"text_4\">\n",
" <!-- 0.6 -->\n",
" <defs>\n",
" <path d=\"\n",
"M33.0156 40.375\n",
"Q26.375 40.375 22.4844 35.8281\n",
"Q18.6094 31.2969 18.6094 23.3906\n",
"Q18.6094 15.5312 22.4844 10.9531\n",
"Q26.375 6.39062 33.0156 6.39062\n",
"Q39.6562 6.39062 43.5312 10.9531\n",
"Q47.4062 15.5312 47.4062 23.3906\n",
"Q47.4062 31.2969 43.5312 35.8281\n",
"Q39.6562 40.375 33.0156 40.375\n",
"M52.5938 71.2969\n",
"L52.5938 62.3125\n",
"Q48.875 64.0625 45.0938 64.9844\n",
"Q41.3125 65.9219 37.5938 65.9219\n",
"Q27.8281 65.9219 22.6719 59.3281\n",
"Q17.5312 52.7344 16.7969 39.4062\n",
"Q19.6719 43.6562 24.0156 45.9219\n",
"Q28.375 48.1875 33.5938 48.1875\n",
"Q44.5781 48.1875 50.9531 41.5156\n",
"Q57.3281 34.8594 57.3281 23.3906\n",
"Q57.3281 12.1562 50.6875 5.35938\n",
"Q44.0469 -1.42188 33.0156 -1.42188\n",
"Q20.3594 -1.42188 13.6719 8.26562\n",
"Q6.98438 17.9688 6.98438 36.375\n",
"Q6.98438 53.6562 15.1875 63.9375\n",
"Q23.3906 74.2188 37.2031 74.2188\n",
"Q40.9219 74.2188 44.7031 73.4844\n",
"Q48.4844 72.75 52.5938 71.2969\" id=\"BitstreamVeraSans-Roman-36\"/>\n",
" </defs>\n",
" <g transform=\"translate(301.09390625 358.4375)scale(0.1 -0.1)\">\n",
" <use xlink:href=\"#BitstreamVeraSans-Roman-30\"/>\n",
" <use x=\"63.623046875\" xlink:href=\"#BitstreamVeraSans-Roman-2e\"/>\n",
" <use x=\"95.41015625\" xlink:href=\"#BitstreamVeraSans-Roman-36\"/>\n",
" </g>\n",
" </g>\n",
" </g>\n",
" <g id=\"xtick_5\">\n",
" <g id=\"line2d_16\">\n",
" <path clip-path=\"url(#pa76d04f499)\" d=\"\n",
"M397.681 346.839\n",
"L397.681 12.0391\" style=\"fill:none;stroke:#000000;stroke-dasharray:1.000000,3.000000;stroke-dashoffset:0.0;stroke-width:0.5;\"/>\n",
" </g>\n",
" <g id=\"line2d_17\">\n",
" <g>\n",
" <use style=\"stroke:#000000;stroke-width:0.5;\" x=\"397.6809375\" xlink:href=\"#mc7db9fdffb\" y=\"346.8390625\"/>\n",
" </g>\n",
" </g>\n",
" <g id=\"line2d_18\">\n",
" <g>\n",
" <use style=\"stroke:#000000;stroke-width:0.5;\" x=\"397.6809375\" xlink:href=\"#m5a7d422ac3\" y=\"12.0390625\"/>\n",
" </g>\n",
" </g>\n",
" <g id=\"text_5\">\n",
" <!-- 0.8 -->\n",
" <defs>\n",
" <path d=\"\n",
"M31.7812 34.625\n",
"Q24.75 34.625 20.7188 30.8594\n",
"Q16.7031 27.0938 16.7031 20.5156\n",
"Q16.7031 13.9219 20.7188 10.1562\n",
"Q24.75 6.39062 31.7812 6.39062\n",
"Q38.8125 6.39062 42.8594 10.1719\n",
"Q46.9219 13.9688 46.9219 20.5156\n",
"Q46.9219 27.0938 42.8906 30.8594\n",
"Q38.875 34.625 31.7812 34.625\n",
"M21.9219 38.8125\n",
"Q15.5781 40.375 12.0312 44.7188\n",
"Q8.5 49.0781 8.5 55.3281\n",
"Q8.5 64.0625 14.7188 69.1406\n",
"Q20.9531 74.2188 31.7812 74.2188\n",
"Q42.6719 74.2188 48.875 69.1406\n",
"Q55.0781 64.0625 55.0781 55.3281\n",
"Q55.0781 49.0781 51.5312 44.7188\n",
"Q48 40.375 41.7031 38.8125\n",
"Q48.8281 37.1562 52.7969 32.3125\n",
"Q56.7812 27.4844 56.7812 20.5156\n",
"Q56.7812 9.90625 50.3125 4.23438\n",
"Q43.8438 -1.42188 31.7812 -1.42188\n",
"Q19.7344 -1.42188 13.25 4.23438\n",
"Q6.78125 9.90625 6.78125 20.5156\n",
"Q6.78125 27.4844 10.7812 32.3125\n",
"Q14.7969 37.1562 21.9219 38.8125\n",
"M18.3125 54.3906\n",
"Q18.3125 48.7344 21.8438 45.5625\n",
"Q25.3906 42.3906 31.7812 42.3906\n",
"Q38.1406 42.3906 41.7188 45.5625\n",
"Q45.3125 48.7344 45.3125 54.3906\n",
"Q45.3125 60.0625 41.7188 63.2344\n",
"Q38.1406 66.4062 31.7812 66.4062\n",
"Q25.3906 66.4062 21.8438 63.2344\n",
"Q18.3125 60.0625 18.3125 54.3906\" id=\"BitstreamVeraSans-Roman-38\"/>\n",
" </defs>\n",
" <g transform=\"translate(390.40125 358.4375)scale(0.1 -0.1)\">\n",
" <use xlink:href=\"#BitstreamVeraSans-Roman-30\"/>\n",
" <use x=\"63.623046875\" xlink:href=\"#BitstreamVeraSans-Roman-2e\"/>\n",
" <use x=\"95.41015625\" xlink:href=\"#BitstreamVeraSans-Roman-38\"/>\n",
" </g>\n",
" </g>\n",
" </g>\n",
" <g id=\"xtick_6\">\n",
" <g id=\"line2d_19\">\n",
" <path clip-path=\"url(#pa76d04f499)\" d=\"\n",
"M486.961 346.839\n",
"L486.961 12.0391\" style=\"fill:none;stroke:#000000;stroke-dasharray:1.000000,3.000000;stroke-dashoffset:0.0;stroke-width:0.5;\"/>\n",
" </g>\n",
" <g id=\"line2d_20\">\n",
" <g>\n",
" <use style=\"stroke:#000000;stroke-width:0.5;\" x=\"486.9609375\" xlink:href=\"#mc7db9fdffb\" y=\"346.8390625\"/>\n",
" </g>\n",
" </g>\n",
" <g id=\"line2d_21\">\n",
" <g>\n",
" <use style=\"stroke:#000000;stroke-width:0.5;\" x=\"486.9609375\" xlink:href=\"#m5a7d422ac3\" y=\"12.0390625\"/>\n",
" </g>\n",
" </g>\n",
" <g id=\"text_6\">\n",
" <!-- 1.0 -->\n",
" <defs>\n",
" <path d=\"\n",
"M12.4062 8.29688\n",
"L28.5156 8.29688\n",
"L28.5156 63.9219\n",
"L10.9844 60.4062\n",
"L10.9844 69.3906\n",
"L28.4219 72.9062\n",
"L38.2812 72.9062\n",
"L38.2812 8.29688\n",
"L54.3906 8.29688\n",
"L54.3906 0\n",
"L12.4062 0\n",
"z\n",
"\" id=\"BitstreamVeraSans-Roman-31\"/>\n",
" </defs>\n",
" <g transform=\"translate(479.890625 358.4375)scale(0.1 -0.1)\">\n",
" <use xlink:href=\"#BitstreamVeraSans-Roman-31\"/>\n",
" <use x=\"63.623046875\" xlink:href=\"#BitstreamVeraSans-Roman-2e\"/>\n",
" <use x=\"95.41015625\" xlink:href=\"#BitstreamVeraSans-Roman-30\"/>\n",
" </g>\n",
" </g>\n",
" </g>\n",
" <g id=\"text_7\">\n",
" <!-- false positive rate -->\n",
" <defs>\n",
" <path d=\"\n",
"M9.42188 75.9844\n",
"L18.4062 75.9844\n",
"L18.4062 0\n",
"L9.42188 0\n",
"z\n",
"\" id=\"BitstreamVeraSans-Roman-6c\"/>\n",
" <path d=\"\n",
"M34.2812 27.4844\n",
"Q23.3906 27.4844 19.1875 25\n",
"Q14.9844 22.5156 14.9844 16.5\n",
"Q14.9844 11.7188 18.1406 8.90625\n",
"Q21.2969 6.10938 26.7031 6.10938\n",
"Q34.1875 6.10938 38.7031 11.4062\n",
"Q43.2188 16.7031 43.2188 25.4844\n",
"L43.2188 27.4844\n",
"z\n",
"\n",
"M52.2031 31.2031\n",
"L52.2031 0\n",
"L43.2188 0\n",
"L43.2188 8.29688\n",
"Q40.1406 3.32812 35.5469 0.953125\n",
"Q30.9531 -1.42188 24.3125 -1.42188\n",
"Q15.9219 -1.42188 10.9531 3.29688\n",
"Q6 8.01562 6 15.9219\n",
"Q6 25.1406 12.1719 29.8281\n",
"Q18.3594 34.5156 30.6094 34.5156\n",
"L43.2188 34.5156\n",
"L43.2188 35.4062\n",
"Q43.2188 41.6094 39.1406 45\n",
"Q35.0625 48.3906 27.6875 48.3906\n",
"Q23 48.3906 18.5469 47.2656\n",
"Q14.1094 46.1406 10.0156 43.8906\n",
"L10.0156 52.2031\n",
"Q14.9375 54.1094 19.5781 55.0469\n",
"Q24.2188 56 28.6094 56\n",
"Q40.4844 56 46.3438 49.8438\n",
"Q52.2031 43.7031 52.2031 31.2031\" id=\"BitstreamVeraSans-Roman-61\"/>\n",
" <path id=\"BitstreamVeraSans-Roman-20\"/>\n",
" <path d=\"\n",
"M56.2031 29.5938\n",
"L56.2031 25.2031\n",
"L14.8906 25.2031\n",
"Q15.4844 15.9219 20.4844 11.0625\n",
"Q25.4844 6.20312 34.4219 6.20312\n",
"Q39.5938 6.20312 44.4531 7.46875\n",
"Q49.3125 8.73438 54.1094 11.2812\n",
"L54.1094 2.78125\n",
"Q49.2656 0.734375 44.1875 -0.34375\n",
"Q39.1094 -1.42188 33.8906 -1.42188\n",
"Q20.7969 -1.42188 13.1562 6.1875\n",
"Q5.51562 13.8125 5.51562 26.8125\n",
"Q5.51562 40.2344 12.7656 48.1094\n",
"Q20.0156 56 32.3281 56\n",
"Q43.3594 56 49.7812 48.8906\n",
"Q56.2031 41.7969 56.2031 29.5938\n",
"M47.2188 32.2344\n",
"Q47.125 39.5938 43.0938 43.9844\n",
"Q39.0625 48.3906 32.4219 48.3906\n",
"Q24.9062 48.3906 20.3906 44.1406\n",
"Q15.875 39.8906 15.1875 32.1719\n",
"z\n",
"\" id=\"BitstreamVeraSans-Roman-65\"/>\n",
" <path d=\"\n",
"M2.98438 54.6875\n",
"L12.5 54.6875\n",
"L29.5938 8.79688\n",
"L46.6875 54.6875\n",
"L56.2031 54.6875\n",
"L35.6875 0\n",
"L23.4844 0\n",
"z\n",
"\" id=\"BitstreamVeraSans-Roman-76\"/>\n",
" <path d=\"\n",
"M18.1094 8.20312\n",
"L18.1094 -20.7969\n",
"L9.07812 -20.7969\n",
"L9.07812 54.6875\n",
"L18.1094 54.6875\n",
"L18.1094 46.3906\n",
"Q20.9531 51.2656 25.2656 53.625\n",
"Q29.5938 56 35.5938 56\n",
"Q45.5625 56 51.7812 48.0938\n",
"Q58.0156 40.1875 58.0156 27.2969\n",
"Q58.0156 14.4062 51.7812 6.48438\n",
"Q45.5625 -1.42188 35.5938 -1.42188\n",
"Q29.5938 -1.42188 25.2656 0.953125\n",
"Q20.9531 3.32812 18.1094 8.20312\n",
"M48.6875 27.2969\n",
"Q48.6875 37.2031 44.6094 42.8438\n",
"Q40.5312 48.4844 33.4062 48.4844\n",
"Q26.2656 48.4844 22.1875 42.8438\n",
"Q18.1094 37.2031 18.1094 27.2969\n",
"Q18.1094 17.3906 22.1875 11.75\n",
"Q26.2656 6.10938 33.4062 6.10938\n",
"Q40.5312 6.10938 44.6094 11.75\n",
"Q48.6875 17.3906 48.6875 27.2969\" id=\"BitstreamVeraSans-Roman-70\"/>\n",
" <path d=\"\n",
"M37.1094 75.9844\n",
"L37.1094 68.5\n",
"L28.5156 68.5\n",
"Q23.6875 68.5 21.7969 66.5469\n",
"Q19.9219 64.5938 19.9219 59.5156\n",
"L19.9219 54.6875\n",
"L34.7188 54.6875\n",
"L34.7188 47.7031\n",
"L19.9219 47.7031\n",
"L19.9219 0\n",
"L10.8906 0\n",
"L10.8906 47.7031\n",
"L2.29688 47.7031\n",
"L2.29688 54.6875\n",
"L10.8906 54.6875\n",
"L10.8906 58.5\n",
"Q10.8906 67.625 15.1406 71.7969\n",
"Q19.3906 75.9844 28.6094 75.9844\n",
"z\n",
"\" id=\"BitstreamVeraSans-Roman-66\"/>\n",
" <path d=\"\n",
"M18.3125 70.2188\n",
"L18.3125 54.6875\n",
"L36.8125 54.6875\n",
"L36.8125 47.7031\n",
"L18.3125 47.7031\n",
"L18.3125 18.0156\n",
"Q18.3125 11.3281 20.1406 9.42188\n",
"Q21.9688 7.51562 27.5938 7.51562\n",
"L36.8125 7.51562\n",
"L36.8125 0\n",
"L27.5938 0\n",
"Q17.1875 0 13.2344 3.875\n",
"Q9.28125 7.76562 9.28125 18.0156\n",
"L9.28125 47.7031\n",
"L2.6875 47.7031\n",
"L2.6875 54.6875\n",
"L9.28125 54.6875\n",
"L9.28125 70.2188\n",
"z\n",
"\" id=\"BitstreamVeraSans-Roman-74\"/>\n",
" <path d=\"\n",
"M41.1094 46.2969\n",
"Q39.5938 47.1719 37.8125 47.5781\n",
"Q36.0312 48 33.8906 48\n",
"Q26.2656 48 22.1875 43.0469\n",
"Q18.1094 38.0938 18.1094 28.8125\n",
"L18.1094 0\n",
"L9.07812 0\n",
"L9.07812 54.6875\n",
"L18.1094 54.6875\n",
"L18.1094 46.1875\n",
"Q20.9531 51.1719 25.4844 53.5781\n",
"Q30.0312 56 36.5312 56\n",
"Q37.4531 56 38.5781 55.875\n",
"Q39.7031 55.7656 41.0625 55.5156\n",
"z\n",
"\" id=\"BitstreamVeraSans-Roman-72\"/>\n",
" <path d=\"\n",
"M30.6094 48.3906\n",
"Q23.3906 48.3906 19.1875 42.75\n",
"Q14.9844 37.1094 14.9844 27.2969\n",
"Q14.9844 17.4844 19.1562 11.8438\n",
"Q23.3438 6.20312 30.6094 6.20312\n",
"Q37.7969 6.20312 41.9844 11.8594\n",
"Q46.1875 17.5312 46.1875 27.2969\n",
"Q46.1875 37.0156 41.9844 42.7031\n",
"Q37.7969 48.3906 30.6094 48.3906\n",
"M30.6094 56\n",
"Q42.3281 56 49.0156 48.375\n",
"Q55.7188 40.7656 55.7188 27.2969\n",
"Q55.7188 13.875 49.0156 6.21875\n",
"Q42.3281 -1.42188 30.6094 -1.42188\n",
"Q18.8438 -1.42188 12.1719 6.21875\n",
"Q5.51562 13.875 5.51562 27.2969\n",
"Q5.51562 40.7656 12.1719 48.375\n",
"Q18.8438 56 30.6094 56\" id=\"BitstreamVeraSans-Roman-6f\"/>\n",
" <path d=\"\n",
"M9.42188 54.6875\n",
"L18.4062 54.6875\n",
"L18.4062 0\n",
"L9.42188 0\n",
"z\n",
"\n",
"M9.42188 75.9844\n",
"L18.4062 75.9844\n",
"L18.4062 64.5938\n",
"L9.42188 64.5938\n",
"z\n",
"\" id=\"BitstreamVeraSans-Roman-69\"/>\n",
" <path d=\"\n",
"M44.2812 53.0781\n",
"L44.2812 44.5781\n",
"Q40.4844 46.5312 36.375 47.5\n",
"Q32.2812 48.4844 27.875 48.4844\n",
"Q21.1875 48.4844 17.8438 46.4375\n",
"Q14.5 44.3906 14.5 40.2812\n",
"Q14.5 37.1562 16.8906 35.375\n",
"Q19.2812 33.5938 26.5156 31.9844\n",
"L29.5938 31.2969\n",
"Q39.1562 29.25 43.1875 25.5156\n",
"Q47.2188 21.7812 47.2188 15.0938\n",
"Q47.2188 7.46875 41.1875 3.01562\n",
"Q35.1562 -1.42188 24.6094 -1.42188\n",
"Q20.2188 -1.42188 15.4531 -0.5625\n",
"Q10.6875 0.296875 5.42188 2\n",
"L5.42188 11.2812\n",
"Q10.4062 8.6875 15.2344 7.39062\n",
"Q20.0625 6.10938 24.8125 6.10938\n",
"Q31.1562 6.10938 34.5625 8.28125\n",
"Q37.9844 10.4531 37.9844 14.4062\n",
"Q37.9844 18.0625 35.5156 20.0156\n",
"Q33.0625 21.9688 24.7031 23.7812\n",
"L21.5781 24.5156\n",
"Q13.2344 26.2656 9.51562 29.9062\n",
"Q5.8125 33.5469 5.8125 39.8906\n",
"Q5.8125 47.6094 11.2812 51.7969\n",
"Q16.75 56 26.8125 56\n",
"Q31.7812 56 36.1719 55.2656\n",
"Q40.5781 54.5469 44.2812 53.0781\" id=\"BitstreamVeraSans-Roman-73\"/>\n",
" </defs>\n",
" <g transform=\"translate(219.1859375 373.115625)scale(0.1 -0.1)\">\n",
" <use xlink:href=\"#BitstreamVeraSans-Roman-66\"/>\n",
" <use x=\"35.205078125\" xlink:href=\"#BitstreamVeraSans-Roman-61\"/>\n",
" <use x=\"96.484375\" xlink:href=\"#BitstreamVeraSans-Roman-6c\"/>\n",
" <use x=\"124.267578125\" xlink:href=\"#BitstreamVeraSans-Roman-73\"/>\n",
" <use x=\"176.3671875\" xlink:href=\"#BitstreamVeraSans-Roman-65\"/>\n",
" <use x=\"237.890625\" xlink:href=\"#BitstreamVeraSans-Roman-20\"/>\n",
" <use x=\"269.677734375\" xlink:href=\"#BitstreamVeraSans-Roman-70\"/>\n",
" <use x=\"333.154296875\" xlink:href=\"#BitstreamVeraSans-Roman-6f\"/>\n",
" <use x=\"394.3359375\" xlink:href=\"#BitstreamVeraSans-Roman-73\"/>\n",
" <use x=\"446.435546875\" xlink:href=\"#BitstreamVeraSans-Roman-69\"/>\n",
" <use x=\"474.21875\" xlink:href=\"#BitstreamVeraSans-Roman-74\"/>\n",
" <use x=\"513.427734375\" xlink:href=\"#BitstreamVeraSans-Roman-69\"/>\n",
" <use x=\"541.2109375\" xlink:href=\"#BitstreamVeraSans-Roman-76\"/>\n",
" <use x=\"600.390625\" xlink:href=\"#BitstreamVeraSans-Roman-65\"/>\n",
" <use x=\"661.9140625\" xlink:href=\"#BitstreamVeraSans-Roman-20\"/>\n",
" <use x=\"693.701171875\" xlink:href=\"#BitstreamVeraSans-Roman-72\"/>\n",
" <use x=\"734.814453125\" xlink:href=\"#BitstreamVeraSans-Roman-61\"/>\n",
" <use x=\"796.09375\" xlink:href=\"#BitstreamVeraSans-Roman-74\"/>\n",
" <use x=\"835.302734375\" xlink:href=\"#BitstreamVeraSans-Roman-65\"/>\n",
" </g>\n",
" </g>\n",
" </g>\n",
" <g id=\"matplotlib.axis_2\">\n",
" <g id=\"ytick_1\">\n",
" <g id=\"line2d_22\">\n",
" <path clip-path=\"url(#pa76d04f499)\" d=\"\n",
"M40.5609 346.839\n",
"L486.961 346.839\" style=\"fill:none;stroke:#000000;stroke-dasharray:1.000000,3.000000;stroke-dashoffset:0.0;stroke-width:0.5;\"/>\n",
" </g>\n",
" <g id=\"line2d_23\">\n",
" <defs>\n",
" <path d=\"\n",
"M0 0\n",
"L4 0\" id=\"md7965d1ba0\" style=\"stroke:#000000;stroke-width:0.5;\"/>\n",
" </defs>\n",
" <g>\n",
" <use style=\"stroke:#000000;stroke-width:0.5;\" x=\"40.5609375\" xlink:href=\"#md7965d1ba0\" y=\"346.8390625\"/>\n",
" </g>\n",
" </g>\n",
" <g id=\"line2d_24\">\n",
" <defs>\n",
" <path d=\"\n",
"M0 0\n",
"L-4 0\" id=\"md9a1c1a7cd\" style=\"stroke:#000000;stroke-width:0.5;\"/>\n",
" </defs>\n",
" <g>\n",
" <use style=\"stroke:#000000;stroke-width:0.5;\" x=\"486.9609375\" xlink:href=\"#md9a1c1a7cd\" y=\"346.8390625\"/>\n",
" </g>\n",
" </g>\n",
" <g id=\"text_8\">\n",
" <!-- 0.0 -->\n",
" <g transform=\"translate(21.98125 349.5984375)scale(0.1 -0.1)\">\n",
" <use xlink:href=\"#BitstreamVeraSans-Roman-30\"/>\n",
" <use x=\"63.623046875\" xlink:href=\"#BitstreamVeraSans-Roman-2e\"/>\n",
" <use x=\"95.41015625\" xlink:href=\"#BitstreamVeraSans-Roman-30\"/>\n",
" </g>\n",
" </g>\n",
" </g>\n",
" <g id=\"ytick_2\">\n",
" <g id=\"line2d_25\">\n",
" <path clip-path=\"url(#pa76d04f499)\" d=\"\n",
"M40.5609 279.879\n",
"L486.961 279.879\" style=\"fill:none;stroke:#000000;stroke-dasharray:1.000000,3.000000;stroke-dashoffset:0.0;stroke-width:0.5;\"/>\n",
" </g>\n",
" <g id=\"line2d_26\">\n",
" <g>\n",
" <use style=\"stroke:#000000;stroke-width:0.5;\" x=\"40.5609375\" xlink:href=\"#md7965d1ba0\" y=\"279.8790625\"/>\n",
" </g>\n",
" </g>\n",
" <g id=\"line2d_27\">\n",
" <g>\n",
" <use style=\"stroke:#000000;stroke-width:0.5;\" x=\"486.9609375\" xlink:href=\"#md9a1c1a7cd\" y=\"279.8790625\"/>\n",
" </g>\n",
" </g>\n",
" <g id=\"text_9\">\n",
" <!-- 0.2 -->\n",
" <g transform=\"translate(22.31875 282.6384375)scale(0.1 -0.1)\">\n",
" <use xlink:href=\"#BitstreamVeraSans-Roman-30\"/>\n",
" <use x=\"63.623046875\" xlink:href=\"#BitstreamVeraSans-Roman-2e\"/>\n",
" <use x=\"95.41015625\" xlink:href=\"#BitstreamVeraSans-Roman-32\"/>\n",
" </g>\n",
" </g>\n",
" </g>\n",
" <g id=\"ytick_3\">\n",
" <g id=\"line2d_28\">\n",
" <path clip-path=\"url(#pa76d04f499)\" d=\"\n",
"M40.5609 212.919\n",
"L486.961 212.919\" style=\"fill:none;stroke:#000000;stroke-dasharray:1.000000,3.000000;stroke-dashoffset:0.0;stroke-width:0.5;\"/>\n",
" </g>\n",
" <g id=\"line2d_29\">\n",
" <g>\n",
" <use style=\"stroke:#000000;stroke-width:0.5;\" x=\"40.5609375\" xlink:href=\"#md7965d1ba0\" y=\"212.9190625\"/>\n",
" </g>\n",
" </g>\n",
" <g id=\"line2d_30\">\n",
" <g>\n",
" <use style=\"stroke:#000000;stroke-width:0.5;\" x=\"486.9609375\" xlink:href=\"#md9a1c1a7cd\" y=\"212.9190625\"/>\n",
" </g>\n",
" </g>\n",
" <g id=\"text_10\">\n",
" <!-- 0.4 -->\n",
" <g transform=\"translate(21.878125 215.6784375)scale(0.1 -0.1)\">\n",
" <use xlink:href=\"#BitstreamVeraSans-Roman-30\"/>\n",
" <use x=\"63.623046875\" xlink:href=\"#BitstreamVeraSans-Roman-2e\"/>\n",
" <use x=\"95.41015625\" xlink:href=\"#BitstreamVeraSans-Roman-34\"/>\n",
" </g>\n",
" </g>\n",
" </g>\n",
" <g id=\"ytick_4\">\n",
" <g id=\"line2d_31\">\n",
" <path clip-path=\"url(#pa76d04f499)\" d=\"\n",
"M40.5609 145.959\n",
"L486.961 145.959\" style=\"fill:none;stroke:#000000;stroke-dasharray:1.000000,3.000000;stroke-dashoffset:0.0;stroke-width:0.5;\"/>\n",
" </g>\n",
" <g id=\"line2d_32\">\n",
" <g>\n",
" <use style=\"stroke:#000000;stroke-width:0.5;\" x=\"40.5609375\" xlink:href=\"#md7965d1ba0\" y=\"145.9590625\"/>\n",
" </g>\n",
" </g>\n",
" <g id=\"line2d_33\">\n",
" <g>\n",
" <use style=\"stroke:#000000;stroke-width:0.5;\" x=\"486.9609375\" xlink:href=\"#md9a1c1a7cd\" y=\"145.9590625\"/>\n",
" </g>\n",
" </g>\n",
" <g id=\"text_11\">\n",
" <!-- 0.6 -->\n",
" <g transform=\"translate(21.946875 148.7184375)scale(0.1 -0.1)\">\n",
" <use xlink:href=\"#BitstreamVeraSans-Roman-30\"/>\n",
" <use x=\"63.623046875\" xlink:href=\"#BitstreamVeraSans-Roman-2e\"/>\n",
" <use x=\"95.41015625\" xlink:href=\"#BitstreamVeraSans-Roman-36\"/>\n",
" </g>\n",
" </g>\n",
" </g>\n",
" <g id=\"ytick_5\">\n",
" <g id=\"line2d_34\">\n",
" <path clip-path=\"url(#pa76d04f499)\" d=\"\n",
"M40.5609 78.9991\n",
"L486.961 78.9991\" style=\"fill:none;stroke:#000000;stroke-dasharray:1.000000,3.000000;stroke-dashoffset:0.0;stroke-width:0.5;\"/>\n",
" </g>\n",
" <g id=\"line2d_35\">\n",
" <g>\n",
" <use style=\"stroke:#000000;stroke-width:0.5;\" x=\"40.5609375\" xlink:href=\"#md7965d1ba0\" y=\"78.9990625\"/>\n",
" </g>\n",
" </g>\n",
" <g id=\"line2d_36\">\n",
" <g>\n",
" <use style=\"stroke:#000000;stroke-width:0.5;\" x=\"486.9609375\" xlink:href=\"#md9a1c1a7cd\" y=\"78.9990625\"/>\n",
" </g>\n",
" </g>\n",
" <g id=\"text_12\">\n",
" <!-- 0.8 -->\n",
" <g transform=\"translate(22.0015625 81.7584375)scale(0.1 -0.1)\">\n",
" <use xlink:href=\"#BitstreamVeraSans-Roman-30\"/>\n",
" <use x=\"63.623046875\" xlink:href=\"#BitstreamVeraSans-Roman-2e\"/>\n",
" <use x=\"95.41015625\" xlink:href=\"#BitstreamVeraSans-Roman-38\"/>\n",
" </g>\n",
" </g>\n",
" </g>\n",
" <g id=\"ytick_6\">\n",
" <g id=\"line2d_37\">\n",
" <path clip-path=\"url(#pa76d04f499)\" d=\"\n",
"M40.5609 12.0391\n",
"L486.961 12.0391\" style=\"fill:none;stroke:#000000;stroke-dasharray:1.000000,3.000000;stroke-dashoffset:0.0;stroke-width:0.5;\"/>\n",
" </g>\n",
" <g id=\"line2d_38\">\n",
" <g>\n",
" <use style=\"stroke:#000000;stroke-width:0.5;\" x=\"40.5609375\" xlink:href=\"#md7965d1ba0\" y=\"12.0390625\"/>\n",
" </g>\n",
" </g>\n",
" <g id=\"line2d_39\">\n",
" <g>\n",
" <use style=\"stroke:#000000;stroke-width:0.5;\" x=\"486.9609375\" xlink:href=\"#md9a1c1a7cd\" y=\"12.0390625\"/>\n",
" </g>\n",
" </g>\n",
" <g id=\"text_13\">\n",
" <!-- 1.0 -->\n",
" <g transform=\"translate(22.4203125 14.7984375)scale(0.1 -0.1)\">\n",
" <use xlink:href=\"#BitstreamVeraSans-Roman-31\"/>\n",
" <use x=\"63.623046875\" xlink:href=\"#BitstreamVeraSans-Roman-2e\"/>\n",
" <use x=\"95.41015625\" xlink:href=\"#BitstreamVeraSans-Roman-30\"/>\n",
" </g>\n",
" </g>\n",
" </g>\n",
" <g id=\"text_14\">\n",
" <!-- true positive rate -->\n",
" <defs>\n",
" <path d=\"\n",
"M8.5 21.5781\n",
"L8.5 54.6875\n",
"L17.4844 54.6875\n",
"L17.4844 21.9219\n",
"Q17.4844 14.1562 20.5 10.2656\n",
"Q23.5312 6.39062 29.5938 6.39062\n",
"Q36.8594 6.39062 41.0781 11.0312\n",
"Q45.3125 15.6719 45.3125 23.6875\n",
"L45.3125 54.6875\n",
"L54.2969 54.6875\n",
"L54.2969 0\n",
"L45.3125 0\n",
"L45.3125 8.40625\n",
"Q42.0469 3.42188 37.7188 1\n",
"Q33.4062 -1.42188 27.6875 -1.42188\n",
"Q18.2656 -1.42188 13.375 4.4375\n",
"Q8.5 10.2969 8.5 21.5781\" id=\"BitstreamVeraSans-Roman-75\"/>\n",
" </defs>\n",
" <g transform=\"translate(14.7984375 222.38046875)rotate(-90.0)scale(0.1 -0.1)\">\n",
" <use xlink:href=\"#BitstreamVeraSans-Roman-74\"/>\n",
" <use x=\"39.208984375\" xlink:href=\"#BitstreamVeraSans-Roman-72\"/>\n",
" <use x=\"80.322265625\" xlink:href=\"#BitstreamVeraSans-Roman-75\"/>\n",
" <use x=\"143.701171875\" xlink:href=\"#BitstreamVeraSans-Roman-65\"/>\n",
" <use x=\"205.224609375\" xlink:href=\"#BitstreamVeraSans-Roman-20\"/>\n",
" <use x=\"237.01171875\" xlink:href=\"#BitstreamVeraSans-Roman-70\"/>\n",
" <use x=\"300.48828125\" xlink:href=\"#BitstreamVeraSans-Roman-6f\"/>\n",
" <use x=\"361.669921875\" xlink:href=\"#BitstreamVeraSans-Roman-73\"/>\n",
" <use x=\"413.76953125\" xlink:href=\"#BitstreamVeraSans-Roman-69\"/>\n",
" <use x=\"441.552734375\" xlink:href=\"#BitstreamVeraSans-Roman-74\"/>\n",
" <use x=\"480.76171875\" xlink:href=\"#BitstreamVeraSans-Roman-69\"/>\n",
" <use x=\"508.544921875\" xlink:href=\"#BitstreamVeraSans-Roman-76\"/>\n",
" <use x=\"567.724609375\" xlink:href=\"#BitstreamVeraSans-Roman-65\"/>\n",
" <use x=\"629.248046875\" xlink:href=\"#BitstreamVeraSans-Roman-20\"/>\n",
" <use x=\"661.03515625\" xlink:href=\"#BitstreamVeraSans-Roman-72\"/>\n",
" <use x=\"702.1484375\" xlink:href=\"#BitstreamVeraSans-Roman-61\"/>\n",
" <use x=\"763.427734375\" xlink:href=\"#BitstreamVeraSans-Roman-74\"/>\n",
" <use x=\"802.63671875\" xlink:href=\"#BitstreamVeraSans-Roman-65\"/>\n",
" </g>\n",
" </g>\n",
" </g>\n",
" <g id=\"patch_3\">\n",
" <path d=\"\n",
"M40.5609 346.839\n",
"L40.5609 12.0391\" style=\"fill:none;stroke:#000000;\"/>\n",
" </g>\n",
" <g id=\"patch_4\">\n",
" <path d=\"\n",
"M486.961 346.839\n",
"L486.961 12.0391\" style=\"fill:none;stroke:#000000;\"/>\n",
" </g>\n",
" <g id=\"patch_5\">\n",
" <path d=\"\n",
"M40.5609 346.839\n",
"L486.961 346.839\" style=\"fill:none;stroke:#000000;\"/>\n",
" </g>\n",
" <g id=\"patch_6\">\n",
" <path d=\"\n",
"M40.5609 12.0391\n",
"L486.961 12.0391\" style=\"fill:none;stroke:#000000;\"/>\n",
" </g>\n",
" </g>\n",
" <g id=\"text_15\">\n",
" <!-- class I vs II frequencies across the Term Vector -->\n",
" <defs>\n",
" <path d=\"\n",
"M54.8906 33.0156\n",
"L54.8906 0\n",
"L45.9062 0\n",
"L45.9062 32.7188\n",
"Q45.9062 40.4844 42.875 44.3281\n",
"Q39.8438 48.1875 33.7969 48.1875\n",
"Q26.5156 48.1875 22.3125 43.5469\n",
"Q18.1094 38.9219 18.1094 30.9062\n",
"L18.1094 0\n",
"L9.07812 0\n",
"L9.07812 54.6875\n",
"L18.1094 54.6875\n",
"L18.1094 46.1875\n",
"Q21.3438 51.125 25.7031 53.5625\n",
"Q30.0781 56 35.7969 56\n",
"Q45.2188 56 50.0469 50.1719\n",
"Q54.8906 44.3438 54.8906 33.0156\" id=\"BitstreamVeraSans-Roman-6e\"/>\n",
" <path d=\"\n",
"M28.6094 0\n",
"L0.78125 72.9062\n",
"L11.0781 72.9062\n",
"L34.1875 11.5312\n",
"L57.3281 72.9062\n",
"L67.5781 72.9062\n",
"L39.7969 0\n",
"z\n",
"\" id=\"BitstreamVeraSans-Roman-56\"/>\n",
" <path d=\"\n",
"M14.7969 27.2969\n",
"Q14.7969 17.3906 18.875 11.75\n",
"Q22.9531 6.10938 30.0781 6.10938\n",
"Q37.2031 6.10938 41.2969 11.75\n",
"Q45.4062 17.3906 45.4062 27.2969\n",
"Q45.4062 37.2031 41.2969 42.8438\n",
"Q37.2031 48.4844 30.0781 48.4844\n",
"Q22.9531 48.4844 18.875 42.8438\n",
"Q14.7969 37.2031 14.7969 27.2969\n",
"M45.4062 8.20312\n",
"Q42.5781 3.32812 38.25 0.953125\n",
"Q33.9375 -1.42188 27.875 -1.42188\n",
"Q17.9688 -1.42188 11.7344 6.48438\n",
"Q5.51562 14.4062 5.51562 27.2969\n",
"Q5.51562 40.1875 11.7344 48.0938\n",
"Q17.9688 56 27.875 56\n",
"Q33.9375 56 38.25 53.625\n",
"Q42.5781 51.2656 45.4062 46.3906\n",
"L45.4062 54.6875\n",
"L54.3906 54.6875\n",
"L54.3906 -20.7969\n",
"L45.4062 -20.7969\n",
"z\n",
"\" id=\"BitstreamVeraSans-Roman-71\"/>\n",
" <path d=\"\n",
"M52 44.1875\n",
"Q55.375 50.25 60.0625 53.125\n",
"Q64.75 56 71.0938 56\n",
"Q79.6406 56 84.2812 50.0156\n",
"Q88.9219 44.0469 88.9219 33.0156\n",
"L88.9219 0\n",
"L79.8906 0\n",
"L79.8906 32.7188\n",
"Q79.8906 40.5781 77.0938 44.375\n",
"Q74.3125 48.1875 68.6094 48.1875\n",
"Q61.625 48.1875 57.5625 43.5469\n",
"Q53.5156 38.9219 53.5156 30.9062\n",
"L53.5156 0\n",
"L44.4844 0\n",
"L44.4844 32.7188\n",
"Q44.4844 40.625 41.7031 44.4062\n",
"Q38.9219 48.1875 33.1094 48.1875\n",
"Q26.2188 48.1875 22.1562 43.5312\n",
"Q18.1094 38.875 18.1094 30.9062\n",
"L18.1094 0\n",
"L9.07812 0\n",
"L9.07812 54.6875\n",
"L18.1094 54.6875\n",
"L18.1094 46.1875\n",
"Q21.1875 51.2188 25.4844 53.6094\n",
"Q29.7812 56 35.6875 56\n",
"Q41.6562 56 45.8281 52.9688\n",
"Q50 49.9531 52 44.1875\" id=\"BitstreamVeraSans-Roman-6d\"/>\n",
" <path d=\"\n",
"M9.8125 72.9062\n",
"L19.6719 72.9062\n",
"L19.6719 0\n",
"L9.8125 0\n",
"z\n",
"\" id=\"BitstreamVeraSans-Roman-49\"/>\n",
" <path d=\"\n",
"M48.7812 52.5938\n",
"L48.7812 44.1875\n",
"Q44.9688 46.2969 41.1406 47.3438\n",
"Q37.3125 48.3906 33.4062 48.3906\n",
"Q24.6562 48.3906 19.8125 42.8438\n",
"Q14.9844 37.3125 14.9844 27.2969\n",
"Q14.9844 17.2812 19.8125 11.7344\n",
"Q24.6562 6.20312 33.4062 6.20312\n",
"Q37.3125 6.20312 41.1406 7.25\n",
"Q44.9688 8.29688 48.7812 10.4062\n",
"L48.7812 2.09375\n",
"Q45.0156 0.34375 40.9844 -0.53125\n",
"Q36.9688 -1.42188 32.4219 -1.42188\n",
"Q20.0625 -1.42188 12.7812 6.34375\n",
"Q5.51562 14.1094 5.51562 27.2969\n",
"Q5.51562 40.6719 12.8594 48.3281\n",
"Q20.2188 56 33.0156 56\n",
"Q37.1562 56 41.1094 55.1406\n",
"Q45.0625 54.2969 48.7812 52.5938\" id=\"BitstreamVeraSans-Roman-63\"/>\n",
" <path d=\"\n",
"M-0.296875 72.9062\n",
"L61.375 72.9062\n",
"L61.375 64.5938\n",
"L35.5 64.5938\n",
"L35.5 0\n",
"L25.5938 0\n",
"L25.5938 64.5938\n",
"L-0.296875 64.5938\n",
"z\n",
"\" id=\"BitstreamVeraSans-Roman-54\"/>\n",
" <path d=\"\n",
"M54.8906 33.0156\n",
"L54.8906 0\n",
"L45.9062 0\n",
"L45.9062 32.7188\n",
"Q45.9062 40.4844 42.875 44.3281\n",
"Q39.8438 48.1875 33.7969 48.1875\n",
"Q26.5156 48.1875 22.3125 43.5469\n",
"Q18.1094 38.9219 18.1094 30.9062\n",
"L18.1094 0\n",
"L9.07812 0\n",
"L9.07812 75.9844\n",
"L18.1094 75.9844\n",
"L18.1094 46.1875\n",
"Q21.3438 51.125 25.7031 53.5625\n",
"Q30.0781 56 35.7969 56\n",
"Q45.2188 56 50.0469 50.1719\n",
"Q54.8906 44.3438 54.8906 33.0156\" id=\"BitstreamVeraSans-Roman-68\"/>\n",
" </defs>\n",
" <g style=\"fill:#062a78;\" transform=\"translate(116.660625 29.7971875)scale(0.12 -0.12)\">\n",
" <use xlink:href=\"#BitstreamVeraSans-Roman-63\"/>\n",
" <use x=\"54.98046875\" xlink:href=\"#BitstreamVeraSans-Roman-6c\"/>\n",
" <use x=\"82.763671875\" xlink:href=\"#BitstreamVeraSans-Roman-61\"/>\n",
" <use x=\"144.04296875\" xlink:href=\"#BitstreamVeraSans-Roman-73\"/>\n",
" <use x=\"196.142578125\" xlink:href=\"#BitstreamVeraSans-Roman-73\"/>\n",
" <use x=\"248.2421875\" xlink:href=\"#BitstreamVeraSans-Roman-20\"/>\n",
" <use x=\"280.029296875\" xlink:href=\"#BitstreamVeraSans-Roman-49\"/>\n",
" <use x=\"309.521484375\" xlink:href=\"#BitstreamVeraSans-Roman-20\"/>\n",
" <use x=\"341.30859375\" xlink:href=\"#BitstreamVeraSans-Roman-76\"/>\n",
" <use x=\"400.48828125\" xlink:href=\"#BitstreamVeraSans-Roman-73\"/>\n",
" <use x=\"452.587890625\" xlink:href=\"#BitstreamVeraSans-Roman-20\"/>\n",
" <use x=\"484.375\" xlink:href=\"#BitstreamVeraSans-Roman-49\"/>\n",
" <use x=\"513.8671875\" xlink:href=\"#BitstreamVeraSans-Roman-49\"/>\n",
" <use x=\"543.359375\" xlink:href=\"#BitstreamVeraSans-Roman-20\"/>\n",
" <use x=\"575.146484375\" xlink:href=\"#BitstreamVeraSans-Roman-66\"/>\n",
" <use x=\"610.3515625\" xlink:href=\"#BitstreamVeraSans-Roman-72\"/>\n",
" <use x=\"649.21484375\" xlink:href=\"#BitstreamVeraSans-Roman-65\"/>\n",
" <use x=\"710.73828125\" xlink:href=\"#BitstreamVeraSans-Roman-71\"/>\n",
" <use x=\"774.21484375\" xlink:href=\"#BitstreamVeraSans-Roman-75\"/>\n",
" <use x=\"837.59375\" xlink:href=\"#BitstreamVeraSans-Roman-65\"/>\n",
" <use x=\"899.1171875\" xlink:href=\"#BitstreamVeraSans-Roman-6e\"/>\n",
" <use x=\"962.49609375\" xlink:href=\"#BitstreamVeraSans-Roman-63\"/>\n",
" <use x=\"1017.4765625\" xlink:href=\"#BitstreamVeraSans-Roman-69\"/>\n",
" <use x=\"1045.259765625\" xlink:href=\"#BitstreamVeraSans-Roman-65\"/>\n",
" <use x=\"1106.783203125\" xlink:href=\"#BitstreamVeraSans-Roman-73\"/>\n",
" <use x=\"1158.8828125\" xlink:href=\"#BitstreamVeraSans-Roman-20\"/>\n",
" <use x=\"1190.669921875\" xlink:href=\"#BitstreamVeraSans-Roman-61\"/>\n",
" <use x=\"1251.94921875\" xlink:href=\"#BitstreamVeraSans-Roman-63\"/>\n",
" <use x=\"1306.9296875\" xlink:href=\"#BitstreamVeraSans-Roman-72\"/>\n",
" <use x=\"1345.79296875\" xlink:href=\"#BitstreamVeraSans-Roman-6f\"/>\n",
" <use x=\"1406.974609375\" xlink:href=\"#BitstreamVeraSans-Roman-73\"/>\n",
" <use x=\"1459.07421875\" xlink:href=\"#BitstreamVeraSans-Roman-73\"/>\n",
" <use x=\"1511.173828125\" xlink:href=\"#BitstreamVeraSans-Roman-20\"/>\n",
" <use x=\"1542.9609375\" xlink:href=\"#BitstreamVeraSans-Roman-74\"/>\n",
" <use x=\"1582.169921875\" xlink:href=\"#BitstreamVeraSans-Roman-68\"/>\n",
" <use x=\"1645.548828125\" xlink:href=\"#BitstreamVeraSans-Roman-65\"/>\n",
" <use x=\"1707.072265625\" xlink:href=\"#BitstreamVeraSans-Roman-20\"/>\n",
" <use x=\"1738.859375\" xlink:href=\"#BitstreamVeraSans-Roman-54\"/>\n",
" <use x=\"1782.943359375\" xlink:href=\"#BitstreamVeraSans-Roman-65\"/>\n",
" <use x=\"1844.466796875\" xlink:href=\"#BitstreamVeraSans-Roman-72\"/>\n",
" <use x=\"1883.830078125\" xlink:href=\"#BitstreamVeraSans-Roman-6d\"/>\n",
" <use x=\"1981.2421875\" xlink:href=\"#BitstreamVeraSans-Roman-20\"/>\n",
" <use x=\"2013.029296875\" xlink:href=\"#BitstreamVeraSans-Roman-56\"/>\n",
" <use x=\"2073.6875\" xlink:href=\"#BitstreamVeraSans-Roman-65\"/>\n",
" <use x=\"2135.2109375\" xlink:href=\"#BitstreamVeraSans-Roman-63\"/>\n",
" <use x=\"2190.19140625\" xlink:href=\"#BitstreamVeraSans-Roman-74\"/>\n",
" <use x=\"2229.400390625\" xlink:href=\"#BitstreamVeraSans-Roman-6f\"/>\n",
" <use x=\"2290.58203125\" xlink:href=\"#BitstreamVeraSans-Roman-72\"/>\n",
" </g>\n",
" </g>\n",
" </g>\n",
" <defs>\n",
" <clipPath id=\"pa76d04f499\">\n",
" <rect height=\"334.8\" width=\"446.4\" x=\"40.5609375\" y=\"12.0390625\"/>\n",
" </clipPath>\n",
" </defs>\n",
"</svg>\n"
],
"text": [
"<matplotlib.figure.Figure at 0x1073bcf98>"
]
}
],
"prompt_number": 170
},
{
"cell_type": "heading",
"level": 4,
"metadata": {},
"source": [
"Confusion Matrix"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# confusion matrix for svm:\n",
"create_confmat(svc_pred, te[1])"
],
"language": "python",
"metadata": {},
"outputs": [
{
"latex": [
"$$\\left[\\begin{matrix}125.0 & 31.0\\\\6.0 & 108.0\\end{matrix}\\right]$$"
],
"metadata": {},
"output_type": "pyout",
"prompt_number": 171,
"text": [
"\u23a1125.0 31.0 \u23a4\n",
"\u23a2 \u23a5\n",
"\u23a3 6.0 108.0\u23a6"
]
}
],
"prompt_number": 171
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# confusion matrix for logistic regression:\n",
"create_confmat(lr_pred, te[1])"
],
"language": "python",
"metadata": {},
"outputs": [
{
"latex": [
"$$\\left[\\begin{matrix}123.0 & 31.0\\\\8.0 & 108.0\\end{matrix}\\right]$$"
],
"metadata": {},
"output_type": "pyout",
"prompt_number": 172,
"text": [
"\u23a1123.0 31.0 \u23a4\n",
"\u23a2 \u23a5\n",
"\u23a3 8.0 108.0\u23a6"
]
}
],
"prompt_number": 172
},
{
"cell_type": "heading",
"level": 4,
"metadata": {},
"source": [
"grid search to optimize _hyperparameter selection_ (not yet run)"
]
},
{
"cell_type": "heading",
"level": 3,
"metadata": {},
"source": [
"use grid search & k-fold cross-validation to optimize selection of svm hyper-parameters"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# using a two-step grid search for efficiency:\n",
" # 1st grid search: broad search over a coarse grid; then\n",
" # 2nd grid search: fine-grained mesh centered on the param vals from the prior step\n",
"\n",
"# C & gamma are the relevant hyperparameters for the rbf kernel\n",
"\n",
"gamma_range = NP.logspace(-2, 3, 10)\n",
"C_range = NP.logspace(-2, 3, 10)\n",
"\n",
"svc = GridSearchCV(\n",
" estimator=SVM.SVC(kernel='rbf'),\n",
" param_grid=dict(gamma=gamma_range, C=C_range),\n",
" n_jobs=4,\n",
" )\n",
"\n",
"svc.fit(*tr)"
],
"language": "python",
"metadata": {},
"outputs": [
{
"metadata": {},
"output_type": "pyout",
"prompt_number": 173,
"text": [
"GridSearchCV(cv=None,\n",
" estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0, degree=3, gamma=0.0,\n",
" kernel='rbf', max_iter=-1, probability=False, random_state=None,\n",
" shrinking=True, tol=0.001, verbose=False),\n",
" fit_params={}, iid=True, loss_func=None, n_jobs=4,\n",
" param_grid={'C': array([ 0.01 , 0.03594, 0.12915, 0.46416, 1.6681 , 5.99484, 21.54435, 77.42637, 278.25594, 1000. ]), 'gamma': array([ 0.01 , 0.03594, 0.12915, 0.46416, 1.6681 , 5.99484, 21.54435, 77.42637, 278.25594, 1000. ])},\n",
" pre_dispatch='2*n_jobs', refit=True, score_func=None, scoring=None,\n",
" verbose=0)"
]
}
],
"prompt_number": 173
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"best_param_vals = svc.best_params_\n",
"\n",
"print(\"best score from Grid Search: {0:.2f}\".format(svc.best_score_))\n",
"print(\"\\n\")\n",
"print(\"best paramater values from Grid Search \\n\")\n",
"for param, val in best_param_vals.items():\n",
" print(\"{0:^12}{1:.2f}\".format(param, val))"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"best score from Grid Search: 0.86\n",
"\n",
"\n",
"best paramater values from Grid Search \n",
"\n",
" C 617.87\n",
" gamma 0.00\n"
]
}
],
"prompt_number": 179
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"def get_fine_mesh_params(best_params):\n",
"    \"\"\"\n",
"    returns: dict to pass to GridSearchCV's param_grid arg;\n",
"        one key (str) per hyper-parameter; each val is a 1D NumPy array\n",
"        of 20 log-spaced values spanning one decade either side of\n",
"        the best value (best/10 ... best*10)\n",
"\n",
"    pass in: dict stored in a fitted GridSearchCV object's\n",
"        best_params_ attribute, one key per hyper-parameter;\n",
"        each val is a single (best) param value (positive scalar)\n",
"\n",
"    raises: ValueError if a value is not strictly positive\n",
"        (it has no logarithm, so no log-spaced mesh can be centered on it)\n",
"    \"\"\"\n",
"    param_grid_fine = {}\n",
"    for param, val in best_params.items():\n",
"        if not val > 0:\n",
"            raise ValueError(\n",
"                'cannot center a log-spaced mesh on {0}={1!r}'.format(param, val))\n",
"        # NP.logspace takes base-10 exponents, so the window must be centered\n",
"        # on log10(val); with the natural log the mesh lands orders of\n",
"        # magnitude away from val (e.g. best C ~ 618 gave a mesh of 2.7e5-2.7e7)\n",
"        center = NP.log10(val)\n",
"        param_grid_fine[param] = NP.logspace(center - 1, center + 1, 20)\n",
"    return param_grid_fine"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 180
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"param_grid_fine = get_fine_mesh_params(svc.best_params_)\n"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 181
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"svc = GridSearchCV(\n",
" estimator=SVM.SVC(kernel='rbf'), \n",
" param_grid = param_grid_fine, \n",
" n_jobs=4,\n",
" )"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 182
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"svc.fit(*tr)"
],
"language": "python",
"metadata": {},
"outputs": [
{
"metadata": {},
"output_type": "pyout",
"prompt_number": 183,
"text": [
"GridSearchCV(cv=None,\n",
" estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0, degree=3, gamma=0.0,\n",
" kernel='rbf', max_iter=-1, probability=False, random_state=None,\n",
" shrinking=True, tol=0.001, verbose=False),\n",
" fit_params={}, iid=True, loss_func=None, n_jobs=4,\n",
" param_grid={'C': array([ 266860.08706, 340053.13362, 433321.20198, 552170.36846, 703616.88837, 896601.40037, 1142516.73664,\n",
" 1455880.49825, 1855192.10109, 2364024.88809, 3012417.78047, 3838648.62414, 4891493.92065, 6233108.3458 ,\n",
" 7942694.04824, 10121176.34476, 12897161.84202, 16434530.72185, 20942111.40062, 26686008.70562]), 'gamma': array([ 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])},\n",
" pre_dispatch='2*n_jobs', refit=True, score_func=None, scoring=None,\n",
" verbose=0)"
]
}
],
"prompt_number": 183
},
{
"cell_type": "heading",
"level": 4,
"metadata": {},
"source": [
"now evaluate this GridSearch-optimized classifier on the held-out test data"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"svc_pred = svc.predict(te[0])\n",
"\n",
"st = '(svm) fraction of testing instances correctly predicted: '\n",
"print(\"{0}{1}\".format(st, fraction_correct(svc_pred, te[1])))"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"(svm) fraction of testing instances correctly predicted: 0.841\n"
]
}
],
"prompt_number": 184
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"st = '(svm) fraction of testing instances correctly predicted: '\n",
"print(\"{0}{1:.2f}\".format(st, svc.score(*te)))"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"(svm) fraction of testing instances correctly predicted: 0.84\n"
]
}
],
"prompt_number": 185
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# i just wanted to verify that the two results above agree; \n",
"# i was not aware until i saw _score_ in the classifier instance methods\n",
"# that i could call it to get % correct on the test set,\n",
"# previously i had been calling _predict_ then comparing those results with the test set labels"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 186
},
{
"cell_type": "heading",
"level": 4,
"metadata": {},
"source": [
"a second grid search using a polynomial kernel"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# degree & coef0 are the hyperparameters searched here for the polynomial kernel\n",
"\n",
"degree_range = NP.arange(2, 4)\n",
"coef0_range = NP.logspace(-2, 3, 10)\n",
"\n",
"param_grid_coarse = dict(degree=degree_range, coef0=coef0_range)\n",
"\n",
"svc = GridSearchCV(\n",
" estimator=SVM.NuSVC(kernel='poly', nu=0.3), \n",
" param_grid=param_grid_coarse,\n",
" n_jobs=4,\n",
" )\n",
"\n",
"svc.fit(*tr)"
],
"language": "python",
"metadata": {},
"outputs": [
{
"metadata": {},
"output_type": "pyout",
"prompt_number": 187,
"text": [
"GridSearchCV(cv=None,\n",
" estimator=NuSVC(cache_size=200, coef0=0.0, degree=3, gamma=0.0, kernel='poly',\n",
" max_iter=-1, nu=0.3, probability=False, random_state=None,\n",
" shrinking=True, tol=0.001, verbose=False),\n",
" fit_params={}, iid=True, loss_func=None, n_jobs=4,\n",
" param_grid={'coef0': array([ 0.01 , 0.03594, 0.12915, 0.46416, 1.6681 , 5.99484, 21.54435, 77.42637, 278.25594, 1000. ]), 'degree': array([2, 3])},\n",
" pre_dispatch='2*n_jobs', refit=True, score_func=None, scoring=None,\n",
" verbose=0)"
]
}
],
"prompt_number": 187
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"param_grid_fine = get_fine_mesh_params(svc.best_params_)"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 188
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# the fine grid was built from the poly-kernel search above, so it has to be\n",
"# searched with the same poly-kernel estimator; an rbf SVC ignores both\n",
"# coef0 and degree, which makes every grid point an identical model\n",
"coarse_best = svc.best_params_\n",
"\n",
"# degree is pinned to its coarse-search optimum rather than log-refined:\n",
"# get_fine_mesh_params returns a log-spaced float mesh, which is not a\n",
"# meaningful set of polynomial degrees\n",
"param_grid_fine = dict(param_grid_fine, degree=[coarse_best['degree']])\n",
"\n",
"svc = GridSearchCV(\n",
"    estimator=SVM.NuSVC(kernel='poly', nu=0.3),\n",
"    param_grid=param_grid_fine,\n",
"    n_jobs=4,\n",
"    )"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 189
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"svc.fit(*tr)"
],
"language": "python",
"metadata": {},
"outputs": [
{
"metadata": {},
"output_type": "pyout",
"prompt_number": 190,
"text": [
"GridSearchCV(cv=None,\n",
" estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0, degree=3, gamma=0.0,\n",
" kernel='rbf', max_iter=-1, probability=False, random_state=None,\n",
" shrinking=True, tol=0.001, verbose=False),\n",
" fit_params={}, iid=True, loss_func=None, n_jobs=4,\n",
" param_grid={'coef0': array([ 6.17873, 7.87341, 10.03288, 12.78465, 16.29117, 20.75942, 26.45322, 33.70867, 42.95412, 54.73535,\n",
" 69.74789, 88.878 , 113.25501, 144.31802, 183.90085, 234.34025, 298.61392, 380.51624, 484.88233, 617.87343]), 'degree': array([ 0.49... 7.09646, 9.04284, 11.52307, 14.68356, 18.71089, 23.84282, 30.38232, 38.71542, 49.3341 ])},\n",
" pre_dispatch='2*n_jobs', refit=True, score_func=None, scoring=None,\n",
" verbose=0)"
]
}
],
"prompt_number": 190
},
{
"cell_type": "heading",
"level": 4,
"metadata": {},
"source": [
"now evaluate this GridSearch-optimized classifier on the held-out test data"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"svc_pred = svc.predict(te[0])\n",
"\n",
"st = '(svm) fraction of testing instances correctly predicted: '\n",
"print(\"{0}{1:.2f}\".format(st, svc.score(*te)))"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"(svm) fraction of testing instances correctly predicted: 0.86\n"
]
}
],
"prompt_number": 191
}
],
"metadata": {}
}
]
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment