qguv/gist:a95db5756e808540b4e7

## gistfile1.txt
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[nltk_data] Downloading package nps_chat to\n",
      "[nltk_data]     /Users/quintus/nltk_data...\n",
      "[nltk_data]   Package nps_chat is already up-to-date!\n",
      "[nltk_data] Downloading package punkt to /Users/quintus/nltk_data...\n",
      "[nltk_data]   Package punkt is already up-to-date!\n"
     ]
    }
   ],
   "source": [
    "import nltk\n",
    "nltk.download(\"nps_chat\")\n",
    "nltk.download(\"punkt\")\n",
    "posts = nltk.corpus.nps_chat.xml_posts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "def dialogue_act_features(post):\n",
    "    features = {}\n",
    "    for word in nltk.word_tokenize(post):\n",
    "        features[\"{}\".format(word.lower())] = True\n",
    "    return features"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# build a \n",
    "\n",
    "# parameterizing to easily experiment with other classifiers/splits\n",
    "WHICH_CLASSIFIER = nltk.NaiveBayesClassifier\n",
    "SPLIT_AT_ELEMENT = 5283\n",
    "\n",
    "featuresets = [(dialogue_act_features(post.text), post.get(\"class\")) for post in posts]\n",
    "train_set, test_set = featuresets[:SPLIT_AT_ELEMENT - 1], featuresets[SPLIT_AT_ELEMENT:]\n",
    "classifier = WHICH_CLASSIFIER.train(train_set)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Accuracy: 63.93%\n",
      "Most Informative Features\n",
      "                    part = True           System : Statem =    389.0 : 1.0\n",
      "                      no = True           nAnswe : Emotio =    267.4 : 1.0\n",
      "                      hi = True            Greet : System =    226.1 : 1.0\n",
      "                   empty = True            Other : System =    215.7 : 1.0\n",
      "                     yes = True           yAnswe : System =    167.8 : 1.0\n",
      "                    even = True           Clarif : System =    138.7 : 1.0\n",
      "                       i = True           nAnswe : Greet  =    127.6 : 1.0\n",
      "                       0 = True            Other : Statem =    124.5 : 1.0\n",
      "                   pages = True            Other : Statem =    124.5 : 1.0\n",
      "                       ? = True           ynQues : Greet  =    122.7 : 1.0\n",
      "                     brb = True              Bye : Statem =    120.4 : 1.0\n",
      "                     lol = True           Emotio : System =    115.8 : 1.0\n",
      "                       > = True            Other : Emotio =    112.0 : 1.0\n",
      "                      tc = True              Bye : Statem =    110.0 : 1.0\n",
      "                    what = True           whQues : Greet  =    109.7 : 1.0\n",
      "                    nope = True           nAnswe : Statem =    107.9 : 1.0\n",
      "                    lmao = True           Emotio : System =     75.1 : 1.0\n",
      "                     how = True           whQues : Emotio =     74.8 : 1.0\n",
      "                   blank = True            Other : Statem =     74.7 : 1.0\n",
      "                   where = True           whQues : System =     74.5 : 1.0\n",
      "                    chat = True           ynQues : System =     73.7 : 1.0\n",
      "                     the = True           Contin : Emotio =     67.5 : 1.0\n",
      "                       < = True            Other : ynQues =     66.5 : 1.0\n",
      "                 martini = True           Clarif : Statem =     62.2 : 1.0\n",
      "         10-19-40suser13 = True           Clarif : Statem =     62.2 : 1.0\n",
      "                     who = True           whQues : System =     60.6 : 1.0\n",
      "                   whats = True           whQues : Statem =     60.2 : 1.0\n",
      "                     are = True           whQues : Greet  =     59.2 : 1.0\n",
      "                    true = True           Accept : Statem =     56.5 : 1.0\n",
      "                       * = True            Other : System =     55.5 : 1.0\n",
      "                      na = True           ynQues : Greet  =     54.7 : 1.0\n",
      "                      me = True           ynQues : Greet  =     54.7 : 1.0\n",
      "                       ^ = True            Other : Greet  =     54.2 : 1.0\n",
      "                       2 = True            Other : Greet  =     54.2 : 1.0\n",
      "         10-24-40suser26 = True              Bye : Statem =     53.4 : 1.0\n",
      "                       & = True            Other : Statem =     53.3 : 1.0\n",
      "                     too = True           yAnswe : System =     52.9 : 1.0\n",
      "                  parent = True           yAnswe : Statem =     50.9 : 1.0\n",
      "                    good = True              Bye : System =     50.6 : 1.0\n",
      "                       ! = True           Emphas : whQues =     50.5 : 1.0\n",
      "                     not = True           Reject : System =     49.0 : 1.0\n",
      "                       l = True            Other : Emotio =     48.0 : 1.0\n",
      "                      in = True           nAnswe : Greet  =     47.0 : 1.0\n",
      "                     wan = True           ynQues : Greet  =     46.5 : 1.0\n",
      "                    said = True           Clarif : System =     46.2 : 1.0\n",
      "                    come = True           Clarif : System =     46.2 : 1.0\n",
      "                     him = True           Clarif : System =     46.2 : 1.0\n",
      "                     iam = True           nAnswe : Statem =     46.2 : 1.0\n",
      "                cheating = True           nAnswe : Statem =     46.2 : 1.0\n",
      "                   close = True           nAnswe : Statem =     46.2 : 1.0\n"
     ]
    }
   ],
   "source": [
    "# Show us the most informative features and estimate prediction accuracy\n",
    "print(\"Accuracy: {:.2%}\".format(nltk.classify.accuracy(classifier, train_set)))\n",
    "classifier.show_most_informative_features(50)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Confusion Matrix\n",
      "\n",
      "           |                                                                w         y |\n",
      "           |                   C                             S              h         n |\n",
      "           |                   o         E                   t              Q         Q |\n",
      "           |              C    n    E    m                   a         n    u    y    u |\n",
      "           |    A         l    t    m    p              R    t    S    A    e    A    e |\n",
      "           |    c         a    i    o    h    G    O    e    e    y    n    s    n    s |\n",
      "           |    c         r    n    t    a    r    t    j    m    s    s    t    s    t |\n",
      "           |    e    B    i    u    i    s    e    h    e    e    t    w    i    w    i |\n",
      "           |    p    y    f    e    o    i    e    e    c    n    e    e    o    e    o |\n",
      "           |    t    e    y    r    n    s    t    r    t    t    m    r    n    r    n |\n",
      "-----------+----------------------------------------------------------------------------+\n",
      "    Accept |  <19>   1    1    1    1    1    .   14    2   54    .    1    1   13    1 |\n",
      "       Bye |    .  <49>   .    .    .    1    .   13    .   28    2    .    .    .    . |\n",
      "   Clarify |    .    .   <.>   .    .    .    .    .    .   13    .    .    .    .    . |\n",
      " Continuer |    .    .    .   <1>   .    1    2   13    1   65    1    2    2    .    4 |\n",
      "   Emotion |    1    .    1    1 <323>  16    3   22    1   95   14    4    .    1    1 |\n",
      "  Emphasis |    .    .    .    .    .  <39>   .   21    5    7   14    .    .    1    2 |\n",
      "     Greet |    .    1    3    .    6    7 <536>  20    .   67    7    .    7    1    4 |\n",
      "     Other |    .    .    .    .    1    .    .   <6>   .   12    4    .    .    .    . |\n",
      "    Reject |    1    .    1    .    .    .    .   18  <15>  27    .    9    1    .    3 |\n",
      " Statement |    5    1   30   11   11   29    6  506   29 <752>  26   60   17    6   78 |\n",
      "    System |    .    .    4    .    9   24    .  104    1    6<1277>   1    2    1    2 |\n",
      "   nAnswer |    .    .    .    .    .    .    .    1    3   18    .  <15>   .    .    1 |\n",
      "whQuestion |    1    .    .    1    .    1    1   50    1    7    2    1 <193>   .   34 |\n",
      "   yAnswer |    6    .    .    .    .    3    .    5    1   16    2    2    .  <21>   . |\n",
      "ynQuestion |    1    .    .    .    .    .    .   62    6   10    1    5   29    . <149>|\n",
      "-----------+----------------------------------------------------------------------------+\n",
      "(row = reference; col = test)\n",
      "\n"
     ]
    }
   ],
   "source": [
    "print(\"Confusion Matrix\\n\")\n",
    "predicted, actual = [], []\n",
    "for i, (features, post) in enumerate(test_set):\n",
    "    actual.append(post)\n",
    "    predicted.append(classifier.classify(features))\n",
    "print(nltk.ConfusionMatrix(actual, predicted))"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.4.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}
	{
	"cells": [
	{
	"cell_type": "code",
	"execution_count": 1,
	"metadata": {
	"collapsed": true
	},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"[nltk_data] Downloading package nps_chat to\n",
	"[nltk_data] /Users/quintus/nltk_data...\n",
	"[nltk_data] Package nps_chat is already up-to-date!\n",
	"[nltk_data] Downloading package punkt to /Users/quintus/nltk_data...\n",
	"[nltk_data] Package punkt is already up-to-date!\n"
	]
	}
	],
	"source": [
	"import nltk\n",
	"nltk.download(\"nps_chat\")\n",
	"nltk.download(\"punkt\")\n",
	"posts = nltk.corpus.nps_chat.xml_posts()"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 2,
	"metadata": {
	"collapsed": false
	},
	"outputs": [],
	"source": [
	"def dialogue_act_features(post):\n",
	" features = {}\n",
	" for word in nltk.word_tokenize(post):\n",
	" features[\"{}\".format(word.lower())] = True\n",
	" return features"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 3,
	"metadata": {
	"collapsed": false
	},
	"outputs": [],
	"source": [
	"# build a \n",
	"\n",
	"# parameterizing to easily experiment with other classifiers/splits\n",
	"WHICH_CLASSIFIER = nltk.NaiveBayesClassifier\n",
	"SPLIT_AT_ELEMENT = 5283\n",
	"\n",
	"featuresets = [(dialogue_act_features(post.text), post.get(\"class\")) for post in posts]\n",
	"train_set, test_set = featuresets[:SPLIT_AT_ELEMENT - 1], featuresets[SPLIT_AT_ELEMENT:]\n",
	"classifier = WHICH_CLASSIFIER.train(train_set)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 4,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"Accuracy: 63.93%\n",
	"Most Informative Features\n",
	" part = True System : Statem = 389.0 : 1.0\n",
	" no = True nAnswe : Emotio = 267.4 : 1.0\n",
	" hi = True Greet : System = 226.1 : 1.0\n",
	" empty = True Other : System = 215.7 : 1.0\n",
	" yes = True yAnswe : System = 167.8 : 1.0\n",
	" even = True Clarif : System = 138.7 : 1.0\n",
	" i = True nAnswe : Greet = 127.6 : 1.0\n",
	" 0 = True Other : Statem = 124.5 : 1.0\n",
	" pages = True Other : Statem = 124.5 : 1.0\n",
	" ? = True ynQues : Greet = 122.7 : 1.0\n",
	" brb = True Bye : Statem = 120.4 : 1.0\n",
	" lol = True Emotio : System = 115.8 : 1.0\n",
	" > = True Other : Emotio = 112.0 : 1.0\n",
	" tc = True Bye : Statem = 110.0 : 1.0\n",
	" what = True whQues : Greet = 109.7 : 1.0\n",
	" nope = True nAnswe : Statem = 107.9 : 1.0\n",
	" lmao = True Emotio : System = 75.1 : 1.0\n",
	" how = True whQues : Emotio = 74.8 : 1.0\n",
	" blank = True Other : Statem = 74.7 : 1.0\n",
	" where = True whQues : System = 74.5 : 1.0\n",
	" chat = True ynQues : System = 73.7 : 1.0\n",
	" the = True Contin : Emotio = 67.5 : 1.0\n",
	" < = True Other : ynQues = 66.5 : 1.0\n",
	" martini = True Clarif : Statem = 62.2 : 1.0\n",
	" 10-19-40suser13 = True Clarif : Statem = 62.2 : 1.0\n",
	" who = True whQues : System = 60.6 : 1.0\n",
	" whats = True whQues : Statem = 60.2 : 1.0\n",
	" are = True whQues : Greet = 59.2 : 1.0\n",
	" true = True Accept : Statem = 56.5 : 1.0\n",
	" * = True Other : System = 55.5 : 1.0\n",
	" na = True ynQues : Greet = 54.7 : 1.0\n",
	" me = True ynQues : Greet = 54.7 : 1.0\n",
	" ^ = True Other : Greet = 54.2 : 1.0\n",
	" 2 = True Other : Greet = 54.2 : 1.0\n",
	" 10-24-40suser26 = True Bye : Statem = 53.4 : 1.0\n",
	" & = True Other : Statem = 53.3 : 1.0\n",
	" too = True yAnswe : System = 52.9 : 1.0\n",
	" parent = True yAnswe : Statem = 50.9 : 1.0\n",
	" good = True Bye : System = 50.6 : 1.0\n",
	" ! = True Emphas : whQues = 50.5 : 1.0\n",
	" not = True Reject : System = 49.0 : 1.0\n",
	" l = True Other : Emotio = 48.0 : 1.0\n",
	" in = True nAnswe : Greet = 47.0 : 1.0\n",
	" wan = True ynQues : Greet = 46.5 : 1.0\n",
	" said = True Clarif : System = 46.2 : 1.0\n",
	" come = True Clarif : System = 46.2 : 1.0\n",
	" him = True Clarif : System = 46.2 : 1.0\n",
	" iam = True nAnswe : Statem = 46.2 : 1.0\n",
	" cheating = True nAnswe : Statem = 46.2 : 1.0\n",
	" close = True nAnswe : Statem = 46.2 : 1.0\n"
	]
	}
	],
	"source": [
	"# Show us the most informative features and estimate prediction accuracy\n",
	"print(\"Accuracy: {:.2%}\".format(nltk.classify.accuracy(classifier, train_set)))\n",
	"classifier.show_most_informative_features(50)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 5,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"Confusion Matrix\n",
	"\n",
	"           |                                                                w         y |\n",
	"           |                   C                             S              h         n |\n",
	"           |                   o         E                   t              Q         Q |\n",
	"           |              C    n    E    m                   a         n    u    y    u |\n",
	"           |    A         l    t    m    p              R    t    S    A    e    A    e |\n",
	"           |    c         a    i    o    h    G    O    e    e    y    n    s    n    s |\n",
	"           |    c         r    n    t    a    r    t    j    m    s    s    t    s    t |\n",
	"           |    e    B    i    u    i    s    e    h    e    e    t    w    i    w    i |\n",
	"           |    p    y    f    e    o    i    e    e    c    n    e    e    o    e    o |\n",
	"           |    t    e    y    r    n    s    t    r    t    t    m    r    n    r    n |\n",
	"-----------+----------------------------------------------------------------------------+\n",
	"    Accept |  <19>   1    1    1    1    1    .   14    2   54    .    1    1   13    1 |\n",
	"       Bye |    .  <49>   .    .    .    1    .   13    .   28    2    .    .    .    . |\n",
	"   Clarify |    .    .   <.>   .    .    .    .    .    .   13    .    .    .    .    . |\n",
	" Continuer |    .    .    .   <1>   .    1    2   13    1   65    1    2    2    .    4 |\n",
	"   Emotion |    1    .    1    1 <323>  16    3   22    1   95   14    4    .    1    1 |\n",
	"  Emphasis |    .    .    .    .    .  <39>   .   21    5    7   14    .    .    1    2 |\n",
	"     Greet |    .    1    3    .    6    7 <536>  20    .   67    7    .    7    1    4 |\n",
	"     Other |    .    .    .    .    1    .    .   <6>   .   12    4    .    .    .    . |\n",
	"    Reject |    1    .    1    .    .    .    .   18  <15>  27    .    9    1    .    3 |\n",
	" Statement |    5    1   30   11   11   29    6  506   29 <752>  26   60   17    6   78 |\n",
	"    System |    .    .    4    .    9   24    .  104    1    6<1277>   1    2    1    2 |\n",
	"   nAnswer |    .    .    .    .    .    .    .    1    3   18    .  <15>   .    .    1 |\n",
	"whQuestion |    1    .    .    1    .    1    1   50    1    7    2    1 <193>   .   34 |\n",
	"   yAnswer |    6    .    .    .    .    3    .    5    1   16    2    2    .  <21>   . |\n",
	"ynQuestion |    1    .    .    .    .    .    .   62    6   10    1    5   29    . <149>|\n",
	"-----------+----------------------------------------------------------------------------+\n",
	"(row = reference; col = test)\n",
	"\n"
	]
	}
	],
	"source": [
	"print(\"Confusion Matrix\\n\")\n",
	"predicted, actual = [], []\n",
	"for i, (features, post) in enumerate(test_set):\n",
	" actual.append(post)\n",
	" predicted.append(classifier.classify(features))\n",
	"print(nltk.ConfusionMatrix(actual, predicted))"
	]
	}
	],
	"metadata": {
	"kernelspec": {
	"display_name": "Python 3",
	"language": "python",
	"name": "python3"
	},
	"language_info": {
	"codemirror_mode": {
	"name": "ipython",
	"version": 3
	},
	"file_extension": ".py",
	"mimetype": "text/x-python",
	"name": "python",
	"nbconvert_exporter": "python",
	"pygments_lexer": "ipython3",
	"version": "3.4.3"
	}
	},
	"nbformat": 4,
	"nbformat_minor": 0
	}