Skip to content

Instantly share code, notes, and snippets.

@qguv
Created March 30, 2015 14:11
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save qguv/a95db5756e808540b4e7 to your computer and use it in GitHub Desktop.
Save qguv/a95db5756e808540b4e7 to your computer and use it in GitHub Desktop.
CMLS Classwork 2015-03-30
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"collapsed": true
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[nltk_data] Downloading package nps_chat to\n",
"[nltk_data] /Users/quintus/nltk_data...\n",
"[nltk_data] Package nps_chat is already up-to-date!\n",
"[nltk_data] Downloading package punkt to /Users/quintus/nltk_data...\n",
"[nltk_data] Package punkt is already up-to-date!\n"
]
}
],
"source": [
"import nltk\n",
"\n",
"# Fetch the NLTK resources this notebook needs: the chat corpus itself and\n",
"# the punkt data used for tokenizing.\n",
"for resource in (\"nps_chat\", \"punkt\"):\n",
"    nltk.download(resource)\n",
"\n",
"# Posts of the NPS Chat corpus; later cells read `.text` and the \"class\"\n",
"# attribute from each one.\n",
"posts = nltk.corpus.nps_chat.xml_posts()"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"def dialogue_act_features(post):\n",
"    \"\"\"Return bag-of-words features for the text of one chat post.\n",
"\n",
"    The text is split with ``nltk.word_tokenize``; every distinct lowercased\n",
"    token becomes a key mapped to ``True``.\n",
"    \"\"\"\n",
"    return {token.lower(): True for token in nltk.word_tokenize(post)}"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# Build (features, label) pairs for every post, split them into a training\n",
"# part and a held-out part, and train the classifier on the training part.\n",
"\n",
"# parameterizing to easily experiment with other classifiers/splits\n",
"WHICH_CLASSIFIER = nltk.NaiveBayesClassifier\n",
"SPLIT_AT_ELEMENT = 5283  # index of the first post that goes to the test set\n",
"\n",
"featuresets = [(dialogue_act_features(post.text), post.get(\"class\")) for post in posts]\n",
"\n",
"# Slice ends are exclusive, so using the same index on both sides keeps every\n",
"# post in exactly one of the two sets.\n",
"train_set = featuresets[:SPLIT_AT_ELEMENT]\n",
"test_set = featuresets[SPLIT_AT_ELEMENT:]\n",
"assert len(train_set) + len(test_set) == len(featuresets)\n",
"\n",
"classifier = WHICH_CLASSIFIER.train(train_set)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Accuracy: 63.93%\n",
"Most Informative Features\n",
" part = True System : Statem = 389.0 : 1.0\n",
" no = True nAnswe : Emotio = 267.4 : 1.0\n",
" hi = True Greet : System = 226.1 : 1.0\n",
" empty = True Other : System = 215.7 : 1.0\n",
" yes = True yAnswe : System = 167.8 : 1.0\n",
" even = True Clarif : System = 138.7 : 1.0\n",
" i = True nAnswe : Greet = 127.6 : 1.0\n",
" 0 = True Other : Statem = 124.5 : 1.0\n",
" pages = True Other : Statem = 124.5 : 1.0\n",
" ? = True ynQues : Greet = 122.7 : 1.0\n",
" brb = True Bye : Statem = 120.4 : 1.0\n",
" lol = True Emotio : System = 115.8 : 1.0\n",
" > = True Other : Emotio = 112.0 : 1.0\n",
" tc = True Bye : Statem = 110.0 : 1.0\n",
" what = True whQues : Greet = 109.7 : 1.0\n",
" nope = True nAnswe : Statem = 107.9 : 1.0\n",
" lmao = True Emotio : System = 75.1 : 1.0\n",
" how = True whQues : Emotio = 74.8 : 1.0\n",
" blank = True Other : Statem = 74.7 : 1.0\n",
" where = True whQues : System = 74.5 : 1.0\n",
" chat = True ynQues : System = 73.7 : 1.0\n",
" the = True Contin : Emotio = 67.5 : 1.0\n",
" < = True Other : ynQues = 66.5 : 1.0\n",
" martini = True Clarif : Statem = 62.2 : 1.0\n",
" 10-19-40suser13 = True Clarif : Statem = 62.2 : 1.0\n",
" who = True whQues : System = 60.6 : 1.0\n",
" whats = True whQues : Statem = 60.2 : 1.0\n",
" are = True whQues : Greet = 59.2 : 1.0\n",
" true = True Accept : Statem = 56.5 : 1.0\n",
" * = True Other : System = 55.5 : 1.0\n",
" na = True ynQues : Greet = 54.7 : 1.0\n",
" me = True ynQues : Greet = 54.7 : 1.0\n",
" ^ = True Other : Greet = 54.2 : 1.0\n",
" 2 = True Other : Greet = 54.2 : 1.0\n",
" 10-24-40suser26 = True Bye : Statem = 53.4 : 1.0\n",
" & = True Other : Statem = 53.3 : 1.0\n",
" too = True yAnswe : System = 52.9 : 1.0\n",
" parent = True yAnswe : Statem = 50.9 : 1.0\n",
" good = True Bye : System = 50.6 : 1.0\n",
" ! = True Emphas : whQues = 50.5 : 1.0\n",
" not = True Reject : System = 49.0 : 1.0\n",
" l = True Other : Emotio = 48.0 : 1.0\n",
" in = True nAnswe : Greet = 47.0 : 1.0\n",
" wan = True ynQues : Greet = 46.5 : 1.0\n",
" said = True Clarif : System = 46.2 : 1.0\n",
" come = True Clarif : System = 46.2 : 1.0\n",
" him = True Clarif : System = 46.2 : 1.0\n",
" iam = True nAnswe : Statem = 46.2 : 1.0\n",
" cheating = True nAnswe : Statem = 46.2 : 1.0\n",
" close = True nAnswe : Statem = 46.2 : 1.0\n"
]
}
],
"source": [
"# Estimate prediction accuracy on the held-out posts (scoring on train_set\n",
"# would only measure how well the model fits data it has already seen),\n",
"# then show the most informative features.\n",
"print(\"Accuracy: {:.2%}\".format(nltk.classify.accuracy(classifier, test_set)))\n",
"classifier.show_most_informative_features(50)"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Confusion Matrix\n",
"\n",
" | w y |\n",
" | C S h n |\n",
" | o E t Q Q |\n",
" | C n E m a n u y u |\n",
" | A l t m p R t S A e A e |\n",
" | c a i o h G O e e y n s n s |\n",
" | c r n t a r t j m s s t s t |\n",
" | e B i u i s e h e e t w i w i |\n",
" | p y f e o i e e c n e e o e o |\n",
" | t e y r n s t r t t m r n r n |\n",
"-----------+----------------------------------------------------------------------------+\n",
" Accept | <19> 1 1 1 1 1 . 14 2 54 . 1 1 13 1 |\n",
" Bye | . <49> . . . 1 . 13 . 28 2 . . . . |\n",
" Clarify | . . <.> . . . . . . 13 . . . . . |\n",
" Continuer | . . . <1> . 1 2 13 1 65 1 2 2 . 4 |\n",
" Emotion | 1 . 1 1 <323> 16 3 22 1 95 14 4 . 1 1 |\n",
" Emphasis | . . . . . <39> . 21 5 7 14 . . 1 2 |\n",
" Greet | . 1 3 . 6 7 <536> 20 . 67 7 . 7 1 4 |\n",
" Other | . . . . 1 . . <6> . 12 4 . . . . |\n",
" Reject | 1 . 1 . . . . 18 <15> 27 . 9 1 . 3 |\n",
" Statement | 5 1 30 11 11 29 6 506 29 <752> 26 60 17 6 78 |\n",
" System | . . 4 . 9 24 . 104 1 6<1277> 1 2 1 2 |\n",
" nAnswer | . . . . . . . 1 3 18 . <15> . . 1 |\n",
"whQuestion | 1 . . 1 . 1 1 50 1 7 2 1 <193> . 34 |\n",
" yAnswer | 6 . . . . 3 . 5 1 16 2 2 . <21> . |\n",
"ynQuestion | 1 . . . . . . 62 6 10 1 5 29 . <149>|\n",
"-----------+----------------------------------------------------------------------------+\n",
"(row = reference; col = test)\n",
"\n"
]
}
],
"source": [
"# Compare the reference labels of the held-out posts with the labels the\n",
"# classifier assigns to the same posts.\n",
"print(\"Confusion Matrix\\n\")\n",
"actual = [label for _, label in test_set]\n",
"predicted = [classifier.classify(features) for features, _ in test_set]\n",
"print(nltk.ConfusionMatrix(actual, predicted))"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.4.3"
}
},
"nbformat": 4,
"nbformat_minor": 0
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment