Created
March 30, 2015 14:11
-
-
Save qguv/a95db5756e808540b4e7 to your computer and use it in GitHub Desktop.
CMLS Classwork 2015-03-30
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"execution_count": 1, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"[nltk_data] Downloading package nps_chat to\n", | |
"[nltk_data] /Users/quintus/nltk_data...\n", | |
"[nltk_data] Package nps_chat is already up-to-date!\n", | |
"[nltk_data] Downloading package punkt to /Users/quintus/nltk_data...\n", | |
"[nltk_data] Package punkt is already up-to-date!\n" | |
] | |
} | |
], | |
"source": [ | |
"import nltk\n", | |
"nltk.download(\"nps_chat\")\n", | |
"nltk.download(\"punkt\")\n", | |
"posts = nltk.corpus.nps_chat.xml_posts()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 2, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"def dialogue_act_features(post):\n", | |
" features = {}\n", | |
" for word in nltk.word_tokenize(post):\n", | |
" features[\"{}\".format(word.lower())] = True\n", | |
" return features" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 3, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"# build a \n", | |
"\n", | |
"# parameterizing to easily experiment with other classifiers/splits\n", | |
"WHICH_CLASSIFIER = nltk.NaiveBayesClassifier\n", | |
"SPLIT_AT_ELEMENT = 5283\n", | |
"\n", | |
"featuresets = [(dialogue_act_features(post.text), post.get(\"class\")) for post in posts]\n", | |
"train_set, test_set = featuresets[:SPLIT_AT_ELEMENT - 1], featuresets[SPLIT_AT_ELEMENT:]\n", | |
"classifier = WHICH_CLASSIFIER.train(train_set)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 4, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"Accuracy: 63.93%\n", | |
"Most Informative Features\n", | |
" part = True System : Statem = 389.0 : 1.0\n", | |
" no = True nAnswe : Emotio = 267.4 : 1.0\n", | |
" hi = True Greet : System = 226.1 : 1.0\n", | |
" empty = True Other : System = 215.7 : 1.0\n", | |
" yes = True yAnswe : System = 167.8 : 1.0\n", | |
" even = True Clarif : System = 138.7 : 1.0\n", | |
" i = True nAnswe : Greet = 127.6 : 1.0\n", | |
" 0 = True Other : Statem = 124.5 : 1.0\n", | |
" pages = True Other : Statem = 124.5 : 1.0\n", | |
" ? = True ynQues : Greet = 122.7 : 1.0\n", | |
" brb = True Bye : Statem = 120.4 : 1.0\n", | |
" lol = True Emotio : System = 115.8 : 1.0\n", | |
" > = True Other : Emotio = 112.0 : 1.0\n", | |
" tc = True Bye : Statem = 110.0 : 1.0\n", | |
" what = True whQues : Greet = 109.7 : 1.0\n", | |
" nope = True nAnswe : Statem = 107.9 : 1.0\n", | |
" lmao = True Emotio : System = 75.1 : 1.0\n", | |
" how = True whQues : Emotio = 74.8 : 1.0\n", | |
" blank = True Other : Statem = 74.7 : 1.0\n", | |
" where = True whQues : System = 74.5 : 1.0\n", | |
" chat = True ynQues : System = 73.7 : 1.0\n", | |
" the = True Contin : Emotio = 67.5 : 1.0\n", | |
" < = True Other : ynQues = 66.5 : 1.0\n", | |
" martini = True Clarif : Statem = 62.2 : 1.0\n", | |
" 10-19-40suser13 = True Clarif : Statem = 62.2 : 1.0\n", | |
" who = True whQues : System = 60.6 : 1.0\n", | |
" whats = True whQues : Statem = 60.2 : 1.0\n", | |
" are = True whQues : Greet = 59.2 : 1.0\n", | |
" true = True Accept : Statem = 56.5 : 1.0\n", | |
" * = True Other : System = 55.5 : 1.0\n", | |
" na = True ynQues : Greet = 54.7 : 1.0\n", | |
" me = True ynQues : Greet = 54.7 : 1.0\n", | |
" ^ = True Other : Greet = 54.2 : 1.0\n", | |
" 2 = True Other : Greet = 54.2 : 1.0\n", | |
" 10-24-40suser26 = True Bye : Statem = 53.4 : 1.0\n", | |
" & = True Other : Statem = 53.3 : 1.0\n", | |
" too = True yAnswe : System = 52.9 : 1.0\n", | |
" parent = True yAnswe : Statem = 50.9 : 1.0\n", | |
" good = True Bye : System = 50.6 : 1.0\n", | |
" ! = True Emphas : whQues = 50.5 : 1.0\n", | |
" not = True Reject : System = 49.0 : 1.0\n", | |
" l = True Other : Emotio = 48.0 : 1.0\n", | |
" in = True nAnswe : Greet = 47.0 : 1.0\n", | |
" wan = True ynQues : Greet = 46.5 : 1.0\n", | |
" said = True Clarif : System = 46.2 : 1.0\n", | |
" come = True Clarif : System = 46.2 : 1.0\n", | |
" him = True Clarif : System = 46.2 : 1.0\n", | |
" iam = True nAnswe : Statem = 46.2 : 1.0\n", | |
" cheating = True nAnswe : Statem = 46.2 : 1.0\n", | |
" close = True nAnswe : Statem = 46.2 : 1.0\n" | |
] | |
} | |
], | |
"source": [ | |
"# Show us the most informative features and estimate prediction accuracy\n", | |
"print(\"Accuracy: {:.2%}\".format(nltk.classify.accuracy(classifier, train_set)))\n", | |
"classifier.show_most_informative_features(50)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 5, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"Confusion Matrix\n", | |
"\n", | |
" | w y |\n", | |
" | C S h n |\n", | |
" | o E t Q Q |\n", | |
" | C n E m a n u y u |\n", | |
" | A l t m p R t S A e A e |\n", | |
" | c a i o h G O e e y n s n s |\n", | |
" | c r n t a r t j m s s t s t |\n", | |
" | e B i u i s e h e e t w i w i |\n", | |
" | p y f e o i e e c n e e o e o |\n", | |
" | t e y r n s t r t t m r n r n |\n", | |
"-----------+----------------------------------------------------------------------------+\n", | |
" Accept | <19> 1 1 1 1 1 . 14 2 54 . 1 1 13 1 |\n", | |
" Bye | . <49> . . . 1 . 13 . 28 2 . . . . |\n", | |
" Clarify | . . <.> . . . . . . 13 . . . . . |\n", | |
" Continuer | . . . <1> . 1 2 13 1 65 1 2 2 . 4 |\n", | |
" Emotion | 1 . 1 1 <323> 16 3 22 1 95 14 4 . 1 1 |\n", | |
" Emphasis | . . . . . <39> . 21 5 7 14 . . 1 2 |\n", | |
" Greet | . 1 3 . 6 7 <536> 20 . 67 7 . 7 1 4 |\n", | |
" Other | . . . . 1 . . <6> . 12 4 . . . . |\n", | |
" Reject | 1 . 1 . . . . 18 <15> 27 . 9 1 . 3 |\n", | |
" Statement | 5 1 30 11 11 29 6 506 29 <752> 26 60 17 6 78 |\n", | |
" System | . . 4 . 9 24 . 104 1 6<1277> 1 2 1 2 |\n", | |
" nAnswer | . . . . . . . 1 3 18 . <15> . . 1 |\n", | |
"whQuestion | 1 . . 1 . 1 1 50 1 7 2 1 <193> . 34 |\n", | |
" yAnswer | 6 . . . . 3 . 5 1 16 2 2 . <21> . |\n", | |
"ynQuestion | 1 . . . . . . 62 6 10 1 5 29 . <149>|\n", | |
"-----------+----------------------------------------------------------------------------+\n", | |
"(row = reference; col = test)\n", | |
"\n" | |
] | |
} | |
], | |
"source": [ | |
"print(\"Confusion Matrix\\n\")\n", | |
"predicted, actual = [], []\n", | |
"for i, (features, post) in enumerate(test_set):\n", | |
" actual.append(post)\n", | |
" predicted.append(classifier.classify(features))\n", | |
"print(nltk.ConfusionMatrix(actual, predicted))" | |
] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 3", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.4.3" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 0 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment