{
"metadata": {
"name": "",
"signature": "sha256:976b053b3fc9365d88c7eb49e0505472456f537c77473ecef8011caaf34121f2"
},
"nbformat": 3,
"nbformat_minor": 0,
"worksheets": [
{
"cells": [
{
"cell_type": "heading",
"level": 1,
"metadata": {},
"source": [
"6.1 Supervised Classification (\uac10\ub3c5\ud559\uc2b5\uc5d0 \uc758\ud55c \ubd84\ub958)"
]
},
{
"cell_type": "heading",
"level": 6,
"metadata": {},
"source": [
"\ubd84\ub958\ub294 \uc785\ub825\uc5d0 \ub300\ud55c \uc801\uc808\ud55c \ub77c\ubca8(class)\uc744 \uace0\ub974\ub294 \uc791\uc5c5\uc774\ub2e4. <BR>\n",
"<BR>\n",
"Classification is the task of choosing the correct class label for a given input.<BR>\n",
"A classifier is called supervised if it is built based on training corpora containing the correct label for each input."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"<img src=\"http://www.nltk.org/images/supervised-classification.png\" width=\"700\"><BR>\n",
"(Figure 6-1) Supervised Classification"
]
},
{
"cell_type": "heading",
"level": 2,
"metadata": {},
"source": [
"Gender Identification (\uc131\ubcc4 \ud310\ubcc4)"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"from __future__ import print_function, unicode_literals\n",
"from pprint import pprint\n",
"import nltk\n",
"from nltk.corpus import names as name2gender\n",
"import random\n",
"import sys\n",
"\n",
"\n",
"# \uc785\ub825\ub370\uc774\ud0c0 \uc0dd\uc131. (\uc774\ub984, \uc131\ubcc4)\n",
"names = ([(name, 'male') for name in name2gender.words('male.txt')] + \\\n",
"[(name,'female') for name in name2gender.words('female.txt')])\n",
"random.shuffle(names)\n",
"\n",
"print('len(names):', len(names))\n",
"pprint(names[:10])\n",
"print()\n",
"\n",
"# \ud2b9\uc9d5 \ucd94\ucd9c \ud568\uc218 \uc815\uc758. (\uc774\ub984) -> (\ub9c8\uc9c0\ub9c9 \ubb38\uc790)\n",
"def gender_features(word):\n",
" return {'last_letter': word[-1]}\n",
"\n",
"print(\"gender_features('Shrek'):\", gender_features('Shrek'))\n",
"print(\"names ended with 'k':\")\n",
"pprint([(name, gender) for (name,gender) in names if gender_features(name)['last_letter']=='k'][:10])\n",
"print()\n",
"\n",
"# \ud559\uc2b5\uc14b\uacfc \ud14c\uc2a4\ud2b8\uc14b\uc744 \uc0dd\uc131. (\ud2b9\uc9d5, \uc131\ubcc4)\n",
"featuresets = [(gender_features(name), gender) for (name,gender) in names]\n",
"train_set = featuresets[500:]\n",
"test_set = featuresets[:500] # \ud14c\uc2a4\ud2b8\uc14b\uc744 500\uac1c \ubf51\uc74c.\n",
"print('len(train_set):', len(train_set))\n",
"print('len(test_set):', len(test_set))\n",
"pprint(test_set[:10])\n",
"print()\n",
"\n",
"# \ubd84\ub958\uae30 \ud559\uc2b5\n",
"classifier=nltk.NaiveBayesClassifier.train(train_set)\n",
"print()\n",
"\n",
"# \ubd84\ub958\uae30 \ud14c\uc2a4\ud2b8\n",
"print(\"classifier.classify(gender_features('Neo')):\", classifier.classify(gender_features('Neo'))) \n",
"print(\"classifier.classify(gender_features('Trinity')):\", classifier.classify(gender_features('Trinity')))\n",
"print(\"classifier.classify(gender_features('Tony')):\", classifier.classify(gender_features('Tony')))\n",
"print()\n",
"\n",
"# \ubd84\ub958\uae30 \ud3c9\uac00\n",
"print('accuracy:', nltk.classify.accuracy(classifier, test_set)) # \ud14c\uc2a4\ud2b8\uc14b\uc73c\ub85c \uc815\ud655\ub960 \ud655\uc778\n",
"print()\n",
"\n",
"classifier.show_most_informative_features(10) # likelihood-ratio (\uc6b0\ub3c4\ube44)\n",
"print()\n"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"len(names): 7944\n",
"[(u'Hynda', u'female'),\n",
" (u'Jabez', u'male'),\n",
" (u'Tracy', u'female'),\n",
" (u'Isahella', u'female'),\n",
" (u'Wayland', u'male'),\n",
" (u'Vally', u'female'),\n",
" (u'Dee Dee', u'female'),\n",
" (u'Anastassia', u'female'),\n",
" (u'Sophie', u'female'),\n",
" (u'Mariele', u'female')]\n",
"\n",
"gender_features('Shrek'): {u'last_letter': u'k'}\n",
"names ended with 'k':\n",
"[(u'Izak', u'male'),\n",
" (u'Vick', u'male'),\n",
" (u'Dirk', u'male'),\n",
" (u'Merrick', u'male'),\n",
" (u'Roderick', u'male'),\n",
" (u'Tuck', u'male'),\n",
" (u'Erik', u'male'),\n",
" (u'Ulrick', u'male'),\n",
" (u'Kirk', u'male'),\n",
" (u'Jack', u'male')]\n",
"\n",
"len(train_set): 7444\n",
"len(test_set): 500\n",
"[({u'last_letter': u'a'}, u'female'),\n",
" ({u'last_letter': u'z'}, u'male'),\n",
" ({u'last_letter': u'y'}, u'female'),\n",
" ({u'last_letter': u'a'}, u'female'),\n",
" ({u'last_letter': u'd'}, u'male'),\n",
" ({u'last_letter': u'y'}, u'female'),\n",
" ({u'last_letter': u'e'}, u'female'),\n",
" ({u'last_letter': u'a'}, u'female'),\n",
" ({u'last_letter': u'e'}, u'female'),\n",
" ({u'last_letter': u'e'}, u'female')]\n",
"\n",
"\n",
"classifier.classify(gender_features('Neo')): male\n",
"classifier.classify(gender_features('Trinity')): female\n",
"classifier.classify(gender_features('Tony')): female\n",
"\n",
"accuracy: 0.756\n",
"\n",
"Most Informative Features\n",
" last_letter = u'a' female : male = 36.1 : 1.0\n",
" last_letter = u'k' male : female = 32.1 : 1.0\n",
" last_letter = u'f' male : female = 16.6 : 1.0\n",
" last_letter = u'p' male : female = 12.5 : 1.0\n",
" last_letter = u'v' male : female = 11.2 : 1.0\n",
" last_letter = u'd' male : female = 9.6 : 1.0\n",
" last_letter = u'm' male : female = 9.0 : 1.0\n",
" last_letter = u'o' male : female = 8.6 : 1.0\n",
" last_letter = u'w' male : female = 7.5 : 1.0\n",
" last_letter = u'r' male : female = 6.6 : 1.0\n",
"\n"
]
}
],
"prompt_number": 1
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# (\ud301) LazyMap\uc744 \uc774\uc6a9\ud558\uc5ec \uba54\ubaa8\ub9ac \uc0ac\uc6a9\uc744 \ucd5c\uc18c\ud654 \ud558\ub77c.\n",
"from __future__ import print_function, unicode_literals\n",
"from pprint import pprint\n",
"from nltk.classify import apply_features \n",
"import sys\n",
"\n",
"\n",
"# \uc785\ub825\ub370\uc774\ud0c0 \uc0dd\uc131. (\uc774\ub984, \uc131\ubcc4)\n",
"# names = ([(name, 'male') for name in name2gender.words('male.txt')] + \\\n",
"# [(name,'female') for name in name2gender.words('female.txt')])\n",
"# random.shuffle(names)\n",
"\n",
"print('len(names):', len(names))\n",
"pprint(names[:10])\n",
"print()\n",
"\n",
"# \ud2b9\uc9d5 \ucd94\ucd9c \ud568\uc218 \uc815\uc758. (\uc774\ub984) -> (\ub9c8\uc9c0\ub9c9 \ubb38\uc790)\n",
"def gender_features(word):\n",
" return {'last_letter': word[-1]}\n",
"\n",
"print(\"gender_features('Shrek'):\", gender_features('Shrek'))\n",
"pprint([(name, gender) for (name,gender) in names if gender_features(name)['last_letter']=='k'][:10])\n",
"print()\n",
"\n",
"\n",
"%reload_ext memory_profiler\n",
"\n",
"# \ud559\uc2b5\uc14b\uacfc \ud14c\uc2a4\ud2b8\uc14b\uc744 \uc0dd\uc131. (\ub370\uc774\ud0c0\uc14b \ud06c\uae30\uac00 \uc791\uc744 \ub54c)\n",
"%memit train_set = [(gender_features(name), gender) for (name,gender) in names][500:]\n",
"print(\"train_set:\", type(train_set), sys.getsizeof(train_set), 'bytes')\n",
"%memit classifier=nltk.NaiveBayesClassifier.train(train_set)\n",
"print()\n",
"print()\n",
"\n",
"# \ud559\uc2b5\uc14b\uacfc \ud14c\uc2a4\ud2b8\uc14b\uc744 \uc0dd\uc131. (\ub370\uc774\ud0c0\uc14b \ud06c\uae30\uac00 \ud074 \ub54c)\n",
"%memit train_set2 = apply_features(gender_features, names[500:])\n",
"print(\"train_set2:\", type(train_set2), sys.getsizeof(train_set2), 'bytes')\n",
"%memit classifier=nltk.NaiveBayesClassifier.train(train_set2)\n",
"print()\n",
"print()\n",
"\n",
"\n",
"\n",
"# \ub05d \uae00\uc790\uc5d0 \ud574\ub2f9\ud558\ub294 \uc774\ub984 \ud655\uc778 (\uad50\uc7ac\uc5d0 \uc5c6\uc74c)\n",
"def list_from_last_letter(names, letter): \n",
" li = []\n",
" for name, gender in names:\n",
" if name.endswith(letter):\n",
" li.append((name, gender))\n",
" return li\n",
"\n",
"print(\"ends with 'k'\")\n",
"pprint(list_from_last_letter(names, 'k')[:10])"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"len(names): 7944\n",
"[(u'Hynda', u'female'),\n",
" (u'Jabez', u'male'),\n",
" (u'Tracy', u'female'),\n",
" (u'Isahella', u'female'),\n",
" (u'Wayland', u'male'),\n",
" (u'Vally', u'female'),\n",
" (u'Dee Dee', u'female'),\n",
" (u'Anastassia', u'female'),\n",
" (u'Sophie', u'female'),\n",
" (u'Mariele', u'female')]\n",
"\n",
"gender_features('Shrek'): {u'last_letter': u'k'}\n",
"[(u'Izak', u'male'),\n",
" (u'Vick', u'male'),\n",
" (u'Dirk', u'male'),\n",
" (u'Merrick', u'male'),\n",
" (u'Roderick', u'male'),\n",
" (u'Tuck', u'male'),\n",
" (u'Erik', u'male'),\n",
" (u'Ulrick', u'male'),\n",
" (u'Kirk', u'male'),\n",
" (u'Jack', u'male')]\n",
"\n"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"peak memory: 70.50 MiB, increment: 5.88 MiB\n",
"train_set: <type 'list'> 59624 bytes\n"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"peak memory: 70.51 MiB, increment: 0.00 MiB\n",
"\n",
"\n"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"peak memory: 70.51 MiB, increment: 0.00 MiB\n",
"train_set2: <class 'nltk.util.LazyMap'> 64 bytes\n"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"peak memory: 70.51 MiB, increment: 0.00 MiB\n",
"\n",
"\n",
"ends with 'k'\n",
"[(u'Izak', u'male'),\n",
" (u'Vick', u'male'),\n",
" (u'Dirk', u'male'),\n",
" (u'Merrick', u'male'),\n",
" (u'Roderick', u'male'),\n",
" (u'Tuck', u'male'),\n",
" (u'Erik', u'male'),\n",
" (u'Ulrick', u'male'),\n",
" (u'Kirk', u'male'),\n",
" (u'Jack', u'male')]\n"
]
}
],
"prompt_number": 2
},
{
"cell_type": "heading",
"level": 2,
"metadata": {},
"source": [
"Choosing the Right Features (\uc62c\ubc14\ub978 \ud2b9\uc9d5 \uc120\ud0dd)"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"from __future__ import print_function, unicode_literals\n",
"from pprint import pprint\n",
"import nltk\n",
"from nltk.corpus import names as name2gender\n",
"import random\n",
"import sys\n",
"\n",
"\n",
"# \uc785\ub825\ub370\uc774\ud0c0 \uc0dd\uc131. (\uc774\ub984, \uc131\ubcc4)\n",
"# names = ([(name, 'male') for name in name2gender.words('male.txt')] + \\\n",
"# [(name,'female') for name in name2gender.words('female.txt')])\n",
"# random.shuffle(names)\n",
"\n",
"print('len(names):', len(names))\n",
"pprint(names[:10])\n",
"print()\n",
"\n",
"# \ud2b9\uc9d5 \ucd94\ucd9c \ud568\uc218 \uc815\uc758. (\uc774\ub984) -> (\uccab/\ub05d \ubb38\uc790, \ubb38\uc790\ubcc4 \uac1c\uc218, \ud3ec\ud568\uc5ec\ubd80)\n",
"def gender_features2(name):\n",
" features={}\n",
" features['firstletter']=name[0].lower()\n",
" features['lastletter']=name[-1].lower()\n",
" for letter in 'abcdefghijklmnopqrstuvwxyz':\n",
" features['count(%s)'%letter]=name.lower().count(letter)\n",
" features['has(%s)'%letter]=(letter in name.lower())\n",
" return features\n",
"\n",
"print(\"gender_features2('Shrek'):\")\n",
"pprint(gender_features2('Shrek'))\n",
"print()\n",
"\n",
"# \ud559\uc2b5\uc14b\uacfc \ud14c\uc2a4\ud2b8\uc14b\uc744 \uc0dd\uc131. (\ud2b9\uc9d5, \uc131\ubcc4)\n",
"featuresets=[(gender_features2(name),gender) for (name, gender) in names]\n",
"train_set=featuresets[500:]\n",
"test_set=featuresets[:500]\n",
"\n",
"# \ubd84\ub958\uae30 \ud559\uc2b5\n",
"classifier=nltk.NaiveBayesClassifier.train(train_set)\n",
"\n",
"# \ubd84\ub958\uae30 \ud3c9\uac00\n",
"print('accuracy:', nltk.classify.accuracy(classifier, test_set))\n",
"classifier.show_most_informative_features(100)\n",
"print()"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"len(names): 7944\n",
"[(u'Hynda', u'female'),\n",
" (u'Jabez', u'male'),\n",
" (u'Tracy', u'female'),\n",
" (u'Isahella', u'female'),\n",
" (u'Wayland', u'male'),\n",
" (u'Vally', u'female'),\n",
" (u'Dee Dee', u'female'),\n",
" (u'Anastassia', u'female'),\n",
" (u'Sophie', u'female'),\n",
" (u'Mariele', u'female')]\n",
"\n",
"gender_features2('Shrek'):\n",
"{u'count(a)': 0,\n",
" u'count(b)': 0,\n",
" u'count(c)': 0,\n",
" u'count(d)': 0,\n",
" u'count(e)': 1,\n",
" u'count(f)': 0,\n",
" u'count(g)': 0,\n",
" u'count(h)': 1,\n",
" u'count(i)': 0,\n",
" u'count(j)': 0,\n",
" u'count(k)': 1,\n",
" u'count(l)': 0,\n",
" u'count(m)': 0,\n",
" u'count(n)': 0,\n",
" u'count(o)': 0,\n",
" u'count(p)': 0,\n",
" u'count(q)': 0,\n",
" u'count(r)': 1,\n",
" u'count(s)': 1,\n",
" u'count(t)': 0,\n",
" u'count(u)': 0,\n",
" u'count(v)': 0,\n",
" u'count(w)': 0,\n",
" u'count(x)': 0,\n",
" u'count(y)': 0,\n",
" u'count(z)': 0,\n",
" u'firstletter': u's',\n",
" u'has(a)': False,\n",
" u'has(b)': False,\n",
" u'has(c)': False,\n",
" u'has(d)': False,\n",
" u'has(e)': True,\n",
" u'has(f)': False,\n",
" u'has(g)': False,\n",
" u'has(h)': True,\n",
" u'has(i)': False,\n",
" u'has(j)': False,\n",
" u'has(k)': True,\n",
" u'has(l)': False,\n",
" u'has(m)': False,\n",
" u'has(n)': False,\n",
" u'has(o)': False,\n",
" u'has(p)': False,\n",
" u'has(q)': False,\n",
" u'has(r)': True,\n",
" u'has(s)': True,\n",
" u'has(t)': False,\n",
" u'has(u)': False,\n",
" u'has(v)': False,\n",
" u'has(w)': False,\n",
" u'has(x)': False,\n",
" u'has(y)': False,\n",
" u'has(z)': False,\n",
" u'lastletter': u'k'}\n",
"\n",
"accuracy:"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
" 0.778\n",
"Most Informative Features\n",
" lastletter = u'a' female : male = 36.1 : 1.0\n",
" lastletter = u'k' male : female = 32.1 : 1.0\n",
" lastletter = u'f' male : female = 16.6 : 1.0\n",
" lastletter = u'p' male : female = 12.5 : 1.0\n",
" lastletter = u'v' male : female = 11.2 : 1.0\n",
" lastletter = u'd' male : female = 9.6 : 1.0\n",
" lastletter = u'm' male : female = 9.0 : 1.0\n",
" count(v) = 2 female : male = 8.9 : 1.0\n",
" lastletter = u'o' male : female = 8.6 : 1.0\n",
" lastletter = u'w' male : female = 7.5 : 1.0\n",
" lastletter = u'r' male : female = 6.6 : 1.0\n",
" lastletter = u'g' male : female = 4.9 : 1.0\n",
" firstletter = u'w' male : female = 4.7 : 1.0\n",
" count(a) = 3 female : male = 4.5 : 1.0\n",
" count(w) = 1 male : female = 4.5 : 1.0\n",
" has(w) = True male : female = 4.5 : 1.0\n",
" lastletter = u't' male : female = 4.4 : 1.0\n",
" lastletter = u'b' male : female = 4.3 : 1.0\n",
" lastletter = u's' male : female = 4.0 : 1.0\n",
" lastletter = u'j' male : female = 3.9 : 1.0\n",
" lastletter = u'i' female : male = 3.7 : 1.0\n",
" lastletter = u'z' male : female = 3.6 : 1.0\n",
" count(o) = 2 male : female = 3.5 : 1.0\n",
" count(f) = 2 male : female = 3.3 : 1.0\n",
" firstletter = u'u' male : female = 3.3 : 1.0\n",
" count(e) = 3 female : male = 3.2 : 1.0\n",
" count(a) = 2 female : male = 3.1 : 1.0\n",
" count(w) = 2 male : female = 3.0 : 1.0\n",
" lastletter = u'u' male : female = 3.0 : 1.0\n",
" count(d) = 3 male : female = 2.8 : 1.0\n",
" count(i) = 3 male : female = 2.7 : 1.0\n",
" count(l) = 3 female : male = 2.6 : 1.0\n",
" count(u) = 2 male : female = 2.6 : 1.0\n",
" firstletter = u'q' male : female = 2.6 : 1.0\n",
" count(p) = 3 female : male = 2.6 : 1.0\n",
" count(y) = 2 female : male = 2.4 : 1.0\n",
" count(m) = 3 male : female = 2.4 : 1.0\n",
" firstletter = u'k' female : male = 2.3 : 1.0\n",
" firstletter = u'h' male : female = 2.2 : 1.0\n",
" count(h) = 2 male : female = 2.1 : 1.0\n",
" lastletter = u'n' male : female = 2.1 : 1.0\n",
" count(p) = 2 male : female = 2.0 : 1.0\n",
" count(k) = 2 female : male = 2.0 : 1.0\n",
" firstletter = u'x' male : female = 2.0 : 1.0\n",
" count(r) = 2 male : female = 1.9 : 1.0\n",
" count(i) = 2 female : male = 1.9 : 1.0\n",
" count(d) = 2 male : female = 1.9 : 1.0\n",
" lastletter = u'x' male : female = 1.9 : 1.0\n",
" count(n) = 3 female : male = 1.8 : 1.0\n",
" lastletter = u'l' male : female = 1.8 : 1.0\n",
" firstletter = u'z' male : female = 1.8 : 1.0\n",
" has(u) = True male : female = 1.8 : 1.0\n",
" count(u) = 1 male : female = 1.8 : 1.0\n",
" lastletter = u'e' female : male = 1.8 : 1.0\n",
" count(t) = 3 female : male = 1.8 : 1.0\n",
" firstletter = u'l' female : male = 1.8 : 1.0\n",
" count(p) = 1 male : female = 1.8 : 1.0\n",
" has(f) = True male : female = 1.7 : 1.0\n",
" has(p) = True male : female = 1.7 : 1.0\n",
" count(e) = 2 female : male = 1.7 : 1.0\n",
" count(n) = 2 female : male = 1.7 : 1.0\n",
" count(t) = 2 female : male = 1.7 : 1.0\n",
" count(h) = 3 male : female = 1.7 : 1.0\n",
" firstletter = u'c' female : male = 1.7 : 1.0\n",
" firstletter = u't' male : female = 1.6 : 1.0\n",
" count(l) = 2 female : male = 1.6 : 1.0\n",
" firstletter = u'y' male : female = 1.6 : 1.0\n",
" count(f) = 1 male : female = 1.6 : 1.0\n",
" has(a) = False male : female = 1.6 : 1.0\n",
" count(a) = 0 male : female = 1.6 : 1.0\n",
" has(o) = True male : female = 1.5 : 1.0\n",
" count(b) = 2 female : male = 1.5 : 1.0\n",
" lastletter = u'h' male : female = 1.5 : 1.0\n",
" count(v) = 1 male : female = 1.5 : 1.0\n",
" count(g) = 1 male : female = 1.5 : 1.0\n",
" has(g) = True male : female = 1.5 : 1.0\n",
" count(z) = 1 male : female = 1.4 : 1.0\n",
" firstletter = u'o' male : female = 1.4 : 1.0\n",
" count(m) = 2 male : female = 1.4 : 1.0\n",
" count(o) = 1 male : female = 1.4 : 1.0\n",
" firstletter = u'r' male : female = 1.4 : 1.0\n",
" firstletter = u'm' female : male = 1.4 : 1.0\n",
" has(z) = True male : female = 1.4 : 1.0\n",
" count(b) = 3 male : female = 1.4 : 1.0\n",
" has(i) = True female : male = 1.4 : 1.0\n",
" has(h) = True male : female = 1.4 : 1.0\n",
" firstletter = u'p' male : female = 1.4 : 1.0\n",
" count(s) = 2 female : male = 1.4 : 1.0\n",
" has(v) = True male : female = 1.4 : 1.0\n",
" has(a) = True female : male = 1.4 : 1.0\n",
" count(h) = 1 male : female = 1.3 : 1.0\n",
" count(i) = 1 female : male = 1.3 : 1.0\n",
" has(i) = False male : female = 1.3 : 1.0\n",
" count(i) = 0 male : female = 1.3 : 1.0\n",
" has(x) = True male : female = 1.3 : 1.0\n",
" count(t) = 1 male : female = 1.3 : 1.0\n",
" has(l) = True female : male = 1.3 : 1.0\n",
" count(s) = 1 male : female = 1.3 : 1.0\n",
" count(b) = 1 male : female = 1.3 : 1.0\n",
" firstletter = u's' male : female = 1.3 : 1.0\n",
"\n"
]
}
],
"prompt_number": 3
},
{
"cell_type": "heading",
"level": 6,
"metadata": {},
"source": [
"\ud2b9\uc9d5\uc774 \ub108\ubb34 \ub9ce\uace0 \ud559\uc2b5\uc14b(training set)\uc774 \uc791\uc744 \uacbd\uc6b0, overfitting\n",
"(\uacfc\uc801\ud569)\uc774 \uc0dd\uae38 \uc218 \uc788\uc73c\ub2c8, \uc870\uc2ec\ud558\uc790."
]
},
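{
"cell_type": "markdown",
"metadata": {},
"source": [
"A quick overfitting check (a minimal sketch, not in the original notebook): compare accuracy on the training set itself with accuracy on the held-out test set from the previous cell. A large gap suggests the many per-letter features are fitting noise in the training names."
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# (Sketch, not in the book) Overfitting check using classifier, train_set\n",
"# and test_set from the previous cell.\n",
"train_accuracy = nltk.classify.accuracy(classifier, train_set)\n",
"test_accuracy = nltk.classify.accuracy(classifier, test_set)\n",
"print('train accuracy:', train_accuracy)\n",
"print('test accuracy: ', test_accuracy)\n",
"print('gap (possible overfitting):', train_accuracy - test_accuracy)"
],
"language": "python",
"metadata": {},
"outputs": []
},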
{
"cell_type": "heading",
"level": 2,
"metadata": {},
"source": [
"error analysis (\uc624\ub958\ubd84\uc11d)"
]
},
{
"cell_type": "heading",
"level": 6,
"metadata": {},
"source": [
"\uc624\ub958 \ubd84\uc11d(error analysis)\ub97c \uc704\ud574\uc11c trianing set\uc744 training set, dev-test set\uc73c\ub85c \ubd84\ub9ac\ud568.<BR>\n",
"<img src=\"http://www.nltk.org/images/corpus-org.png\" width=\"700\">"
]
},
{
"cell_type": "heading",
"level": 6,
"metadata": {},
"source": [
"\uac1c\ubc1c\uc6a9 \ud14c\uc2a4\ud2b8\uc14b(dev-test)\uc744 \uc5ec\ub7ec\ubc8c \uc900\ube44\ud558\uc5ec, \uac01 \uac1c\ubc1c\uc6a9 \ud14c\uc2a4\ud2b8\uc14b\uc73c\ub85c \uac01\uac01 \uc624\ub958 \ubd84\uc11d\uc744 \uc218\ud589."
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"from __future__ import print_function, unicode_literals\n",
"from pprint import pprint\n",
"import nltk\n",
"from nltk.corpus import names as name2gender\n",
"import random\n",
"import sys\n",
"\n",
"\n",
"# \uc785\ub825\ub370\uc774\ud0c0 \uc0dd\uc131. (\uc774\ub984, \uc131\ubcc4)\n",
"# names = ([(name, 'male') for name in name2gender.words('male.txt')] + \\\n",
"# [(name,'female') for name in name2gender.words('female.txt')])\n",
"# random.shuffle(names)\n",
"\n",
"# \ud2b9\uc9d5 \ucd94\ucd9c \ud568\uc218 \uc815\uc758. (\uc774\ub984) -> (\uccab/\ub05d \ubb38\uc790, \ubb38\uc790\ubcc4 \uac1c\uc218, \ud3ec\ud568\uc5ec\ubd80)\n",
"def gender_features2(name):\n",
" features={}\n",
" features['firstletter']=name[0].lower()\n",
" features['lastletter']=name[-1].lower()\n",
" for letter in 'abcdefghijklmnopqrstuvwxyz':\n",
" features['count(%s)'%letter]=name.lower().count(letter)\n",
" features['has(%s)'%letter]=(letter in name.lower())\n",
" return features\n",
"\n",
"# \uc624\ub958\ubd84\uc11d\uc744 \uc704\ud574, \ud559\uc2b5\uc14b\uc744 \ubd84\ub9ac.\n",
"train_names=names[1500:] # \uac1c\ubc1c\uc6a9 \ud559\uc2b5\uc14b\n",
"devtest_names=names[500:1500] # \uac1c\ubc1c\uc6a9 \ud14c\uc2a4\ud2b8\uc14b 1000\uac1c\n",
"# test_names=names[:500] # \uc2e4\uc804\uc6a9 \ud14c\uc2a4\ud2b8\uc14b 500\uac1c\n",
"\n",
"# \ud559\uc2b5\uc14b\uacfc \ud14c\uc2a4\ud2b8\uc14b\uc744 \uc0dd\uc131. (\ud2b9\uc9d5, \uc131\ubcc4)\n",
"train_set = [(gender_features2(n), g) for (n,g) in train_names]\n",
"devtest_set = [(gender_features2(n), g) for (n,g) in devtest_names]\n",
"# test_set = [(gender_features2(n), g) for (n,g) in test_names]\n",
"\n",
"# \ubd84\ub958\uae30 \ud559\uc2b5\n",
"classifier = nltk.NaiveBayesClassifier.train(train_set)\n",
"\n",
"# \uc624\ub958 \uae30\ub85d\n",
"errors=[]\n",
"for(name, tag) in devtest_names:\n",
" guess=classifier.classify(gender_features(name))\n",
" if guess != tag:\n",
" errors.append((tag,guess,name))\n",
"\n",
"# \uc624\ub958 \ubd84\uc11d\n",
"print(\"error analysis (names ending with 'n')\")\n",
"for (tag, guess, name) in sorted(errors):\n",
" if name.endswith('n'):\n",
" print('correct=%-8s guess=%-8s name=%-30s' % (tag, guess, name)) # \uc815\ub2f5, \ubd84\ub958\uae30\uc758 \ucd94\uce21, \uc785\ub825\ub370\uc774\ud0c0(\uc774\ub984)"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"error analysis (names ending with 'n')\n",
"correct=male guess=female name=Alwin \n",
"correct=male guess=female name=Archon \n",
"correct=male guess=female name=Baron \n",
"correct=male guess=female name=Bjorn \n",
"correct=male guess=female name=Brandon \n",
"correct=male guess=female name=Clinton \n",
"correct=male guess=female name=Coleman \n",
"correct=male guess=female name=Darien \n",
"correct=male guess=female name=Darin \n",
"correct=male guess=female name=Darren \n",
"correct=male guess=female name=Donn \n",
"correct=male guess=female name=Elton \n",
"correct=male guess=female name=Erwin \n",
"correct=male guess=female name=Evan \n",
"correct=male guess=female name=Fabian \n",
"correct=male guess=female name=Ferguson \n",
"correct=male guess=female name=Gideon \n",
"correct=male guess=female name=Gretchen \n",
"correct=male guess=female name=Hanan \n",
"correct=male guess=female name=Hassan \n",
"correct=male guess=female name=Huntington \n",
"correct=male guess=female name=Juan \n",
"correct=male guess=female name=Ken \n",
"correct=male guess=female name=Lynn \n",
"correct=male guess=female name=Milton \n",
"correct=male guess=female name=Muffin \n",
"correct=male guess=female name=Nathan \n",
"correct=male guess=female name=Oran \n",
"correct=male guess=female name=Orion \n",
"correct=male guess=female name=Orrin \n",
"correct=male guess=female name=Patin \n",
"correct=male guess=female name=Quintin \n",
"correct=male guess=female name=Ramon \n",
"correct=male guess=female name=Reagan \n",
"correct=male guess=female name=Reuben \n",
"correct=male guess=female name=Reuven \n",
"correct=male guess=female name=Robin \n",
"correct=male guess=female name=Ron \n",
"correct=male guess=female name=Ronen \n",
"correct=male guess=female name=Shaughn \n",
"correct=male guess=female name=Shawn \n",
"correct=male guess=female name=Shimon \n",
"correct=male guess=female name=Simeon \n",
"correct=male guess=female name=Simon \n",
"correct=male guess=female name=Stan \n",
"correct=male guess=female name=Tarzan \n",
"correct=male guess=female name=Tedman \n",
"correct=male guess=female name=Torin \n",
"correct=male guess=female name=Trenton \n",
"correct=male guess=female name=Tristan \n",
"correct=male guess=female name=Tyson \n",
"correct=male guess=female name=Vaughan \n",
"correct=male guess=female name=Vernon \n",
"correct=male guess=female name=Washington \n",
"correct=male guess=female name=Waylen \n",
"correct=male guess=female name=Waylon \n",
"correct=male guess=female name=Weston \n",
"correct=male guess=female name=Weylin \n",
"correct=male guess=female name=Wilburn \n",
"correct=male guess=female name=Wyatan \n",
"correct=male guess=female name=Wynn \n",
"correct=male guess=female name=Zebulen \n",
"correct=male guess=female name=Zebulon \n"
]
}
],
"prompt_number": 4
},
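{
"cell_type": "markdown",
"metadata": {},
"source": [
"A minimal sketch (not in the original notebook) of the advice above: repeat the dev-test split several times with fresh shuffles, so the error patterns we study are not artifacts of one particular split. It reuses names and gender_features2 from the cells above."
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# (Sketch, not in the book) Repeat the evaluation over several random\n",
"# dev-test splits; stable error counts suggest the analysis generalizes.\n",
"import random\n",
"\n",
"for trial in range(3):\n",
"    shuffled = list(names)\n",
"    random.shuffle(shuffled)\n",
"    train_names2, devtest_names2 = shuffled[1000:], shuffled[:1000]\n",
"    train_set2 = [(gender_features2(n), g) for (n, g) in train_names2]\n",
"    classifier2 = nltk.NaiveBayesClassifier.train(train_set2)\n",
"    n_errors = sum(1 for (n, g) in devtest_names2\n",
"                   if classifier2.classify(gender_features2(n)) != g)\n",
"    print('trial', trial, 'errors:', n_errors, 'of', len(devtest_names2))"
],
"language": "python",
"metadata": {},
"outputs": []
},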
{
"cell_type": "heading",
"level": 2,
"metadata": {},
"source": [
"Document Classification (\ubb38\uc11c \ubd84\ub958)"
]
},
{
"cell_type": "heading",
"level": 6,
"metadata": {},
"source": [
"\ubb38\uc11c \ubd84\ub958 (\uc601\ud654 \ub9ac\ubdf0\ub97c \ud1b5\ud55c \uac10\uc131 \ubd84\uc11d)"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"from __future__ import print_function, unicode_literals\n",
"from pprint import pprint\n",
"from nltk.corpus import movie_reviews\n",
"import random\n",
"import nltk\n",
"\n",
"\n",
"print(\"movie_reviews.categories():\", movie_reviews.categories()) # category\ub294 pos or neg\n",
"print(\"movie_reviews.fileids('pos'):\", movie_reviews.fileids('pos')[:10], \"...\")\n",
"print()\n",
"\n",
"# \uc785\ub825 \ub370\uc774\ud0c0 \uc0dd\uc131 (\ubb38\uc11c, \uae0d\uc815/\ubd80\uc815)\n",
"documents = [(list(movie_reviews.words(fileid)), category)\n",
" for category in movie_reviews.categories()\n",
" for fileid in movie_reviews.fileids(category)]\n",
"# random.shuffle(documents)\n",
"\n",
"print(\"documents[0]:\", documents[0][0][:10], \"...\", documents[0][1])\n",
"print()\n",
"\n",
"all_words = nltk.FreqDist(w.lower() for w in movie_reviews.words()) # \ubb38\uc11c\uc5d0 \ucd9c\ud604\ub41c \ub2e8\uc5b4 \ubaa9\ub85d\uc744 \ube48\ub3c4\uc21c\uc73c\ub85c \uc815\ub82c\n",
"print(\"len(all_words):\", len(all_words))\n",
"\n",
"print(\"nltk.__version__:\", nltk.__version__)\n",
"if nltk.__version__.startswith('3.'):\n",
" word_features = [k for (k,v) in all_words.most_common(2000)] # \uc790\uc8fc \ucd9c\ud604\ud55c \ub2e8\uc5b4 \ubaa9\ub85d (for nltk 3.x)\n",
"else:\n",
" word_features = all_words.keys()[:2000] # \uc790\uc8fc \ucd9c\ud604\ud55c \ub2e8\uc5b4 \ubaa9\ub85d (for nltk 2.x)\n",
"\n",
"print(\"word_features:\", word_features[:10], \"...\")\n",
"\n",
"# \ud2b9\uc9d5 \ucd94\ucd9c \ud568\uc218 \uc815\uc758. (\ubb38\uc11c) -> (\ub2e8\uc5b4\ud3ec\ud568 \uc5ec\ubd80)\n",
"def document_features(document): \n",
" document_words = set(document)\n",
" features = {}\n",
" for word in word_features:\n",
" features['contains(%s)' % word] = (word in document_words) # word in set \uc774 word in list \ubcf4\ub2e4 \ube60\ub974\ub2e4. (4\uc7a5 \ucc38\uc870)\n",
" return features\n",
"pprint(document_features(movie_reviews.words('pos/cv957_8737.txt')).items()[:10])\n",
"print()\n",
"\n",
"# \ud559\uc2b5\uc14b, \ud14c\uc2a4\ud2b8\uc14b \uc0dd\uc131 (\ud2b9\uc9d5, \uae0d\uc815/\ubd80\uc815)\n",
"featuresets = [(document_features(doc), category) for (doc, category) in documents]\n",
"train_set, test_set = featuresets[100:], featuresets[:100]\n",
"print(\"featuresets[0]:\", featuresets[0][0].items()[:20], \"...\", featuresets[0][1])\n",
"print()\n",
"\n",
"# \ubd84\ub958\uae30 \ud559\uc2b5\n",
"classifier = nltk.NaiveBayesClassifier.train(train_set)\n",
"\n",
"# \ubd84\ub958\uae30 \ud3c9\uac00\n",
"print('accuracy:', nltk.classify.accuracy(classifier, test_set))\n",
"print(classifier.show_most_informative_features(5))"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"movie_reviews.categories(): [u'neg', u'pos']\n",
"movie_reviews.fileids('pos'): [u'pos/cv000_29590.txt', u'pos/cv001_18431.txt', u'pos/cv002_15918.txt', u'pos/cv003_11664.txt', u'pos/cv004_11636.txt', u'pos/cv005_29443.txt', u'pos/cv006_15448.txt', u'pos/cv007_4968.txt', u'pos/cv008_29435.txt', u'pos/cv009_29592.txt'] ...\n",
"\n",
"documents[0]:"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
" [u'plot', u':', u'two', u'teen', u'couples', u'go', u'to', u'a', u'church', u'party'] ... neg\n",
"\n",
"len(all_words):"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
" 39768\n",
"nltk.__version__: 3.0.1\n",
"word_features: [u',', u'the', u'.', u'a', u'and', u'of', u'to', u\"'\", u'is', u'in'] ...\n",
"[(u'contains(waste)', False),\n",
" (u'contains(lot)', False),\n",
" (u'contains(*)', True),\n",
" (u'contains(black)', False),\n",
" (u'contains(rated)', False),\n",
" (u'contains(potential)', False),\n",
" (u'contains(m)', False),\n",
" (u'contains(understand)', False),\n",
" (u'contains(drug)', True),\n",
" (u'contains(case)', False)]\n",
"\n",
"featuresets[0]:"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
" [(u'contains(waste)', False), (u'contains(lot)', False), (u'contains(*)', False), (u'contains(black)', False), (u'contains(rated)', False), (u'contains(potential)', False), (u'contains(m)', False), (u'contains(understand)', False), (u'contains(drug)', False), (u'contains(case)', False), (u'contains(created)', False), (u'contains(kiss)', False), (u'contains(needed)', False), (u'contains(c)', False), (u'contains(about)', True), (u'contains(toy)', False), (u'contains(longer)', False), (u'contains(ready)', False), (u'contains(certainly)', False), (u'contains(lame)', False)] ... neg\n",
"\n",
"accuracy:"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
" 0.86\n",
"Most Informative Features\n",
" contains(outstanding) = True pos : neg = 10.4 : 1.0\n",
" contains(seagal) = True neg : pos = 8.7 : 1.0\n",
" contains(mulan) = True pos : neg = 8.1 : 1.0\n",
" contains(wonderfully) = True pos : neg = 6.3 : 1.0\n",
" contains(damon) = True pos : neg = 5.7 : 1.0\n",
"None\n"
]
}
],
"prompt_number": 5
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"import gc; gc.collect() # release memory."
],
"language": "python",
"metadata": {},
"outputs": [
{
"metadata": {},
"output_type": "pyout",
"prompt_number": 6,
"text": [
"0"
]
}
],
"prompt_number": 6
},
{
"cell_type": "heading",
"level": 6,
"metadata": {},
"source": [
"\ubb38\uc11c \ubd84\ub958 (\uc601\ud654 \ub9ac\ubdf0\ub97c \ud1b5\ud55c \uac10\uc131 \ubd84\uc11d) #2 (\uad50\uc7ac\uc5d0 \uc5c6\uc74c) = \ubaa8\ub4e0 \ub2e8\uc5b4\uac00 \uc544\ub2cc \uc774\ub984(\ubc30\uc6b0)\ub85c\ub9cc \ud2b9\uc9d5\uc744 \ucd94\ucd9c\ud558\uba74 \uc5b4\ub5bb\uac8c \ub420\uae4c?"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"from __future__ import print_function, unicode_literals\n",
"from pprint import pprint\n",
"import nltk\n",
"from nltk.corpus import movie_reviews\n",
"from nltk.corpus import names as name2gender\n",
"import random\n",
"\n",
"# \uc785\ub825 \ub370\uc774\ud0c0 \uc0dd\uc131 (\ubb38\uc11c, \uae0d\uc815/\ubd80\uc815)\n",
"# documents = [(list(movie_reviews.words(fileid)), category)\n",
"# for category in movie_reviews.categories()\n",
"# for fileid in movie_reviews.fileids(category)]\n",
"# random.shuffle(documents)\n",
"\n",
"all_words = nltk.FreqDist(w.lower() for w in movie_reviews.words())\n",
"# print(\"len(all_words):\", len(all_words))\n",
"\n",
"_names = set([name.lower() for name in name2gender.words('male.txt')] + \\\n",
"[name.lower() for name in name2gender.words('female.txt')]) # \uc774\ub984 \ubaa9\ub85d\n",
" \n",
"if nltk.__version__.startswith('3.'): \n",
" actor_names = [name.lower() for (name,v) in all_words.most_common() if name in _names] # \uc601\ud654 \ub9ac\ubdf0\uc548\uc5d0 \uc788\ub294 \uc774\ub984 \ubaa9\ub85d.\n",
"else:\n",
" actor_names = [name.lower() for (name,v) in all_words.keys() if name in _names] # \uc601\ud654 \ub9ac\ubdf0\uc548\uc5d0 \uc788\ub294 \uc774\ub984 \ubaa9\ub85d.\n",
" \n",
"actor_names = actor_names[:2000] # \uc774\uc804 \ubd84\uc11d\uacfc \uc870\uac74\uc744 \uac19\uac8c \ud558\uae30 \uc704\ud574, feature(\uc774\ub984) \uac1c\uc218\ub97c 2000\uac1c\ub85c \uc81c\ud55c.\n",
"print(\"len(actor_names):\", len(actor_names), actor_names[:100], \"...\")\n",
"print('jolie in actor_names:', 'jolie' in actor_names)\n",
"print()\n",
"\n",
"# \ud2b9\uc9d5 \ucd94\ucd9c \ud568\uc218 \uc815\uc758. (\ubb38\uc11c) -> (\ubc30\uc6b0\uc774\ub984 \ud3ec\ud568\uc5ec\ubd80)\n",
"def document_features2(document): \n",
" document_words = set(document)\n",
" features = {}\n",
" for word in actor_names:\n",
" features['contains(%s)' % word] = (word in document_words) # word in set \uc774 word in list \ubcf4\ub2e4 \ube60\ub974\ub2e4. (4\uc7a5 \ucc38\uc870)\n",
" return features\n",
"\n",
"# \ud559\uc2b5\uc14b, \ud14c\uc2a4\ud2b8\uc14b \uc0dd\uc131 (\ud2b9\uc9d5, \uae0d\uc815/\ubd80\uc815)\n",
"featuresets = [(document_features2(doc), category) for (doc, category) in documents]\n",
"train_set, test_set = featuresets[100:], featuresets[:100]\n",
"print(\"featuresets[0]:\", featuresets[0][0].items()[:20], \"...\", featuresets[0][1])\n",
"print()\n",
"\n",
"# \ubd84\ub958\uae30 \ud559\uc2b5\n",
"classifier = nltk.NaiveBayesClassifier.train(train_set)\n",
"\n",
"# \ubd84\ub958\uae30 \ud3c9\uac00\n",
"print('accuracy:', nltk.classify.accuracy(classifier, test_set))\n",
"print(classifier.show_most_informative_features(5))"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"len(actor_names): 2000 [u'will', u'see', u'way', u'don', u'love', u'may', u'say', u'john', u'star', u'guy', u'job', u'james', u'case', u'michael', u'town', u'david', u'else', u'son', u'kevin', u'joe', u'worth', u'jack', u'major', u'robert', u'jackie', u'tom', u'lee', u'peter', u'hope', u'king', u'oscar', u'saw', u'paul', u'van', u'smith', u'george', u'chance', u'chris', u'happy', u'art', u'robin', u'ryan', u'william', u'ben', u'red', u'rock', u'rich', u'jim', u'harry', u'bob', u'bill', u'sam', u'martin', u'murphy', u'mark', u'scott', u'truman', u'cameron', u'bruce', u'frank', u'richard', u'chase', u'carter', u'fan', u'haven', u'allen', u'tim', u'west', u'park', u'steve', u'eddie', u'chan', u'max', u'woody', u'wait', u'simon', u'mary', u'steven', u'nick', u'willis', u'grace', u'mike', u'carry', u'sean', u'french', u'jerry', u'jackson', u'tarzan', u'pace', u'trip', u'billy', u'julia', u'la', u'christopher', u'matthew', u'danny', u'win', u'fox', u'julie', u'jennifer'] ...\n",
"jolie in actor_names: True\n",
"\n",
"featuresets[0]:"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
" [(u'contains(andre)', False), (u'contains(malka)', False), (u'contains(torry)', False), (u'contains(terence)', False), (u'contains(shaw)', False), (u'contains(lex)', False), (u'contains(philippe)', False), (u'contains(jolie)', False), (u'contains(rea)', False), (u'contains(petra)', False), (u'contains(di)', False), (u'contains(nanni)', False), (u'contains(case)', False), (u'contains(laure)', False), (u'contains(lydia)', False), (u'contains(rick)', False), (u'contains(mathilda)', False), (u'contains(kelsey)', False), (u'contains(chip)', False), (u'contains(leila)', False)] ... neg\n",
"\n",
"accuracy:"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
" 0.65\n",
"Most Informative Features\n",
" contains(hudson) = True neg : pos = 10.7 : 1.0\n",
" contains(elliot) = True pos : neg = 9.9 : 1.0\n",
" contains(ivy) = True neg : pos = 7.8 : 1.0\n",
" contains(terri) = True neg : pos = 7.8 : 1.0\n",
" contains(hugo) = True pos : neg = 6.9 : 1.0\n",
"None\n"
]
}
],
"prompt_number": 7
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"import gc; gc.collect() # release memory."
],
"language": "python",
"metadata": {},
"outputs": [
{
"metadata": {},
"output_type": "pyout",
"prompt_number": 8,
"text": [
"0"
]
}
],
"prompt_number": 8
},
{
"cell_type": "heading",
"level": 2,
"metadata": {},
"source": [
"Part-of-Speech Tagging (\ud488\uc0ac \ubd80\ucc29)"
]
},
{
"cell_type": "heading",
"level": 6,
"metadata": {},
"source": [
"brown corpus pos tags: http://en.wikipedia.org/wiki/Brown_Corpus#Part-of-speech_tags_used"
]
},
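{
"cell_type": "markdown",
"metadata": {},
"source": [
"A small aside (a sketch, not in the original notebook): nltk can also print the documentation for Brown tags matching a regular expression, assuming the 'tagsets' data package has been installed via nltk.download('tagsets')."
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# (Sketch, not in the book) Look up Brown tag definitions locally;\n",
"# requires the nltk 'tagsets' data package.\n",
"import nltk\n",
"nltk.help.brown_tagset('NNS')"
],
"language": "python",
"metadata": {},
"outputs": []
},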
{
"cell_type": "code",
"collapsed": false,
"input": [
"from __future__ import print_function, unicode_literals\n",
"from pprint import pprint\n",
"import nltk\n",
"from nltk.corpus import brown\n",
"\n",
"\n",
"# \ud2b9\uc9d5 \ucd94\ucd9c \ud568\uc218 \uc815\uc758 (\ub2e8\uc5b4) -> (\uc811\ubbf8\uc0ac)\n",
"suffix_fdist = nltk.FreqDist()\n",
"print(\"len(brown.words()):\", len(brown.words()))\n",
"for word in brown.words()[:100000]: # \uba54\ubaa8\ub9ac\ub3c4 \ub9ce\uc774 \uc0ac\uc6a9\ud558\uace0, \ub108\ubb34 \uc624\ub798 \uac78\ub824\uc11c, \ub370\uc774\ud0c0\ub97c \uc77c\ubd80\ub9cc \uc0ac\uc6a9\ud568.\n",
" word = word.lower()\n",
" suffix_fdist[word[-1:]] += 1\n",
" suffix_fdist[word[-2:]] += 1\n",
" suffix_fdist[word[-3:]] += 1\n",
"print(\"nltk.__version__:\", nltk.__version__)\n",
"if nltk.__version__.startswith('3.'): \n",
" common_suffixes = [k for (k,v) in suffix_fdist.most_common(100)] # for nltk 3.x \n",
"else:\n",
" common_suffixes = suffix_fdist.keys()[:100] # for nltk 2.x\n",
"suffix_fdist=None\n",
"print(\"common_suffixes:\", common_suffixes) \n",
"print()\n",
"\n",
"def pos_features(word):\n",
" features = {}\n",
" for suffix in common_suffixes:\n",
" features['endswith(%s)' % suffix] = word.lower().endswith(suffix)\n",
" return features\n",
" \n",
"# \ud14c\uc2a4\ud2b8\uc6a9\n",
"def pos_features_print(word): # True\uc778 feature\ub9cc \ucd9c\ub825\ud568. (\uad50\uc7ac\uc5d0 \uc5c6\uc74c)\n",
" print(\"pos_features('\"+word+\"'):\", [(k, v) for (k, v) in pos_features(word).items() if v is True])\n",
" \n",
"pos_features_print('studied') \n",
"print()\n",
"\n",
"# \uc785\ub825\ub370\uc774\ud0c0 \uc0dd\uc131 (\ub2e8\uc5b4, \ud488\uc0ac)\n",
"tagged_words = brown.tagged_words(categories='news')\n",
"print(\"len(tagged_words):\", len(tagged_words))\n",
"tagged_words = tagged_words[:10000] # \uba54\ubaa8\ub9ac\ub3c4 \ub9ce\uc774 \uc0ac\uc6a9\ud558\uace0, \ub108\ubb34 \uc624\ub798 \uac78\ub824\uc11c, \ub370\uc774\ud0c0\ub97c \uc77c\ubd80\ub9cc \uc0ac\uc6a9\ud568.\n",
"print(\"tagged_words:\", tagged_words[:10], \"...\")\n",
"print()\n",
"\n",
"# \ud559\uc2b5\uc14b, \ud14c\uc2a4\ud2b8\uc14b \uc0dd\uc131 (\ud2b9\uc9d5, \ud488\uc0ac)\n",
"featuresets = [(pos_features(word), tag) for (word, tag) in tagged_words]\n",
"size = int(len(featuresets) * 0.1) # test set size\n",
"train_set, test_set = featuresets[size:], featuresets[:size]\n",
"tagged_words = None\n",
"print(\"featuresets:\")\n",
"pprint(featuresets[0])\n",
"print()\n",
"\n",
"# \ubd84\ub958\uae30 \ud559\uc2b5\n",
"classifier = nltk.DecisionTreeClassifier.train(train_set)\n",
"\n",
"# \ubd84\ub958\uae30 \ud3c9\uac00\n",
"print(\"accuracy:\", nltk.classify.accuracy(classifier, test_set))\n",
"print()\n",
"\n",
"# \ubd84\ub958\uae30 \ud14c\uc2a4\ud2b8\n",
"print(\"classifier.classify(pos_features('cats')):\", classifier.classify(pos_features('cats'))) # NNS = plural noun\n",
"print()\n",
"\n",
"print(classifier.pseudocode(depth=4))\n",
"print(classifier.pp(depth=4))\n"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"len(brown.words()): 1161192\n",
"nltk.__version__:"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
" 3.0.1\n",
"common_suffixes: [u'e', u',', u'.', u's', u'd', u'n', u't', u'he', u'a', u'the', u'of', u'r', u'y', u'to', u'in', u'o', u'ed', u'on', u'f', u'l', u'nd', u'er', u'g', u'ng', u'and', u'is', u'at', u'as', u'ing', u'h', u'es', u'or', u're', u'an', u'``', u\"''\", u'ion', u'al', u'm', u'nt', u'st', u'll', u'en', u'it', u'be', u'ly', u'by', u'rs', u'th', u'ent', u'ts', u'for', u'k', u\"'\", u';', u'hat', u'le', u'ce', u'ay', u'ted', u'ld', u've', u'w', u'te', u'me', u'ry', u'his', u'se', u'ns', u'ut', u'`', u'ch', u'was', u'i', u\"'s\", u'ers', u'ere', u'id', u'ty', u'--', u'ith', u'ne', u'ter', u'her', u'ill', u'p', u')', u'(', u'ey', u'0', u'ate', u'aid', u'ar', u'day', u'ad', u':', u'et', u'om', u'nce', u's.']\n",
"\n",
"pos_features('studied'): [(u'endswith(d)', True), (u'endswith(ed)', True)]\n",
"\n",
"len(tagged_words):"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
" 100554\n",
"tagged_words: [(u'The', u'AT'), (u'Fulton', u'NP-TL'), (u'County', u'NN-TL'), (u'Grand', u'JJ-TL'), (u'Jury', u'NN-TL'), (u'said', u'VBD'), (u'Friday', u'NR'), (u'an', u'AT'), (u'investigation', u'NN'), (u'of', u'IN')] ...\n",
"\n",
"featuresets:"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"({u\"endswith('')\": False,\n",
" u\"endswith(')\": False,\n",
" u\"endswith('s)\": False,\n",
" u'endswith(()': False,\n",
" u'endswith())': False,\n",
" u'endswith(,)': False,\n",
" u'endswith(--)': False,\n",
" u'endswith(.)': False,\n",
" u'endswith(0)': False,\n",
" u'endswith(:)': False,\n",
" u'endswith(;)': False,\n",
" u'endswith(`)': False,\n",
" u'endswith(``)': False,\n",
" u'endswith(a)': False,\n",
" u'endswith(ad)': False,\n",
" u'endswith(aid)': False,\n",
" u'endswith(al)': False,\n",
" u'endswith(an)': False,\n",
" u'endswith(and)': False,\n",
" u'endswith(ar)': False,\n",
" u'endswith(as)': False,\n",
" u'endswith(at)': False,\n",
" u'endswith(ate)': False,\n",
" u'endswith(ay)': False,\n",
" u'endswith(be)': False,\n",
" u'endswith(by)': False,\n",
" u'endswith(ce)': False,\n",
" u'endswith(ch)': False,\n",
" u'endswith(d)': False,\n",
" u'endswith(day)': False,\n",
" u'endswith(e)': True,\n",
" u'endswith(ed)': False,\n",
" u'endswith(en)': False,\n",
" u'endswith(ent)': False,\n",
" u'endswith(er)': False,\n",
" u'endswith(ere)': False,\n",
" u'endswith(ers)': False,\n",
" u'endswith(es)': False,\n",
" u'endswith(et)': False,\n",
" u'endswith(ey)': False,\n",
" u'endswith(f)': False,\n",
" u'endswith(for)': False,\n",
" u'endswith(g)': False,\n",
" u'endswith(h)': False,\n",
" u'endswith(hat)': False,\n",
" u'endswith(he)': True,\n",
" u'endswith(her)': False,\n",
" u'endswith(his)': False,\n",
" u'endswith(i)': False,\n",
" u'endswith(id)': False,\n",
" u'endswith(ill)': False,\n",
" u'endswith(in)': False,\n",
" u'endswith(ing)': False,\n",
" u'endswith(ion)': False,\n",
" u'endswith(is)': False,\n",
" u'endswith(it)': False,\n",
" u'endswith(ith)': False,\n",
" u'endswith(k)': False,\n",
" u'endswith(l)': False,\n",
" u'endswith(ld)': False,\n",
" u'endswith(le)': False,\n",
" u'endswith(ll)': False,\n",
" u'endswith(ly)': False,\n",
" u'endswith(m)': False,\n",
" u'endswith(me)': False,\n",
" u'endswith(n)': False,\n",
" u'endswith(nce)': False,\n",
" u'endswith(nd)': False,\n",
" u'endswith(ne)': False,\n",
" u'endswith(ng)': False,\n",
" u'endswith(ns)': False,\n",
" u'endswith(nt)': False,\n",
" u'endswith(o)': False,\n",
" u'endswith(of)': False,\n",
" u'endswith(om)': False,\n",
" u'endswith(on)': False,\n",
" u'endswith(or)': False,\n",
" u'endswith(p)': False,\n",
" u'endswith(r)': False,\n",
" u'endswith(re)': False,\n",
" u'endswith(rs)': False,\n",
" u'endswith(ry)': False,\n",
" u'endswith(s)': False,\n",
" u'endswith(s.)': False,\n",
" u'endswith(se)': False,\n",
" u'endswith(st)': False,\n",
" u'endswith(t)': False,\n",
" u'endswith(te)': False,\n",
" u'endswith(ted)': False,\n",
" u'endswith(ter)': False,\n",
" u'endswith(th)': False,\n",
" u'endswith(the)': True,\n",
" u'endswith(to)': False,\n",
" u'endswith(ts)': False,\n",
" u'endswith(ty)': False,\n",
" u'endswith(ut)': False,\n",
" u'endswith(ve)': False,\n",
" u'endswith(w)': False,\n",
" u'endswith(was)': False,\n",
" u'endswith(y)': False},\n",
" u'AT')\n",
"\n",
"accuracy:"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
" 0.68\n",
"\n",
"classifier.classify(pos_features('cats')): NNS\n",
"\n",
"if endswith(he) == False: \n",
" if endswith(s) == False: \n",
" if endswith(,) == False: \n",
" if endswith(.) == False: return u'``'\n",
" if endswith(.) == True: return u'.'\n",
" if endswith(,) == True: return u','\n",
" if endswith(s) == True: \n",
" if endswith(was) == False: \n",
" if endswith(is) == False: return u'NN'\n",
" if endswith(is) == True: return u'BEZ'\n",
" if endswith(was) == True: return u'BEDZ'\n",
"if endswith(he) == True: \n",
" if endswith(the) == False: return u'PPS'\n",
" if endswith(the) == True: return u'AT'\n",
"\n",
"endswith(he)=False? ................................... ``\n",
" endswith(s)=False? .................................. ``\n",
" endswith(,)=False? ................................ ``\n",
" endswith(.)=False? .............................. ``\n",
" endswith(.)=True? ............................... .\n",
" endswith(,)=True? ................................. ,\n",
" endswith(s)=True? ................................... NN\n",
" endswith(was)=False? .............................. NN\n",
" endswith(is)=False? ............................. NN\n",
" endswith(is)=True? .............................. BEZ\n",
" endswith(was)=True? ............................... BEDZ\n",
"endswith(he)=True? .................................... AT\n",
" endswith(the)=False? ................................ PPS\n",
" endswith(the)=True? ................................. AT\n",
"\n"
]
}
],
"prompt_number": 9
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"import gc; gc.collect() # release memory."
],
"language": "python",
"metadata": {},
"outputs": [
{
"metadata": {},
"output_type": "pyout",
"prompt_number": 10,
"text": [
"0"
]
}
],
"prompt_number": 10
},
{
"cell_type": "heading",
"level": 2,
"metadata": {},
"source": [
"Exploiting Context (\ubb38\ub9e5 \ud65c\uc6a9)"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"from __future__ import print_function, unicode_literals \n",
"from pprint import pprint\n",
"import nltk\n",
"from nltk.corpus import brown\n",
"\n",
"\n",
"# \ud2b9\uc9d5 \ucd94\ucd9c \ud568\uc218 \uc815\uc758 (\ub2e8\uc5b4) -> (\uc811\ubbf8\uc0ac, \uc55e\ub2e8\uc5b4)\n",
"def pos_features(sentence, i):\n",
" features = {\"suffix(1)\": sentence[i][-1:],\n",
" \"suffix(2)\": sentence[i][-2:],\n",
" \"suffix(3)\": sentence[i][-3:]}\n",
" if i == 0:\n",
" features[\"prev-word\"] = \"<START>\"\n",
" else:\n",
" features[\"prev-word\"] = sentence[i-1]\n",
" return features\n",
"\n",
"print(\"brown.sents()[0][7]:\", brown.sents()[0][7])\n",
"print(\"brown.sents()[0][8]:\", brown.sents()[0][8])\n",
"print(\"pos_features(brown.sents()[0], 8):\", pos_features(brown.sents()[0], 8))\n",
"print()\n",
"\n",
"# \uc785\ub825\ub370\uc774\ud0c0 \uc0dd\uc131 (\ub2e8\uc5b4, \ud488\uc0ac)\n",
"tagged_sents = brown.tagged_sents(categories='news')\n",
"print(\"tagged_sents[0]:\", tagged_sents[0])\n",
"print(\"nltk.tag.untag(tagged_sents[0]):\", nltk.tag.untag(tagged_sents[0]))\n",
"print()\n",
"\n",
"# \ud559\uc2b5\uc14b, \ud14c\uc2a4\ud2b8\uc14b \uc0dd\uc131 (\ud2b9\uc9d5, \ud488\uc0ac)\n",
"featuresets = []\n",
"for tagged_sent in tagged_sents:\n",
" untagged_sent = nltk.tag.untag(tagged_sent)\n",
" for i, (word, tag) in enumerate(tagged_sent):\n",
" featuresets.append( (pos_features(untagged_sent, i), tag) )\n",
"size = int(len(featuresets) * 0.1)\n",
"train_set, test_set = featuresets[size:], featuresets[:size]\n",
"print(\"train_set[0]:\", train_set[0])\n",
"\n",
"# \ubd84\ub958\uae30 \ud559\uc2b5\n",
"classifier = nltk.NaiveBayesClassifier.train(train_set)\n",
"print(\"featuresets[0]:\", featuresets[0])\n",
"print()\n",
"\n",
"# \ubd84\ub958\uae30 \ud3c9\uac00\n",
"print(\"accuracy:\", nltk.classify.accuracy(classifier, test_set))\n",
"print()"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"brown.sents()[0][7]: an\n",
"brown.sents()[0][8]: investigation\n",
"pos_features(brown.sents()[0], 8): {u'suffix(3)': u'ion', u'prev-word': u'an', u'suffix(2)': u'on', u'suffix(1)': u'n'}\n",
"\n",
"tagged_sents[0]: [(u'The', u'AT'), (u'Fulton', u'NP-TL'), (u'County', u'NN-TL'), (u'Grand', u'JJ-TL'), (u'Jury', u'NN-TL'), (u'said', u'VBD'), (u'Friday', u'NR'), (u'an', u'AT'), (u'investigation', u'NN'), (u'of', u'IN'), (u\"Atlanta's\", u'NP$'), (u'recent', u'JJ'), (u'primary', u'NN'), (u'election', u'NN'), (u'produced', u'VBD'), (u'``', u'``'), (u'no', u'AT'), (u'evidence', u'NN'), (u\"''\", u\"''\"), (u'that', u'CS'), (u'any', u'DTI'), (u'irregularities', u'NNS'), (u'took', u'VBD'), (u'place', u'NN'), (u'.', u'.')]\n",
"nltk.tag.untag(tagged_sents[0]): [u'The', u'Fulton', u'County', u'Grand', u'Jury', u'said', u'Friday', u'an', u'investigation', u'of', u\"Atlanta's\", u'recent', u'primary', u'election', u'produced', u'``', u'no', u'evidence', u\"''\", u'that', u'any', u'irregularities', u'took', u'place', u'.']\n",
"\n",
"train_set[0]:"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
" ({u'suffix(3)': u'our', u'prev-word': u'of', u'suffix(2)': u'ur', u'suffix(1)': u'r'}, u'PP$')\n",
"featuresets[0]:"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
" ({u'suffix(3)': u'The', u'prev-word': u'<START>', u'suffix(2)': u'he', u'suffix(1)': u'e'}, u'AT')\n",
"\n",
"accuracy:"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
" 0.789159622079\n",
"\n"
]
}
],
"prompt_number": 11
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"?nltk.NaiveBayesClassifier"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 12
},
{
"cell_type": "heading",
"level": 2,
"metadata": {},
"source": [
"Sequence Classification"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"from __future__ import print_function, unicode_literals \n",
"from pprint import pprint\n",
"import nltk\n",
"from nltk.corpus import brown\n",
"\n",
"\n",
"# \ud2b9\uc9d5 \ucd94\ucd9c \ud568\uc218 \uc815\uc758 (\ub2e8\uc5b4) -> (\uc811\ubbf8\uc0ac, \uc55e\ub2e8\uc5b4, \uc55e\ud488\uc0ac)\n",
"def pos_features(sentence, i, history):\n",
" features = {\"suffix(1)\": sentence[i][-1:],\n",
" \"suffix(2)\": sentence[i][-2:],\n",
" \"suffix(3)\": sentence[i][-3:]}\n",
" if i == 0:\n",
" features[\"prev-word\"] = \"<START>\"\n",
" features[\"prev-tag\"] = \"<START>\"\n",
" else:\n",
" features[\"prev-word\"] = sentence[i-1]\n",
" features[\"prev-tag\"] = history[i-1]\n",
" return features\n",
"\n",
"# \ubd84\ub958\uae30 \uc815\uc758 (\ucd08\uae30\ud654\uc5d0\uc11c \uc790\ub3d9\uc73c\ub85c \ud559\uc2b5\ud568) \n",
"class ConsecutivePosTagger(nltk.TaggerI):\n",
" def __init__(self, train_sents):\n",
" train_set = []\n",
" for tagged_sent in train_sents:\n",
" untagged_sent = nltk.tag.untag(tagged_sent)\n",
" history = []\n",
" for i, (word, tag) in enumerate(tagged_sent):\n",
" featureset = pos_features(untagged_sent, i, history)\n",
" train_set.append( (featureset, tag) )\n",
" history.append(tag)\n",
" self.classifier = nltk.NaiveBayesClassifier.train(train_set)\n",
"\n",
" def tag(self, sentence):\n",
" history = []\n",
" for i, word in enumerate(sentence):\n",
" featureset = pos_features(sentence, i, history)\n",
" tag = self.classifier.classify(featureset)\n",
" history.append(tag)\n",
" return zip(sentence, history)\n",
"\n",
"# \uc785\ub825\ub370\uc774\ud0c0 \uc0dd\uc131 (\ub2e8\uc5b4, \ud488\uc0ac) \n",
"tagged_sents = brown.tagged_sents(categories='news')\n",
"\n",
"# \ud559\uc2b5\uc14b, \ud14c\uc2a4\ud2b8\uc14b \uc0dd\uc131 (\ud2b9\uc9d5, \ud488\uc0ac)\n",
"size = int(len(tagged_sents) * 0.1)\n",
"train_sents, test_sents = tagged_sents[size:], tagged_sents[:size]\n",
"tagger = ConsecutivePosTagger(train_sents)\n",
"\n",
"# \ubd84\ub958\uae30 \ud3c9\uac00\n",
"print(\"accuracy:\", tagger.evaluate(test_sents))"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"accuracy: 0.798052851182\n"
]
}
],
"prompt_number": 13
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"?nltk.TaggerI"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 17
},
{
"cell_type": "heading",
"level": 2,
"metadata": {},
"source": [
"Other Methods for Sequence Classification"
]
},
{
"cell_type": "heading",
"level": 6,
"metadata": {},
"source": [
"Hidden Markov Model (HMM) <BR>\n",
"Maximum Entropy Markov Model (MEMM) <BR>\n",
"Linear-Chain Conditional Random Field Model (CRF) <BR>"
]
},
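{
"cell_type": "markdown",
"metadata": {},
"source": [
"A minimal sketch (not in the original notebook): NLTK includes an HMM tagger, nltk.HiddenMarkovModelTagger, whose train() method estimates transition and emission probabilities from tagged sentences. Its handling of unseen words is weak without smoothing, so treat this as an illustration of the sequence-model idea rather than a tuned tagger."
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# (Sketch, not in the book) Train an HMM tagger on the same brown 'news'\n",
"# split used above and evaluate it on the held-out sentences.\n",
"from __future__ import print_function\n",
"import nltk\n",
"from nltk.corpus import brown\n",
"\n",
"tagged_sents = brown.tagged_sents(categories='news')\n",
"size = int(len(tagged_sents) * 0.1)\n",
"train_sents, test_sents = tagged_sents[size:], tagged_sents[:size]\n",
"\n",
"hmm_tagger = nltk.HiddenMarkovModelTagger.train(train_sents)\n",
"print('accuracy:', hmm_tagger.evaluate(test_sents))"
],
"language": "python",
"metadata": {},
"outputs": []
},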
{
"cell_type": "heading",
"level": 1,
"metadata": {},
"source": [
"6.2 Further Examples of Supervised Classification"
]
},
{
"cell_type": "heading",
"level": 2,
"metadata": {},
"source": [
"Sentence Segmentation"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"from __future__ import print_function, unicode_literals \n",
"from pprint import pprint\n",
"import nltk\n",
"\n",
"\n",
"# \uc785\ub825\ub370\uc774\ud0c0 \uc0dd\uc131 (\ub2e8\uc5b4 \ubaa9\ub85d, \uacbd\uacc4\uc704\uce58)\n",
"sents = nltk.corpus.treebank_raw.sents()\n",
"tokens = []\n",
"boundaries = set() # \ub04a\uc5b4\uc9c0\ub294 \ub2e8\uc5b4 \uc704\uce58. (0\ubd80\ud130 \uc2dc\uc791)\n",
"offset = 0\n",
"for sent in sents: \n",
" tokens.extend(sent)\n",
" offset += len(sent)\n",
" boundaries.add(offset-1)\n",
"print(\"len(sents):\", len(sents), sents[0:3], \"...\")\n",
"print()\n",
"print(\"len(tokens):\", len(tokens), tokens[0:30], \"...\")\n",
"print()\n",
"print(\"len(boundaries):\", len(boundaries), sorted(list(boundaries))[0:10], \"...\")\n",
"print()\n",
"\n",
"\n",
"# \ud2b9\uc9d5 \ucd94\ucd9c \ud568\uc218 \uc815\uc758 (\ub2e8\uc5b4\ubaa9\ub85d) -> (\ub2e4\uc74c\ub2e8\uc5b4\uc758 \ub300\ubb38\uc790\uc2dc\uc791\uc5ec\ubd80, \uc774\uc804\ub2e8\uc5b4, \ub2e8\uc5b4, \uc774\uc804\ub2e8\uc5b4\uac00 \ud55c \ubb38\uc790\uc778\uc9c0)\n",
"def punct_features(tokens, i): # by punctuation(\uad6c\ub450\uc810)\n",
" try:\n",
" return {'next-word-capitalized': tokens[i+1][0].isupper(),\n",
" 'prevword': tokens[i-1].lower(),\n",
" 'punct': tokens[i],\n",
" 'prev-word-is-one-char': len(tokens[i-1]) == 1}\n",
" except:\n",
" return {'next-word-capitalized': False,\n",
" 'prevword': '',\n",
" 'punct': tokens[i],\n",
" 'prev-word-is-one-char': False}\n",
" \n",
"featuresets = [(punct_features(tokens, i), (i in boundaries))\n",
" for i in range(1, len(tokens)-1)\n",
" if tokens[i] in '.?!']\n",
"print(\"featuresets:\", featuresets[0])\n",
"print()\n",
"\n",
"# \ud559\uc2b5\uc14b, \ud14c\uc2a4\ud2b8\uc14b \uc0dd\uc131 (\ud2b9\uc9d5, \ud488\uc0ac)\n",
"size = int(len(featuresets) * 0.1)\n",
"train_set, test_set = featuresets[size:], featuresets[:size]\n",
"print(\"train_set[0]:\", train_set[0])\n",
"print()\n",
"\n",
"# \ubd84\ub958\uae30 \ud559\uc2b5\n",
"classifier = nltk.NaiveBayesClassifier.train(train_set)\n",
"\n",
"# \ubd84\ub958\uae30 \ud3c9\uac00\n",
"print(\"accuracy:\", nltk.classify.accuracy(classifier, test_set))\n",
"print()\n",
"\n",
"# \ubb38\uc7a5 \ubd84\ub9ac\uae30\n",
"def segment_sentences(words):\n",
" start = 0\n",
" sents = []\n",
" for i, word in enumerate(words):\n",
" if word in '.?!' and classifier.classify(punct_features(words, i)) == True: \n",
" sents.append(words[start:i+1])\n",
" start = i+1\n",
" if start < len(words):\n",
" sents.append(words[start:])\n",
" return sents\n",
"\n",
"# \ubb38\uc7a5 \ubd84\ub9ac\uae30 \ud14c\uc2a4\ud2b8\n",
"sents = nltk.corpus.treebank_raw.sents()[:10]\n",
"words=[]\n",
"for s in sents:\n",
" words.extend(s)\n",
"# print(\"words:\", words)\n",
"# print()\n",
"print(\"correct:\\n\", '\\n'.join([' '.join(s) for s in sents ]))\n",
"print()\n",
"print(\"guess:\\n\", '\\n'.join([' '.join(s) for s in segment_sentences(words)]))\n",
"print()"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"len(sents): 4193 [[u'.', u'START'], [u'Pierre', u'Vinken', u',', u'61', u'years', u'old', u',', u'will', u'join', u'the', u'board', u'as', u'a', u'nonexecutive', u'director', u'Nov', u'.', u'29', u'.'], [u'Mr', u'.', u'Vinken', u'is', u'chairman', u'of', u'Elsevier', u'N', u'.', u'V', u'.,', u'the', u'Dutch', u'publishing', u'group', u'.']] ...\n",
"\n",
"len(tokens): 101797 [u'.', u'START', u'Pierre', u'Vinken', u',', u'61', u'years', u'old', u',', u'will', u'join', u'the', u'board', u'as', u'a', u'nonexecutive', u'director', u'Nov', u'.', u'29', u'.', u'Mr', u'.', u'Vinken', u'is', u'chairman', u'of', u'Elsevier', u'N', u'.'] ...\n",
"\n",
"len(boundaries): 4193 [1, 20, 36, 38, 64, 66, 102, 134, 163, 199] ...\n",
"\n",
"featuresets: ({u'next-word-capitalized': False, u'punct': u'.', u'prev-word-is-one-char': False, u'prevword': u'nov'}, False)\n",
"\n",
"train_set[0]: ({u'next-word-capitalized': True, u'punct': u'.', u'prev-word-is-one-char': False, u'prevword': u'popular'}, True)\n",
"\n",
"accuracy:"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
" 0.936026936027\n",
"\n",
"correct:\n",
" . START\n",
"Pierre Vinken , 61 years old , will join the board as a nonexecutive director Nov . 29 .\n",
"Mr . Vinken is chairman of Elsevier N . V ., the Dutch publishing group .\n",
". START\n",
"Rudolph Agnew , 55 years old and former chairman of Consolidated Gold Fields PLC , was named a nonexecutive director of this British industrial conglomerate .\n",
". START\n",
"A form of asbestos once used to make Kent cigarette filters has caused a high percentage of cancer deaths among a group of workers exposed to it more than 30 years ago , researchers reported .\n",
"The asbestos fiber , crocidolite , is unusually resilient once it enters the lungs , with even brief exposures to it causing symptoms that show up decades later , researchers said .\n",
"Lorillard Inc ., the unit of New York - based Loews Corp . that makes Kent cigarettes , stopped using crocidolite in its Micronite cigarette filters in 1956 .\n",
"Although preliminary findings were reported more than a year ago , the latest results appear in today ' s New England Journal of Medicine , a forum likely to bring new attention to the problem .\n",
"\n",
"guess:\n",
" . START Pierre Vinken , 61 years old , will join the board as a nonexecutive director Nov . 29 . Mr . Vinken is chairman of Elsevier N . V ., the Dutch publishing group .\n",
". START Rudolph Agnew , 55 years old and former chairman of Consolidated Gold Fields PLC , was named a nonexecutive director of this British industrial conglomerate . . START A form of asbestos once used to make Kent cigarette filters has caused a high percentage of cancer deaths among a group of workers exposed to it more than 30 years ago , researchers reported .\n",
"The asbestos fiber , crocidolite , is unusually resilient once it enters the lungs , with even brief exposures to it causing symptoms that show up decades later , researchers said .\n",
"Lorillard Inc ., the unit of New York - based Loews Corp . that makes Kent cigarettes , stopped using crocidolite in its Micronite cigarette filters in 1956 .\n",
"Although preliminary findings were reported more than a year ago , the latest results appear in today ' s New England Journal of Medicine , a forum likely to bring new attention to the problem .\n",
"\n"
]
}
],
"prompt_number": 14
},
{
"cell_type": "heading",
"level": 2,
"metadata": {},
"source": [
"Identifying Dialogue Act Types (\ud654\ud589 \uc885\ub958 \uc815\uc758)"
]
},
{
"cell_type": "heading",
"level": 6,
"metadata": {},
"source": [
"Act types: \"Statement,\" \"Emotion,\" \"ynQuestion\", and \"Continuer.\" "
]
},
{
"cell_type": "heading",
"level": 6,
"metadata": {},
"source": [
"Accept, Bye, Clarify, Continuer, Emotion, Emphasis, Greet, No Answer, Other, Reject, Statement, System, Wh-Question, Yes Answer, Yes/No Question."
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"from __future__ import print_function, unicode_literals \n",
"from pprint import pprint\n",
"import nltk\n",
"\n",
"\n",
"posts = nltk.corpus.nps_chat.xml_posts()[:10000]\n",
"print(\"posts[0]:\", posts[0].text)\n",
"print()\n",
"\n",
"def dialogue_act_features(post):\n",
" features = {}\n",
" for word in nltk.word_tokenize(post):\n",
" features['contains(%s)' % word.lower()] = True\n",
" return features\n",
"\n",
"featuresets = [(dialogue_act_features(post.text), post.get('class'))\n",
" for post in posts]\n",
"size = int(len(featuresets) * 0.1)\n",
"train_set, test_set = featuresets[size:], featuresets[:size]\n",
"classifier = nltk.NaiveBayesClassifier.train(train_set)\n",
"print(\"featuresets[0]:\", featuresets[0])\n",
"print()\n",
"\n",
"print(\"accuracy:\", nltk.classify.accuracy(classifier, test_set))\n",
"print(classifier.classify(dialogue_act_features(\"My name is Hyewoong?\")))\n",
"print(classifier.classify(dialogue_act_features(\"What a beautiful girl?\")))\n",
"print(classifier.classify(dialogue_act_features(\"Do you want my love?\")))\n"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"posts[0]: now im left with this gay name\n",
"\n",
"featuresets[0]:"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
" ({u'contains(im)': True, u'contains(now)': True, u'contains(this)': True, u'contains(left)': True, u'contains(name)': True, u'contains(with)': True, u'contains(gay)': True}, 'Statement')\n",
"\n",
"accuracy:"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
" 0.668\n",
"whQuestion\n",
"whQuestion\n",
"ynQuestion\n"
]
}
],
"prompt_number": 15
},
{
"cell_type": "heading",
"level": 2,
"metadata": {},
"source": [
"Recognizing Textual Entailment (\ud14d\uc2a4\ud2b8 \ud568\uc758 \uc778\uc2dd)"
]
},
{
"cell_type": "heading",
"level": 6,
"metadata": {},
"source": [
"Challenge 3, Pair 34 (True) <BR>\n",
"<BR>\n",
"T: Parviz Davudi was representing Iran at a meeting of the Shanghai Co-operation Organisation (SCO), the fledgling association that binds Russia, China and four former Soviet republics of central Asia together to fight terrorism.<BR>\n",
"<BR>\n",
"H: China is a member of SCO.<BR>\n",
"<BR>\n",
"<BR>\n",
"<BR>\n",
"Challenge 3, Pair 81 (False)<BR>\n",
"<BR>\n",
"T: According to NC Articles of Organization, the members of LLC company are H. Nelson Beavers, III, H. Chester Beavers and Jennie Beavers Stewart.<BR>\n",
"<BR>\n",
"H: Jennie Beavers Stewart is a share-holder of Carolina Analytical Laboratory.<BR>"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"from __future__ import print_function, unicode_literals \n",
"from pprint import pprint\n",
"import nltk\n",
"\n",
"\n",
"def rte_features(rtepair):\n",
" extractor = nltk.RTEFeatureExtractor(rtepair)\n",
" features = {}\n",
" features['word_overlap'] = len(extractor.overlap('word'))\n",
" features['word_hyp_extra'] = len(extractor.hyp_extra('word'))\n",
" features['ne_overlap'] = len(extractor.overlap('ne'))\n",
" features['ne_hyp_extra'] = len(extractor.hyp_extra('ne'))\n",
" return features\n",
"\n",
"rtepair = nltk.corpus.rte.pairs(['rte3_dev.xml'])[33]\n",
"print(\"rtepair:\", rtepair.__dict__)\n",
"print()\n",
"print(\"text:\", rtepair.text)\n",
"print()\n",
"print(\"hypothesis(=keyword) :\", rtepair.hyp)\n",
"print()\n",
"\n",
"extractor = nltk.RTEFeatureExtractor(rtepair)\n",
"print(\"text_words:\", extractor.text_words) \n",
"print(\"overlap('word'):\", extractor.overlap('word'))\n",
"print(\"overlap('ne')\", extractor.overlap('ne'))\n",
"print(\"hyp_words:\", extractor.hyp_words)\n",
"print(\"hyp_extra('word'):\", extractor.hyp_extra('word'))\n",
"?nltk.RTEFeatureExtractor"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"rtepair: {'task': 'IE', 'text': 'Parviz Davudi was representing Iran at a meeting of the Shanghai Co-operation Organisation (SCO), the fledgling association that binds Russia, China and four former Soviet republics of central Asia together to fight terrorism.', 'challenge': '3', 'value': 1, 'hyp': 'China is a member of SCO.', 'length': 'short', 'gid': u'3-34', 'id': '34'}\n",
"\n",
"text: Parviz Davudi was representing Iran at a meeting of the Shanghai Co-operation Organisation (SCO), the fledgling association that binds Russia, China and four former Soviet republics of central Asia together to fight terrorism.\n",
"\n",
"hypothesis(=keyword) : China is a member of SCO.\n",
"\n",
"text_words: set(['Russia', 'Organisation', 'Shanghai', 'Asia', 'four', 'at', 'operation', 'SCO', 'Iran', 'Soviet', 'Davudi', 'fight', 'China', 'association', 'fledgling', 'terrorism', 'was', 'that', 'republics', 'Co', 'representing', 'former', 'Parviz', 'central', 'meeting', 'together', 'binds'])\n",
"overlap('word'): set([])\n",
"overlap('ne') set(['SCO', 'China'])\n",
"hyp_words: set(['member', 'SCO', 'China'])\n",
"hyp_extra('word'): set(['member'])\n"
]
}
],
"prompt_number": 16
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"print(help(extractor.overlap))\n",
"print(help(extractor.hyp_extra))"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"Help on method overlap in module nltk.classify.rte_classify:\n",
"\n",
"overlap(self, toktype, debug=False) method of nltk.classify.rte_classify.RTEFeatureExtractor instance\n",
" Compute the overlap between text and hypothesis.\n",
" \n",
" :param toktype: distinguish Named Entities from ordinary words\n",
" :type toktype: 'ne' or 'word'\n",
"\n",
"None\n",
"Help on method hyp_extra in module nltk.classify.rte_classify:\n",
"\n",
"hyp_extra(self, toktype, debug=True) method of nltk.classify.rte_classify.RTEFeatureExtractor instance\n",
" Compute the extraneous material in the hypothesis.\n",
" \n",
" :param toktype: distinguish Named Entities from ordinary words\n",
" :type toktype: 'ne' or 'word'\n",
"\n",
"None\n"
]
}
],
"prompt_number": 17
},
{
"cell_type": "heading",
"level": 2,
"metadata": {},
"source": [
"Scaling Up to Large Datasets"
]
},
{
"cell_type": "heading",
"level": 6,
"metadata": {},
"source": [
"\ub370\uc774\ud0c0\ub7c9\uc774 \ub9ce\uc744 \uacbd\uc6b0, \uc21c\uc218 python\ubcf4\ub2e4 C\ub85c \uad6c\ud604\ub41c python \ud328\ud0a4\uc9c0\ub97c \uc0ac\uc6a9\ud558\ub294 \uac8c \uc88b\ub2e4. (\uc218\ud589 \uc18d\ub3c4) <BR>\n",
"<BR>\n",
"we recommend that you explore NLTK's facilities for interfacing with external machine learning packages <BR>\n",
"... to train classifier models significantly faster than the pure-Python classifier implementation"
]
},
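{
"cell_type": "heading",
"level": 6,
"metadata": {},
"source": [
"A minimal sketch (not from the book) of one such interface: nltk.classify.scikitlearn.SklearnClassifier wraps a scikit-learn estimator behind the same train/classify API used above. It assumes scikit-learn is installed and that train_set and test_set are lists of (feature dict, label) pairs like the ones built in earlier cells."
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"from __future__ import print_function, unicode_literals\n",
"import nltk\n",
"from nltk.classify.scikitlearn import SklearnClassifier\n",
"from sklearn.linear_model import LogisticRegression\n",
"\n",
"\n",
"# train_set/test_set: (feature dict, label) pairs from an earlier cell (assumed in scope)\n",
"sk_classifier = SklearnClassifier(LogisticRegression()).train(train_set)\n",
"print(\"accuracy:\", nltk.classify.accuracy(sk_classifier, test_set))"
],
"language": "python",
"metadata": {},
"outputs": []
},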
{
"cell_type": "heading",
"level": 1,
"metadata": {},
"source": [
"6.3 Evaluation (\ud3c9\uac00)"
]
},
{
"cell_type": "heading",
"level": 2,
"metadata": {},
"source": [
"The Test Set / Accuracy"
]
},
{
"cell_type": "heading",
"level": 6,
"metadata": {},
"source": [
"However, it is very important that the test set be distinct from the training corpus: <BR>\n",
"it is common to err on the side of safety by using 10% of the overall data for evaluation \n"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# \uad50\uc7ac\uc758 \uc18c\uc2a4\uac00 \ub3d9\uc791\ud558\ub3c4\ub85d \uc77c\ubd80 \uc218\uc815\ud568. \uc624\ub798 \uac78\ub9bc.\n",
"from __future__ import print_function, unicode_literals \n",
"from pprint import pprint\n",
"import nltk\n",
"import random\n",
"from nltk.corpus import brown\n",
"\n",
"\n",
"# \ud2b9\uc9d5 \ucd94\ucd9c \ud568\uc218 \uc815\uc758 (\ub2e8\uc5b4) -> (\uc811\ubbf8\uc0ac, \uc55e\ub2e8\uc5b4, \uc55e\ud488\uc0ac)\n",
"def pos_features(sentence, i, history):\n",
" features = {\"suffix(1)\": sentence[i][-1:],\n",
" \"suffix(2)\": sentence[i][-2:],\n",
" \"suffix(3)\": sentence[i][-3:]}\n",
" if i == 0:\n",
" features[\"prev-word\"] = \"<START>\"\n",
" features[\"prev-tag\"] = \"<START>\"\n",
" else:\n",
" features[\"prev-word\"] = sentence[i-1]\n",
" features[\"prev-tag\"] = history[i-1]\n",
" return features\n",
"\n",
"# \ubd84\ub958\uae30 \uc815\uc758 (\ucd08\uae30\ud654\uc5d0\uc11c \uc790\ub3d9\uc73c\ub85c \ud559\uc2b5\ud568) \n",
"class ConsecutivePosTagger(nltk.TaggerI):\n",
" def __init__(self, train_sents):\n",
" train_set = []\n",
" for tagged_sent in train_sents:\n",
" untagged_sent = nltk.tag.untag(tagged_sent)\n",
" history = []\n",
" for i, (word, tag) in enumerate(tagged_sent):\n",
" featureset = pos_features(untagged_sent, i, history)\n",
" train_set.append( (featureset, tag) )\n",
" history.append(tag)\n",
" self.classifier = nltk.NaiveBayesClassifier.train(train_set)\n",
"\n",
" def tag(self, sentence):\n",
" history = []\n",
" for i, word in enumerate(sentence):\n",
" featureset = pos_features(sentence, i, history)\n",
" tag = self.classifier.classify(featureset)\n",
" history.append(tag)\n",
" return zip(sentence, history)\n",
"\n",
"# \uc801\uc808\ud558\uc9c0 \uc54a\uc740 \ud559\uc2b5\uc14b, \ud14c\uc2a4\ud2b8\uc14b\uc758 \uacbd\uc6b0\n",
"# 1. \uac19\uc740 \uc7a5\ub974\ub85c \ud559\uc2b5\uc14b, \ud14c\uc2a4\ud2b8\uc14b\uc744 \uc0dd\uc131\ud558\uba74, \ud3c9\uac00 \uacb0\uacfc\ub97c \ud655\uc2e0\ud558\uae30 \uc5b4\ub835\ub2e4. (?)\n",
"# 2. random.shuffle()\uc744 \ud558\uba74, \uac19\uc740 \ubb38\uc11c\uc5d0\uc11c \ud559\uc2b5\uc14b, \ud14c\uc2a4\ud2b8\uc14b \ubb38\uc7a5\uc774 \uc0dd\uc131\ub420 \uc218 \uc788\uc5b4 \uc88b\uc9c0 \uc54a\ub2e4.\n",
"tagged_sents = list(brown.tagged_sents(categories='news'))\n",
"print(\"tagged_sents[0]:\", tagged_sents[0])\n",
"random.shuffle(tagged_sents)\n",
"size = int(len(tagged_sents) * 0.1)\n",
"train_sents, test_sents = tagged_sents[size:], tagged_sents[:size] \n",
"tagger = ConsecutivePosTagger(train_sents)\n",
"print('Accuracy: %4.2f' % tagger.evaluate(test_sents))\n",
"print()\n",
"\n",
"# 1. \ub2e4\ub978 \uc7a5\ub974\uc5d0\uc11c \ud559\uc2b5\uc14b, \ud14c\uc2a4\ud2b8\uc14b\uc744 \uc0dd\uc131\ud558\ub3c4\ub85d \uc218\uc815.\n",
"train_sents = brown.tagged_sents(categories='news')\n",
"test_sents = brown.tagged_sents(categories='fiction')\n",
"tagger = ConsecutivePosTagger(train_sents)\n",
"print('Accuracy: %4.2f' % tagger.evaluate(test_sents))\n",
"print()\n",
"\n",
"# 2. \uac19\uc740 \ubb38\uc11c\uc5d0\uc11c \ud559\uc2b5\uc14b, \ud14c\uc2a4\ud2b8\uc14b\uc774 \uc0dd\uc131\ub418\uc9c0 \uc54a\ub3c4\ub85d \uc218\uc815.\n",
"file_ids = brown.fileids(categories='news')\n",
"size = int(len(file_ids) * 0.1)\n",
"train_sents = brown.tagged_sents(file_ids[size:])\n",
"test_sents = brown.tagged_sents(file_ids[:size])\n",
"tagger = ConsecutivePosTagger(train_sents)\n",
"print('Accuracy: %4.2f' % tagger.evaluate(test_sents))\n",
"print()\n"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"tagged_sents[0]: [(u'The', u'AT'), (u'Fulton', u'NP-TL'), (u'County', u'NN-TL'), (u'Grand', u'JJ-TL'), (u'Jury', u'NN-TL'), (u'said', u'VBD'), (u'Friday', u'NR'), (u'an', u'AT'), (u'investigation', u'NN'), (u'of', u'IN'), (u\"Atlanta's\", u'NP$'), (u'recent', u'JJ'), (u'primary', u'NN'), (u'election', u'NN'), (u'produced', u'VBD'), (u'``', u'``'), (u'no', u'AT'), (u'evidence', u'NN'), (u\"''\", u\"''\"), (u'that', u'CS'), (u'any', u'DTI'), (u'irregularities', u'NNS'), (u'took', u'VBD'), (u'place', u'NN'), (u'.', u'.')]\n",
"Accuracy: 0.79"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"\n",
"Accuracy: 0.79"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"\n",
"Accuracy: 0.79"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"\n"
]
}
],
"prompt_number": 18
},
{
"cell_type": "heading",
"level": 2,
"metadata": {},
"source": [
"Precision and Recall (\uc815\ud655\ub960, \uc7ac\ud604\uc728)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"<img src=\"http://www.nltk.org/images/precision-recall.png\" width=\"700\">\n",
"<img src=\"http://upload.wikimedia.org/wikipedia/commons/thumb/2/26/Precisionrecall.svg/700px-Precisionrecall.svg.png\" width=\"700\">\n",
"<img src=\"https://fbcdn-sphotos-c-a.akamaihd.net/hphotos-ak-xpa1/v/t1.0-9/10991051_844288612293942_8690474408857494396_n.jpg?oh=f4a68cc3875ebea360d2e2fbb1db68f8&oe=554DA29E&__gda__=1434765606_73492ef515b8cf34ddc9a82af0aff2d4\" width=\"700\">"
]
},
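{
"cell_type": "heading",
"level": 6,
"metadata": {},
"source": [
"A small sketch (not from the book) computing these scores with nltk.metrics on toy sets: reference holds the item ids that are truly positive, test holds the ids the classifier marked positive."
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"from __future__ import print_function, unicode_literals\n",
"import nltk\n",
"\n",
"\n",
"reference = set([1, 2, 3, 4])  # items that are truly positive\n",
"test = set([3, 4, 5])          # items the classifier marked positive\n",
"print(\"precision:\", nltk.metrics.precision(reference, test))  # TP/(TP+FP) = 2/3\n",
"print(\"recall:\", nltk.metrics.recall(reference, test))        # TP/(TP+FN) = 2/4\n",
"print(\"f-measure:\", nltk.metrics.f_measure(reference, test))"
],
"language": "python",
"metadata": {},
"outputs": []
},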
{
"cell_type": "heading",
"level": 2,
"metadata": {},
"source": [
"F-Measure (F-Score, F1 score)"
]
},
{
"cell_type": "heading",
"level": 6,
"metadata": {},
"source": [
"http://ko.wikipedia.org/wiki/\uc870\ud654_\ud3c9\uade0 <BR>\n",
"http://en.wikipedia.org/wiki/F1_score <BR>\n",
"<img src=\"http://upload.wikimedia.org/math/9/9/1/991d55cc29b4867c88c6c22d438265f9.png\">"
]
},
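{
"cell_type": "heading",
"level": 6,
"metadata": {},
"source": [
"A worked example (not from the book): the F-measure is the harmonic mean of precision and recall, so it is high only when both are high."
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"from __future__ import print_function, unicode_literals\n",
"\n",
"\n",
"precision = 2.0 / 3  # e.g. 2 of 3 predicted positives are correct\n",
"recall = 2.0 / 4     # e.g. 2 of 4 true positives were found\n",
"f1 = 2 * precision * recall / (precision + recall)\n",
"print('F1: %.3f' % f1)  # 0.571 (= 4/7)"
],
"language": "python",
"metadata": {},
"outputs": []
},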
{
"cell_type": "heading",
"level": 2,
"metadata": {},
"source": [
"Confusion Matrices"
]
},
{
"cell_type": "heading",
"level": 6,
"metadata": {},
"source": [
"label \uc885\ub958\uac00 3\uac1c \uc774\uc0c1\uc77c \ub54c, label\ubcc4 \uc624\ub958\ube44\uc728\uc744 \ube44\uad50\ud560 \ub54c confusion matrice \uac00 \uc720\uc6a9\ud558\ub2e4"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"?nltk.UnigramTagger\n",
"?nltk.BigramTagger"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 19
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"from __future__ import print_function, unicode_literals \n",
"from pprint import pprint\n",
"import nltk\n",
"from nltk.corpus import brown\n",
"\n",
"\n",
"file_ids = brown.fileids(categories='editorial')\n",
"size = int(len(file_ids) * 0.1)\n",
"train_sents = brown.tagged_sents(file_ids[size:])\n",
" \n",
"def tag_list(tagged_sents):\n",
" return [tag for sent in tagged_sents for (word, tag) in sent]\n",
"def apply_tagger(tagger, corpus):\n",
" return [tagger.tag(nltk.tag.untag(sent)) for sent in corpus]\n",
"\n",
"gold = tag_list(brown.tagged_sents(categories='editorial')) # \uc0ac\uc124\n",
" \n",
"t0 = nltk.DefaultTagger('NN')\n",
"test = tag_list(apply_tagger(t0, brown.tagged_sents(categories='editorial')))\n",
"cm = nltk.ConfusionMatrix(gold, test)\n",
"print(\"nltk.DefaultTagger('NN'):\")\n",
"print(cm.pp(sort_by_count=True, show_percents=True, truncate=9))\n",
"print()\n",
"\n",
"t1 = nltk.UnigramTagger(train_sents, backoff=t0)\n",
"test = tag_list(apply_tagger(t1, brown.tagged_sents(categories='editorial')))\n",
"cm = nltk.ConfusionMatrix(gold, test)\n",
"print(\"nltk.UnigramTagger(train_sents):\")\n",
"print(cm.pp(sort_by_count=True, show_percents=True, truncate=9))\n",
"print()\n",
"\n",
"t2 = nltk.BigramTagger(train_sents, backoff=t1)\n",
"test = tag_list(apply_tagger(t2, brown.tagged_sents(categories='editorial')))\n",
"cm = nltk.ConfusionMatrix(gold, test)\n",
"print(\"nltk.BigramTagger(train_sents):\")\n",
"print(cm.pp(sort_by_count=True, show_percents=True, truncate=9))\n",
"print()\n",
" "
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"nltk.DefaultTagger('NN'):\n",
" | N |\n",
" | N I A J N V N |\n",
" | N N T J . S , B P |\n",
"----+----------------------------------------------------------------+\n",
" NN | <12.5%> . . . . . . . . |\n",
" IN | 10.1% <.> . . . . . . . |\n",
" AT | 8.6% . <.> . . . . . . |\n",
" JJ | 5.8% . . <.> . . . . . |\n",
" . | 4.9% . . . <.> . . . . |\n",
"NNS | 4.8% . . . . <.> . . . |\n",
" , | 4.4% . . . . . <.> . . |\n",
" VB | 3.5% . . . . . . <.> . |\n",
" NP | 3.1% . . . . . . . <.>|\n",
"----+----------------------------------------------------------------+\n",
"(row = reference; col = test)\n",
"\n",
"\n",
"nltk.UnigramTagger(train_sents):"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
" | N |\n",
" | N I A J N V N |\n",
" | N N T J . S , B P |\n",
"----+----------------------------------------------------------------+\n",
" NN | <11.8%> 0.0% . 0.2% . . . 0.3% 0.0% |\n",
" IN | 0.0% <8.9%> . 0.0% . 0.0% . . . |\n",
" AT | . . <8.6%> . . . . . . |\n",
" JJ | 0.2% . . <5.6%> . . . 0.0% 0.0% |\n",
" . | . . . . <4.8%> . . . . |\n",
"NNS | 0.1% . . . . <4.6%> . . 0.0% |\n",
" , | . . . . . . <4.4%> . . |\n",
" VB | 0.4% . . 0.0% . . . <3.0%> . |\n",
" NP | 0.1% . . 0.0% . . . . <2.9%>|\n",
"----+----------------------------------------------------------------+\n",
"(row = reference; col = test)\n",
"\n",
"\n",
"nltk.BigramTagger(train_sents):"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
" | N |\n",
" | N I A J N V N |\n",
" | N N T J . S , B P |\n",
"----+----------------------------------------------------------------+\n",
" NN | <12.3%> 0.0% . 0.0% . . . 0.1% 0.0% |\n",
" IN | 0.0% <9.1%> . 0.0% . 0.0% . . . |\n",
" AT | . . <8.6%> . . . . . . |\n",
" JJ | 0.2% . . <5.6%> . . . 0.0% 0.0% |\n",
" . | . . . . <4.8%> . . . 0.0% |\n",
"NNS | 0.1% . . . . <4.7%> . . . |\n",
" , | . . . . . . <4.4%> . . |\n",
" VB | 0.1% . . . . . . <3.4%> . |\n",
" NP | 0.1% . . 0.0% . . . . <2.9%>|\n",
"----+----------------------------------------------------------------+\n",
"(row = reference; col = test)\n",
"\n",
"\n"
]
}
],
"prompt_number": 23
},
{
"cell_type": "heading",
"level": 2,
"metadata": {},
"source": [
"K Cross-Validation (K \uad50\ucc28 \uac80\uc99d)"
]
},
{
"cell_type": "heading",
"level": 6,
"metadata": {},
"source": [
"\uc77c\ubc18\uc801\uc73c\ub85c \ud559\uc2b5/\ud14c\uc2a4\ud2b8\uc6a9\uc758 \ucda9\ubd84\ud55c \ub370\uc774\ud0c0\uac00 \uc5c6\uae30 \ub54c\ubb38\uc5d0 \uc0ac\uc6a9\ud558\ub294 \ubc29\ubc95. <BR>\n",
"\ucf54\ud37c\uc2a4(\ub370\uc774\ud0c0)\ub97c K\uac1c\uc758 folds(subset)\ub85c \ub098\ub208 \ud6c4, \ud55c fold\ub97c \ud14c\uc2a4\ud2b8\uc14b\uc73c\ub85c \uc120\ud0dd\ud558\ub294 \ubc29\ubc95. <BR>\n",
"\uc774 \ub54c, \ud574\ub2f9 fold\uc678\uc758 \ub098\uba38\uc9c0 fold\ub294 \ud559\uc2b5\uc14b\uc774 \ub41c\ub2e4. <BR>\n",
"<BR>\n",
"\ub9cc\uc57d K\ubc88\uc758 \ud3c9\uac00 \uc810\uc218\uac00 \ube44\uc2b7\ud560 \uacbd\uc6b0, \uadf8 \uacb0\uacfc\ub97c \ud655\uc2e0\ud558\uae30 \uc88b\ub2e4.\n",
"<BR>\n",
"e.g. 10 folds cross-validation <BR>"
]
},
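{
"cell_type": "heading",
"level": 6,
"metadata": {},
"source": [
"A minimal sketch (not from the book) of K-fold cross-validation. It assumes featuresets is a list of (feature dict, label) pairs, e.g. one built earlier in this notebook."
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"from __future__ import print_function, unicode_literals\n",
"import nltk\n",
"\n",
"\n",
"K = 10\n",
"fold_size = len(featuresets) // K  # featuresets: (feature dict, label) pairs (assumed in scope)\n",
"scores = []\n",
"for k in range(K):\n",
"    # fold k is the test set; the remaining folds form the training set\n",
"    test_fold = featuresets[k * fold_size:(k + 1) * fold_size]\n",
"    train_folds = featuresets[:k * fold_size] + featuresets[(k + 1) * fold_size:]\n",
"    classifier = nltk.NaiveBayesClassifier.train(train_folds)\n",
"    scores.append(nltk.classify.accuracy(classifier, test_fold))\n",
"print('fold accuracies:', ['%.3f' % s for s in scores])\n",
"print('mean accuracy: %.3f' % (sum(scores) / len(scores)))"
],
"language": "python",
"metadata": {},
"outputs": []
},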
{
"cell_type": "heading",
"level": 1,
"metadata": {},
"source": [
"6.4 Decision Trees"
]
},
{
"cell_type": "heading",
"level": 6,
"metadata": {},
"source": [
"<img src=\"http://www.nltk.org/images/decision-tree.png\" width=\"700\">"
]
},
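{
"cell_type": "heading",
"level": 6,
"metadata": {},
"source": [
"A minimal sketch (not from the book): nltk.DecisionTreeClassifier learns a tree like the one pictured from (feature dict, label) pairs, and pseudocode() prints the learned rules. The toy last-letter data below is hypothetical."
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"from __future__ import print_function, unicode_literals\n",
"import nltk\n",
"\n",
"\n",
"# Hypothetical toy data: last letter of a name -> gender.\n",
"toy_train = [({'last_letter': 'a'}, 'female'), ({'last_letter': 'k'}, 'male'),\n",
"             ({'last_letter': 'a'}, 'female'), ({'last_letter': 'o'}, 'male')]\n",
"dt_classifier = nltk.DecisionTreeClassifier.train(toy_train)\n",
"print(dt_classifier.pseudocode(depth=4))\n",
"print(dt_classifier.classify({'last_letter': 'k'}))"
],
"language": "python",
"metadata": {},
"outputs": []
},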
{
"cell_type": "heading",
"level": 2,
"metadata": {},
"source": [
"Entropy and Information Gain"
]
},
{
"cell_type": "heading",
"level": 6,
"metadata": {},
"source": [
"H = \u2212\u03a3l |in| labelsP(l) \u00d7 log2P(l).\n",
"<img src=\"http://www.nltk.org/images/Binary_entropy_plot.png\" width=\"200\"> <BR>\n",
"class: a,b \uc77c \ub54c, <BR>\n",
"\uac00\ub85c: a\uc758 \ud655\ub960 = a\uc758 \ube48\ub3c4/(a\uc758 \ube48\ub3c4 + b\uc758 \ube48\ub3c4) <BR>\n",
"\uc138\ub85c: entropy <BR>"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"from __future__ import print_function, unicode_literals \n",
"from pprint import pprint\n",
"import nltk\n",
"\n",
"import math\n",
"def entropy(labels):\n",
" freqdist = nltk.FreqDist(labels)\n",
" probs = [freqdist.freq(l) for l in nltk.FreqDist(labels)]\n",
" return -sum([p * math.log(p,2) for p in probs])\n",
"\n",
"print(\"entropy(['male', 'male', 'male', 'male']):\", entropy(['male', 'male', 'male', 'male']))\n",
"print(\"entropy(['male', 'female', 'male', 'male']):\", entropy(['male', 'female', 'male', 'male']))\n",
"print(\"entropy(['female', 'male', 'female', 'male']):\", entropy(['female', 'male', 'female', 'male']))\n",
"print(\"entropy(['female', 'female', 'male', 'female']):\", entropy(['female', 'female', 'male', 'female']))\n",
"print(\"entropy(['female', 'female', 'female', 'female']):\", entropy(['female', 'female', 'female', 'female']))\n"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"entropy(['male', 'male', 'male', 'male']): -0.0\n",
"entropy(['male', 'female', 'male', 'male']): 0.811278124459\n",
"entropy(['female', 'male', 'female', 'male']): 1.0\n",
"entropy(['female', 'female', 'male', 'female']): 0.811278124459\n",
"entropy(['female', 'female', 'female', 'female']): -0.0\n"
]
}
],
"prompt_number": 24
},
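{
"cell_type": "heading",
"level": 6,
"metadata": {},
"source": [
"A small sketch (not from the book): the information gain of a split is the entropy of the labels minus the size-weighted entropy of each branch. It reuses the entropy() function defined in the cell above."
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"from __future__ import print_function, unicode_literals\n",
"\n",
"\n",
"def information_gain(labels, branches):\n",
"    # branches: the label lists produced by splitting on some feature\n",
"    total = float(len(labels))\n",
"    remainder = sum(len(b) / total * entropy(b) for b in branches)  # entropy() from the cell above\n",
"    return entropy(labels) - remainder\n",
"\n",
"labels = ['male', 'male', 'female', 'female']\n",
"perfect = [['male', 'male'], ['female', 'female']]\n",
"useless = [['male', 'female'], ['male', 'female']]\n",
"print('gain of a perfect split:', information_gain(labels, perfect))  # 1.0\n",
"print('gain of a useless split:', information_gain(labels, useless))  # 0.0"
],
"language": "python",
"metadata": {},
"outputs": []
},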
{
"cell_type": "heading",
"level": 1,
"metadata": {},
"source": [
"6.5 Naive Bayes Classifiers"
]
},
{
"cell_type": "heading",
"level": 6,
"metadata": {},
"source": [
"<img src=\"http://www.nltk.org/images/naive-bayes-triangle.png\" width=\"700\">"
]
},
{
"cell_type": "heading",
"level": 6,
"metadata": {},
"source": [
"<img src=\"http://www.nltk.org/images/naive_bayes_bargraph.png\" width=\"700\">"
]
},
{
"cell_type": "heading",
"level": 2,
"metadata": {},
"source": [
"Underlying Probabilistic Model"
]
},
{
"cell_type": "heading",
"level": 6,
"metadata": {},
"source": [
"<img src=\"http://www.nltk.org/images/naive_bayes_graph.png\" width=\"700\">"
]
},
{
"cell_type": "heading",
"level": 2,
"metadata": {},
"source": [
"Zero Counts and Smoothing"
]
},
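{
"cell_type": "heading",
"level": 6,
"metadata": {},
"source": [
"A minimal sketch (not from the book): a maximum-likelihood estimate assigns probability zero to any unseen event, which would zero out the whole naive Bayes product. Laplace (add-one) smoothing reserves probability mass for unseen events; bins is the assumed total number of event types."
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"from __future__ import print_function, unicode_literals\n",
"import nltk\n",
"\n",
"\n",
"fd = nltk.FreqDist(['a', 'a', 'b'])         # 'c' was never observed\n",
"mle = nltk.MLEProbDist(fd)\n",
"laplace = nltk.LaplaceProbDist(fd, bins=3)  # assume 3 possible event types\n",
"print(\"MLE P('c'):\", mle.prob('c'))          # 0.0\n",
"print(\"Laplace P('c'):\", laplace.prob('c'))  # (0+1)/(3+3)\n",
"print(\"Laplace P('a'):\", laplace.prob('a'))  # (2+1)/(3+3)"
],
"language": "python",
"metadata": {},
"outputs": []
},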
{
"cell_type": "heading",
"level": 2,
"metadata": {},
"source": [
"Non-Binary Features"
]
},
{
"cell_type": "heading",
"level": 2,
"metadata": {},
"source": [
"The Naivete of Independence"
]
},
{
"cell_type": "heading",
"level": 2,
"metadata": {},
"source": [
"The Cause of Double-Counting"
]
},
{
"cell_type": "heading",
"level": 1,
"metadata": {},
"source": [
"6.6 Maximum Entropy Classifiers"
]
},
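{
"cell_type": "heading",
"level": 6,
"metadata": {},
"source": [
"A minimal sketch (not from the book) of nltk.MaxentClassifier on hypothetical toy data in the same (feature dict, label) shape used throughout this chapter. The pure-Python GIS trainer is slow, so max_iter is kept small; trace=0 silences the per-iteration log."
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"from __future__ import print_function, unicode_literals\n",
"import nltk\n",
"\n",
"\n",
"# Hypothetical toy data: one bag-of-words feature per post -> dialogue act.\n",
"toy_train = [({'contains(hello)': True}, 'Greet'), ({'contains(bye)': True}, 'Bye'),\n",
"             ({'contains(hi)': True}, 'Greet'), ({'contains(cya)': True}, 'Bye')]\n",
"me_classifier = nltk.MaxentClassifier.train(toy_train, algorithm='gis', trace=0, max_iter=10)\n",
"print(me_classifier.classify({'contains(hello)': True}))"
],
"language": "python",
"metadata": {},
"outputs": []
},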
{
"cell_type": "heading",
"level": 2,
"metadata": {},
"source": [
"The Maximum Entropy Model"
]
},
{
"cell_type": "heading",
"level": 2,
"metadata": {},
"source": [
"Maximizing Entropy"
]
},
{
"cell_type": "heading",
"level": 2,
"metadata": {},
"source": [
"Generative Versus Conditional Classifiers"
]
},
{
"cell_type": "heading",
"level": 1,
"metadata": {},
"source": [
"6.7 Modeling Linguistic Patterns"
]
},
{
"cell_type": "heading",
"level": 2,
"metadata": {},
"source": [
"What Do Models Tell Us?"
]
}
],
"metadata": {}
}
]
}