Skip to content

Instantly share code, notes, and snippets.

@vchahun
Created July 20, 2013 20:03
Show Gist options
  • Save vchahun/6046246 to your computer and use it in GitHub Desktop.
Save vchahun/6046246 to your computer and use it in GitHub Desktop.
Unsupervised word classes
Display the source blob
Display the rendered blob
Raw
{
"metadata": {
"name": "word-classes"
},
"nbformat": 3,
"nbformat_minor": 0,
"worksheets": [
{
"cells": [
{
"cell_type": "code",
"collapsed": false,
"input": "import sys\n\n# Extra site-packages directory to search for imports (presumably where pymorphy2 is installed).\nSITE_PACKAGES = '/home/vchahune/tools/ruslem/env/lib/python2.7/site-packages/'\n# Guard the append so that re-running this cell does not add duplicate sys.path entries.\nif SITE_PACKAGES not in sys.path:\n    sys.path.append(SITE_PACKAGES)",
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 2
},
{
"cell_type": "code",
"collapsed": false,
"input": "from pymorphy2.tagger import Morph\n\n# Build the tagger used by later cells from the dictionary path below.\ntagger = Morph.load('/home/vchahune/tools/ruslem/dict')",
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 3
},
{
"cell_type": "code",
"collapsed": false,
"input": "import re\nimport io\nfrom collections import Counter",
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 11
},
{
"cell_type": "code",
"collapsed": false,
"input": "from IPython.display import HTML",
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 22
},
{
"cell_type": "code",
"collapsed": false,
"input": "# One independent Counter per induced class label; each maps a tagger class to its count.\nconfusion = dict((label, Counter()) for label in 'ABC')",
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 12
},
{
"cell_type": "code",
"collapsed": false,
"input": "# Tally, for each induced class label, how often each tagger-assigned class occurs.\nwith io.open('/usr0/home/cdyer/projects/cpyp/lozenge/3-types.sorted.txt', encoding='utf8') as f:\n    for line in f:\n        # Drop only the line terminator; slicing off the last character would\n        # truncate the final word when the file has no trailing newline.\n        line = line.rstrip('\\n')\n        if not line:\n            continue  # ignore blank lines instead of failing to unpack them\n        # Split on the first ':' only, so a word that itself contains ':' stays intact.\n        inferred_cls, word = line.split(':', 1)\n        # Remove spaces and the ^, >, < characters before tagging; use the first parse returned.\n        true_cls = tagger.parse(re.sub('[ \\^><]', '', word))[0][1].cls\n        confusion[inferred_cls][true_cls] += 1",
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 13
},
{
"cell_type": "code",
"collapsed": false,
"input": "# Table rows are the induced class labels.\nrows = 'ABC'\n# Table columns are every tagger class counted under at least one row, in sorted order.\ncols = sorted({tag for counts in confusion.values() for tag in counts})",
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 17
},
{
"cell_type": "code",
"collapsed": false,
"input": "# Header row: an empty corner cell followed by one <th> per column label.\nheader = ''.join('<th>{}</th>'.format(label) for label in [''] + cols)\n# One <tr> per row label: the label itself, then the count for every column.\ntable_rows = '\\n'.join(\n    '<tr><th>{}</th>{}</tr>'.format(\n        cls,\n        ''.join('<td>{}</td>'.format(confusion[cls][tag]) for tag in cols))\n    for cls in rows)\n# The last expression is the cell output, rendered as an HTML table.\nHTML(\"\"\"\n<table>\n<tr>{}</tr>\n{}\n</table>\n\"\"\".format(header, table_rows))",
"language": "python",
"metadata": {},
"outputs": [
{
"html": "\n<table>\n<tr><th></th><th>ADJF</th><th>ADJS</th><th>ADVB</th><th>COMP</th><th>CONJ</th><th>GRND</th><th>INFN</th><th>INTJ</th><th>NOUN</th><th>NPRO</th><th>NUMR</th><th>PRCL</th><th>PRED</th><th>PREP</th><th>PRTF</th><th>PRTS</th><th>VERB</th></tr>\n<tr><th>A</th><td>13147</td><td>554</td><td>432</td><td>69</td><td>10</td><td>1027</td><td>1902</td><td>2</td><td>3761</td><td>10</td><td>11</td><td>2</td><td>4</td><td>3</td><td>7137</td><td>436</td><td>8936</td></tr>\n<tr><th>B</th><td>11101</td><td>908</td><td>333</td><td>48</td><td>7</td><td>29</td><td>12</td><td>2</td><td>18967</td><td>3</td><td>4</td><td>3</td><td>3</td><td>5</td><td>2192</td><td>1155</td><td>223</td></tr>\n<tr><th>C</th><td>7673</td><td>490</td><td>503</td><td>75</td><td>31</td><td>828</td><td>2349</td><td>6</td><td>19477</td><td>30</td><td>103</td><td>8</td><td>10</td><td>24</td><td>3908</td><td>743</td><td>7531</td></tr>\n</table>\n",
"output_type": "pyout",
"prompt_number": 25,
"text": "<IPython.core.display.HTML at 0x41fb390>"
}
],
"prompt_number": 25
},
{
"cell_type": "code",
"collapsed": false,
"input": "",
"language": "python",
"metadata": {},
"outputs": []
}
],
"metadata": {}
}
]
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment