haje01/NLTK ch7-ex.ipynb

## NLTK ch7-ex.ipynb
{
 "metadata": {
  "name": "",
  "signature": "sha256:cd3679987cb24ec1c4bd7fc9d70b6eb5c880b01bdab784b91811dd3652185cde"
 },
 "nbformat": 3,
 "nbformat_minor": 0,
 "worksheets": [
  {
   "cells": [
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "## Exercise 7-2\n",
      "\n",
      "Write a tag pattern to match noun phrases containing plural head nouns. \n",
      "\n",
      "*Examples:*\n",
      "\n",
      "    many/JJ researchers/NNS\n",
      "    two/CD weeks/NNS\n",
      "    both/DT new/JJ positions/NNS\n",
      "\n",
      "Try to do this by generalizing the tag pattern that handled singular noun phrases.    "
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "import nltk"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 1
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "grammar = r\"\"\"\n",
      "    NPS: {<CD|DT>?<JJ>*<NNS>}\n",
      "\n",
      "    NP: {<DT|PP\\$>?<JJ>*<NN>}\n",
      "        {<NNP>+}\n",
      "    NPS: {<CD>?<NP>}\n",
      "\"\"\""
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 9
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "grammar = r\"\"\"\n",
      "    NPH: {<DT|PP\\$>?<JJ>*}\n",
      "    NP: {<NPH><NN>}\n",
      "        {<NNP>+}\n",
      "    NPS: {<NPH><NNS>}\n",
      "\"\"\"\n",
      "cp = nltk.RegexpParser(grammar)"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 138
    },
    {
     "cell_type": "code",
     "collapsed": true,
     "input": [
      "sentence = [(\"Rapunzel\", \"NNP\"), (\"let\", \"VBD\"), (\"down\", \"RP\"), \n",
      "            (\"her\", \"PP$\"), (\"long\", \"JJ\"), (\"golden\", \"JJ\"), \n",
      "            (\"hair\", \"NN\")]\n",
      "print cp.parse(sentence)"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "(S\n",
        "  (NP Rapunzel/NNP)\n",
        "  let/VBD\n",
        "  down/RP\n",
        "  (NP (NPH her/PP$ long/JJ golden/JJ) hair/NN))\n"
       ]
      }
     ],
     "prompt_number": 139
    },
    {
     "cell_type": "code",
     "collapsed": true,
     "input": [
      "sentence = [(\"Many\", \"JJ\"), (\"researchers\", \"NNS\"), (\"saw\", \"VBD\"), \n",
      "            (\"it\", \"PPS\")]\n",
      "print cp.parse(sentence)"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "(S (NPS (NPH Many/JJ) researchers/NNS) saw/VBD it/PPS)\n"
       ]
      }
     ],
     "prompt_number": 140
    },
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "## Exercise 7-15\n",
      "Develop an NP chunker that converts POS-tagged text into a list of tuples, where each tuple consists of a verb followed by a sequence of noun phrases and prepositions.\n",
      "\n",
      "*Example:*\n",
      "\n",
      "    the little cat sat on the mat -> ('sat', 'on', 'NP')"
     ]
    },
    {
     "cell_type": "code",
     "collapsed": true,
     "input": [
      "grammar = r\"\"\"\n",
      "    NP: {<DT|PP\\$>?<JJ>*<NN>}\n",
      "        {<NNP>+}\n",
      "    VFNP: {<VBD><IN><NP>}    \n",
      "\"\"\"\n",
      "cp = nltk.RegexpParser(grammar)\n",
      "\n",
      "def print_vfnp(tree):\n",
      "    for subtree in tree:\n",
      "        if type(subtree) == nltk.tree.Tree:\n",
      "            if subtree.label() == 'VFNP':\n",
      "                for e in subtree:\n",
      "                    if type(e) == nltk.tree.Tree:\n",
      "                        print e.label()\n",
      "                    else:\n",
      "                        print e[0],\n",
      "            else:\n",
      "                print_vfnp(subtree)\n",
      "\n",
      "sent = [('the', 'DT'), ('little', 'JJ'), ('cat', 'NN'), ('sat', 'VBD'), \n",
      "        ('on', 'IN'), ('the', 'DT'), ('mat', 'NN')]\n",
      "trees = cp.parse(sent)\n",
      "print_vfnp(trees)"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "sat on NP\n"
       ]
      }
     ],
     "prompt_number": 141
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "for i in range(1, 100):\n",
      "    sent = nltk.corpus.treebank.tagged_sents()[i]\n",
      "    trees = cp.parse(sent)\n",
      "    print_vfnp(trees)"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "heard of NP\n",
        "was"
       ]
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        " under NP\n",
        "lengthened"
       ]
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        " by NP\n",
        "dropped"
       ]
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        " at NP\n",
        "was"
       ]
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        " for NP\n"
       ]
      }
     ],
     "prompt_number": 142
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [],
     "language": "python",
     "metadata": {},
     "outputs": []
    }
   ],
   "metadata": {}
  }
 ]
}
	{
	"metadata": {
	"name": "",
	"signature": "sha256:cd3679987cb24ec1c4bd7fc9d70b6eb5c880b01bdab784b91811dd3652185cde"
	},
	"nbformat": 3,
	"nbformat_minor": 0,
	"worksheets": [
	{
	"cells": [
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"## Exercise 7-2\n",
	"\n",
	"Write a tag pattern to match noun phrases containing plural head nouns. \n",
	"\n",
	"Examples:\n",
	"\n",
	" many/JJ researchers/NNS\n",
	" two/CD weeks/NNS\n",
	" both/DT new/JJ positions/NNS\n",
	"\n",
	"Try to do this by generalizing the tag pattern that handled singular noun phrases. "
	]
	},
	{
	"cell_type": "code",
	"collapsed": false,
	"input": [
	"import nltk"
	],
	"language": "python",
	"metadata": {},
	"outputs": [],
	"prompt_number": 1
	},
	{
	"cell_type": "code",
	"collapsed": false,
	"input": [
	"grammar = r\"\"\"\n",
	" NPS: {<CD|DT>?<JJ>*<NNS>}\n",
	"\n",
	" NP: {<DT|PP\\$>?<JJ>*<NN>}\n",
	" {<NNP>+}\n",
	" NPS: {<CD>?<NP>}\n",
	"\"\"\""
	],
	"language": "python",
	"metadata": {},
	"outputs": [],
	"prompt_number": 9
	},
	{
	"cell_type": "code",
	"collapsed": false,
	"input": [
	"grammar = r\"\"\"\n",
	" NPH: {<DT|PP\\$>?<JJ>*}\n",
	" NP: {<NPH><NN>}\n",
	" {<NNP>+}\n",
	" NPS: {<NPH><NNS>}\n",
	"\"\"\"\n",
	"cp = nltk.RegexpParser(grammar)"
	],
	"language": "python",
	"metadata": {},
	"outputs": [],
	"prompt_number": 138
	},
	{
	"cell_type": "code",
	"collapsed": true,
	"input": [
	"sentence = [(\"Rapunzel\", \"NNP\"), (\"let\", \"VBD\"), (\"down\", \"RP\"), \n",
	" (\"her\", \"PP$\"), (\"long\", \"JJ\"), (\"golden\", \"JJ\"), \n",
	" (\"hair\", \"NN\")]\n",
	"print cp.parse(sentence)"
	],
	"language": "python",
	"metadata": {},
	"outputs": [
	{
	"output_type": "stream",
	"stream": "stdout",
	"text": [
	"(S\n",
	" (NP Rapunzel/NNP)\n",
	" let/VBD\n",
	" down/RP\n",
	" (NP (NPH her/PP$ long/JJ golden/JJ) hair/NN))\n"
	]
	}
	],
	"prompt_number": 139
	},
	{
	"cell_type": "code",
	"collapsed": true,
	"input": [
	"sentence = [(\"Many\", \"JJ\"), (\"researchers\", \"NNS\"), (\"saw\", \"VBD\"), \n",
	" (\"it\", \"PPS\")]\n",
	"print cp.parse(sentence)"
	],
	"language": "python",
	"metadata": {},
	"outputs": [
	{
	"output_type": "stream",
	"stream": "stdout",
	"text": [
	"(S (NPS (NPH Many/JJ) researchers/NNS) saw/VBD it/PPS)\n"
	]
	}
	],
	"prompt_number": 140
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"## Exercise 7-15\n",
	"Develop an NP chunker that converts POS-tagged text into a list of tuples, where each tuple consists of a verb followed by a sequence of noun phrases and prepositions.\n",
	"\n",
	"Example:\n",
	"\n",
	" the little cat sat on the mat -> ('sat', 'on', 'NP')"
	]
	},
	{
	"cell_type": "code",
	"collapsed": true,
	"input": [
	"grammar = r\"\"\"\n",
	" NP: {<DT|PP\\$>?<JJ>*<NN>}\n",
	" {<NNP>+}\n",
	" VFNP: {<VBD><IN><NP>} \n",
	"\"\"\"\n",
	"cp = nltk.RegexpParser(grammar)\n",
	"\n",
	"def print_vfnp(tree):\n",
	"    for subtree in tree:\n",
	"        if type(subtree) == nltk.tree.Tree:\n",
	"            if subtree.label() == 'VFNP':\n",
	"                for e in subtree:\n",
	"                    if type(e) == nltk.tree.Tree:\n",
	"                        print e.label()\n",
	"                    else:\n",
	"                        print e[0],\n",
	"            else:\n",
	"                print_vfnp(subtree)\n",
	"\n",
	"sent = [('the', 'DT'), ('little', 'JJ'), ('cat', 'NN'), ('sat', 'VBD'), \n",
	" ('on', 'IN'), ('the', 'DT'), ('mat', 'NN')]\n",
	"trees = cp.parse(sent)\n",
	"print_vfnp(trees)"
	],
	"language": "python",
	"metadata": {},
	"outputs": [
	{
	"output_type": "stream",
	"stream": "stdout",
	"text": [
	"sat on NP\n"
	]
	}
	],
	"prompt_number": 141
	},
	{
	"cell_type": "code",
	"collapsed": false,
	"input": [
	"for i in range(1, 100):\n",
	" sent = nltk.corpus.treebank.tagged_sents()[i]\n",
	" trees = cp.parse(sent)\n",
	" print_vfnp(trees)"
	],
	"language": "python",
	"metadata": {},
	"outputs": [
	{
	"output_type": "stream",
	"stream": "stdout",
	"text": [
	"heard of NP\n",
	"was"
	]
	},
	{
	"output_type": "stream",
	"stream": "stdout",
	"text": [
	" under NP\n",
	"lengthened"
	]
	},
	{
	"output_type": "stream",
	"stream": "stdout",
	"text": [
	" by NP\n",
	"dropped"
	]
	},
	{
	"output_type": "stream",
	"stream": "stdout",
	"text": [
	" at NP\n",
	"was"
	]
	},
	{
	"output_type": "stream",
	"stream": "stdout",
	"text": [
	" for NP\n"
	]
	}
	],
	"prompt_number": 142
	},
	{
	"cell_type": "code",
	"collapsed": false,
	"input": [],
	"language": "python",
	"metadata": {},
	"outputs": []
	}
	],
	"metadata": {}
	}
	]
	}