Skip to content

Instantly share code, notes, and snippets.

@douglasgoodwin
Created May 7, 2019 05:37
Show Gist options
  • Save douglasgoodwin/0742f3ba83c419a92ed8b6efb22defc2 to your computer and use it in GitHub Desktop.
Save douglasgoodwin/0742f3ba83c419a92ed8b6efb22defc2 to your computer and use it in GitHub Desktop.
NLTK analysis of Jane Austen
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"import nltk\n",
"from nltk.parse.malt import MaltParser\n",
"\n",
"# Where MaltParser and its pre-trained English model live. The defaults are the\n",
"# original author's local install; on another machine set the MALT_PARSER and\n",
"# MALT_MODEL environment variables instead of editing this cell.\n",
"MALT_PARSER_DIR = os.environ.get('MALT_PARSER', '/Users/dgoodwin/janeausten/maltparser-1.9.2')\n",
"MALT_MODEL = os.environ.get('MALT_MODEL', 'engmalt.linear-1.7.mco')\n",
"\n",
"# Shared dependency parser used by the parsing cells below.\n",
"mp = MaltParser(MALT_PARSER_DIR, MALT_MODEL)"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"data": {
"image/png": "",
"text/plain": [
"Tree('of', ['The', 'flaring', 'lamps', Tree('were', ['a', 'carriage', Tree('in', ['immediately', 'view.'])])])"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# The full passage is kept under its own name so that it is not silently\n",
"# overwritten by the single-sentence `quote` assigned below.\n",
"passage = \"\"\"The flaring lamps of a carriage were immediately in view. \n",
"By their uncertain light she thought she could discern it to be drawn by four horses; \n",
"and this, while it told the excess of her poor mother’s alarm, \n",
"gave some explanation to such unexpected rapidity.\"\"\"\n",
"\n",
"# Only the opening sentence is parsed here; the tagging cell that follows reads `quote`.\n",
"quote = \"The flaring lamps of a carriage were immediately in view.\"\n",
"mp.parse_one(quote.split()).tree()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"| Abbreviation | Part of speech |\n",
"|-- |-- |\n",
"| NN | singular noun |\n",
"| NNS | plural noun |\n",
"| NNP | proper noun |\n",
"| VBD | past tense verb |\n",
"| VBZ | 3rd person singular present tense verb |\n",
"| VBP | non-3rd person singular present tense verb |\n",
"| VBN | past participle |\n",
"| PRP | pronoun |\n",
"| PRP\\$ | possessive pronoun |\n",
"| JJ | adjective |\n",
"| IN | preposition / complementizer |\n",
"| DT | determiner |\n",
"\n",
"| Abbreviation | Phrase type |\n",
"|-- |-- |\n",
"| NP | noun phrase |\n",
"| VP | verb phrase |\n",
"| PP | prepositional phrase |\n",
"| S | sentence |\n"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[('The', 'DT'),\n",
" ('flaring', 'VBG'),\n",
" ('lamps', 'NNS'),\n",
" ('of', 'IN'),\n",
" ('a', 'DT'),\n",
" ('carriage', 'NN'),\n",
" ('were', 'VBD'),\n",
" ('immediately', 'RB'),\n",
" ('in', 'IN'),\n",
" ('view', 'NN'),\n",
" ('.', '.')]"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Split the current sentence into word and punctuation tokens, then pair\n",
"# every token with its part-of-speech tag.\n",
"tokens = nltk.word_tokenize(quote)\n",
"tagged = nltk.pos_tag(tokens)\n",
"tagged"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"data": {
"image/png": "",
"text/plain": [
"Tree('chased', [Tree('man', ['A', Tree('in', [Tree('pajamas', ['his'])])]), Tree('with', ['a', 'cat', Tree('broom.', ['a'])])])"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Question to keep in mind while reading the tree: does the cat have a broom?\n",
"quote = \"A man in his pajamas chased a cat with a broom.\"\n",
"mp.parse_one(quote.split()).tree()"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [
{
"data": {
"image/png": "",
"text/plain": [
"Tree('with', ['A', 'man', Tree('chased', [Tree('broom', ['a']), Tree('cat', ['a']), Tree('in', [Tree('pajamas.', ['his'])])])])"
]
},
"execution_count": 20,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Reminder while reading the tree: cats don't wear pajamas.\n",
"quote = \"A man with a broom chased a cat in his pajamas.\"\n",
"mp.parse_one(quote.split()).tree()"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {},
"outputs": [
{
"data": {
"image/png": "",
"text/plain": [
"Tree('chased', [Tree('man', ['A']), Tree('cat', ['a']), Tree('in', [Tree('with', ['his', 'pajamas', Tree('broom.', ['a'])])])])"
]
},
"execution_count": 21,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Question to keep in mind while reading the tree: does the man have a cat in his pajamas?\n",
"quote = \"A man chased a cat in his pajamas with a broom.\"\n",
"mp.parse_one(quote.split()).tree()"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"original: 14862748245026736845 A\n",
"lowercased: 11901859001352538922 a\n",
"lemma: 14862748245026736845 A\n",
"shape: 101 X\n",
"prefix: 14862748245026736845 A\n",
"suffix: 14862748245026736845 A\n",
"log probability: -20.0\n",
"Brown cluster id: 0\n",
" — — — — — — — — — — — — — — — — — — — — \n",
"original: 3104811030673030468 man\n",
"lowercased: 3104811030673030468 man\n",
"lemma: 3104811030673030468 man\n",
"shape: 4088098365541558500 xxx\n",
"prefix: 646772771845179972 m\n",
"suffix: 3104811030673030468 man\n",
"log probability: -20.0\n",
"Brown cluster id: 0\n",
" — — — — — — — — — — — — — — — — — — — — \n",
"original: 3002984154512732771 in\n",
"lowercased: 3002984154512732771 in\n",
"lemma: 3002984154512732771 in\n",
"shape: 4370460163704169311 xx\n",
"prefix: 5097672513440128799 i\n",
"suffix: 3002984154512732771 in\n",
"log probability: -20.0\n",
"Brown cluster id: 0\n",
" — — — — — — — — — — — — — — — — — — — — \n",
"original: 2661093235354845946 his\n",
"lowercased: 2661093235354845946 his\n",
"lemma: 2661093235354845946 his\n",
"shape: 4088098365541558500 xxx\n",
"prefix: 15817570140490810055 h\n",
"suffix: 2661093235354845946 his\n",
"log probability: -20.0\n",
"Brown cluster id: 0\n",
" — — — — — — — — — — — — — — — — — — — — \n"
]
}
],
"source": [
"from spacy.lang.en import English\n",
"\n",
"# Pin the sentence explicitly. `quote` is rebound by several cells in this\n",
"# notebook, so reading the leftover value would make this cell's output depend\n",
"# on which cell happened to run last. This is the sentence the saved output shows.\n",
"quote = \"A man in his pajamas chased a cat with a broom.\"\n",
"\n",
"parser = English()\n",
"parsedData = parser(quote)\n",
"\n",
"# Print the lexical attributes of the first four tokens only. Each attribute\n",
"# is shown as its integer ID followed by its string form (the `_` variant).\n",
"for token in parsedData[:4]:\n",
"    print(\"original:\", token.orth, token.orth_)\n",
"    print(\"lowercased:\", token.lower, token.lower_)\n",
"    print(\"lemma:\", token.lemma, token.lemma_)\n",
"    print(\"shape:\", token.shape, token.shape_)\n",
"    print(\"prefix:\", token.prefix, token.prefix_)\n",
"    print(\"suffix:\", token.suffix, token.suffix_)\n",
"    print(\"log probability:\", token.prob)\n",
"    print(\"Brown cluster id:\", token.cluster)\n",
"    print(\" — — — — — — — — — — — — — — — — — — — — \")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.3"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment