Skip to content

Instantly share code, notes, and snippets.

@aparrish aparrish/notes-2018-03-23.ipynb Secret
Created Mar 23, 2018

Embed
What would you like to do?
rwet notez 2018-03-23
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# notes 2018-03-23"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## tuples"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"t = (1, 2, 3, 4, 5)"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(1, 2, 3, 4, 5)"
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"t"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"tuple"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"type(t)"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"3"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"t[2]"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(3, 4, 5)"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"t[2:5]"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"ename": "AttributeError",
"evalue": "'tuple' object has no attribute 'append'",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mAttributeError\u001b[0m Traceback (most recent call last)",
"\u001b[0;32m<ipython-input-7-7189fd0c031d>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mt\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mappend\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;36m17\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
"\u001b[0;31mAttributeError\u001b[0m: 'tuple' object has no attribute 'append'"
]
}
],
"source": [
"t.append(17)"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"ename": "TypeError",
"evalue": "'tuple' object does not support item assignment",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mTypeError\u001b[0m Traceback (most recent call last)",
"\u001b[0;32m<ipython-input-8-0ea2d4b041bb>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mt\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m3\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;36m12345\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
"\u001b[0;31mTypeError\u001b[0m: 'tuple' object does not support item assignment"
]
}
],
"source": [
"t[3] = 12345"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"my_stuff = {}"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"my_stuff[\"cheese\"] = 1"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"my_stuff[1234] = 4"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{'cheese': 1, 1234: 4}"
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"my_stuff"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"ename": "TypeError",
"evalue": "unhashable type: 'list'",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mTypeError\u001b[0m Traceback (most recent call last)",
"\u001b[0;32m<ipython-input-13-a61cb05993cd>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mmy_stuff\u001b[0m\u001b[0;34m[\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0;34m\"a\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\"b\"\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;36m6\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
"\u001b[0;31mTypeError\u001b[0m: unhashable type: 'list'"
]
}
],
"source": [
"my_stuff[ [\"a\", \"b\"] ] = 6"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"my_stuff[ (\"a\", \"b\") ] = 6"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{'cheese': 1, 1234: 4, ('a', 'b'): 6}"
]
},
"execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"my_stuff"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"6"
]
},
"execution_count": 16,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"my_stuff[('a', 'b')]"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"some_words = ('this', 'is', 'fun')"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"tuple"
]
},
"execution_count": 18,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"type(some_words)"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"my_stuff[some_words] = 90"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{'cheese': 1, 1234: 4, ('a', 'b'): 6, ('this', 'is', 'fun'): 90}"
]
},
"execution_count": 20,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"my_stuff"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## functions"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"12"
]
},
"execution_count": 21,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"len(\"asdfasdfasdf\")"
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"my_name = \"allison PARRISH\""
]
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'ALLISON PARRISH'"
]
},
"execution_count": 25,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"my_name.upper()"
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'allison parrish'"
]
},
"execution_count": 26,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"my_name.lower()"
]
},
{
"cell_type": "code",
"execution_count": 27,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'Allison Parrish'"
]
},
"execution_count": 27,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"my_name.title()"
]
},
{
"cell_type": "code",
"execution_count": 30,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'Allison parrish'"
]
},
"execution_count": 30,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"my_name.capitalize()"
]
},
{
"cell_type": "code",
"execution_count": 31,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'Allison PARRISH'"
]
},
"execution_count": 31,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"my_name[0].upper() + my_name[1:]"
]
},
{
"cell_type": "code",
"execution_count": 32,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# in javascript, function uclower(s) {}\n",
"def uclower(s):\n",
"    \"\"\"Return the string `s` with its first character uppercased.\n",
"\n",
"    Only the first character is touched; the rest of the string is kept\n",
"    exactly as given. An empty string comes back unchanged.\n",
"    \"\"\"\n",
"    # s[:1] is a slice, so it yields '' for an empty string, where the\n",
"    # index s[0] would raise IndexError\n",
"    return s[:1].upper() + s[1:]"
]
},
{
"cell_type": "code",
"execution_count": 33,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'Allison PARRISH'"
]
},
"execution_count": 33,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"uclower(my_name)"
]
},
{
"cell_type": "code",
"execution_count": 34,
"metadata": {},
"outputs": [
{
"ename": "TypeError",
"evalue": "'int' object is not subscriptable",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mTypeError\u001b[0m Traceback (most recent call last)",
"\u001b[0;32m<ipython-input-34-24f34f303882>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0muclower\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;36m12345\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
"\u001b[0;32m<ipython-input-32-4ead1bffe1c0>\u001b[0m in \u001b[0;36muclower\u001b[0;34m(s)\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0;31m# in javascript, function uclower(s) {}\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0muclower\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0ms\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 3\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0ms\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mupper\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0ms\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
"\u001b[0;31mTypeError\u001b[0m: 'int' object is not subscriptable"
]
}
],
"source": [
"uclower(12345)"
]
},
{
"cell_type": "code",
"execution_count": 35,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"def smooshes(bob, horatio):\n",
"    \"\"\"Glue the first three items of `bob` onto the last three of `horatio`.\"\"\"\n",
"    return bob[:3] + horatio[-3:]"
]
},
{
"cell_type": "code",
"execution_count": 37,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'hello!'"
]
},
"execution_count": 37,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"smooshes(\"help me this is weird!\", \"this is my cello!\")"
]
},
{
"cell_type": "code",
"execution_count": 38,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"def abc():\n",
"    \"\"\"Return the integer 7; takes no arguments.\"\"\"\n",
"    result = 7\n",
"    return result"
]
},
{
"cell_type": "code",
"execution_count": 39,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"function"
]
},
"execution_count": 39,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"type(abc)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## markov chains!!@#!@%*!"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"wait but first\n",
"\n",
"# n-grams\n"
]
},
{
"cell_type": "code",
"execution_count": 40,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"text = \"condescendences\""
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"character n-gram analysis of this string, length 2:\n",
"\n",
"    co\n",
"    on\n",
"    nd\n",
"    de\n",
"    es\n",
"    sc\n",
"    ce\n",
"    en\n",
"    nd\n",
"    de\n",
"    en\n",
"    nc\n",
"    ce\n",
"    es\n"
]
},
{
"cell_type": "code",
"execution_count": 57,
"metadata": {},
"outputs": [],
"source": [
"# `with` closes the file handle as soon as the text has been read\n",
"with open(\"frost.txt\") as frost_file:\n",
"    words = frost_file.read().lower().split()"
]
},
{
"cell_type": "code",
"execution_count": 58,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['two',\n",
" 'roads',\n",
" 'diverged',\n",
" 'in',\n",
" 'a',\n",
" 'yellow',\n",
" 'wood,',\n",
" 'and',\n",
" 'sorry',\n",
" 'i',\n",
" 'could',\n",
" 'not',\n",
" 'travel',\n",
" 'both',\n",
" 'and',\n",
" 'be',\n",
" 'one',\n",
" 'traveler,',\n",
" 'long',\n",
" 'i',\n",
" 'stood',\n",
" 'and',\n",
" 'looked',\n",
" 'down',\n",
" 'one',\n",
" 'as',\n",
" 'far',\n",
" 'as',\n",
" 'i',\n",
" 'could',\n",
" 'to',\n",
" 'where',\n",
" 'it',\n",
" 'bent',\n",
" 'in',\n",
" 'the',\n",
" 'undergrowth;',\n",
" 'then',\n",
" 'took',\n",
" 'the',\n",
" 'other,',\n",
" 'as',\n",
" 'just',\n",
" 'as',\n",
" 'fair,',\n",
" 'and',\n",
" 'having',\n",
" 'perhaps',\n",
" 'the',\n",
" 'better',\n",
" 'claim,',\n",
" 'because',\n",
" 'it',\n",
" 'was',\n",
" 'grassy',\n",
" 'and',\n",
" 'wanted',\n",
" 'wear;',\n",
" 'though',\n",
" 'as',\n",
" 'for',\n",
" 'that',\n",
" 'the',\n",
" 'passing',\n",
" 'there',\n",
" 'had',\n",
" 'worn',\n",
" 'them',\n",
" 'really',\n",
" 'about',\n",
" 'the',\n",
" 'same,',\n",
" 'and',\n",
" 'both',\n",
" 'that',\n",
" 'morning',\n",
" 'equally',\n",
" 'lay',\n",
" 'in',\n",
" 'leaves',\n",
" 'no',\n",
" 'step',\n",
" 'had',\n",
" 'trodden',\n",
" 'black.',\n",
" 'oh,',\n",
" 'i',\n",
" 'kept',\n",
" 'the',\n",
" 'first',\n",
" 'for',\n",
" 'another',\n",
" 'day!',\n",
" 'yet',\n",
" 'knowing',\n",
" 'how',\n",
" 'way',\n",
" 'leads',\n",
" 'on',\n",
" 'to',\n",
" 'way,',\n",
" 'i',\n",
" 'doubted',\n",
" 'if',\n",
" 'i',\n",
" 'should',\n",
" 'ever',\n",
" 'come',\n",
" 'back.',\n",
" 'i',\n",
" 'shall',\n",
" 'be',\n",
" 'telling',\n",
" 'this',\n",
" 'with',\n",
" 'a',\n",
" 'sigh',\n",
" 'somewhere',\n",
" 'ages',\n",
" 'and',\n",
" 'ages',\n",
" 'hence:',\n",
" 'two',\n",
" 'roads',\n",
" 'diverged',\n",
" 'in',\n",
" 'a',\n",
" 'wood,',\n",
" 'and',\n",
" 'i---',\n",
" 'i',\n",
" 'took',\n",
" 'the',\n",
" 'one',\n",
" 'less',\n",
" 'travelled',\n",
" 'by,',\n",
" 'and',\n",
" 'that',\n",
" 'has',\n",
" 'made',\n",
" 'all',\n",
" 'the',\n",
" 'difference.']"
]
},
"execution_count": 58,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"words"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"word-level n-gram analysis of length 5:\n",
"\n",
"    Two roads diverged in a\n",
"    roads diverged in a yellow\n",
"    diverged in a yellow wood,\n",
"    in a yellow wood, And"
]
},
{
"cell_type": "code",
"execution_count": 59,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]"
]
},
"execution_count": 59,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"list(range(10))"
]
},
{
"cell_type": "code",
"execution_count": 60,
"metadata": {},
"outputs": [],
"source": [
"pairs = [(words[i], words[i+1]) for i in range(len(words) - 1)]"
]
},
{
"cell_type": "code",
"execution_count": 61,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# does the same thing as the list comprehension above\n",
"pairs = []\n",
"for i in range(len(words) - 1):\n",
" this_pair = (words[i], words[i+1])\n",
" pairs.append(this_pair)"
]
},
{
"cell_type": "code",
"execution_count": 62,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[('two', 'roads'),\n",
" ('roads', 'diverged'),\n",
" ('diverged', 'in'),\n",
" ('in', 'a'),\n",
" ('a', 'yellow'),\n",
" ('yellow', 'wood,'),\n",
" ('wood,', 'and'),\n",
" ('and', 'sorry'),\n",
" ('sorry', 'i'),\n",
" ('i', 'could'),\n",
" ('could', 'not'),\n",
" ('not', 'travel'),\n",
" ('travel', 'both'),\n",
" ('both', 'and'),\n",
" ('and', 'be'),\n",
" ('be', 'one'),\n",
" ('one', 'traveler,'),\n",
" ('traveler,', 'long'),\n",
" ('long', 'i'),\n",
" ('i', 'stood'),\n",
" ('stood', 'and'),\n",
" ('and', 'looked'),\n",
" ('looked', 'down'),\n",
" ('down', 'one'),\n",
" ('one', 'as'),\n",
" ('as', 'far'),\n",
" ('far', 'as'),\n",
" ('as', 'i'),\n",
" ('i', 'could'),\n",
" ('could', 'to'),\n",
" ('to', 'where'),\n",
" ('where', 'it'),\n",
" ('it', 'bent'),\n",
" ('bent', 'in'),\n",
" ('in', 'the'),\n",
" ('the', 'undergrowth;'),\n",
" ('undergrowth;', 'then'),\n",
" ('then', 'took'),\n",
" ('took', 'the'),\n",
" ('the', 'other,'),\n",
" ('other,', 'as'),\n",
" ('as', 'just'),\n",
" ('just', 'as'),\n",
" ('as', 'fair,'),\n",
" ('fair,', 'and'),\n",
" ('and', 'having'),\n",
" ('having', 'perhaps'),\n",
" ('perhaps', 'the'),\n",
" ('the', 'better'),\n",
" ('better', 'claim,'),\n",
" ('claim,', 'because'),\n",
" ('because', 'it'),\n",
" ('it', 'was'),\n",
" ('was', 'grassy'),\n",
" ('grassy', 'and'),\n",
" ('and', 'wanted'),\n",
" ('wanted', 'wear;'),\n",
" ('wear;', 'though'),\n",
" ('though', 'as'),\n",
" ('as', 'for'),\n",
" ('for', 'that'),\n",
" ('that', 'the'),\n",
" ('the', 'passing'),\n",
" ('passing', 'there'),\n",
" ('there', 'had'),\n",
" ('had', 'worn'),\n",
" ('worn', 'them'),\n",
" ('them', 'really'),\n",
" ('really', 'about'),\n",
" ('about', 'the'),\n",
" ('the', 'same,'),\n",
" ('same,', 'and'),\n",
" ('and', 'both'),\n",
" ('both', 'that'),\n",
" ('that', 'morning'),\n",
" ('morning', 'equally'),\n",
" ('equally', 'lay'),\n",
" ('lay', 'in'),\n",
" ('in', 'leaves'),\n",
" ('leaves', 'no'),\n",
" ('no', 'step'),\n",
" ('step', 'had'),\n",
" ('had', 'trodden'),\n",
" ('trodden', 'black.'),\n",
" ('black.', 'oh,'),\n",
" ('oh,', 'i'),\n",
" ('i', 'kept'),\n",
" ('kept', 'the'),\n",
" ('the', 'first'),\n",
" ('first', 'for'),\n",
" ('for', 'another'),\n",
" ('another', 'day!'),\n",
" ('day!', 'yet'),\n",
" ('yet', 'knowing'),\n",
" ('knowing', 'how'),\n",
" ('how', 'way'),\n",
" ('way', 'leads'),\n",
" ('leads', 'on'),\n",
" ('on', 'to'),\n",
" ('to', 'way,'),\n",
" ('way,', 'i'),\n",
" ('i', 'doubted'),\n",
" ('doubted', 'if'),\n",
" ('if', 'i'),\n",
" ('i', 'should'),\n",
" ('should', 'ever'),\n",
" ('ever', 'come'),\n",
" ('come', 'back.'),\n",
" ('back.', 'i'),\n",
" ('i', 'shall'),\n",
" ('shall', 'be'),\n",
" ('be', 'telling'),\n",
" ('telling', 'this'),\n",
" ('this', 'with'),\n",
" ('with', 'a'),\n",
" ('a', 'sigh'),\n",
" ('sigh', 'somewhere'),\n",
" ('somewhere', 'ages'),\n",
" ('ages', 'and'),\n",
" ('and', 'ages'),\n",
" ('ages', 'hence:'),\n",
" ('hence:', 'two'),\n",
" ('two', 'roads'),\n",
" ('roads', 'diverged'),\n",
" ('diverged', 'in'),\n",
" ('in', 'a'),\n",
" ('a', 'wood,'),\n",
" ('wood,', 'and'),\n",
" ('and', 'i---'),\n",
" ('i---', 'i'),\n",
" ('i', 'took'),\n",
" ('took', 'the'),\n",
" ('the', 'one'),\n",
" ('one', 'less'),\n",
" ('less', 'travelled'),\n",
" ('travelled', 'by,'),\n",
" ('by,', 'and'),\n",
" ('and', 'that'),\n",
" ('that', 'has'),\n",
" ('has', 'made'),\n",
" ('made', 'all'),\n",
" ('all', 'the'),\n",
" ('the', 'difference.')]"
]
},
"execution_count": 62,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"pairs"
]
},
{
"cell_type": "code",
"execution_count": 63,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"from collections import Counter"
]
},
{
"cell_type": "code",
"execution_count": 64,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"pair_count = Counter(pairs)"
]
},
{
"cell_type": "code",
"execution_count": 65,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Counter({('a', 'sigh'): 1,\n",
" ('a', 'wood,'): 1,\n",
" ('a', 'yellow'): 1,\n",
" ('about', 'the'): 1,\n",
" ('ages', 'and'): 1,\n",
" ('ages', 'hence:'): 1,\n",
" ('all', 'the'): 1,\n",
" ('and', 'ages'): 1,\n",
" ('and', 'be'): 1,\n",
" ('and', 'both'): 1,\n",
" ('and', 'having'): 1,\n",
" ('and', 'i---'): 1,\n",
" ('and', 'looked'): 1,\n",
" ('and', 'sorry'): 1,\n",
" ('and', 'that'): 1,\n",
" ('and', 'wanted'): 1,\n",
" ('another', 'day!'): 1,\n",
" ('as', 'fair,'): 1,\n",
" ('as', 'far'): 1,\n",
" ('as', 'for'): 1,\n",
" ('as', 'i'): 1,\n",
" ('as', 'just'): 1,\n",
" ('back.', 'i'): 1,\n",
" ('be', 'one'): 1,\n",
" ('be', 'telling'): 1,\n",
" ('because', 'it'): 1,\n",
" ('bent', 'in'): 1,\n",
" ('better', 'claim,'): 1,\n",
" ('black.', 'oh,'): 1,\n",
" ('both', 'and'): 1,\n",
" ('both', 'that'): 1,\n",
" ('by,', 'and'): 1,\n",
" ('claim,', 'because'): 1,\n",
" ('come', 'back.'): 1,\n",
" ('could', 'not'): 1,\n",
" ('could', 'to'): 1,\n",
" ('day!', 'yet'): 1,\n",
" ('diverged', 'in'): 2,\n",
" ('doubted', 'if'): 1,\n",
" ('down', 'one'): 1,\n",
" ('equally', 'lay'): 1,\n",
" ('ever', 'come'): 1,\n",
" ('fair,', 'and'): 1,\n",
" ('far', 'as'): 1,\n",
" ('first', 'for'): 1,\n",
" ('for', 'another'): 1,\n",
" ('for', 'that'): 1,\n",
" ('grassy', 'and'): 1,\n",
" ('had', 'trodden'): 1,\n",
" ('had', 'worn'): 1,\n",
" ('has', 'made'): 1,\n",
" ('having', 'perhaps'): 1,\n",
" ('hence:', 'two'): 1,\n",
" ('how', 'way'): 1,\n",
" ('i', 'could'): 2,\n",
" ('i', 'doubted'): 1,\n",
" ('i', 'kept'): 1,\n",
" ('i', 'shall'): 1,\n",
" ('i', 'should'): 1,\n",
" ('i', 'stood'): 1,\n",
" ('i', 'took'): 1,\n",
" ('i---', 'i'): 1,\n",
" ('if', 'i'): 1,\n",
" ('in', 'a'): 2,\n",
" ('in', 'leaves'): 1,\n",
" ('in', 'the'): 1,\n",
" ('it', 'bent'): 1,\n",
" ('it', 'was'): 1,\n",
" ('just', 'as'): 1,\n",
" ('kept', 'the'): 1,\n",
" ('knowing', 'how'): 1,\n",
" ('lay', 'in'): 1,\n",
" ('leads', 'on'): 1,\n",
" ('leaves', 'no'): 1,\n",
" ('less', 'travelled'): 1,\n",
" ('long', 'i'): 1,\n",
" ('looked', 'down'): 1,\n",
" ('made', 'all'): 1,\n",
" ('morning', 'equally'): 1,\n",
" ('no', 'step'): 1,\n",
" ('not', 'travel'): 1,\n",
" ('oh,', 'i'): 1,\n",
" ('on', 'to'): 1,\n",
" ('one', 'as'): 1,\n",
" ('one', 'less'): 1,\n",
" ('one', 'traveler,'): 1,\n",
" ('other,', 'as'): 1,\n",
" ('passing', 'there'): 1,\n",
" ('perhaps', 'the'): 1,\n",
" ('really', 'about'): 1,\n",
" ('roads', 'diverged'): 2,\n",
" ('same,', 'and'): 1,\n",
" ('shall', 'be'): 1,\n",
" ('should', 'ever'): 1,\n",
" ('sigh', 'somewhere'): 1,\n",
" ('somewhere', 'ages'): 1,\n",
" ('sorry', 'i'): 1,\n",
" ('step', 'had'): 1,\n",
" ('stood', 'and'): 1,\n",
" ('telling', 'this'): 1,\n",
" ('that', 'has'): 1,\n",
" ('that', 'morning'): 1,\n",
" ('that', 'the'): 1,\n",
" ('the', 'better'): 1,\n",
" ('the', 'difference.'): 1,\n",
" ('the', 'first'): 1,\n",
" ('the', 'one'): 1,\n",
" ('the', 'other,'): 1,\n",
" ('the', 'passing'): 1,\n",
" ('the', 'same,'): 1,\n",
" ('the', 'undergrowth;'): 1,\n",
" ('them', 'really'): 1,\n",
" ('then', 'took'): 1,\n",
" ('there', 'had'): 1,\n",
" ('this', 'with'): 1,\n",
" ('though', 'as'): 1,\n",
" ('to', 'way,'): 1,\n",
" ('to', 'where'): 1,\n",
" ('took', 'the'): 2,\n",
" ('travel', 'both'): 1,\n",
" ('traveler,', 'long'): 1,\n",
" ('travelled', 'by,'): 1,\n",
" ('trodden', 'black.'): 1,\n",
" ('two', 'roads'): 2,\n",
" ('undergrowth;', 'then'): 1,\n",
" ('wanted', 'wear;'): 1,\n",
" ('was', 'grassy'): 1,\n",
" ('way', 'leads'): 1,\n",
" ('way,', 'i'): 1,\n",
" ('wear;', 'though'): 1,\n",
" ('where', 'it'): 1,\n",
" ('with', 'a'): 1,\n",
" ('wood,', 'and'): 2,\n",
" ('worn', 'them'): 1,\n",
" ('yellow', 'wood,'): 1,\n",
" ('yet', 'knowing'): 1})"
]
},
"execution_count": 65,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"pair_count"
]
},
{
"cell_type": "code",
"execution_count": 66,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[(('two', 'roads'), 2),\n",
" (('roads', 'diverged'), 2),\n",
" (('diverged', 'in'), 2),\n",
" (('in', 'a'), 2),\n",
" (('wood,', 'and'), 2),\n",
" (('i', 'could'), 2),\n",
" (('took', 'the'), 2),\n",
" (('a', 'yellow'), 1),\n",
" (('yellow', 'wood,'), 1),\n",
" (('and', 'sorry'), 1)]"
]
},
"execution_count": 66,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"pair_count.most_common(10)"
]
},
{
"cell_type": "code",
"execution_count": 67,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# `with` closes the file handle as soon as the text has been read\n",
"with open(\"frost.txt\") as frost_file:\n",
"    frost_str = frost_file.read()"
]
},
{
"cell_type": "code",
"execution_count": 69,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'Two roads diverged in a yellow wood,\\nAnd sorry I could not travel both\\nAnd be one traveler, long I stood\\nAnd looked down one as far as I could\\nTo where it bent in the undergrowth;\\n\\nThen took the other, as just as fair,\\nAnd having perhaps the better claim,\\nBecause it was grassy and wanted wear;\\nThough as for that the passing there\\nHad worn them really about the same,\\n\\nAnd both that morning equally lay\\nIn leaves no step had trodden black.\\nOh, I kept the first for another day!\\nYet knowing how way leads on to way,\\nI doubted if I should ever come back.\\n\\nI shall be telling this with a sigh\\nSomewhere ages and ages hence:\\nTwo roads diverged in a wood, and I---\\nI took the one less travelled by,\\nAnd that has made all the difference.\\n'"
]
},
"execution_count": 69,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"frost_str"
]
},
{
"cell_type": "code",
"execution_count": 70,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"char_pairs = [(frost_str[i], frost_str[i+1]) \n",
" for i in range(len(frost_str) - 1)]"
]
},
{
"cell_type": "code",
"execution_count": 72,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[('T', 'w'),\n",
" ('w', 'o'),\n",
" ('o', ' '),\n",
" (' ', 'r'),\n",
" ('r', 'o'),\n",
" ('o', 'a'),\n",
" ('a', 'd'),\n",
" ('d', 's'),\n",
" ('s', ' '),\n",
" (' ', 'd'),\n",
" ('d', 'i'),\n",
" ('i', 'v'),\n",
" ('v', 'e'),\n",
" ('e', 'r'),\n",
" ('r', 'g'),\n",
" ('g', 'e'),\n",
" ('e', 'd'),\n",
" ('d', ' '),\n",
" (' ', 'i'),\n",
" ('i', 'n'),\n",
" ('n', ' '),\n",
" (' ', 'a'),\n",
" ('a', ' '),\n",
" (' ', 'y'),\n",
" ('y', 'e'),\n",
" ('e', 'l'),\n",
" ('l', 'l'),\n",
" ('l', 'o'),\n",
" ('o', 'w'),\n",
" ('w', ' '),\n",
" (' ', 'w'),\n",
" ('w', 'o'),\n",
" ('o', 'o'),\n",
" ('o', 'd'),\n",
" ('d', ','),\n",
" (',', '\\n'),\n",
" ('\\n', 'A'),\n",
" ('A', 'n'),\n",
" ('n', 'd'),\n",
" ('d', ' '),\n",
" (' ', 's'),\n",
" ('s', 'o'),\n",
" ('o', 'r'),\n",
" ('r', 'r'),\n",
" ('r', 'y'),\n",
" ('y', ' '),\n",
" (' ', 'I'),\n",
" ('I', ' '),\n",
" (' ', 'c'),\n",
" ('c', 'o'),\n",
" ('o', 'u'),\n",
" ('u', 'l'),\n",
" ('l', 'd'),\n",
" ('d', ' '),\n",
" (' ', 'n'),\n",
" ('n', 'o'),\n",
" ('o', 't'),\n",
" ('t', ' '),\n",
" (' ', 't'),\n",
" ('t', 'r'),\n",
" ('r', 'a'),\n",
" ('a', 'v'),\n",
" ('v', 'e'),\n",
" ('e', 'l'),\n",
" ('l', ' '),\n",
" (' ', 'b'),\n",
" ('b', 'o'),\n",
" ('o', 't'),\n",
" ('t', 'h'),\n",
" ('h', '\\n'),\n",
" ('\\n', 'A'),\n",
" ('A', 'n'),\n",
" ('n', 'd'),\n",
" ('d', ' '),\n",
" (' ', 'b'),\n",
" ('b', 'e'),\n",
" ('e', ' '),\n",
" (' ', 'o'),\n",
" ('o', 'n'),\n",
" ('n', 'e'),\n",
" ('e', ' '),\n",
" (' ', 't'),\n",
" ('t', 'r'),\n",
" ('r', 'a'),\n",
" ('a', 'v'),\n",
" ('v', 'e'),\n",
" ('e', 'l'),\n",
" ('l', 'e'),\n",
" ('e', 'r'),\n",
" ('r', ','),\n",
" (',', ' '),\n",
" (' ', 'l'),\n",
" ('l', 'o'),\n",
" ('o', 'n'),\n",
" ('n', 'g'),\n",
" ('g', ' '),\n",
" (' ', 'I'),\n",
" ('I', ' '),\n",
" (' ', 's'),\n",
" ('s', 't'),\n",
" ('t', 'o'),\n",
" ('o', 'o'),\n",
" ('o', 'd'),\n",
" ('d', '\\n'),\n",
" ('\\n', 'A'),\n",
" ('A', 'n'),\n",
" ('n', 'd'),\n",
" ('d', ' '),\n",
" (' ', 'l'),\n",
" ('l', 'o'),\n",
" ('o', 'o'),\n",
" ('o', 'k'),\n",
" ('k', 'e'),\n",
" ('e', 'd'),\n",
" ('d', ' '),\n",
" (' ', 'd'),\n",
" ('d', 'o'),\n",
" ('o', 'w'),\n",
" ('w', 'n'),\n",
" ('n', ' '),\n",
" (' ', 'o'),\n",
" ('o', 'n'),\n",
" ('n', 'e'),\n",
" ('e', ' '),\n",
" (' ', 'a'),\n",
" ('a', 's'),\n",
" ('s', ' '),\n",
" (' ', 'f'),\n",
" ('f', 'a'),\n",
" ('a', 'r'),\n",
" ('r', ' '),\n",
" (' ', 'a'),\n",
" ('a', 's'),\n",
" ('s', ' '),\n",
" (' ', 'I'),\n",
" ('I', ' '),\n",
" (' ', 'c'),\n",
" ('c', 'o'),\n",
" ('o', 'u'),\n",
" ('u', 'l'),\n",
" ('l', 'd'),\n",
" ('d', '\\n'),\n",
" ('\\n', 'T'),\n",
" ('T', 'o'),\n",
" ('o', ' '),\n",
" (' ', 'w'),\n",
" ('w', 'h'),\n",
" ('h', 'e'),\n",
" ('e', 'r'),\n",
" ('r', 'e'),\n",
" ('e', ' '),\n",
" (' ', 'i'),\n",
" ('i', 't'),\n",
" ('t', ' '),\n",
" (' ', 'b'),\n",
" ('b', 'e'),\n",
" ('e', 'n'),\n",
" ('n', 't'),\n",
" ('t', ' '),\n",
" (' ', 'i'),\n",
" ('i', 'n'),\n",
" ('n', ' '),\n",
" (' ', 't'),\n",
" ('t', 'h'),\n",
" ('h', 'e'),\n",
" ('e', ' '),\n",
" (' ', 'u'),\n",
" ('u', 'n'),\n",
" ('n', 'd'),\n",
" ('d', 'e'),\n",
" ('e', 'r'),\n",
" ('r', 'g'),\n",
" ('g', 'r'),\n",
" ('r', 'o'),\n",
" ('o', 'w'),\n",
" ('w', 't'),\n",
" ('t', 'h'),\n",
" ('h', ';'),\n",
" (';', '\\n'),\n",
" ('\\n', '\\n'),\n",
" ('\\n', 'T'),\n",
" ('T', 'h'),\n",
" ('h', 'e'),\n",
" ('e', 'n'),\n",
" ('n', ' '),\n",
" (' ', 't'),\n",
" ('t', 'o'),\n",
" ('o', 'o'),\n",
" ('o', 'k'),\n",
" ('k', ' '),\n",
" (' ', 't'),\n",
" ('t', 'h'),\n",
" ('h', 'e'),\n",
" ('e', ' '),\n",
" (' ', 'o'),\n",
" ('o', 't'),\n",
" ('t', 'h'),\n",
" ('h', 'e'),\n",
" ('e', 'r'),\n",
" ('r', ','),\n",
" (',', ' '),\n",
" (' ', 'a'),\n",
" ('a', 's'),\n",
" ('s', ' '),\n",
" (' ', 'j'),\n",
" ('j', 'u'),\n",
" ('u', 's'),\n",
" ('s', 't'),\n",
" ('t', ' '),\n",
" (' ', 'a'),\n",
" ('a', 's'),\n",
" ('s', ' '),\n",
" (' ', 'f'),\n",
" ('f', 'a'),\n",
" ('a', 'i'),\n",
" ('i', 'r'),\n",
" ('r', ','),\n",
" (',', '\\n'),\n",
" ('\\n', 'A'),\n",
" ('A', 'n'),\n",
" ('n', 'd'),\n",
" ('d', ' '),\n",
" (' ', 'h'),\n",
" ('h', 'a'),\n",
" ('a', 'v'),\n",
" ('v', 'i'),\n",
" ('i', 'n'),\n",
" ('n', 'g'),\n",
" ('g', ' '),\n",
" (' ', 'p'),\n",
" ('p', 'e'),\n",
" ('e', 'r'),\n",
" ('r', 'h'),\n",
" ('h', 'a'),\n",
" ('a', 'p'),\n",
" ('p', 's'),\n",
" ('s', ' '),\n",
" (' ', 't'),\n",
" ('t', 'h'),\n",
" ('h', 'e'),\n",
" ('e', ' '),\n",
" (' ', 'b'),\n",
" ('b', 'e'),\n",
" ('e', 't'),\n",
" ('t', 't'),\n",
" ('t', 'e'),\n",
" ('e', 'r'),\n",
" ('r', ' '),\n",
" (' ', 'c'),\n",
" ('c', 'l'),\n",
" ('l', 'a'),\n",
" ('a', 'i'),\n",
" ('i', 'm'),\n",
" ('m', ','),\n",
" (',', '\\n'),\n",
" ('\\n', 'B'),\n",
" ('B', 'e'),\n",
" ('e', 'c'),\n",
" ('c', 'a'),\n",
" ('a', 'u'),\n",
" ('u', 's'),\n",
" ('s', 'e'),\n",
" ('e', ' '),\n",
" (' ', 'i'),\n",
" ('i', 't'),\n",
" ('t', ' '),\n",
" (' ', 'w'),\n",
" ('w', 'a'),\n",
" ('a', 's'),\n",
" ('s', ' '),\n",
" (' ', 'g'),\n",
" ('g', 'r'),\n",
" ('r', 'a'),\n",
" ('a', 's'),\n",
" ('s', 's'),\n",
" ('s', 'y'),\n",
" ('y', ' '),\n",
" (' ', 'a'),\n",
" ('a', 'n'),\n",
" ('n', 'd'),\n",
" ('d', ' '),\n",
" (' ', 'w'),\n",
" ('w', 'a'),\n",
" ('a', 'n'),\n",
" ('n', 't'),\n",
" ('t', 'e'),\n",
" ('e', 'd'),\n",
" ('d', ' '),\n",
" (' ', 'w'),\n",
" ('w', 'e'),\n",
" ('e', 'a'),\n",
" ('a', 'r'),\n",
" ('r', ';'),\n",
" (';', '\\n'),\n",
" ('\\n', 'T'),\n",
" ('T', 'h'),\n",
" ('h', 'o'),\n",
" ('o', 'u'),\n",
" ('u', 'g'),\n",
" ('g', 'h'),\n",
" ('h', ' '),\n",
" (' ', 'a'),\n",
" ('a', 's'),\n",
" ('s', ' '),\n",
" (' ', 'f'),\n",
" ('f', 'o'),\n",
" ('o', 'r'),\n",
" ('r', ' '),\n",
" (' ', 't'),\n",
" ('t', 'h'),\n",
" ('h', 'a'),\n",
" ('a', 't'),\n",
" ('t', ' '),\n",
" (' ', 't'),\n",
" ('t', 'h'),\n",
" ('h', 'e'),\n",
" ('e', ' '),\n",
" (' ', 'p'),\n",
" ('p', 'a'),\n",
" ('a', 's'),\n",
" ('s', 's'),\n",
" ('s', 'i'),\n",
" ('i', 'n'),\n",
" ('n', 'g'),\n",
" ('g', ' '),\n",
" (' ', 't'),\n",
" ('t', 'h'),\n",
" ('h', 'e'),\n",
" ('e', 'r'),\n",
" ('r', 'e'),\n",
" ('e', '\\n'),\n",
" ('\\n', 'H'),\n",
" ('H', 'a'),\n",
" ('a', 'd'),\n",
" ('d', ' '),\n",
" (' ', 'w'),\n",
" ('w', 'o'),\n",
" ('o', 'r'),\n",
" ('r', 'n'),\n",
" ('n', ' '),\n",
" (' ', 't'),\n",
" ('t', 'h'),\n",
" ('h', 'e'),\n",
" ('e', 'm'),\n",
" ('m', ' '),\n",
" (' ', 'r'),\n",
" ('r', 'e'),\n",
" ('e', 'a'),\n",
" ('a', 'l'),\n",
" ('l', 'l'),\n",
" ('l', 'y'),\n",
" ('y', ' '),\n",
" (' ', 'a'),\n",
" ('a', 'b'),\n",
" ('b', 'o'),\n",
" ('o', 'u'),\n",
" ('u', 't'),\n",
" ('t', ' '),\n",
" (' ', 't'),\n",
" ('t', 'h'),\n",
" ('h', 'e'),\n",
" ('e', ' '),\n",
" (' ', 's'),\n",
" ('s', 'a'),\n",
" ('a', 'm'),\n",
" ('m', 'e'),\n",
" ('e', ','),\n",
" (',', '\\n'),\n",
" ('\\n', '\\n'),\n",
" ('\\n', 'A'),\n",
" ('A', 'n'),\n",
" ('n', 'd'),\n",
" ('d', ' '),\n",
" (' ', 'b'),\n",
" ('b', 'o'),\n",
" ('o', 't'),\n",
" ('t', 'h'),\n",
" ('h', ' '),\n",
" (' ', 't'),\n",
" ('t', 'h'),\n",
" ('h', 'a'),\n",
" ('a', 't'),\n",
" ('t', ' '),\n",
" (' ', 'm'),\n",
" ('m', 'o'),\n",
" ('o', 'r'),\n",
" ('r', 'n'),\n",
" ('n', 'i'),\n",
" ('i', 'n'),\n",
" ('n', 'g'),\n",
" ('g', ' '),\n",
" (' ', 'e'),\n",
" ('e', 'q'),\n",
" ('q', 'u'),\n",
" ('u', 'a'),\n",
" ('a', 'l'),\n",
" ('l', 'l'),\n",
" ('l', 'y'),\n",
" ('y', ' '),\n",
" (' ', 'l'),\n",
" ('l', 'a'),\n",
" ('a', 'y'),\n",
" ('y', '\\n'),\n",
" ('\\n', 'I'),\n",
" ('I', 'n'),\n",
" ('n', ' '),\n",
" (' ', 'l'),\n",
" ('l', 'e'),\n",
" ('e', 'a'),\n",
" ('a', 'v'),\n",
" ('v', 'e'),\n",
" ('e', 's'),\n",
" ('s', ' '),\n",
" (' ', 'n'),\n",
" ('n', 'o'),\n",
" ('o', ' '),\n",
" (' ', 's'),\n",
" ('s', 't'),\n",
" ('t', 'e'),\n",
" ('e', 'p'),\n",
" ('p', ' '),\n",
" (' ', 'h'),\n",
" ('h', 'a'),\n",
" ('a', 'd'),\n",
" ('d', ' '),\n",
" (' ', 't'),\n",
" ('t', 'r'),\n",
" ('r', 'o'),\n",
" ('o', 'd'),\n",
" ('d', 'd'),\n",
" ('d', 'e'),\n",
" ('e', 'n'),\n",
" ('n', ' '),\n",
" (' ', 'b'),\n",
" ('b', 'l'),\n",
" ('l', 'a'),\n",
" ('a', 'c'),\n",
" ('c', 'k'),\n",
" ('k', '.'),\n",
" ('.', '\\n'),\n",
" ('\\n', 'O'),\n",
" ('O', 'h'),\n",
" ('h', ','),\n",
" (',', ' '),\n",
" (' ', 'I'),\n",
" ('I', ' '),\n",
" (' ', 'k'),\n",
" ('k', 'e'),\n",
" ('e', 'p'),\n",
" ('p', 't'),\n",
" ('t', ' '),\n",
" (' ', 't'),\n",
" ('t', 'h'),\n",
" ('h', 'e'),\n",
" ('e', ' '),\n",
" (' ', 'f'),\n",
" ('f', 'i'),\n",
" ('i', 'r'),\n",
" ('r', 's'),\n",
" ('s', 't'),\n",
" ('t', ' '),\n",
" (' ', 'f'),\n",
" ('f', 'o'),\n",
" ('o', 'r'),\n",
" ('r', ' '),\n",
" (' ', 'a'),\n",
" ('a', 'n'),\n",
" ('n', 'o'),\n",
" ('o', 't'),\n",
" ('t', 'h'),\n",
" ('h', 'e'),\n",
" ('e', 'r'),\n",
" ('r', ' '),\n",
" (' ', 'd'),\n",
" ('d', 'a'),\n",
" ('a', 'y'),\n",
" ('y', '!'),\n",
" ('!', '\\n'),\n",
" ('\\n', 'Y'),\n",
" ('Y', 'e'),\n",
" ('e', 't'),\n",
" ('t', ' '),\n",
" (' ', 'k'),\n",
" ('k', 'n'),\n",
" ('n', 'o'),\n",
" ('o', 'w'),\n",
" ('w', 'i'),\n",
" ('i', 'n'),\n",
" ('n', 'g'),\n",
" ('g', ' '),\n",
" (' ', 'h'),\n",
" ('h', 'o'),\n",
" ('o', 'w'),\n",
" ('w', ' '),\n",
" (' ', 'w'),\n",
" ('w', 'a'),\n",
" ('a', 'y'),\n",
" ('y', ' '),\n",
" (' ', 'l'),\n",
" ('l', 'e'),\n",
" ('e', 'a'),\n",
" ('a', 'd'),\n",
" ('d', 's'),\n",
" ('s', ' '),\n",
" (' ', 'o'),\n",
" ('o', 'n'),\n",
" ('n', ' '),\n",
" (' ', 't'),\n",
" ('t', 'o'),\n",
" ('o', ' '),\n",
" (' ', 'w'),\n",
" ('w', 'a'),\n",
" ('a', 'y'),\n",
" ('y', ','),\n",
" (',', '\\n'),\n",
" ('\\n', 'I'),\n",
" ('I', ' '),\n",
" (' ', 'd'),\n",
" ('d', 'o'),\n",
" ('o', 'u'),\n",
" ('u', 'b'),\n",
" ('b', 't'),\n",
" ('t', 'e'),\n",
" ('e', 'd'),\n",
" ('d', ' '),\n",
" (' ', 'i'),\n",
" ('i', 'f'),\n",
" ('f', ' '),\n",
" (' ', 'I'),\n",
" ('I', ' '),\n",
" (' ', 's'),\n",
" ('s', 'h'),\n",
" ('h', 'o'),\n",
" ('o', 'u'),\n",
" ('u', 'l'),\n",
" ('l', 'd'),\n",
" ('d', ' '),\n",
" (' ', 'e'),\n",
" ('e', 'v'),\n",
" ('v', 'e'),\n",
" ('e', 'r'),\n",
" ('r', ' '),\n",
" (' ', 'c'),\n",
" ('c', 'o'),\n",
" ('o', 'm'),\n",
" ('m', 'e'),\n",
" ('e', ' '),\n",
" (' ', 'b'),\n",
" ('b', 'a'),\n",
" ('a', 'c'),\n",
" ('c', 'k'),\n",
" ('k', '.'),\n",
" ('.', '\\n'),\n",
" ('\\n', '\\n'),\n",
" ('\\n', 'I'),\n",
" ('I', ' '),\n",
" (' ', 's'),\n",
" ('s', 'h'),\n",
" ('h', 'a'),\n",
" ('a', 'l'),\n",
" ('l', 'l'),\n",
" ('l', ' '),\n",
" (' ', 'b'),\n",
" ('b', 'e'),\n",
" ('e', ' '),\n",
" (' ', 't'),\n",
" ('t', 'e'),\n",
" ('e', 'l'),\n",
" ('l', 'l'),\n",
" ('l', 'i'),\n",
" ('i', 'n'),\n",
" ('n', 'g'),\n",
" ('g', ' '),\n",
" (' ', 't'),\n",
" ('t', 'h'),\n",
" ('h', 'i'),\n",
" ('i', 's'),\n",
" ('s', ' '),\n",
" (' ', 'w'),\n",
" ('w', 'i'),\n",
" ('i', 't'),\n",
" ('t', 'h'),\n",
" ('h', ' '),\n",
" (' ', 'a'),\n",
" ('a', ' '),\n",
" (' ', 's'),\n",
" ('s', 'i'),\n",
" ('i', 'g'),\n",
" ('g', 'h'),\n",
" ('h', '\\n'),\n",
" ('\\n', 'S'),\n",
" ('S', 'o'),\n",
" ('o', 'm'),\n",
" ('m', 'e'),\n",
" ('e', 'w'),\n",
" ('w', 'h'),\n",
" ('h', 'e'),\n",
" ('e', 'r'),\n",
" ('r', 'e'),\n",
" ('e', ' '),\n",
" (' ', 'a'),\n",
" ('a', 'g'),\n",
" ('g', 'e'),\n",
" ('e', 's'),\n",
" ('s', ' '),\n",
" (' ', 'a'),\n",
" ('a', 'n'),\n",
" ('n', 'd'),\n",
" ('d', ' '),\n",
" (' ', 'a'),\n",
" ('a', 'g'),\n",
" ('g', 'e'),\n",
" ('e', 's'),\n",
" ('s', ' '),\n",
" (' ', 'h'),\n",
" ('h', 'e'),\n",
" ('e', 'n'),\n",
" ('n', 'c'),\n",
" ('c', 'e'),\n",
" ('e', ':'),\n",
" (':', '\\n'),\n",
" ('\\n', 'T'),\n",
" ('T', 'w'),\n",
" ('w', 'o'),\n",
" ('o', ' '),\n",
" (' ', 'r'),\n",
" ('r', 'o'),\n",
" ('o', 'a'),\n",
" ('a', 'd'),\n",
" ('d', 's'),\n",
" ('s', ' '),\n",
" (' ', 'd'),\n",
" ('d', 'i'),\n",
" ('i', 'v'),\n",
" ('v', 'e'),\n",
" ('e', 'r'),\n",
" ('r', 'g'),\n",
" ('g', 'e'),\n",
" ('e', 'd'),\n",
" ('d', ' '),\n",
" (' ', 'i'),\n",
" ('i', 'n'),\n",
" ('n', ' '),\n",
" (' ', 'a'),\n",
" ('a', ' '),\n",
" (' ', 'w'),\n",
" ('w', 'o'),\n",
" ('o', 'o'),\n",
" ('o', 'd'),\n",
" ('d', ','),\n",
" (',', ' '),\n",
" (' ', 'a'),\n",
" ('a', 'n'),\n",
" ('n', 'd'),\n",
" ('d', ' '),\n",
" (' ', 'I'),\n",
" ('I', '-'),\n",
" ('-', '-'),\n",
" ('-', '-'),\n",
" ('-', '\\n'),\n",
" ('\\n', 'I'),\n",
" ('I', ' '),\n",
" (' ', 't'),\n",
" ('t', 'o'),\n",
" ('o', 'o'),\n",
" ('o', 'k'),\n",
" ('k', ' '),\n",
" (' ', 't'),\n",
" ('t', 'h'),\n",
" ('h', 'e'),\n",
" ('e', ' '),\n",
" (' ', 'o'),\n",
" ('o', 'n'),\n",
" ('n', 'e'),\n",
" ('e', ' '),\n",
" (' ', 'l'),\n",
" ('l', 'e'),\n",
" ('e', 's'),\n",
" ('s', 's'),\n",
" ('s', ' '),\n",
" (' ', 't'),\n",
" ('t', 'r'),\n",
" ('r', 'a'),\n",
" ('a', 'v'),\n",
" ('v', 'e'),\n",
" ('e', 'l'),\n",
" ('l', 'l'),\n",
" ('l', 'e'),\n",
" ('e', 'd'),\n",
" ('d', ' '),\n",
" (' ', 'b'),\n",
" ('b', 'y'),\n",
" ('y', ','),\n",
" (',', '\\n'),\n",
" ('\\n', 'A'),\n",
" ('A', 'n'),\n",
" ('n', 'd'),\n",
" ('d', ' '),\n",
" (' ', 't'),\n",
" ('t', 'h'),\n",
" ('h', 'a'),\n",
" ('a', 't'),\n",
" ('t', ' '),\n",
" (' ', 'h'),\n",
" ('h', 'a'),\n",
" ('a', 's'),\n",
" ('s', ' '),\n",
" (' ', 'm'),\n",
" ('m', 'a'),\n",
" ('a', 'd'),\n",
" ('d', 'e'),\n",
" ('e', ' '),\n",
" (' ', 'a'),\n",
" ('a', 'l'),\n",
" ('l', 'l'),\n",
" ('l', ' '),\n",
" (' ', 't'),\n",
" ('t', 'h'),\n",
" ('h', 'e'),\n",
" ('e', ' '),\n",
" (' ', 'd'),\n",
" ('d', 'i'),\n",
" ('i', 'f'),\n",
" ('f', 'f'),\n",
" ('f', 'e'),\n",
" ('e', 'r'),\n",
" ('r', 'e'),\n",
" ('e', 'n'),\n",
" ('n', 'c'),\n",
" ('c', 'e'),\n",
" ('e', '.'),\n",
" ('.', '\\n')]"
]
},
"execution_count": 72,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"char_pairs"
]
},
{
"cell_type": "code",
"execution_count": 74,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[((' ', 't'), 22),\n",
" (('t', 'h'), 20),\n",
" (('d', ' '), 19),\n",
" (('e', ' '), 18),\n",
" (('s', ' '), 16),\n",
" ((' ', 'a'), 16),\n",
" (('h', 'e'), 16),\n",
" (('e', 'r'), 13),\n",
" (('t', ' '), 12),\n",
" ((' ', 'w'), 10),\n",
" (('n', 'd'), 10),\n",
" (('n', ' '), 9),\n",
" ((' ', 'b'), 9),\n",
" (('a', 's'), 9),\n",
" (('i', 'n'), 8),\n",
" (('I', ' '), 8),\n",
" (('h', 'a'), 8),\n",
" (('v', 'e'), 7),\n",
" (('l', 'l'), 7),\n",
" ((' ', 's'), 7),\n",
" (('a', 'd'), 6),\n",
" ((' ', 'd'), 6),\n",
" (('e', 'd'), 6),\n",
" ((' ', 'i'), 6),\n",
" (('o', 'o'), 6),\n",
" ((',', '\\n'), 6),\n",
" (('\\n', 'A'), 6),\n",
" (('A', 'n'), 6),\n",
" ((' ', 'I'), 6),\n",
" (('o', 'u'), 6),\n",
" ((' ', 'l'), 6),\n",
" (('n', 'g'), 6),\n",
" (('g', ' '), 6),\n",
" (('r', ' '), 6),\n",
" (('w', 'o'), 5),\n",
" (('o', ' '), 5),\n",
" (('e', 'l'), 5),\n",
" (('o', 'w'), 5),\n",
" (('o', 'r'), 5),\n",
" (('y', ' '), 5),\n",
" (('o', 't'), 5),\n",
" (('a', 'v'), 5),\n",
" ((' ', 'o'), 5),\n",
" (('o', 'n'), 5),\n",
" (('l', 'e'), 5),\n",
" ((' ', 'f'), 5),\n",
" (('r', 'e'), 5),\n",
" (('e', 'n'), 5),\n",
" ((' ', 'h'), 5),\n",
" (('t', 'e'), 5),\n",
" (('a', 'n'), 5),\n",
" (('r', 'o'), 4),\n",
" (('g', 'e'), 4),\n",
" (('o', 'd'), 4),\n",
" ((' ', 'c'), 4),\n",
" (('n', 'o'), 4),\n",
" (('t', 'r'), 4),\n",
" (('r', 'a'), 4),\n",
" (('b', 'e'), 4),\n",
" ((',', ' '), 4),\n",
" (('s', 't'), 4),\n",
" (('t', 'o'), 4),\n",
" (('\\n', 'T'), 4),\n",
" (('w', 'a'), 4),\n",
" (('e', 'a'), 4),\n",
" (('a', 'l'), 4),\n",
" (('a', 'y'), 4),\n",
" (('\\n', 'I'), 4),\n",
" (('e', 's'), 4),\n",
" ((' ', 'r'), 3),\n",
" (('d', 's'), 3),\n",
" (('d', 'i'), 3),\n",
" (('r', 'g'), 3),\n",
" (('a', ' '), 3),\n",
" (('l', 'o'), 3),\n",
" (('c', 'o'), 3),\n",
" (('u', 'l'), 3),\n",
" (('l', 'd'), 3),\n",
" (('l', ' '), 3),\n",
" (('b', 'o'), 3),\n",
" (('n', 'e'), 3),\n",
" (('r', ','), 3),\n",
" (('o', 'k'), 3),\n",
" (('i', 't'), 3),\n",
" (('d', 'e'), 3),\n",
" (('\\n', '\\n'), 3),\n",
" (('l', 'a'), 3),\n",
" (('s', 's'), 3),\n",
" (('h', 'o'), 3),\n",
" (('h', ' '), 3),\n",
" (('a', 't'), 3),\n",
" (('m', 'e'), 3),\n",
" (('.', '\\n'), 3),\n",
" (('T', 'w'), 2),\n",
" (('o', 'a'), 2),\n",
" (('i', 'v'), 2),\n",
" (('w', ' '), 2),\n",
" (('d', ','), 2),\n",
" ((' ', 'n'), 2),\n",
" (('h', '\\n'), 2),\n",
" (('d', '\\n'), 2),\n",
" (('k', 'e'), 2),\n",
" (('d', 'o'), 2),\n",
" (('f', 'a'), 2),\n",
" (('a', 'r'), 2),\n",
" (('w', 'h'), 2),\n",
" (('n', 't'), 2),\n",
" (('g', 'r'), 2),\n",
" ((';', '\\n'), 2),\n",
" (('T', 'h'), 2),\n",
" (('k', ' '), 2),\n",
" (('u', 's'), 2),\n",
" (('a', 'i'), 2),\n",
" (('i', 'r'), 2),\n",
" ((' ', 'p'), 2),\n",
" (('e', 't'), 2),\n",
" (('g', 'h'), 2),\n",
" (('f', 'o'), 2),\n",
" (('s', 'i'), 2),\n",
" (('r', 'n'), 2),\n",
" (('l', 'y'), 2),\n",
" ((' ', 'm'), 2),\n",
" ((' ', 'e'), 2),\n",
" (('e', 'p'), 2),\n",
" (('a', 'c'), 2),\n",
" (('c', 'k'), 2),\n",
" (('k', '.'), 2),\n",
" ((' ', 'k'), 2),\n",
" (('w', 'i'), 2),\n",
" (('y', ','), 2),\n",
" (('i', 'f'), 2),\n",
" (('s', 'h'), 2),\n",
" (('o', 'm'), 2),\n",
" (('a', 'g'), 2),\n",
" (('n', 'c'), 2),\n",
" (('c', 'e'), 2),\n",
" (('-', '-'), 2),\n",
" ((' ', 'y'), 1),\n",
" (('y', 'e'), 1),\n",
" (('s', 'o'), 1),\n",
" (('r', 'r'), 1),\n",
" (('r', 'y'), 1),\n",
" (('w', 'n'), 1),\n",
" (('T', 'o'), 1),\n",
" ((' ', 'u'), 1),\n",
" (('u', 'n'), 1),\n",
" (('w', 't'), 1),\n",
" (('h', ';'), 1),\n",
" ((' ', 'j'), 1),\n",
" (('j', 'u'), 1),\n",
" (('v', 'i'), 1),\n",
" (('p', 'e'), 1),\n",
" (('r', 'h'), 1),\n",
" (('a', 'p'), 1),\n",
" (('p', 's'), 1),\n",
" (('t', 't'), 1),\n",
" (('c', 'l'), 1),\n",
" (('i', 'm'), 1),\n",
" (('m', ','), 1),\n",
" (('\\n', 'B'), 1),\n",
" (('B', 'e'), 1),\n",
" (('e', 'c'), 1),\n",
" (('c', 'a'), 1),\n",
" (('a', 'u'), 1),\n",
" (('s', 'e'), 1),\n",
" ((' ', 'g'), 1),\n",
" (('s', 'y'), 1),\n",
" (('w', 'e'), 1),\n",
" (('r', ';'), 1),\n",
" (('u', 'g'), 1),\n",
" (('p', 'a'), 1),\n",
" (('e', '\\n'), 1),\n",
" (('\\n', 'H'), 1),\n",
" (('H', 'a'), 1),\n",
" (('e', 'm'), 1),\n",
" (('m', ' '), 1),\n",
" (('a', 'b'), 1),\n",
" (('u', 't'), 1),\n",
" (('s', 'a'), 1),\n",
" (('a', 'm'), 1),\n",
" (('e', ','), 1),\n",
" (('m', 'o'), 1),\n",
" (('n', 'i'), 1),\n",
" (('e', 'q'), 1),\n",
" (('q', 'u'), 1),\n",
" (('u', 'a'), 1),\n",
" (('y', '\\n'), 1),\n",
" (('I', 'n'), 1),\n",
" (('p', ' '), 1),\n",
" (('d', 'd'), 1),\n",
" (('b', 'l'), 1),\n",
" (('\\n', 'O'), 1),\n",
" (('O', 'h'), 1),\n",
" (('h', ','), 1),\n",
" (('p', 't'), 1),\n",
" (('f', 'i'), 1),\n",
" (('r', 's'), 1),\n",
" (('d', 'a'), 1),\n",
" (('y', '!'), 1),\n",
" (('!', '\\n'), 1),\n",
" (('\\n', 'Y'), 1),\n",
" (('Y', 'e'), 1),\n",
" (('k', 'n'), 1),\n",
" (('u', 'b'), 1),\n",
" (('b', 't'), 1),\n",
" (('f', ' '), 1),\n",
" (('e', 'v'), 1),\n",
" (('b', 'a'), 1),\n",
" (('l', 'i'), 1),\n",
" (('h', 'i'), 1),\n",
" (('i', 's'), 1),\n",
" (('i', 'g'), 1),\n",
" (('\\n', 'S'), 1),\n",
" (('S', 'o'), 1),\n",
" (('e', 'w'), 1),\n",
" (('e', ':'), 1),\n",
" ((':', '\\n'), 1),\n",
" (('I', '-'), 1),\n",
" (('-', '\\n'), 1),\n",
" (('b', 'y'), 1),\n",
" (('m', 'a'), 1),\n",
" (('f', 'f'), 1),\n",
" (('f', 'e'), 1),\n",
" (('e', '.'), 1)]"
]
},
"execution_count": 74,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"Counter(char_pairs).most_common()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## n-grams of arbitrary length"
]
},
{
"cell_type": "code",
"execution_count": 75,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# every run of seven consecutive words, in order of appearance\n",
"seven_grams = [\n",
"    tuple(words[start:start + 7])\n",
"    for start in range(len(words) - 6)  # last full window starts at len(words) - 7\n",
"]"
]
},
{
"cell_type": "code",
"execution_count": 77,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[('two', 'roads', 'diverged', 'in', 'a', 'yellow', 'wood,'),\n",
" ('roads', 'diverged', 'in', 'a', 'yellow', 'wood,', 'and'),\n",
" ('diverged', 'in', 'a', 'yellow', 'wood,', 'and', 'sorry'),\n",
" ('in', 'a', 'yellow', 'wood,', 'and', 'sorry', 'i'),\n",
" ('a', 'yellow', 'wood,', 'and', 'sorry', 'i', 'could'),\n",
" ('yellow', 'wood,', 'and', 'sorry', 'i', 'could', 'not'),\n",
" ('wood,', 'and', 'sorry', 'i', 'could', 'not', 'travel'),\n",
" ('and', 'sorry', 'i', 'could', 'not', 'travel', 'both'),\n",
" ('sorry', 'i', 'could', 'not', 'travel', 'both', 'and'),\n",
" ('i', 'could', 'not', 'travel', 'both', 'and', 'be'),\n",
" ('could', 'not', 'travel', 'both', 'and', 'be', 'one'),\n",
" ('not', 'travel', 'both', 'and', 'be', 'one', 'traveler,'),\n",
" ('travel', 'both', 'and', 'be', 'one', 'traveler,', 'long'),\n",
" ('both', 'and', 'be', 'one', 'traveler,', 'long', 'i'),\n",
" ('and', 'be', 'one', 'traveler,', 'long', 'i', 'stood'),\n",
" ('be', 'one', 'traveler,', 'long', 'i', 'stood', 'and'),\n",
" ('one', 'traveler,', 'long', 'i', 'stood', 'and', 'looked'),\n",
" ('traveler,', 'long', 'i', 'stood', 'and', 'looked', 'down'),\n",
" ('long', 'i', 'stood', 'and', 'looked', 'down', 'one'),\n",
" ('i', 'stood', 'and', 'looked', 'down', 'one', 'as'),\n",
" ('stood', 'and', 'looked', 'down', 'one', 'as', 'far'),\n",
" ('and', 'looked', 'down', 'one', 'as', 'far', 'as'),\n",
" ('looked', 'down', 'one', 'as', 'far', 'as', 'i'),\n",
" ('down', 'one', 'as', 'far', 'as', 'i', 'could'),\n",
" ('one', 'as', 'far', 'as', 'i', 'could', 'to'),\n",
" ('as', 'far', 'as', 'i', 'could', 'to', 'where'),\n",
" ('far', 'as', 'i', 'could', 'to', 'where', 'it'),\n",
" ('as', 'i', 'could', 'to', 'where', 'it', 'bent'),\n",
" ('i', 'could', 'to', 'where', 'it', 'bent', 'in'),\n",
" ('could', 'to', 'where', 'it', 'bent', 'in', 'the'),\n",
" ('to', 'where', 'it', 'bent', 'in', 'the', 'undergrowth;'),\n",
" ('where', 'it', 'bent', 'in', 'the', 'undergrowth;', 'then'),\n",
" ('it', 'bent', 'in', 'the', 'undergrowth;', 'then', 'took'),\n",
" ('bent', 'in', 'the', 'undergrowth;', 'then', 'took', 'the'),\n",
" ('in', 'the', 'undergrowth;', 'then', 'took', 'the', 'other,'),\n",
" ('the', 'undergrowth;', 'then', 'took', 'the', 'other,', 'as'),\n",
" ('undergrowth;', 'then', 'took', 'the', 'other,', 'as', 'just'),\n",
" ('then', 'took', 'the', 'other,', 'as', 'just', 'as'),\n",
" ('took', 'the', 'other,', 'as', 'just', 'as', 'fair,'),\n",
" ('the', 'other,', 'as', 'just', 'as', 'fair,', 'and'),\n",
" ('other,', 'as', 'just', 'as', 'fair,', 'and', 'having'),\n",
" ('as', 'just', 'as', 'fair,', 'and', 'having', 'perhaps'),\n",
" ('just', 'as', 'fair,', 'and', 'having', 'perhaps', 'the'),\n",
" ('as', 'fair,', 'and', 'having', 'perhaps', 'the', 'better'),\n",
" ('fair,', 'and', 'having', 'perhaps', 'the', 'better', 'claim,'),\n",
" ('and', 'having', 'perhaps', 'the', 'better', 'claim,', 'because'),\n",
" ('having', 'perhaps', 'the', 'better', 'claim,', 'because', 'it'),\n",
" ('perhaps', 'the', 'better', 'claim,', 'because', 'it', 'was'),\n",
" ('the', 'better', 'claim,', 'because', 'it', 'was', 'grassy'),\n",
" ('better', 'claim,', 'because', 'it', 'was', 'grassy', 'and'),\n",
" ('claim,', 'because', 'it', 'was', 'grassy', 'and', 'wanted'),\n",
" ('because', 'it', 'was', 'grassy', 'and', 'wanted', 'wear;'),\n",
" ('it', 'was', 'grassy', 'and', 'wanted', 'wear;', 'though'),\n",
" ('was', 'grassy', 'and', 'wanted', 'wear;', 'though', 'as'),\n",
" ('grassy', 'and', 'wanted', 'wear;', 'though', 'as', 'for'),\n",
" ('and', 'wanted', 'wear;', 'though', 'as', 'for', 'that'),\n",
" ('wanted', 'wear;', 'though', 'as', 'for', 'that', 'the'),\n",
" ('wear;', 'though', 'as', 'for', 'that', 'the', 'passing'),\n",
" ('though', 'as', 'for', 'that', 'the', 'passing', 'there'),\n",
" ('as', 'for', 'that', 'the', 'passing', 'there', 'had'),\n",
" ('for', 'that', 'the', 'passing', 'there', 'had', 'worn'),\n",
" ('that', 'the', 'passing', 'there', 'had', 'worn', 'them'),\n",
" ('the', 'passing', 'there', 'had', 'worn', 'them', 'really'),\n",
" ('passing', 'there', 'had', 'worn', 'them', 'really', 'about'),\n",
" ('there', 'had', 'worn', 'them', 'really', 'about', 'the'),\n",
" ('had', 'worn', 'them', 'really', 'about', 'the', 'same,'),\n",
" ('worn', 'them', 'really', 'about', 'the', 'same,', 'and'),\n",
" ('them', 'really', 'about', 'the', 'same,', 'and', 'both'),\n",
" ('really', 'about', 'the', 'same,', 'and', 'both', 'that'),\n",
" ('about', 'the', 'same,', 'and', 'both', 'that', 'morning'),\n",
" ('the', 'same,', 'and', 'both', 'that', 'morning', 'equally'),\n",
" ('same,', 'and', 'both', 'that', 'morning', 'equally', 'lay'),\n",
" ('and', 'both', 'that', 'morning', 'equally', 'lay', 'in'),\n",
" ('both', 'that', 'morning', 'equally', 'lay', 'in', 'leaves'),\n",
" ('that', 'morning', 'equally', 'lay', 'in', 'leaves', 'no'),\n",
" ('morning', 'equally', 'lay', 'in', 'leaves', 'no', 'step'),\n",
" ('equally', 'lay', 'in', 'leaves', 'no', 'step', 'had'),\n",
" ('lay', 'in', 'leaves', 'no', 'step', 'had', 'trodden'),\n",
" ('in', 'leaves', 'no', 'step', 'had', 'trodden', 'black.'),\n",
" ('leaves', 'no', 'step', 'had', 'trodden', 'black.', 'oh,'),\n",
" ('no', 'step', 'had', 'trodden', 'black.', 'oh,', 'i'),\n",
" ('step', 'had', 'trodden', 'black.', 'oh,', 'i', 'kept'),\n",
" ('had', 'trodden', 'black.', 'oh,', 'i', 'kept', 'the'),\n",
" ('trodden', 'black.', 'oh,', 'i', 'kept', 'the', 'first'),\n",
" ('black.', 'oh,', 'i', 'kept', 'the', 'first', 'for'),\n",
" ('oh,', 'i', 'kept', 'the', 'first', 'for', 'another'),\n",
" ('i', 'kept', 'the', 'first', 'for', 'another', 'day!'),\n",
" ('kept', 'the', 'first', 'for', 'another', 'day!', 'yet'),\n",
" ('the', 'first', 'for', 'another', 'day!', 'yet', 'knowing'),\n",
" ('first', 'for', 'another', 'day!', 'yet', 'knowing', 'how'),\n",
" ('for', 'another', 'day!', 'yet', 'knowing', 'how', 'way'),\n",
" ('another', 'day!', 'yet', 'knowing', 'how', 'way', 'leads'),\n",
" ('day!', 'yet', 'knowing', 'how', 'way', 'leads', 'on'),\n",
" ('yet', 'knowing', 'how', 'way', 'leads', 'on', 'to'),\n",
" ('knowing', 'how', 'way', 'leads', 'on', 'to', 'way,'),\n",
" ('how', 'way', 'leads', 'on', 'to', 'way,', 'i'),\n",
" ('way', 'leads', 'on', 'to', 'way,', 'i', 'doubted'),\n",
" ('leads', 'on', 'to', 'way,', 'i', 'doubted', 'if'),\n",
" ('on', 'to', 'way,', 'i', 'doubted', 'if', 'i'),\n",
" ('to', 'way,', 'i', 'doubted', 'if', 'i', 'should'),\n",
" ('way,', 'i', 'doubted', 'if', 'i', 'should', 'ever'),\n",
" ('i', 'doubted', 'if', 'i', 'should', 'ever', 'come'),\n",
" ('doubted', 'if', 'i', 'should', 'ever', 'come', 'back.'),\n",
" ('if', 'i', 'should', 'ever', 'come', 'back.', 'i'),\n",
" ('i', 'should', 'ever', 'come', 'back.', 'i', 'shall'),\n",
" ('should', 'ever', 'come', 'back.', 'i', 'shall', 'be'),\n",
" ('ever', 'come', 'back.', 'i', 'shall', 'be', 'telling'),\n",
" ('come', 'back.', 'i', 'shall', 'be', 'telling', 'this'),\n",
" ('back.', 'i', 'shall', 'be', 'telling', 'this', 'with'),\n",
" ('i', 'shall', 'be', 'telling', 'this', 'with', 'a'),\n",
" ('shall', 'be', 'telling', 'this', 'with', 'a', 'sigh'),\n",
" ('be', 'telling', 'this', 'with', 'a', 'sigh', 'somewhere'),\n",
" ('telling', 'this', 'with', 'a', 'sigh', 'somewhere', 'ages'),\n",
" ('this', 'with', 'a', 'sigh', 'somewhere', 'ages', 'and'),\n",
" ('with', 'a', 'sigh', 'somewhere', 'ages', 'and', 'ages'),\n",
" ('a', 'sigh', 'somewhere', 'ages', 'and', 'ages', 'hence:'),\n",
" ('sigh', 'somewhere', 'ages', 'and', 'ages', 'hence:', 'two'),\n",
" ('somewhere', 'ages', 'and', 'ages', 'hence:', 'two', 'roads'),\n",
" ('ages', 'and', 'ages', 'hence:', 'two', 'roads', 'diverged'),\n",
" ('and', 'ages', 'hence:', 'two', 'roads', 'diverged', 'in'),\n",
" ('ages', 'hence:', 'two', 'roads', 'diverged', 'in', 'a'),\n",
" ('hence:', 'two', 'roads', 'diverged', 'in', 'a', 'wood,'),\n",
" ('two', 'roads', 'diverged', 'in', 'a', 'wood,', 'and'),\n",
" ('roads', 'diverged', 'in', 'a', 'wood,', 'and', 'i---'),\n",
" ('diverged', 'in', 'a', 'wood,', 'and', 'i---', 'i'),\n",
" ('in', 'a', 'wood,', 'and', 'i---', 'i', 'took'),\n",
" ('a', 'wood,', 'and', 'i---', 'i', 'took', 'the'),\n",
" ('wood,', 'and', 'i---', 'i', 'took', 'the', 'one'),\n",
" ('and', 'i---', 'i', 'took', 'the', 'one', 'less'),\n",
" ('i---', 'i', 'took', 'the', 'one', 'less', 'travelled'),\n",
" ('i', 'took', 'the', 'one', 'less', 'travelled', 'by,'),\n",
" ('took', 'the', 'one', 'less', 'travelled', 'by,', 'and'),\n",
" ('the', 'one', 'less', 'travelled', 'by,', 'and', 'that'),\n",
" ('one', 'less', 'travelled', 'by,', 'and', 'that', 'has'),\n",
" ('less', 'travelled', 'by,', 'and', 'that', 'has', 'made'),\n",
" ('travelled', 'by,', 'and', 'that', 'has', 'made', 'all'),\n",
" ('by,', 'and', 'that', 'has', 'made', 'all', 'the'),\n",
" ('and', 'that', 'has', 'made', 'all', 'the', 'difference.')]"
]
},
"execution_count": 77,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"seven_grams"
]
},
{
"cell_type": "code",
"execution_count": 78,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"stuff = [1, 2, 3]"
]
},
{
"cell_type": "code",
"execution_count": 79,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(1, 2, 3)"
]
},
"execution_count": 79,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"tuple(stuff)"
]
},
{
"cell_type": "code",
"execution_count": 80,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"other_stuff = (4, 5, 6)"
]
},
{
"cell_type": "code",
"execution_count": 81,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[4, 5, 6]"
]
},
"execution_count": 81,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"list(other_stuff)"
]
},
{
"cell_type": "code",
"execution_count": 82,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"def ngrams_for_sequence(n, seq):\n",
"    \"\"\"Return every run of n consecutive items in seq, as a list of tuples.\n",
"\n",
"    n   -- length of each n-gram; must be at least 1\n",
"    seq -- anything that supports len() and slicing (list, tuple, str)\n",
"\n",
"    The n-grams come back in order of appearance. If seq has fewer than\n",
"    n items, the result is an empty list.\n",
"\n",
"    Raises ValueError if n is less than 1.\n",
"    \"\"\"\n",
"    if n < 1:\n",
"        # without this check, n == 0 yields len(seq)+1 empty tuples and a\n",
"        # negative n yields slices of assorted lengths -- neither is an n-gram\n",
"        raise ValueError('n must be at least 1, got %r' % (n,))\n",
"    # the last full window starts at index len(seq) - n\n",
"    return [tuple(seq[i:i+n]) for i in range(len(seq) - n + 1)]"
]
},
{
"cell_type": "code",
"execution_count": 83,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[('c', 'o'),\n",
" ('o', 'n'),\n",
" ('n', 'd'),\n",
" ('d', 'e'),\n",
" ('e', 's'),\n",
" ('s', 'c'),\n",
" ('c', 'e'),\n",
" ('e', 'n'),\n",
" ('n', 'd'),\n",
" ('d', 'e'),\n",
" ('e', 'n'),\n",
" ('n', 'c'),\n",
" ('c', 'e'),\n",
" ('e', 's')]"
]
},
"execution_count": 83,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"ngrams_for_sequence(2, \"condescendences\")"
]
},
{
"cell_type": "code",
"execution_count": 84,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[('T', 'w', 'o', ' ', 'r', 'o', 'a', 'd', 's', ' '),\n",
" ('w', 'o', ' ', 'r', 'o', 'a', 'd', 's', ' ', 'd'),\n",
" ('o', ' ', 'r', 'o', 'a', 'd', 's', ' ', 'd', 'i'),\n",
" (' ', 'r', 'o', 'a', 'd', 's', ' ', 'd', 'i', 'v'),\n",
" ('r', 'o', 'a', 'd', 's', ' ', 'd', 'i', 'v', 'e'),\n",
" ('o', 'a', 'd', 's', ' ', 'd', 'i', 'v', 'e', 'r'),\n",
" ('a', 'd', 's', ' ', 'd', 'i', 'v', 'e', 'r', 'g'),\n",
" ('d', 's', ' ', 'd', 'i', 'v', 'e', 'r', 'g', 'e'),\n",
" ('s', ' ', 'd', 'i', 'v', 'e', 'r', 'g', 'e', 'd'),\n",
" (' ', 'd', 'i', 'v', 'e', 'r', 'g', 'e', 'd', ' '),\n",
" ('d', 'i', 'v', 'e', 'r', 'g', 'e', 'd', ' ', 'i'),\n",
" ('i', 'v', 'e', 'r', 'g', 'e', 'd', ' ', 'i', 'n'),\n",
" ('v', 'e', 'r', 'g', 'e', 'd', ' ', 'i', 'n', ' '),\n",
" ('e', 'r', 'g', 'e', 'd', ' ', 'i', 'n', ' ', 'a'),\n",
" ('r', 'g', 'e', 'd', ' ', 'i', 'n', ' ', 'a', ' '),\n",
" ('g', 'e', 'd', ' ', 'i', 'n', ' ', 'a', ' ', 'y'),\n",
" ('e', 'd', ' ', 'i', 'n', ' ', 'a', ' ', 'y', 'e'),\n",
" ('d', ' ', 'i', 'n', ' ', 'a', ' ', 'y', 'e', 'l'),\n",
" (' ', 'i', 'n', ' ', 'a', ' ', 'y', 'e', 'l', 'l'),\n",
" ('i', 'n', ' ', 'a', ' ', 'y', 'e', 'l', 'l', 'o'),\n",
" ('n', ' ', 'a', ' ', 'y', 'e', 'l', 'l', 'o', 'w'),\n",
" (' ', 'a', ' ', 'y', 'e', 'l', 'l', 'o', 'w', ' '),\n",
" ('a', ' ', 'y', 'e', 'l', 'l', 'o', 'w', ' ', 'w'),\n",
" (' ', 'y', 'e', 'l', 'l', 'o', 'w', ' ', 'w', 'o'),\n",
" ('y', 'e', 'l', 'l', 'o', 'w', ' ', 'w', 'o', 'o'),\n",
" ('e', 'l', 'l', 'o', 'w', ' ', 'w', 'o', 'o', 'd'),\n",
" ('l', 'l', 'o', 'w', ' ', 'w', 'o', 'o', 'd', ','),\n",
" ('l', 'o', 'w', ' ', 'w', 'o', 'o', 'd', ',', '\\n'),\n",
" ('o', 'w', ' ', 'w', 'o', 'o', 'd', ',', '\\n', 'A'),\n",
" ('w', ' ', 'w', 'o', 'o', 'd', ',', '\\n', 'A', 'n'),\n",
" (' ', 'w', 'o', 'o', 'd', ',', '\\n', 'A', 'n', 'd'),\n",
" ('w', 'o', 'o', 'd', ',', '\\n', 'A', 'n', 'd', ' '),\n",
" ('o', 'o', 'd', ',', '\\n', 'A', 'n', 'd', ' ', 's'),\n",
" ('o', 'd', ',', '\\n', 'A', 'n', 'd', ' ', 's', 'o'),\n",
" ('d', ',', '\\n', 'A', 'n', 'd', ' ', 's', 'o', 'r'),\n",
" (',', '\\n', 'A', 'n', 'd', ' ', 's', 'o', 'r', 'r'),\n",
" ('\\n', 'A', 'n', 'd', ' ', 's', 'o', 'r', 'r', 'y'),\n",
" ('A', 'n', 'd', ' ', 's', 'o', 'r', 'r', 'y', ' '),\n",
" ('n', 'd', ' ', 's', 'o', 'r', 'r', 'y', ' ', 'I'),\n",
" ('d', ' ', 's', 'o', 'r', 'r', 'y', ' ', 'I', ' '),\n",
" (' ', 's', 'o', 'r', 'r', 'y', ' ', 'I', ' ', 'c'),\n",
" ('s', 'o', 'r', 'r', 'y', ' ', 'I', ' ', 'c', 'o'),\n",
" ('o', 'r', 'r', 'y', ' ', 'I', ' ', 'c', 'o', 'u'),\n",
" ('r', 'r', 'y', ' ', 'I', ' ', 'c', 'o', 'u', 'l'),\n",
" ('r', 'y', ' ', 'I', ' ', 'c', 'o', 'u', 'l', 'd'),\n",
" ('y', ' ', 'I', ' ', 'c', 'o', 'u', 'l', 'd', ' '),\n",
" (' ', 'I', ' ', 'c', 'o', 'u', 'l', 'd', ' ', 'n'),\n",
" ('I', ' ', 'c', 'o', 'u', 'l', 'd', ' ', 'n', 'o'),\n",
" (' ', 'c', 'o', 'u', 'l', 'd', ' ', 'n', 'o', 't'),\n",
" ('c', 'o', 'u', 'l', 'd', ' ', 'n', 'o', 't', ' '),\n",
" ('o', 'u', 'l', 'd', ' ', 'n', 'o', 't', ' ', 't'),\n",
" ('u', 'l', 'd', ' ', 'n', 'o', 't', ' ', 't', 'r'),\n",
" ('l', 'd', ' ', 'n', 'o', 't', ' ', 't', 'r', 'a'),\n",
" ('d', ' ', 'n', 'o', 't', ' ', 't', 'r', 'a', 'v'),\n",
" (' ', 'n', 'o', 't', ' ', 't', 'r', 'a', 'v', 'e'),\n",
" ('n', 'o', 't', ' ', 't', 'r', 'a', 'v', 'e', 'l'),\n",
" ('o', 't', ' ', 't', 'r', 'a', 'v', 'e', 'l', ' '),\n",
" ('t', ' ', 't', 'r', 'a', 'v', 'e', 'l', ' ', 'b'),\n",
" (' ', 't', 'r', 'a', 'v', 'e', 'l', ' ', 'b', 'o'),\n",
" ('t', 'r', 'a', 'v', 'e', 'l', ' ', 'b', 'o', 't'),\n",
" ('r', 'a', 'v', 'e', 'l', ' ', 'b', 'o', 't', 'h'),\n",
" ('a', 'v', 'e', 'l', ' ', 'b', 'o', 't', 'h', '\\n'),\n",
" ('v', 'e', 'l', ' ', 'b', 'o', 't', 'h', '\\n', 'A'),\n",
" ('e', 'l', ' ', 'b', 'o', 't', 'h', '\\n', 'A', 'n'),\n",
" ('l', ' ', 'b', 'o', 't', 'h', '\\n', 'A', 'n', 'd'),\n",
" (' ', 'b', 'o', 't', 'h', '\\n', 'A', 'n', 'd', ' '),\n",
" ('b', 'o', 't', 'h', '\\n', 'A', 'n', 'd', ' ', 'b'),\n",
" ('o', 't', 'h', '\\n', 'A', 'n', 'd', ' ', 'b', 'e'),\n",
" ('t', 'h', '\\n', 'A', 'n', 'd', ' ', 'b', 'e', ' '),\n",
" ('h', '\\n', 'A', 'n', 'd', ' ', 'b', 'e', ' ', 'o'),\n",