Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Save mattalhonte/80ab7ed302473e8114a8 to your computer and use it in GitHub Desktop.
Save mattalhonte/80ab7ed302473e8114a8 to your computer and use it in GitHub Desktop.
Display the source blob
Display the rendered blob
Raw
{
"metadata": {
"name": "",
"signature": "sha256:32839fddcfb80693665b2ca10367c475856d92f5dd6b8ce6a9e604f30ab1e43f"
},
"nbformat": 3,
"nbformat_minor": 0,
"worksheets": [
{
"cells": [
{
"cell_type": "code",
"collapsed": false,
"input": [
"import os\n",
"\n",
"import nltk\n",
"import pandas as pd\n",
"from nltk.util import ngrams\n",
"\n",
"# Fetch the 'punkt' and 'stopwords' NLTK packages (reported as up-to-date when already present).\n",
"nltk.download('punkt')\n",
"nltk.download('stopwords')\n",
"\n",
"# Machine-specific settings live here so the notebook can be re-pointed in one place.\n",
"DATA_DIR = r\"C:\\Users\\Matt\\Dropbox\\Python Workspace\\CROW\\CROL-PDF\"\n",
"CSV_NAME = \"procPublicationRequest_Oct-Dec_2014_clean - procPublicationRequest_Oct-Dec_2014_clean.csv\"\n",
"\n",
"# Importing the dataset. Work from the data folder: later cells write and read\n",
"# the corpus file using paths relative to the current directory.\n",
"os.chdir(DATA_DIR)\n",
"data = pd.read_csv(CSV_NAME)\n",
"\n",
"# Pull the \"human_readable\" column out as a plain list.\n",
"human_readableList = list(data['human_readable'])\n",
"\n",
"# Turn every value into a string so that .split() below works on each entry.\n",
"# NOTE(review): str() renders a missing value (NaN) as the text 'nan' -- confirm the column has no gaps.\n",
"strReadable = [str(a) for a in human_readableList]\n",
"\n",
"# Split each entry into individual whitespace-separated words.\n",
"listOfLists = [a.split() for a in strReadable]"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"[nltk_data] Downloading package punkt to\n",
"[nltk_data] C:\\Users\\Matt\\AppData\\Roaming\\nltk_data...\n",
"[nltk_data] Package punkt is already up-to-date!\n",
"[nltk_data] Downloading package stopwords to\n",
"[nltk_data] C:\\Users\\Matt\\AppData\\Roaming\\nltk_data...\n",
"[nltk_data] Package stopwords is already up-to-date!\n",
"C:\\Users\\Matt\\Dropbox\\Python Workspace\\CROW\\CROL-PDF\n"
]
}
],
"prompt_number": 48
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# While we're here, build the raw text that gets written out to a text file.\n",
"# Every entry is preceded by a newline. join() assembles the string in one pass\n",
"# instead of re-copying the growing corpus on each loop iteration.\n",
"myCorpus = ''.join(\"\\n\" + myEntry for myEntry in strReadable)"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 49
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# Save the corpus under the name the following cell opens ('rawCorpus.txt');\n",
"# the with-block closes the file even if the write fails.\n",
"with open('rawCorpus.txt', 'w') as f:\n",
"    f.write(myCorpus)"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 50
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# Now we have a data file that'll probably be a little faster to mess with (maybe?)\n",
"# Read it back as one string; the with-block closes the handle once the text is in memory.\n",
"with open('rawCorpus.txt') as file:\n",
"    t = file.read()"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 51
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# Tokenize the raw text; the token list is wrapped in an NLTK Text object afterwards.\n",
"myCorpusTokenized = nltk.tokenize.word_tokenize(t)"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 52
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# Wrap the token list in an NLTK Text so its analysis helpers (e.g. collocations) are available.\n",
"corpusText = nltk.text.Text(myCorpusTokenized)"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 53
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# With a bigger body of text we can look at more interesting patterns in phrasing.\n",
"# The arguments are the library defaults, spelled out: 20 collocations, window of 2 tokens.\n",
"corpusText.collocations(num=20, window_size=2)"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"New York; substantially similar; similar titles; titles within; HEREBY\n",
"GIVEN; within agency; York City; sidewalk caf; 10:00 A.M.; proposed\n",
"contract; Annual Contracting; Contracting Plan; agency intends; 2015\n",
"Annual; public hearing; square foot; unenclosed sidewalk; COMMUNITY\n",
"BOARD; four years; End date\n"
]
}
],
"prompt_number": 54
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# Count how many times each token occurs in the (case-sensitive) corpus.\n",
"corpusFreqDist = nltk.probability.FreqDist(corpusText)"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 55
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# Most common words! (most_common already returns a list of (token, count) pairs.)\n",
"corpusFreqDist.most_common(50)"
],
"language": "python",
"metadata": {},
"outputs": [
{
"metadata": {},
"output_type": "pyout",
"prompt_number": 36,
"text": [
"[(',', 8033),\n",
" ('the', 4860),\n",
" ('of', 4570),\n",
" ('.', 2727),\n",
" ('and', 2338),\n",
" ('to', 2179),\n",
" (')', 1806),\n",
" ('(', 1747),\n",
" (':', 1714),\n",
" ('in', 1520),\n",
" ('a', 1259),\n",
" ('at', 1207),\n",
" ('for', 1125),\n",
" ('New', 1107),\n",
" ('York', 988),\n",
" ('Street', 910),\n",
" ('on', 875),\n",
" ('be', 674),\n",
" ('City', 665),\n",
" ('The', 641),\n",
" ('proposed', 561),\n",
" ('by', 519),\n",
" ('an', 513),\n",
" ('is', 501),\n",
" ('contract', 498),\n",
" ('Manhattan', 470),\n",
" ('will', 461),\n",
" ('agency', 397),\n",
" ('that', 396),\n",
" ('$', 382),\n",
" ('2014', 379),\n",
" ('from', 369),\n",
" ('Borough', 363),\n",
" ('Floor', 360),\n",
" ('within', 354),\n",
" ('Avenue', 332),\n",
" ('NY', 329),\n",
" ('date', 321),\n",
" ('1', 321),\n",
" (\"'s\", 311),\n",
" ('Board', 292),\n",
" ('public', 285),\n",
" ('similar', 280),\n",
" ('or', 279),\n",
" ('Services', 278),\n",
" ('Department', 274),\n",
" ('titles', 272),\n",
" ('substantially', 272),\n",
" ('as', 264),\n",
" ('with', 252)]"
]
}
],
"prompt_number": 36
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# Clean-up step: lower-case every token before counting, so that\n",
"# \"Public Hearing\" and \"PUBLIC HEARING\" are tallied as the same phrase.\n",
"lowerTokens = [token.lower() for token in myCorpusTokenized]\n",
"lowerText = nltk.Text(lowerTokens)"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 56
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# Already saved some doubling-up! Note the 5649 mentions of \"the\", instead of 4860 like in the last list\n",
"lowerFreqDist = nltk.FreqDist(lowerText)\n",
"lowerFreqDist.most_common(50)"
],
"language": "python",
"metadata": {},
"outputs": [
{
"metadata": {},
"output_type": "pyout",
"prompt_number": 57,
"text": [
"[(',', 8033),\n",
" ('the', 5649),\n",
" ('of', 4822),\n",
" ('.', 2727),\n",
" ('to', 2409),\n",
" ('and', 2392),\n",
" (')', 1806),\n",
" ('(', 1747),\n",
" (':', 1714),\n",
" ('in', 1686),\n",
" ('a', 1419),\n",
" ('for', 1261),\n",
" ('at', 1215),\n",
" ('new', 1191),\n",
" ('york', 1003),\n",
" ('street', 988),\n",
" ('on', 888),\n",
" ('is', 704),\n",
" ('city', 698),\n",
" ('be', 674),\n",
" ('agency', 647),\n",
" ('contract', 623),\n",
" ('proposed', 583),\n",
" ('an', 533),\n",
" ('public', 532),\n",
" ('by', 522),\n",
" ('services', 510),\n",
" ('manhattan', 479),\n",
" ('floor', 469),\n",
" ('will', 461),\n",
" ('that', 425),\n",
" ('borough', 414),\n",
" ('notice', 401),\n",
" ('board', 396),\n",
" ('hearing', 385),\n",
" ('$', 382),\n",
" ('from', 379),\n",
" ('2014', 379),\n",
" ('date', 363),\n",
" ('within', 356),\n",
" ('avenue', 336),\n",
" ('ny', 329),\n",
" ('district', 325),\n",
" (\"'s\", 324),\n",
" ('1', 321),\n",
" ('application', 315),\n",
" ('community', 307),\n",
" ('a.m.', 291),\n",
" ('personnel', 290),\n",
" ('department', 285)]"
]
}
],
"prompt_number": 57
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# Bigrams: every pair of adjacent lower-cased tokens, kept as a list for reuse.\n",
"corpusBigrams = list(ngrams(lowerTokens, n=2))"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 58
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# Tally the bigrams, then display the 50 most frequent with their counts.\n",
"corpusBigramFreqs = nltk.FreqDist(corpusBigrams)\n",
"\n",
"corpusBigramFreqs.most_common(50)"
],
"language": "python",
"metadata": {},
"outputs": [
{
"metadata": {},
"output_type": "pyout",
"prompt_number": 59,
"text": [
"[(('of', 'the'), 1267),\n",
" (('new', 'york'), 999),\n",
" (('in', 'the'), 666),\n",
" (('street', ','), 633),\n",
" (('.', 'the'), 491),\n",
" ((',', 'new'), 460),\n",
" (('the', 'proposed'), 454),\n",
" (('agency', ':'), 409),\n",
" ((',', 'and'), 392),\n",
" (('borough', 'of'), 378),\n",
" (('york', ','), 372),\n",
" ((',', '2014'), 363),\n",
" (('contract', ':'), 343),\n",
" (('for', 'the'), 335),\n",
" (('to', 'the'), 334),\n",
" (('york', 'city'), 321),\n",
" (('date', 'of'), 317),\n",
" (('on', 'the'), 316),\n",
" ((',', 'ny'), 314),\n",
" (('will', 'be'), 308),\n",
" (('public', 'hearing'), 280),\n",
" (('substantially', 'similar'), 276),\n",
" (('in', 'substantially'), 272),\n",
" (('personnel', 'in'), 272),\n",
" (('similar', 'titles'), 272),\n",
" (('titles', 'within'), 269),\n",
" (('department', 'of'), 266),\n",
" (('floor', ','), 264),\n",
" (('within', 'agency'), 264),\n",
" (('pursuant', 'to'), 259),\n",
" (('at', 'the'), 252),\n",
" (('for', 'a'), 241),\n",
" (('proposed', 'contract'), 228),\n",
" (('of', 'manhattan'), 226),\n",
" (('the', 'new'), 221),\n",
" (('notice', 'is'), 217),\n",
" (('the', 'following'), 214),\n",
" (('of', 'a'), 213),\n",
" (('is', 'hereby'), 213),\n",
" (('hereby', 'given'), 212),\n",
" (('--', '--'), 210),\n",
" (('manhattan', ','), 208),\n",
" ((',', 'manhattan'), 207),\n",
" ((',', 'at'), 204),\n",
" (('the', 'borough'), 202),\n",
" (('the', 'agency'), 201),\n",
" (('(', 's'), 196),\n",
" (('s', ')'), 196),\n",
" (('office', 'of'), 189),\n",
" (('of', 'services'), 188)]"
]
}
],
"prompt_number": 59
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# Trigrams: every run of three consecutive lower-cased tokens.\n",
"corpusTrigrams = list(ngrams(lowerTokens, n=3))\n",
"\n",
"# Tally them and display the 50 most frequent.\n",
"corpusTrigramFreqs = nltk.FreqDist(corpusTrigrams)\n",
"corpusTrigramFreqs.most_common(50)"
],
"language": "python",
"metadata": {},
"outputs": [
{
"metadata": {},
"output_type": "pyout",
"prompt_number": 38,
"text": [
"[((',', 'new', 'york'), 457),\n",
" (('new', 'york', ','), 372),\n",
" (('new', 'york', 'city'), 321),\n",
" (('of', 'the', 'proposed'), 307),\n",
" (('date', 'of', 'the'), 274),\n",
" (('in', 'substantially', 'similar'), 272),\n",
" (('personnel', 'in', 'substantially'), 272),\n",
" (('substantially', 'similar', 'titles'), 272),\n",
" (('similar', 'titles', 'within'), 269),\n",
" (('within', 'agency', ':'), 264),\n",
" (('titles', 'within', 'agency'), 264),\n",
" (('borough', 'of', 'manhattan'), 223),\n",
" (('notice', 'is', 'hereby'), 213),\n",
" (('is', 'hereby', 'given'), 212),\n",
" (('the', 'new', 'york'), 206),\n",
" (('--', '--', '--'), 204),\n",
" (('the', 'proposed', 'contract'), 201),\n",
" (('(', 's', ')'), 196),\n",
" (('proposed', 'contract', ':'), 189),\n",
" (('in', 'the', 'borough'), 187),\n",
" (('the', 'borough', 'of'), 187),\n",
" (('hereby', 'given', 'that'), 185),\n",
" (('york', ',', 'ny'), 173),\n",
" (('agency', 'intends', 'to'), 169),\n",
" (('the', 'agency', 'intends'), 166),\n",
" (('of', 'new', 'york'), 165),\n",
" (('end', 'date', 'of'), 145),\n",
" ((',', 'borough', 'of'), 144),\n",
" (('to', 'utilize', ':'), 137),\n",
" (('intends', 'to', 'utilize'), 137),\n",
" (('a', 'term', 'of'), 136),\n",
" (('headcount', 'of', 'personnel'), 136),\n",
" (('of', 'personnel', 'in'), 136),\n",
" (('for', 'a', 'term'), 136),\n",
" (('start', 'date', 'of'), 133),\n",
" (('not', 'included', 'in'), 128),\n",
" (('annual', 'contracting', 'plan'), 126),\n",
" (('contracting', 'plan', 'and'), 126),\n",
" (('s', ')', 'not'), 126),\n",
" (('plan', 'and', 'schedule'), 126),\n",
" ((')', 'not', 'included'), 126),\n",
" (('and', 'operate', 'an'), 125),\n",
" ((',', 'and', 'operate'), 124),\n",
" (('caf', 'for', 'a'), 124),\n",
" (('fy', '2015', 'annual'), 124),\n",
" (('2015', 'annual', 'contracting'), 124),\n",
" (('maintain', ',', 'and'), 124),\n",
" (('sidewalk', 'caf', 'for'), 124),\n",
" (('years', '.', ')'), 123),\n",
" (('unenclosed', 'sidewalk', 'caf'), 120)]"
]
}
],
"prompt_number": 38
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# We can keep going: 4-grams, i.e. every run of four consecutive lower-cased tokens.\n",
"corpus4grams = list(ngrams(lowerTokens, n=4))\n",
"\n",
"# Tally them and display the 50 most frequent.\n",
"corpus4gramFreqs = nltk.FreqDist(corpus4grams)\n",
"corpus4gramFreqs.most_common(50)"
],
"language": "python",
"metadata": {},
"outputs": [
{
"metadata": {},
"output_type": "pyout",
"prompt_number": 39,
"text": [
"[(('personnel', 'in', 'substantially', 'similar'), 272),\n",
" (('in', 'substantially', 'similar', 'titles'), 272),\n",
" (('substantially', 'similar', 'titles', 'within'), 269),\n",
" ((',', 'new', 'york', ','), 269),\n",
" (('titles', 'within', 'agency', ':'), 264),\n",
" (('similar', 'titles', 'within', 'agency'), 264),\n",
" (('date', 'of', 'the', 'proposed'), 264),\n",
" (('notice', 'is', 'hereby', 'given'), 212),\n",
" (('the', 'new', 'york', 'city'), 199),\n",
" (('of', 'the', 'proposed', 'contract'), 199),\n",
" (('--', '--', '--', '--'), 198),\n",
" (('the', 'proposed', 'contract', ':'), 186),\n",
" (('is', 'hereby', 'given', 'that'), 185),\n",
" (('in', 'the', 'borough', 'of'), 183),\n",
" (('new', 'york', ',', 'ny'), 173),\n",
" (('the', 'agency', 'intends', 'to'), 166),\n",
" (('agency', 'intends', 'to', 'utilize'), 137),\n",
" (('intends', 'to', 'utilize', ':'), 137),\n",
" (('for', 'a', 'term', 'of'), 136),\n",
" (('headcount', 'of', 'personnel', 'in'), 136),\n",
" (('of', 'personnel', 'in', 'substantially'), 136),\n",
" (('start', 'date', 'of', 'the'), 131),\n",
" (('end', 'date', 'of', 'the'), 131),\n",
" (('s', ')', 'not', 'included'), 126),\n",
" (('(', 's', ')', 'not'), 126),\n",
" (('annual', 'contracting', 'plan', 'and'), 126),\n",
" ((')', 'not', 'included', 'in'), 126),\n",
" (('contracting', 'plan', 'and', 'schedule'), 126),\n",
" (('sidewalk', 'caf', 'for', 'a'), 124),\n",
" ((',', 'and', 'operate', 'an'), 124),\n",
" (('fy', '2015', 'annual', 'contracting'), 124),\n",
" (('caf', 'for', 'a', 'term'), 124),\n",
" (('maintain', ',', 'and', 'operate'), 124),\n",
" (('2015', 'annual', 'contracting', 'plan'), 124),\n",
" (('the', 'borough', 'of', 'manhattan'), 123),\n",
" (('unenclosed', 'sidewalk', 'caf', 'for'), 120),\n",
" (('new', 'york', 'city', 'charter'), 116),\n",
" (('hereby', 'given', 'that', 'the'), 116),\n",
" (('operate', 'an', 'unenclosed', 'sidewalk'), 112),\n",
" (('an', 'unenclosed', 'sidewalk', 'caf'), 112),\n",
" (('and', 'operate', 'an', 'unenclosed'), 112),\n",
" (('22', 'reade', 'street', ','), 109),\n",
" (('city', 'of', 'new', 'york'), 108),\n",
" (('in', 'the', 'matter', 'of'), 106),\n",
" (('the', 'city', 'of', 'new'), 105),\n",
" (('of', 'services', 'sought', ':'), 104),\n",
" (('solicitation', 'the', 'agency', 'intends'), 103),\n",
" (('method', 'of', 'solicitation', 'the'), 103),\n",
" (('of', 'solicitation', 'the', 'agency'), 103),\n",
" (('a', 'term', 'of', 'four'), 102)]"
]
}
],
"prompt_number": 39
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# 5-grams: every run of five consecutive lower-cased tokens.\n",
"corpus5grams = list(ngrams(lowerTokens, n=5))\n",
"\n",
"# Tally them and display the 50 most frequent.\n",
"corpus5gramFreqs = nltk.FreqDist(corpus5grams)\n",
"corpus5gramFreqs.most_common(50)"
],
"language": "python",
"metadata": {},
"outputs": [
{
"metadata": {},
"output_type": "pyout",
"prompt_number": 40,
"text": [
"[(('personnel', 'in', 'substantially', 'similar', 'titles'), 272),\n",
" (('in', 'substantially', 'similar', 'titles', 'within'), 269),\n",
" (('similar', 'titles', 'within', 'agency', ':'), 264),\n",
" (('substantially', 'similar', 'titles', 'within', 'agency'), 264),\n",
" (('--', '--', '--', '--', '--'), 192),\n",
" (('of', 'the', 'proposed', 'contract', ':'), 186),\n",
" (('date', 'of', 'the', 'proposed', 'contract'), 186),\n",
" (('notice', 'is', 'hereby', 'given', 'that'), 185),\n",
" ((',', 'new', 'york', ',', 'ny'), 152),\n",
" (('agency', 'intends', 'to', 'utilize', ':'), 137),\n",
" (('headcount', 'of', 'personnel', 'in', 'substantially'), 136),\n",
" (('of', 'personnel', 'in', 'substantially', 'similar'), 136),\n",
" (('the', 'agency', 'intends', 'to', 'utilize'), 134),\n",
" (('start', 'date', 'of', 'the', 'proposed'), 131),\n",
" (('end', 'date', 'of', 'the', 'proposed'), 131),\n",
" (('(', 's', ')', 'not', 'included'), 126),\n",
" (('annual', 'contracting', 'plan', 'and', 'schedule'), 126),\n",
" (('s', ')', 'not', 'included', 'in'), 126),\n",
" (('maintain', ',', 'and', 'operate', 'an'), 124),\n",
" (('caf', 'for', 'a', 'term', 'of'), 124),\n",
" (('2015', 'annual', 'contracting', 'plan', 'and'), 124),\n",
" (('fy', '2015', 'annual', 'contracting', 'plan'), 124),\n",
" (('sidewalk', 'caf', 'for', 'a', 'term'), 124),\n",
" (('in', 'the', 'borough', 'of', 'manhattan'), 121),\n",
" (('unenclosed', 'sidewalk', 'caf', 'for', 'a'), 120),\n",
" (('is', 'hereby', 'given', 'that', 'the'), 116),\n",
" (('an', 'unenclosed', 'sidewalk', 'caf', 'for'), 112),\n",
" (('operate', 'an', 'unenclosed', 'sidewalk', 'caf'), 112),\n",
" ((',', 'and', 'operate', 'an', 'unenclosed'), 112),\n",
" (('and', 'operate', 'an', 'unenclosed', 'sidewalk'), 112),\n",
" (('the', 'city', 'of', 'new', 'york'), 104),\n",
" (('method', 'of', 'solicitation', 'the', 'agency'), 103),\n",
" (('of', 'solicitation', 'the', 'agency', 'intends'), 103),\n",
" (('solicitation', 'the', 'agency', 'intends', 'to'), 103),\n",
" (('a', 'term', 'of', 'four', 'years'), 102),\n",
" (('for', 'a', 'term', 'of', 'four'), 102),\n",
" (('of', 'four', 'years', '.', ')'), 101),\n",
" (('term', 'of', 'four', 'years', '.'), 101),\n",
" (('the', 'borough', 'of', 'manhattan', '('), 100),\n",
" ((',', '22', 'reade', 'street', ','), 100),\n",
" (('borough', 'of', 'manhattan', '(', 'to'), 99),\n",
" (('to', 'maintain', ',', 'and', 'operate'), 98),\n",
" (('floor', ',', 'new', 'york', ','), 96),\n",
" (('solicitation', '(', 's', ')', 'not'), 94),\n",
" (('titles', 'within', 'agency', ':', 'none'), 93),\n",
" (('none', 'headcount', 'of', 'personnel', 'in'), 92),\n",
" (('agency', ':', 'none', 'headcount', 'of'), 92),\n",
" ((':', 'none', 'headcount', 'of', 'personnel'), 92),\n",
" (('within', 'agency', ':', 'none', 'headcount'), 91),\n",
" (('continue', 'to', 'maintain', ',', 'and'), 89)]"
]
}
],
"prompt_number": 40
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# 6-grams: every run of six consecutive lower-cased tokens.\n",
"corpus6grams = list(ngrams(lowerTokens, n=6))\n",
"\n",
"# Tally them and display the 50 most frequent.\n",
"corpus6gramFreqs = nltk.FreqDist(corpus6grams)\n",
"corpus6gramFreqs.most_common(50)"
],
"language": "python",
"metadata": {},
"outputs": [
{
"metadata": {},
"output_type": "pyout",
"prompt_number": 41,
"text": [
"[(('personnel', 'in', 'substantially', 'similar', 'titles', 'within'), 269),\n",
" (('substantially', 'similar', 'titles', 'within', 'agency', ':'), 264),\n",
" (('in', 'substantially', 'similar', 'titles', 'within', 'agency'), 264),\n",
" (('--', '--', '--', '--', '--', '--'), 186),\n",
" (('date', 'of', 'the', 'proposed', 'contract', ':'), 186),\n",
" (('headcount', 'of', 'personnel', 'in', 'substantially', 'similar'), 136),\n",
" (('of', 'personnel', 'in', 'substantially', 'similar', 'titles'), 136),\n",
" (('the', 'agency', 'intends', 'to', 'utilize', ':'), 134),\n",
" (('(', 's', ')', 'not', 'included', 'in'), 126),\n",
" (('2015', 'annual', 'contracting', 'plan', 'and', 'schedule'), 124),\n",
" (('sidewalk', 'caf', 'for', 'a', 'term', 'of'), 124),\n",
" (('fy', '2015', 'annual', 'contracting', 'plan', 'and'), 124),\n",
" (('unenclosed', 'sidewalk', 'caf', 'for', 'a', 'term'), 120),\n",
" (('notice', 'is', 'hereby', 'given', 'that', 'the'), 116),\n",
" (('an', 'unenclosed', 'sidewalk', 'caf', 'for', 'a'), 112),\n",
" (('operate', 'an', 'unenclosed', 'sidewalk', 'caf', 'for'), 112),\n",
" (('and', 'operate', 'an', 'unenclosed', 'sidewalk', 'caf'), 112),\n",
" (('maintain', ',', 'and', 'operate', 'an', 'unenclosed'), 112),\n",
" ((',', 'and', 'operate', 'an', 'unenclosed', 'sidewalk'), 112),\n",
" (('method', 'of', 'solicitation', 'the', 'agency', 'intends'), 103),\n",
" (('of', 'solicitation', 'the', 'agency', 'intends', 'to'), 103),\n",
" (('solicitation', 'the', 'agency', 'intends', 'to', 'utilize'), 103),\n",
" (('caf', 'for', 'a', 'term', 'of', 'four'), 102),\n",
" (('for', 'a', 'term', 'of', 'four', 'years'), 102),\n",
" (('term', 'of', 'four', 'years', '.', ')'), 101),\n",
" (('a', 'term', 'of', 'four', 'years', '.'), 101),\n",
" (('in', 'the', 'borough', 'of', 'manhattan', '('), 100),\n",
" (('the', 'borough', 'of', 'manhattan', '(', 'to'), 99),\n",
" (('to', 'maintain', ',', 'and', 'operate', 'an'), 98),\n",
" (('end', 'date', 'of', 'the', 'proposed', 'contract'), 96),\n",
" (('solicitation', '(', 's', ')', 'not', 'included'), 94),\n",
" (('similar', 'titles', 'within', 'agency', ':', 'none'), 93),\n",
" (('agency', ':', 'none', 'headcount', 'of', 'personnel'), 92),\n",
" (('none', 'headcount', 'of', 'personnel', 'in', 'substantially'), 92),\n",
" ((':', 'none', 'headcount', 'of', 'personnel', 'in'), 92),\n",
" (('titles', 'within', 'agency', ':', 'none', 'headcount'), 91),\n",
" (('within', 'agency', ':', 'none', 'headcount', 'of'), 91),\n",
" (('start', 'date', 'of', 'the', 'proposed', 'contract'), 90),\n",
" (('continue', 'to', 'maintain', ',', 'and', 'operate'), 89),\n",
" (('borough', 'of', 'manhattan', '(', 'to', 'continue'), 87),\n",
" (('of', 'manhattan', '(', 'to', 'continue', 'to'), 86),\n",
" (('(', 'to', 'continue', 'to', 'maintain', ','), 86),\n",
" (('to', 'continue', 'to', 'maintain', ',', 'and'), 86),\n",
" (('similar', 'titles', 'within', 'agency', ':', '0'), 83),\n",
" (('spector', 'hall', ',', '22', 'reade', 'street'), 82),\n",
" (('hall', ',', '22', 'reade', 'street', ','), 82),\n",
" (('in', 'spector', 'hall', ',', '22', 'reade'), 81),\n",
" (('manhattan', '(', 'to', 'continue', 'to', 'maintain'), 74),\n",
" (('floor', ',', 'new', 'york', ',', 'ny'), 72),\n",
" ((',', 'new', 'york', ',', 'ny', '10007'), 66)]"
]
}
],
"prompt_number": 41
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# 7-grams: every run of seven consecutive lower-cased tokens.\n",
"corpus7grams = list(ngrams(lowerTokens, n=7))\n",
"\n",
"# Tally them and display the 50 most frequent.\n",
"corpus7gramFreqs = nltk.FreqDist(corpus7grams)\n",
"corpus7gramFreqs.most_common(50)"
],
"language": "python",
"metadata": {},
"outputs": [
{
"metadata": {},
"output_type": "pyout",
"prompt_number": 42,
"text": [
"[(('personnel',\n",
" 'in',\n",
" 'substantially',\n",
" 'similar',\n",
" 'titles',\n",
" 'within',\n",
" 'agency'),\n",
" 264),\n",
" (('in', 'substantially', 'similar', 'titles', 'within', 'agency', ':'), 264),\n",
" (('--', '--', '--', '--', '--', '--', '--'), 180),\n",
" (('headcount', 'of', 'personnel', 'in', 'substantially', 'similar', 'titles'),\n",
" 136),\n",
" (('of', 'personnel', 'in', 'substantially', 'similar', 'titles', 'within'),\n",
" 133),\n",
" (('fy', '2015', 'annual', 'contracting', 'plan', 'and', 'schedule'), 124),\n",
" (('unenclosed', 'sidewalk', 'caf', 'for', 'a', 'term', 'of'), 120),\n",
" (('and', 'operate', 'an', 'unenclosed', 'sidewalk', 'caf', 'for'), 112),\n",
" (('an', 'unenclosed', 'sidewalk', 'caf', 'for', 'a', 'term'), 112),\n",
" (('operate', 'an', 'unenclosed', 'sidewalk', 'caf', 'for', 'a'), 112),\n",
" ((',', 'and', 'operate', 'an', 'unenclosed', 'sidewalk', 'caf'), 112),\n",
" (('maintain', ',', 'and', 'operate', 'an', 'unenclosed', 'sidewalk'), 112),\n",
" (('of', 'solicitation', 'the', 'agency', 'intends', 'to', 'utilize'), 103),\n",
" (('solicitation', 'the', 'agency', 'intends', 'to', 'utilize', ':'), 103),\n",
" (('method', 'of', 'solicitation', 'the', 'agency', 'intends', 'to'), 103),\n",
" (('sidewalk', 'caf', 'for', 'a', 'term', 'of', 'four'), 102),\n",
" (('caf', 'for', 'a', 'term', 'of', 'four', 'years'), 102),\n",
" (('a', 'term', 'of', 'four', 'years', '.', ')'), 101),\n",
" (('for', 'a', 'term', 'of', 'four', 'years', '.'), 101),\n",
" (('in', 'the', 'borough', 'of', 'manhattan', '(', 'to'), 99),\n",
" (('end', 'date', 'of', 'the', 'proposed', 'contract', ':'), 96),\n",
" (('solicitation', '(', 's', ')', 'not', 'included', 'in'), 94),\n",
" (('substantially', 'similar', 'titles', 'within', 'agency', ':', 'none'), 93),\n",
" (('none', 'headcount', 'of', 'personnel', 'in', 'substantially', 'similar'),\n",
" 92),\n",
" (('agency', ':', 'none', 'headcount', 'of', 'personnel', 'in'), 92),\n",
" ((':', 'none', 'headcount', 'of', 'personnel', 'in', 'substantially'), 92),\n",
" (('to', 'maintain', ',', 'and', 'operate', 'an', 'unenclosed'), 91),\n",
" (('titles', 'within', 'agency', ':', 'none', 'headcount', 'of'), 91),\n",
" (('similar', 'titles', 'within', 'agency', ':', 'none', 'headcount'), 91),\n",
" (('within', 'agency', ':', 'none', 'headcount', 'of', 'personnel'), 91),\n",
" (('start', 'date', 'of', 'the', 'proposed', 'contract', ':'), 90),\n",
" (('continue', 'to', 'maintain', ',', 'and', 'operate', 'an'), 89),\n",
" (('the', 'borough', 'of', 'manhattan', '(', 'to', 'continue'), 87),\n",
" (('to', 'continue', 'to', 'maintain', ',', 'and', 'operate'), 86),\n",
" (('(', 'to', 'continue', 'to', 'maintain', ',', 'and'), 86),\n",
" (('borough', 'of', 'manhattan', '(', 'to', 'continue', 'to'), 86),\n",
" (('substantially', 'similar', 'titles', 'within', 'agency', ':', '0'), 83),\n",
" (('spector', 'hall', ',', '22', 'reade', 'street', ','), 82),\n",
" (('in', 'spector', 'hall', ',', '22', 'reade', 'street'), 81),\n",
" (('manhattan', '(', 'to', 'continue', 'to', 'maintain', ','), 74),\n",
" (('of', 'manhattan', '(', 'to', 'continue', 'to', 'maintain'), 74),\n",
" (('notice', 'is', 'hereby', 'given', 'that', 'the', 'mayor'), 65),\n",
" (('s', ')', 'not', 'included', 'in', 'the', 'fy'), 63),\n",
" (('hereby', 'given', 'that', 'the', 'mayor', 'will', 'be'), 63),\n",
" (('schedule', 'that', 'is', 'published', 'pursuant', 'to', 'new'), 63),\n",
" (('contracting', 'plan', 'and', 'schedule', 'notice', 'is', 'hereby'), 63),\n",
" (('plan', 'and', 'schedule', 'notice', 'is', 'hereby', 'given'), 63),\n",
" (('annual', 'contracting', 'plan', 'and', 'schedule', 'notice', 'is'), 63),\n",
" (('to', 'new', 'york', 'city', 'charter', '312', '('), 63),\n",
" (('annual', 'contracting', 'plan', 'and', 'schedule', 'that', 'is'), 63)]"
]
}
],
"prompt_number": 42
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# 8-grams: every run of eight consecutive lower-cased tokens.\n",
"corpus8grams = list(ngrams(lowerTokens, n=8))\n",
"\n",
"# Tally them and display the 50 most frequent.\n",
"corpus8gramFreqs = nltk.FreqDist(corpus8grams)\n",
"corpus8gramFreqs.most_common(50)"
],
"language": "python",
"metadata": {},
"outputs": [
{
"metadata": {},
"output_type": "pyout",
"prompt_number": 43,
"text": [
"[(('personnel',\n",
" 'in',\n",
" 'substantially',\n",
" 'similar',\n",
" 'titles',\n",
" 'within',\n",
" 'agency',\n",
" ':'),\n",
" 264),\n",
" (('--', '--', '--', '--', '--', '--', '--', '--'), 174),\n",
" (('headcount',\n",
" 'of',\n",
" 'personnel',\n",
" 'in',\n",
" 'substantially',\n",
" 'similar',\n",
" 'titles',\n",
" 'within'),\n",
" 133),\n",
" (('of',\n",
" 'personnel',\n",
" 'in',\n",
" 'substantially',\n",
" 'similar',\n",
" 'titles',\n",
" 'within',\n",
" 'agency'),\n",
" 129),\n",
" (('and', 'operate', 'an', 'unenclosed', 'sidewalk', 'caf', 'for', 'a'), 112),\n",
" ((',', 'and', 'operate', 'an', 'unenclosed', 'sidewalk', 'caf', 'for'), 112),\n",
" (('maintain', ',', 'and', 'operate', 'an', 'unenclosed', 'sidewalk', 'caf'),\n",
" 112),\n",
" (('an', 'unenclosed', 'sidewalk', 'caf', 'for', 'a', 'term', 'of'), 112),\n",
" (('operate', 'an', 'unenclosed', 'sidewalk', 'caf', 'for', 'a', 'term'), 112),\n",
" (('of', 'solicitation', 'the', 'agency', 'intends', 'to', 'utilize', ':'),\n",
" 103),\n",
" (('method',\n",
" 'of',\n",
" 'solicitation',\n",
" 'the',\n",
" 'agency',\n",
" 'intends',\n",
" 'to',\n",
" 'utilize'),\n",
" 103),\n",
" (('sidewalk', 'caf', 'for', 'a', 'term', 'of', 'four', 'years'), 102),\n",
" (('caf', 'for', 'a', 'term', 'of', 'four', 'years', '.'), 101),\n",
" (('unenclosed', 'sidewalk', 'caf', 'for', 'a', 'term', 'of', 'four'), 101),\n",
" (('for', 'a', 'term', 'of', 'four', 'years', '.', ')'), 101),\n",
" (('in',\n",
" 'substantially',\n",
" 'similar',\n",
" 'titles',\n",
" 'within',\n",
" 'agency',\n",
" ':',\n",
" 'none'),\n",
" 93),\n",
" (('agency',\n",
" ':',\n",
" 'none',\n",
" 'headcount',\n",
" 'of',\n",
" 'personnel',\n",
" 'in',\n",
" 'substantially'),\n",
" 92),\n",
" (('none',\n",
" 'headcount',\n",
" 'of',\n",
" 'personnel',\n",
" 'in',\n",
" 'substantially',\n",
" 'similar',\n",
" 'titles'),\n",
" 92),\n",
" ((':',\n",
" 'none',\n",
" 'headcount',\n",
" 'of',\n",
" 'personnel',\n",
" 'in',\n",
" 'substantially',\n",
" 'similar'),\n",
" 92),\n",
" (('titles', 'within', 'agency', ':', 'none', 'headcount', 'of', 'personnel'),\n",
" 91),\n",
" (('to', 'maintain', ',', 'and', 'operate', 'an', 'unenclosed', 'sidewalk'),\n",
" 91),\n",
" (('similar', 'titles', 'within', 'agency', ':', 'none', 'headcount', 'of'),\n",
" 91),\n",
" (('substantially',\n",
" 'similar',\n",
" 'titles',\n",
" 'within',\n",
" 'agency',\n",
" ':',\n",
" 'none',\n",
" 'headcount'),\n",
" 91),\n",
" (('within', 'agency', ':', 'none', 'headcount', 'of', 'personnel', 'in'), 91),\n",
" (('in', 'the', 'borough', 'of', 'manhattan', '(', 'to', 'continue'), 87),\n",
" (('the', 'borough', 'of', 'manhattan', '(', 'to', 'continue', 'to'), 86),\n",
" (('(', 'to', 'continue', 'to', 'maintain', ',', 'and', 'operate'), 86),\n",
" (('to', 'continue', 'to', 'maintain', ',', 'and', 'operate', 'an'), 86),\n",
" (('in', 'substantially', 'similar', 'titles', 'within', 'agency', ':', '0'),\n",
" 83),\n",
" (('continue', 'to', 'maintain', ',', 'and', 'operate', 'an', 'unenclosed'),\n",
" 82),\n",
" (('in', 'spector', 'hall', ',', '22', 'reade', 'street', ','), 81),\n",
" (('borough', 'of', 'manhattan', '(', 'to', 'continue', 'to', 'maintain'), 74),\n",
" (('manhattan', '(', 'to', 'continue', 'to', 'maintain', ',', 'and'), 74),\n",
" (('of', 'manhattan', '(', 'to', 'continue', 'to', 'maintain', ','), 74),\n",
" (('notice', 'is', 'hereby', 'given', 'that', 'the', 'mayor', 'will'), 63),\n",
" (('contracting',\n",
" 'plan',\n",
" 'and',\n",
" 'schedule',\n",
" 'that',\n",
" 'is',\n",
" 'published',\n",
" 'pursuant'),\n",
" 63),\n",
" (('annual',\n",
" 'contracting',\n",
" 'plan',\n",
" 'and',\n",
" 'schedule',\n",
" 'notice',\n",
" 'is',\n",
" 'hereby'),\n",
" 63),\n",
" (('to', 'new', 'york', 'city', 'charter', '312', '(', 'a'), 63),\n",
" (('(', 's', ')', 'not', 'included', 'in', 'the', 'fy'), 63),\n",
" (('annual',\n",
" 'contracting',\n",
" 'plan',\n",
" 'and',\n",
" 'schedule',\n",
" 'that',\n",
" 'is',\n",
" 'published'),\n",
" 63),\n",
" (('pursuant', 'to', 'new', 'york', 'city', 'charter', '312', '('), 63),\n",
" (('that', 'is', 'published', 'pursuant', 'to', 'new', 'york', 'city'), 63),\n",
" (('city', 'charter', '312', '(', 'a', ')', ':', 'agency'), 63),\n",
" (('schedule', 'notice', 'is', 'hereby', 'given', 'that', 'the', 'mayor'), 63),\n",
" (('published', 'pursuant', 'to', 'new', 'york', 'city', 'charter', '312'),\n",
" 63),\n",
" (('schedule', 'that', 'is', 'published', 'pursuant', 'to', 'new', 'york'),\n",
" 63),\n",
" (('contracting',\n",
" 'plan',\n",
" 'and',\n",
" 'schedule',\n",
" 'notice',\n",
" 'is',\n",
" 'hereby',\n",
" 'given'),\n",
" 63),\n",
" (('york', 'city', 'charter', '312', '(', 'a', ')', ':'), 63),\n",
" (('and', 'schedule', 'notice', 'is', 'hereby', 'given', 'that', 'the'), 63),\n",
" (('plan', 'and', 'schedule', 'notice', 'is', 'hereby', 'given', 'that'), 63)]"
]
}
],
"prompt_number": 43
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# 9-grams: every run of nine consecutive lower-cased tokens.\n",
"corpus9grams = list(ngrams(lowerTokens, n=9))\n",
"\n",
"# Tally them and display the 50 most frequent.\n",
"corpus9gramFreqs = nltk.FreqDist(corpus9grams)\n",
"corpus9gramFreqs.most_common(50)"
],
"language": "python",
"metadata": {},
"outputs": [
{
"metadata": {},
"output_type": "pyout",
"prompt_number": 44,
"text": [
"[(('--', '--', '--', '--', '--', '--', '--', '--', '--'), 168),\n",
" (('of',\n",
" 'personnel',\n",
" 'in',\n",
" 'substantially',\n",
" 'similar',\n",
" 'titles',\n",
" 'within',\n",
" 'agency',\n",
" ':'),\n",
" 129),\n",
" (('headcount',\n",
" 'of',\n",
" 'personnel',\n",
" 'in',\n",
" 'substantially',\n",
" 'similar',\n",
" 'titles',\n",
" 'within',\n",
" 'agency'),\n",
" 129),\n",
" (('maintain',\n",
" ',',\n",
" 'and',\n",
" 'operate',\n",
" 'an',\n",
" 'unenclosed',\n",
" 'sidewalk',\n",
" 'caf',\n",
" 'for'),\n",
" 112),\n",
" ((',', 'and', 'operate', 'an', 'unenclosed', 'sidewalk', 'caf', 'for', 'a'),\n",
" 112),\n",
" (('and',\n",
" 'operate',\n",
" 'an',\n",
" 'unenclosed',\n",
" 'sidewalk',\n",
" 'caf',\n",
" 'for',\n",
" 'a',\n",
" 'term'),\n",
" 112),\n",
" (('operate', 'an', 'unenclosed', 'sidewalk', 'caf', 'for', 'a', 'term', 'of'),\n",
" 112),\n",
" (('method',\n",
" 'of',\n",
" 'solicitation',\n",
" 'the',\n",
" 'agency',\n",
" 'intends',\n",
" 'to',\n",
" 'utilize',\n",
" ':'),\n",
" 103),\n",
" (('sidewalk', 'caf', 'for', 'a', 'term', 'of', 'four', 'years', '.'), 101),\n",
" (('unenclosed', 'sidewalk', 'caf', 'for', 'a', 'term', 'of', 'four', 'years'),\n",
" 101),\n",
" (('caf', 'for', 'a', 'term', 'of', 'four', 'years', '.', ')'), 101),\n",
" (('an', 'unenclosed', 'sidewalk', 'caf', 'for', 'a', 'term', 'of', 'four'),\n",
" 94),\n",
" (('personnel',\n",
" 'in',\n",
" 'substantially',\n",
" 'similar',\n",
" 'titles',\n",
" 'within',\n",
" 'agency',\n",
" ':',\n",
" 'none'),\n",
" 93),\n",
" ((':',\n",
" 'none',\n",
" 'headcount',\n",
" 'of',\n",
" 'personnel',\n",
" 'in',\n",
" 'substantially',\n",
" 'similar',\n",
" 'titles'),\n",
" 92),\n",
" (('agency',\n",
" ':',\n",
" 'none',\n",
" 'headcount',\n",
" 'of',\n",
" 'personnel',\n",
" 'in',\n",
" 'substantially',\n",
" 'similar'),\n",
" 92),\n",
" (('within',\n",
" 'agency',\n",
" ':',\n",
" 'none',\n",
" 'headcount',\n",
" 'of',\n",
" 'personnel',\n",
" 'in',\n",
" 'substantially'),\n",
" 91),\n",
" (('to',\n",
" 'maintain',\n",
" ',',\n",
" 'and',\n",
" 'operate',\n",
" 'an',\n",
" 'unenclosed',\n",
" 'sidewalk',\n",
" 'caf'),\n",
" 91),\n",
" (('substantially',\n",
" 'similar',\n",
" 'titles',\n",
" 'within',\n",
" 'agency',\n",
" ':',\n",
" 'none',\n",
" 'headcount',\n",
" 'of'),\n",
" 91),\n",
" (('similar',\n",
" 'titles',\n",
" 'within',\n",
" 'agency',\n",
" ':',\n",
" 'none',\n",
" 'headcount',\n",
" 'of',\n",
" 'personnel'),\n",
" 91),\n",
" (('titles',\n",
" 'within',\n",
" 'agency',\n",
" ':',\n",
" 'none',\n",
" 'headcount',\n",
" 'of',\n",
" 'personnel',\n",
" 'in'),\n",
" 91),\n",
" (('in',\n",
" 'substantially',\n",
" 'similar',\n",
" 'titles',\n",
" 'within',\n",
" 'agency',\n",
" ':',\n",
" 'none',\n",
" 'headcount'),\n",
" 91),\n",
" (('none',\n",
" 'headcount',\n",
" 'of',\n",
" 'personnel',\n",
" 'in',\n",
" 'substantially',\n",
" 'similar',\n",
" 'titles',\n",
" 'within'),\n",
" 89),\n",
" (('(', 'to', 'continue', 'to', 'maintain', ',', 'and', 'operate', 'an'), 86),\n",
" (('in', 'the', 'borough', 'of', 'manhattan', '(', 'to', 'continue', 'to'),\n",
" 86),\n",
" (('personnel',\n",
" 'in',\n",
" 'substantially',\n",
" 'similar',\n",
" 'titles',\n",
" 'within',\n",
" 'agency',\n",
" ':',\n",
" '0'),\n",
" 83),\n",
" (('continue',\n",
" 'to',\n",
" 'maintain',\n",
" ',',\n",
" 'and',\n",
" 'operate',\n",
" 'an',\n",
" 'unenclosed',\n",
" 'sidewalk'),\n",
" 82),\n",
" (('to',\n",
" 'continue',\n",
" 'to',\n",
" 'maintain',\n",
" ',',\n",
" 'and',\n",
" 'operate',\n",
" 'an',\n",
" 'unenclosed'),\n",
" 79),\n",
" (('of', 'manhattan', '(', 'to', 'continue', 'to', 'maintain', ',', 'and'),\n",
" 74),\n",
" (('manhattan',\n",
" '(',\n",
" 'to',\n",
" 'continue',\n",
" 'to',\n",
" 'maintain',\n",
" ',',\n",
" 'and',\n",
" 'operate'),\n",
" 74),\n",
" (('borough', 'of', 'manhattan', '(', 'to', 'continue', 'to', 'maintain', ','),\n",
" 74),\n",
" (('the',\n",
" 'borough',\n",
" 'of',\n",
" 'manhattan',\n",
" '(',\n",
" 'to',\n",
" 'continue',\n",
" 'to',\n",
" 'maintain'),\n",
" 74),\n",
" (('and',\n",
" 'schedule',\n",
" 'notice',\n",
" 'is',\n",
" 'hereby',\n",
" 'given',\n",
" 'that',\n",
" 'the',\n",
" 'mayor'),\n",
" 63),\n",
" (('schedule',\n",
" 'that',\n",
" 'is',\n",
" 'published',\n",
" 'pursuant',\n",
" 'to',\n",
" 'new',\n",
" 'york',\n",
" 'city'),\n",
" 63),\n",
" (('to', 'new', 'york', 'city', 'charter', '312', '(', 'a', ')'), 63),\n",
" (('contracting',\n",
" 'plan',\n",
" 'and',\n",
" 'schedule',\n",
" 'notice',\n",
" 'is',\n",
" 'hereby',\n",
" 'given',\n",
" 'that'),\n",
" 63),\n",
" (('city', 'charter', '312', '(', 'a', ')', ':', 'agency', ':'), 63),\n",
" (('pursuant', 'to', 'new', 'york', 'city', 'charter', '312', '(', 'a'), 63),\n",
" (('notice', 'is', 'hereby', 'given', 'that', 'the', 'mayor', 'will', 'be'),\n",
" 63),\n",
" (('that',\n",
" 'is',\n",
" 'published',\n",
" 'pursuant',\n",
" 'to',\n",
" 'new',\n",
" 'york',\n",
" 'city',\n",
" 'charter'),\n",
" 63),\n",
" (('york', 'city', 'charter', '312', '(', 'a', ')', ':', 'agency'), 63),\n",
" (('and',\n",
" 'schedule',\n",
" 'that',\n",
" 'is',\n",
" 'published',\n",
" 'pursuant',\n",
" 'to',\n",
" 'new',\n",
" 'york'),\n",
" 63),\n",
" (('plan',\n",
" 'and',\n",
" 'schedule',\n",
" 'notice',\n",
" 'is',\n",
" 'hereby',\n",
" 'given',\n",
" 'that',\n",
" 'the'),\n",
" 63),\n",
" (('annual',\n",
" 'contracting',\n",
" 'plan',\n",
" 'and',\n",
" 'schedule',\n",
" 'notice',\n",
" 'is',\n",
" 'hereby',\n",
" 'given'),\n",
" 63),\n",
" (('schedule',\n",
" 'notice',\n",
" 'is',\n",
" 'hereby',\n",
" 'given',\n",
" 'that',\n",
" 'the',\n",
" 'mayor',\n",
" 'will'),\n",
" 63),\n",
" (('plan',\n",
" 'and',\n",
" 'schedule',\n",
" 'that',\n",
" 'is',\n",
" 'published',\n",
" 'pursuant',\n",
" 'to',\n",
" 'new'),\n",
" 63),\n",
" (('contracting',\n",
" 'plan',\n",
" 'and',\n",
" 'schedule',\n",
" 'that',\n",
" 'is',\n",
" 'published',\n",
" 'pursuant',\n",
" 'to'),\n",
" 63),\n",
" (('new', 'york', 'city', 'charter', '312', '(', 'a', ')', ':'), 63),\n",
" (('published',\n",
" 'pursuant',\n",
" 'to',\n",
" 'new',\n",
" 'york',\n",
" 'city',\n",
" 'charter',\n",
" '312',\n",
" '('),\n",
" 63),\n",
" (('is',\n",
" 'published',\n",
" 'pursuant',\n",
" 'to',\n",
" 'new',\n",
" 'york',\n",
" 'city',\n",
" 'charter',\n",
" '312'),\n",
" 63),\n",
" (('annual',\n",
" 'contracting',\n",
" 'plan',\n",
" 'and',\n",
" 'schedule',\n",
" 'that',\n",
" 'is',\n",
" 'published',\n",
" 'pursuant'),\n",
" 63)]"
]
}
],
"prompt_number": 44
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"corpus5grams = ngrams(lowerTokens,5)\n",
"corpus5gramFreqs = nltk.FreqDist(corpus5grams)\n",
"corpus5gramFreqs.most_common(50)"
],
"language": "python",
"metadata": {},
"outputs": [
{
"metadata": {},
"output_type": "pyout",
"prompt_number": 47,
"text": [
"[(('personnel', 'in', 'substantially', 'similar', 'titles'), 272),\n",
" (('in', 'substantially', 'similar', 'titles', 'within'), 269),\n",
" (('similar', 'titles', 'within', 'agency', ':'), 264),\n",
" (('substantially', 'similar', 'titles', 'within', 'agency'), 264),\n",
" (('--', '--', '--', '--', '--'), 192),\n",
" (('of', 'the', 'proposed', 'contract', ':'), 186),\n",
" (('date', 'of', 'the', 'proposed', 'contract'), 186),\n",
" (('notice', 'is', 'hereby', 'given', 'that'), 185),\n",
" ((',', 'new', 'york', ',', 'ny'), 152),\n",
" (('agency', 'intends', 'to', 'utilize', ':'), 137),\n",
" (('headcount', 'of', 'personnel', 'in', 'substantially'), 136),\n",
" (('of', 'personnel', 'in', 'substantially', 'similar'), 136),\n",
" (('the', 'agency', 'intends', 'to', 'utilize'), 134),\n",
" (('start', 'date', 'of', 'the', 'proposed'), 131),\n",
" (('end', 'date', 'of', 'the', 'proposed'), 131),\n",
" (('(', 's', ')', 'not', 'included'), 126),\n",
" (('annual', 'contracting', 'plan', 'and', 'schedule'), 126),\n",
" (('s', ')', 'not', 'included', 'in'), 126),\n",
" (('maintain', ',', 'and', 'operate', 'an'), 124),\n",
" (('caf', 'for', 'a', 'term', 'of'), 124),\n",
" (('2015', 'annual', 'contracting', 'plan', 'and'), 124),\n",
" (('fy', '2015', 'annual', 'contracting', 'plan'), 124),\n",
" (('sidewalk', 'caf', 'for', 'a', 'term'), 124),\n",
" (('in', 'the', 'borough', 'of', 'manhattan'), 121),\n",
" (('unenclosed', 'sidewalk', 'caf', 'for', 'a'), 120),\n",
" (('is', 'hereby', 'given', 'that', 'the'), 116),\n",
" (('an', 'unenclosed', 'sidewalk', 'caf', 'for'), 112),\n",
" (('operate', 'an', 'unenclosed', 'sidewalk', 'caf'), 112),\n",
" ((',', 'and', 'operate', 'an', 'unenclosed'), 112),\n",
" (('and', 'operate', 'an', 'unenclosed', 'sidewalk'), 112),\n",
" (('the', 'city', 'of', 'new', 'york'), 104),\n",
" (('method', 'of', 'solicitation', 'the', 'agency'), 103),\n",
" (('of', 'solicitation', 'the', 'agency', 'intends'), 103),\n",
" (('solicitation', 'the', 'agency', 'intends', 'to'), 103),\n",
" (('a', 'term', 'of', 'four', 'years'), 102),\n",
" (('for', 'a', 'term', 'of', 'four'), 102),\n",
" (('of', 'four', 'years', '.', ')'), 101),\n",
" (('term', 'of', 'four', 'years', '.'), 101),\n",
" (('the', 'borough', 'of', 'manhattan', '('), 100),\n",
" ((',', '22', 'reade', 'street', ','), 100),\n",
" (('borough', 'of', 'manhattan', '(', 'to'), 99),\n",
" (('to', 'maintain', ',', 'and', 'operate'), 98),\n",
" (('floor', ',', 'new', 'york', ','), 96),\n",
" (('solicitation', '(', 's', ')', 'not'), 94),\n",
" (('titles', 'within', 'agency', ':', 'none'), 93),\n",
" (('none', 'headcount', 'of', 'personnel', 'in'), 92),\n",
" (('agency', ':', 'none', 'headcount', 'of'), 92),\n",
" ((':', 'none', 'headcount', 'of', 'personnel'), 92),\n",
" (('within', 'agency', ':', 'none', 'headcount'), 91),\n",
" (('continue', 'to', 'maintain', ',', 'and'), 89)]"
]
}
],
"prompt_number": 47
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"#...and let's stop here for now\n",
"corpus10grams = list(ngrams(lowerTokens,10))\n",
"corpus10gramFreqs = nltk.FreqDist(corpus10grams)\n",
"corpus10gramFreqs.most_common(50)"
],
"language": "python",
"metadata": {},
"outputs": [
{
"metadata": {},
"output_type": "pyout",
"prompt_number": 45,
"text": [
"[(('--', '--', '--', '--', '--', '--', '--', '--', '--', '--'), 162),\n",
" (('headcount',\n",
" 'of',\n",
" 'personnel',\n",
" 'in',\n",
" 'substantially',\n",
" 'similar',\n",
" 'titles',\n",
" 'within',\n",
" 'agency',\n",
" ':'),\n",
" 129),\n",
" (('maintain',\n",
" ',',\n",
" 'and',\n",
" 'operate',\n",
" 'an',\n",
" 'unenclosed',\n",
" 'sidewalk',\n",
" 'caf',\n",
" 'for',\n",
" 'a'),\n",
" 112),\n",
" ((',',\n",
" 'and',\n",
" 'operate',\n",
" 'an',\n",
" 'unenclosed',\n",
" 'sidewalk',\n",
" 'caf',\n",
" 'for',\n",
" 'a',\n",
" 'term'),\n",
" 112),\n",
" (('and',\n",
" 'operate',\n",
" 'an',\n",
" 'unenclosed',\n",
" 'sidewalk',\n",
" 'caf',\n",
" 'for',\n",
" 'a',\n",
" 'term',\n",
" 'of'),\n",
" 112),\n",
" (('sidewalk', 'caf', 'for', 'a', 'term', 'of', 'four', 'years', '.', ')'),\n",
" 101),\n",
" (('unenclosed',\n",
" 'sidewalk',\n",
" 'caf',\n",
" 'for',\n",
" 'a',\n",
" 'term',\n",
" 'of',\n",
" 'four',\n",
" 'years',\n",
" '.'),\n",
" 100),\n",
" (('operate',\n",
" 'an',\n",
" 'unenclosed',\n",
" 'sidewalk',\n",
" 'caf',\n",
" 'for',\n",
" 'a',\n",
" 'term',\n",
" 'of',\n",
" 'four'),\n",
" 94),\n",
" (('an',\n",
" 'unenclosed',\n",
" 'sidewalk',\n",
" 'caf',\n",
" 'for',\n",
" 'a',\n",
" 'term',\n",
" 'of',\n",
" 'four',\n",
" 'years'),\n",
" 94),\n",
" (('agency',\n",
" ':',\n",
" 'none',\n",
" 'headcount',\n",
" 'of',\n",
" 'personnel',\n",
" 'in',\n",
" 'substantially',\n",
" 'similar',\n",
" 'titles'),\n",
" 92),\n",
" (('in',\n",
" 'substantially',\n",
" 'similar',\n",
" 'titles',\n",
" 'within',\n",
" 'agency',\n",
" ':',\n",
" 'none',\n",
" 'headcount',\n",
" 'of'),\n",
" 91),\n",
" (('substantially',\n",
" 'similar',\n",
" 'titles',\n",
" 'within',\n",
" 'agency',\n",
" ':',\n",
" 'none',\n",
" 'headcount',\n",
" 'of',\n",
" 'personnel'),\n",
" 91),\n",
" (('titles',\n",
" 'within',\n",
" 'agency',\n",
" ':',\n",
" 'none',\n",
" 'headcount',\n",
" 'of',\n",
" 'personnel',\n",
" 'in',\n",
" 'substantially'),\n",
" 91),\n",
" (('within',\n",
" 'agency',\n",
" ':',\n",
" 'none',\n",
" 'headcount',\n",
" 'of',\n",
" 'personnel',\n",
" 'in',\n",
" 'substantially',\n",
" 'similar'),\n",
" 91),\n",
" (('personnel',\n",
" 'in',\n",
" 'substantially',\n",
" 'similar',\n",
" 'titles',\n",
" 'within',\n",
" 'agency',\n",
" ':',\n",
" 'none',\n",
" 'headcount'),\n",
" 91),\n",
" (('similar',\n",
" 'titles',\n",
" 'within',\n",
" 'agency',\n",
" ':',\n",
" 'none',\n",
" 'headcount',\n",
" 'of',\n",
" 'personnel',\n",
" 'in'),\n",
" 91),\n",
" (('to',\n",
" 'maintain',\n",
" ',',\n",
" 'and',\n",
" 'operate',\n",
" 'an',\n",
" 'unenclosed',\n",
" 'sidewalk',\n",
" 'caf',\n",
" 'for'),\n",
" 91),\n",
" ((':',\n",
" 'none',\n",
" 'headcount',\n",
" 'of',\n",
" 'personnel',\n",
" 'in',\n",
" 'substantially',\n",
" 'similar',\n",
" 'titles',\n",
" 'within'),\n",
" 89),\n",
" (('none',\n",
" 'headcount',\n",
" 'of',\n",
" 'personnel',\n",
" 'in',\n",
" 'substantially',\n",
" 'similar',\n",
" 'titles',\n",
" 'within',\n",
" 'agency'),\n",
" 85),\n",
" (('of',\n",
" 'personnel',\n",
" 'in',\n",
" 'substantially',\n",
" 'similar',\n",
" 'titles',\n",
" 'within',\n",
" 'agency',\n",
" ':',\n",
" '0'),\n",
" 83),\n",
" (('continue',\n",
" 'to',\n",
" 'maintain',\n",
" ',',\n",
" 'and',\n",
" 'operate',\n",
" 'an',\n",
" 'unenclosed',\n",
" 'sidewalk',\n",
" 'caf'),\n",
" 82),\n",
" (('(',\n",
" 'to',\n",
" 'continue',\n",
" 'to',\n",
" 'maintain',\n",
" ',',\n",
" 'and',\n",
" 'operate',\n",
" 'an',\n",
" 'unenclosed'),\n",
" 79),\n",
" (('to',\n",
" 'continue',\n",
" 'to',\n",
" 'maintain',\n",
" ',',\n",
" 'and',\n",
" 'operate',\n",
" 'an',\n",
" 'unenclosed',\n",
" 'sidewalk'),\n",
" 79),\n",
" (('the',\n",
" 'borough',\n",
" 'of',\n",
" 'manhattan',\n",
" '(',\n",
" 'to',\n",
" 'continue',\n",
" 'to',\n",
" 'maintain',\n",
" ','),\n",
" 74),\n",
" (('manhattan',\n",
" '(',\n",
" 'to',\n",
" 'continue',\n",
" 'to',\n",
" 'maintain',\n",
" ',',\n",
" 'and',\n",
" 'operate',\n",
" 'an'),\n",
" 74),\n",
" (('in',\n",
" 'the',\n",
" 'borough',\n",
" 'of',\n",
" 'manhattan',\n",
" '(',\n",
" 'to',\n",
" 'continue',\n",
" 'to',\n",
" 'maintain'),\n",
" 74),\n",
" (('borough',\n",
" 'of',\n",
" 'manhattan',\n",
" '(',\n",
" 'to',\n",
" 'continue',\n",
" 'to',\n",
" 'maintain',\n",
" ',',\n",
" 'and'),\n",
" 74),\n",
" (('of',\n",
" 'manhattan',\n",
" '(',\n",
" 'to',\n",
" 'continue',\n",
" 'to',\n",
" 'maintain',\n",
" ',',\n",
" 'and',\n",
" 'operate'),\n",
" 74),\n",
" (('published',\n",
" 'pursuant',\n",
" 'to',\n",
" 'new',\n",
" 'york',\n",
" 'city',\n",
" 'charter',\n",
" '312',\n",
" '(',\n",
" 'a'),\n",
" 63),\n",
" (('new', 'york', 'city', 'charter', '312', '(', 'a', ')', ':', 'agency'), 63),\n",
" (('annual',\n",
" 'contracting',\n",
" 'plan',\n",
" 'and',\n",
" 'schedule',\n",
" 'that',\n",
" 'is',\n",
" 'published',\n",
" 'pursuant',\n",
" 'to'),\n",
" 63),\n",
" (('is',\n",
" 'published',\n",
" 'pursuant',\n",
" 'to',\n",
" 'new',\n",
" 'york',\n",
" 'city',\n",
" 'charter',\n",
" '312',\n",
" '('),\n",
" 63),\n",
" (('contracting',\n",
" 'plan',\n",
" 'and',\n",
" 'schedule',\n",
" 'notice',\n",
" 'is',\n",
" 'hereby',\n",
" 'given',\n",
" 'that',\n",
" 'the'),\n",
" 63),\n",
" (('schedule',\n",
" 'notice',\n",
" 'is',\n",
" 'hereby',\n",
" 'given',\n",
" 'that',\n",
" 'the',\n",
" 'mayor',\n",
" 'will',\n",
" 'be'),\n",
" 63),\n",
" (('and',\n",
" 'schedule',\n",
" 'that',\n",
" 'is',\n",
" 'published',\n",
" 'pursuant',\n",
" 'to',\n",
" 'new',\n",
" 'york',\n",
" 'city'),\n",
" 63),\n",
" (('plan',\n",
" 'and',\n",
" 'schedule',\n",
" 'that',\n",
" 'is',\n",
" 'published',\n",
" 'pursuant',\n",
" 'to',\n",
" 'new',\n",
" 'york'),\n",
" 63),\n",
" (('york', 'city', 'charter', '312', '(', 'a', ')', ':', 'agency', ':'), 63),\n",
" (('pursuant', 'to', 'new', 'york', 'city', 'charter', '312', '(', 'a', ')'),\n",
" 63),\n",
" (('to', 'new', 'york', 'city', 'charter', '312', '(', 'a', ')', ':'), 63),\n",
" (('schedule',\n",
" 'that',\n",
" 'is',\n",
" 'published',\n",
" 'pursuant',\n",
" 'to',\n",
" 'new',\n",
" 'york',\n",
" 'city',\n",
" 'charter'),\n",
" 63),\n",
" (('that',\n",
" 'is',\n",
" 'published',\n",
" 'pursuant',\n",
" 'to',\n",
" 'new',\n",
" 'york',\n",
" 'city',\n",
" 'charter',\n",
" '312'),\n",
" 63),\n",
" (('annual',\n",
" 'contracting',\n",
" 'plan',\n",
" 'and',\n",
" 'schedule',\n",
" 'notice',\n",
" 'is',\n",
" 'hereby',\n",
" 'given',\n",
" 'that'),\n",
" 63),\n",
" (('and',\n",
" 'schedule',\n",
" 'notice',\n",
" 'is',\n",
" 'hereby',\n",
" 'given',\n",
" 'that',\n",
" 'the',\n",
" 'mayor',\n",
" 'will'),\n",
" 63),\n",
" (('plan',\n",
" 'and',\n",
" 'schedule',\n",
" 'notice',\n",
" 'is',\n",
" 'hereby',\n",
" 'given',\n",
" 'that',\n",
" 'the',\n",
" 'mayor'),\n",
" 63),\n",
" (('contracting',\n",
" 'plan',\n",
" 'and',\n",
" 'schedule',\n",
" 'that',\n",
" 'is',\n",
" 'published',\n",
" 'pursuant',\n",
" 'to',\n",
" 'new'),\n",
" 63),\n",
" (('included',\n",
" 'in',\n",
" 'fy',\n",
" '2015',\n",
" 'annual',\n",
" 'contracting',\n",
" 'plan',\n",
" 'and',\n",
" 'schedule',\n",
" 'notice'),\n",
" 62),\n",
" (('fy',\n",
" '2015',\n",
" 'annual',\n",
" 'contracting',\n",
" 'plan',\n",
" 'and',\n",
" 'schedule',\n",
" 'that',\n",
" 'is',\n",
" 'published'),\n",
" 62),\n",
" (('in',\n",
" 'the',\n",
" 'fy',\n",
" '2015',\n",
" 'annual',\n",
" 'contracting',\n",
" 'plan',\n",
" 'and',\n",
" 'schedule',\n",
" 'that'),\n",
" 62),\n",
" (('in',\n",
" 'fy',\n",
" '2015',\n",
" 'annual',\n",
" 'contracting',\n",
" 'plan',\n",
" 'and',\n",
" 'schedule',\n",
" 'notice',\n",
" 'is'),\n",
" 62),\n",
" (('2015',\n",
" 'annual',\n",
" 'contracting',\n",
" 'plan',\n",
" 'and',\n",
" 'schedule',\n",
" 'that',\n",
" 'is',\n",
" 'published',\n",
" 'pursuant'),\n",
" 62)]"
]
}
],
"prompt_number": 45
},
{
"cell_type": "code",
"collapsed": false,
"input": [],
"language": "python",
"metadata": {},
"outputs": []
}
],
"metadata": {}
}
]
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment