Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Save mattalhonte/9b123ff1f459648ede69 to your computer and use it in GitHub Desktop.
Save mattalhonte/9b123ff1f459648ede69 to your computer and use it in GitHub Desktop.
Display the source blob
Display the rendered blob
Raw
{
"metadata": {
"name": "",
"signature": "sha256:8daed3828e6ac1650a59cde7fd5c3a55170c45e3319dd91ae7dbfcc4bb18fca4"
},
"nbformat": 3,
"nbformat_minor": 0,
"worksheets": [
{
"cells": [
{
"cell_type": "code",
"collapsed": false,
"input": [
"import pandas as pd\n",
"import nltk\n",
"from nltk.util import ngrams\n",
"#Fetch the NLTK resources this notebook relies on (a no-op when they are already installed)\n",
"nltk.download('punkt')\n",
"nltk.download('stopwords')\n",
"\n",
"#Importing the dataset\n",
"#NOTE(review): this folder only exists on the original author's machine;\n",
"#point it at the directory that holds the CSV before running elsewhere.\n",
"%cd C:\\Users\\Matt\\Dropbox\\Python Workspace\\CROW\\CROL-PDF\n",
"data = pd.read_csv(\"procPublicationRequest_Oct-Dec_2014_clean - procPublicationRequest_Oct-Dec_2014_clean.csv\")\n",
"\n",
"#Snagging the \"human_readable\" column.\n",
"#read_csv parses empty cells as NaN, and str(NaN) is the literal text 'nan',\n",
"#so blank out missing values first to keep a fake 'nan' word out of the word counts.\n",
"#fillna keeps every row, so positions such as strReadable[0] are unchanged.\n",
"human_readableList = list(data['human_readable'].fillna(''))\n",
"\n",
"#Turn the values into strings (guards against any non-text values in the column)\n",
"strReadable = [str(a) for a in human_readableList]\n",
"\n",
"#Split each entry on whitespace into a list of words\n",
"listOfLists = [a.split() for a in strReadable]"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"[nltk_data] Downloading package punkt to\n",
"[nltk_data] C:\\Users\\Matt\\AppData\\Roaming\\nltk_data...\n",
"[nltk_data] Package punkt is already up-to-date!\n",
"[nltk_data] Downloading package stopwords to\n",
"[nltk_data] C:\\Users\\Matt\\AppData\\Roaming\\nltk_data...\n",
"[nltk_data] Package stopwords is already up-to-date!\n",
"C:\\Users\\Matt\\Dropbox\\Python Workspace\\CROW\\CROL-PDF"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n"
]
}
],
"prompt_number": 4
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"#Peek at the first notice to get a feel for the raw text.\n",
"#It is one long, unstructured run of sentences, addresses and phone numbers.\n",
"strReadable[0]"
],
"language": "python",
"metadata": {},
"outputs": [
{
"metadata": {},
"output_type": "pyout",
"prompt_number": 5,
"text": [
"'OWNERS ARE WANTED BY THE PROPERTY CLERK DIVISION OF THE NEW YORK CITY POLICE DEPARTMENT. The following listed property is in the custody, of the Property Clerk Division without claimants. Recovered, lost, abandoned property, obtained from prisoners, emotionally disturbed, intoxicated and deceased persons; and property obtained from persons incapable of caring for themselves. Motor vehicles, boats, bicycles, business machines, cameras, calculating machines, electrical and optical property, furniture, furs, handbags, hardware, jewelry, photographic equipment, radios, robes, sound systems, surgical and musical instruments, tools, wearing apparel, communications equipment, computers, and other miscellaneous articles. INQUIRIES Inquiries relating to such property should be made in the Borough concerned, at the following office of the Property Clerk. FOR MOTOR VEHICLES(All Boroughs): Springfield Gardens Auto Pound, 174-20 North Boundary Road, Queens, NY 11430, (718) 553-9555 Erie Basin Auto Pound, 700 Columbia Street, Brooklyn, NY 11231, (718) 246-2030 FOR ALL OTHER PROPERTY Manhattan - 1 Police Plaza, New York, NY 10038, (646) 610-5906 Brooklyn - 84th Precinct, 301 Gold Street, Brooklyn, NY 11201, (718) 875-6675 Bronx Property Clerk - 215 East 161 Street, Bronx, NY 10451, (718) 590-2806 Queens Property Clerk - 47-07 Pearson Place, Long Island City, NY 11101, (718) 433-2678 Staten Island Property Clerk - 1 Edgewater Plaza, Staten Island, NY 10301, (718) 876-8484'"
]
}
],
"prompt_number": 5
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"#Run the first notice through NLTK's word tokenizer.\n",
"#The result is a flat list of tokens: words, numbers and punctuation marks each become their own item.\n",
"firstEntryTokenized = nltk.word_tokenize(strReadable[0])\n",
"firstEntryTokenized"
],
"language": "python",
"metadata": {},
"outputs": [
{
"metadata": {},
"output_type": "pyout",
"prompt_number": 7,
"text": [
"['OWNERS',\n",
" 'ARE',\n",
" 'WANTED',\n",
" 'BY',\n",
" 'THE',\n",
" 'PROPERTY',\n",
" 'CLERK',\n",
" 'DIVISION',\n",
" 'OF',\n",
" 'THE',\n",
" 'NEW',\n",
" 'YORK',\n",
" 'CITY',\n",
" 'POLICE',\n",
" 'DEPARTMENT',\n",
" '.',\n",
" 'The',\n",
" 'following',\n",
" 'listed',\n",
" 'property',\n",
" 'is',\n",
" 'in',\n",
" 'the',\n",
" 'custody',\n",
" ',',\n",
" 'of',\n",
" 'the',\n",
" 'Property',\n",
" 'Clerk',\n",
" 'Division',\n",
" 'without',\n",
" 'claimants',\n",
" '.',\n",
" 'Recovered',\n",
" ',',\n",
" 'lost',\n",
" ',',\n",
" 'abandoned',\n",
" 'property',\n",
" ',',\n",
" 'obtained',\n",
" 'from',\n",
" 'prisoners',\n",
" ',',\n",
" 'emotionally',\n",
" 'disturbed',\n",
" ',',\n",
" 'intoxicated',\n",
" 'and',\n",
" 'deceased',\n",
" 'persons',\n",
" ';',\n",
" 'and',\n",
" 'property',\n",
" 'obtained',\n",
" 'from',\n",
" 'persons',\n",
" 'incapable',\n",
" 'of',\n",
" 'caring',\n",
" 'for',\n",
" 'themselves',\n",
" '.',\n",
" 'Motor',\n",
" 'vehicles',\n",
" ',',\n",
" 'boats',\n",
" ',',\n",
" 'bicycles',\n",
" ',',\n",
" 'business',\n",
" 'machines',\n",
" ',',\n",
" 'cameras',\n",
" ',',\n",
" 'calculating',\n",
" 'machines',\n",
" ',',\n",
" 'electrical',\n",
" 'and',\n",
" 'optical',\n",
" 'property',\n",
" ',',\n",
" 'furniture',\n",
" ',',\n",
" 'furs',\n",
" ',',\n",
" 'handbags',\n",
" ',',\n",
" 'hardware',\n",
" ',',\n",
" 'jewelry',\n",
" ',',\n",
" 'photographic',\n",
" 'equipment',\n",
" ',',\n",
" 'radios',\n",
" ',',\n",
" 'robes',\n",
" ',',\n",
" 'sound',\n",
" 'systems',\n",
" ',',\n",
" 'surgical',\n",
" 'and',\n",
" 'musical',\n",
" 'instruments',\n",
" ',',\n",
" 'tools',\n",
" ',',\n",
" 'wearing',\n",
" 'apparel',\n",
" ',',\n",
" 'communications',\n",
" 'equipment',\n",
" ',',\n",
" 'computers',\n",
" ',',\n",
" 'and',\n",
" 'other',\n",
" 'miscellaneous',\n",
" 'articles',\n",
" '.',\n",
" 'INQUIRIES',\n",
" 'Inquiries',\n",
" 'relating',\n",
" 'to',\n",
" 'such',\n",
" 'property',\n",
" 'should',\n",
" 'be',\n",
" 'made',\n",
" 'in',\n",
" 'the',\n",
" 'Borough',\n",
" 'concerned',\n",
" ',',\n",
" 'at',\n",
" 'the',\n",
" 'following',\n",
" 'office',\n",
" 'of',\n",
" 'the',\n",
" 'Property',\n",
" 'Clerk',\n",
" '.',\n",
" 'FOR',\n",
" 'MOTOR',\n",
" 'VEHICLES',\n",
" '(',\n",
" 'All',\n",
" 'Boroughs',\n",
" ')',\n",
" ':',\n",
" 'Springfield',\n",
" 'Gardens',\n",
" 'Auto',\n",
" 'Pound',\n",
" ',',\n",
" '174-20',\n",
" 'North',\n",
" 'Boundary',\n",
" 'Road',\n",
" ',',\n",
" 'Queens',\n",
" ',',\n",
" 'NY',\n",
" '11430',\n",
" ',',\n",
" '(',\n",
" '718',\n",
" ')',\n",
" '553-9555',\n",
" 'Erie',\n",
" 'Basin',\n",
" 'Auto',\n",
" 'Pound',\n",
" ',',\n",
" '700',\n",
" 'Columbia',\n",
" 'Street',\n",
" ',',\n",
" 'Brooklyn',\n",
" ',',\n",
" 'NY',\n",
" '11231',\n",
" ',',\n",
" '(',\n",
" '718',\n",
" ')',\n",
" '246-2030',\n",
" 'FOR',\n",
" 'ALL',\n",
" 'OTHER',\n",
" 'PROPERTY',\n",
" 'Manhattan',\n",
" '-',\n",
" '1',\n",
" 'Police',\n",
" 'Plaza',\n",
" ',',\n",
" 'New',\n",
" 'York',\n",
" ',',\n",
" 'NY',\n",
" '10038',\n",
" ',',\n",
" '(',\n",
" '646',\n",
" ')',\n",
" '610-5906',\n",
" 'Brooklyn',\n",
" '-',\n",
" '84th',\n",
" 'Precinct',\n",
" ',',\n",
" '301',\n",
" 'Gold',\n",
" 'Street',\n",
" ',',\n",
" 'Brooklyn',\n",
" ',',\n",
" 'NY',\n",
" '11201',\n",
" ',',\n",
" '(',\n",
" '718',\n",
" ')',\n",
" '875-6675',\n",
" 'Bronx',\n",
" 'Property',\n",
" 'Clerk',\n",
" '-',\n",
" '215',\n",
" 'East',\n",
" '161',\n",
" 'Street',\n",
" ',',\n",
" 'Bronx',\n",
" ',',\n",
" 'NY',\n",
" '10451',\n",
" ',',\n",
" '(',\n",
" '718',\n",
" ')',\n",
" '590-2806',\n",
" 'Queens',\n",
" 'Property',\n",
" 'Clerk',\n",
" '-',\n",
" '47-07',\n",
" 'Pearson',\n",
" 'Place',\n",
" ',',\n",
" 'Long',\n",
" 'Island',\n",
" 'City',\n",
" ',',\n",
" 'NY',\n",
" '11101',\n",
" ',',\n",
" '(',\n",
" '718',\n",
" ')',\n",
" '433-2678',\n",
" 'Staten',\n",
" 'Island',\n",
" 'Property',\n",
" 'Clerk',\n",
" '-',\n",
" '1',\n",
" 'Edgewater',\n",
" 'Plaza',\n",
" ',',\n",
" 'Staten',\n",
" 'Island',\n",
" ',',\n",
" 'NY',\n",
" '10301',\n",
" ',',\n",
" '(',\n",
" '718',\n",
" ')',\n",
" '876-8484']"
]
}
],
"prompt_number": 7
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"#Wrap the token list in an nltk.Text object so NLTK's text-exploration helpers can be used on it\n",
"firstEntryText = nltk.Text(firstEntryTokenized)"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 9
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"#Build a frequency distribution: a count of how many times each token occurs in the first notice\n",
"firstEntryFreqDist = nltk.FreqDist(firstEntryText)"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 10
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"#Show the 10 most frequent tokens with their counts.\n",
"#Punctuation marks are tokens too, so expect commas and parentheses near the top.\n",
"firstEntryFreqDist.most_common(10)"
],
"language": "python",
"metadata": {},
"outputs": [
{
"metadata": {},
"output_type": "pyout",
"prompt_number": 14,
"text": [
"[(',', 52),\n",
" (')', 8),\n",
" ('(', 8),\n",
" ('NY', 7),\n",
" ('718', 6),\n",
" ('Property', 5),\n",
" ('Clerk', 5),\n",
" ('property', 5),\n",
" ('the', 5),\n",
" ('-', 5)]"
]
}
],
"prompt_number": 14
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"#Phrases work too: ngrams(tokens, 2) yields every pair of adjacent tokens (bigrams).\n",
"#Change the 2 to a larger number to get longer phrases.\n",
"firstEntryBigrams = list(ngrams(firstEntryTokenized, 2))\n",
"firstEntryBigrams"
],
"language": "python",
"metadata": {},
"outputs": [
{
"metadata": {},
"output_type": "pyout",
"prompt_number": 15,
"text": [
"[('OWNERS', 'ARE'),\n",
" ('ARE', 'WANTED'),\n",
" ('WANTED', 'BY'),\n",
" ('BY', 'THE'),\n",
" ('THE', 'PROPERTY'),\n",
" ('PROPERTY', 'CLERK'),\n",
" ('CLERK', 'DIVISION'),\n",
" ('DIVISION', 'OF'),\n",
" ('OF', 'THE'),\n",
" ('THE', 'NEW'),\n",
" ('NEW', 'YORK'),\n",
" ('YORK', 'CITY'),\n",
" ('CITY', 'POLICE'),\n",
" ('POLICE', 'DEPARTMENT'),\n",
" ('DEPARTMENT', '.'),\n",
" ('.', 'The'),\n",
" ('The', 'following'),\n",
" ('following', 'listed'),\n",
" ('listed', 'property'),\n",
" ('property', 'is'),\n",
" ('is', 'in'),\n",
" ('in', 'the'),\n",
" ('the', 'custody'),\n",
" ('custody', ','),\n",
" (',', 'of'),\n",
" ('of', 'the'),\n",
" ('the', 'Property'),\n",
" ('Property', 'Clerk'),\n",
" ('Clerk', 'Division'),\n",
" ('Division', 'without'),\n",
" ('without', 'claimants'),\n",
" ('claimants', '.'),\n",
" ('.', 'Recovered'),\n",
" ('Recovered', ','),\n",
" (',', 'lost'),\n",
" ('lost', ','),\n",
" (',', 'abandoned'),\n",
" ('abandoned', 'property'),\n",
" ('property', ','),\n",
" (',', 'obtained'),\n",
" ('obtained', 'from'),\n",
" ('from', 'prisoners'),\n",
" ('prisoners', ','),\n",
" (',', 'emotionally'),\n",
" ('emotionally', 'disturbed'),\n",
" ('disturbed', ','),\n",
" (',', 'intoxicated'),\n",
" ('intoxicated', 'and'),\n",
" ('and', 'deceased'),\n",
" ('deceased', 'persons'),\n",
" ('persons', ';'),\n",
" (';', 'and'),\n",
" ('and', 'property'),\n",
" ('property', 'obtained'),\n",
" ('obtained', 'from'),\n",
" ('from', 'persons'),\n",
" ('persons', 'incapable'),\n",
" ('incapable', 'of'),\n",
" ('of', 'caring'),\n",
" ('caring', 'for'),\n",
" ('for', 'themselves'),\n",
" ('themselves', '.'),\n",
" ('.', 'Motor'),\n",
" ('Motor', 'vehicles'),\n",
" ('vehicles', ','),\n",
" (',', 'boats'),\n",
" ('boats', ','),\n",
" (',', 'bicycles'),\n",
" ('bicycles', ','),\n",
" (',', 'business'),\n",
" ('business', 'machines'),\n",
" ('machines', ','),\n",
" (',', 'cameras'),\n",
" ('cameras', ','),\n",
" (',', 'calculating'),\n",
" ('calculating', 'machines'),\n",
" ('machines', ','),\n",
" (',', 'electrical'),\n",
" ('electrical', 'and'),\n",
" ('and', 'optical'),\n",
" ('optical', 'property'),\n",
" ('property', ','),\n",
" (',', 'furniture'),\n",
" ('furniture', ','),\n",
" (',', 'furs'),\n",
" ('furs', ','),\n",
" (',', 'handbags'),\n",
" ('handbags', ','),\n",
" (',', 'hardware'),\n",
" ('hardware', ','),\n",
" (',', 'jewelry'),\n",
" ('jewelry', ','),\n",
" (',', 'photographic'),\n",
" ('photographic', 'equipment'),\n",
" ('equipment', ','),\n",
" (',', 'radios'),\n",
" ('radios', ','),\n",
" (',', 'robes'),\n",
" ('robes', ','),\n",
" (',', 'sound'),\n",
" ('sound', 'systems'),\n",
" ('systems', ','),\n",
" (',', 'surgical'),\n",
" ('surgical', 'and'),\n",
" ('and', 'musical'),\n",
" ('musical', 'instruments'),\n",
" ('instruments', ','),\n",
" (',', 'tools'),\n",
" ('tools', ','),\n",
" (',', 'wearing'),\n",
" ('wearing', 'apparel'),\n",
" ('apparel', ','),\n",
" (',', 'communications'),\n",
" ('communications', 'equipment'),\n",
" ('equipment', ','),\n",
" (',', 'computers'),\n",
" ('computers', ','),\n",
" (',', 'and'),\n",
" ('and', 'other'),\n",
" ('other', 'miscellaneous'),\n",
" ('miscellaneous', 'articles'),\n",
" ('articles', '.'),\n",
" ('.', 'INQUIRIES'),\n",
" ('INQUIRIES', 'Inquiries'),\n",
" ('Inquiries', 'relating'),\n",
" ('relating', 'to'),\n",
" ('to', 'such'),\n",
" ('such', 'property'),\n",
" ('property', 'should'),\n",
" ('should', 'be'),\n",
" ('be', 'made'),\n",
" ('made', 'in'),\n",
" ('in', 'the'),\n",
" ('the', 'Borough'),\n",
" ('Borough', 'concerned'),\n",
" ('concerned', ','),\n",
" (',', 'at'),\n",
" ('at', 'the'),\n",
" ('the', 'following'),\n",
" ('following', 'office'),\n",
" ('office', 'of'),\n",
" ('of', 'the'),\n",
" ('the', 'Property'),\n",
" ('Property', 'Clerk'),\n",
" ('Clerk', '.'),\n",
" ('.', 'FOR'),\n",
" ('FOR', 'MOTOR'),\n",
" ('MOTOR', 'VEHICLES'),\n",
" ('VEHICLES', '('),\n",
" ('(', 'All'),\n",
" ('All', 'Boroughs'),\n",
" ('Boroughs', ')'),\n",
" (')', ':'),\n",
" (':', 'Springfield'),\n",
" ('Springfield', 'Gardens'),\n",
" ('Gardens', 'Auto'),\n",
" ('Auto', 'Pound'),\n",
" ('Pound', ','),\n",
" (',', '174-20'),\n",
" ('174-20', 'North'),\n",
" ('North', 'Boundary'),\n",
" ('Boundary', 'Road'),\n",
" ('Road', ','),\n",
" (',', 'Queens'),\n",
" ('Queens', ','),\n",
" (',', 'NY'),\n",
" ('NY', '11430'),\n",
" ('11430', ','),\n",
" (',', '('),\n",
" ('(', '718'),\n",
" ('718', ')'),\n",
" (')', '553-9555'),\n",
" ('553-9555', 'Erie'),\n",
" ('Erie', 'Basin'),\n",
" ('Basin', 'Auto'),\n",
" ('Auto', 'Pound'),\n",
" ('Pound', ','),\n",
" (',', '700'),\n",
" ('700', 'Columbia'),\n",
" ('Columbia', 'Street'),\n",
" ('Street', ','),\n",
" (',', 'Brooklyn'),\n",
" ('Brooklyn', ','),\n",
" (',', 'NY'),\n",
" ('NY', '11231'),\n",
" ('11231', ','),\n",
" (',', '('),\n",
" ('(', '718'),\n",
" ('718', ')'),\n",
" (')', '246-2030'),\n",
" ('246-2030', 'FOR'),\n",
" ('FOR', 'ALL'),\n",
" ('ALL', 'OTHER'),\n",
" ('OTHER', 'PROPERTY'),\n",
" ('PROPERTY', 'Manhattan'),\n",
" ('Manhattan', '-'),\n",
" ('-', '1'),\n",
" ('1', 'Police'),\n",
" ('Police', 'Plaza'),\n",
" ('Plaza', ','),\n",
" (',', 'New'),\n",
" ('New', 'York'),\n",
" ('York', ','),\n",
" (',', 'NY'),\n",
" ('NY', '10038'),\n",
" ('10038', ','),\n",
" (',', '('),\n",
" ('(', '646'),\n",
" ('646', ')'),\n",
" (')', '610-5906'),\n",
" ('610-5906', 'Brooklyn'),\n",
" ('Brooklyn', '-'),\n",
" ('-', '84th'),\n",
" ('84th', 'Precinct'),\n",
" ('Precinct', ','),\n",
" (',', '301'),\n",
" ('301', 'Gold'),\n",
" ('Gold', 'Street'),\n",
" ('Street', ','),\n",
" (',', 'Brooklyn'),\n",
" ('Brooklyn', ','),\n",
" (',', 'NY'),\n",
" ('NY', '11201'),\n",
" ('11201', ','),\n",
" (',', '('),\n",
" ('(', '718'),\n",
" ('718', ')'),\n",
" (')', '875-6675'),\n",
" ('875-6675', 'Bronx'),\n",
" ('Bronx', 'Property'),\n",
" ('Property', 'Clerk'),\n",
" ('Clerk', '-'),\n",
" ('-', '215'),\n",
" ('215', 'East'),\n",
" ('East', '161'),\n",
" ('161', 'Street'),\n",
" ('Street', ','),\n",
" (',', 'Bronx'),\n",
" ('Bronx', ','),\n",
" (',', 'NY'),\n",
" ('NY', '10451'),\n",
" ('10451', ','),\n",
" (',', '('),\n",
" ('(', '718'),\n",
" ('718', ')'),\n",
" (')', '590-2806'),\n",
" ('590-2806', 'Queens'),\n",
" ('Queens', 'Property'),\n",
" ('Property', 'Clerk'),\n",
" ('Clerk', '-'),\n",
" ('-', '47-07'),\n",
" ('47-07', 'Pearson'),\n",
" ('Pearson', 'Place'),\n",
" ('Place', ','),\n",
" (',', 'Long'),\n",
" ('Long', 'Island'),\n",
" ('Island', 'City'),\n",
" ('City', ','),\n",
" (',', 'NY'),\n",
" ('NY', '11101'),\n",
" ('11101', ','),\n",
" (',', '('),\n",
" ('(', '718'),\n",
" ('718', ')'),\n",
" (')', '433-2678'),\n",
" ('433-2678', 'Staten'),\n",
" ('Staten', 'Island'),\n",
" ('Island', 'Property'),\n",
" ('Property', 'Clerk'),\n",
" ('Clerk', '-'),\n",
" ('-', '1'),\n",
" ('1', 'Edgewater'),\n",
" ('Edgewater', 'Plaza'),\n",
" ('Plaza', ','),\n",
" (',', 'Staten'),\n",
" ('Staten', 'Island'),\n",
" ('Island', ','),\n",
" (',', 'NY'),\n",
" ('NY', '10301'),\n",
" ('10301', ','),\n",
" (',', '('),\n",
" ('(', '718'),\n",
" ('718', ')'),\n",
" (')', '876-8484')]"
]
}
],
"prompt_number": 15
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"#Count how often each bigram occurs, the same way single tokens were counted earlier\n",
"firstEntryFreq = nltk.FreqDist(firstEntryBigrams)\n",
"firstEntryFreq"
],
"language": "python",
"metadata": {},
"outputs": [
{
"metadata": {},
"output_type": "pyout",
"prompt_number": 20,
"text": [
"FreqDist({(',', '('): 7, (',', 'NY'): 7, ('(', '718'): 6, ('718', ')'): 6, ('Property', 'Clerk'): 5, ('Clerk', '-'): 3, ('Street', ','): 3, (',', 'Brooklyn'): 2, ('Staten', 'Island'): 2, ('obtained', 'from'): 2, ...})"
]
}
],
"prompt_number": 20
}
],
"metadata": {}
}
]
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment