Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Save mattalhonte/0a789fb50414be833ae4 to your computer and use it in GitHub Desktop.
Save mattalhonte/0a789fb50414be833ae4 to your computer and use it in GitHub Desktop.
Display the source blob
Display the rendered blob
Raw
{
"metadata": {
"name": "",
"signature": "sha256:4605b64d978da5901887240a3bfc4cd72e17201f37a213b96a18e28e0d2a4051"
},
"nbformat": 3,
"nbformat_minor": 0,
"worksheets": [
{
"cells": [
{
"cell_type": "code",
"collapsed": false,
"input": [
"import pandas as pd\n",
"import nltk\n",
"from nltk.tag.util import tuple2str\n",
"import os\n",
"import os.path\n",
"from nltk.corpus.reader.plaintext import PlaintextCorpusReader\n",
"from nltk.corpus.reader.tagged import TaggedCorpusReader\n",
"\n",
"%cd C:\\Users\\Matt\\Dropbox\\Python Workspace\\CROW\\CROL-PDF"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"C:\\Users\\Matt\\Dropbox\\Python Workspace\\CROW\\CROL-PDF\n"
]
}
],
"prompt_number": 2
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"#Work from the original rawCorpus file, because it still has its line breaks.\n",
"#NLTK uses blank lines to separate \"paragraphs\" of a corpus, and we want to analyze entries separately\n",
"#(to build classifiers), so we need to keep track of where one entry ends and the next begins.\n",
"rawCorpus = PlaintextCorpusReader(os.getcwd(), 'rawCorpus.txt')\n",
"rawText = rawCorpus.raw()\n",
"#Each Windows line break (\\r\\n) marks the boundary between two entries\n",
"splitOnEntry = rawText.split(\"\\r\\n\")\n",
"#Break every entry down further into its individual word tokens\n",
"rawTokens = [nltk.word_tokenize(entryText) for entryText in splitOnEntry]"
],
"language": "python",
"metadata": {},
"outputs": []
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"def list2sent(myList):\n",
"    \"\"\"Render one entry's (word, tag) pairs as a single line of word/tag text.\n",
"\n",
"    myList is a sequence of (word, tag) tuples for one entry. The result is the\n",
"    tokens joined by spaces in word/tag form, always ending in a standalone './.'\n",
"    token. If the entry already ends with a '.' word, that token is replaced by\n",
"    the './.' terminator; any other final token is kept.\n",
"\n",
"    Previously the last token was dropped unconditionally (losing data when an\n",
"    entry did not end in a period) and a bare '.' was glued onto the last tag\n",
"    (e.g. ')/CD.'), which let the sentence splitter cut that tag short when the\n",
"    file was read back.\n",
"    \"\"\"\n",
"    taggedTokens = list(myList)\n",
"    if taggedTokens and taggedTokens[-1][0] == '.':\n",
"        taggedTokens = taggedTokens[:-1]\n",
"    return ' '.join([tuple2str(a) for a in taggedTokens] + ['./.'])\n",
"\n",
"#NOTE(review): posTaggedCorpus was never defined anywhere in this notebook, so a fresh kernel hit a NameError here.\n",
"#It is rebuilt from rawTokens with nltk.pos_tag - confirm this matches how the corpus was originally tagged.\n",
"posTaggedCorpus = [nltk.pos_tag(a) for a in rawTokens]\n",
"\n",
"taggedEntries = [list2sent(a) for a in posTaggedCorpus]"
],
"language": "python",
"metadata": {},
"outputs": []
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"#Start from an empty string; the text of the tagged corpus is collected in this variable\n",
"taggedString = \"\""
],
"language": "python",
"metadata": {},
"outputs": []
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"#Join all entries into one string, each followed by a blank line (NLTK treats blank lines as paragraph breaks).\n",
"#Built in a single step instead of appending in a loop, so re-running this cell cannot duplicate the corpus text.\n",
"taggedString = ''.join([myEntry + \"\\n\" + \"\\n\" for myEntry in taggedEntries])"
],
"language": "python",
"metadata": {},
"outputs": []
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"#Save the tagged corpus text to taggedCorpus.txt in the current working directory.\n",
"#The with-block closes the file even if the write raises.\n",
"with open('taggedCorpus.txt', 'w') as f:\n",
"    f.write(taggedString)"
],
"language": "python",
"metadata": {},
"outputs": []
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"#Now that we've done all that, we can play with our tagged corpus!\n",
"#A sentence ends at a './.' token, or at a bare '.' that directly follows the last tag of an entry.\n",
"#The old pattern \".\\.\" also swallowed the character before the period, which cut the final tag short\n",
"#('CD' was read back as 'C') and split inside any token that merely contained a period.\n",
"#The group is non-capturing because the tokenizer splits on this pattern (gaps=True).\n",
"sentenceBoundary = r\"\\s*(?:\\./\\.|(?<!/)\\.)(?=\\s|$)\"\n",
"readTagged = TaggedCorpusReader(os.getcwd(), 'taggedCorpus.txt', sent_tokenizer=nltk.RegexpTokenizer(sentenceBoundary, gaps=True))"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 3
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"#We can zoom in on a few different levels. Here are the paragraphs (one for each entry)\n",
"readTagged.paras()"
],
"language": "python",
"metadata": {},
"outputs": [
{
"metadata": {},
"output_type": "pyout",
"prompt_number": 4,
"text": [
"[[['.']], [['OWNERS', 'ARE', 'WANTED', 'BY', 'THE', 'PROPERTY', 'CLERK', 'DIVISION', 'OF', 'THE', 'NEW', 'YORK', 'CITY', 'POLICE', 'DEPARTMENT'], ['The', 'following', 'listed', 'property', 'is', 'in', 'the', 'custody', ',', 'of', 'the', 'Property', 'Clerk', 'Division', 'without', 'claimants'], ['Recovered', ',', 'lost', ',', 'abandoned', 'property', ',', 'obtained', 'from', 'prisoners', ',', 'emotionally', 'disturbed', ',', 'intoxicated', 'and', 'deceased', 'persons', ';', 'and', 'property', 'obtained', 'from', 'persons', 'incapable', 'of', 'caring', 'for', 'themselves'], ['Motor', 'vehicles', ',', 'boats', ',', 'bicycles', ',', 'business', 'machines', ',', 'cameras', ',', 'calculating', 'machines', ',', 'electrical', 'and', 'optical', 'property', ',', 'furniture', ',', 'furs', ',', 'handbags', ',', 'hardware', ',', 'jewelry', ',', 'photographic', 'equipment', ',', 'radios', ',', 'robes', ',', 'sound', 'systems', ',', 'surgical', 'and', 'musical', 'instruments', ',', 'tools', ',', 'wearing', 'apparel', ',', 'communications', 'equipment', ',', 'computers', ',', 'and', 'other', 'miscellaneous', 'articles'], ['INQUIRIES', 'Inquiries', 'relating', 'to', 'such', 'property', 'should', 'be', 'made', 'in', 'the', 'Borough', 'concerned', ',', 'at', 'the', 'following', 'office', 'of', 'the', 'Property', 'Clerk'], ['FOR', 'MOTOR', 'VEHICLES', '(', 'All', 'Boroughs', ')', ':', 'Springfield', 'Gardens', 'Auto', 'Pound', ',', '174-20', 'North', 'Boundary', 'Road', ',', 'Queens', ',', 'NY', '11430', ',', '(', '718', ')', '553-9555', 'Erie', 'Basin', 'Auto', 'Pound', ',', '700', 'Columbia', 'Street', ',', 'Brooklyn', ',', 'NY', '11231', ',', '(', '718', ')', '246-2030', 'FOR', 'ALL', 'OTHER', 'PROPERTY', 'Manhattan', '-', '1', 'Police', 'Plaza', ',', 'New', 'York', ',', 'NY', '10038', ',', '(', '646', ')', '610-5906', 'Brooklyn', '-', '84th', 'Precinct', ',', '301', 'Gold', 'Street', ',', 'Brooklyn', ',', 'NY', '11201', ',', '(', '718', ')', '875-6675', 'Bronx', 'Property', 
'Clerk', '-', '215', 'East', '161', 'Street', ',', 'Bronx', ',', 'NY', '10451', ',', '(', '718', ')', '590-2806', 'Queens', 'Property', 'Clerk', '-', '47-07', 'Pearson', 'Place', ',', 'Long', 'Island', 'City', ',', 'NY', '11101', ',', '(', '718', ')', '433-2678', 'Staten', 'Island', 'Property', 'Clerk', '-', '1', 'Edgewater', 'Plaza', ',', 'Staten', 'Island', ',', 'NY', '10301', ',', '(', '718', ')'], []], ...]"
]
}
],
"prompt_number": 4
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"#Sentences\n",
"readTagged.sents()"
],
"language": "python",
"metadata": {},
"outputs": [
{
"metadata": {},
"output_type": "pyout",
"prompt_number": 5,
"text": [
"[['.'], ['OWNERS', 'ARE', 'WANTED', 'BY', 'THE', 'PROPERTY', 'CLERK', 'DIVISION', 'OF', 'THE', 'NEW', 'YORK', 'CITY', 'POLICE', 'DEPARTMENT'], ...]"
]
}
],
"prompt_number": 5
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"#Words\n",
"readTagged.words()"
],
"language": "python",
"metadata": {},
"outputs": [
{
"metadata": {},
"output_type": "pyout",
"prompt_number": 6,
"text": [
"['.', 'OWNERS', 'ARE', 'WANTED', 'BY', 'THE', ...]"
]
}
],
"prompt_number": 6
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"#And here's the interesting part - tags!\n",
"readTagged.tagged_paras()"
],
"language": "python",
"metadata": {},
"outputs": [
{
"metadata": {},
"output_type": "pyout",
"prompt_number": 7,
"text": [
"[[[('.', None)]], [[('OWNERS', 'NNS'), ('ARE', 'VBP'), ('WANTED', 'VBN'), ('BY', 'NNP'), ('THE', 'NNP'), ('PROPERTY', 'NNP'), ('CLERK', 'NNP'), ('DIVISION', 'NNP'), ('OF', 'NNP'), ('THE', 'NNP'), ('NEW', 'NNP'), ('YORK', 'NNP'), ('CITY', 'NNP'), ('POLICE', 'NNP'), ('DEPARTMENT', 'NNP')], [('The', 'DT'), ('following', 'VBG'), ('listed', 'VBN'), ('property', 'NN'), ('is', 'VBZ'), ('in', 'IN'), ('the', 'DT'), ('custody', 'NN'), (',', ','), ('of', 'IN'), ('the', 'DT'), ('Property', 'NNP'), ('Clerk', 'NNP'), ('Division', 'NNP'), ('without', 'IN'), ('claimants', 'NNS')], [('Recovered', 'NNP'), (',', ','), ('lost', 'VBD'), (',', ','), ('abandoned', 'VBN'), ('property', 'NN'), (',', ','), ('obtained', 'VBD'), ('from', 'IN'), ('prisoners', 'NNS'), (',', ','), ('emotionally', 'RB'), ('disturbed', 'VBN'), (',', ','), ('intoxicated', 'VBN'), ('and', 'CC'), ('deceased', 'VBN'), ('persons', 'NNS'), (';', ':'), ('and', 'CC'), ('property', 'NN'), ('obtained', 'VBD'), ('from', 'IN'), ('persons', 'NNS'), ('incapable', 'JJ'), ('of', 'IN'), ('caring', 'NN'), ('for', 'IN'), ('themselves', 'PRP')], [('Motor', 'NNP'), ('vehicles', 'NNS'), (',', ','), ('boats', 'NNS'), (',', ','), ('bicycles', 'NNS'), (',', ','), ('business', 'NN'), ('machines', 'NNS'), (',', ','), ('cameras', 'NNS'), (',', ','), ('calculating', 'VBG'), ('machines', 'NNS'), (',', ','), ('electrical', 'JJ'), ('and', 'CC'), ('optical', 'JJ'), ('property', 'NN'), (',', ','), ('furniture', 'NN'), (',', ','), ('furs', 'NNS'), (',', ','), ('handbags', 'NNS'), (',', ','), ('hardware', 'NN'), (',', ','), ('jewelry', 'NN'), (',', ','), ('photographic', 'JJ'), ('equipment', 'NN'), (',', ','), ('radios', 'NNS'), (',', ','), ('robes', 'NNS'), (',', ','), ('sound', 'VBP'), ('systems', 'NNS'), (',', ','), ('surgical', 'JJ'), ('and', 'CC'), ('musical', 'JJ'), ('instruments', 'NNS'), (',', ','), ('tools', 'NNS'), (',', ','), ('wearing', 'VBG'), ('apparel', 'NN'), (',', ','), ('communications', 'NNS'), ('equipment', 'NN'), (',', ','), 
('computers', 'NNS'), (',', ','), ('and', 'CC'), ('other', 'JJ'), ('miscellaneous', 'JJ'), ('articles', 'NNS')], [('INQUIRIES', 'NNS'), ('Inquiries', 'NNPS'), ('relating', 'VBG'), ('to', 'TO'), ('such', 'JJ'), ('property', 'NN'), ('should', 'MD'), ('be', 'VB'), ('made', 'VBN'), ('in', 'IN'), ('the', 'DT'), ('Borough', 'NNP'), ('concerned', 'VBD'), (',', ','), ('at', 'IN'), ('the', 'DT'), ('following', 'JJ'), ('office', 'NN'), ('of', 'IN'), ('the', 'DT'), ('Property', 'NNP'), ('Clerk', 'NNP')], [('FOR', 'NNP'), ('MOTOR', 'NNP'), ('VEHICLES', 'NNP'), ('(', 'NNP'), ('All', 'NNP'), ('Boroughs', 'NNP'), (')', 'NNP'), (':', ':'), ('Springfield', 'NNP'), ('Gardens', 'NNP'), ('Auto', 'NNP'), ('Pound', 'NNP'), (',', ','), ('174-20', 'CD'), ('North', 'NNP'), ('Boundary', 'NNP'), ('Road', 'NNP'), (',', ','), ('Queens', 'NNP'), (',', ','), ('NY', 'NNP'), ('11430', 'CD'), (',', ','), ('(', ':'), ('718', 'CD'), (')', 'CD'), ('553-9555', 'CD'), ('Erie', 'NNP'), ('Basin', 'NNP'), ('Auto', 'NNP'), ('Pound', 'NNP'), (',', ','), ('700', 'CD'), ('Columbia', 'NNP'), ('Street', 'NNP'), (',', ','), ('Brooklyn', 'NNP'), (',', ','), ('NY', 'NNP'), ('11231', 'CD'), (',', ','), ('(', ':'), ('718', 'CD'), (')', 'CD'), ('246-2030', 'CD'), ('FOR', 'NNP'), ('ALL', 'NNP'), ('OTHER', 'NNP'), ('PROPERTY', 'NNP'), ('Manhattan', 'NNP'), ('-', ':'), ('1', 'CD'), ('Police', 'NNP'), ('Plaza', 'NNP'), (',', ','), ('New', 'NNP'), ('York', 'NNP'), (',', ','), ('NY', 'NNP'), ('10038', 'CD'), (',', ','), ('(', ':'), ('646', 'CD'), (')', 'CD'), ('610-5906', 'CD'), ('Brooklyn', 'NNP'), ('-', ':'), ('84th', 'JJ'), ('Precinct', 'NNP'), (',', ','), ('301', 'CD'), ('Gold', 'NNP'), ('Street', 'NNP'), (',', ','), ('Brooklyn', 'NNP'), (',', ','), ('NY', 'NNP'), ('11201', 'CD'), (',', ','), ('(', ':'), ('718', 'CD'), (')', 'CD'), ('875-6675', 'CD'), ('Bronx', 'NNP'), ('Property', 'NNP'), ('Clerk', 'NNP'), ('-', ':'), ('215', 'CD'), ('East', 'NNP'), ('161', 'CD'), ('Street', 'NNP'), (',', ','), ('Bronx', 'NNP'), (',', 
','), ('NY', 'NNP'), ('10451', 'CD'), (',', ','), ('(', ':'), ('718', 'CD'), (')', 'CD'), ('590-2806', 'CD'), ('Queens', 'NNP'), ('Property', 'NNP'), ('Clerk', 'NNP'), ('-', ':'), ('47-07', 'CD'), ('Pearson', 'NNP'), ('Place', 'NNP'), (',', ','), ('Long', 'NNP'), ('Island', 'NNP'), ('City', 'NNP'), (',', ','), ('NY', 'NNP'), ('11101', 'CD'), (',', ','), ('(', ':'), ('718', 'CD'), (')', 'CD'), ('433-2678', 'CD'), ('Staten', 'NNP'), ('Island', 'NNP'), ('Property', 'NNP'), ('Clerk', 'NNP'), ('-', ':'), ('1', 'CD'), ('Edgewater', 'NNP'), ('Plaza', 'NNP'), (',', ','), ('Staten', 'NNP'), ('Island', 'NNP'), (',', ','), ('NY', 'NNP'), ('10301', 'CD'), (',', ','), ('(', ':'), ('718', 'CD'), (')', 'C')], []], ...]"
]
}
],
"prompt_number": 7
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"#Tagged sentences!\n",
"readTagged.tagged_sents()"
],
"language": "python",
"metadata": {},
"outputs": [
{
"metadata": {},
"output_type": "pyout",
"prompt_number": 8,
"text": [
"[[('.', None)], [('OWNERS', 'NNS'), ('ARE', 'VBP'), ('WANTED', 'VBN'), ('BY', 'NNP'), ('THE', 'NNP'), ('PROPERTY', 'NNP'), ('CLERK', 'NNP'), ('DIVISION', 'NNP'), ('OF', 'NNP'), ('THE', 'NNP'), ('NEW', 'NNP'), ('YORK', 'NNP'), ('CITY', 'NNP'), ('POLICE', 'NNP'), ('DEPARTMENT', 'NNP')], ...]"
]
}
],
"prompt_number": 8
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"#Tagged words!\n",
"readTagged.tagged_words()"
],
"language": "python",
"metadata": {},
"outputs": [
{
"metadata": {},
"output_type": "pyout",
"prompt_number": 9,
"text": [
"[('.', None), ('OWNERS', 'NNS'), ('ARE', 'VBP'), ...]"
]
}
],
"prompt_number": 9
},
{
"cell_type": "code",
"collapsed": false,
"input": [],
"language": "python",
"metadata": {},
"outputs": []
}
],
"metadata": {}
}
]
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment