Created
July 2, 2015 05:58
-
-
Save mattalhonte/44aad806192751560fae to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"metadata": { | |
"name": "", | |
"signature": "sha256:d1d146b34ebc3e318e913d204a4804ee3afd0905f6c18726774751f08e17fe0f" | |
}, | |
"nbformat": 3, | |
"nbformat_minor": 0, | |
"worksheets": [ | |
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"import re" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 5 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"#Our culprits\n", | |
"directionStreets = [\n", | |
"    \"50 N 1 #4i, New York, NY\",\n", | |
"    \"241 West 97th #5N, New York, NY\",\n", | |
"    \"10 W 15th , New York, NY\",\n", | |
"    \"115 E. 23rd , New York, NY\",\n", | |
"]\n", | |
"\n", | |
"# An address whose street has neither a direction nor the word 'street'\n", | |
"streetWithNoSuffix = \"343 4th #6A, Brooklyn, NY\"" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 6 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"#Preprocessing stuff\n", | |
"\n", | |
"#The Street Name Dictionary uses 'NORTH' rather than 'N' or 'N.' or any other variation on that theme. Interestingly, this is actually\n", | |
"#inconsistent with city signage - there is no streetsign that says \"NORTH 1\", it reads \"N 1\". Still, streetsigns are not machine-readable,\n", | |
"#while the DOT's SND is nice and digitized\n", | |
"def expandDirections(myString):\n", | |
"    \"\"\"Replace standalone N/S/E/W abbreviations with NORTH/SOUTH/EAST/WEST.\n", | |
"\n", | |
"    An abbreviation is one capital letter, optionally followed by a period,\n", | |
"    with whitespace directly before it and directly after it (after the period\n", | |
"    when there is one). The period is dropped; matching is case-sensitive.\n", | |
"    \"\"\"\n", | |
"    substitutions = (\n", | |
"        (r\"(?<=\\s)(N\\.?)(?=\\s)\", \"NORTH\"),\n", | |
"        (r\"(?<=\\s)(S\\.?)(?=\\s)\", \"SOUTH\"),\n", | |
"        (r\"(?<=\\s)(E\\.?)(?=\\s)\", \"EAST\"),\n", | |
"        (r\"(?<=\\s)(W\\.?)(?=\\s)\", \"WEST\"),\n", | |
"    )\n", | |
"    expanded = myString\n", | |
"    for abbreviation, fullWord in substitutions:\n", | |
"        expanded = re.sub(abbreviation, fullWord, expanded)\n", | |
"    return expanded\n", | |
"\n", | |
"#Street name dictionary doesn't give suffixes to digits (though it does give it to spelled-out words, so we get \"FIFTH AVENUE\" and \n", | |
"#\"5 AVENUE\", but not \"5TH AVENUE\"). This is again inconsistent with most signage, buuut machine-readable\n", | |
"def removeNumberSuffixes(myString):\n", | |
"    \"\"\"Strip ordinal suffixes (st, nd, rd, th) that directly follow a digit, e.g. '97th' -> '97'.\n", | |
"\n", | |
"    The suffixes are wrapped in one group so the digit lookbehind guards all of\n", | |
"    them. Without the group only 'st' is guarded, and 'nd', 'rd' and 'th' get\n", | |
"    deleted anywhere in the text ('Staten Island' -> 'Staten Isla').\n", | |
"    \"\"\"\n", | |
"    return re.sub(r\"(?<=\\d)(?:st|nd|rd|th)\", \"\", myString)\n", | |
"\n", | |
"#No reason to have a space before a comma, in NYC addresses or otherwise\n", | |
"def noSpaceCommas(myString):\n", | |
"    \"\"\"Return myString with every space-then-comma pair collapsed to a bare comma.\"\"\"\n", | |
"    pieces = myString.split(\" ,\")\n", | |
"    return \",\".join(pieces)" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 18 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"#Put all the pre-processing together\n", | |
"def noSufExpDir(myString):\n", | |
"    \"\"\"Run the whole preprocessing chain on a single address string.\n", | |
"\n", | |
"    Steps run in this order: expandDirections, removeNumberSuffixes, noSpaceCommas.\n", | |
"    \"\"\"\n", | |
"    return noSpaceCommas(removeNumberSuffixes(expandDirections(myString)))\n", | |
"\n", | |
"def noSufExpDirList(myList):\n", | |
"    \"\"\"Apply noSufExpDir to each item of myList and return the results as a list.\"\"\"\n", | |
"    return list(map(noSufExpDir, myList))" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 19 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"#Let's see what it does!\n", | |
"cleanedStreets = noSufExpDirList(directionStreets)\n", | |
"cleanedStreets" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"metadata": {}, | |
"output_type": "pyout", | |
"prompt_number": 20, | |
"text": [ | |
"['50 NORTH 1 #4i, New York, NY',\n", | |
" '241 West 97 #5N, New York, NY',\n", | |
" '10 WEST 15, New York, NY',\n", | |
" '115 EAST 23, New York, NY']" | |
] | |
} | |
], | |
"prompt_number": 20 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"#A regex for streets with a direction but not the word \"street\"\n", | |
"#Works on addresses that have already been extracted, then extracts the street name\n", | |
"numberStreet = re.compile(r\"\"\"(?<=\\d\\s) #It'll be one space away from a digit\n", | |
" (?P<Direction>North|South|East|West) #It has a direction (either written or abbreviated)\n", | |
" (?P<streetNumber>\\s\\d{1,3}) #Street number\n", | |
" (?P<borAndState>(.*)(?<!Street),\\s(New\\sYork|Brooklyn|Queens|Staten\\s+Island|Bronx),\\s(New\\sYork|NY))\"\"\", \n", | |
" re.IGNORECASE | re.X)" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 24 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"#Let's try and add the word \"street\" in a few places it's clearly implied\n", | |
"def addImpliedStreetToDirStreet(myString):\n", | |
"    \"\"\"Insert ' STREET' after the street number wherever numberStreet matches.\n", | |
"\n", | |
"    Groups 1-3 of numberStreet are the direction, the street number and the rest\n", | |
"    of the address through the state; each is written back unchanged.\n", | |
"    \"\"\"\n", | |
"    replacement = r\"\\1\\2 STREET\\3\"\n", | |
"    return numberStreet.sub(replacement, myString)\n", | |
"\n", | |
"def addImpliedStreetToDirList(myList):\n", | |
"    \"\"\"Apply addImpliedStreetToDirStreet to each item of myList and return a list.\"\"\"\n", | |
"    return list(map(addImpliedStreetToDirStreet, myList))" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 25 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"#Aaand test 'er out\n", | |
"addImpliedStreetToDirList(cleanedStreets)" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"metadata": {}, | |
"output_type": "pyout", | |
"prompt_number": 26, | |
"text": [ | |
"['50 NORTH 1 STREET #4i, New York, NY',\n", | |
" '241 West 97 STREET #5N, New York, NY',\n", | |
" '10 WEST 15 STREET, New York, NY',\n", | |
" '115 EAST 23 STREET, New York, NY']" | |
] | |
} | |
], | |
"prompt_number": 26 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"#Works for four of our tests! This is definitely over-fitted. For instance, needs to take into account places with a direction that \n", | |
"#aren't 'streets' - I know there's a South Avenue in Staten Island, for instance. But let's burn that bridge when we come to it!" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [] | |
} | |
], | |
"metadata": {} | |
} | |
] | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment