Skip to content

Instantly share code, notes, and snippets.

@mattalhonte
Created July 2, 2015 05:58
Show Gist options
  • Save mattalhonte/44aad806192751560fae to your computer and use it in GitHub Desktop.
Save mattalhonte/44aad806192751560fae to your computer and use it in GitHub Desktop.
Display the source blob
Display the rendered blob
Raw
{
"metadata": {
"name": "",
"signature": "sha256:d1d146b34ebc3e318e913d204a4804ee3afd0905f6c18726774751f08e17fe0f"
},
"nbformat": 3,
"nbformat_minor": 0,
"worksheets": [
{
"cells": [
{
"cell_type": "code",
"collapsed": false,
"input": [
"import re"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 5
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"#Our culprits\n",
"directionStreets = [r\"\"\"50 N 1 #4i, New York, NY\"\"\", \n",
" r\"\"\"241 West 97th #5N, New York, NY\"\"\", \n",
" r\"\"\"10 W 15th , New York, NY\"\"\", \n",
" r\"\"\"115 E. 23rd , New York, NY\"\"\"]\n",
"\n",
"streetWithNoSuffix = r\"\"\"343 4th #6A, Brooklyn, NY\"\"\""
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 6
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"#Preprocessing stuff\n",
"\n",
"#The Street Name Dictionary (SND) uses 'NORTH' rather than 'N', 'N.', or any other variation on that theme. Interestingly, this is\n",
"#inconsistent with city signage - no street sign says \"NORTH 1\"; it reads \"N 1\". Still, street signs are not machine-readable,\n",
"#while the DOT's SND is nice and digitized.\n",
"def expandDirections(myString):\n",
" myString = re.sub(r\"\"\"(?<=\\s)(N\\.?)(?=\\s)\"\"\", \"NORTH\", myString)\n",
" myString = re.sub(r\"\"\"(?<=\\s)(S\\.?)(?=\\s)\"\"\", \"SOUTH\", myString)\n",
" myString = re.sub(r\"\"\"(?<=\\s)(E\\.?)(?=\\s)\"\"\", \"EAST\", myString)\n",
" myString = re.sub(r\"\"\"(?<=\\s)(W\\.?)(?=\\s)\"\"\", \"WEST\", myString)\n",
" return myString\n",
"\n",
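"#The Street Name Dictionary doesn't put ordinal suffixes on digits (though spelled-out names keep their ordinal form, so we get \"FIFTH AVENUE\" and \n",
"#\"5 AVENUE\", but not \"5TH AVENUE\"). This is again inconsistent with most signage, but it is machine-readable.\n",
"#NOTE: in the pattern below the lookbehind (?<=\\d) binds only to 'st'; 'nd', 'rd' and 'th' are removed anywhere (e.g. 'Staten Island' -> 'Staten Isla').\n",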
"def removeNumberSuffixes(myString):\n",
" return re.sub(r\"\"\"(?<=\\d)st|nd|rd|th\"\"\", r'', myString)\n",
"\n",
"#No reason to have a space before a comma, in NYC addresses or otherwise\n",
"def noSpaceCommas(myString):\n",
" return myString.replace(r\"\"\" ,\"\"\", r\"\"\",\"\"\")"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 18
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"#Put all the pre-processing together\n",
"def noSufExpDir(myString):\n",
" myString = expandDirections(myString)\n",
" myString = removeNumberSuffixes(myString)\n",
" myString = noSpaceCommas(myString)\n",
" return myString\n",
"\n",
"def noSufExpDirList(myList):\n",
" return [noSufExpDir(a) for a in myList]"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 19
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"#Let's see what it does!\n",
"cleanedStreets = noSufExpDirList(directionStreets)\n",
"cleanedStreets"
],
"language": "python",
"metadata": {},
"outputs": [
{
"metadata": {},
"output_type": "pyout",
"prompt_number": 20,
"text": [
"['50 NORTH 1 #4i, New York, NY',\n",
" '241 West 97 #5N, New York, NY',\n",
" '10 WEST 15, New York, NY',\n",
" '115 EAST 23, New York, NY']"
]
}
],
"prompt_number": 20
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"#A regex for streets that have a direction and a number but not the word \"street\"\n",
"#Runs on addresses that have already been extracted; captures the direction, the street number, and the rest of the address\n",
"numberStreet = re.compile(r\"\"\"(?<=\\d\\s) #It'll be one space away from a digit\n",
" (?P<Direction>North|South|East|West) #It has a spelled-out direction (abbreviations are not matched here)\n",
" (?P<streetNumber>\\s\\d{1,3}) #Street number\n",
" (?P<borAndState>(.*)(?<!Street),\\s(New\\sYork|Brooklyn|Queens|Staten\\s+Island|Bronx),\\s(New\\sYork|NY))\"\"\", \n",
" re.IGNORECASE | re.X)"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 24
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"#Let's try and add the word \"street\" in a few places it's clearly implied\n",
"def addImpliedStreetToDirStreet(myString):\n",
" return re.sub(numberStreet, r\"\"\"\\1\\2 STREET\\3\"\"\", myString)\n",
"\n",
"def addImpliedStreetToDirList(myList):\n",
" return [addImpliedStreetToDirStreet(a) for a in myList]"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 25
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"#Aaand test 'er out\n",
"addImpliedStreetToDirList(cleanedStreets)"
],
"language": "python",
"metadata": {},
"outputs": [
{
"metadata": {},
"output_type": "pyout",
"prompt_number": 26,
"text": [
"['50 NORTH 1 STREET #4i, New York, NY',\n",
" '241 West 97 STREET #5N, New York, NY',\n",
" '10 WEST 15 STREET, New York, NY',\n",
" '115 EAST 23 STREET, New York, NY']"
]
}
],
"prompt_number": 26
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"#Works for four of our tests! This is definitely over-fitted. For instance, it needs to take into account places with a direction that \n",
"#aren't 'streets' - I know there's a South Avenue in Staten Island. But let's burn that bridge when we come to it!"
],
"language": "python",
"metadata": {},
"outputs": []
}
],
"metadata": {}
}
]
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment