Created
April 4, 2018 22:15
-
-
Save tigerhawkvok/c33227cd5d95afc61dd92f6f244a55e1 to your computer and use it in GitHub Desktop.
Strip down a user-provided address to its essentials
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!python3
"""
@author Philip Kahn
@license MIT
"""
def getBarebonesAddress(dirtyAddress: str) -> str:
    """
    Reduce a free-form address to the form ``[Number] [Street] [Zip]``.

    The text before the first comma is treated as the street part, the
    trailing five-digit zip (a ZIP+4 suffix is dropped) is re-attached to it,
    and the result is run through a single regex that keeps the street part
    and the zip while discarding suite/apt/unit designators and any
    city/state words sitting between them.

    If the regex does not match (for example the input is only a zip code,
    or has no zip at all), the intermediate string is returned unchanged.

    :param dirtyAddress: the user-provided address string
    :return: the stripped-down address
    """
    # Imported locally: this file has no module-level import of `re`.
    import re

    flags = re.IGNORECASE | re.MULTILINE

    # Strip it and remove any duplication of zips ("... 96150 96150"),
    # keeping the last one
    dirtyAddress = re.sub(
        r"^(.*?)\d{5} (\d{5})$", r"\1 \2", dirtyAddress.strip(), flags=flags
    )

    # Split addresses of form "2435 Venice Dr E, South Lake Tahoe, CA 96150"
    # and take only the leading part
    leadingPart, comma, _ = dirtyAddress.partition(",")
    workingAddress = leadingPart.strip()
    if comma:
        # If the split truncated the zip code, add it back on. Look for an
        # actual zip (optionally ZIP+4) rather than blindly copying the last
        # five characters, which would re-attach "-1234" or city-name text.
        zipMatch = re.search(r"([0-9]{5})(?:-[0-9]{4})?$", dirtyAddress)
        if zipMatch and not workingAddress.endswith(zipMatch.group(1)):
            workingAddress += " " + zipMatch.group(1)

    #############################
    # Regex filter, one literal per step (adjacent literals are concatenated;
    # re.VERBOSE is NOT used, so spaces and '#' inside are literal):
    #
    # 1. Grab a numerical part, and any simple prefixes
    # 2. If it exists, grab a cardinal direction
    # 3. Grab the main part of the name (words that are not a suite marker
    #    and do not look like a two-letter state abbreviation)
    # 4. Obvious street suffixes, each with an optional cardinal suffix
    # 5. Check for suites, apts, etc and consume them so they can be removed
    # 6. Consume trailing character data, for example "Berkeley CA", before a zip
    # 7. Match on a whole zip, capture only the first five digits
    #############################
    addressPattern = (
        r"^([0-9\"\'a-z ,\.\/\\\-]+?"
        r"(?: +(?:n|e|w|s){1,2})?"
        r"(?: +(?!suite|#|suite#|suite #|ste|ste #|unit|unit #|apt|apt #)(?![a-z]{2}[ ,]+)[0-9\"\'a-z,\.\/\\\-]+?)*"
        r"(?: +(?:ave|avenue|dr|drive|pkwy|parkway|st|street|rd|road|hwy|highway|pk|park|way|pl|place|ln|lane|cir|circle|ct|court)\.?(?: +(?:n|e|w|s){1,2})?)*)"
        r"[ ,;]+?"
        r"(?:(?:;, )*(?:suite|#|suite#|suite #|ste|ste #|unit|unit #|apt|apt #)\.? *[\S]+)*"
        r"(?: *\d+ )*?(?:[ ,]*[a-z]{3,})*?[ ,]*(?:[ ,]+[a-z]{2}[ ,]+)?"
        r"([0-9]{5})(?:-[0-9]{4})?$"
    )
    # Group 1 is the number + street, group 2 the five-digit zip.
    bareAddress = re.sub(addressPattern, r"\1 \2", workingAddress, flags=flags)
    return bareAddress
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment