Created
September 24, 2013 15:05
-
-
Save mmattax/6686137 to your computer and use it in GitHub Desktop.
Parse address components [roughly] out of a string.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/python
def _fill_city_state_zip(parts, locality):
    """Fill the 'city', 'state' and 'zip' entries of *parts* from *locality*.

    *locality* is the trailing portion of an address, e.g.
    "San Francisco, CA 94105" or "San Francisco CA 94105". *parts* is
    updated in place; entries that cannot be determined are left untouched.
    """
    if ',' in locality:
        # Format like "San Francisco, CA 94105": the comma ends the city.
        city, remainder = locality.split(',', 1)
        parts['city'] = city.strip()
        tokens = remainder.split()
        if tokens:
            # Assume the zip/postal code is the last token and treat
            # everything before it as the state.
            parts['zip'] = tokens.pop()
            parts['state'] = ' '.join(tokens)
        return

    # Format like "San Francisco CA 94105": no comma to lean on.
    tokens = locality.split()
    if len(tokens) >= 3:
        # The last two tokens are zip and state; the rest is the city.
        parts['zip'] = tokens.pop()
        parts['state'] = tokens.pop()
        parts['city'] = ' '.join(tokens)
    else:
        # Fewer than three tokens: assign them in order to city, state, zip.
        for key, token in zip(('city', 'state', 'zip'), tokens):
            parts[key] = token


def parse_address_str(address):
    """Parse address components [roughly] out of a string.

    Accepted layouts:

    * one line, with a comma between the street and the rest:
      "123 Main Street, San Francisco, CA 94105"
    * two lines: street, then city/state/zip
    * three or more lines: street, then secondary address line(s), then
      city/state/zip on the last line. Multiple secondary lines are joined
      with ", ".

    Blank lines and surrounding whitespace are ignored.

    Returns a dict with the keys 'address', 'address2', 'city', 'state' and
    'zip'. Components that could not be determined are empty strings. A
    single line without a comma cannot be split reliably, so every value is
    left empty in that case.
    """
    parts = {
        'address': '',
        'address2': '',
        'city': '',
        'state': '',
        'zip': '',
    }

    # Split into lines, trimming whitespace (this also removes a trailing
    # '\r' from Windows line endings) and dropping blank lines.
    lines = [line.strip() for line in address.strip().split('\n')]
    lines = [line for line in lines if line]
    if not lines:
        return parts

    if len(lines) == 1:
        # Look for a comma between the street and city.
        street, comma, locality = lines[0].partition(',')
        if not comma:
            # We could do some really hacky parsing, but maybe it's better
            # to simply return an empty value.
            return parts
        parts['address'] = street.strip()
    elif len(lines) == 2:
        parts['address'], locality = lines
    else:
        parts['address'] = lines[0]
        parts['address2'] = ', '.join(lines[1:-1])
        locality = lines[-1]

    # At this point we assume locality contains the city, state, and zip.
    _fill_city_state_zip(parts, locality.strip())
    return parts
if __name__ == '__main__':
    import pprint

    # Sample inputs: one-line and two-line addresses, each with and without
    # a comma between the city and the state.
    samples = [
        '123 Main Street, San Francisco CA 94105',
        '123 Main Street, San Francisco, CA 94105',
        """\
123 Main Street
San Francisco CA 94105
""",
        """\
123 Main Street
San Francisco, CA 94105
""",
    ]

    # Parse every sample and pretty-print the resulting list of dicts.
    parsed = []
    for sample in samples:
        parsed.append(parse_address_str(sample))
    pprint.pprint(parsed, indent=4)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment