liquidgenius/usaddress_adapter.py

## usaddress_adapter.py
"""
* requires usaddress https://github.com/datamade/usaddress
* check out the website! https://parserator.datamade.us/usaddress

The usaddress package is pretty great for normalizing inconsistent address data,
especially if you have a lot to process and can't rely on using a geocoding API.
The results are really granular, probably more so than you'll need, and definitely
more granular than most CRM systems, if integrating these addresses is your goal.

This is just a simple wrapper around the usaddress.tag() function that I feel will
fit most use cases. The usaddress.parse() function works as well, but tag has some
nice things I like such as combining certain address components,
stripping out extra commas, etc.

For example:
usaddress.tag('123 Fake Street, ste 123, san diego, CA 12345, USA')
>> (OrderedDict([('AddressNumber', u'123'),
>>               ('StreetName', u'Fake'),
>>               ('StreetNamePostType', u'Street'),
>>               ('OccupancyType', u'ste'),
>>               ('OccupancyIdentifier', u'123'),
>>               ('PlaceName', u'san diego'),
>>               ('StateName', u'CA'),
>>               ('ZipCode', u'12345'),
>>               ('CountryName', u'USA')]),
>>  'Street Address')

"""
import collections
import usaddress  # This is based on 0.5.4

# Record type returned by parse_address(); that function fills each field
# with a space-joined string, or None when nothing matched.
Address = collections.namedtuple(
    "Address",
    ["street", "city", "state", "zip_code", "country"],
)

def parse_address(address_string):
    """Parse a free-form US address string into an ``Address`` tuple.

    Thin wrapper around ``usaddress.tag()``: the tagged components are
    grouped by label prefix and each group is joined with single spaces.

    Args:
        address_string: The raw address text to parse.

    Returns:
        An ``Address`` namedtuple. Each field holds the space-joined
        components whose labels start with one of that field's prefixes,
        in the order ``usaddress.tag()`` produced them, or ``None`` when
        no component matched.

    Note:
        Exceptions raised by ``usaddress.tag()`` are not caught here and
        propagate to the caller (presumably ``usaddress.RepeatedLabelError``
        for input it cannot tag unambiguously -- confirm against the
        installed usaddress version).
    """
    # The second element of the result is the detected address type
    # (e.g. 'Street Address'); it is not used here.
    tags, _ = usaddress.tag(address_string)

    def _combine(*label_prefixes):
        # .items() rather than .iteritems(): the latter exists only on
        # Python 2 and raises AttributeError on Python 3. tags is an
        # OrderedDict, so components keep their original order.
        # str.startswith accepts a tuple, so one call tests every prefix.
        components = [
            component
            for label, component in tags.items()
            if label.startswith(label_prefixes)
        ]
        # An empty join yields '', which is normalized to None.
        return ' '.join(components) or None

    return Address(
        # Note: If you prefer street_1 and street_2, the "Occupancy" labels
        # are generally what falls under street_2
        street=_combine('AddressNumber', 'StreetName', 'Occupancy'),
        city=_combine('PlaceName'),
        state=_combine('StateName'),
        zip_code=_combine('ZipCode'),
        country=_combine('CountryName'),
    )

# Example usage. NOTE: this call sits at module level, so it runs every time
# the module is imported or executed; its return value is discarded.
parse_address('123 Fake Street   ,   ste 123,,    San Diego, CA 12345-1234, USA')
# Result as recorded by the author (the u'' prefixes are Python 2 unicode reprs):
# >> Address(street=u'123 Fake Street ste 123', city=u'San Diego',
# >>         state=u'CA', zip_code=u'12345-1234', country=u'USA')
	"""
	* requires usaddress https://github.com/datamade/usaddress
	* check out the website! https://parserator.datamade.us/usaddress

	The usaddress package is pretty great for normalizing inconsistent address data,
	especially if you have a lot to process and can't rely on using a geocoding API.
	The results are really granular, probably more so than you'll need, and definitely
	more granular than most CRM systems, if integrating these addresses is your goal.

	This is just a simple wrapper around the usaddress.tag() function that I feel will
	fit most use cases. The usaddress.parse() function works as well, but tag has some
	nice things I like such as combining certain address components,
	stripping out extra commas, etc.

	For example:
	usaddress.tag('123 Fake Street, ste 123, san diego, CA 12345, USA')
	>> (OrderedDict([('AddressNumber', u'123'),
	>> ('StreetName', u'Fake'),
	>> ('StreetNamePostType', u'Street'),
	>> ('OccupancyType', u'ste'),
	>> ('OccupancyIdentifier', u'123'),
	>> ('PlaceName', u'san diego'),
	>> ('StateName', u'CA'),
	>> ('ZipCode', u'12345'),
	>> ('CountryName', u'USA')]),
	>> 'Street Address')

	"""
	import collections
	import usaddress # This is based on 0.5.4

	# Record type returned by parse_address(); that function fills each field
	# with a space-joined string, or None when nothing matched.
	Address = collections.namedtuple(
		"Address",
		["street", "city", "state", "zip_code", "country"],
	)

	def parse_address(address_string):
		"""Parse a free-form US address string into an ``Address`` tuple.

		Thin wrapper around ``usaddress.tag()``: the tagged components are
		grouped by label prefix and each group is joined with single spaces.

		Args:
			address_string: The raw address text to parse.

		Returns:
			An ``Address`` namedtuple. Each field holds the space-joined
			components whose labels start with one of that field's prefixes,
			in the order ``usaddress.tag()`` produced them, or ``None`` when
			no component matched.

		Note:
			Exceptions raised by ``usaddress.tag()`` are not caught here and
			propagate to the caller (presumably ``usaddress.RepeatedLabelError``
			for input it cannot tag unambiguously -- confirm against the
			installed usaddress version).
		"""
		# The second element of the result is the detected address type
		# (e.g. 'Street Address'); it is not used here.
		tags, _ = usaddress.tag(address_string)

		def _combine(*label_prefixes):
			# .items() rather than .iteritems(): the latter exists only on
			# Python 2 and raises AttributeError on Python 3. tags is an
			# OrderedDict, so components keep their original order.
			# str.startswith accepts a tuple, so one call tests every prefix.
			components = [
				component
				for label, component in tags.items()
				if label.startswith(label_prefixes)
			]
			# An empty join yields '', which is normalized to None.
			return ' '.join(components) or None

		return Address(
			# Note: If you prefer street_1 and street_2, the "Occupancy" labels
			# are generally what falls under street_2
			street=_combine('AddressNumber', 'StreetName', 'Occupancy'),
			city=_combine('PlaceName'),
			state=_combine('StateName'),
			zip_code=_combine('ZipCode'),
			country=_combine('CountryName'),
		)

	# Example usage. NOTE: this call is not inside any function, so it runs
	# whenever the surrounding code executes; its return value is discarded.
	parse_address('123 Fake Street , ste 123,, San Diego, CA 12345-1234, USA')
	# Result as recorded by the author (the u'' prefixes are Python 2 unicode reprs):
	# >> Address(street=u'123 Fake Street ste 123', city=u'San Diego',
	# >> state=u'CA', zip_code=u'12345-1234', country=u'USA')