Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
Boston restaurants to canonical name and address
import re
import sys
import unicodedata
import pandas as pd
def clean_string(s):
    """Return a canonical, ASCII-only, lowercase form of *s*.

    The steps, in order:

    1. Byte strings are decoded as UTF-8 (undecodable bytes are dropped)
       so that they receive the same accent folding as text strings.
    2. The text is NFKD-normalized and encoded to ASCII with errors
       ignored, which turns accented letters into their base letter
       ("é" -> "e") and drops anything that has no ASCII form.
    3. Everything is lowercased.
    4. Each run of whitespace is collapsed to a single space.
    5. Every character other than ``a-z``, ``0-9`` and space is removed.

    Leading/trailing spaces are kept, and because punctuation is removed
    *after* whitespace is collapsed, ``"a - b"`` becomes ``"a  b"`` (two
    spaces).  That ordering is kept on purpose so keys stay identical to
    the ones this function has always produced.

    Args:
        s: A text string or a UTF-8 encoded byte string.

    Returns:
        The cleaned string.
    """
    if isinstance(s, bytes):
        s = s.decode("utf-8", "ignore")

    # Fold accents to plain ASCII; decode again so the regexes below
    # operate on text rather than bytes.
    s = unicodedata.normalize("NFKD", s).encode("ascii", "ignore").decode("ascii")

    # lowercase everything
    s = s.lower()
    # all whitespace to single space
    s = re.sub(r"\s+", " ", s)
    # all non alphanumeric removed
    return re.sub(r"[^a-z0-9 ]", "", s)
def load_boston_data(path_to_boston_data):
    """Load the Boston inspections CSV and add a canonical key column.

    The file is read with pandas, its first column is renamed to
    ``BusinessName``, and a ``name_and_address`` column is added.  That
    column is the cleaned (see ``clean_string``) concatenation of the
    ``BusinessName``, ``Address``, ``City``, ``State`` and ``Zip``
    columns, separated by single spaces, and is intended as a primary
    key for the restaurant.

    Args:
        path_to_boston_data: Path (or anything ``pandas.read_csv``
            accepts) of the inspections CSV.  It must contain the
            ``Address``, ``City``, ``State`` and ``Zip`` columns.

    Returns:
        The inspections ``DataFrame`` with the extra
        ``name_and_address`` column.
    """
    print("Loading saved data and cleaning inspections...")

    # Read Zip as text so numeric parsing cannot alter it (e.g. by
    # dropping leading zeros).
    inspections = pd.read_csv(path_to_boston_data, dtype={"Zip": str})

    # There is a stray unicode character at the start of the first
    # column's header, so that column is renamed unconditionally.
    inspections.columns = ["BusinessName"] + inspections.columns[1:].tolist()

    # Build "<name> <address> <city> <state> <zip>" row by row.
    key_columns = ("BusinessName", "Address", "City", "State", "Zip")
    combined = inspections[key_columns[0]].astype(str)
    for column in key_columns[1:]:
        combined = combined + " " + inspections[column].astype(str)

    # Series.map applies the cleaner eagerly and keeps the index; the
    # built-in map() would hand pandas a lazy iterator on Python 3.
    inspections["name_and_address"] = combined.map(clean_string)
    return inspections
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment