# sampathweb/canonical_restaurant_names.py

## canonical_restaurant_names.py
import re
import sys
import unicodedata

import pandas as pd


def clean_string(s):
    """Normalize a string into a lowercase, ASCII-only matching key.

    Text input is NFKD-decomposed and encoded to ASCII with unencodable
    characters dropped. The value is then lowercased, every run of
    whitespace is collapsed to a single space, and every character other
    than ``a-z``, ``0-9`` and space is removed.

    Runs on both Python 2 and Python 3; the result is always the native
    ``str`` type of the running interpreter.

    Args:
        s: The value to clean: ``str`` (also ``unicode`` on Python 2, or
            ``bytes`` on Python 3).

    Returns:
        The cleaned string.
    """
    # type(u"") is ``unicode`` on Python 2 and ``str`` on Python 3, so this
    # check does not depend on the Python-2-only ``unicode`` builtin.
    if isinstance(s, type(u"")):
        s = unicodedata.normalize('NFKD', s).encode('ascii', 'ignore')

    # On Python 3 the value is ``bytes`` at this point (freshly encoded, or
    # passed in as bytes); convert back to text so the str regex patterns
    # below apply. On Python 2 ``bytes`` is ``str`` and this is skipped.
    if not isinstance(s, str):
        s = s.decode('ascii', 'ignore')

    # lowercase everything
    s = s.lower()

    # all whitespace to single space
    s = re.sub(r"\s+", " ", s)

    # all non alphanumeric removed
    s = re.sub(r"[^a-z0-9 ]", "", s)

    return s


def load_boston_data(path_to_boston_data):
    """Load the Boston inspections CSV and add a cleaned lookup column.

    The first column is renamed to ``BusinessName`` and a
    ``name_and_address`` column is added, built by joining the business
    name, address, city, state and zip with single spaces and passing the
    result through ``clean_string``.

    Args:
        path_to_boston_data: Location of the CSV file, passed straight to
            ``pandas.read_csv``.

    Returns:
        The inspections ``DataFrame`` with the extra ``name_and_address``
        column.
    """
    # A parenthesized single argument prints the same text whether ``print``
    # is the Python 2 statement or the Python 3 function.
    print("Loading saved data and cleaning inspections...")

    # read CSV; Zip is forced to text (presumably so leading zeros are
    # kept -- confirm against the data)
    inspections = pd.read_csv(path_to_boston_data, dtype={"Zip": str})

    # there's a unicode character at the start of the first column we
    # need to remove, so the first header is replaced outright
    inspections.columns = ['BusinessName'] + inspections.columns[1:].tolist()

    # add name+address column for primary key
    combined = (inspections.BusinessName.astype(str) +
                " " + inspections.Address.astype(str) +
                " " + inspections.City.astype(str) +
                " " + inspections.State.astype(str) +
                " " + inspections.Zip.astype(str))

    # Build a real list: on Python 3 ``map`` returns a lazy iterator, which
    # is not valid column data.
    inspections["name_and_address"] = [clean_string(value) for value in combined]

    return inspections
	import re
	import sys
	import unicodedata

	import pandas as pd


	def clean_string(s):
	    """Normalize a string into a lowercase, ASCII-only matching key.

	    Text input is NFKD-decomposed and encoded to ASCII with unencodable
	    characters dropped. The value is then lowercased, every run of
	    whitespace is collapsed to a single space, and every character other
	    than ``a-z``, ``0-9`` and space is removed.

	    Runs on both Python 2 and Python 3; the result is always the native
	    ``str`` type of the running interpreter.

	    Args:
	        s: The value to clean: ``str`` (also ``unicode`` on Python 2, or
	            ``bytes`` on Python 3).

	    Returns:
	        The cleaned string.
	    """
	    # type(u"") is ``unicode`` on Python 2 and ``str`` on Python 3, so this
	    # check does not depend on the Python-2-only ``unicode`` builtin.
	    if isinstance(s, type(u"")):
	        s = unicodedata.normalize('NFKD', s).encode('ascii', 'ignore')

	    # On Python 3 the value is ``bytes`` at this point (freshly encoded, or
	    # passed in as bytes); convert back to text so the str regex patterns
	    # below apply. On Python 2 ``bytes`` is ``str`` and this is skipped.
	    if not isinstance(s, str):
	        s = s.decode('ascii', 'ignore')

	    # lowercase everything
	    s = s.lower()

	    # all whitespace to single space
	    s = re.sub(r"\s+", " ", s)

	    # all non alphanumeric removed
	    s = re.sub(r"[^a-z0-9 ]", "", s)

	    return s


	def load_boston_data(path_to_boston_data):
	    """Load the Boston inspections CSV and add a cleaned lookup column.

	    The first column is renamed to ``BusinessName`` and a
	    ``name_and_address`` column is added, built by joining the business
	    name, address, city, state and zip with single spaces and passing the
	    result through ``clean_string``.

	    Args:
	        path_to_boston_data: Location of the CSV file, passed straight to
	            ``pandas.read_csv``.

	    Returns:
	        The inspections ``DataFrame`` with the extra ``name_and_address``
	        column.
	    """
	    # A parenthesized single argument prints the same text whether ``print``
	    # is the Python 2 statement or the Python 3 function.
	    print("Loading saved data and cleaning inspections...")

	    # read CSV; Zip is forced to text (presumably so leading zeros are
	    # kept -- confirm against the data)
	    inspections = pd.read_csv(path_to_boston_data, dtype={"Zip": str})

	    # there's a unicode character at the start of the first column we
	    # need to remove, so the first header is replaced outright
	    inspections.columns = ['BusinessName'] + inspections.columns[1:].tolist()

	    # add name+address column for primary key
	    combined = (inspections.BusinessName.astype(str) +
	                " " + inspections.Address.astype(str) +
	                " " + inspections.City.astype(str) +
	                " " + inspections.State.astype(str) +
	                " " + inspections.Zip.astype(str))

	    # Build a real list: on Python 3 ``map`` returns a lazy iterator, which
	    # is not valid column data.
	    inspections["name_and_address"] = [clean_string(value) for value in combined]

	    return inspections