# harlo/label_cleanser.py

## label_cleanser.py
from collections import namedtuple
from Levenshtein import ratio
from Levenshtein import distance
import re, csv, os

# CSV dialect settings handed to csv.reader() by the script section below.
delimiter, quotechar = ',', '|'
# Defined alongside the dialect settings; the csv.reader() calls visible in
# this file do not pass it.
quoting = csv.QUOTE_MINIMAL

# Each record pairs the raw text read from the complaints CSV with the label
# that correlateLabel() returned for it.
BrandInfo = namedtuple(typename='BrandInfo', field_names='model_text utqg_correlate')
TirelineInfo = namedtuple(typename='TirelineInfo', field_names='tireline_text utqg_correlate')

# When true, the helper functions print diagnostic lines while they work.
debug = True

def padSingleDigit(s):
	"""
		(I ENDED UP NOT USING THIS)

		Zero-pad a one-digit DOT id to two characters.

		Only a string made of exactly one digit is changed
		("7" -> "07").  Anything else -- ids that already have two
		characters, longer ids such as "123", the empty string,
		non-numeric text -- is returned untouched.

		The pattern is anchored at both ends on purpose: without the
		end anchor every string that merely started with a digit was
		collapsed to that digit, so "123" came back as "01".

		s -- the DOT id, as a string.

		Returns the padded id, or s itself when no padding applies.
		When the module-level `debug` flag is true, the input, the
		result and the list of regex matches are printed.
	"""
	# \Z (rather than $) so that a trailing newline does not count as a match.
	is_single_digit = re.findall(r'^(\d)\Z', s)

	if is_single_digit:
		padded = "0%s" % is_single_digit[0]
	else:
		padded = s

	if debug:
		# Single pre-formatted argument: prints the same on Python 2 and 3.
		print("%s %s %s" % (s, padded, is_single_digit))

	return padded

# Record kept for every label that passes the similarity checks.  Created once
# here instead of inside correlateLabel(), where namedtuple() would build a
# brand-new class on every call.
_CorrelateObject = namedtuple("CorrelateObject", "label lev_ratio lev_dist")

# Captures the final 3-4 characters of a string when they directly follow a
# whitespace character (e.g. the "gtx" in "firestone gtx").
_SUFFIX_RX = re.compile(r'.*\s(.{3,4})$')


def correlateLabel(s, label_set, scrutinize_suffixes=False, min_ratio=0.91, min_suffix_ratio=0.8):
	"""
		Pick the label in label_set that is most similar to s.

		Similarity is the Levenshtein ratio() of the lower-cased
		strings.  Labels scoring below min_ratio are ignored.

		in some cases, we have to scrutinize our data for
		false positives.  for instance, some manufacturers
		have multiple lines of a particular product
		(i.e. Firestone GTX and Firestone GTA) so we have to
		sniff out these suffixes and compare them: with
		scrutinize_suffixes set, when both s and a label end in a
		3-4 character suffix after whitespace, the label is also
		dropped if the ratio() of the two suffixes is below
		min_suffix_ratio.

		s                   -- the text to look up.
		label_set           -- iterable of candidate label strings.
		scrutinize_suffixes -- enable the suffix comparison above.
		min_ratio           -- minimum whole-string ratio (default 0.91).
		min_suffix_ratio    -- minimum suffix ratio (default 0.8).

		Returns the surviving label with the highest ratio (the first
		one met while iterating label_set if several tie), exactly as
		it appears in label_set.  Returns s unchanged when no label
		survives.  Diagnostic lines are printed when the module-level
		`debug` flag is true.
	"""
	# Loop invariants: lower-case s and extract its suffix only once.
	needle = s.lower()
	needle_suffix = _SUFFIX_RX.findall(needle) if scrutinize_suffixes else []

	correlated = []
	for label in label_set:
		candidate = label.lower()
		r = ratio(needle, candidate)
		if r < min_ratio:
			continue

		if len(needle_suffix) == 1:
			label_suffix = _SUFFIX_RX.findall(candidate)
			if len(label_suffix) == 1:
				suffix_ratio = ratio(needle_suffix[0], label_suffix[0])
				if debug:
					print("s_suffix: %s | l_suffix: %s (%.9f)" % (needle_suffix, label_suffix, suffix_ratio))

				if suffix_ratio < min_suffix_ratio:
					continue

		# The edit distance is only informational (it shows up in the debug
		# output), so it is computed for accepted labels only.
		correlated.append(_CorrelateObject(label, r, distance(needle, candidate)))

	if not correlated:
		return s

	max_val = max(c.lev_ratio for c in correlated)
	best_matches = [c for c in correlated if c.lev_ratio == max_val]

	# Exact matches (ratio 1) are not worth reporting.
	if debug and max_val != 1:
		print("%s | %s | %s " % (s.upper(), best_matches[0].label.lower(), best_matches))

	return best_matches[0].label

if __name__ == "__main__":
	# The input CSVs are expected in a "data" directory that sits next to
	# this script: <script path>/.. normalises to the script's directory.
	script_path = os.path.abspath(__file__)
	script_dir = os.path.abspath(os.path.join(script_path, os.pardir))
	data_dump = os.path.join(script_dir, 'data')

	# open up utqg csv.
	# we want the brand column (1) and tireline column (3), i.e. row
	# indexes 0 and 2.  both are lower-cased as they are read.
	#
	# NOTE: 'rU' is the Python 2 universal-newlines mode; it is deprecated
	# on Python 3 and rejected from 3.11 on.
	UTQGInfo = namedtuple('UTQGInfo', 'brand tireline')
	with open(os.path.join(data_dump, "UTQG_data.csv"), 'rU') as utqg_data:
		utqg_csv = csv.reader(utqg_data, delimiter=delimiter, quotechar=quotechar)
		utqg_info = [UTQGInfo(x[0].lower(), x[2].lower()) for x in utqg_csv]

	# create unique sets of brand and tireline observations.
	utqg_brands = list(set(x.brand for x in utqg_info))
	utqg_tirelines = list(set(x.tireline for x in utqg_info))

	with open(os.path.join(data_dump, "FLAT_CMPL_TIRE_excel.csv"), 'rU') as cmpl_data:
		cmpl_csv = csv.reader(cmpl_data, delimiter=delimiter, quotechar=quotechar)
		# csv.reader is a one-shot iterator.  Read the rows into a list once
		# so that BOTH passes below see every row; iterating the reader
		# twice would leave the second result empty.
		cmpl_rows = list(cmpl_csv)

	# Row index 3 holds the brand text and index 4 the tireline text.
	new_brand_info = [BrandInfo(x[3], correlateLabel(x[3], utqg_brands)) for x in cmpl_rows]
	new_tireline_info = [TirelineInfo(x[4], correlateLabel(x[4], utqg_tirelines, scrutinize_suffixes=True)) for x in cmpl_rows]