@tmacam
Last active May 30, 2023 08:11
String normalization in Python: HTML stripping and HTML entity resolution.
#!/usr/bin/python
# vim:ts=4:sts=4:sw=4:et:wrap:ai:fileencoding=utf-8:

"""A collection of string normalization routines.

You are probably looking for normalize_string, which does an aggressive (but
arguably sound) string normalization process.
"""

from HTMLParser import HTMLParser
import re
import unicodedata

ORD_A_MIN = ord(u'a')
ORD_Z_MIN = ord(u'z')
ORD_0 = ord(u'0')
ORD_9 = ord(u'9')


def try_redecode_utf8(s):
    r"""Try redecoding utf-8 data inside a (faux-)unicode string.

    >>> try_redecode_utf8(u'T\xc3\xaanis e Esporte')
    u'T\xeanis e Esporte'
    """
    keep_going = True
    redecoded = s
    # Keep redecoding until redecoding fails or there is no difference in
    # the output.
    while keep_going:
        try:
            if isinstance(s, unicode):
                redecoded = s.encode('latin1').decode('utf-8')
            elif isinstance(s, str):
                redecoded = s.decode('utf-8')
            keep_going = (s != redecoded)
            s = redecoded
        except (UnicodeDecodeError, UnicodeEncodeError):
            # The data is not doubly-encoded; keep the last good redecoding.
            keep_going = False
    return redecoded


class HTMLStripper(HTMLParser):
    "Removes tags and keeps HTML entities intact."

    def __init__(self):
        # HTMLParser.__init__ (in python 2) only calls reset(), so this is
        # equivalent to calling the parent constructor.
        self.reset()
        self.fed = []

    def handle_starttag(self, tag, attrs):
        # We took the decision that all/any tag is a word-splitter and thus
        # is converted to spaces.
        self.fed.append(' ')

    def handle_data(self, d):
        self.fed.append(d)

    def handle_charref(self, number):
        self.fed.append('&#%s;' % number)

    def handle_entityref(self, name):
        self.fed.append('&%s;' % name)

    def get_data(self):
        return u''.join(self.fed)


def isPlainASCIIAlphaNum(c):
    o = ord(c)
    return (ORD_A_MIN <= o <= ORD_Z_MIN) or (ORD_0 <= o <= ORD_9)


def strip_html_and_convert_entities(html):
    # Previously we used the following code, which depends on BeautifulSoup:
    #
    #   soup = BeautifulSoup(html, convertEntities=BeautifulSoup.HTML_ENTITIES)
    #   return u' '.join(soup.findAll(text=True))
    #
    # But it does not handle numeric character entities correctly. Our new
    # approach does not depend on BeautifulSoup and uses HTMLParser, which is
    # part of python 2.6's std. library. So it is a double-win. :-)
    #
    # http://stackoverflow.com/questions/753052/strip-html-from-strings-in-python
    # http://stackoverflow.com/questions/2087370/decode-html-entities-in-python-string
    parser = HTMLStripper()
    parser.feed(html)
    # HTMLParser breaks if parsing ends/EOF on a broken single-letter entity
    # such as 'at&t'. Feeding an extra space fixes this.
    parser.feed(' ')
    parser.close()
    return parser.unescape(parser.get_data())
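
# For illustration, a sketch of what strip_html_and_convert_entities returns:
# tags become spaces and entities are resolved, but whitespace is not
# collapsed yet at this stage.
#
#   >>> strip_html_and_convert_entities(u'<a>&Ccedil;ine&nbsp;e<br>Foto')
#   u' \xc7ine\xa0e Foto '
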
def normalize_case(s):
    return s.lower()


def normalize_diacritics(input_str):
    # References:
    # http://stackoverflow.com/questions/517923/what-is-the-best-way-to-remove-accents-in-a-python-unicode-string
    # http://stackoverflow.com/questions/9042515/normalizing-unicode-text-to-filenames-etc-in-python
    nfkd_form = unicodedata.normalize('NFKD', unicode(input_str))
    return u"".join([c for c in nfkd_form if not unicodedata.combining(c)])


def normalize_to_plain_ascii(s):
    only_ascii = s.encode('ASCII', 'replace')  # unencodable chars become '?'
    return unicode(only_ascii)


def normalize_to_alphanum_and_spaces(s):
    return u"".join(i if isPlainASCIIAlphaNum(i) else ' ' for i in s)


# def normalize_diacritics_old(s):
#     """Converts to lowercase, normalizes diacritics and
#     converts non-alphanumeric chars into space.
#     """
#     s = s.lower()
#     s = unicodedata.normalize('NFKD', s)
#     # Filter ascii letters and numbers, discard everything else
#     filtered = []
#     for c in s:
#         if isPlainASCIIAlphaNum(c):
#             filtered.append(c)
#         elif unicodedata.category(c) == 'Mn':
#             continue
#         else:
#             filtered.append(u' ')
#     return u' '.join(''.join(filtered).split())


def normalize_prepositions(s):
    """Replaces common prepositions with spaces."""
    prepositions = ['e', 'and', 'de', 'do', 'da']
    for prep in prepositions:
        pattern = r'\b' + prep + r'\b'
        s = re.sub(pattern, " ", s)
    return s


def normalize_whitespace(s):
    s = re.sub(r'\s+', ' ', s)
    return s.strip()


def normalize_string(s, fix_utf8=False):
    if fix_utf8:
        s = try_redecode_utf8(s)
    s = strip_html_and_convert_entities(s)
    s = normalize_case(s)
    s = normalize_diacritics(s)
    s = normalize_to_plain_ascii(s)
    s = normalize_to_alphanum_and_spaces(s)
    #s = normalize_prepositions(s)
    s = normalize_whitespace(s)
    # We don't need to re-normalize to a known unicode form (say, NFC) since
    # we only have plain ASCII, alpha-numeric content at this point. There is
    # no composed nor decomposed unicode content left in `s`.
    return s


def main():
    sample = [u"Cine e foto",
              u"Cine & foto",
              u"Cine&Foto",  # BeautifulSoup breaks for this one.
              u"Cine+foto",
              u"Cíñe_e.foto",
              u"<a>&Ccedil;ine&nbsp;e<br>Foto",
              u'Cine\u65e5\u672c\u8a9eFoto',
              u'Carrinhos e Ve&iacute;culos',
              u'<a href="#">Cine <em>(&eacute; f&#x00f3;to&not; \u0394&#x03b7;&#956;&#x03CE;)</em></a>',
              u'Soul e R&B',  # we used to break on this one.
              u'T\xc3\xaanis e Esporte',
              ]
    from collections import defaultdict
    categories = defaultdict(list)
    for i in sample:
        n = normalize_string(i, fix_utf8=True)
        categories[n].append(i)
    for k, v in categories.items():
        print k, v
    return categories


if __name__ == '__main__':
    main()
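
For quick reference, a minimal usage sketch (the module name normalize and
the expected outputs are inferred from the pipeline above, not part of the
gist; note that normalize_prepositions is disabled here, so "e" survives):

from normalize import normalize_string

print normalize_string(u'<a>&Ccedil;ine&nbsp;e<br>Foto')          # cine e foto
print normalize_string(u'T\xc3\xaanis e Esporte', fix_utf8=True)  # tenis e esporte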
@zetechmoy

Pretty cool code. I tested it and it works well, but since it was last edited 8 years ago it needed an update, so I made it compatible with Python 3 (tested with Python 3.8.10). I also added normalization for escape characters and for French prepositions (I'm French, so I needed it 😛), and removed some future-deprecation warnings.

Hope it helps someone one day. Excited to see what it will look like in 8 years! ✌️

#!/usr/bin/python
# vim:ts=4:sts=4:sw=4:et:wrap:ai:fileencoding=utf-8:

"""A collection of string normalization routines.

You are probably looking for normalize_string, which does an aggressive (but
arguably sound) string normalization process.
"""


import re, sys, html
import unicodedata

# https://stackoverflow.com/questions/38697037/how-to-convert-python-2-unicode-function-into-correct-python-3-x-syntax
if sys.version_info.major == 3:
    from html.parser import HTMLParser

    unicode = str
else:
    from HTMLParser import HTMLParser

ORD_A_MIN = ord("a")
ORD_Z_MIN = ord("z")
ORD_0 = ord("0")
ORD_9 = ord("9")


def try_redecode_utf8(s):
    """Try redecoding utf-8 data inside a (faux-)unicode string.

    >>> try_redecode_utf8(u'T\xc3\xaanis e Esporte')
    'T\xeanis e Esporte'
    """
    keep_going = True
    redecoded = s
    # Keep redecoding until redecoding fails or there is no difference in output
    while keep_going:
        try:
            if isinstance(s, unicode):
                redecoded = s.encode("latin1").decode("utf-8")
            elif isinstance(s, str):
                redecoded = s.decode("utf-8")
            keep_going = s != redecoded
            s = redecoded
        except (UnicodeDecodeError, UnicodeEncodeError):
            keep_going = False
    return redecoded


class HTMLStripper(HTMLParser):
    "Removes tags and keeps HTML entities intact."

    def __init__(self):
        # convert_charrefs=False preserves the Python 2 behavior: entities
        # reach handle_charref/handle_entityref intact and are unescaped
        # only once, at the end of strip_html_and_convert_entities.
        super().__init__(convert_charrefs=False)
        self.fed = []

    def handle_starttag(self, tag, attrs):
        # We took the decision that all/any tag is a word-splitter and thus
        # is converted to spaces.
        self.fed.append(" ")

    def handle_data(self, d):
        self.fed.append(d)

    def handle_charref(self, number):
        self.fed.append("&#%s;" % number)

    def handle_entityref(self, name):
        self.fed.append("&%s;" % name)

    def get_data(self):
        return "".join(self.fed)


def is_plain_ascii_alpha_num(c):
    o = ord(c)
    return (ORD_A_MIN <= o <= ORD_Z_MIN) or (ORD_0 <= o <= ORD_9)


def strip_html_and_convert_entities(html_):
    # Previously we used the following code, which depends on BeautifulSoup:
    #
    #   soup = BeautifulSoup(html, convertEntities=BeautifulSoup.HTML_ENTITIES)
    #   return u' '.join(soup.findAll(text=True))
    #
    # But it does not handle numeric character entities correctly. Our new
    # approach does not depend on BeautifulSoup and uses HTMLParser, which is
    # part of the standard library. So it is a double-win. :-)
    #
    # http://stackoverflow.com/questions/753052/strip-html-from-strings-in-python
    # http://stackoverflow.com/questions/2087370/decode-html-entities-in-python-string
    parser = HTMLStripper()
    parser.feed(html_)
    # HTMLParser breaks if parsing ends/EOF on a broken single-letter entity
    # such as 'at&t'. Feeding an extra space fixes this.
    parser.feed(" ")
    parser.close()
    return html.unescape(parser.get_data())


def normalize_case(s):
    return s.lower()


def normalize_diacritics(input_str):
    # References:
    # http://stackoverflow.com/questions/517923/what-is-the-best-way-to-remove-accents-in-a-python-unicode-string
    # http://stackoverflow.com/questions/9042515/normalizing-unicode-text-to-filenames-etc-in-python
    nfkd_form = unicodedata.normalize("NFKD", unicode(input_str))
    return "".join([c for c in nfkd_form if not unicodedata.combining(c)])


def normalize_to_plain_ascii(s):
    only_ascii = s.encode("ASCII", "replace")  # unencodable chars become '?'
    # encode() returns bytes in Python 3; decode back to str instead of
    # calling str() on the bytes (which would produce "b'...'").
    return only_ascii.decode("ASCII")


def normalize_to_alphanum_and_spaces(s):
    return "".join(i if is_plain_ascii_alpha_num(i) else " " for i in s)


# def normalize_diacritics_old(s):
#     """Converts to lowercase, normalizes diacritics and
#     converts non-alphanumeric chars into space.
#     """
#     s = s.lower()
#     s = unicodedata.normalize('NFKD', s)
#     # Filter ascii letters and numbers, discard everyone else
#     filtered = []
#     for c in s:
#         if isPlainASCIIAlphaNum(c):
#             filtered.append(c)
#         elif unicodedata.category(c) == 'Mn':
#             continue
#         else:
#             filtered.append(u' ')
#     return u' '.join(''.join(filtered).split())


def normalize_escape_characters(s):
    """Converts escape/control characters into spaces."""
    # Characters handled: \t \n \r \f \v \\ \"
    # http://stackoverflow.com/questions/241327/python-snippet-to-remove-certain-characters-from-a-string
    return re.sub(r'[\t\n\r\f\v\\"]', " ", s)


def normalize_prepositions(s):
    """Replaces common prepositions (and similar stop words) with spaces."""
    prepositions = ["e", "and", "de", "do", "da"]
    # Common French prepositions, determiners and pronouns.
    prepositions += [
        "et", "le", "la", "les", "un", "une", "des", "du", "au", "aux",
        "à", "en", "sur", "par", "pour", "dans", "avec", "sans", "sous",
        "vers", "chez", "se", "y", "il", "elle", "on", "nous", "vous",
        "ils", "elles", "ce", "cet", "cette", "ceux", "celui", "celle",
        "celles", "ces", "ça", "sa", "son", "ses", "leur", "leurs",
        "mon", "ma", "mes", "ton", "ta", "tes", "nos", "vos", "lui",
        "soi", "toi", "moi", "eux", "d'",
    ]

    for prep in prepositions:
        pattern = r"\b" + prep + r"\b"
        s = re.sub(pattern, " ", s)
    return s


def normalize_whitespace(s):
    s = re.sub(r"\s+", " ", s)
    return s.strip()


def normalize_string(s, fix_utf8=False):
    if fix_utf8:
        s = try_redecode_utf8(s)

    s = normalize_escape_characters(s)
    s = strip_html_and_convert_entities(s)
    s = normalize_case(s)
    s = normalize_diacritics(s)
    s = normalize_to_plain_ascii(s)
    s = normalize_to_alphanum_and_spaces(s)
    s = normalize_prepositions(s)
    s = normalize_whitespace(s)
    # We don't need to re-normalize to a known unicode form (say, NFC) since
    # we only have plain ASCII, alpha-numeric content at this point. There is
    # no composed nor decomposed unicode content left in `s`.
    return s


def main():
    sample = [
        "Cine e foto",
        "Cine & foto",
        "Cine&Foto",  # BeautifulSoup breaks for this one.
        "Cine+foto",
        "Cíñe_e.foto",
        "<a>&Ccedil;ine&nbsp;e<br>Foto",
        "Cine\u65e5\u672c\u8a9eFoto",
        "Carrinhos e Ve&iacute;culos",
        '<a href="#">Cine <em>(&eacute; f&#x00f3;to&not; \u0394&#x03b7;&#956;&#x03CE;)</em></a>',
        "Soul e R&B",  # we used break on this one.
        "T\xc3\xaanis e Esporte",
    ]
    from collections import defaultdict

    categories = defaultdict(list)
    for i in sample:
        n = normalize_string(i, fix_utf8=True)
        categories[n].append(i)

    for k, v in categories.items():
        print(k, v)

    return categories


if __name__ == "__main__":
    main()
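
A quick sanity check for the Python 3 version (a sketch: the module name
normalize3 is hypothetical and the outputs are inferred from the pipeline;
note that with normalize_prepositions enabled, connecting words such as "e"
are removed):

from normalize3 import normalize_string

print(normalize_string("Carrinhos e Ve&iacute;culos"))  # carrinhos veiculos
print(normalize_string("Cine & foto"))                  # cine foto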
