@tmacam
Last active May 30, 2023 08:11
String normalization in Python: HTML stripping and HTML entity resolution.
#!/usr/bin/python
# vim:ts=4:sts=4:sw=4:et:wrap:ai:fileencoding=utf-8:

"""A collection of string normalization routines.

You are probably looking for normalize_string, which does an aggressive (but
arguably sound) string normalization process.
"""

from HTMLParser import HTMLParser
import re
import unicodedata

ORD_A_MIN = ord(u'a')
ORD_Z_MIN = ord(u'z')
ORD_0 = ord(u'0')
ORD_9 = ord(u'9')


def try_redecode_utf8(s):
    r"""Try redecoding utf-8 data inside a (faux-)unicode string.

    >>> try_redecode_utf8(u'T\xc3\xaanis e Esporte')
    u'T\xeanis e Esporte'
    """
    keep_going = True
    redecoded = s
    # Keep redecoding until redecoding fails or there is no difference in
    # the output.
    while keep_going:
        try:
            if isinstance(s, unicode):
                redecoded = s.encode('latin1').decode('utf-8')
            elif isinstance(s, str):
                redecoded = s.decode('utf-8')
            keep_going = (s != redecoded)
            s = redecoded
        except (UnicodeDecodeError, UnicodeEncodeError):
            # The data is not doubly-encoded; keep the last good redecoding.
            keep_going = False
    return redecoded


class HTMLStripper(HTMLParser):
    "Removes tags and keeps HTML entities intact."

    def __init__(self):
        # HTMLParser.__init__ (in python 2) only calls reset(), so this is
        # equivalent to calling the parent constructor.
        self.reset()
        self.fed = []

    def handle_starttag(self, tag, attrs):
        # We took the decision that all/any tag is a word-splitter and thus
        # is converted to spaces.
        self.fed.append(' ')

    def handle_data(self, d):
        self.fed.append(d)

    def handle_charref(self, number):
        self.fed.append('&#%s;' % number)

    def handle_entityref(self, name):
        self.fed.append('&%s;' % name)

    def get_data(self):
        return u''.join(self.fed)


def isPlainASCIIAlphaNum(c):
    o = ord(c)
    return (ORD_A_MIN <= o <= ORD_Z_MIN) or (ORD_0 <= o <= ORD_9)


def strip_html_and_convert_entities(html):
    # Previously we used the following code, which depends on BeautifulSoup:
    #
    #   soup = BeautifulSoup(html, convertEntities=BeautifulSoup.HTML_ENTITIES)
    #   return u' '.join(soup.findAll(text=True))
    #
    # But it does not handle numeric character entities correctly. Our new
    # approach does not depend on BeautifulSoup and uses HTMLParser, which is
    # part of python 2.6's std. library. So it is a double-win. :-)
    #
    # http://stackoverflow.com/questions/753052/strip-html-from-strings-in-python
    # http://stackoverflow.com/questions/2087370/decode-html-entities-in-python-string
    parser = HTMLStripper()
    parser.feed(html)
    # HTMLParser breaks if parsing ends/EOF on a broken single-letter entity
    # such as 'at&t'. Feeding an extra space fixes this.
    parser.feed(' ')
    parser.close()
    return parser.unescape(parser.get_data())
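
# For illustration, a sketch of what strip_html_and_convert_entities returns:
# tags become spaces and entities are resolved, but whitespace is not
# collapsed yet at this stage.
#
#   >>> strip_html_and_convert_entities(u'<a>&Ccedil;ine&nbsp;e<br>Foto')
#   u' \xc7ine\xa0e Foto '
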
def normalize_case(s):
    return s.lower()


def normalize_diacritics(input_str):
    # References:
    # http://stackoverflow.com/questions/517923/what-is-the-best-way-to-remove-accents-in-a-python-unicode-string
    # http://stackoverflow.com/questions/9042515/normalizing-unicode-text-to-filenames-etc-in-python
    nfkd_form = unicodedata.normalize('NFKD', unicode(input_str))
    return u"".join([c for c in nfkd_form if not unicodedata.combining(c)])


def normalize_to_plain_ascii(s):
    only_ascii = s.encode('ASCII', 'replace')  # unencodable chars become '?'
    return unicode(only_ascii)


def normalize_to_alphanum_and_spaces(s):
    return u"".join(i if isPlainASCIIAlphaNum(i) else ' ' for i in s)


# def normalize_diacritics_old(s):
#     """Converts to lowercase, normalizes diacritics and
#     converts non-alphanumeric chars into space.
#     """
#     s = s.lower()
#     s = unicodedata.normalize('NFKD', s)
#     # Filter ascii letters and numbers, discard everything else
#     filtered = []
#     for c in s:
#         if isPlainASCIIAlphaNum(c):
#             filtered.append(c)
#         elif unicodedata.category(c) == 'Mn':
#             continue
#         else:
#             filtered.append(u' ')
#     return u' '.join(''.join(filtered).split())


def normalize_prepositions(s):
    """Replaces common prepositions with spaces."""
    prepositions = ['e', 'and', 'de', 'do', 'da']
    for prep in prepositions:
        pattern = r'\b' + prep + r'\b'
        s = re.sub(pattern, " ", s)
    return s


def normalize_whitespace(s):
    s = re.sub(r'\s+', ' ', s)
    return s.strip()


def normalize_string(s, fix_utf8=False):
    if fix_utf8:
        s = try_redecode_utf8(s)
    s = strip_html_and_convert_entities(s)
    s = normalize_case(s)
    s = normalize_diacritics(s)
    s = normalize_to_plain_ascii(s)
    s = normalize_to_alphanum_and_spaces(s)
    #s = normalize_prepositions(s)
    s = normalize_whitespace(s)
    # We don't need to re-normalize to a known unicode form (say, NFC) since
    # we only have plain ASCII, alpha-numeric content at this point. There is
    # no composed nor decomposed unicode content left in `s`.
    return s


def main():
    sample = [u"Cine e foto",
              u"Cine & foto",
              u"Cine&Foto",  # BeautifulSoup breaks for this one.
              u"Cine+foto",
              u"Cíñe_e.foto",
              u"<a>&Ccedil;ine&nbsp;e<br>Foto",
              u'Cine\u65e5\u672c\u8a9eFoto',
              u'Carrinhos e Ve&iacute;culos',
              u'<a href="#">Cine <em>(&eacute; f&#x00f3;to&not; \u0394&#x03b7;&#956;&#x03CE;)</em></a>',
              u'Soul e R&B',  # we used to break on this one.
              u'T\xc3\xaanis e Esporte',
              ]
    from collections import defaultdict
    categories = defaultdict(list)
    for i in sample:
        n = normalize_string(i, fix_utf8=True)
        categories[n].append(i)
    for k, v in categories.items():
        print k, v
    return categories


if __name__ == '__main__':
    main()
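
For quick reference, a minimal usage sketch (the module name normalize and
the expected outputs are inferred from the pipeline above, not part of the
gist; note that normalize_prepositions is disabled here, so "e" survives):

from normalize import normalize_string

print normalize_string(u'<a>&Ccedil;ine&nbsp;e<br>Foto')          # cine e foto
print normalize_string(u'T\xc3\xaanis e Esporte', fix_utf8=True)  # tenis e esporte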
@zetechmoy

Pretty cool code. I tested it and it works well, but since it was last edited 8 years ago it needed an update, so I made it compatible with Python 3 (tested with Python 3.8.10). I also added normalization for escape characters and for French prepositions (I'm French, so I needed it 😛), and removed some future-deprecation warnings.

Hope it helps someone one day. Excited to see what it will look like in 8 years! ✌️

#!/usr/bin/python
# vim:ts=4:sts=4:sw=4:et:wrap:ai:fileencoding=utf-8:

"""A collection of string normalization routines.

You are probably looking for normalize_string, which does an aggressive (but
arguably sound) string normalization process.
"""


import re, sys, html
import unicodedata

# https://stackoverflow.com/questions/38697037/how-to-convert-python-2-unicode-function-into-correct-python-3-x-syntax
if sys.version_info.major == 3:
    from html.parser import HTMLParser

    unicode = str
else:
    from HTMLParser import HTMLParser

ORD_A_MIN = ord("a")
ORD_Z_MIN = ord("z")
ORD_0 = ord("0")
ORD_9 = ord("9")


def try_redecode_utf8(s):
    """Try redecoding utf-8 data inside a (faux-)unicode string.

    >>> try_redecode_utf8(u'T\xc3\xaanis e Esporte')
    'T\xeanis e Esporte'
    """
    keep_going = True
    redecoded = s
    # Keep redecoding until redecoding fails or there is no difference in output
    while keep_going:
        try:
            if isinstance(s, unicode):
                redecoded = s.encode("latin1").decode("utf-8")
            elif isinstance(s, str):
                redecoded = s.decode("utf-8")
            keep_going = s != redecoded
            s = redecoded
        except (UnicodeDecodeError, UnicodeEncodeError):
            keep_going = False
    return redecoded


class HTMLStripper(HTMLParser):
    "Removes tags and keeps HTML entities intact."

    def __init__(self):
        # convert_charrefs=False preserves the Python 2 behavior: entities
        # reach handle_charref/handle_entityref intact and are unescaped
        # only once, at the end of strip_html_and_convert_entities.
        super().__init__(convert_charrefs=False)
        self.fed = []

    def handle_starttag(self, tag, attrs):
        # We took the decision that all/any tag is a word-splitter and thus
        # is converted to spaces.
        self.fed.append(" ")

    def handle_data(self, d):
        self.fed.append(d)

    def handle_charref(self, number):
        self.fed.append("&#%s;" % number)

    def handle_entityref(self, name):
        self.fed.append("&%s;" % name)

    def get_data(self):
        return "".join(self.fed)


def is_plain_ascii_alpha_num(c):
    o = ord(c)
    return (ORD_A_MIN <= o <= ORD_Z_MIN) or (ORD_0 <= o <= ORD_9)


def strip_html_and_convert_entities(html_):
    # Previously we used the following code, which depends on BeautifulSoup:
    #
    #   soup = BeautifulSoup(html, convertEntities=BeautifulSoup.HTML_ENTITIES)
    #   return u' '.join(soup.findAll(text=True))
    #
    # But it does not handle numeric character entities correctly. Our new
    # approach does not depend on BeautifulSoup and uses HTMLParser, which is
    # part of the standard library. So it is a double-win. :-)
    #
    # http://stackoverflow.com/questions/753052/strip-html-from-strings-in-python
    # http://stackoverflow.com/questions/2087370/decode-html-entities-in-python-string
    parser = HTMLStripper()
    parser.feed(html_)
    # HTMLParser breaks if parsing ends/EOF on a broken single-letter entity
    # such as 'at&t'. Feeding an extra space fixes this.
    parser.feed(" ")
    parser.close()
    return html.unescape(parser.get_data())


def normalize_case(s):
    return s.lower()


def normalize_diacritics(input_str):
    # References:
    # http://stackoverflow.com/questions/517923/what-is-the-best-way-to-remove-accents-in-a-python-unicode-string
    # http://stackoverflow.com/questions/9042515/normalizing-unicode-text-to-filenames-etc-in-python
    nfkd_form = unicodedata.normalize("NFKD", unicode(input_str))
    return "".join([c for c in nfkd_form if not unicodedata.combining(c)])


def normalize_to_plain_ascii(s):
    only_ascii = s.encode("ASCII", "replace")  # unencodable chars become '?'
    # encode() returns bytes in Python 3; decode back to str instead of
    # calling str() on the bytes (which would produce "b'...'").
    return only_ascii.decode("ASCII")


def normalize_to_alphanum_and_spaces(s):
    return "".join(i if is_plain_ascii_alpha_num(i) else " " for i in s)


# def normalize_diacritics_old(s):
#     """Converts to lowercase, normalizes diacritics and
#     converts non-alphanumeric chars into space.
#     """
#     s = s.lower()
#     s = unicodedata.normalize('NFKD', s)
#     # Filter ascii letters and numbers, discard everyone else
#     filtered = []
#     for c in s:
#         if isPlainASCIIAlphaNum(c):
#             filtered.append(c)
#         elif unicodedata.category(c) == 'Mn':
#             continue
#         else:
#             filtered.append(u' ')
#     return u' '.join(''.join(filtered).split())


def normalize_escape_characters(s):
    """Converts escape/control characters into spaces."""
    # Characters handled: \t \n \r \f \v \\ \"
    # http://stackoverflow.com/questions/241327/python-snippet-to-remove-certain-characters-from-a-string
    return re.sub(r'[\t\n\r\f\v\\"]', " ", s)


def normalize_prepositions(s):
    """Replaces common prepositions (and similar stop words) with spaces."""
    prepositions = ["e", "and", "de", "do", "da"]
    # Common French prepositions, determiners and pronouns.
    prepositions += [
        "et", "le", "la", "les", "un", "une", "des", "du", "au", "aux",
        "à", "en", "sur", "par", "pour", "dans", "avec", "sans", "sous",
        "vers", "chez", "se", "y", "il", "elle", "on", "nous", "vous",
        "ils", "elles", "ce", "cet", "cette", "ceux", "celui", "celle",
        "celles", "ces", "ça", "sa", "son", "ses", "leur", "leurs",
        "mon", "ma", "mes", "ton", "ta", "tes", "nos", "vos", "lui",
        "soi", "toi", "moi", "eux", "d'",
    ]

    for prep in prepositions:
        pattern = r"\b" + prep + r"\b"
        s = re.sub(pattern, " ", s)
    return s


def normalize_whitespace(s):
    s = re.sub(r"\s+", " ", s)
    return s.strip()


def normalize_string(s, fix_utf8=False):
    if fix_utf8:
        s = try_redecode_utf8(s)

    s = normalize_escape_characters(s)
    s = strip_html_and_convert_entities(s)
    s = normalize_case(s)
    s = normalize_diacritics(s)
    s = normalize_to_plain_ascii(s)
    s = normalize_to_alphanum_and_spaces(s)
    s = normalize_prepositions(s)
    s = normalize_whitespace(s)
    # We don't need to re-normalize to a known unicode form (say, NFC) since
    # we only have plain ASCII, alpha-numeric content at this point. There is
    # no composed nor decomposed unicode content left in `s`.
    return s


def main():
    sample = [
        "Cine e foto",
        "Cine & foto",
        "Cine&Foto",  # BeautifulSoup breaks for this one.
        "Cine+foto",
        "Cíñe_e.foto",
        "<a>&Ccedil;ine&nbsp;e<br>Foto",
        "Cine\u65e5\u672c\u8a9eFoto",
        "Carrinhos e Ve&iacute;culos",
        '<a href="#">Cine <em>(&eacute; f&#x00f3;to&not; \u0394&#x03b7;&#956;&#x03CE;)</em></a>',
        "Soul e R&B",  # we used break on this one.
        "T\xc3\xaanis e Esporte",
    ]
    from collections import defaultdict

    categories = defaultdict(list)
    for i in sample:
        n = normalize_string(i, fix_utf8=True)
        categories[n].append(i)

    for k, v in categories.items():
        print(k, v)

    return categories


if __name__ == "__main__":
    main()
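
A quick sanity check for the Python 3 version (a sketch: the module name
normalize3 is hypothetical and the outputs are inferred from the pipeline;
note that with normalize_prepositions enabled, connecting words such as "e"
are removed):

from normalize3 import normalize_string

print(normalize_string("Carrinhos e Ve&iacute;culos"))  # carrinhos veiculos
print(normalize_string("Cine & foto"))                  # cine foto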
