normalize_string.py (forked from tmacam/normalize_string.py)
String normalization in Python: HTML stripping and HTML entity resolution.

#!/usr/bin/python
# vim:ts=4:sts=4:sw=4:et:wrap:ai:fileencoding=utf-8:
"""A collection of string normalization routines.

You are probably looking for normalize_string, which performs an aggressive
(but arguably sound) string normalization process.
"""
from HTMLParser import HTMLParser
import re
import unicodedata

ORD_A_MIN = ord(u'a')
ORD_Z_MAX = ord(u'z')
ORD_0 = ord(u'0')
ORD_9 = ord(u'9')

def try_redecode_utf8(s):
    """Try redecoding utf-8 data inside a (faux-)unicode string.

    >>> try_redecode_utf8(u'T\xc3\xaanis e Esporte')
    u'T\xeanis e Esporte'
    """
    keep_going = True
    redecoded = s
    # Keep redecoding until redecoding fails or the output stops changing.
    while keep_going:
        try:
            if isinstance(s, unicode):
                # Mojibake repair: UTF-8 bytes were mis-decoded as latin1.
                redecoded = s.encode('latin1').decode('utf-8')
            elif isinstance(s, str):
                redecoded = s.decode('utf-8')
            keep_going = (s != redecoded)
            s = redecoded
        except (UnicodeDecodeError, UnicodeEncodeError):
            keep_going = False
    return redecoded

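# A hedged usage note (not in the original): plain byte strings are handled
# too. Assuming Python 2 semantics, try_redecode_utf8('T\xc3\xaanis') decodes
# the raw UTF-8 bytes and returns u'T\xeanis'.
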
class HTMLStripper(HTMLParser):
    """Removes tags and keeps HTML entities intact."""

    def __init__(self):
        self.reset()
        self.fed = []

    def handle_starttag(self, tag, attrs):
        # We took the decision that any tag is a word-splitter and thus
        # is converted to a space.
        self.fed.append(' ')

    def handle_endtag(self, tag):
        # End tags are word-splitters too; without this, 'Cine</em>Foto'
        # would glue the surrounding words together.
        self.fed.append(' ')

    def handle_data(self, d):
        self.fed.append(d)

    def handle_charref(self, number):
        # Re-emit numeric character references so unescape() can resolve them.
        self.fed.append('&#%s;' % number)

    def handle_entityref(self, name):
        self.fed.append('&%s;' % name)

    def get_data(self):
        return u''.join(self.fed)

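# Sketch of the stripper in isolation (illustrative, not part of the
# original): feeding u'Cine<br>Foto' and then a trailing space yields
# u'Cine Foto ' from get_data(), with the tag turned into a word boundary
# and entities left unresolved until unescape() runs.
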
def isPlainASCIIAlphaNum(c):
    """True for lowercase ASCII letters and digits.

    Input is expected to be lowercased already (see normalize_case).
    """
    o = ord(c)
    return (ORD_A_MIN <= o <= ORD_Z_MAX) or (ORD_0 <= o <= ORD_9)

def strip_html_and_convert_entities(html):
    # Previously we used the following code, which depends on BeautifulSoup:
    #
    #   soup = BeautifulSoup(html, convertEntities=BeautifulSoup.HTML_ENTITIES)
    #   return u' '.join(soup.findAll(text=True))
    #
    # Src:
    # http://stackoverflow.com/questions/753052/strip-html-from-strings-in-python
    #
    # But it does not handle numeric character entities correctly. Our new
    # approach does not depend on BeautifulSoup and uses HTMLParser, which is
    # part of python 2.6's std. library. So it is a double-win. :-)
    #
    # http://stackoverflow.com/questions/753052/strip-html-from-strings-in-python
    # http://stackoverflow.com/questions/2087370/decode-html-entities-in-python-string
    parser = HTMLStripper()
    parser.feed(html)
    # HTMLParser breaks if parsing ends/EOF on a single-letter broken entity
    # such as 'at&t'. Adding an extra space fixes this.
    parser.feed(' ')
    parser.close()
    return parser.unescape(parser.get_data())

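# Illustrative example (not in the original): both named and numeric entities
# resolve to real characters, e.g.
#   strip_html_and_convert_entities(u'&Ccedil;ine&nbsp;e<br>Foto')
# returns u'\xc7ine\xa0e Foto '. Note that &nbsp; becomes U+00A0, which the
# later normalization steps still have to fold into a plain space.
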
def normalize_case(s):
    return s.lower()

def normalize_diacritics(input_str):
    # References:
    # http://stackoverflow.com/questions/517923/what-is-the-best-way-to-remove-accents-in-a-python-unicode-string
    # http://stackoverflow.com/questions/9042515/normalizing-unicode-text-to-filenames-etc-in-python
    nkfd_form = unicodedata.normalize('NFKD', unicode(input_str))
    return u"".join([c for c in nkfd_form if not unicodedata.combining(c)])

def normalize_to_plain_ascii(s):
    only_ascii = s.encode('ASCII', 'replace')  # unencodable chars become '?'
    return unicode(only_ascii)

def normalize_to_alphanum_and_spaces(s):
    return u"".join(i if isPlainASCIIAlphaNum(i) else ' ' for i in s)

# def normalize_diacritics_old(s):
#     """Converts to lowercase, normalizes diacritics and
#     converts non-alphanumeric chars into space.
#     """
#     s = s.lower()
#     s = unicodedata.normalize('NFKD', s)
#     # Keep ascii letters and numbers, discard everything else
#     filtered = []
#     for c in s:
#         if isPlainASCIIAlphaNum(c):
#             filtered.append(c)
#         elif unicodedata.category(c) == 'Mn':
#             continue
#         else:
#             filtered.append(u' ')
#     return u' '.join(''.join(filtered).split())

def normalize_prepositions(s):
    """Replaces common (Portuguese and English) prepositions and conjunctions
    with a space.
    """
    prepositions = ['e', 'and', 'de', 'do', 'da']
    for prep in prepositions:
        pattern = r'\b' + prep + r'\b'
        s = re.sub(pattern, " ", s)
    return s

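# Illustrative example: normalize_prepositions(u'cine e foto') returns
# u'cine   foto'; the extra spaces are intentional, since this step is meant
# to run before normalize_whitespace collapses them.
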
def normalize_whitespace(s):
    s = re.sub(r'\s+', ' ', s)
    return s.strip()

def normalize_string(s, fix_utf8=False):
    if fix_utf8:
        s = try_redecode_utf8(s)
    s = strip_html_and_convert_entities(s)
    s = normalize_case(s)
    s = normalize_diacritics(s)
    s = normalize_to_plain_ascii(s)
    s = normalize_to_alphanum_and_spaces(s)
    #s = normalize_prepositions(s)
    s = normalize_whitespace(s)
    # We don't need to re-normalize to a known unicode form (say, NFC) since
    # at this point `s` holds only plain ASCII alphanumeric content and
    # spaces: there is no "combined" nor "decomposed" unicode left in it.
    return s

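# End-to-end example (illustrative): the full pipeline strips tags, resolves
# entities, lowercases, removes diacritics and collapses whitespace, so
#   normalize_string(u'<a>&Ccedil;ine&nbsp;e<br>Foto')
# returns u'cine e foto', and with fix_utf8=True the mojibake sample
# u'T\xc3\xaanis e Esporte' normalizes to u'tenis e esporte'.
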
def main():
    sample = [u"Cine e foto",
              u"Cine & foto",
              u"Cine&Foto",  # BeautifulSoup breaks for this one.
              u"Cine+foto",
              u"Cíñe_e.foto",
              u"<a>&Ccedil;ine&nbsp;e<br>Foto",
              u'Cine\u65e5\u672c\u8a9eFoto',
              u'Carrinhos e Ve&iacute;culos',
              u'<a href="#">Cine <em>(&eacute; f&#x00f3;to&not; \u0394&#x03b7;&#956;&#x03CE;)</em></a>',
              u'Soul e R&B',  # we used to break on this one.
              u'T\xc3\xaanis e Esporte',
              ]
    from collections import defaultdict
    categories = defaultdict(list)
    for i in sample:
        n = normalize_string(i, fix_utf8=True)
        categories[n].append(i)
    for k, v in categories.items():
        print k, v
    return categories


if __name__ == '__main__':
    main()