Skip to content

Instantly share code, notes, and snippets.

Forked from tmacam/
Created May 30, 2023 08:11
Show Gist options
  • Save markmysourcecode/b733db6c3b8778633fc2118b06b147d0 to your computer and use it in GitHub Desktop.
Save markmysourcecode/b733db6c3b8778633fc2118b06b147d0 to your computer and use it in GitHub Desktop.
String normalization in Python: HTML stripping and HTML entity resolution.
# vim:ts=4:sts=4:sw=4:et:wrap:ai:fileencoding=utf-8:
"""A collection of string normalization routines.
You are probably looking for normalize_string, which performs an aggressive
(but arguably sound) string normalization.
"""
from HTMLParser import HTMLParser
import re
import unicodedata
# Code-point bounds of the two plain-ASCII ranges treated as alphanumeric:
# lowercase letters (a-z) and decimal digits (0-9).
ORD_A_MIN, ORD_Z_MIN = ord(u'a'), ord(u'z')
ORD_0, ORD_9 = ord(u'0'), ord(u'9')
def try_redecode_utf8(s):
    r"""Try redecoding utf-8 data inside a (faux-)unicode string.

    A text string whose code points are really the bytes of UTF-8 data is
    mapped back to bytes through latin1 and decoded as UTF-8; a byte string
    is decoded as UTF-8 directly. The process is repeated until a pass
    fails or changes nothing, and the last successfully decoded value is
    returned. Values that are neither text nor bytes are returned as-is.

    >>> try_redecode_utf8(u'T\xc3\xaanis e Esporte')
    u'T\xeanis e Esporte'
    """
    # `unicode` on Python 2, `str` on Python 3.
    text_type = type(u'')
    redecoded = s
    # Keep redecoding until redecoding fails or there is no difference in output
    while True:
        try:
            if isinstance(s, text_type):
                redecoded = s.encode('latin1').decode('utf-8')
            elif isinstance(s, bytes):
                redecoded = s.decode('utf-8')
        except (UnicodeDecodeError, UnicodeEncodeError):
            # The data is not (or no longer) UTF-8 in disguise: keep the
            # result of the last pass that succeeded.
            break
        if redecoded == s:
            break
        s = redecoded
    return redecoded
class HTMLStripper(HTMLParser):
    """Remove tags and keep HTML entities intact.

    Feed markup with `feed()` and read the stripped text with `get_data()`.
    Every start tag is replaced by a single space; end tags emit nothing.
    Character and entity references are copied through unresolved.
    """

    def __init__(self):
        # The base parser must be initialised before feed() can be used.
        try:
            # Python 3 parsers resolve references themselves unless told not
            # to, in which case handle_charref/handle_entityref never fire.
            HTMLParser.__init__(self, convert_charrefs=False)
        except TypeError:
            # Python 2's HTMLParser takes no such argument.
            HTMLParser.__init__(self)
        # Output fragments, in document order.
        self.fed = []

    def handle_starttag(self, tag, attrs):
        # We took the decision that all/any tag is a word-splitter and thus
        # is converted to spaces.
        self.fed.append(' ')

    def handle_data(self, d):
        # Plain text between tags is kept verbatim.
        self.fed.append(d)

    def handle_charref(self, number):
        # Re-emit numeric references (e.g. '&#243;') untouched.
        self.fed.append('&#%s;' % number)

    def handle_entityref(self, name):
        # Re-emit named references (e.g. '&amp;') untouched.
        self.fed.append('&%s;' % name)

    def get_data(self):
        """Return everything collected so far as one text string."""
        return u''.join(self.fed)
def isPlainASCIIAlphaNum(c):
    """Tell whether the single character `c` is within a-z or 0-9.

    Uppercase letters are not accepted.
    """
    code_point = ord(c)
    is_lower_letter = ORD_A_MIN <= code_point <= ORD_Z_MIN
    is_digit = ORD_0 <= code_point <= ORD_9
    return is_lower_letter or is_digit
def strip_html_and_convert_entities(html):
    """Strip the tags from `html` and resolve its HTML entities.

    Tags are removed by HTMLStripper, which leaves character and entity
    references untouched; those are then resolved in a single unescape pass
    over the stripped text.
    """
    # Previously we used the following code, that depends on BeautifulSoup:
    # soup = BeautifulSoup(html, convertEntities=BeautifulSoup.HTML_ENTITIES)
    # return u' '.join(soup.findAll(text=True))
    # But it does not handle numeric character entities correctly. Our new
    # approach does not depend on BeautifulSoup and uses HTMLParser, which is
    # part of python 2.6's std. library. So it is a double-win. :-)
    parser = HTMLStripper()
    parser.feed(html)
    # HTML parser breaks if parsing ends/EOF on a single-letter broken entities
    # such as 'at&t'. Adding an extra space fixes this.
    parser.feed(' ')
    stripped = parser.get_data()
    unescape = getattr(parser, 'unescape', None)
    if unescape is None:
        # HTMLParser.unescape was removed in Python 3.9; html.unescape is its
        # replacement. (The import binds only `unescape`, not the argument.)
        from html import unescape
    return unescape(stripped)
def normalize_case(s):
    """Return `s` converted to lowercase."""
    lowered = s.lower()
    return lowered
def normalize_diacritics(input_str):
    """Return `input_str` as text with its diacritical marks removed.

    The input is coerced to text, decomposed with NFKD so that accented
    letters become a base letter followed by combining marks, and the
    combining marks are then dropped.
    """
    # `unicode` on Python 2, `str` on Python 3.
    text_type = type(u'')
    nfkd_form = unicodedata.normalize('NFKD', text_type(input_str))
    return u"".join(c for c in nfkd_form if not unicodedata.combining(c))
def normalize_to_plain_ascii(s):
    """Return `s` as text with every non-ASCII character replaced by '?'."""
    only_ascii = s.encode('ASCII', 'replace')  # unencodable chars become '?'
    # Decode explicitly rather than through the Python 2-only unicode()
    # builtin and its implicit default codec.
    return only_ascii.decode('ASCII')
def normalize_to_alphanum_and_spaces(s):
    """Replace every character of `s` that isPlainASCIIAlphaNum rejects by a space."""
    kept = []
    for char in s:
        if isPlainASCIIAlphaNum(char):
            kept.append(char)
        else:
            kept.append(' ')
    return u"".join(kept)
# def normalize_diacritics_old(s):
# """Converts to lowercase, normalizes diacritics and
# converts non-alphanumeric chars into space.
# """
# s = s.lower()
# s = unicodedata.normalize('NFKD', s)
# # Filter ascii letters and numbers, discard everyone else
# filtered = []
# for c in s:
# if isPlainASCIIAlphaNum(c):
# filtered.append(c)
# elif unicodedata.category(c) == 'Mn':
# continue
# else:
# filtered.append(u' ')
# return u' '.join(''.join(filtered).split())
def normalize_prepositions(s):
    """Blank out common prepositions.

    Each whole-word occurrence of 'e', 'and', 'de', 'do' or 'da' in `s` is
    replaced by a single space; the surrounding whitespace is left alone.
    """
    result = s
    for word in ('e', 'and', 'de', 'do', 'da'):
        # \b on both sides restricts the match to whole words.
        whole_word = r'\b%s\b' % word
        result = re.sub(whole_word, " ", result)
    return result
def normalize_whitespace(s):
    """Collapse each run of whitespace in `s` to one space and trim both ends."""
    # Raw string: '\s' in a plain literal is an invalid escape sequence.
    s = re.sub(r'\s+', ' ', s)
    return s.strip()
def normalize_string(s, fix_utf8=False):
    """Run `s` through the whole normalization pipeline.

    In order: optional UTF-8 re-decoding (when `fix_utf8` is true), HTML
    stripping and entity resolution, lowercasing, diacritics removal,
    reduction to plain ASCII, replacement of non-alphanumerics by spaces and
    whitespace collapsing.
    """
    steps = [
        strip_html_and_convert_entities,
        normalize_case,
        normalize_diacritics,
        normalize_to_plain_ascii,
        normalize_to_alphanum_and_spaces,
        # normalize_prepositions is deliberately left out of the pipeline.
        normalize_whitespace,
    ]
    if fix_utf8:
        steps.insert(0, try_redecode_utf8)
    for step in steps:
        s = step(s)
    # We don't need to re-normalize to known unicode form (say, NFC) since we
    # only have plain ASCII data, and alpha-numeric content, for that matter.
    # There is no "combined" nor "decomposed" unicode content in `s`.
    return s
def main():
    """Normalize a few sample strings and print them grouped by result.

    Returns a mapping from each normalized string to the list of sample
    strings that normalize to it.
    """
    from collections import defaultdict

    sample = [
        u"Cine e foto",
        u"Cine & foto",
        u"Cine&Foto",  # BeautifulSoup breaks for this one.
        u'Carrinhos e Ve&iacute;culos',
        u'<a href="#">Cine <em>(&eacute; f&#x00f3;to&not; \u0394&#x03b7;&#956;&#x03CE;)</em></a>',
        u'Soul e R&B',  # we used to break on this one.
        u'T\xc3\xaanis e Esporte',
    ]
    categories = defaultdict(list)
    for i in sample:
        n = normalize_string(i, fix_utf8=True)
        # Group the original strings under their normalized form.
        categories[n].append(i)
    for k, v in categories.items():
        # Function-call form prints the same text on Python 2 and Python 3.
        print('%s %s' % (k, v))
    return categories
# Run the demo only when executed as a script, not when imported.
if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment