Last active
August 29, 2019 00:58
-
-
Save amcgregor/cc177559de1b38dd92024c9210cfa920 to your computer and use it in GitHub Desktop.
Example Unicode normalization and sanitization for the purpose of "slugification" -- making URL-safe.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import re
from unicodedata import category, combining, normalize as uninorm

from ftfy import fix_text  # Third-party; pip install ftfy
def normalize(value, slug=False, fold=True, *, sep='-', elide='CMPSZ'):
    """Normalize and optionally "slugify" a given string.

    This is very much a multi-step process: initially ensure we have Unicode
    text in the first place (decoding bytes-like input, casting anything
    else), "fix the text" (see https://ftfy.readthedocs.io/ — it does a lot),
    optionally fold out combining characters, and optionally elide dangerous
    characters (compacting sequential elisions) for even greater URL-safeness.

    Remember, though, that most URI components support full UTF-8! (At least,
    theoretically. There was a bit of a dust-up with http://💩.la and now
    emoji are forbidden from domain names...) To keep URI more easily typeable
    or readable over a phone, ``slug`` is provided to delete non-printable and
    non-alphanumeric content, lowercase, then collapse consecutive
    replacements down into one.

    :param value: Input to normalize. ``None`` yields ``''``; anything with a
        ``.decode`` method is decoded (UTF-8, falling back to Windows-1252);
        any other non-string is passed through ``str()``.
    :param slug: If true, replace characters whose Unicode major category is
        in ``elide`` with ``sep``, collapse runs of ``sep``, and lowercase.
    :param fold: If true, strip combining characters (diacritics) via NFKD
        decomposition, then recompose the survivors with NFC.
    :param sep: Keyword-only. Separator substituted for elided characters.
    :param elide: Keyword-only. Unicode major category letters to elide when
        slugifying: C=control, M=mark, P=punctuation, S=symbol, Z=separator.
    :returns: The normalized (and possibly slugified) string.
    """
    if value is None:  # Sanitize literal nulls.
        return ''

    if not isinstance(value, str):  # Attempt to decode, otherwise cast.
        if hasattr(value, 'decode'):
            try:
                value = value.decode('utf8')
            except UnicodeDecodeError:
                # People say they want Latin-1. They actually want W-1252.
                # NOTE(review): cp1252 leaves five bytes (0x81, 0x8D, 0x8F,
                # 0x90, 0x9D) undefined and can itself raise — confirm
                # whether a latin-1 last resort is wanted here.
                value = value.decode('Windows-1252')
        else:
            value = str(value)  # This makes non-string input acceptable.

    # Ref: https://ftfy.readthedocs.io/en/latest/#using-ftfy
    value = fix_text(value, normalization='NFKC')

    if fold:
        value = uninorm('NFKD', value)  # Separate out combining characters.
        value = "".join(c for c in value if not combining(c))  # Chuck 'em.
        value = uninorm('NFC', value)  # Fold the inoffensive back together.

    if slug:
        value = "".join((sep if category(c)[0] in elide else c) for c in value)
        # BUGFIX: `sep` must be regex-escaped — a metacharacter separator
        # such as '.', '+', or '|' previously produced a wrong (or invalid)
        # pattern and failed to collapse consecutive separators.
        value = re.sub(f'{re.escape(sep)}+', sep, value).lower()

    return value
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment