Last active
August 29, 2019 00:58
-
-
Save amcgregor/cc177559de1b38dd92024c9210cfa920 to your computer and use it in GitHub Desktop.
Example Unicode normalization and sanitization for the purpose of "slugification" -- making URL-safe.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import re
from unicodedata import category, combining, normalize as uninorm

from ftfy import fix_text  # Third-party; pip install ftfy
def normalize(value, slug=False, fold=True, *, sep='-', elide='CMPSZ'):
    """Normalize and optionally "slugify" a given string.

    This is very much a multi-step process: initially ensure we have Unicode
    text in the first place (decoding bytes-like input, casting anything
    else), "fix the text" (see https://ftfy.readthedocs.io/ — it does a lot),
    optionally fold out combining characters, and optionally elide dangerous
    characters (compacting sequential elisions) for even greater URL-safeness.

    Remember, though, that most URI components support full UTF-8! (At least,
    theoretically. There was a bit of a dust-up with http://💩.la and now
    emoji are forbidden from domain names...) To keep URI more easily typeable
    or readable over a phone, ``slug`` is provided to delete non-printable and
    non-alphanumeric content, lowercase, then collapse consecutive
    replacements down into one.

    :param value: Input to normalize. ``None`` yields ``''``; anything with a
        ``.decode`` method is decoded (UTF-8, falling back to Windows-1252);
        any other non-string is passed through ``str()``.
    :param slug: If true, replace characters whose Unicode major category is
        in ``elide`` with ``sep``, collapse runs of ``sep``, and lowercase.
    :param fold: If true, strip combining characters (diacritics) via NFKD
        decomposition, then recompose the survivors with NFC.
    :param sep: Keyword-only. Separator substituted for elided characters.
    :param elide: Keyword-only. Unicode major category letters to elide when
        slugifying: C=control, M=mark, P=punctuation, S=symbol, Z=separator.
    :returns: The normalized (and possibly slugified) string.
    """
    if value is None:  # Sanitize literal nulls.
        return ''

    if not isinstance(value, str):  # Attempt to decode, otherwise cast.
        if hasattr(value, 'decode'):
            try:
                value = value.decode('utf8')
            except UnicodeDecodeError:
                # People say they want Latin-1. They actually want W-1252.
                # NOTE(review): cp1252 leaves five bytes (0x81, 0x8D, 0x8F,
                # 0x90, 0x9D) undefined and can itself raise — confirm
                # whether a latin-1 last resort is wanted here.
                value = value.decode('Windows-1252')
        else:
            value = str(value)  # This makes non-string input acceptable.

    # Ref: https://ftfy.readthedocs.io/en/latest/#using-ftfy
    value = fix_text(value, normalization='NFKC')

    if fold:
        value = uninorm('NFKD', value)  # Separate out combining characters.
        value = "".join(c for c in value if not combining(c))  # Chuck 'em.
        value = uninorm('NFC', value)  # Fold the inoffensive back together.

    if slug:
        value = "".join((sep if category(c)[0] in elide else c) for c in value)
        # BUGFIX: `sep` must be regex-escaped — a metacharacter separator
        # such as '.', '+', or '|' previously produced a wrong (or invalid)
        # pattern and failed to collapse consecutive separators.
        value = re.sub(f'{re.escape(sep)}+', sep, value).lower()

    return value
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment