# haliphax/split_graphemes.py

## split_graphemes.py
"""Unicode utility for splitting a string into grapheme clusters"""

# stdlib
import logging
from os import environ
from sys import stdout
import unicodedata

# 3rd party
from emoji import is_emoji
from wcwidth import wcswidth

ZWNJ = "\u200c"
"""Zero-width non-joiner; blocks next grapheme from joining with cluster"""

ZWJ = "\u200d"
"""Zero-width joiner; joins next grapheme with cluster"""

VARIATION_SELECTORS = {chr(c) for c in range(0xfe00, 0xfe0f + 1)}
"""Variation selectors (U+FE00..U+FE0F); modify display of previous grapheme"""

EMOJI_VS = "\ufe0f"
"""'Emoji type' variation selector; display previous character as emoji"""

SKIN_TONES = {chr(c) for c in range(0x1f3fb, 0x1f3ff + 1)}
"""Skin tone modifiers; modifies previous emoji with no ZWJ between"""

ASSUME_WIDE = False
"""
Whether to assume that all emoji matched by `is_emoji()` are Wide, even if their
`unicodedata.east_asian_width()` result is Narrow
"""

FORCE_EVS_WIDE = True
"""
Whether Narrow emoji which are joined by the Emoji Variation Selector `\\ufe0f`
will be forcibly labeled as Wide for the purposes of column offset compensation
(e.g. variation-selected male/female symbol emoji)
"""

VALID_ZWC = {"\n"}
"""Unjoined zero-width characters that will not be stripped during parsing"""

log = logging.getLogger(__name__)
# The level name is read from the `loglevel` environment variable. It is
# upper-cased so that e.g. "debug" works, and anything that does not resolve to
# a numeric logging level falls back to INFO instead of failing at import time.
_level = getattr(logging, environ.get("loglevel", "INFO").upper(), None)
log.setLevel(_level if isinstance(_level, int) else logging.INFO)
streamHandler = logging.StreamHandler(stdout)
log.addHandler(streamHandler)


class Grapheme:
    """
    A single (possibly clustered) grapheme.

    `char` holds the base character and `mods` holds everything that was
    attached to it (joiners, variation selectors, combining marks, ...), so
    that output for terminals without zero-width/combining support can be
    built from the base character alone. `width` is the column width recorded
    for the cluster. `force_width` marks clusters whose width was overridden;
    such clusters are rendered with one trailing space by `__str__`.
    """

    char: str
    mods: str
    width: int
    force_width: bool = False

    def __init__(self, char: str = "", mods: str = "", width: int = 0):
        self.char, self.mods, self.width = char, mods, width

    def _modstr(self, s):
        """Return `s` itself if it occupies columns, else its code point as hex."""
        if wcswidth(s) > 0:
            return s

        return '0x%04X' % ord(s)

    def __repr__(self):
        # modifiers without a visible width are shown as hex code points
        shown_mods = [self._modstr(m) for m in self.mods]
        forced = " <forced>" if self.force_width else ""
        return (
            f"Grapheme(char={self.char!r}, "
            f"mods={shown_mods}, "
            f"width={self.width}{forced})"
        )

    def __str__(self):
        # a forced-wide cluster is padded with one space
        padding = " " if self.force_width else ""
        return self.char + self.mods + padding


def graphemes(string: str) -> "list[Grapheme]":
    """
    Split a Unicode string into (potentially clustered) graphemes.

    The string is scanned one code point at a time. Zero-width joiners (and
    the character that follows one), variation selectors, skin tone modifiers
    and combining characters are folded into the `mods` of the cluster they
    follow; other characters start a new `Grapheme`. Zero-width characters
    that are neither part of a cluster nor listed in `VALID_ZWC` are dropped.

    Args:
        string: The string to parse.

    Returns:
        A list of `Grapheme` objects parsed from the input string. A cluster
        that is still open when the input ends (e.g. a trailing emoji) is
        included.
    """

    def _append_cell(cell: Grapheme, cells: list[Grapheme]) -> Grapheme:
        """Store a finished cell and hand back a fresh, empty one."""
        cells.append(cell)
        return Grapheme()

    def _strip_invalid_mods(last: Grapheme) -> None:
        """Remove the modifiers of an emoji cluster that is not a known emoji."""
        # the padding space of forced-wide cells and a trailing emoji
        # variation selector are ignored for the lookup
        if not is_emoji(str(last).rstrip().rstrip(EMOJI_VS)):
            log.debug("invalid emoji: %r", last)
            # strip all but base emoji character if invalid
            last.mods = ""

    # `cell` is the cluster currently being built; it is only moved into
    # `cells` once a character arrives that cannot belong to it
    cell = Grapheme()
    cells: list[Grapheme] = []
    joined = False
    was_emoji = False

    for c in string:
        if c == ZWJ:
            # a joiner is only accepted directly after an emoji cluster
            if was_emoji:
                log.debug("ZWJ")
                cell.mods += c
                joined = True
            else:
                log.debug("unexpected ZWJ")

            continue

        if c == ZWNJ:
            joined = False

            if cell.char != "":
                log.debug("ZWNJ")
                cell.mods += c
                was_emoji = False
            else:
                log.debug("unexpected ZWNJ")

            continue

        if unicodedata.combining(c) != 0:
            if cell.char != "":
                # a cluster is still open (pending emoji, or a base that
                # already received a mark): attach to it instead of replacing
                # it with an older cell, which would silently drop it
                log.debug("combining character: %r", "o" + c)
                cell.mods += c
            elif cells:
                # reopen the most recently finished cell and attach the mark
                log.debug("combining character: %r", "o" + c)
                cell = cells.pop()
                cell.mods += c
            else:
                log.debug("unexpected combining character")

            continue

        if c in VARIATION_SELECTORS:
            if cell.char != "":
                if c == EMOJI_VS:
                    if was_emoji:
                        log.debug("emoji variation selector")
                        cell.mods += c

                        if FORCE_EVS_WIDE and cell.width < 2:
                            log.debug("forced wide")
                            cell.width = 2
                            cell.force_width = True
                    else:
                        log.debug("unexpected emoji variation selector")
                else:
                    log.debug("variation selector: %r", c)
                    cell.mods += c
            else:
                log.debug("unexpected variation selector")

            continue

        if c in SKIN_TONES and was_emoji:
            log.debug("skin tone: %r", c)
            cell.mods += c

            if not is_emoji(str(cell)):
                # separate skin tone modifier from emoji if invalid
                log.debug("invalid base: %r", cell)
                cell.mods = cell.mods[:-1]
                cell = _append_cell(cell, cells)
                # the leading ZWNJ keeps the modifier from attaching to the
                # preceding emoji when the result is printed
                cell.char = f"{ZWNJ}{c}"
                cell.width = 2
                cell = _append_cell(cell, cells)
                was_emoji = False

            continue

        if joined:
            # first character after a ZWJ becomes part of the open cluster
            log.debug("joining: %r", c)
            cell.mods += c
            joined = False
            continue

        # from here on `c` starts a new cluster; close the open one first
        if cell.char != "":
            cell = _append_cell(cell, cells)

        if unicodedata.east_asian_width(c) == "W":
            log.debug("wide")
            cell.width = 2
        else:
            cell.width = wcswidth(c)

        if is_emoji(c):
            log.debug("emoji: %r", c)
            cell.char = c
            was_emoji = True

            # assume the terminal will cause a problematic visual column
            # offset when displaying emoji that are (incorrectly) labeled as
            # Narrow
            if ASSUME_WIDE and cell.width < 2:
                log.debug("assumed wide")
                cell.width = 2
                cell.force_width = True

            continue

        if was_emoji:
            # the emoji cluster that was just closed is the last stored cell
            log.debug("end emoji")
            _strip_invalid_mods(cells[-1])

        was_emoji = False

        if wcswidth(c) < 1:
            if c not in VALID_ZWC:
                log.debug("stripping ZWC: 0x%04X", ord(c))
                continue

            log.debug("ZWC: 0x%04X", ord(c))

        cell.char = c
        cell = _append_cell(cell, cells)

    # flush the cluster that was still open when the input ended; without this
    # a trailing emoji (or a base reopened by a combining mark) would be lost
    if cell.char != "":
        cells.append(cell)

        if was_emoji:
            log.debug("end emoji")
            _strip_invalid_mods(cell)

    return cells


# Demo input that is run through `graphemes()` below: sample emoji sequences
# followed by plain/combining-character text. `EMOJI_VS`, `ZWNJ` and `ZWJ` are
# interpolated by the f-string.
# NOTE(review): several samples also appear to embed literal zero-width joiners
# and variation selectors, which are invisible in most editors -- keep the
# literal byte-for-byte when editing.
data = f"""
emoji:
------
🎨 - standard wide
♂{EMOJI_VS} - variation-selected narrow
👋🏾 - skin tone 👋{ZWNJ}🏾
🧑‍💻 - zero-width-joiner 🧑💻
👨‍👩‍👧 - multiple zwj 👨👩👧
🧙‍♂️ - zwj + evs narrow 🧙♂{EMOJI_VS}
🧑🏼‍🚒 - skin tone 🧑{ZWNJ}🏼, zwj 🚒
👮🏿‍♀️ - skin tone 👩{ZWNJ}🏿, zwj + evs narrow 👮♀{EMOJI_VS}
😭{ZWJ}🚒 - invalid zwj
😭🏿 invalid skin tone combo

combining characters:
---------------------
a - standard narrow
â - narrow + combining character
"""
# Parse the demo text and echo every cluster back to stdout. `print()` applies
# `str()` to each Grapheme, so forced-wide clusters come out with their padding.
grapheme_list = graphemes(data)

for g in grapheme_list:
    print(g, end="")
	"""Unicode utility for splitting a string into grapheme clusters"""

	# stdlib
	import logging
	from os import environ
	from sys import stdout
	import unicodedata

	# 3rd party
	from emoji import is_emoji
	from wcwidth import wcswidth

	ZWNJ = "\u200c"
	"""Zero-width non-joiner; blocks next grapheme from joining with cluster"""

	ZWJ = "\u200d"
	"""Zero-width joiner; joins next grapheme with cluster"""

	VARIATION_SELECTORS = {chr(c) for c in range(0xfe00, 0xfe0f + 1)}
	"""Variation selectors (U+FE00..U+FE0F); modify display of previous grapheme"""

	EMOJI_VS = "\ufe0f"
	"""'Emoji type' variation selector; display previous character as emoji"""

	SKIN_TONES = {chr(c) for c in range(0x1f3fb, 0x1f3ff + 1)}
	"""Skin tone modifiers; modifies previous emoji with no ZWJ between"""

	ASSUME_WIDE = False
	"""
	Whether to assume that all emoji matched by `is_emoji()` are Wide, even if their
	`unicodedata.east_asian_width()` result is Narrow
	"""

	FORCE_EVS_WIDE = True
	"""
	Whether Narrow emoji which are joined by the Emoji Variation Selector `\\ufe0f`
	will be forcibly labeled as Wide for the purposes of column offset compensation
	(e.g. variation-selected male/female symbol emoji)
	"""

	VALID_ZWC = {"\n"}
	"""Unjoined zero-width characters that will not be stripped during parsing"""

	log = logging.getLogger(__name__)
	# The level name is read from the `loglevel` environment variable. It is
	# upper-cased so that e.g. "debug" works, and anything that does not resolve
	# to a numeric logging level falls back to INFO instead of failing at import.
	_level = getattr(logging, environ.get("loglevel", "INFO").upper(), None)
	log.setLevel(_level if isinstance(_level, int) else logging.INFO)
	streamHandler = logging.StreamHandler(stdout)
	log.addHandler(streamHandler)


	class Grapheme:
	    """
	    A single (possibly clustered) grapheme.

	    `char` holds the base character and `mods` holds everything that was
	    attached to it (joiners, variation selectors, combining marks, ...), so
	    that output for terminals without zero-width/combining support can be
	    built from the base character alone. `width` is the column width
	    recorded for the cluster. `force_width` marks clusters whose width was
	    overridden; such clusters are rendered with one trailing space by
	    `__str__`.
	    """

	    char: str
	    mods: str
	    width: int
	    force_width: bool = False

	    def __init__(self, char: str = "", mods: str = "", width: int = 0):
	        self.char = char
	        self.mods = mods
	        self.width = width

	    def _modstr(self, s):
	        """Return `s` itself if it occupies columns, else its code point as hex."""
	        return '0x%04X' % ord(s) if wcswidth(s) <= 0 else s

	    def __repr__(self):
	        # modifiers without a visible width are shown as hex code points
	        return (
	            f"Grapheme(char={self.char!r}, "
	            f"mods={[self._modstr(c) for c in self.mods]}, "
	            f"width={self.width}{' <forced>' if self.force_width else ''})"
	        )

	    def __str__(self):
	        # a forced-wide cluster is padded with one space
	        return "".join(
	            (self.char, self.mods, (" " if self.force_width else ""))
	        )


	def graphemes(string: str) -> "list[Grapheme]":
	    """
	    Split a Unicode string into (potentially clustered) graphemes.

	    The string is scanned one code point at a time. Zero-width joiners (and
	    the character that follows one), variation selectors, skin tone
	    modifiers and combining characters are folded into the `mods` of the
	    cluster they follow; other characters start a new `Grapheme`.
	    Zero-width characters that are neither part of a cluster nor listed in
	    `VALID_ZWC` are dropped.

	    Args:
	        string: The string to parse.

	    Returns:
	        A list of `Grapheme` objects parsed from the input string. A
	        cluster that is still open when the input ends (e.g. a trailing
	        emoji) is included.
	    """

	    def _append_cell(cell: Grapheme, cells: list[Grapheme]) -> Grapheme:
	        """Store a finished cell and hand back a fresh, empty one."""
	        cells.append(cell)
	        return Grapheme()

	    def _strip_invalid_mods(last: Grapheme) -> None:
	        """Remove the modifiers of an emoji cluster that is not a known emoji."""
	        # the padding space of forced-wide cells and a trailing emoji
	        # variation selector are ignored for the lookup
	        if not is_emoji(str(last).rstrip().rstrip(EMOJI_VS)):
	            log.debug("invalid emoji: %r", last)
	            # strip all but base emoji character if invalid
	            last.mods = ""

	    # `cell` is the cluster currently being built; it is only moved into
	    # `cells` once a character arrives that cannot belong to it
	    cell = Grapheme()
	    cells: list[Grapheme] = []
	    joined = False
	    was_emoji = False

	    for c in string:
	        if c == ZWJ:
	            # a joiner is only accepted directly after an emoji cluster
	            if was_emoji:
	                log.debug("ZWJ")
	                cell.mods += c
	                joined = True
	            else:
	                log.debug("unexpected ZWJ")

	            continue

	        if c == ZWNJ:
	            joined = False

	            if cell.char != "":
	                log.debug("ZWNJ")
	                cell.mods += c
	                was_emoji = False
	            else:
	                log.debug("unexpected ZWNJ")

	            continue

	        if unicodedata.combining(c) != 0:
	            if cell.char != "":
	                # a cluster is still open (pending emoji, or a base that
	                # already received a mark): attach to it instead of
	                # replacing it with an older cell, which would drop it
	                log.debug("combining character: %r", "o" + c)
	                cell.mods += c
	            elif cells:
	                # reopen the most recently finished cell and attach the mark
	                log.debug("combining character: %r", "o" + c)
	                cell = cells.pop()
	                cell.mods += c
	            else:
	                log.debug("unexpected combining character")

	            continue

	        if c in VARIATION_SELECTORS:
	            if cell.char != "":
	                if c == EMOJI_VS:
	                    if was_emoji:
	                        log.debug("emoji variation selector")
	                        cell.mods += c

	                        if FORCE_EVS_WIDE and cell.width < 2:
	                            log.debug("forced wide")
	                            cell.width = 2
	                            cell.force_width = True
	                    else:
	                        log.debug("unexpected emoji variation selector")
	                else:
	                    log.debug("variation selector: %r", c)
	                    cell.mods += c
	            else:
	                log.debug("unexpected variation selector")

	            continue

	        if c in SKIN_TONES and was_emoji:
	            log.debug("skin tone: %r", c)
	            cell.mods += c

	            if not is_emoji(str(cell)):
	                # separate skin tone modifier from emoji if invalid
	                log.debug("invalid base: %r", cell)
	                cell.mods = cell.mods[:-1]
	                cell = _append_cell(cell, cells)
	                # the leading ZWNJ keeps the modifier from attaching to
	                # the preceding emoji when the result is printed
	                cell.char = f"{ZWNJ}{c}"
	                cell.width = 2
	                cell = _append_cell(cell, cells)
	                was_emoji = False

	            continue

	        if joined:
	            # first character after a ZWJ becomes part of the open cluster
	            log.debug("joining: %r", c)
	            cell.mods += c
	            joined = False
	            continue

	        # from here on `c` starts a new cluster; close the open one first
	        if cell.char != "":
	            cell = _append_cell(cell, cells)

	        if unicodedata.east_asian_width(c) == "W":
	            log.debug("wide")
	            cell.width = 2
	        else:
	            cell.width = wcswidth(c)

	        if is_emoji(c):
	            log.debug("emoji: %r", c)
	            cell.char = c
	            was_emoji = True

	            # assume the terminal will cause a problematic visual column
	            # offset when displaying emoji that are (incorrectly) labeled
	            # as Narrow
	            if ASSUME_WIDE and cell.width < 2:
	                log.debug("assumed wide")
	                cell.width = 2
	                cell.force_width = True

	            continue

	        if was_emoji:
	            # the emoji cluster that was just closed is the last stored cell
	            log.debug("end emoji")
	            _strip_invalid_mods(cells[-1])

	        was_emoji = False

	        if wcswidth(c) < 1:
	            if c not in VALID_ZWC:
	                log.debug("stripping ZWC: 0x%04X", ord(c))
	                continue

	            log.debug("ZWC: 0x%04X", ord(c))

	        cell.char = c
	        cell = _append_cell(cell, cells)

	    # flush the cluster that was still open when the input ended; without
	    # this a trailing emoji (or a base reopened by a combining mark) is lost
	    if cell.char != "":
	        cells.append(cell)

	        if was_emoji:
	            log.debug("end emoji")
	            _strip_invalid_mods(cell)

	    return cells


	# Demo input that is run through `graphemes()` below: sample emoji sequences
	# followed by plain/combining-character text. `EMOJI_VS`, `ZWNJ` and `ZWJ`
	# are interpolated by the f-string.
	# NOTE(review): several samples also appear to embed literal zero-width
	# joiners and variation selectors, which are invisible in most editors --
	# keep the literal byte-for-byte when editing.
	data = f"""
	emoji:
	------
	🎨 - standard wide
	♂{EMOJI_VS} - variation-selected narrow
	👋🏾 - skin tone 👋{ZWNJ}🏾
	🧑‍💻 - zero-width-joiner 🧑💻
	👨‍👩‍👧 - multiple zwj 👨👩👧
	🧙‍♂️ - zwj + evs narrow 🧙♂{EMOJI_VS}
	🧑🏼‍🚒 - skin tone 🧑{ZWNJ}🏼, zwj 🚒
	👮🏿‍♀️ - skin tone 👩{ZWNJ}🏿, zwj + evs narrow 👮♀{EMOJI_VS}
	😭{ZWJ}🚒 - invalid zwj
	😭🏿 invalid skin tone combo

	combining characters:
	---------------------
	a - standard narrow
	â - narrow + combining character
	"""
	# Parse the demo text and echo every cluster back to stdout without adding
	# separators between clusters.
	grapheme_list = graphemes(data)

	for g in grapheme_list:
	    print(str(g), end="")