Last active
March 13, 2023 19:24
-
-
Save haliphax/dd5887b0ab043a05ae54cf0155a3b218 to your computer and use it in GitHub Desktop.
Split a string into grapheme clusters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"""Unicode utility for splitting a string into grapheme clusters""" | |
# stdlib | |
import logging | |
from os import environ | |
from sys import stdout | |
import unicodedata | |
# 3rd party | |
from emoji import is_emoji | |
from wcwidth import wcswidth | |
ZWNJ = "\u200c"
"""Zero-width non-joiner; blocks next grapheme from joining with cluster"""

ZWJ = "\u200d"
"""Zero-width joiner; joins next grapheme with cluster"""

VARIATION_SELECTORS = {chr(c) for c in range(0xFE00, 0xFE0F + 1)}
"""Variation selectors; modify display of previous grapheme"""

EMOJI_VS = "\ufe0f"
"""'Emoji type' variation selector; display previous character as emoji"""

SKIN_TONES = {chr(c) for c in range(0x1F3FB, 0x1F3FF + 1)}
"""Skin tone modifiers; modifies previous emoji with no ZWJ between"""

ASSUME_WIDE = False
"""
Whether to assume that all emoji matched by `is_emoji()` are Wide, even if their
`unicodedata.east_asian_width()` result is Narrow
"""

FORCE_EVS_WIDE = True
"""
Whether Narrow emoji which are joined by the Emoji Variation Selector `\\ufe0f`
will be forcibly labeled as Wide for the purposes of column offset compensation
(e.g. variation-selected male/female symbol emoji)
"""

VALID_ZWC = {"\n"}
"""Unjoined zero-width characters that will not be stripped during parsing"""

log = logging.getLogger(__name__)
# The level name comes from the `loglevel` environment variable. It is
# upper-cased so that e.g. `loglevel=debug` works, and anything that does not
# resolve to an integer level constant on the `logging` module falls back to
# INFO instead of raising while the module is being imported.
_level = getattr(logging, environ.get("loglevel", "INFO").upper(), None)
log.setLevel(_level if isinstance(_level, int) else logging.INFO)
streamHandler = logging.StreamHandler(stdout)
log.addHandler(streamHandler)
class Grapheme:
    """
    Container for one (possibly clustered) grapheme.

    The base character (`char`) is kept apart from the sequence of modifying
    characters (`mods`) so that terminals lacking support for zero-width or
    combining characters can still be served. `width` holds the column width
    recorded for the cluster. When `force_width` is set, the cluster is a
    variation-selected emoji that the terminal (incorrectly) treats as narrow;
    `str()` then appends a single space to compensate for the column offset.
    """

    char: str
    mods: str
    width: int
    force_width: bool = False

    def __init__(self, char: str = "", mods: str = "", width: int = 0):
        self.char, self.mods, self.width = char, mods, width

    def _modstr(self, s):
        """Return `s` itself if it occupies columns, else its hex code point."""
        if wcswidth(s) > 0:
            return s

        return "0x%04X" % ord(s)

    def __repr__(self):
        shown_mods = [self._modstr(m) for m in self.mods]
        forced_tag = " <forced>" if self.force_width else ""

        return (
            f"Grapheme(char={self.char!r}, "
            f"mods={shown_mods}, "
            f"width={self.width}{forced_tag})"
        )

    def __str__(self):
        # a forced-wide cluster is padded with one space
        padding = " " if self.force_width else ""

        return self.char + self.mods + padding
def graphemes(string: str) -> "list[Grapheme]":
    """
    Split a Unicode string into (potentially clustered) graphemes.

    The string is walked one code point at a time. Zero-width joiners,
    variation selectors, skin tone modifiers and combining characters are
    folded into the `mods` of the cluster they belong to; every other
    character starts a new cluster. Zero-width characters that are not part
    of a cluster are dropped unless they are listed in `VALID_ZWC`.

    Args:
        string: The string to parse.

    Returns:
        A list of `Grapheme` objects parsed from the input string, in input
        order. A cluster that is still open when the input ends (a trailing
        emoji, or a base character followed by combining marks) is included.
    """

    def _append_cell(cell: Grapheme, cells: list[Grapheme]) -> Grapheme:
        """Store the finished cell and hand back a fresh, empty one."""
        cells.append(cell)
        return Grapheme()

    def _strip_if_invalid(last: Grapheme) -> None:
        """Reduce an emoji cluster to its base character if it is invalid."""
        if not is_emoji(str(last).rstrip().rstrip(EMOJI_VS)):
            log.debug(f"invalid emoji: {last!r}")
            # strip all but base emoji character if invalid
            last.mods = ""

    cell = Grapheme()
    cells: list[Grapheme] = []
    # True right after a ZWJ: the next character is glued onto the cluster
    joined = False
    # True while `cell` is an emoji cluster that may still take modifiers
    was_emoji = False

    for c in string:
        if c == ZWJ:
            if was_emoji:
                log.debug("ZWJ")
                cell.mods += c
                joined = True
            else:
                log.debug("unexpected ZWJ")

            continue

        if c == ZWNJ:
            joined = False

            if cell.char != "":
                log.debug("ZWNJ")
                cell.mods += c
                was_emoji = False
            else:
                log.debug("unexpected ZWNJ")

            continue

        if unicodedata.combining(c) != 0:
            if cell.char != "":
                # A cluster is still open (a pending emoji, or a base that
                # was already re-opened by an earlier combining mark). The
                # mark belongs to it; popping from `cells` here would throw
                # the open cluster away and decorate the wrong base.
                log.debug(f"combining character: {'o' + c!r}")
                cell.mods += c
            elif cells:
                # re-open the most recently stored cluster
                log.debug(f"combining character: {'o' + c!r}")
                cell = cells.pop()
                cell.mods += c
            else:
                log.debug("unexpected combining character")

            continue

        if c in VARIATION_SELECTORS:
            if cell.char != "":
                if c == EMOJI_VS:
                    if was_emoji:
                        log.debug("emoji variation selector")
                        cell.mods += c

                        if FORCE_EVS_WIDE and cell.width < 2:
                            log.debug("forced wide")
                            cell.width = 2
                            cell.force_width = True
                    else:
                        log.debug("unexpected emoji variation selector")
                else:
                    log.debug(f"variation selector: {c!r}")
                    cell.mods += c
            else:
                log.debug("unexpected variation selector")

            continue

        if c in SKIN_TONES and was_emoji:
            log.debug(f"skin tone: {c!r}")
            cell.mods += c

            if not is_emoji(str(cell)):
                # separate skin tone modifier from emoji if invalid
                log.debug(f"invalid base: {cell!r}")
                cell.mods = cell.mods[:-1]
                cell = _append_cell(cell, cells)
                cell.char = f"{ZWNJ}{c}"
                cell.width = 2
                cell = _append_cell(cell, cells)
                was_emoji = False

            continue

        if joined:
            log.debug(f"joining: {c!r}")
            cell.mods += c
            joined = False
            continue

        # `c` starts a new cluster: store whatever cluster is still open
        if cell.char != "":
            cell = _append_cell(cell, cells)

        if unicodedata.east_asian_width(c) == "W":
            log.debug("wide")
            cell.width = 2
        else:
            cell.width = wcswidth(c)

        # assume the terminal will cause a problematic visual column offset when
        # displaying emoji that are (incorrectly) labeled as Narrow
        # NOTE(review): `was_emoji` still describes the previous cluster at
        # this point, and `not cell.width` only matches a width of 0 --
        # confirm this is the intended trigger for ASSUME_WIDE.
        if ASSUME_WIDE and was_emoji and not cell.width:
            log.debug("assumed wide")
            cell.width = 2
            cell.force_width = True

        if is_emoji(c):
            log.debug(f"emoji: {c!r}")
            cell.char = c
            was_emoji = True
            continue

        if was_emoji:
            log.debug("end emoji")
            # the emoji cluster that just ended was stored a few lines above
            _strip_if_invalid(cells[-1])
            was_emoji = False

        if wcswidth(c) < 1:
            hexstr = "0x%04X" % ord(c)

            if c not in VALID_ZWC:
                log.debug(f"stripping ZWC: {hexstr}")
                continue

            log.debug(f"ZWC: {hexstr}")

        cell.char = c
        cell = _append_cell(cell, cells)

    # Flush the cluster that is still open when the input ends; without this
    # a trailing emoji or a trailing base + combining mark would be lost.
    if cell.char != "":
        if was_emoji:
            log.debug("end emoji")
            _strip_if_invalid(cell)

        cells.append(cell)

    return cells
# Demonstration input: one sample per line, each followed by a short label.
# NOTE(review): the emoji literals below appear mis-encoded in this copy of
# the file -- restore them from the original before relying on the output.
data = f"""
emoji:
------
๐จ - standard wide
โ{EMOJI_VS} - variation-selected narrow
๐๐พ - skin tone ๐{ZWNJ}๐พ
๐งโ๐ป - zero-width-joiner ๐ง๐ป
๐จโ๐ฉโ๐ง - multiple zwj ๐จ๐ฉ๐ง
๐งโโ๏ธ - zwj + evs narrow ๐งโ{EMOJI_VS}
๐ง๐ผโ๐ - skin tone ๐ง{ZWNJ}๐ผ, zwj ๐
๐ฎ๐ฟโโ๏ธ - skin tone ๐ฉ{ZWNJ}๐ฟ, zwj + evs narrow ๐ฎโ{EMOJI_VS}
๐ญ{ZWJ}๐ - invalid zwj
๐ญ๐ฟ invalid skin tone combo
combining characters:
---------------------
a - standard narrow
aฬ - narrow + combining character
"""

# Run the demo only when executed as a script, so that importing this module
# for `graphemes()` does not parse and print the sample text as a side effect.
if __name__ == "__main__":
    grapheme_list = graphemes(data)

    for g in grapheme_list:
        print(str(g), end="")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment