Skip to content

Instantly share code, notes, and snippets.

@haliphax
Last active March 13, 2023 19:24
Show Gist options
  • Save haliphax/dd5887b0ab043a05ae54cf0155a3b218 to your computer and use it in GitHub Desktop.
Save haliphax/dd5887b0ab043a05ae54cf0155a3b218 to your computer and use it in GitHub Desktop.
Split a string into grapheme clusters
"""Unicode utility for splitting a string into grapheme clusters"""
# stdlib
import logging
from os import environ
from sys import stdout
import unicodedata
# 3rd party
from emoji import is_emoji
from wcwidth import wcswidth
ZWNJ = "\u200c"
"""Zero-width non-joiner; blocks next grapheme from joining with cluster"""
ZWJ = "\u200d"
"""Zero-width joiner; joins next grapheme with cluster"""
VARIATION_SELECTORS = {chr(c) for c in range(0xfe00, 0xfe0f + 1)}
"""Variation selectors; modify display of previous grapheme"""
EMOJI_VS = "\ufe0f"
"""'Emoji type' variation selector; display previous character as emoji"""
SKIN_TONES = {chr(c) for c in range(0x1f3fb, 0x1f3ff + 1)}
"""Skin tone modifiers; modifies previous emoji with no ZWJ between"""
ASSUME_WIDE = False
"""
Whether to assume that all emoji matched by `is_emoji()` are Wide, even if their
`unicodedata.east_asian_width()` result is Narrow
"""
FORCE_EVS_WIDE = True
"""
Whether Narrow emoji which are joined by the Emoji Variation Selector `\\ufe0f`
will be forcibly labeled as Wide for the purposes of column offset compensation
(e.g. variation-selected male/female symbol emoji)
"""
VALID_ZWC = {"\n"}
"""Unjoined zero-width characters that will not be stripped during parsing"""
log = logging.getLogger(__name__)
# The `loglevel` environment variable selects the verbosity by level name.
# Normalise its case and fall back to INFO so that a lower-case or unknown
# value cannot crash the module at import time.
_level = getattr(logging, environ.get("loglevel", "INFO").upper(), None)
log.setLevel(_level if isinstance(_level, int) else logging.INFO)
streamHandler = logging.StreamHandler(stdout)
log.addHandler(streamHandler)
class Grapheme:
    """
    Container for one display cell: a base character plus its modifiers.

    The base character and the sequences that modify it (joiners, variation
    selectors, combining marks, ...) are kept in separate attributes, so a
    caller can emit the base alone on terminals that cannot render the
    modifiers. When `force_width` is set, `str()` appends one space to
    compensate for terminals that draw the cluster narrower than `width`.
    """

    # base character of the cluster
    char: str
    # modifier characters following the base, in input order
    mods: str
    # number of terminal columns attributed to the cluster
    width: int
    # True when `width` was overridden rather than measured
    force_width: bool = False

    def __init__(self, char: str = "", mods: str = "", width: int = 0):
        self.char, self.mods, self.width = char, mods, width

    def _modstr(self, s):
        # printable modifiers are shown as-is, the rest as a hex code point
        if wcswidth(s) > 0:
            return s
        return '0x%04X' % ord(s)

    def __repr__(self):
        mod_labels = [self._modstr(m) for m in self.mods]
        forced = " <forced>" if self.force_width else ""
        return (
            f"Grapheme(char={self.char!r}, "
            f"mods={mod_labels}, "
            f"width={self.width}{forced})"
        )

    def __str__(self):
        padding = " " if self.force_width else ""
        return self.char + self.mods + padding
def graphemes(string: str) -> list[Grapheme]:
    """
    Split a Unicode string into (potentially clustered) graphemes.

    The input is consumed one code point at a time. Joiners, variation
    selectors, skin tone modifiers and combining characters are attached to
    the `mods` of the cluster they modify; every other character starts a
    new cluster. Zero-width characters that are neither joined to a cluster
    nor listed in `VALID_ZWC` are dropped.

    Args:
        string: The string to parse.

    Returns:
        A list of `Grapheme` objects parsed from the input string, including
        a cluster that is still open when the input ends.
    """

    def _append_cell(cell: Grapheme, cells: list[Grapheme]) -> Grapheme:
        """Commit `cell` to `cells` and hand back a fresh, empty cell."""
        cells.append(cell)
        return Grapheme()

    # cluster under construction; it is "pending" whenever `char` is set
    cell = Grapheme()
    cells: list[Grapheme] = []
    # True right after a ZWJ: the next character joins the pending cluster
    joined = False
    # True while the pending cluster is an emoji
    was_emoji = False

    for c in string:
        if c == ZWJ:
            if was_emoji:
                log.debug("ZWJ")
                cell.mods += c
                joined = True
            else:
                log.debug("unexpected ZWJ")
            continue

        if c == ZWNJ:
            joined = False
            if cell.char != "":
                log.debug("ZWNJ")
                cell.mods += c
                was_emoji = False
            else:
                log.debug("unexpected ZWNJ")
            continue

        if unicodedata.combining(c) != 0:
            if cell.char != "":
                # A cluster is still pending (an emoji, or a base reopened by
                # an earlier combining mark): extend it in place. Popping
                # `cells` here would discard the pending cluster and attach
                # the mark to the wrong base.
                log.debug(f"combining character: {'o' + c!r}")
                cell.mods += c
            elif cells:
                # the base was already committed; reopen it
                log.debug(f"combining character: {'o' + c!r}")
                cell = cells.pop()
                cell.mods += c
            else:
                log.debug("unexpected combining character")
            continue

        if c in VARIATION_SELECTORS:
            if cell.char != "":
                if c == EMOJI_VS:
                    if was_emoji:
                        log.debug("emoji variation selector")
                        cell.mods += c
                        if FORCE_EVS_WIDE and cell.width < 2:
                            log.debug("forced wide")
                            cell.width = 2
                            cell.force_width = True
                    else:
                        log.debug("unexpected emoji variation selector")
                else:
                    log.debug(f"variation selector: {c!r}")
                    cell.mods += c
            else:
                log.debug("unexpected variation selector")
            continue

        if c in SKIN_TONES and was_emoji:
            log.debug(f"skin tone: {c!r}")
            cell.mods += c
            if not is_emoji(str(cell)):
                # separate skin tone modifier from emoji if invalid
                log.debug(f"invalid base: {cell!r}")
                cell.mods = cell.mods[:-1]
                cell = _append_cell(cell, cells)
                cell.char = f"{ZWNJ}{c}"
                cell.width = 2
                cell = _append_cell(cell, cells)
                was_emoji = False
            continue

        if joined:
            log.debug(f"joining: {c!r}")
            cell.mods += c
            joined = False
            continue

        # `c` starts a new cluster: commit whatever was pending first
        if cell.char != "":
            cell = _append_cell(cell, cells)

        if unicodedata.east_asian_width(c) == "W":
            log.debug("wide")
            cell.width = 2
        else:
            cell.width = wcswidth(c)

        # assume the terminal will cause a problematic visual column offset when
        # displaying emoji that are (incorrectly) labeled as Narrow
        # NOTE(review): `was_emoji` describes the previous cluster and
        # `not cell.width` only matches a width of 0 -- confirm this is the
        # intended condition for ASSUME_WIDE.
        if ASSUME_WIDE and was_emoji and not cell.width:
            log.debug("assumed wide")
            cell.width = 2
            cell.force_width = True

        if is_emoji(c):
            log.debug(f"emoji: {c!r}")
            cell.char = c
            was_emoji = True
            continue

        if was_emoji:
            log.debug("end emoji")
            last = cells[-1]
            if not is_emoji(str(last).rstrip().rstrip(EMOJI_VS)):
                log.debug(f"invalid emoji: {last!r}")
                # strip all but base emoji character if invalid
                last.mods = ""
            was_emoji = False

        if wcswidth(c) < 1:
            hexstr = "0x%04X" % ord(c)
            if c not in VALID_ZWC:
                log.debug(f"stripping ZWC: {hexstr}")
                continue
            log.debug(f"ZWC: {hexstr}")

        cell.char = c
        cell = _append_cell(cell, cells)

    # Flush the cluster that is still open when the input ends; otherwise a
    # string ending in an emoji or a combining mark loses its last grapheme.
    if cell.char != "":
        cells.append(cell)
        # apply the same validity check a following non-emoji would trigger
        if was_emoji and not is_emoji(str(cell).rstrip().rstrip(EMOJI_VS)):
            log.debug(f"invalid emoji: {cell!r}")
            cell.mods = ""

    return cells
# Demo input covering each cluster type the parser handles. Non-ASCII
# characters are written as escape sequences so the sample stays intact even
# if the source file passes through a transport that mangles its encoding.
data = f"""
emoji:
------
\U0001f3a8 - standard wide
\u2642{EMOJI_VS} - variation-selected narrow
\U0001f44b\U0001f3fe - skin tone \U0001f44b{ZWNJ}\U0001f3fe
\U0001f9d1\u200d\U0001f4bb - zero-width-joiner \U0001f9d1\U0001f4bb
\U0001f468\u200d\U0001f469\u200d\U0001f467 - multiple zwj \U0001f468\U0001f469\U0001f467
\U0001f9d9\u200d\u2642\ufe0f - zwj + evs narrow \U0001f9d9\u2642{EMOJI_VS}
\U0001f9d1\U0001f3fc\u200d\U0001f692 - skin tone \U0001f9d1{ZWNJ}\U0001f3fc, zwj \U0001f692
\U0001f46e\U0001f3ff\u200d\u2640\ufe0f - skin tone \U0001f469{ZWNJ}\U0001f3ff, zwj + evs narrow \U0001f46e\u2640{EMOJI_VS}
\U0001f62d{ZWJ}\U0001f692 - invalid zwj
\U0001f62d\U0001f3ff invalid skin tone combo
combining characters:
---------------------
a - standard narrow
a\u0302 - narrow + combining character
"""
# Parse the sample and echo every cluster back; clusters flagged with
# `force_width` print a trailing space (see `Grapheme.__str__`).
grapheme_list = graphemes(data)
for g in grapheme_list:
    print(str(g), end="")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment