# coding: utf-8

# Investigating emoji handling in wxPython.
# See: https://github.com/openstenoproject/plover/pull/370

import sys
import unittest

import wx

# 2016-02-25T22:14:14Z-0500: @jeremy-w:
#
# This is an insurmountable bug in the wchar_t support in narrow builds
# of Python 2.
#
# When we unicodify a utf-8 encoded emoji character, it does end up
# correctly converted to utf-16-le for the PyUnicode, but the
# unicode-to-wchar conversion is not surrogate-pair aware - it treats the
# unicode buffer as uniformly 16 bits per code point - so it produces a
# bogus UTF-32-LE value in the wchar_t buffer, which is what Cocoa
# chokes on.
#
# `PyUnicode_AsWideChar()` doesn't get called as such.
# Instead, what's called is `PyUnicodeUCS2_AsWideChar()`.
# At the start of that function:
#   rdi is the PyUnicode
#     to print its size (matches UTF-8 strlen): x/1u "$rdi + 0x10"
#     to print its contents:
#     - a buffer pointer is at $rdi+0x18
#       print the address with x/xg $rdi+0x18, then x/{{SIZE*2}}cb that address,
#       and note it's UTF-16-LE, which matches the UCS2 bit.
#   rsi is the target buffer pointer
#   rdx is the target buffer size
#
# And sure enough, its inner loop is basically:
#
#     const char *pybuf = ...;
#     char *wbuf = ...;
#     for (size_t i = 0; i < PyUnicode_SIZE(mumble); i += 2) {
#         wbuf[0] = pybuf[0];
#         wbuf[1] = pybuf[1];
#         pybuf += 2;
#         wbuf += 4;
#     }
#
# And since wbuf was zeroed to begin with, that puts each UCS-2 16-bit unit
# into the first half of a 32-bit wchar_t slot and leaves 16 bits of zeroes
# after it, which is exactly what we see.
#
# Of course: UCS-2, not UTF-16-LE in truth, so no support for surrogate pairs.
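
# A minimal pure-Python sketch of that broken conversion, for illustration
# only: the real bug is in C, and `broken_as_wide_char` is a made-up name.
# It zero-extends each UTF-16-LE code unit into a 4-byte wchar_t slot with
# no surrogate-pair awareness, exactly like the inner loop above.
def broken_as_wide_char(utf16le_bytes):
    out = b''
    for i in range(0, len(utf16le_bytes), 2):
        # Copy 2 data bytes, then leave 2 zero bytes of wchar_t padding.
        out += utf16le_bytes[i:i + 2] + b'\x00\x00'
    return out

# >>> broken_as_wide_char(u'\U0001F600'.encode('utf-16-le')).encode('hex')
# '3dd8000000de0000'
# ...which is byte-for-byte the bogus buffer Cocoa chokes on (see the lldb
# dump further down).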

# 2016-02-18T00:57:42Z-0500: @jeremy-w:
# It looks like a core Python function that's supposed to spit out wchar_t from
# a Python Unicode object isn't doing what wxPython thinks it does. wxWidgets
# expects its wchar_t to be utf-32-native, but the Python function is spitting
# out each utf-16-le half-word stuffed in the first two bytes of an otherwise
# zeroed word.
#
# - wxWidgets ignores any encoding we set, because it's compiled to use
#   wchar_t as storage for wxString.
# - Changing this to UTF-8 when building wxWidgets might fix it.
#   We want -DwxUSE_UNICODE but NOT -DwxUSE_UNICODE_WCHAR, so it uses UTF-8.
# - As-is, wxPython just uses the PyDefaultEncoding to convert str->unicode,
#   after which it calls out to PyUnicode_AsWideChar.
# - AFAICT, PyUnicode_AsWideChar is the step bungling the encoding somehow!
#   (Not PyUnicode_FromEncodedObject - that only handles the str->unicode
#   conversion.)
#   We pass in U+1F600. Plain .encode('utf-32-le') gives 00 F6 01 00;
#   but the wide-char conversion is spitting out some bungled utf-16-le crud:
#     0x3d 0xd8 0x00 0x00 0x00 0xde 0x00 0x00
#   - compare utf-16-le: 3D D8 00 DE
#   - This has added some padding zeros, so [3D D8 00 00] [00 DE 00 00] -
#     it's just utf-16-le with each utf-16 unit stuffed in the first
#     half-word of a 32-bit slot and zeros filling the rest!
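
# Putting those two steps together with the broken_as_wide_char sketch from
# above (illustrative only, not the real C code path):
# >>> s = '\xf0\x9f\x98\x80'               # utf-8 str handed to wxPython
# >>> u = s.decode('utf-8')                # step 1: str -> unicode, works fine
# >>> u.encode('utf-16-le').encode('hex')  # the narrow build's internal units
# '3dd800de'
# >>> broken_as_wide_char(u.encode('utf-16-le')).encode('hex')  # step 2, broken
# '3dd8000000de0000'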

# Reference files:
# WXWIDGETS
# https://github.com/wxWidgets/wxWidgets/blob/4acb4cf4769232ca82187e8f848f95120b7edd8d/src/osx/core/cfstring.cpp#L598-L632
# https://github.com/wxWidgets/wxWidgets/blob/master/src/osx/cocoa/stattext.mm#L94-L102
# https://github.com/wxWidgets/wxWidgets/blob/master/src/osx/stattext_osx.cpp#L108-L112
# https://github.com/wxWidgets/wxWidgets/blob/4acb4cf4769232ca82187e8f848f95120b7edd8d/docs/doxygen/images/overview_wxstring_encoding.png
#
# WXPYTHON
# https://github.com/wxWidgets/wxPython/blob/master/src/helpers.cpp#L2189

EMOJI = u"😀"  # U+1F600
# >>> EMOJI.encode('utf-8')
# '\xf0\x9f\x98\x80'
# >>> EMOJI.encode('utf-16')
# '\xff\xfe=\xd8\x00\xde'
# NOTE: = is \x3d
# >>> EMOJI.encode('utf-16-le')
# '\x3d\xd8\x00\xde'
# >>> EMOJI.encode('utf-32')  # apparently 32-le
# '\xff\xfe\x00\x00\x00\xf6\x01\x00'
# >>> EMOJI.encode('utf-32-le')
# '\x00\xf6\x01\x00'
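
# On a narrow build, the emoji is stored as a utf-16 surrogate pair, which is
# visible from plain Python (a wide build reports a length of 1 instead):
# >>> len(EMOJI)
# 2
# >>> EMOJI[0], EMOJI[1]
# (u'\ud83d', u'\ude00')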

#wx.SetDefaultPyEncoding('utf-32')
# UnicodeDecodeError: 'utf32' codec can't decode bytes in position 0-3: code point not in range(0x110000)
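
# That error makes sense if the label reaches wxPython as the four utf-8
# bytes: decoding all four of them as one utf-32 code point yields 0x80989ff0,
# which is past the 0x110000 limit. Reproducible without wx:
# >>> '\xf0\x9f\x98\x80'.decode('utf-32-le')
# UnicodeDecodeError: 'utf32' codec can't decode bytes in position 0-3: code point not in range(0x110000)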

app = wx.App(False)  # Create a new app; don't redirect stdout/stderr to a window.
print >>sys.stderr, "\n\n\n*** Creating Hello World frame ***\n\n"
frame = wx.Frame(None, wx.ID_ANY, "Hello World")  # A Frame is a top-level window.
print >>sys.stderr, "\n\n\n*** Created Hello World frame ***\n\n"
#frame.Show(True)

# Encoding to UTF-16 gives:
#   Traceback (most recent call last):
#     File "plover/gui/test_gui.py", line 15, in test_emoji_label_should_not_crash_process
#       label = wx.StaticText(parent=frame, label=encoded_text)
#     File "/usr/local/lib/python2.7/site-packages/wx-3.0-osx_cocoa/wx/_controls.py", line 997, in __init__
#       _controls_.StaticText_swiginit(self,_controls_.new_StaticText(*args, **kwargs))
#     File "/usr/local/Cellar/python/2.7.11/Frameworks/Python.framework/Versions/2.7/lib/python2.7/encodings/utf_8.py", line 16, in decode
#       return codecs.utf_8_decode(input, errors, True)
#   UnicodeDecodeError: 'utf8' codec can't decode byte 0xff in position 0: invalid start byte
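
# That decode is wxPython turning the str back into unicode with the default
# encoding (utf-8); a utf-16 BOM starts with 0xff, which can never appear in
# valid utf-8, so it dies immediately:
# >>> '\xff\xfe=\xd8\x00\xde'.decode('utf-8')
# UnicodeDecodeError: 'utf8' codec can't decode byte 0xff in position 0: invalid start byte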

# Encoding as UTF-8 dies in wxWidgets code trying to make an NSString.
# _ZN21wxStaticTextCocoaImpl8SetLabelERK8wxString14wxFontEncoding is promising
#   ^ break on this, `c` (continue) twice, and you're there
#
# so is wxCFStringRef::wxCFStringRef(wxString const&, wxFontEncoding)
#   wxFONTENCODING_UTF8 is 43, or 0x2b
#   the wxCFStr it creates comes back nil! WTH?
#   how do we tell if this is a unicode build of wx or what?
#   stepped in, I see wchar_t noise:
#     -> 0x1036c0288 <+48>: callq 0x1036eaf32 ; symbol stub for: std::__1::basic_string<wchar_t, std::__1::char_traits<wchar_t>, std::__1::allocator<wchar_t> >::basic_string(std::__1::basic_string<wchar_t, std::__1::char_traits<wchar_t>, std::__1::allocator<wchar_t> > const&)
#   and we call CFStringCreateWithBytes(), so this is the UNICODE_WCHAR flavor?
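
# One way to answer the unicode-build question from the Python side:
# wx.PlatformInfo is a tuple of build traits (exact contents vary by build;
# a unicode build includes the 'unicode' entry):
# >>> wx.PlatformInfo
# ('__WXMAC__', 'wxOSX', 'wxOSX-cocoa', 'unicode', ...)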

# Decoding the cfencoding constants (kCFStringEncodingUTF8 and
# kCFStringEncodingUTF32, respectively):
# >>> "utf-8: " + hex(134217984) + "; utf-32 native: " + hex(201326848)
# 'utf-8: 0x8000100; utf-32 native: 0xc000100'

# reset( CFStringCreateWithBytes( kCFAllocatorDefault,
#     (const UInt8*)data, size, cfencoding, false /* no BOM */ ) );
#   rdi: allocator
#   rsi: data
#   rdx: size
#   rcx: encoding
#   r8:  false
#
# Using utf8 encoding and fontencoding utf8:
#   What is the size? 0x8 (reg read rdx)
#   What are the bytes in data? (lldb) memory read -f x -s 1 -c 8 $rsi
#     0x7fff5fbfc784: 0x3d 0xd8 0x00 0x00 0x00 0xde 0x00 0x00
#   It's a messed-up utf-16-le encoding: proper utf-16-le is 0x3d 0xd8 0x00 0xde.
#   (The shorter GDB style also works: x/8xb $rsi)
#   What is the encoding? 0x000000001c000100 (reg read rcx)
#     kCFStringEncodingUTF32LE = 0x1c000100 /* kTextEncodingUnicodeDefault + kUnicodeUTF32LEFormat */
# How on earth…? This is the obnoxious wxUSE_UNICODE + wxUSE_UNICODE_WCHAR
# branch. It overrides our encoding info. The native buffer backing it is
# expected to be all wchar_t, in wxstr.data().
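
# For comparison, a correct conversion would have handed CFStringCreateWithBytes
# 4 bytes of UTF-32-LE for the single code point, not the 8 bogus ones above:
# >>> u'\U0001F600'.encode('utf-32-le').encode('hex')
# '00f60100'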

# Stuff goes wrong before we even get our wxString. It gets created with bogus
# data AFAICT. Silly wchar_t.
#
# * thread #1: tid = 0x2a359f8, 0x000000010313764a libwx_osx_cocoau_core-3.0.dylib`wxStaticTextCocoaImpl::SetLabel(wxString const&, wxFontEncoding), queue = 'com.apple.main-thread', stop reason = breakpoint 1.1
#   * frame #0: 0x000000010313764a libwx_osx_cocoau_core-3.0.dylib`wxStaticTextCocoaImpl::SetLabel(wxString const&, wxFontEncoding)
#     frame #1: 0x00000001030ccd0f libwx_osx_cocoau_core-3.0.dylib`wxStaticText::DoSetLabel(wxString const&) + 127
#     frame #2: 0x00000001030ccb75 libwx_osx_cocoau_core-3.0.dylib`wxStaticText::SetLabel(wxString const&) + 117
#     frame #3: 0x0000000102c239ac _core_.so`_wrap_Window_SetLabel(_object*, _object*, _object*) + 197
#
# That frame #3 swig wrapper - already bogus? It calls
# wxString_in_helper(_object*) - suspicious.
#   dummy plugin definition: https://github.com/wxWidgets/wxPython/blob/14476d72d92c44624d5754c4f1fac2e8d7bc30da/include/wx/wxPython/wxPython.h#L70
#   definition: https://github.com/wxWidgets/wxPython/blob/master/src/helpers.cpp#L2189
#   It comes down to: PyUnicode_AsWideChar + PyUnicode_GET_SIZE().
#   The default encoding only comes into play going str->unicode prior to this.
#   >>> is this a Python bug around wchar_t handling?! (See the ctypes sketch
#   below for a wx-free reproduction.)
#
# Ooh, SetDefaultPyEncoding(): https://github.com/wxWidgets/wxPython/blob/master/src/osx_cocoa/_core.py#L8429
# Controls how it goes Unicode <=> wxString
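
# If it is a Python bug, it should reproduce with no wx involved at all.
# A sketch via ctypes, ASSUMING ctypes funnels unicode through the same
# PyUnicode_AsWideChar conversion (`as_wide_char_bytes` is a made-up name):
import ctypes

def as_wide_char_bytes(u):
    # Build a wchar_t buffer from `u` and dump its raw bytes; on a narrow
    # build with 4-byte wchar_t this exercises PyUnicodeUCS2_AsWideChar.
    buf = ctypes.create_unicode_buffer(u, len(u) + 1)
    return ctypes.string_at(ctypes.addressof(buf),
                            len(u) * ctypes.sizeof(ctypes.c_wchar))

# >>> as_wide_char_bytes(u'\U0001F600').encode('hex')
# '3dd8000000de0000'   <- narrow build (the bug); a wide build gives '00f60100'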

class EncodingTests(unittest.TestCase):

    def test_emoji_label_should_not_crash_process(self):
        text = EMOJI
        encoded_text = text  # text.encode('utf8')
        print >>sys.stderr, "\n\n\nCreating static text with empty label.\n\n"
        label = wx.StaticText(parent=frame, label='')
        print >>sys.stderr, "\n\n\nWe gots a label.\n\n"
        # This encoding bit just gets overruled by the wxUSE_UNICODE_WCHAR
        # version of the code. Not the compiler flag I would have chosen…
        # Especially since wxPython seems to require we route stuff through as
        # utf8?
        #font = label.GetFont()
        #font.SetEncoding(wx.FONTENCODING_UTF8)
        #label.SetFont(font)
        print >>sys.stderr, "\n\n\n***\nfont is", label.GetFont(), "encoding is", label.GetFont().GetEncoding(), "\n***\n\n\n"
        label.SetLabel(encoded_text)
        frame.Show(True)

if __name__ == '__main__':
    unittest.main()