Skip to content

Instantly share code, notes, and snippets.

@methane
Created August 17, 2015 06:40
Show Gist options
  • Save methane/2376ac5d20642c05a8b6 to your computer and use it in GitHub Desktop.
Save methane/2376ac5d20642c05a8b6 to your computer and use it in GitHub Desktop.
Faster ascii w/ surrogateescape decode
import timeit
# Both payloads are 1024 bytes long, so the three timings are comparable.
data1 = bytes(range(128)) * 8  # only values 0..127: nothing for a handler to do
data2 = bytes(range(256)) * 4  # values 0..255: half of the bytes are >= 0x80


def decode_ascii_strict():
    """Decode the 7-bit payload as ASCII with the default error handling."""
    data1.decode('ascii')


def decode_ascii_surrogateescape1():
    """Decode the 7-bit payload as ASCII with 'surrogateescape' requested."""
    data1.decode('ascii', 'surrogateescape')


def decode_ascii_surrogateescape2():
    """Decode the 0..255 payload as ASCII with 'surrogateescape' requested."""
    data2.decode('ascii', 'surrogateescape')


def main(number=1000):
    """Time the three decode scenarios and print one line per scenario.

    Each printed value is what ``timeit.timeit`` returns for ``number``
    calls of the corresponding function.

    number -- how many times each scenario is executed (default 1000,
              the value the benchmark originally hard-coded).
    """
    print("strict:",
          timeit.timeit(decode_ascii_strict, number=number))
    print("ascii + surrogateescape:",
          timeit.timeit(decode_ascii_surrogateescape1, number=number))
    print("binary + surrogateescape:",
          timeit.timeit(decode_ascii_surrogateescape2, number=number))


if __name__ == '__main__':
    main()
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c
index e28bae4..507344e 100644
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@@ -3979,6 +3979,23 @@ PyUnicode_GetDefaultEncoding(void)
return "utf-8";
}
+#define _Py_CODEC_ERROR_UNKNOWN 0 /* handler name has not been classified yet */
+#define _Py_CODEC_ERROR_SURROGATEPASS 1 /* errors is "surrogatepass" */
+#define _Py_CODEC_ERROR_SURROGATEESCAPE 2 /* errors is "surrogateescape" */
+#define _Py_CODEC_ERROR_OTHER -1 /* errors is NULL or any other name */
+/* Classify an error-handler name as one of the _Py_CODEC_ERROR_* codes; never yields _Py_CODEC_ERROR_UNKNOWN. */
+static int
+detect_standard_errorhandler(const char *errors) /* errors: handler name, may be NULL */
+{
+ if (errors == NULL)
+ return _Py_CODEC_ERROR_OTHER; /* NULL is treated as strict, which has no fast path */
+ if (strcmp(errors, "surrogatepass") == 0) /* exact, case-sensitive comparison */
+ return _Py_CODEC_ERROR_SURROGATEPASS;
+ if (strcmp(errors, "surrogateescape") == 0)
+ return _Py_CODEC_ERROR_SURROGATEESCAPE;
+ return _Py_CODEC_ERROR_OTHER; /* every other name, including a literal "strict" */
+}
+
/* create or adjust a UnicodeDecodeError */
static void
make_decode_exception(PyObject **exceptionObject,
@@ -6729,6 +6746,7 @@ PyUnicode_DecodeASCII(const char *s,
const char *e;
PyObject *errorHandler = NULL;
PyObject *exc = NULL;
+ int errorType = _Py_CODEC_ERROR_UNKNOWN;
if (size == 0)
_Py_RETURN_UNICODE_EMPTY();
@@ -6759,6 +6777,23 @@ PyUnicode_DecodeASCII(const char *s,
++s;
}
else {
+ if (errorType == _Py_CODEC_ERROR_UNKNOWN) { /* classify the handler only once per call */
+ errorType = detect_standard_errorhandler(errors);
+ if (errorType == _Py_CODEC_ERROR_SURROGATEESCAPE &&
+ kind < PyUnicode_2BYTE_KIND) { /* c + 0xdc00 is above 0xff, so 1-byte storage is too narrow */
+ if (_PyUnicodeWriter_Prepare(&writer, size - writer.pos, 0xffff) < 0)
+ goto onError; /* was "return NULL", which skipped cleanup; NOTE(review): onError is the function's cleanup label outside this hunk - confirm */
+ kind = writer.kind; /* refresh the cached kind/data after re-preparing the writer */
+ data = writer.data;
+ }
+ }
+ if (errorType == _Py_CODEC_ERROR_SURROGATEESCAPE) {
+ PyUnicode_WRITE(kind, data, writer.pos, c + 0xdc00); /* store the byte as code point 0xdc00 + c */
+ writer.pos++;
+ ++s;
+ continue; /* byte handled inline; skip the generic error-handler call below */
+ }
+
startinpos = s-starts;
endinpos = startinpos + 1;
if (unicode_decode_call_errorhandler_writer(

Python 3.4 tip

without patch:

$ ./python.exe decodebench.py
strict: 0.0005547050022869371
ascii + surrogateescape: 0.0006213680026121438
binary + surrogateescape: 0.18936741600191453

$ ./python.exe decodebench.py
strict: 0.0004597200022544712
ascii + surrogateescape: 0.0005151459990884177
binary + surrogateescape: 0.19078156799514545

$ ./python.exe decodebench.py
strict: 0.00045938100083731115
ascii + surrogateescape: 0.0005216380013735034
binary + surrogateescape: 0.19628148400079226

with patch:

$ ./python.exe decodebench.py
strict: 0.0005446709983516484
ascii + surrogateescape: 0.0006286909992923029
binary + surrogateescape: 0.002856942002836149

$ ./python.exe decodebench.py
strict: 0.0004874210062553175
ascii + surrogateescape: 0.0005456319995573722
binary + surrogateescape: 0.002613579999888316

$ ./python.exe decodebench.py
strict: 0.00047337100113509223
ascii + surrogateescape: 0.000526505995367188
binary + surrogateescape: 0.0024735779952607118
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment