methane/decodebench.py Secret

## decodebench.py
import timeit


# 1024 bytes of pure 7-bit input (0x00-0x7f, eight times over).
data1 = 8 * bytes(range(0x80))
# 1024 bytes cycling through every byte value (0x00-0xff, four times over).
data2 = 4 * bytes(range(0x100))


def decode_ascii_strict() -> None:
    """Benchmark case: decode ``data1`` as ASCII with default error handling.

    The decoded string is discarded; only the cost of the call is of interest.
    """
    data1.decode('ascii')


def decode_ascii_surrogateescape1() -> None:
    """Benchmark case: decode ``data1`` as ASCII with ``surrogateescape``.

    ``data1`` is built from ``range(128)`` only, so every byte is valid ASCII
    and this measures the cost of merely requesting the error handler.
    The decoded string is discarded.
    """
    data1.decode('ascii', 'surrogateescape')


def decode_ascii_surrogateescape2() -> None:
    """Benchmark case: decode ``data2`` as ASCII with ``surrogateescape``.

    ``data2`` is built from ``range(256)``, so half of its bytes are outside
    the ASCII range and must go through the ``surrogateescape`` handling.
    The decoded string is discarded.
    """
    data2.decode('ascii', 'surrogateescape')


def main(number=1000):
    """Time every decode benchmark and print one result line per case.

    Args:
        number: How many times ``timeit`` invokes each benchmark function.
            Defaults to 1000, the count this script has always used.

    Each printed figure is the value returned by ``timeit.timeit``: the
    total time, in seconds, taken by ``number`` calls of that benchmark.
    """
    # Labels are printed verbatim; keep them stable so results stay
    # comparable with previously recorded runs.
    cases = (
        ("strict:", decode_ascii_strict),
        ("ascii + surrogateescape:", decode_ascii_surrogateescape1),
        ("binary + surrogateescape:", decode_ascii_surrogateescape2),
    )
    for label, func in cases:
        print(label, timeit.timeit(func, number=number))


# Run the benchmarks only when executed as a script, not when imported.
if __name__ == "__main__":
    main()

## faster-decode-ascii-surrogateescape.patch
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c
index e28bae4..507344e 100644
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@@ -3979,6 +3979,23 @@ PyUnicode_GetDefaultEncoding(void)
     return "utf-8";
 }

+#define _Py_CODEC_ERROR_UNKNOWN 0
+#define _Py_CODEC_ERROR_SURROGATEPASS 1
+#define _Py_CODEC_ERROR_SURROGATEESCAPE 2
+#define _Py_CODEC_ERROR_OTHER -1
+
+static int
+detect_standard_errorhandler(const char *errors)
+{
+    if (errors == NULL)
+        return _Py_CODEC_ERROR_OTHER;  /* NULL is the default "strict" handler */
+    if (strcmp(errors, "surrogatepass") == 0)
+        return _Py_CODEC_ERROR_SURROGATEPASS;
+    if (strcmp(errors, "surrogateescape") == 0)
+        return _Py_CODEC_ERROR_SURROGATEESCAPE;
+    return _Py_CODEC_ERROR_OTHER;
+}
+
 /* create or adjust a UnicodeDecodeError */
 static void
 make_decode_exception(PyObject **exceptionObject,
@@ -6729,6 +6746,7 @@ PyUnicode_DecodeASCII(const char *s,
     const char *e;
     PyObject *errorHandler = NULL;
     PyObject *exc = NULL;
+    int errorType = _Py_CODEC_ERROR_UNKNOWN;

     if (size == 0)
         _Py_RETURN_UNICODE_EMPTY();
@@ -6759,6 +6777,23 @@ PyUnicode_DecodeASCII(const char *s,
             ++s;
         }
         else {
+            if (errorType == _Py_CODEC_ERROR_UNKNOWN) {
+                errorType = detect_standard_errorhandler(errors);
+                if (errorType == _Py_CODEC_ERROR_SURROGATEESCAPE &&
+                    kind < PyUnicode_2BYTE_KIND) {
+                    if (_PyUnicodeWriter_Prepare(&writer, size - writer.pos, 0xffff) < 0)
+                        return NULL;
+                    kind = writer.kind;
+                    data = writer.data;
+                }
+            }
+            if (errorType == _Py_CODEC_ERROR_SURROGATEESCAPE) {
+                PyUnicode_WRITE(kind, data, writer.pos, c + 0xdc00);
+                writer.pos++;
+                ++s;
+                continue;
+            }
+
             startinpos = s-starts;
             endinpos = startinpos + 1;
             if (unicode_decode_call_errorhandler_writer(

## result.md

      
    Raw
  

              result.md
            
          
    Python 3.4 tip (each figure below is the total time in seconds for 1000 calls, as printed by decodebench.py)
without patch:
$ ./python.exe decodebench.py
strict: 0.0005547050022869371
ascii + surrogateescape: 0.0006213680026121438
binary + surrogateescape: 0.18936741600191453

$ ./python.exe decodebench.py
strict: 0.0004597200022544712
ascii + surrogateescape: 0.0005151459990884177
binary + surrogateescape: 0.19078156799514545

$ ./python.exe decodebench.py
strict: 0.00045938100083731115
ascii + surrogateescape: 0.0005216380013735034
binary + surrogateescape: 0.19628148400079226

with patch:
$ ./python.exe decodebench.py
strict: 0.0005446709983516484
ascii + surrogateescape: 0.0006286909992923029
binary + surrogateescape: 0.002856942002836149

$ ./python.exe decodebench.py
strict: 0.0004874210062553175
ascii + surrogateescape: 0.0005456319995573722
binary + surrogateescape: 0.002613579999888316

$ ./python.exe decodebench.py
strict: 0.00047337100113509223
ascii + surrogateescape: 0.000526505995367188
binary + surrogateescape: 0.0024735779952607118
	import timeit


	# 1024 bytes of pure 7-bit input (0x00-0x7f repeated).
	data1 = bytes(range(128)) * 8
	# 1024 bytes cycling through every byte value (0x00-0xff repeated).
	data2 = bytes(range(256)) * 4


	def decode_ascii_strict():
	    """Decode the all-ASCII input with default error handling."""
	    data1.decode('ascii')


	def decode_ascii_surrogateescape1():
	    """Decode the all-ASCII input while requesting ``surrogateescape``."""
	    data1.decode('ascii', 'surrogateescape')


	def decode_ascii_surrogateescape2():
	    """Decode input containing non-ASCII bytes with ``surrogateescape``."""
	    data2.decode('ascii', 'surrogateescape')


	def main():
	    """Run each benchmark 1000 times and print its total time."""
	    print("strict:",
	          timeit.timeit(decode_ascii_strict, number=1000))
	    print("ascii + surrogateescape:",
	          timeit.timeit(decode_ascii_surrogateescape1, number=1000))
	    print("binary + surrogateescape:",
	          timeit.timeit(decode_ascii_surrogateescape2, number=1000))


	if __name__ == '__main__':
	    main()
	diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c
	index e28bae4..507344e 100644
	--- a/Objects/unicodeobject.c
	+++ b/Objects/unicodeobject.c
	@@ -3979,6 +3979,23 @@ PyUnicode_GetDefaultEncoding(void)
	return "utf-8";
	}

	+#define _Py_CODEC_ERROR_UNKNOWN 0
	+#define _Py_CODEC_ERROR_SURROGATEPASS 1
	+#define _Py_CODEC_ERROR_SURROGATEESCAPE 2
	+#define _Py_CODEC_ERROR_OTHER -1
	+
	+static int
	+detect_standard_errorhandler(const char *errors)
	+{
	+ if (errors == NULL)
	+ return _Py_CODEC_ERROR_OTHER; /* NULL is the default "strict" handler */
	+ if (strcmp(errors, "surrogatepass") == 0)
	+ return _Py_CODEC_ERROR_SURROGATEPASS;
	+ if (strcmp(errors, "surrogateescape") == 0)
	+ return _Py_CODEC_ERROR_SURROGATEESCAPE;
	+ return _Py_CODEC_ERROR_OTHER;
	+}
	+
	/* create or adjust a UnicodeDecodeError */
	static void
	make_decode_exception(PyObject **exceptionObject,
	@@ -6729,6 +6746,7 @@ PyUnicode_DecodeASCII(const char *s,
	const char *e;
	PyObject *errorHandler = NULL;
	PyObject *exc = NULL;
	+ int errorType = _Py_CODEC_ERROR_UNKNOWN;

	if (size == 0)
	_Py_RETURN_UNICODE_EMPTY();
	@@ -6759,6 +6777,23 @@ PyUnicode_DecodeASCII(const char *s,
	++s;
	}
	else {
	+ if (errorType == _Py_CODEC_ERROR_UNKNOWN) {
	+ errorType = detect_standard_errorhandler(errors);
	+ if (errorType == _Py_CODEC_ERROR_SURROGATEESCAPE &&
	+ kind < PyUnicode_2BYTE_KIND) {
	+ if (_PyUnicodeWriter_Prepare(&writer, size - writer.pos, 0xffff) < 0)
	+ return NULL;
	+ kind = writer.kind;
	+ data = writer.data;
	+ }
	+ }
	+ if (errorType == _Py_CODEC_ERROR_SURROGATEESCAPE) {
	+ PyUnicode_WRITE(kind, data, writer.pos, c + 0xdc00);
	+ writer.pos++;
	+ ++s;
	+ continue;
	+ }
	+
	startinpos = s-starts;
	endinpos = startinpos + 1;
	if (unicode_decode_call_errorhandler_writer(