Skip to content

Instantly share code, notes, and snippets.

@masakielastic
Last active December 19, 2015 14:58
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save masakielastic/5973095 to your computer and use it in GitHub Desktop.
Save masakielastic/5973095 to your computer and use it in GitHub Desktop.
This gist is not maintained. See https://github.com/masakielastic/patches/tree/master/php_bugs_65082 for the latest information.
diff --git a/ext/json/json.c b/ext/json/json.c
index 5360841..1f2ede1 100644
--- a/ext/json/json.c
+++ b/ext/json/json.c
@@ -103,6 +103,8 @@ static PHP_MINIT_FUNCTION(json)
REGISTER_LONG_CONSTANT("JSON_PRETTY_PRINT", PHP_JSON_PRETTY_PRINT, CONST_CS | CONST_PERSISTENT);
REGISTER_LONG_CONSTANT("JSON_UNESCAPED_UNICODE", PHP_JSON_UNESCAPED_UNICODE, CONST_CS | CONST_PERSISTENT);
REGISTER_LONG_CONSTANT("JSON_PARTIAL_OUTPUT_ON_ERROR", PHP_JSON_PARTIAL_OUTPUT_ON_ERROR, CONST_CS | CONST_PERSISTENT);
+ REGISTER_LONG_CONSTANT("JSON_NOTUTF8_SUBSTITUTE", PHP_JSON_NOTUTF8_SUBSTITUTE, CONST_CS | CONST_PERSISTENT);
+ REGISTER_LONG_CONSTANT("JSON_NOTUTF8_IGNORE", PHP_JSON_NOTUTF8_IGNORE, CONST_CS | CONST_PERSISTENT);
REGISTER_LONG_CONSTANT("JSON_ERROR_NONE", PHP_JSON_ERROR_NONE, CONST_CS | CONST_PERSISTENT);
REGISTER_LONG_CONSTANT("JSON_ERROR_DEPTH", PHP_JSON_ERROR_DEPTH, CONST_CS | CONST_PERSISTENT);
@@ -358,47 +360,44 @@ static void json_encode_array(smart_str *buf, zval **val, int options TSRMLS_DC)
}
/* }}} */
-static int json_utf8_to_utf16(unsigned short *utf16, char utf8[], int len) /* {{{ */
+static int json_utf8_to_utf16(unsigned short *utf16, char utf8[], int utf8_len, int options) /* {{{ */
{
- size_t pos = 0, us;
- int j, status;
-
- if (utf16) {
- /* really convert the utf8 string */
- for (j=0 ; pos < len ; j++) {
- us = php_next_utf8_char((const unsigned char *)utf8, len, &pos, &status);
- if (status != SUCCESS) {
- return -1;
- }
- /* From http://en.wikipedia.org/wiki/UTF16 */
- if (us >= 0x10000) {
- us -= 0x10000;
- utf16[j++] = (unsigned short)((us >> 10) | 0xd800);
- utf16[j] = (unsigned short)((us & 0x3ff) | 0xdc00);
+ size_t pos = 0;
+ unsigned int code_point;
+ int len, status;
+
+ for (len = 0; pos < utf8_len; len++) {
+ code_point = php_next_utf8_char((const unsigned char *) utf8, utf8_len, &pos, &status);
+ if (status == FAILURE) {
+ if (options & PHP_JSON_NOTUTF8_IGNORE) {
+ /* ignore this invalid character */
+ len--;
+ continue;
+ } else if (options & PHP_JSON_NOTUTF8_SUBSTITUTE) {
+ /* Unicode Character 'REPLACEMENT CHARACTER' (U+FFFD) */
+ code_point = 0xfffd;
} else {
- utf16[j] = (unsigned short)us;
- }
- }
- } else {
- /* Only check if utf8 string is valid, and compute utf16 lenght */
- for (j=0 ; pos < len ; j++) {
- us = php_next_utf8_char((const unsigned char *)utf8, len, &pos, &status);
- if (status != SUCCESS) {
return -1;
}
- if (us >= 0x10000) {
- j++;
- }
+ }
+ /* From http://en.wikipedia.org/wiki/UTF16 */
+ if (code_point < 0x10000) {
+ utf16[len] = (unsigned short) code_point;
+ } else {
+ code_point -= 0x10000;
+ utf16[len++] = (unsigned short) ((code_point >> 10) | 0xd800);
+ utf16[len] = (unsigned short) ((code_point & 0x3ff) | 0xdc00);
}
}
- return j;
+
+ return len;
}
/* }}} */
static void json_escape_string(smart_str *buf, char *s, int len, int options TSRMLS_DC) /* {{{ */
{
- int pos = 0, ulen = 0;
+ int pos = 0;
unsigned short us;
unsigned short *utf16;
size_t newlen;
@@ -432,13 +431,13 @@ static void json_escape_string(smart_str *buf, char *s, int len, int options TSR
}
- utf16 = (options & PHP_JSON_UNESCAPED_UNICODE) ? NULL : (unsigned short *) safe_emalloc(len, sizeof(unsigned short), 0);
- ulen = json_utf8_to_utf16(utf16, s, len);
- if (ulen <= 0) {
+ utf16 = (unsigned short *) safe_emalloc(len, sizeof(unsigned short), 0);
+ len = json_utf8_to_utf16(utf16, s, len, options);
+ if (len <= 0) {
if (utf16) {
efree(utf16);
}
- if (ulen < 0) {
+ if (len < 0) {
JSON_G(error_code) = PHP_JSON_ERROR_UTF8;
smart_str_appendl(buf, "null", 4);
} else {
@@ -446,9 +445,6 @@ static void json_escape_string(smart_str *buf, char *s, int len, int options TSR
}
return;
}
- if (!(options & PHP_JSON_UNESCAPED_UNICODE)) {
- len = ulen;
- }
/* pre-allocate for string length plus 2 quotes */
smart_str_alloc(buf, len+2, 0);
@@ -456,7 +452,7 @@ static void json_escape_string(smart_str *buf, char *s, int len, int options TSR
while (pos < len)
{
- us = (options & PHP_JSON_UNESCAPED_UNICODE) ? s[pos++] : utf16[pos++];
+ us = utf16[pos++];
switch (us)
{
@@ -533,13 +529,39 @@ static void json_escape_string(smart_str *buf, char *s, int len, int options TSR
break;
default:
- if (us >= ' ' && ((options & PHP_JSON_UNESCAPED_UNICODE) || (us & 127) == us)) {
- smart_str_appendc(buf, (unsigned char) us);
+ if (options & PHP_JSON_UNESCAPED_UNICODE) {
+
+ if (us < 0x20) {
+ smart_str_appendl(buf, "\\u", 2);
+ smart_str_appendc(buf, digits[(us >> 12) & 0xf]);
+ smart_str_appendc(buf, digits[(us >> 8) & 0xf]);
+ smart_str_appendc(buf, digits[(us >> 4) & 0xf]);
+ smart_str_appendc(buf, digits[(us & 0xf)]);
+ } else if (us < 0x80) {
+ smart_str_appendc(buf, us);
+ } else if (us < 0x800) {
+ smart_str_appendc(buf, 0xc0 | (us >> 6));
+ smart_str_appendc(buf, 0x80 | (us & 0x3f));
+ } else if (us >= 0xd800 && us <= 0xdbff) {
+ unsigned int utf32;
+ utf32 = ((us & 0x3ff) << 10) + (utf16[pos++] & 0x3ff) + 0x10000;
+ smart_str_appendc(buf, 0xf0 | (utf32 >> 18));
+ smart_str_appendc(buf, 0x80 | ((utf32 >> 12) & 0x3f));
+ smart_str_appendc(buf, 0x80 | ((utf32 >> 6) & 0x3f));
+ smart_str_appendc(buf, 0x80 | (utf32 & 0x3f));
+ } else {
+ smart_str_appendc(buf, 0xe0 | (us >> 12));
+ smart_str_appendc(buf, 0x80 | ((us >> 6) & 0x3f));
+ smart_str_appendc(buf, 0x80 | (us & 0x3f));
+ }
+
+ } else if (us >= 0x20 && us < 0x80) {
+ smart_str_appendc(buf, (unsigned char) us);
} else {
smart_str_appendl(buf, "\\u", 2);
- smart_str_appendc(buf, digits[(us & 0xf000) >> 12]);
- smart_str_appendc(buf, digits[(us & 0xf00) >> 8]);
- smart_str_appendc(buf, digits[(us & 0xf0) >> 4]);
+ smart_str_appendc(buf, digits[(us >> 12) & 0xf]);
+ smart_str_appendc(buf, digits[(us >> 8) & 0xf]);
+ smart_str_appendc(buf, digits[(us >> 4) & 0xf]);
smart_str_appendc(buf, digits[(us & 0xf)]);
}
break;
@@ -670,7 +692,7 @@ PHP_JSON_API void php_json_decode_ex(zval *return_value, char *str, int str_len,
utf16 = (unsigned short *) safe_emalloc((str_len+1), sizeof(unsigned short), 1);
- utf16_len = json_utf8_to_utf16(utf16, str, str_len);
+ utf16_len = json_utf8_to_utf16(utf16, str, str_len, options);
if (utf16_len <= 0) {
if (utf16) {
efree(utf16);
@@ -869,4 +891,4 @@ static PHP_FUNCTION(json_last_error_msg)
* End:
* vim600: noet sw=4 ts=4 fdm=marker
* vim<600: noet sw=4 ts=4
- */
+ */
\ No newline at end of file
diff --git a/ext/json/php_json.h b/ext/json/php_json.h
index ec707ce..2360278 100644
--- a/ext/json/php_json.h
+++ b/ext/json/php_json.h
@@ -65,6 +65,8 @@ extern zend_class_entry *php_json_serializable_ce;
#define PHP_JSON_PRETTY_PRINT (1<<7)
#define PHP_JSON_UNESCAPED_UNICODE (1<<8)
#define PHP_JSON_PARTIAL_OUTPUT_ON_ERROR (1<<9)
+#define PHP_JSON_NOTUTF8_SUBSTITUTE (1<<10)
+#define PHP_JSON_NOTUTF8_IGNORE (1<<11)
/* Internal flags */
#define PHP_JSON_OUTPUT_ARRAY 0
diff --git a/ext/json/json.c b/ext/json/json.c
index 5360841..1b60032 100644
--- a/ext/json/json.c
+++ b/ext/json/json.c
@@ -103,6 +103,8 @@ static PHP_MINIT_FUNCTION(json)
REGISTER_LONG_CONSTANT("JSON_PRETTY_PRINT", PHP_JSON_PRETTY_PRINT, CONST_CS | CONST_PERSISTENT);
REGISTER_LONG_CONSTANT("JSON_UNESCAPED_UNICODE", PHP_JSON_UNESCAPED_UNICODE, CONST_CS | CONST_PERSISTENT);
REGISTER_LONG_CONSTANT("JSON_PARTIAL_OUTPUT_ON_ERROR", PHP_JSON_PARTIAL_OUTPUT_ON_ERROR, CONST_CS | CONST_PERSISTENT);
+ REGISTER_LONG_CONSTANT("JSON_NOTUTF8_SUBSTITUTE", PHP_JSON_NOTUTF8_SUBSTITUTE, CONST_CS | CONST_PERSISTENT);
+ REGISTER_LONG_CONSTANT("JSON_NOTUTF8_IGNORE", PHP_JSON_NOTUTF8_IGNORE, CONST_CS | CONST_PERSISTENT);
REGISTER_LONG_CONSTANT("JSON_ERROR_NONE", PHP_JSON_ERROR_NONE, CONST_CS | CONST_PERSISTENT);
REGISTER_LONG_CONSTANT("JSON_ERROR_DEPTH", PHP_JSON_ERROR_DEPTH, CONST_CS | CONST_PERSISTENT);
@@ -358,48 +360,76 @@ static void json_encode_array(smart_str *buf, zval **val, int options TSRMLS_DC)
}
/* }}} */
-static int json_utf8_to_utf16(unsigned short *utf16, char utf8[], int len) /* {{{ */
+static int json_utf8_to_utf32(unsigned int *utf32, char utf8[], int utf8_len, int options) /* {{{ */
{
- size_t pos = 0, us;
- int j, status;
-
- if (utf16) {
- /* really convert the utf8 string */
- for (j=0 ; pos < len ; j++) {
- us = php_next_utf8_char((const unsigned char *)utf8, len, &pos, &status);
- if (status != SUCCESS) {
- return -1;
- }
- /* From http://en.wikipedia.org/wiki/UTF16 */
- if (us >= 0x10000) {
- us -= 0x10000;
- utf16[j++] = (unsigned short)((us >> 10) | 0xd800);
- utf16[j] = (unsigned short)((us & 0x3ff) | 0xdc00);
+ size_t pos = 0;
+ unsigned int code_point;
+ int len, status;
+
+ for (len = 0; pos < utf8_len; len++) {
+ code_point = php_next_utf8_char((const unsigned char *) utf8, utf8_len, &pos, &status);
+ if (status == FAILURE) {
+
+ if (options & PHP_JSON_NOTUTF8_IGNORE) {
+ /* ignore this invalid character */
+ len--;
+ continue;
+ } else if (options & PHP_JSON_NOTUTF8_SUBSTITUTE) {
+ /* Unicode Character 'REPLACEMENT CHARACTER' (U+FFFD) */
+ code_point = 0xfffd;
} else {
- utf16[j] = (unsigned short)us;
+ return -1;
}
}
- } else {
- /* Only check if utf8 string is valid, and compute utf16 lenght */
- for (j=0 ; pos < len ; j++) {
- us = php_next_utf8_char((const unsigned char *)utf8, len, &pos, &status);
- if (status != SUCCESS) {
+
+ utf32[len] = code_point;
+
+ }
+
+ return len;
+}
+/* }}} */
+
+static int json_utf8_to_utf16(unsigned short *utf16, char utf8[], int utf8_len, int options) /* {{{ */
+{
+ size_t pos = 0;
+ unsigned int code_point;
+ int len, status;
+
+ for (len = 0; pos < utf8_len; len++) {
+ code_point = php_next_utf8_char((const unsigned char *) utf8, utf8_len, &pos, &status);
+ if (status == FAILURE) {
+ if (options & PHP_JSON_NOTUTF8_IGNORE) {
+ /* ignore this invalid character */
+ len--;
+ continue;
+ } else if (options & PHP_JSON_NOTUTF8_SUBSTITUTE) {
+ /* Unicode Character 'REPLACEMENT CHARACTER' (U+FFFD) */
+ code_point = 0xfffd;
+ } else {
return -1;
}
- if (us >= 0x10000) {
- j++;
- }
+ }
+ /* From http://en.wikipedia.org/wiki/UTF16 */
+ if (code_point < 0x10000) {
+ utf16[len] = (unsigned short) code_point;
+ } else {
+ code_point -= 0x10000;
+ utf16[len++] = (unsigned short) ((code_point >> 10) | 0xd800);
+ utf16[len] = (unsigned short) ((code_point & 0x3ff) | 0xdc00);
}
}
- return j;
+
+ return len;
}
/* }}} */
static void json_escape_string(smart_str *buf, char *s, int len, int options TSRMLS_DC) /* {{{ */
{
- int pos = 0, ulen = 0;
- unsigned short us;
+ int pos = 0;
+ unsigned int us;
+ unsigned int *utf32;
unsigned short *utf16;
size_t newlen;
@@ -432,13 +462,27 @@ static void json_escape_string(smart_str *buf, char *s, int len, int options TSR
}
- utf16 = (options & PHP_JSON_UNESCAPED_UNICODE) ? NULL : (unsigned short *) safe_emalloc(len, sizeof(unsigned short), 0);
- ulen = json_utf8_to_utf16(utf16, s, len);
- if (ulen <= 0) {
- if (utf16) {
- efree(utf16);
+ if (options & PHP_JSON_UNESCAPED_UNICODE) {
+ utf32 = (unsigned int *) safe_emalloc(len, sizeof(unsigned int), 0);
+ len = json_utf8_to_utf32(utf32, s, len, options);
+ } else {
+ utf16 = (unsigned short *) safe_emalloc(len, sizeof(unsigned short), 0);
+ len = json_utf8_to_utf16(utf16, s, len, options);
+ }
+
+ if (len <= 0) {
+
+ if (options & PHP_JSON_UNESCAPED_UNICODE) {
+ if (utf32) {
+ efree(utf32);
+ }
+ } else {
+ if (utf16) {
+ efree(utf16);
+ }
}
- if (ulen < 0) {
+
+ if (len < 0) {
JSON_G(error_code) = PHP_JSON_ERROR_UTF8;
smart_str_appendl(buf, "null", 4);
} else {
@@ -446,9 +490,6 @@ static void json_escape_string(smart_str *buf, char *s, int len, int options TSR
}
return;
}
- if (!(options & PHP_JSON_UNESCAPED_UNICODE)) {
- len = ulen;
- }
/* pre-allocate for string length plus 2 quotes */
smart_str_alloc(buf, len+2, 0);
@@ -456,7 +497,7 @@ static void json_escape_string(smart_str *buf, char *s, int len, int options TSR
while (pos < len)
{
- us = (options & PHP_JSON_UNESCAPED_UNICODE) ? s[pos++] : utf16[pos++];
+ us = (options & PHP_JSON_UNESCAPED_UNICODE) ? utf32[pos++]: utf16[pos++];
switch (us)
{
@@ -533,13 +574,37 @@ static void json_escape_string(smart_str *buf, char *s, int len, int options TSR
break;
default:
- if (us >= ' ' && ((options & PHP_JSON_UNESCAPED_UNICODE) || (us & 127) == us)) {
- smart_str_appendc(buf, (unsigned char) us);
+ if (options & PHP_JSON_UNESCAPED_UNICODE) {
+
+ if (us < 0x20) {
+ smart_str_appendl(buf, "\\u", 2);
+ smart_str_appendc(buf, digits[(us >> 12) & 0xf]);
+ smart_str_appendc(buf, digits[(us >> 8) & 0xf]);
+ smart_str_appendc(buf, digits[(us >> 4) & 0xf]);
+ smart_str_appendc(buf, digits[(us & 0xf)]);
+ } else if (us < 0x80) {
+ smart_str_appendc(buf, us);
+ } else if (us < 0x800) {
+ smart_str_appendc(buf, 0xc0 | (us >> 6));
+ smart_str_appendc(buf, 0x80 | (us & 0x3f));
+ } else if (us < 0x10000) {
+ smart_str_appendc(buf, 0xe0 | (us >> 12));
+ smart_str_appendc(buf, 0x80 | ((us >> 6) & 0x3f));
+ smart_str_appendc(buf, 0x80 | (us & 0x3f));
+ } else {
+ smart_str_appendc(buf, 0xf0 | (us >> 18));
+ smart_str_appendc(buf, 0x80 | ((us >> 12) & 0x3f));
+ smart_str_appendc(buf, 0x80 | ((us >> 6) & 0x3f));
+ smart_str_appendc(buf, 0x80 | (us & 0x3f));
+ }
+
+ } else if (us >= 0x20 && us < 0x80) {
+ smart_str_appendc(buf, (unsigned char) us);
} else {
smart_str_appendl(buf, "\\u", 2);
- smart_str_appendc(buf, digits[(us & 0xf000) >> 12]);
- smart_str_appendc(buf, digits[(us & 0xf00) >> 8]);
- smart_str_appendc(buf, digits[(us & 0xf0) >> 4]);
+ smart_str_appendc(buf, digits[(us >> 12) & 0xf]);
+ smart_str_appendc(buf, digits[(us >> 8) & 0xf]);
+ smart_str_appendc(buf, digits[(us >> 4) & 0xf]);
smart_str_appendc(buf, digits[(us & 0xf)]);
}
break;
@@ -547,9 +612,17 @@ static void json_escape_string(smart_str *buf, char *s, int len, int options TSR
}
smart_str_appendc(buf, '"');
- if (utf16) {
- efree(utf16);
+
+ if (options & PHP_JSON_UNESCAPED_UNICODE) {
+ if (utf32) {
+ efree(utf32);
+ }
+ } else {
+ if (utf16) {
+ efree(utf16);
+ }
}
+
}
/* }}} */
@@ -670,7 +743,7 @@ PHP_JSON_API void php_json_decode_ex(zval *return_value, char *str, int str_len,
utf16 = (unsigned short *) safe_emalloc((str_len+1), sizeof(unsigned short), 1);
- utf16_len = json_utf8_to_utf16(utf16, str, str_len);
+ utf16_len = json_utf8_to_utf16(utf16, str, str_len, options);
if (utf16_len <= 0) {
if (utf16) {
efree(utf16);
@@ -869,4 +942,4 @@ static PHP_FUNCTION(json_last_error_msg)
* End:
* vim600: noet sw=4 ts=4 fdm=marker
* vim<600: noet sw=4 ts=4
- */
+ */
\ No newline at end of file
diff --git a/ext/json/php_json.h b/ext/json/php_json.h
index ec707ce..2360278 100644
--- a/ext/json/php_json.h
+++ b/ext/json/php_json.h
@@ -65,6 +65,8 @@ extern zend_class_entry *php_json_serializable_ce;
#define PHP_JSON_PRETTY_PRINT (1<<7)
#define PHP_JSON_UNESCAPED_UNICODE (1<<8)
#define PHP_JSON_PARTIAL_OUTPUT_ON_ERROR (1<<9)
+#define PHP_JSON_NOTUTF8_SUBSTITUTE (1<<10)
+#define PHP_JSON_NOTUTF8_IGNORE (1<<11)
/* Internal flags */
#define PHP_JSON_OUTPUT_ARRAY 0
W1 = 110110yyyyyyyyyy
W2 = 110111xxxxxxxxxx
yyyyyyyyyy = W1 & 0x3FF
xxxxxxxxxx = W2 & 0x3FF
U = U' + 0x10000
= yyyyyyyyyyxxxxxxxxxx + 0x10000
= yyyyyyyyyy0000000000 + xxxxxxxxxx + 0x10000
= ((W1 & 0x3FF) << 10) + (W2 & 0x3FF) + 0x10000
<?php
/**
 * Thin JsonSerializable wrapper used by the checks below.
 *
 * It stores whatever value it is constructed with and returns that same
 * value from jsonSerialize(), so json_encode() on an instance encodes the
 * wrapped value.
 */
class JsonTest implements JsonSerializable {
    /** Value returned unchanged by jsonSerialize(). */
    private $payload;

    /**
     * @param mixed $test Value to expose to json_encode().
     */
    public function __construct($test) {
        $this->payload = $test;
    }

    /**
     * @return mixed The value passed to the constructor.
     */
    public function jsonSerialize() {
        $payload = $this->payload;
        return $payload;
    }
}
// Self-check for the proposed JSON_NOTUTF8_SUBSTITUTE / JSON_NOTUTF8_IGNORE
// flags: each comparison below is expected to dump bool(true).

// 'a' followed by a stray 0x80 byte, which is not valid UTF-8 on its own.
$broken = "a\x80";
// 'a' followed by U+FFFD (REPLACEMENT CHARACTER) in its UTF-8 form.
$replaced = "a\xEF\xBF\xBD";
// The broken string wrapped in quotes so json_decode() receives a JSON string.
$brokenJson = '"' . $broken . '"';

// Encodes the broken string, used as both key and value, through JsonTest.
$encodeBroken = function ($flags) use ($broken) {
    return json_encode(new JsonTest([$broken => $broken]), $flags);
};

// True when a string is emitted verbatim under JSON_UNESCAPED_UNICODE.
$keptVerbatim = function ($utf8) {
    return '"' . $utf8 . '"' === json_encode($utf8, JSON_UNESCAPED_UNICODE);
};

var_dump(
    // Substitution: the bad byte is expected as \ufffd (escaped) or as raw U+FFFD.
    '{"a\ufffd":"a\ufffd"}' === $encodeBroken(JSON_NOTUTF8_SUBSTITUTE),
    '{"' . $replaced . '":"' . $replaced . '"}' === $encodeBroken(JSON_UNESCAPED_UNICODE | JSON_NOTUTF8_SUBSTITUTE),
    // Ignore: the bad byte is expected to be dropped from key and value.
    '{"a":"a"}' === $encodeBroken(JSON_NOTUTF8_IGNORE),
    '{"a":"a"}' === $encodeBroken(JSON_UNESCAPED_UNICODE | JSON_NOTUTF8_IGNORE),
    // Valid 1- to 4-byte sequences, taken from
    // https://en.wikipedia.org/wiki/UTF-8#Examples
    $keptVerbatim("\x24"),             // U+0024
    $keptVerbatim("\xC2\xA2"),         // U+00A2
    $keptVerbatim("\xE2\x82\xAC"),     // U+20AC
    $keptVerbatim("\xF0\xA4\xAD\xA2"), // U+24B62
    // The same two flags passed as json_decode() options.
    $replaced === json_decode($brokenJson, false, 512, JSON_NOTUTF8_SUBSTITUTE),
    "a" === json_decode($brokenJson, false, 512, JSON_NOTUTF8_IGNORE)
);
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment