Created
February 16, 2020 19:48
-
-
Save maksverver/c3d5da8a0a9f2ec1c2a225209f290e13 to your computer and use it in GitHub Desktop.
sqlite3 UTF16 conversion bugs
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
--- sqlite3.c.bak 2020-02-16 19:12:52.784690613 +0100 | |
+++ sqlite3.c-maks 2020-02-16 19:12:39.044669802 +0100 | |
@@ -30552,10 +30552,14 @@ | |
#define READ_UTF16LE(zIn, TERM, c){ \
c = (*zIn++); \
c += ((*zIn++)<<8); \
- if( c>=0xD800 && c<0xE000 && TERM ){ \
- int c2 = (*zIn++); \
- c2 += ((*zIn++)<<8); \
- c = (c2&0x03FF) + ((c&0x003F)<<10) + (((c&0x03C0)+0x0040)<<10); \
+ if( c>=0xD800 && c<0xE000 ){ /* surrogates only: U+E000..U+FFFF must pass through unchanged */ \
+ int c2 = c<0xDC00 && TERM ? (zIn[0] | (zIn[1] << 8)) : 0; /* peek at the next unit only after a high surrogate */ \
+ if ( c2>=0xDC00 && c2<0xE000) { \
+ zIn += 2; /* consume the low surrogate only when it really is one */ \
+ c = (c2&0x03FF) + ((c&0x003F)<<10) + (((c&0x03C0)+0x0040)<<10); \
+ } else { \
+ c = 0xFFFD; /* unpaired surrogate becomes U+FFFD */ \
+ } \
} \
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <assert.h> | |
#include <stdio.h> | |
#include <stdlib.h> | |
#include <string.h> | |
#include <sqlite3.h> | |
// Prints every byte of the NUL-terminated string `text` to stdout as a
// two-digit lowercase hex value, each preceded by a single space.
void print_utf8(const char *text) {
  const char *cursor;
  for (cursor = text; *cursor != '\0'; ++cursor) {
    // Mask to the low 8 bits so a negative `char` does not print as ffffffXX.
    printf(" %02x", *cursor & 0xff);
  }
}
// Prints a byte string two bytes at a time to stdout, each pair as four hex
// digits (first byte, then second byte) preceded by a space. Stops at the
// first pair whose bytes are both zero.
void print_utf16(const char *text) {
  const char *unit = text;
  for (;;) {
    if (unit[0] == 0 && unit[1] == 0) {
      break;  // reached the two-byte terminator
    }
    printf(" %02x%02x", (unit[0] & 0xff), (unit[1] & 0xff));
    unit += 2;
  }
}
// Compares two byte strings two bytes at a time, where each string ends at
// the first pair of zero bytes. Returns 1 when both hold the same pairs and
// end at the same position, 0 otherwise.
int equal_utf16(const char *a, const char *b) {
  for (;; a += 2, b += 2) {
    if (a[0] == 0 && a[1] == 0) {
      // `a` is finished; the strings match only if `b` is finished too.
      return b[0] == 0 && b[1] == 0;
    }
    if (a[0] != b[0] || a[1] != b[1]) {
      return 0;
    }
  }
}
int test_utf16(const char *input, | |
const char *expected_output_utf8, | |
const char *expected_output_utf16) { | |
sqlite3 *db = NULL; | |
int err = sqlite3_open("test.db", &db); | |
err = sqlite3_exec(db, "CREATE TABLE IF NOT EXISTS tab(col TEXT)", NULL, NULL, NULL); | |
assert(err = SQLITE_DONE); | |
printf("Input: "); | |
print_utf16(input); | |
printf(" (UTF-16)\n"); | |
// Write input to table. sqlite will convert it to utf-8, which is the default | |
// encoding for databases. | |
sqlite3_stmt *insert_stmt = NULL; | |
err = sqlite3_prepare(db, "INSERT INTO tab(col) VALUES (?)", -1, &insert_stmt, NULL); | |
assert(err == SQLITE_OK); | |
sqlite3_bind_text16(insert_stmt, 1, input, -1, NULL); | |
err = sqlite3_step(insert_stmt); | |
assert(err == SQLITE_DONE); | |
// Read back from table. | |
sqlite3_stmt *query_stmt = NULL; | |
err = sqlite3_prepare(db, "SELECT col FROM tab", -1, &query_stmt, NULL); | |
assert(err == SQLITE_OK); | |
err = sqlite3_step(query_stmt); | |
assert(err == SQLITE_ROW); | |
int failures = 0; | |
// Extract value as UTF-8. This doesn't require conversion because the | |
// value is stored as UTF-8 in the database. | |
const char *actual_output = sqlite3_column_text(query_stmt, 0); | |
assert(actual_output != NULL); | |
printf("Expected output: "); | |
print_utf8(expected_output_utf8); | |
printf(" (UTF-8)\n"); | |
printf("Actual output: "); | |
print_utf8(actual_output); | |
printf(" (UTF-8)\n"); | |
failures += strcmp(expected_output_utf8, actual_output) != 0; | |
// Extract value as UTF-16. This causes conversion from UTF-8 to UTF-16. | |
actual_output = sqlite3_column_text16(query_stmt, 0); | |
assert(actual_output != NULL); | |
printf("Expected output: "); | |
print_utf16(expected_output_utf16); | |
printf(" (UTF-16)\n"); | |
printf("Actual output: "); | |
print_utf16(actual_output); | |
printf(" (UTF-16)\n"); | |
failures += !equal_utf16(expected_output_utf16, actual_output); | |
printf("Test %s\n\n", failures == 0 ? "passed." : "FAILED!"); | |
sqlite3_finalize(insert_stmt); | |
sqlite3_finalize(query_stmt); | |
err = sqlite3_exec(db, "DROP TABLE tab", NULL, NULL, NULL); | |
assert(err == SQLITE_OK); | |
sqlite3_close(db); | |
return failures; | |
} | |
int main() { | |
printf("SQLite version %s (%s)\n\n", SQLITE_VERSION, SQLITE_SOURCE_ID); | |
// ASCII-only but encoded as UTF-16 works fine. | |
// On disk: UTF-8 66 6f 6f 20 24 20 62 61 72 | |
int failures = test_utf16( | |
"f\0o\0o\0 \0$\0 \0b\0a\0r\0\0", | |
"foo $ bar", | |
"f\0o\0o\0 \0$\0 \0b\0a\0r\0\0"); | |
// Valid UTF-16 encoding gets translated to valid UTF-8 encoding. | |
// On disk: UTF-8 66 6f 6f 20 f0 9f 92 a9 20 62 61 72 | |
failures += test_utf16( | |
"f\0o\0o\0 \0\x3d\xd8\xa9\xdc \0b\0a\0r\0\0", | |
"foo \xf0\x9f\x92\xa9 bar", | |
"f\0o\0o\0 \0\x3d\xd8\xa9\xdc \0b\0a\0r\0\0"); | |
// \xd83d is a UTF-16 high surrogate character. | |
// Ends up inserting "foo \xf0\x9f\x90\xa0bar" (UTF-8). | |
// On disk: UTF-8 f09f90a0 => U0001f420 (space character gets corrupted) | |
failures += test_utf16( | |
"f\0o\0o\0 \0\x3d\xd8 \0b\0a\0r\0\0", | |
"foo \xef\xbf\xbd bar", | |
"f\0o\0o\0 \0\xfd\xff \0b\0a\0r\0\0"); | |
// \xdca9 is a UTF-16 low surrogate character. | |
// On disk: UTF-8 f0ba90a0 => U0003a420 (space character gets corrupted) | |
failures += test_utf16( | |
"f\0o\0o\0 \0\xa9\xdc \0b\0a\0r\0\0", | |
"foo \xef\xbf\xbd bar", | |
"f\0o\0o\0 \0\xfd\xff \0b\0a\0r\0\0"); | |
// \xd83d is a UTF-16 high surrogate character. | |
// It gets converted to UTF-8 on its own (without consuming the following | |
// character). | |
// On disk: UTF-8 eda0bd => U000d83d (invalid UTF-8) | |
failures += test_utf16( | |
"f\0o\0o\0 \0\x3d\xd8\0", | |
"foo \xef\xbf\xbd", | |
"f\0o\0o\0 \0\xfd\xff\0"); | |
return failures; | |
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
SQLite version 3.32.0 (2020-02-03 12:25:18 8130bbb4217bd4a4db1f6bf97115a60bee8b29943bed0c7bdf54bba5edbed8fc) | |
Input: 6600 6f00 6f00 2000 2400 2000 6200 6100 7200 (UTF-16) | |
Expected output: 66 6f 6f 20 24 20 62 61 72 (UTF-8) | |
Actual output: 66 6f 6f 20 24 20 62 61 72 (UTF-8) | |
Expected output: 6600 6f00 6f00 2000 2400 2000 6200 6100 7200 (UTF-16) | |
Actual output: 6600 6f00 6f00 2000 2400 2000 6200 6100 7200 (UTF-16) | |
Test passed. | |
Input: 6600 6f00 6f00 2000 3dd8 a9dc 2000 6200 6100 7200 (UTF-16) | |
Expected output: 66 6f 6f 20 f0 9f 92 a9 20 62 61 72 (UTF-8) | |
Actual output: 66 6f 6f 20 f0 9f 92 a9 20 62 61 72 (UTF-8) | |
Expected output: 6600 6f00 6f00 2000 3dd8 a9dc 2000 6200 6100 7200 (UTF-16) | |
Actual output: 6600 6f00 6f00 2000 3dd8 a9dc 2000 6200 6100 7200 (UTF-16) | |
Test passed. | |
Input: 6600 6f00 6f00 2000 3dd8 2000 6200 6100 7200 (UTF-16) | |
Expected output: 66 6f 6f 20 ef bf bd 20 62 61 72 (UTF-8) | |
Actual output: 66 6f 6f 20 f0 9f 90 a0 62 61 72 (UTF-8) | |
Expected output: 6600 6f00 6f00 2000 fdff 2000 6200 6100 7200 (UTF-16) | |
Actual output: 6600 6f00 6f00 2000 3dd8 20dc 6200 6100 7200 (UTF-16) | |
Test FAILED! | |
Input: 6600 6f00 6f00 2000 a9dc 2000 6200 6100 7200 (UTF-16) | |
Expected output: 66 6f 6f 20 ef bf bd 20 62 61 72 (UTF-8) | |
Actual output: 66 6f 6f 20 f0 ba 90 a0 62 61 72 (UTF-8) | |
Expected output: 6600 6f00 6f00 2000 fdff 2000 6200 6100 7200 (UTF-16) | |
Actual output: 6600 6f00 6f00 2000 a9d8 20dc 6200 6100 7200 (UTF-16) | |
Test FAILED! | |
Input: 6600 6f00 6f00 2000 3dd8 (UTF-16) | |
Expected output: 66 6f 6f 20 ef bf bd (UTF-8) | |
Actual output: 66 6f 6f 20 ed a0 bd (UTF-8) | |
Expected output: 6600 6f00 6f00 2000 fdff (UTF-16) | |
Actual output: 6600 6f00 6f00 2000 fdff (UTF-16) | |
Test FAILED! |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
SQLite version 3.32.0 (2020-02-03 12:25:18 8130bbb4217bd4a4db1f6bf97115a60bee8b29943bed0c7bdf54bba5edbed8fc) | |
Input: 6600 6f00 6f00 2000 2400 2000 6200 6100 7200 (UTF-16) | |
Expected output: 66 6f 6f 20 24 20 62 61 72 (UTF-8) | |
Actual output: 66 6f 6f 20 24 20 62 61 72 (UTF-8) | |
Expected output: 6600 6f00 6f00 2000 2400 2000 6200 6100 7200 (UTF-16) | |
Actual output: 6600 6f00 6f00 2000 2400 2000 6200 6100 7200 (UTF-16) | |
Test passed. | |
Input: 6600 6f00 6f00 2000 3dd8 a9dc 2000 6200 6100 7200 (UTF-16) | |
Expected output: 66 6f 6f 20 f0 9f 92 a9 20 62 61 72 (UTF-8) | |
Actual output: 66 6f 6f 20 f0 9f 92 a9 20 62 61 72 (UTF-8) | |
Expected output: 6600 6f00 6f00 2000 3dd8 a9dc 2000 6200 6100 7200 (UTF-16) | |
Actual output: 6600 6f00 6f00 2000 3dd8 a9dc 2000 6200 6100 7200 (UTF-16) | |
Test passed. | |
Input: 6600 6f00 6f00 2000 3dd8 2000 6200 6100 7200 (UTF-16) | |
Expected output: 66 6f 6f 20 ef bf bd 20 62 61 72 (UTF-8) | |
Actual output: 66 6f 6f 20 ef bf bd 20 62 61 72 (UTF-8) | |
Expected output: 6600 6f00 6f00 2000 fdff 2000 6200 6100 7200 (UTF-16) | |
Actual output: 6600 6f00 6f00 2000 fdff 2000 6200 6100 7200 (UTF-16) | |
Test passed. | |
Input: 6600 6f00 6f00 2000 a9dc 2000 6200 6100 7200 (UTF-16) | |
Expected output: 66 6f 6f 20 ef bf bd 20 62 61 72 (UTF-8) | |
Actual output: 66 6f 6f 20 ef bf bd 20 62 61 72 (UTF-8) | |
Expected output: 6600 6f00 6f00 2000 fdff 2000 6200 6100 7200 (UTF-16) | |
Actual output: 6600 6f00 6f00 2000 fdff 2000 6200 6100 7200 (UTF-16) | |
Test passed. | |
Input: 6600 6f00 6f00 2000 3dd8 (UTF-16) | |
Expected output: 66 6f 6f 20 ef bf bd (UTF-8) | |
Actual output: 66 6f 6f 20 ef bf bd (UTF-8) | |
Expected output: 6600 6f00 6f00 2000 fdff (UTF-16) | |
Actual output: 6600 6f00 6f00 2000 fdff (UTF-16) | |
Test passed. |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment