Skip to content

Instantly share code, notes, and snippets.

@maksverver
Created February 16, 2020 19:48
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save maksverver/c3d5da8a0a9f2ec1c2a225209f290e13 to your computer and use it in GitHub Desktop.
Save maksverver/c3d5da8a0a9f2ec1c2a225209f290e13 to your computer and use it in GitHub Desktop.
sqlite3 UTF16 conversion bugs
--- sqlite3.c.bak 2020-02-16 19:12:52.784690613 +0100
+++ sqlite3.c-maks 2020-02-16 19:12:39.044669802 +0100
@@ -30552,10 +30552,14 @@
#define READ_UTF16LE(zIn, TERM, c){ \
c = (*zIn++); \
c += ((*zIn++)<<8); \
- if( c>=0xD800 && c<0xE000 && TERM ){ \
- int c2 = (*zIn++); \
- c2 += ((*zIn++)<<8); \
- c = (c2&0x03FF) + ((c&0x003F)<<10) + (((c&0x03C0)+0x0040)<<10); \
+ if( c>=0xD800 && c<0xE000 ){ \
+ int c2 = c<0xDC00 && TERM ? (zIn[0] | (zIn[1] << 8)) : 0; \
+ if ( c2>=0xDC00 && c2<0xE000) { \
+ zIn += 2; \
+ c = (c2&0x03FF) + ((c&0x003F)<<10) + (((c&0x03C0)+0x0040)<<10); \
+ } else { \
+ c = 0xFFFD; \
+ } \
} \
}
#include <assert.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sqlite3.h>
// Writes every byte of the NUL-terminated string `text` to stdout as a space
// followed by two lowercase hex digits. Nothing is printed for an empty string.
void print_utf8(const char *text) {
  for (const char *cursor = text; *cursor != '\0'; ++cursor) {
    // Mask to the low 8 bits so a negative (signed) char prints as one byte.
    printf(" %02x", *cursor & 0xff);
  }
}
// Writes `text` to stdout two bytes at a time, each pair as a space followed by
// four lowercase hex digits (first byte, then second byte, in memory order).
// Stops at the first pair whose two bytes are both zero.
void print_utf16(const char *text) {
  for (const char *unit = text; unit[0] != 0 || unit[1] != 0; unit += 2) {
    // Mask each byte to 8 bits so a negative (signed) char prints as one byte.
    printf(" %02x%02x", unit[0] & 0xff, unit[1] & 0xff);
  }
}
// Compares two byte strings made of 2-byte units, each terminated by a unit
// whose two bytes are both zero. Returns 1 when both strings hold the same
// units and end at the same position, 0 otherwise.
int equal_utf16(const char *a, const char *b) {
  for (;;) {
    if (a[0] == 0 && a[1] == 0) {
      // `a` has ended: the strings match only if `b` ends here as well.
      return b[0] == 0 && b[1] == 0;
    }
    if (a[0] != b[0] || a[1] != b[1]) {
      // Differing unit (this also covers `b` ending before `a`).
      return 0;
    }
    a += 2;
    b += 2;
  }
}
int test_utf16(const char *input,
const char *expected_output_utf8,
const char *expected_output_utf16) {
sqlite3 *db = NULL;
int err = sqlite3_open("test.db", &db);
err = sqlite3_exec(db, "CREATE TABLE IF NOT EXISTS tab(col TEXT)", NULL, NULL, NULL);
assert(err = SQLITE_DONE);
printf("Input: ");
print_utf16(input);
printf(" (UTF-16)\n");
// Write input to table. sqlite will convert it to utf-8, which is the default
// encoding for databases.
sqlite3_stmt *insert_stmt = NULL;
err = sqlite3_prepare(db, "INSERT INTO tab(col) VALUES (?)", -1, &insert_stmt, NULL);
assert(err == SQLITE_OK);
sqlite3_bind_text16(insert_stmt, 1, input, -1, NULL);
err = sqlite3_step(insert_stmt);
assert(err == SQLITE_DONE);
// Read back from table.
sqlite3_stmt *query_stmt = NULL;
err = sqlite3_prepare(db, "SELECT col FROM tab", -1, &query_stmt, NULL);
assert(err == SQLITE_OK);
err = sqlite3_step(query_stmt);
assert(err == SQLITE_ROW);
int failures = 0;
// Extract value as UTF-8. This doesn't require conversion because the
// value is stored as UTF-8 in the database.
const char *actual_output = sqlite3_column_text(query_stmt, 0);
assert(actual_output != NULL);
printf("Expected output: ");
print_utf8(expected_output_utf8);
printf(" (UTF-8)\n");
printf("Actual output: ");
print_utf8(actual_output);
printf(" (UTF-8)\n");
failures += strcmp(expected_output_utf8, actual_output) != 0;
// Extract value as UTF-16. This causes conversion from UTF-8 to UTF-16.
actual_output = sqlite3_column_text16(query_stmt, 0);
assert(actual_output != NULL);
printf("Expected output: ");
print_utf16(expected_output_utf16);
printf(" (UTF-16)\n");
printf("Actual output: ");
print_utf16(actual_output);
printf(" (UTF-16)\n");
failures += !equal_utf16(expected_output_utf16, actual_output);
printf("Test %s\n\n", failures == 0 ? "passed." : "FAILED!");
sqlite3_finalize(insert_stmt);
sqlite3_finalize(query_stmt);
err = sqlite3_exec(db, "DROP TABLE tab", NULL, NULL, NULL);
assert(err == SQLITE_OK);
sqlite3_close(db);
return failures;
}
// Prints the SQLite version, runs every UTF-16 round-trip case in order, and
// returns the total number of failed comparisons (0 when all cases pass).
int main() {
  // One entry per case: the UTF-16 input followed by the expected UTF-8 and
  // expected UTF-16 read-back. UTF-16 strings are written byte by byte, low
  // byte first, and end in a zero code unit.
  static const struct {
    const char *input;
    const char *want_utf8;
    const char *want_utf16;
  } cases[] = {
    // ASCII-only but encoded as UTF-16 works fine.
    // On disk: UTF-8 66 6f 6f 20 24 20 62 61 72
    {
      "f\0o\0o\0 \0$\0 \0b\0a\0r\0\0",
      "foo $ bar",
      "f\0o\0o\0 \0$\0 \0b\0a\0r\0\0",
    },
    // Valid UTF-16 encoding gets translated to valid UTF-8 encoding.
    // On disk: UTF-8 66 6f 6f 20 f0 9f 92 a9 20 62 61 72
    {
      "f\0o\0o\0 \0\x3d\xd8\xa9\xdc \0b\0a\0r\0\0",
      "foo \xf0\x9f\x92\xa9 bar",
      "f\0o\0o\0 \0\x3d\xd8\xa9\xdc \0b\0a\0r\0\0",
    },
    // \xd83d is a UTF-16 high surrogate character.
    // Ends up inserting "foo \xf0\x9f\x90\xa0bar" (UTF-8).
    // On disk: UTF-8 f09f90a0 => U0001f420 (space character gets corrupted)
    {
      "f\0o\0o\0 \0\x3d\xd8 \0b\0a\0r\0\0",
      "foo \xef\xbf\xbd bar",
      "f\0o\0o\0 \0\xfd\xff \0b\0a\0r\0\0",
    },
    // \xdca9 is a UTF-16 low surrogate character.
    // On disk: UTF-8 f0ba90a0 => U0003a420 (space character gets corrupted)
    {
      "f\0o\0o\0 \0\xa9\xdc \0b\0a\0r\0\0",
      "foo \xef\xbf\xbd bar",
      "f\0o\0o\0 \0\xfd\xff \0b\0a\0r\0\0",
    },
    // \xd83d is a UTF-16 high surrogate character.
    // It gets converted to UTF-8 on its own (without consuming the following
    // character).
    // On disk: UTF-8 eda0bd => U000d83d (invalid UTF-8)
    {
      "f\0o\0o\0 \0\x3d\xd8\0",
      "foo \xef\xbf\xbd",
      "f\0o\0o\0 \0\xfd\xff\0",
    },
  };

  printf("SQLite version %s (%s)\n\n", SQLITE_VERSION, SQLITE_SOURCE_ID);

  int failures = 0;
  for (size_t i = 0; i < sizeof cases / sizeof cases[0]; i++) {
    failures += test_utf16(cases[i].input, cases[i].want_utf8, cases[i].want_utf16);
  }
  return failures;
}
SQLite version 3.32.0 (2020-02-03 12:25:18 8130bbb4217bd4a4db1f6bf97115a60bee8b29943bed0c7bdf54bba5edbed8fc)
Input: 6600 6f00 6f00 2000 2400 2000 6200 6100 7200 (UTF-16)
Expected output: 66 6f 6f 20 24 20 62 61 72 (UTF-8)
Actual output: 66 6f 6f 20 24 20 62 61 72 (UTF-8)
Expected output: 6600 6f00 6f00 2000 2400 2000 6200 6100 7200 (UTF-16)
Actual output: 6600 6f00 6f00 2000 2400 2000 6200 6100 7200 (UTF-16)
Test passed.
Input: 6600 6f00 6f00 2000 3dd8 a9dc 2000 6200 6100 7200 (UTF-16)
Expected output: 66 6f 6f 20 f0 9f 92 a9 20 62 61 72 (UTF-8)
Actual output: 66 6f 6f 20 f0 9f 92 a9 20 62 61 72 (UTF-8)
Expected output: 6600 6f00 6f00 2000 3dd8 a9dc 2000 6200 6100 7200 (UTF-16)
Actual output: 6600 6f00 6f00 2000 3dd8 a9dc 2000 6200 6100 7200 (UTF-16)
Test passed.
Input: 6600 6f00 6f00 2000 3dd8 2000 6200 6100 7200 (UTF-16)
Expected output: 66 6f 6f 20 ef bf bd 20 62 61 72 (UTF-8)
Actual output: 66 6f 6f 20 f0 9f 90 a0 62 61 72 (UTF-8)
Expected output: 6600 6f00 6f00 2000 fdff 2000 6200 6100 7200 (UTF-16)
Actual output: 6600 6f00 6f00 2000 3dd8 20dc 6200 6100 7200 (UTF-16)
Test FAILED!
Input: 6600 6f00 6f00 2000 a9dc 2000 6200 6100 7200 (UTF-16)
Expected output: 66 6f 6f 20 ef bf bd 20 62 61 72 (UTF-8)
Actual output: 66 6f 6f 20 f0 ba 90 a0 62 61 72 (UTF-8)
Expected output: 6600 6f00 6f00 2000 fdff 2000 6200 6100 7200 (UTF-16)
Actual output: 6600 6f00 6f00 2000 a9d8 20dc 6200 6100 7200 (UTF-16)
Test FAILED!
Input: 6600 6f00 6f00 2000 3dd8 (UTF-16)
Expected output: 66 6f 6f 20 ef bf bd (UTF-8)
Actual output: 66 6f 6f 20 ed a0 bd (UTF-8)
Expected output: 6600 6f00 6f00 2000 fdff (UTF-16)
Actual output: 6600 6f00 6f00 2000 fdff (UTF-16)
Test FAILED!
SQLite version 3.32.0 (2020-02-03 12:25:18 8130bbb4217bd4a4db1f6bf97115a60bee8b29943bed0c7bdf54bba5edbed8fc)
Input: 6600 6f00 6f00 2000 2400 2000 6200 6100 7200 (UTF-16)
Expected output: 66 6f 6f 20 24 20 62 61 72 (UTF-8)
Actual output: 66 6f 6f 20 24 20 62 61 72 (UTF-8)
Expected output: 6600 6f00 6f00 2000 2400 2000 6200 6100 7200 (UTF-16)
Actual output: 6600 6f00 6f00 2000 2400 2000 6200 6100 7200 (UTF-16)
Test passed.
Input: 6600 6f00 6f00 2000 3dd8 a9dc 2000 6200 6100 7200 (UTF-16)
Expected output: 66 6f 6f 20 f0 9f 92 a9 20 62 61 72 (UTF-8)
Actual output: 66 6f 6f 20 f0 9f 92 a9 20 62 61 72 (UTF-8)
Expected output: 6600 6f00 6f00 2000 3dd8 a9dc 2000 6200 6100 7200 (UTF-16)
Actual output: 6600 6f00 6f00 2000 3dd8 a9dc 2000 6200 6100 7200 (UTF-16)
Test passed.
Input: 6600 6f00 6f00 2000 3dd8 2000 6200 6100 7200 (UTF-16)
Expected output: 66 6f 6f 20 ef bf bd 20 62 61 72 (UTF-8)
Actual output: 66 6f 6f 20 ef bf bd 20 62 61 72 (UTF-8)
Expected output: 6600 6f00 6f00 2000 fdff 2000 6200 6100 7200 (UTF-16)
Actual output: 6600 6f00 6f00 2000 fdff 2000 6200 6100 7200 (UTF-16)
Test passed.
Input: 6600 6f00 6f00 2000 a9dc 2000 6200 6100 7200 (UTF-16)
Expected output: 66 6f 6f 20 ef bf bd 20 62 61 72 (UTF-8)
Actual output: 66 6f 6f 20 ef bf bd 20 62 61 72 (UTF-8)
Expected output: 6600 6f00 6f00 2000 fdff 2000 6200 6100 7200 (UTF-16)
Actual output: 6600 6f00 6f00 2000 fdff 2000 6200 6100 7200 (UTF-16)
Test passed.
Input: 6600 6f00 6f00 2000 3dd8 (UTF-16)
Expected output: 66 6f 6f 20 ef bf bd (UTF-8)
Actual output: 66 6f 6f 20 ef bf bd (UTF-8)
Expected output: 6600 6f00 6f00 2000 fdff (UTF-16)
Actual output: 6600 6f00 6f00 2000 fdff (UTF-16)
Test passed.
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment