Skip to content

Instantly share code, notes, and snippets.

@henrygab
Created April 28, 2020 22:47
Show Gist options
  • Save henrygab/92a7077e4cc9b3e6b098ecaa5fa2d908 to your computer and use it in GitHub Desktop.
Save henrygab/92a7077e4cc9b3e6b098ecaa5fa2d908 to your computer and use it in GitHub Desktop.
Brute-force validation of UTF-8 decoder
/*
* The MIT License (MIT)
*
* Copyright (c) 2020 Henry Gabryjelski
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
#include <inttypes.h>
#include <stdio.h>
#include <assert.h>
const uint32_t CODEPOINT_LARGEST_VALID_UTF8 = 0x10FFFF;
const uint32_t CODEPOINT_REPLACEMENT_CHARACTER = 0xFFFD;
const uint32_t CODEPOINT_WORD_JOINER = 0x2060;
const uint32_t CODEPOINT_LOWEST_SURROGATE_HALF = 0xD800;
const uint32_t CODEPOINT_HIGHEST_SURROGATE_HALF = 0xDFFF;
constexpr static inline bool isInvalidUtf8Octet(uint8_t t) {
// see bullets in https://tools.ietf.org/html/rfc3629#section-1
return (t == 0xc0) || (t == 0xC1) || ((t >= 0xF5) && (t <= 0xFF));
}
constexpr static inline bool isCodepointSurrogateHalf(uint32_t codepoint) {
// high and low surrogate halves (U+D800 through U+DFFF) used by UTF-16 are
// not legal Unicode values ... see RFC 3629.
return ((codepoint >= CODEPOINT_LOWEST_SURROGATE_HALF) && (codepoint <= CODEPOINT_HIGHEST_SURROGATE_HALF));
}
constexpr static inline bool isValidCodepointForUTF8(uint32_t codepoint) {
if (isCodepointSurrogateHalf(codepoint)) {
return false;
} else if (codepoint > CODEPOINT_LARGEST_VALID_UTF8) {
return false;
} else {
return true;
}
}
constexpr static inline int expectedEncodingLengthForUTF8(uint32_t codepoint) {
if (!isValidCodepointForUTF8(codepoint)) {
return -1;
} else if (codepoint <= 0x00007F) {
return 1;
} else if (codepoint <= 0x0007FF) {
return 2;
} else if (codepoint <= 0x00FFFF) {
return 3;
} else if (codepoint <= 0x10FFFF) {
static_assert(0x10FFFF == CODEPOINT_LARGEST_VALID_UTF8);
return 4;
} else {
return -1; // belt and suspenders for validity
}
}
/* Functions under test */
static int8_t utf8Codepoint(const uint8_t *utf8, uint32_t *codepointp)
{
*codepointp = 0xFFFD; // always initialize output to known value ... 0xFFFD (REPLACEMENT CHARACTER) seems the natural choice
int codepoint;
int len;
// The upper bits define both the length of additional bytes for the multi-byte encoding,
// as well as defining how many bits of the first byte are included in the codepoint.
// Each additional byte starts with 0b10xxxxxx, encoding six additional bits for the codepoint.
//
// For key summary points, see:
// * https://tools.ietf.org/html/rfc3629#section-3
//
// if ((utf8[0] == 0xEF) && (utf8[1] == 0xBB) && (utf8[2] == 0xBF)) {
// // If the caller does not exclude the BOM (which is legal for UTF-8), convert it to U+2060 WORD JOINER
// *codepointp = CODEPOINT_WORD_JOINER;
// return 3;
// }
if (isInvalidUtf8Octet(utf8[0])) { // do not allow illegal octet sequences (e.g., 0xC0 0x80 should NOT decode to NULL)
return -1;
}
if (utf8[0] < 0x80) { // characters 0000 0000..0000 007F (up to 7 significant bits)
len = 1;
codepoint = utf8[0];
} else if ((utf8[0] & 0xe0) == 0xc0) { // characters 0000 0080..0000 07FF (up to 11 significant bits, so first byte encodes five bits)
len = 2;
codepoint = utf8[0] & 0x1f;
} else if ((utf8[0] & 0xf0) == 0xe0) { // characters 0000 8000..0000 FFFF (up to 16 significant bits, so first byte encodes four bits)
len = 3;
codepoint = utf8[0] & 0x0f;
} else if ((utf8[0] & 0xf8) == 0xf0) { // characters 0001 0000..0010 FFFF (up to 21 significant bits, so first byte encodes three bits)
len = 4;
codepoint = utf8[0] & 0x07;
} else { // UTF-8 is defined to only map to Unicode -- 0x00000000..0x0010FFFF
// 5-byte and 6-byte sequences are not legal per RFC3629
return -1;
}
for (int i = 1; i < len; i++) {
if ((utf8[i] & 0xc0) != 0x80) {
// the additional bytes in a valid UTF-8 multi-byte encoding cannot have either of the top two bits set
// This is more restrictive than isInvalidUtf8Octet()
return -1;
}
codepoint <<= 6; // each continuation byte adds six bits to the codepoint
codepoint |= utf8[i] & 0x3f; // mask off the top two continuation bits, and add the six relevant bits
}
// explicit validation to prevent overlong encodings
if ( (len == 1) && ((codepoint < 0x000000) || (codepoint > 0x00007F))) {
return -1;
} else if ((len == 2) && ((codepoint < 0x000080) || (codepoint > 0x0007FF))) {
return -1;
} else if ((len == 3) && ((codepoint < 0x000800) || (codepoint > 0x00FFFF))) {
return -1;
} else if ((len == 4) && ((codepoint < 0x010000) || (codepoint > 0x10FFFF))) {
// "You might expect larger code points than U+10FFFF
// to be expressible, but Unicode is limited in Sections 12
// of RFC3629 to match the limits of UTF-16." -- Wikipedia UTF-8 note
// See https://tools.ietf.org/html/rfc3629#section-12
return -1;
}
// high and low surrogate halves (U+D800 through U+DFFF) used by UTF-16 are
// not legal Unicode values ... see RFC 3629.
if ((codepoint >= CODEPOINT_LOWEST_SURROGATE_HALF) && (codepoint <= CODEPOINT_HIGHEST_SURROGATE_HALF)) {
return -1;
}
*codepointp = codepoint;
return len;
}
/* How to test those? */
/*
1. From i = 0 through at least 0x10FFFF + 1 ....
2.
*/
uint8_t bufferUnderTest[8] = {0,0,0,0,0}; // up to four bytes + NULL
void PrintBufferUnderTest(int encodingLength, bool outputNewline = true)
{
printf("Buffer under test (length %d): %02x %02x %02x %02x %02x", encodingLength, bufferUnderTest[0], bufferUnderTest[1], bufferUnderTest[2], bufferUnderTest[3], bufferUnderTest[4]);
if (outputNewline) { printf("\n"); }
}
void ValidateDecoder()
{
// Simple! Just increment the four-byte values, and get the last code point.
// The decoder should ONLY decode 0x10FFFF total codepoints, and each one
// decoded should be exactly one greater than the prior decoded value.
// Exceptions:
// * Surrogate halves
// * Byte order mark (0xEF 0xBB 0xBF) --> 0x8288 (WORD JOINER)
uint32_t lastDecodedCodePoint = 0xFFFFFFFF; // invalid, and +1 will be zero
int8_t encodingLength = 1;
bool finished = false;
bool anyFailure = false;
while (!finished) {
uint32_t currentCodepoint = 0xFFFFFFFF;
int8_t len = utf8Codepoint(bufferUnderTest, &currentCodepoint);
// Check the result of attempt to decode
if (len < 0) {
// do nothing. This is a consequence of the optimized testing without having
// know if all inputs are valid.
} else if (len < encodingLength) {
// the reported encoded length is smaller ... this is not by itself a failure, and can be ignored
// as it was already handled by prior testing of the smaller length buffers.
} else if (len > encodingLength) {
assert(!"Reported success decoding, but length is greater than expected");
} else {
// function reported successful decoding.
// This is a failure unless one of the following two conditions hold:
bool isActuallyFailure = true;
if (lastDecodedCodePoint == (CODEPOINT_LOWEST_SURROGATE_HALF-1) && (CODEPOINT_HIGHEST_SURROGATE_HALF+1) == currentCodepoint) {
isActuallyFailure = false;
}
if (lastDecodedCodePoint + 1 == currentCodepoint) {
isActuallyFailure = false;
}
//PrintBufferUnderTest(encodingLength, false);
//printf(" ==> %08x (%" PRIu32 ") %s\n", currentCodepoint, currentCodepoint, (isActuallyFailure ? "(failure)" : ""));
// if (currentCodepoint == CODEPOINT_WORD_JOINER && bufferUnderTest[0] == 0xEF && bufferUnderTest[1] == 0xBB && bufferUnderTest[2] == 0xBE) {
// // Special-case ... byte order mark is allowed to convert to CODEPOINT_WORD_JOINER
// // This is neither failure (assertion) nor success (update of last decoded code point) ... should be ignored.
// } else
if (isActuallyFailure) {
printf("Failure: prior decoded code point %08x (%" PRIu32 "), this decoded code point is %08x (%" PRIu32 ")\n", lastDecodedCodePoint, lastDecodedCodePoint, currentCodepoint, currentCodepoint);
PrintBufferUnderTest(encodingLength);
assert(!"failure vs. last decoded");
anyFailure = true;
} else {
lastDecodedCodePoint = currentCodepoint;
}
}
if (true) { // update encodingLength / LOOP EXIT CONDITION
bool updateEncodingLength = true;
for (int i = 0; i < encodingLength; ++i) {
if (bufferUnderTest[i] != 0xFF) {
updateEncodingLength = false;
}
}
if (updateEncodingLength && encodingLength == 4) {
finished = true;
}
if (!finished && updateEncodingLength) {
if (anyFailure) {
finished = true;
}
printf("Updating encoding length from %d to %d\n", encodingLength, encodingLength+1);
bufferUnderTest[encodingLength] = 0xFF; // hack so incrementing uses common code
bufferUnderTest[0] = 0x00; // hack so does not test all the zero-
++encodingLength;
}
}
if (!finished) { // increment the byte(s) so all inputs are tested
bool showSomethingMore = false;
for (int i = encodingLength-1; (i >= 0); --i) {
bufferUnderTest[i] += 1;
if (bufferUnderTest[i] != 0x00) break; // only increment least significant byte(s)
if (encodingLength > 2 && i <= encodingLength-2) {
showSomethingMore = true;
}
}
if (showSomethingMore) {
printf("Now testing buffer of length %d: %02x %02x %02x %02x %02x\n", encodingLength, bufferUnderTest[0], bufferUnderTest[1], bufferUnderTest[2], bufferUnderTest[3], bufferUnderTest[4]);
}
}
}
if (lastDecodedCodePoint != CODEPOINT_LARGEST_VALID_UTF8) {
printf("Last decoded code point did not reach expected value (%08x != %08x)\r\n", lastDecodedCodePoint, CODEPOINT_LARGEST_VALID_UTF8);
assert(false);
}
printf("\nSUCCESS!\n");
}
void main(void) {
ValidateDecoder();
}
/* The function utf8Codepoint() comes from https://github.com/adafruit/Adafruit_TinyUSB_ArduinoCore
and requires the following notice be present, if this is considered a modified copy.
In an abundance of caution, the text is thus provided for reference purposes below.
* The MIT License (MIT)
*
* Copyright (c) 2019 Ha Thach for Adafruit Industries
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment