eao197/utf8_checker_speed.cpp

## utf8_checker_speed.cpp
#include <algorithm>
#include <array>
#include <chrono>
#include <iostream>
#include <span>
#include <string>
#include <string_view>
#include <cstdint>

namespace restinio
{

namespace utils
{

//
// utf8_checker_t
//

/*!
 * @brief Incremental validator and decoder for a stream of UTF-8 bytes.
 *
 * Bytes are fed one at a time through process_byte(). The object remembers
 * how far inside a multi-byte sequence it currently is and accumulates the
 * code point being decoded. Overlong encodings, the surrogate range
 * 0xD800..0xDFFF and values above 0x10FFFF are rejected.
 */
class utf8_checker_t
{
	//! Position inside the UTF-8 sequence that is being decoded.
	enum class state_t
	{
		wait_first_byte,
		wait_second_of_two,
		wait_second_of_three,
		wait_second_of_four,
		wait_third_of_three,
		wait_third_of_four,
		wait_fourth_of_four,
		invalid,
	};

	//! Code point accumulated so far.
	/*!
	 * Holds a complete symbol only when at least one byte has been
	 * accepted by process_byte() and the state is wait_first_byte.
	 */
	std::uint32_t m_current_symbol = 0u;

	//! Where the checker is inside the current sequence.
	state_t m_state{ state_t::wait_first_byte };

	//! Handles the leading byte of a sequence.
	void
	begin_sequence( std::uint8_t byte ) noexcept
	{
		if( byte < 0x80u )
		{
			// Plain ASCII: the symbol is complete after a single byte.
			m_current_symbol = byte;
			m_state = state_t::wait_first_byte;
			return;
		}

		if( (byte & 0xE0u) == 0xC0u )
		{
			// 110xxxxx: one continuation byte follows.
			m_current_symbol = (byte & 0x1Fu);
			m_state = state_t::wait_second_of_two;
			return;
		}

		if( (byte & 0xF0u) == 0xE0u )
		{
			// 1110xxxx: two continuation bytes follow.
			m_current_symbol = (byte & 0x0Fu);
			m_state = state_t::wait_second_of_three;
			return;
		}

		if( (byte & 0xF8u) == 0xF0u )
		{
			// 11110xxx: three continuation bytes follow.
			m_current_symbol = (byte & 0x07u);
			m_state = state_t::wait_second_of_four;
			return;
		}

		// Either a stray continuation byte (10xxxxxx) or a 11111xxx lead.
		// UTF-8 only covers 0x0000..0x10FFFF, which needs at most four
		// bytes, so longer lead patterns are not examined any further.
		//
		// See https://datatracker.ietf.org/doc/html/rfc3629
		m_state = state_t::invalid;
	}

	//! Folds the six payload bits of a continuation byte into the symbol.
	/*!
	 * @return false (after switching to the invalid state) when @a byte
	 * is not of the form 10xxxxxx. The accumulated symbol is left
	 * untouched in that case.
	 */
	[[nodiscard]]
	bool
	take_continuation( std::uint8_t byte ) noexcept
	{
		if( (byte & 0xC0u) != 0x80u )
		{
			m_state = state_t::invalid;
			return false;
		}

		m_current_symbol = (m_current_symbol << 6) | (byte & 0x3Fu);
		return true;
	}

	//! Handles a continuation byte that is not the last one of a sequence.
	void
	continue_sequence( std::uint8_t byte, state_t next ) noexcept
	{
		if( take_continuation( byte ) )
			m_state = next;
	}

	//! Handles the last byte of a sequence and validates the result.
	/*!
	 * The decoded value has to lie inside [lowest, highest]; anything
	 * below @a lowest is an overlong encoding. Surrogates
	 * (0xD800..0xDFFF) are never accepted.
	 */
	void
	finish_sequence(
		std::uint8_t byte,
		std::uint32_t lowest,
		std::uint32_t highest ) noexcept
	{
		if( !take_continuation( byte ) )
			return;

		const bool in_range =
				(lowest <= m_current_symbol) && (m_current_symbol <= highest);
		const bool surrogate =
				(0xD800u <= m_current_symbol) && (m_current_symbol <= 0xDFFFu);

		m_state = (in_range && !surrogate)
				? state_t::wait_first_byte
				: state_t::invalid;
	}

public:
	utf8_checker_t() = default;

	/*!
	 * Feeds the next byte of the stream into the checker.
	 *
	 * @note
	 * The value of the current symbol is meaningful only if
	 * process_byte() returned `true` and a subsequent call to
	 * finalized() returns `true` as well:
	 *
	 * @code
	 * utf8_checker_t checker;
	 * for( const auto ch : some_string )
	 * {
	 * 	if( checker.process_byte( ch ) )
	 * 	{
	 * 		if( checker.finalized() )
	 * 			process_unicode_symbol( checker.current_symbol() );
	 * 	}
	 * 	else
	 * 	{
	 * 		... // Invalid sequence found!
	 * 		break;
	 * 	}
	 * }
	 * @endcode
	 *
	 * @retval true the sequence is still valid and another byte can be
	 * passed to the next call of process_byte().
	 *
	 * @retval false the sequence is invalid; further calls keep
	 * returning false until reset() is called.
	 */
	[[nodiscard]]
	bool
	process_byte( std::uint8_t byte ) noexcept
	{
		switch( m_state )
		{
			case state_t::wait_first_byte:
				begin_sequence( byte );
			break;

			case state_t::wait_second_of_two:
				// Two bytes encode 0x0080..0x07FF.
				finish_sequence( byte, 0x0080u, 0x07FFu );
			break;

			case state_t::wait_second_of_three:
				continue_sequence( byte, state_t::wait_third_of_three );
			break;

			case state_t::wait_second_of_four:
				continue_sequence( byte, state_t::wait_third_of_four );
			break;

			case state_t::wait_third_of_three:
				// Three bytes encode 0x0800..0xFFFF.
				finish_sequence( byte, 0x0800u, 0xFFFFu );
			break;

			case state_t::wait_third_of_four:
				continue_sequence( byte, state_t::wait_fourth_of_four );
			break;

			case state_t::wait_fourth_of_four:
				// Four bytes encode 0x10000..0x10FFFF.
				finish_sequence( byte, 0x10000u, 0x10FFFFu );
			break;

			case state_t::invalid:
				// The invalid state is sticky.
			break;
		}

		return m_state != state_t::invalid;
	}

	/*!
	 * @return true if the checker stands on a symbol boundary, i.e. no
	 * multi-byte sequence is currently in progress.
	 */
	[[nodiscard]]
	bool
	finalized() const noexcept
	{
		return m_state == state_t::wait_first_byte;
	}

	/*!
	 * Returns the object to its initial state.
	 */
	void
	reset() noexcept
	{
		m_state = state_t::wait_first_byte;
		m_current_symbol = 0u;
	}

	/*!
	 * Get the value collected for the current symbol.
	 *
	 * @note
	 * The value is a complete symbol only if:
	 *
	 * - some bytes were successfully fed into process_byte();
	 * - finalized() returns `true`.
	 */
	[[nodiscard]]
	std::uint32_t
	current_symbol() const noexcept { return m_current_symbol; }
};

} /* namespace utils */

} /* namespace restinio */

namespace decode_2009
{

//! DFA state reached when a complete, valid code point has been decoded.
constexpr std::uint32_t utf8_accept = 0;
//! DFA state reached on invalid input; every transition from it leads back to it.
constexpr std::uint32_t utf8_reject = 1;

//! Combined lookup table of the decoder.
/*!
 * The first 256 entries map an input byte to its character class. The
 * remaining entries are the transition table: 9 states with 16 slots
 * each, addressed as `256 + state*16 + class`.
 */
static constexpr std::uint8_t utf8d[] = {
	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 00..1f
	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 20..3f
	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 40..5f
	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 60..7f
	1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, // 80..9f
	7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, // a0..bf
	8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, // c0..df
	0xa,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x4,0x3,0x3, // e0..ef
	0xb,0x6,0x6,0x6,0x5,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8, // f0..ff
	0x0,0x1,0x2,0x3,0x5,0x8,0x7,0x1,0x1,0x1,0x4,0x6,0x1,0x1,0x1,0x1, // s0..s0
	1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,1,1,1,1,1,0,1,0,1,1,1,1,1,1, // s1..s2
	1,2,1,1,1,1,1,2,1,2,1,1,1,1,1,1,1,1,1,1,1,1,1,2,1,1,1,1,1,1,1,1, // s3..s4
	1,2,1,1,1,1,1,1,1,2,1,1,1,1,1,1,1,1,1,1,1,1,1,3,1,3,1,1,1,1,1,1, // s5..s6
	1,3,1,1,1,1,1,3,1,3,1,1,1,1,1,1,1,3,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // s7..s8
};

static_assert( sizeof(utf8d) == 256 + 9 * 16,
		"utf8d must hold 256 byte classes and 9 states of 16 transitions" );

/*!
 * @brief Advances the decoder by one byte.
 *
 * @param state In/out: current DFA state; start with utf8_accept.
 * @param codep In/out: code point being accumulated. It is overwritten
 * (not read) when @a state is utf8_accept on entry, and holds a complete
 * code point whenever the returned state is utf8_accept.
 * @param byte The next input byte. Must be in 0..255 because it indexes
 * the class part of the table directly.
 *
 * @return The new value of @a state.
 */
inline std::uint32_t
decode(std::uint32_t* state, std::uint32_t* codep, std::uint32_t byte) {
	const std::uint32_t type = utf8d[byte];

	// A continuation byte contributes its low six bits; a leading byte
	// contributes whatever bits its class leaves after the length marker.
	*codep = (*state != utf8_accept)
		? (byte & 0x3fu) | (*codep << 6)
		: (0xffu >> type) & byte;

	*state = utf8d[256 + *state*16 + type];
	return *state;
}

} /* namespace decode_2009 */

namespace decode_2010
{

//! DFA state reached when a complete, valid code point has been decoded.
constexpr std::uint32_t utf8_accept = 0;
//! DFA state reached on invalid input; every transition from it leads back to it.
/*!
 * States are stored pre-multiplied by 12 (the number of character
 * classes), which is why the second state has the value 12.
 */
constexpr std::uint32_t utf8_reject = 12;

//! Combined lookup table of the decoder (256 byte classes + 9*12 transitions).
static constexpr std::uint8_t utf8d[] = {
	// The first part of the table maps bytes to character classes to
	// reduce the size of the transition table and create bitmasks.
	 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
	 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
	 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
	 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
	 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,  9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,
	 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,  7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,
	 8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2,  2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
	10,3,3,3,3,3,3,3,3,3,3,3,3,4,3,3, 11,6,6,6,5,8,8,8,8,8,8,8,8,8,8,8,

	// The second part is a transition table that maps a combination
	// of a state of the automaton and a character class to a state.
	 0,12,24,36,60,96,84,12,12,12,48,72, 12,12,12,12,12,12,12,12,12,12,12,12,
	12, 0,12,12,12,12,12, 0,12, 0,12,12, 12,24,12,12,12,12,12,24,12,24,12,12,
	12,12,12,12,12,12,12,24,12,12,12,12, 12,24,12,12,12,12,12,12,12,24,12,12,
	12,12,12,12,12,12,12,36,12,36,12,12, 12,36,12,12,12,12,12,36,12,36,12,12,
	12,36,12,12,12,12,12,12,12,12,12,12,
};

static_assert( sizeof(utf8d) == 256 + 9 * 12,
		"utf8d must hold 256 byte classes and 9 states of 12 transitions" );

/*!
 * @brief Advances the decoder by one byte.
 *
 * @param state In/out: current DFA state; start with utf8_accept.
 * @param codep In/out: code point being accumulated. It is overwritten
 * (not read) when @a state is utf8_accept on entry, and holds a complete
 * code point whenever the returned state is utf8_accept.
 * @param byte The next input byte. Must be in 0..255 because it indexes
 * the class part of the table directly.
 *
 * @return The new value of @a state.
 */
inline std::uint32_t
decode(std::uint32_t* state, std::uint32_t* codep, std::uint32_t byte) {
	const std::uint32_t type = utf8d[byte];

	// A continuation byte contributes its low six bits; a leading byte
	// contributes whatever bits its class leaves after the length marker.
	*codep = (*state != utf8_accept)
		? (byte & 0x3fu) | (*codep << 6)
		: (0xffu >> type) & byte;

	// The state is already a row offset, so no multiplication is needed.
	*state = utf8d[256 + *state + type];
	return *state;
}

} /* namespace decode_2010 */

/*!
 * @brief Validates @a str as UTF-8 using restinio's utf8_checker_t.
 *
 * Every completely decoded code point is added to @a out, so that the
 * decoding work has an observable result.
 *
 * @return true only if every byte was accepted and the input does not
 * end in the middle of a multi-byte sequence.
 */
bool
check_validity_with_restinio(std::string_view str, std::uint32_t & out)
{
	restinio::utils::utf8_checker_t checker;
	for( const char ch : str )
	{
		// Go through unsigned char so the conversion does not depend on
		// whether plain char is signed.
		const auto byte = static_cast<std::uint8_t>(
				static_cast<unsigned char>(ch) );

		if( !checker.process_byte( byte ) )
			return false;

		if( checker.finalized() )
			out += checker.current_symbol();
	}

	// Input that stops inside a multi-byte sequence is not valid UTF-8.
	return checker.finalized();
}

/*!
 * @brief Validates @a str as UTF-8 using decode_2009::decode().
 *
 * Every completely decoded code point is added to @a out, so that the
 * decoding work has an observable result.
 *
 * @return true only if no byte was rejected and the decoder is back in
 * the accept state at the end of the input (no truncated sequence).
 */
bool
check_validity_with_decode_2009(std::string_view str, std::uint32_t & out)
{
	std::uint32_t state = decode_2009::utf8_accept;
	std::uint32_t code_point = 0u;
	for( const char ch : str )
	{
		const auto byte = static_cast<std::uint32_t>(
				static_cast<unsigned char>(ch) );

		switch( decode_2009::decode( &state, &code_point, byte ) )
		{
		case decode_2009::utf8_accept:
			out += code_point;
		break;

		case decode_2009::utf8_reject:
			return false;

		default:
			// In the middle of a multi-byte sequence: keep feeding bytes.
		break;
		}
	}

	// Input that stops inside a multi-byte sequence is not valid UTF-8.
	return decode_2009::utf8_accept == state;
}

/*!
 * @brief Validates @a str as UTF-8 using decode_2010::decode().
 *
 * Every completely decoded code point is added to @a out, so that the
 * decoding work has an observable result.
 *
 * @return true only if no byte was rejected and the decoder is back in
 * the accept state at the end of the input (no truncated sequence).
 */
bool
check_validity_with_decode_2010(std::string_view str, std::uint32_t & out)
{
	std::uint32_t state = decode_2010::utf8_accept;
	std::uint32_t code_point = 0u;
	for( const char ch : str )
	{
		const auto byte = static_cast<std::uint32_t>(
				static_cast<unsigned char>(ch) );

		switch( decode_2010::decode( &state, &code_point, byte ) )
		{
		case decode_2010::utf8_accept:
			out += code_point;
		break;

		case decode_2010::utf8_reject:
			return false;

		default:
			// In the middle of a multi-byte sequence: keep feeding bytes.
		break;
		}
	}

	// Input that stops inside a multi-byte sequence is not valid UTF-8.
	return decode_2010::utf8_accept == state;
}

/*!
 * @brief Runs @a checker over @a str @a loops times.
 *
 * The accumulator handed to the checker is zeroed before every pass, so
 * the returned pair describes the last pass only.
 *
 * @return { verdict of the last pass, accumulator of the last pass };
 * { false, 0 } when @a loops is zero.
 */
template< typename Checker >
std::pair< bool, std::uint32_t >
checking_loop(
	Checker && checker,
	unsigned int loops,
	std::string_view str )
{
	std::pair< bool, std::uint32_t > last_pass{ false, 0u };

	for( unsigned int remaining = loops; remaining != 0u; --remaining )
	{
		last_pass.second = 0u;
		last_pass.first = checker( str, last_pass.second );
	}

	return last_pass;
}

/*!
 * @brief Scope timer: prints the lifetime of the object to std::cout.
 *
 * The clock is read in the constructor and again in the destructor; the
 * difference is reported in microseconds under the given name.
 *
 * std::chrono::steady_clock is used because it is guaranteed to be
 * monotonic, which is what interval measurement needs.
 *
 * The object is neither copyable nor movable: every instance reports
 * exactly once.
 */
class duration_meter
{
	using clock_type = std::chrono::steady_clock;

	//! Label printed with the measurement. Not owned; must outlive the object.
	const char * _name;
	const clock_type::time_point _started_at;

public:
	explicit duration_meter( const char * name )
		: _name{ name }
		, _started_at{ clock_type::now() }
	{}

	duration_meter( const duration_meter & ) = delete;
	duration_meter & operator=( const duration_meter & ) = delete;

	~duration_meter()
	{
		const auto finished_at = clock_type::now();

		std::cout << "*** " << _name << ": "
			<< std::chrono::duration_cast<std::chrono::microseconds>(
				finished_at - _started_at ).count()
			<< "us *** " << std::endl;
	}
};

/*!
 * @brief Invokes @a lambda while a duration_meter labelled @a name is alive.
 *
 * @return Whatever @a lambda returns, with its value category preserved.
 */
template<typename Lambda>
decltype(auto)
measure( const char * name, Lambda && lambda )
{
	// The stopwatch goes out of scope only after the call below has
	// produced its result, so the report covers the whole invocation.
	duration_meter stopwatch{ name };

	return lambda();
}

/*!
 * @brief Benchmark driver.
 *
 * Runs each of the three validators over the same sample text a fixed
 * number of times, prints the elapsed time of every run (via measure())
 * and finally prints the verdict and checksum each validator produced.
 */
int main()
{
	// Number of passes every validator makes over the sample.
	constexpr unsigned int passes = 100'000u;

	// Mostly Cyrillic prose mixed with ASCII and two emoji. Assuming the
	// file is compiled as UTF-8, this exercises 1-, 2- and 4-byte sequences.
	const std::string_view sample{
		"В последний раз статья, целиком посвященная открытому проекту RESTinio, вышла "
		"на Хабре в декабре 2020-го года, без малого три года назад. Это был рассказ "
		"о релизе версии 0.6.13. По сути, это был последний релиз, в котором в "
		"RESTinio появилось что-то новое и важное. Потом были только небольшие "
		"корректирующие релизы, исправляющие ошибки или адаптирующие RESTinio к "
		"свежим версиям зависимостей. "
		" "
		"И вот спустя три года нам удалось выпустить новое существенное обновление. А "
		"посему есть повод поговорить о том, что было удалено/добавлено/изменено в этой "
		"версии. Ну и о причинах паузы в развитии и перспективах проекта вообще. "
		" "
		"Кому интересно, милости прошу под кат. "
		" "
		"Для тех же, кто про данную разработку слышит в первый раз: это наша попытка "
		"сделать встраиваемый в C++ приложения HTTP(S)/WebSocket сервер, который бы "
		"обладал и большой гибкостью, и нормальной производительностью, освобождал бы "
		"пользователя от рутины, но не прятал бы абсолютно все детали 'под капот', и "
		"удовлетворял бы нашим представлениям о том, как подобные вещи должны "
		"выглядеть... "
		" "
		"Вроде бы получилось. Мне кажется, что раз уж RESTinio сумел набрать тысячу "
		"звезд на GitHub, результат понравился и пригодился не только нам. Впрочем, это "
		"уже совсем другая история. Давайте вернемся к рассказу об изменениях в версии "
		"0.7.0 и к тому, почему этих изменений пришлось ждать так долго... "
		" "
		"Что нового в 0.7.0 "
		"Переход на C++17 "
		"В версии 0.7.0 мы перешли с C++14 на C++17. Вероятно, это не самое лучшее из "
		"наших решений, ведь кто-то все еще вынужден оставаться на C++14 не имея "
		"возможности обновиться до C++17, однако мы для себя больше не видели смысла "
		"держаться за C++14. "
		" "
		"Выгода от перехода на C++17 заключалась прежде всего в том, что удалось "
		"избавиться от таких зависимостей, как optional-lite, string_view-lite и "
		"variant-lite, т.к. теперь это все доступно в стандартной библиотеке. Так что "
		"остается сказать большое спасибо Martin Moene за его труд по написанию и "
		"сопровождению этих библиотек, они нам здорово помогали в течении шести лет, но "
		"дальше мы пойдем с stdlib 🙂 "
		" "
		"Хотя осталась зависимость от expected-lite, но с ней придется жить еще долго. "
		"Если уж мы на 17-ые плюсы перебрались только в 2023-ем, то перехода на C++23 "
		"нужно будет подождать еще лет пять-шесть, а то и девять-десять 😆 "
		" "
		"Выгода от 17-го стандарта проявилась еще и в том, что в ряде мест мы смогли "
		"выбросить сложные (и не очень) шаблонные конструкции в пользу простых if "
		"constexpr и fold expressions. "
		" "
		"Так что дальше пойдем уже в рамках C++17. Если кого-то это расстраивает, то уж "
		"простите за откровенность, но за поддержку C++14 нам никто не платит. "
		" "
		"Переход на llhttp, Catch2 v3 и modern CMake "
		"Изначально RESTinio использовал nodejs/http-parser в качестве парсера "
		"HTTP-запросов. Но несколько лет назад его развитие и поддержка прекратились. "
		"Посему в версии 0.7.0 мы переехали на nodejs/llhttp. Собственно, этот переезд и "
		"был главной мотивацией для выпуска версии 0.7.0. "
		" "
		"Заодно мы обновили у себя Catch2. Эта библиотека начиная с версии 3.0 уже не "
		"является header-only и требует компиляции. "
	};

	// Each validator is timed separately; the order of the runs is part
	// of the printed output.
	const auto restinio_run = measure( "   restinio", [&]() {
			return checking_loop(
					check_validity_with_restinio, passes, sample );
		} );

	const auto decode_2009_run = measure( "decode_2009", [&]() {
			return checking_loop(
					check_validity_with_decode_2009, passes, sample );
		} );

	const auto decode_2010_run = measure( "decode_2010", [&]() {
			return checking_loop(
					check_validity_with_decode_2010, passes, sample );
		} );

	// Prints "<verdict> <checksum>" for one run.
	const auto report = []( const auto & run ) {
		std::cout << run.first << " " << run.second << std::endl;
	};

	report( restinio_run );
	report( decode_2009_run );
	report( decode_2010_run );
}
	#include <algorithm>
	#include <array>
	#include <chrono>
	#include <iostream>
	#include <span>
	#include <string>
	#include <string_view>
	#include <cstdint>

	namespace restinio
	{

	namespace utils
	{

	//
	// utf8_checker_t
	//

	/*!
	 * @brief Incremental validator and decoder for a stream of UTF-8 bytes.
	 *
	 * Bytes are fed one at a time through process_byte(). The object remembers
	 * how far inside a multi-byte sequence it currently is and accumulates the
	 * code point being decoded. Overlong encodings, the surrogate range
	 * 0xD800..0xDFFF and values above 0x10FFFF are rejected.
	 */
	class utf8_checker_t
	{
		//! Position inside the UTF-8 sequence that is being decoded.
		enum class state_t
		{
			wait_first_byte,
			wait_second_of_two,
			wait_second_of_three,
			wait_second_of_four,
			wait_third_of_three,
			wait_third_of_four,
			wait_fourth_of_four,
			invalid,
		};

		//! Code point accumulated so far.
		/*!
		 * Holds a complete symbol only when at least one byte has been
		 * accepted by process_byte() and the state is wait_first_byte.
		 */
		std::uint32_t m_current_symbol = 0u;

		//! Where the checker is inside the current sequence.
		state_t m_state{ state_t::wait_first_byte };

		//! Handles the leading byte of a sequence.
		void
		begin_sequence( std::uint8_t byte ) noexcept
		{
			if( byte < 0x80u )
			{
				// Plain ASCII: the symbol is complete after a single byte.
				m_current_symbol = byte;
				m_state = state_t::wait_first_byte;
				return;
			}

			if( (byte & 0xE0u) == 0xC0u )
			{
				// 110xxxxx: one continuation byte follows.
				m_current_symbol = (byte & 0x1Fu);
				m_state = state_t::wait_second_of_two;
				return;
			}

			if( (byte & 0xF0u) == 0xE0u )
			{
				// 1110xxxx: two continuation bytes follow.
				m_current_symbol = (byte & 0x0Fu);
				m_state = state_t::wait_second_of_three;
				return;
			}

			if( (byte & 0xF8u) == 0xF0u )
			{
				// 11110xxx: three continuation bytes follow.
				m_current_symbol = (byte & 0x07u);
				m_state = state_t::wait_second_of_four;
				return;
			}

			// Either a stray continuation byte (10xxxxxx) or a 11111xxx lead.
			// UTF-8 only covers 0x0000..0x10FFFF, which needs at most four
			// bytes, so longer lead patterns are not examined any further.
			//
			// See https://datatracker.ietf.org/doc/html/rfc3629
			m_state = state_t::invalid;
		}

		//! Folds the six payload bits of a continuation byte into the symbol.
		/*!
		 * @return false (after switching to the invalid state) when @a byte
		 * is not of the form 10xxxxxx. The accumulated symbol is left
		 * untouched in that case.
		 */
		[[nodiscard]]
		bool
		take_continuation( std::uint8_t byte ) noexcept
		{
			if( (byte & 0xC0u) != 0x80u )
			{
				m_state = state_t::invalid;
				return false;
			}

			m_current_symbol = (m_current_symbol << 6) | (byte & 0x3Fu);
			return true;
		}

		//! Handles a continuation byte that is not the last one of a sequence.
		void
		continue_sequence( std::uint8_t byte, state_t next ) noexcept
		{
			if( take_continuation( byte ) )
				m_state = next;
		}

		//! Handles the last byte of a sequence and validates the result.
		/*!
		 * The decoded value has to lie inside [lowest, highest]; anything
		 * below @a lowest is an overlong encoding. Surrogates
		 * (0xD800..0xDFFF) are never accepted.
		 */
		void
		finish_sequence(
			std::uint8_t byte,
			std::uint32_t lowest,
			std::uint32_t highest ) noexcept
		{
			if( !take_continuation( byte ) )
				return;

			const bool in_range =
					(lowest <= m_current_symbol) && (m_current_symbol <= highest);
			const bool surrogate =
					(0xD800u <= m_current_symbol) && (m_current_symbol <= 0xDFFFu);

			m_state = (in_range && !surrogate)
					? state_t::wait_first_byte
					: state_t::invalid;
		}

	public:
		utf8_checker_t() = default;

		/*!
		 * Feeds the next byte of the stream into the checker.
		 *
		 * @note
		 * The value of the current symbol is meaningful only if
		 * process_byte() returned `true` and a subsequent call to
		 * finalized() returns `true` as well:
		 *
		 * @code
		 * utf8_checker_t checker;
		 * for( const auto ch : some_string )
		 * {
		 * 	if( checker.process_byte( ch ) )
		 * 	{
		 * 		if( checker.finalized() )
		 * 			process_unicode_symbol( checker.current_symbol() );
		 * 	}
		 * 	else
		 * 	{
		 * 		... // Invalid sequence found!
		 * 		break;
		 * 	}
		 * }
		 * @endcode
		 *
		 * @retval true the sequence is still valid and another byte can be
		 * passed to the next call of process_byte().
		 *
		 * @retval false the sequence is invalid; further calls keep
		 * returning false until reset() is called.
		 */
		[[nodiscard]]
		bool
		process_byte( std::uint8_t byte ) noexcept
		{
			switch( m_state )
			{
				case state_t::wait_first_byte:
					begin_sequence( byte );
				break;

				case state_t::wait_second_of_two:
					// Two bytes encode 0x0080..0x07FF.
					finish_sequence( byte, 0x0080u, 0x07FFu );
				break;

				case state_t::wait_second_of_three:
					continue_sequence( byte, state_t::wait_third_of_three );
				break;

				case state_t::wait_second_of_four:
					continue_sequence( byte, state_t::wait_third_of_four );
				break;

				case state_t::wait_third_of_three:
					// Three bytes encode 0x0800..0xFFFF.
					finish_sequence( byte, 0x0800u, 0xFFFFu );
				break;

				case state_t::wait_third_of_four:
					continue_sequence( byte, state_t::wait_fourth_of_four );
				break;

				case state_t::wait_fourth_of_four:
					// Four bytes encode 0x10000..0x10FFFF.
					finish_sequence( byte, 0x10000u, 0x10FFFFu );
				break;

				case state_t::invalid:
					// The invalid state is sticky.
				break;
			}

			return m_state != state_t::invalid;
		}

		/*!
		 * @return true if the checker stands on a symbol boundary, i.e. no
		 * multi-byte sequence is currently in progress.
		 */
		[[nodiscard]]
		bool
		finalized() const noexcept
		{
			return m_state == state_t::wait_first_byte;
		}

		/*!
		 * Returns the object to its initial state.
		 */
		void
		reset() noexcept
		{
			m_state = state_t::wait_first_byte;
			m_current_symbol = 0u;
		}

		/*!
		 * Get the value collected for the current symbol.
		 *
		 * @note
		 * The value is a complete symbol only if:
		 *
		 * - some bytes were successfully fed into process_byte();
		 * - finalized() returns `true`.
		 */
		[[nodiscard]]
		std::uint32_t
		current_symbol() const noexcept { return m_current_symbol; }
	};

	} /* namespace utils */

	} /* namespace restinio */

	namespace decode_2009
	{

	//! DFA state reached when a complete, valid code point has been decoded.
	constexpr std::uint32_t utf8_accept = 0;
	//! DFA state reached on invalid input; every transition from it leads back to it.
	constexpr std::uint32_t utf8_reject = 1;

	//! Combined lookup table of the decoder.
	/*!
	 * The first 256 entries map an input byte to its character class. The
	 * remaining entries are the transition table: 9 states with 16 slots
	 * each, addressed as `256 + state*16 + class`.
	 */
	static constexpr std::uint8_t utf8d[] = {
		0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 00..1f
		0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 20..3f
		0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 40..5f
		0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 60..7f
		1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, // 80..9f
		7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, // a0..bf
		8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, // c0..df
		0xa,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x4,0x3,0x3, // e0..ef
		0xb,0x6,0x6,0x6,0x5,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8, // f0..ff
		0x0,0x1,0x2,0x3,0x5,0x8,0x7,0x1,0x1,0x1,0x4,0x6,0x1,0x1,0x1,0x1, // s0..s0
		1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,1,1,1,1,1,0,1,0,1,1,1,1,1,1, // s1..s2
		1,2,1,1,1,1,1,2,1,2,1,1,1,1,1,1,1,1,1,1,1,1,1,2,1,1,1,1,1,1,1,1, // s3..s4
		1,2,1,1,1,1,1,1,1,2,1,1,1,1,1,1,1,1,1,1,1,1,1,3,1,3,1,1,1,1,1,1, // s5..s6
		1,3,1,1,1,1,1,3,1,3,1,1,1,1,1,1,1,3,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // s7..s8
	};

	static_assert( sizeof(utf8d) == 256 + 9 * 16,
			"utf8d must hold 256 byte classes and 9 states of 16 transitions" );

	/*!
	 * @brief Advances the decoder by one byte.
	 *
	 * @param state In/out: current DFA state; start with utf8_accept.
	 * @param codep In/out: code point being accumulated. It is overwritten
	 * (not read) when @a state is utf8_accept on entry, and holds a complete
	 * code point whenever the returned state is utf8_accept.
	 * @param byte The next input byte. Must be in 0..255 because it indexes
	 * the class part of the table directly.
	 *
	 * @return The new value of @a state.
	 */
	inline std::uint32_t
	decode(std::uint32_t* state, std::uint32_t* codep, std::uint32_t byte) {
		const std::uint32_t type = utf8d[byte];

		// A continuation byte contributes its low six bits; a leading byte
		// contributes whatever bits its class leaves after the length marker.
		*codep = (*state != utf8_accept)
			? (byte & 0x3fu) | (*codep << 6)
			: (0xffu >> type) & byte;

		*state = utf8d[256 + *state*16 + type];
		return *state;
	}

	} /* namespace decode_2009 */

	namespace decode_2010
	{

	//! DFA state reached when a complete, valid code point has been decoded.
	constexpr std::uint32_t utf8_accept = 0;
	//! DFA state reached on invalid input; every transition from it leads back to it.
	/*!
	 * States are stored pre-multiplied by 12 (the number of character
	 * classes), which is why the second state has the value 12.
	 */
	constexpr std::uint32_t utf8_reject = 12;

	//! Combined lookup table of the decoder (256 byte classes + 9*12 transitions).
	static constexpr std::uint8_t utf8d[] = {
		// The first part of the table maps bytes to character classes to
		// reduce the size of the transition table and create bitmasks.
		 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
		 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
		 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
		 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
		 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,  9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,
		 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,  7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,
		 8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2,  2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
		10,3,3,3,3,3,3,3,3,3,3,3,3,4,3,3, 11,6,6,6,5,8,8,8,8,8,8,8,8,8,8,8,

		// The second part is a transition table that maps a combination
		// of a state of the automaton and a character class to a state.
		 0,12,24,36,60,96,84,12,12,12,48,72, 12,12,12,12,12,12,12,12,12,12,12,12,
		12, 0,12,12,12,12,12, 0,12, 0,12,12, 12,24,12,12,12,12,12,24,12,24,12,12,
		12,12,12,12,12,12,12,24,12,12,12,12, 12,24,12,12,12,12,12,12,12,24,12,12,
		12,12,12,12,12,12,12,36,12,36,12,12, 12,36,12,12,12,12,12,36,12,36,12,12,
		12,36,12,12,12,12,12,12,12,12,12,12,
	};

	static_assert( sizeof(utf8d) == 256 + 9 * 12,
			"utf8d must hold 256 byte classes and 9 states of 12 transitions" );

	/*!
	 * @brief Advances the decoder by one byte.
	 *
	 * @param state In/out: current DFA state; start with utf8_accept.
	 * @param codep In/out: code point being accumulated. It is overwritten
	 * (not read) when @a state is utf8_accept on entry, and holds a complete
	 * code point whenever the returned state is utf8_accept.
	 * @param byte The next input byte. Must be in 0..255 because it indexes
	 * the class part of the table directly.
	 *
	 * @return The new value of @a state.
	 */
	inline std::uint32_t
	decode(std::uint32_t* state, std::uint32_t* codep, std::uint32_t byte) {
		const std::uint32_t type = utf8d[byte];

		// A continuation byte contributes its low six bits; a leading byte
		// contributes whatever bits its class leaves after the length marker.
		*codep = (*state != utf8_accept)
			? (byte & 0x3fu) | (*codep << 6)
			: (0xffu >> type) & byte;

		// The state is already a row offset, so no multiplication is needed.
		*state = utf8d[256 + *state + type];
		return *state;
	}

	} /* namespace decode_2010 */

	/*!
	 * @brief Validates @a str as UTF-8 using restinio's utf8_checker_t.
	 *
	 * Every completely decoded code point is added to @a out, so that the
	 * decoding work has an observable result.
	 *
	 * @return true only if every byte was accepted and the input does not
	 * end in the middle of a multi-byte sequence.
	 */
	bool
	check_validity_with_restinio(std::string_view str, std::uint32_t & out)
	{
		restinio::utils::utf8_checker_t checker;
		for( const char ch : str )
		{
			// Go through unsigned char so the conversion does not depend on
			// whether plain char is signed.
			const auto byte = static_cast<std::uint8_t>(
					static_cast<unsigned char>(ch) );

			if( !checker.process_byte( byte ) )
				return false;

			if( checker.finalized() )
				out += checker.current_symbol();
		}

		// Input that stops inside a multi-byte sequence is not valid UTF-8.
		return checker.finalized();
	}

	/*!
	 * @brief Validates @a str as UTF-8 using decode_2009::decode().
	 *
	 * Every completely decoded code point is added to @a out, so that the
	 * decoding work has an observable result.
	 *
	 * @return true only if no byte was rejected and the decoder is back in
	 * the accept state at the end of the input (no truncated sequence).
	 */
	bool
	check_validity_with_decode_2009(std::string_view str, std::uint32_t & out)
	{
		std::uint32_t state = decode_2009::utf8_accept;
		std::uint32_t code_point = 0u;
		for( const char ch : str )
		{
			const auto byte = static_cast<std::uint32_t>(
					static_cast<unsigned char>(ch) );

			switch( decode_2009::decode( &state, &code_point, byte ) )
			{
			case decode_2009::utf8_accept:
				out += code_point;
			break;

			case decode_2009::utf8_reject:
				return false;

			default:
				// In the middle of a multi-byte sequence: keep feeding bytes.
			break;
			}
		}

		// Input that stops inside a multi-byte sequence is not valid UTF-8.
		return decode_2009::utf8_accept == state;
	}

	/*!
	 * @brief Validates @a str as UTF-8 using decode_2010::decode().
	 *
	 * Every completely decoded code point is added to @a out, so that the
	 * decoding work has an observable result.
	 *
	 * @return true only if no byte was rejected and the decoder is back in
	 * the accept state at the end of the input (no truncated sequence).
	 */
	bool
	check_validity_with_decode_2010(std::string_view str, std::uint32_t & out)
	{
		std::uint32_t state = decode_2010::utf8_accept;
		std::uint32_t code_point = 0u;
		for( const char ch : str )
		{
			const auto byte = static_cast<std::uint32_t>(
					static_cast<unsigned char>(ch) );

			switch( decode_2010::decode( &state, &code_point, byte ) )
			{
			case decode_2010::utf8_accept:
				out += code_point;
			break;

			case decode_2010::utf8_reject:
				return false;

			default:
				// In the middle of a multi-byte sequence: keep feeding bytes.
			break;
			}
		}

		// Input that stops inside a multi-byte sequence is not valid UTF-8.
		return decode_2010::utf8_accept == state;
	}

	/*!
	 * @brief Runs @a checker over @a str @a loops times.
	 *
	 * The accumulator handed to the checker is zeroed before every pass, so
	 * the returned pair describes the last pass only.
	 *
	 * @return { verdict of the last pass, accumulator of the last pass };
	 * { false, 0 } when @a loops is zero.
	 */
	template< typename Checker >
	std::pair< bool, std::uint32_t >
	checking_loop(
		Checker && checker,
		unsigned int loops,
		std::string_view str )
	{
		std::pair< bool, std::uint32_t > last_pass{ false, 0u };

		for( unsigned int remaining = loops; remaining != 0u; --remaining )
		{
			last_pass.second = 0u;
			last_pass.first = checker( str, last_pass.second );
		}

		return last_pass;
	}

	/*!
	 * @brief Scope timer: prints the lifetime of the object to std::cout.
	 *
	 * The clock is read in the constructor and again in the destructor; the
	 * difference is reported in microseconds under the given name.
	 *
	 * std::chrono::steady_clock is used because it is guaranteed to be
	 * monotonic, which is what interval measurement needs.
	 *
	 * The object is neither copyable nor movable: every instance reports
	 * exactly once.
	 */
	class duration_meter
	{
		using clock_type = std::chrono::steady_clock;

		//! Label printed with the measurement. Not owned; must outlive the object.
		const char * _name;
		const clock_type::time_point _started_at;

	public:
		explicit duration_meter( const char * name )
			: _name{ name }
			, _started_at{ clock_type::now() }
		{}

		duration_meter( const duration_meter & ) = delete;
		duration_meter & operator=( const duration_meter & ) = delete;

		~duration_meter()
		{
			const auto finished_at = clock_type::now();

			std::cout << "*** " << _name << ": "
				<< std::chrono::duration_cast<std::chrono::microseconds>(
					finished_at - _started_at ).count()
				<< "us *** " << std::endl;
		}
	};

	/*!
	 * @brief Invokes @a lambda while a duration_meter labelled @a name is alive.
	 *
	 * @return Whatever @a lambda returns, with its value category preserved.
	 */
	template<typename Lambda>
	decltype(auto)
	measure( const char * name, Lambda && lambda )
	{
		// The stopwatch goes out of scope only after the call below has
		// produced its result, so the report covers the whole invocation.
		duration_meter stopwatch{ name };

		return lambda();
	}

	/*!
	 * @brief Benchmark driver.
	 *
	 * Runs each of the three validators over the same sample text a fixed
	 * number of times, prints the elapsed time of every run (via measure())
	 * and finally prints the verdict and checksum each validator produced.
	 */
	int main()
	{
		// Number of passes every validator makes over the sample.
		constexpr unsigned int passes = 100'000u;

		// Mostly Cyrillic prose mixed with ASCII and two emoji. Assuming the
		// file is compiled as UTF-8, this exercises 1-, 2- and 4-byte sequences.
		const std::string_view sample{
			"В последний раз статья, целиком посвященная открытому проекту RESTinio, вышла "
			"на Хабре в декабре 2020-го года, без малого три года назад. Это был рассказ "
			"о релизе версии 0.6.13. По сути, это был последний релиз, в котором в "
			"RESTinio появилось что-то новое и важное. Потом были только небольшие "
			"корректирующие релизы, исправляющие ошибки или адаптирующие RESTinio к "
			"свежим версиям зависимостей. "
			" "
			"И вот спустя три года нам удалось выпустить новое существенное обновление. А "
			"посему есть повод поговорить о том, что было удалено/добавлено/изменено в этой "
			"версии. Ну и о причинах паузы в развитии и перспективах проекта вообще. "
			" "
			"Кому интересно, милости прошу под кат. "
			" "
			"Для тех же, кто про данную разработку слышит в первый раз: это наша попытка "
			"сделать встраиваемый в C++ приложения HTTP(S)/WebSocket сервер, который бы "
			"обладал и большой гибкостью, и нормальной производительностью, освобождал бы "
			"пользователя от рутины, но не прятал бы абсолютно все детали 'под капот', и "
			"удовлетворял бы нашим представлениям о том, как подобные вещи должны "
			"выглядеть... "
			" "
			"Вроде бы получилось. Мне кажется, что раз уж RESTinio сумел набрать тысячу "
			"звезд на GitHub, результат понравился и пригодился не только нам. Впрочем, это "
			"уже совсем другая история. Давайте вернемся к рассказу об изменениях в версии "
			"0.7.0 и к тому, почему этих изменений пришлось ждать так долго... "
			" "
			"Что нового в 0.7.0 "
			"Переход на C++17 "
			"В версии 0.7.0 мы перешли с C++14 на C++17. Вероятно, это не самое лучшее из "
			"наших решений, ведь кто-то все еще вынужден оставаться на C++14 не имея "
			"возможности обновиться до C++17, однако мы для себя больше не видели смысла "
			"держаться за C++14. "
			" "
			"Выгода от перехода на C++17 заключалась прежде всего в том, что удалось "
			"избавиться от таких зависимостей, как optional-lite, string_view-lite и "
			"variant-lite, т.к. теперь это все доступно в стандартной библиотеке. Так что "
			"остается сказать большое спасибо Martin Moene за его труд по написанию и "
			"сопровождению этих библиотек, они нам здорово помогали в течении шести лет, но "
			"дальше мы пойдем с stdlib 🙂 "
			" "
			"Хотя осталась зависимость от expected-lite, но с ней придется жить еще долго. "
			"Если уж мы на 17-ые плюсы перебрались только в 2023-ем, то перехода на C++23 "
			"нужно будет подождать еще лет пять-шесть, а то и девять-десять 😆 "
			" "
			"Выгода от 17-го стандарта проявилась еще и в том, что в ряде мест мы смогли "
			"выбросить сложные (и не очень) шаблонные конструкции в пользу простых if "
			"constexpr и fold expressions. "
			" "
			"Так что дальше пойдем уже в рамках C++17. Если кого-то это расстраивает, то уж "
			"простите за откровенность, но за поддержку C++14 нам никто не платит. "
			" "
			"Переход на llhttp, Catch2 v3 и modern CMake "
			"Изначально RESTinio использовал nodejs/http-parser в качестве парсера "
			"HTTP-запросов. Но несколько лет назад его развитие и поддержка прекратились. "
			"Посему в версии 0.7.0 мы переехали на nodejs/llhttp. Собственно, этот переезд и "
			"был главной мотивацией для выпуска версии 0.7.0. "
			" "
			"Заодно мы обновили у себя Catch2. Эта библиотека начиная с версии 3.0 уже не "
			"является header-only и требует компиляции. "
		};

		// Each validator is timed separately; the order of the runs is part
		// of the printed output.
		const auto restinio_run = measure( " restinio", [&]() {
				return checking_loop(
						check_validity_with_restinio, passes, sample );
			} );

		const auto decode_2009_run = measure( "decode_2009", [&]() {
				return checking_loop(
						check_validity_with_decode_2009, passes, sample );
			} );

		const auto decode_2010_run = measure( "decode_2010", [&]() {
				return checking_loop(
						check_validity_with_decode_2010, passes, sample );
			} );

		// Prints "<verdict> <checksum>" for one run.
		const auto report = []( const auto & run ) {
			std::cout << run.first << " " << run.second << std::endl;
		};

		report( restinio_run );
		report( decode_2009_run );
		report( decode_2010_run );
	}