Last active
October 19, 2015 00:43
-
-
Save FrankHB/09a27dd51eb934984820 to your computer and use it in GitHub Desktop.
UTF-8 decoding/encoding performance test
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// LICENSE: MIT | |
// Based on: https://github.com/9chu/minicodecvt/blob/master/TestCpp/main.cpp | |
// Depends on: | |
// rapidjson: https://github.com/miloyip/rapidjson . | |
// minicodecvt: https://github.com/9chu/minicodecvt | |
// YSLib: https://github.com/FrankHB/YSLib | |
// MCF: https://github.com/lhmouse/MCF | |
// Prerequisite: | |
// *-w64-mingw32-g++ supports '-std=c++14'. | |
// Run Tool/install-sysroot.sh in YSLib repo dir to install Sysroot | |
// and SHBuild. | |
// See also https://bitbucket.org/FrankHB/yslib/wiki/Prerequisitions.zh-CN.md. | |
#if 0 | |
/* | |
#!/usr/bin/env sh | |
export rapidjson_SRC=/e/Source/rapidjson/include | |
export minicodecvt_SRC=/e/Source/9chu/minicodecvt/minicodecvt | |
export MCF_SRC=/e/Source/lhmouse/MCF/MCF | |
export SYSROOT=/f/Programing/YSLib/sysroot/usr | |
export CXX=g++ | |
export AR=gcc-ar | |
#export C_CXXFLAGS_ARCH='-mfpmath=both -march=nocona' | |
export C_CXXFLAGS_ARCH='-mfpmath=sse -march=native' | |
export CXXFLAGS_STD='-std=c++14' | |
export CXXFLAGS="$CXXFLAGS_STD $C_CXXFLAGS_ARCH -pipe -fdata-sections -ffunction-sections \ | |
-pedantic-errors -Wall -Wcast-align -Wdeprecated -Wdeprecated-declarations \ | |
-Wextra -Wfloat-equal -Wformat=2 -Winvalid-pch -Wmissing-declarations \ | |
-Wmissing-include-dirs -Wmultichar -Wno-format-nonliteral -Wredundant-decls -Wshadow -Wsign-conversion \ | |
-Wdouble-promotion -Wlogical-op -Wtrampolines \ | |
-Wctor-dtor-privacy -Wnon-virtual-dtor -Woverloaded-virtual -Wsign-promo -Wdouble-promotion -Wlogical-op \ | |
-Wconditionally-supported -Wstrict-null-sentinel -Wzero-as-null-pointer-constant -mthreads \ | |
-O3 -DNDEBUG -s -fexpensive-optimizations -flto=jobserver -fomit-frame-pointer \ | |
-Wall -Wextra -pedantic -Wsign-conversion \ | |
-Wsuggest-attribute=noreturn \ | |
-Wno-type-limits -Wno-sign-conversion -Wno-zero-as-null-pointer-constant -Wnoexcept" | |
export LDFLAGS="-mthreads -s -fexpensive-optimizations -flto -Wl,--gc-sections" | |
# build minicodecvt | |
SHBuild $minicodecvt_SRC $CXXFLAGS -xj,3 | |
# build MCF | |
SHBuild $MCF_SRC $CXXFLAGS -xj,6 | |
$CXX a.cc $CXXFLAGS -Wno-shadow \ | |
-I$SYSROOT/include \ | |
-I$rapidjson_SRC \ | |
-I$minicodecvt_SRC .shbuild/minicodecvt.a \ | |
-I$MCF_SRC .shbuild/MCF.a \ | |
-L$SYSROOT/lib -DYF_DLL -DYB_DLL -lYFramework -lYBase | |
./a | |
$CXX a.cc -oa.static.exe $CXXFLAGS -Wno-shadow \ | |
-I$SYSROOT/include \ | |
-I$rapidjson_SRC \ | |
-I$minicodecvt_SRC .shbuild/minicodecvt.a \ | |
-I$MCF_SRC .shbuild/MCF.a \ | |
-L$SYSROOT/lib -Wl,-dn -lYFramework -lYBase | |
*/ | |
#endif | |
#include <ysbuild.h> | |
#include YFM_CHRLib_MappingEx | |
#include YFM_Win32_YCLib_Consoles | |
#include <ytest/timing.hpp> | |
//#define Test_SMP 0 | |
//#define Test_UseCheck 0 | |
//#define Test_UseUnitStr 0 | |
//#define Test_UseBig5 0 | |
#ifndef Test_LongString | |
# define Test_LongString 1 | |
#endif | |
#ifndef Test_Performance | |
# define Test_Performance 1 | |
#endif | |
#ifndef Test_OutputContent | |
# define Test_OutputContent 1 | |
#endif | |
#ifndef USE_rapidjson | |
# define USE_rapidjson 1 | |
#endif | |
#ifndef USE_minicodecvt | |
# define USE_minicodecvt 1 | |
#endif | |
#ifndef USE_MCF | |
# define USE_MCF 1 | |
#endif | |
#include <iostream> | |
#if USE_rapidjson | |
# include <rapidjson/rapidjson.h> | |
# include <rapidjson/memorystream.h> | |
//# include <rapidjson/encodings.h> | |
#endif | |
#if USE_minicodecvt | |
#undef ynothrow | |
# include "coverter.h" | |
# include "gbk.h" | |
# include "shift_jis.h" | |
# include "big5.h" | |
# include "unicode.h" | |
#endif | |
#if USE_MCF | |
# include <Core/String.hpp> | |
#endif | |
namespace fast_utf8_decoder | |
{ | |
using ystdex::size_t; | |
using ystdex::byte; | |
using YSLib::u16string; | |
using YSLib::u32string; | |
using CHRLib::ConversionResult; | |
using state_t = std::uint_fast8_t; | |
// Copyright (c) 2008-2010 Bjoern Hoehrmann <bjoern@hoehrmann.de> | |
// See http://bjoern.hoehrmann.de/utf-8/decoder/dfa/ for details. | |
enum UTF8_STATE : state_t | |
{ | |
UTF8_ACCEPT = 0, | |
UTF8_REJECT = 12 | |
}; | |
// Maps each byte value to a character class. This reduces the size of the
// transition table, and the class number is also used by decode() as the
// shift count that builds the lead-byte payload mask.
// One row below covers 16 consecutive byte values.
yconstexpr state_t utf8d1[]{
	// 0x00-0x7F
	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
	// 0x80-0x8F, 0x90-0x9F
	1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
	9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,
	// 0xA0-0xBF
	7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,
	7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,
	// 0xC0-0xDF
	8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
	2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
	// 0xE0-0xEF
	10,3,3,3,3,3,3,3,3,3,3,3,3,4,3,3,
	// 0xF0-0xFF
	11,6,6,6,5,8,8,8,8,8,8,8,8,8,8,8
};
// Transition table: indexed by (current state + character class), yields the
// next state. Each row holds the 12 class entries of one state.
yconstexpr state_t utf8d2[]{
	// state 0 (UTF8_ACCEPT)
	0,12,24,36,60,96,84,12,12,12,48,72,
	// state 12 (UTF8_REJECT)
	12,12,12,12,12,12,12,12,12,12,12,12,
	// state 24
	12,0,12,12,12,12,12,0,12,0,12,12,
	// state 36
	12,24,12,12,12,12,12,24,12,24,12,12,
	// state 48
	12,12,12,12,12,12,12,24,12,12,12,12,
	// state 60
	12,24,12,12,12,12,12,12,12,24,12,12,
	// state 72
	12,12,12,12,12,12,12,36,12,36,12,12,
	// state 84
	12,36,12,12,12,12,12,36,12,36,12,12,
	// state 96
	12,36,12,12,12,12,12,12,12,12,12,12
};
namespace | |
{ | |
state_t | |
decode(state_t res, char32_t& code, byte b) | |
{ | |
yconstraint(res <= 96); | |
const auto type(utf8d1[size_t(b)]); | |
yassume(type < UTF8_REJECT); | |
code = (res != UTF8_ACCEPT) ? (b & 0x3FU) | (code << 6) | |
: (0xFFU >> type) & b; | |
return utf8d2[res + type]; | |
} | |
} | |
template<typename _tIn, typename _tOut> | |
ConversionResult | |
ToUTF16(_tIn first, _tIn last, _tOut result, size_t n) | |
{ | |
auto s = first; | |
auto d = result; | |
char32_t code; | |
state_t res(0); | |
while(s < last) | |
{ | |
size_t dst_words_free(n - (d - result)); | |
auto src_current_end(s + dst_words_free - 1); | |
if(last < src_current_end) | |
src_current_end = last; | |
if(src_current_end <= s) | |
return ConversionResult::Unhandled; | |
while(s < src_current_end) | |
{ | |
const auto c(*s); | |
++s; | |
if(YB_UNLIKELY(res > 96)) | |
return ConversionResult::BadState; | |
res = decode(res, code, byte(c)); | |
if(res != UTF8_ACCEPT && res != UTF8_REJECT) | |
continue; | |
if(code > 0xFFFFU) | |
{ | |
*d = char16_t(0xD7C0U + (code >> 10U)); | |
++d; | |
*d = char16_t(0xDC00U + (code & 0x3FFU)); | |
++d; | |
} | |
else | |
{ | |
*d = char16_t(code); | |
++d; | |
} | |
} | |
} | |
if(res != UTF8_ACCEPT) | |
return ConversionResult::Invalid; | |
if((n - (d - result)) == 0) | |
return ConversionResult::Unhandled; | |
*d++ = 0; | |
return ConversionResult::OK; | |
} | |
template<typename _tIn, typename _tOut> | |
ConversionResult | |
ToUTF32(_tIn first, _tIn last, _tOut result, size_t n) | |
{ | |
auto s = first; | |
auto d = result; | |
char32_t code; | |
state_t res(0); | |
while(s < last) | |
{ | |
size_t dst_words_free(n - (d - result)); | |
auto src_current_end(s + dst_words_free - 1); | |
if(last < src_current_end) | |
src_current_end = last; | |
if(src_current_end <= s) | |
return ConversionResult::Unhandled; | |
while(s < src_current_end) | |
{ | |
const auto c(*s); | |
++s; | |
if(YB_UNLIKELY(res > 96)) | |
return ConversionResult::BadState; | |
res = decode(res, code, byte(c)); | |
if(res != UTF8_ACCEPT && res != UTF8_REJECT) | |
continue; | |
*d = code; | |
++d; | |
} | |
} | |
if(res != UTF8_ACCEPT) | |
return ConversionResult::Invalid; | |
if(n == d - result) | |
return ConversionResult::Unhandled; | |
*d++ = 0; | |
return ConversionResult::OK; | |
} | |
template<typename _tString> | |
u16string | |
ToUTF16(const _tString& str) | |
{ | |
const auto len(str.length() * 2); | |
u16string res(len, char16_t()); | |
const auto | |
cres(ToUTF16(str.cbegin(), str.cend(), res.begin(), len)); | |
#if Test_UseCheck | |
if(cres != ConversionResult::OK) | |
std::cerr << int(cres) << std::endl; | |
#else | |
yunused(cres); | |
#endif | |
return res; | |
} | |
template<typename _tString> | |
u32string | |
ToUTF32(const _tString& str) | |
{ | |
const auto len(str.length() * 2); | |
u32string res(len, char32_t()); | |
const auto | |
cres(ToUTF32(str.cbegin(), str.cend(), res.begin(), len)); | |
#if Test_UseCheck | |
if(cres != ConversionResult::OK) | |
std::cerr << int(cres) << std::endl; | |
#else | |
yunused(cres); | |
#endif | |
return res; | |
} | |
} | |
#if USE_rapidjson | |
namespace rapid_utf8_decoder | |
{ | |
using namespace rapidjson; | |
using ystdex::size_t; | |
using ystdex::byte; | |
using YSLib::u16string; | |
using YSLib::u32string; | |
using CHRLib::ConversionResult; | |
using state_t = std::uint_fast8_t; | |
template<typename _tOut> | |
ConversionResult | |
ToUTF16(MemoryStream& stream, _tOut result, size_t n) | |
{ | |
auto& last = stream.end_; | |
auto& s = stream.src_; | |
auto d = result; | |
unsigned code; | |
// state_t res(0); | |
while(s < last) | |
{ | |
size_t dst_words_free(n - (d - result)); | |
auto src_current_end(s + dst_words_free - 1); | |
if(last < src_current_end) | |
src_current_end = last; | |
if(src_current_end <= s) | |
return ConversionResult::Unhandled; | |
while(s < src_current_end) | |
{ | |
#if Test_UseCheck && 0 | |
if(YB_UNLIKELY(res >= 0x50)) | |
return ConversionResult::BadState; | |
#endif | |
if(!UTF8<>::Decode(stream, &code)) | |
continue; | |
if(code > 0xFFFFU) | |
{ | |
*d = char16_t(0xD7C0U + (code >> 10U)); | |
++d; | |
*d = char16_t(0xDC00U + (code & 0x3FFU)); | |
++d; | |
} | |
else | |
{ | |
*d = char16_t(code); | |
++d; | |
} | |
} | |
} | |
#if Test_UseCheck && 0 | |
if(res != UTF8_ACCEPT) | |
return ConversionResult::Invalid; | |
if((n - (d - result)) == 0) | |
return ConversionResult::Unhandled; | |
#endif | |
*d++ = 0; | |
return ConversionResult::OK; | |
} | |
template<typename _tOut> | |
ConversionResult | |
ToUTF32(MemoryStream& stream, _tOut result, size_t n) | |
{ | |
auto& first = stream.begin_; | |
auto& last = stream.end_; | |
auto& s = stream.src_; | |
auto d = result; | |
unsigned code; | |
// state_t res(0); | |
while(s < last) | |
{ | |
size_t dst_words_free(n - (d - result)); | |
auto src_current_end(s + dst_words_free - 1); | |
if(last < src_current_end) | |
src_current_end = last; | |
if(src_current_end <= s) | |
return ConversionResult::Unhandled; | |
while(s < src_current_end) | |
{ | |
#if Test_UseCheck && 0 | |
if(YB_UNLIKELY(res >= 0x50)) | |
return ConversionResult::BadState; | |
#endif | |
if(!UTF8<>::Decode(stream, &code)) | |
continue; | |
*d = code; | |
++d; | |
} | |
} | |
#if Test_UseCheck && 0 | |
if(res != UTF8_ACCEPT) | |
return ConversionResult::Invalid; | |
if((n - (d - result)) == 0) | |
return ConversionResult::Unhandled; | |
#endif | |
*d++ = 0; | |
return ConversionResult::OK; | |
} | |
template<typename _tString> | |
u16string | |
ToUTF16(const _tString& str) | |
{ | |
const auto len(str.length() * 2); | |
u16string res(len, char16_t()); | |
MemoryStream ms(str.data(), str.length()); | |
const auto cres(ToUTF16(ms, res.begin(), len)); | |
#if Test_UseCheck | |
if(cres != ConversionResult::OK) | |
std::cerr << int(cres) << std::endl; | |
#else | |
yunused(cres); | |
#endif | |
return res; | |
} | |
template<typename _tString> | |
u32string | |
ToUTF32(const _tString& str) | |
{ | |
const auto len(str.length() * 2); | |
u32string res(len, char32_t()); | |
MemoryStream ms(str.data(), str.length()); | |
const auto cres(ToUTF32(ms, res.begin(), len)); | |
#if Test_UseCheck | |
if(cres != ConversionResult::OK) | |
std::cerr << int(cres) << std::endl; | |
#else | |
yunused(cres); | |
#endif | |
return res; | |
} | |
} | |
#endif | |
#if USE_minicodecvt | |
namespace minicodecvt | |
{ | |
template<typename decoder_t, typename encoder_t, typename char_t = char> | |
std::basic_string<char_t> ConvertR(const char* nullTerminatedString) | |
{ | |
decoder_t decoder; | |
encoder_t encoder; | |
std::basic_string<char_t> ret; | |
ret.reserve(strlen(nullTerminatedString) * 4); | |
char32_t ucs4; | |
bool de_finished = true; | |
uint8_t* buf; | |
size_t size = 0; | |
bool en_finished = true; | |
while (*nullTerminatedString != 0) | |
{ | |
if (decoder(*nullTerminatedString, ucs4)) | |
{ | |
de_finished = true; | |
if (encoder(ucs4, buf, size)) | |
{ | |
en_finished = true; | |
if (buf) | |
{ | |
if (size % sizeof(char_t) != 0) | |
throw Exception::EncodeError("output character type is not match with encoder."); | |
ret.append(reinterpret_cast<char_t*>(buf), size / sizeof(char_t)); | |
} | |
} | |
else | |
en_finished = {}; | |
} | |
else | |
de_finished = {}; | |
++nullTerminatedString; | |
} | |
#if Test_UseCheck | |
if (!de_finished) | |
throw Exception::DecodeError("not all bytes is decoded."); | |
if (!en_finished) | |
throw Exception::EncodeError("not all bytes is encoded."); | |
#else | |
yunused(de_finished); | |
yunused(en_finished); | |
#endif | |
return std::move(ret); | |
} | |
template<typename decoder_t, typename encoder_t, typename char_t = char, typename iterator_t> | |
std::basic_string<char_t> ConvertR(iterator_t begin, iterator_t end) | |
{ | |
decoder_t decoder; | |
encoder_t encoder; | |
std::basic_string<char_t> ret; | |
ret.reserve(std::distance(begin, end) * 4); | |
char32_t ucs4; | |
bool de_finished = true; | |
uint8_t* buf; | |
size_t size = 0; | |
bool en_finished = true; | |
while (begin != end) | |
{ | |
if (decoder(*begin, ucs4)) | |
{ | |
de_finished = true; | |
if (encoder(ucs4, buf, size)) | |
{ | |
en_finished = true; | |
if (buf) | |
{ | |
if (size % sizeof(char_t) != 0) | |
throw Exception::EncodeError("output character type is not match with encoder."); | |
ret.append(reinterpret_cast<char_t*>(buf), size / sizeof(char_t)); | |
} | |
} | |
else | |
en_finished = {}; | |
} | |
else | |
de_finished = {}; | |
++begin; | |
} | |
#if Test_UseCheck | |
if (!de_finished) | |
throw Exception::DecodeError("not all bytes is decoded."); | |
if (!en_finished) | |
throw Exception::EncodeError("not all bytes is encoded."); | |
#else | |
yunused(de_finished); | |
yunused(en_finished); | |
#endif | |
return std::move(ret); | |
} | |
} | |
#endif | |
namespace CHRLib | |
{ | |
template<class _tDst = std::basic_string<char16_t>> | |
YB_FLATTEN YB_NONNULL(2) _tDst& | |
EmplaceUCS2LE(_tDst& str, const char* s, Encoding enc = CS_Default) | |
{ | |
str.resize(ystdex::ntctslen(s)); | |
str.resize(MBCSToUCS2(&str[0], s, enc)); | |
return str; | |
} | |
template<class _tDst = std::basic_string<char16_t>> | |
YB_FLATTEN _tDst& | |
EmplaceUCS2LE(_tDst& str, string_view sv, Encoding enc = CS_Default) | |
{ | |
const auto s(sv.data()); | |
yconstraint(s); | |
const auto l(sv.length()); | |
#if 1 | |
str.resize(l); | |
str.resize(MBCSToUCS2(&str[0], s, s + l, enc)); | |
#else | |
static yconstexpr size_t blk_size(1 << 8U); | |
char16_t xbuf[blk_size]; | |
const size_t blk(l / blk_size); | |
str.reserve(l); | |
str.clear(); | |
for(size_t i(0); i < blk; ++i) | |
str.append(xbuf, MBCSToUCS2(xbuf, s + i * blk_size, | |
s + std::min<size_t>((i + 1) * blk_size, l), enc)); | |
#endif | |
return str; | |
} | |
template<class _tDst = std::basic_string<char16_t>> | |
inline YB_FLATTEN _tDst& | |
EmplaceUCS2LE(_tDst& str, u16string_view sv, Encoding = CharSet::ISO_10646_UCS_2) | |
{ | |
const auto s(sv.data()); | |
yconstraint(s); | |
// FIXME: Correct conversion for encoding other than UCS-2LE. | |
return str = {s, sv.length()}; | |
} | |
template<class _tDst = std::basic_string<char16_t>> | |
YB_FLATTEN YB_NONNULL(2) _tDst& | |
EmplaceUCS2LE(_tDst& str, const char32_t* s, Encoding = CharSet::ISO_10646_UCS_4) | |
{ | |
str.resize(ystdex::ntctslen(s)); | |
str.resize(UCS4ToUCS2(&str[0], s)); | |
return str; | |
} | |
template<class _tDst = std::basic_string<char16_t>> | |
YB_FLATTEN _tDst& | |
EmplaceUCS2LE(_tDst& str, u32string_view sv, Encoding = CharSet::ISO_10646_UCS_4) | |
{ | |
const auto s(sv.data()); | |
yconstraint(s); | |
const auto l(sv.length()); | |
str.resize(l); | |
str.resize(UCS4ToUCS2(&str[0], s, s + l)); | |
return str; | |
} | |
template<class _tDst = std::basic_string<char32_t>> | |
YB_FLATTEN YB_NONNULL(2) _tDst& | |
EmplaceUCS4LE(_tDst& str, const char* s, Encoding enc = CS_Default) | |
{ | |
str.resize(ystdex::ntctslen(s)); | |
str.resize(MBCSToUCS4(&str[0], s, enc)); | |
return str; | |
} | |
template<class _tDst = std::basic_string<char32_t>> | |
YB_FLATTEN _tDst& | |
EmplaceUCS4LE(_tDst& str, string_view sv, Encoding enc = CS_Default) | |
{ | |
const auto s(sv.data()); | |
yconstraint(s); | |
const auto l(sv.length()); | |
str.resize(l); | |
str.resize(MBCSToUCS4(&str[0], s, s + l, enc)); | |
return str; | |
} | |
template<class _tDst = std::basic_string<char32_t>> | |
YB_FLATTEN YB_NONNULL(2) _tDst& | |
EmplaceUCS4LE(_tDst& str, const char16_t* s, Encoding = CharSet::ISO_10646_UCS_2) | |
{ | |
str.resize(ystdex::ntctslen(s)); | |
str.resize(UCS2ToUCS4(&str[0], s)); | |
return str; | |
} | |
template<class _tDst = std::basic_string<char32_t>> | |
YB_FLATTEN _tDst& | |
EmplaceUCS4LE(_tDst& str, u16string_view sv, Encoding = CharSet::ISO_10646_UCS_2) | |
{ | |
const auto s(sv.data()); | |
yconstraint(s); | |
const auto l(sv.length()); | |
str.resize(l); | |
str.resize(UCS2ToUCS4(&str[0], s, s + l)); | |
return str; | |
} | |
template<class _tDst = std::basic_string<char32_t>> | |
YB_FLATTEN inline _tDst& | |
EmplaceUCS4LE(_tDst& str, u32string_view sv, Encoding = CharSet::ISO_10646_UCS_4) | |
{ | |
const auto s(sv.data()); | |
yconstraint(s); | |
// FIXME: Correct conversion for encoding other than UCS-4LE. | |
return str = {s, sv.length()}; | |
} | |
template<class _tDst = std::string> | |
YB_FLATTEN inline YB_NONNULL(2) _tDst& | |
EmplaceMBCS(_tDst& str, const char* s, Encoding enc) | |
{ | |
return enc = CS_Default ? EmplaceMBCS<_tDst>(str, s) | |
: EmplaceMBCS<_tDst>(EmplaceUCS2LE(str, s, CS_Default), enc); | |
} | |
template<class _tDst = std::string> | |
YB_FLATTEN inline _tDst& | |
EmplaceMBCS(_tDst& str, string_view sv, Encoding enc) | |
{ | |
return enc = CS_Default ? EmplaceMBCS<_tDst>(str, sv) | |
: EmplaceMBCS<_tDst>(EmplaceUCS2LE(str, sv, CS_Default), enc); | |
} | |
template<class _tDst = std::string> | |
YB_FLATTEN YB_NONNULL(2) _tDst& | |
EmplaceMBCS(_tDst& str, const char16_t* s, Encoding enc = CS_Default) | |
{ | |
const auto w(FetchMaxCharWidth(enc)); | |
str.resize(ystdex::ntctslen(s) * (w == 0 ? sizeof(ucsint_t) : w)); | |
str.resize(UCS2ToMBCS(&str[0], s, enc)); | |
return str; | |
} | |
template<class _tDst = std::string> | |
YB_FLATTEN _tDst& | |
EmplaceMBCS(_tDst& str, u16string_view sv, Encoding enc = CS_Default) | |
{ | |
const auto s(sv.data()); | |
yconstraint(s); | |
const auto l(sv.length()); | |
const auto w(FetchMaxCharWidth(enc)); | |
str.resize(l * (w == 0 ? sizeof(ucsint_t) : w)); | |
str.resize(UCS2ToMBCS(&str[0], s, s + l, enc)); | |
return str; | |
} | |
template<class _tDst = std::string> | |
YB_FLATTEN YB_NONNULL(2) _tDst& | |
EmplaceMBCS(_tDst& str, const char32_t* s, Encoding enc = CS_Default) | |
{ | |
str.resize(ystdex::ntctslen(s) * FetchMaxCharWidth(enc)); | |
str.resize(UCS4ToMBCS(&str[0], s, enc)); | |
return str; | |
} | |
template<class _tDst = std::string> | |
YB_FLATTEN _tDst& | |
EmplaceMBCS(_tDst& str, u32string_view sv, Encoding enc = CS_Default) | |
{ | |
const auto s(sv.data()); | |
yconstraint(s); | |
const auto l(sv.length()); | |
str.resize(l * FetchMaxCharWidth(enc)); | |
str.resize(UCS4ToMBCS(&str[0], s, s + l, enc)); | |
return str; | |
} | |
} // namespace CHRLib; | |
namespace | |
{ | |
#if Test_OutputContent | |
using namespace platform; | |
using namespace YSLib; | |
using namespace platform_ex; | |
using namespace CHRLib; | |
void | |
print(const string& n, u16string_view s){ | |
using namespace std; | |
cout << n << endl << s.length() << ':' << endl; | |
# if YCL_Win32 | |
auto& wcon(FetchStaticRef<WConsole>()); | |
wcon.WriteString(s); | |
# else | |
cout << EncodeArg(String(s).GetMBCS()); | |
# endif | |
cout << endl; | |
} | |
void | |
print(const string& n, string_view s){ | |
using namespace std; | |
auto& wcon(FetchStaticRef<WConsole>()); | |
cout << n << endl << s.length() << ':' << endl; | |
# if YCL_Win32 | |
wcon.WriteString(s); | |
# else | |
cout << EncodeArg(s.data()); | |
# endif | |
cout << endl; | |
} | |
#endif | |
// Benchmarks several UTF-8 <-> UTF-16 conversion paths on utf8str.
// Each benchmark is a generic lambda taking a [first, last) byte range and
// performing UTF-8 -> UTF-16 -> UTF-8 -> UTF-16 with one library. Every
// benchmark is timed over 100000 runs on the input and, when Test_LongString
// is enabled, over one run on a long string built from the input by
// ystdex::concat. Timings are printed to std::cout (count divided by 1e9).
// Does nothing when Test_Performance is disabled.
template<typename _type>
void
TestPerformance(const _type& utf8str)
{
#if Test_Performance
	// YFramework MBCSToWCS / WCSToMBCS.
	auto t_yw32 = [](auto first, auto last){
		// UTF-8 -> UTF-16
		auto wide = MBCSToWCS(string_view(first, last - first));
		// UTF-16 -> UTF-8
		auto narrow = WCSToMBCS(wide);
		// UTF-8 -> UTF-16
		auto wide_again = MBCSToWCS(narrow);
#if Test_UseBig5
		auto utf16str = MBCSToWCS(Test_Big5Str, 936);
#endif
	};
	// CHRLib Make* functions, going through UCS-4 at every step.
	auto t_ycu4 = [](auto first, auto last){
		// UTF-8 -> UTF-16, step 1: UTF-8 -> UCS-4LE
		auto wide_mid = MakeUCS4LE(string_view(first, last - first));
		// UTF-8 -> UTF-16, step 2: UCS-4LE -> UCS-2
		auto wide = MakeUCS2LE(wide_mid);
		// UTF-16 -> UTF-8, step 1: UCS-2 -> UCS-4LE
		auto narrow_mid = MakeUCS4LE(wide);
		// UTF-16 -> UTF-8, step 2: UCS-4LE -> UTF-8
		auto narrow = MakeMBCS(narrow_mid);
		// UTF-8 -> UTF-16, step 1: UTF-8 -> UCS-4LE
		auto wide_again_mid = MakeUCS4LE(narrow);
		// UTF-8 -> UTF-16, step 2: UCS-4LE -> UCS-2
		auto wide_again = MakeUCS2LE(wide_again_mid);
#if Test_UseBig5
		// auto utf16str = MakeMBCS(Test_Big5Str, CharSet::GBK);
#endif
	};
	// As t_ycu4, but reusing the intermediate strings through Emplace*.
	auto t_ycu4_r = [](auto first, auto last){
		// UTF-8 -> UTF-16, step 1: UTF-8 -> UCS-4LE
		auto mid32 = MakeUCS4LE(string_view(first, last - first));
		// UTF-8 -> UTF-16, step 2: UCS-4LE -> UCS-2
		auto wide = MakeUCS2LE(mid32);
		// UTF-16 -> UTF-8, step 1: UCS-2 -> UCS-4LE
		EmplaceUCS4LE(mid32, wide);
		// UTF-16 -> UTF-8, step 2: UCS-4LE -> UTF-8
		auto narrow = MakeMBCS(mid32);
		// UTF-8 -> UTF-16, step 1: UTF-8 -> UCS-4LE
		EmplaceUCS4LE(mid32, narrow);
		// UTF-8 -> UTF-16, step 2: UCS-4LE -> UCS-2
		EmplaceUCS2LE(wide, mid32);
#if Test_UseBig5
		// auto utf16str = MakeMBCS(Test_Big5Str, CharSet::GBK);
#endif
	};
	// CHRLib Make* functions, direct.
	auto t_ychr = [](auto first, auto last){
		// UTF-8 -> UTF-16
		auto wide = MakeUCS2LE(string_view(first, last - first));
		// UTF-16 -> UTF-8
		auto narrow = MakeMBCS(wide);
		// UTF-8 -> UTF-16
		auto wide_again = MakeUCS2LE(narrow);
#if Test_UseBig5
		// auto utf16str = MakeMBCS(Test_Big5Str, CharSet::GBK);
#endif
	};
	// As t_ychr, but the last step reuses the first UTF-16 string.
	auto t_ychr_r = [](auto first, auto last){
		// UTF-8 -> UTF-16
		auto wide = MakeUCS2LE(string_view(first, last - first));
		// UTF-16 -> UTF-8
		auto narrow = MakeMBCS(wide);
		// UTF-8 -> UTF-16
		EmplaceUCS2LE(wide, narrow);
#if Test_UseBig5
		// auto utf16str = MakeMBCS(Test_Big5Str, CharSet::GBK);
#endif
	};
	// YSLib::String.
	auto t_ysl = [](auto first, auto last){
		// UTF-8 -> UTF-16
		auto wide = String(string_view(first, last - first));
		// UTF-16 -> UTF-8
		auto narrow = wide.GetMBCS();
		// UTF-8 -> UTF-16
		auto wide_again = String(narrow);
#if Test_UseBig5
		auto utf16str = String(Test_Big5Str, CharSet::CP936);
#endif
	};
	// As t_ysl, but the last step reuses the first String.
	auto t_ysl_r = [](auto first, auto last){
		// UTF-8 -> UTF-16
		auto wide = String(string_view(first, last - first));
		// UTF-16 -> UTF-8
		auto narrow = wide.GetMBCS();
		// UTF-8 -> UTF-16
		EmplaceUCS2LE(wide, String(narrow));
#if Test_UseBig5
		auto utf16str = String(Test_Big5Str, CharSet::CP936);
#endif
	};
	// Table-driven decoder from fast_utf8_decoder; encoding by CHRLib.
	auto t_fast = [](auto first, auto last){
		using namespace fast_utf8_decoder;

		// UTF-8 -> UTF-16
		auto wide = ToUTF16(string_view(first, last - first));
		// UTF-16 -> UTF-8
		auto narrow = MakeMBCS(wide);
		// UTF-8 -> UTF-16
		auto wide_again = ToUTF16(narrow);
#if Test_UseBig5
		// auto utf16str = MakeMBCS(Test_Big5Str, CharSet::GBK);
#endif
	};
#if USE_rapidjson
	// rapidjson-based decoder; encoding by CHRLib.
	auto t_rj = [](auto first, auto last){
		using namespace rapid_utf8_decoder;

		// UTF-8 -> UTF-16
		auto wide = ToUTF16(string_view(first, last - first));
		// UTF-16 -> UTF-8
		auto narrow = MakeMBCS(wide);
		// UTF-8 -> UTF-16
		auto wide_again = ToUTF16(narrow);
#if Test_UseBig5
		// auto utf16str = MakeMBCS(Test_Big5Str, CharSet::GBK);
#endif
	};
#endif
#if USE_minicodecvt
	// minicodecvt Convert.
	auto t_chu = [](auto first, auto last){
		using namespace minicodecvt;

		// UTF-8 -> UTF-16
		auto wide = Convert<Decoder::UTF8, Encoder::UTF16>(first, last);
		// UTF-16 -> UTF-8
		auto narrow = Convert<Decoder::UTF16, Encoder::UTF8>(wide.begin(), wide.end());
		// UTF-8 -> UTF-16
		auto wide_again = Convert<Decoder::UTF8, Encoder::UTF16, wchar_t>(narrow.begin(), narrow.end());
# if Test_UseBig5
		// GBK -> UCS-4
		auto utf32str = Decode<Decoder::GBK>(Test_Big5Str);
		// UCS-4 -> UTF-16LE
		auto utf16le = Encode<Encoder::UTF16LE, wchar_t>(utf32str.begin(), utf32str.end());
# endif
	};
	// minicodecvt with the output-reserving ConvertR defined in this file.
	auto t_chu2 = [](auto first, auto last){
		using namespace minicodecvt;

		// UTF-8 -> UTF-16
		auto wide = ConvertR<Decoder::UTF8, Encoder::UTF16>(first, last);
		// UTF-16 -> UTF-8
		auto narrow = ConvertR<Decoder::UTF16, Encoder::UTF8>(wide.begin(), wide.end());
		// UTF-8 -> UTF-16
		auto wide_again = ConvertR<Decoder::UTF8, Encoder::UTF16, wchar_t>(narrow.begin(), narrow.end());
# if Test_UseBig5
		// GBK -> UCS-4
		auto utf32str = Decode<Decoder::GBK>(Test_Big5Str);
		// UCS-4 -> UTF-16LE
		auto utf16le = Encode<Encoder::UTF16LE, wchar_t>(utf32str.begin(), utf32str.end());
# endif
	};
#endif
#if USE_MCF
	// MCF strings through a UTF-32 intermediate, reusing the same objects.
	auto t_mcf_r = [](auto first, auto last){
		using namespace MCF;

		Utf32String unified;
		Utf16String wide;
		Utf8String narrow(first, last);

		// unified.Clear();
		// wide.Clear();
		// UTF-8 -> UTF-16
		Utf8String::UnifyAppend(unified, narrow);
		Utf16String::DeunifyAppend(wide, unified);
		unified.Clear();
		narrow.Clear();
		// UTF-16 -> UTF-8
		Utf16String::UnifyAppend(unified, wide);
		Utf8String::DeunifyAppend(narrow, unified);
		unified.Clear();
		wide.Clear();
		// UTF-8 -> UTF-16
		Utf8String::UnifyAppend(unified, narrow);
		Utf16String::DeunifyAppend(wide, unified);
	};
	// As t_mcf_r, but with WideString and Append for the UTF-8 -> UTF-16 step.
	auto t_mcf2_r = [](auto first, auto last){
		using namespace MCF;

		Utf32String unified;
		WideString wide;
		Utf8String narrow(first, last);

		// unified.Clear();
		// wide.Clear();
		// UTF-8 -> UTF-16
		// Utf8String::UnifyAppend(unified, narrow);
		wide.Append(narrow);
		WideString::DeunifyAppend(wide, unified);
		unified.Clear();
		narrow.Clear();
		// UTF-16 -> UTF-8
		WideString::UnifyAppend(unified, wide);
		Utf8String::DeunifyAppend(narrow, unified);
		unified.Clear();
		wide.Clear();
		// UTF-8 -> UTF-16
		// Utf8String::UnifyAppend(unified, narrow);
		wide.Append(narrow);
		WideString::DeunifyAppend(wide, unified);
	};
	// MCF strings with fresh objects for each step.
	auto t_mcf3 = [](auto first, auto last){
		using namespace MCF;

		Utf8String source(first, last);
		// UTF-8 -> UTF-16
		Utf16String wide;

		Utf8String::UnifyAppend(wide, source);

		// UTF-16 -> UTF-8
		Utf8String narrow;

		Utf8String::DeunifyAppend(narrow, wide);

		// UTF-8 -> UTF-16
		Utf16String wide_again;

		Utf8String::UnifyAppend(wide_again, narrow);
	};
#endif
#if Test_LongString
	// Long input for the single-run measurement.
	string seed(utf8str.begin(), utf8str.end());

	ystdex::concat(seed, size_t(100000));

	const auto longstr(std::move(seed));
#endif
	// Times one benchmark and prints the results under the given caption.
	auto ptest = [&](auto f, auto name){
		using namespace std;

		const auto unit_first(&utf8str[0]);

		cout << "[Test]" << name << ':' << endl
			<< ytest::timing::total(100000, Timers::HighResolutionClock::now,
			bind(f, unit_first, unit_first + utf8str.size())).count() / 1e9 << endl;
#if Test_LongString
		const auto long_first(&longstr[0]);

		cout << "[Long]" << name << ':' << endl
			<< ytest::timing::total(1, Timers::HighResolutionClock::now,
			bind(f, long_first, long_first + longstr.size())).count() / 1e9 << endl;
#endif
	};

	ptest(t_yw32, "YFramework MBCSToWCS + WCSToMBCS (Win32)");
	ptest(t_ycu4, "YFramework MakeUCS4LE + MakeUCS2LE + MakeMBCS");
	ptest(t_ycu4_r, "YFramework MakeUCS4LE + MakeUCS2LE + EmplaceUCS4LE + MakeMBCS + EmplaceUCS2LE");
	ptest(t_ychr, "YFramework MakeUCS2LE + MakeMBCS");
	ptest(t_ychr_r, "YFramework MakeUCS2LE + MakeMBCS + EmplaceUCS2LE");
	ptest(t_ysl, "YFramework String + String::GetMBCS");
	ptest(t_ysl_r, "YFramework String + String::GetMBCS + EmplaceUCS2LE");
	ptest(t_fast, "Fast ToUTF16 + YFramework MakeMBCS");
#if USE_rapidjson
	ptest(t_rj, "Rapid ToUTF16 + YFramework MakeMBCS");
#endif
#if USE_MCF
	ptest(t_mcf_r, "MCF String UnifyAppend + DeunifyAppend (string reused)");
	ptest(t_mcf2_r, "MCF String UnifyAppend + DeunifyAppend W (string reused)");
	ptest(t_mcf3, "MCF String UnifyAppend");
#endif
#if USE_minicodecvt
	ptest(t_chu, "minicodecvt Convert");
	ptest(t_chu2, "minicodecvt Convert (reserved length)");
#endif
#else
	yunused(utf8str);
#endif
}
// Converts utf8str with each library under test and prints every result with
// a caption, so the outputs can be compared by eye.
// Does nothing when Test_OutputContent is disabled.
template<typename _type>
void
TestOutputContent(const _type& utf8str)
{
#if Test_OutputContent
	using platform::ucast;
	using namespace minicodecvt;

	// UTF-8 -> UTF-16 (UTF-16 bytes held in a char string)
	auto wide_bytes = Convert<Decoder::UTF8, Encoder::UTF16>(begin(utf8str), end(utf8str));
	// UTF-16 -> UTF-8
	auto narrow = Convert<Decoder::UTF16, Encoder::UTF8>(wide_bytes.begin(), wide_bytes.end());
	// UTF-8 -> UTF-16 (as wchar_t)
	auto wide_again = Convert<Decoder::UTF8, Encoder::UTF16, wchar_t>(narrow.begin(), narrow.end());

	print("std::vector<uint8_t> utf8str to string",
		string(begin(utf8str), end(utf8str)));
	print("minicodevct vec UTF8->UTF16", u16string_view(
		reinterpret_cast<const char16_t*>(wide_bytes.c_str()),
		wide_bytes.length() / sizeof(char16_t)));
	print("minicodevct UTF16->UTF8", narrow);
	print("minicodevct UTF8->UTF16", String(ucast(wide_again.c_str())));
	print("fast_utf8_decoder::ToUTF16",
		fast_utf8_decoder::ToUTF16(string(begin(utf8str), end(utf8str))));
#if USE_rapidjson
	print("rapid_utf8_decoder::ToUTF16",
		rapid_utf8_decoder::ToUTF16(string(begin(utf8str), end(utf8str))));
#endif
#if USE_MCF
	print("MCF UnifyAppend + DeunifyAppend", [&]{
		using namespace MCF;

		Utf8String source(&utf8str[0], &utf8str[utf8str.size()]);
		Utf32String unified;
		Utf16String wide;

		Utf8String::UnifyAppend(unified, source);
		Utf16String::DeunifyAppend(wide, unified);
		return u16string(wide.GetBegin(), wide.GetEnd());
	}());
#endif
	print("YSLib::String", MakeUCS2LE(string(begin(utf8str), end(utf8str))));
#if Test_UseBig5
	// GBK -> UCS-4
	auto utf32str = Decode<Decoder::GBK>(Test_Big5Str);
	// UCS-4 -> UTF-16LE
	auto utf16le = Encode<Encoder::UTF16LE, wchar_t>(utf32str.begin(), utf32str.end());
	// UCS-4 -> Big5
	auto big5 = Encode<Encoder::BIG5>(utf32str.begin(), utf32str.end());
	// Big5 -> UCS-4
	auto utf32str2 = Decode<Decoder::BIG5>(big5.c_str());

	// print(MakeMBCS(utf32str));
	print("minicodecv GBK->UCS4", String(utf32str));
	print("minicodecv UCS4->UTF16LE", String(Test_CastCPtr(char16_t,
		utf16le.c_str())));
	print("minicodecv UCS4->BIG5", MBCSToMBCS(big5, 950, CP_UTF8));
	print("minicodecv BIG5->UCS4", String(utf32str2));
	print("minicodecv UCS4->UTF16", String(Test_CastCPtr(char16_t, Encode<Encoder::UTF16,
		wchar_t>(utf32str2.begin(), utf32str2.end()).c_str())));
#endif
#endif
}
} // unnamed namespace; | |
int main() | |
{ | |
using namespace YSLib; | |
#define Test_Seq0 0xE4, 0xBD, 0xA0, 0xE6, 0x98, 0xAF, 0xE6, 0x88, 0x91, 0xE7, \ | |
0x9A, 0x84, 0xE5, 0xB0, 0x8F, 0xE5, 0x91, 0x80, 0xE5, 0xB0, 0x8F, 0xE8, \ | |
0x8B, 0xB9, 0xE6, 0x9E, 0x9C | |
#if Test_UseUnitStr | |
#define Test_Seq { Test_Seq0 } | |
#define Test_Big5Str "妳是我的小呀小蘋果" | |
#else | |
#define Test_Big5Str0 "妳是我的小呀小蘋果" | |
#define Test_Seq1 Test_Seq0, Test_Seq0 | |
#define Test_Seq { Test_Seq1, Test_Seq1 } | |
#define Test_Big5Str1 Test_Big5Str0 Test_Big5Str0 | |
#define Test_Big5Str Test_Big5Str1 Test_Big5Str1 | |
#endif | |
#if Test_SMP | |
# define Test_SMP_Seq u8"𤭢𤭢𤭢𤭢𤭢0𤭢1𤭢2𤭢3𤭢4𤭢𤭢5𤭢" | |
const string utf8str0 = Test_SMP_Seq; | |
#else | |
const vector<unsigned char> utf8str0 = Test_Seq; | |
#endif | |
const string utf8str(utf8str0.begin(), utf8str0.end()); | |
TestPerformance(utf8str); | |
TestOutputContent(utf8str); | |
} | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment