Navigation Menu

Skip to content

Instantly share code, notes, and snippets.

@FrankHB
Last active October 19, 2015 00:43
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save FrankHB/09a27dd51eb934984820 to your computer and use it in GitHub Desktop.
Save FrankHB/09a27dd51eb934984820 to your computer and use it in GitHub Desktop.
UTF-8 decoding/encoding performance test
// LICENSE: MIT
// Based on: https://github.com/9chu/minicodecvt/blob/master/TestCpp/main.cpp
// Depends on:
// rapidjson: https://github.com/miloyip/rapidjson .
// minicodecvt: https://github.com/9chu/minicodecvt
// YSLib: https://github.com/FrankHB/YSLib
// MCF: https://github.com/lhmouse/MCF
// Prerequisite:
// *-w64-mingw32-g++ supports '-std=c++14'.
// Run Tool/install-sysroot.sh in YSLib repo dir to install Sysroot
// and SHBuild.
// See also https://bitbucket.org/FrankHB/yslib/wiki/Prerequisitions.zh-CN.md.
#if 0
/*
#!/usr/bin/env sh
export rapidjson_SRC=/e/Source/rapidjson/include
export minicodecvt_SRC=/e/Source/9chu/minicodecvt/minicodecvt
export MCF_SRC=/e/Source/lhmouse/MCF/MCF
export SYSROOT=/f/Programing/YSLib/sysroot/usr
export CXX=g++
export AR=gcc-ar
#export C_CXXFLAGS_ARCH='-mfpmath=both -march=nocona'
export C_CXXFLAGS_ARCH='-mfpmath=sse -march=native'
export CXXFLAGS_STD='-std=c++14'
export CXXFLAGS="$CXXFLAGS_STD $C_CXXFLAGS_ARCH -pipe -fdata-sections -ffunction-sections \
-pedantic-errors -Wall -Wcast-align -Wdeprecated -Wdeprecated-declarations \
-Wextra -Wfloat-equal -Wformat=2 -Winvalid-pch -Wmissing-declarations \
-Wmissing-include-dirs -Wmultichar -Wno-format-nonliteral -Wredundant-decls -Wshadow -Wsign-conversion \
-Wdouble-promotion -Wlogical-op -Wtrampolines \
-Wctor-dtor-privacy -Wnon-virtual-dtor -Woverloaded-virtual -Wsign-promo -Wdouble-promotion -Wlogical-op \
-Wconditionally-supported -Wstrict-null-sentinel -Wzero-as-null-pointer-constant -mthreads \
-O3 -DNDEBUG -s -fexpensive-optimizations -flto=jobserver -fomit-frame-pointer \
-Wall -Wextra -pedantic -Wsign-conversion \
-Wsuggest-attribute=noreturn \
-Wno-type-limits -Wno-sign-conversion -Wno-zero-as-null-pointer-constant -Wnoexcept"
export LDFLAGS="-mthreads -s -fexpensive-optimizations -flto -Wl,--gc-sections"
# build minicodecvt
SHBuild $minicodecvt_SRC $CXXFLAGS -xj,3
# build MCF
SHBuild $MCF_SRC $CXXFLAGS -xj,6
$CXX a.cc $CXXFLAGS -Wno-shadow \
-I$SYSROOT/include \
-I$rapidjson_SRC \
-I$minicodecvt_SRC .shbuild/minicodecvt.a \
-I$MCF_SRC .shbuild/MCF.a \
-L$SYSROOT/lib -DYF_DLL -DYB_DLL -lYFramework -lYBase
./a
$CXX a.cc -oa.static.exe $CXXFLAGS -Wno-shadow \
-I$SYSROOT/include \
-I$rapidjson_SRC \
-I$minicodecvt_SRC .shbuild/minicodecvt.a \
-I$MCF_SRC .shbuild/MCF.a \
-L$SYSROOT/lib -Wl,-dn -lYFramework -lYBase
*/
#endif
#include <ysbuild.h>
#include YFM_CHRLib_MappingEx
#include YFM_Win32_YCLib_Consoles
#include <ytest/timing.hpp>
//#define Test_SMP 0
//#define Test_UseCheck 0
//#define Test_UseUnitStr 0
//#define Test_UseBig5 0
#ifndef Test_LongString
# define Test_LongString 1
#endif
#ifndef Test_Performance
# define Test_Performance 1
#endif
#ifndef Test_OutputContent
# define Test_OutputContent 1
#endif
#ifndef USE_rapidjson
# define USE_rapidjson 1
#endif
#ifndef USE_minicodecvt
# define USE_minicodecvt 1
#endif
#ifndef USE_MCF
# define USE_MCF 1
#endif
#include <iostream>
#if USE_rapidjson
# include <rapidjson/rapidjson.h>
# include <rapidjson/memorystream.h>
//# include <rapidjson/encodings.h>
#endif
#if USE_minicodecvt
#undef ynothrow
# include "coverter.h"
# include "gbk.h"
# include "shift_jis.h"
# include "big5.h"
# include "unicode.h"
#endif
#if USE_MCF
# include <Core/String.hpp>
#endif
namespace fast_utf8_decoder
{
using ystdex::size_t;
using ystdex::byte;
using YSLib::u16string;
using YSLib::u32string;
using CHRLib::ConversionResult;
using state_t = std::uint_fast8_t;
// Copyright (c) 2008-2010 Bjoern Hoehrmann <bjoern@hoehrmann.de>
// See http://bjoern.hoehrmann.de/utf-8/decoder/dfa/ for details.
// Distinguished states of the UTF-8 decoding automaton. State values are
// used as offsets into the transition table utf8d2.
enum UTF8_STATE : state_t
{
// A complete code point has just been produced; the converters below also
// start from this state.
UTF8_ACCEPT = 0,
// Error state: entries 12..23 of utf8d2 are all 12, so it is never left.
UTF8_REJECT = 12
};
// Maps bytes to character classes to reduce the size of the transition table
// and create bitmasks.
// Indexed by byte value (256 entries, 32 per row); each entry is a class
// number in the range 0..11.
yconstexpr state_t utf8d1[]{
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,
7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,
8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
10,3,3,3,3,3,3,3,3,3,3,3,3,4,3,3, 11,6,6,6,5,8,8,8,8,8,8,8,8,8,8,8
};
// Transition table that maps a combination of a state of the automaton and a
// character class to a state.
// Indexed by (state + class). States are multiples of 12 (0, 12, ..., 96),
// one group of 12 entries per state, 108 entries in total.
yconstexpr state_t utf8d2[]{
0,12,24,36,60,96,84,12,12,12,48,72, 12,12,12,12,12,12,12,12,12,12,12,12,
12, 0,12,12,12,12,12, 0,12, 0,12,12, 12,24,12,12,12,12,12,24,12,24,12,12,
12,12,12,12,12,12,12,24,12,12,12,12, 12,24,12,12,12,12,12,12,12,24,12,12,
12,12,12,12,12,12,12,36,12,36,12,12, 12,36,12,12,12,12,12,36,12,36,12,12,
12,36,12,12,12,12,12,12,12,12,12,12
};
namespace
{
// Advances the UTF-8 automaton by one input byte.
// res: current state (at most 96); code: code point accumulator, updated in
// place; b: the next input byte.
// Returns the next state, looked up in utf8d2.
state_t
decode(state_t res, char32_t& code, byte b)
{
    yconstraint(res <= 96);

    const auto cls(utf8d1[size_t(b)]);

    yassume(cls < UTF8_REJECT);
    if(res == UTF8_ACCEPT)
        // First byte of a sequence: the class number doubles as the shift
        // that masks off the length-marker bits.
        code = (0xFFU >> cls) & b;
    else
        // Inside a sequence: append the low six bits of this byte.
        code = (b & 0x3FU) | (code << 6);
    return utf8d2[res + cls];
}
}
// Decodes the UTF-8 bytes in [first, last) into UTF-16 code units written
// through result, which must have room for n units, and appends a zero
// terminator.
// Returns:
//   OK        - the whole input was decoded and terminated;
//   Invalid   - the input is malformed or ends inside a sequence;
//   Unhandled - the output space is exhausted;
//   BadState  - the automaton state is outside the transition table.
// Decoding stops at the first malformed byte. The REJECT state is never left
// once entered, so continuing would only store meaningless values (possibly
// two units per input byte, which could run past the n-unit buffer).
template<typename _tIn, typename _tOut>
ConversionResult
ToUTF16(_tIn first, _tIn last, _tOut result, size_t n)
{
    auto s = first;
    auto d = result;
    char32_t code(0);
    state_t res(0);

    while(s < last)
    {
        // Consume at most (free units - 1) bytes per pass, so that one unit
        // stays available for the terminator.
        size_t dst_words_free(n - (d - result));
        auto src_current_end(s + dst_words_free - 1);

        if(last < src_current_end)
            src_current_end = last;
        if(src_current_end <= s)
            return ConversionResult::Unhandled;
        while(s < src_current_end)
        {
            const auto c(*s);

            ++s;
            if(YB_UNLIKELY(res > 96))
                return ConversionResult::BadState;
            res = decode(res, code, byte(c));
            if(res == UTF8_REJECT)
                return ConversionResult::Invalid;
            if(res != UTF8_ACCEPT)
                continue;
            if(code > 0xFFFFU)
            {
                // Surrogate pair; 0xD7C0 == 0xD800 - (0x10000 >> 10).
                *d = char16_t(0xD7C0U + (code >> 10U));
                ++d;
                *d = char16_t(0xDC00U + (code & 0x3FFU));
                ++d;
            }
            else
            {
                *d = char16_t(code);
                ++d;
            }
        }
    }
    if(res != UTF8_ACCEPT)
        return ConversionResult::Invalid;
    if((n - (d - result)) == 0)
        return ConversionResult::Unhandled;
    *d++ = 0;
    return ConversionResult::OK;
}
// Decodes the UTF-8 bytes in [first, last) into UTF-32 code points written
// through result, which must have room for n units, and appends a zero
// terminator.
// Returns:
//   OK        - the whole input was decoded and terminated;
//   Invalid   - the input is malformed or ends inside a sequence;
//   Unhandled - the output space is exhausted;
//   BadState  - the automaton state is outside the transition table.
// Decoding stops at the first malformed byte instead of storing the
// meaningless accumulator value for every remaining byte (the REJECT state
// is never left once entered).
template<typename _tIn, typename _tOut>
ConversionResult
ToUTF32(_tIn first, _tIn last, _tOut result, size_t n)
{
    auto s = first;
    auto d = result;
    char32_t code(0);
    state_t res(0);

    while(s < last)
    {
        // Consume at most (free units - 1) bytes per pass, so that one unit
        // stays available for the terminator.
        size_t dst_words_free(n - (d - result));
        auto src_current_end(s + dst_words_free - 1);

        if(last < src_current_end)
            src_current_end = last;
        if(src_current_end <= s)
            return ConversionResult::Unhandled;
        while(s < src_current_end)
        {
            const auto c(*s);

            ++s;
            if(YB_UNLIKELY(res > 96))
                return ConversionResult::BadState;
            res = decode(res, code, byte(c));
            if(res == UTF8_REJECT)
                return ConversionResult::Invalid;
            if(res != UTF8_ACCEPT)
                continue;
            *d = code;
            ++d;
        }
    }
    if(res != UTF8_ACCEPT)
        return ConversionResult::Invalid;
    // Same form as in ToUTF16; avoids comparing size_t with a signed
    // difference.
    if((n - (d - result)) == 0)
        return ConversionResult::Unhandled;
    *d++ = 0;
    return ConversionResult::OK;
}
// Converts a UTF-8 string (anything with length(), cbegin() and cend()) to a
// UTF-16 string.
// The work buffer holds twice as many units as the input has bytes. The
// buffer is zero-filled and the converter zero-terminates its output, so the
// result is cut at the first zero unit; otherwise the returned string would
// always be 2 * length() units long, padded with NULs.
// Caveat: input containing an embedded U+0000 is therefore truncated there.
// The conversion status is reported to std::cerr only when Test_UseCheck is
// set; it is otherwise ignored.
template<typename _tString>
u16string
ToUTF16(const _tString& str)
{
    const auto len(str.length() * 2);
    u16string res(len, char16_t());
    const auto
        cres(ToUTF16(str.cbegin(), str.cend(), res.begin(), len));

#if Test_UseCheck
    if(cres != ConversionResult::OK)
        std::cerr << int(cres) << std::endl;
#else
    yunused(cres);
#endif
    // find() yields npos only when no zero unit exists (e.g. empty input).
    const auto used(res.find(char16_t()));

    if(used != u16string::npos)
        res.resize(used);
    return res;
}
// Converts a UTF-8 string (anything with length(), cbegin() and cend()) to a
// UTF-32 string.
// The work buffer holds twice as many units as the input has bytes. The
// buffer is zero-filled and the converter zero-terminates its output, so the
// result is cut at the first zero unit; otherwise the returned string would
// always be 2 * length() units long, padded with NULs.
// Caveat: input containing an embedded U+0000 is therefore truncated there.
// The conversion status is reported to std::cerr only when Test_UseCheck is
// set; it is otherwise ignored.
template<typename _tString>
u32string
ToUTF32(const _tString& str)
{
    const auto len(str.length() * 2);
    u32string res(len, char32_t());
    const auto
        cres(ToUTF32(str.cbegin(), str.cend(), res.begin(), len));

#if Test_UseCheck
    if(cres != ConversionResult::OK)
        std::cerr << int(cres) << std::endl;
#else
    yunused(cres);
#endif
    // find() yields npos only when no zero unit exists (e.g. empty input).
    const auto used(res.find(char32_t()));

    if(used != u32string::npos)
        res.resize(used);
    return res;
}
}
#if USE_rapidjson
namespace rapid_utf8_decoder
{
using namespace rapidjson;
using ystdex::size_t;
using ystdex::byte;
using YSLib::u16string;
using YSLib::u32string;
using CHRLib::ConversionResult;
using state_t = std::uint_fast8_t;
// Decodes the remaining bytes of a rapidjson MemoryStream as UTF-8 and writes
// UTF-16 code units through result (room for n units), followed by a zero
// terminator. The stream's read position is advanced by UTF8<>::Decode.
// Returns Unhandled when the output space runs out, otherwise OK; a failed
// Decode call is skipped without reporting an error.
template<typename _tOut>
ConversionResult
ToUTF16(MemoryStream& stream, _tOut result, size_t n)
{
    auto& src_end = stream.end_;
    auto& cur = stream.src_;
    auto out = result;
    unsigned cp;

//  state_t res(0);
    while(cur < src_end)
    {
        // Limit this pass to (free units - 1) bytes of input.
        const size_t room(n - (out - result));
        auto chunk_end(cur + room - 1);

        if(src_end < chunk_end)
            chunk_end = src_end;
        if(!(cur < chunk_end))
            return ConversionResult::Unhandled;
        while(cur < chunk_end)
        {
#if Test_UseCheck && 0
            if(YB_UNLIKELY(res >= 0x50))
                return ConversionResult::BadState;
#endif
            if(UTF8<>::Decode(stream, &cp))
            {
                if(cp <= 0xFFFFU)
                    *out++ = char16_t(cp);
                else
                {
                    // Split into a high and a low surrogate.
                    *out++ = char16_t(0xD7C0U + (cp >> 10U));
                    *out++ = char16_t(0xDC00U + (cp & 0x3FFU));
                }
            }
        }
    }
#if Test_UseCheck && 0
    if(res != UTF8_ACCEPT)
        return ConversionResult::Invalid;
    if((n - (out - result)) == 0)
        return ConversionResult::Unhandled;
#endif
    *out++ = 0;
    return ConversionResult::OK;
}
// Decodes the remaining bytes of a rapidjson MemoryStream as UTF-8 and writes
// UTF-32 code points through result (room for n units), followed by a zero
// terminator. The stream's read position is advanced by UTF8<>::Decode.
// Returns Unhandled when the output space runs out, otherwise OK; a failed
// Decode call is skipped without reporting an error.
template<typename _tOut>
ConversionResult
ToUTF32(MemoryStream& stream, _tOut result, size_t n)
{
    auto& last = stream.end_;
    auto& s = stream.src_;
    auto d = result;
    unsigned code;

//  state_t res(0);
    while(s < last)
    {
        // Limit this pass to (free units - 1) bytes of input.
        size_t dst_words_free(n - (d - result));
        auto src_current_end(s + dst_words_free - 1);

        if(last < src_current_end)
            src_current_end = last;
        if(src_current_end <= s)
            return ConversionResult::Unhandled;
        while(s < src_current_end)
        {
#if Test_UseCheck && 0
            if(YB_UNLIKELY(res >= 0x50))
                return ConversionResult::BadState;
#endif
            if(!UTF8<>::Decode(stream, &code))
                continue;
            *d = code;
            ++d;
        }
    }
#if Test_UseCheck && 0
    if(res != UTF8_ACCEPT)
        return ConversionResult::Invalid;
    if((n - (d - result)) == 0)
        return ConversionResult::Unhandled;
#endif
    *d++ = 0;
    return ConversionResult::OK;
}
// Converts a UTF-8 string (anything with data() and length()) to a UTF-16
// string using the rapidjson decoder.
// The work buffer holds twice as many units as the input has bytes. The
// buffer is zero-filled and the converter zero-terminates its output, so the
// result is cut at the first zero unit; otherwise the returned string would
// always be 2 * length() units long, padded with NULs.
// Caveat: input containing an embedded U+0000 is therefore truncated there.
// The conversion status is reported to std::cerr only when Test_UseCheck is
// set; it is otherwise ignored.
template<typename _tString>
u16string
ToUTF16(const _tString& str)
{
    const auto len(str.length() * 2);
    u16string res(len, char16_t());
    MemoryStream ms(str.data(), str.length());
    const auto cres(ToUTF16(ms, res.begin(), len));

#if Test_UseCheck
    if(cres != ConversionResult::OK)
        std::cerr << int(cres) << std::endl;
#else
    yunused(cres);
#endif
    // find() yields npos only when no zero unit exists (e.g. empty input).
    const auto used(res.find(char16_t()));

    if(used != u16string::npos)
        res.resize(used);
    return res;
}
// Converts a UTF-8 string (anything with data() and length()) to a UTF-32
// string using the rapidjson decoder.
// The work buffer holds twice as many units as the input has bytes. The
// buffer is zero-filled and the converter zero-terminates its output, so the
// result is cut at the first zero unit; otherwise the returned string would
// always be 2 * length() units long, padded with NULs.
// Caveat: input containing an embedded U+0000 is therefore truncated there.
// The conversion status is reported to std::cerr only when Test_UseCheck is
// set; it is otherwise ignored.
template<typename _tString>
u32string
ToUTF32(const _tString& str)
{
    const auto len(str.length() * 2);
    u32string res(len, char32_t());
    MemoryStream ms(str.data(), str.length());
    const auto cres(ToUTF32(ms, res.begin(), len));

#if Test_UseCheck
    if(cres != ConversionResult::OK)
        std::cerr << int(cres) << std::endl;
#else
    yunused(cres);
#endif
    // find() yields npos only when no zero unit exists (e.g. empty input).
    const auto used(res.find(char32_t()));

    if(used != u32string::npos)
        res.resize(used);
    return res;
}
}
#endif
#if USE_minicodecvt
namespace minicodecvt
{
// Re-encodes a NUL-terminated byte string: every byte is fed to a decoder_t
// object, and each code point it reports is fed to an encoder_t object whose
// output bytes are appended to the result.
// Unlike a plain conversion, the result reserves 4 * strlen(input) elements
// up front.
// Throws Exception::EncodeError when the encoder's output size is not a
// multiple of sizeof(char_t). When Test_UseCheck is set, it also throws
// DecodeError / EncodeError if the last decoder / encoder call did not
// report completion.
template<typename decoder_t, typename encoder_t, typename char_t = char>
std::basic_string<char_t> ConvertR(const char* nullTerminatedString)
{
    decoder_t decoder;
    encoder_t encoder;
    std::basic_string<char_t> ret;

    ret.reserve(strlen(nullTerminatedString) * 4);

    // Initialized so that nothing indeterminate is read if a codec returns
    // without setting its outputs.
    char32_t ucs4 = 0;
    bool de_finished = true;
    uint8_t* buf = nullptr;
    size_t size = 0;
    bool en_finished = true;

    while (*nullTerminatedString != 0)
    {
        if (decoder(*nullTerminatedString, ucs4))
        {
            de_finished = true;
            if (encoder(ucs4, buf, size))
            {
                en_finished = true;
                if (buf)
                {
                    if (size % sizeof(char_t) != 0)
                        throw Exception::EncodeError("output character type is not match with encoder.");
                    ret.append(reinterpret_cast<char_t*>(buf), size / sizeof(char_t));
                }
            }
            else
                en_finished = false;
        }
        else
            de_finished = false;
        ++nullTerminatedString;
    }
#if Test_UseCheck
    if (!de_finished)
        throw Exception::DecodeError("not all bytes is decoded.");
    if (!en_finished)
        throw Exception::EncodeError("not all bytes is encoded.");
#else
    yunused(de_finished);
    yunused(en_finished);
#endif
    // Return the local by name: std::move here would disable copy elision.
    return ret;
}
// Re-encodes the elements in [begin, end): every element is fed to a
// decoder_t object, and each code point it reports is fed to an encoder_t
// object whose output bytes are appended to the result.
// Unlike a plain conversion, the result reserves 4 * distance(begin, end)
// elements up front.
// Throws Exception::EncodeError when the encoder's output size is not a
// multiple of sizeof(char_t). When Test_UseCheck is set, it also throws
// DecodeError / EncodeError if the last decoder / encoder call did not
// report completion.
template<typename decoder_t, typename encoder_t, typename char_t = char, typename iterator_t>
std::basic_string<char_t> ConvertR(iterator_t begin, iterator_t end)
{
    decoder_t decoder;
    encoder_t encoder;
    std::basic_string<char_t> ret;

    ret.reserve(std::distance(begin, end) * 4);

    // Initialized so that nothing indeterminate is read if a codec returns
    // without setting its outputs.
    char32_t ucs4 = 0;
    bool de_finished = true;
    uint8_t* buf = nullptr;
    size_t size = 0;
    bool en_finished = true;

    while (begin != end)
    {
        if (decoder(*begin, ucs4))
        {
            de_finished = true;
            if (encoder(ucs4, buf, size))
            {
                en_finished = true;
                if (buf)
                {
                    if (size % sizeof(char_t) != 0)
                        throw Exception::EncodeError("output character type is not match with encoder.");
                    ret.append(reinterpret_cast<char_t*>(buf), size / sizeof(char_t));
                }
            }
            else
                en_finished = false;
        }
        else
            de_finished = false;
        ++begin;
    }
#if Test_UseCheck
    if (!de_finished)
        throw Exception::DecodeError("not all bytes is decoded.");
    if (!en_finished)
        throw Exception::EncodeError("not all bytes is encoded.");
#else
    yunused(de_finished);
    yunused(en_finished);
#endif
    // Return the local by name: std::move here would disable copy elision.
    return ret;
}
}
#endif
namespace CHRLib
{
// Converts the NUL-terminated multibyte string s (encoding enc) to UCS-2,
// reusing str as the destination. str is first sized to the source length,
// then shrunk to the count returned by MBCSToUCS2. Returns str.
template<class _tDst = std::basic_string<char16_t>>
YB_FLATTEN YB_NONNULL(2) _tDst&
EmplaceUCS2LE(_tDst& str, const char* s, Encoding enc = CS_Default)
{
    const auto src_len(ystdex::ntctslen(s));

    str.resize(src_len);

    const auto dst_len(MBCSToUCS2(&str[0], s, enc));

    str.resize(dst_len);
    return str;
}
// Converts the multibyte string view sv (encoding enc) to UCS-2, reusing str
// as the destination. str is first sized to the source length, then shrunk
// to the count returned by MBCSToUCS2. sv.data() must not be null.
// Returns str.
template<class _tDst = std::basic_string<char16_t>>
YB_FLATTEN _tDst&
EmplaceUCS2LE(_tDst& str, string_view sv, Encoding enc = CS_Default)
{
    const auto src(sv.data());

    yconstraint(src);

    const auto src_len(sv.length());

#if 1
    str.resize(src_len);

    const auto dst_len(MBCSToUCS2(&str[0], src, src + src_len, enc));

    str.resize(dst_len);
#else
    // Disabled alternative: convert in fixed-size blocks through a stack
    // buffer and append each block.
    static yconstexpr size_t blk_size(1 << 8U);
    char16_t xbuf[blk_size];
    const size_t blk(src_len / blk_size);

    str.reserve(src_len);
    str.clear();
    for(size_t i(0); i < blk; ++i)
        str.append(xbuf, MBCSToUCS2(xbuf, src + i * blk_size,
            src + std::min<size_t>((i + 1) * blk_size, src_len), enc));
#endif
    return str;
}
// Copies the UCS-2 string view sv into str without any conversion; the
// encoding argument is ignored. sv.data() must not be null. Returns str.
template<class _tDst = std::basic_string<char16_t>>
inline YB_FLATTEN _tDst&
EmplaceUCS2LE(_tDst& str, u16string_view sv, Encoding = CharSet::ISO_10646_UCS_2)
{
    const auto src(sv.data());

    yconstraint(src);
    // FIXME: Correct conversion for encoding other than UCS-2LE.
    str = {src, sv.length()};
    return str;
}
// Converts the NUL-terminated UCS-4 string s to UCS-2, reusing str as the
// destination; the encoding argument is ignored. str is first sized to the
// source length, then shrunk to the count returned by UCS4ToUCS2.
// Returns str.
template<class _tDst = std::basic_string<char16_t>>
YB_FLATTEN YB_NONNULL(2) _tDst&
EmplaceUCS2LE(_tDst& str, const char32_t* s, Encoding = CharSet::ISO_10646_UCS_4)
{
    const auto src_len(ystdex::ntctslen(s));

    str.resize(src_len);

    const auto dst_len(UCS4ToUCS2(&str[0], s));

    str.resize(dst_len);
    return str;
}
// Converts the UCS-4 string view sv to UCS-2, reusing str as the
// destination; the encoding argument is ignored. str is first sized to the
// source length, then shrunk to the count returned by UCS4ToUCS2.
// sv.data() must not be null. Returns str.
template<class _tDst = std::basic_string<char16_t>>
YB_FLATTEN _tDst&
EmplaceUCS2LE(_tDst& str, u32string_view sv, Encoding = CharSet::ISO_10646_UCS_4)
{
    const auto src(sv.data());

    yconstraint(src);

    const auto src_len(sv.length());

    str.resize(src_len);

    const auto dst_len(UCS4ToUCS2(&str[0], src, src + src_len));

    str.resize(dst_len);
    return str;
}
// Converts the NUL-terminated multibyte string s (encoding enc) to UCS-4,
// reusing str as the destination. str is first sized to the source length,
// then shrunk to the count returned by MBCSToUCS4. Returns str.
template<class _tDst = std::basic_string<char32_t>>
YB_FLATTEN YB_NONNULL(2) _tDst&
EmplaceUCS4LE(_tDst& str, const char* s, Encoding enc = CS_Default)
{
    const auto src_len(ystdex::ntctslen(s));

    str.resize(src_len);

    const auto dst_len(MBCSToUCS4(&str[0], s, enc));

    str.resize(dst_len);
    return str;
}
// Converts the multibyte string view sv (encoding enc) to UCS-4, reusing str
// as the destination. str is first sized to the source length, then shrunk
// to the count returned by MBCSToUCS4. sv.data() must not be null.
// Returns str.
template<class _tDst = std::basic_string<char32_t>>
YB_FLATTEN _tDst&
EmplaceUCS4LE(_tDst& str, string_view sv, Encoding enc = CS_Default)
{
    const auto src(sv.data());

    yconstraint(src);

    const auto src_len(sv.length());

    str.resize(src_len);

    const auto dst_len(MBCSToUCS4(&str[0], src, src + src_len, enc));

    str.resize(dst_len);
    return str;
}
// Converts the NUL-terminated UCS-2 string s to UCS-4, reusing str as the
// destination; the encoding argument is ignored. str is first sized to the
// source length, then shrunk to the count returned by UCS2ToUCS4.
// Returns str.
template<class _tDst = std::basic_string<char32_t>>
YB_FLATTEN YB_NONNULL(2) _tDst&
EmplaceUCS4LE(_tDst& str, const char16_t* s, Encoding = CharSet::ISO_10646_UCS_2)
{
    const auto src_len(ystdex::ntctslen(s));

    str.resize(src_len);

    const auto dst_len(UCS2ToUCS4(&str[0], s));

    str.resize(dst_len);
    return str;
}
// Converts the UCS-2 string view sv to UCS-4, reusing str as the
// destination; the encoding argument is ignored. str is first sized to the
// source length, then shrunk to the count returned by UCS2ToUCS4.
// sv.data() must not be null. Returns str.
template<class _tDst = std::basic_string<char32_t>>
YB_FLATTEN _tDst&
EmplaceUCS4LE(_tDst& str, u16string_view sv, Encoding = CharSet::ISO_10646_UCS_2)
{
    const auto src(sv.data());

    yconstraint(src);

    const auto src_len(sv.length());

    str.resize(src_len);

    const auto dst_len(UCS2ToUCS4(&str[0], src, src + src_len));

    str.resize(dst_len);
    return str;
}
// Copies the UCS-4 string view sv into str without any conversion; the
// encoding argument is ignored. sv.data() must not be null. Returns str.
template<class _tDst = std::basic_string<char32_t>>
YB_FLATTEN inline _tDst&
EmplaceUCS4LE(_tDst& str, u32string_view sv, Encoding = CharSet::ISO_10646_UCS_4)
{
    const auto src(sv.data());

    yconstraint(src);
    // FIXME: Correct conversion for encoding other than UCS-4LE.
    str = {src, sv.length()};
    return str;
}
// Stores into str the multibyte string s re-encoded to enc: when enc is
// CS_Default the two-argument form is used, otherwise the text goes through
// a UCS-2 intermediate first. Returns the result of the selected call.
// The condition compares enc with CS_Default; it previously assigned to enc
// ('=' instead of '==').
// NOTE(review): no EmplaceMBCS overload visible in this file accepts
// (str, const char*) or (str, Encoding); confirm the intended callees before
// instantiating this template.
template<class _tDst = std::string>
YB_FLATTEN inline YB_NONNULL(2) _tDst&
EmplaceMBCS(_tDst& str, const char* s, Encoding enc)
{
    return enc == CS_Default ? EmplaceMBCS<_tDst>(str, s)
        : EmplaceMBCS<_tDst>(EmplaceUCS2LE(str, s, CS_Default), enc);
}
// Stores into str the multibyte string view sv re-encoded to enc: when enc
// is CS_Default the two-argument form is used, otherwise the text goes
// through a UCS-2 intermediate first. Returns the result of the selected
// call.
// The condition compares enc with CS_Default; it previously assigned to enc
// ('=' instead of '==').
// NOTE(review): no EmplaceMBCS overload visible in this file accepts
// (str, string_view) or (str, Encoding); confirm the intended callees before
// instantiating this template.
template<class _tDst = std::string>
YB_FLATTEN inline _tDst&
EmplaceMBCS(_tDst& str, string_view sv, Encoding enc)
{
    return enc == CS_Default ? EmplaceMBCS<_tDst>(str, sv)
        : EmplaceMBCS<_tDst>(EmplaceUCS2LE(str, sv, CS_Default), enc);
}
// Converts the NUL-terminated UCS-2 string s to a multibyte string in
// encoding enc, reusing str as the destination. str is sized to the source
// length times the per-character width (sizeof(ucsint_t) when
// FetchMaxCharWidth reports 0), then shrunk to the count returned by
// UCS2ToMBCS. Returns str.
template<class _tDst = std::string>
YB_FLATTEN YB_NONNULL(2) _tDst&
EmplaceMBCS(_tDst& str, const char16_t* s, Encoding enc = CS_Default)
{
    const auto width(FetchMaxCharWidth(enc));
    const auto per_char(width == 0 ? sizeof(ucsint_t) : width);

    str.resize(ystdex::ntctslen(s) * per_char);

    const auto dst_len(UCS2ToMBCS(&str[0], s, enc));

    str.resize(dst_len);
    return str;
}
// Converts the UCS-2 string view sv to a multibyte string in encoding enc,
// reusing str as the destination. str is sized to the source length times
// the per-character width (sizeof(ucsint_t) when FetchMaxCharWidth reports
// 0), then shrunk to the count returned by UCS2ToMBCS.
// sv.data() must not be null. Returns str.
template<class _tDst = std::string>
YB_FLATTEN _tDst&
EmplaceMBCS(_tDst& str, u16string_view sv, Encoding enc = CS_Default)
{
    const auto src(sv.data());

    yconstraint(src);

    const auto src_len(sv.length());
    const auto width(FetchMaxCharWidth(enc));
    const auto per_char(width == 0 ? sizeof(ucsint_t) : width);

    str.resize(src_len * per_char);

    const auto dst_len(UCS2ToMBCS(&str[0], src, src + src_len, enc));

    str.resize(dst_len);
    return str;
}
// Converts the NUL-terminated UCS-4 string s to a multibyte string in
// encoding enc, reusing str as the destination. str is sized to the source
// length times the per-character width, then shrunk to the count returned by
// UCS4ToMBCS. Returns str.
// A width of 0 from FetchMaxCharWidth falls back to sizeof(ucsint_t), as in
// the char16_t overloads; without the fallback the buffer would be empty
// while the converter still writes through &str[0].
template<class _tDst = std::string>
YB_FLATTEN YB_NONNULL(2) _tDst&
EmplaceMBCS(_tDst& str, const char32_t* s, Encoding enc = CS_Default)
{
    const auto w(FetchMaxCharWidth(enc));

    str.resize(ystdex::ntctslen(s) * (w == 0 ? sizeof(ucsint_t) : w));
    str.resize(UCS4ToMBCS(&str[0], s, enc));
    return str;
}
// Converts the UCS-4 string view sv to a multibyte string in encoding enc,
// reusing str as the destination. str is sized to the source length times
// the per-character width, then shrunk to the count returned by UCS4ToMBCS.
// sv.data() must not be null. Returns str.
// A width of 0 from FetchMaxCharWidth falls back to sizeof(ucsint_t), as in
// the char16_t overloads; without the fallback the buffer would be empty
// while the converter still writes through &str[0].
template<class _tDst = std::string>
YB_FLATTEN _tDst&
EmplaceMBCS(_tDst& str, u32string_view sv, Encoding enc = CS_Default)
{
    const auto s(sv.data());

    yconstraint(s);

    const auto l(sv.length());
    const auto w(FetchMaxCharWidth(enc));

    str.resize(l * (w == 0 ? sizeof(ucsint_t) : w));
    str.resize(UCS4ToMBCS(&str[0], s, s + l, enc));
    return str;
}
} // namespace CHRLib;
namespace
{
#if Test_OutputContent
using namespace platform;
using namespace YSLib;
using namespace platform_ex;
using namespace CHRLib;
// Prints a caption line, the length of s followed by ':', then the UTF-16
// text itself: through the wide console on Win32, otherwise converted with
// String::GetMBCS and EncodeArg and written to std::cout.
void
print(const string& n, u16string_view s){
    std::cout << n << std::endl;
    std::cout << s.length() << ':' << std::endl;
# if YCL_Win32
    FetchStaticRef<WConsole>().WriteString(s);
# else
    std::cout << EncodeArg(String(s).GetMBCS());
# endif
    std::cout << std::endl;
}
// Prints a caption line, the length of s followed by ':', then the multibyte
// text itself: through the wide console on Win32, otherwise passed to
// EncodeArg and written to std::cout.
// The WConsole reference is fetched only inside the Win32 branch, matching
// the u16string_view overload; it is not used on other platforms.
void
print(const string& n, string_view s){
    using namespace std;

    cout << n << endl << s.length() << ':' << endl;
# if YCL_Win32
    auto& wcon(FetchStaticRef<WConsole>());

    wcon.WriteString(s);
# else
    // NOTE(review): s.data() is passed without a length; this presumably
    // relies on the viewed text being NUL-terminated - confirm.
    cout << EncodeArg(s.data());
# endif
    cout << endl;
}
#endif
// Times a UTF-8 -> UTF-16 -> UTF-8 -> UTF-16 round trip of utf8str with each
// enabled library and prints the measurements to std::cout.
// Every t_* lambda below takes a byte range [b, e) and performs one such
// round trip; the results are discarded. The whole body is compiled out
// (utf8str is only marked as unused) when Test_Performance is 0.
template<typename _type>
void
TestPerformance(const _type& utf8str)
{
#if Test_Performance
// YFramework MBCSToWCS/WCSToMBCS (labelled "Win32" in the report below).
auto t_yw32 = [](auto b, auto e){
// UTF8->UTF16
auto utf16str1 = MBCSToWCS(string_view(b, e - b));
// UTF16->UTF8
auto utf8str2 = WCSToMBCS(utf16str1);
// UTF8->UTF16
auto utf16str1_test = MBCSToWCS(utf8str2);
#if Test_UseBig5
auto utf16str = MBCSToWCS(Test_Big5Str, 936);
#endif
};
// YFramework Make* functions, going through a UCS-4 intermediate at every
// step; each step creates a new string.
auto t_ycu4 = [](auto b, auto e){
// UTF8->UTF16: UTF8->UCS4LE
auto utf16str1_mid32 = MakeUCS4LE(string_view(b, e - b));
// UTF8->UTF16: UCS4LE->UCS2
auto utf16str1 = MakeUCS2LE(utf16str1_mid32);
// UTF16->UTF8: UCS2->UCS4LE
auto utf8str2_mid32 = MakeUCS4LE(utf16str1);
// UTF16->UTF8: UCS4LE->UTF8
auto utf8str2 = MakeMBCS(utf8str2_mid32);
// UTF8->UTF16: UTF8->UCS4LE
auto utf16str1_test_mid32 = MakeUCS4LE(utf8str2);
// UTF8->UTF16: UCS4LE->UCS2
auto utf16str1_test = MakeUCS2LE(utf16str1_test_mid32);
#if Test_UseBig5
// auto utf16str = MakeMBCS(Test_Big5Str, CharSet::GBK);
#endif
};
// Same as t_ycu4, but the later steps reuse existing strings through the
// Emplace* functions instead of creating new ones.
auto t_ycu4_r = [](auto b, auto e){
// UTF8->UTF16: UTF8->UCS4LE
auto utf_mid32 = MakeUCS4LE(string_view(b, e - b));
// UTF8->UTF16: UCS4LE->UCS2
auto utf16str1 = MakeUCS2LE(utf_mid32);
// UTF16->UTF8: UCS2->UCS4LE
EmplaceUCS4LE(utf_mid32, utf16str1);
// UTF16->UTF8: UCS4LE->UTF8
auto utf8str2 = MakeMBCS(utf_mid32);
// UTF8->UTF16: UTF8->UCS4LE
EmplaceUCS4LE(utf_mid32, utf8str2);
// UTF8->UTF16: UCS4LE->UCS2
EmplaceUCS2LE(utf16str1, utf_mid32);
#if Test_UseBig5
// auto utf16str = MakeMBCS(Test_Big5Str, CharSet::GBK);
#endif
};
// YFramework MakeUCS2LE/MakeMBCS called directly, without a UCS-4 step here.
auto t_ychr = [](auto b, auto e){
// UTF8->UTF16
auto utf16str1 = MakeUCS2LE(string_view(b, e - b));
// UTF16->UTF8
auto utf8str2 = MakeMBCS(utf16str1);
// UTF8->UTF16
auto utf16str1_test = MakeUCS2LE(utf8str2);
#if Test_UseBig5
// auto utf16str = MakeMBCS(Test_Big5Str, CharSet::GBK);
#endif
};
// Same as t_ychr, but the final conversion reuses utf16str1.
auto t_ychr_r = [](auto b, auto e){
// UTF8->UTF16
auto utf16str1 = MakeUCS2LE(string_view(b, e - b));
// UTF16->UTF8
auto utf8str2 = MakeMBCS(utf16str1);
// UTF8->UTF16
EmplaceUCS2LE(utf16str1, utf8str2);
#if Test_UseBig5
// auto utf16str = MakeMBCS(Test_Big5Str, CharSet::GBK);
#endif
};
// YSLib String construction and String::GetMBCS.
auto t_ysl = [](auto b, auto e){
// UTF8->UTF16
auto utf16str1 = String(string_view(b, e - b));
// UTF16->UTF8
auto utf8str2 = utf16str1.GetMBCS();
// UTF8->UTF16
auto utf16str1_test = String(utf8str2);
#if Test_UseBig5
auto utf16str = String(Test_Big5Str, CharSet::CP936);
#endif
};
// Same as t_ysl, but the final result is stored back into utf16str1.
auto t_ysl_r = [](auto b, auto e){
// UTF8->UTF16
auto utf16str1 = String(string_view(b, e - b));
// UTF16->UTF8
auto utf8str2 = utf16str1.GetMBCS();
// UTF8->UTF16
EmplaceUCS2LE(utf16str1, String(utf8str2));
#if Test_UseBig5
auto utf16str = String(Test_Big5Str, CharSet::CP936);
#endif
};
// Table-driven decoder from fast_utf8_decoder for UTF-8 -> UTF-16, with
// YFramework MakeMBCS for the way back.
auto t_fast = [](auto b, auto e){
using namespace fast_utf8_decoder;
// UTF8->UTF16
auto utf16str1 = ToUTF16(string_view(b, e - b));
// UTF16->UTF8
auto utf8str2 = MakeMBCS(utf16str1);
// UTF8->UTF16
auto utf16str1_test = ToUTF16(utf8str2);
#if Test_UseBig5
// auto utf16str = MakeMBCS(Test_Big5Str, CharSet::GBK);
#endif
};
#if USE_rapidjson
// rapidjson-based decoder from rapid_utf8_decoder for UTF-8 -> UTF-16, with
// YFramework MakeMBCS for the way back.
auto t_rj = [](auto b, auto e){
using namespace rapid_utf8_decoder;
// UTF8->UTF16
auto utf16str1 = ToUTF16(string_view(b, e - b));
// UTF16->UTF8
auto utf8str2 = MakeMBCS(utf16str1);
// UTF8->UTF16
auto utf16str1_test = ToUTF16(utf8str2);
#if Test_UseBig5
// auto utf16str = MakeMBCS(Test_Big5Str, CharSet::GBK);
#endif
};
#endif
#if USE_minicodecvt
// minicodecvt Convert as provided by the library.
auto t_chu = [](auto b, auto e){
using namespace minicodecvt;
// UTF8->UTF16
auto utf16str1 = Convert<Decoder::UTF8, Encoder::UTF16>(b, e);
// UTF16->UTF8
auto utf8str2 = Convert<Decoder::UTF16, Encoder::UTF8>(utf16str1.begin(), utf16str1.end());
// UTF8->UTF16
auto utf16str1_test = Convert<Decoder::UTF8, Encoder::UTF16, wchar_t>(utf8str2.begin(), utf8str2.end());
# if Test_UseBig5
// GBK->UCS4
auto utf32str = Decode<Decoder::GBK>(Test_Big5Str);
// UCS4->UTF16LE
auto utf16le = Encode<Encoder::UTF16LE, wchar_t>(utf32str.begin(), utf32str.end());
# endif
};
// Same round trip using ConvertR, the variant defined in this file that
// reserves the output length up front.
auto t_chu2 = [](auto b, auto e){
using namespace minicodecvt;
// UTF8->UTF16
auto utf16str1 = ConvertR<Decoder::UTF8, Encoder::UTF16>(b, e);
// UTF16->UTF8
auto utf8str2 = ConvertR<Decoder::UTF16, Encoder::UTF8>(utf16str1.begin(), utf16str1.end());
// UTF8->UTF16
auto utf16str1_test = ConvertR<Decoder::UTF8, Encoder::UTF16, wchar_t>(utf8str2.begin(), utf8str2.end());
# if Test_UseBig5
// GBK->UCS4
auto utf32str = Decode<Decoder::GBK>(Test_Big5Str);
// UCS4->UTF16LE
auto utf16le = Encode<Encoder::UTF16LE, wchar_t>(utf32str.begin(), utf32str.end());
# endif
};
#endif
#if USE_MCF
// MCF strings with a UTF-32 intermediate; the three string objects are
// cleared and reused between steps.
auto t_mcf_r = [](auto b, auto e){
using namespace MCF;
Utf32String temp;
Utf16String u16s;
Utf8String u8s(b, e);
// temp.Clear();
// u16s.Clear();
// UTF8->UTF16
Utf8String::UnifyAppend(temp, u8s);
Utf16String::DeunifyAppend(u16s, temp);
temp.Clear();
u8s.Clear();
// UTF16->UTF8
Utf16String::UnifyAppend(temp, u16s);
Utf8String::DeunifyAppend(u8s, temp);
temp.Clear();
u16s.Clear();
// UTF8->UTF16
Utf8String::UnifyAppend(temp, u8s);
Utf16String::DeunifyAppend(u16s, temp);
};
// Variant of t_mcf_r using WideString; the UTF-8 -> UTF-16 steps call
// u16s.Append(u8s) in place of Utf8String::UnifyAppend.
auto t_mcf2_r = [](auto b, auto e){
using namespace MCF;
Utf32String temp;
WideString u16s;
Utf8String u8s(b, e);
// temp.Clear();
// u16s.Clear();
// UTF8->UTF16
// Utf8String::UnifyAppend(temp, u8s);
u16s.Append(u8s);
WideString::DeunifyAppend(u16s, temp);
temp.Clear();
u8s.Clear();
// UTF16->UTF8
WideString::UnifyAppend(temp, u16s);
Utf8String::DeunifyAppend(u8s, temp);
temp.Clear();
u16s.Clear();
// UTF8->UTF16
// Utf8String::UnifyAppend(temp, u8s);
u16s.Append(u8s);
WideString::DeunifyAppend(u16s, temp);
};
// MCF variant without an explicit UTF-32 temporary; a new string object is
// declared for each step.
auto t_mcf3 = [](auto b, auto e){
using namespace MCF;
Utf8String u8s(b, e);
// UTF8->UTF16
Utf16String utf16str1;
Utf8String::UnifyAppend(utf16str1, u8s);
// UTF16->UTF8
Utf8String utf8str2;
Utf8String::DeunifyAppend(utf8str2, utf16str1);
// UTF8->UTF16
Utf16String utf16str1_test;
Utf8String::UnifyAppend(utf16str1_test, utf8str2);
};
#endif
#if Test_LongString
// Build the long test input from utf8str; ystdex::concat with 100000 is
// presumably a repetition of the string - TODO confirm.
string unitstr(utf8str.begin(), utf8str.end());
ystdex::concat(unitstr, size_t(100000));
const auto longstr(std::move(unitstr));
#endif
// Runs one benchmark: f is timed 100000 times on utf8str and, when
// Test_LongString is set, once on longstr. The printed value is the timer
// count divided by 1e9 (presumably nanoseconds to seconds - TODO confirm).
auto ptest = [&](auto f, auto name){
using namespace std;
cout << "[Test]" << name << ':' << endl
<< ytest::timing::total(100000, Timers::HighResolutionClock::now,
bind(f, &utf8str[0], &utf8str[0] + utf8str.size())).count() / 1e9 << endl;
#if Test_LongString
cout << "[Long]" << name << ':' << endl
<< ytest::timing::total(1, Timers::HighResolutionClock::now,
bind(f, &longstr[0], &longstr[0] + longstr.size())).count() / 1e9 << endl;
#endif
};
ptest(t_yw32, "YFramework MBCSToWCS + WCSToMBCS (Win32)");
ptest(t_ycu4, "YFramework MakeUCS4LE + MakeUCS2LE + MakeMBCS");
ptest(t_ycu4_r, "YFramework MakeUCS4LE + MakeUCS2LE + EmplaceUCS4LE + MakeMBCS + EmplaceUCS2LE");
ptest(t_ychr, "YFramework MakeUCS2LE + MakeMBCS");
ptest(t_ychr_r, "YFramework MakeUCS2LE + MakeMBCS + EmplaceUCS2LE");
ptest(t_ysl, "YFramework String + String::GetMBCS");
ptest(t_ysl_r, "YFramework String + String::GetMBCS + EmplaceUCS2LE");
ptest(t_fast, "Fast ToUTF16 + YFramework MakeMBCS");
#if USE_rapidjson
ptest(t_rj, "Rapid ToUTF16 + YFramework MakeMBCS");
#endif
#if USE_MCF
ptest(t_mcf_r, "MCF String UnifyAppend + DeunifyAppend (string reused)");
ptest(t_mcf2_r, "MCF String UnifyAppend + DeunifyAppend W (string reused)");
ptest(t_mcf3, "MCF String UnifyAppend");
#endif
#if USE_minicodecvt
ptest(t_chu, "minicodecvt Convert");
ptest(t_chu2, "minicodecvt Convert (reserved length)");
#endif
#else
yunused(utf8str);
#endif
}
// Converts utf8str with each enabled library and prints every result through
// print(), so that the outputs can be compared by eye.
// The whole body is compiled out when Test_OutputContent is 0.
// NOTE(review): minicodecvt is used here without a USE_minicodecvt guard.
template<typename _type>
void
TestOutputContent(const _type& utf8str)
{
#if Test_OutputContent
using platform::ucast;
using namespace minicodecvt;
// vec UTF8->UTF16
auto utf16str1 = Convert<Decoder::UTF8, Encoder::UTF16>(begin(utf8str), end(utf8str));
// UTF16->UTF8
auto utf8str2 = Convert<Decoder::UTF16, Encoder::UTF8>(utf16str1.begin(), utf16str1.end());
// UTF8->UTF16
auto utf16str1_test = Convert<Decoder::UTF8, Encoder::UTF16, wchar_t>(utf8str2.begin(), utf8str2.end());
print("std::vector<uint8_t> utf8str to string", string(begin(utf8str),
end(utf8str)));
// utf16str1 is reinterpreted as char16_t units, hence the length is divided
// by sizeof(char16_t).
print("minicodevct vec UTF8->UTF16",
u16string_view(reinterpret_cast<const char16_t*>(utf16str1.c_str()),
utf16str1.length() / sizeof(char16_t)));
print("minicodevct UTF16->UTF8", utf8str2);
print("minicodevct UTF8->UTF16", String(ucast(utf16str1_test.c_str())));
print("fast_utf8_decoder::ToUTF16", fast_utf8_decoder::ToUTF16(
string(begin(utf8str), end(utf8str))));
#if USE_rapidjson
print("rapid_utf8_decoder::ToUTF16", rapid_utf8_decoder::ToUTF16(
string(begin(utf8str), end(utf8str))));
#endif
#if USE_MCF
// MCF conversion through a UTF-32 intermediate, copied into a u16string for
// printing.
print("MCF UnifyAppend + DeunifyAppend", [&]{
using namespace MCF;
Utf8String u8s(&utf8str[0], &utf8str[utf8str.size()]);
Utf32String temp;
Utf16String u16s;
Utf8String::UnifyAppend(temp, u8s);
Utf16String::DeunifyAppend(u16s, temp);
return u16string(u16s.GetBegin(), u16s.GetEnd());
}());
#endif
print("YSLib::String", MakeUCS2LE(string(begin(utf8str), end(utf8str))));
#if Test_UseBig5
// GBK->UCS4
auto utf32str = Decode<Decoder::GBK>(Test_Big5Str);
// UCS4->UTF16LE
auto utf16le = Encode<Encoder::UTF16LE, wchar_t>(utf32str.begin(), utf32str.end());
// UCS4->BIG5
auto big5 = Encode<Encoder::BIG5>(utf32str.begin(), utf32str.end());
// BIG5->UCS4
auto utf32str2 = Decode<Decoder::BIG5>(big5.c_str());
// print(MakeMBCS(utf32str));
print("minicodecv GBK->UCS4", String(utf32str));
print("minicodecv UCS4->UTF16LE", String(Test_CastCPtr(char16_t,
utf16le.c_str())));
print("minicodecv UCS4->BIG5", MBCSToMBCS(big5, 950, CP_UTF8));
print("minicodecv BIG5->UCS4", String(utf32str2));
print("minicodecv UCS4->UTF16", String(Test_CastCPtr(char16_t, Encode<Encoder::UTF16,
wchar_t>(utf32str2.begin(), utf32str2.end()).c_str())));
#endif
#endif
}
} // unnamed namespace;
// Entry point: builds the UTF-8 test string selected by the Test_* macros,
// then runs the performance test followed by the content output test.
int main()
{
using namespace YSLib;
// Base test data: 27 bytes forming nine 3-byte UTF-8 sequences.
#define Test_Seq0 0xE4, 0xBD, 0xA0, 0xE6, 0x98, 0xAF, 0xE6, 0x88, 0x91, 0xE7, \
0x9A, 0x84, 0xE5, 0xB0, 0x8F, 0xE5, 0x91, 0x80, 0xE5, 0xB0, 0x8F, 0xE8, \
0x8B, 0xB9, 0xE6, 0x9E, 0x9C
#if Test_UseUnitStr
// Use the base data once.
#define Test_Seq { Test_Seq0 }
#define Test_Big5Str "妳是我的小呀小蘋果"
#else
// Use the base data four times (two copies, doubled again).
#define Test_Big5Str0 "妳是我的小呀小蘋果"
#define Test_Seq1 Test_Seq0, Test_Seq0
#define Test_Seq { Test_Seq1, Test_Seq1 }
#define Test_Big5Str1 Test_Big5Str0 Test_Big5Str0
#define Test_Big5Str Test_Big5Str1 Test_Big5Str1
#endif
#if Test_SMP
// Alternative input made of characters outside the BMP mixed with digits.
# define Test_SMP_Seq u8"𤭢𤭢𤭢𤭢𤭢0𤭢1𤭢2𤭢3𤭢4𤭢𤭢5𤭢"
const string utf8str0 = Test_SMP_Seq;
#else
const vector<unsigned char> utf8str0 = Test_Seq;
#endif
// Both test functions receive the data as a string.
const string utf8str(utf8str0.begin(), utf8str0.end());
TestPerformance(utf8str);
TestOutputContent(utf8str);
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment