Skip to content

Instantly share code, notes, and snippets.

@hidva
Last active July 30, 2016 05:21
Show Gist options
  • Save hidva/51d6039cbf3db7877c1c642a15de5030 to your computer and use it in GitHub Desktop.
Save hidva/51d6039cbf3db7877c1c642a15de5030 to your computer and use it in GitHub Desktop.
关于 C++ 中对中文标点的处理
#include <locale>
#include <codecvt>
#include <string>
#include <iostream>
#include <gtest/gtest.h>
/// Three-way comparison of two character ranges that ignores "not cared"
/// characters on either side.
///
/// @param lb, le        Half-open range [lb, le) of the left sequence.
/// @param rb, re        Half-open range [rb, re) of the right sequence.
/// @param lnot_cared    Predicate; characters of the left range for which it
///                      returns true are skipped.
/// @param rnot_cared    Same, applied to the right range.
/// @return -1 if the filtered left sequence orders before the right one,
///         1 if it orders after, 0 if both filtered sequences are equal.
///         Characters are compared with std::char_traits<CharT>::eq / lt.
template <typename CharT, typename F>
int FilteredCompare(const CharT *lb, const CharT * const le,
                    const CharT *rb, const CharT * const re,
                    F lnot_cared, F rnot_cared) {
    using Traits = std::char_traits<CharT>;

    const CharT *lhs = lb;
    const CharT *rhs = rb;

    for (;;) {
        // Advance the left cursor to its next significant character.
        for (; lhs < le; ++lhs) {
            if (!lnot_cared(*lhs))
                break;
        }
        if (lhs >= le)
            break;

        // Advance the right cursor to its next significant character.
        for (; rhs < re; ++rhs) {
            if (!rnot_cared(*rhs))
                break;
        }
        if (rhs >= re)
            break;

        if (!Traits::eq(*lhs, *rhs))
            return Traits::lt(*lhs, *rhs) ? -1 : 1;

        ++lhs;
        ++rhs;
    }

    // At this point at least one of the two ranges has been exhausted.
    if (lhs < le) {
        // The right range ran out first: any significant character still
        // present on the left makes the left sequence the greater one.
        while (lhs < le) {
            if (!lnot_cared(*lhs))
                return 1;
            ++lhs;
        }
        return 0;
    }

    // The left range ran out: any significant character remaining on the
    // right makes the left sequence the smaller one.
    while (rhs < re) {
        if (!rnot_cared(*rhs))
            return -1;
        ++rhs;
    }
    return 0;
}
/// Case-insensitive variant of the filtered three-way comparison.
///
/// Characters rejected by the predicates are skipped; the remaining characters
/// are passed through ctype_facet.toupper() before being compared with
/// std::char_traits<CharT>::eq / lt.
///
/// @param lb, le        Half-open range [lb, le) of the left sequence.
/// @param rb, re        Half-open range [rb, re) of the right sequence.
/// @param lnot_cared    Predicate; left characters for which it returns true
///                      are skipped.
/// @param rnot_cared    Same, applied to the right range.
/// @param ctype_facet   Facet whose toupper() performs the case folding. Its
///                      character type (CharT2) may differ from CharT; values
///                      are implicitly converted on the way in and out.
/// @return -1, 0 or 1 as for a conventional three-way comparison.
template <typename CharT,typename CharT2, typename F>
int FilteredCaseCompare(const CharT *lb, const CharT * const le,
                        const CharT *rb, const CharT * const re,
                        F lnot_cared, F rnot_cared,
                        const std::ctype<CharT2> &ctype_facet) {
    using Traits = std::char_traits<CharT>;

    const CharT *lhs = lb;
    const CharT *rhs = rb;

    for (;;) {
        // Advance the left cursor to its next significant character.
        for (; lhs < le; ++lhs) {
            if (!lnot_cared(*lhs))
                break;
        }
        if (lhs >= le)
            break;

        // Advance the right cursor to its next significant character.
        for (; rhs < re; ++rhs) {
            if (!rnot_cared(*rhs))
                break;
        }
        if (rhs >= re)
            break;

        // Fold both sides to upper case before comparing.
        const CharT folded_left = ctype_facet.toupper(*lhs);
        const CharT folded_right = ctype_facet.toupper(*rhs);
        if (!Traits::eq(folded_left, folded_right))
            return Traits::lt(folded_left, folded_right) ? -1 : 1;

        ++lhs;
        ++rhs;
    }

    // At this point at least one of the two ranges has been exhausted.
    if (lhs < le) {
        // The right range ran out first: a significant character left over on
        // the left side means the left sequence is the greater one.
        while (lhs < le) {
            if (!lnot_cared(*lhs))
                return 1;
            ++lhs;
        }
        return 0;
    }

    // The left range ran out: a significant character left over on the right
    // side means the left sequence is the smaller one.
    while (rhs < re) {
        if (!rnot_cared(*rhs))
            return -1;
        ++rhs;
    }
    return 0;
}
/// Convenience overload of FilteredCompare that applies one predicate to both
/// ranges.
///
/// @param lb, le      Half-open range [lb, le) of the left sequence.
/// @param rb, re      Half-open range [rb, re) of the right sequence.
/// @param not_cared   Predicate used to skip characters on both sides.
/// @return Whatever the two-predicate FilteredCompare returns for these inputs.
template <typename CharT, typename F>
inline int FilteredCompare(const CharT *lb, const CharT * const le,
                           const CharT *rb, const CharT * const re,
                           F not_cared) {
    // The same predicate is handed over for the left and the right side.
    const int ordering = FilteredCompare(lb, le, rb, re, not_cared, not_cared);
    return ordering;
}
/// Convenience overload of FilteredCaseCompare that applies one predicate to
/// both ranges.
///
/// @param lb, le        Half-open range [lb, le) of the left sequence.
/// @param rb, re        Half-open range [rb, re) of the right sequence.
/// @param not_cared     Predicate used to skip characters on both sides.
/// @param ctype_facet   Facet forwarded unchanged to the two-predicate overload.
/// @return Whatever the two-predicate FilteredCaseCompare returns.
template <typename CharT, typename CharT2, typename F>
inline int FilteredCaseCompare(const CharT *lb, const CharT * const le,
                               const CharT *rb, const CharT * const re,
                               F not_cared,
                               const std::ctype<CharT2> &ctype_facet) {
    // The same predicate is handed over for the left and the right side.
    const int ordering =
        FilteredCaseCompare(lb, le, rb, re, not_cared, not_cared, ctype_facet);
    return ordering;
}
namespace {
// Locale looked up by the name "en_US.UTF8"; it is the source of the
// wide-character classification facet below.
// NOTE(review): the by-name std::locale constructor reports an unknown locale
// by throwing, and this object is built during static initialization, so a
// missing locale would presumably terminate the program before main() -
// confirm the locale is installed on every target platform.
// NOTE(review): "en_US.UTF-8" may be the more portable spelling - verify.
const std::locale g_en_us_utf8_loc("en_US.UTF8");
// std::ctype<wchar_t> facet taken from the locale above. It is declared after
// the locale on purpose: its initializer reads g_en_us_utf8_loc, so the
// declaration order in this translation unit must be kept.
const std::ctype<wchar_t> &g_unicode_ctype = std::use_facet<std::ctype<wchar_t>>(g_en_us_utf8_loc);
} // namespace
/// Reports whether two titles are equal once whitespace, control characters
/// and punctuation are ignored and letter case is folded.
///
/// Classification and case folding are both delegated to g_unicode_ctype.
///
/// @param left_title    First title, as UTF-32 code points.
/// @param right_title   Second title, as UTF-32 code points.
/// @return true when FilteredCaseCompare reports the two titles as equal.
bool IsEqualedTitle(const std::u32string &left_title, const std::u32string &right_title) {
    // A code point is ignorable when the facet places it in any of the
    // space / cntrl / punct / blank classes.
    const auto is_ignorable = [] (char32_t code_point) -> bool {
        const auto ignored_classes = std::ctype_base::space |
                                     std::ctype_base::cntrl |
                                     std::ctype_base::punct |
                                     std::ctype_base::blank;
        return g_unicode_ctype.is(ignored_classes, code_point);
    };

    const char32_t *const left_begin = left_title.data();
    const char32_t *const left_end = left_begin + left_title.size();
    const char32_t *const right_begin = right_title.data();
    const char32_t *const right_end = right_begin + right_title.size();

    const int ordering = FilteredCaseCompare(left_begin, left_end,
                                             right_begin, right_end,
                                             is_ignorable,
                                             g_unicode_ctype);
    return ordering == 0;
}
/// Decodes a UTF-8 encoded byte string into a UTF-32 string.
///
/// @param utf8   Input bytes, expected to be UTF-8.
/// @return The decoded code points, produced by std::wstring_convert with a
///         std::codecvt_utf8<char32_t> facet.
inline std::u32string Utf8ToUtf32(const std::string &utf8) {
    using Utf8Codec = std::codecvt_utf8<char32_t>;
    std::wstring_convert<Utf8Codec, char32_t> converter;
    return converter.from_bytes(utf8);
}
// Exercises IsEqualedTitle with UTF-8 literals that are decoded to UTF-32
// first. The cases cover ASCII and CJK text, ASCII and CJK punctuation,
// letter case, empty input, and titles that must NOT compare equal.
TEST(IsEqualedTitleTest,test) {
    // Decodes both operands and checks the comparison result; the operands are
    // appended to the failure message so a failing case can be identified.
    auto Test = [] (const std::string &left, const std::string &right, bool expected) {
        EXPECT_EQ(expected, IsEqualedTitle(Utf8ToUtf32(left), Utf8ToUtf32(right)))
            << "left=\"" << left << "\" right=\"" << right << "\"";
    };

    // Titles that differ only in case, spacing or punctuation are equal.
    Test("hello", "hello", true);
    Test("您好hello", "您好HeLlO", true);
    Test("您 好 h e l l o", "您 好H eL l O", true);
    Test("您,好 . h ::: e l ' l o", "您 好H. eL, l ,.';' O", true);
    Test("您,。‘好’ . h :。:: e` “”l ' l o", "您 好H. 。 eL, 。 l ,.';' 《》 O", true);

    // Empty input, and input made only of ignorable characters.
    Test("", "", true);
    Test("", " ,.", true);

    // Titles whose significant characters differ must not compare equal;
    // without these cases the `expected == false` path was never exercised.
    Test("hello", "hellp", false);
    Test("hello", "hello world", false);
    Test("hello world", "hello", false);
    Test("您好", "您坏", false);
    Test("", "a", false);
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment