Last active
July 30, 2016 05:21
-
-
Save hidva/51d6039cbf3db7877c1c642a15de5030 to your computer and use it in GitHub Desktop.
关于 C++ 中对中文标点的处理
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <locale> | |
#include <codecvt> | |
#include <string> | |
#include <iostream> | |
#include <gtest/gtest.h> | |
/**
 * Three-way comparison of the ranges [lb, le) and [rb, re) that ignores
 * filtered-out characters.
 *
 * Every character of the left range for which lnot_cared returns true, and
 * every character of the right range for which rnot_cared returns true, is
 * skipped. The characters that remain are compared pairwise using
 * std::char_traits<CharT>::eq and std::char_traits<CharT>::lt.
 *
 * The two predicates may have different types. F2 defaults to F, so callers
 * that pass the same predicate type twice (or that spell out <CharT, F>
 * explicitly) keep working unchanged.
 *
 * @param lb, le      begin / end of the left range
 * @param rb, re      begin / end of the right range
 * @param lnot_cared  callable taking a CharT; true means "ignore this
 *                    character" in the left range
 * @param rnot_cared  same, applied to the right range
 * @return -1 if the filtered left range orders before the filtered right
 *         range (or is a proper prefix of it), 1 in the opposite case, and 0
 *         when both filtered ranges are equal.
 */
template <typename CharT, typename F, typename F2 = F>
int FilteredCompare(const CharT *lb, const CharT * const le,
                    const CharT *rb, const CharT * const re,
                    F lnot_cared, F2 rnot_cared) {
    typedef std::char_traits<CharT> Traits;

    while (true) {
        // Advance both cursors to their next significant character.
        while (lb < le && lnot_cared(*lb))
            ++lb;
        while (rb < re && rnot_cared(*rb))
            ++rb;

        if (lb >= le || rb >= re)
            break;

        if (!Traits::eq(*lb, *rb))
            return Traits::lt(*lb, *rb) ? -1 : 1;

        ++lb;
        ++rb;
    }

    // At this point at least one side is exhausted. Because both cursors were
    // already advanced past ignorable characters, a cursor that is still
    // inside its range points at a significant character the other side lacks.
    if (lb < le)
        return 1;
    if (rb < re)
        return -1;
    return 0;
}
/**
 * Case-insensitive three-way comparison of [lb, le) and [rb, re) that ignores
 * filtered-out characters.
 *
 * Characters for which lnot_cared (left range) or rnot_cared (right range)
 * returns true are skipped. Each remaining character is upper-cased through
 * ctype_facet and the results are compared with std::char_traits<CharT>::eq
 * and std::char_traits<CharT>::lt.
 *
 * CharT and CharT2 may differ. A character whose value does not survive the
 * conversion CharT -> CharT2 -> CharT is compared as-is instead of being
 * case-mapped: feeding a truncated value to the facet would make unrelated
 * characters (e.g. U+0141 and 'A' with a ctype<char> facet) compare equal.
 *
 * The two predicates may have different types. F2 defaults to F, so existing
 * callers are unaffected.
 *
 * @param lb, le       begin / end of the left range
 * @param rb, re       begin / end of the right range
 * @param lnot_cared   callable taking a CharT; true means "ignore this
 *                     character" in the left range
 * @param rnot_cared   same, applied to the right range
 * @param ctype_facet  facet whose toupper() performs the case mapping
 * @return -1 if the filtered, upper-cased left range orders before the right
 *         one (or is a proper prefix of it), 1 in the opposite case, and 0
 *         when they are equal.
 */
template <typename CharT, typename CharT2, typename F, typename F2 = F>
int FilteredCaseCompare(const CharT *lb, const CharT * const le,
                        const CharT *rb, const CharT * const re,
                        F lnot_cared, F2 rnot_cared,
                        const std::ctype<CharT2> &ctype_facet) {
    typedef std::char_traits<CharT> Traits;

    // Upper-cases ch through the facet, but only when ch is representable in
    // the facet's character type.
    const auto to_upper = [&ctype_facet](const CharT ch) -> CharT {
        const CharT2 converted = static_cast<CharT2>(ch);
        if (!Traits::eq(static_cast<CharT>(converted), ch))
            return ch;
        return static_cast<CharT>(ctype_facet.toupper(converted));
    };

    while (true) {
        // Advance both cursors to their next significant character.
        while (lb < le && lnot_cared(*lb))
            ++lb;
        while (rb < re && rnot_cared(*rb))
            ++rb;

        if (lb >= le || rb >= re)
            break;

        const CharT lc = to_upper(*lb);
        const CharT rc = to_upper(*rb);
        if (!Traits::eq(lc, rc))
            return Traits::lt(lc, rc) ? -1 : 1;

        ++lb;
        ++rb;
    }

    // At this point at least one side is exhausted. Because both cursors were
    // already advanced past ignorable characters, a cursor that is still
    // inside its range points at a significant character the other side lacks.
    if (lb < le)
        return 1;
    if (rb < re)
        return -1;
    return 0;
}
/**
 * Single-predicate convenience overload of FilteredCompare.
 *
 * Forwards to the two-predicate overload, passing not_cared as the filter for
 * both the left range [lb, le) and the right range [rb, re), and returns that
 * overload's result unchanged.
 */
template <typename CharT, typename F>
inline int FilteredCompare(const CharT *lb, const CharT * const le,
                           const CharT *rb, const CharT * const re,
                           F not_cared) {
    const int ordering = FilteredCompare(lb, le, rb, re, not_cared, not_cared);
    return ordering;
}
/**
 * Single-predicate convenience overload of FilteredCaseCompare.
 *
 * Forwards to the two-predicate overload, passing not_cared as the filter for
 * both the left range [lb, le) and the right range [rb, re) together with
 * ctype_facet, and returns that overload's result unchanged.
 */
template <typename CharT, typename CharT2, typename F>
inline int FilteredCaseCompare(const CharT *lb, const CharT * const le,
                               const CharT *rb, const CharT * const re,
                               F not_cared,
                               const std::ctype<CharT2> &ctype_facet) {
    const int ordering =
        FilteredCaseCompare(lb, le, rb, re, not_cared, not_cared, ctype_facet);
    return ordering;
}
namespace {

// Locale named "en_US.UTF8", loaded once during static initialization.
// NOTE(review): std::locale's by-name constructor throws std::runtime_error
// when the host does not provide a locale with this name, which at namespace
// scope happens before main() runs.
const std::locale g_en_us_utf8_loc{"en_US.UTF8"};

// wchar_t classification / case-mapping facet of the locale above. It must be
// declared after g_en_us_utf8_loc, because it is initialized from it.
const std::ctype<wchar_t> &g_unicode_ctype =
    std::use_facet<std::ctype<wchar_t> >(g_en_us_utf8_loc);

}  // namespace
bool IsEqualedTitle(const std::u32string &left_title, const std::u32string &right_title) { | |
auto IsntCared = [] (char32_t ch) -> bool { | |
return g_unicode_ctype.is( | |
std::ctype_base::space | | |
std::ctype_base::cntrl | | |
std::ctype_base::punct | | |
std::ctype_base::blank, | |
ch); | |
}; | |
return FilteredCaseCompare(left_title.data(), left_title.data() + left_title.size(), | |
right_title.data(), right_title.data() + right_title.size(), | |
IsntCared, | |
g_unicode_ctype) == 0; | |
} | |
/**
 * Decodes a UTF-8 encoded byte string into a UTF-32 string.
 *
 * The conversion is performed by std::wstring_convert with
 * std::codecvt_utf8<char32_t>; any exception raised by from_bytes() propagates
 * to the caller.
 *
 * @param utf8  UTF-8 encoded input
 * @return the decoded code points
 */
inline std::u32string Utf8ToUtf32(const std::string &utf8) {
    typedef std::wstring_convert<std::codecvt_utf8<char32_t>, char32_t> Utf32Converter;

    Utf32Converter converter;
    return converter.from_bytes(utf8);
}
// Checks IsEqualedTitle against pairs of UTF-8 titles that differ only in
// letter case, spacing and ASCII / Chinese punctuation.
TEST(IsEqualedTitleTest,test) {
    struct TitlePair {
        const char *left;
        const char *right;
        bool expected;
    };

    const TitlePair title_pairs[] = {
        {"hello", "hello", true},
        {"您好hello", "您好HeLlO", true},
        {"您 好 h e l l o", "您 好H eL l O", true},
        {"您,好 . h ::: e l ' l o", "您 好H. eL, l ,.';' O", true},
        {"您,。‘好’ . h :。:: e` “”l ' l o", "您 好H. 。 eL, 。 l ,.';' 《》 O", true},
    };

    for (const TitlePair &pair : title_pairs) {
        const std::u32string left_title = Utf8ToUtf32(pair.left);
        const std::u32string right_title = Utf8ToUtf32(pair.right);
        EXPECT_EQ(pair.expected, IsEqualedTitle(left_title, right_title));
    }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment