Created
March 18, 2012 11:43
-
-
Save fetus-hina/2070792 to your computer and use it in GitHub Desktop.
ICU 4.2.1 (RHEL 6) で Unicode Normalization
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <string> | |
#include <stdexcept> | |
#define U_USING_ICU_NAMESPACE 0 // ヘッダで using namespace するな… | |
#include <unicode/normlzr.h> | |
namespace Normalizer { | |
namespace { | |
std::string normalize(const std::string &utf8, /* U_ICU_NAMESPACE:: */ UNormalizationMode mode); | |
} | |
std::string nfc(const std::string &utf8) { | |
return normalize(utf8, UNORM_NFC); | |
} | |
std::string nfd(const std::string &utf8) { | |
return normalize(utf8, UNORM_NFD); | |
} | |
std::string nfkc(const std::string &utf8) { | |
return normalize(utf8, UNORM_NFKC); | |
} | |
std::string nfkd(const std::string &utf8) { | |
return normalize(utf8, UNORM_NFKD); | |
} | |
namespace { | |
inline std::string normalize(const std::string &utf8, /* U_ICU_NAMESPACE::*/ UNormalizationMode mode) { | |
using U_ICU_NAMESPACE::UnicodeString; | |
using U_ICU_NAMESPACE::StringPiece; | |
//using U_ICU_NAMESPACE::UErrorCode; | |
const UnicodeString source = | |
UnicodeString::fromUTF8( | |
StringPiece(utf8.c_str(), utf8.size())); | |
UnicodeString result; | |
UErrorCode status = U_ZERO_ERROR; | |
U_ICU_NAMESPACE::Normalizer::normalize(source, mode, 0, result, status); | |
if(U_FAILURE(status)) { | |
throw std::runtime_error("Unicode normalization failed"); | |
} | |
std::string tmp; | |
return result.toUTF8String(tmp); | |
} | |
} | |
} | |
#include <iostream> | |
#include <iomanip> | |
#include <sstream> | |
/// Formats the bytes of `str` as lowercase hexadecimal.
///
/// Every byte is written as exactly two hex digits followed by a single
/// space, so "A" becomes "41 " and an empty string yields an empty string.
///
/// @param str Raw bytes to dump (treated as unsigned char values).
/// @return The space-separated hex dump, including a trailing space.
std::string dump(const std::string &str) {
    std::ostringstream out;
    // hex and the fill character are sticky stream state; set them once.
    out << std::hex << std::setfill('0');
    for (std::string::const_iterator it = str.begin(); it != str.end(); ++it) {
        // The field width resets to 0 after each formatted insertion, so
        // setw(2) must be applied for every byte; otherwise only the first
        // byte is zero-padded and values below 0x10 print as one digit.
        // Go through unsigned char first so bytes >= 0x80 are not
        // sign-extended when widened to int.
        out << std::setw(2)
            << static_cast<int>(static_cast<unsigned char>(*it))
            << ' ';
    }
    return out.str();
}
int main() { | |
const std::string form_c_org("ばぱア①㍻"); | |
const std::string form_d(Normalizer::nfd(form_c_org)); | |
const std::string form_c(Normalizer::nfc(form_d)); | |
const std::string form_kc(Normalizer::nfkc(form_c_org)); | |
const std::string form_kd(Normalizer::nfkd(form_c_org)); | |
std::cout | |
<< "NFC: " << form_c << std::endl << " " << dump(form_c) << std::endl | |
<< "NFD: " << form_d << std::endl << " " << dump(form_d) << std::endl | |
<< "NFKC:" << form_kc << std::endl << " " << dump(form_kc) << std::endl | |
<< "NFKD:" << form_kd << std::endl << " " << dump(form_kd) << std::endl; | |
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
NFC: ばぱア①㍻ | |
e3 81 b0 e3 81 b1 ef bd b1 e2 91 a0 e3 8d bb | |
NFD: ばぱア①㍻ | |
e3 81 af e3 82 99 e3 81 af e3 82 9a ef bd b1 e2 91 a0 e3 8d bb | |
NFKC:ばぱア1平成 | |
e3 81 b0 e3 81 b1 e3 82 a2 31 e5 b9 b3 e6 88 90 | |
NFKD:ばぱア1平成 | |
e3 81 af e3 82 99 e3 81 af e3 82 9a e3 82 a2 31 e5 b9 b3 e6 88 90 |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment