Skip to content

Instantly share code, notes, and snippets.

@fetus-hina
Created March 18, 2012 11:43
Show Gist options
  • Save fetus-hina/2070792 to your computer and use it in GitHub Desktop.
Save fetus-hina/2070792 to your computer and use it in GitHub Desktop.
ICU 4.2.1 (RHEL 6) で Unicode Normalization
#include <limits>
#include <stdexcept>
#include <string>
#define U_USING_ICU_NAMESPACE 0 // ヘッダで using namespace するな…
#include <unicode/normlzr.h>
// UTF-8 in / UTF-8 out convenience wrappers around ICU's
// U_ICU_NAMESPACE::Normalizer::normalize().
//
// Note that this namespace shares its name with the ICU class
// U_ICU_NAMESPACE::Normalizer, so the ICU class is always referred to by its
// fully qualified name inside this namespace.
namespace Normalizer {
namespace {
// Forward declaration of the file-local worker so the public wrappers below
// can call it before its definition. UNormalizationMode is used unqualified
// here, i.e. it is not looked up inside U_ICU_NAMESPACE.
std::string normalize(const std::string &utf8, UNormalizationMode mode);
}

// Returns `utf8` normalized with UNORM_NFC.
// Throws std::runtime_error if normalization fails.
std::string nfc(const std::string &utf8) {
    return normalize(utf8, UNORM_NFC);
}

// Returns `utf8` normalized with UNORM_NFD.
// Throws std::runtime_error if normalization fails.
std::string nfd(const std::string &utf8) {
    return normalize(utf8, UNORM_NFD);
}

// Returns `utf8` normalized with UNORM_NFKC.
// Throws std::runtime_error if normalization fails.
std::string nfkc(const std::string &utf8) {
    return normalize(utf8, UNORM_NFKC);
}

// Returns `utf8` normalized with UNORM_NFKD.
// Throws std::runtime_error if normalization fails.
std::string nfkd(const std::string &utf8) {
    return normalize(utf8, UNORM_NFKD);
}

namespace {
// Converts `utf8` to an ICU UnicodeString, normalizes it with `mode`, and
// converts the result back to a UTF-8 std::string.
//
// Throws std::runtime_error when the input is too large to hand to ICU or
// when ICU reports a failure status.
//
// NOTE(review): UnicodeString::fromUTF8 has no error channel here, so
// malformed UTF-8 is presumably substituted rather than reported -- confirm
// against the ICU documentation if strict validation is required.
std::string normalize(const std::string &utf8, UNormalizationMode mode) {
    using U_ICU_NAMESPACE::UnicodeString;
    using U_ICU_NAMESPACE::StringPiece;

    // StringPiece stores its length as int32_t. Reject anything larger
    // explicitly instead of letting size_t silently narrow into a wrong
    // (possibly negative) length.
    const std::string::size_type max_length =
        static_cast<std::string::size_type>(std::numeric_limits<int32_t>::max());
    if (utf8.size() > max_length) {
        throw std::runtime_error("Unicode normalization failed");
    }

    const UnicodeString source = UnicodeString::fromUTF8(
        StringPiece(utf8.data(), static_cast<int32_t>(utf8.size())));

    UnicodeString result;
    UErrorCode status = U_ZERO_ERROR;
    // Third argument is the options bit set; 0 requests no extra options.
    U_ICU_NAMESPACE::Normalizer::normalize(source, mode, 0, result, status);
    if (U_FAILURE(status)) {
        throw std::runtime_error("Unicode normalization failed");
    }

    // toUTF8String appends to the given string; return the local itself so
    // the compiler can elide the copy.
    std::string normalized;
    result.toUTF8String(normalized);
    return normalized;
}
}
}
#include <iostream>
#include <iomanip>
#include <sstream>
// Renders each byte of `str` as exactly two lowercase hexadecimal digits
// followed by a single space, e.g. "\xe3\x81\xb0" -> "e3 81 b0 ".
// A non-empty result therefore ends with a trailing space; empty input
// yields an empty string.
std::string dump(const std::string &str) {
    std::ostringstream out;
    // The numeric base and the fill character persist across insertions...
    out << std::hex << std::setfill('0');
    for (std::string::const_iterator it = str.begin(); it != str.end(); ++it) {
        // ...but the field width is reset after every formatted insertion, so
        // it has to be set again for each byte; otherwise bytes below 0x10
        // would be printed as a single digit.
        // Convert via unsigned char so bytes >= 0x80 do not become negative.
        out << std::setw(2)
            << static_cast<int>(static_cast<unsigned char>(*it))
            << ' ';
    }
    return out.str();
}
namespace {
// Writes one normalization result to stdout: the label immediately followed
// by the text on one line, then a single space and the hex dump of the text
// on the next line.
void print_form(const char *label, const std::string &text) {
    std::cout << label << text << std::endl;
    std::cout << " " << dump(text) << std::endl;
}
}

// Demo driver: runs a fixed sample string through all four normalization
// wrappers and prints each result together with its UTF-8 bytes.
int main() {
    const std::string sample("ばぱア①㍻");

    // The NFC result is produced from the NFD result, i.e. the sample is
    // decomposed first and then composed again.
    const std::string decomposed = Normalizer::nfd(sample);
    const std::string recomposed = Normalizer::nfc(decomposed);
    const std::string compat_composed = Normalizer::nfkc(sample);
    const std::string compat_decomposed = Normalizer::nfkd(sample);

    print_form("NFC: ", recomposed);
    print_form("NFD: ", decomposed);
    print_form("NFKC:", compat_composed);
    print_form("NFKD:", compat_decomposed);
    return 0;
}
NFC: ばぱア①㍻
e3 81 b0 e3 81 b1 ef bd b1 e2 91 a0 e3 8d bb
NFD: ばぱア①㍻
e3 81 af e3 82 99 e3 81 af e3 82 9a ef bd b1 e2 91 a0 e3 8d bb
NFKC:ばぱア1平成
e3 81 b0 e3 81 b1 e3 82 a2 31 e5 b9 b3 e6 88 90
NFKD:ばぱア1平成
e3 81 af e3 82 99 e3 81 af e3 82 9a e3 82 a2 31 e5 b9 b3 e6 88 90
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment