fetus-hina/normalizer.cpp

## normalizer.cpp
#include <string>
#include <stdexcept>

#define U_USING_ICU_NAMESPACE 0 // ヘッダで using namespace するな…
#include <unicode/normlzr.h>

namespace Normalizer {
    namespace {
        /// Converts a UTF-8 string to the requested Unicode normalization form.
        ///
        /// The input is decoded into an ICU UnicodeString, normalized with
        /// ICU's Normalizer::normalize, and re-encoded as UTF-8.
        ///
        /// NOTE(review): ICU documents the Normalizer class as deprecated in
        /// favor of Normalizer2 - consider migrating when ICU is next upgraded.
        ///
        /// @param utf8 Text to normalize, interpreted as UTF-8.
        /// @param mode ICU normalization mode (UNORM_NFC, UNORM_NFD, ...).
        /// @return The normalized text encoded as UTF-8.
        /// @throws std::runtime_error if the input is too long to describe with
        ///         a 32-bit length, or if ICU reports a failure status.
        std::string normalize(const std::string &utf8, /* U_ICU_NAMESPACE:: */ UNormalizationMode mode) {
            using U_ICU_NAMESPACE::UnicodeString;
            using U_ICU_NAMESPACE::StringPiece;

            // ICU's StringPiece takes its length as int32_t. Reject anything
            // larger up front instead of letting the size_t -> int32_t
            // conversion silently produce a bogus (possibly negative) length.
            const std::string::size_type max_length = 0x7fffffff;
            if(utf8.size() > max_length) {
                throw std::runtime_error("Unicode normalization failed: input too long");
            }

            const UnicodeString source =
                UnicodeString::fromUTF8(
                    StringPiece(utf8.c_str(), static_cast<int32_t>(utf8.size())));

            UnicodeString result;
            UErrorCode status = U_ZERO_ERROR;
            U_ICU_NAMESPACE::Normalizer::normalize(source, mode, 0, result, status);
            if(U_FAILURE(status)) {
                throw std::runtime_error("Unicode normalization failed");
            }

            // toUTF8String appends to the string it is given and returns it.
            std::string tmp;
            return result.toUTF8String(tmp);
        }
    }

    /// Returns `utf8` normalized with mode UNORM_NFC (canonical composition).
    std::string nfc(const std::string &utf8) {
        return normalize(utf8, UNORM_NFC);
    }

    /// Returns `utf8` normalized with mode UNORM_NFD (canonical decomposition).
    std::string nfd(const std::string &utf8) {
        return normalize(utf8, UNORM_NFD);
    }

    /// Returns `utf8` normalized with mode UNORM_NFKC (compatibility composition).
    std::string nfkc(const std::string &utf8) {
        return normalize(utf8, UNORM_NFKC);
    }

    /// Returns `utf8` normalized with mode UNORM_NFKD (compatibility decomposition).
    std::string nfkd(const std::string &utf8) {
        return normalize(utf8, UNORM_NFKD);
    }
}

#include <iostream>
#include <iomanip>
#include <sstream>

/// Formats every byte of `str` as two lowercase hex digits followed by a space.
///
/// @param str Bytes to dump.
/// @return The hex dump, e.g. "e3 81 b0 " (each byte, including the last, is
///         followed by a single space); an empty input yields an empty string.
std::string dump(const std::string &str) {
    std::stringstream sstr;
    // Base and fill character are sticky stream state, so set them once.
    sstr << std::hex << std::setfill('0');
    for(std::string::const_iterator it = str.begin(); it != str.end(); ++it) {
        // The field width is reset after every formatted output, so setw(2)
        // has to be re-applied for each byte; otherwise only the first byte
        // would be zero-padded and e.g. 0x02 would print as "2".
        // Go through unsigned char so bytes >= 0x80 are not sign-extended.
        sstr << std::setw(2) << static_cast<int>(static_cast<unsigned char>(*it)) << " ";
    }
    return sstr.str();
}

/// Demo driver: normalizes a sample string into all four forms and prints
/// each result followed by a hex dump of its UTF-8 bytes.
int main() {
    const std::string original("ばぱｱ①㍻");

    // Canonical forms: decompose the sample, then recompose the decomposed text.
    const std::string decomposed = Normalizer::nfd(original);
    const std::string recomposed = Normalizer::nfc(decomposed);

    // Compatibility forms, both computed directly from the sample.
    const std::string compat_composed = Normalizer::nfkc(original);
    const std::string compat_decomposed = Normalizer::nfkd(original);

    std::cout << "NFC: " << recomposed << std::endl;
    std::cout << "    " << dump(recomposed) << std::endl;

    std::cout << "NFD: " << decomposed << std::endl;
    std::cout << "    " << dump(decomposed) << std::endl;

    std::cout << "NFKC:" << compat_composed << std::endl;
    std::cout << "    " << dump(compat_composed) << std::endl;

    std::cout << "NFKD:" << compat_decomposed << std::endl;
    std::cout << "    " << dump(compat_decomposed) << std::endl;

    return 0;
}

## output.txt
NFC: ばぱｱ①㍻
    e3 81 b0 e3 81 b1 ef bd b1 e2 91 a0 e3 8d bb
NFD: ばぱｱ①㍻
    e3 81 af e3 82 99 e3 81 af e3 82 9a ef bd b1 e2 91 a0 e3 8d bb
NFKC:ばぱア1平成
    e3 81 b0 e3 81 b1 e3 82 a2 31 e5 b9 b3 e6 88 90
NFKD:ばぱア1平成
    e3 81 af e3 82 99 e3 81 af e3 82 9a e3 82 a2 31 e5 b9 b3 e6 88 90
	#include <string>
	#include <stdexcept>

	#define U_USING_ICU_NAMESPACE 0 // ヘッダで using namespace するな…
	#include <unicode/normlzr.h>

	namespace Normalizer {
	    namespace {
	        /// Converts a UTF-8 string to the requested Unicode normalization form.
	        ///
	        /// The input is decoded into an ICU UnicodeString, normalized with
	        /// ICU's Normalizer::normalize, and re-encoded as UTF-8.
	        ///
	        /// NOTE(review): ICU documents the Normalizer class as deprecated in
	        /// favor of Normalizer2 - consider migrating when ICU is next upgraded.
	        ///
	        /// @param utf8 Text to normalize, interpreted as UTF-8.
	        /// @param mode ICU normalization mode (UNORM_NFC, UNORM_NFD, ...).
	        /// @return The normalized text encoded as UTF-8.
	        /// @throws std::runtime_error if the input is too long to describe with
	        ///         a 32-bit length, or if ICU reports a failure status.
	        std::string normalize(const std::string &utf8, /* U_ICU_NAMESPACE:: */ UNormalizationMode mode) {
	            using U_ICU_NAMESPACE::UnicodeString;
	            using U_ICU_NAMESPACE::StringPiece;

	            // ICU's StringPiece takes its length as int32_t. Reject anything
	            // larger up front instead of letting the size_t -> int32_t
	            // conversion silently produce a bogus (possibly negative) length.
	            const std::string::size_type max_length = 0x7fffffff;
	            if(utf8.size() > max_length) {
	                throw std::runtime_error("Unicode normalization failed: input too long");
	            }

	            const UnicodeString source =
	                UnicodeString::fromUTF8(
	                    StringPiece(utf8.c_str(), static_cast<int32_t>(utf8.size())));

	            UnicodeString result;
	            UErrorCode status = U_ZERO_ERROR;
	            U_ICU_NAMESPACE::Normalizer::normalize(source, mode, 0, result, status);
	            if(U_FAILURE(status)) {
	                throw std::runtime_error("Unicode normalization failed");
	            }

	            // toUTF8String appends to the string it is given and returns it.
	            std::string tmp;
	            return result.toUTF8String(tmp);
	        }
	    }

	    /// Returns `utf8` normalized with mode UNORM_NFC (canonical composition).
	    std::string nfc(const std::string &utf8) {
	        return normalize(utf8, UNORM_NFC);
	    }

	    /// Returns `utf8` normalized with mode UNORM_NFD (canonical decomposition).
	    std::string nfd(const std::string &utf8) {
	        return normalize(utf8, UNORM_NFD);
	    }

	    /// Returns `utf8` normalized with mode UNORM_NFKC (compatibility composition).
	    std::string nfkc(const std::string &utf8) {
	        return normalize(utf8, UNORM_NFKC);
	    }

	    /// Returns `utf8` normalized with mode UNORM_NFKD (compatibility decomposition).
	    std::string nfkd(const std::string &utf8) {
	        return normalize(utf8, UNORM_NFKD);
	    }
	}

	#include <iostream>
	#include <iomanip>
	#include <sstream>

	/// Formats every byte of `str` as two lowercase hex digits followed by a space.
	///
	/// @param str Bytes to dump.
	/// @return The hex dump, e.g. "e3 81 b0 " (each byte, including the last, is
	///         followed by a single space); an empty input yields an empty string.
	std::string dump(const std::string &str) {
	    std::stringstream sstr;
	    // Base and fill character are sticky stream state, so set them once.
	    sstr << std::hex << std::setfill('0');
	    for(std::string::const_iterator it = str.begin(); it != str.end(); ++it) {
	        // The field width is reset after every formatted output, so setw(2)
	        // has to be re-applied for each byte; otherwise only the first byte
	        // would be zero-padded and e.g. 0x02 would print as "2".
	        // Go through unsigned char so bytes >= 0x80 are not sign-extended.
	        sstr << std::setw(2) << static_cast<int>(static_cast<unsigned char>(*it)) << " ";
	    }
	    return sstr.str();
	}

	/// Demo driver: normalizes a sample string into all four forms and prints
	/// each result followed by a hex dump of its UTF-8 bytes.
	int main() {
	    const std::string original("ばぱｱ①㍻");

	    // Canonical forms: decompose the sample, then recompose the decomposed text.
	    const std::string decomposed = Normalizer::nfd(original);
	    const std::string recomposed = Normalizer::nfc(decomposed);

	    // Compatibility forms, both computed directly from the sample.
	    const std::string compat_composed = Normalizer::nfkc(original);
	    const std::string compat_decomposed = Normalizer::nfkd(original);

	    std::cout << "NFC: " << recomposed << std::endl;
	    std::cout << " " << dump(recomposed) << std::endl;

	    std::cout << "NFD: " << decomposed << std::endl;
	    std::cout << " " << dump(decomposed) << std::endl;

	    std::cout << "NFKC:" << compat_composed << std::endl;
	    std::cout << " " << dump(compat_composed) << std::endl;

	    std::cout << "NFKD:" << compat_decomposed << std::endl;
	    std::cout << " " << dump(compat_decomposed) << std::endl;

	    return 0;
	}
	NFC: ばぱｱ①㍻
	e3 81 b0 e3 81 b1 ef bd b1 e2 91 a0 e3 8d bb
	NFD: ばぱｱ①㍻
	e3 81 af e3 82 99 e3 81 af e3 82 9a ef bd b1 e2 91 a0 e3 8d bb
	NFKC:ばぱア1平成
	e3 81 b0 e3 81 b1 e3 82 a2 31 e5 b9 b3 e6 88 90
	NFKD:ばぱア1平成
	e3 81 af e3 82 99 e3 81 af e3 82 9a e3 82 a2 31 e5 b9 b3 e6 88 90