Skip to content

Instantly share code, notes, and snippets.

@gruzovator
Created March 17, 2017 06:18
Show Gist options
  • Save gruzovator/1f0dc242d69f44e277f14347339491d7 to your computer and use it in GitHub Desktop.
Save gruzovator/1f0dc242d69f44e277f14347339491d7 to your computer and use it in GitHub Desktop.
icu translit sketch
#include <unicode/translit.h>
class TranslitConverter
{
static const UnicodeString TRANSLITERATION_RULES;
public:
TranslitConverter()
{
UErrorCode status = U_ZERO_ERROR;
UParseError parserError;
m_transliterator.reset(Transliterator::createFromRules ("TranslitConverter",
TRANSLITERATION_RULES,
UTRANS_FORWARD, parserError, status));
if(!m_transliterator || U_FAILURE (status))
{
throw std::runtime_error("Failed to create translit convereter");
}
}
~TranslitConverter() {}
std::string translit(const std::string &utf8string)
{
UnicodeString data = icu::UnicodeString::fromUTF8(utf8string);
m_transliterator->transliterate(data);
std::string result;
data.toUTF8String(result);
return result;
}
private:
boost::scoped_ptr<icu::Transliterator> m_transliterator;
};
const UnicodeString TranslitConverter::TRANSLITERATION_RULES(
"::NFKD;"
// fix for old icu lin
"х > kh;"
"Х > Kh;"
"\\/ > \\-;"
//
"::Russian-Latin/BGN;"
"::[:Nonspacing Mark:] Remove;"
"::NFC;"
"::lower;"
"::[^a-z0-9[:separator:]-] Remove;"
"[[:separator:]-]+ > \\-;"
);
std::string translit(const std::string &s)
{
static boost::thread_specific_ptr<TranslitConverter> converterPtr;
if(!converterPtr.get())
{
converterPtr.reset(new TranslitConverter);
}
return converterPtr->translit(s);
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment