Skip to content

Instantly share code, notes, and snippets.

@zenden2k
Last active August 29, 2015 14:27
Show Gist options
  • Save zenden2k/e37012806558eb96a9ee to your computer and use it in GitHub Desktop.
Save zenden2k/e37012806558eb96a9ee to your computer and use it in GitHub Desktop.
/* Status code returned by the UTF conversion routine below. */
enum ConversionResult {
    conversionOK = 0,    /* the conversion completed successfully */
    sourceExhausted = 1, /* the source ended in the middle of a character */
    targetExhausted = 2, /* the target has insufficient room for the result */
    sourceIllegal = 3    /* the source sequence is illegal or malformed */
};
/* Selects how the conversion routine reacts to illegal source values. */
enum ConversionFlags {
    strictConversion = 0, /* stop and report sourceIllegal */
    lenientConversion = 1 /* substitute UNI_REPLACEMENT_CHAR and carry on */
};
/* Code-unit types used by the converter. */
#define UTF32 uint32_t /* one UTF-32 code point */
#define UTF16 wchar_t  /* one output unit. NOTE(review): wchar_t is only 16 bits
                          wide on some platforms (e.g. Windows) - confirm for
                          every target this file is built for. */

/* Surrogate ranges: high (lead) units first, low (trail) units second. */
#define UNI_SUR_HIGH_START ((UTF32)0xD800)
#define UNI_SUR_HIGH_END ((UTF32)0xDBFF)
#define UNI_SUR_LOW_START ((UTF32)0xDC00)
#define UNI_SUR_LOW_END ((UTF32)0xDFFF)

/* Value written in place of an illegal code point in lenient mode. */
#define UNI_REPLACEMENT_CHAR ((UTF32)0x0000FFFD)

/* Range limits. */
#define UNI_MAX_BMP ((UTF32)0x0000FFFF)         /* largest value that fits in one 16-bit unit */
#define UNI_MAX_UTF16 ((UTF32)0x0010FFFF)       /* largest value expressible as a surrogate pair */
#define UNI_MAX_UTF32 ((UTF32)0x7FFFFFFF)
#define UNI_MAX_LEGAL_UTF32 ((UTF32)0x0010FFFF) /* largest value the converter accepts */

/* Surrogate-pair arithmetic: a value above UNI_MAX_BMP has halfBase subtracted,
 * then its upper bits (>> halfShift) and lower bits (& halfMask) are added to
 * the start of the high and low surrogate ranges respectively. */
static const UTF32 halfBase = 0x10000UL;
static const int halfShift = 10; /* each surrogate carries 10 payload bits */
static const UTF32 halfMask = 0x3FFUL;
/*
 * Converts the UTF-32 code points in [source, sourceEnd) to UTF-16 code units
 * written to [target, targetEnd).
 *
 * flags: with strictConversion the conversion stops at the first illegal value
 *        (a surrogate code point or anything above UNI_MAX_LEGAL_UTF32); with
 *        any other value such input is replaced by UNI_REPLACEMENT_CHAR.
 *
 * Returns
 *   conversionOK     - the whole source range was converted;
 *   targetExhausted  - the target range is too small for the next character;
 *   sourceIllegal    - strict mode met an illegal value; nothing is written
 *                      for that value or for anything after it.
 *
 * All pointers are passed by value, so the caller is not told how many units
 * were written and no terminator is appended. Pre-fill the target (for example
 * with zeros) when the end of the output has to be located afterwards.
 */
ConversionResult ConvertUTF32toUTF16(const UTF32* source, const UTF32* sourceEnd, UTF16* target, UTF16* targetEnd, ConversionFlags flags) {
    ConversionResult result = conversionOK;
    while (source < sourceEnd) {
        if (target >= targetEnd) {
            result = targetExhausted;
            break;
        }
        UTF32 ch = *source++;
        /* Surrogate values are illegal in UTF-32, as is everything above 0x10FFFF. */
        const bool isSurrogate = ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END;
        if (isSurrogate || ch > UNI_MAX_LEGAL_UTF32) {
            if (flags == strictConversion) {
                /* Stop at the first illegal value in both cases. Carrying on
                 * would keep writing output after the error and would let a
                 * later targetExhausted replace the sourceIllegal status. */
                --source; /* return to the illegal value itself */
                result = sourceIllegal;
                break;
            }
            *target++ = static_cast<UTF16>(UNI_REPLACEMENT_CHAR);
        }
        else if (ch <= UNI_MAX_BMP) {
            *target++ = static_cast<UTF16>(ch); /* normal case: a single unit */
        }
        else {
            /* ch is in the range 0x10000 - 0x10FFFF and needs two units. */
            if (target + 1 >= targetEnd) {
                --source; /* back up the source pointer */
                result = targetExhausted;
                break;
            }
            ch -= halfBase;
            *target++ = static_cast<UTF16>((ch >> halfShift) + UNI_SUR_HIGH_START);
            *target++ = static_cast<UTF16>((ch & halfMask) + UNI_SUR_LOW_START);
        }
    }
    return result;
}
const std::wstring replace_html_entities(const std::wstring& content) {
enum code_mode {
mode_string,
mode_decimal,
mode_hex
};
std::wstringstream result;
std::wstringstream code;
bool code_parse = false;
code_mode mode = mode_string;
for (std::wstring::const_iterator it = content.begin(); it != content.end(); ++it) {
wchar_t c = *it;
if (code_parse) {
if (c == L';') {
std::wstring entity(code.str());
if (mode == mode_string) {
html_entity_map::const_iterator entity_it = html_entities_.find(entity);
if (entity_it == html_entities_.end())
result << L"&" << entity << L";";
else
result << (*entity_it).second;
}
else if (mode == mode_decimal) {
try {
uint16_t character = boost::lexical_cast<uint16_t>(code.str());
if (character == 0)
throw std::runtime_error("Incorrect HTML entity sequence");
result << static_cast<wchar_t>(character);
}
catch (const std::exception&) {
result << L"&#" << entity << L";";
}
}
else //hex
{
try {
std::wstringstream ss;
ss.exceptions(std::ios::failbit | std::ios::badbit);
uint32_t character;
ss << std::hex << entity.substr(1); //skip first 'x' or 'X'
ss >> character;
if (character == 0)
throw std::runtime_error("Incorrect HTML entity sequence");
UTF16 buf[3] = L"";
// replace_html_entities("Foo &#xA9; bar &#x1D306; baz &#x2603; qux") now works
if (ConvertUTF32toUTF16(&character, &character + 1, buf, buf + 4, strictConversion) == conversionOK) {
result << buf;
}
else {
throw std::runtime_error("Incorrect HTML entity sequence");
}
}
catch (const std::exception&) {
result << L"&#" << entity << L";";
}
}
code.str(L"");
code_parse = false;
}
else if (c == L'#' && code.rdbuf()->in_avail() == 0) {
mode = mode_decimal;
}
else if ((c == L'x' || c == L'X') && mode == mode_decimal && code.rdbuf()->in_avail() == 0) {
code << c;
mode = mode_hex;
}
else if ((mode == mode_string && isalnum(c))
|| (mode == mode_decimal && isdigit(c))
|| (mode == mode_hex && isxdigit(c))) {
code << c;
}
else {
result << L'&';
if (mode != mode_string)
result << L'#';
result << code.str() << c;
code.str(L"");
code_parse = false;
}
}
else {
if (c == L'&') {
code_parse = true;
mode = mode_string;
}
else {
result << c;
}
}
}
if (code_parse) {
result << L"&";
if (mode != mode_string)
result << L'#';
result << code.str();
}
return result.str();
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment