Last active
August 29, 2015 14:27
-
-
Save zenden2k/e37012806558eb96a9ee to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/* Outcome reported by the UTF conversion routine. */
typedef enum {
    /* The whole input was converted. */
    conversionOK = 0,
    /* The input ended in the middle of a character. */
    sourceExhausted = 1,
    /* The output range was too small for the converted text. */
    targetExhausted = 2,
    /* The input contained an illegal or malformed sequence. */
    sourceIllegal = 3
} ConversionResult;
/* Selects how the converter reacts to illegal input values. */
typedef enum {
    /* Illegal input makes the conversion fail. */
    strictConversion = 0,
    /* Illegal input is substituted and the conversion carries on. */
    lenientConversion = 1
} ConversionFlags;
/*
 * Code-unit types used by the converter. They are kept as macros so that any
 * existing preprocessor checks on these names keep working.
 * NOTE(review): UTF16 is wchar_t, which is only 16 bits wide on some
 * platforms (e.g. Windows) - confirm this is the intended target.
 */
#define UTF32 uint32_t
#define UTF16 wchar_t

/*
 * Code point boundaries. Every expansion is wrapped in parentheses so the
 * cast cannot be split from its operand by the surrounding expression
 * (for example `sizeof UNI_MAX_BMP`).
 */
#define UNI_MAX_BMP          ((UTF32)0x0000FFFF) /* last code point that fits one UTF-16 unit */
#define UNI_SUR_HIGH_START   ((UTF32)0xD800)     /* first high (leading) surrogate */
#define UNI_SUR_HIGH_END     ((UTF32)0xDBFF)     /* last high (leading) surrogate */
#define UNI_SUR_LOW_START    ((UTF32)0xDC00)     /* first low (trailing) surrogate */
#define UNI_SUR_LOW_END      ((UTF32)0xDFFF)     /* last low (trailing) surrogate */
#define UNI_REPLACEMENT_CHAR ((UTF32)0x0000FFFD) /* written in place of illegal input in lenient mode */
#define UNI_MAX_UTF16        ((UTF32)0x0010FFFF) /* not referenced by the code visible in this file */
#define UNI_MAX_UTF32        ((UTF32)0x7FFFFFFF) /* not referenced by the code visible in this file */
#define UNI_MAX_LEGAL_UTF32  ((UTF32)0x0010FFFF) /* values above this are rejected as illegal */

/* Helpers for splitting a supplementary code point into a surrogate pair. */
static const int halfShift = 10;           /* used for shifting by 10 bits */
static const UTF32 halfBase = 0x0010000UL; /* subtracted before the split */
static const UTF32 halfMask = 0x3FFUL;     /* keeps the low 10 bits */
/*
 * Converts the UTF-32 code points in [source, sourceEnd) into UTF-16 code
 * units stored in [target, targetEnd).
 *
 * flags selects the treatment of illegal input, i.e. a surrogate value
 * (UNI_SUR_HIGH_START..UNI_SUR_LOW_END) or a value above UNI_MAX_LEGAL_UTF32:
 *   strictConversion - conversion stops immediately with sourceIllegal;
 *   anything else    - UNI_REPLACEMENT_CHAR is written instead and the
 *                      conversion continues.
 *
 * Returns conversionOK when all input was converted, targetExhausted when the
 * output range cannot hold the next character, or sourceIllegal as described
 * above.
 *
 * The output is NOT null-terminated and the number of units written is not
 * reported (both pointers are taken by value). A caller that needs a C string
 * must zero its buffer first and keep one element out of the target range.
 */
ConversionResult ConvertUTF32toUTF16(const UTF32* source, const UTF32* sourceEnd, UTF16* target, UTF16* targetEnd, ConversionFlags flags) {
    ConversionResult result = conversionOK;

    while (source < sourceEnd) {
        if (target >= targetEnd) {
            result = targetExhausted;
            break;
        }

        UTF32 ch = *source++;

        if (ch <= UNI_MAX_BMP) {
            /* UTF-16 surrogate values are illegal in UTF-32. */
            if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) {
                if (flags == strictConversion) {
                    result = sourceIllegal;
                    break;
                }
                *target++ = static_cast<UTF16>(UNI_REPLACEMENT_CHAR);
            }
            else {
                *target++ = static_cast<UTF16>(ch); /* normal case: one code unit */
            }
        }
        else if (ch > UNI_MAX_LEGAL_UTF32) {
            if (flags == strictConversion) {
                /*
                 * Stop right here, exactly like the surrogate case above.
                 * Carrying on would append further output after the illegal
                 * value and could let a later targetExhausted overwrite the
                 * sourceIllegal result.
                 */
                result = sourceIllegal;
                break;
            }
            *target++ = static_cast<UTF16>(UNI_REPLACEMENT_CHAR);
        }
        else {
            /* Supplementary code point: needs room for a surrogate pair. */
            if (targetEnd - target < 2) {
                result = targetExhausted;
                break;
            }
            ch -= halfBase;
            *target++ = static_cast<UTF16>((ch >> halfShift) + UNI_SUR_HIGH_START);
            *target++ = static_cast<UTF16>((ch & halfMask) + UNI_SUR_LOW_START);
        }
    }

    return result;
}
const std::wstring replace_html_entities(const std::wstring& content) { | |
enum code_mode { | |
mode_string, | |
mode_decimal, | |
mode_hex | |
}; | |
std::wstringstream result; | |
std::wstringstream code; | |
bool code_parse = false; | |
code_mode mode = mode_string; | |
for (std::wstring::const_iterator it = content.begin(); it != content.end(); ++it) { | |
wchar_t c = *it; | |
if (code_parse) { | |
if (c == L';') { | |
std::wstring entity(code.str()); | |
if (mode == mode_string) { | |
html_entity_map::const_iterator entity_it = html_entities_.find(entity); | |
if (entity_it == html_entities_.end()) | |
result << L"&" << entity << L";"; | |
else | |
result << (*entity_it).second; | |
} | |
else if (mode == mode_decimal) { | |
try { | |
uint16_t character = boost::lexical_cast<uint16_t>(code.str()); | |
if (character == 0) | |
throw std::runtime_error("Incorrect HTML entity sequence"); | |
result << static_cast<wchar_t>(character); | |
} | |
catch (const std::exception&) { | |
result << L"&#" << entity << L";"; | |
} | |
} | |
else //hex | |
{ | |
try { | |
std::wstringstream ss; | |
ss.exceptions(std::ios::failbit | std::ios::badbit); | |
uint32_t character; | |
ss << std::hex << entity.substr(1); //skip first 'x' or 'X' | |
ss >> character; | |
if (character == 0) | |
throw std::runtime_error("Incorrect HTML entity sequence"); | |
UTF16 buf[3] = L""; | |
// replace_html_entities("Foo © bar 𝌆 baz ☃ qux") now works | |
if (ConvertUTF32toUTF16(&character, &character + 1, buf, buf + 4, strictConversion) == conversionOK) { | |
result << buf; | |
} | |
else { | |
throw std::runtime_error("Incorrect HTML entity sequence"); | |
} | |
} | |
catch (const std::exception&) { | |
result << L"&#" << entity << L";"; | |
} | |
} | |
code.str(L""); | |
code_parse = false; | |
} | |
else if (c == L'#' && code.rdbuf()->in_avail() == 0) { | |
mode = mode_decimal; | |
} | |
else if ((c == L'x' || c == L'X') && mode == mode_decimal && code.rdbuf()->in_avail() == 0) { | |
code << c; | |
mode = mode_hex; | |
} | |
else if ((mode == mode_string && isalnum(c)) | |
|| (mode == mode_decimal && isdigit(c)) | |
|| (mode == mode_hex && isxdigit(c))) { | |
code << c; | |
} | |
else { | |
result << L'&'; | |
if (mode != mode_string) | |
result << L'#'; | |
result << code.str() << c; | |
code.str(L""); | |
code_parse = false; | |
} | |
} | |
else { | |
if (c == L'&') { | |
code_parse = true; | |
mode = mode_string; | |
} | |
else { | |
result << c; | |
} | |
} | |
} | |
if (code_parse) { | |
result << L"&"; | |
if (mode != mode_string) | |
result << L'#'; | |
result << code.str(); | |
} | |
return result.str(); | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment