zenden2k/utf16_to_utf32.cpp

## utf16_to_utf32.cpp
/* Outcome of a conversion call. */
typedef enum {
    conversionOK, /* conversion successful */
    sourceExhausted, /* partial character in source, but hit end */
    targetExhausted, /* insuff. room in target for conversion */
    sourceIllegal /* source sequence is illegal/malformed */

} ConversionResult;

/* How illegal input is treated: strict reports it, lenient substitutes UNI_REPLACEMENT_CHAR. */
typedef enum {
    strictConversion = 0,
    lenientConversion

} ConversionFlags;

#define UTF32 uint32_t
#define UTF16 wchar_t
#define UNI_MAX_BMP   (UTF32)0x0000FFFF
#define UNI_SUR_HIGH_START  (UTF32)0xD800
#define UNI_SUR_HIGH_END (UTF32)0xDBFF
#define UNI_SUR_LOW_START (UTF32)0xDC00
#define UNI_SUR_LOW_END (UTF32)0xDFFF
#define UNI_REPLACEMENT_CHAR (UTF32)0x0000FFFD

#define UNI_MAX_UTF16 (UTF32)0x0010FFFF
#define UNI_MAX_UTF32 (UTF32)0x7FFFFFFF
#define UNI_MAX_LEGAL_UTF32 (UTF32)0x0010FFFF
static const int halfShift = 10; /* used for shifting by 10 bits */
static const UTF32 halfBase = 0x0010000UL; /* subtracted before splitting into surrogates */
static const UTF32 halfMask = 0x3FFUL; /* low 10 bits, stored in the low surrogate */

/*
 * Converts the UTF-32 code points in [source, sourceEnd) to UTF-16 code units
 * written to [target, targetEnd).
 *
 * source/sourceEnd  input range of code points.
 * target/targetEnd  output range; a code point above 0xFFFF needs two free units.
 * flags             strictConversion: stop with sourceIllegal at the first
 *                   surrogate value or code point above 0x10FFFF.
 *                   lenientConversion: write UNI_REPLACEMENT_CHAR instead.
 *
 * Returns conversionOK, targetExhausted or sourceIllegal. Conversion stops at
 * the first error, so nothing is written for the offending code point or for
 * anything after it.
 *
 * Both pointers are taken by value: the caller is not told how many units were
 * written and no terminator is appended. Callers that need a terminated string
 * must zero-fill the target and keep the last slot outside targetEnd.
 */
ConversionResult ConvertUTF32toUTF16(const UTF32* source, const UTF32* sourceEnd, UTF16* target, UTF16* targetEnd, ConversionFlags flags) {
    ConversionResult result = conversionOK;
    while (source < sourceEnd) {
        if (target >= targetEnd) {
            result = targetExhausted;
            break;
        }
        UTF32 ch = *source++;
        /* UTF-16 surrogate values are illegal in UTF-32, as is anything above 0x10FFFF. */
        const bool illegal = (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END)
            || ch > UNI_MAX_LEGAL_UTF32;
        if (illegal) {
            if (flags == strictConversion) {
                /* Stop here for both kinds of illegal input so that later
                   iterations can neither write output nor replace the error. */
                result = sourceIllegal;
                break;
            }
            *target++ = static_cast<UTF16>(UNI_REPLACEMENT_CHAR);
        }
        else if (ch <= UNI_MAX_BMP) {
            *target++ = static_cast<UTF16>(ch); /* normal case */
        }
        else {
            /* Code point in range 0x10000 - 0x10FFFF: needs a surrogate pair. */
            if (targetEnd - target < 2) {
                result = targetExhausted;
                break;
            }
            ch -= halfBase;
            *target++ = static_cast<UTF16>((ch >> halfShift) + UNI_SUR_HIGH_START);
            *target++ = static_cast<UTF16>((ch & halfMask) + UNI_SUR_LOW_START);
        }
    }
    return result;
}

const std::wstring replace_html_entities(const std::wstring& content) {
    enum code_mode {
        mode_string,
        mode_decimal,
        mode_hex
    };

    std::wstringstream result;

    std::wstringstream code;
    bool code_parse = false;
    code_mode mode = mode_string;

    for (std::wstring::const_iterator it = content.begin(); it != content.end(); ++it) {
        wchar_t c = *it;
        if (code_parse) {
            if (c == L';') {
                std::wstring entity(code.str());
                if (mode == mode_string) {
                    html_entity_map::const_iterator entity_it = html_entities_.find(entity);
                    if (entity_it == html_entities_.end())
                        result << L"&" << entity << L";";
                    else
                        result << (*entity_it).second;
                }
                else if (mode == mode_decimal) {
                    try {
                        uint16_t character = boost::lexical_cast<uint16_t>(code.str());
                        if (character == 0)
                            throw std::runtime_error("Incorrect HTML entity sequence");

                        result << static_cast<wchar_t>(character);
                    }
                    catch (const std::exception&) {
                        result << L"&#" << entity << L";";
                    }
                }
                else //hex
                {
                    try {
                        std::wstringstream ss;


                        ss.exceptions(std::ios::failbit | std::ios::badbit);
                        uint32_t character;
                        ss << std::hex << entity.substr(1); //skip first 'x' or 'X'
                        ss >> character;
                        if (character == 0)
                            throw std::runtime_error("Incorrect HTML entity sequence");
                        UTF16 buf[3] = L"";
                        // replace_html_entities("Foo &#xA9; bar &#x1D306; baz &#x2603; qux") now works
                        if (ConvertUTF32toUTF16(&character, &character + 1, buf, buf + 4, strictConversion) == conversionOK) {
                            result << buf;
                        }
                        else {
                            throw std::runtime_error("Incorrect HTML entity sequence");
                        }


                    }
                    catch (const std::exception&) {
                        result << L"&#" << entity << L";";
                    }
                }

                code.str(L"");
                code_parse = false;
            }
            else if (c == L'#' && code.rdbuf()->in_avail() == 0) {
                mode = mode_decimal;
            }
            else if ((c == L'x' || c == L'X') && mode == mode_decimal && code.rdbuf()->in_avail() == 0) {
                code << c;
                mode = mode_hex;
            }
            else if ((mode == mode_string && isalnum(c))
                || (mode == mode_decimal && isdigit(c))
                || (mode == mode_hex && isxdigit(c))) {
                code << c;
            }
            else {
                result << L'&';
                if (mode != mode_string)
                    result << L'#';

                result << code.str() << c;
                code.str(L"");
                code_parse = false;
            }
        }
        else {
            if (c == L'&') {
                code_parse = true;
                mode = mode_string;
            }
            else {
                result << c;
            }
        }
    }

    if (code_parse) {
        result << L"&";
        if (mode != mode_string)
            result << L'#';

        result << code.str();
    }

    return result.str();
}
	/* Outcome of a conversion call. */
	typedef enum {
	    conversionOK, /* conversion successful */
	    sourceExhausted, /* partial character in source, but hit end */
	    targetExhausted, /* insuff. room in target for conversion */
	    sourceIllegal /* source sequence is illegal/malformed */

	} ConversionResult;

	/* How illegal input is treated: strict reports it, lenient substitutes UNI_REPLACEMENT_CHAR. */
	typedef enum {
	    strictConversion = 0,
	    lenientConversion

	} ConversionFlags;

	#define UTF32 uint32_t
	#define UTF16 wchar_t
	#define UNI_MAX_BMP (UTF32)0x0000FFFF
	#define UNI_SUR_HIGH_START (UTF32)0xD800
	#define UNI_SUR_HIGH_END (UTF32)0xDBFF
	#define UNI_SUR_LOW_START (UTF32)0xDC00
	#define UNI_SUR_LOW_END (UTF32)0xDFFF
	#define UNI_REPLACEMENT_CHAR (UTF32)0x0000FFFD

	#define UNI_MAX_UTF16 (UTF32)0x0010FFFF
	#define UNI_MAX_UTF32 (UTF32)0x7FFFFFFF
	#define UNI_MAX_LEGAL_UTF32 (UTF32)0x0010FFFF
	static const int halfShift = 10; /* used for shifting by 10 bits */
	static const UTF32 halfBase = 0x0010000UL; /* subtracted before splitting into surrogates */
	static const UTF32 halfMask = 0x3FFUL; /* low 10 bits, stored in the low surrogate */

	/*
	 * Converts the UTF-32 code points in [source, sourceEnd) to UTF-16 code units
	 * written to [target, targetEnd).
	 *
	 * source/sourceEnd  input range of code points.
	 * target/targetEnd  output range; a code point above 0xFFFF needs two free units.
	 * flags             strictConversion: stop with sourceIllegal at the first
	 *                   surrogate value or code point above 0x10FFFF.
	 *                   lenientConversion: write UNI_REPLACEMENT_CHAR instead.
	 *
	 * Returns conversionOK, targetExhausted or sourceIllegal. Conversion stops at
	 * the first error, so nothing is written for the offending code point or for
	 * anything after it.
	 *
	 * Both pointers are taken by value: the caller is not told how many units were
	 * written and no terminator is appended. Callers that need a terminated string
	 * must zero-fill the target and keep the last slot outside targetEnd.
	 */
	ConversionResult ConvertUTF32toUTF16(const UTF32* source, const UTF32* sourceEnd, UTF16* target, UTF16* targetEnd, ConversionFlags flags) {
	    ConversionResult result = conversionOK;
	    while (source < sourceEnd) {
	        if (target >= targetEnd) {
	            result = targetExhausted;
	            break;
	        }
	        UTF32 ch = *source++;
	        /* UTF-16 surrogate values are illegal in UTF-32, as is anything above 0x10FFFF. */
	        const bool illegal = (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END)
	            || ch > UNI_MAX_LEGAL_UTF32;
	        if (illegal) {
	            if (flags == strictConversion) {
	                /* Stop here for both kinds of illegal input so that later
	                   iterations can neither write output nor replace the error. */
	                result = sourceIllegal;
	                break;
	            }
	            *target++ = static_cast<UTF16>(UNI_REPLACEMENT_CHAR);
	        }
	        else if (ch <= UNI_MAX_BMP) {
	            *target++ = static_cast<UTF16>(ch); /* normal case */
	        }
	        else {
	            /* Code point in range 0x10000 - 0x10FFFF: needs a surrogate pair. */
	            if (targetEnd - target < 2) {
	                result = targetExhausted;
	                break;
	            }
	            ch -= halfBase;
	            *target++ = static_cast<UTF16>((ch >> halfShift) + UNI_SUR_HIGH_START);
	            *target++ = static_cast<UTF16>((ch & halfMask) + UNI_SUR_LOW_START);
	        }
	    }
	    return result;
	}

	const std::wstring replace_html_entities(const std::wstring& content) {
	enum code_mode {
	mode_string,
	mode_decimal,
	mode_hex
	};

	std::wstringstream result;

	std::wstringstream code;
	bool code_parse = false;
	code_mode mode = mode_string;

	for (std::wstring::const_iterator it = content.begin(); it != content.end(); ++it) {
	wchar_t c = *it;
	if (code_parse) {
	if (c == L';') {
	std::wstring entity(code.str());
	if (mode == mode_string) {
	html_entity_map::const_iterator entity_it = html_entities_.find(entity);
	if (entity_it == html_entities_.end())
	result << L"&" << entity << L";";
	else
	result << (*entity_it).second;
	}
	else if (mode == mode_decimal) {
	try {
	uint16_t character = boost::lexical_cast<uint16_t>(code.str());
	if (character == 0)
	throw std::runtime_error("Incorrect HTML entity sequence");

	result << static_cast<wchar_t>(character);
	}
	catch (const std::exception&) {
	result << L"&#" << entity << L";";
	}
	}
	else //hex
	{
	try {
	std::wstringstream ss;


	ss.exceptions(std::ios::failbit \| std::ios::badbit);
	uint32_t character;
	ss << std::hex << entity.substr(1); //skip first 'x' or 'X'
	ss >> character;
	if (character == 0)
	throw std::runtime_error("Incorrect HTML entity sequence");
	UTF16 buf[3] = L"";
	// replace_html_entities("Foo © bar 𝌆 baz ☃ qux") now works
	if (ConvertUTF32toUTF16(&character, &character + 1, buf, buf + 4, strictConversion) == conversionOK) {
	result << buf;
	}
	else {
	throw std::runtime_error("Incorrect HTML entity sequence");
	}


	}
	catch (const std::exception&) {
	result << L"&#" << entity << L";";
	}
	}

	code.str(L"");
	code_parse = false;
	}
	else if (c == L'#' && code.rdbuf()->in_avail() == 0) {
	mode = mode_decimal;
	}
	else if ((c == L'x' \|\| c == L'X') && mode == mode_decimal && code.rdbuf()->in_avail() == 0) {
	code << c;
	mode = mode_hex;
	}
	else if ((mode == mode_string && isalnum(c))
	\|\| (mode == mode_decimal && isdigit(c))
	\|\| (mode == mode_hex && isxdigit(c))) {
	code << c;
	}
	else {
	result << L'&';
	if (mode != mode_string)
	result << L'#';

	result << code.str() << c;
	code.str(L"");
	code_parse = false;
	}
	}
	else {
	if (c == L'&') {
	code_parse = true;
	mode = mode_string;
	}
	else {
	result << c;
	}
	}
	}

	if (code_parse) {
	result << L"&";
	if (mode != mode_string)
	result << L'#';

	result << code.str();
	}

	return result.str();
	}