fbraem/UTF8Escape.cpp

## UTF8Escape.cpp
#include "Poco/UTF8Escape.h"
#include "Poco/UTF8Encoding.h"
#include "Poco/NumberFormatter.h"
#include "Poco/Ascii.h"

namespace Poco {

// Escapes the whole string s by forwarding its full character range to the
// iterator-based overload, and returns that overload's result.
std::string UTF8Escape::escape(const std::string &s)
{
	// The iterator overload takes its arguments by non-const lvalue
	// reference, which the temporaries returned by begin()/end() cannot bind
	// to in standard C++; named locals can.
	std::string::const_iterator first = s.begin();
	std::string::const_iterator last = s.end();
	return escape(first, last);
}

// Converts the UTF-8 bytes in [begin, end) into an ASCII-only escaped string.
//
// Each decoded code point is written as follows:
//   - newline, tab, CR, backspace, form feed, vertical tab, bell and
//     backslash become the two-character sequences \n \t \r \b \f \v \a \\;
//   - other values below 32, 0x7F, and 0x80..0xFFFF become \u followed by
//     the value formatted by NumberFormatter::appendHex with width 4;
//   - values above 0xFFFF become a UTF-16 surrogate pair, i.e. two such
//     \u sequences;
//   - everything else (printable ASCII) is copied unchanged.
//
// begin/end are only read; neither iterator is modified.
// Returns the escaped string.
std::string UTF8Escape::escape(std::string::const_iterator& begin, std::string::const_iterator& end)
{
	// Entry N-1 is the sum of the lead-byte and continuation-byte marker bits
	// that accumulate when an N-byte sequence is folded six bits at a time;
	// subtracting it leaves the bare code point.
	static const Poco::UInt32 offsetsFromUTF8[6] = {
		0x00000000UL, 0x00003080UL, 0x000E2080UL,
		0x03C82080UL, 0xFA082080UL, 0x82082080UL
	};

	std::string result;
	std::string::const_iterator it = begin;

	while (it != end)
	{
		// The lead byte announces the sequence length. Bounding the decode
		// loop by it keeps a stray continuation byte that follows a complete
		// character from being folded into that character's code point.
		const unsigned char lead = static_cast<unsigned char>(*it);
		unsigned int expected = 1;
		if (lead >= 0xFC) expected = 6;
		else if (lead >= 0xF8) expected = 5;
		else if (lead >= 0xF0) expected = 4;
		else if (lead >= 0xE0) expected = 3;
		else if (lead >= 0xC0) expected = 2;

		Poco::UInt32 ch = 0;
		unsigned int sz = 0;
		do
		{
			ch <<= 6;
			ch += static_cast<unsigned char>(*it++);
			++sz;
		}
		while (it != end && (*it & 0xC0) == 0x80 && sz < expected);
		// sz is the number of bytes actually consumed (1..6); a truncated
		// sequence is therefore adjusted by the offset for its real length.
		ch -= offsetsFromUTF8[sz - 1];

		switch (ch)
		{
		case '\n': result += "\\n"; break;
		case '\t': result += "\\t"; break;
		case '\r': result += "\\r"; break;
		case '\b': result += "\\b"; break;
		case '\f': result += "\\f"; break;
		case '\v': result += "\\v"; break;
		case '\a': result += "\\a"; break;
		case '\\': result += "\\\\"; break;
		default:
			if (ch > 0xFFFF)
			{
				// Outside the BMP: emit high and low surrogate.
				// NOTE(review): values above 0x10FFFF do not fit a surrogate
				// pair; the masks below keep only the low 20 bits.
				ch -= 0x10000;
				result += "\\u";
				NumberFormatter::appendHex(result, static_cast<unsigned short>((ch >> 10) & 0x03ff) + 0xd800, 4);
				result += "\\u";
				NumberFormatter::appendHex(result, static_cast<unsigned short>(ch & 0x03ff) + 0xdc00, 4);
			}
			else if (ch < 32 || ch >= 0x7f)
			{
				// Remaining control characters, DEL and non-ASCII BMP values.
				result += "\\u";
				NumberFormatter::appendHex(result, static_cast<unsigned short>(ch), 4);
			}
			else
			{
				result += static_cast<char>(ch);
			}
			break;
		}
	}
	return result;
}

// Unescapes the whole string s by forwarding its full character range to the
// iterator-based overload, and returns that overload's result.
std::string UTF8Escape::unescape(const std::string &s)
{
	// The iterator overload takes its arguments by non-const lvalue
	// reference, which the temporaries returned by begin()/end() cannot bind
	// to in standard C++; named locals can.
	std::string::const_iterator first = s.begin();
	std::string::const_iterator last = s.end();
	return unescape(first, last);
}

std::string UTF8Escape::unescape(std::string::const_iterator& begin, std::string::const_iterator& end)
{
	std::string result;

	std::string::const_iterator it = begin;

	while (it != end)
	{
		Poco::UInt32 ch = (Poco::UInt32) *it++;

		if (ch == '\\')
		{
			if ( it == end )
			{
				//Invalid sequence!
			}

			if (*it == 'n')
			{
				ch = '\n';
				it++;
			}
			else if (*it == 't')
			{
				ch = '\t';
				it++;
			}
			else if (*it == 'r')
			{
				ch = '\r';
				it++;
			}
			else if (*it == 'b')
			{
				ch = '\b';
				it++;
			}
			else if (*it == 'f')
			{
				ch = '\f';
				it++;
			}
			else if (*it == 'v')
			{
				ch = '\v';
				it++;
			}
			else if (*it == 'a')
			{
				ch = '\a';
				it++;
			}
			else if (*it == 'u')
			{
				char digs[5];
				memset(digs, 0, 5);
				unsigned int dno = 0;

				it++;

				while (it != end && Ascii::isHexDigit(*it) && dno < 4) digs[dno++] = *it++;
				if (dno > 0)
				{
					ch = strtol(digs, NULL, 16);
				}

				if( ch >= 0xD800 && ch <= 0xDBFF )
				{
					if ( it == end || *it != '\\' )
					{
						//Invalid sequence!
					}
					else
					{
						it++;
						if ( it == end || *it != 'u' )
						{
							//Invalid sequence!
						}
						else
						{
							it++;
						}
					}

					// UTF-16 surrogate pair. Go fetch other half
					memset(digs, 0, 5);
					dno = 0;
					while (it != end && Ascii::isHexDigit(*it) && dno < 4) digs[dno++] = *it++;
					if (dno > 0)
					{
						Poco::UInt32 temp = strtol(digs, NULL, 16);
						if( temp >= 0xDC00 && temp <= 0xDFFF )
						{
							ch = ( ( ( ch - 0xD800 ) << 10 ) | ( temp - 0xDC00 ) ) + 0x10000;
						}
					}
				}
			}
			else if (*it == 'U')
			{
				char digs[9];
				memset(digs, 0, 9);
				unsigned int dno = 0;

				it++;
				while (it != end && Ascii::isHexDigit(*it) && dno < 8)
				{
					digs[dno++] = *it++;
				}
				if (dno > 0)
				{
					ch = strtol(digs, NULL, 16);
				}
			}
		}

		unsigned char utf8[4];
		UTF8Encoding encoding;
		int sz = encoding.convert(ch, utf8, 4);
		result.append((char*) utf8, sz);
	}

	return result;
}

} // Namespace Poco

## UTF8Escape.h
//
// UTF8Escape.h
//

#ifndef Foundation_UTF8Escape_INCLUDED
#define Foundation_UTF8Escape_INCLUDED

namespace Poco {

class Foundation_API UTF8Escape
	/// Static helpers for converting between UTF-8 text and an
	/// ASCII-only representation that uses backslash escape sequences.
	/// The class has no data members; all functions are static.
{
public:

	static std::string escape(const std::string& s);
		/// Returns the escaped form of the whole string s.
		/// Forwards to the iterator-based overload.

	static std::string escape(std::string::const_iterator& begin, std::string::const_iterator& end);
		/// Decodes the UTF-8 bytes in [begin, end) and returns an escaped
		/// string: newline, tab, CR, backspace, form feed, vertical tab,
		/// bell and backslash are written as \n \t \r \b \f \v \a \\;
		/// other values below 32, 0x7F and 0x80..0xFFFF as \u plus a
		/// width-4 hex number; values above 0xFFFF as a surrogate pair of
		/// two \u sequences; remaining characters are copied as they are.

	static std::string unescape(const std::string& s);
		/// Returns the unescaped form of the whole string s.
		/// Forwards to the iterator-based overload.

	static std::string unescape(std::string::const_iterator& begin, std::string::const_iterator& end);
		/// Processes [begin, end) and returns a string in which the
		/// sequences \n \t \r \b \f \v \a, \u followed by up to four hex
		/// digits (a high surrogate is combined with a following low
		/// surrogate) and \U followed by up to eight hex digits have been
		/// replaced by the output of UTF8Encoding::convert for the value
		/// they denote.

private:
};

} // namespace Poco

#endif // Foundation_UTF8Escape_INCLUDED
	#include "Poco/UTF8Escape.h"
	#include "Poco/UTF8Encoding.h"
	#include "Poco/NumberFormatter.h"
	#include "Poco/Ascii.h"

	namespace Poco {

	// Escapes the whole string s by forwarding its full character range to the
	// iterator-based overload, and returns that overload's result.
	std::string UTF8Escape::escape(const std::string &s)
	{
		// The iterator overload takes its arguments by non-const lvalue
		// reference, which the temporaries returned by begin()/end() cannot
		// bind to in standard C++; named locals can.
		std::string::const_iterator first = s.begin();
		std::string::const_iterator last = s.end();
		return escape(first, last);
	}

	// Converts the UTF-8 bytes in [begin, end) into an ASCII-only escaped string.
	//
	// Each decoded code point is written as follows:
	//   - newline, tab, CR, backspace, form feed, vertical tab, bell and
	//     backslash become the two-character sequences \n \t \r \b \f \v \a \\;
	//   - other values below 32, 0x7F, and 0x80..0xFFFF become \u followed by
	//     the value formatted by NumberFormatter::appendHex with width 4;
	//   - values above 0xFFFF become a UTF-16 surrogate pair, i.e. two such
	//     \u sequences;
	//   - everything else (printable ASCII) is copied unchanged.
	//
	// begin/end are only read; neither iterator is modified.
	// Returns the escaped string.
	std::string UTF8Escape::escape(std::string::const_iterator& begin, std::string::const_iterator& end)
	{
		// Entry N-1 is the sum of the lead-byte and continuation-byte marker
		// bits that accumulate when an N-byte sequence is folded six bits at a
		// time; subtracting it leaves the bare code point.
		static const Poco::UInt32 offsetsFromUTF8[6] = {
			0x00000000UL, 0x00003080UL, 0x000E2080UL,
			0x03C82080UL, 0xFA082080UL, 0x82082080UL
		};

		std::string result;
		std::string::const_iterator it = begin;

		while (it != end)
		{
			// The lead byte announces the sequence length. Bounding the decode
			// loop by it keeps a stray continuation byte that follows a
			// complete character from being folded into that character's
			// code point.
			const unsigned char lead = static_cast<unsigned char>(*it);
			unsigned int expected = 1;
			if (lead >= 0xFC) expected = 6;
			else if (lead >= 0xF8) expected = 5;
			else if (lead >= 0xF0) expected = 4;
			else if (lead >= 0xE0) expected = 3;
			else if (lead >= 0xC0) expected = 2;

			Poco::UInt32 ch = 0;
			unsigned int sz = 0;
			do
			{
				ch <<= 6;
				ch += static_cast<unsigned char>(*it++);
				++sz;
			}
			while (it != end && (*it & 0xC0) == 0x80 && sz < expected);
			// sz is the number of bytes actually consumed (1..6); a truncated
			// sequence is therefore adjusted by the offset for its real length.
			ch -= offsetsFromUTF8[sz - 1];

			switch (ch)
			{
			case '\n': result += "\\n"; break;
			case '\t': result += "\\t"; break;
			case '\r': result += "\\r"; break;
			case '\b': result += "\\b"; break;
			case '\f': result += "\\f"; break;
			case '\v': result += "\\v"; break;
			case '\a': result += "\\a"; break;
			case '\\': result += "\\\\"; break;
			default:
				if (ch > 0xFFFF)
				{
					// Outside the BMP: emit high and low surrogate.
					// NOTE(review): values above 0x10FFFF do not fit a
					// surrogate pair; the masks below keep only the low 20 bits.
					ch -= 0x10000;
					result += "\\u";
					NumberFormatter::appendHex(result, static_cast<unsigned short>((ch >> 10) & 0x03ff) + 0xd800, 4);
					result += "\\u";
					NumberFormatter::appendHex(result, static_cast<unsigned short>(ch & 0x03ff) + 0xdc00, 4);
				}
				else if (ch < 32 || ch >= 0x7f)
				{
					// Remaining control characters, DEL and non-ASCII BMP values.
					result += "\\u";
					NumberFormatter::appendHex(result, static_cast<unsigned short>(ch), 4);
				}
				else
				{
					result += static_cast<char>(ch);
				}
				break;
			}
		}
		return result;
	}

	// Unescapes the whole string s by forwarding its full character range to
	// the iterator-based overload, and returns that overload's result.
	std::string UTF8Escape::unescape(const std::string &s)
	{
		// The iterator overload takes its arguments by non-const lvalue
		// reference, which the temporaries returned by begin()/end() cannot
		// bind to in standard C++; named locals can.
		std::string::const_iterator first = s.begin();
		std::string::const_iterator last = s.end();
		return unescape(first, last);
	}

	std::string UTF8Escape::unescape(std::string::const_iterator& begin, std::string::const_iterator& end)
	{
	std::string result;

	std::string::const_iterator it = begin;

	while (it != end)
	{
	Poco::UInt32 ch = (Poco::UInt32) *it++;

	if (ch == '\\')
	{
	if ( it == end )
	{
	//Invalid sequence!
	}

	if (*it == 'n')
	{
	ch = '\n';
	it++;
	}
	else if (*it == 't')
	{
	ch = '\t';
	it++;
	}
	else if (*it == 'r')
	{
	ch = '\r';
	it++;
	}
	else if (*it == 'b')
	{
	ch = '\b';
	it++;
	}
	else if (*it == 'f')
	{
	ch = '\f';
	it++;
	}
	else if (*it == 'v')
	{
	ch = '\v';
	it++;
	}
	else if (*it == 'a')
	{
	ch = '\a';
	it++;
	}
	else if (*it == 'u')
	{
	char digs[5];
	memset(digs, 0, 5);
	unsigned int dno = 0;

	it++;

	while (it != end && Ascii::isHexDigit(it) && dno < 4) digs[dno++] = it++;
	if (dno > 0)
	{
	ch = strtol(digs, NULL, 16);
	}

	if( ch >= 0xD800 && ch <= 0xDBFF )
	{
	if ( it == end \|\| *it != '\\' )
	{
	//Invalid sequence!
	}
	else
	{
	it++;
	if ( it == end \|\| *it != 'u' )
	{
	//Invalid sequence!
	}
	else
	{
	it++;
	}
	}

	// UTF-16 surrogate pair. Go fetch other half
	memset(digs, 0, 5);
	dno = 0;
	while (it != end && Ascii::isHexDigit(it) && dno < 4) digs[dno++] = it++;
	if (dno > 0)
	{
	Poco::UInt32 temp = strtol(digs, NULL, 16);
	if( temp >= 0xDC00 && temp <= 0xDFFF )
	{
	ch = ( ( ( ch - 0xD800 ) << 10 ) \| ( temp - 0xDC00 ) ) + 0x10000;
	}
	}
	}
	}
	else if (*it == 'U')
	{
	char digs[9];
	memset(digs, 0, 9);
	unsigned int dno = 0;

	it++;
	while (it != end && Ascii::isHexDigit(*it) && dno < 8)
	{
	digs[dno++] = *it++;
	}
	if (dno > 0)
	{
	ch = strtol(digs, NULL, 16);
	}
	}
	}

	unsigned char utf8[4];
	UTF8Encoding encoding;
	int sz = encoding.convert(ch, utf8, 4);
	result.append((char*) utf8, sz);
	}

	return result;
	}

	} // Namespace Poco
	//
	// UTF8Escape.h
	//

	#ifndef Foundation_UTF8Escape_INCLUDED
	#define Foundation_UTF8Escape_INCLUDED

	namespace Poco {

	class Foundation_API UTF8Escape
	/// Static helpers for converting between UTF-8 text and an
	/// ASCII-only representation that uses backslash escape sequences.
	/// The class has no data members; all functions are static.
	{
	public:

	static std::string escape(const std::string& s);
	/// Returns the escaped form of the whole string s.
	/// Forwards to the iterator-based overload.

	static std::string escape(std::string::const_iterator& begin, std::string::const_iterator& end);
	/// Decodes the UTF-8 bytes in [begin, end) and returns an escaped
	/// string: newline, tab, CR, backspace, form feed, vertical tab,
	/// bell and backslash are written as \n \t \r \b \f \v \a \\;
	/// other values below 32, 0x7F and 0x80..0xFFFF as \u plus a
	/// width-4 hex number; values above 0xFFFF as a surrogate pair of
	/// two \u sequences; remaining characters are copied as they are.

	static std::string unescape(const std::string& s);
	/// Returns the unescaped form of the whole string s.
	/// Forwards to the iterator-based overload.

	static std::string unescape(std::string::const_iterator& begin, std::string::const_iterator& end);
	/// Processes [begin, end) and returns a string in which the
	/// sequences \n \t \r \b \f \v \a, \u followed by up to four hex
	/// digits (a high surrogate is combined with a following low
	/// surrogate) and \U followed by up to eight hex digits have been
	/// replaced by the output of UTF8Encoding::convert for the value
	/// they denote.

	private:
	};

	} // namespace Poco

	#endif // Foundation_UTF8Escape_INCLUDED