Last active
August 29, 2015 14:02
-
-
Save fbraem/d605938e6071b3de9df8 to your computer and use it in GitHub Desktop.
UTF8Escape class for POCO
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include "Poco/UTF8Escape.h" | |
#include "Poco/UTF8Encoding.h" | |
#include "Poco/NumberFormatter.h" | |
#include "Poco/Ascii.h" | |
namespace Poco { | |
std::string UTF8Escape::escape(const std::string &s) | |
{ | |
return escape(s.begin(), s.end()); | |
} | |
std::string UTF8Escape::escape(std::string::const_iterator& begin, std::string::const_iterator& end) | |
{ | |
static Poco::UInt32 offsetsFromUTF8[6] = { | |
0x00000000UL, 0x00003080UL, 0x000E2080UL, | |
0x03C82080UL, 0xFA082080UL, 0x82082080UL | |
}; | |
std::string result; | |
std::string::const_iterator it = begin; | |
while(it != end) | |
{ | |
Poco::UInt32 ch = 0; | |
unsigned int sz = 0; | |
do | |
{ | |
ch <<= 6; | |
ch += (unsigned char)*it++; | |
sz++; | |
} | |
while (it != end && (*it & 0xC0) == 0x80 && sz < 6); | |
ch -= offsetsFromUTF8[sz-1]; | |
if (ch == '\n') result += "\\n"; | |
else if (ch == '\t') result += "\\t"; | |
else if (ch == '\r') result += "\\r"; | |
else if (ch == '\b') result += "\\b"; | |
else if (ch == '\f') result += "\\f"; | |
else if (ch == '\v') result += "\\v"; | |
else if (ch == '\a') result += "\\a"; | |
else if (ch == '\\') result += "\\\\"; | |
else if (ch < 32 || ch == 0x7f) | |
{ | |
result += "\\u"; | |
NumberFormatter::appendHex(result, (unsigned short) ch, 4); | |
} | |
else if (ch > 0xFFFF) | |
{ | |
ch -= 0x10000; | |
result += "\\u"; | |
NumberFormatter::appendHex(result, (unsigned short) (( ch >> 10 ) & 0x03ff ) + 0xd800, 4); | |
result += "\\u"; | |
NumberFormatter::appendHex(result, (unsigned short) (ch & 0x03ff ) + 0xdc00, 4); | |
} | |
else if (ch >= 0x80 && ch <= 0xFFFF) | |
{ | |
result += "\\u"; | |
NumberFormatter::appendHex(result, (unsigned short) ch, 4); | |
} | |
else | |
{ | |
result += (char) ch; | |
} | |
} | |
return result; | |
} | |
std::string UTF8Escape::unescape(const std::string &s) | |
{ | |
return unescape(s.begin(), s.end()); | |
} | |
std::string UTF8Escape::unescape(std::string::const_iterator& begin, std::string::const_iterator& end) | |
{ | |
std::string result; | |
std::string::const_iterator it = begin; | |
while (it != end) | |
{ | |
Poco::UInt32 ch = (Poco::UInt32) *it++; | |
if (ch == '\\') | |
{ | |
if ( it == end ) | |
{ | |
//Invalid sequence! | |
} | |
if (*it == 'n') | |
{ | |
ch = '\n'; | |
it++; | |
} | |
else if (*it == 't') | |
{ | |
ch = '\t'; | |
it++; | |
} | |
else if (*it == 'r') | |
{ | |
ch = '\r'; | |
it++; | |
} | |
else if (*it == 'b') | |
{ | |
ch = '\b'; | |
it++; | |
} | |
else if (*it == 'f') | |
{ | |
ch = '\f'; | |
it++; | |
} | |
else if (*it == 'v') | |
{ | |
ch = '\v'; | |
it++; | |
} | |
else if (*it == 'a') | |
{ | |
ch = '\a'; | |
it++; | |
} | |
else if (*it == 'u') | |
{ | |
char digs[5]; | |
memset(digs, 0, 5); | |
unsigned int dno = 0; | |
it++; | |
while (it != end && Ascii::isHexDigit(*it) && dno < 4) digs[dno++] = *it++; | |
if (dno > 0) | |
{ | |
ch = strtol(digs, NULL, 16); | |
} | |
if( ch >= 0xD800 && ch <= 0xDBFF ) | |
{ | |
if ( it == end || *it != '\\' ) | |
{ | |
//Invalid sequence! | |
} | |
else | |
{ | |
it++; | |
if ( it == end || *it != 'u' ) | |
{ | |
//Invalid sequence! | |
} | |
else | |
{ | |
it++; | |
} | |
} | |
// UTF-16 surrogate pair. Go fetch other half | |
memset(digs, 0, 5); | |
dno = 0; | |
while (it != end && Ascii::isHexDigit(*it) && dno < 4) digs[dno++] = *it++; | |
if (dno > 0) | |
{ | |
Poco::UInt32 temp = strtol(digs, NULL, 16); | |
if( temp >= 0xDC00 && temp <= 0xDFFF ) | |
{ | |
ch = ( ( ( ch - 0xD800 ) << 10 ) | ( temp - 0xDC00 ) ) + 0x10000; | |
} | |
} | |
} | |
} | |
else if (*it == 'U') | |
{ | |
char digs[9]; | |
memset(digs, 0, 9); | |
unsigned int dno = 0; | |
it++; | |
while (it != end && Ascii::isHexDigit(*it) && dno < 8) | |
{ | |
digs[dno++] = *it++; | |
} | |
if (dno > 0) | |
{ | |
ch = strtol(digs, NULL, 16); | |
} | |
} | |
} | |
unsigned char utf8[4]; | |
UTF8Encoding encoding; | |
int sz = encoding.convert(ch, utf8, 4); | |
result.append((char*) utf8, sz); | |
} | |
return result; | |
} | |
} // Namespace Poco |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// | |
// UTF8Escape.h | |
// | |
#ifndef Foundation_UTF8Escape_INCLUDED | |
#define Foundation_UTF8Escape_INCLUDED | |
namespace Poco { | |
class Foundation_API UTF8Escape | |
{ | |
public: | |
static std::string escape(const std::string& s); | |
static std::string escape(std::string::const_iterator& begin, std::string::const_iterator& end); | |
static std::string unescape(const std::string& s); | |
static std::string unescape(std::string::const_iterator& begin, std::string::const_iterator& end); | |
private: | |
}; | |
} // namespace Poco | |
#endif // Foundation_UTF8Escape_INCLUDED |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment