Last active
November 7, 2018 10:35
-
-
Save d3x0r/29d9d92d131a24af28fb89b67a1b57e4 to your computer and use it in GitHub Desktop.
A routine that converts UTF-16 to UTF-8; it assumes the UTF-16 input is valid.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <malloc.h>
#include <stddef.h>
#include <stdint.h>
#include <stdlib.h>
#include <wchar.h>
// Decodes one code point starting at wch[*pos] and advances *pos past the
// one or two code units consumed.  Only the low 16 bits of each element are
// used, so a wider wchar_t is treated as holding a UTF-16 code unit.
// A high surrogate is combined with the following unit only when that unit
// lies inside the buffer and is a low surrogate; otherwise the single unit
// is returned unchanged.
static uint32_t utf16_next_code_point( const wchar_t *wch, size_t len, size_t *pos )
{
	uint32_t unit = (uint32_t)wch[*pos] & 0xFFFFu;
	( *pos )++;
	if( ( ( unit & 0xFC00u ) == 0xD800u ) && ( *pos < len ) )
	{
		uint32_t low = (uint32_t)wch[*pos] & 0xFFFFu;
		if( ( low & 0xFC00u ) == 0xDC00u )
		{
			( *pos )++;
			return 0x10000u + ( ( ( unit & 0x3FFu ) << 10 ) | ( low & 0x3FFu ) );
		}
	}
	return unit;
}

// Returns how many UTF-8 bytes utf8_write_code_point() emits for cp.
static size_t utf8_sequence_length( uint32_t cp )
{
	if( cp < 0x80u )
		return 1;
	if( cp < 0x800u )
		return 2;
	if( cp < 0x10000u )
		return 3;
	return 4;
}

// Writes cp at out as 1 to 4 UTF-8 bytes and returns the position just
// past the last byte written.
static char *utf8_write_code_point( char *out, uint32_t cp )
{
	if( cp < 0x80u )
	{
		*out++ = (char)(unsigned char)cp;
	}
	else if( cp < 0x800u )
	{
		*out++ = (char)(unsigned char)( 0xC0u | ( cp >> 6 ) );
		*out++ = (char)(unsigned char)( 0x80u | ( cp & 0x3Fu ) );
	}
	else if( cp < 0x10000u )
	{
		*out++ = (char)(unsigned char)( 0xE0u | ( cp >> 12 ) );
		*out++ = (char)(unsigned char)( 0x80u | ( ( cp >> 6 ) & 0x3Fu ) );
		*out++ = (char)(unsigned char)( 0x80u | ( cp & 0x3Fu ) );
	}
	else
	{
		*out++ = (char)(unsigned char)( 0xF0u | ( ( cp >> 18 ) & 0x07u ) );
		*out++ = (char)(unsigned char)( 0x80u | ( ( cp >> 12 ) & 0x3Fu ) );
		*out++ = (char)(unsigned char)( 0x80u | ( ( cp >> 6 ) & 0x3Fu ) );
		*out++ = (char)(unsigned char)( 0x80u | ( cp & 0x3Fu ) );
	}
	return out;
}

// Converts a UTF-16 string to a newly allocated, NUL-terminated UTF-8 string.
//
// wch        - UTF-16 code units to convert; a NULL pointer is treated as an
//              empty input.
// len        - number of wchar_t code units in wch.  A surrogate pair counts
//              as two units; no unit at index >= len is ever read.
// result_len - optional; receives the byte count of the UTF-8 output, not
//              including the final NUL.  Pass NULL to ignore.  It is set to 0
//              when the function fails.
//
// Returns a pointer to the UTF-8 string, which must be released with free(),
// or NULL if the allocation fails.
//
// A surrogate that is not part of a valid high/low pair is encoded on its own
// as a three-byte sequence carrying the 16-bit value as it is.
char * UTF16ToUTF8 ( const wchar_t *wch, size_t len, size_t *result_len )
{
	size_t size_in_bytes = 1; // start with 1 for the ending NUL
	size_t pos;
	char *result;
	char *out;

	if( result_len )
		result_len[0] = 0;
	if( !wch )
		len = 0;

	// First pass: count how many bytes are needed.  Both passes use the same
	// decoder, so the size computed here always matches what is written below.
	for( pos = 0; pos < len; )
		size_in_bytes += utf8_sequence_length( utf16_next_code_point( wch, len, &pos ) );

	// The cast is not required in C; it keeps the file buildable as C++ too.
	result = (char*)malloc( size_in_bytes );
	if( !result )
		return NULL;

	// Second pass: encode.
	out = result;
	for( pos = 0; pos < len; )
		out = utf8_write_code_point( out, utf16_next_code_point( wch, len, &pos ) );
	*out = 0;

	if( result_len )
		result_len[0] = (size_t)( out - result );
	return result;
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment