Skip to content

Instantly share code, notes, and snippets.

@d3x0r
Last active November 7, 2018 10:35
Show Gist options
  • Save d3x0r/29d9d92d131a24af28fb89b67a1b57e4 to your computer and use it in GitHub Desktop.
Save d3x0r/29d9d92d131a24af28fb89b67a1b57e4 to your computer and use it in GitHub Desktop.
A routine that converts UTF-16 to UTF-8; it assumes the UTF-16 input is valid.
#include <wchar.h>
#include <malloc.h>
#include <stdint.h>
// Decodes one code point starting at wch[0]; 'avail' is the number of
// units that may be read (must be >= 1).  Stores the number of units
// consumed (1 or 2) in *consumed.
//
// A high surrogate followed by a low surrogate is combined into a
// supplementary-plane code point.  An unpaired surrogate is returned as-is
// (the caller then emits it as a 3-byte sequence, as this routine always
// has).  Where wchar_t is wider than 16 bits, a unit in 0x10000..0x10FFFF
// is taken to be a code point already; anything larger (including negative
// values of a signed wchar_t) becomes U+FFFD.
static uint32_t utf16_decode_one( const wchar_t *wch, size_t avail, size_t *consumed )
{
    uint32_t unit = (uint32_t)wch[0];
    *consumed = 1;
    // only look at wch[1] when it is really part of the input.
    if( ( ( unit & 0xFFFFFC00u ) == 0xD800u ) && ( avail > 1 ) )
    {
        uint32_t low = (uint32_t)wch[1];
        if( ( low & 0xFFFFFC00u ) == 0xDC00u )
        {
            *consumed = 2;
            return 0x10000u + ( ( ( unit & 0x3FFu ) << 10 ) | ( low & 0x3FFu ) );
        }
    }
    if( unit > 0x10FFFFu )
        return 0xFFFDu;
    return unit;
}

// Returns how many UTF-8 bytes are needed for code point cp (cp <= 0x10FFFF).
static size_t utf8_sequence_length( uint32_t cp )
{
    if( cp < 0x80u )
        return 1;
    if( cp < 0x800u )
        return 2;
    if( cp < 0x10000u )
        return 3;
    return 4;
}

// Writes the UTF-8 encoding of cp at 'out' and returns the position just
// past the last byte written.  Works on values, not on the in-memory bytes
// of the input, so the result does not depend on byte order.
static unsigned char * utf8_write( unsigned char *out, uint32_t cp )
{
    if( cp < 0x80u )
    {
        *out++ = (unsigned char)cp;
    }
    else if( cp < 0x800u )
    {
        *out++ = (unsigned char)( 0xC0u | ( cp >> 6 ) );
        *out++ = (unsigned char)( 0x80u | ( cp & 0x3Fu ) );
    }
    else if( cp < 0x10000u )
    {
        *out++ = (unsigned char)( 0xE0u | ( cp >> 12 ) );
        *out++ = (unsigned char)( 0x80u | ( ( cp >> 6 ) & 0x3Fu ) );
        *out++ = (unsigned char)( 0x80u | ( cp & 0x3Fu ) );
    }
    else
    {
        *out++ = (unsigned char)( 0xF0u | ( cp >> 18 ) );
        *out++ = (unsigned char)( 0x80u | ( ( cp >> 12 ) & 0x3Fu ) );
        *out++ = (unsigned char)( 0x80u | ( ( cp >> 6 ) & 0x3Fu ) );
        *out++ = (unsigned char)( 0x80u | ( cp & 0x3Fu ) );
    }
    return out;
}

// Converts a UTF-16 string to a newly allocated, NUL-terminated UTF-8 string.
//
// wch        : input code units; need not be NUL-terminated.
// len        : count of wide characters (code units) in wch.  Exactly this
//              many units are read, never more.
// result_len : optional; receives the byte count of the UTF-8 result, not
//              including the final NUL (0 on failure); pass NULL to ignore.
//
// returns pointer to the utf8 string; the result must be released with free().
// returns NULL if wch is NULL while len is non-zero, if the required size
// overflows size_t, or if the allocation fails.
//
// Surrogate pairs are emitted as one 4-byte sequence.  An unpaired surrogate
// is emitted as a 3-byte sequence of its own value (not strictly valid
// UTF-8, but it keeps the information).  The earlier special case that
// remapped U+F0000..U+F07FF onto D800-DFFF has been removed: it wrote more
// bytes than were counted and mangled valid private-use code points.
char * UTF16ToUTF8 ( const wchar_t *wch, size_t len, size_t *result_len )
{
    size_t sizeInBytes = 1; // start with 1 for the ending nul
    size_t n;
    size_t used;
    unsigned char *start;
    unsigned char *out;

    if( result_len )
        result_len[0] = 0;
    if( !wch && len )
        return NULL;

    // first, count how many bytes we will need.
    for( n = 0; n < len; n += used )
    {
        size_t add = utf8_sequence_length( utf16_decode_one( wch + n, len - n, &used ) );
        if( sizeInBytes > SIZE_MAX - add )
            return NULL; // total would not fit in size_t
        sizeInBytes += add;
    }

    // cast kept so the file still builds when compiled as C++.
    start = out = (unsigned char*)malloc( sizeInBytes );
    if( !out )
        return NULL;

    // second pass decodes exactly as the first did, so the byte count matches.
    for( n = 0; n < len; n += used )
        out = utf8_write( out, utf16_decode_one( wch + n, len - n, &used ) );

    (*out) = 0;
    if( result_len )
        result_len[0] = (size_t)( out - start );
    return (char*)start;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment