d3x0r/UTF16ToUTF8.c

## UTF16ToUTF8.c

#include <malloc.h>
#include <stdint.h>
#include <stdlib.h>
#include <wchar.h>


// Returns one input element as an unsigned UTF-16 code unit value.
// When wchar_t is only 16 bits wide the value is masked so that a signed
// wchar_t cannot sign-extend into the upper bits.
static uint32_t utf16_unit( wchar_t w )
{
	uint32_t unit = (uint32_t)w;
	if( sizeof( wchar_t ) <= 2 )
		unit &= 0xFFFFu;
	return unit;
}

// Reads the code point that starts at wch[*pos] and advances *pos past the
// code units it consumed (two for a valid surrogate pair, otherwise one).
// Never reads at or beyond wch[len].
// An unpaired surrogate is returned unchanged; a value above U+10FFFF
// (only possible when wchar_t is wider than 16 bits) becomes U+FFFD.
static uint32_t utf16_next( const wchar_t *wch, size_t len, size_t *pos )
{
	uint32_t unit = utf16_unit( wch[(*pos)++] );

	// a high surrogate only forms a pair if a low surrogate really follows.
	if( ( ( unit & 0xFFFFFC00u ) == 0xD800u ) && ( *pos < len ) )
	{
		uint32_t low = utf16_unit( wch[*pos] );
		if( ( low & 0xFFFFFC00u ) == 0xDC00u )
		{
			(*pos)++;
			return 0x10000u + ( ( ( unit & 0x3FFu ) << 10 ) | ( low & 0x3FFu ) );
		}
	}
	if( unit > 0x10FFFFu )
		return 0xFFFDu;
	return unit;
}

// Returns the number of bytes (1 to 4) the UTF-8 encoding of cp occupies,
// and stores those bytes at out when out is not NULL.
// cp must not exceed 0x10FFFF.  Surrogate values are encoded like any other
// three-byte value so unpaired surrogates survive the conversion.
static size_t utf8_put( uint32_t cp, char *out )
{
	if( cp < 0x80u )
	{
		if( out )
			out[0] = (char)cp;
		return 1;
	}
	if( cp < 0x800u )
	{
		if( out )
		{
			out[0] = (char)( 0xC0u | ( cp >> 6 ) );
			out[1] = (char)( 0x80u | ( cp & 0x3Fu ) );
		}
		return 2;
	}
	if( cp < 0x10000u )
	{
		if( out )
		{
			out[0] = (char)( 0xE0u | ( cp >> 12 ) );
			out[1] = (char)( 0x80u | ( ( cp >> 6 ) & 0x3Fu ) );
			out[2] = (char)( 0x80u | ( cp & 0x3Fu ) );
		}
		return 3;
	}
	if( out )
	{
		out[0] = (char)( 0xF0u | ( cp >> 18 ) );
		out[1] = (char)( 0x80u | ( ( cp >> 12 ) & 0x3Fu ) );
		out[2] = (char)( 0x80u | ( ( cp >> 6 ) & 0x3Fu ) );
		out[3] = (char)( 0x80u | ( cp & 0x3Fu ) );
	}
	return 4;
}

// Converts UTF-16 code units into a newly allocated, NUL terminated UTF-8
// string.
//
// wch        - code units to convert; NULL is treated as empty input.
// len        - count of wide characters (code units) passed; a surrogate
//              pair counts as two.  No element at or past wch[len] is read.
// result_len - optional; receives the byte count of the utf8 characters.
//              The count does not include the final NUL; pass NULL to ignore.
//
// returns pointer to utf8 string; the result must be released with free().
// returns NULL (and stores 0 through result_len) if the allocation fails.
//
// A valid surrogate pair becomes one four-byte sequence.  An unpaired
// surrogate is encoded as it is, in three bytes.  When wchar_t is wider than
// 16 bits, values 0x10000..0x10FFFF are taken as code points and anything
// larger is replaced with U+FFFD.
char * UTF16ToUTF8 ( const wchar_t *wch, size_t len, size_t *result_len )
{
	size_t sizeInBytes = 1; // start with 1 for the ending nul
	size_t pos;
	char *result;
	char *out;

	if( !wch )
		len = 0;

	// first, count how many bytes we will need.
	for( pos = 0; pos < len; )
		sizeInBytes += utf8_put( utf16_next( wch, len, &pos ), NULL );

	result = (char*)malloc( sizeInBytes );
	if( !result )
	{
		if( result_len )
			result_len[0] = 0;
		return NULL;
	}

	// second pass walks the input exactly like the first, so the bytes
	// written always match the size that was counted.
	out = result;
	for( pos = 0; pos < len; )
		out += utf8_put( utf16_next( wch, len, &pos ), out );
	out[0] = 0;

	if( result_len )
		result_len[0] = (size_t)( out - result );
	return result;
}

	#include <wchar.h>
	#include <malloc.h>
	#include <stdint.h>


	// Converts UTF-16 code units into a newly allocated, NUL terminated UTF-8
	// string.
	//
	// wch        - code units to convert; NULL is treated as empty input.
	// len        - count of wide characters (code units) passed; a surrogate
	//              pair counts as two.  No element at or past wch[len] is read.
	// result_len - optional; receives the byte count of the utf8 characters.
	//              The count does not include the final NUL; pass NULL to ignore.
	//
	// returns pointer to utf8 string; the result must be released with free().
	// returns NULL (and stores 0 through result_len) if the allocation fails.
	//
	// A valid surrogate pair becomes one four-byte sequence.  An unpaired
	// surrogate is encoded as it is, in three bytes.  When wchar_t is wider than
	// 16 bits, values 0x10000..0x10FFFF are taken as code points and anything
	// larger is replaced with U+FFFD.
	char * UTF16ToUTF8 ( const wchar_t *wch, size_t len, size_t *result_len )
	{
		char *ch = NULL; // stays NULL during the counting pass
		size_t used = 0; // bytes produced so far in the current pass
		int pass;

		if( !wch )
			len = 0;

		// pass 0 only counts the bytes needed; pass 1 writes them.  Both passes
		// run the same decoding code so the count and the output cannot disagree.
		for( pass = 0; pass < 2; pass++ )
		{
			size_t n;
			used = 0;
			for( n = 0; n < len; n++ )
			{
				uint32_t cp = (uint32_t)wch[n];
				size_t count;

				// guard against a signed 16 bit wchar_t sign-extending.
				if( sizeof( wchar_t ) <= 2 )
					cp &= 0xFFFFu;

				// a high surrogate only forms a pair if a low surrogate
				// really follows inside the input.
				if( ( ( cp & 0xFFFFFC00u ) == 0xD800u ) && ( ( n + 1 ) < len ) )
				{
					uint32_t low = (uint32_t)wch[n + 1];
					if( sizeof( wchar_t ) <= 2 )
						low &= 0xFFFFu;
					if( ( low & 0xFFFFFC00u ) == 0xDC00u )
					{
						cp = 0x10000u + ( ( ( cp & 0x3FFu ) << 10 ) | ( low & 0x3FFu ) );
						n++; // the pair used two code units
					}
				}
				if( cp > 0x10FFFFu )
					cp = 0xFFFDu; // not a code point; use the replacement character

				if( cp < 0x80u )
					count = 1;
				else if( cp < 0x800u )
					count = 2;
				else if( cp < 0x10000u )
					count = 3; // includes unpaired surrogates, encoded as they are
				else
					count = 4;

				if( ch )
				{
					char *out = ch + used;
					if( count == 1 )
					{
						out[0] = (char)cp;
					}
					else if( count == 2 )
					{
						out[0] = (char)( 0xC0u | ( cp >> 6 ) );
						out[1] = (char)( 0x80u | ( cp & 0x3Fu ) );
					}
					else if( count == 3 )
					{
						out[0] = (char)( 0xE0u | ( cp >> 12 ) );
						out[1] = (char)( 0x80u | ( ( cp >> 6 ) & 0x3Fu ) );
						out[2] = (char)( 0x80u | ( cp & 0x3Fu ) );
					}
					else
					{
						out[0] = (char)( 0xF0u | ( cp >> 18 ) );
						out[1] = (char)( 0x80u | ( ( cp >> 12 ) & 0x3Fu ) );
						out[2] = (char)( 0x80u | ( ( cp >> 6 ) & 0x3Fu ) );
						out[3] = (char)( 0x80u | ( cp & 0x3Fu ) );
					}
				}
				used += count;
			}

			if( pass == 0 )
			{
				// one extra byte for the ending nul
				ch = (char*)malloc( used + 1 );
				if( !ch )
				{
					if( result_len )
						result_len[0] = 0;
					return NULL;
				}
			}
		}

		ch[used] = 0;
		if( result_len )
			result_len[0] = used;
		return ch;
	}