Skip to content

Instantly share code, notes, and snippets.

@davidglezz
Created January 13, 2014 23:11
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save davidglezz/8409916 to your computer and use it in GitHub Desktop.
Save davidglezz/8409916 to your computer and use it in GitHub Desktop.
A utf16_to_utf8 function adapted from opusfile with some modifications. I have not tested it myself, but I expect it to work correctly.
#include <stdlib.h>
#include <string.h>
#include <wchar.h>
/**
 * Convert a NUL-terminated UTF-16 wide string to a newly allocated,
 * NUL-terminated UTF-8 string.
 *
 * Valid surrogate pairs are combined and written as 4-byte sequences.
 * Unpaired surrogates and "not a character" values are still written as
 * 3-byte sequences (technically invalid UTF-8) so that later error handling
 * can detect them and report a more useful message.
 * On platforms where wchar_t is wider than 16 bits, a single unit in the
 * range U+10000..U+10FFFF is written as a 4-byte sequence, and a unit above
 * U+10FFFF (which UTF-8 cannot represent) is replaced with U+FFFD.
 *
 * @param src  NUL-terminated wide string; may be NULL.
 * @return     A malloc()ed UTF-8 string that the caller must free(), or NULL
 *             if src is NULL, the required size overflows size_t, or the
 *             allocation fails.
 */
static char* utf16_to_utf8 (const wchar_t *src)
{
    /* Worst-case output bytes per input unit: a 16-bit unit needs at most 3
       (a surrogate pair spends 2 units on 4 bytes); a wider wchar_t can hold
       a supplementary code point in one unit, which needs 4. */
    const size_t max_bytes_per_unit = (sizeof(wchar_t) > 2) ? 4 : 3;
    size_t len, si, di;
    char *dst;

    if (src == NULL)
        return NULL;

    len = wcslen(src);
    /* Refuse lengths for which max_bytes_per_unit*len+1 would wrap size_t. */
    if (len > ((size_t)-1 - 1) / max_bytes_per_unit)
        return NULL;

    dst = (char*)malloc(max_bytes_per_unit * len + 1);
    if (dst == NULL)
        return NULL;

    for (di = si = 0; si < len; si++)
    {
        /* unsigned long is guaranteed to hold at least 32 bits, enough for
           any code point even where unsigned int is only 16 bits wide. */
        unsigned long c0 = (unsigned long)src[si];

        if (c0 < 0x80)
        {
            /* Can be represented by a 1-byte sequence. */
            dst[di++] = (char)c0;
            continue;
        }
        if (c0 < 0x800)
        {
            /* Can be represented by a 2-byte sequence. */
            dst[di++] = (char)(0xC0 | (c0 >> 6));
            dst[di++] = (char)(0x80 | (c0 & 0x3F));
            continue;
        }
        if (c0 >= 0xD800 && c0 < 0xDC00)
        {
            /* Reading src[si+1] is safe: c0 was not 0 and src is
               NUL-terminated, so at worst this reads the terminator. */
            unsigned long c1 = (unsigned long)src[si + 1];
            if (c1 >= 0xDC00 && c1 < 0xE000)
            {
                /* Surrogate pair: combine into one supplementary code point
                   and emit it as a 4-byte sequence. */
                unsigned long w = (((c0 & 0x3FF) << 10) | (c1 & 0x3FF)) + 0x10000;
                dst[di++] = (char)(0xF0 | (w >> 18));
                dst[di++] = (char)(0x80 | ((w >> 12) & 0x3F));
                dst[di++] = (char)(0x80 | ((w >> 6) & 0x3F));
                dst[di++] = (char)(0x80 | (w & 0x3F));
                si++; /* The low surrogate has been consumed as well. */
                continue;
            }
        }
        else if (c0 >= 0x10000)
        {
            /* Only reachable when wchar_t is wider than 16 bits. */
            if (c0 <= 0x10FFFF)
            {
                /* Supplementary code point stored directly in one unit. */
                dst[di++] = (char)(0xF0 | (c0 >> 18));
                dst[di++] = (char)(0x80 | ((c0 >> 12) & 0x3F));
                dst[di++] = (char)(0x80 | ((c0 >> 6) & 0x3F));
                dst[di++] = (char)(0x80 | (c0 & 0x3F));
                continue;
            }
            /* Beyond the Unicode range: substitute U+FFFD rather than
               letting the 3-byte encoder below emit a truncated lead byte. */
            c0 = 0xFFFD;
        }
        /* Anything else is either a valid 3-byte sequence, an invalid
           surrogate pair, or 'not a character'.
           In the latter two cases, we just encode the value as a 3-byte
           sequence anyway (producing technically invalid UTF-8).
           Later error handling will detect the problem, with a better
           chance of giving a useful error message. */
        dst[di++] = (char)(0xE0 | (c0 >> 12));
        dst[di++] = (char)(0x80 | ((c0 >> 6) & 0x3F));
        dst[di++] = (char)(0x80 | (c0 & 0x3F));
    }
    dst[di] = '\0';
    return dst;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment