Skip to content

Instantly share code, notes, and snippets.

@cloudwu
Created January 5, 2017 06:41
Show Gist options
  • Save cloudwu/4705dbf6a0a4657bee78f2749e8da601 to your computer and use it in GitHub Desktop.
Save cloudwu/4705dbf6a0a4657bee78f2749e8da601 to your computer and use it in GitHub Desktop.
A filter that converts UTF-8 to UTF-16 on Windows.
// Build with MinGW on Windows:
// gcc -O2 -o utf8.exe utf8to16.c
#include <stdio.h>
#include <fcntl.h>
#include <io.h>
#include <wchar.h>
/*
 * Number of bytes that follow a given first byte of a UTF-8 sequence,
 * indexed by the value of that first byte (0..255).
 *
 * Bytes 0x00-0xBF map to 0, so the table does not tell ASCII apart from
 * continuation bytes (0x80-0xBF). Lead bytes 0xF8-0xFF map to 4 and 5,
 * i.e. the table still describes the 5- and 6-byte forms.
 */
static const char trailingBytesForUTF8[256] = {
	/* 0x00-0x0F */ 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,
	/* 0x10-0x1F */ 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,
	/* 0x20-0x2F */ 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,
	/* 0x30-0x3F */ 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,
	/* 0x40-0x4F */ 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,
	/* 0x50-0x5F */ 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,
	/* 0x60-0x6F */ 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,
	/* 0x70-0x7F */ 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,
	/* 0x80-0x8F */ 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,
	/* 0x90-0x9F */ 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,
	/* 0xA0-0xAF */ 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,
	/* 0xB0-0xBF */ 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,
	/* 0xC0-0xCF */ 1,1,1,1, 1,1,1,1, 1,1,1,1, 1,1,1,1,
	/* 0xD0-0xDF */ 1,1,1,1, 1,1,1,1, 1,1,1,1, 1,1,1,1,
	/* 0xE0-0xEF */ 2,2,2,2, 2,2,2,2, 2,2,2,2, 2,2,2,2,
	/* 0xF0-0xFF */ 3,3,3,3, 3,3,3,3, 4,4,4,4, 5,5,5,5
};
/*
 * Correction subtracted from the raw accumulated byte sum of a sequence,
 * indexed by its number of trailing bytes. The decoder adds each byte
 * unmasked and shifts left by 6 between bytes, so the lead-byte marker
 * bits and the 0x80 tag of every continuation byte end up in the sum;
 * each entry is exactly that surplus (in 32-bit arithmetic).
 */
static const unsigned long offsetsFromUTF8[6] = {
	0x00000000UL,	/* 0 trailing bytes: nothing to remove */
	0x00003080UL,	/* 1: (0xC0 << 6) + 0x80 */
	0x000E2080UL,	/* 2: (0xE0 << 12) + (0x80 << 6) + 0x80 */
	0x03C82080UL,	/* 3: (0xF0 << 18) + (0x80 << 12) + (0x80 << 6) + 0x80 */
	0xFA082080UL,	/* 4: 0xF8 lead plus four 0x80 tags */
	0x82082080UL	/* 5: 0xFC lead plus five 0x80 tags, modulo 2^32 */
};
/*
 * Constants used to split a code point above 0xFFFF into a UTF-16
 * surrogate pair.
 */
/* Number of low bits carried by the second (low) surrogate. */
#define UNI_SHIFT 10
/* Subtracted from the code point before it is split into two halves. */
#define UNI_BASE 0x0010000
/* Selects the low UNI_SHIFT bits of the reduced code point. */
#define UNI_MASK 0x3FF
/* Added to the upper half to form the first (high) surrogate. */
#define UNI_SUR_HIGH_START 0xD800
/* Added to the lower half to form the second (low) surrogate. */
#define UNI_SUR_LOW_START 0xDC00
int
main() {
int c;
_setmode(_fileno(stdout), _O_WTEXT);
while ((c=fgetc(stdin)) >= 0) {
unsigned long ch = 0;
int extra = trailingBytesForUTF8[c];
#define READBYTE() ch += c; ch <<= 6; if ((c=fgetc(stdin)) < 0) return 1;
switch (extra) {
case 5: READBYTE();
case 4: READBYTE();
case 3: READBYTE();
case 2: READBYTE();
case 1: READBYTE();
case 0: ch += c;
}
ch -= offsetsFromUTF8[extra];
if (ch <= 0xffff) {
wprintf(L"%lc", ch);
} else {
ch -= UNI_BASE;
wprintf(L"%lc%lc", (ch >> UNI_SHIFT) + UNI_SUR_HIGH_START,
(ch & UNI_MASK) + UNI_SUR_LOW_START);
}
}
return 0;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment