Skip to content

Instantly share code, notes, and snippets.

@gerdr
Created June 4, 2012 16:05
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save gerdr/2869236 to your computer and use it in GitHub Desktop.
Save gerdr/2869236 to your computer and use it in GitHub Desktop.
UTF-8 encoder
/*
Copyright 2012 Gerhard R. <gerd.r.devel@googlemail.com>
Permission is granted to use, modify, and / or redistribute at will.
This includes removing authorship notices, re-use of code parts in
other software (with or without giving credit), and / or creating a
commercial product based on it.
This permission is not revocable by the author.
This software is provided as-is. Use it at your own risk. There is
no warranty whatsoever, neither expressed nor implied, and by using
this software you accept that the author(s) shall not be held liable
for any loss of data, loss of service, or other damages, be they
incidental or consequential. Your only option other than accepting
this is not to use the software at all.
*/
#include <stddef.h>
#include <stdint.h>
/*
 * Encode the single code point `cp` as UTF-8 into the buffer at `bytes`.
 *
 * Between one and four bytes are written, no terminator is appended and no
 * bounds check is performed, so the caller must supply room for four bytes.
 *
 * Returns a pointer just past the last byte written, or NULL (with nothing
 * written) when `cp` is not classified as an ordinary character, i.e. for
 * surrogates, noncharacters and values above 0x10FFFF.
 *
 * Code point 0 is emitted as the two-byte sequence 0xC0 0x80 rather than as
 * a single 0x00 byte.
 */
extern void *utf8_encode(void *bytes, uint32_t cp);
/*
 * Bit flags returned by classify().  A result combines one CP_* bit (what
 * kind of code point it is) with one U8_* bit (how many bytes its UTF-8
 * form occupies); a result of 0 means the value is out of range entirely.
 */
enum
{
    /* Kind of code point. */
    CP_CHAR           = 0x001, /* ordinary, encodable character          */
    CP_LOW_SURROGATE  = 0x002, /* 0xDC00..0xDFFF                         */
    CP_HIGH_SURROGATE = 0x004, /* 0xD800..0xDBFF                         */
    CP_NONCHAR        = 0x008, /* 0xFDD0..0xFDEF and every ..FFFE/..FFFF */
    CP_OVERFLOW       = 0x010, /* 0x110000..0x1FFFFF                     */

    /* Encoded length. */
    U8_SINGLE         = 0x020, /* one byte    */
    U8_DOUBLE         = 0x040, /* two bytes   */
    U8_TRIPLE         = 0x080, /* three bytes */
    U8_QUAD           = 0x100  /* four bytes  */
};
/*
 * Classify the code point `cp`.
 *
 * Returns a bitwise OR of one CP_* flag (kind of code point) and one U8_*
 * flag (number of bytes in its encoded form), or 0 when `cp` is greater
 * than 0x1FFFFF.
 *
 * Code point 0 is deliberately reported as a two-byte character rather
 * than a one-byte one (presumably so that encoded output never contains
 * an embedded 0x00 byte, as in "Modified UTF-8" -- confirm with callers).
 */
static unsigned classify(uint32_t cp)
{
    /* Too large even for a four-byte sequence. */
    if (cp > 0x1FFFFF)
    {
        return 0;
    }

    /* Fits the four-byte layout but lies above the last code point. */
    if (cp > 0x10FFFF)
    {
        return CP_OVERFLOW | U8_QUAD;
    }

    /* Supplementary planes: four bytes. */
    if (cp > 0xFFFF)
    {
        /* The last two code points of every plane are noncharacters. */
        const uint32_t low16 = cp & 0xFFFF;

        return (low16 >= 0xFFFE ? CP_NONCHAR : CP_CHAR) | U8_QUAD;
    }

    /* 0x0800..0xFFFF: three bytes. */
    if (cp > 0x07FF)
    {
        unsigned kind = CP_CHAR;

        if (cp >= 0xD800 && cp <= 0xDBFF)
        {
            kind = CP_HIGH_SURROGATE;
        }
        else if (cp >= 0xDC00 && cp <= 0xDFFF)
        {
            kind = CP_LOW_SURROGATE;
        }
        else if ((cp >= 0xFDD0 && cp <= 0xFDEF) || cp >= 0xFFFE)
        {
            kind = CP_NONCHAR;
        }

        return kind | U8_TRIPLE;
    }

    /* 0x0080..0x07FF, plus the special-cased code point 0: two bytes. */
    if (cp > 0x7F || cp == 0)
    {
        return CP_CHAR | U8_DOUBLE;
    }

    /* 0x01..0x7F: one byte. */
    return CP_CHAR | U8_SINGLE;
}
/*
 * Encode the code point `cp` as UTF-8 into the buffer at `bytes`.
 *
 * bytes: destination buffer; up to four bytes are written with no bounds
 *        check and no terminator, so it must have room for four bytes.
 * cp:    code point to encode.
 *
 * Returns a pointer just past the last byte written.  Returns NULL without
 * writing anything when classify() does not report `cp` as CP_CHAR
 * (surrogates, noncharacters, values above 0x10FFFF).
 *
 * Because classify() reports code point 0 as a two-byte character, it is
 * emitted as 0xC0 0x80 rather than as a single 0x00 byte.
 */
void *utf8_encode(void *bytes, uint32_t cp)
{
    const unsigned kind = classify(cp);
    uint8_t *out = bytes;

    /* Anything that is not an ordinary character is refused. */
    if ((kind & CP_CHAR) == 0)
    {
        return NULL;
    }

    /* classify() pairs CP_CHAR with exactly one length bit. */
    switch (kind & (U8_SINGLE | U8_DOUBLE | U8_TRIPLE | U8_QUAD))
    {
    case U8_SINGLE:
        /* 0xxxxxxx */
        *out++ = (uint8_t)cp;
        break;

    case U8_DOUBLE:
        /* 110xxxxx 10xxxxxx */
        *out++ = (uint8_t)(0xC0 | (cp >> 6));
        *out++ = (uint8_t)(0x80 | (cp & 0x3F));
        break;

    case U8_TRIPLE:
        /* 1110xxxx 10xxxxxx 10xxxxxx */
        *out++ = (uint8_t)(0xE0 | (cp >> 12));
        *out++ = (uint8_t)(0x80 | ((cp >> 6) & 0x3F));
        *out++ = (uint8_t)(0x80 | (cp & 0x3F));
        break;

    case U8_QUAD:
        /* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
        *out++ = (uint8_t)(0xF0 | (cp >> 18));
        *out++ = (uint8_t)(0x80 | ((cp >> 12) & 0x3F));
        *out++ = (uint8_t)(0x80 | ((cp >> 6) & 0x3F));
        *out++ = (uint8_t)(0x80 | (cp & 0x3F));
        break;

    default:
        /* No length bit set: nothing to emit. */
        return NULL;
    }

    return out;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment