Skip to content

Instantly share code, notes, and snippets.

@dobrokot
Created August 6, 2013 19:07
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save dobrokot/6167587 to your computer and use it in GitHub Desktop.
Save dobrokot/6167587 to your computer and use it in GitHub Desktop.
faster conversion to unicode from UTF-8 for python 2.*
#include <Python.h>
#include <stdlib.h>
#include <stdio.h>
#define UTF8_ACCEPT 0
#define UTF8_REJECT 1
#include <stdint.h>
// Combined lookup table driving the UTF-8 decoding automaton.
//
// Layout, as used by the lookups in this file:
//   * entries 0..255   : indexed by an input byte, give that byte's character class;
//   * entries 256..    : state-transition table, 16 entries per state, read as
//                        utf8d[256 + state*16 + class] to obtain the next state.
// State 0 is UTF8_ACCEPT and state 1 is UTF8_REJECT.
static const uint8_t utf8d[] = {
// Copyright (c) 2008-2009 Bjoern Hoehrmann <bjoern@hoehrmann.de>
// See http://bjoern.hoehrmann.de/utf-8/decoder/dfa/ for details.
// --- byte -> character class (trailing comments give the byte range of each row) ---
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 00..1f
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 20..3f
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 40..5f
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 60..7f
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, // 80..9f
7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, // a0..bf
8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, // c0..df
0xa,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x4,0x3,0x3, // e0..ef
0xb,0x6,0x6,0x6,0x5,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8, // f0..ff
// --- (state, class) -> next state; trailing comments name the state rows (s0 = accept, s1 = reject) ---
0x0,0x1,0x2,0x3,0x5,0x8,0x7,0x1,0x1,0x1,0x4,0x6,0x1,0x1,0x1,0x1, // s0..s0
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,1,1,1,1,1,0,1,0,1,1,1,1,1,1, // s1..s2
1,2,1,1,1,1,1,2,1,2,1,1,1,1,1,1,1,1,1,1,1,1,1,2,1,1,1,1,1,1,1,1, // s3..s4
1,2,1,1,1,1,1,1,1,2,1,1,1,1,1,1,1,1,1,1,1,1,1,3,1,3,1,1,1,1,1,1, // s5..s6
1,3,1,1,1,1,1,3,1,3,1,1,1,1,1,1,1,3,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // s7..s8
};
// Advance the UTF-8 automaton by one input byte while accumulating the code point.
//
//   state : automaton state before this byte (UTF8_ACCEPT at a character boundary)
//   codep : in/out accumulator; holds the complete code point once the
//           returned state is UTF8_ACCEPT
//   byte  : the next input byte (0..255)
//
// Returns the automaton state after consuming `byte`.
static inline uint32_t decode(uint32_t state, uint32_t *codep, uint32_t byte) {
    const uint32_t char_class = utf8d[byte];

    if (state == UTF8_ACCEPT) {
        // First byte of a character: the class doubles as the shift that
        // strips the length-marker bits from the byte.
        *codep = (0xff >> char_class) & (byte);
    } else {
        // Inside a multi-byte character: append this byte's low six bits.
        *codep = (byte & 0x3fu) | (*codep << 6);
    }

    return utf8d[256 + state * 16 + char_class];
}
// Advance the UTF-8 automaton by one input byte without building a code point.
//
//   state : automaton state before this byte
//   byte  : the next input byte (0..255)
//
// Returns the automaton state after consuming `byte`.
static inline uint32_t validate(uint32_t state, uint32_t byte) {
    const uint32_t char_class = utf8d[byte];
    const uint32_t transition_index = 256 + state * 16 + char_class;
    return utf8d[transition_index];
}
// decode_utf8(s) -> unicode
//
// Python-callable entry point: decodes the bytes of an exact `str` object as
// UTF-8 and returns a new unicode object.
//
//   self : unused
//   args : argument tuple holding a single object, which must be an exact str
//
// Returns a new reference to the decoded unicode object, or NULL with a
// Python exception set:
//   * TypeError if the argument is not an exact str,
//   * TypeError if the bytes are not valid UTF-8 (including input that ends
//     in the middle of a character),
//   * whatever PyUnicode_FromUnicode set if the result could not be allocated.
//
// The input is scanned twice: once to validate it and size the result, then
// once to fill the result in.
static PyObject* decode_utf8(PyObject *self, PyObject *args)
{
    PyObject *pystr;
    (void)self;
    if (!PyArg_ParseTuple(args, "O:decode_utf8", &pystr))
        return NULL;
    if (!PyString_CheckExact(pystr))
    {
        PyErr_Format(PyExc_TypeError, "string expected ('%s' given)", pystr->ob_type->tp_name);
        return NULL;
    }
    Py_ssize_t in_size = PyString_Size(pystr);
    const char *in_data = PyString_AsString(pystr);

    // Pass 1: validate the input and count the Py_UNICODE units needed.
    Py_ssize_t out_size = 0;
    {
        uint32_t st = UTF8_ACCEPT;
        for (Py_ssize_t i = 0; i < in_size; ++i) {
            const unsigned char byte = (unsigned char)in_data[i];
#ifndef Py_UNICODE_WIDE
            // Narrow build (16-bit Py_UNICODE): a four-byte sequence (lead
            // byte >= 0xF0) encodes a code point above U+FFFF, which is stored
            // as a surrogate pair and therefore needs one extra unit. Counting
            // it for a sequence that later proves invalid is harmless because
            // the whole call then fails below.
            if (st == UTF8_ACCEPT && byte >= 0xF0)
                ++out_size;
#endif
            st = validate(st, byte);
            out_size += (st == UTF8_ACCEPT);
        }
        // Anything other than ACCEPT here means malformed or truncated input.
        if (st != UTF8_ACCEPT) {
            PyErr_Format(PyExc_TypeError, "invalid UTF-8 in function decode_utf8");
            return NULL;
        }
    }

    // Pass 2: allocate the result and decode into its buffer.
    PyObject *result_obj = PyUnicode_FromUnicode(NULL, out_size);
    if (result_obj == NULL)
        return NULL; // allocation failed; the exception is already set
    Py_UNICODE *out_data = PyUnicode_AS_UNICODE(result_obj);
    Py_ssize_t out_i = 0;
    uint32_t st = UTF8_ACCEPT;
    uint32_t ch = 0;
    for (Py_ssize_t i = 0; i < in_size; ++i) {
        st = decode(st, &ch, (unsigned char)in_data[i]);
        if (st != UTF8_ACCEPT)
            continue; // still in the middle of a multi-byte character
#ifndef Py_UNICODE_WIDE
        if (ch > 0xFFFF) {
            // Split into a UTF-16 surrogate pair instead of truncating.
            const uint32_t offset = ch - 0x10000;
            out_data[out_i++] = (Py_UNICODE)(0xD800 | (offset >> 10));
            out_data[out_i++] = (Py_UNICODE)(0xDC00 | (offset & 0x3FF));
            continue;
        }
#endif
        out_data[out_i++] = (Py_UNICODE)ch;
    }
    return result_obj;
}
// Method table handed to Py_InitModule3: the functions this module exports.
static PyMethodDef module_methods[] = {
    {
        "decode_utf8",              // name visible from Python
        (PyCFunction)decode_utf8,   // C implementation
        METH_VARARGS,               // called with a positional-argument tuple
        "decode_utf8 from utf-8 to unicode, faster then default conversion"
    },
    {NULL, NULL, 0, NULL} /* Sentinel */
};
// Module initialisation function for the "fastutf8" extension module.
// Registers the functions listed in module_methods under the module name
// "fastutf8" (the same text is used as the module docstring).
PyMODINIT_FUNC initfastutf8(void) {
    // Nothing further is done with the module object, and a NULL result
    // simply ends the function as well, so the return value is not kept.
    (void)Py_InitModule3("fastutf8", module_methods, "fastutf8");
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment