Created
August 6, 2013 19:07
-
-
Save dobrokot/6167587 to your computer and use it in GitHub Desktop.
Faster conversion from UTF-8 to unicode for Python 2.*
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <Python.h> | |
#include <stdlib.h> | |
#include <stdio.h> | |
/*
 * States of the UTF-8 decoding DFA that callers compare against.
 * UTF8_ACCEPT (0) is the start state and is re-entered each time a complete
 * code point has been consumed; UTF8_REJECT (1) is the state whose transition
 * row maps every input back to itself, i.e. the input is not valid UTF-8.
 */
#define UTF8_ACCEPT 0
#define UTF8_REJECT 1
#include <stdint.h>
/*
 * Combined lookup table for the UTF-8 decoding DFA.
 *
 * Entries 0..255 map an input byte to its character class.
 * Entries from 256 onwards are the transition table, 16 entries per state,
 * looked up as utf8d[256 + state*16 + class].
 */
static const uint8_t utf8d[] = {
  // Copyright (c) 2008-2009 Bjoern Hoehrmann <bjoern@hoehrmann.de>
  // See http://bjoern.hoehrmann.de/utf-8/decoder/dfa/ for details.
  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 00..1f
  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 20..3f
  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 40..5f
  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 60..7f
  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, // 80..9f
  7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, // a0..bf
  8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, // c0..df
  0xa,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x4,0x3,0x3, // e0..ef
  0xb,0x6,0x6,0x6,0x5,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8, // f0..ff
  0x0,0x1,0x2,0x3,0x5,0x8,0x7,0x1,0x1,0x1,0x4,0x6,0x1,0x1,0x1,0x1, // s0..s0
  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,1,1,1,1,1,0,1,0,1,1,1,1,1,1, // s1..s2
  1,2,1,1,1,1,1,2,1,2,1,1,1,1,1,1,1,1,1,1,1,1,1,2,1,1,1,1,1,1,1,1, // s3..s4
  1,2,1,1,1,1,1,1,1,2,1,1,1,1,1,1,1,1,1,1,1,1,1,3,1,3,1,1,1,1,1,1, // s5..s6
  1,3,1,1,1,1,1,3,1,3,1,1,1,1,1,1,1,3,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // s7..s8
};
static uint32_t inline decode(uint32_t state, uint32_t* codep, uint32_t byte) { | |
uint32_t type = utf8d[byte]; | |
*codep = (state != UTF8_ACCEPT) ? | |
(byte & 0x3fu) | (*codep << 6) : | |
(0xff >> type) & (byte); | |
return utf8d[256 + state*16 + type]; | |
} | |
static uint32_t inline validate(uint32_t state, uint32_t byte) { | |
return utf8d[256 + state*16 + utf8d[byte]]; | |
} | |
/*
 * decode_utf8(s) -> unicode
 *
 * Python-callable entry point: converts an exact `str` object holding UTF-8
 * bytes into a new unicode object.
 *
 * The input is scanned twice: the first pass validates it and counts how many
 * Py_UNICODE units the result needs, the second pass decodes into the
 * pre-sized result buffer.
 *
 * Returns a new reference, or NULL with an exception set:
 *   - TypeError if the argument is not exactly a str,
 *   - TypeError if the bytes are not valid UTF-8,
 *   - whatever PyUnicode_FromUnicode raised if the result cannot be allocated.
 */
static PyObject* decode_utf8(PyObject *self, PyObject *args)
{
    (void)self; /* unused; named so the definition is also valid pre-C23 C */

    PyObject *pystr;
    if (!PyArg_ParseTuple(args, "O:decode_utf8", &pystr)) {
        return NULL;
    }
    if (!PyString_CheckExact(pystr)) {
        PyErr_Format(PyExc_TypeError, "string expected ('%s' given)", pystr->ob_type->tp_name);
        return NULL;
    }

    Py_ssize_t in_size = PyString_Size(pystr);
    const char *in_data = PyString_AsString(pystr);

    /* Pass 1: validate and count output units. */
    size_t out_size = 0;
    uint32_t st = UTF8_ACCEPT;
    for (Py_ssize_t i = 0; i < in_size; ++i) {
        unsigned char byte = (unsigned char)in_data[i];
        st = validate(st, byte);
        out_size += (st == UTF8_ACCEPT);
#ifndef Py_UNICODE_WIDE
        /* Narrow build: a lead byte >= 0xF0 starts a 4-byte sequence, whose
         * code point is above U+FFFF and needs a second (surrogate) unit.
         * If the sequence turns out invalid the count is discarded anyway. */
        out_size += (byte >= 0xf0);
#endif
    }
    if (st != UTF8_ACCEPT) {
        PyErr_Format(PyExc_TypeError, "invalid UTF-8 in function decode_utf8");
        return NULL;
    }

    /* Allocate an uninitialised unicode object of the exact size. */
    PyObject *result_obj = PyUnicode_FromUnicode(NULL, (Py_ssize_t)out_size);
    if (result_obj == NULL) {
        return NULL; /* exception already set by the allocator */
    }
    Py_UNICODE *out_data = PyUnicode_AS_UNICODE(result_obj);

    /* Pass 2: decode. The input is known to be valid, so st ends in ACCEPT
     * and exactly out_size units are written. */
    size_t out_i = 0;
    uint32_t ch = 0;
    for (Py_ssize_t i = 0; i < in_size; ++i) {
        st = decode(st, &ch, (unsigned char)in_data[i]);
        if (st != UTF8_ACCEPT) {
            continue;
        }
#ifndef Py_UNICODE_WIDE
        if (ch > 0xffffu) {
            /* 16-bit Py_UNICODE: store as a UTF-16 surrogate pair instead of
             * truncating the code point. */
            uint32_t v = ch - 0x10000u;
            out_data[out_i++] = (Py_UNICODE)(0xd800u | (v >> 10));
            out_data[out_i++] = (Py_UNICODE)(0xdc00u | (v & 0x3ffu));
            continue;
        }
#endif
        out_data[out_i++] = (Py_UNICODE)ch;
    }
    return result_obj;
}
static PyMethodDef module_methods[] = { | |
{"decode_utf8", (PyCFunction)decode_utf8, METH_VARARGS, "decode_utf8 from utf-8 to unicode, faster then default conversion" }, | |
{NULL} /* Sentinel */ | |
}; | |
PyMODINIT_FUNC initfastutf8(void) { | |
PyObject *module = Py_InitModule3("fastutf8", module_methods, "fastutf8"); | |
if (module == NULL) | |
return; | |
} | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment