Skip to content

Instantly share code, notes, and snippets.

@tnzk
Created October 22, 2011 13:40
Show Gist options
  • Select an option

  • Save tnzk/1306021 to your computer and use it in GitHub Desktop.

Select an option

Save tnzk/1306021 to your computer and use it in GitHub Desktop.
#include <stdio.h>
#include <string.h>
#include "utf8decoder.h"
int main( int argc, char** argv)
{
char text[] = "日本語テスト彳";
Utf8Decoder u8d(text, (int)strlen(text));
for ( int i = 0; i < u8d.length(); i++ ){
printf("%i: %lx\n", i, u8d.get(i));
}
return 0;
}
#include "utf8decoder.h"
Utf8Decoder::Utf8Decoder( char* s, int n)
{
this->s = s;
this->n = n;
}
int Utf8Decoder::utf8_byte_type(char c)
{
if(( c & 0xc0) == 0x80) return 0; // following byte
if(( c & 0xe0) == 0xc0) return 2;
if(( c & 0xf0) == 0xe0) return 3;
if(( c & 0xf8) == 0xf0) return 4;
if(( c & 0xfc) == 0xf8) return 5;
if(( c & 0xfe) == 0xfc) return 6;
return 1;
}
int Utf8Decoder::get_utf8_bytes( unsigned char* buf, unsigned char* src)
{
int n = utf8_byte_type(*src);
for( int i = 0; i < n; i++) buf[i] = *(src + i);
return n;
}
long int Utf8Decoder::get_unicode_index( unsigned char* buf, int n)
{
long int dst = 0;
if( utf8_byte_type(buf[0]) == 0) return 0;
if( utf8_byte_type(buf[0]) == 1) return buf[0];
for( int i = 0; i < n; i++) {
int bitleft;
unsigned char data = buf[i];
if( utf8_byte_type(data) == 0) {
// following byte
bitleft = 6;
data <<= 2;
} else {
// head byte
bitleft = 7 - n;
data <<= (n + 1);
}
while(bitleft) {
dst |= (data & 0x80) >> 7;
bitleft--;
dst <<= 1;
data <<= 1;
}
}
return dst >> 1;
}
long int Utf8Decoder::get( int i)
{
unsigned char utf8bytes[] = {0,0,0,0,0,0,0};
int c = 0;
int j;
for( j = 0; j < this->n; j++) {
if(utf8_byte_type(s[j])) c++;
if( c > i ) break;
}
int m = this->get_utf8_bytes( utf8bytes, (unsigned char*)(this->s + j));
return get_unicode_index( utf8bytes, m);
}
int Utf8Decoder::length()
{
int c = 0;
for( int i = 0; i < this->n; i++) if(utf8_byte_type(s[i])) c++;
return c;
}
class Utf8Decoder {
public:
char* s;
int n;
Utf8Decoder(char*, int);
long int get(int);
int length();
private:
int utf8_byte_type( char);
int get_utf8_bytes( unsigned char*, unsigned char*);
long int get_unicode_index( unsigned char*, int);
};
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment