Skip to content

Instantly share code, notes, and snippets.

@ozdemirburak
Last active April 30, 2021 11:41
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save ozdemirburak/89a7a1673cb65ce83469 to your computer and use it in GitHub Desktop.
Save ozdemirburak/89a7a1673cb65ce83469 to your computer and use it in GitHub Desktop.
UTF8, UTF16, UTF32, ISO8859 conversions
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <errno.h>
#include <stdint.h>
#include <stddef.h>
/* Forward declarations of the functions defined further down in this file. */
/* Buffer, file and file-name helpers. */
int getSize(unsigned char *);
unsigned char * fileToBuffer(unsigned char *);
unsigned char * replaceCharset(const char *, const char *, const char *);
unsigned char * extractCharset(const char *);
const char *getFileExtension(const char *);
const char *getBeforeExtension(const char *);
/* UTF-8 primitives: sequence length, validation, decoding of one sequence. */
size_t utf8_charlen(uint8_t);
size_t utf8_valid(const uint8_t *);
uint32_t utf8_to_32(const uint8_t *);
/* Charset converters: (input buffer, output buffer[, ISO-8859 part number]). */
unsigned char * UTF8_TO_UTF32(unsigned char*, unsigned char*);
unsigned char * UTF32_TO_UTF8(unsigned char*, unsigned char*);
unsigned char * UTF8_TO_UTF16(unsigned char*, unsigned char*);
unsigned char * UTF16_TO_UTF8(unsigned char*, unsigned char*);
unsigned char * ISO8859_TO_UTF8(unsigned char*, unsigned char*, unsigned int);
unsigned char * UTF8_TO_ISO8859(unsigned char*, unsigned char*, int);
/* Conversion dispatcher (input, output, source charset name, target charset name). */
unsigned char * convert(unsigned char*, unsigned char*, const char*, const char*);
/* File output: (file name, data, number of bytes) and (input file, output file, charset). */
void writeImageFromBuffer(const char* , const char*, unsigned long);
void createFile(unsigned char*, unsigned char*, unsigned char *);
/*
 * Returns the number of bytes in `data` that precede the first zero byte,
 * i.e. the same count strlen() gives for a char string.
 *
 * A NULL `data` is treated as an empty buffer and yields 0, so the result of
 * a failed fileToBuffer() call can be measured without crashing.
 */
int getSize(unsigned char * data)
{
    int datasize = 0;

    if (data == NULL)
    {
        return 0;
    }
    while (data[datasize] != '\0')
    {
        datasize++;
    }
    return datasize;
}
/*
 * Reads the whole file named `filename` into a newly allocated buffer.
 *
 * The buffer holds the file's bytes followed by one '\0', so it can be
 * scanned like a C string. The caller owns the buffer and must free() it.
 *
 * Returns NULL when the file cannot be opened, its size cannot be determined,
 * memory cannot be allocated, or the read fails. An empty file yields an
 * empty (zero-length) string, not NULL.
 */
unsigned char * fileToBuffer(unsigned char* filename)
{
    unsigned char *source = NULL;
    long bufsize;
    size_t newLen;
    /* Binary mode: the bytes are converted later, so they must arrive untranslated. */
    FILE *fp = fopen((const char *)filename, "rb");

    if (fp == NULL)
    {
        printf("Error occured in fileToBuffer function while trying to access to file \n");
        return NULL; /* nothing was opened, so there is nothing to close */
    }

    /* Seek to the end to learn the file size. */
    if (fseek(fp, 0L, SEEK_END) != 0)
    {
        goto done;
    }
    bufsize = ftell(fp);
    if (bufsize < 0)
    {
        printf("Error occured in fileToBuffer function while trying to get size \n");
        goto done;
    }
    if (fseek(fp, 0L, SEEK_SET) != 0)
    {
        printf("Error occured in fileToBuffer function while trying to go to start \n");
        goto done;
    }

    /* One extra byte for the terminator. */
    source = malloc((size_t)bufsize + 1);
    if (source == NULL)
    {
        goto done;
    }

    newLen = fread(source, 1, (size_t)bufsize, fp);
    if (ferror(fp))
    {
        fputs("Error reading file", stderr);
        free(source);
        source = NULL;
        goto done;
    }
    /* Terminate directly after the last byte actually read. */
    source[newLen] = '\0';

done:
    fclose(fp);
    return source;
}
/*
 * Returns a newly allocated copy of `str` in which every non-overlapping
 * occurrence of `old` (found left to right) is replaced by `new`.
 *
 * An empty `old` matches nothing, so the result is then a plain copy of
 * `str`. Returns NULL if memory cannot be allocated. The caller frees the
 * result.
 */
unsigned char *replaceCharset(const char *str, const char *old, const char *new)
{
    const size_t oldlen = strlen(old);
    const size_t newlen = strlen(new);
    size_t count = 0;
    size_t i;
    const char *p;
    const char *q;
    char *ret;
    char *r;

    /*
     * strstr() reports an empty needle at every position without advancing,
     * so counting occurrences of "" would never terminate.
     */
    if (oldlen != 0)
    {
        for (p = str; (q = strstr(p, old)) != NULL; p = q + oldlen)
        {
            count++;
        }
    }

    /* Each occurrence swaps oldlen bytes for newlen bytes. */
    ret = malloc(strlen(str) - count * oldlen + count * newlen + 1);
    if (ret == NULL)
    {
        return NULL;
    }

    r = ret;
    p = str;
    for (i = 0; i < count; i++)
    {
        size_t prefix;

        q = strstr(p, old); /* cannot fail: this occurrence was counted above */
        prefix = (size_t)(q - p);
        memcpy(r, p, prefix);
        r += prefix;
        memcpy(r, new, newlen);
        r += newlen;
        p = q + oldlen;
    }
    /* Copy whatever follows the last occurrence, terminator included. */
    strcpy(r, p);
    return (unsigned char *)ret;
}
/*
 * Picks a charset name by searching `str` for known markers.
 *
 * The markers are tried in table order and the first one found anywhere in
 * `str` decides the result. When none is present the answer is "utf-8".
 * The returned pointer refers to a string literal and must not be freed or
 * modified.
 */
unsigned char *extractCharset(const char *str)
{
    static const struct
    {
        const char *marker;
        const char *name;
    } known[] = {
        { "utf8",       "utf-8" },
        { "utf-8",      "utf-8" },
        { "utf16",      "utf-16" },
        { "utf-16",     "utf-16" },
        { "utf32",      "utf-32" },
        { "utf-32",     "utf-32" },
        { "iso-8859-1", "iso-8859-1" },
        { "88591",      "iso-8859-1" },
        { "iso-8859-9", "iso-8859-9" },
        { "88599",      "iso-8859-9" },
    };
    size_t i;

    for (i = 0; i < sizeof known / sizeof known[0]; i++)
    {
        if (strstr(str, known[i].marker) != NULL)
        {
            return (unsigned char *)known[i].name;
        }
    }
    /* No marker present: fall back to utf-8. */
    return (unsigned char *)"utf-8";
}
/*
 * Returns the part of `filename` that starts at its last '.', dot included.
 * The result points into `filename` itself. When there is no '.', an empty
 * string literal is returned instead.
 */
const char *getFileExtension(const char *filename)
{
    const char *dot = strrchr(filename, '.');

    return (dot != NULL) ? dot : "";
}
/*
 * Returns a newly allocated, NUL-terminated copy of the part of `filename`
 * that precedes its FIRST '.'. When the name contains no '.', the whole name
 * is copied.
 *
 * The caller owns the result and must free() it. Returns NULL if memory
 * cannot be allocated.
 */
const char *getBeforeExtension(const char *filename)
{
    /* Length of the leading run without '.'; stops at the terminator too. */
    const size_t stem = strcspn(filename, ".");
    char *copy = malloc(stem + 1);

    if (copy == NULL)
    {
        return NULL;
    }
    memcpy(copy, filename, stem);
    copy[stem] = '\0';
    return copy;
}
/*
 * Returns how many bytes a UTF-8 sequence occupies, judged from its first
 * byte `c` alone, or 0 when `c` cannot start a sequence.
 *
 *   0x00-0x7f  -> 1   (0xxxxxxx)
 *   0x80-0xbf  -> 0   (continuation byte, not a lead byte)
 *   0xc0-0xdf  -> 2   (110xxxxx)
 *   0xe0-0xef  -> 3   (1110xxxx)
 *   0xf0-0xf4  -> 4   (11110xxx, capped at 0xf4)
 *   0xf5-0xff  -> 0
 */
size_t utf8_charlen(uint8_t c)
{
    if (c < 0x80)
    {
        return 1;
    }
    if (c < 0xc0)
    {
        return 0;
    }
    if (c < 0xe0)
    {
        return 2;
    }
    if (c < 0xf0)
    {
        return 3;
    }
    if (c <= 0xf4)
    {
        return 4;
    }
    return 0;
}
/*
 * Checks the UTF-8 sequence that starts at `c`.
 *
 * Returns the sequence length (1-4) when the lead byte is acceptable to
 * utf8_charlen() and every following byte has the 10xxxxxx continuation
 * form. Returns 0 otherwise.
 *
 * The continuation bytes are examined nearest-first. A NUL terminator that
 * cuts a sequence short is not a continuation byte, so the scan stops there
 * and never reads past the end of a NUL-terminated buffer.
 */
size_t utf8_valid(const uint8_t *c)
{
    const size_t clen = utf8_charlen(*c);
    size_t i;

    /* clen == 0 (bad lead byte) and clen == 1 both skip the loop. */
    for (i = 1; i < clen; i++)
    {
        if ((c[i] & 0xc0) != 0x80)
        {
            return 0;
        }
    }
    return clen;
}
/*
 * Decodes the single UTF-8 sequence starting at `c` into its code point.
 *
 * The sequence is first checked with utf8_valid(). A sequence it rejects
 * decodes to 0. Otherwise the payload bits of the lead byte are combined
 * with the low six bits of each following byte, most significant first.
 */
uint32_t utf8_to_32(const uint8_t *c)
{
    /* Payload mask of the lead byte, indexed by sequence length. */
    static const uint8_t lead_mask[5] = { 0x00, 0xff, 0x1f, 0x0f, 0x07 };
    const size_t len = utf8_valid(c);
    uint32_t cp;
    size_t i;

    if (len == 0 || len > 4)
    {
        return 0;
    }
    cp = (uint32_t)(c[0] & lead_mask[len]);
    for (i = 1; i < len; i++)
    {
        cp = (cp << 6) | (uint32_t)(c[i] & 0x3f);
    }
    return cp;
}
/*
 * Walks the NUL-terminated UTF-8 text in `in`, decodes each sequence with
 * utf8_to_32() and stores one element per decoded character in `out`.
 * Returns `out` (the start of the written data).
 *
 * Bytes that do not begin a sequence accepted by utf8_valid() are skipped
 * one at a time and produce no output, so malformed input can neither stall
 * the loop nor push the read position past the terminator.
 *
 * No terminator is written. `out` needs room for one byte per decoded
 * character, which is never more than the length of `in`.
 *
 * NOTE(review): `out` is a byte pointer, so only the low 8 bits of every
 * code point are stored. The result is not real UTF-32 for anything above
 * U+00FF. Fixing that needs a wider destination type in the interface.
 */
unsigned char* UTF8_TO_UTF32(unsigned char* in, unsigned char* out)
{
    unsigned char *const start = out;
    unsigned char *c = in;

    while (*c)
    {
        const size_t len = utf8_valid(c);

        if (len == 0)
        {
            c++; /* malformed byte: step over it */
            continue;
        }
        *out++ = (unsigned char)utf8_to_32(c);
        c += len;
    }
    return start;
}
/*
 * Copies the NUL-terminated data in `in` to `out`, expanding every element
 * of 0x80 or above into the two-byte UTF-8 form 110xxxxx 10xxxxxx.
 * Elements below 0x80 are copied unchanged. Returns `out`.
 *
 * No terminator is written. `out` needs up to two bytes per input byte.
 *
 * NOTE(review): the elements of `in` are single bytes, so no value above
 * 0xFF is ever seen and longer UTF-8 forms can never be produced here.
 * Reading real 32-bit code points would need a wider input type in the
 * interface.
 */
unsigned char* UTF32_TO_UTF8(unsigned char* in, unsigned char* out)
{
    unsigned char *const first = out;

    for (; *in != 0; in++)
    {
        const unsigned int value = *in;

        if (value < 0x80)
        {
            *out++ = (unsigned char)value;
        }
        else
        {
            *out++ = (unsigned char)(0xc0 + (value >> 6));   /* top two bits */
            *out++ = (unsigned char)(0x80 + (value & 0x3f)); /* low six bits */
        }
    }
    return first;
}
/*
 * Copies the NUL-terminated bytes of `in` to `out` unchanged and returns
 * `out`. No terminator is written. `out` needs one byte per input byte.
 *
 * NOTE(review): no UTF-8 decoding and no surrogate-pair encoding happens
 * here. Every element read from `in` is a single byte, which can never
 * reach 0x10000, so each one is simply passed through. Producing real
 * UTF-16 would need decoded code points and a 16-bit destination.
 */
unsigned char* UTF8_TO_UTF16(unsigned char* in, unsigned char* out)
{
    unsigned char *const first = out;
    const unsigned char *src;

    for (src = in; *src != 0; src++)
    {
        *out++ = *src;
    }
    return first;
}
/*
 * Converts the NUL-terminated data in `in` to UTF-8 in `out`, one input
 * element at a time, and returns `out`.
 *
 * Elements 0x01-0x7f are copied unchanged. Every other element is written
 * as the two-byte form 110xxxxx 10xxxxxx. Each element is read exactly once,
 * so both output bytes describe the same input element and the read
 * position can never step over the terminating NUL.
 *
 * No terminator is written. `out` needs up to two bytes per input byte.
 *
 * NOTE(review): the elements of `in` are single bytes, so values above 0xFF
 * (three-byte UTF-8 forms, surrogate pairs) cannot occur. Handling real
 * UTF-16 code units would need a 16-bit input type in the interface.
 */
unsigned char* UTF16_TO_UTF8(unsigned char* in, unsigned char* out)
{
    unsigned char *const start = out;

    while (*in)
    {
        const unsigned int unit = *in++;

        if (unit <= 0x007f)
        {
            *out++ = (unsigned char)unit;
        }
        else
        {
            *out++ = (unsigned char)(0xc0 | ((unit >> 6) & 0x1f));
            *out++ = (unsigned char)(0x80 | (unit & 0x3f));
        }
    }
    return start;
}
/*
UTF-8 encoding layout:
Bits of CP   First     Last       Bytes   Byte 1     Byte 2     Byte 3     Byte 4
 7           U+0000    U+007F     1       0xxxxxxx   -          -          -
11           U+0080    U+07FF     2       110xxxxx   10xxxxxx   -          -
16           U+0800    U+FFFF     3       1110xxxx   10xxxxxx   10xxxxxx   -
21           U+10000   U+1FFFFF   4       11110xxx   10xxxxxx   10xxxxxx   10xxxxxx
(The 4-byte bit pattern can hold values up to U+1FFFFF, but Unicode itself ends at U+10FFFF.)
*/
/*
 * Converts NUL-terminated ISO-8859 text in `in` to UTF-8 in `out` and
 * returns `out`.
 *
 * version == 9 selects ISO-8859-9 (Turkish). Any other value is treated as
 * ISO-8859-1. The two parts differ in six positions only:
 *
 *   Position   0xD0     0xDD     0xDE     0xF0     0xFD     0xFE
 *   8859-9     U+011E   U+0130   U+015E   U+011F   U+0131   U+015F
 *   8859-1     U+00D0   U+00DD   U+00DE   U+00F0   U+00FD   U+00FE
 *
 *   U+011E -> c4 9e    U+011F -> c4 9f    U+0130 -> c4 b0
 *   U+0131 -> c4 b1    U+015E -> c5 9e    U+015F -> c5 9f
 *
 * Bytes below 0x80 are copied unchanged. Every other byte is mapped to its
 * code point and written as one two-byte UTF-8 sequence. Exactly one input
 * byte is consumed per character.
 *
 * For version 9 only, a byte pair that already is the UTF-8 form of one of
 * the six Turkish letters above is copied through unchanged. This is a
 * heuristic: the pairs c4 b0 and c4 b1 are also valid ISO-8859-9 text, so
 * that text is left unconverted.
 *
 * No terminator is written. `out` needs up to two bytes per input byte.
 */
unsigned char* ISO8859_TO_UTF8(unsigned char* in, unsigned char* out, unsigned int version)
{
    unsigned char *const start = out;

    while (*in)
    {
        unsigned int cp = *in;

        if (cp < 0x80)
        {
            /* ASCII: identical in every charset handled here. */
            *out++ = *in++;
            continue;
        }

        if (version == 9)
        {
            /*
             * in[0] is non-zero here, so in[1] is at worst the terminator
             * and still inside the buffer.
             */
            const unsigned int next = in[1];

            if ((cp == 0xC4 && (next == 0x9E || next == 0x9F || next == 0xB0 || next == 0xB1)) ||
                (cp == 0xC5 && (next == 0x9E || next == 0x9F)))
            {
                /* Already UTF-8 for a Turkish letter: keep both bytes. */
                *out++ = *in++;
                *out++ = *in++;
                continue;
            }

            switch (cp)
            {
            case 0xD0: cp = 0x011E; break;
            case 0xF0: cp = 0x011F; break;
            case 0xDD: cp = 0x0130; break;
            case 0xFD: cp = 0x0131; break;
            case 0xDE: cp = 0x015E; break;
            case 0xFE: cp = 0x015F; break;
            default: break; /* same code point as in ISO-8859-1 */
            }
        }

        /* cp is in 0x80..0x7FF, i.e. the 110xxxxx 10xxxxxx form. */
        *out++ = (unsigned char)(0xc0 | (cp >> 6));
        *out++ = (unsigned char)(0x80 | (cp & 0x3f));
        in++;
    }
    return start;
}
/*
 * Decodes the UTF-8 sequence starting at `s` (which must not be the
 * terminator). Stores the code point in `*cp`, or -1 when the sequence is
 * malformed, truncated or overlong. Returns the number of bytes consumed,
 * always at least 1 and never reaching past a NUL terminator.
 */
static size_t iso8859_decode_utf8(const unsigned char *s, long *cp)
{
    size_t extra;
    size_t i;
    long value;
    long minimum;

    if (s[0] < 0x80)
    {
        *cp = s[0];
        return 1;
    }
    if ((s[0] & 0xe0) == 0xc0)
    {
        extra = 1; value = s[0] & 0x1f; minimum = 0x80;
    }
    else if ((s[0] & 0xf0) == 0xe0)
    {
        extra = 2; value = s[0] & 0x0f; minimum = 0x800;
    }
    else if ((s[0] & 0xf8) == 0xf0)
    {
        extra = 3; value = s[0] & 0x07; minimum = 0x10000;
    }
    else
    {
        /* Stray continuation byte or invalid lead byte. */
        *cp = -1;
        return 1;
    }

    for (i = 1; i <= extra; i++)
    {
        if ((s[i] & 0xc0) != 0x80)
        {
            /* Leave s[i] (possibly the terminator) for the caller. */
            *cp = -1;
            return i;
        }
        value = (value << 6) | (s[i] & 0x3f);
    }
    *cp = (value >= minimum) ? value : -1;
    return extra + 1;
}

/*
 * Returns the ISO-8859 byte for code point `cp` in part `version`
 * (9 = ISO-8859-9, anything else = ISO-8859-1), or -1 when that part has
 * no such character.
 */
static int iso8859_byte_for(long cp, int version)
{
    if (version == 9)
    {
        switch (cp)
        {
        case 0x011E: return 0xD0;
        case 0x011F: return 0xF0;
        case 0x0130: return 0xDD;
        case 0x0131: return 0xFD;
        case 0x015E: return 0xDE;
        case 0x015F: return 0xFE;
        /* These six Latin-1 letters were replaced by the Turkish ones above. */
        case 0x00D0:
        case 0x00F0:
        case 0x00DD:
        case 0x00FD:
        case 0x00DE:
        case 0x00FE:
            return -1;
        default:
            break;
        }
    }
    return (cp >= 0 && cp <= 0xFF) ? (int)cp : -1;
}

/*
 * Converts NUL-terminated UTF-8 text in `in` to ISO-8859 in `out` and
 * returns `out`.
 *
 * version == 9 selects ISO-8859-9 (Turkish). Any other value is treated as
 * ISO-8859-1. Each UTF-8 sequence becomes exactly one output byte. A
 * character the target part cannot represent, or a malformed sequence, is
 * written as '?'.
 *
 * No terminator is written. The output is never longer than the input.
 */
unsigned char* UTF8_TO_ISO8859(unsigned char* in, unsigned char* out, int version)
{
    unsigned char *const start = out;

    while (*in)
    {
        long cp;
        int byte;

        in += iso8859_decode_utf8(in, &cp);
        byte = iso8859_byte_for(cp, version);
        *out++ = (byte >= 0) ? (unsigned char)byte : (unsigned char)'?';
    }
    return start;
}
/* Charsets convert() knows by name. Every other name is CONVERT_UNKNOWN. */
enum convert_charset
{
    CONVERT_UTF8,
    CONVERT_UTF16,
    CONVERT_UTF32,
    CONVERT_ISO8859_1,
    CONVERT_ISO8859_9,
    CONVERT_UNKNOWN
};

/* Maps a charset name (exact, lower-case spelling) to its enum value. */
static enum convert_charset convert_charset_id(const char *name)
{
    if (strcmp(name, "utf-8") == 0)      return CONVERT_UTF8;
    if (strcmp(name, "utf-16") == 0)     return CONVERT_UTF16;
    if (strcmp(name, "utf-32") == 0)     return CONVERT_UTF32;
    if (strcmp(name, "iso-8859-1") == 0) return CONVERT_ISO8859_1;
    if (strcmp(name, "iso-8859-9") == 0) return CONVERT_ISO8859_9;
    return CONVERT_UNKNOWN;
}

/* Runs the converter that turns `from` data into UTF-8. */
static unsigned char *convert_to_utf8(unsigned char *in, unsigned char *out, enum convert_charset from)
{
    switch (from)
    {
    case CONVERT_UTF16:     return UTF16_TO_UTF8(in, out);
    case CONVERT_UTF32:     return UTF32_TO_UTF8(in, out);
    case CONVERT_ISO8859_1: return ISO8859_TO_UTF8(in, out, 1);
    case CONVERT_ISO8859_9: return ISO8859_TO_UTF8(in, out, 9);
    default:                return in;
    }
}

/* Runs the converter that turns UTF-8 data into `to`. */
static unsigned char *convert_from_utf8(unsigned char *in, unsigned char *out, enum convert_charset to)
{
    switch (to)
    {
    case CONVERT_UTF16:     return UTF8_TO_UTF16(in, out);
    case CONVERT_UTF32:     return UTF8_TO_UTF32(in, out);
    case CONVERT_ISO8859_1: return UTF8_TO_ISO8859(in, out, 1);
    case CONVERT_ISO8859_9: return UTF8_TO_ISO8859(in, out, 9);
    default:                return in;
    }
}

/*
 * Converts the NUL-terminated data in `in` from charset `from` to charset
 * `to`, writing into `out`.
 *
 * Known names are "utf-8", "utf-16", "utf-32", "iso-8859-1" and
 * "iso-8859-9". When either side is "utf-8" a single converter runs.
 * Otherwise the data is first converted to UTF-8 in a temporary buffer and
 * that intermediate result is then converted to the target charset.
 *
 * Returns the start of the converted data (`out`). When the pair is not a
 * supported conversion (unknown name, or `from` equals `to`) nothing is
 * written and `in` is returned. Returns NULL if the temporary buffer cannot
 * be allocated.
 *
 * `out` must be large enough for the converter that writes to it.
 * NOTE(review): the converters do not write a terminator, so callers that
 * scan the result for '\0' should pass a zero-filled `out`.
 */
unsigned char * convert(unsigned char* in, unsigned char* out, const char* from, const char* to)
{
    const enum convert_charset src = convert_charset_id(from);
    const enum convert_charset dst = convert_charset_id(to);
    unsigned char *utf8;
    unsigned char *result;

    if (src == CONVERT_UNKNOWN || dst == CONVERT_UNKNOWN || src == dst)
    {
        return in;
    }
    if (src == CONVERT_UTF8)
    {
        return convert_from_utf8(in, out, dst);
    }
    if (dst == CONVERT_UTF8)
    {
        return convert_to_utf8(in, out, src);
    }

    /*
     * Two stages. The intermediate UTF-8 lives in its own zero-filled
     * buffer (4 bytes per input byte plus a terminator) so the second stage
     * reads a properly terminated string.
     */
    utf8 = calloc(strlen((const char *)in) + 1, 4);
    if (utf8 == NULL)
    {
        return NULL;
    }
    convert_to_utf8(in, utf8, src);
    result = convert_from_utf8(utf8, out, dst);
    free(utf8);
    return result;
}
/*
 * Writes the first `length` bytes of `buffer` to the file `filename`,
 * creating or truncating it. The data is written in binary mode, in chunks
 * of at most 1 MiB, straight from `buffer`.
 *
 * Failures to open, write or close the file are reported on stderr. The
 * function returns nothing, so callers cannot detect them.
 */
void writeImageFromBuffer ( const char* filename , const char* buffer , unsigned long length )
{
    const unsigned long REGULARPACKAGE_SIZE = 1024UL * 1024UL;
    unsigned long bytesWritten = 0; /* bytes written so far */
    FILE *image = fopen ( filename , "wb" );

    if ( image == NULL )
    {
        fprintf ( stderr , "writeImageFromBuffer: cannot open %s for writing\n" , filename );
        return;
    }

    while ( bytesWritten < length )
    {
        const unsigned long remaining = length - bytesWritten;
        const size_t bytesToWrite = (size_t) ( remaining >= REGULARPACKAGE_SIZE ? REGULARPACKAGE_SIZE : remaining );
        const size_t done = fwrite ( buffer + bytesWritten , 1 , bytesToWrite , image );

        bytesWritten += done;
        if ( done < bytesToWrite )
        {
            /* A short write means an error. Stop instead of looping forever. */
            fprintf ( stderr , "writeImageFromBuffer: write to %s failed\n" , filename );
            break;
        }
    }

    /* fclose flushes buffered data, so its result matters for a written file. */
    if ( fclose ( image ) != 0 )
    {
        fprintf ( stderr , "writeImageFromBuffer: closing %s failed\n" , filename );
    }
}
/*
 * Reads `infile`, converts its contents from "utf-8" to "iso-8859-9" and
 * writes the result to `outfile`.
 *
 * Nothing is written when the input file cannot be read, memory cannot be
 * allocated, or the conversion returns NULL. Both working buffers are freed
 * before returning.
 *
 * NOTE(review): `charset` is not used. The conversion pair is hard-coded.
 * Confirm whether it was meant to select the source or the target charset.
 */
void createFile(unsigned char* infile, unsigned char* outfile, unsigned char * charset)
{
    unsigned char *in;
    unsigned char *out;
    unsigned char *converted;
    int datasize;

    (void)charset;

    in = fileToBuffer(infile); /* whole input file, NUL-terminated */
    if (in == NULL)
    {
        return;
    }
    datasize = getSize(in);

    /*
     * UTF-8 -> ISO-8859 never produces more bytes than it reads. The extra
     * zero-filled byte guarantees the result is terminated, so getSize()
     * below measures it safely.
     */
    out = calloc((size_t)datasize + 1, 1);
    if (out == NULL)
    {
        free(in);
        return;
    }

    converted = convert(in, out, "utf-8", "iso-8859-9");
    if (converted != NULL)
    {
        writeImageFromBuffer((const char *)outfile, (const char *)converted, (unsigned long)getSize(converted));
    }

    /* `converted` aliases `in` or `out`, so freeing those two is enough. */
    free(out);
    free(in);
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment