Skip to content

Instantly share code, notes, and snippets.

@colrichie
Created November 21, 2023 10:07
Show Gist options
  • Save colrichie/ed63ec85ba088614594b54e70ea6fd18 to your computer and use it in GitHub Desktop.
Save colrichie/ed63ec85ba088614594b54e70ea6fd18 to your computer and use it in GitHub Desktop.
Simple Textdata Converter from CESU-8 to UTF-8
/*####################################################################
#
# CESU8toUTF8.c - Simple text-data converter from CESU-8 to UTF-8
#
# Usage : cat your_textfile.txt | ./CESU8toUTF8 > converted_text.txt
#
# How to compile me : cc -O3 -o CESU8toUTF8 CESU8toUTF8.c
#
#
# Written by Colonel Richie (@colrichie) on 2023-11-21
#
####################################################################*/
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <locale.h>
#define MAXBUF 1048576
/*
 * find_cesu8 - locate the first CESU-8 surrogate pair in a string
 *
 * pszLine : NUL-terminated byte string to scan (it is not modified).
 *           NULL is treated like an empty string.
 *
 * A pair is the 6-byte pattern
 *     ED [A0-AF] [80-BF]   ED [B0-BF] [80-BF]
 * i.e. a high surrogate (U+D800..U+DBFF) encoded as three bytes,
 * immediately followed by a low surrogate (U+DC00..U+DFFF) encoded as
 * three bytes.
 *
 * Returns the byte offset of the first byte of the pair, or -1 when the
 * string contains no such pair.
 */
int find_cesu8(char* pszLine) {
    const unsigned char *puc = (const unsigned char *)pszLine;
    size_t i;

    if (puc == NULL) {
        return -1;
    }

    /*
     * After a mismatch the scan advances by exactly one byte, so a pair
     * that begins inside a rejected candidate (for example an unpaired
     * high surrogate directly followed by a valid pair) is still found.
     *
     * The && chain stops at the first byte that does not match. The NUL
     * terminator never matches any of the ranges, so no byte beyond the
     * terminator is ever read.
     */
    for (i = 0; puc[i] != 0; i++) {
        if (puc[i]     == 0xED &&
            puc[i + 1] >= 0xA0 && puc[i + 1] <= 0xAF &&
            puc[i + 2] >= 0x80 && puc[i + 2] <= 0xBF &&
            puc[i + 3] == 0xED &&
            puc[i + 4] >= 0xB0 && puc[i + 4] <= 0xBF &&
            puc[i + 5] >= 0x80 && puc[i + 5] <= 0xBF) {
            return (int)i;
        }
    }
    return -1;
}
/*
 * Filter: copies standard input to standard output, replacing every
 * CESU-8 surrogate pair reported by find_cesu8() (6 bytes) with the
 * equivalent 4-byte UTF-8 sequence. All other bytes pass through unchanged.
 *
 * Input is consumed with fgets() in pieces of at most MAXBUF-1 bytes.
 * Known limits of that approach: a pair that straddles two pieces of an
 * over-long line is not converted, and bytes that follow an embedded NUL
 * inside a piece are not written.
 *
 * Returns EXIT_SUCCESS, or EXIT_FAILURE when reading standard input or
 * writing standard output failed.
 */
int main(void) {
    /* static storage: a 1 MiB automatic array can overflow small stacks */
    static char szBuf[MAXBUF];

    /* Kept from the original; the byte-oriented calls below do not rely on it. */
    setlocale(LC_CTYPE, "");

    while (fgets(szBuf, MAXBUF, stdin) != NULL) {
        char *pszBuf = szBuf;
        int iOfs;

        while ((iOfs = find_cesu8(pszBuf)) >= 0) {
            /* Work on unsigned bytes and leave the input buffer untouched. */
            const unsigned char *pucPair = (const unsigned char *)pszBuf + iOfs;
            unsigned char aucUTF8[4];
            unsigned int uHigh;

            /* Bytes in front of the pair are copied verbatim. */
            fwrite(pszBuf, 1, (size_t)iOfs, stdout);

            /*
             * Pair layout:  ED 1010wwww 10xxxxxx  ED 1011yyyy 10zzzzzz
             * Code point :  uuuuu xxxxxx yyyy zzzzzz   with uuuuu = wwww + 1
             * UTF-8      :  11110uuu 10uuxxxx 10xxyyyy 10zzzzzz
             */
            uHigh = (unsigned int)(pucPair[1] & 0x0F) + 1u;   /* 1..16 */
            aucUTF8[0] = (unsigned char)(0xF0 | (uHigh >> 2));
            aucUTF8[1] = (unsigned char)(0x80 | ((uHigh & 0x03) << 4)
                                              | ((pucPair[2] & 0x3C) >> 2));
            aucUTF8[2] = (unsigned char)(0x80 | ((pucPair[2] & 0x03) << 4)
                                              | (pucPair[4] & 0x0F));
            aucUTF8[3] = pucPair[5];
            fwrite(aucUTF8, 1, sizeof aucUTF8, stdout);

            pszBuf += iOfs + 6;
        }

        /* Remainder of this piece contains no further pair. */
        fputs(pszBuf, stdout);

        /* No point in reading more input once the output stream is broken. */
        if (ferror(stdout)) {
            break;
        }
    }

    if (ferror(stdin)) {
        fputs("CESU8toUTF8: error reading standard input\n", stderr);
        return EXIT_FAILURE;
    }
    if (fflush(stdout) == EOF || ferror(stdout)) {
        fputs("CESU8toUTF8: error writing standard output\n", stderr);
        return EXIT_FAILURE;
    }
    return EXIT_SUCCESS;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment