Skip to content

Instantly share code, notes, and snippets.

@varnit
Last active August 6, 2021 08:46
Show Gist options
  • Save varnit/11129486 to your computer and use it in GitHub Desktop.
Save varnit/11129486 to your computer and use it in GitHub Desktop.
detect text encoding and convert to utf8
#include <errno.h>
#include <iconv.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include "unicode/uclean.h"
#include "unicode/ucsdet.h"
#define DEST_CHARSET "UTF-8"
#define UNKNOWN_CHARSET_DEFAULT "WINDOWS-1252"
/*
 * Detect the character encoding of `input` with ICU's charset detector and
 * convert it to DEST_CHARSET using iconv.
 *
 * input  - NUL-terminated byte string in an unknown encoding.
 * output - caller-provided buffer of at least strlen(input) * 4 + 1 bytes.
 *          On return it always holds a NUL-terminated string; the string is
 *          empty or partial when the conversion could not be completed.
 *
 * When detection fails or produces no candidate, the input is assumed to be
 * UNKNOWN_CHARSET_DEFAULT. Progress is printed to stdout, errors to stderr.
 */
static void
toutf8(char* input, char* output) {
    const UCharsetMatch** matches = NULL;
    int32_t matchCount = 0;
    const char* input_type = NULL;
    size_t input_len = strlen(input);
    /* One byte of the caller's buffer is held back for the terminator. */
    size_t output_len = input_len * 4;
    UErrorCode status = U_ZERO_ERROR;
    UCharsetDetector *csd;
    iconv_t cd;

    /* Leave a valid (empty) string behind on every early exit. */
    output[0] = '\0';

    /*
     * `status > U_ZERO_ERROR` is the failure test (what ICU's U_FAILURE
     * expands to); negative codes are warnings and must not trigger the
     * fallback.
     */
    csd = ucsdet_open(&status);
    if (csd != NULL && status <= U_ZERO_ERROR) {
        /* ucsdet_setText takes an int32_t length; clamp instead of wrapping. */
        int32_t sample_len = input_len > (size_t)INT32_MAX
                                 ? INT32_MAX
                                 : (int32_t)input_len;

        ucsdet_setText(csd, input, sample_len, &status);
        matches = ucsdet_detectAll(csd, &matchCount, &status);
        /* Only touch matches[0] when the detector really returned one. */
        if (status <= U_ZERO_ERROR && matches != NULL && matchCount > 0) {
            input_type = ucsdet_getName(matches[0], &status);
        }
    }

    if (input_type == NULL || status > U_ZERO_ERROR) {
        printf("failed to detect\n");
        input_type = UNKNOWN_CHARSET_DEFAULT;
    } else {
        printf("detected\n");
    }
    printf("type: %s\n", input_type);

    cd = iconv_open(DEST_CHARSET, input_type);
    if (cd == (iconv_t)-1 && strcmp(input_type, UNKNOWN_CHARSET_DEFAULT) != 0) {
        /* The detected name may be one this iconv build does not know. */
        fprintf(stderr, "iconv cannot convert from %s, falling back to %s\n",
                input_type, UNKNOWN_CHARSET_DEFAULT);
        cd = iconv_open(DEST_CHARSET, UNKNOWN_CHARSET_DEFAULT);
    }

    if (cd == (iconv_t)-1) {
        perror("iconv_open");
    } else {
        if (iconv(cd, &input, &input_len, &output, &output_len) == (size_t)-1) {
            perror("iconv");
        }
        /* iconv advanced `output` past the converted bytes; terminate there. */
        *output = '\0';
        iconv_close(cd);
    }

    /* input_type is not used past this point, so the detector can go. */
    if (csd != NULL) {
        ucsdet_close(csd);
    }
    /*
     * NOTE(review): u_cleanup() releases ICU's process-wide resources. It is
     * kept here to preserve the original behaviour; if this function is ever
     * called more than once, consider moving the call to program shutdown.
     */
    u_cleanup();
}
/*
 * Read "data.txt" from the current directory, convert its contents to UTF-8
 * with toutf8() and print the result to stdout.
 *
 * The file is handled as a C string, so anything after an embedded NUL byte
 * is ignored.
 *
 * Returns 0 on success, EXIT_FAILURE if the file cannot be read or memory
 * cannot be allocated.
 */
int
main(int argc, char* argv[]) {
    char* buffer = NULL;
    char* utf8 = NULL;
    FILE* f = NULL;
    long length;
    size_t nread;
    size_t text_len;
    int rc = EXIT_FAILURE;

    (void)argc;
    (void)argv;

    f = fopen("data.txt", "rb");
    if (f == NULL) {
        perror("data.txt");
        goto out;
    }

    /* Determine the file size; every step can fail (e.g. on a pipe). */
    if (fseek(f, 0, SEEK_END) != 0) {
        perror("data.txt");
        goto out;
    }
    length = ftell(f);
    if (length < 0 || fseek(f, 0, SEEK_SET) != 0) {
        perror("data.txt");
        goto out;
    }

    /* +1 so the bytes can be NUL-terminated before strlen() sees them. */
    buffer = malloc((size_t)length + 1);
    if (buffer == NULL) {
        fprintf(stderr, "out of memory\n");
        goto out;
    }

    nread = fread(buffer, 1, (size_t)length, f);
    if (nread != (size_t)length && ferror(f)) {
        perror("data.txt");
        goto out;
    }
    buffer[nread] = '\0';
    fclose(f);
    f = NULL;

    /* toutf8() expects room for 4 output bytes per input byte plus a NUL. */
    text_len = strlen(buffer);
    if (text_len > (SIZE_MAX - 1) / 4) {
        fprintf(stderr, "input too large\n");
        goto out;
    }
    utf8 = malloc(text_len * 4 + 1);
    if (utf8 == NULL) {
        fprintf(stderr, "out of memory\n");
        goto out;
    }

    toutf8(buffer, utf8);
    printf("%s\n", utf8);
    rc = 0;

out:
    free(utf8);
    free(buffer);
    if (f != NULL) {
        fclose(f);
    }
    return rc;
}
cc convert.c -o convert `pkg-config --libs --cflags icu-io`
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment