Created
May 2, 2018 16:39
-
-
Save dtzWill/7bc07da1dcd02e01c2fbb28cbaa81420 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <errno.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include <iconv.h>
/**
 * Convert a NUL-terminated string from one character set to another using
 * iconv(3).
 *
 * @param from_charset  iconv name of the encoding of `input`.
 * @param to_charset    iconv name of the desired output encoding.
 * @param input         NUL-terminated bytes in `from_charset`.
 *
 * @return A malloc()ed buffer holding the converted text followed by four
 *         zero bytes (enough to terminate 1-, 2- and 4-byte code units), or
 *         NULL if an argument is NULL, the charset pair is unsupported,
 *         memory runs out, or the input contains an invalid sequence.
 *         The caller owns the buffer and must free() it.
 *
 * A truncated multibyte sequence at the very end of `input` (EINVAL) is
 * dropped rather than treated as an error.
 */
char *convert(const char *from_charset, const char *to_charset, const char *input) {
    /* NUL_PAD: terminator bytes reserved past the converted text.
     * GROW_SLACK: minimum extra room added on every grow, so that the
     * final flush (which may emit a shift sequence) always makes progress. */
    enum { NUL_PAD = 4, GROW_SLACK = 8 };

    char *output = NULL;
    char *outbuf;
    const char *inbuf;
    size_t inleft, outleft, outlen;
    int flushing = 0;
    iconv_t cd;

    if (!from_charset || !to_charset || !input)
        return NULL;

    cd = iconv_open(to_charset, from_charset);
    if (cd == (iconv_t) -1)
        return NULL;

    inbuf = input;
    inleft = strlen(input);

    /* Start with an output buffer the same size as the input; it is grown
     * on demand whenever iconv reports E2BIG. */
    outlen = inleft;
    if (outlen > SIZE_MAX - NUL_PAD)
        goto fail;
    output = malloc(outlen + NUL_PAD);
    if (!output)
        goto fail;

    outbuf = output;
    outleft = outlen;

    for (;;) {
        size_t rc;
        int err;

        if (flushing) {
            /* A NULL input asks iconv to emit whatever is needed to return
             * to the initial shift state (e.g. ESC ( B for ISO-2022-JP). */
            rc = iconv(cd, NULL, NULL, &outbuf, &outleft);
        } else {
            rc = iconv(cd, (char **) &inbuf, &inleft, &outbuf, &outleft);
        }
        err = errno; /* save before any other call can overwrite it */

        if (rc != (size_t) -1) {
            if (flushing)
                break;
            flushing = 1;
            continue;
        }

        if (!flushing && err == EINVAL) {
            /* Incomplete multibyte sequence at the end of the input:
             * truncate it and carry on with the flush. */
            flushing = 1;
            continue;
        }

        if (err != E2BIG) {
            /* EILSEQ (or anything unexpected): bad input, cannot recover. */
            goto fail;
        }

        /* E2BIG: not enough room at *outbuf. Grow the buffer and retry
         * from where iconv stopped. */
        {
            size_t used = (size_t) (outbuf - output);
            size_t grow;
            char *tmp;

            if (inleft > (SIZE_MAX - GROW_SLACK) / 2)
                goto fail;
            grow = inleft * 2 + GROW_SLACK;
            if (grow > SIZE_MAX - NUL_PAD - outlen)
                goto fail;

            tmp = realloc(output, outlen + grow + NUL_PAD);
            if (!tmp)
                goto fail;

            output = tmp;
            outlen += grow;
            outbuf = output + used;
            outleft = outlen - used;
        }
    }

    iconv_close(cd);

    /* Not every charset can be terminated with a single NUL byte: UCS-2
     * needs two and UCS-4 needs four, hence the NUL_PAD reserve. */
    memset(outbuf, 0, NUL_PAD);
    return output;

fail:
    iconv_close(cd);
    free(output);
    return NULL;
}
/*
 * Round-trip demo: convert a UTF-8 sample to a target encoding and back,
 * then report whether the result is byte-identical to the original.
 *
 * Returns EXIT_FAILURE if either conversion fails, 0 otherwise (including
 * when the round trip completes but the text does not match).
 */
int main(void) {
    /* Alternative test case (pair with the cp1250 encoding below): */
    //char input[] = "ABCDEFGHIJKLMNOPQRSTUVWXYZ[]^_`abcdefghijklmnopqrstuvwxyz{|}~€‚„…†‡‰Š‹ŚŤŽŹ‘’“”•–—™š›śťžźˇ˘Ł¤Ą¦§¨©Ş«¬®Ż°±˛ł´µ¶·¸ąş»Ľ˝ľżŔÁÂĂÄĹĆÇČÉĘËĚÍÎĎĐŃŇÓÔŐÖ×ŘŮÚŰÜÝŢßŕáâăäĺćçčéęëěíîďđńňóôőö÷řůúűüýţ";
    char input[] = "caffc8f] はれひほふ\n Author: A U Thor <author@example.com>";
    //const char *encoding = "cp1250";
    const char *encoding = "ISO-2022-JP";
    char *outputCp;
    char *outputUtf8;

    printf("Input: %s\n", input);

    /* convert() returns NULL on failure; passing NULL to printf("%s") or
     * back into convert() would be undefined behaviour, so bail out. */
    outputCp = convert("UTF-8", encoding, input);
    if (!outputCp) {
        fprintf(stderr, "Conversion UTF-8 -> %s failed\n", encoding);
        return EXIT_FAILURE;
    }
    printf("UTF-8 -> %s: %s\n", encoding, outputCp);

    outputUtf8 = convert(encoding, "UTF-8", outputCp);
    if (!outputUtf8) {
        fprintf(stderr, "Conversion %s -> UTF-8 failed\n", encoding);
        free(outputCp);
        return EXIT_FAILURE;
    }
    printf("UTF-8 -> %s -> UTF-8: %s\n", encoding, outputUtf8);

    if (strcmp(outputUtf8, input) == 0) {
        printf("Input and output are identical\n");
    } else {
        printf("Output DOESN'T match with the input\n");
    }

    /* Both buffers are owned by the caller of convert(). */
    free(outputUtf8);
    free(outputCp);
    return 0;
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment