Skip to content

Instantly share code, notes, and snippets.

@kikairoya
Last active August 29, 2015 14:08
Show Gist options
  • Save kikairoya/7bb24f0a760e9c2f4ee5 to your computer and use it in GitHub Desktop.
Save kikairoya/7bb24f0a760e9c2f4ee5 to your computer and use it in GitHub Desktop.
// auto convert input and output encoding
// copyright (c) 2014 kikairoya <kikairoya@gmail.com>
// this file can use, redistribute and/or modify under
// Boost Software License 1.0 or GNU GPLv2
//
// usage:
// $ ${command that outputs UTF-8, CP932 or EUC-JP} | LANG=ja_JP.eucJP autoconv
// --> converts the text piped to stdin to EUC-JP and writes it to stdout
// $ LANG=ja_JP.UTF-8 autoconv ${command that outputs UTF-8, CP932 or EUC-JP}
// --> runs the command and writes its stdout and stderr, converted to UTF-8, to stdout
//
// The encoding is detected line by line, so the input may mix encodings.
// Only UTF-8, CP932 (Shift_JIS) and EUC-JP are supported.
#ifndef _GNU_SOURCE
#define _GNU_SOURCE // pipe2()
#endif
#include <errno.h>
#include <fcntl.h>
#include <iconv.h>
#include <langinfo.h>
#include <locale.h>
#include <signal.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <strings.h>
#include <sys/select.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <unistd.h>
// Encodings that guess_encoding() can report.
typedef enum {
    enc_utf8,   // UTF-8
    enc_cp932,  // CP932 (Shift_JIS)
    enc_eucjp,  // EUC-JP
    enc_7bit,   // no byte has the high bit set
    enc_unknown // rejected control bytes, too short, or no clear winner
} encoding_t;

// iconv descriptors converting FROM the named encoding; NULL means that no
// converter is available for that source encoding.
iconv_t conv_utf8 = NULL;
iconv_t conv_cp932 = NULL;
iconv_t conv_eucjp = NULL;
// Encoding that receives a scoring bonus when handed to guess_encoding().
encoding_t prefer = enc_unknown;

// Guess the encoding of the NUL-terminated byte string `str`.
//
// Each candidate encoding is scored by the number of bytes that form valid
// multi-byte sequences in it; `prefer` gets a bonus of half its score plus 2.
// The strictly highest score wins.
//
// Returns enc_7bit when no byte has the high bit set, enc_unknown when the
// string contains 0x01..0x06, 0x7F or 0xFF, is shorter than two bytes, or
// when the scores tie; otherwise enc_utf8, enc_cp932 or enc_eucjp.
//
// The scans below look at p[1]..p[3]. Those reads never leave the string:
// every trailing-byte test requires a value >= 0x40, so evaluation stops at
// the terminating NUL at the latest.
encoding_t guess_encoding(const char *str, encoding_t prefer) {
    // detecting encoding algorithm modified from http://dobon.net/vb/dotnet/string/detectcode.html
    // original algorithm written by 1999-2005 Dan Kogai
    const size_t len = strlen(str);
    const unsigned char *const s = (const unsigned char *)str;
    const unsigned char *const end = s + len;
    int seen_8bit = 0;
    int sjis = 0;
    int euc = 0;
    int utf8 = 0;

    for (const unsigned char *p = s; p < end; ++p) {
        if (*p <= 0x06 || *p == 0x7F || *p == 0xFF) return enc_unknown;
        if (*p & 0x80) seen_8bit = 1;
    }
    if (!seen_8bit) return enc_7bit;
    if (len < 2) return enc_unknown;

    // CP932: double-byte characters.
    for (const unsigned char *p = s; p < end - 1; ++p) {
        const int lead = (0x81 <= p[0] && p[0] <= 0x9F) || (0xE0 <= p[0] && p[0] <= 0xFC);
        const int trail = (0x40 <= p[1] && p[1] <= 0x7E) || (0x80 <= p[1] && p[1] <= 0xFC);
        if (lead && trail) {
            sjis += 2;
            ++p;
        }
    }

    // EUC-JP: double-byte characters, half-width kana (0x8E), 3-byte (0x8F).
    for (const unsigned char *p = s; p < end - 1; ++p) {
        if (((0xA1 <= p[0] && p[0] <= 0xFE) && (0xA1 <= p[1] && p[1] <= 0xFE)) ||
            (p[0] == 0x8E && (0xA1 <= p[1] && p[1] <= 0xDF))) {
            euc += 2;
            ++p;
        } else if (p[0] == 0x8F && (0xA1 <= p[1] && p[1] <= 0xFE) &&
                   (0xA1 <= p[2] && p[2] <= 0xFE)) {
            euc += 3;
            p += 2;
        }
    }

    // UTF-8: 2-, 3- and 4-byte sequences.
    for (const unsigned char *p = s; p < end - 1; ++p) {
        if ((0xC0 <= p[0] && p[0] <= 0xDF) && (0x80 <= p[1] && p[1] <= 0xBF)) {
            utf8 += 2;
            ++p;
        } else if ((0xE0 <= p[0] && p[0] <= 0xEF) && (0x80 <= p[1] && p[1] <= 0xBF) &&
                   (0x80 <= p[2] && p[2] <= 0xBF)) {
            // The upper bound must be tested on the lead byte p[0]; testing
            // p[1] let 0xF0..0xFE pass as a 3-byte lead.
            utf8 += 3;
            p += 2;
        } else if ((0xF0 <= p[0] && p[0] <= 0xF4) && (0x80 <= p[1] && p[1] <= 0xBF) &&
                   (0x80 <= p[2] && p[2] <= 0xBF) && (0x80 <= p[3] && p[3] <= 0xBF)) {
            utf8 += 4;
            p += 3;
        }
    }

    switch (prefer) {
    case enc_utf8: utf8 += utf8 / 2 + 2; break;
    case enc_cp932: sjis += sjis / 2 + 2; break;
    case enc_eucjp: euc += euc / 2 + 2; break;
    default: break;
    }

    if (euc > sjis && euc > utf8) return enc_eucjp;
    if (sjis > euc && sjis > utf8) return enc_cp932;
    if (utf8 > euc && utf8 > sjis) return enc_utf8;
    return enc_unknown;
}
// Convert the NUL-terminated string `s` with the iconv descriptor `cd`.
//
// Returns a newly malloc'ed, NUL-terminated string that the caller must
// free, or NULL when `cd` is unusable (NULL or the (iconv_t)-1 failure value
// of iconv_open), when memory cannot be allocated, or when iconv reports an
// error such as an invalid or incomplete input sequence.
__attribute__((malloc)) char *try_conv_1(const char *s, iconv_t cd) {
    if (!cd || cd == (iconv_t)-1) return NULL;

    size_t ilen = strlen(s);
    // Guard the size computation below against wrap-around.
    if (ilen > ((size_t)-1 - 2) / 3) return NULL;

    // Up to three output bytes per input byte plus one spare byte for iconv,
    // and one further byte reserved for the terminator so that iconv can
    // never consume the slot the NUL is written to.
    size_t oleft = ilen * 3 + 1;
    char *ret = malloc(oleft + 1);
    if (!ret) return NULL;

    // iconv takes a non-const input pointer but does not write through it.
    char *ibuf = (char *)s;
    char *obuf = ret;

    // Reset the descriptor to its initial shift state.
    iconv(cd, NULL, NULL, NULL, NULL);

    // iconv returns size_t, so failure is (size_t)-1; a "< 0" test never
    // fires. The second call emits any pending shift sequence.
    if (iconv(cd, &ibuf, &ilen, &obuf, &oleft) == (size_t)-1 ||
        iconv(cd, NULL, NULL, &obuf, &oleft) == (size_t)-1) {
        free(ret);
        return NULL;
    }
    *obuf = '\0';
    return ret;
}
// Convert `s` to the output encoding when its guessed encoding has a
// converter.
//
// Returns the converted copy on success, otherwise `s` itself (cast to
// non-const). When `freep` is non-NULL, `*freep` receives the pointer the
// caller has to free: the converted copy, or NULL when `s` was returned.
// When `freep` is NULL the caller cannot tell the two cases apart.
//
// This function is deliberately NOT marked __attribute__((malloc)): it may
// return its own argument, which breaks the no-alias promise that attribute
// makes to the compiler.
char *try_conv(const char *s, char **freep) {
    char *out = NULL;
    switch (guess_encoding(s, prefer)) {
    case enc_utf8: out = try_conv_1(s, conv_utf8); break;
    case enc_cp932: out = try_conv_1(s, conv_cp932); break;
    case enc_eucjp: out = try_conv_1(s, conv_eucjp); break;
    default: break; // 7-bit or unknown: pass through unchanged
    }
    if (freep) *freep = out;
    return out ? out : (char *)s;
}
// Cut the first line (up to and including '\n') off the NUL-terminated
// buffer `s` and convert it with try_conv().
//
// Returns the text to print and stores the same pointer in `*freep`; the
// caller releases it with free(*freep). `*atelen` receives the number of
// bytes of `s` that were consumed.
// Returns NULL with `*freep == NULL` and `*atelen == 0` when `s` holds no
// complete line yet, or when memory for the line copy cannot be allocated
// (nothing is consumed, so the data can be retried later).
//
// The malloc attribute was dropped because the result is also stored in
// `*freep`, so it is not an unaliased pointer.
char *try_conv_single_block(const char *s, char **freep, size_t *atelen) {
    *freep = NULL;
    *atelen = 0;

    const char *nl = strchr(s, '\n');
    if (!nl) return NULL;

    const size_t linelen = (size_t)(nl - s) + 1;
    char *line = malloc(linelen + 1);
    if (!line) return NULL;
    memcpy(line, s, linelen);
    line[linelen] = '\0';

    char *converted = NULL;
    char *out = try_conv(line, &converted);
    // try_conv() returns either a fresh converted copy or `line` itself.
    // Release the copy that is no longer needed and hand ownership of the
    // other one to the caller; previously `line` was leaked on every call.
    if (converted) free(line);

    *freep = out;
    *atelen = linelen;
    return out;
}
// Read one line of arbitrary length from `in`.
//
// Returns a malloc'ed, NUL-terminated string that keeps the trailing '\n'
// when the line had one; the caller frees it. Returns NULL when nothing
// could be read (end of file or read error) or when memory runs out.
__attribute__((malloc)) char *get_line(FILE *in) {
    char buf[BUFSIZ];
    char *line = NULL;
    size_t linelen = 0;

    for (;;) {
        // Sentinel: fgets only overwrites the second-to-last slot when it
        // fills the whole buffer, so this slot tells whether the chunk was
        // cut short by the buffer size.
        buf[BUFSIZ - 2] = '\0';
        if (!fgets(buf, BUFSIZ, in)) break;

        const size_t chunk = strlen(buf);
        char *grown = realloc(line, linelen + chunk + 1);
        if (!grown) {
            free(line);
            return NULL;
        }
        line = grown;
        memcpy(line + linelen, buf, chunk + 1);
        linelen += chunk;

        // Buffer not filled, or filled exactly up to the newline: line done.
        if (buf[BUFSIZ - 2] == '\0' || buf[BUFSIZ - 2] == '\n') break;
        if (feof(in)) break;
    }
    return line;
}
// Append `tmplen` bytes from `tmpbuf` to the per-stream accumulator
// (`*permbuf`, `*permlen`), then convert and print every complete line it
// now holds. Bytes after the last '\n' stay in the accumulator for the next
// call. The accumulator is kept NUL-terminated.
//
// If the accumulator cannot be grown, the pending bytes and the new chunk
// are written to stdout unconverted and the accumulator is emptied, so no
// data is lost.
void block_convert_write(unsigned char *tmpbuf, size_t tmplen, unsigned char **permbuf, size_t *permlen) {
    unsigned char *acc = realloc(*permbuf, *permlen + tmplen + 1);
    if (!acc) {
        // realloc left the old buffer untouched; flush it as-is.
        if (*permlen) fwrite(*permbuf, 1, *permlen, stdout);
        fwrite(tmpbuf, 1, tmplen, stdout);
        *permlen = 0;
        return;
    }
    *permbuf = acc;
    memcpy(acc + *permlen, tmpbuf, tmplen);
    *permlen += tmplen;
    acc[*permlen] = 0;

    size_t consumed = 0;
    for (;;) {
        char *owned = NULL;
        size_t eaten = 0;
        char *text = try_conv_single_block((char *)(acc + consumed), &owned, &eaten);
        if (!text) break; // no further complete line
        consumed += eaten;
        fputs(text, stdout);
        free(owned);
    }

    // Slide the unfinished tail to the front of the accumulator.
    memmove(acc, acc + consumed, *permlen - consumed);
    *permlen -= consumed;
    acc[*permlen] = 0;
}
// Convert and print the unterminated tail of a stream (bytes received after
// the last '\n'). Falls back to writing the bytes unconverted when no memory
// is available for the NUL-terminated copy.
static void flush_partial_line(const unsigned char *buf, size_t len) {
    if (!buf || len == 0) return;
    char *tail = malloc(len + 1);
    if (!tail) {
        fwrite(buf, 1, len, stdout);
        return;
    }
    memcpy(tail, buf, len);
    tail[len] = '\0';
    char *owned = NULL;
    fputs(try_conv(tail, &owned), stdout);
    free(owned);
    free(tail);
}

// Read once from `*fd` and feed the data into the accumulator. On end of
// file or a read error the remaining partial line is flushed, the
// accumulator is released and `*fd` is closed and set to -1.
static void drain_stream(int *fd, unsigned char **acc, size_t *acclen) {
    unsigned char buf[BUFSIZ];
    const ssize_t got = read(*fd, buf, sizeof buf);
    if (got < 0 && errno == EINTR) return; // retried on the next select round
    if (got <= 0) {
        flush_partial_line(*acc, *acclen);
        free(*acc);
        *acc = NULL;
        *acclen = 0;
        close(*fd);
        *fd = -1;
        return;
    }
    block_convert_write(buf, (size_t)got, acc, acclen);
}

// Forward the child's output: read from `sfd` and `efd` until both reach end
// of file, converting each complete line and writing it to stdout, then reap
// `child`.
//
// Returns the child's exit status (>= 0) when it exited normally. When the
// child was killed by a signal, that signal is re-raised in this process; if
// that does not terminate it, 0 is returned.
// Returns a negative value with errno set on failure:
//   -1 sigprocmask failed, -2 select failed, -3 waitpid failed.
int forward_loop(pid_t child, int sfd, int efd) {
    sigset_t sigs;
    sigemptyset(&sigs);
    sigaddset(&sigs, SIGCHLD);
    if (sigprocmask(SIG_BLOCK, &sigs, NULL) < 0) {
        perror("sigprocmask");
        return -1;
    }

    unsigned char *sbuf = NULL; size_t slen = 0;
    unsigned char *ebuf = NULL; size_t elen = 0;
    const int nfds = (sfd < efd ? efd : sfd) + 1;

    for (;;) {
        fd_set rfds;
        FD_ZERO(&rfds);
        if (sfd != -1) FD_SET(sfd, &rfds);
        if (efd != -1) FD_SET(efd, &rfds);

        if (select(nfds, &rfds, NULL, NULL, NULL) == -1) {
            if (errno == EINTR) continue;
            const int saved = errno; // keep select's errno for the caller
            free(sbuf);
            free(ebuf);
            errno = saved;
            return -2;
        }

        if (sfd != -1 && FD_ISSET(sfd, &rfds)) drain_stream(&sfd, &sbuf, &slen);
        if (efd != -1 && FD_ISSET(efd, &rfds)) drain_stream(&efd, &ebuf, &elen);

        if (sfd == -1 && efd == -1) {
            // Flush now: re-raising a fatal signal below would otherwise
            // discard whatever is still sitting in the stdio buffer.
            fflush(stdout);
            int status;
            while (waitpid(child, &status, 0) < 0) {
                if (errno != EINTR) return -3;
            }
            if (WIFSIGNALED(status)) raise(WTERMSIG(status));
            if (WIFEXITED(status)) return WEXITSTATUS(status);
            return 0;
        }
    }
}
// Open a converter, mapping iconv_open's (iconv_t)-1 failure value to NULL
// so that "no converter" has a single representation.
static iconv_t open_converter(const char *tocode, const char *fromcode) {
    iconv_t cd = iconv_open(tocode, fromcode);
    return cd == (iconv_t)-1 ? NULL : cd;
}

// Return 1 when `name` equals, ignoring case, one entry of the
// NULL-terminated list `aliases`.
static int matches_any(const char *name, const char *const *aliases) {
    for (; *aliases; ++aliases) {
        if (strcasecmp(name, *aliases) == 0) return 1;
    }
    return 0;
}

// Entry point.
//
// Without arguments: filter mode, converts stdin line by line to the locale
// encoding and writes it to stdout.
// With arguments: runs argv[1..] as a child process and forwards its stdout
// and stderr, converted, to stdout. The exit status is the child's exit
// status, 1 when the pipes or the fork could not be set up, 2 when
// forwarding failed, or 127 (from the child) when the command could not be
// executed.
int main(int argc, char **argv) {
    setlocale(LC_ALL, "");
    const char *codeset = nl_langinfo(CODESET);

    // Spellings under which the locale may report each supported encoding.
    static const char *const utf8_names[] = {"UTF-8", "UTF8", NULL};
    static const char *const sjis_names[] = {"SJIS", "Shift_JIS", "CP932", NULL};
    static const char *const euc_names[] = {"eucJP", "EUC-JP", "EUCJP", NULL};

    // The locale's own encoding needs no converter and becomes the preferred
    // guess; every other supported encoding gets a converter to the locale.
    if (matches_any(codeset, utf8_names)) prefer = enc_utf8;
    else conv_utf8 = open_converter(codeset, "UTF-8");
    if (matches_any(codeset, sjis_names)) prefer = enc_cp932;
    else conv_cp932 = open_converter(codeset, "CP932");
    if (matches_any(codeset, euc_names)) prefer = enc_eucjp;
    else conv_eucjp = open_converter(codeset, "EUCJP");

    if (argc == 1) {
        char *line;
        while ((line = get_line(stdin)) != NULL) {
            char *owned;
            fputs(try_conv(line, &owned), stdout);
            free(line);
            free(owned);
        }
        return 0;
    }

    int pipes[2];
    int pipee[2];
    // O_CLOEXEC: the child keeps only the dup2'ed copies on fd 1 and fd 2.
    if (pipe2(pipes, O_CLOEXEC) < 0 || pipe2(pipee, O_CLOEXEC) < 0) {
        perror("pipe");
        return 1;
    }

    const pid_t pid = fork();
    if (pid < 0) {
        perror("fork");
        return 1;
    }
    if (pid == 0) {
        if (dup2(pipes[1], 1) < 0 || dup2(pipee[1], 2) < 0) {
            perror("dup2");
            _exit(127);
        }
        execvp(argv[1], argv + 1);
        // Only reached when exec failed; report it instead of silently
        // exiting with status 0.
        perror(argv[1]);
        _exit(127);
    }

    close(pipes[1]);
    close(pipee[1]);
    const int c = forward_loop(pid, pipes[0], pipee[0]);
    if (c < 0) {
        // perror() appends strerror(errno) itself; its argument is a prefix.
        perror("autoconv");
        return 2;
    }
    return c;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment