Last active
August 29, 2015 14:08
-
-
Save kikairoya/7bb24f0a760e9c2f4ee5 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// auto convert input and output encoding
// copyright (c) 2014 kikairoya <kikairoya@gmail.com>
// This file may be used, redistributed and/or modified under the
// Boost Software License 1.0 or the GNU GPLv2.
//
// usage:
//   $ ${something that outputs UTF-8, CP932 or eucJP} | LANG=ja_JP.eucJP autoconv
//     --> converts stdin to eucJP and writes it to stdout
//   $ LANG=ja_JP.UTF-8 autoconv ${something that outputs UTF-8, CP932 or eucJP}
//     --> runs the command and prints its stdout and stderr converted to UTF-8
//
// Encoding detection works line by line, so the input may mix encodings.
// Only UTF-8, CP932 (Shift_JIS) and EUC-JP are supported.
#ifndef _GNU_SOURCE
#define _GNU_SOURCE /* pipe2() is only declared with this feature-test macro */
#endif
#include <errno.h>
#include <fcntl.h>
#include <iconv.h>
#include <langinfo.h>
#include <locale.h>
#include <signal.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <strings.h>
#include <sys/select.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <unistd.h>
/* Result of the per-line encoding detection. */
typedef enum {
    enc_utf8,    // line looks like UTF-8
    enc_cp932,   // line looks like CP932 (Shift_JIS)
    enc_eucjp,   // line looks like EUC-JP
    enc_7bit,    // no byte has the high bit set
    enc_unknown  // binary-looking data, too short, or no clear winner
} encoding_t;

// One converter per source encoding; main() opens a converter only for the
// encodings that differ from the locale's codeset, the others stay NULL.
iconv_t conv_utf8 = NULL;
iconv_t conv_cp932 = NULL;
iconv_t conv_eucjp = NULL;
// Encoding that receives a scoring bonus in guess_encoding(); main() sets it
// to the locale's own encoding when that is one of the supported three.
encoding_t prefer = enc_unknown;

/* Returns non-zero when lo <= b <= hi. */
static int byte_between(unsigned char b, unsigned lo, unsigned hi) {
    return lo <= b && b <= hi;
}

/* Returns non-zero when b is a UTF-8 continuation byte (0x80..0xBF). */
static int utf8_cont(unsigned char b) {
    return byte_between(b, 0x80, 0xBF);
}

/*
 * Guess the encoding of the NUL-terminated byte string `str`.
 *
 * Each candidate encoding is scored by the number of bytes that form
 * well-shaped multi-byte sequences; the strictly highest score wins.
 * `prefer` names an encoding whose score is boosted (score/2 + 2) so that it
 * wins ties and near-ties; pass enc_unknown for no preference.
 *
 * Returns enc_7bit when no byte has the high bit set, enc_unknown when the
 * data contains bytes 0x00..0x06, 0x7F or 0xFF, is a single 8-bit byte, or
 * when no encoding scores strictly highest.
 */
encoding_t guess_encoding(const char *str, encoding_t prefer) {
    // detecting encoding algorithm modified from http://dobon.net/vb/dotnet/string/detectcode.html
    // original algorithm written by 1999-2005 Dan Kogai
    const size_t len = strlen(str);
    const unsigned char *s = (const unsigned char *)str;
    const unsigned char *const end = s + len;

    int seen_8bit = 0;
    for (const unsigned char *p = s; p < end; ++p) {
        if (*p <= 0x06 || *p == 0x7F || *p == 0xFF) return enc_unknown;
        if (*p & 0x80) seen_8bit = 1;
    }
    if (!seen_8bit) return enc_7bit;
    if (len < 2) return enc_unknown;

    // In the loops below p[1] is always inside the string. p[2] and p[3] are
    // only read after the preceding byte matched a non-zero range, so the
    // furthest byte ever touched is the NUL terminator.
    int sjis = 0;
    for (const unsigned char *p = s; p < end - 1; ++p) {
        const int lead = byte_between(p[0], 0x81, 0x9F) || byte_between(p[0], 0xE0, 0xFC);
        const int trail = byte_between(p[1], 0x40, 0x7E) || byte_between(p[1], 0x80, 0xFC);
        if (lead && trail) {
            // SJIS double-byte character
            sjis += 2;
            p += 1;
        }
    }

    int euc = 0;
    for (const unsigned char *p = s; p < end - 1; ++p) {
        if ((byte_between(p[0], 0xA1, 0xFE) && byte_between(p[1], 0xA1, 0xFE)) ||
            (p[0] == 0x8E && byte_between(p[1], 0xA1, 0xDF)))
        {
            // EUC double-byte character, or EUC half-width kana
            euc += 2;
            p += 1;
        } else if (p[0] == 0x8F && byte_between(p[1], 0xA1, 0xFE) &&
                   byte_between(p[2], 0xA1, 0xFE))
        {
            // EUC three-byte character
            euc += 3;
            p += 2;
        }
    }

    int utf8 = 0;
    for (const unsigned char *p = s; p < end - 1; ++p) {
        if (byte_between(p[0], 0xC0, 0xDF) && utf8_cont(p[1])) {
            utf8 += 2;
            p += 1;
        } else if (byte_between(p[0], 0xE0, 0xEF) && utf8_cont(p[1]) && utf8_cont(p[2])) {
            // The upper bound must be checked on the lead byte p[0]; testing
            // p[1] here made every byte 0xE0..0xFE count as a 3-byte lead.
            utf8 += 3;
            p += 2;
        } else if (byte_between(p[0], 0xF0, 0xF4) && utf8_cont(p[1]) &&
                   utf8_cont(p[2]) && utf8_cont(p[3]))
        {
            // 4-byte sequences (code points above U+FFFF)
            utf8 += 4;
            p += 3;
        }
    }

    switch (prefer) {
    case enc_utf8:  utf8 += utf8 / 2 + 2; break;
    case enc_cp932: sjis += sjis / 2 + 2; break;
    case enc_eucjp: euc += euc / 2 + 2; break;
    default: break;
    }

    if (euc > sjis && euc > utf8) return enc_eucjp;
    if (sjis > euc && sjis > utf8) return enc_cp932;
    if (utf8 > euc && utf8 > sjis) return enc_utf8;
    return enc_unknown;
}
/*
 * Convert the NUL-terminated string `s` with the converter `cd`.
 *
 * Returns a newly malloc'ed, NUL-terminated string that the caller must
 * free(), or NULL when `cd` is not a usable converter, memory is exhausted,
 * or the conversion fails (invalid or incomplete input, unrepresentable
 * character).
 */
__attribute__((malloc)) char *try_conv_1(const char *s, iconv_t cd) {
    // iconv_open() reports failure as (iconv_t)-1, not as NULL.
    if (cd == NULL || cd == (iconv_t)-1) return NULL;

    size_t in_left = strlen(s);
    if (in_left > ((size_t)-1 - 1) / 3) return NULL; // 3x expansion would overflow

    // Worst-case growth between the supported encodings is 3 output bytes per
    // input byte; the extra byte is reserved for the terminator and is not
    // offered to iconv().
    size_t out_left = in_left * 3;
    char *ret = malloc(out_left + 1);
    if (!ret) return NULL;

    // iconv() takes a non-const input pointer but does not write through it.
    char *in = (char *)s;
    char *out = ret;
    iconv(cd, NULL, NULL, NULL, NULL); // reset to the initial conversion state

    // iconv() returns size_t, so failure must be compared against (size_t)-1;
    // a "< 0" test can never be true.
    if (iconv(cd, &in, &in_left, &out, &out_left) == (size_t)-1) {
        free(ret);
        return NULL;
    }
    *out = '\0';
    return ret;
}
__attribute__((malloc)) char *try_conv(const char *s, char **freep) { | |
const encoding_t e = guess_encoding(s, prefer); | |
char *out = NULL; | |
switch (e) { | |
case enc_utf8: out = try_conv_1(s, conv_utf8); break; | |
case enc_cp932: out = try_conv_1(s, conv_cp932); break; | |
case enc_eucjp: out = try_conv_1(s, conv_eucjp); break; | |
default: break; | |
} | |
if (freep) *freep = out; | |
return out ? out : (char *)s; | |
} | |
/*
 * Convert the first complete line (up to and including '\n') of the
 * NUL-terminated buffer `s`.
 *
 * On success returns the text to output, stores the pointer the caller must
 * free() in *freep and the number of input bytes consumed in *atelen.
 * Returns NULL, with *freep = NULL and *atelen = 0, when `s` holds no complete
 * line or memory is exhausted. `freep` and `atelen` must not be NULL.
 *
 * Not marked __attribute__((malloc)) because the result is also stored in
 * *freep.
 */
char *try_conv_single_block(const char *s, char **freep, size_t *atelen) {
    *freep = NULL;
    *atelen = 0;

    const char *nl = strchr(s, '\n');
    if (!nl) return NULL;

    const size_t linelen = (size_t)(nl - s) + 1; // includes the '\n'
    char *line = malloc(linelen + 1);
    if (!line) return NULL;
    memcpy(line, s, linelen);
    line[linelen] = '\0';

    char *converted = NULL;
    char *out = try_conv(line, &converted);
    if (converted) {
        // The conversion produced a fresh string; the temporary copy is done.
        free(line);
    } else {
        // try_conv() handed back `line` itself, so the caller has to free it.
        converted = line;
    }
    *freep = converted;
    *atelen = linelen;
    return out;
}
/*
 * Read one line of arbitrary length from `in`.
 *
 * Returns a malloc'ed, NUL-terminated string that keeps the trailing '\n'
 * when the input had one; the caller must free() it. Returns NULL at end of
 * input, on a read error before any data was read, or when memory is
 * exhausted.
 */
__attribute__((malloc)) char *get_line(FILE *in) {
    char buf[BUFSIZ];
    char *ret = NULL;
    size_t retlen = 0;
    do {
        // Sentinel: if fgets() leaves this byte untouched the chunk did not
        // fill the buffer, so the line (or the input) ended inside it.
        buf[BUFSIZ - 2] = 0;
        if (!fgets(buf, BUFSIZ, in)) break;

        const size_t len = strlen(buf);
        char *grown = realloc(ret, retlen + len + 1);
        if (!grown) {
            free(ret);
            return NULL;
        }
        ret = grown;
        memcpy(ret + retlen, buf, len + 1);
        retlen += len;

        // Buffer not full, or full and ending exactly at a newline: done.
        if (buf[BUFSIZ - 2] == 0 || buf[BUFSIZ - 2] == '\n') break;
    } while (!feof(in));
    return ret;
}
/*
 * Append `tmplen` bytes from `tmpbuf` to the growable line buffer
 * (*permbuf, *permlen), convert and print every complete line it now holds,
 * and keep the trailing partial line for the next call.
 *
 * Output always goes to stdout. *permbuf may be NULL on the first call and is
 * owned by the caller, who must eventually free() it. If the buffer cannot be
 * grown the new chunk is dropped and the existing buffer is left untouched.
 *
 * NOTE(review): complete lines are located with strchr() in
 * try_conv_single_block(), so a NUL byte in the data hides everything after
 * it from the line search -- confirm whether binary output must be supported.
 */
void block_convert_write(unsigned char *tmpbuf, size_t tmplen, unsigned char **permbuf, size_t *permlen) {
    unsigned char *grown = realloc(*permbuf, *permlen + tmplen + 1);
    if (!grown) {
        perror("realloc");
        return;
    }
    *permbuf = grown;
    memcpy(grown + *permlen, tmpbuf, tmplen);
    *permlen += tmplen;
    grown[*permlen] = 0;

    size_t consumed = 0;
    for (;;) {
        char *to_free = NULL;
        size_t eaten = 0;
        char *text = try_conv_single_block((char *)(grown + consumed), &to_free, &eaten);
        if (!text) break; // no further complete line in the buffer
        consumed += eaten;
        fputs(text, stdout);
        free(to_free);
    }

    // Shift the unfinished tail to the front, terminator included, so the
    // buffer stays a valid C string between calls.
    memmove(grown, grown + consumed, *permlen - consumed + 1);
    *permlen -= consumed;
}
int forward_loop(pid_t child, int sfd, int efd) { | |
sigset_t sigs; | |
sigemptyset(&sigs); | |
sigaddset(&sigs, SIGCHLD); | |
if (sigprocmask(SIG_BLOCK, &sigs, NULL) < 0) { | |
perror("sigprocmask"); | |
return -1; | |
} | |
unsigned char buf[BUFSIZ]; | |
unsigned char *sbuf = 0; size_t slen = 0; | |
unsigned char *ebuf = 0; size_t elen = 0; | |
int nfds = (sfd < efd ? efd : sfd) + 1; | |
while (1) { | |
fd_set rfds, wfds, efds; | |
FD_ZERO(&rfds); | |
FD_ZERO(&wfds); | |
FD_ZERO(&efds); | |
if (sfd != -1) FD_SET(sfd, &rfds); | |
if (efd != -1) FD_SET(efd, &rfds); | |
int r = select(nfds, &rfds, &wfds, &efds, NULL); | |
if (r == -1 && errno == EINTR) continue; | |
if (r == -1) return -2; | |
if (sfd != -1 && FD_ISSET(sfd, &rfds)) { | |
int len = read(sfd, buf, BUFSIZ); | |
if (len <= 0) { | |
free(sbuf); | |
close(sfd); | |
sfd = -1; | |
} else { | |
block_convert_write(buf, len, &sbuf, &slen); | |
} | |
} | |
if (efd != -1 && FD_ISSET(efd, &rfds)) { | |
int len = read(efd, buf, BUFSIZ); | |
if (len <= 0) { | |
free(ebuf); | |
close(efd); | |
efd = -1; | |
} else { | |
block_convert_write(buf, len, &ebuf, &elen); | |
} | |
} | |
if (sfd == -1 && efd == -1) { | |
int c; | |
if (waitpid(child, &c, 0) < 0) return 3; | |
if (WIFSIGNALED(c)) raise(WTERMSIG(c)); | |
if (WIFEXITED(c)) return WEXITSTATUS(c); | |
return 0; | |
} | |
} | |
return 0; | |
} | |
int main(int argc, char **argv) { | |
setlocale(LC_ALL, ""); | |
const char *codeset = nl_langinfo(CODESET); | |
{ | |
const char *tocode = codeset; | |
if (strcasecmp(codeset, "UTF-8")) conv_utf8 = iconv_open(tocode, "UTF-8"); | |
else prefer = enc_utf8; | |
if (strcasecmp(codeset, "SJIS") && strcasecmp(codeset, "Shift_JIS") && strcasecmp(codeset, "CP932")) conv_cp932 = iconv_open(tocode, "CP932"); | |
else prefer = enc_cp932; | |
if (strcasecmp(codeset, "eucJP")) conv_eucjp = iconv_open(tocode, "EUCJP"); | |
else prefer = enc_eucjp; | |
} | |
if (argc == 1) { | |
char *s; | |
while ((s = get_line(stdin)) != NULL) { | |
char *x; | |
fputs(try_conv(s, &x), stdout); | |
free(s); | |
free(x); | |
} | |
} else { | |
int pipes[2]; | |
int pipee[2]; | |
if (pipe2(pipes, O_CLOEXEC) < 0 || pipe2(pipee, O_CLOEXEC) < 0) { | |
perror("pipe"); | |
return 1; | |
} | |
pid_t pid = fork(); | |
if (pid < 0) { | |
perror("fork"); | |
return 1; | |
} else if (pid == 0) { | |
dup2(pipes[1], 1); | |
dup2(pipee[1], 2); | |
execvp(argv[1], argv+1); | |
} else { | |
close(pipes[1]); | |
close(pipee[1]); | |
int c = forward_loop(pid, pipes[0], pipee[0]); | |
if (c < 0) { | |
perror(strerror(errno)); | |
return 2; | |
} | |
return c; | |
} | |
} | |
return 0; | |
} | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment