Last active
December 21, 2015 16:19
-
-
Save marcomaggi/6332826 to your computer and use it in GitHub Desktop.
Untested example of using the Expat library with GBK encoding, making use of GNU Libiconv.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/* An untested example of using the Expat library with GBK encoding:
   read an XML document from stdin and exit.  Libiconv is used to
   perform the actual conversion from GBK to Unicode (UCS-4) code
   points.  On a GNU+Linux system compile it with:

     $ gcc -Wall -o expat-gbk-demo expat-gbk-demo.c -liconv -lexpat

   A useless XML test file follows:

     <?xml version='1.0' encoding='GBK'?>
     <!-- XML test file.  Notice that the document must really
          start with "<?xml", else an error will be raised by Expat. -->
     <!DOCTYPE toys [
       <!ELEMENT ball EMPTY>
       <!ATTLIST ball colour CDATA #REQUIRED>
     ]>
     <toys><ball colour='red'/></toys>
     <!-- end of file -->

   Information about the GBK encoding can be found here:

     <http://en.wikipedia.org/wiki/GBK>

   This file is in the public domain.
*/
#include <stdio.h> | |
#include <stdlib.h> | |
#include <string.h> | |
#include <iconv.h> | |
#include <expat.h> | |
#include <stdint.h> | |
/* Write MESSAGE to stderr, followed by a newline.

   The text is emitted verbatim: it is never used as a printf-style
   format string, so a message containing '%' characters is printed
   exactly as given. */
static void
logmsg (const char * message)
{
  fputs(message, stderr);
  fputc('\n', stderr);
}
/* Conversion callback installed in the "convert" field of the
   XML_Encoding structure: turn one 2-bytes GBK sequence into a single
   code point.

   DATA is the iconv context stored in the "data" field of the same
   structure; IN_BYTES references the 2 bytes of the GBK character.

   Return the converted value as an "int", or -1 if iconv() reports an
   error or does not consume exactly 2 input bytes while producing
   exactly 4 output bytes. */
static int
convert_gbk_to_utf8 (void *data, const char *in_bytes)
{
  iconv_t   context       = (iconv_t) data;
  /* iconv() is declared with a non-const input pointer, so the
     qualifier has to be cast away explicitly. */
  char *    in_buffer     = (char *) in_bytes;
  size_t    in_bytes_left = 2;
  /* Zeroed so that a partial write can never leak stack garbage. */
  uint32_t  ou_bytes      = 0;
  char *    ou_buffer     = (char *) &ou_bytes;
  size_t    ou_bytes_left = sizeof(ou_bytes);
  size_t    rv;

  logmsg("converting GBK 2-bytes character");
  rv = iconv(context,
	     &in_buffer, &in_bytes_left,
	     &ou_buffer, &ou_bytes_left);
  if (((size_t)-1) == rv) {
    return -1; /* all the errors */
  }
  if ((0 != in_bytes_left) || (0 != ou_bytes_left)) {
    /* Anything but "2 bytes in, 4 bytes out" is not a complete
       character: report it as an error too. */
    return -1;
  }
  return (int)ou_bytes;
}
/* Release callback installed in the "release" field of the
   XML_Encoding structure: log a message, then close the iconv context
   handed over through DATA. */
static void
release_gbk_to_utf8 (void *data)
{
  logmsg("releasing GBK iconv context");
  iconv_close((iconv_t) data);
}
int | |
unknown_encoding_handler (void *encodingHandlerData, /* unused */ | |
const XML_Char *name, | |
XML_Encoding *info) | |
{ | |
logmsg("initialising for custom encoding"); | |
if (strcmp(name, "GBK") || strcmp(name, "gbk")) { | |
logmsg("setup GBK processing: start"); | |
iconv_t context; | |
context = iconv_open("UCS-4-INTERNAL", "GBK"); | |
if (((iconv_t)(-1)) == context) | |
goto error; | |
{ | |
int i; | |
/* Bytes in the range [0, 127] are single GBK characters | |
representing "as is" the corresponding ASCII characters. */ | |
for (i=0; i<128; ++i) { | |
info->map[i] = i; | |
} | |
/* Bytes in the range [128, 255] are the first byte in a 2-bytes | |
GBK character; they must be handed to the conversion | |
function. */ | |
for (i=128; i<256; ++i) { | |
info->map[i] = -2; | |
} | |
} | |
info->data = (void *)context; | |
info->convert = convert_gbk_to_utf8; | |
info->release = release_gbk_to_utf8; | |
logmsg("setup GBK processing: done"); | |
return XML_STATUS_OK; | |
} else { | |
logmsg("unsupported encoding"); | |
return XML_STATUS_ERROR; | |
} | |
error: | |
logmsg("error in custom encoding setup"); | |
return XML_STATUS_ERROR; | |
} | |
int | |
main (int argc, const char *const argv[]) | |
{ | |
#undef BUFFER_SIZE | |
#define BUFFER_SIZE 16 /* this is small on purpose */ | |
XML_Parser parser; | |
parser = XML_ParserCreate(NULL); | |
if (NULL == parser) { | |
fprintf(stderr, "error allocating parser\n"); | |
goto error; | |
} | |
{ | |
enum XML_Status status; | |
XML_SetUnknownEncodingHandler(parser, unknown_encoding_handler, NULL); | |
for (;;) { | |
void * buffer; | |
size_t nbytes; | |
buffer = XML_GetBuffer(parser, BUFFER_SIZE); | |
if (NULL == buffer) { | |
fprintf(stderr, "error getting parser buffer\n"); | |
goto parser_error; | |
} | |
nbytes = fread(buffer, 1, BUFFER_SIZE, stdin); | |
if (ferror(stdin)) { | |
fprintf(stderr, "error reading input\n"); | |
goto parser_error; | |
} | |
status = XML_ParseBuffer(parser, (int)nbytes, (0 == nbytes)); | |
if (XML_STATUS_OK != status) { | |
fprintf(stderr, "error parsing input: %s\n", | |
XML_ErrorString(XML_GetErrorCode(parser))); | |
goto parser_error; | |
} | |
if (0 == nbytes) | |
break; | |
} | |
} | |
XML_ParserFree(parser); | |
fprintf(stderr, "Success!!!\n"); | |
exit(EXIT_SUCCESS); | |
parser_error: | |
XML_ParserFree(parser); | |
error: | |
exit(EXIT_FAILURE); | |
} | |
/* end of file */ |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment