Skip to content

Instantly share code, notes, and snippets.

@marcomaggi
Last active December 21, 2015 16:19
Show Gist options
  • Save marcomaggi/6332826 to your computer and use it in GitHub Desktop.
Save marcomaggi/6332826 to your computer and use it in GitHub Desktop.
Untested example of using the Expat library with GBK encoding, make use of GNU Libiconv.
/* An untested example of using the Expat library with GBK encoding:
read an XML document from stdin and exit. Libiconv is used to
perform the actual conversion from GBK to Unicode code points (the
value Expat expects back from the conversion callback). On a GNU+Linux
system compile it with:
$ gcc -Wall -o expat-gbk-demo expat-gbk-demo.c -liconv -lexpat
A useless XML test file follows:
<?xml version='1.0' encoding='GBK'?>
<!-- XML test file. Notice that the document must really
start with "<?xml", else an error will be raised by Expat. -->
<!DOCTYPE toys [
<!ELEMENT ball EMPTY>
<!ATTLIST ball colour CDATA #REQUIRED>
]>
<toys><ball colour='red'/></toys>
<!-- end of file -->
Information about the GBK encoding can be found here:
<http://en.wikipedia.org/wiki/GBK>
This file is in the public domain.
*/
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <iconv.h>
#include <expat.h>
#include <stdint.h>
/* Write MESSAGE followed by a newline to stderr.

   MESSAGE is emitted verbatim: it is never used as a printf format
   string, so '%' characters in it are printed as ordinary data. */
static void
logmsg (const char * message)
{
  fputs(message, stderr);
  fputc('\n', stderr);
}
/* Conversion callback installed in the XML_Encoding structure.

   DATA is the iconv conversion descriptor stored by the unknown
   encoding handler; IN_BYTES points to the 2 bytes of one GBK
   character.  The bytes are run through iconv into a 4-byte buffer and
   the resulting 32-bit value is returned as an int.

   Return -1 when iconv reports an error or when the conversion did not
   consume exactly the 2 input bytes and fill exactly the 4 output
   bytes. */
static int
convert_gbk_to_utf8 (void *data, const char *in_bytes)
{
  iconv_t	context		= (iconv_t) data;
  /* iconv() takes a pointer to non-const char even though it only reads
     the input buffer, hence the explicit cast. */
  char *	in_buffer	= (char *) in_bytes;
  size_t	in_bytes_left	= 2;
  /* Zero-initialised so that a partial write can never leak
     indeterminate bytes into the return value. */
  uint32_t	ou_bytes	= 0;
  char *	ou_buffer	= (char *) &ou_bytes;
  size_t	ou_bytes_left	= sizeof(ou_bytes);
  size_t	rv;

  logmsg("converting GBK 2-bytes character");
  rv = iconv(context,
	     &in_buffer, &in_bytes_left,
	     &ou_buffer, &ou_bytes_left);
  if (((size_t)-1) == rv) {
    return -1; /* all the errors */
  }
  if ((0 != in_bytes_left) || (0 != ou_bytes_left)) {
    return -1; /* incomplete conversion: no full code point available */
  }
  return (int)ou_bytes;
}
/* Release callback installed in the XML_Encoding structure: DATA is
   the iconv conversion descriptor stored by the unknown encoding
   handler.  Log the event, then close the descriptor. */
static void
release_gbk_to_utf8 (void *data)
{
  logmsg("releasing GBK iconv context");
  iconv_close((iconv_t) data);
}
/* Unknown-encoding handler registered with the parser in main().

   NAME is the encoding name the parser could not handle by itself.
   Only the exact spellings "GBK" and "gbk" are accepted; for them an
   iconv descriptor converting from GBK is opened and INFO is filled
   with the byte map and the convert/release callbacks.

   Return XML_STATUS_OK when INFO has been set up, XML_STATUS_ERROR
   when the encoding is not GBK or the iconv descriptor cannot be
   opened. */
int
unknown_encoding_handler (void *encodingHandlerData, /* unused */
			  const XML_Char *name,
			  XML_Encoding *info)
{
  iconv_t	context;
  int		i;

  (void) encodingHandlerData;
  logmsg("initialising for custom encoding");
  /* strcmp() returns 0 on equality: reject NAME only when it matches
     neither accepted spelling. */
  if ((0 != strcmp(name, "GBK")) && (0 != strcmp(name, "gbk"))) {
    logmsg("unsupported encoding");
    return XML_STATUS_ERROR;
  }
  logmsg("setup GBK processing: start");
  /* NOTE(review): "UCS-4-INTERNAL" appears to be a GNU Libiconv
     encoding name; confirm it exists in the iconv implementation
     actually linked. */
  context = iconv_open("UCS-4-INTERNAL", "GBK");
  if (((iconv_t)(-1)) == context) {
    logmsg("error in custom encoding setup");
    return XML_STATUS_ERROR;
  }
  /* Bytes in the range [0, 127] are single GBK characters
     representing "as is" the corresponding ASCII characters. */
  for (i=0; i<128; ++i) {
    info->map[i] = i;
  }
  /* Bytes in the range [128, 255] are the first byte in a 2-bytes
     GBK character; they must be handed to the conversion
     function. */
  for (i=128; i<256; ++i) {
    info->map[i] = -2;
  }
  info->data	= (void *)context;
  info->convert	= convert_gbk_to_utf8;
  info->release	= release_gbk_to_utf8;
  logmsg("setup GBK processing: done");
  return XML_STATUS_OK;
}
/* Read an XML document from stdin in BUFFER_SIZE-byte chunks and feed
   it to an Expat parser that has the custom unknown-encoding handler
   installed.  A read of 0 bytes is handed to the parser as the final
   chunk.  Exit with EXIT_SUCCESS when the whole input parses, with
   EXIT_FAILURE on allocation, read or parse errors. */
int
main (int argc, const char *const argv[])
{
#undef BUFFER_SIZE
#define BUFFER_SIZE	16 /* this is small on purpose */
  XML_Parser	parser = XML_ParserCreate(NULL);
  size_t	nread;

  if (NULL == parser) {
    fprintf(stderr, "error allocating parser\n");
    exit(EXIT_FAILURE);
  }
  XML_SetUnknownEncodingHandler(parser, unknown_encoding_handler, NULL);
  do {
    /* The parser owns the chunk buffer; we only fill it. */
    void *	chunk = XML_GetBuffer(parser, BUFFER_SIZE);

    if (NULL == chunk) {
      fprintf(stderr, "error getting parser buffer\n");
      goto parser_error;
    }
    nread = fread(chunk, 1, BUFFER_SIZE, stdin);
    if (ferror(stdin)) {
      fprintf(stderr, "error reading input\n");
      goto parser_error;
    }
    /* An empty read marks the end of the document for the parser. */
    if (XML_STATUS_OK != XML_ParseBuffer(parser, (int)nread, (0 == nread))) {
      fprintf(stderr, "error parsing input: %s\n",
	      XML_ErrorString(XML_GetErrorCode(parser)));
      goto parser_error;
    }
  } while (0 != nread);

  XML_ParserFree(parser);
  fprintf(stderr, "Success!!!\n");
  exit(EXIT_SUCCESS);

 parser_error:
  XML_ParserFree(parser);
  exit(EXIT_FAILURE);
}
/* end of file */
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment