bnoordhuis/test-utf8.c

## test-utf8.c
/**
 * Copyright (C) 2011 by Ben Noordhuis <info@bnoordhuis.nl>
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */
#include "utf8.h"

#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include <assert.h>
#include <errno.h>

/**
 * Encodes each of the `size` code points in `input` with utf8_enc() and
 * asserts that the outcome matches the corresponding entry of `expected`.
 *
 * For a successful encode, expected[i] is the string holding the expected
 * bytes. For a failed encode, expected[i] is the negative return value cast
 * to a pointer.
 */
static void test_enc(const int input[], const int size, const char **expected) {
  char encoded[4];
  int idx;

  for (idx = 0; idx < size; idx++) {
    const char *want = expected[idx];
    const int rc = utf8_enc((unsigned long) input[idx], encoded, sizeof(encoded));

    if (rc >= 0) {
      /* success: same length and same bytes as the expected string */
      assert(rc == (int) strlen(want));
      assert(0 == memcmp(want, encoded, rc));
      continue;
    }

    /* failure: the error code travels through the pointer-typed table */
    assert((const char *) (size_t) rc == want);
  }
}

/**
 * Decodes `size` consecutive code points from the NUL-terminated string
 * `input` with utf8_dec() and asserts that the i-th result equals
 * expected[i].
 */
static void test_dec(const char *input, const int expected[], const int size) {
  const char *cursor = input;
  const char *const end = input + strlen(input);
  int n = 0;

  while (n < size) {
    /* utf8_dec() is handed the number of bytes left before the terminator */
    const int decoded = utf8_dec(&cursor, end - cursor);
    assert(decoded == expected[n]);
    n++;
  }
}

/**
 * Test driver.
 *
 * Runs a fixed set of fixtures through test_enc() and then test_dec(). Both
 * helpers report a mismatch through assert(), so with assertions enabled,
 * reaching the final return means every fixture matched.
 *
 * Returns 0.
 */
int main(void) {
  /*
   * Encoding: code point -> UTF-8 bytes
   */

  /* plain ASCII letters */
  {
    const int codepoints[] = { 't', 'e', 's', 't' };
    const char *want[] = { "t", "e", "s", "t", 0 };
    test_enc(codepoints, sizeof codepoints / sizeof *codepoints, want);
  }

  /* U+00FC in the middle of ASCII letters */
  {
    const int codepoints[] = { 'b', 252, 'c', 'h', 'e', 'r' };
    const char *want[] = { "b", "ü", "c", "h", "e", "r", 0 };
    test_enc(codepoints, sizeof codepoints / sizeof *codepoints, want);
  }

  /* U+20AC */
  {
    const int codepoints[] = { 0x20AC };
    const char *want[] = { "€" };
    test_enc(codepoints, sizeof codepoints / sizeof *codepoints, want);
  }

  /* U+24B62, expected as four explicit bytes */
  {
    const int codepoints[] = { 0x024B62 };
    const char *want[] = { "\xF0\xA4\xAD\xA2" };
    test_enc(codepoints, sizeof codepoints / sizeof *codepoints, want);
  }

  /*
   * -1 is not a code point: the encoder must answer -EINVAL, which is stored
   * in the table disguised as a pointer because that is what test_enc()
   * compares an error result against.
   */
  {
    const int codepoints[] = { -1 };
    const char *want[] = { (const char *) -EINVAL };
    test_enc(codepoints, sizeof codepoints / sizeof *codepoints, want);
  }

  /*
   * Decoding: UTF-8 bytes -> code point
   */

  /* plain ASCII letters */
  {
    const int want[] = { 't', 'e', 's', 't' };
    test_dec("test", want, sizeof want / sizeof *want);
  }

  /* U+00FC in the middle of ASCII letters */
  {
    const int want[] = { 'b', 252, 'c', 'h', 'e', 'r' };
    test_dec("bücher", want, sizeof want / sizeof *want);
  }

  /* U+20AC */
  {
    const int want[] = { 0x20AC };
    test_dec("€", want, sizeof want / sizeof *want);
  }

  /* U+24B62 from four explicit bytes */
  {
    const int want[] = { 0x024B62 };
    test_dec("\xF0\xA4\xAD\xA2", want, sizeof want / sizeof *want);
  }

  /* one, two and three 0xFF bytes: the first decode must yield -1 */
  {
    const int want[] = { -1 };
    test_dec("\xFF", want, sizeof want / sizeof *want);
  }
  {
    const int want[] = { -1 };
    test_dec("\xFF\xFF", want, sizeof want / sizeof *want);
  }
  {
    const int want[] = { -1 };
    test_dec("\xFF\xFF\xFF", want, sizeof want / sizeof *want);
  }

  /*
   * Disabled case: four 0xFF bytes are outside any plane and should fail
   * as well.
   *
   * {
   *   const int want[] = { -1 };
   *   test_dec("\xFF\xFF\xFF\xFF", want, sizeof want / sizeof *want);
   * }
   */
  return 0;
}

## utf8.h
/**
 * Copyright (C) 2011 by Ben Noordhuis <info@bnoordhuis.nl>
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */
#ifndef UTF8_H_
#define UTF8_H_

#include <stddef.h>
#include <errno.h>

/**
 * Converts a Unicode code point to UTF-8 and writes it to the output buffer.
 *
 * Returns the number of bytes written (1-4) on success or a negative value
 * on error.
 *
 * There are two possible errors:
 *
 *   -E2BIG   output buffer too small
 *   -EINVAL  illegal code point
 *
 */
static int utf8_enc(unsigned long codepoint, char *dst, size_t size);

/**
 * Decodes the UTF-8 sequence at *s into its Unicode code point and advances
 * *s past the bytes that were consumed.
 *
 * Returns the code point on success or -1 on error.
 */
static long utf8_dec(const char **s, size_t size);

/**
 * Encodes a single Unicode code point as UTF-8.
 *
 *   codepoint  value to encode; must lie in 0..0x10FFFF and must not be a
 *              UTF-16 surrogate (0xD800..0xDFFF)
 *   dst        output buffer; no terminating NUL is appended
 *   size       capacity of dst in bytes
 *
 * Returns the number of bytes written (1-4) on success, or a negative value
 * on error:
 *
 *   -E2BIG   dst is too small for the encoded sequence
 *   -EINVAL  codepoint is a surrogate or lies above 0x10FFFF
 *
 * Nothing is written to dst when an error is returned.
 */
static int utf8_enc(unsigned long codepoint, char *dst, size_t size) {
  size_t need;

  /* Work out the sequence length first so dst is only touched on success. */
  if (codepoint <= 0x7F) {
    need = 1;
  }
  else if (codepoint <= 0x7FF) {
    need = 2;
  }
  else if (codepoint <= 0xFFFF) {
    /* Surrogates are reserved for UTF-16 and have no UTF-8 encoding. */
    if (codepoint >= 0xD800 && codepoint <= 0xDFFF) {
      return -EINVAL;
    }
    need = 3;
  }
  else if (codepoint <= 0x10FFFF) {
    need = 4;
  }
  else {
    return -EINVAL;
  }

  if (size < need) {
    return -E2BIG;
  }

  switch (need) {
  case 1:
    dst[0] = (char) codepoint;
    break;

  case 2:
    dst[0] = (char) (0xC0 | ((codepoint >> 6) & 0x1F));
    dst[1] = (char) (0x80 | (codepoint & 0x3F));
    break;

  case 3:
    dst[0] = (char) (0xE0 | ((codepoint >> 12) & 0x0F));
    dst[1] = (char) (0x80 | ((codepoint >> 6) & 0x3F));
    dst[2] = (char) (0x80 | (codepoint & 0x3F));
    break;

  default: /* 4 */
    dst[0] = (char) (0xF0 | ((codepoint >> 18) & 0x07));
    dst[1] = (char) (0x80 | ((codepoint >> 12) & 0x3F));
    dst[2] = (char) (0x80 | ((codepoint >> 6) & 0x3F));
    dst[3] = (char) (0x80 | (codepoint & 0x3F));
    break;
  }

  return (int) need;
}

/**
 * Decodes one UTF-8 sequence into its Unicode code point.
 *
 *   s     in/out; *s points at the first byte of the sequence. On success
 *         *s is advanced past the bytes consumed, on error it is left
 *         unchanged.
 *   size  number of readable bytes starting at *s
 *
 * Returns the code point on success or -1 on error. The input is rejected
 * when:
 *
 *   - size is 0
 *   - the first byte is not a valid lead byte (a stray continuation byte or
 *     anything in 0xF8..0xFF)
 *   - fewer than the required number of bytes are available
 *   - a trailing byte is not of the form 10xxxxxx
 *   - the sequence is overlong, i.e. it uses more bytes than its value needs
 *   - the value is a UTF-16 surrogate (0xD800..0xDFFF) or exceeds 0x10FFFF
 */
static long utf8_dec(const char **s, size_t size) {
  /* Go through unsigned char so that bytes >= 0x80 compare as 128..255. */
  const unsigned char *p = (const unsigned char *) *s;
  unsigned long cp;   /* code point assembled so far */
  unsigned long min;  /* smallest value that needs `len` bytes */
  size_t len;
  size_t i;

  if (size == 0) {
    return -1;
  }

  /* Classify the lead byte by its high bits; each class has its own mask. */
  if (p[0] < 0x80) {
    len = 1;
    min = 0;
    cp = p[0];
  }
  else if ((p[0] & 0xE0) == 0xC0) {
    len = 2;
    min = 0x80;
    cp = p[0] & 0x1F;
  }
  else if ((p[0] & 0xF0) == 0xE0) {
    len = 3;
    min = 0x800;
    cp = p[0] & 0x0F;
  }
  else if ((p[0] & 0xF8) == 0xF0) {
    len = 4;
    min = 0x10000;
    cp = p[0] & 0x07;
  }
  else {
    return -1;
  }

  if (size < len) {
    return -1;
  }

  for (i = 1; i < len; i++) {
    if ((p[i] & 0xC0) != 0x80) {
      return -1;
    }
    cp = (cp << 6) | (p[i] & 0x3F);
  }

  if (cp < min || cp > 0x10FFFF) {
    return -1;
  }

  if (cp >= 0xD800 && cp <= 0xDFFF) {
    return -1;
  }

  /* Commit the new position only once the whole sequence has been accepted. */
  *s += len;
  return (long) cp;
}

#endif /* utf8.h */
	/**
	* Copyright (C) 2011 by Ben Noordhuis <info@bnoordhuis.nl>
	*
	* Permission is hereby granted, free of charge, to any person obtaining a copy
	* of this software and associated documentation files (the "Software"), to deal
	* in the Software without restriction, including without limitation the rights
	* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
	* copies of the Software, and to permit persons to whom the Software is
	* furnished to do so, subject to the following conditions:
	*
	* The above copyright notice and this permission notice shall be included in
	* all copies or substantial portions of the Software.
	*
	* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
	* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
	* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
	* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
	* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
	* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
	* THE SOFTWARE.
	*/
	#include "utf8.h"

	#include <stdio.h>
	#include <stdlib.h>
	#include <stdint.h>
	#include <string.h>
	#include <assert.h>
	#include <errno.h>

	/**
	 * Encodes each of the `size` code points in `input` with utf8_enc() and
	 * asserts that the outcome matches the corresponding entry of `expected`.
	 *
	 * For a successful encode, expected[i] is the string holding the expected
	 * bytes. For a failed encode, expected[i] is the negative return value cast
	 * to a pointer.
	 */
	static void test_enc(const int input[], const int size, const char **expected) {
	  char encoded[4];
	  int idx;

	  for (idx = 0; idx < size; idx++) {
	    const char *want = expected[idx];
	    const int rc = utf8_enc((unsigned long) input[idx], encoded, sizeof(encoded));

	    if (rc >= 0) {
	      /* success: same length and same bytes as the expected string */
	      assert(rc == (int) strlen(want));
	      assert(0 == memcmp(want, encoded, rc));
	      continue;
	    }

	    /* failure: the error code travels through the pointer-typed table */
	    assert((const char *) (size_t) rc == want);
	  }
	}

	/**
	 * Decodes `size` consecutive code points from the NUL-terminated string
	 * `input` with utf8_dec() and asserts that the i-th result equals
	 * expected[i].
	 */
	static void test_dec(const char *input, const int expected[], const int size) {
	  /* Both must be pointers: s is the read cursor, se the end of the input. */
	  const char *s, *se;
	  int i;

	  s = input;
	  se = input + strlen(input);
	  for (i = 0; i < size; i++) {
	    /* utf8_dec() is handed the number of bytes left before the terminator */
	    const int c = utf8_dec(&s, se - s);
	    assert(c == expected[i]);
	  }
	}

	/**
	 * Test driver.
	 *
	 * Runs a fixed set of fixtures through test_enc() and then test_dec(). Both
	 * helpers report a mismatch through assert(), so with assertions enabled,
	 * reaching the final return means every fixture matched.
	 *
	 * Returns 0.
	 */
	int main(void) {
	  /*
	   * encoder tests
	   */
	  {
	    const int input[] = { 't', 'e', 's', 't' };
	    const char *expected[] = { "t", "e", "s", "t", 0 };
	    test_enc(input, sizeof(input) / sizeof(input[0]), expected);
	  }
	  {
	    const int input[] = { 'b', 252, 'c', 'h', 'e', 'r' };
	    const char *expected[] = { "b", "ü", "c", "h", "e", "r", 0 };
	    test_enc(input, sizeof(input) / sizeof(input[0]), expected);
	  }
	  {
	    const int input[] = { 0x20AC };
	    const char *expected[] = { "€" };
	    test_enc(input, sizeof(input) / sizeof(input[0]), expected);
	  }
	  {
	    const int input[] = { 0x024B62 };
	    const char *expected[] = { "\xF0\xA4\xAD\xA2" };
	    test_enc(input, sizeof(input) / sizeof(input[0]), expected);
	  }
	  {
	    /*
	     * -1 is not a code point: the encoder must answer -EINVAL. The table
	     * has to be an array of pointers, so the error code is stored in it
	     * cast to a pointer; test_enc() compares error results the same way.
	     */
	    const int input[] = { -1 };
	    const char *expected[] = { (const char *) -EINVAL };
	    test_enc(input, sizeof(input) / sizeof(input[0]), expected);
	  }

	  /*
	   * decoder tests
	   */
	  {
	    const int expected[] = { 't', 'e', 's', 't' };
	    test_dec("test", expected, sizeof(expected) / sizeof(expected[0]));
	  }
	  {
	    const int expected[] = { 'b', 252, 'c', 'h', 'e', 'r' };
	    test_dec("bücher", expected, sizeof(expected) / sizeof(expected[0]));
	  }
	  {
	    const int expected[] = { 0x20AC };
	    test_dec("€", expected, sizeof(expected) / sizeof(expected[0]));
	  }
	  {
	    const int expected[] = { 0x024B62 };
	    test_dec("\xF0\xA4\xAD\xA2", expected, sizeof(expected) / sizeof(expected[0]));
	  }
	  /* one, two and three 0xFF bytes: the first decode must yield -1 */
	  {
	    const int expected[] = { -1 };
	    test_dec("\xFF", expected, sizeof(expected) / sizeof(expected[0]));
	  }
	  {
	    const int expected[] = { -1 };
	    test_dec("\xFF\xFF", expected, sizeof(expected) / sizeof(expected[0]));
	  }
	  {
	    const int expected[] = { -1 };
	    test_dec("\xFF\xFF\xFF", expected, sizeof(expected) / sizeof(expected[0]));
	  }
	  /*
	   * Disabled case: four 0xFF bytes are outside any plane and should fail
	   * as well.
	   *
	   * {
	   *   const int expected[] = { -1 };
	   *   test_dec("\xFF\xFF\xFF\xFF", expected, sizeof(expected) / sizeof(expected[0]));
	   * }
	   */
	  return 0;
	}