rprichard/test.c

## test.c
// Copyright 2017 Google LLC.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include <locale.h>
#include <stdio.h>
#include <wchar.h>

// Probes how the C library's sscanf() handles the wide scanset conversion
// "%l[...]" when the input and/or the scanset contain multi-byte UTF-8
// sequences. Each block performs one scan into a zero-initialized wide buffer
// and prints sscanf's return value followed by the converted wide string; the
// comments record the output the original author noted for each libc.
//
// Results are reported on stdout only; the function always returns 0.
int main(void) {
  // The probes are only meaningful in a UTF-8 locale. Warn on stderr (so the
  // stdout results stay unchanged) if the requested locale is unavailable.
  if (setlocale(LC_ALL, "en_US.UTF-8") == NULL) {
    fprintf(stderr,
            "warning: setlocale(LC_ALL, \"en_US.UTF-8\") failed; "
            "results may not reflect UTF-8 behavior\n");
  }
  {
    // With Bionic, a non-ASCII character is always a match:
    //
    // Bionic prints: 1 xĀy
    // glibc/musl/FreeBSD print: 1 x
    //
    wchar_t wbuf[16] = { 0 };
    const int ret = sscanf(
      "x" "\xc4\x80" "yz",
      "%l[xy]",
      wbuf);
    printf("%d %ls\n", ret, wbuf);
  }
  {
    // On FreeBSD, a non-ASCII character is never a match. (Test 1/2)
    // Here the non-ASCII character is listed explicitly in the scanset.
    //
    // Bionic/glibc/musl print: 1 aĀb
    // FreeBSD prints: 1 a
    wchar_t wbuf[16] = { 0 };
    const int ret = sscanf(
      "a" "\xc4\x80" "bx",
      "%l[" "a" "\xc4\x80" "b" "]",
      wbuf);
    printf("%d %ls\n", ret, wbuf);
  }
  {
    // On FreeBSD, a non-ASCII character is never a match. (Test 2/2)
    // Here the scanset is negated, so the non-ASCII character is not excluded.
    //
    // Bionic/glibc/musl print: 1 aĀb
    // FreeBSD prints: 1 a
    wchar_t wbuf[16] = { 0 };
    const int ret = sscanf(
      "a" "\xc4\x80" "bx",
      "%l[^xy]",
      wbuf);
    printf("%d %ls\n", ret, wbuf);
  }
  {
    // glibc/musl apparently do the match using narrow chars first, then
    // convert to wide chars on output.
    //
    // glibc/musl prints: 1 xĀ [U+0100]
    wchar_t wbuf[16] = { 0 };
    const int ret = sscanf(
      "x"
        "\xc4\x80" // Matches the C4 from scanset char #2 and the 80 from scanset char #1.
        "\xc6\x82" // Neither byte matches.
      "yz",
      "%l[xy"
        "\xc5\x80" // Scanset char #1.
        "\xc4\x81" // Scanset char #2.
      "]", wbuf);
    // %x requires an unsigned int argument; wchar_t may be a different type.
    printf("%d %ls [U+%04x]\n", ret, wbuf, (unsigned int)wbuf[1]);
  }
  {
    // What if we only match part of a codepoint?
    //
    // musl prints:   -1 x [U+0078 U+0000 U+0000]
    // glibc prints:   0 x [U+0078 U+0000 U+0000]  (presumably glibc; the
    //                                              original note said "gcc")
    // FreeBSD prints: 1 x [U+0078 U+0000 U+0000]
    wchar_t wbuf[16] = { 0 };
    const int ret = sscanf(
      "x"
        "\xc4\x82" // Only the first byte matches.
      "yz",
      "%l[xy"
        "\xc5\x80"
        "\xc4\x81"
      "]", wbuf);
    // %x requires unsigned int arguments; wchar_t may be a different type.
    printf("%d %ls [U+%04x U+%04x U+%04x]\n", ret, wbuf,
           (unsigned int)wbuf[0], (unsigned int)wbuf[1], (unsigned int)wbuf[2]);
  }
  return 0;
}
	// Copyright 2017 Google LLC.
	//
	// Licensed under the Apache License, Version 2.0 (the "License");
	// you may not use this file except in compliance with the License.
	// You may obtain a copy of the License at
	//
	// https://www.apache.org/licenses/LICENSE-2.0
	//
	// Unless required by applicable law or agreed to in writing, software
	// distributed under the License is distributed on an "AS IS" BASIS,
	// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	// See the License for the specific language governing permissions and
	// limitations under the License.

	#include <locale.h>
	#include <stdio.h>
	#include <wchar.h>

	// Probes how the C library's sscanf() handles the wide scanset conversion
	// "%l[...]" when the input and/or the scanset contain multi-byte UTF-8
	// sequences. Each block performs one scan into a zero-initialized wide buffer
	// and prints sscanf's return value followed by the converted wide string; the
	// comments record the output the original author noted for each libc.
	//
	// Results are reported on stdout only; the function always returns 0.
	int main(void) {
	  // The probes are only meaningful in a UTF-8 locale. Warn on stderr (so the
	  // stdout results stay unchanged) if the requested locale is unavailable.
	  if (setlocale(LC_ALL, "en_US.UTF-8") == NULL) {
	    fprintf(stderr,
	            "warning: setlocale(LC_ALL, \"en_US.UTF-8\") failed; "
	            "results may not reflect UTF-8 behavior\n");
	  }
	  {
	    // With Bionic, a non-ASCII character is always a match:
	    //
	    // Bionic prints: 1 xĀy
	    // glibc/musl/FreeBSD print: 1 x
	    //
	    wchar_t wbuf[16] = { 0 };
	    const int ret = sscanf(
	      "x" "\xc4\x80" "yz",
	      "%l[xy]",
	      wbuf);
	    printf("%d %ls\n", ret, wbuf);
	  }
	  {
	    // On FreeBSD, a non-ASCII character is never a match. (Test 1/2)
	    // Here the non-ASCII character is listed explicitly in the scanset.
	    //
	    // Bionic/glibc/musl print: 1 aĀb
	    // FreeBSD prints: 1 a
	    wchar_t wbuf[16] = { 0 };
	    const int ret = sscanf(
	      "a" "\xc4\x80" "bx",
	      "%l[" "a" "\xc4\x80" "b" "]",
	      wbuf);
	    printf("%d %ls\n", ret, wbuf);
	  }
	  {
	    // On FreeBSD, a non-ASCII character is never a match. (Test 2/2)
	    // Here the scanset is negated, so the non-ASCII character is not excluded.
	    //
	    // Bionic/glibc/musl print: 1 aĀb
	    // FreeBSD prints: 1 a
	    wchar_t wbuf[16] = { 0 };
	    const int ret = sscanf(
	      "a" "\xc4\x80" "bx",
	      "%l[^xy]",
	      wbuf);
	    printf("%d %ls\n", ret, wbuf);
	  }
	  {
	    // glibc/musl apparently do the match using narrow chars first, then
	    // convert to wide chars on output.
	    //
	    // glibc/musl prints: 1 xĀ [U+0100]
	    wchar_t wbuf[16] = { 0 };
	    const int ret = sscanf(
	      "x"
	        "\xc4\x80" // Matches the C4 from scanset char #2 and the 80 from scanset char #1.
	        "\xc6\x82" // Neither byte matches.
	      "yz",
	      "%l[xy"
	        "\xc5\x80" // Scanset char #1.
	        "\xc4\x81" // Scanset char #2.
	      "]", wbuf);
	    // %x requires an unsigned int argument; wchar_t may be a different type.
	    printf("%d %ls [U+%04x]\n", ret, wbuf, (unsigned int)wbuf[1]);
	  }
	  {
	    // What if we only match part of a codepoint?
	    //
	    // musl prints:   -1 x [U+0078 U+0000 U+0000]
	    // glibc prints:   0 x [U+0078 U+0000 U+0000]  (presumably glibc; the
	    //                                              original note said "gcc")
	    // FreeBSD prints: 1 x [U+0078 U+0000 U+0000]
	    wchar_t wbuf[16] = { 0 };
	    const int ret = sscanf(
	      "x"
	        "\xc4\x82" // Only the first byte matches.
	      "yz",
	      "%l[xy"
	        "\xc5\x80"
	        "\xc4\x81"
	      "]", wbuf);
	    // %x requires unsigned int arguments; wchar_t may be a different type.
	    printf("%d %ls [U+%04x U+%04x U+%04x]\n", ret, wbuf,
	           (unsigned int)wbuf[0], (unsigned int)wbuf[1], (unsigned int)wbuf[2]);
	  }
	  return 0;
	}