Skip to content

Instantly share code, notes, and snippets.

@rprichard
Created December 12, 2017 05:01
Show Gist options
  • Save rprichard/600107e4eeeac3849ba0cf2b43e866d9 to your computer and use it in GitHub Desktop.
Save rprichard/600107e4eeeac3849ba0cf2b43e866d9 to your computer and use it in GitHub Desktop.
sscanf %l[...] handling
// Copyright 2017 Google LLC.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <locale.h>
#include <stdio.h>
#include <wchar.h>
// Exercises sscanf's "%l[...]" (wide scanset) conversion on UTF-8 input and
// prints the return value plus whatever was stored, so the output can be
// compared across C libraries. The per-test comments record the output
// observed on each libc.
//
// Always returns 0; the program's result is what it prints on stdout.
int main(void) {
    // Every test depends on a UTF-8 locale for the multibyte -> wide
    // conversion. Warn (on stderr, so stdout stays comparable) rather than
    // silently producing results for the wrong locale.
    if (setlocale(LC_ALL, "en_US.UTF-8") == NULL) {
        fprintf(stderr,
                "warning: setlocale(LC_ALL, \"en_US.UTF-8\") failed; "
                "results below were not produced in a UTF-8 locale\n");
    }

    {
        // With Bionic, a non-ASCII character is always a match:
        //
        // Bionic prints: 1 xĀy
        // glibc/musl/FreeBSD print: 1 x
        //
        wchar_t wbuf[16] = { 0 };
        const int ret = sscanf(
            "x" "\xc4\x80" "yz",
            "%l[xy]",
            wbuf);
        printf("%d %ls\n", ret, wbuf);
    }

    {
        // On FreeBSD, a non-ASCII character is never a match. (Test 1/2)
        //
        // Bionic/glibc/musl print: 1 aĀb
        // FreeBSD prints: 1 a
        wchar_t wbuf[16] = { 0 };
        const int ret = sscanf(
            "a" "\xc4\x80" "bx",
            "%l[" "a" "\xc4\x80" "b" "]",
            wbuf);
        printf("%d %ls\n", ret, wbuf);
    }

    {
        // On FreeBSD, a non-ASCII character is never a match. (Test 2/2)
        //
        // Bionic/glibc/musl print: 1 aĀb
        // FreeBSD prints: 1 a
        wchar_t wbuf[16] = { 0 };
        const int ret = sscanf(
            "a" "\xc4\x80" "bx",
            "%l[^xy]",
            wbuf);
        printf("%d %ls\n", ret, wbuf);
    }

    {
        // glibc/musl apparently do the match using narrow chars first, then
        // convert to wide chars on output.
        //
        // glibc/musl prints: 1 xĀ [U+0100]
        wchar_t wbuf[16] = { 0 };
        const int ret = sscanf(
            "x"
            "\xc4\x80" // Matches the C4 from char #2 and the 80 from char #1.
            "\xc6\x82" // Neither byte matches.
            "yz",
            "%l[xy"
            "\xc5\x80"
            "\xc4\x81"
            "]", wbuf);
        // %x requires an unsigned int argument; wchar_t is not guaranteed to
        // be one, so convert explicitly.
        printf("%d %ls [U+%04x]\n", ret, wbuf, (unsigned int)wbuf[1]);
    }

    {
        // What if we only match part of a codepoint?
        //
        // musl prints: -1 x [U+0078 U+0000 U+0000]
        // glibc prints: 0 x [U+0078 U+0000 U+0000]
        // FreeBSD prints: 1 x [U+0078 U+0000 U+0000]
        wchar_t wbuf[16] = { 0 };
        const int ret = sscanf(
            "x"
            "\xc4\x82" // Only the first byte matches.
            "yz",
            "%l[xy"
            "\xc5\x80"
            "\xc4\x81"
            "]", wbuf);
        // Same explicit conversion as above so the arguments match %x.
        printf("%d %ls [U+%04x U+%04x U+%04x]\n", ret, wbuf,
               (unsigned int)wbuf[0],
               (unsigned int)wbuf[1],
               (unsigned int)wbuf[2]);
    }

    return 0;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment