nokute78/arm_string.c

## arm_string.c
/* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */

/*  Fluent Bit
 *  ==========
 *  Copyright (C) 2019-2021 The Fluent Bit Authors
 *  Copyright (C) 2015-2018 Treasure Data Inc.
 *
 *  Licensed under the Apache License, Version 2.0 (the "License");
 *  you may not use this file except in compliance with the License.
 *  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 *  Unless required by applicable law or agreed to in writing, software
 *  distributed under the License is distributed on an "AS IS" BASIS,
 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *  See the License for the specific language governing permissions and
 *  limitations under the License.
 */

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <stdint.h>

/* Returns 1 when c is one of the octal digits '0'..'7', otherwise 0. */
static int octal_digit(char c)
{
    if (c < '0' || c > '7') {
        return 0;
    }
    return 1;
}

/* Returns 1 when c is an ASCII hexadecimal digit (0-9, A-F, a-f), otherwise 0. */
static int hex_digit(char c)
{
    if (c >= '0' && c <= '9') {
        return 1;
    }
    if (c >= 'a' && c <= 'f') {
        return 1;
    }
    if (c >= 'A' && c <= 'F') {
        return 1;
    }
    return 0;
}

/*
 * Encode the code point ch as UTF-8 into dest.
 *
 * dest must have room for up to 4 bytes; no terminator is written.
 * Returns the number of bytes stored (1 to 4), or 0 without touching
 * dest when ch is 0x110000 or larger. Values in the surrogate range
 * are not rejected; they are encoded as ordinary 3-byte sequences.
 */
static int u8_wc_toutf8(char *dest, uint32_t ch)
{
    int len;
    int idx;
    uint32_t lead;

    if (ch < 0x80) {
        dest[0] = (char)ch;
        return 1;
    }
    else if (ch < 0x800) {
        len = 2;
        lead = 0xC0;
    }
    else if (ch < 0x10000) {
        len = 3;
        lead = 0xE0;
    }
    else if (ch < 0x110000) {
        len = 4;
        lead = 0xF0;
    }
    else {
        return 0;
    }

    /* Fill continuation bytes from the tail, peeling 6 bits at a time. */
    for (idx = len - 1; idx > 0; idx--) {
        dest[idx] = (char)((ch & 0x3F) | 0x80);
        ch >>= 6;
    }
    /* Whatever bits remain belong in the lead byte. */
    dest[0] = (char)(ch | lead);

    return len;
}

/*
 * Decode a single escape sequence.
 *
 * str   points at the character right after the backslash
 * size  number of readable characters at str; must be at least 1
 * dest  receives the decoded value
 *
 * Recognised forms are the single letters n, t, r, b, f, v and a, one to
 * three octal digits, 'x' followed by up to 2 hex digits, 'u' followed by
 * up to 4 hex digits and 'U' followed by up to 8 hex digits. Any other
 * character, and x/u/U with no hex digit after it, decodes to the
 * character itself.
 *
 * Returns the number of input characters consumed, counted from str.
 */
static int u8_read_escape_sequence(const char *str, int size, uint32_t *dest)
{
    uint32_t ch;
    char digs[9] = {0};
    int dno = 0;
    int i = 1;
    int base = 16;
    int max_digits = 0;   /* 0 means "no numeric payload to parse" */

    /*
     * Default: take the character literally. The explicit signed char
     * cast makes bytes >= 0x80 sign-extend to a value above 0x10FFFF
     * whether plain char is signed or unsigned on the target;
     * flb_unescape_string_utf8 writes such out-of-range values back as a
     * single raw byte instead of re-encoding them.
     */
    ch = (uint32_t) (signed char) str[0];

    switch (str[0]) {
    case 'n':
        ch = '\n';
        break;
    case 't':
        ch = '\t';
        break;
    case 'r':
        ch = '\r';
        break;
    case 'b':
        ch = '\b';
        break;
    case 'f':
        ch = '\f';
        break;
    case 'v':
        ch = '\v';
        break;
    case 'a':
        ch = '\a';
        break;
    case 'x':
        max_digits = 2;
        break;
    case 'u':
        max_digits = 4;
        break;
    case 'U':
        max_digits = 8;
        break;
    default:
        if (octal_digit(str[0])) {
            /* The first character is itself part of the number. */
            i = 0;
            base = 8;
            max_digits = 3;
        }
        break;
    }

    if (max_digits > 0) {
        while (i < size && dno < max_digits &&
               (base == 8 ? octal_digit(str[i]) : hex_digit(str[i]))) {
            digs[dno++] = str[i++];
        }
        if (dno > 0) {
            /*
             * strtoul rather than strtol: eight hex digits can exceed
             * LONG_MAX where long is 32 bits, and strtol would then
             * saturate instead of returning the value.
             */
            ch = (uint32_t) strtoul(digs, NULL, base);
        }
    }
    *dest = ch;

    return i;
}

/*
 * Unescape a backslash-escaped string.
 *
 * in_buf   input; processing stops after sz bytes or at the first NUL
 * sz       number of input bytes to consider
 * out_buf  destination; at most sz bytes are stored followed by a NUL,
 *          so it must provide at least sz + 1 bytes
 *
 * Simple escapes (\" \' \\ \/ \n \b \t \f \r) are handled inline; every
 * other escape is delegated to u8_read_escape_sequence and the resulting
 * code point is written as UTF-8. Bytes >= 0x80 are copied through
 * unchanged so input that is already UTF-8 is preserved regardless of
 * whether plain char is signed or unsigned on the target.
 *
 * Diagnostics are printed to stdout. Returns the number of bytes stored
 * in out_buf, not counting the terminating NUL.
 */
int flb_unescape_string_utf8(const char *in_buf, int sz, char *out_buf)
{
    uint32_t ch;
    char temp[4];
    const char *end;
    const char *next;
    int raw;              /* non-zero: emit the low byte of ch untouched */
    int count_out = 0;
    int count_in = 0;
    int esc_in = 0;
    int esc_out = 0;

    end = in_buf + sz;
    while (in_buf < end && *in_buf && count_in < sz) {
        next = in_buf + 1;
        raw = 0;
        if (next < end && *in_buf == '\\') {
            esc_in = 2;
            switch (*next) {
            case '"':
            case '\'':
            case '\\':
            case '/':
                /* The escaped character stands for itself. */
                ch = (uint32_t) (unsigned char) *next;
                break;
            case 'n':
                ch = '\n';
                break;
            case 'b':
                ch = '\b';
                break;
            case 't':
                ch = '\t';
                break;
            case 'f':
                ch = '\f';
                break;
            case 'r':
                ch = '\r';
                break;
            default:
                if ((unsigned char) *next >= 0x80) {
                    /* Backslash before a non-ASCII byte: keep the byte. */
                    ch = (unsigned char) *next;
                    raw = 1;
                }
                else {
                    /* next < end, so at least one byte is available. */
                    esc_in = u8_read_escape_sequence(next, (int) (end - next),
                                                     &ch) + 1;
                }
                break;
            }
        }
        else {
            ch = (unsigned char) *in_buf;
            raw = (ch >= 0x80);
            esc_in = 1;
        }

        in_buf += esc_in;
        count_in += esc_in;

        esc_out = raw ? 0 : u8_wc_toutf8(temp, ch);
        if (esc_out == 0) {
            /* Raw byte, or a value that is not a valid code point. */
            temp[0] = (char) (ch & 0xFF);
            esc_out = 1;
        }

        /* Check the final length so the raw-byte case is bounded too. */
        if (esc_out > sz - count_out) {
            printf("Crossing over string boundary");
            break;
        }

        memcpy(&out_buf[count_out], temp, (size_t) esc_out);
        count_out += esc_out;
    }
    if (count_in < sz) {
        /* Bound the print: the input need not be NUL-terminated within sz. */
        printf("Not at boundary but still NULL terminating : %d - '%.*s'",
               sz, (int) (end - in_buf), in_buf);
    }
    out_buf[count_out] = '\0';
    return count_out;
}

/*
 * Small driver: runs two 3-byte UTF-8 sequences (U+3042, U+3043) through
 * flb_unescape_string_utf8 and prints the input bytes and the result so
 * they can be compared.
 */
int main(void)
{
    static const unsigned char sample[] = {
        0xe3, 0x81, 0x82,
        0xe3, 0x81, 0x83
    };
    char input[128] = {0};
    char output[128];
    int size = (int) sizeof(sample);
    int out_len;

    /* Plain char buffers match the prototypes of the functions called. */
    memcpy(input, sample, sizeof(sample));

    out_len = flb_unescape_string_utf8(input, size, output);

    fwrite(input, (size_t) size, 1, stdout);
    puts("\n");
    fwrite(output, (size_t) out_len, 1, stdout);
    puts("\n");

    return 0;
}
	/* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */

	/* Fluent Bit
	* ==========
	* Copyright (C) 2019-2021 The Fluent Bit Authors
	* Copyright (C) 2015-2018 Treasure Data Inc.
	*
	* Licensed under the Apache License, Version 2.0 (the "License");
	* you may not use this file except in compliance with the License.
	* You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/

	#include <stdio.h>
	#include <stdlib.h>
	#include <string.h>
	#include <stdint.h>

	/* Returns 1 when c is one of the octal digits '0'..'7', otherwise 0. */
	static int octal_digit(char c)
	{
	    if (c < '0' || c > '7') {
	        return 0;
	    }
	    return 1;
	}

	/* Returns 1 when c is an ASCII hexadecimal digit (0-9, A-F, a-f), otherwise 0. */
	static int hex_digit(char c)
	{
	    return ((c >= '0' && c <= '9') ||
	            (c >= 'A' && c <= 'F') ||
	            (c >= 'a' && c <= 'f'));
	}

	/*
	 * Encode the code point ch as UTF-8 into dest.
	 *
	 * dest must have room for up to 4 bytes; no terminator is written.
	 * Returns the number of bytes stored (1 to 4), or 0 without touching
	 * dest when ch is 0x110000 or larger.
	 */
	static int u8_wc_toutf8(char *dest, uint32_t ch)
	{
	    if (ch < 0x80) {
	        dest[0] = (char)ch;
	        return 1;
	    }
	    if (ch < 0x800) {
	        dest[0] = (char)((ch >> 6) | 0xC0);
	        dest[1] = (char)((ch & 0x3F) | 0x80);
	        return 2;
	    }
	    if (ch < 0x10000) {
	        dest[0] = (char)((ch >> 12) | 0xE0);
	        dest[1] = (char)(((ch >> 6) & 0x3F) | 0x80);
	        dest[2] = (char)((ch & 0x3F) | 0x80);
	        return 3;
	    }
	    if (ch < 0x110000) {
	        dest[0] = (char)((ch >> 18) | 0xF0);
	        dest[1] = (char)(((ch >> 12) & 0x3F) | 0x80);
	        dest[2] = (char)(((ch >> 6) & 0x3F) | 0x80);
	        dest[3] = (char)((ch & 0x3F) | 0x80);
	        return 4;
	    }
	    return 0;
	}

	/*
	 * Decode a single escape sequence.
	 *
	 * str   points at the character right after the backslash
	 * size  number of readable characters at str; must be at least 1
	 * dest  receives the decoded value
	 *
	 * Recognised forms are the single letters n, t, r, b, f, v and a, one to
	 * three octal digits, 'x' followed by up to 2 hex digits, 'u' followed by
	 * up to 4 hex digits and 'U' followed by up to 8 hex digits. Any other
	 * character, and x/u/U with no hex digit after it, decodes to the
	 * character itself.
	 *
	 * Returns the number of input characters consumed, counted from str.
	 */
	static int u8_read_escape_sequence(const char *str, int size, uint32_t *dest)
	{
	    uint32_t ch;
	    char digs[9] = {0};
	    int dno = 0;
	    int i = 1;
	    int base = 16;
	    int max_digits = 0;   /* 0 means "no numeric payload to parse" */

	    /*
	     * Default: take the character literally. The explicit signed char
	     * cast makes bytes >= 0x80 sign-extend to a value above 0x10FFFF
	     * whether plain char is signed or unsigned on the target;
	     * flb_unescape_string_utf8 writes such out-of-range values back as a
	     * single raw byte instead of re-encoding them.
	     */
	    ch = (uint32_t) (signed char) str[0];

	    switch (str[0]) {
	    case 'n':
	        ch = '\n';
	        break;
	    case 't':
	        ch = '\t';
	        break;
	    case 'r':
	        ch = '\r';
	        break;
	    case 'b':
	        ch = '\b';
	        break;
	    case 'f':
	        ch = '\f';
	        break;
	    case 'v':
	        ch = '\v';
	        break;
	    case 'a':
	        ch = '\a';
	        break;
	    case 'x':
	        max_digits = 2;
	        break;
	    case 'u':
	        max_digits = 4;
	        break;
	    case 'U':
	        max_digits = 8;
	        break;
	    default:
	        if (octal_digit(str[0])) {
	            /* The first character is itself part of the number. */
	            i = 0;
	            base = 8;
	            max_digits = 3;
	        }
	        break;
	    }

	    if (max_digits > 0) {
	        while (i < size && dno < max_digits &&
	               (base == 8 ? octal_digit(str[i]) : hex_digit(str[i]))) {
	            digs[dno++] = str[i++];
	        }
	        if (dno > 0) {
	            /*
	             * strtoul rather than strtol: eight hex digits can exceed
	             * LONG_MAX where long is 32 bits, and strtol would then
	             * saturate instead of returning the value.
	             */
	            ch = (uint32_t) strtoul(digs, NULL, base);
	        }
	    }
	    *dest = ch;

	    return i;
	}

	/*
	 * Unescape a backslash-escaped string.
	 *
	 * in_buf   input; processing stops after sz bytes or at the first NUL
	 * sz       number of input bytes to consider
	 * out_buf  destination; at most sz bytes are stored followed by a NUL,
	 *          so it must provide at least sz + 1 bytes
	 *
	 * Simple escapes (\" \' \\ \/ \n \b \t \f \r) are handled inline; every
	 * other escape is delegated to u8_read_escape_sequence and the resulting
	 * code point is written as UTF-8. Bytes >= 0x80 are copied through
	 * unchanged so input that is already UTF-8 is preserved regardless of
	 * whether plain char is signed or unsigned on the target.
	 *
	 * Diagnostics are printed to stdout. Returns the number of bytes stored
	 * in out_buf, not counting the terminating NUL.
	 */
	int flb_unescape_string_utf8(const char *in_buf, int sz, char *out_buf)
	{
	    uint32_t ch;
	    char temp[4];
	    const char *end;
	    const char *next;
	    int raw;              /* non-zero: emit the low byte of ch untouched */
	    int count_out = 0;
	    int count_in = 0;
	    int esc_in = 0;
	    int esc_out = 0;

	    end = in_buf + sz;
	    while (in_buf < end && *in_buf && count_in < sz) {
	        next = in_buf + 1;
	        raw = 0;
	        if (next < end && *in_buf == '\\') {
	            esc_in = 2;
	            switch (*next) {
	            case '"':
	            case '\'':
	            case '\\':
	            case '/':
	                /* The escaped character stands for itself. */
	                ch = (uint32_t) (unsigned char) *next;
	                break;
	            case 'n':
	                ch = '\n';
	                break;
	            case 'b':
	                ch = '\b';
	                break;
	            case 't':
	                ch = '\t';
	                break;
	            case 'f':
	                ch = '\f';
	                break;
	            case 'r':
	                ch = '\r';
	                break;
	            default:
	                if ((unsigned char) *next >= 0x80) {
	                    /* Backslash before a non-ASCII byte: keep the byte. */
	                    ch = (unsigned char) *next;
	                    raw = 1;
	                }
	                else {
	                    /* next < end, so at least one byte is available. */
	                    esc_in = u8_read_escape_sequence(next, (int) (end - next),
	                                                     &ch) + 1;
	                }
	                break;
	            }
	        }
	        else {
	            ch = (unsigned char) *in_buf;
	            raw = (ch >= 0x80);
	            esc_in = 1;
	        }

	        in_buf += esc_in;
	        count_in += esc_in;

	        esc_out = raw ? 0 : u8_wc_toutf8(temp, ch);
	        if (esc_out == 0) {
	            /* Raw byte, or a value that is not a valid code point. */
	            temp[0] = (char) (ch & 0xFF);
	            esc_out = 1;
	        }

	        /* Check the final length so the raw-byte case is bounded too. */
	        if (esc_out > sz - count_out) {
	            printf("Crossing over string boundary");
	            break;
	        }

	        memcpy(&out_buf[count_out], temp, (size_t) esc_out);
	        count_out += esc_out;
	    }
	    if (count_in < sz) {
	        /* Bound the print: the input need not be NUL-terminated within sz. */
	        printf("Not at boundary but still NULL terminating : %d - '%.*s'",
	               sz, (int) (end - in_buf), in_buf);
	    }
	    out_buf[count_out] = '\0';
	    return count_out;
	}

	/*
	 * Small driver: runs two 3-byte UTF-8 sequences (U+3042, U+3043) through
	 * flb_unescape_string_utf8 and prints the input bytes and the result so
	 * they can be compared.
	 */
	int main(void)
	{
	    static const unsigned char sample[] = {
	        0xe3, 0x81, 0x82,
	        0xe3, 0x81, 0x83
	    };
	    char input[128] = {0};
	    char output[128];
	    int size = (int) sizeof(sample);
	    int out_len;

	    /* Plain char buffers match the prototypes of the functions called. */
	    memcpy(input, sample, sizeof(sample));

	    out_len = flb_unescape_string_utf8(input, size, output);

	    fwrite(input, (size_t) size, 1, stdout);
	    puts("\n");
	    fwrite(output, (size_t) out_len, 1, stdout);
	    puts("\n");

	    return 0;
	}