stevebrun/reverse-utf8-string.c

## reverse-utf8-string.c
#include <stdio.h>
#include <assert.h>

/// Count the number of bytes contained within a null-terminated string.
/// - parameter string: A null-terminated byte string.
/// - returns: The number of bytes before the null-terminator in the given string.
int string_length(char const *const string);

/// Reverse an array of bytes in-place.
/// - parameter bytes: A buffer of bytes to be reversed.
/// - parameter length: The number of bytes contained within the buffer that should be reversed.
void bytes_reverse(char *const bytes, int const length);

/// Copy an array of bytes into another byte array.
/// - parameter source: An array of bytes that should be copied.
/// - parameter length: The number of bytes to copy.
/// - parameter destination: A byte buffer to copy the bytes into.
void bytes_copy(char const *const source, int const length, char *const destination);

/// Create a string representation of the eight bits in a byte.
/// - parameter string: The byte buffer into which the string representation should be written.
/// - parameter byte: The byte to represent as a string.
void string_print_byte(char *const string, char const byte);

/// Given a UTF-8 encoded string, determine the number of bytes used to represent the first codepoint.
/// - parameter string: A UTF-8 encoded character array.
/// - returns:
///     * 0 when the first byte is a continuation byte.
///     * 1, 2, 3, or 4 for any valid UTF-8 codepoint.
///     * -1 when the first byte is invalid in UTF-8.
int utf8_string_next_codepoint_length(char const *const string);

/// Count the number of codepoints within a given UTF-8 encoded string.
/// - parameter string: A valid UTF-8 encoded null-terminated byte string.
/// - returns: The number of codepoints found before the null-terminator.
///            An invalid UTF-8 string will return a length of -1.
int utf8_string_codepoint_length(char const *const string);

/// Reverse in-place the codepoints of a UTF-8 encoded string.
/// - parameter string: A valid UTF-8 encoded null-terminated string.
void utf8_string_codepoint_reverse(char *const string);

/// Demonstrate codepoint-wise reversal of a UTF-8 encoded string.
///
/// Overwrites the dash placeholders of a sentence with multi-byte UTF-8
/// sequences, prints the sentence, reverses its codepoints in-place, and
/// prints it again.
/// - parameter argc: Unused.
/// - parameter argv: Unused.
/// - returns: 0 once both lines have been printed.
int main(int argc, char **argv) {
    // The demo takes no command-line input.
    (void)argc;
    (void)argv;

    // Raw UTF-8 encodings of the codepoints spliced into the sentence.
    char const cent[] = "\xc2\xa2";            // U+00A2, two bytes
    char const euro[] = "\xe2\x82\xac";        // U+20AC, three bytes
    char const clock[] = "\xe2\x8f\xb0";       // U+23F0, three bytes
    char const circle[] = "\xf0\x90\x8d\x88";  // U+10348, four bytes

    // Each run of dashes is exactly as wide as the encoding written over it.
    // sizeof(...) - 1 is the encoded length without the array's terminator.
    char sentence[] = "What --- is the ---- turning the --- into --?";
    bytes_copy(clock, (int)sizeof(clock) - 1, sentence + 5);
    bytes_copy(circle, (int)sizeof(circle) - 1, sentence + 16);
    bytes_copy(euro, (int)sizeof(euro) - 1, sentence + 33);
    bytes_copy(cent, (int)sizeof(cent) - 1, sentence + 42);

    printf("%s\n", sentence);
    utf8_string_codepoint_reverse(sentence);
    printf("%s\n", sentence);
    return 0;
}

/// Measure a null-terminated string by walking a cursor to its terminator.
/// - parameter string: A null-terminated byte string; asserted to be non-NULL.
/// - returns: The number of bytes that precede the null-terminator.
int string_length(char const *const string) {
    assert( string != NULL );
    char const *cursor = string;
    while (*cursor != '\0') {
        cursor += 1;
    }
    // The distance travelled is the byte count.
    return (int)(cursor - string);
}

/// Reverse an array of bytes in-place.
/// - parameter bytes: A buffer of bytes to be reversed; asserted to be non-NULL.
/// - parameter length: The number of bytes to reverse. Asserted to be
///   non-negative; a length of 0 or 1 leaves the buffer untouched.
void bytes_reverse(char *const bytes, int const length) {
    assert( bytes != NULL );
    // Zero is a legitimate length (e.g. an empty string); only negative
    // lengths are a caller error.
    assert( length >= 0 );
    // Swap the outermost pair and walk inwards until the indices meet.
    for (int start = 0, end = length - 1; start < end; start += 1, end -= 1) {
        char const swapped = bytes[end];
        bytes[end] = bytes[start];
        bytes[start] = swapped;
    }
}

/// Copy bytes from one buffer into another, lowest address first.
/// - parameter source: The bytes to read; asserted to be non-NULL.
/// - parameter length: How many bytes to copy. Nothing is copied when this
///   is zero or negative.
/// - parameter destination: The buffer to write into; asserted to be non-NULL.
void bytes_copy(char const *const source, int const length, char *const destination) {
    assert( source != NULL );
    assert( destination != NULL );
    char const *from = source;
    char *into = destination;
    for (int remaining = length; remaining > 0; remaining -= 1) {
        *into++ = *from++;
    }
}

/// Write the eight bits of a byte as the characters '0' and '1'.
///
/// The most significant bit is written first. Exactly eight characters are
/// written and no null-terminator is appended, so a caller that wants a C
/// string must provide a ninth byte and terminate it itself.
/// - parameter string: A buffer with room for at least eight bytes; asserted
///   to be non-NULL.
/// - parameter byte: The byte to represent as a string.
void string_print_byte(char *const string, char byte) {
    assert( string != NULL );
    // Work on the unsigned value so a negative `char` cannot smear sign bits
    // into the result when it is shifted.
    unsigned char const bits = (unsigned char)byte;
    for (int position = 0; position < 8; position += 1) {
        // Shift the wanted bit down to bit 0, then map 0/1 to '0'/'1'.
        string[position] = (char)('0' + ((bits >> (7 - position)) & 0x01));
    }
}

/// Classify the first byte of a UTF-8 sequence by the length it announces.
///
/// Only the first byte is inspected; the continuation bytes that should
/// follow a multi-byte lead are not checked here.
/// - parameter string: A UTF-8 encoded character array; asserted to be non-NULL.
/// - returns:
///     * 0 when the first byte is a continuation byte (0x80-0xBF).
///     * 1 for a stand-alone byte (0x00-0x7F, including the null-terminator).
///     * 2, 3, or 4 for a valid multi-byte lead byte.
///     * -1 when the first byte can never appear in UTF-8
///       (0xC0, 0xC1, and 0xF5-0xFF).
int utf8_string_next_codepoint_length(char const *const string) {
    assert( string != NULL );
    // Compare as unsigned so the ranges read naturally even where plain
    // `char` is signed.
    unsigned char const lead = (unsigned char)string[0];
    if (lead < 0x80) { return 1; }  // 0....... stand-alone codepoint
    if (lead < 0xC0) { return 0; }  // 10...... continuation byte
    if (lead < 0xC2) { return -1; } // 0xC0/0xC1 could only start an overlong encoding
    if (lead < 0xE0) { return 2; }  // 110..... two-byte codepoint initial byte
    if (lead < 0xF0) { return 3; }  // 1110.... three-byte codepoint initial byte
    if (lead < 0xF5) { return 4; }  // 11110... four-byte codepoint initial byte
    return -1;                      // 0xF5-0xFF would encode beyond U+10FFFF
}

/// Count the codepoints within a UTF-8 encoded string.
///
/// Every codepoint must begin with a lead byte and be followed by the number
/// of continuation bytes that lead byte announces. Overlong encodings and
/// surrogate codepoints are only rejected as far as
/// utf8_string_next_codepoint_length rejects their lead byte.
/// - parameter string: A null-terminated byte string.
/// - returns: The number of codepoints found before the null-terminator, or
///            -1 when the string starts a codepoint on a continuation or
///            invalid byte, or when a codepoint is cut short.
int utf8_string_codepoint_length(char const *const string) {
    int const raw_length = string_length(string);
    int length = 0;
    for (int i = 0; i < raw_length; length += 1) {
        int const char_len = utf8_string_next_codepoint_length(string + i);
        if (char_len <= 0) { return -1; }
        // Each trailing byte must be a continuation byte. The null-terminator
        // is not one, so a sequence cut short by the end of the string is
        // rejected here before anything past the terminator is read.
        for (int k = 1; k < char_len; k += 1) {
            if (utf8_string_next_codepoint_length(string + i + k) != 0) { return -1; }
        }
        i += char_len;
    }
    return length;
}

/// Reverse in-place the codepoints of a UTF-8 encoded string.
///
/// Works in two passes: the bytes of each codepoint are reversed on their
/// own, then the whole string is reversed. The second pass puts the
/// codepoints in reverse order and restores the byte order inside each one.
/// - parameter string: A valid UTF-8 encoded null-terminated string. Validity
///   is only enforced through assertions: each codepoint must start with a
///   lead byte and must fit before the null-terminator.
void utf8_string_codepoint_reverse(char *const string) {
    int const length = string_length(string);
    // Nothing to do for an empty string; returning here also avoids asking
    // bytes_reverse for a zero-length reversal.
    if (length == 0) { return; }
    for (int i = 0; i < length; ) {
        int const char_len = utf8_string_next_codepoint_length(string + i);
        assert( char_len > 0 && char_len < 5 );
        // A truncated final codepoint would make the reversal run over the
        // null-terminator and past the end of the string.
        assert( char_len <= length - i );
        bytes_reverse(string + i, char_len);
        i += char_len;
    }
    bytes_reverse(string, length);
}
	#include <stdio.h>
	#include <assert.h>

	/// Count the number of bytes contained within a null-terminated string.
	/// - parameter string: A null-terminated byte string.
	/// - returns: The number of bytes before the null-terminator in the given string.
	int string_length(char const *const string);

	/// Reverse an array of bytes in-place.
	/// - parameter bytes: A buffer of bytes to be reversed.
	/// - parameter length: The number of bytes contained within the buffer that should be reversed.
	void bytes_reverse(char *const bytes, int const length);

	/// Copy an array of bytes into another byte array.
	/// - parameter source: An array of bytes that should be copied.
	/// - parameter length: The number of bytes to copy.
	/// - parameter destination: A byte buffer to copy the bytes into.
	void bytes_copy(char const *const source, int const length, char *const destination);

	/// Create a string representation of the eight bits in a byte.
	/// - parameter string: The byte buffer into which the string representation should be written.
	/// - parameter byte: The byte to represent as a string.
	void string_print_byte(char *const string, char const byte);

	/// Given a UTF-8 encoded string, determine the number of bytes used to represent the first codepoint.
	/// - parameter string: A UTF-8 encoded character array.
	/// - returns:
	///     * 0 when the first byte is a continuation byte.
	///     * 1, 2, 3, or 4 for any valid UTF-8 codepoint.
	///     * -1 when the first byte is invalid in UTF-8.
	int utf8_string_next_codepoint_length(char const *const string);

	/// Count the number of codepoints within a given UTF-8 encoded string.
	/// - parameter string: A valid UTF-8 encoded null-terminated byte string.
	/// - returns: The number of codepoints found before the null-terminator.
	///            An invalid UTF-8 string will return a length of -1.
	int utf8_string_codepoint_length(char const *const string);

	/// Reverse in-place the codepoints of a UTF-8 encoded string.
	/// - parameter string: A valid UTF-8 encoded null-terminated string.
	void utf8_string_codepoint_reverse(char *const string);

	/// Demonstrate codepoint-wise reversal of a UTF-8 encoded string.
	///
	/// Overwrites the dash placeholders of a sentence with multi-byte UTF-8
	/// sequences, prints the sentence, reverses its codepoints in-place, and
	/// prints it again.
	/// - parameter argc: Unused.
	/// - parameter argv: Unused.
	/// - returns: 0 once both lines have been printed.
	int main(int argc, char **argv) {
	    // The demo takes no command-line input.
	    (void)argc;
	    (void)argv;

	    // Raw UTF-8 encodings of the codepoints spliced into the sentence.
	    char const cent[] = "\xc2\xa2";            // U+00A2, two bytes
	    char const euro[] = "\xe2\x82\xac";        // U+20AC, three bytes
	    char const clock[] = "\xe2\x8f\xb0";       // U+23F0, three bytes
	    char const circle[] = "\xf0\x90\x8d\x88";  // U+10348, four bytes

	    // Each run of dashes is exactly as wide as the encoding written over it.
	    // sizeof(...) - 1 is the encoded length without the array's terminator.
	    char sentence[] = "What --- is the ---- turning the --- into --?";
	    bytes_copy(clock, (int)sizeof(clock) - 1, sentence + 5);
	    bytes_copy(circle, (int)sizeof(circle) - 1, sentence + 16);
	    bytes_copy(euro, (int)sizeof(euro) - 1, sentence + 33);
	    bytes_copy(cent, (int)sizeof(cent) - 1, sentence + 42);

	    printf("%s\n", sentence);
	    utf8_string_codepoint_reverse(sentence);
	    printf("%s\n", sentence);
	    return 0;
	}

	/// Measure a null-terminated string by walking a cursor to its terminator.
	/// - parameter string: A null-terminated byte string; asserted to be non-NULL.
	/// - returns: The number of bytes that precede the null-terminator.
	int string_length(char const *const string) {
	    assert( string != NULL );
	    char const *cursor = string;
	    while (*cursor != '\0') {
	        cursor += 1;
	    }
	    // The distance travelled is the byte count.
	    return (int)(cursor - string);
	}

	/// Reverse an array of bytes in-place.
	/// - parameter bytes: A buffer of bytes to be reversed; asserted to be non-NULL.
	/// - parameter length: The number of bytes to reverse. Asserted to be
	///   non-negative; a length of 0 or 1 leaves the buffer untouched.
	void bytes_reverse(char *const bytes, int const length) {
	    assert( bytes != NULL );
	    // Zero is a legitimate length (e.g. an empty string); only negative
	    // lengths are a caller error.
	    assert( length >= 0 );
	    // Swap the outermost pair and walk inwards until the indices meet.
	    for (int start = 0, end = length - 1; start < end; start += 1, end -= 1) {
	        char const swapped = bytes[end];
	        bytes[end] = bytes[start];
	        bytes[start] = swapped;
	    }
	}

	/// Copy an array of bytes into another byte array, lowest address first.
	/// - parameter source: The bytes to read; asserted to be non-NULL.
	/// - parameter length: How many bytes to copy. Nothing is copied when this
	///   is zero or negative.
	/// - parameter destination: The buffer to write into; asserted to be non-NULL.
	void bytes_copy(char const *const source, int const length, char *const destination) {
	    // Both parameters must be pointers: the body compares them against
	    // NULL and indexes through them.
	    assert( source != NULL );
	    assert( destination != NULL );
	    for (int i = 0; i < length; i += 1) {
	        destination[i] = source[i];
	    }
	}

	/// Write the eight bits of a byte as the characters '0' and '1'.
	///
	/// The most significant bit is written first. Exactly eight characters are
	/// written and no null-terminator is appended, so a caller that wants a C
	/// string must provide a ninth byte and terminate it itself.
	/// - parameter string: A buffer with room for at least eight bytes; asserted
	///   to be non-NULL.
	/// - parameter byte: The byte to represent as a string.
	void string_print_byte(char *const string, char byte) {
	    assert( string != NULL );
	    // Work on the unsigned value so a negative `char` cannot smear sign bits
	    // into the result when it is shifted.
	    unsigned char const bits = (unsigned char)byte;
	    for (int position = 0; position < 8; position += 1) {
	        // Shift the wanted bit down to bit 0, then map 0/1 to '0'/'1'.
	        string[position] = (char)('0' + ((bits >> (7 - position)) & 0x01));
	    }
	}

	/// Classify the first byte of a UTF-8 sequence by the length it announces.
	///
	/// Only the first byte is inspected; the continuation bytes that should
	/// follow a multi-byte lead are not checked here.
	/// - parameter string: A UTF-8 encoded character array; asserted to be non-NULL.
	/// - returns:
	///     * 0 when the first byte is a continuation byte (0x80-0xBF).
	///     * 1 for a stand-alone byte (0x00-0x7F, including the null-terminator).
	///     * 2, 3, or 4 for a valid multi-byte lead byte.
	///     * -1 when the first byte can never appear in UTF-8
	///       (0xC0, 0xC1, and 0xF5-0xFF).
	int utf8_string_next_codepoint_length(char const *const string) {
	    assert( string != NULL );
	    // Compare as unsigned so the ranges read naturally even where plain
	    // `char` is signed.
	    unsigned char const lead = (unsigned char)string[0];
	    if (lead < 0x80) { return 1; }  // 0....... stand-alone codepoint
	    if (lead < 0xC0) { return 0; }  // 10...... continuation byte
	    if (lead < 0xC2) { return -1; } // 0xC0/0xC1 could only start an overlong encoding
	    if (lead < 0xE0) { return 2; }  // 110..... two-byte codepoint initial byte
	    if (lead < 0xF0) { return 3; }  // 1110.... three-byte codepoint initial byte
	    if (lead < 0xF5) { return 4; }  // 11110... four-byte codepoint initial byte
	    return -1;                      // 0xF5-0xFF would encode beyond U+10FFFF
	}

	/// Count the codepoints within a UTF-8 encoded string.
	///
	/// Every codepoint must begin with a lead byte and be followed by the number
	/// of continuation bytes that lead byte announces. Overlong encodings and
	/// surrogate codepoints are only rejected as far as
	/// utf8_string_next_codepoint_length rejects their lead byte.
	/// - parameter string: A null-terminated byte string.
	/// - returns: The number of codepoints found before the null-terminator, or
	///            -1 when the string starts a codepoint on a continuation or
	///            invalid byte, or when a codepoint is cut short.
	int utf8_string_codepoint_length(char const *const string) {
	    int const raw_length = string_length(string);
	    int length = 0;
	    for (int i = 0; i < raw_length; length += 1) {
	        int const char_len = utf8_string_next_codepoint_length(string + i);
	        if (char_len <= 0) { return -1; }
	        // Each trailing byte must be a continuation byte. The null-terminator
	        // is not one, so a sequence cut short by the end of the string is
	        // rejected here before anything past the terminator is read.
	        for (int k = 1; k < char_len; k += 1) {
	            if (utf8_string_next_codepoint_length(string + i + k) != 0) { return -1; }
	        }
	        i += char_len;
	    }
	    return length;
	}

	/// Reverse in-place the codepoints of a UTF-8 encoded string.
	///
	/// Works in two passes: the bytes of each codepoint are reversed on their
	/// own, then the whole string is reversed. The second pass puts the
	/// codepoints in reverse order and restores the byte order inside each one.
	/// - parameter string: A valid UTF-8 encoded null-terminated string. Validity
	///   is only enforced through assertions: each codepoint must start with a
	///   lead byte and must fit before the null-terminator.
	void utf8_string_codepoint_reverse(char *const string) {
	    int const length = string_length(string);
	    // Nothing to do for an empty string; returning here also avoids asking
	    // bytes_reverse for a zero-length reversal.
	    if (length == 0) { return; }
	    for (int i = 0; i < length; ) {
	        int const char_len = utf8_string_next_codepoint_length(string + i);
	        assert( char_len > 0 && char_len < 5 );
	        // A truncated final codepoint would make the reversal run over the
	        // null-terminator and past the end of the string.
	        assert( char_len <= length - i );
	        bytes_reverse(string + i, char_len);
	        i += char_len;
	    }
	    bytes_reverse(string, length);
	}