Skip to content

Instantly share code, notes, and snippets.

@stevebrun
Created February 1, 2018 02:13
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save stevebrun/e341760c0a7d5cf4e113b136277e5ad4 to your computer and use it in GitHub Desktop.
Save stevebrun/e341760c0a7d5cf4e113b136277e5ad4 to your computer and use it in GitHub Desktop.
The proper way to reverse the code points in a UTF-8 encoded string.
#include <stdio.h>
#include <assert.h>
/// Count the amount of bytes contained within a null-terminated string.
/// - returns: The number of bytes before the null-terminator in the given string.
int string_length(char const *const string);
/// Reverse an array of bytes in-place.
/// - parameter bytes: A buffer of bytes to be reversed.
/// - parameter length: The amount of bytes contained within the buffer that should be reversed.
void bytes_reverse(char *const bytes, int const length);
/// Copy an array of bytes into another byte array.
/// - parameter source: An array of bytes that should be copied.
/// - parameter length: The amount of bytes to copy.
/// - parameter destination: A byte buffer to copy the bytes into.
void bytes_copy(char const *const source, int const length, char *const destintaion);
/// Create a string representation of the eight bits in a byte.
/// - parameter string: The byte buffer into which the string representation should be written.
/// - parameter byte: The byte to represent as a string.
void string_print_byte(char *const string, char const byte);
/// Given a UTF-8 encoded string, determine the amount of bytes used to represent the first codepoint.
/// - parameter string: A UTF-8 encoded character array.
/// - returns:
/// * 0 when the first byte is a continuation byte.
/// * 1, 2, 3, or 4 for any valid UTF-8 codepoint.
/// * -1 when the first byte is invalid in UTF-8.
int utf8_string_next_codepoint_length(char const *const string);
/// Count the amount of codepoints within a given UTF-8 encoded string.
/// - parameter string: A valid UTF-8 encoded null-terminated byte string.
/// - returns: The amount of codepoints found before the null-terminator.
/// An invalid UTF-8 string will return a length of -1.
int utf8_string_codepoint_length(char const *const string);
/// Reverse in-place the codepoints of a UTF-8 encoded string.
/// - parameter string: A valid UTF-8 encoded string.
void utf8_string_codepoint_reverse(char *const string);
/// Demo entry point: splice four multi-byte UTF-8 codepoints over the dash
/// placeholders of an ASCII template sentence, print the sentence, reverse its
/// codepoints in place, and print it again.
/// - parameter argc: Unused.
/// - parameter argv: Unused.
/// - returns: 0 once both lines have been printed.
int main(int argc, char **argv) {
    (void)argc;
    (void)argv;

    char const cent[]   = "\xc2\xa2";          // U+00A2, two bytes
    char const euro[]   = "\xe2\x82\xac";      // U+20AC, three bytes
    char const clock[]  = "\xe2\x8f\xb0";      // U+23F0, three bytes
    char const circle[] = "\xf0\x90\x8d\x88";  // U+10348, four bytes

    // Each run of dashes is exactly as long as the encoded codepoint that
    // replaces it, so the sentence keeps its length and its terminator.
    char sentence[] = "What --- is the ---- turning the --- into --?";

    // `sizeof array - 1` is the encoded length without the null-terminator;
    // deriving it from the array keeps the length and the literal in sync.
    bytes_copy(clock,  (int)(sizeof clock  - 1), sentence + 5);
    bytes_copy(circle, (int)(sizeof circle - 1), sentence + 16);
    bytes_copy(euro,   (int)(sizeof euro   - 1), sentence + 33);
    bytes_copy(cent,   (int)(sizeof cent   - 1), sentence + 42);

    printf("%s\n", sentence);
    utf8_string_codepoint_reverse(sentence);
    printf("%s\n", sentence);
    return 0;
}
/// Measure a null-terminated byte string.
/// - parameter string: The string to measure; must not be NULL (checked by assert).
/// - returns: The number of bytes that precede the first '\0'.
int string_length(char const *const string) {
    assert( string != NULL );
    char const *cursor = string;
    // Walk forward until the terminator; the distance travelled is the length.
    while (*cursor != '\0') {
        cursor += 1;
    }
    return (int)(cursor - string);
}
/// Reverse an array of bytes in-place.
/// - parameter bytes: The buffer to reverse; must not be NULL (checked by assert).
/// - parameter length: How many bytes of the buffer to reverse; must not be
///   negative (checked by assert). A length of 0 or 1 leaves the buffer unchanged.
void bytes_reverse(char *const bytes, int const length) {
    assert( bytes != NULL );
    // Zero is a legitimate length: reversing an empty buffer (for example an
    // empty string) is a no-op rather than a programming error.
    assert( length >= 0 );
    // Swap the outermost pair and move inwards until the indices meet.
    for (int start = 0, end = length - 1; start < end; start += 1, end -= 1) {
        char const byte = bytes[end];
        bytes[end] = bytes[start];
        bytes[start] = byte;
    }
}
/// Copy bytes from one buffer into another, front to back.
/// - parameter source: The bytes to read; must not be NULL (checked by assert).
/// - parameter length: How many bytes to copy; nothing is copied when it is zero or negative.
/// - parameter destination: The buffer to write into; must not be NULL (checked by assert).
void bytes_copy(char const *const source, int const length, char *const destination) {
    assert( source != NULL );
    assert( destination != NULL );
    char const *from = source;
    char *to = destination;
    // Count down the bytes still to be copied while both cursors advance.
    for (int remaining = length; remaining > 0; remaining -= 1) {
        *to = *from;
        to += 1;
        from += 1;
    }
}
/// Write the eight bits of a byte as the ASCII characters '0' and '1',
/// most-significant bit first.
/// - parameter string: The buffer to write into; must not be NULL (checked by
///   assert) and must have room for eight bytes. Exactly eight bytes are
///   written and no null-terminator is appended.
/// - parameter byte: The byte whose bits are rendered.
void string_print_byte(char *const string, char byte) {
    assert( string != NULL );
    // Work on an unsigned copy so that the result does not depend on whether
    // plain `char` is signed on this platform.
    unsigned char const bits = (unsigned char)byte;
    for (int i = 0; i < 8; i += 1) {
        // Single-bit mask, starting at bit 7 and moving towards bit 0.
        unsigned const mask = 0x80u >> i;
        string[i] = (bits & mask) != 0 ? '1' : '0';
    }
}
/// Classify the first byte of a UTF-8 encoded string and report how many bytes
/// the codepoint it starts occupies. Only the first byte is inspected.
/// - parameter string: A UTF-8 encoded character array; must not be NULL
///   (checked by assert).
/// - returns:
///   * 1 for an ASCII byte (0x00-0x7F), which includes the null-terminator.
///   * 0 for a continuation byte (0x80-0xBF).
///   * 2, 3, or 4 for a valid lead byte (0xC2-0xDF, 0xE0-0xEF, 0xF0-0xF4).
///   * -1 for a byte that can never appear in UTF-8 (0xC0, 0xC1, 0xF5-0xFF).
int utf8_string_next_codepoint_length(char const *const string) {
    assert( string != NULL );
    // Compare as unsigned so that range checks work when `char` is signed.
    unsigned char const lead = (unsigned char)string[0];
    if (lead < 0x80) { return 1; }   // 0xxxxxxx stand-alone codepoint
    if (lead < 0xC0) { return 0; }   // 10xxxxxx continuation byte
    if (lead < 0xC2) { return -1; }  // 0xC0/0xC1 could only start an overlong encoding
    if (lead < 0xE0) { return 2; }   // 110xxxxx two-byte codepoint initial byte
    if (lead < 0xF0) { return 3; }   // 1110xxxx three-byte codepoint initial byte
    if (lead < 0xF5) { return 4; }   // 0xF0-0xF4 four-byte codepoint initial byte
    return -1;                       // 0xF5-0xFF would encode beyond U+10FFFF or is not a lead byte
}
/// Count the codepoints in a null-terminated UTF-8 encoded string.
/// - parameter string: The string to measure; must not be NULL (checked by assert).
/// - returns: The number of codepoints found before the null-terminator, or -1
///   when the string is structurally invalid: a byte that should start a
///   codepoint is not a lead byte, or a lead byte is not followed by the number
///   of continuation bytes it announces (which covers a sequence cut short by
///   the terminator). Overlong three- and four-byte forms and surrogate
///   codepoints are not detected.
int utf8_string_codepoint_length(char const *const string) {
    assert( string != NULL );
    int count = 0;
    for (int i = 0; string[i] != '\0'; count += 1) {
        int const char_len = utf8_string_next_codepoint_length(string + i);
        if (char_len <= 0) { return -1; }
        // Every byte after the lead must be a continuation byte. The
        // terminator classifies as a stand-alone byte, so a truncated
        // sequence is rejected here before anything past it is read.
        for (int k = 1; k < char_len; k += 1) {
            if (utf8_string_next_codepoint_length(string + i + k) != 0) { return -1; }
        }
        i += char_len;
    }
    return count;
}
/// Reverse in-place the order of the codepoints of a null-terminated UTF-8
/// encoded string, keeping the bytes of each codepoint in their original order.
/// - parameter string: A valid UTF-8 encoded string; must not be NULL (checked
///   by assert). An empty string is left as it is. A string in which a
///   codepoint does not start with a lead byte, or in which the last codepoint
///   is cut short by the terminator, trips an assert; when asserts are compiled
///   out the string is left unmodified instead.
void utf8_string_codepoint_reverse(char *const string) {
    assert( string != NULL );
    int const length = string_length(string);
    if (length == 0) { return; }

    // Pass 1: check every codepoint before touching any byte, so that a
    // malformed string is never half-reversed and no reversal can reach past
    // the terminator.
    for (int i = 0; i < length; ) {
        int const char_len = utf8_string_next_codepoint_length(string + i);
        int const fits = char_len > 0 && char_len <= length - i;
        assert( fits );
        if (!fits) { return; }
        i += char_len;
    }

    // Pass 2: reverse the bytes of each codepoint individually ...
    for (int i = 0; i < length; ) {
        // The length is read from the lead byte before that codepoint is reversed.
        int const char_len = utf8_string_next_codepoint_length(string + i);
        bytes_reverse(string + i, char_len);
        i += char_len;
    }
    // ... so that reversing the whole buffer restores each codepoint's byte
    // order while reversing the order of the codepoints themselves.
    bytes_reverse(string, length);
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment