tommai78101/unicode.c

## unicode.c
#include <stdio.h>
#include <stdlib.h>
#include <uchar.h>
#include <locale.h>

#define __STD_UTF_16__

//Converts a UTF-8 byte string into UTF-16 code units.
//
//Parameters:
//  utf8_str               Input bytes. Conversion stops at the first NUL byte or after utf8_str_size bytes,
//                         whichever comes first, so the input does not have to be NUL-terminated.
//  utf8_str_size          Number of readable bytes in utf8_str.
//  utf16_str_output       Destination buffer for the UTF-16 code units.
//  utf16_str_output_size  Capacity of utf16_str_output, counted in char16_t elements (not bytes).
//
//Behaviour:
//  - At most utf16_str_output_size code units are written; every slot that is not used by the conversion
//    is set to 0. The result is therefore NUL-terminated only when at least one slot is left over, so
//    callers who need a terminator must size the buffer with one extra element.
//  - Code points above U+FFFF are written as a surrogate pair, high surrogate first. A pair is never split:
//    if only one slot remains, conversion stops there.
//  - Malformed input (stray continuation bytes, invalid lead bytes, truncated sequences, overlong forms,
//    encoded surrogates, values above U+10FFFF) is skipped one byte at a time and produces no output.
void utf8_to_utf16(unsigned char* const utf8_str, int utf8_str_size, char16_t* utf16_str_output, int utf16_str_output_size) {
	int utf8_str_iterator = 0;
	int utf16_str_iterator = 0;

	//Without a destination there is nothing we can safely do.
	if (!utf16_str_output || utf16_str_output_size <= 0) {
		return;
	}

	//A missing input is treated as an empty string, so the output still gets zero-filled below.
	int utf8_limit = utf8_str ? utf8_str_size : 0;

	//Both bounds must hold at the same time, and they are checked BEFORE the input byte is read,
	//so neither buffer is ever accessed outside of its stated size.
	while (utf8_str_iterator < utf8_limit && utf16_str_iterator < utf16_str_output_size && utf8_str[utf8_str_iterator] != 0) {
		unsigned char leadByte = utf8_str[utf8_str_iterator];
		int sequenceLength;
		char32_t codePoint;
		char32_t minimumCodePoint;

		if (leadByte < 0x80) {
			//0..127, the ASCII range. The value maps directly to one UTF-16 code unit.
			utf16_str_output[utf16_str_iterator++] = (char16_t) leadByte;
			utf8_str_iterator++;
			continue;
		}
		else if (leadByte < 0xC0) {
			//0x80..0xBF are continuation bytes. On their own they carry no meaning, so we ignore them.
			utf8_str_iterator++;
			continue;
		}
		else if (leadByte < 0xE0) {
			//110xxxxx 10xxxxxx, encodes U+0080..U+07FF.
			sequenceLength = 2;
			codePoint = (char32_t) (leadByte & 0x1F);
			minimumCodePoint = 0x80;
		}
		else if (leadByte < 0xF0) {
			//1110xxxx 10xxxxxx 10xxxxxx, encodes U+0800..U+FFFF.
			sequenceLength = 3;
			codePoint = (char32_t) (leadByte & 0x0F);
			minimumCodePoint = 0x800;
		}
		else if (leadByte < 0xF8) {
			//11110xxx 10xxxxxx 10xxxxxx 10xxxxxx, encodes U+10000..U+10FFFF.
			sequenceLength = 4;
			codePoint = (char32_t) (leadByte & 0x07);
			minimumCodePoint = 0x10000;
		}
		else {
			//0xF8..0xFF never appear in valid UTF-8, we ignore.
			utf8_str_iterator++;
			continue;
		}

		//Every continuation byte contributes 6 bits of payload, which is why we shift by 6 and not by 8.
		//The whole sequence has to be inside the input, and each trailing byte must look like 10xxxxxx.
		int isValid = (utf8_limit - utf8_str_iterator >= sequenceLength);
		for (int offset = 1; isValid && offset < sequenceLength; offset++) {
			unsigned char continuationByte = utf8_str[utf8_str_iterator + offset];
			if ((continuationByte & 0xC0) != 0x80) {
				isValid = 0;
			}
			else {
				codePoint = (codePoint << 6) | (char32_t) (continuationByte & 0x3F);
			}
		}

		//Reject overlong forms, values beyond the Unicode range, and the surrogate block [D800..DFFF],
		//none of which may be represented as a scalar value in UTF-16.
		if (isValid && (codePoint < minimumCodePoint || codePoint > 0x10FFFF || (codePoint >= 0xD800 && codePoint <= 0xDFFF))) {
			isValid = 0;
		}

		if (!isValid) {
			//Drop only the lead byte. Whatever follows is examined again on the next pass,
			//so a valid character right after a broken sequence is not swallowed.
			utf8_str_iterator++;
			continue;
		}

		if (codePoint < 0x10000) {
			//Basic Multilingual Plane, one code unit.
			utf16_str_output[utf16_str_iterator++] = (char16_t) codePoint;
		}
		else {
			//Supplementary planes need two code units. Writing only half of the pair would leave
			//an unpaired surrogate in the output, so we stop if both do not fit.
			if (utf16_str_output_size - utf16_str_iterator < 2) {
				break;
			}
			codePoint -= 0x10000;
			utf16_str_output[utf16_str_iterator++] = (char16_t) (0xD800 + (codePoint >> 10));
			utf16_str_output[utf16_str_iterator++] = (char16_t) (0xDC00 + (codePoint & 0x3FF));
		}

		utf8_str_iterator += sequenceLength;
	}

	//We clean up the output string if the UTF-16 iterator is still less than the output string size.
	while (utf16_str_iterator < utf16_str_output_size) {
		utf16_str_output[utf16_str_iterator++] = 0;
	}
}

//Demo entry point: converts a small UTF-8 sample into UTF-16 so the result can be inspected in a debugger.
int main() {
	//Sample text stored as raw UTF-8 bytes, including the terminating NUL.
	unsigned char source[] = u8"我是誰?";
	char16_t converted[25];

	//Element counts are computed here because an array decays to a pointer once it is passed to a function.
	int source_size = (int) (sizeof(source) / sizeof(source[0]));
	int converted_size = (int) (sizeof(converted) / sizeof(converted[0]));

	utf8_to_utf16(source, source_size, converted, converted_size);

	//Set debug breakpoint on the return statement, and inspect "converted" variable.
	return 0;
}
	#include <stdio.h>
	#include <stdlib.h>
	#include <uchar.h>
	#include <locale.h>

	#define __STD_UTF_16__

	//Converts a UTF-8 byte string into UTF-16 code units.
	//
	//Parameters:
	//  utf8_str               Input bytes. Conversion stops at the first NUL byte or after utf8_str_size bytes,
	//                         whichever comes first, so the input does not have to be NUL-terminated.
	//  utf8_str_size          Number of readable bytes in utf8_str.
	//  utf16_str_output       Destination buffer for the UTF-16 code units.
	//  utf16_str_output_size  Capacity of utf16_str_output, counted in char16_t elements (not bytes).
	//
	//Behaviour:
	//  - At most utf16_str_output_size code units are written; every slot that is not used by the conversion
	//    is set to 0. The result is therefore NUL-terminated only when at least one slot is left over, so
	//    callers who need a terminator must size the buffer with one extra element.
	//  - Code points above U+FFFF are written as a surrogate pair, high surrogate first. A pair is never split:
	//    if only one slot remains, conversion stops there.
	//  - Malformed input (stray continuation bytes, invalid lead bytes, truncated sequences, overlong forms,
	//    encoded surrogates, values above U+10FFFF) is skipped one byte at a time and produces no output.
	void utf8_to_utf16(unsigned char* const utf8_str, int utf8_str_size, char16_t* utf16_str_output, int utf16_str_output_size) {
		int utf8_str_iterator = 0;
		int utf16_str_iterator = 0;

		//Without a destination there is nothing we can safely do.
		if (!utf16_str_output || utf16_str_output_size <= 0) {
			return;
		}

		//A missing input is treated as an empty string, so the output still gets zero-filled below.
		int utf8_limit = utf8_str ? utf8_str_size : 0;

		//Both bounds must hold at the same time, and they are checked BEFORE the input byte is read,
		//so neither buffer is ever accessed outside of its stated size.
		while (utf8_str_iterator < utf8_limit && utf16_str_iterator < utf16_str_output_size && utf8_str[utf8_str_iterator] != 0) {
			unsigned char leadByte = utf8_str[utf8_str_iterator];
			int sequenceLength;
			char32_t codePoint;
			char32_t minimumCodePoint;

			if (leadByte < 0x80) {
				//0..127, the ASCII range. The value maps directly to one UTF-16 code unit.
				utf16_str_output[utf16_str_iterator++] = (char16_t) leadByte;
				utf8_str_iterator++;
				continue;
			}
			else if (leadByte < 0xC0) {
				//0x80..0xBF are continuation bytes. On their own they carry no meaning, so we ignore them.
				utf8_str_iterator++;
				continue;
			}
			else if (leadByte < 0xE0) {
				//110xxxxx 10xxxxxx, encodes U+0080..U+07FF.
				sequenceLength = 2;
				codePoint = (char32_t) (leadByte & 0x1F);
				minimumCodePoint = 0x80;
			}
			else if (leadByte < 0xF0) {
				//1110xxxx 10xxxxxx 10xxxxxx, encodes U+0800..U+FFFF.
				sequenceLength = 3;
				codePoint = (char32_t) (leadByte & 0x0F);
				minimumCodePoint = 0x800;
			}
			else if (leadByte < 0xF8) {
				//11110xxx 10xxxxxx 10xxxxxx 10xxxxxx, encodes U+10000..U+10FFFF.
				sequenceLength = 4;
				codePoint = (char32_t) (leadByte & 0x07);
				minimumCodePoint = 0x10000;
			}
			else {
				//0xF8..0xFF never appear in valid UTF-8, we ignore.
				utf8_str_iterator++;
				continue;
			}

			//Every continuation byte contributes 6 bits of payload, which is why we shift by 6 and not by 8.
			//The whole sequence has to be inside the input, and each trailing byte must look like 10xxxxxx.
			int isValid = (utf8_limit - utf8_str_iterator >= sequenceLength);
			for (int offset = 1; isValid && offset < sequenceLength; offset++) {
				unsigned char continuationByte = utf8_str[utf8_str_iterator + offset];
				if ((continuationByte & 0xC0) != 0x80) {
					isValid = 0;
				}
				else {
					codePoint = (codePoint << 6) | (char32_t) (continuationByte & 0x3F);
				}
			}

			//Reject overlong forms, values beyond the Unicode range, and the surrogate block [D800..DFFF],
			//none of which may be represented as a scalar value in UTF-16.
			if (isValid && (codePoint < minimumCodePoint || codePoint > 0x10FFFF || (codePoint >= 0xD800 && codePoint <= 0xDFFF))) {
				isValid = 0;
			}

			if (!isValid) {
				//Drop only the lead byte. Whatever follows is examined again on the next pass,
				//so a valid character right after a broken sequence is not swallowed.
				utf8_str_iterator++;
				continue;
			}

			if (codePoint < 0x10000) {
				//Basic Multilingual Plane, one code unit.
				utf16_str_output[utf16_str_iterator++] = (char16_t) codePoint;
			}
			else {
				//Supplementary planes need two code units. Writing only half of the pair would leave
				//an unpaired surrogate in the output, so we stop if both do not fit.
				if (utf16_str_output_size - utf16_str_iterator < 2) {
					break;
				}
				codePoint -= 0x10000;
				utf16_str_output[utf16_str_iterator++] = (char16_t) (0xD800 + (codePoint >> 10));
				utf16_str_output[utf16_str_iterator++] = (char16_t) (0xDC00 + (codePoint & 0x3FF));
			}

			utf8_str_iterator += sequenceLength;
		}

		//We clean up the output string if the UTF-16 iterator is still less than the output string size.
		while (utf16_str_iterator < utf16_str_output_size) {
			utf16_str_output[utf16_str_iterator++] = 0;
		}
	}

	//Demo entry point: converts a small UTF-8 sample into UTF-16 so the result can be inspected in a debugger.
	int main() {
		//Sample text stored as raw UTF-8 bytes, including the terminating NUL.
		unsigned char source[] = u8"我是誰?";
		char16_t converted[25];

		//Element counts are computed here because an array decays to a pointer once it is passed to a function.
		int source_size = (int) (sizeof(source) / sizeof(source[0]));
		int converted_size = (int) (sizeof(converted) / sizeof(converted[0]));

		utf8_to_utf16(source, source_size, converted, converted_size);

		//Set debug breakpoint on the return statement, and inspect "converted" variable.
		return 0;
	}