nullhook/chars.cc

## chars.cc
#include <iostream>
#include <locale>
#include <string>
#include <fstream>
#include <codecvt>

// utf8/utf16/utf32 can be directly written to file without conversions
// sizeof(T) gives you bytes of the type
// .size() / .length() give the count of code units (char, char16_t, ...), not the count of user-visible characters
// if char16_t is stored, then open the file with utf16 encoding
// utf8 is slowly becoming the standard; MacRoman was apple's default but now it's utf8
// 'locale' can give you the system's default language, currency, and date settings
// you need to know the encoding prior to decoding
// you can convert a char16_t to machine's default charset by using std::locale
// utf16 uses fewer bytes than utf8 for some text (e.g. most CJK: 2 vs 3 bytes) but more for ascii (2 vs 1 byte)
// endianness is byte ordering; little-endian means the least significant byte is stored first
// utf8 is compatible with basic ascii: those characters are a single byte long and the most significant bit is always 0
// basic ascii is only 0-127, but utf8 can encode every unicode code point up to U+10FFFF (about 1.1 million; fits in 21 bits)
// compiler: to calculate the byte length, or copy a utf8 string, it doesn't need to know about utf8
// compiler: to calculate the number of code points, or to split a string correctly, it does need to know about utf8
// splitting a string is the better example here. If you're interpreting it as ascii, but it actually has multi-byte utf8 characters in it, you can split in the middle of a code point by accident and produce two invalid or incorrect utf8 strings

// Demo: shows how the text "ßx" is represented in UTF-16 and in UTF-8.
// Prints code-unit sizes, string lengths and the raw code units to stdout, then
// writes the UTF-8 bytes converted from the UTF-16 string to "from_utf16.txt".
// Returns 0 on success, 1 if the output file cannot be opened or written.
int main() {
  std::u16string u16str = u"ßx";

  // convert u16 to u8. you can imbue it also!
  // NOTE: std::wstring_convert and <codecvt> are deprecated since C++17; kept because
  // the standard library offers no direct replacement.
  std::string u8conv = std::wstring_convert<std::codecvt_utf8_utf16<char16_t>, char16_t>{}.to_bytes(u16str);

  std::string u8str = u8"ßx";

  std::cout << "\n";

  std::cout << "u16 Type size: " << sizeof(char16_t) << "\n";
  std::cout << "u16 String size: " << u16str.size() << "\n";
  std::cout << "u16->u8 String size: " << u8conv.size() << "\n";
  // narrow streams have no text inserter for char16_t pointers, so the address is
  // what gets printed; the cast makes that explicit instead of relying on overload resolution
  std::cout << "u16 pointer address: " << static_cast<const void*>(u16str.c_str()) << "\n";
  std::cout << "UTF-16 produced: ";
  std::cout << std::hex << std::showbase;
  for (char16_t c : u16str)
    std::cout << static_cast<unsigned>(c) << ' '; /* print the code unit as a number */
  // std::hex / std::showbase are sticky: restore decimal so the sizes below are not printed in hex
  std::cout << std::dec << std::noshowbase;

  std::cout << "\n";
  std::cout << "\n";

  // size of one UTF-8 code unit (the element type), mirroring sizeof(char16_t) above;
  // sizeof(u8str) would report the size of the std::string object itself
  std::cout << "u8 Type size: " << sizeof(std::string::value_type) << "\n";
  std::cout << "u8 String size: " << u8str.size() << "\n";
  // a const char* is inserted as text, so this prints the UTF-8 bytes themselves
  std::cout << "u8 pointer: " << u8str.c_str() << "\n";
  std::cout << "UTF-16 to UTF-8 conversion produced: ";
  std::cout << std::hex << std::showbase;
  for (unsigned char c : u8conv) /* plain char may be signed, so bytes >127 would show up as negative numbers */
    std::cout << +(c) << ' '; /* unary + promotes to int so the byte prints as a number, not a character */
  std::cout << std::dec << std::noshowbase;

  std::cout << "\n";

  // binary mode: write the bytes exactly as converted, with no platform newline translation
  std::ofstream file("from_utf16.txt", std::ios::binary);
  if (!file) {
    std::cerr << "could not open from_utf16.txt for writing\n";
    return 1;
  }
  file.write(u8conv.data(), static_cast<std::streamsize>(u8conv.size()));
  file.flush(); // surface write errors here rather than silently in the destructor
  if (!file) {
    std::cerr << "could not write to from_utf16.txt\n";
    return 1;
  }

  return 0;
}
	#include <iostream>
	#include <locale>
	#include <string>
	#include <fstream>
	#include <codecvt>

	// utf8/utf16/utf32 can be directly written to file without conversions
	// sizeof(T) gives you bytes of the type
	// .size() / .length() give the count of code units (char, char16_t, ...), not the count of user-visible characters
	// if char16_t is stored, then open the file with utf16 encoding
	// utf8 is slowly becoming the standard; MacRoman was apple's default but now it's utf8
	// 'locale' can give you the system's default language, currency, and date settings
	// you need to know the encoding prior to decoding
	// you can convert a char16_t to machine's default charset by using std::locale
	// utf16 uses fewer bytes than utf8 for some text (e.g. most CJK: 2 vs 3 bytes) but more for ascii (2 vs 1 byte)
	// endianness is byte ordering; little-endian means the least significant byte is stored first
	// utf8 is compatible with basic ascii: those characters are a single byte long and the most significant bit is always 0
	// basic ascii is only 0-127, but utf8 can encode every unicode code point up to U+10FFFF (about 1.1 million; fits in 21 bits)
	// compiler: to calculate the byte length, or copy a utf8 string, it doesn't need to know about utf8
	// compiler: to calculate the number of code points, or to split a string correctly, it does need to know about utf8
	// splitting a string is the better example here. If you're interpreting it as ascii, but it actually has multi-byte utf8 characters in it, you can split in the middle of a code point by accident and produce two invalid or incorrect utf8 strings

	// Demo: shows how the text "ßx" is represented in UTF-16 and in UTF-8.
	// Prints code-unit sizes, string lengths and the raw code units to stdout, then
	// writes the UTF-8 bytes converted from the UTF-16 string to "from_utf16.txt".
	// Returns 0 on success, 1 if the output file cannot be opened or written.
	int main() {
		std::u16string u16str = u"ßx";

		// convert u16 to u8. you can imbue it also!
		// NOTE: std::wstring_convert and <codecvt> are deprecated since C++17; kept because
		// the standard library offers no direct replacement.
		std::string u8conv = std::wstring_convert<std::codecvt_utf8_utf16<char16_t>, char16_t>{}.to_bytes(u16str);

		std::string u8str = u8"ßx";

		std::cout << "\n";

		std::cout << "u16 Type size: " << sizeof(char16_t) << "\n";
		std::cout << "u16 String size: " << u16str.size() << "\n";
		std::cout << "u16->u8 String size: " << u8conv.size() << "\n";
		// narrow streams have no text inserter for char16_t pointers, so the address is
		// what gets printed; the cast makes that explicit instead of relying on overload resolution
		std::cout << "u16 pointer address: " << static_cast<const void*>(u16str.c_str()) << "\n";
		std::cout << "UTF-16 produced: ";
		std::cout << std::hex << std::showbase;
		for (char16_t c : u16str)
			std::cout << static_cast<unsigned>(c) << ' '; /* print the code unit as a number */
		// std::hex / std::showbase are sticky: restore decimal so the sizes below are not printed in hex
		std::cout << std::dec << std::noshowbase;

		std::cout << "\n";
		std::cout << "\n";

		// size of one UTF-8 code unit (the element type), mirroring sizeof(char16_t) above;
		// sizeof(u8str) would report the size of the std::string object itself
		std::cout << "u8 Type size: " << sizeof(std::string::value_type) << "\n";
		std::cout << "u8 String size: " << u8str.size() << "\n";
		// a const char* is inserted as text, so this prints the UTF-8 bytes themselves
		std::cout << "u8 pointer: " << u8str.c_str() << "\n";
		std::cout << "UTF-16 to UTF-8 conversion produced: ";
		std::cout << std::hex << std::showbase;
		for (unsigned char c : u8conv) /* plain char may be signed, so bytes >127 would show up as negative numbers */
			std::cout << +(c) << ' '; /* unary + promotes to int so the byte prints as a number, not a character */
		std::cout << std::dec << std::noshowbase;

		std::cout << "\n";

		// binary mode: write the bytes exactly as converted, with no platform newline translation
		std::ofstream file("from_utf16.txt", std::ios::binary);
		if (!file) {
			std::cerr << "could not open from_utf16.txt for writing\n";
			return 1;
		}
		file.write(u8conv.data(), static_cast<std::streamsize>(u8conv.size()));
		file.flush(); // surface write errors here rather than silently in the destructor
		if (!file) {
			std::cerr << "could not write to from_utf16.txt\n";
			return 1;
		}

		return 0;
	}