bwoods/UnicodeData.cpp

## ucd.cc
#if 0 /// this file can be executed directly (as a shell script)
# Shell-only preamble, hidden from the C++ compiler by the surrounding conditional:
# build this file next to itself (extension stripped), then run the result with
# the original arguments. Expansions are quoted so paths with spaces survive.
c++ --std=c++17 -Os "$0" -o "${0%.*}" && "${0%.*}" "$@"
exit $?
#endif

#include <fstream>
#include <string>
#include <regex>


/// Parses `string` as a base-16 integer (e.g. "0041" gives 65).
/// Delegates to std::stoi, so its exceptions propagate to the caller.
auto unhex(std::string const& string) {
	constexpr int base = 16;
	std::size_t* const consumed = nullptr; // the end-of-parse position is not needed

	return std::stoi(string, consumed, base);
}

/// Splits `string` at every match of `regex` and returns the pieces in between.
auto split(std::string const& string, std::regex const& regex) {
	using token_iterator = std::sregex_token_iterator;
	constexpr int between_matches = -1; // submatch -1 selects the text separating the matches

	token_iterator const first(string.begin(), string.end(), regex, between_matches, std::regex_constants::match_not_null);
	token_iterator const last; // a default-constructed iterator marks the end

	std::vector<std::string> pieces(first, last);
	return pieces;
}

/// Renders a property value as an SQL literal.
///
/// A value consisting solely of 4 to 6 upper-case hex digits (a code point, such as
/// a case mapping) is returned as its decimal integer. Anything else is returned as
/// a single-quoted SQL string with embedded single quotes doubled.
auto quoted(std::string const& string) {
	// Code points run up to 10FFFF, so six digits must be accepted.
	// Built once; constructing a std::regex per call is needlessly slow.
	static std::regex const codepoint("^[0-9A-F]{4,6}$");

	if (std::regex_match(string, codepoint))
		return std::to_string(unhex(string));

	std::string literal = "'";
	for (char const character : string) {
		if (character == '\'')
			literal += '\''; // SQL escapes a quote by doubling it
		literal += character;
	}
	literal += '\'';

	return literal;
}

/// Derives a table name from a path: drops everything up to the last '/',
/// then a trailing ".txt", then a trailing "Property".
auto basename(std::string string) {
	auto const file = std::regex_replace(string, std::regex(".*/"), ""); // remove path
	auto const stem = std::regex_replace(file, std::regex("\\.txt$"), ""); // remove extension

	return std::regex_replace(stem, std::regex("Property$"), ""); // remove Property
}


#include <stdio.h>

int main(int argc, char * const * const argv) {
	std::ios::sync_with_stdio(false);
	std::ifstream file;

	printf("BEGIN;\n");
	for (auto i = 1; i < argc && (file = std::ifstream(argv[i])); ++i) {
		auto filename = basename(argv[i]);
		printf("CREATE TABLE IF NOT EXISTS '%s' (codepoint INTEGER PRIMARY KEY NOT NULL, property);\n", filename.c_str());

		do {
			std::string line;
			std::getline(file, line);

			if (line = std::regex_replace(line, std::regex("[ ]*#.*$"), ""); line.empty()) // remove comments THEN check length
				continue;

			auto columns = split(line, std::regex("[ ]*;[ ]*"));

			if (std::regex_match(columns.front(), std::regex("^[0-9A-F]{4,5}$"))) { // ⇐ single codepoint
				printf("INSERT OR REPLACE INTO '%s' VALUES (%i, %s);\n", filename.c_str(), unhex(columns.front()), quoted(columns.back()).c_str());

			} else if (std::regex_match(columns.front(), std::regex("^[0-9A-F]{4,5}\\.\\.[0-9A-F]{4,5}$"))) { // ⇐ a range of codepoints
				auto codepoints = split(columns.front(), std::regex("\\.\\."));

				for (auto i = unhex(codepoints[0]), j = unhex(codepoints[1]); i <= j; ++i)
					printf("INSERT OR REPLACE INTO '%s' VALUES (%i, %s);\n", filename.c_str(), i, quoted(columns.back()).c_str());
			}

		} while (file);
	}

	printf("COMMIT;\n");
}


## ucd.sh
#!/usr/bin/awk -f
# ./ucd-parse UCD/extracted/DerivedGeneralCategory.txt…

# The output is one SQL transaction: BEGIN first, COMMIT after the last input line.
BEGIN { FS=";"; print "BEGIN;" }
END { print "COMMIT;" }

# Runs for every input line, before the pattern rules below.
{
	# the first line of each input file creates that file's table
	if (FNR == 1)
		print "CREATE TABLE IF NOT EXISTS '" basename(FILENAME) "' (codepoint INTEGER PRIMARY KEY, property);"

	gsub(/#.*/, ""); # remove comments
	gsub(/ /, "", $2) # removes spaces
}

# A range of code points "XXXX..YYYY": one INSERT per code point.
# Code points run up to 10FFFF, so up to six hex digits are accepted.
/^[0-9A-F]{4,6}\.\.[0-9A-F]{4,6}/ {
	split($1, values, /\.\./)
	last = unhex(values[2]) # parsed once, not on every iteration

	for (i = unhex(values[1]); i <= last; ++i)
		print "INSERT OR REPLACE INTO '" basename(FILENAME) "' VALUES (" i ", " quoted($2) ");"
}

# A single code point followed by at least one space.
/^[0-9A-F]{4,6}[ ]+/ {
	print "INSERT OR REPLACE INTO '" basename(FILENAME) "' VALUES (" unhex($1) ", " quoted($2) ");"
}


# Converts a string of hex digits to a number. Leading blanks are skipped and
# parsing stops at the first character that is not a hex digit, so "001F  "
# gives 31. The digits are decoded by hand because converting a "0x..." string
# with "+ 0" is not portable: some awk implementations yield 0 for it.
function unhex(str,    digits, value, pos, digit) {
	digits = "0123456789ABCDEF"
	value = 0

	str = toupper(str)
	sub(/^[ \t]+/, "", str)

	for (pos = 1; pos <= length(str); ++pos) {
		digit = index(digits, substr(str, pos, 1))
		if (digit == 0)
			break
		value = value * 16 + (digit - 1)
	}

	return value
}

# Renders a property value as an SQL literal: a value made up solely of 4 to 6
# upper-case hex digits (a code point) becomes a number, anything else becomes
# a single-quoted string with embedded single quotes doubled.
function quoted(str) {
	# anchored at both ends so that only a complete code point is converted
	if (str ~ /^[0-9A-F]{4,6}$/)
		return unhex(str);

	gsub(/'/, "''", str) # SQL escapes a quote by doubling it
	return "'" str "'";
}

# Table name for an input path: the last path component, without a trailing
# ".txt" and without a trailing "Property".
function basename(file,    parts, count) {
	count = split(file, parts, "/")
	file = parts[count] # whatever follows the last slash

	sub(/\.txt$/, "", file) # remove extension
	sub(/Property$/, "", file) # remove Property

	return file
}


## UnicodeData.cpp
#include <iostream>
#include <iomanip>

#include <fstream>
#include <sstream>
#include <string>

#include <vector>
#include <deque>
#include <array>
#include <map>

#include <cmath>


/// Builds a two-stage lookup table for one column of UnicodeData.txt and prints it as C++ source.
///
/// Records are read from "UnicodeData.txt" in the working directory and grouped
/// `width` at a time in file order. Identical groups are shared: stage2 holds each
/// distinct group once, and stage1 holds the 0-based stage2 row of every group.
/// Statistics are printed first, followed by the generated tables.
///
/// NOTE(review): groups are formed from consecutive *records*, not consecutive code
/// points. UnicodeData.txt presumably omits unassigned code points and abbreviates
/// large ranges, in which case `stage1[codepoint / width]` is not a valid lookup
/// into this output — confirm the intended indexing before relying on it.
int main()
{
	std::ios::sync_with_stdio(false);

	constexpr size_t width = 16;
	constexpr size_t column = 2;

	std::ifstream input("UnicodeData.txt");
	if (not input) {
		std::cerr << "UnicodeData.txt: cannot open file\n";
		return 1;
	}

	// One entry per record: the value of the selected column. Reading line by line
	// keeps the fields aligned with their record and adds nothing at end of file.
	std::deque<std::string> values;
	for (std::string line; std::getline(input, line); ) {
		if (line.empty())
			continue;

		std::istringstream fields(line);
		std::string value;
		bool complete = true;
		for (size_t index = 0; complete and index <= column; ++index)
			complete = static_cast<bool>(std::getline(fields, value, ';'));

		if (not complete)
			value.clear(); // the record has no (or an empty) field at `column`

		values.push_back(std::move(value));
	}

	std::vector<size_t> stage1;
	std::vector<std::array<std::string, width>> stage2;

	{
		std::map<std::array<std::string, width>, unsigned> groups;

		while (not values.empty()) {
			// Start from an empty group so that a short final group is padded with
			// empty strings rather than leftovers from the previous group.
			std::array<std::string, width> group = { };

			for (auto& element : group) {
				if (values.empty())
					break;

				element = std::move(values.front());
				values.pop_front();
			}

			// A new group is numbered with the stage2 row it is about to occupy.
			auto [itr, unseen] = groups.try_emplace(std::move(group), static_cast<unsigned>(stage2.size()));
			if (unseen)
				stage2.push_back(itr->first);

			stage1.push_back(itr->second);
		}

	}

	bool const narrow = stage2.size() <= 256; // every stage1 value fits in one byte

	std::stringstream output;

	// The "[]" makes the emitted declarations arrays; a scalar cannot take a list of values.
	output << "constexpr ";
	output << (narrow ? "uint8_t" : "uint16_t") << " stage1[] = {";
	for (size_t i = 0; i < stage1.size(); ++i) {
		if (i % 22 == 0) output << "\n  ";
		output << std::setw(3) << stage1[i] << ", ";
	}
	output << "\n};\n\n";


	output << "constexpr uint8_t stage2[] = {";
	for (auto const& row : stage2) {
		output << "\n  ";
		for (auto const& value : row)
			output << value << ", ";
	}
	output << "\n};\n\n";


	// str() returns a fresh copy on every call; take one copy and reuse it.
	std::string const generated = output.str();

	size_t lines = 0;
	for (char const character : generated)
		lines += (character == '\n');

	std::cout << "stage 1: " << stage1.size() << " entries\n";
	std::cout << "stage 2: " << stage2.size() << " entries\n\n";

	std::cout << "memory: " << (stage1.size() * (narrow ? 1 : 2)) + (stage2.size() * width) << " bytes\n";
	std::cout << "file:   " << lines << " lines\n\n\n";

	std::cout << generated << std::endl;
}


## UnicodeData.sh
#!/bin/bash


# awk program that rewrites UnicodeData.txt for import: hex fields become decimal
# numbers, the numeric value becomes a plain number, all other fields pass through.
# The program sits inside single quotes, so it must not contain a single quote.
awkscript='
BEGIN { FS = ";" }

{
	# every value is passed as a printf argument, never as the format itself,
	# so a percent sign in the data cannot be misread as a conversion
	printf "%s;", hex($1)   # code
	printf "%s;", $2        # name
	printf "%s;", $3        # general category
	printf "%s;", $4        # canonical combining class
	printf "%s;", $5        # bidi class
	printf "%s;", $6        # decomposition
	printf "%s;", hex($7)   # decimal digit value
	printf "%s;", hex($8)   # digit value
	printf "%s;", num($9)   # numeric value
	printf "%s;", $10       # bidi mirrored
	printf "%s;", $11       # unicode 1 name
	printf "%s;", $12       # ISO comment
	printf "%s;", hex($13)  # simple uppercase
	printf "%s;", hex($14)  # simple lowercase
	printf "%s", hex($15)   # simple title

	printf "\n"
}


# Hex digits to a number; an empty field stays empty. The digits are decoded by
# hand because converting a "0x..." string with "+ 0" is not portable across
# awk implementations.
function hex(str,    digits, value, pos, digit) {
	if(str == "")
		return str;

	digits = "0123456789ABCDEF"
	value = 0
	str = toupper(str)

	for (pos = 1; pos <= length(str); ++pos) {
		digit = index(digits, substr(str, pos, 1))
		if (digit == 0)
			break
		value = value * 16 + (digit - 1)
	}

	return value;
}

# Numeric value to a number; an empty field stays empty. A fraction such as
# "1/2" is divided out instead of being cut off at the slash.
function num(str,    parts) {
	if(str == "")
		return str;

	if (split(str, parts, "/") == 2 && parts[2] + 0 != 0)
		return parts[1] / parts[2];

	return str + 0;
}
'

# Convert first, and stop before touching the database when the conversion fails,
# so that a missing UnicodeData.txt does not destroy an existing Unicode.db.
awk "$awkscript" UnicodeData.txt > UnicodeData.tmp || { rm -f UnicodeData.tmp; exit 1; }

\rm -f Unicode.db # silence File Not Found warnings
# The quoted delimiter passes the SQL below to sqlite3 verbatim, without shell expansion.
sqlite3 Unicode.db << 'EOS'

CREATE TEMPORARY TABLE unicode_data (
	Value INTEGER,
	Name TEXT,
	General_Category TEXT,
	Canonical_Combining_Class INTEGER,
	Bidi_Class TEXT,
	Decomposition_Mapping TEXT,
	Decimal_Digit_Value INTEGER,
	Digit_Value INTEGER,
	Numeric_Value FLOAT,
	Bidi_Mirrored TEXT,
	Unicode_1_Name TEXT,
	ISO_Comment TEXT,
	Simple_Uppercase_Mapping INTEGER,
	Simple_Lowercase_Mapping INTEGER,
	Simple_Titlecase_Mapping INTEGER
);

.separator ;
.import UnicodeData.tmp "unicode_data"

PRAGMA page_size=2048;
BEGIN TRANSACTION;

-- To get HEX values for code:
-- 	SELECT printf('%05X', code) AS hex FROM …

CREATE TABLE "unicode name" (code INTEGER PRIMARY KEY, name TEXT);
 INSERT INTO "unicode name" SELECT Value, lower(Name) FROM unicode_data
	WHERE Name NOT LIKE '<%>';

CREATE TABLE "unicode category" (code INTEGER PRIMARY KEY, general TEXT);
 INSERT INTO "unicode category" SELECT Value, General_Category FROM unicode_data;

CREATE TABLE "unicode combining" (code INTEGER, class INTEGER);
 INSERT INTO "unicode combining" SELECT Value, Canonical_Combining_Class FROM unicode_data
	WHERE Canonical_Combining_Class <> 0;

CREATE TABLE "unicode bidi" (code INTEGER, class TEXT);
 INSERT INTO "unicode bidi" SELECT Value, Bidi_Class FROM unicode_data;

CREATE TABLE "unicode mirrored" (code INTEGER);
 INSERT INTO "unicode mirrored" SELECT Value FROM unicode_data
	WHERE Bidi_Mirrored <> 'N';

CREATE TABLE "unicode decomposition" (code INTEGER PRIMARY KEY, mapping TEXT);
 INSERT INTO "unicode decomposition" SELECT Value, Decomposition_Mapping FROM unicode_data
	WHERE Decomposition_Mapping <> '' AND Decomposition_Mapping NOT LIKE '<%';

CREATE TABLE "unicode decimal" (code INTEGER, value INTEGER);
 INSERT INTO "unicode decimal" SELECT Value, Decimal_Digit_Value FROM unicode_data
	WHERE Decimal_Digit_Value <> '';

CREATE TABLE "unicode digit" (code INTEGER, value INTEGER);
 INSERT INTO "unicode digit" SELECT Value, Digit_Value FROM unicode_data
	WHERE Digit_Value <> '';

CREATE TABLE "unicode uppercase" (code INTEGER PRIMARY KEY, mapping INTEGER);
 INSERT INTO "unicode uppercase" SELECT Value, Simple_Uppercase_Mapping FROM unicode_data
	WHERE Simple_Uppercase_Mapping <> '';

CREATE TABLE "unicode lowercase" (code INTEGER PRIMARY KEY, mapping INTEGER);
 INSERT INTO "unicode lowercase" SELECT Value, Simple_Lowercase_Mapping FROM unicode_data
	WHERE Simple_Lowercase_Mapping <> '';

CREATE TABLE "unicode titlecase" (code INTEGER PRIMARY KEY, mapping INTEGER);
 INSERT INTO "unicode titlecase" SELECT Value, Simple_Titlecase_Mapping FROM unicode_data
	WHERE Simple_Titlecase_Mapping <> '';

COMMIT TRANSACTION;

-- Example query
SELECT printf('0x%05X', start) as first, printf('0x%05X', end) as last FROM (
  WITH sequence AS (
    SELECT code FROM "unicode combining"
  )
  -- http://www.xaprb.com/blog/2006/03/22/find-contiguous-ranges-with-sql/
  SELECT l.code AS start, (
      SELECT min(a.code) AS id
        FROM sequence AS a LEFT OUTER JOIN sequence AS b ON a.code = b.code - 1
       WHERE b.code IS null AND a.code >= l.code
    ) AS end
  FROM sequence AS l LEFT OUTER JOIN sequence AS r ON r.code = l.code - 1
  WHERE r.code IS null
);

EOS

rm -f UnicodeData.tmp
	#if 0 /// this file can be executed directly (as a shell script)
	# Shell-only preamble, hidden from the C++ compiler by the surrounding conditional:
	# build this file next to itself (extension stripped), then run the result with
	# the original arguments. The pattern must be ".*": a bare "." strips nothing
	# here, which would make the compiler write its output over this source file.
	c++ --std=c++17 -Os "$0" -o "${0%.*}" && "${0%.*}" "$@"
	exit $?
	#endif

	#include <fstream>
	#include <string>
	#include <regex>


	/// Parses `string` as a base-16 integer (e.g. "0041" gives 65).
	/// Delegates to std::stoi, so its exceptions propagate to the caller.
	auto unhex(std::string const& string) {
		constexpr int base = 16;
		std::size_t* const consumed = nullptr; // the end-of-parse position is not needed

		return std::stoi(string, consumed, base);
	}

	/// Splits `string` at every match of `regex` and returns the pieces in between.
	auto split(std::string const& string, std::regex const& regex) {
		using token_iterator = std::sregex_token_iterator;
		constexpr int between_matches = -1; // submatch -1 selects the text separating the matches

		token_iterator const first(string.begin(), string.end(), regex, between_matches, std::regex_constants::match_not_null);
		token_iterator const last; // a default-constructed iterator marks the end

		std::vector<std::string> pieces(first, last);
		return pieces;
	}

	/// Renders a property value as an SQL literal.
	///
	/// A value consisting solely of 4 to 6 upper-case hex digits (a code point, such as
	/// a case mapping) is returned as its decimal integer. Anything else is returned as
	/// a single-quoted SQL string with embedded single quotes doubled.
	auto quoted(std::string const& string) {
		// Code points run up to 10FFFF, so six digits must be accepted.
		// Built once; constructing a std::regex per call is needlessly slow.
		static std::regex const codepoint("^[0-9A-F]{4,6}$");

		if (std::regex_match(string, codepoint))
			return std::to_string(unhex(string));

		std::string literal = "'";
		for (char const character : string) {
			if (character == '\'')
				literal += '\''; // SQL escapes a quote by doubling it
			literal += character;
		}
		literal += '\'';

		return literal;
	}

	/// Derives a table name from a path: drops everything up to the last '/',
	/// then a trailing ".txt", then a trailing "Property".
	auto basename(std::string string) {
		auto const file = std::regex_replace(string, std::regex(".*/"), ""); // remove path
		auto const stem = std::regex_replace(file, std::regex("\\.txt$"), ""); // remove extension

		return std::regex_replace(stem, std::regex("Property$"), ""); // remove Property
	}


	#include <stdio.h>

	int main(int argc, char * const * const argv) {
	std::ios::sync_with_stdio(false);
	std::ifstream file;

	printf("BEGIN;\n");
	for (auto i = 1; i < argc && (file = std::ifstream(argv[i])); ++i) {
	auto filename = basename(argv[i]);
	printf("CREATE TABLE IF NOT EXISTS '%s' (codepoint INTEGER PRIMARY KEY NOT NULL, property);\n", filename.c_str());

	do {
	std::string line;
	std::getline(file, line);

	if (line = std::regex_replace(line, std::regex("[ ]#.$"), ""); line.empty()) // remove comments THEN check length
	continue;

	auto columns = split(line, std::regex("[ ];[ ]"));

	if (std::regex_match(columns.front(), std::regex("^[0-9A-F]{4,5}$"))) { // ⇐ single codepoint
	printf("INSERT OR REPLACE INTO '%s' VALUES (%i, %s);\n", filename.c_str(), unhex(columns.front()), quoted(columns.back()).c_str());

	} else if (std::regex_match(columns.front(), std::regex("^[0-9A-F]{4,5}\\.\\.[0-9A-F]{4,5}$"))) { // ⇐ a range of codepoints
	auto codepoints = split(columns.front(), std::regex("\\.\\."));

	for (auto i = unhex(codepoints[0]), j = unhex(codepoints[1]); i <= j; ++i)
	printf("INSERT OR REPLACE INTO '%s' VALUES (%i, %s);\n", filename.c_str(), i, quoted(columns.back()).c_str());
	}

	} while (file);
	}

	printf("COMMIT;\n");
	}
	#!/usr/bin/awk -f
	# ./ucd-parse UCD/extracted/DerivedGeneralCategory.txt…

	# The output is one SQL transaction: BEGIN first, COMMIT after the last input line.
	BEGIN { FS=";"; print "BEGIN;" }
	END { print "COMMIT;" }

	# Runs for every input line, before the pattern rules below.
	{
		# the first line of each input file creates that file's table
		if (FNR == 1)
			print "CREATE TABLE IF NOT EXISTS '" basename(FILENAME) "' (codepoint INTEGER PRIMARY KEY, property);"

		gsub(/#.*/, ""); # remove comments
		gsub(/ /, "", $2) # removes spaces
	}

	# A range of code points "XXXX..YYYY": one INSERT per code point.
	# Code points run up to 10FFFF, so up to six hex digits are accepted.
	/^[0-9A-F]{4,6}\.\.[0-9A-F]{4,6}/ {
		split($1, values, /\.\./)
		last = unhex(values[2]) # parsed once, not on every iteration

		for (i = unhex(values[1]); i <= last; ++i)
			print "INSERT OR REPLACE INTO '" basename(FILENAME) "' VALUES (" i ", " quoted($2) ");"
	}

	# A single code point followed by at least one space.
	/^[0-9A-F]{4,6}[ ]+/ {
		print "INSERT OR REPLACE INTO '" basename(FILENAME) "' VALUES (" unhex($1) ", " quoted($2) ");"
	}


	# Converts a string of hex digits to a number. Leading blanks are skipped and
	# parsing stops at the first character that is not a hex digit, so "001F  "
	# gives 31. The digits are decoded by hand because converting a "0x..." string
	# with "+ 0" is not portable: some awk implementations yield 0 for it.
	function unhex(str,    digits, value, pos, digit) {
		digits = "0123456789ABCDEF"
		value = 0

		str = toupper(str)
		sub(/^[ \t]+/, "", str)

		for (pos = 1; pos <= length(str); ++pos) {
			digit = index(digits, substr(str, pos, 1))
			if (digit == 0)
				break
			value = value * 16 + (digit - 1)
		}

		return value
	}

	# Renders a property value as an SQL literal: a value made up solely of 4 to 6
	# upper-case hex digits (a code point) becomes a number, anything else becomes
	# a single-quoted string with embedded single quotes doubled.
	function quoted(str) {
		# anchored at both ends so that only a complete code point is converted
		if (str ~ /^[0-9A-F]{4,6}$/)
			return unhex(str);

		gsub(/'/, "''", str) # SQL escapes a quote by doubling it
		return "'" str "'";
	}

	# Table name for an input path: the last path component, without a trailing
	# ".txt" and without a trailing "Property".
	function basename(file,    parts, count) {
		count = split(file, parts, "/")
		file = parts[count] # whatever follows the last slash

		sub(/\.txt$/, "", file) # remove extension
		sub(/Property$/, "", file) # remove Property

		return file
	}
	#include <iostream>
	#include <iomanip>

	#include <fstream>
	#include <sstream>
	#include <string>

	#include <vector>
	#include <deque>
	#include <array>
	#include <map>

	#include <cmath>


	/// Builds a two-stage lookup table for one column of UnicodeData.txt and prints it as C++ source.
	///
	/// Records are read from "UnicodeData.txt" in the working directory and grouped
	/// `width` at a time in file order. Identical groups are shared: stage2 holds each
	/// distinct group once, and stage1 holds the 0-based stage2 row of every group.
	/// Statistics are printed first, followed by the generated tables.
	///
	/// NOTE(review): groups are formed from consecutive *records*, not consecutive code
	/// points. UnicodeData.txt presumably omits unassigned code points and abbreviates
	/// large ranges, in which case `stage1[codepoint / width]` is not a valid lookup
	/// into this output — confirm the intended indexing before relying on it.
	int main()
	{
		std::ios::sync_with_stdio(false);

		constexpr size_t width = 16;
		constexpr size_t column = 2;

		std::ifstream input("UnicodeData.txt");
		if (not input) {
			std::cerr << "UnicodeData.txt: cannot open file\n";
			return 1;
		}

		// One entry per record: the value of the selected column. Reading line by line
		// keeps the fields aligned with their record and adds nothing at end of file.
		std::deque<std::string> values;
		for (std::string line; std::getline(input, line); ) {
			if (line.empty())
				continue;

			std::istringstream fields(line);
			std::string value;
			bool complete = true;
			for (size_t index = 0; complete and index <= column; ++index)
				complete = static_cast<bool>(std::getline(fields, value, ';'));

			if (not complete)
				value.clear(); // the record has no (or an empty) field at `column`

			values.push_back(std::move(value));
		}

		std::vector<size_t> stage1;
		std::vector<std::array<std::string, width>> stage2;

		{
			std::map<std::array<std::string, width>, unsigned> groups;

			while (not values.empty()) {
				// Start from an empty group so that a short final group is padded with
				// empty strings rather than leftovers from the previous group.
				std::array<std::string, width> group = { };

				for (auto& element : group) {
					if (values.empty())
						break;

					element = std::move(values.front());
					values.pop_front();
				}

				// A new group is numbered with the stage2 row it is about to occupy.
				auto [itr, unseen] = groups.try_emplace(std::move(group), static_cast<unsigned>(stage2.size()));
				if (unseen)
					stage2.push_back(itr->first);

				stage1.push_back(itr->second);
			}

		}

		bool const narrow = stage2.size() <= 256; // every stage1 value fits in one byte

		std::stringstream output;

		// The "[]" makes the emitted declarations arrays; a scalar cannot take a list of values.
		output << "constexpr ";
		output << (narrow ? "uint8_t" : "uint16_t") << " stage1[] = {";
		for (size_t i = 0; i < stage1.size(); ++i) {
			if (i % 22 == 0) output << "\n ";
			output << std::setw(3) << stage1[i] << ", ";
		}
		output << "\n};\n\n";


		output << "constexpr uint8_t stage2[] = {";
		for (auto const& row : stage2) {
			output << "\n ";
			for (auto const& value : row)
				output << value << ", ";
		}
		output << "\n};\n\n";


		// str() returns a fresh copy on every call; take one copy and reuse it.
		std::string const generated = output.str();

		size_t lines = 0;
		for (char const character : generated)
			lines += (character == '\n');

		std::cout << "stage 1: " << stage1.size() << " entries\n";
		std::cout << "stage 2: " << stage2.size() << " entries\n\n";

		std::cout << "memory: " << (stage1.size() * (narrow ? 1 : 2)) + (stage2.size() * width) << " bytes\n";
		std::cout << "file: " << lines << " lines\n\n\n";

		std::cout << generated << std::endl;
	}
	#!/bin/bash


	# awk program that rewrites UnicodeData.txt for import: hex fields become decimal
	# numbers, the numeric value becomes a plain number, all other fields pass through.
	# The program sits inside single quotes, so it must not contain a single quote.
	awkscript='
	BEGIN { FS = ";" }

	{
		# every value is passed as a printf argument, never as the format itself,
		# so a percent sign in the data cannot be misread as a conversion
		printf "%s;", hex($1)   # code
		printf "%s;", $2        # name
		printf "%s;", $3        # general category
		printf "%s;", $4        # canonical combining class
		printf "%s;", $5        # bidi class
		printf "%s;", $6        # decomposition
		printf "%s;", hex($7)   # decimal digit value
		printf "%s;", hex($8)   # digit value
		printf "%s;", num($9)   # numeric value
		printf "%s;", $10       # bidi mirrored
		printf "%s;", $11       # unicode 1 name
		printf "%s;", $12       # ISO comment
		printf "%s;", hex($13)  # simple uppercase
		printf "%s;", hex($14)  # simple lowercase
		printf "%s", hex($15)   # simple title

		printf "\n"
	}


	# Hex digits to a number; an empty field stays empty. The digits are decoded by
	# hand because converting a "0x..." string with "+ 0" is not portable across
	# awk implementations.
	function hex(str,    digits, value, pos, digit) {
		if(str == "")
			return str;

		digits = "0123456789ABCDEF"
		value = 0
		str = toupper(str)

		for (pos = 1; pos <= length(str); ++pos) {
			digit = index(digits, substr(str, pos, 1))
			if (digit == 0)
				break
			value = value * 16 + (digit - 1)
		}

		return value;
	}

	# Numeric value to a number; an empty field stays empty. A fraction such as
	# "1/2" is divided out instead of being cut off at the slash.
	function num(str,    parts) {
		if(str == "")
			return str;

		if (split(str, parts, "/") == 2 && parts[2] + 0 != 0)
			return parts[1] / parts[2];

		return str + 0;
	}
	'

	# Convert first, and stop before touching the database when the conversion fails,
	# so that a missing UnicodeData.txt does not destroy an existing Unicode.db.
	awk "$awkscript" UnicodeData.txt > UnicodeData.tmp || { rm -f UnicodeData.tmp; exit 1; }

	\rm -f Unicode.db # silence File Not Found warnings
	# "<<-" strips the leading tabs, so the indented EOS line ends the here-document
	# and the dot-commands reach sqlite3 at the left margin. The quoted delimiter
	# passes the SQL verbatim, without shell expansion.
	sqlite3 Unicode.db <<- 'EOS'

	CREATE TEMPORARY TABLE unicode_data (
	Value INTEGER,
	Name TEXT,
	General_Category TEXT,
	Canonical_Combining_Class INTEGER,
	Bidi_Class TEXT,
	Decomposition_Mapping TEXT,
	Decimal_Digit_Value INTEGER,
	Digit_Value INTEGER,
	Numeric_Value FLOAT,
	Bidi_Mirrored TEXT,
	Unicode_1_Name TEXT,
	ISO_Comment TEXT,
	Simple_Uppercase_Mapping INTEGER,
	Simple_Lowercase_Mapping INTEGER,
	Simple_Titlecase_Mapping INTEGER
	);

	.separator ;
	.import UnicodeData.tmp "unicode_data"

	PRAGMA page_size=2048;
	BEGIN TRANSACTION;

	-- To get HEX values for code:
	-- SELECT printf('%05X', code) AS hex FROM …

	CREATE TABLE "unicode name" (code INTEGER PRIMARY KEY, name TEXT);
	INSERT INTO "unicode name" SELECT Value, lower(Name) FROM unicode_data
	WHERE Name NOT LIKE '<%>';

	CREATE TABLE "unicode category" (code INTEGER PRIMARY KEY, general TEXT);
	INSERT INTO "unicode category" SELECT Value, General_Category FROM unicode_data;

	CREATE TABLE "unicode combining" (code INTEGER, class INTEGER);
	INSERT INTO "unicode combining" SELECT Value, Canonical_Combining_Class FROM unicode_data
	WHERE Canonical_Combining_Class <> 0;

	CREATE TABLE "unicode bidi" (code INTEGER, class TEXT);
	INSERT INTO "unicode bidi" SELECT Value, Bidi_Class FROM unicode_data;

	CREATE TABLE "unicode mirrored" (code INTEGER);
	INSERT INTO "unicode mirrored" SELECT Value FROM unicode_data
	WHERE Bidi_Mirrored <> 'N';

	CREATE TABLE "unicode decomposition" (code INTEGER PRIMARY KEY, mapping TEXT);
	INSERT INTO "unicode decomposition" SELECT Value, Decomposition_Mapping FROM unicode_data
	WHERE Decomposition_Mapping <> '' AND Decomposition_Mapping NOT LIKE '<%';

	CREATE TABLE "unicode decimal" (code INTEGER, value INTEGER);
	INSERT INTO "unicode decimal" SELECT Value, Decimal_Digit_Value FROM unicode_data
	WHERE Decimal_Digit_Value <> '';

	CREATE TABLE "unicode digit" (code INTEGER, value INTEGER);
	INSERT INTO "unicode digit" SELECT Value, Digit_Value FROM unicode_data
	WHERE Digit_Value <> '';

	CREATE TABLE "unicode uppercase" (code INTEGER PRIMARY KEY, mapping INTEGER);
	INSERT INTO "unicode uppercase" SELECT Value, Simple_Uppercase_Mapping FROM unicode_data
	WHERE Simple_Uppercase_Mapping <> '';

	CREATE TABLE "unicode lowercase" (code INTEGER PRIMARY KEY, mapping INTEGER);
	INSERT INTO "unicode lowercase" SELECT Value, Simple_Lowercase_Mapping FROM unicode_data
	WHERE Simple_Lowercase_Mapping <> '';

	CREATE TABLE "unicode titlecase" (code INTEGER PRIMARY KEY, mapping INTEGER);
	INSERT INTO "unicode titlecase" SELECT Value, Simple_Titlecase_Mapping FROM unicode_data
	WHERE Simple_Titlecase_Mapping <> '';

	COMMIT TRANSACTION;

	-- Example query
	SELECT printf('0x%05X', start) as first, printf('0x%05X', end) as last FROM (
	WITH sequence AS (
	SELECT code FROM "unicode combining"
	)
	-- http://www.xaprb.com/blog/2006/03/22/find-contiguous-ranges-with-sql/
	SELECT l.code AS start, (
	SELECT min(a.code) AS id
	FROM sequence AS a LEFT OUTER JOIN sequence AS b ON a.code = b.code - 1
	WHERE b.code IS null AND a.code >= l.code
	) AS end
	FROM sequence AS l LEFT OUTER JOIN sequence AS r ON r.code = l.code - 1
	WHERE r.code IS null
	);

	EOS

	rm -f UnicodeData.tmp