Skip to content

Instantly share code, notes, and snippets.

@bwoods
Last active May 31, 2024 23:27
Show Gist options
  • Save bwoods/e5bd964fc360ba5e920e to your computer and use it in GitHub Desktop.
Save bwoods/e5bd964fc360ba5e920e to your computer and use it in GitHub Desktop.
Parsing UnicodeData.txt
#if 0 /// this file can be executed directly (as a shell script); expansions are quoted so paths and arguments with spaces survive
c++ --std=c++17 -Os "$0" -o "${0%.*}" && "${0%.*}" "$@"
exit $?
#endif
#include <fstream>
#include <regex>
#include <string>
#include <vector>
/// Parses `string` as a base-16 integer, e.g. "0041" yields 65.
/// The conversion is delegated to std::stoi, so its exceptions propagate
/// unchanged when nothing can be parsed or the value does not fit in an int.
int unhex(std::string const& string) {
    constexpr int base = 16;
    std::size_t* const consumed = nullptr; // the number of parsed characters is not needed
    return std::stoi(string, consumed, base);
}
/// Splits `string` at every match of `regex` and returns the pieces found
/// between the matches, in order.
std::vector<std::string> split(std::string const& string, std::regex const& regex) {
    using token_iterator = std::sregex_token_iterator;

    // Submatch index -1 selects the text *between* matches rather than the
    // matches themselves; match_not_null keeps an empty match from splitting.
    token_iterator const last;
    token_iterator first(string.begin(), string.end(), regex, -1,
                         std::regex_constants::match_not_null);

    std::vector<std::string> pieces;
    for (; first != last; ++first)
        pieces.push_back(first->str());
    return pieces;
}
/// Renders a property value as an SQL literal.
///
/// A value made up solely of four to six uppercase hexadecimal digits is taken
/// to be a code point and returned as its decimal number; every other value is
/// returned as a single-quoted SQL string.
std::string quoted(std::string const& string) {
    // Code points run up to 10FFFF, so six digits have to be accepted as well.
    // The pattern is compiled once instead of on every call.
    static std::regex const codepoint("^[0-9A-F]{4,6}$");
    if (std::regex_match(string, codepoint))
        return std::to_string(std::stoi(string, nullptr, 16)); // same conversion as unhex()

    // Double every embedded apostrophe; a bare one would end the SQL string early.
    std::string literal = "'";
    for (char const c : string) {
        if (c == '\'')
            literal += '\'';
        literal += c;
    }
    literal += '\'';
    return literal;
}
/// Derives a table name from a file path: the directory part, a trailing
/// ".txt" and a trailing "Property" are removed, in that order.
std::string basename(std::string string) {
    static char const* const patterns[] = {
        ".*/",        // remove path
        "\\.txt$",    // remove extension
        "Property$",  // remove Property
    };
    for (char const* const pattern : patterns)
        string = std::regex_replace(string, std::regex(pattern), "");
    return string;
}
#include <stdio.h>
int main(int argc, char * const * const argv) {
std::ios::sync_with_stdio(false);
std::ifstream file;
printf("BEGIN;\n");
for (auto i = 1; i < argc && (file = std::ifstream(argv[i])); ++i) {
auto filename = basename(argv[i]);
printf("CREATE TABLE IF NOT EXISTS '%s' (codepoint INTEGER PRIMARY KEY NOT NULL, property);\n", filename.c_str());
do {
std::string line;
std::getline(file, line);
if (line = std::regex_replace(line, std::regex("[ ]*#.*$"), ""); line.empty()) // remove comments THEN check length
continue;
auto columns = split(line, std::regex("[ ]*;[ ]*"));
if (std::regex_match(columns.front(), std::regex("^[0-9A-F]{4,5}$"))) { // ⇐ single codepoint
printf("INSERT OR REPLACE INTO '%s' VALUES (%i, %s);\n", filename.c_str(), unhex(columns.front()), quoted(columns.back()).c_str());
} else if (std::regex_match(columns.front(), std::regex("^[0-9A-F]{4,5}\\.\\.[0-9A-F]{4,5}$"))) { // ⇐ a range of codepoints
auto codepoints = split(columns.front(), std::regex("\\.\\."));
for (auto i = unhex(codepoints[0]), j = unhex(codepoints[1]); i <= j; ++i)
printf("INSERT OR REPLACE INTO '%s' VALUES (%i, %s);\n", filename.c_str(), i, quoted(columns.back()).c_str());
}
} while (file);
}
printf("COMMIT;\n");
}
#!/usr/bin/awk -f
# ./ucd-parse UCD/extracted/DerivedGeneralCategory.txt…
# Fields are separated by semicolons; all generated statements are wrapped
# in a single transaction that is opened here ...
BEGIN {
    FS = ";"
    print "BEGIN;"
}

# ... and closed once every input file has been read.
END {
    print "COMMIT;"
}
# Runs for every input line, before the pattern rules that follow.
{
    # The first line of each input file starts a new table named after that file.
    if (FNR == 1)
        print "CREATE TABLE IF NOT EXISTS '" basename(FILENAME) "' (codepoint INTEGER PRIMARY KEY, property);"
    gsub(/#.*/, "")                # remove comments
    gsub(/^[ ]+|[ ]+$/, "", $2)    # trim the value but keep the spaces inside it (e.g. in names)
}
# A range of codepoints such as "0000..001F ; Cc": one row per codepoint.
# Codepoints have four to six hexadecimal digits (the last one is 10FFFF).
/^[0-9A-F]{4,6}\.\.[0-9A-F]{4,6}/ {
    split($1, values, /\.\./)
    # Everything except the codepoint is the same for each row, so build it once.
    head = "INSERT OR REPLACE INTO '" basename(FILENAME) "' VALUES ("
    tail = ", " quoted($2) ");"
    last = unhex(values[2])
    for (i = unhex(values[1]); i <= last; ++i)
        print head i tail
}
# A single codepoint such as "00AD ; Cf". Codepoints have four to six
# hexadecimal digits (the last one is 10FFFF).
/^[0-9A-F]{4,6}[ ]+/ {
    print "INSERT OR REPLACE INTO '" basename(FILENAME) "' VALUES (" unhex($1) ", " quoted($2) ");"
}
# Converts a hexadecimal string such as "1F600" to its numeric value.
# The digits are accumulated by hand because awk implementations disagree on
# whether a "0x..." string converts to a number (gawk only does so when run
# with --non-decimal-data). Leading blanks are skipped and conversion stops at
# the first character that is not a hexadecimal digit.
function unhex(str,    i, n, digit) {
    sub(/^[ \t]+/, "", str)
    n = 0
    for (i = 1; i <= length(str); ++i) {
        digit = index("0123456789ABCDEF", toupper(substr(str, i, 1)))
        if (digit == 0)
            break
        n = n * 16 + digit - 1
    }
    return n
}
# Renders a property value as an SQL literal: a value made up solely of four
# to six uppercase hexadecimal digits is treated as a codepoint and returned
# as a number, anything else is returned as a single-quoted string.
function quoted(str) {
    # Anchored at both ends so that a value which merely starts with
    # hexadecimal digits is not mistaken for a codepoint.
    if (str ~ /^[0-9A-F]{4,6}$/)
        return unhex(str)
    gsub(/'/, "''", str)    # double embedded apostrophes so they cannot end the string
    return "'" str "'"
}
# Derives a table name from a file path by dropping, in this order, the
# directory part, a trailing ".txt" and a trailing "Property".
function basename(file,    name) {
    name = file
    sub(/.*\//, "", name)        # remove path
    sub(/\.txt$/, "", name)      # remove extension
    sub(/Property$/, "", name)   # remove Property
    return name
}
#include <iostream>
#include <iomanip>
#include <fstream>
#include <sstream>
#include <string>
#include <vector>
#include <deque>
#include <array>
#include <map>
#include <cmath>
/// Builds a two-stage lookup table for one field of UnicodeData.txt and prints
/// it, preceded by a few size statistics, to standard output.
///
/// Stage 2 holds the distinct blocks of `width` consecutive code points and
/// stage 1 maps `codepoint / width` to the index of its block, so a value is
/// found as `stage2[stage1[cp / width]][cp % width]`.
///
/// The stage-2 entries are emitted as the raw field text (e.g. `Lu`), so the
/// consumer presumably has to provide constants with those names.
///
/// Returns 0 on success and 1 when UnicodeData.txt cannot be opened.
int main()
{
    std::ios::sync_with_stdio(false);

    constexpr std::size_t width = 16;        // code points per stage-2 block
    constexpr std::size_t column = 2;        // field to tabulate (2 = general category)
    constexpr std::size_t limit = 0x110000;  // one past the last code point, U+10FFFF
    static_assert(limit % width == 0, "the code space must split into whole blocks");
    const std::string fallback = "Cn";       // general category of code points the file does not list

    const auto ends_with = [](const std::string& text, const std::string& suffix) {
        return text.size() >= suffix.size()
            && text.compare(text.size() - suffix.size(), suffix.size(), suffix) == 0;
    };

    std::ifstream input("UnicodeData.txt");
    if (!input) {
        std::cerr << "UnicodeData.txt: cannot open file\n";
        return 1;
    }

    // Expand the sparse file into one value per code point. The file omits
    // unassigned code points and abbreviates large ranges to a "<..., First>" /
    // "<..., Last>" pair of lines, so the line number says nothing about the
    // code point; the table has to be indexed by the code point itself.
    std::vector<std::string> values(limit, fallback);
    std::size_t range_first = limit; // `limit` doubles as "no range is open"
    for (std::string line; std::getline(input, line); ) {
        std::vector<std::string> fields;
        std::istringstream record(line);
        for (std::string field; std::getline(record, field, ';'); )
            fields.push_back(field);
        if (fields.size() < 2 || fields.size() <= column)
            continue; // blank or truncated line

        std::size_t code = 0;
        std::istringstream number(fields[0]);
        if (!(number >> std::hex >> code) || code >= limit)
            continue; // the first field is not a code point

        const std::string& name = fields[1];
        std::size_t first = code;
        if (ends_with(name, ", First>")) {
            range_first = code; // the matching "Last" line fills in the range
        } else {
            if (ends_with(name, ", Last>") && range_first <= code)
                first = range_first;
            range_first = limit;
        }
        for (std::size_t cp = first; cp <= code; ++cp)
            values[cp] = fields[column];
    }

    // Deduplicate the blocks: stage2 keeps each distinct block once, in order
    // of first appearance, and stage1 records which block every slot uses.
    std::vector<std::size_t> stage1;
    std::vector<std::array<std::string, width>> stage2;
    {
        std::map<std::array<std::string, width>, std::size_t> groups;
        for (std::size_t base = 0; base < limit; base += width) {
            std::array<std::string, width> group;
            for (std::size_t offset = 0; offset < width; ++offset)
                group[offset] = values[base + offset];

            const auto [itr, unseen] = groups.try_emplace(std::move(group), stage2.size());
            if (unseen)
                stage2.push_back(itr->first);
            stage1.push_back(itr->second);
        }
    }

    // Stage-1 entries are block indices, so their type depends on the block count.
    const bool narrow = stage2.size() <= 256;
    const std::size_t index_bytes = narrow ? 1 : 2;

    std::stringstream output;
    output << "constexpr " << (narrow ? "uint8_t" : "uint16_t") << " stage1[] = {";
    for (std::size_t i = 0; i < stage1.size(); ++i) {
        if (i % 22 == 0) output << "\n ";
        output << std::setw(3) << stage1[i] << ", ";
    }
    output << "\n};\n\n";

    output << "constexpr uint8_t stage2[][" << width << "] = {";
    for (const auto& block : stage2) {
        output << "\n ";
        for (const auto& value : block)
            output << value << ", ";
    }
    output << "\n};\n\n";

    // str() returns a fresh copy on every call; take one copy so that all
    // iteration happens over a single string.
    const std::string text = output.str();
    std::size_t newlines = 0;
    for (const char ch : text)
        if (ch == '\n') ++newlines;

    std::cout << "stage 1: " << stage1.size() << " entries\n";
    std::cout << "stage 2: " << stage2.size() << " entries\n\n";
    std::cout << "memory: " << (stage1.size() * index_bytes) + (stage2.size() * width) << " bytes\n";
    std::cout << "file: " << newlines << " lines\n\n\n";
    std::cout << text << std::endl;
}
#!/bin/bash
# awk program that rewrites UnicodeData.txt for import: hexadecimal fields
# become decimal numbers and fractional numeric values become decimals.
# The program is single-quoted for the shell, so it must not contain apostrophes.
awkscript='
BEGIN { FS = ";" }
{
    # Field text is always an argument of a "%s" format and never the format
    # itself, so a "%" inside a field cannot corrupt the output.
    printf "%s;", hex($1)     # code
    printf "%s;", $2          # name
    printf "%s;", $3          # general category
    printf "%s;", $4          # canonical combining class
    printf "%s;", $5          # bidi class
    printf "%s;", $6          # decomposition
    printf "%s;", num($7)     # decimal digit value (written in decimal)
    printf "%s;", num($8)     # digit value (written in decimal)
    printf "%s;", num($9)     # numeric value
    printf "%s;", $10         # bidi mirrored
    printf "%s;", $11         # unicode 1 name
    printf "%s;", $12         # ISO comment
    printf "%s;", hex($13)    # simple uppercase
    printf "%s;", hex($14)    # simple lowercase
    printf "%s\n", hex($15)   # simple title
}
# Hexadecimal string to number; an empty field stays empty. The digits are
# accumulated by hand because a "0x" string only converts to a number in some
# awk implementations (gawk needs --non-decimal-data for it).
function hex(str,    i, n, digit) {
    if (str == "")
        return str
    n = 0
    for (i = 1; i <= length(str); ++i) {
        digit = index("0123456789ABCDEF", toupper(substr(str, i, 1)))
        if (digit == 0)
            break
        n = n * 16 + digit - 1
    }
    return n
}
# Decimal string to number; an empty field stays empty. Fractions such as
# "1/4" are divided out instead of being cut off at the slash.
function num(str,    parts) {
    if (str == "")
        return str
    if (split(str, parts, "/") == 2 && parts[2] != 0)
        return sprintf("%.15g", parts[1] / parts[2])
    return str + 0
}
'
# Run the awk program held in $awkscript over UnicodeData.txt; the result is
# written to a temporary file that the sqlite3 session below imports.
awk "$awkscript" UnicodeData.txt > UnicodeData.tmp
# Remove any existing database file so the import starts from scratch.
\rm -f Unicode.db # silence File Not Found warnings
# Everything up to the EOS marker is fed to the sqlite3 shell: it imports the
# temporary file into a temporary table, derives the "unicode ..." tables from
# it and finally runs an example query.
# NOTE(review): the here-document delimiter is unquoted, so the shell expands
# "$", backticks and backslashes in the text below; it currently contains none.
sqlite3 Unicode.db << EOS
CREATE TEMPORARY TABLE unicode_data (
Value INTEGER,
Name TEXT,
General_Category TEXT,
Canonical_Combining_Class INTEGER,
Bidi_Class TEXT,
Decomposition_Mapping TEXT,
Decimal_Digit_Value INTEGER,
Digit_Value INTEGER,
Numeric_Value FLOAT,
Bidi_Mirrored TEXT,
Unicode_1_Name TEXT,
ISO_Comment TEXT,
Simple_Uppercase_Mapping INTEGER,
Simple_Lowercase_Mapping INTEGER,
Simple_Titlecase_Mapping INTEGER
);
.separator ;
.import UnicodeData.tmp "unicode_data"
PRAGMA page_size=2048;
BEGIN TRANSACTION;
-- To get HEX values for code:
-- SELECT printf('%05X', code) AS hex FROM …
CREATE TABLE "unicode name" (code INTEGER PRIMARY KEY, name TEXT);
INSERT INTO "unicode name" SELECT Value, lower(Name) FROM unicode_data
WHERE Name NOT LIKE '<%>';
CREATE TABLE "unicode category" (code INTEGER PRIMARY KEY, general TEXT);
INSERT INTO "unicode category" SELECT Value, General_Category FROM unicode_data;
CREATE TABLE "unicode combining" (code INTEGER, class INTEGER);
INSERT INTO "unicode combining" SELECT Value, Canonical_Combining_Class FROM unicode_data
WHERE Canonical_Combining_Class <> 0;
CREATE TABLE "unicode bidi" (code INTEGER, class TEXT);
INSERT INTO "unicode bidi" SELECT Value, Bidi_Class FROM unicode_data;
CREATE TABLE "unicode mirrored" (code INTEGER);
INSERT INTO "unicode mirrored" SELECT Value FROM unicode_data
WHERE Bidi_Mirrored <> 'N';
CREATE TABLE "unicode decomposition" (code INTEGER PRIMARY KEY, mapping TEXT);
INSERT INTO "unicode decomposition" SELECT Value, Decomposition_Mapping FROM unicode_data
WHERE Decomposition_Mapping <> '' AND Decomposition_Mapping NOT LIKE '<%';
CREATE TABLE "unicode decimal" (code INTEGER, value INTEGER);
INSERT INTO "unicode decimal" SELECT Value, Decimal_Digit_Value FROM unicode_data
WHERE Decimal_Digit_Value <> '';
CREATE TABLE "unicode digit" (code INTEGER, value INTEGER);
INSERT INTO "unicode digit" SELECT Value, Digit_Value FROM unicode_data
WHERE Digit_Value <> '';
CREATE TABLE "unicode uppercase" (code INTEGER PRIMARY KEY, mapping INTEGER);
INSERT INTO "unicode uppercase" SELECT Value, Simple_Uppercase_Mapping FROM unicode_data
WHERE Simple_Uppercase_Mapping <> '';
CREATE TABLE "unicode lowercase" (code INTEGER PRIMARY KEY, mapping INTEGER);
INSERT INTO "unicode lowercase" SELECT Value, Simple_Lowercase_Mapping FROM unicode_data
WHERE Simple_Lowercase_Mapping <> '';
CREATE TABLE "unicode titlecase" (code INTEGER PRIMARY KEY, mapping INTEGER);
INSERT INTO "unicode titlecase" SELECT Value, Simple_Titlecase_Mapping FROM unicode_data
WHERE Simple_Titlecase_Mapping <> '';
COMMIT TRANSACTION;
-- Example query
SELECT printf('0x%05X', start) as first, printf('0x%05X', end) as last FROM (
WITH sequence AS (
SELECT code FROM "unicode combining"
)
-- http://www.xaprb.com/blog/2006/03/22/find-contiguous-ranges-with-sql/
SELECT l.code AS start, (
SELECT min(a.code) AS id
FROM sequence AS a LEFT OUTER JOIN sequence AS b ON a.code = b.code - 1
WHERE b.code IS null AND a.code >= l.code
) AS end
FROM sequence AS l LEFT OUTER JOIN sequence AS r ON r.code = l.code - 1
WHERE r.code IS null
);
EOS
# The intermediate file is no longer needed once the import has run.
rm UnicodeData.tmp
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment