Last active
May 31, 2024 23:27
-
-
Save bwoods/e5bd964fc360ba5e920e to your computer and use it in GitHub Desktop.
Parsing UnicodeData.txt
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#if 0 /// this file can be executed directly (as a shell script)
# The shell sees the preprocessor lines as comments; the C++ compiler skips
# everything inside this "#if 0" group. Quoting keeps paths and arguments that
# contain spaces intact.
c++ --std=c++17 -Os "$0" -o "${0%.*}" && "${0%.*}" "$@"
exit $?
#endif
#include <fstream>
#include <regex>
#include <string>
#include <vector>
/// Parses `string` as a base-16 integer, e.g. "0041" → 65.
///
/// Parsing is delegated to std::stoi, so malformed or out-of-range input is
/// reported through the exceptions std::stoi throws.
auto unhex(std::string const& string) -> int {
    constexpr int base = 16;
    return std::stoi(string, nullptr, base);
}
/// Splits `string` at every match of `regex` and returns the pieces found
/// between the matches; the separators themselves are not part of the result.
auto split(std::string const& string, std::regex const& regex) -> std::vector<std::string> {
    // Sub-match index -1 makes the token iterator yield the text *between*
    // matches; match_not_null stops an empty match from acting as a separator.
    std::sregex_token_iterator const first(string.begin(), string.end(), regex, -1,
                                           std::regex_constants::match_not_null);
    std::sregex_token_iterator const last; // default-constructed == end of sequence
    return std::vector<std::string>(first, last);
}
/// Renders a property value as an SQL literal.
///
/// A value consisting solely of 4–5 upper-case hex digits is treated as a code
/// point and emitted as a decimal integer; anything else becomes a
/// single-quoted SQL string with embedded single quotes doubled so the
/// generated statement stays well-formed.
auto quoted(std::string const& string) -> std::string {
    // Compiled once: std::regex construction is far more expensive than matching.
    static std::regex const codepoint("^[0-9A-F]{4,5}$");

    if (std::regex_match(string, codepoint))
        return std::to_string(std::stoi(string, nullptr, 16)); // same parse as unhex()

    std::string literal;
    literal.reserve(string.size() + 2);
    literal += '\'';
    for (char const c : string) {
        if (c == '\'')
            literal += '\''; // SQL escapes ' inside a string literal as ''
        literal += c;
    }
    literal += '\'';
    return literal;
}
/// Derives a table name from a file path: drops the directory part, a trailing
/// ".txt" extension and a trailing "Property" suffix, in that order.
/// Example: "UCD/auxiliary/GraphemeBreakProperty.txt" → "GraphemeBreak".
auto basename(std::string string) -> std::string {
    string = std::regex_replace(string, std::regex(".*/"), "");         // remove path
    string = std::regex_replace(string, std::regex("\\.txt$"), "");     // remove extension
    string = std::regex_replace(string, std::regex("Property$"), "");   // remove Property
    return string;
}
#include <stdio.h> | |
int main(int argc, char * const * const argv) { | |
std::ios::sync_with_stdio(false); | |
std::ifstream file; | |
printf("BEGIN;\n"); | |
for (auto i = 1; i < argc && (file = std::ifstream(argv[i])); ++i) { | |
auto filename = basename(argv[i]); | |
printf("CREATE TABLE IF NOT EXISTS '%s' (codepoint INTEGER PRIMARY KEY NOT NULL, property);\n", filename.c_str()); | |
do { | |
std::string line; | |
std::getline(file, line); | |
if (line = std::regex_replace(line, std::regex("[ ]*#.*$"), ""); line.empty()) // remove comments THEN check length | |
continue; | |
auto columns = split(line, std::regex("[ ]*;[ ]*")); | |
if (std::regex_match(columns.front(), std::regex("^[0-9A-F]{4,5}$"))) { // ⇐ single codepoint | |
printf("INSERT OR REPLACE INTO '%s' VALUES (%i, %s);\n", filename.c_str(), unhex(columns.front()), quoted(columns.back()).c_str()); | |
} else if (std::regex_match(columns.front(), std::regex("^[0-9A-F]{4,5}\\.\\.[0-9A-F]{4,5}$"))) { // ⇐ a range of codepoints | |
auto codepoints = split(columns.front(), std::regex("\\.\\.")); | |
for (auto i = unhex(codepoints[0]), j = unhex(codepoints[1]); i <= j; ++i) | |
printf("INSERT OR REPLACE INTO '%s' VALUES (%i, %s);\n", filename.c_str(), i, quoted(columns.back()).c_str()); | |
} | |
} while (file); | |
} | |
printf("COMMIT;\n"); | |
} | |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/awk -f
# ./ucd-parse UCD/extracted/DerivedGeneralCategory.txt…
#
# Turns "codepoint ; value" / "first..last ; value" lines into SQL INSERTs,
# one table per input file, all inside a single transaction.

BEGIN { FS = ";"; print "BEGIN;" }
END   { print "COMMIT;" }

# Runs for every line, before the pattern rules below.
{
	if (FNR == 1)
		print "CREATE TABLE IF NOT EXISTS '" basename(FILENAME) "' (codepoint INTEGER PRIMARY KEY, property);"

	gsub(/#.*/, "")     # remove comments
	gsub(/ /, "", $2)   # removes spaces (assigning $2 also rebuilds $0 using OFS)
}

# A range of code points. Four to six hex digits are accepted so that
# plane 16 (100000..10FFFF) is not dropped.
/^[0-9A-F]{4,6}\.\.[0-9A-F]{4,6}/ {
	split($1, values, /\.\./)

	# Loop invariants, computed once per line rather than once per code point.
	table = basename(FILENAME)
	value = quoted($2)
	last  = unhex(values[2])

	for (i = unhex(values[1]); i <= last; ++i)
		print "INSERT OR REPLACE INTO '" table "' VALUES (" i ", " value ");"
}

# A single code point.
/^[0-9A-F]{4,6}[ ]+/ {
	print "INSERT OR REPLACE INTO '" basename(FILENAME) "' VALUES (" unhex($1) ", " quoted($2) ");"
}
# Converts a string of hex digits to a number, e.g. "0041" → 65.
# Conversion stops at the first character that is not a hex digit, so trailing
# padding (as left on $1 by the FS=";" split) is ignored.
# Done by hand because the ("0x" str) + 0 idiom only works in awk
# implementations that accept hexadecimal in string-to-number conversion.
function unhex(str,    i, digit, value) {
	value = 0
	for (i = 1; i <= length(str); ++i) {
		digit = index("0123456789ABCDEF", toupper(substr(str, i, 1)))
		if (digit == 0)
			break
		value = value * 16 + (digit - 1)
	}
	return value
}
# Renders a property value as an SQL literal: a value made up solely of 4-5
# upper-case hex digits is emitted as a number (via unhex); anything else
# becomes a single-quoted string with embedded single quotes doubled.
# str is a by-value scalar parameter, so the gsub below does not affect the caller.
function quoted(str) {
	if (str ~ /^[0-9A-F]{4,5}$/)
		return unhex(str)

	gsub(/'/, "''", str)
	return "'" str "'"
}
# Derives a table name from a file path, e.g.
# "UCD/auxiliary/GraphemeBreakProperty.txt" → "GraphemeBreak".
function basename(file) {
	sub(/.*\//, "", file)       # remove path
	sub(/\.txt$/, "", file)     # remove extension
	sub(/Property$/, "", file)  # remove Property
	return file
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <algorithm>
#include <array>
#include <cmath>
#include <cstddef>
#include <deque>
#include <fstream>
#include <iomanip>
#include <iostream>
#include <iterator>
#include <map>
#include <sstream>
#include <string>
#include <vector>

/// Reads UnicodeData.txt from the current directory, groups the values of one
/// column into blocks of `width` consecutive records, de-duplicates identical
/// blocks, and prints the resulting two tables (stage1: block index per group
/// of records, stage2: the distinct blocks) together with size statistics.
///
/// NOTE(review): blocks are formed from consecutive *records*, not from
/// consecutive code points — confirm this is intended before indexing the
/// generated tables by code point.
int main()
{
    std::ios::sync_with_stdio(false);

    constexpr std::size_t width = 16;   // records per stage-2 block
    constexpr std::size_t column = 2;   // 0-based token whose value is tabulated
    constexpr std::size_t fields = 14;  // ';'-terminated tokens read per record

    std::ifstream input("UnicodeData.txt");
    if (!input) {
        std::cerr << "cannot open UnicodeData.txt\n";
        return 1;
    }

    // Each record is read as `fields` tokens terminated by ';'. Whatever
    // follows the last ';' of a line (plus the newline) therefore lands at the
    // front of token 0 of the NEXT record; tokens 1.. stay aligned.
    std::deque<std::array<std::string, fields>> lines;
    for (;;) {
        std::array<std::string, fields> properties;
        for (auto& property : properties)
            std::getline(input, property, ';');
        if (!input)
            break; // incomplete read at end of file: not a real record
        lines.push_back(std::move(properties));
    }

    std::vector<std::size_t> stage1;                       // block index per group
    std::vector<std::array<std::string, width>> stage2;    // distinct blocks, in first-seen order
    {
        std::map<std::array<std::string, width>, std::size_t> groups;

        while (not lines.empty()) {
            // A fresh block each round: the final, partial block is padded with
            // empty strings instead of leftovers from the previous block.
            std::array<std::string, width> group = { };
            for (auto& element : group) {
                if (lines.empty())
                    break;
                element = std::move(lines.front()[column]);
                lines.pop_front();
            }

            // A new block receives the next free index; try_emplace leaves an
            // existing entry untouched.
            auto [itr, unseen] = groups.try_emplace(std::move(group), stage2.size());
            if (unseen)
                stage2.push_back(itr->first);

            stage1.push_back(itr->second);
        }
    }

    // Indices run 0 .. stage2.size()-1, so up to 256 blocks fit in one byte.
    bool const narrow = stage2.size() <= 256;

    std::stringstream output;
    output << "constexpr ";
    output << (narrow ? "uint8_t" : "uint16_t") << " stage1[] = {";
    for (std::size_t i = 0; i < stage1.size(); ++i) {
        if (i % 22 == 0) output << "\n ";
        output << std::setw(3) << stage1[i] << ", ";
    }
    output << "\n};\n\n";

    // NOTE(review): the entries are emitted as bare tokens (the column's text),
    // so the consumer presumably defines matching constants — confirm. Padding
    // in a partial last block is emitted as empty entries.
    output << "constexpr uint8_t stage2[] = {";
    for (auto const& block : stage2) {
        output << "\n ";
        std::copy(block.begin(), block.end(), std::ostream_iterator<std::string>(output, ", "));
    }
    output << "\n};\n\n";

    // str() returns a fresh copy on every call; take it once so begin()/end()
    // refer to the same string.
    std::string const text = output.str();

    std::cout << "stage 1: " << stage1.size() << " entries\n";
    std::cout << "stage 2: " << stage2.size() << " entries\n\n";
    std::cout << "memory: " << (stage1.size() * (narrow ? 1 : 2)) + (stage2.size() * width) << " bytes\n";
    std::cout << "file: " << std::count(text.begin(), text.end(), '\n') << " lines\n\n\n";
    std::cout << text << std::endl;
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash

# awk program that rewrites UnicodeData.txt for import: the hexadecimal fields
# (code point and simple case mappings) become decimal integers and the numeric
# value becomes a plain number. All other fields pass through unchanged.
# The program lives inside a single-quoted shell string, so it must not
# contain a single-quote character.
awkscript='
BEGIN { FS = ";" }

{
	# Field data is always passed as an argument, never as the format string.
	printf "%s;", hex($1)    # code
	printf "%s;", $2         # name
	printf "%s;", $3         # general category
	printf "%s;", $4         # canonical combining class
	printf "%s;", $5         # bidi class
	printf "%s;", $6         # decomposition
	printf "%s;", num($7)    # decimal digit value (decimal, not hex)
	printf "%s;", num($8)    # digit value (decimal, not hex)
	printf "%s;", num($9)    # numeric value
	printf "%s;", $10        # bidi mirrored
	printf "%s;", $11        # unicode 1 name
	printf "%s;", $12        # ISO comment
	printf "%s;", hex($13)   # simple uppercase
	printf "%s;", hex($14)   # simple lowercase
	printf "%s\n", hex($15)  # simple title
}

# Hex string to number; an empty field stays empty. Converted digit by digit
# because ("0x" str) + 0 only works in awk implementations that accept
# hexadecimal in string-to-number conversion.
function hex(str,    i, digit, value) {
	if (str == "")
		return str

	value = 0
	for (i = 1; i <= length(str); ++i) {
		digit = index("0123456789ABCDEF", toupper(substr(str, i, 1)))
		if (digit == 0)
			break
		value = value * 16 + (digit - 1)
	}
	return value
}

# Numeric field to number; an empty field stays empty. A fraction written as
# "a/b" is evaluated, since str + 0 alone would keep only the numerator.
function num(str,    parts) {
	if (str == "")
		return str
	if (split(str, parts, "/") == 2 && parts[2] + 0 != 0)
		return sprintf("%.15g", parts[1] / parts[2])
	return str + 0
}
'
# Remove the intermediate file however the script ends.
trap 'rm -f UnicodeData.tmp' EXIT

# Stop here if the conversion fails rather than importing a partial file.
awk "$awkscript" UnicodeData.txt > UnicodeData.tmp || exit 1

\rm -f Unicode.db # silence File Not Found warnings

# The heredoc delimiter is quoted so the shell passes the SQL through verbatim.
sqlite3 Unicode.db << 'EOS'
CREATE TEMPORARY TABLE unicode_data (
	Value INTEGER,
	Name TEXT,
	General_Category TEXT,
	Canonical_Combining_Class INTEGER,
	Bidi_Class TEXT,
	Decomposition_Mapping TEXT,
	Decimal_Digit_Value INTEGER,
	Digit_Value INTEGER,
	Numeric_Value FLOAT,
	Bidi_Mirrored TEXT,
	Unicode_1_Name TEXT,
	ISO_Comment TEXT,
	Simple_Uppercase_Mapping INTEGER,
	Simple_Lowercase_Mapping INTEGER,
	Simple_Titlecase_Mapping INTEGER
);
.separator ;
.import UnicodeData.tmp "unicode_data"
PRAGMA page_size=2048;
BEGIN TRANSACTION;
-- To get HEX values for code:
-- SELECT printf('%05X', code) AS hex FROM …
CREATE TABLE "unicode name" (code INTEGER PRIMARY KEY, name TEXT);
INSERT INTO "unicode name" SELECT Value, lower(Name) FROM unicode_data
	WHERE Name NOT LIKE '<%>';
CREATE TABLE "unicode category" (code INTEGER PRIMARY KEY, general TEXT);
INSERT INTO "unicode category" SELECT Value, General_Category FROM unicode_data;
CREATE TABLE "unicode combining" (code INTEGER, class INTEGER);
INSERT INTO "unicode combining" SELECT Value, Canonical_Combining_Class FROM unicode_data
	WHERE Canonical_Combining_Class <> 0;
CREATE TABLE "unicode bidi" (code INTEGER, class TEXT);
INSERT INTO "unicode bidi" SELECT Value, Bidi_Class FROM unicode_data;
CREATE TABLE "unicode mirrored" (code INTEGER);
INSERT INTO "unicode mirrored" SELECT Value FROM unicode_data
	WHERE Bidi_Mirrored <> 'N';
CREATE TABLE "unicode decomposition" (code INTEGER PRIMARY KEY, mapping TEXT);
INSERT INTO "unicode decomposition" SELECT Value, Decomposition_Mapping FROM unicode_data
	WHERE Decomposition_Mapping <> '' AND Decomposition_Mapping NOT LIKE '<%';
CREATE TABLE "unicode decimal" (code INTEGER, value INTEGER);
INSERT INTO "unicode decimal" SELECT Value, Decimal_Digit_Value FROM unicode_data
	WHERE Decimal_Digit_Value <> '';
CREATE TABLE "unicode digit" (code INTEGER, value INTEGER);
INSERT INTO "unicode digit" SELECT Value, Digit_Value FROM unicode_data
	WHERE Digit_Value <> '';
CREATE TABLE "unicode uppercase" (code INTEGER PRIMARY KEY, mapping INTEGER);
INSERT INTO "unicode uppercase" SELECT Value, Simple_Uppercase_Mapping FROM unicode_data
	WHERE Simple_Uppercase_Mapping <> '';
CREATE TABLE "unicode lowercase" (code INTEGER PRIMARY KEY, mapping INTEGER);
INSERT INTO "unicode lowercase" SELECT Value, Simple_Lowercase_Mapping FROM unicode_data
	WHERE Simple_Lowercase_Mapping <> '';
CREATE TABLE "unicode titlecase" (code INTEGER PRIMARY KEY, mapping INTEGER);
INSERT INTO "unicode titlecase" SELECT Value, Simple_Titlecase_Mapping FROM unicode_data
	WHERE Simple_Titlecase_Mapping <> '';
COMMIT TRANSACTION;
-- Example query
SELECT printf('0x%05X', start) as first, printf('0x%05X', end) as last FROM (
	WITH sequence AS (
		SELECT code FROM "unicode combining"
	)
	-- http://www.xaprb.com/blog/2006/03/22/find-contiguous-ranges-with-sql/
	SELECT l.code AS start, (
		SELECT min(a.code) AS id
		FROM sequence AS a LEFT OUTER JOIN sequence AS b ON a.code = b.code - 1
		WHERE b.code IS null AND a.code >= l.code
	) AS end
	FROM sequence AS l LEFT OUTER JOIN sequence AS r ON r.code = l.code - 1
	WHERE r.code IS null
);
EOS
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment