Skip to content

Instantly share code, notes, and snippets.

@TinoDidriksen
Last active March 8, 2023 09:21
Show Gist options
  • Save TinoDidriksen/8483a1ae3c402c63beaeaf53ff73161b to your computer and use it in GitHub Desktop.
Save TinoDidriksen/8483a1ae3c402c63beaeaf53ff73161b to your computer and use it in GitHub Desktop.
#!/usr/bin/env php
<?php
/*
USAGE:
./cg-combsets.php kalcg.dansyn kalcg.dan kalcg.dep kalcg.extra kalcg.pre > sets.cg3
Parses all given grammars and spits out all sets used across all grammars, and marks conflicts for manual inspection.
Used to create a combined deduplicated sets.cg3 that multiple grammars can Include.
It uses CG-3's --dump-ast to get UTF-16 offsets and tags, but fetches the actual lines from the grammars.
*/
// Accumulator for every set definition seen across all grammars, keyed by set name.
// Each entry is an array with:
//   'l' => line number of the first definition seen, plus the running $maxl offset (used for output ordering)
//   't' => tag => count of definitions containing that tag (left empty when only SET definitions were seen)
//   'd' => definition source text => its tags (tag => 1) for list definitions, or true for SET definitions
$gs = [];
/**
 * Comparator for uasort(): orders two set records by their 'l' (line) value.
 *
 * @param array $a record with an 'l' entry
 * @param array $b record with an 'l' entry
 * @return int negative when $a comes first, positive when $b comes first, 0 when equal
 */
function sort_line($a, $b) {
    $left = $a['l'];
    $right = $b['l'];
    return $left - $right;
}
// Special tag lists that have no SetName node of their own.
// Key: the name they are stored under in $gs.
// Value: [XPath expression used to find them in the AST, keyword printed in the output].
$taglists = [
    '_S_DELIMITERS_'      => ['Delimiters', 'DELIMITERS'],
    '_S_SOFT_DELIMITERS_' => ['SoftDelimiters', 'SOFT-DELIMITERS'],
    '_S_TEXT_DELIMITERS_' => ['TextDelimiters', 'TEXT-DELIMITERS'],
    '_S_LIST_TAGS_'       => ['ListTags', 'LIST-TAGS'],
    '_S_STRICT_TAGS_'     => ['StrictTags', 'STRICT-TAGS'],
];

// Drop the script name so that $argv holds only the grammar file paths.
$argv = array_slice($argv, 1);

// Running line offset added to each grammar's line numbers.
$maxl = 0;
// Returns the grammar source text spanned by an AST node. The node's 'b'/'e'
// attributes are used as UTF-16 offsets, so they are doubled to index into the
// UTF-16LE byte string; the slice is converted back to UTF-8.
$node_text = static function ($u16, $node) {
    $b = intval($node->getAttribute('b'));
    $e = intval($node->getAttribute('e'));
    return mb_convert_encoding(substr($u16, $b*2, ($e-$b)*2), 'UTF-8', 'UTF-16LE');
};

// Collects the tags below a list node as a set (tag => 1). A 'Tag' child
// contributes its 't' attribute; any other child is treated as a composite tag
// and is rendered as "(t1 t2 ...)" from its own children.
$list_tags = static function ($list) {
    $tags = [];
    if ($list === null) {
        return $tags;
    }
    foreach ($list->childNodes as $tn) {
        if ($tn->nodeName === 'Tag') {
            $tags[$tn->getAttribute('t')] = 1;
            continue;
        }
        $ts = [];
        foreach ($tn->childNodes as $ct) {
            $ts[] = $ct->getAttribute('t');
        }
        $tags['('.implode(' ', $ts).')'] = 1;
    }
    return $tags;
};

// Merges one list definition into $gs: the first sighting creates the record,
// later sightings bump the per-tag counters and remember the definition text.
$add_list = static function (array &$gs, $n, $line, $s, array $tags) {
    if (!array_key_exists($n, $gs)) {
        $gs[$n] = ['l' => $line, 't' => $tags, 'd' => [$s => $tags]];
        return;
    }
    foreach ($tags as $t => $_) {
        $gs[$n]['t'][$t] = ($gs[$n]['t'][$t] ?? 0) + 1;
    }
    $gs[$n]['d'][$s] = $tags;
};

// Parse every grammar given on the command line and fold its used sets into $gs.
foreach ($argv as $f) {
    fprintf(STDERR, "Parsing %s\n", $f);

    $g = file_get_contents($f);
    if ($g === false) {
        fprintf(STDERR, "Skipping %s: could not read file\n", $f);
        continue;
    }
    $u16 = mb_convert_encoding($g, 'UTF-16LE', 'UTF-8');

    // Keep only the text between the two markers of cg3's unused-set report.
    // A missing start marker is treated as "no unused sets" instead of slicing
    // at a bogus offset.
    $us = (string)shell_exec('cg3 --grammar-only --show-unused-sets -g '.escapeshellarg($f).' 2>/dev/null');
    $b = strpos($us, 'Unused sets:');
    if ($b === false) {
        $us = "\n";
    }
    else {
        $b += strlen('Unused sets:');
        $e = strpos($us, 'End of unused sets.', $b);
        $us = trim($e === false ? substr($us, $b) : substr($us, $b, $e-$b))."\n";
    }

    $xml = (string)shell_exec('cg3 --grammar-only --dump-ast -g '.escapeshellarg($f).' 2>/dev/null');
    $dom = new DOMDocument();
    $dom->preserveWhiteSpace = false;
    if ($xml === '' || !$dom->loadXML($xml)) {
        fprintf(STDERR, "Skipping %s: cg3 --dump-ast produced no usable AST\n", $f);
        continue;
    }
    $xpath = new DOMXPath($dom);

    // Highest line number seen in this grammar; it becomes the offset for the
    // next grammar so that records from different files never interleave.
    $filemax = 0;

    // Special lists (delimiters etc.) are all pinned to line 1 of their grammar.
    foreach ($taglists as $n => $xp) {
        foreach ($xpath->query($xp[0]) as $set) {
            $l = 1;
            $filemax = max($filemax, $l);
            $add_list($gs, $n, $l + $maxl, $node_text($u16, $set), $list_tags($set->firstChild));
        }
    }

    // Named LIST definitions, except those cg3 reported as unused.
    foreach ($xpath->query('List/SetName') as $set) {
        $p = $set->parentNode;
        $l = intval($p->getAttribute('l'));
        $filemax = max($filemax, $l);
        $n = $set->getAttribute('t');
        if (strpos($us, "Line $l set $n\n") !== false) {
            continue;
        }
        $add_list($gs, $n, $l + $maxl, $node_text($u16, $p), $list_tags($set->nextSibling));
    }

    // Named SET definitions, except unused ones; only their source text is kept.
    foreach ($xpath->query('Set/SetName') as $set) {
        $p = $set->parentNode;
        $l = intval($p->getAttribute('l'));
        $filemax = max($filemax, $l);
        $n = $set->getAttribute('t');
        if (strpos($us, "Line $l set $n\n") !== false) {
            continue;
        }
        $s = $node_text($u16, $p);
        if (!array_key_exists($n, $gs)) {
            $gs[$n] = ['l' => $l + $maxl, 't' => [], 'd' => [$s => true]];
        }
        else {
            $gs[$n]['d'][$s] = true;
        }
    }

    $maxl += $filemax;
}
// Print every collected set ordered by its 'l' value. Names with more than one
// distinct definition are flagged with a "# CONFLICT" comment for manual review.
uasort($gs, 'sort_line');
$prev_line = 0;
foreach ($gs as $name => $entry) {
    // Separate entries with a blank line unless this one directly follows the previous line.
    if ($entry['l'] - 1 != $prev_line) {
        echo "\n";
    }
    $prev_line = $entry['l'];

    // Special tag lists are printed under their keyword and without the LIST prefix.
    $is_special = array_key_exists($name, $taglists);
    $label = $is_special ? $taglists[$name][1] : $name;
    $defs = $entry['d'];

    // A single definition is echoed verbatim.
    if (count($defs) <= 1) {
        echo array_key_first($defs)."\n";
        continue;
    }

    echo "# CONFLICT: {$label}\n";
    if (!empty($entry['t'])) {
        // For each list definition, show the tags whose count is not above 1.
        foreach ($defs as $tags) {
            if ($tags === true) {
                continue;
            }
            $rare = [];
            foreach ($tags as $tag => $_) {
                if ($entry['t'][$tag] <= 1) {
                    $rare[] = $tag;
                }
            }
            if (!empty($rare)) {
                echo "# ".implode(' ', $rare)."\n";
            }
        }
        // Then print the union of all tags as one merged definition.
        echo ($is_special ? '' : 'LIST ')."{$label} = ".implode(' ', array_keys($entry['t']))." ;\n";
    }
    // SET definitions cannot be merged; print each distinct source text as-is.
    foreach ($defs as $text => $tags) {
        if ($tags === true) {
            echo "{$text}\n";
        }
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment