Last active
March 8, 2023 09:21
-
-
Save TinoDidriksen/8483a1ae3c402c63beaeaf53ff73161b to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env php | |
<?php | |
/*
USAGE:
./cg-combsets.php kalcg.dansyn kalcg.dan kalcg.dep kalcg.extra kalcg.pre > sets.cg3
Parses all given grammars and spits out all sets used across all grammars, and marks conflicts for manual inspection.
Used to create a combined deduplicated sets.cg3 that multiple grammars can Include.
It uses CG-3's --dump-ast to get UTF-16 offsets and tags, but fetches the actual lines from the grammars.
*/
// Registry of every set seen so far, keyed by set name. Each entry holds:
//   'l' => sort key (line number in its grammar plus a per-grammar offset),
//   't' => map of tag => occurrence count,
//   'd' => map of definition source text => its tags (or true for SET definitions).
$gs = [];

/**
 * Comparison callback that orders registry entries by their 'l' sort key.
 *
 * @param array $a Entry with an 'l' element.
 * @param array $b Entry with an 'l' element.
 * @return int Negative, zero or positive when $a sorts before, with, or after $b.
 */
function sort_line($a, $b) {
	$left = $a['l'];
	$right = $b['l'];
	return $left - $right;
}
// Special tag lists that have no user-chosen set name.
// Key: the name they are stored under in the set registry.
// Value: [AST element name used in the XPath query, keyword written to the output].
$taglists = [
	'_S_DELIMITERS_' => ['Delimiters', 'DELIMITERS'],
	'_S_SOFT_DELIMITERS_' => ['SoftDelimiters', 'SOFT-DELIMITERS'],
	'_S_TEXT_DELIMITERS_' => ['TextDelimiters', 'TEXT-DELIMITERS'],
	'_S_LIST_TAGS_' => ['ListTags', 'LIST-TAGS'],
	'_S_STRICT_TAGS_' => ['StrictTags', 'STRICT-TAGS'],
];

// Drop the script name so that $argv holds only the grammar paths.
array_shift($argv);

// Without any grammar there is nothing to combine; say so instead of
// silently producing an empty result.
if (empty($argv)) {
	fprintf(STDERR, "Usage: cg-combsets.php grammar [grammar ...] > sets.cg3\n");
	exit(1);
}

// Offset added to the line numbers of each grammar so that sets from later
// grammars sort after those of earlier ones.
$maxl = 0;
/**
 * Extracts the body of the unused-sets report from cg3's output.
 *
 * @param string|null $out Raw output of `cg3 --show-unused-sets` (null if the command failed).
 * @return string The trimmed text between the 'Unused sets:' and 'End of unused sets.'
 *                markers followed by a newline, or just a newline when the report is absent.
 */
function cg_unused_sets($out) {
	$out = (string)$out;
	$marker = 'Unused sets:';
	$b = strpos($out, $marker);
	if ($b === false) {
		// No report at all: treat every set as used rather than slicing at a bogus offset.
		return "\n";
	}
	$b += strlen($marker);
	$e = strpos($out, 'End of unused sets.', $b);
	if ($e === false) {
		$e = strlen($out);
	}
	return trim(substr($out, $b, $e - $b))."\n";
}

/**
 * Returns the UTF-8 source text an AST node covers.
 *
 * @param string $u16 The grammar converted to UTF-16LE.
 * @param DOMElement $node Node whose 'b' and 'e' attributes are begin/end offsets in UTF-16 code units.
 * @return string The covered slice, converted back to UTF-8.
 */
function cg_node_source($u16, $node) {
	$b = intval($node->getAttribute('b'));
	$e = intval($node->getAttribute('e'));
	// Offsets count 16-bit units, so double them to get byte offsets.
	return mb_convert_encoding(substr($u16, $b * 2, ($e - $b) * 2), 'UTF-8', 'UTF-16LE');
}

/**
 * Collects the tags below a container node as a map of tag => 1.
 *
 * 'Tag' children contribute their 't' attribute; any other child is treated as a
 * composite and contributes '(t1 t2 ...)' built from its own children.
 *
 * @param DOMNode|null $container Node whose children are the tags.
 * @return array Map of tag text => 1 (empty if there is no container).
 */
function cg_collect_tags($container) {
	$tags = [];
	if ($container === null) {
		return $tags;
	}
	foreach ($container->childNodes as $tn) {
		if ($tn->nodeName === 'Tag') {
			$tags[$tn->getAttribute('t')] = 1;
			continue;
		}
		$ts = [];
		foreach ($tn->childNodes as $ct) {
			$ts[] = $ct->getAttribute('t');
		}
		$tags['('.implode(' ', $ts).')'] = 1;
	}
	return $tags;
}

/**
 * Adds one definition of a set to the registry.
 *
 * @param array $gs Registry, modified in place.
 * @param string $n Set name.
 * @param int $l Sort key to use if this is the first time the name is seen.
 * @param string $s Source text of the definition.
 * @param array|true $tags Tag map of a list definition, or true for a SET definition.
 * @return void
 */
function cg_record_set(array &$gs, $n, $l, $s, $tags) {
	if (!array_key_exists($n, $gs)) {
		$gs[$n] = ['l' => $l, 't' => ($tags === true ? [] : $tags), 'd' => [$s => $tags]];
		return;
	}
	// Count tags once per distinct definition text. Counting an identical
	// definition again for every grammar that repeats it would make its tags
	// look shared with conflicting definitions and hide them from the diff.
	if ($tags !== true && !array_key_exists($s, $gs[$n]['d'])) {
		foreach ($tags as $t => $_) {
			$gs[$n]['t'][$t] = ($gs[$n]['t'][$t] ?? 0) + 1;
		}
	}
	$gs[$n]['d'][$s] = $tags;
}

// Parse every grammar: ask cg3 which sets are unused, dump the AST, and record
// each used set's name, source text and tags in $gs.
foreach ($argv as $f) {
	fprintf(STDERR, "Parsing %s\n", $f);

	$us = cg_unused_sets(shell_exec('cg3 --grammar-only --show-unused-sets -g '.escapeshellarg($f).' 2>/dev/null'));

	$g = file_get_contents($f);
	if ($g === false) {
		fprintf(STDERR, "Warning: could not read %s, skipping\n", $f);
		continue;
	}
	$u16 = mb_convert_encoding($g, 'UTF-16LE', 'UTF-8');

	$xml = shell_exec('cg3 --grammar-only --dump-ast -g '.escapeshellarg($f).' 2>/dev/null');
	$dom = new DOMDocument();
	$dom->preserveWhiteSpace = false;
	if (!is_string($xml) || $xml === '' || !$dom->loadXML($xml)) {
		fprintf(STDERR, "Warning: no usable AST from cg3 for %s, skipping\n", $f);
		continue;
	}
	$xpath = new DOMXPath($dom);

	// Highest line number seen in this grammar; becomes the offset step for the next one.
	$filemax = 0;

	// Special tag lists (delimiters etc.) are all filed under line 1.
	foreach ($taglists as $n => $xp) {
		foreach ($xpath->query($xp[0]) as $set) {
			$filemax = max($filemax, 1);
			cg_record_set($gs, $n, 1 + $maxl, cg_node_source($u16, $set), cg_collect_tags($set->firstChild));
		}
	}

	// LIST definitions: the tags live in the node following the SetName.
	foreach ($xpath->query('List/SetName') as $set) {
		$p = $set->parentNode;
		$l = intval($p->getAttribute('l'));
		$n = $set->getAttribute('t');
		$filemax = max($filemax, $l);
		if (strpos($us, "Line $l set $n\n") !== false) {
			continue;
		}
		cg_record_set($gs, $n, $l + $maxl, cg_node_source($u16, $p), cg_collect_tags($set->nextSibling));
	}

	// SET definitions: only the source text is kept, marked with true instead of tags.
	foreach ($xpath->query('Set/SetName') as $set) {
		$p = $set->parentNode;
		$l = intval($p->getAttribute('l'));
		$n = $set->getAttribute('t');
		$filemax = max($filemax, $l);
		if (strpos($us, "Line $l set $n\n") !== false) {
			continue;
		}
		cg_record_set($gs, $n, $l + $maxl, cg_node_source($u16, $p), true);
	}

	// Advance by the highest line of this grammar (not merely the last one
	// visited) so the next grammar's sort keys cannot overlap this one's.
	$maxl += $filemax;
}
// Emit every recorded set in sort-key order. A set with a single definition is
// printed verbatim; a set with several distinct definitions is flagged as a
// conflict together with the material needed to resolve it by hand.
uasort($gs, 'sort_line');
$prev_line = 0;
foreach ($gs as $name => $entry) {
	// Separate sets that did not sit on directly consecutive lines.
	if ($entry['l'] - 1 != $prev_line) {
		echo "\n";
	}
	$prev_line = $entry['l'];

	// Special tag lists are written with their keyword and without a LIST prefix.
	$is_taglist = array_key_exists($name, $taglists);
	$label = $is_taglist ? $taglists[$name][1] : $name;

	$defs = $entry['d'];
	if (count($defs) <= 1) {
		echo array_key_first($defs)."\n";
		continue;
	}

	echo "# CONFLICT: {$label}\n";

	$counts = $entry['t'];
	if (!empty($counts)) {
		// For each tag-based definition, list the tags whose count is not above 1.
		foreach ($defs as $tags) {
			if ($tags === true) {
				continue;
			}
			$rare = array_filter(array_keys($tags), function ($t) use ($counts) {
				return !($counts[$t] > 1);
			});
			if (!empty($rare)) {
				echo "# ".implode(' ', $rare)."\n";
			}
		}
		// Then one merged definition holding every tag seen under this name.
		echo ($is_taglist ? '' : 'LIST ')."{$label} = ".implode(' ', array_keys($counts))." ;\n";
	}

	// SET definitions cannot be merged; print each variant as it was written.
	foreach ($defs as $text => $tags) {
		if ($tags === true) {
			echo "{$text}\n";
		}
	}
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment