Last active
September 26, 2020 18:09
-
-
Save mohd-akram/e0101f7d66cdaabbb60dbc6d5e48e58a to your computer and use it in GitHub Desktop.
A script to convert an ERE (extended regular expression) to one or more BREs
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env awk -f | |
# usage: ere2bre <ere> | |
function bracket( \ | |
pattern, i, \ | |
c, c2, term, len, end \ | |
) { | |
++i # skip opening bracket | |
if (substr(pattern, i, 1) == "]") ++i # skip right bracket | |
len = length(pattern) | |
for (; i <= len; i++) { | |
c = substr(pattern, i, 1) | |
c2 = substr(pattern, i, 2) | |
if (c == "\\") ++i | |
else if (c2 == term) { term = ""; ++i } | |
else if (c2 == "[:" && !term) { term = ":]"; ++i } | |
else if (c2 == "[." && !term) { term = ".]"; ++i } | |
else if (c2 == "[=" && !term) { term = "=]"; ++i } | |
else if (c == "]" && !term) { end = i; break } | |
} | |
if (!end) { | |
print "Unbalanced brackets" > "/dev/stderr" | |
exit 1 | |
} | |
return end | |
} | |
function groups( \ | |
pattern, out, \ | |
idx, start, len, i, c, depth \ | |
) { | |
start = 1 | |
len = length(pattern) | |
for (i = 1; i <= len; i++) { | |
c = substr(pattern, i, 1) | |
if (c == "\\") ++i | |
else if (c == "[") i = bracket(pattern, i) | |
else if (c == "(") { | |
if (!depth) { | |
out[++idx] = substr(pattern, start, i-start) | |
start = i | |
} | |
++depth | |
} | |
else if (c == ")") { | |
--depth | |
if (!depth) { | |
out[++idx] = substr(pattern, start, i-start+1) | |
start = i+1 | |
} | |
} | |
} | |
out[++idx] = substr(pattern, start) | |
return idx | |
} | |
function alts( \ | |
pattern, out, \ | |
idx, start, len, i, c, depth \ | |
) { | |
start = 1 | |
len = length(pattern) | |
for (i = 1; i <= len; i++) { | |
c = substr(pattern, i, 1) | |
if (c == "\\") ++i | |
else if (c == "[") i = bracket(pattern, i) | |
else if (c == "(") ++depth | |
else if (c == ")") --depth | |
else if (c == "|" && !depth) { | |
out[++idx] = substr(pattern, start, i-start) | |
start = i+1 | |
} | |
} | |
if (depth) { | |
print "Unbalanced parentheses" > "/dev/stderr" | |
exit 1 | |
} | |
out[++idx] = substr(pattern, start) | |
return idx | |
} | |
function replace( \ | |
pattern, \ | |
new, len, i, c, end \ | |
) { | |
len = length(pattern) | |
for (i = 1; i <= len; i++) { | |
c = substr(pattern, i, 1) | |
if (c == "\\") { | |
++i | |
c = substr(pattern, i, 1) | |
if ( \ | |
c != "?" && c != "+" && c != "|" && \ | |
c != "{" && c != "}" && c != "(" && c != ")" \ | |
) | |
c = "\\" c | |
} | |
else if (c == "[") { | |
end = bracket(pattern, i) | |
c = substr(pattern, i, end-i+1) | |
i = end | |
} | |
else if (c == "?") c = "\\{0,1\\}" | |
else if (c == "+") c = "\\{1,\\}" | |
else if (c == "{") c = "\\{" | |
else if (c == "}") c = "\\}" | |
else if (c == "(") c = "\\(" | |
else if (c == ")") c = "\\)" | |
new = new c | |
} | |
return new | |
} | |
function permutate( \ | |
opts, lens, out, oidx, \ | |
res, idx, len, i, s \ | |
) { | |
len = lens[0] | |
if (!idx) idx = 1 | |
if (idx > len) { | |
for (i = 1; i <= len; i++) | |
s = s res[i] | |
out[++oidx] = s | |
return oidx | |
} | |
for (i = 1; i <= lens[idx]; i++) { | |
res[idx] = opts[idx,i] | |
oidx = permutate(opts, lens, out, oidx, res, idx+1) | |
} | |
return oidx | |
} | |
function flatten( \ | |
pattern, out, \ | |
parts, idx, len, gs, gslen, wrap, ps, pslen, opts, lens, i, j, k \ | |
) { | |
len = alts(pattern, parts) | |
for (i = 1; i <= len; i++) { | |
gslen = groups(parts[i], gs) | |
if (gslen == 1) { | |
out[++idx] = gs[1] | |
continue | |
} | |
for (j = 1; j <= gslen; j++) { | |
wrap = gs[j] ~ /^\(/ | |
if (wrap) | |
gsub(/^\(|\)$/, "", gs[j]) | |
pslen = flatten(gs[j], ps) | |
for (k = 1; k <= pslen; k++) | |
opts[j,k] = wrap ? "(" ps[k] ")" : ps[k] | |
lens[j] = k-1 | |
} | |
lens[0] = j-1 | |
idx = permutate(opts, lens, out, idx) | |
} | |
return idx | |
} | |
BEGIN { | |
len = flatten(ARGV[1], res) | |
while (++i <= len) print replace(res[i]) | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment