Skip to content

Instantly share code, notes, and snippets.

@jahav
Created June 3, 2023 15:26
Show Gist options
  • Save jahav/fe8da165571b439ba7ee3417fb5f494e to your computer and use it in GitHub Desktop.
Save jahav/fe8da165571b439ba7ee3417fb5f494e to your computer and use it in GitHub Desktop.
A proof-of-concept converter that translates an ANTLR lexer grammar into a Rolex lexer specification.
using System.Text;
using Antlr4.Runtime;
internal class Program
{
// Entry point: parses an ANTLR v4 lexer grammar file and prints the converted
// lexer specification to standard output.
//
// args: optional; when present, args[0] is the path of the .g4 grammar to convert.
//       Without arguments the original hard-coded development path is used, so
//       existing invocations keep working.
private static void Main(string[] args)
{
    const string defaultGrammarPath = @"c:\Users\havli\source\repos\Antlr2Rolex\FormulaLexer.g4";
    var grammarPath = args.Length > 0 ? args[0] : defaultGrammarPath;

    // Re-joining the lines with '\n' hands the ANTLR lexer text with uniform line endings.
    var grammarText = string.Join("\n", File.ReadAllLines(grammarPath));

    var inputStream = new AntlrInputStream(grammarText);
    var lexer = new ANTLRv4Lexer(inputStream);
    var tokens = new CommonTokenStream(lexer);
    var parser = new ANTLRv4Parser(tokens);
    var grammarSpec = parser.grammarSpec();

    // Only the converted specification is written to stdout, so the output can be
    // redirected straight into a file.
    var rolexLexer = new Visitor().Visit(grammarSpec);
    Console.WriteLine(rolexLexer);
}
private class Visitor : ANTLRv4ParserBaseVisitor<string>
{
// Characters that must be backslash-escaped when an ANTLR string literal is copied
// into a pattern: regex metacharacters (including '|', which would otherwise be read
// as alternation) and the single quote that delimits the emitted pattern.
private static readonly HashSet<char> _regexEscape = new("{}()[]?*.+-$^'|");
// Names of the non-fragment lexer rules, in the order they are collected; one output line each.
private readonly List<string> _tokenNames = new();
// Converted pattern of every lexer rule resolved so far (tokens and fragments), keyed by rule name.
private readonly Dictionary<string, string> _patterns = new();
// Converts every lexer rule of the grammar and returns one "NAME = 'pattern'" line
// (terminated by '\n') per non-fragment rule.
//
// Rules may reference other rules that have not been converted yet; such a rule throws
// MissingDependencyPatternException and is retried on the next pass. Passes repeat until
// every rule has a pattern.
//
// Throws InvalidOperationException when a pass resolves nothing although rules are still
// pending, i.e. the remaining rules depend on undefined rules or on each other.
public override string VisitGrammarSpec(ANTLRv4Parser.GrammarSpecContext context)
{
    var lexerRules = context.rules().ruleSpec()
        .Select(rs => rs.lexerRuleSpec())
        .Where(rs => rs is not null)
        .ToList();
    _tokenNames.AddRange(lexerRules
        .Where(lr => lr.FRAGMENT() is null)
        .Select(lr => lr.TOKEN_REF().GetText()));

    while (true)
    {
        var unresolved = new List<string>();
        var resolvedThisPass = 0;
        foreach (var lexerRule in lexerRules)
        {
            var name = lexerRule.TOKEN_REF().GetText();
            if (_patterns.ContainsKey(name))
            {
                continue;
            }

            try
            {
                var pattern = Visit(lexerRule);
                _patterns.Add(name, pattern);
                resolvedThisPass++;
            }
            catch (MissingDependencyPatternException)
            {
                // A referenced rule has no pattern yet; retry this rule on the next pass.
                unresolved.Add(name);
            }
        }

        if (unresolved.Count == 0)
        {
            break;
        }

        // Without progress the next pass would fail in exactly the same way, so stop
        // here instead of looping until an iteration limit is hit.
        if (resolvedThisPass == 0)
        {
            throw new InvalidOperationException(
                "Unable to resolve lexer rule(s) " + string.Join(", ", unresolved) +
                ": they reference rules that are undefined or depend on each other.");
        }
    }

    var sb = new StringBuilder();
    foreach (var tokenName in _tokenNames)
    {
        sb.Append(tokenName).Append(" = '").Append(_patterns[tokenName]).Append("'\n");
    }
    return sb.ToString();
}
// lexerRuleSpec
// : FRAGMENT? TOKEN_REF optionsSpec? COLON lexerRuleBlock SEMI
// ;
//
// Returns the pattern of a single lexer rule: the aggregated result of visiting all
// of the rule's children.
public override string VisitLexerRuleSpec(ANTLRv4Parser.LexerRuleSpecContext context)
{
    return VisitChildren(context);
}
// The rule block contributes whatever its children aggregate to.
public override string VisitLexerRuleBlock(ANTLRv4Parser.LexerRuleBlockContext context)
    => VisitChildren(context);
// lexerElement
// : lexerAtom ebnfSuffix?
// | lexerBlock ebnfSuffix?
// | actionBlock QUESTION?
// ;
//
// Converts the first child; when a second child is present its text is appended as a
// repetition suffix, with parentheses around both the operand and the whole expression.
public override string VisitLexerElement(ANTLRv4Parser.LexerElementContext context)
{
    var operand = Visit(context.children[0]);
    if (context.children.Count <= 1)
    {
        return operand;
    }

    var suffix = context.children[1].GetText();
    var grouped = MakeBlock(operand);
    return MakeBlock(grouped + suffix);
}
// lexerBlock
// : LPAREN lexerAltList RPAREN
// ;
//
// Child 1 is the alternative list between the parentheses; its pattern is wrapped in a group.
public override string VisitLexerBlock(ANTLRv4Parser.LexerBlockContext context)
    => MakeBlock(Visit(context.children[1]));
// lexerAltList
// : lexerAlt (OR lexerAlt)*
// ;
//
// Alternatives sit at the even child indices (the odd ones are the OR tokens). Each
// alternative is grouped, the groups are joined with '|', and the result is grouped again.
public override string VisitLexerAltList(ANTLRv4Parser.LexerAltListContext context)
{
    var alternatives = new List<string> { MakeBlock(Visit(context.children[0])) };
    for (var i = 2; i < context.ChildCount; i += 2)
    {
        alternatives.Add(MakeBlock(Visit(context.children[i])));
    }

    return MakeBlock(string.Join("|", alternatives));
}
// lexerAtom
// : characterRange
// | terminal
// | notSet
// | LEXER_CHAR_SET
// | DOT elementOptions?
// ;
//
// Converts a character range, a terminal, or a character set (copied verbatim).
// Throws NotImplementedException for the remaining alternatives (notSet, DOT).
public override string VisitLexerAtom(ANTLRv4Parser.LexerAtomContext context)
{
    if (context.characterRange() is { } range)
    {
        return Visit(range);
    }
    if (context.terminal() is { } terminal)
    {
        return Visit(terminal);
    }
    if (context.LEXER_CHAR_SET() is { } charSet)
    {
        return charSet.GetText();
    }

    // Name the offending grammar text so the unsupported construct can be found.
    throw new NotImplementedException(
        $"Lexer atom '{context.GetText()}' is not supported: only character ranges, terminals and character sets are converted.");
}
// Converts an ANTLR character range (two quoted literals) into a regex character class
// "[start-end]". The surrounding quotes of each literal are stripped first.
public override string VisitCharacterRange(ANTLRv4Parser.CharacterRangeContext context)
{
    var startChar = ConvertUnicodeCodepoint(context.STRING_LITERAL(0).GetText()[1..^1]);
    var endChar = ConvertUnicodeCodepoint(context.STRING_LITERAL(1).GetText()[1..^1]);
    return $"[{startChar}-{endChar}]";

    // Rewrites the braced escape \u{X...} as \uXXXX; any other text is returned unchanged.
    static string ConvertUnicodeCodepoint(string codepoint)
    {
        const string bracedPrefix = "\\u{";
        // Ordinal comparison: this is a syntactic check and must not depend on the current culture.
        if (codepoint.StartsWith(bracedPrefix, StringComparison.Ordinal) && codepoint.EndsWith('}'))
        {
            // Zero-pad short code points (\u{41} -> \u0041) so the escape has four hex digits.
            // NOTE(review): code points with more than four digits are passed through as before;
            // confirm how the target regex dialect expects those to be written.
            var hexDigits = codepoint[bracedPrefix.Length..^1].PadLeft(4, '0');
            return "\\u" + hexDigits;
        }
        return codepoint;
    }
}
// Converts a terminal: a string literal becomes its regex-escaped text (quotes stripped);
// otherwise the terminal is a reference to another rule and that rule's pattern is returned.
// Throws MissingDependencyPatternException when the referenced rule has no pattern yet.
public override string VisitTerminal(ANTLRv4Parser.TerminalContext context)
{
    if (context.STRING_LITERAL()?.GetText() is { } quoted)
    {
        var unquoted = quoted[1..^1];
        return EscapeLiteralForRegEx(unquoted);
    }

    // Not a literal, so this is a reference to a token or fragment rule.
    var referencedRule = context.TOKEN_REF().GetText();
    if (!_patterns.TryGetValue(referencedRule, out var resolved))
    {
        throw new MissingDependencyPatternException(referencedRule);
    }

    return resolved;
}
// Child results are combined by plain concatenation, in visiting order.
protected override string AggregateResult(string aggregate, string nextResult)
    => string.Concat(aggregate, nextResult);
// Returns the literal with every character from _regexEscape prefixed by a backslash.
//
// A backslash that is followed by another character is treated as an escape sequence
// that is already present in the literal and is copied through as a pair. Without this,
// an escaped quote (\') would be emitted as \\' - an escaped backslash followed by a
// bare quote.
private string EscapeLiteralForRegEx(string literal)
{
    var sb = new StringBuilder(literal.Length);
    for (var i = 0; i < literal.Length; i++)
    {
        var c = literal[i];
        if (c == '\\' && i + 1 < literal.Length)
        {
            // Copy the backslash together with the character it escapes.
            sb.Append(c).Append(literal[++i]);
            continue;
        }

        if (_regexEscape.Contains(c))
        {
            sb.Append('\\');
        }
        sb.Append(c);
    }
    return sb.ToString();
}
// Wraps the pattern in parentheses so it is treated as a single group.
private string MakeBlock(string pattern) => $"({pattern})";
}
// Thrown while converting a lexer rule that references another rule whose pattern
// has not been produced yet.
internal class MissingDependencyPatternException : Exception
{
    // name: the referenced rule that has no pattern yet. It is also included in the
    // exception message so an unhandled instance is self-explanatory.
    public MissingDependencyPatternException(string name)
        : base($"No pattern is available yet for the referenced lexer rule '{name}'.")
    {
        Name = name;
    }

    // Name of the referenced rule that could not be resolved.
    public string Name { get; }
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment