Skip to content

Instantly share code, notes, and snippets.

@quat1024
Last active September 10, 2023 05:47
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save quat1024/a1966a5718407ad63c1848504f637374 to your computer and use it in GitHub Desktop.
Save quat1024/a1966a5718407ad63c1848504f637374 to your computer and use it in GitHub Desktop.
pratt parsing in java, but succinct
package agency.highlysuspect.parsefunny;
import org.jetbrains.annotations.NotNull;
import org.jetbrains.annotations.Nullable;
import java.util.ArrayDeque;
import java.util.Deque;
import java.util.stream.Collectors;
//This is sort of a synthesis of the parsers described in these two articles:
//
//matklad - "Simple but Powerful Pratt Parsing"
// https://matklad.github.io/2020/04/13/simple-but-powerful-pratt-parsing.html
// Very featureful Pratt parser that can recognize prefix, postfix, infix, and "around"fix operators.
// Uses a somewhat delicate numeric precedence scheme. Implemented in Rust.
// *He also has this nice article https://matklad.github.io/2023/05/21/resilient-ll-parsing-tutorial.html
//
//Jamie Brandon (scattered-thoughts) - "Better operator precedence"
// https://www.scattered-thoughts.net/writing/better-operator-precedence/
// Introduces a "comparePrecedence" function instead of using numeric precedence.
// But the given implementation (in Zig) doesn't cover anything other than infix operators.
//
//Wanted the best of both. I think I got it.
//
//There's also this nice article about pratt parsing by the Crafting Interpreters developer:
//Bob Nystrom - Pratt Parsers: Expression Parsing Made Easy
// https://journal.stuffwithstuff.com/2011/03/19/pratt-parsers-expression-parsing-made-easy/
//
//But it gets a little bogged down in all the Java, so much so that someone wrote a "pratt parsing
//without virtual dispatch or Java" article about it (http://www.oilshell.org/blog/2016/11/03.html),
//and I wanted to redeem Java a bit :) So here's a complete lexer and parser in ~200 lines of commented java 17.
public class ParserExploration2 {
static void check(boolean arg, String msg) {
if(!arg) throw new IllegalStateException(msg);
}
/// Lexing ///
enum TokenType {
EOF, //token after the end of the file (very common technique to avoid Optional<Token> everywhere)
ATOM, //stand-in for a "number" or "name literal" token you might have in a real parser
STAR, SLASH, SQUIGGLE, PLUS, MINUS, DOT, BANG, PERCENT, //grab bag of operators
LPAREN, RPAREN, //parenthesis
}
record Token(TokenType type, String display) {
@Override public String toString() { return display; }
}
static class Lexer {
//Toy-quality lexer ported from matklad's post, lexes 1-character long operations and literals only.
//Don't get bogged down with this, it's not the important part.
Lexer(String in) {
this.tokens = in.chars()
.filter(i -> !Character.isWhitespace(i))
.mapToObj(i -> switch(i) {
case '*' -> new Token(TokenType.STAR, "*");
case '/' -> new Token(TokenType.SLASH, "/");
case '~' -> new Token(TokenType.SQUIGGLE, "~");
case '+' -> new Token(TokenType.PLUS, "+");
case '-' -> new Token(TokenType.MINUS, "-");
case '.' -> new Token(TokenType.DOT, ".");
case '!' -> new Token(TokenType.BANG, "!");
case '%' -> new Token(TokenType.PERCENT, "%");
case '(' -> new Token(TokenType.LPAREN, "(");
case ')' -> new Token(TokenType.RPAREN, ")");
default -> new Token(TokenType.ATOM, Character.toString(i));
}).collect(Collectors.toCollection(ArrayDeque::new));
}
//A "peeking iterator" API over the lexemes. Note that there's no way to see two tokens ahead.
final Deque<Token> tokens;
Token peek() {
return tokens.isEmpty() ? new Token(TokenType.EOF, "") : tokens.peekFirst();
}
Token pop() {
return tokens.isEmpty() ? new Token(TokenType.EOF, "") : tokens.pollFirst();
}
}
/// AST sketch. Renders toString as an S-expression. ///
interface Expr {}
record Atom(String s) implements Expr {
@Override public String toString() { return s; }
}
record UnaryOp(OperationType op, Expr operand) implements Expr {
@Override public String toString() { return "(%s %s)".formatted(op, operand); }
}
record BinaryOp(OperationType op, Expr left, Expr right) implements Expr {
@Override public String toString() { return "(%s %s %s)".formatted(op, left, right); }
}
/// Parsing ///
//Entrypoint for the recursive expression parser. In a real parsing application, this would be one of many little functions.
Expr expr(Lexer lex) {
return pratt(lex, null); //`null` means "there is no operation to my left to consider binding to".
}
//Pratt parser bullshit magic:
Expr pratt(Lexer lex, @Nullable OperationType leftOp) {
Token lhsToken = lex.pop();
Expr lhs = switch(lhsToken.type) {
case ATOM -> new Atom(lhsToken.toString());
case LPAREN -> {
//When crossing into parenthesis, revert to the null operation type.
//Nothing should be able to bind leftwards *through* an opening paren.
Expr e = pratt(lex, null);
check(lex.pop().type == TokenType.RPAREN, "expected closing parenthesis");
yield e;
}
default -> {
//is it prefix?
OperationType asPrefix = OperationType.prefixOpFrom(lhsToken);
check(asPrefix != null, lhsToken + " is not a prefix operation!");
//we already consumed the prefix operator's token
Expr rhs = pratt(lex, asPrefix); //recurse to find the right-hand side of the expression
yield new UnaryOp(asPrefix, rhs); //apply the prefix operator to the *next* operand
//look for another infix or postfix operation
}
};
while(true) {
Token opToken = lex.peek();
//(Special casing for EOF, RParen, etc is not actually needed; xxxOpFrom returning `null` takes care of it.)
//is it infix?
OperationType infixOp = OperationType.infixOpFrom(opToken);
if(infixOp != null && bindsToRight(leftOp, infixOp)) {
lex.pop(); //consume the infix operator's token
Expr rhs = pratt(lex, infixOp); //recurse to find the right-hand side of the expression (!HARD to understand!)
lhs = new BinaryOp(infixOp, lhs, rhs); //apply the infix operator to *both* operands
continue; //look for another infix or postfix operation
}
//is it postfix?
OperationType postfixOp = OperationType.postfixOpFrom(opToken);
if(postfixOp != null && bindsToRight(leftOp, postfixOp)) {
lex.pop(); //consume the postfix operator's token
lhs = new UnaryOp(postfixOp, lhs); //apply the postfix operator to the *previous* operand
continue; //look for another infix or postfix operation
}
return lhs;
}
}
//Whether the operation on the right is "stronger" than the one on the left.
static boolean bindsToRight(@Nullable OperationType left, @NotNull OperationType right) {
if(left == null) return true; //There isn't an operator on the left to even consider binding to!
//Key observation: this is the only place precedence and rightAssociative are read.
//The precedence implementation can be swapped out for anything (such as a handwritten table),
//if manual precedence-number fiddling isn't up your alley. Exceptions for specific pairs of
//operators can also be added to this method.
if(left.precedence == right.precedence) return left.rightAssociative; //break ties like this
else return right.precedence > left.precedence;
}
//A token corresponds to zero or more "logical operations" depending on the token's fixity.
//For example, the - token can be unary minus or subtraction, and unary minus binds tighter than
//multiplication, but multiplication binds tighter than subtraction. In other words, tokens don't
//really "have precedence", but (token, fixity) tuples do. The typical way of modelling this is
//maintaining separate precedence tables: one for prefix operators, one for infix operators, and
//one for postfix operators. This is a little fiddly, so in the spirit of reducing the amount of
//"magic numbers" in pratt parsing, I tried mapping each operator and fixity into a *single*
//OperationType table, giving a name to the operation at the same time (so I can say "factorial"
//instead of "postfix bang"). Maybe it makes more sense this way? It's probably a little easier to
//reason about when it comes time to add more operations? I think it's just as flexible? Idk.
enum OperationType {
ADD (20, false, "+"),
SUBTRACT (20, false, "-"),
SQUIGGLE (30, false, "~"), //Arbitrary binary operator with precedence between + and *
PERCENT (30, true, "%"), //Same, but right-associative
MULTIPLY (40, false, "*"),
DIVIDE (40, false, "/"),
UNARY_PLUS (50, false, "+"),
UNARY_MINUS (50, false, "-"),
FACTORIAL (60, false, "!"), //Postfix
UNARY_SQUIGGLE(70, false, "~"),
COMPOSITION (100, true, "."); //Right-associative
OperationType(int precedence, boolean rightAssociative, String display) {
this.precedence = precedence;
this.rightAssociative = rightAssociative;
this.display = display;
}
final int precedence;
final boolean rightAssociative;
final String display;
@Override public String toString() { return this.display; }
//Tables for converting tokens into logical operations, depending on the token's fixity.
//Note that if a token has infix *and* postfix forms, the main parser loop will just pick one.
//I think correctly disambiguating cases like that requires backtracking or infinite lookahead.
//Prefix and infix is okay though.
static @Nullable OperationType prefixOpFrom(Token tok) {
return switch(tok.type) {
case PLUS -> OperationType.UNARY_PLUS;
case MINUS -> OperationType.UNARY_MINUS;
case SQUIGGLE -> OperationType.UNARY_SQUIGGLE;
default -> null;
};
}
static @Nullable OperationType infixOpFrom(Token tok) {
return switch(tok.type) {
case PLUS -> OperationType.ADD;
case MINUS -> OperationType.SUBTRACT;
case SQUIGGLE -> OperationType.SQUIGGLE;
case PERCENT -> OperationType.PERCENT;
case STAR -> OperationType.MULTIPLY;
case SLASH -> OperationType.DIVIDE;
case DOT -> OperationType.COMPOSITION;
default -> null;
};
}
static @Nullable OperationType postfixOpFrom(Token tok) {
return switch(tok.type) {
case BANG -> OperationType.FACTORIAL;
default -> null;
};
}
}
/// demonstration ///
void checkParse(String in, String expect) {
String out = expr(new Lexer(in)).toString();
System.out.println("in: " + in);
System.out.println("out: " + out);
System.out.println("expect: " + expect);
check(out.equals(expect), "Expected " + expect + " but found " + out);
}
void go() {
//left-associative operators
checkParse("1 + 2 + 3 + 4", "(+ (+ (+ 1 2) 3) 4)");
checkParse("1 ~ 2 ~ 3 ~ 4", "(~ (~ (~ 1 2) 3) 4)");
checkParse("1 * 2 * 3 * 4", "(* (* (* 1 2) 3) 4)");
checkParse("1 + 2 - 3 + 4", "(+ (- (+ 1 2) 3) 4)"); //operators with the same priority
checkParse("1 * 2 / 3 * 4", "(* (/ (* 1 2) 3) 4)");
//right-associative operators
checkParse("1 . 2 . 3 . 4", "(. 1 (. 2 (. 3 4)))");
checkParse("1 % 2 % 3 % 4", "(% 1 (% 2 (% 3 4)))");
//mixed associativity among operators with the same precedence
checkParse("1 ~ 2 ~ 3 % 4", "(% (~ (~ 1 2) 3) 4)");
checkParse("1 ~ 2 % 3 % 4", "(% (~ 1 2) (% 3 4))");
checkParse("1 % 2 ~ 3 ~ 4", "(% 1 (~ (~ 2 3) 4))");
checkParse("1 % 2 % 3 ~ 4", "(% 1 (% 2 (~ 3 4)))");
//operator precedence
checkParse("1 + 2 ~ 3", "(+ 1 (~ 2 3))"); //plus is weaker than squiggle
checkParse("1 * 2 ~ 3", "(~ (* 1 2) 3)"); //star is stronger than squiggle
checkParse("1 + 2 * 3 + 4", "(+ (+ 1 (* 2 3)) 4)");
checkParse("1 * 2 + 3 * 4", "(+ (* 1 2) (* 3 4))");
//prefix operators
checkParse("-1", "(- 1)");
checkParse("- - 1", "(- (- 1))");
checkParse("- + 1", "(- (+ 1))");
checkParse("- - 1 - 2", "(- (- (- 1)) 2)");
checkParse("- - 1 - - 2", "(- (- (- 1)) (- 2))");
checkParse("1--2", "(- 1 (- 2))");
checkParse("- 1 * - 2", "(* (- 1) (- 2))");
//postfix operators
checkParse("5 !", "(! 5)");
checkParse("5 ! !", "(! (! 5))");
checkParse("- 5 !", "(- (! 5))"); //unary - is weaker than !
checkParse("~ 5 !", "(! (~ 5))"); //unary ~ is stronger than !
checkParse("---5!!!", "(- (- (- (! (! (! 5))))))");
checkParse("~~~5!!!", "(! (! (! (~ (~ (~ 5))))))");
checkParse("-~~5!!!", "(- (! (! (! (~ (~ 5))))))");
checkParse("~-~5!!!", "(~ (- (! (! (! (~ 5))))))");
checkParse("1 + 3!", "(+ 1 (! 3))");
checkParse("1 + 3 ! + 5", "(+ (+ 1 (! 3)) 5)");
//parentheticals
checkParse("( 1 + 2 ) * 3", "(* (+ 1 2) 3)");
checkParse("1 + ( 2 * 3 )", "(+ 1 (* 2 3))");
checkParse("(1+2)*(3+4)", "(* (+ 1 2) (+ 3 4))");
checkParse("~(1)", "(~ 1)");
checkParse("~((((((1))))))", "(~ 1)");
checkParse("~(((~(((1))))))", "(~ (~ 1))");
checkParse("(4 + 5)!", "(! (+ 4 5))");
//stupid viral math problems from the internet
checkParse("6 / 2 * (1 + 2)", "(* (/ 6 2) (+ 1 2))");
checkParse("2 + 2 * 4", "(+ 2 (* 2 4))");
checkParse("1 + 1 * 0 + 1", "(+ (+ 1 (* 1 0)) 1)");
}
public static void main(String[] args) {
new ParserExploration2().go();
}
}
package agency.highlysuspect.parsefunny;
import java.util.ArrayDeque;
import java.util.Deque;
import java.util.stream.Collectors;
//This shorter version strips away the pre/postfix parsing & parenthesis parsing, leaving only the
//core of the algorithm. It's also implemented more traditionally, using a numeric "binding power".
public class ParserExploration2_InfixOnly {
static void check(boolean arg, String msg) {
if(!arg) throw new IllegalStateException(msg);
}
/// Lexing ///
enum TokenType {
EOF, //token after the end of the file (very common technique to avoid Optional<Token> everywhere)
ATOM, //stand-in for a "number" or "name literal" token you might have in a real parser
STAR, SLASH, SQUIGGLE, PLUS, MINUS, DOT, PERCENT, //grab bag of infix binary operators
}
record Token(TokenType type, String display) {
@Override public String toString() { return display; }
}
static class Lexer {
//Toy-quality lexer ported from matklad's post, lexes 1-character long operations and literals only.
//Don't get bogged down with this, it's not the important part.
Lexer(String in) {
this.tokens = in.chars()
.filter(i -> !Character.isWhitespace(i))
.mapToObj(i -> switch(i) {
case '*' -> new Token(TokenType.STAR, "*");
case '/' -> new Token(TokenType.SLASH, "/");
case '~' -> new Token(TokenType.SQUIGGLE, "~");
case '+' -> new Token(TokenType.PLUS, "+");
case '-' -> new Token(TokenType.MINUS, "-");
case '.' -> new Token(TokenType.DOT, ".");
case '%' -> new Token(TokenType.PERCENT, "%");
default -> new Token(TokenType.ATOM, Character.toString(i));
}).collect(Collectors.toCollection(ArrayDeque::new));
}
//A "peeking iterator" API over the lexemes. Note that there's no way to see two tokens ahead.
final Deque<Token> tokens;
Token peek() {
return tokens.isEmpty() ? new Token(TokenType.EOF, "") : tokens.peekFirst();
}
Token pop() {
return tokens.isEmpty() ? new Token(TokenType.EOF, "") : tokens.pollFirst();
}
}
/// AST sketch. Renders toString as an S-expression. ///
interface Expr {}
record Atom(String s) implements Expr {
@Override public String toString() { return s; }
}
record BinaryOp(String op, Expr left, Expr right) implements Expr {
@Override public String toString() { return "(%s %s %s)".formatted(op, left, right); }
}
/// Parsing ///
static final int NOT_INFIX_OP = 0;
Expr expr(Lexer lex) {
return pratt(lex, NOT_INFIX_OP); //"there is no operation to my left to consider binding to"
}
Expr pratt(Lexer lex, int minBindingPower) {
Token lhsToken = lex.pop();
check(lhsToken.type == TokenType.ATOM, "expected atom");
Expr lhs = new Atom(lhsToken.toString());
while(true) {
Token opToken = lex.peek();
if(leftPower(opToken) == NOT_INFIX_OP || leftPower(opToken) < minBindingPower) return lhs;
lex.pop();
Expr rhs = pratt(lex, rightPower(opToken)); //recurse to find the right-hand side of the expression
lhs = new BinaryOp(opToken.toString(), lhs, rhs); //^ that is the HARD part to understand about this function
}
}
static int leftPower(Token t) {
return switch(t.type) {
case PLUS, MINUS -> 20;
case SQUIGGLE, PERCENT -> 30;
case STAR, SLASH -> 40;
case DOT -> 100;
default -> NOT_INFIX_OP;
};
}
static int rightPower(Token t) {
return switch(t.type) {
case PLUS, MINUS, SQUIGGLE, STAR, SLASH -> leftPower(t) + 1;
case DOT, PERCENT -> leftPower(t) - 1;
default -> NOT_INFIX_OP;
};
}
/// demonstration ///
void checkParse(String in, String expect) {
String out = expr(new Lexer(in)).toString();
System.out.println("in: " + in);
System.out.println("out: " + out);
System.out.println("expect: " + expect);
check(out.equals(expect), "Expected " + expect + " but found " + out);
}
void go() {
//left-associative operators
checkParse("1 + 2 + 3 + 4", "(+ (+ (+ 1 2) 3) 4)");
checkParse("1 ~ 2 ~ 3 ~ 4", "(~ (~ (~ 1 2) 3) 4)");
checkParse("1 * 2 * 3 * 4", "(* (* (* 1 2) 3) 4)");
checkParse("1 + 2 - 3 + 4", "(+ (- (+ 1 2) 3) 4)"); //operators with the same priority
checkParse("1 * 2 / 3 * 4", "(* (/ (* 1 2) 3) 4)");
//right-associative operators
checkParse("1 . 2 . 3 . 4", "(. 1 (. 2 (. 3 4)))");
checkParse("1 % 2 % 3 % 4", "(% 1 (% 2 (% 3 4)))");
//mixed associativity among operators with the same precedence
checkParse("1 ~ 2 ~ 3 % 4", "(% (~ (~ 1 2) 3) 4)");
checkParse("1 ~ 2 % 3 % 4", "(% (~ 1 2) (% 3 4))");
checkParse("1 % 2 ~ 3 ~ 4", "(% 1 (~ (~ 2 3) 4))");
checkParse("1 % 2 % 3 ~ 4", "(% 1 (% 2 (~ 3 4)))");
//operator precedence
checkParse("1 + 2 ~ 3", "(+ 1 (~ 2 3))"); //plus is weaker than squiggle
checkParse("1 * 2 ~ 3", "(~ (* 1 2) 3)"); //star is stronger than squiggle
checkParse("1 + 2 * 3 + 4", "(+ (+ 1 (* 2 3)) 4)");
checkParse("1 * 2 + 3 * 4", "(+ (* 1 2) (* 3 4))");
}
public static void main(String[] args) {
new ParserExploration2_InfixOnly().go();
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment