Skip to content

Instantly share code, notes, and snippets.

@cdcarter
Created June 18, 2023 19:45
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save cdcarter/ef56a8930a5b13e3ad98c0098a7ed375 to your computer and use it in GitHub Desktop.
Save cdcarter/ef56a8930a5b13e3ad98c0098a7ed375 to your computer and use it in GitHub Desktop.
const std = @import("std");
const ast = @import("ast.zig");
const lex = @import("lexer.zig");
fn infixBindingPower(op: ast.Operator) struct { u8, u8 } {
return switch (op) {
.EXP => return .{ 3, 4 },
.ADD, .SUB => return .{ 5, 6 },
.MUL, .DIV => return .{ 7, 8 },
};
}
// returning a null indicates this operator doesn't prefix bind.
fn prefixBindingPower(op: ast.Operator) ?u8 {
return switch (op) {
.ADD, .SUB => 9,
else => null,
};
}
pub const Parser = struct {
arena: std.heap.ArenaAllocator = undefined,
allocator: std.mem.Allocator = undefined,
tokens: []lex.Token,
index: usize = 0,
fail_msg: []const u8 = "",
fn init(self: *Parser, allocator: std.mem.Allocator) void {
self.arena = std.heap.ArenaAllocator.init(allocator); // TODO look into the state thingy, can i keep a smaller reference?
self.allocator = self.arena.allocator();
}
fn deinit(self: *Parser) void {
self.arena.deinit();
}
inline fn currToken(self: Parser) lex.Token {
return self.tokens[self.index];
}
inline fn peekToken(self: Parser) ?lex.Token {
if ((self.index + 1) >= self.tokens.len) return null;
return self.tokens[self.index + 1];
}
fn advanceToken(self: *Parser) void {
self.index += 1;
}
fn enterParse(self: *Parser) !?*const ast.Expression {
if (self.index == self.tokens.len) return null;
return (try self.parseExpression(0)).?;
}
// XXX some people might make this throw...
fn currIsSyntax(self: *Parser, syn: lex.SyntaxToken) bool {
const tok = self.currToken();
return switch (tok) {
.syntax => |st| {
return st == syn;
},
else => {
return false;
},
};
}
fn parseExpression(self: *Parser, min_binding_power: u8) !?*const ast.Expression {
var lhs = try self.allocator.create(ast.Expression);
lhs.* = switch (self.currToken()) {
.identifier => |ident| ast.Expression{ .identifier = ident },
.literal => |lit| ast.Expression{ .literal = lit },
.syntax => |syn| oblk: {
_ = switch (syn) {
.LPAREN => {
var temp = try self.parseExpression(0) orelse {
try self.fail(error.NoExpressionAfterOpenParen, "parsed an open paren and expected an expression after but found none.", .{});
unreachable;
};
self.advanceToken();
if (!self.currIsSyntax(lex.SyntaxToken.RPAREN)) {
try self.fail(error.NoCloseParen, "expecting a close paren but found {}.", .{self.currToken()});
unreachable;
}
break :oblk temp.*; // somethings causing a straight up crash, probably because memory addresses.? parseExpression allocs but we're also allocing here. hrm.
},
else => {
try self.fail(error.BadToken, "bad token {} for an lval.", .{syn});
unreachable;
},
};
},
.operator => |op| blk: {
const right_binding_power = prefixBindingPower(op) orelse {
try self.fail(error.BadOperatorForPrefix, "bad operator {} for a prefix operation.", .{op});
unreachable;
};
self.advanceToken();
const rhs = (try self.parseExpression(right_binding_power)) orelse {
try self.fail(error.NoExpressionAfterUnaryOp, "parsed an {} as infix op and expected an expression after but found none.", .{op});
unreachable;
};
break :blk ast.Expression{ .unaryOp = .{ .op = op, .right = @constCast(rhs) } };
},
.eof => {
return null;
},
};
while (self.peekToken()) |peek| {
const op = switch (peek) {
.operator => |op| op,
.eof => break,
else => {
try self.fail(error.UnhandledTokenType, "bad token {any} for an op", .{self.advanceToken()});
unreachable;
},
};
const binding = infixBindingPower(op); // todo should binding power just be a struct??
const left_binding_power = binding[0];
const right_binding_power = binding[1];
if (left_binding_power < min_binding_power) break;
self.advanceToken();
self.advanceToken();
var rhs = try self.parseExpression(right_binding_power) orelse {
try self.fail(error.DanglingOperator, "just parsed an {}, expecting another expression...", .{op});
unreachable;
};
var binNode = try self.allocator.create(ast.Expression);
binNode.* = ast.Expression{ .binaryOp = .{ .left = lhs, .right = rhs, .op = op } };
lhs = binNode;
}
return lhs;
}
fn fail(self: *Parser, err: anyerror, comptime msg: []const u8, vals: anytype) !void {
self.fail_msg = try std.fmt.allocPrint(self.allocator, msg, vals);
return err;
}
};
fn testParse(input: []const u8) !void {
var l: lex = .{ .source = input, .allocator = std.testing.allocator };
std.debug.print("parsing `{s}`.\n", .{input});
const tokens = try l.lex();
defer std.testing.allocator.free(tokens);
var p: Parser = .{ .tokens = tokens };
p.init(std.testing.allocator);
defer p.deinit();
if (p.enterParse()) |tree| {
std.debug.print("{any}\n", .{tree});
} else |err| {
std.debug.print("Unexpected error during parsing.\nType: {}\nMessage: {s}\n", .{ err, p.fail_msg });
try std.testing.expect(false);
}
}
test "do things compile" {
std.debug.print("\n", .{});
try testParse("-123+456");
try testParse("-123");
try testParse("abc");
try testParse("123 + 456");
try testParse("123 + 456 - 333 * 88");
try testParse("1+a+3");
try testParse("");
try testParse(" ");
try testParse("(123-456)"); // hehe it crashed without a stack or error
//try testParse("1 1 1 "); // this one is broken with unexpected token type void. wow.
//todo add asserts
}
const std = @import("std");
pub const Operator = enum {
ADD,
SUB,
MUL,
DIV,
EXP,
};
pub const Expression = union(enum) {
literal: Literal,
identifier: Identifier,
binaryOp: BinaryOp,
unaryOp: UnaryOp,
pub const Literal = i16;
pub const Identifier = []const u8;
pub const BinaryOp = struct {
left: *const Expression,
right: *const Expression,
op: Operator,
};
pub const UnaryOp = struct {
right: *Expression,
op: Operator,
};
pub fn format(
self: Expression,
comptime fmt: []const u8,
options: std.fmt.FormatOptions,
writer: anytype,
) !void {
_ = options;
_ = fmt;
// todo this whole thing is a bit messy, read more about format strings
_ = switch (self) {
.literal => |lit| try writer.print("int<{d}>", .{lit}),
.identifier => |ident| try writer.print("ident<{s}>", .{ident}),
.binaryOp => |bop| {
try writer.print("binaryOp<{}, {}, {}>", .{ bop.left, bop.op, bop.right });
},
.unaryOp => |uop| {
try writer.print("unaryOp<{}, {}>", .{ uop.op, uop.right });
},
};
}
};
const std = @import("std");
const ast = @import("ast.zig");
const ascii = std.ascii;
// SyntaxTokens are operators/tokens that the AST doesn't care about.
pub const SyntaxToken = enum { LPAREN, RPAREN };
pub const TokenType = std.meta.Tag(Token);
// if you want line and col, you would start here...
pub const Token = union(enum) {
literal: i16,
identifier: []const u8,
operator: ast.Operator,
syntax: SyntaxToken,
eof: u1,
};
source: []const u8,
index: usize = 0,
allocator: std.mem.Allocator,
pub fn lex(self: *@This()) ![]Token {
var workingSet = std.ArrayList(Token).init(self.allocator);
defer workingSet.deinit();
while (self.index < self.source.len) {
var c: u8 = self.source[self.index];
self.index += 1; //todo can this be a continue expression?
switch (c) {
'0'...'9' => {
const start: usize = self.index - 1;
while (self.index < self.source.len) {
if (!ascii.isDigit(self.source[self.index])) break;
self.index += 1;
}
const val = self.source[start..self.index];
const intVal = try std.fmt.parseInt(i16, val, 10);
try workingSet.append(.{ .literal = intVal });
},
'a'...'z', 'A'...'Z' => {
const start: usize = self.index - 1;
while (self.index < self.source.len) {
if (!ascii.isAlphabetic(self.source[self.index])) break;
self.index += 1;
}
const val = self.source[start..self.index];
try workingSet.append(.{ .identifier = val });
},
'+' => {
try workingSet.append(.{ .operator = .ADD });
},
'-' => {
try workingSet.append(.{ .operator = .SUB });
},
'*' => {
try workingSet.append(.{ .operator = .MUL });
},
'/' => {
try workingSet.append(.{ .operator = .DIV });
},
'^' => {
try workingSet.append(.{ .operator = .EXP });
},
'(' => {
try workingSet.append(.{ .syntax = .LPAREN });
},
')' => {
try workingSet.append(.{ .syntax = .RPAREN });
}, // TODO how to make lexer switch on ops better???
' ', '\t' => {
continue;
},
else => {
std.debug.print("lex error, unexpected character {c} in {s}\n", .{ c, self.source });
},
}
}
return workingSet.toOwnedSlice();
}
fn testLex(input: []const u8) !void {
var lexer: @This() = .{ .source = input, .allocator = std.testing.allocator };
var result = try lexer.lex();
defer std.testing.allocator.free(result);
std.debug.print("{any}\n", .{result});
}
test "basic lex" {
std.debug.print("\n", .{});
try testLex("12343");
try testLex("1+2+3");
try testLex("1+2 + 3");
try testLex("abc + 2 + 3");
//TODO add asserts
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment