Skip to content

Instantly share code, notes, and snippets.

@nektro
Last active Aug 30, 2022
Embed
What would you like to do?
A tokenizer built with data-oriented design
zig-*
.zigmod
deps.zig
const std = @import("std");
const deps = @import("./deps.zig");
/// Standard zigmod-based build script: compiles the `example` executable,
/// attaches all zigmod dependencies, and wires up a `zig build run` step.
pub fn build(b: *std.build.Builder) void {
    const target = b.standardTargetOptions(.{});
    const mode = b.standardReleaseOptions();

    const executable = b.addExecutable("example", "main.zig");
    executable.setTarget(target);
    executable.setBuildMode(mode);
    deps.addAllTo(executable);
    executable.install();

    // `zig build run [-- args…]` installs first, then runs the binary,
    // forwarding any arguments given after `--`.
    const run_command = executable.run();
    run_command.step.dependOn(b.getInstallStep());
    if (b.args) |forwarded_args| run_command.addArgs(forwarded_args);

    const run_step = b.step("run", "Run the app");
    run_step.dependOn(&run_command.step);
}
// Initialize two numbers, compute their sum, and display it in the page.
const x = 5;
const y = 6;
const z = x + y;
document.getElementById("demo").innerHTML = "The value of z is " + z + ".";
const std = @import("std");
const tokenize = @import("tokenize");
/// Reads `example.js` from the current directory, tokenizes it with a
/// JavaScript-flavored symbol set, and prints each token's tag and text.
pub fn main() !void {
    var file = try std.fs.cwd().openFile("example.js", .{});
    defer file.close();
    var gpa = std.heap.GeneralPurposeAllocator(.{}){};
    defer _ = gpa.deinit();
    const alloc = gpa.allocator();
    // Allow inputs up to 1 MiB; the previous 1024-byte cap made any
    // non-trivial file fail with error.StreamTooLong.
    var content = try file.reader().readAllAlloc(alloc, 1024 * 1024);
    // `tokenize.do` frees `content` itself on success, so there is
    // deliberately no `defer alloc.free(content)` here; the errdefer
    // covers the case where `do` fails before taking ownership.
    errdefer alloc.free(content);
    var tok_doc = try tokenize.do(alloc, "{}();,=+", content);
    defer tok_doc.deinit(alloc);
    std.log.info("Success!", .{});
    std.log.info("parsed {d} tokens", .{tok_doc.tokens.len});
    std.debug.print("\n", .{});
    // Walk the tag column of the SoA token storage and resolve each
    // token's source text via Document.str.
    for (tok_doc.tokens.items(.tag)) |item, i| {
        std.debug.print("{s}\t{s}\n", .{ @tagName(item), tok_doc.str(@intCast(u32, i)) });
    }
}
const std = @import("std");
// Convenience alias used throughout this file for byte-slice "strings".
const string = []const u8;
/// A fully-tokenized document. Owns all of its memory; release with `deinit`.
pub const Document = struct {
    // Struct-of-arrays token storage (tag/extra/line/pos columns).
    tokens: std.MultiArrayList(Token).Slice,
    // Interned backing bytes for word/string token text.
    string_bytes: []const u8,
    // Flat (start, len) pairs into `string_bytes`, indexed via `Token.extra`.
    extra: []const u32,
    /// Returns the source text of token `ind`.
    /// FIXME(review): the `.symbol` arm returns a slice into a stack-local
    /// temporary array literal, which is dangling once `str` returns —
    /// confirm and fix (e.g. by interning symbol bytes too).
    pub fn str(self: Document, ind: Token.Index) []const u8 {
        const tag = self.tokens.items(.tag)[ind];
        const extra = self.tokens.items(.extra)[ind];
        return switch (tag) {
            // For symbols, `extra` holds the raw character code itself.
            .symbol => &[_]u8{@intCast(u8, extra)},
            // For words/strings, `extra` indexes a (start, len) pair in `self.extra`.
            .word, .string => self.string_bytes[self.extra[extra]..][0..self.extra[extra + 1]],
        };
    }
    /// Frees everything this document owns. `alloc` must be the same
    /// allocator that was passed to `do`.
    pub fn deinit(self: *Document, alloc: std.mem.Allocator) void {
        self.tokens.deinit(alloc);
        alloc.free(self.string_bytes);
        alloc.free(self.extra);
    }
};
/// A single token, stored in struct-of-arrays form via `std.MultiArrayList`.
pub const Token = struct {
    tag: Tag,
    // Payload whose meaning depends on `tag`: the raw character for
    // `.symbol`, an index into `Document.extra` for `.word` / `.string`.
    extra: u32,
    // 1-based source line of the token.
    line: u32,
    // 1-based column. NOTE(review): `pos` is captured at flush time in `do`,
    // so it may not point at the token's first character — confirm.
    pos: u32,
    pub const Index = u32;
    pub const Tag = std.meta.Tag(Data);
    /// Token categories; the payload types document what `extra` refers to.
    pub const Data = union(enum) {
        symbol: void,
        word: ExtraStr,
        string: ExtraStr,
    };
    /// Whitespace bytes that separate tokens and are never emitted.
    pub const skippedChars = &[_]u8{ ' ', '\n', '\t', '\r' };
};
/// A (start, len) range describing a substring of `Document.string_bytes`.
const ExtraStr = struct {
    start: u32,
    len: u32,
    /// Resolve this range against the document's interned byte buffer.
    pub fn get(self: @This(), code: Document) []const u8 {
        const begin = self.start;
        return code.string_bytes[begin .. begin + self.len];
    }
};
/// Mutable scratch state accumulated while tokenizing; converted into an
/// immutable `Document` by `final`.
const Worker = struct {
    insts: std.ArrayListUnmanaged(Token) = .{},
    extras: std.ArrayListUnmanaged(u32) = .{},
    strings: std.ArrayListUnmanaged(u8) = .{},
    // Interning map: already-seen text -> its index in `extras`. Keys are
    // slices into the caller's `input` buffer, not copies.
    strings_map: std.StringHashMapUnmanaged(u32) = .{},
    /// Interns `str`: appends its bytes to `strings` and a (start, len) pair
    /// to `extras`, returning the index of that pair. A duplicate string
    /// returns the previously assigned index without appending anything.
    pub fn addStr(self: *Worker, alloc: std.mem.Allocator, str: string) !u32 {
        var res = try self.strings_map.getOrPut(alloc, str);
        if (res.found_existing) return res.value_ptr.*;
        const q = self.strings.items.len;
        try self.strings.appendSlice(alloc, str);
        const r = self.extras.items.len;
        try self.extras.appendSlice(alloc, &[_]u32{ @intCast(u32, q), @intCast(u32, str.len) });
        res.value_ptr.* = @intCast(u32, r);
        return @intCast(u32, r);
    }
    /// Consumes the worker and produces the final `Document`: the string and
    /// extras buffers are transferred to the document, the interning map is
    /// freed, and the token list is repacked into struct-of-arrays form.
    /// The worker must not be used after this call.
    pub fn final(self: *Worker, alloc: std.mem.Allocator) !Document {
        self.strings_map.deinit(alloc);
        // Move tokens out of the ArrayList, then copy them into a
        // MultiArrayList for cache-friendly per-field iteration.
        var tokens_list = self.insts.toOwnedSlice(alloc);
        defer alloc.free(tokens_list);
        var multilist = std.MultiArrayList(Token){};
        errdefer multilist.deinit(alloc);
        try multilist.ensureUnusedCapacity(alloc, tokens_list.len);
        for (tokens_list) |item| multilist.appendAssumeCapacity(item);
        return Document{
            .tokens = multilist.slice(),
            .string_bytes = self.strings.toOwnedSlice(alloc),
            .extra = self.extras.toOwnedSlice(alloc),
        };
    }
};
/// Scanner sub-state while inside a multi-character construct.
const InnerMode = enum {
    unknown,
    line_comment,
    string,
};
/// Tokenizes `input` into words, `"`-quoted strings, and single-character
/// symbols (any byte in `symbols`), skipping whitespace and `//` line
/// comments. The returned `Document` owns all of its memory.
/// NOTE(review): contrary to the original comment ("input may be freed after
/// this function returns"), this function frees `input` itself on the success
/// path — callers must NOT free it again. On an error return, `input` is NOT
/// freed.
pub fn do(alloc: std.mem.Allocator, symbols: []const u8, input: string) !Document {
    var wrk = Worker{};
    // String table indexes 0 and 1 are reserved for special meaning.
    try wrk.strings.appendSlice(alloc, &[_]u8{ 0, 0 });
    // Current 1-based line/column, advanced at the bottom of the loop.
    var line: u32 = 1;
    var pos: u32 = 1;
    // [start, end) delimits the token text currently being accumulated.
    var start: usize = 0;
    var end: usize = 0;
    var mode = InnerMode.unknown;
    // NOTE(review): @setEvalBranchQuota only affects comptime evaluation;
    // for a runtime call of this function it is a no-op.
    @setEvalBranchQuota(100000);
    for (input) |c, i| {
        var shouldFlush: bool = undefined;
        blk: {
            if (mode == .unknown) {
                // FIXME(review): `input[i + 1]` reads out of bounds when the
                // final byte of `input` is '/'.
                if (c == '/' and input[i + 1] == '/') {
                    mode = .line_comment;
                    shouldFlush = false;
                    break :blk;
                }
                if (c == '"') {
                    mode = .string;
                    shouldFlush = false;
                    break :blk;
                }
            }
            // Inside a line comment: discard bytes until the newline.
            if (mode == .line_comment) {
                if (c == '\n') {
                    // skip comments
                    // f(v.handle(TTCom, in[s:i]))
                    start = i;
                    end = i;
                    mode = .unknown;
                }
                shouldFlush = c == '\n';
                break :blk;
            }
            // Inside a string: close when the opening quote byte repeats.
            // NOTE(review): this assumes `start` indexes the opening quote;
            // if a word immediately precedes the quote, `input[start]` is the
            // word's first byte instead — confirm intended behavior.
            if (mode == .string) {
                if (c == input[start]) {
                    try wrk.insts.append(alloc, .{
                        .tag = .string,
                        .extra = try wrk.addStr(alloc, input[start .. i + 1]),
                        .line = line,
                        .pos = pos,
                    });
                    start = i + 1;
                    end = i;
                    mode = .unknown;
                }
                shouldFlush = false;
                break :blk;
            }
            // Whitespace and symbol bytes both terminate the pending word.
            if (std.mem.indexOfScalar(u8, Token.skippedChars, c)) |_| {
                shouldFlush = true;
                break :blk;
            }
            if (std.mem.indexOfScalar(u8, symbols, c)) |_| {
                shouldFlush = true;
                break :blk;
            }
            shouldFlush = false;
            break :blk;
        }
        if (!shouldFlush) {
            end += 1;
        }
        if (shouldFlush) {
            if (mode == .unknown) {
                // Emit the accumulated word, if any.
                if (end - start > 0) {
                    try wrk.insts.append(alloc, .{
                        .tag = .word,
                        .extra = try wrk.addStr(alloc, input[start..end]),
                        .line = line,
                        .pos = pos,
                    });
                    start = i;
                    end = i;
                }
                // Whitespace delimiter: just advance past it.
                if (std.mem.indexOfScalar(u8, Token.skippedChars, c)) |_| {
                    start += 1;
                    end += 1;
                }
                // Symbol delimiter: emit a one-byte token carrying the raw
                // character code in `extra`.
                if (std.mem.indexOfScalar(u8, symbols, c)) |_| {
                    try wrk.insts.append(alloc, .{
                        .tag = .symbol,
                        .extra = c,
                        .line = line,
                        .pos = pos,
                    });
                    start += 1;
                    end += 1;
                }
            }
        }
        pos += 1;
        if (c != '\n') continue;
        line += 1;
        pos = 1;
    }
    // NOTE(review): a trailing word not followed by any delimiter at EOF is
    // never flushed and is silently dropped — confirm whether intended.
    alloc.free(input);
    return try wrk.final(alloc);
}
id: gukdhjgjclrqg0muddtl2e8yrm0na99wwyup6mpi17s9mh40
name: tokenize
main: tokenize.zig
license: MIT
description: A data-oriented-design built tokenizer
dependencies:
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment