Skip to content

Instantly share code, notes, and snippets.

@Ruhrpottpatriot
Last active February 10, 2022 12:06
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save Ruhrpottpatriot/37042d657c2bca942157d92e1893430a to your computer and use it in GitHub Desktop.
Save Ruhrpottpatriot/37042d657c2bca942157d92e1893430a to your computer and use it in GitHub Desktop.
#pragma once
#include <stdint.h>
// Normally located in Parser.c
#define ulex_token(name) Tok_ ## name
#define ulex_token_type uint32_t
#define ulex_trans(name) (0x80000000 | (uint32_t)ulex_token(name) << 16)
#define ulex_eof(name) (uint16_t)ulex_token(name)
struct ulex_lexer {
const uint32_t *equiv_table;
const uint32_t *trans_table;
const uint16_t *eof_table;
uint32_t state;
// The fields below make up a string
// Note: The intptr_t are only ever used inside the lexer
// They probably should be uintptr_t instead
const uint8_t *source;
intptr_t source_index;
intptr_t source_size;
};
enum {
ulex_error_none = 0, // No error
ulex_error_insufficient_buffer = 1, // Buffer too small
};
typedef uint32_t ulex_error;
enum {
ulex_flags_chunk = 0x1, // input is partial, no eof
};
typedef uint32_t ulex_flags;
static intptr_t lex(
struct ulex_lexer *lexer,
uint32_t *__restrict trans_buffer,
uint32_t *__restrict offset_buffer,
intptr_t buffer_size
) {
const uint32_t *__restrict equiv_table = lexer->equiv_table;
const uint32_t *__restrict trans_table = lexer->trans_table;
uint32_t state = lexer->state;
const uint8_t *source = lexer->source;
intptr_t source_offset = lexer->source_index;
intptr_t source_size = lexer->source_size;
intptr_t buffer_offset = 0;
buffer_size *= 4;
// Loop until we have either reached the maximum size, i.e. we have run out of space, for the given buffers,
// or we have handled the complete buffer
while ((source_offset < source_size) & (buffer_offset < buffer_size)) {
// The below code is equivalent to
// uint32_t equiv = equiv_table[source[source_offset]];
// uint32_t trans = *(const uint32_t*)((const char*)trans_table + equiv + state);
//
// *(uint32_t*)((char*)trans_buffer + buffer_offset) = trans;
// *(uint32_t*)((char*)offset_buffer + buffer_offset) = (uint32_t)source_offset;
//
// state = trans & 0xffff;
// buffer_offset += trans >> 29;
// source_offset += 1;
// source: u8 const *
uint8_t equiv_index = *(source + source_offset);
// equiv_table: u32 const *
uint32_t equiv = *(equiv_table + equiv_index);
// Cast the trans_table pointer from a u32 to a char so that the addition
// will move the pointer with `equiv` and `state` by BYTEs and not a DWORDs.
// After that cast it back to u32, so we have the correct type
// trans_table: u32 const*
const char* trans_table_char_ptr = (const char*)trans_table + equiv + state;
uint32_t trans = *(const uint32_t*)trans_table_char_ptr;
// Same reason as directly above for the casting to char* and back to u32*
*(uint32_t *) ((char *) trans_buffer + buffer_offset) = trans;
*(uint32_t *) ((char *) offset_buffer + buffer_offset) = (uint32_t) source_offset;
// Take the lower 16 bit
state = trans & 0xffff;
// Shift 29 bits to the right
buffer_offset += trans >> 29;
// Increment source offset, i.e. move one token ahead
source_offset += 1;
}
// Set the new state of the state machine and update the current index into the source file
lexer->state = state;
lexer->source_index = source_offset;
// Return the number of tokens to the caller
return buffer_offset / 4;
}
ulex_error ulex_lex(
struct ulex_lexer *lexer, // The lexer responsible for the source lexing
ulex_flags flags, // The flags describing the source code
uint32_t *token_buffer, // Holds a list of all tokens from the source file
uint32_t *offset_buffer,
intptr_t buffer_size, // The size of the two buffers above
intptr_t *out_token_count // Returns how many tokens are in the source file
) {
// Steps taken:
// * Desugar `arr[i]` to `*(arr + i)`
// * Desugar `i++` to `i += 1`;
// * Added more variables to remove nested array access
// first transition gets special handling
if (lexer->state == (uint32_t) -1) {
// The below code is equivalent to:
// uint32_t equiv = lexer->equiv_table[lexer->source[lexer->source_index++]];
// lexer->state = *(const uint32_t*)((const char*)lexer->trans_table + equiv) & 0xffff;
// Get the current index in the source string, return it, then increment the source index
uint32_t source_idx = lexer->source_index;
lexer->source_index += 1;
// Get the byte/uint8 at the current source index and implicitly cast it to u32 so we can use it to index
// the equiv_table
// lexer->source: u8* => lexer->source[i]: u8
uint32_t current_char = *(lexer->source + source_idx);
// Access the equiv table with the current char cast to uint32
uint32_t equiv = *(lexer->equiv_table + current_char);
// Cast the trans_table pointer from a u32 to a char so that the addition
// will move the pointer by a BYTE and not a DWORD.
// After that cast it back to u32, so we have the correct type
const char *trans_table_char_ptr = (const char *) lexer->trans_table + equiv;
const uint32_t *trans_table_u32_ptr = (const uint32_t *) trans_table_char_ptr;
// Dereference the table ptr and use a 16 bit mask to get the lower 16 bits
// This allows us to use the lower bits as index into the state machine
lexer->state = *trans_table_u32_ptr & 0xffff;
}
// Call the actual lexing function, which returns the number of tokens that are present
// in the buffer
intptr_t token_count = lex(lexer, token_buffer, offset_buffer, buffer_size);
// transform transitions to tokens
for (intptr_t i = 0; i < token_count; ++i) {
// The below code is equivalent to:
// token_buffer[i] = (token_buffer[i] >> 16) & 0xfff;
// Take the token at index i and shift it 16 bits to the right, then take the lower 12 bits of it
// This is equivalent to getting bytes 16-27
// token_buffer: uint32_t *
uint32_t element = *(token_buffer + i);
uint32_t element_shifted = element >> 16;
uint32_t element_masked = element_shifted & 0xfff;
*(token_buffer + i) = element_masked;
}
// Test if everything fit into the buffers
int source_end = lexer->source_index == lexer->source_size;
// handle end of file by adding an end of file token
// to the token buffer and incrementing the token counter by one
// It's also rested whether the given source file is a chunk,
// i.e. not a complete source code file and therefore has no EOF marker
if ((flags & ulex_flags_chunk) == 0 && source_end) {
// The below code is equivalent to:
// token_buffer[token_count] = lexer->eof_table[lexer->state / 4];
// offset_buffer[token_count] = (uint32_t) lexer->source_size;
// ++token_count;
uint32_t index = lexer->state / 4;
*(token_buffer + token_count) = *(lexer->eof_table + index);
*(offset_buffer + token_count) = (uint32_t) lexer->source_size;
token_count += 1;
}
// Assign the outbound count with the number of tokens in the source file
*out_token_count = token_count;
// If we could store all tokens inside the buffers we are good,
// else we want to return an error indicating that the provided buffer wasn't big enough
return source_end ? ulex_error_none : ulex_error_insufficient_buffer;
}
// Predefined tables
static const uint32_t ulex_equiv_table[0x100] = {
0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0xfc, 0x1f8, 0xfc, 0xfc, 0x2f4, 0x0, 0x0,
/* Remaining members omitted */
};
static const uint32_t ulex_trans_table[0x7e0] = {
0x0, 0x34, 0x4, 0x2c, 0x10, ulex_trans(Error) | 0x0, ulex_trans(Error) | 0x0, ulex_trans(Error) | 0x0, ulex_trans(Error) | 0x0, ulex_trans(Error) | 0x0, 0x2c, 0x2c, 0x10, 0x34, 0x34, ulex_trans(IntDec) | 0x0, ulex_trans(Newline) | 0x0, ulex_trans(Colon) | 0x0, 0xa4, ulex_trans(Lt) | 0x0, ulex_trans(Assign) | 0x0, ulex_trans(Gt) | 0x0, ulex_trans(Word) | 0x0, ulex_trans(LBrack) | 0x0, ulex_trans(Div) | 0x0, ulex_trans(RBrack) | 0x0, ulex_trans(Dot) | 0x0, ulex_trans(Sub) | 0x0, ulex_trans(Ne) | 0x0, ulex_trans(Comma) | 0x0, ulex_trans(String) | 0x0, ulex_trans(Add) | 0x0, ulex_trans(ModAssign) | 0x0, ulex_trans(Con) | 0x0, ulex_trans(MulAssign) | 0x0, ulex_trans(AddAssign) | 0x0, ulex_trans(SubAssign) | 0x0, ulex_trans(DivAssign) | 0x0, ulex_trans(Mul) | 0x0, ulex_trans(Eq) | 0x0, ulex_trans(IntDec) | 0x0, 0xec, ulex_trans(RParen) | 0x0, ulex_trans(Le) | 0x0, ulex_trans(Ge) | 0x0, ulex_trans(Whitespace) | 0x0, ulex_trans(LParen) | 0x0, ulex_trans(Mod) | 0x0, ulex_trans(DocComment) | 0x0, ulex_trans(Dis) | 0x0, 0x2c, ulex_trans(Float) | 0x0, ulex_trans(IntHex) | 0x0, ulex_trans(BlockComment) | 0x0, ulex_trans(Not) | 0x0, ulex_trans(IntDec) | 0x0, ulex_trans(IntDec) | 0x0, ulex_trans(Word) | 0x0, ulex_trans(IntDec) | 0x0, 0xec, ulex_trans(Float) | 0x0, ulex_trans(Float) | 0x0, ulex_trans(Whitespace) | 0x0,
/* Remaining members omitted */
};
static const uint16_t ulex_eof_table[0x3f] = {
ulex_eof(Error), ulex_eof(Error), ulex_eof(Error), ulex_eof(Error), ulex_eof(Error), ulex_eof(Error), ulex_eof(Error), ulex_eof(Error), ulex_eof(Error), ulex_eof(Error), ulex_eof(Error), ulex_eof(Error), ulex_eof(Error), ulex_eof(Error), ulex_eof(Error), ulex_eof(IntDec), ulex_eof(Newline), ulex_eof(Colon), ulex_eof(LineComment), ulex_eof(Lt), ulex_eof(Assign), ulex_eof(Gt), ulex_eof(Word), ulex_eof(LBrack), ulex_eof(Div), ulex_eof(RBrack), ulex_eof(Dot), ulex_eof(Sub), ulex_eof(Ne), ulex_eof(Comma), ulex_eof(String), ulex_eof(Add), ulex_eof(ModAssign), ulex_eof(Con), ulex_eof(MulAssign), ulex_eof(AddAssign), ulex_eof(SubAssign), ulex_eof(DivAssign), ulex_eof(Mul), ulex_eof(Eq), ulex_eof(IntDec), ulex_eof(LineComment), ulex_eof(RParen), ulex_eof(Le), ulex_eof(Ge), ulex_eof(Whitespace), ulex_eof(LParen), ulex_eof(Mod), ulex_eof(DocComment), ulex_eof(Dis), ulex_eof(String), ulex_eof(Float), ulex_eof(IntHex), ulex_eof(BlockComment), ulex_eof(Not), ulex_eof(IntDec), ulex_eof(IntDec), ulex_eof(Word), ulex_eof(IntDec), ulex_eof(LineComment), ulex_eof(Float), ulex_eof(Float), ulex_eof(Whitespace),
};
use std::ptr::null_mut;
struct Lexer {
equiv_table: *const u32,
trans_table: *const u32,
eof_table: *const u32,
state: Option<u32>,
source: *mut u8,
source_index: isize,
source_size: isize,
}
impl Lexer {
fn reset(&mut self) {
self.state = None;
}
unsafe fn new() -> Self {
Lexer {
trans_table: ULEX_TRANS_TABLE.as_ptr(),
equiv_table: ULEX_EQUIV_TABLE.as_ptr(),
eof_table: ULEX_EOF_TABLE.as_ptr(),
state: None,
source: null_mut(),
source_index: 0,
source_size: 0,
}
}
unsafe fn ulex_lex(
&mut self,
flags: Flags,
token_buffer: *mut u32,
offset_buffer: *mut u32,
buffer_size: isize,
) -> Result<isize, Error> {
// First transition gets special handling
if self.state.is_none() {
// Get the index first, then increment it
// This is equivalent to `x := arr[i++]`
let idx = self.source_index;
self.source_index += 1;
// Get the current char from the source string
let current_char = *(self.source.offset(idx) as *const isize);
let equivalence = *(self.equiv_table.offset(current_char) as *const usize);
let trans = *((self.trans_table as *const char).add(equivalence) as *const u32);
let trans = trans & 0xffff;
self.state = Some(trans);
}
let mut token_count = self.lex(token_buffer, offset_buffer, buffer_size);
for i in 0..token_count {
*token_buffer.offset(i) = (*token_buffer.offset(i) >> (16 as u32)) & 0xfff;
}
// handle end of file
let source_end = self.source_index == self.source_size;
if flags != Flags::Chunk && source_end {
let state = (self.state.unwrap() / 4) as usize;
*token_buffer.offset(token_count) = *self.eof_table.add(state);
*offset_buffer.offset(token_count) = self.source_size as u32;
token_count += 1;
}
Ok(token_count)
}
unsafe fn lex(
&mut self,
trans_buffer: *mut u32,
offset_buffer: *mut u32,
buffer_size: isize,
) -> isize {
let equiv_table = self.equiv_table;
let trans_table = self.trans_table;
// Can only be called by ulex_lex, which will always set the state
let mut state = self.state.unwrap();
let source = self.source;
let mut source_offset = self.source_index;
let source_size = self.source_size;
let mut buffer_offset = 0;
let buffer_size = buffer_size / 4;
while (source_offset < source_size) & (buffer_offset < buffer_size) {
let idx = (*source.offset(source_offset)) as isize;
let equiv = *equiv_table.offset(idx);
let idx = (equiv + state) as usize;
let trans = *((trans_table as *const char).add(idx) as *const u32);
// Cast the buffers to a char ptr so we can walk the offset byte by byte and not by dword by dword
// unsafe
*((trans_buffer as *mut char).offset(buffer_offset) as *mut u32) = trans;
*((offset_buffer as *mut char).offset(buffer_offset) as *mut u32) = source_offset as u32;
state = trans & 0xffff;
buffer_offset += (trans >> 29) as isize;
source_offset += 1;
}
self.state = Some(state);
self.source_index = source_offset;
return buffer_offset / 4;
}
}
#[derive(PartialEq, Eq)]
enum Error {
InsufficientBuffer
}
#[derive(PartialEq, Eq)]
enum Flags {
Chunk = 0x1
}
// Predefined lexer tables
// Equiv table, each two rows are sixteen elements
static ULEX_EQUIV_TABLE: &'static [u32; 0x100] = &[
0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
0x0, 0xfc, 0x1f8, 0xfc, 0xfc, 0x2f4, 0x0, 0x0,
/* Rest of members omitted for brevity */
];
// static ULEX_TRANS_TABLE: &'static [u32; 0x7e0] = todo!(); // Replace with actual table
static ULEX_TRANS_TABLE: &'static [u32; 0x1] = &[0x0 as u32];
// static ULEX_EOF_TABLE: &'static [u32; 0x3f] = todo!(); // Replace with actual table
static ULEX_EOF_TABLE: &'static [u32; 0x1] = &[0x0 as u32];
// Predefined lexer tables
// Equiv table, each two rows are sixteen elements
static ULEX_EQUIV_TABLE: &'static [u32; 0x100] = &[
0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
0x0, 0xfc, 0x1f8, 0xfc, 0xfc, 0x2f4, 0x0, 0x0,
/* Rest of members omitted for brevity */
];
// static ULEX_TRANS_TABLE: &'static [u32; 0x7e0] = todo!(); // Replace with actual table
static ULEX_TRANS_TABLE: &'static [u32; 0x1] = &[0x0 as u32];
// static ULEX_EOF_TABLE: &'static [u32; 0x3f] = todo!(); // Replace with actual table
static ULEX_EOF_TABLE: &'static [u32; 0x1] = &[0x0 as u32];struct Lexer<'a> {
equiv_table: &'a [u32],
trans_table: &'a [u32],
eof_table: &'a [u32],
state: Option<u32>,
source: Vec<u8>,
source_index: usize,
source_size: usize,
}
impl Lexer<'_> {
fn reset(&mut self) {
self.state = None;
}
unsafe fn new() -> Self {
Lexer {
trans_table: ULEX_TRANS_TABLE,
equiv_table: ULEX_EQUIV_TABLE,
eof_table: ULEX_EOF_TABLE,
state: None,
source: Vec::new(),
source_index: 0,
source_size: 0,
}
}
fn ulex_lex(
&mut self,
flags: Flags,
token_buffer: &mut [u32],
offset_buffer: &mut [u32],
buffer_size: isize,
) -> Result<usize, Error> {
// First transition gets special handling
if self.state.is_none() {
// Get the index first, then increment it
// This is equivalent to `x := arr[i++]`
let idx = self.source_index;
self.source_index += 1;
// Get the current char from the source string
let current_char = self.source[idx] as usize;
let equivalence = self.equiv_table[current_char] as usize;
let trans = get_middle_index(self.trans_table, equivalence);
let trans = trans & 0xffff;
self.state = Some(trans);
}
let mut token_count = self.lex(token_buffer, offset_buffer, buffer_size);
for i in 0..token_count {
token_buffer[i] = token_buffer[i] >> 16 & 0xfff;
}
// handle end of file
let source_end = self.source_index == self.source_size;
if flags != Flags::Chunk && source_end {
let state = (self.state.unwrap() / 4) as usize;
token_buffer[token_count] = self.eof_table[state];
offset_buffer[token_count] = self.source_size as u32;
token_count += 1;
}
Ok(token_count)
}
fn lex(
&mut self,
trans_buffer: &mut [u32],
offset_buffer: &mut [u32],
buffer_size: isize,
) -> usize {
let equiv_table = self.equiv_table;
let trans_table = self.trans_table;
// Can only be called by ulex_lex, which will always set the state
let mut state = self.state.unwrap();
let source = &self.source;
let mut source_offset = self.source_index as usize;
let source_size = self.source_size as usize;
let mut buffer_offset = 0 as usize;
let buffer_size = buffer_size as usize / 4;
while (source_offset < source_size) & (buffer_offset < buffer_size) {
let idx = source[source_offset] as usize;
let equiv = equiv_table[idx];
let idx = (equiv + state) as usize;
let trans = get_middle_index(trans_table, idx);
assign_in_the_middle(trans_buffer, buffer_offset, trans);
assign_in_the_middle(offset_buffer, buffer_offset, source_offset as u32);
state = trans & 0xffff;
buffer_offset += (trans >> 29) as usize;
source_offset += 1;
}
self.state = Some(state);
self.source_index = source_offset;
return buffer_offset / 4;
}
}
fn assign_in_the_middle(buffer: &mut [u32], offset: usize, data: u32) {
let ptr = buffer.as_mut_ptr();
unsafe {
// Cast the buffers to a char ptr so we can walk the offset by BYTE and not by DWORD
*((ptr as *mut char).add(offset) as *mut u32) = data;
}
}
fn get_middle_index(table: &[u32], offset: usize) -> u32 {
let table_ptr = table.as_ptr();
unsafe {
// Cast the buffers to a char ptr so we can walk the offset by BYTE and not by DWORD
*((table_ptr as *const char).add(offset) as *const u32)
}
}
#[derive(PartialEq, Eq)]
enum Error {
InsufficientBuffer
}
#[derive(PartialEq, Eq)]
enum Flags {
Chunk = 0x1
}
// Predefined lexer tables
// Equiv table, each two rows are sixteen elements
static ULEX_EQUIV_TABLE: &'static [u32; 0x100] = &[
0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
0x0, 0xfc, 0x1f8, 0xfc, 0xfc, 0x2f4, 0x0, 0x0,
/* Rest of members omitted for brevity */
];
// static ULEX_TRANS_TABLE: &'static [u32; 0x7e0] = todo!(); // Replace with actual table
static ULEX_TRANS_TABLE: &'static [u32; 0x1] = &[0x0 as u32];
// static ULEX_EOF_TABLE: &'static [u32; 0x3f] = todo!(); // Replace with actual table
static ULEX_EOF_TABLE: &'static [u32; 0x1] = &[0x0 as u32];
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment