Last active
February 10, 2022 12:06
-
-
Save Ruhrpottpatriot/37042d657c2bca942157d92e1893430a to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#pragma once | |
#include <stdint.h> | |
// Normally located in Parser.c | |
#define ulex_token(name) Tok_ ## name | |
#define ulex_token_type uint32_t | |
#define ulex_trans(name) (0x80000000 | (uint32_t)ulex_token(name) << 16) | |
#define ulex_eof(name) (uint16_t)ulex_token(name) | |
struct ulex_lexer { | |
const uint32_t *equiv_table; | |
const uint32_t *trans_table; | |
const uint16_t *eof_table; | |
uint32_t state; | |
// The fields below make up a string | |
// Note: The intptr_t are only ever used inside the lexer | |
// They probably should be uintptr_t instead | |
const uint8_t *source; | |
intptr_t source_index; | |
intptr_t source_size; | |
}; | |
enum { | |
ulex_error_none = 0, // No error | |
ulex_error_insufficient_buffer = 1, // Buffer too small | |
}; | |
typedef uint32_t ulex_error; | |
enum { | |
ulex_flags_chunk = 0x1, // input is partial, no eof | |
}; | |
typedef uint32_t ulex_flags; | |
static intptr_t lex( | |
struct ulex_lexer *lexer, | |
uint32_t *__restrict trans_buffer, | |
uint32_t *__restrict offset_buffer, | |
intptr_t buffer_size | |
) { | |
const uint32_t *__restrict equiv_table = lexer->equiv_table; | |
const uint32_t *__restrict trans_table = lexer->trans_table; | |
uint32_t state = lexer->state; | |
const uint8_t *source = lexer->source; | |
intptr_t source_offset = lexer->source_index; | |
intptr_t source_size = lexer->source_size; | |
intptr_t buffer_offset = 0; | |
buffer_size *= 4; | |
// Loop until we have either reached the maximum size, i.e. we have run out of space, for the given buffers, | |
// or we have handled the complete buffer | |
while ((source_offset < source_size) & (buffer_offset < buffer_size)) { | |
// The below code is equivalent to | |
// uint32_t equiv = equiv_table[source[source_offset]]; | |
// uint32_t trans = *(const uint32_t*)((const char*)trans_table + equiv + state); | |
// | |
// *(uint32_t*)((char*)trans_buffer + buffer_offset) = trans; | |
// *(uint32_t*)((char*)offset_buffer + buffer_offset) = (uint32_t)source_offset; | |
// | |
// state = trans & 0xffff; | |
// buffer_offset += trans >> 29; | |
// source_offset += 1; | |
// source: u8 const * | |
uint8_t equiv_index = *(source + source_offset); | |
// equiv_table: u32 const * | |
uint32_t equiv = *(equiv_table + equiv_index); | |
// Cast the trans_table pointer from a u32 to a char so that the addition | |
// will move the pointer with `equiv` and `state` by BYTEs and not a DWORDs. | |
// After that cast it back to u32, so we have the correct type | |
// trans_table: u32 const* | |
const char* trans_table_char_ptr = (const char*)trans_table + equiv + state; | |
uint32_t trans = *(const uint32_t*)trans_table_char_ptr; | |
// Same reason as directly above for the casting to char* and back to u32* | |
*(uint32_t *) ((char *) trans_buffer + buffer_offset) = trans; | |
*(uint32_t *) ((char *) offset_buffer + buffer_offset) = (uint32_t) source_offset; | |
// Take the lower 16 bit | |
state = trans & 0xffff; | |
// Shift 29 bits to the right | |
buffer_offset += trans >> 29; | |
// Increment source offset, i.e. move one token ahead | |
source_offset += 1; | |
} | |
// Set the new state of the state machine and update the current index into the source file | |
lexer->state = state; | |
lexer->source_index = source_offset; | |
// Return the number of tokens to the caller | |
return buffer_offset / 4; | |
} | |
ulex_error ulex_lex( | |
struct ulex_lexer *lexer, // The lexer responsible for the source lexing | |
ulex_flags flags, // The flags describing the source code | |
uint32_t *token_buffer, // Holds a list of all tokens from the source file | |
uint32_t *offset_buffer, | |
intptr_t buffer_size, // The size of the two buffers above | |
intptr_t *out_token_count // Returns how many tokens are in the source file | |
) { | |
// Steps taken: | |
// * Desugar `arr[i]` to `*(arr + i)` | |
// * Desugar `i++` to `i += 1`; | |
// * Added more variables to remove nested array access | |
// first transition gets special handling | |
if (lexer->state == (uint32_t) -1) { | |
// The below code is equivalent to: | |
// uint32_t equiv = lexer->equiv_table[lexer->source[lexer->source_index++]]; | |
// lexer->state = *(const uint32_t*)((const char*)lexer->trans_table + equiv) & 0xffff; | |
// Get the current index in the source string, return it, then increment the source index | |
uint32_t source_idx = lexer->source_index; | |
lexer->source_index += 1; | |
// Get the byte/uint8 at the current source index and implicitly cast it to u32 so we can use it to index | |
// the equiv_table | |
// lexer->source: u8* => lexer->source[i]: u8 | |
uint32_t current_char = *(lexer->source + source_idx); | |
// Access the equiv table with the current char cast to uint32 | |
uint32_t equiv = *(lexer->equiv_table + current_char); | |
// Cast the trans_table pointer from a u32 to a char so that the addition | |
// will move the pointer by a BYTE and not a DWORD. | |
// After that cast it back to u32, so we have the correct type | |
const char *trans_table_char_ptr = (const char *) lexer->trans_table + equiv; | |
const uint32_t *trans_table_u32_ptr = (const uint32_t *) trans_table_char_ptr; | |
// Dereference the table ptr and use a 16 bit mask to get the lower 16 bits | |
// This allows us to use the lower bits as index into the state machine | |
lexer->state = *trans_table_u32_ptr & 0xffff; | |
} | |
// Call the actual lexing function, which returns the number of tokens that are present | |
// in the buffer | |
intptr_t token_count = lex(lexer, token_buffer, offset_buffer, buffer_size); | |
// transform transitions to tokens | |
for (intptr_t i = 0; i < token_count; ++i) { | |
// The below code is equivalent to: | |
// token_buffer[i] = (token_buffer[i] >> 16) & 0xfff; | |
// Take the token at index i and shift it 16 bits to the right, then take the lower 12 bits of it | |
// This is equivalent to getting bytes 16-27 | |
// token_buffer: uint32_t * | |
uint32_t element = *(token_buffer + i); | |
uint32_t element_shifted = element >> 16; | |
uint32_t element_masked = element_shifted & 0xfff; | |
*(token_buffer + i) = element_masked; | |
} | |
// Test if everything fit into the buffers | |
int source_end = lexer->source_index == lexer->source_size; | |
// handle end of file by adding an end of file token | |
// to the token buffer and incrementing the token counter by one | |
// It's also rested whether the given source file is a chunk, | |
// i.e. not a complete source code file and therefore has no EOF marker | |
if ((flags & ulex_flags_chunk) == 0 && source_end) { | |
// The below code is equivalent to: | |
// token_buffer[token_count] = lexer->eof_table[lexer->state / 4]; | |
// offset_buffer[token_count] = (uint32_t) lexer->source_size; | |
// ++token_count; | |
uint32_t index = lexer->state / 4; | |
*(token_buffer + token_count) = *(lexer->eof_table + index); | |
*(offset_buffer + token_count) = (uint32_t) lexer->source_size; | |
token_count += 1; | |
} | |
// Assign the outbound count with the number of tokens in the source file | |
*out_token_count = token_count; | |
// If we could store all tokens inside the buffers we are good, | |
// else we want to return an error indicating that the provided buffer wasn't big enough | |
return source_end ? ulex_error_none : ulex_error_insufficient_buffer; | |
} | |
// Predefined tables | |
static const uint32_t ulex_equiv_table[0x100] = { | |
0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0xfc, 0x1f8, 0xfc, 0xfc, 0x2f4, 0x0, 0x0, | |
/* Remaining members omitted */ | |
}; | |
static const uint32_t ulex_trans_table[0x7e0] = { | |
0x0, 0x34, 0x4, 0x2c, 0x10, ulex_trans(Error) | 0x0, ulex_trans(Error) | 0x0, ulex_trans(Error) | 0x0, ulex_trans(Error) | 0x0, ulex_trans(Error) | 0x0, 0x2c, 0x2c, 0x10, 0x34, 0x34, ulex_trans(IntDec) | 0x0, ulex_trans(Newline) | 0x0, ulex_trans(Colon) | 0x0, 0xa4, ulex_trans(Lt) | 0x0, ulex_trans(Assign) | 0x0, ulex_trans(Gt) | 0x0, ulex_trans(Word) | 0x0, ulex_trans(LBrack) | 0x0, ulex_trans(Div) | 0x0, ulex_trans(RBrack) | 0x0, ulex_trans(Dot) | 0x0, ulex_trans(Sub) | 0x0, ulex_trans(Ne) | 0x0, ulex_trans(Comma) | 0x0, ulex_trans(String) | 0x0, ulex_trans(Add) | 0x0, ulex_trans(ModAssign) | 0x0, ulex_trans(Con) | 0x0, ulex_trans(MulAssign) | 0x0, ulex_trans(AddAssign) | 0x0, ulex_trans(SubAssign) | 0x0, ulex_trans(DivAssign) | 0x0, ulex_trans(Mul) | 0x0, ulex_trans(Eq) | 0x0, ulex_trans(IntDec) | 0x0, 0xec, ulex_trans(RParen) | 0x0, ulex_trans(Le) | 0x0, ulex_trans(Ge) | 0x0, ulex_trans(Whitespace) | 0x0, ulex_trans(LParen) | 0x0, ulex_trans(Mod) | 0x0, ulex_trans(DocComment) | 0x0, ulex_trans(Dis) | 0x0, 0x2c, ulex_trans(Float) | 0x0, ulex_trans(IntHex) | 0x0, ulex_trans(BlockComment) | 0x0, ulex_trans(Not) | 0x0, ulex_trans(IntDec) | 0x0, ulex_trans(IntDec) | 0x0, ulex_trans(Word) | 0x0, ulex_trans(IntDec) | 0x0, 0xec, ulex_trans(Float) | 0x0, ulex_trans(Float) | 0x0, ulex_trans(Whitespace) | 0x0, | |
/* Remaining members omitted */ | |
}; | |
static const uint16_t ulex_eof_table[0x3f] = { | |
ulex_eof(Error), ulex_eof(Error), ulex_eof(Error), ulex_eof(Error), ulex_eof(Error), ulex_eof(Error), ulex_eof(Error), ulex_eof(Error), ulex_eof(Error), ulex_eof(Error), ulex_eof(Error), ulex_eof(Error), ulex_eof(Error), ulex_eof(Error), ulex_eof(Error), ulex_eof(IntDec), ulex_eof(Newline), ulex_eof(Colon), ulex_eof(LineComment), ulex_eof(Lt), ulex_eof(Assign), ulex_eof(Gt), ulex_eof(Word), ulex_eof(LBrack), ulex_eof(Div), ulex_eof(RBrack), ulex_eof(Dot), ulex_eof(Sub), ulex_eof(Ne), ulex_eof(Comma), ulex_eof(String), ulex_eof(Add), ulex_eof(ModAssign), ulex_eof(Con), ulex_eof(MulAssign), ulex_eof(AddAssign), ulex_eof(SubAssign), ulex_eof(DivAssign), ulex_eof(Mul), ulex_eof(Eq), ulex_eof(IntDec), ulex_eof(LineComment), ulex_eof(RParen), ulex_eof(Le), ulex_eof(Ge), ulex_eof(Whitespace), ulex_eof(LParen), ulex_eof(Mod), ulex_eof(DocComment), ulex_eof(Dis), ulex_eof(String), ulex_eof(Float), ulex_eof(IntHex), ulex_eof(BlockComment), ulex_eof(Not), ulex_eof(IntDec), ulex_eof(IntDec), ulex_eof(Word), ulex_eof(IntDec), ulex_eof(LineComment), ulex_eof(Float), ulex_eof(Float), ulex_eof(Whitespace), | |
}; |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
use std::ptr::null_mut; | |
struct Lexer { | |
equiv_table: *const u32, | |
trans_table: *const u32, | |
eof_table: *const u32, | |
state: Option<u32>, | |
source: *mut u8, | |
source_index: isize, | |
source_size: isize, | |
} | |
impl Lexer { | |
fn reset(&mut self) { | |
self.state = None; | |
} | |
unsafe fn new() -> Self { | |
Lexer { | |
trans_table: ULEX_TRANS_TABLE.as_ptr(), | |
equiv_table: ULEX_EQUIV_TABLE.as_ptr(), | |
eof_table: ULEX_EOF_TABLE.as_ptr(), | |
state: None, | |
source: null_mut(), | |
source_index: 0, | |
source_size: 0, | |
} | |
} | |
unsafe fn ulex_lex( | |
&mut self, | |
flags: Flags, | |
token_buffer: *mut u32, | |
offset_buffer: *mut u32, | |
buffer_size: isize, | |
) -> Result<isize, Error> { | |
// First transition gets special handling | |
if self.state.is_none() { | |
// Get the index first, then increment it | |
// This is equivalent to `x := arr[i++]` | |
let idx = self.source_index; | |
self.source_index += 1; | |
// Get the current char from the source string | |
let current_char = *(self.source.offset(idx) as *const isize); | |
let equivalence = *(self.equiv_table.offset(current_char) as *const usize); | |
let trans = *((self.trans_table as *const char).add(equivalence) as *const u32); | |
let trans = trans & 0xffff; | |
self.state = Some(trans); | |
} | |
let mut token_count = self.lex(token_buffer, offset_buffer, buffer_size); | |
for i in 0..token_count { | |
*token_buffer.offset(i) = (*token_buffer.offset(i) >> (16 as u32)) & 0xfff; | |
} | |
// handle end of file | |
let source_end = self.source_index == self.source_size; | |
if flags != Flags::Chunk && source_end { | |
let state = (self.state.unwrap() / 4) as usize; | |
*token_buffer.offset(token_count) = *self.eof_table.add(state); | |
*offset_buffer.offset(token_count) = self.source_size as u32; | |
token_count += 1; | |
} | |
Ok(token_count) | |
} | |
unsafe fn lex( | |
&mut self, | |
trans_buffer: *mut u32, | |
offset_buffer: *mut u32, | |
buffer_size: isize, | |
) -> isize { | |
let equiv_table = self.equiv_table; | |
let trans_table = self.trans_table; | |
// Can only be called by ulex_lex, which will always set the state | |
let mut state = self.state.unwrap(); | |
let source = self.source; | |
let mut source_offset = self.source_index; | |
let source_size = self.source_size; | |
let mut buffer_offset = 0; | |
let buffer_size = buffer_size / 4; | |
while (source_offset < source_size) & (buffer_offset < buffer_size) { | |
let idx = (*source.offset(source_offset)) as isize; | |
let equiv = *equiv_table.offset(idx); | |
let idx = (equiv + state) as usize; | |
let trans = *((trans_table as *const char).add(idx) as *const u32); | |
// Cast the buffers to a char ptr so we can walk the offset byte by byte and not by dword by dword | |
// unsafe | |
*((trans_buffer as *mut char).offset(buffer_offset) as *mut u32) = trans; | |
*((offset_buffer as *mut char).offset(buffer_offset) as *mut u32) = source_offset as u32; | |
state = trans & 0xffff; | |
buffer_offset += (trans >> 29) as isize; | |
source_offset += 1; | |
} | |
self.state = Some(state); | |
self.source_index = source_offset; | |
return buffer_offset / 4; | |
} | |
} | |
#[derive(PartialEq, Eq)] | |
enum Error { | |
InsufficientBuffer | |
} | |
#[derive(PartialEq, Eq)] | |
enum Flags { | |
Chunk = 0x1 | |
} | |
// Predefined lexer tables | |
// Equiv table, each two rows are sixteen elements | |
static ULEX_EQUIV_TABLE: &'static [u32; 0x100] = &[ | |
0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, | |
0x0, 0xfc, 0x1f8, 0xfc, 0xfc, 0x2f4, 0x0, 0x0, | |
/* Rest of members omitted for brevity */ | |
]; | |
// static ULEX_TRANS_TABLE: &'static [u32; 0x7e0] = todo!(); // Replace with actual table | |
static ULEX_TRANS_TABLE: &'static [u32; 0x1] = &[0x0 as u32]; | |
// static ULEX_EOF_TABLE: &'static [u32; 0x3f] = todo!(); // Replace with actual table | |
static ULEX_EOF_TABLE: &'static [u32; 0x1] = &[0x0 as u32]; |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Predefined lexer tables | |
// Equiv table, each two rows are sixteen elements | |
static ULEX_EQUIV_TABLE: &'static [u32; 0x100] = &[ | |
0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, | |
0x0, 0xfc, 0x1f8, 0xfc, 0xfc, 0x2f4, 0x0, 0x0, | |
/* Rest of members omitted for brevity */ | |
]; | |
// static ULEX_TRANS_TABLE: &'static [u32; 0x7e0] = todo!(); // Replace with actual table | |
static ULEX_TRANS_TABLE: &'static [u32; 0x1] = &[0x0 as u32]; | |
// static ULEX_EOF_TABLE: &'static [u32; 0x3f] = todo!(); // Replace with actual table | |
static ULEX_EOF_TABLE: &'static [u32; 0x1] = &[0x0 as u32];struct Lexer<'a> { | |
equiv_table: &'a [u32], | |
trans_table: &'a [u32], | |
eof_table: &'a [u32], | |
state: Option<u32>, | |
source: Vec<u8>, | |
source_index: usize, | |
source_size: usize, | |
} | |
impl Lexer<'_> { | |
fn reset(&mut self) { | |
self.state = None; | |
} | |
unsafe fn new() -> Self { | |
Lexer { | |
trans_table: ULEX_TRANS_TABLE, | |
equiv_table: ULEX_EQUIV_TABLE, | |
eof_table: ULEX_EOF_TABLE, | |
state: None, | |
source: Vec::new(), | |
source_index: 0, | |
source_size: 0, | |
} | |
} | |
fn ulex_lex( | |
&mut self, | |
flags: Flags, | |
token_buffer: &mut [u32], | |
offset_buffer: &mut [u32], | |
buffer_size: isize, | |
) -> Result<usize, Error> { | |
// First transition gets special handling | |
if self.state.is_none() { | |
// Get the index first, then increment it | |
// This is equivalent to `x := arr[i++]` | |
let idx = self.source_index; | |
self.source_index += 1; | |
// Get the current char from the source string | |
let current_char = self.source[idx] as usize; | |
let equivalence = self.equiv_table[current_char] as usize; | |
let trans = get_middle_index(self.trans_table, equivalence); | |
let trans = trans & 0xffff; | |
self.state = Some(trans); | |
} | |
let mut token_count = self.lex(token_buffer, offset_buffer, buffer_size); | |
for i in 0..token_count { | |
token_buffer[i] = token_buffer[i] >> 16 & 0xfff; | |
} | |
// handle end of file | |
let source_end = self.source_index == self.source_size; | |
if flags != Flags::Chunk && source_end { | |
let state = (self.state.unwrap() / 4) as usize; | |
token_buffer[token_count] = self.eof_table[state]; | |
offset_buffer[token_count] = self.source_size as u32; | |
token_count += 1; | |
} | |
Ok(token_count) | |
} | |
fn lex( | |
&mut self, | |
trans_buffer: &mut [u32], | |
offset_buffer: &mut [u32], | |
buffer_size: isize, | |
) -> usize { | |
let equiv_table = self.equiv_table; | |
let trans_table = self.trans_table; | |
// Can only be called by ulex_lex, which will always set the state | |
let mut state = self.state.unwrap(); | |
let source = &self.source; | |
let mut source_offset = self.source_index as usize; | |
let source_size = self.source_size as usize; | |
let mut buffer_offset = 0 as usize; | |
let buffer_size = buffer_size as usize / 4; | |
while (source_offset < source_size) & (buffer_offset < buffer_size) { | |
let idx = source[source_offset] as usize; | |
let equiv = equiv_table[idx]; | |
let idx = (equiv + state) as usize; | |
let trans = get_middle_index(trans_table, idx); | |
assign_in_the_middle(trans_buffer, buffer_offset, trans); | |
assign_in_the_middle(offset_buffer, buffer_offset, source_offset as u32); | |
state = trans & 0xffff; | |
buffer_offset += (trans >> 29) as usize; | |
source_offset += 1; | |
} | |
self.state = Some(state); | |
self.source_index = source_offset; | |
return buffer_offset / 4; | |
} | |
} | |
fn assign_in_the_middle(buffer: &mut [u32], offset: usize, data: u32) { | |
let ptr = buffer.as_mut_ptr(); | |
unsafe { | |
// Cast the buffers to a char ptr so we can walk the offset by BYTE and not by DWORD | |
*((ptr as *mut char).add(offset) as *mut u32) = data; | |
} | |
} | |
fn get_middle_index(table: &[u32], offset: usize) -> u32 { | |
let table_ptr = table.as_ptr(); | |
unsafe { | |
// Cast the buffers to a char ptr so we can walk the offset by BYTE and not by DWORD | |
*((table_ptr as *const char).add(offset) as *const u32) | |
} | |
} | |
#[derive(PartialEq, Eq)] | |
enum Error { | |
InsufficientBuffer | |
} | |
#[derive(PartialEq, Eq)] | |
enum Flags { | |
Chunk = 0x1 | |
} | |
// Predefined lexer tables | |
// Equiv table, each two rows are sixteen elements | |
static ULEX_EQUIV_TABLE: &'static [u32; 0x100] = &[ | |
0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, | |
0x0, 0xfc, 0x1f8, 0xfc, 0xfc, 0x2f4, 0x0, 0x0, | |
/* Rest of members omitted for brevity */ | |
]; | |
// static ULEX_TRANS_TABLE: &'static [u32; 0x7e0] = todo!(); // Replace with actual table | |
static ULEX_TRANS_TABLE: &'static [u32; 0x1] = &[0x0 as u32]; | |
// static ULEX_EOF_TABLE: &'static [u32; 0x3f] = todo!(); // Replace with actual table | |
static ULEX_EOF_TABLE: &'static [u32; 0x1] = &[0x0 as u32]; |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment