Per Vognsen pervognsen

## vm_bench.c
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <string.h>

// BREAK: executes the x86 `int3` breakpoint instruction at the point of use.
// Spelled `__asm__` rather than `asm` so the macro also compiles in strict ISO
// modes (e.g. -std=c11), where plain `asm` is not recognized as a keyword.
#if defined(__x86_64__)
#define BREAK    __asm__ volatile("int3")
#else
#error Implement macros for your CPU.
#endif

## prefetch_and_hoist.c
// Example: Opcode dispatch in a bytecode VM. Assume the opcode case dispatching is mispredict heavy,
// and that pc, ins, next_ins, next_opcase are always in registers.

// Operand field extractors: each expands to an expression that pulls one 8-bit
// field out of whatever `ins` is in scope at the use site.
#define a (0xFF & (ins >> 8))   // bits 8..15
#define b (0xFF & (ins >> 16))  // bits 16..23
#define c (0xFF & (ins >> 24))  // bits 24..31

// Version 1: Synchronous instruction fetch and opcode dispatch. The big bottleneck is that given how light
// the essential work is for each opcode case (e.g. something like ADD is typical), you're dominated
// by the cost of the opcode dispatch branch mispredicts. When there's a mispredict, the pipeline restarts

## shift_dfa.md

      
              1 file
            
          
              4 forks
            
          
              6 comments
            
          
              93 stars
            
          
                pervognsen
                / shift_dfa.md
            
            
              Last active
              January 27, 2024 19:54
            
              
                Shift-based DFAs
              
          
    A traditional table-based DFA implementation looks like this:
// State transition table: table[s][x] is the state reached from state s on input byte x.
uint8_t table[NUM_STATES][256];

// Runs the table-driven DFA over the bytes in [start, end), beginning in `state`.
// Returns the state reached after the last byte; an empty range returns `state` unchanged.
uint8_t run(const uint8_t *start, const uint8_t *end, uint8_t state) {
    const uint8_t *cursor = start;
    while (cursor != end) {
        // One table lookup per input byte: (current state, byte) -> next state.
        state = table[state][*cursor];
        cursor++;
    }
    return state;
}


## pratt_cps.c
// Direct style

// Prefix handler for a parenthesized expression: steps past the opening
// parenthesis, evaluates the enclosed expression via binary() with a minimum
// precedence of -1, requires a closing RPAREN, and returns the inner value.
int paren_prefix(Ctx *c) {
    next(c); // step past LPAREN
    const int inner = binary(c, -1);
    expect(c, RPAREN);
    return inner;
}

int binary(Ctx *c, int min_prec) {

## remotery_code.c
#pragma section(".remotery_code", read, execute)

__declspec(allocate(".remotery_code"))
static const uint8_t code[] =
{
    0x50,                                           // push rax
    0x9C,                                           // pushfq
    0x53,                                           // push rbx
    0x51,                                           // push rcx
    0x52,                                           // push rdx

## algebraic_data_type_numeral_encoding.md

      
              1 file
            
          
              0 forks
            
          
              0 comments
            
          
              5 stars
            
          
                pervognsen
                / algebraic_data_type_numeral_encoding.md
            
            
              Last active
              January 13, 2024 00:48
            
          
    (Originally written as a reply to an HN submission of this article: https://www.cs.virginia.edu/~lat7h/blog/posts/434.html)
There's a simple recipe for arithmetically encoding recursive algebraic data types (in the functional programming sense) which is related to this.
What you might have seen is Goedel numbering where a finite sequence of natural numbers a_0, a_1, ..., a_n (where n isn't fixed but can vary per sequence) is mapped bijectively onto p_0^a_0 p_1^a_1 ... p_n^a_n where p_0, p_1, ... is an enumeration of the primes.
However, if you want to represent trees instead of sequences, you have a better, simpler option. The key is the existence of a bijective pairing function between N^2 and N, which you can write as <m, n> for m, n in N.
You have a lot of choices for how to construct the pairing function. But a curious fact is that there is essentially one polynomial pairing function and it's the one you saw in class when you learned that the rationals are countable: https://en.wikipedia.org/wiki/Fuet

  
## ast_intern.c
#include <assert.h>

#include <tuple>
#include <vector>
#include <string>

// Str is a 32-bit unsigned handle type.
// NOTE(review): presumably an index into `strs` identifying an interned string -- confirm against the rest of the file.
typedef uint32_t Str;

// Global list of C-string pointers.
// NOTE(review): presumably the table that Str values refer into -- confirm.
std::vector<const char*> strs;

## assert_default_params.c
// Runtime backend for the assertion macros: does nothing when `cond` is
// nonzero; otherwise prints "assert failed: <msg>" to stderr and aborts.
void assert_func(int cond, const char *msg) {
    if (cond) {
        return;
    }
    fprintf(stderr, "assert failed: %s\n", msg);
    abort();
}

// assert(x) / assert(x, msg): assertion with an optional custom message.
//   assert(x)      expands to assert_func((x), "x")  -- the stringized condition is the message
//   assert(x, msg) expands to assert_func((x), msg)  -- the caller's message is used
// How it works: assert appends #x after any extra arguments, and assert_helper keeps
// only its first two arguments, so #x serves as the default when no message is given.
// `, ## __VA_ARGS__` is the GNU comma-swallowing extension: the comma is dropped when
// no extra arguments are supplied.
// NOTE(review): this reuses the standard `assert` name; presumably <assert.h> is not
// meant to be included alongside it -- confirm.
#define assert_helper(x, y, ...) assert_func(x, y)
#define assert(x, ...) assert_helper((x), ## __VA_ARGS__, #x)

## segregated_tables.c
// Length-segregated string tables for length < 16. You use a separate overflow table for length >= 16.
// By segregating like this you can pack the string data in the table itself tightly without any padding. The datapath
// is uniform and efficient for all lengths < 16 by using unaligned 16-byte SIMD loads/compares and masking off the length prefix.

// One of the benefits of packing string data tightly for each length table is that you can afford to reduce the load factor
// on shorter length tables without hurting space utilization too much. This can push hole-in-one rates into the 95% range without
// too much of a negative impact on cache utilization.

// Since get() takes a vector register as an argument with the key, you want to shape the upstream code so the string to be queried
// is naturally in a vector. For example, in an optimized identifier lexer you should already have a SIMD fast path for length < 16

## ast_tagged_index.c
// We have five kinds of nodes: literal, negate, not, add, xor.
// In this case we only need 3 bits for the tag but you can use as many as you need.

// Node kind tag. Values are consecutive from 0, so the five kinds fit in 3 bits.
enum Tag {
    LIT = 0, // literal
    NEG = 1, // negate
    NOT = 2, // not
    ADD = 3, // add
    XOR = 4, // xor
};
	#include <stdio.h>
	#include <stdlib.h>
	#include <stdint.h>
	#include <string.h>

	// BREAK: executes the x86 `int3` breakpoint instruction at the point of use.
	// Spelled `__asm__` rather than `asm` so the macro also compiles in strict ISO
	// modes (e.g. -std=c11), where plain `asm` is not recognized as a keyword.
	#if defined(__x86_64__)
	#define BREAK __asm__ volatile("int3")
	#else
	#error Implement macros for your CPU.
	#endif
	// Example: Opcode dispatch in a bytecode VM. Assume the opcode case dispatching is mispredict heavy,
	// and that pc, ins, next_ins, next_opcase are always in registers.

	// Operand field extractors: each expands to an expression that pulls one 8-bit
	// field out of whatever `ins` is in scope at the use site.
	#define a (0xFF & (ins >> 8))   // bits 8..15
	#define b (0xFF & (ins >> 16))  // bits 16..23
	#define c (0xFF & (ins >> 24))  // bits 24..31

	// Version 1: Synchronous instruction fetch and opcode dispatch. The big bottleneck is that given how light
	// the essential work is for each opcode case (e.g. something like ADD is typical), you're dominated
	// by the cost of the opcode dispatch branch mispredicts. When there's a mispredict, the pipeline restarts
	// Direct style

	// Prefix handler for a parenthesized expression: steps past the opening
	// parenthesis, evaluates the enclosed expression via binary() with a minimum
	// precedence of -1, requires a closing RPAREN, and returns the inner value.
	int paren_prefix(Ctx *c) {
		next(c); // step past LPAREN
		const int inner = binary(c, -1);
		expect(c, RPAREN);
		return inner;
	}

	int binary(Ctx *c, int min_prec) {
	#pragma section(".remotery_code", read, execute)

	__declspec(allocate(".remotery_code"))
	static const uint8_t code[] =
	{
	0x50, // push rax
	0x9C, // pushfq
	0x53, // push rbx
	0x51, // push rcx
	0x52, // push rdx
	#include <assert.h>

	#include <tuple>
	#include <vector>
	#include <string>

	// Str is a 32-bit unsigned handle type.
	// NOTE(review): presumably an index into `strs` identifying an interned string -- confirm against the rest of the file.
	typedef uint32_t Str;

	// Global list of C-string pointers.
	// NOTE(review): presumably the table that Str values refer into -- confirm.
	std::vector<const char*> strs;
	// Runtime backend for the assertion macros: does nothing when `cond` is
	// nonzero; otherwise prints "assert failed: <msg>" to stderr and aborts.
	void assert_func(int cond, const char *msg) {
		if (cond) {
			return;
		}
		fprintf(stderr, "assert failed: %s\n", msg);
		abort();
	}

	// assert(x) / assert(x, msg): assertion with an optional custom message.
	//   assert(x)      expands to assert_func((x), "x")  -- the stringized condition is the message
	//   assert(x, msg) expands to assert_func((x), msg)  -- the caller's message is used
	// How it works: assert appends #x after any extra arguments, and assert_helper keeps
	// only its first two arguments, so #x serves as the default when no message is given.
	// `, ## __VA_ARGS__` is the GNU comma-swallowing extension: the comma is dropped when
	// no extra arguments are supplied.
	// NOTE(review): this reuses the standard `assert` name; presumably <assert.h> is not
	// meant to be included alongside it -- confirm.
	#define assert_helper(x, y, ...) assert_func(x, y)
	#define assert(x, ...) assert_helper((x), ## __VA_ARGS__, #x)
	// Length-segregated string tables for length < 16. You use a separate overflow table for length >= 16.
	// By segregating like this you can pack the string data in the table itself tightly without any padding. The datapath
	// is uniform and efficient for all lengths < 16 by using unaligned 16-byte SIMD loads/compares and masking off the length prefix.

	// One of the benefits of packing string data tightly for each length table is that you can afford to reduce the load factor
	// on shorter length tables without hurting space utilization too much. This can push hole-in-one rates into the 95% range without
	// too much of a negative impact on cache utilization.

	// Since get() takes a vector register as an argument with the key, you want to shape the upstream code so the string to be queried
	// is naturally in a vector. For example, in an optimized identifier lexer you should already have a SIMD fast path for length < 16
	// We have five kinds of nodes: literal, negate, not, add, xor.
	// In this case we only need 3 bits for the tag but you can use as many as you need.

	// Node kind tag. Values are consecutive from 0, so the five kinds fit in 3 bits.
	enum Tag {
		LIT = 0, // literal
		NEG = 1, // negate
		NOT = 2, // not
		ADD = 3, // add
		XOR = 4, // xor
	};