Tristan Hume trishume

## gist:4e85cb3f315583a18d748b8e7f36c924
// Merge pass
static void merge_pass(S16 *out, const S16 *inA, const S16 *inB, size_t elemsPerRun)
{
    // need pow2 elemsPerRun>=16!
    const S16 *endA = inA + elemsPerRun;
    const S16 *endB = inB + elemsPerRun;
    Vec vMin0 = load8_s16(inA + 0);
    Vec vMin1 = load8_s16(inA + 8);
    Vec vMax0 = load8_s16(inB + 0);
    Vec vMax1 = load8_s16(inB + 8);

## rh_grow.c
// This can grow a Robin Hood linear probing hash table near word-at-a-time memcpy speeds. If you're confused why I use 'keys'
// to describe the hash values, it's because my favorite perspective on Robin Hood (which I learned from Paul Khuong)
// is that it's just a sorted gap array which is MSB bucketed and insertion sorted per chain:
// https://pvk.ca/Blog/2019/09/29/a-couple-of-probabilistic-worst-case-bounds-for-robin-hood-linear-probing/
// The more widely known "max displacement" picture of Robin Hood hashing also has strengths since the max displacement
// can be stored very compactly. You can see a micro-optimized example of that here for small tables where the max displacement
// can fit in 4 bits: Sub-nanosecond Searches Using Vector Instructions, https://www.youtube.com/watch?v=paxIkKBzqBU
void grow(Table *table) {
	u64 exp = 64 - table->shift;
	// We grow the table downward in place by a factor of 2 (not counting the overflow area at table->end).

## tagged_prefetched_eval.c
INLINE
u32 index(Ref r) {
	return r >> 4;
}

INLINE
u32 tag(Ref r) {
	return r & 15;
}

## vm_bench.c
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <string.h>

#if defined(__x86_64__)
#define BREAK    asm("int3")
#else
#error Implement macros for your CPU.
#endif

## ast_intern.c
#include <assert.h>

#include <tuple>
#include <vector>
#include <string>

typedef uint32_t Str;

std::vector<const char*> strs;

## segregated_tables.c
// Length-segregated string tables for length < 16. You use a separate overflow table for length >= 16.
// By segregating like this you can pack the string data in the table itself tightly without any padding. The datapath
// is uniform and efficient for all lengths < 16 by using unaligned 16-byte SIMD loads/compares and masking off the length prefix.

// One of the benefits of packing string data tightly for each length table is that you can afford to reduce the load factor
// on shorter length tables without hurting space utilization too much. This can push hole-in-one rates into the 95% range without
// too much of a negative impact on cache utilization.

// Since get() takes a vector register as an argument with the key, you want to shape the upstream code so the string to be queried
// is naturally in a vector. For example, in an optimized identifier lexer you should already have a SIMD fast path for length < 16

## linear_gc.c
// Linear-scan mark-compact collector for causal data structures, i.e. the reference graph is acyclic and the objects
// are ordered by age in the heap (which happens if you use a linear allocator) so they are automatically topologically sorted.
// This algorithm has very high memory-level parallelism which can be exploited by modern out-of-order processors, and should
// be memory throughput limited.
void collect(void) {
    // Initialize marks from roots.
    memset(marks, 0, num_nodes * sizeof(uint32_t));
    int newest = 0, oldest = num_nodes;
    for (int i = 0; i < num_roots; i++) {
        marks[roots[i]] = 1;

## perfold.c
// Heavily based on ideas from https://github.com/LuaJIT/LuaJIT/blob/v2.1/src/lj_opt_fold.c
// The most fundamental deviation is that I eschew the big hash table and the lj_opt_fold()
// trampoline for direct tail calls. The biggest problem with a trampoline is that you lose
// the control flow context. Another problem is that there's too much short-term round-tripping
// of data through memory. It's also easier to do ad-hoc sharing between rules with my approach.
// From what I can tell, it also isn't possible to do general reassociation with LJ's fold engine
// since that requires non-tail recursion, so LJ does cases like (x + n1) + n2 => x + (n1 + n2)
// but not (x + n1) + (y + n2) => x + (y + (n1 + n2)) which is common in address generation. The
// code below has some not-so-obvious micro-optimizations for register passing and calling conventions,
// e.g. the unary_cse/binary_cse parameter order, the use of long fields in ValueRef.

## chain_tree.py
# Here's a probably-not-new data structure I discovered after implementing weight-balanced trees with
# amortized subtree rebuilds (e.g. http://jeffe.cs.illinois.edu/teaching/algorithms/notes/10-scapegoat-splay.pdf)
# and realizing it was silly to turn a subtree into a sorted array with an in-order traversal only as an
# aid in constructing a balanced subtree in linear time. Why not replace the subtree by the sorted array and
# use binary search when hitting that leaf array? Then you'd defer any splitting of the array until inserts and
# deletes hit that leaf array. Only in hindsight did I realize this is similar to the rope data structure for strings.
# Unlike ropes, it's a key-value search tree for arbitrary ordered keys.
#
# The main attraction of this data structure is its simplicity (on par with a standard weight-balanced tree) and that it
# coalesces subtrees into contiguous arrays, which reduces memory overhead and boosts the performance of in-order traversals

## rpn_serialize.c
typedef enum {
    CMD_INT = NODE_INT,
    CMD_NEG = NODE_NEG,
    CMD_NOT = NODE_NOT,
    CMD_ADD = NODE_ADD,
    CMD_SUB = NODE_SUB,
    CMD_RET = 128,
    CMD_SETREF,
    CMD_GETREF,
} Cmd;
	// Merge pass
	static void merge_pass(S16 out, const S16 inA, const S16 *inB, size_t elemsPerRun)
	{
	// need pow2 elemsPerRun>=16!
	const S16 *endA = inA + elemsPerRun;
	const S16 *endB = inB + elemsPerRun;
	Vec vMin0 = load8_s16(inA + 0);
	Vec vMin1 = load8_s16(inA + 8);
	Vec vMax0 = load8_s16(inB + 0);
	Vec vMax1 = load8_s16(inB + 8);
	// This can grow a Robin Hood linear probing hash table near word-at-a-time memcpy speeds. If you're confused why I use 'keys'
	// to describe the hash values, it's because my favorite perspective on Robin Hood (which I learned from Paul Khuong)
	// is that it's just a sorted gap array which is MSB bucketed and insertion sorted per chain:
	// https://pvk.ca/Blog/2019/09/29/a-couple-of-probabilistic-worst-case-bounds-for-robin-hood-linear-probing/
	// The more widely known "max displacement" picture of Robin Hood hashing also has strengths since the max displacement
	// can be stored very compactly. You can see a micro-optimized example of that here for small tables where the max displacement
	// can fit in 4 bits: Sub-nanosecond Searches Using Vector Instructions, https://www.youtube.com/watch?v=paxIkKBzqBU
	void grow(Table *table) {
	u64 exp = 64 - table->shift;
	// We grow the table downward in place by a factor of 2 (not counting the overflow area at table->end).
	INLINE
	u32 index(Ref r) {
	return r >> 4;
	}

	INLINE
	u32 tag(Ref r) {
	return r & 15;
	}
	#include <stdio.h>
	#include <stdlib.h>
	#include <stdint.h>
	#include <string.h>

	#if defined(__x86_64__)
	#define BREAK asm("int3")
	#else
	#error Implement macros for your CPU.
	#endif
	#include <assert.h>

	#include <tuple>
	#include <vector>
	#include <string>

	typedef uint32_t Str;

	std::vector<const char*> strs;
	// Length-segregated string tables for length < 16. You use a separate overflow table for length >= 16.
	// By segregating like this you can pack the string data in the table itself tightly without any padding. The datapath
	// is uniform and efficient for all lengths < 16 by using unaligned 16-byte SIMD loads/compares and masking off the length prefix.

	// One of the benefits of packing string data tightly for each length table is that you can afford to reduce the load factor
	// on shorter length tables without hurting space utilization too much. This can push hole-in-one rates into the 95% range without
	// too much of a negative impact on cache utilization.

	// Since get() takes a vector register as an argument with the key, you want to shape the upstream code so the string to be queried
	// is naturally in a vector. For example, in an optimized identifier lexer you should already have a SIMD fast path for length < 16
	// Linear-scan mark-compact collector for causal data structures, i.e. the reference graph is acyclic and the objects
	// are ordered by age in the heap (which happens if you use a linear allocator) so they are automatically topologically sorted.
	// This algorithm has very high memory-level parallelism which can be exploited by modern out-of-order processors, and should
	// be memory throughput limited.
	void collect(void) {
	// Initialize marks from roots.
	memset(marks, 0, num_nodes * sizeof(uint32_t));
	int newest = 0, oldest = num_nodes;
	for (int i = 0; i < num_roots; i++) {
	marks[roots[i]] = 1;
	// Heavily based on ideas from https://github.com/LuaJIT/LuaJIT/blob/v2.1/src/lj_opt_fold.c
	// The most fundamental deviation is that I eschew the big hash table and the lj_opt_fold()
	// trampoline for direct tail calls. The biggest problem with a trampoline is that you lose
	// the control flow context. Another problem is that there's too much short-term round-tripping
	// of data through memory. It's also easier to do ad-hoc sharing between rules with my approach.
	// From what I can tell, it also isn't possible to do general reassociation with LJ's fold engine
	// since that requires non-tail recursion, so LJ does cases like (x + n1) + n2 => x + (n1 + n2)
	// but not (x + n1) + (y + n2) => x + (y + (n1 + n2)) which is common in address generation. The
	// code below has some not-so-obvious micro-optimizations for register passing and calling conventions,
	// e.g. the unary_cse/binary_cse parameter order, the use of long fields in ValueRef.
	# Here's a probably-not-new data structure I discovered after implementing weight-balanced trees with
	# amortized subtree rebuilds (e.g. http://jeffe.cs.illinois.edu/teaching/algorithms/notes/10-scapegoat-splay.pdf)
	# and realizing it was silly to turn a subtree into a sorted array with an in-order traversal only as an
	# aid in constructing a balanced subtree in linear time. Why not replace the subtree by the sorted array and
	# use binary search when hitting that leaf array? Then you'd defer any splitting of the array until inserts and
	# deletes hit that leaf array. Only in hindsight did I realize this is similar to the rope data structure for strings.
	# Unlike ropes, it's a key-value search tree for arbitrary ordered keys.
	#
	# The main attraction of this data structure is its simplicity (on par with a standard weight-balanced tree) and that it
	# coalesces subtrees into contiguous arrays, which reduces memory overhead and boosts the performance of in-order traversals
	typedef enum {
	CMD_INT = NODE_INT,
	CMD_NEG = NODE_NEG,
	CMD_NOT = NODE_NOT,
	CMD_ADD = NODE_ADD,
	CMD_SUB = NODE_SUB,
	CMD_RET = 128,
	CMD_SETREF,
	CMD_GETREF,
	} Cmd;