Skip to content

Instantly share code, notes, and snippets.

@pervognsen

pervognsen/ast_intern.c

Last active Jun 13, 2021
Embed
What would you like to do?
#include <assert.h>
#include <tuple>
#include <vector>
#include <string>
typedef uint32_t Str;
std::vector<const char*> strs;
enum { STR_BIAS = 1 << 21 };
// The different bitwise tagging schemes combine to give an inline literal encoding of all three-character tokens
// with tight or loose whitespace trivia, which covers all operator tokens, several keywords and short identifiers.
Str strn(const char *s, size_t n) {
if (n <= 3) {
uint32_t i = 0;
if (n >= 1) i |= s[0];
if (n >= 2) i |= s[1] << 7;
if (n >= 3) i |= s[2] << 14;
return i;
}
uint32_t i;
// In a real implementation this would be a hash table. One of my experimental ideas is to use content-defined
// chunking to define cut-points based on rolling hashes and then split and cons recursively to promote sharing.
for (i = 0; i < strs.size(); i++)
if (strncmp(strs[i], s, n) == 0) return i + STR_BIAS;
strs.push_back(_strdup(s));
return i + STR_BIAS;
}
Str str(const char *s) {
return strn(s, strlen(s));
}
// Since the AST is immutable, we can do typeless bitwise sharing of data. You could also use unaligned,
// overlapping substring matching like an LZ77 dictionary compressor. In practice you'd want to have more
// than just a 2:1 compressor but it's a universal interning primitive and it promotes data sharing, so if
// nothing else it's a good demonstration of hash consing as recursive dictionary compression.
std::vector<uint64_t> heap(1);
// cons is a 2:1 data compressor when (car, cdr) is already in the heap. You could dedup swapped pairs by
// storing them ordered in the heap and stealing a tag bit to store the direction so you can swap back in car/cdr.
uint32_t cons(uint32_t car, uint32_t cdr) {
uint64_t p = (uint64_t)car | ((uint64_t)cdr << 32);
uint32_t i;
// A real implementation would speed up the dedup match search with a hash table. But the hash table isn't needed
// to navigate the heap, and you can always rebuild it from the heap in linear time if you've thrown it away.
for (i = 1; i < heap.size(); i++)
if (heap[i] == p) return i;
heap.push_back(p);
return i;
}
uint64_t uncons(uint32_t i) {
return heap[i];
}
uint32_t car(uint32_t i) {
return (uint32_t)heap[i];
}
uint32_t cdr(uint32_t i) {
return (uint32_t)(heap[i] >> 32);
}
uint32_t bitcons(uint32_t n, uint32_t car, uint32_t cdr) {
// If you want to enforce powers of two: assert((n & (n - 1)) == 0);
assert(car < n);
assert(cdr < (1ull << 32) / n - (n - 1));
return car + cdr * n;
}
uint32_t bitcar(uint32_t n, uint32_t p) {
return p % n;
}
uint32_t bitcdr(uint32_t n, uint32_t p) {
return p / n;
}
typedef uint32_t Trivia;
Trivia trivia(Str lhs, Str rhs) {
if (rhs == 0) return bitcons(2, 0, lhs); // Empty right trivia
return bitcons(2, 1, cons(lhs, rhs));
}
typedef uint32_t Token;
Token token(Str str, Trivia triv) {
if (triv == 0) return bitcons(3, 0, str); // Tight (nothing on either side)
if (triv == 64) return bitcons(3, 1, str); // Loose (one space on left, nothing on right)
return bitcons(3, 2, cons(str, triv));
}
typedef uint32_t Expr;
Expr name(Token tok) {
return bitcons(3, 0, tok);
}
Expr unary(Token op, Expr lhs) {
return bitcons(3, 1, cons(op, lhs));
}
Expr binary(Token op, Expr lhs, Expr rhs) {
return bitcons(3, 2, cons(op, cons(lhs, rhs)));
}
int main(int argc, char** argv) {
auto empty = str("");
auto space = str(" ");
auto tight = trivia(empty, empty);
auto loose = trivia(space, empty);
auto x = name(token(str("x"), tight));
auto y = name(token(str("y"), loose));
auto plus = token(str("+"), loose);
auto add_xy = binary(plus, x, y);
auto add_xy2 = binary(token(str("+"), loose), name(token(str("x"), tight)), name(token(str("y"), loose)));
assert(add_xy == add_xy2);
auto xor_xy = binary(token(str("^"), loose), x, y);
assert(xor_xy != add_xy);
assert(bitcar(3, add_xy) == bitcar(3, xor_xy));
assert(cdr(bitcdr(3, add_xy)) == cdr(bitcdr(3, xor_xy)));
return 0;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment