Per Vognsen pervognsen

## paren_repr.py
# I happened to be looking at some of Cranelift's code, and I noticed that their constant-time dominates()
# check was using a somewhat more ad-hoc version of a hidden gem from the data structures literature called the
# parenthesis representation for trees. As far as I know, this was invented by Jacobson in his 1989 paper
# Space-Efficient Static Trees and Graphs. I first learned about it from the slightly later paper by Munro and Raman
# called Succinct Representations of Balanced Parentheses and Static Trees. I figured I'd give it an extremely
# quick intro and then show how it leads to a (slightly better) version of Cranelift's algorithm.
#
# This parenthesis representation of trees is surprisingly versatile, but its most striking feature is that
# it lets us query the ancestor relationship between two nodes in a tree in constant time, with a few instructions.
# And the idea is extremely simple and intuitive if you just draw the right kind of picture.

## RestartToCursorCommand.cs
// This is "restart to cursor", a complement to VS's "run to cursor" which will first restart (and recompile if you have
// it set up like that) the currently running program (if any) and then execute the "run to cursor" command.
// To make an extension on your own, create a C# VSIX template, then Add Item -> Extensibility -> Command and replace the
// file with the contents of this Gist. I bind it to Ctrl-Shift-F10 since Ctrl-F10 is "run to cursor".

using EnvDTE80;
using EnvDTE;
using Microsoft.VisualStudio.Shell;
using Microsoft.VisualStudio.Shell.Interop;
using System;

## vs_cpp_setup.md

      
              1 file
            
          
              1 fork
            
          
              0 comments
            
          
              9 stars
            
          
                pervognsen
                / vs_cpp_setup.md
            
            
              Last active
              October 9, 2023 11:10
            
              
                My Visual Studio setup for C++ development
              
          
    I recently upgraded to Windows 11 and had to set up Visual Studio C++ again. A few things have
changed in Windows 11 with the tiled window management and deeper Windows Terminal integration,
so I ended up revisiting my usual VS setup and made a bunch of changes. I'm documenting them here
for my own future benefit (yes, I do know about VS import/export settings) and on the off chance
that anyone else might get some ideas for their own setup.
It should be mentioned that I'm a single monitor user (multimon gives me neck pain as I've gotten
older) and the monitor I currently use is 25". A laptop screen can't really accommodate the 2x2
full screen layout I'm using as my default here, so when I'm on a laptop I only use the
vertical layout out of necessity.

  
## fp_aliasing_semantics.md

      
              1 file
            
          
              0 forks
            
          
              0 comments
            
          
              0 stars
            
          
                pervognsen
                / fp_aliasing_semantics.md
            
            
              Created
              July 30, 2021 04:21
            
          
    x64 addss:
VADDSS (EVEX encoded versions)
    IF (EVEX.b = 1) AND SRC2 *is a register*
        THEN
            SET_RM(EVEX.RC);
        ELSE
            SET_RM(MXCSR.RM);
    FI;

IF k1[0] or no writemask

  
## multistep.asm
multistep:                              # @multistep
        push    r15
        push    r14
        push    r13
        push    r12
        push    rbx
        mov     rcx, qword ptr [rsi]
        mov     r10, qword ptr [rsi + 8]
        mov     rdx, qword ptr [rsi + 24]
        mov     r11, qword ptr [rsi + 32]

## puzzle.c
// For N = 11, a call to perms(p, N, 0) only takes 0.36 seconds.
// Why would a debugger seemingly hang forever when you try to
// step over the outermost recursive call to perms in the loop?
// Hint: How is "step over" ("next" in gdb/lldb) implemented?

u64 perms(u64 p[N], int n, u64 s) {
	if (n == 1) {
		s += signature(p);
	} else {
		for (int i = 0; i < n; i++) {

## rh_grow.c
// This can grow a Robin Hood linear probing hash table near word-at-a-time memcpy speeds. If you're confused why I use 'keys'
// to describe the hash values, it's because my favorite perspective on Robin Hood (which I learned from Paul Khuong)
// is that it's just a sorted gap array which is MSB bucketed and insertion sorted per chain:
// https://pvk.ca/Blog/2019/09/29/a-couple-of-probabilistic-worst-case-bounds-for-robin-hood-linear-probing/
// The more widely known "max displacement" picture of Robin Hood hashing also has strengths since the max displacement
// can be stored very compactly. You can see a micro-optimized example of that here for small tables where the max displacement
// can fit in 4 bits: Sub-nanosecond Searches Using Vector Instructions, https://www.youtube.com/watch?v=paxIkKBzqBU
void grow(Table *table) {
	u64 exp = 64 - table->shift;
	// We grow the table downward in place by a factor of 2 (not counting the overflow area at table->end).

## tagged_prefetched_eval.c
// Returns the part of a Ref that remains after shifting out its low 4 bits.
INLINE
u32 index(Ref r) {
	u32 idx = r >> 4;
	return idx;
}

// Returns the low 4 bits of a Ref (a value in the range 0..15).
INLINE
u32 tag(Ref r) {
	u32 low_bits = r & 0xF;
	return low_bits;
}

## value_speculation.c
// Estimating CPU frequency...
// CPU frequency: 4.52 GHz
// sum1: value = 15182118497126522709, 0.31 secs, 5.14 cycles/elem
// sum2: value = 15182118497126522709, 0.17 secs, 2.93 cycles/elem

// RW(x): an inline-asm statement with an empty template that lists x as a
// read-write register operand ("+r"). The macro contributes no instruction text.
// NOTE(review): presumably used so the compiler must treat x as possibly modified
// at this point and cannot reason about its value across it -- confirm at the use site.
#define RW(x) asm("" : "+r"(x))

typedef struct Node {
	u64 value;
	struct Node *next;

## mispredict_penalty.c
// NOP: a GNU statement expression that declares a scratch int x, issues a "nop"
// asm listing x as a register output ("=r"), then an empty asm listing x as a
// register input ("r").
// NOTE(review): the second asm presumably exists only to consume x so that the
// first asm's output is used and the nop is not discarded -- confirm.
#define NOP ({ int x; asm("nop" : "=r"(x)); asm("" : : "r"(x)); })

NOINLINE
double calc_mispredict_penalty(void) {
	i64 n = 1ull << 28;

	u64 rnglen = (n + 63) / 64;
	u64 *rng = malloc(rnglen * sizeof(u64));
	for (u64 i = 0; i < rnglen; i++)
		rng[i] = random_u64();
	# I happened to be looking at some of Cranelift's code, and I noticed that their constant-time dominates()
	# check was using a somewhat more ad-hoc version of a hidden gem from the data structures literature called the
	# parenthesis representation for trees. As far as I know, this was invented by Jacobson in his 1989 paper
	# Space-Efficient Static Trees and Graphs. I first learned about it from the slightly later paper by Munro and Raman
	# called Succinct Representations of Balanced Parentheses and Static Trees. I figured I'd give it an extremely
	# quick intro and then show how it leads to a (slightly better) version of Cranelift's algorithm.
	#
	# This parenthesis representation of trees is surprisingly versatile, but its most striking feature is that
	# it lets us query the ancestor relationship between two nodes in a tree in constant time, with a few instructions.
	# And the idea is extremely simple and intuitive if you just draw the right kind of picture.
	// This is "restart to cursor", a complement to VS's "run to cursor" which will first restart (and recompile if you have
	// it set up like that) the currently running program (if any) and then execute the "run to cursor" command.
	// To make an extension on your own, create a C# VSIX template, then Add Item -> Extensibility -> Command and replace the
	// file with the contents of this Gist. I bind it to Ctrl-Shift-F10 since Ctrl-F10 is "run to cursor".

	using EnvDTE80;
	using EnvDTE;
	using Microsoft.VisualStudio.Shell;
	using Microsoft.VisualStudio.Shell.Interop;
	using System;
	multistep: # @multistep
	push r15
	push r14
	push r13
	push r12
	push rbx
	mov rcx, qword ptr [rsi]
	mov r10, qword ptr [rsi + 8]
	mov rdx, qword ptr [rsi + 24]
	mov r11, qword ptr [rsi + 32]
	// For N = 11, a call to perms(p, N, 0) only takes 0.36 seconds.
	// Why would a debugger seemingly hang forever when you try to
	// step over the outermost recursive call to perms in the loop?
	// Hint: How is "step over" ("next" in gdb/lldb) implemented?

	u64 perms(u64 p[N], int n, u64 s) {
	if (n == 1) {
	s += signature(p);
	} else {
	for (int i = 0; i < n; i++) {
	// This can grow a Robin Hood linear probing hash table near word-at-a-time memcpy speeds. If you're confused why I use 'keys'
	// to describe the hash values, it's because my favorite perspective on Robin Hood (which I learned from Paul Khuong)
	// is that it's just a sorted gap array which is MSB bucketed and insertion sorted per chain:
	// https://pvk.ca/Blog/2019/09/29/a-couple-of-probabilistic-worst-case-bounds-for-robin-hood-linear-probing/
	// The more widely known "max displacement" picture of Robin Hood hashing also has strengths since the max displacement
	// can be stored very compactly. You can see a micro-optimized example of that here for small tables where the max displacement
	// can fit in 4 bits: Sub-nanosecond Searches Using Vector Instructions, https://www.youtube.com/watch?v=paxIkKBzqBU
	void grow(Table *table) {
	u64 exp = 64 - table->shift;
	// We grow the table downward in place by a factor of 2 (not counting the overflow area at table->end).
	// Returns the part of a Ref that remains after shifting out its low 4 bits.
	INLINE
	u32 index(Ref r) {
		u32 idx = r >> 4;
		return idx;
	}

	// Returns the low 4 bits of a Ref (a value in the range 0..15).
	INLINE
	u32 tag(Ref r) {
		u32 low_bits = r & 0xF;
		return low_bits;
	}
	// Estimating CPU frequency...
	// CPU frequency: 4.52 GHz
	// sum1: value = 15182118497126522709, 0.31 secs, 5.14 cycles/elem
	// sum2: value = 15182118497126522709, 0.17 secs, 2.93 cycles/elem

	// RW(x): an inline-asm statement with an empty template that lists x as a
	// read-write register operand ("+r"). The macro contributes no instruction text.
	// NOTE(review): presumably used so the compiler must treat x as possibly modified
	// at this point and cannot reason about its value across it -- confirm at the use site.
	#define RW(x) asm("" : "+r"(x))

	typedef struct Node {
	u64 value;
	struct Node *next;
	// NOP: a GNU statement expression that declares a scratch int x, issues a "nop"
	// asm listing x as a register output ("=r"), then an empty asm listing x as a
	// register input ("r").
	// NOTE(review): the second asm presumably exists only to consume x so that the
	// first asm's output is used and the nop is not discarded -- confirm.
	#define NOP ({ int x; asm("nop" : "=r"(x)); asm("" : : "r"(x)); })

	NOINLINE
	double calc_mispredict_penalty(void) {
	i64 n = 1ull << 28;

	u64 rnglen = (n + 63) / 64;
	u64 *rng = malloc(rnglen * sizeof(u64));
	for (u64 i = 0; i < rnglen; i++)
	rng[i] = random_u64();