Denys Mentiei den-mentiei

## win32_webgpu.c
// example how to set up WebGPU rendering on Windows in C
// uses Dawn implementation of WebGPU: https://dawn.googlesource.com/dawn/
// download pre-built Dawn webgpu.h/dll/lib files from https://github.com/mmozeiko/build-dawn/releases/latest

#include "webgpu.h"

#define WIN32_LEAN_AND_MEAN
#include <windows.h>

#define _USE_MATH_DEFINES

## paren_repr.py
# I happened to be looking at some of Cranelift's code, and I noticed that their constant-time dominates()
# check was using a somewhat more ad-hoc version of a hidden gem from the data structures literature called the
# parenthesis representation for trees. As far as I know, this was invented by Jacobson in his 1989 paper
# Space-Efficient Static Trees and Graphs. I first learned about it from the slightly later paper by Munro and Raman
# called Succinct Representations of Balanced Parentheses and Static Trees. I figured I'd give it an extremely
# quick intro and then show how it leads to a (slightly better) version of Cranelift's algorithm.
#
# This parenthesis representation of trees is surprisingly versatile, but its most striking feature is that
# it lets us query the ancestor relationship between two nodes in a tree in constant time, with a few instructions.
# And the idea is extremely simple and intuitive if you just draw the right kind of picture.

## zmk_mouse_support.md

      
              1 file
            
          
              2 forks
            
          
              24 comments
            
          
              36 stars
            
          
                krissen
                / zmk_mouse_support.md
            
            
              Last active
              July 4, 2024 22:54
            
          
    Edit 2024-02-05:
The instructions below are outdated.
There is an official PR (#2027) for mouse functionality. You should use that rather than what is described below.
See comments [1, 2] below.
ZMK mouse support


## atomic stuff
partial ordering: time/events are related to each other as graphs of dependencies
total ordering: time/events are related to each other as a sequence of steps

atomic op: op which is observed to either happen fully or not at all (no in-between)
atomic variable: memory location which atomic ops happen to
data race: non-atomic ops from 2+ threads on same memory location where one op is a write

load: an atomic read to observe a value
store: an atomic write to publish a value
read-modify-write (rmw): a read, updating the value, then a write, all atomically

## tls_client.c
#define WIN32_LEAN_AND_MEAN
#include <winsock2.h>
#include <windows.h>
#define SECURITY_WIN32
#include <security.h>
#include <schannel.h>
#include <shlwapi.h>
#include <assert.h>
#include <stdio.h>

## multigetbits.cpp
static inline __m128i prefix_sum_u8(__m128i x)
{
#if 1
	// alternative form that uses shifts, not the general shuffle network on port 5 (which is a bottleneck
	// for us)
	x = _mm_add_epi8(x, _mm_slli_epi64(x, 8));
	x = _mm_add_epi8(x, _mm_slli_epi64(x, 16));
	x = _mm_add_epi8(x, _mm_slli_epi64(x, 32));
	x = _mm_add_epi8(x, _mm_shuffle_epi8(x, _mm_setr_epi8(-1,-1,-1,-1,-1,-1,-1,-1, 7,7,7,7,7,7,7,7)));
#else

## ast_intern.c
#include <assert.h>

#include <tuple>
#include <vector>
#include <string>

typedef uint32_t Str;

std::vector<const char*> strs;

## lex_parse.c
// Lexer

#define TOK_CHAR(ch1, tok1) \
    case ch1: \
        next_ch(); \
        tok = TOK_##tok1; \
        tok_prec = 0; \
        break;

#define TOK_EXPR(ch1, tok1, prec1, lexpr1, rexpr1) \

## asm_x64.c
// x64 encoding

enum Reg {
    RAX, RCX, RDX, RBX, RSP, RBP, RSI, RDI,
    R8,  R9,  R10, R11, R12, R13, R14, R15,
};

enum XmmReg {
    XMM0, XMM1, XMM2,  XMM3,  XMM4,  XMM5,  XMM6,  XMM7,
    XMM8, XMM9, XMM10, XMM11, XMM12, XMM13, XMM14, XMM15,

## benchmark.cpp
#if _WIN32
struct Timer {
    LARGE_INTEGER win32_freq;
    LARGE_INTEGER win32_start;

    Timer() {
        QueryPerformanceFrequency(&win32_freq);
        win32_start.QuadPart = 0;
    }
	// example how to set up WebGPU rendering on Windows in C
	// uses Dawn implementation of WebGPU: https://dawn.googlesource.com/dawn/
	// download pre-built Dawn webgpu.h/dll/lib files from https://github.com/mmozeiko/build-dawn/releases/latest

	#include "webgpu.h"

	#define WIN32_LEAN_AND_MEAN
	#include <windows.h>

	#define _USE_MATH_DEFINES
	# I happened to be looking at some of Cranelift's code, and I noticed that their constant-time dominates()
	# check was using a somewhat more ad-hoc version of a hidden gem from the data structures literature called the
	# parenthesis representation for trees. As far as I know, this was invented by Jacobson in his 1989 paper
	# Space-Efficient Static Trees and Graphs. I first learned about it from the slightly later paper by Munro and Raman
	# called Succinct Representations of Balanced Parentheses and Static Trees. I figured I'd give it an extremely
	# quick intro and then show how it leads to a (slightly better) version of Cranelift's algorithm.
	#
	# This parenthesis representation of trees is surprisingly versatile, but its most striking feature is that
	# it lets us query the ancestor relationship between two nodes in a tree in constant time, with a few instructions.
	# And the idea is extremely simple and intuitive if you just draw the right kind of picture.
	partial ordering: time/events are related to each other as graphs of dependencies
	total ordering: time/events are related to each other as a sequence of steps

	atomic op: op which is observed to either happen fully or not at all (no in-between)
	atomic variable: memory location which atomic ops happen to
	data race: non-atomic ops from 2+ threads on same memory location where one op is a write

	load: an atomic read to observe a value
	store: an atomic write to publish a value
	read-modify-write (rmw): a read, updating the value, then a write, all atomically
	#define WIN32_LEAN_AND_MEAN
	#include <winsock2.h>
	#include <windows.h>
	#define SECURITY_WIN32
	#include <security.h>
	#include <schannel.h>
	#include <shlwapi.h>
	#include <assert.h>
	#include <stdio.h>
	static inline __m128i prefix_sum_u8(__m128i x)
	{
	#if 1
	// alternative form that uses shifts, not the general shuffle network on port 5 (which is a bottleneck
	// for us)
	x = _mm_add_epi8(x, _mm_slli_epi64(x, 8));
	x = _mm_add_epi8(x, _mm_slli_epi64(x, 16));
	x = _mm_add_epi8(x, _mm_slli_epi64(x, 32));
	x = _mm_add_epi8(x, _mm_shuffle_epi8(x, _mm_setr_epi8(-1,-1,-1,-1,-1,-1,-1,-1, 7,7,7,7,7,7,7,7)));
	#else
	#include <assert.h>

	#include <tuple>
	#include <vector>
	#include <string>

	typedef uint32_t Str;

	std::vector<const char*> strs;
	// Lexer

	#define TOK_CHAR(ch1, tok1) \
	case ch1: \
	next_ch(); \
	tok = TOK_##tok1; \
	tok_prec = 0; \
	break;

	#define TOK_EXPR(ch1, tok1, prec1, lexpr1, rexpr1) \
	// x64 encoding

	enum Reg {
	RAX, RCX, RDX, RBX, RSP, RBP, RSI, RDI,
	R8, R9, R10, R11, R12, R13, R14, R15,
	};

	enum XmmReg {
	XMM0, XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, XMM7,
	XMM8, XMM9, XMM10, XMM11, XMM12, XMM13, XMM14, XMM15,
	#if _WIN32
	struct Timer {
	LARGE_INTEGER win32_freq;
	LARGE_INTEGER win32_start;

	Timer() {
	QueryPerformanceFrequency(&win32_freq);
	win32_start.QuadPart = 0;
	}