Skip to content

Instantly share code, notes, and snippets.

@rygorous
rygorous / interp.py
Created July 27, 2021 19:08
BC4 interpolation formulas
# ---- Intel decoder impl
# matches 2014 HSW laptop
# Interpolation weight tables in 1/256 units: entry k is round(256 * k / N),
# with N = 7 for the 8-value table and N = 5 for the 6-value table.
# NOTE(review): weights6_intel is also generated over range(8), so it carries
# two entries above 256 (307 and 358) -- confirm whether range(6) was intended.
weights8_intel = [round(k * 256 / 7) for k in range(8)]
weights6_intel = [round(k * 256 / 5) for k in range(8)]
def intel_lerp8(a: int, b: int, w: int) -> int:
    """Blend *a* and *b* with an integer weight *w* expressed in 1/256 units.

    Computes ``((256 - w) * a + w * b + 128) >> 8``: ``w == 0`` yields *a*,
    ``w == 256`` yields *b*, and the ``+ 128`` term rounds the result to the
    nearest integer before the divide-by-256 shift.
    """
    weighted_sum = (256 - w) * a + w * b
    # 128 is half of 256, so adding it before the shift rounds to nearest.
    return (weighted_sum + 128) >> 8
def interp_int_8col(a, b, x):
// Merge pass
static void merge_pass(S16 *out, const S16 *inA, const S16 *inB, size_t elemsPerRun)
{
// need pow2 elemsPerRun>=16!
const S16 *endA = inA + elemsPerRun;
const S16 *endB = inB + elemsPerRun;
Vec vMin0 = load8_s16(inA + 0);
Vec vMin1 = load8_s16(inA + 8);
Vec vMax0 = load8_s16(inB + 0);
Vec vMax1 = load8_s16(inB + 8);
Avoid, avoid,
Avoid pop culture
avoid!
Avoid,
avoid pop culture
then you'll feel better
Ahhhhhhhh!
Avoid, avoid
#include <stdint.h>
// Short aliases for unsigned integer types. The numeric suffix is the intended
// bit width; that width relies on the platform's sizes for char / short / int /
// long long and is not guaranteed by the C standard itself.
typedef unsigned char U8;
typedef unsigned short U16;
typedef unsigned int U32;
typedef unsigned long long U64;
// Signed integer type wide enough to hold a pointer value (intptr_t from <stdint.h>).
typedef intptr_t SINTa;
struct KernelState
{
@rygorous
rygorous / hull.py
Last active June 25, 2021 10:30
Convex hull
import random
# Determinant predicate (line sidedness test)
def det3x3_pt(p, q, r):
    """Return the 2D cross product of the vectors (q - p) and (r - p).

    Each argument is an indexable 2D point ``(x, y)``. The value equals the
    determinant of the 3x3 matrix with rows ``(x, y, 1)`` for p, q and r.
    With a y-up coordinate system it is positive when r lies to the left of
    the directed line p -> q, negative when r lies to the right, and zero
    when the three points are collinear.
    """
    ax, ay = q[0] - p[0], q[1] - p[1]
    bx, by = r[0] - p[0], r[1] - p[1]
    return ax * by - ay * bx
def convex_hull(points):
# sorts points by x then y which works for us (we only need the sort by x part here, but it doesn't hurt)
---- On Ryzen 3950X
SimpleProf :seconds calls count : clk/call clk/count
search_one : 0.3081 1 8388480 : 1078358365.0 128.55
search_one_pf : 0.3415 1 8388480 : 1195232920.0 142.49 <-- speculative prefetching (next options for L and R)
search_multi2 : 0.3062 1 8388480 : 1071663705.0 127.75
search_multi4 : 0.2454 1 8388480 : 859008465.0 102.40
search_multi8 : 0.2113 1 8388480 : 739533515.0 88.16
search_multi16 : 0.1996 1 8388480 : 698443550.0 83.26
search_multi32 : 0.1785 1 8388480 : 624785840.0 74.48
@rygorous
rygorous / example.html
Created April 16, 2020 18:18
Valid HTML5
<!DOCTYPE html>
<html>
<head>
<title>Some fun HTML5 stuff</title>
<body>
<p>All of this is valid syntax!
<ul>
<li>Don't need to close paragraphs (that one's old)
<li>or list items
<li>since it's clear from context!
@rygorous
rygorous / rast.c
Created March 2, 2020 01:56
Simple watertight triangle rasterizer
// ---- triangle rasterizer
// Bit count used for sub-pixel precision; det2x2 shifts its result right by this amount.
// NOTE(review): presumably the number of fractional bits in fixed-point coordinates -- confirm.
#define SUBPIXEL_SHIFT 8
// 1 << SUBPIXEL_SHIFT, i.e. 256 with the current shift of 8.
#define SUBPIXEL_SCALE (1 << SUBPIXEL_SHIFT)
// Returns (a*d - b*c) >> SUBPIXEL_SHIFT.
// Each product is formed in 64 bits (one operand is widened to S64 first),
// so the 32-bit inputs cannot overflow during the multiply.
static RADINLINE S64 det2x2(S32 a, S32 b, S32 c, S32 d)
{
    const S64 main_diag = (S64) a * d;
    const S64 anti_diag = (S64) b * c;
    return (main_diag - anti_diag) >> SUBPIXEL_SHIFT;
}
@rygorous
rygorous / gist:a549832e23b913ac70237d23c1600f8a
Created August 16, 2019 22:37
pseudo-ucode expansion for LOOP <dest>
lea rcx, [rcx-1] ; decrement rcx w/o flag update
mov temp0, rax ; save rax that we're about to trash
lahf ; save original flags
test rcx, rcx ; check whether updated rcx is zero
setz temp1 ; temp1 = 1 if rcx=0, 0 otherwise
sahf ; restore flags
mov rax, temp0 ; restore rax
jecxz temp1, dest ; jump if temp1 is zero, not rcx (doesn't exist in regular ISA but rcx is renamed anyway so the internal uop can do any source)
NOTE the actual ucode expansion probably doesn't have the MOVs since I would expect the internal LAHF/SAHF uops
@rygorous
rygorous / b.bat
Created August 9, 2019 23:08
Histogram code with all the tricks :) Needs NASM + VC++
@echo off
rem Assemble histo_asm.nas with NASM, then compile and link histotest.cpp with cl.
setlocal
rem Run from the script's own directory. /d also switches the current drive,
rem and the quotes keep a path containing spaces in one piece.
cd /d "%~dp0"
rem NOTE(review): presumably sets up the 64-bit MSVC environment -- confirm vcvars is on PATH.
call vcvars amd64
rem Stop with a non-zero exit code as soon as either tool fails.
..\..\bin\win32\nasm -f win64 -g -o histo_asm.obj histo_asm.nas || exit /b 1
cl /Zi /O2 /nologo histotest.cpp histo_asm.obj || exit /b 1