Skip to content

Instantly share code, notes, and snippets.

View djg's full-sized avatar
💭
I may be slow to respond.

Dan Glastonbury djg

💭
I may be slow to respond.
View GitHub Profile
// This is your vertex buffer, split into components.
// On D3D11 HW, you can probably use structured buffers to de-SoA this, but I haven't checked.
//
// Each vertex attribute lives in its own typed Buffer (structure-of-arrays layout, per the
// note above) rather than in one interleaved vertex struct.
// NOTE(review): the attribute meanings below are read off the variable names — confirm
// against the shader code that fetches from these buffers.
Buffer<float3> buf_pos;   // float3 per element; presumably vertex positions
Buffer<float3> buf_norm;  // float3 per element; presumably vertex normals
Buffer<float2> buf_uvs;   // float2 per element; presumably texture coordinates
// 4x4 float matrix; by its name, presumably transforms model space to clip space — confirm.
float4x4 clip_from_model;
// Two unsigned shader constants. Their use is not visible in this excerpt;
// presumably applied to the vertex index before fetching — TODO confirm.
uint base_index, index_mask;
@rygorous
rygorous / gist:4172889
Created November 30, 2012 00:28
SSE/AVX matrix multiply
#include <immintrin.h>
#include <intrin.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
// 4x4 single-precision matrix with two overlapping views of the same 64 bytes:
// a scalar view for element access and a vector view for SSE code.
union Mat44 {
    // Scalar view: m[r][c] addresses one float element.
    float m[4][4];

    // Vector view: row[r] overlays the four floats of m[r].
    __m128 row[4];
};
Making the hot path (in this case, no match) fall-through:
--- before ---
mov up,down
mov up,down
# check 4
l:mov up,acc
sub 4
jez z
use compiler::parser::tok::{self, Tok};
use compiler::ast::*;
grammar<'input>["LALR(1)"];
extern {
type Location = usize;
type Error = tok::Error;
enum Tok<'input> {
// latency tester generator
//
// Instruction templates used to build the test sequences. Each macro expands to
// one newline-terminated assembly line containing %N placeholders.
// NOTE(review): PROD_/CONS_ presumably mean "producer"/"consumer", and the %N
// placeholders look like inline-asm operand references — confirm against the
// code that expands these templates.

// ---- PROD_* templates ----

// ALU fast-forward: basic ALU ops; CCMP/CCMN
#define PROD_ALF "add %1,%1,%2\n"

// ALU+shift; all bitfield move; EXTR; RBIT/REV*; CLS/CLZ; CSEL/CSET etc.
#define PROD_ALU "add %1,%1,%2,lsl #13\n"

// variable shifts, imm movs (e.g. "movz %1,#0,lsl #16")
#define PROD_SHF "lslv %1,%1,%2\n"

// load
#define PROD_LDR "ldr %1,[%4]\n"

// ---- CONS_* templates ----

// basic ALU ops; CCMP/CCMN; CSEL/CSET etc; CLZ/CLS; first (unshifted) src in ALU+shift
#define CONS_ALU "add %1,%1,%2\n"

// second (shifted) src in ALU+shift; SBFM/UBFM/BFM/RBIT/REV*/var shifts/EXTR all sources
#define CONS_SHF "add %1,%2,%1,lsl #13\n"

// load/store address generation unit
#define CONS_AGU "ldr %1,[%4,%1]\n"

// store data
#define CONS_STR "str %1,[%4]\n"