Skip to content

Instantly share code, notes, and snippets.

View FernandoS27's full-sized avatar

Fernando S. FernandoS27

View GitHub Profile
CSETP stands for Control Code Set Predicate. It's a predicate-set instruction that works on a special register called cc (Control Code);
it takes 2 input predicates to modify and one extra that's logically compared to the flag in cc being tested.
The Control Code is only updated when the instruction that generates it has bit 47 set. cc appears to be 32 bits wide, and the values of cc are:
{ 0x0000000000000000ull, 0x0000000000001f00ull, N("f") },
{ 0x0000000000000100ull, 0x0000000000001f00ull, N("lt") },
{ 0x0000000000000200ull, 0x0000000000001f00ull, N("eq") },
{ 0x0000000000000300ull, 0x0000000000001f00ull, N("le") },
{ 0x0000000000000400ull, 0x0000000000001f00ull, N("gt") },
#include <cuda.h>
#include <stdio.h>
#include <time.h>
#include <stdlib.h>
#include <math.h>
int main()
{
int N = 1000;
size_t size = N * sizeof(float);
// Kernel definition
// Element-wise float addition: each thread stores A[t] + B[t] into C[t],
// where t is the thread's threadIdx.x.
//
// The index is taken from threadIdx.x alone (blockIdx is not consulted) and
// no bounds check is performed, so A, B and C must each be valid for every
// threadIdx.x value used by the launch.
__global__ void Test(float* A, float* B, float* C)
{
    const int tid = threadIdx.x;
    const float lhs = A[tid];
    const float rhs = B[tid];
    C[tid] = lhs + rhs;
}
unsigned int approx(float f) {
float ax = fabs(f);
unsigned int extra = 1;
if (ax >= 128.f) {
ax = 129.f;
extra = 0;
}
double w;
float ax2 = modf(ax, &w);
unsigned int sign = (f < 0.0f) << 31;
// Reports whether the instruction at `offset` is a sched instruction.
//
// offset:      position of the instruction being queried.
// main_offset: position the period is counted from.
// Returns true when (offset - main_offset) is a multiple of the sched period.
//
// NOTE(review): the subtraction is done in u32; if offset < main_offset it
// presumably wraps around (assuming u32 is an unsigned 32-bit type) - confirm
// that callers never pass such a pair.
bool IsSchedInstruction(u32 offset, u32 main_offset) {
    // One sched instruction is emitted for every group of 4 instructions.
    static constexpr size_t SchedPeriod = 4;
    const u32 relative_offset = offset - main_offset;
    const bool on_period_boundary = (relative_offset % SchedPeriod) == 0;
    return on_period_boundary;
}
void ShaderDumper::dump() {
FileUtil::IOFile sFile;
#version 430 core
#extension GL_ARB_separate_shader_objects : enable
#define MAX_CONSTBUFFER_ELEMENTS 4096
bool exec_fragment();
in vec4 position;
layout(location = 0) out vec4 FragColor0;
layout(location = 1) out vec4 FragColor1;
layout(location = 2) out vec4 FragColor2;
// Example program
#include <iostream>
#include <string>
#include <cmath>
#include <utility>
#include <array>
#include <cstring>
typedef unsigned char u8;
typedef unsigned int u32;
void SwizzleSubrect(u32 subrect_width, u32 subrect_height, u32 source_pitch, u32 swizzled_width,
u32 bytes_per_pixel, VAddr swizzled_data, VAddr unswizzled_data,
u32 block_height) {
const u32 image_width_in_gobs{(swizzled_width * bytes_per_pixel + 63) / 64};
for (u32 line = 0; line < subrect_height; ++line) {
const u32 gob_address_y =
(line / (8 * block_height)) * 512 * block_height * image_width_in_gobs +
(line % (8 * block_height) / 8) * 512;
auto& table = legacy_swizzle_table[line % 8];
for (u32 x = 0; x < subrect_width; ++x) {
/*0538*/ @P0 BRK ;
/*0548*/ IADD32I R5, R5, -0x1 ;
/*0550*/ IMNMX.U32 R5, R5, 0x2, PT ;
/*0558*/ SHL R5, R5, 0x2 ;
/*0568*/ LDC R5, c[0x1][R5+0xc0] ;
/*0570*/ BRX R5 -0x578 ;
/*0578*/ { MOV R6, R1 ;
/*0588*/ BRK }
/*0590*/ { MOV R6, R2 ;
/*0598*/ BRK }
#version 430 core
#extension GL_ARB_separate_shader_objects : enable
#define EMULATION_UBO_BINDING 2
#define CBUF_BINDING_1 3
#define CBUF_BINDING_3 4
#define CBUF_BINDING_4 5
#define SAMPLER_BINDING_0 0
#define SAMPLER_BINDING_1 1
#define SAMPLER_BINDING_2 2