Skip to content

Instantly share code, notes, and snippets.

Avatar

N.Nakasato dadeba

  • University of Aizu
  • Japan
View GitHub Profile
@dadeba
dadeba / gist:1557145
Created Jan 3, 2012
Compute the average of relative error between 3D vectors a0 & a1 in Eigen
View gist:1557145
// a0 and a1 are row-major Arrays holding one 3D vector per row (N x 3).
// a_err is the per-component mean of the relative error |(a0 - a1) / a0|,
// taken over all rows. Entries of a0 that are zero produce inf/NaN here.
// The Array member abs() is used so the expression does not rely on Eigen
// providing an overload of abs in namespace std.
Vector3f a_err = ((a0 - a1)/a0).abs().colwise().sum()/a0.rows();
@dadeba
dadeba / gist:1582789
Created Jan 9, 2012
Tahiti ISA for my DGEMM code
View gist:1582789
v_fma_f64 v[29:30], v[39:40], v[41:42], v[29:30] // 0000025C: D298001D 04765327
v_fma_f64 v[31:32], v[39:40], v[43:44], v[31:32] // 00000264: D298001F 047E5727
v_fma_f64 v[33:34], v[39:40], v[45:46], v[33:34] // 0000026C: D2980021 04865B27
v_fma_f64 v[35:36], v[39:40], v[47:48], v[35:36] // 00000274: D2980023 048E5F27
s_branch label_002E // 0000027C: BF82FF8E
@dadeba
dadeba / Tahiti_DGEMM.isa
Created Jan 18, 2012
Tahiti ISA for 4x4 DGEMM kernel generated from our IL DGEMM code
View Tahiti_DGEMM.isa
shader main
asic(SI_ASIC)
type(PS)
// s_ps_state in s0
s_mov_b64 s[44:45], exec // 00000000: BEAC047E
s_wqm_b64 exec, exec // 00000004: BEFE0A7E
v_floor_f32 v0, v2 // 00000008: 7E004902
v_floor_f32 v1, v3 // 0000000C: 7E024903
v_mul_legacy_f32 v55, 2.0, v0 // 00000010: 0E6E00F4
@dadeba
dadeba / gist:1700860
Created Jan 29, 2012
Tahiti ISA for 4x4 DGEMM kernel Catalyst 12.1
View gist:1700860
shader main
asic(SI_ASIC)
type(PS)
// s_ps_state in s0
s_mov_b64 s[44:45], exec // 00000000: BEAC047E
s_wqm_b64 exec, exec // 00000004: BEFE0A7E
s_load_dwordx4 s[12:15], s[10:11], 0x00 // 00000008: C0860B00
v_floor_f32 v0, v2 // 0000000C: 7E004902
v_floor_f32 v1, v3 // 00000010: 7E024903
@dadeba
dadeba / e_time.c
Created Feb 1, 2012
Measure the elapsed time on Linux
View e_time.c
#include <time.h>
#include <sys/time.h>
#include <sys/resource.h>
// How to use?
//
// double st = e_time();
// ... the computation to be timed runs here ...
// double en = e_time();
//
@dadeba
dadeba / sph_neighbor.cl
Created Feb 3, 2012
An OpenCL kernel for neighbor search
View sph_neighbor.cl
#define NNMAX 16
// Squared Euclidean length of the (x, y, z) part of p.
// The w component is not read.
float R2(float4 p)
{
    const float x = p.x;
    const float y = p.y;
    const float z = p.z;
    return x*x + y*y + z*z;
}
__kernel void sph_neighbor(__global float4 *pos,
__global float *size,
__global int *next,
__global int *more,
@dadeba
dadeba / gravity_v4.cl
Created Mar 23, 2012
OpenCL: A vectorized kernel for gravity interaction
View gravity_v4.cl
#define READONLY_P const * restrict
__kernel
void
grav1(
__global float4 READONLY_P x,
__global float4 READONLY_P y,
__global float4 READONLY_P z,
__global float4 READONLY_P m,
__global float4 *ax,
@dadeba
dadeba / gravity_v4v2.cl
Created Mar 23, 2012
OpenCL: A vectorized kernel for gravity interaction inspired by Tanikawa et al. 2012
View gravity_v4v2.cl
#define READONLY_P const * restrict
float4 sum(float8 x)
{
float4 tmp;
tmp.x = x.s0 + x.s4;
tmp.y = x.s1 + x.s5;
tmp.z = x.s2 + x.s6;
tmp.w = x.s3 + x.s7;
return tmp;
View html.rb
# Fetches +url+ by running the external +curl+ command and returns a new
# instance of this class built from the response body.
#
# curl is invoked with --location (follow redirects) and --compressed
# (request a compressed transfer and decode it). The arguments are passed as
# a list, so no shell is involved.
#
# Open3.capture3 is used rather than reading only stdout inside
# Open3.popen3: capture3 drains stdout and stderr together, so curl cannot
# stall on a full, unread stderr pipe. As before, curl's stderr output and
# exit status are ignored.
def self.get(url)
  body, _stderr, _status = Open3.capture3("curl", "--location", "--compressed", url)
  return self.new(body)
end
View gravity_v4v2.asm
.file "/tmp/5d41e25b-e85e-4f10-836a-5b23eab3f6a7.TMP"
.text
# native_rsqrt for an 8-wide float vector: the symbol is the Itanium-mangled
# form of native_rsqrt(<8 x float>). The whole body is a single AVX
# instruction followed by a return.
# Make the symbol visible to the linker.
.globl _Z12native_rsqrtDv8_f
# Align the entry point to 16 bytes, padding with 0x90 (x86 NOP).
.align 16, 0x90
.type _Z12native_rsqrtDv8_f,@function
_Z12native_rsqrtDv8_f: # @_Z12native_rsqrtDv8_f
# BB#0:
# Approximate reciprocal square root of each of the eight packed
# single-precision lanes in YMM0; the result overwrites YMM0.
# NOTE(review): presumably YMM0 is both the argument and the return
# register under the calling convention in use - confirm.
vrsqrtps YMM0, YMM0
ret
.Ltmp0: