Skip to content

Instantly share code, notes, and snippets.

@liamwhite
Last active September 15, 2018 00:57
Show Gist options
  • Save liamwhite/7abba3cc490334d76560416a0b75a859 to your computer and use it in GitHub Desktop.
Save liamwhite/7abba3cc490334d76560416a0b75a859 to your computer and use it in GitHub Desktop.
Vectorized De Casteljau's algorithm
/*
* Fast Cubic Bezier evaluation.
* Vectorized C.
*/
#include <math.h>
typedef float float4 __attribute__((vector_size(16)));
typedef float float2 __attribute__((vector_size(8)));
static float normalize(float2 p)
{
return p / sqrt(p[0]*p[0] + p[1]*p[1]);
}
/*
 * Linear interpolation between two 2D points.
 *
 * Returns p0 when t is 0 and moves toward p1 as t grows; the scalar t is
 * applied to both lanes.
 */
static float2 lerp(float2 p0, float2 p1, float t)
{
    float2 delta = p1 - p0;

    return delta * t + p0;
}
/*
 * Evaluate a cubic Bezier curve at parameter t with De Casteljau's algorithm.
 *
 * c0..c3: the four control points.
 * t:      curve parameter passed to every lerp().
 *
 * Returns { x, y, -dy, dx }, where (x, y) is the point on the curve and
 * (dx, dy) is normalize() applied to the difference of the two second-level
 * points (that difference is parallel to the curve's derivative at t).
 * The last two lanes are therefore that direction rotated by 90 degrees.
 */
float4 decasteljau_b3(float2 c0, float2 c1, float2 c2, float2 c3, float t)
{
    /* Level 1: interpolate along each edge of the control polygon. */
    float2 q0 = lerp(c0, c1, t);
    float2 q1 = lerp(c1, c2, t);
    float2 q2 = lerp(c2, c3, t);

    /* Level 2: interpolate between the level-1 points. */
    float2 r0 = lerp(q0, q1, t);
    float2 r1 = lerp(q1, q2, t);

    /* Level 3: the point on the curve. */
    float2 point = lerp(r0, r1, t);

    /* Direction of travel along the curve at that point. */
    float2 dir = normalize(r1 - r0);

    float4 result = { point[0], point[1], -dir[1], dir[0] };
    return result;
}
# Constant pool: four lanes of 1.0f, used as the memory operand of the
# vfnmadd213ps in the reciprocal refinement below.
.LCPI0_0:
.long 1065353216 # float 1
.long 1065353216 # float 1
.long 1065353216 # float 1
.long 1065353216 # float 1
# Constant pool: four lanes of -0.0f (only the sign bit set). XORing a lane
# with it flips that lane's sign; used by the vxorps below.
.LCPI0_1:
.long 2147483648 # float -0
.long 2147483648 # float -0
.long 2147483648 # float -0
.long 2147483648 # float -0
# Compiled body of decasteljau_b3. Operands are in AT&T order (destination last).
# NOTE(review): presumably c0..c3 arrive in xmm0..xmm3 and t in xmm4, and the
# result is returned in xmm0 (System V x86-64 convention) — confirm.
decasteljau_b3: # @decasteljau_b3
# xmm5 = xmm1 - xmm0
vsubps %xmm0, %xmm1, %xmm5
# Copy lane 0 of xmm4 into lane 1 so both x and y are scaled by the same value.
vmovsldup %xmm4, %xmm4 # xmm4 = xmm4[0,0,2,2]
# xmm5 = xmm4 * xmm5 + xmm0        (first-level lerp #1)
vfmadd213ps %xmm0, %xmm4, %xmm5
# xmm0 = xmm4 * (xmm2 - xmm1) + xmm1   (first-level lerp #2)
vsubps %xmm1, %xmm2, %xmm0
vfmadd213ps %xmm1, %xmm4, %xmm0
# xmm1 = xmm4 * (xmm3 - xmm2) + xmm2   (first-level lerp #3)
vsubps %xmm2, %xmm3, %xmm1
vfmadd213ps %xmm2, %xmm4, %xmm1
# xmm2 = xmm4 * (xmm0 - xmm5) + xmm5   (second-level lerp #1)
vsubps %xmm5, %xmm0, %xmm2
vfmadd213ps %xmm5, %xmm4, %xmm2
# xmm1 = xmm4 * (xmm1 - xmm0) + xmm0   (second-level lerp #2)
vsubps %xmm0, %xmm1, %xmm1
vfmadd213ps %xmm0, %xmm4, %xmm1
# xmm0 = difference of the two second-level points (un-normalized direction).
vsubps %xmm2, %xmm1, %xmm0
# xmm2 = xmm0 * xmm4 + xmm2: the final lerp, reusing the difference just computed.
vfmadd231ps %xmm4, %xmm0, %xmm2
# Squared length of the direction: square each lane, then add lane 1 into lane 0.
vmulps %xmm0, %xmm0, %xmm1
vmovshdup %xmm1, %xmm3 # xmm3 = xmm1[1,1,3,3]
vaddss %xmm3, %xmm1, %xmm1
# Widen the sum to double and take a double-precision square root.
vcvtss2sd %xmm1, %xmm1, %xmm1
vsqrtsd %xmm1, %xmm1, %xmm1
# Approximate per-lane single-precision reciprocal of xmm1.
# NOTE(review): vrcpps treats xmm1 as packed floats, but vsqrtsd left a double
# in its low 64 bits and no vcvtsd2ss intervenes — looks inconsistent; verify
# against the compiler version and flags that produced this listing.
vrcpps %xmm1, %xmm3
# One refinement step on the reciprocal estimate r (xmm3) of d (xmm1):
#   xmm1 = 1.0 - r*d, then xmm1 = xmm1*r + r
vfnmadd213ps .LCPI0_0(%rip), %xmm3, %xmm1
vfmadd132ps %xmm3, %xmm3, %xmm1
# Scale the direction by the refined reciprocal.
vmulps %xmm1, %xmm0, %xmm0
# Move the direction's lane 1 (y) into lane 0 and negate it via the sign mask.
vmovshdup %xmm0, %xmm1 # xmm1 = xmm0[1,1,3,3]
vxorps .LCPI0_1(%rip), %xmm1, %xmm1
# Assemble the result: lanes 0-1 from xmm2 (the point), lane 2 = negated y,
# lane 3 = the direction's x.
vinsertps $32, %xmm1, %xmm2, %xmm1 # xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
vinsertps $48, %xmm0, %xmm1, %xmm0 # xmm0 = xmm1[0,1,2],xmm0[0]
retq
/*
* Fast Cubic Bezier evaluation.
* Vectorized OpenCL.
*/
/*
 * Evaluate a cubic Bezier curve at parameter t with De Casteljau's algorithm.
 *
 * c0..c3: the four control points.
 * t:      curve parameter passed to every mix().
 *
 * Returns (x, y, -dy, dx): the point on the curve followed by the normalized
 * difference of the two second-level points, rotated by 90 degrees.
 */
float4 decasteljau_b3(float2 c0, float2 c1, float2 c2, float2 c3, float t)
{
    // Level 1: interpolate along each edge of the control polygon.
    const float2 q0 = mix(c0, c1, t);
    const float2 q1 = mix(c1, c2, t);
    const float2 q2 = mix(c2, c3, t);

    // Level 2: interpolate between the level-1 points.
    const float2 r0 = mix(q0, q1, t);
    const float2 r1 = mix(q1, q2, t);

    // Level 3: the point on the curve.
    const float2 point = mix(r0, r1, t);

    // Direction of travel along the curve at that point.
    const float2 dir = normalize(r1 - r0);

    return (float4) (point.x, point.y, -dir.y, dir.x);
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment