Last active
September 15, 2018 00:57
-
-
Save liamwhite/7abba3cc490334d76560416a0b75a859 to your computer and use it in GitHub Desktop.
Vectorized De Casteljau's algorithm
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/*
 * Fast Cubic Bezier evaluation.
 * Vectorized C.
 */
#include <math.h> | |
/* Four packed floats (16 bytes), declared with the GCC/Clang vector_size extension. */
typedef float float4 __attribute__((vector_size(16)));
/* Two packed floats (8 bytes); used below for 2D points and directions. */
typedef float float2 __attribute__((vector_size(8)));
static float normalize(float2 p) | |
{ | |
return p / sqrt(p[0]*p[0] + p[1]*p[1]); | |
} | |
/*
 * Linear interpolation between two 2D points.
 *
 * p0: start point.
 * p1: end point.
 * t:  interpolation factor; it is not clamped, so values outside [0, 1]
 *     extrapolate along the line through p0 and p1.
 *
 * Returns (p1 - p0)*t + p0, evaluated on both lanes at once.
 */
static float2 lerp(float2 p0, float2 p1, float t)
{
    float2 delta = p1 - p0;

    return delta*t + p0;
}
/*
 * Evaluate a cubic Bezier curve at parameter t with De Casteljau's algorithm.
 *
 * c0, c1, c2, c3: control points of the curve.
 * t: curve parameter; forwarded to lerp() as-is (no clamping here).
 *
 * Returns { x, y, -dy, dx }. (x, y) is the point on the curve. (dx, dy) is
 * the difference of the two second-level points after normalize(), i.e. the
 * tangent direction; lanes 2 and 3 hold it rotated a quarter turn.
 */
float4 decasteljau_b3(float2 c0, float2 c1, float2 c2, float2 c3, float t)
{
    // Level 1: interpolate between adjacent control points
    float2 a0 = lerp(c0, c1, t);
    float2 a1 = lerp(c1, c2, t);
    float2 a2 = lerp(c2, c3, t);

    // Level 2: interpolate between adjacent level-1 points
    float2 b0 = lerp(a0, a1, t);
    float2 b1 = lerp(a1, a2, t);

    // Level 3: the point on the curve
    float2 point = lerp(b0, b1, t);

    // Tangent direction at the point, from the two level-2 points
    float2 dir = normalize(b1 - b0);

    return (float4) { point[0], point[1], -dir[1], dir[0] };
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Compiler-generated x86-64 listing (AT&T operand order, AVX + FMA) for decasteljau_b3.
# NOTE(review): the trailing "| |" on each line looks like copy/paste table residue,
# not assembler syntax -- strip it before feeding this to an assembler.
#
# Constant pool: four packed 1.0f, used by the reciprocal refinement step below.
.LCPI0_0: | |
.long 1065353216 # float 1 | |
.long 1065353216 # float 1 | |
.long 1065353216 # float 1 | |
.long 1065353216 # float 1 | |
# Constant pool: four packed -0.0f, i.e. a sign-bit mask; XOR with it negates a float.
.LCPI0_1: | |
.long 2147483648 # float -0 | |
.long 2147483648 # float -0 | |
.long 2147483648 # float -0 | |
.long 2147483648 # float -0 | |
decasteljau_b3: # @decasteljau_b3 | |
# Register roles inferred from the instruction sequence -- TODO confirm against the ABI:
# xmm0..xmm3 = c0..c3, xmm4 = t.
#
# Level 1, pair (c0, c1): xmm5 = xmm1 - xmm0, then xmm5 = xmm4*xmm5 + xmm0.
# vmovsldup first copies lane 0 of xmm4 into lane 1 so t multiplies both x and y.
vsubps %xmm0, %xmm1, %xmm5 | |
vmovsldup %xmm4, %xmm4 # xmm4 = xmm4[0,0,2,2] | |
vfmadd213ps %xmm0, %xmm4, %xmm5 | |
# Level 1, pair (c1, c2): xmm0 = xmm4*(xmm2 - xmm1) + xmm1.
vsubps %xmm1, %xmm2, %xmm0 | |
vfmadd213ps %xmm1, %xmm4, %xmm0 | |
# Level 1, pair (c2, c3): xmm1 = xmm4*(xmm3 - xmm2) + xmm2.
vsubps %xmm2, %xmm3, %xmm1 | |
vfmadd213ps %xmm2, %xmm4, %xmm1 | |
# Level 2, first point: xmm2 = xmm4*(xmm0 - xmm5) + xmm5.
vsubps %xmm5, %xmm0, %xmm2 | |
vfmadd213ps %xmm5, %xmm4, %xmm2 | |
# Level 2, second point: xmm1 = xmm4*(xmm1 - xmm0) + xmm0.
vsubps %xmm0, %xmm1, %xmm1 | |
vfmadd213ps %xmm0, %xmm4, %xmm1 | |
# xmm0 = difference of the two level-2 points (the un-normalized tangent).
vsubps %xmm2, %xmm1, %xmm0 | |
# Level 3: xmm2 = xmm0*xmm4 + xmm2, the point on the curve; reuses the difference above.
vfmadd231ps %xmm4, %xmm0, %xmm2 | |
# Squared length of the tangent: square each lane, then add lane 1 into lane 0.
vmulps %xmm0, %xmm0, %xmm1 | |
vmovshdup %xmm1, %xmm3 # xmm3 = xmm1[1,1,3,3] | |
vaddss %xmm3, %xmm1, %xmm1 | |
# Widen the sum to double and take the square root in double precision.
vcvtss2sd %xmm1, %xmm1, %xmm1 | |
vsqrtsd %xmm1, %xmm1, %xmm1 | |
# Approximate packed-single reciprocal of xmm1 into xmm3.
# NOTE(review): there is no conversion back to single between vsqrtsd and vrcpps, so
# the double's bits appear to be consumed as packed floats -- verify against the C source.
vrcpps %xmm1, %xmm3 | |
# One Newton-Raphson refinement of the reciprocal:
# xmm1 = 1 - xmm3*xmm1, then xmm1 = xmm1*xmm3 + xmm3.
vfnmadd213ps .LCPI0_0(%rip), %xmm3, %xmm1 | |
vfmadd132ps %xmm3, %xmm3, %xmm1 | |
# Scale the tangent by the refined reciprocal.
vmulps %xmm1, %xmm0, %xmm0 | |
# Move the tangent's lane 1 (y) down to lane 0 and flip its sign bit.
vmovshdup %xmm0, %xmm1 # xmm1 = xmm0[1,1,3,3] | |
vxorps .LCPI0_1(%rip), %xmm1, %xmm1 | |
# Assemble the result: lanes 0-1 from the point in xmm2, lane 2 = negated tangent y,
# lane 3 = tangent x. The final value is left in xmm0.
vinsertps $32, %xmm1, %xmm2, %xmm1 # xmm1 = xmm2[0,1],xmm1[0],xmm2[3] | |
vinsertps $48, %xmm0, %xmm1, %xmm0 # xmm0 = xmm1[0,1,2],xmm0[0] | |
retq |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/*
 * Fast Cubic Bezier evaluation.
 * Vectorized OpenCL.
 */
/*
 * Evaluate a cubic Bezier curve at parameter t with De Casteljau's algorithm,
 * using the OpenCL built-ins mix() and normalize().
 *
 * c0, c1, c2, c3: control points of the curve.
 * t: curve parameter; forwarded to mix() as-is (no clamping here).
 *
 * Returns (x, y, -dy, dx). (x, y) is the point on the curve. (dx, dy) is the
 * normalized difference of the two second-level points, i.e. the tangent
 * direction; components z and w hold it rotated a quarter turn.
 */
float4 decasteljau_b3(float2 c0, float2 c1, float2 c2, float2 c3, float t)
{
    // Level 1: interpolate between adjacent control points
    const float2 a0 = mix(c0, c1, t);
    const float2 a1 = mix(c1, c2, t);
    const float2 a2 = mix(c2, c3, t);

    // Level 2: interpolate between adjacent level-1 points
    const float2 b0 = mix(a0, a1, t);
    const float2 b1 = mix(a1, a2, t);

    // Level 3: the point on the curve
    const float2 point = mix(b0, b1, t);

    // Tangent direction at the point, from the two level-2 points
    const float2 dir = normalize(b1 - b0);

    return (float4) (point, -dir.y, dir.x);
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment