edubart/gfxtoy.nelua

## gfxtoy.nelua
-- THIS whole framework is hacky and just a PROOF OF CONCEPT!
##[[
-- Make the default `number`, `integer` and `uinteger` primitive types 32-bit.
primtypes.number = primtypes.float32
primtypes.integer = primtypes.int32
primtypes.uinteger = primtypes.uint32

-- Pass the OpenMP flag to the C compiler only when `OPENMP` is truthy.
if OPENMP then
  cflags '-fopenmp'
end
-- Emits an OpenMP `parallel for` pragma into the generated C code.
-- Does nothing when `OPENMP` is not truthy.
local function parallel_for()
  if not OPENMP then return end
  cemit '#pragma omp parallel for'
end
-- Emits a loop vectorization pragma into the generated C code.
-- The pragma is selected by the first match among: emscripten, clang, GCC and
-- finally OpenMP; when none of them matches nothing is emitted.
local function simd_for()
  local pragma
  if ccinfo.is_emscripten then
    pragma = '#pragma clang loop vectorize(enable) vectorize_width(4)'
  elseif ccinfo.is_clang then
    pragma = '#pragma clang loop vectorize(enable)'
  elseif ccinfo.is_gcc then
    pragma = '#pragma GCC ivdep'
  elseif OPENMP then
    pragma = '#pragma omp simd'
  end
  if pragma then
    cemit(pragma)
  end
end

-- When `DEBUG_VECTORIZER` is truthy, ask the C compiler to report on loop
-- vectorization (clang remarks or GCC optimization info).
if DEBUG_VECTORIZER then
  if ccinfo.is_clang then
    cflags '-Rpass-analysis=loop-vectorize -Rpass=loop-vectorize'
    -- cflags '-Rpass-missed=loop-vectorize'
  elseif ccinfo.is_gcc then
    cflags '-fopt-info-loop-optimized'
    -- cflags '-fopt-info-vec-missed'
  end
end

-- Intentionally empty: the only statement inside is commented out and kept
-- as a note for future experiments.
if ccinfo.is_gcc then
  -- GCC is unable to vectorize some code when this flag is enabled, why?
  -- it is enabled by default above -O2
  -- cflags '-fno-tree-pre'
end

-- Global C compiler flags and Nelua pragmas applied to the whole program:
-- the `nogc` and `nochecks` pragmas are both switched on.
cflags '-Ofast -march=native -fno-plt'
pragmas.nogc = true
pragmas.nochecks = true
]]


require 'math'
require 'memory'


-- Short global aliases for the default numeric types.
global float = @number
global int = @integer
global uint = @uinteger

-- Concept that accepts any argument whose type is a scalar.
local is_scalar = #[concept(function(x) return x.type.is_scalar end)]#

-- Returns the reciprocal square root of a scalar, that is `1/sqrt(x)`.
function math.rsqrt(x: is_scalar) <inline,nosideeffect>
  -- Use `math.sqrt` explicitly, like every other helper in this file, instead
  -- of relying on an unqualified `sqrt` symbol happening to be in scope.
  return 1.0 / math.sqrt(x)
end

-- Step function: yields 0 when `x < edge` and 1 otherwise.
-- The result is stored back into `x`, so it is returned with the type of `x`.
function math.step(edge: is_scalar, x: is_scalar) <inline,nosideeffect>
  if x < edge then
    x = 0
  else
    x = 1
  end
  return x
end

-- Fast but imprecise approximation of `exp(x)`.
-- Builds the result directly as a float32 bit pattern: `x` is scaled, biased
-- as an int32 and then reinterpreted as a float through a union.
-- Reference: https://github.com/ekmett/approximate/blob/master/cbits/fast.c
function math.fastexp(x: float32) <inline,nosideeffect>
  local bits: union {f: float32, x: int32}
  bits.x = (@int32)(12102203 * x) + 1064866805
  return bits.f
end

-- Approximates `a ^ b` by splitting `b` into an integer and a fractional part.
-- The integer part is evaluated with exponentiation by squaring, the
-- fractional part with a bit-level approximation on the float32 representation.
function math.fastpow(a: float32, b: float32): float32 <inline,nosideeffect>
  -- Fast pow (precise)
  -- Reference: https://github.com/ekmett/approximate/blob/master/cbits/fast.c
  -- Negative exponents are computed with `-b` and inverted at the end.
  local flip: boolean = b < 0
  b = flip and -b or b
  -- Integer part of the exponent.
  local e: int32 = (@int32)(b)
  -- Approximate `a ^ (b - e)` by scaling the bits of `a` relative to
  -- 1065353216, which is the bit pattern of 1.0 as a float32.
  local u: union{f: float32, x: int32} = {f = a}
  u.x = (@int32)((b - e)*(u.x - 1065353216) + 1065353216)
  local r: float32 = 1.0
  -- Exponentiation by squaring, unrolled 32 times by the preprocessor
  -- (one step per bit of `e`); leaves early once no bits remain.
  ## for i=1,32 do
    if e == 0 then goto finish end
    r = e & 1 ~= 0 and r*a or r
    a = a * a
    e = e >>> 1
  ## end
::finish::
  -- Combine the integer and the fractional contributions.
  r = r * u.f
  return flip and 1.0/r or r
end

-- Very fast but imprecise approximation of `a ^ b`.
-- Scales the distance between the bit pattern of `a` and a fixed bias by `b`
-- and reinterprets the resulting int32 as a float32.
-- Reference: https://github.com/ekmett/approximate/blob/master/cbits/fast.c
function math.fasterpow(a: float32, b: float32): float32 <inline,nosideeffect>
  local bits: union {f: float32, x: int32} = {f = a}
  local delta: int32 = bits.x - 1064866805
  bits.x = (@int32)(b * delta + 1064866805)
  return bits.f
end

-- Cube root approximation using 2 iterations of Halley's method.
-- The initial guess comes from dividing the float32 bit pattern by 3 and
-- adding a bias; zero is returned directly for a zero input.
function math.fastcbrt(a: float32): float32 <inline,nosideeffect>
  if unlikely(a == 0.0) then return 0.0 end
  local bits: union{f: float32, x: uint32} = {f = a}
  bits.x = bits.x /// 3_u32 + 709921077_u32
  -- first Halley step, starting from the bit-level guess
  local guess: float32 = bits.f
  local cube: float32 = guess*guess*guess
  local root: float32 = guess * (cube + 2.0*a) / (2.0*cube + a)
  -- second Halley step
  cube = root*root*root
  return root * (cube + 2.0*a) / (2.0*cube + a)
end

-- Cube root approximation using 1 iteration of Halley's method.
-- The initial guess comes from dividing the float32 bit pattern by 3 and
-- adding a bias; zero is returned directly for a zero input.
function math.fastercbrt(a: float32): float32 <inline,nosideeffect>
  if unlikely(a == 0.0) then return 0.0 end
  local bits: union{f: float32, x: uint32} = {f = a}
  bits.x = bits.x /// 3_u32 + 709921077_u32
  local guess: float32 = bits.f
  local cube: float32 = guess*guess*guess
  return guess * (cube + 2.0*a) / (2.0*cube + a)
end

-- Reinterprets the bits of a `float` as a `uint` (no numeric conversion),
-- by writing one member of a union and reading the other.
function math.floatbits2uint(x: float): uint <inline,nosideeffect>
  local view: union{f: float, i: uint}
  view.f = x
  return view.i
end

-- Reinterprets the bits of a `float` as an `int` (no numeric conversion),
-- by writing one member of a union and reading the other.
function math.floatbits2int(x: float): int <inline,nosideeffect>
  local view: union{f: float, i: int}
  view.f = x
  return view.i
end

-- Binary sign: -1 for negative values and 1 for everything else
-- (zero included).
function math.signb(x: is_scalar) <inline,nosideeffect>
  if x < 0 then
    return -1
  end
  return 1
end

-- Rounds by adding one half and taking the floor.
function math.floorround(x: is_scalar) <inline,nosideeffect>
  local shifted = x + 0.5
  return math.floor(shifted)
end

-- Optimized approximation of `cos(acos(x)/3.0)`, usually used when solving
-- cubic equations. Evaluates a quartic polynomial in `sqrt(0.5+0.5*x)`.
-- Reference: https://iquilezles.org/www/articles/trisect/trisect.htm
function math.trisect(x: float): float <inline,nosideeffect>
  local h: float = math.sqrt(0.5+0.5*x)
  -- Horner evaluation, innermost term first
  local poly: float = h*-0.008972+0.039071
  poly = h*poly-0.107074
  poly = h*poly+0.576975
  return h*poly+0.5
end

-- Like `trisect`, but faster at the cost of precision: uses a quadratic
-- polynomial in `sqrt(0.5+0.5*x)` instead of a quartic one.
-- Reference: https://www.shadertoy.com/view/WltSD7
function math.fasttrisect(x: float): float <inline,nosideeffect>
  local h: float = math.sqrt(0.5+0.5*x)
  local slope: float = -0.064916*h+0.564916
  return h*slope+0.5
end

-- Returns an approximation of the arccosine of a scalar.
-- A cubic polynomial in `|x|` is multiplied by `sqrt(1 - |x|)`; for negative
-- inputs the result is mirrored around pi/2 without branching.
-- Handbook of Mathematical Functions - M. Abramowitz and I.A. Stegun, Ed.
-- Reference: https://developer.download.nvidia.com/cg/acos.html
function math.fastacos(x: float): float <inline,nosideeffect>
  -- 1.0 for negative inputs, 0.0 otherwise
  local negate: float = x < 0.0 and 1.0 or 0.0
  local ax: float = math.abs(x)
  local poly: float = ((-0.0187293*ax + 0.0742610)*ax - 0.2121144)*ax + 1.5707288
  local ret: float = poly * math.sqrt(1.0-ax)
  -- flips the sign of `ret` when the input was negative
  ret = ret - 2.0 * negate * ret
  return negate * 3.14159265358979 + ret
end

-- Returns an imprecise approximation of the arccosine of a scalar.
-- The input is clamped to [-1, 1]; the result linearly mixes pi/2 with the
-- positive-branch estimate, using `sign(x)` as the mix factor.
-- Approximation by Sebastien Lagarde.
-- Reference: https://www.shadertoy.com/view/lsjXDc
function math.fasteracos(x: float): float <inline,nosideeffect>
  local ax: float = math.abs(math.clamp(x,-1.0,1.0))
  local estimate: float = (-0.168577*ax + 1.56723) * math.sqrt(1.0 - ax)
  local s: float = math.sign(x)
  local halfpi: float = 0.5*3.1415927
  return halfpi * (1.0-s) + estimate * s
end

-- Quartic sine approximation (precise), using the following constraints:
-- f(0) = 0, f(pi) = 0, f(pi/2) = 1, f'(0) = 1, f'(pi) = -1, f'(pi/2) = 0
-- Reference: https://www.shadertoy.com/view/7d23D1
function math.fastsin(x: float): float <inline,nosideeffect>
  -- position measured in half periods, and its integer/fractional parts
  local line = x*#[1/math.pi]#
  local stair = math.floor(line)
  local saw = line - stair
  -- quartic arch over one half period, evaluated with Horner's scheme
  local arch = saw*#[16 - 4*math.pi]# + #[8*math.pi - 32]#
  arch = saw*arch + #[16 - 5*math.pi]#
  arch = saw*arch + #[math.pi]#
  local wave = saw*arch
  -- parity of the half period selects the sign: +1 for even, -1 for odd
  local parity = stair - 2.0*math.floor(0.5*line)
  local signal = (1.0-2.0*parity)
  return signal*wave
end

-- Cosine approximation implemented as `math.fastsin` shifted by pi/2.
function math.fastcos(x: float): float <inline,nosideeffect>
  local shifted: float = x + #[math.pi/2]#
  return math.fastsin(shifted)
end

-- Quadratic sine approximation (imprecise), using the following constraints:
-- f(0) = 0, f(pi) = 0, f(pi/2) = 1
function math.qsin(x: float): float <inline,nosideeffect>
  -- position measured in half periods, and its integer/fractional parts
  local line = x*#[1/math.pi]#
  local stair = math.floor(line)
  local saw = line - stair
  -- parabola over one half period
  local wave = 4.0*saw*(1.0-saw)
  -- parity of the half period selects the sign: +1 for even, -1 for odd
  local parity = stair - 2.0*math.floor(0.5*line)
  local signal = (1.0-2.0*parity)
  return signal*wave
end

-- Cosine approximation implemented as `math.qsin` shifted by pi/2.
function math.qcos(x: float): float <inline,nosideeffect>
  local shifted: float = x + #[math.pi/2]#
  return math.qsin(shifted)
end


--TODO: inplace operators
--TODO: vec constructors
--TODO: remove delay resolution to optimize parsing
--TODO: vec to string for print

##[[
-- Picks the 2-component vector type resulting from combining `a` and `b`.
-- Each operand contributes its `subtype` when its type has one, otherwise its
-- own type. Floats win over everything, then byte/byte, then
-- unsigned/unsigned, then any integral pair; anything else is a static error.
-- NOTE(review): `bvec2` is not declared in the visible part of this file,
-- confirm it exists before relying on the byte branch.
local function choose_vec2_type(a, b)
  local atype, btype = a.type, b.type
  local asub = atype.subtype or atype
  local bsub = btype.subtype or btype
  if asub.is_float or bsub.is_float then return vec2.value end
  if asub == primtypes.byte and bsub == primtypes.byte then return bvec2.value end
  if asub.is_unsigned and bsub.is_unsigned then return uvec2.value end
  if asub.is_integral and bsub.is_integral then return ivec2.value end
  static_error("invalid argument of types '%s' '%s'", atype, btype)
end

-- Picks the 3-component vector type resulting from combining `a` and `b`.
-- Each operand contributes its `subtype` when its type has one, otherwise its
-- own type. Floats win over everything, then byte/byte, then
-- unsigned/unsigned, then any integral pair; anything else is a static error.
local function choose_vec3_type(a, b)
  local atype, btype = a.type, b.type
  local asub = atype.subtype or atype
  local bsub = btype.subtype or btype
  if asub.is_float or bsub.is_float then return vec3.value end
  if asub == primtypes.byte and bsub == primtypes.byte then return bvec3.value end
  if asub.is_unsigned and bsub.is_unsigned then return uvec3.value end
  if asub.is_integral and bsub.is_integral then return ivec3.value end
  static_error("invalid argument of types '%s' '%s'", atype, btype)
end

-- Picks the 4-component vector type resulting from combining `a` and `b`.
-- Each operand contributes its `subtype` when its type has one, otherwise its
-- own type. Floats win over everything, then byte/byte, then
-- unsigned/unsigned, then any integral pair; anything else is a static error.
local function choose_vec4_type(a, b)
  local atype, btype = a.type, b.type
  local asub = atype.subtype or atype
  local bsub = btype.subtype or btype
  if asub.is_float or bsub.is_float then return vec4.value end
  if asub == primtypes.byte and bsub == primtypes.byte then return bvec4.value end
  if asub.is_unsigned and bsub.is_unsigned then return uvec4.value end
  if asub.is_integral and bsub.is_integral then return ivec4.value end
  static_error("invalid argument of types '%s' '%s'", atype, btype)
end
]]


-- Float vectors with 2, 3 and 4 components.
global vec2 <aligned(8)>  = @record{x: float, y: float}
global vec3 <aligned(16)> = @record{x: float, y: float, z: float}
global vec4 <aligned(16)> = @record{x: float, y: float, z: float, w: float}

-- Signed integer vectors with 2, 3 and 4 components.
global ivec2 <aligned(8)>  = @record{x: int, y: int}
global ivec3 <aligned(16)> = @record{x: int, y: int, z: int}
global ivec4 <aligned(16)> = @record{x: int, y: int, z: int, w: int}

-- Unsigned integer vectors with 2, 3 and 4 components.
global uvec2 <aligned(8)> = @record{x: uint, y: uint}
global uvec3 <aligned(16)> = @record{x: uint, y: uint, z: uint}
global uvec4 <aligned(16)> = @record{x: uint, y: uint, z: uint, w: uint}

-- Byte vectors with 3 and 4 components (no 2-component variant is declared here).
global bvec3 <aligned(1)> = @record{x: byte, y: byte, z: byte}
global bvec4 <aligned(4)> = @record{x: byte, y: byte, z: byte, w: byte}

##[[
do
  -- Marks a vector type with the compile-time flags used for dispatch:
  --   is_<name>      exact type (for example is_vec2, is_ivec3)
  --   an_vec<dim>    any vector with <dim> components
  --   an_<family>    element family (fvec, ivec, uvec or bvec)
  --   an_vec         any vector type
  -- and records the element type in `subtype`.
  local function tag(sym, name, dim, family, subtype)
    local t = sym.value
    t['is_' .. name] = true
    t['an_vec' .. dim] = true
    t['an_' .. family] = true
    t.an_vec = true
    t.subtype = subtype
  end

  tag(vec2, 'vec2', 2, 'fvec', float.value)
  tag(vec3, 'vec3', 3, 'fvec', float.value)
  tag(vec4, 'vec4', 4, 'fvec', float.value)

  tag(ivec2, 'ivec2', 2, 'ivec', int.value)
  tag(ivec3, 'ivec3', 3, 'ivec', int.value)
  tag(ivec4, 'ivec4', 4, 'ivec', int.value)

  tag(uvec2, 'uvec2', 2, 'uvec', uint.value)
  tag(uvec3, 'uvec3', 3, 'uvec', uint.value)
  tag(uvec4, 'uvec4', 4, 'uvec', uint.value)

  tag(bvec3, 'bvec3', 3, 'bvec', primtypes.byte)
  tag(bvec4, 'bvec4', 4, 'bvec', primtypes.byte)
end
]]

  -- Component-wise addition. Accepts two vectors of the same dimension, or a
  -- vector and a scalar (the scalar is applied to every component).
  -- The result type is picked by `choose_vecN_type`; any other combination of
  -- argument types is a static error.
  local function add(a: auto, b: auto) <inline,nosideeffect>
    ## local ta, tb = a.type, b.type
    ## if ta.an_vec2 and tb.an_vec2 then
      return #[choose_vec2_type(a,b)]#{a.x+b.x, a.y+b.y}
    ## elseif ta.an_vec2 and tb.is_scalar then
      return #[choose_vec2_type(a,b)]#{a.x+b, a.y+b}
    ## elseif ta.is_scalar and tb.an_vec2 then
      return #[choose_vec2_type(a,b)]#{a+b.x, a+b.y}
    ## elseif ta.an_vec3 and tb.an_vec3 then
      return #[choose_vec3_type(a,b)]#{a.x+b.x, a.y+b.y, a.z+b.z}
    ## elseif ta.an_vec3 and tb.is_scalar then
      return #[choose_vec3_type(a,b)]#{a.x+b, a.y+b, a.z+b}
    ## elseif ta.is_scalar and tb.an_vec3 then
      return #[choose_vec3_type(a,b)]#{a+b.x, a+b.y, a+b.z}
    ## elseif ta.an_vec4 and tb.an_vec4 then
      return #[choose_vec4_type(a,b)]#{a.x+b.x, a.y+b.y, a.z+b.z, a.w+b.w}
    ## elseif ta.an_vec4 and tb.is_scalar then
      return #[choose_vec4_type(a,b)]#{a.x+b, a.y+b, a.z+b, a.w+b}
    ## elseif ta.is_scalar and tb.an_vec4 then
      return #[choose_vec4_type(a,b)]#{a+b.x, a+b.y, a+b.z, a+b.w}
    ## else static_error("invalid argument of types '%s' '%s'", ta, tb) end
  end

  -- Component-wise subtraction. Accepts two vectors of the same dimension, or
  -- a vector and a scalar (the scalar is applied to every component).
  -- The result type is picked by `choose_vecN_type`; any other combination of
  -- argument types is a static error.
  local function sub(a: auto, b: auto) <inline,nosideeffect>
    ## local ta, tb = a.type, b.type
    ## if ta.an_vec2 and tb.an_vec2 then
      return #[choose_vec2_type(a,b)]#{a.x-b.x, a.y-b.y}
    ## elseif ta.an_vec2 and tb.is_scalar then
      return #[choose_vec2_type(a,b)]#{a.x-b, a.y-b}
    ## elseif ta.is_scalar and tb.an_vec2 then
      return #[choose_vec2_type(a,b)]#{a-b.x, a-b.y}
    ## elseif ta.an_vec3 and tb.an_vec3 then
      return #[choose_vec3_type(a,b)]#{a.x-b.x, a.y-b.y, a.z-b.z}
    ## elseif ta.an_vec3 and tb.is_scalar then
      return #[choose_vec3_type(a,b)]#{a.x-b, a.y-b, a.z-b}
    ## elseif ta.is_scalar and tb.an_vec3 then
      return #[choose_vec3_type(a,b)]#{a-b.x, a-b.y, a-b.z}
    ## elseif ta.an_vec4 and tb.an_vec4 then
      return #[choose_vec4_type(a,b)]#{a.x-b.x, a.y-b.y, a.z-b.z, a.w-b.w}
    ## elseif ta.an_vec4 and tb.is_scalar then
      return #[choose_vec4_type(a,b)]#{a.x-b, a.y-b, a.z-b, a.w-b}
    ## elseif ta.is_scalar and tb.an_vec4 then
      return #[choose_vec4_type(a,b)]#{a-b.x, a-b.y, a-b.z, a-b.w}
    ## else static_error("invalid argument of types '%s' '%s'", ta, tb) end
  end

  -- Component-wise multiplication. Accepts two vectors of the same dimension,
  -- or a vector and a scalar (the scalar is applied to every component).
  -- The result type is picked by `choose_vecN_type`; any other combination of
  -- argument types is a static error.
  local function mul(a: auto, b: auto) <inline,nosideeffect>
    ## local ta, tb = a.type, b.type
    ## if ta.an_vec2 and tb.an_vec2 then
      return #[choose_vec2_type(a,b)]#{a.x*b.x, a.y*b.y}
    ## elseif ta.an_vec2 and tb.is_scalar then
      return #[choose_vec2_type(a,b)]#{a.x*b, a.y*b}
    ## elseif ta.is_scalar and tb.an_vec2 then
      return #[choose_vec2_type(a,b)]#{a*b.x, a*b.y}
    ## elseif ta.an_vec3 and tb.an_vec3 then
      return #[choose_vec3_type(a,b)]#{a.x*b.x, a.y*b.y, a.z*b.z}
    ## elseif ta.an_vec3 and tb.is_scalar then
      return #[choose_vec3_type(a,b)]#{a.x*b, a.y*b, a.z*b}
    ## elseif ta.is_scalar and tb.an_vec3 then
      return #[choose_vec3_type(a,b)]#{a*b.x, a*b.y, a*b.z}
    ## elseif ta.an_vec4 and tb.an_vec4 then
      return #[choose_vec4_type(a,b)]#{a.x*b.x, a.y*b.y, a.z*b.z, a.w*b.w}
    ## elseif ta.an_vec4 and tb.is_scalar then
      return #[choose_vec4_type(a,b)]#{a.x*b, a.y*b, a.z*b, a.w*b}
    ## elseif ta.is_scalar and tb.an_vec4 then
      return #[choose_vec4_type(a,b)]#{a*b.x, a*b.y, a*b.z, a*b.w}
    ## else static_error("invalid argument of types '%s' '%s'", ta, tb) end
  end

  -- Component-wise division. Accepts two vectors of the same dimension, or a
  -- vector and a scalar. The result is always a float vector (`vec2`, `vec3`
  -- or `vec4`). Dividing a vector by a scalar multiplies every component by
  -- the reciprocal of the scalar. Any other combination is a static error.
  local function div(a: auto, b: auto) <inline,nosideeffect>
    ## local ta, tb = a.type, b.type
    ## if ta.an_vec2 and tb.an_vec2 then
      return vec2{a.x/b.x, a.y/b.y}
    ## elseif ta.an_vec2 and tb.is_scalar then
      local inv = 1.0/b
      return vec2{a.x*inv, a.y*inv}
    ## elseif ta.is_scalar and tb.an_vec2 then
      return vec2{a/b.x, a/b.y}
    ## elseif ta.an_vec3 and tb.an_vec3 then
      return vec3{a.x/b.x, a.y/b.y, a.z/b.z}
    ## elseif ta.an_vec3 and tb.is_scalar then
      local inv = 1.0/b
      return vec3{a.x*inv, a.y*inv, a.z*inv}
    ## elseif ta.is_scalar and tb.an_vec3 then
      return vec3{a/b.x, a/b.y, a/b.z}
    ## elseif ta.an_vec4 and tb.an_vec4 then
      return vec4{a.x/b.x, a.y/b.y, a.z/b.z, a.w/b.w}
    ## elseif ta.an_vec4 and tb.is_scalar then
      local inv = 1.0/b
      return vec4{a.x*inv, a.y*inv, a.z*inv, a.w*inv}
    ## elseif ta.is_scalar and tb.an_vec4 then
      return vec4{a/b.x, a/b.y, a/b.z, a/b.w}
    ## else static_error("invalid argument of types '%s' '%s'", ta, tb) end
  end

  -- Component-wise division using the `//` operator. Accepts two vectors of
  -- the same dimension, or a vector and a scalar.
  -- The result type is picked by `choose_vecN_type`; any other combination of
  -- argument types is a static error.
  local function idiv(a: auto, b: auto) <inline,nosideeffect>
    ## local ta, tb = a.type, b.type
    ## if ta.an_vec2 and tb.an_vec2 then
      return #[choose_vec2_type(a,b)]#{a.x//b.x, a.y//b.y}
    ## elseif ta.an_vec2 and tb.is_scalar then
      return #[choose_vec2_type(a,b)]#{a.x//b, a.y//b}
    ## elseif ta.is_scalar and tb.an_vec2 then
      return #[choose_vec2_type(a,b)]#{a//b.x, a//b.y}
    ## elseif ta.an_vec3 and tb.an_vec3 then
      return #[choose_vec3_type(a,b)]#{a.x//b.x, a.y//b.y, a.z//b.z}
    ## elseif ta.an_vec3 and tb.is_scalar then
      return #[choose_vec3_type(a,b)]#{a.x//b, a.y//b, a.z//b}
    ## elseif ta.is_scalar and tb.an_vec3 then
      return #[choose_vec3_type(a,b)]#{a//b.x, a//b.y, a//b.z}
    ## elseif ta.an_vec4 and tb.an_vec4 then
      return #[choose_vec4_type(a,b)]#{a.x//b.x, a.y//b.y, a.z//b.z, a.w//b.w}
    ## elseif ta.an_vec4 and tb.is_scalar then
      return #[choose_vec4_type(a,b)]#{a.x//b, a.y//b, a.z//b, a.w//b}
    ## elseif ta.is_scalar and tb.an_vec4 then
      return #[choose_vec4_type(a,b)]#{a//b.x, a//b.y, a//b.z, a//b.w}
    ## else static_error("invalid argument of types '%s' '%s'", ta, tb) end
  end

  -- Component-wise division using the `///` operator. Accepts two vectors of
  -- the same dimension, or a vector and a scalar.
  -- The result type is picked by `choose_vecN_type`; any other combination of
  -- argument types is a static error.
  local function tdiv(a: auto, b: auto) <inline,nosideeffect>
    ## local ta, tb = a.type, b.type
    ## if ta.an_vec2 and tb.an_vec2 then
      return #[choose_vec2_type(a,b)]#{a.x///b.x, a.y///b.y}
    ## elseif ta.an_vec2 and tb.is_scalar then
      return #[choose_vec2_type(a,b)]#{a.x///b, a.y///b}
    ## elseif ta.is_scalar and tb.an_vec2 then
      return #[choose_vec2_type(a,b)]#{a///b.x, a///b.y}
    ## elseif ta.an_vec3 and tb.an_vec3 then
      return #[choose_vec3_type(a,b)]#{a.x///b.x, a.y///b.y, a.z///b.z}
    ## elseif ta.an_vec3 and tb.is_scalar then
      return #[choose_vec3_type(a,b)]#{a.x///b, a.y///b, a.z///b}
    ## elseif ta.is_scalar and tb.an_vec3 then
      return #[choose_vec3_type(a,b)]#{a///b.x, a///b.y, a///b.z}
    ## elseif ta.an_vec4 and tb.an_vec4 then
      return #[choose_vec4_type(a,b)]#{a.x///b.x, a.y///b.y, a.z///b.z, a.w///b.w}
    ## elseif ta.an_vec4 and tb.is_scalar then
      return #[choose_vec4_type(a,b)]#{a.x///b, a.y///b, a.z///b, a.w///b}
    ## elseif ta.is_scalar and tb.an_vec4 then
      return #[choose_vec4_type(a,b)]#{a///b.x, a///b.y, a///b.z, a///b.w}
    ## else static_error("invalid argument of types '%s' '%s'", ta, tb) end
  end

  -- Component-wise negation of a vector. The result has the same type as the
  -- argument; non-vector arguments are a static error.
  local function unm(a: auto) <inline,nosideeffect>
    ## local ta = a.type
    ## if ta.an_vec2 then
      return #[ta]#{-a.x, -a.y}
    ## elseif ta.an_vec3 then
      return #[ta]#{-a.x, -a.y, -a.z}
    ## elseif ta.an_vec4 then
      return #[ta]#{-a.x, -a.y, -a.z, -a.w}
    ## else static_error("invalid argument of type '%s'", ta) end
  end

  -- Component-wise bitwise NOT of a vector. The result has the same type as
  -- the argument; non-vector arguments are a static error.
  local function bnot(a: auto) <inline,nosideeffect>
    ## local ta = a.type
    ## if ta.an_vec2 then
      return #[ta]#{~a.x, ~a.y}
    ## elseif ta.an_vec3 then
      return #[ta]#{~a.x, ~a.y, ~a.z}
    ## elseif ta.an_vec4 then
      return #[ta]#{~a.x, ~a.y, ~a.z, ~a.w}
    ## else static_error("invalid argument of type '%s'", ta) end
  end

  -- Component-wise right shift using the `>>` operator. Accepts two vectors
  -- of the same dimension, or a vector and a scalar.
  -- The result type is picked by `choose_vecN_type`; any other combination of
  -- argument types is a static error.
  local function shr(a: auto, b: auto) <inline,nosideeffect>
    ## local ta, tb = a.type, b.type
    ## if ta.an_vec2 and tb.an_vec2 then
      return #[choose_vec2_type(a,b)]#{a.x>>b.x, a.y>>b.y}
    ## elseif ta.an_vec2 and tb.is_scalar then
      return #[choose_vec2_type(a,b)]#{a.x>>b, a.y>>b}
    ## elseif ta.is_scalar and tb.an_vec2 then
      return #[choose_vec2_type(a,b)]#{a>>b.x, a>>b.y}
    ## elseif ta.an_vec3 and tb.an_vec3 then
      return #[choose_vec3_type(a,b)]#{a.x>>b.x, a.y>>b.y, a.z>>b.z}
    ## elseif ta.an_vec3 and tb.is_scalar then
      return #[choose_vec3_type(a,b)]#{a.x>>b, a.y>>b, a.z>>b}
    ## elseif ta.is_scalar and tb.an_vec3 then
      return #[choose_vec3_type(a,b)]#{a>>b.x, a>>b.y, a>>b.z}
    ## elseif ta.an_vec4 and tb.an_vec4 then
      return #[choose_vec4_type(a,b)]#{a.x>>b.x, a.y>>b.y, a.z>>b.z, a.w>>b.w}
    ## elseif ta.an_vec4 and tb.is_scalar then
      return #[choose_vec4_type(a,b)]#{a.x>>b, a.y>>b, a.z>>b, a.w>>b}
    ## elseif ta.is_scalar and tb.an_vec4 then
      return #[choose_vec4_type(a,b)]#{a>>b.x, a>>b.y, a>>b.z, a>>b.w}
    ## else static_error("invalid argument of types '%s' '%s'", ta, tb) end
  end

  -- Component-wise right shift using the `>>>` operator. Accepts two vectors
  -- of the same dimension, or a vector and a scalar.
  -- The result type is picked by `choose_vecN_type`; any other combination of
  -- argument types is a static error.
  local function asr(a: auto, b: auto) <inline,nosideeffect>
    ## local ta, tb = a.type, b.type
    ## if ta.an_vec2 and tb.an_vec2 then
      return #[choose_vec2_type(a,b)]#{a.x>>>b.x, a.y>>>b.y}
    ## elseif ta.an_vec2 and tb.is_scalar then
      return #[choose_vec2_type(a,b)]#{a.x>>>b, a.y>>>b}
    ## elseif ta.is_scalar and tb.an_vec2 then
      return #[choose_vec2_type(a,b)]#{a>>>b.x, a>>>b.y}
    ## elseif ta.an_vec3 and tb.an_vec3 then
      return #[choose_vec3_type(a,b)]#{a.x>>>b.x, a.y>>>b.y, a.z>>>b.z}
    ## elseif ta.an_vec3 and tb.is_scalar then
      return #[choose_vec3_type(a,b)]#{a.x>>>b, a.y>>>b, a.z>>>b}
    ## elseif ta.is_scalar and tb.an_vec3 then
      return #[choose_vec3_type(a,b)]#{a>>>b.x, a>>>b.y, a>>>b.z}
    ## elseif ta.an_vec4 and tb.an_vec4 then
      return #[choose_vec4_type(a,b)]#{a.x>>>b.x, a.y>>>b.y, a.z>>>b.z, a.w>>>b.w}
    ## elseif ta.an_vec4 and tb.is_scalar then
      return #[choose_vec4_type(a,b)]#{a.x>>>b, a.y>>>b, a.z>>>b, a.w>>>b}
    ## elseif ta.is_scalar and tb.an_vec4 then
      return #[choose_vec4_type(a,b)]#{a>>>b.x, a>>>b.y, a>>>b.z, a>>>b.w}
    ## else static_error("invalid argument of types '%s' '%s'", ta, tb) end
  end

  -- Component-wise left shift using the `<<` operator. Accepts two vectors of
  -- the same dimension, or a vector and a scalar.
  -- The result type is picked by `choose_vecN_type`; any other combination of
  -- argument types is a static error.
  local function shl(a: auto, b: auto) <inline,nosideeffect>
    ## local ta, tb = a.type, b.type
    ## if ta.an_vec2 and tb.an_vec2 then
      return #[choose_vec2_type(a,b)]#{a.x<<b.x, a.y<<b.y}
    ## elseif ta.an_vec2 and tb.is_scalar then
      return #[choose_vec2_type(a,b)]#{a.x<<b, a.y<<b}
    ## elseif ta.is_scalar and tb.an_vec2 then
      return #[choose_vec2_type(a,b)]#{a<<b.x, a<<b.y}
    ## elseif ta.an_vec3 and tb.an_vec3 then
      return #[choose_vec3_type(a,b)]#{a.x<<b.x, a.y<<b.y, a.z<<b.z}
    ## elseif ta.an_vec3 and tb.is_scalar then
      return #[choose_vec3_type(a,b)]#{a.x<<b, a.y<<b, a.z<<b}
    ## elseif ta.is_scalar and tb.an_vec3 then
      return #[choose_vec3_type(a,b)]#{a<<b.x, a<<b.y, a<<b.z}
    ## elseif ta.an_vec4 and tb.an_vec4 then
      return #[choose_vec4_type(a,b)]#{a.x<<b.x, a.y<<b.y, a.z<<b.z, a.w<<b.w}
    ## elseif ta.an_vec4 and tb.is_scalar then
      return #[choose_vec4_type(a,b)]#{a.x<<b, a.y<<b, a.z<<b, a.w<<b}
    ## elseif ta.is_scalar and tb.an_vec4 then
      return #[choose_vec4_type(a,b)]#{a<<b.x, a<<b.y, a<<b.z, a<<b.w}
    ## else static_error("invalid argument of types '%s' '%s'", ta, tb) end
  end

  -- Component-wise bitwise AND. Accepts two vectors of the same dimension, or
  -- a vector and a scalar.
  -- The result type is picked by `choose_vecN_type`; any other combination of
  -- argument types is a static error.
  local function band(a: auto, b: auto) <inline,nosideeffect>
    ## local ta, tb = a.type, b.type
    ## if ta.an_vec2 and tb.an_vec2 then
      return #[choose_vec2_type(a,b)]#{a.x&b.x, a.y&b.y}
    ## elseif ta.an_vec2 and tb.is_scalar then
      return #[choose_vec2_type(a,b)]#{a.x&b, a.y&b}
    ## elseif ta.is_scalar and tb.an_vec2 then
      return #[choose_vec2_type(a,b)]#{a&b.x, a&b.y}
    ## elseif ta.an_vec3 and tb.an_vec3 then
      return #[choose_vec3_type(a,b)]#{a.x&b.x, a.y&b.y, a.z&b.z}
    ## elseif ta.an_vec3 and tb.is_scalar then
      return #[choose_vec3_type(a,b)]#{a.x&b, a.y&b, a.z&b}
    ## elseif ta.is_scalar and tb.an_vec3 then
      return #[choose_vec3_type(a,b)]#{a&b.x, a&b.y, a&b.z}
    ## elseif ta.an_vec4 and tb.an_vec4 then
      return #[choose_vec4_type(a,b)]#{a.x&b.x, a.y&b.y, a.z&b.z, a.w&b.w}
    ## elseif ta.an_vec4 and tb.is_scalar then
      return #[choose_vec4_type(a,b)]#{a.x&b, a.y&b, a.z&b, a.w&b}
    ## elseif ta.is_scalar and tb.an_vec4 then
      return #[choose_vec4_type(a,b)]#{a&b.x, a&b.y, a&b.z, a&b.w}
    ## else static_error("invalid argument of types '%s' '%s'", ta, tb) end
  end

  -- Component-wise bitwise OR. Accepts two vectors of the same dimension, or
  -- a vector and a scalar.
  -- The result type is picked by `choose_vecN_type`; any other combination of
  -- argument types is a static error.
  local function bor(a: auto, b: auto) <inline,nosideeffect>
    ## local ta, tb = a.type, b.type
    ## if ta.an_vec2 and tb.an_vec2 then
      return #[choose_vec2_type(a,b)]#{a.x|b.x, a.y|b.y}
    ## elseif ta.an_vec2 and tb.is_scalar then
      return #[choose_vec2_type(a,b)]#{a.x|b, a.y|b}
    ## elseif ta.is_scalar and tb.an_vec2 then
      return #[choose_vec2_type(a,b)]#{a|b.x, a|b.y}
    ## elseif ta.an_vec3 and tb.an_vec3 then
      return #[choose_vec3_type(a,b)]#{a.x|b.x, a.y|b.y, a.z|b.z}
    ## elseif ta.an_vec3 and tb.is_scalar then
      return #[choose_vec3_type(a,b)]#{a.x|b, a.y|b, a.z|b}
    ## elseif ta.is_scalar and tb.an_vec3 then
      return #[choose_vec3_type(a,b)]#{a|b.x, a|b.y, a|b.z}
    ## elseif ta.an_vec4 and tb.an_vec4 then
      return #[choose_vec4_type(a,b)]#{a.x|b.x, a.y|b.y, a.z|b.z, a.w|b.w}
    ## elseif ta.an_vec4 and tb.is_scalar then
      return #[choose_vec4_type(a,b)]#{a.x|b, a.y|b, a.z|b, a.w|b}
    ## elseif ta.is_scalar and tb.an_vec4 then
      return #[choose_vec4_type(a,b)]#{a|b.x, a|b.y, a|b.z, a|b.w}
    ## else static_error("invalid argument of types '%s' '%s'", ta, tb) end
  end

  -- Component-wise bitwise XOR. Accepts two vectors of the same dimension, or
  -- a vector and a scalar.
  -- The result type is picked by `choose_vecN_type`; any other combination of
  -- argument types is a static error.
  local function bxor(a: auto, b: auto) <inline,nosideeffect>
    ## local ta, tb = a.type, b.type
    ## if ta.an_vec2 and tb.an_vec2 then
      return #[choose_vec2_type(a,b)]#{a.x~b.x, a.y~b.y}
    ## elseif ta.an_vec2 and tb.is_scalar then
      return #[choose_vec2_type(a,b)]#{a.x~b, a.y~b}
    ## elseif ta.is_scalar and tb.an_vec2 then
      return #[choose_vec2_type(a,b)]#{a~b.x, a~b.y}
    ## elseif ta.an_vec3 and tb.an_vec3 then
      return #[choose_vec3_type(a,b)]#{a.x~b.x, a.y~b.y, a.z~b.z}
    ## elseif ta.an_vec3 and tb.is_scalar then
      return #[choose_vec3_type(a,b)]#{a.x~b, a.y~b, a.z~b}
    ## elseif ta.is_scalar and tb.an_vec3 then
      return #[choose_vec3_type(a,b)]#{a~b.x, a~b.y, a~b.z}
    ## elseif ta.an_vec4 and tb.an_vec4 then
      return #[choose_vec4_type(a,b)]#{a.x~b.x, a.y~b.y, a.z~b.z, a.w~b.w}
    ## elseif ta.an_vec4 and tb.is_scalar then
      return #[choose_vec4_type(a,b)]#{a.x~b, a.y~b, a.z~b, a.w~b}
    ## elseif ta.is_scalar and tb.an_vec4 then
      return #[choose_vec4_type(a,b)]#{a~b.x, a~b.y, a~b.z, a~b.w}
    ## else static_error("invalid argument of types '%s' '%s'", ta, tb) end
  end

  -- Modulo through `math.mod`. For two scalars it forwards to `math.mod`;
  -- for two vectors of the same dimension, or a vector and a scalar, it is
  -- applied per component and the result type is picked by
  -- `choose_vecN_type`. Any other combination is a static error.
  global function mod(a: auto, b: auto) <inline,nosideeffect>
    ## local ta, tb = a.type, b.type
    ## if ta.is_scalar and tb.is_scalar then
      return math.mod(a,b)
    ## elseif ta.an_vec2 and tb.an_vec2 then
      return #[choose_vec2_type(a,b)]#{math.mod(a.x,b.x), math.mod(a.y,b.y)}
    ## elseif ta.an_vec2 and tb.is_scalar then
      return #[choose_vec2_type(a,b)]#{math.mod(a.x,b), math.mod(a.y,b)}
    ## elseif ta.is_scalar and tb.an_vec2 then
      return #[choose_vec2_type(a,b)]#{math.mod(a,b.x), math.mod(a,b.y)}
    ## elseif ta.an_vec3 and tb.an_vec3 then
      return #[choose_vec3_type(a,b)]#{math.mod(a.x,b.x), math.mod(a.y,b.y), math.mod(a.z,b.z)}
    ## elseif ta.an_vec3 and tb.is_scalar then
      return #[choose_vec3_type(a,b)]#{math.mod(a.x,b), math.mod(a.y,b), math.mod(a.z,b)}
    ## elseif ta.is_scalar and tb.an_vec3 then
      return #[choose_vec3_type(a,b)]#{math.mod(a,b.x), math.mod(a,b.y), math.mod(a,b.z)}
    ## elseif ta.an_vec4 and tb.an_vec4 then
      return #[choose_vec4_type(a,b)]#{math.mod(a.x,b.x), math.mod(a.y,b.y), math.mod(a.z,b.z), math.mod(a.w,b.w)}
    ## elseif ta.an_vec4 and tb.is_scalar then
      return #[choose_vec4_type(a,b)]#{math.mod(a.x,b), math.mod(a.y,b), math.mod(a.z,b), math.mod(a.w,b)}
    ## elseif ta.is_scalar and tb.an_vec4 then
      return #[choose_vec4_type(a,b)]#{math.mod(a,b.x), math.mod(a,b.y), math.mod(a,b.z), math.mod(a,b.w)}
    ## else static_error("invalid argument of types '%s' '%s'", ta, tb) end
  end

  -- Power through `math.pow`. For two scalars it forwards to `math.pow`;
  -- for two vectors of the same dimension, or a vector and a scalar, it is
  -- applied per component and the result is always a float vector (`vec2`,
  -- `vec3` or `vec4`). Any other combination is a static error.
  global function pow(a: auto, b: auto) <inline,nosideeffect>
    ## local ta, tb = a.type, b.type
    ## if ta.is_scalar and tb.is_scalar then
      return math.pow(a,b)
    ## elseif ta.an_vec2 and tb.an_vec2 then
      return vec2{math.pow(a.x,b.x), math.pow(a.y,b.y)}
    ## elseif ta.an_vec2 and tb.is_scalar then
      return vec2{math.pow(a.x,b), math.pow(a.y,b)}
    ## elseif ta.is_scalar and tb.an_vec2 then
      return vec2{math.pow(a,b.x), math.pow(a,b.y)}
    ## elseif ta.an_vec3 and tb.an_vec3 then
      return vec3{math.pow(a.x,b.x), math.pow(a.y,b.y), math.pow(a.z,b.z)}
    ## elseif ta.an_vec3 and tb.is_scalar then
      return vec3{math.pow(a.x,b), math.pow(a.y,b), math.pow(a.z,b)}
    ## elseif ta.is_scalar and tb.an_vec3 then
      return vec3{math.pow(a,b.x), math.pow(a,b.y), math.pow(a,b.z)}
    ## elseif ta.an_vec4 and tb.an_vec4 then
      return vec4{math.pow(a.x,b.x), math.pow(a.y,b.y), math.pow(a.z,b.z), math.pow(a.w,b.w)}
    ## elseif ta.an_vec4 and tb.is_scalar then
      return vec4{math.pow(a.x,b), math.pow(a.y,b), math.pow(a.z,b), math.pow(a.w,b)}
    ## elseif ta.is_scalar and tb.an_vec4 then
      return vec4{math.pow(a,b.x), math.pow(a,b.y), math.pow(a,b.z), math.pow(a,b.w)}
    ## else static_error("invalid argument of types '%s' '%s'", ta, tb) end
  end

  -- Returns the Euclidean length of a scalar or a vector.
  -- For a scalar this is its magnitude `|a|` (as with GLSL `length`), for a
  -- vector the square root of the sum of its squared components.
  -- Any other argument type is a static error.
  global function length(a: auto) <inline,nosideeffect>
    ## if a.type.is_scalar then
      ## if a.type.is_unsigned then
        -- unsigned values are never negative, so they are their own magnitude
        return a
      ## else
        -- a negative scalar must not yield a negative length
        return math.abs(a)
      ## end
    ## elseif a.type.an_vec2 then
      return math.sqrt(a.x*a.x + a.y*a.y)
    ## elseif a.type.an_vec3 then
      return math.sqrt(a.x*a.x + a.y*a.y + a.z*a.z)
    ## elseif a.type.an_vec4 then
      return math.sqrt(a.x*a.x + a.y*a.y + a.z*a.z + a.w*a.w)
    ## else static_error("invalid argument of type '%s'", a.type) end
  end

  -- Dot product. For two scalars it is their product; for two vectors of the
  -- same dimension it is the sum of the component products; for a vector and
  -- a scalar, every component is multiplied by the scalar before summing.
  -- Any other combination of argument types is a static error.
  global function dot(a: auto, b: auto) <inline,nosideeffect>
    ## local ta, tb = a.type, b.type
    ## if ta.is_scalar and tb.is_scalar then
      return a*b
    ## elseif ta.an_vec2 and tb.an_vec2 then
      return a.x*b.x + a.y*b.y
    ## elseif ta.an_vec2 and tb.is_scalar then
      return a.x*b + a.y*b
    ## elseif ta.is_scalar and tb.an_vec2 then
      return a*b.x + a*b.y
    ## elseif ta.an_vec3 and tb.an_vec3 then
      return a.x*b.x + a.y*b.y + a.z*b.z
    ## elseif ta.an_vec3 and tb.is_scalar then
      return a.x*b + a.y*b + a.z*b
    ## elseif ta.is_scalar and tb.an_vec3 then
      return a*b.x + a*b.y + a*b.z
    ## elseif ta.an_vec4 and tb.an_vec4 then
      return a.x*b.x + a.y*b.y + a.z*b.z + a.w*b.w
    ## elseif ta.an_vec4 and tb.is_scalar then
      return a.x*b + a.y*b + a.z*b + a.w*b
    ## elseif ta.is_scalar and tb.an_vec4 then
      return a*b.x + a*b.y + a*b.z + a*b.w
    ## else static_error("invalid argument of types '%s' '%s'", ta, tb) end
  end

##[[
local iters = require 'nelua.utils.iterators'
-- Two-argument `math` functions to expose as global functions.
-- Only the keys (function names) are used by the generator below.
local math_scalar2_methods = {min=true, max=true, step=true,
                              atan2='atan', fastpow='pow', fasterpow='pow'}
]]
-- Generates one global function per entry above. Each one forwards two scalars
-- to the `math` function of the same name, or applies it per component for
-- two vectors of the same dimension or a vector and a scalar, with the result
-- type picked by `choose_vecN_type`. Other combinations are a static error.
## for fname in iters.ospairs(math_scalar2_methods) do
    global function #|fname|#(a: auto, b: auto) <inline,nosideeffect>
      local f <comptime> = math.#|fname|#;
      ## local ta, tb = a.type, b.type
      ## if ta.is_scalar and tb.is_scalar then
        return f(a, b)
      ## elseif ta.an_vec2 and tb.an_vec2 then
        return #[choose_vec2_type(a,b)]#{f(a.x,b.x), f(a.y,b.y)}
      ## elseif ta.an_vec2 and tb.is_scalar then
        return #[choose_vec2_type(a,b)]#{f(a.x,b), f(a.y,b)}
      ## elseif ta.is_scalar and tb.an_vec2 then
        return #[choose_vec2_type(a,b)]#{f(a,b.x), f(a,b.y)}
      ## elseif ta.an_vec3 and tb.an_vec3 then
        return #[choose_vec3_type(a,b)]#{f(a.x,b.x), f(a.y,b.y), f(a.z,b.z)}
      ## elseif ta.an_vec3 and tb.is_scalar then
        return #[choose_vec3_type(a,b)]#{f(a.x,b), f(a.y,b), f(a.z,b)}
      ## elseif ta.is_scalar and tb.an_vec3 then
        return #[choose_vec3_type(a,b)]#{f(a,b.x), f(a,b.y), f(a,b.z)}
      ## elseif ta.an_vec4 and tb.an_vec4 then
        return #[choose_vec4_type(a,b)]#{f(a.x,b.x), f(a.y,b.y), f(a.z,b.z), f(a.w,b.w)}
      ## elseif ta.an_vec4 and tb.is_scalar then
        return #[choose_vec4_type(a,b)]#{f(a.x,b), f(a.y,b), f(a.z,b), f(a.w,b)}
      ## elseif ta.is_scalar and tb.an_vec4 then
        return #[choose_vec4_type(a,b)]#{f(a,b.x), f(a,b.y), f(a,b.z), f(a,b.w)}
      ## else static_error("invalid argument of types '%s' '%s'", ta, tb) end
    end
## end

##[[
-- Names of the one-argument `math` functions that are re-exported as globals.
-- Only the keys are read by the generator loop below; the values are bound to
-- `glslfname` but not used in this section.
local math_float1_methods = {
  sqrt=true, rsqrt=false, cbrt=false, rcbrt=false,
  exp=true, exp2=true, log=true, log2=true,
  cos=true, sin=true, tan=true, acos=true, asin=true, atan=true,
  cosh=true, sinh=true, tanh=true, acosh=true, asinh=true, atanh=true,
  deg=false, rad=false,
  fastexp='exp',
  fastcbrt=false, fastercbrt=false,
  fastsin='sin', fastcos='cos', qsin='sin', qcos='cos',
  trisect=false, fasttrisect=false, fastacos='acos',
}
]]
-- For each name above, generates a global function that forwards a scalar to
-- `math.<name>` and applies it to each component of a 2, 3 or 4 component
-- vector. Vector results are always `vec2`/`vec3`/`vec4`, whatever the input
-- vector type is. Any other argument type is a compile-time error.
## for fname,glslfname in iters.ospairs(math_float1_methods) do
    global function #|fname|#(a: auto) <inline,nosideeffect>
      local f <comptime> = math.#|fname|#;
      ## if a.type.is_scalar then
        return f(a)
      ## elseif a.type.an_vec2 then
        return vec2{f(a.x), f(a.y)}
      ## elseif a.type.an_vec3 then
        return vec3{f(a.x), f(a.y), f(a.z)}
      ## elseif a.type.an_vec4 then
        return vec4{f(a.x), f(a.y), f(a.z), f(a.w)}
      ## else static_error("invalid argument of type '%s'", a.type) end
    end
## end

##[[
-- Names of the one-argument `math` functions whose vector results keep the
-- type of the input vector.
local math_scalar1_methods = {
  'floor', 'ceil', 'abs', 'sign', 'signb', 'fract', 'round'
}
]]
-- For each name above, generates a global function that forwards a scalar to
-- `math.<name>` and applies it to each component of a 2, 3 or 4 component
-- vector, returning a vector of the same type as the argument.
-- Any other argument type is a compile-time error.
## for i,fname in ipairs(math_scalar1_methods) do
    global function #|fname|#(a: auto) <inline,nosideeffect>
      local f <comptime> = math.#|fname|#;
      ## if a.type.is_scalar then
        return f(a)
      ## elseif a.type.an_vec2 then
        return #[a.type]#{f(a.x), f(a.y)}
      ## elseif a.type.an_vec3 then
        return #[a.type]#{f(a.x), f(a.y), f(a.z)}
      ## elseif a.type.an_vec4 then
        return #[a.type]#{f(a.x), f(a.y), f(a.z), f(a.w)}
      ## else static_error("invalid argument of type '%s'", a.type) end
    end
## end

-- Generates the global `ifloor` and `iceil` functions: a scalar is forwarded to
-- `math.ifloor`/`math.iceil`, while a 2, 3 or 4 component vector is processed
-- component by component and returned as `ivec2`/`ivec3`/`ivec4`.
-- Any other argument type is a compile-time error.
## for i,fname in ipairs{'ifloor', 'iceil'} do
global function #|fname|#(a: auto) <inline,nosideeffect>
  local f <comptime> = math.#|fname|#;
  ## if a.type.is_scalar then
    return f(a)
  ## elseif a.type.an_vec2 then
    return ivec2{f(a.x), f(a.y)}
  ## elseif a.type.an_vec3 then
    return ivec3{f(a.x), f(a.y), f(a.z)}
  ## elseif a.type.an_vec4 then
    return ivec4{f(a.x), f(a.y), f(a.z), f(a.w)}
  ## else static_error("invalid argument of type '%s'", a.type) end
end
## end

-- Converts `a` to a `vec2`.
-- A scalar fills both components; any vector contributes its `x` and `y`.
global function tovec2(a: auto): vec2 <inline,nosideeffect>
  ## if a.type.is_scalar then
    return (@vec2){x = a, y = a}
  ## elseif a.type.an_vec then
    return (@vec2){x = a.x, y = a.y}
  ## else static_error("invalid argument of type '%s'", a.type) end
end

-- Converts `a` to a `vec3`.
-- A scalar fills all components; a 2 component vector gets `z = 1.0`.
global function tovec3(a: auto): vec3 <inline,nosideeffect>
  ## if a.type.is_scalar then
    return (@vec3){x = a, y = a, z = a}
  ## elseif a.type.an_vec2 then
    return (@vec3){x = a.x, y = a.y, z = 1.0}
  ## elseif a.type.an_vec3 then
    return (@vec3){x = a.x, y = a.y, z = a.z}
  ## else static_error("invalid argument of type '%s'", a.type) end
end

-- Converts `a` to a `vec4`.
-- A scalar fills all components; missing components of shorter vectors
-- are set to 1.0.
global function tovec4(a: auto): vec4 <inline,nosideeffect>
  ## if a.type.is_scalar then
    return (@vec4){x = a, y = a, z = a, w = a}
  ## elseif a.type.an_vec2 then
    return (@vec4){x = a.x, y = a.y, z = 1.0, w = 1.0}
  ## elseif a.type.an_vec3 then
    return (@vec4){x = a.x, y = a.y, z = a.z, w = 1.0}
  ## elseif a.type.an_vec4 then
    return (@vec4){x = a.x, y = a.y, z = a.z, w = a.w}
  ## else static_error("invalid argument of type '%s'", a.type) end
end

-- Converts `a` to an `ivec2`, casting every component to `int`.
-- A scalar fills both components; any vector contributes its `x` and `y`.
global function toivec2(a: auto): ivec2 <inline,nosideeffect>
  ## if a.type.is_scalar then
    local v: int = int(a)
    return (@ivec2){x = v, y = v}
  ## elseif a.type.an_vec then
    return (@ivec2){x = int(a.x), y = int(a.y)}
  ## else static_error("invalid argument of type '%s'", a.type) end
end

-- Converts `a` to an `ivec3`, casting every component to `int`.
-- A scalar fills all components; a 2 component vector gets `z = 255`;
-- any other vector contributes its `x`, `y` and `z`.
global function toivec3(a: auto): ivec3 <inline,nosideeffect>
  ## if a.type.is_scalar then
    local v: int = int(a)
    return (@ivec3){x = v, y = v, z = v}
  ## elseif a.type.an_vec2 then
    return (@ivec3){x = int(a.x), y = int(a.y), z = 255}
  ## elseif a.type.an_vec then
    return (@ivec3){x = int(a.x), y = int(a.y), z = int(a.z)}
  ## else static_error("invalid argument of type '%s'", a.type) end
end

-- Converts `a` to an `ivec4`, casting every component to `int`.
-- A scalar fills all components; missing components of shorter vectors
-- are set to 255.
global function toivec4(a: auto): ivec4 <inline,nosideeffect>
  ## if a.type.is_scalar then
    local v: int = int(a)
    return (@ivec4){x = v, y = v, z = v, w = v}
  ## elseif a.type.an_vec2 then
    return (@ivec4){x = int(a.x), y = int(a.y), z = 255, w = 255}
  ## elseif a.type.an_vec3 then
    return (@ivec4){x = int(a.x), y = int(a.y), z = int(a.z), w = 255}
  ## elseif a.type.an_vec4 then
    return (@ivec4){x = int(a.x), y = int(a.y), z = int(a.z), w = int(a.w)}
  ## else static_error("invalid argument of type '%s'", a.type) end
end

-- Converts `a` to a `uvec2`, casting every component to `uint`.
-- A scalar fills both components; any vector contributes its `x` and `y`.
global function touvec2(a: auto): uvec2 <inline,nosideeffect>
  ## if a.type.is_scalar then
    local v: uint = uint(a)
    return (@uvec2){x = v, y = v}
  ## elseif a.type.an_vec then
    return (@uvec2){x = uint(a.x), y = uint(a.y)}
  ## else static_error("invalid argument of type '%s'", a.type) end
end

-- Converts `a` to a `uvec3`, casting every component to `uint`.
-- A scalar fills all components; a 2 component vector gets `z = 255`;
-- any other vector contributes its `x`, `y` and `z`.
global function touvec3(a: auto): uvec3 <inline,nosideeffect>
  ## if a.type.is_scalar then
    local v: uint = uint(a)
    return (@uvec3){x = v, y = v, z = v}
  ## elseif a.type.an_vec2 then
    return (@uvec3){x = uint(a.x), y = uint(a.y), z = 255}
  ## elseif a.type.an_vec then
    return (@uvec3){x = uint(a.x), y = uint(a.y), z = uint(a.z)}
  ## else static_error("invalid argument of type '%s'", a.type) end
end

-- Converts `a` to a `uvec4`, casting every component to `uint`.
-- A scalar fills all components; missing components of shorter vectors
-- are set to 255.
global function touvec4(a: auto): uvec4 <inline,nosideeffect>
  ## if a.type.is_scalar then
    local v: uint = uint(a)
    return (@uvec4){x = v, y = v, z = v, w = v}
  ## elseif a.type.an_vec2 then
    return (@uvec4){x = uint(a.x), y = uint(a.y), z = 255, w = 255}
  ## elseif a.type.an_vec3 then
    return (@uvec4){x = uint(a.x), y = uint(a.y), z = uint(a.z), w = 255}
  ## elseif a.type.an_vec4 then
    return (@uvec4){x = uint(a.x), y = uint(a.y), z = uint(a.z), w = uint(a.w)}
  ## else static_error("invalid argument of type '%s'", a.type) end
end

-- Converts `a` to a `bvec4`, casting every component to `byte`.
-- A scalar fills all components; missing components of shorter vectors
-- are set to 255.
global function tobvec4(a: auto): bvec4 <inline,nosideeffect>
  ## if a.type.is_scalar then
    local v: byte = byte(a)
    return (@bvec4){x = v, y = v, z = v, w = v}
  ## elseif a.type.an_vec2 then
    return (@bvec4){x = byte(a.x), y = byte(a.y), z = 255, w = 255}
  ## elseif a.type.an_vec3 then
    return (@bvec4){x = byte(a.x), y = byte(a.y), z = byte(a.z), w = 255}
  ## elseif a.type.an_vec4 then
    return (@bvec4){x = byte(a.x), y = byte(a.y), z = byte(a.z), w = byte(a.w)}
  ## else static_error("invalid argument of type '%s'", a.type) end
end

-- Converts `a` into a `bvec4` pixel.
-- Float inputs (scalar, `vec3`, `vec4`) are multiplied by 255 and cast to
-- bytes; byte inputs are copied unchanged. When the input has no fourth
-- component the `w` channel is set to 255. A scalar is written to `x`, `y`
-- and `z` alike.
global function topixel(a: auto): bvec4 <inline,nosideeffect>
  ## if a.type.is_float then
    local gray: byte = (@byte)(a * 255.0)
    return bvec4{gray, gray, gray, 255}
  ## elseif a.type.is_uint8 then
    return bvec4{a, a, a, 255}
  ## elseif a.type.is_bvec3 then
    return bvec4{a.x, a.y, a.z, 255}
  ## elseif a.type.is_vec3 then
    local scaled: vec3 = a * 255.0
    return bvec4{(@byte)(scaled.x), (@byte)(scaled.y), (@byte)(scaled.z), 255}
  ## elseif a.type.is_vec4 then
    local scaled: vec4 = a * 255.0
    return bvec4{(@byte)(scaled.x), (@byte)(scaled.y), (@byte)(scaled.z), (@byte)(scaled.w)}
  ## elseif a.type.is_bvec4 then
    return a
  ## else static_error("invalid argument of type '%s'", a.type) end
end

##[[
do
  -- Color-style field names and the positional field each one mirrors.
  local rgba_aliases = {r = 'x', g = 'y', b = 'z', a = 'w'}

  -- Makes `r`, `g`, `b`, `a` refer to the same field entries as
  -- `x`, `y`, `z`, `w` in the record behind `recordsym`.
  -- A positional field that does not exist leaves its alias unset.
  local function alias_rgba_fields(recordsym)
    local fields = recordsym.value.fields
    for alias,fieldname in pairs(rgba_aliases) do
      fields[alias] = fields[fieldname]
    end
  end

  for _,recordsym in ipairs{vec3, vec4, ivec3, ivec4, uvec3, uvec4, bvec3, bvec4} do
    alias_rgba_fields(recordsym)
  end
end

do
  -- Operator metamethods shared by every vector type; each one maps to the
  -- global function of the same purpose.
  local vecmethods = {
    __add = add,
    __sub = sub,
    __mul = mul,
    __idiv = idiv,
    __tdiv = tdiv,
    __div = div,
    __mod = mod,
    __pow = pow,
    __unm = unm,
    __bnot = bnot,
    __shl = shl,
    __shr = shr,
    __asr = asr,
    __band = band,
    __bor = bor,
    __bxor = bxor,
    __len = length,
  }

  -- Vector types that receive the metamethods above.
  local vecsyms = {
    vec2, vec3, vec4,
    ivec2, ivec3, ivec4,
    uvec2, uvec3, uvec4,
    bvec3, bvec4,
  }

  for _,vecsym in ipairs(vecsyms) do
    local metafields = vecsym.value.metafields
    for key,method in pairs(vecmethods) do
      metafields[key] = method
    end
  end
end

-- Installs lazily generated swizzle methods (e.g. `v:xy()`, `v:rgb()`) on `vectype`.
-- A metatable `__index` hook is attached to `vectype.metafields`: when a
-- metafield lookup misses and the name is made only of the letters
-- x, y, z, w, r, g, b, a, a method with that name is defined on the type and
-- the freshly defined metafield is returned.
-- The generated method returns the named component for 1 letter names, or a
-- 2, 3 or 4 component vector built from the named components in order.
-- Names longer than 4 letters produce a method with an empty body.
-- The `skip` flag makes the hook return nil for lookups that happen while the
-- hook itself is still running.
-- NOTE(review): `hygienize` is defined elsewhere; presumably it lets the
-- callback emit Nelua code in the scope where it was created (confirm).
local function auto_swizzling_methods(vectype)
  local skip = false
  setmetatable(vectype.metafields, {
    __index = hygienize(function(_, name)
      if skip then return end
      skip = true
      local method
      if name:match('^[xyzwrgba]+$') then
      ]]
        local vecT = #[vectype]#
        function vecT.#|name|#(self: vecT) <inline,nosideeffect>
          ## if #name == 4 then
            return #[choose_vec4_type(self,self)]#{self.#|name:sub(1,1)|#, self.#|name:sub(2,2)|#, self.#|name:sub(3,3)|#, self.#|name:sub(4,4)|#};
          ## elseif #name == 3 then
            return #[choose_vec3_type(self,self)]#{self.#|name:sub(1,1)|#, self.#|name:sub(2,2)|#, self.#|name:sub(3,3)|#};
          ## elseif #name == 2 then
            return #[choose_vec2_type(self,self)]#{self.#|name:sub(1,1)|#, self.#|name:sub(2,2)|#};
          ## elseif #name == 1 then
            return self.#|name:sub(1,1)|#;
          ## end
        end
      ##[[
        method = vectype.metafields[name]
      end
      skip = false
      return method
    end)
  })
end
-- Install the swizzle generator on every vector type.
for _,vecsym in ipairs{ivec2, ivec3, ivec4, uvec2, uvec3, uvec4, vec2, vec3, vec4, bvec3, bvec4} do
  auto_swizzling_methods(vecsym.value)
end
]]

-- Mathematical constants, computed by the preprocessor at compile time.
global PI: number <comptime> = #[math.pi]# -- pi
global DEG2RAD: number <comptime> = #[math.pi/180]# -- pi/180
global TAU: number <comptime> = #[2*math.pi]# -- 2*pi
global E: number <comptime> = #[math.exp(1)]# -- Euler's number, exp(1)
global PHI: number <comptime> = #[(1+math.sqrt(5))/2]# -- (1+sqrt(5))/2
global SQRT2: number <comptime> = #[math.sqrt(2)]# -- sqrt(2)
global SQRT1_2: number <comptime> = #[1/math.sqrt(2)]# -- 1/sqrt(2)

-- Divides `a` by its own `length`.
global function normalize(a: auto) <inline,nosideeffect>
  local len = length(a)
  return a / len
end

-- Linear interpolation between `a` and `b`: `a*(1-t) + b*t`.
-- The literal `1` is written as a float only when `t` is a float scalar or
-- float vector.
global function mix(a: auto, b: auto, t: auto) <inline,nosideeffect>
  ## if not (t.type.an_fvec or t.type.is_float) then
  return a * (1-t) + b * t
  ## else
  return a * (1.0-t) + b * t
  ## end
end

-- Limits `a` to the range given by `minv` and `maxv`:
-- first raised to at least `minv`, then lowered to at most `maxv`.
global function clamp(a: auto, minv: auto, maxv: auto) <inline,nosideeffect>
  local raised = max(a, minv)
  return min(raised, maxv)
end

-- Cubic Hermite step: maps `x` from `edge0`..`edge1` to 0..1 (clamped) and
-- applies `t*t*(3 - 2*t)`.
global function smoothstep(edge0: auto, edge1: auto, x: auto) <inline,nosideeffect>
  local s = clamp((x - edge0) / (edge1 - edge0), 0.0, 1.0)
  local weight = 3.0 - 2.0 * s
  return s * s * weight
end

-- Cross product of two 3 component vectors.
global function cross(a: vec3, b: vec3): vec3 <inline,nosideeffect>
  return (@vec3){
    x = a.y*b.z - a.z*b.y,
    y = a.z*b.x - a.x*b.z,
    z = a.x*b.y - a.y*b.x,
  }
end

-- Extensions
-- Quintic step: maps `x` from `edge0`..`edge1` to 0..1 (clamped) and applies
-- `6t^5 - 15t^4 + 10t^3`, evaluated as `t*t*t*(t*(t*6 - 15) + 10)`.
-- There is a single definition on purpose: an earlier duplicate carried the
-- cubic `smoothstep` formula under this name.
global function smootherstep(edge0: auto, edge1: auto, x: auto) <inline,nosideeffect>
  local t = clamp((x - edge0) / (edge1 - edge0), 0.0, 1.0)
  return t * t * t * (t * (t * 6.0 - 15.0) + 10.0)
end

-- 2D cross product: `a.x*b.y - a.y*b.x`.
global function cross2(a: vec2, b: vec2): float <inline,nosideeffect>
  local lhs = a.x*b.y
  local rhs = a.y*b.x
  return lhs - rhs
end

-- Linear interpolation written as `a + (b-a)*t`.
global function fastmix(a: auto, b: auto, t: auto) <inline,nosideeffect>
  local delta = b - a
  return a + delta * t
end

-- Maps `x` linearly from `a`..`b` to 0..1, clamped to that range.
global function linearstep(a: auto, b: auto, x: auto) <inline,nosideeffect>
  local ratio = (x-a)/(b-a)
  return clamp(ratio, 0.0, 1.0)
end

-- Reflects `i` about `n`: `i - 2*dot(n, i)*n`.
global function reflect(i: auto, n: auto) <inline,nosideeffect>
  local twice_proj = 2.0*dot(n, i)
  return i - twice_proj*n
end

-- Returns `sin(x)` in the first component and `cos(x)` in the second.
global function sincos(x: float): vec2 <inline,nosideeffect>
  return (@vec2){sin(x), cos(x)}
end

-- Applies `fastsin` to `x` and to `x + pi/2` in a single vector call.
global function fastsincos(x: float): vec2 <inline,nosideeffect>
  local angles: vec2 = {x, x+#[math.pi/2]#}
  return fastsin(angles)
end

-- Applies `qsin` to `x` and to `x + pi/2` in a single vector call.
global function qsincos(x: float): vec2 <inline,nosideeffect>
  local angles: vec2 = {x, x+#[math.pi/2]#}
  return qsin(angles)
end

-- Square of `a`.
global function pow2(a: auto) <inline,nosideeffect>
  local squared = a * a
  return squared
end

-- Cube of `a`.
global function pow3(a: auto) <inline,nosideeffect>
  local squared = a * a
  return squared * a
end

-- Dot product with the second term subtracted: `a.x*b.x - a.y*b.y`.
global function ndot(a: vec2, b: vec2) <inline,nosideeffect>
  local xx = a.x*b.x
  local yy = a.y*b.y
  return xx - yy
end

-- Dot product of `a` with itself.
global function dot2(a: auto) <inline,nosideeffect>
  local self_dot = dot(a, a)
  return self_dot
end

-- Length of `p` under exponent `n`: raises `abs(p)` to `n` with `fastpow`,
-- combines the components with `dot(..., 1.0)`, then takes the `1/n` power.
global function length_n(p: auto, n: float): float <inline,nosideeffect>
  local powered = fastpow(abs(p), n)
  local total = dot(powered, 1.0)
  return fastpow(total, 1.0/n)
end

-- Returns `sqrt(x*x + n)`.
-- Reference: https://www.iquilezles.org/www/articles/functions/functions.htm
global function almost_identity3(x: auto, n: auto) <inline,nosideeffect>
  local squared = x*x
  return sqrt(squared+n)
end

-- Applies `math.floatbits2uint` to a scalar, or to each component of a
-- `vec2`/`vec3` (returning `uvec2`/`uvec3`).
-- Any other argument type is a compile-time error.
global function floatbits2uint(a: auto) <inline,nosideeffect>
  local f <comptime> = math.floatbits2uint
  ## if a.type.is_scalar then
    return f(a)
  ## elseif a.type.is_vec2 then
    return uvec2{f(a.x), f(a.y)}
  ## elseif a.type.is_vec3 then
    return uvec3{f(a.x), f(a.y), f(a.z)}
  ## else static_error("invalid argument of type '%s'", a.type) end
end

-- Applies `math.floatbits2int` to a scalar, or to each component of a
-- `vec2`/`vec3` (returning `ivec2`/`ivec3`).
-- Any other argument type is a compile-time error.
global function floatbits2int(a: auto) <inline,nosideeffect>
  local f <comptime> = math.floatbits2int
  ## if a.type.is_scalar then
    return f(a)
  ## elseif a.type.is_vec2 then
    return ivec2{f(a.x), f(a.y)}
  ## elseif a.type.is_vec3 then
    return ivec3{f(a.x), f(a.y), f(a.z)}
  ## else static_error("invalid argument of type '%s'", a.type) end
end

-- Returns `b + (1-b)*x`.
global function bias(x: auto, b: float) <inline,nosideeffect>
  local scale = 1.0-b
  return b + scale*x
end

-- 2x2 float matrix, indexed as `m[row][column]` by `mat2.__mul`.
-- The storage is 2x2: `tomat2` fills two rows of two values and `mat2.__mul`
-- only reads indices 0 and 1. It was previously declared 3x3.
global mat2 <aligned(16)> = @record{
  m: [2][2]float
}

-- 3x3 float matrix, indexed as `m[row][column]` by `mat3.__mul`.
global mat3 <aligned(16)> = @record{
  m: [3][3]float
}

-- Compile-time traits read by generic code such as `transpose`:
-- `is_mat` marks a matrix type and `N`/`M` give its dimensions.
-- Each line must target its own type; the `mat2` line used to write the
-- `is_mat`, `N` and `M` traits onto `mat3`, leaving `mat2` without them.
## mat2.value.is_mat2 = true mat2.value.is_mat = true mat2.value.N = 2 mat2.value.M = 2
## mat3.value.is_mat3 = true mat3.value.is_mat = true mat3.value.N = 3 mat3.value.M = 3

-- Builds a `mat3` with the scalar `a` on the diagonal and zeros elsewhere.
-- Only scalar arguments have a branch.
global function tomat3(a: auto): mat3 <inline>
  ## if a.type.is_scalar then
    return (@mat3){m = {
      {  a, 0.0, 0.0},
      {0.0,   a, 0.0},
      {0.0, 0.0,   a},
    }}
  ## end
end

-- Builds a `mat2`:
-- * from a scalar: `a` on the diagonal and zeros elsewhere;
-- * from a `mat3`: its upper left 2x2 entries;
-- * from a `mat2`: the argument itself.
global function tomat2(a: auto): mat2 <inline>
  ## if a.type.is_scalar then
    return (@mat2){m = {
      {  a, 0.0},
      {0.0,   a},
    }}
  ## elseif a.type.is_mat3 then
    return (@mat2){m = {
      {a.m[0][0], a.m[0][1]},
      {a.m[1][0], a.m[1][1]},
    }}
  ## elseif a.type.is_mat2 then
    return a
  ## end
end

-- Transposes a matrix; a scalar is returned unchanged.
-- The element copies are unrolled at compile time from the type's `N` and `M`.
global function transpose(a: auto) <inline>
  ## if a.type.is_scalar then
    return a
  ## elseif a.type.is_mat then
    local result: #[a.type]# <noinit>
    ## for row=0,a.type.N-1 do
      ## for col=0,a.type.M-1 do
        result.m[#[row]#][#[col]#] = a.m[#[col]#][#[row]#]
      ## end
    ## end
    return result
  ## end
end

-- Multiplies a `mat3` by:
-- * a 3 component vector: matrix times column vector, returned as `vec3`;
-- * a 2 component vector: the first two rows applied to `(b.x, b.y, 1)`,
--   returned as `vec2`;
-- * another `mat3`: matrix product.
-- Any other operand is a compile-time error.
function mat3.__mul(a: mat3, b: auto) <inline>
  ## if b.type.an_vec3 then
    return (@vec3){
      x = a.m[0][0]*b.x + a.m[0][1]*b.y + a.m[0][2]*b.z,
      y = a.m[1][0]*b.x + a.m[1][1]*b.y + a.m[1][2]*b.z,
      z = a.m[2][0]*b.x + a.m[2][1]*b.y + a.m[2][2]*b.z,
    }
  ## elseif b.type.an_vec2 then
    return (@vec2){
      x = a.m[0][0]*b.x + a.m[0][1]*b.y + a.m[0][2],
      y = a.m[1][0]*b.x + a.m[1][1]*b.y + a.m[1][2],
    }
  ## elseif b.type.is_mat3 then
    -- starts zero initialized, then accumulates row times column products
    local product: mat3
    ## for row=0,2 do
      ## for col=0,2 do
        ## for k=0,2 do
          product.m[#[row]#][#[col]#] = product.m[#[row]#][#[col]#] + a.m[#[row]#][#[k]#] * b.m[#[k]#][#[col]#]
        ## end
      ## end
    ## end
    return product
  ## else static_error('invalid') end
end

-- Multiplies a `mat2` by a 2 component vector (returned as `vec2`) or by
-- another `mat2` (matrix product). Any other operand is a compile-time error.
function mat2.__mul(a: mat2, b: auto) <inline>
  ## if b.type.an_vec2 then
    return (@vec2){
      x = a.m[0][0]*b.x + a.m[0][1]*b.y,
      y = a.m[1][0]*b.x + a.m[1][1]*b.y,
    }
  ## elseif b.type.is_mat2 then
    -- starts zero initialized, then accumulates row times column products
    local product: mat2
    ## for row=0,1 do
      ## for col=0,1 do
        ## for k=0,1 do
          product.m[#[row]#][#[col]#] = product.m[#[row]#][#[col]#] + a.m[#[row]#][#[k]#] * b.m[#[k]#][#[col]#]
        ## end
      ## end
    ## end
    return product
  ## else static_error('invalid') end
end


-- Smooth minimum of `a` and `b` with blend size `k`.
-- Reference: https://www.iquilezles.org/www/articles/smin/smin.htm
global function smin(a: float, b: float, k: float): float <inline,nosideeffect>
  local h = max(k - abs(a-b), 0.0)/k
  return min(a, b) - pow2(h)*k*0.25
end

-- Onion (exact): `abs(d) - w`.
-- Reference: https://www.iquilezles.org/www/articles/distfunctions/distfunctions.htm
global function onion(d: float, w: float) <inline,nosideeffect>
  local dist = abs(d)
  return dist - w
end

-- Returns `abs(d + hw) - hw`.
global function inline1(d: float, hw: float) <inline,nosideeffect>
  local shifted = d + hw
  return abs(shifted) - hw
end

-- Returns `abs(d - hw) - hw`.
global function outline1(d: float, hw: float) <inline,nosideeffect>
  local shifted = d - hw
  return abs(shifted) - hw
end

-- Returns `abs(d - 2*w) - w`.
global function outline(d: float, w: float) <inline,nosideeffect>
  local shifted = d - 2*w
  return abs(shifted) - w
end

-- Subtraction (bound): `max(-b, a)`.
-- Reference: https://www.iquilezles.org/www/articles/distfunctions/distfunctions.htm
global function subtract(a: float, b: float): float <inline,nosideeffect>
  local negated = -b
  return max(negated, a)
end

-- Elongation (exterior exact, interior incorrect)
-- For 1D elongation gives exact exterior and interior distances.
-- However, 2D/3D elongations produce a core of zero distances in the interior.
-- Reference: https://www.iquilezles.org/www/articles/distfunctions/distfunctions.htm
global function elongate(p: auto, h: auto) <inline,nosideeffect>
  local limited = clamp(p, -h, h)
  return p - limited
end

-- Elongation (exact)
-- For 1D elongation gives exact exterior and interior distances.
-- Returns two values: the elongated position and a correction that must be
-- summed into the distance.
-- Reference: https://www.iquilezles.org/www/articles/distfunctions/distfunctions.htm
global function elongate_exact(p: auto, h: auto) <inline,nosideeffect>
  local q: #[p.type]# = abs(p) - h
  ## if p.type.is_scalar then
    return max(q, 0.0), min(q, 0.0)
  ## elseif p.type.is_vec2 then
    return max(q, 0.0), min(math.max(q.x, q.y), 0.0)
  ## elseif p.type.is_vec3 then
    return max(q, 0.0), min(math.max(q.x, q.y, q.z), 0.0)
  ## else static_error("invalid argument of type '%s'", p.type) end
end

-- Rotates `p` by an angle given either as a float (converted with
-- `fastsincos`) or as a precomputed `vec2` holding (sin, cos).
global function rotate2(p: vec2, sc: overload(float, vec2)): vec2 <inline,nosideeffect>
  ## if sc.type.is_float then
  local sc: vec2 = fastsincos(sc)
  ## end
  local s, c = sc.x, sc.y
  return (@vec2){p.x*c - p.y*s, p.x*s + p.y*c}
end

--[[
TODO:
* use a namespace
* scale
* rename abs to sym
* infinite repetition
*  finite repetition
* bend? distorts?
]]

-- Mirror X axis: replaces `p.x` with its absolute value.
global function mirrorx(p: auto) <inline,nosideeffect>
  local q = p
  q.x = abs(q.x)
  return q
end

-- Mirror Y axis: replaces `p.y` with its absolute value.
global function mirrory(p: auto) <inline,nosideeffect>
  local q = p
  q.y = abs(q.y)
  return q
end

-- Mirror following a normal vector.
-- `n` is either the normal itself or a float angle converted with `fastsincos`.
-- Only points with a negative `dot(p, n)` are moved.
global function mirror(p: vec2, n: overload(float, vec2)): vec2 <inline,nosideeffect>
  ## if n.type.is_float then
  local n: vec2 = fastsincos(n)
  ## end
  local depth = min(2.0*dot(p, n), 0.0)
  return p - n*depth
end

-- Mirror at point `o` following a normal vector.
-- `n` is either the normal itself or a float angle converted with `fastsincos`.
-- Only points with a negative `dot(p - o, n)` are moved.
global function mirror_at(p: vec2, o: vec2, n: overload(float, vec2)): vec2 <inline,nosideeffect>
  ## if n.type.is_float then
  local n: vec2 = fastsincos(n)
  ## end
  local depth = min(2.0*dot(p - o, n), 0.0)
  return p - n*depth
end

-- Flip X axis: negates `p.x`.
global function flipx(p: auto) <inline,nosideeffect>
  local q = p
  q.x = -q.x
  return q
end

-- Flip Y axis: negates `p.y`.
global function flipy(p: auto) <inline,nosideeffect>
  local q = p
  q.y = -q.y
  return q
end

-- Flip XY axis: negates both components, returning a `vec2`.
global function flipxy(p: auto) <inline,nosideeffect>
  return (@vec2){x = -p.x, y = -p.y}
end

-- Rotate right 90 degrees: maps `(x, y)` to `(-y, x)`.
-- This is the exact inverse of `rol90`, which maps `(x, y)` to `(y, -x)`.
-- The previous body returned `(y, x)`, which swaps the axes (a reflection
-- about the diagonal) instead of rotating.
global function ror90(p: auto) <inline,nosideeffect>
  return vec2{-p.y, p.x}
end

-- Rotate left 90 degrees: maps `(x, y)` to `(y, -x)`.
global function rol90(p: auto) <inline,nosideeffect>
  return (@vec2){x = p.y, y = -p.x}
end

-- Repeats `p` with period `c`: `mod(p + c/2, c) - c/2`.
global function rep(p: auto, c: float) <inline,nosideeffect>
  local half = 0.5*c
  return mod(p+half,c)-half
end

-- Repeats `p` with period `c`, limiting the cell index `round(p/c)` to the
-- range `-l`..`l`.
global function replim(p: auto, c: float, l: auto) <inline,nosideeffect>
  local cell = clamp(round(p/c),-l,l)
  return p-c*cell
end

-- Blends `src` over `dst` with weight `sa`: `sa*src + (1-sa)*dst`.
-- The result is converted with `tobvec4` when `dst` is a `bvec4`.
global function blendalpha(src: auto, dst: auto, sa: float) <inline, nosideeffect>
  local blended = sa*src + (1.0-sa)*dst
  ## if not dst.type.is_bvec4 then
    return blended
  ## else
    return tobvec4(blended)
  ## end
end

-- Blends byte pixel `src` over `dst` using the alpha of `src`, in integer
-- arithmetic where every term is scaled by 255 and divided back at the end.
-- Colors:  (src.c*sa + dst.c*(255-sa)) / 255
-- Alpha:   (sa*255   + dst.a*(255-sa)) / 255
-- The source alpha term must carry the same 255 scale as the other terms;
-- without it an opaque source over an opaque destination produced alpha 1
-- instead of 255. This matches the float version `blend2`.
global function blend(src: bvec4, dst: bvec4): bvec4 <inline, nosideeffect>
  local sa: int = src.a
  local s: ivec4 = ivec4{src.r*sa, src.g*sa, src.b*sa, sa*255}
  local sa1: int = 255 - sa
  local d: ivec4 = toivec4(dst * sa1)
  return (@bvec4){
    r = (@byte)((s.r + d.r) /// 255),
    g = (@byte)((s.g + d.g) /// 255),
    b = (@byte)((s.b + d.b) /// 255),
    a = (@byte)((s.a + d.a) /// 255),
  }
end

-- Blends float pixel `src` over `dst` using the alpha of `src`:
-- the colors of `src` are weighted by its alpha, `dst` (all four channels) by
-- one minus that alpha, and the two are summed.
global function blend2(src: vec4, dst: vec4): vec4 <inline, nosideeffect>
  local alpha: float = src.a
  local inv_alpha: float = 1.0 - alpha
  local front: vec4 = {src.r*alpha, src.g*alpha, src.b*alpha, alpha}
  local back: vec4 = dst * inv_alpha
  return front + back
end

-- Transform coordinate system from Right Down to Normalized Centered Right Up.
-- Both axes are scaled by `1/size.y`, and the Y axis is negated.
global function rd2ncru(coord: auto, size: auto): vec2 <inline, nosideeffect>
  local inv_h: float = 1.0/size.y
  local scale: vec2 = vec2{inv_h, -inv_h}
  return ((1.0 - size) + 2.0*coord)*scale
end

-- Modified IQ's palette: `a + b*qsin(2*PI*(c*abs(t) - d))`.
-- Reference: http://iquilezles.org/www/articles/palettes/palettes.htm
global function qsinpal(t: float, a: auto, b: auto, c: auto, d: auto): vec3 <inline, nosideeffect>
  local phase = 2.0*PI*(c*abs(t)-d)
  return a + b*qsin(phase)
end

-- Returns `(0.433198*x*x)*(sqrt(x) + 1.30841)`.
-- NOTE(review): judging by the name, presumably an approximation of x^2.2; confirm.
-- Reference: https://www.shadertoy.com/view/WlG3zG
global function pow22(x: auto) <inline, nosideeffect>
  local poly = 0.433198*x*x
  return poly*(sqrt(x) + 1.30841)
end

-- Returns `1.12661*sqrt(x) - 0.126615*x`.
-- NOTE(review): judging by the name, presumably an approximation of x^(1/2.2); confirm.
-- Reference: https://www.shadertoy.com/view/WlG3zG
global function rpow22(x: auto) <inline, nosideeffect>
  local root_term = 1.12661*sqrt(x)
  return root_term - 0.126615*x
end


-- Signed 2D distance functions.
-- Empty record used only as a namespace: every function below is attached to
-- it as `sdf2.<shape>`.
global sdf2 = @record{}

-- Circle.
-- Origin is at the geometric center.
-- `r` is the circle radius.
-- Reference: https://www.shadertoy.com/view/3ltSW2
function sdf2.circle(p: vec2, r: float): float <inline>
  local center_dist = length(p)
  return center_dist - r
end

-- Segment
-- Origin is at the geometric center.
-- Parameter `b` is the segment half vector: the segment spans `-b` to `b`.
function sdf2.segment(p: vec2, b: vec2): float <inline>
  -- shift so the segment starts at the origin and ends at 2*b
  local q: vec2 = p + b
  local h: float = clamp(dot(q,b)/dot(b,b), 0.0, 2.0)
  return length(q - b*h)
end

-- Segment At
-- Determined by 2 vertices `a` and `b`.
-- Reference: https://www.shadertoy.com/view/3tdSDj
function sdf2.segment_at(p: vec2, a: vec2, b: vec2): float <inline>
  local to_p: vec2 = p-a
  local edge: vec2 = b-a
  -- position of the closest point along the edge, limited to the segment
  local h: float = clamp(dot(to_p,edge)/dot2(edge), 0.0, 1.0)
  return length(to_p - edge*h)
end

-- Segment Path
-- Determined by multiple vertices: the result is the minimum of
-- `sdf2.segment_at` over each pair of consecutive vertices.
function sdf2.segment_path(p: vec2, ...: varargs): float <inline>
  ## local count = select('#', ...)
  local nearest: float = sdf2.segment_at(p, #[select(1, ...)]#, #[select(2, ...)]#)
  ## for i=2,count-1 do
    nearest = min(nearest, sdf2.segment_at(p, #[select(i, ...)]#, #[select(i+1, ...)]#))
  ## end
  return nearest
end

-- Box
-- Origin is at the geometric center.
-- Parameter `r` is the box half width and half height.
-- Reference: https://www.youtube.com/watch?v=62-pRVZuS5c
function sdf2.box(p: vec2, r: vec2): float <inline>
  local q: vec2 = abs(p)-r
  local outer = length(max(q,0.0))
  local inner = min(max(q.x,q.y),0.0)
  return outer + inner
end

-- Parallelogram
-- Origin is at the geometric center.
-- Parameter `r` is the parallelogram half base and half height.
-- Parameter `rs` is the parallelogram half skew, 0 makes it a box.
-- Reference: https://www.shadertoy.com/view/7dlGRf
function sdf2.parallelogram(p: vec2, r: vec2, rs: float): float <inline>
  local e: vec2 = vec2{rs, r.y}
  -- multiply by `signb(p.y)` (the reference flips `p` when `p.y` is negative)
  p = signb(p.y)*p
  -- horizontal edge
  local w: vec2 = p - e
  w.x = w.x - clamp(w.x, -r.x, r.x)
  -- `d.x` holds a squared distance, `d.y` a value whose sign picks the side
  local d: vec2 = vec2{dot2(w), -w.y}
  -- vertical edge
  local s: float = p.x*e.y - p.y*e.x
  p = signb(s)*p
  local v: vec2 = p - vec2{r.x,0}
  v = v - e*clamp(dot(v,e)/dot2(e), -1.0, 1.0)
  d = min(d, vec2{dot2(v), r.x*r.y-abs(s)})
  return sqrt(d.x)*sign(-d.y)
end

-- Vertical Stick
-- Origin is at the geometric center.
-- Parameter `r` is the stick half height.
-- Reference: https://www.youtube.com/watch?v=62-pRVZuS5c
function sdf2.vertical_stick(p: vec2, r: float): float <inline>
  local qx: float = abs(p.x)
  local qy: float = abs(p.y) - r
  local outer = length(vec2{qx, max(qy,0.0)})
  local inner = min(max(qx,qy),0.0)
  return outer + inner
end

-- Rhombus
-- Origin is at the geometric center.
-- Parameter `b` is the rhombus half width and half height.
-- Reference: https://www.shadertoy.com/view/XdXcRB
function sdf2.rhombus(p: vec2, b: vec2): float <inline>
  local q: vec2 = abs(p)
  local h: float = clamp((ndot(b,b) - 2.0*ndot(q,b))/dot(b,b), -1.0, 1.0)
  local d: float = length(q - b*vec2{(1.0-h)/2.0,(1.0+h)/2.0})
  -- the sign of this expression selects inside (negative) or outside
  local side = q.x*b.y + q.y*b.x - b.x*b.y
  return d * sign(side)
end

-- Triangle
-- Determined by 3 vertices.
-- Reference: https://www.shadertoy.com/view/XsXSz4
function sdf2.triangle_at(p: vec2, A: vec2, B: vec2, C: vec2): float <inline>
  -- edge vectors
  local e0: vec2 = B - A
  local e1: vec2 = C - B
  local e2: vec2 = A - C
  -- vectors from each vertex to `p`
  local v0: vec2 = p - A
  local v1: vec2 = p - B
  local v2: vec2 = p - C
  -- vectors from the closest point on each edge to `p`
  local pq0: vec2 = v0 - e0*clamp(dot(v0,e0)/dot(e0,e0), 0.0, 1.0)
  local pq1: vec2 = v1 - e1*clamp(dot(v1,e1)/dot(e1,e1), 0.0, 1.0)
  local pq2: vec2 = v2 - e2*clamp(dot(v2,e2)/dot(e2,e2), 0.0, 1.0)
  -- 2D cross product of `e0` and `e2`, used to scale the side tests below
  local s: float = e0.x*e2.y - e0.y*e2.x
  -- component-wise minimum of (squared distance, side test) over the 3 edges
  local d: vec2 = min(min(vec2{dot(pq0, pq0), s*(v0.x*e0.y-v0.y*e0.x)},
                          vec2{dot(pq1, pq1), s*(v1.x*e1.y-v1.y*e1.x)}),
                          vec2{dot(pq2, pq2), s*(v2.x*e2.y-v2.y*e2.x)})
  return -sqrt(d.x)*sign(d.y)
end

-- Isosceles Triangle.
-- Origin is at the geometric center.
-- Parameter `r` is the triangle half base (`r.x`) and half height (`r.y`).
-- NOTE(review): "half height" is inferred from `r.y` being doubled below; confirm.
-- Reference: https://www.shadertoy.com/view/MldcD7
function sdf2.isosceles_triangle(p: vec2, r: vec2): float <inline>
  -- `r.y` becomes minus twice its input value
  r.y = -2.0*r.y
  -- fold around the Y axis and shift `p` vertically by 2/3 of the new `r.y`
  p = vec2{abs(p.x), p.y + r.y*(2.0/3.0)}
  local a: vec2 = p - r*clamp(dot(p,r)/dot(r,r), 0.0, 1.0)
  local b: vec2 = p - r*vec2{clamp(p.x/r.x, 0.0, 1.0), 1.0}
  local k: float = sign(r.y)
  -- smallest squared distance of the two candidates
  local d: float = min(dot(a, a), dot(b, b))
  -- the sign of `s` becomes the sign of the result
  local s: float = max(k*(p.x*r.y-p.y*r.x), k*(p.y-r.y))
  return sqrt(d)*sign(s)
end

-- Equilateral Triangle
-- Origin is at the geometric center.
-- Parameter `r` is the triangle half base.
-- Reference: https://www.shadertoy.com/view/Xl2yDW
function sdf2.equilateral_triangle(p: vec2, r: float): float <inline>
  local k: float <comptime> = #[math.sqrt(3.0)]#
  local q: vec2 = vec2{abs(p.x) - r, p.y + r/k}
  local kqy: float = k*q.y
  if q.x+kqy > 0.0 then
    q = vec2{(q.x-kqy)/2.0, (-k*q.x-q.y)/2.0}
  end
  q.x = q.x - clamp(q.x, -2.0*r, 0.0)
  return -length(q)*sign(q.y)
end

-- Ellipse
-- Origin is at the geometric center.
-- Parameter `r` is the ellipse horizontal and vertical radius.
-- Reference: https://www.shadertoy.com/view/tttfzr
function sdf2.ellipse(p: vec2, r: vec2): float <inline>
  p = abs(p) -- symmetry
  -- determine in/out and initial value
  -- `s` is 1.0 when `dot2(p/r) > 1.0` (outside) and -1.0 otherwise
  local s: float = dot2(p/r) > 1.0 and 1.0 or -1.0
  -- `cs` is the current guess as a (cos, sin) pair; it is normalized below
  local cs: vec2 = (do
    if s > 0.0 then
      in vec2{r.y*p.x, r.x*p.y}
    elseif r.x*(p.x-r.x) - r.y*(p.y-r.y) < 0.0 then
      in vec2{0.01, 1.0}
    else
      in vec2{1.0, 0.01}
    end
  end)
  cs = normalize(cs)
  -- find root with Newton solver
  -- the 4 iterations are unrolled at compile time
  local u: vec2 <noinit>
  local v: vec2 <noinit>
  ## for i=1,4 do
    do
      u = r*cs
      v = r*vec2{-cs.y,cs.x}
      local pu: vec2 = p - u
      local a: float = dot(pu, v)
      local c: float = dot(pu, u) + dot(v,v)
      local b: float = sqrt(c*c-a*a)
      cs = vec2{cs.x*b-cs.y*a, cs.y*b+cs.x*a}/c
    end
  ## end
  -- compute final point and distance
  return length(p-r*cs) * s
end

-- Ellipsoid (approximated, fast)
-- Origin is at the geometric center.
-- Parameter `r` is the ellipsoid horizontal and vertical radius.
-- Parameter `n` is the roundness of the ellipse, must be in [1,inf] range.
-- Small `n` makes the ellipsoid more like a rhombus, bigger values make it more like a rounded rectangle.
-- When `n` is omitted, `length` is used in place of `length_n`.
-- NOTE: The distance field is inexact and will be distorted.
-- Reference: https://www.iquilezles.org/www/articles/ellipsoids/ellipsoids.htm
function sdf2.ellipsoid(p: vec2, r: vec2, n: facultative(float)): float <inline>
  ## if not n.type.is_niltype then
    return length_n(p, n)*(1.0 - 1.0/length_n(p/r, n))
  ## else
    return length(p)*(1.0 - 1.0/length(p/r))
  ## end
end

-- Vesica (intersection of two horizontal circles of same radius)
-- Origin is at the geometric center.
-- Parameter `r` is the circle radius.
-- Parameter `d` is the distance between the two vesica's circles,
-- negative for "()" shape, positive for "OO" shapes, must be in [-r,r] range.
-- Reference: https://www.shadertoy.com/view/XtVfRW
function sdf2.vesica(p: vec2, r: float, d: float): float <inline>
  local q: vec2 = abs(p)
  local b: float = sqrt(r*r-d*d)
  if (q.y-b)*d > q.x*b then
    return length(vec2{q.x,q.y-b})*sign(d)
  end
  return length(vec2{q.x+d,q.y})-r
end

-- Arc
-- Origin is at the arc center.
-- Parameter `sc` is the `sin` and `cos` of the arc fill half angle
-- (a scalar angle is converted with `fastsincos`).
-- Parameter `r` is the arc circle radius.
-- Parameter `w` is subtracted from the distance to the arc line.
-- Reference: https://www.shadertoy.com/view/wl23RK
function sdf2.arc(p: vec2, sc: auto, r: float, w: float): float <inline>
  ## if sc.type.is_scalar then
  local sc: vec2 = fastsincos(sc)
  ## end
  local q: vec2 = vec2{abs(p.x), p.y} -- symmetry
  if sc.y*q.x > sc.x*q.y then
    return length(q - r*sc) - w
  end
  return abs(length(q) - r) - w
end


-- Horseshoe
-- Origin is at the arc center.
-- Parameter `sc` is the `sin` and `cos` of the arc fill half angle
-- (a scalar angle is converted with `fastsincos`).
-- Parameter `r` is the arc circle radius.
-- Parameter `w` is the arc line width plus the size of the arc tails.
-- Reference: https://www.shadertoy.com/view/wl23RK
function sdf2.horseshoe(p: vec2, sc: auto, r: float, w: vec2): float <inline>
  ## if sc.type.is_scalar then
  local sc: vec2 = fastsincos(sc)
  ## end
  -- fold around the Y axis and negate Y
  p = vec2{abs(p.x), -p.y}
  local l: float = length(p)
  -- linear transform of `p` whose coefficients are the components of `sc`
  p = vec2{sc.y*p.x + sc.x*p.y, sc.x*p.x - sc.y*p.y}
  p = vec2{p.y > 0.0 and p.x or l*sign(sc.y), p.x > 0.0 and p.y or l}
  p = vec2{p.x - w.x, abs(p.y-r) - w.y}
  -- same final expression as `sdf2.box`, applied to the transformed point
  return length(max(p, 0.0)) + min(0.0, max(p.x, p.y))
end

-- Egg
-- Origin is at the base circle center.
-- Parameter `ra` is the egg base circle radius.
-- Parameter `rb` is the egg radius, must be in range [0,ra].
-- Small `rb` values make the shape more eggy, bigger values make it more like a circle.
-- Reference: https://www.shadertoy.com/view/Wdjfz3
function sdf2.egg(p: vec2, ra: float, rb: float): float <inline>
  local k <comptime> = #[math.sqrt(3.0)]#
  local q: vec2 = vec2{abs(p.x), p.y}
  local r: float = ra - rb
  if q.y < 0.0 then
    return length(q) - r - rb
  end
  if k*(q.x+r) < q.y then
    return length(vec2{q.x, q.y-k*r}) - rb
  end
  return length(vec2{q.x+r, q.y}) - 2.0*r - rb
end

-- Pie
-- Origin is at the pie circle center.
-- Parameter `sc` is the `sin` and `cos` of the pie fill angle
-- (a scalar angle is converted with `fastsincos`).
-- Parameter `r` is the pie circle radius.
-- Reference: https://www.shadertoy.com/view/3l23RK
function sdf2.pie(p: vec2, sc: auto, r: float): float <inline>
  ## if sc.type.is_scalar then
  local sc: vec2 = fastsincos(sc)
  ## end
  local q: vec2 = vec2{abs(p.x), p.y}
  local circle_dist: float = length(q) - r
  local edge_dist: float = length(q - sc*clamp(dot(q,sc), 0.0, r))
  local side = sign(sc.y*q.x-sc.x*q.y)
  return max(circle_dist, edge_dist*side)
end

-- Trapezoid
-- Origin is at the geometric center.
-- `br` is the bottom edge half width.
-- `tr` is the top edge half width.
-- `hr` is the half height.
-- Reference: https://www.shadertoy.com/view/MlycD3
function sdf2.trapezoid(p: vec2, br: float, tr: float, hr: float): float <inline>
  local k1: vec2 = vec2{tr,hr}
  local k2: vec2 = vec2{tr-br,2.0*hr}
  local q: vec2 = vec2{abs(p.x), p.y}
  -- half width of the horizontal edge on the side of `q`
  local edge_r: float
  if q.y < 0.0 then
    edge_r = br
  else
    edge_r = tr
  end
  local ca: vec2 = vec2{max(0.0, q.x-edge_r), abs(q.y)-hr}
  local cb: vec2 = q - k1 + k2*clamp(dot(k1-q,k2)/dot(k2,k2), 0.0, 1.0)
  local s: float = 1.0
  if cb.x < 0.0 and ca.y < 0.0 then
    s = -1.0
  end
  return s*sqrt(min(dot(ca,ca), dot(cb,cb)))
end

-- Uneven Capsule
-- Origin is at the bottom capsule circle center.
-- `br` is the bottom capsule circle radius.
-- `tr` is the top capsule circle radius.
-- `h` is the capsule height.
-- Reference: https://www.shadertoy.com/view/4lcBWn
function sdf2.uneven_capsule(p: vec2, br: float, tr: float, h: float): float <inline>
  local q: vec2 = vec2{abs(p.x), p.y}
  local b: float = (br-tr)/h
  local a: float = sqrt(1.0-b*b)
  local k: float = dot(q,vec2{-b,a})
  if k < 0.0 then
    return length(q) - br
  end
  if k > a*h then
    return length(q-vec2{0.0,h}) - tr
  end
  return dot(q, vec2{a,b}) - br
end

-- Regular Pentagon
-- Origin is at the geometric center.
-- `r` is the pentagon height from bottom to center.
-- Reference: https://www.shadertoy.com/view/llVyWW
function sdf2.pentagon(p: vec2, r: float): float <inline>
  local COS36 <comptime> = #[math.cos(math.pi/5)]#
  local SIN36 <comptime> = #[math.sin(math.pi/5)]#
  local TAN36 <comptime> = #[math.tan(math.pi/5)]#
  -- negate Y and fold around the Y axis
  local q: vec2 = vec2{abs(p.x), -p.y}
  q = q - min(dot(vec2{-COS36,SIN36},q),0.0)*vec2{-2*COS36,2*SIN36}
  q = q - min(dot(vec2{ COS36,SIN36},q),0.0)*vec2{ 2*COS36,2*SIN36}
  q = q - vec2{clamp(q.x,-r*TAN36,r*TAN36),r}
  return length(q)*sign(q.y)
end

-- Regular Hexagon
-- The origin sits at the geometric center.
-- `r` is the hexagon half height.
-- Reference: https://www.shadertoy.com/view/WtySRc
function sdf2.hexagon(p: vec2, r: float): float <inline>
  -- cos, sin and tan of pi/6, evaluated at compile time
  local KCOS <comptime> = #[math.cos(math.pi/6)]#
  local KSIN <comptime> = #[math.sin(math.pi/6)]#
  local KTAN <comptime> = #[math.tan(math.pi/6)]#
  -- work in the first quadrant only
  p = abs(p)
  -- fold the point across the slanted direction
  local fold: float = min(dot(vec2{-KCOS,KSIN},p),0.0)
  p = p - fold*vec2{-2*KCOS,2*KSIN}
  -- offset to the closest point of the remaining horizontal edge
  local nearest: vec2 = vec2{clamp(p.x, -KTAN*r, KTAN*r), r}
  p = p - nearest
  return length(p)*sign(p.y)
end

-- X
-- The origin sits at the geometric center.
-- `w` is the "x" width.
-- The returned distance is never negative (it is a plain vector length).
-- Reference: https://www.shadertoy.com/view/3dKSDc
function sdf2.x(p: vec2, w: float): float <inline>
  -- the shape is symmetric on both axes
  p = abs(p)
  -- the same offset is subtracted from both components of `p`
  local k: float = min(p.x+p.y, w)*0.5
  return length(p - k)
end

-- Cross
-- The origin sits at the geometric center.
-- `ra` is the cross half width.
-- `rb` is the cross line half width, must be in range [0,ra]
-- Reference: https://www.shadertoy.com/view/XtGfzw
function sdf2.cross(p: vec2, ra: float, rb: float): float <inline>
  p = abs(p)
  -- make sure `p.x` holds the larger component
  if p.y > p.x then
    p = vec2{p.y, p.x}
  end
  local q: vec2 = p - vec2{ra, rb}
  local k: float = max(q.y,q.x)
  -- `w` is `q` itself when `k` is positive, otherwise it is rebuilt from `rb` and `k`
  local w: vec2 = q
  if not (k > 0.0) then
    w = vec2{rb-p.x, -k}
  end
  return sign(k)*length(max(w,0.0))
end

-- Parabola Segment
-- The origin sits at the parabola top center.
-- `r` is the segment half horizontal width.
-- `h` is the parabola height.
-- The returned distance is never negative (it is a plain vector length).
-- Reference: https://www.shadertoy.com/view/3lSczz
function sdf2.parabola_segment(pos: vec2, r: float, h: float): float <inline>
  pos.x = abs(pos.x)
  local ik: float = r*r/h
  -- coefficients of the cubic whose root gives the closest point abscissa
  local p: float = ik*(h+pos.y-ik*0.5)/3.0
  local q: float = ik*ik*pos.x*0.25
  local disc: float = q*q - p*p*p
  local x: float <noinit>
  if disc > 0.0 then
    -- closed form built from two cube roots
    local c: float = sqrt(abs(disc))
    local qmc: float = q-c
    x = fastercbrt(q+c) + fastercbrt(abs(qmc))*sign(qmc)
  else
    -- closed form built from `fasttrisect`
    local ip: float = 1.0/p
    x = 2.0*fasttrisect(q*ip*sqrt(ip))*sqrt(p)
  end
  -- never go past the end of the segment
  x = min(x,r)
  return length(pos-vec2{x,x*x/ik-h})
end

-- Quad
-- Determined by 4 vertices, edges go from `v[i]` to `v[(i+1) mod 4]`.
-- The magnitude is the distance to the closest edge, the sign is taken from
-- the largest per-edge side value.
-- Reference: https://www.shadertoy.com/view/WtVcD1
function sdf2.quad_at(p: vec2, v: [4]vec2): float <inline>
  -- orientation factor applied to every per-edge side test
  local gs: float = cross2(v[0]-v[3],v[1]-v[0])
  local mind2: float <noinit> -- smallest squared distance to an edge
  local side: float <noinit>  -- largest oriented side value
  -- the four edges are unrolled at compile time
  ## for i=0,3 do
  do
    local e: vec2  = v[#[(i+1)%4]#]-v[#[i]#]
    local w: vec2  = p-v[#[i]#]
    local q: vec2  = w-e*clamp(dot(w,e)/dot(e,e),0.0,1.0)
    local d: float = dot(q,q)
    local s: float = gs*cross2(w,e)
    ## if i == 0 then
    -- the first edge initializes both accumulators
    mind2 = d
    side = s
    ## else
    if d < mind2 then mind2 = d end
    if s > side then side = s end
    ## end
  end
  ## end
  return sqrt(mind2)*sign(side)
end

-- Rounded Box
-- Origin is at the geometric center.
-- Parameter `b` is the box half width and half height.
-- Parameter `r` is the roundness for each corner, selected by the quadrant of `p`:
--   `r.x` when `p.x <= 0` and `p.y <= 0`, `r.y` when `p.x <= 0` and `p.y > 0`,
--   `r.z` when `p.x > 0`  and `p.y <= 0`, `r.w` when `p.x > 0`  and `p.y > 0`.
-- Reference: https://www.shadertoy.com/view/4llXD7
function sdf2.rounded_box(p: vec2, b: vec2, r: vec4): float <inline>
  -- pick the pair of radii for the horizontal half where `p` lies
  local s: vec2 = p.x > 0 and vec2{r.z, r.w} or vec2{r.x, r.y}
  -- then pick the radius for the vertical half where `p` lies;
  -- the choice must depend on the position `p.y`, not on the radius value,
  -- otherwise two of the four radii can never be reached
  s.x = p.y > 0 and s.y or s.x
  local q: vec2 = abs(p)-b+s.x
  return min(max(q.x,q.y),0.0) + length(max(q,0.0)) - s.x
end

-- Bezier Segment (quadratic)
-- Distance from `pos` to the quadratic Bezier segment with control points `A`, `B` and `C`.
-- The result is never negative (it is the square root of a squared length).
-- NOTE(review): `kk` divides by `dot(b,b)` where `b = A - 2*B + C`, so it looks
-- like a degenerate curve with `b = 0` is not supported, confirm with callers.
-- Reference: https://www.shadertoy.com/view/MlKcDD
function sdf2.bezier_segment_at(pos: vec2, A: vec2, B: vec2, C: vec2): float <inline>
  local a: vec2 = B - A
  local b: vec2 = A - 2.0*B + C
  local c: vec2 = a * 2.0
  local d: vec2 = A - pos
  -- coefficients of the cubic in the curve parameter, normalized by dot(b,b)
  local kk: float = 1.0/dot(b,b)
  local kx: float = kk * dot(a,b)
  local ky: float = kk * (dot(c,a)+dot(d,b))/3.0
  local kz: float = kk * dot(d,a)
  local kx2: float = kx*kx
  local p: float = ky - kx2
  local p3: float = p*p*p
  local q: float = kx*(2.0*kx2 - 3.0*ky) + kz
  -- the sign of `h` tells how many real roots the cubic has
  local h: float = q*q + 4.0*p3
  if h >= 0.0 then -- 1 root
    h = sqrt(h)
    local x: vec2 = (vec2{h,-h}-q)/2.0
    -- signed cube root of both components
    local uv: vec2 = sign(x)*fastercbrt(abs(x))
    -- curve parameter, limited to the segment
    local t: float = clamp(uv.x+uv.y-kx, 0.0, 1.0)
    return sqrt(dot2(d+(c+b*t)*t))
  else -- 3 roots
    local z: float = sqrt(-p)
    local m: float = trisect(q/(p*z*2.0))
    local n: float = sqrt(1.0-m*m)*#[math.sqrt(3)]#
    -- the third root is not needed
    local t: vec2 = clamp(vec2{m+m,-n-m}*z-kx, 0.0, 1.0)
    -- keep the closest of the two candidate points
    return sqrt(min(dot2(d+(c+b*t.x)*t.x), dot2(d+(c+b*t.y)*t.y)))
  end
end

## local make_Canvas = generalize(function(Color, WIDTH, HEIGHT)
  -- Pixel color type of this canvas specialization.
  local Color = #[Color]#
  -- Canvas record: a fixed size pixel grid plus the 2D transform state used for drawing.
  local CanvasT <aligned(32)> = @record{
    -- pixel storage, indexed as `pixels[y][x]`
    pixels: [#[HEIGHT]#][#[WIDTH]#]Color,
    -- projection matrix and its inverse, both set by `ortho`
    proj: mat3,
    iproj: mat3,
    -- current transform and its inverse, changed by `rotate`, `translate` and `scale`
    mt: mat3,
    imt: mat3,
    -- combined matrices (`proj * mt` and `imt * iproj`), refreshed by `update_matrixes`
    m: mat3,
    im: mat3,
    -- saved values of `mt`, `imt` and `s` for `push_transform`/`pop_transform` (8 slots)
    mts: [8]mat3,
    imts: [8]mat3,
    ss: [8]float,
    -- number of saved entries in the stacks above
    tstack: int,
    -- accumulated scale factor, multiplied by `scale`
    s: float,
    -- line width factor, set by `ortho`
    lw: float,
    -- true when `m` and `im` must be recomputed
    dirty: boolean,
  }
  -- Canvas dimensions in pixels, available at compile time.
  global CanvasT.WIDTH <comptime> = #[WIDTH]#
  global CanvasT.HEIGHT <comptime> = #[HEIGHT]#

  -- Margin used for bounding boxes and line width, larger for canvases at least 512 pixels tall.
  local LINE_MARGIN <comptime> = #[HEIGHT >= 512 and 2 or 1]#

  -- Remember the color type on the record type for use by the preprocessor.
  ## CanvasT.value.color_type = Color

  -- Runs `fn` once for every pixel of the canvas.
  -- `fn` receives a pointer to the pixel, the pixel coordinate, the canvas size
  -- and then the extra arguments passed after `fn`.
  function CanvasT:fragment_draw(fn: auto <comptime>, ...: varargs): void <inline>
    local canvas_size: ivec2 <const> = ivec2{CanvasT.WIDTH, CanvasT.HEIGHT}
    ## parallel_for()
    for y=0,<CanvasT.HEIGHT do
      local line: auto = &self.pixels[y]
      ## simd_for()
      for x=0,<CanvasT.WIDTH do
        fn(&line[x], ivec2{x,y}, canvas_size, ...)
      end
    end
  end

  -- Runs `fn` for every pixel of the rectangle at `coord` with dimensions `size`,
  -- skipping the parts of the rectangle that fall outside of the canvas.
  -- `fn` receives a pointer to the pixel, the pixel coordinate, the canvas size,
  -- the coordinate relative to `coord`, `size` and then the extra arguments.
  function CanvasT:fragment_draw_at(coord: ivec2, size: ivec2, fn: auto <comptime>, ...: varargs): void <inline>
    local canvas_size: ivec2 <const> = ivec2{CanvasT.WIDTH, CanvasT.HEIGHT}
    -- first and one-past-last offsets inside the rectangle that are on the canvas
    local first: ivec2 <const> = max(-coord, 0)
    local last: ivec2 <const> = size - max(coord + size - canvas_size, 0)
    ## parallel_for()
    for oy=first.y,<last.y do
      local y: int = coord.y + oy
      local line: auto = &self.pixels[y]
      ## simd_for()
      for ox=first.x,<last.x do
        local x: int = coord.x + ox
        fn(&line[x], ivec2{x,y}, canvas_size, ivec2{ox,oy}, size, ...)
      end
    end
  end

  -- Fills the whole canvas with `col`.
  -- A zero initialized color is handled by zeroing the pixel memory directly.
  function CanvasT:clear(col: Color): void
    if col == Color{} then
      memory.zero(&self.pixels, CanvasT.WIDTH * CanvasT.HEIGHT * #Color)
      return
    end
    self:fragment_draw(function(frag_col: *Color, frag_coord: ivec2, frag_size: ivec2, fill: Color): void <inline>
      $frag_col = fill
    end, col)
  end

  -- Recomputes the combined matrices `m` and `im` when the canvas is marked dirty.
  local function update_matrixes(self: *CanvasT) <inline>
    if self.dirty then
      self.m = self.proj * self.mt
      self.im = self.imt * self.iproj
      self.dirty = false
    end
  end

  -- Resets the current transform, its inverse and the accumulated scale.
  function CanvasT:reset_transform()
    self.s = 1.0
    self.imt = tomat3(1.0)
    self.mt = tomat3(1.0)
    -- combined matrices must be rebuilt on next use
    self.dirty = true
  end

  -- Saves the current transform, its inverse and the scale on the transform stack.
  -- The stack holds at most 8 entries.
  function CanvasT:push_transform()
    check(self.tstack < 8)
    local slot: int = self.tstack
    self.mts[slot] = self.mt
    self.imts[slot] = self.imt
    self.ss[slot] = self.s
    self.tstack = slot + 1
  end

  -- Restores the transform, its inverse and the scale last saved by `push_transform`.
  function CanvasT:pop_transform()
    check(self.tstack > 0)
    local slot: int = self.tstack - 1
    self.tstack = slot
    self.mt = self.mts[slot]
    self.imt = self.imts[slot]
    self.s = self.ss[slot]
    -- combined matrices must be rebuilt on next use
    self.dirty = true
  end

  -- Sets the projection that maps the rectangle [left,right] x [top,bottom]
  -- to pixel coordinates [0,WIDTH-1] x [0,HEIGHT-1], together with its inverse
  -- and the line width factor.
  function CanvasT:ortho(left: float, right: float, top: float, bottom: float): void
    local W: float <comptime> = CanvasT.WIDTH-1
    local H: float <comptime> = CanvasT.HEIGHT-1
    local height: float = bottom - top
    local width: float = right - left
    -- forward mapping: [l,r]->[0,W], [t,b]->[0,H]
    local sx: float = W/width
    local sy: float = H/height
    self.proj = mat3{{
      {sx,   0.0,  -left*sx},
      {0.0,   sy,   -top*sy},
      {0.0,  0.0,       1.0},
    }}
    -- inverse mapping: [0,0]->[l,t], [W,H]->[r,b]
    local isx: float = width/W
    local isy: float = height/H
    self.iproj = mat3{{
      {isx,  0.0,  left},
      {0.0,  isy,  top},
      {0.0,  0.0,  1.0},
    }}
    -- line width factor derived from the larger inverse scale
    self.lw = LINE_MARGIN * max(isx, isy)
    self.dirty = true
  end

  -- Appends a rotation by `theta` to the current transform and keeps the
  -- inverse transform in sync.
  function CanvasT:rotate(theta: float): void
    -- presumably `x` holds the sine and `y` the cosine, as the name suggests
    local sc: vec2 = sincos(theta)
    local rot: mat3 = {{
      {sc.y, -sc.x, 0.0},
      {sc.x,  sc.y, 0.0},
      {0.0,    0.0, 1.0},
    }}
    -- the transpose is used as the inverse of the rotation
    local irot: mat3 = transpose(rot)
    self.mt = self.mt * rot
    self.imt = irot * self.imt
    self.dirty = true
  end

  -- Appends a translation by `t` to the current transform and keeps the
  -- inverse transform in sync.
  function CanvasT:translate(t: vec2): void
    local move: mat3 = {{
      {1.0, 0.0, t.x},
      {0.0, 1.0, t.y},
      {0.0, 0.0, 1.0},
    }}
    -- the inverse simply translates by the opposite offset
    local imove: mat3 = {{
      {1.0, 0.0,-t.x},
      {0.0, 1.0,-t.y},
      {0.0, 0.0, 1.0},
    }}
    self.mt = self.mt * move
    self.imt = imove * self.imt
    self.dirty = true
  end

  -- Appends an uniform scaling by `s` to the current transform, keeps the
  -- inverse transform in sync and accumulates `s` in the canvas scale factor.
  function CanvasT:scale(s: float): void
    local inv: float = 1/s
    local grow: mat3 = {{
      {  s, 0.0, 0.0},
      {0.0,   s, 0.0},
      {0.0, 0.0, 1.0},
    }}
    local shrink: mat3 = {{
      {inv, 0.0, 0.0},
      {0.0, inv, 0.0},
      {0.0, 0.0, 1.0},
    }}
    self.mt = self.mt * grow
    self.imt = shrink * self.imt
    self.s = s * self.s
    self.dirty = true
  end

  -- Computes the pixel bounding rectangle of a box with half size `r` under the
  -- current transform, enlarged by `LINE_MARGIN`.
  -- Returns the top left pixel coordinate and the rectangle size.
  local function box_aabb(self: *CanvasT, r: vec2): (ivec2, ivec2) <inline>
    update_matrixes(self)
    local lin: mat2 = tomat2(self.m)
    -- transformed extents of two adjacent box corners
    local ext_a: vec2 = abs(lin * r)
    local ext_b: vec2 = abs(lin * vec2{r.x, -r.y})
    local half: vec2 = max(ext_a, ext_b) + LINE_MARGIN
    -- the box center is the translation column of the combined matrix
    local mid: vec2 = vec2{self.m.m[0][2], self.m.m[1][2]}
    local lo: ivec2 = ifloor(mid - half)
    local hi: ivec2 = iceil(mid + half)
    return lo, hi - lo
  end

  -- Computes the pixel bounding rectangle of a circle with radius `r` under the
  -- current transform, enlarged by `LINE_MARGIN`.
  -- Returns the top left pixel coordinate and the rectangle size.
  local function circle_aabb(self: *CanvasT, r: float): (ivec2, ivec2) <inline>
    update_matrixes(self)
    local lin: mat2 = tomat2(self.m)
    -- radius in pixels plus the margin
    local reach: float = length(lin * vec2{r,0.0}) + LINE_MARGIN
    -- the circle center is the translation column of the combined matrix
    local mid: vec2 = vec2{self.m.m[0][2], self.m.m[1][2]}
    local lo: ivec2 = ifloor(mid - reach)
    local hi: ivec2 = iceil(mid + reach)
    return lo, hi - lo
  end

  -- Draws a box with half size `r` and color `col` centered at the origin of
  -- the current transform, blending it over the existing pixels.
  function CanvasT:box(col: Color, r: vec2): void
    local origin: ivec2, extent: ivec2 = box_aabb(self, r)
    self:fragment_draw_at(origin, extent,
      function(frag_col: *Color, frag_coord: ivec2, frag_size: ivec2,
               coord: ivec2, size: ivec2,
               im: mat3, ds: float, lw: float, r: vec2, col: vec4): void <inline>
        -- map the pixel back through the inverse matrix and measure the box distance
        local local_pos: vec2 = im * frag_coord
        local dist: float = sdf2.box(local_pos, r) * ds
        -- coverage derived from the distance and the line width factor
        local alpha: float = linearstep(lw, 0.0, dist)
        $frag_col = blendalpha(col, $frag_col, alpha)
      end,
    self.im, self.s, self.lw, r, tovec4(col))
  end

  -- Draws a box of size `s` and color `col` whose corner is at `p`.
  -- The current transform is left untouched after the call.
  function CanvasT:box_at(col: Color, p: vec2, s: vec2): void
    local half: vec2 = s/2.0
    self:push_transform()
    -- move the origin to the box center
    self:translate(p + half)
    self:box(col, half)
    self:pop_transform()
  end

  -- Runs the fragment function `fn` over the bounding rectangle of a box with
  -- half size `r`, passing it the inverse matrix, the scale, the line width
  -- factor, `r` and then the extra arguments.
  function CanvasT:box_shader(fn: auto <comptime>, r: vec2, ...: varargs): void <inline>
    local origin, extent = box_aabb(self, r)
    self:fragment_draw_at(origin, extent, fn, self.im, self.s, self.lw, r, ...)
  end
## return CanvasT end)

-- Generic canvas type, instantiated as `Canvas(Color, WIDTH, HEIGHT)`.
global Canvas = #[make_Canvas]#

-- Empty record used as namespace for the noise functions.
global noise: type = @record{}

-- ## local FLOAT_HASH = true
## if FLOAT_HASH then

-- Float hashing has many problems and poor quality but can be faster.
-- Reference: https://www.shadertoy.com/view/4djSRW

-- Hashes a scalar, vec2 or vec3 into a float in range [0,1).
global function hash1(v: auto): float <inline,nosideeffect>
  ## if v.type.is_scalar then
    v = fract(v * 0.1031)
    v = v * (v + 33.33)
    return fract((v + v) * v)
  ## else
    -- vector inputs are widened to 3 components before mixing
    ## if v.type.is_vec2 then
    local q: vec3 = v:xyx()
    ## elseif v.type.is_vec3 then
    local q: vec3 = v
    ## end
    q = fract(q * 0.1031)
    q = q + dot(q, q:yzx() + 33.33)
    local sum: float = q.x + q.y
    return fract(sum * q.z)
  ## end
end

-- Hashes a scalar, vec2 or vec3 into a vec2 with components in range [0,1).
global function hash2(v: auto): vec2 <inline,nosideeffect>
  -- every input kind is widened to 3 components before mixing
  ## if v.type.is_scalar then
  local v3: vec3 = tovec3(v)
  ## elseif v.type.is_vec2 then
  local v3: vec3 = v:xyx()
  ## elseif v.type.is_vec3 then
  local v3: vec3 = v
  ## end
  -- wrap into [0,1) before mixing, as `hash1` and `hash3` do;
  -- without it large inputs feed large values into the products below
  v3 = fract(v3 * vec3{0.1031, 0.1030, 0.0973})
  v3 = v3 + dot(v3, v3:yzx() + 33.33)
  return fract((v3:xx() + v3:yz()) * v3:zy())
end

-- Hashes a scalar, vec2 or vec3 into a vec3 with components in range [0,1).
global function hash3(v: auto): vec3 <inline,nosideeffect>
  -- every input kind is widened to 3 components before mixing
  ## if v.type.is_scalar then
  local q: vec3 = tovec3(v)
  ## elseif v.type.is_vec2 then
  local q: vec3 = v:xyx()
  ## elseif v.type.is_vec3 then
  local q: vec3 = v
  ## end
  q = fract(q * vec3{0.1031, 0.1030, 0.0973})
  q = q + dot(q, q:yzx() + 33.33)
  local sum: vec3 = q:xxy() + q:yzz()
  return fract(sum * q:zyx())
end

## else

-- Modified from IQ's Integer Hash III
-- It has a good compromise in terms of speed VS quality.
-- Reference: https://www.shadertoy.com/view/Xt3cDn

-- Mixes an unsigned integer, uvec2 or uvec3 into a single 32-bit hash.
-- Each variant does two rounds of xor/shift followed by a multiplication
-- with the constant 1103515245, then folds the high bits into the low ones.
local function basehash(v: auto): uint <inline,nosideeffect>
  ## if v.type.is_unsigned then
  v = 1103515245_u * (v ~ (v>>1_u))
  local h: uint = 1103515245_u * (v ~ (v>>3_u))
  ## elseif v.type.is_uvec2 then
  -- components are swapped so that each one affects the other
  v = 1103515245_u * (v:yx() ~ (v>>1_u))
  local h: uint = 1103515245_u * (v.x ~ (v.y>>3_u))
  ## elseif v.type.is_uvec3 then
  -- components are rotated so that each one affects another
  v = 1103515245_u * (v:yzx() ~ (v>>1_u))
  local h: uint = 1103515245_u * ((v.x ~ v.z) ~ (v.y>>3_u))
  ## end
  -- fold the upper 16 bits into the lower ones
  return h ~ (h>>16_u)
end

-- Hashes the bit pattern of `v` into a float in range [0,1].
global function hash1(v: auto): float <inline,nosideeffect>
  local bits = floatbits2uint(v)
  local n: uint = basehash(bits)
  -- keep the lower 31 bits and normalize them
  local lo: uint = n & 0x7fffffff_u
  return lo*#[1.0/0x7fffffff]#
end

-- Hashes the bit pattern of `v` into a vec2 with components in range [0,1].
global function hash2(v: auto): vec2 <inline,nosideeffect>
  local bits = floatbits2uint(v)
  local n: uint = basehash(bits)
  -- the second component is derived from the first through a multiplication
  local a: uint = n & 0x7fffffff_u
  local b: uint = (n*48271_u) & 0x7fffffff_u
  return vec2{a, b}*#[1.0/0x7fffffff]#
end

-- Hashes the bit pattern of `v` into a vec3 with components in range [0,1].
global function hash3(v: auto): vec3 <inline,nosideeffect>
  local bits = floatbits2uint(v)
  local n: uint = basehash(bits)
  -- the extra components are derived from the first through multiplications
  local a: uint = n & 0x7fffffff_u
  local b: uint = (n*16807_u) & 0x7fffffff_u
  local c: uint = (n*48271_u) & 0x7fffffff_u
  return vec3{a, b, c}*#[1.0/0x7fffffff]#
end

## end

-- Blends the noise of 2 neighbors, `f` is the position between them.
local function noisemix1(a: float, b: float, f: float): float <inline,nosideeffect>
  -- cubic interpolation weight
  local w: float = f*f*(3.0-2.0*f)
  return fastmix(a, b, w)
end

-- Blends the noise of 4 neighbors, `f` is the position between them.
-- `a`,`b` are blended along x, then `c`,`d` along x, then both results along y.
local function noisemix2(a: float, b: float, c: float, d: float, f: vec2): float <inline,nosideeffect>
  -- cubic interpolation weights
  local w: vec2 = f*f*(3.0-2.0*f)
  local ab: float = fastmix(a,b,w.x)
  local cd: float = fastmix(c,d,w.x)
  return fastmix(ab, cd, w.y)
end

-- Blends the noise of 8 neighbors, `f` is the position between them.
-- Each group of 4 is blended along x and y, then both groups along z.
local function noisemix3(a1: float, b1: float, c1: float, d1: float,
                         a2: float, b2: float, c2: float, d2: float,
                         f: vec3): float <inline,nosideeffect>
  -- cubic interpolation weights
  local w: vec3 = f*f*(3.0-2.0*f)
  local ab1: float = fastmix(a1,b1,w.x)
  local cd1: float = fastmix(c1,d1,w.x)
  local ab2: float = fastmix(a2,b2,w.x)
  local cd2: float = fastmix(c2,d2,w.x)
  local near: float = fastmix(ab1, cd1, w.y)
  local far: float = fastmix(ab2, cd2, w.y)
  return fastmix(near, far, w.z)
end

-- 1D Basic Noise
-- Similar to value noise, but a single hash invocation is enough, thus faster.
-- Return a float in range [0,1].
-- Reference: https://www.shadertoy.com/view/3sd3Rs
function noise.basic1(p: float): float <inline,nosideeffect>
  local cell: float = floor(p)
  local t: float = fract(p)
  -- sign that alternates with the fractional part of `0.5*p`
  local flip: float = sign(fract(0.5*p)-0.5)
  local k: float = hash1(cell)
  local t1: float = t - 1.0
  return 0.5+flip*t*t1*((8.0*k-2.0)*t*t1-0.5)
end

-- White Noise
-- Hashes `p` directly.
-- Return a float in range [0,1].
function noise.white(p: auto): float <inline,nosideeffect>
  local h: float = hash1(p)
  return h
end

-- Pixel Noise
-- Hashes the floor of `p`, so the value is constant inside each unit cell.
-- Return a float in range [0,1].
function noise.pixel(p: auto): float <inline,nosideeffect>
  local cell = floor(p)
  return hash1(cell)
end

--[[
local noise3d: [256][256][256][8]float
for z=0,<256 do
  for y=0,<256 do
    for x=0,<256 do
      local i = vec3{x,y,z}
      local I = ifloor(i + 1.0) & 255
      noise3d[z][y][x][0] = hash1(vec3{i.x,i.y,i.z})
      noise3d[z][y][x][1] = hash1(vec3{I.x,i.y,i.z})
      noise3d[z][y][x][2] = hash1(vec3{i.x,I.y,i.z})
      noise3d[z][y][x][3] = hash1(vec3{I.x,I.y,i.z})
      noise3d[z][y][x][4] = hash1(vec3{i.x,i.y,I.z})
      noise3d[z][y][x][5] = hash1(vec3{I.x,i.y,I.z})
      noise3d[z][y][x][6] = hash1(vec3{i.x,I.y,I.z})
      noise3d[z][y][x][7] = hash1(vec3{I.x,I.y,I.z})
    end
  end
end

-- Value Noise (cubic interpolation)
-- Return a float in range [0,1].
-- Reference: https://www.shadertoy.com/view/lsf3WH
function noise.value(p: auto): float <inline,nosideeffect>
  ## if p.type.is_scalar then
  local i, f = floor(p), fract(p)
  local I = floor(i + 1.0)
  local a: float = hash1(i)
  local b: float = hash1(I)
  return noisemix1(a,b,f)
  ## elseif p.type.is_vec2 then
  local i, f = ifloor(p) & 255, fract(p)
  local x: [4]float = $(@*[4]float)(&noise3d[0][i.y][i.x])
  return noisemix2(x[0],x[1],x[2],x[3],f)
  ## elseif p.type.is_vec3 then
  local i = ifloor(p) & 255
  local x: [8]float = noise3d[i.z][i.y][i.x]
  return noisemix3(x[0],x[1],x[2],x[3],x[4],x[5],x[6],x[7],fract(p))
  ## end
end
]]

-- Value Noise (cubic interpolation)
-- Hashes the corners of the cell containing `p` and blends them.
-- Accepts a scalar, vec2 or vec3.
-- Return a float in range [0,1].
-- Reference: https://www.shadertoy.com/view/lsf3WH
function noise.value(p: auto): float <inline,nosideeffect>
  -- lower corner, position inside the cell and upper corner
  local lo, t = floor(p), fract(p)
  local hi = floor(lo + 1.0)
  ## if p.type.is_scalar then
  local v0: float = hash1(lo)
  local v1: float = hash1(hi)
  return noisemix1(v0,v1,t)
  ## elseif p.type.is_vec2 then
  local v00: float = hash1(lo)
  local v10: float = hash1(vec2{hi.x,lo.y})
  local v01: float = hash1(vec2{lo.x,hi.y})
  local v11: float = hash1(hi)
  return noisemix2(v00,v10,v01,v11,t)
  ## elseif p.type.is_vec3 then
  local v000: float = hash1(lo)
  local v100: float = hash1(vec3{hi.x,lo.y,lo.z})
  local v010: float = hash1(vec3{lo.x,hi.y,lo.z})
  local v110: float = hash1(vec3{hi.x,hi.y,lo.z})
  local v001: float = hash1(vec3{lo.x,lo.y,hi.z})
  local v101: float = hash1(vec3{hi.x,lo.y,hi.z})
  local v011: float = hash1(vec3{lo.x,hi.y,hi.z})
  local v111: float = hash1(hi)
  return noisemix3(v000,v100,v010,v110,v001,v101,v011,v111,t)
  ## end
end

-- Gradient Noise (cubic interpolation)
-- Hashes a gradient at every corner of the cell containing `p`, projects the
-- offset to each corner on its gradient and blends the results.
-- Accepts a scalar, vec2 or vec3.
-- Return a float in range [0,1].
-- Reference: https://www.shadertoy.com/view/XdXGW8
-- Reference: https://www.iquilezles.org/www/articles/gradientnoise/gradientnoise.htm
function noise.gradient(p: auto): float <inline,nosideeffect>
  -- lower corner and offset from it
  local lo, t = floor(p), fract(p)
  -- upper corner and offset from it
  local hi, T = floor(lo + 1.0), t - 1.0
  ## if p.type.is_scalar then
  local g0: float = (-0.5 + hash1(lo)) * t
  local g1: float = (-0.5 + hash1(hi)) * T
  return 0.5+2.0*noisemix1(g0,g1,t)
  ## elseif p.type.is_vec2 then
  local g00: float = dot(-0.5 + hash2(lo), t)
  local g10: float = dot(-0.5 + hash2(vec2{hi.x,lo.y}), vec2{T.x,t.y})
  local g01: float = dot(-0.5 + hash2(vec2{lo.x,hi.y}), vec2{t.x,T.y})
  local g11: float = dot(-0.5 + hash2(hi), T)
  return 0.5+noisemix2(g00,g10,g01,g11,t)
  ## elseif p.type.is_vec3 then
  local g000: float = dot(-0.5 + hash3(lo), t)
  local g100: float = dot(-0.5 + hash3(vec3{hi.x,lo.y,lo.z}), vec3{T.x,t.y,t.z})
  local g010: float = dot(-0.5 + hash3(vec3{lo.x,hi.y,lo.z}), vec3{t.x,T.y,t.z})
  local g110: float = dot(-0.5 + hash3(vec3{hi.x,hi.y,lo.z}), vec3{T.x,T.y,t.z})
  local g001: float = dot(-0.5 + hash3(vec3{lo.x,lo.y,hi.z}), vec3{t.x,t.y,T.z})
  local g101: float = dot(-0.5 + hash3(vec3{hi.x,lo.y,hi.z}), vec3{T.x,t.y,T.z})
  local g011: float = dot(-0.5 + hash3(vec3{lo.x,hi.y,hi.z}), vec3{t.x,T.y,T.z})
  local g111: float = dot(-0.5 + hash3(hi), T)
  return 0.5+noisemix3(g000,g100,g010,g110,g001,g101,g011,g111,t)
  ## end
end

-- Wave Noise
-- Blends sine waves whose direction depends on the corners of the cell containing `p`.
-- Accepts a scalar, vec2 or vec3.
-- `k` multiplies `p` to form the wave phase, `PI` is used when it is omitted.
-- Reference: https://www.shadertoy.com/view/tldSRj
function noise.wave(p: auto, k: facultative(float)): float <inline,nosideeffect>
  -- the phase uses `p` before the offset below is applied
  ## if k.type.is_niltype then
  local phase = PI*p
  ## else
  local phase = k*p
  ## end
  p = p + 33.33 -- offset to improve edges randomness
  local lo, t = floor(p), fract(p)
  local hi = floor(lo + 1.0)
  ## if p.type.is_scalar then
  local function wavedir(v: float): float <inline,nosideeffect>
    return qsin(v*13.0)
  end
  local w0: float = qsin(phase * wavedir(lo))
  local w1: float = qsin(phase * wavedir(hi))
  return 0.5+0.5*noisemix1(w0,w1,t)
  ## elseif p.type.is_vec2 then
  local function wavedir(v: vec2): vec2 <inline,nosideeffect>
    return qsin(v*vec2{13.0*v.y,17.0*v.x})
  end
  local w00: float = qsin(dot(phase, wavedir(lo)))
  local w10: float = qsin(dot(phase, wavedir(vec2{hi.x,lo.y})))
  local w01: float = qsin(dot(phase, wavedir(vec2{lo.x,hi.y})))
  local w11: float = qsin(dot(phase, wavedir(hi)))
  return 0.5+0.5*noisemix2(w00,w10,w01,w11,t)
  ## elseif p.type.is_vec3 then
  local function wavedir(v: vec3): vec3 <inline,nosideeffect>
    return qsin(v*vec3{13.0*v.z,17.0*v.x,19.0*v.y})
  end
  local w000: float = qsin(dot(phase, wavedir(lo)))
  local w100: float = qsin(dot(phase, wavedir(vec3{hi.x,lo.y,lo.z})))
  local w010: float = qsin(dot(phase, wavedir(vec3{lo.x,hi.y,lo.z})))
  local w110: float = qsin(dot(phase, wavedir(vec3{hi.x,hi.y,lo.z})))
  local w001: float = qsin(dot(phase, wavedir(vec3{lo.x,lo.y,hi.z})))
  local w101: float = qsin(dot(phase, wavedir(vec3{hi.x,lo.y,hi.z})))
  local w011: float = qsin(dot(phase, wavedir(vec3{lo.x,hi.y,hi.z})))
  local w111: float = qsin(dot(phase, wavedir(hi)))
  return 0.5+0.5*noisemix3(w000,w100,w010,w110,w001,w101,w011,w111,t)
  ## end
end

-- Simplex Noise
-- Accepts a vec2 or vec3, other types produce an empty function body.
-- Return a float in range [0,1].
-- NOTE(review): `hash2`/`hash3` return components in [0,1], so the corner
-- gradients used below are not centered around zero, unlike `noise.gradient`
-- which subtracts 0.5; presumably the reference uses centered gradients,
-- verify whether this is intended.
-- Reference: https://www.shadertoy.com/view/Msf3WH
function noise.simplex(p: auto): float <inline,nosideeffect>
  ## if p.type.is_vec2 then
  -- skew and unskew factors
  local K1: float <comptime> = #[(math.sqrt(3)-1)/2]#
  local K2: float <comptime> = #[(3-math.sqrt(3))/6]#
  -- base corner of the skewed cell and offset from it
  local i: vec2 = floor(p + (p.x+p.y)*K1)
  local a: vec2 = p - i + (i.x+i.y)*K2
  -- `o` selects the middle corner: (1,0) when `a.x >= a.y`, (0,1) otherwise
  local m: float = step(a.y,a.x)
  local o: vec2 = vec2{m,1.0-m}
  -- offsets from the middle and the last corner
  local b: vec2 = a - o + K2
  local c: vec2 = a - (1.0 - 2.0*K2)
  -- per corner falloff, zero when the corner is too far away
  local h: vec3 = max(0.5-vec3{dot2(a), dot2(b), dot2(c)}, 0.0)
  -- per corner contribution: falloff^4 times the projection on the corner hash
  local n: vec3 = pow2(pow2(h)) * vec3{
    dot(a,hash2(i)),
    dot(b,hash2(floor(i+o))),
    dot(c,hash2(floor(i+1.0)))
  }
  return 0.5+0.5*dot(n, tovec3(70.0))
  ## elseif p.type.is_vec3 then
  -- skew and unskew factors
  local K1: float <comptime> = #[1/3]#
  local K2: float <comptime> = #[1/6]#
  -- base corner of the skewed cell and offset from it
  local i: vec3 = floor(p + (p.x+p.y+p.z)*K1)
  local a: vec3 = p - i + (i.x+i.y+i.z)*K2
  -- `i1` and `i2` select the two middle corners from the ordering of `a` components
  local m: vec3 = step(tovec3(0.0), a - a:yzx())
  local i1: vec3 = m * (1.0 - m:zxy())
  local i2: vec3 = 1.0 - m:zxy() * (1.0 - m)
  -- offsets from the middle corners and the last corner
  local b: vec3 = a - (i1 - K2)
  local c: vec3 = a - (i2 - 2.0*K2)
  local d: vec3 = a - (1.0 - 3.0*K2)
  -- per corner falloff, zero when the corner is too far away
  local h: vec4 = max(0.6-vec4{dot2(a), dot2(b), dot2(c), dot2(d)}, 0.0)
  -- per corner contribution: falloff^4 times the projection on the corner hash
  local n: vec4 = pow2(pow2(h)) * vec4{
    dot(a,hash3(i)),
    dot(b,hash3(floor(i+i1))),
    dot(c,hash3(floor(i+i2))),
    dot(d,hash3(floor(i+1.0)))
  }
  return 0.5+0.5*dot(n, tovec4(31.316))
  ## end
end

-- 2D Voronoi
-- Distance from `p` to the closest feature point among the 3x3 neighboring cells.
-- When `t` is given, every feature point is moved with `qsin` using `t` as phase.
-- Return a float in range [0,sqrt(8)] (the squared distance starts at 8).
-- Reference: https://www.iquilezles.org/www/articles/smoothvoronoi/smoothvoronoi.htm
function noise.voronoi2(p: vec2, t: facultative(float)): float <inline,nosideeffect>
  local cell: vec2, f: vec2 = floor(p), fract(p)
  -- smallest squared distance found so far
  local best: float = 8.0
  -- the 3x3 neighborhood is unrolled at compile time
  ## for j=-1,1 do for i=-1,1 do
  do
    local b: vec2 = vec2{#[i]#, #[j]#}
    local o: vec2 = hash2(floor(cell + b))
    ## if not t.type.is_niltype then
    o = 0.5 + 0.5*qsin(t + 2.0*PI*o)
    ## end
    -- offset from `p` to the feature point of this neighbor
    local r: vec2 = b + o - f
    best = min(best, dot2(r))
  end
  ## end end
  return sqrt(best)
end

-- 2D Smooth Voronoi
-- Like `noise.voronoi2`, but the minimum is replaced by a smooth accumulation
-- over the 3x3 neighboring cells.
-- When `t` is given, every feature point is moved with `qsin` using `t` as phase.
-- Return a float in range [0,1].
-- Reference: https://www.iquilezles.org/www/articles/smoothvoronoi/smoothvoronoi.htm
function noise.smoothvoronoi2(p: vec2, t: facultative(float)): float <inline,nosideeffect>
  local cell: vec2, f: vec2 = floor(p), fract(p)
  -- sum of the inverse 8th powers of the squared distances
  local acc: float = 0.0
  -- the 3x3 neighborhood is unrolled at compile time
  ## for j=-1,1 do for i=-1,1 do
  do
    local b: vec2 = vec2{#[i]#, #[j]#}
    local o: vec2 = hash2(floor(cell + b))
    ## if not t.type.is_niltype then
    o = 0.5 + 0.5*qsin(t + 2.0*PI*o)
    ## end
    -- offset from `p` to the feature point of this neighbor
    local r: vec2 = b + o - f
    local d: float = dot2(r)
    acc = acc + 1.0/pow2(pow2(pow2(d)))
  end
  ## end end
  -- 16th root through four nested square roots
  return clamp(1.0/sqrt(sqrt(sqrt(sqrt(acc)))), 0.0, 1.0)
end

-- 2D Voronoi (line distance)
-- Distance from `p` to the closest border between Voronoi cells.
-- When `t` is given, every feature point is moved with `qsin` using `t` as phase.
-- Return a float in range [0,8].
-- Reference: https://www.iquilezles.org/www/articles/voronoilines/voronoilines.htm
-- Reference: https://www.shadertoy.com/view/ldl3W8
function noise.voronoilines2(p: vec2, t: facultative(float)): float <inline,nosideeffect>
  local q: vec2, f: vec2 = floor(p), fract(p)
  local b: vec2 <noinit>
  local o: vec2 <noinit>
  local r: vec2 <noinit>
  local d: float <noinit>
  local s: float <noinit>
  local res: float = 8.0
  local mb: vec2 -- cell offset of the closest feature point
  local mr: vec2 -- offset from `p` to the closest feature point
  -- first pass: find the closest feature point in the 3x3 neighborhood
  ## for j=-1,1 do for i=-1,1 do
    b = vec2{#[i]#, #[j]#}
    o = hash2(floor(q + b))
    ## if not t.type.is_niltype then
    o = 0.5 + 0.5*qsin(t + 2.0*PI*o)
    ## end
    r = b + o - f
    d = dot2(r)
    -- `s` is 1 when this neighbor is at least as close as the best so far
    s = step(d, res)
    mr = mix(mr, r, s)
    mb = mix(mb, b, s)
    res = min(d, res)
  ## end end
  -- second pass: distance to the bisectors between the closest feature point
  -- and the feature points of the 5x5 neighborhood around its cell
  res = 8.0
  ## for j=-2,2 do for i=-2,2 do
    -- offset (0,0) is the closest cell itself, there `r - mr` is (nearly) zero
    -- and cannot be normalized, so it is skipped at compile time
    ## if i ~= 0 or j ~= 0 then
    b = mb + vec2{#[i]#, #[j]#}
    o = hash2(floor(q + b))
    ## if not t.type.is_niltype then
    o = 0.5 + 0.5*qsin(t + 2.0*PI*o)
    ## end
    r = b + o - f
    d = dot(0.5*(mr+r), normalize(r-mr))
    res = min(res, d)
    ## end
  ## end end
  return res
end

-- 2D Voronoise
-- u=0, v=0 => Cell Noise
-- u=0, v=1 => Gradient Noise
-- u=1, v=0 => Voronoi
-- u=1, v=1 => Voronoise
-- Both `u` and `v` default to 1 when omitted.
-- Return a float in range [0,1].
-- Reference: https://www.iquilezles.org/www/articles/voronoise/voronoise.htm
-- Reference: https://www.shadertoy.com/view/Xd23Dh
function noise.voronoise2(p: vec2, u: facultative(float), v: facultative(float)): float <inline,nosideeffect>
  ## if u.type.is_niltype then
  local u: float <comptime> = 1.0
  ## end
  -- `k` is the exponent applied to the per cell weight
  ## if v.type.is_niltype then
  local v: float <comptime> = 1.0
  local k: float <comptime> = 1.0
  ## else
  local k: float = 1.0+63.0*fastpow(1.0-v,6.0)
  ## end
  local cell: vec2, f: vec2 = floor(p), fract(p)
  -- `x` accumulates the weighted values, `y` the weights
  local acc: vec2
  -- the 5x5 neighborhood is unrolled at compile time
  ## for y=-2,2 do for x=-2,2 do
  do
    local g: vec2 = vec2{#[x]#, #[y]#}
    -- `xy` is the feature point jitter scaled by `u`, `z` is the cell value
    local o: vec3 = hash3(floor(cell + g))*vec3{u,u,1.0}
    local d: vec2 = g - f + o:xy()
    local w: float = fastpow(1.0 - linearstep(0.0,#[math.sqrt(2)]#,length(d)), k)
    acc = acc + vec2{o.z*w, w}
  end
  ## end end
  return clamp(acc.x/acc.y, 0.0, 1.0)
end

-- Fractional Brownian Motion (combine noises)
-- Sums `octaves` evaluations of `noisefn`, each one remapped from [0,1] to [-1,1],
-- multiplying the amplitude by `G` and scaling `p` by 2 between octaves.
-- G is usually in range [0.5,1.0].
-- G=1.4142 for Brown noise, G=1.0 for Pink noise, G=0.5 for Yellow Noise.
-- In graphics usually Yellow noise is used.
-- Octaves is usually in range [1,8].
-- Return a float in range [-a,a], where a >= 1.0.
-- Reference: https://www.iquilezles.org/www/articles/fbm/fbm.htm
-- Reference: https://www.shadertoy.com/view/3sd3Rs
function noise.fbm(p: auto,
                   G: float,
                   octaves: int <comptime>,
                   noisefn: auto <comptime>): float <inline,nosideeffect>
  local a: float = 1.0 -- amplitude of the current octave
  local t: float = 0.0 -- accumulated result
  ## if p.type.is_vec2 then
  -- rotation by pi/6 combined with a scale by 2, computed at compile time
  ## local c,s = 2*math.cos(math.pi/6), 2*math.sin(math.pi/6)
  local A: mat2 <const> = {{
    {#[c]#,-#[s]#},
    {#[s]#,#[c]#}
  }}
  ## end
  -- the octaves loop is unrolled at compile time
  ## local N = octaves.value-1
  ## for i=0,N do
    t = t + a * (-1.0 + 2.0*noisefn(p))
    -- the last octave does not need to prepare the next one
    ## if i ~= N then
      a = a * G
      -- each fbm octave must scale by 2
      ## if p.type.is_scalar then
        p = 2 * p + 0.131 -- offset a little to remove bias
      ## elseif p.type.is_vec2 then
        p = A * p -- rotate a little to remove bias
      ## else
        p = 2 * p
      ## end
    ## end
  ## end
  return t
end

##[[
-- Make the single header PNG writer available to the C compiler.
-- The header is downloaded only when no non-empty copy exists in the current
-- directory, so that compiling does not hit the network every time.
do
  local header_size = 0
  local header = io.open('stb_image_write.h', 'rb')
  if header then
    header_size = header:seek('end') or 0
    header:close()
  end
  if header_size == 0 then
    os.execute('wget -O stb_image_write.h https://raw.githubusercontent.com/nothings/stb/master/stb_image_write.h')
  end
end
-- compile the implementation as static functions inside the generated C file
cdefine 'STB_IMAGE_WRITE_STATIC'
cdefine 'STB_IMAGE_WRITE_IMPLEMENTATION'
cincdir '.'
cinclude 'stb_image_write.h'
]]
-- Binding to the C function `stbi_write_png` declared by `stb_image_write.h`.
-- `w` and `h` are the image dimensions, `comp` is the number of components per
-- pixel, `data` points to the first pixel and `stride_in_bytes` is the size of a row.
global function stbi_write_png(filename: cstring, w: cint, h: cint, comp: cint, data: pointer, stride_in_bytes: cint): cint <cimport,nodecl> end

-- Preprocessor macro that emits the code to render and save an image:
-- it declares a `width` x `height` canvas of `bvec4` pixels, runs `main_image`
-- over every pixel with `0.0` as extra argument and writes the pixels to
-- "output.png" with 4 components per pixel.
-- `main_image` is not defined here, it must exist where the macro is used.
## function DRAW(width, height)
local canvas: Canvas(bvec4, #[width]#, #[height]#)
canvas:fragment_draw(main_image, 0.0)
stbi_write_png("output.png", #[width]#, #[height]#, 4, &canvas.pixels[0][0], #[width]#*4)
## end

-- Returns `exp(k*x-k)`, which evaluates to 1 at `x = 1`.
global function exps(x: float, k: float): float
  local e: float = k*x-k
  return exp(e)
end

-- Short type aliases.
global f32: type = @float
global v3: type = @vec3
global v2: type = @vec2