Arch D. Robison ArchRobison

## gist:9145036
julia> function foo( a::NTuple{4,Float32}, b::NTuple{4,Float32} )
           (a[1]+b[1],a[2]+b[2],a[3]+b[3],a[4]+b[4])
       end
foo (generic function with 1 method)

julia> t = NTuple{4,Float32}
(Float32,Float32,Float32,Float32)

julia> code_llvm(foo,(t,t))

## gist:9231536
# saxpy: update `y` in place so that y[i] = y[i] + a*x[i] for each i in 1:length(x).
#
# Arguments:
#   a - scalar multiplier applied to every element of `x`
#   x - indexable collection, read at indices 1:length(x)
#   y - indexable, mutable collection; needs at least length(x) elements
#
# Throws DimensionMismatch when `y` is shorter than `x`. The loop body runs
# under @inbounds, so without this check a short `y` would be read and written
# past its end.
function saxpy( a, x, y )
    length(y) >= length(x) || throw(DimensionMismatch(
        "saxpy: y has length $(length(y)) but x has length $(length(x))"))
    @simd for i=1:length(x)
        @inbounds y[i] = y[i]+a*x[i]
    end
end

function flog( n, reps, tolerance )
    x = rand(Float32,n)
    y = rand(Float32,n)
    z = copy(y)

## gist:9231763
function sweep( irange, jrange, U, Vx, Vy, A, B )
    for j in jrange
        @simd for i in irange
            @inbounds begin
                u = U[i,j]
                Vx[i,j] += (A[i,j+1]+A[i,j])*(U[i,j+1]-u)
                Vy[i,j] += (A[i+1,j]+A[i,j])*(U[i+1,j]-u)
                U [i,j] = u + B[i,j]*((Vx[i,j]-Vx[i,j-1]) + (Vy[i,j]-Vy[i-1,j]))
            end
        end

## gist:9232051
# inner: return the sum of x[i]*y[i] over i in 1:length(x).
#
# Arguments:
#   x - indexable collection, read at indices 1:length(x)
#   y - indexable collection; needs at least length(x) elements
#
# The accumulator starts at zero(eltype(x)), which is also the result for an
# empty `x`.
#
# Throws DimensionMismatch when `y` is shorter than `x`. Indexing happens
# under @inbounds, so without this check a short `y` would be read past its end.
function inner( x, y )
    length(y) >= length(x) || throw(DimensionMismatch(
        "inner: y has length $(length(y)) but x has length $(length(x))"))
    s = zero(eltype(x))
    @simd for i=1:length(x)
        @inbounds s += x[i]*y[i]
    end
    s
end

function flog( n, reps, tolerance )
    x = rand(Float32,n)

## gist:9793916
$ cat foo.jl
# Return the element-wise sum of two 4-element Float32 tuples as a new tuple.
function add( a::NTuple{4,Float32}, b::NTuple{4,Float32} )
    a1, a2, a3, a4 = a
    b1, b2, b3, b4 = b
    return (a1+b1, a2+b2, a3+b3, a4+b4)
end

# Return the element-wise product of two 4-element Float32 tuples as a new tuple.
function mul( a::NTuple{4,Float32}, b::NTuple{4,Float32} )
    return map(*, a, b)
end

function madd( a::NTuple{4,Float32}, b::NTuple{4,Float32}, c::NTuple{4,Float32} )

## gist:9808976
$ cat vadd.jl
# Return the element-wise sum of two 4-element Float64 tuples as a new tuple.
function vadd( a::NTuple{4,Float64}, b::NTuple{4,Float64} )
    a1, a2, a3, a4 = a
    b1, b2, b3, b4 = b
    return (a1+b1, a2+b2, a3+b3, a4+b4)
end

function vadd_one!(arr::Array{Float64, 1})
    len  = length(arr) # assuming len multiple of 4
    one =  (1.0, 1.0, 1.0, 1.0)
    @inbounds for i = 1:4:len
        inp = (arr[i], arr[i+1], arr[i+2], arr[i+3])

## gist:9809496
$ cat v.jl
# Add 1 to every element of `arr`, in place.
# The loop covers 1:length(arr), is annotated with @simd, and indexes `arr`
# under @inbounds.
function vadd_one!(arr::Array{Float32, 1})
    n = length(arr)
    @simd for k in 1:n
        @inbounds arr[k] = arr[k] + 1
    end
end

code_llvm(vadd_one!,(Array{Float32, 1},))
$ julia v.jl

## gist:9944313
L.preheader:                                      ; preds = %top
  %.op = add i64 %0, 1, !dbg !9
  %8 = select i1 %2, i64 %.op, i64 1, !dbg !9
  %9 = sub i64 %8, %5, !dbg !9
  %xtraiter = and i64 %9, 7
  switch i64 %xtraiter, label %L.unr [
    i64 0, label %L.preheader.split
    i64 1, label %L.unr22
    i64 2, label %L.unr18
    i64 3, label %L.unr14

## gist:a7c4cf396c5332a4ddd2
--- a/src/init.c
+++ b/src/init.c
@@ -64,6 +64,9 @@ extern BOOL (WINAPI *hSymRefreshModuleList)(HANDLE);
 #include <sched.h>   // for setting CPU affinity
 #endif

+#undef SIGSTKSZ
+#define SIGSTKSZ (64*1024)
+
 char *julia_home = NULL;

## gist:477eb33a6bac0aa17218
::::::::::::::
test1.jl
::::::::::::::
# Example from early discussion.
# Has constant lower bound, expression as upper bound, and no reductions.
#
# Updates `y` in place: y[i] = y[i] + a*x[i] for each i in 1:length(x).
# NOTE(review): `y` is indexed under @inbounds while only length(x) bounds the
# loop, so callers must pass a `y` with at least length(x) elements.
function test1( a, x, y )
    @simd for i=1:length(x)
        @inbounds y[i] = y[i]+a*x[i]
    end
end
	julia> function foo( a::NTuple{4,Float32}, b::NTuple{4,Float32} )
	(a[1]+b[1],a[2]+b[2],a[3]+b[3],a[4]+b[4])
	end
	foo (generic function with 1 method)

	julia> t = NTuple{4,Float32}
	(Float32,Float32,Float32,Float32)

	julia> code_llvm(foo,(t,t))
	# saxpy: update `y` in place so that y[i] = y[i] + a*x[i] for each i in 1:length(x).
	#
	# Arguments:
	#   a - scalar multiplier applied to every element of `x`
	#   x - indexable collection, read at indices 1:length(x)
	#   y - indexable, mutable collection; needs at least length(x) elements
	#
	# Throws DimensionMismatch when `y` is shorter than `x`. The loop body runs
	# under @inbounds, so without this check a short `y` would be read and
	# written past its end.
	function saxpy( a, x, y )
	    length(y) >= length(x) || throw(DimensionMismatch(
	        "saxpy: y has length $(length(y)) but x has length $(length(x))"))
	    @simd for i=1:length(x)
	        @inbounds y[i] = y[i]+a*x[i]
	    end
	end

	function flog( n, reps, tolerance )
	x = rand(Float32,n)
	y = rand(Float32,n)
	z = copy(y)
	function sweep( irange, jrange, U, Vx, Vy, A, B )
	for j in jrange
	@simd for i in irange
	@inbounds begin
	u = U[i,j]
	Vx[i,j] += (A[i,j+1]+A[i,j])*(U[i,j+1]-u)
	Vy[i,j] += (A[i+1,j]+A[i,j])*(U[i+1,j]-u)
	U [i,j] = u + B[i,j]*((Vx[i,j]-Vx[i,j-1]) + (Vy[i,j]-Vy[i-1,j]))
	end
	end
	# inner: return the sum of x[i]*y[i] over i in 1:length(x).
	#
	# Arguments:
	#   x - indexable collection, read at indices 1:length(x)
	#   y - indexable collection; needs at least length(x) elements
	#
	# The accumulator starts at zero(eltype(x)), which is also the result for
	# an empty `x`.
	#
	# Throws DimensionMismatch when `y` is shorter than `x`. Indexing happens
	# under @inbounds, so without this check a short `y` would be read past
	# its end.
	function inner( x, y )
	    length(y) >= length(x) || throw(DimensionMismatch(
	        "inner: y has length $(length(y)) but x has length $(length(x))"))
	    s = zero(eltype(x))
	    @simd for i=1:length(x)
	        @inbounds s += x[i]*y[i]
	    end
	    s
	end

	function flog( n, reps, tolerance )
	x = rand(Float32,n)
	$ cat foo.jl
	# Return the element-wise sum of two 4-element Float32 tuples as a new tuple.
	function add( a::NTuple{4,Float32}, b::NTuple{4,Float32} )
	    a1, a2, a3, a4 = a
	    b1, b2, b3, b4 = b
	    return (a1+b1, a2+b2, a3+b3, a4+b4)
	end

	# Return the element-wise product of two 4-element Float32 tuples as a new tuple.
	# The multiplications are written with an explicit `*` rather than relying on
	# juxtaposition of the indexed terms.
	function mul( a::NTuple{4,Float32}, b::NTuple{4,Float32} )
	    (a[1]*b[1],a[2]*b[2],a[3]*b[3],a[4]*b[4])
	end

	function madd( a::NTuple{4,Float32}, b::NTuple{4,Float32}, c::NTuple{4,Float32} )
	$ cat vadd.jl
	# Return the element-wise sum of two 4-element Float64 tuples as a new tuple.
	function vadd( a::NTuple{4,Float64}, b::NTuple{4,Float64} )
	    a1, a2, a3, a4 = a
	    b1, b2, b3, b4 = b
	    return (a1+b1, a2+b2, a3+b3, a4+b4)
	end

	function vadd_one!(arr::Array{Float64, 1})
	len = length(arr) # assuming len multiple of 4
	one = (1.0, 1.0, 1.0, 1.0)
	@inbounds for i = 1:4:len
	inp = (arr[i], arr[i+1], arr[i+2], arr[i+3])
	$ cat v.jl
	# Add 1 to every element of `arr`, in place.
	# The loop covers 1:length(arr), is annotated with @simd, and indexes `arr`
	# under @inbounds.
	function vadd_one!(arr::Array{Float32, 1})
	    n = length(arr)
	    @simd for k in 1:n
	        @inbounds arr[k] = arr[k] + 1
	    end
	end

	code_llvm(vadd_one!,(Array{Float32, 1},))
	$ julia v.jl
	L.preheader: ; preds = %top
	%.op = add i64 %0, 1, !dbg !9
	%8 = select i1 %2, i64 %.op, i64 1, !dbg !9
	%9 = sub i64 %8, %5, !dbg !9
	%xtraiter = and i64 %9, 7
	switch i64 %xtraiter, label %L.unr [
	i64 0, label %L.preheader.split
	i64 1, label %L.unr22
	i64 2, label %L.unr18
	i64 3, label %L.unr14
	--- a/src/init.c
	+++ b/src/init.c
	@@ -64,6 +64,9 @@ extern BOOL (WINAPI *hSymRefreshModuleList)(HANDLE);
	#include <sched.h> // for setting CPU affinity
	#endif

	+#undef SIGSTKSZ
	+#define SIGSTKSZ (64*1024)
	+
	char *julia_home = NULL;
	::::::::::::::
	test1.jl
	::::::::::::::
	# Example from early discussion.
	# Has constant lower bound, expression as upper bound, and no reductions.
	#
	# Updates `y` in place: y[i] = y[i] + a*x[i] for each i in 1:length(x).
	# NOTE(review): `y` is indexed under @inbounds while only length(x) bounds the
	# loop, so callers must pass a `y` with at least length(x) elements.
	function test1( a, x, y )
	@simd for i=1:length(x)
	@inbounds y[i] = y[i]+a*x[i]
	end
	end