timholy/fsaiter.jl

## fsaiter.jl
step(i::Int) = 1

type ForwardSubarrayIterator
    sub_min::Vector{Int}
    sub_max::Vector{Int}
    inc::Vector{Int}
    pos::Vector{Int}
    index::Int   # the current linear index
end
function ForwardSubarrayIterator(dims::Dims,ind::RangeIndex...)
    # Compute increments for the "carry" operation
    # This calculates the linear-index difference in going from
    #     [max1,max2,i3,...]
    # to
    #     [min1,min2,i3+1,...]
    inc = Array(Int,length(dims))
    sub_min = Array(Int,length(dims))
    sub_max = Array(Int,length(dims))
    inc[1] = step(ind[1])
    sub_min[1] = min(ind[1])
    sub_max[1] = max(ind[1])
    index_min::Int = sub_min[1]
    index_max::Int = sub_max[1]
    s::Int = 1    # will hold the stride
    for idim = 2:length(dims)
        s *= dims[idim-1]
        inc[idim] = s - index_max + index_min
        sub_min[idim] = min(ind[idim])
        sub_max[idim] = max(ind[idim])
        index_max += (sub_max[idim]-1)*s
        index_min += (sub_min[idim]-1)*s
    end
    ForwardSubarrayIterator(sub_min,sub_max,inc,copy(sub_min),index_min)
end
function next(iter::ForwardSubarrayIterator)
    idim = 1
    iter.pos[1] += 1
    while idim < length(iter.pos) && iter.pos[idim] > iter.sub_max[idim]
        iter.pos[idim] = iter.sub_min[idim]
        idim += 1
        iter.pos[idim] += 1
    end
    iter.index += iter.inc[idim]
end

function myref(A::Array, I::RangeIndex...)
    i = length(I)
    while i > 0 && isa(I[i],Integer); i-=1; end
    d = map(length, I)::Dims
    X = similar(A, d[1:i])

    iter = ForwardSubarrayIterator(size(A),I...)
    X[1] = A[iter.index]
    for i = 2:numel(X)
        X[i] = A[next(iter)]
    end
end

function myassign(A::Array, X::Array, I::RangeIndex...)
    iter = ForwardSubarrayIterator(size(A),I...)
    A[iter.index] = X[1]
    for i = 2:numel(X)
        A[next(iter)] = X[i]
    end
end

## fsaiter2.jl
# The following should go into range.jl
step(i::Int) = 1
first(i::Int) = i
min(r::Range1) = r.start
max(r::Range1) = last(r)
min(r::Range) = r.step > 0 ? r.start : last(r)
max(r::Range) = r.step > 0 ? last(r) : r.start

# From here on is the material for ref/assign with RangeIndexes
type ForwardSubarrayIterator
    index::Int         # the current linear index
    copy_len::Int   # number of elements that can be copied in blocks
    len::Vector{Int}   # length of iterator along each non-copied dimension
    pos::Vector{Int}   # current state of iterator relative to len
    inc::Vector{Int}   # increment within dimension
end
function ForwardSubarrayIterator(dims::Dims,ind::RangeIndex...)
    # Check dimensions
    # Because we can use linear indexing, the sizes don't have to match
    if length(dims) < length(ind)
        error("Dimensionality mismatch")
    end
    if length(ind) < length(dims)
        sz = tuple(dims[1:length(ind)-1]...,prod(dims[length(ind):end]))
    else
        sz = dims
    end
    for idim = 1:length(sz)
        if min(ind[idim]) < 1 || max(ind[idim]) > sz[idim]
            println(ind)
            error(min(ind[idim]),' ',max(ind[idim])," index out of range")
        end
    end
    # Parse ind, looking for contiguous blocks of memory (for which we
    # can use copy_to). Then set up each remaining coordinate's
    # increment behavior.
    iscontiguous = true
    copy_len = 1
    offset_first = 0
    offset_last = 0
    len = Array(Int,0)
    inc = Array(Int,0)
    s = 1
    idim = 1
    while idim <= length(ind)
        if iscontiguous && step(ind[idim]) == 1
            if length(ind[idim]) == sz[idim]
                # We're still accumulating coordinates that form a
                # contiguous block
                copy_len *= sz[idim]
            else
                # The contiguous block is broken, but we can
                # copy along a subset of this dimension
                offset_first = copy_len*(min(ind[idim])-1)
                copy_len *= length(ind[idim])
                offset_last = offset_first
                iscontiguous = false
            end
            s *= sz[idim]
        else
            iscontiguous = false
            if length(ind[idim]) > 1
                push(len,length(ind[idim]))
                push(inc,s*step(ind[idim]) + offset_first - offset_last)
            end
            offset_first += s*(first(ind[idim])-1)
            offset_last = offset_first + s*(length(ind[idim])-1)*step(ind[idim])
            s *= sz[idim]
        end
        idim += 1
    end
    pos = ones(Int,length(len))
    ForwardSubarrayIterator(offset_first+1,copy_len,len,pos,inc)
end
function next(iter::ForwardSubarrayIterator)
    idim::Int = 1
    iter.pos[1] += 1
    while iter.pos[idim] > iter.len[idim] && idim < length(iter.pos)
        iter.pos[idim] = 1
        idim += 1
        iter.pos[idim] += 1
    end
    iter.index += iter.inc[idim]
end


# Assign/ref goes here

function myassign{T}(A::Array{T}, X::Array{T}, I::RangeIndex...)
    iter = ForwardSubarrayIterator(size(A),I...)
    if iter.copy_len > 1
        if isempty(iter.pos)
            #copy_to(A,iter.index,X,1,iter.copy_len)
            ccall(:memcpy, Ptr{Void}, (Ptr{Void}, Ptr{Void}, Uint),
                  pointer(A, iter.index), pointer(X, 1), iter.copy_len*sizeof(T))
        else
            i = 1
            while iter.pos[end] <= iter.len[end]
                #copy_to(A,iter.index,X,i,iter.copy_len)
                ccall(:memcpy, Ptr{Void}, (Ptr{Void}, Ptr{Void}, Uint),
                      pointer(A, iter.index), pointer(X, i), iter.copy_len*sizeof(T))
                i += iter.copy_len
                next(iter)
            end
        end
    else
        arrayset(A,iter.index,X[1])
        for i = 2:numel(X)
            arrayset(A,next(iter),X[i])
        end
    end
end

## fsaiter_cfuncs.c
void fsaiter_assign(double* A,const double *X,int *pos,const int *sub_min,const int *sub_max, const int *inc, int index, int n_dims)
{
  A--;  // convert from 0-offset to 1-offset
  X--;
  int i = 1;
  int idim;
  while (1) {
    A[index] = X[i];
    idim = 0;
    pos[0]++;
    while (pos[idim] > sub_max[idim])
      if (idim < n_dims-1) {
        pos[idim] = sub_min[idim];
        idim++;
        pos[idim]++;
      } else
        return;
    index += inc[idim];
    i++;
  }
}

## test_tarray.jl
function tile1(A::Vector,tilesize::Int)
    ntiles = ifloor(length(A)/tilesize)
    TA = Array(eltype(A),tilesize,ntiles)
    for i1 = 1:ntiles
        indxi1 = (i1-1)*tilesize+1:i1*tilesize
        tmp = A[indxi1]
        TA[:,i1] = tmp
    end
end

function tile2(A::Matrix,tilesize::Dims)
    ntiles = ntuple(2,i->ifloor(size(A,i)/tilesize[i]))
    TA = Array(eltype(A),tilesize...,ntiles...)
    for i2 = 1:ntiles[2]
        indxi2 = (i2-1)*tilesize[2]+1:i2*tilesize[2]
        for i1 = 1:ntiles[1]
            indxi1 = (i1-1)*tilesize[1]+1:i1*tilesize[1]
            tmp = A[indxi1,indxi2]
            TA[:,:,i1,i2] = tmp
        end
    end
end

function tile3(A::Array,tilesize::Dims)
    ntiles = ntuple(3,i->ifloor(size(A,i)/tilesize[i]))
    TA = Array(eltype(A),tilesize...,ntiles...)
    for i3 = 1:ntiles[3]
        indxi3 = (i3-1)*tilesize[3]+1:i3*tilesize[3]
        for i2 = 1:ntiles[2]
            indxi2 = (i2-1)*tilesize[2]+1:i2*tilesize[2]
            for i1 = 1:ntiles[1]
                indxi1 = (i1-1)*tilesize[1]+1:i1*tilesize[1]
                tmp = A[indxi1,indxi2,indxi3]
                TA[:,:,:,i1,i2,i3] = tmp
            end
        end
    end
end

function test1()
    arraysize = 4096*4096
    tilesize = 32*32
    A = randn(arraysize)
    @time TA = tile1(A,tilesize)
    @time TA = tile1(A,tilesize)
    @time TA = tile1(A,tilesize)
    @time TA = tile1(A,tilesize)
end

function test2()
    arraysize = (4096,4096)
    tilesize = (32,32)
    A = randn(arraysize...)
    @time TA = tile2(A,tilesize)
    @time TA = tile2(A,tilesize)
    @time TA = tile2(A,tilesize)
    @time TA = tile2(A,tilesize)
end

function test3()
    arraysize = (256,256,256)
    tilesize = (16,16,4)
    A = randn(arraysize...)
    @time TA = tile3(A,tilesize)
    @time TA = tile3(A,tilesize)
    @time TA = tile3(A,tilesize)
    @time TA = tile3(A,tilesize)
end

## test_tarray.m
function testtarray
    fprintf('One-dimensional tiling:\n')
    test1
    fprintf('Two-dimensional tiling:\n')
    test2
    fprintf('Three-dimensional tiling:\n')
    test3
end

function TA = tile1(A,tilesize)
    ntiles = floor(length(A)/tilesize);
    TA = zeros([tilesize ntiles],class(A));
    for i1 = 1:ntiles
      indxi1 = (i1-1)*tilesize+1:i1*tilesize;
      tmp = A(indxi1);
      TA(:,i1) = tmp;
    end
end

function TA = tile2(A,tilesize)
    ntiles = floor(size(A)./tilesize);
    TA = zeros([tilesize ntiles],class(A));
    for i2 = 1:ntiles(2)
        indxi2 = (i2-1)*tilesize(2)+1:i2*tilesize(2);
        for i1 = 1:ntiles(1)
            indxi1 = (i1-1)*tilesize(1)+1:i1*tilesize(1);
            tmp = A(indxi1,indxi2);
            TA(:,:,i1,i2) = tmp;
        end
    end
end

function TA = tile3(A,tilesize)
    ntiles = floor(size(A)./tilesize);
    TA = zeros([tilesize ntiles],class(A));
    for i3 = 1:ntiles(3)
        indxi3 = (i3-1)*tilesize(3)+1:i3*tilesize(3);
        for i2 = 1:ntiles(2)
            indxi2 = (i2-1)*tilesize(2)+1:i2*tilesize(2);
            for i1 = 1:ntiles(1)
                indxi1 = (i1-1)*tilesize(1)+1:i1*tilesize(1);
                tmp = A(indxi1,indxi2,indxi3);
                TA(:,:,:,i1,i2,i3) = tmp;
            end
        end
    end
end

function test1()
    arraysize = 4096*4096;
    tilesize = 32*32;
    A = randn(arraysize,1);
    tic; TA = tile1(A,tilesize); toc
    tic; TA = tile1(A,tilesize); toc
    tic; TA = tile1(A,tilesize); toc
    tic; TA = tile1(A,tilesize); toc
end

function test2()
    arraysize = [4096,4096];
    tilesize = [32,32];
    A = randn(arraysize);
    tic; TA = tile2(A,tilesize); toc
    tic; TA = tile2(A,tilesize); toc
    tic; TA = tile2(A,tilesize); toc
    tic; TA = tile2(A,tilesize); toc
end

function test3()
    arraysize = [256,256,256];
    tilesize = [16,16,4];
    A = randn(arraysize);
    tic; TA = tile3(A,tilesize); toc
    tic; TA = tile3(A,tilesize); toc
    tic; TA = tile3(A,tilesize); toc
    tic; TA = tile3(A,tilesize); toc
end

## test_tarray_iter.jl
function tile1(A::Vector,tilesize::Int)
    ntiles = ifloor(length(A)/tilesize)
    TA = Array(eltype(A),tilesize,ntiles)
    for i1 = 1:ntiles
        indxi1 = (i1-1)*tilesize+1:i1*tilesize
        tmp = A[indxi1]
#        TA[:,i1] = tmp
        myassign(TA,tmp,1:tilesize,i1)
    end
end

function tile2(A::Matrix,tilesize::Dims)
    ntiles = ntuple(2,i->ifloor(size(A,i)/tilesize[i]))
    s = cumprod([tilesize...])
    TA = Array(eltype(A),tilesize...,ntiles...)
    for i2 = 1:ntiles[2]
        indxi2 = (i2-1)*tilesize[2]+1:i2*tilesize[2]
        for i1 = 1:ntiles[1]
            indxi1 = (i1-1)*tilesize[1]+1:i1*tilesize[1]
            tmp = A[indxi1,indxi2]
#            TA[:,:,i1,i2] = tmp
            myassign(TA,tmp,1:tilesize[1],1:tilesize[2],i1,i2)
        end
    end
end

function tile3(A::Array,tilesize::Dims)
    ntiles = ntuple(3,i->ifloor(size(A,i)/tilesize[i]))
    s = cumprod([tilesize...])
    TA = Array(eltype(A),tilesize...,ntiles...)
    for i3 = 1:ntiles[3]
        indxi3 = (i3-1)*tilesize[3]+1:i3*tilesize[3]
        for i2 = 1:ntiles[2]
            indxi2 = (i2-1)*tilesize[2]+1:i2*tilesize[2]
            for i1 = 1:ntiles[1]
                indxi1 = (i1-1)*tilesize[1]+1:i1*tilesize[1]
                tmp = A[indxi1,indxi2,indxi3]
#                TA[:,:,:,i1,i2,i3] = tmp
                myassign(TA,tmp,1:tilesize[1],1:tilesize[2],1:tilesize[3],i1,i2,i3)
            end
        end
    end
end

function test1()
    arraysize = 4096*4096
    tilesize = 32*32
    A = randn(arraysize)
    @time TA = tile1(A,tilesize)
    @time TA = tile1(A,tilesize)
    @time TA = tile1(A,tilesize)
    @time TA = tile1(A,tilesize)
end

function test2()
    arraysize = (4096,4096)
    tilesize = (32,32)
    A = randn(arraysize...)
    @time TA = tile2(A,tilesize)
    @time TA = tile2(A,tilesize)
    @time TA = tile2(A,tilesize)
    @time TA = tile2(A,tilesize)
end

function test3()
    arraysize = (256,256,256)
    tilesize = (16,16,4)
    A = randn(arraysize...)
    @time TA = tile3(A,tilesize)
    @time TA = tile3(A,tilesize)
    @time TA = tile3(A,tilesize)
    @time TA = tile3(A,tilesize)
end
	step(i::Int) = 1

	type ForwardSubarrayIterator
	sub_min::Vector{Int}
	sub_max::Vector{Int}
	inc::Vector{Int}
	pos::Vector{Int}
	index::Int # the current linear index
	end
	function ForwardSubarrayIterator(dims::Dims,ind::RangeIndex...)
	# Compute increments for the "carry" operation
	# This calculates the linear-index difference in going from
	# [max1,max2,i3,...]
	# to
	# [min1,min2,i3+1,...]
	inc = Array(Int,length(dims))
	sub_min = Array(Int,length(dims))
	sub_max = Array(Int,length(dims))
	inc[1] = step(ind[1])
	sub_min[1] = min(ind[1])
	sub_max[1] = max(ind[1])
	index_min::Int = sub_min[1]
	index_max::Int = sub_max[1]
	s::Int = 1 # will hold the stride
	for idim = 2:length(dims)
	s *= dims[idim-1]
	inc[idim] = s - index_max + index_min
	sub_min[idim] = min(ind[idim])
	sub_max[idim] = max(ind[idim])
	index_max += (sub_max[idim]-1)*s
	index_min += (sub_min[idim]-1)*s
	end
	ForwardSubarrayIterator(sub_min,sub_max,inc,copy(sub_min),index_min)
	end
	function next(iter::ForwardSubarrayIterator)
	idim = 1
	iter.pos[1] += 1
	while idim < length(iter.pos) && iter.pos[idim] > iter.sub_max[idim]
	iter.pos[idim] = iter.sub_min[idim]
	idim += 1
	iter.pos[idim] += 1
	end
	iter.index += iter.inc[idim]
	end

	function myref(A::Array, I::RangeIndex...)
	i = length(I)
	while i > 0 && isa(I[i],Integer); i-=1; end
	d = map(length, I)::Dims
	X = similar(A, d[1:i])

	iter = ForwardSubarrayIterator(size(A),I...)
	X[1] = A[iter.index]
	for i = 2:numel(X)
	X[i] = A[next(iter)]
	end
	end

	function myassign(A::Array, X::Array, I::RangeIndex...)
	iter = ForwardSubarrayIterator(size(A),I...)
	A[iter.index] = X[1]
	for i = 2:numel(X)
	A[next(iter)] = X[i]
	end
	end
	# The following should go into range.jl
	step(i::Int) = 1
	first(i::Int) = i
	min(r::Range1) = r.start
	max(r::Range1) = last(r)
	min(r::Range) = r.step > 0 ? r.start : last(r)
	max(r::Range) = r.step > 0 ? last(r) : r.start

	# From here on is the material for ref/assign with RangeIndexes
	type ForwardSubarrayIterator
	index::Int # the current linear index
	copy_len::Int # number of elements that can be copied in blocks
	len::Vector{Int} # length of iterator along each non-copied dimension
	pos::Vector{Int} # current state of iterator relative to len
	inc::Vector{Int} # increment within dimension
	end
	function ForwardSubarrayIterator(dims::Dims,ind::RangeIndex...)
	# Check dimensions
	# Because we can use linear indexing, the sizes don't have to match
	if length(dims) < length(ind)
	error("Dimensionality mismatch")
	end
	if length(ind) < length(dims)
	sz = tuple(dims[1:length(ind)-1]...,prod(dims[length(ind):end]))
	else
	sz = dims
	end
	for idim = 1:length(sz)
	if min(ind[idim]) < 1 \|\| max(ind[idim]) > sz[idim]
	println(ind)
	error(min(ind[idim]),' ',max(ind[idim])," index out of range")
	end
	end
	# Parse ind, looking for contiguous blocks of memory (for which we
	# can use copy_to). Then set up each remaining coordinate's
	# increment behavior.
	iscontiguous = true
	copy_len = 1
	offset_first = 0
	offset_last = 0
	len = Array(Int,0)
	inc = Array(Int,0)
	s = 1
	idim = 1
	while idim <= length(ind)
	if iscontiguous && step(ind[idim]) == 1
	if length(ind[idim]) == sz[idim]
	# We're still accumulating coordinates that form a
	# contiguous block
	copy_len *= sz[idim]
	else
	# The contiguous block is broken, but we can
	# copy along a subset of this dimension
	offset_first = copy_len*(min(ind[idim])-1)
	copy_len *= length(ind[idim])
	offset_last = offset_first
	iscontiguous = false
	end
	s *= sz[idim]
	else
	iscontiguous = false
	if length(ind[idim]) > 1
	push(len,length(ind[idim]))
	push(inc,s*step(ind[idim]) + offset_first - offset_last)
	end
	offset_first += s*(first(ind[idim])-1)
	offset_last = offset_first + s(length(ind[idim])-1)step(ind[idim])
	s *= sz[idim]
	end
	idim += 1
	end
	pos = ones(Int,length(len))
	ForwardSubarrayIterator(offset_first+1,copy_len,len,pos,inc)
	end
	function next(iter::ForwardSubarrayIterator)
	idim::Int = 1
	iter.pos[1] += 1
	while iter.pos[idim] > iter.len[idim] && idim < length(iter.pos)
	iter.pos[idim] = 1
	idim += 1
	iter.pos[idim] += 1
	end
	iter.index += iter.inc[idim]
	end




	# Assign/ref goes here

	function myassign{T}(A::Array{T}, X::Array{T}, I::RangeIndex...)
	iter = ForwardSubarrayIterator(size(A),I...)
	if iter.copy_len > 1
	if isempty(iter.pos)
	#copy_to(A,iter.index,X,1,iter.copy_len)
	ccall(:memcpy, Ptr{Void}, (Ptr{Void}, Ptr{Void}, Uint),
	pointer(A, iter.index), pointer(X, 1), iter.copy_len*sizeof(T))
	else
	i = 1
	while iter.pos[end] <= iter.len[end]
	#copy_to(A,iter.index,X,i,iter.copy_len)
	ccall(:memcpy, Ptr{Void}, (Ptr{Void}, Ptr{Void}, Uint),
	pointer(A, iter.index), pointer(X, i), iter.copy_len*sizeof(T))
	i += iter.copy_len
	next(iter)
	end
	end
	else
	arrayset(A,iter.index,X[1])
	for i = 2:numel(X)
	arrayset(A,next(iter),X[i])
	end
	end
	end
	void fsaiter_assign(double* A,const double X,int pos,const int sub_min,const int sub_max, const int *inc, int index, int n_dims)
	{
	A--; // convert from 0-offset to 1-offset
	X--;
	int i = 1;
	int idim;
	while (1) {
	A[index] = X[i];
	idim = 0;
	pos[0]++;
	while (pos[idim] > sub_max[idim])
	if (idim < n_dims-1) {
	pos[idim] = sub_min[idim];
	idim++;
	pos[idim]++;
	} else
	return;
	index += inc[idim];
	i++;
	}
	}
	function tile1(A::Vector,tilesize::Int)
	ntiles = ifloor(length(A)/tilesize)
	TA = Array(eltype(A),tilesize,ntiles)
	for i1 = 1:ntiles
	indxi1 = (i1-1)tilesize+1:i1tilesize
	tmp = A[indxi1]
	TA[:,i1] = tmp
	end
	end

	function tile2(A::Matrix,tilesize::Dims)
	ntiles = ntuple(2,i->ifloor(size(A,i)/tilesize[i]))
	TA = Array(eltype(A),tilesize...,ntiles...)
	for i2 = 1:ntiles[2]
	indxi2 = (i2-1)tilesize[2]+1:i2tilesize[2]
	for i1 = 1:ntiles[1]
	indxi1 = (i1-1)tilesize[1]+1:i1tilesize[1]
	tmp = A[indxi1,indxi2]
	TA[:,:,i1,i2] = tmp
	end
	end
	end

	function tile3(A::Array,tilesize::Dims)
	ntiles = ntuple(3,i->ifloor(size(A,i)/tilesize[i]))
	TA = Array(eltype(A),tilesize...,ntiles...)
	for i3 = 1:ntiles[3]
	indxi3 = (i3-1)tilesize[3]+1:i3tilesize[3]
	for i2 = 1:ntiles[2]
	indxi2 = (i2-1)tilesize[2]+1:i2tilesize[2]
	for i1 = 1:ntiles[1]
	indxi1 = (i1-1)tilesize[1]+1:i1tilesize[1]
	tmp = A[indxi1,indxi2,indxi3]
	TA[:,:,:,i1,i2,i3] = tmp
	end
	end
	end
	end

	function test1()
	arraysize = 4096*4096
	tilesize = 32*32
	A = randn(arraysize)
	@time TA = tile1(A,tilesize)
	@time TA = tile1(A,tilesize)
	@time TA = tile1(A,tilesize)
	@time TA = tile1(A,tilesize)
	end

	function test2()
	arraysize = (4096,4096)
	tilesize = (32,32)
	A = randn(arraysize...)
	@time TA = tile2(A,tilesize)
	@time TA = tile2(A,tilesize)
	@time TA = tile2(A,tilesize)
	@time TA = tile2(A,tilesize)
	end

	function test3()
	arraysize = (256,256,256)
	tilesize = (16,16,4)
	A = randn(arraysize...)
	@time TA = tile3(A,tilesize)
	@time TA = tile3(A,tilesize)
	@time TA = tile3(A,tilesize)
	@time TA = tile3(A,tilesize)
	end
	function testtarray
	fprintf('One-dimensional tiling:\n')
	test1
	fprintf('Two-dimensional tiling:\n')
	test2
	fprintf('Three-dimensional tiling:\n')
	test3
	end

	function TA = tile1(A,tilesize)
	ntiles = floor(length(A)/tilesize);
	TA = zeros([tilesize ntiles],class(A));
	for i1 = 1:ntiles
	indxi1 = (i1-1)tilesize+1:i1tilesize;
	tmp = A(indxi1);
	TA(:,i1) = tmp;
	end
	end

	function TA = tile2(A,tilesize)
	ntiles = floor(size(A)./tilesize);
	TA = zeros([tilesize ntiles],class(A));
	for i2 = 1:ntiles(2)
	indxi2 = (i2-1)tilesize(2)+1:i2tilesize(2);
	for i1 = 1:ntiles(1)
	indxi1 = (i1-1)tilesize(1)+1:i1tilesize(1);
	tmp = A(indxi1,indxi2);
	TA(:,:,i1,i2) = tmp;
	end
	end
	end

	function TA = tile3(A,tilesize)
	ntiles = floor(size(A)./tilesize);
	TA = zeros([tilesize ntiles],class(A));
	for i3 = 1:ntiles(3)
	indxi3 = (i3-1)tilesize(3)+1:i3tilesize(3);
	for i2 = 1:ntiles(2)
	indxi2 = (i2-1)tilesize(2)+1:i2tilesize(2);
	for i1 = 1:ntiles(1)
	indxi1 = (i1-1)tilesize(1)+1:i1tilesize(1);
	tmp = A(indxi1,indxi2,indxi3);
	TA(:,:,:,i1,i2,i3) = tmp;
	end
	end
	end
	end

	function test1()
	arraysize = 4096*4096;
	tilesize = 32*32;
	A = randn(arraysize,1);
	tic; TA = tile1(A,tilesize); toc
	tic; TA = tile1(A,tilesize); toc
	tic; TA = tile1(A,tilesize); toc
	tic; TA = tile1(A,tilesize); toc
	end

	function test2()
	arraysize = [4096,4096];
	tilesize = [32,32];
	A = randn(arraysize);
	tic; TA = tile2(A,tilesize); toc
	tic; TA = tile2(A,tilesize); toc
	tic; TA = tile2(A,tilesize); toc
	tic; TA = tile2(A,tilesize); toc
	end

	function test3()
	arraysize = [256,256,256];
	tilesize = [16,16,4];
	A = randn(arraysize);
	tic; TA = tile3(A,tilesize); toc
	tic; TA = tile3(A,tilesize); toc
	tic; TA = tile3(A,tilesize); toc
	tic; TA = tile3(A,tilesize); toc
	end