rrnewton/cuda_backend_bug

## cuda_backend_bug
{-# LANGUAGE ScopedTypeVariables #-}
{-# LANGUAGE TypeOperators #-}
{-# LANGUAGE FlexibleContexts #-}
{-# LANGUAGE FlexibleInstances #-}
{-# LANGUAGE CPP #-}

-- Naive translation of shen_anova1_batch.m into Accelerate.

module Main (main) where

import Data.Array.Accelerate as A
import Data.Array.Accelerate.CUDA as R
import Prelude as P
import Data.List hiding (group)
import Data.List.Split (splitEvery)
-- import Data.Set as S

import Control.Monad (when, mapM_, forM_)

import Data.Array.Unboxed       (IArray, UArray, elems, listArray)
import System.Random

type Matrix a = Array DIM2 a
type EI = Exp Int    -- Shorthand for the lazy.
runE x = R.run (unit x)  -- Run for scalars

dbg = False -- TOGGLE me.

--------------------------------------------------------------------------------

-- Test harness -- set up and run the test.
test_anova1 :: Int -> Int -> IO ()
test_anova1 nsbj width = do
  let
      nsnp = 1
      nsbj' = constant nsbj
      area = constant (width*width)
      -- imdata = rand( width*width, nsbj )
      imdata :: Acc (Matrix Float) = generate (index2 area nsbj') inorder
      inorder x = let (r, c) = unlift (unindex2 x) in
                  A.fromIntegral (r + c * area) -- Count by ones in row major..

      -- Ammendment -- for now fixing nsnp == 1 and just doing a vector rather than a matrix for snpdata:
      snpdata :: Acc (Vector Int)
      snpdata = generate (index1 nsbj') ((`mod` 3) . unindex1) -- Return 0,1, or 2

  putStrLn "=== Accelerate naive anova1 batch ==="

  putStrLn$ "Image width: "++ show width
  putStrLn$ "SNP category membership across patients:"++ pp(R.run snpdata)

  -- tic
  -- forM [0 .. nsnp-1] $ \ i -> do
  anova1_batch imdata snpdata
--      fmap1(:,i) = F;    ?????
  -- toc
  return ()


----------------------------------------------------------------------------------------------------
-- Shen's implementation of anova1 in a batch mode
-- 'voxels' is a matrix
-- 'group' is a column vector
-- 'voxel' columns have the same group memebership
anova1_batch :: Acc (Matrix Float) -> Acc (Vector Int) -> IO ()
-- RRN: I believe group should represent a set of individual's single
-- nucleotide values at a specific location (a specific SNP).
-- 'voxel's represents a /flat/ row of voxels for each individual in
-- the group.  Here we analyze how variance at each voxel correlates
-- with the different categories of individuals created by 'group'.
--
-- Specifically, in this case there are three /subgroups/ within
-- 'group', corresponding to snp=0, snp=1, snp=2.
anova1_batch voxels group = do
  let d = shape voxels
      -- n = total number of voxels in each image
      -- r = number of parallel anova runs == number of individuals in group
      (n::EI, r::EI) = unlift$ unindex2 d

      -- FIXME:
      -- uniqueSnps = unique (R.run group)   -- 'g': All the unique SNP settings.
      -- numUniqueSNPs = size g              -- 'p': How many unique values.
      uniqueSNPs = [0,1,2]  -- TEMP, FIXME -- hardcoding
      numUniqueSNPs = 3     -- TEMP, FIXME -- hardcoding
      -- NOTE: We could try to keep everything within Accelerate, but
      -- for the "outermost" loop here, over different SNP values, we
      -- allow ourselves the rconvenience of using lists, and keeping
      -- things in regular Haskell.

      -- First, how much variance is there across ALL voxels (per sample):
      var_v = variance1of2 voxels :: Acc (Vector Float)
      ss_T = A.map (df_T *) var_v;
      -- Next, we compute per-group variance.
      ------------------------------------------------------------
      -- Group is a mapping between index->groupID, and we effectively want to invert it
      -- to get groupID->indices.  To accomplish that we first explicitly attach indices:
      zippedGroup = A.zip (iota r) group

      -- For each observed SNP value {0,1,2} ...
      idxs = P.map (\ snpVal ->
                    let
                        -- Here we want the index of *all* of the voxels belonging to
                        -- individuals with a particular snpVal.
                        idx = A.map A.fst $ filterAcc isMatch zippedGroup
                        isMatch pr = let (_::EI,thisSnp::EI) = unlift pr in
                                     thisSnp ==* constant snpVal
                    in idx
                   ) uniqueSNPs
      -- Now we're ready to extract the set of voxels that belong to this group of patients:
      groupedVoxels :: [Acc (Matrix Float)]
      -- rowGroups = selectRows idx voxels
      groupedVoxels = P.map (`selectRows` voxels) idxs
      -- Whew, now we have finally achieved unflat = X(idx,:) in the original code....

      -- Next we look at variance within each voxel for individuals in THIS snp category:
      -- The result is a per-voxel variance:
      -- variances = A.map (* (A.fromIntegral$ newCols - 1)) (variance2of2 unflat)
      variances :: [Acc (Vector Float)] = P.map variance2of2 groupedVoxels

      -- What is the size of each group, minus 1:
      lens :: [Exp Float] = P.map ((\x -> A.fromIntegral (x - 1)) . unindex1 . shape) idxs

      -- Do a sum reduction, the result is an array of per-voxel statistics:
      ss_W = foldl1 (.+) (P.zipWith (liftScalar (.*)) lens variances)
      ------------------------------------------------------------

      df_B :: Exp Float = A.fromIntegral$ numUniqueSNPs - 1
      df_W :: Exp Float = A.fromIntegral$ n - numUniqueSNPs
      df_T :: Exp Float = A.fromIntegral$ n - 1

      -- Well, this is quite verbose due to there being no scalar/array overloading:
      ms_W = liftScalarR (./) ss_W df_W
      ss_B = ss_T .- ss_W;
      ms_B = liftScalarR (./) ss_B df_B
      f_statistic = ms_B ./ ms_W

  putStrLn$ "Finally, here's the F statistic:  " ++ pp(run ss_W)
  return ()


-- TEMP -- hardcoding this to a specific dimensionality until I fix the general version below
-- This sums along the RIGHT of two dimensions (INNERMOST).
-- If interpreted as a (row,column) matrix, this collapses each row.
variance2of2 :: (Elt a, IsFloating a)
         => Acc (Array DIM2 a) -> Acc (Array DIM1 a)
variance2of2 arr =
  A.map (/ (denom-1))
        (A.zipWith (-) sum2
                       (A.zipWith (*) mean sum1))
 where
  mean = A.map (/ denom) sum1
  denom = A.fromIntegral rows
  Z :. (rows::EI) :. (cols::EI) = unlift sh1
  sh1 :: Exp ( Z :. Int :. Int) = shape arr
  -- Sum along LEFT dimension:
  sum1 = fold (+) 0 arr
  sum2 = fold (+) 0 sqrs
  sqrs = A.zipWith (*) arr arr


-- TEMP -- hardcoding this to a specific dimensionality until I fix the general version below
-- This iterates along the LEFT of two dimensions (OUTERMOST).
-- If interpreted as a (row,column) matrix, this collapses each column,
-- which is the same behavior as the var() function in matlab applied to a matrix.
variance1of2 :: (Elt a, IsFloating a)
         => Acc (Array DIM2 a) -> Acc (Array DIM1 a)
variance1of2 arr =
  A.map (/ (denom-1))
        (A.zipWith (-) sum2
                       (A.zipWith (*) mean sum1))
 where
  mean = A.map (/ denom) sum1
  denom = A.fromIntegral rows
  Z :. (rows::EI) :. (cols::EI) = unlift sh1
  sh1 :: Exp ( Z :. Int :. Int) = shape arr
  -- Sum along LEFT dimension:
  swapped = swap2Dims arr
  sum1 = fold (+) 0 swapped
  sum2 = fold (+) 0 sqrs
  sqrs = A.zipWith (*) swapped swapped


-- A swap function fixed to exactly two dimensions.
swap2Dims :: (Elt e, Num e) => Acc (Matrix e) -> Acc (Matrix e)
swap2Dims a =
    backpermute new_extent swap2 a -- Perform a gather.
  where
    new_extent = swap2 (shape a)
    swap2 :: Exp DIM2 -> Exp DIM2
    swap2 ind = let (Z :.i2 :.i1) = unlift ind in
                lift (Z :. (i1::EI) :. (i2::EI))

----------------------------------------------------------------------------------------------------
-- Let's print matrices nicely.

padleft n str | length str >= n = str
padleft n str | otherwise       = P.take (n - length str) (repeat ' ') ++ str

class NiceShow a where
  pp :: a -> String

instance Show a => NiceShow (Array DIM1 a) where
  pp arr =
    capends$ concat$
    intersperse " " $
    P.map (padleft maxpad) ls
   where
         ls   = P.map show $ toList arr
         maxpad = maximum$ P.map length ls

capends x = "| "++x++" |"

-- This could be much more efficient:
instance Show a => NiceShow (Array DIM2 a) where
  pp arr = concat $
           intersperse "\n" $
           P.map (capends .
                  concat .
                  intersperse " " .
                  P.map (padleft maxpad))
            rowls
   where (Z :. rows :. cols) = arrayShape arr
         ls   = P.map show $ toList arr
         maxpad = maximum$ P.map length ls
         rowls = splitEvery cols ls

----------------------------------------------------------------------------------------------------
-- General Utility functions:
----------------------------------------------------------------------------------------------------

-- Project a subset of the columns (second dim) from a two dimensional matrix.
--
--   @selectRows sl xs@ is similar to xs(sl,:) in Matlab.
selectRows :: Elt e => Acc (Vector Int) -> Acc (Array DIM2 e) -> Acc (Array DIM2 e)
selectRows sl xs =
  let Z :. rows         = unlift $ shape sl
      Z :. _ :. cols    = unlift $ shape xs     :: Z :. Exp Int :. Exp Int
  in
  backpermute
    (index2 rows cols)
    (\ix -> let Z :. j :. i = unlift ix in index2 (sl ! index1 j) i)
    xs

-- Filter -- from accelerate-examples
-- ------
filterAcc :: Elt a
          => (Exp a -> Exp Bool)
          -> Acc (Vector a)
          -> Acc (Vector a)
filterAcc p arr
  = let -- arr              = A.use vec
        flags            = A.map (boolToInt . p) arr
        (targetIdx, len) = A.scanl' (+) 0 flags
        arr'             = A.backpermute (index1 $ the len) id arr
    in
    A.permute const arr' (\ix -> flags!ix ==* 0 ? (ignore, index1 $ targetIdx!ix)) arr
    -- FIXME: This is abusing 'permute' in that the first two arguments are
    --        only justified because we know the permutation function will
    --        write to each location in the target exactly once.
    --        Instead, we should have a primitive that directly encodes the
    --        compaction pattern of the permutation function.

-- Create a vector containing integers in the range [0,n)
-- iota :: (Elt n, IsNum n) => Exp Int -> Acc (Vector n)
iota :: Exp Int -> Acc (Vector Int)
iota n = A.generate (index1 n) (A.fromIntegral . unindex1)


-- Perform a scalar/array operation by lifting an array/array binary
-- operator.
liftScalar :: (Shape sh, Elt e)
           => (Acc (Array sh e) -> Acc (Array sh e) -> Acc (Array sh e))
           ->  Exp e            -> Acc (Array sh e) -> Acc (Array sh e)
liftScalar op left right = op left' right
 where
  -- I can't figure out how to use replicate here.  This is a spurious data dependency introduced by map:
  left' = A.map (\_ -> left) right
  --  left' = A.replicate (shape right) (A.unit left)


-- Same but flipped:
liftScalarR :: (Shape sh, Elt e)
           => (Acc (Array sh e) -> Acc (Array sh e) -> Acc (Array sh e))
           ->  Acc (Array sh e) -> Exp e            -> Acc (Array sh e)
liftScalarR op = flip (liftScalar (flip op))

--------------------------------------------------------------------------------

-- Elementwise operation matching matlab:

infixl 7 ./
infixl 7 .*
infixl 6 .+
infixl 6 .-

a ./ b = A.zipWith (/) a b
a .* b = A.zipWith (*) a b
a .+ b = A.zipWith (+) a b
a .- b = A.zipWith (-) a b

--------------------------------------------------------------------------------

small = test_anova1 6 3
big = test_anova1 300 256
main = small

----------------------------------------------------------------------------------------------------
	{-# LANGUAGE ScopedTypeVariables #-}
	{-# LANGUAGE TypeOperators #-}
	{-# LANGUAGE FlexibleContexts #-}
	{-# LANGUAGE FlexibleInstances #-}
	{-# LANGUAGE CPP #-}

	-- Naive translation of shen_anova1_batch.m into Accelerate.

	module Main (main) where

	import Data.Array.Accelerate as A
	import Data.Array.Accelerate.CUDA as R
	import Prelude as P
	import Data.List hiding (group)
	import Data.List.Split (splitEvery)
	-- import Data.Set as S

	import Control.Monad (when, mapM_, forM_)

	import Data.Array.Unboxed (IArray, UArray, elems, listArray)
	import System.Random

	type Matrix a = Array DIM2 a
	type EI = Exp Int -- Shorthand for the lazy.
	runE x = R.run (unit x) -- Run for scalars

	dbg = False -- TOGGLE me.

	--------------------------------------------------------------------------------

	-- Test harness -- set up and run the test.
	test_anova1 :: Int -> Int -> IO ()
	test_anova1 nsbj width = do
	let
	nsnp = 1
	nsbj' = constant nsbj
	area = constant (width*width)
	-- imdata = rand( width*width, nsbj )
	imdata :: Acc (Matrix Float) = generate (index2 area nsbj') inorder
	inorder x = let (r, c) = unlift (unindex2 x) in
	A.fromIntegral (r + c * area) -- Count by ones in row major..

	-- Ammendment -- for now fixing nsnp == 1 and just doing a vector rather than a matrix for snpdata:
	snpdata :: Acc (Vector Int)
	snpdata = generate (index1 nsbj') ((`mod` 3) . unindex1) -- Return 0,1, or 2

	putStrLn "=== Accelerate naive anova1 batch ==="

	putStrLn$ "Image width: "++ show width
	putStrLn$ "SNP category membership across patients:"++ pp(R.run snpdata)

	-- tic
	-- forM [0 .. nsnp-1] $ \ i -> do
	anova1_batch imdata snpdata
	-- fmap1(:,i) = F; ?????
	-- toc
	return ()


	----------------------------------------------------------------------------------------------------
	-- Shen's implementation of anova1 in a batch mode
	-- 'voxels' is a matrix
	-- 'group' is a column vector
	-- 'voxel' columns have the same group memebership
	anova1_batch :: Acc (Matrix Float) -> Acc (Vector Int) -> IO ()
	-- RRN: I believe group should represent a set of individual's single
	-- nucleotide values at a specific location (a specific SNP).
	-- 'voxel's represents a /flat/ row of voxels for each individual in
	-- the group. Here we analyze how variance at each voxel correlates
	-- with the different categories of individuals created by 'group'.
	--
	-- Specifically, in this case there are three /subgroups/ within
	-- 'group', corresponding to snp=0, snp=1, snp=2.
	anova1_batch voxels group = do
	let d = shape voxels
	-- n = total number of voxels in each image
	-- r = number of parallel anova runs == number of individuals in group
	(n::EI, r::EI) = unlift$ unindex2 d

	-- FIXME:
	-- uniqueSnps = unique (R.run group) -- 'g': All the unique SNP settings.
	-- numUniqueSNPs = size g -- 'p': How many unique values.
	uniqueSNPs = [0,1,2] -- TEMP, FIXME -- hardcoding
	numUniqueSNPs = 3 -- TEMP, FIXME -- hardcoding
	-- NOTE: We could try to keep everything within Accelerate, but
	-- for the "outermost" loop here, over different SNP values, we
	-- allow ourselves the rconvenience of using lists, and keeping
	-- things in regular Haskell.

	-- First, how much variance is there across ALL voxels (per sample):
	var_v = variance1of2 voxels :: Acc (Vector Float)
	ss_T = A.map (df_T *) var_v;
	-- Next, we compute per-group variance.
	------------------------------------------------------------
	-- Group is a mapping between index->groupID, and we effectively want to invert it
	-- to get groupID->indices. To accomplish that we first explicitly attach indices:
	zippedGroup = A.zip (iota r) group

	-- For each observed SNP value {0,1,2} ...
	idxs = P.map (\ snpVal ->
	let
	-- Here we want the index of all of the voxels belonging to
	-- individuals with a particular snpVal.
	idx = A.map A.fst $ filterAcc isMatch zippedGroup
	isMatch pr = let (_::EI,thisSnp::EI) = unlift pr in
	thisSnp ==* constant snpVal
	in idx
	) uniqueSNPs
	-- Now we're ready to extract the set of voxels that belong to this group of patients:
	groupedVoxels :: [Acc (Matrix Float)]
	-- rowGroups = selectRows idx voxels
	groupedVoxels = P.map (`selectRows` voxels) idxs
	-- Whew, now we have finally achieved unflat = X(idx,:) in the original code....

	-- Next we look at variance within each voxel for individuals in THIS snp category:
	-- The result is a per-voxel variance:
	-- variances = A.map (* (A.fromIntegral$ newCols - 1)) (variance2of2 unflat)
	variances :: [Acc (Vector Float)] = P.map variance2of2 groupedVoxels

	-- What is the size of each group, minus 1:
	lens :: [Exp Float] = P.map ((\x -> A.fromIntegral (x - 1)) . unindex1 . shape) idxs

	-- Do a sum reduction, the result is an array of per-voxel statistics:
	ss_W = foldl1 (.+) (P.zipWith (liftScalar (.*)) lens variances)
	------------------------------------------------------------

	df_B :: Exp Float = A.fromIntegral$ numUniqueSNPs - 1
	df_W :: Exp Float = A.fromIntegral$ n - numUniqueSNPs
	df_T :: Exp Float = A.fromIntegral$ n - 1

	-- Well, this is quite verbose due to there being no scalar/array overloading:
	ms_W = liftScalarR (./) ss_W df_W
	ss_B = ss_T .- ss_W;
	ms_B = liftScalarR (./) ss_B df_B
	f_statistic = ms_B ./ ms_W

	putStrLn$ "Finally, here's the F statistic: " ++ pp(run ss_W)
	return ()


	-- TEMP -- hardcoding this to a specific dimensionality until I fix the general version below
	-- This sums along the RIGHT of two dimensions (INNERMOST).
	-- If interpreted as a (row,column) matrix, this collapses each row.
	variance2of2 :: (Elt a, IsFloating a)
	=> Acc (Array DIM2 a) -> Acc (Array DIM1 a)
	variance2of2 arr =
	A.map (/ (denom-1))
	(A.zipWith (-) sum2
	(A.zipWith (*) mean sum1))
	where
	mean = A.map (/ denom) sum1
	denom = A.fromIntegral rows
	Z :. (rows::EI) :. (cols::EI) = unlift sh1
	sh1 :: Exp ( Z :. Int :. Int) = shape arr
	-- Sum along LEFT dimension:
	sum1 = fold (+) 0 arr
	sum2 = fold (+) 0 sqrs
	sqrs = A.zipWith (*) arr arr


	-- TEMP -- hardcoding this to a specific dimensionality until I fix the general version below
	-- This iterates along the LEFT of two dimensions (OUTERMOST).
	-- If interpreted as a (row,column) matrix, this collapses each column,
	-- which is the same behavior as the var() function in matlab applied to a matrix.
	variance1of2 :: (Elt a, IsFloating a)
	=> Acc (Array DIM2 a) -> Acc (Array DIM1 a)
	variance1of2 arr =
	A.map (/ (denom-1))
	(A.zipWith (-) sum2
	(A.zipWith (*) mean sum1))
	where
	mean = A.map (/ denom) sum1
	denom = A.fromIntegral rows
	Z :. (rows::EI) :. (cols::EI) = unlift sh1
	sh1 :: Exp ( Z :. Int :. Int) = shape arr
	-- Sum along LEFT dimension:
	swapped = swap2Dims arr
	sum1 = fold (+) 0 swapped
	sum2 = fold (+) 0 sqrs
	sqrs = A.zipWith (*) swapped swapped


	-- A swap function fixed to exactly two dimensions.
	swap2Dims :: (Elt e, Num e) => Acc (Matrix e) -> Acc (Matrix e)
	swap2Dims a =
	backpermute new_extent swap2 a -- Perform a gather.
	where
	new_extent = swap2 (shape a)
	swap2 :: Exp DIM2 -> Exp DIM2
	swap2 ind = let (Z :.i2 :.i1) = unlift ind in
	lift (Z :. (i1::EI) :. (i2::EI))

	----------------------------------------------------------------------------------------------------
	-- Let's print matrices nicely.

	padleft n str \| length str >= n = str
	padleft n str \| otherwise = P.take (n - length str) (repeat ' ') ++ str

	class NiceShow a where
	pp :: a -> String

	instance Show a => NiceShow (Array DIM1 a) where
	pp arr =
	capends$ concat$
	intersperse " " $
	P.map (padleft maxpad) ls
	where
	ls = P.map show $ toList arr
	maxpad = maximum$ P.map length ls

	capends x = "\| "++x++" \|"

	-- This could be much more efficient:
	instance Show a => NiceShow (Array DIM2 a) where
	pp arr = concat $
	intersperse "\n" $
	P.map (capends .
	concat .
	intersperse " " .
	P.map (padleft maxpad))
	rowls
	where (Z :. rows :. cols) = arrayShape arr
	ls = P.map show $ toList arr
	maxpad = maximum$ P.map length ls
	rowls = splitEvery cols ls

	----------------------------------------------------------------------------------------------------
	-- General Utility functions:
	----------------------------------------------------------------------------------------------------

	-- Project a subset of the columns (second dim) from a two dimensional matrix.
	--
	-- @selectRows sl xs@ is similar to xs(sl,:) in Matlab.
	selectRows :: Elt e => Acc (Vector Int) -> Acc (Array DIM2 e) -> Acc (Array DIM2 e)
	selectRows sl xs =
	let Z :. rows = unlift $ shape sl
	Z :. _ :. cols = unlift $ shape xs :: Z :. Exp Int :. Exp Int
	in
	backpermute
	(index2 rows cols)
	(\ix -> let Z :. j :. i = unlift ix in index2 (sl ! index1 j) i)
	xs

	-- Filter -- from accelerate-examples
	-- ------
	filterAcc :: Elt a
	=> (Exp a -> Exp Bool)
	-> Acc (Vector a)
	-> Acc (Vector a)
	filterAcc p arr
	= let -- arr = A.use vec
	flags = A.map (boolToInt . p) arr
	(targetIdx, len) = A.scanl' (+) 0 flags
	arr' = A.backpermute (index1 $ the len) id arr
	in
	A.permute const arr' (\ix -> flags!ix ==* 0 ? (ignore, index1 $ targetIdx!ix)) arr
	-- FIXME: This is abusing 'permute' in that the first two arguments are
	-- only justified because we know the permutation function will
	-- write to each location in the target exactly once.
	-- Instead, we should have a primitive that directly encodes the
	-- compaction pattern of the permutation function.

	-- Create a vector containing integers in the range [0,n)
	-- iota :: (Elt n, IsNum n) => Exp Int -> Acc (Vector n)
	iota :: Exp Int -> Acc (Vector Int)
	iota n = A.generate (index1 n) (A.fromIntegral . unindex1)


	-- Perform a scalar/array operation by lifting an array/array binary
	-- operator.
	liftScalar :: (Shape sh, Elt e)
	=> (Acc (Array sh e) -> Acc (Array sh e) -> Acc (Array sh e))
	-> Exp e -> Acc (Array sh e) -> Acc (Array sh e)
	liftScalar op left right = op left' right
	where
	-- I can't figure out how to use replicate here. This is a spurious data dependency introduced by map:
	left' = A.map (\_ -> left) right
	-- left' = A.replicate (shape right) (A.unit left)


	-- Same but flipped:
	liftScalarR :: (Shape sh, Elt e)
	=> (Acc (Array sh e) -> Acc (Array sh e) -> Acc (Array sh e))
	-> Acc (Array sh e) -> Exp e -> Acc (Array sh e)
	liftScalarR op = flip (liftScalar (flip op))

	--------------------------------------------------------------------------------

	-- Elementwise operation matching matlab:

	infixl 7 ./
	infixl 7 .*
	infixl 6 .+
	infixl 6 .-

	a ./ b = A.zipWith (/) a b
	a .* b = A.zipWith (*) a b
	a .+ b = A.zipWith (+) a b
	a .- b = A.zipWith (-) a b

	--------------------------------------------------------------------------------

	small = test_anova1 6 3
	big = test_anova1 300 256
	main = small

	----------------------------------------------------------------------------------------------------