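------------------------------------------------------------------------
-- Gist by @msakai (last active December 21, 2016).
--
-- Four standalone Haskell programs that train the same two-layer neural
-- network on the handwritten-digit data from the Coursera Machine
-- Learning programming exercise 4 (ex4data1).
--
-- Version 1: the cost function is written once, and its gradient is
-- obtained by automatic differentiation (Numeric.AD); training uses the
-- Hager-Zhang conjugate-gradient method.
------------------------------------------------------------------------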
{-# LANGUAGE FlexibleContexts, ScopedTypeVariables #-}
import Control.Monad
import qualified Data.Vector as V
import Data.Vector.Generic ((!))
import qualified Data.Vector.Generic as VG
import Numeric.AD
import qualified Numeric.Optimization.Algorithms.HagerZhang05.AD as CG
import System.Exit
import qualified System.Random.MWC as Rand
import qualified System.Random.MWC.Distributions as Rand
import Text.CSV
import Text.Printf

h :: forall a v. (RealFloat a, VG.Vector v a) => v a -> v a -> v a
h theta x = a3
  where
    theta1 i = VG.slice ((inputLayerSize+1) * i) (inputLayerSize+1) theta
    theta2 i = VG.slice (theta1Size + (hiddenLayerSize+1) * i) (hiddenLayerSize+1) theta
    a2 :: v a
    a2 = VG.generate hiddenLayerSize $ \i -> sigmoid $
           let theta1_i = theta1 i
           in theta1_i ! 0 + sum [ theta1_i!(j+1) * x!j | j <- [0..inputLayerSize-1] ]
    a3 = VG.generate numLabel $ \i -> sigmoid $
           let theta2_i = theta2 i
           in theta2_i ! 0 + sum [ theta2_i!(j+1) * a2!j | j <- [0..hiddenLayerSize-1] ]

predict :: forall a v. (RealFloat a, VG.Vector v a) => v a -> v a -> Int
predict theta x = VG.maxIndex (h theta x)
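
-- Regularized cross-entropy cost. With one-hot labels y the summand
-- below equals -y_k log h_k(x) - (1-y_k) log (1 - h_k(x)), and the
-- regularization term skips the bias weights (column 0 of each row):
--
--   J(theta) = (1/m) * sum_{samples,k} [ -y_k log h_k(x) - (1-y_k) log (1-h_k(x)) ]
--            + (lambda/(2m)) * sum of squared non-bias weights
--
-- Only the value is defined here; Numeric.AD differentiates it.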
cost :: forall a v. (RealFloat a, VG.Vector v a) => a -> [(v a, Int)] -> v a -> a
cost lambda samples theta
  = (1 / m) * sum [-log (if k == y then o!k else 1 - o!k) | (x,y) <- samples, let o = h theta x, k <- [0 .. numLabel-1]]
  + (lambda / (2*m)) * (sum [x**2 | i <- [0..hiddenLayerSize-1], x <- VG.toList (VG.slice 1 inputLayerSize (theta1 i))] +
                        sum [x**2 | i <- [0..numLabel-1], x <- VG.toList (VG.slice 1 hiddenLayerSize (theta2 i))])
  where
    m = fromIntegral (length samples)
    theta1 i = VG.slice ((inputLayerSize+1) * i) (inputLayerSize+1) theta
    theta2 i = VG.slice (theta1Size + (hiddenLayerSize+1) * i) (hiddenLayerSize+1) theta
sigmoid :: RealFloat a => a -> a
sigmoid x = 1 / (1 + exp (-x))

inputLayerSize, hiddenLayerSize, numLabel :: Int
inputLayerSize = 400
hiddenLayerSize = 25
numLabel = 10

thetaSize, theta1Size, theta2Size :: Int
thetaSize = theta1Size + theta2Size
theta1Size = (inputLayerSize+1) * hiddenLayerSize
theta2Size = (hiddenLayerSize+1) * numLabel

epsilonInit :: Double
epsilonInit = 0.12
{-
Data files are available from the following URLs.
* https://dl.dropboxusercontent.com/u/123796/gist/b553004a7ca76310e11c/data/ex4data1_X.csv.gz
* https://dl.dropboxusercontent.com/u/123796/gist/b553004a7ca76310e11c/data/ex4data1_y.csv
They are converted from ex4data1.mat using Octave as below.
> load('ex4data1.mat');
> csvwrite("ex4data1_X.csv", X);
> csvwrite("ex4data1_y.csv", y);
-}
loadData :: VG.Vector v Double => IO [(v Double, Int)]
loadData = do
  x <- do
    ret <- parseCSVFromFile "ex4data1_X.csv"
    case ret of
      Left err -> print err >> exitFailure
      Right rows -> return [VG.fromList [read x :: Double | x <- row] | row <- rows, row /= [""]]
  y <- do
    ret <- parseCSVFromFile "ex4data1_y.csv"
    case ret of
      Left err -> print err >> exitFailure
      Right rows -> return [if y' == 10 then 0 else y' | [y] <- rows, y /= "", let y' = read y]
  return $ zip x y

train :: Double -> [(V.Vector Double, Int)] -> IO (V.Vector Double)
train lambda samples = do
  let params = CG.defaultParameters
        { CG.printFinal = True
        , CG.printParams = True
        , CG.verbose = CG.VeryVerbose -- CG.Verbose
        , CG.maxItersFac = 50 / fromIntegral thetaSize
        }
      grad_tol = 0.00001
  theta0 <- randTheta
  (theta, _result1, _stat1) <- CG.optimize params grad_tol theta0 (cost (auto lambda) [(fmap auto x, y) | (x,y) <- samples])
  return theta

randTheta :: VG.Vector v Double => IO (v Double)
randTheta = do
  gen <- Rand.createSystemRandom
  VG.replicateM thetaSize (Rand.uniformR (-epsilonInit, epsilonInit) gen)

main :: IO ()
main = do
  let trainSize = 200
  samples <- loadData
  (trainData, testData) <- do
    gen <- Rand.createSystemRandom
    samples2 <- Rand.uniformShuffle (V.fromList samples) gen
    return (take trainSize (V.toList samples2), drop trainSize (V.toList samples2))
  let lambda = 1
  theta <- train lambda trainData
  printf "precision (training data): %f\n" $ sum [1::Double | (x,y) <- trainData, predict theta x == y] / fromIntegral (length trainData)
  printf "precision (test data): %f\n" $ sum [1::Double | (x,y) <- testData, predict theta x == y] / fromIntegral (length testData)
{-# LANGUAGE FlexibleContexts, ScopedTypeVariables #-}
import qualified Data.Vector as V
import qualified Data.Vector.Storable as VS
import qualified Data.Vector.Generic as VG
import Numeric.LinearAlgebra.HMatrix
import qualified Numeric.Optimization.Algorithms.HagerZhang05 as CG
import System.Exit
import qualified System.Random.MWC as Rand
import qualified System.Random.MWC.Distributions as Rand
import Text.CSV
import Text.Printf

h :: Vector Double -> Matrix Double -> Matrix Double
h theta x = a3
  where
    theta1 = reshape (inputLayerSize+1) (VG.slice 0 theta1Size theta)
    theta2 = reshape (hiddenLayerSize+1) (VG.slice theta1Size theta2Size theta)
    a1 = fromBlocks [[1, x]]
    z2 = a1 <> tr theta1
    a2 = fromBlocks [[1, sigmoid z2]]
    z3 = a2 <> tr theta2
    a3 = sigmoid z3

predict :: Vector Double -> Matrix Double -> Vector Int
predict theta x = VG.fromList . map VG.maxIndex . toRows $ h theta x
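
-- Value and gradient of the regularized cross-entropy cost. The gradient
-- is the standard backpropagation recurrence: delta3 = a3 - y2 at the
-- output, pushed back through theta2 and the sigmoid derivative to give
-- delta2, then averaged outer products with the layer activations.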
cost :: Double -> (Matrix Double, Vector Int) -> Vector Double -> (Double, Vector Double)
cost lambda (x,y) theta = (val, thetaGrad)
  where
    m = fromIntegral (VG.length y)
    -- hiddenLayerSize >< 1+inputLayerSize
    theta1 = reshape (inputLayerSize+1) (VG.slice 0 theta1Size theta)
    -- numLabel >< 1+hiddenLayerSize
    theta2 = reshape (hiddenLayerSize+1) (VG.slice theta1Size theta2Size theta)
    a1 = fromBlocks [[1, x]]          -- m >< 1+inputLayerSize
    z2 = a1 <> tr theta1              -- m >< hiddenLayerSize
    a2 = fromBlocks [[1, sigmoid z2]] -- m >< 1+hiddenLayerSize
    z3 = a2 <> tr theta2              -- m >< numLabel
    a3 = sigmoid z3                   -- m >< numLabel
    -- m >< numLabel
    y2 = fromRows [ VG.fromList [if j == yi then 1 else 0 | j <- [0..numLabel-1]] | yi <- VG.toList y ]
    val = (1 / m) * sumElements (- y2 * log(a3) - (1 - y2) * log(1 - a3))
        + (lambda / (2*m)) * (sumElements (dropColumns 1 theta1 ^ (2::Int)) +
                              sumElements (dropColumns 1 theta2 ^ (2::Int)))
    delta3 = a3 - y2                                         -- m >< numLabel
    delta2 = dropColumns 1 (delta3 <> theta2) * sigmoid' z2  -- m >< hiddenLayerSize
    -- hiddenLayerSize >< 1+inputLayerSize
    theta1Grad = scale (1/m) (tr delta2 <> a1) + scale (lambda / m) (fromBlocks [[0, dropColumns 1 theta1]])
    -- numLabel >< 1+hiddenLayerSize
    theta2Grad = scale (1/m) (tr delta3 <> a2) + scale (lambda / m) (fromBlocks [[0, dropColumns 1 theta2]])
    thetaGrad = flatten theta1Grad VG.++ flatten theta2Grad

sigmoid :: Floating a => a -> a
sigmoid x = 1 / (1 + exp (-x))

sigmoid' :: Floating a => a -> a
sigmoid' x = y * (1 - y)
  where y = sigmoid x

inputLayerSize, hiddenLayerSize, numLabel :: Int
inputLayerSize = 400
hiddenLayerSize = 25
numLabel = 10

thetaSize, theta1Size, theta2Size :: Int
thetaSize = theta1Size + theta2Size
theta1Size = (inputLayerSize+1) * hiddenLayerSize
theta2Size = (hiddenLayerSize+1) * numLabel

epsilonInit :: Double
epsilonInit = 0.12
{-
Data files are available from the following URLs.
* https://dl.dropboxusercontent.com/u/123796/gist/b553004a7ca76310e11c/data/ex4data1_X.csv.gz
* https://dl.dropboxusercontent.com/u/123796/gist/b553004a7ca76310e11c/data/ex4data1_y.csv
They are converted from ex4data1.mat using Octave as below.
> load('ex4data1.mat');
> csvwrite("ex4data1_X.csv", X);
> csvwrite("ex4data1_y.csv", y);
-}
loadData :: VG.Vector v Double => IO [(v Double, Int)]
loadData = do
  x <- do
    ret <- parseCSVFromFile "ex4data1_X.csv"
    case ret of
      Left err -> print err >> exitFailure
      Right rows -> return [VG.fromList [read x :: Double | x <- row] | row <- rows, row /= [""]]
  y <- do
    ret <- parseCSVFromFile "ex4data1_y.csv"
    case ret of
      Left err -> print err >> exitFailure
      Right rows -> return [if y' == 10 then 0 else y' | [y] <- rows, y /= "", let y' = read y]
  return $ zip x y

train :: Double -> [(VS.Vector Double, Int)] -> IO (VS.Vector Double)
train lambda samples = do
  let samples2 = (fromRows (map fst samples), VG.fromList (map snd samples))
  let params = CG.defaultParameters
        { CG.printFinal = True
        , CG.printParams = True
        , CG.verbose = CG.VeryVerbose -- CG.Verbose
        , CG.maxItersFac = 50 / fromIntegral thetaSize
        }
      grad_tol = 0.00001
  (theta0 :: VS.Vector Double) <- randTheta
  (theta, _result1, _stat1) <-
    CG.optimize params grad_tol theta0
      (CG.VFunction (fst . cost lambda samples2))
      (CG.VGradient (snd . cost lambda samples2))
      (Just (CG.VCombined (cost lambda samples2)))
  return theta

randTheta :: VG.Vector v Double => IO (v Double)
randTheta = do
  gen <- Rand.createSystemRandom
  VG.replicateM thetaSize (Rand.uniformR (-epsilonInit, epsilonInit) gen)

precision :: VS.Vector Double -> [(VS.Vector Double, Int)] -> Double
precision theta samples = VG.sum (VG.zipWith (\a b -> if a==b then 1 else 0) (predict theta x) y) / m
  where
    x = fromRows (map fst samples)
    y = VG.fromList (map snd samples)
    m = fromIntegral $ length samples

main :: IO ()
main = do
  let trainSize = 200
  samples <- loadData
  (trainData, testData) <- do
    gen <- Rand.createSystemRandom
    samples2 <- Rand.uniformShuffle (V.fromList samples) gen
    return (take trainSize (V.toList samples2), drop trainSize (V.toList samples2))
  let lambda = 1
  theta <- train lambda trainData
  printf "precision (training data): %f\n" (precision theta trainData)
  printf "precision (test data): %f\n" (precision theta testData)
{-# LANGUAGE
DataKinds
, ExistentialQuantification
, FlexibleContexts
, ScopedTypeVariables
, TypeFamilies
, TypeOperators
#-}
{-# OPTIONS_GHC -fplugin GHC.TypeLits.KnownNat.Solver #-}
{-# OPTIONS_GHC -fplugin GHC.TypeLits.Presburger #-}
{-# OPTIONS_GHC -Wall #-}
import Control.Applicative
import Control.Arrow ((***))
import Control.Monad
import Data.Maybe
import Data.Proxy
import qualified Data.Vector as V
import qualified Data.Vector.Storable as VS
import qualified Data.Vector.Generic as VG
import GHC.TypeLits
import qualified Numeric.LinearAlgebra.HMatrix as HM
import Numeric.LinearAlgebra.Static
import qualified Numeric.Optimization.Algorithms.HagerZhang05 as CG
import System.Exit
import qualified System.Random.MWC as Rand
import qualified System.Random.MWC.Distributions as Rand
import Text.CSV
import Text.Printf
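
-- Layer sizes are type-level naturals, so every matrix carries its
-- dimensions in its type. The two GHC plugins above discharge the
-- KnownNat and (1 + n) arithmetic constraints that arise from the
-- bias-column manipulations.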
type InputLayerSize = 400
type HiddenLayerSize = 25
type NumLabel = 10
type Params = (L HiddenLayerSize (1 + InputLayerSize), L NumLabel (1 + HiddenLayerSize))
numLabel :: Int
numLabel = fromIntegral (natVal (Proxy :: Proxy NumLabel))
params1Size, params2Size :: Int
params1Size = case size (fst (undefined :: Params)) of { (m,n) -> m*n }
params2Size = case size (snd (undefined :: Params)) of { (m,n) -> m*n }

h :: forall m. KnownNat m => Params -> L m InputLayerSize -> L m NumLabel
h (theta1,theta2) x = a3
  where
    a1 = col 1 ||| x          :: L m (1+InputLayerSize)
    z2 = a1 <> tr theta1      :: L m HiddenLayerSize
    a2 = col 1 ||| sigmoid z2 :: L m (1+HiddenLayerSize)
    z3 = a2 <> tr theta2      :: L m NumLabel
    a3 = sigmoid z3           :: L m NumLabel

predict :: forall m. KnownNat m => Params -> L m InputLayerSize -> HM.Vector Int
predict theta x = VG.fromList . map (VG.maxIndex . unwrap) . toRows $ h theta x

cost
  :: forall m. KnownNat m
  => ℝ -> (L m InputLayerSize, HM.Vector Int) -> Params -> (ℝ, Params)
cost lambda (x,y) (theta1,theta2) = (val, (theta1Grad, theta2Grad))
  where
    m = fromIntegral $ fst $ size x
    a1 = col 1 ||| x          :: L m (1+InputLayerSize)
    z2 = a1 <> tr theta1      :: L m HiddenLayerSize
    a2 = col 1 ||| sigmoid z2 :: L m (1+HiddenLayerSize)
    z3 = a2 <> tr theta2      :: L m NumLabel
    a3 = sigmoid z3           :: L m NumLabel
    y2 :: L m NumLabel
    Just y2 = fromRows [ fromJust $ create $ VS.generate numLabel (\j -> if j == yi then 1 else 0)
                       | yi <- VG.toList y ]
    val = (1 / m) * sumElements (- y2 * log(a3) - (1 - y2) * log(1 - a3))
        + (lambda / (2*m)) * (sumElements (dropFirstColumn theta1 ^ (2::Int)) +
                              sumElements (dropFirstColumn theta2 ^ (2::Int)))
    delta3 :: L m NumLabel
    delta3 = a3 - y2
    delta2 :: L m HiddenLayerSize
    delta2 = dropFirstColumn (delta3 <> theta2) * sigmoid' z2
    theta1Grad :: L HiddenLayerSize (1+InputLayerSize)
    theta1Grad = scale (1/m) (tr delta2 <> a1) + scale (lambda / m) (col 0 ||| dropFirstColumn theta1)
    theta2Grad :: L NumLabel (1+HiddenLayerSize)
    theta2Grad = scale (1/m) (tr delta3 <> a2) + scale (lambda / m) (col 0 ||| dropFirstColumn theta2)
fromRows :: forall m n. (KnownNat m, KnownNat n) => [R n] -> Maybe (L m n)
fromRows xss = withRows xss exactDims
dropFirstColumn :: forall m n. (KnownNat m, KnownNat n) => L m (1 + n) -> L m n
dropFirstColumn mat = snd (splitCols mat :: (L m 1, L m n))
sumElements :: forall m n. (KnownNat m, KnownNat n) => L m n -> ℝ
sumElements = HM.sumElements . unwrap
scale
  :: forall field vec mat n m. (Num field, KnownNat m, KnownNat n)
  => Domain field vec mat => field -> mat m n -> mat m n
scale a = dmmap (a*)

sigmoid :: Floating a => a -> a
sigmoid x = 1 / (1 + exp (-x))

sigmoid' :: Floating a => a -> a
sigmoid' x = y * (1 - y)
  where y = sigmoid x

epsilonInit :: ℝ
epsilonInit = 0.12
{-
Data files are available from the following URLs.
* https://dl.dropboxusercontent.com/u/123796/gist/b553004a7ca76310e11c/data/ex4data1_X.csv.gz
* https://dl.dropboxusercontent.com/u/123796/gist/b553004a7ca76310e11c/data/ex4data1_y.csv
They are converted from ex4data1.mat using Octave as below.
> load('ex4data1.mat');
> csvwrite("ex4data1_X.csv", X);
> csvwrite("ex4data1_y.csv", y);
-}
loadData :: IO ([R InputLayerSize], [Int])
loadData = do
  x <- do
    ret <- parseCSVFromFile "ex4data1_X.csv"
    case ret of
      Left err -> print err >> exitFailure
      Right rows ->
        return [fromMaybe (error "invalid row size") $ create $ VG.fromList [read x | x <- row] | row <- rows, row /= [""]]
  y <- do
    ret <- parseCSVFromFile "ex4data1_y.csv"
    case ret of
      Left err -> print err >> exitFailure
      Right rows -> return [if y' == 10 then 0 else y' | [y] <- rows, y /= "", let y' = read y]
  return (x, y)

train
  :: forall m. KnownNat m
  => ℝ -> (L m InputLayerSize, HM.Vector Int) -> IO Params
train lambda samples = do
  let params = CG.defaultParameters
        { CG.printFinal = True
        , CG.printParams = True
        , CG.verbose = CG.VeryVerbose -- CG.Verbose
        , CG.maxItersFac = 50 / fromIntegral (params1Size + params2Size)
        }
      grad_tol = 0.00001
  theta0 <- randTheta
  (theta, _result1, _stat1) <-
    CG.optimize params grad_tol (pack theta0)
      (CG.VFunction (fst . cost2))
      (CG.VGradient (snd . cost2))
      (Just (CG.VCombined cost2))
  return $ unpack theta
  where
    cost2 :: HM.Vector ℝ -> (ℝ, HM.Vector ℝ)
    cost2 = (id *** pack) . f . unpack
      where
        f :: Params -> (ℝ, Params)
        f = cost lambda samples
    pack :: Params -> HM.Vector ℝ
    pack (theta1,theta2) = HM.flatten (unwrap theta1) VG.++ HM.flatten (unwrap theta2)
    unpack :: HM.Vector ℝ -> Params
    unpack xs =
      case VG.splitAt params1Size xs of
        (ys,zs) -> (matrix (VG.toList ys), matrix (VG.toList zs))

randTheta :: IO Params
randTheta = do
  gen <- Rand.createSystemRandom
  theta1 <- matrix <$> replicateM params1Size (Rand.uniformR (-epsilonInit, epsilonInit) gen)
  theta2 <- matrix <$> replicateM params2Size (Rand.uniformR (-epsilonInit, epsilonInit) gen)
  return (theta1, theta2)

precision
  :: forall m. KnownNat m
  => Params -> (L m InputLayerSize, HM.Vector Int) -> ℝ
precision theta (x,y) = VG.sum (VG.zipWith (\a b -> if a==b then 1 else 0) (predict theta x) y) / m
  where
    m = fromIntegral $ fst (size x)

main :: IO ()
main = do
  let trainSize = 200
  (x,y) <- loadData
  gen <- Rand.createSystemRandom
  indexes <- Rand.uniformShuffle (VG.enumFromN 0 (length x)) gen
  let x' :: V.Vector (R InputLayerSize)
      x' = VG.backpermute (V.fromList x) (VG.convert indexes)
      y' :: HM.Vector Int
      y' = VG.backpermute (VS.fromList y) indexes
      (trainX, testX) = VG.splitAt trainSize x'
      (trainY, testY) = VG.splitAt trainSize y'
  let lambda = 1
  theta <- withRows (VG.toList trainX) $ \trainX' -> do
    theta <- train lambda (trainX',trainY)
    printf "precision (training data): %f\n" (precision theta (trainX',trainY))
    return theta
  withRows (VG.toList testX) $ \testX' -> do
    printf "precision (test data): %f\n" (precision theta (testX',testY))
{-# LANGUAGE FlexibleContexts, ScopedTypeVariables #-}
import Control.Monad
import Control.Monad.ST
import Data.STRef
import qualified Data.Vector as V
import qualified Data.Vector.Storable as VS
import Data.Vector.Generic ((!))
import qualified Data.Vector.Generic as VG
import qualified Data.Vector.Generic.Mutable as VM
import qualified Numeric.Optimization.Algorithms.HagerZhang05 as CG
import System.Exit
import qualified System.Random.MWC as Rand
import qualified System.Random.MWC.Distributions as Rand
import Text.CSV
import Text.Printf

h :: forall a v. (RealFloat a, VG.Vector v a) => v a -> v a -> v a
h theta x = a3
  where
    theta1 i = VG.slice ((inputLayerSize+1) * i) (inputLayerSize+1) theta
    theta2 i = VG.slice (theta1Size + (hiddenLayerSize+1) * i) (hiddenLayerSize+1) theta
    a2 :: v a
    a2 = VG.generate hiddenLayerSize $ \i -> sigmoid $
           let theta1_i = theta1 i
           in theta1_i ! 0 + sum [ theta1_i!(j+1) * x!j | j <- [0..inputLayerSize-1] ]
    a3 = VG.generate numLabel $ \i -> sigmoid $
           let theta2_i = theta2 i
           in theta2_i ! 0 + sum [ theta2_i!(j+1) * a2!j | j <- [0..hiddenLayerSize-1] ]

predict :: forall a v. (RealFloat a, VG.Vector v a) => v a -> v a -> Int
predict theta x = VG.maxIndex (h theta x)
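
-- Value and gradient again, but here the backpropagation sums are
-- accumulated element by element into a mutable vector inside runST,
-- avoiding intermediate matrices.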
cost :: forall a v. (RealFloat a, VG.Vector v a) => a -> [(v a, Int)] -> v a -> (a, v a)
cost lambda samples theta = runST $ do
  let m = fromIntegral (length samples)
      theta1 i = VG.slice ((inputLayerSize+1) * i) (inputLayerSize+1) theta
      theta2 i = VG.slice (theta1Size + (hiddenLayerSize+1) * i) (hiddenLayerSize+1) theta
  valRef <- newSTRef 0
  thetaGrad <- VM.replicate thetaSize 0
  let theta1Grad = VM.slice 0 theta1Size thetaGrad
      theta2Grad = VM.slice theta1Size theta2Size thetaGrad
  forM_ samples $ \(x,y) -> do
    let z2 :: v a
        z2 = VG.generate hiddenLayerSize $ \i ->
               let theta1_i = theta1 i
               in theta1_i ! 0 + sum [ theta1_i!(j+1) * x!j | j <- [0..inputLayerSize-1] ]
        a2 = VG.map sigmoid z2
        z3 :: v a
        z3 = VG.generate numLabel $ \i ->
               let theta2_i = theta2 i
               in theta2_i ! 0 + sum [ theta2_i!(j+1) * a2!j | j <- [0..hiddenLayerSize-1] ]
        a3 = VG.map sigmoid z3
    forM_ [0 .. numLabel-1] $ \k ->
      modifySTRef' valRef (\x -> x - log (if k == y then a3!k else 1 - a3!k))
    let delta3 = VG.zipWith (-) a3 (VG.generate numLabel $ \i -> if i == y then 1 else 0)
        delta2 = VG.zipWith (*) (VG.generate hiddenLayerSize (\j -> sum [delta3!i * theta2 i ! (1+j) | i <- [0..numLabel-1]])) (VG.map sigmoid' z2)
    forM_ [0..numLabel-1] $ \i -> do
      modify theta2Grad (+ delta3!i) (i*(hiddenLayerSize+1))
      forM_ [0..hiddenLayerSize-1] $ \j -> do
        modify theta2Grad (+ (delta3!i * a2!j)) (i*(hiddenLayerSize+1)+1+j)
    forM_ [0..hiddenLayerSize-1] $ \i -> do
      modify theta1Grad (+ delta2!i) (i*(inputLayerSize+1))
      forM_ [0..inputLayerSize-1] $ \j -> do
        modify theta1Grad (+ (delta2!i * x!j)) (i*(inputLayerSize+1)+1+j)
  modifySTRef' valRef (/ m)
  forM_ [0..thetaSize-1] $ \i -> modify thetaGrad (/ m) i
  -- Regularization
  modifySTRef' valRef $
    (+ (lambda / (2*m)) * (sum [x**2 | i <- [0..hiddenLayerSize-1], x <- VG.toList (VG.slice 1 inputLayerSize (theta1 i))] +
                           sum [x**2 | i <- [0..numLabel-1], x <- VG.toList (VG.slice 1 hiddenLayerSize (theta2 i))]))
  forM_ [0..numLabel-1] $ \i -> do
    forM_ [0..hiddenLayerSize-1] $ \j -> do
      modify theta2Grad (+ (theta2 i ! (1+j) * lambda / m)) (i*(1+hiddenLayerSize)+1+j)
  forM_ [0..hiddenLayerSize-1] $ \i -> do
    forM_ [0..inputLayerSize-1] $ \j -> do
      modify theta1Grad (+ (theta1 i ! (1+j) * lambda / m)) (i*(1+inputLayerSize)+1+j)
  val <- readSTRef valRef
  g <- VG.unsafeFreeze thetaGrad
  return (val, g)

-- modify :: (PrimMonad m, MVector v a) => v (PrimState m) a -> (a -> a) -> Int -> m ()
modify v f i = do
  x <- VM.read v i
  VM.write v i (f x)
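
-- A finite-difference check is the usual way to validate a hand-written
-- gradient like the one in `cost` above. The sketch below is an editorial
-- addition, not part of the original gist: it compares the analytic
-- gradient against central differences at a few coordinates; the
-- discrepancies should be tiny (on the order of eps^2).
checkGradient :: Double -> [(VS.Vector Double, Int)] -> VS.Vector Double -> [Double]
checkGradient lambda samples theta =
    [ abs (g ! i - numericGrad i) | i <- [0, 1, theta1Size, thetaSize-1] ]
  where
    (_, g) = cost lambda samples theta
    eps = 1e-4
    numericGrad i =
      let bump d = VG.imap (\j v -> if j == i then v + d else v) theta
          f t = fst (cost lambda samples t)
      in (f (bump eps) - f (bump (-eps))) / (2 * eps)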

sigmoid :: RealFloat a => a -> a
sigmoid x = 1 / (1 + exp (-x))

sigmoid' :: RealFloat a => a -> a
sigmoid' x = y * (1 - y)
  where y = sigmoid x

inputLayerSize, hiddenLayerSize, numLabel :: Int
inputLayerSize = 400
hiddenLayerSize = 25
numLabel = 10

thetaSize, theta1Size, theta2Size :: Int
thetaSize = theta1Size + theta2Size
theta1Size = (inputLayerSize+1) * hiddenLayerSize
theta2Size = (hiddenLayerSize+1) * numLabel

epsilonInit :: Double
epsilonInit = 0.12
{-
Data files are available from the following URLs.
* https://dl.dropboxusercontent.com/u/123796/gist/b553004a7ca76310e11c/data/ex4data1_X.csv.gz
* https://dl.dropboxusercontent.com/u/123796/gist/b553004a7ca76310e11c/data/ex4data1_y.csv
They are converted from ex4data1.mat using Octave as below.
> load('ex4data1.mat');
> csvwrite("ex4data1_X.csv", X);
> csvwrite("ex4data1_y.csv", y);
-}
loadData :: VG.Vector v Double => IO [(v Double, Int)]
loadData = do
  x <- do
    ret <- parseCSVFromFile "ex4data1_X.csv"
    case ret of
      Left err -> print err >> exitFailure
      Right rows -> return [VG.fromList [read x :: Double | x <- row] | row <- rows, row /= [""]]
  y <- do
    ret <- parseCSVFromFile "ex4data1_y.csv"
    case ret of
      Left err -> print err >> exitFailure
      Right rows -> return [if y' == 10 then 0 else y' | [y] <- rows, y /= "", let y' = read y]
  return $ zip x y

train :: Double -> [(VS.Vector Double, Int)] -> IO (VS.Vector Double)
train lambda samples = do
  let params = CG.defaultParameters
        { CG.printFinal = True
        , CG.printParams = True
        , CG.verbose = CG.VeryVerbose -- CG.Verbose
        , CG.maxItersFac = 50 / fromIntegral thetaSize
        }
      grad_tol = 0.00001
  (theta0 :: VS.Vector Double) <- randTheta
  (theta, _result1, _stat1) <-
    CG.optimize params grad_tol theta0
      (CG.VFunction (fst . cost lambda samples))
      (CG.VGradient (snd . cost lambda samples))
      (Just (CG.VCombined (cost lambda samples)))
  return theta

randTheta :: VG.Vector v Double => IO (v Double)
randTheta = do
  gen <- Rand.createSystemRandom
  VG.replicateM thetaSize (Rand.uniformR (-epsilonInit, epsilonInit) gen)

main :: IO ()
main = do
  let trainSize = 200
  samples <- loadData
  (trainData, testData) <- do
    gen <- Rand.createSystemRandom
    samples2 <- Rand.uniformShuffle (V.fromList samples) gen
    return (take trainSize (V.toList samples2), drop trainSize (V.toList samples2))
  let lambda = 1
  theta <- train lambda trainData
  printf "precision (training data): %f\n" $ sum [1::Double | (x,y) <- trainData, predict theta x == y] / fromIntegral (length trainData)
  printf "precision (test data): %f\n" $ sum [1::Double | (x,y) <- testData, predict theta x == y] / fromIntegral (length testData)