Recursive Neural Networks and Online Backpropagation Through Structure (BPTS)
(* recursiveNeuralNetwork.ml --- Recursive Neural Networks and
                                 Online Backpropagation Through Structure (BPTS)

   [MIT License] Copyright (C) 2015 Akinori ABE

   Compilation:

     $ ocamlfind ocamlopt -linkpkg -package slap recursiveNeuralNetwork.ml

   This program requires Sized Linear Algebra Library (SLAP), a linear algebra
   library for OCaml with static size checking for matrix operations (see
   http://akabe.github.io/slap/ for details). *)
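(* Running the resulting executable (assuming the default output name used by
   ocamlfind ocamlopt when no -o option is given):

     $ ./a.out *)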
open Format
open Slap.Io
open Slap.D
open Slap.Common
module Size = Slap.Size
(* A tree whose every node carries an ['n]-dimensional feature vector: a leaf
   holds an input vector, while the vector of a non-leaf node is overwritten
   by the forward pass. *)
type 'n tree = Node of ('n, Slap.cnt) vec * 'n tree list
(* Parameters of a recursive neural network (the same record is reused to
   store gradients). *)
type 'n params = {
  wleft  : ('n, 'n, Slap.cnt) mat; (* weights for the left side of a node *)
  wright : ('n, 'n, Slap.cnt) mat; (* weights for the right side of a node *)
  bias   : ('n, Slap.cnt) vec;     (* bias vector *)
}
let make_params feature_dim =
  {
    wleft = Mat.random feature_dim feature_dim;
    wright = Mat.random feature_dim feature_dim;
    bias = Vec.random feature_dim;
  }
(* Activation function (logistic sigmoid) and its derivative *)
let sigm a = 1.0 /. (1.0 +. exp (~-. a))
let actv_f ?y x = Vec.map ?y sigm x
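(* [actv_f' y] below evaluates the derivative of the logistic sigmoid in terms
   of its own output: if y = sigm a, then d(sigm a)/da = y *. (1.0 -. y),
   element-wise. *)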
let actv_f' y =
  let ones = Vec.make1 (Vec.dim y) in
  Vec.mul y (Vec.sub ones y)
(** [feedforward params tree] returns an output of a recursive neural network. *)
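(* Summary of the per-node computation, reconstructed from the code below
   rather than from any external reference: a non-leaf node with children
   x_0, ..., x_{k-1} computes

     y = f (bias + sum_i (cl_i * Wleft + cr_i * Wright) * x_i)

   where cr_i = i / (k - 1) (0.5 when k = 1), cl_i = 1 - cr_i, and f is the
   element-wise logistic sigmoid; a leaf simply returns its feature vector. *)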
let feedforward {wleft; wright; bias} tree =
  let rec calc_vec = function
    | Node (x, []) -> x (* a leaf *)
    | Node (y, children) -> (* a non-leaf node *)
      let last_i = List.length children - 1 in
      let add_child_vec i child =
        let x = calc_vec child in
        let cr = if last_i = 0 then 0.5 else (float i) /. (float last_i) in
        let cl = 1.0 -. cr in
        ignore (gemv ~trans:normal ~alpha:cl wleft x ~beta:1.0 ~y);
        ignore (gemv ~trans:normal ~alpha:cr wright x ~beta:1.0 ~y)
      in
      ignore (Vec.copy bias ~y);
      List.iteri add_child_vec children;
      actv_f ~y y
  in
  calc_vec tree
(** [feedback grads params tree target] computes gradients by backpropagation
    through structure (BPTS) and stores them into [grads]. *)
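(* Sketch of the recursion implemented below (reconstructed from the code):
   starting from delta = (y - target) .* f'(y) at the root, every non-leaf
   node with error signal delta and children x_0, ..., x_{k-1} accumulates

     dE/dWleft += cl_i * delta * x_i^T,  dE/dWright += cr_i * delta * x_i^T,
     dE/dbias  += delta,

   and propagates delta_i = ((cl_i * Wleft^T + cr_i * Wright^T) delta) .* f'(x_i)
   to each non-leaf child. *)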
let feedback grads {wleft; wright; bias} tree target =
  Mat.fill grads.wleft 0.0;
  Mat.fill grads.wright 0.0;
  Vec.fill grads.bias 0.0;
  let rec add_grads delta = function
    | [] -> () (* a leaf *)
    | children -> (* a non-leaf node *)
      let last_i = List.length children - 1 in
      let add_grads_child i (Node (x, c_children)) =
        let cr = if last_i = 0 then 0.5 else float i /. float last_i in
        let cl = 1.0 -. cr in
        (* Compute gradients of weights *)
        ignore (ger ~alpha:cl delta x grads.wleft);
        ignore (ger ~alpha:cr delta x grads.wright);
        (* Compute delta for a child *)
        let z = gemv ~trans:trans ~alpha:cl wleft delta in
        ignore (gemv ~trans:trans ~alpha:cr wright delta ~beta:1.0 ~y:z);
        let c_delta = Vec.mul z (actv_f' x) in
        add_grads c_delta c_children
      in
      List.iteri add_grads_child children;
      axpy ~alpha:1.0 ~x:delta grads.bias (* Compute gradients of biases *)
  in
  let Node (y, children) = tree in
  let root_delta = Vec.mul (Vec.sub y target) (actv_f' y) in
  add_grads root_delta children
(** [check_gradient grads params tree target] checks whether the given
    gradients [grads] are correct by comparing them with the results of naive
    numerical differentiation. This routine is only for checking the
    implementation; the numerical differentiation is much slower than
    backpropagation.
    cf. http://ufldl.stanford.edu/wiki/index.php/Gradient_checking_and_advanced_optimization
 *)
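(* The check below uses the symmetric difference quotient with eps = 1e-4,

     dE/dp ~= (E(p + eps) - E(p - eps)) / (2 * eps),
     E      = ||feedforward params tree - target||^2 / 2,

   and prints a warning whenever the first four significant digits disagree
   with the backpropagated gradient. *)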
let check_gradient grads params tree target =
  let epsilon = 1e-4 in
  let check_digits dE1 dE2 = (* Check 4 significant digits *)
    let abs_dE1 = abs_float dE1 in
    if abs_dE1 < 1e-9
    then abs_float dE2 < 1e-9 (* true if both `dE1' and `dE2' are nearly zero *)
    else let diff = (dE1 -. dE2) *. (0.1 ** (floor (log10 abs_dE1) +. 1.0)) in
         abs_float diff < epsilon (* true if 4 significant digits are the same *)
  in
  let calc_error () = Vec.ssqr_diff (feedforward params tree) target /. 2.0 in
  let check_vec label x dx =
    let check i dE2 =
      let elm = Vec.get_dyn x i in
      Vec.set_dyn x i (elm +. epsilon);
      let pos_err = calc_error () in
      Vec.set_dyn x i (elm -. epsilon);
      let neg_err = calc_error () in
      Vec.set_dyn x i elm; (* restore *)
      let dE1 = (pos_err -. neg_err) /. (2.0 *. epsilon) in
      if not (check_digits dE1 dE2)
      then eprintf "WARNING: %s[%d] naive diff = %.6g, backprop = %.6g@."
             label i dE1 dE2
    in
    Vec.iteri check dx
  in
  let check_mat label a da =
    Mat.fold_topi (fun i () ai ->
        let label' = label ^ "[" ^ (string_of_int i) ^ "]" in
        check_vec label' ai (Mat.row_dyn da i)) () a
  in
  check_vec "dE/db" params.bias grads.bias;
  check_mat "dE/dWleft" params.wleft grads.wleft;
  check_mat "dE/dWright" params.wright grads.wright
(** Online training for a recursive neural network. *)
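(* One online step, as implemented below: forward pass, BPTS backward pass,
   a (slow) gradient check, and a plain gradient-descent update
   p <- p -. eta *. dE/dp applied to every parameter. *)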
let train ~eta grads params tree target =
  ignore (feedforward params tree); (* feedforward *)
  feedback grads params tree target; (* feedback *)
  check_gradient grads params tree target; (* gradient checking *)
  (* Update parameters *)
  let alpha = ~-. eta in
  Mat.axpy ~alpha ~x:grads.wleft params.wleft;
  Mat.axpy ~alpha ~x:grads.wright params.wright;
  axpy ~alpha ~x:grads.bias params.bias
let main () =
  Random.self_init ();
  let module N = (val Size.of_int_dyn 5 : Size.SIZE) in
  let node children = Node (Vec.create N.value, children) in
  let leaf feature_vec = Node (feature_vec, []) in
  let tree = node [
      node [
        leaf (Vec.make N.value 1.0);
        node [leaf (Vec.make N.value 2.0);
              leaf (Vec.make N.value 3.0)]];
      leaf (Vec.make N.value 4.0);
      node [leaf (Vec.make N.value 5.0);
            leaf (Vec.make N.value 6.0);
            leaf (Vec.make N.value 7.0)];
    ] in
  let target = Vec.make N.value 0.6 in
  let params = make_params N.value in
  let grads = make_params N.value in
  let eta = ref 0.02 in
  for i = 1 to 1000 do
    train ~eta:!eta grads params tree target;
    eta := !eta *. 0.99
  done;
  printf "target = [ %a]; prediction = [ %a]@."
    pp_rfvec target pp_rfvec (feedforward params tree)

let () = main ()