izmailovpavel/simple_skier.ipynb

## simple_skier.ipynb
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import numpy as np"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Simplified skiing\n",
    "\n",
    "We have a strip of shape 50x5, and a skier is moving down it. Every 5 rows there is a gate. Possibly, we will add trees later. At the end of the run the skier is told how many gates he went through. At each time step the skier observes the number of rows to the next gate, the position of the next gate, and his own position.\n",
    "\n",
    "We want to use this example to test the RLGAN idea."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "from sklearn.preprocessing import OneHotEncoder"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 146,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def one_hot_many_features(arr, n_values):\n",
    "    total_n_values = np.prod(n_values)\n",
    "    n_values.append(1)\n",
    "    weights = np.cumprod(n_values[::-1])[::-1][1:]\n",
    "    weights = weights[:, None]\n",
    "#     print(weights, total_n_values)\n",
    "    combined_features = arr.dot(weights)\n",
    "    encoder = OneHotEncoder(n_values=total_n_values)\n",
    "    new_features = encoder.fit_transform(combined_features)\n",
    "    return new_features.toarray()\n",
    "\n",
    "class SimpleSkiGame:\n",
    "    \n",
    "    def __init__(self, strip_width=5, num_gates=10, gate_freq=5):\n",
    "        self.strip_width = strip_width\n",
    "        self.num_gates = num_gates\n",
    "        self.gate_freq = gate_freq\n",
    "        self.strip_len = self.num_gates * self.gate_freq\n",
    "        \n",
    "    def generate_game(self):\n",
    "        gates = np.random.randint(low=0, high=self.strip_width, size=self.num_gates)\n",
    "        initial_position = np.random.randint(low=0, high=self.strip_width)\n",
    "        actions = np.random.randint(low=-1, high=2, size=self.strip_len - 1)\n",
    "        trajectory = [initial_position]\n",
    "        for a in actions:\n",
    "            next_position = trajectory[-1] + a\n",
    "            if next_position >= self.strip_width:\n",
    "                next_position = self.strip_width - 1\n",
    "            elif next_position < 0:\n",
    "                next_position = 0\n",
    "            trajectory.append(next_position)\n",
    "        dist_to_gate = np.arange(4, -1, -1)\n",
    "        dist_to_gate = np.tile(dist_to_gate, self.num_gates)[:, None]\n",
    "        trajectory = np.array(trajectory)[:, None]\n",
    "        trajectory_at_gates = trajectory.reshape((-1, self.gate_freq))[:, -1]\n",
    "        reward = np.sum(trajectory_at_gates == gates)\n",
    "        gates = np.repeat(gates, self.gate_freq)\n",
    "        gates = gates[:-1, None]\n",
    "        trajectory = trajectory = trajectory[:-1]\n",
    "        dist_to_gate = dist_to_gate[:-1]\n",
    "        actions = actions[:, None] + 1\n",
    "        states = self.transform_state_action_to_onehot(gates, trajectory, dist_to_gate, actions)\n",
    "#         print(gates)\n",
    "#         print(trajectory)\n",
    "#         print(dist_to_gate)\n",
    "#         state_action_features = np.hstack([gates[:-1], trajectory[:-1], dist_to_gate[:-1], actions[:, None]+1])\n",
    "#         states = one_hot_many_features(state_action_features, \n",
    "#                                        [self.strip_width, self.strip_width, self.gate_freq, 3])\n",
    "        \n",
    "        return states, reward\n",
    "    \n",
    "    def transform_state_action_to_onehot(self, gates, trajectory, dist_to_gate, actions):\n",
    "        state_action_features = np.hstack([gates, trajectory, dist_to_gate, actions])\n",
    "        states = one_hot_many_features(state_action_features, \n",
    "                                       [self.strip_width, self.strip_width, self.gate_freq, 3])\n",
    "        return states\n",
    "#     def one_hot(self, arr):\n",
    "#         \"\"\"\n",
    "#         Combines three categorical features into one, and applies one hot encoding.\n",
    "#         \"\"\"\n",
    "#         weights = np.array([self.strip_width * self.gate_freq, self.gate_freq, 1])[:, None]\n",
    "#         combined_features = arr.dot(weights)\n",
    "#         encoder = OneHotEncoder(n_values=self.strip_width * self.strip_width * self.gate_freq)\n",
    "#         new_features = encoder.fit_transform(combined_features)\n",
    "#         return new_features.toarray()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 147,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "game = SimpleSkiGame()\n",
    "s, r = game.generate_game()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Generate many games"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 176,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "n_games = 500\n",
    "game = SimpleSkiGame()\n",
    "# Each game yields a (states, reward) pair; the unused `actions` accumulator\n",
    "# that was never filled has been dropped.\n",
    "games = [game.generate_game() for _ in range(n_games)]\n",
    "# Stack the per-game state matrices along a new leading game axis.\n",
    "states = np.array([game_states for game_states, _ in games])\n",
    "# Rewards become a column so they line up with the model's column output.\n",
    "rewards = np.array([game_reward for _, game_reward in games])[:, None]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 177,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(500, 1)"
      ]
     },
     "execution_count": 177,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "rewards.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 178,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(500, 49, 375)"
      ]
     },
     "execution_count": 178,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "states.shape"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Solve"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 189,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "import tensorflow as tf"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 205,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "tf.reset_default_graph()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 206,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "num_tr = 400\n",
    "# The data set lives in the graph as frozen (non-trainable) variables.\n",
    "S = tf.Variable(initial_value=states, trainable=False)\n",
    "r = tf.Variable(initial_value=rewards.astype(float), trainable=False)\n",
    "# The first num_tr games are used for fitting; the rest are held out.\n",
    "S_tr, S_te = S[:num_tr], S[num_tr:]\n",
    "r_tr, r_te = r[:num_tr], r[num_tr:]\n",
    "# One weight per one-hot feature, initialised to zero.\n",
    "w = tf.Variable(initial_value=tf.zeros(states.shape[-1], dtype=tf.float64))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 207,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "def predict_rewards(S, w):\n",
    "    \"\"\"Linear reward model.\n",
    "\n",
    "    Contracts the 3-D tensor `S` (einsum indices g, t, f) with the vector\n",
    "    `w` (index f) over t and f, giving one value per g, and returns those\n",
    "    values as a column. In this notebook S is passed as\n",
    "    (games, steps, one-hot features).\n",
    "    \"\"\"\n",
    "    per_game_total = tf.einsum('gtf,f->g', S, w)\n",
    "    return per_game_total[:, None]\n",
    "\n",
    "def compute_loss(r, predicted_rewards):\n",
    "    \"\"\"Mean squared error between the targets `r` and `predicted_rewards`.\"\"\"\n",
    "    residual = r - predicted_rewards\n",
    "    return tf.reduce_mean(residual ** 2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 208,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "pred_tr = predict_rewards(S_tr, w)\n",
    "pred_te = predict_rewards(S_te, w)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 209,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "tr_loss = compute_loss(r_tr, pred_tr)\n",
    "te_loss = compute_loss(r_te, pred_te)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 210,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "sgd = tf.train.GradientDescentOptimizer(learning_rate=1e-1)\n",
    "train_op = sgd.minimize(tr_loss)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 211,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "sess = tf.Session()\n",
    "sess.run(tf.global_variables_initializer())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 212,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Train loss: 6.3425\n",
      "Test loss: 2.25397945338\n",
      "Train loss: 0.379442088898\n",
      "Test loss: 0.595460997003\n",
      "Train loss: 0.153750168329\n",
      "Test loss: 0.338179879635\n",
      "Train loss: 0.0837601282732\n",
      "Test loss: 0.234497723606\n",
      "Train loss: 0.0546310347671\n",
      "Test loss: 0.183248186286\n",
      "Train loss: 0.0399317846675\n",
      "Test loss: 0.154180699614\n",
      "Train loss: 0.0314565089633\n",
      "Test loss: 0.135889983082\n",
      "Train loss: 0.026074058861\n",
      "Test loss: 0.123395368644\n",
      "Train loss: 0.0223974063954\n",
      "Test loss: 0.114284569812\n",
      "Train loss: 0.0197402973611\n",
      "Test loss: 0.107293189233\n",
      "Train loss: 0.0177331251402\n",
      "Test loss: 0.101713545277\n",
      "Train loss: 0.0161628263052\n",
      "Test loss: 0.0971257320751\n",
      "Train loss: 0.014899440224\n",
      "Test loss: 0.0932667862562\n",
      "Train loss: 0.0138597835011\n",
      "Test loss: 0.0899635627337\n",
      "Train loss: 0.0129883488206\n",
      "Test loss: 0.0870967155535\n",
      "Train loss: 0.0122467314998\n",
      "Test loss: 0.0845805740204\n",
      "Train loss: 0.0116075102425\n",
      "Test loss: 0.0823514607107\n",
      "Train loss: 0.0110505631841\n",
      "Test loss: 0.0803606499605\n",
      "Train loss: 0.0105607699468\n",
      "Test loss: 0.0785699614539\n",
      "Train loss: 0.0101265314761\n",
      "Test loss: 0.0769488988728\n",
      "Train loss: 0.0097387884921\n",
      "Test loss: 0.0754727242866\n",
      "Train loss: 0.00939035324234\n",
      "Test loss: 0.0741211181998\n",
      "Train loss: 0.00907544361886\n",
      "Test loss: 0.0728772184349\n",
      "Train loss: 0.00878935133292\n",
      "Test loss: 0.0717269120985\n",
      "Train loss: 0.00852820096685\n",
      "Test loss: 0.0706583018814\n",
      "Train loss: 0.00828877194043\n",
      "Test loss: 0.0696612958774\n",
      "Train loss: 0.00806836487412\n",
      "Test loss: 0.068727287151\n",
      "Train loss: 0.00786469983166\n",
      "Test loss: 0.0678488999665\n",
      "Train loss: 0.00767583782069\n",
      "Test loss: 0.0670197864754\n",
      "Train loss: 0.00750011950963\n",
      "Test loss: 0.0662344622183\n",
      "Train loss: 0.00733611685951\n",
      "Test loss: 0.065488171894\n",
      "Train loss: 0.00718259456421\n",
      "Test loss: 0.0647767790094\n",
      "Train loss: 0.00703847902451\n",
      "Test loss: 0.0640966745606\n",
      "Train loss: 0.00690283317036\n",
      "Test loss: 0.0634447010142\n",
      "Train loss: 0.00677483586769\n",
      "Test loss: 0.0628180886843\n",
      "Train loss: 0.00665376495185\n",
      "Test loss: 0.0622144022209\n",
      "Train loss: 0.0065389831547\n",
      "Test loss: 0.0616314953997\n",
      "Train loss: 0.0064299263591\n",
      "Test loss: 0.0610674727634\n",
      "Train loss: 0.0063260937389\n",
      "Test loss: 0.0605206569501\n",
      "Train loss: 0.00622703943767\n",
      "Test loss: 0.059989560765\n"
     ]
    }
   ],
   "source": [
    "num_iter = 400\n",
    "for i in range(num_iter):\n",
    "    # One gradient step on the training loss; its value comes from the same run call.\n",
    "    _, loss_val = sess.run([train_op, tr_loss])\n",
    "    if i % 10 == 0:\n",
    "        # Every 10th iteration: report progress and evaluate the held-out loss.\n",
    "        print('Train loss:', loss_val)\n",
    "        print('Test loss:', sess.run(te_loss))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 214,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.059523919593294561"
      ]
     },
     "execution_count": 214,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "sess.run(te_loss)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 213,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "w_model = sess.run(w)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 204,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "sess.close()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 284,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[['.' '.' '.' '.' '.']\n",
      " ['.' '.' '.' '.' '.']\n",
      " ['.' '.' '.' '.' '.']\n",
      " ['.' '.' '.' 'S' '.']\n",
      " ['.' '.' '.' '.' 'G']]\n",
      "left : [-0.15964259]\n",
      "down : [-0.12028602]\n",
      "right : [ 0.50504172]\n"
     ]
    }
   ],
   "source": [
    "gate = 4  # 0 ... 4, position of gate\n",
    "pos = 3   # 0 ... 4, position of skier\n",
    "dist = 1  # 0 ... 4, vertical distance to gate\n",
    "\n",
    "# Every feature is a 1x1 array so the four can be stacked into a single\n",
    "# state-action row (all four are now arrays, matching `gates` and `actions`).\n",
    "gates = np.array([[gate]])\n",
    "trajectory = np.array([[pos]])\n",
    "dist_to_gate = np.array([[dist]])\n",
    "\n",
    "print(gen_pic(gate, pos, dist))\n",
    "\n",
    "# Print the fitted weight of each action in this state.\n",
    "for a, action in enumerate(['left', 'down', 'right']):\n",
    "    actions = np.array([[a]])  # 0 ... 2, left down right\n",
    "    encoded = game.transform_state_action_to_onehot(gates, trajectory, dist_to_gate, actions)\n",
    "    print(action, ': ', end='')\n",
    "    print(encoded.dot(w_model))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 272,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "def gen_pic(gate, pos_x, pos_y):\n",
    "    res = np.array([['.']*5]*5)\n",
    "    res[4-pos_y, pos_x] = 'S'\n",
    "    res[4, gate] = 'G'\n",
    "    return res"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 273,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([['.', '.', '.', '.', '.'],\n",
       "       ['.', '.', '.', '.', '.'],\n",
       "       ['.', '.', '.', '.', '.'],\n",
       "       ['S', '.', '.', '.', '.'],\n",
       "       ['.', 'G', '.', '.', '.']],\n",
       "      dtype='<U1')"
      ]
     },
     "execution_count": 273,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.4.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}
	{
	"cells": [
	{
	"cell_type": "code",
	"execution_count": 1,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": [
	"import numpy as np"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"# Simplified skiing\n",
	"\n",
	"We have a strip of shape 50x5, and a skier is moving down it. Every 5 rows there is a gate. Possibly, we will add trees later. At the end of the run the skier is told how many gates he went through. At each time step the skier observes the number of rows to the next gate, the position of the next gate, and his own position.\n",
	"\n",
	"We want to use this example to test the RLGAN idea."
	]
	},
	{
	"cell_type": "code",
	"execution_count": 2,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": [
	"from sklearn.preprocessing import OneHotEncoder"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 146,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": [
	"def one_hot_many_features(arr, n_values):\n",
	" total_n_values = np.prod(n_values)\n",
	" n_values.append(1)\n",
	" weights = np.cumprod(n_values[::-1])[::-1][1:]\n",
	" weights = weights[:, None]\n",
	"# print(weights, total_n_values)\n",
	" combined_features = arr.dot(weights)\n",
	" encoder = OneHotEncoder(n_values=total_n_values)\n",
	" new_features = encoder.fit_transform(combined_features)\n",
	" return new_features.toarray()\n",
	"\n",
	"class SimpleSkiGame:\n",
	" \n",
	" def __init__(self, strip_width=5, num_gates=10, gate_freq=5):\n",
	" self.strip_width = strip_width\n",
	" self.num_gates = num_gates\n",
	" self.gate_freq = gate_freq\n",
	" self.strip_len = self.num_gates * self.gate_freq\n",
	" \n",
	" def generate_game(self):\n",
	" gates = np.random.randint(low=0, high=self.strip_width, size=self.num_gates)\n",
	" initial_position = np.random.randint(low=0, high=self.strip_width)\n",
	" actions = np.random.randint(low=-1, high=2, size=self.strip_len - 1)\n",
	" trajectory = [initial_position]\n",
	" for a in actions:\n",
	" next_position = trajectory[-1] + a\n",
	" if next_position >= self.strip_width:\n",
	" next_position = self.strip_width - 1\n",
	" elif next_position < 0:\n",
	" next_position = 0\n",
	" trajectory.append(next_position)\n",
	" dist_to_gate = np.arange(4, -1, -1)\n",
	" dist_to_gate = np.tile(dist_to_gate, self.num_gates)[:, None]\n",
	" trajectory = np.array(trajectory)[:, None]\n",
	" trajectory_at_gates = trajectory.reshape((-1, self.gate_freq))[:, -1]\n",
	" reward = np.sum(trajectory_at_gates == gates)\n",
	" gates = np.repeat(gates, self.gate_freq)\n",
	" gates = gates[:-1, None]\n",
	" trajectory = trajectory = trajectory[:-1]\n",
	" dist_to_gate = dist_to_gate[:-1]\n",
	" actions = actions[:, None] + 1\n",
	" states = self.transform_state_action_to_onehot(gates, trajectory, dist_to_gate, actions)\n",
	"# print(gates)\n",
	"# print(trajectory)\n",
	"# print(dist_to_gate)\n",
	"# state_action_features = np.hstack([gates[:-1], trajectory[:-1], dist_to_gate[:-1], actions[:, None]+1])\n",
	"# states = one_hot_many_features(state_action_features, \n",
	"# [self.strip_width, self.strip_width, self.gate_freq, 3])\n",
	" \n",
	" return states, reward\n",
	" \n",
	" def transform_state_action_to_onehot(self, gates, trajectory, dist_to_gate, actions):\n",
	" state_action_features = np.hstack([gates, trajectory, dist_to_gate, actions])\n",
	" states = one_hot_many_features(state_action_features, \n",
	" [self.strip_width, self.strip_width, self.gate_freq, 3])\n",
	" return states\n",
	"# def one_hot(self, arr):\n",
	"# \"\"\"\n",
	"# Combines three categorical features into one, and applies one hot encoding.\n",
	"# \"\"\"\n",
	"# weights = np.array([self.strip_width * self.gate_freq, self.gate_freq, 1])[:, None]\n",
	"# combined_features = arr.dot(weights)\n",
	"# encoder = OneHotEncoder(n_values=self.strip_width * self.strip_width * self.gate_freq)\n",
	"# new_features = encoder.fit_transform(combined_features)\n",
	"# return new_features.toarray()"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 147,
	"metadata": {
	"collapsed": false
	},
	"outputs": [],
	"source": [
	"game = SimpleSkiGame()\n",
	"s, r = game.generate_game()"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"## Generate many games"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 176,
	"metadata": {
	"collapsed": false
	},
	"outputs": [],
	"source": [
	"n_games = 500\n",
	"game = SimpleSkiGame()\n",
	"# Each game yields a (states, reward) pair; the unused `actions` accumulator\n",
	"# that was never filled has been dropped.\n",
	"games = [game.generate_game() for _ in range(n_games)]\n",
	"# Stack the per-game state matrices along a new leading game axis.\n",
	"states = np.array([game_states for game_states, _ in games])\n",
	"# Rewards become a column so they line up with the model's column output.\n",
	"rewards = np.array([game_reward for _, game_reward in games])[:, None]"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 177,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"(500, 1)"
	]
	},
	"execution_count": 177,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"rewards.shape"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 178,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"(500, 49, 375)"
	]
	},
	"execution_count": 178,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"states.shape"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"## Solve"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 189,
	"metadata": {
	"collapsed": false
	},
	"outputs": [],
	"source": [
	"import tensorflow as tf"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 205,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": [
	"tf.reset_default_graph()"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 206,
	"metadata": {
	"collapsed": false
	},
	"outputs": [],
	"source": [
	"num_tr = 400\n",
	"# The data set lives in the graph as frozen (non-trainable) variables.\n",
	"S = tf.Variable(initial_value=states, trainable=False)\n",
	"r = tf.Variable(initial_value=rewards.astype(float), trainable=False)\n",
	"# The first num_tr games are used for fitting; the rest are held out.\n",
	"S_tr, S_te = S[:num_tr], S[num_tr:]\n",
	"r_tr, r_te = r[:num_tr], r[num_tr:]\n",
	"# One weight per one-hot feature, initialised to zero.\n",
	"w = tf.Variable(initial_value=tf.zeros(states.shape[-1], dtype=tf.float64))"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 207,
	"metadata": {
	"collapsed": false
	},
	"outputs": [],
	"source": [
	"def predict_rewards(S, w):\n",
	"    \"\"\"Linear reward model.\n",
	"\n",
	"    Contracts the 3-D tensor `S` (einsum indices g, t, f) with the vector\n",
	"    `w` (index f) over t and f, giving one value per g, and returns those\n",
	"    values as a column. In this notebook S is passed as\n",
	"    (games, steps, one-hot features).\n",
	"    \"\"\"\n",
	"    per_game_total = tf.einsum('gtf,f->g', S, w)\n",
	"    return per_game_total[:, None]\n",
	"\n",
	"def compute_loss(r, predicted_rewards):\n",
	"    \"\"\"Mean squared error between the targets `r` and `predicted_rewards`.\"\"\"\n",
	"    residual = r - predicted_rewards\n",
	"    return tf.reduce_mean(residual ** 2)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 208,
	"metadata": {
	"collapsed": false
	},
	"outputs": [],
	"source": [
	"pred_tr = predict_rewards(S_tr, w)\n",
	"pred_te = predict_rewards(S_te, w)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 209,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": [
	"tr_loss = compute_loss(r_tr, pred_tr)\n",
	"te_loss = compute_loss(r_te, pred_te)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 210,
	"metadata": {
	"collapsed": false
	},
	"outputs": [],
	"source": [
	"sgd = tf.train.GradientDescentOptimizer(learning_rate=1e-1)\n",
	"train_op = sgd.minimize(tr_loss)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 211,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": [
	"sess = tf.Session()\n",
	"sess.run(tf.global_variables_initializer())"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 212,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"Train loss: 6.3425\n",
	"Test loss: 2.25397945338\n",
	"Train loss: 0.379442088898\n",
	"Test loss: 0.595460997003\n",
	"Train loss: 0.153750168329\n",
	"Test loss: 0.338179879635\n",
	"Train loss: 0.0837601282732\n",
	"Test loss: 0.234497723606\n",
	"Train loss: 0.0546310347671\n",
	"Test loss: 0.183248186286\n",
	"Train loss: 0.0399317846675\n",
	"Test loss: 0.154180699614\n",
	"Train loss: 0.0314565089633\n",
	"Test loss: 0.135889983082\n",
	"Train loss: 0.026074058861\n",
	"Test loss: 0.123395368644\n",
	"Train loss: 0.0223974063954\n",
	"Test loss: 0.114284569812\n",
	"Train loss: 0.0197402973611\n",
	"Test loss: 0.107293189233\n",
	"Train loss: 0.0177331251402\n",
	"Test loss: 0.101713545277\n",
	"Train loss: 0.0161628263052\n",
	"Test loss: 0.0971257320751\n",
	"Train loss: 0.014899440224\n",
	"Test loss: 0.0932667862562\n",
	"Train loss: 0.0138597835011\n",
	"Test loss: 0.0899635627337\n",
	"Train loss: 0.0129883488206\n",
	"Test loss: 0.0870967155535\n",
	"Train loss: 0.0122467314998\n",
	"Test loss: 0.0845805740204\n",
	"Train loss: 0.0116075102425\n",
	"Test loss: 0.0823514607107\n",
	"Train loss: 0.0110505631841\n",
	"Test loss: 0.0803606499605\n",
	"Train loss: 0.0105607699468\n",
	"Test loss: 0.0785699614539\n",
	"Train loss: 0.0101265314761\n",
	"Test loss: 0.0769488988728\n",
	"Train loss: 0.0097387884921\n",
	"Test loss: 0.0754727242866\n",
	"Train loss: 0.00939035324234\n",
	"Test loss: 0.0741211181998\n",
	"Train loss: 0.00907544361886\n",
	"Test loss: 0.0728772184349\n",
	"Train loss: 0.00878935133292\n",
	"Test loss: 0.0717269120985\n",
	"Train loss: 0.00852820096685\n",
	"Test loss: 0.0706583018814\n",
	"Train loss: 0.00828877194043\n",
	"Test loss: 0.0696612958774\n",
	"Train loss: 0.00806836487412\n",
	"Test loss: 0.068727287151\n",
	"Train loss: 0.00786469983166\n",
	"Test loss: 0.0678488999665\n",
	"Train loss: 0.00767583782069\n",
	"Test loss: 0.0670197864754\n",
	"Train loss: 0.00750011950963\n",
	"Test loss: 0.0662344622183\n",
	"Train loss: 0.00733611685951\n",
	"Test loss: 0.065488171894\n",
	"Train loss: 0.00718259456421\n",
	"Test loss: 0.0647767790094\n",
	"Train loss: 0.00703847902451\n",
	"Test loss: 0.0640966745606\n",
	"Train loss: 0.00690283317036\n",
	"Test loss: 0.0634447010142\n",
	"Train loss: 0.00677483586769\n",
	"Test loss: 0.0628180886843\n",
	"Train loss: 0.00665376495185\n",
	"Test loss: 0.0622144022209\n",
	"Train loss: 0.0065389831547\n",
	"Test loss: 0.0616314953997\n",
	"Train loss: 0.0064299263591\n",
	"Test loss: 0.0610674727634\n",
	"Train loss: 0.0063260937389\n",
	"Test loss: 0.0605206569501\n",
	"Train loss: 0.00622703943767\n",
	"Test loss: 0.059989560765\n"
	]
	}
	],
	"source": [
	"num_iter = 400\n",
	"for i in range(num_iter):\n",
	"    # One gradient step on the training loss; its value comes from the same run call.\n",
	"    _, loss_val = sess.run([train_op, tr_loss])\n",
	"    if i % 10 == 0:\n",
	"        # Every 10th iteration: report progress and evaluate the held-out loss.\n",
	"        print('Train loss:', loss_val)\n",
	"        print('Test loss:', sess.run(te_loss))"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 214,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"0.059523919593294561"
	]
	},
	"execution_count": 214,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"sess.run(te_loss)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 213,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": [
	"w_model = sess.run(w)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 204,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": [
	"sess.close()"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 284,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"[['.' '.' '.' '.' '.']\n",
	" ['.' '.' '.' '.' '.']\n",
	" ['.' '.' '.' '.' '.']\n",
	" ['.' '.' '.' 'S' '.']\n",
	" ['.' '.' '.' '.' 'G']]\n",
	"left : [-0.15964259]\n",
	"down : [-0.12028602]\n",
	"right : [ 0.50504172]\n"
	]
	}
	],
	"source": [
	"gate = 4  # 0 ... 4, position of gate\n",
	"pos = 3   # 0 ... 4, position of skier\n",
	"dist = 1  # 0 ... 4, vertical distance to gate\n",
	"\n",
	"# Every feature is a 1x1 array so the four can be stacked into a single\n",
	"# state-action row (all four are now arrays, matching `gates` and `actions`).\n",
	"gates = np.array([[gate]])\n",
	"trajectory = np.array([[pos]])\n",
	"dist_to_gate = np.array([[dist]])\n",
	"\n",
	"print(gen_pic(gate, pos, dist))\n",
	"\n",
	"# Print the fitted weight of each action in this state.\n",
	"for a, action in enumerate(['left', 'down', 'right']):\n",
	"    actions = np.array([[a]])  # 0 ... 2, left down right\n",
	"    encoded = game.transform_state_action_to_onehot(gates, trajectory, dist_to_gate, actions)\n",
	"    print(action, ': ', end='')\n",
	"    print(encoded.dot(w_model))"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 272,
	"metadata": {
	"collapsed": false
	},
	"outputs": [],
	"source": [
	"def gen_pic(gate, pos_x, pos_y):\n",
	" res = np.array([['.']5]5)\n",
	" res[4-pos_y, pos_x] = 'S'\n",
	" res[4, gate] = 'G'\n",
	" return res"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 273,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"array([['.', '.', '.', '.', '.'],\n",
	" ['.', '.', '.', '.', '.'],\n",
	" ['.', '.', '.', '.', '.'],\n",
	" ['S', '.', '.', '.', '.'],\n",
	" ['.', 'G', '.', '.', '.']],\n",
	" dtype='<U1')"
	]
	},
	"execution_count": 273,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": []
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": []
	}
	],
	"metadata": {
	"kernelspec": {
	"display_name": "Python 3",
	"language": "python",
	"name": "python3"
	},
	"language_info": {
	"codemirror_mode": {
	"name": "ipython",
	"version": 3
	},
	"file_extension": ".py",
	"mimetype": "text/x-python",
	"name": "python",
	"nbconvert_exporter": "python",
	"pygments_lexer": "ipython3",
	"version": "3.4.3"
	}
	},
	"nbformat": 4,
	"nbformat_minor": 0
	}