\usepackage[many]{tcolorbox}
\usepackage{amsfonts}
\usepackage{amsmath}
\DeclareMathOperator*{\argmax}{argmax}
\begin{tcolorbox}[colback=white,
colframe=-red!75!green!50,
title=Algorithm: Off-policy Monte Carlo Control with Weighted Importance Sampling,
sidebyside align=top,
box align=top,
halign=flush left]
\textbf{Input:} \\
\quad $\varepsilon$: a small exploration probability in the range $[0, 1]$ \\
\quad $\gamma$: discount factor\\
\quad $b$: a soft behavior policy; we use an $\varepsilon$-greedy policy in this example\\
\quad $num\_episodes$: total number of episodes, integer\\
\textbf{Output:}\\
\quad $Q$: action-value function for the optimal policy $\pi_*$\\
\hfill \\
Initialize $Q: \mathcal{S} \times \mathcal{A} \rightarrow \mathbb{R}$ arbitrarily; we use optimistic initial values in this example \\
Initialize $C(s, a) \leftarrow 0$ for all $s \in \mathcal{S}$, $a \in \mathcal{A}$ \hfill $\rhd$ cumulative sum of importance-sampling weights\\
Initialize $\pi(s) \leftarrow \argmax_a Q(s, a)$ for all $s \in \mathcal{S}$ \hfill $\rhd$ deterministic (greedy) target policy\\
\hfill\\
\textbf{for} $i \leftarrow 1$ \textbf{to} $num\_episodes$ \textbf{do}\\
\quad $\mathcal{T} \leftarrow \text{empty array}$\\
\quad \textbf{while} episode not terminated \textbf{do}\\
\quad \quad $A_t \sim b(\cdot \mid S_t)$ \hfill $\rhd$ sample an action from the behavior policy\\
\quad \quad Take action $A_t$ and observe $(S_{t+1}, R_{t+1})$\\
\quad \quad Append $(S_t, A_t, R_{t+1})$ to $\mathcal{T}$\\
\quad Let $T$ be the length of the episode, so $\mathcal{T}$ holds steps $t = 0, \dots, T-1$\\
\quad $G\leftarrow 0$ \hfill $\rhd$ the return of the episode\\
\quad $W\leftarrow 1$ \hfill $\rhd$ sampling weights\\
\quad \textbf{for} $t \leftarrow T - 1$ \textbf{down to} $0$ \textbf{do}\\
\quad \quad $G \leftarrow \gamma G + R_{t+1}$\\
\quad \quad $C(S_t, A_t) \leftarrow C(S_t, A_t) + W$ \\
\quad \quad $Q(S_t, A_t) \leftarrow Q(S_t, A_t) + \frac{W}{C(S_t, A_t)}[G - Q(S_t, A_t)]$ \hfill $\rhd$ update $Q$ table\\
\quad \quad $\pi(S_t) \leftarrow \argmax_a Q(S_t, a)$ \hfill $\rhd$ update policy, with ties broken consistently\\
\quad \quad \textbf{if} $A_t \neq \pi(S_t)$ \textbf{then}\\
\quad \quad \quad \textbf{break} \hfill $\rhd$ $\pi$ is deterministic, so $\pi(A_t|S_t) = 0$ and all earlier weights would vanish\\
\quad \quad \textbf{else}\\
\quad \quad \quad $W \leftarrow W\frac{1}{b(A_t|S_t)}$, where $b(A_t|S_t) = \begin{cases}1 - \varepsilon + \frac{\varepsilon}{|\mathcal{A}(S_t)|}, &A_t = A^* \\\frac{\varepsilon}{|\mathcal{A}(S_t)|}, &A_t \neq A^* \end{cases}$ and $A^* = \argmax_a Q(S_t, a)$ \hfill $\rhd$ update $W$\\
\textbf{return} $Q$
\end{tcolorbox}
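
For reference, here is a minimal Python sketch of the algorithm described in the box above. It is an illustration under stated assumptions, not the gist author's implementation: it assumes a Gym-style discrete environment whose env.reset() returns a state and whose env.step(action) returns (next_state, reward, terminated), and that every state shares the same action set of size num_actions; the names off_policy_mc_control, num_actions, and optimistic_value are hypothetical.

import random
from collections import defaultdict


def off_policy_mc_control(env, num_actions, num_episodes,
                          gamma=1.0, epsilon=0.1, optimistic_value=1.0):
    # Q: optimistically initialized action values; C: cumulative importance weights
    Q = defaultdict(lambda: [optimistic_value] * num_actions)
    C = defaultdict(lambda: [0.0] * num_actions)

    def greedy(state):
        values = Q[state]
        return values.index(max(values))  # ties broken consistently (lowest index)

    def behavior_action(state):
        # epsilon-greedy behavior policy b
        if random.random() < epsilon:
            return random.randrange(num_actions)
        return greedy(state)

    for _ in range(num_episodes):
        # generate one episode following b
        episode = []
        state = env.reset()
        done = False
        while not done:
            action = behavior_action(state)
            next_state, reward, done = env.step(action)  # assumed interface
            episode.append((state, action, reward))
            state = next_state

        G, W = 0.0, 1.0
        for state, action, reward in reversed(episode):  # t = T-1 down to 0
            G = gamma * G + reward
            C[state][action] += W
            Q[state][action] += (W / C[state][action]) * (G - Q[state][action])
            if action != greedy(state):
                break  # pi(A_t|S_t) = 0 under the greedy target policy, so stop
            # probability of the greedy action under the epsilon-greedy policy b
            b_prob = 1.0 - epsilon + epsilon / num_actions
            W *= 1.0 / b_prob  # weighted importance-sampling weight update
    return Q

The returned Q induces the greedy target policy pi(s) = argmax_a Q[s][a]; the deterministic tie-breaking in greedy() mirrors the pseudocode's note about breaking ties consistently.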