\documentstyle[12pt,psfig]{book}
%\setlength{\oddsidemargin}{-.1 in}
%\setlength{\evensidemargin}{-.3 in}
\setlength{\evensidemargin}{.0 in}
\addtolength{\topmargin}{-1in}
\setlength{\textwidth}{6.5in}
\setlength{\textheight}{8in}
%
% this command enables to remove a whole part of the text
% from the printout
% to use it just enter
% \remove{
% before the text to be excluded and
% }
% after the text
\newcommand{\remove}[1]{}

%
% The following macros are used to generate nice code for programs.
% See example on how to use it below
%

%%%%%%%%%%%%%%%%%%%%% program macros %%%%%%%%%%%%%%%%%

\newcommand{\Do}{{\small\bf do}\ }
\newcommand{\Return}{{\small\bf return\ }}
\newcommand{\Proc}[1]{#1\+}
\newcommand{\Returns}{{\small\bf returns}}
\newcommand{\Procbegin}{{\small\bf begin}}
\newcommand{\If}{{\small\bf if}\ \=\+}
\newcommand{\Then}{{\small\bf then}\ \=\+}
\newcommand{\Else}{\<{\small\bf else}\ \>}
\newcommand{\Elseif}{\<{\small\bf elseif}\ }
\newcommand{\Endif}{\<{\small\bf end\ if\ }\-\-}
\newcommand{\Endproc}[1]{{\small\bf end} #1\-}
\newcommand{\For}{{\small\bf for}\ \=\+}
\newcommand{\Endfor}{{\small\bf end\ for}\ \-}
\newcommand{\Loop}{{\bf loop}\ \= \+}
\newcommand{\Endloop}{{\bf end\ loop}\ \-}
\newenvironment{program}{
    \begin{minipage}{\textwidth}
    \begin{tabbing}
    \ \ \ \ \=\kill
  }{
    \end{tabbing}
    \end{minipage}
  }

% a blank line
\def\blankline{\hbox{}}

%%%%%%%%%%%%%%%%%%%%% End of PROGRAM macros %%%%%%%%%%%%%%%%%


%
% The following macro is used to generate the header.
%
%

\newcommand{\lecture}[7]{
   \pagestyle{headings}
   \thispagestyle{plain}
   \newpage
   \setcounter{chapter}{#1}
   \setcounter{page}{#2}
%  \set\thechapter{#3}
   \noindent
   \begin{center}
   \framebox{
      \vbox{
    \hbox to 6.28in { {\bf Computational  Learning Theory
                        \hfill Fall Semester, 1999/00} }
       \vspace{4mm}
       \hbox to 6.28in { {\Large \hfill Lecture #1: #3  \hfill} }
       \vspace{2mm}
       \hbox to 6.28in { {\it Lecturer: #4 \hfill Scribe: #5, #6, #7} }
      }
   }
   \end{center}
   \markboth{Lecture #1: #3}{Lecture #1: #3}
   \vspace*{4mm}
}

%
% Use these macros for organizing sections of your notes.
% Each command takes two arguments: (1) the title of the section and and
% (2) a keyword for that section to appear in the index.  (See examples.)
% Please don't use \section, \subsection, and \subsubsection directly!
%

\newcommand{\topic}[2]{\section{#1} \index{#2} \markright{#1}}
\newcommand{\subtopic}[2]{\subsection{#1} \index{#2}}
\newcommand{\subsubtopic}[2]{\subsubsection{#1} \index{#2}}

%
% Convention for citations is first author's last name followed by other
% authors' last initials, followed by the year.  For example, to cite the
% seventh entry in the course bibliography, you would type: \cite{BurnsL80}
% (To avoid bibliography problems, for now we redefine the \cite command.)
%

\renewcommand{\cite}[1]{[#1]}

%
% These are just to make things a little easier:
%
\newcommand{\bi}{\begin{itemize}}
\newcommand{\ei}{\end{itemize}}
\newcommand{\be}{\begin{enumerate}}
\newcommand{\ee}{\end{enumerate}}
\newcommand{\blank}{\vspace{1ex}}   % generates a blank line in the output

%
% Use these for theorems, lemmas, proofs, etc.
%
\newtheorem{theorem}{Theorem}[chapter]
\newtheorem{lemma}[theorem]{Lemma}
\newtheorem{claim}[theorem]{Claim}
\newtheorem{corollary}[theorem]{Corollary}
\newcommand{\qed}{\hfill $\Box$}
\newenvironment{proof}{\par{\noindent \bf Proof:}}{\qed \par}
%\newenvironment{proof}{{\em Proof:}}{\hfill\rule{2mm}{2mm}}

%
% Use the following for definitions.
% \bigdef is for definitions to be set off by themselves; \smalldef is for
% definitions given in the middle of a paragraph.
%
\newenvironment{dfn}{{\vspace*{1ex} \noindent \bf Definition }}{\vspace*{1ex}}
\newcommand{\bigdef}[2]{\index{#1}\begin{dfn} {\rm #2} \end{dfn}}
\newcommand{\smalldef}[1]{\index{#1} {\em #1}}


% **** IF YOU WANT TO DEFINE ADDITIONAL MACROS FOR YOURSELF, PUT THEM HERE:

\begin{document}

\lecture{6}{1}{November 25}{Yishay Mansour}{Adi Akavia}{Sivan Sabato}{Moti Gindi}

% **** YOUR NOTES GO HERE

\topic{Finding the Optimal Policy: Value Iteration}{VI} In this
section we present the Value Iteration algorithm (also referred to
as VI) for computing an $\epsilon$-optimal policy\footnote{An
$\epsilon$-optimal
   policy is a policy with a return value $\epsilon$-close
   to the return value of the optimal policy.} for a discounted infinite horizon problem.\\
In Lecture 5 we showed that the Optimality Equations
   for discounted infinite horizon problems are:
\begin{eqnarray*}
v(s) = \max_{a \in A_{s}}\{r(s,a) + \lambda\sum_{j\in S}p(j\mid s,a)v(j)\}
\end{eqnarray*}
We also defined the non-linear operator $L$:
\begin{eqnarray*}
L\vec{v} = \max_{d\in \Pi^{MD}}\{\vec{r}_{d} + \lambda P_{d}\vec{v}\}
\end{eqnarray*}
For which it was shown, that for any starting point $\vec{v}_{0}$,
the series $\{\vec{v}_{n}\}$ defined by
$\vec{v}_{n+1}=L\vec{v}_{n}$, converges to the optimal return
value $\vec{v}_{\lambda}^{*}$. \\ The idea of VI is to use these
results to compute a solution of the Optimality Equations. The VI
algorithm finds a Markovian stationary policy that is
$\epsilon$-optimal. \subtopic{The Value Iteration Algorithm}{}
Input: $MDP$ and parameters $\epsilon$ and $\lambda$
\begin{enumerate}
\item Choose an initial return value function $\vec{v}_{0}$ (by choosing a number for each $s\in S$).
\item $n\leftarrow 0$.
\item \label{L6:viloop} Assign the next return value function:
\begin{eqnarray*}
\forall s\in S,\ v_{n+1}(s) \leftarrow \max_{a \in A_{s}}\{r(s,a)
+ \lambda\sum_{j\in S}p(j\mid s,a)v_{n}(j)\}
\end{eqnarray*}
\item $n\leftarrow n+1$
\item If $\|\vec{v}_{n+1}-\vec{v}_{n}\| < \epsilon \cdot \frac{1-
\lambda}{2\lambda}$, stop,\\ Else return to (\ref{L6:viloop}).
\item Choose the output policy such that:
\begin{eqnarray*}
\pi_{\epsilon}(s) \in argmax_{a \in A_{s}}\{r(s,a) + \lambda\sum_{j\in S}p(j\mid s,a)v_{n+1}(j)\}
\end{eqnarray*}
\end{enumerate}
\subtopic{Correctness of Value Iteration Algorithm}{}
In the following theorem we show that the algorithm finds an $\epsilon$-optimal policy in a
finite number of steps. Note that the selected policy might in fact be the optimal policy, but
we have no way of knowing that in advance.
\begin{theorem}\label{L6:correctvi}
For the series $\{\vec{v}_{n}\}$ and the policy $\pi_{\epsilon}$
computed by VI the following holds:
\begin{enumerate}
\item $\lim_{n\rightarrow \infty} v_{n} = v_{\lambda}^{*}$
\item $\exists N, \forall n>N, \|v_{n+1}-v_{n}\| < \epsilon \cdot \frac{1-\lambda}{2\lambda}$
\item The policy $\pi_{\epsilon}$ is $\epsilon$-optimal
\item If $\|v_{n+1}-v_{n}\| < \epsilon \cdot \frac{1-\lambda}{2\lambda}$ then $\|v_{n+1} -
v_{\lambda}^{*}\| < \frac{\epsilon}{2}$ \label{L6:bounddiff}
\end{enumerate}
\end{theorem}
\begin{proof}
Parts (1) and (2) follow directly from the properties of the
series $v_{n+1} = Lv_{n}$, that were shown in Lecture 5.\\ For
part (3) we assume that $\|v_{n+1}-v_{n}\| < \epsilon \cdot
\frac{1-\lambda}{2\lambda}$, as is the case when the algorithm
stops, and show that $\|v_{\lambda}^{\pi_{\epsilon}} -
v_{\lambda}^{*}\| < \epsilon$, which would make the policy
$\pi_{\epsilon}$ $\epsilon$-optimal.
\begin{eqnarray}
\|v_{\lambda}^{\pi_{\epsilon}} - v_{\lambda}^{*}\| < \|v_{\lambda}^{\pi_{\epsilon}} - v_{n+1}\| +
\|v_{n+1} - v_{\lambda}^{*}\| \label{L6:epopeq}
\end{eqnarray}
We now bound each part of the sum individually:
\begin{eqnarray*}
\|v_{\lambda}^{\pi_{\epsilon}} - v_{n+1}\| & = & \|L_{\pi_{\epsilon}}v_{\lambda}^{\pi_{\epsilon}}
- v_{n+1}\| \ \ \ \ \ (because\ v_{\lambda}^{\pi_{\epsilon}}\ is\ the\ fixed\ point\ of
L_{\pi_{\epsilon}})\\
& \leq & \|L_{\pi_{\epsilon}}v_{\lambda}^{\pi_{\epsilon}} - Lv_{n+1}\| + \|Lv_{n+1} - v_{n+1}\|
\end{eqnarray*}
Since $\pi_{\epsilon}$ is maximal over the actions using
$v_{n+1}$, it implies that $L_{\pi_{\epsilon}}v_{n+1} = Lv_{n+1}$
and we conclude that:
\begin{eqnarray*}
\|v_{\lambda}^{\pi_{\epsilon}} - v_{n+1}\| & \leq &
\|L_{\pi_{\epsilon}}v_{\lambda}^{\pi_{\epsilon}} -
L_{\pi_{\epsilon}}v_{n+1}\| + \|Lv_{n+1} - Lv_{n}\| \ \ \\ & \leq
&  \lambda\|v_{\lambda}^{\pi_{\epsilon}} - v_{n+1}\| +
\lambda\|v_{n+1} - v_{n}\|
\end{eqnarray*}
From the inequality it follows:
\begin{eqnarray*}
\|v_{\lambda}^{\pi_{\epsilon}} - v_{n+1}\| \leq \frac{\lambda}{1-\lambda}\|v_{n+1} - v_{n}\|
\end{eqnarray*}
For the second part of the sum we derive similarly that:
\begin{eqnarray*}
\|v_{n+1} - v_{\lambda}^{*}\| \leq \frac{\lambda}{1-\lambda}\|v_{n+1} - v_{n}\|
\end{eqnarray*}
From which part (4) of the theorem also follows. \\ Returning to
inequality \ref{L6:epopeq}, it follows:
\begin{eqnarray*}
\|v_{\lambda}^{\pi_{\epsilon}} - v_{\lambda}^{*}\| \leq \frac{2\lambda}{1-\lambda}\|v_{n+1} -
v_{n}\| < \epsilon
\end{eqnarray*}
Therefore the selected policy $\pi_{\epsilon}$ is
$\epsilon$-optimal.
\end{proof}
\subtopic{Convergence of Value Iteration Algorithm}{} In this
section we show that the convergence to the optimal policy of VI
algorithm is monotonic, and that the convergence is exponentially
fast in the parameter $\lambda$.
\begin{claim}
Monotonicity of convergence:
\begin{enumerate}
\item if $v \geq u$ then $Lv \geq Lu$
\item if $Lv_{n} \geq v_{n}$ then $\forall m \geq 0,\ v_{n+m+1} \geq v_{n+m}$
\end{enumerate}
\end{claim}
\begin{proof}
For part (1), let
\begin{eqnarray*}
\delta \in argmax_{\pi \in \Pi^{MD}}\{r_{\pi} + \lambda P_{\pi}u\}
\end{eqnarray*}
Since $P_{\delta} \geq 0$, it follows that $P_{\delta}v \geq P_{\delta}u$. Therefore:
\begin{eqnarray*}
Lu = r_{\delta} + \lambda P_{\delta}u \leq r_{\delta} + \lambda P_{\delta}v \leq \max_{d \in
\Pi^{MD}}\{r_{d} + \lambda P_{d}v\} = Lv
\end{eqnarray*}
This proves part (1). \\ From part (1) it follows by induction
that if $v \geq u$, then $\forall m \geq 1,\ L^{m}v \geq L^{m}u$.
\\ To prove part (2):
\begin{eqnarray*}
v_{n+m+1} = L^{m}Lv_{n} \geq L^{m}v_{n} = v_{n+m}
\end{eqnarray*}
\end{proof}
%\em{Conclusion:}%
From the above claim it follows that if $Lv_{0} \geq v_{0}$ (or $Lv_{0} \leq v_{0}$), VI will converge %%@
monotonically, which
means that running more iterations will always lead to a better policy. If all rewards in a
specific MDP are non-negative (or non-positive), we can choose $v_{0} = \vec{0}$, which assures %%@
the condition is
met.
\begin{theorem}
Let ${v_n}$ be the sequence of values computed by the VI
algorithm.
\begin{enumerate}
\item $\forall n, \|v_{n} - v_{\lambda}^{*}\| \leq \frac{\lambda^{n}}{1-\lambda}\|v_{1} -
v_{0}\|$
\item $\|v_{\lambda}^{\pi_{n}} - v_{\lambda}^{*}\| \leq \frac{2\lambda^{n}}{1-\lambda}\|v_{1} -
v_{0}\|$, where $\pi_{n}$ is the policy that is defined by $v_n$.
\end{enumerate}
\end{theorem}
\begin{proof}
Part (1): We will use the result of part (\ref{L6:bounddiff}) in theorem \ref{L6:correctvi}:
\begin{eqnarray*}
If\ \ \|v_{n}-v_{n-1}\| < \epsilon \cdot \frac{1-\lambda}{\lambda}\ \ then\ \ \|v_{n} -
v_{\lambda}^{*}\| < {\epsilon}
\end{eqnarray*}
To use it we bound:
\begin{eqnarray*}
\|v_{n}-v_{n-1}\|& = & \|L^{n-1}v_{1}-L^{n-1}v_{0}\| \\
& \leq& \lambda^{n-1}\|v_{1} - v_{0}\|\ \ (because\ L\ is\ contracting)
\end{eqnarray*}
Let $\epsilon = \frac{\lambda^{n}}{1-\lambda}\|v_{1} - v_{0}\|$,
so that we can use theorem \ref{L6:correctvi} to conclude that:
\begin{eqnarray*}
\|v_{n} - v_{\lambda}^{*}\| \leq \frac{\lambda^{n}}{1-\lambda}\|v_{1} - v_{0}\|
\end{eqnarray*}
Which proves part (1).\\
To prove part (2) we bound:
\begin{eqnarray*}
\|v_{\lambda}^{\pi_{n}} - v_{\lambda}^{*}\| & \leq & \|v_{\lambda}^{\pi_{n}} - v_{n}\| + \|v_{n}
- v_{\lambda}^{*}\| \\
\end{eqnarray*}
The following bounds derive (a) from the same result of
theorem \ref{L6:correctvi} and (b) from part (1) of this
theorem, respectively:
\begin{eqnarray*}
\|v_{\lambda}^{\pi_{n}} - v_{\lambda}^{*}\| & \leq & \overbrace{\frac{\lambda}{1-\lambda}\|v_{n}-v_{n-1}\|}^{(a)} + %%@
\overbrace{\frac{\lambda^{n}}{1-\lambda}\|v_{1} -
v_{0}\|}^{(b)} \\
& \leq & \frac{2\lambda^{n}}{1-\lambda}\|v_{1} - v_{0}\|
\end{eqnarray*}
The last inequality follows form L being a contracting operator, thus ending the proof of part
(2)
\end{proof}
\blankline From this theorem it follows that each iteration of the
VI algorithm is closer to $v_{\lambda}^{*}$ by a factor of
$\lambda$. As $\lambda$ approaches 1, the rate of convergence
decreases. The bounds we have shown can be used to determine the number of iterations needed for %%@
a specific problem.

% Example
% =======
% The example MDP figure
% ------------------------
\begin{figure}
\psfig{file=sample.eps ,width=6in,clip=}
\caption{Example Diagram}
\label{L6:VI-ExampleFIGURE}
\end{figure}
\subtopic{Example: Running Value Iteration Algorithm}{}
(Consider the MDP in figure \ref{L6:VI-ExampleFIGURE}
){}\\ Let:
\begin{eqnarray*}
\lambda& = &\frac{1}{2}\\
{v_{n+1}}({s_1})& = &MAX\{ 5 + \lambda[\frac{1}{2}{v_n}({s_1}) +
\frac{1}{2}{v_n}({s_2})] ,\ 10 + \lambda{v_n}({s_2}) \}  \\
{v_{n+1}}({s_2})& = &-1 + \lambda{v_n}({s_2})\\
\end{eqnarray*}
Step 1: We Initialize:
\begin{eqnarray*}
{v_0}({s_1})& = & {v_0}({s_2}) = -10 \\
\end{eqnarray*}
Steps 2-5:
\begin{eqnarray*}
 {v_1}({s_2})& = &-1 + 0.5(-10) = -6\\
 {v_1}({s_1})& = &MAX\{ 5 + 0.25(-10) +
0.25(-10),\ 10 + 0.5(-10)\} = 5\\
\\
 {v_2}({s_2})& = &-1 + 0.5(-6) = -4\\
 {v_2}({s_1})& = &MAX\{ 5 + 0.25\cdot5 + 0.25(-6),\ 10 + 0.5(-6)\} = 7\\
\\
 {v_3}({s_2})& = &-1 + 0.5(-4) = -3\\
 {v_3}({s_1})& = &MAX\{ 5 + 0.25\cdot7 +
0.25\cdot(-3),\ 10 + 0.5\cdot(-3)\} = 8.5\\
\end{eqnarray*}
Step 6:
\begin{eqnarray*}
\pi_{\epsilon}(s_{1}) & = a_{12}\\
\pi_{\epsilon}(s_{2}) & = a_{21}
\end{eqnarray*}
Note that the iterated value approaches $v^*_\lambda$, which is:
\begin{eqnarray*}
 v^*_\lambda({s_2})& = &-2\\
 v^*_\lambda({s_1})& = &9\\
\end{eqnarray*}
% Policy Iteration
% =================
\topic {Policy Iteration}{}
In this section we present the Policy Iteration algorithm
(also referred to as PI) for finding the
optimal policy in a discounted infinite horizon
problem. As opposed to the Value Iteration algorithm, the
output of PI is not an approximation of the optimal policy,
but the optimal policy itself.

% The Policy Iteration Algorithm
% ------------------------------

\subtopic {Policy Iteration Algorithm}{} Input: $MDP$, and
$\lambda$
\begin{enumerate}
\item Initialize: $d_0\in\Pi^{MD}$, $n \leftarrow 0$

\item {\em(policy evaluation)}\\
Find $v_n$ (the value of $d_n$) by solving the equations:
\begin{center}
$(I - \lambda{P_{d_n}})v = r_{d_n}$\\
\end{center}

\item {\em(policy improvement)}\\
Choose a greedy policy with respect to $v_n$:\\
Choose the next policy, $d_{n+1}$, s.t.:
\begin{center}
$d_{n+1}\in argmax_{d\in\Pi^{MD}}\{ r_d + \lambda{P_d}{v_{d_n}} \}$\\
\end{center}
Choose $d_{n+1} = d_n$ if possible.
\item If $d_{n+1} = d_n$ stop,\\
else $n \leftarrow n+1$, return to (2).
\end{enumerate}


% Convergence of The Policy Iteration Algorithm
% ---------------------------------------------
\subtopic {Convergence of Policy Iteration Algorithm}{} We shall
see that when the number of states - |S|, and the number of
actions - |A| are finite, there are no two
non-consecutive iterations with the same policy, unless we have an optimal policy.
Therefore $d_n$ converges to the optimal policy %%@
in a finite number of
steps. The key to the convergence of $d_n$ is the monotonicity of $\{v_n\}$.

% Monotonicity Claim of Vn
% ------------------------
\begin{claim}
\label{L6:MonotonicityClaim}
Let $v_n$, $v_{n+1}$ be the values of consecutive iterations of the above
algorithm, then $v_n \leq v_{n+1} \leq v_{\lambda}^*$.
\end{claim}

\begin{proof}
Let $d_{n+1}$ be the policy in the {\em policy improvement} step, then
\begin{eqnarray*}
r_{d_{n+1}} + \lambda{P_{d_{n+1}}}{v_n} &\geq& r_{d_n} + \lambda{P_{d_n}}{v_n}\\
&=& v_n\ \ \ \ \ \ (by\ definition\ of\ v_n)\\
\end{eqnarray*}
Therefore: $r_{d_{n+1}} \geq (I - \lambda{P_{d_{n+1}}}){v_n}$.
Multiplying by $(I - \lambda{P_{d_{n+1}}})^{-1}$ we get:
\begin{eqnarray*}
v_{n+1} = (I - \lambda{P_{d_{n+1}}})^{-1}r_{d_{n+1}} \geq v_n
\end{eqnarray*}
\end{proof}
\blankline
Note: The above claim is valid even when $S$ or $A$ are infinite.

% Convergence Rate of PI Theorem
% ------------------------------
\begin{theorem}
\label{L6:PI-ConvergenceRateTheorem} Let $S$ and $A$ be finite,
then the {\em Policy Iteration} algorithm converges to the optimal
policy after at most $|A|^{|S|}$ iterations.
\end{theorem}

\begin{proof}
Clearly, $|A|^{|S|} \geq |\Pi^{MD}|$. According to claim \ref{L6:MonotonicityClaim}, in each step $v_{n+1} \geq v_n$ except for the %%@
last step in which $v_{n+1} = v_n$. Therefore no policy $\pi \in
\Pi^{MD}$ can appear in two different iterations. Hence the number
of iterations $\leq |\Pi^{MD}| \leq |A|^{|S|}$
\end{proof}
% Example
% -------
\newpage
\begin{figure}\psfig{file=sample.eps ,width=6in ,clip=}
\caption{Example Diagram}\label{L6:PI-ExampleFIGURE}
\end{figure}
\subtopic{Example: Running Policy Iteration Algorithm}{} (Consider the MDP in figure
                                \ref{L6:PI-ExampleFIGURE}
){}\\ Let:
\begin{eqnarray*}
\lambda& = &0.95
\end{eqnarray*}
Step 1:
\begin{eqnarray*}
{d_0}({s_1})& = &a_{12}\\
{d_0}({s_2})& = &a_{21}
\end{eqnarray*}
Policy Evaluation:
\begin{eqnarray*}
 {v_0}({s_1})& = &10 + 0.95 v_0({s_2})\\
 {v_0}({s_2})& = &-1 + 0.95 v_0({s_2})\\
& \Downarrow &\\
{v_0}({s_1}) = -9 &, &\ {v_0}({s_2}) = -20
\end{eqnarray*}
Policy Improvement:
\begin{eqnarray*}
MAX\{ 5 + 0.475v_0({s_1}) + 0.475v_0({s_2}),\ 10 + 0.95v_0({s_2}) \}
= MAX\{ -0.8775,\ -9 \}
\end{eqnarray*}
Therefore:
\begin{eqnarray*}
 d_1({s_1})& = &a_{11}\\
 d_2({s_2})& = &a_{21}\\
\end{eqnarray*}
Policy Evaluation:
\begin{eqnarray*}
 {v_1}({s_1})& = &5 + 0.475 v_1({s_1}) + 0.475 v_1({s_2})\\
 {v_1}({s_2})& = &10 + 0.95 v_1({s_2})\\
&\Downarrow&\\
v_1({s_1}) = -8.571 & , & \ v_1({s_2}) = -20
\end{eqnarray*}
The next policy improvement step shows that $d_2 = d_1$, and
therefore the algorithm terminates and outputs $d_1$ as the
optimal policy. \topic{A Comparison between VI and PI
Algorithms}{} In this section we will compare the convergence rate
of the VI and PI algorithms. It will be shown that, assuming that
the two algorithms begin with the same approximated value, the PI
algorithm converges faster to the optimized value.
\begin{theorem}
\label{L6:PIVI-ComparisonTh} Let $\{u_{i}\}$ be the series of
values created by the VI algorithm (where $u_{n+1}=Lu_{n}$) and
let $\{v_{i}\}$ be the series of values created by PI algorithm.
If $u_{0}=v_{0}$, then $\forall n,\ u_{n}\leq v_{n}\leq
v^*_\lambda$.
\end{theorem}

\begin{proof} We will use induction to prove the theorem.
\\ {\em Induction Basis:} We assume that $u_0=v_0$. $v_0$ is the
return value of a specific policy, and therefore it is clearly
$\leq$ the optimal return value. Therefore: $u_{0} \leq v_{0} \leq
v^*_\lambda$.
\\ {\em Induction Step:}
\begin{eqnarray*}
u_{n+1} = Lu_{n} = L_{p_{n}}u_{n} \ \ \ \ [Where \ p_{n} \in
argmax_{d \in \Pi^{MD}}\{r_{d} + \lambda P_{d}u_{n}\}]
\end{eqnarray*}
From the induction hypothesis $u_n \leq v_n$, and since
$L_{p_{n}}$ is monotonic it follows that:
\begin{eqnarray*}
L_{p_{n}}u_{n} & \leq &L_{p_{n}}v_{n}
\end{eqnarray*}
Since L is taking the maximum over all policies:
\begin{eqnarray*}
L_{p_{n}}v_{n} & \leq & Lv_{n}
\end{eqnarray*}
We denote the policy determined by PI algorithm in iteration $n$
as $d_n$ and therefore:
\begin{eqnarray*}
Lv_{n}=L_{d_{n}}v_{n}
\end{eqnarray*}
From the Optimality Equations we get:
\begin{eqnarray*}
L_{d_{n}}v_{n} &\leq& L_{d_{n}}v_{\lambda}^{d_{n}}
\end{eqnarray*}
From the definition of $v_{n+1}$ we have:
\begin{eqnarray*}
L_{d_{n}}v_{\lambda}^{d_{n}} &=& v_{n+1}
\end{eqnarray*}
And we get $u_{n+1} \leq v_{n+1}$. In Theorem
\ref{L6:MonotonicityClaim} it was proven that $v_{n+1} \leq
v^*_\lambda$ and therefore $u_{n+1} \leq v_{n+1} \leq
v^*_\lambda$.
\end{proof}
\blankline From Theorem \ref{L6:PIVI-ComparisonTh} it follows
that, assuming the same starting point, PI algorithm requires less
stages then VI algorithm to converge to the optimal policy. Yet,
it should be noticed that each single stage of PI requires a
solution of a set of linear equations (the {\em policy evaluation}
stage) and therefore it is computationally more expensive than a
single stage of VI algorithm.
 \topic{Linear Programming}{} \subtopic
{Introduction to Linear Programming}{} In a general {\bf
linear-programming problem}, we wish to optimize a linear
function, subject to a set of linear inequalities. We are given an
$m \times n$ matrix $\mathbf{A}$, an $m$-vector $\vec{b}$, and an
$n$-vector $\vec{c}$. We wish to find a vector $\vec{x}$ of $n$
elements that maximizes the {\bf objective function}:
$\sum_{i=1}^{n}c_{i}x_{i}$\ subject to the $m$ constraints given
by A$\vec{x} \geq \vec{b}$, and $\forall i,\ x_i \geq 0$.
\\ This problem can be written formally as a {\bf linear-program} of the following structure
(also called the primal linear program):
\\{\em Minimize: $\vec{c}^{T}\vec{x}$
\\Subject To: $A\vec{x} \geq \vec{b}$ and $\vec{x} \geq
\vec{0}$} \\ \blankline Many problems can be expressed as linear
programs, and for this reason much work has gone into algorithms
for linear programming. Today we know some polynomial algorithms
that can be used to solve this problem, and there are many
software packages can solve it.
\\ \blankline Each minimization problem can easily be transformed
to a dual maximization problem of the form:
\\ {\em Maximize: $\vec{b}^{T}\vec{y}$
\\ Subject To: $A^{T}\vec{y} \leq \vec{c}$ and $\vec{y} \geq \vec{0}$}
\begin{theorem}
\label{L6:dual}(no proof)
$\vec{c}^{T}\widehat{\vec{x}}=\vec{b}^{T}\widehat{\vec{y}}$, where
$\widehat{\vec{x}}$ and $\widehat{\vec{y}}$ are the solutions of
a minimization problem and its dual maximization problem.
\end{theorem}
The above theorem suggests that we can either solve the primal or
the dual linear program. \subtopic{A Linear Programming Example}{}
Consider the translation of the following problem to a linear
programming problem:
\\ {\em A person undergoing a diet should take N types of
vitamins. One should take a quantity of $b_{i}$ from each vitamin
type. There exist M fruits. Each fruit j contains
$v_{j,1}...v_{j,N}$ vitamins of each type. Each fruit j costs
$p_{j}$. Assuming that it is possible to buy a fraction of a
fruit, what is the cheapest combination of fruits that should be
bought while keeping the diet constraints?} \\
\blankline
The following linear program describes this problem:
\\ {\em Minimize: $\sum_{j=1}^{M}x_{j}p_{j}$
\\ Subject To:
\begin{itemize}
\item $\forall i: \sum_{j=1}^{M}x_{j}v_{j,i} \geq b_{i}$
\item $\forall j: x_{j} \geq 0$
\end{itemize}}
Where $x_{j}$ would denote the quantity bought of fruit $j$.
\\ The solution of this linear program gives the lowest price needed
to achieve the diet constraints. Yet, this problem can be also
translated to a dual maximization linear program:
\\ {\em Maximize: $\sum_{i=1}^{N}y_{i}b_{i}$
\\ Subject To:
\begin{itemize}
\item $\forall j: \sum_{i=1}^{N}y_{i}v_{j,i} \leq p_{j}$
\item $\forall i: y_{i} \geq 0$
\end{itemize}}
The intuition behind this translation can be viewed by changing
the point of view about the problem. In this case we can assume
the following problem:
\\ {\em A vitamins company sells N types of vitamins. Each vitamin price is $y_{i}$.
It is known that every person is taking a quantity of $b_{i}$ from
each vitamin type. The person can either buy the vitamin directly
from the company or get it by eating fruits. There exist M fruits.
Each fruit j contains $v_{j,1}...v_{j,N}$ vitamins of each type.
Each fruit j costs $p_{j}$. The company does not want a
combination of vitamins to cost more than the fruit which contains
them, since then people will buy the fruit rather than the
vitamins. The aim is to maximize the return of the diet.}.
\\ The variable $y_{i}$ in the above program denotes the price of each vitamin type $i$.
The solution of this program will give the maximal income of the
company. According to Theorem \ref{L6:dual} the company's maximal income
will be equal exactly to buyer's minimal expense. \subtopic{Use of
Linear Programming to Find the Optimal Policy}{} The general
translation of the problem $x=max\{a_{1},..,a_{n}\}$ to a linear
program is as follows:
\\ {\em Minimize: $x$
\\ Subject To: $\forall i: x \geq a_{i}$} \\
\blankline
It was shown that the Optimality Equations of the discounted
infinite horizon problem are:
\begin{eqnarray*}
v(s) = \max_{a \in A_{s}}\{r(s,a) + \lambda\sum_{j\in S}p(j\mid
s,a)v(j)\}
\end{eqnarray*}
Using the above translation we would get the following linear
program:
\\ {\em Minimize: $\sum_{j \in S}\alpha(j)v(j)$
\\ Subject To:
\begin{itemize}
\item $\forall s \in S, \forall a \in A_{s}: v(s) \geq r(s,a) +
\lambda\sum_{j \in S}p(j\mid s,a)v(j)$
\end{itemize}}
$\alpha(j)$ is any set of constants with the following
characteristics:
\begin{itemize}
\item $\forall j: \alpha(j)>0$
\item $\sum_{j}\alpha(j)=1$
\end{itemize}
These constants can be viewed as the probability
distribution of the agent's initial state in the MDP. \\
\blankline
Accordingly, the dual linear program would be:
\\ {\em Maximize: $\sum_{s \in S}\sum_{a \in A_{s}}r(s,a) \chi(s,a)$
\\ Subject To:
\begin{itemize}
\item $\forall j \in S: \sum_{a \in A_{j}} \chi(s,a) = \alpha(j) +
\lambda \sum_{a \in A_{s}} \sum_{s \in S} p(j \mid s,a) \chi(s,a)$
\item $\chi(s,a) \geq 0$
\end{itemize}}
Intuitively, $\chi(s,a)$ can be thought of as the
probability of being in state $s$ and performing action $a$, while taking into account
the discount factor - $\lambda$. In the following theorem
$\chi(s,a)$ is defined more formally.
\begin{theorem}
\label{L6:maxth}(no proof)
\begin{enumerate}
\item For every $d \in \Pi^{MR}$, $s \in S$ and $a \in A_{s}$, let:
\begin{eqnarray*}
\chi_{d}(s,a) = \sum_{j\in S}\alpha(j)
\sum_{n=1}^{\infty}\lambda^{n-1} Prob[x_{n}=s\bigwedge y_{n}=a\mid
x_{1}=j]
\end{eqnarray*}
$\chi_{d}(s,a)$ is a feasible solution of the dual linear
maximization problem.
\item Let $\chi_{d}(s,a)$ be a feasible solution of the dual linear
problem. For every $s \in S$ and $a \in A_{s}$ define $d_{x}(s)$
such that:
\begin{eqnarray*}
Prob[d_{x}(s)=a]=\frac{\chi(s,a)}{\sum_{\overline{a} \in
A_{s}}\chi(s,\overline{a})}
\end{eqnarray*}
and then $\chi_{d_{x}}(s,a)=\chi(s,a).$
\end{enumerate}
\end{theorem}

Theorem \ref{L6:maxth} is assuring that every solution to the dual linear
program can be translated to a policy with a return value equal to
the maximized element in the program. Thus, the best solution of
the program can be translated to a policy with the maximal
possible return value. This policy will, obviously, be the best
policy.


\end{document}