

\documentstyle[12pt,psfig]{book}
\setlength{\evensidemargin}{.0 in} \addtolength{\topmargin}{-1in}
\setlength{\textwidth}{6.5in} \setlength{\textheight}{8in}

\newcommand{\remove}[1]{}

\newcommand{\Do}{{\small\bf do}\ }
\newcommand{\Return}{{\small\bf return\ }}
\newcommand{\Proc}[1]{#1\+}
\newcommand{\Returns}{{\small\bf returns}}
\newcommand{\Procbegin}{{\small\bf begin}}
\newcommand{\If}{{\small\bf if}\ \=\+}
\newcommand{\Then}{{\small\bf then}\ \=\+}
\newcommand{\Else}{\<{\small\bf else}\ \>}
\newcommand{\Elseif}{\<{\small\bf elseif}\ }
\newcommand{\Endif}{\<{\small\bf end\ if\ }\-\-}
\newcommand{\Endproc}[1]{{\small\bf end} #1\-}
\newcommand{\For}{{\small\bf for}\ \=\+}
\newcommand{\Endfor}{{\small\bf end\ for}\ \-}
\newcommand{\Loop}{{\bf loop}\ \= \+}
\newcommand{\Endloop}{{\bf end\ loop}\ \-}
\newenvironment{program}{
    \begin{minipage}{\textwidth}
    \begin{tabbing}
    \ \ \ \ \=\kill
  }{
    \end{tabbing}
    \end{minipage}
  }

\def\blankline{\hbox{}}


\newcommand{\lecture}[7]{
   \pagestyle{headings}
   \thispagestyle{plain}
   \newpage
   \setcounter{chapter}{#1}
   \setcounter{page}{#2}
%  \set\thechapter{#3}
   \noindent
   \begin{center}
   \framebox{
      \vbox{
    \hbox to 6.28in { {\bf Computational  Learning Theory
                       \hfill Fall Semester, 1999/00} }
       \vspace{4mm}
       \hbox to 6.28in { {\Large \hfill Lecture #1: #3  \hfill} }
       \vspace{2mm}
       \hbox to 6.28in { {\it Lecturer: #4 \hfill Scribe: #5, #6, #7} }
      }
   }
   \end{center}
   \markboth{Lecture #1: #3}{Lecture #1: #3}
   \vspace*{4mm}
}

\newcommand{\topic}[2]{\section{#1} \index{#2} \markright{#1}}
\newcommand{\subtopic}[2]{\subsection{#1} \index{#2}}
\newcommand{\subsubtopic}[2]{\subsubsection{#1} \index{#2}}

\renewcommand{\cite}[1]{[#1]}

\newcommand{\bi}{\begin{itemize}}
\newcommand{\ei}{\end{itemize}}
\newcommand{\be}{\begin{enumerate}}
\newcommand{\ee}{\end{enumerate}}
\newcommand{\blank}{\vspace{1ex}}   % generates a blank line in the output

\newtheorem{theorem}{Theorem}[chapter]
\newtheorem{lemma}[theorem]{Lemma}
\newtheorem{claim}[theorem]{Claim}
\newtheorem{corollary}[theorem]{Corollary}
\newcommand{\qed}{\hfill $\Box$}
\newenvironment{proof}{\par{\bf Proof:}}{\qed \par}

\newenvironment{dfn}{{\vspace*{1ex} \noindent \bf Definition }}{\vspace*{1ex}}
\newcommand{\bigdef}[2]{\index{#1}\begin{dfn} {\rm #2} \end{dfn}}
\newcommand{\smalldef}[1]{\index{#1} {\em #1}}

% **** YOUR NOTES GO HERE ****************************************************

\begin{document}
\lecture{5}{1}{November 18}{Yishay Mansour}{Elhanan Borenstein}
{Keren Saggie} {Yossi Mossel}
\begin{flushleft}
\topic{Introduction: Discounted Infinite Horizon}{}\subtopic
{Notation}{}
\begin{itemize}
\item$\vec{v}:S\Rightarrow R$
\item$\|\vec{v}\|_{\infty}=\max_{s \in S}\{|\vec{v}(s)|\}$
\item$H:S\times S\Rightarrow R$
\item$\|H\|=\max_{s \in S}\sum_{j\in S}\|H_(s,j)\|$ (If H is
a probabilities matrix, $\|H\|=1$)

%\end{itemize}
\item\subsubtopic{Probabilities Transition Matrix}{1} Usually the topic of discussion
will be a series of t steps, and the object of inquiry will be the
probability of making a transition from state $s$ to state $j$
after $t$ steps.\\The transition matrix after $t$ steps, starting
from state s, using policy $\pi$ is:\\ \ \\
$P_{\pi}^{t}(j|s)=[P_{d_{t}}\ldots P_{d_{2}}\cdot
P_{d_{1}}](j|s)=Prob_{\pi}(X_{t+1}=j|x_{1}=s)$\\
\item\subsubtopic{Expectation of Reward}{2}The expected value function after t steps, starting from
state s, using policy $\pi$ is:\\ \ \\
$E_{s}^{\pi}[\vec{v}(X_{t})]=[P_{\pi}^{t-1}\cdot\vec{v}](s)$\\
\item\subsubtopic{The discounted value
of policy ${\pi}$}\\
$\vec{v}_{\lambda}^{\pi}=\sum_{t=1}^{\infty}\lambda^{t-1}P_{\pi}^{t-1}r_{d_{t}}$\\
\ \\ (where for deterministic policies,
$r_{d_{t}}(s)=r(s,d_{t}(s))$ is the immediate reward for
transition from s to $d_t(s)$)\\
%$r_{d}=\left[\begin{array}{c}r(s_0)\\r(s_1)\\\vdots\\r(s_i)\\\vdots\end{array}\right]$
\end{itemize}
\pagebreak
\begin{theorem}
\label{L1:InverseTheorem} Let $Q$ be a matrix such that $\|Q\|<1$,
then
\begin{enumerate}
\item There exists $(I-Q)^{-1}$
\item $(I-Q)^{-1} = \lim_{N \rightarrow \infty}\sum_{i=0}^NQ^i$
\end{enumerate}
(The proof can be found in Puterman's book)
\end{theorem}

\subtopic {Assumptions}{} In this section we make the following
simplifying assumptions.\\
\begin{enumerate}
\item The immediate reward and the transition probability are stationary. Hence the functions $r(s,a)$ and $p(j|s,a)$ are identical for any time stop. One benefit is that the algorithm can have a finite input.
\item The immediate reward is bounded: $|r(s,a)|<M$.
\item The discounted parameter is $0 \leq\lambda<1$
\item The number of states and actions is finite.\\
\end{enumerate}

\topic {Calculating the Return Value of a Given Policy}{}
According to Theorem $4.3$ from the previous lecture, for each
stochastic history dependent policy $ \pi=(d_1, d_2, ...) \in
\Pi^{HR}$ there exists a Markovian stochastic policy
$\pi^{'}=(d_1^{'}, d_2^{'}, ...) \in \Pi^{MR}$ that has the same
return, i.e., $ v_\lambda^\pi=v_\lambda^{\pi^{'}}$.\\ \
\\ Let $\pi\in\pi^{MR}$, then
\begin{eqnarray*}
v_\lambda^\pi(s) & = &
E_s^{\pi}[\sum_{t=1}^\infty\lambda^{t-1}r(X_t, Y_t)] =
\sum_{t=1}^{\infty}\lambda^{t-1}P_{\pi}^{t-1}r_{d_{t}}
\end{eqnarray*}

\begin{eqnarray*}
\vec{v}_\lambda^\pi & = & \vec{r}_{d_{1}} + \lambda
P_{d_{1}}[\underbrace{\vec{r}_{d_{2}}+\lambda
P_{d_{2}}\vec{r}_{d_{3}}+\ldots}_{v_\lambda^{\pi^{'}}}]
\end{eqnarray*}

(where $\pi^{'}$ is similar to policy $\pi$ starting from the
second step)\\

\begin{eqnarray*}
\vec{v}_{\lambda}^{\pi} = \vec{r}_{d_{1}}+\lambda
P_{d_{1}}\vec{v}_{\lambda}^{\pi^{'}}
\end{eqnarray*}
If $\pi$ is stationary then $\pi^{'}=\pi$ and

\begin{eqnarray*}
\vec{v}_{\lambda}^{\pi} = \vec{r}_{d_{1}}+\lambda
P_{d_{1}}\vec{v}_{\lambda}^{\pi}
\end{eqnarray*}

All the parameters aside from $\vec{v}_\lambda^\pi$ are known,
thus we have a set of linear equations of the form
$\vec{x}=r_{d{1}}+\lambda P_{d_{1}}\vec{x}$. We will show that
these equations have a single solution which is
$\vec{v}_{\lambda}^{\pi}$.\\

\subtopic {Existence of a unique solution}{} We define a linear
transformation $L_{d}$: $L_d\vec{v}=\vec{r}_d+\lambda
P_d\vec{v}$.\\Since
$\vec{v}_{\lambda}^{\pi}=L_d\vec{v}_\lambda^{\pi}$,
$\vec{v}_{\lambda}^{\pi}$ is a fixed point of $L_d$.

\begin{theorem}
\label{L2:SingleSolutionTheorem} For $0 \leq \lambda < 1$ and
$\pi$ a Markovian Stationary policy,\\ $\vec{v}_{\lambda}^{\pi}$
is the unique solution for the equation set
\begin{eqnarray*}
\vec{v}=\vec{r}_d+\lambda p_d\vec{v}
\end{eqnarray*}
and is equal to
\begin{eqnarray*}
\vec{v}_{\lambda}^{\pi}=(I-\lambda P_d)^{-1}\vec{r}_d
\end{eqnarray*}
\end{theorem}
\begin{proof}
We can write the equation set as
\begin{eqnarray*}
\vec{v}(I-\lambda P_d) = \vec{r}_d
\end{eqnarray*}
Since $P_d$ is a probability matrix, $\|P_d\|=1$, and as $\lambda
< 1$, $\|\lambda P_d\| < 1$.\\ \ \\ According to Theorem
\ref{L1:InverseTheorem}, $(I-\lambda P_d)^{-1}$ exists. Thus, a
solution $\vec{v}=(I-\lambda P_d)^{-1}\vec{r}_d$ exists.\\ \ \\

By the same theorem,
\begin{eqnarray*}
\vec{v}=(I-\lambda P_d)^{-1}\vec{r}_d=\sum_{i=0}^{\infty}(\lambda
P_d)^{i}\vec{r}_d=\sum_{i=0}^{\infty}\lambda^{i}P_d^{i}\vec{r}_d=\sum_{t=1}^{\infty}\lambda^{t-1}P_d^{t-1}\vec{r}_d=\vec{v}_\lambda^{\pi}\
.
\end{eqnarray*}
We have shown that the solution is the discounted return value of
policy $\pi$
\end{proof}

\begin{figure}\label{L9:FIGURE} \psfig{file=sample.eps ,width=6in ,clip=}
\caption{Example Diagram}
\end{figure}
\pagebreak\subtopic {Example:}{} (Consider the MDP in figure
5.1){}\\ \ \\ For a policy $\pi$, which picks ${a_{11}}$ in
${S_1}$ and ${a_{21}}$ in ${S_2}$ we compute the following values:
\begin{eqnarray*}
V({S_1})& = & 5 + \lambda[\frac{1}{2}V({S_1}) +
\frac{1}{2}V({S_2})]\\ V({S_2}) & = & -1 + \lambda[1\cdot
V({S_2})]
\end{eqnarray*}

Or in matrix notation:\\ \ \\$\vec{v} =
\left(\begin{array}{c}5\\-1\end{array}\right) +
\lambda\left(\begin{array}{cc}
 \frac{1}{2} & \frac{1}{2} \\
  0 & 1
\end{array}\right)\vec{v}$\\ \ \\

Solutions are,\\
\begin{eqnarray*}
 V({S_2}) = - \frac{1}{1-\lambda}\\
 V({S_1}) =
 \frac{5-\frac{\frac{1}{2}}{1-\lambda}}{1-\frac{\lambda}{2}}
\end{eqnarray*}
\\
\subtopic {Properties of the transition matrix:}{}
 We show that the matrix $(I - \lambda P_{d})^{-1}$ is order conserving.\\
\begin{lemma}
The following holds for a probability matrix $P$ and $0 \leq \
\lambda < 1$:
\begin{enumerate}
\item If $\|\vec{u}\| \ \geq \ 0$  then $\|(I-\lambda
P)^{-1}\vec{u}\| \ \geq \ \|\vec{u}\| \ \geq \ 0$
\item If $\|\vec{u}\| \ \geq \ \|\vec{v}\|$  then $\|(I-\lambda
P)^{-1}\vec{u}\| \ \geq \ \|(I-\lambda P)^{-1}\vec{v}\|$
\item  If $\|\vec{u}\| \ \geq \ 0$  then $\|\vec{u}^{T}(I-\lambda P)^{-1}\|\
\geq\ \|\vec{u}^{T}\| \geq 0$
\end{enumerate}
\end{lemma}
\begin{proof}
Since $\|P\|= 1$ then $\|\lambda P\|\leq 1$. By theorem
\ref{L1:InverseTheorem}
\begin{eqnarray*}
(I - \lambda P_{d})^{-1}\vec{u} = \vec{u} + \underbrace{(\lambda
P)\vec{u} + (\lambda P)^{2}\vec{u} + \ldots}_{(sum\ of\ positive\
vectors )} \geq \vec{u} \geq 0
\end{eqnarray*}
\end{proof}
\topic{Computing the Optimal Policy}{}Having shown how to evaluate
a given policy, we now turn to show how to find an optimal
policy.\subtopic{Optimality Equations}\\

For a finite horizon we gave the following equations:\\
\begin{eqnarray*}
v_{n}(s) = \max_{a \in A_{s}}\{r(s,a) + \sum_{j\in S}p(j\mid
s,a)v_{n+1}(j)\}
\end{eqnarray*}
For a discounted infinite horizon we have similar equations. We
will show that the Optimality equations are:
\begin{eqnarray*}
v(s) = \max_{a \in A_{s}}\{r(s,a) + \lambda\sum_{j\in S}p(j\mid
s,a)v_(j)\}
\end{eqnarray*}
First we show that maximizing over deterministic and stochastic
policies yield the same value.
\begin{theorem}
\label{L3:MD=MR}
 For all $\vec{v}$ and $1 > \lambda \geq 0$
 \begin{eqnarray*}
\max_{d\in \Pi^{MD}}\{r_{d} + \lambda P_{d}\vec{v}\} =
\max_{d\in\Pi^{MR}}\{r_{d} + \lambda P_{d}\vec{v}\}
\end{eqnarray*}
\end{theorem}

\begin{proof}\\
Since $\Pi^{MD} \subseteq \Pi^{MR}$, the right side of the
equality is at least as large as the left.\\ Now we show that the
left side is at least as large as the right.\\ For $\pi \in
\Pi^{MR}$ and $v$ we define
\begin{eqnarray*}
\forall s\in S \ \ \ \   w_{s}(a) = r(s,a) + \lambda\sum_{j\in S}
p(j\mid s,a)v(j)
\end{eqnarray*}
Fix a state $s\in S$. The value of $\pi$ is a weighted average of
$w_{s}(a)$, and we have
\begin{eqnarray*}
\max_{a\in A_{s}}\{w_{s}(a) \} \geq \sum_{a\in
A_{s}}q_{\pi}(a)w_{s}(a)\ .
\end{eqnarray*}
Hence
\begin{eqnarray*}
\max_{d\in \Pi_{MD}}\{r_{d} + \lambda P_{d}\vec{v} \} \geq r_{\pi}
+ \lambda P_{\pi}\vec{v}\ ,
\end{eqnarray*}
For any $\pi \in \Pi_{MR}$.\\ This shows that the left hand side
in the theorem is at least as large as the right hand side, which
completes the proof.
\end{proof}
\ \\
%\pagebreak
Let us define the non-linear operator $L$:\\
\begin{eqnarray*}
L\vec{v} = \max_{d\in \Pi^{MD}}\{\vec{r}_{d} + \lambda
P_{d}\vec{v}\}
\end{eqnarray*}

Therefore we can state the optimality equation by
\begin{eqnarray*}
v = \max_{d\in \Pi^{MD}}\{\vec{r}_{d} + \lambda P_{d}\vec{v}\} =
L\vec{v}
\end{eqnarray*}


It still remains to be shown that these indeed are optimality
equations, and that the fixed point of the operator is the optimal
return value.


\begin{theorem}
Let $v_{\lambda}^{*}$ be the optimal return value with parameter
$1 > \lambda \geq 0$
\begin{enumerate}
\item If $v \geq Lv$ then $v \geq v_{\lambda}^{*}$
\item If $v \leq Lv$ then $v \leq v_{\lambda}^{*}$
\item If $Lv = v$ then $v = v_{\lambda}^{*}$
\end{enumerate}
\end{theorem}

\begin{proof}
We start by proving (1)
\begin{eqnarray*}
v & \geq & \max_{d\in \Pi^{MD}}\{r_{d} + \lambda P_{d}v\} \ \ \ \
(given)\\ & = & \max_{d\in \Pi^{MR}}\{r_{d} + \lambda P_{d}v\} \ \
\ \ (by\ theorem\ \ref{L3:MD=MR})
\end{eqnarray*}
this implies that for any policy $d$,
\begin{eqnarray*}
v & \geq &r_{d} + \lambda P_{d}v\\ & \geq & r_{d} + \lambda
P_{d}r_{d} + (\lambda P_{d})^{2}v \\ & \geq & \sum_{i=0}^{n-1}
(\lambda P_{d})^{i}r_{d}  +  (\lambda P_{d})^{n}v \ \ \ (where\
[P^{0} = I])
\end{eqnarray*}
For a given policy $d$
\begin{eqnarray*}
v_{\lambda}^{d} = \sum_{i=0}^{\infty}(\lambda P_{d})^{i}r_{d}
\end{eqnarray*}
By subtraction
\begin{eqnarray*}
v - v_{\lambda}^{d} \geq (\lambda P_{d})^{n}v -
\sum_{i=n}^{\infty}(\lambda P_{d})^{i}r_{d}
\end{eqnarray*}
Since $\lambda^{n}\|v\| \geq \|\lambda^{n}P_{d}^{n}v\|$ and
$\lambda < 1$, we have that for any $\epsilon
> 0$ there exists an $N > 0$, such that for all $n > N$ we have $\|\lambda^{n}P_{d}^{n}v\| \leq
\frac{\epsilon}{2}$\\
\
\\Since $|\vec{r}_d| < M$ we can write:
\begin{eqnarray*}
-\frac{\lambda^{n}M}{1-\lambda}\cdot\vec{1} \leq
\sum_{k=n}^{\infty}\lambda^{k}P_{d}^{k}r_{d} \leq
\frac{\lambda^{n}M}{1-\lambda}\cdot\vec{1}
\end{eqnarray*}
For a large enough $n$ we have
\begin{eqnarray*}
\|\sum_{k=n}^{\infty}\lambda^{k}P_{d}^{k}r_{d}\| \leq
\frac{\epsilon}{2}\ .
\end{eqnarray*}
By this we derive that
\begin{eqnarray*}
\forall s\in S \ \ \   v(s) \geq v_{\lambda}^{d}(s) - \epsilon
\end{eqnarray*}
and for all $d$
\begin{eqnarray*}
v \geq v_{\lambda}^{d} - \epsilon
\end{eqnarray*}
Thus
\begin{eqnarray*}
v \geq \max_{d\in \Pi^{MR}} \{ v_{\lambda}^{d}\} - \epsilon  =
\max_{d\in \Pi^{MD}} \{ v_{\lambda}^{d}\} - \epsilon  =
v_{\lambda}^{*} - \epsilon
\end{eqnarray*}
As this is true $\forall \epsilon > 0$
\begin{eqnarray*}
v \geq v_{\lambda}^{*}
\end{eqnarray*}
(If we assume that there is a state $s$ such that $v(s) <
v_{\lambda}^{*}(s)$ we pick $\epsilon = \frac{v_{\lambda}^{*} -
v(s)}{2}$ , and reach a contradiction)\\ We now prove (2)\\Since
$v \leq Lv$ there exists a policy $d$ such that
\begin{eqnarray*}
v \leq  r_{d} + \lambda P_{d}v
\end{eqnarray*}
By theorem \ref{L2:SingleSolutionTheorem}
\begin{eqnarray*}
v \leq  (I - \lambda P_{d})^{-1}r_{d} = v_{\lambda}^{d}
\end{eqnarray*}
Hence
\begin{eqnarray*}
v \leq \max_{d} \{ v_{\lambda}^{d}\}
\end{eqnarray*}
Part (3) follows immediately from parts (1) and (2).
\end{proof}







\subtopic {The Solution of the Optimality Equations:}{} Operator
$T$ is called contracting if there exists $0 \leq \lambda < 1$
such that
\begin{eqnarray*}
\|T\vec{u}-T\vec{v}\| \leq \lambda \|\vec{u}-\vec{v}\| & \forall
\vec{u}, \vec{v} \in R^{n}
\end{eqnarray*}

\begin{theorem}\label{L4:CONTRACTING}
Let $T: R^n\rightarrow R^n$ a contracting operator, then
\begin{enumerate}
\item there exists a unique  $  \vec{v^*}$ such that
$T\vec{v}^*=\vec{v}^*$
\item For each starting point $\vec{v}_0$ the series
$\vec{v}_{n+1}=T\vec{v}_n$ converges to $\vec{v}^*$
\end{enumerate}
\end{theorem}
\begin{proof}
We define $\vec{v}_{n+1}=T\vec{v}_n$ \subsubtopic{Existence of a
limit $\vec{v^*}$}{}
\begin{eqnarray*}
\|\vec{v}_{n+m}-\vec{v}_n\| & = &
\|\sum_{k=0}^{n-1}\vec{v}_{n+k+1}-\vec{v}_{n+k}\|\\ & \leq &
\sum_{k=0}^{m-1}\|\vec{v}_{n+k+1}-\vec{v}_{n+k}\| \ \  (according\
to\ the\ triangle\ inequality)\\& = &
\sum_{k=0}^{m-1}\|T^{n+k}\vec{v}_1-T^{n+k}\vec{v}_0\|\\ & \leq &
\sum_{k=0}^{m-1}\lambda^{n+k}\|\vec{v}_1-\vec{v}_0\|\ \ \ \
(contraction\ n+k\ times)\\& = &
\frac{\lambda^n(1-\lambda^m)}{1-\lambda}\|\vec{v}_1-\vec{v}_0\|
\end{eqnarray*}
Since the coefficient decreases as $n$ increases, $\forall
\epsilon
> 0\ \  \exists N
> 0$  such that $\forall n \geq N, \|\vec{v}_{n+m}-\vec{v}_n\| <
\epsilon$\\ Thus the series $\vec{v}_n$ has a limit.\\ \ \\ Let us
call this limit $\vec{v^*}$ and show that $\vec{v^*}$ is a fixed
point of the operator $T$. \subsubtopic{$\vec{v}^{\ *}$ is a fixed
point}{}
\begin{eqnarray*}
0 & \leq & \|T\vec{v^*}-\vec{v}^{\ *}\|\\ & \leq &
\|T\vec{v^*}-\vec{v}_n\| \ \ + \ \ \|\vec{v}_n-\vec{v^*}\|\
(according\ to\ the \ triangle\ inequality)\\& = &
\|T\vec{v^*}-T\vec{v}_{n-1}\| \ \  + \ \
\|\vec{v}_n-\vec{v^*}\|\\& \leq &
\lambda\|\underbrace{\vec{v^*}-\vec{v}_{n-1}}_{\rightarrow 0 }\| \
\ + \ \ \|\underbrace{\vec{v}^{n}-\vec{v^*}}_{\rightarrow 0}\|
\end{eqnarray*}
Since $\vec{v^*}$ is a limit of $\vec{v}_n$,
\begin{eqnarray*}
\lim_{n\rightarrow\infty}\|\vec{v}_n-\vec{v^*}\| = 0
\end{eqnarray*}
hence
\begin{eqnarray*}
\|T\vec{v^*}-\vec{v^*}\| = 0
\end{eqnarray*}
thus $\vec{v^*}$ is a fixed point of the operator $L$.
\subsubtopic{Uniqueness of $\vec{v^*}$}{} If
\begin{eqnarray*}
T\vec{v}_1 = \vec{v}_1,\  T\vec{v}_2 = \vec{v}_2,\  and\
\vec{v}_1 \neq \vec{v}_2
\end{eqnarray*}
then
\begin{eqnarray*}
\|T\vec{v}_1-T\vec{v}_2\| = \|\vec{v}_1-\vec{v}_2\| \leq
\lambda\|\vec{v}_1-\vec{v}_2\|
\end{eqnarray*}
Hence $\lambda > 1$ in contradiction to the premises.
\\ Thus $\vec{v^*}$ is unique.
\end{proof}
 \ \\ \ \\

Next we will show that the operator $L$ is a contracting
operator.\\
\begin{claim}
$\forall\  0 \leq \lambda < 1$, $L$ is a contracting operator.
\end{claim}
\begin{proof}
For all $\vec{u}$, $\vec{v}$, we choose $s \in S$, and assume
$L\vec{v}(s) \geq L\vec{u}(s)$.\\ We define $a_s^{*}$ such that:
\begin{eqnarray*}
a_s^{*} \in argmax_{a \in A_s}\{r(s,a)+\lambda\sum_{j \in
S}P(j|s,a)\vec{v}(j)\}
\end{eqnarray*}
\begin{eqnarray*}
0 & \leq & L\vec{v}(s)-L\vec{u}(s)\\& \leq &
\underbrace{r(s,a_{s}^{*})+\lambda\sum_{j \in
S}P(j|s,a)\vec{v}(j)}_{L\vec{v}(s)}-\underbrace{r(s,a_{s}^{*})-\lambda\sum_{j
\in S}P(j|s,a)\vec{u}(j)}_{not\ neccessarily\ the\ optimal\
action\ for\ u}\\ & = & \lambda\sum_{j \in
S}P(j|s,a)[\vec{v}(j)-\vec{u}(j)]\\ & \leq & \lambda \sum_{j \in
S}P(j|s,a)\|\vec{v}-\vec{u}\|\\ & = & \lambda\|\vec{v}-\vec{u}\|
\end{eqnarray*}
We have shown that
\begin{eqnarray*}
L\vec{v}(s)-L\vec{u}(s) \leq \lambda \|\vec{v}-\vec{u}\|
\end{eqnarray*}
(The same proof holds for $L\vec{v}(s) \leq L\vec{u}(s)$)\\ \
\\

Thus, for \em{all} $s \in S$,
\begin{eqnarray*}
\|L\vec{v}-L\vec{u}\| \leq \lambda \|\vec{v}-\vec{u}\|
\end{eqnarray*}
Hence $L$ is a contracting operator.
\end{proof}


\begin{theorem}
Let $0 \leq \lambda < 1$ and  $S$ a finite set, then
\begin{enumerate}
\item There exists a unique solution $v^{*}$ such that $Lv^{*}$ and
$v_{\lambda}^{*} = v^{*}$
\item For all $\pi\in \Pi^{MR}$ there exists a unique $v$ such that
$L_{\pi}v = v$ and $v_{\lambda}^{\pi} = v$
\end{enumerate}
\end{theorem}
\begin{proof}
\begin{enumerate}
\item As $L$ has been shown to be a contracting operator there is a
unique solution for the equation $Lv = v$ by theorem
\ref{L4:CONTRACTING}.  This fixed point is $v_{\lambda}^{*}$.
\item Is true by the same argument.
\end{enumerate}
\end{proof}
\subtopic {Example:}{} Using the same example we calculate the
optimal return value to be:
\begin{eqnarray*}
V(S_{1})& = & max\{5 + \lambda[\frac{1}{2}V(S_{1}) +
\frac{1}{2}V(S_{2})], 10 + \lambda V(S_{2})\} \\V(S_{2})& = &-1 +
\lambda V(S_{2})
\end{eqnarray*}
Thus
\begin{eqnarray*}
V(S_{2}) & = & -\frac{1}{1 - \lambda}\\ V(S_{1})& = & max\{5 +
\lambda[\frac{1}{2}V(S_{1}) - \frac{1}{2}\frac{1}{1 - \lambda}],\
10 - \lambda \frac{1}{1 - \lambda}\}
\end{eqnarray*}
If we examine different values of $\lambda$ we get different
optimal actions in $S_{1}$.\\ For example:
\begin{eqnarray*}
 \lambda = 0  \ &:&\  V(S_{1})^{*} = 10 \ \ \      V(S_{2})^{*}
= -1\\  \lambda = \frac{1}{2}  \ &:&\ V(S_{1})^{*} = 9 \ \ \
V(S_{2})^{*} = -2\\ \lambda = \frac{9}{10} \  &:&\  V(S_{1})^{*} =
1 \ \ \ V(S_{2})^{*} = -10
\end{eqnarray*}
Note that as $\lambda$ increases the optimal policy at $S_{1}$
changes from $a_{12}$ to $a_{11}$.



\end{flushleft}
\end{document}


