
%%%%%%%%%%%% Subject: template for lecture notes %%%%%%%%%%%%%%%%%%%%
%
% Thank you for the hitnadvut to write up notes on today's lecture.
%
% Here is the template file for writing up notes for this course.
% If you scribe on a Tuesday we ask that you please bring your
% notes by the following Tuesday. 
%
% To help you with your notes, I can let you copy the notes used in 
% giving the lecture. Please ask
% for help in understanding the material or if you have a question
% about how it should be presented.
%
% In order to make the notes more uniform and easier to read, we are providing
% several macros for theorems and such.  (Some examples of their use are
% included below.) Also, we ask that you follow a few stylistic conventions:
%
% Please write your notes as if you were explaining the material to another
% student, rather than as minutes of a meeting. (That is, don't write
% things like ``Today we started discussing...'' or ``next time,
% we will...'' or ``then, someone asked the question...''
% Just explain the material as clearly as you can.
%
% If you need to refer to an earlier lecture, use the lecture number.
% For example,  ``In Lecture 6 it was shown that...''
% To refer to a later lecture for which you don't know the number,
% just say something like,
% ``This impossibility result will be discussed further in a later
% lecture.'' (At the end of the course, I will go back and fill in the
% appropriate lecture numbers.)
%
% If material from a handout is covered in lecture, then we would like to
% include the text of the handout as a part of the notes to make them more
% self-contained.  To make your job easier, I will mail you the latex
% files for the relevent handouts, and you can merge them in at the
% appropriate places.  Also, you may want to annotate the handout as you
% see fit.  If you happen to take notes on a day that a homework is assigned,
% I will also mail you the exercises, which you can put at the end of your
% notes.
%
% Figures should be done in Latex, if at all possible.  You may have available
% on your system a program called ``texdraw.''  The program lets to draw
% figures with the mouse and then generates a latex file, which you can then
% insert into your text.  If you don't have this program, usually writing the
% latex commands yourself isn't that hard.  For very complicated figures, you
% may wish to leave the appropriate space in the text and draw the figures
% neatly (each on a separate page) in black ink.  For the first draft you
% hand in, you may not want to invest a lot of time drawing the figures very
% neatly, because there's a chance that we will want you to modify
% them.  Save that effort for the draft to be handed out.
%
% Please let me know if you have any questions.
%
%
\documentstyle[12pt]{book}
%\setlength{\oddsidemargin}{-.1 in}
%\setlength{\evensidemargin}{-.3 in}
\setlength{\evensidemargin}{.0 in}
\addtolength{\topmargin}{-1in}
\setlength{\textwidth}{6.5in}
\setlength{\textheight}{8in}

%
% this command enables to remove a whole part of the text 
% from the printout
% to use it just enter
% \remove{  
% before the text to be excluded and
% } 
% after the text
\newcommand{\remove}[1]{}

%
% The following macros are used to generate nice code for programs.
% See example on how to use it below
%

%%%%%%%%%%%%%%%%%%%%% program macros %%%%%%%%%%%%%%%%%

\newcommand{\Do}{{\small\bf do}\ }
\newcommand{\Return}{{\small\bf return\ }}
\newcommand{\Proc}[1]{#1\+}
\newcommand{\Returns}{{\small\bf returns}}
\newcommand{\Procbegin}{{\small\bf begin}}
\newcommand{\If}{{\small\bf if}\ \=\+}
\newcommand{\Then}{{\small\bf then}\ \=\+}
\newcommand{\Else}{\<{\small\bf else}\ \>}
\newcommand{\Elseif}{\<{\small\bf elseif}\ }
\newcommand{\Endif}{\<{\small\bf end\ if\ }\-\-}
\newcommand{\Endproc}[1]{{\small\bf end} #1\-}
\newcommand{\For}{{\small\bf for}\ \=\+}
\newcommand{\Endfor}{{\small\bf end\ for}\ \-}
\newcommand{\Loop}{{\bf loop}\ \= \+}
\newcommand{\Endloop}{{\bf end\ loop}\ \-}
\newenvironment{program}{
    \begin{minipage}{\textwidth}
    \begin{tabbing}
    \ \ \ \ \=\kill
  }{
    \end{tabbing}
    \end{minipage}
  }

% a blank line
\def\blankline{\hbox{}}

%%%%%%%%%%%%%%%%%%%%% End of PROGRAM macros %%%%%%%%%%%%%%%%%


%
% The following macro is used to generate the header.
%
%

\newcommand{\lecture}[5]{
   \pagestyle{headings}
   \thispagestyle{plain}
   \newpage
   \setcounter{chapter}{#1}
   \setcounter{page}{#2}
%  \set\thechapter{#3}
   \noindent
   \begin{center}
   \framebox{
      \vbox{
    \hbox to 6.28in { {\bf Computational  Learning Theory
                        \hfill Fall Semester, 1999/00} }
       \vspace{4mm}
       \hbox to 6.28in { {\Large \hfill Lecture #1: #3  \hfill} }
       \vspace{2mm}
       \hbox to 6.28in { {\it Lecturer: #4 \hfill Scribe: #5} }
      }
   }
   \end{center}
   \markboth{Lecture #1: #3}{Lecture #1: #3}
   \vspace*{4mm}
}

%
% Use these macros for organizing sections of your notes.
% Each command takes two arguments: (1) the title of the section and and
% (2) a keyword for that section to appear in the index.  (See examples.)
% Please don't use \section, \subsection, and \subsubsection directly!
%

\newcommand{\topic}[2]{\section{#1} \index{#2} \markright{#1}}
\newcommand{\subtopic}[2]{\subsection{#1} \index{#2}}
\newcommand{\subsubtopic}[2]{\subsubsection{#1} \index{#2}}
 
%
% Convention for citations is first author's last name followed by other
% authors' last initials, followed by the year.  For example, to cite the
% seventh entry in the course bibliography, you would type: \cite{BurnsL80}
% (To avoid bibliography problems, for now we redefine the \cite command.)
%
     
\renewcommand{\cite}[1]{[#1]}

%
% These are just to make things a little easier:
%
\newcommand{\bi}{\begin{itemize}}
\newcommand{\ei}{\end{itemize}}
\newcommand{\be}{\begin{enumerate}}
\newcommand{\ee}{\end{enumerate}}
\newcommand{\blank}{\vspace{1ex}}   % generates a blank line in the output

%
% Use these for theorems, lemmas, proofs, etc.
%
\newtheorem{theorem}{Theorem}[chapter]
\newtheorem{lemma}[theorem]{Lemma}
\newtheorem{claim}[theorem]{Claim}
\newtheorem{corollary}[theorem]{Corollary}
\newcommand{\qed}{\hfill $\Box$}
\newenvironment{proof}{\par{\noindent \bf Proof:}}{\qed \par}
%\newenvironment{proof}{{\em Proof:}}{\hfill\rule{2mm}{2mm}}

%
% Use the following for definitions.
% \bigdef is for definitions to be set off by themselves; \smalldef is for
% definitions given in the middle of a paragraph.
%
\newenvironment{dfn}{{\vspace*{1ex} \noindent \bf Definition }}{\vspace*{1ex}}
\newcommand{\bigdef}[2]{\index{#1}\begin{dfn} {\rm #2} \end{dfn}}
\newcommand{\smalldef}[1]{\index{#1} {\em #1}}


% **** IF YOU WANT TO DEFINE ADDITIONAL MACROS FOR YOURSELF, PUT THEM HERE:

\begin{document}
%\lecture{**LECTURE-NUMBER**}{**1ST-PAGE**}{**DATE**}{**LECTURER**}{**SCRIBE**}
\lecture{4}{1}{November 11}{Yishay Mansour} {Yael Bdolah, Dror Livnat}

% **** YOUR NOTES GO HERE

%    Some general latex examples and examples making use of the 
% macros follow.  You may want to latex this file and print out the result
% to get an idea of how things look. (You will need to latex the file twice
% to get the cross-references right.)
%    Even if you already know latex, you should take a few minutes to
% look over these examples, so that you understand the general conventions
% we're using.

\topic{Finite Horizon}{}

In lecture number 3 we introduced the optimality equations:\\
$$U_t(h_t) = \max_{a\in A} \{r_t(s_t,a) + \sum_{j\in S} P_t(j|s_t,a)U_{t+1}(h_t,a,j)\},$$
Where $U_N(h_N)=r_N(s_N)$ for $h_N=(h_{N-1},a_{N-1},s_N)$.\\
We showed:
\begin{enumerate}
\item An optimal policy
\item A deterministic optimal policy
\end{enumerate}
\subtopic {Markovian Policy}{}
\begin{theorem}Let $U_{t}^{*}$ be a solution to the optimality equations then
\begin{enumerate}
\item For any $t$, $1 \leq t \leq N,$ $U_{t}^{*}(h_t)$ depends on $h_t$ only through $s_t$
\item There exist a Markovian deterministic optimal policy
\end{enumerate}
\end{theorem}

\begin{proof} We will use a reversed induction to prove (1).\\
{\em Basis}: $U_{N}^{*}(h_N)=r_N(s_N)$, therefore $U_{N}^{*}(h_N)=U_N^*(s_N)$\\
{\em Induction Step}: We assume the validity of the induction hypothesis for any $n$, $n \geq t+1$ and will prove the validity for $t=n$.
\begin{eqnarray*}
U_t^* & = & \max_{a \in A} \{r_t(s_t,a) + \sum_{j\in S} P_t(j|s_t,a)U_{t+1}^*(h_t,a,j)\} \\
& = & \max_{a \in A} \underbrace{\{r_t(s_t,a) + \sum_{j\in S} P_t(j|s_t,a)U_{t+1}^*(j)\} }\\
\end{eqnarray*}
Note that the marked term depends merely on $s$ and $a$. The entire term, therefore depends solely on $s$.\\
Thus, 
\begin{center}
$U_{t}^{*}(h_t)=U_t^*(s_t)$
\end{center}
To prove (2), let $\pi$ be a Markovian deterministic policy that sutisfies:
$${\pi}_t(s_t) = argmax_{a \in A} \{r_t(s_t,a) + \sum_{j\in S} P_t(j|s_t,a)U_{t+1}^*(j)\}$$
Since the policy's definition depends solely on $s_t$, namely the current state, ${\pi}_t$ is a Markovian policy.
\end{proof}
\subsubtopic {Summary}{} Theorem 3.2 and theorem 4.1 lead to
$$V_N^*(s) = \max_{ \pi \in {\Pi}^{HR}} \{V_N^{\pi}(s) = \max_{\pi \in {\Pi}^{MD}} \{V_N^{\pi}(s)\}$$

\noindent Namely, the optimal policy can always be chosen out of the group of Markovian deterministic policies.

\subtopic {An Algorithm for Constructing an Optimal Policy}{}
In this section we develop an optimal Markovian deterministic policy. As shown in the previous section, 
by this we achieve a general optimal policy. The algorithm construction is done from $t=n$ to 
$t=1$ in a recursive manner.\\
The algorithm:
\begin{enumerate}
\item For $t \leftarrow N$, $\forall s_N, U_N(s_N) = r_N(s_N)$
\item For $t \leftarrow t-1$
$$U_t(s_t) = \max_{ a \in A}\{r_t(s_t, a) + \sum_{j\in S} P_t(j|s_t,a)U_{t+1}(j)\}$$
and the optimal group of actions is:
$$A_{s_t}^*(t) = argmax_{ a \in A} \{r_t(s_t,a) + \sum_{j\in S} P_t(j|s_t,a)U_{t+1}^*(j)\}$$
\end{enumerate}
The generated policy, ${\pi}^*$, satisfying $\forall t, \forall s_t, {\pi}_t^*(s_t) \in A_{s_t}^*$ is an optimal policy.
\newline
\newline
\subsubtopic {Computational Complexity}{}
\noindent Let $K=|S|$ and $L=|A|$, then the time complexity of the described algorithm is 
$O(NLK^2)$. The latter is derived from repeating the iterative step for $t=1, 2, ..., N$, calculating $U_t(s_t)$ 
for $K$ states, and a complexity of $KL$ for the max operation.

\subtopic {Example: The Recruiting Problem}{}
\subsubtopic {The Problem}{}
A manager has to recruit a new employee and he can serially interview a finit group of 
candidates. There is a total order defined on the candidates' fitness to the opening, and there are
no two employees with the same skill level. The manager is able to sort the candidates' fitness level
after a short interview. After each interview the manager has two alternatives:
\begin{itemize}
\item recruit the last interviewed candidate
\item continue to the next interview (and give up the chance of recruiting the previous candidate)
\end{itemize}
\subsubtopic {The Goal}{}
To maximize the probability of recruiting the best candidate.\\
\newline
\subsubtopic {The Solution}{}
We will first construct a corresponsing MDP for the problem, and then develop the
optimal policy for it.\\
Let $A=\{Continue, QuitAndHire\}$ the possible actions, where $Continue$ stands for 'continue' and $QuitAndHire$ for 'quit and recruit'.
Let $S=\{MaxSoFar, Other, 1, 0\}$ be the group of states of the MDP, standing for:
\begin{itemize}
\item MaxSoFar - the last interviewed candidate was the best so far
\item Other - the last interviewed candidate was NOT the best so far
\item 1 - the last interviewed candidate was THE best candidate in the entire group and was hired
\item 0 - the last interviewed candidate was NOT the best candidate in the entire group and was hired
\end{itemize}
 \begin{figure}
 \caption{The recruiting problem MDP}
 \label{fig:L4RecrutingMDP}
 \end{figure}
Figure \ref{fig:L4RecrutingMDP} shows the resultant MDP, using the following transition probabilities:
\begin{itemize}
\item $q_t=Prob [\max\{x_1, ..., x_t\}=\max\{x_1, ..., x_N\}]=\frac{t}{N}$
\item $r_t=Prob [x_{t+1} \geq \max\{x_1, ..., x_t\}]=\frac{1}{t+1}$
\end{itemize}
where $N$ is the finite time horizon (the size of the candidates group) and $t$ is the current time
(the number of candidates interviewed so far).\\
Writing the optimality equations for this problem we get:\\
$$U_t^*(1)=1, \;\; U_t^*(0)=0, \;\; U_N^*(MaxSoFar)=U_N^*(Other)=0$$
\begin{eqnarray*}
U_t^*(Other)    & = & \max\{0, r_tU_{t+1}^*(MaxSoFar)+(1-r_t)U_{t+1}^*(Other)\} \\
                & = & r_tU_{t+1}^*(MaxSoFar)+(1-r_t)U_{t+1}^*(Other) \\
U_t^*(MaxSoFar) & = & \max\{q_tU_{t+1}^*(1), r_tU_{t+1}^*(MaxSoFar)+(1-r_t)U_{t+1}^*(Other)\} \\
                & = & \max\{q_t\cdot 1, r_tU_{t+1}^*(MaxSoFar)+(1-r_t)U_{t+1}^*(Other)\} \\
				& = & \max\{q_t, U_t^*(Other)\} \\
\end{eqnarray*}
Assigning the terms for $q_t$ and $r_t$ we get:\\
\begin{equation}
U_t^*(Other)=\frac{1}{t+1}U_{t+1}^*(MaxSoFar)+\frac{t}{t+1}U_{t+1}^*(Other)
\label{L4:U_other}
\end{equation}
\begin{equation}
U_t^*(MaxSoFar)=\max\{\frac{t}{N}, U_t^*(Other)\}
\label{L4:U_max}
\end{equation}
An optimal decision rule has the following properties:
\begin{itemize}
\item In $s=Other$, always choose $a_s=Continue$ (otherwise the return is promised to be zero)
\item In $s=MaxSoFar$, choose $a_s=Continue$ if $\frac{t}{N}<U_t^*(Other)$ and $a_s=QuitAndHire$ otherwise
(in order to maximize $U_t^*(MaxSoFar)$)
\end{itemize}
We now show that the optimal policy is of the form:
\begin{itemize}
\item Interview $\tau$ candidates, performing $a_t=Continue, \; t\leq\tau$
\item Quit and hire the first candidate after time $\tau$, which is the best so far
\end{itemize}

\subsubtopic {Optimality Proof}{}
We will now show that the described policy agrees with the optimality equations.\\
We start by showing that if there is a time $\tau$ such that $\frac{\tau}{N}<U_\tau^*(MaxSoFar)$ 
then $\forall t, t<\tau$ we get $\frac{t}{N}<U_t^*(MaxSoFar)$. That is, if there exists a time $t=\tau$ in 
which it is preferred to $Continue$, then for each earlier time it is also preferred to continue. Later
on, we will show that such a time, $\tau$, does exist.\\
Let $\frac{t}{N}<U_t^*(MaxSoFar)$, then according to equation \ref{L4:U_max} $U_\tau^*(MaxSoFar)=U_\tau^*(Other)$.
Performing backword induction steps, and using equations \ref{L4:U_other} and \ref{L4:U_max} we show that for 
$t<\tau$ the inequality remains true:\\
$U_{\tau-1}^*(Other)=\frac{1}{\tau}U_\tau^*(MaxSoFar)+\frac{\tau-1}{\tau}U_\tau^*(Other)=U_\tau^*(Other)>\frac{\tau}{N}$\\
$U_{\tau-1}^*(MaxSoFar)=\max\{\frac{\tau-1}{N}, U_{\tau-1}^*(Other)\}>\frac{\tau}{N}>\frac{\tau-1}{N}$\\
We now show that for $t>\tau$ and $s=MaxSoFar$, it is preferred to $QuitAndHire$.\\
Let $t>\tau$, then according to the first half of the proof,\\
$U_t^*(MaxSoFar)=\frac{t}{N}$\\
and,\\
\begin{eqnarray*}
U_t^*(Other) & = & \frac{1}{t+1}U_{t+1}^*(MaxSoFar)+\frac{t}{t+1}U_{t+1}^*(Other)\\
             & = & \frac{1}{N}+\frac{t}{t+1}U_{t+1}^*(Other)\\
             & = & \frac{1}{N}+\frac{t}{t+1}\frac{1}{N}+\frac{t}{t+2}\frac{1}{N}+...\\
			 & = & \frac{1}{N}\sum_{ i=0}^{ N-t}\frac{t}{t+i}\sim\ln(\frac{N}{t})\\
\end{eqnarray*}


\noindent We conclude this proof by showing that such a $\tau$ exists, that is, for $N\geq2$ there exists
$\tau\geq1$ that meets the above requiremints. Let us assume $\tau=0$ we get:\\
$U_1^*(Other)=\frac{1}{2}+\frac{1}{3}+...+\frac{1}{N}>\frac{1}{2} \geq \frac{1}{N}=U_1^*(MaxSoFar)\geq U_1^*(Other)$\\
which is a circular inequality, and thus leads to cnotradiction.\\
Note that for $N \geq 2$ we always get $\tau \geq 1$:\\
$U_1^*(Other) = ... = U_{\tau}^*(Other)$\\
$||$			\\
$U_1^*(MaxSoFar) = ... = U_{\tau}^*(MaxSoFar)$\\
and for $t>\tau$\\
$U_t^*(MaxSoFar)=\frac{t}{N}$\\
$U_t^*(Other)=\frac{t}{N}(\frac{1}{t}+\frac{1}{t+1}+...+\frac{1}{N-1})$.\\
We therefore choose $Continue$ if $\frac{1}{t}+...+\frac{1}{N-1}>1$ and $QuitAndHire$ otherwise.\\
\newline
\subsubtopic {A Numeric Example}{}
Let $N=5$, we get\\
$\frac{1}{3}+\frac{1}{4}<1$ and $\frac{1}{2}+\frac{1}{3}+\frac{1}{4}>1$\\
and therefore we choose $\tau=2$.\\
\newline
\subsubtopic {An Asymptotic Example}{}
Let $N \rightarrow \infty$, we get\\
$$\sum_{ j=\tau}^{ N-1}\frac{1}{j}\sim\ln\frac{N}{\tau}.$$\\
We therefor shearch for $\tau$ such that\\
$\ln\frac{N}{\tau+1}<1$ and $\ln\frac{N}{\tau}>1$,\\
which leads to:
$$\tau\sim\frac{N}{e}$$
\newline
\subsubtopic {Conclusion}{}
The best policy is to perform $\frac{N}{e}$ interviews, keeping in mind only the level of the best candidate
so far. Then, starting at candidate number $\frac{N}{e}+1$, $QuitAndHire$ on
the first candidate which satisfies the requirement of $Best So Far$. This way the asysmptotic success
probability is $\frac{\tau}{N}=\frac{1}{e}$.

% **** Add the Infinite Horizon Problems definition here.

% **** THIS IS WHERE YAEL STARTED TO ADD AT HOME ****
\topic{Infinite Horizon Problems}{}
\subtopic {The Return Function}{}
We will suggest three possible return functions for the infinite horizon problem:
\begin{enumerate}
\item The {\bf expected sum of the immediate rewards}, i.e.
\begin{eqnarray*}
V^{\pi}(s) & = & \lim_{N \rightarrow \infty} E_s^{\pi}[\sum_{t=1}^N r_t(X_t, Y_t)]\\
& = & \lim_{N \rightarrow \infty} V_N^{\pi}(s)\\
\end{eqnarray*}
Note that this return function may diverge.
\item The {\bf expected discounted sum of the immediate rewards}, i.e.
$$V_\lambda^{\pi}(s) = \lim_{N \rightarrow \infty} E_s^{\pi}[\sum_{t=1}^N \lambda^{t-1}r_t(X_t, Y_t)],\;\;\; 0 < \lambda < 1$$
In this case, a suffice condition for convergence can be for example: $|r(\cdot , \cdot)| \leq M$\\
Under this condition we can find an upper bound to the return function:
$$V_\lambda^{\pi}(s) \leq \sum_{t=1}^N \lambda^{t-1}M = \frac{M}{1-\lambda}$$
Note that this bound is very sensative to the value of the paramter $\lambda$.
$$\lim_{\lambda \rightarrow 1}V_{\lambda}^{\pi}(s) = V^{\pi}(s)$$
\item The {\bf expected average reward}
\begin{eqnarray*} 
g^{\pi}(s) & = & \lim_{N \rightarrow \infty} \frac{1}{N} E^{\pi}_s [\sum_{t=1}^N r(X_t,Y_t)]\\
& = & \lim_{N \rightarrow \infty} \frac{1}{N} V^{\pi}_N(s)
\end{eqnarray*}
This limit does not always exist. A sutisfactory demand for the limit's existance may be
\begin{enumerate}
\item $S$ is finite
\item \begin{math} \pi \end{math} is Markovian and stationary 
\item the system is non periodic
\end{enumerate}
These conditions will be discussed further in a later lecture.
\end{enumerate}

\subtopic {Example 1}{}
This example is an expantion of exmaple 2 given in lecture 3.\\
We will first examine the value gathered from different return functions using two specific policies:
\begin{enumerate}
\item $\pi_1$- always chooses $a_{11}$ when in state $s_{1}$
\item $\pi_2$ - always chooses $a_{12}$ when in state $s_{1}$
\end{enumerate}
\begin{figure}
\caption{Infinite Horizon Example}
\label{fig:L4InfiniteHorizon}
\end{figure}
Let us start by calculating \begin{math}V^{\pi}_N\end{math}:
\begin{eqnarray*}
V^{\pi_2}_N & = & 10 - (N-2) = 12 - N\\
V^{\pi_1}_N & = & 5 + [\frac{1}{2}\cdot 5 + \frac{1}{2}\cdot (-1)] + [\frac{1}{4}\cdot 5 + \frac{3}{4}\cdot (-1)] + ...\\
& = & [10-\frac{1}{2^{N-2}}\cdot 5] - (N-2) + (1-\frac{1}{2^{N-2}})\\
& = & 13 - N - \frac{6}{2^{N-2}}
\end{eqnarray*}
For \begin{math} N \rightarrow \infty \end{math} the gap between the two policies goes to 1 in favor of \begin{math}\pi_1.\end{math}\\
The three suggested return functions evaluate to:
\begin{enumerate}
\item The expected sum of the immediate rewards:
$$V^{\pi_1}(s_1) = V^{\pi_2}(s_1) = -\infty$$
\item Expected average reward:
\begin{eqnarray*}
g^{\pi_2}(s_1) & = & \lim_{N \rightarrow \infty} \frac{12-N}{N} = -1\\
g^{\pi_1}(s_1) & = & -1
\end{eqnarray*}
\item Expected discounted sum:
\begin{eqnarray*}
V^{\pi_2}_\lambda (s_1) = 10 + \sum_{i=1}^\infty \lambda^i(-1) = 10 - \frac{\lambda}{1-\lambda}
\end{eqnarray*}
\end{enumerate}

\subtopic {The Expected Discounted Sum Return Function}{}
Here are some possible explanations for the \begin{math} \lambda \end{math} parameter.
\begin{enumerate}
\item In economical problems the \begin{math} \lambda \end{math} parameter may be interpreted as the interest
\item Consider a finite horizon problem where the horizon is random, i.e.
$$V^{\pi}_N (s) = E^{\pi}_s E_N [\sum_{i=1}^N r(X_t,Y_t)]$$
assuming that the final value of all the states is equal to 0.\\
\\
Let $N$ be distributed geometricly with parameter \begin{math} \lambda \end{math}. The probability to 
stop at the $N^{th}$ step is\\
\begin{math} Prob[N=n] = (1 - \lambda)\lambda^{n-1}\end{math}
\begin{lemma}
\begin{math} V^{\pi}_N (s) = V^{\pi}_{\lambda} (s)\end{math}\\
Under the assumption that \begin{math} |r(\cdot , \cdot)| < M\end{math}
\end{lemma}
\begin{proof}
\begin{eqnarray*}
V^{\pi}_N (s) & = & E^{\pi}_s \{\sum_{n=1}^\infty [\sum_{t=1}^n r(X_t,Y_t)](1-\lambda)\lambda^{n-1}\}\\
& = & E^{\pi}_s [\sum_{t=1}^\infty r(X_t,Y_t)(1-\lambda) \sum_{n=t}^\infty \lambda^{n-1}]\\
& = & E^{\pi}_s [\sum_{t=1}^\infty r(X_t,Y_t)](1-\lambda) \frac{\lambda^{t-1}}{1-\lambda}]\\
& = & E^{\pi}_s [\sum_{t=1}^\infty \lambda^{t-1} r(X_t,Y_t)]\\
& = & V^{\pi}_{\lambda} (s)
\end{eqnarray*}
\end{proof}
If we look back at example 1 we could add to it an additional state, \begin{math} \Lambda \end{math}, that behaves as a 'black hole', see figure \ref{fig:L4InfiniteHorizon}. Once the system reached this state it stays there forever, getting an immediate reward of value 0.\\
The probablity to move into state \begin{math} \Lambda \end{math} is \begin{math} (1-\lambda) \end{math} from any state. All other probabilities given in the original example are multiplied by \begin{math} \lambda \end{math}.\\
\begin{figure}
\caption{y}
\label{fig:L4y}
\end{figure}
The sum of the immediate rewards from the new model is equal to the discounted sum of the immediate rewards from the original model.
\end{enumerate}
\subtopic {Markovian Policy}{}
We will now show for every initial state, $s$, and a history dependent policy, \begin{math} \pi \end{math}, a Markovian policy \begin{math} \pi^{'} \end{math} such that the distribution on $(X_t, Y_t)$ is equal for \begin{math} \pi \end{math} and \begin{math} \pi^{'} \end{math}.
\begin{theorem}
\label{L4:MarkovianPolicyTheorem}
Let \begin{math} \pi=(d_1, d_2, ...) \in \Pi^{HR}\end{math}.\\
Then $\forall s \in S$ there exists a Markovian stochastic policy \begin{math} \pi^{'}=(d_1^{'}, d_2^{'}, ...) \in \Pi^{MR}\end{math},\\
that sutisfies \begin{math} Prob_{\pi^{'}}[X_t=j,Y_t=a | X_1=s] = Prob_{\pi}[X_t=j,Y_t=a | X_1=s] \end{math}
\end{theorem}
\begin{proof}
For every $j \in S$ and $a \in A_j$ we define \begin{math} \pi^{'} \end{math} as follows:\\
\begin{math} q_{d_t^{'}(j)}(a) = Prob_{\pi}[Y_t=a | X_t=j, X_1=s]\end{math}\\
We will first show that this definition results in the same distibution over the group of {\bf actions}:
\begin{eqnarray*}
Prob_{\pi^{'}}[Y_t=a | X_t=j] & = & Prob_{\pi^{'}}[Y_t=a | X_t=j, X_1=s]\\
& = & Prob_{\pi}[Y_t=a | X_t=j, X_1=s]
\end{eqnarray*}
The first equality is derived from the fact that \begin{math} \pi^{'} \end{math} is Markovian.\\
The second equality is by definition.\\
\\
It is left to be shown that the distribution over the group of {\bf states} is equal under \begin{math} \pi \end{math} and \begin{math} \pi^{'} \end{math}, i.e.\\
\begin{math} Prob_{\pi^{'}}[X_t=j | X_1=s] = Prob_{\pi}[X_t=j | X_1=s]\end{math}\\
We will prove this part by an induction on $t$. The idea behind the proof of this part is that if at a certain step we have an equal distribution over the group of states, and we are taking the same stochastic action, we will endup with same distribution over the group of states.\\
{\em Basis}: for \begin{math} t = 1 \;\;\;\; X_t=s\end{math} in \begin{math} \pi \end{math} and in \begin{math} \pi^{'} \end{math}\\
{\em Induction Step}: We assume that there is an identity between the distribution over the group of states in \begin{math} \pi \end{math} and in \begin{math} \pi^{'} \end{math} until the time $t-1$
\begin{eqnarray*}
Prob_{\pi}[X_t=j | X_1=s] & = & \sum_{k \in S} \sum_{a \in A_k} Prob_{\pi}[X_{t-1}=k, Y_{t-1}=a | X_1=s]p(j|k, a)\\
& = & \sum_{k \in S} \sum_{a \in A_k} Prob_{\pi^{'}}[X_{t-1}=k, Y_{t-1}=a | X_1=s]p(j|k, a)\\
& = & Prob_{\pi^{'}}[X_t=j | X_1=s]
\end{eqnarray*}
Recalling that \begin{math} Prob_{\pi^{'}}[X_t=j,Y_t=a | X_1=s] = Prob_{\pi^{'}}[X_t=j | X_1=s] \cdot Prob_{\pi^{'}}[Y_t=a | X_t=j, X_1=s] \end{math}\\
concludes this proof.
\end{proof}
\subsubtopic {The Return Function}{}
\begin{eqnarray*}
V_N^{\pi}(s) & = & \sum_{t=1}^{N-1}\sum_{j \in S}\sum_{a \in A_j} r(j,a)\cdot Prob[X_t=j, Y_t=a | X_1=s]\\
& + & \sum_{j \in S}\sum_{a \in A_j}[r_N(j)]\cdot Prob[X_N=j, Y_N=a | X_1=s]
\end{eqnarray*}
Therefore:
\begin{enumerate}
\item \begin{math} \forall N\; V_N^{\pi}(s) = V_N^{\pi^{'}}(s) \end{math}\\
Since we proved in the last theorem that the dsitribution function is equal for \begin{math} \pi \end{math} and \begin{math} \pi^{'} \end{math}.
\item \begin{math} g^{\pi}(s) = g^{\pi^{'}}(s) \end{math}\\
Since (1) is true for all $N$.
\item \begin{math} V_{\lambda}^{\pi}(s) = V_{\lambda}^{\pi^{'}}(s) \end{math}
\end{enumerate}
One should note that it is imposible to prove theorm \ref{L4:MarkovianPolicyTheorem} for history dependent and Markovian {\bf deteministic} policies, since the random property of the policy allows the modeling of all the histories under one state.




 



% **** THIS ENDS THE EXAMPLES. DON'T DELETE THE FOLLOWING LINE:

\end{document}







