\documentstyle[12pt,psfig]{book}
\setlength{\evensidemargin}{.0 in} \addtolength{\topmargin}{-1in}
\setlength{\textwidth}{6.5in} \setlength{\textheight}{8in}

\newcommand{\remove}[1]{}

\newcommand{\Do}{{\small\bf do}\ }
\newcommand{\Return}{{\small\bf return\ }}
\newcommand{\Proc}[1]{#1\+}
\newcommand{\Returns}{{\small\bf returns}}
\newcommand{\Procbegin}{{\small\bf begin}}
\newcommand{\If}{{\small\bf if}\ \=\+}
\newcommand{\Then}{{\small\bf then}\ \=\+}
\newcommand{\Else}{\<{\small\bf else}\ \>}
\newcommand{\Elseif}{\<{\small\bf elseif}\ }
\newcommand{\Endif}{\<{\small\bf end\ if\ }\-\-}
\newcommand{\Endproc}[1]{{\small\bf end} #1\-}
\newcommand{\For}{{\small\bf for}\ \=\+}
\newcommand{\Endfor}{{\small\bf end\ for}\ \-}
\newcommand{\Loop}{{\bf loop}\ \= \+}
\newcommand{\Endloop}{{\bf end\ loop}\ \-}
\newenvironment{program}{
    \begin{minipage}{\textwidth}
    \begin{tabbing}
    \ \ \ \ \=\kill
  }{
    \end{tabbing}
    \end{minipage}
  }

\def\blankline{\hbox{}}


\newcommand{\lecture}[7]{
   \pagestyle{headings}
   \thispagestyle{plain}
   \newpage
   \setcounter{chapter}{#1}
   \setcounter{page}{#2}
%  \set\thechapter{#3}
   \noindent
   \begin{center}
   \framebox{
      \vbox{
    \hbox to 6.28in { {\bf Computational  Learning Theory
                       \hfill Fall Semester, 1999/00} }
       \vspace{4mm}
       \hbox to 6.28in { {\Large \hfill Lecture #1: #3  \hfill} }
       \vspace{2mm}
       \hbox to 6.28in { {\it Lecturer: #4 \hfill Scribe: #5, #6, #7} }
      }
   }
   \end{center}
   \markboth{Lecture #1: #3}{Lecture #1: #3}
   \vspace*{4mm}
}

\newcommand{\topic}[2]{\section{#1} \index{#2} \markright{#1}}
\newcommand{\subtopic}[2]{\subsection{#1} \index{#2}}
\newcommand{\subsubtopic}[2]{\subsubsection{#1} \index{#2}}

\renewcommand{\cite}[1]{[#1]}

\newcommand{\bi}{\begin{itemize}}
\newcommand{\ei}{\end{itemize}}
\newcommand{\be}{\begin{enumerate}}
\newcommand{\ee}{\end{enumerate}}
\newcommand{\blank}{\vspace{1ex}}   % generates a blank line in the output

\newtheorem{theorem}{Theorem}[chapter]
\newtheorem{lemma}[theorem]{Lemma}
\newtheorem{claim}[theorem]{Claim}
\newtheorem{corollary}[theorem]{Corollary}
\newcommand{\qed}{\hfill $\Box$}
\newenvironment{proof}{\par{\bf Proof:}}{\qed \par}

\newenvironment{dfn}{{\vspace*{1ex} \noindent \bf Definition }}{\vspace*{1ex}}
\newcommand{\bigdef}[2]{\index{#1}\begin{dfn} {\rm #2} \end{dfn}}
\newcommand{\smalldef}[1]{\index{#1} {\em #1}}


% **** YOUR NOTES GO HERE %%
%***************************
\begin{document}
\lecture{9}{1}{December 23}{Prof. Yishay Mansour}{Harrusi Shachar}
{Sozio Andrei}{ }
\begin{flushleft}
\topic{Evaluating One Policy With Another}{}
Until now we discussed the case we have policy $\pi$ and need to
evlaute its value $V^{\pi}$. \\
Now we look at the case where we have two policies: $\pi_{1}$,$\pi_{2}$. We have samples of %%@
$\pi_{1}$ and we need to evaluate $V\pi_{2}$.\\

\subtopic {Importance Sampling}{}
We have two sources $D_{1}(x)$ and $D_{2}(x)$ that produce differnt distributions. 
We compute expectation of a function F(x) on one source while sampling the other source. The %%@
expectation of F(x) with respect to distribution D is the sum of products of all values of X %%@
with the probability that D assigns that value. In our case:\\ \ \\
$E_{D_{2}}[F(x)]$  =  $\sum{D_{2}}(X)F(x)$  =  %%@
$\sum{D_{1}}(X)(\frac{D_{2}(X)}{D_{1}(X)})F(x)$  =  %%@
$E_{D_{1}}[$($\frac{D_{2}(X)}{D_{1}(X)}$)F(x)$]$\\ \ \\

\underline{EXAMPLE 1}\\
Input:
\begin{itemize}
\item $F(x)  = k.$
\item $D_{1} = Prob[k] = \frac{1}{2^{k}}.$
\item $D_{2} = Prob[k] = \frac{3}{4}(\frac{1}{4})^{k-1}.$
\end{itemize}

Computation:\\
\begin{itemize}
\item We find expectancy $E_{D_{2}}[k]$ from samples of $D_{1}$.\\ \ \\
$E_{D_{1}}[k]  =  E_{D_{2}}[(\frac{D_{1}(k)}{D_{2}(k)})k]=  
E_{D_{2}}[(\frac{(\frac{1}{2})^{k}}{(\frac{3}{4})(\frac{1}{4})^{(k-1)}})k]  =  
E_{D_{2}}[k\frac{4}{3}\frac{1}{2}2^{k-1}]  =  E_{D_{2}}[k\frac{2}{3}^{k}]$
\item We check the equation by computing $E_{D_{2}}[F(x)]$.\\ \ \\
$E_{D_{2}}[k\frac{2}{3}^{k}]$  =  $\sum_{k}(\frac{1}{4})^{k-%%@
1}(\frac{3}{4})(\frac{2^{k}}{3})(k)$  =  
$\sum_{k}(\frac{1}{2^{k}})k$  =  $E_{D_{1}}[k]$
\item One of the problems in importance sampling is the variance.\\
In this case.\\ \ \\
$E_{D_{2}}[(k\frac{2^{k}}{3})^{2}]$  = $\sum_{k}(k^{2})(\frac{2^{2k}}{9})(\frac{1}{4})^{k-%%@
1}\frac{3}{4}$  = $\sum_{k}\frac{k^{2}}{3}$ = $\infty$
\end{itemize}

\subtopic
{Policy Sampling}\\ \ \\
The conclusion from the above equivalence is that if we can compute %%@
$\frac{D_{2}(X)}{D_{1}(X)}$ then we\\able to "transform"
samples from distribution $D_{1}$ to samples in distribution $D_{2}$.\\
We produce $T \in {\{S,A,R\}^{*}}: T_{1}..T_{m}$ of policy $\pi_{1}$.
each sample($T_{j}$) is a run on the model using policy $\pi_{1}$,
i.e. $T_{j} = s_{1},a_{1},r_{1},s_{2},a_{2},r_{2}...$ \\
The probability of generating $T_{j}$ is actually a product of two independent %%@
probabilities:
a policy depeneded probablity on actions and a model depended probability.\\ \ \\
$Prob[T_{j}]$  =  
$\prod_{i=1}^{|T_{j}|}(\pi_{1}(s_{i},a_{i})*Prob(s_{i+1}|s_{i},a_{i}))$ = %%@
\\$(\prod_{i=1}^{|T_{j}|}\Pi_{1}(s_{i},a_{i}))*(\prod_{i=1}^{|T_{j}|}Prob(s_{i+1}|s_{i},a_{i%%@
}))$\\ \ \\
We calculte the ratio of probabilites to have the same $T_{j}$ on differnt policies:\\ \ \\
$\frac{Prob_{\pi_{2}}[T_{j}]}{Prob_{\pi_{1}}[T_{j}]}  =  
\frac {(\prod_{i=1}^{|T_{j}|}\pi_{2}(s_{i},a_{i}))*(\prod_{i=1}^{|T_{j}|}Prob(s_{i+1}|s_{i},a_{i})%%@
)}
{(\prod_{i=1}^{|T_{j}|}\pi_{1}(s_{i},a_{i}))*(\prod_{i=1}^{|T_{j}|}Prob(s_{i+1}|s_{i},a_{i})%%@
)}  =  
\prod_{i=1}^{|T_{j}|}\frac{\pi_{2}(s_{i},a_{i})}{\pi_{1}(s_{i},a_{i})}$\\ \ \\
The important fact is that the ratio does \underline{not} depened on the model, but only on %%@
the policies. Therefor we can compute it with out the model.\\

\underline{EXAMPLE 2}\\ \ \\
Input:
\begin{itemize}
\item policy $\Pi_{1}$ .
\item policy $\Pi_{2}$ is determinstic.
\end{itemize}

Computation:\\
\begin{itemize}
\item because $\Pi_{2}$ is determinstic $\Pi_{2}(s_{i},a_{i}) \in {\{0,1\}}$
\item Therefor the ratio is a product of X, $X \in {\{0,\frac{1}{\Pi_{1}(s,a)}\}}$
\item If $\Pi_{1}$ is the random policy than we simply have a uniform distribution on the %%@
runs. Consistent with $\Pi_{2}$.
\end{itemize}
\subsubtopic
{Problem of sampling} \\
We can estimate policy $\pi_{2}$ from samples of another policy $\pi_{1}$.
The problem is the variance can be big -infinte. (variance in example 1).
Big variance can cause the error of the estimation grow.

\subsubtopic
{conclusion:}{}
\begin{enumerate}
\item To calculate the ratio we don't need any knowledge on the Model\\ 
only about the two policies we use.
\item The ratio is $(\frac{D_{2}(X)}{D_{1}(X)})F(x)$ is the case of Importance Sampling.
\item 1+2 imply we can use samples from one policy to calculate samples on another policy.
\item conclusion 3 explain why Q-learning can work.
\item The Variance must be limited to avoid errors. 
\end{enumerate}

\topic{Q-learning and SARSA algorithms}{}
In this section we descuss off-line and on-line algorithms to compute \\
the optimal policy in case the exact model is not known. 

\subtopic
{Q-learning}{}
Lets consider Value Iteration Algorithm(VI) from lecture 6.It described the non linear %%@
operator: L. In every iteration of the algorithm we operate L:\\
$V_{n+1}=LV_{n}$, and explicitaly:\\ \ \\
$V_{n+1} = \max_{a \in A_{s}}\{r(s,a) + \lambda\sum_{s^{'}\in S}
 P(s^{'}|s,a)V_{n}(s^{'})\}$. \\ \ \\
Lets refine the equation a somewhat. We define new function Q regarding VI:\\ \ \\
$Q^{n+1}(s,a) = r(s,a) + \lambda\sum_{s^{'}\in S}P(s^{'}|s,a)V_{n}(s^{'})$.\\ \ \\
Now the iteration of VI are:
$V_{n} = \max_{a \in A_{s}}\{Q^{n}(s,a)\}$. Expressed in Q function terms only we have:\\ \ %%@
\\
$Q^{n+1}(s,a) = r(s,a) + \lambda\sum_{s^{'}\in S}P(s^{'}|s,a)\max_{b \in %%@
A_{s}}\{Q^{n}(s^{'},b)\}$. \\ \ \\
We write the iteration with $\alpha-notation$.\\
(In lecture 7 we learned it converges the right value.)\\ \ \\
$Q^{n+1}(s,a) = (1-\alpha)Q^{n}(s,a) + \alpha[ r(s,a) + \lambda\sum_{s^{'}\in %%@
S}P(s^{'}|s,a)\max_{b \in A_{s}}\{Q^{n}(s^{'},b)\}]$\\ \ \\
Until now the iterations are equvivalent to VI. Instead of taking the excpetancy of the %%@
value of the next step we take a sample of the next step. We assume that we are in state s, %%@
we take action a, the next state $s^{'}$ is distributed by $P(s^{'}|s,a)$. Finally we get\\ %%@
\ \\
$Q^{n+1}(s,a) = (1-\alpha)Q^{n}(s,a) + \alpha[ r(s,a) + \lambda\max_{b \in %%@
A_{s^{'}}}Q^{n}(s^{'},b)\}]$\\ \ \\

\begin{figure}[phtb]
% the command below makes a frame around the code
\framebox[\textwidth]{
% the program environment 
\begin{program}
\Proc{\sc q-learning} \\
	\Procbegin \\
	Init: $Q(s,a)$ \\
     	\For every run $T$ \\
     	Init $s$ \\
		\For every step in $T$ \\
			choose action $a$ (using  policy $\Pi$) \\
			do action $a$ and obtain: $s^{'} \in S$ ,$r \in R$\\
			$Q(s,a) = Q(s,a) + \alpha[ r(s,a) + \lambda\max_{b \in %%@
A_{s^{'}}}\{Q(s^{'},b)\} - Q(s,a) ]$\\	
			$s \leftarrow s^{'}$\\
 		\Endfor\\		
 	\Endfor\\ 
\Endproc{\sc q-learning}\\  
\end{program}}
\caption{Algorithm for {\sc q-learning} }. 
\label{bctss-code}
\end{figure}

\subsubtopic
{remarks:}{}
\begin{itemize}
\item if we choose Q to be optimal $Q=Q^{*}$ then $Q^{*}(s,a)=r(s,a)+\lambda %%@
E_{s^{'}}[^{max}_{b}Q^{*}(s^{'},b)]$.
\item The algorithm Q-Learning is off-policy since $\Pi$ we don't control the policy that %%@
porforms actions. In general an off-line algorithm doesn't control the actions it does.
\item For on-policy we can give any action a small probability s.t. we reach all MDP.
For on-policy we can hope to achieve rewards getting closer to optimal. 
\end{itemize}
For  on-policy $\pi$ is $\epsilon-greedy$  regarding Q at the moment. Thus we get Sarsa %%@
algorithm\\
\pagebreak

\subtopic
{SARSA}{}
SARSA is on-line algorithm. In this algorithm keep two states the current state and the next %%@
state.\\
Hence its name "S(s),A(a),R(r),S($s^{'}$),A($a^{'}$)" is derive from the fact that we use %%@
the current state (s), current action (a), current reward (r), next state ($s^{'}$) and next %%@
action ($a^{'}$). We update Q with the difference between the next value function in $s^{'}$ %%@
and the current value function.\\
\begin{figure}[phtb]
% the command below makes a frame around the code
\framebox[\textwidth]{
% the program environment 
\begin{program}
\Proc{\sc sarsa} \\
	\Procbegin \\
	Init: $Q(s,a)$\\
     	\For every run $T$\\
			Init $s$ \\
			choose action a for s using  $\epsilon-greedy$ policy for $Q$\\
			\For every step $\in T$ \\
			do action $a$ and get: $s^{'} \in S$, $r \in R$\\
			choose action $a^{'}$ for $s^{'}$ (from Q).\\
			$Q(s,a) = Q(s,a) + \alpha[ r(s,a) + \lambda(Q(s^{'},a^{'})) - Q(s,a) ]$\\
		$s \leftarrow s^{'}$\\
		$a \leftarrow a^{'}$\\
 		\Endfor\\		
 	\Endfor\\ 
\Endproc{\sc sarsa}\\  
\end{program}}
\caption{Algorithm for {\sc sarsa} }. 
\label{bctss-code}
\end{figure}


\topic
{Convergence proof}{}
The general idea is writing our algorithm as:\\ \ \\
$Q \leftarrow (1-\alpha)Q + \alpha(HQ+w)$\\ \ \\
where:\\ \ \\
$(HQ)(s,a) = r(s,a)+\lambda\sum_{s^{'}}P(s^{'}|s,a)^{max}_{b}Q(s^{'},b)$ \\ \ \\
We will show that $H$ is a contracting mapping (operator):\\
$||HQ_{1} - HQ_{2}|| = ||\lambda P[MQ_{1} - MQ_{2}]||$\\
where $(MQ_{1})(s)=^{max}_{a}Q_{1}(s,a)$. Note that $MQ_{1}=V_{1}$. We have,\\
$||HQ_{1} - HQ_{2}|| \leq \lambda||V_{1} - V_{2}||\leq \lambda||Q_{1} - Q_{2}||$,\\ 
Therefore $H$ is $\lambda$  contracting.\\
The parameter $w$ is "noise" in the sample whose expectation is 0.
By using the contracting property of $H$ we show convergence.
For convergence we require:
\begin{itemize}
\item $Q^{n+1}(s,a)=(1-\alpha_{n}(s,a))Q^{n}(s,a) + %%@
\alpha_{n}(s,a)[r(s,a)+\lambda^{max}_{b}Q^{n}(s^{'},b)]$\\
\item Let $T^{s,a}$ be the time we are in state $s$ and make action a\\
\begin{itemize}
\item for     $t \in T^{s,a}$   $\alpha_{t}(s,a)>0$\\
\item and for $t \notin T^{s,a}$   $\alpha_{t}(s,a)=0 $\\
\end{itemize}
\item $F_{t}$ is history untill time $t$\\
\item Thus the algorithm can be written as:\\ \ \\
$Q^{n+1}(s,a)=(1-\alpha_{n}(s,a))Q^{n}(s,a) + \alpha_{n}(s,a)[(HQ^{(n)}(s,a) + %%@
w_{n}(s,a)]$\\ \ \\
where: $w_{n}(s,a)=\lambda[^{max}_{b}Q^{(n)}(s^{'},b)-%%@
\sum_{s^{''}}P(s^{''}|s,a)^{max}_{c}Q^{(n)}(s^{''},c)]$\\
\item because $s^{'}$ is distributed according to $P(.|s,a)$ (the policy is stationary) then %%@
we have:
$E[w_{n}(s,a)]=\lambda[E[^{max}_{b}Q^{(n)}(s^{'},b)]-E[^{max}_{c}Q^{(n)}(s^{''},c)]] = 0$\\
Therefore the expectation of "noise" is 0.
\end{itemize}
\begin{theorem}
(A general theorem for Stochastic processes)\\
Let: $r_{t+1}(i)=(1-\alpha_{t}(i))r_{t}(i)+\alpha_{t}(i)[(Hr_{t})(i)+w_{t}(i)]$\\ \ \\
Given that:\\
\begin{enumerate}
\item $E[w_{t}(i)|F_{t}]=0$\\
\item $E[w_{t}^{2}(i)|F_{t}] \leq A+B||r_{t}||^{2}$ for some A,B constants.\\
\item the steps $\alpha_{t}(i)$ are such that: $\sum\alpha_{t}(i)=\infty$ and %%@
$\sum\alpha_{t}^{2}(i)<\infty$\\
\item the  mapping H (operator) is contracting\\
\end {enumerate}
Then $r_{t} \rightarrow r^{*}$ with probability 1 .\\
(where $R^{*}$ is the fixed point of $Hr^{*}=r^{*}$)\\
\end{theorem}
In the case of Q-Learning it means that $HQ^{*}=Q^{*}$ and $Q^{*}$ is the optimal value %%@
function,\\
(this is the same operator as in Value Iteration).
Thus we achieve by the convergence of Q-Learning the optimal policy value.\\
\begin{theorem}
If all r(s,a) are non-negative and bounded and $Q^{0}(s,a)=0$ then\\
$Q^{(n)} \rightarrow Q^{*}$\\
\end{theorem}
For the theorem to be true we assumed that we sample each (s,a) infinite number of times, \\
thus we could look at each sequence separately.\\ \ \\
How to assure that we visit all (s,a) ?\\
\begin{enumerate}
\item If we follow the greedy policy of $Q^{(n)}$ it may be not possible to check all pairs %%@
(s,a)\\
even if  we reach all states $s \in S$\\
\item We will follow from a state $s$ the $\varepsilon$-greedy policy. With probability $1-%%@
\varepsilon$ we choose the greedy action from s,\\
and do a random policy from s with probability $\varepsilon$ .\\
\item Choose the policy by:\\ \ \\
$\Pi^{n}(s,a)= \frac{exp{-Q(s,a)/T}}{\sum_{b}exp{-Q(s,b)/T}}$\\
T determines if we do $\varepsilon$-greedy action (T $\rightarrow 0$) 
or random action (T $\rightarrow \infty$). 
We can look at this as if we part the time into sections of explorations(random) and of %%@
explotation(greedy).\\ \ \\
\end{enumerate}
On all 3 methods we can decide if we want to exploit the knowledge we have or get more %%@
information about the MDP\\ \ \\
The next theorem is a weaker version of the main theorem.
\begin{theorem}
Let $r_{t+1}=(1-\frac{1}{t})r_{t}+\frac{1}{t}(Hr_{t}+w_{t})$\\
Assume that:\\
\begin{enumerate}
\item $E[w_{t}]=0$  and $|w_{t}|\leq 1$\\
\item H is a contracting mapping\\
\item $|r_{t}| \leq D_{0}$\\
\end{enumerate}
Then $r_{t} \rightarrow r^{*}$ such that $Hr^{*} = r^{*}$\\
\end{theorem}
\begin{proof}
Since $H$ is contracting,\\
$||Hr-Hr^{*}||\leq \beta ||r-r^{*}||$ for some $\beta < 1$.\\
Whithout loss of generality, assume that $r^{*}=0$,\\
therefore $||Hr||\leq \beta ||r||$.\\ \ \\
 Define: $D_{k+1} = (\beta+2\varepsilon)D_{k}$ such that $\beta + 2\varepsilon<1$.\\
 We show by induction that exists a time $t_{k}$ such that for every $t \geq t_{k}$  then  %%@
$||r_{t}|| \leq D_{k}$\\ \ \\
 We bound $r_{t}$ for  $t>t_{k}$. 
 since $||r_{t}|| \leq D_{k}$ then $||Hr_{t}|| \leq \beta D_{k}$ (the induction %%@
hypothesis).\\
 We have: $r_{t+1}=\frac{t-1}{t}r_{t}+\frac{1}{t}(Hr_{t}+w_{t})=\frac{t-l-1}{t}r_{t-%%@
l}+\frac{1}{t}\sum_{i=t-l}^{t}Hr_{i}+\frac{1}{t}\sum_{i=t-l}^{t}w_{i}$.\\
 choose $l=t-t_{k}$ then : $||r_{i}||\leq D_{k}$ and 
 $||Hr_{i}||\leq \beta D_{k}$ .\\
 Since $E[w_{i}]=0$ and $|w_{i}|\leq 1$ we have that:$\frac{1}{t}\sum_{i=t-%%@
l}^{t}w_{i}\rightarrow 0$. 
 Therefore $\frac{1}{t}\sum_{i=t-l}^{t}w_{i} \leq \varepsilon D_{k}$ for all but a finite %%@
number of t.\\ \ \\ 
 We continue: $r_{t} \leq \frac{t_{k}-1}{t}r_{t_{k}}+\frac{1}{t}\sum_{i=t-%%@
l}^{t}Hr_{i}+\frac{1}{t}\sum_{i=t-l}^{t}w_{i} \leq\frac{t_{k}}{t}D_{k}+\frac{t-%%@
t_{k}}{t}\beta D_{k}+\frac{t-t_{k}}{t}\varepsilon$\\ \ \\
 For $t \geq t_{k}/\varepsilon$ we get:\\
 $r_{t} \leq \varepsilon D_{k} + \beta D_{k}+\varepsilon D_{k}  = %%@
(\beta+2\varepsilon)D_{k}=D_{k+1}$\\ \ \\
 Therefore exists $t_{k+1}$ such that $r_{t}\leq D_{k+1}$\\
 Thus when $t\rightarrow \infty$ then $||r_{t}|| \rightarrow r^{*}=0$.
 \end{proof}

\end{flushleft}
\end{document}