\documentstyle[12pt,psfig]{book}
\setlength{\evensidemargin}{.0 in} \addtolength{\topmargin}{-1in}
\setlength{\textwidth}{6.5in} \setlength{\textheight}{8in}

\newcommand{\remove}[1]{}

\newcommand{\Do}{{\small\bf do}\ }
\newcommand{\Return}{{\small\bf return\ }}
\newcommand{\Proc}[1]{#1\+}
\newcommand{\Returns}{{\small\bf returns}}
\newcommand{\Procbegin}{{\small\bf begin}}
\newcommand{\If}{{\small\bf if}\ \=\+}
\newcommand{\Then}{{\small\bf then}\ \=\+}
\newcommand{\Else}{\<{\small\bf else}\ \>}
\newcommand{\Elseif}{\<{\small\bf elseif}\ }
\newcommand{\Endif}{\<{\small\bf end\ if\ }\-\-}
\newcommand{\Endproc}[1]{{\small\bf end} #1\-}
\newcommand{\For}{{\small\bf for}\ \=\+}
\newcommand{\Endfor}{{\small\bf end\ for}\ \-}
\newcommand{\Loop}{{\bf loop}\ \= \+}
\newcommand{\Endloop}{{\bf end\ loop}\ \-}
\newenvironment{program}{
    \begin{minipage}{\textwidth}
    \begin{tabbing}
    \ \ \ \ \=\kill
  }{
    \end{tabbing}
    \end{minipage}
  }

\def\blankline{\hbox{}}


\newcommand{\lecture}[7]{
   \pagestyle{headings}
   \thispagestyle{plain}
   \newpage
   \setcounter{chapter}{#1}
   \setcounter{page}{#2}
%  \set\thechapter{#3}
   \noindent
   \begin{center}
   \framebox{
      \vbox{
    \hbox to 6.28in { {\bf Computational  Learning Theory
                       \hfill Fall Semester, 1999/00} }
       \vspace{4mm}
       \hbox to 6.28in { {\Large \hfill Lecture #1: #3  \hfill} }
       \vspace{2mm}
       \hbox to 6.28in { {\it Lecturer: #4 \hfill Scribe: #5, #6 #7} }
      }
   }
   \end{center}
   \markboth{Lecture #1: #3}{Lecture #1: #3}
   \vspace*{4mm}
}

\newcommand{\topic}[2]{\section{#1} \index{#2} \markright{#1}}
\newcommand{\subtopic}[2]{\subsection{#1} \index{#2}}
\newcommand{\subsubtopic}[2]{\subsubsection{#1} \index{#2}}

\renewcommand{\cite}[1]{[#1]}

\newcommand{\bi}{\begin{itemize}}
\newcommand{\ei}{\end{itemize}}
\newcommand{\be}{\begin{enumerate}}
\newcommand{\ee}{\end{enumerate}}
\newcommand{\blank}{\vspace{1ex}}   % generates a blank line in the output

\newtheorem{theorem}{Theorem}[chapter]
\newtheorem{lemma}[theorem]{Lemma}
\newtheorem{claim}[theorem]{Claim}
\newtheorem{corollary}[theorem]{Corollary}
\newcommand{\qed}{\hfill $\Box$}
\newenvironment{proof}{\par{\bf Proof:}}{\qed \par}

\newenvironment{dfn}{{\vspace*{1ex} \noindent \bf Definition %%@
}}{\vspace*{1ex}}
\newcommand{\bigdef}[2]{\index{#1}\begin{dfn} {\rm #2} \end{dfn}}
\newcommand{\smalldef}[1]{\index{#1} {\em #1}}

% **** YOUR NOTES GO HERE %%@
%****************************************************

\begin{document}
\lecture{7}{1}{December 9}{Prof. Yishay Mansour}{Boshy Shlomy}
{Berkman Omer}{ } 
\begin{flushleft}
\topic{Learning from experience}{}
we consider the case in which the exact model is not known.\\
In this case we develop methods for learning the model using statistical methods based on sampling and experience.

\subtopic
{Average Calculation}{}
Let $V_{1},...,V_{n}$ be independent identical distributed (i.i.d) random variables.\\
We assume $ |V_{i}| \leq M $.\\
The average can be written as :\\ \ \\
$ \hat{V_{N}} = \frac{1}{N}\sum_{i=1}^{N}V_{i} $\\ \ \\
or as a recursive online formula :\\ \ \\
$ \hat{V_{k}} = (1-\frac{1}{k}) \hat{V}_{k-1} +\frac{1}{k}V_{k} $\\
(Since: $(1-\frac{1}{k})\hat{V}_{k-1}= \frac{k-1}{k}(\frac{1}{k- 1}\sum_{i=1}^{k-1}V_{i}) $ )

\subsubtopic
{Convergence}
AAccording to the Law of Large Numbers the average converges to the expected value :\\ \ \\
$ \hat{V_{n}}\longrightarrow \bar{v}=E[V_{i}] $, note that clearly 
\\$E[\hat{V_{n}}]=\bar{v}$.\\ \ \\
The variance is :\\ \ \\
$Var(\hat{V_{n}})=\frac{1}{N^{2}}\sum_{i=1}^{N}Var(V_{i})=\frac{\sigma^{2}}{N}$, where\\
$\sigma^{2}=Var(V_{i})$.\\ \ \\
This implies that when N goes to infinity we have, \\ \ \\
$\lim_{N \rightarrow \infty}\frac{\sigma^{2}}{N}=0$\\ \ \\
so as the sample size increases, the difference between the average and the expected value decreases.

\subsubtopic
{Convergence rate}
UUsing Chernoff's theorem : 
\begin{itemize}
\item $Prob[|\hat{V_{N}}-\bar{v}|\geq\gamma]\leq 2e^{-2N\gamma^{2}}$
\item $Prob[\hat{V_{N}}>(1+\gamma)\bar{v}]\leq e^{-N\bar{v}\gamma^{2}/3}$
\item $Prob[\hat{V_{N}}<(1-\gamma)\bar{v}]\leq e^{-N\bar{v}\gamma^{2}/2}$
\end{itemize}
Note that convergence rate is exponential in the sample size.

\subtopic 
{Weighted average}{}
The weighted average can be defined by:\\ \ \\
$\hat{V}=\sum_{i=1}^{N}\alpha_{i}V_{i}$, where the constants $\alpha_{i}$ have the properties that,\\ 
$\alpha_{i}\geq 0$ and $\sum_{i=1}^{N}\alpha_{i}=1$ \\ \ \\
The expected value of the weighted average is, \\ \ \\
$E[\hat{V}]=E[\sum_{i=1}^{N}\alpha_{i}V_{i}]=
\sum_{i=1}^{N}\alpha_{i}E[V_{i}]=\bar{v}$ \\ \ \\
and the variance is : \\ \ \\
$Var(\hat{V})=E[(\sum_{i}\alpha_{i}V_{i}-\bar{v})^{2}] = E[(\sum_{i}\alpha_{i}(V_{i}-\bar{v}))^{2}]$\\
$=\sum_{i}\alpha_{i}^{2}E(V_{i}-\bar{v})^{2}+\sum_{i\neq %%@
j}\alpha_{i}\alpha_{j}E[(V_{i}-\bar{v})(V_{j}-\bar{v})]$\\ \ \\
for $i \neq j:$ \\ 
$E[(V_{i}-\bar{v})(V_{j}-\bar{v})] = E(V_{i}-\bar{v})E(V_{j}-\bar{v}) $ \\ 
since sample $i$ is independent of sample $j$. We know that $ E(V_{i}-\bar{v}) = 0 $ for all $i$, so : \\ \ \\
$Var(\hat{V})=\sum_{i}\alpha_{i}^{2} E(V_{i}-\bar{v})^{2}=\sum_{i}\alpha_{i}^{2}\sigma^{2}$, \\ \ \\
where $\sigma^{2}$ is the variance of a single variable.\\ \ \\
If $\lim_{N \rightarrow \infty} \sum_{i}\alpha_{i}^{2} = 0$ then $\lim_{N \rightarrow \infty} Var(\hat{V_{N}}) = 0$.

\topic 
{Stochastic Models}{}
Generally, we would like to solve systems of the form $H\vec{r}=\vec{r}$
by finding a fix point of the operator $H$ where $H:R^{n}\rightarrow R^{n}$.\\
One way of solving is to perform iterations of the form $r_{n+1}\leftarrow Hr_{n}$.\\
Note that if \{$r_{n}$\} has a limit such that $r_{n}\longrightarrow r^{*}$
and $H$ is continuous around $r^{*}$ then $Hr^{*}=r^{*}$.\\ \ \\
Remark : if $H$ is a contracting operator then we showed that such a limit always exists.\\ \ \\
An equivalent way for iteration is : \\ \ \\
$r_{n+1} \leftarrow (1-\alpha)r_{n}+\alpha Hr_{n}$ \\ \ \\ 
which has the same convergence property. \\ \ \\
Let us assume that $H$ is not known, or hard to compute. We can replace $H$ by a sample of the form $S=Hr+W$, where $W$ is the sample's "noise" and $E(W)=0$.\\ \ \\
Such an $S$ can be given by simulation of the system or by a random experiment. We can use $S$ instead of $Hr$ and get the following iterative algorithm : \\ \ \\
$r_{n+1} \leftarrow (1-\alpha)r_{n}+\alpha(Hr_{n}+W_{n})$ \\ \ \\
Such an algorithm is called \em Stochastic Approximation. \em

\subtopic
{Using $\alpha$ to control the convergence rate}{}
Decreasing the parameter $\alpha$ causes the noise influence to decrease, but on the other hand the convergence is slower. Usually we will use $\alpha$-values such that: $\lim_{i \rightarrow \infty} \alpha_{i} = 0$. \\ \ \\

Let us look at one \em Typical Example: \em \\ \ \\

\begin{itemize}
\item Let V be a random variable dependent of the parameter $r$.
\item There exists a probability function : $P(V|r)$
\item A function $g(r,V)$
\end{itemize}
We would like to solve : $E_{V}[g(r,V)]=r$, where the expectation regards  $p(V|r)$. \\ \ \\
We can write iterations of the form : \\ \ \\
$r_{n+1} \leftarrow (1-\alpha)r_{n}+\alpha E[g(r_{n},V)].$ \\ \ \\
Computing the expected value can be complicated, but if $P(V|r)$ is known
and can be simulated or sampled, we can get samples ${\tilde{V}}$ and 
use them to evaluate $E_{V}[g(r,V)]$. For example we can sample $\tilde{V_{1}},...,\tilde{V_{k}}$ and compute 
$\frac{1}{k}\sum_{i=1}^{k} g(r_{n},\tilde{V_{i}})$ as the observed average.\\ \ \\
If $K$ is large, then the average will be close to its expected value $E_{V}[g(r,V)]$ and in such a case we should expect a similar behaviour. The other extrem is the case of $k=1$ (one sample), $\tilde{V_{n}}$ . \\ \ \\
$r_{n+1} \leftarrow (1-\alpha_{n})r_{n}+\alpha_{n} g(r_{n},\tilde{V_{n}})$ where $\tilde{V_{n}}$ is distributed by $p(v|r)$.\\ \ \\

This algorithm is called the \em Robbins-Morro algorithm \em. The iterations can be written as: \\ \ \\
$r_{n+1} \leftarrow (1-\alpha_{n})r_{n} + \alpha_{n} [E(g(r,V) + g(r,\tilde{V}) - E(g(r,V)]$ \\ \ \\
where we can define : \\
$Hr_{n} = E[g(r_{n},V)]$, \\
$W_{n}  = g(r,\tilde{V}) - E(g(r,V)$. (and $E[W_{n}] = 0$ by definition of $\tilde{V}$)\\
This implies that for each $s\in S$ we have, \\ \ \\
$r_{n+1}(s) \leftarrow (1-\alpha_{n})r_{n}(s) + \alpha_{n}(s) [(Hr_{n})(s) + %%@
W_{n}(s)]$.

\subtopic
{Choosing $\alpha_{n}(s)$}{}
Since : \\ \ \\
$|r_{n}(s)-r_{0}(s)| \leq \sum_{i} |r_{i+1}(s)-r_{i}(s)| \leq \sum_{i} 
\alpha_(i)(s) |(Hr_{i})(s)-r_{i}(s)+W_{i}(s)| \leq A \sum_{i=1}^{n} 
\alpha_{i}(s)$\\ 
(where A is a constant that bounds $|(Hr_{i})(s)-r_{i}(s)+W_{i}(s)|$ ).\\ \ \\
We need to require that $\sum_{i} \alpha_{i}(s) = \infty$ : $  \forall s\in S$. Otherwise, we have to assume that the distance $||r^{*} - r_{0}||$ is bounded. The above condition insures that any starting point converges to the optimal value.\\ \ \\
We add a condition to insure that the converge rate is fast enough :\\ \ \\
$\forall S\in S$ : $\sum_{i} \alpha_{i}^{2} < \infty $\\ \ \\
One simple choice could be : $\alpha_{i}(s) = \frac {1}{i}$

\subtopic{Evaluating the average}{}
Let $V_{1},...,V_{n}$ be i.i.d. random variables with expected value of
$\bar{v} = E[V_{i}]$.\\
We would like to solve the system $E[V]=r$ (this is a special case of the 
earlier discussion in which $g(V,r)=V$). We will use a single sample $V_{n}$ and get : \\ \ \\ 
$r_{n+1} \leftarrow (1-\alpha_{n})r_{n} + \alpha_{n} V_{n}$ \\ \ \\
if $r_{n} \longrightarrow r^{*}$ then $\bar{v} = E[V] = r^{*}$.\\ \ \\
We compute the variance as follows :\\ \ \\
$Var(r_{n+1}) = (1-\alpha_{n})^{2}Var(r_{n}) + \alpha_{n}^{2}Var(V_{n})$.\\ \ \\
We show that if $\sum_{n} \alpha_{n} = \infty$ and $\alpha_{n} 
\longrightarrow 0$ then $Var(r_{n+1}) \longrightarrow 0$. The following is a general Lemma that is used and is shown later.\\

\begin{lemma}
Let {$\delta_{t}$} be a series such that $\delta_{t} \longrightarrow 0, %%@
\sum_{t} \delta_{t} = \infty, 0 \leq \delta_{t} \leq 1$.
and Let {$e_{t}$} be a series such that $e_{t} \geq 0$ and : 
$e_{t+1} \leq (1-\delta_{t})e_{t} + c\delta_{t}^{2}$.
Then $e_{t} \longrightarrow 0$.
\end{lemma}
\begin{proof}
\em step 1: \em we show that for every constant $\epsilon$, 
$e_{t}<\epsilon$  for an infinite number of $t$'s.\\
Let us assume (by contradiction) that there exists $T$ such that for all 
$t>T$, $e_{t} > \epsilon$ and $c\delta_{t} \leq \epsilon/2$. (since 
$\delta_{t} \longrightarrow 0$). In such a case we have\\ \ \\
$e_{t+1} \leq (1-\delta_{t})e_{t} + c\delta_{t}^{2} = 
e_{t} - \delta_{t}e_{t} + c\delta_{t}^{2} \leq 
e_{t} - \delta_{t}\epsilon + \delta_{t}\epsilon/2 =
e_{t} - \delta_{t}\epsilon/2$. \\ \ \\
Hence :\\ \ \\
$e_{m} \leq e_{T} - \epsilon/2 * \sum_{t=1}^{m-1}\delta_{t}$.\\ \ \\
Since for any m we have $e_{m} \geq 0$ ,then it has to be the case that $\sum_{i=0}^{\infty} \delta_{t} < \infty$, which is a contradiction to the assumption in the lemma.\\ 
Therefore, for any $\epsilon > 0$, there exists $T$ such that $\forall t>T : 
c\delta_{t} < \epsilon$ and $e_{T} \leq \epsilon$.\\ \ \\
\em step 2: \em we show that $\forall t>T : e_{t} \leq \epsilon$.\\
$e_{T+1} \leq (1-\delta_{T})e_{T} + c\delta_{T}^{2} \leq 
(1-\delta_{T})\epsilon + \epsilon \delta_{T} \leq
\epsilon$\\ \ \\
Under the hypothesis that $e_{T} \leq \epsilon$ we showed that $e_{T+1} \leq \epsilon$, this implies that $\forall t>T : e_{t} \leq \epsilon$.
\end{proof}

So the variance could be expressed as :\\ \ \\
$Var(r_{n+1}) = (1-\alpha_{n})^{2}Var(r_{n}) + \alpha_{n}^{2}Var(V_{n}) \leq
(1-\alpha_{n})Var(r_{n}) + \alpha_{n}^{2}Var(V_{n})$\\ \ \\
we can substitute :\\
\begin{itemize}
\item $ \alpha_{n} \equiv \delta_{n}$
\item $C \equiv Var(V_{n})$
\item $e_{n} \equiv Var(r_{n})$
\end{itemize}

Since $\sum_{n} \alpha_{n} = \infty $ and $\alpha_{n} \longrightarrow 0$, we have that $Var(V_{n}) \longrightarrow 0$.\\ \ \\
Remark : the condition $\sum_{n} \alpha_{n}^{2} \longrightarrow 0$ will %%@
insure convergence with probability one. \\ \ \\
We say that $\lim_{n \rightarrow \infty}X_{n}=X$ with probability one if and only if $lim_{n \rightarrow \infty}\sup_{n \leq m}|X_{m}=X|=0$. 

\topic{Evaluating Policy Reward}{}
Let $\pi$ be a policy.
We would like to calculate the reward of policy $\pi$ : $V^{\pi}(s)$.\\ \ \\
For simplicity, we assume that there exist a state $s_{0}$ in the MDP, such that $s_{0}$ has a reward of 0, and $Prob(s_{0}|s_{0},a) = 1$ : $ \forall a \in 
A_{s_{0}}$.\\
Also, we assume that each policy reachs state $s_{0}$ within a finite number of steps with probability 1.\\
Under these assumptions, we can assume each run is finite.

\subtopic{The Naive Approach}{}
for each $s \in S$ :\\
Run $\pi$ from s for m times, where the i-th run is $T_{i}$.\\
Let $r_{i}$ be the reward of $T_{i}$.\\
Estimate the reward of policy $\pi$ starting from s, by :\\ \ \\
$\hat{V^{\pi}}(s) = Avg(r_{i})= \frac{1}{m}\sum_{i=1}^{m}r_{i}$\\ \ \\
The variables $r_{i}$ are independent since the runs $T_{i}$ are independent. By Chernoff's theorem : \\ \ \\
$Prob[|\hat{V^{\pi}(s)}-V^{\pi}(s)|\geq \gamma]\leq 2e^{-2m\gamma^{2}}$ \\ \ \\
This implies that :\\ \ \\
$Prob[\exists s \in S : |\hat{V^{\pi}(s)}-V^{\pi}(s)| \geq \gamma] \leq \sum_{s} Prob[|\hat{V^{\pi}(s)}-V^{\pi}(s)| \geq \gamma] \leq |S| 2e^{-2m\gamma^{2}} = \epsilon $.\\ \ \\
for $\gamma = \sqrt{\frac{ln(|S|)/\epsilon}{2m}}$ the above holds.\\

\subtopic{First Visit}{}
We can improve the approximations by updating rewards of many states in each %%@
single run. 

for each $s \in S$ :
\begin{enumerate}
\item Run $\pi$ from s for m times, where the i-th run is $T_{i}$.\\ 
\item For each run $T_{i}$ and state s in it, let $r(s,T_{i})$ be the reward %%@
of $\pi$ in run $T_{i}$ from the first appearance of s in $T_{i}$ until the %%@
run ends (reaching state $s_{0}$).
\item Let the reward of policy $\pi$ starting from s be :
$\hat{V^{\pi}}(s) = Avg_{s \in T_{i}}(r(s,T_{i}))$
\end{enumerate}

The random variables $r(s,T_{i})$ for a given state s and different $T_{i}$'s, are independent %%@
since different runs are independent. The improvement is based on increasing the number of samples without %%@
increasing the number of runs, so we have a smaller estimation-error.

\subtopic{Every Visit}{}
We can hope to improve the approximations by using every appearance of state s in %%@
each single run. 

for each $s \in S$ :
\begin{enumerate}
\item Run $\pi$ from s for m times, where the i-th run is $T_{i}$.\\
\item For each run $T_{i}$ and state s in it, let $r(s,T_{i},j)$ be the %%@
reward of $\pi$ in run $T_{i}$ from the j-th appearance of s in $T_{i}$ %%@
until the run ends (reaching state $s_{0}$).
\item Let the reward of policy $\pi$ starting from s be :
$\hat{V^{\pi}}(s) = Avg(r(s,T_{i},j))$
\end{enumerate}

The problem is that the random variables $r(s,T_{i},j)$ are \em dependent \em for different $j$'s.

\subtopic{Example}{}

\begin{figure}\label{L9:FIGURE} \psfig{file=figure1.ps ,width=6in ,clip=}
\caption{Example Diagram}
\end{figure}

Consider the MDP in the Figure 7.1. Let $N$ be the length of the run. 
We choose $\gamma = 1$ (undiscounted). 
It is easy to see that, \\ \ \\
$V^{\pi}(s_{1}) = \frac{1}{p} = E[N]$,\\ \ \\
since the distribution is geometric ("success" with probability $p$ when 
moving to $s_{0}$). However note that $r(s_{1},T,j) \geq r(s_{1},T,j+1)$, which implies that $r(s_{1},T,j)$ are independent.\\
The return from the k-th appearance of $S_{1}$ until the end of the run is 
$N-k$ (there are N steps in the run, each of them with reward 1 and the run 
ends the first time we get to $S_{0}$).\\
The average is $\frac{1}{N} \sum_{k=0}^{N-1}(N-k) = \frac{1}{N}\frac{N(N+1)}{2} = \frac{N+1}{2}$ \\
Hence $E[\frac{N+1}{2}]=\frac{\frac{1}{p}+1}{2}=\frac{1+p}{2p}$.\\
Note that this is different from $E[V^{\pi}(s_{1})]=\frac{1}{p}$.\\
The difference (bias) is because the random variables are dependent.\\ \ \\

\em Error Calculations \em \\ \ \\
The fact that \em Every Visit \em is biased does not implies that it is inferier to \em First Visit \em, 
as an estimate. Consider using a single run to estimate the return in the above example. \\ \ \\
\begin{itemize}
\item First Visit : \\ \ \\
$Var(\hat{V_{F}}(S_{1}))=E[N^{2}]-E^{2}[N]=\frac{1}{p^{2}}-\frac{1}{p}$ \\ \ \\
Note that the variance in this case can be bigger than the expected value 
when p is small. for example if p=0.1 then the expected value is 10 and 
variance is 90. 
\item Every Visit : \\ \ \\
$E[(\frac{N+1}{2}-\frac{1}{p})^{2}]=$
$\frac{1}{2p^{2}}-\frac{3}{4p}+\frac{1}{4}$
\end{itemize}
For each $p<1$, the expected square error in \em Every Visit \em is smaller than in \em First Visit \em,
meaning that Every Visit creates a biased estimation but it's variance is %%@
much smaller. The reason is that \em Every Visit \em contains many more samples. 
Note that in \em Every Visit \em the average is computed on all the runs, and not %%@
within the runs.

\end{flushleft}
\end{document}