\documentclass[10pt]{article}
\newtheorem{define}{Definition}
\usepackage{amsmath}
\usepackage{amssymb}
%\newcommand{\Z}{{\mathbb{Z}}}
%\usepackage{psfig}

\oddsidemargin=0.15in
\evensidemargin=0.15in
\topmargin=-.5in
\textheight=9in
\textwidth=6.25in

\newcommand{\RP}{{\rm RP}}
\newcommand{\BPP}{{\rm BPP}}
\newcommand{\Adv}{{\rm Adv}}
\newcommand{\vep}{\varepsilon}

\begin{document}
\input{preamble.tex}

\lecture{13}{June 17, 2009}{Ronitt Rubinfeld}{Cagan Roy, Estrin Michael}

\section{Overview}

\subsection{Last Lecture: Boosting Weak Learners Into Strong Learners}
In the last lecture, we showed that if any concept class can be weakly learned, then it can also be strongly learned. Our proof used a Boosting Algorithm that could boost any weak PAC-learner (an algorithm that could learn to do slightly better than average) into a strong PAC-learner (an algorithm that could learn to do a lot better than average).

\subsection{This Lecture: Hard Functions, Uniformity, and Derandomization}
The Boosting Algorithm we used was based on what was originally a complexity theory result. (It took the Machine Learning theorists a while to realize that it gave them a boosting algorithm.)

In this lecture, we mention the original complexity result (on the existence of a ``hard" set of inputs). We use it to prove Yao's XOR Lemma, an analogous theorem for boosting hardness of functions: from any function that is \emph{slightly}-hard-to-compute using small circuits, we can produce a \emph{very}-hard-to-compute function.

\subsection{Notations (reminder)}
\begin{enumerate}
  \item \(
      R_c(x) = \left\{ \begin{array}{c l}
          +1 & \mbox{if $f(x) = c(x)$} \\
          -1 & \mbox{o.w.}\\
      \end{array} \right. \)
    \quad gives $+1$ if (weak) hypothesis $c$ is right on example $x$
  \item $N_i(x) = \sum_{1 \le j \le i} R_{c_j}(x)$ \quad is the number
    of right $c$'s exceeding the wrong ones (i.e. \#right-\#wrong)
  \item \(
    M_i(x) = \left\{ \begin{array}{c l}
        1 & \mbox{if $N_i(x) \le 0$} \\
        0 & \mbox{if $N_i(x) \ge \frac{1}{\vep \gamma}$} \\
        1 - \vep \gamma N_i(x) & \mbox{o.w.}
      \end{array} \right. \)
    \quad \\is a ``Probabilty filter keeps sample x''- a ``measure'' which upper bounds the error of hypothesis $c=\mbox{Maj}(c_1,\ldots,c_i)$ on example $x$.
  \item $|M_i| = \sum_x M_i(x)$ \qquad is the total ``mass'' of all examples according to ``measure'' $M_i$.
  \item $\mu(Mi) = \frac{|M_i|}{2^n}$\ is the normalized size of the ``measure''
    given $M_i$.
  \item $error(c_{i+1})\equiv Pr_{x\in uniform}\left[c(x) \neq f(x)\right] \leq \frac{|M_i|}{2^n}$
  \item $D_{M_i}(x) = \frac{M_i(x)}{|M_i|}$ \qquad is a distribution over $x$
  \item $\Adv_c(M) = \sum_x R_c(x)M(x)$ \qquad is the advantage of $c$ on $M$ which gives an indication on the number of inputs for which $c$ is correct. (Random guessing gives $0$.)\\\\
  Note: $\Adv_c(M) \ge \gamma|M|$ iff $\Pr_{x \in D_M}[c(x)=f(x)] \ge \frac{1}{2} + \frac{\gamma}{2}$
  \label{item:adv}
\end{enumerate}

\subsection{Non-Uniform Complexity Classes}

\begin{definition}
Let $C$ be a class of languages, and take any function $a: \mathbb N \rightarrow \mathbb N$. The class \emph{$C$ with advice $a$, $C/a$}, is defined as the set of languages $L$ such that $L \in C/a$ if and only if there is some $L' \in C$ such that for all $n$, there exists $\alpha_n \in \{0, 1\}^*$ with $|\alpha_n| \le a(n)$, and $x \in L$ if and only if $(x, \alpha_{|x|}) \in L'$.
\end{definition}



Example: ${\rm P}/{\rm poly} = \bigcup {\rm P}/n^c$ - the class of languages recognizable in polynomial time with polynomial-sized advice. It can be shown that ${\rm P}/\poly$ is also the set of languages computable by a polynomial-sized (non-uniform) circuit. (The circuits correspond exactly to the advice.)

It can be shown that $\RP \subseteq {\rm P}/\poly$ and $\BPP \subseteq {\rm P}/\poly$.

\section{Yao's XOR Lemma}

\subsection{Goal}
We would like to take a problem, which is hard on some inputs, and make it hard for all inputs.

\subsection{Definition of Hard}

\begin{definition}
A function $f: \{\pm 1\}^n \rightarrow \{\pm 1\}$ is \emph{$\delta$-hard on distribution $D$ for size $g$} if for any Boolean circuit\, $C$ with at most $g$ gates\, $\Pr_{x \in_D \{\pm 1\}^n}[C(x) = f(x)] \le 1 - \delta$.
\end{definition}

In other words, $f$ is $\delta$-hard if there is some $\delta$-fraction of the inputs that is hard to compute.

\begin{definition}
Let $M$ be a measure. If $\Adv_C(M) < \epsilon|M|$ for any circuit $C$ of size $g$, we say that $f$ is \emph{$\epsilon$-hard-core on $M$ for size $g$}.
\end{definition}

(Note that if $M$ is the characteristic function of a set $S$, then $Adv_C(M)$ \verb ~ \ fraction of inputs on which a circuit of size $g$ is correct)

\begin{theorem}[Impagliazzo Hard-Core Set Theorem]\label{hardcore_set_theorem}
Let $f$ be $\delta$-hard for size $g$ on the uniform distribution on $n$-bit strings, and let $0 < \epsilon < 1$. Then:
\begin{enumerate}
  \item There is a measure $M$ with $\mu(M) \ge \delta$ such that $f$ is $\epsilon$-hard-core on $M$ for size $g' = \frac{\epsilon^2 \delta^2 g}{4}$
  \item There is a $2\epsilon$-hard-core set $S$ for $f$, for size $g'$ and $|S| \ge \delta 2^n$
\end{enumerate}
\end{theorem}

\begin{proof-idea}
\begin{enumerate}
  \item Assume not true. Then, for every measure $M$ with $\mu(M) \ge \delta$, there is a circuit $C$ of size $g'$ that does well ($\Adv_C(M) \ge \epsilon|M|$)
      The existence of such a circuit (a "weak learner") implies the existence of a "strong learner", which is a circuit built by pasting together the "weak learner" outputs to get a circuit of size $\le g$, which is correct on $> 1 - \delta$ fraction of the inputs.
      The existence of this "strong learner" is a contradiction to the assumption that $f$ is $\delta$-hard for size $g$.

  \item The number of circuits of size $g'$ is $\ll \frac{e^{2^n \epsilon^2 \delta^2 /2}}{4}$.
  Pick S randomly from M (on $D_m$) and use the Chernoff bound to show that Pr[any circuit C of size $g'$ has good advantage] is small.
\end{enumerate}
\end{proof-idea}

\subsection{Yao's XOR Lemma}

We now state Yao's XOR Lemma, which says that for any hard function $f$, the XOR of many copies of $f$ is even harder. Intuitively, this is true because computing XOR requires knowing either the value of $f$ at each input or how many times we are wrong.

\begin{definition}
For any function $f$, define a new function $$f^{\oplus k}(x_1, \ldots, x_k) = f(x_1) \oplus f(x_2) \cdots \oplus f(x_k)$$
\end{definition}

\begin{theorem}[Yao's XOR Lemma]\label{yaos_xor_lemma}
Assume $f$ is $2\epsilon'$-hard-core for size $g'$ on a set $H$ s.t. $|H| \ge \delta 2^n$. Then $f^{\oplus k}$ is $2(\epsilon' + (1-\delta)^k)$-hard-core over $\{\pm 1\}^n$ for size $g'$ according to the uniform distribution.
\end{theorem}

\begin{proof}Suppose that $f$ is $2\epsilon'$-hard-core on a set $H$, $|H| \ge \delta 2^n$, for size $g'$. For contradiction, assume also that there exists a circuit $C$ of size at most $g'$ which satisfies\\
$\Pr_{x_1, \ldots, x_k \in \{\pm 1\}^n}[C(x_1, \ldots, x_k) = f^{\oplus k}(x_1, \ldots, x_k)] \ge \frac{1}{2} + \epsilon' + (1-\delta)^k.$\\

\underline{Plan:} For all $H$ s.t. $|H| \ge \delta 2^n$, use $C$ to produce a new circuit $C'$ that computes $f$ such that $|C'| \leq g'$ and
$\Pr_{x \in H}[C'(x) = f(x)] \ge \frac{1}{2} + \epsilon',$
contradicting our assumption that $f$ is $2\epsilon'$-hard-core on $H$.\\

\underline{Implementation:} Given $H$ such that $|H| > \delta 2^n$, construct $C'$ :\\
Let $A_m$ denote the event that exactly $m$ of $x_1, \ldots, x_k$ are in $H$. (Exactly $m$ of the $x_i$ are in the ``hard part'' and exactly $k-m$ are in the ``easy part''.) Then the probability that none of the $x_i$ are in $H$ is at most
$\Pr[A_0] = \Pr[\mbox{none of the $x_i$ are in $H$}] \le (1-\delta)^k,$\\
So: $\Pr_{x_1, \ldots, x_k}[C(x_1, \ldots, x_k) = f^{\oplus k}(x_1, \ldots, x_k) | A_m \mbox{ for some } m > 0] \ge \frac{1}{2} + \epsilon'.$

This implies that there exists $m > 0$ s.t.
$\Pr_{x_1, \ldots, x_k}[C(x_1, \ldots, x_k) = f^{\oplus k}(x_1, \ldots, x_k) | A_{m}] \ge \frac{1}{2} + \epsilon'.$

Now suppose we are given some random $x \in H$, we will compute $f(x)$ as follows:
\begin{enumerate}
\item Pick $x_1, \ldots, x_{m-1} \in_R H$
\item Pick $y_{m+1}, \ldots, y_k \in_R \overline{H}$
\item Permute $(x_1, \ldots, x_{m-1},x,y_{m+1}, \ldots, y_k)$ via $\pi$ - a random permutation on $k$ elements
\item Xor result with "known" bit - Gives circuit of size $\leq g$
\end{enumerate}

So we have:
\begin{equation*}
\Pr_{x, x_i\textit{'s}, y_j\textit{'s}, \pi}[C(\pi(x_1, \ldots, x_{m-1}, x, y_{m+1}, \ldots, y_k)) = f^{\oplus k}(x_1, \ldots, x_{m-1}, x, y_{m+1}, \ldots, y_k)] \ge \frac{1}{2} + \epsilon'
\end{equation*}

Using an averaging argument, this means there is a choice of $x_1, \ldots, x_{m-1}, y_{m+1}, \ldots, y_k, \pi$ such that
\begin{equation*}
\Pr_x[C(\pi(x_1, \ldots, x_{m-1}, x, y_{m+1}, \ldots, y_k)) = f^{\oplus k}(x_1, \ldots, x_{m-1}, x, y_{m+1}, \ldots, y_k)] \ge \frac{1}{2} + \epsilon'
\end{equation*}

Notice that $f^{\oplus k}(x_1, \ldots, x_{m-1}, x, y_{m+1}, \ldots, y_k)] = f(x) + [\bigoplus f(x_i) + \bigoplus f(y_j)]$\\

Thus, we have a circuit $C'$ of size at most $g'$ such that $\Pr_x[C'(x) = f(x)] \ge \frac{1}{2} + \epsilon'$, which is a contradiction to the assumption that $f$ is hard-core for size $g'$ on the set $H$.
\end{proof}

\end{document}

