\documentclass[10pt]{article}
\newtheorem{define}{Definition}
%\newcommand{\Z}{{\mathbb{Z}}}
%\usepackage{psfig}
\usepackage{amssymb,amsmath}

\oddsidemargin=0.15in
\evensidemargin=0.15in
\topmargin=-.5in
\textheight=9in
\textwidth=6.25in

% XXX some of these (e.g., \randomlychosenfrom) should be moved to preamble.tex
\input{preamble.tex}
%\newtheorem{example}[theorem]{Example}
%\newenvironment{example}{\noindent{\bf Example}\hspace*{1em} \begin{quotation}}{\end{quotation}\bigskip}
% TODO: s/stepcounter/refstepcounter/
\newenvironment{example}[1]{\paragraph{Example~\thetheorem: #1}\stepcounter{theorem}}{} % reuse the 'theorem' counter
\newenvironment{idea}[1]{\paragraph{Idea~\thetheorem: #1}\stepcounter{theorem}}{} % reuse the 'theorem' counter
\def\randomlychosenfrom{\in_{\mathrm{R}}}
\def\Prob#1#2{\mathop{\mathrm{Prob}}\limits_{#1}\left[#2\right]}
\begin{document}
\def\bigOmega{\Omega}
\def\reject{\textsf{reject}}
\def\accept{\textsf{accept}}
\def\pass{\textsf{pass}}
\def\fail{\textsf{fail}}

\lecture{1}{Oct 19, 2009}{Ronitt Rubinfeld}{Daniel Shahaf}

%%%% body goes in here %%%%

%Blah Blah Blah

\section{Sublinear-time algorithms: motivation}

Twenty years ago, there was practically no investigation of sublinear-time
algorithms.  
%(An exception was weather scientist whose sensors had accumulated
%data in rapid quantities.)  
One reason for this was that it wasn't that common
to have extremely large datasets.  Today, however, datasets of many different types
may be so large that a linear-time algorithm would
take ridiculously long.  At other times, time constraints require us to make
a decision too swiftly than to allow for examining the input in its
entirety.\footnote{Another reason for interest
is the human quality of laziness: a quick
answer that requires neither reading much input nor much computations appeals
to many.}

In first part of the course we will model the input as written down somewhere
such that we have query access to (i.e., for any $i$ we can query for the $i$th bit
of the input).  Towards the end of the course, we will consider
a different model where our inputs consists of samples taken from an unknown
distribution.

Running in sublinear-time precludes us from reading the entire input;
therefore, we will typically use sampling.  Though sometimes we will use
straightforward sampling, many of our algorithms will use more intricate
algorithmic techniques combined with sampling.

Being sublinear-time will, in most cases, force us to use randomness in our
algorithms and limit us to only hope for an approximate answer (in many cases
getting a non-approximate answer requires reading the input fully).  The next
example is the only deterministic algorithm we will see in this course.


\section{Examples}

\subsection{A deterministic algorithm}

\begin{example}{point-set diameter}
    \def\hatd{{\hat d}}~

    \begin{description}
        \item[Given]
        An $m \times m$ distance matrix $d$.

        We assume the matrix is symmetric and satisfies the triangle inequality.

        \item[Goal]
        Compute the diameter $\hatd \eqdef max_{u,v} d(u,v)$.

        \item[Algorithm]~ 
        \begin{itemize}
            \item Pick an arbitrary point~$x$.
            \item Find the point~$y$ farthest from~$x$.
            \item Output $z \eqdef d(x,y)$.
        \end{itemize}

        \item[Analysis]~

        \begin{claim}
            The algorithm's time complexity is sublinear.
        \end{claim}
        \begin{proof}
            The algorithm's time complexity 
            is $\bigO(m)$, i.e., $\bigO(\sqrt{\mbox{input size}})$.
        \end{proof}

        \begin{claim}
            The algorithm is a (multiplicative) 2-approximation
            algorithm for the diameter problem; that is,
            $\hatd/2 \le z \le \hatd$.
        \end{claim}
        \begin{proof}
            The right inequality is trivial.  We show the left one.

            Fix two points $a,b$ such that the diameter is $\hatd = d(a,b)$.  Then
            \begin{displaymath}
                \hatd = d(a,b) \le d(a,x) + d(x,b) \le d(x,a) + d(x,b) \le d(x,y) + d(x,y) = 2z.
            \end{displaymath}
        \end{proof}

    \end{description}
\end{example}

\subsection{A decision problem}

As we mentioned before, most sublinear algorithms must output some sort of
approximation.  In this example, we discuss a type of approximation that makes
sense for outputs of decision problems.


\begin{example}{sequence monotonicity, attempt~1}
    ~
    \begin{description}
        \item[Given]
        An ordered list $X_1, \dots, X_n$ of elements (with partial order `$\le$' on them).

        \item[Goal]
        Is the list monotone?  That is, is $X_1 \le \cdots \le X_n$?
    \end{description}
\end{example}

As stated, the goal requires looking at every single sequence element.  (If we
skip over even one of them, that one may be the only one breaking the
monotonicity.)  Therefore we relax the problem:

\begin{example}{sequence monotonicity, attempt~2}
    ~
    \begin{description}
        \item[Given]
        An ordered list $X_1, \dots, X_n$ of elements (with partial order `$\le$' on them)
        and a real fraction $\epsilon \in [0,1]$.

        \item[Goal]
        Is the list \emph{close to monotone}?

        (We will say that a list is \emph{$\epsilon$-close to monotone} if it
        has a monotone subsequence of length $(1-\epsilon)n$.)

        \item[Required behavior]
        We require a 2-sided (BPP) error:
        \begin{itemize}
            \item If the list is monotone,
            the test should \pass{} with probability $3/4$.

            \item If the list is $\epsilon$-far from monotone,
            the test should \fail{} with probability $3/4$.
        \end{itemize}

        \begin{remark}
            The choice of $3/4$ is arbitrary; any constant bounded away
            from~$1/2$ works equally well.  We can amplify the definition from
            our constant to a different constant $1-\beta$ by repeating our
            algorithm $\bigO(\log\frac{1}{\beta})$ times and taking the majority
            answer.
        \end{remark}

        \begin{remark}
            The behavior of the test on inputs that are very close to monotone,
            but are not monotone, is undefined.  (Those inputs are
            $\epsilon'$-close with $0 \lneq \epsilon' \le \epsilon$.)  This
            makes sense because those inputs are `almost' monotone --- so we
            allow ourselves the latitude to treat them as if they were monotone
            --- while, all in all, they are \emph{not} monotone, and declaring
            them as such is correct.
        \end{remark}
    \end{description}

    Here are a few algorithmic ideas that one might try to base a tester upon:
    \begin{idea}{}
        Pick $i < j$ randomly and test $x_i < x_j$.

        We will show that this idea's complexity is $\bigOmega(\sqrt n)$.

        Fix some constant $c$, and consider the following sequence:
        \begin{displaymath}
            \underbrace{c, c-1, \dots, 1},
            \underbrace{2c, 2c-1, \dots, c+1},
            \dots, \dots, \dots,
            \underbrace{n, n-1, \dots, n-c+1}.
        \end{displaymath}
        The longest monotone subsequence has length $n/c$ (we can't pick twice
        from the same `group' since each group is monotonically decreasing) ---
        relatively small, so we would like this sequence to fail the test.
        We can see, however, that the test passes whenever it picks $i,j$ from different groups.

        The following can be shown:

        If the test is repeated by repeatedly picking new pairs $i,j$, each
        time discarding the old pair, and checking each such pair independently
        of the others, then $\bigOmega(n)$ pairs are needed.  However, if the
        test is repeated by picking $k$ indices and checking whether the
        subsequence induced by them is monotone, then $\Theta(\sqrt{n/c})$
        samples are needed (using the Birthday Paradox).
        We will see that we can do much better.
    \end{idea}

    \begin{idea}{}
        Pick $i$ randomly and test $x_i \le x_{i+1}$.

        Fix some constant $c$, and consider the following sequence (of $n$~elements):
        \begin{displaymath}
            \underbrace{1, 2, \dots, n/c},
            \underbrace{1, 2, \dots, n/c},
            \dots, 
            \underbrace{1, 2, \dots, n/c}.
        \end{displaymath}
        Again, the longest monotone subsequence has length $c + \frac{n}{c} - 1$ ---
        relatively small, so we would like this sequence to fail the test.
        However, the test passes unless the $i$ it picks is a ``border point''
        (i.e., unless $X_i = n/c$), which happens with probability $c/n$.
        % XXX expand?
        Therefore we expect to require a linear number of samples before
        detecting an input that should be rejected.
    \end{idea}

    \begin{idea}{}
        Combine the previous two ideas.

        This would verify that the sequence is locally monotone, and also
        monotone at large distances, but would not verify that it is monotone in
        middle-range gaps.  And counter-examples can be found.  However, there
        exists a correct, $\bigO(\log n)$-samples algorithms that works by
        testing pairs at various distances $1, 2, 4, 8, \dots, 2^k, \dots, n/2$.
    \end{idea}
    
    Before giving an algorithm, we make the following assumption.
    \begin{assumption}
        $X_i$ are pairwise distinct.
    \end{assumption}
    \begin{proof}
        Mentally replace each $X_i$ by the tuple $(X_i, i)$ and use dictionary
        order: to compare `$(X_i, i)$' to `$(X_j, j)$', compare the first
        coordinate and use the second coordinate to break ties.
    \end{proof}
    \begin{remark}
        This trick does not hamper the sublinearity of the algorithm because it
        does not require any pre-processing; the transformation can be done on
        the fly as each element is accessed and compared.
    \end{remark}

    \paragraph{Notation}
    \begin{description}
        \item[{`$[n]$'}]
        denotes the set $\set{1, 2, \dots, n}$ of positive integers.
        \item[`$\randomlychosenfrom$']
        denotes assignment of a random member of the set on its RHS to the
        variable on its LHS.  If the distribution is not specified, it is the
        uniform distribution.
    \end{description}
    For example, `$x \randomlychosenfrom [3]$' assigns to $x$ one of the
    three smallest positive integers, chosen uniformly.

    \paragraph{Algorithm}
    \begin{itemize}
        \item Repeat $\bigO(1/\epsilon)$ times:
        \begin{itemize}
            \item Pick $i \randomlychosenfrom [n]$.
            \item Query (obtain) the value $X_i$.
            \item Do binary search for $X_i$.
            \item If either
            \begin{itemize}
                \item an inconsistency was found during the binary search;
                \item $X_i$ was not found;
            \end{itemize}
            then return \fail{}.
        \end{itemize}
        \item Return \pass{}.
    \end{itemize}
    \subparagraph{Inconsistency}
    By an `inconsistency' we mean the following: during the binary search, we
    maintain an interval of allowed values for the next value we query.  The
    interval starts as `$[-\infty, +\infty]$.  Its upper and lower bounds are
    updated whenever we take a step to the left (towards smaller elements) or to
    the right (towards larger elements), respectively.  Whenever we query
    a value we assert that it is in the interval and raise an inconsistency if
    it isn't.

    \subparagraph{Time complexity}
    This algorithm's time complexity is $\bigO(\frac{1}{\epsilon} \log n)$,
    since the augmented binary search and the choosing of a random index cost
    $\bigO(\log n)$ steps each; and those are repeated $\bigO(1/\epsilon)$ times.

    \subparagraph{Correctness}
    We will now show that the algorithm satisfies the required behavior.  We
    will define which indices are `good' and relate the number of bad indices
    to the length of a monotone sequence of elements at good indices.

    \begin{definition}
        An index $i$ is \emph{good} if augmented binary search for $i$ is
        successful (does not detect an inconsistency).
    \end{definition}

    \begin{observation}
        If ${} \ge \epsilon n$ indices are bad, then
        $\Prob{}{\pass} < 1/4$.
    \end{observation}
    \begin{proof}
        Let $c$ be the constant under the `$\bigO(1/\epsilon)$ repetitions'
        clause.  Then
        \begin{equation}\label{eq:pass:lt:quarter}
            \Prob{}{\pass} \le (1-\epsilon)^{c/\epsilon} \le (1/\epsilon)^c < \frac14,
        \end{equation}
        where the last (strict) inequality follows by setting $c$ to a large
        enough (constant) value.
    \end{proof}
    
    \begin{theorem}
        The above algorithm has 2-sided error less than one quarter: it accepts
        good inputs with probability~$1$ and rejects bad inputs with
        probability at least $3/4$.
    \end{theorem}
    \begin{proof}
        If the list is monotone, then it passes with certainty because the
        binary search works and the $X_i$ are assumed distinct.  It remains to
        prove that far-from-monotone lists are rejected with high likelihood.

        We prove the contrapositive: assuming that an input passes with
        probability~${}>1/4$, we will show that it is $\epsilon$-close.

        Let $X_1, \dots, X_n$ be accepted with probability~${}>1/4$.
        By equation~\eqref{eq:pass:lt:quarter}, the number of bad indices
        is~${} < \epsilon n$.  Therefore ${} \ge (1-\epsilon) n$ indices are
        good.

        \begin{claim}
            If we delete all elements at bad indices, the remaining sequence is
            monotone.
        \end{claim}
        \begin{proof}
            Let $i<j$ be two good indices.
            Consider the paths in the binary-search tree from the root to $i$
            and to $j$.  These two paths have some longest prefix common to both
            of them.  It suffices to show that $x_i \le z \le x_j$.

            There are two cases.  If the path to $x_i$ is a prefix of the path
            to $x_j$, then $x_i = z$ (they are the same node in the tree).
            Otherwise $x_i$ is a descendant of a $z$'s left or right child.
            Since $i$ is good, then $x_i$ must be a descendant of $z$'s
            left child; for the same reason $x_i$ must be smaller than~$z$.

            Therefore, $x_i \le z$ always.  By symmetry, $z \le x_j$.  Therefore
            $x_i \le x_j$.
        \end{proof}
        
        The theorem follows from the claim.
    \end{proof}

    \begin{remark}
        It is known that $\bigOmega\bigl((\log n)/\epsilon\bigr)$ samples is
        optimal.
    \end{remark}
\end{example}


\subsection{Another example}

\begin{example}{graph connectivity, attempt~1}
    ~
    \begin{description}
        \item[Given]
        A graph $G = (V,E)$ with $n = \abs{V}$ vertices and $m = \abs{E}$ edges
        having maximum degree at most~$d$ (we think of $d$ as a large constant).
        The graph is represented as an adjacency list.

        \item[Goal]
        Is the graph connected?
    \end{description}
\end{example}

As before, answering this question with no error requires examining the entire
graph: an example is the line graph $L_n$ (i.e., a cycle with one edge removed).
Therefore, we will have to compromise on the goal if we are limited to sublinear
time.

\begin{example}{graph connectivity, attempt~2}
    ~
    \begin{description}
        \item[Given]
        A graph $G = (V,E)$ with $n = \abs{V}$ vertices and $m = \abs{E}$ edges
        having maximum degree at most~$d$ (we think of $d$ as a large constant).
        The graph is represented as an adjacency list.

        \item[Goal]
        Is the graph \emph{close to connected}?

        We will say that a graph is \emph{$\epsilon$-close to connected} if it
        can be transformed into a connected graph by adding at most $\epsilon d n$
        edges.
        
        (An alternative definition exists, which allows adding or removing up to
        $\epsilon d n$ edges, but on the other hand requires the resulting graph
        to still have maximum degree at most~$d$.  For simplicity we will use
        the addition-only definition given in the previous paragraph.)

        \item[Required behavior]
        We require a 1-sided error:
        \begin{itemize}
            \item If the graph is connected,
            the test should \pass{} with probability $1$.

            \item If the graph is $\epsilon$-far from connected,
            the test should \fail{} with probability $3/4$.
        \end{itemize}
    \end{description}

    \begin{proof-idea}
        If a graph is $\epsilon$-far from connected
        $\Rightarrow$ it has many (${}\ge \epsilon n$) connected components
        $\Rightarrow$ many connected components are small
        $\Rightarrow$ many nodes are in small connected components.
    \end{proof-idea}

    \paragraph{Algorithm}
    \begin{enumerate}
        \item Choose $\bigO(1/\epsilon d)$ nodes.
        \item For each node $s$ of these, run a BFS\footnote{breadth-first
        search} (originating from it) until either:
        \begin{enumerate}
            \item\label{gc:bfs:big}
            ${}\ge 2/\epsilon d$ distinct nodes are discovered;

            \item\label{gc:bfs:small}
            $s$ is determined to belong to a connected component of size ${} \le 2/\epsilon d$ nodes.
        \end{enumerate}
        \item If \ref{gc:bfs:small} ever happens, \reject{} $G$ and halt.
        \item Otherwise, \accept{}.
    \end{enumerate}
    
    \subparagraph{Time complexity}
    The number of loops is $\bigO(1/\epsilon d)$.  Each BFS costs up to
    $\bigO(2/\epsilon d)$ steps.  During the BFS, neighbor determination at each
    node is done by iterating its adjacency list, which can have length up to~$d$.
    Therefore the time complexity is $\bigO\left( \frac{1}{\epsilon d} \cdot
    \frac{2}{\epsilon d} \cdot d \right) = \bigO(1/d\epsilon^{2})$.

    \begin{lemma}
        If $G$ is $\epsilon$-far from connected, then $G$ has ${} \ge \epsilon d n$
        connected components.
    \end{lemma}
    \begin{proof}
        $N$ connected components can be connected by adding $N-1$ edges.
    \end{proof}
    \par
    \begin{remark}
        This proof is trickier if we use the alternative (max-degree-respecting)
        definition of $\epsilon$-far.
    \end{remark}
    \begin{corollary}
        If $G$ is $\epsilon$-far from connected, then it has ${} \ge \epsilon d n / 2$
        connected components of size less than $2 / \epsilon d$.
    \end{corollary}
    \begin{proof}
        Since $G$ is $\epsilon$-far, it has $L > \epsilon d n$ connected
        components.  Let $\ell'$ be the number of connected components of size
        ${} < 2 / \epsilon d$ and $\ell$ be the number of connected components
        of size ${} \ge 2 / \epsilon d$.  So $\ell + \ell' = L$.

        Therefore, $\ell+\ell' \ge \epsilon d n$ (using the lemma).

        Also $\ell \cdot \frac{2}{\epsilon d} \le n$ (the LHS is the number of
        vertices in the connected components counted by~$\ell$), i.e.,
        $\ell \le \epsilon d n / 2$.

        The conclusion, $\ell' \ge \epsilon d n /2$, follows by combining the
        last two inequalities.
    \end{proof}

    \begin{corollary}
        The fraction of nodes in $V$ belonging to connected components smaller
        than $2 / \epsilon d$ is at least $\epsilon d / 2$.
    \end{corollary}
    \begin{proof}
        \def\CC{\mathcal{C}}
        For vertex $u \in V$, let $\CC(u)$ denote the connected component of $u$
        and let $S(u)$ be the event that $\abs{\CC(u)} < 2 / \epsilon d$.  Then
        \begin{displaymath}
            \Prob{u \randomlychosenfrom V}{S(u)}
            = \frac{\abs{\set{u \in V : S(u)}}}{\abs{V}}
            \ge \frac{\set{\CC(u) : u \in V \land S(u)}}{\abs{V}}
            \ge \frac{\epsilon d n / 2}{n} = \frac{\epsilon d}{2}.
        \end{displaymath}
        Intuitively, this bounds the number of nodes in small connected
        components from below by the number of such connected components.
    \end{proof}

    \begin{theorem}
        The test passes connected graphs with certainty and fails $\epsilon$-far
        graphs with probability at least~$3/4$.
    \end{theorem}
    \begin{proof}
        The first claim is obvious (step~\ref{gc:bfs:small} will never occur).
        We show the second claim.  We see that
        \begin{displaymath}
            \Prob{}{\fail}
            \ge 1 - \Prob{}{\pass}
            \ge 1 - {\left(1 - \frac{\epsilon d}{2}\right)}^{\bigO(1 / \epsilon d)}
            \ge 1 - e^{-c'} \ge \frac34.
        \end{displaymath}
        where the last inequality follows from choosing the constant $c$ such
        that $e^{-c'} < 1/4$.
    \end{proof}

\end{example}


\end{document}





