\documentclass{article}
\usepackage[margin=1in]{geometry}
\usepackage{hyperref}
\usepackage{amsmath,amsfonts,amssymb,amsthm,commath,dsfont,nicefrac}
\usepackage{enumitem}
\usepackage{framed}
\usepackage{xspace}
\usepackage{microtype}
\usepackage[round]{natbib}
\usepackage{cleveref}
\usepackage[dvipsnames]{xcolor}
% following loops stolen from djhsu
\def\ddefloop#1{\ifx\ddefloop#1\else\ddef{#1}\expandafter\ddefloop\fi}
\def\ddef#1{\expandafter\def\csname bb#1\endcsname{\ensuremath{\mathbb{#1}}}}
\ddefloop ABCDEFGHIJKLMNOPQRSTUVWXYZ\ddefloop
\def\ddef#1{\expandafter\def\csname c#1\endcsname{\ensuremath{\mathcal{#1}}}}
\ddefloop ABCDEFGHIJKLMNOPQRSTUVWXYZ\ddefloop
\DeclareMathOperator*{\argmin}{arg\,min}
\DeclareMathOperator*{\argmax}{arg\,max}
\def\SPAN{\textup{span}}
\def\tu{\textup{u}}
\def\R{\mathbb{R}}
\def\E{\mathbb{E}}
\def\Z{\mathbb{Z}}
\def\be{\mathbf{e}}
\def\nf{\nabla f}
\def\veps{\varepsilon}
\def\cl{\textup{cl}}
\def\inte{\textup{int}}
\def\dom{\textup{dom}}
\def\Rad{\textup{Rad}}
\def\lsq{\ell_{\textup{sq}}}
\def\hcR{\widehat{\cR}}
\def\hcRl{\hcR_\ell}
\def\cRl{\cR_\ell}
\def\hcE{\widehat{\cE}}
\def\cEl{\cE_\ell}
\def\hcEl{\hcE_\ell}
\def\eps{\epsilon}
\def\1{\mathds{1}}
\newcommand{\red}[1]{{\color{red} #1}}
\newcommand{\blue}[1]{{\color{blue} #1}}
\def\srelu{\sigma_{\textup{r}}}
\def\vsrelu{\vec{\sigma_{\textup{r}}}}
\def\vol{\textup{vol}}
\newcommand{\ip}[2]{\left\langle #1, #2 \right \rangle}
\newtheorem{fact}{Fact}
\newtheorem{lemma}{Lemma}
\newtheorem{claim}{Claim}
\newtheorem{proposition}{Proposition}
\newtheorem{theorem}{Theorem}
\newtheorem{corollary}{Corollary}
\newtheorem{condition}{Condition}
\theoremstyle{definition}
\newtheorem{definition}{Definition}
\theoremstyle{remark}
\newtheorem{remark}{Remark}
\newtheorem{example}{Example}
\newenvironment{Q}
{%
\clearpage
\item
}
{%
\phantom{s} %lol doesn't work
\bigskip
\textbf{Solution.}
}
\title{ML Theory  Homework 1}
\author{\emph{your NetID here}}
\date{Version 1}
\begin{document}
\maketitle
\textbf{Instructions.} (Different from homework 0.)
\begin{itemize}
\item
Everyone must submit an individual writeup.
\item
You may discuss with up to 3 other people. State their NetIDs clearly on the first page.
Outside of office hours, you should not discuss with anyone but these three.
\item
Homework is due \textbf{Wednesday, October 10, at 3:00pm}; no late homework accepted.
\item
Please consider using the provided {\LaTeX} file as a template.
\end{itemize}
\begin{enumerate}
\newcommand{\xup}[1]{x^{(#1)}}
\begin{Q}
\textbf{(Miscellaneous short questions.)}
Provide complete proofs, but try to find short solutions.
\begin{enumerate}
\item
\textbf{(Justifying uniform norm: upper bound.)}
Suppose $\ell$ is $L$lipschitz, and $\mu$ is a probability
measure supported on $[0,1]^d \times \cbr{1,+1}$.
Show
\[
\cR_\ell(f)  \cR_\ell(g)
=
\int \ell(yf(x))\dif\mu(x,y)

\int \ell(yg(x))\dif\mu(x,y)
\leq
L \fg\_\tu.
\]
\item
\textbf{(Justifying uniform norm: lower bound.)}
Given any two continuous functions $f$ and $g$,
construct an $L$lipschitz loss $\ell$
and a
probability measure $\mu$ so that the previous part is tight:
that is,
\[
\cR_\ell(f)  \cR_\ell(g) = L\fg\_\tu.
\]
\textbf{Remark:} together, we've shown why we aim for uniform approximation
(it implies bounds for all measures).
\item
\textbf{(StoneWeierstrass with $\cos$.)}
Use the StoneWeierstrass theorem, as stated in lecture 5
(do not use another source), to prove that
for every continuous function $f :\R^d\to\R$
and $\eps > 0$, there exists $g \in \SPAN(\cH_{\cos})$
with $\fg\_\tu \leq \eps$.
(\textbf{Hint:} refresh yourself on some trig identities.)
\item
\textbf{(Deep, narrow networks.)}
Let $\srelu(z):=\max\{0,z\}$ denote the ReLU,
and for convenience let $\vsrelu$ denote the coordinatewise
version of appropriate dimension (i.e., $\vsrelu(v)$ outputs
a vector of the same dimension as $v$,
whatever it happens to be).
Suppose $f:[0,1]^d\to\R$ can be written as a network
with a single ReLU layer, specifically
$f(x) = A_2 \vsrelu(A_1 x + b_1)$ where $A_1 \in \R^{w \times d}$
and $A_2 \in \R^{1\times w}$.
Construct a network with $w$ ReLU layers and width $d+3$ which
also (exactly) computes $f$.
\textbf{Remark:}
this reveals some convenient properties of ReLUs.
\item
\textbf{(Uniform approximation with ReLU.)}
Again define $\srelu(z) := \max\{0,z\}$.
Construct $\phi \in \SPAN(\cH_{\srelu})$ which satisfies the
conditions of Theorem 1.9 from Lecture 5 (and provide explicit
verification of these conditions).
\textbf{Remark:} consequently, Theorem 1.9 may be applied,
and thus shallow ReLU networks fit continuous functions.
\end{enumerate}
\end{Q}
\emph{(Your solution here.)}
\begin{Q}
\textbf{(More headaches from MinskyPapert.)}
Recall again the MinskyPapert XOR problem,
as appeared in Lecture 2.
Let $\cS := \{\xup{1},\xup{2},\xup{3},\xup{4}\}$
denote the four points in that problem.
\begin{enumerate}
\item
Find a small axisaligned decision tree which predicts perfectly on $\cS$.
\item
Recall the class of \emph{boosted decision stumps}: these are linear combinations
of axis aligned decision trees with only one internal node.
That is to say, a boosted decision stump is a function of the form
\[
x \mapsto \sum_{i=1}^N \alpha_i \1[ x_{j_i} \geq b_i]
\]
where $(\alpha_1,\ldots,\alpha_N)$ are scalar,
each $j_i \in \{1,\ldots,d\}$ indexes a single coordinate,
and $(b_1,\ldots,b_N)$ are scalars.
\textbf{Problem.} Let $\eps>0$ be given and construct
(a) an axisaligned decision tree $g$ which predicts perfectly,
(b) a boosted decision stump $f$ which is incorrect on half of $\cS$,
but
\[
\rho(f,g) := \int_{[1,+1]^2} f(x) g(x)\dif x \leq \eps.
\]
\textbf{Note.} Rather than the usual $\fg\_1$ which integrates over $[0,1]^d$,
we are integrating over $[1,+1]^2$.
\item
Prove that there can not exist a perfect boosted decision stump for the MinksyPapert instance above.
\textbf{Remark.} First of all, this tells us some limitations of $\\cdot\_1$ approximation.
Second of all, it tells us that boosted decision stumps, which were popular during roughly
19952005, are not so good.
\end{enumerate}
\end{Q}
\emph{(Your solution here.)}
\begin{Q}
\textbf{(Branching programs and decision trees.)}
Recall the discussion from the end of Lecture 5, regarding
the size of $f(x) := \frac 1 d \sum_{i=1}^d x_i$ with $x\in\{0,1\}^d$
when represented as a decision tree and a branching program.
A branching program of size $\cO(d^2)$ was provided.
This
question will prove that any decision tree needs size at least $2^d$.
In this question, the predicates computed by internal nodes are decision stumps,
meaning they have the form $\1[x_i \geq b]$ where $i \in \{1,\ldots,d\}$ and $b\in \R$.
\begin{enumerate}
\item
As discussed in class, the leaves of the tree form a partition of the
input space (in this case $\{0,1\}^d$).
Each leaf can therefore be associate with a string $s$ of length
$d$, where $s_i \in \{\emptyset, 1,+1,\star\}$ means that inputs reaching this node
respectively have nothing, 1, +1, or $\pm 1$ in coordinate $i$.
Prove that given any leaf, its associated string has at least $dp$ entries
equal to $\star$, where $p$ is the number of internal nodes (predicates)
along the roottoleaf path for this leaf.
\item
Use the preceding part to prove that any decision tree with strictly less than
$2^d  1$ internal nodes must fail to represent $f$ (that is, it is incorrect on at least
one input string $x\in\{0,1\}^d$).
\end{enumerate}
\end{Q}
\emph{(Your solution here.)}
\begin{Q}
\textbf{(2layer networks fit continuous functions.)}
Recall from class the definition
\[
\cH_\sigma := \cbr{ x \mapsto \sigma(\ip{a}{x} + b) : a\in\R^d, b\in\R }.
\]
Using StoneWeierstrass, we proved we can approximate continuous functions
with $\SPAN(\cH_{\exp})$. It was then claimed that the
rest of the proof is ``essentially univariate''; this exercise completes that proof.
One more piece of notation is needed. Say that $\sigma:\R\to\R$ is \emph{sigmoidal}
when it is nondecreasing, continuous, and
\[
\lim_{z\to \infty} \sigma(z) = 0,\qquad \lim_{z\to \infty} \sigma(z) = 1.
\]
\begin{enumerate}
\item
The first missing piece is to assert that we really are left with a univariate
problem.
Namely, prove the following.
\begin{quote}
Let $\sigma:\R\to\R$ and $\phi : \R\to\R$ be given.
Suppose that for any interval $[r,s]$ and any $\tau > 0$,
we can always find $h \in \SPAN(\cH_\sigma)$ so that
\[
\sup\cbr{ h(x)  \phi(x) : x\in [r,s] } \leq \tau.
\]
(In words: we have a way to approximate $\phi$ along $[r,s]$
with $\SPAN(\cH_\sigma)$.)
Then for any $\eps>0$ and any $g\in\cH_\phi$ but now $g:\R^d\to\R$,
we can still choose $f\in \SPAN(\cH_\sigma)$ with $\fg\_\tu \leq \eps$.
\end{quote}
\item
Let sigmoidal $\sigma:\R\to\R$,
target error $\tau > 0$,
interval $[r,s]$,
and a function $\psi:\R\to\R$ be given
with $\psi$ Lipschitz continuous along $[r,s]$.
Show that there exists $h\in \SPAN(\cH_{\sigma})$ satisfying
\[
\sup\cbr{ h(x)  \psi(x) : x\in [r,s] } \leq \tau.
\]
\textbf{Hints.} (a) Note that for large $M$, $\sigma(Mx) \approx \1[ x \geq 0]$;
(b) consider drawing a picture for the simpler case of nonincreasing $\psi$,
with special attention to the meaning of Lipschitz continuity.
\item
Prove that $\exp$ and $\cos$ are Lipschitz continuous along any bounded interval.
(Yup, that's really all you need to do for this part.)
\end{enumerate}
Though you don't need to write anything about it here, I urge you to verify that
the preceding steps can be combined with the material in lecture
to complete the proof.
\end{Q}
\emph{(Your solution here.)}
\begin{Q}
\textbf{(A nuisance.)}
Recall that the lectures on approximation of continuous functions by 2 and 3layer networks
did not include a nonlinearity on the final output.
This exercise points out that we can use those as a lemma to establish that networks
\emph{with} final nonlinearities can also approximate continuous functions (albeit with restrictions
on the range).
Throughout this exercise,
suppose a function class $\cF$ is given which fits continuous functions
in our usual sense: for any
continuous $g:\R^d\to\R$ and any $\tau>0$,
there exists $f\in \cF$ with $\fg\_\tu \leq \tau$.
The following notation will be handy.
Namely, given univariate $\sigma : \R\to[0,1]$,
define the function class
\[
\cF_\sigma := \{ \sigma \circ f : f\in \cF\}.
\]
\begin{enumerate}
\item
Suppose $\sigma : \R\to[0,1]$ is sigmoidal (as in the previous exercise),
Lipschitz,
and has a continuous inverse.
Show that for any continuous $g:\R^d\to(0,1)$ and any $\eps > 0$,
there exists $h\in \cF_\sigma$ with $\hg\_1 \leq \eps$.
\textbf{Hint.} Find a way to bake the inverse into the problem.
\item
\textbf{(Optional; \emph{hard mode}.)}
Suppose $\sigma : \R\to[0,1]$ is sigmoidal.
Show that for any continuous $g:\R^d\to[0,1]$ and any $\eps > 0$,
there exists $h\in \cF_\sigma$ with $\hg\_\tu \leq \eps$.
\textbf{Note.} If you do this part, you must \emph{still} provide a complete
independent solution to the previous part. Be nice to the TA\dots
\end{enumerate}
\end{Q}
\emph{(Your solution here.)}
\begin{Q}
\textbf{(Monomials and uniform approximation via derivatives.)}
This problem will provide an approach to uniform approximation that
avoids StoneWeierstrass; \textbf{do not} use StoneWeierstrass or
Weierstrass or anything similar in any step of the proof!
The problem will consider only the univariate case, but essentially the
same proof works in the multivariate case (as discussed at the end).
For convenience, for any activation $\sigma$,
define $\cG_\sigma := \SPAN(\cH_\sigma)$.
Here are some useful analysis facts for this problem:
\begin{itemize}
\item
Continuous functions are uniformly continuous on compact sets.
\item
To say a function $f$ is $C^\infty$ means all derivatives exist (and are continuous).
If $\sigma$ is $C^\infty$, then so is every $f\in\cG_\sigma$.
\end{itemize}
Throughout this problem, suppose $\sigma$ is $C^\infty$ and $\sigma^{(n)} \neq 0$,
meaning the $n^\textup{(th)}$ derivative is not identically the zero function
for every nonnegative integer $n$.
\begin{enumerate}
\item
(Closed under a single derivative.)
Let $f\in \cG_\sigma$ and any $w\in \R$ and any $\eps > 0$ be given,
and define $h(x) := x f'(wx)$
(the mapping $x\mapsto \nicefrac{\partial}{\partial r} f(rx)_{r=w}$).
Prove that there exists $g\in \cG_\sigma$
so that $\hg\_\tu \leq \eps$.
\textbf{Hint.} Consider the definition of
$\nicefrac{\partial}{\partial r} f(rx)_{r=w}$
in terms of limits,
and see how it interacts with an exact (integral remainder) Taylor expansion.
Via the analysis facts above, you can conveniently bound
the remainder term. Use this to construct an appropriate $g\in \cG_\sigma$,
and prove that it works.
\item
(Closed under derivatives.)
For every real $w\in \R$ and positive integer $n$,
define
\[
h_{n,w}(x) := x^{n} \sigma^{(n)}(wx)
= \nicefrac{\partial^n}{\partial r^n} \sigma(rx)_{r=w}.
\]
Show that for any $(w,\eps,n)$,
there exists $g\in\cG_\sigma$ with $\gh_{n,w}\\leq \eps$.
\textbf{Hint.} Combine the previous part with an induction on $n$ and some careful
reasoning about approximations. Be wary of circularity\dots
\item
(Monomials.)
Prove that for any positive integer $n$ and real $\eps>0$,
there exists $g\in\cG_\sigma$
so that $\g  p_n\_\tu \leq \eps$ where $p_n(x) = x^n$.
\textbf{Hint.} Use the previous part, and double check the conditions
on $\sigma$\dots
\end{enumerate}
Now that we have monomials, we can use the Weierstrass Theorem (which has
a simple constructive proof). Also, the proof above goes through no problem
in the multivariate case (now use $x\mapsto \sigma(\ip{w}{x})$, and
take different partial derivatives to get various monomials).
\end{Q}
\emph{(Your solution here.)}
\begin{Q}
\textbf{(Why?)}
You receive full credit for this question so long as you write at least one
sentence for each answer. Please be honest and feel free to be critical.
\begin{enumerate}
\item
Why are you taking this class? What do you expect from it?
\item
What do you expect to gain (e.g., in research, work, life)
by knowing ML Theory?
\item
Do you have any feedback about the class, lectures, or instructor?
\end{enumerate}
\end{Q}
\emph{(Your solution here.)}
\end{enumerate}
%\clearpage
%\bibliographystyle{plainnat}
%\bibliography{hw1}
\end{document}