\documentclass{article}
\usepackage[margin=1in]{geometry}
\usepackage{hyperref}
\usepackage{amsmath,amsfonts,amssymb,amsthm,commath,dsfont,nicefrac,xfrac}
\usepackage{enumitem}
\usepackage{framed}
\usepackage{xspace}
\usepackage{microtype}
\usepackage[round]{natbib}
\usepackage{cleveref}
\usepackage[dvipsnames]{xcolor}
% Transpose symbol, used as $A^\T$.
\def\T{{\scriptscriptstyle\mathsf{T}}}
% following loops stolen from djhsu
% \ddefloop walks a list of letter tokens, applying the current \ddef to each;
% the recursion terminates when the next token is \ddefloop itself (\ifx test).
\def\ddefloop#1{\ifx\ddefloop#1\else\ddef{#1}\expandafter\ddefloop\fi}
% Define \bbA ... \bbZ as blackboard-bold letters (e.g. \bbE, \bbR).
\def\ddef#1{\expandafter\def\csname bb#1\endcsname{\ensuremath{\mathbb{#1}}}}
\ddefloop ABCDEFGHIJKLMNOPQRSTUVWXYZ\ddefloop
% Redefine \ddef, then define \cA ... \cZ as calligraphic letters (e.g. \cF, \cR).
\def\ddef#1{\expandafter\def\csname c#1\endcsname{\ensuremath{\mathcal{#1}}}}
\ddefloop ABCDEFGHIJKLMNOPQRSTUVWXYZ\ddefloop
% Redefine \ddef again, then define \vA ... \vz as bold (vector) letters.
\def\ddef#1{\expandafter\def\csname v#1\endcsname{\ensuremath{\boldsymbol{#1}}}}
\ddefloop ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz\ddefloop
% Operators whose subscripts are set underneath in display style.
\DeclareMathOperator*{\argmin}{arg\,min}
\DeclareMathOperator*{\argmax}{arg\,max}
% Upright-text operator names.
\def\SPAN{\textup{span}}
% "u" subscript for the uniform norm \|\cdot\|_\tu (defined in problem 4).
\def\tu{\textup{u}}
% Common sets and symbols.
\def\R{\mathbb{R}}
\def\E{\mathbb{E}}
\def\Z{\mathbb{Z}}
\def\be{\mathbf{e}}
\def\nf{\nabla f}
\def\veps{\varepsilon}
% Closure, interior, domain, Rademacher (upright text).
\def\cl{\textup{cl}}
\def\inte{\textup{int}}
\def\dom{\textup{dom}}
\def\Rad{\textup{Rad}}
% Squared loss.
\def\lsq{\ell_{\textup{sq}}}
% Risk notation: \cR (defined in problem 1) with hatted versions and
% \ell-subscripted variants; \cE analogues presumably for excess risk.
\def\hcR{\widehat{\cR}}
\def\hcRl{\hcR_\ell}
\def\cRl{\cR_\ell}
\def\hcE{\widehat{\cE}}
\def\cEl{\cE_\ell}
\def\hcEl{\hcE_\ell}
% Shorthand: \eps is the (non-var) epsilon.
\def\eps{\epsilon}
% Indicator function (requires dsfont, loaded above).
\def\1{\mathds{1}}
% Colored annotation helpers.
\newcommand{\red}[1]{{\color{red} #1}}
\newcommand{\blue}[1]{{\color{blue} #1}}
% ReLU activation \sigma_r and its vector (coordinate-wise) version.
\def\srelu{\sigma_{\textup{r}}}
\def\vsrelu{\vec{\sigma_{\textup{r}}}}
\def\vol{\textup{vol}}
% Small-caps subscripts for the bounded-variation and Lipschitz norms.
\def\bv{\textsc{bv}}
\def\lip{\textsc{lip}}
% Inner product \ip{x}{y} with auto-sized angle brackets.
\newcommand{\ip}[2]{\left\langle #1, #2 \right \rangle}
% Theorem-like environments (amsthm); each gets its own counter.
\newtheorem{fact}{Fact}
\newtheorem{lemma}{Lemma}
\newtheorem{claim}{Claim}
\newtheorem{proposition}{Proposition}
\newtheorem{theorem}{Theorem}
\newtheorem{corollary}{Corollary}
\newtheorem{condition}{Condition}
% Definitions in upright ("definition") style.
\theoremstyle{definition}
\newtheorem{definition}{Definition}
% Remarks and examples in "remark" style.
\theoremstyle{remark}
\newtheorem{remark}{Remark}
\newtheorem{example}{Example}
% Q: one homework question per environment.  Opens with a page break and a
% new \item (inside the outer enumerate); closes by emitting a "Solution."
% stub for the student to fill in.
\newenvironment{Q}
{%
\clearpage
\item
}
{%
\phantom{s} % spacing hack to force a line before \bigskip; author notes it doesn't work
\bigskip
\noindent\textbf{Solution.} \emph{(If using this template, please write your solution here.)}
}
\title{CS 540 DLT --- Homework 1.}
\author{\emph{your NetID here.}}
\date{Version $1+3\eps$.}
\begin{document}
\maketitle
\noindent\textbf{Instructions.}
\begin{itemize}
\item
Homework is due \textbf{Wednesday, September 28, at 11:59pm};
no late homework accepted.
\item
You must work individually for this homework.
\item
Excluding office hours, and high-level discussions on discord,
you may discuss with at most three other people;
please state their NetIDs clearly on the first page of your submission.
\item
Homework must be typed, and submitted via gradescope.
Please consider using the provided {\LaTeX} file as a template.
\item
Each part of each problem is worth 3 points.
\item
For any problem asking you to construct something, for full credit you must
always formally prove your construction works.
\item
General course and homework policies are
\href{http://mjt.cs.illinois.edu/courses/dlt-f21/}{on the course webpage}.
\end{itemize}
\vspace{2em}
\noindent\textbf{Notation.}
For convenience, given a univariate activation $\sigma:\R\to\R$,
define 2-layer biased networks of arbitrary width as
\begin{align*}
\cF_{\sigma,d,m}
&:= \cbr{ x\mapsto a^\T \sigma(V x + b) : a\in\R^m, V\in\R^{m\times d}, b\in\R^m},
\\
\cF_{\sigma,d}
&:= \bigcup_{m\geq 0} \cF_{\sigma,d,m}.
\end{align*}
Additionally, let $\srelu(z):= \max\{0,z\}$ denote the ReLU.
\vspace{2em}
\noindent\textbf{Version history.}
\begin{enumerate}
\item[$1$.] Initial version.
\item[$1+\eps$.] (1a.) removed $\ell$ subscript. (1d.) ``biased'' explicitly stated.
(4a.) $\cF_{d,1}$ should have been $\cF_{\sigma,1}$. (4b.) $\dif r \to \dif w$.
\item[$1+2\eps$.] Due date pushed back one week; (3a.) nullified.
\item[$1+3\eps$.] (2c.) Clarified/strengthened continuity simplification.
\end{enumerate}
\begin{enumerate}[font={\Large\bfseries},left=0pt]
\begin{Q}
\textbf{Miscellaneous short questions.}
\begin{enumerate}
\item
\textbf{(Strength of uniform norm.)}
Let $\cC$ denote continuous functions over $\R^d$,
fix a continuous activation $\sigma:\R \to \R$,
and suppose $\cF_{\sigma,d}$ is a \emph{universal approximator},
meaning $\sup_{g\in\cC} \inf_{f\in\cF_{\sigma,d}}\sup_{x\in[0,1]^d} |f(x)-g(x)| = 0$.
Show that for any loss $\ell$ which is $\rho$-Lipschitz in its first argument
and any probability distribution $\mu$ over
$[0,1]^d\times \R$, the future error $\cR$ defined as
\[
\cR(f) := \bbE_{(x,y)\sim \mu} \ell(f(x),y)
\]
satisfies $\inf_{f\in\cF_{\sigma,d}} \cR(f) = \inf_{g\in\cC} \cR(g)$.
\textbf{Remark:} this claim follows lecture, specifically the discussion of the
varying approximation goals and how they imply each other; this one shows that
universal approximation implies the weaker ``future risk'' approximation.
\item
\textbf{(Weakness of $L_1$ norm.)}
Suppose $\cR$ and $\cF_{\sigma,d}$ and $\cC$ as in the preceding problem
(with ReLU $\sigma=\srelu$ for concreteness),
and the logistic loss
$\ell(\hat y,y) = \ln(1+\exp(-\hat y y))$.
Given any $\eps>0$, construct a discrete probability distribution over $[0,1]^d\times
\{\pm 1\}$ (inputs and labels)
and two functions $f\in\cF_{\srelu,d}$ and $g\in\cC$
so that
\[
\cR(f) \geq \frac 1 \eps + \cR(g)
\qquad
\textup{and}
\qquad
\int_{[0,1]^d} |f(x) - g(x)| \dif x \leq \eps.
\]
\textbf{Remark:} continuing the remark from the preceding problem, this problem
establishes that it would not be enough to approximate continuous functions in
$L_1$, we need something stronger.
\item
\textbf{(Compactness.)}
Show that
$\inf_{f\in\cF_{\srelu,1}} \sup_{x\in \R} |f(x) - \sin(x)| \geq 1$.
\textbf{Remark:} this shows compactness is necessary.
\item
\textbf{(Deep, narrow networks.)}
Suppose $f:[0,1]^d\to\R$ can be written as a 2-layer
biased ReLU network of width $m$, meaning
$f(x) = a^\T \srelu(Vx + b) \in \cF_{\srelu,d,m}$.
Construct a biased network with $m+1$ ReLU layers and width $d+3$ which
also (exactly) computes $f$.
\textbf{Remark:}
this reveals some convenient properties of ReLUs.
\textbf{Remark:} if $f$ itself was constructed to approximate some continuous function,
together we conclude that deep, narrow networks are also universal approximators.
\end{enumerate}
\end{Q}
\begin{Q}
\textbf{Constructive data-adaptive univariate approximation.}
This question considers functions $f:[0,1]\to\R$; that is, over the unit interval.
Define a \emph{bounded variation} norm $\|f\|_{\bv}$ in the following two steps.
\begin{itemize}
\item
If $f$ is monotone (nondecreasing or nonincreasing),
define $\|f\|_{\bv} := | f(0) - f(1)|$.
\item
Otherwise, given any $f$, define a family of decompositions into monotone
functions as
\[
\cS_f := \cbr{ (g,h) \ : \ f=g+h \textup{ where $g$ and $h$ are monotone} },
\]
and finally a norm $\|f\|_{\bv} = \inf\{ \|g\|_\bv + \|h\|_\bv : (g,h) \in \cS_f\}$,
with the convention $\|f\|_{\bv} = \infty$ when $\cS_f=\emptyset$.
\end{itemize}
This problem will use $\|\cdot\|_\bv$ to give data-adaptive univariate approximation
bounds. For comparison, define the (tightest) Lipschitz constant $\|f\|_{\lip}$
as
\[
\sup_{x\neq y \in [0,1]} \frac {|f(x) - f(y)|}{|x-y|}.
\]
If $f$ is differentiable, then $\|f\|_{\lip} = \sup\{ |f'(x)| : x\in (0,1)\}$.
\begin{enumerate}
\item
Suppose $f$ is continuously differentiable.
Show that $\|f\|_\bv \leq \|f\|_\lip$.
\textbf{Note:} it's still true without differentiability, but more painful.
\item
Show that for any $\eps>0$, there exists $f$ so that $\|f\|_\lip \geq 1/\eps$
but $\|f\|_\bv\leq \eps$.
\item
Show that for any $g:[0,1]\to\R$ and any $\eps>0$,
there exists a 2-layer threshold network $f$ (meaning activation $\sigma(z) = \1[z\geq 0]$)
with at most $4 \lceil \|g\|_{\bv} / \eps\rceil$ nodes such
that $|f(x) - g(x)|\leq \eps$ for all $x\in[0,1]$.
\textbf{Simplification:} feel free to assume without proof that
$g$ is continuous, $\|g\|_\bv<\infty$,
and
$\cS_g$ can be restricted to continuous pairs (without changing $\|g\|_\bv$).
\item
Show that for any continuously differentiable $g:[0,1]\to\R$
with $g(0)=0$ and any $\eps>0$,
there exists a 2-layer ReLU network $f$
with at most $4 \lceil \|g'\|_{\bv} / \eps\rceil$ nodes such
that $|f(x) - g(x)|\leq \eps$ for all $x\in[0,1]$.
\textbf{Simplification:} again, when considering monotone pairs, feel free to assume
continuity.
\textbf{Remark:} we can use this to approximate $r\mapsto \exp(r)$ with ReLUs,
and plug this into the proof scheme from the lecture notes to show
$\cF_{\srelu,d}$ is a universal approximator.
\end{enumerate}
\end{Q}
\begin{Q}
\textbf{NTK with general activations.}
As in the NTK lectures, recall that the kernel corresponding to a shallow network
with arbitrary activation has the form
\[
k(x,x') := x^\T x' \bbE_w \sigma'(w^\T x) \sigma'(w^\T x'),
\]
where $w\in\R^d$ is a standard Gaussian random vector, thus
$\bbE w = 0$ and $\bbE w w^\T = I$.
Throughout this problem, suppose $\|x\|=1$ (this includes $\|x'\|=1$ in part (a)).
\begin{enumerate}
\item
\textbf{\emph{\red{(The answer to this part of this problem is in the notes now, feel free to skip.)}}}
Prove
$\displaystyle k(x,x') = x^\T x'
\bbE_w\sbr{\sigma'(w^\T \ve_1)\sigma'\del{ w^\T \ve_1 x^\T x' + w^\T\ve_2\sqrt{1 - (x^\T x')^2} } }$,
where $\ve_1$ and $\ve_2$ are standard basis vectors.
\textbf{Hint:} rotational invariance of the Gaussian!
\textbf{Technical note:} if you wish, you can assume $\sigma$ has at most countably many
points of nondifferentiability; since $w$ has a continuous distribution, the integral
may still be computed.
\textbf{Remark:} The kernel therefore only interacts with $x$ and $x'$ via $x^\T x'$,
which is pretty interesting!
\item
Let points $(x_1,\ldots,x_n)$ be given
as well as labels $(y_1,\ldots,y_n)$ with $y_i\in\{\pm 1\}$,
and suppose $\sigma(z) = \max\{0,z\}$, the ReLU.
Recall that the NTK predictor of width $m$ will have the form
(ignoring scaling)
\[
f(x) := \sum_{j=1}^m v_j^\T x \sigma'(w_j^\T x),
\]
where $(w_1,\ldots,w_m)$ are IID Gaussian, and $(v_1,\ldots,v_m)$ are parameters.
Suppose there exists a pair $(x_i,x_j)$ with
$y_i\neq y_j$ and the angle between $x_i$ and $x_j$ is at most $\delta>0$.
Prove that with probability at least $1 - \sfrac {m\delta}{\pi}$,
it is impossible to find $(v_1,\ldots,v_m)$ with $\sum_{j=1}^m \|v_j\|_2 \leq \sfrac 1 \delta$
so that $f(x_i) = y_i$ for all $i$.
\end{enumerate}
\end{Q}
\begin{Q}
\textbf{Monomials and uniform approximation via derivatives.}
This problem shows that we can perform univariate approximation with
any activation $\sigma:\R\to\R$ which is not a polynomial, under an additional
technical condition (that it is $C^\infty$, which will be explained shortly).
This can be plugged into the proof in the typed lecture notes to imply that
any such $\cF_{\sigma,d}$ is a universal approximator.
\textbf{Note:} you may not use the Stone-Weierstrass Theorem when solving this problem.
In more detail, $\sigma$ being $C^\infty$ means that it (and every element
of $\cF_{\sigma,1}$) has continuous derivatives of all orders, and moreover they are uniformly
bounded over compact sets (that is, the $n^{\textup{th}}$ derivative $\sigma^{(n)}$
satisfies $\sup_{|x|\leq r} |\sigma^{(n)}(x)| <\infty$ for all $r<\infty$). This fact will
be used a few times in the proofs.
Additionally, as a consequence of $\sigma$ not being a polynomial, then for every $n$,
there exists $x$ so that $\sigma^{(n)}(x) \neq 0$.
Lastly, for convenience, define \emph{uniform norm}
$\|f-g\|_\tu := \sup_{x\in[0,1]}|f(x)-g(x)|$.
\begin{enumerate}
\item
\textbf{(Closed under a single derivative.)}
Let $f\in \cF_{\sigma,1}$ and any $w\in \R$ and any $\eps > 0$ be given,
and define $h(x) := x f'(wx) = (\nicefrac{\dif}{\dif w}) f(wx)$.
Prove that there exists $g\in \cF_{\sigma,1}$
so that $\|h-g\|_\tu \leq \eps$.
\textbf{Hint:}
consider writing $h$ as a limit (via the definition of derivative); we are trying
to show that the limit point is well-approximated in $\cF_{\sigma,1}$. Writing out that
limit, is there a natural candidate with which to approximate $h$?
To control the error of this approximation, it may be helpful to use the above
properties, and a second-order Taylor expansion with an exact remainder.
\item
\textbf{(Closed under derivatives.)}
For every real $w,b\in \R$ and positive integer $n$,
define
\[
h_{n,w,b}(x) := x^{n} \sigma^{(n)}(wx+b)
= \frac{\dif^n}{\dif w^n} \sigma(wx+b).
\]
Show that for any $(w,b,\eps,n)$,
there exists $g\in\cF_{\sigma,1}$ with $\|g-h_{n,w,b}\|_{\tu}\leq \eps$.
\textbf{Hint:} set up an induction over $n$; a key to the proof is choosing a
(powerful) inductive hypothesis, be sure to state yours clearly.
The inductive step can be done similarly to the previous part, albeit more
elaborately.
\item
\textbf{(Monomials.)}
Prove that for any positive integer $n$ and real $\eps>0$,
there exists $g\in\cF_{\sigma,1}$
so that $\|g - p_n\|_\tu \leq \eps$ where $p_n(x) = x^n$.
\textbf{Hint:} this one should be short, you can directly use the previous part.
\item
\textbf{(Universal approximation.)}
Show that for any $r>0$ and any $\eps>0$,
there exists $f \in \cF_{\sigma,1}$ with $\sup_{|x|\leq r} |f(x) - \exp(x)|\leq \eps$.
\textbf{Remark:}
This can now be plugged into the universal approximation proof from the lecture
notes, and finishes the proof except when $\sigma$ is not $C^\infty$. There is a trick
to drop $C^\infty$ which I'll mention in some update to the notes.
\end{enumerate}
\end{Q}
\begin{Q}
\textbf{Why?}
You receive full credit for this question so long as you write at least one
sentence for each answer. Please be honest and feel free to be critical.
\begin{enumerate}
\item
Why are you taking this class?
\item
What is something the instructor can improve?
\end{enumerate}
\end{Q}
\end{enumerate}
%\clearpage
%\bibliographystyle{plainnat}
%\bibliography{hw1}
\end{document}