\documentclass{article}
\usepackage[margin=1in]{geometry}
\usepackage{hyperref}
\usepackage{amsmath,amsfonts,amssymb,amsthm,commath,dsfont,nicefrac,xfrac}
\usepackage{enumitem}
\usepackage{framed}
\usepackage{xspace}
\usepackage{microtype}
\usepackage[round]{natbib}
\usepackage{cleveref}
\usepackage[dvipsnames]{xcolor}
\usepackage{tikzsymbols}
% Short math macros. \newcommand* (rather than \def) raises an error on a
% name clash instead of silently overwriting an existing command.
\newcommand*{\T}{{\scriptscriptstyle\mathsf{T}}}% transpose superscript
\newcommand*{\hG}{\widehat{G}}% sampled (finite-width) Gram matrix
\newcommand*{\trace}{\textup{tr}}
\newcommand*{\tF}{{\textup{F}}}% Frobenius-norm subscript
\newcommand*{\URad}{\textup{URad}}% unnormalized Rademacher complexity
\newcommand*{\sgn}{\textup{sgn}}
% following loops stolen from djhsu
% \ddefloop walks a token list one character at a time, applying the *current*
% meaning of \ddef to each character; it stops when it reads the \ddefloop
% token itself (the \ifx sentinel test below). \ddef is intentionally
% redefined with \def between passes, so this block must keep using \def.
\def\ddefloop#1{\ifx\ddefloop#1\else\ddef{#1}\expandafter\ddefloop\fi}
% Pass 1: \bbA ... \bbZ  ->  \ensuremath{\mathbb{A}} ... (blackboard bold)
\def\ddef#1{\expandafter\def\csname bb#1\endcsname{\ensuremath{\mathbb{#1}}}}
\ddefloop ABCDEFGHIJKLMNOPQRSTUVWXYZ\ddefloop
% Pass 2: \cA ... \cZ    ->  \ensuremath{\mathcal{A}} ... (calligraphic)
\def\ddef#1{\expandafter\def\csname c#1\endcsname{\ensuremath{\mathcal{#1}}}}
\ddefloop ABCDEFGHIJKLMNOPQRSTUVWXYZ\ddefloop
% Pass 3: \vA ... \vz    ->  \ensuremath{\boldsymbol{A}} ... (bold, both cases)
\def\ddef#1{\expandafter\def\csname v#1\endcsname{\ensuremath{\boldsymbol{#1}}}}
\ddefloop ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz\ddefloop
\DeclareMathOperator*{\argmin}{arg\,min}
\DeclareMathOperator*{\argmax}{arg\,max}
% Notation shorthands. \newcommand* (rather than \def) raises an error on a
% name clash instead of silently overwriting an existing command.
\newcommand*{\SPAN}{\textup{span}}
\newcommand*{\tu}{\textup{u}}
\newcommand*{\R}{\mathbb{R}}
\newcommand*{\E}{\mathbb{E}}
\newcommand*{\Z}{\mathbb{Z}}
\newcommand*{\be}{\mathbf{e}}
\newcommand*{\nf}{\nabla f}
\newcommand*{\veps}{\varepsilon}
\newcommand*{\cl}{\textup{cl}}
\newcommand*{\inte}{\textup{int}}
\newcommand*{\dom}{\textup{dom}}
\newcommand*{\Rad}{\textup{Rad}}
\newcommand*{\lsq}{\ell_{\textup{sq}}}
\newcommand*{\hcR}{\widehat{\cR}}% empirical risk
\newcommand*{\nhcR}{\nabla\widehat{\cR}}
\newcommand*{\hcRl}{\hcR_\ell}
\newcommand*{\cRl}{\cR_\ell}
\newcommand*{\hcE}{\widehat{\cE}}
\newcommand*{\cEl}{\cE_\ell}
\newcommand*{\hcEl}{\hcE_\ell}
\newcommand*{\tell}{\tilde\ell}% truncated loss
\newcommand*{\eps}{\epsilon}
\newcommand*{\1}{\mathds{1}}% indicator (blackboard 1, from dsfont)
\newcommand{\red}[1]{{\color{red} #1}}
\newcommand{\blue}[1]{{\color{blue} #1}}
\newcommand*{\srelu}{\sigma_{\textup{r}}}% ReLU activation symbol
\newcommand*{\vsrelu}{\vec{\sigma_{\textup{r}}}}
\newcommand*{\vol}{\textup{vol}}
\newcommand*{\barw}{\bar w}
\newcommand*{\conv}{\textup{conv}}
\newcommand{\ip}[2]{\left\langle #1, #2 \right \rangle}
% Theorem-like environments (amsthm). Each gets its own counter;
% the default \theoremstyle{plain} (italic body) applies to the first group.
\newtheorem{fact}{Fact}
\newtheorem{lemma}{Lemma}
\newtheorem{claim}{Claim}
\newtheorem{proposition}{Proposition}
\newtheorem{theorem}{Theorem}
\newtheorem{corollary}{Corollary}
\newtheorem{condition}{Condition}
\theoremstyle{definition}% upright body text
\newtheorem{definition}{Definition}
\theoremstyle{remark}
\newtheorem{remark}{Remark}
\newtheorem{example}{Example}
% Question environment: each question starts on a fresh page as a list item,
% and \end{Q} automatically appends a "Solution." stub.
\newenvironment{Q}
{%
\clearpage
\item
}
{%
\par % close the question paragraph so \bigskip's vertical space takes effect
\bigskip
\noindent\textbf{Solution.} \emph{(If using this template, please write your solution here.)}
}
\title{CS 540 DLT --- Homework 3.}
\author{\emph{your NetID here.}}
\date{Version $1$.}
\begin{document}
\maketitle
\noindent\textbf{Instructions.}
\begin{itemize}
\item
This homework is due \textbf{Thursday, December 15, at 11:59pm};
no late homework accepted.
\item
You must work individually for this homework.
\item
Excluding office hours, and high-level discussions on Discord,
you may discuss with at most three other people;
please state their NetIDs clearly on the first page of your submission.
\item
Homework must be typed, and submitted via Gradescope.
Please consider using the provided {\LaTeX} file as a template.
\item
Each part of each problem is worth 3 points.
\item
For any problem asking you to construct something, for full credit you must
always formally prove your construction works.
\item
General course and homework policies are
\href{http://mjt.cs.illinois.edu/courses/dlt-f21/}{on the course webpage}.
\end{itemize}
\vspace{2em}
\noindent\textbf{Version history.}
\begin{enumerate}
\item[$1$.] Initial version.
\end{enumerate}
\begin{enumerate}[font={\Large\bfseries},left=0pt]
\begin{Q}
\textbf{Concentration of NTK eigenvalues.}
In hw2, we established that the \emph{infinite-width} Gram matrix from the NTK lectures
has positive minimum eigenvalue. In this problem, we will
prove that the finite-width Gram matrix has similar (positive) eigenvalues.
This settles the remaining pieces necessary to make the NTK gradient flow proof concrete.
Throughout this problem, let examples $(x_i)_{i=1}^n$ be given and fixed,
and collect all examples as rows of a matrix $X\in\R^{n\times d}$.
Suppose for simplicity the activation $\sigma$ is differentiable, and $|\sigma'|\leq B$.
Define the sampled Gram matrix $\hG \in \R^{n\times n}$
and expected Gram matrix $G\in\R^{n\times n}$ via
\[
\hG_{ij} := \frac 1 m \sum_{k=1}^m x_i^\T x_j \sigma'(w_k^\T x_i)\sigma'(w_k^\T x_j),
%\\
\qquad
G_{ij}
:= \bbE_w x_i^\T x_j \sigma'(w^\T x_i)\sigma'(w^\T x_j).
\]
In this problem, the random draw is over the weights $(w_k)_{k=1}^m$, not
over the examples. Consequently, it is useful to define \emph{another}
family of random matrices: $(H_k)_{k=1}^m$, with
\[
(H_k)_{ij} := x_i^\T x_j \sigma'(w_k^\T x_i)\sigma'(w_k^\T x_j),
\qquad
\textup{whereby }
\hG = \frac 1 m \sum_{k=1}^m H_k.
\]
The approach of this problem is to apply Rademacher complexity.
Let $W = (w_1,\ldots,w_m)$ denote the full random draw, analogous to the data in
our usual applications of Rademacher complexity.
We will use matrix inner products:
\[
\ip{H_k}{V} = \trace\del{H_k^\T V},
\qquad
\textup{and}
\qquad
\envert{ \ip{H_k}{V} }
\leq \|H_k\|_\tF\cdot\|V\|_\tF.
\]
\begin{enumerate}
\item
Prove $\|H_k\|_\tF \leq B^2 \|X\|_\tF^2$.
\item
Define
\[
\cF := \cbr{ U \mapsto \ip{U}{V} : \|V\|_\tF \leq 1 },
\qquad
\cH := (H_1,\ldots, H_m).
\]
The relevant (unnormalized) Rademacher complexity for us is
\[
\URad(\cF_{|\cH})
=\URad\del{
\cbr{
(\ip{H_1}{V},\ldots,\ip{H_m}{V}) : \|V\|_\tF \leq 1
}
}.
\]
Prove $\URad(\cF_{|\cH}) \leq B^2 \|X\|_\tF^2 \sqrt{m}$.
\textbf{Hint.} Chapter 13 of the \emph{old} typed notes has everything you need.
\item
Prove $\{ vv^\T : \|v\|_2 \leq 1 \} \subseteq \{ V : \|V\|_\tF \leq 1\}$.
\item
Prove that with probability at least $1-\delta$ over the draw of $(w_1,\ldots,w_m)$,
simultaneously for every $\|u\|_2\leq 1$,
\[
\envert{ u^\T G u - u^\T \hG u }
\leq
\frac {2B^2 \|X\|_{\tF}^2}{\sqrt{m}}
+ 6B^2 \|X\|_\tF^2 \sqrt{\frac{\ln(4/\delta)}{2m}}
=: \star.
\]
\textbf{Hint.} Chapter 13 of the \emph{old} typed notes has everything you need.
\item
Prove that, with probability at least $1-\delta$, simultaneously
\[
\lambda_{\min}(\hG) \geq \lambda_{\min}(G) - \star
\qquad
\textup{and}
\qquad
\lambda_{\max}(\hG) \leq \lambda_{\max}(G) + \star,
\]
where $\star$ is the right hand side of the bound in the previous part.
\textbf{Remark.} The point is that we can make $\star$ as small as we want
just by increasing $m$.
\end{enumerate}
\end{Q}
\begin{Q}
\textbf{Shallow Rademacher bound near initialization.}
Throughout this problem, consider our usual shallow NTK setup:
\[
F(x; W) := \frac 1 {\sqrt m} \sum_{j=1}^m s_j \srelu(w_j^\T x),
\]
where $\srelu(z) = \max\{0,z\}$ is the ReLU,
$s_j \in \{-1,+1\}$ is fixed and independent of the data,
$W \in \R^{m\times d}$ with $j$th row $w_j^\T$, and $W$ will always be close to a fixed (and
independent of the data) $W_0\in\R^{m\times d}$, whose $j$th row is $w_{0,j}^\T$.
Let $S = ((x_i,y_i))_{i=1}^n$ denote a data sample, and let $X\in\R^{n\times d}$
be a matrix whose $i$th row is $x_i^\T$.
\begin{enumerate}
\item
Show that for any $j\in \{1,\ldots,m\}$ and any $r\geq 0$,
\[
\URad\del{\cbr{x \mapsto s_j \srelu(w_j^\T x) : \|w_j - w_{0,j}\|_2 \leq r}_{|S}}
\leq
r \|X\|_\tF.
\]
\textbf{Hint.} Chapter 13 of the \emph{old} typed notes has everything you need.
\item
Show that for any $r\geq 0$,
\[
\URad\del{\cbr{x \mapsto F(x;W) : \|(W - W_0)^\T\|_{2,\infty}
\leq \frac r {\sqrt m}}_{|S}}
\leq
r \|X\|_\tF,
\]
where $\|(W - W_0)^\T\|_{2,\infty} = \max_j \|w_j - w_{0,j}\|_2$.
\textbf{Hint.} Chapter 13 of the \emph{old} typed notes has everything you need.
\item
Let $\ell(z) = \ln(1+\exp(-z))$ denote the logistic loss, define a truncation
$\tell(z) = \min\{ \ell(z), 1 \}$, and suppose $\|x\|_2\leq 1$.
Show that with probability at least $1-\delta$ over the draw of $S$,
every $W$ with $\|(W-W_0)^\T\|_{2,\infty} \leq r/\sqrt{m}$ satisfies
\[
\bbE_{x,y} \tell(y F(x;W))
\leq
\frac 1 n \sum_{i=1}^n \tell(y_i F(x_i;W))
+
\frac {2 r}{\sqrt n}
+
3 \sqrt{ \frac {1}{2n} \ln \frac 2 \delta}.
\]
\textbf{Hint.} Chapter 13 of the \emph{old} typed notes has everything you need.
\textbf{Remark.} What happens if $\ell$ is used in place of $\tell$?
\end{enumerate}
\end{Q}
\begin{Q}
\textbf{Generalization experiments near initialization.}
This problem will check some basic generalization properties near initialization.
Starter code is provided
in \texttt{hw3.py}:
it is nearly the same as \texttt{hw2.py}, but additionally it fixes a test/train split.
Note that there is very little data so the results are a bit iffy, but I wanted
everyone to be able to run everything quickly on CPU.
\begin{enumerate}
\item
Using the provided shallow ReLU network class,
plot the \emph{empirical logistic risk over the test set minus the empirical
logistic risk over the training set} for $4096$
gradient descent iterations (on the training set) with step size $1.0$
and widths $m \in \{64, 256, 1024 \}$.
(The only difference from last time is that $m=4$ has been dropped.)
For full points, include the plot in your hand-in, and describe it qualitatively
in 1--3 sentences.
\textbf{Remark.} If the terminology is unclear, see \texttt{hw3.py} for the
relevant treatment of the (linear) logistic regression case.
\item
For the same setup as the previous problem part, plot the main term of the
generalization bound from problem 2: namely, plot
\[
\frac {\sqrt m \|(W_t - W_0)^{\T}\|_{2,\infty}}{\sqrt n}
\]
along the vertical axis for all three widths,
with iteration counter $t$ running along the horizontal axis.
For full points, include the plot in your hand-in, and describe it qualitatively
in 1--3 sentences, including a comparison to the observed excess risk from the
previous part.
\item
For the same setup as the previous problem part, let's drop the ``$-W_0$'' term:
plot
\[
\frac {\sqrt m \|W_t^{\T}\|_{2,\infty}}{\sqrt n}
\]
along the vertical axis for all three widths,
with iteration counter $t$ running along the horizontal axis.
For full points, include the plot in your hand-in, and describe it qualitatively
in 1--3 sentences,
including a comparison to the previous bound and to
the observed excess risk.
\item
Lastly, let's plot the dominant term from the Golowich-Rakhlin-Shamir bound given in
lecture: specifically, since the outer layer signs satisfy $\|\vec s\|_2 = \sqrt{m}$,
plot
\[
\frac {\sqrt m \|W_t\|_{\tF}}{\sqrt n}
\]
along the vertical axis for all three widths,
with iteration counter $t$ running along the horizontal axis.
For full points, include the plot in your hand-in, and describe it qualitatively
in 1--5 sentences,
including a comparison to the previous bounds and to
the observed excess risk.
\end{enumerate}
\end{Q}
\end{enumerate}
%\clearpage
%\bibliographystyle{plainnat}
%\bibliography{hw1}
\end{document}