\documentclass{article}
\usepackage[margin=1in]{geometry}
\usepackage{hyperref}
\usepackage{amsmath,amsfonts,amssymb,amsthm,commath,dsfont,nicefrac,xfrac}
\usepackage{enumitem}
\usepackage{framed}
\usepackage{xspace}
\usepackage{microtype}
\usepackage[round]{natbib}
\usepackage{cleveref}
\usepackage[dvipsnames]{xcolor}
\def\T{{\scriptscriptstyle\mathsf{T}}}
% following loops stolen from djhsu
\def\ddefloop#1{\ifx\ddefloop#1\else\ddef{#1}\expandafter\ddefloop\fi}
\def\ddef#1{\expandafter\def\csname bb#1\endcsname{\ensuremath{\mathbb{#1}}}}
\ddefloop ABCDEFGHIJKLMNOPQRSTUVWXYZ\ddefloop
\def\ddef#1{\expandafter\def\csname c#1\endcsname{\ensuremath{\mathcal{#1}}}}
\ddefloop ABCDEFGHIJKLMNOPQRSTUVWXYZ\ddefloop
\def\ddef#1{\expandafter\def\csname v#1\endcsname{\ensuremath{\boldsymbol{#1}}}}
\ddefloop ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz\ddefloop
\DeclareMathOperator*{\argmin}{arg\,min}
\DeclareMathOperator*{\argmax}{arg\,max}
\def\SPAN{\textup{span}}
\def\tu{\textup{u}}
\def\R{\mathbb{R}}
\def\E{\mathbb{E}}
\def\Z{\mathbb{Z}}
\def\be{\mathbf{e}}
\def\nf{\nabla f}
\def\veps{\varepsilon}
\def\cl{\textup{cl}}
\def\inte{\textup{int}}
\def\dom{\textup{dom}}
\def\Rad{\textup{Rad}}
\def\lsq{\ell_{\textup{sq}}}
\def\hcR{\widehat{\cR}}
\def\nhcR{\nabla\widehat{\cR}}
\def\hcRl{\hcR_\ell}
\def\cRl{\cR_\ell}
\def\hcE{\widehat{\cE}}
\def\cEl{\cE_\ell}
\def\hcEl{\hcE_\ell}
\def\eps{\epsilon}
\def\1{\mathds{1}}
\newcommand{\red}[1]{{\color{red} #1}}
\newcommand{\blue}[1]{{\color{blue} #1}}
\def\srelu{\sigma_{\textup{r}}}
\def\vsrelu{\vec{\sigma_{\textup{r}}}}
\def\vol{\textup{vol}}
\def\barw{\bar w}
\def\conv{\textup{conv}}
\def\bv{\textsc{bv}}
\def\lip{\textsc{lip}}
\newcommand{\ip}[2]{\left\langle #1, #2 \right \rangle}
\newtheorem{fact}{Fact}
\newtheorem{lemma}{Lemma}
\newtheorem{claim}{Claim}
\newtheorem{proposition}{Proposition}
\newtheorem{theorem}{Theorem}
\newtheorem{corollary}{Corollary}
\newtheorem{condition}{Condition}
\theoremstyle{definition}
\newtheorem{definition}{Definition}
\theoremstyle{remark}
\newtheorem{remark}{Remark}
\newtheorem{example}{Example}
\newenvironment{Q}
{%
\clearpage
\item
}
{%
\phantom{s} %lol doesn't work
\bigskip
\noindent\textbf{Solution.} \emph{(If using this template, please write your solution here.)}
}
\title{CS 540 DLT --- Homework 2.}
\author{\emph{your NetID here.}}
\date{Version $1$.}
\begin{document}
\maketitle
\noindent\textbf{Instructions.}
\begin{itemize}
\item
Homework is due \textbf{Wednesday, November 16, at 11:59pm};
no late homework accepted.
\item
You must work individually for this homework.
\item
Excluding office hours and high-level discussions on Discord,
you may discuss with at most three other people;
please state their NetIDs clearly on the first page of your submission.
\item
Homework must be typed and submitted via Gradescope.
Please consider using the provided {\LaTeX} file as a template.
\item
Each part of each problem is worth 3 points.
\item
For any problem asking you to construct something, for full credit you must
always formally prove your construction works.
\item
General course and homework policies are
\href{http://mjt.cs.illinois.edu/courses/dlt-f22/}{on the course webpage}.
\end{itemize}
\vspace{2em}
\noindent\textbf{Version history.}
\begin{enumerate}
\item[$1$.] Initial version.
\end{enumerate}
\begin{enumerate}[font={\Large\bfseries},left=0pt]
\begin{Q}
\textbf{Clarke differentials.}
\def\subg{\partial_{\textup{s}}}
\def\supg{\partial_{\textup{u}}}
Recall the definition of Clarke differential:
\[
\partial f(w) := \conv\cbr{ \lim_i \nf(w_i) \ : \ w_i \to w,
\ \nabla f(w_i) \text{ exists},
\ \lim_i \nf(w_i) \text{ exists} },
\]
where ``$\conv$'' denotes the convex hull.
Additionally, given a set $U\subseteq \R^d$, define subgradients and supergradients
\emph{relative to $U$} as
\begin{align*}
\subg f(w)
&:= \cbr{ s \in \R^d : \forall w' \in U\centerdot f(w') \geq f(w) + \ip{s}{w'-w} },
\\
\supg f(w)
&:= \cbr{ s \in \R^d : \forall w' \in U\centerdot f(w') \leq f(w) + \ip{s}{w'-w} }.
\end{align*}
Define a function $g:\R^2 \to \R$ as
\[
g(w) := |w_1| - |w_2|.
\]
\begin{enumerate}
\item
Prove that $g$ is locally Lipschitz.
\item
Prove that
$
\partial g(0) = \{ w \in \R^2 : \|w\|_\infty \leq 1\}
$.
\textbf{Hint:} the vector field view of this function in lecture 18 makes
this much easier, and yes we gave the answer in class, but give a rigorous check
here.
\item
Prove that for every $\tau > 0$, no element of $\partial g(0)$ is a subgradient or
supergradient of $g$ relative to $\{ w \in \R^2 : \|w\|_\infty < \tau \}$.
\textbf{Remark:} this gives a rigorous backing to the comment in lecture that
we cannot use local subgradients and supergradients as the basis of a
general differential.
\item
Prove that for any starting point $w \in \{ (a, 0) : a \in \R \} \subseteq \R^2$,
the Clarke differential inclusion has multiple solutions.
\textbf{Remark:} we handled $w = (0,0)$ in class and you can just say that, don't
need to reprove that case. For full points, you must explicitly provide multiple paths
and show that they satisfy the differential inclusion.
\item
Prove that for any starting point $w \in \{ (a, b) : a \in \R, b \in \R\setminus \{0\}\}$,
the Clarke differential inclusion has a unique solution.
\end{enumerate}
\end{Q}
\begin{Q}
\textbf{Eigenvalues of expected kernel.}
In the strongly-convex NTK proof scheme,
we needed the \emph{kernel Gram matrix}
to have nicely-behaved eigenvalues.
In this problem, we will work out these eigenvalues in the infinite-width shallow network setting
with differentiable activations; in homework 3, we'll convert this into a finite-width bound.
Throughout this problem let examples $(x_i)_{i=1}^n$ be given and fixed with $\|x_i\|\leq 1$,
and collect all examples as rows of a matrix $X\in\R^{n\times d}$.
Given a differentiable activation function $\sigma$, the shallow \emph{Gram matrix}
$G\in\R^{n\times n}$ corresponding to this activation (in the NTK setting) is
\begin{align*}
G_{ij}
&:= \bbE_w x_i^\T x_j \sigma'(w^\T x_i)\sigma'(w^\T x_j),
\end{align*}
where $w$ is a standard Gaussian random vector.
We will not prove meaningful bounds, we will merely show that
$G$ has full rank, though the bounds on the eigenvalues which
can be extracted from this proof are sufficient for the setup in lecture.
First let's establish some basic sanity checks.
\begin{enumerate}
\item
Suppose a \emph{linear network}, meaning $\sigma(z) = z$.
Prove that $G$ being full rank implies $d \geq n$.
\item
Suppose there exists a pair $x_i=x_j$ with $i\neq j$;
prove in the general case ($\sigma$ possibly nonlinear)
that $G$ does not have full rank.
\end{enumerate}
To handle the activations in the nonlinear case, we will need
the \emph{(normalized) Probabilist's Hermite polynomials $(p_k)_{k=0}^\infty$}.
These satisfy many magical properties, but the ones we will need are as follows.
\begin{itemize}
\item
$p_k$ is a polynomial of degree $k$.
\item
If $w$ is a standard Gaussian random vector,
then $\bbE p_k(w^\T x_i) p_l(w^\T x_j) = (x_i^\T x_j)^k \1[k = l]$;
this equality goes a little beyond the usual claim that Hermite polynomials
are \emph{orthonormal} with respect to an inner product
defined by Gaussian integration.
\item
If $h$ is a function with $\bbE |h(g)| < \infty$
where $g$ is a standard univariate Gaussian random variable,
then there exist \emph{Hermite coefficients} $(c_k)_{k=0}^\infty$
with $c_k = \bbE h(g) p_k(g)$
such that $h(x) = \sum_{k=0}^\infty c_k p_k(x)$.
\end{itemize}
For the remaining parts of the problem, fix an activation $\sigma$ (potentially nonlinear)
and let $(c_k)_{k\geq 0}$ denote the Hermite coefficients of $\sigma'$.
\begin{enumerate}[start = 3]
\item
Prove that $G_{ij} = \sum_{k\geq 0} c_k^2 (x_i^\T x_j)^{k+1}$.
\item
Prove that if $G_{jj} > \sum_{i\neq j} |G_{ij}|$ for each $j$,
then $G$ is positive definite
(and thus has full rank).
\textbf{Hint:}
since $G$ is real and symmetric, it has real eigenvalues.
Take any pair $(\lambda, v)$ with $\lambda v = Gv$, and choose
$j = \argmax_j |v_j|$. After some algebra and the provided condition,
it follows that $\lambda>0$.
\item
Suppose $\|x_i\| = 1$, and $x_i\neq \pm x_j$ whenever $i \neq j$,
and that $\sigma'$ has infinitely many nonzero Hermite coefficients
(meaning $\sup\{ k : c_k\neq 0 \} = \infty$).
Prove that $G$ is positive definite (and thus has full rank).
\textbf{Hint:}
you may use the \emph{Schur product theorem} without proof:
if $A,B\succeq 0$, then $\det(A \circ B) \geq \det(A)\cdot\det(B)$, where
$A\circ B$ denotes the element-wise product.
\end{enumerate}
\textbf{Remark 1:} one of our ``universal approximation'' themes was that we
are fine so long as our activation is not a polynomial. In this setting, if
the activation is not a polynomial, it will have an infinite Hermite expansion.
Consequently, this result holds for the sigmoid, and also the ReLU (after being
careful about the nondifferentiability).
\textbf{Remark 2:}
if we try to use this proof technique to give a lower bound on the eigenvalues,
it will be pretty bad, since standard activations all have fast decay of Hermite
coefficients.
\end{Q}
\begin{Q}
\textbf{Experiments near initialization.}
This problem will check some basic properties near initialization. Starter code is provided
in \texttt{hw2.py}:
it loads the classical \emph{iris data}, runs (linear) logistic regression on the
two chosen classes,
and defines a neural net class.
(For further python help, see for instance my \texttt{pytorch} tutorial:
\url{https://mjt.cs.illinois.edu/ml/pytorch_basics.pdf},
which is generated from a Jupyter notebook at
\url{https://mjt.cs.illinois.edu/ml/pytorch_basics.ipynb}.)
Note that this data is linearly separable and very small, so this problem
is a toy warm-up, no more.
\textbf{Coding note:} some \texttt{pytorch}
versions may complain about type errors and/or about
the \texttt{dtype} argument not existing. You can fix both issues by using expressions
of the form ``\texttt{.type(X.dtype)}'' or appropriate equivalents, as already exist in
the code.
\begin{enumerate}
\item
Using the provided shallow ReLU network class,
plot empirical logistic risk curves for $4096$ iterations with step size $1.0$
and widths $m \in \{4, 64, 256, 1024 \}$. (That is, the horizontal axis is iterations,
vertical axis is empirical logistic risk.)
For full points: include the plot here, and describe it qualitatively in 1-3 sentences.
\textbf{Coding remark:} when $m=4$, with probability $1/8$, the output layer is all $+1$ or
all $-1$, and the training error will stay large. If your plot happens to have such
a situation (for $m=4$ only of course), you do not need to explain it.
\item
For the same setup as the previous part (including the four choices of $m$),
plot $\|W_t - W_0\|^{4/3} / m^{1/6}$, with $t$ as the horizontal axis once again.
(These exponents are chosen to match Lemma 4.1 in the typed notes.)
For full points: include the plot here, and describe it qualitatively in 1-3 sentences,
including a discussion of whether you think it supports or negates parts of the NTK story.
\end{enumerate}
\end{Q}
\end{enumerate}
\end{document}