Commit 2911a679 authored by Florent Chatelain's avatar Florent Chatelain

add simus

parent 14a8159c
%% Cell type:code id: tags:
``` python
import numpy as np
import scipy as sp
import matplotlib.pyplot as plt
import scipy.linalg as lin
import scipy.stats as stats
import scipy.sparse.linalg
import scipy.special
import pandas as pd
import seaborn as sns
from numpy.random import default_rng
rng = default_rng(0)
# Gaussian tail function: qfunc(t) = P(N(0,1) > t)
def qfunc(t): return 0.5 - 0.5*scipy.special.erf(t/np.sqrt(2))
%matplotlib inline
```
%% Cell type:markdown id: tags:
## Preamble: useful code and functions
%% Cell type:code id: tags:
``` python
def puncture_X(X, eps):
    # Bernoulli(eps) selection of the entries of X, emulated by sampling with
    # replacement at the (asymptotically) equivalent rate q = -log(1-eps)
    # X is assumed to be (p,n) shaped
    (p, n) = X.shape
    cmpl = False
    if (eps > .5):
        # sample the complement of the selected entries instead
        cmpl = True
        eps = 1-eps
    if (eps <= 0):
        q = 0
    else:
        q = -np.log(1-eps)
    # number of entries of X
    nedges = p*n
    # sample (with replacement) the entry indexes to retain
    indreplace = rng.choice(nedges, replace=True, size=int(np.round(q*nedges)))
    # remove the duplicated sampled indexes
    if (eps > .01):
        if (not cmpl):
            # mask-based deduplication, faster than np.unique for large draws
            ind = -np.ones(nedges, dtype=int)
            ind[indreplace] = indreplace
            ind = ind[ind >= 0]
    else:
        ind = np.unique(indreplace)
    if cmpl:
        # keep the complement of the sampled indexes
        ind = np.arange(nedges)
        ind[indreplace] = -1
        ind = ind[ind >= 0]
    # create the sparsified data matrix
    vals = X.ravel()[ind]
    I, J = np.unravel_index(ind, X.shape)
    X_S = sp.sparse.coo_matrix((vals, (I, J)), shape=X.shape)
    # get the 'csc' representation for efficient column slicing
    X_S = X_S.tocsc()
    return X_S
# useful to create the selection matrix/mask
def ind2sub4low(IND):
    # IND2SUB4LOW: subscripts from a linear index into a 2D lower triangular
    # matrix, restricted to the elements strictly below the diagonal.
    # Returns the subscript pair (I, J) corresponding to the single index IND,
    # all elements on and above the diagonal being excluded.
    I = np.round(np.floor(-.5 + .5 * np.sqrt(1 + 8 * IND)) + 2).astype('int') - 1
    J = np.round((I+1) * (2 - I) / 2 + IND).astype('int') - 1
    return I, J
def mask_B(n, eps, is_diag=1):
    # build the sparse symmetric selection mask B (lower triangular storage)
    cmpl = False
    if (eps > .5):
        cmpl = True
        eps = 1-eps
    # compute the (asymptotic) equivalent selection rate when we sample with replacement
    if (eps <= 0):
        q = 0
    else:
        q = -np.log(1-eps)
    # number of entries in the strictly lower triangular part of B
    nedges = (n*(n-1))//2
    # sample (with replacement) the entry indexes to retain
    indreplace = rng.choice(nedges, replace=True, size=int(np.round(q*nedges)))
    if cmpl:
        ind = np.arange(nedges)
        ind[indreplace] = -1
        ind = ind[ind >= 0]
    else:
        ind = indreplace
        # no need to remove the duplicates (done by the sparse matrix factory)
    # create the sparse selection matrix
    data = np.ones(len(ind))
    I, J = ind2sub4low(ind)
    B = sp.sparse.coo_matrix((data, (I, J)), shape=(n, n))
    # symmetric matrix: no need to store the upper triangular part
    if (is_diag != 0):
        B += sp.sparse.eye(n)
    B = B.tocsr()
    # clip duplicates (the COO -> CSR conversion sums repeated entries)
    B.data[B.data > 1] = 1
    indices = B.indices
    indptr = B.indptr
    return B, indices, indptr
def puncture_K_vanilla(X_S, B):
    # punctured kernel: entrywise mask B applied to the Gram matrix X_S^T X_S / p
    p = X_S.shape[0]
    if sp.sparse.issparse(X_S):
        X_S = X_S.todense()
    K = X_S.T @ X_S
    K = sp.sparse.csr_matrix(B.multiply(K)) / p
    # symmetrize from the lower triangular storage of B
    K = K + sp.sparse.tril(K, k=-1).T
    return K
def gen_synth_mus(p, n, cov_mu):
    # draw the two mean vectors with covariance cov_mu (Kronecker) I_p
    c0 = p/n  # dimension/sample size ratio
    # set the corr coeff between the mean vectors
    rho = cov_mu[0, 1] / np.sqrt(cov_mu[0, 0]*cov_mu[1, 1])
    # draw the mean vectors mu_1,2
    mu0 = rng.normal(size=(p,))
    mu1 = np.sqrt(1-rho**2) * rng.normal(size=(p,)) + rho * mu0
    mu0 = mu0 * np.sqrt(cov_mu[0, 0])
    mu1 = mu1 * np.sqrt(cov_mu[1, 1])
    # stack the means as the columns of a (p, 2) array
    # (reshaping the concatenation to (p, 2) would scramble the entries)
    mus = np.stack((mu0, mu1), axis=1)
    return mus
def simu(eB, eS, P, b=1, sparsity=0):
    # one Monte-Carlo run: X = Z + P with Z standard Gaussian noise,
    # two-way puncturing of the data (eS) and of the kernel (eB)
    (p, n) = P.shape
    Z = rng.normal(size=(p, n))
    X = Z+P
    X_S = puncture_X(X, eS)
    B, _, _ = mask_B(n, eB, is_diag=b)
    K = puncture_K_vanilla(X_S, B)
    if sparsity:
        # only the `sparsity` largest (algebraic) eigenpairs
        lambdas, U = sp.sparse.linalg.eigsh(
            K, k=sparsity, which='LA', tol=0, return_eigenvectors=True)
    else:
        lambdas = np.linalg.eigvalsh(K.todense())
        U = None
    return lambdas, U
```
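%% Cell type:markdown id: tags:
Two quick sanity checks of the helpers above (a hedged illustration added here, not part of the original simulations): drawing `round(q*N)` indexes with replacement, with $q=-\log(1-\varepsilon)$, retains each of the $N$ entries with probability $1-e^{-q}=\varepsilon$ as $N\to\infty$, which is the "equivalent selection rate" used by `puncture_X` and `mask_B`; and `ind2sub4low` should agree with `np.tril_indices(n, k=-1)`.
%% Cell type:code id: tags:
``` python
# sanity checks (illustrative values, not from the original notebook)
N, eps = 100000, .2
q = -np.log(1 - eps)
draws = rng.choice(N, replace=True, size=int(np.round(q * N)))
print("retained fraction: {:.4f} ~ eps = {}".format(len(np.unique(draws)) / N, eps))
# ind2sub4low vs numpy's strictly-lower-triangular indexes
n_test = 7
I, J = ind2sub4low(np.arange(n_test * (n_test - 1) // 2))
I_np, J_np = np.tril_indices(n_test, k=-1)
print("ind2sub4low matches np.tril_indices:",
      np.array_equal(I, I_np) and np.array_equal(J, J_np))
```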
%% Cell type:code id: tags:
``` python
def coeffs_F(eB, eS, c0):
    # coefficients (decreasing degree) of the quartic F whose largest root
    # gives the phase-transition threshold Gamma
    return [1, 2/eS, 1/eS**2*(1-c0/eB), -2*c0/eS**3, -c0/eS**4]

def F(eB, eS, c0, t):
    return np.polyval(coeffs_F(eB, eS, c0), t)

def G(eB, eS, c0, b, t):
    return eS*b + 1/c0*eB*eS*(1+eS*t) + eS/(1+eS*t) + eB/t/(1+eS*t)

def spike(eB, eS, c0, ell, b=1):
    # limiting isolated eigenvalue rho and squared eigenvector alignment zeta
    # for an information strength ell (both zero below the transition Gamma)
    Gamma = np.max(np.roots(coeffs_F(eB, eS, c0)))
    rho = int(ell > Gamma)*G(eB, eS, c0, b, ell)
    zeta = int(ell > Gamma)*(F(eB, eS, c0, ell)*eS**3/ell/(1+eS*ell)**3)
    return rho, zeta

def phase_transition(c0, ell, res=1e-3):
    # for each eB, smallest eS such that the threshold Gamma falls below ell
    eBs = np.arange(res, 1, res)
    eSs = np.zeros(len(eBs))
    for iB, eB in enumerate(eBs):
        eSs[iB] = res
        while np.max(np.roots(coeffs_F(eB, eSs[iB], c0))) > ell and eSs[iB] < 1:
            eSs[iB] += res
    return eBs, eSs
```
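%% Cell type:markdown id: tags:
A hedged usage sketch (parameter values are illustrative, not from the paper): for a rank-one perturbation of strength $\ell$ above the threshold $\Gamma$ (largest root of $F$), the largest eigenvalue returned by `simu` should concentrate around the predicted spike location $\rho$.
%% Cell type:code id: tags:
``` python
# illustrative check of spike() against one Monte-Carlo run
p_chk, n_chk = 200, 2000
c0_chk = p_chk / n_chk
ell_chk = 5.
mu_chk = rng.normal(size=(p_chk,))
mu_chk *= np.sqrt(ell_chk) / np.linalg.norm(mu_chk)  # so that ||mu||^2 = ell
P_chk = np.outer(mu_chk, np.ones(n_chk))  # same mean for every sample
eS_chk, eB_chk = .3, .3
lambdas_chk, _ = simu(eB_chk, eS_chk, P_chk, b=1)
rho_chk, zeta_chk = spike(eB_chk, eS_chk, c0_chk, ell_chk, b=1)
print("largest empirical eigenvalue: {:.3f}".format(lambdas_chk[-1]))
print("predicted spike rho: {:.3f} (alignment zeta: {:.3f})".format(rho_chk, zeta_chk))
```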
%% Cell type:code id: tags:
``` python
def lsd(eB, eS, c0, b=1, xmin=-5, xmax=10, samples=200):
    # limiting spectral density: fixed-point iteration for the Stieltjes
    # transform m(z) slightly above the real axis, then Stieltjes inversion
    # density(x) = Im m(x + i*eta) / pi
    xs = np.linspace(xmin, xmax, samples)
    zs = xs + 1e-4*1j
    watchdog = 50000  # maximum number of fixed-point iterations
    density = np.zeros(len(xs))
    inv_c0 = 1/c0
    m = 0
    for i, z in enumerate(zs):
        m_tmp = np.inf
        w = 0
        while np.abs(m_tmp-m) > 1e-6 and w < watchdog:
            m_tmp = m
            m = 1/(eS*b - z - inv_c0*eS**2*eB*m + inv_c0**2 *
                   eS**3*eB**3*m**2/(1+inv_c0*eB*eS*m))
            w += 1
        density[i] = 1/np.pi*np.imag(m)
        if w == watchdog:
            # iteration did not converge: restart from 0 for the next grid point
            m = 0
    return xs, density
```
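%% Cell type:markdown id: tags:
`lsd` evaluates the limiting density by iterating the fixed-point equation for the Stieltjes transform $m(z)$ at $z=x+i\eta$, $\eta=10^{-4}$, and applying the inversion $\frac1\pi\,\mathrm{Im}\,m(x+i\eta)$. As a hedged consistency check (illustrative parameters, matching Figure 1 below), the returned density should integrate to approximately one over a window containing the support.
%% Cell type:code id: tags:
``` python
# illustrative mass check for lsd (not part of the original notebook)
xs_chk, dens_chk = lsd(.4, .2, .05, b=1, xmin=-5, xmax=10, samples=1000)
dx = xs_chk[1] - xs_chk[0]
print("total mass ~ {:.3f} (should be close to 1)".format(np.sum(dens_chk) * dx))
```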
%% Cell type:markdown id: tags:
## Simulations
#### Figure 1.
Eigenvalue distribution $\nu_n$ of $K$ versus the limiting measure $\nu$, for $p=200$, $n=4\,000$, $x_i\sim .4 \mathcal N(\mu_1,I_p)+.6\mathcal N(\mu_2,I_p)$ for $[\mu_1^T,\mu_2^T]^T \sim\mathcal{N}(0, \frac1p \left[\begin{smallmatrix} 10 & 5.5 \\ 5.5 & 15\end{smallmatrix}\right]\otimes I_p)$; $\varepsilon_S=.2$, $\varepsilon_B=.4$. Sample vs theoretical spikes in blue vs green circles. <b>The two "humps" are reminiscent of the semicircle and Marchenko-Pastur laws.</b>
%% Cell type:code id: tags:
``` python
# Generation of the mean vectors for each sample
p = 200 # dimension size
n = 4000 # sample size
c0 = p/n # dimension/sample size ratio
# covariance used to draw the mean vectors
cov_mu = np.array([[10, 5.5], [5.5, 15]]) / p
mus = gen_synth_mus(p=p, n=n, cov_mu=cov_mu)
# repmat the mean vectors for each sample and stack them in P
cs = [.4, .6] # proportion for each sample
n0 = int(.4*n)
J = np.zeros((n, 2))
J[:n0, 0] = 1
J[n0:, 1] = 1
P = mus@J.T
# Puncturing settings
eS = .2 # data puncturing ratio
eB = .4 # kernel puncturing ratio
b = 1 # kernel matrix diagonal entry
# Empirical Spectrum
lambdas = simu(eB, eS, P, b)[0]
xmin = min(np.min(lambdas)*.8, np.min(lambdas)*1.2)  # account for a possibly negative minimum
xmax = np.max(lambdas)*1.2
xs, density = lsd(eB, eS, c0, b, xmin=xmin, xmax=xmax, samples=200)
M = np.diag(np.sqrt(cs)) @ (mus.T@mus) @ np.diag(np.sqrt(cs))
ells = sp.linalg.eigh(M)[0]
isolated_eig_0 = spike(eB, eS, c0, ells[0], b=1)[0]
isolated_eig_1 = spike(eB, eS, c0, ells[1], b=1)[0]
f, ax = plt.subplots(1, 1)
sns.histplot(lambdas.flatten(), stat='density', ax=ax)
plt.plot(xs, density, 'r', label='Limiting density')
plt.plot(isolated_eig_0, 0.002, 'og', fillstyle='none',
         label=r'limiting spikes $\rho_{1,2}$')
plt.plot(isolated_eig_1, 0.002, 'og', fillstyle='none')
plt.plot(lambdas[-1], 0.002, 'ob', label=r'Largest eigvals')
plt.plot(lambdas[-2], 0.002, 'ob')
plt.legend()
plt.xlim([-2, 5])
_ = plt.show()
```
%% Output
%% Cell type:markdown id: tags:
#### Figure 2.
Illustration of Theorem 2: asymptotic sample-population eigenvector alignment for $\mathcal L=\ell \in\mathbb R$, as a function of the "information strength" $\ell$. Various values of $(\varepsilon_S,\varepsilon_B,c_0)$ are indicated in the legend. Black dashed lines indicate the limiting (small $\varepsilon_S,\varepsilon_B$) phase transition threshold $\Gamma=(\varepsilon_S^2\varepsilon_Bc_0^{-1})^{-\frac12}$. <b>As $\varepsilon_S,\varepsilon_B\to 0$, performance curves coincide when $\varepsilon_B\varepsilon_S^2c_0^{-1}$ is constant (plain versus dashed set of curves).</b>
%% Cell type:code id: tags:
``` python
eSeBc0s = [(.1, .1, 1), (.2, .025, 1), (.1, .2, 2),
           (.05, .05, 1), (.1, .0125, 1), (.05, .1, 2)]
ells = np.linspace(1, 400, 100)
plt.figure(figsize=(8, 4))
for eS, eB, c0 in eSeBc0s:
    plt.plot(ells, [spike(eB, eS, c0, ell)[1] for ell in ells],
             label="({},{},{})".format(eS, eB, c0))
plt.xlabel(r'$\ell$')
plt.ylabel(r'$\zeta$')
plt.grid(True)
_ = plt.legend()
```
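%% Cell type:markdown id: tags:
A minimal sketch for the thresholds mentioned in the caption (assuming the small-$\varepsilon_S,\varepsilon_B$ approximation $\Gamma=(\varepsilon_S^2\varepsilon_B c_0^{-1})^{-\frac12}$): the first three and last three settings above share the same value of $\varepsilon_B\varepsilon_S^2c_0^{-1}$, hence the same $\Gamma$, consistent with the coinciding curves. To draw the black dashed lines, add `plt.axvline(Gamma, color='k', linestyle='--')` inside the plotting loop above.
%% Cell type:code id: tags:
``` python
# thresholds Gamma for the settings plotted above (small-eS,eB approximation)
for eS, eB, c0 in eSeBc0s:
    Gamma = 1/np.sqrt(eS**2*eB/c0)
    print("(eS, eB, c0) = ({}, {}, {}) -> Gamma ~ {:.1f}".format(eS, eB, c0, Gamma))
```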
%% Output
%% Cell type:markdown id: tags:
#### Figure 3.
Phase transition curves $F(\ell)=0$ for $\mathcal L=\ell\in\mathbb R$ and varying values of $\ell$, for $c_0=.05$. Above each phase transition curve, a spike eigenvalue is found away from the support of $\nu$. <b>For large $\ell$, a wide range of $\varepsilon_B$'s (resp., $\varepsilon_S$) is admissible at virtually no performance loss. Here, also, sparser $B$ matrices are more effective than sparser $S$ matrices.</b>
%% Cell type:code id: tags:
``` python
ells = [1, 2, 5]  # spike powers
c0 = .05
f, ax = plt.subplots(1, 2, figsize=(12, 4), sharey=True)
for ell in ells:
    eBs, eSs = phase_transition(c0, ell, res=1e-3)
    ax[0].plot(eBs, eSs, label=r'$\ell$={}'.format(ell))
    ax[1].plot(eSs, eBs, label=r'$\ell$={}'.format(ell))
ax[0].set_xlabel(r'$\varepsilon_B$')
ax[0].set_ylabel(r'$\varepsilon_S$')
ax[0].grid(True)
ax[1].set_xlabel(r'$\varepsilon_S$')
ax[1].set_ylabel(r'$\varepsilon_B$')
ax[1].set_ylim([0, .3])
ax[1].grid(True)
ax[1].legend()
_ = plt.show()
```
%% Output
%% Cell type:markdown id: tags:
#### Figure 4.
Two-way punctured matrices $K$ for <b>(left)</b> $(\varepsilon_S,\varepsilon_B)=(.2,1)$ or <b>(right)</b> $(\varepsilon_S,\varepsilon_B)=(1,.04)$, with $c_0=\frac12$, $n=4\,000$, $p=2\,000$, $b=0$. Clustering setting with $x_i\sim .4\mathcal N(\mu_1,I_p)+.6\mathcal N(\mu_2,I_p)$ for $[\mu_1^T,\mu_2^T]^T\sim \mathcal N(0,\frac1p\left[\begin{smallmatrix} 20 & 12 \\ 12 & 30\end{smallmatrix}\right]\otimes I_p)$. <b>(Top)</b> first $100\times 100$ absolute entries of $K$ (white for zero); <b>(Middle)</b> spectrum of $K$, theoretical limit, and isolated eigenvalues; <b>(Bottom)</b> second dominant eigenvector $\hat v_2$ of $K$ against its theoretical average in red. <b><i>As confirmed by theory, although (top) $K$ is dense for $\varepsilon_B=1$ and sparse for $\varepsilon_B=.04$ ($96\%$ empty) and (middle) the spectra strikingly differ, (bottom) since $\varepsilon_S^2\varepsilon_Bc_0^{-1}$ is constant, the eigenvector alignment $|\hat v_2^T v_2|^2$ is the same in both cases.</i></b>
%% Cell type:code id: tags:
``` python
# Generation of the mean vectors for each sample
p = 2000 # dimension size
n = 4000 # sample size
c0 = p/n # dimension/sample size ratio
# covariance used to draw the mean vectors
cov_mu = np.array([[20, 12], [12, 30]]) / p
mus = gen_synth_mus(p=p, n=n, cov_mu=cov_mu)
# repmat the mean vectors for each sample and stack them in P
cs = [.4, .6] # proportion for each sample
n0 = int(.4*n)
J = np.zeros((n, 2))
J[:n0, 0] = 1
J[n0:, 1] = 1
P = mus@J.T
M = np.diag(np.sqrt(cs)) @ (mus.T@mus) @ np.diag(np.sqrt(cs))
print(M.shape)
# Spikes
ells = sp.linalg.eigh(M)[0]
# Spike principal eigenvector
# FIXME: theoretical average of the eigenvector not implemented yet (zero placeholder)
u_gt = np.zeros(n)
# Puncturing settings
eS = .2 # data puncturing ratio
eB = .04 # kernel puncturing ratio
b = 0 # kernel matrix diagonal entry
# full data matrix
Z = rng.normal(size=(p, n))
X = Z+P
f, ax = plt.subplots(3, 2, figsize=(12, 8))
# S-punctured spectrum
B, _, _ = mask_B(n, 1, is_diag=0)  # full mask (eps_B = 1), diagonal excluded
X_S = puncture_X(X, eS)  # punctured data matrix
K_S = puncture_K_vanilla(X_S, B)  # kernel with diagonal entries removed
l, u_S = sp.sparse.linalg.eigsh(K_S, k=2, which='LA', tol=0,
                                return_eigenvectors=True)
u_S = u_S[:, 1].ravel()  # second principal eigenvector
u_S = u_S * np.sign(u_S[:n0].mean() - u_S[n0:].mean())  # fix the sign
lambdas_S = np.linalg.eigvalsh(K_S.todense())
# B-punctured spectrum
B, _, _ = mask_B(n, eB, is_diag=0)
K_B = puncture_K_vanilla(X, B)  # punctured kernel matrix
l, u_B = sp.sparse.linalg.eigsh(K_B, k=2, which='LA', tol=0,
                                return_eigenvectors=True)
u_B = u_B[:, 1].ravel()  # second principal eigenvector
u_B = u_B * np.sign(u_B[:n0].mean() - u_B[n0:].mean())  # fix the sign
lambdas_B = np.linalg.eigvalsh(K_B.todense())
# S-punctured figures
ax[0, 0].imshow(K_S[:100, :100].todense(), interpolation='nearest', cmap='gray_r')
ax[0, 0].axis('off')
xmin = min(np.min(lambdas_S)*.8, np.min(lambdas_S)*1.2)  # account for a possibly negative minimum
xmax = np.max(lambdas_S)*1.2
xs, density = lsd(1, eS, c0, b, xmin=xmin, xmax=xmax, samples=200)
isolated_eig_0 = spike(1, eS, c0, ells[0], b=b)[0]
isolated_eig_1 = spike(1, eS, c0, ells[1], b=b)[0]
sns.histplot(lambdas_S.flatten(), stat='density', ax=ax[1, 0])
ax[1, 0].plot(xs, density, 'r', label='Limiting density')
ax[1, 0].plot(isolated_eig_0, 0.002, 'og', fillstyle='none', markersize=10,
              label=r'limiting spikes $\rho_{1,2}$')
ax[1, 0].plot(isolated_eig_1, 0.002, 'og', fillstyle='none', markersize=10)
ax[1, 0].plot(lambdas_S[-1], 0.002, 'ob', label=r'Largest eigvals')
ax[1, 0].plot(lambdas_S[-2], 0.002, 'ob')
ax[1, 0].legend()
ax[1, 0].set_xlim([xmin, xmax])
ax[1, 0].set_ylim([0, 2])
ax[1, 0].set_ylabel('')
ax[2, 0].plot(u_S, 'k')
ax[2, 0].plot(u_gt, 'r')
# B-punctured figures
ax[0, 1].imshow(K_B[:100, :100].todense(), interpolation='nearest', cmap='gray_r')
ax[0, 1].axis('off')
xmin = min(np.min(lambdas_B)*.8, np.min(lambdas_B)*1.2)  # account for a possibly negative minimum
xmax = np.max(lambdas_B)*1.2
xs, density = lsd(eB, 1, c0, b, xmin=xmin, xmax=xmax, samples=200)
isolated_eig_0 = spike(eB, 1, c0, ells[0], b=b)[0]
isolated_eig_1 = spike(eB, 1, c0, ells[1], b=b)[0]
sns.histplot(lambdas_B.flatten(), stat='density', ax=ax[1, 1])
ax[1, 1].plot(xs, density, 'r', label='Limiting density')
ax[1, 1].plot(isolated_eig_0, 0.002, 'og', fillstyle='none', markersize=10,
              label=r'limiting spikes $\rho_{1,2}$')
ax[1, 1].plot(isolated_eig_1, 0.002, 'og', fillstyle='none', markersize=10)
ax[1, 1].plot(lambdas_B[-1], 0.002, 'ob', label=r'Largest eigvals')
ax[1, 1].plot(lambdas_B[-2], 0.002, 'ob')
ax[1, 1].set_ylabel('')
ax[1, 1].legend()
ax[1, 1].set_xlim([xmin, xmax])
ax[2, 1].plot(u_B, 'k')
ax[2, 1].plot(u_gt, 'r')
plt.show()
#f.savefig('output.png')
```
%% Output
(2, 2)
%% Cell type:code id: tags:
``` python
f.savefig('output.png')
```
%% Cell type:markdown id: tags:
#### Figure 5.
Limiting probability of error of spectral clustering of $\mathcal N(\pm\mu,I_p)$ with equal class sizes on $K$: as a function of $\varepsilon_B$ for fixed $\ell=\|\mu\|^2=50$ <b>(top)</b>, and of $\varepsilon_S$ for fixed $\ell=50$ <b>(bottom)</b>. Simulations (single realization) in blue crosses for $p=n=4\,000$. <b>Very good fit between theory and practice for not too small $\varepsilon_S,\varepsilon_B$.</b>
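The theoretical curves use the limiting alignment $\zeta$ from `spike`: classifying by the sign of the dominant eigenvector entries yields the asymptotic error $P_e=Q\big(\sqrt{\zeta/(1-\zeta)}\big)$, with $Q$ the Gaussian tail function (`qfunc` from the preamble).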
%% Cell type:code id: tags:
``` python
p = 4000
ns = [2000, 2000]
k = len(ns)
n = np.sum(ns)
c0 = p/n
power = 50
mu = rng.normal(size=(p,))
mu = mu/np.linalg.norm(mu)*np.sqrt(power)
mus = np.array([mu, -mu]).T
J = np.zeros((n, k))
for i in range(k):
    J[int(np.sum(ns[:i])):int(np.sum(ns[:i+1])), i] = 1
P = mus@J.T
ell = np.linalg.norm(mu)**2
f, ax = plt.subplots(2, 1, figsize=(8, 8), sharex=True)

## Perf vs epsilon_B (fixed epsilon_S)
res = 1e-1
eSs = [1/10, 1/20]
eBs = np.linspace(res, 1, int(1/res))
res_thin = 1e-2
eSs_thin = [1/10, 1/20]
eBs_thin = np.linspace(res_thin, 1, int(1/res_thin))
perf_sim = np.zeros((len(eBs), len(eSs)))
perf_th = np.zeros((len(eBs_thin), len(eSs_thin)))
for iS, eS in enumerate(eSs):
    for iB, eB in enumerate(eBs):
        u = simu(eB, eS, P, b=1, sparsity=1)[1]
        overlap = 1/n*(np.sum(u[:ns[0]] > 0) + np.sum(u[ns[0]:] < 0))
        perf_sim[iB, iS] = np.min([overlap, 1-overlap])
    ax[0].plot(eBs, perf_sim[:, iS], 'xb',
               label=r'Simulation, $\varepsilon_S=${}'.format(eS))
for iS, eS in enumerate(eSs_thin):
    for iB, eB in enumerate(eBs_thin):
        zeta = spike(eB, eS, c0, ell, b=1)[1]
        perf_th[iB, iS] = qfunc(np.sqrt(zeta/(1-zeta)))
    ax[0].plot(eBs_thin, perf_th[:, iS], 'r',
               label=r'Theory, $\varepsilon_S=${}'.format(eS))
ax[0].set_xlabel(r'$\varepsilon_B$')
ax[0].set_ylabel(r'$P_e$')
ax[0].grid(True)
ax[0].legend()

## Perf vs epsilon_S (fixed epsilon_B)
eBs = [1/100, 1/400]
eSs = np.linspace(res, 1, int(1/res))
res_thin = 1e-2
eBs_thin = [1/100, 1/400]
eSs_thin = np.linspace(res_thin, 1, int(1/res_thin))
perf_sim = np.zeros((len(eSs), len(eBs)))
perf_th = np.zeros((len(eSs_thin), len(eBs_thin)))
for iB, eB in enumerate(eBs):
    for iS, eS in enumerate(eSs):
        u = simu(eB, eS, P, b=1, sparsity=1)[1]
        overlap = 1/n*(np.sum(u[:ns[0]] > 0) + np.sum(u[ns[0]:] < 0))
        perf_sim[iS, iB] = np.min([overlap, 1-overlap])
    ax[1].plot(eSs, perf_sim[:, iB], 'xb',
               label=r'Simulation, $\varepsilon_B=${}'.format(eB))
for iB, eB in enumerate(eBs_thin):
    for iS, eS in enumerate(eSs_thin):
        zeta = spike(eB, eS, c0, ell, b=1)[1]
        perf_th[iS, iB] = qfunc(np.sqrt(zeta/(1-zeta)))
    ax[1].plot(eSs_thin, perf_th[:, iB], 'r',
               label=r'Theory, $\varepsilon_B=${}'.format(eB))
ax[1].set_xlabel(r'$\varepsilon_S$')
ax[1].set_ylabel(r'$P_e$')
ax[1].grid(True)
_ = ax[1].legend()
```
%% Output
%% Cell type:markdown id: tags:
#### Figure 7.
Empirical classification errors for $2$-class (balanced) fashion-MNIST images (labels `trouser` vs `pullover` in the code below), with $n=512$ (**top**) and $n=2\,048$ (**bottom**). **Theoretically predicted "plateau" behavior observed for all $\varepsilon_B$ not too small.**
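The cells below compare two regimes as $\varepsilon_B$ varies: (i) a fixed data-puncturing rate $\varepsilon_S=\varepsilon_S^{\mathrm{ref}}$, and (ii) an "equi-performance" rate $\varepsilon_S(\varepsilon_B)$ chosen so that the index $\varepsilon_S^2\varepsilon_B c_0^{-1}$ keeps its reference value $(\varepsilon_S^{\mathrm{ref}})^2\varepsilon_B^{\mathrm{ref}}c_0^{-1}$, which (cf. Figure 2) should leave the asymptotic performance unchanged.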
%% Cell type:code id: tags:
``` python
from tensorflow.keras.datasets import fashion_mnist

def gen_MNIST_vectors(n0=1024):
    # draw n0 + n1 fashion-MNIST images from two classes and standardize them
    n1 = n0
    (X, y), _ = fashion_mnist.load_data()
    selected_labels = [1, 2]  # fashion-MNIST labels: 1 = trouser, 2 = pullover
    X0 = X[y == selected_labels[0]].reshape(-1, 28**2)
    X1 = X[y == selected_labels[1]].reshape(-1, 28**2)
    mean0 = X0.mean()
    mean1 = X1.mean()
    # global statistics computed on the full dataset
    data_full = X.reshape(-1, 28**2).astype(float)
    mean_full = data_full.mean(axis=0)
    sd_full_iso = data_full.ravel().std(ddof=1)
    # subsample n0 (resp. n1) images in each class
    ind0 = rng.choice(X0.shape[0], replace=False, size=n0)
    X0 = X0[ind0, :]
    ind1 = rng.choice(X1.shape[0], replace=False, size=n1)
    X1 = X1[ind1, :]
    data = np.concatenate((X0, X1), axis=0).astype(float)  # (n,p) convention
    n, p = data.shape
    c = p/n
    X = (data - data.mean(axis=0)) / sd_full_iso
    X = X.T  # now this is (p,n) convention
    # rough information strength between the two classes
    ell = ((mean0 - mean1) / sd_full_iso)**2
    return X, n0, n1, ell
import sys

def get_perf_clustering(n0=5000, nbMC=10):
    # get the two-class fashion-MNIST vectors
    # (note: s_eps_ref and b_eps_ref are globals, set before calling this function)
    X, n0, n1, ell = gen_MNIST_vectors(n0)
    (p, n) = X.shape
    c = p / n
    # list of epsilon_B puncturing rates
    b_eps_vals = np.array([5e-4, 0.001, 0.002, 0.003, 0.004, 0.005, 0.01,
                           0.012, 0.015, 0.02, 0.03, 0.05, 0.1, 0.5, 1])
    neps = len(b_eps_vals)
    # arrays for the empirical clustering performance
    fixed_s_eps_perf = np.zeros(neps)
    equiv_s_eps_perf = np.zeros(neps)
    # epsilon_S rates in the equi-performance regime:
    # keep s_eps**2 * b_eps / c equal to its reference value
    perf_ref = np.sqrt(s_eps_ref**2 * b_eps_ref / c)
    s_eps_equi_vals = perf_ref / np.sqrt(b_eps_vals / c)
    # ground truth: class 0 and class 1 coded as +1 and -1 resp.
    u_gt = np.ones((n, 1))
    u_gt[n0:, 0] = -1
    for iMC in range(nbMC):
        # progression bar
        sys.stdout.write("\r")
        sys.stdout.write("[progression {}/{}]".format(iMC+1, nbMC))
        sys.stdout.flush()
        # fixed epsilon_S data
        X_S_fix = puncture_X(X, s_eps_ref)  # fixed eps_S regime
        # 'brute force' implementation: use dense arrays to benefit from
        # numpy multithreading and optimization
        X_S_fix = X_S_fix.todense()
        K_full_fix = X_S_fix.T @ X_S_fix
        for i, b_eps in enumerate(b_eps_vals):
            B, _, _ = mask_B(n, b_eps)
            # fixed eps_S performance
            K_fix = sp.sparse.csr_matrix(B.multiply(K_full_fix)) / p
            K_fix = K_fix + sp.sparse.tril(K_fix, k=-1).T
            _, u = sp.sparse.linalg.eigsh(K_fix, k=1, which='LA', tol=0,
                                          return_eigenvectors=True)
            m0 = np.mean(u[:n0])
            m1 = np.mean(u[n0:])
            u = u * np.sign(m0 - m1)
            fixed_s_eps_perf[i] = np.mean(u*u_gt < 0)
            # equi-performance eps_S regime
            s_eps_eq = s_eps_equi_vals[i]
            X_S_eq = puncture_X(X, s_eps_eq)
            K_eq = puncture_K_vanilla(X_S_eq, B)
            _, u = sp.sparse.linalg.eigsh(K_eq, k=1, which='LA', tol=0,
                                          return_eigenvectors=True)
            m0 = np.mean(u[:n0])
            m1 = np.mean(u[n0:])
            u = u * np.sign(m0 - m1)
            equiv_s_eps_perf[i] = np.mean(u*u_gt < 0)
        d = {'Beps': b_eps_vals, 'Seps_equi': s_eps_equi_vals,
             'perf_equi': equiv_s_eps_perf, 'perf_fixed': fixed_s_eps_perf,
             'nbMC': nbMC, 'n0': n0, 'n1': n1}
        if iMC > 0:
            df_tmp = pd.DataFrame(data=d)
            df = pd.concat([df, df_tmp], ignore_index=True)
        else:
            df = pd.DataFrame(data=d)
    return df, n0
def plot_perf_clustering(df, n0, ax):
    # mean +/- sd bands over the Monte-Carlo runs
    sns.lineplot(data=df, x="Beps", y="perf_equi", err_style='band', ci='sd',
                 label=r'$\epsilon_S^2 \epsilon_B=${:.4f}'.format(index_ref),
                 ax=ax)
    sns.lineplot(data=df, x="Beps", y="perf_fixed", err_style='band', ci='sd',
                 label=r'$\epsilon_S=${:.4f}'.format(s_eps_ref),
                 ax=ax)
    ax.legend()
    ax.set_ylabel('')
    ax.set_ylim([0, 0.4])
    ax.set_xlim([0, 0.1])
    ax.grid(True)
```
%% Cell type:code id: tags:
``` python
nbMC = 40
s_eps_ref = .1   # reference data puncturing rate epsilon_S
b_eps_ref = 0.1  # reference kernel puncturing rate epsilon_B
index_ref = s_eps_ref**2 * b_eps_ref
df_1, n0_1 = get_perf_clustering(n0=256, nbMC=nbMC)
df_2, n0_2 = get_perf_clustering(n0=1024, nbMC=nbMC)
```
%% Output
%% Cell type:code id: tags:
``` python
f, ax = plt.subplots(2, 1, figsize=(8, 8), sharex=True)
plot_perf_clustering(df_1, n0_1, ax[0])
plot_perf_clustering(df_2, n0_2, ax[1])
```
%% Output
%% Cell type:code id: tags:
``` python
```