Commit 00bb10d2 authored by Florent Chatelain's avatar Florent Chatelain
Browse files

fix typos

parent e76dcb05
......@@ -3,43 +3,44 @@
This notebook can be run on mybinder: [![Binder](https://mybinder.org/badge_logo.svg)](https://mybinder.org/v2/git/https%3A%2F%2Fgricad-gitlab.univ-grenoble-alpes.fr%2Fchatelaf%2Fml-sicom3a/master?urlpath=lab/tree/notebooks/7_Clustering/N1_Kmeans_basic.ipynb)
%% Cell type:markdown id: tags:
# KMEANS basics
The purpose of this lab is to implement simple 1D Kmeans clustering algorithm, and compare the obtained results with those obtained using skleran implementation
The purpose of this lab is to implement a simple 1D Kmeans clustering algorithm, and to compare its results with those obtained using the sklearn implementation
%% Cell type:code id: tags:
``` python
import numpy as np
import matplotlib.pyplot as plt
from scipy.io import loadmat
%matplotlib inline
```
%% Cell type:markdown id: tags:
## import data from matlab file :
%% Cell type:code id: tags:
``` python
Data=loadmat('fictitious_train.mat')
Data = loadmat("fictitious_train.mat")
print(Data.keys())
X=Data.get('Xtrain')
print('dim of X:{}'.format(X.shape))
X = Data.get("Xtrain")
print("dim of X:{}".format(X.shape))
```
%% Cell type:markdown id: tags:
## Compute the histogram
%% Cell type:code id: tags:
``` python
bins=np.arange(np.min(X)-1,np.max(X)+2,1)
hist_val,bins=np.histogram(X, bins=bins)
bins = np.arange(np.min(X) - 1, np.max(X) + 2, 1)
hist_val, bins = np.histogram(X, bins=bins)
print(hist_val)
print(bins)
```
%% Cell type:markdown id: tags:
......@@ -47,13 +48,13 @@
### or directly visualize the histogram
%% Cell type:code id: tags:
``` python
bins=np.arange(np.min(X)-1,np.max(X)+2,1)
plt.scatter(X,np.zeros_like(X)+.5,c='red',marker='+')
n,bin_edges,patches=plt.hist(x=X,bins=bins, color='blue',histtype='step')
bins = np.arange(np.min(X) - 1, np.max(X) + 2, 1)
plt.scatter(X, np.zeros_like(X) + 0.5, c="red", marker="+")
n, bin_edges, patches = plt.hist(x=X, bins=bins, color="blue", histtype="step")
```
%%%% Output: display_data
![]()
......@@ -61,92 +62,91 @@
%% Cell type:markdown id: tags:
## Implementation of Kmeans on a simple case
In this example, the number of clusters is assumed to be known.
### Exercize 1 :
### Exercise 1 :
- Explain/ comment the code below
- What is the main problem left aside by this code?
%% Cell type:code id: tags:
``` python
K=2 #nb of clusters
p=1 # dimension (the code below is given for p=1 only)
K = 2 # nb of clusters
p = 1 # dimension (the code below is given for p=1 only)
```
%% Cell type:code id: tags:
``` python
N=X.size
idx=np.zeros((N,1))
muvec=np.zeros((K,1))
N = X.size
idx = np.zeros((N, 1))
muvec = np.zeros((K, 1))
change = True # Defines the test variable for the loop.
# Default is true (meaning that a new iteration will be performed)
perm = np.random.permutation(N)[0:2]
# takes two different random integers between 0 and $N$
change = True # Defines the test variable for the loop.
# Default is true (meaning that a new iteration will be performed
perm=np.random.permutation(N)[0:2]
# takes two different random integers between 0 and $N$
for k in range(0, K):
muvec[k] = X[perm[k], :] # Initialization of the cluster representatives (centers)
for k in range (0,K):
muvec[k] = X[perm[k],:] #Initialization of the cluster representatives (centers)
for i in range(0, N):
d = (X[i] - muvec) ** 2 # Computation of distances wrt cluster centers
idx[i] = np.where(d == d.min())[0] # label = index of closest center
for i in range (0,N):
d=(X[i] - muvec )**2 #Computation of distances wrt cluster centers
idx[i]=np.where(d==d.min())[0] #label = index of closest center
while change:
change=False
#update
for k in range (0,K):
muvec[k]= np.mean( X[idx == k] ) #compute new centers
#prediction
for i in range (0,N):
d=(X[i] - muvec )**2 #Computation of distances wrt cluster centers
index=np.where(d==d.min())[0]##label = index of closest center
if index != idx[i]: #check if some indices changed
change=True
idx[i]=index #replaces new index set
X0=X[idx==0]
X1=X[idx==1]
bins=np.arange(np.min(X)-1,np.max(X)+2,1)
n,bin_edges,patches=plt.hist(x=X,bins=bins, color='blue',histtype='step')
plt.scatter(X0,np.zeros_like(X0)+.5,c='red',marker='+', label='class 0')
plt.scatter(X1,np.zeros_like(X1)+.5,c='green',marker='+',label='class 1')
change = False
# update
for k in range(0, K):
muvec[k] = np.mean(X[idx == k]) # compute new centers
# prediction
for i in range(0, N):
d = (X[i] - muvec) ** 2 # Computation of distances wrt cluster centers
index = np.where(d == d.min())[0] ##label = index of closest center
if index != idx[i]: # check if some indices changed
change = True
idx[i] = index # replaces new index set
X0 = X[idx == 0]
X1 = X[idx == 1]
bins = np.arange(np.min(X) - 1, np.max(X) + 2, 1)
n, bin_edges, patches = plt.hist(x=X, bins=bins, color="blue", histtype="step")
plt.scatter(X0, np.zeros_like(X0) + 0.5, c="red", marker="+", label="class 0")
plt.scatter(X1, np.zeros_like(X1) + 0.5, c="green", marker="+", label="class 1")
plt.legend()
h=plt.gcf()
h = plt.gcf()
```
%%%% Output: display_data
![]()
%% Cell type:markdown id: tags:
### Exercize 2 : sklearn implementation
### Exercise 2 : sklearn implementation
- Compare the results obtained with the simple code above
- Comment and explain the role of the input parameters used in this implementation
%% Cell type:code id: tags:
``` python
#https://scikit-learn.org/stable/modules/generated/sklearn.cluster.KMeans.html
# https://scikit-learn.org/stable/modules/generated/sklearn.cluster.KMeans.html
from sklearn.cluster import KMeans
kmeans = KMeans(n_clusters = 2, init = 'k-means++', max_iter = 10, n_init = 10, random_state = 0)
kmeans = KMeans(n_clusters=2, init="k-means++", max_iter=10, n_init=10, random_state=0)
kmeans.fit(X)
y_kmeans = kmeans.fit_predict(X)
Y0=X[y_kmeans==0]
Y1=X[y_kmeans==1]
plt.scatter(Y0,np.zeros_like(Y0)+.7,c='red',marker='o', label='class 0 skl')
plt.scatter(Y1,np.zeros_like(Y1)+.7,c='green',marker='o',label='class 1 skl')
plt.scatter(X0,np.zeros_like(X0)+.5,c='red',marker='+', label='class 0')
plt.scatter(X1,np.zeros_like(X1)+.5,c='green',marker='+',label='class 1')
Y0 = X[y_kmeans == 0]
Y1 = X[y_kmeans == 1]
plt.scatter(Y0, np.zeros_like(Y0) + 0.7, c="red", marker="o", label="class 0 skl")
plt.scatter(Y1, np.zeros_like(Y1) + 0.7, c="green", marker="o", label="class 1 skl")
plt.scatter(X0, np.zeros_like(X0) + 0.5, c="red", marker="+", label="class 0")
plt.scatter(X1, np.zeros_like(X1) + 0.5, c="green", marker="+", label="class 1")
plt.legend()
```
%%%% Output: execute_result
......
......@@ -11,130 +11,133 @@
``` python
import numpy as np
import matplotlib.pyplot as plt
from scipy.io import loadmat
import scipy.stats as stats
%matplotlib inline
```
%% Cell type:markdown id: tags:
## Create data set :
%% Cell type:code id: tags:
``` python
D1=np.random.randn(80,)*.1 +1
P1=np.random.rand(80,)*2*np.pi
D2=np.random.randn(40,)*.2
P2=np.random.rand(40,)*2*np.pi
C1=np.zeros((80,2))
C1[:,0]=D1*np.cos(P1)
C1[:,1]=D1*np.sin(P1)
C2=np.zeros((40,2))
C2[:,0]=D2*np.cos(P2)
C2[:,1]=D2*np.sin(P2)
D1 = np.random.randn(80,) * 0.1 + 1
P1 = np.random.rand(80,) * 2 * np.pi
D2 = np.random.randn(40,) * 0.2
P2 = np.random.rand(40,) * 2 * np.pi
C1 = np.zeros((80, 2))
C1[:, 0] = D1 * np.cos(P1)
C1[:, 1] = D1 * np.sin(P1)
C2 = np.zeros((40, 2))
C2[:, 0] = D2 * np.cos(P2)
C2[:, 1] = D2 * np.sin(P2)
plt.subplot(121)
fig=plt.scatter(C1[:,0],C1[:,1],marker='+', color='blue')
fig=plt.scatter(C2[:,0],C2[:,1],marker='o', color='red')
plt.axis('equal')
plt.title('theoretical')
X=np.append(C1,C2,axis=0)
fig = plt.scatter(C1[:, 0], C1[:, 1], marker="+", color="blue")
fig = plt.scatter(C2[:, 0], C2[:, 1], marker="o", color="red")
plt.axis("equal")
plt.title("theoretical")
X = np.append(C1, C2, axis=0)
plt.subplot(122)
plt.scatter(X[:,0],X[:,1])
plt.axis('equal')
plt.title('observed');
plt.scatter(X[:, 0], X[:, 1])
plt.axis("equal")
plt.title("observed");
```
%%%% Output: display_data
![]()
%% Cell type:markdown id: tags:
### Question 6
### Exercise 6
- Briefly explain why the usual Kmeans algorithm will fail to detect the classes above
- Is the Kernel approach the only possibily for this kind of clustering problem?
- Is the Kernel approach the only possibility for this kind of clustering problem?
%% Cell type:markdown id: tags:
### Exercice 7
### Exercise 7
- Propose a change of representation space to allow successfull Kmeans clustering in a 1D space. Implement it (use Kmeans_basic.ipynb example)
- Propose a change of representation space to allow successful Kmeans clustering in a 1D space. Implement it (use Kmeans_basic.ipynb example)
- Explain the role of the parameter 'gamma', then change it in the Kernel Kmeans code below and comment on your findings
- Compare the initialization of this algorithm with the type of initialization uszed in the previous studies of Kmeans.
- Compare the initialization of this algorithm with the type of initialization used in the previous studies of Kmeans.
%% Cell type:code id: tags:
``` python
#Kernel computation
N=X.shape[0]
Ker=np.zeros((N,N))
# Kernel computation
N = X.shape[0]
Ker = np.zeros((N, N))
gamma = 5
gamma=5
for i in range(0, N):
for j in range(0, N):
d = np.sum((X[i, :] - X[j, :]) ** 2)
Ker[i, j] = np.exp(-gamma * d)
for i in range(0,N):
for j in range(0,N):
d=np.sum((X[i,:]-X[j,:])**2)
Ker[i,j]=np.exp(-gamma*d)
# Init
import numpy.matlib
converged = 0;
converged = 0
# Kernel K-means is sensitive to initial conditions (as is Kmeans). Try altering
# this initialisation to see the effect.
# this initialisation to see the effect.
K = 2;
Z = np.matlib.repmat(np.array([1,0]),N,1);
perm=np.random.permutation(N)[0:np.intc(N/2)]
Z[perm,:]=[0,1]
K = 2
Z = np.matlib.repmat(np.array([1, 0]), N, 1)
perm = np.random.permutation(N)[0 : np.intc(N / 2)]
Z[perm, :] = [0, 1]
di=np.zeros((N,K))
count=0
di = np.zeros((N, K))
count = 0
while converged == 0:
count+=1
Nk=np.sum(Z,axis=0)
converged=1
for k in range(0,K):
Vk=Z[:,k].reshape(N,1)
di[:,k]=np.diag(Ker)\
-(2/Nk[k])*np.sum(np.matlib.repmat(Vk.transpose(),N,1)*Ker,axis=1)\
+(float(Nk[k])**(-2))*np.sum( np.sum( \
(Vk@Vk.transpose())*Ker,axis=0), axis=0 )
oldZ=np.copy(Z)
Z=np.zeros((N,K))
for i in range (0,N):
if di[i,0]<di[i,1]:
Z[i,:]=[1,0]
if Z[i,0]!=oldZ[i,0] :
converged=0
count += 1
Nk = np.sum(Z, axis=0)
converged = 1
for k in range(0, K):
Vk = Z[:, k].reshape(N, 1)
di[:, k] = (
np.diag(Ker)
- (2 / Nk[k]) * np.sum(np.matlib.repmat(Vk.transpose(), N, 1) * Ker, axis=1)
+ (float(Nk[k]) ** (-2))
* np.sum(np.sum((Vk @ Vk.transpose()) * Ker, axis=0), axis=0)
)
oldZ = np.copy(Z)
Z = np.zeros((N, K))
for i in range(0, N):
if di[i, 0] < di[i, 1]:
Z[i, :] = [1, 0]
if Z[i, 0] != oldZ[i, 0]:
converged = 0
else:
Z[i,:]=[0,1]
if Z[i,1]!=oldZ[i,1] :
converged=0
#visu
IndC0=np.where(Z[:,0]==1)[0]
IndC1=np.where(Z[:,1]==1)[0]
plt.scatter(X[IndC0,0],X[IndC0,1],color='green',marker='o')
plt.scatter(X[IndC1,0],X[IndC1,1],color='cyan',marker='o')
plt.axis('equal');
Z[i, :] = [0, 1]
if Z[i, 1] != oldZ[i, 1]:
converged = 0
# visu
IndC0 = np.where(Z[:, 0] == 1)[0]
IndC1 = np.where(Z[:, 1] == 1)[0]
plt.scatter(X[IndC0, 0], X[IndC0, 1], color="green", marker="o")
plt.scatter(X[IndC1, 0], X[IndC1, 1], color="cyan", marker="o")
plt.axis("equal")
print('converged in {} iterations'.format(count))
print("converged in {} iterations".format(count))
```
%%%% Output: display_data
![]()
......
This diff is collapsed.
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment