2403res62 - CS564 - Assignment 4 - K-Means on Iris - Intrinsic CVIs
2403res62
Python code
import pandas as pd
import matplotlib.pyplot as plt
import math
import numpy as np
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.metrics import davies_bouldin_score
url="./Iris.csv" # raw-github-link
df=pd.read_csv(url) #
converting csv to dataframe
df=df.drop(['Id'], axis=1) #
removing the column 'Id'
tot_rows=len(df.index) #
Counting no. of rows in dataframe
tot_cols=len(df.columns) #
Counting no. of columns in dataframe
data_clustering = df.iloc[:,[0,1,2,3]].values
####################################################################################################
# ELBOW METHOD TO FIND BEST "K" in K-Means
####################################################################################################
SSE_Total = []
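# NOTE: the loop that populates SSE_Total was evidently lost from the listing; a minimal
# reconstruction is sketched below, assuming K-Means is run for k = 1..10 on the feature array
# and the total SSE (inertia_) of each partitioning is recorded. The init/n_init/random_state
# settings are assumptions, not taken from the original.
for k in range(1, 11):
    km = KMeans(n_clusters=k, init='k-means++', n_init=10, random_state=42)
    km.fit(data_clustering)
    SSE_Total.append(km.inertia_)    # total within-cluster sum of squared errors for this k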
plt.plot(range(1,11),SSE_Total)
plt.title('Finding Optimum number of Clusters with Elbow Method')
plt.xlabel('Numbers of Clusters in K-Means')
plt.ylabel('Total SSE of the Partitionings')
plt.show()
SSE_Total_Slopes = []
for i in range(1, 9, 1):
    slope = -1 * (SSE_Total[i] - SSE_Total[i-1])
    SSE_Total_Slopes.append(slope)
print(len(SSE_Total_Slopes))
plt.plot(range(2,10),SSE_Total_Slopes)
plt.title('d(SSE_Total)/dx Curve')
plt.xlabel('Number of Clusters')
plt.ylabel('d(SSE_Total)/dx')
plt.show()
SSE_Total_rate_of_change_of_Slopes = []
for i in range(1, 7, 1):
    rate_of_change_of_slope = -1 * (SSE_Total_Slopes[i] - SSE_Total_Slopes[i-1])
    SSE_Total_rate_of_change_of_Slopes.append(rate_of_change_of_slope)
plt.plot(range(3,9),SSE_Total_rate_of_change_of_Slopes)
plt.title('d2(SSE_Total)/dx2 Curve')
plt.xlabel('Number of Clusters')
plt.ylabel('d2(SSE_Total)/dx2')
plt.show()
Coordinates = []
for i in range(1, 7, 1):
    x = i + 2    # number of clusters corresponding to this second difference
    y = SSE_Total_rate_of_change_of_Slopes[i-1]
    t = [x, y]
    Coordinates.append(t)
max_roc_slope = max(SSE_Total_rate_of_change_of_Slopes)
print("\n\n**********************************************************************
*********************************************************************************
*****************************")
print("**************************************************************************
*********************************************************************************
*************************\n")
print("\n\n**********************************************************************
*********************************************************************************
*****************************")
print("**************************************************************************
*********************************************************************************
*************************\n")
break
####################################################################################################
# K-Means on Iris dataset with K=3
####################################################################################################
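# NOTE: the statement that actually fits K-Means with K=3 is missing from the listing; a
# minimal reconstruction is sketched below (the exact KMeans parameters are assumptions).
Partitionings_Iris = KMeans(n_clusters=3, init='k-means++', n_init=10, random_state=42)
Partitionings_Iris.fit(data_clustering)    # fit on the four Iris features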
Clustering_Iterations = Partitionings_Iris.n_iter_
print(f"\n\nTotal Number of Iterations Clustering the Iris-DataSet with K-Means(with K=3) :\n{Clustering_Iterations}\n")
Cluster_Labels = Partitionings_Iris.labels_
print(f"\nCluster Labels for Iris-DataSet with K-Means(with K=3) :\n{Cluster_Labels}\n")
Clustering_SSE = Partitionings_Iris.inertia_
print(f"\nTotal SSE for the Final Partitionings of Iris-DataSet with K-Means(with K=3) :\n{Clustering_SSE}\n")
Cluster_Centroid_Coordinates = Partitionings_Iris.cluster_centers_
ccc = Cluster_Centroid_Coordinates
print(f"\nFinal Centroids of the different clusters in Iris-DataSet with K-Means(with K=3) :\n{Cluster_Centroid_Coordinates}\n\n")
print("\n\n" + "*"*120)
print("*"*120 + "\n")
####################################################################################################
# Silhouette Score calculation using sklearn-library
####################################################################################################
# As per the sklearn library documentation, the definition of the silhouette score is
# somewhat different from the one used in our from-scratch implementation further below:
# a: the mean distance between a sample and all other points in the same cluster.
# b: the mean distance between a sample and all other points in the next nearest cluster.
# The Silhouette Coefficient s for a single sample is then given as: s = (b - a) / max(a, b)
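# NOTE: the actual call to sklearn's silhouette_score appears to have been dropped from the
# listing; a minimal sketch is given below (the variable name Sklearn_Silhouette is ours).
Sklearn_Silhouette = silhouette_score(data_clustering, Cluster_Labels)
print(f"\nSilhouette Score (sklearn) for the K=3 partitioning of the Iris-DataSet :\n{Sklearn_Silhouette}\n")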
print("\n\n**********************************************************************
*********************************************************************************
*****************************")
print("**************************************************************************
*********************************************************************************
*************************\n")
####################################################################################################
# Davies-Bouldin Index calculation using sklearn-library
####################################################################################################
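# NOTE: the actual call to sklearn's davies_bouldin_score is likewise missing from the
# listing; a minimal sketch is given below (the variable name Sklearn_DBI is ours).
Sklearn_DBI = davies_bouldin_score(data_clustering, Cluster_Labels)
print(f"\nDavies-Bouldin Index (sklearn) for the K=3 partitioning of the Iris-DataSet :\n{Sklearn_DBI}\n")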
print("\n\n**********************************************************************
*********************************************************************************
*****************************")
print("**************************************************************************
*********************************************************************************
*************************\n")
####################################################################################################
# Silhouette Score calculation from scratch
####################################################################################################
cluster_array = []
for i in range(0, tot_rows, 1):
    alist = [data_clustering[i][0], data_clustering[i][1], data_clustering[i][2], data_clustering[i][3], Cluster_Labels[i]]    # 4 feature values + assigned cluster label
    cluster_array.append(alist)
clus0 = []
clus1 = []
clus2 = []
for i in range(0, tot_rows, 1):
    if cluster_array[i][4] == 0:
        clus0.append(cluster_array[i])
    elif cluster_array[i][4] == 1:
        clus1.append(cluster_array[i])
    else:
        clus2.append(cluster_array[i])
npt0 = len(clus0)
npt1 = len(clus1)
npt2 = len(clus2)
# a(i) is the average of the distances between point i and all other points in the same cluster
# b(i) is the minimum single-linkage distance between point i and the clusters to which it does not belong
#***************************************************************************************************
# Calculation of a(i)'s for all three clusters' points
#***************************************************************************************************
ai0 = []
for i in range(0, npt0, 1):
    euc_dis = 0
    for j in range(0, npt0, 1):
        if i != j:
            euc_dis = euc_dis + math.sqrt(((clus0[i][0]-clus0[j][0])**2) + ((clus0[i][1]-clus0[j][1])**2) + ((clus0[i][2]-clus0[j][2])**2) + ((clus0[i][3]-clus0[j][3])**2))
    euc_dis = euc_dis/npt0    # averaged over the cluster size npt0 (the i == j term contributes 0)
    ai0.append(euc_dis)
ai1 = []
for i in range(0, npt1, 1):
    euc_dis = 0
    for j in range(0, npt1, 1):
        if i != j:
            euc_dis = euc_dis + math.sqrt(((clus1[i][0]-clus1[j][0])**2) + ((clus1[i][1]-clus1[j][1])**2) + ((clus1[i][2]-clus1[j][2])**2) + ((clus1[i][3]-clus1[j][3])**2))
    euc_dis = euc_dis/npt1
    ai1.append(euc_dis)
ai2 = []
for i in range(0, npt2, 1):
    euc_dis = 0
    for j in range(0, npt2, 1):
        if i != j:
            euc_dis = euc_dis + math.sqrt(((clus2[i][0]-clus2[j][0])**2) + ((clus2[i][1]-clus2[j][1])**2) + ((clus2[i][2]-clus2[j][2])**2) + ((clus2[i][3]-clus2[j][3])**2))
    euc_dis = euc_dis/npt2
    ai2.append(euc_dis)
#***************************************************************************************************
# Calculation of b(i)'s for all three clusters' points
#***************************************************************************************************
bi0_1 = []
for i in range(0, npt0, 1):
    k = []
    bi0_1.append(k)
bi0_2 = []
for i in range(0, npt0, 1):
    k = []
    bi0_2.append(k)
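# NOTE: the loops that fill bi0_1 and bi0_2 with point-to-cluster distances are not present
# in the listing; they are reconstructed here, assuming bi0_1[i] holds the distances from
# point i of cluster 0 to every point of cluster 1, and bi0_2[i] the distances to cluster 2.
for i in range(0, npt0, 1):
    for j in range(0, npt1, 1):
        bi0_1[i].append(math.sqrt(((clus0[i][0]-clus1[j][0])**2) + ((clus0[i][1]-clus1[j][1])**2) + ((clus0[i][2]-clus1[j][2])**2) + ((clus0[i][3]-clus1[j][3])**2)))
    for j in range(0, npt2, 1):
        bi0_2[i].append(math.sqrt(((clus0[i][0]-clus2[j][0])**2) + ((clus0[i][1]-clus2[j][1])**2) + ((clus0[i][2]-clus2[j][2])**2) + ((clus0[i][3]-clus2[j][3])**2)))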
bi0 = []    # b(i) for cluster-0 points: minimum single-linkage distance to the other two clusters
for i in range(0, npt0, 1):
    min1 = min(bi0_1[i])
    min2 = min(bi0_2[i])
    minb = min(min1, min2)
    bi0.append(minb)
bi1_2 = []
for i in range(0, npt1, 1):
    k = []
    bi1_2.append(k)
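# NOTE: bi1_0 is used below but never built in the listing; its initialisation and the
# filling loops for bi1_0 and bi1_2 are reconstructed here under the same assumptions.
bi1_0 = []
for i in range(0, npt1, 1):
    bi1_0.append([])
for i in range(0, npt1, 1):
    for j in range(0, npt0, 1):
        bi1_0[i].append(math.sqrt(((clus1[i][0]-clus0[j][0])**2) + ((clus1[i][1]-clus0[j][1])**2) + ((clus1[i][2]-clus0[j][2])**2) + ((clus1[i][3]-clus0[j][3])**2)))
    for j in range(0, npt2, 1):
        bi1_2[i].append(math.sqrt(((clus1[i][0]-clus2[j][0])**2) + ((clus1[i][1]-clus2[j][1])**2) + ((clus1[i][2]-clus2[j][2])**2) + ((clus1[i][3]-clus2[j][3])**2)))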
bi1 = []
for i in range(0, npt1, 1):
    min1 = min(bi1_0[i])
    min2 = min(bi1_2[i])
    minb = min(min1, min2)
    bi1.append(minb)
bi2_0 = []
for i in range(0, npt2, 1):
    k = []
    bi2_0.append(k)
bi2_1 = []
for i in range(0, npt2, 1):
    k = []
    bi2_1.append(k)
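# NOTE: the filling loops for bi2_0 and bi2_1 are likewise missing from the listing; they are
# reconstructed below under the same assumptions as for the other two clusters.
for i in range(0, npt2, 1):
    for j in range(0, npt0, 1):
        bi2_0[i].append(math.sqrt(((clus2[i][0]-clus0[j][0])**2) + ((clus2[i][1]-clus0[j][1])**2) + ((clus2[i][2]-clus0[j][2])**2) + ((clus2[i][3]-clus0[j][3])**2)))
    for j in range(0, npt1, 1):
        bi2_1[i].append(math.sqrt(((clus2[i][0]-clus1[j][0])**2) + ((clus2[i][1]-clus1[j][1])**2) + ((clus2[i][2]-clus1[j][2])**2) + ((clus2[i][3]-clus1[j][3])**2)))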
bi2 = []
for i in range(0, npt2, 1):
    min1 = min(bi2_0[i])
    min2 = min(bi2_1[i])
    minb = min(min1, min2)
    bi2.append(minb)
#***************************************************************************************************
# Calculation of Silhouette Scores for all three clusters
#***************************************************************************************************
Si0 = []
for i in range(0, npt0, 1):
    t = max(bi0[i], ai0[i])
    silh = (bi0[i] - ai0[i]) / t
    Si0.append(silh)
Silhouette_Average_Cluster0 = np.average(Si0)
Si1 = []
for i in range(0, npt1, 1):
    t = max(bi1[i], ai1[i])
    silh = (bi1[i] - ai1[i]) / t
    Si1.append(silh)
Silhouette_Average_Cluster1 = np.average(Si1)
Si2 = []
for i in range(0, npt2, 1):
    t = max(bi2[i], ai2[i])
    silh = (bi2[i] - ai2[i]) / t
    Si2.append(silh)
Silhouette_Average_Cluster2 = np.average(Si2)
Silhouette_of_all_points=Si0+Si1+Si2
Total_Partitioning_Silhouette_Score = np.average(Silhouette_of_all_points)
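# NOTE: the statements that reported the per-cluster and overall silhouette values do not
# appear in the listing; a minimal sketch, matching the wording of the output, is given below.
print(f"\nAverage Silhouette Score of Cluster#0 : {Silhouette_Average_Cluster0}")
print(f"Average Silhouette Score of Cluster#1 : {Silhouette_Average_Cluster1}")
print(f"Average Silhouette Score of Cluster#2 : {Silhouette_Average_Cluster2}")
print(f"\nThe Silhouette Coefficient Value for the complete partitioning by our formula is : {Total_Partitioning_Silhouette_Score}")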
print("\n\n**********************************************************************
*********************************************************************************
*****************************")
print("**************************************************************************
*********************************************************************************
*************************\n")
####################################################################################################
# Dunn's Index calculation from scratch
####################################################################################################
icd0 = []
for i in range(0, npt0, 1):
    k = []
    icd0.append(k)
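# NOTE: the loop that fills icd0 is missing from the listing; it is reconstructed here,
# assuming icd0[i] holds the distances from point i of cluster 0 to every other point of cluster 0.
for i in range(0, npt0, 1):
    for j in range(0, npt0, 1):
        if i != j:
            icd0[i].append(math.sqrt(((clus0[i][0]-clus0[j][0])**2) + ((clus0[i][1]-clus0[j][1])**2) + ((clus0[i][2]-clus0[j][2])**2) + ((clus0[i][3]-clus0[j][3])**2)))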
maxd0 = []
for i in range(0, npt0, 1):    # this reduction loop mirrors the surviving cluster-1 code below
    x = max(icd0[i])
    maxd0.append(x)
max_dia_clus0 = max(maxd0)    # maximum diameter of cluster 0
#***************************************************************************************************
icd1 = []
for i in range(0, npt1, 1):
    k = []
    icd1.append(k)
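# NOTE: the loop that fills icd1 is likewise missing; reconstructed below, assuming icd1[i]
# holds the distances from point i of cluster 1 to every other point of cluster 1.
for i in range(0, npt1, 1):
    for j in range(0, npt1, 1):
        if i != j:
            icd1[i].append(math.sqrt(((clus1[i][0]-clus1[j][0])**2) + ((clus1[i][1]-clus1[j][1])**2) + ((clus1[i][2]-clus1[j][2])**2) + ((clus1[i][3]-clus1[j][3])**2)))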
maxd1 = []
for i in range(0, npt1, 1):
    x = max(icd1[i])
    maxd1.append(x)
max_dia_clus1 = max(maxd1)
#***************************************************************************************************
icd2 = []
for i in range(0, npt2, 1):
    k = []
    icd2.append(k)
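# NOTE: the loop that fills icd2 is likewise missing; reconstructed below, assuming icd2[i]
# holds the distances from point i of cluster 2 to every other point of cluster 2.
for i in range(0, npt2, 1):
    for j in range(0, npt2, 1):
        if i != j:
            icd2[i].append(math.sqrt(((clus2[i][0]-clus2[j][0])**2) + ((clus2[i][1]-clus2[j][1])**2) + ((clus2[i][2]-clus2[j][2])**2) + ((clus2[i][3]-clus2[j][3])**2)))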
maxd2 = []
for i in range(0, npt2, 1):    # same reduction as for cluster 1
    x = max(icd2[i])
    maxd2.append(x)
max_dia_clus2 = max(maxd2)    # maximum diameter of cluster 2
#***************************************************************************************************
# NOTE: the loops that fill these inter-cluster distance lists (and the reduction of
# min_lin_01 to its minimum) were evidently lost from the listing; they are reconstructed
# here, assuming each list holds all pairwise distances between the two named clusters.
min_lin_01 = []
for i in range(0, npt0, 1):
    for j in range(0, npt1, 1):
        min_lin_01.append(math.sqrt(((clus0[i][0]-clus1[j][0])**2) + ((clus0[i][1]-clus1[j][1])**2) + ((clus0[i][2]-clus1[j][2])**2) + ((clus0[i][3]-clus1[j][3])**2)))
sin_lin_01 = min(min_lin_01)    # single-linkage distance between clusters 0 and 1
min_lin_02 = []
for i in range(0, npt0, 1):
    for j in range(0, npt2, 1):
        min_lin_02.append(math.sqrt(((clus0[i][0]-clus2[j][0])**2) + ((clus0[i][1]-clus2[j][1])**2) + ((clus0[i][2]-clus2[j][2])**2) + ((clus0[i][3]-clus2[j][3])**2)))
sin_lin_02 = min(min_lin_02)    # single-linkage distance between clusters 0 and 2
min_lin_12 = []
for i in range(0, npt1, 1):
    for j in range(0, npt2, 1):
        min_lin_12.append(math.sqrt(((clus1[i][0]-clus2[j][0])**2) + ((clus1[i][1]-clus2[j][1])**2) + ((clus1[i][2]-clus2[j][2])**2) + ((clus1[i][3]-clus2[j][3])**2)))
sin_lin_12 = min(min_lin_12)    # single-linkage distance between clusters 1 and 2
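# NOTE: the final Dunn's Index computation and its report are not present in the listing;
# the standard definition (minimum inter-cluster separation divided by maximum cluster
# diameter) is sketched below as an assumption.
Dunns_Index = min(sin_lin_01, sin_lin_02, sin_lin_12) / max(max_dia_clus0, max_dia_clus1, max_dia_clus2)
print(f"\nDunn's Index for the K=3 partitioning of the Iris-DataSet (from scratch) :\n{Dunns_Index}\n")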
print("\n\n**********************************************************************
*********************************************************************************
*****************************")
print("**************************************************************************
*********************************************************************************
*************************\n")
####################################################################################################
# Davies-Bouldin Index calculation from scratch
####################################################################################################
euc_dis = 0
for i in range(0, npt0, 1):
    euc_dis = euc_dis + math.sqrt(((clus0[i][0]-ccc[0][0])**2) + ((clus0[i][1]-ccc[0][1])**2) + ((clus0[i][2]-ccc[0][2])**2) + ((clus0[i][3]-ccc[0][3])**2))
Rad0 = euc_dis/npt0    # Rad0: average distance of cluster-0 points from their centroid
euc_dis = 0
for i in range(0, npt1, 1):
    euc_dis = euc_dis + math.sqrt(((clus1[i][0]-ccc[1][0])**2) + ((clus1[i][1]-ccc[1][1])**2) + ((clus1[i][2]-ccc[1][2])**2) + ((clus1[i][3]-ccc[1][3])**2))
Rad1 = euc_dis/npt1    # Rad1: average distance of cluster-1 points from their centroid
euc_dis = 0
for i in range(0, npt2, 1):
    euc_dis = euc_dis + math.sqrt(((clus2[i][0]-ccc[2][0])**2) + ((clus2[i][1]-ccc[2][1])**2) + ((clus2[i][2]-ccc[2][2])**2) + ((clus2[i][3]-ccc[2][3])**2))
Rad2 = euc_dis/npt2    # Rad2: average distance of cluster-2 points from their centroid
d01 = math.sqrt(((ccc[0][0]-ccc[1][0])**2) + ((ccc[0][1]-ccc[1][1])**2) + ((ccc[0][2]-ccc[1][2])**2) + ((ccc[0][3]-ccc[1][3])**2))    # distance between centroids 0 and 1
d02 = math.sqrt(((ccc[0][0]-ccc[2][0])**2) + ((ccc[0][1]-ccc[2][1])**2) + ((ccc[0][2]-ccc[2][2])**2) + ((ccc[0][3]-ccc[2][3])**2))    # distance between centroids 0 and 2
d12 = math.sqrt(((ccc[1][0]-ccc[2][0])**2) + ((ccc[1][1]-ccc[2][1])**2) + ((ccc[1][2]-ccc[2][2])**2) + ((ccc[1][3]-ccc[2][3])**2))    # distance between centroids 1 and 2
R0=max(((Rad0+Rad1)/d01), ((Rad0+Rad2)/d02))
R1=max(((Rad0+Rad1)/d01), ((Rad1+Rad2)/d12))
R2=max(((Rad0+Rad2)/d02), ((Rad1+Rad2)/d12))
DBI_Partitioning=(R0+R1+R2)/3
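# NOTE: the statements reporting the per-cluster R values and the overall index are not
# present in the listing; a minimal sketch, consistent with the wording of the output, follows.
print(f"\nDavies-Bouldin value R for Cluster#0 : {R0}")
print(f"Davies-Bouldin value R for Cluster#1 : {R1}")
print(f"Davies-Bouldin value R for Cluster#2 : {R2}")
print(f"\nDavies-Bouldin Index for the complete partitioning (from scratch) :\n{DBI_Partitioning}\n")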
print("\n\n**********************************************************************
*********************************************************************************
*****************************")
print("**************************************************************************
*********************************************************************************
*************************\n")
####################################################################################################
# Cluster-Scatter Plots for every feature
####################################################################################################
sepallen_0=[]
sepalwid_0=[]
petallen_0=[]
petalwid_0=[]
sepallen_X_petallen_0=[]
sepalwid_X_petalwid_0=[]
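# NOTE: the loop that fills the cluster-0 plotting lists is missing from the listing; it is
# reconstructed here to mirror the surviving cluster-1 loop below.
for i in range(0, npt0, 1):
    sepallen_0.append(clus0[i][0])
    sepalwid_0.append(clus0[i][1])
    petallen_0.append(clus0[i][2])
    petalwid_0.append(clus0[i][3])
    sepallen_X_petallen_0.append((7*clus0[i][0]) + (3*clus0[i][2]))
    sepalwid_X_petalwid_0.append((11*clus0[i][1]) + (1*clus0[i][3]))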
sepallen_1=[]
sepalwid_1=[]
petallen_1=[]
petalwid_1=[]
sepallen_X_petallen_1=[]
sepalwid_X_petalwid_1=[]
for i in range(0, npt1, 1):
    sepallen_1.append(clus1[i][0])
    sepalwid_1.append(clus1[i][1])
    petallen_1.append(clus1[i][2])
    petalwid_1.append(clus1[i][3])
    sepallen_X_petallen_1.append((7*clus1[i][0]) + (3*clus1[i][2]))
    sepalwid_X_petalwid_1.append((11*clus1[i][1]) + (1*clus1[i][3]))
sepallen_2=[]
sepalwid_2=[]
petallen_2=[]
petalwid_2=[]
sepallen_X_petallen_2=[]
sepalwid_X_petalwid_2=[]
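# NOTE: the loop that fills the cluster-2 plotting lists is likewise missing; reconstructed below.
for i in range(0, npt2, 1):
    sepallen_2.append(clus2[i][0])
    sepalwid_2.append(clus2[i][1])
    petallen_2.append(clus2[i][2])
    petalwid_2.append(clus2[i][3])
    sepallen_X_petallen_2.append((7*clus2[i][0]) + (3*clus2[i][2]))
    sepalwid_X_petalwid_2.append((11*clus2[i][1]) + (1*clus2[i][3]))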
print("\n")
plt.scatter(ccc[0][0],ccc[0][1],c='crimson',s=200,marker='*')
plt.scatter(ccc[1][0],ccc[1][1],c='crimson',s=200,marker='*')
plt.scatter(ccc[2][0],ccc[2][1],c='crimson',s=200,marker='*')
plt.scatter(sepallen_0,sepalwid_0, c='teal')
plt.scatter(sepallen_1,sepalwid_1, c='steelblue')
plt.scatter(sepallen_2,sepalwid_2, c='gold')
plt.xlabel('sepal length in cm')
plt.ylabel('sepal width in cm')
plt.title('Clusterings')
plt.show()
print("\n")
plt.scatter(ccc[0][2],ccc[0][3],c='crimson',s=200,marker='*')
plt.scatter(ccc[1][2],ccc[1][3],c='crimson',s=200,marker='*')
plt.scatter(ccc[2][2],ccc[2][3],c='crimson',s=200,marker='*')
plt.scatter(petallen_0,petalwid_0, c='teal')
plt.scatter(petallen_1,petalwid_1, c='steelblue')
plt.scatter(petallen_2,petalwid_2, c='gold')
plt.xlabel('petal length in cm')
plt.ylabel('petal width in cm')
plt.title('Clusterings')
plt.show()
print("\n")
plt.scatter(ccc[0][0],ccc[0][2],c='crimson',s=200,marker='*')
plt.scatter(ccc[1][0],ccc[1][2],c='crimson',s=200,marker='*')
plt.scatter(ccc[2][0],ccc[2][2],c='crimson',s=200,marker='*')
plt.scatter(sepallen_0,petallen_0, c='teal')
plt.scatter(sepallen_1,petallen_1, c='steelblue')
plt.scatter(sepallen_2,petallen_2, c='gold')
plt.xlabel('sepal length in cm')
plt.ylabel('petal length in cm')
plt.title('Clusterings')
plt.show()
print("\n")
plt.scatter(ccc[0][0],ccc[0][3],c='crimson',s=200,marker='*')
plt.scatter(ccc[1][0],ccc[1][3],c='crimson',s=200,marker='*')
plt.scatter(ccc[2][0],ccc[2][3],c='crimson',s=200,marker='*')
plt.scatter(sepallen_0,petalwid_0, c='teal')
plt.scatter(sepallen_1,petalwid_1, c='steelblue')
plt.scatter(sepallen_2,petalwid_2, c='gold')
plt.xlabel('sepal length in cm')
plt.ylabel('petal width in cm')
plt.title('Clusterings')
plt.show()
print("\n")
plt.scatter(ccc[0][1],ccc[0][3],c='crimson',s=200,marker='*')
plt.scatter(ccc[1][1],ccc[1][3],c='crimson',s=200,marker='*')
plt.scatter(ccc[2][1],ccc[2][3],c='crimson',s=200,marker='*')
plt.scatter(sepalwid_0,petalwid_0, c='teal')
plt.scatter(sepalwid_1,petalwid_1, c='steelblue')
plt.scatter(sepalwid_2,petalwid_2, c='gold')
plt.xlabel('sepal width in cm')
plt.ylabel('petal width in cm')
plt.title('Clusterings')
plt.show()
print("\n")
plt.scatter(ccc[0][1],ccc[0][2],c='crimson',s=200,marker='*')
plt.scatter(ccc[1][1],ccc[1][2],c='crimson',s=200,marker='*')
plt.scatter(ccc[2][1],ccc[2][2],c='crimson',s=200,marker='*')
plt.scatter(sepalwid_0,petallen_0, c='teal')
plt.scatter(sepalwid_1,petallen_1, c='steelblue')
plt.scatter(sepalwid_2,petallen_2, c='gold')
plt.xlabel('sepal width in cm')
plt.ylabel('petal length in cm')
plt.title('Clusterings')
plt.show()
print("\n")
plt.scatter(((7*ccc[0][0])+(3*ccc[0][2])), ((11*ccc[0][1])+(ccc[0][3])), c='crimson', s=200, marker='*')
plt.scatter(((7*ccc[1][0])+(3*ccc[1][2])), ((11*ccc[1][1])+(ccc[1][3])), c='crimson', s=200, marker='*')
plt.scatter(((7*ccc[2][0])+(3*ccc[2][2])), ((11*ccc[2][1])+(ccc[2][3])), c='crimson', s=200, marker='*')
plt.scatter(sepallen_X_petallen_0,sepalwid_X_petalwid_0, c='teal')
plt.scatter(sepallen_X_petallen_1,sepalwid_X_petalwid_1, c='steelblue')
plt.scatter(sepallen_X_petallen_2,sepalwid_X_petalwid_2, c='gold')
plt.xlabel('7x(Sepal length) + 3x(Petal length)')
plt.ylabel('11x(Sepal width) + 1x(Petal width)')
plt.title('Clusterings')
plt.show()
print("\n\n**********************************************************************
*********************************************************************************
*****************************")
print("**************************************************************************
*********************************************************************************
*************************\n")
Output
Elbow Method Investigation is completed!!!
The Optimum Number of Clusters found by the Elbow Method, i.e. the number of clusters at which the elbow curve has the highest rate of change of slope, is : 3
Cluster Labels for Iris-DataSet with K-Means(with K=3) :
[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 0 0 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 2 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 2 0 2 2 2 2 0 2 2 2 2 2 2 0 0 2 2 2 2 0 2 0 2 0 2 2 0 0 2 2 2 2 2 0 2
 2 2 2 0 2 2 2 0 2 2 2 0 2 2 0]
Total SSE for the Final Partitionings of Iris-DataSet with K-Means(with K=3) :
78.940841426146
Our definition of the Silhouette Coefficient is a bit different from what has been implemented in the sklearn library, as we consider b(i) to be the minimum single-linkage distance for a point 'i'.
Since Cluster #1 has the highest Silhouette Score amongst the three clusters, it is the best-formed of the 3 clusters!!!
The Silhouette Coefficient value for the complete partitioning by our formula is : 0.22193646580785792
Since the Davies-Bouldin Index for Cluster#1 is the minimum amongst all three clusters, it is the best-formed cluster!!!
Plots