Data Mining Project Problem 1: Clustering
In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from sklearn.cluster import KMeans
import seaborn as sns
%matplotlib inline
In [2]:
data_bank=pd.read_csv("bank_marketing_part1_Data.csv")
data_bank.head()
Out[2]:
In [4]:
data_bank.describe().T
Out[4]:
In [5]:
data_bank.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 210 entries, 0 to 209
Data columns (total 7 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 spending 210 non-null float64
1 advance_payments 210 non-null float64
2 probability_of_full_payment 210 non-null float64
3 current_balance 210 non-null float64
4 credit_limit 210 non-null float64
5 min_payment_amt 210 non-null float64
6 max_spent_in_single_shopping 210 non-null float64
dtypes: float64(7)
memory usage: 11.6 KB
In [6]:
data_bank.shape
Out[6]:
(210, 7)
In [7]:
data_bank.duplicated().sum()
Out[7]:
In [8]:
data_bank.isnull().sum()
Out[8]:
spending 0
advance_payments 0
probability_of_full_payment 0
current_balance 0
credit_limit 0
min_payment_amt 0
max_spent_in_single_shopping 0
dtype: int64
In [9]:
data_bank.columns
Out[9]:
In [10]:
In [11]:
def remove_outlier(col):
    # IQR fences for outlier capping; quantile() works on the unsorted column,
    # so no pre-sorting is needed
    Q1, Q3 = col.quantile([0.25, 0.75])
    IQR = Q3 - Q1
    lower_range = Q1 - (1.5 * IQR)
    upper_range = Q3 + (1.5 * IQR)
    return lower_range, upper_range
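As an aside, the pair of np.where calls used in the next cell can be collapsed into a single clip; a minimal equivalent sketch, assuming the remove_outlier fences above (cap_outliers is a hypothetical helper, not part of the original notebook):
def cap_outliers(col):
    # hypothetical helper: cap a Series at its IQR fences in one step
    lower, upper = remove_outlier(col)
    return col.clip(lower=lower, upper=upper)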
In [12]:
lrincome, urincome = remove_outlier(data_bank['probability_of_full_payment'])
data_bank['probability_of_full_payment'] = np.where(data_bank['probability_of_full_payment'] > urincome, urincome, data_bank['probability_of_full_payment'])
data_bank['probability_of_full_payment'] = np.where(data_bank['probability_of_full_payment'] < lrincome, lrincome, data_bank['probability_of_full_payment'])
lrtravel, urtravel = remove_outlier(data_bank['min_payment_amt'])
data_bank['min_payment_amt'] = np.where(data_bank['min_payment_amt'] > urtravel, urtravel, data_bank['min_payment_amt'])
data_bank['min_payment_amt'] = np.where(data_bank['min_payment_amt'] < lrtravel, lrtravel, data_bank['min_payment_amt'])
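One quick way to confirm the capping took effect is a boxplot of the two treated columns; a minimal sketch using the imports above (not the notebook's original check):
plt.figure(figsize=(10, 4))
sns.boxplot(data=data_bank[['probability_of_full_payment', 'min_payment_amt']])  # after capping, no points should fall beyond the whiskers
plt.show()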
In [13]:
In [14]:
In [19]:
data_bank.hist(figsize=(15,20))
Out[19]:
In [20]:
sns.pairplot(data_bank)  # pairplot creates its own figure; setting figsize via plt.figure beforehand has no effect
plt.show()
In [21]:
corr = data_bank.corr()
corr
Out[21]:
In [22]:
plt.figure(figsize=(12,7))
sns.heatmap(data_bank.corr(), annot=True, fmt='.2f', cmap='Blues')
plt.show()
In [23]:
from sklearn.preprocessing import StandardScaler
In [24]:
X = StandardScaler()
X
Out[24]:
StandardScaler()
In [25]:
scaled_df = pd.DataFrame(X.fit_transform(data_bank), columns=data_bank.columns)  # wrap in a DataFrame so later cells can index columns by name
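A quick sanity check on the scaling: every z-scored column should now have mean close to 0 and standard deviation close to 1. A minimal sketch, not from the original notebook:
scaled_df.describe().loc[['mean', 'std']].round(2)  # expect ~0 and ~1 in every column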
In [26]:
Out[26]:
In [27]:
In [28]:
lrincome, urincome = remove_outlier(scaled_df['probability_of_full_payment'])
scaled_df['probability_of_full_payment'] = np.where(scaled_df['probability_of_full_payment'] > urincome, urincome, scaled_df['probability_of_full_payment'])
scaled_df['probability_of_full_payment'] = np.where(scaled_df['probability_of_full_payment'] < lrincome, lrincome, scaled_df['probability_of_full_payment'])
In [29]:
In [30]:
In [31]:
from scipy.cluster.hierarchy import linkage, dendrogram, fcluster
wardlink = linkage(scaled_df, method='ward')  # hierarchical clustering with Ward linkage on the scaled features
In [32]:
dend=dendrogram(wardlink)
In [33]:
dend=dendrogram(wardlink,truncate_mode='lastp',p=10)
In [34]:
In [35]:
clusters=fcluster(wardlink,3,criterion='maxclust')
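fcluster with criterion='maxclust' cuts the Ward tree at the height that leaves at most three flat clusters. The same tree can instead be cut at an explicit height read off the dendrogram; a sketch, where the threshold is purely illustrative:
clusters_by_height = fcluster(wardlink, t=15, criterion='distance')  # t=15 is an illustrative height, not taken from the notebook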
In [36]:
data_bank['clusters']=clusters
In [37]:
data_bank.head()
Out[37]:
In [38]:
data_bank.clusters.value_counts().sort_index()
Out[38]:
1 70
2 67
3 73
Name: clusters, dtype: int64
In [39]:
aggdata=data_bank.groupby('clusters').mean()
aggdata['Freq']=data_bank.clusters.value_counts().sort_index()
aggdata
Out[39]:
clusters
In [40]:
data_bank.to_csv("HierarchicalProject.csv")
In [41]:
from sklearn.cluster import AgglomerativeClustering
In [42]:
Cluster_agglo = AgglomerativeClustering(n_clusters=3).fit_predict(scaled_df)  # n_clusters=3 assumed from the three distinct labels printed below; default Ward linkage
print(Cluster_agglo)
[1 0 1 2 1 0 2 2 1 2 1 1 2 1 0 0 0 2 2 2 2 2 1 2 0 1 0 2 2 2 2 2 2 0 2 2 2
2 2 1 1 0 1 1 2 2 0 1 1 1 2 1 1 1 1 1 2 2 2 1 0 2 2 1 0 1 1 0 1 2 0 2 1 1
2 1 0 2 1 0 0 0 0 1 2 1 1 1 1 0 0 1 0 2 2 1 1 1 2 1 0 1 0 1 0 1 1 2 0 1 1
0 1 2 2 1 0 0 2 1 0 2 2 2 0 0 1 2 0 0 2 0 0 1 2 1 1 2 1 0 0 0 2 2 2 2 1 2
0 2 0 2 0 1 0 0 2 2 0 1 1 2 1 1 1 2 1 0 0 2 0 2 0 1 1 1 0 2 0 2 0 2 0 0 1
1 0 1 0 2 0 0 2 1 0 1 1 2 1 2 0 0 0 2 1 0 1 0 0 1]
In [43]:
data_bank["Agglo_CLusters"]=Cluster_agglo
In [45]:
data_bank.head(10)
Out[45]:
In [46]:
agglo_data=data_bank.groupby('Agglo_CLusters').mean()
agglo_data['Freq']=data_bank.Agglo_CLusters.value_counts().sort_index()
agglo_data
Out[46]:
Agglo_CLusters
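Since cluster numbers are arbitrary, a quick way to compare the fcluster and AgglomerativeClustering solutions is a cross-tabulation of the two label columns; a minimal sketch, not from the original notebook:
pd.crosstab(data_bank['clusters'], data_bank['Agglo_CLusters'])  # one dominant count per row means the labelings agree up to renaming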
In [48]:
#KMeans
k_means = KMeans(n_clusters = 3)
k_means.fit(scaled_df)
k_means.inertia_
Out[48]:
430.298481751223
In [50]:
k_means = KMeans(n_clusters = 2)
k_means.fit(scaled_df)
k_means.inertia_
Out[50]:
659.1474009548498
In [51]:
k_means = KMeans(n_clusters = 1)
k_means.fit(scaled_df)
k_means.inertia_
Out[51]:
1469.9999999999995
In [53]:
k_means = KMeans(n_clusters = 4)
k_means.fit(scaled_df)
k_means.inertia_
Out[53]:
370.8685962394206
In [54]:
k_means = KMeans(n_clusters = 5)
k_means.fit(scaled_df)
k_means.inertia_
Out[54]:
325.9098750065543
In [55]:
wss =[]
In [56]:
for i in range(1,11):
KM = KMeans(n_clusters=i)
KM.fit(scaled_df)
wss.append(KM.inertia_)
In [57]:
wss
Out[57]:
[1469.9999999999995,
659.1474009548498,
430.298481751223,
371.4400252695771,
327.39077808486644,
289.50583097697313,
262.5952276605776,
239.57831775716886,
223.42523703383455,
208.7028049634438]
In [58]:
plt.plot(range(1,11), wss)
Out[58]:
[<matplotlib.lines.Line2D at 0x28f968b9a88>]
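The WSS curve drops steeply up to k = 3 and flattens afterwards, which is the usual elbow read. Axis labels and a marker make that easier to see; a minimal sketch of the same plot:
plt.plot(range(1, 11), wss, marker='o')
plt.xlabel('number of clusters k')
plt.ylabel('within-cluster sum of squares (inertia)')
plt.axvline(3, color='grey', linestyle='--')  # elbow at k = 3, consistent with the inertia values above
plt.show()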
In [59]:
k_means = KMeans(n_clusters = 3)
k_means.fit(scaled_df)
labels = k_means.labels_
In [61]:
data_bank["Clus_kmeans"] = labels
data_bank.head(7)
Out[61]:
In [62]:
from sklearn.metrics import silhouette_score, silhouette_samples
In [63]:
silhouette_score(scaled_df,labels)
Out[63]:
0.4008059221522216
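The silhouette score of about 0.40 can be cross-checked against neighbouring values of k; a minimal sketch (random_state=1 is an assumption for reproducibility, not from the notebook):
for k in range(2, 6):
    km = KMeans(n_clusters=k, random_state=1).fit(scaled_df)  # random_state is assumed, not from the original run
    print(k, round(silhouette_score(scaled_df, km.labels_), 3))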
In [64]:
sil_width = silhouette_samples(scaled_df,labels)
In [65]:
data_bank["sil_width"] = sil_width
data_bank.head(7)
Out[65]:
In [71]:
kmeansdata=data_bank.groupby('Clus_kmeans').mean()
kmeansdata['Freq']=data_bank.Clus_kmeans.value_counts().sort_index()
kmeansdata
Out[71]:
Clus_kmeans
In [72]:
silhouette_samples(scaled_df,labels).min()
Out[72]:
0.002768541128616533
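A positive minimum silhouette width means no observation sits closer to a neighbouring cluster than to its own. Listing any negative-width rows is a direct check; a minimal sketch:
data_bank[data_bank['sil_width'] < 0]  # expected to be empty here, since the minimum width above is ~0.0028 > 0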