Machine Learning Project Problem 1 Jupyter Notebook PDF
Machine Learning Project Problem 1 Jupyter Notebook PDF
In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.style
%matplotlib inline
import seaborn as sns; sns.set() # for plot styling
from scipy import stats
import scipy.cluster.hierarchy as sch
from scipy.cluster.hierarchy import dendrogram,linkage,fcluster
from sklearn.cluster import AgglomerativeClustering
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.cluster import KMeans, MiniBatchKMeans
from sklearn.metrics import silhouette_samples, silhouette_score
from sklearn.metrics import roc_auc_score,roc_curve,classification_report,confusion_matrix,
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn import metrics,model_selection
from sklearn.preprocessing import scale
from sklearn.decomposition import PCA
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn import tree
from scipy.stats import zscore
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
import warnings
warnings.filterwarnings('ignore')
In [2]:
Out[2]:
1 Labour 43 3 3 4 1 2
2 Labour 36 4 4 4 4 5
3 Labour 35 4 4 5 2 3
4 Labour 24 4 2 2 1 4
5 Labour 41 2 2 1 1 6
In [3]:
In [4]:
<class 'pandas.core.frame.DataFrame'>
In [5]:
In [6]:
cat=[]
num=[]
for i in Elect_df.columns:
if Elect_df[i].dtype=="object":
cat.append(i)
else:
num.append(i)
print(cat)
print(num)
['vote', 'gender']
In [7]:
vote : 0
gender : 0
In [8]:
Elect_df[num].describe().T
Out[8]:
In [9]:
Elect_df[cat].describe().T
Out[9]:
In [10]:
Out[10]:
68 Labour 35 4 4 5 2
627 Labour 39 3 4 4 2
871 Labour 38 2 4 2 2
984 Conservative 74 4 3 2 4
1155 Conservative 53 3 4 2 2
1237 Labour 36 3 3 2 2
1245 Labour 29 4 4 4 2
1439 Labour 40 4 3 4 2
In [11]:
Elect_df.drop_duplicates(inplace=True)
In [12]:
Elect_df.shape
Out[12]:
(1517, 9)
In [13]:
VOTE : 2
Conservative 460
Labour 1057
GENDER : 2
male 709
female 808
In [14]:
Out[14]:
age 0.139800
economic.cond.national -0.238474
economic.cond.household -0.144148
Blair -0.539514
Hague 0.146191
Europe -0.141891
political.knowledge -0.422928
dtype: float64
Univariate Analysis
In [15]:
a=1
plt.figure(figsize=(15,112))
for i in Elect_df.columns:
if Elect_df[i].dtype != 'object':
plt.subplot(21,3,a)
sns.distplot(Elect_df[i])
plt.title("Distribution plot for:" + i)
plt.subplot(21,3,a+1)
sns.histplot(Elect_df[i])
plt.title("Histogram for:" + i)
plt.subplot(21,3,a+2)
sns.boxplot(Elect_df[i])
plt.title("Boxplot for:" + i)
a+=3
In [16]:
fig, (ax1,ax2,ax3,ax4)=plt.subplots(1,4,figsize=(16,5))
fig, (ax5,ax6,ax7)=plt.subplots(1,3,figsize=(12,5))
sns.stripplot(Elect_df["vote"], Elect_df['age'],orient='v',jitter=True,ax=ax1)
ax1.set_xlabel('vote', fontsize=15)
ax1.set_title('Distribution of vote', fontsize=15)
ax1.tick_params(labelsize=15)
plt.subplots_adjust(wspace=0.5)
plt.tight_layout()
In [17]:
fig, (ax1,ax2,ax3,ax4)=plt.subplots(1,4,figsize=(16,5))
fig, (ax5,ax6,ax7)=plt.subplots(1,3,figsize=(12,5))
sns.stripplot(Elect_df["gender"], Elect_df['age'],orient='v',jitter=True,ax=ax1)
ax1.set_xlabel('gender', fontsize=15)
ax1.set_title('Distribution of gender', fontsize=15)
ax1.tick_params(labelsize=15)
plt.subplots_adjust(wspace=0.5)
plt.tight_layout()
In [18]:
In [19]:
#correlation matrix
Elect_df.corr()
Out[19]:
In [20]:
In [21]:
There are nearly no outliers in most of the numerical columns, only outlier is in economic.cond.national
variable & economic.cond.household Variable . In Gaussian Naive Bayes, outliers will affect the shape
of the Gaussian distribution and have the usual effects on the mean etc. So depending on our use case,
it makes sense to remove outlier .
In [22]:
Range of values: 4
In [23]:
#Central values
print('Minimum value economic.cond.national: ', Elect_df['economic.cond.national'].min())
print('Maximum economic.cond.national: ',Elect_df['economic.cond.national'].max())
print('Mean value economic.cond.national: ', Elect_df['economic.cond.national'].mean())
print('Median value economic.cond.national: ',Elect_df['economic.cond.national'].median())
print('Standard deviation economic.cond.national: ', Elect_df['economic.cond.national'].std
print('Null values economic.cond.national: ',Elect_df['economic.cond.national'].isnull().an
Maximum economic.cond.national: 5
In [24]:
#Quartiles
Q1=Elect_df['economic.cond.national'].quantile(q=0.25)
Q3=Elect_df['economic.cond.national'].quantile(q=0.75)
print('economic.cond.national - 1st Quartile (Q1) is: ', Q1)
print('economic.cond.national - 3st Quartile (Q3) is: ', Q3)
print('Interquartile range (IQR) of economic.cond.national is ', stats.iqr(Elect_df['econom
In [25]:
# IQR=Q3-Q1
#lower 1.5*IQR whisker i.e Q1-1.5*IQR
#upper 1.5*IQR whisker i.e Q3+1.5*IQR
L_outliers=Q1-1.5*(Q3-Q1)
U_outliers=Q3+1.5*(Q3-Q1)
print('Lower outliers in economic.cond.national: ', L_outliers)
print('Upper outliers in economic.cond.national: ', U_outliers)
In [26]:
Oulier Treatment
In [27]:
def remove_outlier(col):
sorted(col)
Q1,Q3=np.percentile(col,[25,75])
IQR=Q3-Q1
lower_range= Q1-(1.5 * IQR)
upper_range= Q3+(1.5 * IQR)
return lower_range, upper_range
In [28]:
lr,ur=remove_outlier(Elect_df["economic.cond.national"])
Elect_df["economic.cond.national"]=np.where(Elect_df["economic.cond.national"]>ur,ur,Elect_
Elect_df["economic.cond.national"]=np.where(Elect_df["economic.cond.national"]<lr,lr,Elect_
lr,ur=remove_outlier(Elect_df["economic.cond.household"])
Elect_df["economic.cond.household"]=np.where(Elect_df["economic.cond.household"]>ur,ur,Elec
Elect_df["economic.cond.household"]=np.where(Elect_df["economic.cond.household"]<lr,lr,Elec
In [29]:
In [30]:
cat
Out[30]:
['vote', 'gender']
In [31]:
drop_first is used to ensure that multiple columns created based on the levels of categorical variable
are not included else it will result in to multicollinearity . This is done to ensure that we do not land in to
dummy trap.
In [32]:
df=pd.get_dummies(Elect_df, columns=cat1,drop_first=True)
df.head()
Out[32]:
1 43 3.0 3.0 4 1 2
2 36 4.0 4.0 4 4 5
3 35 4.0 4.0 5 2 3
4 24 4.0 2.0 2 1 4
5 41 2.0 2.0 1 1 6
In [33]:
In [34]:
Out[34]:
age 246.544655
economic.cond.national 0.728713
economic.cond.household 0.785491
Blair 1.380089
Hague 1.519005
Europe 10.883687
political.knowledge 1.175961
gender_male 0.249099
dtype: float64
In [35]:
In [37]:
Out[37]:
age 1.00066
economic.cond.national 1.00066
economic.cond.household 1.00066
Blair 1.00066
Hague 1.00066
Europe 1.00066
political.knowledge 1.00066
gender_male 1.00066
dtype: float64
In [38]:
In [39]:
X.head()
Out[39]:
In [40]:
y.head()
Out[40]:
1 1
2 1
3 1
4 1
5 1
Train-Test Split Split X and y into training and test set in 70:30 ratio with
random_state=1
In [41]:
In [42]:
print('X_train',X_train.shape)
print('X_test',X_test.shape)
print('y_train',y_train.shape)
print('y_test',y_test.shape)
X_train (1061, 8)
X_test (456, 8)
y_train (1061,)
y_test (456,)
In [43]:
Logistic_model = LogisticRegression(solver='newton-cg',max_iter=10000,penalty='none',verbos
Logistic_model.fit(X_train, y_train)
Out[43]:
verbose=True)
Now LogisticRegression classifier is built. The classifier is trained using training data. We can use fit() method
for training it. After building a classifier, our model is ready to make predictions. We can use predict() method
with test set features as its parameters.
In [44]:
[[197 110]
[ 66 688]]
In [45]:
In [46]:
[[111 42]
[ 36 267]]
In [47]:
In [48]:
f,a = plt.subplots(1,2,sharex=True,sharey=True,squeeze=False)
#Plotting confusion matrix for the different models for the Training Data
plot_0 = sns.heatmap((metrics.confusion_matrix(y_train,y_train_predict)),annot=True,fmt='.5
a[0][0].set_title('Training Data')
plot_1 = sns.heatmap((metrics.confusion_matrix(y_test,y_test_predict)),annot=True,fmt='.5g'
a[0][1].set_title('Test Data');
In [49]:
In [50]:
grid={'penalty':['l2','none','l1','elasticnet'],
'solver':['liblinear','lbfgs','newton-cg'],
'tol':[0.0001,0.00001],
'max_iter': [10000, 5000,15000]}
In [51]:
[LibLinear]
Out[51]:
estimator=LogisticRegression(max_iter=10000, n_jobs=2,
penalty='none', solver='newton-c
g',
verbose=True),
n_jobs=2,
scoring='f1')
In [52]:
print(grid_search.best_params_,'\n')
print(grid_search.best_estimator_)
In [53]:
best_model_lr = grid_search.best_estimator_
In [54]:
ytrain_predict_lr = best_model_lr.predict(X_train)
ytest_predict_lr = best_model_lr.predict(X_test)
In [55]:
ytest_predict_prob=best_model_lr.predict_proba(X_test)
pd.DataFrame(ytest_predict_prob).head()
Out[55]:
0 1
0 0.428858 0.571142
1 0.155518 0.844482
2 0.006996 0.993004
3 0.839503 0.160497
4 0.066109 0.933891
In [56]:
print("The Best Logistic Regression Model Score on train data set post tuning is %.3f " % b
The Best Logistic Regression Model Score on train data set post tuning is 0.
834
In [57]:
In [58]:
# predict probabilities
probs = best_model_lr.predict_proba(X_train)
# keep probabilities for the positive outcome only
probs = probs[:, 1]
# calculate AUC
auc = roc_auc_score(y_train, probs)
print("The ROC_AUC score for LR Tuned Model train data set %.2f " % auc)
# calculate roc curve
train_fpr, train_tpr, train_thresholds = roc_curve(y_train, probs)
plt.plot([0, 1], [0, 1], linestyle='--', color='green')
# plot the roc curve for the model
plt.plot(train_fpr, train_tpr);
plt.title("ROC Curve for for LR Tuned Model train data set",fontsize=14,color = 'red');
The ROC_AUC score for LR Tuned Model train data set 0.89
In [59]:
print("The Best Logistic Regression Model Score on train data post tuning set is %.3f " % b
The Best Logistic Regression Model Score on train data post tuning set is 0.
829
In [60]:
In [61]:
# predict probabilities
probs = best_model_lr.predict_proba(X_test)
# keep probabilities for the positive outcome only
probs = probs[:, 1]
# calculate AUC
auc = roc_auc_score(y_test, probs)
print("The ROC_AUC score for LR Tuned Model test data set %.2f " % auc)
# calculate roc curve
test_fpr, test_tpr, test_thresholds = roc_curve(y_test, probs)
plt.plot([0, 1], [0, 1], linestyle='--', color='green')
# plot the roc curve for the model
plt.plot(test_fpr, test_tpr);
plt.title("ROC Curve for for LR Tuned Model test data set",fontsize=14,color = 'red');
The ROC_AUC score for LR Tuned Model test data set 0.88
In [62]:
In [63]:
Out[63]:
0.00517138746961654
LDA_model=LinearDiscriminantAnalysis()
LDA_model.fit(X_train,y_train)
Out[64]:
LinearDiscriminantAnalysis()
In [65]:
[[200 107]
[ 69 685]]
In [66]:
In [67]:
[[111 42]
[ 35 268]]
In [68]:
In [69]:
In [70]:
ytrain_predict_lda = best_model_lda.predict(X_train)
ytest_predict_lda= best_model_lda.predict(X_test)
In [71]:
Out[71]:
0 1
0 0.466328 0.533672
1 0.137291 0.862709
2 0.005950 0.994050
3 0.866706 0.133294
4 0.053474 0.946526
In [72]:
The Best LDA Model Score on train data set post tuning is 0.835
In [73]:
In [74]:
# predict probabilities
probs = best_model_lda.predict_proba(X_train)
# keep probabilities for the positive outcome only
probs = probs[:, 1]
# calculate AUC
auc = roc_auc_score(y_train, probs)
print("The ROC_AUC score for LDA Tuned Model train data set %.3f " % auc)
# calculate roc curve
train_fpr, train_tpr, train_thresholds = roc_curve(y_train, probs)
plt.plot([0, 1], [0, 1], linestyle='--', color='red')
# plot the roc curve for the model
plt.plot(train_fpr, train_tpr);
plt.title("ROC Curve for for LDA Tuned Model train data set",fontsize=14,color = 'red');
The ROC_AUC score for LDA Tuned Model train data set 0.890
In [75]:
The Best LDA Model Score on test data post tuning set is 0.831
In [76]:
In [77]:
# predict probabilities
probs = best_model_lda.predict_proba(X_test)
# keep probabilities for the positive outcome only
probs = probs[:, 1]
# calculate AUC
auc = roc_auc_score(y_test, probs)
print("The ROC_AUC score for LDA Tuned Model test data set %.3f " % auc)
# calculate roc curve
test_fpr, test_tpr, test_thresholds = roc_curve(y_test, probs)
plt.plot([0, 1], [0, 1], linestyle='--', color='red')
# plot the roc curve for the model
plt.plot(test_fpr, test_tpr);
plt.title("ROC Curve for for LDA Tuned Model test data set",fontsize=14,color = 'red');
The ROC_AUC score for LDA Tuned Model test data set 0.888
In [78]:
In [79]:
Out[79]:
0.3920912082279293
KNN Model
Generally, good KNN performance usually requires preprocessing of data to make all variables similarly
scaled and centered
In [80]:
KNN_model=KNeighborsClassifier()
KNN_model.fit(X_train,y_train)
Out[80]:
KNeighborsClassifier()
In [81]:
[[217 90]
[ 62 692]]
In [82]:
[[109 44]
[ 35 268]]
Run the KNN with no of neighbours to be 1,3,5..19 and *Find the optimal number of neighbours from
K=1,3,5,7....19 using the Mis classification error
Misclassification error (MCE) = 1 - Test accuracy score. Calculated MCE for each model with neighbours
= 1,3,5...19 and find the model with lowest MCE
In [83]:
Out[83]:
[0.2149122807017544,
0.19736842105263153,
0.17324561403508776,
0.1842105263157895,
0.18201754385964908,
0.17105263157894735,
0.17763157894736847,
0.16885964912280704,
0.16666666666666663,
0.17105263157894735]
In [84]:
For K = 11 it is giving the best test accuracy. We will build the model with k=11
In [85]:
Out[85]:
KNeighborsClassifier(n_neighbors=11)
In [86]:
[[206 101]
[ 66 688]]
In [87]:
In [88]:
# predict probabilities
probs = KNN_model_1.predict_proba(X_train)
# keep probabilities for the positive outcome only
probs = probs[:, 1]
# calculate AUC
auc = roc_auc_score(y_train, probs)
print("The ROC_AUC score for KNN train data set %.3f " % auc)
# calculate roc curve
train_fpr, train_tpr, train_thresholds = roc_curve(y_train, probs)
plt.plot([0, 1], [0, 1], linestyle='--', color='red')
# plot the roc curve for the model
plt.plot(train_fpr, train_tpr);
plt.title("ROC Curve for for KNN test data set",fontsize=14,color = 'red');
In [89]:
[[105 48]
[ 30 273]]
In [90]:
In [91]:
# predict probabilities
probs = KNN_model_1.predict_proba(X_test)
# keep probabilities for the positive outcome only
probs = probs[:, 1]
# calculate AUC
auc = roc_auc_score(y_test, probs)
print("The ROC_AUC score for KNN train data set %.3f " % auc)
# calculate roc curve
test_fpr, test_tpr, test_thresholds = roc_curve(y_test, probs)
plt.plot([0, 1], [0, 1], linestyle='--', color='red')
# plot the roc curve for the model
plt.plot(test_fpr, test_tpr);
plt.title("ROC Curve for for KNN test data set",fontsize=14,color = 'red');
Naive Bayes
In [92]:
NB_model=GaussianNB()
NB_model.fit(X_train, y_train)
Out[92]:
GaussianNB()
Now GaussianNB classifier is built. The classifier is trained using training data. We can use fit() method
for training it. After building a classifier, our model is ready to make predictions. We can use predict()
method with test set features as its parameters.
In [93]:
[[212 95]
[ 81 673]]
In [94]:
In [95]:
[[112 41]
[ 40 263]]
In [96]:
In [98]:
X_train.shape
Out[98]:
(1061, 8)
In [99]:
Out[99]:
(1508, 8)
In [100]:
NB_SM_model = GaussianNB()
NB_SM_model.fit(X_train_res, y_train_res)
Out[100]:
GaussianNB()
In [101]:
[[616 138]
[131 623]]
In [102]:
ROC_AUC Curve for Naive Bayes with SMOTE Model on train data set
In [103]:
probs = NB_SM_model.predict_proba(X_train)
probs = probs[:, 1]
auc = roc_auc_score(y_train, probs)
print("The ROC_AUC score for Naive Bayes with SMOTE train data set %.3f " % auc)
train_fpr, train_tpr, train_thresholds = roc_curve(y_train, probs)
plt.plot([0, 1], [0, 1], linestyle='--')
plt.plot(train_fpr, train_tpr);
plt.title("ROC Curve for for Naive Bayes with SMOTE train data set",fontsize=14,color = 're
The ROC_AUC score for Naive Bayes with SMOTE train data set 0.887
In [104]:
[[125 28]
[ 59 244]]
In [105]:
ROC_AUC Curve for Naive Bayes with SMOTE Model on test data set
In [106]:
probs_test = NB_SM_model.predict_proba(X_test)
probs_test = probs_test[:, 1]
auc = roc_auc_score(y_test, probs_test)
print("The ROC_AUC score for Naive Bayes with SMOTE test data set %.3f " % auc)
test_fpr, test_tpr, test_thresholds = roc_curve(y_test, probs_test)
plt.plot([0, 1], [0, 1], linestyle='--')
plt.plot(test_fpr, test_tpr)
plt.title("ROC Curve for Naive Bayes with SMOTE test data set",fontsize=14,color = 'red');
The ROC_AUC score for Naive Bayes with SMOTE test data set 0.876
Random Forest
In [107]:
RF_model=RandomForestClassifier(n_estimators=100,random_state=1)
RF_model.fit(X_train, y_train)
Out[107]:
RandomForestClassifier(random_state=1)
In [108]:
[[307 0]
[ 0 754]]
In [109]:
In [110]:
[[104 49]
[ 28 275]]
In [111]:
In [112]:
(RF_model_score_train-RF_model_score_test)*100
Out[112]:
16.885964912280706
Bagging
In [113]:
cart=RandomForestClassifier()
Bagging_model=BaggingClassifier(base_estimator=cart,n_estimators=100, random_state=1)
Bagging_model.fit(X_train,y_train)
Out[113]:
BaggingClassifier(base_estimator=RandomForestClassifier(), n_estimators=100,
random_state=1)
In [114]:
[[278 29]
[ 5 749]]
In [115]:
In [116]:
[[104 49]
[ 29 274]]
In [117]:
In [118]:
(Bagging_model_score_train-Bagging_model_score_test)
Out[118]:
0.13900739123964478
Boosting
Ada Boost
In [119]:
ADB_model=AdaBoostClassifier(n_estimators=100,random_state=1)
ADB_model.fit(X_train,y_train)
Out[119]:
AdaBoostClassifier(n_estimators=100, random_state=1)
In [120]:
The ADA boost Model Score for train data set is 0.850
[[214 93]
[ 66 688]]
In [121]:
In [122]:
The ADA boost Model Score for test data set is 0.814
[[103 50]
[ 35 268]]
In [123]:
In [124]:
(ADB_model_score_train-ADB_model_score_test)*100
Out[124]:
3.654488483225027
Gradient Boosting
localhost:8888/notebooks/Desktop/MACHINE LEARNING_PROJECT-PROBLEM 1.ipynb 58/85
3/6/22, 10:44 PM MACHINE LEARNING_PROJECT-PROBLEM 1 - Jupyter Notebook
Gradient Boosting
In [125]:
gbc_model=GradientBoostingClassifier(random_state=1)
gbc_model.fit(X_train, y_train)
Out[125]:
GradientBoostingClassifier(random_state=1)
In [126]:
[[239 68]
[ 46 708]]
In [127]:
In [128]:
[[105 48]
[ 27 276]]
In [129]:
In [130]:
(gbc_model_score_train-gbc_model_score_test)*100
Out[130]:
5.702787836698253
In [131]:
The Best Logistic Regression Model Score on train data set is 0.83
In [132]:
# predict probabilities
probs = best_model_lr.predict_proba(X_train)
# keep probabilities for the positive outcome only
probs = probs[:, 1]
# calculate AUC
auc = roc_auc_score(y_train, probs)
print('The ROC_AUC score for Logistic Regression Train data set: %.3f' % auc)
# calculate ROC curve
train_fpr, train_tpr, train_thresholds = roc_curve(y_train, probs)
plt.plot([0, 1], [0, 1], linestyle='--')
# plot the roc curve for the model
plt.plot(train_fpr, train_tpr);
plt.title("ROC Curve for Logistic Regression Train data set",fontsize=14,color = 'red');
The ROC_AUC score for Logistic Regression Train data set: 0.890
In [133]:
The Best Logistic Regression Model Score on train data set is 0.83
In [134]:
# predict probabilities
probs = best_model_lr.predict_proba(X_test)
# keep probabilities for the positive outcome only
probs = probs[:, 1]
# calculate AUC
auc = roc_auc_score(y_test, probs)
print('The ROC_AUC score for Logistic Regression Test data set : %.3f' % auc)
# calculate roc curve
test_fpr, test_tpr, test_thresholds = roc_curve(y_test, probs)
plt.plot([0, 1], [0, 1], linestyle='--')
# plot the roc curve for the model
plt.plot(test_fpr, test_tpr);
plt.title("ROC Curve for Logistic Regression Test data set ",fontsize=14,color = 'red');
The ROC_AUC score for Logistic Regression Test data set : 0.883
In [135]:
ROC_AUC Curve for LDA (linear discriminant analysis) on train data set
In [136]:
# predict probabilities
probs = best_model_lda.predict_proba(X_train)
# keep probabilities for the positive outcome only
probs = probs[:, 1]
# calculate AUC
auc = roc_auc_score(y_train, probs)
print("The ROC_AUC score for LDA Train data set %.2f " % auc)
# calculate roc curve
train_fpr, train_tpr, train_thresholds = roc_curve(y_train, probs)
plt.plot([0, 1], [0, 1], linestyle='--')
# plot the roc curve for the model
plt.plot(train_fpr, train_tpr);
plt.title("ROC Curve for LDA Train data set",fontsize=14,color = 'red');
In [137]:
ROC_AUC Curve for LDA (linear discriminant analysis) on test data set
In [138]:
probs = best_model_lda.predict_proba(X_test)
# keep probabilities for the positive outcome only
probs = probs[:, 1]
# calculate AUC
auc = roc_auc_score(y_test, probs)
print('AUC: %.3f' % auc)
print("The ROC_AUC score for LDA Test data set is' %.3f " % auc)
# calculate roc curve
test_fpr, test_tpr, test_thresholds = roc_curve(y_test, probs)
plt.plot([0, 1], [0, 1], linestyle='--')
# plot the roc curve for the model
plt.plot(test_fpr, test_tpr);
plt.title("ROC Curve for LDA Test data set",fontsize=14,color = 'red');
AUC: 0.888
The ROC_AUC score for LDA Test data set is' 0.888
In [139]:
[[206 101]
[ 66 688]]
In [140]:
# predict probabilities
probs = KNN_model_1.predict_proba(X_train)
# keep probabilities for the positive outcome only
probs = probs[:, 1]
# calculate AUC
auc = roc_auc_score(y_train, probs)
print("The ROC_AUC score for KNN train data set %.2f " % auc)
# calculate roc curve
train_fpr, train_tpr, train_thresholds = roc_curve(y_train, probs)
plt.plot([0, 1], [0, 1], linestyle='--')
# plot the roc curve for the model
plt.plot(train_fpr, train_tpr);
plt.title("ROC Curve for for KNN test data set",fontsize=14,color = 'red');
In [141]:
[[105 48]
[ 30 273]]
In [142]:
# predict probabilities
probs = KNN_model_1.predict_proba(X_test)
# keep probabilities for the positive outcome only
probs = probs[:, 1]
# calculate AUC
auc = roc_auc_score(y_test, probs)
print("The ROC_AUC score for KNN train data set %.2f " % auc)
# calculate roc curve
test_fpr, test_tpr, test_thresholds = roc_curve(y_test, probs)
plt.plot([0, 1], [0, 1], linestyle='--', color='red')
# plot the roc curve for the model
plt.plot(test_fpr, test_tpr);
plt.title("ROC Curve for for KNN test data set",fontsize=14,color = 'red');
In [143]:
[[616 138]
[131 623]]
ROC_AUC Curve for Naive Bayes with SMOTE Model on train data set
In [144]:
probs = NB_SM_model.predict_proba(X_train_res)
probs = probs[:, 1]
auc = roc_auc_score(y_train_res, probs)
print("The ROC_AUC score for Naive Bayes with SMOTE train data set %.2f " % auc)
train_fpr, train_tpr, train_thresholds = roc_curve(y_train_res, probs)
plt.plot([0, 1], [0, 1], linestyle='--')
plt.plot(train_fpr, train_tpr);
plt.title("ROC Curve for Naive Bayes with SMOTE train data set",fontsize=14,color = 'red');
The ROC_AUC score for Naive Bayes with SMOTE train data set 0.90
In [145]:
[[125 28]
[ 59 244]]
ROC AUC Curve for Naive Bayes with SMOTE Model on test data set
localhost:8888/notebooks/Desktop/MACHINE LEARNING_PROJECT-PROBLEM 1.ipynb 72/85
3/6/22, 10:44 PM MACHINE LEARNING_PROJECT-PROBLEM 1 - Jupyter Notebook
ROC_AUC Curve for Naive Bayes with SMOTE Model on test data set
In [146]:
probs_test = NB_SM_model.predict_proba(X_test)
probs_test = probs_test[:, 1]
auc = roc_auc_score(y_test, probs_test)
print("The ROC_AUC score for Naive Bayes with SMOTE Model on test data set %.2f " % auc)
test_fpr, test_tpr, test_thresholds = roc_curve(y_test, probs_test)
plt.plot([0, 1], [0, 1], linestyle='--')
plt.plot(test_fpr, test_tpr)
plt.title("ROC Curve for Naive Bayes with SMOTE Model on test data set",fontsize=14,color =
The ROC_AUC score for Naive Bayes with SMOTE Model on test data set 0.88
In [147]:
[[307 0]
[ 0 754]]
In [148]:
Recall=(754/(0+754))
print("Random Forest-Train Data Set-Recall for class 1 is %.2f " % Recall)
In [149]:
probs = RF_model.predict_proba(X_train)
probs = probs[:, 1]
auc = roc_auc_score(y_train, probs)
print("The AUC_ROC score for Random Forest train data set %.2f " % auc)
train_fpr, train_tpr, train_thresholds = roc_curve(y_train, probs)
plt.plot([0, 1], [0, 1], linestyle='--')
plt.plot(train_fpr, train_tpr);
plt.title("ROC Curve for Random Forest train data",fontsize=14,color = 'red');
The AUC_ROC score for Random Forest train data set 1.00
In [150]:
[[104 49]
[ 28 275]]
In [151]:
Recall=(275/(28+275))
print("Random Forest-Test Data Set-Recall for class 1 is %.2f " % Recall)
In [152]:
probs_test = RF_model.predict_proba(X_test)
probs_test = probs_test[:, 1]
auc = roc_auc_score(y_test, probs_test)
print("The AUC_ROC score for Random Forest test data set %.2f " % auc)
test_fpr, test_tpr, test_thresholds = roc_curve(y_test, probs_test)
plt.plot([0, 1], [0, 1], linestyle='--')
plt.plot(test_fpr, test_tpr);
plt.title("ROC Curve for Random Forest Test data set",fontsize=14,color = 'red');
The AUC_ROC score for Random Forest test data set 0.90
In [153]:
[[278 29]
[ 5 749]]
In [154]:
probs = Bagging_model.predict_proba(X_train)
probs = probs[:, 1]
auc = roc_auc_score(y_train, probs)
print("The ROC_AUC score for Bagging train data set %.2f " % auc)
train_fpr, train_tpr, train_thresholds = roc_curve(y_train, probs)
plt.plot([0, 1], [0, 1], linestyle='--')
plt.plot(train_fpr, train_tpr);
plt.title("ROC Curve for Bagging Train data set",fontsize=14,color = 'red');
In [155]:
[[104 49]
[ 29 274]]
In [156]:
probs_test = Bagging_model.predict_proba(X_test)
probs_test = probs_test[:, 1]
auc = roc_auc_score(y_test, probs_test)
print("The AUC_ROC score for Bagging test data set %.2f " % auc)
test_fpr, test_tpr, test_thresholds = roc_curve(y_test, probs_test)
plt.plot([0, 1], [0, 1], linestyle='--')
plt.plot(test_fpr, test_tpr);
plt.title("ROC Curve for Bagging Test data set",fontsize=14,color = 'red');
In [157]:
The ADA boost Model Score for train data set is 0.850
[[214 93]
[ 66 688]]
In [158]:
probs = ADB_model.predict_proba(X_train)
probs = probs[:, 1]
auc = roc_auc_score(y_train, probs)
print("The AUC_ROC score for ADB Model train data set %.2f " % auc)
train_fpr, train_tpr, train_thresholds = roc_curve(y_train, probs)
plt.plot([0, 1], [0, 1], linestyle='--')
plt.plot(train_fpr, train_tpr);
plt.title("ROC Curve for ADB Model train data set",fontsize=14,color = 'red');
The AUC_ROC score for ADB Model train data set 0.91
In [159]:
The ADA boost Model Score for test data set is 0.81
[[103 50]
[ 35 268]]
In [160]:
probs_test = ADB_model.predict_proba(X_test)
probs_test = probs_test[:, 1]
auc = roc_auc_score(y_test, probs_test)
print("The AUC_ROC score for ADB Model test data set %.2f " % auc)
test_fpr, test_tpr, test_thresholds = roc_curve(y_test, probs_test)
plt.plot([0, 1], [0, 1], linestyle='--')
plt.plot(test_fpr, test_tpr);
plt.title("ROC Curve for ADB Model test data set",fontsize=14,color = 'red');
The AUC_ROC score for ADB Model test data set 0.88
In [161]:
[[239 68]
[ 46 708]]
In [162]:
Recall=(708/(46+708))
print("Gradient Boosting-Train Data Set-Recall for class 1 is %.3f " % Recall)
In [163]:
probs = gbc_model.predict_proba(X_train)
probs = probs[:, 1]
auc = roc_auc_score(y_train, probs)
print("The ROC_AUC score for Gradient Boosting train data set %.3f " % auc)
train_fpr, train_tpr, train_thresholds = roc_curve(y_train, probs)
plt.plot([0, 1], [0, 1], linestyle='--')
plt.plot(train_fpr, train_tpr);
plt.title("ROC Curve for Gradient Boosting train data set",fontsize=14,color = 'red');
The ROC_AUC score for Gradient Boosting train data set 0.951
In [164]:
[[105 48]
[ 27 276]]
In [165]:
Recall=(276/(27+276))
print("Gradient Boosting-Test Data Set-Recall for class 1 is %.3f " % Recall)
In [166]:
probs_test = gbc_model.predict_proba(X_test)
probs_test = probs_test[:, 1]
auc = roc_auc_score(y_test, probs_test)
print("The ROC_AUC score for Gradient Boosting test data set %.3f " % auc)
test_fpr, test_tpr, test_thresholds = roc_curve(y_test, probs_test)
plt.plot([0, 1], [0, 1], linestyle='--')
plt.plot(test_fpr, test_tpr)
plt.title("ROC Curve for Gradient Boosting test data set",fontsize=14,color = 'red');
The ROC_AUC score for Gradient Boosting test data set 0.899
print("The Logistic Regression Model Score Post Tuning on train data set is %.3f " % best_m
print("The Logistic Regression Model Score Post Tuning on test data set is %.3f " % best_m
print("The LDA Model Score Post Tuning on train data set is %.3f " % best_model_lda.score(X
print("The LDA Model Score Post Tuning on test data set is %.3f " % best_model_lda.score(X
print("The KNN Model Score Post Tuning on Train data %.3f " % KNN_model_1.score(X_train, y_
print("The KNN Model Score Post Tuning on Test data %.3f " % KNN_model_1.score(X_test, y_te
print("The Naive Bayes Model Score Post Tuning on train data is %.3f " % NB_SM_model.score(
print("The Naive Bayes Model Score Post Tuning on test data is %.3f " % NB_SM_model.score(X
The Logistic Regression Model Score Post Tuning on train data set is 0.834
The Logistic Regression Model Score Post Tuning on test data set is 0.829
The LDA Model Score Post Tuning on train data set is 0.835
The LDA Model Score Post Tuning on test data set is 0.831
The Naive Bayes Model Score Post Tuning on train data is 0.822
The Naive Bayes Model Score Post Tuning on test data is 0.809
In [169]:
print("Variance in Test and train Scores of LDA Model is %.5f " % (best_model_lr.score(X_tr
In [170]:
print("Variance in Test and train Scores of LDA Model is %.5f " % (best_model_lda.score(X_t
In [171]:
print("Variance in Test and train Scores of KNN Model for is %.5f " % (KNN_model_1.score(X
In [172]:
print("Variance in Test and train Scores of LR Model for is %.5f " % (NB_SM_model.score(X_
Cross Validation
In [173]:
In [174]:
Out[174]:
In [175]:
Out[175]:
--------------------------