Healthcare-Project-Simplilearn - Week3
Healthcare-Project-Simplilearn - Week3
#Check the balance of the data by plotting the count of outcomes by their value. Des
# Class-balance check: one bar per distinct Outcome value, bar height = row count.
plt.figure(figsize=(8,6))
# Name the column via `x=` instead of passing the Series positionally: seaborn 0.11
# emits a FutureWarning for the positional form, and from 0.12 on the first
# positional argument is interpreted as `data`, not as the x variable.
sns.countplot(x='Outcome', data=df)
plt.title("count of outcomes", fontsize=15,loc='center', color='Black')
plt.xlabel("Outcome")
plt.ylabel("Value count")
plt.show()
#Create scatter charts between pairs of variables to understand the relationships.
# Pregnancies against Glucose, with one Set1 colour per Outcome value.
preg_gluc_ax = plt.figure(figsize=(15, 5)).gca()
sns.scatterplot(
    data=df,
    x='Pregnancies',
    y='Glucose',
    hue='Outcome',
    palette="Set1",
    ax=preg_gluc_ax,
)
preg_gluc_ax.set_xlabel('Pregnancies', fontsize=13)
preg_gluc_ax.set_ylabel('Glucose', fontsize=13)
preg_gluc_ax.set_title('grouped scatter plot - Pregnancies vs Glucose', fontsize=16)
preg_gluc_ax.legend()
plt.show()
# Pregnancies against Outcome, coloured by Outcome value.
plt.figure(figsize=(15,5))
# `palette` only takes effect together with `hue`; without it seaborn ignores the
# palette and the points are not grouped, contradicting the plot title. Colour by
# Outcome, as the Pregnancies-vs-Glucose plot does.
sns.scatterplot(x='Pregnancies',y='Outcome',data=df,hue='Outcome',palette="Set1")
plt.xlabel('Pregnancies', fontsize=13)
plt.ylabel('Outcome', fontsize=13)
plt.title('grouped scatter plot - Pregnancies vs Outcome',fontsize=16)
plt.show()
# Pairwise plots of every column of df against every other, coloured by Outcome.
sns.pairplot(df, hue='Outcome')
# pairplot builds a grid of many axes, so plt.title() would label only the last
# subplot. Use a figure-level title instead; y slightly above 1 keeps it clear of
# the top row of panels.
plt.suptitle('grouped scatter plot - All Variables', fontsize=16, y=1.02)
# No plt.legend() here: pairplot already adds the hue legend to the figure, and an
# extra call would only attach a second legend to the last subplot.
plt.show()
# Builds one FacetGrid per entry of feature_cols and maps sns.kdeplot onto it, with
# hue="Outcome" so each Outcome value gets its own density curve.
# NOTE(review): the mapped column is cut off in this export — presumably
# feature_cols[i]; confirm against the original notebook. The loop body has also
# lost its indentation in the export.
for i in range(len(feature_cols)):
sns.FacetGrid(df,hue="Outcome",aspect=3,margin_titles=True).map(sns.kdeplot,feature
# Correlation matrix of df's columns, kept in `corr` for the heatmap below.
corr = df.corr()
# Bare expression: shows the matrix when it is the last line of a notebook cell.
corr
# Annotated heatmap of the full (symmetric) matrix on a 15x10-inch figure.
corr_ax = plt.figure(figsize=(15, 10)).gca()
sns.heatmap(
    corr,
    annot=True,
    cmap='RdYlGn',
    linewidths=0.30,
    ax=corr_ax,
)
corr_ax.set_title("Healthcare Dataset Heatmap")
plt.show()
# Correlation Matrix Heatmap Visualization (should run this code again after removin
sns.set(style="white")
# Boolean mask that is True on and above the diagonal of the correlation matrix;
# passed to sns.heatmap it hides those cells so only the lower triangle is drawn.
# `np.bool` was deprecated in NumPy 1.20 and removed in 1.24 (AttributeError);
# the builtin `bool` gives the same dtype on every NumPy version.
mask = np.zeros_like(df.corr(), dtype=bool)
mask[np.triu_indices_from(mask)] = True
# Set up the matplotlib figure to control the size of the heatmap
fig, ax = plt.subplots(figsize=(8,8))
# Custom diverging palette between hue 255 (negative end) and hue 10 (positive end).
# as_cmap=True returns a matplotlib colormap object rather than a list of colors.
cmap = sns.diverging_palette(255, 10, as_cmap=True)
# Plot the heatmap
# Annotated, square-celled heatmap of df.corr() using the mask and colormap built
# above, with the colour scale starting at vmin=-1 (rest of the call is cut off in
# this export).
sns.heatmap(df.corr(), mask=mask, annot=True, square=True, cmap=cmap , vmin=-1, vmax
# NOTE(review): the original author observed that the correlation labels could not
# be displayed here.
# Prevent heatmap cut-off: shift the y-limits by half a unit each
# (bottom + 0.5, top - 0.5).
# NOTE(review): presumably a workaround for a matplotlib release that clipped the
# first and last heatmap rows — confirm it is still needed on the version in use.
bottom, top = ax.get_ylim()
ax.set_ylim(bottom+0.5, top-0.5)
Strategies:-
1. The modelling task is binary classification: predict Outcome, whose value is either 0 or 1.
In [3]: # Splitting the dataset into the Training set and Test set
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.3, random_st
In [6]: #Using KNeighborsClassifier Method of neighbors class to use Nearest Neighbor algori
from sklearn.neighbors import KNeighborsClassifier

# k-nearest-neighbours baseline with k = 13; fit() returns the estimator itself,
# so construction and training can be chained.
classifier_knn = KNeighborsClassifier(n_neighbors=13).fit(x_train_scaled, y_train)
y_pred_knn = classifier_knn.predict(x_test_scaled)
print(f'Accuracy of KNN : {accuracy_score(y_test, y_pred_knn)}')
In [7]: #Using SVC method of svm class to use Support Vector Machine Algorithm
from sklearn.svm import SVC

# Support vector classifier with a linear kernel; fit() returns the estimator
# itself, so construction and training can be chained.
classifier_svc = SVC(kernel='linear', random_state=0).fit(x_train_scaled, y_train)
y_pred_svc = classifier_svc.predict(x_test_scaled)
print(f'Accuracy of SVM-linear: {accuracy_score(y_test, y_pred_svc)}')
In [8]: #Using SVC-Kernel method of svm class to use Support Vector Machine Algorithm
from sklearn.svm import SVC

# Support vector classifier with an RBF kernel and regularisation strength C=1;
# fit() returns the estimator itself, so construction and training can be chained.
classifier_svc_rbf = SVC(kernel='rbf', random_state=0, C=1).fit(x_train_scaled, y_train)
y_pred_svc_rbf = classifier_svc_rbf.predict(x_test_scaled)
print(f'Accuracy of SVM-RBF: {accuracy_score(y_test, y_pred_svc_rbf)}')
In [11]: #Using RandomForestClassifier method of ensemble class to use Random Forest Classifi
from sklearn.ensemble import RandomForestClassifier
# Random forest baseline: 9 trees split on the entropy criterion (the remaining
# constructor arguments are cut off in this export).
classifier_rnd = RandomForestClassifier(n_estimators= 9, criterion = 'entropy', rand
classifier_rnd.fit(x_train_scaled, y_train)
y_pred_rnd=classifier_rnd.predict(x_test_scaled)
print('Accuracy of Random Forest Classifier: {}'.format(accuracy_score(y_test,y_pred
# Label the fitted forest's feature_importances_ with the column names of x and
# plot the (up to) ten largest values.
pd.Series(classifier_rnd.feature_importances_,index=x.columns).nlargest(10).plot(kin
plt.title('Top Features derived by Random Forest', size=20)
plt.show()
('Insulin', 5)
('SkinThickness', 4)
('BloodPressure', 3)
('Age', 2)
('Pregnancies', 1)
('Glucose', 1)
('BMI', 1)
('DiabetesPedigreeFunction', 1)
# Horizontal bar chart of the RFE ranking of each column of x.
# NOTE(review): in scikit-learn's RFE, rank 1 marks the selected (best) features and
# larger ranks mean the feature was eliminated earlier. nlargest() therefore orders
# the bars from the *least* useful feature, which does not match the
# "Top Features" title — consider nsmallest() or plotting an inverted score.
In [15]: pd.Series(rfe.ranking_,index=x.columns).nlargest(10).plot(kind='barh',color='Pink').
plt.title('Top Features derived by RFE', size=20)
plt.show()
y_pred = model.predict(x_test_scaled)
In [18]: df_models=pd.concat([baseline_model(classifier_logreg,x_train_scaled,y_train,x_test_
baseline_model(classifier_knn,x_train_scaled,y_train,x_test_scaled,y_test,'KNN Clas
baseline_model(classifier_svc,x_train_scaled,y_train,x_test_scaled,y_test,'Linear S
baseline_model(classifier_svc_rbf,x_train_scaled,y_train,x_test_scaled,y_test,'SVC-
baseline_model(classifier_nb,x_train_scaled,y_train,x_test_scaled,y_test,'Gaussian
baseline_model(classifier_dec,x_train_scaled,y_train,x_test_scaled,y_test,'Decision
baseline_model(classifier_rnd,x_train_scaled,y_train,x_test_scaled,y_test,'Random F
# Fine-tune figure; make subplots farther from each other, or nearer to each other.
fig.subplots_adjust(hspace=0.5, wspace=0.5)
In [ ]: