Statistics Project 6
Statistics Project 6
In [1]:
import numpy as np
import pandas as pd
filterwarnings('ignore')
In [3]:
df = pd.read_csv('Holiday_Package.csv')
In [4]:
df.head()
Out[4]:
Unnamed:
Holliday_Package Salary age educ no_young_children no_older_children fo
0
0 1 no 48412 30 8 1 1
1 2 yes 37207 45 8 0 1
2 3 no 58022 46 9 0 0
3 4 no 66503 31 11 2 0
4 5 no 66734 44 12 0 2
In [5]:
df.tail()
Out[5]:
Unnamed:
Holliday_Package Salary age educ no_young_children no_older_children
0
https://htmtopdf.herokuapp.com/ipynbviewer/temp/c9ffe7dee6cf683104ff5b70752d8eb0/Holiday_Package.html?t=1625308272464 1/48
7/3/2021 temp-162530817097237750
In [6]:
df.shape
Out[6]:
(872, 8)
In [7]:
df.info()
<class 'pandas.core.frame.DataFrame'>
Data Description
In [8]:
df.describe(include ='all').T
Out[8]:
Salary 872 NaN NaN NaN 47729.2 23418.7 1322 35324 41903.5 534
https://htmtopdf.herokuapp.com/ipynbviewer/temp/c9ffe7dee6cf683104ff5b70752d8eb0/Holiday_Package.html?t=1625308272464 2/48
7/3/2021 temp-162530817097237750
In [9]:
df.isnull().sum()
Out[9]:
Unnamed: 0 0
Holliday_Package 0
Salary 0
age 0
educ 0
no_young_children 0
no_older_children 0
foreign 0
dtype: int64
In [10]:
dups = df.duplicated()
In [11]:
# For every object-dtype (text) column, print its name in upper case, the number
# of distinct values, and the frequency of each value in ascending order.
# The loop header binds `column`; without it the cell cannot run.
for column in df.columns:
    if df[column].dtype == 'object':
        print(column.upper(),': ',df[column].nunique())
        print(df[column].value_counts().sort_values())
        print('\n')
HOLLIDAY_PACKAGE : 2
yes 401
no 471
FOREIGN : 2
yes 216
no 656
df.Holliday_Package.value_counts(1)
https://htmtopdf.herokuapp.com/ipynbviewer/temp/c9ffe7dee6cf683104ff5b70752d8eb0/Holiday_Package.html?t=1625308272464 3/48
7/3/2021 temp-162530817097237750
In [12]:
fig.set_size_inches(15, 23)
a.set_title("Salary Distribution",fontsize=10)
a.set_title("Salary Distribution",fontsize=15)
a.set_title("age Distribution",fontsize=10)
a.set_title("age Distribution",fontsize=10)
a.set_title("educ Distribution",fontsize=10)
a.set_title("educ Distribution",fontsize=10)
a.set_title("no_young_children Distribution",fontsize=10)
a.set_title("no_young_children Distribution",fontsize=10)
a.set_title("no_older_children Distribution",fontsize=10)
a.set_title("no_older_children Distribution",fontsize=10)
https://htmtopdf.herokuapp.com/ipynbviewer/temp/c9ffe7dee6cf683104ff5b70752d8eb0/Holiday_Package.html?t=1625308272464 4/48
7/3/2021 temp-162530817097237750
Out[12]:
https://htmtopdf.herokuapp.com/ipynbviewer/temp/c9ffe7dee6cf683104ff5b70752d8eb0/Holiday_Package.html?t=1625308272464 5/48
7/3/2021 temp-162530817097237750
https://htmtopdf.herokuapp.com/ipynbviewer/temp/c9ffe7dee6cf683104ff5b70752d8eb0/Holiday_Package.html?t=1625308272464 6/48
7/3/2021 temp-162530817097237750
In [13]:
df.columns
Out[13]:
dtype='object')
In [14]:
# Skewness of the numeric columns only. `df` also holds text columns
# (Holliday_Package, foreign); without numeric_only=True recent pandas
# versions raise TypeError instead of silently skipping them.
df.skew(numeric_only=True)
Out[14]:
Unnamed: 0 0.000000
Salary 3.103216
age 0.146412
educ -0.045501
no_young_children 1.946515
no_older_children 0.953951
dtype: float64
Categorical Variables
https://htmtopdf.herokuapp.com/ipynbviewer/temp/c9ffe7dee6cf683104ff5b70752d8eb0/Holiday_Package.html?t=1625308272464 7/48
7/3/2021 temp-162530817097237750
In [15]:
Out[15]:
<AxesSubplot:xlabel='foreign', ylabel='count'>
In [16]:
Out[16]:
<AxesSubplot:xlabel='Holliday_Package', ylabel='count'>
https://htmtopdf.herokuapp.com/ipynbviewer/temp/c9ffe7dee6cf683104ff5b70752d8eb0/Holiday_Package.html?t=1625308272464 8/48
7/3/2021 temp-162530817097237750
In [17]:
sns.catplot(x="Holliday_Package", y="Salary",kind="swarm",data=df)
Out[17]:
<seaborn.axisgrid.FacetGrid at 0x2591280b700>
In [18]:
sns.catplot(x="Holliday_Package", y="age",kind="swarm",data=df)
Out[18]:
<seaborn.axisgrid.FacetGrid at 0x259121c45b0>
https://htmtopdf.herokuapp.com/ipynbviewer/temp/c9ffe7dee6cf683104ff5b70752d8eb0/Holiday_Package.html?t=1625308272464 9/48
7/3/2021 temp-162530817097237750
In [19]:
sns.catplot(x="Holliday_Package", y="educ",kind="swarm",data=df)
Out[19]:
<seaborn.axisgrid.FacetGrid at 0x2591280bca0>
In [21]:
sns.catplot(x="Holliday_Package", y="no_young_children",kind="swarm",data=df)
Out[21]:
<seaborn.axisgrid.FacetGrid at 0x259122bae20>
https://htmtopdf.herokuapp.com/ipynbviewer/temp/c9ffe7dee6cf683104ff5b70752d8eb0/Holiday_Package.html?t=1625308272464 10/48
7/3/2021 temp-162530817097237750
In [22]:
sns.catplot(x="Holliday_Package", y="no_older_children",kind="swarm",data=df)
Out[22]:
<seaborn.axisgrid.FacetGrid at 0x25912222760>
In [23]:
Out[23]:
<AxesSubplot:xlabel='age', ylabel='Salary'>
https://htmtopdf.herokuapp.com/ipynbviewer/temp/c9ffe7dee6cf683104ff5b70752d8eb0/Holiday_Package.html?t=1625308272464 11/48
7/3/2021 temp-162530817097237750
In [24]:
palette="Set1")
Out[24]:
<seaborn.axisgrid.FacetGrid at 0x2591231ca30>
https://htmtopdf.herokuapp.com/ipynbviewer/temp/c9ffe7dee6cf683104ff5b70752d8eb0/Holiday_Package.html?t=1625308272464 12/48
7/3/2021 temp-162530817097237750
In [25]:
palette="Set1")
Out[25]:
<seaborn.axisgrid.FacetGrid at 0x259124438e0>
In [26]:
Out[26]:
<AxesSubplot:xlabel='educ', ylabel='Salary'>
https://htmtopdf.herokuapp.com/ipynbviewer/temp/c9ffe7dee6cf683104ff5b70752d8eb0/Holiday_Package.html?t=1625308272464 13/48
7/3/2021 temp-162530817097237750
In [27]:
Out[27]:
<AxesSubplot:xlabel='no_young_children', ylabel='age'>
In [28]:
palette="Set1")
Out[28]:
<seaborn.axisgrid.FacetGrid at 0x25912509580>
https://htmtopdf.herokuapp.com/ipynbviewer/temp/c9ffe7dee6cf683104ff5b70752d8eb0/Holiday_Package.html?t=1625308272464 14/48
7/3/2021 temp-162530817097237750
In [29]:
Out[29]:
<AxesSubplot:xlabel='no_older_children', ylabel='age'>
In [30]:
palette="Set1")
Out[30]:
<seaborn.axisgrid.FacetGrid at 0x259125e3a60>
https://htmtopdf.herokuapp.com/ipynbviewer/temp/c9ffe7dee6cf683104ff5b70752d8eb0/Holiday_Package.html?t=1625308272464 15/48
7/3/2021 temp-162530817097237750
In [31]:
# Draw one box plot per column named in `cols`, each on its own figure.
# The Series is passed as `x=` explicitly: positional data arguments are
# deprecated in seaborn and no longer map to `x` in newer releases.
for i in cols:
    sns.boxplot(x=df[i])
    plt.show()
https://htmtopdf.herokuapp.com/ipynbviewer/temp/c9ffe7dee6cf683104ff5b70752d8eb0/Holiday_Package.html?t=1625308272464 16/48
7/3/2021 temp-162530817097237750
https://htmtopdf.herokuapp.com/ipynbviewer/temp/c9ffe7dee6cf683104ff5b70752d8eb0/Holiday_Package.html?t=1625308272464 17/48
7/3/2021 temp-162530817097237750
In [32]:
df.columns
Out[32]:
dtype='object')
Data Distribution
https://htmtopdf.herokuapp.com/ipynbviewer/temp/c9ffe7dee6cf683104ff5b70752d8eb0/Holiday_Package.html?t=1625308272464 18/48
7/3/2021 temp-162530817097237750
In [33]:
https://htmtopdf.herokuapp.com/ipynbviewer/temp/c9ffe7dee6cf683104ff5b70752d8eb0/Holiday_Package.html?t=1625308272464 19/48
7/3/2021 temp-162530817097237750
In [34]:
# Pairwise correlation of the numeric columns. Text columns are dropped
# explicitly so the call works regardless of how the installed pandas
# version treats non-numeric data in `corr`.
df_cor = df.select_dtypes(include='number').corr()
plt.figure(figsize=(8,6))
Out[34]:
<AxesSubplot:>
https://htmtopdf.herokuapp.com/ipynbviewer/temp/c9ffe7dee6cf683104ff5b70752d8eb0/Holiday_Package.html?t=1625308272464 20/48
7/3/2021 temp-162530817097237750
In [35]:
df.isnull().sum()
Out[35]:
Unnamed: 0 0
Holliday_Package 0
Salary 0
age 0
educ 0
no_young_children 0
no_older_children 0
foreign 0
dtype: int64
Treating Outliers
In [36]:
In [37]:
def remove_outlier(col, factor=1.5):
    """Return the (lower, upper) IQR fences for a numeric sequence.

    Despite its name this function does not remove anything; it only computes
    the bounds that the caller uses to cap values.

    Parameters
    ----------
    col : array-like of numbers
        Values to analyse (list, NumPy array or pandas Series).
    factor : float, default 1.5
        Multiple of the inter-quartile range added beyond the quartiles.

    Returns
    -------
    tuple of float
        ``(Q1 - factor * IQR, Q3 + factor * IQR)``.
    """
    # np.percentile does not need sorted input, so no sorting step is required.
    Q1, Q3 = np.percentile(col, [25, 75])
    IQR = Q3 - Q1
    lower_range = Q1 - factor * IQR
    upper_range = Q3 + factor * IQR
    return lower_range, upper_range
In [41]:
# Cap each column at its IQR fences: values above the upper fence become the
# fence value, and likewise for values below the lower fence.
# NOTE(review): the loop header was absent from this cell; iterating `cols`
# (the same list the box-plot cells use) is assumed - confirm the column list.
for column in cols:
    lr,ur=remove_outlier(df[column])
    df[column]=np.where(df[column]>ur,ur,df[column])
    df[column]=np.where(df[column]<lr,lr,df[column])
https://htmtopdf.herokuapp.com/ipynbviewer/temp/c9ffe7dee6cf683104ff5b70752d8eb0/Holiday_Package.html?t=1625308272464 21/48
7/3/2021 temp-162530817097237750
In [42]:
# Re-draw one box plot per column in `cols`, each on its own figure.
# The Series is passed as `x=` explicitly: positional data arguments are
# deprecated in seaborn and no longer map to `x` in newer releases.
for i in cols:
    sns.boxplot(x=df[i])
    plt.show()
https://htmtopdf.herokuapp.com/ipynbviewer/temp/c9ffe7dee6cf683104ff5b70752d8eb0/Holiday_Package.html?t=1625308272464 22/48
7/3/2021 temp-162530817097237750
https://htmtopdf.herokuapp.com/ipynbviewer/temp/c9ffe7dee6cf683104ff5b70752d8eb0/Holiday_Package.html?t=1625308272464 23/48
7/3/2021 temp-162530817097237750
https://htmtopdf.herokuapp.com/ipynbviewer/temp/c9ffe7dee6cf683104ff5b70752d8eb0/Holiday_Package.html?t=1625308272464 24/48
7/3/2021 temp-162530817097237750
In [40]:
df.head()
Out[40]:
Unnamed:
Holliday_Package Salary age educ no_young_children no_older_children
0
In [43]:
In [44]:
df1.info()
<class 'pandas.core.frame.DataFrame'>
In [45]:
df2 = df1.copy()
2.2 Do not scale the data. Encode the data (having string values) for Modelling. Data Split: Split the data into
train and test (70:30). Apply Logistic Regression and LDA (linear discriminant analysis).
In [46]:
https://htmtopdf.herokuapp.com/ipynbviewer/temp/c9ffe7dee6cf683104ff5b70752d8eb0/Holiday_Package.html?t=1625308272464 25/48
7/3/2021 temp-162530817097237750
In [47]:
data.head()
Out[47]:
In [48]:
data.columns
Out[48]:
'Holliday_Package_yes', 'foreign_yes'],
dtype='object')
In [49]:
# Predictors: every column of `data` except the target dummy column.
X = data.drop('Holliday_Package_yes', axis=1)
# Target: the 'Holliday_Package_yes' column of `data`.
y = data['Holliday_Package_yes']
In [50]:
In [51]:
y_train.value_counts(1)
Out[51]:
0 0.539344
1 0.460656
https://htmtopdf.herokuapp.com/ipynbviewer/temp/c9ffe7dee6cf683104ff5b70752d8eb0/Holiday_Package.html?t=1625308272464 26/48
7/3/2021 temp-162530817097237750
In [53]:
# Hyper-parameter grid for LogisticRegression, split per solver so that only
# supported solver/penalty pairs are tried: 'lbfgs' does not support 'l1' and
# 'liblinear' does not support 'none'. A single Cartesian dict would include
# those invalid pairs, whose fits fail. GridSearchCV accepts a list of dicts.
grid=[
    {'penalty':['l2','none'],
     'solver':['lbfgs'],
     'tol':[0.0001,0.000001]},
    {'penalty':['l1','l2'],
     'solver':['liblinear'],
     'tol':[0.0001,0.000001]},
]
In [54]:
model = LogisticRegression(max_iter=100000,n_jobs=2)
In [55]:
In [56]:
grid_search.fit(X_train, y_train)
Out[56]:
n_jobs=-1,
scoring='f1')
In [57]:
print(grid_search.best_params_,'\n')
print(grid_search.best_estimator_)
In [58]:
best_model = grid_search.best_estimator_
In [59]:
ytrain_predict = best_model.predict(X_train)
ytest_predict = best_model.predict(X_test)
https://htmtopdf.herokuapp.com/ipynbviewer/temp/c9ffe7dee6cf683104ff5b70752d8eb0/Holiday_Package.html?t=1625308272464 27/48
7/3/2021 temp-162530817097237750
In [60]:
ytrain_predict
Out[60]:
array([1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0,
1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1,
1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1,
0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0,
1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0,
1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0,
1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0,
0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0,
1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0,
0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 1,
1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0,
0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1,
0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0,
0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1,
0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0,
1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0,
1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1,
0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1,
0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1,
0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1,
0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0,
0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0,
0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0,
0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0,
0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0], dtype=uint8)
In [61]:
ytest_predict_prob=best_model.predict_proba(X_test)
pd.DataFrame(ytest_predict_prob).head()
Out[61]:
0 1
0 0.636523 0.363477
1 0.576651 0.423349
2 0.650835 0.349165
3 0.568064 0.431936
4 0.536356 0.463644
https://htmtopdf.herokuapp.com/ipynbviewer/temp/c9ffe7dee6cf683104ff5b70752d8eb0/Holiday_Package.html?t=1625308272464 28/48
7/3/2021 temp-162530817097237750
In [62]:
plot_confusion_matrix(best_model,X_train,y_train)
print(classification_report(y_train, ytrain_predict),'\n');
https://htmtopdf.herokuapp.com/ipynbviewer/temp/c9ffe7dee6cf683104ff5b70752d8eb0/Holiday_Package.html?t=1625308272464 29/48
7/3/2021 temp-162530817097237750
In [63]:
plot_confusion_matrix(best_model,X_test,y_test)
print(classification_report(y_test, ytest_predict),'\n');
In [64]:
lr_train_acc
Out[64]:
0.6344262295081967
https://htmtopdf.herokuapp.com/ipynbviewer/temp/c9ffe7dee6cf683104ff5b70752d8eb0/Holiday_Package.html?t=1625308272464 30/48
7/3/2021 temp-162530817097237750
In [65]:
# predict probabilities
probs = best_model.predict_proba(X_train)
probs = probs[:, 1]
# calculate AUC
plt.plot(train_fpr, train_tpr);
AUC: 0.661
In [66]:
lr_test_acc
Out[66]:
0.6564885496183206
https://htmtopdf.herokuapp.com/ipynbviewer/temp/c9ffe7dee6cf683104ff5b70752d8eb0/Holiday_Package.html?t=1625308272464 31/48
7/3/2021 temp-162530817097237750
In [67]:
# predict probabilities
probs = best_model.predict_proba(X_test)
probs = probs[:, 1]
# calculate AUC
plt.plot(test_fpr, test_tpr);
AUC: 0.675
In [68]:
# Pull precision / recall / F1 for class "1" out of the training classification report.
# The report table gets its own name so the dataset variable `df` is not overwritten,
# and metrics are read by column label instead of by integer position.
lr_metrics=classification_report(y_train, ytrain_predict,output_dict=True)
lr_train_report=pd.DataFrame(lr_metrics).transpose()
lr_train_f1=round(lr_train_report.loc["1","f1-score"],2)
lr_train_recall=round(lr_train_report.loc["1","recall"],2)
lr_train_precision=round(lr_train_report.loc["1","precision"],2)
lr_train_precision 0.65
lr_train_recall 0.45
lr_train_f1 0.53
https://htmtopdf.herokuapp.com/ipynbviewer/temp/c9ffe7dee6cf683104ff5b70752d8eb0/Holiday_Package.html?t=1625308272464 32/48
7/3/2021 temp-162530817097237750
In [69]:
# Pull precision / recall / F1 for class "1" out of the test classification report.
# The report table gets its own name so the dataset variable `df` is not overwritten,
# and metrics are read by column label instead of by integer position.
lr_metrics=classification_report(y_test, ytest_predict,output_dict=True)
lr_test_report=pd.DataFrame(lr_metrics).transpose()
lr_test_f1=round(lr_test_report.loc["1","f1-score"],2)
lr_test_recall=round(lr_test_report.loc["1","recall"],2)
lr_test_precision=round(lr_test_report.loc["1","precision"],2)
lr_test_precision 0.69
lr_test_recall 0.45
lr_test_f1 0.55
LDA MODEL
In [70]:
df1.head()
Out[70]:
In [71]:
df1.shape
Out[71]:
(872, 7)
In [72]:
df1.info()
<class 'pandas.core.frame.DataFrame'>
https://htmtopdf.herokuapp.com/ipynbviewer/temp/c9ffe7dee6cf683104ff5b70752d8eb0/Holiday_Package.html?t=1625308272464 33/48
7/3/2021 temp-162530817097237750
In [73]:
# Replace every object-dtype column of df1 with integer category codes,
# printing the categories and their codes first so the mapping is visible.
# The loop header binds `feature`; without it the cell cannot run.
for feature in df1.columns:
    if df1[feature].dtype == 'object':
        print('\n')
        print('feature:',feature)
        print(pd.Categorical(df1[feature].unique()))
        print(pd.Categorical(df1[feature].unique()).codes)
        df1[feature] = pd.Categorical(df1[feature]).codes
feature: Holliday_Package
['no', 'yes']
[0 1]
feature: foreign
['no', 'yes']
[0 1]
https://htmtopdf.herokuapp.com/ipynbviewer/temp/c9ffe7dee6cf683104ff5b70752d8eb0/Holiday_Package.html?t=1625308272464 34/48
7/3/2021 temp-162530817097237750
In [74]:
# Draw one box plot per column of df1 named in `cols`, each on its own figure.
# The Series is passed as `x=` explicitly: positional data arguments are
# deprecated in seaborn and no longer map to `x` in newer releases.
for i in cols:
    sns.boxplot(x=df1[i])
    plt.show()
https://htmtopdf.herokuapp.com/ipynbviewer/temp/c9ffe7dee6cf683104ff5b70752d8eb0/Holiday_Package.html?t=1625308272464 35/48
7/3/2021 temp-162530817097237750
https://htmtopdf.herokuapp.com/ipynbviewer/temp/c9ffe7dee6cf683104ff5b70752d8eb0/Holiday_Package.html?t=1625308272464 36/48
7/3/2021 temp-162530817097237750
https://htmtopdf.herokuapp.com/ipynbviewer/temp/c9ffe7dee6cf683104ff5b70752d8eb0/Holiday_Package.html?t=1625308272464 37/48
7/3/2021 temp-162530817097237750
In [75]:
df1.head()
Out[75]:
In [76]:
df1.info()
<class 'pandas.core.frame.DataFrame'>
In [77]:
# Predictors: every column of df1 except the target.
X = df1.drop('Holliday_Package',axis=1)
# Target: read the column instead of pop()-ing it, so df1 is left intact and
# the cell can be re-run without a KeyError.
Y = df1['Holliday_Package']
https://htmtopdf.herokuapp.com/ipynbviewer/temp/c9ffe7dee6cf683104ff5b70752d8eb0/Holiday_Package.html?t=1625308272464 38/48
7/3/2021 temp-162530817097237750
In [78]:
# 70:30 train/test split, stratified on Y with a fixed seed so the split is reproducible.
# The call is broken across lines at an argument boundary, keeping `random_state` in one piece.
X_train,X_test,Y_train,Y_test = model_selection.train_test_split(
    X,Y,test_size=0.30,random_state=1,stratify = Y)
In [79]:
# Create a LinearDiscriminantAnalysis estimator with default settings.
clf = LinearDiscriminantAnalysis()
# Fit on the training split. NOTE: this rebinds `model`, which an earlier cell
# assigned to the LogisticRegression estimator.
model=clf.fit(X_train,Y_train)
In [80]:
pred_class_train = model.predict(X_train)
pred_class_test = model.predict(X_test)
In [81]:
pred_class_test
Out[81]:
array([0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0,
0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0,
0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1,
0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0,
0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0,
1, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0,
0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0,
0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0,
1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0],
dtype=int8)
In [82]:
pred_prob_train = model.predict_proba(X_train)
pred_prob_test = model.predict_proba(X_test)
In [83]:
lda_train_acc = model.score(X_train,Y_train)
lda_train_acc
Out[83]:
0.6327868852459017
https://htmtopdf.herokuapp.com/ipynbviewer/temp/c9ffe7dee6cf683104ff5b70752d8eb0/Holiday_Package.html?t=1625308272464 39/48
7/3/2021 temp-162530817097237750
In [84]:
print(classification_report(Y_train, pred_class_train))
In [85]:
confusion_matrix(Y_train, pred_class_train)
Out[85]:
array([[263, 66],
In [86]:
lda_test_acc = model.score(X_test,Y_test)
lda_test_acc
Out[86]:
0.6564885496183206
In [87]:
print(classification_report(Y_test, pred_class_test))
In [88]:
confusion_matrix(Y_test, pred_class_test)
Out[88]:
array([[118, 24],
https://htmtopdf.herokuapp.com/ipynbviewer/temp/c9ffe7dee6cf683104ff5b70752d8eb0/Holiday_Package.html?t=1625308272464 40/48
7/3/2021 temp-162530817097237750
In [89]:
# Sweep custom probability cut-offs 0.1 ... 0.9 over the LDA training predictions
# and report accuracy, F1 and the confusion matrix at each cut-off.
# NOTE(review): the thresholding lines were missing from this cell; they are
# rebuilt from the surviving comments - confirm against the original notebook.
for j in np.arange(0.1,1,0.1):
    # Label 1 when the class-1 probability is strictly greater than the cut-off, else 0.
    custom_cutoff_data = (pred_prob_train[:,1] > j).astype(int)
    print(round(j,3),'\n')
    print('Accuracy Score',round(metrics.accuracy_score(Y_train,custom_cutoff_data),4))
    print('F1 Score',round(metrics.f1_score(Y_train,custom_cutoff_data),4),'\n')
    plt.figure(figsize=(6,4))
    print('Confusion Matrix')
    sns.heatmap(metrics.confusion_matrix(Y_train,custom_cutoff_data),annot=True,fmt='.4g')
    plt.show()
https://htmtopdf.herokuapp.com/ipynbviewer/temp/c9ffe7dee6cf683104ff5b70752d8eb0/Holiday_Package.html?t=1625308272464 41/48
7/3/2021 temp-162530817097237750
0.1
F1 Score 0.6308
Confusion Matrix
0.2
F1 Score 0.6365
Confusion Matrix
https://htmtopdf.herokuapp.com/ipynbviewer/temp/c9ffe7dee6cf683104ff5b70752d8eb0/Holiday_Package.html?t=1625308272464 42/48
7/3/2021 temp-162530817097237750
0.3
F1 Score 0.6485
Confusion Matrix
0.4
F1 Score 0.6088
Confusion Matrix
0.5
F1 Score 0.5234
Confusion Matrix
https://htmtopdf.herokuapp.com/ipynbviewer/temp/c9ffe7dee6cf683104ff5b70752d8eb0/Holiday_Package.html?t=1625308272464 43/48
7/3/2021 temp-162530817097237750
0.6
F1 Score 0.446
Confusion Matrix
0.7
F1 Score 0.2455
Confusion Matrix
https://htmtopdf.herokuapp.com/ipynbviewer/temp/c9ffe7dee6cf683104ff5b70752d8eb0/Holiday_Package.html?t=1625308272464 44/48
7/3/2021 temp-162530817097237750
0.8
F1 Score 0.0071
Confusion Matrix
0.9
F1 Score 0.0
Confusion Matrix
https://htmtopdf.herokuapp.com/ipynbviewer/temp/c9ffe7dee6cf683104ff5b70752d8eb0/Holiday_Package.html?t=1625308272464 45/48
7/3/2021 temp-162530817097237750
https://htmtopdf.herokuapp.com/ipynbviewer/temp/c9ffe7dee6cf683104ff5b70752d8eb0/Holiday_Package.html?t=1625308272464 46/48
7/3/2021 temp-162530817097237750
In [90]:
# calculate AUC
lda_train_auc = metrics.roc_auc_score(Y_train,pred_prob_train[:,1])
# calculate AUC
lda_test_auc = metrics.roc_auc_score(Y_test,pred_prob_test[:,1])
plt.legend(loc='best')
plt.show()
https://htmtopdf.herokuapp.com/ipynbviewer/temp/c9ffe7dee6cf683104ff5b70752d8eb0/Holiday_Package.html?t=1625308272464 47/48
7/3/2021 temp-162530817097237750
In [91]:
# Pull precision / recall / F1 for class "1" out of the LDA training classification report.
# The report table gets its own name so the dataset variable `df` is not overwritten,
# and metrics are read by column label instead of by integer position.
lda_metrics=classification_report(Y_train, pred_class_train,output_dict=True)
lda_train_report=pd.DataFrame(lda_metrics).transpose()
lda_train_f1=round(lda_train_report.loc["1","f1-score"],2)
lda_train_recall=round(lda_train_report.loc["1","recall"],2)
lda_train_precision=round(lda_train_report.loc["1","precision"],2)
lda_train_precision 0.65
lda_train_recall 0.44
lda_train_f1 0.53
In [92]:
# Pull precision / recall / F1 for class "1" out of the LDA test classification report.
# The report table gets its own name so the dataset variable `df` is not overwritten,
# and metrics are read by column label instead of by integer position.
lda_metrics=classification_report(Y_test, pred_class_test,output_dict=True)
lda_test_report=pd.DataFrame(lda_metrics).transpose()
lda_test_f1=round(lda_test_report.loc["1","f1-score"],2)
lda_test_recall=round(lda_test_report.loc["1","recall"],2)
lda_test_precision=round(lda_test_report.loc["1","precision"],2)
lda_test_precision 0.69
lda_test_recall 0.45
lda_test_f1 0.55
In [93]:
# Side-by-side comparison of both models on train and test data.
# NOTE(review): the start of this statement (row labels and the 'LR Train'
# column) was missing; it is rebuilt by symmetry with the surviving columns.
# The row labels follow the order of the metric variables and `lr_train_auc`
# is assumed to exist - confirm both against the original notebook.
index=['Accuracy','AUC','Recall','Precision','F1 Score']
data = pd.DataFrame({
        'LR Train':[lr_train_acc,lr_train_auc,lr_train_recall,lr_train_precision,lr_train_f1],
        'LR Test':[lr_test_acc,lr_test_auc,lr_test_recall,lr_test_precision,lr_test_f1],
        'LDA Train':[lda_train_acc,lda_train_auc,lda_train_recall,lda_train_precision,lda_train_f1],
        'LDA Test':[lda_test_acc,lda_test_auc,lda_test_recall,lda_test_precision,lda_test_f1],},index=index)
round(data,2)
Out[93]:
In [ ]:
https://htmtopdf.herokuapp.com/ipynbviewer/temp/c9ffe7dee6cf683104ff5b70752d8eb0/Holiday_Package.html?t=1625308272464 48/48