Absenteeism Module
Absenteeism Module
1 # coding: utf-8
2
3 # In[1]:
4
5
6 # import all libraries needed
7 import numpy as np
8 import pandas as pd
9 import pickle
10 from sklearn.preprocessing import StandardScaler
11 from sklearn.base import BaseEstimator, TransformerMixin
12
13 # the custom scaler class
class CustomScaler(BaseEstimator, TransformerMixin):
    """Standardize a chosen subset of DataFrame columns, leaving the rest as-is.

    A ``StandardScaler`` is fitted on ``columns`` only; ``transform`` returns
    a DataFrame with the same column order as its input, in which the chosen
    columns are replaced by their scaled values.

    Parameters
    ----------
    columns : list of column labels to scale.
    copy, with_mean, with_std : forwarded to ``StandardScaler``.
    """

    def __init__(self, columns, copy=True, with_mean=True, with_std=True):
        # StandardScaler's parameters are keyword-only in current
        # scikit-learn, so they must not be passed positionally.
        self.scaler = StandardScaler(copy=copy, with_mean=with_mean, with_std=with_std)
        self.columns = columns
        # Keep every constructor argument as an attribute of the same name so
        # that BaseEstimator.get_params() (used by repr and clone) can find it.
        self.copy = copy
        self.with_mean = with_mean
        self.with_std = with_std
        self.mean_ = None
        self.var_ = None

    def fit(self, X, y=None):
        """Fit the inner scaler on ``X[self.columns]`` and return ``self``.

        Also records the per-column mean and population variance of the
        selected columns in ``mean_`` and ``var_``.
        """
        selected = X[self.columns]
        self.scaler.fit(selected, y)
        # Reduce explicitly along the rows: this does not depend on how
        # np.mean / np.var dispatch their axis argument to pandas.
        self.mean_ = np.asarray(selected.mean(axis=0))
        self.var_ = np.asarray(selected.var(axis=0, ddof=0))
        return self

    def transform(self, X, y=None, copy=None):
        """Return ``X`` with ``self.columns`` scaled and column order unchanged.

        ``y`` and ``copy`` are accepted for interface compatibility and are
        not used.
        """
        init_col_order = X.columns
        # Reuse X's index: concat(axis=1) aligns on the index, so a fresh
        # RangeIndex would mismatch rows whenever X is not 0..n-1 indexed.
        X_scaled = pd.DataFrame(
            self.scaler.transform(X[self.columns]),
            columns=self.columns,
            index=X.index,
        )
        X_not_scaled = X.loc[:, ~X.columns.isin(self.columns)]
        return pd.concat([X_not_scaled, X_scaled], axis=1)[init_col_order]
33
34
35 # create the special class that we are going to use from here on to predict new data
class absenteeism_model():
    """Load a pickled model and scaler and use them to score new CSV data.

    Typical use: construct with the two pickle paths, call
    ``load_and_clean_data`` with a CSV file, then call one of the
    ``predicted_*`` methods.
    """

    # Names assigned, by position, to the columns that remain once 'ID',
    # 'Reason for Absence' and any 'Absenteeism Time in Hours' are removed.
    _INPUT_COLUMN_NAMES = [
        'Date', 'Transportation Expense', 'Distance to Work', 'Age',
        'Daily Work Load Average', 'Body Mass Index', 'Education', 'Children',
        'Pet',
    ]

    # Columns handed to the scaler/model, in this order.
    _MODEL_COLUMNS = [
        'Reason_1', 'Reason_2', 'Reason_3', 'Reason_4', 'Month Value',
        'Transportation Expense', 'Age', 'Body Mass Index', 'Education',
        'Children', 'Pet',
    ]

    def __init__(self, model_file, scaler_file):
        """Unpickle the model from ``model_file`` and the scaler from ``scaler_file``.

        Parameters
        ----------
        model_file : path of the pickled model.
        scaler_file : path of the pickled scaler.
        """
        # SECURITY: pickle.load executes arbitrary code embedded in the file;
        # only pass files that come from a trusted source.
        with open(model_file, 'rb') as model_handle, open(scaler_file, 'rb') as scaler_handle:
            self.reg = pickle.load(model_handle)
            self.scaler = pickle.load(scaler_handle)
        # Filled in by load_and_clean_data().
        self.data = None
        self.preprocessed_data = None
        self.df_with_predictions = None

    def load_and_clean_data(self, data_file):
        """Read a CSV file and turn it into the feature table the model expects.

        The raw file is kept in ``df_with_predictions``, the unscaled features
        in ``preprocessed_data`` and the scaler's output in ``data``.

        Parameters
        ----------
        data_file : path or file-like object accepted by ``pandas.read_csv``.
            It must contain 'ID' and 'Reason for Absence' columns; the
            remaining columns are renamed by position (see
            ``_INPUT_COLUMN_NAMES``).
        """
        # import the data and keep an untouched copy for later use
        df = pd.read_csv(data_file, delimiter=',')
        self.df_with_predictions = df.copy()

        # drop the identifier and, if the file carries one, the target column
        df = df.drop(['ID'], axis=1)
        df = df.drop(columns=['Absenteeism Time in Hours'], errors='ignore')

        # take the reason codes out of df; they are replaced by four group flags
        reason = df['Reason for Absence']
        df = df.drop(['Reason for Absence'], axis=1)

        # positional rename of the remaining columns
        df.columns = self._INPUT_COLUMN_NAMES

        # Group the reason codes by range. Working on the codes directly
        # (rather than on dummy columns with the first one dropped) keeps the
        # flags correct even when a file lacks code 0 or a whole group.
        df['Reason_1'] = reason.between(1, 14).astype(int)
        df['Reason_2'] = reason.between(15, 17).astype(int)
        df['Reason_3'] = reason.between(18, 21).astype(int)
        df['Reason_4'] = (reason >= 22).astype(int)

        # extract the month from the 'Date' column (day/month/year text)
        dates = pd.to_datetime(df['Date'], format='%d/%m/%Y')
        df['Month Value'] = dates.dt.month

        # map 'Education' variables; the result is a dummy
        df['Education'] = df['Education'].map({1: 0, 2: 1, 3: 1, 4: 1})

        # keep only the model's features, in order, and replace the NaN values
        df = df[self._MODEL_COLUMNS].fillna(value=0)

        # available to callers who want the 'preprocessed data'
        self.preprocessed_data = df.copy()

        # scaled inputs used by the prediction methods
        self.data = self.scaler.transform(df)

    def predicted_probability(self):
        """Return the model's probability of class 1 for each loaded row.

        Returns ``None`` when no data has been loaded.
        """
        if self.data is None:
            return None
        return self.reg.predict_proba(self.data)[:, 1]

    def predicted_output_category(self):
        """Return the model's predicted class for each loaded row.

        Returns ``None`` when no data has been loaded.
        """
        if self.data is None:
            return None
        return self.reg.predict(self.data)

    def predicted_outputs(self):
        """Add 'Probability' and 'Prediction' columns to ``preprocessed_data``.

        The columns are written onto ``self.preprocessed_data`` itself, which
        is then returned. Returns ``None`` when no data has been loaded.
        """
        if self.data is None:
            return None
        self.preprocessed_data['Probability'] = self.reg.predict_proba(self.data)[:, 1]
        self.preprocessed_data['Prediction'] = self.reg.predict(self.data)
        return self.preprocessed_data
https://www.dropbox.com/sh/t536dzy3h9dimjp/AAD-H2Av7myydXkFjMacBUMMa?dl=0&preview=absenteeism_module.py 2/2