0% found this document useful (0 votes)
10 views2 pages

Absenteeism Module

The document contains a Python script for a custom absenteeism prediction model using machine learning. It includes a CustomScaler class for data preprocessing and an absenteeism_model class that handles data loading, cleaning, and prediction. The script processes absenteeism data by transforming features and predicting probabilities and outputs based on the trained model.

Uploaded by

kmzdr1
Copyright
© All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as PDF, TXT or read online on Scribd
0% found this document useful (0 votes)
10 views2 pages

Absenteeism Module

The document contains a Python script for a custom absenteeism prediction model using machine learning. It includes a CustomScaler class for data preprocessing and an absenteeism_model class that handles data loading, cleaning, and prediction. The script processes absenteeism data by transforming features and predicting probabilities and outputs based on the trained model.

Uploaded by

kmzdr1
Copyright
© All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as PDF, TXT or read online on Scribd
You are on page 1/ 2

2/16/2019 Dropbox - The 5 files rar - Simplify your life

1 # coding: utf-8
2
3 # In[1]:
4
5
6 # import all libraries needed
7 import numpy as np
8 import pandas as pd
9 import pickle
10 from sklearn.preprocessing import StandardScaler
11 from sklearn.base import BaseEstimator, TransformerMixin
12
13 # the custom scaler class
# the custom scaler class: standard-scales only a chosen subset of columns,
# passing the remaining columns through untouched
class CustomScaler(BaseEstimator, TransformerMixin):
    """Scale only ``columns`` of a DataFrame with StandardScaler.

    Parameters
    ----------
    columns : list of column labels to scale
    copy, with_mean, with_std : forwarded to ``sklearn.preprocessing.StandardScaler``
    """

    def __init__(self, columns, copy=True, with_mean=True, with_std=True):
        # BUG FIX: StandardScaler's parameters are keyword-only in
        # scikit-learn >= 1.2; passing them positionally raises a TypeError.
        self.scaler = StandardScaler(copy=copy, with_mean=with_mean, with_std=with_std)
        self.columns = columns
        self.mean_ = None
        self.var_ = None

    def fit(self, X, y=None):
        """Fit the internal StandardScaler on the selected columns of X."""
        self.scaler.fit(X[self.columns], y)
        # expose per-column mean/variance like sklearn's fitted estimators do
        self.mean_ = np.array(np.mean(X[self.columns]))
        self.var_ = np.array(np.var(X[self.columns]))
        return self

    def transform(self, X, y=None, copy=None):
        """Return X with ``self.columns`` scaled, original column order preserved."""
        init_col_order = X.columns
        # BUG FIX: keep X's index on the scaled frame; a fresh RangeIndex
        # would misalign rows (and produce NaNs) in the concat below
        # whenever X carries a non-default index.
        X_scaled = pd.DataFrame(self.scaler.transform(X[self.columns]),
                                columns=self.columns, index=X.index)
        X_not_scaled = X.loc[:, ~X.columns.isin(self.columns)]
        return pd.concat([X_not_scaled, X_scaled], axis=1)[init_col_order]
33
34
35 # create the special class that we are going to use from here on to predict new data
# the special class used from here on to predict new data
class absenteeism_model():
    """Load a pickled model + scaler, clean raw absenteeism CSVs,
    and produce excess-absenteeism probabilities and 0/1 predictions.
    """

    def __init__(self, model_file, scaler_file):
        # BUG FIX: the original hard-coded the paths 'model' and 'scaler'
        # (and shadowed both parameters with the context-manager names),
        # silently ignoring whatever the caller passed in.
        with open(model_file, 'rb') as f_model, open(scaler_file, 'rb') as f_scaler:
            # NOTE(security): pickle.load executes arbitrary code during
            # deserialization -- only load files from a trusted source.
            self.reg = pickle.load(f_model)
            self.scaler = pickle.load(f_scaler)
        self.data = None

    def load_and_clean_data(self, data_file):
        """Read ``data_file`` (*.csv) and preprocess it exactly as during
        training; the scaled feature matrix is stored in ``self.data``."""

        # import the data
        df = pd.read_csv(data_file, delimiter=',')
        # keep the raw rows so predictions can be appended to them later
        self.df_with_predictions = df.copy()
        # the ID column carries no predictive information
        df = df.drop(['ID'], axis=1)
        # placeholder column so the frame matches the training-time layout
        df['Absenteeism Time in Hours'] = 'NaN'

        # one dummy column per reason code (first dropped to avoid
        # perfect multicollinearity with the intercept)
        reason_columns = pd.get_dummies(df['Reason for Absence'], drop_first=True)

        # collapse the individual reason codes into 4 broad groups
        # (1-14 diseases, 15-17 pregnancy, 18-21 poisoning/ext. causes, 22+ light reasons)
        # -- grouping taken from the code itself; TODO confirm semantics
        reason_type_1 = reason_columns.loc[:, 1:14].max(axis=1)
        reason_type_2 = reason_columns.loc[:, 15:17].max(axis=1)
        reason_type_3 = reason_columns.loc[:, 18:21].max(axis=1)
        reason_type_4 = reason_columns.loc[:, 22:].max(axis=1)

        # drop the original column; its information now lives in the dummies
        df = df.drop(['Reason for Absence'], axis=1)

        # concatenate df and the 4 reason-group columns
        df = pd.concat([df, reason_type_1, reason_type_2,
                        reason_type_3, reason_type_4], axis=1)

        # name the 4 appended reason columns
        column_names = ['Date', 'Transportation Expense', 'Distance to Work', 'Age',
                        'Daily Work Load Average', 'Body Mass Index', 'Education',
                        'Children', 'Pet', 'Absenteeism Time in Hours',
                        'Reason_1', 'Reason_2', 'Reason_3', 'Reason_4']
        df.columns = column_names

        # put the reason groups first, matching the training-time layout
        column_names_reordered = ['Reason_1', 'Reason_2', 'Reason_3', 'Reason_4',
                                  'Date', 'Transportation Expense', 'Distance to Work',
                                  'Age', 'Daily Work Load Average', 'Body Mass Index',
                                  'Education', 'Children', 'Pet',
                                  'Absenteeism Time in Hours']
        df = df[column_names_reordered]

        # parse the 'Date' column (day/month/year)
        df['Date'] = pd.to_datetime(df['Date'], format='%d/%m/%Y')

        # derive calendar features; the vectorized .dt accessor replaces the
        # original positional loop, which assumed a default RangeIndex
        df['Month Value'] = df['Date'].dt.month
        df['Day of the Week'] = df['Date'].dt.weekday

        # the raw date is no longer needed
        df = df.drop(['Date'], axis=1)

        # re-order the columns once more with the calendar features in place
        column_names_upd = ['Reason_1', 'Reason_2', 'Reason_3', 'Reason_4',
                            'Month Value', 'Day of the Week',
                            'Transportation Expense', 'Distance to Work', 'Age',
                            'Daily Work Load Average', 'Body Mass Index',
                            'Education', 'Children', 'Pet',
                            'Absenteeism Time in Hours']
        df = df[column_names_upd]

        # map 'Education' to a dummy: 1 (high school) -> 0, everything above -> 1
        df['Education'] = df['Education'].map({1: 0, 2: 1, 3: 1, 4: 1})

        # replace any remaining NaN values
        df = df.fillna(value=0)

        # drop the target placeholder
        df = df.drop(['Absenteeism Time in Hours'], axis=1)

        # drop the variables excluded during model selection
        df = df.drop(['Day of the Week', 'Daily Work Load Average',
                      'Distance to Work'], axis=1)

        # keep an unscaled copy for callers that want the preprocessed data
        self.preprocessed_data = df.copy()

        # scaled matrix consumed by the prediction methods below
        self.data = self.scaler.transform(df)

    def predicted_probability(self):
        """Return P(excessive absenteeism) per row, or None before loading data."""
        if self.data is not None:
            pred = self.reg.predict_proba(self.data)[:, 1]
            return pred

    def predicted_output_category(self):
        """Return the 0/1 class per row, or None before loading data."""
        if self.data is not None:
            pred_outputs = self.reg.predict(self.data)
            return pred_outputs

    def predicted_outputs(self):
        """Return the preprocessed frame with 'Probability' and 'Prediction'
        columns appended, or None before loading data."""
        if self.data is not None:
            self.preprocessed_data['Probability'] = self.reg.predict_proba(self.data)[:, 1]
            self.preprocessed_data['Prediction'] = self.reg.predict(self.data)
            return self.preprocessed_data

https://www.dropbox.com/sh/t536dzy3h9dimjp/AAD-H2Av7myydXkFjMacBUMMa?dl=0&preview=absenteeism_module.py 2/2

You might also like

pFad - Phonifier reborn

Pfad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy