Code for Handling Imbalanced Datasets: A Case Study with Customer Churn Tutorial


View on Github

handling_imbalance_datasets_pythoncode_tutorial.ipynb

# %% [markdown]
# ## Loading the dataset

# %%
# !pip install --upgrade gdown

# %%
# !gdown --id 12vfq3DYFId3bsXuNj_PhsACMzrLTfObs

# %%
import numpy as np
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from sklearn.utils import resample
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, classification_report
from sklearn.metrics import roc_auc_score
from sklearn.metrics import confusion_matrix
from sklearn.linear_model import LogisticRegression
import statsmodels.api as sm
import seaborn as sns
from sklearn.preprocessing import OrdinalEncoder

# %%
data=pd.read_csv("data_regression.csv")
# get the first 10 rows
data.head(10)

# %%
# check for the missing values and dataframes
def datainspection(dataframe):
  print("Types of the variables we are working with:")
  print(dataframe.dtypes)
  
  print("Total Samples with missing values:")

  print(data.isnull().any(axis=1).sum()) # null values

  print("Total Missing Values per Variable")
  print(data.isnull().sum())
  print("Map of missing values")
  sns.heatmap(dataframe.isnull())

# %%
datainspection(data)

# %%
data = data.dropna() # cleaning up null values

# %%
# function for encoding categorical variables
def encode_cat(data, vars):
  ord_en = OrdinalEncoder() 
  for v in vars:
    name = v+'_code' # add _code for encoded variables
    data[name] = ord_en.fit_transform(data[[v]])
    print('The encoded values for '+ v + ' are:')
    print(data[name].unique())
  return data
data.head()

# %%
# check for the encoded variables
data = encode_cat(data, ['gender', 'multi_screen', 'mail_subscribed'])
data.head()

# %%
def full_plot(data, class_col, cols_to_exclude):
  cols = data.select_dtypes(include=np.number).columns.tolist() # finding all the numerical columns from the dataframe
  X = data[cols] # creating a dataframe only with the numerical columns
  X = X[X.columns.difference(cols_to_exclude)] # columns to exclude
  X = X[X.columns.difference([class_col])]
  sns.pairplot(data, hue=class_col)

# %%
full_plot(data,class_col='churn', cols_to_exclude=['customer_id','phone_no', 'year'])

# %%
# function for creating plots for selective columns only
def selected_diagnotic(data,class_col, cols_to_eval):
  cols_to_eval.append(class_col) 
  X = data[cols_to_eval] # only selective columns
  sns.pairplot(X, hue=class_col) # plot

# %%
selected_diagnotic(data, class_col='churn', cols_to_eval=['videos_watched', 'no_of_days_subscribed'])

# %%
def logistic_regression(data, class_col, cols_to_exclude):
  cols = data.select_dtypes(include=np.number).columns.tolist() 
  X = data[cols]
  X = X[X.columns.difference([class_col])] 
  X = X[X.columns.difference(cols_to_exclude)] # unwanted columns 

  y = data[class_col] # the target variable 
  logit_model = sm.Logit(y,X) 
  result = logit_model.fit() # fit the model 
  print(result.summary2()) # check for summary 

# %%
logistic_regression(data, class_col='churn', cols_to_exclude=['customer_id', 'phone_no', 'year'])

# %%
def prepare_data(data, class_col, cols_to_exclude):
  ## Split in training and test set
  ## Selecting only the numerical columns and excluding the columns we specified in the function
  cols = data.select_dtypes(include=np.number).columns.tolist() 
  X = data[cols]
  X = X[X.columns.difference([class_col])] 
  X = X[X.columns.difference(cols_to_exclude)]
  ## Selecting y as a column
  y = data[class_col]
  return train_test_split(X, y, test_size=0.3, random_state=0) # perform train test split

# %%
def run_model(X_train, X_test, y_train, y_test):
  # Fitting the logistic regression
  logreg = LogisticRegression(random_state=13)
  logreg.fit(X_train, y_train) # fit the model
  # Predicting y values
  y_pred = logreg.predict(X_test) # make predictions on th test data
  logit_roc_auc = roc_auc_score(y_test, logreg.predict(X_test))
  print(classification_report(y_test, y_pred)) # check for classification report 
  print("The area under the curve is:", logit_roc_auc)  # check for AUC
  return y_pred

# %%
X_train, X_test, y_train, y_test = prepare_data(data, class_col='churn', cols_to_exclude=['customer_id', 'phone_no', 'year'])
y_pred = run_model(X_train, X_test, y_train, y_test)

# %%
from sklearn.metrics import confusion_matrix

def confusion_m(y_test, y_pred):
  cm = confusion_matrix(y_test, y_pred)
  print(cm)
  tn, fp, fn, tp = cm.ravel()
  print("TN:", tn)
  print("TP:", tp)
  print("FN:", fn)
  print("FP:", fp)

# %%
## Call the function
confusion_m(y_test, y_pred)

# %%
# class imbalance method 1 
def run_model_bweights(X_train, X_test, y_train, y_test):
    logreg = LogisticRegression(random_state=13, class_weight='balanced') # define class_weight parameter
    logreg.fit(X_train, y_train) # fit the model 
    y_pred = logreg.predict(X_test) # predict on test data
    logit_roc_auc = roc_auc_score(y_test, logreg.predict(X_test)) # ROC AUC score
    print(classification_report(y_test, y_pred)) 
    print("The area under the curve is:", logit_roc_auc) # AUC curve

# %%
run_model_bweights(X_train, X_test, y_train, y_test)

# %%
# class imbalance method 2
def run_model_aweights(X_train, X_test, y_train, y_test, w):
    logreg = LogisticRegression(random_state=13, class_weight=w) # define class_weight parameter
    logreg.fit(X_train, y_train) # fit the model 
    y_pred = logreg.predict(X_test) # predict on test data
    logit_roc_auc = roc_auc_score(y_test, logreg.predict(X_test))  # ROC AUC score
    print(classification_report(y_test, y_pred))
    print("The area under the curve is: %0.2f"%logit_roc_auc)  # AUC curve

# %%
run_model_aweights(X_train,X_test,y_train,y_test,{0:90, 1:10})

# %%
# class imbalance method 3
def adjust_imbalance(X_train, y_train, class_col):
  X = pd.concat([X_train, y_train], axis=1)
  # separate the 2 classes. Here we divide majority and minority classes
  class0 = X[X[class_col] == 0]
  class1 = X[X[class_col] == 1]
  # Case 1 - bootstraps from the minority class
  if len(class1)<len(class0):
    resampled = resample(class1,
                              replace=True, # Upsampling with replacement
                              n_samples=len(class0), ## Number to match majority class
                              random_state=10) 
    resampled_data = pd.concat([resampled, class0]) ## # Combination of majority and upsampled minority class
  # Case 1 - resamples from the majority class
  else:
    resampled = resample(class1,
                              replace=False, ## false instead of True like above
                              n_samples=len(class0), 
                              random_state=10) 
    resampled_data = pd.concat([resampled, class0])
  return resampled_data

# %%
## Call the function
resampled_data = adjust_imbalance(X_train, y_train, class_col='churn')

# %%
X_train, X_test, y_train, y_test = prepare_data(resampled_data, class_col='churn', cols_to_exclude=['customer_id', 'phone_no', 'year'])
run_model(X_train, X_test, y_train, y_test)

# %%
def prepare_data_smote(data,class_col,cols_to_exclude):
  # Synthetic Minority Oversampling Technique. 
  # Generates new instances from existing minority cases that you supply as input. 
  cols = data.select_dtypes(include=np.number).columns.tolist() 
  X = data[cols]
  X = X[X.columns.difference([class_col])]
  X = X[X.columns.difference(cols_to_exclude)]
  y = data[class_col]
  X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)
  sm = SMOTE(random_state=0, sampling_strategy=1.0)
  # run SMOTE on training set only
  X_train, y_train = sm.fit_resample(X_train, y_train)
  return X_train, X_test, y_train, y_test

# %%
X_train, X_test, y_train, y_test = prepare_data_smote(data,class_col='churn', cols_to_exclude=['customer_id', 'phone_no', 'year'])
run_model(X_train, X_test, y_train, y_test)




pFad - Phonifier reborn

Pfad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy