baseline.ipynb - Colab
baseline.ipynb - Colab
ipynb - Colab
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import fbeta_score, precision_recall_curve, roc_auc_score, classification_report, confusion_matri
import os
import os

# Locations of the train/test CSV files used by the rest of the notebook.
train_path = "/content/train.csv"
test_path = "/content/test.csv"

# Print, for each path in turn, whether it currently exists on disk.
for data_path in (train_path, test_path):
    print(os.path.exists(data_path))
True
True
# Stop early with an explicit message if either path is not a regular file
# (os.path.isfile is also False for directories).
if not os.path.isfile(train_path):
    raise FileNotFoundError(f"Train file not found at {train_path}")
if not os.path.isfile(test_path):
    raise FileNotFoundError(f"Test file not found at {test_path}")

# Load both CSV files into DataFrames.
train_df = pd.read_csv(train_path)
test_df = pd.read_csv(test_path)
smoking_status stroke
count 2555 2554
unique 4 4
top never smoked 0
freq 945 2429
mean NaN NaN
std NaN NaN
min NaN NaN
25% NaN NaN
50% NaN NaN
75% NaN NaN
max NaN NaN
# Frequency distributions: number of training rows per distinct 'gender' value.
print("Frequency of Males vs Females:")
gender_value_counts = train_df['gender'].value_counts()
print(gender_value_counts)
https://colab.research.google.com/drive/1c7A42T1cSYjgGBxwg3EqrD-DLX2Q_M80#scrollTo=V_4Zg9S-Yrjt&printMode=true 2/5
2/23/25, 2:09 PM baseline.ipynb - Colab
1 221 33 1 0
https://colab.research.google.com/drive/1c7A42T1cSYjgGBxwg3EqrD-DLX2Q_M80#scrollTo=V_4Zg9S-Yrjt&printMode=true 3/5
2/23/25, 2:09 PM baseline.ipynb - Colab
https://colab.research.google.com/drive/1c7A42T1cSYjgGBxwg3EqrD-DLX2Q_M80#scrollTo=V_4Zg9S-Yrjt&printMode=true 4/5
2/23/25, 2:09 PM baseline.ipynb - Colab
# Baseline Model Without Preprocessing
def _label_encode_if_object(column):
    """Return integer category codes for an object-dtype column.

    Columns of any other dtype are returned unchanged.
    """
    if column.dtypes == 'O':
        return column.astype('category').cat.codes
    return column


# Label spellings that are rewritten to integers; any value not listed here
# is left as it is by ``replace``.
stroke_label_codes = {"Yes": 1, "yes": 1, "0": 0, "1": 1}

# Features are every column except the target; the target is the recoded
# "stroke" column.
X_baseline = train_df.drop(columns=["stroke"], errors='ignore')
y_baseline = train_df["stroke"].replace(stroke_label_codes)

# Handle categorical variables by simple label encoding (no one-hot encoding for baseline)
X_baseline = X_baseline.apply(_label_encode_if_object)
# Train-Test Split
# Holds out 20% of the rows (test_size=0.2) as the validation split.
# NOTE(review): the statement below is cut off mid-argument ("random_stat") in
# this export; the remainder of the call is not visible here. Restore it from
# the original notebook before running.
X_train_base, X_val_base, y_train_base, y_val_base = train_test_split(X_baseline, y_baseline, test_size=0.2, random_stat
# Predictions on the validation split.
# Column 1 of predict_proba is kept as the score; presumably this is the
# positive (stroke = 1) class -- confirm against model_base.classes_.
val_class_probs = model_base.predict_proba(X_val_base)
y_probs_base = val_class_probs[:, 1]
y_pred_base = model_base.predict(X_val_base)

# Evaluation Metrics
# AUC is computed from the scores; the remaining metrics use the hard
# predictions returned by predict().
auc_score = roc_auc_score(y_true=y_val_base, y_score=y_probs_base)
f_beta_base = fbeta_score(y_true=y_val_base, y_pred=y_pred_base, beta=10)
class_report = classification_report(y_true=y_val_base, y_pred=y_pred_base)
conf_matrix = confusion_matrix(y_true=y_val_base, y_pred=y_pred_base)
# Stroke distribution by gender (observed labels, not model predictions).
# Cross-tabulate against y_baseline rather than the raw "stroke" column so
# that the label spellings recoded when y_baseline was built (e.g. "Yes",
# "yes" and "1") are counted together instead of in separate columns.
gender_counts = pd.crosstab(train_df["gender"], y_baseline)
def print_baseline_results():
    """Print the baseline evaluation summary to standard output.

    Reads the module-level results ``auc_score``, ``f_beta_base``,
    ``class_report``, ``conf_matrix`` and ``gender_counts``; takes no
    arguments and returns ``None``.
    """
    print(f"Baseline AUC: {auc_score:.4f}")
    print(f"Baseline F-beta (β=10): {f_beta_base:.4f}")
    print("Classification Report:\n", class_report)
    print("Confusion Matrix:\n", conf_matrix)
    print("Stroke distribution by gender:\n", gender_counts)


print_baseline_results()
https://colab.research.google.com/drive/1c7A42T1cSYjgGBxwg3EqrD-DLX2Q_M80#scrollTo=V_4Zg9S-Yrjt&printMode=true 5/5