Survival Analysis: Final Selected CatBoost Model

Author

R. Jerome Dixon

Published

February 15, 2025

Libraries

Demographics Table

Trait | Transplanted | WL Mortality | P-value
Gender (% Female)44.1% (N=4983)43.1% (N=540)0.709
Age4.0 [0.0, 10.5] (N=4983)0.0 [0.0, 1.6] (N=540)0.000
Weight15.0 [0.0, 33.5] (N=4982)7.6 [2.4, 12.7] (N=540)0.000
Height99.0 [55.0, 143.0] (N=4980)67.0 [45.6, 88.4] (N=540)0.000
BMI16.2 [13.7, 18.6] (N=4980)15.4 [13.6, 17.2] (N=540)0.000
Blood Type A36.0% (N=4983)33.9% (N=540)0.345
Blood Type B13.6% (N=4983)11.5% (N=540)0.190
Blood Type AB4.1% (N=4983)2.2% (N=540)0.042
Blood Type O46.2% (N=4983)52.4% (N=540)0.007
Race White53.0% (N=4983)50.4% (N=540)0.268
Race Black20.0% (N=4983)23.0% (N=540)0.123
Race Other3.1% (N=4983)3.0% (N=540)1.000
VAD %13.3% (N=4983)8.5% (N=540)0.002
eGFR96.1 [72.3, 120.0] (N=4966)82.2 [54.2, 110.2] (N=540)0.000
Albumin3.6 [3.1, 4.1] (N=4836)3.3 [2.8, 3.8] (N=518)0.000
Dialysis %1.2% (N=4978)4.8% (N=540)0.000
Ventilator %15.8% (N=4983)35.0% (N=540)0.000
ECMO %4.6% (N=4983)14.6% (N=540)0.000

Native CatBoost Model

Show the code
#| echo: false
#| warning: false
#| message: false
#| eval: true

# Train
train_data <- survival_model %>% 
  select(-outcome)

# Must be factor or numeric "Class"
train_target <- survival_model %>% 
  select(outcome) %>% 
  as.data.frame()

train_Y <- survival_model$outcome


# Test
test_data <- survival_model_test %>% 
  select(-outcome)

# Must be factor or numeric "Class"
test_target <- survival_model_test %>% 
  select(outcome) %>% 
  as.data.frame()

test_Y <- survival_model_test$outcome

Optuna Hyperparameter Optimization

  • Best is trial #19/50 with value: 0.74

Model

Show the code

# Optuna-selected hyperparameters for the native CatBoost model
# (best trial #19/50, objective value 0.74)
model_params_native = dict(
    learning_rate=0.08,
    depth=7,
    colsample_bylevel=0.87,
    min_data_in_leaf=82,
    l2_leaf_reg=7.61,
)
    
Show the code
#| eval: true
#| echo: false
#| message: false
#| warning: false

import numpy as np
from sklearn.model_selection import train_test_split, StratifiedKFold
import pandas as pd
import optuna
from catboost import CatBoostClassifier, Pool


# Native CatBoost classifier using the Optuna-tuned hyperparameters
# (model_params_native). Ordered boosting with MVS bootstrap; AUC on the
# eval set drives the overfitting detector (patience = 100 rounds).
# NOTE(review): use_best_model=False keeps the model at the iteration where
# training stopped, NOT at bestIteration reported in the log — confirm this
# is intentional before comparing against the "bestTest" AUC.
model_auc = CatBoostClassifier(objective='Logloss',
                               iterations=1000,
                               eval_metric="AUC",
                               **model_params_native, 
                               boosting_type='Ordered',
                               bootstrap_type='MVS',
                               metric_period=25,
                               early_stopping_rounds=100,
                               use_best_model=False, 
                               random_seed=1997)

# Create a Pool object for the training and testing data
# (cat_index lists the positions of categorical columns; defined earlier)
train_pool = Pool(X_train, cat_features=cat_index, label=Y_train)
test_pool = Pool(X_test, cat_features=cat_index, label=Y_test)
 

# Fit with the held-out pool as the early-stopping eval set
model_auc.fit(train_pool, eval_set=test_pool)
Warning: Overfitting detector is active, thus evaluation metric is calculated on every iteration. 'metric_period' is ignored for evaluation metric.
0:  test: 0.6289521 best: 0.6289521 (0) total: 237ms    remaining: 3m 57s
25: test: 0.7240600 best: 0.7271551 (17)    total: 960ms    remaining: 35.9s
50: test: 0.7328268 best: 0.7328268 (50)    total: 1.99s    remaining: 37s
75: test: 0.7346937 best: 0.7354579 (56)    total: 2.9s remaining: 35.2s
100:    test: 0.7344426 best: 0.7361784 (77)    total: 3.85s    remaining: 34.2s
125:    test: 0.7341369 best: 0.7363640 (110)   total: 5.07s    remaining: 35.2s
150:    test: 0.7322045 best: 0.7363640 (110)   total: 6.43s    remaining: 36.1s
175:    test: 0.7320189 best: 0.7368772 (158)   total: 7.77s    remaining: 36.4s
200:    test: 0.7304249 best: 0.7368772 (158)   total: 9.12s    remaining: 36.3s
225:    test: 0.7312983 best: 0.7368772 (158)   total: 10.5s    remaining: 36s
250:    test: 0.7257959 best: 0.7368772 (158)   total: 11.9s    remaining: 35.4s
Stopped by overfitting detector  (100 iterations wait)

bestTest = 0.7368771562
bestIteration = 158

<catboost.core.CatBoostClassifier object at 0x00000212F2EA4710>

Calibration Plot

Calibrated Model Metrics

Show the code
# Pull the native-model metrics computed in the Python session into R
native_metrics <- py$catboost_metrics_native

# Inspect the structure of the metrics data frame
str(native_metrics)
'data.frame':   1 obs. of  13 variables:
 $ Model              : chr "Native_Catboost"
 $ AUC                : num 0.726
 $ Brier Score        : num 0.0864
 $ Accuracy           : num 0.668
 $ Log Loss           : num 0.302
 $ F1 Score           : num 0.308
 $ Precision          : num 0.196
 $ Recall             : num 0.725
 $ AUPR               : num 0.237
 $ True Negative (TN) : num 594
 $ False Positive (FP): num 304
 $ False Negative (FN): num 28
 $ True Positive (TP) : num 74
 - attr(*, "pandas.index")=RangeIndex(start=0, stop=1, step=1)

SHAP Feature Importance

Show the code
# Retrieve the ranked comparison dataframe from Python
shap_df <- py$shap_df

# Convert to a tibble, add absolute SHAP importance, and sort descending
shap_tbl <- shap_df %>%
  as_tibble() %>%
  mutate(SHAP_Importance = abs(Importance_SHAP)) %>%
  arrange(desc(SHAP_Importance))

# Attach an overall rank and keep the display columns
shap_table <- shap_tbl %>%
  rowid_to_column(var = "Overall Rank") %>%
  select(
    `Overall Rank`, `Feature Id`,
    Importance_SHAP, SHAP_Direction,
    SHAP_Importance
  )

# Render as an interactive, centre-aligned datatable
shap_table %>%
  DT::datatable(
    rownames = FALSE,
    options = list(
      columnDefs = list(list(className = 'dt-center', targets = "_all"))
    )
  )

CatBoost - One Hot Encoding (Hybrid)

  • Train Dataset
 [1] "Gender"     "Race"       "Blood_Type" "VAD_TCR"    "WL_Oth_Org"
 [6] "Cereb_Vasc" "Diabetes"   "Diag_Code"  "XMatch_Req" "List_Ctr"  
[1] "Race"       "Blood_Type" "Diag_Code" 
'data.frame':   4523 obs. of  44 variables:
 $ outcome                                           : int  0 0 0 1 0 1 0 1 1 1 ...
 $ Age                                               : num  1 0 0 0 2 0 1 11 0 0 ...
  ..- attr(*, "label")= chr "WL AGE AT LISTING IN YEARS"
 $ Gender                                            : Factor w/ 2 levels "F","M": 1 2 2 2 1 2 1 1 1 2 ...
 $ Weight                                            : num  6.71 3.02 6.6 6 19.4 ...
 $ Height                                            : num  72 49.5 66 57 97.5 ...
 $ BMI                                               : num  12.9 12.3 15.2 18.5 20.4 ...
 $ BSA                                               : num  0.366 0.204 0.348 0.308 0.725 ...
 $ PGE_TCR                                           : num  0 0 0 0 0 0 0 0 0 0 ...
  ..- attr(*, "label")= chr "TCR PGE AT LISTING"
 $ ECMO_Reg                                          : num  0 0 0 1 0 0 0 0 0 0 ...
  ..- attr(*, "label")= chr "TCR ECMO AT LISTING"
 $ VAD                                               : num  -1 -1 -1 -1 -1 -1 -1 1 -1 1 ...
 $ VAD_TCR                                           : Factor w/ 5 levels "LVAD","LVAD+RVAD",..: 3 3 3 3 3 3 3 1 3 2 ...
 $ Vent_Reg                                          : num  0 0 0 0 0 0 0 0 0 1 ...
  ..- attr(*, "label")= chr "TCR LIFE SUPPORT VENTILATOR"
 $ WL_Oth_Org                                        : Factor w/ 2 levels "No","Yes": 2 1 1 2 1 1 1 1 1 1 ...
 $ Cereb_Vasc                                        : Factor w/ 3 levels "No","Unknown",..: 1 1 1 1 1 1 1 1 1 1 ...
 $ Diabetes                                          : Factor w/ 5 levels "None","Type I",..: 1 1 1 1 1 1 1 1 1 1 ...
 $ Dialysis                                          : num  -1 -1 -1 1 -1 -1 -1 -1 -1 -1 ...
 $ Inotrop                                           : num  0 0 0 0 1 0 1 0 1 0 ...
  ..- attr(*, "label")= chr "TCR IV INOTROPES AT LISTING"
 $ Creatinine                                        : num  0.13 0.46 0.28 0.67 0.42 0.34 0.34 0.5 0.22 0.21 ...
  ..- attr(*, "label")= chr "TCR MOST RECENT CREAT."
 $ eGFR                                              : num  228.2 44.3 97.1 35.1 95.6 ...
  ..- attr(*, "label")= chr "TCR MOST RECENT CREAT."
 $ Albumin                                           : num  4.4 3.7 3 3.8 4.6 3.9 4.6 3.6 3.2 3.2 ...
  ..- attr(*, "label")= chr "TCR TOTAL SERUM ALBUMIN AT LISTING (Pre 1/1/2007 for adult)"
 $ Prior_HRTX                                        : num  15 2 13 18 13 15 30 11 27 16 ...
 $ Med_Refusals                                      : num  3 8 4 15 5 3 5 5 4 4 ...
 $ Prop_Refusals                                     : num  0.864 0.951 0.852 0.972 0.882 ...
 $ XMatch_Req                                        : Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 1 1 1 1 ...
 $ List_Yr                                           : num  2020 2020 2020 2020 2020 2020 2020 2020 2020 2020 ...
 $ Policy_Chg                                        : num  1 1 1 1 1 1 1 1 1 1 ...
 $ List_Ctr                                          : Factor w/ 89 levels "ahbent","alfoba",..: 61 34 37 69 16 61 21 71 66 36 ...
 $ Race.Asian                                        : num  0 0 0 0 0 0 0 0 0 0 ...
 $ Race.Black                                        : num  0 0 1 0 0 0 0 0 0 0 ...
 $ Race.Hispanic                                     : num  0 0 0 0 0 1 0 0 0 0 ...
 $ Race.Other                                        : num  0 0 0 0 0 0 0 1 0 0 ...
 $ Race.White                                        : num  1 1 0 1 1 0 1 0 1 1 ...
 $ Blood_Type.A                                      : num  0 0 0 1 0 0 0 0 1 0 ...
 $ Blood_Type.AB                                     : num  0 0 0 0 0 0 0 0 0 0 ...
 $ Blood_Type.B                                      : num  0 0 0 0 0 0 0 0 0 0 ...
 $ Blood_Type.O                                      : num  1 1 1 0 1 1 1 1 0 1 ...
 $ Diag_Code.Congenital Heart Disease With Surgery   : num  1 1 1 1 1 0 0 0 0 0 ...
 $ Diag_Code.Congenital Heart Disease Without Surgery: num  0 0 0 0 0 0 0 0 0 0 ...
 $ Diag_Code.Dilated Cardiomyopathy                  : num  0 0 0 0 0 1 1 1 0 1 ...
 $ Diag_Code.Hypertrophic Cardiomyopathy             : num  0 0 0 0 0 0 0 0 1 0 ...
 $ Diag_Code.Myocarditis                             : num  0 0 0 0 0 0 0 0 0 0 ...
 $ Diag_Code.Other                                   : num  0 0 0 0 0 0 0 0 0 0 ...
 $ Diag_Code.Restrictive Cardiomyopathy              : num  0 0 0 0 0 0 0 0 0 0 ...
 $ Diag_Code.Valvular Heart Disease                  : num  0 0 0 0 0 0 0 0 0 0 ...
'data.frame':   4523 obs. of  44 variables:
 $ outcome      : int  0 0 0 1 0 1 0 1 1 1 ...
 $ Age          : num  1 0 0 0 2 0 1 11 0 0 ...
  ..- attr(*, "label")= chr "WL AGE AT LISTING IN YEARS"
 $ Gender       : Factor w/ 2 levels "F","M": 1 2 2 2 1 2 1 1 1 2 ...
 $ Weight       : num  6.71 3.02 6.6 6 19.4 ...
 $ Height       : num  72 49.5 66 57 97.5 ...
 $ BMI          : num  12.9 12.3 15.2 18.5 20.4 ...
 $ BSA          : num  0.366 0.204 0.348 0.308 0.725 ...
 $ PGE_TCR      : num  0 0 0 0 0 0 0 0 0 0 ...
  ..- attr(*, "label")= chr "TCR PGE AT LISTING"
 $ ECMO_Reg     : num  0 0 0 1 0 0 0 0 0 0 ...
  ..- attr(*, "label")= chr "TCR ECMO AT LISTING"
 $ VAD          : num  -1 -1 -1 -1 -1 -1 -1 1 -1 1 ...
 $ VAD_TCR      : Factor w/ 5 levels "LVAD","LVAD+RVAD",..: 3 3 3 3 3 3 3 1 3 2 ...
 $ Ventilator   : num  0 0 0 0 0 0 0 0 0 1 ...
  ..- attr(*, "label")= chr "TCR LIFE SUPPORT VENTILATOR"
 $ WL_Oth_Org   : Factor w/ 2 levels "No","Yes": 2 1 1 2 1 1 1 1 1 1 ...
 $ Cereb_Vasc   : Factor w/ 3 levels "No","Unknown",..: 1 1 1 1 1 1 1 1 1 1 ...
 $ Diabetes     : Factor w/ 5 levels "None","Type I",..: 1 1 1 1 1 1 1 1 1 1 ...
 $ Dialysis     : num  -1 -1 -1 1 -1 -1 -1 -1 -1 -1 ...
 $ Inotrop      : num  0 0 0 0 1 0 1 0 1 0 ...
  ..- attr(*, "label")= chr "TCR IV INOTROPES AT LISTING"
 $ Creatinine   : num  0.13 0.46 0.28 0.67 0.42 0.34 0.34 0.5 0.22 0.21 ...
  ..- attr(*, "label")= chr "TCR MOST RECENT CREAT."
 $ eGFR         : num  228.2 44.3 97.1 35.1 95.6 ...
  ..- attr(*, "label")= chr "TCR MOST RECENT CREAT."
 $ Albumin      : num  4.4 3.7 3 3.8 4.6 3.9 4.6 3.6 3.2 3.2 ...
  ..- attr(*, "label")= chr "TCR TOTAL SERUM ALBUMIN AT LISTING (Pre 1/1/2007 for adult)"
 $ Txp_Volume   : num  15 2 13 18 13 15 30 11 27 16 ...
 $ Med_Refusals : num  3 8 4 15 5 3 5 5 4 4 ...
 $ Prop_Refusals: num  0.864 0.951 0.852 0.972 0.882 ...
 $ XMatch_Req   : Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 1 1 1 1 ...
 $ List_Yr      : num  2020 2020 2020 2020 2020 2020 2020 2020 2020 2020 ...
 $ Policy_Chg   : num  1 1 1 1 1 1 1 1 1 1 ...
 $ Listing_Ctr  : Factor w/ 89 levels "ahbent","alfoba",..: 61 34 37 69 16 61 21 71 66 36 ...
 $ Race_Asian   : num  0 0 0 0 0 0 0 0 0 0 ...
 $ Race_Black   : num  0 0 1 0 0 0 0 0 0 0 ...
 $ Race_Hispanic: num  0 0 0 0 0 1 0 0 0 0 ...
 $ Race_Other   : num  0 0 0 0 0 0 0 1 0 0 ...
 $ Race_White   : num  1 1 0 1 1 0 1 0 1 1 ...
 $ Blood_Type_A : num  0 0 0 1 0 0 0 0 1 0 ...
 $ Blood_Type_AB: num  0 0 0 0 0 0 0 0 0 0 ...
 $ Blood_Type_B : num  0 0 0 0 0 0 0 0 0 0 ...
 $ Blood_Type_O : num  1 1 1 0 1 1 1 1 0 1 ...
 $ CHD_Surgery  : num  1 1 1 1 1 0 0 0 0 0 ...
 $ CHD_NoSurgery: num  0 0 0 0 0 0 0 0 0 0 ...
 $ DCM          : num  0 0 0 0 0 1 1 1 0 1 ...
 $ HCM          : num  0 0 0 0 0 0 0 0 1 0 ...
 $ Myocard      : num  0 0 0 0 0 0 0 0 0 0 ...
 $ Other_Diag   : num  0 0 0 0 0 0 0 0 0 0 ...
 $ RCM          : num  0 0 0 0 0 0 0 0 0 0 ...
 $ VHD          : num  0 0 0 0 0 0 0 0 0 0 ...
  • Test Dataset

Hybrid CatBoost Model

Show the code

#| echo: false
#| warning: false
#| message: false

import numpy as np

# initialize Train and Test datasets
# (r.* objects are pulled from the R session via reticulate)
hybrid_X_train = r.hybrid_train_data
hybrid_y_train = r.hybrid_train_Y
# numpy copy of the training labels for CatBoost
hybrid_Y_train = np.array(hybrid_y_train)  

hybrid_X_test = r.hybrid_test_data
hybrid_y_test = r.hybrid_test_Y
# numpy copy of the test labels for CatBoost
hybrid_Y_test = np.array(hybrid_y_test) 

# Column indexes of the remaining categorical features
# (get_categorical_indexes is a helper defined in an earlier chunk)
hybrid_cat_index = get_categorical_indexes(hybrid_X_train)
Feature names are consistent between training and test datasets.

Optuna Hyperparameter Optimization

Final Selected Model

  • From Optuna Trial #12/50 with value: 0.742
Show the code

# Optuna-selected hyperparameters for the hybrid (one-hot) CatBoost model
# (trial #12/50, objective value 0.742)
model_params_hybrid = dict(
    learning_rate=0.04,
    depth=3,
    colsample_bylevel=0.05,
    min_data_in_leaf=98,
    l2_leaf_reg=11.95,
)
    
Warning: Overfitting detector is active, thus evaluation metric is calculated on every iteration. 'metric_period' is ignored for evaluation metric.
0:  test: 0.5813245 best: 0.5813245 (0) total: 4.15ms   remaining: 4.14s
Stopped by overfitting detector  (50 iterations wait)

bestTest = 0.7428381152
bestIteration = 223

<catboost.core.CatBoostClassifier object at 0x00000212F2E83500>

Calibration Plot

Show the code

import pandas as pd

# Hard class predictions for the hybrid model on the test set
Y_Pred_hybrid = hybrid_model.predict(hybrid_X_test)

# Predict class probabilities ONCE and slice both columns from the result.
# (The original called predict_proba three times on the same data.)
proba_hybrid = hybrid_model.predict_proba(hybrid_X_test)
Y_Pred_Proba_hybrid = proba_hybrid[:, 1]           # probabilities of the positive class
Y_Pred_Proba_Positive_hybrid = proba_hybrid[:, 1]  # Probabilities of the positive class
Y_Pred_Proba_Negative_hybrid = proba_hybrid[:, 0]  # Probabilities of the negative class

# Converting predictions and actuals into a DataFrame for better readability, including negative class probabilities
hybrid_predictions = pd.DataFrame({
    'Prob_Negative_Class': Y_Pred_Proba_Negative_hybrid,
    'Prob_Positive_Class': Y_Pred_Proba_Positive_hybrid,
    'Predicted': Y_Pred_hybrid,
    'Actual': hybrid_y_test
})
Show the code
# Bring the hybrid-model predictions over from Python and add a labelled
# outcome class plus the positive-class probability column
hybrid_predictions <- py$hybrid_predictions %>%
  mutate(
    Class = ifelse(Actual == 0, "survive", "not_survive"),
    .pred_not_survive = Prob_Positive_Class
  )

# Desired display order of the outcome classes
factor_levels <- c("survive", "not_survive")

# Event level ("not_survive") first, so it is treated as the positive class
hybrid_predictions$Class <- factor(
  hybrid_predictions$Class,
  levels = rev(factor_levels)
)

# Logistic calibration plot of predicted risk against observed outcome
cal_plot_logistic(hybrid_predictions, Class, .pred_not_survive)

Calibrated Model Metrics

             Model       AUC  ...  False Negative (FN)  True Positive (TP)
0  Hybrid_Catboost  0.741997  ...                   31                  71

[1 rows x 13 columns]
Show the code
# Pull the calibrated hybrid-model metrics computed in Python into R
final_model_metrics <- py$catboost_metrics_hybrid

# Display the metrics data frame
print(final_model_metrics)
ModelAUCBrier ScoreAccuracyLog LossF1 ScorePrecisionRecallAUPRTrue Negative (TN)False Positive (FP)False Negative (FN)True Positive (TP)
Hybrid_Catboost0.7420.0840.6740.2940.3030.1940.6960.2846032953171

Final Feature Importances

Show the code
# Retrieve the ranked comparison dataframe from Python
final_shap_df <- py$final_shap_df

# Convert the pandas dataframe to an R tibble, sorted by importance
final_shap_tbl <- as_tibble(final_shap_df) %>% 
  arrange(desc(Importance))

# Attach an overall rank and KEEP it in the displayed table.
# (Previously the rank column was created and then immediately dropped by
# select(), leaving rowid_to_column() as dead code — inconsistent with the
# native-model shap_table above.)
final_shap_table <- final_shap_tbl %>%
  rowid_to_column(var = "Overall Rank") %>%
  select(`Overall Rank`, `Feature Id`, Importance)

# Render as an interactive, centre-aligned datatable
final_shap_table %>% 
  DT::datatable(
    rownames = FALSE,
    options = list(
      columnDefs = list(
        list(className = 'dt-center', targets = "_all")
      )
    )
  )
Cluster Function for Top Features

Show the code
# Cluster feature importances into `optimal_k` groups via k-means.
#
# @param data      Numeric data (e.g. a one-column data frame of importances).
# @param optimal_k Number of clusters to fit.
# @param seed      Optional RNG seed; when supplied the nstart = 25 random
#                  initialisations are reproducible (previously results could
#                  change between renders because no seed was set).
# @return Integer vector of cluster assignments, one per row of `data`.
final_clustering <- function(data, optimal_k, seed = NULL) {
  if (!is.null(seed)) {
    set.seed(seed)
  }
  # Perform k-means clustering with the optimal number of clusters
  kmeans_res <- kmeans(as.matrix(data), centers = optimal_k, nstart = 25)
  return(kmeans_res$cluster)
}

# Number of clusters chosen from the WCSS elbow
optimal_k <- 3

# Assign each feature to a cluster of comparable SHAP importance
final_shap_df <- final_shap_df %>%
  mutate(Cluster = final_clustering(select(., Importance), optimal_k))

# Display the clustered importance table
final_shap_df %>% 
  DT::datatable(
    rownames = FALSE,
    options = list(
      columnDefs = list(list(className = 'dt-center', targets = "_all"))
    )
  )

‘ECMO_Reg’ is the cutoff feature suggested by the WCSS (within-cluster sum of squares) clustering. However, because several correlated variables may be potential drivers of ‘Med_Refusals’, we retain a few additional features and set the final cutoff at ‘Txp_Volume’, corresponding to a feature-importance value of 0.02.

CatBoost Model Accuracy Summary

Show the code
model_accuracy
ModelAUCBrier ScoreAccuracyLog LossF1 ScorePrecisionRecallAUPR
Native_Catboost0.7260.08640.6680.3020.3080.1960.7250.237
Hybrid_Catboost0.7420.084 0.6740.2940.3030.1940.6960.284
Show the code
model_confusion_matrix
ModelTrue Negative (TN)False Positive (FP)False Negative (FN)True Positive (TP)
Native_Catboost5943042874
Hybrid_Catboost6032953171

SHAP Value Analysis

Show the code

# Make the SHAP feature names human-readable: underscores become spaces
feature_names = shap_values.feature_names

updated_feature_names = []
for raw_name in feature_names:
    updated_feature_names.append(raw_name.replace('_', ' '))

# Write the cleaned names back onto the SHAP values object
shap_values.feature_names = updated_feature_names

Mean Absolute Value Feature Importance

Show the code
library(ggplot2)
library(plotly)
library(dplyr)


# Bar chart of mean |SHAP| per feature, sorted descending, with the
# 0.02 importance cutoff (Txp_Volume) marked in red
p <- final_shap_df %>%
  ggplot(aes(
    x = reorder(`Feature Id`, -Importance),
    y = Importance,
    fill = `Feature Id`
  )) +
  geom_bar(stat = "identity") +
  # Dashed red line marks the chosen feature-importance cutoff
  geom_hline(yintercept = 0.02, linetype = "dashed", color = "red") +
  annotate(
    "text",
    x = 33.5, y = 0.025,
    label = "Cutoff@0.02 (Txp_Volume)",
    color = "red", hjust = 0
  ) +
  labs(
    title = "Sorted Mean Absolute SHAP Values",
    x = "Features",
    y = "Mean Absolute SHAP Value"
  ) +
  theme_minimal() +
  # Angle the x labels so long feature names stay readable
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),
    legend.position = "none"
  )

# Interactive version with hover tooltips for feature name and value
p_interactive <- ggplotly(p, tooltip = c("x", "y"))

# Display the interactive plot
p_interactive
Show the code
# Save the interactive plot as HTML
# NOTE(review): writes to the current working directory; saveWidget may also
# emit a supporting-files folder unless selfcontained = TRUE — confirm which
# is wanted for publishing.
htmlwidgets::saveWidget(p_interactive, "sorted_shap_values_interactive.html")

Beeswarm (Top Features - Categorical and Numerical)

Figure 1: Beeswarm Chart

Bar Chart for Feature Importance

Figure 2: Feature Importance Chart

Beeswarm Top Numerical Features

Figure 3: Numerical Beeswarm Chart

Feature Importance Correlation Plot

Figure 4: SHAP Value Correlation Plot

SHAP Partial Dependence Plots (Test)

SHAP Partial Dependence Plots

SHAP Partial Dependence Plots (Train)

SHAP Partial Dependence Plots

Show the code
# Cleaned feature names (underscores replaced with spaces) from Python
features <- py$updated_feature_names

Stratified Model Accuracy Metrics

Show the code
import pandas as pd
import numpy as np
# recall_score, f1_score and brier_score_loss are used below, so import them
# explicitly here instead of relying on an earlier chunk's imports
from sklearn.metrics import (
    roc_auc_score, log_loss, average_precision_score, accuracy_score,
    precision_score, recall_score, f1_score, brier_score_loss
)

# Convert to Series
Y_PRED_HYBRID_series = pd.Series(Y_PRED_HYBRID)
Y_Pred_Proba_Positive_hybrid_series = pd.Series(Y_Pred_Proba_Positive_hybrid)

# Convert from list to DataFrame, aligned to the test-set index
HYBRID_Y_TEST_DF = pd.DataFrame({"y_true": hybrid_y_test}, index=hybrid_X_test.index)

# Ensure index consistency across all datasets
hybrid_X_test.index = hybrid_X_test.index.astype(str)
Y_PRED_HYBRID_series.index = Y_PRED_HYBRID_series.index.astype(str)
Y_Pred_Proba_Positive_hybrid_series.index = Y_Pred_Proba_Positive_hybrid_series.index.astype(str)
HYBRID_Y_TEST_DF.index = HYBRID_Y_TEST_DF.index.astype(str)

# List of categorical features
one_hot_columns = [
    'Blood_Type_A', 'Blood_Type_AB', 'Blood_Type_B', 'Blood_Type_O',
    'Cereb_Vasc', 'CHD_NoSurgery', 'CHD_Surgery', "DCM", "Diabetes",
    "Dialysis", "ECMO_Reg", "Gender", "Inotrop", "Myocard",
    'Race_Asian', 'Race_Black', 'Race_Hispanic', 'Race_Other', 'Race_White',
    "VAD", "Ventilator"
]

# Create an empty list to store results
stratified_metrics = []

# Loop through each categorical feature
for col in one_hot_columns:

    mask = hybrid_X_test.loc[:, col] == 1  # Boolean mask for this category
    mask = mask[mask].index  # Get indices where mask is True

    if len(mask) == 0:
        continue  # Skip if no samples for this group

    # Ensure only valid indices exist in Y_PRED_HYBRID_series
    valid_idx = mask.intersection(Y_PRED_HYBRID_series.index)
    missing_idx = mask.difference(Y_PRED_HYBRID_series.index)

    # Subset using only valid indices
    y_true_subset = HYBRID_Y_TEST_DF.loc[valid_idx, "y_true"]
    y_pred_proba_subset = Y_Pred_Proba_Positive_hybrid_series.loc[valid_idx]
    y_pred_subset = Y_PRED_HYBRID_series.loc[valid_idx]

    # Drop NaNs if they exist in any subset
    non_nan_idx = y_true_subset.dropna().index  # Get non-NaN indices
    y_true_subset = y_true_subset.loc[non_nan_idx]
    y_pred_proba_subset = y_pred_proba_subset.loc[non_nan_idx]
    y_pred_subset = y_pred_subset.loc[non_nan_idx]

    if y_pred_subset.isnull().sum() > 0:
        print(f"⚠️ Warning: {y_pred_subset.isnull().sum()} NaN values found in y_pred_subset for {col}, skipping this feature...")
        continue

    # Subgroup size is defined for EVERY feature.
    # (Bug fix: previously it was only assigned inside the two-class branch,
    # so a single-class subgroup reused the previous iteration's value — or
    # raised NameError if it was the first group.)
    sample_size = len(y_true_subset)

    # Rank-based metrics need at least 2 classes present in the subgroup
    if len(set(y_true_subset)) > 1:
        auc = roc_auc_score(y_true_subset, y_pred_proba_subset)
        logloss = log_loss(y_true_subset, y_pred_proba_subset)
        aupr = average_precision_score(y_true_subset, y_pred_proba_subset)
    else:
        auc, logloss, aupr = np.nan, np.nan, np.nan

    # Threshold-based metrics are always computable
    accuracy = accuracy_score(y_true_subset, y_pred_subset)
    precision = precision_score(y_true_subset, y_pred_subset, zero_division=0)
    recall = recall_score(y_true_subset, y_pred_subset)
    f1 = f1_score(y_true_subset, y_pred_subset)
    brier = brier_score_loss(y_true_subset, y_pred_proba_subset)

    # Store results
    stratified_metrics.append({
        "Feature": col,  
        "AUC": auc,
        "Accuracy": accuracy,
        "Precision": precision,
        "Recall": recall,
        "F1 Score": f1,
        "Brier Score": brier,
        "Log Loss": logloss,
        "AUPR": aupr,
        "Sample Size": sample_size
    })

# Convert results to DataFrame
stratified_metrics_df = pd.DataFrame(stratified_metrics)
print("\n✅ Final Stratified Metrics DataFrame")

✅ Final Stratified Metrics DataFrame
Show the code
print(stratified_metrics_df)
          Feature       AUC  Accuracy  ...  Log Loss      AUPR  Sample Size
0    Blood_Type_A  0.618243  0.667702  ...  0.279168  0.144620          322
1   Blood_Type_AB  0.764706  0.600000  ...  0.146638  0.111111           35
2    Blood_Type_B  0.760331  0.686567  ...  0.280873  0.307213          134
3    Blood_Type_O  0.651526  0.635827  ...  0.360254  0.247839          508
4   CHD_NoSurgery  0.944444  0.750000  ...  0.284438  0.607275           48
5     CHD_Surgery  0.655366  0.642534  ...  0.390794  0.251121          442
6             DCM  0.599759  0.634349  ...  0.234869  0.175898          361
7        Dialysis  0.428571  0.545455  ...  1.093794  0.517992           11
8        ECMO_Reg  0.761970  0.650000  ...  0.603077  0.579953           60
9         Inotrop  0.630678  0.614350  ...  0.348669  0.207420          446
10        Myocard  0.838095  0.707317  ...  0.344932  0.590443           41
11     Race_Asian  0.480000  0.622222  ...  0.383320  0.139329           45
12     Race_Black  0.621053  0.619718  ...  0.338536  0.181810          213
13  Race_Hispanic  0.651778  0.665094  ...  0.366889  0.242437          212
14     Race_Other  0.807692  0.642857  ...  0.223097  0.236111           28
15     Race_White  0.687243  0.662675  ...  0.284003  0.275903          501
16            VAD  0.641599  0.681481  ...  0.292282  0.159473          135
17     Ventilator  0.706793  0.637306  ...  0.506780  0.424836          193

[18 rows x 10 columns]
Show the code
# Overall hybrid-model metrics and the per-subgroup (stratified) metrics,
# both computed in the Python session
risk_metrics <- py$catboost_metrics_hybrid

stratified_risk_metrics <- py$stratified_metrics_df
Show the code
# Combine the overall model metrics with the stratified subgroup metrics
df1 <- risk_metrics
df2 <- stratified_risk_metrics

# Align the tables: the overall row's 'Model' column becomes 'Feature'
colnames(df1)[colnames(df1) == "Model"] <- "Feature"

# The overall row's sample size is the full training-set row count
sample_size <- nrow(py$hybrid_X_train)
df1$`Sample Size` <- sample_size

# Keep only the columns both tables share, in df1's order
common_columns <- intersect(colnames(df1), colnames(df2))
df1 <- df1[, common_columns]
df2 <- df2[, common_columns]

# Stack the overall row on top of the subgroup rows
consolidated_df <- rbind(df1, df2)

write_csv(consolidated_df, "stratified_risk_metrics.csv")

# Display the consolidated dataframe
print(consolidated_df)
           Feature       AUC Brier Score  Accuracy  Log Loss  F1 Score
1  Hybrid_Catboost 0.7419975  0.08404127 0.6740000 0.2937210 0.3034188
2     Blood_Type_A 0.6182432  0.07389593 0.6677019 0.2791682 0.2302158
3    Blood_Type_AB 0.7647059  0.03148715 0.6000000 0.1466382 0.1250000
4     Blood_Type_B 0.7603306  0.08062061 0.6865672 0.2808726 0.2758621
5     Blood_Type_O 0.6515261  0.10354411 0.6358268 0.3602540 0.2745098
6    CHD_NoSurgery 0.9444444  0.09152532 0.7500000 0.2844378 0.5000000
7      CHD_Surgery 0.6553665  0.11440599 0.6425339 0.3907943 0.3070175
8              DCM 0.5997586  0.05836376 0.6343490 0.2348692 0.1428571
9         Dialysis 0.4285714  0.31969246 0.5454545 1.0937939 0.2857143
10        ECMO_Reg 0.7619699  0.20297276 0.6500000 0.6030771 0.5882353
11         Inotrop 0.6306776  0.09919690 0.6143498 0.3486687 0.2586207
12         Myocard 0.8380952  0.10819964 0.7073171 0.3449324 0.4545455
13      Race_Asian 0.4800000  0.10646730 0.6222222 0.3833199 0.1904762
14      Race_Black 0.6210526  0.09622726 0.6197183 0.3385359 0.2429907
15   Race_Hispanic 0.6517783  0.10525065 0.6650943 0.3668893 0.2828283
16      Race_Other 0.8076923  0.06273958 0.6428571 0.2230974 0.2857143
17      Race_White 0.6872432  0.07773020 0.6626747 0.2840033 0.2555066
18             VAD 0.6415989  0.08055565 0.6814815 0.2922816 0.1886792
19      Ventilator 0.7067932  0.15841963 0.6373057 0.5067799 0.3965517
    Precision    Recall      AUPR Sample Size
1  0.19398907 0.6960784 0.2837264        4523
2  0.14159292 0.6153846 0.1446203         322
3  0.06666667 1.0000000 0.1111111          35
4  0.17777778 0.6153846 0.3072126         134
5  0.18134715 0.5645161 0.2478393         508
6  0.33333333 1.0000000 0.6072751          48
7  0.20833333 0.5833333 0.2511207         442
8  0.08333333 0.5000000 0.1758977         361
9  0.33333333 0.2500000 0.5179924          11
10 0.44117647 0.8823529 0.5799533          60
11 0.16574586 0.5882353 0.2074200         446
12 0.31250000 0.8333333 0.5904431          41
13 0.12500000 0.4000000 0.1393286          45
14 0.15476190 0.5652174 0.1818097         213
15 0.19178082 0.5384615 0.2424367         212
16 0.16666667 1.0000000 0.2361111          28
17 0.16022099 0.6304348 0.2759031         501
18 0.12195122 0.4166667 0.1594732         135
19 0.29870130 0.5897436 0.4248365         193

Final Metrics with Confidence Intervals

Show the code

from sklearn.utils import resample
import numpy as np

# Function for stratified resampling
def stratified_bootstrap(y_true, y_pred, random_state=None):
    """Draw one stratified bootstrap sample of (y_pred, y_true).

    Indices are resampled with replacement, stratified on y_true so the
    class balance of the original labels is preserved in every resample.
    Returns the resampled predictions first, then the resampled labels.
    """
    labels = np.array(y_true)
    preds = np.array(y_pred)

    # Resample positions rather than values so both arrays stay aligned
    idx = resample(
        np.arange(len(labels)),
        stratify=labels,
        random_state=random_state,
    )

    return preds[idx], labels[idx]


# Function to compute confidence intervals for a metric
def compute_confidence_interval(metric_function, y_true, y_pred, 
                              n_iterations=1000, ci=95):
    """Percentile bootstrap CI for metric_function(y_true, y_pred).

    Runs n_iterations stratified bootstrap resamples (seeded 0..n-1, so the
    interval is reproducible) and returns the (lower, upper) bounds of the
    central `ci`% of the resampled metric values.
    """
    scores = []
    for seed in range(n_iterations):
        preds_bs, labels_bs = stratified_bootstrap(y_true, y_pred, random_state=seed)
        scores.append(metric_function(labels_bs, preds_bs))

    tail = (100 - ci) / 2
    lower = np.percentile(scores, tail)
    upper = np.percentile(scores, 100 - tail)
    return lower, upper
Show the code

# Build the final hybrid-model metrics table with bootstrap CIs.
# Each metric is computed once on the full test set, then its 95% CI comes
# from compute_confidence_interval (stratified percentile bootstrap).

# Define model predictions and true labels
# NOTE(review): this recomputes predict_proba even though the same column was
# computed in an earlier chunk — harmless but redundant; confirm intentional.
Y_Pred_Proba_Positive_hybrid = hybrid_model.predict_proba(hybrid_X_test)[:, 1]  # Probabilities of the positive class
# NOTE(review): the R hybrid_predictions shown above has a 'Predicted' column,
# not 'predicted_classes' — presumably a later R chunk adds it; verify.
Y_PRED_HYBRID = r.hybrid_predictions["predicted_classes"]  # Calibrated predictions from R
y_true = hybrid_y_test

# Calculate AUC (using probabilities)
auc_hybrid = roc_auc_score(y_true, Y_Pred_Proba_Positive_hybrid)
auc_ci = compute_confidence_interval(roc_auc_score, y_true, Y_Pred_Proba_Positive_hybrid)

# Calculate Brier Score (using probabilities)
brier_score_hybrid = brier_score_loss(y_true, Y_Pred_Proba_Positive_hybrid)
brier_score_ci = compute_confidence_interval(brier_score_loss, y_true, Y_Pred_Proba_Positive_hybrid)

# Calculate Accuracy (using binary predictions)
accuracy_hybrid = accuracy_score(y_true, Y_PRED_HYBRID)
accuracy_ci = compute_confidence_interval(accuracy_score, y_true, Y_PRED_HYBRID)

# Calculate Log Loss (using probabilities)
log_loss_value_hybrid = log_loss(y_true, Y_Pred_Proba_Positive_hybrid)
log_loss_ci = compute_confidence_interval(log_loss, y_true, Y_Pred_Proba_Positive_hybrid)

# Calculate F1 Score (using binary predictions)
f1_hybrid = f1_score(y_true, Y_PRED_HYBRID)
f1_ci = compute_confidence_interval(f1_score, y_true, Y_PRED_HYBRID)

# Calculate Precision (using binary predictions)
precision_hybrid = precision_score(y_true, Y_PRED_HYBRID)
precision_ci = compute_confidence_interval(precision_score, y_true, Y_PRED_HYBRID)

# Calculate Recall (using binary predictions)
recall_hybrid = recall_score(y_true, Y_PRED_HYBRID)
recall_ci = compute_confidence_interval(recall_score, y_true, Y_PRED_HYBRID)

# Calculate AUPR (using probabilities)
aupr_hybrid = average_precision_score(y_true, Y_Pred_Proba_Positive_hybrid)
aupr_ci = compute_confidence_interval(average_precision_score, y_true, Y_Pred_Proba_Positive_hybrid)

# Calculate Confusion Matrix (using binary predictions)
conf_matrix_hybrid = confusion_matrix(y_true, Y_PRED_HYBRID)

# Extract TN, FP, FN, TP from the confusion matrix
# (sklearn's confusion_matrix ravels in TN, FP, FN, TP order for binary labels)
tn_hybrid, fp_hybrid, fn_hybrid, tp_hybrid = conf_matrix_hybrid.ravel()

# Create a DataFrame with all metrics and their confidence intervals
# (one row; read back into R as py$catboost_metrics_conf_level)
catboost_metrics_conf_level = pd.DataFrame({
    'Model': ['Hybrid_Catboost'],
    'AUC': [auc_hybrid],  
    'AUC CI Lower': [auc_ci[0]],  
    'AUC CI Upper': [auc_ci[1]],  
    'Brier Score': [brier_score_hybrid],  
    'Brier Score CI Lower': [brier_score_ci[0]],  
    'Brier Score CI Upper': [brier_score_ci[1]],  
    'Accuracy': [accuracy_hybrid],  
    'Accuracy CI Lower': [accuracy_ci[0]],  
    'Accuracy CI Upper': [accuracy_ci[1]],  
    'Log Loss': [log_loss_value_hybrid],  
    'Log Loss CI Lower': [log_loss_ci[0]],  
    'Log Loss CI Upper': [log_loss_ci[1]],  
    'F1 Score': [f1_hybrid],  
    'F1 Score CI Lower': [f1_ci[0]],  
    'F1 Score CI Upper': [f1_ci[1]],  
    'Precision': [precision_hybrid],  
    'Precision CI Lower': [precision_ci[0]],  
    'Precision CI Upper': [precision_ci[1]],  
    'Recall': [recall_hybrid],  
    'Recall CI Lower': [recall_ci[0]],  
    'Recall CI Upper': [recall_ci[1]],  
    'AUPR': [aupr_hybrid],  
    'AUPR CI Lower': [aupr_ci[0]],  
    'AUPR CI Upper': [aupr_ci[1]],  
    'True Negative (TN)': [tn_hybrid],  
    'False Positive (FP)': [fp_hybrid],  
    'False Negative (FN)': [fn_hybrid],  
    'True Positive (TP)': [tp_hybrid]  
})
Show the code
metrics_conf_level <- py$catboost_metrics_conf_level

metrics_conf_level %>% str()
'data.frame':   1 obs. of  29 variables:
 $ Model               : chr "Hybrid_Catboost"
 $ AUC                 : num 0.742
 $ AUC CI Lower        : num 0.691
 $ AUC CI Upper        : num 0.79
 $ Brier Score         : num 0.084
 $ Brier Score CI Lower: num 0.0811
 $ Brier Score CI Upper: num 0.0871
 $ Accuracy            : num 0.674
 $ Accuracy CI Lower   : num 0.646
 $ Accuracy CI Upper   : num 0.702
 $ Log Loss            : num 0.294
 $ Log Loss CI Lower   : num 0.281
 $ Log Loss CI Upper   : num 0.308
 $ F1 Score            : num 0.303
 $ F1 Score CI Lower   : num 0.266
 $ F1 Score CI Upper   : num 0.341
 $ Precision           : num 0.194
 $ Precision CI Lower  : num 0.17
 $ Precision CI Upper  : num 0.217
 $ Recall              : num 0.696
 $ Recall CI Lower     : num 0.598
 $ Recall CI Upper     : num 0.784
 $ AUPR                : num 0.284
 $ AUPR CI Lower       : num 0.221
 $ AUPR CI Upper       : num 0.365
 $ True Negative (TN)  : num 603
 $ False Positive (FP) : num 295
 $ False Negative (FN) : num 31
 $ True Positive (TP)  : num 71
 - attr(*, "pandas.index")=RangeIndex(start=0, stop=1, step=1)