#| echo: false
#| warning: false
#| message: false
#| eval: true
import pandas as pd


def get_categorical_indexes(X_train):
    """Return the positional indexes of categorical columns.

    A column counts as categorical when its dtype is 'object' or 'category'.

    Args:
        X_train: a pandas DataFrame.

    Returns:
        list[int]: zero-based column positions of the categorical columns.

    Raises:
        ValueError: if the input is not a pandas DataFrame.
    """
    if not isinstance(X_train, pd.DataFrame):
        raise ValueError("Input must be a pandas DataFrame")
    categorical_positions = [
        position
        for position, column in enumerate(X_train.columns)
        if X_train[column].dtype in ['object', 'category']
    ]
    return categorical_positions
Native CatBoost Model
Show the code
#| echo: false
#| warning: false
#| message: false
#| eval: true

# ---- Training split ----
# Features only (outcome removed)
train_data <- survival_model %>% select(-outcome)
# Target must be a factor or numeric "Class"
train_target <- survival_model %>%
  select(outcome) %>%
  as.data.frame()
train_Y <- survival_model$outcome

# ---- Test split ----
test_data <- survival_model_test %>% select(-outcome)
# Target must be a factor or numeric "Class"
test_target <- survival_model_test %>%
  select(outcome) %>%
  as.data.frame()
test_Y <- survival_model_test$outcome
Data Cleansing
Show the code
#| echo: false
#| warning: false
#| message: false

# Replace NaN with the literal string 'missing' in every object-dtype
# (categorical) column of the training features.
categorical_columns = [c for c in X_train.columns if X_train[c].dtype == 'object']
X_train[categorical_columns] = X_train[categorical_columns].fillna('missing')

# Same treatment for the test features.
categorical_columns = [c for c in X_test.columns if X_test[c].dtype == 'object']
X_test[categorical_columns] = X_test[categorical_columns].fillna('missing')
Show the code
#| echo: false#| warning: false#| message: false# NaN values in the column at index 15has_nan = X_train.iloc[:, 11].isna().any()print("Column at index 11 contains NaN values:", has_nan)
Column at index 11 contains NaN values: True
Clean/Fix index 11
Show the code
#| echo: false
#| warning: false
#| message: false

# Ensure the column at index 11 is categorical and "Missing" is one of its
# levels. FIX: assign by column label rather than through .iloc — writing a
# categorical Series that carries a new level back via .iloc triggers pandas'
# "incompatible dtype" FutureWarning (visible in the captured output below).
col_11 = X_train.columns[11]
if 'Missing' not in X_train[col_11].cat.categories:
    X_train[col_11] = X_train[col_11].cat.add_categories(['Missing'])
<string>:8: FutureWarning: Setting an item of incompatible dtype is deprecated and will raise in a future error of pandas. Value '0 NONE
1 NONE
2 NONE
3 NONE
4 NONE
...
4518 NONE
4519 NONE
4520 NONE
4521 NONE
4522 NONE
Name: VAD_TCR, Length: 4523, dtype: category
Categories (6, object): ['LVAD', 'LVAD+RVAD', 'NONE', 'RVAD', 'TAH', 'Missing']' has dtype incompatible with category, please explicitly cast to a compatible dtype first.
Show the code
# Explicitly cast the column at index 11 to a category dtype so later
# assignments are dtype-compatible.
recast = X_train.iloc[:, 11].astype('category')
X_train.iloc[:, 11] = recast
# Map any remaining NaN entries onto the "Missing" level.
X_train.iloc[:, 11] = X_train.iloc[:, 11].fillna('Missing')
'data.frame': 1 obs. of 13 variables:
$ Model : chr "Native_Catboost"
$ AUC : num 0.726
$ Brier Score : num 0.0864
$ Accuracy : num 0.668
$ Log Loss : num 0.302
$ F1 Score : num 0.308
$ Precision : num 0.196
$ Recall : num 0.725
$ AUPR : num 0.237
$ True Negative (TN) : num 594
$ False Positive (FP): num 304
$ False Negative (FN): num 28
$ True Positive (TP) : num 74
- attr(*, "pandas.index")=RangeIndex(start=0, stop=1, step=1)
SHAP Feature Importance
Show the code
# Pull the ranked SHAP comparison dataframe from the Python session
shap_df <- py$shap_df

# Convert to a tibble, add the absolute importance, sort descending
shap_tbl <- as_tibble(shap_df) %>%
  mutate(SHAP_Importance = abs(Importance_SHAP)) %>%
  arrange(-SHAP_Importance)

# Display table: overall rank plus the SHAP columns of interest
shap_table <- shap_tbl %>%
  rowid_to_column(var = "Overall Rank") %>%
  select(
    'Overall Rank', 'Feature Id', 'Importance_SHAP', 'SHAP_Direction',
    'SHAP_Importance'
  )

# Interactive, centered datatable
shap_table %>%
  DT::datatable(
    rownames = FALSE,
    options = list(
      columnDefs = list(
        list(className = 'dt-center', targets = "_all")
      )
    )
  )
Feature names are consistent between training and test datasets.
Data Cleansing
Show the code
#| echo: false#| warning: false#| message: false# NaN values in the column at index 131has_nan = hybrid_X_train.iloc[:, 37].isna().any()print("Column at index 37 contains NaN values:", has_nan)
Column at index 37 contains NaN values: True
Show the code
#| echo: false
#| warning: false
#| message: false

# Ensure the column at index 37 is categorical and "Missing" is one of its
# levels (the original comment said index 38; the code uses 37). FIX: assign
# by column label rather than through .iloc — writing a categorical Series
# carrying a new level back via .iloc triggers pandas' "incompatible dtype"
# FutureWarning (visible in the captured output below).
col_37 = hybrid_X_train.columns[37]
if 'Missing' not in hybrid_X_train[col_37].cat.categories:
    hybrid_X_train[col_37] = hybrid_X_train[col_37].cat.add_categories(['Missing'])
<string>:8: FutureWarning: Setting an item of incompatible dtype is deprecated and will raise in a future error of pandas. Value '1 NONE
2 NONE
3 NONE
4 NONE
5 NONE
...
4519 NONE
4520 NONE
4521 NONE
4522 NONE
4523 NONE
Name: VAD_TCR, Length: 4523, dtype: category
Categories (6, object): ['LVAD', 'LVAD+RVAD', 'NONE', 'RVAD', 'TAH', 'Missing']' has dtype incompatible with category, please explicitly cast to a compatible dtype first.
Show the code
# Explicitly cast the column at index 37 to a category dtype so later
# assignments are dtype-compatible.
recast = hybrid_X_train.iloc[:, 37].astype('category')
hybrid_X_train.iloc[:, 37] = recast
# Map any remaining NaN entries onto the "Missing" level.
hybrid_X_train.iloc[:, 37] = hybrid_X_train.iloc[:, 37].fillna('Missing')
Warning: Overfitting detector is active, thus evaluation metric is calculated on every iteration. 'metric_period' is ignored for evaluation metric.
0: test: 0.5813245 best: 0.5813245 (0) total: 4.15ms remaining: 4.14s
Stopped by overfitting detector (50 iterations wait)
bestTest = 0.7428381152
bestIteration = 223
<catboost.core.CatBoostClassifier object at 0x00000212F2E83500>
Calibration Plot
Show the code
import pandas as pd

# Score the hybrid model once and reuse the probability matrix.
# FIX: the original called predict_proba three times on the same input;
# one call is sufficient (and three times faster).
Y_Pred_hybrid = hybrid_model.predict(hybrid_X_test)
_proba_hybrid = hybrid_model.predict_proba(hybrid_X_test)
Y_Pred_Proba_hybrid = _proba_hybrid[:, 1]           # positive-class probabilities
Y_Pred_Proba_Positive_hybrid = _proba_hybrid[:, 1]  # kept for downstream names
Y_Pred_Proba_Negative_hybrid = _proba_hybrid[:, 0]  # negative-class probabilities

# Collect predictions and actuals into a readable DataFrame, including the
# negative-class probabilities.
hybrid_predictions = pd.DataFrame({
    'Prob_Negative_Class': Y_Pred_Proba_Negative_hybrid,
    'Prob_Positive_Class': Y_Pred_Proba_Positive_hybrid,
    'Predicted': Y_Pred_hybrid,
    'Actual': hybrid_y_test
})
Show the code
# Bring the hybrid predictions over from Python and derive the columns the
# calibration plot needs.
hybrid_predictions <- py$hybrid_predictions %>%
  mutate(
    Class = ifelse(Actual == 0, "survive", "not_survive"),
    .pred_not_survive = Prob_Positive_Class
  )

# Desired factor order, reversed so "not_survive" is the first level
factor_levels <- c("survive", "not_survive")
hybrid_predictions$Class <- factor(hybrid_predictions$Class, levels = rev(factor_levels))

# Logistic calibration plot of predicted vs. observed risk
hybrid_predictions %>% cal_plot_logistic(Class, .pred_not_survive)
Decision Threshold
Optimal Threshold: 0.09807151
Number of '1's predicted: 366
Calibrated Model Metrics
Model AUC ... False Negative (FN) True Positive (TP)
0 Hybrid_Catboost 0.741997 ... 31 71
[1 rows x 13 columns]
# Retrieve the final SHAP importance dataframe from the Python session
final_shap_df <- py$final_shap_df

# Tibble sorted by descending importance
final_shap_tbl <- as_tibble(final_shap_df) %>%
  arrange(desc(Importance))

# Ranked display table (rank column added, then only the display columns kept)
final_shap_table <- final_shap_tbl %>%
  rowid_to_column(var = "Overall Rank") %>%
  select('Feature Id', 'Importance')

# Interactive, centered datatable
final_shap_table %>%
  DT::datatable(
    rownames = FALSE,
    options = list(
      columnDefs = list(
        list(className = 'dt-center', targets = "_all")
      )
    )
  )
Cluster Function for Top Features
Show the code
# Assign each row of `data` to one of `optimal_k` k-means clusters.
# NOTE(review): no set.seed here, so cluster labels are not reproducible
# across runs — confirm whether that matters downstream.
final_clustering <- function(data, optimal_k) {
  # 25 random restarts for a stable solution
  km_fit <- kmeans(as.matrix(data), centers = optimal_k, nstart = 25)
  return(km_fit$cluster)
}

optimal_k <- 3

# Attach cluster labels (based on Importance) to the SHAP table
final_shap_df <- final_shap_df %>%
  mutate(Cluster = final_clustering(select(., Importance), optimal_k))

# Show the clustered data as an interactive, centered datatable
final_shap_df %>%
  DT::datatable(
    rownames = FALSE,
    options = list(
      columnDefs = list(
        list(className = 'dt-center', targets = "_all")
      )
    )
  )
‘ECMO_Reg’ is the cutoff feature based on WCSS (within-cluster sum of squares). However, we can include a few additional features that may be potential drivers of ‘Med_Refusals’, since several of our variables are correlated. For this reason, we set the final cutoff at ‘Txp_Volume’, i.e. a Feature Importance value of 0.02.
CatBoost Model Accuracy Summary
Show the code
# Print the combined model accuracy summary table (built earlier in the document)
model_accuracy
Model
AUC
Brier Score
Accuracy
Log Loss
F1 Score
Precision
Recall
AUPR
Native_Catboost
0.726
0.0864
0.668
0.302
0.308
0.196
0.725
0.237
Hybrid_Catboost
0.742
0.084
0.674
0.294
0.303
0.194
0.696
0.284
Show the code
# Print the combined confusion-matrix table (built earlier in the document)
model_confusion_matrix
Model
True Negative (TN)
False Positive (FP)
False Negative (FN)
True Positive (TP)
Native_Catboost
594
304
28
74
Hybrid_Catboost
603
295
31
71
SHAP Value Analysis
Show the code
# Prettify SHAP feature labels for plotting: underscores become spaces.
feature_names = shap_values.feature_names
updated_feature_names = [
    label.replace('_', ' ')
    for label in feature_names
]
shap_values.feature_names = updated_feature_names
Mean Absolute Value Feature Importance
Show the code
library(ggplot2)
library(plotly)
library(dplyr)

# Sorted bar chart of mean absolute SHAP values with the 0.02 cutoff marked
p <- ggplot(final_shap_df, aes(x = reorder(`Feature Id`, -Importance), y = Importance, fill = `Feature Id`)) +
  geom_bar(stat = "identity") +
  # Dashed red line at the chosen importance cutoff
  geom_hline(yintercept = 0.02, linetype = "dashed", color = "red") +
  # Label the cutoff line
  annotate("text", x = 33.5, y = 0.025, label = "Cutoff@0.02 (Txp_Volume)", color = "red", hjust = 0) +
  labs(title = "Sorted Mean Absolute SHAP Values", x = "Features", y = "Mean Absolute SHAP Value") +
  theme_minimal() +
  # Angle the x labels for readability; no legend needed
  theme(axis.text.x = element_text(angle = 45, hjust = 1), legend.position = "none")

# Interactive version with hover tooltips for feature name and value
p_interactive <- ggplotly(p, tooltip = c("x", "y"))

# Display the interactive plot
p_interactive
Show the code
# Save the interactive plot as a standalone HTML widget
htmlwidgets::saveWidget(p_interactive, "sorted_shap_values_interactive.html")
Beeswarm (Top Features - Categorical and Numerical)
Final Shap Value Partial Dependency Plotting Function
Show the code
# Dispatch each requested feature to the numeric or categorical PDP plotter.
# Args:
#   feature_names - character vector of feature names to plot
#   pdp_data_df   - dataframe holding the raw feature values
#   pdp_shap_data - matching per-row SHAP values
#   save_plots    - whether the plotters should write PNGs to disk
create_shap_pdp_plots <- function(feature_names, pdp_data_df, pdp_shap_data, save_plots = TRUE) {
  for (feature_name in feature_names) {
    # Skip features that are not present in the data
    if (!feature_name %in% colnames(pdp_data_df)) {
      message(paste("Feature", feature_name, "not found in the data. Skipping..."))
      next
    }
    if (is.numeric(pdp_data_df[[feature_name]])) {
      # Numeric feature -> scatter-style PDP
      # FIX: forward the save_plots argument instead of hard-coding TRUE
      create_pdp_numeric(feature_name, pdp_data_df, pdp_shap_data, save_plots = save_plots)
    } else if (is.factor(pdp_data_df[[feature_name]]) || is.character(pdp_data_df[[feature_name]])) {
      # Categorical feature -> bar-chart PDP
      create_pdp_categorical(feature_name, pdp_data_df, pdp_shap_data,
                             save_plot = save_plots, plot_width = 5, plot_height = 4)
    } else {
      message(paste("Unsupported data type for feature:", feature_name, "Skipping..."))
    }
  }
}
# SHAP partial-dependency scatter plot (with a marginal histogram) for one
# numeric training-set feature.
# Args:
#   feature_name      - column present in both dataframes below
#   pdp_data_train_df - training data with the raw feature values
#   pdp_shap_train_df - matching per-row SHAP values, same column names
#   save_plots        - write a PNG under images/dependence_plots_train/
create_pdp_numeric_train <- function(feature_name, pdp_data_train_df, pdp_shap_train_df, save_plots = TRUE) {
  # NOTE(review): declared for Z-score outlier removal but never applied;
  # only the hard-coded eGFR < 400 filter below is used.
  threshold <- 3

  # FIX: validate against the SHAP dataframe that was passed in
  # (pdp_shap_train_df); the original checked the global pdp_shap_df.
  if (!feature_name %in% colnames(pdp_data_train_df) || !feature_name %in% colnames(pdp_shap_train_df)) {
    stop(paste("Feature", feature_name, "not found in the data."))
  }

  # Pair raw feature values with their SHAP values
  pdp_shap_data_feature <- data.frame(
    feature_value = pdp_data_train_df[[feature_name]],  # original feature values
    shap_value = pdp_shap_train_df[[feature_name]]      # SHAP values
  )

  # Drop rows with NA in either column
  pdp_shap_data_feature <- na.omit(pdp_shap_data_feature)

  # Domain-specific cap: treat eGFR values >= 400 as outliers
  if (feature_name == 'eGFR') {
    pdp_shap_data_feature <- pdp_shap_data_feature[pdp_shap_data_feature$feature_value < 400, ]
  }

  # Unit labels used in axis titles only
  unit_labels <- list(
    eGFR = "ml/min/1.73m^2",
    Albumin = "g/dL",
    Height = "cm",
    Weight = "kgs",
    Age = "years",
    BSA = "m^2",
    BMI = "kg/m^2"
  )
  unit_label <- unit_labels[[feature_name]]

  # Main scatter plot of SHAP value vs. feature value
  scatter_plot <- ggplot(pdp_shap_data_feature, aes(x = feature_value, y = shap_value)) +
    geom_point(color = 'blue', alpha = 0.6) +
    scale_x_continuous(breaks = function(x) pretty(x, n = 10)) +
    scale_y_continuous() +
    labs(
      x = if (!is.null(unit_label) && unit_label != "") paste(feature_name, " Values (", unit_label, ")", sep = "") else paste(feature_name, "Values"),
      y = "SHAP Values",
      title = paste("SHAP Partial Dependency Plot for", feature_name)
    ) +
    theme_minimal()

  # Marginal histogram showing the x-axis distribution
  histogram_plot <- ggplot(pdp_shap_data_feature, aes(x = feature_value)) +
    geom_histogram(fill = "grey", bins = 30) +
    theme_minimal() +
    labs(x = paste(feature_name, "Values (", unit_label, ")", sep = ""), y = "Count")

  # Strip labels, ticks, and gridlines from the histogram for a clean margin
  histogram_plot <- histogram_plot +
    theme(
      axis.title.x = element_blank(),
      axis.text.x = element_blank(),
      axis.ticks.x = element_blank(),
      axis.title.y = element_blank(),
      axis.text.y = element_blank(),
      axis.ticks.y = element_blank(),
      panel.grid = element_blank(),
      plot.margin = margin(5, 5, 5, 5)
    )

  # Stack scatter over histogram (patchwork); histogram kept thin
  combined_plot <- scatter_plot / histogram_plot + plot_layout(heights = c(20, 1))

  # Display the combined plot
  print(combined_plot)

  # Optionally save as PNG
  if (save_plots) {
    ggsave(paste0("images/dependence_plots_train/shap_plot_numeric_", feature_name, ".png"), combined_plot)
  }
}
Categorical Plotting Function
Show the code
# Bar-chart PDP of the mean SHAP value per category of one training-set
# categorical feature.
# Args:
#   feature_name        - column present in both dataframes below
#   pdp_data_train_df   - training data with the raw (categorical) values
#   pdp_shap_train_df   - matching per-row SHAP values, same column names
#   save_plot           - write a PNG under images/dependence_plots_train/
#   plot_width/height   - saved-plot dimensions passed to ggsave
create_pdp_categorical_train <- function(feature_name, pdp_data_train_df, pdp_shap_train_df, save_plot = TRUE, plot_width = 5, plot_height = 4) {
  # Check if the feature is present in both the data and the SHAP matrix
  if (!feature_name %in% colnames(pdp_data_train_df) || !feature_name %in% colnames(pdp_shap_train_df)) {
    stop(paste("Feature", feature_name, "not found in the data."))
  }

  # Pair each row's category with its SHAP value
  pdp_shap_data_feature <- data.frame(
    feature_value = pdp_data_train_df[[feature_name]],  # actual feature values (categories)
    shap_value = pdp_shap_train_df[[feature_name]]      # corresponding SHAP values
  )

  # Human-readable axis labels for selected features
  cat_labels <- list(
    DCM = "Dilated Cardiomyopathy",
    HCM = "Hypertrophic Cardiomyopathy",
    ECMO = "ECMO Cardiogram",
    RCM = "Restrictive Cardiomyopathy",
    Myocard = "Myocardium",
    Defib = "Defibrillator",
    `VAD REG` = "Ventricular Assist Device (VAD) at Registration",
    `VAD TCR` = "Ventricular Assist Device (VAD) at Listing",
    VHD = "Valvular Heart Disease (VHD)",
    `XMatch Req` = "Transplant Match Requested"
  )
  # NULL when the feature has no friendly label; the labs() call falls back
  cat_label <- cat_labels[[feature_name]]

  # Mean SHAP value per category
  pdp_shap_summary <- pdp_shap_data_feature %>%
    group_by(feature_value) %>%
    summarize(mean_shap = mean(shap_value, na.rm = TRUE), .groups = 'drop')

  # Bar chart of mean SHAP per category, sorted by value
  plot <- ggplot(pdp_shap_summary, aes(x = reorder(feature_value, mean_shap), y = mean_shap, fill = feature_value)) +
    geom_bar(stat = "identity", show.legend = FALSE) +
    labs(
      x = if (!is.null(cat_label) && cat_label != "") cat_label else paste(feature_name, "Categories"),
      y = "Mean SHAP Value",
      title = paste("Partial Dependency Plot for", feature_name)
    ) +
    theme_minimal() +
    # "Listing Ctr" has too many categories to label, so its x-axis text,
    # title, and ticks are suppressed
    theme(
      panel.grid = element_blank(),
      axis.text.x = if (feature_name == "Listing Ctr") element_blank() else element_text(angle = 45, hjust = 1),
      axis.title.x = if (feature_name == "Listing Ctr") element_blank() else element_text(),
      axis.ticks.x = if (feature_name == "Listing Ctr") element_blank() else element_line(),
      axis.text.y = element_text(),
      axis.title.y = element_text()
    )

  # Display the plot
  print(plot)

  # Optionally save as PNG
  if (save_plot) {
    ggsave(paste0("images/dependence_plots_train/shap_plot_categorical_", feature_name, ".png"), plot, width = plot_width, height = plot_height)
  }
}
Final Shap Value Partial Dependency Plotting Function
Show the code
# Dispatch each requested training-set feature to the numeric or categorical
# PDP plotter.
# Args:
#   feature_names       - character vector of feature names to plot
#   pdp_data_train_df   - training data with the raw feature values
#   pdp_shap_train_data - matching per-row SHAP values
#   save_plots          - whether the plotters should write PNGs to disk
create_shap_pdp_plots_train <- function(feature_names, pdp_data_train_df, pdp_shap_train_data, save_plots = TRUE) {
  for (feature_name in feature_names) {
    # FIX: check the dataframe passed in (pdp_data_train_df); the original
    # referenced pdp_data_df, which is not defined in this function's scope.
    if (!feature_name %in% colnames(pdp_data_train_df)) {
      message(paste("Feature", feature_name, "not found in the data. Skipping..."))
      next
    }
    if (is.numeric(pdp_data_train_df[[feature_name]])) {
      # Numeric feature -> scatter-style PDP
      # FIX: forward the save_plots argument instead of hard-coding TRUE
      create_pdp_numeric_train(feature_name, pdp_data_train_df, pdp_shap_train_data, save_plots = save_plots)
    } else if (is.factor(pdp_data_train_df[[feature_name]]) || is.character(pdp_data_train_df[[feature_name]])) {
      # Categorical feature -> bar-chart PDP
      create_pdp_categorical_train(feature_name, pdp_data_train_df, pdp_shap_train_data,
                                   save_plot = save_plots, plot_width = 5, plot_height = 4)
    } else {
      message(paste("Unsupported data type for feature:", feature_name, "Skipping..."))
    }
  }
}
import pandas as pd
import numpy as np
# FIX: recall_score, f1_score, and brier_score_loss were used below but never
# imported, which raised NameError on the first stratified group.
from sklearn.metrics import (
    roc_auc_score, log_loss, average_precision_score, accuracy_score,
    precision_score, recall_score, f1_score, brier_score_loss,
)

# Wrap predictions/probabilities as pandas objects so they can be index-aligned
Y_PRED_HYBRID_series = pd.Series(Y_PRED_HYBRID)
Y_Pred_Proba_Positive_hybrid_series = pd.Series(Y_Pred_Proba_Positive_hybrid)

# Ground truth aligned to the test-set index
HYBRID_Y_TEST_DF = pd.DataFrame({"y_true": hybrid_y_test}, index=hybrid_X_test.index)

# Ensure index consistency (string indexes) across all datasets
hybrid_X_test.index = hybrid_X_test.index.astype(str)
Y_PRED_HYBRID_series.index = Y_PRED_HYBRID_series.index.astype(str)
Y_Pred_Proba_Positive_hybrid_series.index = Y_Pred_Proba_Positive_hybrid_series.index.astype(str)
HYBRID_Y_TEST_DF.index = HYBRID_Y_TEST_DF.index.astype(str)

# One-hot encoded categorical features to stratify the metrics over
one_hot_columns = [
    'Blood_Type_A', 'Blood_Type_AB', 'Blood_Type_B', 'Blood_Type_O',
    'Cereb_Vasc', 'CHD_NoSurgery', 'CHD_Surgery', "DCM", "Diabetes",
    "Dialysis", "ECMO_Reg", "Gender", "Inotrop", "Myocard",
    'Race_Asian', 'Race_Black', 'Race_Hispanic', 'Race_Other', 'Race_White',
    "VAD", "Ventilator"
]

stratified_metrics = []

for col in one_hot_columns:
    mask = hybrid_X_test.loc[:, col] == 1  # boolean mask for this category
    mask = mask[mask].index                # indices where the mask is True
    if len(mask) == 0:
        continue  # skip if no samples for this group

    # Keep only indices that exist in the prediction series
    valid_idx = mask.intersection(Y_PRED_HYBRID_series.index)

    y_true_subset = HYBRID_Y_TEST_DF.loc[valid_idx, "y_true"]
    y_pred_proba_subset = Y_Pred_Proba_Positive_hybrid_series.loc[valid_idx]
    y_pred_subset = Y_PRED_HYBRID_series.loc[valid_idx]

    # Drop rows whose ground truth is NaN (and keep the subsets aligned)
    non_nan_idx = y_true_subset.dropna().index
    y_true_subset = y_true_subset.loc[non_nan_idx]
    y_pred_proba_subset = y_pred_proba_subset.loc[non_nan_idx]
    y_pred_subset = y_pred_subset.loc[non_nan_idx]

    if y_pred_subset.isnull().sum() > 0:
        print(f"⚠️ Warning: {y_pred_subset.isnull().sum()} NaN values found in y_pred_subset for {col}, skipping this feature...")
        continue

    # FIX: sample_size is now always defined; previously it was only set
    # inside the two-class branch and was stale/undefined otherwise.
    sample_size = len(y_true_subset)

    # Ranking metrics need at least two distinct true classes
    if len(set(y_true_subset)) > 1:
        auc = roc_auc_score(y_true_subset, y_pred_proba_subset)
        logloss = log_loss(y_true_subset, y_pred_proba_subset)
        aupr = average_precision_score(y_true_subset, y_pred_proba_subset)
    else:
        auc, logloss, aupr = np.nan, np.nan, np.nan

    accuracy = accuracy_score(y_true_subset, y_pred_subset)
    precision = precision_score(y_true_subset, y_pred_subset, zero_division=0)
    recall = recall_score(y_true_subset, y_pred_subset)
    f1 = f1_score(y_true_subset, y_pred_subset)
    brier = brier_score_loss(y_true_subset, y_pred_proba_subset)

    stratified_metrics.append({
        "Feature": col,
        "AUC": auc,
        "Accuracy": accuracy,
        "Precision": precision,
        "Recall": recall,
        "F1 Score": f1,
        "Brier Score": brier,
        "Log Loss": logloss,
        "AUPR": aupr,
        "Sample Size": sample_size
    })

# Collect all per-group rows into one DataFrame
stratified_metrics_df = pd.DataFrame(stratified_metrics)
print("\n✅ Final Stratified Metrics DataFrame")
'data.frame': 1 obs. of 29 variables:
$ Model : chr "Hybrid_Catboost"
$ AUC : num 0.742
$ AUC CI Lower : num 0.691
$ AUC CI Upper : num 0.79
$ Brier Score : num 0.084
$ Brier Score CI Lower: num 0.0811
$ Brier Score CI Upper: num 0.0871
$ Accuracy : num 0.674
$ Accuracy CI Lower : num 0.646
$ Accuracy CI Upper : num 0.702
$ Log Loss : num 0.294
$ Log Loss CI Lower : num 0.281
$ Log Loss CI Upper : num 0.308
$ F1 Score : num 0.303
$ F1 Score CI Lower : num 0.266
$ F1 Score CI Upper : num 0.341
$ Precision : num 0.194
$ Precision CI Lower : num 0.17
$ Precision CI Upper : num 0.217
$ Recall : num 0.696
$ Recall CI Lower : num 0.598
$ Recall CI Upper : num 0.784
$ AUPR : num 0.284
$ AUPR CI Lower : num 0.221
$ AUPR CI Upper : num 0.365
$ True Negative (TN) : num 603
$ False Positive (FP) : num 295
$ False Negative (FN) : num 31
$ True Positive (TP) : num 71
- attr(*, "pandas.index")=RangeIndex(start=0, stop=1, step=1)