# Reproducible 10-fold cross-validation, stratified on the outcome
set.seed(2024)
cv_folds <- rsample::vfold_cv(data_train, v = 10, strata = outcome)
Predictive Modeling
Create baseline preprocessing recipe and set predictor variables. All models start with this recipe.
Sets predictors and outcome variables
Converts Diabetes to a binary indicator (1 = any diabetes, 0 = none or unknown)
Converts Functional Status to numeric, adds indicator for baby and unknown
Cleans eGFR: removes outliers, code based on kidney risk, binary for eGFR < 60
Creates binary for Albumin < 3
Show the code
library(tidymodels)

# Baseline preprocessing recipe shared by every model:
# declares the outcome/predictors, drops non-modeling variables, and derives
# cleaned features for diabetes, functional status, eGFR, and albumin.
base_rec <-
  #: Set formula
  # head(): the recipe only needs a template of the columns here;
  # the full training data is supplied later at prep()/fit() time
  recipe(outcome ~ ., data = head(data_train)) %>%
  # step_naomit(outcome) %>% # remove rows with missing outcome
  step_mutate(outcome = factor(outcome, levels = c(1, 0)), skip = TRUE) %>%
  #: Remove variables from all models
  step_rm(WL_ID_CODE) %>%
  step_rm(
    matches("DONCRIT_.+_AGE"), matches("DONCRIT_.+_HGT"),
    matches("DONCRIT_.+_WGT"), matches("DONCRIT_.+_MILE"),
    matches("DONCRIT_"), # remove all DONCRIT variables. They are primarily
                         # recorded for a few UNOS REGIONS.
    # --- not clinical
    CITIZENSHIP,
    HEMODYNAMICS_CO,   # lots of missing
    CEREB_VASC,
    # ---
    CAND_DIAG_LISTING, # use CAND_DIAG instead
    CAND_DIAG_CODE,    # use CAND_DIAG instead
    MOST_RCNT_CREAT,   # use eGFR instead
    VAD_DEVICE_TY_TCR, # not enough info
    WL_DT,             # use LIST_YR for temporal information
    #------------------------------------- Substitutes
    # LC_effect, p_refusals, # use median_refusals instead
    #-------------------------------------
    # LISTING_CTR_CODE, # Let individual models choose
    REGION,            # Some regions only have 1-2 centers
    LIFE_SUPPORT_OTHER,
    PGE_TCR,
    # NOTE(review): the WL_DT comment says "use LIST_YR" for temporal
    # information, yet LIST_YR is also removed here -- confirm that dropping
    # all temporal information is intentional
    LIST_YR
  ) %>%
  #: Additional cleaning
  step_mutate(
    # convert Diabetes to Yes = 1, No = 0 (Unknown is lumped with None)
    DIAB = case_match(DIAB, "None" ~ 0L, "Unknown" ~ 0L, .default = 1L)
  ) %>%
  #: Convert Functional status to numeric; add indicators for missing and baby
  step_mutate(
    # pull the percentage out of strings like "50% ..." ; missing -> 0
    FUNC_STAT_NUM = str_extract(FUNC_STAT_CAND_REG, "(\\d+)%", group = 1) %>%
      as.numeric() %>%
      coalesce(0),
    FUNC_STAT_UNKNOWN = ifelse(FUNC_STAT_CAND_REG == "Unknown", 1L, 0L),
    CAND_UNDER_1 = ifelse(
      FUNC_STAT_CAND_REG == "Not Applicable (patient < 1 year old)", 1L, 0L
    )
  ) %>%
  step_rm(FUNC_STAT_CAND_REG) %>% # remove the original variable
  #: Cutoffs for eGFR (Kidney Disease) and Albumin (Nutrition)
  step_mutate(
    eGFR = pmin(eGFR, 250), # fix outliers for non-tree models
    eGFR_CODED = case_when(
      eGFR > 120 ~ 0,
      eGFR >= 90 ~ 1,
      eGFR >= 60 ~ 2,
      eGFR >= 45 ~ 3,
      eGFR >= 30 ~ 3.5,
      eGFR >= 15 ~ 4,
      eGFR < 15  ~ 5,
      .default = 0 # if missing, assume eGFR is good
    ),
    # if missing, assume eGFR is good (above 60)
    eGFR_UNDER_60 = ifelse(eGFR < 60, 1, 0) %>% coalesce(0),
    # if missing, assume Albumin is good (above 3)
    ALBUM_UNDER_3 = ifelse(TOT_SERUM_ALBUM < 3, 1, 0) %>% coalesce(0)
  )
#: outliers in median refusals
# step_mutate(median_refusals = pmin(median_refusals, 20))
Logistic Regression
1. Tidymodels specification
Lasso logistic regression model.
Remove LISTING_CTR_CODE
Add new missing indicator feature for all variables with missing
Convert Functional Status to number {0, 10, …, 100}. Add indicator for Unknown status.
One-hot encode all categorical predictors. For variables with {Yes, No, Unknown}, only keep the Yes column. This lumps the Unknown with No.
Truncate median_refusals to 20.
Impute all missing values with median.
Create new features by coding eGFR into stages of chronic kidney failure.
Create binary TOT_SERUM_ALBUM < 3 indicator.
Add polynomial terms for the numeric features.
Show the code
library(tidymodels)

# Model specification: Lasso penalized logistic regression
lasso_spec <- logistic_reg() %>%
  set_engine("glmnet") %>%
  set_args(
    mixture = 1,     # 1 = lasso, 0 = ridge
    penalty = tune()
  )

# Recipe:
lasso_rec <- base_rec %>%
  #: Remove additional variables for this model
  step_rm(
    # LISTING_CTR_CODE,
  ) %>%
  #: Add additional variables to represent missing predictors
  step_indicate_na(all_predictors()) %>%
  #: Convert categorical predictors to dummy
  step_dummy(all_nominal_predictors(), one_hot = TRUE) %>% # one-hot
  step_dummy(all_ordered_predictors(), one_hot = TRUE) %>% # one-hot
  step_rm(ends_with("_No")) %>%      # removes the No level (Yes is binary)
  step_rm(ends_with("_Unknown")) %>% # removes the Unknown level (Yes is binary).
  # This effectively treats "Unknown" same as "No"
  #: Impute missing values
  step_impute_median(all_numeric_predictors()) %>%
  #: Add polynomial terms
  step_poly(
    AGE, WEIGHT_KG, HEIGHT_CM, BMI, BSA, eGFR,
    # TOT_SERUM_ALBUM,
    FUNC_STAT_NUM,
    # pedhrtx_prev_yr,
    # median_refusals,
    # LC_effect
    degree = 2
  ) %>%
  #: Remove all zero variance predictors (e.g., from step_indicate_na())
  step_zv(all_predictors()) %>%
  #: Scale all predictors for variable importance scoring
  step_scale(all_numeric_predictors())

# Workflow:
lasso_wflow <- workflow(preprocessor = lasso_rec, spec = lasso_spec)
# Model specification: Random Forest
rf_spec <- rand_forest() %>%
  set_mode("classification") %>%
  set_engine(
    "ranger",
    seed = 2024,
    importance = "impurity", # "none",
    num.threads = 8
  ) %>%
  set_args(
    mtry = tune(),
    trees = 2000,
    min_n = 2
  )

# Recipe:
rf_rec <- base_rec %>%
  #: Remove additional variables for this model
  step_rm(
    # LISTING_CTR_CODE,
  ) %>%
  #: Add additional variables to represent missing predictors
  step_indicate_na(all_predictors()) %>%
  #: Convert categorical predictors to dummy
  step_dummy(all_nominal_predictors(), one_hot = TRUE) %>% # one-hot
  # step_dummy(all_ordered_predictors(), one_hot = TRUE) %>% # one-hot
  step_rm(ends_with("_No")) %>%      # removes the No level (Yes is binary)
  step_rm(ends_with("_Unknown")) %>% # removes the Unknown level (Yes is binary).
  # This effectively treats "Unknown" same as "No"
  #: Impute missing values
  step_impute_median(all_numeric_predictors()) %>%
  #: Remove all zero variance predictors (e.g., from step_indicate_na())
  step_zv(all_predictors())

# Workflow:
rf_wflow <- workflow(preprocessor = rf_rec, spec = rf_spec)
2. Tune mtry
Use OOB observations for tuning instead of cross-validation.
Show the code
# Pre-tuning: use the OOB Brier score to seed the full cross-validation
# grid search. This is used to speed along tune_grid().

# Fit the RF workflow at a single `mtry` and return its OOB error as a
# one-row tibble. According to help(ranger), the Brier score is the OOB
# error metric for probability forests.
oob_brier <- function(mtry) {
  # avoid naming the local `fit` -- it would shadow the generic fit()
  fitted_wf <- rf_wflow %>%
    finalize_workflow(list(mtry = mtry)) %>%
    fit(data_train)
  tibble(
    mtry,
    oob_error = fitted_wf %>%
      extract_fit_engine() %>%
      pluck("prediction.error")
  )
}

# Number of features after preprocessing
num_cols <- prep(rf_rec, data_train)$term_info %>% nrow()

# Max mtry to try: 2 * sqrt(p), capped at p
mtry_max <- min(num_cols, 2 * sqrt(num_cols)) %>% floor()

# Candidate mtry values. Spell out `length.out` -- `length =` relied on
# partial argument matching, which is fragile.
mtry_oob_grid <- seq(1, mtry_max, length.out = 50) %>%
  floor() %>%
  unique()

# OOB performance for each candidate mtry
oob_perf <- map_df(mtry_oob_grid, oob_brier)

# Keep the best 5 mtry values for the CV grid search.
# NOTE(review): slice_min() keeps ties, so this can return more than 5 rows.
mtry_grid <- oob_perf %>%
  slice_min(oob_error, n = 5) %>%
  select(mtry)
Fixing learn_rate (eta) at 0.10 and sample_size at 0.80, and tuning tree_depth and trees. Fixing learn_rate and tuning only the number of trees is efficient because it exploits the model’s multi-predict capability.
Remove LISTING_CTR_CODE
One-hot encode all nominal predictors. For variables with {Yes, No, Unknown}, only keep the Yes column. This lumps the Unknown with No.
Let xgboost handle missing values
Show the code
library(xgboost)
library(tidymodels)

# Model specification: XGBoost
bt_spec <- boost_tree() %>%
  set_mode("classification") %>%
  set_engine("xgboost", nthread = 8) %>%
  set_args(
    trees = tune(),
    tree_depth = tune(),
    learn_rate = 0.10, # fixed
    sample_size = .80  # fixed
  )

# Recipe:
# Let xgboost handle missing values internally; does not impute
bt_rec <- base_rec %>%
  step_rm(
    # LISTING_CTR_CODE,
  ) %>%
  # step_dummy(LISTING_CTR_CODE, one_hot = TRUE) %>%
  #: Convert categorical predictors to dummy
  step_dummy(all_nominal_predictors(), one_hot = TRUE) %>% # one-hot
  # step_dummy(all_ordered_predictors(), one_hot = TRUE) %>% # one-hot
  step_rm(ends_with("_No")) %>%      # removes the No level (Yes is binary)
  step_rm(ends_with("_Unknown")) %>% # removes the Unknown level (Yes is binary).
  # This effectively treats "Unknown" same as "No"
  #: Remove all zero variance predictors (e.g., from step_indicate_na())
  step_zv(all_predictors())

# Workflow:
bt_wflow <- workflow(preprocessor = bt_rec, spec = bt_spec)
2. Tuning
Show the code
# Tuning grid: use fewer trees at larger depths
bt_grid <- bind_rows(
  tibble(trees = seq(25, 300, by = 5), tree_depth = 1),
  tibble(trees = seq(10, 150, by = 5), tree_depth = 2),
  tibble(trees = seq(10, 75, by = 5), tree_depth = 3),
  tibble(trees = seq(10, 50, by = 5), tree_depth = 4)
)
# expand_grid(trees = seq(25, 250, by = 10), tree_depth = 1:4)

#: don't use bayes here since it won't exploit the multi-predict efficiency
set.seed(1000)
tune_res <- tune_grid(
  object = bt_wflow,
  resamples = cv_folds,
  grid = bt_grid,
  metrics = metric_set(roc_auc, brier_class, mn_log_loss, accuracy),
  control = control_grid(verbose = FALSE)
)
3. CV performance
Show the code
# Best hyperparameter combinations ranked by CV ROC AUC
show_best(tune_res, metric = "roc_auc")
Show the code
# CV AUC as a function of the number of trees, one curve per tree depth
tune_res %>%
  collect_metrics() %>%
  filter(.metric == "roc_auc") %>%
  arrange(trees) %>%
  ggplot(aes(trees, mean, color = factor(tree_depth))) +
  geom_point() +
  geom_line() +
  labs(x = "Number of Trees", y = "Avg AUC", color = "tree depth")
4. Final model fit
Show the code
# Final model fit: best (trees, tree_depth) by CV ROC AUC.
# Use `<-` inside the parentheses: the R grammar only guarantees `=`
# assignment at top level or inside braces, while `(x <- ...)` reliably
# assigns and prints the selection.
(bt_tune <- tune_res %>% select_best(metric = "roc_auc"))
# Predictor names that survive the base recipe AND exist in the raw data
features <- prep(base_rec, data_train) %>%
  bake(new_data = NULL, all_predictors()) %>%
  colnames() %>%
  intersect(colnames(data_train))

# Plot the partial (additive) effect of one variable for both fitted models.
# `var` may be a bare name or a string (walk() below passes strings).
plot_multi_effects <- function(var) {
  var <- rlang::ensym(var)
  df <- bind_rows(
    lasso = get_additive_effects(var, model = lasso_fit, data = data_train),
    xgboost = get_additive_effects(var, model = bt_fit, data = data_train),
    .id = "model"
  ) %>%
    rename(x = !!var, y = eta)
  rug_data <- df %>% filter(!is.na(n))
  n_bks <- nrow(rug_data)
  # scalar conditions: use || / && (short-circuit), not the vectorized | / &
  categorical <- is.character(df$x) || is.factor(df$x)
  if (n_bks > 6 && !categorical) {
    # many distinct numeric values: draw the effect as a curve
    # plt = plot_numeric_effects(df[[1]], df[[2]], var, rug_data)
    plt <- ggplot(df) +
      geom_hline(yintercept = 0, color = "orange") +
      geom_line(aes(x, y, color = model))
  } else {
    # few values or categorical: dodge bars per model
    # plt = plot_categorical_effects(df[[1]], df[[2]], var, rug_data)
    rug_data <- rug_data %>% mutate(x = as.factor(x))
    plt <- ggplot(rug_data) +
      geom_hline(yintercept = 0, color = "orange") +
      geom_col(aes(x, y, fill = model), width = 1/3, position = "dodge") +
      scale_x_discrete(label = scales::label_wrap(15))
  }
  print(
    plt +
      geom_rug(
        data = rug_data %>% distinct(x, n),
        aes(x, linewidth = n),
        show.legend = FALSE, sides = "b", alpha = .25
      ) +
      labs(x = as.character(var), y = "partial effect") +
      scale_y_continuous(breaks = seq(-10, 10, by = .25)) +
      scale_color_brewer(type = "qual", palette = "Dark2") +
      scale_fill_brewer(type = "qual", palette = "Dark2") +
      coord_cartesian(ylim = c(-1, 1)) +
      theme(
        # NOTE(review): numeric legend.position is deprecated in
        # ggplot2 >= 3.5 (use legend.position.inside) -- kept for
        # compatibility with the ggplot2 version this doc was built with
        legend.position = c(0.02, 0.98),
        legend.justification = c("left", "top"),
        legend.title = element_blank()
      )
  )
}

walk(features, plot_multi_effects)
Feature Importance
The feature importance metric is the mean absolute partial effect (similar to SHAP importance).
Show the code
# Mean Absolute Effect
# Per-model importance of one variable: the mean absolute partial effect,
# weighted by the number of observations n at each value of the variable.
# Returns a tibble with columns: model, Importance.
feature_importance <- function(var) {
  var <- rlang::ensym(var)
  # return the pipeline directly; the original assigned it to a dead local
  # `df`, returning the value only via the assignment's (invisible) result
  bind_rows(
    lasso = get_additive_effects(var, model = lasso_fit, data = data_train),
    xgboost = get_additive_effects(var, model = bt_fit, data = data_train),
    .id = "model"
  ) %>%
    rename(x = !!var, y = eta) %>%
    filter(!is.na(n)) %>%
    group_by(model) %>%
    summarize(Importance = sum(n * abs(y)) / sum(n))
}

# Importance of every feature, plotted side-by-side per model
map(set_names(features), feature_importance) %>%
  bind_rows(.id = "Variable") %>%
  mutate(Variable = fct_reorder(Variable, Importance, .fun = "mean")) %>%
  ggplot(aes(x = Importance, y = Variable, fill = model)) +
  geom_col(position = "dodge") +
  scale_fill_brewer(type = "qual", palette = "Dark2")
Center Level Performance
This shows the predictive bias: test survival - predicted survival, by listing center. The top centers have the largest volume (n). Removed centers with \(n \le 2\).
As an example, the third center from the top (19468) has a bias of about -0.10. This means that the actual survival in this center was 10% lower than the models predicted.
While I’m not too concerned about the bias in the low volume centers (bottom on plot), there does appear to be modest unaccounted center effects.