estimate censoring probabilities (#855)

topepo · hfrick · web-flow · commit 906071132e3e · 2023-02-14T15:55:46.000-05:00
* add reverse KM curve estimate

* documention uodate

* fix typos

* fix formula name

* Apply suggestions from code review

Co-authored-by: Hannah Frick &lt;hfrick@users.noreply.github.com&gt;

* put censoring functions in their own file

* generalize the censoring model interface for future use

* nocov and fix logic when NAs are there

* updated on reviewer code

* testing notes

* updated number

---------

Co-authored-by: Hannah Frick &lt;hfrick@users.noreply.github.com&gt;
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,6 +1,6 @@
 Package: parsnip
 Title: A Common API to Modeling and Analysis Functions
-Version: 1.0.3.9002
+Version: 1.0.3.9003
 Authors@R: c(
     person("Max", "Kuhn", , "max@rstudio.com", role = c("aut", "cre")),
     person("Davis", "Vaughan", , "davis@rstudio.com", role = "aut"),
@@ -54,6 +54,7 @@ Suggests:
     mgcv,
     modeldata,
     nlme,
+    prodlim,
     ranger (>= 0.12.0),
     remotes,
     rmarkdown,
diff --git a/NAMESPACE b/NAMESPACE
@@ -29,6 +29,7 @@ S3method(nullmodel,default)
 S3method(predict,"_elnet")
 S3method(predict,"_lognet")
 S3method(predict,"_multnet")
+S3method(predict,censoring_model_reverse_km)
 S3method(predict,model_fit)
 S3method(predict,model_spec)
 S3method(predict,nullmodel)
diff --git a/NEWS.md b/NEWS.md
@@ -1,5 +1,7 @@
 # parsnip (development version)
 
+* For censored regression models, a "reverse Kaplan-Meier" curve is computed for the censoring distribution. This can be used when evaluating this type of model (#855).
+
 # parsnip 1.0.3
 
 * Adds documentation and tuning infrastructure for the new `flexsurvspline` engine for the `survival_reg()` model specification from the `censored` package (@mattwarkentin, #831).
diff --git a/R/censoring_probs.R b/R/censoring_probs.R
@@ -0,0 +1,106 @@
+# nocov start
+# tested in tidymodels/extratests#67
+
+new_reverse_km_fit <-
+  function(formula,
+           object,
+           pkgs = character(0),
+           label = character(0),
+           extra_cls = character(0)) {
+    res <- list(formula = formula, fit = object, label = label, required_pkgs = pkgs)
+    class(res) <- c(paste0("censoring_model_", label), "censoring_model", extra_cls)
+    res
+  }
+
+# ------------------------------------------------------------------------------
+# estimate the reverse km curve for censored regression models
+
+reverse_km <- function(obj, eval_env) {
+  if (obj$mode != "censored regression") {
+    return(list())
+  }
+  rlang::check_installed("prodlim")
+
+  # Note: even when fit_xy() is called, eval_env will still have
+  # objects data and formula in them
+  f <- eval_env$formula
+  km_form <- stats::update(f, ~ 1)
+  cl <-
+    rlang::call2(
+      "prodlim",
+      formula = km_form,
+      .ns = "prodlim",
+      reverse = TRUE,
+      type = "surv",
+      x = FALSE,
+      data = rlang::expr(eval_env$data)
+    )
+
+  if (!is.null(eval_env$weights)) {
+    cl <- rlang::call_modify(cl, caseweights = rlang::expr(eval_env$weights))
+  }
+  rkm <- try(rlang::eval_tidy(cl), silent = TRUE)
+  new_reverse_km_fit(f, object = rkm, label = "reverse_km", pkgs = "prodlim")
+}
+
+# ------------------------------------------------------------------------------
+# Basic S3 methods
+
+print.censoring_model <- function(x, ...) {
+  cat(x$label, "model for predicting the probability of censoring\n")
+  invisible(x)
+}
+
+predict.censoring_model <- function(object, ...) {
+  rlang::abort(
+    paste("Don't know how to predict with a censoring model of type:", object$label)
+  )
+  invisible(NULL)
+}
+
+#' @export
+predict.censoring_model_reverse_km <- function(object, new_data = NULL, time, as_vector = FALSE, ...) {
+  rlang::check_installed("prodlim")
+
+  res <- rep(NA_real_, length(time))
+  if (length(time) == 0) {
+    return(res)
+  }
+
+  # Some time values might be NA (for Graf category 2)
+  is_na <- which(is.na(time))
+  if (length(is_na) > 0) {
+    time <- time[-is_na]
+  }
+
+  if (is.null(new_data)) {
+    tmp <-
+      purrr::map_dbl(time, ~ predict(object$fit, times = .x, type = "surv"))
+  } else {
+    tmp <-
+      purrr::map_dbl(time, ~ predict(object$fit, newdata = new_data, times = .x, type = "surv"))
+  }
+
+  zero_prob <- purrr::map_lgl(tmp, ~ !is.na(.x) && .x == 0)
+  if (any(zero_prob)) {
+    # Don't want censoring probabilities of zero so add an epsilon
+    # Either use 1/n or half of the minimum survival probability
+    n <- max(object$fit$n.risk)
+    half_min_surv_prob <- min(object$fit$surv[object$fit$surv > 0]) / 2
+    eps <- min(1 / n, half_min_surv_prob)
+    tmp[zero_prob] <- eps
+  }
+
+  if (length(is_na) > 0) {
+    res[-is_na] <- tmp
+  } else {
+    res <- tmp
+  }
+
+  if (!as_vector) {
+    res <- tibble::tibble(.prob_censored = unname(res))
+  }
+  res
+}
+
+# nocov end
diff --git a/R/fit.R b/R/fit.R
@@ -54,6 +54,12 @@
 #' `options(contrasts = c(unordered = "contr.helmert", ordered = "contr.poly"))`.
 #' See the help page for [stats::contr.treatment()] for more possible contrast
 #' types.
+#'
+#' For models with `"censored regression"` modes, an additional computation is
+#' executed and saved in the parsnip object. The `censor_probs` element contains
+#' a "reverse Kaplan-Meier" curve that models the probability of censoring. This
+#' may be used later to compute inverse probability censoring weights for
+#' performance measures.
 #' @examples
 #' # Although `glm()` only has a formula interface, different
 #' # methods for specifying the model can be used
@@ -206,6 +212,7 @@ fit.model_spec <-
 
         rlang::abort(glue::glue("{interfaces} is unknown."))
       )
+    res$censor_probs <- reverse_km(object, eval_env)
     model_classes <- class(res$fit)
     class(res) <- c(paste0("_", model_classes[1]), "model_fit")
     res
@@ -317,6 +324,7 @@ fit_xy.model_spec <-
           ),
         rlang::abort(glue::glue("{interfaces} is unknown."))
       )
+    res$censor_probs <- reverse_km(object, eval_env)
     model_classes <- class(res$fit)
     class(res) <- c(paste0("_", model_classes[1]), "model_fit")
     res
diff --git a/R/fit_helpers.R b/R/fit_helpers.R
@@ -197,5 +197,3 @@ xy_form <- function(object, env, control, ...) {
   res$preproc <- data_obj[c("x_var", "y_var")]
   res
 }
-
-
diff --git a/man/fit.Rd b/man/fit.Rd
diff --git a/tests/testthat/README.md b/tests/testthat/README.md
@@ -0,0 +1,2 @@
+Note that some functionality in parsnip is tested outside of the package. Due to a high degree of dependencies, many additional tests are in the [extratexts](https://github.com/tidymodels/extratests/tree/main/tests/testthat) repository. These are run nightly with CRAN and Github versions of parsnip as well as other tidymodels packages. 
+

Original file line number	Diff line number	Diff line change
`@@ -197,5 +197,3 @@ xy_form <- function(object, env, control, ...) {`
`197`	`197`	`res$preproc <- data_obj[c("x_var", "y_var")]`
`198`	`198`	`res`
`199`	`199`	`}`
`200`		`-`
`201`		`-`
Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,2 @@`
	`1`	`+Note that some functionality in parsnip is tested outside of the package. Due to a high degree of dependencies, many additional tests are in the [extratexts](https://github.com/tidymodels/extratests/tree/main/tests/testthat) repository. These are run nightly with CRAN and Github versions of parsnip as well as other tidymodels packages.`
	`2`	`+`