Re-registration of model information (#664)

topepo · DavisVaughan · hfrick · web-flow · commit f8310e59d849 · 2022-02-22T19:56:43.000-05:00
* changes for #653
* minor refactoring
* unused variable
* update news
* unit tests
* ugly solution for comparing model info within list columns
* Apply suggestions from code review

Co-authored-by: Davis Vaughan <davis@rstudio.com>
Co-authored-by: Hannah Frick <hfrick@users.noreply.github.com>

* simplified testing of model info

Co-authored-by: Davis Vaughan <davis@rstudio.com>
Co-authored-by: Hannah Frick <hfrick@users.noreply.github.com>
diff --git a/NEWS.md b/NEWS.md
@@ -46,6 +46,10 @@
 
 * xgboost engines now use the new `iterationrange` parameter instead of the deprecated `ntreelimit` (#656).  
 
+## Developer
+
+* Model information can be re-registered as long as the information being registered is the same. This is helpful for packages that add new engines and use `devtools::load_all()` (#653).
+
 
 # parsnip 0.1.7
 
diff --git a/R/aaa_models.R b/R/aaa_models.R
@@ -540,7 +540,7 @@ set_new_model <- function(model) {
 
   current <- get_model_env()
 
-  set_env_val("models", c(current$models, model))
+  set_env_val("models", unique(c(current$models, model)))
   set_env_val(model, dplyr::tibble(engine = character(0), mode = character(0)))
   set_env_val(
     paste0(model, "_pkgs"),
@@ -674,12 +674,12 @@ set_dependency <- function(model, eng, pkg = "parsnip", mode = NULL) {
   check_eng_val(eng)
   check_pkg_val(pkg)
 
-  current <- get_model_env()
   model_info <- get_from_env(model)
   pkg_info <- get_from_env(paste0(model, "_pkgs"))
 
   # ----------------------------------------------------------------------------
   # Check engine
+
   has_engine <-
     model_info %>%
     dplyr::distinct(engine) %>%
@@ -750,37 +750,77 @@ get_dependency <- function(model) {
 
 # ------------------------------------------------------------------------------
 
-#' @rdname set_new_model
-#' @keywords internal
-#' @export
-set_fit <- function(model, mode, eng, value) {
-  check_model_exists(model)
-  check_eng_val(eng)
-  check_spec_mode_engine_val(model, eng, mode)
-  check_fit_info(value)
+# This will be used to see if the same information is being registered for the
+# same model/mode/engine (and prediction type). If it already exists and the
+# new information is different, fail with a message. See issue #653
+is_discordant_info <- function(model, mode, eng, candidate,
+                            pred_type = NULL, component = "fit") {
+  current <- get_from_env(paste0(model, "_", component))
 
-  current <- get_model_env()
-  model_info <- get_from_env(model)
-  old_fits <- get_from_env(paste0(model, "_fit"))
+  # For older versions of parsnip before set_encoding()
+  new_encoding <- is.null(current) & component == "encoding"
+
+  if (new_encoding) {
+    return(TRUE)
+  } else {
+    current <-  dplyr::filter(current, engine == eng & mode == !!mode)
+  }
+
+  if (component == "predict" & !is.null(pred_type)) {
+
+    current <- dplyr::filter(current, type == pred_type)
+    p_type <- paste0("and prediction type '", pred_type, "'")
+  } else {
+    p_type <- ""
+  }
 
+  if (nrow(current) == 0) {
+    return(TRUE)
+  }
+
+  same_info <- isTRUE(all.equal(current, candidate, check.environment = FALSE))
+
+  if (!same_info) {
+    rlang::abort(
+      glue::glue(
+        "The combination of engine '{eng}' and mode '{mode}' {p_type} already has ",
+        "{component} data for model '{model}' and the new information being ",
+        "registered is different."
+      )
+    )
+  }
+
+  FALSE
+}
+
+# Also check that the engine/mode combination has been registered for the model
+
+check_unregistered <- function(model, mode, eng) {
+  model_info <- get_from_env(model)
   has_engine <-
     model_info %>%
     dplyr::filter(engine == eng & mode == !!mode) %>%
     nrow()
   if (has_engine != 1) {
-    rlang::abort(glue::glue("The combination of '{eng}' and mode '{mode}' has not ",
-                            "been registered for model '{model}'."))
+    rlang::abort(
+      glue::glue("The combination of engine '{eng}' and mode '{mode}' has not ",
+                 "been registered for model '{model}'.")
+    )
   }
+  invisible(NULL)
+}
 
-  has_fit <-
-    old_fits %>%
-    dplyr::filter(engine == eng & mode == !!mode) %>%
-    nrow()
 
-  if (has_fit > 0) {
-    rlang::abort(glue::glue("The combination of '{eng}' and mode '{mode}' ",
-                            "already has a fit component for model '{model}'."))
-  }
+
+#' @rdname set_new_model
+#' @keywords internal
+#' @export
+set_fit <- function(model, mode, eng, value) {
+  check_model_exists(model)
+  check_eng_val(eng)
+  check_spec_mode_engine_val(model, eng, mode)
+  check_fit_info(value)
+  check_unregistered(model, mode, eng)
 
   new_fit <-
     dplyr::tibble(
@@ -789,6 +829,11 @@ set_fit <- function(model, mode, eng, value) {
       value = list(value)
     )
 
+  if (!is_discordant_info(model, mode, eng, new_fit)) {
+    return(invisible(NULL))
+  }
+
+  old_fits <- get_from_env(paste0(model, "_fit"))
   updated <- try(dplyr::bind_rows(old_fits, new_fit), silent = TRUE)
   if (inherits(updated, "try-error")) {
     rlang::abort("An error occured when adding the new fit module.")
@@ -824,39 +869,25 @@ set_pred <- function(model, mode, eng, type, value) {
   check_eng_val(eng)
   check_spec_mode_engine_val(model, eng, mode)
   check_pred_info(value, type)
+  check_unregistered(model, mode, eng)
 
-  current <- get_model_env()
   model_info <- get_from_env(model)
-  old_fits <- get_from_env(paste0(model, "_predict"))
-
-  has_engine <-
-    model_info %>%
-    dplyr::filter(engine == eng & mode == !!mode) %>%
-    nrow()
-  if (has_engine != 1) {
-    rlang::abort(glue::glue("The combination of '{eng}' and mode '{mode}'",
-                            "has not been registered for model '{model}'."))
-  }
-
-  has_pred <-
-    old_fits %>%
-    dplyr::filter(engine == eng & mode == !!mode & type == !!type) %>%
-    nrow()
-  if (has_pred > 0) {
-    rlang::abort(glue::glue("The combination of '{eng}', mode '{mode}', ",
-                            "and type '{type}' already has a prediction component",
-                            "for model '{model}'."))
-  }
 
-  new_fit <-
+  new_pred <-
     dplyr::tibble(
       engine = eng,
       mode = mode,
       type = type,
       value = list(value)
     )
 
-  updated <- try(dplyr::bind_rows(old_fits, new_fit), silent = TRUE)
+  pred_check <- is_discordant_info(model, mode, eng, new_pred, pred_type = type, component = "predict")
+  if (!pred_check) {
+    return(invisible(NULL))
+  }
+
+  old_pred <- get_from_env(paste0(model, "_predict"))
+  updated <- try(dplyr::bind_rows(old_pred, new_pred), silent = TRUE)
   if (inherits(updated, "try-error")) {
     rlang::abort("An error occured when adding the new fit module.")
   }
@@ -1032,25 +1063,15 @@ set_encoding <- function(model, mode, eng, options) {
   options <- tibble::as_tibble(options)
   new_values <- dplyr::bind_cols(keys, options)
 
-
-  current_db_list <- ls(envir = get_model_env())
-  nm <- paste(model, "encoding", sep = "_")
-  if (any(current_db_list == nm)) {
-    current <- get_from_env(nm)
-    dup_check <-
-      current %>%
-      dplyr::inner_join(
-        new_values,
-        by = c("model", "engine", "mode", "predictor_indicators")
-      )
-    if (nrow(dup_check)) {
-      rlang::abort(glue::glue("Engine '{eng}' and mode '{mode}' already have defined encodings for model '{model}'."))
-    }
-
-  } else {
-    current <- NULL
+  enc_check <- is_discordant_info(model, mode, eng, new_values, component = "encoding")
+  if (!enc_check) {
+    return(invisible(NULL))
   }
 
+  # Allow for older versions before set_encoding() was created
+  nm <- paste0(model, "_encoding")
+  current <- get_from_env(nm)
+
   db_values <- dplyr::bind_rows(current, new_values)
   set_env_val(nm, db_values)
 
diff --git a/tests/testthat/test_re_registration.R b/tests/testthat/test_re_registration.R
@@ -0,0 +1,158 @@
+# For issue #653 we want to be able to re-run the registration code as
+# long as the information being registered is the same.
+
+
+test_that('re-registration of mode', {
+  old_val <- get_from_env("bart_modes")
+  expect_error(set_model_mode("bart", "classification"), regexp = NA)
+  new_val <- get_from_env("bart_modes")
+  expect_equal(old_val, new_val)
+})
+
+test_that('re-registration of engine', {
+  old_val <- get_from_env("bart")
+  expect_error(
+    set_model_engine("bart", mode = "classification", eng = "dbarts"),
+    regexp = NA
+  )
+  new_val <- get_from_env("bart")
+  expect_equal(old_val, new_val)
+})
+
+
+test_that('re-registration of package dependencies', {
+  old_val <- get_from_env("bart_pkgs")
+  expect_error(
+    set_dependency("bart", "dbarts", "dbarts"),
+    regexp = NA
+  )
+  new_val <- get_from_env("bart_pkgs")
+  expect_equal(old_val, new_val)
+})
+
+test_that('re-registration of fit information', {
+  old_val <- get_from_env("bart_fit")
+  expect_error(
+    set_fit(
+      model = "bart",
+      eng = "dbarts",
+      mode = "regression",
+      value = list(
+        interface = "data.frame",
+        data = c(x = "x.train", y = "y.train"),
+        protect = c("x", "y"),
+        func = c(pkg = "dbarts", fun = "bart"),
+        defaults = list(verbose = FALSE, keeptrees = TRUE, keepcall = FALSE)
+      )
+    ),
+    regexp = NA
+  )
+  new_val <- get_from_env("bart_fit")
+  expect_equal(old_val, new_val)
+
+  # Fail if newly registered data is different than existing
+  # `verbose` option is different here
+  expect_error(
+    set_fit(
+      model = "bart",
+      eng = "dbarts",
+      mode = "regression",
+      value = list(
+        interface = "data.frame",
+        data = c(x = "x.train", y = "y.train"),
+        protect = c("x", "y"),
+        func = c(pkg = "dbarts", fun = "bart"),
+        defaults = list(verbose = TRUE, keeptrees = TRUE, keepcall = FALSE)
+      )
+    ),
+    "new information being registered is different"
+  )
+})
+
+test_that('re-registration of encoding information', {
+  old_val <- get_from_env("bart_encoding")
+  expect_error(
+    set_encoding(
+      model = "bart",
+      eng = "dbarts",
+      mode = "regression",
+      options = list(
+        predictor_indicators = "none",
+        compute_intercept = FALSE,
+        remove_intercept = FALSE,
+        allow_sparse_x = FALSE
+      )
+    ),
+    regexp = NA
+  )
+  new_val <- get_from_env("bart_encoding")
+  expect_equal(old_val, new_val)
+
+  # Fail if newly registered data is different than existing
+  # `compute_intercept` option is different here
+  expect_error(
+    set_encoding(
+      model = "bart",
+      eng = "dbarts",
+      mode = "regression",
+      options = list(
+        predictor_indicators = "none",
+        compute_intercept = TRUE,
+        remove_intercept = FALSE,
+        allow_sparse_x = FALSE
+      )
+    ),
+    "new information being registered is different"
+  )
+})
+
+
+test_that('re-registration of prediction information', {
+  old_val <- get_from_env("bart_predict")
+  expect_error(
+    set_pred(
+      model = "bart",
+      eng = "dbarts",
+      mode = "regression",
+      type = "numeric",
+      value = list(
+        pre = NULL,
+        post = NULL,
+        func = c(pkg = "parsnip", fun = "dbart_predict_calc"),
+        args =
+          list(
+            obj = quote(object),
+            new_data =  quote(new_data),
+            type = "numeric"
+          )
+      )
+    ),
+    regexp = NA
+  )
+  new_val <- get_from_env("bart_predict")
+  expect_equal(old_val, new_val)
+
+  # Fail if newly registered data is different than existing
+  # `type` option is different here
+  expect_error(
+    set_pred(
+      model = "bart",
+      eng = "dbarts",
+      mode = "regression",
+      type = "numeric",
+      value = list(
+        pre = NULL,
+        post = NULL,
+        func = c(pkg = "parsnip", fun = "dbart_predict_calc"),
+        args =
+          list(
+            obj = quote(object),
+            new_data =  quote(new_data),
+            type = "tuba"
+          )
+      )
+    ),
+    "new information being registered is different"
+  )
+})
+
diff --git a/tests/testthat/test_registration.R b/tests/testthat/test_registration.R