|
22 | 22 | #' } |
23 | 23 | #' These arguments are converted to their specific names at the |
24 | 24 | #' time that the model is fit. Other options and arguments can be |
25 | | -#' set using the `others` argument. If left to their defaults |
| 25 | +#' set using the `...` argument. If left to their defaults |
26 | 26 | #' here (`NULL`), the values are taken from the underlying model |
27 | 27 | #' functions. If parameters need to be modified, `update` can be used |
28 | 28 | #' in lieu of recreating the object from scratch. |
29 | 29 | #' |
30 | 30 | #' @param mode A single character string for the type of model. |
31 | 31 | #' Possible values for this model are "unknown", "regression", or |
32 | 32 | #' "classification". |
33 | | -#' @param others A named list of arguments to be used by the |
34 | | -#' underlying models (e.g., `xgboost::xgb.train`, etc.). . |
35 | 33 | #' @param mtry A number for the number (or proportion) of predictors that will |
36 | 34 | #' be randomly sampled at each split when creating the tree models (`xgboost` |
37 | 35 | #' only). |
|
48 | 46 | #' @param sample_size A number for the number (or proportion) of data that is |
49 | 47 | #' exposed to the fitting routine. For `xgboost`, the sampling is done at |
50 | 48 | #' each iteration while `C5.0` samples once during training. |
51 | | -#' @param ... Used for method consistency. Any arguments passed to |
52 | | -#' the ellipses will result in an error. Use `others` instead. |
| 49 | +#' @param ... Other arguments to pass to the specific engine's |
| 50 | +#' model fit function (see the Engine Details section below). This |
| 51 | +#' should not include arguments defined by the main parameters to |
| 52 | +#' this function. For the `update` function, the ellipses can |
| 53 | +#' contain the primary arguments or any others. |
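For instance, under the new interface an engine option such as xgboost's `nthread` can ride along in the ellipses next to the main arguments; a minimal sketch (the engine argument name comes from xgboost, not from this package):

```r
# Main arguments are named; engine-specific options such as
# xgboost's `nthread` are passed through `...`.
boost_tree(mode = "regression", trees = 200, nthread = 4)
```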
53 | 54 | #' @details |
54 | 55 | #' The data given to the function are not saved and are only used |
55 | 56 | #' to determine the _mode_ of the model. For `boost_tree`, the |
|
62 | 63 | #' \item \pkg{Spark}: `"spark"` |
63 | 64 | #' } |
64 | 65 | #' |
65 | | -#' Main parameter arguments (and those in `others`) can avoid |
| 66 | +#' Main parameter arguments (and those in `...`) can avoid |
66 | 67 | #' evaluation until the underlying function is executed by wrapping the |
67 | 68 | #' argument in [rlang::expr()] (e.g. `mtry = expr(floor(sqrt(p)))`). |
68 | 69 | #' |
| 70 | +#' |
| 71 | +#' @section Engine Details: |
| 72 | +#' |
69 | 73 | #' Engines may have pre-set default arguments when executing the |
70 | | -#' model fit call. These can be changed by using the `others` |
| 74 | +#' model fit call. These can be changed by using the `...` |
71 | 75 | #' argument to pass in the preferred values. For this type of |
72 | 76 | #' model, the template of the fit calls are: |
73 | 77 | #' |
|
114 | 118 |
|
115 | 119 | boost_tree <- |
116 | 120 | function(mode = "unknown", |
117 | | - ..., |
118 | 121 | mtry = NULL, trees = NULL, min_n = NULL, |
119 | 122 | tree_depth = NULL, learn_rate = NULL, |
120 | 123 | loss_reduction = NULL, |
121 | 124 | sample_size = NULL, |
122 | | - others = list()) { |
123 | | - check_empty_ellipse(...) |
| 125 | + ...) { |
| 126 | + |
| 127 | + others <- enquos(...) |
| 128 | + |
| 129 | + args <- list( |
| 130 | + mtry = enquo(mtry), |
| 131 | + trees = enquo(trees), |
| 132 | + min_n = enquo(min_n), |
| 133 | + tree_depth = enquo(tree_depth), |
| 134 | + learn_rate = enquo(learn_rate), |
| 135 | + loss_reduction = enquo(loss_reduction), |
| 136 | + sample_size = enquo(sample_size) |
| 137 | + ) |
124 | 138 |
|
125 | 139 | if (!(mode %in% boost_tree_modes)) |
126 | 140 | stop("`mode` should be one of: ", |
127 | 141 | paste0("'", boost_tree_modes, "'", collapse = ", "), |
128 | 142 | call. = FALSE) |
129 | 143 |
|
130 | | - if (is.numeric(trees) && trees < 0) |
131 | | - stop("`trees` should be >= 1", call. = FALSE) |
132 | | - if (is.numeric(sample_size) && (sample_size < 0 | sample_size > 1)) |
133 | | - stop("`sample_size` should be within [0,1]", call. = FALSE) |
134 | | - if (is.numeric(tree_depth) && tree_depth < 0) |
135 | | - stop("`tree_depth` should be >= 1", call. = FALSE) |
136 | | - if (is.numeric(min_n) && min_n < 0) |
137 | | - stop("`min_n` should be >= 1", call. = FALSE) |
138 | | - |
139 | | - args <- list( |
140 | | - mtry = mtry, trees = trees, min_n = min_n, tree_depth = tree_depth, |
141 | | - learn_rate = learn_rate, loss_reduction = loss_reduction, |
142 | | - sample_size = sample_size |
143 | | - ) |
144 | | - |
145 | | - no_value <- !vapply(others, is.null, logical(1)) |
| 144 | + no_value <- !vapply(others, null_value, logical(1)) |
146 | 145 | others <- others[no_value] |
147 | 146 |
|
148 | 147 | out <- list(args = args, others = others, |
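The move from `check_empty_ellipse()` to `enquos()`/`enquo()` stores each argument as a quosure (an expression plus its environment), so evaluation can be deferred until fit time. A self-contained illustration of the pattern, independent of this package:

```r
library(rlang)

# Capture arguments as quosures instead of forcing their values.
capture_args <- function(trees = NULL, ...) {
  list(trees = enquo(trees), others = enquos(...))
}

p <- 10
cap <- capture_args(trees = floor(sqrt(p)), nthread = 4)

# Nothing is computed at capture time; `eval_tidy()` resolves the
# expression in its original environment on demand.
eval_tidy(cap$trees)
#> [1] 3
```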
@@ -184,16 +183,20 @@ update.boost_tree <- |
184 | 183 | mtry = NULL, trees = NULL, min_n = NULL, |
185 | 184 | tree_depth = NULL, learn_rate = NULL, |
186 | 185 | loss_reduction = NULL, sample_size = NULL, |
187 | | - others = list(), |
188 | 186 | fresh = FALSE, |
189 | 187 | ...) { |
190 | | - check_empty_ellipse(...) |
| 188 | + |
| 189 | + others <- enquos(...) |
191 | 190 |
|
192 | 191 | args <- list( |
193 | | - mtry = mtry, trees = trees, min_n = min_n, tree_depth = tree_depth, |
194 | | - learn_rate = learn_rate, loss_reduction = loss_reduction, |
195 | | - sample_size = sample_size |
196 | | - ) |
| 192 | + mtry = enquo(mtry), |
| 193 | + trees = enquo(trees), |
| 194 | + min_n = enquo(min_n), |
| 195 | + tree_depth = enquo(tree_depth), |
| 196 | + learn_rate = enquo(learn_rate), |
| 197 | + loss_reduction = enquo(loss_reduction), |
| 198 | + sample_size = enquo(sample_size) |
| 199 | + ) |
197 | 200 |
|
198 | 201 | # TODO make these blocks into a function and document well |
199 | 202 | if (fresh) { |
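A brief usage sketch of the revised `update()` method, which now accepts both primary and engine arguments through the ellipses:

```r
spec <- boost_tree(mode = "regression", trees = 50)

# Replace only the named arguments; `fresh = TRUE` would instead
# drop all previously set arguments before applying the new ones.
spec <- update(spec, trees = 100, tree_depth = 4)
```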
@@ -235,9 +238,45 @@ translate.boost_tree <- function(x, engine, ...) { |
235 | 238 | x |
236 | 239 | } |
237 | 240 |
|
| 241 | +# ------------------------------------------------------------------------------ |
| 242 | + |
| 243 | +check_args.boost_tree <- function(object) { |
| 244 | + |
| 245 | + args <- lapply(object$args, rlang::eval_tidy) |
| 246 | + |
| 247 | + if (is.numeric(args$trees) && args$trees < 1) |
| 248 | + stop("`trees` should be >= 1", call. = FALSE) |
| 249 | + if (is.numeric(args$sample_size) && (args$sample_size < 0 || args$sample_size > 1)) |
| 250 | + stop("`sample_size` should be within [0,1]", call. = FALSE) |
| 251 | + if (is.numeric(args$tree_depth) && args$tree_depth < 1) |
| 252 | + stop("`tree_depth` should be >= 1", call. = FALSE) |
| 253 | + if (is.numeric(args$min_n) && args$min_n < 1) |
| 254 | + stop("`min_n` should be >= 1", call. = FALSE) |
| 255 | + |
| 256 | + invisible(object) |
| 257 | +} |
238 | 258 |
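Because the stored arguments are quosures, `check_args()` evaluates them with `rlang::eval_tidy()` before range-checking; a sketch of what it guards against:

```r
# A negative tree count passes construction (arguments are quoted,
# not checked, at this point) ...
spec <- boost_tree(mode = "regression", trees = -5)

# ... but validation later signals: `trees` should be >= 1
```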
|
239 | 259 | # xgboost helpers -------------------------------------------------------------- |
240 | 260 |
|
| 261 | +#' Boosted trees via xgboost |
| 262 | +#' |
| 263 | +#' `xgb_train` is a wrapper for `xgboost` tree-based models |
| 264 | +#' where all of the model arguments are in the main function. |
| 265 | +#' |
| 266 | +#' @param x A data frame or matrix of predictors. |
| 267 | +#' @param y A vector (factor or numeric) or matrix (numeric) of outcome data. |
| 268 | +#' @param max_depth An integer for the maximum depth of the tree. |
| 269 | +#' @param nrounds An integer for the number of boosting iterations. |
| 270 | +#' @param eta A numeric value between zero and one to control the learning rate. |
| 271 | +#' @param colsample_bytree Subsampling proportion of columns. |
| 272 | +#' @param min_child_weight A numeric value for the minimum sum of instance |
| 273 | +#' weights needed in a child to continue to split. |
| 274 | +#' @param gamma A number for the minimum loss reduction required to make a |
| 275 | +#' further partition on a leaf node of the tree. |
| 276 | +#' @param subsample Subsampling proportion of rows. |
| 277 | +#' @param ... Other options to pass to `xgb.train`. |
| 278 | +#' @return A fitted `xgboost` object. |
| 279 | +#' @export |
241 | 280 | xgb_train <- function( |
242 | 281 | x, y, |
243 | 282 | max_depth = 6, nrounds = 15, eta = 0.3, colsample_bytree = 1, |
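A hedged usage sketch of the wrapper (using the built-in `mtcars` data purely for illustration):

```r
# x: predictor data frame/matrix, y: numeric outcome for regression.
fit <- xgb_train(
  x = mtcars[, -1],
  y = mtcars$mpg,
  max_depth = 3,
  nrounds = 10,
  eta = 0.1
)
```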
@@ -380,6 +419,31 @@ xgb_by_tree <- function(tree, object, new_data, type, ...) { |
380 | 419 |
|
381 | 420 | # C5.0 helpers ----------------------------------------------------------------- |
382 | 421 |
|
| 422 | +#' Boosted trees via C5.0 |
| 423 | +#' |
| 424 | +#' `C5.0_train` is a wrapper for [C50::C5.0()] tree-based models |
| 425 | +#' where all of the model arguments are in the main function. |
| 426 | +#' |
| 427 | +#' @param x A data frame or matrix of predictors. |
| 428 | +#' @param y A factor vector with 2 or more levels. |
| 429 | +#' @param trials An integer specifying the number of boosting |
| 430 | +#' iterations. A value of one indicates that a single model is |
| 431 | +#' used. |
| 432 | +#' @param weights An optional numeric vector of case weights. Note |
| 433 | +#' that the data used for the case weights will not be used as a |
| 434 | +#' splitting variable in the model (see |
| 435 | +#' \url{http://www.rulequest.com/see5-win.html#CASEWEIGHT} for |
| 436 | +#' Quinlan's notes on case weights). |
| 437 | +#' @param minCases An integer for the smallest number of samples |
| 438 | +#' that must be put in at least two of the splits. |
| 439 | +#' @param sample A value between (0, .999) that specifies the random |
| 440 | +#' proportion of the data that should be used to train the model. |
| 441 | +#' By default, all the samples are used for model training. Samples |
| 442 | +#' not used for training are used to evaluate the accuracy of the |
| 443 | +#' model in the printed output. |
| 444 | +#' @param ... Other arguments to pass. |
| 445 | +#' @return A fitted C5.0 model. |
| 446 | +#' @export |
383 | 447 | C5.0_train <- |
384 | 448 | function(x, y, weights = NULL, trials = 15, minCases = 2, sample = 0, ...) { |
385 | 449 | other_args <- list(...) |
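And a corresponding sketch for the C5.0 wrapper (classification only; `iris` is used purely for illustration):

```r
# y must be a factor with two or more levels; `trials` sets the
# number of boosting iterations.
fit <- C5.0_train(x = iris[, 1:4], y = iris$Species, trials = 10)
```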
|