
Commit f80b776

Author: ercbk committed:
added drake architecture to performance experiment; runs for sample sizes 100 and 800 completed
1 parent 8f70efd commit f80b776

20 files changed (+569, -192 lines)

.gitignore

Lines changed: 2 additions & 1 deletion
@@ -2,4 +2,5 @@
 .Rhistory
 .RData
 .Ruserdata
-.env
+.env
+.drake

Lines changed: 17 additions & 0 deletions
@@ -0,0 +1,17 @@
+# Results
+
+
+library(drake); library(dplyr)
+
+# loadd(perf_results_100)
+# View(perf_results_100)
+loadd(perf_results_800)
+View(perf_results_800)
+
+# each target's build time
+bt <- build_times(starts_with("ncv_results"), digits = 4)
+View(bt)
+bt %>%
+  select(target, elapsed) %>%
+  kableExtra::kable() %>%
+  kableExtra::save_kable(file = "performance-experiment/output/kj-build-times.png")
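For reference, a minimal sketch of the {drake} cache-access pattern this script relies on (assuming the plan has already been built with make()): readd() returns a cached target as a value, while loadd() assigns it into the calling environment.

library(drake)
res_800 <- readd(perf_results_800)   # returns the cached target as a value
loadd(perf_results_100)              # assigns perf_results_100 into the current environment
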
Lines changed: 77 additions & 0 deletions
@@ -0,0 +1,77 @@
+# drake make file for Kuhn-Johnson performance experiment
+
+
+# Notes:
+# 1. see plan-kj.R for more details on how this thing works
+# 2. link to {future} issue with instructions on special PuTTY settings, https://github.com/HenrikBengtsson/future/issues/370
+
+
+# load packages, functions, and drake plan
+source("performance-experiment/packages.R")
+source("performance-experiment/functions/mlbench-data.R")
+source("performance-experiment/functions/create-ncv-objects.R")
+source("performance-experiment/functions/create-models.R")
+source("performance-experiment/functions/create-grids.R")
+source("performance-experiment/functions/inner-tune.R")
+source("performance-experiment/functions/outer-cv.R")
+source("performance-experiment/functions/ncv-compare.R")
+source("performance-experiment/functions/run-ncv.R")
+source("performance-experiment/Kuhn-Johnson/plan-kj.R")
+
+
+# text me if an error occurs
+options(error = function() {
+  library(RPushbullet)
+  pbPost("note", "Error", geterrmessage())
+  if (!interactive()) stop(geterrmessage())
+})
+
+
+
+set.seed(2019)
+
+# Using different compute sizes for each model
+ip1 <- Sys.getenv("GLMEC2IP")
+ip2 <- Sys.getenv("RFEC2IP")
+public_ips <- c(ip1, ip2)
+# ppk file converted by PuTTY from an AWS pem file
+ssh_private_key_file <- Sys.getenv("AWSKEYPATH")
+
+
+cl <- future::makeClusterPSOCK(
+
+  ## Public IP numbers of EC2 instances
+  public_ips,
+
+  ## User name (always 'ubuntu')
+  user = "ubuntu",
+
+  ## Use private SSH key registered with AWS
+  ## futureSettings is a saved PuTTY session with settings to keep ssh active
+  rshcmd = c("plink", "-ssh", "-load", "futureSettings", "-i", ssh_private_key_file),
+  rshopts = c(
+    "-sshrawlog", "ec2-ssh-raw.log"
+  ),
+
+  rscript_args = c("-e", shQuote(".libPaths('/home/rstudio/R/x86_64-pc-linux-gnu-library/3.6')")
+  ),
+  verbose = TRUE
+)
+
+
+future::plan(list(tweak(cluster, workers = cl), multiprocess))
+
+
+# verbose = 0 prints nothing, verbose = 1 prints a message as each target completes; verbose = 2 adds a progress bar that tracks target completion
+make(
+  plan,
+  verbose = 1
+)
+
+# network graph of the drake plan
+vis_drake_graph(plan, file = "performance-experiment/output/kj-plan-network.png", build_times = "build", main = "Performance Experiment")
+
+# text me when it finishes
+RPushbullet::pbPost("note", title = "kj performance experiment", body = "perf run finished")
+
+parallel::stopCluster(cl)
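A minimal local alternative to the EC2 cluster above, as a sketch only (assumption: no remote workers are available); it keeps the same sourced packages, functions, and plan, but swaps the PSOCK cluster for local multisession workers.

future::plan(future::multisession, workers = 2)   # local workers instead of the EC2 PSOCK cluster
drake::make(plan, verbose = 1)                    # same plan object from plan-kj.R
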
Lines changed: 104 additions & 0 deletions
@@ -0,0 +1,104 @@
+# Kuhn-Johnson drake plan
+
+
+# Notes:
+# 1. I broke the plan into units by sample size. I'm sure it's possible to formulate the plan to perform the whole experiment by looping the kj and raschka methods along with the sample sizes into one large, more compact plan, but I wanted units that I could run overnight on my desktop.
+# 2. sample_sizes: 100, 800, 2000, 5000, 10000 (maybe)
+# 3. I'm trying to minimize the delta_error. Delta error is the absolute difference between the average error across the outer folds of the nested cross-validation and the out-of-sample error, which uses the chosen model and parameters to predict on a simulated 100K row dataset.
+
+
+
+
+
+error_FUN <- function(y_obs, y_hat){
+  y_obs <- unlist(y_obs)
+  y_hat <- unlist(y_hat)
+  Metrics::mae(y_obs, y_hat)
+}
+
+method <- "kj"
+algorithms <- list("glmnet", "rf")
+repeats <- seq(1:5)
+grid_size <- 100
+
+plan <- drake_plan(
+  # model functions for each algorithm
+  mod_FUN_list = create_models(algorithms),
+  # data used to estimate out-of-sample error
+  # noise_sd, seed settings are the defaults
+  large_dat = mlbench_data(n = 10^5,
+                           noise_sd = 1,
+                           seed = 2019),
+  # sample size = 100
+  sim_dat_100 = mlbench_data(100),
+  # hyperparameter grids for each algorithm
+  # This probably doesn't need to be a "dynamic" target since mtry is only concerned about the number of columns in the data (see script), but I'll do it anyways
+  params_list_100 = create_grids(sim_dat_100,
+                                 algorithms,
+                                 size = grid_size),
+  # create a separate ncv data object for each repeat value
+  ncv_dat_100 = create_ncv_objects(sim_dat_100,
+                                   repeats,
+                                   method),
+  # runs nested-cv and compares ncv error with out-of-sample error
+  # outputs: ncv error, oos error, delta error, chosen algorithm, chosen hyperparameters
+  ncv_results_100 = target(
+    run_ncv(ncv_dat_100,
+            sim_dat_100,
+            large_dat,
+            mod_FUN_list,
+            params_list_100,
+            error_FUN,
+            method),
+    dynamic = map(ncv_dat_100)
+  ),
+  # add index columns to identify the results according to sample size and number of repeats
+  perf_results_100 = tibble(n = 100, repeats = repeats) %>%
+    bind_cols(ncv_results_100),
+
+  # repeat for the rest of the sample sizes
+  # sample size = 800
+  sim_dat_800 = mlbench_data(800),
+  params_list_800 = create_grids(sim_dat_800,
+                                 algorithms,
+                                 size = grid_size),
+  ncv_dat_800 = create_ncv_objects(sim_dat_800,
+                                   repeats,
+                                   method),
+  ncv_results_800 = target(
+    run_ncv(ncv_dat_800,
+            sim_dat_800,
+            large_dat,
+            mod_FUN_list,
+            params_list_800,
+            error_FUN,
+            method),
+    dynamic = map(ncv_dat_800)
+  ),
+  perf_results_800 = tibble(n = 800, repeats = repeats) %>%
+    bind_cols(ncv_results_800),
+
+  # sample size = 2000
+  sim_dat_2000 = mlbench_data(2000),
+  params_list_2000 = create_grids(sim_dat_2000,
+                                  algorithms,
+                                  size = grid_size),
+  ncv_dat_2000 = create_ncv_objects(sim_dat_2000,
+                                    repeats,
+                                    method),
+  ncv_results_2000 = target(
+    run_ncv(ncv_dat_2000,
+            sim_dat_2000,
+            large_dat,
+            mod_FUN_list,
+            params_list_2000,
+            error_FUN,
+            method),
+    dynamic = map(ncv_dat_2000)
+  ),
+  perf_results_2000 = tibble(n = 2000, repeats = repeats) %>%
+    bind_cols(ncv_results_2000)
+
+)
+
+
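A sketch of the delta-error comparison described in note 3 above; the object names are illustrative, not the actual run_ncv() internals.

ncv_error   <- mean(outer_fold_errors)             # average MAE across the outer folds
oos_error   <- error_FUN(y_obs_100k, y_hat_100k)   # MAE of the chosen model on the 100K-row simulated set
delta_error <- abs(ncv_error - oos_error)          # the quantity being minimized
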

performance-experiment/create-grids.R renamed to performance-experiment/functions/create-grids.R

Lines changed: 11 additions & 4 deletions
@@ -8,9 +8,11 @@
 # "glmnet" = Elastic Net regression
 # "svm" = Support Vector Machines
 
-pacman::p_load(dplyr)
+# output: list of grid objects
 
-create_grids <- function(algorithms, size = 100) {
+
+
+create_grids <- function(sim_dat, algorithms, size = 100) {
 
   # Elastic Net Regression
 
@@ -22,9 +24,12 @@ create_grids <- function(algorithms, size = 100) {
 
   # Random Forest
 
+  # Some of the parsnip model parameters have "unknown" for the default value ranges. finalize replaces the unknowns with values based on the data.
+  mtry_updated <- dials::finalize(dials::mtry(), select(sim_dat, -ncol(sim_dat)))
+
   rf_params <- dials::grid_latin_hypercube(
-    dials::mtry(range = c(3, 4)),
-    dials::trees(range = c(200, 300)),
+    mtry_updated,
+    dials::trees(),
     size = size
   )
 
@@ -36,6 +41,8 @@ create_grids <- function(algorithms, size = 100) {
     size = size
   )
 
+  # list of grid objects depending on the algorithms inputted (switch is pretty cool)
+  # stop_glue throws an error if the algorithm inputted isn't available (should be in the glue pkg but isn't)
   grid_list <- purrr::map(algorithms, function(alg) {
     switch(alg,
       rf = rf_params -> alg_grid,
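A small illustration of the finalize() behaviour the new comment refers to; mtcars is a stand-in data frame, not part of the experiment.

library(dials); library(dplyr)
predictors <- select(mtcars, -mpg)
mtry()                         # range is [1, ?]; the upper bound is unknown()
finalize(mtry(), predictors)   # upper bound becomes ncol(predictors), i.e. 10
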

performance-experiment/create-models.R renamed to performance-experiment/functions/create-models.R

Lines changed: 3 additions & 1 deletion
@@ -7,7 +7,7 @@
 
 # output: list of model functions
 
-pacman::p_load(dplyr)
+
 
 create_models <- function(algorithms) {
 
@@ -39,6 +39,8 @@ create_models <- function(algorithms) {
     model
   }
 
+  # list of model objects depending on the algorithms inputted (switch is pretty cool)
+  # stop_glue throws an error if the algorithm inputted isn't available (should be in the glue pkg but isn't)
   mod_FUN_list <- purrr::map(algorithms, function(alg) {
     switch(alg,
       rf = ranger_FUN -> mod_fun,

performance-experiment/create-ncv.R renamed to performance-experiment/functions/create-ncv-objects.R

Lines changed: 14 additions & 1 deletion
@@ -1,15 +1,28 @@
 # nested-cv data function
 
+# inputs:
+# 1. dat = dataset
+# 2. repeats = numeric vector with numbers of repeats
+# 3. method = "kj" or "raschka"
+# outputs:
+# 1. list of {rsample} nested cv objects; one object per repeat value
 
 
 
-create_ncv <- function(dat, repeats, method) {
+create_ncv_objects <- function(dat, repeats, method) {
 
   attempt::stop_if_not(repeats, is.numeric, "repeats needs to be a numeric class")
   attempt::stop_if_not(method, is.character, "method needs to be a character class")
 
+  # don't remember but guessing crossing needs a list object
+  if (is.data.frame(dat)) {
+    dat <- list(dat)
+  }
+  # tibble grid of data and repeats
   grid <- tidyr::crossing(dat, repeats)
 
+  # generate list of ncv objects
+  # dynGet needed to get reps out of the environment and into the nested_cv function
   if (method == "kj") {
     ncv_list <- purrr::map2(grid$dat, grid$repeats, function(dat, reps) {
       rsample::nested_cv(dat,
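A sketch of the {rsample} object this produces for a single repeat value; the fold and bootstrap counts are illustrative, not necessarily the ones used in the experiment.

library(rsample)
ncv <- nested_cv(mtcars,
                 outside = vfold_cv(v = 10, repeats = 2),
                 inside  = bootstraps(times = 25))
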
Lines changed: 68 additions & 0 deletions
@@ -0,0 +1,68 @@
+# inner loop tuning function
+
+# inputs:
+# 1. ncv_dat = one ncv object from the list created by create-ncv-objects.R
+# 2. mod_FUN_list = all the model objects created by create-models.R
+# 3. params_list = all the hyperparameter grids created by create-grids.R
+# 4. error_FUN = specified at the start of plan-<method>.R
+
+# outputs: df of hyperparameters for each fold that was chosen in the inner-loop
+
+
+
+inner_tune <- function(ncv_dat, mod_FUN_list, params_list, error_FUN) {
+
+  # inputs params, model, and resample, calls model and error functions, outputs error
+  mod_error <- function(params, mod_FUN, dat) {
+    y_col <- ncol(dat$data)
+    y_obs <- rsample::assessment(dat)[y_col]
+    mod <- mod_FUN(params, rsample::analysis(dat))
+    pred <- predict(mod, rsample::assessment(dat))
+    if (!is.data.frame(pred)) {
+      pred <- pred$predictions
+    }
+    error <- error_FUN(y_obs, pred)
+    error
+  }
+
+  # inputs resample, loops hyperparam grid values to model/error function, collects error value for hyperparam combo
+  tune_over_params <- function(dat, mod_FUN, params) {
+    params$error <- purrr::map_dbl(1:nrow(params), function(row) {
+      params <- params[row, ]
+      mod_error(params, mod_FUN, dat)
+    })
+    params
+  }
+
+  # inputs and sends fold's resamples to tuning function, collects and averages fold's error for each hyperparameter combo
+  summarize_tune_results <- function(dat, mod_FUN, params) {
+    # Return row-bound tibble that has the 25 bootstrap results
+    param_names <- names(params)
+    furrr::future_map_dfr(dat$splits, tune_over_params, mod_FUN, params, .progress = FALSE) %>%
+      lazy_dt(., key_by = param_names) %>%
+      # For each value of the tuning parameter, compute the
+      # average <error> which is the inner bootstrap estimate.
+      group_by_at(vars(param_names)) %>%
+      summarize(mean_error = mean(error, na.rm = TRUE),
+                sd_error = sd(error, na.rm = TRUE),
+                n = length(error)) %>%
+      as_tibble()
+  }
+
+  tune_algorithms <- furrr::future_map2(mod_FUN_list, params_list, function(mod_FUN, params) {
+    tuning_results <- purrr::map(ncv_dat$inner_resamples, summarize_tune_results, mod_FUN, params)
+
+    # Choose best hyperparameter combos across all the resamples for each fold (e.g. 5 repeats x 10 folds = 50 best hyperparam combos)
+    best_hyper_vals <- tuning_results %>%
+      purrr::map_df(function(dat) {
+        dat %>%
+          filter(mean_error == min(mean_error)) %>%
+          arrange(sd_error) %>%
+          slice(1)
+      }) %>%
+      select(names(params))
+  })
+}
+
+
+
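A hypothetical single-repeat call following the documented inputs above; best_params_per_fold is an illustrative name, and the other objects are the targets defined in plan-kj.R.

best_params_per_fold <- inner_tune(ncv_dat_100[[1]], mod_FUN_list, params_list_100, error_FUN)
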

performance-experiment/ncv-compare.R renamed to performance-experiment/functions/ncv-compare.R

Lines changed: 13 additions & 1 deletion
@@ -3,6 +3,17 @@
 
 # Chooses the best algorithm, fits best model on entire training set, predicts against large simulated data set
 
+# inputs:
+# 1. train_dat = the entire training dataset
+# 2. large_dat = the test dataset
+# 3. cv_stats = outer_cv.R output: df with chosen model, outer fold stats, hyperparams
+# 4. mod_FUN_list = list of model objects created from create_models.R
+# 5. params_list = list of hyperparameter grids created from create_grids.R
+# 6. error_FUN = error function given at the start of plan_<method>.R
+# 7. method = "kj" or "raschka", given at the start of plan_<method>.R
+
+# output: df with algorithm, hyperparams, and error values
+
 
 ncv_compare <- function(train_dat, large_dat, cv_stats, mod_FUN_list, params_list, error_FUN, method) {
 
@@ -27,9 +38,10 @@ ncv_compare <- function(train_dat, large_dat, cv_stats, mod_FUN_list, params_lis
       select(names(params_list[[chosen_alg]]))
   }
 
+  # fit model over entire training set
   fit <- mod_FUN(params, train_dat)
 
-  # fit <- mod_FUN(params, ncv_dat_list$sim_data[[1]])
+  # predict on test set
   preds <- predict(fit, large_dat)
   if (!is.data.frame(preds)) {
     preds <- preds$predictions
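A hypothetical call shape following the documented inputs; cv_stats stands for the outer-cv.R output, and in the pipeline this is presumably invoked from within run_ncv() rather than called directly.

compare_df <- ncv_compare(sim_dat_100, large_dat, cv_stats,
                          mod_FUN_list, params_list_100, error_FUN, method = "kj")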
