performance experiment initial commit

ercbk · ercbk · commit 8f70efd71191 · 2020-03-10T00:31:44.000-04:00
diff --git a/performance-experiment/create-grids.R b/performance-experiment/create-grids.R
@@ -0,0 +1,50 @@
+# Create Hyperparameter grid list
+
+
+# input:
+# 1. size = number of rows (hyperparameter combinations) in each grid
+# 2. algorithms = list of algorithm abbreviations
+#    "rf" = Ranger Random Forest
+#    "glmnet" = Elastic Net regression
+#    "svm" = Support Vector Machines
+
+pacman::p_load(dplyr)
+
+create_grids <- function(algorithms, size = 100) {
+      # Builds a latin hypercube tuning grid with `size` rows for each supported
+      # algorithm and returns the requested ones as a list named by abbreviation.
+      
+      # Elastic Net Regression
+      glm_params <- dials::grid_latin_hypercube(
+            dials::mixture(),
+            dials::penalty(),
+            size = size
+      )
+      
+      # Random Forest
+      rf_params <- dials::grid_latin_hypercube(
+            dials::mtry(range = c(3, 4)),
+            dials::trees(range = c(200, 300)),
+            size = size
+      )
+      
+      # Support Vector Machines
+      svm_params <- dials::grid_latin_hypercube(
+            dials::cost(),
+            dials::margin(),
+            size = size
+      )
+      
+      grid_list <- purrr::map(algorithms, function(alg) {
+            # plain stop() keeps the error path free of other packages' unexported
+            # internals; an unknown abbreviation aborts with the message below
+            switch(alg,
+                   rf = rf_params,
+                   glmnet = glm_params,
+                   svm = svm_params,
+                   stop(alg, " grid not available.", call. = FALSE))
+      }) %>%
+            purrr::set_names(algorithms)
+      grid_list
+}
+
diff --git a/performance-experiment/create-models.R b/performance-experiment/create-models.R
@@ -0,0 +1,54 @@
+# Creates list of model functions
+
+# input: list of algorithm abbreviations
+# "rf" = Ranger Random Forest
+# "glmnet" = Elastic Net regression
+# "svm" = Support Vector Machines
+
+# output: list of model functions
+
+pacman::p_load(dplyr)
+
+create_models <- function(algorithms) {
+      # Returns a list of model-fitting functions, one per requested algorithm and
+      # named by its abbreviation. Each reads the first value of its
+      # hyperparameter columns in `params`, fits `y ~ .` on `analysis_set` and
+      # returns the fitted model.
+      
+      # Random Forest
+      ranger_FUN <- function(params, analysis_set) {
+            mtry <- params$mtry[[1]]
+            trees <- params$trees[[1]]
+            ranger::ranger(y ~ ., data = analysis_set, mtry = mtry, num.trees = trees)
+      }
+      
+      # Elastic Net Regression
+      glm_FUN <- function(params, analysis_set) {
+            alpha <- params$mixture[[1]]
+            lambda <- params$penalty[[1]]
+            parsnip::linear_reg(mixture = alpha, penalty = lambda) %>%
+                  parsnip::set_engine("glmnet") %>%
+                  generics::fit(y ~ ., data = analysis_set)
+      }
+      
+      # Support Vector Machines
+      # NOTE(review): only `cost` is read; any other column in `params` is ignored
+      svm_FUN <- function(params, analysis_set) {
+            cost <- params$cost[[1]]
+            kernlab::ksvm(y ~ ., data = analysis_set, C = cost)
+      }
+      
+      mod_FUN_list <- purrr::map(algorithms, function(alg) {
+            # plain stop() keeps the error path free of other packages' unexported
+            # internals; an unknown abbreviation aborts with the message below
+            switch(alg,
+                   rf = ranger_FUN,
+                   glmnet = glm_FUN,
+                   svm = svm_FUN,
+                   stop(alg, " model function not available.", call. = FALSE))
+      }) %>%
+            purrr::set_names(algorithms)
+      mod_FUN_list
+}
+
+
diff --git a/performance-experiment/create-ncv.R b/performance-experiment/create-ncv.R
@@ -0,0 +1,34 @@
+# nested-cv data function
+#
+# input: dat = data set(s) to resample (main.R passes a list of data frames),
+# repeats = numeric repeat count(s) for the outer cv, method = "kj" or "raschka"
+# output: list with one rsample::nested_cv result per row of the dat x repeats grid
+create_ncv <- function(dat, repeats, method) {
+   # type checks on the arguments before any resampling is set up
+   attempt::stop_if_not(repeats, is.numeric, "repeats needs to be a numeric class")
+   attempt::stop_if_not(method, is.character, "method needs to be a character class")
+   # one row per combination of data set and repeat count
+   grid <- tidyr::crossing(dat, repeats)
+   # kj: outer 10-fold cv, inner 25 bootstraps; raschka: outer 5-fold cv, inner 2-fold cv
+   if (method == "kj") {
+      ncv_list <- purrr::map2(grid$dat, grid$repeats, function(dat, reps) {
+         rsample::nested_cv(dat,
+                            outside = vfold_cv(v = 10, repeats = dynGet("reps")),
+                            inside = bootstraps(times = 25))
+      })
+   } else if (method == "raschka") {
+      ncv_list <- purrr::map2(grid$dat, grid$repeats, function(dat, reps) {
+         rsample::nested_cv(dat,
+                            outside = vfold_cv(v = 5, repeats = dynGet("reps")),
+                            inside = vfold_cv(v = 2))
+      })
+   } else {
+      stop("Need to specify method as kj or raschka", call. = FALSE)
+   }
+   # NOTE(review): dynGet("reps") searches the calling frames; presumably needed because nested_cv() evaluates `outside` elsewhere - confirm
+   return(ncv_list)
+}
+
+
+
+
diff --git a/performance-experiment/inner-tune.R b/performance-experiment/inner-tune.R
@@ -0,0 +1,63 @@
+# inner loop tuning function
+
+
+
+pacman::p_load(dplyr, furrr, data.table, dtplyr)
+
+inner_tune <- function(ncv_dat, mod_FUN_list, params_list, error_FUN) {
+      # For each algorithm: tunes its grid on the inner resamples of every outer
+      # fold and returns the best hyperparameter combination per outer fold.
+      
+      # inputs params, model, and resample, calls model and error functions, outputs error
+      mod_error <- function(params, mod_FUN, dat) {
+            y_col <- ncol(dat$data)
+            y_obs <- rsample::assessment(dat)[y_col]
+            mod <- mod_FUN(params, rsample::analysis(dat))
+            pred <- predict(mod, rsample::assessment(dat))
+            # only list-like prediction objects (e.g. ranger) carry $predictions;
+            # atomic results such as a matrix are used as is (`$` would error)
+            if (is.list(pred) && !is.data.frame(pred)) {
+                  pred <- pred$predictions
+            }
+            error_FUN(y_obs, pred)
+      }
+      
+      # inputs resample, loops hyperparam grid values to model/error function, collects error value for hyperparam combo
+      tune_over_params <- function(dat, mod_FUN, params) {
+            params$error <- purrr::map_dbl(seq_len(nrow(params)), function(row) {
+                  mod_error(params[row, ], mod_FUN, dat)
+            })
+            params
+      }
+      
+      # inputs and sends fold's resamples to tuning function, collects and averages fold's error for each hyperparameter combo
+      summarize_tune_results <- function(dat, mod_FUN, params) {
+            # Return row-bound tibble that has the results of all inner resamples
+            param_names <- names(params)
+            furrr::future_map_dfr(dat$splits, tune_over_params, mod_FUN, params, .progress = TRUE) %>%
+                  lazy_dt(., key_by = param_names) %>%
+                  # For each hyperparameter combo, average <error> over the inner resamples
+                  group_by_at(vars(all_of(param_names))) %>%
+                  summarize(mean_error = mean(error, na.rm = TRUE),
+                            sd_error = sd(error, na.rm = TRUE),
+                            n = length(error)) %>%
+                  as_tibble()
+      }
+      
+      purrr::map2(mod_FUN_list, params_list, function(mod_FUN, params) {
+         tuning_results <- purrr::map(ncv_dat$inner_resamples, summarize_tune_results, mod_FUN, params)
+         # Choose best hyperparameter combos across all the resamples for each fold (e.g. 5 repeats 10 folds = 50 best hyperparam combos)
+         tuning_results %>%
+            purrr::map_df(function(dat) {
+               dat %>%
+                  filter(mean_error == min(mean_error)) %>%
+                  arrange(sd_error) %>%
+                  slice(1)
+            }) %>%
+            select(all_of(names(params)))
+      })
+}
+
+
+# chosen_hypervals <- inner_tune(ncv_dat = ncv_dat_list[[1]], mod_FUN_list = mod_FUN_list_ranger, params_list = params_list, error_FUN = error_FUN)
+
diff --git a/performance-experiment/main.R b/performance-experiment/main.R
@@ -0,0 +1,101 @@
+
+
+
+
+
+source("performance-experiment/mlbench-data.R")
+source("performance-experiment/create-ncv.R")
+source("performance-experiment/create-models.R")
+source("performance-experiment/create-grids.R")
+source("performance-experiment/inner-tune.R")
+source("performance-experiment/outer-cv.R")
+source("performance-experiment/ncv-compare.R")
+
+# options(error = function() { 
+#       library(RPushbullet)
+#       pbPost("note", "Error", geterrmessage())
+#       if(!interactive()) stop(geterrmessage())
+# })
+# 
+# 
+# library(tictoc)
+# tic()
+# 
+# 
+# pacman::p_load(RPushbullet, glue)
+
+set.seed(2019)
+
+# multisession replaces multiprocess, which the future package has deprecated
+plan(multisession)
+
+method <- "raschka"
+# method <- "kj"
+algorithms <- list("glmnet", "rf")
+
+# sample_sizes <- c(100, 800, 2000, 5000, 10000)
+# repeats <- seq(1:5)
+sample_sizes <- 100
+repeats <- 1
+
+# method or method list?
+
+# large simulated data set that the refitted model is scored against
+large_dat <- mlbench_data(n = 10^5, noise_sd = 1, seed = 2019)
+# one simulated training set per sample size
+simdat_list <- purrr::map(sample_sizes, ~mlbench_data(.x))
+ncv_dat_list <- create_ncv(dat = simdat_list, repeats = repeats, method = method)
+
+
+# mean absolute error between observed and predicted values
+error_FUN <- function(y_obs, y_hat){
+      # unlist() flattens data-frame/list inputs to plain vectors
+      Metrics::mae(unlist(y_obs), unlist(y_hat))
+}
+
+mod_FUN_list <- create_models(algorithms)
+params_list <- create_grids(algorithms, size = 100)
+
+# One result row per nested-cv object. The training data is read back from the
+# object's first outer split (an rsample split keeps the full data set in
+# `$data`) rather than zipped with simdat_list, whose length differs from
+# ncv_dat_list as soon as several repeats are crossed with several data sets.
+ncv_results <- purrr::map_dfr(ncv_dat_list, function(ncv_dat) {
+   sim_dat <- ncv_dat$splits[[1]]$data
+   best_hypervals_list <- inner_tune(
+      ncv_dat = ncv_dat,
+      mod_FUN_list = mod_FUN_list,
+      params_list = params_list,
+      error_FUN = error_FUN)
+   # model, mean, median, sd error, and parameter columns
+   if (method == "raschka") {
+      cv_stats <- outer_cv(
+         ncv_dat = ncv_dat,
+         best_hypervals_list = best_hypervals_list,
+         mod_FUN_list = mod_FUN_list,
+         error_FUN = error_FUN,
+         method = method,
+         train_dat = sim_dat,
+         params_list = params_list)
+   } else if (method == "kj") {
+      cv_stats <- outer_cv(
+         ncv_dat = ncv_dat,
+         best_hypervals_list = best_hypervals_list,
+         mod_FUN_list = mod_FUN_list,
+         error_FUN = error_FUN,
+         method = method)
+   }
+   
+   ncv_compare(train_dat = sim_dat,
+               large_dat = large_dat,
+               cv_stats = cv_stats,
+               mod_FUN_list = mod_FUN_list,
+               params_list = params_list,
+               error_FUN = error_FUN,
+               method = method)
+})
+
+indices <- tidyr::crossing(sample_sizes, repeats)
+perf_exp_results <- indices %>%
+   bind_cols(ncv_results)
+
diff --git a/performance-experiment/mlbench-data.R b/performance-experiment/mlbench-data.R
@@ -0,0 +1,13 @@
+# create simulation data with mlbench::mlbench.friedman1
+#
+# Inputs are 10 independent variables uniformly distributed on the interval [0,1], only 5 out of these 10 are actually used. Outputs are created according to the formula
+# y = 10 sin(π x1 x2) + 20 (x3 - 0.5)^2 + 10 x4 + 5 x5 + e
+# Returns a data frame with the predictors first and the outcome last, named "y".
+mlbench_data <- function(n, noise_sd = 1, seed = 2019) {
+      # the global RNG is re-seeded on every call, so equal arguments give equal data
+      set.seed(seed)
+      sim <- mlbench::mlbench.friedman1(n, sd = noise_sd)
+      sim_df <- as.data.frame(cbind(sim$x, sim$y))
+      names(sim_df)[ncol(sim_df)] <- "y"
+      sim_df
+}
diff --git a/performance-experiment/ncv-compare.R b/performance-experiment/ncv-compare.R
@@ -0,0 +1,67 @@
+# ncv_compare function
+
+
+# Chooses the best algorithm, fits best model on entire training set, predicts against large simulated data set
+
+
+ncv_compare <- function(train_dat, large_dat, cv_stats, mod_FUN_list, params_list, error_FUN, method) {
+   # Refits the chosen algorithm on train_dat, scores it on large_dat and returns
+   # a tibble comparing that out-of-sample error with the nested-cv error.
+   if (method == "kj") {
+      # Choose alg with lowest avg error; slice(1) breaks ties so that chosen_alg
+      # is always a single name that `[[` can index with
+      chosen_alg <- cv_stats %>%
+         bind_rows(.id = "model") %>%
+         filter(mean_error == min(mean_error)) %>%
+         slice(1) %>%
+         pull(model)
+      # Set inputs to chosen alg
+      mod_FUN <- mod_FUN_list[[chosen_alg]]
+      params <- cv_stats[[chosen_alg]] %>%
+         select(all_of(names(params_list[[chosen_alg]])))
+   } else if (method == "raschka") {
+      chosen_alg <- cv_stats %>%
+         pull(model)
+      mod_FUN <- mod_FUN_list[[chosen_alg]]
+      params <- cv_stats %>%
+         filter(model == chosen_alg) %>%
+         select(all_of(names(params_list[[chosen_alg]])))
+   }
+   
+   fit <- mod_FUN(params, train_dat)
+   # fit <- mod_FUN(params, ncv_dat_list$sim_data[[1]])
+   preds <- predict(fit, large_dat)
+   # only list-like prediction objects (e.g. ranger) carry $predictions
+   if (is.list(preds) && !is.data.frame(preds)) {
+      preds <- preds$predictions
+   }
+   
+   # calculate out-of-sample and retrieve nested-cv error
+   y_col <- ncol(large_dat)
+   y_obs <- large_dat[y_col]
+   oos_error <- round(error_FUN(y_obs, preds), 5)
+   
+   if (method == "kj") {
+      ncv_error <- cv_stats[[chosen_alg]] %>%
+         mutate(mean_error = round(mean_error, 5)) %>%
+         pull(mean_error)
+   } else if (method == "raschka") {
+      ncv_error <- cv_stats %>%
+         filter(model == chosen_alg) %>%
+         mutate(mean_error = round(mean_error, 5)) %>%
+         pull(mean_error)
+   }
+   
+   # delta (the difference between errors) is how well the ncv estimated generalization performance
+   bind_cols(oos_error = oos_error, ncv_error = ncv_error) %>%
+      mutate(method = method,
+             delta_error = abs(oos_error - ncv_error),
+             chosen_algorithm = chosen_alg) %>%
+      bind_cols(params) %>%
+      select(method, everything())
+}
+
+
+   
+
+
diff --git a/performance-experiment/outer-cv.R b/performance-experiment/outer-cv.R