changed from make to r_make system; removed some .progress args in map functions

ercbk · ercbk · commit 3e9fabc844cf · 2020-05-23T19:25:53.000-04:00
diff --git a/.gitignore b/.gitignore
@@ -4,3 +4,4 @@
 .Ruserdata
 .env
 .drake
+ec2-ssh-raw.log
diff --git a/_drake.R b/_drake.R
@@ -19,13 +19,6 @@ source("performance-experiment/functions/run-ncv.R")
 source("performance-experiment/Kuhn-Johnson/plan-kj.R")
 
 
-# text me if an error occurs
-options(error = function() {
-      library(RPushbullet)
-      pbPost("note", "Error", geterrmessage())
-      if(!interactive()) stop(geterrmessage())
-})
-
 
 
 set.seed(2019)
@@ -39,39 +32,36 @@ ssh_private_key_file <- Sys.getenv("AWSKEYPATH")
 
 
 cl <- future::makeClusterPSOCK(
-   
-   ## Public IP numbers of EC2 instances
-   public_ips,
-   
-   ## User name (always 'ubuntu')
-   user = "ubuntu",
-   
-   ## Use private SSH key registered with AWS
-   ## futureSettings is a saved PuTTY session with settings to keep ssh active
-   rshcmd = c("plink", "-ssh", "-load", "futureSettings","-i", ssh_private_key_file),
-   rshopts = c(
-      "-sshrawlog", "ec2-ssh-raw.log"
-   ),
-   
-   rscript_args = c("-e", shQuote(".libPaths('/home/rstudio/R/x86_64-pc-linux-gnu-library/3.6')")
-   ), 
-   verbose = TRUE
+      
+      ## Public IP numbers of EC2 instances
+      public_ips,
+      
+      ## User name (always 'ubuntu')
+      user = "ubuntu",
+      
+      ## Use private SSH key registered with AWS
+      ## futureSettings is a saved PuTTY session with settings to keep ssh active
+      rshcmd = c("plink", "-ssh", "-load", "futureSettings","-i", ssh_private_key_file),
+      rshopts = c(
+            "-sshrawlog", "ec2-ssh-raw.log"
+      ),
+      
+      rscript_args = c("-e", shQuote(".libPaths('/home/rstudio/R/x86_64-pc-linux-gnu-library/3.6')")
+      ), 
+      verbose = TRUE,
+      timeout = 2592000*100
 )
 
 
 future::plan(list(tweak(cluster, workers = cl), multiprocess))
 
 
+
 # verbose = 0 prints nothing, verbose = 1 prints message as each target completes; verbose = 2 adds a progress bar that tracks target completion
-make(
+drake_config(
       plan,
-      verbose = 1
+      verbose = 1,
+      lock_envir = FALSE,
+      jobs_preprocess = 7
 )
 
-# network graph of the drake plan
-vis_drake_graph(plan, file = "performance-experiment/output/kj-plan-network.png", build_times = "build", main = "Performance Experiment")
-
-# text me when it finishes
-RPushbullet::pbPost("note", title="kj performance experiment", body="perf run finished")
-
-parallel::stopCluster(cl)
diff --git a/performance-experiment/Kuhn-Johnson/check-results.R b/performance-experiment/Kuhn-Johnson/check-results.R
@@ -3,10 +3,14 @@
 
 library(drake); library(dplyr)
 
-# loadd(perf_results_100)
-# View(perf_results_100)
-loadd(perf_results_800)
-View(perf_results_800)
+loadd(ncv_results_100)
+View(ncv_results_100)
+loadd(ncv_results_800)
+View(ncv_results_800)
+loadd(ncv_results_2000)
+View(ncv_results_2000)
+loadd(ncv_results_5000)
+View(ncv_results_5000)
 
 # each target's build time
 bt <- build_times(starts_with("ncv_results"), digits = 4)
diff --git a/performance-experiment/Kuhn-Johnson/plan-kj.R b/performance-experiment/Kuhn-Johnson/plan-kj.R
@@ -11,9 +11,9 @@
 
 
 error_FUN <- function(y_obs, y_hat){
-      y_obs <- unlist(y_obs)
-      y_hat <- unlist(y_hat)
-      Metrics::mae(y_obs, y_hat)
+   y_obs <- unlist(y_obs)
+   y_hat <- unlist(y_hat)
+   Metrics::mae(y_obs, y_hat)
 }
 
 method <- "kj"
@@ -34,12 +34,12 @@ plan <- drake_plan(
    # hyperparameter grids for each algorithm
    # This probably doesn't need to be a "dynamic" target since mtry is only concerned about the number of columns in data (see script), but I'll do it anyways
    params_list_100 = create_grids(sim_dat_100,
-                              algorithms,
-                              size = grid_size),
+                                  algorithms,
+                                  size = grid_size),
    # create a separate ncv data object for each repeat value
    ncv_dat_100 = create_ncv_objects(sim_dat_100,
-                                       repeats,
-                                       method),
+                                    repeats,
+                                    method),
    # runs nested-cv and compares ncv error with out-of-sample error
    # outputs: ncv error, oos error, delta error, chosen algorithm, chosen hyperparameters 
    ncv_results_100 = target(
@@ -52,9 +52,6 @@ plan <- drake_plan(
               method),
       dynamic = map(ncv_dat_100)
    ),
-   # add index columns to identify the results according to sample size and number of repeats
-   perf_results_100 = tibble(n = 100, repeats = repeats) %>%
-      bind_cols(ncv_results_100),
    
    # repeat for the rest of the sample sizes
    # sample size = 800
@@ -74,30 +71,45 @@ plan <- drake_plan(
               error_FUN,
               method),
       dynamic = map(ncv_dat_800)
-   ),
-   perf_results_800 = tibble(n = 800, repeats = repeats) %>%
-      bind_cols(ncv_results_800),
-   
-   # sample size = 2000
-   sim_dat_2000 = mlbench_data(2000),
-   params_list_2000 = create_grids(sim_dat_2000,
-                                  algorithms,
-                                  size = grid_size),
-   ncv_dat_2000 = create_ncv_objects(sim_dat_2000,
-                                    repeats,
-                                    method),
-   ncv_results_2000 = target(
-      run_ncv(ncv_dat_2000,
-              sim_dat_2000,
-              large_dat,
-              mod_FUN_list,
-              params_list_2000,
-              error_FUN,
-              method),
-      dynamic = map(ncv_dat_2000)
-   ),
-   perf_results_2000 = tibble(n = 2000, repeats = repeats) %>%
-      bind_cols(ncv_results_2000)
+   )#,
+   # 
+   # # sample size = 2000
+   # sim_dat_2000 = mlbench_data(2000),
+   # params_list_2000 = create_grids(sim_dat_2000,
+   #                                algorithms,
+   #                                size = grid_size),
+   # ncv_dat_2000 = create_ncv_objects(sim_dat_2000,
+   #                                  repeats,
+   #                                  method),
+   # ncv_results_2000 = target(
+   #    run_ncv(ncv_dat_2000,
+   #            sim_dat_2000,
+   #            large_dat,
+   #            mod_FUN_list,
+   #            params_list_2000,
+   #            error_FUN,
+   #            method),
+   #    dynamic = map(ncv_dat_2000)
+   # ),
+   # 
+   # # sample size = 5000
+   # sim_dat_5000 = mlbench_data(5000),
+   # params_list_5000 = create_grids(sim_dat_5000,
+   #                                algorithms,
+   #                                size = grid_size),
+   # ncv_dat_5000 = create_ncv_objects(sim_dat_5000,
+   #                                  repeats,
+   #                                  method),
+   # ncv_results_5000 = target(
+   #    run_ncv(ncv_dat_5000,
+   #            sim_dat_5000,
+   #            large_dat,
+   #            mod_FUN_list,
+   #            params_list_5000,
+   #            error_FUN,
+   #            method),
+   #    dynamic = map(ncv_dat_5000)
+   # )
    
 )
 
diff --git a/performance-experiment/functions/inner-tune.R b/performance-experiment/functions/inner-tune.R
@@ -38,7 +38,7 @@ inner_tune <- function(ncv_dat, mod_FUN_list, params_list, error_FUN) {
    summarize_tune_results <- function(dat, mod_FUN, params) {
       # Return row-bound tibble that has the 25 bootstrap results
       param_names <- names(params)
-      furrr::future_map_dfr(dat$splits, tune_over_params, mod_FUN, params, .progress = FALSE) %>%
+      furrr::future_map_dfr(dat$splits, tune_over_params, mod_FUN, params) %>%
       lazy_dt(., key_by = param_names) %>% 
          # For each value of the tuning parameter, compute the
          # average <error> which is the inner bootstrap estimate.
diff --git a/performance-experiment/functions/outer-cv.R b/performance-experiment/functions/outer-cv.R
@@ -45,9 +45,7 @@ outer_cv <- function(ncv_dat, best_hypervals_list, mod_FUN_list, error_FUN, meth
          tibble(
             error = error
          )
-      }, 
-      # progress bar off when working with clusters
-      .progress = FALSE) %>% 
+      }) %>% 
          bind_cols(best_hyper_vals) %>% 
          mutate_all(~round(., 6))
       
diff --git a/performance-experiment/functions/run-ncv.R b/performance-experiment/functions/run-ncv.R
@@ -44,4 +44,24 @@ run_ncv <- function(ncv_dat, sim_dat, large_dat, mod_FUN_list, params_list, erro
                                 error_FUN = error_FUN,
                                 method = method)
    
+   # if repeats == 1, then there is no repeat column (id); id becomes the fold col instead of there being an id2 col
+   rep_status <- stringr::str_detect(ncv_dat[[1]]$id[[1]], pattern = "Repeat")
+   
+   if (rep_status == TRUE) {
+      # number of repeats
+      num_reps <- ncv_dat[[1]] %>%
+         select(id) %>%
+         mutate(repeats = stringr::str_extract(id, pattern = "[0-9]") %>% 
+                   as.numeric()) %>% 
+         slice(n()) %>% 
+         pull(repeats)
+   } else {
+      num_reps <- 1
+   }
+   
+   # cols: n, repeats, error calcs, chosen alg, chosen hyperparams
+   final_results <- tibble(n = nrow(ncv_dat[[1]]$splits$`1`[[1]]),
+                           repeats = num_reps) %>% 
+         bind_cols(genl_perf_est)
+   
 }
diff --git a/performance-experiment/output/kj-build-times.png b/performance-experiment/output/kj-build-times.png
diff --git a/performance-experiment/output/kj-plan-network.png b/performance-experiment/output/kj-plan-network.png
diff --git a/performance-experiment/output/plan-network.png b/performance-experiment/output/plan-network.png
diff --git a/renv.lock b/renv.lock
@@ -177,10 +177,10 @@
     },
     "backports": {
       "Package": "backports",
-      "Version": "1.1.5",
+      "Version": "1.1.6",
       "Source": "Repository",
       "Repository": "CRAN",
-      "Hash": "e9f705633dc932bfd5b02b17a5053a06"
+      "Hash": "3997fd62345a616e59e8161ee0a5816f"
     },
     "base64enc": {
       "Package": "base64enc",
@@ -366,10 +366,15 @@
     },
     "drake": {
       "Package": "drake",
-      "Version": "7.11.0",
-      "Source": "Repository",
-      "Repository": "CRAN",
-      "Hash": "331e17bca2f794d492b03a81d7bbcdd1"
+      "Version": "7.12.0.9000",
+      "Source": "GitHub",
+      "RemoteType": "github",
+      "RemoteHost": "api.github.com",
+      "RemoteRepo": "drake",
+      "RemoteUsername": "ropensci",
+      "RemoteRef": "master",
+      "RemoteSha": "cbbb05973480b92e87cc3380ce3f3994bf3caec9",
+      "Hash": "ecb24a2b9844a618f43ffcb6ddc1e9bd"
     },
     "dtplyr": {
       "Package": "dtplyr",
@@ -443,10 +448,10 @@
     },
     "foreach": {
       "Package": "foreach",
-      "Version": "1.4.8",
+      "Version": "1.5.0",
       "Source": "Repository",
       "Repository": "CRAN",
-      "Hash": "fbca7161e09d205648cd784611633ea8"
+      "Hash": "8fb3ff01ee7d85893f56df8d77213381"
     },
     "forge": {
       "Package": "forge",
@@ -623,10 +628,10 @@
     },
     "igraph": {
       "Package": "igraph",
-      "Version": "1.2.4.2",
+      "Version": "1.2.5",
       "Source": "Repository",
       "Repository": "CRAN",
-      "Hash": "fffa635f747cd07ffc56bc13a23701e4"
+      "Hash": "3878c30ce67cdb7f2d7f72554e37f476"
     },
     "infer": {
       "Package": "infer",
@@ -1139,10 +1144,10 @@
     },
     "rlang": {
       "Package": "rlang",
-      "Version": "0.4.4",
+      "Version": "0.4.6",
       "Source": "Repository",
       "Repository": "CRAN",
-      "Hash": "09a36f36a13436be327dad3d000c8dd3"
+      "Hash": "aa263e3ce17b177c49e0daade2ee3cdc"
     },
     "rmarkdown": {
       "Package": "rmarkdown",
@@ -1447,10 +1452,10 @@
     },
     "vctrs": {
       "Package": "vctrs",
-      "Version": "0.2.3",
+      "Version": "0.2.4",
       "Source": "Repository",
       "Repository": "CRAN",
-      "Hash": "2c0f41d87be7a186139a6d3d5215848e"
+      "Hash": "6c839a149a30cb4ffc70443efa74c197"
     },
     "viridisLite": {
       "Package": "viridisLite",