Skip to content

Commit 3e9fabc

Browse files
author
ercbk
committed
changed from make to r_make system; removed some .progress args in map functions
1 parent f80b776 commit 3e9fabc

File tree

11 files changed

+119
-89
lines changed

11 files changed

+119
-89
lines changed

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,3 +4,4 @@
44
.Ruserdata
55
.env
66
.drake
7+
ec2-ssh-raw.log

performance-experiment/Kuhn-Johnson/make-kj.R renamed to _drake.R

Lines changed: 23 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -19,13 +19,6 @@ source("performance-experiment/functions/run-ncv.R")
1919
source("performance-experiment/Kuhn-Johnson/plan-kj.R")
2020

2121

22-
# text me if an error occurs
23-
options(error = function() {
24-
library(RPushbullet)
25-
pbPost("note", "Error", geterrmessage())
26-
if(!interactive()) stop(geterrmessage())
27-
})
28-
2922

3023

3124
set.seed(2019)
@@ -39,39 +32,36 @@ ssh_private_key_file <- Sys.getenv("AWSKEYPATH")
3932

4033

4134
cl <- future::makeClusterPSOCK(
42-
43-
## Public IP numbers of EC2 instances
44-
public_ips,
45-
46-
## User name (always 'ubuntu')
47-
user = "ubuntu",
48-
49-
## Use private SSH key registered with AWS
50-
## futureSettings is a saved PuTTY session with settings to keep ssh active
51-
rshcmd = c("plink", "-ssh", "-load", "futureSettings","-i", ssh_private_key_file),
52-
rshopts = c(
53-
"-sshrawlog", "ec2-ssh-raw.log"
54-
),
55-
56-
rscript_args = c("-e", shQuote(".libPaths('/home/rstudio/R/x86_64-pc-linux-gnu-library/3.6')")
57-
),
58-
verbose = TRUE
35+
36+
## Public IP numbers of EC2 instances
37+
public_ips,
38+
39+
## User name (always 'ubuntu')
40+
user = "ubuntu",
41+
42+
## Use private SSH key registered with AWS
43+
## futureSettings is a saved PuTTY session with settings to keep ssh active
44+
rshcmd = c("plink", "-ssh", "-load", "futureSettings","-i", ssh_private_key_file),
45+
rshopts = c(
46+
"-sshrawlog", "ec2-ssh-raw.log"
47+
),
48+
49+
rscript_args = c("-e", shQuote(".libPaths('/home/rstudio/R/x86_64-pc-linux-gnu-library/3.6')")
50+
),
51+
verbose = TRUE,
52+
timeout = 2592000*100
5953
)
6054

6155

6256
future::plan(list(tweak(cluster, workers = cl), multiprocess))
6357

6458

59+
6560
# verbose = 0 prints nothing; verbose = 1 prints a message as each target completes; verbose = 2 adds a progress bar that tracks target completion
66-
make(
61+
drake_config(
6762
plan,
68-
verbose = 1
63+
verbose = 1,
64+
lock_envir = FALSE,
65+
jobs_preprocess = 7
6966
)
7067

71-
# network graph of the drake plan
72-
vis_drake_graph(plan, file = "performance-experiment/output/kj-plan-network.png", build_times = "build", main = "Performance Experiment")
73-
74-
# text me when it finishes
75-
RPushbullet::pbPost("note", title="kj performance experiment", body="perf run finished")
76-
77-
parallel::stopCluster(cl)

performance-experiment/Kuhn-Johnson/check-results.R

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -3,10 +3,14 @@
33

44
library(drake); library(dplyr)
55

6-
# loadd(perf_results_100)
7-
# View(perf_results_100)
8-
loadd(perf_results_800)
9-
View(perf_results_800)
6+
loadd(ncv_results_100)
7+
View(ncv_results_100)
8+
loadd(ncv_results_800)
9+
View(ncv_results_800)
10+
loadd(ncv_results_2000)
11+
View(ncv_results_2000)
12+
loadd(ncv_results_5000)
13+
View(ncv_results_5000)
1014

1115
# each target's build time
1216
bt <- build_times(starts_with("ncv_results"), digits = 4)

performance-experiment/Kuhn-Johnson/plan-kj.R

Lines changed: 46 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -11,9 +11,9 @@
1111

1212

1313
error_FUN <- function(y_obs, y_hat){
14-
y_obs <- unlist(y_obs)
15-
y_hat <- unlist(y_hat)
16-
Metrics::mae(y_obs, y_hat)
14+
y_obs <- unlist(y_obs)
15+
y_hat <- unlist(y_hat)
16+
Metrics::mae(y_obs, y_hat)
1717
}
1818

1919
method <- "kj"
@@ -34,12 +34,12 @@ plan <- drake_plan(
3434
# hyperparameter grids for each algorithm
3535
# This probably doesn't need to be a "dynamic" target since mtry is only concerned with the number of columns in data (see script), but I'll do it anyway
3636
params_list_100 = create_grids(sim_dat_100,
37-
algorithms,
38-
size = grid_size),
37+
algorithms,
38+
size = grid_size),
3939
# create a separate ncv data object for each repeat value
4040
ncv_dat_100 = create_ncv_objects(sim_dat_100,
41-
repeats,
42-
method),
41+
repeats,
42+
method),
4343
# runs nested-cv and compares ncv error with out-of-sample error
4444
# outputs: ncv error, oos error, delta error, chosen algorithm, chosen hyperparameters
4545
ncv_results_100 = target(
@@ -52,9 +52,6 @@ plan <- drake_plan(
5252
method),
5353
dynamic = map(ncv_dat_100)
5454
),
55-
# add index columns to identify the results according to sample size and number of repeats
56-
perf_results_100 = tibble(n = 100, repeats = repeats) %>%
57-
bind_cols(ncv_results_100),
5855

5956
# repeat for the rest of the sample sizes
6057
# sample size = 800
@@ -74,30 +71,45 @@ plan <- drake_plan(
7471
error_FUN,
7572
method),
7673
dynamic = map(ncv_dat_800)
77-
),
78-
perf_results_800 = tibble(n = 800, repeats = repeats) %>%
79-
bind_cols(ncv_results_800),
80-
81-
# sample size = 2000
82-
sim_dat_2000 = mlbench_data(2000),
83-
params_list_2000 = create_grids(sim_dat_2000,
84-
algorithms,
85-
size = grid_size),
86-
ncv_dat_2000 = create_ncv_objects(sim_dat_2000,
87-
repeats,
88-
method),
89-
ncv_results_2000 = target(
90-
run_ncv(ncv_dat_2000,
91-
sim_dat_2000,
92-
large_dat,
93-
mod_FUN_list,
94-
params_list_2000,
95-
error_FUN,
96-
method),
97-
dynamic = map(ncv_dat_2000)
98-
),
99-
perf_results_2000 = tibble(n = 2000, repeats = repeats) %>%
100-
bind_cols(ncv_results_2000)
74+
)#,
75+
#
76+
# # sample size = 2000
77+
# sim_dat_2000 = mlbench_data(2000),
78+
# params_list_2000 = create_grids(sim_dat_2000,
79+
# algorithms,
80+
# size = grid_size),
81+
# ncv_dat_2000 = create_ncv_objects(sim_dat_2000,
82+
# repeats,
83+
# method),
84+
# ncv_results_2000 = target(
85+
# run_ncv(ncv_dat_2000,
86+
# sim_dat_2000,
87+
# large_dat,
88+
# mod_FUN_list,
89+
# params_list_2000,
90+
# error_FUN,
91+
# method),
92+
# dynamic = map(ncv_dat_2000)
93+
# ),
94+
#
95+
# # sample size = 5000
96+
# sim_dat_5000 = mlbench_data(5000),
97+
# params_list_5000 = create_grids(sim_dat_5000,
98+
# algorithms,
99+
# size = grid_size),
100+
# ncv_dat_5000 = create_ncv_objects(sim_dat_5000,
101+
# repeats,
102+
# method),
103+
# ncv_results_5000 = target(
104+
# run_ncv(ncv_dat_5000,
105+
# sim_dat_5000,
106+
# large_dat,
107+
# mod_FUN_list,
108+
# params_list_5000,
109+
# error_FUN,
110+
# method),
111+
# dynamic = map(ncv_dat_5000)
112+
# )
101113

102114
)
103115

performance-experiment/functions/inner-tune.R

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -38,7 +38,7 @@ inner_tune <- function(ncv_dat, mod_FUN_list, params_list, error_FUN) {
3838
summarize_tune_results <- function(dat, mod_FUN, params) {
3939
# Return row-bound tibble that has the 25 bootstrap results
4040
param_names <- names(params)
41-
furrr::future_map_dfr(dat$splits, tune_over_params, mod_FUN, params, .progress = FALSE) %>%
41+
furrr::future_map_dfr(dat$splits, tune_over_params, mod_FUN, params) %>%
4242
lazy_dt(., key_by = param_names) %>%
4343
# For each value of the tuning parameter, compute the
4444
# average <error> which is the inner bootstrap estimate.

performance-experiment/functions/outer-cv.R

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -45,9 +45,7 @@ outer_cv <- function(ncv_dat, best_hypervals_list, mod_FUN_list, error_FUN, meth
4545
tibble(
4646
error = error
4747
)
48-
},
49-
# progress bar off when working with clusters
50-
.progress = FALSE) %>%
48+
}) %>%
5149
bind_cols(best_hyper_vals) %>%
5250
mutate_all(~round(., 6))
5351

performance-experiment/functions/run-ncv.R

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -44,4 +44,24 @@ run_ncv <- function(ncv_dat, sim_dat, large_dat, mod_FUN_list, params_list, erro
4444
error_FUN = error_FUN,
4545
method = method)
4646

47+
# if repeats == 1, then there is no repeat column (id); id becomes the fold col instead of there being an id2 col
48+
rep_status <- stringr::str_detect(ncv_dat[[1]]$id[[1]], pattern = "Repeat")
49+
50+
if (rep_status == TRUE) {
51+
# number of repeats
52+
num_reps <- ncv_dat[[1]] %>%
53+
select(id) %>%
54+
mutate(repeats = stringr::str_extract(id, pattern = "[0-9]") %>%
55+
as.numeric()) %>%
56+
slice(n()) %>%
57+
pull(repeats)
58+
} else {
59+
num_reps <- 1
60+
}
61+
62+
# cols: n, repeats, error calcs, chosen alg, chosen hyperparams
63+
final_results <- tibble(n = nrow(ncv_dat[[1]]$splits$`1`[[1]]),
64+
repeats = num_reps) %>%
65+
bind_cols(genl_perf_est)
66+
4767
}
-26.4 KB
Binary file not shown.
-134 KB
Binary file not shown.
-62.5 KB
Binary file not shown.

0 commit comments

Comments
 (0)