
Commit f80b776

Author: ercbk committed:
added drake architecture to performance experiment; runs for sample sizes 100 and 800 completed
1 parent 8f70efd commit f80b776

20 files changed (+569, -192 lines)

.gitignore

Lines changed: 2 additions & 1 deletion
@@ -2,4 +2,5 @@
 .Rhistory
 .RData
 .Ruserdata
-.env
+.env
+.drake

Lines changed: 17 additions & 0 deletions
@@ -0,0 +1,17 @@
+# Results
+
+
+library(drake); library(dplyr)
+
+# loadd(perf_results_100)
+# View(perf_results_100)
+loadd(perf_results_800)
+View(perf_results_800)
+
+# each target's build time
+bt <- build_times(starts_with("ncv_results"), digits = 4)
+View(bt)
+bt %>%
+  select(target, elapsed) %>%
+  kableExtra::kable() %>%
+  kableExtra::save_kable(file = "performance-experiment/output/kj-build-times.png")
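For reference, a minimal sketch of the {drake} cache-access pattern this script relies on (assuming the plan has already been built with make()): readd() returns a cached target as a value, while loadd() assigns it into the calling environment.

library(drake)
res_800 <- readd(perf_results_800)   # returns the cached target as a value
loadd(perf_results_100)              # assigns perf_results_100 into the current environment
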
Lines changed: 77 additions & 0 deletions
@@ -0,0 +1,77 @@
+# drake make file for Kuhn-Johnson performance experiment
+
+
+# Notes:
+# 1. see plan-kj.R for more details on how this thing works
+# 2. link to {future} issue with instructions on special PuTTY settings, https://github.com/HenrikBengtsson/future/issues/370
+
+
+# load packages, functions, and drake plan
+source("performance-experiment/packages.R")
+source("performance-experiment/functions/mlbench-data.R")
+source("performance-experiment/functions/create-ncv-objects.R")
+source("performance-experiment/functions/create-models.R")
+source("performance-experiment/functions/create-grids.R")
+source("performance-experiment/functions/inner-tune.R")
+source("performance-experiment/functions/outer-cv.R")
+source("performance-experiment/functions/ncv-compare.R")
+source("performance-experiment/functions/run-ncv.R")
+source("performance-experiment/Kuhn-Johnson/plan-kj.R")
+
+
+# text me if an error occurs
+options(error = function() {
+  library(RPushbullet)
+  pbPost("note", "Error", geterrmessage())
+  if (!interactive()) stop(geterrmessage())
+})
+
+
+
+set.seed(2019)
+
+# Using different compute sizes for each model
+ip1 <- Sys.getenv("GLMEC2IP")
+ip2 <- Sys.getenv("RFEC2IP")
+public_ips <- c(ip1, ip2)
+# ppk file converted by PuTTY from an AWS pem file
+ssh_private_key_file <- Sys.getenv("AWSKEYPATH")
+
+
+cl <- future::makeClusterPSOCK(
+
+  ## Public IP numbers of EC2 instances
+  public_ips,
+
+  ## User name (always 'ubuntu')
+  user = "ubuntu",
+
+  ## Use private SSH key registered with AWS
+  ## futureSettings is a saved PuTTY session with settings to keep ssh active
+  rshcmd = c("plink", "-ssh", "-load", "futureSettings", "-i", ssh_private_key_file),
+  rshopts = c(
+    "-sshrawlog", "ec2-ssh-raw.log"
+  ),
+
+  rscript_args = c("-e", shQuote(".libPaths('/home/rstudio/R/x86_64-pc-linux-gnu-library/3.6')")
+  ),
+  verbose = TRUE
+)
+
+
+future::plan(list(tweak(cluster, workers = cl), multiprocess))
+
+
+# verbose = 0 prints nothing, verbose = 1 prints a message as each target completes; verbose = 2 adds a progress bar that tracks target completion
+make(
+  plan,
+  verbose = 1
+)
+
+# network graph of the drake plan
+vis_drake_graph(plan, file = "performance-experiment/output/kj-plan-network.png", build_times = "build", main = "Performance Experiment")
+
+# text me when it finishes
+RPushbullet::pbPost("note", title = "kj performance experiment", body = "perf run finished")
+
+parallel::stopCluster(cl)
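A minimal local alternative to the EC2 cluster above, as a sketch only (assumption: no remote workers are available); it keeps the same sourced packages, functions, and plan, but swaps the PSOCK cluster for local multisession workers.

future::plan(future::multisession, workers = 2)   # local workers instead of the EC2 PSOCK cluster
drake::make(plan, verbose = 1)                    # same plan object from plan-kj.R
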
Lines changed: 104 additions & 0 deletions
@@ -0,0 +1,104 @@
+# Kuhn-Johnson drake plan
+
+
+# Notes:
+# 1. I broke the plan into units by sample size. I'm sure it's possible to formulate the plan to perform the whole experiment by looping the kj and raschka methods along with the sample sizes into one large, more compact plan, but I wanted units that I could run overnight on my desktop.
+# 2. sample_sizes: 100, 800, 2000, 5000, 10000 (maybe)
+# 3. I'm trying to minimize the delta_error. Delta error is the absolute difference between the average error across the outer folds of the nested cross-validation and the out-of-sample error, which uses the chosen model and parameters to predict on a simulated 100K row dataset.
+
+
+
+
+
+error_FUN <- function(y_obs, y_hat){
+  y_obs <- unlist(y_obs)
+  y_hat <- unlist(y_hat)
+  Metrics::mae(y_obs, y_hat)
+}
+
+method <- "kj"
+algorithms <- list("glmnet", "rf")
+repeats <- seq(1:5)
+grid_size <- 100
+
+plan <- drake_plan(
+  # model functions for each algorithm
+  mod_FUN_list = create_models(algorithms),
+  # data used to estimate out-of-sample error
+  # noise_sd, seed settings are the defaults
+  large_dat = mlbench_data(n = 10^5,
+                           noise_sd = 1,
+                           seed = 2019),
+  # sample size = 100
+  sim_dat_100 = mlbench_data(100),
+  # hyperparameter grids for each algorithm
+  # This probably doesn't need to be a "dynamic" target since mtry is only concerned about the number of columns in the data (see script), but I'll do it anyways
+  params_list_100 = create_grids(sim_dat_100,
+                                 algorithms,
+                                 size = grid_size),
+  # create a separate ncv data object for each repeat value
+  ncv_dat_100 = create_ncv_objects(sim_dat_100,
+                                   repeats,
+                                   method),
+  # runs nested-cv and compares ncv error with out-of-sample error
+  # outputs: ncv error, oos error, delta error, chosen algorithm, chosen hyperparameters
+  ncv_results_100 = target(
+    run_ncv(ncv_dat_100,
+            sim_dat_100,
+            large_dat,
+            mod_FUN_list,
+            params_list_100,
+            error_FUN,
+            method),
+    dynamic = map(ncv_dat_100)
+  ),
+  # add index columns to identify the results according to sample size and number of repeats
+  perf_results_100 = tibble(n = 100, repeats = repeats) %>%
+    bind_cols(ncv_results_100),
+
+  # repeat for the rest of the sample sizes
+  # sample size = 800
+  sim_dat_800 = mlbench_data(800),
+  params_list_800 = create_grids(sim_dat_800,
+                                 algorithms,
+                                 size = grid_size),
+  ncv_dat_800 = create_ncv_objects(sim_dat_800,
+                                   repeats,
+                                   method),
+  ncv_results_800 = target(
+    run_ncv(ncv_dat_800,
+            sim_dat_800,
+            large_dat,
+            mod_FUN_list,
+            params_list_800,
+            error_FUN,
+            method),
+    dynamic = map(ncv_dat_800)
+  ),
+  perf_results_800 = tibble(n = 800, repeats = repeats) %>%
+    bind_cols(ncv_results_800),
+
+  # sample size = 2000
+  sim_dat_2000 = mlbench_data(2000),
+  params_list_2000 = create_grids(sim_dat_2000,
+                                  algorithms,
+                                  size = grid_size),
+  ncv_dat_2000 = create_ncv_objects(sim_dat_2000,
+                                    repeats,
+                                    method),
+  ncv_results_2000 = target(
+    run_ncv(ncv_dat_2000,
+            sim_dat_2000,
+            large_dat,
+            mod_FUN_list,
+            params_list_2000,
+            error_FUN,
+            method),
+    dynamic = map(ncv_dat_2000)
+  ),
+  perf_results_2000 = tibble(n = 2000, repeats = repeats) %>%
+    bind_cols(ncv_results_2000)
+
+)
+
+
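A sketch of the delta-error comparison described in note 3 above; the object names are illustrative, not the actual run_ncv() internals.

ncv_error   <- mean(outer_fold_errors)             # average MAE across the outer folds
oos_error   <- error_FUN(y_obs_100k, y_hat_100k)   # MAE of the chosen model on the 100K-row simulated set
delta_error <- abs(ncv_error - oos_error)          # the quantity being minimized
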

performance-experiment/create-grids.R renamed to performance-experiment/functions/create-grids.R

Lines changed: 11 additions & 4 deletions
@@ -8,9 +8,11 @@
 # "glmnet" = Elastic Net regression
 # "svm" = Support Vector Machines
 
-pacman::p_load(dplyr)
+# output: list of grid objects
 
-create_grids <- function(algorithms, size = 100) {
+
+
+create_grids <- function(sim_dat, algorithms, size = 100) {
 
   # Elastic Net Regression
 
@@ -22,9 +24,12 @@ create_grids <- function(algorithms, size = 100) {
 
   # Random Forest
 
+  # Some of the parsnip model parameters have "unknown" for the default value ranges. finalize replaces the unknowns with values based on the data.
+  mtry_updated <- dials::finalize(dials::mtry(), select(sim_dat, -ncol(sim_dat)))
+
   rf_params <- dials::grid_latin_hypercube(
-    dials::mtry(range = c(3, 4)),
-    dials::trees(range = c(200, 300)),
+    mtry_updated,
+    dials::trees(),
     size = size
   )
 
@@ -36,6 +41,8 @@ create_grids <- function(algorithms, size = 100) {
     size = size
   )
 
+  # list of grid objects depending on the algorithms inputted (switch is pretty cool)
+  # stop_glue throws an error if the algorithm inputted isn't available (should be in the glue pkg but isn't)
   grid_list <- purrr::map(algorithms, function(alg) {
     switch(alg,
       rf = rf_params -> alg_grid,
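A small illustration of the finalize() behaviour the new comment refers to; mtcars is a stand-in data frame, not part of the experiment.

library(dials); library(dplyr)
predictors <- select(mtcars, -mpg)
mtry()                         # range is [1, ?]; the upper bound is unknown()
finalize(mtry(), predictors)   # upper bound becomes ncol(predictors), i.e. 10
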

performance-experiment/create-models.R renamed to performance-experiment/functions/create-models.R

Lines changed: 3 additions & 1 deletion
@@ -7,7 +7,7 @@
 
 # output: list of model functions
 
-pacman::p_load(dplyr)
+
 
 create_models <- function(algorithms) {
 
@@ -39,6 +39,8 @@ create_models <- function(algorithms) {
     model
   }
 
+  # list of model objects depending on the algorithms inputted (switch is pretty cool)
+  # stop_glue throws an error if the algorithm inputted isn't available (should be in the glue pkg but isn't)
   mod_FUN_list <- purrr::map(algorithms, function(alg) {
     switch(alg,
       rf = ranger_FUN -> mod_fun,

performance-experiment/create-ncv.R renamed to performance-experiment/functions/create-ncv-objects.R

Lines changed: 14 additions & 1 deletion
@@ -1,15 +1,28 @@
 # nested-cv data function
 
+# inputs:
+# 1. dat = dataset
+# 2. repeats = numeric vector with numbers of repeats
+# 3. method = "kj" or "raschka"
+# outputs:
+# 1. list of {rsample} nested cv objects; one object per repeat value
 
 
 
-create_ncv <- function(dat, repeats, method) {
+create_ncv_objects <- function(dat, repeats, method) {
 
   attempt::stop_if_not(repeats, is.numeric, "repeats needs to be a numeric class")
   attempt::stop_if_not(method, is.character, "method needs to be a character class")
 
+  # don't remember but guessing crossing needs a list object
+  if (is.data.frame(dat)) {
+    dat <- list(dat)
+  }
+  # tibble grid of data and repeats
   grid <- tidyr::crossing(dat, repeats)
 
+  # generate list of ncv objects
+  # dynGet needed to get reps out of the environment and into the nested_cv function
   if (method == "kj") {
     ncv_list <- purrr::map2(grid$dat, grid$repeats, function(dat, reps) {
       rsample::nested_cv(dat,
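A sketch of the {rsample} object this produces for a single repeat value; the fold and bootstrap counts are illustrative, not necessarily the ones used in the experiment.

library(rsample)
ncv <- nested_cv(mtcars,
                 outside = vfold_cv(v = 10, repeats = 2),
                 inside  = bootstraps(times = 25))
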
Lines changed: 68 additions & 0 deletions
@@ -0,0 +1,68 @@
+# inner loop tuning function
+
+# inputs:
+# 1. ncv_dat = one ncv object from the list created by create-ncv-objects.R
+# 2. mod_FUN_list = all the model objects created by create-models.R
+# 3. params_list = all the hyperparameter grids created by create-grids.R
+# 4. error_FUN = specified at the start of plan-<method>.R
+
+# outputs: df of hyperparameters for each fold that was chosen in the inner-loop
+
+
+
+inner_tune <- function(ncv_dat, mod_FUN_list, params_list, error_FUN) {
+
+  # inputs params, model, and resample, calls model and error functions, outputs error
+  mod_error <- function(params, mod_FUN, dat) {
+    y_col <- ncol(dat$data)
+    y_obs <- rsample::assessment(dat)[y_col]
+    mod <- mod_FUN(params, rsample::analysis(dat))
+    pred <- predict(mod, rsample::assessment(dat))
+    if (!is.data.frame(pred)) {
+      pred <- pred$predictions
+    }
+    error <- error_FUN(y_obs, pred)
+    error
+  }
+
+  # inputs resample, loops hyperparam grid values to model/error function, collects error value for hyperparam combo
+  tune_over_params <- function(dat, mod_FUN, params) {
+    params$error <- purrr::map_dbl(1:nrow(params), function(row) {
+      params <- params[row, ]
+      mod_error(params, mod_FUN, dat)
+    })
+    params
+  }
+
+  # inputs and sends fold's resamples to tuning function, collects and averages fold's error for each hyperparameter combo
+  summarize_tune_results <- function(dat, mod_FUN, params) {
+    # Return row-bound tibble that has the 25 bootstrap results
+    param_names <- names(params)
+    furrr::future_map_dfr(dat$splits, tune_over_params, mod_FUN, params, .progress = FALSE) %>%
+      lazy_dt(., key_by = param_names) %>%
+      # For each value of the tuning parameter, compute the
+      # average <error> which is the inner bootstrap estimate.
+      group_by_at(vars(param_names)) %>%
+      summarize(mean_error = mean(error, na.rm = TRUE),
+                sd_error = sd(error, na.rm = TRUE),
+                n = length(error)) %>%
+      as_tibble()
+  }
+
+  tune_algorithms <- furrr::future_map2(mod_FUN_list, params_list, function(mod_FUN, params) {
+    tuning_results <- purrr::map(ncv_dat$inner_resamples, summarize_tune_results, mod_FUN, params)
+
+    # Choose best hyperparameter combos across all the resamples for each fold (e.g. 5 repeats x 10 folds = 50 best hyperparam combos)
+    best_hyper_vals <- tuning_results %>%
+      purrr::map_df(function(dat) {
+        dat %>%
+          filter(mean_error == min(mean_error)) %>%
+          arrange(sd_error) %>%
+          slice(1)
+      }) %>%
+      select(names(params))
+  })
+}
+
+
+
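A hypothetical single-repeat call following the documented inputs above; best_params_per_fold is an illustrative name, and the other objects are the targets defined in plan-kj.R.

best_params_per_fold <- inner_tune(ncv_dat_100[[1]], mod_FUN_list, params_list_100, error_FUN)
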

performance-experiment/ncv-compare.R renamed to performance-experiment/functions/ncv-compare.R

Lines changed: 13 additions & 1 deletion
@@ -3,6 +3,17 @@
 
 # Chooses the best algorithm, fits best model on entire training set, predicts against large simulated data set
 
+# inputs:
+# 1. train_dat = the entire training dataset
+# 2. large_dat = the test dataset
+# 3. cv_stats = outer_cv.R output: df with chosen model, outer fold stats, hyperparams
+# 4. mod_FUN_list = list of model objects created from create_models.R
+# 5. params_list = list of hyperparameter grids created from create_grids.R
+# 6. error_FUN = error function given at the start of plan_<method>.R
+# 7. method = "kj" or "raschka", given at the start of plan_<method>.R
+
+# output: df with algorithm, hyperparams, and error values
+
 
 ncv_compare <- function(train_dat, large_dat, cv_stats, mod_FUN_list, params_list, error_FUN, method) {
 
@@ -27,9 +38,10 @@ ncv_compare <- function(train_dat, large_dat, cv_stats, mod_FUN_list, params_lis
       select(names(params_list[[chosen_alg]]))
   }
 
+  # fit model over entire training set
   fit <- mod_FUN(params, train_dat)
 
-  # fit <- mod_FUN(params, ncv_dat_list$sim_data[[1]])
+  # predict on test set
   preds <- predict(fit, large_dat)
   if (!is.data.frame(preds)) {
     preds <- preds$predictions
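A hypothetical call shape following the documented inputs; cv_stats stands for the outer-cv.R output, and in the pipeline this is presumably invoked from within run_ncv() rather than called directly.

compare_df <- ncv_compare(sim_dat_100, large_dat, cv_stats,
                          mod_FUN_list, params_list_100, error_FUN, method = "kj")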
