Skip to content

Commit 2335466

Browse files
author
ercbk
committed
added bar chart and table to readme
1 parent de6b479 commit 2335466

File tree

5 files changed

+112
-42
lines changed

5 files changed

+112
-42
lines changed

README.Rmd

Lines changed: 54 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -38,44 +38,68 @@ Various elements of the technique can be altered to improve performance. These i
3838
4. Grid search strategy
3939

4040
For the performance experiment (question 2), I'll be varying the repeats of the outer-loop cv strategy for each method. The fastest implementation of each method will be tuned with different sizes of data ranging from 100 to 5000 observations. The mean absolute error will be calculated for each combination of repeat, data size, and method.
41-
42-
Notes:
43-
44-
1. I'm using a 4 core, 16 GB RAM machine.
45-
2. "parsnip" refers to the script where both the Elastic Net and Ranger Random Forest model functions come from {parsnip}
46-
3. "ranger" means the Random Forest model function that's used is directly from the {ranger} package.
47-
4. In "sklearn", the Random Forest model function comes from scikit-learn.
48-
5. "ranger-kj" uses all the Kuhn-Johnson loop functions and the {ranger} Random Forest model function to execute Raschka's method.
4941

5042

5143

5244
Progress (duration in seconds)
5345

5446
![](duration-experiment/outputs/0225-results.png)
5547

48+
![](duration-experiment/outputs/duration-pkg-tbl.png)
49+
50+
```{r, echo=FALSE, message=FALSE}
51+
pacman::p_load(extrafont, dplyr, ggplot2, patchwork, stringr, tidytext)
52+
53+
54+
55+
# Load the saved duration results; the code below reads the columns
# `duration`, `implementation` and `method` from it.
runs_raw <- readr::read_rds("data/duration-runs.rds")
56+
57+
58+
59+
# Build the plotting frame:
#  - duration is divided by 60 and rounded to 2 decimals (presumably seconds ->
#    minutes, given the "minutes" subtitle on the combined chart; confirm units)
#  - implementation labels are title-cased
#  - reorder_within() orders implementations by duration separately within each
#    method, so each per-method panel is sorted on its own; it must be paired
#    with scale_x_reordered() when plotting.
runs <- runs_raw %>%
60+
mutate(duration = round(duration/60, 2),
61+
implementation = as.factor(str_to_title(implementation)),
62+
implementation = reorder_within(implementation, duration, method))
63+
64+
65+
# Horizontal bar chart of duration per implementation for the "raschka" method.
# Each bar is labelled with its own duration value.
raschka <- runs %>%
66+
filter(method == "raschka") %>%
67+
ggplot(aes(y = duration, x = implementation, label = duration)) +
68+
geom_bar(stat = "identity", width = 0.50, fill = "#195198") +
69+
coord_flip() +
70+
# Strips the suffix that reorder_within() appended to the factor levels.
scale_x_reordered() +
71+
# hjust > 1 pulls the white label back inside the end of the flipped bar.
geom_text(hjust = 1.3, size = 3.5, color = "white") +
72+
labs(x = NULL, y = NULL,
73+
title = "Raschka") +
74+
theme(plot.title = element_text(size = rel(0.9)))
75+
76+
77+
# Horizontal bar chart of duration per implementation for the "kj" method.
# Same construction as the Raschka panel; only the filter, fill colour and
# title differ.
kj <- runs %>%
78+
filter(method == "kj") %>%
79+
ggplot(aes(y = duration, x = implementation, label = duration)) +
80+
geom_bar(stat = "identity", width = 0.50, fill = "#BD9865") +
81+
coord_flip() +
82+
# Strips the suffix that reorder_within() appended to the factor levels.
scale_x_reordered() +
83+
# hjust > 1 pulls the white label back inside the end of the flipped bar.
geom_text(hjust = 1.3, size = 3.5, color = "white") +
84+
labs(x = NULL, y = NULL,
85+
title = "Kuhn-Johnson") +
86+
theme(plot.title = element_text(size = rel(0.9)))
87+
88+
# Combine the two method panels with patchwork and add an overall
# title/subtitle. The `&` operator applies the theme() that follows to every
# panel in the composition (and to the annotation), not just the last plot.
# NOTE(review): the "Roboto" family presumably has to be registered with R
# (extrafont is loaded in this chunk) -- confirm it is available when knitting.
durations <- raschka + kj +
89+
plot_annotation(title = "Durations",
90+
subtitle = "minutes") &
91+
theme(text = element_text(family = "Roboto"),
92+
axis.ticks = element_blank(),
93+
panel.background = element_rect(fill = "ivory",
94+
colour = "ivory"),
95+
plot.background = element_rect(fill = "ivory"),
96+
plot.subtitle = element_text(size = rel(0.85)),
97+
panel.border = element_blank(),
98+
panel.grid.major = element_blank(),
99+
panel.grid.minor = element_blank()
100+
)
101+
# Print the composed figure so it is rendered as the chunk's output.
durations
56102
57-
```{r, echo=FALSE, eval=FALSE, message=FALSE}
58-
library(dplyr, quietly = TRUE)
59-
library(echarts4r, quietly = TRUE)
60-
61-
# Earlier echarts4r version of the duration chart. Its chunk header sets
# eval=FALSE, so this code was not executed when the README was knit.
runs <- readr::read_rds("data/duration-runs.rds")
62-
63-
# Set the font family shared by the echarts4r charts created afterwards.
e_common(
64-
font_family = "Roboto Medium",
65-
theme = NULL
66-
)
67-
68-
# Flipped bar chart of duration (divided by 60, rounded to 2 decimals) per
# implementation, on data grouped by method, with a custom two-colour palette
# on an ivory background.
runs %>%
69-
group_by(method) %>%
70-
arrange(duration) %>%
71-
mutate(duration = round(duration/60, 2)) %>%
72-
e_charts(implementation) %>%
73-
e_bar(duration) %>%
74-
e_flip_coords() %>%
75-
e_tooltip() %>%
76-
e_legend() %>%
77-
e_title("Duration", "minutes") %>%
78-
e_theme_custom('{"color":["#195198","#BD9865"], "backgroundColor": "ivory"}')
79103
80104
81105
```

README.md

Lines changed: 4 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -58,22 +58,14 @@ implementation of each method will be tuned with different sizes of data
5858
ranging from 100 to 5000 observations. The mean absolute error will be
5959
calculated for each combination of repeat, data size, and method.
6060

61-
Notes:
62-
63-
1. I’m using a 4 core, 16 GB RAM machine.
64-
2. “parsnip” refers to the script where both the Elastic Net and Ranger
65-
Random Forest model functions come from {parsnip}
66-
3. “ranger” means the Random Forest model function that’s used is
67-
directly from the {ranger} package.
68-
4. In “sklearn”, the Random Forest model function comes from
69-
scikit-learn.
70-
5. “ranger-kj” uses all the Kuhn-Johnson loop functions and the
71-
{ranger} Random Forest model function to execute Raschka’s method.
72-
7361
Progress (duration in seconds)
7462

7563
![](duration-experiment/outputs/0225-results.png)
7664

65+
![](duration-experiment/outputs/duration-pkg-tbl.png)
66+
67+
![](README_files/figure-gfm/unnamed-chunk-1-1.png)<!-- -->
68+
7769
References
7870

7971
Boulesteix, AL, and C Strobl. 2009. “Optimal Classifier Selection and
7.89 KB
Loading
40.5 KB
Loading
Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,54 @@
1+
# GT table: Package Sources for the Model Functions
2+
3+
4+
pacman::p_load(tibble, dplyr, tidyr, gt)
5+
6+
# Load the duration results and title-case the implementation labels; these
# labels become the column names of the table built below.
# NOTE(review): this input path starts at "R/Projects/...", while the gtsave()
# output path at the end of the script is "duration-experiment/outputs" -- the
# two look like they assume different working directories; confirm.
runs_raw <- readr::read_rds("R/Projects/nested-cross-validation-comparison/data/duration-runs.rds") %>%
7+
mutate(implementation = stringr::str_to_title(implementation))
8+
9+
10+
# packages used for the model functions ordered by implementation in the runs_raw file
11+
# Positional lookup vectors (nine entries each): entry i is the package that
# supplied the model function for the implementation in row i of runs_raw.
# They are attached with bind_cols() below, so they silently misalign if the
# row order or row count of runs_raw changes.
elast_net <- c("sklearn", "sklearn", "parsnip-glmnet", "mlr-glmnet", "parsnip-glmnet", "h2o", "parsnip-glmnet", "parsnip-glmnet", "parsnip-glmnet")
12+
rand_forest <- c("sklearn", "sklearn", "ranger", "mlr-ranger", "parsnip-ranger", "h2o", "sklearn", "parsnip-ranger", "ranger")
13+
14+
15+
# Wide table for Elastic Net: one column per implementation, holding the
# package used for that implementation's Elastic Net model function.
elast_dat <- runs_raw %>%
16+
select(implementation) %>%
17+
bind_cols(`Elastic Net` = elast_net) %>%
18+
pivot_wider(names_from = implementation, values_from = `Elastic Net`)
19+
20+
# implementations as cols, algorithm as rows, values = package used
21+
# Same reshaping for Random Forest, then the Elastic Net table is appended
# beneath it. The `rowname` labels are assigned by position, so their order
# ("Random Forest", then "Elastic Net") must match that stacking order.
# gt() uses a column named "rowname" as the row-label stub by default.
model_dat <- runs_raw %>%
22+
select(implementation) %>%
23+
bind_cols(`Random Forest` = rand_forest) %>%
24+
pivot_wider(names_from = implementation, values_from = `Random Forest`) %>%
25+
bind_rows(elast_dat) %>%
26+
mutate(rowname = c("Random Forest", "Elastic Net"))
27+
28+
# Render the package-source table with gt and save it as a PNG.
model_dat %>%
29+
gt() %>%
30+
tab_spanner(
31+
label = "Implementation",
32+
columns = everything()
33+
) %>%
34+
# Fill the first group of implementation columns with "#195198". The palette
# is a single colour, so every value listed in `domain` gets the same fill.
# NOTE(review): this is the fill used for the Raschka bar chart in README.Rmd,
# so these columns are presumably the Raschka-method implementations -- confirm.
data_color(columns = vars(Reticulate, Python, Mlr3, `Ranger-Kj`),
35+
colors = scales::col_factor(
36+
palette = "#195198",
37+
domain = c("sklearn", "ranger", "parsnip-glmnet", "mlr-ranger", "mlr-glmnet" ))) %>%
38+
# Fill the remaining implementation columns with "#BD9865" (the fill used for
# the Kuhn-Johnson bar chart in README.Rmd), again as a single solid colour.
data_color(columns = vars(Tune, H2o, Sklearn, Parsnip, Ranger),
39+
colors = scales::col_factor(
40+
palette = "#BD9865",
41+
domain = c("sklearn", "ranger", "parsnip-glmnet", "parsnip-ranger", "h2o" ))) %>%
42+
tab_style(
43+
style = cell_text(align = "center"),
44+
locations = cells_body()
45+
) %>%
46+
tab_options(
47+
table.background.color = "ivory",
48+
table.border.top.style = "None"
49+
) %>%
50+
tab_header(
51+
title = "Package Sources for the Model Functions"
52+
) %>%
53+
# Write the image that the README embeds as
# duration-experiment/outputs/duration-pkg-tbl.png.
gtsave(filename = "duration-pkg-tbl.png",
54+
path = "duration-experiment/outputs")

0 commit comments

Comments
 (0)