ercbk
diff --git a/‎README.Rmd‎
Lines changed: 54 additions & 30 deletions b/‎README.Rmd‎
Lines changed: 54 additions & 30 deletions
diff --git a/‎README.md‎
Lines changed: 4 additions & 12 deletions b/‎README.md‎
Lines changed: 4 additions & 12 deletions
diff --git a/‎README_files/figure-gfm/unnamed-chunk-1-1.png‎
7.89 KB b/‎README_files/figure-gfm/unnamed-chunk-1-1.png‎
7.89 KB
diff --git a/‎duration-experiment/outputs/duration-pkg-tbl.png‎
40.5 KB b/‎duration-experiment/outputs/duration-pkg-tbl.png‎
40.5 KB
diff --git a/‎duration-experiment/package-sources-gt-tbl.R‎
Lines changed: 54 additions & 0 deletions b/‎duration-experiment/package-sources-gt-tbl.R‎
Lines changed: 54 additions & 0 deletions
@@ -38,44 +38,68 @@ Various elements of the technique can be altered to improve performance. These i
 4. Grid search strategy  
 
 For the performance experiment (question 2), I'll be varying the repeats of the outer-loop cv strategy for each method. The fastest implementation of each method will be tuned with different sizes of data ranging from 100 to 5000 observations. The mean absolute error will be calculated for each combination of repeat, data size, and method. 
-
-Notes: 
-
-1. I'm using a 4 core, 16 GB RAM machine.  
-2. "parsnip" refers to the script where both the Elastic Net and Ranger Random Forest model functions come from {parsnip}  
-3. "ranger" means the Random Forest model function that's used is directly from the {ranger} package.  
-4. In "sklearn", the Random Forest model function comes for scikit-learn.  
-5. "ranger-kj" uses all the Kuhn-Johnson loop functions and the {ranger} Random Forest model function to execute Raschka's method.  
 
 
 
 Progress (duration in seconds)  
 
 ![](duration-experiment/outputs/0225-results.png)  
 
+![](duration-experiment/outputs/duration-pkg-tbl.png)  
+
+```{r, echo=FALSE, message=FALSE}
+pacman::p_load(extrafont, dplyr, ggplot2, patchwork, stringr, tidytext)
+
+# Recorded run durations, one row per method/implementation combination.
+runs_raw <- readr::read_rds("data/duration-runs.rds")
+
+# Divide durations by 60 so the values match the "minutes" subtitle, title-case
+# the implementation labels, and order them by duration separately within each
+# method so that every panel is sorted on its own.
+runs <- runs_raw %>%
+  mutate(
+    duration = round(duration / 60, 2),
+    implementation = as.factor(str_to_title(implementation)),
+    implementation = reorder_within(implementation, duration, method)
+  )
+
+# Horizontal bar chart of the durations recorded for a single method.
+#
+#   data         data frame with `method`, `implementation`, `duration` columns
+#   method_key   value of `method` whose rows are plotted
+#   panel_title  title shown above the panel
+#   bar_fill     fill colour of the bars
+#
+# Returns a ggplot object.
+plot_method_durations <- function(data, method_key, panel_title, bar_fill) {
+  data %>%
+    filter(method == method_key) %>%
+    ggplot(aes(y = duration, x = implementation, label = duration)) +
+    # geom_col() is the stat = "identity" form of geom_bar()
+    geom_col(width = 0.50, fill = bar_fill) +
+    coord_flip() +
+    # removes the suffix reorder_within() appends to the axis labels
+    scale_x_reordered() +
+    # hjust > 1 places each value label inside the end of its bar
+    geom_text(hjust = 1.3, size = 3.5, color = "white") +
+    labs(x = NULL, y = NULL, title = panel_title) +
+    theme(plot.title = element_text(size = rel(0.9)))
+}
+
+raschka <- plot_method_durations(runs, "raschka", "Raschka", "#195198")
+kj <- plot_method_durations(runs, "kj", "Kuhn-Johnson", "#BD9865")
+
+# patchwork: `+` places the panels side by side; `&` applies the theme to
+# every panel rather than only the last one added.
+durations <- raschka + kj +
+  plot_annotation(
+    title = "Durations",
+    subtitle = "minutes"
+  ) &
+  theme(
+    text = element_text(family = "Roboto"),
+    axis.ticks = element_blank(),
+    panel.background = element_rect(fill = "ivory", colour = "ivory"),
+    plot.background = element_rect(fill = "ivory"),
+    plot.subtitle = element_text(size = rel(0.85)),
+    panel.border = element_blank(),
+    panel.grid.major = element_blank(),
+    panel.grid.minor = element_blank()
+  )
+durations
 
-```{r, echo=FALSE, eval=FALSE, message=FALSE}
-library(dplyr, quietly = TRUE)
-library(echarts4r, quietly = TRUE)
-
-runs <- readr::read_rds("data/duration-runs.rds")
-
-e_common(
-      font_family = "Roboto Medium",
-      theme = NULL
-)
-
-runs %>% 
-      group_by(method) %>% 
-      arrange(duration)  %>% 
-      mutate(duration = round(duration/60, 2)) %>% 
-      e_charts(implementation) %>% 
-      e_bar(duration) %>% 
-      e_flip_coords() %>% 
-      e_tooltip() %>%
-      e_legend() %>% 
-      e_title("Duration", "minutes") %>% 
-      e_theme_custom('{"color":["#195198","#BD9865"], "backgroundColor": "ivory"}')
 
 
 ```
 
@@ -58,22 +58,14 @@ implementation of each method will be tuned with different sizes of data
 ranging from 100 to 5000 observations. The mean absolute error will be
 calculated for each combination of repeat, data size, and method.
 
-Notes:
-
-1.  I’m using a 4 core, 16 GB RAM machine.  
-2.  “parsnip” refers to the script where both the Elastic Net and Ranger
-    Random Forest model functions come from {parsnip}  
-3.  “ranger” means the Random Forest model function that’s used is
-    directly from the {ranger} package.  
-4.  In “sklearn”, the Random Forest model function comes for
-    scikit-learn.  
-5.  “ranger-kj” uses all the Kuhn-Johnson loop functions and the
-    {ranger} Random Forest model function to execute Raschka’s method.
-
 Progress (duration in seconds)
 
 ![](duration-experiment/outputs/0225-results.png)
 
+![](duration-experiment/outputs/duration-pkg-tbl.png)
+
+![](README_files/figure-gfm/unnamed-chunk-1-1.png)<!-- -->
+
 References
 
 Boulesteix, AL, and C Strobl. 2009. “Optimal Classifier Selection and
 
@@ -0,0 +1,54 @@
+# GT table: Package Sources for the Model Functions
+#
+# Builds a gt table with one column per implementation and one row per
+# algorithm; each cell names the package that supplies the model function.
+# The table is saved to duration-experiment/outputs/duration-pkg-tbl.png.
+# All paths are relative to the project root, so run the script from there.
+
+
+pacman::p_load(tibble, dplyr, tidyr, gt)
+
+# Project-relative input path (the same file README.Rmd reads), so the input
+# and the output below resolve from one working directory.
+runs_raw <- readr::read_rds("data/duration-runs.rds") %>%
+  mutate(implementation = stringr::str_to_title(implementation))
+
+
+# packages used for the model functions ordered by implementation in the runs_raw file
+elast_net <- c(
+  "sklearn", "sklearn", "parsnip-glmnet", "mlr-glmnet", "parsnip-glmnet",
+  "h2o", "parsnip-glmnet", "parsnip-glmnet", "parsnip-glmnet"
+)
+rand_forest <- c(
+  "sklearn", "sklearn", "ranger", "mlr-ranger", "parsnip-ranger",
+  "h2o", "sklearn", "parsnip-ranger", "ranger"
+)
+
+# The vectors above are matched to implementations by position only, so fail
+# early if the data file no longer has one row per entry.
+# NOTE(review): this cannot detect a changed row order; verify the order
+# against runs_raw whenever the data file is regenerated.
+stopifnot(
+  length(elast_net) == nrow(runs_raw),
+  length(rand_forest) == nrow(runs_raw)
+)
+
+
+# One-row tibble with a column per implementation holding the package name
+# given at the matching position of `pkgs`.
+pkg_row <- function(pkgs) {
+  runs_raw %>%
+    select(implementation) %>%
+    bind_cols(pkg = pkgs) %>%
+    pivot_wider(names_from = implementation, values_from = pkg)
+}
+
+elast_dat <- pkg_row(elast_net)
+
+# implementations as cols, algorithm as rows, values = package used
+model_dat <- pkg_row(rand_forest) %>%
+  bind_rows(elast_dat) %>%
+  mutate(rowname = c("Random Forest", "Elastic Net"))
+
+# NOTE(review): vars() and the `colors` argument of data_color() are
+# reportedly deprecated in newer gt releases; check the installed gt version
+# before modernising these calls.
+model_dat %>%
+  gt() %>%
+  tab_spanner(
+    label = "Implementation",
+    columns = everything()
+  ) %>%
+  # Single-colour palette: every listed package value gets the same fill.
+  # Presumably these columns are the Raschka implementations (same blue as
+  # the README duration plot); confirm.
+  data_color(
+    columns = vars(Reticulate, Python, Mlr3, `Ranger-Kj`),
+    colors = scales::col_factor(
+      palette = "#195198",
+      domain = c("sklearn", "ranger", "parsnip-glmnet", "mlr-ranger", "mlr-glmnet")
+    )
+  ) %>%
+  # Presumably the Kuhn-Johnson implementations; confirm.
+  data_color(
+    columns = vars(Tune, H2o, Sklearn, Parsnip, Ranger),
+    colors = scales::col_factor(
+      palette = "#BD9865",
+      domain = c("sklearn", "ranger", "parsnip-glmnet", "parsnip-ranger", "h2o")
+    )
+  ) %>%
+  tab_style(
+    style = cell_text(align = "center"),
+    locations = cells_body()
+  ) %>%
+  tab_options(
+    table.background.color = "ivory",
+    table.border.top.style = "None"
+  ) %>%
+  tab_header(
+    title = "Package Sources for the Model Functions"
+  ) %>%
+  gtsave(
+    filename = "duration-pkg-tbl.png",
+    path = "duration-experiment/outputs"
+  )