misc minor corrections

ercbk · ercbk · commit ea0936042462 · 2020-02-25T20:53:31.000-05:00
diff --git a/README.Rmd b/README.Rmd
@@ -35,13 +35,13 @@ Various elements of the technique can be altered to improve performance. These i
 3. Inner-Loop CV strategy  
 4. Grid search strategy  
 
-For the performance experiemnt (question 2), I'll be varying the repeats of the outer-loop cv strategy for each method. The fastest implementation of each method will be tuned with different sizes of data ranging from 100 to 5000 observations. The mean absolute error will be calculated for each combination of repeat, data size, and method. 
+For the performance experiment (question 2), I'll be varying the repeats of the outer-loop cv strategy for each method. The fastest implementation of each method will be tuned with different sizes of data ranging from 100 to 5000 observations. The mean absolute error will be calculated for each combination of repeat, data size, and method. 
 
 I'm using a 4 core, 16 GB RAM machine.
 
 Progress (duration in seconds)  
 
-![](duration-experiment/outputs/0224-results.png)  
+![](duration-experiment/outputs/0225-results.png)  
 
 References  
 
diff --git a/README.md b/README.md
@@ -50,7 +50,7 @@ These include:
 3\. Inner-Loop CV strategy  
 4\. Grid search strategy
 
-For the performance experiemnt (question 2), I’ll be varying the repeats
+For the performance experiment (question 2), I’ll be varying the repeats
 of the outer-loop cv strategy for each method. The fastest
 implementation of each method will be tuned with different sizes of data
 ranging from 100 to 5000 observations. The mean absolute error will be
@@ -60,7 +60,7 @@ I’m using a 4 core, 16 GB RAM machine.
 
 Progress (duration in seconds)
 
-![](duration-experiment/outputs/0224-results.png)
+![](duration-experiment/outputs/0225-results.png)
 
 References
 
diff --git a/duration-experiment/kuhn-johnson/nested-cv-h2o-kj.R b/duration-experiment/kuhn-johnson/nested-cv-h2o-kj.R
@@ -70,7 +70,7 @@ ncv_dat_10 <- rsample::nested_cv(small_dat,
                               inside = bootstraps(times = 25))
 
 
-
+# Start h2o cluster
 h2o.init()
 
 
@@ -86,14 +86,15 @@ error_FUN <- function(model){
 }
 
 
+# Distributed Random Forest
 
 rf_FUN <- function(x, y, anal_h2o, ass_h2o, params) {
    
    mtries <- params$mtries[[1]]
    ntrees <- params$ntrees[[1]]
    
    # h2o usually needs unique ids or loops will return the exact same values over and over
-   modelId <- as.character(Sys.time())
+   gridId <- as.character(dqrng::dqrnorm(1))
    
    h2o.show_progress()
    
@@ -107,6 +108,8 @@ rf_FUN <- function(x, y, anal_h2o, ass_h2o, params) {
 }
 
 
+# Elastic Net Regression
+
 glm_FUN <- function(x, y, anal_h2o, ass_h2o, params) {
    
    alpha <- params$alpha[[1]]
@@ -154,6 +157,7 @@ params_list <- list(glm = list(alpha = c(0, 0.25, 0.5, 0.75, 1),
 #####################################################
 
 
+# inputs params, model, and resample, calls model and error functions, outputs error
 mod_error <- function(params, mod_FUN, dat) {
    anal_df <- rsample::analysis(dat)
    ass_df <- rsample::assessment(dat)
@@ -297,4 +301,5 @@ tic.clearlog()
 # MLflow uses waitress for Windows. Killing it also kills mlflow.exe, python.exe, console window host processes
 installr::kill_process(process = c("waitress-serve.exe"))
 
-
+# shutdown cluster
+h2o.shutdown(prompt = FALSE)
diff --git a/duration-experiment/kuhn-johnson/nested-cv-parsnip-kj.R b/duration-experiment/kuhn-johnson/nested-cv-parsnip-kj.R
@@ -101,7 +101,7 @@ pars_ranger_FUN <- function(params, analysis_set) {
 
 
 
-# Regularized Regression
+# Elastic Net Regression
 
 glm_FUN <- function(params, analysis_set) {
    alpha <- params$mixture[[1]]
diff --git a/duration-experiment/kuhn-johnson/nested-cv-ranger-kj.R b/duration-experiment/kuhn-johnson/nested-cv-ranger-kj.R
@@ -102,7 +102,7 @@ ranger_FUN <- function(params, analysis_set) {
 }
 
 
-# Regularized Regression
+# Elastic Net Regression
 
 glm_FUN <- function(params, analysis_set) {
    alpha <- params$mixture[[1]]
diff --git a/duration-experiment/kuhn-johnson/nested-cv-sklearn-kj.R b/duration-experiment/kuhn-johnson/nested-cv-sklearn-kj.R
@@ -92,6 +92,8 @@ error_FUN <- function(y_obs, y_hat){
 #####################################
 
 
+# Random Forest
+
 sklearn_rf_FUN <- function(params, analysis_set) {
    sklearn_e <- import("sklearn.ensemble")
    max_features <- r_to_py(params$mtry[[1]])
@@ -112,7 +114,7 @@ sklearn_rf_FUN <- function(params, analysis_set) {
 }
 
 
-# Regularized Regression
+# Elastic Net Regression
 
 glm_FUN <- function(params, analysis_set) {
       alpha <- params$mixture[[1]]
diff --git a/duration-experiment/kuhn-johnson/nested-cv-tune-kj.R b/duration-experiment/kuhn-johnson/nested-cv-tune-kj.R
@@ -157,6 +157,7 @@ params_list <- list(glm = glm_params, rf = rf_params)
 ################################
 
 
+# inputs params, model, and resample, calls model and error functions, outputs error
 mod_error <- function(params, mod_FUN, dat) {
    y_col <- ncol(dat$data)
    y_obs <- assessment(dat)[y_col]
diff --git a/duration-experiment/outputs/0222-results.png b/duration-experiment/outputs/0222-results.png
diff --git a/duration-experiment/outputs/0223-results.png b/duration-experiment/outputs/0223-results.png
diff --git a/duration-experiment/outputs/0224-results.png b/duration-experiment/outputs/0224-results.png
diff --git a/duration-experiment/outputs/0225-results.png b/duration-experiment/outputs/0225-results.png
diff --git a/duration-experiment/outputs/0225-runs.csv b/duration-experiment/outputs/0225-runs.csv
@@ -0,0 +1,10 @@
+Run ID,Name,Source Type,Source Name,User,Status,duration
+0525165fd1c4474aad9e1ce36136d2fc,,LOCAL,C:\Users\tbats\Documents\R\Projects\nested-cv-comp-temp\duration-experiment\Raschka\mlflow\nested-cv-py-raschka.py,tbats,FINISHED,1986.82
+e61a875fa83f4febbb4107239e7ff76e,,LOCAL,C:\Users\tbats\Documents\R\Projects\nested-cv-comp-temp\duration-experiment\Raschka\mlflow\nested-cv-retic-raschka.R,tbats,FINISHED,1984.2
+180345e9ddad49d393cf0482087176c5,,LOCAL,C:\Users\tbats\Documents\R\Projects\nested-cv-comp-temp\duration-experiment\Raschka\mlflow\nested-cv-kj-raschka.R,tbats,FINISHED,317.68
+b54322c7bd1a4993a100dc6b44b78cb8,,LOCAL,C:\Users\tbats\Documents\R\Projects\nested-cv-comp-temp\duration-experiment\Raschka\mlflow\nested-cv-mlr3-raschka.R,tbats,FINISHED,307.45
+2c66543390bc4183a288602c572d3514,,LOCAL,nested-cv-tune-kj.R,tbats,FINISHED,7034.82
+1e519c7e647845f79a027f2aba6ab89e,,LOCAL,nested-cv-h2o-kj.R,tbats,FINISHED,12374.44
+56d45204fb45490b8fe02c57377d70ef,,LOCAL,nested-cv-sklearn-kj.R,tbats,FINISHED,7405.58
+f89e830aa4744acfadcdcc30dbdb7f31,,LOCAL,nested-cv-parsnip-kj.R,tbats,FINISHED,4622.9
+f4a72f4bedf94af9a4c3748247bfa189,,LOCAL,nested-cv-ranger-kj.R,tbats,FINISHED,2593.17
diff --git a/duration-experiment/raschka/nested-cv-py-raschka.py b/duration-experiment/raschka/nested-cv-py-raschka.py
@@ -74,12 +74,13 @@
 
 np.random.seed(2019)
 
+# dotenv allows for persistent environment variables
 from dotenv import load_dotenv
 load_dotenv()
 pb_token = os.getenv('PUSHBULLET_TOKEN')
 pb = Pushbullet(pb_token)
 
-
+# make explicit the name of the experiment to record to
 mlflow.set_experiment("ncv_duration")
 
 
@@ -94,7 +95,7 @@
 with open('./data/fivek-simdat.pickle', 'rb') as fried:
       pdat = pickle.load(fried)
 
-# load penalyzed regression hyperparameter values
+# load elastic net regression hyperparameter values
 with open('./grids/elast-latin-params.pickle', 'rb') as elastp:
       elast_params = pickle.load(elastp)
 
@@ -159,16 +160,15 @@
 ####################################
 
 
+# vessel for my inner-loop grid search objects
 gridcvs = {}
 
 # shuffle = True required for setting random state
 # setting random state makes sure all algorithms tuned on the same splits
 inner_cv = KFold(n_splits = 2, shuffle = True, random_state = 1)
 
 # Setting this parameter to the size of the grid tells Random Search to use every grid value once
-elast_iter = len(elast_params)
-rf_iter = len(rf_params)
-iter_dict = {'Elastic Net': elast_iter, 'Random Forest': rf_iter}
+iter_dict = {'Elastic Net': len(elast_params), 'Random Forest': len(rf_params)}
 
 
 # Setting up multiple RandomSearchCV objects, 1 for each algorithm
@@ -194,7 +194,7 @@
 # Run nested-cv
 ####################################
 
-
+# vessel for stats on the outer fold results
 results = pd.DataFrame()
 
 # The validation set scores of the outer loop folds will be used to choose the best algorithm
@@ -278,5 +278,6 @@
 pb.push_note("Nested CV script finished", msg)
 
 
+# only necessary if running in RStudio or using reticulate::source_python
 # MLflow uses waitress for Windows. Killing it also kills mlflow.exe, python.exe, console window host processes
-os.system('taskkill /f /im waitress-serve.exe')
+# os.system('taskkill /f /im waitress-serve.exe')
diff --git a/duration-experiment/raschka/nested-cv-retic-raschka.R b/duration-experiment/raschka/nested-cv-retic-raschka.R
@@ -30,6 +30,7 @@
 ######################################
 
 
+# text me if any errors occur
 options(error = function() { 
    library(RPushbullet)
    pbPost("note", "Error", geterrmessage())
@@ -113,7 +114,7 @@ y_test <- as.numeric(dat_splits[[4]])
 # Estimators
 ######################################
 
-# Penalyzed Regression
+# Elastic Net Regression
 elast_est <- sk_lm$ElasticNet(normalize = TRUE,
                          fit_intercept = TRUE)
 
@@ -132,7 +133,7 @@ alg_list <- list(elastic_net = elast_est, rf = rf_est)
 ######################################
 
 
-# Penalyzed Regression
+# Elastic Net Regression
 elast_params <- r_to_py(dials::grid_latin_hypercube(
    dials::mixture(),
    dials::penalty(),
@@ -311,6 +312,8 @@ log.txt <- tic.log(format = TRUE)
 text_msg <- glue("{log.txt[[1]]} for script to complete
                  Results:
                  {msg}")
+
+# text me the results
 pbPost("note", title="reticulate-raschka script finished", body=text_msg)
 tic.clearlog()
 
diff --git a/environment.yml b/environment.yml
@@ -0,0 +1,63 @@
+name: null
+channels:
+  - defaults
+dependencies:
+  - certifi=2019.11.28=py36_0
+  - pip=20.0.2=py36_1
+  - python=3.6.10=h9f7ef89_0
+  - setuptools=45.2.0=py36_0
+  - sqlite=3.31.1=he774522_0
+  - vc=14.1=h0510ff6_4
+  - vs2015_runtime=14.16.27012=hf0eaf9b_1
+  - wheel=0.34.2=py36_0
+  - wincertstore=0.2=py36h7fe50ca_0
+  - pip:
+    - alembic==1.4.0
+    - chardet==3.0.4
+    - click==7.0
+    - cloudpickle==1.3.0
+    - configparser==4.0.2
+    - databricks-cli==0.9.1
+    - docker==4.2.0
+    - entrypoints==0.3
+    - flask==1.1.1
+    - gitdb2==3.0.2
+    - gitpython==3.0.8
+    - gorilla==0.3.0
+    - idna==2.9
+    - itsdangerous==1.1.0
+    - jinja2==2.11.1
+    - joblib==0.14.1
+    - mako==1.1.1
+    - markupsafe==1.1.1
+    - mlflow==1.6.0
+    - numpy==1.18.1
+    - pandas==1.0.1
+    - prometheus-client==0.7.1
+    - prometheus-flask-exporter==0.12.2
+    - protobuf==3.11.3
+    - pushbullet-py==0.11.0
+    - pypiwin32==223
+    - python-dateutil==2.8.1
+    - python-dotenv==0.11.0
+    - python-editor==1.0.4
+    - python-magic==0.4.15
+    - pytictoc==1.5.0
+    - pytz==2019.3
+    - pywin32==227
+    - pyyaml==5.3
+    - querystring-parser==1.2.4
+    - requests==2.23.0
+    - scikit-learn==0.22.1
+    - scipy==1.4.1
+    - simplejson==3.17.0
+    - six==1.14.0
+    - smmap2==2.0.5
+    - sqlalchemy==1.3.13
+    - sqlparse==0.3.0
+    - tabulate==0.8.6
+    - urllib3==1.25.8
+    - waitress==1.4.3
+    - websocket-client==0.57.0
+    - werkzeug==1.0.0
+prefix: C:\Users\tbats\Documents\R\Projects\nested-cross-validation-comparison\renv\python\condaenvs\renv-python
diff --git a/renv.lock b/renv.lock

Original file line number	Diff line number	Diff line change
`@@ -102,7 +102,7 @@ ranger_FUN <- function(params, analysis_set) {`
`102`	`102`	`}`
`103`	`103`
`104`	`104`
`105`		`-# Regularized Regression`
	`105`	`+# Elastic Net Regression`
`106`	`106`
`107`	`107`	`glm_FUN <- function(params, analysis_set) {`
`108`	`108`	`alpha <- params$mixture[[1]]`