Hyperparameter tuning with tidymodels

Source: MLOps with vetiver

Hyperparameter tuning is an important skill to πŸš€ boost model performance πŸš€


but…


you'll waste time 😴 without an intentional & systematic approach.

From last time

Load Packages & Set Options
library(tidyverse)      
library(tidymodels)     
library(palmerpenguins) # penguin dataset
library(gt)             # better tables
library(bonsai)         # tree-based models
library(conflicted)     # function conflicts
tidymodels_prefer()     # handle conflicts
conflict_prefer("penguins", "palmerpenguins")
options(tidymodels.dark = TRUE) # dark mode
theme_set(theme_bw()) # set default ggplot2 theme
Exploratory Data Analysis
penguins |>
  filter(!is.na(sex)) |>
  ggplot(aes(x     = flipper_length_mm,
             y     = bill_length_mm,
             color = sex,
             size  = body_mass_g)) +
  geom_point(alpha = 0.5) +
  facet_wrap(~species)
Prepare & Split Data
# remove rows with missing sex
# exclude year and island
penguins_df <-
  penguins |>
  drop_na(sex) |>
  select(-year, -island)

# set the seed for reproducibility
set.seed(1234)

# Split the data into train and test sets
# stratify by sex
penguin_split <- initial_split(penguins_df,
                               strata = sex)
penguin_train <- training(penguin_split)
penguin_test  <- testing(penguin_split)

# create folds for cross validation
penguin_folds <- vfold_cv(penguin_train, v = 10, strata = sex)

Create Recipe

For more on creating recipes, see the Getting Started page from the {recipes} pkgdown site. You can also learn more about the recommended ordering of steps. For recommended preprocessing steps based on model type, see the Appendix of Tidy Modeling with R.
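As a hedged illustration of that recommended ordering (not the recipe used below), a fuller preprocessing pipeline for this data might look like the following sketch; every step beyond step_dummy() is optional here and shown only for reference.

# illustrative only: steps arranged in the commonly recommended order
recipe(sex ~ ., data = penguin_train) |>
  step_impute_median(all_numeric_predictors()) |>  # impute missing values first
  step_dummy(all_nominal_predictors()) |>          # then create dummy variables
  step_zv(all_predictors()) |>                     # drop zero-variance columns
  step_normalize(all_numeric_predictors())         # center & scale numeric predictors

For the penguins data (and a tree-based model), the simple recipe below is enough.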


penguin_rec <-
  recipe(sex ~ ., data = penguin_train) |>    
  step_dummy(species)

Specify Model

Let’s specify a boosted tree model with boost_tree(), using the "lightgbm" engine that {bonsai} provides.

rlang::check_installed("lightgbm")
bt_bonsai_spec <-
  boost_tree(learn_rate     = tune(),
             stop_iter      = tune(),
             trees          = 1000) |>
  set_engine(engine     = "lightgbm",
             num_leaves = tune()) |>
  set_mode("classification")

Build Grid for Tuning

bt_bonsai_spec |> 
  extract_parameter_set_dials()
Collection of 3 parameters for tuning

 identifier       type    object
 learn_rate learn_rate nparam[+]
  stop_iter  stop_iter nparam[+]
 num_leaves num_leaves nparam[+]
# default range for the num_leaves() parameter from {dials}
num_leaves()
Number of Leaves (quantitative)
Range: [5, 100]
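Those default ranges can be changed before a grid is built. A minimal sketch using update() from {dials}; the narrower num_leaves range is an arbitrary choice for illustration, and bt_params is a name introduced only for this example.

# illustration: override a default parameter range before generating a grid
bt_params <-
  bt_bonsai_spec |>
  extract_parameter_set_dials() |>
  update(num_leaves = num_leaves(range = c(10, 60)))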

Build Grid for Tuning

grid_tune <- 
  bt_bonsai_spec |> 
  extract_parameter_set_dials() |> 
  grid_latin_hypercube(size = 50)


grid_tune |> glimpse(width = 50)
Rows: 50
Columns: 3
$ learn_rate <dbl> 4.499336e-02, 3.174162e-05, 9…
$ stop_iter  <int> 7, 17, 11, 8, 8, 13, 5, 20, 9…
$ num_leaves <int> 81, 39, 34, 74, 22, 79, 46, 8…

Fit Models & Tune Hyperparameters

Construct our workflow

bt_bonsai_wf <-
  workflow() |> 
  add_recipe(penguin_rec) |> 
  add_model(bt_bonsai_spec)

Specify the grid control parameters

cntl   <- control_grid(save_pred     = TRUE,
                       save_workflow = TRUE)

Fit Models & Tune Hyperparameters

bt_tune_grid <- 
  bt_bonsai_wf |> 
  tune_grid(
    resamples = penguin_folds,
    grid      = grid_tune,
    control   = cntl
  )
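tune_grid() computes a default set of classification metrics when none are supplied. To control which metrics are tracked, a yardstick metric set can be passed explicitly; the sketch below is optional and was not part of the tuning run above.

# optional: request specific metrics during tuning
class_metrics <- metric_set(roc_auc, accuracy, mn_log_loss)

bt_bonsai_wf |>
  tune_grid(
    resamples = penguin_folds,
    grid      = grid_tune,
    metrics   = class_metrics,
    control   = cntl
  )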

Tuning Results

autoplot(bt_tune_grid)
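autoplot() gives a visual overview; to rank the candidates numerically, show_best() (or collect_metrics()) works on the same tuning result.

# top 5 candidates ranked by ROC AUC
bt_tune_grid |>
  show_best(metric = "roc_auc", n = 5)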

Racing with {finetune}

library(finetune)
race_cntl <- control_race(save_pred     = TRUE,
                          save_workflow = TRUE)

bt_tune_race <- 
  bt_bonsai_wf |> 
  tune_race_anova(
    resamples = penguin_folds,
    grid      = grid_tune,
    control   = race_cntl
  )

Racing Results

autoplot(bt_tune_race)

Racing Results

plot_race(bt_tune_race)

Faster πŸŽοΈπŸ’¨

# a larger candidate grid: racing + parallel processing keep it affordable
big_grid <-
  bt_bonsai_spec |> 
  extract_parameter_set_dials() |> 
  grid_latin_hypercube(size = 250)

Faster πŸŽοΈπŸ’¨

# tune in parallel
library(doMC)
registerDoMC(cores = 10)
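{doMC} only works on Unix-alikes; on Windows, or for a cross-platform script, {doParallel} is a common substitute. A minimal sketch, assuming 10 workers are available on your machine:

# cross-platform alternative to doMC
library(doParallel)
cl <- makePSOCKcluster(10)
registerDoParallel(cl)
# ... run the tuning ...
# stopCluster(cl)  # release the workers when finished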

Faster πŸŽοΈπŸ’¨

# tune in parallel
library(doMC)
registerDoMC(cores = 10)

bt_tune_fast <- 
  bt_bonsai_wf |> 
  tune_race_anova(
    resamples  = penguin_folds,
    grid       = big_grid,
    control    = race_cntl
  )

Faster πŸŽοΈπŸ’¨

autoplot(bt_tune_fast)

Faster πŸŽοΈπŸ’¨

plot_race(bt_tune_fast)

πŸš€ Finalize Model πŸš€

bt_best_id <-
  bt_tune_fast |>
  select_best(metric = "roc_auc")

# finalize the workflow with the best hyperparameters,
# then fit on the training set and evaluate on the test set
best_bt_race <-
  bt_tune_fast |>
  extract_workflow() |>
  finalize_workflow(bt_best_id) |>
  last_fit(penguin_split)

πŸš€ Finalize Model πŸš€

# collect the test set metrics for the final model
best_bt_race |>
  collect_metrics()
# A tibble: 2 Γ— 4
  .metric  .estimator .estimate .config             
  <chr>    <chr>          <dbl> <chr>               
1 accuracy binary         0.905 Preprocessor1_Model1
2 roc_auc  binary         0.973 Preprocessor1_Model1

πŸš€ Finalize Model πŸš€

# plot results of test set fit
best_bt_race |>
  collect_predictions() |>
  roc_curve(sex, .pred_female) |>
  autoplot()
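Once last_fit() has run, the fitted workflow can be pulled out and used for prediction; a minimal sketch, here scoring the held-out test set (final_bt_wf is a name introduced for this example).

# pull the fitted workflow out of the last_fit() result
final_bt_wf <- extract_workflow(best_bt_race)

# class probabilities for new observations (here, the test set)
predict(final_bt_wf, new_data = penguin_test, type = "prob")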

Hyperparameter Tuning Summary

  1. Use {tune} and {finetune} to perform hyperparameter tuning
  2. Use race functions (e.g., tune_race_anova()) to quickly screen out candidate hyperparameters that are unlikely to perform well
  3. Use parallel processing to speed up tuning

Where to Next?

Iterative hyperparameter tuning πŸŒ€
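A taste of what that looks like: tune_bayes() from {tune} proposes new candidate values iteratively, based on the results seen so far. A minimal sketch that reuses the earlier grid results as a warm start; the iteration count and control settings are illustrative, not tuned recommendations.

# iterative (Bayesian) tuning, seeded with the earlier grid results
bt_tune_bayes <-
  bt_bonsai_wf |>
  tune_bayes(
    resamples = penguin_folds,
    initial   = bt_tune_grid,   # warm start from the grid search
    iter      = 25,
    control   = control_bayes(save_pred = TRUE, save_workflow = TRUE)
  )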