library(tidyverse)       # data wrangling and cleaning
library(tidymodels)      # modeling and machine learning
library(palmerpenguins)  # penguin dataset
library(gt)              # creating table objects for data
library(ranger)          # random forest model engine
library(brulee)          # neural network with torch
library(pins)            # sharing resources across sessions & users
library(vetiver)         # model versioning and deployment
library(plumber)         # API creation
library(conflicted)      # handling function conflicts

tidymodels_prefer()      # handle common conflicts with tidymodels
conflict_prefer("penguins", "palmerpenguins")
options(tidymodels.dark = TRUE)  # dark mode console messages
Exploratory Data Analysis
Note: You should do more exploration than this for a new set of data.
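For example, a quick first pass might glimpse the data and check visually how the measurements separate the sexes within each species. The plot below is only a minimal sketch; the variable choices and styling are illustrative, not the original analysis.

# quick look at the data and a simple visual check of class separation
glimpse(penguins)

penguins |>
  drop_na(sex) |>
  ggplot(aes(x = bill_length_mm, y = body_mass_g, color = sex)) +
  geom_point(alpha = 0.7) +
  facet_wrap(~species) +
  theme_minimal()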
Prepare & Split Data
# remove rows with missing sex, exclude year and island
penguins_df <- penguins |>
  drop_na(sex) |>
  select(-year, -island)

# set the seed for reproducibility
set.seed(1234)

# split the data into train and test sets stratified by sex
penguin_split <- initial_split(penguins_df, strata = sex)
penguin_train <- training(penguin_split)
penguin_test  <- testing(penguin_split)

# create folds for cross validation
penguin_folds <- vfold_cv(penguin_train)
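The ranking below assumes a workflow_set object that pairs a shared preprocessing recipe with the three model engines loaded above (logistic regression, a ranger random forest, and a brulee neural network). That object is not shown in this section, so here is a minimal sketch of how it could be assembled and tuned; the recipe steps, tuned parameters, and grid size are assumptions rather than the original code.

# a shared preprocessing recipe (steps here are illustrative)
penguin_recipe <- recipe(sex ~ ., data = penguin_train) |>
  step_dummy(all_nominal_predictors()) |>
  step_normalize(all_numeric_predictors())

# three candidate model specifications
glm_spec <- logistic_reg() |>
  set_mode("classification")

tree_spec <- rand_forest(min_n = tune()) |>
  set_engine("ranger") |>
  set_mode("classification")

torch_spec <- mlp(hidden_units = tune(), epochs = tune()) |>
  set_engine("brulee") |>
  set_mode("classification")

# combine the recipe and models into a workflow set and tune across the folds
workflow_set <- workflow_set(
  preproc = list(recipe = penguin_recipe),
  models  = list(glm = glm_spec, tree = tree_spec, torch = torch_spec)
) |>
  workflow_map(
    resamples = penguin_folds,
    grid      = 10,
    metrics   = metric_set(accuracy, roc_auc),
    verbose   = TRUE
  )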
# create table of best models defined using roc_auc metric
rank_results(workflow_set,
             rank_metric = "roc_auc",
             select_best = TRUE) |>
  gt()
wflow_id      .config               .metric   mean       std_err     n   preprocessor  model         rank
recipe_torch  Iter30                accuracy  0.8998333  0.01903287  10  recipe        mlp           1
recipe_torch  Iter30                roc_auc   0.9791730  0.01022562  10  recipe        mlp           1
recipe_glm    Preprocessor1_Model1  accuracy  0.8998333  0.01994506  10  recipe        logistic_reg  2
recipe_glm    Preprocessor1_Model1  roc_auc   0.9686469  0.01234732  10  recipe        logistic_reg  2
recipe_tree   Iter2                 accuracy  0.9118333  0.02512248  10  recipe        rand_forest   3
recipe_tree   Iter2                 roc_auc   0.9672987  0.01320487  10  recipe        rand_forest   3
Compare Model Results
Plotting performance
workflow_set |> autoplot()
best_model_id <- "recipe_glm"
Finalize Fit
# select best model
best_fit <- workflow_set |>
  extract_workflow_set_result(best_model_id) |>
  select_best(metric = "accuracy")

# create workflow for best model
final_workflow <- workflow_set |>
  extract_workflow(best_model_id) |>
  finalize_workflow(best_fit)

# fit the final model on the training set and evaluate it on the test split
final_fit <- final_workflow |>
  last_fit(penguin_split)
Final Fit Metrics
# show model performance
final_fit |>
  collect_metrics() |>
  gt()
This results in an error because the board produced by board_url() is not parseable.
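To serve the model, the fitted workflow needs to be bundled with vetiver and pinned to a board the API can read; the plumber file below reads it from a local board_folder() named "pins-r". A minimal sketch of that step (the model name and board path come from the plumber file, the rest is an assumption):

# bundle the fitted workflow as a deployable vetiver model
v <- extract_workflow(final_fit) |>
  vetiver_model(model_name = "penguins_model")

# pin the model to a local folder board so the API can read it back
board <- board_folder(path = "pins-r")
vetiver_pin_write(board, v)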
Plumber API
Original Plumber File
# Generated by the vetiver package; edit with care

library(pins)
library(plumber)
library(rapidoc)
library(vetiver)

# Packages needed to generate model predictions
if (FALSE) {
  library(parsnip)
  library(recipes)
  library(stats)
  library(workflows)
}

b <- board_folder(path = "pins-r")
v <- vetiver_pin_read(b, "penguins_model", version = "20230730T172358Z-54641")

#* @plumber
function(pr) {
  pr |> vetiver_api(v)
}
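This file is typically generated rather than written by hand; assuming the board object created when the model was pinned, the call would look roughly like this:

# generate a plumber.R that serves the pinned model
vetiver_write_plumber(board, "penguins_model", file = "plumber.R")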
Updated Plumber File
# Generated by the vetiver package; edit with care

library(pins)
library(plumber)
library(rapidoc)
library(vetiver)

# Packages needed to generate model predictions
if (FALSE) {
  library(parsnip)
  library(recipes)
  library(stats)
  library(workflows)
}

pin_loc <- pins:::github_raw("JamesHWade/r-mlops/main/pins-r/_pins.yaml")
b <- board_url(pin_loc)
v <- vetiver_pin_read(b, "penguins_model")

#* @plumber
function(pr) {
  pr |> vetiver_api(v)
}
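Before building a container, the API can be smoke-tested locally; this mirrors the ENTRYPOINT used in the Dockerfiles below.

# serve the API locally for a quick test
pr <- plumber::plumb("plumber.R")
pr$run(host = "0.0.0.0", port = 8000)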
Write Dockerfile
Original Dockerfile
# Generated by the vetiver package; edit with care
FROM rocker/r-ver:4.3.1
ENV RENV_CONFIG_REPOS_OVERRIDE https://packagemanager.rstudio.com/cran/latest

RUN apt-get update -qq && apt-get install -y --no-install-recommends \
  libcurl4-openssl-dev \
  libicu-dev \
  libsodium-dev \
  libssl-dev \
  make \
  zlib1g-dev \
  && apt-get clean

COPY vetiver_renv.lock renv.lock
RUN Rscript -e "install.packages('renv')"
RUN Rscript -e "renv::restore()"
COPY plumber.R /opt/ml/plumber.R
EXPOSE 8000
ENTRYPOINT ["R", "-e", "pr <- plumber::plumb('/opt/ml/plumber.R'); pr$run(host = '0.0.0.0', port = 8000)"]
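Like the plumber file, this Dockerfile and the vetiver_renv.lock it copies can be generated by vetiver; a sketch, assuming the vetiver model object v from earlier:

# write a Dockerfile plus vetiver_renv.lock for serving the pinned model
vetiver_write_docker(v, plumber_file = "plumber.R", path = ".")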
Updated Dockerfile
# Generated by the vetiver package; edit with care
FROM rocker/r-ver:4.3.1

# Create a non-root user to run the application
RUN useradd --create-home appuser

ENV RENV_CONFIG_REPOS_OVERRIDE=https://packagemanager.rstudio.com/cran/latest
ENV HOME=/home/appuser
WORKDIR $HOME

RUN apt-get update -qq && apt-get install -y --no-install-recommends \
  libcurl4-openssl-dev \
  libicu-dev \
  libsodium-dev \
  libssl-dev \
  make \
  zlib1g-dev \
  && apt-get clean

COPY vetiver_renv.lock renv.lock

# Create the .cache directory and give appuser permission to write to it
RUN mkdir -p /home/appuser/.cache && chown -R appuser:appuser /home/appuser/.cache

# Create the .cache/pins/url directory and give appuser permission to write to it
RUN mkdir -p /home/appuser/.cache/pins/url && chown -R appuser:appuser /home/appuser/.cache/pins/url

RUN Rscript -e "install.packages('renv')"
RUN Rscript -e "renv::restore()"
COPY plumber.R /opt/ml/plumber.R
EXPOSE 7860
ENTRYPOINT ["R", "-e", "pr <- plumber::plumb('/opt/ml/plumber.R'); pr$run(host = '0.0.0.0', port = 7860)"]
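The updated Dockerfile runs the API as a non-root user, pre-creates the pins cache directories that board_url() needs to write to, and exposes port 7860 instead of 8000, presumably to match the Hugging Face Space that the Shiny app below calls.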
library(shiny)
library(bslib)
library(vetiver)

endpoint <- vetiver_endpoint("https://jameshwade-penguins-model.hf.space/predict")

ui <- bslib::page_sidebar(
  sidebar = sidebar(
    selectInput("species", "Select Species",
                choices = c("Adelie", "Chinstrap", "Gentoo")),
    sliderInput("bill_length_mm", "Enter Bill Length (mm):",
                min = 30, max = 60, step = 0.5, value = 45),
    sliderInput("bill_depth_mm", "Enter Bill Depth (mm):",
                min = 10, max = 22, step = 0.5, value = 15),
    sliderInput("flipper_length_mm", "Enter Flipper Length (mm):",
                min = 170, max = 235, step = 0.5, value = 200),
    sliderInput("body_mass_g", "Enter Body Mass (g):",
                min = 2700, max = 6300, step = 10, value = 3500),
    actionButton("predict", "Predict"),
    open = TRUE
  ),
  verbatimTextOutput("info")
)

server <- function(input, output, session) {
  observe({
    new_data <- data.frame(
      species           = input$species,
      bill_length_mm    = input$bill_length_mm,
      bill_depth_mm     = input$bill_depth_mm,
      flipper_length_mm = input$flipper_length_mm,
      body_mass_g       = input$body_mass_g
    )
    prediction <- predict(endpoint, new_data)
    output$info <- renderPrint(prediction)
  }) |>
    bindEvent(input$predict)
}

shinyApp(ui, server)
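The same endpoint can also be called directly from R without the Shiny front end; the values below are arbitrary examples.

# call the deployed model directly
new_penguin <- data.frame(
  species           = "Adelie",
  bill_length_mm    = 40,
  bill_depth_mm     = 18,
  flipper_length_mm = 190,
  body_mass_g       = 3800
)
predict(endpoint, new_penguin)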
Model Monitoring
This is a bit of a contrived example, but it shows the idea.
set.seed(1234)

# add observation date to training data
penguin_train_by_date <- penguin_train |>
  rowwise() |>
  mutate(date_obs = Sys.Date() - sample(4:10, 1)) |>
  ungroup() |>
  arrange(date_obs)

# compute metrics on training data by date
original_metrics <- augment(v, penguin_train_by_date) |>
  vetiver_compute_metrics(
    date_var = date_obs,
    period   = "day",
    truth    = "sex",
    estimate = ".pred_class"
  )
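From here, the computed metrics would typically be stored on the board and visualized over time. A minimal sketch, assuming the board from earlier and a metrics pin name of "penguins_model_metrics":

# store the first batch of metrics and plot them over time
pin_write(board, original_metrics, "penguins_model_metrics")
vetiver_plot_metrics(original_metrics)

# on later runs, vetiver_pin_metrics() can append new metrics to the same pin:
# vetiver_pin_metrics(board, new_metrics, "penguins_model_metrics")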