20  Model Selection (Prediction)

When you use a DML approach, the first-stage estimations are prediction tasks. In this section, R code to select a model for the first-stage estimations is presented. We first implement the task using mlr3 (Section 20.1) and then use the reticulate package to emulate from within R what is done in Python in ?sec-python-model-selection (Section 20.2).

Packages to load for replication
library(data.table)
library(reticulate)
library(mlr3verse)
library(tidyverse)

20.1 R implementation using mlr3

You can use benchmark_grid() and benchmark() to imitate GridSearchCVList() from the Python econml package with a bit of extra coding.

Let’s use the mtcars data for demonstration.

#=== load the mtcars data ===#
data("mtcars", package = "datasets")

reg_task <-
  TaskRegr$new(
    id = "regression",
    backend = mtcars,
    target = "mpg"
  )

We can create a list of learners that vary in the values of the hyper-parameters we would like to tune. Suppose you are interested in using random forest and extreme gradient boosting. We can create such a list for each of them and then combine the two. Let’s do that for random forest first. It is much like what we did for GridSearchCVList() in ?sec-python-model-selection.

First, create a sequence of values for each parameter to be tuned.

#=== sequence of values ===#
mtry_seq <- c(1, 3, 5)
min_node_size_seq <- c(5, 10, 20)

We can then create the grid of all parameter combinations ourselves.

(
search_space_ranger <- 
  data.table::CJ(
    mtry = mtry_seq,
    min_node_size = min_node_size_seq
  )
)
   mtry min_node_size
1:    1             5
2:    1            10
3:    1            20
4:    3             5
5:    3            10
6:    3            20
7:    5             5
8:    5            10
9:    5            20

We now loop over the rows of search_space_ranger to create a learner for each parameter pair (row) stored in search_space_ranger.

lrn_list_ranger <-
  lapply(
    seq_len(nrow(search_space_ranger)),
    function(x) {
      lrn(
        "regr.ranger", 
        mtry = search_space_ranger[x, mtry], 
        min.node.size = search_space_ranger[x, min_node_size]
      )
    }
  )
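
As a quick sanity check (an addition here, using the standard mlr3 Learner API), you can confirm that one learner was created per row of the grid and inspect the parameter values of any one of them.

#=== one learner per row of the grid ===#
length(lrn_list_ranger)

#=== parameter values of the first learner ===#
lrn_list_ranger[[1]]$param_set$values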

We can do the same for xgboost.

#=== sequence of values ===#
nrounds_seq <- seq(100, 400, by = 100)
eta_seq <- seq(0.01, 1, length = 5)

#=== define the grid ===#
search_space_xgb <- 
  data.table::CJ(
    nrounds = nrounds_seq,
    eta = eta_seq
  )

lrn_list_xgboost <-
  lapply(
    seq_len(nrow(search_space_xgb)),
    function(x) {
      lrn(
        "regr.xgboost",
        nrounds = search_space_xgb[x, nrounds],
        eta = search_space_xgb[x, eta]
      )
    }
  )

Now, we combine all the learners we want to evaluate.

all_learners <- c(lrn_list_ranger, lrn_list_xgboost)
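
Note that all the ranger learners share the id "regr.ranger" (and likewise for xgboost), which makes them hard to tell apart in the benchmark results. One optional tweak, not part of the original code, is to give each learner a unique id:

#=== make learner ids unique (optional) ===#
for (i in seq_along(all_learners)) {
  all_learners[[i]]$id <- paste0(all_learners[[i]]$id, "_", i)
}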

We need all the learners to use the same resampled datasets, so we use an instantiated resampling.

resampling <- rsmp("cv", folds = 3)
resampling$instantiate(reg_task)
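
As a quick check (an addition here, assuming the standard mlr3 Resampling API), you can confirm that the fold assignments are now fixed.

#=== confirm the resampling is instantiated ===#
resampling$is_instantiated

#=== row ids of the first training fold ===#
resampling$train_set(1)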

We can then supply all_learners to benchmark_grid() along with a task and resampling method, and then call benchmark() on it.

#=== benchmark design ===#
benchmark_design <- 
  benchmark_grid(
    task = reg_task, 
    learners = all_learners,
    resampling = resampling
  )

#=== benchmark implementation ===#
bmr_results <- benchmark(benchmark_design)

We can calculate the MSE for each of the learners using the aggregate() method with msr("regr.mse").

(
bmr_perf <- bmr_results$aggregate(msr("regr.mse"))
)
    nr      resample_result    task_id   learner_id resampling_id iters  regr.mse
 1:  1 <ResampleResult[22]> regression  regr.ranger            cv     3  8.822252
 2:  2 <ResampleResult[22]> regression  regr.ranger            cv     3 11.706459
 3:  3 <ResampleResult[22]> regression  regr.ranger            cv     3 15.811861
 4:  4 <ResampleResult[22]> regression  regr.ranger            cv     3  5.904024
 5:  5 <ResampleResult[22]> regression  regr.ranger            cv     3  7.962141
 6:  6 <ResampleResult[22]> regression  regr.ranger            cv     3 12.767845
 7:  7 <ResampleResult[22]> regression  regr.ranger            cv     3  5.663546
 8:  8 <ResampleResult[22]> regression  regr.ranger            cv     3  7.304351
 9:  9 <ResampleResult[22]> regression  regr.ranger            cv     3 12.172980
10: 10 <ResampleResult[22]> regression regr.xgboost            cv     3 80.098296
11: 11 <ResampleResult[22]> regression regr.xgboost            cv     3 80.098296
12: 12 <ResampleResult[22]> regression regr.xgboost            cv     3  6.791134
13: 13 <ResampleResult[22]> regression regr.xgboost            cv     3  6.791134
14: 14 <ResampleResult[22]> regression regr.xgboost            cv     3  9.209117
15: 15 <ResampleResult[22]> regression regr.xgboost            cv     3  9.209117
16: 16 <ResampleResult[22]> regression regr.xgboost            cv     3  9.030887
17: 17 <ResampleResult[22]> regression regr.xgboost            cv     3  9.030887
18: 18 <ResampleResult[22]> regression regr.xgboost            cv     3  7.870318
19: 19 <ResampleResult[22]> regression regr.xgboost            cv     3  7.870318
20: 20 <ResampleResult[22]> regression regr.xgboost            cv     3  7.355720
21: 21 <ResampleResult[22]> regression regr.xgboost            cv     3  6.791134
22: 22 <ResampleResult[22]> regression regr.xgboost            cv     3  9.209117
23: 23 <ResampleResult[22]> regression regr.xgboost            cv     3  9.030887
24: 24 <ResampleResult[22]> regression regr.xgboost            cv     3  7.870318

We then pick the best learner, namely the one that minimizes regr.mse.

#=== get the index ===#
best_nr <- as.data.table(bmr_perf) %>% 
  .[which.min(regr.mse), nr]

#=== get the best learner by the index ===#
best_learner <- all_learners[[best_nr]] 
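
To see which hyper-parameter values were selected and to train the winning learner on the full data, a minimal sketch (assuming the standard mlr3 Learner API) is:

#=== inspect the selected hyper-parameters ===#
best_learner$param_set$values

#=== train the best learner on the full task ===#
best_learner$train(reg_task)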

20.2 reticulate implementation in R

Here is the R code that emulates what is done in ?sec-python-model-selection using the reticulate package from within R.

#--------------------------
# Import modules  
#--------------------------
library(reticulate)
sk_ensemble <- import("sklearn.ensemble") 
sk_lm <- import("sklearn.linear_model")
sk_ms <- import("sklearn.model_selection")
econml_sk_ms <- import("econml.sklearn_extensions.model_selection")
econml_dml <- import("econml.dml")

sk_data <- import("sklearn.datasets")
make_regression <- sk_data$make_regression
np <- import("numpy")

#--------------------------
# Generate synthetic data
#--------------------------
#=== set parameters for data generation ===#
n_samples <- 2000L
n_features <- 20L
n_informative <- 15L
noise <- 2L
rng <- np$random$RandomState(8934L)

#=== generate synthetic data ===#
data <-
  make_regression(
    n_samples, 
    n_features, 
    n_informative = n_informative, 
    noise = noise, 
    random_state = rng
  )

#=== Create data matrices ===#
XT <- data[[1]]
T <- XT[, 1]
X <- XT[, -1]
Y  <- data[[2]]

#--------------------------
# Set up GridSearchCVList
#--------------------------
#=== list of estimators ===#
est_list <- 
  c(
    sk_lm$Lasso(max_iter=1000L), 
    sk_ensemble$GradientBoostingRegressor(),
    sk_ensemble$RandomForestRegressor(min_samples_leaf = 5L)
  ) 

#=== list of parameter values ===#
par_grid_list <- 
  list(
    list(alpha = c(0.001, 0.01, 0.1, 1, 10)),
    list(
      max_depth = as.integer(c(3, 5, 10)), 
      n_estimators = as.integer(c(50, 100, 200))
    ),
    list(
      max_depth = as.integer(c(3, 5, 10)), 
      max_features = as.integer(c(3, 5, 10, 20))
    )
  )

#=== CV specification ===#
rk_cv <- 
  sk_ms$RepeatedKFold(
    n_splits = 4L, 
    n_repeats = 2L, 
    random_state = 123L
  )

#=== set up a GridSearchCVList ===#
first_stage <-
  econml_sk_ms$GridSearchCVList(
    estimator_list = est_list,
    param_grid_list = par_grid_list,
    cv = rk_cv
  )

#--------------------------
# Run CV
#--------------------------
#=== Y ===#
model_y <- first_stage$fit(X, Y)$best_estimator_

#=== T ===#
model_t <- first_stage$fit(X, T)$best_estimator_
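
It is worth checking which estimator (and hyper-parameter values) won for each nuisance model. Printing the fitted objects, an added check not in the original, shows this:

#=== inspect the selected first-stage models ===#
print(model_y)
print(model_t)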

#--------------------------
# DML
#--------------------------
#=== set up a linear DML ===#
est <- econml_dml$LinearDML(
    model_y = model_y, 
    model_t = model_t
)

#=== train ===#
est$fit(Y, T, X=X, W=X)
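
After fitting, you can inspect the estimated treatment effect. A minimal sketch, assuming econml's LinearDML API (summary() and ate() are methods on the fitted estimator):

#=== summary of the CATE model coefficients ===#
est$summary()

#=== average treatment effect ===#
est$ate(X = X)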