library(data.table)
library(reticulate)
library(mlr3verse)
library(tidyverse)
20 Model Selection (Prediction)
When you use a DML approach, the first-stage estimations are prediction tasks. This section presents R code for selecting a model for the first-stage estimations. We first implement the task using mlr3
(Section 20.1) and then use the reticulate
package to emulate, from within R, the task done in Python in ?sec-python-model-selection (Section 20.2).
20.1 R implementation using mlr3
You can use benchmark_grid()
and benchmark()
to imitate GridSearchCVList()
from the Python econml
package with a bit of extra coding.
Let’s use mtcars
data for demonstration.
#=== load the mtcars data ===#
data("mtcars", package = "datasets")

reg_task <-
  TaskRegr$new(
    id = "regression",
    backend = mtcars,
    target = "mpg"
  )
We can create a list of learners that vary in the values of the hyper-parameters we would like to tune. Suppose you are interested in using random forest and extreme gradient boosting. We can create such a list for each of them and then combine the two. Let's do that for random forest first. It is much like what we did for GridSearchCVList()
in ?sec-python-model-selection.
First, create sequences of values for the parameters to be tuned.
#=== sequence of values ===#
mtry_seq <- c(1, 3, 5)
min_node_size_seq <- c(5, 10, 20)
We can create a grid of values ourselves.
(
search_space_ranger <-
  data.table::CJ(
    mtry = mtry_seq,
    min_node_size = min_node_size_seq
  )
)
mtry min_node_size
1: 1 5
2: 1 10
3: 1 20
4: 3 5
5: 3 10
6: 3 20
7: 5 5
8: 5 10
9: 5 20
We now loop over the rows of search_space_ranger
to create a learner for each pair (row) of hyper-parameter values stored in search_space_ranger
.
lrn_list_ranger <-
  lapply(
    seq_len(nrow(search_space_ranger)),
    function(x) {
      lrn(
        "regr.ranger",
        mtry = search_space_ranger[x, mtry],
        min.node.size = search_space_ranger[x, min_node_size]
      )
    }
  )
We can do the same for xgboost.
#=== sequence of values ===#
nrounds_seq <- seq(100, 400, by = 100)
eta_seq <- seq(0.01, 1, length = 5)

#=== define the grid ===#
search_space_xgb <-
  data.table::CJ(
    nrounds = nrounds_seq,
    eta = eta_seq
  )
lrn_list_xgboost <-
  lapply(
    seq_len(nrow(search_space_xgb)),
    function(x) {
      lrn(
        "regr.xgboost",
        nrounds = search_space_xgb[x, nrounds],
        eta = search_space_xgb[x, eta]
      )
    }
  )
Now, we combine all the learners we want to evaluate.
all_learners <- c(lrn_list_ranger, lrn_list_xgboost)
We need all the learners to use the same resampled datasets. So, we need to use an instantiated resampling.
resampling <- rsmp("cv", folds = 3)
resampling$instantiate(reg_task)
We can then supply all_learners
to benchmark_grid()
along with a task and resampling method, and then call benchmark()
on it.
#=== benchmark design ===#
benchmark_design <-
  benchmark_grid(
    task = reg_task,
    learners = all_learners,
    resampling = resampling
  )

#=== benchmark implementation ===#
bmr_results <- benchmark(benchmark_design)
INFO [10:53:30.816] [mlr3] Running benchmark with 72 resampling iterations
INFO [10:53:30.847] [mlr3] Applying learner 'regr.xgboost' on task 'regression' (iter 2/3)
INFO [10:53:30.879] [mlr3] Applying learner 'regr.xgboost' on task 'regression' (iter 3/3)
INFO [10:53:30.892] [mlr3] Applying learner 'regr.xgboost' on task 'regression' (iter 1/3)
INFO [10:53:30.925] [mlr3] Applying learner 'regr.xgboost' on task 'regression' (iter 2/3)
INFO [10:53:30.937] [mlr3] Applying learner 'regr.ranger' on task 'regression' (iter 2/3)
INFO [10:53:30.955] [mlr3] Applying learner 'regr.xgboost' on task 'regression' (iter 2/3)
INFO [10:53:30.966] [mlr3] Applying learner 'regr.xgboost' on task 'regression' (iter 3/3)
INFO [10:53:30.993] [mlr3] Applying learner 'regr.xgboost' on task 'regression' (iter 3/3)
INFO [10:53:31.004] [mlr3] Applying learner 'regr.ranger' on task 'regression' (iter 2/3)
INFO [10:53:31.013] [mlr3] Applying learner 'regr.xgboost' on task 'regression' (iter 3/3)
INFO [10:53:31.028] [mlr3] Applying learner 'regr.ranger' on task 'regression' (iter 1/3)
INFO [10:53:31.037] [mlr3] Applying learner 'regr.ranger' on task 'regression' (iter 1/3)
INFO [10:53:31.045] [mlr3] Applying learner 'regr.xgboost' on task 'regression' (iter 2/3)
INFO [10:53:31.072] [mlr3] Applying learner 'regr.xgboost' on task 'regression' (iter 1/3)
INFO [10:53:31.084] [mlr3] Applying learner 'regr.xgboost' on task 'regression' (iter 3/3)
INFO [10:53:31.113] [mlr3] Applying learner 'regr.ranger' on task 'regression' (iter 1/3)
INFO [10:53:31.123] [mlr3] Applying learner 'regr.xgboost' on task 'regression' (iter 2/3)
INFO [10:53:31.138] [mlr3] Applying learner 'regr.xgboost' on task 'regression' (iter 1/3)
INFO [10:53:31.165] [mlr3] Applying learner 'regr.xgboost' on task 'regression' (iter 1/3)
INFO [10:53:31.175] [mlr3] Applying learner 'regr.ranger' on task 'regression' (iter 1/3)
INFO [10:53:31.185] [mlr3] Applying learner 'regr.xgboost' on task 'regression' (iter 1/3)
INFO [10:53:31.196] [mlr3] Applying learner 'regr.xgboost' on task 'regression' (iter 3/3)
INFO [10:53:31.222] [mlr3] Applying learner 'regr.xgboost' on task 'regression' (iter 3/3)
INFO [10:53:31.233] [mlr3] Applying learner 'regr.ranger' on task 'regression' (iter 2/3)
INFO [10:53:31.242] [mlr3] Applying learner 'regr.ranger' on task 'regression' (iter 2/3)
INFO [10:53:31.256] [mlr3] Applying learner 'regr.xgboost' on task 'regression' (iter 3/3)
INFO [10:53:31.282] [mlr3] Applying learner 'regr.xgboost' on task 'regression' (iter 3/3)
INFO [10:53:31.293] [mlr3] Applying learner 'regr.xgboost' on task 'regression' (iter 2/3)
INFO [10:53:31.321] [mlr3] Applying learner 'regr.xgboost' on task 'regression' (iter 2/3)
INFO [10:53:31.332] [mlr3] Applying learner 'regr.xgboost' on task 'regression' (iter 1/3)
INFO [10:53:31.344] [mlr3] Applying learner 'regr.ranger' on task 'regression' (iter 3/3)
INFO [10:53:31.352] [mlr3] Applying learner 'regr.ranger' on task 'regression' (iter 1/3)
INFO [10:53:31.367] [mlr3] Applying learner 'regr.ranger' on task 'regression' (iter 2/3)
INFO [10:53:31.376] [mlr3] Applying learner 'regr.xgboost' on task 'regression' (iter 1/3)
INFO [10:53:31.387] [mlr3] Applying learner 'regr.xgboost' on task 'regression' (iter 1/3)
INFO [10:53:31.399] [mlr3] Applying learner 'regr.xgboost' on task 'regression' (iter 2/3)
INFO [10:53:31.410] [mlr3] Applying learner 'regr.xgboost' on task 'regression' (iter 3/3)
INFO [10:53:31.438] [mlr3] Applying learner 'regr.xgboost' on task 'regression' (iter 1/3)
INFO [10:53:31.450] [mlr3] Applying learner 'regr.ranger' on task 'regression' (iter 1/3)
INFO [10:53:31.458] [mlr3] Applying learner 'regr.ranger' on task 'regression' (iter 3/3)
INFO [10:53:31.469] [mlr3] Applying learner 'regr.xgboost' on task 'regression' (iter 3/3)
INFO [10:53:31.481] [mlr3] Applying learner 'regr.xgboost' on task 'regression' (iter 1/3)
INFO [10:53:31.497] [mlr3] Applying learner 'regr.xgboost' on task 'regression' (iter 3/3)
INFO [10:53:31.510] [mlr3] Applying learner 'regr.ranger' on task 'regression' (iter 1/3)
INFO [10:53:31.520] [mlr3] Applying learner 'regr.ranger' on task 'regression' (iter 3/3)
INFO [10:53:31.531] [mlr3] Applying learner 'regr.ranger' on task 'regression' (iter 2/3)
INFO [10:53:31.540] [mlr3] Applying learner 'regr.ranger' on task 'regression' (iter 3/3)
INFO [10:53:31.549] [mlr3] Applying learner 'regr.xgboost' on task 'regression' (iter 1/3)
INFO [10:53:31.561] [mlr3] Applying learner 'regr.ranger' on task 'regression' (iter 3/3)
INFO [10:53:31.571] [mlr3] Applying learner 'regr.ranger' on task 'regression' (iter 3/3)
INFO [10:53:31.581] [mlr3] Applying learner 'regr.xgboost' on task 'regression' (iter 2/3)
INFO [10:53:31.593] [mlr3] Applying learner 'regr.ranger' on task 'regression' (iter 1/3)
INFO [10:53:31.603] [mlr3] Applying learner 'regr.xgboost' on task 'regression' (iter 3/3)
INFO [10:53:31.615] [mlr3] Applying learner 'regr.xgboost' on task 'regression' (iter 2/3)
INFO [10:53:31.647] [mlr3] Applying learner 'regr.xgboost' on task 'regression' (iter 3/3)
INFO [10:53:31.660] [mlr3] Applying learner 'regr.xgboost' on task 'regression' (iter 2/3)
INFO [10:53:31.690] [mlr3] Applying learner 'regr.ranger' on task 'regression' (iter 1/3)
INFO [10:53:31.699] [mlr3] Applying learner 'regr.xgboost' on task 'regression' (iter 2/3)
INFO [10:53:31.710] [mlr3] Applying learner 'regr.ranger' on task 'regression' (iter 2/3)
INFO [10:53:31.719] [mlr3] Applying learner 'regr.ranger' on task 'regression' (iter 3/3)
INFO [10:53:31.727] [mlr3] Applying learner 'regr.ranger' on task 'regression' (iter 2/3)
INFO [10:53:31.738] [mlr3] Applying learner 'regr.xgboost' on task 'regression' (iter 1/3)
INFO [10:53:31.754] [mlr3] Applying learner 'regr.xgboost' on task 'regression' (iter 2/3)
INFO [10:53:31.779] [mlr3] Applying learner 'regr.xgboost' on task 'regression' (iter 2/3)
INFO [10:53:31.790] [mlr3] Applying learner 'regr.xgboost' on task 'regression' (iter 3/3)
INFO [10:53:31.801] [mlr3] Applying learner 'regr.xgboost' on task 'regression' (iter 1/3)
INFO [10:53:31.827] [mlr3] Applying learner 'regr.xgboost' on task 'regression' (iter 2/3)
INFO [10:53:31.838] [mlr3] Applying learner 'regr.xgboost' on task 'regression' (iter 1/3)
INFO [10:53:31.872] [mlr3] Applying learner 'regr.ranger' on task 'regression' (iter 2/3)
INFO [10:53:31.881] [mlr3] Applying learner 'regr.ranger' on task 'regression' (iter 3/3)
INFO [10:53:31.889] [mlr3] Applying learner 'regr.xgboost' on task 'regression' (iter 1/3)
INFO [10:53:31.915] [mlr3] Applying learner 'regr.ranger' on task 'regression' (iter 3/3)
INFO [10:53:31.927] [mlr3] Finished benchmark
We can calculate MSE for each of the learners using the aggregate()
method with msr("regr.mse")
.
(
bmr_perf <- bmr_results$aggregate(msr("regr.mse"))
)
    nr      resample_result    task_id   learner_id resampling_id iters  regr.mse
 1:  1 <ResampleResult[22]> regression  regr.ranger            cv     3  8.822252
 2:  2 <ResampleResult[22]> regression  regr.ranger            cv     3 11.706459
 3:  3 <ResampleResult[22]> regression  regr.ranger            cv     3 15.811861
 4:  4 <ResampleResult[22]> regression  regr.ranger            cv     3  5.904024
 5:  5 <ResampleResult[22]> regression  regr.ranger            cv     3  7.962141
 6:  6 <ResampleResult[22]> regression  regr.ranger            cv     3 12.767845
 7:  7 <ResampleResult[22]> regression  regr.ranger            cv     3  5.663546
 8:  8 <ResampleResult[22]> regression  regr.ranger            cv     3  7.304351
 9:  9 <ResampleResult[22]> regression  regr.ranger            cv     3 12.172980
10: 10 <ResampleResult[22]> regression regr.xgboost            cv     3 80.098296
11: 11 <ResampleResult[22]> regression regr.xgboost            cv     3 80.098296
12: 12 <ResampleResult[22]> regression regr.xgboost            cv     3  6.791134
13: 13 <ResampleResult[22]> regression regr.xgboost            cv     3  6.791134
14: 14 <ResampleResult[22]> regression regr.xgboost            cv     3  9.209117
15: 15 <ResampleResult[22]> regression regr.xgboost            cv     3  9.209117
16: 16 <ResampleResult[22]> regression regr.xgboost            cv     3  9.030887
17: 17 <ResampleResult[22]> regression regr.xgboost            cv     3  9.030887
18: 18 <ResampleResult[22]> regression regr.xgboost            cv     3  7.870318
19: 19 <ResampleResult[22]> regression regr.xgboost            cv     3  7.870318
20: 20 <ResampleResult[22]> regression regr.xgboost            cv     3  7.355720
21: 21 <ResampleResult[22]> regression regr.xgboost            cv     3  6.791134
22: 22 <ResampleResult[22]> regression regr.xgboost            cv     3  9.209117
23: 23 <ResampleResult[22]> regression regr.xgboost            cv     3  9.030887
24: 24 <ResampleResult[22]> regression regr.xgboost            cv     3  7.870318
We then pick the best learner, which is the one that minimizes regr.mse
.
#=== get the index ===#
best_nr <-
  as.data.table(bmr_perf) %>%
  .[which.min(regr.mse), nr]

#=== get the best learner by the index ===#
best_learner <- all_learners[[best_nr]]
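To see which model and hyper-parameter combination was selected, you can inspect the chosen learner. This is an optional check, not required for the rest of the workflow, using standard mlr3 learner fields:

#=== inspect the selected learner (optional) ===#
best_learner$id # e.g., "regr.ranger" or "regr.xgboost"
best_learner$param_set$values # hyper-parameter values of the selected learner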
20.2 reticulate implementation in R
Here is the R code that emulates what is done in ?sec-python-model-selection using the reticulate
package from within R.
#--------------------------
# Import modules
#--------------------------
library(reticulate)

sk_ensemble <- import("sklearn.ensemble")
sk_lm <- import("sklearn.linear_model")
sk_ms <- import("sklearn.model_selection")
econml_sk_ms <- import("econml.sklearn_extensions.model_selection")
econml_dml <- import("econml.dml")

sk_data <- import("sklearn.datasets")
make_regression <- sk_data$make_regression
np <- import("numpy")
#--------------------------
# Generate synthetic data
#--------------------------
#=== set parameters for data generation ===#
n_samples <- 2000L
n_features <- 20L
n_informative <- 15L
noise <- 2L
rng <- np$random$RandomState(8934L)
#=== generate synthetic data ===#
data <-
  make_regression(
    n_samples,
    n_features,
    n_informative = n_informative,
    noise = noise,
    random_state = rng
  )

#=== Create data matrices ===#
XT <- data[[1]]
T <- XT[, 1]
X <- XT[, -1]
Y <- data[[2]]
#--------------------------
# Set up GridSearchCVList
#--------------------------
#=== list of estimators ===#
est_list <-
  c(
    sk_lm$Lasso(max_iter = 1000L),
    sk_ensemble$GradientBoostingRegressor(),
    sk_ensemble$RandomForestRegressor(min_samples_leaf = 5L)
  )

#=== list of parameter values ===#
par_grid_list <-
  list(
    list(alpha = c(0.001, 0.01, 0.1, 1, 10)),
    list(
      max_depth = as.integer(c(3, 5, 10)),
      n_estimators = as.integer(c(50, 100, 200))
    ),
    list(
      max_depth = as.integer(c(3, 5, 10)),
      max_features = as.integer(c(3, 5, 10, 20))
    )
  )
#=== CV specification ===#
rk_cv <-
  sk_ms$RepeatedKFold(
    n_splits = 4L,
    n_repeats = 2L,
    random_state = 123L
  )

#=== set up a GridSearchCVList ===#
first_stage <-
  econml_sk_ms$GridSearchCVList(
    estimator_list = est_list,
    param_grid_list = par_grid_list,
    cv = rk_cv
  )
#--------------------------
# Run CV
#--------------------------
#=== Y ===#
model_y <- first_stage$fit(X, Y)$best_estimator_

#=== T ===#
model_t <- first_stage$fit(X, T)$best_estimator_
#--------------------------
# DML
#--------------------------
#=== set up a linear DML ===#
est <-
  econml_dml$LinearDML(
    model_y = model_y,
    model_t = model_t
  )

#=== train ===#
est$fit(Y, T, X = X, W = X)
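Once the model is trained, you can inspect the results. The calls below rely on standard econml estimator methods (summary() and effect()) accessed through reticulate; they are shown as an optional follow-up rather than part of the original example:

#=== inspect the results (optional) ===#
est$summary() # summary of the final-stage (CATE) coefficient estimates
te_hat <- est$effect(X) # estimated treatment effects at the observed X
head(te_hat)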