Accepts a formula to run an xgboost model. Automatically determines whether the formula is for classification or regression. Returns the xgboost model.
tidy_xgboost(
.data,
formula,
...,
mtry = 1,
trees = 15L,
min_n = 1L,
tree_depth = 6L,
learn_rate = 0.3,
loss_reduction = 0,
sample_size = 1,
stop_iter = 10L,
counts = FALSE,
tree_method = c("auto", "exact", "approx", "hist", "gpu_hist"),
monotone_constraints = 0L,
num_parallel_tree = 1L,
lambda = 1,
alpha = 0,
scale_pos_weight = 1,
verbosity = 0L,
validate = TRUE
)
dataframe
formula
additional parameters to be passed to set_engine
# Randomly Selected Predictors (xgboost: colsample_bynode) (type: numeric, range 0 - 1) (or type: integer if counts = TRUE)
# Trees (xgboost: nrounds) (type: integer, default: 15L)
Minimal Node Size (xgboost: min_child_weight) (type: integer, default: 1L); [typical range: 2-10] Keep small value for highly imbalanced class data where leaf nodes can have smaller size groups. Otherwise increase size to prevent overfitting outliers.
Tree Depth (xgboost: max_depth) (type: integer, default: 6L); Typical values: 3-10
Learning Rate (xgboost: eta) (type: double, default: 0.3); Typical values: 0.01-0.3
Minimum Loss Reduction (xgboost: gamma) (type: double, default: 0.0); range: 0 to Inf; typical value: 0 - 20 assuming low-mid tree depth
Proportion Observations Sampled (xgboost: subsample) (type: double, default: 1.0); Typical values: 0.5 - 1
# Iterations Before Stopping (xgboost: early_stop) (type: integer, default: 10L); only enabled if a validation set is provided
if TRUE, specify mtry as an integer number of cols. Default FALSE, to specify mtry as a fraction of cols from 0 to 1
xgboost tree_method. Default is auto. Reference: tree method docs
an integer vector with length of the predictor cols, of -1, 1, 0 corresponding to decreasing, increasing, and no constraint respectively for the index of the predictor col. Reference: monotonicity docs.
should be set to the size of the forest being trained. default 1L
[default=1] L2 regularization term on weights. Increasing this value will make model more conservative.
[default=0] L1 regularization term on weights. Increasing this value will make model more conservative.
[default=1] Control the balance of positive and negative weights, useful for unbalanced classes. if set to TRUE, calculates sum(negative instances) / sum(positive instances). If first level is majority class, use values < 1, otherwise normally values >1 are used to balance the class distribution.
[default=0] Verbosity of printing messages. Valid values are 0 (silent), 1 (warning), 2 (info), 3 (debug).
default TRUE. report accuracy metrics on a validation set.
xgb.Booster model
In binary classification the target variable must be a factor with the first level set to the event of interest. A higher probability will predict the first level.
reference for parameters: xgboost docs
options(rlang_trace_top_env = rlang::current_env())
# regression on numeric variable
iris %>%
framecleaner::create_dummies(Species) -> iris_dummy
#> 1 column(s) have become 3 dummy columns
iris_dummy %>%
tidy_formula(target= Petal.Length) -> petal_form
iris_dummy %>%
tidy_xgboost(
petal_form,
trees = 20,
mtry = .5
) -> xg1
#> accuracy tested on a validation set
#> # A tibble: 3 × 2
#> .metric .estimate
#> <chr> <dbl>
#> 1 ccc 0.974
#> 2 rmse 0.390
#> 3 rsq 0.954
xg1 %>%
visualize_model(top_n = 2)
xg1 %>%
tidy_predict(newdata = iris_dummy, form = petal_form) -> iris_preds
#> created the following column: Petal.Length_preds_xg1
iris_preds %>%
eval_preds()
#> # A tibble: 3 × 5
#> .metric .estimator .estimate model target
#> <chr> <chr> <dbl> <chr> <chr>
#> 1 ccc standard 0.998 xg1 Petal.Length
#> 2 rmse standard 0.111 xg1 Petal.Length
#> 3 rsq standard 0.996 xg1 Petal.Length
# binary classification
# returns probability and labels
iris %>%
tidy_formula(Species) -> species_form
iris %>%
dplyr::filter(Species != "versicolor") %>%
dplyr::mutate(Species = forcats::fct_drop(Species)) -> iris_binary
iris_binary %>%
tidy_xgboost(formula = species_form, trees = 30L, mtry = 0.2) -> xgb_bin
#> accuracy tested on a validation set
#> # A tibble: 15 × 3
#> .metric .estimate .formula
#> <chr> <dbl> <chr>
#> 1 accuracy 1 TP + TN / total
#> 2 kap 1 NA
#> 3 sens 1 TP / actually P
#> 4 spec 1 TN / actually N
#> 5 ppv 1 TP / predicted P
#> 6 npv 1 TN / predicted N
#> 7 mcc 1 NA
#> 8 j_index 1 NA
#> 9 bal_accuracy 1 sens + spec / 2
#> 10 detection_prevalence 0.547 predicted P / total
#> 11 precision 1 PPV, 1-FDR
#> 12 recall 1 sens, TPR
#> 13 f_meas 1 HM(ppv, sens)
#> 14 baseline_accuracy 0.547 majority class / total
#> 15 roc_auc 1 NA
xgb_bin %>%
tidy_predict(newdata = iris_binary, form = species_form) -> iris_binary1
#> created the following columns:
#> Species_preds_prob_xgb_bin
#> Species_preds_class_xgb_bin
iris_binary1 %>%
eval_preds()
#> # A tibble: 4 × 5
#> .metric .estimator .estimate model target
#> <chr> <chr> <dbl> <chr> <chr>
#> 1 accuracy binary 1 xgb_bin Species
#> 2 f_meas binary 1 xgb_bin Species
#> 3 precision binary 1 xgb_bin Species
#> 4 roc_auc binary 1 xgb_bin Species
# multiclass classification that returns labels
iris %>%
tidy_xgboost(species_form,
objective = "multi:softmax",
trees = 15L,
tree_depth = 3L,
loss_reduction = 0.5) -> xgb2
#> accuracy tested on a validation set
#> # A tibble: 13 × 2
#> .metric .estimate
#> <chr> <dbl>
#> 1 accuracy 0.938
#> 2 kap 0.906
#> 3 sens 0.933
#> 4 spec 0.968
#> 5 ppv 0.943
#> 6 npv 0.971
#> 7 mcc 0.909
#> 8 j_index 0.901
#> 9 bal_accuracy 0.950
#> 10 detection_prevalence 0.333
#> 11 precision 0.943
#> 12 recall 0.933
#> 13 f_meas 0.935
xgb2 %>%
tidy_predict(newdata = iris, form = species_form) -> iris_preds
#> created the following column: Species_preds_class_xgb2
# additional yardstick metrics can be supplied to the dots in eval_preds
iris_preds %>%
eval_preds(yardstick::j_index)
#> # A tibble: 3 × 5
#> .metric .estimator .estimate model target
#> <chr> <chr> <dbl> <chr> <chr>
#> 1 accuracy multiclass 0.987 xgb2 Species
#> 2 f_meas macro 0.987 xgb2 Species
#> 3 j_index macro 0.98 xgb2 Species
# multiclass classification that returns probabilities
iris %>%
tidy_xgboost(species_form,
objective = "multi:softprob",
trees = 20L,
sample_size = .2,
mtry = .5,
tree_depth = 2L,
loss_reduction = 3) -> xgb2_prob
# predict on the data that already has the class labels, so the resulting data frame
# has class and prob predictions
xgb2_prob %>%
tidy_predict(newdata = iris_preds, form = species_form) -> iris_preds1
#> created the following columns:
#> setosa_preds_prob_xgb2_prob
#> versicolor_preds_prob_xgb2_prob
#> virginica_preds_prob_xgb2_prob
# also requires the labels in the dataframe to evaluate preds
# the model name must be supplied as well. Then roc metrics can be calculated
#iris_preds1 %>%
# eval_preds( yardstick::average_precision, softprob_model = "xgb2_prob"
# )