Accepts a formula to run an xgboost model. Automatically determines whether the formula is for classification or regression. Returns the xgboost model.

tidy_xgboost(
  .data,
  formula,
  ...,
  mtry = 1,
  trees = 15L,
  min_n = 1L,
  tree_depth = 6L,
  learn_rate = 0.3,
  loss_reduction = 0,
  sample_size = 1,
  stop_iter = 10L,
  counts = FALSE,
  tree_method = c("auto", "exact", "approx", "hist", "gpu_hist"),
  monotone_constraints = 0L,
  num_parallel_tree = 1L,
  lambda = 1,
  alpha = 0,
  scale_pos_weight = 1,
  verbosity = 0L,
  validate = TRUE
)

Arguments

.data

dataframe

formula

formula

...

additional parameters to be passed to set_engine

mtry

# Randomly Selected Predictors (xgboost: colsample_bynode) (type: numeric, range 0 - 1, default: 1) (or type: integer if counts = TRUE)

trees

# Trees (xgboost: nrounds) (type: integer, default: 15L)

min_n

Minimal Node Size (xgboost: min_child_weight) (type: integer, default: 1L); [typical range: 2-10] Keep small value for highly imbalanced class data where leaf nodes can have smaller size groups. Otherwise increase size to prevent overfitting outliers.

tree_depth

Tree Depth (xgboost: max_depth) (type: integer, default: 6L); Typical values: 3-10

learn_rate

Learning Rate (xgboost: eta) (type: double, default: 0.3); Typical values: 0.01-0.3

loss_reduction

Minimum Loss Reduction (xgboost: gamma) (type: double, default: 0.0); range: 0 to Inf; typical value: 0 - 20 assuming low-mid tree depth

sample_size

Proportion Observations Sampled (xgboost: subsample) (type: double, default: 1.0); Typical values: 0.5 - 1

stop_iter

# Iterations Before Stopping (xgboost: early_stop) (type: integer, default: 10L); only enabled if a validation set is provided

counts

If TRUE, specify mtry as an integer number of columns. Defaults to FALSE, in which case mtry is specified as a fraction of columns from 0 to 1.

tree_method

xgboost tree_method. default is auto. reference: tree method docs

monotone_constraints

an integer vector with length of the predictor cols, of -1, 1, 0 corresponding to decreasing, increasing, and no constraint respectively for the index of the predictor col. reference: monotonicity docs.

num_parallel_tree

should be set to the size of the forest being trained. default 1L

lambda

[default=1] L2 regularization term on weights. Increasing this value will make model more conservative.

alpha

[default=0] L1 regularization term on weights. Increasing this value will make model more conservative.

scale_pos_weight

[default=1] Control the balance of positive and negative weights, useful for unbalanced classes. If set to TRUE, calculates sum(negative instances) / sum(positive instances). If the first level is the majority class, use values < 1; otherwise values > 1 are normally used to balance the class distribution.

verbosity

[default=0] Verbosity of printing messages. Valid values are 0 (silent), 1 (warning), 2 (info), 3 (debug).

validate

default TRUE. report accuracy metrics on a validation set.

Value

xgb.Booster model

Details

In binary classification the target variable must be a factor with the first level set to the event of interest. A higher probability will predict the first level.

reference for parameters: xgboost docs

Examples


options(rlang_trace_top_env = rlang::current_env())


# regression on numeric variable

iris %>%
 framecleaner::create_dummies(Species) -> iris_dummy
#> 1 column(s) have become 3 dummy columns

iris_dummy %>%
 tidy_formula(target= Petal.Length) -> petal_form

iris_dummy %>%
 tidy_xgboost(
   petal_form,
   trees = 20,
   mtry = .5
 )  -> xg1
#> accuracy tested on a validation set
#> # A tibble: 3 × 2
#>   .metric .estimate
#>   <chr>       <dbl>
#> 1 ccc         0.974
#> 2 rmse        0.390
#> 3 rsq         0.954


xg1 %>%
 visualize_model(top_n = 2)


xg1 %>%
 tidy_predict(newdata = iris_dummy, form = petal_form) -> iris_preds
#> created the following column: Petal.Length_preds_xg1

iris_preds %>%
 eval_preds()
#> # A tibble: 3 × 5
#>   .metric .estimator .estimate model target      
#>   <chr>   <chr>          <dbl> <chr> <chr>       
#> 1 ccc     standard       0.998 xg1   Petal.Length
#> 2 rmse    standard       0.111 xg1   Petal.Length
#> 3 rsq     standard       0.996 xg1   Petal.Length


# binary classification
# returns probability and labels

iris %>%
 tidy_formula(Species) -> species_form

iris %>%
 dplyr::filter(Species != "versicolor") %>%
 dplyr::mutate(Species = forcats::fct_drop(Species)) -> iris_binary

iris_binary %>%
 tidy_xgboost(formula = species_form, trees = 30L, mtry = 0.2) -> xgb_bin
#> accuracy tested on a validation set

#> # A tibble: 15 × 3
#>    .metric              .estimate .formula              
#>    <chr>                    <dbl> <chr>                 
#>  1 accuracy                 1     TP + TN / total       
#>  2 kap                      1     NA                    
#>  3 sens                     1     TP / actually P       
#>  4 spec                     1     TN / actually N       
#>  5 ppv                      1     TP / predicted P      
#>  6 npv                      1     TN / predicted N      
#>  7 mcc                      1     NA                    
#>  8 j_index                  1     NA                    
#>  9 bal_accuracy             1     sens + spec / 2       
#> 10 detection_prevalence     0.547 predicted P / total   
#> 11 precision                1     PPV, 1-FDR            
#> 12 recall                   1     sens, TPR             
#> 13 f_meas                   1     HM(ppv, sens)         
#> 14 baseline_accuracy        0.547 majority class / total
#> 15 roc_auc                  1     NA                    


xgb_bin %>%
 tidy_predict(newdata = iris_binary, form = species_form) -> iris_binary1
#> created the following columns: 
#> Species_preds_prob_xgb_bin
#> Species_preds_class_xgb_bin

iris_binary1 %>%
 eval_preds()
#> # A tibble: 4 × 5
#>   .metric   .estimator .estimate model   target 
#>   <chr>     <chr>          <dbl> <chr>   <chr>  
#> 1 accuracy  binary             1 xgb_bin Species
#> 2 f_meas    binary             1 xgb_bin Species
#> 3 precision binary             1 xgb_bin Species
#> 4 roc_auc   binary             1 xgb_bin Species


# multiclass classification that returns labels




iris %>%
 tidy_xgboost(species_form,
              objective = "multi:softmax",
              trees = 15L,
              tree_depth = 3L,
              loss_reduction = 0.5) -> xgb2
#> accuracy tested on a validation set

#> # A tibble: 13 × 2
#>    .metric              .estimate
#>    <chr>                    <dbl>
#>  1 accuracy                 0.938
#>  2 kap                      0.906
#>  3 sens                     0.933
#>  4 spec                     0.968
#>  5 ppv                      0.943
#>  6 npv                      0.971
#>  7 mcc                      0.909
#>  8 j_index                  0.901
#>  9 bal_accuracy             0.950
#> 10 detection_prevalence     0.333
#> 11 precision                0.943
#> 12 recall                   0.933
#> 13 f_meas                   0.935




xgb2 %>%
 tidy_predict(newdata = iris, form = species_form) -> iris_preds
#> created the following column: Species_preds_class_xgb2

# additional yardstick metrics can be supplied to the dots in eval_preds

iris_preds %>%
 eval_preds(yardstick::j_index)
#> # A tibble: 3 × 5
#>   .metric  .estimator .estimate model target 
#>   <chr>    <chr>          <dbl> <chr> <chr>  
#> 1 accuracy multiclass     0.987 xgb2  Species
#> 2 f_meas   macro          0.987 xgb2  Species
#> 3 j_index  macro          0.98  xgb2  Species


# multiclass classification that returns probabilities


iris %>%
 tidy_xgboost(species_form,
              objective = "multi:softprob",
              trees = 20L,
              sample_size = .2,
              mtry = .5,
              tree_depth = 2L,
              loss_reduction = 3) -> xgb2_prob


# predict on the data that already has the class labels, so the resulting data frame
# has class and prob predictions

xgb2_prob %>%
 tidy_predict(newdata = iris_preds, form = species_form) -> iris_preds1
#> created the following columns: 
#> setosa_preds_prob_xgb2_prob
#> versicolor_preds_prob_xgb2_prob
#> virginica_preds_prob_xgb2_prob

# also requires the labels in the dataframe to evaluate preds
# the model name must be supplied as well. Then roc metrics can be calculated
#iris_preds1 %>%
#  eval_preds( yardstick::average_precision, softprob_model = "xgb2_prob"
#  )