Make bins in a tidy fashion. Adds a column to your data frame containing the integer codes of the specified bins of a certain column. Specifying multiple columns is only intended for supervised binning, so that multiple columns can be simultaneously binned optimally with respect to a target variable.
bin_cols(
.data,
col,
n_bins = 10,
bin_type = "frequency",
...,
target = NULL,
pretty_labels = FALSE,
seed = 1,
method = "mdlp"
)
a data frame
a column, vector of columns, or tidyselect
number of bins
method to make bins
params to be passed to selected binning method
unquoted column for supervised binning
logical. If TRUE, returns the interval label rather than the integer rank
seed for stochastic binning (xgboost)
method to use for mdlp binning (bin_type = "mdlp")
a data frame
Description of the arguments for bin_type
frequency (fr) creates bins of equal content via quantiles. Wraps bin
with method "content". Similar to ntile
width (wi) creates bins of equal numeric width. Wraps bin
with method "length"
kmeans (km) creates bins using 1-dimensional kmeans. Wraps bin
with method "clusters"
value (va) each bin has equal sum of values
xgboost (xg) column is binned by best predictor of a target column using step_discretize_xgb
cart (ca) if the col does not have enough distinct values, xgboost will fail and automatically revert to step_discretize_cart
woe (wo) column is binned by weight of evidence. Requires binary target
logreg (lr) column is binned by logistic regression. Requires binary target.
mdlp uses the discretizeDF.supervised
algorithm with a variety of methods.
iris %>%
bin_cols(Sepal.Width, n_bins = 5, pretty_labels = TRUE) %>%
bin_cols(Petal.Width, n_bins = 3, bin_type = c("width", "kmeans")) %>%
bin_cols(Sepal.Width, bin_type = "xgboost", target = Species, seed = 1) -> iris1
#> [12:01:27] WARNING: amalgamation/../src/learner.cc:1115: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'multi:softprob' was changed from 'merror' to 'mlogloss'. Explicitly set eval_metric if you'd like to restore the old behavior.
# Binned columns are named by original name + method abbreviation + number of bins created.
# Sometimes the actual number of bins is less than n_bins if the column lacks enough variance.
iris1 %>%
print(width = Inf)
#> # A tibble: 150 × 9
#> Sepal.Width_fr5 Sepal.Width_xg6 Sepal.Width Petal.Width_km3 Petal.Width_wi3
#> <fct> <int> <dbl> <int> <int>
#> 1 (3.4,4.4] 6 3.5 1 1
#> 2 (2.7,3] 4 3 1 1
#> 3 (3.1,3.4] 4 3.2 1 1
#> 4 (3,3.1] 4 3.1 1 1
#> 5 (3.4,4.4] 7 3.6 1 1
#> 6 (3.4,4.4] 7 3.9 1 1
#> 7 (3.1,3.4] 5 3.4 1 1
#> 8 (3.1,3.4] 5 3.4 1 1
#> 9 (2.7,3] 2 2.9 1 1
#> 10 (3,3.1] 4 3.1 1 1
#> Sepal.Length Petal.Length Petal.Width Species
#> <dbl> <dbl> <dbl> <fct>
#> 1 5.1 1.4 0.2 setosa
#> 2 4.9 1.4 0.2 setosa
#> 3 4.7 1.3 0.2 setosa
#> 4 4.6 1.5 0.2 setosa
#> 5 5 1.4 0.2 setosa
#> 6 5.4 1.7 0.4 setosa
#> 7 4.6 1.4 0.3 setosa
#> 8 5 1.5 0.2 setosa
#> 9 4.4 1.4 0.2 setosa
#> 10 4.9 1.5 0.1 setosa
#> # … with 140 more rows
iris1 %>%
bin_summary() %>%
print(width = Inf)
#> # A tibble: 17 × 14
#> column method n_bins .rank .min .mean .max .count .uniques
#> <chr> <chr> <int> <int> <dbl> <dbl> <dbl> <int> <int>
#> 1 Sepal.Width equal freq 5 5 3.5 3.75 4.4 25 9
#> 2 Sepal.Width equal freq 5 4 3.2 3.30 3.4 31 3
#> 3 Sepal.Width equal freq 5 3 3.1 3.1 3.1 11 1
#> 4 Sepal.Width equal freq 5 2 2.8 2.92 3 50 3
#> 5 Sepal.Width equal freq 5 1 2 2.49 2.7 33 7
#> 6 Sepal.Width xgboost 6 7 3.6 3.83 4.4 19 8
#> 7 Sepal.Width xgboost 6 6 3.5 3.5 3.5 6 1
#> 8 Sepal.Width xgboost 6 5 3.3 3.37 3.4 18 2
#> 9 Sepal.Width xgboost 6 4 3 3.07 3.2 50 3
#> 10 Sepal.Width xgboost 6 2 2.8 2.84 2.9 24 2
#> 11 Sepal.Width xgboost 6 1 2 2.49 2.7 33 7
#> 12 Petal.Width kmeans 3 3 1.8 2.07 2.5 46 8
#> 13 Petal.Width kmeans 3 2 1 1.34 1.7 54 8
#> 14 Petal.Width kmeans 3 1 0.1 0.246 0.6 50 6
#> 15 Petal.Width equal width 3 3 1.8 2.07 2.5 46 8
#> 16 Petal.Width equal width 3 2 1 1.34 1.7 54 8
#> 17 Petal.Width equal width 3 1 0.1 0.246 0.6 50 6
#> relative_value .sum .med .sd width
#> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 100 93.8 3.7 0.237 0.9
#> 2 87.9 102. 3.3 0.0912 0.200
#> 3 82.6 34.1 3.1 0 0
#> 4 77.9 146. 3 0.0870 0.200
#> 5 66.5 82.3 2.5 0.187 0.7
#> 6 100 72.8 3.8 0.216 0.8
#> 7 91.3 21 3.5 0 0
#> 8 87.9 60.6 3.4 0.0485 0.100
#> 9 80.2 154. 3 0.0853 0.200
#> 10 74.2 68.2 2.8 0.0504 0.100
#> 11 65.1 82.3 2.5 0.187 0.7
#> 12 100 95.4 2.05 0.231 0.7
#> 13 64.5 72.2 1.3 0.193 0.7
#> 14 11.9 12.3 0.2 0.105 0.5
#> 15 100 95.4 2.05 0.231 0.7
#> 16 64.5 72.2 1.3 0.193 0.7
#> 17 11.9 12.3 0.2 0.105 0.5