This cuts a numeric vector into sample quantile bins. Note that the intervals are closed on
the right side. That is, the first bin is the interval `[-Inf, q1]`, where `q1` is the first
quantile, the second bin is then `(q1, q2]`, etc., and the last bin is `(qn, +Inf]`, where `qn`
is the last quantile.
Usage
cut_quantile_bins(
x,
probs = c(0.25, 0.5, 0.75),
labels = NULL,
type = 7,
ordered = TRUE
)
Arguments
- x
(
numeric
)
the continuous variable values which should be cut into quantile bins. This may contain `NA`
values, which are then not used for the quantile calculations, but included in the return vector.
- probs
(
proportion
vector)
the probabilities identifying the quantiles. This is a sorted vector of unique `proportion`
values, i.e. between 0 and 1, where the boundaries 0 and 1 must not be included.
- labels
(
character
)
the unique labels for the quantile bins. When there are `n` probabilities in `probs`,
then this must be `n + 1` long.
- type
(
integer
)
type of quantiles to use, see `stats::quantile()` for details.
- ordered
(
flag
)
should the result be an ordered factor.
Examples
# Default is to cut into quartile bins.
cut_quantile_bins(cars$speed)
#> [1] [0%,25%] [0%,25%] [0%,25%] [0%,25%] [0%,25%] [0%,25%]
#> [7] [0%,25%] [0%,25%] [0%,25%] [0%,25%] [0%,25%] [0%,25%]
#> [13] [0%,25%] [0%,25%] [0%,25%] (25%,50%] (25%,50%] (25%,50%]
#> [19] (25%,50%] (25%,50%] (25%,50%] (25%,50%] (25%,50%] (25%,50%]
#> [25] (25%,50%] (25%,50%] (50%,75%] (50%,75%] (50%,75%] (50%,75%]
#> [31] (50%,75%] (50%,75%] (50%,75%] (50%,75%] (50%,75%] (50%,75%]
#> [37] (50%,75%] (50%,75%] (75%,100%] (75%,100%] (75%,100%] (75%,100%]
#> [43] (75%,100%] (75%,100%] (75%,100%] (75%,100%] (75%,100%] (75%,100%]
#> [49] (75%,100%] (75%,100%]
#> Levels: [0%,25%] < (25%,50%] < (50%,75%] < (75%,100%]
# Use custom quantiles.
cut_quantile_bins(cars$speed, probs = c(0.1, 0.2, 0.6, 0.88))
#> [1] [0%,10%] [0%,10%] [0%,10%] [0%,10%] [0%,10%] (10%,20%]
#> [7] (10%,20%] (10%,20%] (10%,20%] (10%,20%] (10%,20%] (20%,60%]
#> [13] (20%,60%] (20%,60%] (20%,60%] (20%,60%] (20%,60%] (20%,60%]
#> [19] (20%,60%] (20%,60%] (20%,60%] (20%,60%] (20%,60%] (20%,60%]
#> [25] (20%,60%] (20%,60%] (20%,60%] (20%,60%] (20%,60%] (20%,60%]
#> [31] (20%,60%] (60%,88%] (60%,88%] (60%,88%] (60%,88%] (60%,88%]
#> [37] (60%,88%] (60%,88%] (60%,88%] (60%,88%] (60%,88%] (60%,88%]
#> [43] (60%,88%] (60%,88%] (88%,100%] (88%,100%] (88%,100%] (88%,100%]
#> [49] (88%,100%] (88%,100%]
#> Levels: [0%,10%] < (10%,20%] < (20%,60%] < (60%,88%] < (88%,100%]
# Use custom labels.
cut_quantile_bins(cars$speed, labels = paste0("Q", 1:4))
#> [1] Q1 Q1 Q1 Q1 Q1 Q1 Q1 Q1 Q1 Q1 Q1 Q1 Q1 Q1 Q1 Q2 Q2 Q2 Q2 Q2 Q2 Q2 Q2 Q2 Q2
#> [26] Q2 Q3 Q3 Q3 Q3 Q3 Q3 Q3 Q3 Q3 Q3 Q3 Q3 Q4 Q4 Q4 Q4 Q4 Q4 Q4 Q4 Q4 Q4 Q4 Q4
#> Levels: Q1 < Q2 < Q3 < Q4
# NAs are preserved in result factor.
ozone_binned <- cut_quantile_bins(airquality$Ozone)
which(is.na(ozone_binned))
#> [1] 5 10 25 26 27 32 33 34 35 36 37 39 42 43 45 46 52 53 54
#> [20] 55 56 57 58 59 60 61 65 72 75 83 84 102 103 107 115 119 150
# So you might want to make these explicit.
explicit_na(ozone_binned)
#> [1] (50%,75%] (50%,75%] [0%,25%] [0%,25%] <Missing> (25%,50%]
#> [7] (25%,50%] (25%,50%] [0%,25%] <Missing> [0%,25%] [0%,25%]
#> [13] [0%,25%] [0%,25%] [0%,25%] [0%,25%] (50%,75%] [0%,25%]
#> [19] (25%,50%] [0%,25%] [0%,25%] [0%,25%] [0%,25%] (50%,75%]
#> [25] <Missing> <Missing> <Missing> (25%,50%] (50%,75%] (75%,100%]
#> [31] (50%,75%] <Missing> <Missing> <Missing> <Missing> <Missing>
#> [37] <Missing> (25%,50%] <Missing> (75%,100%] (50%,75%] <Missing>
#> [43] <Missing> (25%,50%] <Missing> <Missing> (25%,50%] (50%,75%]
#> [49] (25%,50%] [0%,25%] [0%,25%] <Missing> <Missing> <Missing>
#> [55] <Missing> <Missing> <Missing> <Missing> <Missing> <Missing>
#> [61] <Missing> (75%,100%] (50%,75%] (50%,75%] <Missing> (75%,100%]
#> [67] (50%,75%] (75%,100%] (75%,100%] (75%,100%] (75%,100%] <Missing>
#> [73] [0%,25%] (25%,50%] <Missing> [0%,25%] (50%,75%] (50%,75%]
#> [79] (50%,75%] (75%,100%] (50%,75%] [0%,25%] <Missing> <Missing>
#> [85] (75%,100%] (75%,100%] (25%,50%] (50%,75%] (75%,100%] (50%,75%]
#> [91] (75%,100%] (50%,75%] (50%,75%] [0%,25%] [0%,25%] (75%,100%]
#> [97] (50%,75%] (75%,100%] (75%,100%] (75%,100%] (75%,100%] <Missing>
#> [103] <Missing> (50%,75%] (25%,50%] (75%,100%] <Missing> (25%,50%]
#> [109] (50%,75%] (25%,50%] (25%,50%] (50%,75%] (25%,50%] [0%,25%]
#> [115] <Missing> (50%,75%] (75%,100%] (75%,100%] <Missing> (75%,100%]
#> [121] (75%,100%] (75%,100%] (75%,100%] (75%,100%] (75%,100%] (75%,100%]
#> [127] (75%,100%] (50%,75%] (50%,75%] (25%,50%] (25%,50%] (25%,50%]
#> [133] (25%,50%] (50%,75%] (25%,50%] (25%,50%] [0%,25%] [0%,25%]
#> [139] (50%,75%] [0%,25%] [0%,25%] (25%,50%] [0%,25%] [0%,25%]
#> [145] (25%,50%] (50%,75%] [0%,25%] [0%,25%] (25%,50%] <Missing>
#> [151] [0%,25%] [0%,25%] (25%,50%]
#> Levels: [0%,25%] < (25%,50%] < (50%,75%] < (75%,100%] < <Missing>