@@ -27,17 +27,56 @@ function fill_refs!(refs::AbstractArray, X::AbstractArray,
2727 end
2828end
2929
30+ const CUT_FMT = Printf. Format (" %.*g" )
31+
32+ """
33+ CategoricalArrays.default_formatter(from, to, i::Integer;
34+ leftclosed::Bool, rightclosed::Bool,
35+ sigdigits::Integer)
36+
37+ Provide the default label format for the `cut(x, breaks)` method,
38+ which is `"[from, to)"` if `leftclosed` is `true` and `"(from, to)"` otherwise.
39+
40+ If they are floating point values, breaks are turned into strings using
41+ `@sprintf("%.*g", sigdigits, break)`
42+ (the same format is applied to both `from` and `to`).
3043"""
31- default_formatter(from, to, i; leftclosed, rightclosed)
44+ function default_formatter (from, to, i:: Integer ;
45+ leftclosed:: Bool , rightclosed:: Bool ,
46+ sigdigits:: Integer )
47+ from_str = from isa AbstractFloat ?
48+ Printf. format (CUT_FMT, sigdigits, from) :
49+ string (from)
50+ to_str = to isa AbstractFloat ?
51+ Printf. format (CUT_FMT, sigdigits, to) :
52+ string (to)
53+ string (leftclosed ? " [" : " (" , from_str, " , " , to_str, rightclosed ? " ]" : " )" )
54+ end
3255
33- Provide the default label format for the `cut(x, breaks)` method.
3456"""
35- default_formatter (from, to, i; leftclosed, rightclosed) =
36- string (leftclosed ? " [" : " (" , from, " , " , to, rightclosed ? " ]" : " )" )
57+ CategoricalArrays.numbered_formatter(from, to, i::Integer;
58+ leftclosed::Bool, rightclosed::Bool,
59+ sigdigits::Integer)
60+
61+ Provide the default label format for the `cut(x, ngroups)` method
62+ when `allowempty=true`, which is `"i: [from, to)"` if `leftclosed`
63+ is `true` and `"i: (from, to)"` otherwise.
64+
65+ If they are floating point values, breaks are turned into strings using
66+ `@sprintf("%.*g", sigdigits, break)`
67+ (the same format is applied to both `from` and `to`).
68+ """
69+ numbered_formatter (from, to, i:: Integer ;
70+ leftclosed:: Bool , rightclosed:: Bool ,
71+ sigdigits:: Integer ) =
72+ string (i, " : " ,
73+ default_formatter (from, to, i, leftclosed= leftclosed, rightclosed= rightclosed,
74+ sigdigits= sigdigits))
3775
3876@doc raw """
3977 cut(x::AbstractArray, breaks::AbstractVector;
4078 labels::Union{AbstractVector,Function},
79+ sigdigits::Integer=3,
4180 extend::Union{Bool,Missing}=false, allowempty::Bool=false)
4281
4382Cut a numeric array into intervals at values `breaks`
@@ -54,10 +93,15 @@ also accept them.
5493 in `x` fall outside of the breaks; when `true`, breaks are automatically added to include
5594 all values in `x`; when `missing`, values outside of the breaks generate `missing` entries.
5695* `labels::Union{AbstractVector, Function}`: a vector of strings, characters
57- or numbers giving the names to use for
58- the intervals; or a function `f(from, to, i; leftclosed, rightclosed)` that generates
96+ or numbers giving the names to use for the intervals; or a function
97+ `f(from, to, i::Integer; leftclosed::Bool, rightclosed::Bool, sigdigits::Integer)` that generates
5998 the labels from the left and right interval boundaries and the group index. Defaults to
60- `"[from, to)"` (or `"[from, to]"` for the rightmost interval if `extend == true`).
99+ [`CategoricalArrays.default_formatter`](@ref), giving `"[from, to)"` (or `"[from, to]"`
100+ for the rightmost interval if `extend == true`).
101+ * `sigdigits::Integer=3`: the minimum number of significant digits to use in labels.
102+ This value is increased automatically if necessary so that rounded breaks are unique.
103+ Only used for floating point types and when `labels` is a function, in which case it
104+ is passed to it as a keyword argument.
61105* `allowempty::Bool=false`: when `false`, an error is raised if some breaks other than
62106 the last one appear multiple times, generating empty intervals; when `true`,
63107 duplicate breaks are allowed and the intervals they generate are kept as
@@ -69,19 +113,19 @@ julia> using CategoricalArrays
69113
70114julia> cut(-1:0.5:1, [0, 1], extend=true)
711155-element CategoricalArray{String,1,UInt32}:
72- "[-1.0, 0. 0)"
73- "[-1.0, 0. 0)"
74- "[0.0 , 1.0 ]"
75- "[0.0 , 1.0 ]"
76- "[0.0 , 1.0 ]"
116+ "[-1, 0)"
117+ "[-1, 0)"
118+ "[0, 1]"
119+ "[0, 1]"
120+ "[0, 1]"
77121
78122julia> cut(-1:0.5:1, 2)
791235-element CategoricalArray{String,1,UInt32}:
80- "Q1: [-1.0, 0. 0)"
81- "Q1: [-1.0, 0. 0)"
82- "Q2: [0.0 , 1.0 ]"
83- "Q2: [0.0 , 1.0 ]"
84- "Q2: [0.0 , 1.0 ]"
124+ "[-1, 0)"
125+ "[-1, 0)"
126+ "[0, 1]"
127+ "[0, 1]"
128+ "[0, 1]"
85129
86130julia> cut(-1:0.5:1, 2, labels=["A", "B"])
871315-element CategoricalArray{String,1,UInt32}:
@@ -114,6 +158,7 @@ julia> cut(-1:0.5:1, 3, labels=fmt)
114158@inline function cut (x:: AbstractArray , breaks:: AbstractVector ;
115159 extend:: Union{Bool, Missing} = false ,
116160 labels:: Union{AbstractVector{<:SupportedTypes},Function} = default_formatter,
161+ sigdigits:: Integer = 3 ,
117162 allowmissing:: Union{Bool, Nothing} = nothing ,
118163 allow_missing:: Union{Bool, Nothing} = nothing ,
119164 allowempty:: Bool = false )
@@ -127,14 +172,15 @@ julia> cut(-1:0.5:1, 3, labels=fmt)
127172 :cut )
128173 extend = missing
129174 end
130- return _cut (x, breaks, extend, labels, allowempty)
175+ return _cut (x, breaks, extend, labels, sigdigits, allowempty)
131176end
132177
133178# Separate function for inferability (thanks to inlining of cut)
134179function _cut (x:: AbstractArray{T, N} , breaks:: AbstractVector ,
135180 extend:: Union{Bool, Missing} ,
136181 labels:: Union{AbstractVector{<:SupportedTypes},Function} ,
137- allowempty:: Bool = false ) where {T, N}
182+ sigdigits:: Integer ,
183+ allowempty:: Bool ) where {T, N}
138184 if ! issorted (breaks)
139185 breaks = sort (breaks)
140186 end
@@ -191,21 +237,55 @@ function _cut(x::AbstractArray{T, N}, breaks::AbstractVector,
191237 end
192238 end
193239
240+ # Find minimal number of digits so that distinct breaks remain so
241+ if eltype (breaks) <: AbstractFloat
242+ while true
243+ local i
244+ for outer i in 2 : lastindex (breaks)
245+ b1 = breaks[i- 1 ]
246+ b2 = breaks[i]
247+ isequal (b1, b2) && continue
248+
249+ b1_str = Printf. format (CUT_FMT, sigdigits, b1)
250+ b2_str = Printf. format (CUT_FMT, sigdigits, b2)
251+ if b1_str == b2_str
252+ sigdigits += 1
253+ break
254+ end
255+ end
256+ i == lastindex (breaks) && break
257+ end
258+ end
194259 n = length (breaks)
195260 n >= 2 || throw (ArgumentError (" at least two breaks must be provided when extend is not true" ))
196261 if labels isa Function
197262 from = breaks[1 : n- 1 ]
198263 to = breaks[2 : n]
199- firstlevel = labels (from[1 ], to[1 ], 1 ,
200- leftclosed= ! isequal (breaks[1 ], breaks[2 ]), rightclosed= false )
264+ local firstlevel
265+ try
266+ firstlevel = labels (from[1 ], to[1 ], 1 ,
267+ leftclosed= ! isequal (breaks[1 ], breaks[2 ]), rightclosed= false ,
268+ sigdigits= sigdigits)
269+ catch
270+ # Support functions defined before v1.0, where sigdigits did not exist
271+ Base. depwarn (" `labels` function is now required to accept a `sigdigits` keyword argument" ,
272+ :cut )
273+ labels_orig = labels
274+ labels = (from, to, i; leftclosed, rightclosed, sigdigits) ->
275+ labels_orig (from, to, i; leftclosed, rightclosed)
276+ firstlevel = labels_orig (from[1 ], to[1 ], 1 ,
277+ leftclosed= ! isequal (breaks[1 ], breaks[2 ]), rightclosed= false )
278+ end
201279 levs = Vector {typeof(firstlevel)} (undef, n- 1 )
202280 levs[1 ] = firstlevel
203281 for i in 2 : n- 2
204282 levs[i] = labels (from[i], to[i], i,
205- leftclosed= ! isequal (breaks[i], breaks[i+ 1 ]), rightclosed= false )
283+ leftclosed= ! isequal (breaks[i], breaks[i+ 1 ]), rightclosed= false ,
284+ sigdigits= sigdigits)
206285 end
207286 levs[end ] = labels (from[end ], to[end ], n- 1 ,
208- leftclosed= true , rightclosed= true )
287+ leftclosed= true , rightclosed= true ,
288+ sigdigits= sigdigits)
209289 else
210290 length (labels) == n- 1 ||
211291 throw (ArgumentError (" labels must be of length $(n- 1 ) , but got length $(length (labels)) " ))
@@ -225,40 +305,37 @@ function _cut(x::AbstractArray{T, N}, breaks::AbstractVector,
225305 CategoricalArray {S, N} (refs, pool)
226306end
227307
228- """
229- quantile_formatter(from, to, i; leftclosed, rightclosed)
230-
231- Provide the default label format for the `cut(x, ngroups)` method.
232- """
233- quantile_formatter (from, to, i; leftclosed, rightclosed) =
234- string (" Q" , i, " : " , leftclosed ? " [" : " (" , from, " , " , to, rightclosed ? " ]" : " )" )
235-
236308"""
237309Find first value in (sorted) `v` which is greater than or equal to each quantile
238310in (sorted) `qs`.
239311"""
240312function find_breaks (v:: AbstractVector , qs:: AbstractVector )
241313 n = length (qs)
242314 breaks = similar (v, n)
243- n == 0 && return breaks
315+ breaks_prev = similar (v, n)
316+ n == 0 && return (breaks, breaks_prev)
244317
245318 i = 1
246319 q = qs[1 ]
247- @inbounds for x in v
320+ @inbounds for j in eachindex (v)
321+ x = v[j]
248322 # Use isless and isequal to differentiate -0.0 from 0.0
249323 if isless (q, x) || isequal (q, x)
250324 breaks[i] = x
325+ # FIXME : handle duplicated breaks
326+ breaks_prev[i] = v[clamp (j- 1 , firstindex (v), lastindex (v))]
251327 i += 1
252328 i > n && break
253329 q = qs[i]
254330 end
255331 end
256- return breaks
332+ return ( breaks, breaks_prev)
257333end
258334
259335"""
260336 cut(x::AbstractArray, ngroups::Integer;
261337 labels::Union{AbstractVector{<:AbstractString},Function},
338+ sigdigits::Integer=3,
262339 allowempty::Bool=false)
263340
264341Cut a numeric array into `ngroups` quantiles.
@@ -271,17 +348,25 @@ quantiles.
271348
272349# Keyword arguments
273350* `labels::Union{AbstractVector, Function}`: a vector of strings, characters
274- or numbers giving the names to use for
275- the intervals; or a function `f(from, to, i; leftclosed, rightclosed)` that generates
351+ or numbers giving the names to use for the intervals; or a function
352+ `f(from, to, i::Integer; leftclosed::Bool, rightclosed::Bool, sigdigits::Integer)` that generates
276353 the labels from the left and right interval boundaries and the group index. Defaults to
277- `"Qi: [from, to)"` (or `"Qi: [from, to]"` for the rightmost interval).
354+ [`CategoricalArrays.default_formatter`](@ref), giving `"[from, to)"` (or `"[from, to]"`
355+ for the rightmost interval) if `allowempty=false`, otherwise to
356+ [`CategoricalArrays.numbered_formatter`](@ref), which prefixes the label with the quantile
357+ number to ensure uniqueness.
358+ * `sigdigits::Integer=3`: the minimum number of significant digits to use when rounding
359+ breaks for inclusion in generated labels. This value is increased automatically if necessary
360+ so that rounded breaks are unique. Only used for floating point types and when `labels` is a
361+ function, in which case it is passed to it as a keyword argument.
278362* `allowempty::Bool=false`: when `false`, an error is raised if some quantiles breakpoints
279363 other than the last one are equal, generating empty intervals;
280364 when `true`, duplicate breaks are allowed and the intervals they generate are kept as
281365 unused levels (but duplicate labels are not allowed).
282366"""
283367function cut (x:: AbstractArray , ngroups:: Integer ;
284- labels:: Union{AbstractVector{<:SupportedTypes},Function} = quantile_formatter,
368+ labels:: Union{AbstractVector{<:SupportedTypes},Function,Nothing} = nothing ,
369+ sigdigits:: Integer = 3 ,
285370 allowempty:: Bool = false )
286371 ngroups >= 1 || throw (ArgumentError (" ngroups must be strictly positive (got $ngroups )" ))
287372 sorted_x = eltype (x) >: Missing ? sort! (collect (skipmissing (x))) : sort (x)
@@ -291,12 +376,48 @@ function cut(x::AbstractArray, ngroups::Integer;
291376 throw (ArgumentError (" NaN values are not allowed in input vector" ))
292377 end
293378 qs = quantile! (sorted_x, (1 : (ngroups- 1 ))/ ngroups, sorted= true )
294- breaks = [min_x; find_breaks (sorted_x, qs); max_x]
379+ breaks, breaks_prev = find_breaks (sorted_x, qs)
380+ breaks = [min_x; breaks; max_x]
295381 if ! allowempty && ! allunique (@view breaks[1 : end - 1 ])
296382 throw (ArgumentError (" cannot compute $ngroups quantiles due to " *
297383 " too many duplicated values in `x`. " *
298384 " Pass `allowempty=true` to allow empty quantiles or " *
299385 " choose a lower value for `ngroups`." ))
300386 end
301- cut (x, breaks; labels= labels, allowempty= allowempty)
387+ if labels === nothing
388+ labels = allowempty ? numbered_formatter : default_formatter
389+
390+ if eltype (breaks) <: AbstractFloat
391+ while true
392+ local i
393+ for outer i in 2 : lastindex (breaks)
394+ b1 = breaks[i- 1 ]
395+ b2 = breaks[i]
396+ isequal (b1, b2) && continue
397+
398+ # Find minimal number of digits so that `floor` does not
399+ # return a value that is lower than value immediately below break
400+ # We skip the first break, which is the minimum and has no equivalent
401+ # in `breaks_prev`
402+ b1_rounded = round (b1, sigdigits= sigdigits)
403+ b2_rounded = round (b2, sigdigits= sigdigits)
404+ if i < lastindex (breaks) &&
405+ (isequal (b2_rounded, breaks_prev[i- 1 ]) || isless (b2_rounded, breaks_prev[i- 1 ]))
406+ sigdigits += 1
407+ break
408+ end
409+
410+ # Find minimal number of digits so that breaks are unique
411+ b1_str = Printf. format (CUT_FMT, sigdigits, b1)
412+ b2_str = Printf. format (CUT_FMT, sigdigits, b2)
413+ if b1_str == b2_str
414+ sigdigits += 1
415+ break
416+ end
417+ end
418+ i == lastindex (breaks) && break
419+ end
420+ end
421+ end
422+ return cut (x, breaks; labels= labels, sigdigits= sigdigits, allowempty= allowempty)
302423end
0 commit comments