1- //! Strategies to build [`Bins`]s and [`Grid`]s (using [`GridBuilder`]) inferring
2- //! optimal parameters directly from data .
1+ //! Strategies used by [`GridBuilder`] to infer optimal parameters from data for building [`Bins`]
2+ //! and [`Grid`] instances .
33//!
44//! The docs for each strategy have been taken almost verbatim from [`NumPy`].
55//!
6- //! Each strategy specifies how to compute the optimal number of [`Bins`] or
7- //! the optimal bin width.
8- //! For those strategies that prescribe the optimal number
9- //! of [`Bins`] we then compute the optimal bin width with
6+ //! Each strategy specifies how to compute the optimal number of [`Bins`] or the optimal bin width.
7+ //! For those strategies that prescribe the optimal number of [`Bins`], the optimal bin width is
8+ //! computed by `bin_width = (max - min)/n`.
109//!
11- //! `bin_width = (max - min)/n`
10+ //! Since all bins are left-closed and right-open, an extra bin is added when necessary to include
11+ //! the maximum value from the given data, so that no data is discarded.
1212//!
13- //! All our bins are left-inclusive and right-exclusive: we make sure to add an extra bin
14- //! if it is necessary to include the maximum value of the array that has been passed as argument
15- //! to the `from_array` method.
13+ //! # Strategies
1614//!
15+ //! Currently, the following strategies are implemented:
16+ //!
17+ //! - [`Auto`]: Maximum of the [`Sturges`] and [`FreedmanDiaconis`] strategies. Provides good all
18+ //! around performance.
19+ //! - [`FreedmanDiaconis`]: Robust (resilient to outliers) strategy that takes into account data
20+ //! variability and data size.
21+ //! - [`Rice`]: A strategy that does not take variability into account, only data size. Commonly
22+ //! overestimates number of bins required.
23+ //! - [`Sqrt`]: Square root (of data size) strategy, used by Excel and other programs
24+ //! for its speed and simplicity.
25+ //! - [`Sturges`]: R’s default strategy, only accounts for data size. Only optimal for gaussian data
26+ //! and underestimates number of bins for large non-gaussian datasets.
27+ //!
28+ //! # Notes
29+ //!
30+ //! In general, successful inference on optimal bin width and number of bins relies on
31+ //! **variability** of data. In other words, the provided observations should not be empty or
32+ //! constant.
33+ //!
34+ //! In addition, [`Auto`] and [`FreedmanDiaconis`] require the [`interquartile range (IQR)`][iqr],
35+ //! i.e. the difference between upper and lower quartiles, to be positive.
36+ //!
37+ //! [`GridBuilder`]: ../struct.GridBuilder.html
1738//! [`Bins`]: ../struct.Bins.html
1839//! [`Grid`]: ../struct.Grid.html
19- //! [`GridBuilder`]: ../struct.GridBuilder.html
2040//! [`NumPy`]: https://docs.scipy.org/doc/numpy/reference/generated/numpy.histogram_bin_edges.html#numpy.histogram_bin_edges
21- use super :: super :: interpolate:: Nearest ;
22- use super :: super :: { Quantile1dExt , QuantileExt } ;
23- use super :: errors:: BinsBuildError ;
24- use super :: { Bins , Edges } ;
25- use ndarray:: prelude:: * ;
26- use ndarray:: Data ;
41+ //! [`Auto`]: struct.Auto.html
42+ //! [`Sturges`]: struct.Sturges.html
43+ //! [`FreedmanDiaconis`]: struct.FreedmanDiaconis.html
44+ //! [`Rice`]: struct.Rice.html
45+ //! [`Sqrt`]: struct.Sqrt.html
46+ //! [iqr]: https://www.wikiwand.com/en/Interquartile_range
47+ use crate :: {
48+ histogram:: { errors:: BinsBuildError , Bins , Edges } ,
49+ quantile:: { interpolate:: Nearest , Quantile1dExt , QuantileExt } ,
50+ } ;
51+ use ndarray:: { prelude:: * , Data } ;
2752use noisy_float:: types:: n64;
2853use num_traits:: { FromPrimitive , NumOps , Zero } ;
2954
30- /// A trait implemented by all strategies to build [`Bins`]
31- /// with parameters inferred from observations.
55+ /// A trait implemented by all strategies to build [`Bins`] with parameters inferred from
56+ /// observations.
3257///
33- /// A `BinsBuildingStrategy` is required by [`GridBuilder`]
34- /// to know how to build a [`Grid`]'s projections on the
58+ /// This is required by [`GridBuilder`] to know how to build a [`Grid`]'s projections on the
3559/// coordinate axes.
3660///
3761/// [`Bins`]: ../struct.Bins.html
38- /// [`Grid`]: ../struct.Grid.html
3962/// [`GridBuilder`]: ../struct.GridBuilder.html
63+ /// [`Grid`]: ../struct.Grid.html
4064pub trait BinsBuildingStrategy {
4165 type Elem : Ord ;
42- /// Given some observations in a 1-dimensional array it returns a `BinsBuildingStrategy`
43- /// that has learned the required parameter to build a collection of [`Bins`].
66+ /// Returns a strategy that has learnt the required parameter for building [`Bins`] for the given
67+ /// 1-dimensional array, or an `Err` if it is not possible to infer the required parameter
68+ /// with the given data and specified strategy.
4469 ///
45- /// It returns `Err` if it is not possible to build a collection of
46- /// [`Bins`] given the observed data according to the chosen strategy.
70+ /// # Errors
71+ ///
72+ /// See the struct-level documentation of each implementor for details on errors it may return.
4773 ///
4874 /// [`Bins`]: ../struct.Bins.html
4975 fn from_array < S > ( array : & ArrayBase < S , Ix1 > ) -> Result < Self , BinsBuildError >
5076 where
5177 S : Data < Elem = Self :: Elem > ,
5278 Self : std:: marker:: Sized ;
5379
54- /// Returns a [`Bins`] instance, built accordingly to the parameters
55- /// inferred from observations in [`from_array`].
80+ /// Returns a [`Bins`] instance, according to parameters inferred from observations.
5681 ///
5782 /// [`Bins`]: ../struct.Bins.html
58- /// [`from_array`]: #method.from_array.html
5983 fn build ( & self ) -> Bins < Self :: Elem > ;
6084
61- /// Returns the optimal number of bins, according to the parameters
62- /// inferred from observations in [`from_array`].
63- ///
64- /// [`from_array`]: #method.from_array.html
85+ /// Returns the optimal number of bins, according to parameters inferred from observations.
6586 fn n_bins ( & self ) -> usize ;
6687}
6788
@@ -72,12 +93,19 @@ struct EquiSpaced<T> {
7293 max : T ,
7394}
7495
75- /// Square root (of data size) strategy, used by Excel and other programs
76- /// for its speed and simplicity.
96+ /// Square root (of data size) strategy, used by Excel and other programs for its speed and
97+ /// simplicity.
7798///
7899/// Let `n` be the number of observations. Then
79100///
80101/// `n_bins` = `sqrt(n)`
102+ ///
103+ /// # Notes
104+ ///
105+ /// This strategy requires the data
106+ ///
107+ /// - not being empty
108+ /// - not being constant
81109#[ derive( Debug ) ]
82110pub struct Sqrt < T > {
83111 builder : EquiSpaced < T > ,
@@ -86,12 +114,19 @@ pub struct Sqrt<T> {
86114/// A strategy that does not take variability into account, only data size. Commonly
87115/// overestimates number of bins required.
88116///
89- /// Let `n` be the number of observations and `n_bins` the number of bins.
117+ /// Let `n` be the number of observations and `n_bins` be the number of bins.
90118///
91119/// `n_bins` = 2`n`<sup>1/3</sup>
92120///
93121/// `n_bins` is only proportional to cube root of `n`. It tends to overestimate
94122/// the `n_bins` and it does not take into account data variability.
123+ ///
124+ /// # Notes
125+ ///
126+ /// This strategy requires the data
127+ ///
128+ /// - not being empty
129+ /// - not being constant
95130#[ derive( Debug ) ]
96131pub struct Rice < T > {
97132 builder : EquiSpaced < T > ,
@@ -105,24 +140,38 @@ pub struct Rice<T> {
105140/// is too conservative for larger, non-normal datasets.
106141///
107142/// This is the default method in R’s hist method.
143+ ///
144+ /// # Notes
145+ ///
146+ /// This strategy requires the data
147+ ///
148+ /// - not being empty
149+ /// - not being constant
108150#[ derive( Debug ) ]
109151pub struct Sturges < T > {
110152 builder : EquiSpaced < T > ,
111153}
112154
113- /// Robust (resilient to outliers) strategy that takes into
114- /// account data variability and data size.
155+ /// Robust (resilient to outliers) strategy that takes into account data variability and data size.
115156///
116157/// Let `n` be the number of observations.
117158///
118- /// `bin_width` = 2× `IQR`× `n`<sup>−1/3</sup>
159+ /// `bin_width` = 2 × `IQR` × `n`<sup>−1/3</sup>
119160///
120161/// The bin width is proportional to the interquartile range ([`IQR`]) and inversely proportional to
121- /// cube root of `n`. It can be too conservative for small datasets, but it is quite good for
122- /// large datasets.
162+ /// cube root of `n`. It can be too conservative for small datasets, but it is quite good for large
163+ /// datasets.
123164///
124165/// The [`IQR`] is very robust to outliers.
125166///
167+ /// # Notes
168+ ///
169+ /// This strategy requires the data
170+ ///
171+ /// - not being empty
172+ /// - not being constant
173+ /// - having positive [`IQR`]
174+ ///
126175/// [`IQR`]: https://en.wikipedia.org/wiki/Interquartile_range
127176#[ derive( Debug ) ]
128177pub struct FreedmanDiaconis < T > {
@@ -135,16 +184,25 @@ enum SturgesOrFD<T> {
135184 FreedmanDiaconis ( FreedmanDiaconis < T > ) ,
136185}
137186
138- /// Maximum of the [`Sturges`] and [`FreedmanDiaconis`] strategies.
139- /// Provides good all around performance.
187+ /// Maximum of the [`Sturges`] and [`FreedmanDiaconis`] strategies. Provides good all around
188+ /// performance.
189+ ///
190+ /// A compromise to get a good value. For small datasets the [`Sturges`] value will usually be
191+ /// chosen, while larger datasets will usually default to [`FreedmanDiaconis`]. Avoids the overly
192+ /// conservative behaviour of [`FreedmanDiaconis`] and [`Sturges`] for small and large datasets
193+ /// respectively.
140194///
141- /// A compromise to get a good value. For small datasets the [`Sturges`] value will usually be chosen,
142- /// while larger datasets will usually default to [`FreedmanDiaconis`]. Avoids the overly
143- /// conservative behaviour of [`FreedmanDiaconis`] and [`Sturges`] for
144- /// small and large datasets respectively.
195+ /// # Notes
196+ ///
197+ /// This strategy requires the data
198+ ///
199+ /// - not being empty
200+ /// - not being constant
201+ /// - having positive [`IQR`]
145202///
146203/// [`Sturges`]: struct.Sturges.html
147204/// [`FreedmanDiaconis`]: struct.FreedmanDiaconis.html
205+ /// [`IQR`]: https://en.wikipedia.org/wiki/Interquartile_range
148206#[ derive( Debug ) ]
149207pub struct Auto < T > {
150208 builder : SturgesOrFD < T > ,
0 commit comments