@@ -90,6 +90,29 @@ const (
9090 // values from the spans a check constraint is allowed to have in order to build
9191 // a histogram from it.
9292 maxValuesForFullHistogramFromCheckConstraint = tabledesc .MaxBucketAllowed
93+
94+ // histogramPessimisticThreshold determines the cutoff point below which the
95+ // selectivity estimate of a histogram is overridden with a more pessimistic
96+ // estimate. This is to avoid over-fitting to a stale or inaccurate histogram.
97+ //
98+ // The value (1 in 10,000) was chosen because we choose sample sizes according
99+ // to table size such that we expect to *nearly* always sample all values with
100+ // multiplicity >= row_count/10000. Cardinality estimates below this threshold
101+ // are increasingly likely to be inaccurate. See also computeNumberSamples.
102+ histogramPessimisticThreshold = 1.0 / 10000.0
103+
104+ // histogramUnboundedInequalityMinSelectivity determines the minimum selectivity
105+ // estimate that can be derived from an unbounded (above or below) inequality
106+ // used to filter a histogram. Similar to histogramPessimisticThreshold, this
107+ // is to avoid over-fitting to a stale or inaccurate histogram.
108+ //
109+ // The value (1 in 10,000) was chosen based on similar logic in Postgres,
110+ // which caps the selectivity to 1 / (bucket_count * 100). Postgres uses 100
111+ // histogram buckets by default, so the number comes out to 10,000. We avoid
112+ // using the number of histogram buckets directly to avoid arbitrary
113+ // variation in the selectivity cap depending on user settings and partial
114+ // stat collections.
115+ histogramUnboundedInequalityMinSelectivity = 1.0 / 10000.0
93116)
94117
95118// statisticsBuilder is responsible for building the statistics that are
@@ -761,7 +784,10 @@ func (sb *statisticsBuilder) makeTableStatistics(tabID opt.TableID) *props.Stati
761784 // is tracked here: https://github.com/cockroachdb/cockroach/issues/50655
762785 col := cols .SingleColumn ()
763786 colStat .Histogram = & props.Histogram {}
764- colStat .Histogram .Init (sb .evalCtx , col , stat .Histogram ())
787+ // Track the minimum number of rows for which histogram selectivity
788+ // estimates are trusted.
789+ resolution := histogramPessimisticThreshold * stats .RowCount
790+ colStat .Histogram .Init (sb .evalCtx , col , stat .Histogram (), resolution )
765791 }
766792
767793 // Make sure the distinct count is at least 1, for the same reason as
@@ -786,7 +812,18 @@ func (sb *statisticsBuilder) makeTableStatistics(tabID opt.TableID) *props.Stati
786812 invCols := opt .MakeColSet (invCol )
787813 if invColStat , ok := stats .ColStats .Add (invCols ); ok {
788814 invColStat .Histogram = & props.Histogram {}
789- invColStat .Histogram .Init (sb .evalCtx , invCol , stat .Histogram ())
815+ // Track the minimum number of rows for which histogram selectivity
816+ // estimates are trusted.
817+ //
818+ // NOTE: an inverted index can have multiple entries per table row.
819+ // However, we still use the number of table rows here because the
820+ // max multiplicity of a missed value is proportional to the number
821+ // of table rows, not the number of inverted index entries. For
822+ // example, the arrays [10, 20, 30] and [20, 40, 60] result in six
823+ // inverted index entries, but only a maximum multiplicity of two
824+ // for the value "20".
825+ resolution := histogramPessimisticThreshold * stats .RowCount
826+ invColStat .Histogram .Init (sb .evalCtx , invCol , stat .Histogram (), resolution )
790827 // Set inverted entry counts from the histogram. Make sure the
791828 // distinct count is at least 1, for the same reason as the row
792829 // count above.
@@ -4558,10 +4595,15 @@ func (sb *statisticsBuilder) selectivityFromHistograms(
45584595 newCount := newHist .ValuesCount ()
45594596 oldCount := oldHist .ValuesCount ()
45604597
4561- // Calculate the selectivity of the predicate. Nulls are already included
4562- // in the histogram, so we do not need to account for them separately.
4598+ // Calculate the selectivity of the predicate using the histogram. Nulls
4599+ // are already included in the histogram, so we do not need to account for
4600+ // them separately.
45634601 predicateSelectivity := props .MakeSelectivityFromFraction (newCount , oldCount )
45644602
4603+ // Possibly clamp the selectivity to a higher value to avoid overly
4604+ // optimistic estimates.
4605+ predicateSelectivity = sb .clampSelForHistogram (inputColStat , colStat , s , predicateSelectivity )
4606+
45654607 // The maximum possible selectivity of the entire expression is the minimum
45664608 // selectivity of all individual predicates.
45674609 selectivityUpperBound = props .MinSelectivity (selectivityUpperBound , predicateSelectivity )
@@ -4572,6 +4614,50 @@ func (sb *statisticsBuilder) selectivityFromHistograms(
45724614 return selectivity , selectivityUpperBound
45734615}
45744616
4617+ // clampSelForHistogram raises the selectivity estimate derived from a histogram
4618+ // to a minimum value; it never lowers originalSel. This accounts for the
4619+ // possibility that the histogram is missing values due to sampling or staleness.
4620+ // See also histogramPessimisticThreshold and histogramUnboundedInequalityMinSelectivity.
4621+ func (sb * statisticsBuilder ) clampSelForHistogram (
4622+ oldColStat , newColStat * props.ColumnStatistic , s * props.Statistics , originalSel props.Selectivity ,
4623+ ) (clampedSel props.Selectivity ) {
4624+ clampedSel = originalSel
4625+ oldHist , newHist := oldColStat .Histogram , newColStat .Histogram
4626+ if sb .evalCtx .SessionData ().OptimizerClampLowHistogramSelectivity &&
4627+ newHist .ValuesCount () < oldHist .Resolution () {
4628+ // NOTE: columns with histograms are skipped when considering distinct
4629+ // counts in selectivityFromSingleColDistinctCounts, so this doesn't
4630+ // double count the effect of the predicate.
4631+ resClamp := props .MakeSelectivityFromFraction (newColStat .DistinctCount , oldColStat .DistinctCount )
4632+
4633+ // Cap the clamp at resolution/row_count so that the clamped row count
4634+ // estimate is no more than the pessimistic threshold. This can result in a
4635+ // lower estimate if the multiplicities of the filtered values really are
4636+ // low compared to the average multiplicity.
4637+ resClamp = props .MinSelectivity (resClamp ,
4638+ props .MakeSelectivityFromFraction (oldHist .Resolution (), s .RowCount ),
4639+ )
4640+ if resClamp .AsFloat () > clampedSel .AsFloat () {
4641+ sb .mem .optimizationStats .ClampedHistogramSelectivity = true
4642+ }
4643+ clampedSel = props .MaxSelectivity (clampedSel , resClamp )
4644+ }
4645+
4646+ tightUpperBound , tightLowerBound := newHist .TightBounds ()
4647+ if sb .evalCtx .SessionData ().OptimizerClampInequalitySelectivity &&
4648+ (! tightUpperBound || ! tightLowerBound ) {
4649+ // Similar to Postgres, assume that an open-ended inequality predicate will
4650+ // scan at least 1/10000th of the table. This accounts for the possibility
4651+ // that the histogram missed extreme values due to sampling or staleness.
4652+ inequalityClamp := props .MakeSelectivity (histogramUnboundedInequalityMinSelectivity )
4653+ if inequalityClamp .AsFloat () > clampedSel .AsFloat () {
4654+ sb .mem .optimizationStats .ClampedInequalitySelectivity = true
4655+ }
4656+ clampedSel = props .MaxSelectivity (clampedSel , inequalityClamp )
4657+ }
4658+ return clampedSel
4659+ }
4660+
45754661// selectivityFromMaxFrequencies calculates the selectivity of equality
45764662// filters by using the maximum frequency of the histograms of the constrained
45774663// columns. This represents a worst-case selectivity estimate and is used to
@@ -5332,7 +5418,10 @@ func (sb *statisticsBuilder) buildStatsFromCheckConstraints(
53325418 colStat .NullCount = nullCount
53335419 if useHistogram {
53345420 colStat .Histogram = & props.Histogram {}
5335- colStat .Histogram .Init (sb .evalCtx , firstColID , histogram )
5421+ // Track the minimum number of rows for which histogram selectivity
5422+ // estimates are trusted.
5423+ resolution := histogramPessimisticThreshold * statistics .RowCount
5424+ colStat .Histogram .Init (sb .evalCtx , firstColID , histogram , resolution )
53365425 }
53375426 sb .finalizeFromRowCountAndDistinctCounts (colStat , statistics )
53385427 tabMeta .AddCheckConstraintsStats (firstColID , colStat )
0 commit comments