Skip to content

Commit f3ad3f9

Browse files
authored
Merge pull request #156399 from DrewKimball/backport25.4-153067
release-25.4: opt: make outside-of-histogram estimates more pessimistic
2 parents 865b8bd + 13ab6fd commit f3ad3f9

File tree

19 files changed

+3654
-315
lines changed

19 files changed

+3654
-315
lines changed

pkg/sql/exec_util.go

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4447,6 +4447,14 @@ func (m *sessionDataMutator) SetOptimizerUseImprovedHoistJoinProject(val bool) {
44474447
m.data.OptimizerUseImprovedHoistJoinProject = val
44484448
}
44494449

4450+
func (m *sessionDataMutator) SetOptimizerClampLowHistogramSelectivity(val bool) {
4451+
m.data.OptimizerClampLowHistogramSelectivity = val
4452+
}
4453+
4454+
func (m *sessionDataMutator) SetOptimizerClampInequalitySelectivity(val bool) {
4455+
m.data.OptimizerClampInequalitySelectivity = val
4456+
}
4457+
44504458
// Utility functions related to scrubbing sensitive information on SQL Stats.
44514459

44524460
// quantizeCounts ensures that the Count field in the

pkg/sql/logictest/testdata/logic_test/information_schema

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4189,6 +4189,8 @@ opt_split_scan_limit 2048
41894189
optimizer on
41904190
optimizer_always_use_histograms on
41914191
optimizer_check_input_min_row_count 1
4192+
optimizer_clamp_inequality_selectivity off
4193+
optimizer_clamp_low_histogram_selectivity off
41924194
optimizer_disable_cross_region_cascade_fast_path_for_rbr_tables on
41934195
optimizer_enable_lock_elision on
41944196
optimizer_hoist_uncorrelated_equality_subqueries on

pkg/sql/logictest/testdata/logic_test/pg_catalog

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3091,6 +3091,8 @@ on_update_rehome_row_enabled on
30913091
opt_split_scan_limit 2048 NULL NULL NULL string
30923092
optimizer_always_use_histograms on NULL NULL NULL string
30933093
optimizer_check_input_min_row_count 1 NULL NULL NULL string
3094+
optimizer_clamp_inequality_selectivity off NULL NULL NULL string
3095+
optimizer_clamp_low_histogram_selectivity off NULL NULL NULL string
30943096
optimizer_disable_cross_region_cascade_fast_path_for_rbr_tables on NULL NULL NULL string
30953097
optimizer_enable_lock_elision on NULL NULL NULL string
30963098
optimizer_hoist_uncorrelated_equality_subqueries on NULL NULL NULL string
@@ -3336,6 +3338,8 @@ on_update_rehome_row_enabled on
33363338
opt_split_scan_limit 2048 NULL user NULL 2048 2048
33373339
optimizer_always_use_histograms on NULL user NULL on on
33383340
optimizer_check_input_min_row_count 1 NULL user NULL 1 1
3341+
optimizer_clamp_inequality_selectivity off NULL user NULL off off
3342+
optimizer_clamp_low_histogram_selectivity off NULL user NULL off off
33393343
optimizer_disable_cross_region_cascade_fast_path_for_rbr_tables on NULL user NULL on on
33403344
optimizer_enable_lock_elision on NULL user NULL on on
33413345
optimizer_hoist_uncorrelated_equality_subqueries on NULL user NULL on on
@@ -3572,6 +3576,8 @@ opt_split_scan_limit NULL NULL
35723576
optimizer NULL NULL NULL NULL NULL
35733577
optimizer_always_use_histograms NULL NULL NULL NULL NULL
35743578
optimizer_check_input_min_row_count NULL NULL NULL NULL NULL
3579+
optimizer_clamp_inequality_selectivity NULL NULL NULL NULL NULL
3580+
optimizer_clamp_low_histogram_selectivity NULL NULL NULL NULL NULL
35753581
optimizer_disable_cross_region_cascade_fast_path_for_rbr_tables NULL NULL NULL NULL NULL
35763582
optimizer_enable_lock_elision NULL NULL NULL NULL NULL
35773583
optimizer_hoist_uncorrelated_equality_subqueries NULL NULL NULL NULL NULL

pkg/sql/logictest/testdata/logic_test/show_source

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -155,6 +155,8 @@ on_update_rehome_row_enabled on
155155
opt_split_scan_limit 2048
156156
optimizer_always_use_histograms on
157157
optimizer_check_input_min_row_count 1
158+
optimizer_clamp_inequality_selectivity off
159+
optimizer_clamp_low_histogram_selectivity off
158160
optimizer_disable_cross_region_cascade_fast_path_for_rbr_tables on
159161
optimizer_enable_lock_elision on
160162
optimizer_hoist_uncorrelated_equality_subqueries on

pkg/sql/opt/memo/memo.go

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -211,6 +211,8 @@ type Memo struct {
211211
useExistsFilterHoistRule bool
212212
disableSlowCascadeFastPathForRBRTables bool
213213
useImprovedHoistJoinProject bool
214+
clampLowHistogramSelectivity bool
215+
clampInequalitySelectivity bool
214216

215217
// txnIsoLevel is the isolation level under which the plan was created. This
216218
// affects the planning of some locking operations, so it must be included in
@@ -236,6 +238,10 @@ type Memo struct {
236238
// erring with partially normalized expressions.
237239
disableCheckExpr bool
238240

241+
// optimizationStats tracks decisions made during optimization, for example,
242+
// to clamp selectivity estimates to a lower bound.
243+
optimizationStats OptimizationStats
244+
239245
// WARNING: if you add more members, add initialization code in Init (if
240246
// reusing allocated data structures is desired).
241247
}
@@ -318,6 +324,8 @@ func (m *Memo) Init(ctx context.Context, evalCtx *eval.Context) {
318324
useExistsFilterHoistRule: evalCtx.SessionData().OptimizerUseExistsFilterHoistRule,
319325
disableSlowCascadeFastPathForRBRTables: evalCtx.SessionData().OptimizerDisableCrossRegionCascadeFastPathForRBRTables,
320326
useImprovedHoistJoinProject: evalCtx.SessionData().OptimizerUseImprovedHoistJoinProject,
327+
clampLowHistogramSelectivity: evalCtx.SessionData().OptimizerClampLowHistogramSelectivity,
328+
clampInequalitySelectivity: evalCtx.SessionData().OptimizerClampInequalitySelectivity,
321329
txnIsoLevel: evalCtx.TxnIsoLevel,
322330
}
323331
m.metadata.Init()
@@ -493,6 +501,8 @@ func (m *Memo) IsStale(
493501
m.useExistsFilterHoistRule != evalCtx.SessionData().OptimizerUseExistsFilterHoistRule ||
494502
m.disableSlowCascadeFastPathForRBRTables != evalCtx.SessionData().OptimizerDisableCrossRegionCascadeFastPathForRBRTables ||
495503
m.useImprovedHoistJoinProject != evalCtx.SessionData().OptimizerUseImprovedHoistJoinProject ||
504+
m.clampLowHistogramSelectivity != evalCtx.SessionData().OptimizerClampLowHistogramSelectivity ||
505+
m.clampInequalitySelectivity != evalCtx.SessionData().OptimizerClampInequalitySelectivity ||
496506
m.txnIsoLevel != evalCtx.TxnIsoLevel {
497507
return true, nil
498508
}
@@ -692,6 +702,25 @@ func (m *Memo) FormatExpr(expr opt.Expr) string {
692702
return f.Buffer.String()
693703
}
694704

705+
// OptimizationStats surfaces information about choices made during optimization
706+
// of a query for top-level observability (e.g. metrics, EXPLAIN output).
707+
type OptimizationStats struct {
708+
// ClampedHistogramSelectivity is true if the selectivity estimate based on a
709+
// histogram was prevented from dropping too low. See also the session var
710+
// "optimizer_clamp_low_histogram_selectivity".
711+
ClampedHistogramSelectivity bool
712+
// ClampedInequalitySelectivity is true if the selectivity estimate for an
713+
// inequality unbounded on one or both sides was prevented from dropping too
714+
// low. See also the session var "optimizer_clamp_inequality_selectivity".
715+
ClampedInequalitySelectivity bool
716+
}
717+
718+
// GetOptimizationStats returns the OptimizationStats collected during a
719+
// previous optimization pass.
720+
func (m *Memo) GetOptimizationStats() *OptimizationStats {
721+
return &m.optimizationStats
722+
}
723+
695724
// ValuesContainer lets ValuesExpr and LiteralValuesExpr share code.
696725
type ValuesContainer interface {
697726
RelExpr

pkg/sql/opt/memo/memo_test.go

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -595,6 +595,16 @@ func TestMemoIsStale(t *testing.T) {
595595
evalCtx.SessionData().OptimizerUseImprovedHoistJoinProject = false
596596
notStale()
597597

598+
evalCtx.SessionData().OptimizerClampLowHistogramSelectivity = true
599+
stale()
600+
evalCtx.SessionData().OptimizerClampLowHistogramSelectivity = false
601+
notStale()
602+
603+
evalCtx.SessionData().OptimizerClampInequalitySelectivity = true
604+
stale()
605+
evalCtx.SessionData().OptimizerClampInequalitySelectivity = false
606+
notStale()
607+
598608
// User no longer has access to view.
599609
catalog.View(tree.NewTableNameWithSchema("t", catconstants.PublicSchemaName, "abcview")).Revoked = true
600610
_, err = o.Memo().IsStale(ctx, &evalCtx, catalog)

pkg/sql/opt/memo/statistics_builder.go

Lines changed: 94 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -90,6 +90,29 @@ const (
9090
// values from the spans a check constraint is allowed to have in order to build
9191
// a histogram from it.
9292
maxValuesForFullHistogramFromCheckConstraint = tabledesc.MaxBucketAllowed
93+
94+
// histogramPessimisticThreshold determines the cutoff point below which the
95+
// selectivity estimate of a histogram is overridden with a more pessimistic
96+
// estimate. This is to avoid over-fitting to a stale or inaccurate histogram.
97+
//
98+
// The value (1 in 10,000) was chosen because we choose sample sizes according
99+
// to table size such that we expect to *nearly* always sample all values with
100+
// multiplicity >= row_count/10000. Cardinality estimates below this threshold
101+
// are increasingly likely to be inaccurate. See also computeNumberSamples.
102+
histogramPessimisticThreshold = 1.0 / 10000.0
103+
104+
// histogramUnboundedInequalityMinSelectivity determines the minimum selectivity
105+
// estimate that can be derived from an unbounded (above or below) inequality
106+
// used to filter a histogram. Similar to histogramPessimisticThreshold, this
107+
// is to avoid over-fitting to a stale or inaccurate histogram.
108+
//
109+
// The value (1 in 10,000) was chosen based on similar logic in Postgres,
110+
// which clamps the selectivity to at least 1 / (bucket_count * 100). Postgres uses 100
111+
// histogram buckets by default, so the number comes out to 10,000. We avoid
112+
// using the number of histogram buckets directly to avoid arbitrary
113+
// variation in the selectivity cap depending on user settings and partial
114+
// stat collections.
115+
histogramUnboundedInequalityMinSelectivity = 1.0 / 10000.0
93116
)
94117

95118
// statisticsBuilder is responsible for building the statistics that are
@@ -761,7 +784,10 @@ func (sb *statisticsBuilder) makeTableStatistics(tabID opt.TableID) *props.Stati
761784
// is tracked here: https://github.com/cockroachdb/cockroach/issues/50655
762785
col := cols.SingleColumn()
763786
colStat.Histogram = &props.Histogram{}
764-
colStat.Histogram.Init(sb.evalCtx, col, stat.Histogram())
787+
// Track the minimum number of rows for which histogram selectivity
788+
// estimates are trusted.
789+
resolution := histogramPessimisticThreshold * stats.RowCount
790+
colStat.Histogram.Init(sb.evalCtx, col, stat.Histogram(), resolution)
765791
}
766792

767793
// Make sure the distinct count is at least 1, for the same reason as
@@ -786,7 +812,18 @@ func (sb *statisticsBuilder) makeTableStatistics(tabID opt.TableID) *props.Stati
786812
invCols := opt.MakeColSet(invCol)
787813
if invColStat, ok := stats.ColStats.Add(invCols); ok {
788814
invColStat.Histogram = &props.Histogram{}
789-
invColStat.Histogram.Init(sb.evalCtx, invCol, stat.Histogram())
815+
// Track the minimum number of rows for which histogram selectivity
816+
// estimates are trusted.
817+
//
818+
// NOTE: an inverted index can have multiple entries per table row.
819+
// However, we still use the number of table rows here because the
820+
// max multiplicity of a missed value is proportional to the number
821+
// of table rows, not the number of inverted index entries. For
822+
// example, the arrays [10, 20, 30] and [20, 40, 60] result in six
823+
// inverted index entries, but only a maximum multiplicity of two
824+
// for the value "20".
825+
resolution := histogramPessimisticThreshold * stats.RowCount
826+
invColStat.Histogram.Init(sb.evalCtx, invCol, stat.Histogram(), resolution)
790827
// Set inverted entry counts from the histogram. Make sure the
791828
// distinct count is at least 1, for the same reason as the row
792829
// count above.
@@ -4558,10 +4595,15 @@ func (sb *statisticsBuilder) selectivityFromHistograms(
45584595
newCount := newHist.ValuesCount()
45594596
oldCount := oldHist.ValuesCount()
45604597

4561-
// Calculate the selectivity of the predicate. Nulls are already included
4562-
// in the histogram, so we do not need to account for them separately.
4598+
// Calculate the selectivity of the predicate using the histogram. Nulls
4599+
// are already included in the histogram, so we do not need to account for
4600+
// them separately.
45634601
predicateSelectivity := props.MakeSelectivityFromFraction(newCount, oldCount)
45644602

4603+
// Possibly clamp the selectivity to a higher value to avoid overly
4604+
// optimistic estimates.
4605+
predicateSelectivity = sb.clampSelForHistogram(inputColStat, colStat, s, predicateSelectivity)
4606+
45654607
// The maximum possible selectivity of the entire expression is the minimum
45664608
// selectivity of all individual predicates.
45674609
selectivityUpperBound = props.MinSelectivity(selectivityUpperBound, predicateSelectivity)
@@ -4572,6 +4614,50 @@ func (sb *statisticsBuilder) selectivityFromHistograms(
45724614
return selectivity, selectivityUpperBound
45734615
}
45744616

4617+
// clampSelForHistogram clamps the selectivity estimate derived from a histogram
4618+
// to a minimum value. This accounts for the possibility that the histogram is
4619+
// missing values due to sampling or staleness. See also
4620+
// histogramPessimisticThreshold.
4621+
func (sb *statisticsBuilder) clampSelForHistogram(
4622+
oldColStat, newColStat *props.ColumnStatistic, s *props.Statistics, originalSel props.Selectivity,
4623+
) (clampedSel props.Selectivity) {
4624+
clampedSel = originalSel
4625+
oldHist, newHist := oldColStat.Histogram, newColStat.Histogram
4626+
if sb.evalCtx.SessionData().OptimizerClampLowHistogramSelectivity &&
4627+
newHist.ValuesCount() < oldHist.Resolution() {
4628+
// NOTE: columns with histograms are skipped when considering distinct
4629+
// counts in selectivityFromSingleColDistinctCounts, so this doesn't
4630+
// double count the effect of the predicate.
4631+
resClamp := props.MakeSelectivityFromFraction(newColStat.DistinctCount, oldColStat.DistinctCount)
4632+
4633+
// Cap the selectivity so that the row count estimate is no more than the
4634+
// pessimistic threshold. This can result in a lower estimate if the
4635+
// multiplicities of the filtered values really are low compared to the
4636+
// average multiplicity.
4637+
resClamp = props.MinSelectivity(resClamp,
4638+
props.MakeSelectivityFromFraction(oldHist.Resolution(), s.RowCount),
4639+
)
4640+
if resClamp.AsFloat() > clampedSel.AsFloat() {
4641+
sb.mem.optimizationStats.ClampedHistogramSelectivity = true
4642+
}
4643+
clampedSel = props.MaxSelectivity(clampedSel, resClamp)
4644+
}
4645+
4646+
tightUpperBound, tightLowerBound := newHist.TightBounds()
4647+
if sb.evalCtx.SessionData().OptimizerClampInequalitySelectivity &&
4648+
(!tightUpperBound || !tightLowerBound) {
4649+
// Similar to Postgres, assume that an open-ended inequality predicate will
4650+
// scan at least 1/10000th of the table. This accounts for the possibility
4651+
// that the histogram missed extreme values due to sampling or staleness.
4652+
inequalityClamp := props.MakeSelectivity(histogramUnboundedInequalityMinSelectivity)
4653+
if inequalityClamp.AsFloat() > clampedSel.AsFloat() {
4654+
sb.mem.optimizationStats.ClampedInequalitySelectivity = true
4655+
}
4656+
clampedSel = props.MaxSelectivity(clampedSel, inequalityClamp)
4657+
}
4658+
return clampedSel
4659+
}
4660+
45754661
// selectivityFromMaxFrequencies calculates the selectivity of equality
45764662
// filters by using the maximum frequency of the histograms of the constrained
45774663
// columns. This represents a worst-case selectivity estimate and is used to
@@ -5332,7 +5418,10 @@ func (sb *statisticsBuilder) buildStatsFromCheckConstraints(
53325418
colStat.NullCount = nullCount
53335419
if useHistogram {
53345420
colStat.Histogram = &props.Histogram{}
5335-
colStat.Histogram.Init(sb.evalCtx, firstColID, histogram)
5421+
// Track the minimum number of rows for which histogram selectivity
5422+
// estimates are trusted.
5423+
resolution := histogramPessimisticThreshold * statistics.RowCount
5424+
colStat.Histogram.Init(sb.evalCtx, firstColID, histogram, resolution)
53365425
}
53375426
sb.finalizeFromRowCountAndDistinctCounts(colStat, statistics)
53385427
tabMeta.AddCheckConstraintsStats(firstColID, colStat)

0 commit comments

Comments
 (0)