Skip to content

Commit 86885b5

Browse files
committed
sql/stats: avoid mutating input buckets in stripOuterBuckets
This commit changes `stripOuterBuckets` to modify and return a copy of the given histogram buckets if it finds outer buckets to remove. Previously, we would mutate the caller's histograms with leading outer buckets in-place by zeroing the range counts on the first non-outer bucket. This effectively corrupts the first histogram bucket in stats passed in from the stats cache. Although this bug has existed since 90e311d (which zeroes the first buckets range counts), it would only impact full statistics that we tried merging with partial stats, as that was the only case in which `stripOuterBuckets` was called. The surface area of this bug increased after db9a344, which calls `stripOuterBuckets` on every full statistic, regardless of whether we end up merging it with partial stats or not. Fixes: #155184 Release note (bug fix): Previously, we could corrupt the first bucket of table statistic histograms in certain cases, causing underestimates for range counts near the lower end of the domain, which is now fixed.
1 parent 588e5f6 commit 86885b5

File tree

3 files changed

+168
-8
lines changed

3 files changed

+168
-8
lines changed

pkg/sql/logictest/testdata/logic_test/distsql_stats

Lines changed: 61 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3890,3 +3890,64 @@ SELECT jsonb_pretty(statistics->0->'histo_buckets') FROM
38903890
"upper_bound": "c"
38913891
}
38923892
]
3893+
3894+
# Ensure that stripOuterBuckets doesn't overwrite statistics (see #155184).
3895+
3896+
statement ok
3897+
CREATE TABLE t155184 (
3898+
a INT PRIMARY KEY
3899+
) WITH (sql_stats_automatic_collection_enabled = false, sql_stats_histogram_samples_count = 2)
3900+
3901+
# These stats were created with the following statements:
3902+
#
3903+
# INSERT INTO t155184 SELECT generate_series(1,10)
3904+
# ANALYZE t155184
3905+
3906+
statement ok
3907+
ALTER TABLE t155184 INJECT STATISTICS '[
3908+
{
3909+
"avg_size": 1,
3910+
"columns": [
3911+
"a"
3912+
],
3913+
"created_at": "2025-10-10 13:41:08.711908",
3914+
"distinct_count": 10,
3915+
"histo_buckets": [
3916+
{
3917+
"distinct_range": 0,
3918+
"num_eq": 0,
3919+
"num_range": 0,
3920+
"upper_bound": "-9223372036854775808"
3921+
},
3922+
{
3923+
"distinct_range": 3.5,
3924+
"num_eq": 1,
3925+
"num_range": 4,
3926+
"upper_bound": "8"
3927+
},
3928+
{
3929+
"distinct_range": 1,
3930+
"num_eq": 1,
3931+
"num_range": 1,
3932+
"upper_bound": "10"
3933+
},
3934+
{
3935+
"distinct_range": 3.5,
3936+
"num_eq": 0,
3937+
"num_range": 4,
3938+
"upper_bound": "9223372036854775807"
3939+
}
3940+
],
3941+
"histo_col_type": "INT8",
3942+
"histo_version": 3,
3943+
"id": 1114221014922133505,
3944+
"null_count": 0,
3945+
"row_count": 10
3946+
}
3947+
]'
3948+
3949+
# All we care about is the row counts, so don't error if other parts of the explain output change
3950+
query T
3951+
SELECT info FROM [EXPLAIN SELECT * FROM t155184 WHERE a < 8] WHERE info LIKE '%estimated row count:%'
3952+
----
3953+
estimated row count: 4 (36% of the table; stats collected <hidden> ago)

pkg/sql/stats/merge.go

Lines changed: 20 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -86,8 +86,9 @@ func MergedStatistics(
8686
return mergedStats
8787
}
8888

89-
// stripOuterBuckets removes the outer buckets from a histogram without a
90-
// leading NULL bucket.
89+
// stripOuterBuckets returns a copy of the histogram buckets with any outer
90+
// buckets that may have been added by addOuterBuckets removed. histogram must
91+
// not have a leading NULL bucket.
9192
func stripOuterBuckets(
9293
ctx context.Context, evalCtx *eval.Context, histogram []cat.HistogramBucket,
9394
) []cat.HistogramBucket {
@@ -96,17 +97,28 @@ func stripOuterBuckets(
9697
}
9798
startIdx := 0
9899
endIdx := len(histogram)
99-
if histogram[0].UpperBound.IsMin(ctx, evalCtx) && histogram[0].NumEq == 0 {
100+
hasLowerOuter := histogram[0].UpperBound.IsMin(ctx, evalCtx) && histogram[0].NumEq == 0
101+
if hasLowerOuter {
100102
startIdx = 1
101-
// Set the first range counts to zero to counteract range counts added by
102-
// addOuterBuckets.
103-
histogram[startIdx].NumRange = 0
104-
histogram[startIdx].DistinctRange = 0
105103
}
106104
if histogram[len(histogram)-1].UpperBound.IsMax(ctx, evalCtx) && histogram[len(histogram)-1].NumEq == 0 {
107105
endIdx = len(histogram) - 1
108106
}
109-
return histogram[startIdx:endIdx]
107+
if startIdx == 0 && endIdx == len(histogram) {
108+
return histogram
109+
}
110+
if startIdx >= endIdx {
111+
return nil
112+
}
113+
114+
out := append([]cat.HistogramBucket(nil), histogram[startIdx:endIdx]...)
115+
if hasLowerOuter {
116+
// Set the first range counts to zero to counteract range counts added by
117+
// addOuterBuckets.
118+
out[0].NumRange = 0
119+
out[0].DistinctRange = 0
120+
}
121+
return out
110122
}
111123

112124
// mergePartialStatistic merges a full statistic with a more recent partial

pkg/sql/stats/merge_test.go

Lines changed: 87 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@ import (
1515

1616
"github.com/cockroachdb/cockroach/pkg/settings/cluster"
1717
"github.com/cockroachdb/cockroach/pkg/sql/catalog/descpb"
18+
"github.com/cockroachdb/cockroach/pkg/sql/opt/cat"
1819
"github.com/cockroachdb/cockroach/pkg/sql/sem/eval"
1920
"github.com/cockroachdb/cockroach/pkg/sql/sem/tree"
2021
)
@@ -916,6 +917,92 @@ func TestMergedStatistics(t *testing.T) {
916917
}
917918
}
918919

920+
// TestStripOuterBuckets tests stripOuterBuckets which removes outer buckets
921+
// added by addOuterBuckets before merging partial statistics.
922+
func TestStripOuterBuckets(t *testing.T) {
923+
ctx := context.Background()
924+
st := cluster.MakeTestingClusterSettings()
925+
evalCtx := eval.NewTestingEvalContext(st)
926+
defer evalCtx.Stop(ctx)
927+
928+
t.Run("no outer buckets returns input", func(t *testing.T) {
929+
buckets := []cat.HistogramBucket{
930+
{NumEq: 1, NumRange: 0, DistinctRange: 0, UpperBound: tree.NewDInt(10)},
931+
{NumEq: 2, NumRange: 1, DistinctRange: 1, UpperBound: tree.NewDInt(20)},
932+
}
933+
strippedBuckets := stripOuterBuckets(ctx, evalCtx, buckets)
934+
if !reflect.DeepEqual(strippedBuckets, buckets) {
935+
t.Fatalf("expected buckets unchanged: %v", strippedBuckets)
936+
}
937+
if len(strippedBuckets) > 0 && &strippedBuckets[0] != &buckets[0] {
938+
t.Fatalf("unexpected copy of backing array when no outer buckets")
939+
}
940+
})
941+
942+
testCases := []struct {
943+
name string
944+
buckets []cat.HistogramBucket
945+
expected []cat.HistogramBucket
946+
}{
947+
{
948+
buckets: []cat.HistogramBucket{
949+
{NumEq: 0, UpperBound: tree.NewDInt(math.MinInt64)},
950+
{NumEq: 1, NumRange: 10, DistinctRange: 5, UpperBound: tree.NewDInt(30)},
951+
{NumEq: 0, UpperBound: tree.NewDInt(math.MaxInt64)},
952+
},
953+
expected: []cat.HistogramBucket{
954+
{NumEq: 1, NumRange: 0, DistinctRange: 0, UpperBound: tree.NewDInt(30)},
955+
},
956+
},
957+
{
958+
buckets: []cat.HistogramBucket{
959+
{NumEq: 0, UpperBound: tree.NewDInt(math.MinInt64)},
960+
{NumEq: 1, NumRange: 10, DistinctRange: 5, UpperBound: tree.NewDInt(30)},
961+
{NumEq: 2, NumRange: 4, DistinctRange: 3, UpperBound: tree.NewDInt(40)},
962+
},
963+
expected: []cat.HistogramBucket{
964+
{NumEq: 1, NumRange: 0, DistinctRange: 0, UpperBound: tree.NewDInt(30)},
965+
{NumEq: 2, NumRange: 4, DistinctRange: 3, UpperBound: tree.NewDInt(40)},
966+
},
967+
},
968+
{
969+
buckets: []cat.HistogramBucket{
970+
{NumEq: 3, NumRange: 0, DistinctRange: 0, UpperBound: tree.NewDInt(30)},
971+
{NumEq: 2, NumRange: 4, DistinctRange: 3, UpperBound: tree.NewDInt(40)},
972+
{NumEq: 0, UpperBound: tree.NewDInt(math.MaxInt64)},
973+
},
974+
expected: []cat.HistogramBucket{
975+
{NumEq: 3, NumRange: 0, DistinctRange: 0, UpperBound: tree.NewDInt(30)},
976+
{NumEq: 2, NumRange: 4, DistinctRange: 3, UpperBound: tree.NewDInt(40)},
977+
},
978+
},
979+
}
980+
for i, tc := range testCases {
981+
t.Run(tc.name, func(t *testing.T) {
982+
buckets := append([]cat.HistogramBucket(nil), tc.buckets...)
983+
bucketsCopy := append([]cat.HistogramBucket(nil), buckets...)
984+
strippedBuckets := stripOuterBuckets(ctx, evalCtx, buckets)
985+
986+
if !reflect.DeepEqual(strippedBuckets, tc.expected) {
987+
t.Fatalf("test case %d incorrect, stripped buckets:\n%v\nexpected:\n%v",
988+
i, strippedBuckets, tc.expected)
989+
}
990+
if !reflect.DeepEqual(buckets, bucketsCopy) {
991+
t.Fatalf("test case %d unexpected mutation of input buckets:\n%v\nexpected:\n%v",
992+
i, buckets, bucketsCopy)
993+
}
994+
if len(strippedBuckets) > 0 {
995+
for bi := range buckets {
996+
if &strippedBuckets[0] == &buckets[bi] {
997+
t.Fatalf("test case %d expected stripped buckets result to copy"+
998+
" input slice, but shares backing array", i)
999+
}
1000+
}
1001+
}
1002+
})
1003+
}
1004+
}
1005+
9191006
func (tabStat *TableStatistic) RoundDistinctRanges() {
9201007
for i := range tabStat.Histogram {
9211008
tabStat.Histogram[i].DistinctRange = math.Round(tabStat.Histogram[i].DistinctRange)

0 commit comments

Comments
 (0)