Skip to content

Commit 6cabe06

Browse files
craig[bot]michae2kev-cao
committed
155035: sql/stats: handle range counts when skipping dropped enum hist buckets r=Uzair5162,yuzefovich a=michae2 **opt/props: only check histogram NumRange=0 in test builds** These assertions that the first histogram bucket has NumRange=0 are useful for catching bugs, but a liability in production environments. Even with malformed histograms, we should not be failing query execution. Only check these assertions in test builds. Informs: #154461 Release note (bug fix): Change assertions about histogram NumRange=0 to only be checked in test builds. --- **sql/stats: handle range counts when skipping dropped enum hist buckets** In #136538 we changed DecodeHistogramBuckets to skip over histogram buckets for dropped enum values. This change assumed that histogram buckets for enum types would never contain range counts, but this is not true. For example, if the number of distinct enum values exceeds the number of histogram buckets there could be range counts. Fixes: #154461 Release note (bug fix): Fix a bug in which range counts in table statistics histograms were not handled correctly after a user-defined enum type was modified. 155274: logical: add initial/catchup scan metrics to offline scan r=msbutler a=kev-cao This patch teaches the LDR offline scan processor to emit checkpoint range stats and update the `logical_replication.scanning_ranges` and `logical_replication.catchup_ranges` metrics. Informs: #152273 Release note: LDR now updates the `logical_replication.scanning_ranges` and `logical_replication.catchup_ranges` metrics during fast initial scan. 155492: physical: deflake TestCreateTenantFromReplicationUsingID r=msbutler a=kev-cao We occasionally see slow quiesces on the `TestCreateTenantFromReplicationUsingID` test. This appears to be caused by shutting down the server while the source tenant is still being created. To avoid this, we wait until the SQL server on the source tenant is ready before teardown. Fixes: #155345 Release note: None Co-authored-by: Michael Erickson <michae2@cockroachlabs.com> Co-authored-by: Kevin Cao <39608887+kev-cao@users.noreply.github.com>
4 parents 0cda953 + d82a204 + 02bdcbc + 97b2e83 commit 6cabe06

File tree

17 files changed

+792
-96
lines changed

17 files changed

+792
-96
lines changed

pkg/crosscluster/logical/logical_replication_job.go

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -456,9 +456,11 @@ type logicalReplicationPlanner struct {
456456
}
457457

458458
type logicalReplicationPlanInfo struct {
459-
sourceSpans []roachpb.Span
460-
partitionPgUrls []string
461-
destTableBySrcID map[descpb.ID]dstTableMetadata
459+
sourceSpans []roachpb.Span
460+
partitionPgUrls []string
461+
destTableBySrcID map[descpb.ID]dstTableMetadata
462+
// Number of processors writing data on the destination cluster (offline or
463+
// otherwise).
462464
writeProcessorCount int
463465
}
464466

@@ -738,6 +740,7 @@ func (p *logicalReplicationPlanner) planOfflineInitialScan(
738740
SQLInstanceID: instanceID,
739741
Core: execinfrapb.ProcessorCoreUnion{LogicalReplicationOfflineScan: &spec},
740742
})
743+
info.writeProcessorCount++
741744
}
742745
}
743746

pkg/crosscluster/logical/offline_initial_scan_processor.go

Lines changed: 47 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@ import (
1313

1414
"github.com/cockroachdb/cockroach/pkg/backup"
1515
"github.com/cockroachdb/cockroach/pkg/crosscluster"
16+
"github.com/cockroachdb/cockroach/pkg/crosscluster/replicationutils"
1617
"github.com/cockroachdb/cockroach/pkg/crosscluster/streamclient"
1718
"github.com/cockroachdb/cockroach/pkg/jobs/jobspb"
1819
"github.com/cockroachdb/cockroach/pkg/kv/bulk"
@@ -65,6 +66,8 @@ type offlineInitialScanProcessor struct {
6566

6667
checkpointCh chan offlineCheckpoint
6768

69+
rangeStatsCh chan *streampb.StreamEvent_RangeStats
70+
6871
rekey *backup.KeyRewriter
6972

7073
batcher *bulk.SSTBatcher
@@ -104,6 +107,7 @@ func newNewOfflineInitialScanProcessor(
104107
processorID: processorID,
105108
stopCh: make(chan struct{}),
106109
checkpointCh: make(chan offlineCheckpoint),
110+
rangeStatsCh: make(chan *streampb.StreamEvent_RangeStats),
107111
errCh: make(chan error, 1),
108112
rekey: rekeyer,
109113
lastKeyAdded: roachpb.Key{},
@@ -220,6 +224,7 @@ func (o *offlineInitialScanProcessor) Start(ctx context.Context) {
220224
})
221225
o.workerGroup.GoCtx(func(ctx context.Context) error {
222226
defer close(o.checkpointCh)
227+
defer close(o.rangeStatsCh)
223228
pprof.Do(ctx, pprof.Labels("proc", fmt.Sprintf("%d", o.ProcessorID)), func(ctx context.Context) {
224229
for event := range o.subscription.Events() {
225230
if err := o.handleEvent(ctx, event); err != nil {
@@ -245,16 +250,8 @@ func (o *offlineInitialScanProcessor) Next() (rowenc.EncDatumRow, *execinfrapb.P
245250
case checkpoint, ok := <-o.checkpointCh:
246251
switch {
247252
case !ok:
248-
select {
249-
case err := <-o.errCh:
250-
o.MoveToDrainingAndLogError(err)
251-
return nil, o.DrainHelper()
252-
case <-time.After(10 * time.Second):
253-
logcrash.ReportOrPanic(o.Ctx(), &o.FlowCtx.Cfg.Settings.SV,
254-
"event channel closed but no error found on err channel after 10 seconds")
255-
o.MoveToDrainingAndLogError(nil /* error */)
256-
return nil, o.DrainHelper()
257-
}
253+
o.MoveToDrainingAndLogError(o.waitForErr())
254+
return nil, o.DrainHelper()
258255
case checkpoint.afterInitialScanCompletion:
259256
// The previous checkpoint completed the initial scan and was already
260257
// ingested by the coordinator, so we can gracefully shut down the
@@ -273,6 +270,18 @@ func (o *offlineInitialScanProcessor) Next() (rowenc.EncDatumRow, *execinfrapb.P
273270
}
274271
return row, nil
275272
}
273+
case stats, ok := <-o.rangeStatsCh:
274+
if !ok {
275+
o.MoveToDrainingAndLogError(o.waitForErr())
276+
return nil, o.DrainHelper()
277+
}
278+
279+
meta, err := replicationutils.StreamRangeStatsToProgressMeta(o.FlowCtx, o.ProcessorID, stats)
280+
if err != nil {
281+
o.MoveToDrainingAndLogError(err)
282+
return nil, o.DrainHelper()
283+
}
284+
return nil, meta
276285
case err := <-o.errCh:
277286
o.MoveToDrainingAndLogError(err)
278287
return nil, o.DrainHelper()
@@ -345,7 +354,7 @@ func (o *offlineInitialScanProcessor) handleEvent(
345354
return err
346355
}
347356
case crosscluster.CheckpointEvent:
348-
if err := o.checkpoint(ctx, event.GetCheckpoint().ResolvedSpans); err != nil {
357+
if err := o.checkpoint(ctx, event.GetCheckpoint()); err != nil {
349358
return err
350359
}
351360
case crosscluster.SSTableEvent, crosscluster.DeleteRangeEvent:
@@ -358,9 +367,26 @@ func (o *offlineInitialScanProcessor) handleEvent(
358367
return nil
359368
}
360369

370+
// waitForErr waits for an error to be sent on the error channel and returns the
371+
// error if one is found within the timeout.
372+
func (o *offlineInitialScanProcessor) waitForErr() error {
373+
select {
374+
case err := <-o.errCh:
375+
return err
376+
case <-time.After(10 * time.Second):
377+
logcrash.ReportOrPanic(o.Ctx(), &o.FlowCtx.Cfg.Settings.SV,
378+
"event channel closed but no error found on err channel after 10 seconds")
379+
return nil
380+
}
381+
}
382+
361383
func (o *offlineInitialScanProcessor) checkpoint(
362-
ctx context.Context, resolvedSpans []jobspb.ResolvedSpan,
384+
ctx context.Context, checkpoint *streampb.StreamEvent_StreamCheckpoint,
363385
) error {
386+
if checkpoint == nil {
387+
return errors.New("nil checkpoint event")
388+
}
389+
resolvedSpans := checkpoint.ResolvedSpans
364390
if resolvedSpans == nil {
365391
return errors.New("checkpoint event expected to have resolved spans")
366392
}
@@ -406,6 +432,15 @@ func (o *offlineInitialScanProcessor) checkpoint(
406432
// shutdown the processor.
407433
o.initialScanCompleted = true
408434
}
435+
436+
if checkpoint.RangeStats != nil {
437+
select {
438+
case o.rangeStatsCh <- checkpoint.RangeStats:
439+
case <-o.stopCh:
440+
case <-ctx.Done():
441+
return ctx.Err()
442+
}
443+
}
409444
return nil
410445
}
411446

pkg/crosscluster/physical/stream_ingestion_planning_test.go

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@ import (
1313
"github.com/cockroachdb/cockroach/pkg/crosscluster/replicationtestutils"
1414
"github.com/cockroachdb/cockroach/pkg/jobs"
1515
"github.com/cockroachdb/cockroach/pkg/security/username"
16+
"github.com/cockroachdb/cockroach/pkg/testutils"
1617
"github.com/cockroachdb/cockroach/pkg/testutils/serverutils"
1718
"github.com/cockroachdb/cockroach/pkg/testutils/sqlutils"
1819
"github.com/cockroachdb/cockroach/pkg/util/leaktest"
@@ -67,4 +68,16 @@ func TestCreateTenantFromReplicationUsingID(t *testing.T) {
6768
t.Logf("starting replication [50]->[51]")
6869
sqlB.Exec(t, "CREATE VIRTUAL CLUSTER [51] FROM REPLICATION OF 'cluster-50' ON $1", serverAURL.String())
6970
})
71+
72+
// We wait for tenant 50's SQL server to be up and running before ending the
73+
// test. This should help reduce flakes where the tenant is being setup when
74+
// the stopper begins quiescing.
75+
testutils.SucceedsSoon(t, func() error {
76+
tenantConn, err := serverA.ApplicationLayer().SQLConnE(serverutils.DBName("cluster:cluster-50"))
77+
if err != nil {
78+
return err
79+
}
80+
defer tenantConn.Close()
81+
return tenantConn.PingContext(ctx)
82+
})
7083
}

pkg/sql/logictest/testdata/logic_test/enums

Lines changed: 129 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1987,3 +1987,132 @@ CREATE TABLE public.t (
19871987
rowid INT8 NOT VISIBLE NOT NULL DEFAULT unique_rowid(),
19881988
CONSTRAINT t_pkey PRIMARY KEY (rowid ASC)
19891989
) WITH (schema_locked = true);
1990+
1991+
# Testcase for issue 154461: drop an enum value when the histogram contains
1992+
# range counts.
1993+
subtest 154461
1994+
1995+
statement ok
1996+
USE test
1997+
1998+
statement ok
1999+
CREATE TYPE e154461 AS ENUM ('e', 'f', 'g')
2000+
2001+
statement ok
2002+
CREATE TABLE t154461 (a e154461, INDEX (a)) WITH (sql_stats_histogram_buckets_count = 2)
2003+
2004+
statement ok
2005+
INSERT INTO t154461 VALUES ('e'), ('e'), ('f'), ('g'), ('g')
2006+
2007+
statement ok
2008+
CREATE STATISTICS s FROM t154461
2009+
2010+
query T
2011+
SELECT * FROM t154461 WHERE a != 'g' ORDER BY a
2012+
----
2013+
e
2014+
e
2015+
f
2016+
2017+
let $hist_id_1
2018+
SELECT histogram_id FROM [SHOW STATISTICS FOR TABLE t154461]
2019+
WHERE statistics_name = 's' AND column_names = '{a}'
2020+
2021+
query TIRI colnames,nosort
2022+
SHOW HISTOGRAM $hist_id_1
2023+
----
2024+
upper_bound range_rows distinct_range_rows equal_rows
2025+
'e' 0 0 2
2026+
'g' 1 1 2
2027+
2028+
query T
2029+
SELECT jsonb_pretty(stat)
2030+
FROM (
2031+
SELECT json_array_elements(statistics) - 'created_at' - 'id' - 'avg_size' AS stat
2032+
FROM [SHOW STATISTICS USING JSON FOR TABLE t154461]
2033+
)
2034+
WHERE stat->>'columns' = '["a"]'
2035+
----
2036+
{
2037+
"columns": [
2038+
"a"
2039+
],
2040+
"distinct_count": 3,
2041+
"histo_buckets": [
2042+
{
2043+
"distinct_range": 0,
2044+
"num_eq": 2,
2045+
"num_range": 0,
2046+
"upper_bound": "e"
2047+
},
2048+
{
2049+
"distinct_range": 1,
2050+
"num_eq": 2,
2051+
"num_range": 1,
2052+
"upper_bound": "g"
2053+
}
2054+
],
2055+
"histo_col_type": "test.public.e154461",
2056+
"histo_version": 3,
2057+
"name": "s",
2058+
"null_count": 0,
2059+
"row_count": 5
2060+
}
2061+
2062+
# Now drop the enum value that is the first bucket in the 2-bucket histogram.
2063+
2064+
statement ok
2065+
DELETE FROM t154461 WHERE a = 'e'
2066+
2067+
statement ok
2068+
ALTER TYPE e154461 DROP VALUE 'e'
2069+
2070+
query T
2071+
SELECT * FROM t154461 WHERE a != 'g' ORDER BY a
2072+
----
2073+
f
2074+
2075+
let $hist_id_1
2076+
SELECT histogram_id FROM [SHOW STATISTICS FOR TABLE t154461]
2077+
WHERE statistics_name = 's' AND column_names = '{a}'
2078+
2079+
query TIRI colnames,nosort
2080+
SHOW HISTOGRAM $hist_id_1
2081+
----
2082+
upper_bound range_rows distinct_range_rows equal_rows
2083+
'f' 0 0 1
2084+
'g' 0 0 2
2085+
2086+
query T
2087+
SELECT jsonb_pretty(stat)
2088+
FROM (
2089+
SELECT json_array_elements(statistics) - 'created_at' - 'id' - 'avg_size' AS stat
2090+
FROM [SHOW STATISTICS USING JSON FOR TABLE t154461]
2091+
)
2092+
WHERE stat->>'columns' = '["a"]'
2093+
----
2094+
{
2095+
"columns": [
2096+
"a"
2097+
],
2098+
"distinct_count": 2,
2099+
"histo_buckets": [
2100+
{
2101+
"distinct_range": 0,
2102+
"num_eq": 1,
2103+
"num_range": 0,
2104+
"upper_bound": "f"
2105+
},
2106+
{
2107+
"distinct_range": 0,
2108+
"num_eq": 2,
2109+
"num_range": 0,
2110+
"upper_bound": "g"
2111+
}
2112+
],
2113+
"histo_col_type": "test.public.e154461",
2114+
"histo_version": 3,
2115+
"name": "s",
2116+
"null_count": 0,
2117+
"row_count": 5
2118+
}

pkg/sql/opt/cat/table.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -300,7 +300,7 @@ type HistogramBucket struct {
300300

301301
// DistinctRange is the estimated number of distinct values between the upper
302302
// bound of the previous bucket and UpperBound (both boundaries are
303-
// exclusive).
303+
// exclusive). The first bucket should always have DistinctRange=0.
304304
DistinctRange float64
305305

306306
// UpperBound is the upper bound of the bucket.

pkg/sql/opt/exec/execbuilder/testdata/forecast

Lines changed: 0 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -396,23 +396,11 @@ WHERE stat->>'name' = '__forecast__'
396396
"created_at": "1988-08-08 00:00:00",
397397
"distinct_count": 1,
398398
"histo_buckets": [
399-
{
400-
"distinct_range": 0,
401-
"num_eq": 0,
402-
"num_range": 0,
403-
"upper_bound": "-1"
404-
},
405399
{
406400
"distinct_range": 0,
407401
"num_eq": 1,
408402
"num_range": 0,
409403
"upper_bound": "0"
410-
},
411-
{
412-
"distinct_range": 0,
413-
"num_eq": 0,
414-
"num_range": 0,
415-
"upper_bound": "1"
416404
}
417405
],
418406
"histo_col_type": "INT8",

pkg/sql/opt/exec/execbuilder/testdata/partial_stats

Lines changed: 0 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -375,12 +375,6 @@ WHERE stat->>'name' = '__forecast__';
375375
"num_range": 0,
376376
"upper_bound": "9"
377377
},
378-
{
379-
"distinct_range": 0,
380-
"num_eq": 0,
381-
"num_range": 0,
382-
"upper_bound": "10"
383-
},
384378
{
385379
"distinct_range": 0,
386380
"num_eq": 2,
@@ -411,12 +405,6 @@ WHERE stat->>'name' = '__forecast__';
411405
"num_range": 0,
412406
"upper_bound": "15"
413407
},
414-
{
415-
"distinct_range": 0,
416-
"num_eq": 0,
417-
"num_range": 0,
418-
"upper_bound": "16"
419-
},
420408
{
421409
"distinct_range": 0,
422410
"num_eq": 1,

0 commit comments

Comments
 (0)