Skip to content

Commit abe5ceb

Browse files
craig[bot]fqazistevendannaspilchen
committed
155393: schemachangerccl: limit number of stages tested for MR r=fqazi a=fqazi Previously, this test was timing out frequently because of the number of stages that we were testing causing it to hit the test timeout. This patch reduces the number of stages tested on MR for declarative schema changes during backup / restore. This patch also adjusts timeout waiting for schema changes to allow for stress runs for testing. Release note: None Fixes: #154993 Fixes: #155137 Fixes: #154786 Fixes: #152076 Fixes: #155327 Fixes: #154805 Fixes: #155147 155453: server: deflake TestCheckRestartSafe_Integration r=stevendanna a=stevendanna This test seems to fail under DRPC. Informs #155111 Release note: None 155462: sql: use existing PK index for inverted index valdiation r=fqazi a=fqazi Previously, when we were validating inverted index, we made all first mutations public. For a PK swap operation this meant that incomplete primary indexes would be made public, which may not be fully ready for scans depending on the plan. This meant after recent changes to have better plans for alter primary key, we could end up scanning incomplete primary indexes. To address this, this patch ensures that only non-PK swap indexes are used for validating inverted indexes. Fixes: #155145 Fixes: #155141 Fixes: #155053 Fixes: #155038 Release note: None 155463: spanconfigjob: include underlying error in job failure message r=spilchen a=spilchen Previously, when the span config reconciliation job exhausted its internal retries, it would fail with a generic "reconciliation unsuccessful, failing job" message without including the underlying cause of the failure. This change tracks the last reconciliation error and wraps it in the final job error message. Closes #155380 Epic: None Release note: none Co-authored-by: Faizan Qazi <faizan@cockroachlabs.com> Co-authored-by: Steven Danna <danna@cockroachlabs.com> Co-authored-by: Matt Spilchen <matt.spilchen@cockroachlabs.com>
5 parents 3a6b122 + d0acab8 + 29a29f4 + 07d4a96 + 928e377 commit abe5ceb

File tree

8 files changed

+106
-4
lines changed

8 files changed

+106
-4
lines changed

pkg/server/api_v2_test.go

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -575,7 +575,12 @@ func TestCheckRestartSafe_Integration(t *testing.T) {
575575
ctx := context.Background()
576576
var err error
577577

578-
testCluster := serverutils.StartCluster(t, 3, base.TestClusterArgs{})
578+
testCluster := serverutils.StartCluster(t, 3, base.TestClusterArgs{
579+
ServerArgs: base.TestServerArgs{
580+
// TODO(155111): Test is flakey with DRPC enabled.
581+
DefaultDRPCOption: base.TestDRPCDisabled,
582+
},
583+
})
579584
defer testCluster.Stopper().Stop(ctx)
580585

581586
ts0 := testCluster.Server(0)

pkg/spanconfig/spanconfigjob/job.go

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -134,6 +134,7 @@ func (r *resumer) Resume(ctx context.Context, execCtxI interface{}) (jobErr erro
134134
}
135135

136136
var lastCheckpoint = hlc.Timestamp{}
137+
var lastErr error
137138
const aWhile = 5 * time.Minute // arbitrary but much longer than a retry
138139
for retrier := retry.StartWithCtx(ctx, retryOpts); retrier.Next(); {
139140
started := timeutil.Now()
@@ -167,6 +168,7 @@ func (r *resumer) Resume(ctx context.Context, execCtxI interface{}) (jobErr erro
167168
Checkpoint: rc.Checkpoint(),
168169
})
169170
}); err != nil {
171+
lastErr = err
170172
if shouldSkipRetry {
171173
break
172174
}
@@ -192,6 +194,9 @@ func (r *resumer) Resume(ctx context.Context, execCtxI interface{}) (jobErr erro
192194
return nil // we're done here (the stopper was stopped, Reconcile exited cleanly)
193195
}
194196

197+
if lastErr != nil {
198+
return errors.Wrap(lastErr, "reconciliation unsuccessful, failing job")
199+
}
195200
return errors.Newf("reconciliation unsuccessful, failing job")
196201
}
197202

pkg/spanconfig/spanconfigmanager/BUILD.bazel

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,7 @@ go_test(
4343
"//pkg/sql/catalog",
4444
"//pkg/sql/isql",
4545
"//pkg/testutils",
46+
"//pkg/testutils/jobutils",
4647
"//pkg/testutils/serverutils",
4748
"//pkg/testutils/sqlutils",
4849
"//pkg/testutils/testcluster",

pkg/spanconfig/spanconfigmanager/manager_test.go

Lines changed: 61 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@ import (
2121
"github.com/cockroachdb/cockroach/pkg/sql/catalog"
2222
"github.com/cockroachdb/cockroach/pkg/sql/isql"
2323
"github.com/cockroachdb/cockroach/pkg/testutils"
24+
"github.com/cockroachdb/cockroach/pkg/testutils/jobutils"
2425
"github.com/cockroachdb/cockroach/pkg/testutils/serverutils"
2526
"github.com/cockroachdb/cockroach/pkg/testutils/sqlutils"
2627
"github.com/cockroachdb/cockroach/pkg/util/hlc"
@@ -365,6 +366,66 @@ func TestReconciliationJobErrorAndRecovery(t *testing.T) {
365366
mu.Unlock()
366367
}
367368

369+
// TestReconciliationJobFinalErrorIncludesCause verifies that the reconciliation
370+
// job surfaces the last reconciliation error when internal retries are exhausted.
371+
func TestReconciliationJobFinalErrorIncludesCause(t *testing.T) {
372+
defer leaktest.AfterTest(t)()
373+
defer log.Scope(t).Close(t)
374+
375+
ctx := context.Background()
376+
const injectedErrMsg = "injected reconciliation failure"
377+
injectedErr := errors.New(injectedErrMsg)
378+
379+
var jobID jobspb.JobID
380+
srv, db, _ := serverutils.StartServer(t, base.TestServerArgs{
381+
Knobs: base.TestingKnobs{
382+
SpanConfig: &spanconfig.TestingKnobs{
383+
ManagerDisableJobCreation: true, // disable automatic job creation; we'll create one explicitly
384+
JobDisableInternalRetry: true,
385+
JobOnCheckpointInterceptor: func(_ hlc.Timestamp) error {
386+
return injectedErr
387+
},
388+
},
389+
},
390+
})
391+
defer srv.Stopper().Stop(ctx)
392+
ts := srv.ApplicationLayer()
393+
394+
tdb := sqlutils.MakeSQLRunner(db)
395+
396+
manager := spanconfigmanager.New(
397+
ts.InternalDB().(isql.DB),
398+
ts.JobRegistry().(*jobs.Registry),
399+
ts.AppStopper(),
400+
ts.ClusterSettings(),
401+
ts.SpanConfigReconciler().(spanconfig.Reconciler),
402+
&spanconfig.TestingKnobs{
403+
ManagerCreatedJobInterceptor: func(jobI interface{}) {
404+
jobID = jobI.(*jobs.Job).ID()
405+
},
406+
},
407+
)
408+
409+
started, err := manager.TestingCreateAndStartJobIfNoneExists(ctx, ts.ClusterSettings())
410+
require.NoError(t, err)
411+
require.True(t, started)
412+
413+
testutils.SucceedsSoon(t, func() error {
414+
if jobID == 0 {
415+
return errors.New("waiting for job to be created")
416+
}
417+
return nil
418+
})
419+
420+
waitForJobState(t, tdb, jobID, jobs.StateFailed)
421+
422+
payload := jobutils.GetJobPayload(t, tdb, jobID)
423+
require.NotNil(t, payload.FinalResumeError)
424+
decodedErr := errors.DecodeError(ctx, *payload.FinalResumeError)
425+
require.ErrorContains(t, decodedErr, "reconciliation unsuccessful, failing job")
426+
require.ErrorContains(t, decodedErr, injectedErrMsg)
427+
}
428+
368429
// TestReconciliationUsesRightCheckpoint verifies that the reconciliation
369430
// internal retry uses the right checkpoint during internal retries.
370431
func TestReconciliationUsesRightCheckpoint(t *testing.T) {

pkg/sql/backfill.go

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1785,8 +1785,10 @@ func countExpectedRowsForInvertedIndex(
17851785
if withFirstMutationPublic {
17861786
// Make the mutations public in an in-memory copy of the descriptor and
17871787
// add it to the Collection's synthetic descriptors, so that we can use
1788-
// SQL below to perform the validation.
1789-
fakeDesc, err := tableDesc.MakeFirstMutationPublic(catalog.IgnoreConstraints, catalog.RetainDroppingColumns)
1788+
// SQL below to perform the validation. Avoid making PK swaps public, otherwise
1789+
// in the declarative schema changer we can select incomplete primary indexes
1790+
// for validation.
1791+
fakeDesc, err := tableDesc.MakeFirstMutationPublic(catalog.IgnoreConstraints, catalog.RetainDroppingColumns, catalog.IgnorePKSwaps)
17901792
if err != nil {
17911793
return 0, err
17921794
}

pkg/sql/logictest/testdata/logic_test/alter_table

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5265,3 +5265,23 @@ statement ok
52655265
RESET experimental_enable_unique_without_index_constraints
52665266

52675267
subtest end
5268+
5269+
# Tests for a regression found by #155145 where the wrong primary index could
5270+
# be scanned for when validating inverted indexes recreated after an ALTER
5271+
# PRIMARY KEY.
5272+
subtest validate_alter_pk_with_inverted
5273+
5274+
statement ok
5275+
CREATE TABLE t_alter_pk_with_inverted(
5276+
a STRING NOT NULL,
5277+
d DECIMAL[],
5278+
INVERTED INDEX (d)
5279+
);
5280+
5281+
statement ok
5282+
INSERT INTO t_alter_pk_with_inverted VALUES ('test', ARRAY[1.0, 2.0]);
5283+
5284+
statement ok
5285+
ALTER TABLE t_alter_pk_with_inverted ALTER PRIMARY KEY USING COLUMNS (a) USING HASH;
5286+
5287+
subtest end

pkg/sql/schemachanger/sctest/backup.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -129,7 +129,7 @@ const skipRate = .6
129129

130130
// Stop once max stages has been hit.
131131
const maxStagesToTest = 8
132-
const maxStagesToTestMultiRegion = 6
132+
const maxStagesToTestMultiRegion = 4
133133

134134
// shouldSkipBackup samples about 60% of total stages or up to
135135
// max stages.

pkg/sql/schemachanger/sctest/framework.go

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -965,6 +965,14 @@ func waitForSchemaChangesToSucceed(t *testing.T, tdb *sqlutils.SQLRunner) {
965965
}
966966

967967
func waitForSchemaChangesToFinish(t *testing.T, tdb *sqlutils.SQLRunner) {
968+
// Wait longer on stress for schema changes.
969+
if skip.Stress() {
970+
old := tdb.SucceedsSoonDuration
971+
tdb.SucceedsSoonDuration = time.Minute * 2
972+
defer func() {
973+
tdb.SucceedsSoonDuration = old
974+
}()
975+
}
968976
tdb.CheckQueryResultsRetry(
969977
t, schemaChangeWaitQuery(`('succeeded', 'failed')`), [][]string{},
970978
)

0 commit comments

Comments
 (0)