Skip to content

Commit 928e377

Browse files
committed
spanconfigjob: include underlying error in job failure message
Previously, when the span config reconciliation job exhausted its internal retries, it would fail with a generic "reconciliation unsuccessful, failing job" message without including the underlying cause of the failure. This change tracks the last reconciliation error and wraps it in the final job error message. Closes #155380 Epic: None Release note: none
1 parent dae24c3 commit 928e377

File tree

3 files changed

+67
-0
lines changed

3 files changed

+67
-0
lines changed

pkg/spanconfig/spanconfigjob/job.go

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -134,6 +134,7 @@ func (r *resumer) Resume(ctx context.Context, execCtxI interface{}) (jobErr erro
134134
}
135135

136136
var lastCheckpoint = hlc.Timestamp{}
137+
var lastErr error
137138
const aWhile = 5 * time.Minute // arbitrary but much longer than a retry
138139
for retrier := retry.StartWithCtx(ctx, retryOpts); retrier.Next(); {
139140
started := timeutil.Now()
@@ -167,6 +168,7 @@ func (r *resumer) Resume(ctx context.Context, execCtxI interface{}) (jobErr erro
167168
Checkpoint: rc.Checkpoint(),
168169
})
169170
}); err != nil {
171+
lastErr = err
170172
if shouldSkipRetry {
171173
break
172174
}
@@ -192,6 +194,9 @@ func (r *resumer) Resume(ctx context.Context, execCtxI interface{}) (jobErr erro
192194
return nil // we're done here (the stopper was stopped, Reconcile exited cleanly)
193195
}
194196

197+
if lastErr != nil {
198+
return errors.Wrap(lastErr, "reconciliation unsuccessful, failing job")
199+
}
195200
return errors.Newf("reconciliation unsuccessful, failing job")
196201
}
197202

pkg/spanconfig/spanconfigmanager/BUILD.bazel

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,7 @@ go_test(
4343
"//pkg/sql/catalog",
4444
"//pkg/sql/isql",
4545
"//pkg/testutils",
46+
"//pkg/testutils/jobutils",
4647
"//pkg/testutils/serverutils",
4748
"//pkg/testutils/sqlutils",
4849
"//pkg/testutils/testcluster",

pkg/spanconfig/spanconfigmanager/manager_test.go

Lines changed: 61 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@ import (
2121
"github.com/cockroachdb/cockroach/pkg/sql/catalog"
2222
"github.com/cockroachdb/cockroach/pkg/sql/isql"
2323
"github.com/cockroachdb/cockroach/pkg/testutils"
24+
"github.com/cockroachdb/cockroach/pkg/testutils/jobutils"
2425
"github.com/cockroachdb/cockroach/pkg/testutils/serverutils"
2526
"github.com/cockroachdb/cockroach/pkg/testutils/sqlutils"
2627
"github.com/cockroachdb/cockroach/pkg/util/hlc"
@@ -365,6 +366,66 @@ func TestReconciliationJobErrorAndRecovery(t *testing.T) {
365366
mu.Unlock()
366367
}
367368

369+
// TestReconciliationJobFinalErrorIncludesCause verifies that the reconciliation
370+
// job surfaces the last reconciliation error when internal retries are exhausted.
371+
func TestReconciliationJobFinalErrorIncludesCause(t *testing.T) {
372+
defer leaktest.AfterTest(t)()
373+
defer log.Scope(t).Close(t)
374+
375+
ctx := context.Background()
376+
const injectedErrMsg = "injected reconciliation failure"
377+
injectedErr := errors.New(injectedErrMsg)
378+
379+
var jobID jobspb.JobID
380+
srv, db, _ := serverutils.StartServer(t, base.TestServerArgs{
381+
Knobs: base.TestingKnobs{
382+
SpanConfig: &spanconfig.TestingKnobs{
383+
ManagerDisableJobCreation: true, // disable automatic job creation; we'll create one explicitly
384+
JobDisableInternalRetry: true,
385+
JobOnCheckpointInterceptor: func(_ hlc.Timestamp) error {
386+
return injectedErr
387+
},
388+
},
389+
},
390+
})
391+
defer srv.Stopper().Stop(ctx)
392+
ts := srv.ApplicationLayer()
393+
394+
tdb := sqlutils.MakeSQLRunner(db)
395+
396+
manager := spanconfigmanager.New(
397+
ts.InternalDB().(isql.DB),
398+
ts.JobRegistry().(*jobs.Registry),
399+
ts.AppStopper(),
400+
ts.ClusterSettings(),
401+
ts.SpanConfigReconciler().(spanconfig.Reconciler),
402+
&spanconfig.TestingKnobs{
403+
ManagerCreatedJobInterceptor: func(jobI interface{}) {
404+
jobID = jobI.(*jobs.Job).ID()
405+
},
406+
},
407+
)
408+
409+
started, err := manager.TestingCreateAndStartJobIfNoneExists(ctx, ts.ClusterSettings())
410+
require.NoError(t, err)
411+
require.True(t, started)
412+
413+
testutils.SucceedsSoon(t, func() error {
414+
if jobID == 0 {
415+
return errors.New("waiting for job to be created")
416+
}
417+
return nil
418+
})
419+
420+
waitForJobState(t, tdb, jobID, jobs.StateFailed)
421+
422+
payload := jobutils.GetJobPayload(t, tdb, jobID)
423+
require.NotNil(t, payload.FinalResumeError)
424+
decodedErr := errors.DecodeError(ctx, *payload.FinalResumeError)
425+
require.ErrorContains(t, decodedErr, "reconciliation unsuccessful, failing job")
426+
require.ErrorContains(t, decodedErr, injectedErrMsg)
427+
}
428+
368429
// TestReconciliationUsesRightCheckpoint verifies that the reconciliation
369430
// internal retry uses the right checkpoint during internal retries.
370431
func TestReconciliationUsesRightCheckpoint(t *testing.T) {

0 commit comments

Comments
 (0)