Commit f8b539b

roachtest: run INSPECT as background job and poll for completion
Long-running INSPECT statements (e.g., >3h) have previously failed due to connection resets, despite no server crash. This likely stems from a missed keep-alive or a client timeout, though the exact cause is unclear. This change updates the roachtest to:

- Run the INSPECT statement as a background job using a short statement_timeout (5s).
- Poll the job table for job completion instead of waiting synchronously.
- Report progress at 10% intervals to improve visibility without overwhelming the logs.

Fixes #155610

Release note: none
Epic: none
Parent: e6d3d9d
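The diff below adds a roachtest helper that implements this detach-and-poll flow. For trying the same pattern interactively outside the roachtest harness, here is a minimal standalone sketch; the connection string, database, and index name are placeholders, and only a statement-timeout error is tolerated from the INSPECT statement, mirroring the new helper:

// Standalone sketch of the detach-and-poll pattern added by this commit,
// outside the roachtest harness. Connection string, table, and index name
// are placeholders; adjust them for a real cluster.
package main

import (
	"database/sql"
	"log"
	"strings"
	"time"

	_ "github.com/lib/pq"
)

func main() {
	db, err := sql.Open("postgres",
		"postgresql://root@localhost:26257/defaultdb?sslmode=disable")
	if err != nil {
		log.Fatal(err)
	}
	defer db.Close()

	// A short statement_timeout makes the synchronous wait give up quickly
	// while the INSPECT job keeps running in the background.
	if _, err := db.Exec("SET statement_timeout = '5s'"); err != nil {
		log.Fatal(err)
	}
	// Index name below is a placeholder.
	_, err = db.Exec("INSPECT TABLE bulkingest.bulkingest WITH OPTIONS INDEX (idx_b)")
	if err != nil && !strings.Contains(err.Error(), "statement timeout") {
		log.Fatal(err) // only a statement-timeout error is expected here
	}

	// Find the job that was left running in the background.
	var jobID int64
	const getJobID = `SELECT job_id FROM [SHOW JOBS]
		WHERE job_type = 'INSPECT' ORDER BY created DESC LIMIT 1`
	if err := db.QueryRow(getJobID).Scan(&jobID); err != nil {
		log.Fatal(err)
	}

	// Poll until the job reaches a terminal state.
	for {
		var status string
		var fraction float64
		const checkJob = `SELECT status, fraction_completed FROM [SHOW JOBS] WHERE job_id = $1`
		if err := db.QueryRow(checkJob, jobID).Scan(&status, &fraction); err != nil {
			log.Fatal(err)
		}
		log.Printf("INSPECT job %d: %s, %.0f%% complete", jobID, status, fraction*100)
		switch status {
		case "succeeded":
			return
		case "failed", "canceled":
			log.Fatalf("INSPECT job %d finished with status %s", jobID, status)
		}
		time.Sleep(5 * time.Second)
	}
}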

File tree

1 file changed: +82 −12 lines

pkg/cmd/roachtest/tests/inspect_throughput.go

Lines changed: 82 additions & 12 deletions

@@ -21,11 +21,13 @@ import (
 	"github.com/cockroachdb/cockroach/pkg/cmd/roachtest/roachtestutil"
 	"github.com/cockroachdb/cockroach/pkg/cmd/roachtest/spec"
 	"github.com/cockroachdb/cockroach/pkg/cmd/roachtest/test"
+	"github.com/cockroachdb/cockroach/pkg/jobs"
 	"github.com/cockroachdb/cockroach/pkg/roachprod/install"
 	"github.com/cockroachdb/cockroach/pkg/util/timeutil"
 	"github.com/cockroachdb/cockroach/pkg/workload/histogram"
 	"github.com/cockroachdb/cockroach/pkg/workload/histogram/exporter"
 	"github.com/lib/pq"
+	"github.com/stretchr/testify/require"
 )

 func registerInspectThoughput(r registry.Registry) {
@@ -34,7 +36,7 @@ func registerInspectThoughput(r registry.Registry) {

 	// Long run: 12 nodes × 8 CPUs, 1B rows, 2 index checks (runs INSPECT twice: 1 index, then 2 indexes), ~5 hours (in v25.4)
 	const indexesForLongRun = 2
-	r.Add(makeInspectThroughputTest(r, 12, 8, 1_000_000_000, 8*time.Hour, indexesForLongRun))
+	r.Add(makeInspectThroughputTest(r, 12, 8, 1_000_000_000, 11*time.Hour, indexesForLongRun))
 }

 // initInspectHistograms creates a histogram registry with multiple named metrics.
@@ -225,9 +227,7 @@ func makeInspectThroughputTest(
 		before := timeutil.Now()

 		inspectSQL := fmt.Sprintf("INSPECT TABLE bulkingest.bulkingest WITH OPTIONS INDEX (%s)", cfg.indexListSQL)
-		if _, err := db.Exec(inspectSQL); err != nil {
-			t.Fatal(err)
-		}
+		jobID := runInspectInBackground(ctx, t, db, inspectSQL)

 		// Tick after INSPECT completes to capture elapsed time for this specific metric.
 		tickHistogram(cfg.metricName)
@@ -249,15 +249,9 @@ func makeInspectThroughputTest(
 					0
 				)
 			FROM system.job_info
-			WHERE job_id = (
-				SELECT job_id
-				FROM [SHOW JOBS]
-				WHERE job_type = 'INSPECT'
-				ORDER BY created DESC
-				LIMIT 1
-			)
+			WHERE job_id = $1
 			AND info_key = 'legacy_progress'`
-		err := db.QueryRow(querySQL).Scan(&jobTotalCheckCount)
+		err := db.QueryRow(querySQL, jobID).Scan(&jobTotalCheckCount)
 		if err != nil {
 			t.L().Printf("Warning: failed to query job total check count: %v", err)
 		} else {
@@ -301,3 +295,79 @@ func disableRowCountValidation(t test.Test, db *gosql.DB) {
 		}
 	}
 }
+
+// runInspectInBackground runs an INSPECT command with a short statement timeout,
+// forcing it to run as a background job. It then polls the job until completion,
+// reporting progress at 10% intervals. Returns the job ID.
+func runInspectInBackground(
+	ctx context.Context, t test.Test, db *gosql.DB, inspectSQL string,
+) (jobID int64) {
+	// Set a short statement timeout to force INSPECT to run as a background job.
+	_, err := db.Exec("SET statement_timeout = '5s'")
+	require.NoError(t, err)
+
+	// Reset statement timeout after we're done.
+	defer func() {
+		_, resetErr := db.Exec("RESET statement_timeout")
+		require.NoError(t, resetErr)
+	}()
+
+	_, err = db.Exec(inspectSQL)
+
+	// This may fail if the INSPECT took longer than the statement_timeout to run.
+	// So, we tolerate only statement timeout errors here.
+	if err != nil {
+		require.ErrorContains(t, err, "statement timeout")
+	}
+
+	// Get the INSPECT job ID.
+	getJobIDSQL := `
+		SELECT job_id
+		FROM [SHOW JOBS]
+		WHERE job_type = 'INSPECT'
+		ORDER BY created DESC
+		LIMIT 1`
+	if err := db.QueryRow(getJobIDSQL).Scan(&jobID); err != nil {
+		t.Fatalf("failed to get INSPECT job ID: %v", err)
+	}
+	t.L().Printf("INSPECT job ID: %d", jobID)
+
+	// Poll the job until it completes, reporting progress at 10% intervals.
+	const pollInterval = 5 * time.Second
+	lastReportedThreshold := -1
+	ticker := time.NewTicker(pollInterval)
+	defer ticker.Stop()
+
+	for {
+		select {
+		case <-ctx.Done():
+			t.Fatalf("context canceled while waiting for INSPECT job %d", jobID)
+		case <-ticker.C:
+			var status jobs.State
+			var fractionCompleted float64
+			checkJobSQL := `
+				SELECT status, fraction_completed
+				FROM [SHOW JOBS]
+				WHERE job_id = $1`
+			if err := db.QueryRow(checkJobSQL, jobID).Scan(&status, &fractionCompleted); err != nil {
+				t.Fatalf("failed to query job %d status: %v", jobID, err)
+			}

+			// Report progress at 10% thresholds (0%, 10%, 20%, ..., 90%).
+			currentThreshold := int(fractionCompleted * 10)
+			if currentThreshold > lastReportedThreshold && currentThreshold < 10 {
+				t.L().Printf("INSPECT job %d: %d%% complete", jobID, currentThreshold*10)
+				lastReportedThreshold = currentThreshold
+			}

+			// Check if job is complete.
+			switch status {
+			case jobs.StateSucceeded:
+				t.L().Printf("INSPECT job %d: 100%% complete (succeeded)", jobID)
+				return jobID
+			case jobs.StateFailed, jobs.StateCanceled:
+				t.Fatalf("INSPECT job %d finished with status: %s", jobID, status)
+			}
+		}
+	}
+}
