Skip to content

Commit 10da9b4

Browse files
committed
mixedversion: fix panic in refresh{Cluster,Binary}Version
A recent change #155713 fixed the indexing when nodes were unavailable. However, this change did not account for multi cluster tests where the roachprod node ID is not the same as the CRDB node ID, i.e. we can't assume the 4th VM is the same as the 4th node in the cluster. Fixes: none Epic: none Release note: none
1 parent 5fdc01e commit 10da9b4

File tree

2 files changed

+34
-4
lines changed

2 files changed

+34
-4
lines changed

pkg/cmd/roachtest/option/node_list_option.go

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,15 @@ func (n NodeListOption) Equals(o NodeListOption) bool {
4242
return true
4343
}
4444

45+
func (n NodeListOption) Contains(node int) bool {
46+
for _, n := range n {
47+
if n == node {
48+
return true
49+
}
50+
}
51+
return false
52+
}
53+
4554
// Option implements Option.
4655
func (n NodeListOption) Option() {}
4756

pkg/cmd/roachtest/roachtestutil/mixedversion/runner.go

Lines changed: 25 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -525,16 +525,26 @@ func (tr *testRunner) refreshBinaryVersions(ctx context.Context, service *servic
525525
defer cancel()
526526

527527
group := ctxgroup.WithContext(connectionCtx)
528-
for _, node := range tr.getAvailableNodes(service.descriptor) {
528+
// We still attempt to refresh binary versions on nodes we expect to be unavailable:
529+
// 1. Some failure injections are overly conservative in marking nodes as
530+
// unavailable out of caution (e.g. network partitions).
531+
// 2. The monitor returns the roachprod node ID, which may be offset compared
532+
// to what the mixed-version test expects, e.g. a multi cluster test where
533+
// node 1 may be the 5th VM in the roachprod cluster.
534+
availableNodes := tr.getAvailableNodes(service.descriptor)
535+
for j, node := range service.descriptor.Nodes {
529536
group.GoCtx(func(ctx context.Context) error {
530537
bv, err := clusterupgrade.BinaryVersion(ctx, tr.conn(node, service.descriptor.Name))
531538
if err != nil {
539+
if !availableNodes.Contains(node) {
540+
return nil
541+
}
532542
return fmt.Errorf(
533543
"failed to get binary version for node %d (%s): %w",
534544
node, service.descriptor.Name, err,
535545
)
536546
}
537-
newBinaryVersions[node-1] = bv
547+
newBinaryVersions[j] = bv
538548
return nil
539549
})
540550
}
@@ -556,17 +566,28 @@ func (tr *testRunner) refreshClusterVersions(ctx context.Context, service *servi
556566
defer cancel()
557567

558568
group := ctxgroup.WithContext(connectionCtx)
559-
for _, node := range tr.getAvailableNodes(service.descriptor) {
569+
// We still attempt to refresh cluster versions on nodes we expect to be unavailable:
570+
// 1. Some failure injections are overly conservative in marking nodes as
571+
// unavailable out of caution (e.g. network partitions).
572+
// 2. The monitor returns the roachprod node ID, which may be offset compared
573+
// to what the mixed-version test expects, e.g. a multi cluster test where
574+
// node 1 may be the 5th VM in the roachprod cluster.
575+
availableNodes := tr.getAvailableNodes(service.descriptor)
576+
for j, node := range service.descriptor.Nodes {
560577
group.GoCtx(func(ctx context.Context) error {
561578
cv, err := clusterupgrade.ClusterVersion(ctx, tr.conn(node, service.descriptor.Name))
562579
if err != nil {
580+
if !availableNodes.Contains(node) {
581+
return nil
582+
}
583+
563584
return fmt.Errorf(
564585
"failed to get cluster version for node %d (%s): %w",
565586
node, service.descriptor.Name, err,
566587
)
567588
}
568589

569-
newClusterVersions[node-1] = cv
590+
newClusterVersions[j] = cv
570591
return nil
571592
})
572593
}

0 commit comments

Comments
 (0)