@@ -525,16 +525,26 @@ func (tr *testRunner) refreshBinaryVersions(ctx context.Context, service *servic
525525 defer cancel ()
526526
527527 group := ctxgroup .WithContext (connectionCtx )
528- for _ , node := range tr .getAvailableNodes (service .descriptor ) {
528+ // We still attempt to refresh binary versions on nodes we expect to be unavailable:
529+ // 1. Some failure injections are overly conservative in marking nodes as
530+ // unavailable out of caution (e.g. network partitions).
531+ // 2. The monitor returns the roachprod node ID, which may be offset compared
532+ // to what the mixed-version test expects, e.g. a multi-cluster test where
533+ // node 1 may be the 5th VM in the roachprod cluster.
534+ availableNodes := tr .getAvailableNodes (service .descriptor )
535+ for j , node := range service .descriptor .Nodes {
529536 group .GoCtx (func (ctx context.Context ) error {
530537 bv , err := clusterupgrade .BinaryVersion (ctx , tr .conn (node , service .descriptor .Name ))
531538 if err != nil {
539+ if ! availableNodes .Contains (node ) {
540+ return nil
541+ }
532542 return fmt .Errorf (
533543 "failed to get binary version for node %d (%s): %w" ,
534544 node , service .descriptor .Name , err ,
535545 )
536546 }
537- newBinaryVersions [node - 1 ] = bv
547+ newBinaryVersions [j ] = bv
538548 return nil
539549 })
540550 }
@@ -556,17 +566,28 @@ func (tr *testRunner) refreshClusterVersions(ctx context.Context, service *servi
556566 defer cancel ()
557567
558568 group := ctxgroup .WithContext (connectionCtx )
559- for _ , node := range tr .getAvailableNodes (service .descriptor ) {
569+ // We still attempt to refresh cluster versions on nodes we expect to be unavailable:
570+ // 1. Some failure injections are overly conservative in marking nodes as
571+ // unavailable out of caution (e.g. network partitions).
572+ // 2. The monitor returns the roachprod node ID, which may be offset compared
573+ // to what the mixed-version test expects, e.g. a multi-cluster test where
574+ // node 1 may be the 5th VM in the roachprod cluster.
575+ availableNodes := tr .getAvailableNodes (service .descriptor )
576+ for j , node := range service .descriptor .Nodes {
560577 group .GoCtx (func (ctx context.Context ) error {
561578 cv , err := clusterupgrade .ClusterVersion (ctx , tr .conn (node , service .descriptor .Name ))
562579 if err != nil {
580+ if ! availableNodes .Contains (node ) {
581+ return nil
582+ }
583+
563584 return fmt .Errorf (
564585 "failed to get cluster version for node %d (%s): %w" ,
565586 node , service .descriptor .Name , err ,
566587 )
567588 }
568589
569- newClusterVersions [node - 1 ] = cv
590+ newClusterVersions [j ] = cv
570591 return nil
571592 })
572593 }
0 commit comments