@@ -10,15 +10,18 @@ import (
1010 "os"
1111 "path"
1212 "strings"
13+ "time"
1314
1415 "github.com/cockroachdb/cockroach/pkg/roachprod"
16+ "github.com/cockroachdb/cockroach/pkg/roachprod/install"
1517 "github.com/cockroachdb/cockroach/pkg/roachprod/logger"
18+ "github.com/cockroachdb/cockroach/pkg/util/retry"
1619 "github.com/cockroachdb/errors"
1720)
1821
1922// stage copies the specified archive to the remote machine and extracts it to
2023// the specified directory (creating it if it does not exist).
21- func stage (cluster , archivePath , remoteDest string ) (err error ) {
24+ func stage (cluster , archivePath , remoteDest string , longRetries bool ) (err error ) {
2225 ctx := context .Background ()
2326
2427 InitRoachprod ()
@@ -28,27 +31,41 @@ func stage(cluster, archivePath, remoteDest string) (err error) {
2831 return
2932 }
3033
34+ runOptions := install .DefaultRunOptions ()
35+ if longRetries {
36+ // For VMs that have started failing with transient errors (usually due to
37+ // preemption), we want to introduce a longer retry period. These VMs may
38+ // still be recoverable since they're part of a managed instance group
39+ // that attempts to recover failed VMs.
40+ runOptions = runOptions .WithRetryOpts (retry.Options {
41+ InitialBackoff : 1 * time .Minute ,
42+ MaxBackoff : 5 * time .Minute ,
43+ Multiplier : 2 ,
44+ MaxRetries : 10 ,
45+ })
46+ }
47+
3148 archiveName := path .Base (archivePath )
3249 archiveRemotePath := path .Join ("/tmp" , archiveName )
3350
3451 defer func () {
3552 // Remove the remote archive after we're done.
36- cleanUpErr := RoachprodRun (cluster , l , []string {"rm" , "-rf" , archiveRemotePath })
53+ cleanUpErr := RoachprodRun (cluster , l , []string {"rm" , "-rf" , archiveRemotePath }, runOptions )
3754 err = errors .CombineErrors (err , errors .Wrapf (cleanUpErr , "removing remote archive: %s" , archiveRemotePath ))
3855 }()
3956
4057 // Remove the remote archive and destination directory if they exist.
41- if err = RoachprodRun (cluster , l , []string {"rm" , "-rf" , archiveRemotePath }); err != nil {
58+ if err = RoachprodRun (cluster , l , []string {"rm" , "-rf" , archiveRemotePath }, runOptions ); err != nil {
4259 return errors .Wrapf (err , "removing remote archive: %s" , archiveRemotePath )
4360 }
44- if err = RoachprodRun (cluster , l , []string {"rm" , "-rf" , remoteDest }); err != nil {
61+ if err = RoachprodRun (cluster , l , []string {"rm" , "-rf" , remoteDest }, runOptions ); err != nil {
4562 return errors .Wrapf (err , "removing remote destination: %s" , remoteDest )
4663 }
4764
4865 // Copy the archive to the remote machine.
4966 copyFromGCS := strings .HasPrefix (archivePath , "gs://" )
5067 if copyFromGCS {
51- if err = RoachprodRun (cluster , l , []string {"gsutil" , "-q" , "-m" , "cp" , archivePath , archiveRemotePath }); err != nil {
68+ if err = RoachprodRun (cluster , l , []string {"gsutil" , "-q" , "-m" , "cp" , archivePath , archiveRemotePath }, runOptions ); err != nil {
5269 return errors .Wrapf (err , "copying archive from GCS: %s" , archivePath )
5370 }
5471 } else {
@@ -58,10 +75,10 @@ func stage(cluster, archivePath, remoteDest string) (err error) {
5875 }
5976
6077 // Extract the archive on the remote machine.
61- if err = RoachprodRun (cluster , l , []string {"mkdir" , "-p" , remoteDest }); err != nil {
78+ if err = RoachprodRun (cluster , l , []string {"mkdir" , "-p" , remoteDest }, runOptions ); err != nil {
6279 return errors .Wrapf (err , "creating remote destination: %s" , remoteDest )
6380 }
64- if err = RoachprodRun (cluster , l , []string {"tar" , "-C" , remoteDest , "-xzf" , archiveRemotePath }); err != nil {
81+ if err = RoachprodRun (cluster , l , []string {"tar" , "-C" , remoteDest , "-xzf" , archiveRemotePath }, runOptions ); err != nil {
6582 return errors .Wrapf (err , "extracting archive: %s" , archiveRemotePath )
6683 }
6784
0 commit comments