@@ -90,6 +90,7 @@ type Updater struct {
9090 usageReporter usage.Reporter
9191 tenantCals * cals.CostAttributionLabels
9292 supportsProtocolSecrets bool
93+ connectTimeout time.Duration
9394}
9495
9596type apiInfo struct {
@@ -128,6 +129,7 @@ type UpdaterOptions struct {
128129 UsageReporter usage.Reporter
129130 CostAttributionLabels * cals.CostAttributionLabels
130131 SupportsProtocolSecrets bool
132+ GrpcConnectTimeout time.Duration
131133}
132134
133135func NewUpdater (opts UpdaterOptions ) (* Updater , error ) {
@@ -252,6 +254,7 @@ func NewUpdater(opts UpdaterOptions) (*Updater, error) {
252254 tenantSecrets : opts .SecretProvider ,
253255 telemeter : opts .Telemeter ,
254256 supportsProtocolSecrets : opts .SupportsProtocolSecrets ,
257+ connectTimeout : opts .GrpcConnectTimeout ,
255258 metrics : metrics {
256259 changeErrorsCounter : changeErrorsCounter ,
257260 changesCounter : changesCounter ,
@@ -345,9 +348,11 @@ func handleError(ctx context.Context, logger zerolog.Logger, backoff Backoffer,
345348 return true , nil
346349
347350 case errors .As (err , & transientErr ):
348- logger . Warn (). Err ( err ). Msg ( "transient error, trying to reconnect" )
351+ dur := backoff . Duration ( )
349352
350- if err := sleepCtx (ctx , backoff .Duration ()); err != nil {
353+ logger .Warn ().Err (err ).Dur ("backoff" , dur ).Msg ("transient error, trying to reconnect" )
354+
355+ if err := sleepCtx (ctx , dur ); err != nil {
351356 return true , err
352357 }
353358
@@ -408,7 +413,7 @@ func (c *Updater) loop(ctx context.Context) (bool, error) {
408413 // Context was cancelled
409414 return context .Canceled
410415
411- case codes .Unavailable :
416+ case codes .Unavailable , codes . DeadlineExceeded :
412417 // Network errors, connection resets, transport closing, RPC deadline expiry, etc.
413418 // All these are transient and should trigger retry logic
414419 return TransientError (fmt .Sprintf ("%s: %s" , action , st .Message ()))
@@ -427,12 +432,25 @@ func (c *Updater) loop(ctx context.Context) (bool, error) {
427432 }
428433 }
429434
430- result , err := client .RegisterProbe (ctx , & sm.ProbeInfo {
435+ // Create a timeout context for RegisterProbe if configured.
436+ // This ensures we don't wait indefinitely if the server is unreachable.
437+ registerCtx := ctx
438+ var cancel context.CancelFunc
439+
440+ if c .connectTimeout > 0 {
441+ registerCtx , cancel = context .WithTimeout (ctx , c .connectTimeout )
442+ defer cancel ()
443+ c .logger .Info ().
444+ Dur ("timeout" , c .connectTimeout ).
445+ Msg ("using explicit connection timeout for RegisterProbe" )
446+ }
447+
448+ result , err := client .RegisterProbe (registerCtx , & sm.ProbeInfo {
431449 Version : version .Short (),
432450 Commit : version .Commit (),
433451 Buildstamp : version .Buildstamp (),
434452 SupportsProtocolSecrets : c .supportsProtocolSecrets ,
435- }, grpc .WaitForReady (true )) // Wait for connection on critical startup RPC
453+ }, grpc .WaitForReady (true )) // Wait for connection on critical startup RPC (respects context timeout)
436454 if err != nil {
437455 return connected , grpcErrorHandler ("registering probe with synthetic-monitoring-api" , err )
438456 }
0 commit comments