@@ -606,6 +606,28 @@ func (d *defaultManager) getWorkloadContainer(ctx context.Context, name string)
 	return &container, nil
 }
 
+// isSupervisorProcessAlive checks if the supervisor process for a workload is alive
+// by checking if a PID exists. If a PID exists, we assume the supervisor is running.
+// This is a reasonable assumption because:
+// - If the supervisor exits cleanly, it cleans up the PID
+// - If killed unexpectedly, the PID remains but stopProcess will handle it gracefully
+// - The main issue we're preventing is accumulating zombie supervisors from repeated restarts
+func (d *defaultManager) isSupervisorProcessAlive(ctx context.Context, name string) bool {
+	if name == "" {
+		return false
+	}
+
+	// Try to read the PID - if it exists, assume the supervisor is running
+	_, err := d.statuses.GetWorkloadPID(ctx, name)
+	if err != nil {
+		// No PID found, the supervisor is not running
+		return false
+	}
+
+	// PID exists, assume the supervisor is alive
+	return true
+}
+
 // stopProcess stops the proxy process associated with the container
 func (d *defaultManager) stopProcess(ctx context.Context, name string) {
 	if name == "" {
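Aside: below is a minimal, self-contained sketch of the PID-presence liveness check that isSupervisorProcessAlive introduces, shown outside the manager so the behavior can be exercised in isolation. The statusReader interface, fakeStatuses stub, the standalone isSupervisorAlive helper, and the assumed (int, error) return type of GetWorkloadPID are hypothetical stand-ins for illustration, not the real package types.

// Illustrative sketch only - not part of the change above.
package main

import (
	"context"
	"errors"
	"fmt"
)

// statusReader is a hypothetical stand-in for the manager's statuses store;
// only GetWorkloadPID mirrors the call made in the diff.
type statusReader interface {
	GetWorkloadPID(ctx context.Context, name string) (int, error)
}

// fakeStatuses is an in-memory stub for the sketch.
type fakeStatuses struct {
	pids map[string]int
}

func (f *fakeStatuses) GetWorkloadPID(_ context.Context, name string) (int, error) {
	pid, ok := f.pids[name]
	if !ok {
		return 0, errors.New("no PID recorded")
	}
	return pid, nil
}

// isSupervisorAlive reproduces the decision in the diff: an empty name is never
// alive, a missing PID means not alive, and any recorded PID is treated as alive.
func isSupervisorAlive(ctx context.Context, statuses statusReader, name string) bool {
	if name == "" {
		return false
	}
	if _, err := statuses.GetWorkloadPID(ctx, name); err != nil {
		return false
	}
	return true
}

func main() {
	ctx := context.Background()
	statuses := &fakeStatuses{pids: map[string]int{"example-workload": 4242}}

	fmt.Println(isSupervisorAlive(ctx, statuses, "example-workload")) // true: PID recorded
	fmt.Println(isSupervisorAlive(ctx, statuses, "missing"))          // false: no PID
	fmt.Println(isSupervisorAlive(ctx, statuses, ""))                 // false: empty name
}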
@@ -812,10 +834,38 @@ func (d *defaultManager) restartRemoteWorkload(
 		return err
 	}
 
-	// Check if already running - compare status to WorkloadStatusRunning
+	// If workload is already running, check if the supervisor process is healthy
 	if err == nil && workload.Status == rt.WorkloadStatusRunning {
-		logger.Infof("Remote workload %s is already running", name)
-		return nil
+		// Check if the supervisor process is actually alive
+		supervisorAlive := d.isSupervisorProcessAlive(ctx, runConfig.BaseName)
+
+		if supervisorAlive {
+			// Workload is running and healthy - preserve old behavior (no-op)
+			logger.Infof("Remote workload %s is already running", name)
+			return nil
+		}
+
+		// Supervisor is dead/missing - we need to clean up and restart to fix the damaged state
+		logger.Infof("Remote workload %s is running but supervisor is dead, cleaning up before restart", name)
+
+		// Set status to stopping
+		if err := d.statuses.SetWorkloadStatus(ctx, name, rt.WorkloadStatusStopping, ""); err != nil {
+			logger.Debugf("Failed to set workload %s status to stopping: %v", name, err)
+		}
+
+		// Stop the supervisor process (proxy) if it exists (may already be dead)
+		// This ensures we clean up any orphaned supervisor processes
+		d.stopProxyIfNeeded(ctx, name, runConfig.BaseName)
+
+		// Clean up client configurations
+		if err := removeClientConfigurations(name, false); err != nil {
+			logger.Warnf("Warning: Failed to remove client configurations: %v", err)
+		}
+
+		// Set status to stopped after cleanup is complete
+		if err := d.statuses.SetWorkloadStatus(ctx, name, rt.WorkloadStatusStopped, ""); err != nil {
+			logger.Debugf("Failed to set workload %s status to stopped: %v", name, err)
+		}
 	}
 
 	// Load runner configuration from state
@@ -837,6 +887,8 @@ func (d *defaultManager) restartRemoteWorkload(
 }
 
 // restartContainerWorkload handles restarting a container-based workload
+//
+//nolint:gocyclo // Complexity is justified - handles multiple restart scenarios and edge cases
 func (d *defaultManager) restartContainerWorkload(ctx context.Context, name string, foreground bool) error {
 	// Get container info to resolve partial names and extract proper workload name
 	var containerName string
@@ -864,10 +916,67 @@ func (d *defaultManager) restartContainerWorkload(ctx context.Context, name stri
 		return err
 	}
 
-	// Check if already running - compare status to WorkloadStatusRunning
+	// Check if workload is running and healthy (including supervisor process)
 	if err == nil && workload.Status == rt.WorkloadStatusRunning {
-		logger.Infof("Container %s is already running", containerName)
-		return nil
+		// Check if the supervisor process is actually alive
+		supervisorAlive := d.isSupervisorProcessAlive(ctx, workloadName)
+
+		if supervisorAlive {
+			// Workload is running and healthy - preserve old behavior (no-op)
+			logger.Infof("Container %s is already running", containerName)
+			return nil
+		}
+
+		// Supervisor is dead/missing - we need to clean up and restart to fix the damaged state
+		logger.Infof("Container %s is running but supervisor is dead, cleaning up before restart", containerName)
+	}
+
+	// Check if we need to stop the workload before restarting
+	// This happens when: 1) container is running, or 2) inconsistent state
+	shouldStop := false
+	if err == nil && workload.Status == rt.WorkloadStatusRunning {
+		// Workload status shows running (and supervisor is dead, otherwise we would have returned above)
+		shouldStop = true
+	} else if container.IsRunning() {
+		// Container is running but status is not running (inconsistent state)
+		shouldStop = true
+	}
+
+	// If we need to stop, do it now (including cleanup of any remaining supervisor process)
+	if shouldStop {
+		logger.Infof("Stopping container %s before restart", containerName)
+
+		// Set status to stopping
+		if err := d.statuses.SetWorkloadStatus(ctx, workloadName, rt.WorkloadStatusStopping, ""); err != nil {
+			logger.Debugf("Failed to set workload %s status to stopping: %v", workloadName, err)
+		}
+
+		// Stop the supervisor process (proxy) if it exists (may already be dead)
+		// This ensures we clean up any orphaned supervisor processes
+		if !labels.IsAuxiliaryWorkload(container.Labels) {
+			d.stopProcess(ctx, workloadName)
+		}
+
+		// Now stop the container if it's running
+		if container.IsRunning() {
+			if err := d.runtime.StopWorkload(ctx, containerName); err != nil {
+				if statusErr := d.statuses.SetWorkloadStatus(ctx, workloadName, rt.WorkloadStatusError, err.Error()); statusErr != nil {
+					logger.Warnf("Failed to set workload %s status to error: %v", workloadName, statusErr)
+				}
+				return fmt.Errorf("failed to stop container %s: %v", containerName, err)
+			}
+			logger.Infof("Container %s stopped", containerName)
+		}
+
+		// Clean up client configurations
+		if err := removeClientConfigurations(workloadName, labels.IsAuxiliaryWorkload(container.Labels)); err != nil {
+			logger.Warnf("Warning: Failed to remove client configurations: %v", err)
+		}
+
+		// Set status to stopped after cleanup is complete
+		if err := d.statuses.SetWorkloadStatus(ctx, workloadName, rt.WorkloadStatusStopped, ""); err != nil {
+			logger.Debugf("Failed to set workload %s status to stopped: %v", workloadName, err)
+		}
 	}
 
 	// Load runner configuration from state
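Aside: the stop-before-restart decision added in the hunk above boils down to a small predicate. The following standalone sketch is for illustration only; workloadState and shouldStopBeforeRestart are hypothetical names, and the real code interleaves this logic with status lookups and early returns rather than isolating it.

package main

import "fmt"

// workloadState is a simplified stand-in for the inputs used in the diff.
type workloadState struct {
	statusKnown     bool // the status lookup returned no error
	statusRunning   bool // status == WorkloadStatusRunning
	supervisorAlive bool // result of the PID-presence check
	containerUp     bool // container.IsRunning()
}

// shouldStopBeforeRestart mirrors the decision above: a healthy running workload
// needs no stop (the real code returns early), a running status with a dead
// supervisor must be cleaned up first, and a container that is up while the
// recorded status disagrees is an inconsistent state that also needs a stop.
func shouldStopBeforeRestart(s workloadState) bool {
	if s.statusKnown && s.statusRunning && s.supervisorAlive {
		return false // healthy: restart is a no-op
	}
	if s.statusKnown && s.statusRunning {
		return true // running status but dead supervisor
	}
	return s.containerUp // inconsistent state
}

func main() {
	fmt.Println(shouldStopBeforeRestart(workloadState{statusKnown: true, statusRunning: true, supervisorAlive: true}))  // false
	fmt.Println(shouldStopBeforeRestart(workloadState{statusKnown: true, statusRunning: true, supervisorAlive: false})) // true
	fmt.Println(shouldStopBeforeRestart(workloadState{statusKnown: true, statusRunning: false, containerUp: true}))     // true
	fmt.Println(shouldStopBeforeRestart(workloadState{}))                                                               // false
}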
@@ -882,18 +991,6 @@ func (d *defaultManager) restartContainerWorkload(ctx context.Context, name stri
 	}
 	logger.Infof("Loaded configuration from state for %s", workloadName)
 
-	// Stop container if needed - since workload is not in running status, check if container needs stopping
-	if container.IsRunning() {
-		logger.Infof("Container %s is running but workload is not in running state. Stopping container...", containerName)
-		if err := d.runtime.StopWorkload(ctx, containerName); err != nil {
-			if statusErr := d.statuses.SetWorkloadStatus(ctx, workloadName, rt.WorkloadStatusError, ""); statusErr != nil {
-				logger.Warnf("Failed to set workload %s status to error: %v", workloadName, statusErr)
-			}
-			return fmt.Errorf("failed to stop container %s: %v", containerName, err)
-		}
-		logger.Infof("Container %s stopped", containerName)
-	}
-
 	// Start the workload with background context to avoid timeout cancellation
 	// The ctx with AsyncOperationTimeout is only for the restart setup operations,
 	// but the actual workload should run indefinitely with its own lifecycle management
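Aside: the trailing comments describe starting the workload on a background context so the setup timeout cannot cancel it. Below is a minimal sketch of that pattern under assumed names; asyncOperationTimeout, doSetup, and runWorkload are placeholders for illustration, not the project's actual identifiers.

package main

import (
	"context"
	"fmt"
	"time"
)

// asyncOperationTimeout is a placeholder for the AsyncOperationTimeout mentioned
// in the diff comments.
const asyncOperationTimeout = 5 * time.Second

func restart(parent context.Context) error {
	// Setup work (loading state, stopping stale processes) is bounded by the timeout.
	setupCtx, cancel := context.WithTimeout(parent, asyncOperationTimeout)
	defer cancel()
	if err := doSetup(setupCtx); err != nil {
		return err
	}

	// The workload itself must not inherit the timeout, so it runs on a
	// background context with its own lifecycle management.
	go runWorkload(context.Background())
	return nil
}

// doSetup stands in for the bounded restart setup operations.
func doSetup(ctx context.Context) error {
	select {
	case <-time.After(100 * time.Millisecond):
		return nil
	case <-ctx.Done():
		return ctx.Err()
	}
}

// runWorkload stands in for the long-running workload loop.
func runWorkload(ctx context.Context) {
	if _, ok := ctx.Deadline(); !ok {
		fmt.Println("workload running with no inherited deadline")
	}
}

func main() {
	if err := restart(context.Background()); err != nil {
		fmt.Println("restart failed:", err)
	}
	time.Sleep(200 * time.Millisecond) // give the sketch's goroutine time to print
}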