
Commit 3fffdcc

fix: prevent zombie supervisor processes on restart (#2306)
Fixes #2274

When running 'thv restart' on an already-running workload, the command would return early with 'Container is already running' but would not stop the old supervisor process. This caused supervisor processes to accumulate over time.

Changes:
- restartContainerWorkload now always stops the supervisor and container when the workload is already running
- restartRemoteWorkload applies the same logic for remote workloads
- Handles edge cases: dead supervisor, dead container, or both

The fix ensures restart always performs a proper stop→start cycle:
1. Stops the old supervisor (kills the proxy process)
2. Stops the container if it is running
3. Cleans up client configurations
4. Starts a fresh supervisor and container

The system is resilient: stopProcess gracefully handles missing PIDs, and the supervisor auto-exits when its container dies.

Added comprehensive unit tests and an E2E test to verify that no process accumulation occurs on multiple restarts.
1 parent 915c1e5 commit 3fffdcc
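
The commit message notes that a supervisor killed unexpectedly can leave a stale PID record behind, and that stopProcess tolerates this; the new isSupervisorProcessAlive in the diff below deliberately treats any recorded PID as alive. For comparison only, here is a minimal, Unix-only sketch of a stricter liveness probe (a hypothetical pidIsAlive helper, not code from this repository) that uses a signal-0 check:

// Hypothetical helper, not part of this commit: the patch's isSupervisorProcessAlive
// only checks whether a PID is recorded and relies on stopProcess to tolerate stale
// PIDs. This sketch shows what a stricter, Unix-only liveness probe could look like.
package main

import (
    "fmt"
    "os"
    "syscall"
)

// pidIsAlive reports whether a process with the given PID currently exists.
// On Unix, os.FindProcess always succeeds, so we send signal 0, which checks
// existence and permissions without delivering a signal. Note that an EPERM
// error would also mean the process exists; it is ignored here for brevity.
func pidIsAlive(pid int) bool {
    proc, err := os.FindProcess(pid)
    if err != nil {
        return false
    }
    return proc.Signal(syscall.Signal(0)) == nil
}

func main() {
    fmt.Println(pidIsAlive(os.Getpid())) // our own PID: true
    fmt.Println(pidIsAlive(999999))      // almost certainly false
}

The patch avoids a probe like this because a stale PID is harmless here: per the new code comments, stopProcess already handles a missing process gracefully, and the supervisor cleans up its own PID on a clean exit.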

File tree

3 files changed: +433 -101 lines changed

pkg/workloads/manager.go

Lines changed: 115 additions & 18 deletions
@@ -606,6 +606,28 @@ func (d *defaultManager) getWorkloadContainer(ctx context.Context, name string)
     return &container, nil
 }

+// isSupervisorProcessAlive checks if the supervisor process for a workload is alive
+// by checking if a PID exists. If a PID exists, we assume the supervisor is running.
+// This is a reasonable assumption because:
+// - If the supervisor exits cleanly, it cleans up the PID
+// - If killed unexpectedly, the PID remains but stopProcess will handle it gracefully
+// - The main issue we're preventing is accumulating zombie supervisors from repeated restarts
+func (d *defaultManager) isSupervisorProcessAlive(ctx context.Context, name string) bool {
+    if name == "" {
+        return false
+    }
+
+    // Try to read the PID - if it exists, assume supervisor is running
+    _, err := d.statuses.GetWorkloadPID(ctx, name)
+    if err != nil {
+        // No PID found, supervisor is not running
+        return false
+    }
+
+    // PID exists, assume supervisor is alive
+    return true
+}
+
 // stopProcess stops the proxy process associated with the container
 func (d *defaultManager) stopProcess(ctx context.Context, name string) {
     if name == "" {
@@ -812,10 +834,38 @@ func (d *defaultManager) restartRemoteWorkload(
         return err
     }

-    // Check if already running - compare status to WorkloadStatusRunning
+    // If workload is already running, check if the supervisor process is healthy
     if err == nil && workload.Status == rt.WorkloadStatusRunning {
-        logger.Infof("Remote workload %s is already running", name)
-        return nil
+        // Check if the supervisor process is actually alive
+        supervisorAlive := d.isSupervisorProcessAlive(ctx, runConfig.BaseName)
+
+        if supervisorAlive {
+            // Workload is running and healthy - preserve old behavior (no-op)
+            logger.Infof("Remote workload %s is already running", name)
+            return nil
+        }
+
+        // Supervisor is dead/missing - we need to clean up and restart to fix the damaged state
+        logger.Infof("Remote workload %s is running but supervisor is dead, cleaning up before restart", name)
+
+        // Set status to stopping
+        if err := d.statuses.SetWorkloadStatus(ctx, name, rt.WorkloadStatusStopping, ""); err != nil {
+            logger.Debugf("Failed to set workload %s status to stopping: %v", name, err)
+        }
+
+        // Stop the supervisor process (proxy) if it exists (may already be dead)
+        // This ensures we clean up any orphaned supervisor processes
+        d.stopProxyIfNeeded(ctx, name, runConfig.BaseName)
+
+        // Clean up client configurations
+        if err := removeClientConfigurations(name, false); err != nil {
+            logger.Warnf("Warning: Failed to remove client configurations: %v", err)
+        }
+
+        // Set status to stopped after cleanup is complete
+        if err := d.statuses.SetWorkloadStatus(ctx, name, rt.WorkloadStatusStopped, ""); err != nil {
+            logger.Debugf("Failed to set workload %s status to stopped: %v", name, err)
+        }
     }

     // Load runner configuration from state
@@ -837,6 +887,8 @@ func (d *defaultManager) restartRemoteWorkload(
 }

 // restartContainerWorkload handles restarting a container-based workload
+//
+//nolint:gocyclo // Complexity is justified - handles multiple restart scenarios and edge cases
 func (d *defaultManager) restartContainerWorkload(ctx context.Context, name string, foreground bool) error {
     // Get container info to resolve partial names and extract proper workload name
     var containerName string
@@ -864,10 +916,67 @@ func (d *defaultManager) restartContainerWorkload(ctx context.Context, name stri
         return err
     }

-    // Check if already running - compare status to WorkloadStatusRunning
+    // Check if workload is running and healthy (including supervisor process)
     if err == nil && workload.Status == rt.WorkloadStatusRunning {
-        logger.Infof("Container %s is already running", containerName)
-        return nil
+        // Check if the supervisor process is actually alive
+        supervisorAlive := d.isSupervisorProcessAlive(ctx, workloadName)
+
+        if supervisorAlive {
+            // Workload is running and healthy - preserve old behavior (no-op)
+            logger.Infof("Container %s is already running", containerName)
+            return nil
+        }
+
+        // Supervisor is dead/missing - we need to clean up and restart to fix the damaged state
+        logger.Infof("Container %s is running but supervisor is dead, cleaning up before restart", containerName)
+    }
+
+    // Check if we need to stop the workload before restarting
+    // This happens when: 1) container is running, or 2) inconsistent state
+    shouldStop := false
+    if err == nil && workload.Status == rt.WorkloadStatusRunning {
+        // Workload status shows running (and supervisor is dead, otherwise we would have returned above)
+        shouldStop = true
+    } else if container.IsRunning() {
+        // Container is running but status is not running (inconsistent state)
+        shouldStop = true
+    }
+
+    // If we need to stop, do it now (including cleanup of any remaining supervisor process)
+    if shouldStop {
+        logger.Infof("Stopping container %s before restart", containerName)
+
+        // Set status to stopping
+        if err := d.statuses.SetWorkloadStatus(ctx, workloadName, rt.WorkloadStatusStopping, ""); err != nil {
+            logger.Debugf("Failed to set workload %s status to stopping: %v", workloadName, err)
+        }
+
+        // Stop the supervisor process (proxy) if it exists (may already be dead)
+        // This ensures we clean up any orphaned supervisor processes
+        if !labels.IsAuxiliaryWorkload(container.Labels) {
+            d.stopProcess(ctx, workloadName)
+        }
+
+        // Now stop the container if it's running
+        if container.IsRunning() {
+            if err := d.runtime.StopWorkload(ctx, containerName); err != nil {
+                if statusErr := d.statuses.SetWorkloadStatus(ctx, workloadName, rt.WorkloadStatusError, err.Error()); statusErr != nil {
+                    logger.Warnf("Failed to set workload %s status to error: %v", workloadName, statusErr)
+                }
+                return fmt.Errorf("failed to stop container %s: %v", containerName, err)
+            }
+            logger.Infof("Container %s stopped", containerName)
+        }
+
+        // Clean up client configurations
+        if err := removeClientConfigurations(workloadName, labels.IsAuxiliaryWorkload(container.Labels)); err != nil {
+            logger.Warnf("Warning: Failed to remove client configurations: %v", err)
+        }
+
+        // Set status to stopped after cleanup is complete
+        if err := d.statuses.SetWorkloadStatus(ctx, workloadName, rt.WorkloadStatusStopped, ""); err != nil {
+            logger.Debugf("Failed to set workload %s status to stopped: %v", workloadName, err)
+        }
     }

     // Load runner configuration from state
@@ -882,18 +991,6 @@ func (d *defaultManager) restartContainerWorkload(ctx context.Context, name stri
     }
     logger.Infof("Loaded configuration from state for %s", workloadName)

-    // Stop container if needed - since workload is not in running status, check if container needs stopping
-    if container.IsRunning() {
-        logger.Infof("Container %s is running but workload is not in running state. Stopping container...", containerName)
-        if err := d.runtime.StopWorkload(ctx, containerName); err != nil {
-            if statusErr := d.statuses.SetWorkloadStatus(ctx, workloadName, rt.WorkloadStatusError, ""); statusErr != nil {
-                logger.Warnf("Failed to set workload %s status to error: %v", workloadName, statusErr)
-            }
-            return fmt.Errorf("failed to stop container %s: %v", containerName, err)
-        }
-        logger.Infof("Container %s stopped", containerName)
-    }
-
     // Start the workload with background context to avoid timeout cancellation
     // The ctx with AsyncOperationTimeout is only for the restart setup operations,
     // but the actual workload should run indefinitely with its own lifecycle management
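
The commit message says the change ships with unit tests and an E2E test confirming that supervisor processes do not accumulate across repeated restarts, but the two test files in this commit are not shown on this page. As a rough, hypothetical illustration of the property being tested (not the repository's actual test; the workload name, the thv restart invocation, and the pgrep pattern are assumptions), such a check could be scripted like this:

// Hypothetical end-to-end style check, not the repository's test code.
// It counts supervisor-like processes before and after several restarts;
// before this fix, each restart that hit the "already running" fast path
// could leave an extra supervisor behind, so the counts would diverge.
package main

import (
    "fmt"
    "os/exec"
    "strconv"
    "strings"
)

// countProcesses returns how many running processes have a command line
// matching pattern, using pgrep -f -c. pgrep exits non-zero when nothing
// matches, which we treat as a count of zero.
func countProcesses(pattern string) (int, error) {
    out, err := exec.Command("pgrep", "-f", "-c", pattern).Output()
    if err != nil {
        if _, ok := err.(*exec.ExitError); ok {
            return 0, nil // no matches
        }
        return 0, err
    }
    return strconv.Atoi(strings.TrimSpace(string(out)))
}

func main() {
    const workload = "my-mcp-server" // assumed workload name
    before, _ := countProcesses(workload)

    // Restart the same workload several times via the CLI.
    for i := 0; i < 3; i++ {
        if err := exec.Command("thv", "restart", workload).Run(); err != nil {
            fmt.Println("restart failed:", err)
            return
        }
    }

    after, _ := countProcesses(workload)
    fmt.Printf("supervisor-like processes before=%d after=%d\n", before, after)
    if after > before {
        fmt.Println("FAIL: processes accumulated across restarts")
    }
}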
