Skip to content

Commit 48753af

Browse files
committed
cleanup collection
1 parent a652690 commit 48753af

File tree

1 file changed

+63
-10
lines changed

1 file changed

+63
-10
lines changed

scripts/multinode-runner.sh

Lines changed: 63 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -149,6 +149,7 @@ cleanup() {
149149
log "Cleaning up test processes (exit code: $exit_code)..."
150150

151151
local cleaned_count=0
152+
local cleanup_failures=() # Track cleanup operations that fail
152153

153154
# Calculate cleanup timeouts with CI scaling
154155
# In CI environments, processes take longer to terminate gracefully
@@ -167,53 +168,89 @@ cleanup() {
167168
# Kill relay first with graceful shutdown attempt
168169
if is_valid_pid "$RELAY_PID"; then
169170
log "Stopping relay (PID: $RELAY_PID)"
170-
kill "$RELAY_PID" 2>/dev/null || true
171+
if ! kill "$RELAY_PID" 2>/dev/null; then
172+
cleanup_failures+=("Failed to send SIGTERM to relay PID $RELAY_PID")
173+
fi
171174

172175
# Wait for graceful shutdown with timeout
173176
local wait_count=0
177+
local relay_terminated=false
174178
while [[ $wait_count -lt $relay_grace_period ]]; do
175179
if ! kill -0 "$RELAY_PID" 2>/dev/null; then
176180
log "Relay terminated gracefully"
181+
relay_terminated=true
177182
break
178183
fi
179184
sleep 1
180185
((wait_count++)) || true
181186
done
182187

183188
# Force kill if still running
184-
# No race condition: kill -9 will simply fail if process already dead
185-
if kill -9 "$RELAY_PID" 2>/dev/null; then
186-
log "Force-killed relay after ${relay_grace_period}s timeout"
189+
if [[ "$relay_terminated" == "false" ]]; then
190+
if kill -9 "$RELAY_PID" 2>/dev/null; then
191+
log "Force-killed relay after ${relay_grace_period}s timeout"
192+
cleanup_failures+=("Relay PID $RELAY_PID required force-kill after ${relay_grace_period}s")
193+
else
194+
cleanup_failures+=("Failed to force-kill relay PID $RELAY_PID (may have already exited)")
195+
fi
187196
fi
188197
((cleaned_count++)) || true
189198
fi
190199

191200
# Kill by process name as fallback
192-
pkill -f "relay.mjs" 2>/dev/null && ((cleaned_count++)) || true
193-
pkill -f "node.*network.sh" 2>/dev/null && ((cleaned_count++)) || true
201+
if pkill -f "relay.mjs" 2>/dev/null; then
202+
((cleaned_count++)) || true
203+
cleanup_failures+=("Found relay.mjs processes via pkill (PID tracking may have failed)")
204+
fi
205+
if pkill -f "node.*network.sh" 2>/dev/null; then
206+
((cleaned_count++)) || true
207+
cleanup_failures+=("Found network.sh processes via pkill (PID tracking may have failed)")
208+
fi
194209

195210
# Kill all spawned node processes
196211
if [[ ${#CLEANUP_PIDS[@]} -gt 0 ]]; then
197212
log "Cleaning up ${#CLEANUP_PIDS[@]} node processes"
213+
local failed_kills=()
198214
for pid in "${CLEANUP_PIDS[@]}"; do
199215
if is_valid_pid "$pid"; then
200-
kill "$pid" 2>/dev/null || true
201-
((cleaned_count++)) || true
216+
if ! kill "$pid" 2>/dev/null; then
217+
failed_kills+=("$pid")
218+
else
219+
((cleaned_count++)) || true
220+
fi
202221
fi
203222
done
204223

224+
if [[ ${#failed_kills[@]} -gt 0 ]]; then
225+
cleanup_failures+=("Failed to send SIGTERM to node PIDs: ${failed_kills[*]}")
226+
fi
227+
205228
# Give processes time to clean up gracefully
206229
sleep "$node_grace_period"
207230

208231
# Force kill remaining processes
209-
# No race condition: kill -9 will simply fail if process already dead
232+
local force_killed=()
233+
local still_running=()
210234
for pid in "${CLEANUP_PIDS[@]}"; do
211235
if is_valid_pid "$pid"; then
212236
if kill -9 "$pid" 2>/dev/null; then
213237
log "Force-killed remaining process $pid"
238+
force_killed+=("$pid")
239+
else
240+
# Process may have exited between check and kill
241+
if kill -0 "$pid" 2>/dev/null; then
242+
still_running+=("$pid")
243+
fi
214244
fi
215245
fi
216246
done
247+
248+
if [[ ${#force_killed[@]} -gt 0 ]]; then
249+
cleanup_failures+=("Node PIDs required force-kill after ${node_grace_period}s: ${force_killed[*]}")
250+
fi
251+
if [[ ${#still_running[@]} -gt 0 ]]; then
252+
cleanup_failures+=("Failed to kill node PIDs: ${still_running[*]}")
253+
fi
217254
fi
218255

219256
# Clean up ports used by the test
@@ -228,7 +265,9 @@ cleanup() {
228265
fi
229266

230267
# Final cleanup of any troupe processes
231-
pkill -9 -f "node.*troupe" || true
268+
if pkill -9 -f "node.*troupe" 2>/dev/null; then
269+
cleanup_failures+=("Found lingering troupe processes requiring pkill -9")
270+
fi
232271

233272
# Clean up temporary directory (preserve on failure for debugging)
234273
if [[ -n "$TEMP_DIR" && -d "$TEMP_DIR" ]]; then
@@ -253,6 +292,20 @@ cleanup() {
253292
log "Cleaned up $cleaned_count processes, waiting ${socket_release_wait}s for socket release..."
254293
sleep "$socket_release_wait"
255294

295+
# Report cleanup issues if any occurred
296+
# These are informational only and don't affect the test result
297+
if [[ ${#cleanup_failures[@]} -gt 0 ]]; then
298+
echo "" >&2
299+
echo "=== Cleanup Issues Report (non-fatal) ===" >&2
300+
echo "The following cleanup operations had issues:" >&2
301+
for failure in "${cleanup_failures[@]}"; do
302+
echo " - $failure" >&2
303+
done
304+
echo "" >&2
305+
echo "Note: These cleanup issues did not affect the test result (exit code: $exit_code)" >&2
306+
echo "They are reported for diagnostic purposes only." >&2
307+
fi
308+
256309
# CRITICAL: Exit with the original exit code from the test, not from cleanup
257310
# Without this, the script would exit with the status of the last cleanup command
258311
# This is why tests were "failing" in CI even though they succeeded

0 commit comments

Comments
 (0)