@@ -149,6 +149,7 @@ cleanup() {
149149 log " Cleaning up test processes (exit code: $exit_code )..."
150150
151151 local cleaned_count=0
152+ local cleanup_failures=() # Track cleanup operations that fail
152153
153154 # Calculate cleanup timeouts with CI scaling
154155 # In CI environments, processes take longer to terminate gracefully
@@ -167,53 +168,89 @@ cleanup() {
167168 # Kill relay first with graceful shutdown attempt
168169 if is_valid_pid " $RELAY_PID " ; then
169170 log " Stopping relay (PID: $RELAY_PID )"
170- kill " $RELAY_PID " 2> /dev/null || true
171+ if ! kill " $RELAY_PID " 2> /dev/null; then
172+ cleanup_failures+=(" Failed to send SIGTERM to relay PID $RELAY_PID " )
173+ fi
171174
172175 # Wait for graceful shutdown with timeout
173176 local wait_count=0
177+ local relay_terminated=false
174178 while [[ $wait_count -lt $relay_grace_period ]]; do
175179 if ! kill -0 " $RELAY_PID " 2> /dev/null; then
176180 log " Relay terminated gracefully"
181+ relay_terminated=true
177182 break
178183 fi
179184 sleep 1
180185 (( wait_count++ )) || true
181186 done
182187
183188 # Force kill if still running
184- # No race condition: kill -9 will simply fail if process already dead
185- if kill -9 " $RELAY_PID " 2> /dev/null; then
186- log " Force-killed relay after ${relay_grace_period} s timeout"
189+ if [[ " $relay_terminated " == " false" ]]; then
190+ if kill -9 " $RELAY_PID " 2> /dev/null; then
191+ log " Force-killed relay after ${relay_grace_period} s timeout"
192+ cleanup_failures+=(" Relay PID $RELAY_PID required force-kill after ${relay_grace_period} s" )
193+ else
194+ cleanup_failures+=(" Failed to force-kill relay PID $RELAY_PID (may have already exited)" )
195+ fi
187196 fi
188197 (( cleaned_count++ )) || true
189198 fi
190199
191200 # Kill by process name as fallback
192- pkill -f " relay.mjs" 2> /dev/null && (( cleaned_count++ )) || true
193- pkill -f " node.*network.sh" 2> /dev/null && (( cleaned_count++ )) || true
201+ if pkill -f " relay.mjs" 2> /dev/null; then
202+ (( cleaned_count++ )) || true
203+ cleanup_failures+=(" Found relay.mjs processes via pkill (PID tracking may have failed)" )
204+ fi
205+ if pkill -f " node.*network.sh" 2> /dev/null; then
206+ (( cleaned_count++ )) || true
207+ cleanup_failures+=(" Found network.sh processes via pkill (PID tracking may have failed)" )
208+ fi
194209
195210 # Kill all spawned node processes
196211 if [[ ${# CLEANUP_PIDS[@]} -gt 0 ]]; then
197212 log " Cleaning up ${# CLEANUP_PIDS[@]} node processes"
213+ local failed_kills=()
198214 for pid in " ${CLEANUP_PIDS[@]} " ; do
199215 if is_valid_pid " $pid " ; then
200- kill " $pid " 2> /dev/null || true
201- (( cleaned_count++ )) || true
216+ if ! kill " $pid " 2> /dev/null; then
217+ failed_kills+=(" $pid " )
218+ else
219+ (( cleaned_count++ )) || true
220+ fi
202221 fi
203222 done
204223
224+ if [[ ${# failed_kills[@]} -gt 0 ]]; then
225+ cleanup_failures+=(" Failed to send SIGTERM to node PIDs: ${failed_kills[*]} " )
226+ fi
227+
205228 # Give processes time to clean up gracefully
206229 sleep " $node_grace_period "
207230
208231 # Force kill remaining processes
209- # No race condition: kill -9 will simply fail if process already dead
232+ local force_killed=()
233+ local still_running=()
210234 for pid in " ${CLEANUP_PIDS[@]} " ; do
211235 if is_valid_pid " $pid " ; then
212236 if kill -9 " $pid " 2> /dev/null; then
213237 log " Force-killed remaining process $pid "
238+ force_killed+=(" $pid " )
239+ else
240+ # Process may have exited between check and kill
241+ if kill -0 " $pid " 2> /dev/null; then
242+ still_running+=(" $pid " )
243+ fi
214244 fi
215245 fi
216246 done
247+
248+ if [[ ${# force_killed[@]} -gt 0 ]]; then
249+ cleanup_failures+=(" Node PIDs required force-kill after ${node_grace_period} s: ${force_killed[*]} " )
250+ fi
251+ if [[ ${# still_running[@]} -gt 0 ]]; then
252+ cleanup_failures+=(" Failed to kill node PIDs: ${still_running[*]} " )
253+ fi
217254 fi
218255
219256 # Clean up ports used by the test
@@ -228,7 +265,9 @@ cleanup() {
228265 fi
229266
230267 # Final cleanup of any troupe processes
231- pkill -9 -f " node.*troupe" || true
268+ if pkill -9 -f " node.*troupe" 2> /dev/null; then
269+ cleanup_failures+=(" Found lingering troupe processes requiring pkill -9" )
270+ fi
232271
233272 # Clean up temporary directory (preserve on failure for debugging)
234273 if [[ -n " $TEMP_DIR " && -d " $TEMP_DIR " ]]; then
@@ -253,6 +292,20 @@ cleanup() {
253292 log " Cleaned up $cleaned_count processes, waiting ${socket_release_wait} s for socket release..."
254293 sleep " $socket_release_wait "
255294
295+ # Report cleanup issues if any occurred
296+ # These are informational only and don't affect the test result
297+ if [[ ${# cleanup_failures[@]} -gt 0 ]]; then
298+ echo " " >&2
299+ echo " === Cleanup Issues Report (non-fatal) ===" >&2
300+ echo " The following cleanup operations had issues:" >&2
301+ for failure in " ${cleanup_failures[@]} " ; do
302+ echo " - $failure " >&2
303+ done
304+ echo " " >&2
305+ echo " Note: These cleanup issues did not affect the test result (exit code: $exit_code )" >&2
306+ echo " They are reported for diagnostic purposes only." >&2
307+ fi
308+
256309 # CRITICAL: Exit with the original exit code from the test, not from cleanup
257310 # Without this, the script would exit with the status of the last cleanup command
258311 # This is why tests were "failing" in CI even though they succeeded
0 commit comments