Skip to content

Commit a652690

Browse files
committed
fix:cc:timeout-cleanup-issues
1 parent 984b8f2 commit a652690

File tree

1 file changed

+69
-23
lines changed

1 file changed

+69
-23
lines changed

scripts/multinode-runner.sh

Lines changed: 69 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,18 @@ set -euo pipefail
2929
# 6. CI SCALING: In CI environments, timeouts are scaled by 1.2x (20% increase)
3030
# to account for slower machines. This is a simple multiplier, not retry logic.
3131
#
32-
# 7. PORT MANAGEMENT: Tests use specific ports. Cleanup must be thorough to
32+
# 7. EXIT CODE PRESERVATION: The cleanup function MUST preserve the original test
33+
# exit code. Without 'exit $exit_code' at the end of cleanup, the script would
34+
# exit with the status of the last cleanup command, causing false test failures.
35+
#
36+
# 8. CLEANUP TIMING: Cleanup operations need CI scaling too. Processes take longer
37+
# to terminate gracefully in CI. We give relay/nodes 3s base (3.6s in CI) to
38+
# shutdown cleanly before force-killing to avoid race conditions.
39+
#
40+
# 9. ARITHMETIC SAFETY: Use '|| true' after ((var++)) when set -e is active.
41+
# In bash, ((0)) returns exit code 1, which terminates the script with set -e.
42+
#
43+
# 10. PORT MANAGEMENT: Tests use specific ports. Cleanup must be thorough to
3344
# prevent "port already in use" errors, but without retrying binds.
3445

3546
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
@@ -132,22 +143,49 @@ EOF
132143
}
133144

134145
cleanup() {
135-
# Capture exit code before any other commands
146+
# CRITICAL: Capture the original exit code immediately
147+
# This must be the FIRST command in cleanup to preserve test results
136148
exit_code=$?
137149
log "Cleaning up test processes (exit code: $exit_code)..."
138150

139151
local cleaned_count=0
140152

141-
# Kill relay first with multiple attempts
153+
# Calculate cleanup timeouts with CI scaling
154+
# In CI environments, processes take longer to terminate gracefully
155+
local relay_grace_period=3
156+
local node_grace_period=3
157+
local socket_release_wait=3
158+
159+
if [[ -n "${TIMEOUT_SCALE:-}" ]]; then
160+
# Apply the same scaling factor used for test timeouts to cleanup
161+
relay_grace_period=$(awk "BEGIN {print int($relay_grace_period * $TIMEOUT_SCALE + 0.5)}")
162+
node_grace_period=$(awk "BEGIN {print int($node_grace_period * $TIMEOUT_SCALE + 0.5)}")
163+
socket_release_wait=$(awk "BEGIN {print int($socket_release_wait * $TIMEOUT_SCALE + 0.5)}")
164+
log "CI environment: using scaled cleanup timeouts (relay=${relay_grace_period}s, nodes=${node_grace_period}s, sockets=${socket_release_wait}s)"
165+
fi
166+
167+
# Kill relay first with graceful shutdown attempt
142168
if is_valid_pid "$RELAY_PID"; then
143169
log "Stopping relay (PID: $RELAY_PID)"
144170
kill "$RELAY_PID" 2>/dev/null || true
145-
sleep 1
146-
if kill -0 "$RELAY_PID" 2>/dev/null; then
147-
log "Force-killing relay"
148-
kill -9 "$RELAY_PID" 2>/dev/null || true
171+
172+
# Wait for graceful shutdown with timeout
173+
local wait_count=0
174+
while [[ $wait_count -lt $relay_grace_period ]]; do
175+
if ! kill -0 "$RELAY_PID" 2>/dev/null; then
176+
log "Relay terminated gracefully"
177+
break
178+
fi
179+
sleep 1
180+
((wait_count++)) || true
181+
done
182+
183+
# Force kill if still running
184+
# No race condition: kill -9 will simply fail if process already dead
185+
if kill -9 "$RELAY_PID" 2>/dev/null; then
186+
log "Force-killed relay after ${relay_grace_period}s timeout"
149187
fi
150-
((cleaned_count++))
188+
((cleaned_count++)) || true
151189
fi
152190

153191
# Kill by process name as fallback
@@ -160,22 +198,24 @@ cleanup() {
160198
for pid in "${CLEANUP_PIDS[@]}"; do
161199
if is_valid_pid "$pid"; then
162200
kill "$pid" 2>/dev/null || true
163-
((cleaned_count++))
201+
((cleaned_count++)) || true
164202
fi
165203
done
166204

167-
# Give processes more time to clean up
168-
sleep 2
205+
# Give processes time to clean up gracefully
206+
sleep "$node_grace_period"
169207

170208
# Force kill remaining processes
209+
# No race condition: kill -9 will simply fail if process already dead
171210
for pid in "${CLEANUP_PIDS[@]}"; do
172211
if is_valid_pid "$pid"; then
173-
log "Force-killing remaining process $pid"
174-
kill -9 "$pid" 2>/dev/null || true
212+
if kill -9 "$pid" 2>/dev/null; then
213+
log "Force-killed remaining process $pid"
214+
fi
175215
fi
176216
done
177217
fi
178-
218+
179219
# Clean up ports used by the test
180220
if [[ -n "$TEST_CONFIG" ]]; then
181221
local ports=($(get_all_ports "$TEST_CONFIG"))
@@ -186,10 +226,10 @@ cleanup() {
186226
fi
187227
done
188228
fi
189-
229+
190230
# Final cleanup of any troupe processes
191231
pkill -9 -f "node.*troupe" || true
192-
232+
193233
# Clean up temporary directory (preserve on failure for debugging)
194234
if [[ -n "$TEMP_DIR" && -d "$TEMP_DIR" ]]; then
195235
if [[ $exit_code -ne 0 ]]; then
@@ -206,11 +246,17 @@ cleanup() {
206246
rm -rf "$TEMP_DIR"
207247
fi
208248
fi
209-
210-
# Additional delay to ensure OS releases sockets
211-
# This is critical for preventing "port already in use" errors in subsequent tests
212-
log "Cleaned up $cleaned_count processes, waiting for socket release..."
213-
sleep 3
249+
250+
# Wait for OS to release sockets
251+
# This prevents "port already in use" errors in subsequent tests
252+
# Critical in CI where socket cleanup can be slower
253+
log "Cleaned up $cleaned_count processes, waiting ${socket_release_wait}s for socket release..."
254+
sleep "$socket_release_wait"
255+
256+
# CRITICAL: Exit with the original exit code from the test, not from cleanup
257+
# Without this, the script would exit with the status of the last cleanup command
258+
# This is why tests were "failing" in CI even though they succeeded
259+
exit $exit_code
214260
}
215261

216262
trap cleanup EXIT INT TERM
@@ -251,7 +297,7 @@ wait_for_port_release() {
251297
fi
252298
log "Port $port still in use, waiting... ($((max_wait - wait_count))s remaining)"
253299
sleep 1
254-
((wait_count++))
300+
((wait_count++)) || true
255301
done
256302

257303
# Force kill processes using the port
@@ -502,7 +548,7 @@ start_relay() {
502548
fi
503549

504550
sleep 0.5
505-
((wait_count++))
551+
((wait_count++)) || true
506552
done
507553

508554
# Re-enable exit on error before erroring out

0 commit comments

Comments
 (0)