@@ -29,7 +29,18 @@ set -euo pipefail
2929# 6. CI SCALING: In CI environments, timeouts are scaled by 1.2x (20% increase)
3030# to account for slower machines. This is a simple multiplier, not retry logic.
3131#
32- # 7. PORT MANAGEMENT: Tests use specific ports. Cleanup must be thorough to
32+ # 7. EXIT CODE PRESERVATION: The cleanup function MUST preserve the original test
33+ # exit code. Without 'exit $exit_code' at the end of cleanup, the script would
34+ # exit with the status of the last cleanup command, causing false test failures.
35+ #
36+ # 8. CLEANUP TIMING: Cleanup operations need CI scaling too. Processes take longer
37+ # to terminate gracefully in CI. We give relay/nodes 3s base (3.6s in CI) to
38+ # shutdown cleanly before force-killing to avoid race conditions.
39+ #
40+ # 9. ARITHMETIC SAFETY: Use '|| true' after ((var++)) when set -e is active.
41+ # In bash, ((0)) returns exit code 1, which terminates the script with set -e.
42+ #
43+ # 10. PORT MANAGEMENT: Tests use specific ports. Cleanup must be thorough to
3344# prevent "port already in use" errors, but without retrying binds.
3445
3546SCRIPT_DIR=" $( cd " $( dirname " ${BASH_SOURCE[0]} " ) " && pwd) "
@@ -132,22 +143,49 @@ EOF
132143}
133144
134145cleanup () {
135- # Capture exit code before any other commands
146+ # CRITICAL: Capture the original exit code immediately
147+ # This must be the FIRST command in cleanup to preserve test results
136148 exit_code=$?
137149 log " Cleaning up test processes (exit code: $exit_code )..."
138150
139151 local cleaned_count=0
140152
141- # Kill relay first with multiple attempts
153+ # Calculate cleanup timeouts with CI scaling
154+ # In CI environments, processes take longer to terminate gracefully
155+ local relay_grace_period=3
156+ local node_grace_period=3
157+ local socket_release_wait=3
158+
159+ if [[ -n " ${TIMEOUT_SCALE:- } " ]]; then
160+ # Apply the same scaling factor used for test timeouts to cleanup
161+ relay_grace_period=$( awk " BEGIN {print int($relay_grace_period * $TIMEOUT_SCALE + 0.5)}" )
162+ node_grace_period=$( awk " BEGIN {print int($node_grace_period * $TIMEOUT_SCALE + 0.5)}" )
163+ socket_release_wait=$( awk " BEGIN {print int($socket_release_wait * $TIMEOUT_SCALE + 0.5)}" )
164+ log " CI environment: using scaled cleanup timeouts (relay=${relay_grace_period} s, nodes=${node_grace_period} s, sockets=${socket_release_wait} s)"
165+ fi
166+
167+ # Kill relay first with graceful shutdown attempt
142168 if is_valid_pid " $RELAY_PID " ; then
143169 log " Stopping relay (PID: $RELAY_PID )"
144170 kill " $RELAY_PID " 2> /dev/null || true
145- sleep 1
146- if kill -0 " $RELAY_PID " 2> /dev/null; then
147- log " Force-killing relay"
148- kill -9 " $RELAY_PID " 2> /dev/null || true
171+
172+ # Wait for graceful shutdown with timeout
173+ local wait_count=0
174+ while [[ $wait_count -lt $relay_grace_period ]]; do
175+ if ! kill -0 " $RELAY_PID " 2> /dev/null; then
176+ log " Relay terminated gracefully"
177+ break
178+ fi
179+ sleep 1
180+ (( wait_count++ )) || true
181+ done
182+
183+ # Force kill if still running
184+ # No race condition: kill -9 will simply fail if process already dead
185+ if kill -9 " $RELAY_PID " 2> /dev/null; then
186+ log " Force-killed relay after ${relay_grace_period} s timeout"
149187 fi
150- (( cleaned_count++ ))
188+ (( cleaned_count++ )) || true
151189 fi
152190
153191 # Kill by process name as fallback
@@ -160,22 +198,24 @@ cleanup() {
160198 for pid in " ${CLEANUP_PIDS[@]} " ; do
161199 if is_valid_pid " $pid " ; then
162200 kill " $pid " 2> /dev/null || true
163- (( cleaned_count++ ))
201+ (( cleaned_count++ )) || true
164202 fi
165203 done
166204
167- # Give processes more time to clean up
168- sleep 2
205+ # Give processes time to clean up gracefully
206+ sleep " $node_grace_period "
169207
170208 # Force kill remaining processes
209+ # No race condition: kill -9 will simply fail if process already dead
171210 for pid in " ${CLEANUP_PIDS[@]} " ; do
172211 if is_valid_pid " $pid " ; then
173- log " Force-killing remaining process $pid "
174- kill -9 " $pid " 2> /dev/null || true
212+ if kill -9 " $pid " 2> /dev/null; then
213+ log " Force-killed remaining process $pid "
214+ fi
175215 fi
176216 done
177217 fi
178-
218+
179219 # Clean up ports used by the test
180220 if [[ -n " $TEST_CONFIG " ]]; then
181221 local ports=($( get_all_ports " $TEST_CONFIG " ) )
@@ -186,10 +226,10 @@ cleanup() {
186226 fi
187227 done
188228 fi
189-
229+
190230 # Final cleanup of any troupe processes
191231 pkill -9 -f " node.*troupe" || true
192-
232+
193233 # Clean up temporary directory (preserve on failure for debugging)
194234 if [[ -n " $TEMP_DIR " && -d " $TEMP_DIR " ]]; then
195235 if [[ $exit_code -ne 0 ]]; then
@@ -206,11 +246,17 @@ cleanup() {
206246 rm -rf " $TEMP_DIR "
207247 fi
208248 fi
209-
210- # Additional delay to ensure OS releases sockets
211- # This is critical for preventing "port already in use" errors in subsequent tests
212- log " Cleaned up $cleaned_count processes, waiting for socket release..."
213- sleep 3
249+
250+ # Wait for OS to release sockets
251+ # This prevents "port already in use" errors in subsequent tests
252+ # Critical in CI where socket cleanup can be slower
253+ log " Cleaned up $cleaned_count processes, waiting ${socket_release_wait} s for socket release..."
254+ sleep " $socket_release_wait "
255+
256+ # CRITICAL: Exit with the original exit code from the test, not from cleanup
257+ # Without this, the script would exit with the status of the last cleanup command
258+ # This is why tests were "failing" in CI even though they succeeded
259+ exit $exit_code
214260}
215261
216262trap cleanup EXIT INT TERM
@@ -251,7 +297,7 @@ wait_for_port_release() {
251297 fi
252298 log " Port $port still in use, waiting... ($(( max_wait - wait_count)) s remaining)"
253299 sleep 1
254- (( wait_count++ ))
300+ (( wait_count++ )) || true
255301 done
256302
257303 # Force kill processes using the port
@@ -502,7 +548,7 @@ start_relay() {
502548 fi
503549
504550 sleep 0.5
505- (( wait_count++ ))
551+ (( wait_count++ )) || true
506552 done
507553
508554 # Re-enable exit on error before erroring out
0 commit comments