Skip to content

Commit 984b8f2

Browse files
committed
improvement:refactoring of the multirunner
1 parent dc2d60c commit 984b8f2

File tree

1 file changed

+83
-48
lines changed

1 file changed

+83
-48
lines changed

scripts/multinode-runner.sh

Lines changed: 83 additions & 48 deletions
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,72 @@ if [[ -n "${CI:-}" ]] || [[ -n "${GITHUB_ACTIONS:-}" ]]; then
4848
echo "[INFO] CI environment detected, scaling timeouts by ${TIMEOUT_SCALE}x" >&2
4949
fi
5050

51+
# ============================================================================
52+
# Helper Functions - Common operations extracted to reduce duplication
53+
# ============================================================================
54+
55+
# Check if a PID is valid and the process is running
56+
is_valid_pid() {
57+
local pid="$1"
58+
[[ -n "$pid" ]] && [[ "$pid" =~ ^[0-9]+$ ]] && kill -0 "$pid" 2>/dev/null
59+
}
60+
61+
# Kill all processes listening on a specific port (OS-aware)
62+
kill_processes_on_port() {
63+
local port="$1"
64+
if command -v lsof >/dev/null 2>&1; then
65+
if [[ "$(uname)" == "Darwin" ]]; then
66+
lsof -ti ":$port" | xargs kill -9 2>/dev/null || true
67+
else
68+
lsof -ti ":$port" | xargs -r kill -9 2>/dev/null || true
69+
fi
70+
fi
71+
}
72+
73+
# Get all ports used by the test (nodes + relay)
74+
get_all_ports() {
75+
local config_file="$1"
76+
local ports=($(jq -r '.nodes[].port' "$config_file" 2>/dev/null))
77+
local relay_port=$(jq -r '.network.relay_port // 5555' "$config_file" 2>/dev/null)
78+
ports+=("$relay_port")
79+
echo "${ports[@]}"
80+
}
81+
82+
# Get the number of nodes in the test configuration
83+
get_node_count() {
84+
local config_file="$1"
85+
jq -r '.nodes | length' "$config_file"
86+
}
87+
88+
# Display standardized error message for node failures
89+
display_node_error() {
90+
local node_id="$1"
91+
local error_type="$2" # "timeout" or "exit_code"
92+
local actual="$3"
93+
local expected="$4"
94+
local output_file="$5"
95+
local error_file="$6"
96+
local timeout_val="${7:-}" # Optional, only for timeout errors
97+
98+
if [[ "$error_type" == "timeout" ]]; then
99+
echo "ERROR: Node $node_id timed out unexpectedly" >&2
100+
echo " Timeout: ${actual}s (original: ${timeout_val}s)" >&2
101+
echo " Expected exit code: $expected" >&2
102+
else
103+
echo "ERROR: Node $node_id exit code mismatch" >&2
104+
echo " Actual: $actual" >&2
105+
echo " Expected: $expected" >&2
106+
fi
107+
108+
echo " Last 10 lines of output:" >&2
109+
tail -n 10 "$output_file" 2>/dev/null | sed 's/^/ /' >&2
110+
111+
if [[ "$error_type" == "exit_code" ]] && [[ -s "$error_file" ]]; then
112+
echo " Last 5 lines of errors:" >&2
113+
tail -n 5 "$error_file" 2>/dev/null | sed 's/^/ /' >&2
114+
fi
115+
}
116+
51117
usage() {
52118
cat << EOF
53119
Usage: $0 [options] <test-config.json>
@@ -73,7 +139,7 @@ cleanup() {
73139
local cleaned_count=0
74140

75141
# Kill relay first with multiple attempts
76-
if [[ -n "$RELAY_PID" ]] && [[ "$RELAY_PID" =~ ^[0-9]+$ ]] && kill -0 "$RELAY_PID" 2>/dev/null; then
142+
if is_valid_pid "$RELAY_PID"; then
77143
log "Stopping relay (PID: $RELAY_PID)"
78144
kill "$RELAY_PID" 2>/dev/null || true
79145
sleep 1
@@ -92,7 +158,7 @@ cleanup() {
92158
if [[ ${#CLEANUP_PIDS[@]} -gt 0 ]]; then
93159
log "Cleaning up ${#CLEANUP_PIDS[@]} node processes"
94160
for pid in "${CLEANUP_PIDS[@]}"; do
95-
if [[ "$pid" =~ ^[0-9]+$ ]] && kill -0 "$pid" 2>/dev/null; then
161+
if is_valid_pid "$pid"; then
96162
kill "$pid" 2>/dev/null || true
97163
((cleaned_count++))
98164
fi
@@ -103,7 +169,7 @@ cleanup() {
103169

104170
# Force kill remaining processes
105171
for pid in "${CLEANUP_PIDS[@]}"; do
106-
if [[ "$pid" =~ ^[0-9]+$ ]] && kill -0 "$pid" 2>/dev/null; then
172+
if is_valid_pid "$pid"; then
107173
log "Force-killing remaining process $pid"
108174
kill -9 "$pid" 2>/dev/null || true
109175
fi
@@ -112,21 +178,11 @@ cleanup() {
112178

113179
# Clean up ports used by the test
114180
if [[ -n "$TEST_CONFIG" ]]; then
115-
local ports=($(jq -r '.nodes[].port' "$TEST_CONFIG" 2>/dev/null))
116-
local relay_port=$(jq -r '.network.relay_port // 5555' "$TEST_CONFIG" 2>/dev/null)
117-
ports+=("$relay_port")
118-
181+
local ports=($(get_all_ports "$TEST_CONFIG"))
182+
119183
for port in "${ports[@]}"; do
120184
if [[ "$port" =~ ^[0-9]+$ ]]; then
121-
# Kill any process listening on this port
122-
if command -v lsof >/dev/null 2>&1; then
123-
# macOS doesn't support -r flag for xargs
124-
if [[ "$(uname)" == "Darwin" ]]; then
125-
lsof -ti ":$port" | xargs kill -9 2>/dev/null || true
126-
else
127-
lsof -ti ":$port" | xargs -r kill -9 2>/dev/null || true
128-
fi
129-
fi
185+
kill_processes_on_port "$port"
130186
fi
131187
done
132188
fi
@@ -187,7 +243,7 @@ wait_for_port_release() {
187243
local port="$1"
188244
local max_wait=10
189245
local wait_count=0
190-
246+
191247
while [[ $wait_count -lt $max_wait ]]; do
192248
if check_port_available "$port"; then
193249
log "Port $port is now available"
@@ -197,28 +253,19 @@ wait_for_port_release() {
197253
sleep 1
198254
((wait_count++))
199255
done
200-
256+
201257
# Force kill processes using the port
202258
log "Force killing processes on port $port"
203-
if command -v lsof >/dev/null 2>&1; then
204-
# macOS doesn't support -r flag for xargs
205-
if [[ "$(uname)" == "Darwin" ]]; then
206-
lsof -ti ":$port" | xargs kill -9 2>/dev/null || true
207-
else
208-
lsof -ti ":$port" | xargs -r kill -9 2>/dev/null || true
209-
fi
210-
fi
259+
kill_processes_on_port "$port"
211260
sleep 1
212261
return 0
213262
}
214263

215264
validate_ports() {
216265
local config_file="$1"
217266
log "Validating port availability..."
218-
219-
local ports=($(jq -r '.nodes[].port' "$config_file" 2>/dev/null))
220-
local relay_port=$(jq -r '.network.relay_port // 5555' "$config_file" 2>/dev/null)
221-
ports+=("$relay_port")
267+
268+
local ports=($(get_all_ports "$config_file"))
222269

223270
local port_conflict=false
224271
for port in "${ports[@]}"; do
@@ -267,7 +314,7 @@ setup_network_identities() {
267314

268315
# Generate node identities
269316
local node_count
270-
node_count=$(jq -r '.nodes | length' "$config_file")
317+
node_count=$(get_node_count "$config_file")
271318

272319
for ((i=0; i<node_count; i++)); do
273320
local node_id
@@ -606,23 +653,11 @@ run_node() {
606653
if [[ "$expected_exit_code" == "124" ]]; then
607654
log "Node $node_id timed out as expected after ${scaled_timeout}s"
608655
else
609-
echo "ERROR: Node $node_id timed out unexpectedly" >&2
610-
echo " Timeout: ${scaled_timeout}s (original: ${timeout_val}s)" >&2
611-
echo " Expected exit code: $expected_exit_code" >&2
612-
echo " Last 10 lines of output:" >&2
613-
tail -n 10 "$output_file" 2>/dev/null | sed 's/^/ /' >&2
656+
display_node_error "$node_id" "timeout" "$scaled_timeout" "$expected_exit_code" "$output_file" "$error_file" "$timeout_val"
614657
error "Node $node_id timed out unexpectedly"
615658
fi
616659
elif [[ "$actual_exit_code" != "$expected_exit_code" ]]; then
617-
echo "ERROR: Node $node_id exit code mismatch" >&2
618-
echo " Actual: $actual_exit_code" >&2
619-
echo " Expected: $expected_exit_code" >&2
620-
echo " Last 10 lines of output:" >&2
621-
tail -n 10 "$output_file" 2>/dev/null | sed 's/^/ /' >&2
622-
if [[ -s "$error_file" ]]; then
623-
echo " Last 5 lines of errors:" >&2
624-
tail -n 5 "$error_file" 2>/dev/null | sed 's/^/ /' >&2
625-
fi
660+
display_node_error "$node_id" "exit_code" "$actual_exit_code" "$expected_exit_code" "$output_file" "$error_file"
626661
error "Node $node_id exited with code $actual_exit_code, expected $expected_exit_code"
627662
fi
628663

@@ -649,7 +684,7 @@ merge_outputs() {
649684
"sequential")
650685
# Concatenate outputs in node order
651686
local node_count
652-
node_count=$(jq -r '.nodes | length' "$config_file")
687+
node_count=$(get_node_count "$config_file")
653688

654689
for ((i=0; i<node_count; i++)); do
655690
local node_id
@@ -665,7 +700,7 @@ merge_outputs() {
665700
"per_node")
666701
# Output each node separately without filtering
667702
local node_count
668-
node_count=$(jq -r '.nodes | length' "$config_file")
703+
node_count=$(get_node_count "$config_file")
669704

670705
for ((i=0; i<node_count; i++)); do
671706
local node_id
@@ -714,7 +749,7 @@ run_test() {
714749
coordination=$(jq -r '.coordination // "parallel"' "$config_file")
715750

716751
local node_count
717-
node_count=$(jq -r '.nodes | length' "$config_file")
752+
node_count=$(get_node_count "$config_file")
718753

719754
case "$coordination" in
720755
"parallel")

0 commit comments

Comments
 (0)