Skip to content

Commit 856781d

Browse files
committed
workbench: fixes to the healthcheck service
1 parent e9c9531 commit 856781d

File tree

1 file changed

+110
-48
lines changed

1 file changed

+110
-48
lines changed

nix/workbench/service/healthcheck.nix

Lines changed: 110 additions & 48 deletions
Original file line numberDiff line numberDiff line change
@@ -32,35 +32,51 @@ let
3232
3333
# Strict runtime
3434
################
35+
3536
# e: Immediately exit if any command has a non-zero exit status
3637
# u: Reference to non previously defined variables is an error
3738
# pipefail: Any failed command in a pipeline is used as return code
3839
set -euo pipefail
3940
4041
# Fetch profile parameters
4142
##########################
42-
network_magic=$(${jq}/bin/jq .genesis.network_magic ../profile.json)
43-
slot_duration=$(${jq}/bin/jq .genesis.slot_duration ../profile.json)
44-
epoch_length=$(${jq}/bin/jq .genesis.epoch_length ../profile.json)
45-
active_slots_coeff=$(${jq}/bin/jq .genesis.active_slots_coeff ../profile.json)
46-
#active_slots="$((epoch_length * active_slots_coeff))"
43+
44+
network_magic="$(${jq}/bin/jq .genesis.network_magic ../profile.json)"
45+
# Slot duration in seconds
46+
slot_duration="$(${jq}/bin/jq .genesis.slot_duration ../profile.json)"
47+
# Number of slots per epoch
48+
epoch_length="$(${jq}/bin/jq .genesis.epoch_length ../profile.json)"
49+
# Average fraction of slots per epoch that can be used to mint blocks
50+
active_slots_coeff="$(${jq}/bin/jq .genesis.active_slots_coeff ../profile.json)"
51+
# The number of active slots per epoch
52+
active_slots="$(${jq}/bin/jq --null-input -r \
53+
"''${epoch_length} * ''${active_slots_coeff}" \
54+
)"
4755
${coreutils}/bin/echo "profile.json:"
4856
${coreutils}/bin/echo "- network_magic: ''${network_magic}"
4957
${coreutils}/bin/echo "- slot_duration: ''${slot_duration}"
5058
${coreutils}/bin/echo "- epoch_length: ''${epoch_length}"
5159
${coreutils}/bin/echo "- active_slots_coeff: ''${active_slots_coeff}"
52-
#${coreutils}/bin/echo "- active_slots: ''${active_slots}"
60+
${coreutils}/bin/echo "- active_slots: ''${active_slots}"
5361
5462
# Fetch node names (Including "explorer" nodes)
5563
###############################################
56-
node_specs_nodes=$(${jq}/bin/jq --raw-output "keys | join (\" \")" ../node-specs.json)
57-
node_specs_pools=$(${jq}/bin/jq 'map(select(.kind == "pool")) | length' ../node-specs.json)
64+
65+
node_specs_nodes=$(${jq}/bin/jq --raw-output \
66+
"keys | join (\" \")" \
67+
../node-specs.json \
68+
)
69+
node_specs_pools=$(${jq}/bin/jq \
70+
'map(select(.kind == "pool")) | length' \
71+
../node-specs.json \
72+
)
5873
${coreutils}/bin/echo "node-specs.json:"
5974
${coreutils}/bin/echo "- Nodes: [''${node_specs_nodes[*]}]"
6075
${coreutils}/bin/echo "- Pools: ''${node_specs_pools}"
6176
6277
# Look for available nodes and allocate healthcheck
6378
###################################################
79+
6480
nodes=()
6581
now=$(${coreutils}/bin/date +%s)
6682
for node in ''${node_specs_nodes[*]}
@@ -80,10 +96,13 @@ let
8096
8197
# Look for the generator
8298
########################
99+
generator=0
83100
if test -d "../generator"
84101
then
102+
generator=1
85103
${coreutils}/bin/echo "Found deployed generator"
86104
else
105+
generator=0
87106
${coreutils}/bin/echo "Found no deployed generator"
88107
fi
89108
@@ -92,26 +111,51 @@ let
92111
93112
# The main function, called at the end of the file/script.
94113
function healthcheck() {
95-
# Start the healthcheck infinite loop
114+
# Ignore PIPE "errors", mixing 'jq', 'tac', 'grep' and/or 'head'
115+
# will eventually throw a PIPE exception (see jq_node_stdout_last).
96116
trap "${coreutils}/bin/echo \"trap PIPE\" >&2" PIPE
117+
97118
msg "Started!"
119+
98120
# Do a one and only networking/latency test!
99121
for node in ''${nodes[*]}
100122
do
101123
latency_topology_producers "''${node}"
102124
done
125+
126+
# Start the healthcheck infinite loop
103127
while true
104128
do
129+
130+
# First available nodes
105131
for node in ''${nodes[*]}
106132
do
107133
healthcheck_node "''${node}"
108134
done
109-
healthcheck_generator
110-
${coreutils}/bin/sleep 5 # Enough?
135+
136+
# Then generator if available
137+
if test "''${generator}" != "0"
138+
then
139+
healthcheck_generator
140+
fi
141+
142+
if test "''${#nodes[@]}" = "1"
143+
then
144+
# This healthcheck run is monitoring only one node
145+
# This is the case for all Nomad runs, either local or cloud
146+
${coreutils}/bin/sleep 10
147+
else
148+
# This healthcheck run is monitoring many nodes
149+
# Local/supervisord uses one healthcheck for the entire cluster
150+
${coreutils}/bin/sleep 1
151+
fi
152+
111153
done
154+
112155
trap - PIPE
113156
}
114157
158+
# Latency ############################################################
115159
######################################################################
116160
117161
function latency_topology_producers() {
@@ -134,9 +178,13 @@ let
134178
done
135179
}
136180
181+
# Node ###############################################################
182+
######################################################################
183+
137184
function healthcheck_node() {
138185
local node=$1
139-
if is_program_running "''${node}"
186+
# Checks if the node has not exited with errors
187+
if assert_program_running "''${node}"
140188
then
141189
# The node is running! ###########################################
142190
##################################################################
@@ -175,12 +223,15 @@ let
175223
}
176224
177225
function healthcheck_generator() {
178-
# Only checks that the generator has not exited with errors
179-
is_program_running "generator" || true
226+
# Checks if the generator has not exited with errors
227+
assert_program_running "generator" || true
180228
}
181229
182-
function is_program_running() {
230+
# Error if program exits with a non-zero exit code, else returns
231+
# 'true' if running or 'false' if not running.
232+
function assert_program_running() {
183233
local program=$1
234+
# Using the "exit_code" files created by our supervisord config
184235
local exit_code_path="../''${program}/exit_code"
185236
# File exists and is a regular file?
186237
if ! test -f "''${exit_code_path}"
@@ -279,7 +330,8 @@ let
279330
local now=$(${coreutils}/bin/date +%s)
280331
local last_forged
281332
last_forged=$(last_block_forged "''${node}")
282-
if test -z "''${last_forged}"
333+
# Just for precaution also check if 'jq' returned "null"
334+
if test -z "''${last_forged}" || test "''${last_forged}" = "null"
283335
then
284336
start_time=$(${coreutils}/bin/cat "../''${node}/healthcheck/start_time")
285337
if test $((now - start_time)) -ge 300
@@ -288,7 +340,7 @@ let
288340
fi
289341
else
290342
${coreutils}/bin/echo "''${last_forged}" > "../''${node}/healthcheck/last_forged"
291-
start_time=$(${coreutils}/bin/echo "''${last_forged}" | ${jq}/bin/jq .at)
343+
start_time=$(msg_unix_time "''${last_forged}")
292344
if test $((now - start_time)) -ge 120
293345
then
294346
exit_healthcheck "''${node}: More than 2m with no newer blocks forged"
@@ -302,19 +354,22 @@ let
302354
local now=$(${coreutils}/bin/date +%s)
303355
local last_block
304356
last_block=$(last_block_transmitted "''${node}")
305-
if test -z "''${last_block}"
357+
# Just for precaution also check if 'jq' returned "null"
358+
if test -z "''${last_block}" || test "''${last_block}" = "null"
306359
then
307360
start_time=$(${coreutils}/bin/cat "../''${node}/healthcheck/start_time")
308361
if test $((now - start_time)) -ge 300
309362
then
363+
# This is a fatal error, exit!
310364
exit_healthcheck "''${node}: More than 5m without a first block sent or received"
311365
fi
312366
else
313367
${coreutils}/bin/echo "''${last_block}" > "../''${node}/healthcheck/last_block"
314-
start_time=$(${coreutils}/bin/echo "''${last_block}" | ${jq}/bin/jq .at)
315-
if test $((now - start_time)) -ge 180
368+
start_time=$(msg_unix_time "''${last_block}")
369+
if test $((now - start_time)) -ge 60
316370
then
317-
exit_healthcheck "''${node}: More than 3m with no newer blocks sent or received\n''${last_block}"
371+
# This is just a warning, don't exit!
372+
msg "''${node}: More than 1m with no newer blocks sent or received\n''${last_block}"
318373
fi
319374
fi
320375
}
@@ -324,27 +379,50 @@ let
324379
local start_time
325380
local now=$(${coreutils}/bin/date +%s)
326381
local last_txs
327-
last_txs=$(last_block_with_txs "''${node}")
328-
if test -z "''${last_txs}"
382+
last_txs=$(last_block_with_txs_received "''${node}")
383+
# Just for precaution also check if 'jq' returned "null"
384+
if test -z "''${last_txs}" || test "''${last_txs}" = "null"
329385
then
330386
start_time=$(${coreutils}/bin/cat "../''${node}/healthcheck/start_time")
331387
if test $((now - start_time)) -ge 300
332388
then
389+
# This is a fatal error, exit!
333390
exit_healthcheck "''${node}: More than 5m without a first block with transactions"
334391
fi
335392
else
336393
${coreutils}/bin/echo "''${last_txs}" > "../''${node}/healthcheck/last_txs"
337-
start_time=$(${coreutils}/bin/echo "''${last_txs}" | ${jq}/bin/jq .at)
394+
start_time=$(msg_unix_time "''${last_txs}")
395+
# TODO: Ask for "data.msg.txIds" in "BlockFetch.Server.SendBlock"
396+
# Doing 3 minutes because there's no log message for block sent
397+
# that includes the transaction details to know if it's empty!
338398
if test $((now - start_time)) -ge 180
339399
then
340-
exit_healthcheck "''${node}: More than 3m with no newer blocks with transactions\n''${last_txs}"
400+
# This is just a warning, don't exit!
401+
msg "''${node}: More than 3m with no newer blocks with transactions\n''${last_txs}"
341402
fi
342403
fi
343404
}
344405
345-
# Helper/auxiliary functions!
406+
# Helper/auxiliary functions! ########################################
346407
######################################################################
347408
409+
# The "at" time has format "2023-05-16 19:57:19.0231Z" and I can't
410+
# parse it using any standard `date` format so I'm stripping the
411+
# milliseconds part and converting to Unix time (Integer).
412+
function msg_unix_time() {
413+
local msg=$1
414+
echo "''${msg}" \
415+
| \
416+
${jq}/bin/jq -r \
417+
'
418+
.at[:20]
419+
|
420+
strptime("%Y-%m-%d %H:%M:%S.")
421+
|
422+
mktime
423+
'
424+
}
425+
348426
# Gets the last "matching" JSON message from a Node's stdout file.
349427
#
350428
# To avoid reading the whole node's "stdout" file its contents are
@@ -369,11 +447,8 @@ let
369447
# finishes and exits, `tac` or `grep` may throw the following error:
370448
# "writing output failed: Broken pipe"
371449
#
372-
# Finally the "at" time has format "2023-05-16 19:57:19.0231Z" and I
373-
# can't parse it using any standard `date` format so I'm stripping the
374-
# milliseconds part and converting to Unix time (Integer). This has to
375-
# be done after filtering for "null" inputs that are the output of
376-
# 'nth(0;...)', if not an error occurs.
450+
# Finally filter for "null" inputs that are the output of 'nth(0;...)'
451+
# when there is no occurrence. Returns the empty string if there is no value.
377452
#
378453
# $1: node name
379454
# $2: jq's query string
@@ -393,15 +468,7 @@ let
393468
--null-input \
394469
"nth(0; inputs | select(''${select}))" \
395470
| \
396-
${jq}/bin/jq \
397-
' select( . != null )
398-
|
399-
(
400-
.at
401-
|=
402-
(.[:20] | strptime("%Y-%m-%d %H:%M:%S.") | mktime)
403-
)
404-
' \
471+
${jq}/bin/jq 'select(. != null)' \
405472
|| \
406473
{ return_code="$?"; pipe_status="''${PIPESTATUS[@]}"; } \
407474
)"
@@ -444,12 +511,7 @@ let
444511
| \
445512
{ ${coreutils}/bin/head --lines=+1 2>/dev/null; } \
446513
| \
447-
${jq}/bin/jq \
448-
'
449-
.at
450-
|=
451-
(.[:20] | strptime("%Y-%m-%d %H:%M:%S.") | mktime)
452-
' \
514+
${jq}/bin/jq 'select(. != null)' \
453515
|| \
454516
{ return_code="$?"; pipe_status="''${PIPESTATUS[@]}"; } \
455517
)"
@@ -640,7 +702,7 @@ let
640702
# "thread": "77",
641703
# "host": "localhost"
642704
# }
643-
function last_block_with_txs() {
705+
function last_block_with_txs_received() {
644706
local node=$1
645707
if ! jq_node_stdout_last "''${node}" \
646708
'
@@ -657,7 +719,7 @@ let
657719
(.data.msg?.txIds != [])
658720
'
659721
then
660-
exit_22 "jq error: last_block_with_txs: ''${node}"
722+
exit_22 "jq error: last_block_with_txs_received: ''${node}"
661723
fi
662724
}
663725

0 commit comments

Comments
 (0)