3232
3333 # Strict runtime
3434 ################
35+
3536 # e: Immediately exit if any command has a non-zero exit status
3637 # u: Reference to a variable that was not previously defined is an error
3738 # pipefail: Any failed command in a pipeline is used as return code
3839 set -euo pipefail
3940
4041 # Fetch profile parameters
4142 ##########################
42- network_magic=$(${ jq } /bin/jq .genesis.network_magic ../profile.json)
43- slot_duration=$(${ jq } /bin/jq .genesis.slot_duration ../profile.json)
44- epoch_length=$(${ jq } /bin/jq .genesis.epoch_length ../profile.json)
45- active_slots_coeff=$(${ jq } /bin/jq .genesis.active_slots_coeff ../profile.json)
46- #active_slots="$((epoch_length * active_slots_coeff))"
43+
44+ network_magic="$(${ jq } /bin/jq .genesis.network_magic ../profile.json)"
45+ # Slot duration in seconds
46+ slot_duration="$(${ jq } /bin/jq .genesis.slot_duration ../profile.json)"
47+ # Number of slots per epoch
48+ epoch_length="$(${ jq } /bin/jq .genesis.epoch_length ../profile.json)"
49+ # Average (%) of slots per epoch that can be used to mint blocks
50+ active_slots_coeff="$(${ jq } /bin/jq .genesis.active_slots_coeff ../profile.json)"
51+ # The number of active slots per epoch
52+ active_slots="$(${ jq } /bin/jq --null-input -r \
53+ "'' ${epoch_length} * '' ${active_slots_coeff}" \
54+ )"
4755 ${ coreutils } /bin/echo "profile.json:"
4856 ${ coreutils } /bin/echo "- network_magic: '' ${network_magic}"
4957 ${ coreutils } /bin/echo "- slot_duration: '' ${slot_duration}"
5058 ${ coreutils } /bin/echo "- epoch_length: '' ${epoch_length}"
5159 ${ coreutils } /bin/echo "- active_slots_coeff: '' ${active_slots_coeff}"
52- # ${ coreutils } /bin/echo "- active_slots: '' ${active_slots}"
60+ ${ coreutils } /bin/echo "- active_slots: '' ${active_slots}"
5361
5462 # Fetch node names (Including "explorer" nodes)
5563 ###############################################
56- node_specs_nodes=$(${ jq } /bin/jq --raw-output "keys | join (\" \")" ../node-specs.json)
57- node_specs_pools=$(${ jq } /bin/jq 'map(select(.kind == "pool")) | length' ../node-specs.json)
64+
65+ node_specs_nodes=$(${ jq } /bin/jq --raw-output \
66+ "keys | join (\" \")" \
67+ ../node-specs.json \
68+ )
69+ node_specs_pools=$(${ jq } /bin/jq \
70+ 'map(select(.kind == "pool")) | length' \
71+ ../node-specs.json \
72+ )
5873 ${ coreutils } /bin/echo "node-specs.json:"
5974 ${ coreutils } /bin/echo "- Nodes: ['' ${node_specs_nodes[*]}]"
6075 ${ coreutils } /bin/echo "- Pools: '' ${node_specs_pools}"
6176
6277 # Look for available nodes and allocate healthcheck
6378 ###################################################
79+
6480 nodes=()
6581 now=$(${ coreutils } /bin/date +%s)
6682 for node in '' ${node_specs_nodes[*]}
8096
8197 # Look for the generator
8298 ########################
99+ generator=0
83100 if test -d "../generator"
84101 then
102+ generator=1
85103 ${ coreutils } /bin/echo "Found deployed generator"
86104 else
105+ generator=0
87106 ${ coreutils } /bin/echo "Found no deployed generator"
88107 fi
89108
92111
93112 # The main function, called at the end of the file/script.
94113 function healthcheck() {
95- # Start the healthcheck infinite loop
114+ # Ignore PIPE "errors", mixing 'jq', 'tac', 'grep' and/or 'head'
115+ # will eventually throw a PIPE exception (see jq_node_stdout_last).
96116 trap "${ coreutils } /bin/echo \"trap PIPE\" >&2" PIPE
117+
97118 msg "Started!"
119+
98120 # Do a single, one-time networking/latency test!
99121 for node in '' ${nodes[*]}
100122 do
101123 latency_topology_producers "'' ${node}"
102124 done
125+
126+ # Start the healthcheck infinite loop
103127 while true
104128 do
129+
130+ # First available nodes
105131 for node in '' ${nodes[*]}
106132 do
107133 healthcheck_node "'' ${node}"
108134 done
109- healthcheck_generator
110- ${ coreutils } /bin/sleep 5 # Enough?
135+
136+ # Then generator if available
137+ if test "'' ${generator}" != "0"
138+ then
139+ healthcheck_generator
140+ fi
141+
142+ if test "'' ${#nodes[@]}" = "1"
143+ then
144+ # This healthcheck run is monitoring only one node
145+ # This is the case for all Nomad runs, either local or cloud
146+ ${ coreutils } /bin/sleep 10
147+ else
148+ # This healthcheck run is monitoring many nodes
149+ # Local/supervisord uses one healthcheck for the entire cluster
150+ ${ coreutils } /bin/sleep 1
151+ fi
152+
111153 done
154+
112155 trap - PIPE
113156 }
114157
158+ # Latency ############################################################
115159 ######################################################################
116160
117161 function latency_topology_producers() {
134178 done
135179 }
136180
181+ # Node ###############################################################
182+ ######################################################################
183+
137184 function healthcheck_node() {
138185 local node=$1
139- if is_program_running "'' ${node}"
186+ # Checks if the node has not exited with errors
187+ if assert_program_running "'' ${node}"
140188 then
141189 # The node is running! ###########################################
142190 ##################################################################
@@ -175,12 +223,15 @@ let
175223 }
176224
177225 function healthcheck_generator() {
178- # Only checks that the generator has not exited with errors
179- is_program_running "generator" || true
226+ # Checks if the generator has not exited with errors
227+ assert_program_running "generator" || true
180228 }
181229
182- function is_program_running() {
230+ # Error if program exits with a non-zero exit code, else returns
231+ # 'true' if running or 'false' if not running.
232+ function assert_program_running() {
183233 local program=$1
234+ # Using the "exit_code" files created by our supervisord config
184235 local exit_code_path="../'' ${program}/exit_code"
185236 # File exists and is a regular file?
186237 if ! test -f "'' ${exit_code_path}"
279330 local now=$(${ coreutils } /bin/date +%s)
280331 local last_forged
281332 last_forged=$(last_block_forged "'' ${node}")
282- if test -z "'' ${last_forged}"
333+ # Just for precaution also check if 'jq' returned "null"
334+ if test -z "'' ${last_forged}" || test "'' ${last_forged}" = "null"
283335 then
284336 start_time=$(${ coreutils } /bin/cat "../'' ${node}/healthcheck/start_time")
285337 if test $((now - start_time)) -ge 300
288340 fi
289341 else
290342 ${ coreutils } /bin/echo "'' ${last_forged}" > "../'' ${node}/healthcheck/last_forged"
291- start_time=$(${ coreutils } /bin/echo "'' ${last_forged}" | ${ jq } /bin/jq .at )
343+ start_time=$(msg_unix_time "'' ${last_forged}")
292344 if test $((now - start_time)) -ge 120
293345 then
294346 exit_healthcheck "'' ${node}: More than 2m with no newer blocks forged"
@@ -302,19 +354,22 @@ let
302354 local now=$(${ coreutils } /bin/date +%s)
303355 local last_block
304356 last_block=$(last_block_transmitted "'' ${node}")
305- if test -z "'' ${last_block}"
357+ # Just for precaution also check if 'jq' returned "null"
358+ if test -z "'' ${last_block}" || test "'' ${last_block}" = "null"
306359 then
307360 start_time=$(${ coreutils } /bin/cat "../'' ${node}/healthcheck/start_time")
308361 if test $((now - start_time)) -ge 300
309362 then
363+ # This is a fatal error, exit!
310364 exit_healthcheck "'' ${node}: More than 5m without a first block sent or received"
311365 fi
312366 else
313367 ${ coreutils } /bin/echo "'' ${last_block}" > "../'' ${node}/healthcheck/last_block"
314- start_time=$(${ coreutils } /bin/echo "'' ${last_block}" | ${ jq } /bin/jq .at )
315- if test $((now - start_time)) -ge 180
368+ start_time=$(msg_unix_time "'' ${last_block}")
369+ if test $((now - start_time)) -ge 60
316370 then
317- exit_healthcheck "'' ${node}: More than 3m with no newer blocks sent or received\n'' ${last_block}"
371+ # This is just a warning, don't exit!
372+ msg "'' ${node}: More than 1m with no newer blocks sent or received\n'' ${last_block}"
318373 fi
319374 fi
320375 }
@@ -324,27 +379,50 @@ let
324379 local start_time
325380 local now=$(${ coreutils } /bin/date +%s)
326381 local last_txs
327- last_txs=$(last_block_with_txs "'' ${node}")
328- if test -z "'' ${last_txs}"
382+ last_txs=$(last_block_with_txs_received "'' ${node}")
383+ # Just for precaution also check if 'jq' returned "null"
384+ if test -z "'' ${last_txs}" || test "'' ${last_txs}" = "null"
329385 then
330386 start_time=$(${ coreutils } /bin/cat "../'' ${node}/healthcheck/start_time")
331387 if test $((now - start_time)) -ge 300
332388 then
389+ # This is a fatal error, exit!
333390 exit_healthcheck "'' ${node}: More than 5m without a first block with transactions"
334391 fi
335392 else
336393 ${ coreutils } /bin/echo "'' ${last_txs}" > "../'' ${node}/healthcheck/last_txs"
337- start_time=$(${ coreutils } /bin/echo "'' ${last_txs}" | ${ jq } /bin/jq .at)
394+ start_time=$(msg_unix_time "'' ${last_txs}")
395+ # TODO: Ask for "data.msg.txIds" in "BlockFetch.Server.SendBlock"
396+ # Doing 3 minutes because there's no log message for block sent
397+ # that includes the transaction details to know if its empty!
338398 if test $((now - start_time)) -ge 180
339399 then
340- exit_healthcheck "'' ${node}: More than 3m with no newer blocks with transactions\n'' ${last_txs}"
400+ # This is just a warning, don't exit!
401+ msg "'' ${node}: More than 3m with no newer blocks with transactions\n'' ${last_txs}"
341402 fi
342403 fi
343404 }
344405
345- # Helper/auxiliary functions!
406+ # Helper/auxiliary functions! ########################################
346407 ######################################################################
347408
409+ # The "at" time has format "2023-05-16 19:57:19.0231Z" and I can't
410+ # parse it using any standard `date` format so I'm stripping the
411+ # milliseconds part and converting to Unix time (Integer).
412+ function msg_unix_time() {
# $1: a JSON message (as a string) that contains an "at" timestamp field.
# Prints to stdout the "at" time converted to Unix time (jq's `mktime`).
413+ local msg=$1
# Only the first 20 characters of "at" are kept, e.g. "2023-05-16 19:57:19."
# (the trailing dot is included, hence the "." ending the strptime format).
414+ echo "'' ${msg}" \
415+ | \
416+ ${ jq } /bin/jq -r \
417+ '
418+ .at[:20]
419+ |
420+ strptime("%Y-%m-%d %H:%M:%S.")
421+ |
422+ mktime
423+ '
424+ }
425+
348426 # Gets the last "matching" JSON message from a Node's stdout file.
349427 #
350428 # To avoid reading the whole node's "stdout" file its contents are
369447 # finishes and exits, `tac` or `grep` may throw the following error:
370448 # "writing output failed: Broken pipe"
371449 #
372- # Finally the "at" time has format "2023-05-16 19:57:19.0231Z" and I
373- # can't parse it using any standard `date` format so I'm stripping the
374- # milliseconds part and converting to Unix time (Integer). This has to
375- # be done after filtering for "null" inputs that are the output of
376- # 'nth(0;...)', if not an error occurs.
450+ # Finally filter out the "null" that 'nth(0;...)' outputs when there is
451+ # no matching message, so the empty string is returned if no value.
377452 #
378453 # $1: node name
379454 # $2: jq's query string
393468 --null-input \
394469 "nth(0; inputs | select('' ${select}))" \
395470 | \
396- ${ jq } /bin/jq \
397- ' select( . != null )
398- |
399- (
400- .at
401- |=
402- (.[:20] | strptime("%Y-%m-%d %H:%M:%S.") | mktime)
403- )
404- ' \
471+ ${ jq } /bin/jq 'select(. != null)' \
405472 || \
406473 { return_code="$?"; pipe_status="'' ${PIPESTATUS[@]}"; } \
407474 )"
444511 | \
445512 { ${ coreutils } /bin/head --lines=+1 2>/dev/null; } \
446513 | \
447- ${ jq } /bin/jq \
448- '
449- .at
450- |=
451- (.[:20] | strptime("%Y-%m-%d %H:%M:%S.") | mktime)
452- ' \
514+ ${ jq } /bin/jq 'select(. != null)' \
453515 || \
454516 { return_code="$?"; pipe_status="'' ${PIPESTATUS[@]}"; } \
455517 )"
640702 # "thread": "77",
641703 # "host": "localhost"
642704 # }
643- function last_block_with_txs () {
705+ function last_block_with_txs_received () {
644706 local node=$1
645707 if ! jq_node_stdout_last "'' ${node}" \
646708 '
657719 (.data.msg?.txIds != [])
658720 '
659721 then
660- exit_22 "jq error: last_block_with_txs : '' ${node}"
722+ exit_22 "jq error: last_block_with_txs_received : '' ${node}"
661723 fi
662724 }
663725
0 commit comments