
Commit b0ded1a

workbench: use the same EC2 instance-id for every node between runs
1 parent 04e1319 commit b0ded1a


nix/workbench/backend/nomad/cloud.sh

Lines changed: 237 additions & 51 deletions
@@ -150,14 +150,16 @@ backend_nomadcloud() {
       then
         msg $(yellow "WARNING: Amazon S3 \"AWS_ACCESS_KEY_ID\" or \"AWS_SECRET_ACCESS_KEY\" envar is not set")
         msg $(blue "INFO: Fetching \"AWS_ACCESS_KEY_ID\" and \"AWS_SECRET_ACCESS_KEY\" from SRE provided Vault for \"Performance and Tracing\"")
-        local aws_credentials="$(wb_nomad vault world aws-s3-credentials)"
+        local aws_credentials
+        aws_credentials="$(wb_nomad vault world aws-s3-credentials)"
         export AWS_ACCESS_KEY_ID=$(echo "${aws_credentials}" | jq -r .data.access_key)
         export AWS_SECRET_ACCESS_KEY=$(echo "${aws_credentials}" | jq -r .data.secret_key)
       fi
       # The Nomad job spec will contain links ("nix_installables" stanza) to
       # the Nix Flake outputs it needs inside the container, these are
       # referenced with a GitHub commit ID inside the "container-specs" file.
-      local gitrev=$(jq -r .gitrev "${profile_container_specs_file}")
+      local gitrev
+      gitrev=$(jq -r .gitrev "${profile_container_specs_file}")
       msg $(blue "INFO: Found GitHub commit with ID \"$gitrev\"")
       # Check if the Nix package was created from a dirty git tree
       if test "$gitrev" = "0000000000000000000000000000000000000000"
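The recurring change in this commit, splitting `local var="$(command)"` into a declaration plus a separate assignment, is not cosmetic: `local` returns its own exit status, so a failing command substitution on the same line is masked and `set -e` never fires (ShellCheck SC2155). A minimal sketch of the difference; the function names are illustrative, not from the commit:

    #!/usr/bin/env bash
    set -euo pipefail

    combined() {
      # "local" itself exits 0, so the failure of "false" is swallowed
      # and execution continues despite "set -e".
      local gitrev="$(false)"
      echo "combined: still running"
    }

    split() {
      local gitrev
      # A bare assignment propagates the command substitution's exit
      # status, so "set -e" aborts right here.
      gitrev="$(false)"
      echo "split: never reached"
    }

    combined   # prints "combined: still running"
    split      # aborts the script before printing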
@@ -172,21 +174,23 @@ backend_nomadcloud() {
       then
         # Check HTTP status code for existence
         # https://docs.github.com/en/rest/commits/commits?apiVersion=2022-11-28#get-a-commit
-        local headers=$(echo "${curl_response}" | jq -s .[1])
+        local headers
+        headers=$(echo "${curl_response}" | jq -s .[1])
         if test "$(echo "${headers}" | jq .http_code)" != 200
         then
           fatal "GitHub commit \"$gitrev\" is not available online!"
         fi
         # Show returned commit info in `git log` fashion
-        local body=$(echo "${curl_response}" | jq -s .[0])
+        local body author_name author_email author_date message
+        body=$(echo "${curl_response}" | jq -s .[0])
+        author_name=$(echo $body | jq -r .commit.author.name)
+        author_email=$(echo $body | jq -r .commit.author.email)
+        author_date=$(echo $body | jq -r .commit.author.date)
+        message=$(echo $body | jq -r .commit.message)
         msg $(green "commit ${gitrev}")
-        local author_name=$(echo $body | jq -r .commit.author.name)
-        local author_email=$(echo $body | jq -r .commit.author.email)
         msg $(green "Author: ${author_name} <${author_email}>")
-        local author_date=$(echo $body | jq -r .commit.author.date)
         msg $(green "Date: ${author_date}")
         msg $(green "\n")
-        local message=$(echo $body | jq -r .commit.message)
         msg $(green "\t${message}\n")
         msg $(green "\n")
       else
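`jq -s` (slurp) reads every JSON document on stdin into one array, so `${curl_response}` is assumed to hold two concatenated documents: the GitHub API body followed by a metadata object carrying `.http_code`. One way to produce that shape is curl's `--write-out '%{json}'` (curl >= 7.70.0); a hedged sketch with an illustrative URL, since the actual invocation sits outside this hunk:

    gitrev="0000000000000000000000000000000000000000"   # illustrative
    curl_response="$(curl --silent \
      --write-out '%{json}' \
      "https://api.github.com/repos/OWNER/REPO/commits/${gitrev}")"
    # Slurp both documents: .[0] is the response body, .[1] curl's stats.
    body=$(echo "${curl_response}"    | jq -s .[0])
    headers=$(echo "${curl_response}" | jq -s .[1])
    echo "${headers}" | jq .http_code   # e.g. 200 or 404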
@@ -220,13 +224,29 @@ backend_nomadcloud() {
       backend_nomad allocate-run-nomad-job-patch-nix "${dir}"

       # Set the placement info and resources accordingly
-      local nomad_job_name=$(jq -r ". [\"job\"] | keys[0]" "${dir}"/nomad/nomad-job.json)
-      if test -z "${WB_SHELL_PROFILE}"
+      local nomad_job_name
+      nomad_job_name=$(jq -r ". [\"job\"] | keys[0]" "${dir}"/nomad/nomad-job.json)
+      if test -z "${WB_SHELL_PROFILE:-}"
       then
         fatal "Envar \"WB_SHELL_PROFILE\" is empty!"
       else
-        # Placement:
-        ############
+        ########################################################################
+        # Fix for region mismatches ############################################
+        ########################################################################
+        # We use "us-east-2" and they use "us-east-1"
+        jq \
+          ".[\"job\"][\"${nomad_job_name}\"][\"datacenters\"] |= [\"eu-central-1\", \"us-east-1\", \"ap-southeast-2\"]" \
+          "${dir}"/nomad/nomad-job.json \
+        | \
+          sponge "${dir}"/nomad/nomad-job.json
+        jq \
+          ".[\"job\"][\"${nomad_job_name}\"][\"group\"] |= with_entries( if (.value.affinity.value == \"us-east-2\") then (.value.affinity.value |= \"us-east-1\") else (.) end )" \
+          "${dir}"/nomad/nomad-job.json \
+        | \
+          sponge "${dir}"/nomad/nomad-job.json
+        ########################################################################
+        # Unique placement: ####################################################
+        ########################################################################
         ## "distinct_hosts": Instructs the scheduler to not co-locate any groups
         ## on the same machine. When specified as a job constraint, it applies
         ## to all groups in the job. When specified as a group constraint, the
@@ -243,19 +263,21 @@ backend_nomadcloud() {
             }
           ]
         '
+        # Adds it as a job-level constraint.
         jq \
           --argjson job_constraints_array "${job_constraints_array}" \
           ".[\"job\"][\"${nomad_job_name}\"].constraint |= \$job_constraints_array" \
           "${dir}"/nomad/nomad-job.json \
         | \
           sponge "${dir}"/nomad/nomad-job.json
-        # Resources:
-        ############
+        ########################################################################
+        # Node class: ##########################################################
+        ########################################################################
         local group_constraints_array
         # "perf" profiles run on the "perf" class
         if test "${WB_SHELL_PROFILE:0:7}" = 'cw-perf'
         then
-          # Right now only "live" is using "perf" class distinct nodes!
+          # Using the "perf" class distinct nodes exclusive to Performance & Tracing!
           group_constraints_array='
             [
               {
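Every patch in this file follows the same in-place edit idiom: `jq ... file | sponge file`, where `sponge` (from moreutils) soaks up all of stdin before opening the output, making it safe to read and rewrite the same file in one pipeline. A toy run of the job-level constraint insertion above, with an invented job name and the `distinct_hosts` constraint shape the surrounding comments describe:

    echo '{"job":{"workbench":{"constraint":null}}}' > nomad-job.json
    job_constraints_array='[{"operator": "distinct_hosts", "value": "true"}]'
    jq \
      --argjson job_constraints_array "${job_constraints_array}" \
      '.["job"]["workbench"].constraint |= $job_constraints_array' \
      nomad-job.json \
    | \
      sponge nomad-job.json
    # nomad-job.json now carries the job-level constraint that forbids
    # co-locating any two groups on the same Nomad client.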
@@ -265,7 +287,33 @@ backend_nomadcloud() {
               }
             ]
           '
-          # Set the resources, only for perf!
+        else
+          # Using "qa" class distinct nodes. Only "short" tests are allowed here.
+          group_constraints_array='
+            [
+              {
+                "operator": "="
+              , "attribute": "${node.class}"
+              , "value": "qa"
+              }
+            ]
+          '
+        fi
+        # Adds it as a group-level constraint.
+        jq \
+          --argjson group_constraints_array "${group_constraints_array}" \
+          ".[\"job\"][\"${nomad_job_name}\"][\"group\"] |= with_entries(.value.constraint = \$group_constraints_array)" \
+          "${dir}"/nomad/nomad-job.json \
+        | \
+          sponge "${dir}"/nomad/nomad-job.json
+        ########################################################################
+        # Memory/resources: ####################################################
+        ########################################################################
+        # Set the resources, only for perf!
+        # When not "perf" (i.e. "cw-qa"), only "short" tests are allowed.
+        if test "${WB_SHELL_PROFILE:0:7}" = 'cw-perf'
+        then
+          # Producer nodes use these specs, make sure they are available!
           # AWS:
           ## c5.2xlarge: 8 vCPU and 16 Memory (GiB)
           ## https://aws.amazon.com/ec2/instance-types/c5/
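Because the node-class constraint is assigned with `with_entries`, every group in the job receives the same `constraint` array whatever its name. A toy demo of the group-level assignment above (job and group names invented):

    group_constraints_array='[{"operator": "=", "attribute": "${node.class}", "value": "qa"}]'
    echo '{"job":{"workbench":{"group":{"node-0":{},"node-1":{}}}}}' \
    | jq \
        --argjson group_constraints_array "${group_constraints_array}" \
        '.["job"]["workbench"]["group"]
         |= with_entries(.value.constraint = $group_constraints_array)'
    # Both "node-0" and "node-1" now carry the "qa" node-class constraint.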
@@ -279,18 +327,36 @@ backend_nomadcloud() {
           ## - memory.totalbytes = 16300142592
           ## Pessimistic: 1,798 MiB / 15,545 MiB Total
           ## Optimistic: 1,396 MiB / 15,545 MiB Total
-          local resources='{
+          local producer_resources='{
             "cores": 8
           , "memory": 13000
           , "memory_max": 15000
           }'
+          # Set this for every non-explorer node
           jq \
-            --argjson resources "${resources}" \
-            ".[\"job\"][\"${nomad_job_name}\"][\"group\"] |= with_entries(.value.task |= with_entries( .value.resources = \$resources ) )" \
+            --argjson producer_resources "${producer_resources}" \
+            " \
+              .[\"job\"][\"${nomad_job_name}\"][\"group\"] \
+              |= \
+              with_entries( \
+                if ( .key != \"explorer\" ) \
+                then ( \
+                  .value.task \
+                  |= \
+                  with_entries( .value.resources = \$producer_resources ) \
+                ) else ( \
+                  . \
+                ) end \
+              ) \
+            " \
             "${dir}"/nomad/nomad-job.json \
           | \
             sponge "${dir}"/nomad/nomad-job.json
-          # The explorer node: Using an "m5.4xlarge" instance type
+          # The explorer node uses these specs, make sure they are available!
+          # AWS:
+          ## m5.4xlarge: 16 vCPU and 64 Memory (GiB)
+          ## https://aws.amazon.com/ec2/instance-types/m5/
+          # Nomad:
           ## - cpu.arch = amd64
           ## - cpu.frequency = 3100
           ## - cpu.modelname = Intel(R) Xeon(R) Platinum 8175M CPU @ 2.50GHz
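The `if .key != "explorer"` branch inside `with_entries` is what keeps the producer resources off the explorer group, whose larger stanza is applied separately right after. A toy demo (job, group and task names invented):

    producer_resources='{"cores": 8, "memory": 13000, "memory_max": 15000}'
    echo '{"job":{"workbench":{"group":{"node-0":{"task":{"node-0":{"resources":null}}},"explorer":{"task":{"explorer":{"resources":null}}}}}}}' \
    | jq \
        --argjson producer_resources "${producer_resources}" \
        '.["job"]["workbench"]["group"]
         |= with_entries(
              if ( .key != "explorer" )
              then ( .value.task |= with_entries( .value.resources = $producer_resources ) )
              else ( . ) end
            )'
    # "node-0" gets the c5.2xlarge-sized resources; "explorer" is left
    # untouched for the dedicated m5.4xlarge stanza that follows.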
@@ -311,38 +377,157 @@ backend_nomadcloud() {
             "${dir}"/nomad/nomad-job.json \
           | \
             sponge "${dir}"/nomad/nomad-job.json
-          # Fix for region mismatches
-          ###########################
-          # We use "us-east-2" and they use "us-east-1"
-          jq \
-            ".[\"job\"][\"${nomad_job_name}\"][\"datacenters\"] |= [\"eu-central-1\", \"us-east-1\", \"ap-southeast-2\"]" \
-            "${dir}"/nomad/nomad-job.json \
-          | \
-            sponge "${dir}"/nomad/nomad-job.json
-          jq \
-            ".[\"job\"][\"${nomad_job_name}\"][\"group\"] |= with_entries( if (.value.affinity.value == \"us-east-2\") then (.value.affinity.value |= \"us-east-1\") else (.) end )" \
+        fi
+        ########################################################################
+        # Reproducibility: #####################################################
+        ########################################################################
+        # If running a value profile on "perf", always use the same placement!
+        # This means node-N always runs on the same Nomad Client/AWS EC2 machine
+        if test "${WB_SHELL_PROFILE:0:13}" = 'cw-perf-value'
+        then
+          # A file with all the available Nomad Clients is needed!
+          # This file is a list of Nomad Clients with a minimum of ".id",
+          # ".datacenter", ".attributes.platform.aws["instance-type"]",
+          # ".attributes.platform.aws.placement["availability-zone"]",
+          # ".attributes.unique.platform.aws["instance-id"]",
+          # ".attributes.unique.platform.aws["public-ipv4"]" and
+          # ".attributes.unique.platform.aws.mac".
+          if test -z "${NOMAD_CLIENTS_FILE:-}" || ! test -f "${NOMAD_CLIENTS_FILE}"
+          then
+            fatal "No \"\$NOMAD_CLIENTS_FILE\""
+          fi
+          # For each (instance-type, datacenter/region) we look incrementally for
+          # the unique AWS EC2 "instance-id" only after ordering the Nomad
+          # Clients by their unique Nomad-provided "id".
+          local count_ap=0 count_eu=0 count_us=0
+          # For each Nomad Job Group
+          local groups_array
+          # Keys MUST be sorted to always get the same order for the same profile!
+          groups_array=$(jq -S -r ".[\"job\"][\"${nomad_job_name}\"][\"group\"] | keys | sort | join (\" \")" "${dir}"/nomad/nomad-job.json)
+          for group_name in ${groups_array[*]}
+          do
+            # Obtain the datacenter as Nomad sees it, not as an AWS attribute.
+            # For example "eu-central-1" instead of "eu-central-1a".
+            local datacenter
+            datacenter=$(jq \
+              -r \
+              ".[\"job\"][\"${nomad_job_name}\"][\"group\"][\"${group_name}\"].affinity.value" \
              "${dir}"/nomad/nomad-job.json \
-          | \
-            sponge "${dir}"/nomad/nomad-job.json
-        # Non "perf" profiles run on the "qa" class
-        else
-          # Right now only testing, using "qa" class distinct nodes!
-          group_constraints_array='
-            [
-              {
-                "operator": "="
-              , "attribute": "${node.class}"
-              , "value": "qa"
-              }
-            ]
-          '
+            )
+            # For each Nomad Job Group Task
+            local tasks_array
+            # Keys MUST be sorted to always get the same order for the same profile!
+            tasks_array=$(jq -S -r ".[\"job\"][\"${nomad_job_name}\"][\"group\"][\"${group_name}\"][\"task\"] | keys | sort | join (\" \")" "${dir}"/nomad/nomad-job.json)
+            for task_name in ${tasks_array[*]}
+            do
+              local count instance_type
+              if test "${task_name}" = "explorer"
+              then
+                # There is only one such instance!
+                instance_type="m5.4xlarge"
+                count=0
+              else
+                # There are many of these instances and we need to always fetch
+                # them in the same order for reproducibility.
+                instance_type="c5.2xlarge"
+                if test "${datacenter}" = "ap-southeast-2"
+                then
+                  count="${count_ap}"
+                  count_ap=$(( count_ap + 1 ))
+                elif test "${datacenter}" = "eu-central-1"
+                then
+                  count="${count_eu}"
+                  count_eu=$(( count_eu + 1 ))
+                elif test "${datacenter}" = "us-east-1"
+                then
+                  count="${count_us}"
+                  count_us=$(( count_us + 1 ))
+                fi
+              fi
+              # Get the actual client for this datacenter and instance type.
+              local actual_client
+              actual_client=$(jq \
+                " . \
+                | \
+                  sort_by(.id) \
+                | \
+                  map(select(.datacenter == \"${datacenter}\")) \
+                | \
+                  map(select(.attributes.platform.aws[\"instance-type\"] == \"${instance_type}\")) \
+                | \
+                  .[${count}] \
+                " \
+                "${NOMAD_CLIENTS_FILE}" \
+              )
+              local instance_id availability_zone public_ipv4 mac_address
+              instance_id="$( \
+                echo "${actual_client}" \
+                | \
+                  jq -r \
+                    '.attributes.unique.platform.aws["instance-id"]' \
+              )"
+              availability_zone="$( \
+                echo "${actual_client}" \
+                | \
+                  jq -r \
+                    '.attributes.platform.aws.placement["availability-zone"]' \
+              )"
+              public_ipv4="$( \
+                echo "${actual_client}" \
+                | \
+                  jq -r \
+                    '.attributes.unique.platform.aws["public-ipv4"]' \
+              )"
+              mac_address="$( \
+                echo "${actual_client}" \
+                | \
+                  jq -r \
+                    '.attributes.unique.platform.aws.mac' \
+              )"
+              # Pin the actual node to a specific Nomad Client / AWS instance
+              # by appending the constraints below to the group constraints
+              # already in place.
+              # We pin it to a couple of AWS-specific attributes so that if SRE
+              # changes something related to Nomad Clients or AWS instances we
+              # may hopefully notice it when the job fails to start (placement
+              # errors).
+              local group_constraints_array_plus="
+                [ \
+                  { \
+                    \"attribute\": \"\${attr.platform.aws.instance-type}\" \
+                  , \"value\": \"${instance_type}\" \
+                  } \
+                ,
+                  { \
+                    \"attribute\": \"\${attr.platform.aws.placement.availability-zone}\" \
+                  , \"value\": \"${availability_zone}\" \
+                  } \
+                ,
+                  { \
+                    \"attribute\": \"\${attr.unique.platform.aws.instance-id}\" \
+                  , \"value\": \"${instance_id}\" \
+                  } \
+                ,
+                  { \
+                    \"attribute\": \"\${attr.unique.platform.aws.public-ipv4}\" \
+                  , \"value\": \"${public_ipv4}\" \
+                  } \
+                ,
+                  { \
+                    \"attribute\": \"\${attr.unique.platform.aws.mac}\" \
+                  , \"value\": \"${mac_address}\" \
+                  } \
+                ] \
+              "
+              jq \
+                --argjson group_constraints_array_plus "${group_constraints_array_plus}" \
+                ".[\"job\"][\"${nomad_job_name}\"][\"group\"][\"${group_name}\"][\"constraint\"] |= ( . + \$group_constraints_array_plus)" \
+                "${dir}"/nomad/nomad-job.json \
+              | \
+                sponge "${dir}"/nomad/nomad-job.json
+            done
+          done
         fi
-        jq \
-          --argjson group_constraints_array "${group_constraints_array}" \
-          ".[\"job\"][\"${nomad_job_name}\"][\"group\"] |= with_entries(.value.constraint = \$group_constraints_array)" \
-          "${dir}"/nomad/nomad-job.json \
-        | \
-          sponge "${dir}"/nomad/nomad-job.json
       fi

       # Store a summary of the job.
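The reproducible placement above stands or falls with `NOMAD_CLIENTS_FILE`, which is assumed to parse as a JSON array of Nomad client objects carrying at least the fields named in the comments. A hedged single-entry example (every value invented):

    [
      { "id":         "00000000-1111-2222-3333-444444444444"
      , "datacenter": "eu-central-1"
      , "attributes":
        { "platform":
          { "aws":
            { "instance-type": "c5.2xlarge"
            , "placement": { "availability-zone": "eu-central-1a" }
            }
          }
        , "unique":
          { "platform":
            { "aws":
              { "instance-id": "i-0123456789abcdef0"
              , "public-ipv4": "203.0.113.10"
              , "mac": "0e:ad:be:ef:00:01"
              }
            }
          }
        }
      }
    ]

With that shape, `sort_by(.id)` freezes the order, the two `select`s narrow by region and instance type, and `.[${count}]` picks the N-th client, so the same profile always maps the same node to the same EC2 instance-id:

    jq 'sort_by(.id)
        | map(select(.datacenter == "eu-central-1"))
        | map(select(.attributes.platform.aws["instance-type"] == "c5.2xlarge"))
        | .[0].attributes.unique.platform.aws["instance-id"]' \
      "${NOMAD_CLIENTS_FILE}"
    # => "i-0123456789abcdef0" for the example entry above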
@@ -360,7 +545,8 @@ backend_nomadcloud() {
           , "tasks": (
               .task | with_entries(
                 .value |= {
-                  "resources": .resources
+                  "constraint": .constraint
+                , "resources": .resources
                 , "nix_installables": .config.nix_installables
                 , "templates": ( .template | map(.destination) )
                 }
