@@ -150,14 +150,16 @@ backend_nomadcloud() {
150150 then
151151 msg $( yellow " WARNING: Amazon S3 \" AWS_ACCESS_KEY_ID\" or \" AWS_SECRET_ACCESS_KEY\" envar is not set" )
152152 msg $( blue " INFO: Fetching \" AWS_ACCESS_KEY_ID\" and \" AWS_SECRET_ACCESS_KEY\" from SRE provided Vault for \" Performance and Tracing\" " )
153- local aws_credentials=" $( wb_nomad vault world aws-s3-credentials) "
153+ local aws_credentials
154+ aws_credentials=" $( wb_nomad vault world aws-s3-credentials) "
154155 export AWS_ACCESS_KEY_ID=$( echo " ${aws_credentials} " | jq -r .data.access_key)
155156 export AWS_SECRET_ACCESS_KEY=$( echo " ${aws_credentials} " | jq -r .data.secret_key)
156157 fi
157158 # The Nomad job spec will contain links ("nix_installables" stanza) to
158159 # the Nix Flake outputs it needs inside the container, these are
159160 # referenced with a GitHub commit ID inside the "container-specs" file.
160- local gitrev=$( jq -r .gitrev " ${profile_container_specs_file} " )
161+ local gitrev
162+ gitrev=$( jq -r .gitrev " ${profile_container_specs_file} " )
161163 msg $( blue " INFO: Found GitHub commit with ID \" $gitrev \" " )
162164 # Check if the Nix package was created from a dirty git tree
163165 if test " $gitrev " = " 0000000000000000000000000000000000000000"
@@ -172,21 +174,23 @@ backend_nomadcloud() {
172174 then
173175 # Check HTTP status code for existence
174176 # https://docs.github.com/en/rest/commits/commits?apiVersion=2022-11-28#get-a-commit
175- local headers=$( echo " ${curl_response} " | jq -s .[1])
177+ local headers
178+ headers=$( echo " ${curl_response} " | jq -s .[1])
176179 if test " $( echo " ${headers} " | jq .http_code) " ! = 200
177180 then
178181 fatal " GitHub commit \" $gitrev \" is not available online!"
179182 fi
180183 # Show returned commit info in `git log` fashion
181- local body=$( echo " ${curl_response} " | jq -s .[0])
184+ local body author_name author_email author_date message
185+ body=$( echo " ${curl_response} " | jq -s .[0])
186+ author_name=$( echo $body | jq -r .commit.author.name)
187+ author_email=$( echo $body | jq -r .commit.author.email)
188+ author_date=$( echo $body | jq -r .commit.author.date)
189+ message=$( echo $body | jq -r .commit.message)
182190 msg $( green " commit ${gitrev} " )
183- local author_name=$( echo $body | jq -r .commit.author.name)
184- local author_email=$( echo $body | jq -r .commit.author.email)
185191 msg $( green " Author: ${author_name} <${author_email} >" )
186- local author_date=$( echo $body | jq -r .commit.author.date)
187192 msg $( green " Date: ${author_date} " )
188193 msg $( green " \n" )
189- local message=$( echo $body | jq -r .commit.message)
190194 msg $( green " \t${message} \n" )
191195 msg $( green " \n" )
192196 else
@@ -220,13 +224,29 @@ backend_nomadcloud() {
220224 backend_nomad allocate-run-nomad-job-patch-nix " ${dir} "
221225
222226 # Set the placement info and resources accordingly
223- local nomad_job_name=$( jq -r " . [\" job\" ] | keys[0]" " ${dir} " /nomad/nomad-job.json)
224- if test -z " ${WB_SHELL_PROFILE} "
227+ local nomad_job_name
228+ nomad_job_name=$( jq -r " . [\" job\" ] | keys[0]" " ${dir} " /nomad/nomad-job.json)
229+ if test -z " ${WB_SHELL_PROFILE:- } "
225230 then
226231 fatal " Envar \" WB_SHELL_PROFILE\" is empty!"
227232 else
228- # Placement:
229- # ###########
233+ # #######################################################################
234+ # Fix for region mismatches ############################################
235+ # #######################################################################
236+ # We use "us-east-2" and they use "us-east-1"
237+ jq \
238+ " .[\" job\" ][\" ${nomad_job_name} \" ][\" datacenters\" ] |= [\" eu-central-1\" , \" us-east-1\" , \" ap-southeast-2\" ]" \
239+ " ${dir} " /nomad/nomad-job.json \
240+ | \
241+ sponge " ${dir} " /nomad/nomad-job.json
242+ jq \
243+ " .[\" job\" ][\" ${nomad_job_name} \" ][\" group\" ] |= with_entries( if (.value.affinity.value == \" us-east-2\" ) then (.value.affinity.value |= \" us-east-1\" ) else (.) end )" \
244+ " ${dir} " /nomad/nomad-job.json \
245+ | \
246+ sponge " ${dir} " /nomad/nomad-job.json
247+ # #######################################################################
248+ # Unique placement: ####################################################
249+ # #######################################################################
230250 # # "distinct_hosts": Instructs the scheduler to not co-locate any groups
231251 # # on the same machine. When specified as a job constraint, it applies
232252 # # to all groups in the job. When specified as a group constraint, the
@@ -243,19 +263,21 @@ backend_nomadcloud() {
243263 }
244264 ]
245265 '
266+ # Adds it as a job level constraint.
246267 jq \
247268 --argjson job_constraints_array " ${job_constraints_array} " \
248269 " .[\" job\" ][\" ${nomad_job_name} \" ].constraint |= \$ job_constraints_array" \
249270 " ${dir} " /nomad/nomad-job.json \
250271 | \
251272 sponge " ${dir} " /nomad/nomad-job.json
252- # Resources:
253- # ###########
273+ # #######################################################################
274+ # Node class: ##########################################################
275+ # #######################################################################
254276 local group_constraints_array
255277 # "perf" profiles run on the "perf" class
256278 if test " ${WB_SHELL_PROFILE: 0: 7} " = ' cw-perf'
257279 then
258- # Right now only "live" is using "perf" class distinct nodes!
280+ # Using Performance & Tracing exclusive "perf" class distinct nodes!
259281 group_constraints_array='
260282 [
261283 {
@@ -265,7 +287,33 @@ backend_nomadcloud() {
265287 }
266288 ]
267289 '
268- # Set the resources, only for perf!
290+ else
291+ # Using "qa" class distinct nodes. Only "short" test allowed here.
292+ group_constraints_array='
293+ [
294+ {
295+ "operator": "="
296+ , "attribute": "${node.class}"
297+ , "value": "qa"
298+ }
299+ ]
300+ '
301+ fi
302+ # Adds it as a group level constraint.
303+ jq \
304+ --argjson group_constraints_array " ${group_constraints_array} " \
305+ " .[\" job\" ][\" ${nomad_job_name} \" ][\" group\" ] |= with_entries(.value.constraint = \$ group_constraints_array)" \
306+ " ${dir} " /nomad/nomad-job.json \
307+ | \
308+ sponge " ${dir} " /nomad/nomad-job.json
309+ # #######################################################################
310+ # Memory/resources: ####################################################
311+ # #######################################################################
312+ # Set the resources, only for perf!
313+ # When not "perf", when "cw-qa", only "short" tests are allowed.
314+ if test " ${WB_SHELL_PROFILE: 0: 7} " = ' cw-perf'
315+ then
316+ # Producer nodes use these specs, make sure they are available!
269317 # AWS:
270318 # # c5.2xlarge: 8 vCPU and 16 Memory (GiB)
271319 # # https://aws.amazon.com/ec2/instance-types/c5/
@@ -279,18 +327,36 @@ backend_nomadcloud() {
279327 # # - memory.totalbytes = 16300142592
280328 # # Pessimistic: 1,798 MiB / 15,545 MiB Total
281329 # # Optimistic: 1,396 MiB / 15,545 MiB Total
282- local resources =' {
330+ local producer_resources =' {
283331 "cores": 8
284332 , "memory": 13000
285333 , "memory_max": 15000
286334 }'
335+ # Set this for every non-explorer node
287336 jq \
288- --argjson resources " ${resources} " \
289- " .[\" job\" ][\" ${nomad_job_name} \" ][\" group\" ] |= with_entries(.value.task |= with_entries( .value.resources = \$ resources ) )" \
337+ --argjson producer_resources " ${producer_resources} " \
338+ " \
339+ .[\" job\" ][\" ${nomad_job_name} \" ][\" group\" ] \
340+ |= \
341+ with_entries( \
342+ if ( .key != \" explorer\" ) \
343+ then ( \
344+ .value.task \
345+ |= \
346+ with_entries( .value.resources = \$ producer_resources ) \
347+ ) else ( \
348+ . \
349+ ) end \
350+ ) \
351+ " \
290352 " ${dir} " /nomad/nomad-job.json \
291353 | \
292354 sponge " ${dir} " /nomad/nomad-job.json
293- # The explorer node: Using an "m5.4xlarge" instance type
355+ # The explorer node uses these specs, make sure they are available!
356+ # AWS
357+ # # m5.4xlarge: 8 vCPU and 16 Memory (GiB)
358+ # # https://aws.amazon.com/ec2/instance-types/m5/
359+ # Nomad:
294360 # # - cpu.arch = amd64
295361 # # - cpu.frequency = 3100
296362 # # - cpu.modelname = Intel(R) Xeon(R) Platinum 8175M CPU @ 2.50GHz
@@ -311,38 +377,157 @@ backend_nomadcloud() {
311377 " ${dir} " /nomad/nomad-job.json \
312378 | \
313379 sponge " ${dir} " /nomad/nomad-job.json
314- # Fix for region mismatches
315- # ##########################
316- # We use "us-east-2" and they use "us-east-1"
317- jq \
318- " .[\" job\" ][\" ${nomad_job_name} \" ][\" datacenters\" ] |= [\" eu-central-1\" , \" us-east-1\" , \" ap-southeast-2\" ]" \
319- " ${dir} " /nomad/nomad-job.json \
320- | \
321- sponge " ${dir} " /nomad/nomad-job.json
322- jq \
323- " .[\" job\" ][\" ${nomad_job_name} \" ][\" group\" ] |= with_entries( if (.value.affinity.value == \" us-east-2\" ) then (.value.affinity.value |= \" us-east-1\" ) else (.) end )" \
380+ fi
381+ # #######################################################################
382+ # Reproducibility: #####################################################
383+ # #######################################################################
384+ # For "value" profiles on "perf", always use the same placement!
385+ # This means node-N always runs on the same Nomad Client/AWS EC2 machine
386+ if test " ${WB_SHELL_PROFILE: 0: 13} " = ' cw-perf-value'
387+ then
388+ # A file with all the available Nomad Clients is needed!
389+ # This file is a list of Nomad Clients with a minimum of ".id",
390+ # ".datacenter", ".attributes.platform.aws["instance-type"]",
391+ # ".attributes.platform.aws.placement["availability-zone"]",
392+ # ".attributes.unique.platform.aws["instance-id"]",
393+ # ".attributes.unique.platform.aws["public-ipv4"]" and
394+ # ".attributes.unique.platform.aws.mac".
395+ if test -z " ${NOMAD_CLIENTS_FILE:- } " || ! test -f " ${NOMAD_CLIENTS_FILE} "
396+ then
397+ fatal " No \"\$ NOMAD_CLIENTS_FILE\" "
398+ fi
399+ # For each (instance-type, datacenter/region) we look incrementally for
400+ # the unique AWS EC2 "instance-id" only after ordering the Nomad
401+ # Clients by its unique Nomad provided "id".
402+ local count_ap=0 count_eu=0 count_us=0
403+ # For each Nomad Job Group
404+ local groups_array
405+ # Keys MUST be sorted to always get the same order for the same profile!
406+ groups_array=$( jq -S -r " .[\" job\" ][\" ${nomad_job_name} \" ][\" group\" ] | keys | sort | join (\" \" )" " ${dir} " /nomad/nomad-job.json)
407+ for group_name in ${groups_array[*]}
408+ do
409+ # Obtain the datacenter as Nomad sees it, not as an AWS attribute.
410+ # For example "eu-central-1" instead of "eu-central-1a".
411+ local datacenter
412+ datacenter=$( jq \
413+ -r \
414+ " .[\" job\" ][\" ${nomad_job_name} \" ][\" group\" ][\" ${group_name} \" ].affinity.value" \
324415 " ${dir} " /nomad/nomad-job.json \
325- | \
326- sponge " ${dir} " /nomad/nomad-job.json
327- # Non "perf" profiles run on the "qa" class
328- else
329- # Right now only testing, using "qa" class distinct nodes!
330- group_constraints_array='
331- [
332- {
333- "operator": "="
334- , "attribute": "${node.class}"
335- , "value": "qa"
336- }
337- ]
338- '
416+ )
417+ # For each Nomad Job Group Task
418+ local tasks_array
419+ # Keys MUST be sorted to always get the same order for the same profile!
420+ tasks_array=$( jq -S -r " .[\" job\" ][\" ${nomad_job_name} \" ][\" group\" ][\" ${group_name} \" ][\" task\" ] | keys | sort | join (\" \" )" " ${dir} " /nomad/nomad-job.json)
421+ for task_name in ${tasks_array[*]}
422+ do
423+ local count instance_type
424+ if test " ${task_name} " = " explorer"
425+ then
426+ # There is only one of this instance!
427+ instance_type=" m5.4xlarge"
428+ count=0
429+ else
430+ # There are many of these instances and we need to always fetch
431+ # them in the same order for reproducibility.
432+ instance_type=" c5.2xlarge"
433+ if test " ${datacenter} " = " ap-southeast-2"
434+ then
435+ count=" ${count_ap} "
436+ count_ap=$(( count_ap + 1 ))
437+ elif test " ${datacenter} " = " eu-central-1"
438+ then
439+ count=" ${count_eu} "
440+ count_eu=$(( count_eu + 1 ))
441+ elif test " ${datacenter} " = " us-east-1"
442+ then
443+ count=" ${count_us} "
444+ count_us=$(( count_us + 1 ))
445+ fi
446+ fi
447+ # Get the actual client for this datacenter and instance type.
448+ local actual_client
449+ actual_client=$( jq \
450+ " . \
451+ | \
452+ sort_by(.id) \
453+ | \
454+ map(select(.datacenter == \" ${datacenter} \" )) \
455+ | \
456+ map(select(.attributes.platform.aws[\" instance-type\" ] == \" ${instance_type} \" )) \
457+ | \
458+ .[${count} ] \
459+ " \
460+ " ${NOMAD_CLIENTS_FILE} " \
461+ )
462+ local instance_id availability_zone public_ipv4 mac_address
463+ instance_id=" $( \
464+ echo " ${actual_client} " \
465+ | \
466+ jq -r \
467+ ' .attributes.unique.platform.aws["instance-id"]' \
468+ ) "
469+ availability_zone=" $( \
470+ echo " ${actual_client} " \
471+ | \
472+ jq -r \
473+ ' .attributes.platform.aws.placement["availability-zone"]' \
474+ ) "
475+ public_ipv4=" $( \
476+ echo " ${actual_client} " \
477+ | \
478+ jq -r \
479+ ' .attributes.unique.platform.aws["public-ipv4"]' \
480+ ) "
481+ mac_address=" $( \
482+ echo " ${actual_client} " \
483+ | \
484+ jq -r \
485+ ' .attributes.unique.platform.aws.mac' \
486+ ) "
487+ # Pin the actual node to a specific Nomad Client / AWS instance
488+ # by appending below constraints to the already there group
489+ # constraints.
490+ # We pin it to a couple of AWS-specific attributes so if SRE
491+ # changes something related to Nomad Clients or AWS instances we
492+ # may hopefully notice it when the job fails to start (placement
493+ # errors).
494+ local group_constraints_array_plus="
495+ [ \
496+ { \
497+ \" attribute\" : \"\$ {attr.platform.aws.instance-type}\" \
498+ , \" value\" : \" ${instance_type} \" \
499+ } \
500+ ,
501+ { \
502+ \" attribute\" : \"\$ {attr.platform.aws.placement.availability-zone}\" \
503+ , \" value\" : \" ${availability_zone} \" \
504+ } \
505+ ,
506+ { \
507+ \" attribute\" : \"\$ {attr.unique.platform.aws.instance-id}\" \
508+ , \" value\" : \" ${instance_id} \" \
509+ } \
510+ ,
511+ { \
512+ \" attribute\" : \"\$ {attr.unique.platform.aws.public-ipv4}\" \
513+ , \" value\" : \" ${public_ipv4} \" \
514+ } \
515+ ,
516+ { \
517+ \" attribute\" : \"\$ {attr.unique.platform.aws.mac}\" \
518+ , \" value\" : \" ${mac_address} \" \
519+ } \
520+ ] \
521+ "
522+ jq \
523+ --argjson group_constraints_array_plus " ${group_constraints_array_plus} " \
524+ " .[\" job\" ][\" ${nomad_job_name} \" ][\" group\" ][\" ${group_name} \" ][\" constraint\" ] |= ( . + \$ group_constraints_array_plus)" \
525+ " ${dir} " /nomad/nomad-job.json \
526+ | \
527+ sponge " ${dir} " /nomad/nomad-job.json
528+ done
529+ done
339530 fi
340- jq \
341- --argjson group_constraints_array " ${group_constraints_array} " \
342- " .[\" job\" ][\" ${nomad_job_name} \" ][\" group\" ] |= with_entries(.value.constraint = \$ group_constraints_array)" \
343- " ${dir} " /nomad/nomad-job.json \
344- | \
345- sponge " ${dir} " /nomad/nomad-job.json
346531 fi
347532
348533 # Store a summary of the job.
@@ -360,7 +545,8 @@ backend_nomadcloud() {
360545 , "tasks": (
361546 .task | with_entries(
362547 .value |= {
363- "resources": .resources
548+ "constraint": .constraint
549+ , "resources": .resources
364550 , "nix_installables": .config.nix_installables
365551 , "templates": ( .template | map(.destination) )
366552 }
0 commit comments