Skip to content

Commit 3187c49

Browse files
committed
management: fix deployWorkflow for bare metal with gpu hosts
1 parent 3647851 commit 3187c49

File tree

1 file changed

+27
-36
lines changed

1 file changed

+27
-36
lines changed

ManagementService/python/deployWorkflow.py

Lines changed: 27 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -642,53 +642,47 @@ def handle(value, sapi):
642642
status = "failed"
643643
else:
644644
# We're running BARE METAL mode
645-
# _XXX_: due to the queue service still being in java in the sandbox
646645
print("gpu_usage before decision:" + str(gpu_usage))
647-
if gpu_usage == 0:
648-
sandbox_image_name = "microfn/sandbox" # default value
649-
elif gpu_usage > 0:
646+
if gpu_usage > 0:
650647
sandbox_image_name = "microfn/sandbox_gpu" # sandbox uses GPU
651-
if any(resource_info_map[res_name]["runtime"] == "Java" for res_name in resource_info_map):
648+
elif any(resource_info_map[res_name]["runtime"] == "Java" for res_name in resource_info_map):
652649
sandbox_image_name = "microfn/sandbox_java"
650+
else:
651+
sandbox_image_name = "microfn/sandbox" # default value
653652

654653
# TODO: intelligence on how to pick hosts
655654
hosts = sapi.get("available_hosts", True)
656655
# hosts is a string representation of a list or dict
657656
print("available_hosts: " + hosts)
658657
hosts = json.loads(hosts)
659658

659+
deployed_hosts = {}
660660
if hosts is not None and hosts != "" and isinstance(hosts,dict):
661661
host_has_gpu = False
662-
deployed_hosts = {}
663662
gpu_hosts = {}
664-
picked_hosts = {}
663+
picked_hosts = None
665664
plain_hosts={}
666-
hostname_t = ""
667665
for hostname in hosts: # individual host dict
668-
nodeHasGPU = hosts["has_gpu"] # check if host has a GPU
669-
if hostname != "has_gpu": # skip this key
670-
hostname_t = hostname
671-
#print("current hostnae: " + str(hostname) + str(hosts[hostname]))
672-
hostip = hosts[hostname]
673-
plain_hosts[hostname] = hostip # add to general hosts
674-
if nodeHasGPU:
675-
gpu_hosts[hostname] = hostip # add to GPU hosts
676-
hostname = hostname_t
666+
host_has_gpu = hosts[hostname]["has_gpu"] # check if host has a GPU
667+
hostip = hosts[hostname]["ip"]
668+
plain_hosts[hostname] = hostip # add to general hosts
669+
if host_has_gpu:
670+
gpu_hosts[hostname] = hostip # add to GPU hosts
677671
# instruct hosts to start the sandbox and deploy workflow
678672
print("selected host:" + str(hostname) + " " + str(hostip))
679-
#print("calulated host:" + str(gpu_hosts) + " " + str(plain_hosts))
680-
if sandbox_image_name == "microfn/sandbox" or sandbox_image_name=="microfn/sandbox_java": # can use any host
681-
picked_hosts = plain_hosts
682-
#hosts["has_gpu"] = False
683-
#print("picked_hosts: " + str(picked_hosts))
684-
elif len(gpu_hosts) > 0:
673+
#print("founds hosts:" + str(gpu_hosts) + " " + str(plain_hosts))
674+
if sandbox_image_name == "microfn/sandbox_gpu" and gpu_hosts:
685675
picked_hosts = gpu_hosts
686-
else:
687-
picked_hosts = plain_hosts # fallback as there are no gpu hosts available
688-
print("available GPU hosts list is empty. Deploying on general purpose host")
676+
elif sandbox_image_name == "microfn/sandbox_gpu":
677+
# can't deploy; no gpu hosts available.
678+
picked_hosts = {}
679+
elif sandbox_image_name == "microfn/sandbox" or sandbox_image_name=="microfn/sandbox_java": # can use any host
680+
picked_hosts = plain_hosts
681+
682+
print("picked_hosts: " + str(picked_hosts))
689683

690684
for hostname in picked_hosts: # loop over all hosts, need to pick gpu hosts for python/gpu workflows
691-
hostip = hosts[hostname]
685+
hostip = hosts[hostname]["ip"]
692686
host_to_deploy = (hostname, hostip)
693687
print("host_to_deploy: " + str(host_to_deploy) )
694688
#host_to_deploy = ("userslfu99", "192.168.8.99")
@@ -713,17 +707,14 @@ def handle(value, sapi):
713707
sapi.putMapEntry(workflow_info["workflowId"] + "_sandbox_status_map", endpoint_key, json.dumps(sbinfo), is_private=True)
714708
#endpoints = sapi.retrieveMap(workflow_info["workflowId"] + "_workflow_endpoints", True)
715709
#sapi.log(str(endpoints))
716-
elif hosts is not None and hosts != "" and isinstance(hosts,list):
717-
print("hosts is not dict type!")
718-
719-
if not bool(deployed_hosts):
720-
status = "failed"
721-
else:
722-
#sapi.log("deployed on hosts: " + json.dumps(deployed_hosts))
723-
sapi.put(email + "_workflow_hosts_" + workflow["id"], json.dumps(deployed_hosts), True)
724710
else:
725-
print("available_hosts is empty. Not deploying")
711+
print("available_hosts is empty or not a dictionary; not deploying...")
712+
713+
if not bool(deployed_hosts):
726714
status = "failed"
715+
else:
716+
#sapi.log("deployed on hosts: " + json.dumps(deployed_hosts))
717+
sapi.put(email + "_workflow_hosts_" + workflow["id"], json.dumps(deployed_hosts), True)
727718

728719
# Update workflow status
729720
wfmeta["status"] = status

0 commit comments

Comments
 (0)