Skip to content

Commit 4508ea6

Browse files
committed
Adding support for dynamic configuration of Helm deployments on GPU to ManagementService
1 parent dfc7cd7 commit 4508ea6

File tree

3 files changed

+16
-25
lines changed

3 files changed

+16
-25
lines changed

ManagementService/management_init.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -355,7 +355,7 @@ def printUsage():
355355
sys.path.append(workflowdir)
356356
if os.getenv("KUBERNETES_PORT", None) != None:
357357
import deployWorkflow
358-
url, endpoint_key = deployWorkflow.create_k8s_deployment(email, workflow_info, "Python", management=True)
358+
url, endpoint_key = deployWorkflow.create_k8s_deployment(email, workflow_info, "Python", 0, management=True)
359359
DLCLIENT_MANAGEMENT.putMapEntry("Management_workflow_endpoint_map", endpoint_key, url)
360360
# Kubernetes mode only has one url
361361
endpoint_list = [url]

ManagementService/python/addWorkflow.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,7 @@ def handle(value, sapi):
3939
wf["status"] = "undeployed"
4040
wf["modified"] = time.time()
4141
wf["endpoints"] = []
42-
wf["gpu_usage"] = None
42+
#wf["gpu_usage"] = None
4343
if "gpu_usage" in workflow:
4444
wf["gpu_usage"] = str(workflow["gpu_usage"])
4545

ManagementService/python/deployWorkflow.py

Lines changed: 14 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -149,8 +149,6 @@ def compile_resource_info_map(resource_names, uploaded_resources, email, sapi, d
149149
resource_metadata = json.loads(resource_metadata)
150150
if "runtime" in resource_metadata:
151151
resource_info["runtime"] = resource_metadata["runtime"]
152-
#if "num_gpu" in resource_metadata:
153-
# print("RESOURCE_INFO: " + str(resource_info["num_gpu"]))
154152

155153
num_chunks_str = dlc.get("grain_source_zip_num_chunks_" + resource_id)
156154
try:
@@ -261,7 +259,7 @@ def get_workflow_host_port(host_to_deploy, sid):
261259

262260
return success, host_port
263261

264-
def create_k8s_deployment(email, workflow_info, runtime, management=False, use_gpus=0):
262+
def create_k8s_deployment(email, workflow_info, runtime, gpu_usage, management=False):
265263
# KUBERNETES MODE
266264
new_workflow_conf = {}
267265
conf_file = '/opt/mfn/SandboxAgent/conf/new_workflow.conf'
@@ -313,16 +311,17 @@ def create_k8s_deployment(email, workflow_info, runtime, management=False, use_g
313311
env.append({'name': 'WORKFLOWID', 'value': workflow_info["workflowId"]})
314312
env.append({'name': 'WORKFLOWNAME', 'value': workflow_info["workflowName"]})
315313

316-
if use_gpus >= 0:
317-
#print("INSIDE K8S Deploy, num_gpu: " + str(workflow_info['num_gpu']))
318-
#num_gpu = int(workflow_info['num_gpu'])
314+
# apply gpu_usage fraction to k8s deployment configuration
315+
use_gpus = gpu_usage
316+
317+
if not management and use_gpus >= 0:
319318
# overwrite values from values.yaml for new workflows
320319
kservice['spec']['template']['spec']['containers'][0]['resources']['limits']['nvidia.com/gpu'] = str(use_gpus)
321320
kservice['spec']['template']['spec']['containers'][0]['resources']['requests']['nvidia.com/gpu'] = str(use_gpus)
322321
#kservice['spec']['template']['spec']['containers'][0]['image'] = "localhost:5000/microfn/sandbox"
323322
kservice['spec']['template']['spec']['containers'][0]['image'] = "localhost:5000/microfn/sandbox_gpu"
324323

325-
# Special handling for the management container
324+
# Special handling for the management container: never run on gpu
326325
if management:
327326
kservice['spec']['template']['spec']['volumes'] = [{ 'name': 'new-workflow-conf', 'configMap': {'name': new_workflow_conf['configmap']}}]
328327
kservice['spec']['template']['spec']['containers'][0]['volumeMounts'] = [{'name': 'new-workflow-conf', 'mountPath': '/opt/mfn/SandboxAgent/conf'}]
@@ -421,13 +420,10 @@ def handle(value, sapi):
421420
workflow = data["workflow"]
422421
if "id" not in workflow:
423422
raise Exception("malformed input")
424-
"""
425-
if "gpu_usage" not in workflow:
426-
raise Exception("malformed input: no gpu_usage")
427-
use_gpus = int(data['gpu_usage'])
428-
"""
429423
sapi.log(json.dumps(workflow))
430424
wfmeta = sapi.get(email + "_workflow_" + workflow["id"], True)
425+
print("WFMETA in deployWorkflow: "+ str(wfmeta))
426+
431427
if wfmeta is None or wfmeta == "":
432428
raise Exception("workflow metadata is not valid.")
433429
try:
@@ -498,16 +494,14 @@ def handle(value, sapi):
498494
else:
499495
runtime = "Python"
500496

501-
"""
502-
if "num_gpu" in resource_info_map.keys():
503-
print ("RESOURCE_INFO_MAP: " + str(resource_info_map))
504-
workflow_info['num_gpu'] = resource_info_map['num_gpu']
497+
if "gpu_usage" in wfmeta and wfmeta["gpu_usage"] != "None":
498+
gpu_usage = float(wfmeta["gpu_usage"])
505499
else:
506-
workflow_info['num_gpu'] = 0
507-
"""
508-
use_gpus = 0
500+
gpu_usage = 0.
501+
502+
#print("deduced gpu_usage: " + str(gpu_usage))
509503

510-
url, endpoint_key = create_k8s_deployment(email, workflow_info, runtime, use_gpus)
504+
url, endpoint_key = create_k8s_deployment(email, workflow_info, runtime, gpu_usage)
511505
if url is not None and len(url) > 0:
512506
status = "deploying"
513507
sapi.addSetEntry(workflow_info["workflowId"] + "_workflow_endpoints", str(url), is_private=True)
@@ -522,9 +516,6 @@ def handle(value, sapi):
522516
# _XXX_: due to the queue service still being in java in the sandbox
523517

524518
sandbox_image_name = "microfn/sandbox" # default value
525-
#if "on_gpu" in resource_info_map.keys(): # sandbox_gpu image should be used for this workflow
526-
# if resource_info_map["on_gpu"] == True:
527-
# sandbox_image_name = "microfn/sandbox_gpu"
528519

529520
if any(resource_info_map[res_name]["runtime"] == "Java" for res_name in resource_info_map):
530521
sandbox_image_name = "microfn/sandbox_java"

0 commit comments

Comments
 (0)