First cut at extending the Workflow class with GPU properties

ksatzke · ksatzke · commit 89e8d24d7320 · 2020-10-02T10:01:42.000Z
diff --git a/ManagementService/python/addWorkflow.py b/ManagementService/python/addWorkflow.py
@@ -27,6 +27,7 @@ def handle(value, sapi):
     success = False
 
     email = data["email"]
+    
 
     if "workflow" in data:
         workflow = data["workflow"]
@@ -38,6 +39,9 @@ def handle(value, sapi):
         wf["status"] = "undeployed"
         wf["modified"] = time.time()
         wf["endpoints"] = []
+        wf["gpu_usage"] = None
+        if "gpu_usage" in workflow:
+            wf["gpu_usage"] = workflow["gpu_usage"]
 
         wf["id"] = hashlib.md5(str(uuid.uuid4()).encode()).hexdigest()
 
diff --git a/ManagementService/python/deployWorkflow.py b/ManagementService/python/deployWorkflow.py
@@ -26,6 +26,23 @@
 WF_TYPE_SAND = 0
 WF_TYPE_ASL = 1
 
+def get_kv_pairs(testdict, keys, dicts=None):
+    # Breadth-first walk over nested dicts/lists in testdict; returns a list of (key, value) tuples for every dict entry whose key is in `keys`.
+    if not dicts:
+        dicts = [testdict]  # NOTE(review): a list passed as the root would land here and break the final .items() call - confirm callers always pass a dict
+        testdict = [testdict]  
+    data = testdict.pop(0)  # testdict doubles as the FIFO work queue on recursive calls
+    if isinstance(data, dict):
+        data = data.values()  # nested containers can only sit in a dict's values
+    for d in data:        
+        if isinstance(d, dict) or isinstance(d, list): # nested container: queue it for a later pass
+            testdict.append(d)
+            if isinstance(d, dict):
+                dicts.append(d)  # every dict found is kept so its items are filtered at the end
+    if testdict: # containers are still queued, so keep searching
+        return get_kv_pairs(testdict, keys, dicts)
+    return [(k, v) for d in dicts for k, v in d.items() if k in keys]
+
 def is_asl_workflow(wfobj):
     return 'StartAt' in wfobj and 'States' in wfobj and isinstance(wfobj['States'], dict)
 
@@ -132,7 +149,6 @@ def compile_resource_info_map(resource_names, uploaded_resources, email, sapi, d
                 resource_metadata = json.loads(resource_metadata)
                 if "runtime" in resource_metadata:
                     resource_info["runtime"] = resource_metadata["runtime"]
-                print("RESOURCE_INFO_ALL: " +str(resource_info))
                 #if "num_gpu" in resource_metadata:
                 #    print("RESOURCE_INFO: " + str(resource_info["num_gpu"]))
 
@@ -245,7 +261,7 @@ def get_workflow_host_port(host_to_deploy, sid):
 
     return success, host_port
 
-def create_k8s_deployment(email, workflow_info, runtime, management=False):
+def create_k8s_deployment(email, workflow_info, runtime, management=False, use_gpus=0):
     # KUBERNETES MODE
     new_workflow_conf = {}
     conf_file = '/opt/mfn/SandboxAgent/conf/new_workflow.conf'
@@ -297,17 +313,15 @@ def create_k8s_deployment(email, workflow_info, runtime, management=False):
     env.append({'name': 'WORKFLOWID', 'value': workflow_info["workflowId"]})
     env.append({'name': 'WORKFLOWNAME', 'value': workflow_info["workflowName"]})
 
-    """
-    if "num_gpu" in workflow_info.keys():
-        print("INSIDE K8S Deploy, num_gpu: " + str(workflow_info['num_gpu']))
-        num_gpu = int(workflow_info['num_gpu'])
+    if use_gpus >= 0:
+        #print("INSIDE K8S Deploy, num_gpu: " + str(workflow_info['num_gpu']))
+        #num_gpu = int(workflow_info['num_gpu'])
         # overwrite values from values.yaml for new workflows
-        kservice['spec']['template']['spec']['containers'][0]['resources']['limits']['nvidia.com/gpu'] = str(num_gpu)
-        kservice['spec']['template']['spec']['containers'][0]['resources']['requests']['nvidia.com/gpu'] = str(num_gpu)
-        kservice['spec']['template']['spec']['containers'][0]['image'] = "localhost:5000/microfn/sandbox" 
-        if num_gpu > 0:
-            kservice['spec']['template']['spec']['containers'][0]['image'] = "localhost:5000/microfn/sandbox" 
-    """ 
+        kservice['spec']['template']['spec']['containers'][0]['resources']['limits']['nvidia.com/gpu'] = str(use_gpus)
+        kservice['spec']['template']['spec']['containers'][0]['resources']['requests']['nvidia.com/gpu'] = str(use_gpus)
+        #kservice['spec']['template']['spec']['containers'][0]['image'] = "localhost:5000/microfn/sandbox" 
+        kservice['spec']['template']['spec']['containers'][0]['image'] = "localhost:5000/microfn/sandbox_gpu" 
+     
     # Special handling for the management container
     if management:
         kservice['spec']['template']['spec']['volumes'] = [{ 'name': 'new-workflow-conf', 'configMap': {'name': new_workflow_conf['configmap']}}]
@@ -407,6 +421,11 @@ def handle(value, sapi):
         workflow = data["workflow"]
         if "id" not in workflow:
             raise Exception("malformed input")
+        """
+        if "gpu_usage" not in workflow:
+            raise Exception("malformed input: no gpu_usage")
+            use_gpus = int(data['gpu_usage'])
+        """ 
         sapi.log(json.dumps(workflow))
         wfmeta = sapi.get(email + "_workflow_" + workflow["id"], True)
         if wfmeta is None or wfmeta == "":
@@ -437,6 +456,8 @@ def handle(value, sapi):
         if is_asl_workflow(wfobj):
             wf_type = WF_TYPE_ASL
 
+        #use_gpus = int(wfmeta._gpu_usage)
+
         success, errmsg, resource_names, uploaded_resources = check_workflow_functions(wf_type, wfobj, email, sapi)
         if not success:
             raise Exception("Couldn't deploy workflow; " + errmsg)
@@ -477,13 +498,16 @@ def handle(value, sapi):
             else:
                 runtime = "Python"
 
+            """
             if "num_gpu" in resource_info_map.keys():
                 print ("RESOURCE_INFO_MAP: " + str(resource_info_map))
                 workflow_info['num_gpu'] = resource_info_map['num_gpu']
             else:
                 workflow_info['num_gpu'] = 0
+            """
+            use_gpus = 0
 
-            url, endpoint_key = create_k8s_deployment(email, workflow_info, runtime)
+            url, endpoint_key = create_k8s_deployment(email, workflow_info, runtime, use_gpus)
             if url is not None and len(url) > 0:
                 status = "deploying"
                 sapi.addSetEntry(workflow_info["workflowId"] + "_workflow_endpoints", str(url), is_private=True)
diff --git a/deploy/helm/microfunctions/values.yaml b/deploy/helm/microfunctions/values.yaml
@@ -24,8 +24,8 @@ imageRepo: "localhost:5000"
 manager:
   #httpProxy: "http://<host>:<port>"
   #httpsProxy: "http://<host>:<port>"
-  #httpGatewayPort: 80
-  #httpsGatewayPort: 443
+  httpGatewayPort: 30336
+  #httpsGatewayPort: 32533
   nameOverride: "microfunctions"
   newWorkflow:
     hpa:
diff --git a/mfn_sdk/mfn_sdk/mfnclient.py b/mfn_sdk/mfn_sdk/mfnclient.py
@@ -266,6 +266,7 @@ def action(self,action,data=None):
         r.raise_for_status()
         log.debug("%s: %s <- %s", self.user, action, r.text[:256]+(r.text[256:] and '...'))
         resp = r.json()
+        print(str(resp))
         if resp.get('status','') != 'success':
             if resp.get('has_error',False):
                 raise Exception(f"MicroFunctions Error for action {action}: {resp['error_type']}")
@@ -449,7 +450,7 @@ def _get_state_names_and_resource(self, desired_state_type, wf_dict):
         return state_list
 
 
-    def add_workflow(self,name,filename=None):
+    def add_workflow(self,name,filename=None, gpu_usage=None):
         """ add a workflow
 
         returns an existing workflow if the name exists, registers a new workflow name if it doesn't exist
@@ -458,7 +459,7 @@ def add_workflow(self,name,filename=None):
         for wf in self._workflows:
             if wf._name == name:
                 return wf
-        data = self.action('addWorkflow',{'workflow':{'name':name}})
+        data = self.action('addWorkflow',{'workflow':{'name':name, "gpu_usage":gpu_usage}})
         wfd = data['workflow']
         wf = Workflow(self,wfd)
         self._workflows.append(wf)
@@ -475,6 +476,7 @@ def add_workflow(self,name,filename=None):
             # parse the WF json to find required functions
             fnames = []
             wfjson = json.loads(wfdesc)
+            #print("wfjson: "+ str(wfjson))
             if 'States' in wfjson:
                 state_list = self._get_state_names_and_resource('Task', wfjson)
                 for state_info in state_list:
@@ -505,7 +507,6 @@ def add_workflow(self,name,filename=None):
                     with open(fpyname, 'r') as f:
                         fcode = f.read()
                     f.code = fcode
-
         return wf
 
 
diff --git a/mfn_sdk/mfn_sdk/workflow.py b/mfn_sdk/mfn_sdk/workflow.py
@@ -52,9 +52,13 @@ class Workflow(object):
     """
 
     def __init__(self,client,wf):
+        print(str(wf))
         self.client=client
         self.id=wf["id"]
         self._name=wf["name"]
+        self._gpu_usage=None
+        if "gpu_usage" in wf:
+            self._gpu_usage=wf["gpu_usage"]
         self._modified=wf["modified"]
         self._status=wf.get("status",None)
         self._endpoints=wf.get("endpoints",None)
@@ -68,6 +72,19 @@ def __str__(self):
         else:
             return f"{self.id} ({self._name}, status: {self._status})"
 
+    @property
+    def gpu_usage(self):
+        # Returns the locally stored self._gpu_usage as-is; nothing is fetched here. TODO: workflow GPU usage could have been updated, decide if we should fetch workflow status
+        return self._gpu_usage
+
+    """    
+    @gpu_usage.setter
+    def gpu_usage(self,gpu_usage):
+        # TODO: workflow GPU could have been updated, decide if we should fetch workflow status
+        res = self.client.action('modifyWorkflow',{'workflow':{'id':self.id,'name':name,'gpu_usage':self._gpu_usage}})
+        self.gpu_usage = gpu_usage
+    """    
+
     @property
     def name(self):
         # TODO: workflow name could have been updated, decide if we should fetch workflow status
@@ -124,15 +141,17 @@ def json(self):
     def json(self,json):
         if json != self.json:
             self._json = json
+            print ("uploaded workflow JOSN"+ str( json))
             self.client.action('uploadWorkflowJSON',{'workflow':{'id':self.id,'json':base64.b64encode(self._json.encode()).decode()}})
 
 
-    def deploy(self, timeout=None): #, num_gpu=None):
+    def deploy(self, timeout=None): 
         """ deploy a workflow and optionally wait in linearly increasing multiples of 1000ms
         :timeout: By default returns after calling deploy on the workflow without waiting for it to be actually deployed.
             If timeout is set to a numeric <= 0, it waits indefinitely in intervals of 1000ms, 2000ms, 3000ms, ...
             If timeout is set to a numeric > 0, it waits for the workflow to be deployed in increasing multiples of 100ms, but no longer than the timeout. When the timeout expires and the workflow is not deployed, the function raises an Exception
         """
+
         s = self.status
         if s == 'deployed':
             log.debug("deploy: wf %s already deployed",self.name)
@@ -145,9 +164,6 @@ def deploy(self, timeout=None): #, num_gpu=None):
         else:
             self.client.action('deployWorkflow',{'workflow':{'id':self.id}})
 
-        #if num_gpu is not None:
-        #    print("NUM_GPU:" + str(num_gpu))
-
 
         # if timeout is None, do not wait but return immediately even if it's not yet deployed
         if timeout is None:
@@ -283,8 +299,6 @@ def execute(self,data,timeout=60, check_duration=False):
 
         # we are already deployed and have the endpoints stored in self._endpoints
         url = random.choice(self._endpoints)
-        print(url)
-        url=url+":30336"
         try:
             #postdata = {}
             #postdata["value"] = json.dumps(data)
diff --git a/tests/mfn_test_utils.py b/tests/mfn_test_utils.py
@@ -84,8 +84,17 @@ def __init__(self, test_name=None, timeout=None, workflow_filename=None, new_use
         if timeout is not None:
             self._settings["timeout"] = timeout
 
+        """
+        else:
+            #self._gpu_usage = None
+            #self._workflow_description['num_gpu'] = self._settings["num_gpu"]
+            #print("Workflow_description:" + str(self._workflow_description))
+        
+        self.gpu_usage = 0 # hardcoded for now 
         if num_gpu is not None:
             self._settings["num_gpu"] = num_gpu
+            self._gpu_usage = self._settings["num_gpu"]
+        """
 
         self._log_clear_timestamp = int(time.time() * 1000.0 * 1000.0)
 
@@ -118,6 +127,7 @@ def _get_settings(self):
 
         # Defaults
         settings.setdefault("timeout", 60)
+        settings.setdefault("num_gpu", 0)
 
         return settings
 
@@ -174,6 +184,7 @@ def _get_resource_info(self, resource_ref):
         return retval
 
     def _get_resource_info_map(self, workflow_description=None, resource_info_map=None):
+        #print(str("wf description: " + str(workflow_description)))
         if workflow_description is None:
             workflow_description = self._workflow_description
         if resource_info_map is None:
@@ -194,8 +205,8 @@ def _get_resource_info_map(self, workflow_description=None, resource_info_map=No
                         resource_info["resource_env_filename"] = "environment_variables/" + resource_ref + "_environment_variables.txt"
                         resource_info_map[resource_ref] = resource_info
                         resource_info_map[resource_ref]['num_gpu'] = self._settings['num_gpu']
-                        print("resource_info_map: " + json.dumps(resource_info_map))
-
+                        #resource_info_map['num_gpu'] = self._settings['num_gpu']
+                        #print("resource_info_map: " + json.dumps(resource_info_map))
 
         elif "States" in workflow_description:
             states = workflow_description["States"]
@@ -210,8 +221,8 @@ def _get_resource_info_map(self, workflow_description=None, resource_info_map=No
                         resource_info["resource_env_filename"] = "environment_variables/" + resource_name + "_environment_variables.txt"
                         resource_info_map[resource_name] = resource_info
                         resource_info_map[resource_name]['num_gpu'] = self._settings['num_gpu']
-                        print("resource_info_map: " + json.dumps(resource_info_map))
-
+                        #resource_info_map['num_gpu'] = self._settings['num_gpu']
+                        #print("resource_info_map: " + json.dumps(resource_info_map))
 
                 if "Type" in state and state["Type"] == "Parallel":
                     branches = state['Branches']
@@ -228,6 +239,7 @@ def _get_resource_info_map(self, workflow_description=None, resource_info_map=No
             print("ERROR: invalid workflow description.")
             assert False
 
+        #resource_info_map['num_gpu'] = self._settings['num_gpu']
         return resource_info_map
 
     def _delete_resource_if_existing(self, existing_resources, resource_name):
@@ -239,6 +251,7 @@ def _delete_resource_if_existing(self, existing_resources, resource_name):
 
     def _create_and_upload_resource(self, resource_name, resource_info):
         print("Deploying resource: " + resource_name)
+        #print(str (resource_info))
 
         resource_filename = resource_info["resource_filename"]
         is_zip = resource_info["is_zip"]
@@ -287,10 +300,13 @@ def upload_workflow(self):
         self.undeploy_workflow()
 
         resource_info_map = self._get_resource_info_map()
+        #resource_info_map['num_gpu'] = 1
+        #print(str(resource_info_map))
 
         existing_resources = self._client.functions
 
         for resource_name in resource_info_map.keys():
+          #if not resource_name == 'num_gpu':
             self._delete_resource_if_existing(existing_resources, resource_name)
 
             resource_info = resource_info_map[resource_name]
@@ -304,8 +320,14 @@ def deploy_workflow(self):
         try:
             wf = self._client.add_workflow(self._workflow_name)
             wf.json = json.dumps(self._workflow_description)
+            #print (wf.json)
+            #wf._use_gpu=self._settings["num_gpu"]
+            wf._gpu_usage = "teststringgpu" # _use_gpu=self._settings["num_gpu"]
             wf.deploy(self._settings["timeout"]) #, num_gpu=self._settings['num_gpu'])
             self._workflow = wf
+            #print ("WF: " + str(wf._use_gpu))
+            #print ("WF1: " + str(wf.gpu_usage))
+            #wf.gpu_usage = "teststring"
             if self._workflow.status != "failed":
                 print("MFN workflow " + self._workflow_name + " deployed.")
             else: