-
Notifications
You must be signed in to change notification settings - Fork 117
[feat] Improve handling of maintenance reservations in Slurm #3573
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: develop
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -147,6 +147,7 @@ def __init__(self): | |
| self._sched_access_in_submit = self.get_option( | ||
| 'sched_access_in_submit' | ||
| ) | ||
| self.addl_avail_states = set() | ||
|
|
||
| def make_job(self, *args, **kwargs): | ||
| return _SlurmJob(*args, **kwargs) | ||
|
|
@@ -323,7 +324,7 @@ def allnodes(self): | |
| 'could not retrieve node information') from e | ||
|
|
||
| node_descriptions = completed.stdout.splitlines() | ||
| return _create_nodes(node_descriptions) | ||
| return _create_nodes(node_descriptions, self.addl_avail_states) | ||
|
|
||
| def _get_default_partition(self): | ||
| completed = _run_strict('scontrol -a show -o partitions') | ||
|
|
@@ -436,15 +437,23 @@ def _get_reservation_nodes(self, reservation): | |
| raise JobSchedulerError("could not extract the node names for " | ||
| "reservation '%s'" % reservation) | ||
|
|
||
| flags_match = re.search(r'Flags=(\S+)', completed.stdout) | ||
| if flags_match: | ||
| if 'MAINT' in flags_match[1].split(','): | ||
| self.addl_avail_states.add('MAINTENANCE') | ||
| # else: | ||
| # raise JobSchedulerError(f"could not extract the reservation " | ||
| # f"flags for reservation '{reservation}'") | ||
|
Comment on lines
+444
to
+446
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I wasn't sure if we want to fail here or silently ignore it?
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I'd rather simply log it with |
||
|
|
||
| completed = _run_strict('scontrol -a show -o %s' % reservation_nodes) | ||
| node_descriptions = completed.stdout.splitlines() | ||
| return _create_nodes(node_descriptions) | ||
| return _create_nodes(node_descriptions, self.addl_avail_states) | ||
|
|
||
| def _get_nodes_by_name(self, nodespec): | ||
| completed = osext.run_command('scontrol -a show -o node %s' % | ||
| nodespec) | ||
| node_descriptions = completed.stdout.splitlines() | ||
| return _create_nodes(node_descriptions) | ||
| return _create_nodes(node_descriptions, self.addl_avail_states) | ||
|
|
||
| def _update_completion_time(self, job, timestamps): | ||
| if job._completion_time is not None: | ||
|
|
@@ -691,19 +700,19 @@ def poll(self, *jobs): | |
| self._cancel_if_pending_too_long(job) | ||
|
|
||
|
|
||
| def _create_nodes(descriptions): | ||
| def _create_nodes(descriptions, addl_avail_states=None): | ||
| nodes = set() | ||
| for descr in descriptions: | ||
| with suppress(JobSchedulerError): | ||
| nodes.add(_SlurmNode(descr)) | ||
| nodes.add(_SlurmNode(descr, addl_avail_states=addl_avail_states)) | ||
|
|
||
| return nodes | ||
|
|
||
|
|
||
| class _SlurmNode(sched.Node): | ||
| '''Class representing a Slurm node.''' | ||
|
|
||
| def __init__(self, node_descr): | ||
| def __init__(self, node_descr, addl_avail_states=None): | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I would rather pass the flags and let the
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Okay, just to make sure I understand:
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Yes.
Let's keep it clean at the moment. If we want define custom available states, maybe we should do it with a configuration parameter. |
||
| self._name = self._extract_attribute('NodeName', node_descr) | ||
| if not self._name: | ||
| raise JobSchedulerError( | ||
|
|
@@ -718,6 +727,15 @@ def __init__(self, node_descr): | |
| 'State', node_descr, sep='+') or set() | ||
| self._descr = node_descr | ||
|
|
||
| self.addl_avail_states = addl_avail_states or set() | ||
| self.available_states = { | ||
| 'ALLOCATED', | ||
| 'COMPLETING', | ||
| 'IDLE', | ||
| 'PLANNED', | ||
| 'RESERVED' | ||
| } | self.addl_avail_states | ||
|
|
||
| def __eq__(self, other): | ||
| if not isinstance(other, type(self)): | ||
| return NotImplemented | ||
|
|
@@ -735,14 +753,7 @@ def in_statex(self, state): | |
| return self._states == set(state.upper().split('+')) | ||
|
|
||
| def is_avail(self): | ||
| available_states = { | ||
| 'ALLOCATED', | ||
| 'COMPLETING', | ||
| 'IDLE', | ||
| 'PLANNED', | ||
| 'RESERVED' | ||
| } | ||
| return self._states <= available_states | ||
| return self._states <= self.available_states | ||
|
|
||
| def is_down(self): | ||
| return not self.is_avail() | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.