@@ -246,23 +246,10 @@ def run_job(self, job: JobExecutorInterface):
                 job, external_jobid=slurm_jobid, aux={"slurm_logfile": slurm_logfile}
             )
         )
-
+
     async def check_active_jobs(
         self, active_jobs: List[SubmittedJobInfo]
     ) -> Generator[SubmittedJobInfo, None, None]:
-        # Check the status of active jobs.
-        # You have to iterate over the given list active_jobs.
-        # For jobs that have finished successfully, you have to call
-        # self.report_job_success(job).
-        # For jobs that have errored, you have to call
-        # self.report_job_error(job).
-        # Jobs that are still running have to be yielded.
-        #
-        # For queries to the remote middleware, please use
-        # self.status_rate_limiter like this:
-        #
-        # async with self.status_rate_limiter:
-        #   # query remote middleware here
         fail_stati = (
             "BOOT_FAIL",
             "CANCELLED",
@@ -271,139 +258,55 @@ async def check_active_jobs(
271258 "NODE_FAIL" ,
272259 "OUT_OF_MEMORY" ,
273260 "TIMEOUT" ,
274- "ERROR" ,
261+ "PREEMPTED" ,
262+ "SUSPENDED" ,
263+ "STOPPED" ,
264+ "REVOKED" , # slurm docs suggest this should be here too
275265 )
-        # Cap sleeping time between querying the status of all active jobs:
-        # If `AccountingStorageType`` for `sacct` is set to `accounting_storage/none`,
-        # sacct will query `slurmctld` (instead of `slurmdbd`) and this in turn can
-        # rely on default config, see: https://stackoverflow.com/a/46667605
-        # This config defaults to `MinJobAge=300`, which implies that jobs will be
-        # removed from `slurmctld` within 6 minutes of finishing. So we're conservative
-        # here, with half that time
-        max_sleep_time = 180
-
-        sacct_query_durations = []
-
-        status_attempts = 5
-
-        active_jobs_ids = {job_info.external_jobid for job_info in active_jobs}
-        active_jobs_seen_by_sacct = set()
-        missing_sacct_status = set()
-
-        # We use this sacct syntax for argument 'starttime' to keep it compatible
-        # with slurm < 20.11
-        sacct_starttime = f"{datetime.now() - timedelta(days=2):%Y-%m-%dT%H:00}"
-        # previously we had
-        # f"--starttime now-2days --endtime now --name {self.run_uuid}"
-        # in line 218 - once v20.11 is definitively not in use any more,
-        # the more readable version ought to be re-adapted
-
-        # -X: only show main job, no substeps
-        sacct_command = f"""sacct -X --parsable2 \
-        --clusters all \
-        --noheader --format=JobIdRaw,State \
-        --starttime {sacct_starttime} \
-        --endtime now --name {self.run_uuid}"""
-
-        # for better redability in verbose output
-        sacct_command = " ".join(shlex.split(sacct_command))
-
-        # this code is inspired by the snakemake profile:
-        # https://github.com/Snakemake-Profiles/slurm
-        for i in range(status_attempts):
-            async with self.status_rate_limiter:
-                (status_of_jobs, sacct_query_duration) = await self.job_stati(
-                    sacct_command
-                )
-                if status_of_jobs is None and sacct_query_duration is None:
-                    self.logger.debug(f"could not check status of job {self.run_uuid}")
-                    continue
-                sacct_query_durations.append(sacct_query_duration)
-                self.logger.debug(f"status_of_jobs after sacct is: {status_of_jobs}")
-                # only take jobs that are still active
-                active_jobs_ids_with_current_sacct_status = (
-                    set(status_of_jobs.keys()) & active_jobs_ids
-                )
-                self.logger.debug(
-                    f"active_jobs_ids_with_current_sacct_status are: "
-                    f"{active_jobs_ids_with_current_sacct_status}"
-                )
-                active_jobs_seen_by_sacct = (
-                    active_jobs_seen_by_sacct
-                    | active_jobs_ids_with_current_sacct_status
-                )
-                self.logger.debug(
-                    f"active_jobs_seen_by_sacct are: {active_jobs_seen_by_sacct}"
-                )
-                missing_sacct_status = (
-                    active_jobs_seen_by_sacct
-                    - active_jobs_ids_with_current_sacct_status
-                )
-                self.logger.debug(f"missing_sacct_status are: {missing_sacct_status}")
-                if not missing_sacct_status:
-                    break
-
-        if missing_sacct_status:
-            self.logger.warning(
-                f"Unable to get the status of all active jobs that should be "
-                f"in slurmdbd, even after {status_attempts} attempts.\n"
-                f"The jobs with the following slurm job ids were previously seen "
-                "by sacct, but sacct doesn't report them any more:\n"
-                f"{missing_sacct_status}\n"
-                f"Please double-check with your slurm cluster administrator, that "
-                "slurmdbd job accounting is properly set up.\n"
-            )
-
-        if status_of_jobs is not None:
-            any_finished = False
-            for j in active_jobs:
-                # the job probably didn't make it into slurmdbd yet, so
-                # `sacct` doesn't return it
-                if j.external_jobid not in status_of_jobs:
-                    # but the job should still be queueing or running and
-                    # appear in slurmdbd (and thus `sacct` output) later
-                    yield j
-                    continue
-                status = status_of_jobs[j.external_jobid]
-                if status == "COMPLETED":
-                    self.report_job_success(j)
-                    any_finished = True
-                    active_jobs_seen_by_sacct.remove(j.external_jobid)
-                elif status == "PREEMPTED" and not self._preemption_warning:
-                    self._preemption_warning = True
-                    self.logger.warning(
-                        """
-                        ===== A Job preemption occured! =====
-                        Leave Snakemake running, if possible. Otherwise Snakemake
-                        needs to restart this job upon a Snakemake restart.

-                        We leave it to SLURM to resume your job(s)"""
+        for job_info in active_jobs:
+            jobid = job_info.external_jobid
+            async with self.status_rate_limiter:
+                try:
+                    # Run scontrol command
+                    command = f"scontrol -o show job {jobid}"
+                    command_res = subprocess.check_output(
+                        command, text=True, shell=True, stderr=subprocess.PIPE
                     )
-                    yield j
-                elif status == "UNKNOWN":
-                    # the job probably does not exist anymore, but 'sacct' did not work
-                    # so we assume it is finished
-                    self.report_job_success(j)
-                    any_finished = True
-                    active_jobs_seen_by_sacct.remove(j.external_jobid)
-                elif status in fail_stati:
-                    msg = (
-                        f"SLURM-job '{j.external_jobid}' failed, SLURM status is: "
-                        # message ends with '. ', because it is proceeded
-                        # with a new sentence
-                        f"'{status}'. "
+                    # Parse JobState
+                    match = re.search(r'JobState=(\S+)', command_res)
+                    if match:
+                        status = match.group(1)
+                    else:
+                        # If JobState is not found, assume unknown status
+                        status = "UNKNOWN"
+
+                    self.logger.debug(f"Job {jobid} status: {status}")
+
+                    if status == "COMPLETED":
+                        self.report_job_success(job_info)
+                    elif status in fail_stati:
+                        msg = (
+                            f"SLURM job '{jobid}' failed with status '{status}'."
+                        )
+                        self.report_job_error(job_info, msg=msg, aux_logs=[job_info.aux["slurm_logfile"]])
+                    else:
+                        # Job is still running or pending
+                        yield job_info
+                except subprocess.CalledProcessError as e:
+                    # Handle errors from scontrol
+                    self.logger.error(
+                        f"Failed to get status of job {jobid} with scontrol: {e.stderr.strip()}"
                     )
-                    self.report_job_error(j, msg=msg, aux_logs=[j.aux["slurm_logfile"]])
-                    active_jobs_seen_by_sacct.remove(j.external_jobid)
-                else:  # still running?
-                    yield j
-
-        if not any_finished:
-            self.next_seconds_between_status_checks = min(
-                self.next_seconds_between_status_checks + 10, max_sleep_time
-            )
-        else:
-            self.next_seconds_between_status_checks = None
+                    # Assume job has failed
+                    msg = f"Failed to get status of job {jobid}."
+                    self.report_job_error(job_info, msg=msg, aux_logs=[job_info.aux["slurm_logfile"]])
+                except Exception as e:
+                    # Handle any other exceptions
+                    self.logger.error(f"Unexpected error while checking job {jobid}: {e}")
+                    # Assume job is still running
+                    yield job_info
+

     def cancel_jobs(self, active_jobs: List[SubmittedJobInfo]):
         # Cancel all active jobs.
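A minimal, self-contained sketch of the JobState parsing used in check_active_jobs above. It assumes `scontrol -o show job <jobid>` prints a single line of space-separated key=value pairs; the sample line below is illustrative, not real cluster output.

import re

def parse_job_state(scontrol_line: str) -> str:
    # Pull the JobState=<STATE> token out of one-line scontrol output;
    # fall back to UNKNOWN if the field is absent.
    match = re.search(r"JobState=(\S+)", scontrol_line)
    return match.group(1) if match else "UNKNOWN"

# Illustrative output line (not captured from a real cluster):
sample = "JobId=4242 JobName=demo JobState=RUNNING Reason=None NodeList=node001"
assert parse_job_state(sample) == "RUNNING"
assert parse_job_state("JobId=4242 JobName=demo") == "UNKNOWN"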
@@ -436,41 +339,6 @@ def cancel_jobs(self, active_jobs: List[SubmittedJobInfo]):
436339 f"(exit code { e .returncode } ){ msg } "
437340 ) from e
438341
-    async def job_stati(self, command):
-        """Obtain SLURM job status of all submitted jobs with sacct
-
-        Keyword arguments:
-        command -- a slurm command that returns one line for each job with:
-                   "<raw/main_job_id>|<long_status_string>"
-        """
-        res = query_duration = None
-        try:
-            time_before_query = time.time()
-            command_res = subprocess.check_output(
-                command, text=True, shell=True, stderr=subprocess.PIPE
-            )
-            query_duration = time.time() - time_before_query
-            self.logger.debug(
-                f"The job status was queried with command: {command}\n"
-                f"It took: {query_duration} seconds\n"
-                f"The output is:\n'{command_res}'\n"
-            )
-            res = {
-                # We split the second field in the output, as the State field
-                # could contain info beyond the JOB STATE CODE according to:
-                # https://slurm.schedmd.com/sacct.html#OPT_State
-                entry[0]: entry[1].split(sep=None, maxsplit=1)[0]
-                for entry in csv.reader(StringIO(command_res), delimiter="|")
-            }
-        except subprocess.CalledProcessError as e:
-            self.logger.error(
-                f"The job status query failed with command: {command}\n"
-                f"Error message: {e.stderr.strip()}\n"
-            )
-            pass
-
-        return (res, query_duration)
-
     def get_account_arg(self, job: JobExecutorInterface):
         """
         checks whether the desired account is valid,
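Likewise, the sacct query built in the job_stati helper yields one `<raw_jobid>|<State>` line per job, where the State field may carry extra detail such as `CANCELLED by <uid>`. A minimal sketch of that parsing, with illustrative values only:

import csv
from io import StringIO

def parse_sacct_output(output: str) -> dict:
    # Map raw job ids to the first token of their State field, mirroring
    # the dict comprehension in job_stati.
    return {
        row[0]: row[1].split(sep=None, maxsplit=1)[0]
        for row in csv.reader(StringIO(output), delimiter="|")
        if row
    }

# Illustrative `sacct -X --parsable2 --noheader --format=JobIdRaw,State` output:
sample = "4711|COMPLETED\n4712|CANCELLED by 1000\n"
assert parse_sacct_output(sample) == {"4711": "COMPLETED", "4712": "CANCELLED"}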