From 00c616dbc5604e6bb2ccd382e8a802d9ad6189e9 Mon Sep 17 00:00:00 2001 From: meesters Date: Thu, 3 Jul 2025 12:51:04 +0200 Subject: [PATCH 1/2] feat: attempt for detailed error reporting for job-steps --- snakemake_executor_plugin_slurm/__init__.py | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/snakemake_executor_plugin_slurm/__init__.py b/snakemake_executor_plugin_slurm/__init__.py index 627e9cdd..13d3e46d 100644 --- a/snakemake_executor_plugin_slurm/__init__.py +++ b/snakemake_executor_plugin_slurm/__init__.py @@ -497,11 +497,21 @@ async def check_active_jobs( any_finished = True active_jobs_seen_by_sacct.remove(j.external_jobid) elif status in fail_stati: + reason_command = f"sacct -j {j.external_jobid}.0 --format=Reason --noheader" + try: + reason_output = subprocess.check_output( + reason_command, shell=True, text=True, stderr=subprocess.PIPE + ).strip() + reason = reason_output if reason_output else "Unknown" + except subprocess.CalledProcessError as e: + reason = "Unable to retrieve reason" + self.logger.warning( + f"Failed to retrieve jobstep reason for SLURM job '{j.external_jobid}': {e.stderr.strip()}" + ) + msg = ( f"SLURM-job '{j.external_jobid}' failed, SLURM status is: " - # message ends with '. ', because it is proceeded - # with a new sentence - f"'{status}'. " + f"'{status}'. Reason: '{reason}'. " ) self.report_job_error( j, msg=msg, aux_logs=[j.aux["slurm_logfile"]._str] From 780a00a0aed9a07875726ca37a6449f84da868de Mon Sep 17 00:00:00 2001 From: meesters Date: Thu, 3 Jul 2025 12:57:45 +0200 Subject: [PATCH 2/2] feat: checking potentially up to 10 job steps - ready for MPI and Job Arrays --- snakemake_executor_plugin_slurm/__init__.py | 49 +++++++++++++++------ 1 file changed, 35 insertions(+), 14 deletions(-) diff --git a/snakemake_executor_plugin_slurm/__init__.py b/snakemake_executor_plugin_slurm/__init__.py index 13d3e46d..84a46a5e 100644 --- a/snakemake_executor_plugin_slurm/__init__.py +++ b/snakemake_executor_plugin_slurm/__init__.py @@ -497,22 +497,43 @@ async def check_active_jobs( any_finished = True active_jobs_seen_by_sacct.remove(j.external_jobid) elif status in fail_stati: - reason_command = f"sacct -j {j.external_jobid}.0 --format=Reason --noheader" - try: - reason_output = subprocess.check_output( - reason_command, shell=True, text=True, stderr=subprocess.PIPE - ).strip() - reason = reason_output if reason_output else "Unknown" - except subprocess.CalledProcessError as e: - reason = "Unable to retrieve reason" - self.logger.warning( - f"Failed to retrieve jobstep reason for SLURM job '{j.external_jobid}': {e.stderr.strip()}" + reasons = [] + for step in range(10): # Iterate over up to 10 job steps + reason_command = ( + f"sacct -j {j.external_jobid}.{step} " + "--format=Reason --noheader" ) + try: + reason_output = subprocess.check_output( + reason_command, + shell=True, + text=True, + stderr=subprocess.PIPE, + ).strip() + if reason_output: + reasons.append(f"Step {step}: {reason_output}") + except subprocess.CalledProcessError as e: + self.logger.warning( + f"Failed to retrieve jobstep reason for SLURM job " + f"'{j.external_jobid}.{step}': {e.stderr.strip()}" + ) + reasons.append(f"Step {step}: Unable to retrieve reason") - msg = ( - f"SLURM-job '{j.external_jobid}' failed, SLURM status is: " - f"'{status}'. Reason: '{reason}'. " - ) + if not reasons: + reasons.append("Unknown") + + if len(reasons) == 1: + msg = ( + f"SLURM-job '{j.external_jobid}' failed, " + f"SLURM status is: '{status}'. " + f"Reason: {reasons[0]}." + ) + else: + msg = ( + f"SLURM-job '{j.external_jobid}' failed, " + f"SLURM status is: '{status}'. " + f"Reasons: {', '.join(reasons)}." + ) self.report_job_error( j, msg=msg, aux_logs=[j.aux["slurm_logfile"]._str] )