
Commit 7ebf64a

Add new attributes to db.Jobs and truncate_time option to db.JobFilter (#321)
1 parent 4ecdfa0 commit 7ebf64a

File tree: 5 files changed, +185 additions, -63 deletions

CHANGELOG.md

Lines changed: 8 additions & 0 deletions
@@ -13,6 +13,14 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 - New Classes to interact with Database QoS (WIP)
     - `pyslurm.db.QualityOfService`
     - `pyslurm.db.QualitiesOfService`
+- Add `truncate_time` option to `pyslurm.db.JobFilter`, which is the same as the
+  `-T` / `--truncate` option from `sacct`.
+- Add new attributes to `pyslurm.db.Jobs` that make gathering statistics for a
+  collection of Jobs more convenient.
+- Fix `allocated_gres` attribute in the `pyslurm.Node` class returning nothing.
+- Add new `idle_memory` and `allocated_tres` attributes to the `pyslurm.Node` class.
+- Fix Node State being displayed as `ALLOCATED` when it should actually be
+  `MIXED`.
 
 ## [23.2.2](https://github.com/PySlurm/pyslurm/releases/tag/v23.2.2) - 2023-07-18
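For orientation before the file diffs below, here is a hedged usage sketch combining the two headline changes: filtering with `truncate_time` and reading the new collection-wide statistics. It assumes `JobFilter` accepts `start_time`/`end_time` as keyword arguments (they are only mentioned in the docstring touched by this commit), and the time strings are purely illustrative.

    import pyslurm

    # Count only the part of each job that falls inside the window, like
    # `sacct -T` / `--truncate`; the time format used here is an assumption.
    db_filter = pyslurm.db.JobFilter(
        start_time="2023-07-01T00:00:00",
        end_time="2023-07-02T00:00:00",
        truncate_time=True,
    )
    jobs = pyslurm.db.Jobs.load(db_filter)

    # New aggregate attributes added to pyslurm.db.Jobs by this commit:
    print(jobs.cpus, jobs.nodes, jobs.memory)
    print(jobs.total_cpu_time, jobs.elapsed_cpu_time)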

pyslurm/db/job.pxd

Lines changed: 58 additions & 3 deletions
@@ -123,6 +123,12 @@ cdef class JobFilter:
             Instruct the slurmdbd to also send the job environment(s)
             Note: This requires explicitly specifying job ids, and is mutually
             exclusive with `with_script`
+        truncate_time (bool):
+            Truncate start and end time.
+            For example, when a Job has actually started before the requested
+            `start_time`, the time will be truncated to `start_time`. Same
+            logic applies for `end_time`. This is like the `-T` / `--truncate`
+            option from `sacct`.
     """
     cdef slurmdb_job_cond_t *ptr
 
@@ -149,11 +155,60 @@ cdef class JobFilter:
         nodelist
         with_script
         with_env
+        truncate_time
 
 
 cdef class Jobs(MultiClusterMap):
-    """A [`Multi Cluster`][pyslurm.xcollections.MultiClusterMap] collection of [pyslurm.db.Job][] objects."""
-    pass
+    """A [`Multi Cluster`][pyslurm.xcollections.MultiClusterMap] collection of [pyslurm.db.Job][] objects.
+
+    Args:
+        jobs (Union[list[int], dict[int, pyslurm.db.Job], str], optional=None):
+            Jobs to initialize this collection with.
+
+    Attributes:
+        consumed_energy (int):
+            Total amount of energy consumed, in joules.
+        disk_read (int):
+            Total amount of bytes read.
+        disk_write (int):
+            Total amount of bytes written.
+        page_faults (int):
+            Total amount of page faults.
+        resident_memory (int):
+            Total Resident Set Size (RSS) used, in bytes.
+        virtual_memory (int):
+            Total Virtual Memory Size (VSZ) used, in bytes.
+        elapsed_cpu_time (int):
+            Total amount of time used (elapsed time * cpu count), in seconds.
+            This is not the real CPU-Efficiency, but rather the total amount
+            of cpu-time the CPUs were occupied for.
+        total_cpu_time (int):
+            Sum of `user_cpu_time` and `system_cpu_time`, in seconds.
+        user_cpu_time (int):
+            Total amount of time spent in user space, in seconds.
+        system_cpu_time (int):
+            Total amount of time spent in kernel space, in seconds.
+        cpus (int):
+            Total amount of cpus.
+        nodes (int):
+            Total amount of nodes.
+        memory (int):
+            Total amount of requested memory, in Mebibytes.
+    """
+    cdef public:
+        consumed_energy
+        disk_read
+        disk_write
+        page_faults
+        resident_memory
+        virtual_memory
+        elapsed_cpu_time
+        total_cpu_time
+        user_cpu_time
+        system_cpu_time
+        cpus
+        nodes
+        memory
 
 
 cdef class Job:
@@ -252,7 +307,7 @@ cdef class Job:
             Amount of CPUs the Job has/had allocated, or, if the Job is still
             pending, this will reflect the amount requested.
         memory (int):
-            Amount of memory the Job requested in total
+            Amount of memory the Job requested in total, in Mebibytes
         reservation (str):
             Name of the Reservation for this Job
         script (str):
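Since the docstring above stresses that `elapsed_cpu_time` is elapsed time multiplied by CPU count rather than a real CPU-efficiency figure, a rough utilization ratio for a loaded collection can be derived from the two aggregates. This is an illustrative calculation on top of the new attributes, not an API added by this commit; `jobs` is the collection loaded in the sketch further up.

    # total_cpu_time   = user + system CPU seconds actually consumed
    # elapsed_cpu_time = wall-clock seconds * allocated CPUs (the upper bound)
    if jobs.elapsed_cpu_time:
        utilization = jobs.total_cpu_time / jobs.elapsed_cpu_time
        print(f"aggregate CPU utilization: {utilization:.1%}")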

pyslurm/db/job.pyx

Lines changed: 30 additions & 3 deletions
@@ -29,6 +29,10 @@ from typing import Any
 from pyslurm.utils.uint import *
 from pyslurm.settings import LOCAL_CLUSTER
 from pyslurm import xcollections
+from pyslurm.db.stats import (
+    reset_stats_for_job_collection,
+    add_stats_to_job_collection,
+)
 from pyslurm.utils.ctime import (
     date_to_timestamp,
     timestr_to_mins,
@@ -146,6 +150,9 @@ cdef class JobFilter:
         if self.nodelist:
             cstr.fmalloc(&ptr.used_nodes,
                          nodelist_to_range_str(self.nodelist))
+
+        if self.truncate_time:
+            ptr.flags &= ~slurm.JOBCOND_FLAG_NO_TRUNC
 
         if self.ids:
             # These are only allowed by the slurmdbd when specific jobs are
@@ -196,6 +203,7 @@ cdef class Jobs(MultiClusterMap):
                          val_type=Job,
                          id_attr=Job.id,
                          key_type=int)
+        self._reset_stats()
 
     @staticmethod
     def load(JobFilter db_filter=None, Connection db_connection=None):
@@ -275,15 +283,35 @@ cdef class Jobs(MultiClusterMap):
             job = Job.from_ptr(<slurmdb_job_rec_t*>job_ptr.data)
             job.qos_data = qos_data
             job._create_steps()
-            JobStatistics._sum_step_stats_for_job(job, job.steps)
+            job.stats = JobStatistics.from_job_steps(job)
 
             cluster = job.cluster
             if cluster not in out.data:
                 out.data[cluster] = {}
             out[cluster][job.id] = job
 
+            add_stats_to_job_collection(out, job.stats)
+            out.cpus += job.cpus
+            out.nodes += job.num_nodes
+            out.memory += job.memory
+
         return out
 
+    def _reset_stats(self):
+        reset_stats_for_job_collection(self)
+        self.cpus = 0
+        self.nodes = 0
+        self.memory = 0
+
+    def calc_stats(self):
+        """(Re)Calculate Statistics for the Job Collection."""
+        self._reset_stats()
+        for job in self.values():
+            add_stats_to_job_collection(self, job.stats)
+            self.cpus += job.cpus
+            self.nodes += job.num_nodes
+            self.memory += job.memory
+
     @staticmethod
     def modify(db_filter, Job changes, db_connection=None):
         """Modify Slurm database Jobs.
@@ -445,7 +473,6 @@ cdef class Job:
         cdef Job wrap = Job.__new__(Job)
         wrap.ptr = in_ptr
         wrap.steps = JobSteps.__new__(JobSteps)
-        wrap.stats = JobStatistics()
         return wrap
 
     @staticmethod
@@ -738,7 +765,7 @@ cdef class Job:
         else:
             # Job is still pending, so we return the number of requested cpus
             # instead.
-            return u32_parse(self.ptr.req_cpus)
+            return u32_parse(self.ptr.req_cpus, on_noval=0, zero_is_noval=False)
 
     @property
     def memory(self):
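Per the diff above, `Jobs.load()` now fills the aggregate attributes while iterating the records, and `calc_stats()` can rebuild them later from each `job.stats`. A small sketch of that round trip, with no assumptions beyond what the diff shows:

    import pyslurm

    jobs = pyslurm.db.Jobs.load()
    print(jobs.consumed_energy, jobs.disk_read)

    # calc_stats() zeroes the totals and re-adds every job.stats, so it can be
    # called again whenever the aggregates should be refreshed.
    jobs.calc_stats()
    print(jobs.consumed_energy, jobs.disk_read)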

pyslurm/db/stats.pxd

Lines changed: 3 additions & 0 deletions
@@ -139,6 +139,9 @@ cdef class JobStatistics:
         user_cpu_time
         system_cpu_time
 
+    @staticmethod
+    cdef JobStatistics from_job_steps(Job job)
+
     @staticmethod
     cdef JobStatistics from_step(JobStep step)
 
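The declaration above pairs with the implementation in pyslurm/db/stats.pyx below: during `Jobs.load()`, each `Job.stats` is now built by `JobStatistics.from_job_steps()` instead of the removed `_sum_step_stats_for_job()`. Per-job statistics therefore remain reachable the same way as before, for example (reusing the `jobs` collection loaded earlier):

    for job in jobs.values():
        # job.stats is the JobStatistics built from the job's steps at load time
        print(job.id, job.stats.total_cpu_time, job.stats.max_resident_memory)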

pyslurm/db/stats.pyx

Lines changed: 86 additions & 57 deletions
@@ -28,6 +28,32 @@ from pyslurm.utils.helpers import (
 )
 
 
+def reset_stats_for_job_collection(jobs):
+    jobs.consumed_energy = 0
+    jobs.disk_read = 0
+    jobs.disk_write = 0
+    jobs.page_faults = 0
+    jobs.resident_memory = 0
+    jobs.virtual_memory = 0
+    jobs.elapsed_cpu_time = 0
+    jobs.total_cpu_time = 0
+    jobs.user_cpu_time = 0
+    jobs.system_cpu_time = 0
+
+
+def add_stats_to_job_collection(jobs, JobStatistics js):
+    jobs.consumed_energy += js.consumed_energy
+    jobs.disk_read += js.avg_disk_read
+    jobs.disk_write += js.avg_disk_write
+    jobs.page_faults += js.avg_page_faults
+    jobs.resident_memory += js.avg_resident_memory
+    jobs.virtual_memory += js.avg_virtual_memory
+    jobs.elapsed_cpu_time += js.elapsed_cpu_time
+    jobs.total_cpu_time += js.total_cpu_time
+    jobs.user_cpu_time += js.user_cpu_time
+    jobs.system_cpu_time += js.system_cpu_time
+
+
 cdef class JobStatistics:
 
     def __init__(self):
@@ -50,6 +76,21 @@ cdef class JobStatistics:
     def to_dict(self):
         return instance_to_dict(self)
 
+    @staticmethod
+    cdef JobStatistics from_job_steps(Job job):
+        cdef JobStatistics job_stats = JobStatistics()
+
+        for step in job.steps.values():
+            job_stats._add_base_stats(step.stats)
+
+        job_stats._sum_cpu_time(job)
+
+        step_count = len(job.steps)
+        if step_count:
+            job_stats.avg_cpu_frequency /= step_count
+
+        return job_stats
+
     @staticmethod
     cdef JobStatistics from_step(JobStep step):
         cdef JobStatistics wrap = JobStatistics()
@@ -140,68 +181,56 @@ cdef class JobStatistics:
 
         return wrap
 
-    @staticmethod
-    def _sum_step_stats_for_job(Job job, JobSteps steps):
-        cdef:
-            JobStatistics job_stats = job.stats
-            JobStatistics step_stats = None
-
-        for step in steps.values():
-            step_stats = step.stats
-
-            job_stats.consumed_energy += step_stats.consumed_energy
-            job_stats.avg_cpu_time += step_stats.avg_cpu_time
-            job_stats.avg_cpu_frequency += step_stats.avg_cpu_frequency
-            job_stats.avg_disk_read += step_stats.avg_disk_read
-            job_stats.avg_disk_write += step_stats.avg_disk_write
-            job_stats.avg_page_faults += step_stats.avg_page_faults
-
-            if step_stats.max_disk_read >= job_stats.max_disk_read:
-                job_stats.max_disk_read = step_stats.max_disk_read
-                job_stats.max_disk_read_node = step_stats.max_disk_read_node
-                job_stats.max_disk_read_task = step_stats.max_disk_read_task
-
-            if step_stats.max_disk_write >= job_stats.max_disk_write:
-                job_stats.max_disk_write = step_stats.max_disk_write
-                job_stats.max_disk_write_node = step_stats.max_disk_write_node
-                job_stats.max_disk_write_task = step_stats.max_disk_write_task
-
-            if step_stats.max_page_faults >= job_stats.max_page_faults:
-                job_stats.max_page_faults = step_stats.max_page_faults
-                job_stats.max_page_faults_node = step_stats.max_page_faults_node
-                job_stats.max_page_faults_task = step_stats.max_page_faults_task
-
-            if step_stats.max_resident_memory >= job_stats.max_resident_memory:
-                job_stats.max_resident_memory = step_stats.max_resident_memory
-                job_stats.max_resident_memory_node = step_stats.max_resident_memory_node
-                job_stats.max_resident_memory_task = step_stats.max_resident_memory_task
-                job_stats.avg_resident_memory = job_stats.max_resident_memory
-
-            if step_stats.max_virtual_memory >= job_stats.max_virtual_memory:
-                job_stats.max_virtual_memory = step_stats.max_virtual_memory
-                job_stats.max_virtual_memory_node = step_stats.max_virtual_memory_node
-                job_stats.max_virtual_memory_task = step_stats.max_virtual_memory_task
-                job_stats.avg_virtual_memory = job_stats.max_virtual_memory
-
-            if step_stats.min_cpu_time >= job_stats.min_cpu_time:
-                job_stats.min_cpu_time = step_stats.min_cpu_time
-                job_stats.min_cpu_time_node = step_stats.min_cpu_time_node
-                job_stats.min_cpu_time_task = step_stats.min_cpu_time_task
-
+    def _add_base_stats(self, JobStatistics src):
+        self.consumed_energy += src.consumed_energy
+        self.avg_cpu_time += src.avg_cpu_time
+        self.avg_cpu_frequency += src.avg_cpu_frequency
+        self.avg_disk_read += src.avg_disk_read
+        self.avg_disk_write += src.avg_disk_write
+        self.avg_page_faults += src.avg_page_faults
+
+        if src.max_disk_read >= self.max_disk_read:
+            self.max_disk_read = src.max_disk_read
+            self.max_disk_read_node = src.max_disk_read_node
+            self.max_disk_read_task = src.max_disk_read_task
+
+        if src.max_disk_write >= self.max_disk_write:
+            self.max_disk_write = src.max_disk_write
+            self.max_disk_write_node = src.max_disk_write_node
+            self.max_disk_write_task = src.max_disk_write_task
+
+        if src.max_page_faults >= self.max_page_faults:
+            self.max_page_faults = src.max_page_faults
+            self.max_page_faults_node = src.max_page_faults_node
+            self.max_page_faults_task = src.max_page_faults_task
+
+        if src.max_resident_memory >= self.max_resident_memory:
+            self.max_resident_memory = src.max_resident_memory
+            self.max_resident_memory_node = src.max_resident_memory_node
+            self.max_resident_memory_task = src.max_resident_memory_task
+            self.avg_resident_memory = self.max_resident_memory
+
+        if src.max_virtual_memory >= self.max_virtual_memory:
+            self.max_virtual_memory = src.max_virtual_memory
+            self.max_virtual_memory_node = src.max_virtual_memory_node
+            self.max_virtual_memory_task = src.max_virtual_memory_task
+            self.avg_virtual_memory = self.max_virtual_memory
+
+        if src.min_cpu_time >= self.min_cpu_time:
+            self.min_cpu_time = src.min_cpu_time
+            self.min_cpu_time_node = src.min_cpu_time_node
+            self.min_cpu_time_task = src.min_cpu_time_task
+
+    def _sum_cpu_time(self, Job job):
         if job.ptr.tot_cpu_sec != slurm.NO_VAL64:
-            job_stats.total_cpu_time = job.ptr.tot_cpu_sec
+            self.total_cpu_time += job.ptr.tot_cpu_sec
 
         if job.ptr.user_cpu_sec != slurm.NO_VAL64:
-            job_stats.user_cpu_time = job.ptr.user_cpu_sec
+            self.user_cpu_time += job.ptr.user_cpu_sec
 
         if job.ptr.sys_cpu_sec != slurm.NO_VAL64:
-            job_stats.system_cpu_time = job.ptr.sys_cpu_sec
+            self.system_cpu_time += job.ptr.sys_cpu_sec
 
         elapsed = job.elapsed_time if job.elapsed_time else 0
         cpus = job.cpus if job.cpus else 0
-        job_stats.elapsed_cpu_time = elapsed * cpus
-
-        step_count = len(steps)
-        if step_count:
-            job_stats.avg_cpu_frequency /= step_count
-
+        self.elapsed_cpu_time += elapsed * cpus
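To make the per-field logic of `_add_base_stats()` explicit, here is a plain-Python sketch of the same aggregation pattern: sums and averages are accumulated directly, while each maximum carries along the node/task where it was observed. `StepSample` and `merge_max` are illustrative names only, not part of pyslurm.

    from dataclasses import dataclass
    from typing import Optional

    @dataclass
    class StepSample:
        # Illustrative stand-in for one step's statistics.
        max_disk_read: int = 0
        max_disk_read_node: Optional[str] = None
        max_disk_read_task: Optional[int] = None

    def merge_max(dst, src, name):
        # Keep the larger maximum and remember the node/task it was seen on,
        # mirroring what _add_base_stats() does for every max_* field above.
        if getattr(src, f"max_{name}") >= getattr(dst, f"max_{name}"):
            setattr(dst, f"max_{name}", getattr(src, f"max_{name}"))
            setattr(dst, f"max_{name}_node", getattr(src, f"max_{name}_node"))
            setattr(dst, f"max_{name}_task", getattr(src, f"max_{name}_task"))

    total = StepSample()
    for step in (StepSample(4096, "node1", 0), StepSample(8192, "node2", 3)):
        merge_max(total, step, "disk_read")
    print(total)  # max_disk_read=8192, observed on node2, task 3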
