@@ -4,11 +4,12 @@ ClusterManager for a Slurm allocation
44Represents the resources available within a slurm allocation created by salloc/sbatch.
55The environment variables `SLURM_JOBID` and `SLURM_NTASKS` must be defined to construct this object.
66"""
7- struct SlurmManager <: ClusterManager
7+ mutable struct SlurmManager <: ClusterManager
88 jobid:: Int
99 ntasks:: Int
1010 verbose:: Bool
1111 launch_timeout:: Float64
12+ srun_proc:: IO
1213
1314 function SlurmManager (;verbose= false , launch_timeout= 60.0 )
1415 if ! (" SLURM_JOBID" in keys (ENV ) && " SLURM_NTASKS" in keys (ENV ))
@@ -29,21 +30,19 @@ function launch(manager::SlurmManager, params::Dict, instances_arr::Array, c::Co
2930 exeflags = params[:exeflags ]
3031
3132 srun_cmd = ` srun -D $exehome $exename $exeflags --worker=$(cluster_cookie ()) `
32- srun_proc = open (srun_cmd)
33+ manager . srun_proc = open (srun_cmd)
3334
3435 t = @async for i in 1 : manager. ntasks
3536 manager. verbose && println (" connecting to worker $i out of $(manager. ntasks) " )
3637
37- line = readline (srun_proc)
38+ line = readline (manager . srun_proc)
3839 m = match (r" .*:(\d *)#(.*)" , line)
3940 m === nothing && error (" could not parse $line " )
4041
4142 config = WorkerConfig ()
4243 config. port = parse (Int, m[1 ])
4344 config. host = strip (m[2 ])
4445
45- # Keep a reference to the proc, so it's properly closed once the last worker exits.
46- config. userdata = srun_proc
4746 push! (instances_arr, config)
4847 notify (c)
4948 end
@@ -56,11 +55,17 @@ function launch(manager::SlurmManager, params::Dict, instances_arr::Array, c::Co
5655 wait (t)
5756
5857 # redirect output
59- @async while ! eof (srun_proc)
60- line = readline (srun_proc)
58+ @async while ! eof (manager . srun_proc)
59+ line = readline (manager . srun_proc)
6160 println (line)
6261 end
6362
63+ # wait to make sure that srun_proc exits before main program to avoid slurm complaining
64+ # avoids "Job step aborted: Waiting up to 32 seconds for job step to finish" message
65+ finalizer (manager) do manager
66+ wait (manager. srun_proc)
67+ end
68+
6469 catch e
6570 println (" Error launching Slurm job:" )
6671 rethrow (e)
0 commit comments