@@ -349,20 +349,19 @@ async def test_torque(db, event_loop):
349349 re .compile (r"ppn=5" ),
350350 re .compile (r"^#PBS some_option_asdf" , re .M ),
351351 ]
352+ poll_running = (
353+ re .compile (r"sudo.*qstat" ),
354+ f"<job_state>R</job_state><exec_host>{ testhost } /1</exec_host>" ,
355+ )
352356 script = [
353357 (re .compile (r"sudo.*qsub" ), str (testjob )),
354358 (
355359 re .compile (r"sudo.*qstat" ),
356360 "<job_state>Q</job_state><exec_host></exec_host>" ,
357361 ), # pending
358- (
359- re .compile (r"sudo.*qstat" ),
360- f"<job_state>R</job_state><exec_host>{ testhost } /1</exec_host>" ,
361- ), # running
362- (
363- re .compile (r"sudo.*qstat" ),
364- f"<job_state>R</job_state><exec_host>{ testhost } /1</exec_host>" ,
365- ), # running
362+ poll_running ,
363+ poll_running ,
364+ poll_running ,
366365 (re .compile (r"sudo.*qdel" ), "STOP" ),
367366 (re .compile (r"sudo.*qstat" ), "" ),
368367 ]
@@ -394,17 +393,16 @@ async def test_moab(db, event_loop):
394393 re .compile (r"ppn=5" ),
395394 re .compile (r"^#PBS some_option_asdf" , re .M ),
396395 ]
396+ poll_running = (
397+ re .compile (r"sudo.*mdiag" ),
398+ f'State="Running" AllocNodeList="{ testhost } "' ,
399+ )
397400 script = [
398401 (re .compile (r"sudo.*msub" ), str (testjob )),
399402 (re .compile (r"sudo.*mdiag" ), 'State="Idle"' ), # pending
400- (
401- re .compile (r"sudo.*mdiag" ),
402- f'State="Running" AllocNodeList="{ testhost } "' ,
403- ), # running
404- (
405- re .compile (r"sudo.*mdiag" ),
406- f'State="Running" AllocNodeList="{ testhost } "' ,
407- ), # running
403+ poll_running ,
404+ poll_running ,
405+ poll_running ,
408406 (re .compile (r"sudo.*mjobctl.*-c" ), "STOP" ),
409407 (re .compile (r"sudo.*mdiag" ), "" ),
410408 ]
@@ -436,17 +434,16 @@ async def test_pbs(db, event_loop):
436434 re .compile (r"@some_pbs_admin_node" ),
437435 re .compile (r"^#PBS some_option_asdf" , re .M ),
438436 ]
437+ poll_running = (
438+ re .compile (r"sudo.*qstat" ),
439+ f"job_state = R\n exec_host = { testhost } /2*1" ,
440+ )
439441 script = [
440442 (re .compile (r"sudo.*qsub" ), str (testjob )),
441443 (re .compile (r"sudo.*qstat" ), "job_state = Q" ), # pending
442- (
443- re .compile (r"sudo.*qstat" ),
444- f"job_state = R\n exec_host = { testhost } /2*1" ,
445- ), # running
446- (
447- re .compile (r"sudo.*qstat" ),
448- f"job_state = R\n exec_host = { testhost } /2*1" ,
449- ), # running
444+ poll_running ,
445+ poll_running ,
446+ poll_running ,
450447 (re .compile (r"sudo.*qdel" ), "STOP" ),
451448 (re .compile (r"sudo.*qstat" ), "" ),
452449 ]
@@ -504,6 +501,7 @@ async def test_slurm(db, event_loop):
504501 ), # unknown
505502 (re .compile (r"sudo.*squeue" ), "RUNNING " + testhost ), # running
506503 (re .compile (r"sudo.*squeue" ), "RUNNING " + testhost ),
504+ (re .compile (r"sudo.*squeue" ), "RUNNING " + testhost ),
507505 (re .compile (r"sudo.*scancel" ), "STOP" ),
508506 (re .compile (r"sudo.*squeue" ), "" ),
509507]
@@ -573,6 +571,7 @@ async def test_condor(db, event_loop):
573571 (re .compile (r"sudo.*condor_q" ), "1," ), # pending
573571 (re .compile (r"sudo.*condor_q" ), f"2, @{ testhost } " ), # running
575573 (re .compile (r"sudo.*condor_q" ), f"2, @{ testhost } " ),
574+ (re .compile (r"sudo.*condor_q" ), f"2, @{ testhost } " ),
576575 (re .compile (r"sudo.*condor_rm" ), "STOP" ),
577576 (re .compile (r"sudo.*condor_q" ), "" ),
578577 ]
@@ -611,6 +610,7 @@ async def test_lfs(db, event_loop):
611610 (re .compile (r"sudo.*bjobs" ), "PEND " ), # pending
612611 (re .compile (r"sudo.*bjobs" ), f"RUN { testhost } " ), # running
613612 (re .compile (r"sudo.*bjobs" ), f"RUN { testhost } " ),
613+ (re .compile (r"sudo.*bjobs" ), f"RUN { testhost } " ),
614614 (re .compile (r"sudo.*bkill" ), "STOP" ),
615615 (re .compile (r"sudo.*bjobs" ), "" ),
616616 ]
@@ -652,3 +652,19 @@ async def test_keepvars(db, event_loop):
652652 spawner_kwargs = spawner_kwargs ,
653653 batch_script_re_list = batch_script_re_list ,
654654 )
655+
656+
async def test_early_stop(db, event_loop):
    """Expect a RuntimeError when the job vanishes from the queue during start.

    The scripted command/reply pairs are: one ``sbatch`` submission answered
    with ``str(testjob)``, three ``squeue`` polls (pending, a controller
    error, then an empty reply), and a final ``scancel`` answered with
    "STOP". The script is handed to ``run_spawner_script`` together with
    ``SlurmSpawner``, and the call must raise ``RuntimeError`` whose message
    matches "job has disappeared".

    ``event_loop`` is not used in the body (presumably a fixture requested
    for its side effect -- confirm against the test configuration).
    """
    squeue = re.compile(r"sudo.*squeue")
    submit = (re.compile(r"sudo.*sbatch"), str(testjob))
    cancel = (re.compile(r"sudo.*scancel"), "STOP")

    # Replies returned for the successive squeue polls, in order.
    poll_replies = (
        "PENDING ",  # pending
        "slurm_load_jobs error: Unable to contact slurm controller",  # unknown
        "",  # job exits early during start
    )

    script = [submit, *((squeue, reply) for reply in poll_replies), cancel]

    with pytest.raises(RuntimeError, match="job has disappeared"):
        await run_spawner_script(db, SlurmSpawner, script)
0 commit comments