Skip to content

Commit ebeca95

Browse files
committed
BF: CS-1054 with PE setting ign_sreq_on_mhost=true, a globally defined resource that is requested for slave scope is not booked into the resource diagram
1 parent d7911cb commit ebeca95

File tree

7 files changed

+36
-20
lines changed

7 files changed

+36
-20
lines changed

source/daemons/qmaster/setup_qmaster.cc

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1278,6 +1278,7 @@ static void debit_all_jobs_from_qs() {
12781278
const lListElem *pe = lGetObject(jatep, JAT_pe_object);
12791279

12801280
/* don't look at states - we only trust in "granted destin. ident. list" */
1281+
bool do_per_global_host_booking = true;
12811282
const char *last_hostname = nullptr;
12821283
for_each_ep(gdi, lGetList(jatep, JAT_granted_destin_identifier_list)) {
12831284
u_long32 ar_id = lGetUlong(jep, JB_ar);
@@ -1300,7 +1301,7 @@ static void debit_all_jobs_from_qs() {
13001301
debit_host_consumable(jep, jatep, pe,
13011302
host_list_locate(*ocs::DataStore::get_master_list(SGE_TYPE_EXECHOST),
13021303
SGE_GLOBAL_NAME), master_centry_list, slots,
1303-
master_task, do_per_host_booking, nullptr);
1304+
master_task, do_per_global_host_booking, nullptr);
13041305
debit_host_consumable(jep, jatep, pe,
13051306
host_list_locate(*ocs::DataStore::get_master_list(SGE_TYPE_EXECHOST),
13061307
lGetHost(qep, QU_qhostname)), master_centry_list,
@@ -1329,6 +1330,7 @@ static void debit_all_jobs_from_qs() {
13291330
}
13301331
}
13311332
master_task = false;
1333+
do_per_global_host_booking = false;
13321334
}
13331335
}
13341336
}

source/daemons/qmaster/sge_centry_qmaster.cc

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -528,6 +528,7 @@ void centry_redebit_consumables(const lList *centries, u_long64 gdi_version) {
528528
const char *last_hostname = nullptr;
529529
const lListElem *pe = lGetObject(jatep, JAT_pe_object);
530530

531+
bool do_per_global_host_booking = true;
531532
for_each_ep(gdil, lGetList(jatep, JAT_granted_destin_identifier_list)) {
532533
int qslots;
533534

@@ -539,6 +540,9 @@ void centry_redebit_consumables(const lList *centries, u_long64 gdi_version) {
539540

540541
qslots = lGetUlong(gdil, JG_slots);
541542

543+
debit_host_consumable(jep, jatep, pe, host_list_locate(master_ehost_list, SGE_GLOBAL_NAME),
544+
master_centry_list, slots, master_task, do_per_global_host_booking, nullptr);
545+
542546
bool do_per_host_booking = host_do_per_host_booking(&last_hostname, lGetHost(gdil, JG_qhostname));
543547
debit_host_consumable(jep, jatep, pe, host_list_locate(master_ehost_list,
544548
lGetHost(qep, QU_qhostname)),
@@ -548,9 +552,8 @@ void centry_redebit_consumables(const lList *centries, u_long64 gdi_version) {
548552
nullptr);
549553
slots += qslots;
550554
master_task = false;
555+
do_per_global_host_booking = false;
551556
}
552-
debit_host_consumable(jep, jatep, pe, host_list_locate(master_ehost_list, SGE_GLOBAL_NAME),
553-
master_centry_list, slots, true, true, nullptr);
554557
}
555558
}
556559

source/daemons/qmaster/sge_give_jobs.cc

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -965,6 +965,7 @@ sge_commit_job(lListElem *jep, lListElem *jatep, lListElem *jr, sge_commit_mode_
965965
MSG_LOG_SENT2EXECD);
966966

967967
global_host_ep = host_list_locate(master_exechost_list, "global");
968+
bool do_per_global_host_booking = true;
968969
const char *last_hostname = nullptr;
969970
const lListElem *gdil_ep;
970971
const lList *gdil = lGetList(jatep, JAT_granted_destin_identifier_list);
@@ -1009,7 +1010,7 @@ sge_commit_job(lListElem *jep, lListElem *jatep, lListElem *jr, sge_commit_mode_
10091010

10101011
/* debit consumable resources */
10111012
if (debit_host_consumable(jep, jatep, pe, global_host_ep, master_centry_list, tmp_slot, master_task,
1012-
do_per_host_booking,
1013+
do_per_global_host_booking,
10131014
nullptr) > 0) {
10141015
/* this info is not spooled */
10151016
sge_add_event(0, sgeE_EXECHOST_MOD, 0, 0,
@@ -1056,6 +1057,7 @@ sge_commit_job(lListElem *jep, lListElem *jatep, lListElem *jr, sge_commit_mode_
10561057
}
10571058
}
10581059
master_task = false;
1060+
do_per_global_host_booking = false;
10591061
}
10601062

10611063
lSetUlong64(jatep, JAT_wallclock_limit, task_wallclock);
@@ -1535,6 +1537,7 @@ sge_clear_granted_resources(lListElem *job, lListElem *ja_task, int incslots, mo
15351537
cqueue_list_x_on_subordinate_gdil(master_cqueue_list, false, gdi_list, monitor, gdi_session);
15361538

15371539
global_host_ep = host_list_locate(master_exechost_list, SGE_GLOBAL_NAME);
1540+
bool do_per_global_host_booking = true;
15381541

15391542
const char *pe_name = lGetString(ja_task, JAT_granted_pe);
15401543
lListElem *pe = lGetObject(ja_task, JAT_pe_object);
@@ -1566,7 +1569,7 @@ sge_clear_granted_resources(lListElem *job, lListElem *ja_task, int incslots, mo
15661569

15671570
/* undebit consumable resources */
15681571
if (debit_host_consumable(job, ja_task, pe, global_host_ep, master_centry_list, -tmp_slot, master_task,
1569-
do_per_host_booking, nullptr) > 0) {
1572+
do_per_global_host_booking, nullptr) > 0) {
15701573
/* this info is not spooled */
15711574
sge_add_event(0, sgeE_EXECHOST_MOD, 0, 0, SGE_GLOBAL_NAME, nullptr, nullptr, global_host_ep, gdi_session);
15721575
ocs::ReportingFileWriter::create_host_consumable_records(&answer_list, global_host_ep, job, now);
@@ -1613,6 +1616,7 @@ sge_clear_granted_resources(lListElem *job, lListElem *ja_task, int incslots, mo
16131616
}
16141617
}
16151618
master_task = false;
1619+
do_per_global_host_booking = false;
16161620
}
16171621

16181622
/* free granted resources of the parallel environment */

source/libs/sched/debit.cc

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -282,6 +282,7 @@ debit_job_from_hosts(lListElem *job, lListElem *ja_task, const lListElem *pe, lL
282282
so = lParseSortOrderVarArg(lGetListDescr(host_list), "%I+", EH_sort_value);
283283

284284
global = host_list_locate(host_list, "global");
285+
bool do_per_global_host_booking = true;
285286

286287
load_formula = sconf_get_load_formula();
287288

@@ -304,9 +305,10 @@ debit_job_from_hosts(lListElem *job, lListElem *ja_task, const lListElem *pe, lL
304305
}
305306

306307
debit_host_consumable(job, ja_task, pe, host_list_locate(host_list, SGE_GLOBAL_NAME), centry_list, slots,
307-
is_master_task, do_per_host_booking, nullptr);
308+
is_master_task, do_per_global_host_booking, nullptr);
308309
debit_host_consumable(job, ja_task, pe, hep, centry_list, slots, is_master_task, do_per_host_booking, nullptr);
309310
is_master_task = false;
311+
do_per_global_host_booking = false;
310312

311313
/* compute new combined load for this host and put it into the host */
312314
old_sort_value = lGetDouble(hep, EH_sort_value);

source/libs/sched/sge_resource_utilization.cc

Lines changed: 11 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -269,7 +269,6 @@ int utilization_add(lListElem *cr, u_long64 start_time, u_long64 duration, doubl
269269
lList *resource_diagram;
270270
lListElem *thiz, *prev, *start, *end;
271271
const char *name = lGetString(cr, RUE_name);
272-
char level_char = CENTRY_LEVEL_TO_CHAR(level);
273272
u_long64 end_time;
274273
int nm;
275274
double util_prev;
@@ -299,7 +298,7 @@ int utilization_add(lListElem *cr, u_long64 start_time, u_long64 duration, doubl
299298
end_time = utilization_endtime(start_time, duration);
300299

301300
serf_record_entry(job_id, ja_taskid, (type!=nullptr)?type:"<unknown>", start_time, end_time,
302-
level_char, object_name, name, utilization);
301+
level, object_name, name, utilization);
303302

304303
/* ensure resource diagram is initialized */
305304
if (resource_diagram == nullptr) {
@@ -705,14 +704,10 @@ int add_job_utilization(const sge_assignment_t *a, const char *type, bool for_jo
705704
a->job_id, a->ja_task_id, PE_TAG, lGetString(a->pe, PE_name), type, for_job_scheduling, false);
706705
}
707706

708-
/* global */
709-
rc_add_job_utilization(a->job, a->pe, a->ja_task_id, type, a->gep, a->centry_list, a->slots,
710-
EH_consumable_config_list, EH_resource_utilization, SGE_GLOBAL_NAME,
711-
a->start, a->duration, GLOBAL_TAG, for_job_scheduling, true, true);
712-
713707
bool is_master_task = true;
714708
const lListElem *gdil_ep;
715709
const char *last_eh_name = nullptr;
710+
bool do_per_global_host_booking = true;
716711
for_each_ep(gdil_ep, a->gdil) {
717712
int slots = lGetUlong(gdil_ep, JG_slots);
718713
const char *eh_name = lGetHost(gdil_ep, JG_qhostname);
@@ -723,14 +718,20 @@ int add_job_utilization(const sge_assignment_t *a, const char *type, bool for_jo
723718
const lListElem *rqs = nullptr;
724719
bool do_per_host_booking = host_do_per_host_booking(&last_eh_name, eh_name);
725720

726-
/* hosts */
721+
// global
722+
// we really need to do it per gdil_ep, because we have to consider is_master_task and ign_sreq_on_mhost
723+
rc_add_job_utilization(a->job, a->pe, a->ja_task_id, type, a->gep, a->centry_list, slots,
724+
EH_consumable_config_list, EH_resource_utilization, SGE_GLOBAL_NAME,
725+
a->start, a->duration, GLOBAL_TAG, for_job_scheduling, is_master_task, do_per_global_host_booking);
726+
727+
// host
727728
if ((hep = host_list_locate(a->host_list, eh_name)) != nullptr) {
728729
rc_add_job_utilization(a->job, a->pe, a->ja_task_id, type, hep, a->centry_list, slots,
729730
EH_consumable_config_list, EH_resource_utilization, eh_name, a->start,
730731
a->duration, HOST_TAG, for_job_scheduling, is_master_task, do_per_host_booking);
731732
}
732733

733-
/* queues */
734+
// queue
734735
if ((qep = qinstance_list_locate2(a->queue_list, qname)) != nullptr) {
735736
/*
736737
* The nullptr case happens in case of queues that were sorted out b/c they
@@ -767,6 +768,7 @@ int add_job_utilization(const sge_assignment_t *a, const char *type, bool for_jo
767768

768769
sge_free(&queue);
769770
is_master_task = false;
771+
do_per_global_host_booking = false;
770772
}
771773

772774
sge_dstring_free(&rue_name);

source/libs/sched/sge_serf.cc

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,7 @@
3939

4040
#include "cull/cull.h"
4141

42+
#include "sgeobj/sge_centry.h"
4243
#include "sgeobj/sge_schedd_conf.h"
4344

4445
#include "sge_serf.h"
@@ -87,7 +88,7 @@ void serf_init(record_schedule_entry_func_t write, new_schedule_func_t newline)
8788
*
8889
* SYNOPSIS
8990
* void serf_record_entry(u_long32 job_id, u_long32 ja_taskid, const char
90-
* *state, u_long32 start_time, u_long32 end_time, char level_char, const
91+
* *state, u_long64 start_time, u_long64 end_time, u_long32 level, const
9192
* char *object_name, const char *name, double utilization)
9293
*
9394
* FUNCTION
@@ -132,11 +133,13 @@ void serf_init(record_schedule_entry_func_t write, new_schedule_func_t newline)
132133
* MT-NOTE: MT safety of registered recording function
133134
*******************************************************************************/
134135
void serf_record_entry(u_long32 job_id, u_long32 ja_taskid,
135-
const char *type, u_long64 start_time, u_long64 end_time, char level_char,
136-
const char *object_name, const char *name, double utilization)
136+
const char *type, u_long64 start_time, u_long64 end_time, u_long32 level,
137+
const char *object_name, const char *name, double utilization)
137138
{
138139
DENTER(TOP_LAYER);
139140

141+
char level_char = CENTRY_LEVEL_TO_CHAR(level);
142+
140143
/* human-readable format */
141144
if (DPRINTF_IS_ACTIVE) {
142145
DSTRING_STATIC(dstr_s, 64);

source/libs/sched/sge_serf.h

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -77,7 +77,7 @@ typedef void (*new_schedule_func_t)();
7777

7878
void serf_init(record_schedule_entry_func_t, new_schedule_func_t);
7979
void serf_record_entry(u_long32 job_id, u_long32 ja_taskid,
80-
const char *state, u_long64 start_time, u_long64 end_time, char level_char,
81-
const char *object_name, const char *name, double utilization);
80+
const char *state, u_long64 start_time, u_long64 end_time, u_long32 level,
81+
const char *object_name, const char *name, double utilization);
8282
void serf_new_interval(u_long64 time);
8383
void serf_exit();

0 commit comments

Comments
 (0)