@@ -380,14 +380,15 @@ class TSelfCheckRequest : public TActorBootstrapped<TSelfCheckRequest> {
380380 }
381381
382382 void ReportStatus (Ydb::Monitoring::StatusFlag::Status status,
383- const TString& message = {},
384- ETags setTag = ETags::None,
385- std::initializer_list<ETags> includeTags = {}) {
383+ const TString& message,
384+ ETags setTag,
385+ std::initializer_list<ETags> includeTags,
386+ const TList<TIssueRecord>& includeRecords) {
386387 OverallStatus = MaxStatus (OverallStatus, status);
387388 if (IsErrorStatus (status)) {
388389 std::vector<TString> reason;
389390 if (includeTags.size () != 0 ) {
390- for (const TIssueRecord& record : IssueRecords ) {
391+ for (const TIssueRecord& record : includeRecords ) {
391392 for (const ETags& tag : includeTags) {
392393 if (record.Tag == tag) {
393394 reason.push_back (record.IssueLog .id ());
@@ -421,6 +422,13 @@ class TSelfCheckRequest : public TActorBootstrapped<TSelfCheckRequest> {
421422 }
422423 }
423424
425+ void ReportStatus (Ydb::Monitoring::StatusFlag::Status status,
426+ const TString& message = {},
427+ ETags setTag = ETags::None,
428+ std::initializer_list<ETags> includeTags = {}) {
429+ ReportStatus (status, message, setTag, includeTags, IssueRecords);
430+ }
431+
424432 bool HasTags (std::initializer_list<ETags> tags) const {
425433 for (const TIssueRecord& record : IssueRecords) {
426434 for (const ETags tag : tags) {
@@ -444,12 +452,14 @@ class TSelfCheckRequest : public TActorBootstrapped<TSelfCheckRequest> {
444452 return status;
445453 }
446454
447- void ReportWithMaxChildStatus (const TString& message = {},
455+ bool ReportWithMaxChildStatus (const TString& message = {},
448456 ETags setTag = ETags::None,
449457 std::initializer_list<ETags> includeTags = {}) {
450458 if (HasTags (includeTags)) {
451459 ReportStatus (FindMaxStatus (includeTags), message, setTag, includeTags);
460+ return true ;
452461 }
462+ return false ;
453463 }
454464
455465 Ydb::Monitoring::StatusFlag::Status GetOverallStatus () const {
@@ -2193,9 +2203,9 @@ class TSelfCheckRequest : public TActorBootstrapped<TSelfCheckRequest> {
21932203 ui64 clockSkew = abs (databaseState.MaxClockSkewNodeAvgUs );
21942204 clockSkewStatus.set_clock_skew (-databaseState.MaxClockSkewNodeAvgUs / 1000 ); // in ms
21952205 if (clockSkew >= HealthCheckConfig.GetThresholds ().GetNodesTimeDifferenceOrange ()) {
2196- tdContext.ReportStatus (Ydb::Monitoring::StatusFlag::ORANGE, " Clock skew exceeds threshold" , ETags::NodeClockSkew);
2206+ tdContext.ReportStatus (Ydb::Monitoring::StatusFlag::ORANGE, " Node clock skew exceeds threshold" , ETags::NodeClockSkew);
21972207 } else if (clockSkew >= HealthCheckConfig.GetThresholds ().GetNodesTimeDifferenceYellow ()) {
2198- tdContext.ReportStatus (Ydb::Monitoring::StatusFlag::YELLOW, " Clock skew above recommended limit" , ETags::NodeClockSkew);
2208+ tdContext.ReportStatus (Ydb::Monitoring::StatusFlag::YELLOW, " Node clock skew above recommended limit" , ETags::NodeClockSkew);
21992209 } else {
22002210 tdContext.ReportStatus (Ydb::Monitoring::StatusFlag::GREEN);
22012211 }
@@ -2226,10 +2236,13 @@ class TSelfCheckRequest : public TActorBootstrapped<TSelfCheckRequest> {
22262236 FillComputeNodeStatus (databaseState, nodeId, computeNode, {&context, " COMPUTE_NODE" });
22272237
22282238 }
2239+ FillComputeDatabaseClockSkew (databaseState, computeStatus, {&context, " CLOCK_SKEW" }, context);
22292240 context.ReportWithMaxChildStatus (" Some nodes are restarting too often" , ETags::PileComputeState, {ETags::Uptime});
22302241 context.ReportWithMaxChildStatus (" Compute is overloaded" , ETags::PileComputeState, {ETags::OverloadState});
22312242 context.ReportWithMaxChildStatus (" Compute quota usage" , ETags::PileComputeState, {ETags::QuotaUsage});
2232- context.ReportWithMaxChildStatus (" Clock skew issues" , ETags::PileComputeState, {ETags::NodeClockSkew});
2243+ if (!context.ReportWithMaxChildStatus (" Clock skew issues" , ETags::PileComputeState, {ETags::DatabaseClockSkew})) {
2244+ context.ReportWithMaxChildStatus (" Clock skew issues" , ETags::PileComputeState, {ETags::NodeClockSkew});
2245+ }
22332246 }
22342247
22352248 void FillComputeDatabaseQuota (TDatabaseState& databaseState, Ydb::Monitoring::ComputeStatus& computeStatus, TSelfCheckContext context) {
@@ -2265,12 +2278,12 @@ class TSelfCheckRequest : public TActorBootstrapped<TSelfCheckRequest> {
22652278 }
22662279 }
22672280
2268- void FillComputeDatabaseClockSkew (TDatabaseState& databaseState, Ydb::Monitoring::ComputeStatus& computeStatus, TSelfCheckContext context) {
2281+ void FillComputeDatabaseClockSkew (TDatabaseState& databaseState, Ydb::Monitoring::ComputeStatus& computeStatus, TSelfCheckContext context, const TSelfCheckContext& relatedContext ) {
22692282 ui64 clockSkew = databaseState.MaxClockSkewUs ;
22702283 if (clockSkew >= HealthCheckConfig.GetThresholds ().GetNodesTimeDifferenceOrange ()) {
2271- context.ReportStatus (Ydb::Monitoring::StatusFlag::ORANGE, " Clock skew exceeds threshold" , ETags::DatabaseClockSkew, {ETags::NodeClockSkew});
2284+ context.ReportStatus (Ydb::Monitoring::StatusFlag::ORANGE, " Database clock skew exceeds threshold" , ETags::DatabaseClockSkew, {ETags::NodeClockSkew}, relatedContext. IssueRecords );
22722285 } else if (clockSkew >= HealthCheckConfig.GetThresholds ().GetNodesTimeDifferenceYellow ()) {
2273- context.ReportStatus (Ydb::Monitoring::StatusFlag::YELLOW, " Clock skew above recommended limit" , ETags::DatabaseClockSkew, {ETags::NodeClockSkew});
2286+ context.ReportStatus (Ydb::Monitoring::StatusFlag::YELLOW, " Database clock skew above recommended limit" , ETags::DatabaseClockSkew, {ETags::NodeClockSkew}, relatedContext. IssueRecords );
22742287 } else {
22752288 context.ReportStatus (Ydb::Monitoring::StatusFlag::GREEN);
22762289 }
@@ -2357,18 +2370,20 @@ class TSelfCheckRequest : public TActorBootstrapped<TSelfCheckRequest> {
23572370 auto & computeNode = *computeStatus.add_nodes ();
23582371 FillComputeNodeStatus (databaseState, nodeId, computeNode, {&context, " COMPUTE_NODE" });
23592372 }
2373+ FillComputeDatabaseClockSkew (databaseState, computeStatus, {&context, " CLOCK_SKEW" }, context);
23602374 context.ReportWithMaxChildStatus (" Some nodes are restarting too often" , ETags::ComputeState, {ETags::Uptime});
23612375 context.ReportWithMaxChildStatus (" Compute is overloaded" , ETags::ComputeState, {ETags::OverloadState});
23622376 context.ReportWithMaxChildStatus (" Compute quota usage" , ETags::ComputeState, {ETags::QuotaUsage});
2363- context.ReportWithMaxChildStatus (" Clock skew issues" , ETags::ComputeState, {ETags::NodeClockSkew});
2377+ if (!context.ReportWithMaxChildStatus (" Clock skew issues" , ETags::ComputeState, {ETags::DatabaseClockSkew})) {
2378+ context.ReportWithMaxChildStatus (" Clock skew issues" , ETags::ComputeState, {ETags::NodeClockSkew});
2379+ }
23642380 }
23652381 }
23662382 Ydb::Monitoring::StatusFlag::Status systemStatus = FillSystemTablets (databaseState, {&context, " SYSTEM_TABLET" });
23672383 if (systemStatus != Ydb::Monitoring::StatusFlag::GREEN && systemStatus != Ydb::Monitoring::StatusFlag::GREY) {
23682384 context.ReportStatus (systemStatus, " Compute has issues with system tablets" , ETags::ComputeState, {ETags::SystemTabletState});
23692385 }
23702386 FillComputeDatabaseQuota (databaseState, computeStatus, {&context, " COMPUTE_QUOTA" });
2371- FillComputeDatabaseClockSkew (databaseState, computeStatus, {&context, " CLOCK_SKEW" });
23722387 Ydb::Monitoring::StatusFlag::Status tabletsStatus = Ydb::Monitoring::StatusFlag::GREEN;
23732388 computeNodeIds->push_back (0 ); // for tablets without node
23742389 for (TNodeId nodeId : *computeNodeIds) {
0 commit comments