Skip to content

Commit 8d49194

Browse files
authored
improve relations in clock skew issues (#28612)
1 parent 0c2162e commit 8d49194

File tree

2 files changed

+29
-14
lines changed

2 files changed

+29
-14
lines changed

ydb/core/health_check/health_check.cpp

Lines changed: 28 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -380,14 +380,15 @@ class TSelfCheckRequest : public TActorBootstrapped<TSelfCheckRequest> {
380380
}
381381

382382
void ReportStatus(Ydb::Monitoring::StatusFlag::Status status,
383-
const TString& message = {},
384-
ETags setTag = ETags::None,
385-
std::initializer_list<ETags> includeTags = {}) {
383+
const TString& message,
384+
ETags setTag,
385+
std::initializer_list<ETags> includeTags,
386+
const TList<TIssueRecord>& includeRecords) {
386387
OverallStatus = MaxStatus(OverallStatus, status);
387388
if (IsErrorStatus(status)) {
388389
std::vector<TString> reason;
389390
if (includeTags.size() != 0) {
390-
for (const TIssueRecord& record : IssueRecords) {
391+
for (const TIssueRecord& record : includeRecords) {
391392
for (const ETags& tag : includeTags) {
392393
if (record.Tag == tag) {
393394
reason.push_back(record.IssueLog.id());
@@ -421,6 +422,13 @@ class TSelfCheckRequest : public TActorBootstrapped<TSelfCheckRequest> {
421422
}
422423
}
423424

425+
void ReportStatus(Ydb::Monitoring::StatusFlag::Status status,
426+
const TString& message = {},
427+
ETags setTag = ETags::None,
428+
std::initializer_list<ETags> includeTags = {}) {
429+
ReportStatus(status, message, setTag, includeTags, IssueRecords);
430+
}
431+
424432
bool HasTags(std::initializer_list<ETags> tags) const {
425433
for (const TIssueRecord& record : IssueRecords) {
426434
for (const ETags tag : tags) {
@@ -444,12 +452,14 @@ class TSelfCheckRequest : public TActorBootstrapped<TSelfCheckRequest> {
444452
return status;
445453
}
446454

447-
void ReportWithMaxChildStatus(const TString& message = {},
455+
bool ReportWithMaxChildStatus(const TString& message = {},
448456
ETags setTag = ETags::None,
449457
std::initializer_list<ETags> includeTags = {}) {
450458
if (HasTags(includeTags)) {
451459
ReportStatus(FindMaxStatus(includeTags), message, setTag, includeTags);
460+
return true;
452461
}
462+
return false;
453463
}
454464

455465
Ydb::Monitoring::StatusFlag::Status GetOverallStatus() const {
@@ -2193,9 +2203,9 @@ class TSelfCheckRequest : public TActorBootstrapped<TSelfCheckRequest> {
21932203
ui64 clockSkew = abs(databaseState.MaxClockSkewNodeAvgUs);
21942204
clockSkewStatus.set_clock_skew(-databaseState.MaxClockSkewNodeAvgUs / 1000); // in ms
21952205
if (clockSkew >= HealthCheckConfig.GetThresholds().GetNodesTimeDifferenceOrange()) {
2196-
tdContext.ReportStatus(Ydb::Monitoring::StatusFlag::ORANGE, "Clock skew exceeds threshold", ETags::NodeClockSkew);
2206+
tdContext.ReportStatus(Ydb::Monitoring::StatusFlag::ORANGE, "Node clock skew exceeds threshold", ETags::NodeClockSkew);
21972207
} else if (clockSkew >= HealthCheckConfig.GetThresholds().GetNodesTimeDifferenceYellow()) {
2198-
tdContext.ReportStatus(Ydb::Monitoring::StatusFlag::YELLOW, "Clock skew above recommended limit", ETags::NodeClockSkew);
2208+
tdContext.ReportStatus(Ydb::Monitoring::StatusFlag::YELLOW, "Node clock skew above recommended limit", ETags::NodeClockSkew);
21992209
} else {
22002210
tdContext.ReportStatus(Ydb::Monitoring::StatusFlag::GREEN);
22012211
}
@@ -2226,10 +2236,13 @@ class TSelfCheckRequest : public TActorBootstrapped<TSelfCheckRequest> {
22262236
FillComputeNodeStatus(databaseState, nodeId, computeNode, {&context, "COMPUTE_NODE"});
22272237

22282238
}
2239+
FillComputeDatabaseClockSkew(databaseState, computeStatus, {&context, "CLOCK_SKEW"}, context);
22292240
context.ReportWithMaxChildStatus("Some nodes are restarting too often", ETags::PileComputeState, {ETags::Uptime});
22302241
context.ReportWithMaxChildStatus("Compute is overloaded", ETags::PileComputeState, {ETags::OverloadState});
22312242
context.ReportWithMaxChildStatus("Compute quota usage", ETags::PileComputeState, {ETags::QuotaUsage});
2232-
context.ReportWithMaxChildStatus("Clock skew issues", ETags::PileComputeState, {ETags::NodeClockSkew});
2243+
if (!context.ReportWithMaxChildStatus("Clock skew issues", ETags::PileComputeState, {ETags::DatabaseClockSkew})) {
2244+
context.ReportWithMaxChildStatus("Clock skew issues", ETags::PileComputeState, {ETags::NodeClockSkew});
2245+
}
22332246
}
22342247

22352248
void FillComputeDatabaseQuota(TDatabaseState& databaseState, Ydb::Monitoring::ComputeStatus& computeStatus, TSelfCheckContext context) {
@@ -2265,12 +2278,12 @@ class TSelfCheckRequest : public TActorBootstrapped<TSelfCheckRequest> {
22652278
}
22662279
}
22672280

2268-
void FillComputeDatabaseClockSkew(TDatabaseState& databaseState, Ydb::Monitoring::ComputeStatus& computeStatus, TSelfCheckContext context) {
2281+
void FillComputeDatabaseClockSkew(TDatabaseState& databaseState, Ydb::Monitoring::ComputeStatus& computeStatus, TSelfCheckContext context, const TSelfCheckContext& relatedContext) {
22692282
ui64 clockSkew = databaseState.MaxClockSkewUs;
22702283
if (clockSkew >= HealthCheckConfig.GetThresholds().GetNodesTimeDifferenceOrange()) {
2271-
context.ReportStatus(Ydb::Monitoring::StatusFlag::ORANGE, "Clock skew exceeds threshold", ETags::DatabaseClockSkew, {ETags::NodeClockSkew});
2284+
context.ReportStatus(Ydb::Monitoring::StatusFlag::ORANGE, "Database clock skew exceeds threshold", ETags::DatabaseClockSkew, {ETags::NodeClockSkew}, relatedContext.IssueRecords);
22722285
} else if (clockSkew >= HealthCheckConfig.GetThresholds().GetNodesTimeDifferenceYellow()) {
2273-
context.ReportStatus(Ydb::Monitoring::StatusFlag::YELLOW, "Clock skew above recommended limit", ETags::DatabaseClockSkew, {ETags::NodeClockSkew});
2286+
context.ReportStatus(Ydb::Monitoring::StatusFlag::YELLOW, "Database clock skew above recommended limit", ETags::DatabaseClockSkew, {ETags::NodeClockSkew}, relatedContext.IssueRecords);
22742287
} else {
22752288
context.ReportStatus(Ydb::Monitoring::StatusFlag::GREEN);
22762289
}
@@ -2357,18 +2370,20 @@ class TSelfCheckRequest : public TActorBootstrapped<TSelfCheckRequest> {
23572370
auto& computeNode = *computeStatus.add_nodes();
23582371
FillComputeNodeStatus(databaseState, nodeId, computeNode, {&context, "COMPUTE_NODE"});
23592372
}
2373+
FillComputeDatabaseClockSkew(databaseState, computeStatus, {&context, "CLOCK_SKEW"}, context);
23602374
context.ReportWithMaxChildStatus("Some nodes are restarting too often", ETags::ComputeState, {ETags::Uptime});
23612375
context.ReportWithMaxChildStatus("Compute is overloaded", ETags::ComputeState, {ETags::OverloadState});
23622376
context.ReportWithMaxChildStatus("Compute quota usage", ETags::ComputeState, {ETags::QuotaUsage});
2363-
context.ReportWithMaxChildStatus("Clock skew issues", ETags::ComputeState, {ETags::NodeClockSkew});
2377+
if (!context.ReportWithMaxChildStatus("Clock skew issues", ETags::ComputeState, {ETags::DatabaseClockSkew})) {
2378+
context.ReportWithMaxChildStatus("Clock skew issues", ETags::ComputeState, {ETags::NodeClockSkew});
2379+
}
23642380
}
23652381
}
23662382
Ydb::Monitoring::StatusFlag::Status systemStatus = FillSystemTablets(databaseState, {&context, "SYSTEM_TABLET"});
23672383
if (systemStatus != Ydb::Monitoring::StatusFlag::GREEN && systemStatus != Ydb::Monitoring::StatusFlag::GREY) {
23682384
context.ReportStatus(systemStatus, "Compute has issues with system tablets", ETags::ComputeState, {ETags::SystemTabletState});
23692385
}
23702386
FillComputeDatabaseQuota(databaseState, computeStatus, {&context, "COMPUTE_QUOTA"});
2371-
FillComputeDatabaseClockSkew(databaseState, computeStatus, {&context, "CLOCK_SKEW"});
23722387
Ydb::Monitoring::StatusFlag::Status tabletsStatus = Ydb::Monitoring::StatusFlag::GREEN;
23732388
computeNodeIds->push_back(0); // for tablets without node
23742389
for (TNodeId nodeId : *computeNodeIds) {

ydb/core/health_check/health_check_ut.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3090,7 +3090,7 @@ Y_UNIT_TEST_SUITE(THealthCheckTest) {
30903090
auto result = runtime.GrabEdgeEvent<NHealthCheck::TEvSelfCheckResult>(handle)->Result;
30913091

30923092
Ctest << result.ShortDebugString() << Endl;
3093-
CheckHcResultHasIssuesWithStatus(result, "CLOCK_SKEW", Ydb::Monitoring::StatusFlag::YELLOW, 1, TLocationFilter().Pile("pile0"));
3093+
CheckHcResultHasIssuesWithStatus(result, "CLOCK_SKEW", Ydb::Monitoring::StatusFlag::YELLOW, 2, TLocationFilter().Pile("pile0"));
30943094
}
30953095
}
30963096
}

0 commit comments

Comments
 (0)