Skip to content

Commit d7c1c77

Browse files
authored
chore: Modernize the Presto Mixin (#1491)
* modernize the presto mixin * make fmt * some legend cleanup and fixing some titles * make dashboards_out * use commonlib for presto stats/timeseries where appropriate
1 parent 54a1543 commit d7c1c77

23 files changed

+2714
-5804
lines changed

presto-mixin/alerts.libsonnet

Lines changed: 131 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,131 @@
1+
{
2+
new(this):
3+
{
4+
groups+: [
5+
{
6+
name: 'presto-alerts',
7+
rules: [
8+
{
9+
alert: 'PrestoHighInsufficientResources',
10+
expr: |||
11+
increase(presto_QueryManager_InsufficientResourcesFailures_TotalCount[5m]) > %(alertsHighInsufficientResourceErrors)s
12+
||| % this.config,
13+
'for': '5m',
14+
labels: {
15+
severity: 'critical',
16+
},
17+
annotations: {
18+
summary: 'The amount of failures that are occurring due to insufficient resources are scaling, causing saturation in the system.',
19+
description:
20+
(
21+
'The number of insufficient resource failures on {{$labels.instance}} is {{ printf "%%.0f" $value }} which is greater than the threshold of %(alertsHighInsufficientResourceErrors)s.'
22+
) % this.config,
23+
},
24+
},
25+
{
26+
alert: 'PrestoHighTaskFailuresWarning',
27+
expr: |||
28+
increase(presto_TaskManager_FailedTasks_TotalCount[5m]) > %(alertsHighTaskFailuresWarning)s
29+
||| % this.config,
30+
'for': '5m',
31+
labels: {
32+
severity: 'warning',
33+
},
34+
annotations: {
35+
summary: 'The amount of tasks that are failing is increasing, this might affect query processing and could result in incomplete or incorrect results.',
36+
description:
37+
(
38+
'The number of task failures on {{$labels.instance}} is {{ printf "%%.0f" $value }} which is above the threshold of %(alertsHighTaskFailuresWarning)s.'
39+
) % this.config,
40+
},
41+
},
42+
{
43+
alert: 'PrestoHighTaskFailuresCritical',
44+
expr: |||
45+
increase(presto_TaskManager_FailedTasks_TotalCount[5m]) / clamp_min(increase(presto_TaskManager_FailedTasks_TotalCount[10m]), 1) * 100 > %(alertsHighTaskFailuresCritical)s
46+
||| % this.config,
47+
'for': '5m',
48+
labels: {
49+
severity: 'critical',
50+
},
51+
annotations: {
52+
summary: 'The amount of tasks that are failing has reached a critical level. This might affect query processing and could result in incomplete or incorrect results.',
53+
description:  // threshold is a percentage (expr multiplies the ratio by 100); '%%' renders a literal '%' after formatting
54+
(
55+
'The number of task failures on {{$labels.instance}} is {{ printf "%%.0f" $value }} which is above the threshold of %(alertsHighTaskFailuresCritical)s%%.'
56+
) % this.config,
57+
},
58+
},
59+
{
60+
alert: 'PrestoHighQueuedTaskCount',
61+
expr: |||
62+
increase(presto_QueryExecution_Executor_QueuedTaskCount[5m]) > %(alertsHighQueuedTaskCount)s
63+
||| % this.config,
64+
'for': '5m',
65+
labels: {
66+
severity: 'warning',
67+
},
68+
annotations: {
69+
summary: 'The amount of tasks that are being put in queue is increasing. A high number of queued tasks can lead to increased query latencies and degraded system performance.',
70+
description:
71+
(
72+
'The number of queued tasks on {{$labels.instance}} is {{ printf "%%.0f" $value }} which is greater than the threshold of %(alertsHighQueuedTaskCount)s'
73+
) % this.config,
74+
},
75+
},
76+
{
77+
alert: 'PrestoHighBlockedNodes',
78+
expr: |||
79+
increase(presto_ClusterMemoryPool_general_BlockedNodes[5m]) > %(alertsHighBlockedNodesCount)s
80+
||| % this.config,
81+
'for': '5m',
82+
labels: {
83+
severity: 'critical',
84+
},
85+
annotations: {
86+
summary: 'The amount of nodes that are blocked due to memory restrictions is increasing. Blocked nodes can cause performance degradation and resource starvation.',
87+
description:
88+
(
89+
'The number of blocked nodes on {{$labels.instance}} is {{ printf "%%.0f" $value }} which is greater than the threshold of %(alertsHighBlockedNodesCount)s'
90+
) % this.config,
91+
},
92+
},
93+
{
94+
alert: 'PrestoHighFailedQueriesWarning',
95+
expr: |||
96+
increase(presto_QueryManager_FailedQueries_TotalCount[5m]) > %(alertsHighFailedQueryCountWarning)s
97+
||| % this.config,
98+
'for': '5m',
99+
labels: {
100+
severity: 'warning',
101+
},
102+
annotations: {
103+
summary: 'The amount of queries failing is increasing. Failed queries can prevent users from accessing data, disrupt analytics processes, and might indicate underlying issues with the system or data.',
104+
description:
105+
(
106+
'The number of failed queries on {{$labels.instance}} is {{ printf "%%.0f" $value }} which is greater than the threshold of %(alertsHighFailedQueryCountWarning)s'
107+
) % this.config,
108+
},
109+
},
110+
{
111+
alert: 'PrestoHighFailedQueriesCritical',
112+
expr: |||
113+
increase(presto_QueryManager_FailedQueries_TotalCount[5m]) / clamp_min(increase(presto_QueryManager_FailedQueries_TotalCount[10m]), 1) * 100 > %(alertsHighFailedQueryCountCritical)s
114+
||| % this.config,
115+
'for': '5m',
116+
labels: {
117+
severity: 'critical',
118+
},
119+
annotations: {
120+
summary: 'The amount of queries failing has increased to critical levels. Failed queries can prevent users from accessing data, disrupt analytics processes, and might indicate underlying issues with the system or data.',
121+
description:  // threshold is a percentage (expr multiplies the ratio by 100); '%%' renders a literal '%' after formatting
122+
(
123+
'The number of failed queries on {{$labels.instance}} is {{ printf "%%.0f" $value }} which is greater than the threshold of %(alertsHighFailedQueryCountCritical)s%%.'
124+
) % this.config,
125+
},
126+
},
127+
],
128+
},
129+
],
130+
},
131+
}

presto-mixin/alerts/alerts.libsonnet

Lines changed: 0 additions & 130 deletions
This file was deleted.

presto-mixin/config.libsonnet

Lines changed: 34 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -1,26 +1,39 @@
11
{
2-
_config+:: {
3-
enableMultiCluster: false,
4-
prestoOverviewSelector: if self.enableMultiCluster then 'job=~"$job", cluster=~"$cluster"' else 'job=~"$job"',
5-
prestoSelector: if self.enableMultiCluster then 'job=~"$job", instance=~"$instance", cluster=~"$cluster"' else 'job=~"$job", instance=~"$instance"',
6-
prestoAlertSelector: if self.enableMultiCluster then 'job=~"${job:regex}", cluster=~"${cluster:regex}"' else 'job=~"${job:regex}"',
7-
prestoOverviewLegendSelector: if self.enableMultiCluster then '{{cluster}} - {{presto_cluster}}' else '{{presto_cluster}}',
8-
prestoLegendSelector: if self.enableMultiCluster then '{{cluster}} - {{instance}}' else '{{instance}}',
9-
filterSelector: 'job=~"integrations/presto"',
2+
local this = self,
3+
filteringSelector: 'job=~"integrations/presto"',
4+
groupLabels: ['job', 'cluster', 'presto_cluster'],
5+
overviewLegendLabels: ['presto_cluster'],
6+
coordinatorLegendLabels: ['instance'],
7+
workerLegendLabels: ['instance'],
8+
instanceLabels: ['instance'],
9+
uid: 'presto',
1010

11-
dashboardTags: ['presto-mixin'],
12-
dashboardPeriod: 'now-30m',
13-
dashboardTimezone: 'default',
14-
dashboardRefresh: '1m',
11+
dashboardNamePrefix: 'Presto',
12+
dashboardTags: ['presto-mixin'],
13+
dashboardPeriod: 'now-30m',
14+
dashboardTimezone: 'default',
15+
dashboardRefresh: '1m',
1516

16-
// alerts thresholds
17-
alertsHighInsufficientResourceErrors: 0, // count
18-
alertsHighTaskFailuresWarning: 0, // count
19-
alertsHighTaskFailuresCritical: 30, // percent
20-
alertsHighQueuedTaskCount: 5, // count
21-
alertsHighBlockedNodesCount: 0, // count
22-
alertsHighFailedQueryCountWarning: 0, // count
23-
alertsHighFailedQueryCountCritical: 30, // percent
24-
enableLokiLogs: true,
17+
// Data source configuration
18+
metricsSource: 'prometheus',
19+
enableLokiLogs: true,
20+
logLabels: this.groupLabels + this.instanceLabels,
21+
extraLogLabels: [],
22+
logsVolumeGroupBy: 'level',
23+
showLogsVolume: true,
24+
25+
// Alerts configuration
26+
alertsHighInsufficientResourceErrors: 0, // count
27+
alertsHighTaskFailuresWarning: 0, // count
28+
alertsHighTaskFailuresCritical: 30, // percent
29+
alertsHighQueuedTaskCount: 5, // count
30+
alertsHighBlockedNodesCount: 0, // count
31+
alertsHighFailedQueryCountWarning: 0, // count
32+
alertsHighFailedQueryCountCritical: 30, // percent
33+
34+
signals+: {
35+
overview: (import './signals/overview.libsonnet')(this),
36+
coordinator: (import './signals/coordinator.libsonnet')(this),
37+
worker: (import './signals/worker.libsonnet')(this),
2538
},
2639
}

0 commit comments

Comments
 (0)