
Commit f529886

Modernize opensearch-mixin to grafonnet v11 and signals architecture

1 parent 54a1543

27 files changed: +5082 −9166 lines

.gitignore

Lines changed: 1 addition & 0 deletions
@@ -2,3 +2,4 @@
 vendor
 jsonnetfile.lock.json
 *.zip
+.worktrees

opensearch-mixin/alerts/alerts.libsonnet renamed to opensearch-mixin/alerts.libsonnet

Lines changed: 26 additions & 26 deletions
@@ -1,14 +1,14 @@
 {
-  prometheusAlerts+:: {
+  new(this): {
     groups+: [
       {
-        name: $._config.uid + '-alerts',
+        name: this.config.uid + '-alerts',
         rules: [
           {
             alert: 'OpenSearchYellowCluster',
             expr: |||
               opensearch_cluster_status{%(filteringSelector)s} == 1
-            ||| % $._config,
+            ||| % this.config,
             'for': '5m',
             labels: {
               severity: 'warning',
@@ -18,14 +18,14 @@
               description:
                 (
                   '{{$labels.cluster}} health status is yellow over the last 5 minutes'
-                ) % $._config,
+                ) % this.config,
             },
           },
           {
             alert: 'OpenSearchRedCluster',
             expr: |||
               opensearch_cluster_status{%(filteringSelector)s} == 2
-            ||| % $._config,
+            ||| % this.config,
             'for': '5m',
             labels: {
               severity: 'critical',
@@ -35,14 +35,14 @@
               description:
                 (
                   '{{$labels.cluster}} health status is red over the last 5 minutes'
-                ) % $._config,
+                ) % this.config,
             },
           },
           {
             alert: 'OpenSearchUnstableShardReallocation',
             expr: |||
               sum without(type) (opensearch_cluster_shards_number{%(filteringSelector)s, type="relocating"}) > %(alertsWarningShardReallocations)s
-            ||| % $._config,
+            ||| % this.config,
             'for': '1m',
             labels: {
               severity: 'warning',
@@ -51,14 +51,14 @@
               summary: 'A node has gone offline or has been disconnected triggering shard reallocation.',
               description: |||
                 {{$labels.cluster}} has had {{ printf "%%.0f" $value }} shard reallocation over the last 1m which is above the threshold of %(alertsWarningShardReallocations)s.
-              ||| % $._config,
+              ||| % this.config,
             },
           },
           {
             alert: 'OpenSearchUnstableShardUnassigned',
             expr: |||
               sum without(type) (opensearch_cluster_shards_number{%(filteringSelector)s, type="unassigned"}) > %(alertsWarningShardUnassigned)s
-            ||| % $._config,
+            ||| % this.config,
             'for': '5m',
             labels: {
               severity: 'warning',
@@ -67,14 +67,14 @@
               summary: 'There are shards that have been detected as unassigned.',
               description: |||
                 {{$labels.cluster}} has had {{ printf "%%.0f" $value }} shard unassigned over the last 5m which is above the threshold of %(alertsWarningShardUnassigned)s.
-              ||| % $._config,
+              ||| % this.config,
             },
           },
           {
             alert: 'OpenSearchHighNodeDiskUsage',
             expr: |||
               100 * sum without(nodeid, path, mount, type) ((opensearch_fs_path_total_bytes{%(filteringSelector)s} - opensearch_fs_path_free_bytes{%(filteringSelector)s}) / opensearch_fs_path_total_bytes{%(filteringSelector)s}) > %(alertsWarningDiskUsage)s
-            ||| % $._config,
+            ||| % this.config,
             'for': '5m',
             labels: {
               severity: 'warning',
@@ -83,14 +83,14 @@
               summary: 'The node disk usage has exceeded the warning threshold.',
               description: |||
                 {{$labels.node}} has had {{ printf "%%.0f" $value }} disk usage over the last 5m which is above the threshold of %(alertsWarningDiskUsage)s.
-              ||| % $._config,
+              ||| % this.config,
             },
           },
           {
             alert: 'OpenSearchHighNodeDiskUsage',
             expr: |||
               100 * sum without(nodeid, path, mount, type) ((opensearch_fs_path_total_bytes{%(filteringSelector)s} - opensearch_fs_path_free_bytes{%(filteringSelector)s}) / opensearch_fs_path_total_bytes{%(filteringSelector)s}) > %(alertsCriticalDiskUsage)s
-            ||| % $._config,
+            ||| % this.config,
             'for': '5m',
             labels: {
               severity: 'critical',
@@ -99,14 +99,14 @@
               summary: 'The node disk usage has exceeded the critical threshold.',
               description: |||
                 {{$labels.node}} has had {{ printf "%%.0f" $value }}%% disk usage over the last 5m which is above the threshold of %(alertsCriticalDiskUsage)s.
-              ||| % $._config,
+              ||| % this.config,
             },
           },
           {
             alert: 'OpenSearchHighNodeCpuUsage',
             expr: |||
               sum without(nodeid) (opensearch_os_cpu_percent{%(filteringSelector)s}) > %(alertsWarningCPUUsage)s
-            ||| % $._config,
+            ||| % this.config,
             'for': '5m',
             labels: {
               severity: 'warning',
@@ -115,14 +115,14 @@
               summary: 'The node CPU usage has exceeded the warning threshold.',
               description: |||
                 {{$labels.node}} has had {{ printf "%%.0f" $value }}%% CPU usage over the last 5m which is above the threshold of %(alertsWarningCPUUsage)s.
-              ||| % $._config,
+              ||| % this.config,
             },
           },
           {
             alert: 'OpenSearchHighNodeCpuUsage',
             expr: |||
               sum without(nodeid) (opensearch_os_cpu_percent{%(filteringSelector)s}) > %(alertsCriticalCPUUsage)s
-            ||| % $._config,
+            ||| % this.config,
             'for': '5m',
             labels: {
               severity: 'critical',
@@ -131,14 +131,14 @@
               summary: 'The node CPU usage has exceeded the critical threshold.',
               description: |||
                 {{$labels.node}} has had {{ printf "%%.0f" $value }}%% CPU usage over the last 5m which is above the threshold of %(alertsCriticalCPUUsage)s.
-              ||| % $._config,
+              ||| % this.config,
             },
           },
           {
             alert: 'OpenSearchHighNodeMemoryUsage',
             expr: |||
               sum without(nodeid) (opensearch_os_mem_used_percent{%(filteringSelector)s}) > %(alertsWarningMemoryUsage)s
-            ||| % $._config,
+            ||| % this.config,
             'for': '5m',
             labels: {
               severity: 'warning',
@@ -147,14 +147,14 @@
               summary: 'The node memory usage has exceeded the warning threshold.',
               description: |||
                 {{$labels.node}} has had {{ printf "%%.0f" $value }}%% memory usage over the last 5m which is above the threshold of %(alertsWarningMemoryUsage)s.
-              ||| % $._config,
+              ||| % this.config,
             },
           },
           {
             alert: 'OpenSearchHighNodeMemoryUsage',
             expr: |||
               sum without(nodeid) (opensearch_os_mem_used_percent{%(filteringSelector)s}) > %(alertsCriticalMemoryUsage)s
-            ||| % $._config,
+            ||| % this.config,
             'for': '5m',
             labels: {
               severity: 'critical',
@@ -163,14 +163,14 @@
               summary: 'The node memory usage has exceeded the critical threshold.',
               description: |||
                 {{$labels.node}} has had {{ printf "%%.0f" $value }}%% memory usage over the last 5m which is above the threshold of %(alertsCriticalMemoryUsage)s.
-              ||| % $._config,
+              ||| % this.config,
             },
           },
           {
             alert: 'OpenSearchModerateRequestLatency',
             expr: |||
               sum without(context) ((increase(opensearch_index_search_fetch_time_seconds{%(filteringSelector)s, context="total"}[5m])+increase(opensearch_index_search_query_time_seconds{context="total"}[5m])+increase(opensearch_index_search_scroll_time_seconds{context="total"}[5m])) / clamp_min(increase(opensearch_index_search_fetch_count{context="total"}[5m])+increase(opensearch_index_search_query_count{context="total"}[5m])+increase(opensearch_index_search_scroll_count{context="total"}[5m]), 1)) > %(alertsWarningRequestLatency)s
-            ||| % $._config,
+            ||| % this.config,
             'for': '5m',
             labels: {
               severity: 'warning',
@@ -179,14 +179,14 @@
               summary: 'The request latency has exceeded the warning threshold.',
               description: |||
                 {{$labels.index}} has had {{ printf "%%.0f" $value }}s of request latency over the last 5m which is above the threshold of %(alertsWarningRequestLatency)s.
-              ||| % $._config,
+              ||| % this.config,
             },
           },
           {
             alert: 'OpenSearchModerateIndexLatency',
             expr: |||
               sum without(context) (increase(opensearch_index_indexing_index_time_seconds{%(filteringSelector)s, context="total"}[5m]) / clamp_min(increase(opensearch_index_indexing_index_count{context="total"}[5m]), 1)) > %(alertsWarningIndexLatency)s
-            ||| % $._config,
+            ||| % this.config,
             'for': '5m',
             labels: {
               severity: 'warning',
@@ -195,7 +195,7 @@
               summary: 'The index latency has exceeded the warning threshold.',
               description: |||
                 {{$labels.index}} has had {{ printf "%%.0f" $value }}s of index latency over the last 5m which is above the threshold of %(alertsWarningIndexLatency)s.
-              ||| % $._config,
+              ||| % this.config,
             },
           },
         ],
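The renamed alerts.libsonnet no longer mixes in a prometheusAlerts+:: object; it exposes a new(this) constructor that reads thresholds and the uid from this.config. A minimal sketch of how an entry point could wire it up (this main.libsonnet is hypothetical and not part of this commit):

local config = import './config.libsonnet';
local alerts = import './alerts.libsonnet';

{
  local this = self,
  config: config,  // alerts.new(this) resolves this.config.uid and the alert thresholds
  prometheusAlerts: alerts.new(this),
}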

opensearch-mixin/config.libsonnet

Lines changed: 46 additions & 25 deletions
@@ -1,31 +1,52 @@
 {
-  _config+:: {
-    enableMultiCluster: false,
-    // extra static selector to apply to all templated variables and alerts
-    filteringSelector: if self.enableMultiCluster then 'cluster!="",opensearch_cluster!=""' else 'opensearch_cluster!=""',
-    groupLabels: if self.enableMultiCluster then ['job', 'cluster', 'opensearch_cluster'] else ['job', 'opensearch_cluster'],
-    instanceLabels: ['node'],
-    dashboardTags: ['opensearch-mixin'],
-    dashboardPeriod: 'now-1h',
-    dashboardTimezone: 'default',
-    dashboardRefresh: '1m',
-    dashboardNamePrefix: '',
+  local this = self,
+  filteringSelector: if self.enableMultiCluster then 'cluster!="",opensearch_cluster!=""' else 'opensearch_cluster!=""',
+  groupLabels: if self.enableMultiCluster then ['job', 'cluster', 'opensearch_cluster'] else ['job', 'opensearch_cluster'],
+  logLabels: ['job', 'cluster', 'node'],
+  instanceLabels: ['node'],
 
-    // prefix dashboards uids
-    uid: 'opensearch',
+  dashboardTags: [self.uid],
+  uid: 'opensearch',
+  dashboardNamePrefix: 'OpenSearch',
+  dashboardPeriod: 'now-1h',
+  dashboardTimezone: 'default',
+  dashboardRefresh: '1m',
+  metricsSource: 'prometheus', // metrics source for signals
 
-    // alerts thresholds
-    alertsWarningShardReallocations: 0,
-    alertsWarningShardUnassigned: 0,
-    alertsWarningDiskUsage: 60,
-    alertsCriticalDiskUsage: 80,
-    alertsWarningCPUUsage: 70,
-    alertsCriticalCPUUsage: 85,
-    alertsWarningMemoryUsage: 70,
-    alertsCriticalMemoryUsage: 85,
-    alertsWarningRequestLatency: 0.5, // seconds
-    alertsWarningIndexLatency: 0.5, // seconds
+  // Agg Lists
+  groupAggList: std.join(',', this.groupLabels),
+  groupAggListWithInstance: std.join(',', this.groupLabels + this.instanceLabels),
+
+  // Multi-cluster support
+  enableMultiCluster: false,
+  opensearchSelector: if self.enableMultiCluster then 'job=~"$job", instance=~"$instance", cluster=~"$cluster"' else 'job=~"$job", instance=~"$instance"',
 
-    enableLokiLogs: true,
+  // Logging configuration
+  enableLokiLogs: true,
+  extraLogLabels: ['level', 'severity'], // Required by logs-lib
+  logsVolumeGroupBy: 'level',
+  showLogsVolume: true,
+  logExpression: '{job=~"$job", cluster=~"$cluster", instance=~"$instance", exception_class=~".+"} | json | line_format "{{.severity}} {{.exception_class}} - {{.exception_message}}" | drop time_extracted, severity_extracted, exception_class_extracted, correlation_id_extracted',
+
+  // Alerts configuration
+  alertsWarningShardReallocations: 0, // count
+  alertsWarningShardUnassigned: 0, // count
+  alertsWarningDiskUsage: 60, // %
+  alertsCriticalDiskUsage: 80, // %
+  alertsWarningCPUUsage: 70, // %
+  alertsCriticalCPUUsage: 85, // %
+  alertsWarningMemoryUsage: 70, // %
+  alertsCriticalMemoryUsage: 85, // %
+  alertsWarningRequestLatency: 0.5, // seconds
+  alertsWarningIndexLatency: 0.5, // seconds
+
+  // Signals configuration
+  signals+: {
+    cluster: (import './signals/cluster.libsonnet')(this),
+    node: (import './signals/node.libsonnet')(this),
+    topk: (import './signals/topk.libsonnet')(this),
+    roles: (import './signals/roles.libsonnet')(this),
+    search: (import './signals/search.libsonnet')(this),
+    indexing: (import './signals/indexing.libsonnet')(this),
   },
 }
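Each entry under signals imports a file from ./signals/ and calls it with the config object, so those files are functions of the config. A minimal sketch of what one of them could look like, assuming the grafana common observability-lib signals convention; the field names and the example signal below are illustrative, not taken from this commit:

// ./signals/cluster.libsonnet (illustrative sketch)
function(this) {
  filteringSelector: this.filteringSelector,
  groupLabels: this.groupLabels,
  instanceLabels: this.instanceLabels,
  aggLevel: 'group',
  signals: {
    clusterStatus: {
      name: 'Cluster status',
      type: 'gauge',
      description: 'Health status of the cluster.',
      sources: {
        prometheus: {
          expr: 'opensearch_cluster_status{%(queriesSelector)s}',
        },
      },
    },
  },
}

With metricsSource set to 'prometheus' in the config, the signals library would pick the prometheus entry from sources when rendering panels and queries.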
