Skip to content

Commit 38e888e

Browse files
schmikei and Dasomeone authored
chore: Apache Cassandra Mixin Modernization (#1513)
* wip: cassandra modernization
* forgot to commit latest
* fix lint
* fix typo in links.libsonnet
* apply pr suggestions
* specify g1 young generation gc
* fix units on heatmaps
* fix regex on what should be a string literal comparison

---------

Co-authored-by: Emily <1282515+Dasomeone@users.noreply.github.com>
1 parent 8fba58f commit 38e888e

25 files changed

+5022
-9298
lines changed

apache-cassandra-mixin/alerts/alerts.libsonnet renamed to apache-cassandra-mixin/alerts.libsonnet

Lines changed: 26 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,14 @@
11
{
2-
prometheusAlerts+:: {
2+
new(this): {
33
groups+: [
44
{
55
name: 'ApacheCassandraAlerts',
66
rules: [
77
{
88
alert: 'HighReadLatency',
99
expr: |||
10-
sum(cassandra_table_readlatency_seconds_sum) by (instance) / sum(cassandra_table_readlatency_seconds_count) by (instance) * 1000 > %(alertsCriticalReadLatency5m)s
11-
||| % $._config,
10+
sum(cassandra_table_readlatency_seconds_sum{%(filteringSelector)s}) by (instance) / sum(cassandra_table_readlatency_seconds_count{%(filteringSelector)s}) by (instance) * 1000 > %(alertsCriticalReadLatency5m)s
11+
||| % this.config,
1212
'for': '5m',
1313
labels: {
1414
severity: 'critical',
@@ -19,14 +19,14 @@
1919
(
2020
'An average of {{ printf "%%.0f" $value }}ms of read latency has occurred over the last 5 minutes on {{$labels.instance}}, ' +
2121
'which is above the threshold of %(alertsCriticalReadLatency5m)sms. '
22-
) % $._config,
22+
) % this.config,
2323
},
2424
},
2525
{
2626
alert: 'HighWriteLatency',
2727
expr: |||
28-
sum(cassandra_keyspace_writelatency_seconds_sum) by (instance) / sum(cassandra_keyspace_writelatency_seconds_count) by (instance) * 1000 > %(alertsCriticalWriteLatency5m)s
29-
||| % $._config,
28+
sum(cassandra_keyspace_writelatency_seconds_sum{%(filteringSelector)s}) by (instance) / sum(cassandra_keyspace_writelatency_seconds_count{%(filteringSelector)s}) by (instance) * 1000 > %(alertsCriticalWriteLatency5m)s
29+
||| % this.config,
3030
'for': '5m',
3131
labels: {
3232
severity: 'critical',
@@ -37,14 +37,14 @@
3737
(
3838
'An average of {{ printf "%%.0f" $value }}ms of write latency has occurred over the last 5 minutes on {{$labels.instance}}, ' +
3939
'which is above the threshold of %(alertsCriticalWriteLatency5m)sms. '
40-
) % $._config,
40+
) % this.config,
4141
},
4242
},
4343
{
4444
alert: 'HighPendingCompactionTasks',
4545
expr: |||
46-
cassandra_compaction_pendingtasks > %(alertsWarningPendingCompactionTasks15m)s
47-
||| % $._config,
46+
cassandra_compaction_pendingtasks{%(filteringSelector)s} > %(alertsWarningPendingCompactionTasks15m)s
47+
||| % this.config,
4848
'for': '15m',
4949
labels: {
5050
severity: 'warning',
@@ -55,14 +55,14 @@
5555
(
5656
'{{ printf "%%.0f" $value }} compaction tasks have been pending over the last 15 minutes on {{$labels.instance}}, ' +
5757
'which is above the threshold of %(alertsWarningPendingCompactionTasks15m)s. '
58-
) % $._config,
58+
) % this.config,
5959
},
6060
},
6161
{
6262
alert: 'BlockedCompactionTasksFound',
6363
expr: |||
64-
cassandra_threadpools_currentlyblockedtasks_count{threadpools="CompactionExecutor", path="internal"} > %(alertsCriticalBlockedCompactionTasks5m)s
65-
||| % $._config,
64+
cassandra_threadpools_currentlyblockedtasks_count{threadpools="CompactionExecutor", path="internal", %(filteringSelector)s} > %(alertsCriticalBlockedCompactionTasks5m)s
65+
||| % this.config,
6666
'for': '5m',
6767
labels: {
6868
severity: 'critical',
@@ -73,14 +73,14 @@
7373
(
7474
'{{ printf "%%.0f" $value }} compaction tasks have been blocked over the last 5 minutes on {{$labels.instance}}, ' +
7575
'which is above the threshold of %(alertsCriticalBlockedCompactionTasks5m)s. '
76-
) % $._config,
76+
) % this.config,
7777
},
7878
},
7979
{
8080
alert: 'HintsStoredOnNode',
8181
expr: |||
82-
increase(cassandra_storage_totalhints_count[5m]) > %(alertsWarningHintsStored1m)s
83-
||| % $._config,
82+
increase(cassandra_storage_totalhints_count{%(filteringSelector)s}[5m]) > %(alertsWarningHintsStored1m)s
83+
||| % this.config,
8484
'for': '1m',
8585
labels: {
8686
severity: 'warning',
@@ -91,32 +91,23 @@
9191
(
9292
'{{ printf "%%.0f" $value }} hints have been written to the node over the last minute on {{$labels.instance}}, ' +
9393
'which is above the threshold of %(alertsWarningHintsStored1m)s. '
94-
) % $._config,
94+
) % this.config,
9595
},
9696
},
9797
{
9898
alert: 'UnavailableWriteRequestsFound',
9999
expr: |||
100-
sum(cassandra_clientrequest_unavailables_count{clientrequest="Write"}) by (cassandra_cluster) > %(alertsCriticalUnavailableWriteRequests5m)s
101-
||| % $._config,
100+
sum without (cassandra_cluster) (cassandra_clientrequest_unavailables_count{clientrequest="Write", %(filteringSelector)s}) > %(alertsCriticalUnavailableWriteRequests5m)s
101+
||| % this.config,
102102
'for': '5m',
103-
labels: {
104-
severity: 'critical',
105-
},
106-
annotations: {
107-
summary: 'Unavailable exceptions have been encountered while performing writes in this cluster.',
108-
description:
109-
(
110-
'{{ printf "%%.0f" $value }} unavailable write requests have been found over the last 5 minutes on {{$labels.instance}}, ' +
111-
'which is above the threshold of %(alertsCriticalUnavailableWriteRequests5m)s. '
112-
) % $._config,
113-
},
103+
labels: { severity: 'critical' },
104+
annotations: { summary: 'Unavailable exceptions have been encountered while performing writes in this cluster.', description: ('{{ printf "%%.0f" $value }} unavailable write requests have been found over the last 5 minutes on {{$labels.instance}}, ' + 'which is above the threshold of %(alertsCriticalUnavailableWriteRequests5m)s. ') % this.config },
114105
},
115106
{
116107
alert: 'HighCpuUsage',
117108
expr: |||
118-
jvm_process_cpu_load{job=~"integrations/apache-cassandra"} * 100 > %(alertsCriticalHighCpuUsage5m)s
119-
||| % $._config,
109+
jvm_process_cpu_load{%(filteringSelector)s} * 100 > %(alertsCriticalHighCpuUsage5m)s
110+
||| % this.config,
120111
'for': '5m',
121112
labels: {
122113
severity: 'critical',
@@ -127,14 +118,14 @@
127118
(
128119
'Cpu usage is at {{ printf "%%.0f" $value }} percent over the last 5 minutes on {{$labels.instance}}, ' +
129120
'which is above the threshold of %(alertsCriticalHighCpuUsage5m)s. '
130-
) % $._config,
121+
) % this.config,
131122
},
132123
},
133124
{
134125
alert: 'HighMemoryUsage',
135126
expr: |||
136-
sum(jvm_memory_usage_used_bytes{job=~"integrations/apache-cassandra", area="Heap"}) / sum(jvm_physical_memory_size{job=~"integrations/apache-cassandra"}) * 100 > %(alertsCriticalHighMemoryUsage5m)s
137-
||| % $._config,
127+
sum by (instance) (jvm_memory_usage_used_bytes{%(filteringSelector)s, area="Heap"}) / sum by (instance) (jvm_physical_memory_size{%(filteringSelector)s}) * 100 > %(alertsCriticalHighMemoryUsage5m)s
128+
||| % this.config,
138129
'for': '5m',
139130
labels: {
140131
severity: 'critical',
@@ -145,7 +136,7 @@
145136
(
146137
'Memory usage is at {{ printf "%%.0f" $value }} percent over the last 5 minutes on {{$labels.instance}}, ' +
147138
'which is above the threshold of %(alertsCriticalHighMemoryUsage5m)s. '
148-
) % $._config,
139+
) % this.config,
149140
},
150141
},
151142
],
Lines changed: 31 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -1,26 +1,37 @@
11
{
2-
_config+:: {
3-
enableMultiCluster: false,
4-
cassandraSelector: if self.enableMultiCluster then 'job=~"$job", cluster=~"$cluster"' else 'job=~"$job"',
5-
multiclusterSelector: 'job=~"$job"',
2+
local this = self,
3+
enableDatacenterLabel: false,
4+
enableRackLabel: false,
5+
filteringSelector: 'job="integrations/apache-cassandra"',
66

7-
dashboardTags: ['apache-cassandra-mixin'],
8-
dashboardPeriod: 'now-1h',
9-
dashboardTimezone: 'default',
10-
dashboardRefresh: '1m',
7+
groupLabels: ['job', 'cassandra_cluster'] + (if this.enableDatacenterLabel then ['datacenter'] else []) + (if this.enableRackLabel then ['rack'] else []),
8+
instanceLabels: ['instance'],
9+
uid: 'cassandra',
10+
dashboardNamePrefix: 'Apache Cassandra',
11+
dashboardTags: [self.uid + '-mixin'],
12+
dashboardPeriod: 'now-30m',
13+
dashboardTimezone: 'default',
14+
dashboardRefresh: '1m',
1115

12-
//alert thresholds
13-
alertsCriticalReadLatency5m: 200, //ms
14-
alertsCriticalWriteLatency5m: 200, //ms
15-
alertsWarningPendingCompactionTasks15m: 30,
16-
alertsCriticalBlockedCompactionTasks5m: 1,
17-
alertsWarningHintsStored1m: 1,
18-
alertsCriticalUnavailableWriteRequests5m: 1,
19-
alertsCriticalHighCpuUsage5m: 80, //percent: emitted metric has range 0-100
20-
alertsCriticalHighMemoryUsage5m: 80, //percent: calculated as ratio then multiplied by query
16+
//alert thresholds
17+
alertsCriticalReadLatency5m: 200, //ms
18+
alertsCriticalWriteLatency5m: 200, //ms
19+
alertsWarningPendingCompactionTasks15m: 30,
20+
alertsCriticalBlockedCompactionTasks5m: 1,
21+
alertsWarningHintsStored1m: 1,
22+
alertsCriticalUnavailableWriteRequests5m: 1,
23+
alertsCriticalHighCpuUsage5m: 80, //percent: emitted metric has range 0-100
24+
alertsCriticalHighMemoryUsage5m: 80, //percent: calculated as ratio then multiplied by query
2125

22-
enableLokiLogs: true,
23-
enableDatacenterLabel: false,
24-
enableRackLabel: false,
26+
enableLokiLogs: true,
27+
extraLogLabels: [],
28+
showLogsVolume: true,
29+
30+
// metrics source for signals
31+
metricsSource: 'prometheus',
32+
signals: {
33+
overview: (import './signals/overview.libsonnet')(this),
34+
nodes: (import './signals/nodes.libsonnet')(this),
35+
keyspaces: (import './signals/keyspaces.libsonnet')(this),
2536
},
2637
}
Lines changed: 122 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,122 @@
1+
local g = import './g.libsonnet';
2+
local commonlib = import 'common-lib/common/main.libsonnet';
3+
local logslib = import 'logs-lib/logs/main.libsonnet';
4+
5+
6+
{
7+
local root = self,
8+
new(this)::
9+
local prefix = this.config.dashboardNamePrefix;
10+
local links = this.grafana.links;
11+
local tags = this.config.dashboardTags;
12+
local uid = g.util.string.slugify(this.config.uid);
13+
local vars = this.grafana.variables;
14+
local annotations = this.grafana.annotations;
15+
local refresh = this.config.dashboardRefresh;
16+
local period = this.config.dashboardPeriod;
17+
local timezone = this.config.dashboardTimezone;
18+
19+
{
20+
'apache-cassandra-overview.json':
21+
g.dashboard.new(prefix + ' overview')
22+
+ g.dashboard.withPanels(
23+
g.util.panel.resolveCollapsedFlagOnRows(
24+
g.util.grid.wrapPanels(
25+
[
26+
this.grafana.rows.overviewRow,
27+
this.grafana.rows.overviewClientRequestsRow,
28+
]
29+
)
30+
)
31+
) + root.applyCommon(
32+
vars.multiInstance,
33+
uid + '_overview',
34+
tags,
35+
links { apacheCassandraOverview:: {} },
36+
annotations,
37+
timezone,
38+
refresh,
39+
period,
40+
),
41+
'apache-cassandra-nodes.json':
42+
g.dashboard.new(prefix + ' nodes')
43+
+ g.dashboard.withPanels(
44+
g.util.panel.resolveCollapsedFlagOnRows(
45+
g.util.grid.wrapPanels(
46+
[
47+
this.grafana.rows.nodesRow,
48+
]
49+
)
50+
)
51+
) + root.applyCommon(
52+
vars.multiInstance,
53+
uid + '_nodes',
54+
tags,
55+
links { apacheCassandraNodes:: {} },
56+
annotations,
57+
timezone,
58+
refresh,
59+
period,
60+
),
61+
'apache-cassandra-keyspaces.json':
62+
g.dashboard.new(prefix + ' keyspaces')
63+
+ g.dashboard.withPanels(
64+
g.util.panel.resolveCollapsedFlagOnRows(
65+
g.util.grid.wrapPanels(
66+
[
67+
this.grafana.rows.keyspacesRow,
68+
]
69+
),
70+
)
71+
) + root.applyCommon(
72+
vars.multiInstance,
73+
uid + '_keyspaces',
74+
tags,
75+
links { apacheCassandraKeyspaces:: {} },
76+
annotations,
77+
timezone,
78+
refresh,
79+
period,
80+
),
81+
} + if this.config.enableLokiLogs then {
82+
'apache-cassandra-logs.json':
83+
logslib.new(
84+
prefix + ' logs',
85+
datasourceName=this.grafana.variables.datasources.loki.name,
86+
datasourceRegex=this.grafana.variables.datasources.loki.regex,
87+
filterSelector=this.config.filteringSelector,
88+
labels=this.config.groupLabels + this.config.extraLogLabels,
89+
formatParser=null,
90+
showLogsVolume=this.config.showLogsVolume,
91+
)
92+
{
93+
dashboards+:
94+
{
95+
logs+:
96+
root.applyCommon(super.logs.templating.list, uid=uid + '-logs', tags=tags, links=links { apacheCassandraLogs+:: {} }, annotations=annotations, timezone=timezone, refresh=refresh, period=period),
97+
},
98+
panels+:
99+
{
100+
logs+:
101+
g.panel.logs.options.withEnableLogDetails(true)
102+
+ g.panel.logs.options.withShowTime(false)
103+
+ g.panel.logs.options.withWrapLogMessage(false),
104+
},
105+
variables+: {
106+
toArray+: [
107+
this.grafana.variables.datasources.prometheus { hide: 2 },
108+
],
109+
},
110+
}.dashboards.logs,
111+
}
112+
else {},
113+
applyCommon(vars, uid, tags, links, annotations, timezone, refresh, period):
114+
g.dashboard.withTags(tags)
115+
+ g.dashboard.withUid(uid)
116+
+ g.dashboard.withLinks(std.objectValues(links))
117+
+ g.dashboard.withTimezone(timezone)
118+
+ g.dashboard.withRefresh(refresh)
119+
+ g.dashboard.time.withFrom(period)
120+
+ g.dashboard.withVariables(vars)
121+
+ g.dashboard.withAnnotations(std.objectValues(annotations)),
122+
}

0 commit comments

Comments
 (0)