Skip to content
Open
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
{
prometheusAlerts+:: {
new(this): {
groups+: [
{
name: 'ApacheCouchDBAlerts',
Expand All @@ -8,7 +8,7 @@
alert: 'CouchDBUnhealthyCluster',
expr: |||
min by(job, couchdb_cluster) (couchdb_couch_replicator_cluster_is_stable) < %(alertsCriticalClusterIsUnstable5m)s
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should the alerts use the filteringSelector from the config? Just wondering.

expr: |||
  min by(job, couchdb_cluster) (couchdb_couch_replicator_cluster_is_stable{%(filteringSelector)s}) < %(alertsCriticalClusterIsUnstable5m)s
||| % this.config,

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Went ahead and implemented these!

||| % $._config,
||| % this.config,
'for': '5m',
labels: {
severity: 'critical',
Expand All @@ -19,14 +19,14 @@
(
'{{$labels.couchdb_cluster}} has reported a value of {{ printf "%%.0f" $value }} for its stability over the last 5 minutes, ' +
'which is below the threshold of %(alertsCriticalClusterIsUnstable5m)s.'
) % $._config,
) % this.config,
},
},
{
alert: 'CouchDBHigh4xxResponseCodes',
expr: |||
sum by(job, instance) (increase(couchdb_httpd_status_codes{code=~"4.*"}[5m])) > %(alertsWarning4xxResponseCodes5m)s
||| % $._config,
sum by(job, instance) (increase(couchdb_httpd_status_codes{code=~"4.."}[5m])) > %(alertsWarning4xxResponseCodes5m)s
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Isn't the 4.* regex more general than the 4..?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

https://github.com/grafana/jsonnet-libs/actions/runs/19470908585/job/55717683328?pr=1522

Unfortunately, it's considered a messy selector. For some reason the lint passes locally for me, but it fails in CI.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This fails because of the Pint linter rules, which take a bit more setup to install locally. I have no strong feelings about this either way. On one hand it's good to be specific to the three-digit format; on the other, it could be an issue if status codes were mixed with text, e.g. 404NotFound as a label value.

I doubt we will run into the latter, so for now I think 4.. is perfectly fine

||| % this.config,
'for': '5m',
labels: {
severity: 'warning',
Expand All @@ -37,14 +37,14 @@
(
'{{ printf "%%.0f" $value }} 4xx responses have been detected over the last 5 minutes on {{$labels.instance}}, ' +
'which is above the threshold of %(alertsWarning4xxResponseCodes5m)s.'
) % $._config,
) % this.config,
},
},
{
alert: 'CouchDBHigh5xxResponseCodes',
expr: |||
sum by(job, instance) (increase(couchdb_httpd_status_codes{code=~"5.*"}[5m])) > %(alertsCritical5xxResponseCodes5m)s
||| % $._config,
sum by(job, instance) (increase(couchdb_httpd_status_codes{code=~"5.."}[5m])) > %(alertsCritical5xxResponseCodes5m)s
||| % this.config,
'for': '5m',
labels: {
severity: 'critical',
Expand All @@ -55,14 +55,14 @@
(
'{{ printf "%%.0f" $value }} 5xx responses have been detected over the last 5 minutes on {{$labels.instance}}, ' +
'which is above the threshold of %(alertsCritical5xxResponseCodes5m)s.'
) % $._config,
) % this.config,
},
},
{
alert: 'CouchDBModerateRequestLatency',
expr: |||
sum by(job, instance) (couchdb_request_time_seconds_sum / couchdb_request_time_seconds_count) > %(alertsWarningRequestLatency5m)s
||| % $._config,
||| % this.config,
'for': '5m',
labels: {
severity: 'warning',
Expand All @@ -73,14 +73,14 @@
(
'An average of {{ printf "%%.0f" $value }}ms of request latency has occurred over the last 5 minutes on {{$labels.instance}}, ' +
'which is above the threshold of %(alertsWarningRequestLatency5m)sms. '
) % $._config,
) % this.config,
},
},
{
alert: 'CouchDBHighRequestLatency',
expr: |||
sum by(job, instance) (couchdb_request_time_seconds_sum / couchdb_request_time_seconds_count) > %(alertsCriticalRequestLatency5m)s
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I know that you didn't change this, but I think there is a bug in the calculation: sum / count is in seconds, while the config declares the threshold in milliseconds (500), so we should multiply the value by 1000 before the comparison operator. I think this is the second time I've caught this bug — which is not on you; it's there in the original implementations.

expr: |||
  sum by(job, instance) (couchdb_request_time_seconds_sum / couchdb_request_time_seconds_count) * 1000 > %(alertsWarningRequestLatency5m)s
||| % this.config,

Alternatively, we can change the alert threshold in the config. I am fine with either solution.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ah yep didn't really audit the content of the queries too much. Will go ahead and take a stab at a fix

||| % $._config,
||| % this.config,
'for': '5m',
labels: {
severity: 'critical',
Expand All @@ -91,14 +91,14 @@
(
'An average of {{ printf "%%.0f" $value }}ms of request latency has occurred over the last 5 minutes on {{$labels.instance}}, ' +
'which is above the threshold of %(alertsCriticalRequestLatency5m)sms. '
) % $._config,
) % this.config,
},
},
{
alert: 'CouchDBManyReplicatorJobsPending',
expr: |||
sum by(job, instance) (couchdb_couch_replicator_jobs_pending) > %(alertsWarningPendingReplicatorJobs5m)s
||| % $._config,
||| % this.config,
'for': '5m',
labels: {
severity: 'warning',
Expand All @@ -109,14 +109,14 @@
(
'{{ printf "%%.0f" $value }} replicator jobs are pending on {{$labels.instance}}, ' +
'which is above the threshold of %(alertsWarningPendingReplicatorJobs5m)s. '
) % $._config,
) % this.config,
},
},
{
alert: 'CouchDBReplicatorJobsCrashing',
expr: |||
sum by(job, instance) (increase(couchdb_couch_replicator_jobs_crashes_total[5m])) > %(alertsCriticalCrashingReplicatorJobs5m)s
||| % $._config,
||| % this.config,
'for': '5m',
labels: {
severity: 'critical',
Expand All @@ -127,14 +127,14 @@
(
'{{ printf "%%.0f" $value }} replicator jobs have crashed over the last 5 minutes on {{$labels.instance}}, ' +
'which is above the threshold of %(alertsCriticalCrashingReplicatorJobs5m)s. '
) % $._config,
) % this.config,
},
},
{
alert: 'CouchDBReplicatorChangesQueuesDying',
expr: |||
sum by(job, instance) (increase(couchdb_couch_replicator_changes_queue_deaths_total[5m])) > %(alertsWarningDyingReplicatorChangesQueues5m)s
||| % $._config,
||| % this.config,
'for': '5m',
labels: {
severity: 'warning',
Expand All @@ -145,14 +145,14 @@
(
'{{ printf "%%.0f" $value }} replicator changes queue processes have died over the last 5 minutes on {{$labels.instance}}, ' +
'which is above the threshold of %(alertsWarningDyingReplicatorChangesQueues5m)s. '
) % $._config,
) % this.config,
},
},
{
alert: 'CouchDBReplicatorConnectionOwnersCrashing',
alert: 'CouchDBReplicatorOwnersCrashing',
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Would changing the alert name affect existing installations? What if someone uninstalls or upgrades an integration, would they get duplicate alerts?

Looping in @Dasomeone for more context.

expr: |||
sum by(job, instance) (increase(couchdb_couch_replicator_connection_owner_crashes_total[5m])) > %(alertsWarningCrashingReplicatorConnectionOwners5m)s
||| % $._config,
||| % this.config,
'for': '5m',
labels: {
severity: 'warning',
Expand All @@ -163,14 +163,14 @@
(
'{{ printf "%%.0f" $value }} replicator connection owner processes have crashed over the last 5 minutes on {{$labels.instance}}, ' +
'which is above the threshold of %(alertsWarningCrashingReplicatorConnectionOwners5m)s. '
) % $._config,
) % this.config,
},
},
{
alert: 'CouchDBReplicatorConnectionWorkersCrashing',
alert: 'CouchDBReplicatorWorkersCrashing',
expr: |||
sum by(job, instance) (increase(couchdb_couch_replicator_connection_worker_crashes_total[5m])) > %(alertsWarningCrashingReplicatorConnectionWorkers5m)s
||| % $._config,
||| % this.config,
'for': '5m',
labels: {
severity: 'warning',
Expand All @@ -181,7 +181,7 @@
(
'{{ printf "%%.0f" $value }} replicator connection worker processes have crashed over the last 5 minutes on {{$labels.instance}}, ' +
'which is above the threshold of %(alertsWarningCrashingReplicatorConnectionWorkers5m)s. '
) % $._config,
) % this.config,
},
},
],
Expand Down
63 changes: 43 additions & 20 deletions apache-couchdb-mixin/config.libsonnet
Original file line number Diff line number Diff line change
@@ -1,26 +1,49 @@
{
_config+:: {
enableMultiCluster: false,
couchDBSelector: if self.enableMultiCluster then 'job=~"$job", cluster=~"$cluster"' else 'job=~"$job"',
multiClusterSelector: 'job=~"$job"',
local this = self,
filteringSelector: 'job="integrations/apache-couchdb"',
groupLabels: ['job', 'couchdb_cluster', 'cluster'],
logLabels: ['job', 'cluster', 'instance'],
instanceLabels: ['instance'],

dashboardTags: ['apache-couchdb-mixin'],
dashboardPeriod: 'now-1h',
dashboardTimezone: 'default',
dashboardRefresh: '1m',
dashboardTags: ['apache-couchdb-mixin'],
uid: 'couchdb',
dashboardNamePrefix: 'Apache CouchDB',
dashboardPeriod: 'now-1h',
dashboardTimezone: 'default',
dashboardRefresh: '1m',
metricsSource: [
'prometheus',
/*
* the prometheusWithTotal source is used for backwards compatibility, as some metrics are suffixed with _total in earlier versions but lose that suffix in later versions of the couchdb-mixin.
* i.e. couchdb_open_os_files_total => couchdb_open_os_files
* This is to ensure that the signals for the metrics that are suffixed with _total continue to work as expected.
* This was identified as a noticeable change from 3.3.0 to 3.5.0
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thank you! 🚀
Can you call this out in the readme as well please? E.g. just what versions are supported, and what the different metricSources are for

*/
'prometheusWithTotal',
],

//alert thresholds
alertsCriticalClusterIsUnstable5m: 1, //1 is stable
alertsWarning4xxResponseCodes5m: 5,
alertsCritical5xxResponseCodes5m: 0,
alertsWarningRequestLatency5m: 500, //ms
alertsCriticalRequestLatency5m: 1000, //ms
alertsWarningPendingReplicatorJobs5m: 10,
alertsCriticalCrashingReplicatorJobs5m: 0,
alertsWarningDyingReplicatorChangesQueues5m: 0,
alertsWarningCrashingReplicatorConnectionOwners5m: 0,
alertsWarningCrashingReplicatorConnectionWorkers5m: 0,
// Logging configuration
enableLokiLogs: true,
extraLogLabels: ['level'],
logsVolumeGroupBy: 'level',
showLogsVolume: true,

enableLokiLogs: true,
//alert thresholds
alertsCriticalClusterIsUnstable5m: 1, //1 is stable
alertsWarning4xxResponseCodes5m: 5,
alertsCritical5xxResponseCodes5m: 0,
alertsWarningRequestLatency5m: 500, //ms
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is in milliseconds, which means that the alert threshold is broken.

alertsCriticalRequestLatency5m: 1000, //ms
alertsWarningPendingReplicatorJobs5m: 10,
alertsCriticalCrashingReplicatorJobs5m: 0,
alertsWarningDyingReplicatorChangesQueues5m: 0,
alertsWarningCrashingReplicatorConnectionOwners5m: 0,
alertsWarningCrashingReplicatorConnectionWorkers5m: 0,

// Signals configuration
signals+: {
overview: (import './signals/overview.libsonnet')(this),
nodes: (import './signals/nodes.libsonnet')(this),
replicator: (import './signals/replicator.libsonnet')(this),
},
}
107 changes: 107 additions & 0 deletions apache-couchdb-mixin/dashboards.libsonnet
Original file line number Diff line number Diff line change
@@ -0,0 +1,107 @@
// Builds the Grafana dashboards for the Apache CouchDB mixin.
// NOTE(review): this file came from a diff scrape; original indentation was lost,
// so only comments have been added here — all code tokens are unchanged.
local g = import './g.libsonnet';
local logslib = import 'logs-lib/logs/main.libsonnet';

{
local root = self,

// new(this): returns a map of dashboard JSON filenames to dashboard objects.
// `this` is the mixin instance; dashboard names, tags, uid, refresh, period and
// timezone are all read from `this.config`, and rows/variables/annotations/links
// come from `this.grafana`.
new(this)::
local prefix = this.config.dashboardNamePrefix;
local links = this.grafana.links;
local tags = this.config.dashboardTags;
// slugify the configured uid so it is safe to use as a Grafana dashboard UID
local uid = g.util.string.slugify(this.config.uid);
local vars = this.grafana.variables;
local annotations = this.grafana.annotations;
local refresh = this.config.dashboardRefresh;
local period = this.config.dashboardPeriod;
local timezone = this.config.dashboardTimezone;
{
// Cluster-level overview dashboard: overview, requests and replication rows.
'couchdb-overview.json':
g.dashboard.new(prefix + ' overview')
+ g.dashboard.withPanels(
g.util.panel.resolveCollapsedFlagOnRows(
g.util.grid.wrapPanels(
[
this.grafana.rows.overview,
this.grafana.rows.overviewRequests,
this.grafana.rows.overviewReplication,
]
)
)
) + root.applyCommon(
vars.multiInstance,
uid + '_overview',
tags,
// `couchdbOverview+:: {}` hides this dashboard's own entry in the link
// list — presumably to avoid a self-referencing link; TODO confirm.
links { couchdbOverview+:: {} },
annotations,
timezone,
refresh,
period
),

// Per-node dashboard: node stats, node requests and (optionally) node logs rows.
'couchdb-nodes.json':
g.dashboard.new(prefix + ' nodes')
+ g.dashboard.withPanels(
g.util.panel.resolveCollapsedFlagOnRows(
g.util.grid.wrapPanels(
[
this.grafana.rows.nodes,
this.grafana.rows.nodeRequests,
this.grafana.rows.nodeLogs,
],
),
),
) + root.applyCommon(
vars.multiInstance,
uid + '_nodes',
tags,
links { couchdbNodes+:: {} },
annotations,
timezone,
refresh,
period
),

}
// The logs dashboard is only emitted when Loki logs are enabled in the config.
+ if this.config.enableLokiLogs then {
'couchdb-logs.json':
// logs-lib builds the full logs dashboard from the Loki datasource,
// the mixin's filtering selector and its group/extra log labels.
logslib.new(
prefix + ' logs',
datasourceName=this.grafana.variables.datasources.loki.name,
datasourceRegex=this.grafana.variables.datasources.loki.regex,
filterSelector=this.config.filteringSelector,
labels=this.config.groupLabels + this.config.extraLogLabels,
formatParser=null,
showLogsVolume=this.config.showLogsVolume,
)
{
// Patch the generated logs dashboard: apply the shared settings,
// adjust the logs panel options, and add the (hidden) Prometheus
// datasource variable so shared links keep working.
dashboards+:
{
logs+:
root.applyCommon(super.logs.templating.list, uid=uid + '-logs', tags=tags, links=links { couchdbLogs+:: {} }, annotations=annotations, timezone=timezone, refresh=refresh, period=period),
},
panels+:
{
logs+:
g.panel.logs.options.withEnableLogDetails(true)
+ g.panel.logs.options.withShowTime(false)
+ g.panel.logs.options.withWrapLogMessage(false),
},
variables+: {
toArray+: [
// hide: 2 hides the variable from the dashboard UI
this.grafana.variables.datasources.prometheus { hide: 2 },
],
},
}.dashboards.logs,
}
else {},

// applyCommon: settings shared by every dashboard in this mixin —
// tags, uid, links, timezone, refresh interval, time range, template
// variables and annotations. Meant to be `+`-ed onto a dashboard object.
applyCommon(vars, uid, tags, links, annotations, timezone, refresh, period):
g.dashboard.withTags(tags)
+ g.dashboard.withUid(uid)
+ g.dashboard.withLinks(std.objectValues(links))
+ g.dashboard.withTimezone(timezone)
+ g.dashboard.withRefresh(refresh)
+ g.dashboard.time.withFrom(period)
+ g.dashboard.withVariables(vars)
+ g.dashboard.withAnnotations(std.objectValues(annotations)),
}
Loading
Loading