Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
{
prometheusAlerts+:: {
new(this): {
groups+: [
{
name: 'ApacheCouchDBAlerts',
Expand All @@ -8,7 +8,7 @@
alert: 'CouchDBUnhealthyCluster',
expr: |||
min by(job, couchdb_cluster) (couchdb_couch_replicator_cluster_is_stable) < %(alertsCriticalClusterIsUnstable5m)s
||| % $._config,
||| % this.config,
'for': '5m',
labels: {
severity: 'critical',
Expand All @@ -19,14 +19,14 @@
(
'{{$labels.couchdb_cluster}} has reported a value of {{ printf "%%.0f" $value }} for its stability over the last 5 minutes, ' +
'which is below the threshold of %(alertsCriticalClusterIsUnstable5m)s.'
) % $._config,
) % this.config,
},
},
{
alert: 'CouchDBHigh4xxResponseCodes',
expr: |||
sum by(job, instance) (increase(couchdb_httpd_status_codes{code=~"4.*"}[5m])) > %(alertsWarning4xxResponseCodes5m)s
||| % $._config,
sum by(job, instance) (increase(couchdb_httpd_status_codes{code=~"4.."}[5m])) > %(alertsWarning4xxResponseCodes5m)s
||| % this.config,
'for': '5m',
labels: {
severity: 'warning',
Expand All @@ -37,14 +37,14 @@
(
'{{ printf "%%.0f" $value }} 4xx responses have been detected over the last 5 minutes on {{$labels.instance}}, ' +
'which is above the threshold of %(alertsWarning4xxResponseCodes5m)s.'
) % $._config,
) % this.config,
},
},
{
alert: 'CouchDBHigh5xxResponseCodes',
expr: |||
sum by(job, instance) (increase(couchdb_httpd_status_codes{code=~"5.*"}[5m])) > %(alertsCritical5xxResponseCodes5m)s
||| % $._config,
sum by(job, instance) (increase(couchdb_httpd_status_codes{code=~"5.."}[5m])) > %(alertsCritical5xxResponseCodes5m)s
||| % this.config,
'for': '5m',
labels: {
severity: 'critical',
Expand All @@ -55,14 +55,14 @@
(
'{{ printf "%%.0f" $value }} 5xx responses have been detected over the last 5 minutes on {{$labels.instance}}, ' +
'which is above the threshold of %(alertsCritical5xxResponseCodes5m)s.'
) % $._config,
) % this.config,
},
},
{
alert: 'CouchDBModerateRequestLatency',
expr: |||
sum by(job, instance) (couchdb_request_time_seconds_sum / couchdb_request_time_seconds_count) > %(alertsWarningRequestLatency5m)s
||| % $._config,
||| % this.config,
'for': '5m',
labels: {
severity: 'warning',
Expand All @@ -73,14 +73,14 @@
(
'An average of {{ printf "%%.0f" $value }}ms of request latency has occurred over the last 5 minutes on {{$labels.instance}}, ' +
'which is above the threshold of %(alertsWarningRequestLatency5m)sms. '
) % $._config,
) % this.config,
},
},
{
alert: 'CouchDBHighRequestLatency',
expr: |||
sum by(job, instance) (couchdb_request_time_seconds_sum / couchdb_request_time_seconds_count) > %(alertsCriticalRequestLatency5m)s
||| % $._config,
||| % this.config,
'for': '5m',
labels: {
severity: 'critical',
Expand All @@ -91,14 +91,14 @@
(
'An average of {{ printf "%%.0f" $value }}ms of request latency has occurred over the last 5 minutes on {{$labels.instance}}, ' +
'which is above the threshold of %(alertsCriticalRequestLatency5m)sms. '
) % $._config,
) % this.config,
},
},
{
alert: 'CouchDBManyReplicatorJobsPending',
expr: |||
sum by(job, instance) (couchdb_couch_replicator_jobs_pending) > %(alertsWarningPendingReplicatorJobs5m)s
||| % $._config,
||| % this.config,
'for': '5m',
labels: {
severity: 'warning',
Expand All @@ -109,14 +109,14 @@
(
'{{ printf "%%.0f" $value }} replicator jobs are pending on {{$labels.instance}}, ' +
'which is above the threshold of %(alertsWarningPendingReplicatorJobs5m)s. '
) % $._config,
) % this.config,
},
},
{
alert: 'CouchDBReplicatorJobsCrashing',
expr: |||
sum by(job, instance) (increase(couchdb_couch_replicator_jobs_crashes_total[5m])) > %(alertsCriticalCrashingReplicatorJobs5m)s
||| % $._config,
||| % this.config,
'for': '5m',
labels: {
severity: 'critical',
Expand All @@ -127,14 +127,14 @@
(
'{{ printf "%%.0f" $value }} replicator jobs have crashed over the last 5 minutes on {{$labels.instance}}, ' +
'which is above the threshold of %(alertsCriticalCrashingReplicatorJobs5m)s. '
) % $._config,
) % this.config,
},
},
{
alert: 'CouchDBReplicatorChangesQueuesDying',
expr: |||
sum by(job, instance) (increase(couchdb_couch_replicator_changes_queue_deaths_total[5m])) > %(alertsWarningDyingReplicatorChangesQueues5m)s
||| % $._config,
||| % this.config,
'for': '5m',
labels: {
severity: 'warning',
Expand All @@ -145,14 +145,14 @@
(
'{{ printf "%%.0f" $value }} replicator changes queue processes have died over the last 5 minutes on {{$labels.instance}}, ' +
'which is above the threshold of %(alertsWarningDyingReplicatorChangesQueues5m)s. '
) % $._config,
) % this.config,
},
},
{
alert: 'CouchDBReplicatorConnectionOwnersCrashing',
alert: 'CouchDBReplicatorOwnersCrashing',
expr: |||
sum by(job, instance) (increase(couchdb_couch_replicator_connection_owner_crashes_total[5m])) > %(alertsWarningCrashingReplicatorConnectionOwners5m)s
||| % $._config,
||| % this.config,
'for': '5m',
labels: {
severity: 'warning',
Expand All @@ -163,14 +163,14 @@
(
'{{ printf "%%.0f" $value }} replicator connection owner processes have crashed over the last 5 minutes on {{$labels.instance}}, ' +
'which is above the threshold of %(alertsWarningCrashingReplicatorConnectionOwners5m)s. '
) % $._config,
) % this.config,
},
},
{
alert: 'CouchDBReplicatorConnectionWorkersCrashing',
alert: 'CouchDBReplicatorWorkersCrashing',
expr: |||
sum by(job, instance) (increase(couchdb_couch_replicator_connection_worker_crashes_total[5m])) > %(alertsWarningCrashingReplicatorConnectionWorkers5m)s
||| % $._config,
||| % this.config,
'for': '5m',
labels: {
severity: 'warning',
Expand All @@ -181,7 +181,7 @@
(
'{{ printf "%%.0f" $value }} replicator connection worker processes have crashed over the last 5 minutes on {{$labels.instance}}, ' +
'which is above the threshold of %(alertsWarningCrashingReplicatorConnectionWorkers5m)s. '
) % $._config,
) % this.config,
},
},
],
Expand Down
63 changes: 43 additions & 20 deletions apache-couchdb-mixin/config.libsonnet
Original file line number Diff line number Diff line change
@@ -1,26 +1,49 @@
{
_config+:: {
enableMultiCluster: false,
couchDBSelector: if self.enableMultiCluster then 'job=~"$job", cluster=~"$cluster"' else 'job=~"$job"',
multiClusterSelector: 'job=~"$job"',
local this = self,
filteringSelector: 'job="integrations/apache-couchdb"',
groupLabels: ['job', 'couchdb_cluster', 'cluster'],
logLabels: ['job', 'cluster', 'instance'],
instanceLabels: ['instance'],

dashboardTags: ['apache-couchdb-mixin'],
dashboardPeriod: 'now-1h',
dashboardTimezone: 'default',
dashboardRefresh: '1m',
dashboardTags: ['apache-couchdb-mixin'],
uid: 'couchdb',
dashboardNamePrefix: 'Apache CouchDB',
dashboardPeriod: 'now-1h',
dashboardTimezone: 'default',
dashboardRefresh: '1m',
metricsSource: [
'prometheus',
/*
* prometheusWithTotal is kept for backwards compatibility: some metrics are suffixed with _total, but lose that suffix in later versions of the couchdb-mixin,
* e.g. couchdb_open_os_files_total => couchdb_open_os_files
* This is to ensure that the signals for the metrics that are suffixed with _total continue to work as expected.
* This was identified as a noticeable change from 3.3.0 to 3.5.0.
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thank you! 🚀
Can you call this out in the readme as well please? E.g. just what versions are supported, and what the different metricSources are for

*/
'prometheusWithTotal',
],

//alert thresholds
alertsCriticalClusterIsUnstable5m: 1, //1 is stable
alertsWarning4xxResponseCodes5m: 5,
alertsCritical5xxResponseCodes5m: 0,
alertsWarningRequestLatency5m: 500, //ms
alertsCriticalRequestLatency5m: 1000, //ms
alertsWarningPendingReplicatorJobs5m: 10,
alertsCriticalCrashingReplicatorJobs5m: 0,
alertsWarningDyingReplicatorChangesQueues5m: 0,
alertsWarningCrashingReplicatorConnectionOwners5m: 0,
alertsWarningCrashingReplicatorConnectionWorkers5m: 0,
// Logging configuration
enableLokiLogs: true,
extraLogLabels: ['level'],
logsVolumeGroupBy: 'level',
showLogsVolume: true,

enableLokiLogs: true,
//alert thresholds
alertsCriticalClusterIsUnstable5m: 1, //1 is stable
alertsWarning4xxResponseCodes5m: 5,
alertsCritical5xxResponseCodes5m: 0,
alertsWarningRequestLatency5m: 500, //ms
alertsCriticalRequestLatency5m: 1000, //ms
alertsWarningPendingReplicatorJobs5m: 10,
alertsCriticalCrashingReplicatorJobs5m: 0,
alertsWarningDyingReplicatorChangesQueues5m: 0,
alertsWarningCrashingReplicatorConnectionOwners5m: 0,
alertsWarningCrashingReplicatorConnectionWorkers5m: 0,

// Signals configuration
signals+: {
overview: (import './signals/overview.libsonnet')(this),
nodes: (import './signals/nodes.libsonnet')(this),
replicator: (import './signals/replicator.libsonnet')(this),
},
}
107 changes: 107 additions & 0 deletions apache-couchdb-mixin/dashboards.libsonnet
Original file line number Diff line number Diff line change
@@ -0,0 +1,107 @@
local g = import './g.libsonnet';
local logslib = import 'logs-lib/logs/main.libsonnet';

{
local root = self,

// Builds the object mapping dashboard file names to dashboard definitions.
//
// `this` is read for:
//   - this.config: dashboardNamePrefix, dashboardTags, uid, dashboardRefresh,
//     dashboardPeriod, dashboardTimezone, enableLokiLogs, filteringSelector,
//     groupLabels, extraLogLabels, showLogsVolume
//   - this.grafana: links, variables, annotations, rows
//
// Always returns 'couchdb-overview.json' and 'couchdb-nodes.json';
// 'couchdb-logs.json' is added only when this.config.enableLokiLogs is true.
new(this)::
  local cfg = this.config;
  local grafana = this.grafana;
  local baseUid = g.util.string.slugify(cfg.uid);

  // Forwards the settings that are identical for every dashboard to
  // root.applyCommon; only variables, uid and links vary per dashboard.
  local common(variables, dashboardUid, dashboardLinks) =
    root.applyCommon(
      variables,
      dashboardUid,
      cfg.dashboardTags,
      dashboardLinks,
      grafana.annotations,
      cfg.dashboardTimezone,
      cfg.dashboardRefresh,
      cfg.dashboardPeriod
    );

  // A metrics dashboard: the given rows wrapped into a grid, plus common settings.
  local metricsDashboard(titleSuffix, rows, uidSuffix, dashboardLinks) =
    g.dashboard.new(cfg.dashboardNamePrefix + titleSuffix)
    + g.dashboard.withPanels(
      g.util.panel.resolveCollapsedFlagOnRows(
        g.util.grid.wrapPanels(rows)
      )
    )
    + common(grafana.variables.multiInstance, baseUid + uidSuffix, dashboardLinks);

  local metricsDashboards = {
    'couchdb-overview.json':
      metricsDashboard(
        ' overview',
        [
          grafana.rows.overview,
          grafana.rows.overviewRequests,
          grafana.rows.overviewReplication,
        ],
        '_overview',
        // `+::` hides this entry, so std.objectValues(links) in applyCommon
        // skips it (presumably so the dashboard does not link to itself).
        grafana.links { couchdbOverview+:: {} }
      ),

    'couchdb-nodes.json':
      metricsDashboard(
        ' nodes',
        [
          grafana.rows.nodes,
          grafana.rows.nodeRequests,
          grafana.rows.nodeLogs,
        ],
        '_nodes',
        grafana.links { couchdbNodes+:: {} }
      ),
  };

  local logsDashboards =
    if cfg.enableLokiLogs then {
      'couchdb-logs.json':
        logslib.new(
          cfg.dashboardNamePrefix + ' logs',
          datasourceName=grafana.variables.datasources.loki.name,
          datasourceRegex=grafana.variables.datasources.loki.regex,
          filterSelector=cfg.filteringSelector,
          labels=cfg.groupLabels + cfg.extraLogLabels,
          formatParser=null,
          showLogsVolume=cfg.showLogsVolume,
        )
        {
          dashboards+: {
            // `super` must stay inside this object: it refers to the
            // `dashboards` object produced by logslib.new, whose template
            // variable list is reused for the logs dashboard.
            logs+: common(
              super.logs.templating.list,
              baseUid + '-logs',
              grafana.links { couchdbLogs+:: {} }
            ),
          },
          panels+: {
            logs+:
              g.panel.logs.options.withEnableLogDetails(true)
              + g.panel.logs.options.withShowTime(false)
              + g.panel.logs.options.withWrapLogMessage(false),
          },
          variables+: {
            toArray+: [
              grafana.variables.datasources.prometheus { hide: 2 },
            ],
          },
        }.dashboards.logs,
    }
    else {};

  metricsDashboards + logsDashboards,

// Returns the dashboard mixin shared by all dashboards in this file: tags,
// uid, links, timezone, refresh interval, time-range start, template
// variables and annotations.
//
// `links` and `annotations` are objects; only their visible field values are
// used (std.objectValues skips hidden fields).
applyCommon(vars, uid, tags, links, annotations, timezone, refresh, period):
  local linkList = std.objectValues(links);
  local annotationList = std.objectValues(annotations);
  // Compose the mixins left to right, in the same order as a plain `+` chain.
  std.foldl(
    function(acc, mixin) acc + mixin,
    [
      g.dashboard.withTags(tags),
      g.dashboard.withUid(uid),
      g.dashboard.withLinks(linkList),
      g.dashboard.withTimezone(timezone),
      g.dashboard.withRefresh(refresh),
      g.dashboard.time.withFrom(period),
      g.dashboard.withVariables(vars),
      g.dashboard.withAnnotations(annotationList),
    ],
    {}
  ),
}
Loading
Loading