diff --git a/mongodb-mixin/README.md b/mongodb-mixin/README.md index fc5d20b69..534bad444 100644 --- a/mongodb-mixin/README.md +++ b/mongodb-mixin/README.md @@ -1,6 +1,6 @@ -# MongoDB Mixin +# MongoDB mixin -The MongoDB Mixin is a set of configurable, reusable, and extensible alerts and dashboards based on the metrics exported by [Percona MongoDB Exporter](https://github.com/percona/mongodb_exporter). +The MongoDB mixin is a set of configurable, reusable, and extensible alerts and dashboards based on the metrics exported by [Percona MongoDB Exporter](https://github.com/percona/mongodb_exporter). This mixin includes 3 dashboards suited for MongoDB, namely MongoDB Cluster, MongoDB Instance and MongoDB ReplicaSet. diff --git a/mongodb-mixin/alerts/mongodbAlerts.libsonnet b/mongodb-mixin/alerts/mongodbAlerts.libsonnet new file mode 100755 index 000000000..3569fc992 --- /dev/null +++ b/mongodb-mixin/alerts/mongodbAlerts.libsonnet @@ -0,0 +1,140 @@ +local xtd = import 'github.com/jsonnet-libs/xtd/main.libsonnet'; +{ + prometheusAlerts+:: { + local config = + $._config + { + agg: std.join(',', $._config.groupLabels + $._config.instanceLabels), + aggCluster: std.join(',', $._config.groupLabels), + instanceLabel: xtd.array.slice($._config.instanceLabels, -1)[0], + groupLabel: xtd.array.slice($._config.groupLabels, -1)[0], + + }, + groups+: [ + { + name: 'MongodbAlerts', + rules: [ + { + alert: 'MongodbDown', + annotations: { + summary: 'MongoDB instance is down.', + description: 'MongoDB instance {{ $labels.%(instanceLabel)s }} is down.' % config, + }, + expr: 'mongodb_up{%(filteringSelector)s} == 0' % config, + 'for': '5m', + labels: { + severity: 'critical', + }, + }, + { + alert: 'MongodbReplicaMemberUnhealthy', + annotations: { + description: 'MongoDB replica member is unhealthy (instance {{ $labels.%(instanceLabel)s }}).' 
% config, + summary: 'MongoDB replica member is unhealthy.', + }, + expr: 'mongodb_mongod_replset_member_health{%(filteringSelector)s} == 0' % config, + labels: { + severity: 'critical', + }, + }, + { + alert: 'MongodbReplicationLag', + annotations: { + description: 'MongoDB replication lag is more than 60s (instance {{ $labels.%(instanceLabel)s }}).' % config, + summary: 'MongoDB replication lag is exceeding the threshold.', + }, + expr: 'mongodb_mongod_replset_member_replication_lag{state="SECONDARY", %(filteringSelector)s} > 60' % config, + 'for': '5m', + labels: { + severity: 'critical', + }, + }, + { + alert: 'MongodbReplicationHeadroom', + annotations: { + description: 'MongoDB replication headroom is <= 0 for {{ $labels.%(groupLabel)s }}.' % config, + summary: 'MongoDB replication headroom is exceeding the threshold.', + }, + expr: '(avg by (%(aggCluster)s) (mongodb_mongod_replset_oplog_tail_timestamp{%(filteringSelector)s} - mongodb_mongod_replset_oplog_head_timestamp{%(filteringSelector)s}) - (avg by (%(aggCluster)s) (mongodb_mongod_replset_member_optime_date{state="PRIMARY",%(filteringSelector)s}) - avg by (%(aggCluster)s) (mongodb_mongod_replset_member_optime_date{state="SECONDARY",%(filteringSelector)s}))) <= 0' % config, /* headroom = oplog window minus primary/secondary optime gap; every operand is aggregated by the same group labels and uses the same filteringSelector so the binary operators can match series one-to-one */ + 'for': '5m', + labels: { + severity: 'critical', + }, + }, + { + alert: 'MongodbNumberCursorsOpen', + annotations: { + description: 'Too many cursors opened by MongoDB for clients (> 10k) on {{ $labels.%(instanceLabel)s }}.' % config, + summary: 'MongoDB number of cursors open too high.', + }, + expr: 'mongodb_mongod_metrics_cursor_open{state="total", %(filteringSelector)s} > 10 * 1000' % config, + 'for': '2m', + labels: { + severity: 'warning', + }, + }, + { + alert: 'MongodbCursorsTimeouts', + annotations: { + description: 'Too many cursors are timing out on {{ $labels.%(instanceLabel)s }}.'
% config, + summary: 'MongoDB cursors timeouts are exceeding the threshold.', + }, + expr: 'increase(mongodb_mongod_metrics_cursor_timed_out_total{%(filteringSelector)s}[1m]) > 100' % config, + 'for': '2m', + labels: { + severity: 'warning', + }, + }, + { + alert: 'MongodbTooManyConnections', + annotations: { + description: 'Too many connections to MongoDB instance {{ $labels.%(instanceLabel)s }} (> 80%%).' % config, + summary: 'MongoDB has too many connections.', + }, + expr: 'avg by (%(agg)s) (avg_over_time(mongodb_connections{state="current",%(filteringSelector)s}[1m])) / avg by (%(agg)s) (sum (mongodb_connections{%(filteringSelector)s}) by (%(agg)s)) * 100 > 80' % config, /* mongodb_connections is a gauge, so it is smoothed with avg_over_time rather than rate; the denominator applies the same filteringSelector as the numerator */ + 'for': '2m', + labels: { + severity: 'warning', + }, + }, + { + alert: 'MongodbVirtualMemoryUsage', + annotations: { + description: 'MongoDB virtual memory usage is too high on {{ $labels.%(instanceLabel)s }}.' % config, + summary: 'MongoDB high memory usage.', + }, + expr: '(sum(mongodb_memory{type="virtual",%(filteringSelector)s}) by (%(agg)s) / sum(mongodb_memory{type="mapped",%(filteringSelector)s}) by (%(agg)s)) > 3' % config, + 'for': '5m', + labels: { + severity: 'warning', + }, + }, + { + alert: 'MongodbReadRequestsQueueingUp', + annotations: { + description: 'MongoDB requests are queuing up on {{ $labels.%(instanceLabel)s }}.' % config, + summary: 'MongoDB read requests are queuing up.', + }, + expr: 'delta(mongodb_mongod_global_lock_current_queue{type="reader",%(filteringSelector)s}[1m]) > 0' % config, + 'for': '5m', + labels: { + severity: 'warning', + }, + }, + { + alert: 'MongodbWriteRequestsQueueingUp', + annotations: { + description: 'MongoDB write requests are queueing up on {{ $labels.%(instanceLabel)s }}.'
% config, + summary: 'MongoDB write requests are queueing up.', + }, + expr: 'delta(mongodb_mongod_global_lock_current_queue{type="writer",%(filteringSelector)s}[1m]) > 0' % config, + 'for': '5m', + labels: { + severity: 'warning', + }, + }, + ], + }, + ], + }, +} diff --git a/mongodb-mixin/alerts/mongodbAlerts.yaml b/mongodb-mixin/alerts/mongodbAlerts.yaml deleted file mode 100755 index 4dd50eaab..000000000 --- a/mongodb-mixin/alerts/mongodbAlerts.yaml +++ /dev/null @@ -1,92 +0,0 @@ -groups: -- name: MongodbAlerts - rules: - - alert: MongodbDown - expr: mongodb_up == 0 - for: 5m - labels: - severity: critical - annotations: - summary: MongoDB Instance is Down. - description: "MongoDB instance is down\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - - - alert: MongodbReplicaMemberUnhealthy - expr: mongodb_mongod_replset_member_health == 0 - labels: - severity: critical - annotations: - summary: MongoDB replica member unhealthy. - description: "Mongodb replica member unhealthy (instance {{ $labels.instance }})" - - - alert: MongodbReplicationLag - expr: mongodb_mongod_replset_member_replication_lag{state="SECONDARY"} > 60 - for: 5m - labels: - severity: critical - annotations: - summary: MongoDB replication lag is exceeding the threshold. - description: "Mongodb replication lag is more than 10s\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - - - alert: MongodbReplicationHeadroom - expr: (avg(mongodb_mongod_replset_oplog_tail_timestamp - mongodb_mongod_replset_oplog_head_timestamp) - (avg(mongodb_mongod_replset_member_optime_date{state="PRIMARY"}) - avg(mongodb_mongod_replset_member_optime_date{state="SECONDARY"}))) <= 0 - for: 5m - labels: - severity: critical - annotations: - summary: MongoDB replication headroom is exceeding the threshold. 
- description: "MongoDB replication headroom is <= 0\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - - - alert: MongodbNumberCursorsOpen - expr: mongodb_mongod_metrics_cursor_open{state="total"} > 10 * 1000 - for: 2m - labels: - severity: warning - annotations: - summary: MongoDB number cursors open too high. - description: "Too many cursors opened by MongoDB for clients (> 10k)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - - - alert: MongodbCursorsTimeouts - expr: increase(mongodb_mongod_metrics_cursor_timed_out_total[1m]) > 100 - for: 2m - labels: - severity: warning - annotations: - summary: MongoDB cursors timeouts is exceeding the threshold. - description: "Too many cursors are timing out\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - - - alert: MongodbTooManyConnections - expr: avg by(instance) (rate(mongodb_connections{state="current"}[1m])) / avg by(instance) (sum (mongodb_connections) by (instance)) * 100 > 80 - for: 2m - labels: - severity: warning - annotations: - summary: MongoDB too many connections. - description: "Too many connections (> 80%)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - - - alert: MongodbVirtualMemoryUsage - expr: (sum(mongodb_memory{type="virtual"}) BY (instance) / sum(mongodb_memory{type="mapped"}) BY (instance)) > 3 - for: 2m - labels: - severity: warning - annotations: - summary: MongoDB virtual memory usage is too high. - description: "High memory usage\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - - # If the queue for read/write locks keeps growing - - alert: MongodbReadRequestsQueueingUp - expr: delta(mongodb_mongod_global_lock_current_queue{type="reader"}[1m]) > 0 - for: 5m - labels: - severity: warning - annotations: - summary: MongoDB read requests queuing up. 
- description: "MongoDB requests are queuing up\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - - - alert: MongodbWriteRequestsQueueingUp - expr: delta(mongodb_mongod_global_lock_current_queue{type="writer"}[1m]) > 0 - for: 5m - labels: - severity: warning - annotations: - summary: MongoDB write requests queuing up. - description: "MongoDB write requests are queueing up\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" diff --git a/mongodb-mixin/config.libsonnet b/mongodb-mixin/config.libsonnet new file mode 100644 index 000000000..d694dec0b --- /dev/null +++ b/mongodb-mixin/config.libsonnet @@ -0,0 +1,8 @@ +{ + _config+:: { + // used only in alerts + filteringSelector: '', + groupLabels: ['job', 'mongodb_cluster'], + instanceLabels: ['service_name'], + }, +} diff --git a/mongodb-mixin/dashboards/MongoDB_Cluster.json b/mongodb-mixin/dashboards/MongoDB_Cluster.json index d118f6b68..7a79af51d 100644 --- a/mongodb-mixin/dashboards/MongoDB_Cluster.json +++ b/mongodb-mixin/dashboards/MongoDB_Cluster.json @@ -88,7 +88,7 @@ "refId": "A" } ], - "title": "Number of Collections in Shards", + "title": "Number of collections in shards", "transformations": [ { "id": "filterFieldsByName", @@ -160,7 +160,7 @@ "refId": "A" } ], - "title": "Size of Collections in Shards", + "title": "Size of collections in shards", "transformations": [ { "id": "labelsToFields", @@ -239,7 +239,7 @@ "refId": "A" } ], - "title": "# of Shards", + "title": "# of shards", "type": "stat" }, { @@ -416,7 +416,7 @@ "refId": "A" } ], - "title": "Draining Shards", + "title": "Draining shards", "type": "stat" }, { @@ -475,7 +475,7 @@ "refId": "A" } ], - "title": "Sharded Collections", + "title": "Sharded collections", "type": "stat" }, { @@ -589,11 +589,11 @@ "exemplar": true, "expr": "max(mongodb_mongos_sharding_balancer_enabled{job=~\"$job\",mongodb_cluster=\"$cluster\"})", "interval": "", - "legendFormat": "Cluster Balanced", + "legendFormat": "Cluster balanced", "refId": "A" } ], - "title": "Balancer 
Enabled", + "title": "Balancer enabled", "type": "stat" }, { @@ -648,11 +648,11 @@ "exemplar": true, "expr": "min(mongodb_mongos_sharding_chunks_is_balanced{job=~\"$job\",mongodb_cluster=\"$cluster\"})", "interval": "", - "legendFormat": "Cluster Balanced", + "legendFormat": "Cluster balanced", "refId": "A" } ], - "title": "Chuncks Balanced", + "title": "Chunks balanced", "type": "stat" }, { @@ -707,7 +707,7 @@ "refId": "A" } ], - "title": "Shard Services QPS - $shard", + "title": "Shard services QPS - $shard", "type": "gauge" }, { @@ -760,7 +760,7 @@ "refId": "A" } ], - "title": "Config Services QPS - ", + "title": "Config services QPS - ", "type": "gauge" }, { @@ -813,7 +813,7 @@ "refId": "A" } ], - "title": "Mongos Services QPS", + "title": "Mongos services QPS", "type": "gauge" }, { @@ -827,7 +827,7 @@ }, "id": 20, "panels": [], - "title": "Chunks in Shards", + "title": "Chunks in shards", "type": "row" }, { @@ -879,7 +879,7 @@ "refId": "A" } ], - "title": "# of Chunks in Shards", + "title": "# of chunks in shards", "transformations": [ { "id": "labelsToFields", @@ -987,7 +987,7 @@ ], "timeFrom": null, "timeShift": null, - "title": "Dynamic of Chunks", + "title": "Dynamic of chunks", "type": "timeseries" }, { @@ -1073,7 +1073,7 @@ "refId": "A" } ], - "title": "Chunk Split Events", + "title": "Chunk split events", "type": "timeseries" }, { @@ -1159,7 +1159,7 @@ "refId": "A" } ], - "title": "Chunk Move Events", + "title": "Chunk move events", "type": "timeseries" }, { @@ -1173,7 +1173,7 @@ }, "id": 29, "panels": [], - "title": "Indexes in Shards", + "title": "Indexes in shards", "type": "row" }, { @@ -1225,7 +1225,7 @@ "refId": "A" } ], - "title": "# Indexes per Shard", + "title": "# Indexes per shard", "transformations": [ { "id": "labelsToFields", @@ -1334,7 +1334,7 @@ ], "timeFrom": null, "timeShift": null, - "title": "Dynamic of Indexes", + "title": "Dynamic of indexes", "type": "timeseries" }, { @@ -1387,7 +1387,7 @@ "refId": "A" } ], - "title": "Size of 
Indexes per Shard", + "title": "Size of indexes per shard", "transformations": [ { "id": "labelsToFields", @@ -1496,7 +1496,7 @@ ], "timeFrom": null, "timeShift": null, - "title": "Dynamic of Indexes Size", + "title": "Dynamic of indexes size", "type": "timeseries" }, { @@ -1591,7 +1591,7 @@ "refId": "A" } ], - "title": "Current Connections per Instance", + "title": "Current connections per instance", "type": "timeseries" }, { @@ -1672,7 +1672,7 @@ "refId": "A" } ], - "title": "Current Connections per Instance", + "title": "Current connections per instance", "type": "timeseries" }, { @@ -1753,7 +1753,7 @@ "refId": "A" } ], - "title": "Available Connections per Instance", + "title": "Available connections per instance", "type": "timeseries" }, { @@ -1834,7 +1834,7 @@ "refId": "A" } ], - "title": "Current Connections per Shard", + "title": "Current connections per shard", "type": "timeseries" }, { @@ -1929,7 +1929,7 @@ "refId": "A" } ], - "title": "Operattions Per Shard", + "title": "Operations per shard", "type": "timeseries" }, { @@ -2010,7 +2010,7 @@ "refId": "A" } ], - "title": "Operations By Type", + "title": "Operations by type", "type": "timeseries" }, { @@ -2192,7 +2192,7 @@ "refId": "A" } ], - "title": "Cursors Per Shard", + "title": "Cursors per shard", "type": "timeseries" }, { @@ -2273,7 +2273,7 @@ "refId": "A" } ], - "title": "Mongos Cursors", + "title": "Mongos cursors", "type": "timeseries" }, { @@ -2354,7 +2354,7 @@ "refId": "A" } ], - "title": "Cursors By Instance", + "title": "Cursors by instance", "type": "timeseries" }, { @@ -2368,7 +2368,7 @@ }, "id": 47, "panels": [], - "title": "Additional Info", + "title": "Additional info", "type": "row" }, { @@ -2449,7 +2449,7 @@ "refId": "A" } ], - "title": "Replication Lag by Set", + "title": "Replication lag by set", "type": "timeseries" }, { @@ -2530,7 +2530,7 @@ "refId": "A" } ], - "title": "Replication Lag by Set", + "title": "Replication lag by set", "type": "timeseries" } ], @@ -2663,7 +2663,7 @@ 
}, "timepicker": {}, "timezone": "", - "title": "MongoDB Cluster", + "title": "MongoDB cluster", "uid": "nU5ylgi7z", "version": 2 } diff --git a/mongodb-mixin/dashboards/MongoDB_Instance.json b/mongodb-mixin/dashboards/MongoDB_Instance.json index 34d0c9b1a..e0c3cd42b 100644 --- a/mongodb-mixin/dashboards/MongoDB_Instance.json +++ b/mongodb-mixin/dashboards/MongoDB_Instance.json @@ -217,7 +217,7 @@ "refId": "A" } ], - "title": "ReplSet", + "title": "Replica set", "transformations": [ { "id": "labelsToFields", @@ -293,7 +293,7 @@ "y": 1 }, "type": "stat", - "title": "Current ReplSet State", + "title": "Current replica set state", "transformations": [], "datasource": { "type": "prometheus", @@ -411,7 +411,7 @@ }, "id": 4, "panels": [], - "title": "Service Summary", + "title": "Service summary", "type": "row" }, { @@ -513,7 +513,7 @@ "refId": "C" } ], - "title": "Command Operations", + "title": "Command operations", "type": "timeseries" }, { @@ -685,7 +685,7 @@ "refId": "A" } ], - "title": "Document Operations", + "title": "Document operations", "type": "timeseries" }, { @@ -771,7 +771,7 @@ "refId": "A" } ], - "title": "Latency Detail", + "title": "Latency detail", "type": "timeseries" }, { @@ -857,7 +857,7 @@ "refId": "A" } ], - "title": "Queued Operations", + "title": "Queued operations", "type": "timeseries" }, { @@ -1037,7 +1037,7 @@ "refId": "B" } ], - "title": "Scanned and Moved Objects", + "title": "Scanned and moved objects", "type": "timeseries" }, { @@ -1124,7 +1124,7 @@ "refId": "A" } ], - "title": "Assert Events", + "title": "Assert events", "type": "timeseries" }, { @@ -1218,7 +1218,7 @@ "refId": "B" } ], - "title": "getLastError Write Operations", + "title": "getLastError write operations", "type": "timeseries" }, { @@ -1312,7 +1312,7 @@ "refId": "B" } ], - "title": "Query Efficiency", + "title": "Query efficiency", "type": "timeseries" }, { @@ -1391,11 +1391,11 @@ "exemplar": true, "expr": "avg by (service_name) 
(irate(mongodb_mongod_metrics_get_last_error_wtime_total_milliseconds{service_name=~\"$service_name\", mongodb_cluster=\"$cluster\",job=~\"$job\"}[$__rate_interval]) or irate(mongodb_mongos_metrics_get_last_error_wtime_total_milliseconds{service_name=~\"$service_name\", mongodb_cluster=\"$cluster\",job=~\"$job\"}[$__rate_interval]))", "interval": "", - "legendFormat": "Write Wait Time", + "legendFormat": "Write wait time", "refId": "A" } ], - "title": "getLastError Write Time", + "title": "getLastError write time", "type": "timeseries" }, { @@ -1481,7 +1481,7 @@ "refId": "A" } ], - "title": "Page Faults", + "title": "Page faults", "type": "timeseries" } ], @@ -1586,7 +1586,7 @@ }, "timepicker": {}, "timezone": "", - "title": "MongoDB Instance", + "title": "MongoDB instance", "uid": "68bDO-m7k", "version": 3 } diff --git a/mongodb-mixin/dashboards/MongoDB_ReplicaSet.json b/mongodb-mixin/dashboards/MongoDB_ReplicaSet.json index 661802e68..028b0f687 100644 --- a/mongodb-mixin/dashboards/MongoDB_ReplicaSet.json +++ b/mongodb-mixin/dashboards/MongoDB_ReplicaSet.json @@ -116,7 +116,7 @@ "refId": "A" } ], - "title": "ReplSet Members", + "title": "Replica set members", "type": "stat" }, { @@ -177,7 +177,7 @@ "refId": "A" } ], - "title": "ReplSet Last Election", + "title": "Replica set last election", "type": "stat" }, { @@ -238,7 +238,7 @@ "refId": "A" } ], - "title": "Avg ReplSet Lag", + "title": "Average replica set lag", "type": "stat" }, { @@ -304,7 +304,7 @@ "refId": "A" } ], - "title": "MongoDB Versions", + "title": "MongoDB versions", "transformations": [ { "id": "filterFieldsByName", @@ -330,7 +330,7 @@ "y": 3 }, "type": "state-timeline", - "title": "ReplSet States", + "title": "Replica set states", "transformations": [ { "id": "labelsToFields", @@ -384,12 +384,12 @@ "text": "STARTUP" }, "1": { - "color": "green", + "color": "semi-dark-blue", "index": 0, "text": "PRIMARY" }, "2": { - "color": "yellow", + "color": "light-blue", "index": 1, "text": "SECONDARY" }, @@ 
-409,7 +409,7 @@ "text": "UNKNOWN" }, "7": { - "color": "red", + "color": "dark-blue", "index": 6, "text": "ARBITER" }, @@ -914,7 +914,7 @@ "refId": "A" } ], - "title": "Max Heartbeat Time", + "title": "Max heartbeat time", "type": "timeseries" }, { @@ -1008,7 +1008,7 @@ "refId": "A" } ], - "title": "Max Member Ping Time - $service_name", + "title": "Max member ping time - $service_name", "type": "timeseries" }, { @@ -1034,7 +1034,7 @@ "refId": "A" } ], - "title": "Oplog Details", + "title": "Oplog details", "type": "row" }, { @@ -1128,7 +1128,7 @@ "refId": "A" } ], - "title": "Oplog Buffered Operations", + "title": "Oplog buffered operations", "type": "timeseries" }, { @@ -1222,7 +1222,7 @@ "refId": "A" } ], - "title": "Oplog Getmore Time", + "title": "Oplog getmore time", "type": "timeseries" }, { @@ -1313,7 +1313,7 @@ "expr": "time()-avg by (service_name) (mongodb_mongod_replset_oplog_tail_timestamp{service_name=~\"$service_name\", mongodb_cluster=\"$cluster\",job=~\"$job\"})", "hide": false, "interval": "", - "legendFormat": "Now to End", + "legendFormat": "Now to end", "refId": "A" }, { @@ -1324,11 +1324,11 @@ "expr": "avg by (service_name) (mongodb_mongod_replset_oplog_head_timestamp{service_name=~\"$service_name\", mongodb_cluster=\"$cluster\",job=~\"$job\"}-mongodb_mongod_replset_oplog_tail_timestamp{service_name=~\"$service_name\", mongodb_cluster=\"$cluster\",job=~\"$job\"})", "hide": false, "interval": "", - "legendFormat": "Oplog Range", + "legendFormat": "Oplog range", "refId": "B" } ], - "title": "Oplog Recovery Window - $service_name", + "title": "Oplog recovery window - $service_name", "type": "timeseries" }, { @@ -1419,7 +1419,7 @@ "expr": "avg by (service_name) (irate(mongodb_mongod_metrics_repl_preload_docs_total_milliseconds{service_name=~\"$service_name\", mongodb_cluster=\"$cluster\",job=~\"$job\"}[$__rate_interval]))", "hide": false, "interval": "", - "legendFormat": "Document Preload", + "legendFormat": "Document preload", "refId": "A" }, { 
@@ -1430,7 +1430,7 @@ "expr": "avg by (service_name) (irate(mongodb_mongod_metrics_repl_preload_indexes_total_milliseconds{service_name=~\"$service_name\", mongodb_cluster=\"$cluster\",job=~\"$job\"}[$__rate_interval]))", "hide": false, "interval": "", - "legendFormat": "Index Preload", + "legendFormat": "Index preload", "refId": "B" }, { @@ -1441,11 +1441,11 @@ "expr": "avg by (service_name) (irate(mongodb_mongod_metrics_repl_apply_batches_total_milliseconds{service_name=~\"$service_name\", mongodb_cluster=\"$cluster\",job=~\"$job\"}[$__rate_interval]))", "hide": false, "interval": "", - "legendFormat": "Batch Apply", + "legendFormat": "Batch apply", "refId": "C" } ], - "title": "Oplog Processing Time - $service_name", + "title": "Oplog processing time - $service_name", "type": "timeseries" }, { @@ -1536,7 +1536,7 @@ "expr": "avg by (service_name) (irate(mongodb_mongod_metrics_repl_preload_docs_num_total{service_name=~\"$service_name\", mongodb_cluster=\"$cluster\",job=~\"$job\"}[$__rate_interval]))", "hide": false, "interval": "", - "legendFormat": "Document Preload", + "legendFormat": "Document preload", "refId": "A" }, { @@ -1547,7 +1547,7 @@ "expr": "avg by (service_name) (irate(mongodb_mongod_metrics_repl_preload_indexes_num_total{service_name=~\"$service_name\", mongodb_cluster=\"$cluster\",job=~\"$job\"}[$__rate_interval]))", "hide": false, "interval": "", - "legendFormat": "Index Preload", + "legendFormat": "Index preload", "refId": "B" }, { @@ -1558,11 +1558,11 @@ "expr": "avg by (service_name) (irate(mongodb_mongod_metrics_repl_apply_ops_total{service_name=~\"$service_name\", mongodb_cluster=\"$cluster\",job=~\"$job\"}[$__rate_interval]))", "hide": false, "interval": "", - "legendFormat": "Batch Apply", + "legendFormat": "Batch apply", "refId": "C" } ], - "title": "Oplog Operations - $service_name", + "title": "Oplog operations - $service_name", "type": "timeseries" } ], @@ -1715,7 +1715,7 @@ }, "timepicker": {}, "timezone": "", - "title": "MongoDB 
ReplicaSet", + "title": "MongoDB replica set", "uid": "U5CBoam7z", "version": 3, "weekStart": "" diff --git a/mongodb-mixin/dashboards_out/MongoDB_Cluster.json b/mongodb-mixin/dashboards_out/MongoDB_Cluster.json index 4da3708f2..f71e0f0e1 100644 --- a/mongodb-mixin/dashboards_out/MongoDB_Cluster.json +++ b/mongodb-mixin/dashboards_out/MongoDB_Cluster.json @@ -88,7 +88,7 @@ "refId": "A" } ], - "title": "Number of Collections in Shards", + "title": "Number of collections in shards", "transformations": [ { "id": "filterFieldsByName", @@ -160,7 +160,7 @@ "refId": "A" } ], - "title": "Size of Collections in Shards", + "title": "Size of collections in shards", "transformations": [ { "id": "labelsToFields", @@ -239,7 +239,7 @@ "refId": "A" } ], - "title": "# of Shards", + "title": "# of shards", "type": "stat" }, { @@ -416,7 +416,7 @@ "refId": "A" } ], - "title": "Draining Shards", + "title": "Draining shards", "type": "stat" }, { @@ -475,7 +475,7 @@ "refId": "A" } ], - "title": "Sharded Collections", + "title": "Sharded collections", "type": "stat" }, { @@ -589,11 +589,11 @@ "exemplar": true, "expr": "max(mongodb_mongos_sharding_balancer_enabled{job=~\"$job\",mongodb_cluster=\"$cluster\"})", "interval": "", - "legendFormat": "Cluster Balanced", + "legendFormat": "Cluster balanced", "refId": "A" } ], - "title": "Balancer Enabled", + "title": "Balancer enabled", "type": "stat" }, { @@ -648,11 +648,11 @@ "exemplar": true, "expr": "min(mongodb_mongos_sharding_chunks_is_balanced{job=~\"$job\",mongodb_cluster=\"$cluster\"})", "interval": "", - "legendFormat": "Cluster Balanced", + "legendFormat": "Cluster balanced", "refId": "A" } ], - "title": "Chuncks Balanced", + "title": "Chunks balanced", "type": "stat" }, { @@ -707,7 +707,7 @@ "refId": "A" } ], - "title": "Shard Services QPS - $shard", + "title": "Shard services QPS - $shard", "type": "gauge" }, { @@ -760,7 +760,7 @@ "refId": "A" } ], - "title": "Config Services QPS - ", + "title": "Config services QPS - ", "type": 
"gauge" }, { @@ -813,7 +813,7 @@ "refId": "A" } ], - "title": "Mongos Services QPS", + "title": "Mongos services QPS", "type": "gauge" }, { @@ -827,7 +827,7 @@ }, "id": 20, "panels": [ ], - "title": "Chunks in Shards", + "title": "Chunks in shards", "type": "row" }, { @@ -879,7 +879,7 @@ "refId": "A" } ], - "title": "# of Chunks in Shards", + "title": "# of chunks in shards", "transformations": [ { "id": "labelsToFields", @@ -987,7 +987,7 @@ ], "timeFrom": null, "timeShift": null, - "title": "Dynamic of Chunks", + "title": "Dynamic of chunks", "type": "timeseries" }, { @@ -1073,7 +1073,7 @@ "refId": "A" } ], - "title": "Chunk Split Events", + "title": "Chunk split events", "type": "timeseries" }, { @@ -1159,7 +1159,7 @@ "refId": "A" } ], - "title": "Chunk Move Events", + "title": "Chunk move events", "type": "timeseries" }, { @@ -1173,7 +1173,7 @@ }, "id": 29, "panels": [ ], - "title": "Indexes in Shards", + "title": "Indexes in shards", "type": "row" }, { @@ -1225,7 +1225,7 @@ "refId": "A" } ], - "title": "# Indexes per Shard", + "title": "# Indexes per shard", "transformations": [ { "id": "labelsToFields", @@ -1334,7 +1334,7 @@ ], "timeFrom": null, "timeShift": null, - "title": "Dynamic of Indexes", + "title": "Dynamic of indexes", "type": "timeseries" }, { @@ -1387,7 +1387,7 @@ "refId": "A" } ], - "title": "Size of Indexes per Shard", + "title": "Size of indexes per shard", "transformations": [ { "id": "labelsToFields", @@ -1496,7 +1496,7 @@ ], "timeFrom": null, "timeShift": null, - "title": "Dynamic of Indexes Size", + "title": "Dynamic of indexes size", "type": "timeseries" }, { @@ -1591,7 +1591,7 @@ "refId": "A" } ], - "title": "Current Connections per Instance", + "title": "Current connections per instance", "type": "timeseries" }, { @@ -1672,7 +1672,7 @@ "refId": "A" } ], - "title": "Current Connections per Instance", + "title": "Current connections per instance", "type": "timeseries" }, { @@ -1753,7 +1753,7 @@ "refId": "A" } ], - "title": "Available 
Connections per Instance", + "title": "Available connections per instance", "type": "timeseries" }, { @@ -1834,7 +1834,7 @@ "refId": "A" } ], - "title": "Current Connections per Shard", + "title": "Current connections per shard", "type": "timeseries" }, { @@ -1929,7 +1929,7 @@ "refId": "A" } ], - "title": "Operattions Per Shard", + "title": "Operations per shard", "type": "timeseries" }, { @@ -2010,7 +2010,7 @@ "refId": "A" } ], - "title": "Operations By Type", + "title": "Operations by type", "type": "timeseries" }, { @@ -2192,7 +2192,7 @@ "refId": "A" } ], - "title": "Cursors Per Shard", + "title": "Cursors per shard", "type": "timeseries" }, { @@ -2273,7 +2273,7 @@ "refId": "A" } ], - "title": "Mongos Cursors", + "title": "Mongos cursors", "type": "timeseries" }, { @@ -2354,7 +2354,7 @@ "refId": "A" } ], - "title": "Cursors By Instance", + "title": "Cursors by instance", "type": "timeseries" }, { @@ -2368,7 +2368,7 @@ }, "id": 47, "panels": [ ], - "title": "Additional Info", + "title": "Additional info", "type": "row" }, { @@ -2449,7 +2449,7 @@ "refId": "A" } ], - "title": "Replication Lag by Set", + "title": "Replication lag by set", "type": "timeseries" }, { @@ -2530,7 +2530,7 @@ "refId": "A" } ], - "title": "Replication Lag by Set", + "title": "Replication lag by set", "type": "timeseries" } ], @@ -2663,7 +2663,7 @@ }, "timepicker": { }, "timezone": "", - "title": "MongoDB Cluster", + "title": "MongoDB cluster", "uid": "nU5ylgi7z", "version": 2 } \ No newline at end of file diff --git a/mongodb-mixin/dashboards_out/MongoDB_Instance.json b/mongodb-mixin/dashboards_out/MongoDB_Instance.json index 937cc7fe4..8ebdd30e1 100644 --- a/mongodb-mixin/dashboards_out/MongoDB_Instance.json +++ b/mongodb-mixin/dashboards_out/MongoDB_Instance.json @@ -217,7 +217,7 @@ "refId": "A" } ], - "title": "ReplSet", + "title": "Replica set", "transformations": [ { "id": "labelsToFields", @@ -396,7 +396,7 @@ "refId": "A" } ], - "title": "Current ReplSet State", + "title": "Current 
replica set state", "transformations": [ ], "type": "stat" }, @@ -411,7 +411,7 @@ }, "id": 4, "panels": [ ], - "title": "Service Summary", + "title": "Service summary", "type": "row" }, { @@ -513,7 +513,7 @@ "refId": "C" } ], - "title": "Command Operations", + "title": "Command operations", "type": "timeseries" }, { @@ -685,7 +685,7 @@ "refId": "A" } ], - "title": "Document Operations", + "title": "Document operations", "type": "timeseries" }, { @@ -771,7 +771,7 @@ "refId": "A" } ], - "title": "Latency Detail", + "title": "Latency detail", "type": "timeseries" }, { @@ -857,7 +857,7 @@ "refId": "A" } ], - "title": "Queued Operations", + "title": "Queued operations", "type": "timeseries" }, { @@ -1037,7 +1037,7 @@ "refId": "B" } ], - "title": "Scanned and Moved Objects", + "title": "Scanned and moved objects", "type": "timeseries" }, { @@ -1124,7 +1124,7 @@ "refId": "A" } ], - "title": "Assert Events", + "title": "Assert events", "type": "timeseries" }, { @@ -1218,7 +1218,7 @@ "refId": "B" } ], - "title": "getLastError Write Operations", + "title": "getLastError write operations", "type": "timeseries" }, { @@ -1312,7 +1312,7 @@ "refId": "B" } ], - "title": "Query Efficiency", + "title": "Query efficiency", "type": "timeseries" }, { @@ -1391,11 +1391,11 @@ "exemplar": true, "expr": "avg by (service_name) (irate(mongodb_mongod_metrics_get_last_error_wtime_total_milliseconds{service_name=~\"$service_name\", mongodb_cluster=\"$cluster\",job=~\"$job\"}[$__rate_interval]) or irate(mongodb_mongos_metrics_get_last_error_wtime_total_milliseconds{service_name=~\"$service_name\", mongodb_cluster=\"$cluster\",job=~\"$job\"}[$__rate_interval]))", "interval": "", - "legendFormat": "Write Wait Time", + "legendFormat": "Write wait time", "refId": "A" } ], - "title": "getLastError Write Time", + "title": "getLastError write time", "type": "timeseries" }, { @@ -1481,7 +1481,7 @@ "refId": "A" } ], - "title": "Page Faults", + "title": "Page faults", "type": "timeseries" } ], @@ -1586,7 
+1586,7 @@ }, "timepicker": { }, "timezone": "", - "title": "MongoDB Instance", + "title": "MongoDB instance", "uid": "68bDO-m7k", "version": 3 } \ No newline at end of file diff --git a/mongodb-mixin/dashboards_out/MongoDB_ReplicaSet.json b/mongodb-mixin/dashboards_out/MongoDB_ReplicaSet.json index 5aae2f226..aa2c08621 100644 --- a/mongodb-mixin/dashboards_out/MongoDB_ReplicaSet.json +++ b/mongodb-mixin/dashboards_out/MongoDB_ReplicaSet.json @@ -116,7 +116,7 @@ "refId": "A" } ], - "title": "ReplSet Members", + "title": "Replica set members", "type": "stat" }, { @@ -177,7 +177,7 @@ "refId": "A" } ], - "title": "ReplSet Last Election", + "title": "Replica set last election", "type": "stat" }, { @@ -238,7 +238,7 @@ "refId": "A" } ], - "title": "Avg ReplSet Lag", + "title": "Average replica set lag", "type": "stat" }, { @@ -304,7 +304,7 @@ "refId": "A" } ], - "title": "MongoDB Versions", + "title": "MongoDB versions", "transformations": [ { "id": "filterFieldsByName", @@ -345,7 +345,7 @@ "text": "STARTUP" }, "1": { - "color": "green", + "color": "semi-dark-blue", "index": 0, "text": "PRIMARY" }, @@ -355,7 +355,7 @@ "text": "REMOVED" }, "2": { - "color": "yellow", + "color": "light-blue", "index": 1, "text": "SECONDARY" }, @@ -375,7 +375,7 @@ "text": "UNKNOWN" }, "7": { - "color": "red", + "color": "dark-blue", "index": 6, "text": "ARBITER" }, @@ -466,7 +466,7 @@ "refId": "A" } ], - "title": "ReplSet States", + "title": "Replica set states", "transformations": [ { "id": "labelsToFields", @@ -914,7 +914,7 @@ "refId": "A" } ], - "title": "Max Heartbeat Time", + "title": "Max heartbeat time", "type": "timeseries" }, { @@ -1008,7 +1008,7 @@ "refId": "A" } ], - "title": "Max Member Ping Time - $service_name", + "title": "Max member ping time - $service_name", "type": "timeseries" }, { @@ -1034,7 +1034,7 @@ "refId": "A" } ], - "title": "Oplog Details", + "title": "Oplog details", "type": "row" }, { @@ -1128,7 +1128,7 @@ "refId": "A" } ], - "title": "Oplog Buffered 
Operations", + "title": "Oplog buffered operations", "type": "timeseries" }, { @@ -1222,7 +1222,7 @@ "refId": "A" } ], - "title": "Oplog Getmore Time", + "title": "Oplog getmore time", "type": "timeseries" }, { @@ -1313,7 +1313,7 @@ "expr": "time()-avg by (service_name) (mongodb_mongod_replset_oplog_tail_timestamp{service_name=~\"$service_name\", mongodb_cluster=\"$cluster\",job=~\"$job\"})", "hide": false, "interval": "", - "legendFormat": "Now to End", + "legendFormat": "Now to end", "refId": "A" }, { @@ -1324,11 +1324,11 @@ "expr": "avg by (service_name) (mongodb_mongod_replset_oplog_head_timestamp{service_name=~\"$service_name\", mongodb_cluster=\"$cluster\",job=~\"$job\"}-mongodb_mongod_replset_oplog_tail_timestamp{service_name=~\"$service_name\", mongodb_cluster=\"$cluster\",job=~\"$job\"})", "hide": false, "interval": "", - "legendFormat": "Oplog Range", + "legendFormat": "Oplog range", "refId": "B" } ], - "title": "Oplog Recovery Window - $service_name", + "title": "Oplog recovery window - $service_name", "type": "timeseries" }, { @@ -1419,7 +1419,7 @@ "expr": "avg by (service_name) (irate(mongodb_mongod_metrics_repl_preload_docs_total_milliseconds{service_name=~\"$service_name\", mongodb_cluster=\"$cluster\",job=~\"$job\"}[$__rate_interval]))", "hide": false, "interval": "", - "legendFormat": "Document Preload", + "legendFormat": "Document preload", "refId": "A" }, { @@ -1430,7 +1430,7 @@ "expr": "avg by (service_name) (irate(mongodb_mongod_metrics_repl_preload_indexes_total_milliseconds{service_name=~\"$service_name\", mongodb_cluster=\"$cluster\",job=~\"$job\"}[$__rate_interval]))", "hide": false, "interval": "", - "legendFormat": "Index Preload", + "legendFormat": "Index preload", "refId": "B" }, { @@ -1441,11 +1441,11 @@ "expr": "avg by (service_name) (irate(mongodb_mongod_metrics_repl_apply_batches_total_milliseconds{service_name=~\"$service_name\", mongodb_cluster=\"$cluster\",job=~\"$job\"}[$__rate_interval]))", "hide": false, "interval": "", - 
"legendFormat": "Batch Apply", + "legendFormat": "Batch apply", "refId": "C" } ], - "title": "Oplog Processing Time - $service_name", + "title": "Oplog processing time - $service_name", "type": "timeseries" }, { @@ -1536,7 +1536,7 @@ "expr": "avg by (service_name) (irate(mongodb_mongod_metrics_repl_preload_docs_num_total{service_name=~\"$service_name\", mongodb_cluster=\"$cluster\",job=~\"$job\"}[$__rate_interval]))", "hide": false, "interval": "", - "legendFormat": "Document Preload", + "legendFormat": "Document preload", "refId": "A" }, { @@ -1547,7 +1547,7 @@ "expr": "avg by (service_name) (irate(mongodb_mongod_metrics_repl_preload_indexes_num_total{service_name=~\"$service_name\", mongodb_cluster=\"$cluster\",job=~\"$job\"}[$__rate_interval]))", "hide": false, "interval": "", - "legendFormat": "Index Preload", + "legendFormat": "Index preload", "refId": "B" }, { @@ -1558,11 +1558,11 @@ "expr": "avg by (service_name) (irate(mongodb_mongod_metrics_repl_apply_ops_total{service_name=~\"$service_name\", mongodb_cluster=\"$cluster\",job=~\"$job\"}[$__rate_interval]))", "hide": false, "interval": "", - "legendFormat": "Batch Apply", + "legendFormat": "Batch apply", "refId": "C" } ], - "title": "Oplog Operations - $service_name", + "title": "Oplog operations - $service_name", "type": "timeseries" } ], @@ -1715,7 +1715,7 @@ }, "timepicker": { }, "timezone": "", - "title": "MongoDB ReplicaSet", + "title": "MongoDB replica set", "uid": "U5CBoam7z", "version": 3, "weekStart": "" diff --git a/mongodb-mixin/jsonnetfile.json b/mongodb-mixin/jsonnetfile.json new file mode 100644 index 000000000..9d21470f3 --- /dev/null +++ b/mongodb-mixin/jsonnetfile.json @@ -0,0 +1,15 @@ +{ + "version": 1, + "dependencies": [ + { + "source": { + "git": { + "remote": "https://github.com/jsonnet-libs/xtd.git", + "subdir": "" + } + }, + "version": "master" + } + ], + "legacyImports": true +} \ No newline at end of file diff --git a/mongodb-mixin/mixin.libsonnet b/mongodb-mixin/mixin.libsonnet 
index 88a412fc0..bf3c4f430 100644 --- a/mongodb-mixin/mixin.libsonnet +++ b/mongodb-mixin/mixin.libsonnet @@ -4,13 +4,7 @@ 'MongoDB_ReplicaSet.json': (import 'dashboards/MongoDB_ReplicaSet.json'), 'MongoDB_Cluster.json': (import 'dashboards/MongoDB_Cluster.json'), }, - - // Helper function to ensure that we don't override other rules, by forcing - // the patching of the groups list, and not the overall rules object. - local importRules(rules) = { - groups+: std.parseYaml(rules).groups, - }, - - prometheusAlerts+: - importRules(importstr 'alerts/mongodbAlerts.yaml'), } + ++ (import './alerts/mongodbAlerts.libsonnet') ++ (import './config.libsonnet') diff --git a/mongodb-mixin/prometheus_rules_out/prometheus_alerts.yaml b/mongodb-mixin/prometheus_rules_out/prometheus_alerts.yaml index 4d0a23734..fe04a4324 100644 --- a/mongodb-mixin/prometheus_rules_out/prometheus_alerts.yaml +++ b/mongodb-mixin/prometheus_rules_out/prometheus_alerts.yaml @@ -3,107 +3,80 @@ groups: rules: - alert: MongodbDown annotations: - description: |- - MongoDB instance is down - VALUE = {{ $value }} - LABELS = {{ $labels }} - summary: MongoDB Instance is Down. - expr: mongodb_up == 0 + description: MongoDB instance {{ $labels.service_name }} is down. + summary: MongoDB instance is down. + expr: mongodb_up{} == 0 for: 5m labels: severity: critical - alert: MongodbReplicaMemberUnhealthy annotations: - description: Mongodb replica member unhealthy (instance {{ $labels.instance }}) - summary: MongoDB replica member unhealthy. - expr: mongodb_mongod_replset_member_health == 0 + description: MongoDB replica member is unhealthy (instance {{ $labels.service_name }}). + summary: MongoDB replica member is unhealthy. 
+ expr: mongodb_mongod_replset_member_health{} == 0 labels: severity: critical - alert: MongodbReplicationLag annotations: - description: |- - Mongodb replication lag is more than 10s - VALUE = {{ $value }} - LABELS = {{ $labels }} + description: MongoDB replication lag is more than 60s (instance {{ $labels.service_name }}) summary: MongoDB replication lag is exceeding the threshold. - expr: mongodb_mongod_replset_member_replication_lag{state="SECONDARY"} > 60 + expr: mongodb_mongod_replset_member_replication_lag{state="SECONDARY", } > 60 for: 5m labels: severity: critical - alert: MongodbReplicationHeadroom annotations: - description: |- - MongoDB replication headroom is <= 0 - VALUE = {{ $value }} - LABELS = {{ $labels }} + description: MongoDB replication headroom is <= 0 for {{ $labels.mongodb_cluster }}. summary: MongoDB replication headroom is exceeding the threshold. - expr: (avg(mongodb_mongod_replset_oplog_tail_timestamp - mongodb_mongod_replset_oplog_head_timestamp) - (avg(mongodb_mongod_replset_member_optime_date{state="PRIMARY"}) - avg(mongodb_mongod_replset_member_optime_date{state="SECONDARY"}))) <= 0 + expr: (avg by (job,mongodb_cluster) (mongodb_mongod_replset_oplog_tail_timestamp{} - mongodb_mongod_replset_oplog_head_timestamp{}) - (avg by (job,mongodb_cluster) (mongodb_mongod_replset_member_optime_date{state="PRIMARY"}) - avg by (job,mongodb_cluster) (mongodb_mongod_replset_member_optime_date{state="SECONDARY",}))) <= 0 for: 5m labels: severity: critical - alert: MongodbNumberCursorsOpen annotations: - description: |- - Too many cursors opened by MongoDB for clients (> 10k) - VALUE = {{ $value }} - LABELS = {{ $labels }} - summary: MongoDB number cursors open too high. - expr: mongodb_mongod_metrics_cursor_open{state="total"} > 10 * 1000 + description: Too many cursors opened by MongoDB for clients (> 10k) on {{ $labels.service_name }}. + summary: MongoDB number of cursors open too high.
+ expr: mongodb_mongod_metrics_cursor_open{state="total", } > 10 * 1000 for: 2m labels: severity: warning - alert: MongodbCursorsTimeouts annotations: - description: |- - Too many cursors are timing out - VALUE = {{ $value }} - LABELS = {{ $labels }} - summary: MongoDB cursors timeouts is exceeding the threshold. - expr: increase(mongodb_mongod_metrics_cursor_timed_out_total[1m]) > 100 + description: Too many cursors are timing out on {{ $labels.service_name }}. + summary: MongoDB cursors timeouts are exceeding the threshold. + expr: increase(mongodb_mongod_metrics_cursor_timed_out_total{}[1m]) > 100 for: 2m labels: severity: warning - alert: MongodbTooManyConnections annotations: - description: |- - Too many connections (> 80%) - VALUE = {{ $value }} - LABELS = {{ $labels }} - summary: MongoDB too many connections. - expr: avg by(instance) (rate(mongodb_connections{state="current"}[1m])) / avg by(instance) (sum (mongodb_connections) by (instance)) * 100 > 80 + description: Too many connections to MongoDB instance {{ $labels.service_name }} (> 80%). + summary: MongoDB has too many connections. + expr: avg by (job,mongodb_cluster,service_name) (rate(mongodb_connections{state="current",}[1m])) / avg by (job,mongodb_cluster,service_name) (sum (mongodb_connections) by (job,mongodb_cluster,service_name)) * 100 > 80 for: 2m labels: severity: warning - alert: MongodbVirtualMemoryUsage annotations: - description: |- - High memory usage - VALUE = {{ $value }} - LABELS = {{ $labels }} - summary: MongoDB virtual memory usage is too high. - expr: (sum(mongodb_memory{type="virtual"}) BY (instance) / sum(mongodb_memory{type="mapped"}) BY (instance)) > 3 - for: 2m + description: MongoDB virtual memory usage is too high on {{ $labels.service_name }}. + summary: MongoDB high memory usage. 
+ expr: (sum(mongodb_memory{type="virtual",}) by (job,mongodb_cluster,service_name) / sum(mongodb_memory{type="mapped",}) by (job,mongodb_cluster,service_name)) > 3 + for: 5m labels: severity: warning - alert: MongodbReadRequestsQueueingUp annotations: - description: |- - MongoDB requests are queuing up - VALUE = {{ $value }} - LABELS = {{ $labels }} - summary: MongoDB read requests queuing up. - expr: delta(mongodb_mongod_global_lock_current_queue{type="reader"}[1m]) > 0 + description: MongoDB read requests are queuing up on {{ $labels.service_name }}. + summary: MongoDB read requests are queuing up. + expr: delta(mongodb_mongod_global_lock_current_queue{type="reader",}[1m]) > 0 for: 5m labels: severity: warning - alert: MongodbWriteRequestsQueueingUp annotations: - description: |- - MongoDB write requests are queueing up - VALUE = {{ $value }} - LABELS = {{ $labels }} - summary: MongoDB write requests queuing up. - expr: delta(mongodb_mongod_global_lock_current_queue{type="writer"}[1m]) > 0 + description: MongoDB write requests are queueing up on {{ $labels.service_name }}. + summary: MongoDB write requests are queueing up. + expr: delta(mongodb_mongod_global_lock_current_queue{type="writer",}[1m]) > 0 for: 5m labels: severity: warning