Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -1,14 +1,14 @@
{
prometheusAlerts+:: {
groups+: [
new(this): {
groups: [
{
name: 'RedisEnterpriseAlerts',
rules: [
{
alert: 'RedisEnterpriseClusterOutOfMemory',
expr: |||
sum(redis_used_memory) by (redis_cluster, node) / sum(node_available_memory) by (redis_cluster, node) * 100 > %(alertsClusterOutOfMemoryThreshold)s
||| % $._config,
sum(redis_used_memory{%(filteringSelector)s}) by (redis_cluster, node) / sum(node_available_memory{%(filteringSelector)s}) by (redis_cluster, node) * 100 > %(alertsClusterOutOfMemoryThreshold)s
||| % this.config,
'for': '5m',
labels: {
severity: 'critical',
Expand All @@ -19,14 +19,14 @@
(
'Memory usage is at {{ printf "%%.0f" $value }} percent on the cluster {{$labels.redis_cluster}}, ' +
"which is above the configured threshold of %(alertsClusterOutOfMemoryThreshold)s%% of the cluster's available memory"
) % $._config,
) % this.config,
},
},
{
alert: 'RedisEnterpriseNodeNotResponding',
expr: |||
node_up == 0
||| % $._config,
node_up{%(filteringSelector)s} == 0
||| % this.config,
'for': '5m',
labels: {
severity: 'critical',
Expand All @@ -36,14 +36,14 @@
description:
(
'The node {{$labels.node}} in {{$labels.redis_cluster}} is offline or unreachable.'
) % $._config,
) % this.config,
},
},
{
alert: 'RedisEnterpriseDatabaseNotResponding',
expr: |||
bdb_up == 0
||| % $._config,
bdb_up{%(filteringSelector)s} == 0
||| % this.config,
'for': '5m',
labels: {
severity: 'critical',
Expand All @@ -53,14 +53,14 @@
description:
(
'The database {{$labels.bdb}} in {{$labels.redis_cluster}} is offline or unreachable.'
) % $._config,
) % this.config,
},
},
{
alert: 'RedisEnterpriseShardNotResponding',
expr: |||
redis_up == 0
||| % $._config,
redis_up{%(filteringSelector)s} == 0
||| % this.config,
'for': '5m',
labels: {
severity: 'critical',
Expand All @@ -70,14 +70,14 @@
description:
(
'The shard {{$labels.redis}} on database {{$labels.bdb}} running on node {{$labels.node}} in the cluster {{$labels.redis_cluster}} is offline or unreachable.'
) % $._config,
) % this.config,
},
},
{
alert: 'RedisEnterpriseNodeHighCPUUtilization',
expr: |||
(sum(node_cpu_user) by (node, redis_cluster, job) + sum(node_cpu_system) by (node, redis_cluster, job)) * 100 > %(alertsNodeCPUHighUtilizationThreshold)s
||| % $._config,
(sum(node_cpu_user{%(filteringSelector)s}) by (node, redis_cluster, job) + sum(node_cpu_system{%(filteringSelector)s}) by (node, redis_cluster, job)) * 100 > %(alertsNodeCPUHighUtilizationThreshold)s
||| % this.config,
'for': '5m',
labels: {
severity: 'warning',
Expand All @@ -88,14 +88,14 @@
(
'The node {{$labels.node}} in cluster {{$labels.redis_cluster}} has a CPU percentage of ${{ printf "%%.0f" $value }}, which exceeds ' +
'the threshold %(alertsNodeCPUHighUtilizationThreshold)s%%.'
) % $._config,
) % this.config,
},
},
{
alert: 'RedisEnterpriseDatabaseHighMemoryUtilization',
alert: 'RedisEnterpriseHighMemUtilization',
expr: |||
sum(bdb_used_memory) by (bdb, redis_cluster) / sum(bdb_memory_limit) by (bdb, redis_cluster) * 100 > %(alertsDatabaseHighMemoryUtiliation)s
||| % $._config,
sum(bdb_used_memory{%(filteringSelector)s}) by (bdb, redis_cluster) / sum(bdb_memory_limit{%(filteringSelector)s}) by (bdb, redis_cluster) * 100 > %(alertsDatabaseHighMemoryUtilization)s
||| % this.config,
'for': '5m',
labels: {
severity: 'warning',
Expand All @@ -104,16 +104,16 @@
summary: 'Node memory utilization is above the configured threshold.',
description:
(
'The database {{$labels.bdb}} in cluster {{$labels.redis_cluster}} has a memory utiliztaion of ${{ printf "%%.0f" $value }}, which exceeds ' +
'the threshold %(alertsDatabaseHighMemoryUtiliation)s%%.'
) % $._config,
'The database {{$labels.bdb}} in cluster {{$labels.redis_cluster}} has a memory utilization of ${{ printf "%%.0f" $value }}, which exceeds ' +
'the threshold %(alertsDatabaseHighMemoryUtilization)s%%.'
) % this.config,
},
},
{
alert: 'RedisEnterpriseAverageLatencyIncreasing',
expr: |||
bdb_avg_latency / 1000 > %(alertsDatabaseHighLatencyMs)s
||| % $._config,
bdb_avg_latency{%(filteringSelector)s} / 1000 > %(alertsDatabaseHighLatencyMs)s
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If we divide by 1000 that would convert the value to seconds, but the alert threshold is in milliseconds - alertsDatabaseHighLatencyMs.

||| % this.config,
'for': '5m',
labels: {
severity: 'warning',
Expand All @@ -124,14 +124,14 @@
(
'The database {{$labels.bdb}} in cluster {{$labels.redis_cluster}} has high latency of ${{ printf "%%.0f" $value }}, which exceeds ' +
'the threshold of %(alertsDatabaseHighLatencyMs)s ms.'
) % $._config,
) % this.config,
},
},
{
alert: 'RedisEnterpriseKeyEvictionsIncreasing',
expr: |||
bdb_evicted_objects >= %(alertsEvictedObjectsThreshold)s
||| % $._config,
bdb_evicted_objects{%(filteringSelector)s} >= %(alertsEvictedObjectsThreshold)s
||| % this.config,
'for': '5m',
labels: {
severity: 'warning',
Expand All @@ -142,7 +142,7 @@
(
'The database {{$labels.bdb}} in cluster {{$labels.redis_cluster}} is evicting ${{ printf "%%.0f" $value }} objects, which exceeds ' +
'the threshold of %(alertsEvictedObjectsThreshold)s evicted objects.'
) % $._config,
) % this.config,
},
},
],
Expand Down
40 changes: 28 additions & 12 deletions redis-enterprise-mixin/config.libsonnet
Original file line number Diff line number Diff line change
@@ -1,17 +1,33 @@
{
_config+:: {
dashboardTags: ['redis-enterprise-mixin'],
dashboardPeriod: 'now-1h',
dashboardTimezone: 'default',
dashboardRefresh: '1m',
local this = self,
filteringSelector: 'job="integrations/redis-enterprise"',
groupLabels: ['job', 'redis_cluster'],
instanceLabels: ['instance'],
nodeLabels: ['node'],
databaseLabels: ['bdb'],
uid: 'redis-enterprise',
dashboardNamePrefix: 'Redis Enterprise',
dashboardTags: [self.uid + '-mixin'],
dashboardPeriod: 'now-1h',
dashboardTimezone: 'default',
dashboardRefresh: '1m',

//alert thresholds
alertsClusterOutOfMemoryThreshold: 80, // %
alertsNodeCPUHighUtilizationThreshold: 80, // %
alertsDatabaseHighMemoryUtiliation: 80, // %
alertsDatabaseHighLatencyMs: 1000, // ms
alertsEvictedObjectsThreshold: 1,
// Alert thresholds
alertsClusterOutOfMemoryThreshold: 80, // %
alertsNodeCPUHighUtilizationThreshold: 80, // %
alertsDatabaseHighMemoryUtilization: 80, // %
alertsDatabaseHighLatencyMs: 1000, // ms
alertsEvictedObjectsThreshold: 1,

enableLokiLogs: true,
enableLokiLogs: true,
extraLogLabels: [],
showLogsVolume: true,

// Metrics source for signals
metricsSource: 'prometheus',
signals: {
overview: (import './signals/overview.libsonnet')(this),
nodes: (import './signals/nodes.libsonnet')(this),
databases: (import './signals/databases.libsonnet')(this),
},
}
148 changes: 148 additions & 0 deletions redis-enterprise-mixin/dashboards.libsonnet
Original file line number Diff line number Diff line change
@@ -0,0 +1,148 @@
local g = import './g.libsonnet';
local commonlib = import 'common-lib/common/main.libsonnet';
local util = import 'common-lib/common/util.libsonnet';
local logslib = import 'logs-lib/logs/main.libsonnet';


{
local root = self,
new(this)::
local prefix = this.config.dashboardNamePrefix;
local links = this.grafana.links;
local tags = this.config.dashboardTags;
local uid = g.util.string.slugify(this.config.uid);
local vars = this.grafana.variables;
local annotations = this.grafana.annotations;
local refresh = this.config.dashboardRefresh;
local period = this.config.dashboardPeriod;
local timezone = this.config.dashboardTimezone;

// One multi-select query variable (with an "All" option) per entry in
// this.config.nodeLabels. Each variable lists that label's values on the
// `node_up` metric, read through the Prometheus datasource variable.
// All option helpers come from the `query` variable namespace, matching the
// variable type that is actually being built.
local nodeVars = std.map(
  function(label)
    g.dashboard.variable.query.new(label)
    + g.dashboard.variable.query.selectionOptions.withMulti(true)
    + g.dashboard.variable.query.queryTypes.withLabelValues(label, metric='node_up')
    + g.dashboard.variable.query.withDatasourceFromVariable(vars.datasources.prometheus)
    + g.dashboard.variable.query.selectionOptions.withIncludeAll(true),
  this.config.nodeLabels
);

// One multi-select query variable (with an "All" option) per entry in
// this.config.databaseLabels. Each variable lists that label's values on the
// `bdb_up` metric, read through the Prometheus datasource variable.
// All option helpers come from the `query` variable namespace, matching the
// variable type that is actually being built.
local databaseVars = std.map(
  function(label)
    g.dashboard.variable.query.new(label)
    + g.dashboard.variable.query.selectionOptions.withMulti(true)
    + g.dashboard.variable.query.selectionOptions.withIncludeAll(true)
    + g.dashboard.variable.query.queryTypes.withLabelValues(label, metric='bdb_up')
    + g.dashboard.variable.query.withDatasourceFromVariable(vars.datasources.prometheus),
  this.config.databaseLabels
);


{
'redis-enterprise-overview.json':
g.dashboard.new(prefix + ' overview')
+ g.dashboard.withPanels(
g.util.panel.resolveCollapsedFlagOnRows(
g.util.grid.wrapPanels(
[
this.grafana.rows.overviewRow,
this.grafana.rows.overviewNodesKPIsRow,
this.grafana.rows.overviewDatabaseKPIsRow,
]
)
)
) + root.applyCommon(
vars.multiInstance,
uid + '_overview',
tags,
links { redisEnterpriseOverview:: {} },
annotations,
timezone,
refresh,
period,
),
'redis-enterprise-node-overview.json':
g.dashboard.new(prefix + ' nodes')
+ g.dashboard.withPanels(
g.util.panel.resolveCollapsedFlagOnRows(
g.util.grid.wrapPanels(
[
this.grafana.rows.nodesOverviewRow,
this.grafana.rows.nodesMetricsRow,
]
)
)
) + root.applyCommon(
vars.multiInstance + nodeVars,
uid + '_nodes',
tags,
links { redisEnterpriseNodes:: {} },
annotations,
timezone,
refresh,
period,
),
// Key ends in '.json' like the sibling dashboard entries in this object.
'redis-enterprise-database-overview.json':
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Typo? .jsonn --> .json

g.dashboard.new(prefix + ' databases')
+ g.dashboard.withPanels(
g.util.panel.resolveCollapsedFlagOnRows(
g.util.grid.wrapPanels(
[
this.grafana.rows.databasesOverviewRow,
this.grafana.rows.databasesMetricsRow,
this.grafana.rows.databasesCRDBRow,
]
),
)
) + root.applyCommon(
vars.multiInstance + nodeVars + databaseVars,
uid + '_databases',
tags,
links { redisEnterpriseDatabases:: {} },
annotations,
timezone,
refresh,
period,
),
} + if this.config.enableLokiLogs then {
'redis-enterprise-logs.json':
logslib.new(
prefix + ' logs',
datasourceName=this.grafana.variables.datasources.loki.name,
datasourceRegex=this.grafana.variables.datasources.loki.regex,
filterSelector=this.config.filteringSelector,
labels=this.config.groupLabels + this.config.extraLogLabels,
formatParser=null,
showLogsVolume=this.config.showLogsVolume,
)
{
dashboards+:
{
logs+:
root.applyCommon(super.logs.templating.list, uid=uid + '-logs', tags=tags, links=links { redisEnterpriseLogs+:: {} }, annotations=annotations, timezone=timezone, refresh=refresh, period=period),
},
panels+:
{
logs+:
g.panel.logs.options.withEnableLogDetails(true)
+ g.panel.logs.options.withShowTime(false)
+ g.panel.logs.options.withWrapLogMessage(false),
},
variables+: {
toArray+: [
this.grafana.variables.datasources.prometheus { hide: 2 },
],
},
}.dashboards.logs,
}
else {},

// Builds the dashboard mixin applied to every dashboard in this file:
// tags, uid, links, timezone, refresh interval, time range start, template
// variables and annotations. `links` and `annotations` are objects; only
// their visible field values are passed on (std.objectValues skips hidden
// fields).
applyCommon(vars, uid, tags, links, annotations, timezone, refresh, period):
  local dash = g.dashboard;
  local linkList = std.objectValues(links);
  local annotationList = std.objectValues(annotations);
  dash.withTags(tags)
  + dash.withUid(uid)
  + dash.withLinks(linkList)
  + dash.withTimezone(timezone)
  + dash.withRefresh(refresh)
  + dash.time.withFrom(period)
  + dash.withVariables(vars)
  + dash.withAnnotations(annotationList),
}
3 changes: 0 additions & 3 deletions redis-enterprise-mixin/dashboards/dashboards.libsonnet

This file was deleted.

Loading
Loading