Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -1,14 +1,14 @@
{
prometheusAlerts+:: {
groups+: [
new(this): {
groups: [
{
name: 'apache-hbase-alerts',
rules: [
{
alert: 'HBaseHighHeapMemUsage',
expr: |||
100 * sum without(context, hostname, processname) (jvm_metrics_mem_heap_used_m{%(filterSelector)s} / clamp_min(jvm_metrics_mem_heap_committed_m{%(filterSelector)s}, 1)) > %(alertsHighHeapMemUsage)s
||| % $._config,
100 * sum without(context, hostname, processname) (jvm_metrics_mem_heap_used_m{%(filteringSelector)s} / clamp_min(jvm_metrics_mem_heap_committed_m{%(filteringSelector)s}, 1)) > %(alertsHighHeapMemUsage)s
||| % this.config,
'for': '5m',
labels: {
severity: 'warning',
Expand All @@ -18,14 +18,14 @@
description:
(
'The heap memory usage for the JVM on instance {{$labels.instance}} in cluster {{$labels.hbase_cluster}} is {{printf "%%.0f" $value}} percent, which is above the threshold of %(alertsHighHeapMemUsage)s percent'
) % $._config,
) % this.config,
},
},
{
alert: 'HBaseDeadRegionServer',
expr: |||
server_num_dead_region_servers > %(alertsDeadRegionServer)s
||| % $._config,
||| % this.config,
'for': '5m',
labels: {
severity: 'warning',
Expand All @@ -35,14 +35,14 @@
description:
(
'{{$value}} RegionServer(s) in cluster {{$labels.hbase_cluster}} are unresponsive, which is above the threshold of %(alertsDeadRegionServer)s. The name(s) of the dead RegionServer(s) are {{$labels.deadregionservers}}'
) % $._config,
) % this.config,
},
},
{
alert: 'HBaseOldRegionsInTransition',
expr: |||
100 * assignment_manager_rit_count_over_threshold / clamp_min(assignment_manager_rit_count, 1) > %(alertsOldRegionsInTransition)s
||| % $._config,
||| % this.config,
'for': '5m',
labels: {
severity: 'warning',
Expand All @@ -52,14 +52,14 @@
description:
(
'{{printf "%%.0f" $value}} percent of RegionServers in transition in cluster {{$labels.hbase_cluster}} are transitioning for longer than expected, which is above the threshold of %(alertsOldRegionsInTransition)s percent'
) % $._config,
) % this.config,
},
},
{
alert: 'HBaseHighMasterAuthFailRate',
expr: |||
100 * rate(master_authentication_failures[5m]) / (clamp_min(rate(master_authentication_successes[5m]), 1) + clamp_min(rate(master_authentication_failures[5m]), 1)) > %(alertsHighMasterAuthFailRate)s
||| % $._config,
||| % this.config,
'for': '5m',
labels: {
severity: 'warning',
Expand All @@ -69,14 +69,14 @@
description:
(
'{{printf "%%.0f" $value}} percent of authentication attempts to the master are failing in cluster {{$labels.hbase_cluster}}, which is above the threshold of %(alertsHighMasterAuthFailRate)s percent'
) % $._config,
) % this.config,
},
},
{
alert: 'HBaseHighRSAuthFailRate',
expr: |||
100 * rate(region_server_authentication_failures[5m]) / (clamp_min(rate(region_server_authentication_successes[5m]), 1) + clamp_min(rate(region_server_authentication_failures[5m]), 1)) > %(alertsHighRSAuthFailRate)s
||| % $._config,
||| % this.config,
'for': '5m',
labels: {
severity: 'warning',
Expand All @@ -86,7 +86,7 @@
description:
(
'{{printf "%%.0f" $value}} percent of authentication attempts to the RegionServer {{$labels.instance}} are failing in cluster {{$labels.hbase_cluster}}, which is above the threshold of %(alertsHighRSAuthFailRate)s percent'
) % $._config,
) % this.config,
},
},
],
Expand Down
43 changes: 29 additions & 14 deletions apache-hbase-mixin/config.libsonnet
Original file line number Diff line number Diff line change
@@ -1,20 +1,35 @@
{
_config+:: {
filterSelector: 'job="integrations/apache-hbase"',
local this = self,
filteringSelector: 'job="integrations/apache-hbase"',
groupLabels: ['job', 'hbase_cluster'],
instanceLabels: ['instance'],
logLabels: ['job', 'hbase_cluster', 'instance'],

dashboardTags: ['apache-hbase-mixin'],
dashboardPeriod: 'now-30m',
dashboardTimezone: 'default',
dashboardRefresh: '1m',
dashboardTags: [self.uid + '-mixin'],
uid: 'apache-hbase',
dashboardNamePrefix: 'Apache HBase',
dashboardPeriod: 'now-30m',
dashboardTimezone: 'default',
dashboardRefresh: '1m',
metricsSource: ['prometheus', 'prometheusv2'],

// alerts thresholds
alertsHighHeapMemUsage: 80, // percentage
alertsHighNonHeapMemUsage: 80, // percentage
alertsDeadRegionServer: 0, // count
alertsOldRegionsInTransition: 50, // percentage
alertsHighMasterAuthFailRate: 35, // percentage
alertsHighRSAuthFailRate: 35, // percentage
// Logging configuration
enableLokiLogs: true,
extraLogLabels: ['level'],
logsVolumeGroupBy: 'level',
showLogsVolume: true,

enableLokiLogs: true,
// Alerts thresholds
alertsHighHeapMemUsage: 80, // percentage
alertsHighNonHeapMemUsage: 80, // percentage
alertsDeadRegionServer: 0, // count
alertsOldRegionsInTransition: 50, // percentage
alertsHighMasterAuthFailRate: 35, // percentage
alertsHighRSAuthFailRate: 35, // percentage

// Signals configuration
signals+: {
cluster: (import './signals/cluster.libsonnet')(this),
regionserver: (import './signals/regionserver.libsonnet')(this),
},
}
105 changes: 105 additions & 0 deletions apache-hbase-mixin/dashboards.libsonnet
Original file line number Diff line number Diff line change
@@ -0,0 +1,105 @@
local g = import './g.libsonnet';
local commonlib = import 'common-lib/common/main.libsonnet';
local logslib = import 'logs-lib/logs/main.libsonnet';

{
  // Handle on this outermost object; nested expressions below use it to reach applyCommon.
  local lib = self,

  // Builds the mixin's dashboards and returns them as an object keyed by output file name.
  //
  // `this` is expected to expose:
  //   - config:  dashboardNamePrefix, dashboardTags, uid, dashboardRefresh, dashboardPeriod,
  //              dashboardTimezone, enableLokiLogs, filteringSelector, logLabels,
  //              extraLogLabels, showLogsVolume, logsVolumeGroupBy
  //   - grafana: links, variables, annotations, rows
  new(this)::
    local cfg = this.config;
    local grafana = this.grafana;

    local namePrefix = cfg.dashboardNamePrefix;
    local baseUid = g.util.string.slugify(cfg.uid);
    local dashTags = cfg.dashboardTags;
    local dashLinks = grafana.links;
    local gvars = grafana.variables;
    local dashAnnotations = grafana.annotations;
    local tz = cfg.dashboardTimezone;
    local refreshEvery = cfg.dashboardRefresh;
    local timeFrom = cfg.dashboardPeriod;

    // Passes `rows` through g.util.grid.wrapPanels and
    // g.util.panel.resolveCollapsedFlagOnRows, then attaches the result as the dashboard panels.
    local panelsFor(rows) =
      g.dashboard.withPanels(
        g.util.panel.resolveCollapsedFlagOnRows(
          g.util.grid.wrapPanels(rows)
        )
      );

    // Metrics dashboards; these are always emitted.
    local overviewDashboards = {
      'apache-hbase-cluster-overview.json':
        g.dashboard.new(namePrefix + ' cluster overview')
        + panelsFor([grafana.rows.clusterOverview])
        + lib.applyCommon(
          gvars.multiInstance,
          baseUid + '-cluster-overview',
          dashTags,
          // Re-declared as a hidden field so std.objectValues (in applyCommon) leaves it out;
          // presumably this is the dashboard's link to itself.
          dashLinks { clusterOverview:: {} },
          dashAnnotations,
          tz,
          refreshEvery,
          timeFrom,
        ),

      'apache-hbase-regionserver-overview.json':
        g.dashboard.new(namePrefix + ' RegionServer overview')
        + panelsFor([grafana.rows.regionServerOverview])
        + lib.applyCommon(
          gvars.multiInstance,
          baseUid + '-regionserver-overview',
          dashTags,
          // Same treatment for the regionServerOverview link.
          dashLinks { regionServerOverview:: {} },
          dashAnnotations,
          tz,
          refreshEvery,
          timeFrom,
        ),
    };

    // Logs dashboard; only emitted when config.enableLokiLogs is true.
    local logsDashboards =
      if cfg.enableLokiLogs then
        {
          'apache-hbase-logs.json':
            logslib.new(
              namePrefix + ' logs',
              datasourceName=gvars.datasources.loki.name,
              datasourceRegex=gvars.datasources.loki.regex,
              filterSelector=cfg.filteringSelector,
              labels=cfg.logLabels + cfg.extraLogLabels,
              formatParser=null,
              showLogsVolume=cfg.showLogsVolume,
              logsVolumeGroupBy=cfg.logsVolumeGroupBy,
            )
            {
              dashboards+: {
                // `super.logs` is the logs dashboard produced by logslib; its own variable list
                // is handed back to applyCommon.
                // NOTE(review): the uid suffix here is '_logs' while the overview dashboards use
                // '-' separators — confirm this is intentional.
                logs+:
                  lib.applyCommon(
                    super.logs.templating.list,
                    uid=baseUid + '_logs',
                    tags=dashTags,
                    links=dashLinks { logs+:: {} },
                    annotations=dashAnnotations,
                    timezone=tz,
                    refresh=refreshEvery,
                    period=timeFrom,
                  ),
              },
              panels+: {
                logs+:
                  g.panel.logs.options.withEnableLogDetails(true)
                  + g.panel.logs.options.withShowTime(false)
                  + g.panel.logs.options.withWrapLogMessage(false),
              },
              variables+: {
                // Appends the prometheus datasource variable with `hide: 2`.
                // NOTE(review): presumably Grafana's "hide variable" setting — confirm.
                toArray+: [
                  gvars.datasources.prometheus { hide: 2 },
                ],
              },
            }.dashboards.logs,
        }
      else {};

    overviewDashboards + logsDashboards,

  // Returns the dashboard mixins shared by every dashboard built above: tags, uid, links,
  // timezone, refresh interval, time range start, variables and annotations.
  // `links` and `annotations` are objects; only their visible field values are used.
  applyCommon(vars, uid, tags, links, annotations, timezone, refresh, period):
    local identity =
      g.dashboard.withTags(tags)
      + g.dashboard.withUid(uid)
      + g.dashboard.withLinks(std.objectValues(links));
    local timing =
      g.dashboard.withTimezone(timezone)
      + g.dashboard.withRefresh(refresh)
      + g.dashboard.time.withFrom(period);
    local content =
      g.dashboard.withVariables(vars)
      + g.dashboard.withAnnotations(std.objectValues(annotations));
    identity + timing + content,
}
Loading
Loading