Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -2,3 +2,4 @@
vendor
jsonnetfile.lock.json
*.zip
.worktrees
2 changes: 2 additions & 0 deletions sap-hana-mixin/.lint
Original file line number Diff line number Diff line change
Expand Up @@ -12,3 +12,5 @@ exclusions:
reason: "mixtool upgrade made this rule stricter. TODO: Fix errors and remove the warning exclusion"
target-instance-rule:
reason: "SAP HANA uses the label 'host' to identify separate instances"
panel-datasource-rule:
reason: "Modern mixins use signal-based architecture where datasource references are handled by the framework"
Original file line number Diff line number Diff line change
@@ -1,14 +1,14 @@
{
prometheusAlerts+:: {
groups+: [
new(this): {
groups: [
{
name: 'sap-hana-alerts',
name: this.config.uid + '-alerts',
rules: [
{
alert: 'SapHanaHighCpuUtilization',
expr: |||
sum without (database_name) (hanadb_cpu_busy_percent) > %(alertsCriticalHighCpuUsage)s
||| % $._config,
||| % this.config,
'for': '5m',
labels: {
severity: 'critical',
Expand All @@ -17,15 +17,15 @@
summary: 'CPU utilization is high.',
description:
(
'The CPU usage is at {{$labels.value}}%% on {{$labels.core}} on {{$labels.host}} which is above the threshold of %(alertsCriticalHighCpuUsage)s%%.'
) % $._config,
'The CPU usage is at {{$value}}%% on {{$labels.core}} on {{$labels.host}} which is above the threshold of %(alertsCriticalHighCpuUsage)s%%.'
) % this.config,
},
},
{
alert: 'SapHanaHighPhysicalMemoryUsage',
alert: 'SapHanaHighPhysicalMemory',
expr: |||
100 * sum without (database_name)(hanadb_host_memory_resident_mb) / sum without (database_name) (hanadb_host_memory_physical_total_mb) > %(alertsCriticalHighPhysicalMemoryUsage)s
||| % $._config,
||| % this.config,
'for': '5m',
labels: {
severity: 'critical',
Expand All @@ -34,15 +34,15 @@
summary: 'Current physical memory usage of the host is approaching capacity.',
description:
(
'The physical memory usage is at {{$labels.value}}%% on {{$labels.host}} which is above the threshold of %(alertsCriticalHighPhysicalMemoryUsage)s%%.'
) % $._config,
'The physical memory usage is at {{$value}}%% on {{$labels.host}} which is above the threshold of %(alertsCriticalHighPhysicalMemoryUsage)s%%.'
) % this.config,
},
},
{
alert: 'SapHanaMemAllocLimitBelowRecommendation',
alert: 'SapHanaMemAllocBelowLimit',
expr: |||
100 * sum without (database_name) (hanadb_host_memory_alloc_limit_mb) / sum without (database_name) (hanadb_host_memory_physical_total_mb) < %(alertsWarningLowMemAllocLimit)s
||| % $._config,
||| % this.config,
'for': '5m',
labels: {
severity: 'warning',
Expand All @@ -51,15 +51,15 @@
summary: 'Memory allocation limit set below recommended limit.',
description:
(
'The memory allocation limit is set at {{$labels.value}}%% on {{$labels.host}} which is below the recommended value of %(alertsWarningLowMemAllocLimit)s%%.'
) % $._config,
'The memory allocation limit is set at {{$value}}%% on {{$labels.host}} which is below the recommended value of %(alertsWarningLowMemAllocLimit)s%%.'
) % this.config,
},
},
{
alert: 'SapHanaHighMemoryUsage',
expr: |||
100 * sum without (database_name) (hanadb_host_memory_used_total_mb) / sum without (database_name) (hanadb_host_memory_alloc_limit_mb) > %(alertsCriticalHighMemoryUsage)s
||| % $._config,
||| % this.config,
'for': '5m',
labels: {
severity: 'critical',
Expand All @@ -68,15 +68,15 @@
summary: 'Current SAP HANA memory usage is approaching capacity.',
description:
(
'The memory usage is at {{$labels.value}}%% on {{$labels.host}} which is above the threshold of %(alertsCriticalHighMemoryUsage)s%%.'
) % $._config,
'The memory usage is at {{$value}}%% on {{$labels.host}} which is above the threshold of %(alertsCriticalHighMemoryUsage)s%%.'
) % this.config,
},
},
{
alert: 'SapHanaHighDiskUtilization',
expr: |||
100 * sum without (database_name, filesystem_type, path, usage_type) (hanadb_disk_total_used_size_mb) / sum without (database_name, filesystem_type, path, usage_type) (hanadb_disk_total_size_mb) > %(alertsCriticalHighDiskUtilization)s
||| % $._config,
||| % this.config,
'for': '5m',
labels: {
severity: 'critical',
Expand All @@ -85,15 +85,15 @@
summary: 'SAP HANA disk is approaching capacity.',
description:
(
'The disk usage is at {{$labels.value}}%% on {{$labels.host}} which is above the threshold of %(alertsCriticalHighDiskUtilization)s%%.'
) % $._config,
'The disk usage is at {{$value}}%% on {{$labels.host}} which is above the threshold of %(alertsCriticalHighDiskUtilization)s%%.'
) % this.config,
},
},
{
alert: 'SapHanaHighSqlExecutionTime',
expr: |||
avg without (database_name, port, service, sql_type) (hanadb_sql_service_elap_per_exec_avg_ms) / 1000 > %(alertsCriticalHighSqlExecutionTime)s
||| % $._config,
||| % this.config,
'for': '5m',
labels: {
severity: 'critical',
Expand All @@ -102,15 +102,15 @@
summary: 'SAP HANA SQL average execution time is high.',
description:
(
'The average SQL execution time is at {{$labels.value}}s on {{$labels.host}} which is above the threshold of %(alertsCriticalHighSqlExecutionTime)ss.'
) % $._config,
'The average SQL execution time is at {{$value}}s on {{$labels.host}} which is above the threshold of %(alertsCriticalHighSqlExecutionTime)ss.'
) % this.config,
},
},
{
alert: 'SapHanaHighReplicationShippingTime',
alert: 'SapHanaHighReplicationDelay',
expr: |||
avg without (database_name, port, secondary_port, replication_mode) (hanadb_sr_ship_delay) > %(alertsCriticalHighReplicationShippingTime)s
||| % $._config,
||| % this.config,
'for': '5m',
labels: {
severity: 'critical',
Expand All @@ -119,15 +119,15 @@
summary: 'SAP HANA system replication log shipping delay is high.',
description:
(
'The average system replication log shipping delay is at {{$labels.value}}s from primary site {{$labels.site_name}} to replica site {{$labels.secondary_site_name}} which is above the threshold of %(alertsCriticalHighReplicationShippingTime)ss.'
) % $._config,
'The average system replication log shipping delay is at {{$value}}s from primary site {{$labels.site_name}} to replica site {{$labels.secondary_site_name}} which is above the threshold of %(alertsCriticalHighReplicationShippingTime)ss.'
) % this.config,
},
},
{
alert: 'SapHanaReplicationStatusError',
alert: 'SapHanaReplicationError',
expr: |||
hanadb_sr_replication == 4
||| % $._config,
||| % this.config,
'for': '5m',
labels: {
severity: 'critical',
Expand All @@ -137,7 +137,7 @@
description:
(
'The replication status of replica {{$labels.secondary_site_name}} is ERROR'
) % $._config,
) % this.config,
},
},
],
Expand Down
57 changes: 43 additions & 14 deletions sap-hana-mixin/config.libsonnet
Original file line number Diff line number Diff line change
@@ -1,19 +1,48 @@
{
_config+:: {
dashboardTags: ['sap-hana-mixin'],
dashboardPeriod: 'now-1h',
dashboardTimezone: 'default',
dashboardRefresh: '1m',
local this = self,

// alerts thresholds
alertsCriticalHighCpuUsage: 80, // percent 0-100
alertsCriticalHighPhysicalMemoryUsage: 80, // percent 0-100
alertsWarningLowMemAllocLimit: 90, // percent 0-100
alertsCriticalHighMemoryUsage: 80, // percent 0-100
alertsCriticalHighDiskUtilization: 80, //percent 0-100
alertsCriticalHighSqlExecutionTime: 1, // second
alertsCriticalHighReplicationShippingTime: 1, //second
// Basic filtering
filteringSelector: 'job=~"$job", sid=~"$sid"',
groupLabels: ['job', 'sid'],
instanceLabels: ['host'],

enableLokiLogs: true,
// Dashboard settings
dashboardTags: ['sap-hana-mixin'],
uid: 'sap-hana',
dashboardNamePrefix: 'SAP HANA',
dashboardRefresh: '1m',
dashboardPeriod: 'now-1h',
dashboardTimezone: 'default',

// Logs configuration
enableLokiLogs: true,
logLabels: ['job', 'sid', 'host', 'filename'],
extraLogLabels: [],
logsVolumeGroupBy: 'level',
showLogsVolume: true,

// Alert thresholds
alertsCriticalHighCpuUsage: 80, // %
alertsCriticalHighPhysicalMemoryUsage: 80, // %
alertsWarningLowMemAllocLimit: 90, // %
alertsCriticalHighMemoryUsage: 80, // %
alertsCriticalHighDiskUtilization: 80, // %
alertsCriticalHighSqlExecutionTime: 1, // s
alertsCriticalHighReplicationShippingTime: 1, // s

// Metrics source
metricsSource: 'prometheus',

// Signals
signals+: {
cpu: (import './signals/cpu.libsonnet')(this),
memory: (import './signals/memory.libsonnet')(this),
disk: (import './signals/disk.libsonnet')(this),
network: (import './signals/network.libsonnet')(this),
replication: (import './signals/replication.libsonnet')(this),
sql: (import './signals/sql.libsonnet')(this),
connections: (import './signals/connections.libsonnet')(this),
storage: (import './signals/storage.libsonnet')(this),
alerts: (import './signals/alerts.libsonnet')(this),
},
}
Loading
Loading