Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -2,3 +2,4 @@
vendor
jsonnetfile.lock.json
*.zip
.worktrees
Original file line number Diff line number Diff line change
@@ -1,14 +1,14 @@
{
prometheusAlerts+:: {
new(this): {
groups+: [
{
name: $._config.uid + '-alerts',
name: this.config.uid + '-alerts',
rules: [
{
alert: 'OpenSearchYellowCluster',
expr: |||
opensearch_cluster_status{%(filteringSelector)s} == 1
||| % $._config,
||| % this.config,
'for': '5m',
labels: {
severity: 'warning',
Expand All @@ -18,14 +18,14 @@
description:
(
'{{$labels.cluster}} health status is yellow over the last 5 minutes'
) % $._config,
) % this.config,
},
},
{
alert: 'OpenSearchRedCluster',
expr: |||
opensearch_cluster_status{%(filteringSelector)s} == 2
||| % $._config,
||| % this.config,
'for': '5m',
labels: {
severity: 'critical',
Expand All @@ -35,14 +35,14 @@
description:
(
'{{$labels.cluster}} health status is red over the last 5 minutes'
) % $._config,
) % this.config,
},
},
{
alert: 'OpenSearchUnstableShardReallocation',
expr: |||
sum without(type) (opensearch_cluster_shards_number{%(filteringSelector)s, type="relocating"}) > %(alertsWarningShardReallocations)s
||| % $._config,
||| % this.config,
'for': '1m',
labels: {
severity: 'warning',
Expand All @@ -51,14 +51,14 @@
summary: 'A node has gone offline or has been disconnected triggering shard reallocation.',
description: |||
{{$labels.cluster}} has had {{ printf "%%.0f" $value }} shard reallocation over the last 1m which is above the threshold of %(alertsWarningShardReallocations)s.
||| % $._config,
||| % this.config,
},
},
{
alert: 'OpenSearchUnstableShardUnassigned',
expr: |||
sum without(type) (opensearch_cluster_shards_number{%(filteringSelector)s, type="unassigned"}) > %(alertsWarningShardUnassigned)s
||| % $._config,
||| % this.config,
'for': '5m',
labels: {
severity: 'warning',
Expand All @@ -67,14 +67,14 @@
summary: 'There are shards that have been detected as unassigned.',
description: |||
{{$labels.cluster}} has had {{ printf "%%.0f" $value }} shard unassigned over the last 5m which is above the threshold of %(alertsWarningShardUnassigned)s.
||| % $._config,
||| % this.config,
},
},
{
alert: 'OpenSearchHighNodeDiskUsage',
expr: |||
100 * sum without(nodeid, path, mount, type) ((opensearch_fs_path_total_bytes{%(filteringSelector)s} - opensearch_fs_path_free_bytes{%(filteringSelector)s}) / opensearch_fs_path_total_bytes{%(filteringSelector)s}) > %(alertsWarningDiskUsage)s
||| % $._config,
||| % this.config,
'for': '5m',
labels: {
severity: 'warning',
Expand All @@ -83,14 +83,14 @@
summary: 'The node disk usage has exceeded the warning threshold.',
description: |||
{{$labels.node}} has had {{ printf "%%.0f" $value }} disk usage over the last 5m which is above the threshold of %(alertsWarningDiskUsage)s.
||| % $._config,
||| % this.config,
},
},
{
alert: 'OpenSearchHighNodeDiskUsage',
expr: |||
100 * sum without(nodeid, path, mount, type) ((opensearch_fs_path_total_bytes{%(filteringSelector)s} - opensearch_fs_path_free_bytes{%(filteringSelector)s}) / opensearch_fs_path_total_bytes{%(filteringSelector)s}) > %(alertsCriticalDiskUsage)s
||| % $._config,
||| % this.config,
'for': '5m',
labels: {
severity: 'critical',
Expand All @@ -99,14 +99,14 @@
summary: 'The node disk usage has exceeded the critical threshold.',
description: |||
{{$labels.node}} has had {{ printf "%%.0f" $value }}%% disk usage over the last 5m which is above the threshold of %(alertsCriticalDiskUsage)s.
||| % $._config,
||| % this.config,
},
},
{
alert: 'OpenSearchHighNodeCpuUsage',
expr: |||
sum without(nodeid) (opensearch_os_cpu_percent{%(filteringSelector)s}) > %(alertsWarningCPUUsage)s
||| % $._config,
||| % this.config,
'for': '5m',
labels: {
severity: 'warning',
Expand All @@ -115,14 +115,14 @@
summary: 'The node CPU usage has exceeded the warning threshold.',
description: |||
{{$labels.node}} has had {{ printf "%%.0f" $value }}%% CPU usage over the last 5m which is above the threshold of %(alertsWarningCPUUsage)s.
||| % $._config,
||| % this.config,
},
},
{
alert: 'OpenSearchHighNodeCpuUsage',
expr: |||
sum without(nodeid) (opensearch_os_cpu_percent{%(filteringSelector)s}) > %(alertsCriticalCPUUsage)s
||| % $._config,
||| % this.config,
'for': '5m',
labels: {
severity: 'critical',
Expand All @@ -131,14 +131,14 @@
summary: 'The node CPU usage has exceeded the critical threshold.',
description: |||
{{$labels.node}} has had {{ printf "%%.0f" $value }}%% CPU usage over the last 5m which is above the threshold of %(alertsCriticalCPUUsage)s.
||| % $._config,
||| % this.config,
},
},
{
alert: 'OpenSearchHighNodeMemoryUsage',
expr: |||
sum without(nodeid) (opensearch_os_mem_used_percent{%(filteringSelector)s}) > %(alertsWarningMemoryUsage)s
||| % $._config,
||| % this.config,
'for': '5m',
labels: {
severity: 'warning',
Expand All @@ -147,14 +147,14 @@
summary: 'The node memory usage has exceeded the warning threshold.',
description: |||
{{$labels.node}} has had {{ printf "%%.0f" $value }}%% memory usage over the last 5m which is above the threshold of %(alertsWarningMemoryUsage)s.
||| % $._config,
||| % this.config,
},
},
{
alert: 'OpenSearchHighNodeMemoryUsage',
expr: |||
sum without(nodeid) (opensearch_os_mem_used_percent{%(filteringSelector)s}) > %(alertsCriticalMemoryUsage)s
||| % $._config,
||| % this.config,
'for': '5m',
labels: {
severity: 'critical',
Expand All @@ -163,14 +163,14 @@
summary: 'The node memory usage has exceeded the critical threshold.',
description: |||
{{$labels.node}} has had {{ printf "%%.0f" $value }}%% memory usage over the last 5m which is above the threshold of %(alertsCriticalMemoryUsage)s.
||| % $._config,
||| % this.config,
},
},
{
alert: 'OpenSearchModerateRequestLatency',
expr: |||
sum without(context) ((increase(opensearch_index_search_fetch_time_seconds{%(filteringSelector)s, context="total"}[5m])+increase(opensearch_index_search_query_time_seconds{context="total"}[5m])+increase(opensearch_index_search_scroll_time_seconds{context="total"}[5m])) / clamp_min(increase(opensearch_index_search_fetch_count{context="total"}[5m])+increase(opensearch_index_search_query_count{context="total"}[5m])+increase(opensearch_index_search_scroll_count{context="total"}[5m]), 1)) > %(alertsWarningRequestLatency)s
||| % $._config,
||| % this.config,
'for': '5m',
labels: {
severity: 'warning',
Expand All @@ -179,14 +179,14 @@
summary: 'The request latency has exceeded the warning threshold.',
description: |||
{{$labels.index}} has had {{ printf "%%.0f" $value }}s of request latency over the last 5m which is above the threshold of %(alertsWarningRequestLatency)s.
||| % $._config,
||| % this.config,
},
},
{
alert: 'OpenSearchModerateIndexLatency',
expr: |||
sum without(context) (increase(opensearch_index_indexing_index_time_seconds{%(filteringSelector)s, context="total"}[5m]) / clamp_min(increase(opensearch_index_indexing_index_count{context="total"}[5m]), 1)) > %(alertsWarningIndexLatency)s
||| % $._config,
||| % this.config,
'for': '5m',
labels: {
severity: 'warning',
Expand All @@ -195,7 +195,7 @@
summary: 'The index latency has exceeded the warning threshold.',
description: |||
{{$labels.index}} has had {{ printf "%%.0f" $value }}s of index latency over the last 5m which is above the threshold of %(alertsWarningIndexLatency)s.
||| % $._config,
||| % this.config,
},
},
],
Expand Down
71 changes: 46 additions & 25 deletions opensearch-mixin/config.libsonnet
Original file line number Diff line number Diff line change
@@ -1,31 +1,52 @@
{
_config+:: {
enableMultiCluster: false,
// extra static selector to apply to all templated variables and alerts
filteringSelector: if self.enableMultiCluster then 'cluster!="",opensearch_cluster!=""' else 'opensearch_cluster!=""',
groupLabels: if self.enableMultiCluster then ['job', 'cluster', 'opensearch_cluster'] else ['job', 'opensearch_cluster'],
instanceLabels: ['node'],
dashboardTags: ['opensearch-mixin'],
dashboardPeriod: 'now-1h',
dashboardTimezone: 'default',
dashboardRefresh: '1m',
dashboardNamePrefix: '',
local this = self,
filteringSelector: if self.enableMultiCluster then 'cluster!="",opensearch_cluster!=""' else 'opensearch_cluster!=""',
groupLabels: if self.enableMultiCluster then ['job', 'cluster', 'opensearch_cluster'] else ['job', 'opensearch_cluster'],
logLabels: ['job', 'cluster', 'node'],
instanceLabels: ['node'],

// prefix dashboards uids
uid: 'opensearch',
dashboardTags: [self.uid],
uid: 'opensearch',
dashboardNamePrefix: 'OpenSearch',
dashboardPeriod: 'now-1h',
dashboardTimezone: 'default',
dashboardRefresh: '1m',
metricsSource: 'prometheus', // metrics source for signals

// alerts thresholds
alertsWarningShardReallocations: 0,
alertsWarningShardUnassigned: 0,
alertsWarningDiskUsage: 60,
alertsCriticalDiskUsage: 80,
alertsWarningCPUUsage: 70,
alertsCriticalCPUUsage: 85,
alertsWarningMemoryUsage: 70,
alertsCriticalMemoryUsage: 85,
alertsWarningRequestLatency: 0.5, // seconds
alertsWarningIndexLatency: 0.5, // seconds
// Agg Lists
groupAggList: std.join(',', this.groupLabels),
groupAggListWithInstance: std.join(',', this.groupLabels + this.instanceLabels),

// Multi-cluster support
enableMultiCluster: false,
opensearchSelector: if self.enableMultiCluster then 'job=~"$job", instance=~"$instance", cluster=~"$cluster"' else 'job=~"$job", instance=~"$instance"',

enableLokiLogs: true,
// Logging configuration
enableLokiLogs: true,
extraLogLabels: ['level', 'severity'], // Required by logs-lib
logsVolumeGroupBy: 'level',
showLogsVolume: true,
logExpression: '{job=~"$job", cluster=~"$cluster", instance=~"$instance", exception_class=~".+"} | json | line_format "{{.severity}} {{.exception_class}} - {{.exception_message}}" | drop time_extracted, severity_extracted, exception_class_extracted, correlation_id_extracted',

// Alerts configuration
alertsWarningShardReallocations: 0, // count
alertsWarningShardUnassigned: 0, // count
alertsWarningDiskUsage: 60, // %
alertsCriticalDiskUsage: 80, // %
alertsWarningCPUUsage: 70, // %
alertsCriticalCPUUsage: 85, // %
alertsWarningMemoryUsage: 70, // %
alertsCriticalMemoryUsage: 85, // %
alertsWarningRequestLatency: 0.5, // seconds
alertsWarningIndexLatency: 0.5, // seconds

// Signals configuration
signals+: {
cluster: (import './signals/cluster.libsonnet')(this),
node: (import './signals/node.libsonnet')(this),
topk: (import './signals/topk.libsonnet')(this),
roles: (import './signals/roles.libsonnet')(this),
search: (import './signals/search.libsonnet')(this),
indexing: (import './signals/indexing.libsonnet')(this),
},
}
Loading
Loading