Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -2,3 +2,4 @@
vendor
jsonnetfile.lock.json
*.zip
.worktrees
2 changes: 2 additions & 0 deletions ibm-mq-mixin/.lint
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,8 @@ exclusions:
- panel: "Queue operations"
- panel: "Operations"
- panel: "Depth"
panel-datasource-rule:
reason: "Panels using signal-based targets with multiple queries may show '-- Mixed --' datasource, which is expected for panels with multi-datasource support"
template-datasource-rule:
reason: "Based on new convention we are using variable names prometheus_datasource and loki_datasource where as linter expects 'datasource'"
template-instance-rule:
Expand Down
79 changes: 79 additions & 0 deletions ibm-mq-mixin/alerts.libsonnet
Original file line number Diff line number Diff line change
@@ -0,0 +1,79 @@
{
  // Builds the Prometheus alerting rule group for IBM MQ.
  // `this.config` supplies `uid` (used as the group name prefix) and the
  // `alerts*` thresholds substituted into each expression and description.
  new(this): {
    local cfg = this.config,

    // All four rules have the same shape and the same 5m pending period.
    // `exprTemplate` and `descriptionTemplate` are %-formatted against cfg;
    // `summary` is emitted verbatim.
    local rule(name, severity, exprTemplate, summary, descriptionTemplate) = {
      alert: name,
      expr: exprTemplate % cfg,
      'for': '5m',
      labels: { severity: severity },
      annotations: {
        summary: summary,
        description: descriptionTemplate % cfg,
      },
    },

    groups: [
      {
        name: cfg.uid + '-alerts',
        rules: [
          // Expired message count above the configured threshold.
          rule(
            'IBMMQExpiredMessages',
            'critical',
            |||
              sum without (description,hostname,instance,job,platform) (ibmmq_qmgr_expired_message_count) > %(alertsExpiredMessages)s
            |||,
            'There are expired messages, which imply that application resilience is failing.',
            'The number of expired messages in the {{$labels.qmgr}} is {{$value}} which is above the threshold of %(alertsExpiredMessages)s.'
          ),

          // Oldest message age at or above the configured threshold.
          rule(
            'IBMMQStaleMessages',
            'warning',
            |||
              sum without (description,instance,job,platform) (ibmmq_queue_oldest_message_age) >= %(alertsStaleMessagesSeconds)s
            |||,
            'Stale messages have been detected.',
            'A stale message with an age of {{$value}} has been sitting in the {{$labels.queue}} which is above the threshold of %(alertsStaleMessagesSeconds)s seconds.'
          ),

          // Free file-system space percentage at or below the configured threshold.
          // `%%` in the description renders as a literal percent sign after formatting.
          rule(
            'IBMMQLowDiskSpace',
            'critical',
            |||
              sum without (description,hostname,instance,job,platform) (ibmmq_qmgr_queue_manager_file_system_free_space_percentage) <= %(alertsLowDiskSpace)s
            |||,
            'There is limited disk available for a queue manager.',
            'The amount of disk space available for {{$labels.qmgr}} is at {{$value}}%% which is below the threshold of %(alertsLowDiskSpace)s%%.'
          ),

          // User CPU time estimate percentage at or above the configured threshold.
          rule(
            'IBMMQHighQMgrCpuUsage',
            'critical',
            |||
              sum without (description,hostname,instance,job,platform) (ibmmq_qmgr_user_cpu_time_estimate_for_queue_manager_percentage) >= %(alertsHighQueueManagerCpuUsage)s
            |||,
            'There is a high CPU usage estimate for a queue manager.',
            'The amount of CPU usage for the queue manager {{$labels.qmgr}} is at {{$value}}%% which is above the threshold of %(alertsHighQueueManagerCpuUsage)s%%.'
          ),
        ],
      },
    ],
  },
}
56 changes: 41 additions & 15 deletions ibm-mq-mixin/config.libsonnet
Original file line number Diff line number Diff line change
@@ -1,20 +1,46 @@
{
// NOTE(review): this span looks like a diff view with the +/- markers stripped, so removed
// and added lines are interleaved (enableMultiCluster, dashboardTags, logExpression,
// enableLokiLogs and the alert thresholds each appear twice, and the braces do not balance).
// Confirm against the real config.libsonnet before editing anything here.
_config+:: {
enableMultiCluster: false,
// Label matchers for queries; the cluster matcher is added only when enableMultiCluster is true.
ibmmqSelector: if self.enableMultiCluster then 'job=~"$job", cluster=~"$cluster"' else 'job=~"$job"',
dashboardTags: ['ibm-mq-mixin'],
dashboardPeriod: 'now-1h',
dashboardTimezone: 'default',
dashboardRefresh: '1m',
// Log stream matchers; always scoped by qmgr, plus cluster when enableMultiCluster is true.
logExpression: if self.enableMultiCluster then 'job=~"$job", cluster=~"$cluster", qmgr=~"$qmgr"'
else 'job=~"$job", qmgr=~"$qmgr"',
// Alias for this config object; it is the argument passed to every signals import below.
local this = self,

//alerts thresholds
alertsExpiredMessages: 2, //count
alertsStaleMessagesSeconds: 300, //seconds
alertsLowDiskSpace: 5, //percentage: 0-100
alertsHighQueueManagerCpuUsage: 85, //percentage: 0-100
// Enable multi-cluster support
enableMultiCluster: false,

enableLokiLogs: true,
// Basic filtering and labeling
// filteringSelector is empty by default; anything that splices it into a label-matcher
// list has to cope with the empty string.
filteringSelector: '',
// 'cluster' is a group label only when enableMultiCluster is true.
groupLabels: if self.enableMultiCluster then ['job', 'cluster'] else ['job'],
instanceLabels: ['qmgr'],

// Dashboard settings
dashboardTags: ['ibm-mq-mixin'],
dashboardPeriod: 'now-1h',
dashboardTimezone: 'default',
dashboardRefresh: '1m',
dashboardNamePrefix: 'IBM MQ',
uid: 'ibm-mq',

// Log settings
enableLokiLogs: true,
logLabels: ['job', 'qmgr', 'filename'],
// 'cluster' is an extra log label only when enableMultiCluster is true.
extraLogLabels: if self.enableMultiCluster then ['cluster'] else [],
logsVolumeGroupBy: 'level',
showLogsVolume: true,
logExpression: if self.enableMultiCluster then 'job=~"$job", cluster=~"$cluster", qmgr=~"$qmgr"' else 'job=~"$job", qmgr=~"$qmgr"',

// Alert thresholds
alertsExpiredMessages: 2, // count
alertsStaleMessagesSeconds: 300, // seconds
alertsLowDiskSpace: 5, // %
alertsHighQueueManagerCpuUsage: 85, // %

// Metrics source
metricsSource: 'prometheus',

// Signal definitions
// Each imported file evaluates to a function that is called with `this` (the config object).
signals: {
cluster: (import './signals/cluster.libsonnet')(this),
qmgr: (import './signals/qmgr.libsonnet')(this),
queue: (import './signals/queue.libsonnet')(this),
topic: (import './signals/topic.libsonnet')(this),
subscription: (import './signals/subscription.libsonnet')(this),
channel: (import './signals/channel.libsonnet')(this),
},
}
138 changes: 138 additions & 0 deletions ibm-mq-mixin/dashboards.libsonnet
Original file line number Diff line number Diff line change
@@ -0,0 +1,138 @@
local g = import './g.libsonnet';
local commonlib = import 'common-lib/common/main.libsonnet';

{
  local root = self,

  // Builds the IBM MQ dashboards: clusterOverview, queueManagerOverview,
  // queueOverview and topicsOverview.
  //
  // `this.config` supplies the name prefix, uid, tags, time settings and
  // filteringSelector; `this.grafana` supplies links, variables, annotations
  // and the pre-built rows placed on each dashboard.
  new(this): {
    local cfg = this.config,
    local prefix = cfg.dashboardNamePrefix,
    local links = this.grafana.links,
    local tags = cfg.dashboardTags,
    local uid = cfg.uid,
    local vars = this.grafana.variables,
    local annotations = this.grafana.annotations,
    local refresh = cfg.dashboardRefresh,
    local period = cfg.dashboardPeriod,
    local timezone = cfg.dashboardTimezone,
    local rows = this.grafana.rows,
    local queryVar = g.dashboard.variable.query,

    // Joins filteringSelector and `matcher` into one label-matcher list.
    // Empty parts are dropped so that an empty filteringSelector does not
    // render a leading comma (`metric{,label!~"..."}`), which PromQL rejects.
    // With a non-empty filteringSelector the result is `<selector>,<matcher>`.
    local selectorWith(matcher) =
      std.join(',', std.filter(function(part) part != '', [cfg.filteringSelector, matcher])),

    // Builds a query variable named `label`, populated from the values of
    // `label` on `metric` restricted by selectorWith(matcher). It includes an
    // "All" option with value `.+` and is sorted alphabetically.
    local labelValuesVar(label, displayName, metric, matcher) =
      queryVar.new(label)
      + queryVar.withDatasourceFromVariable(vars.datasources.prometheus)
      + queryVar.queryTypes.withLabelValues(label, '%s{%s}' % [metric, selectorWith(matcher)])
      + queryVar.withRegex('')
      + queryVar.selectionOptions.withIncludeAll(true, '.+')
      + queryVar.generalOptions.withLabel(displayName)
      + queryVar.refresh.onTime()
      + queryVar.withSort(type='alphabetical'),

    // Builds one dashboard titled "<prefix> - <title>" with uid "<uid>-<uidSuffix>",
    // laying out `rowList` from the top and applying the shared settings.
    local overview(title, uidSuffix, rowList, variables) =
      g.dashboard.new(prefix + ' - ' + title)
      + g.dashboard.withPanels(
        g.util.grid.wrapPanels(rowList, panelHeight=1, startY=0)
      )
      + root.applyCommon(
        variables,
        uid + '-' + uidSuffix,
        tags,
        links,
        annotations,
        timezone,
        refresh,
        period
      ),

    clusterOverview:
      overview(
        'cluster overview',
        'cluster-overview',
        [
          rows.clusterOverviewStats,
          rows.clusterStatus,
          rows.clusterChannels,
        ],
        vars.multiInstance
      ),

    queueManagerOverview:
      overview(
        'queue manager overview',
        'queue-manager-overview',
        [
          rows.queueManagerOverviewStats,
          rows.queueManagerStatus,
          rows.queueManagerPerformance,
          rows.queueManagerLogs,
        ],
        vars.multiInstance
      ),

    queueOverview:
      // Queues whose names match SYSTEM.* or AMQ.* are left out of the variable.
      local queueVar = labelValuesVar('queue', 'Queue', 'ibmmq_queue_depth', 'queue!~"SYSTEM.*|AMQ.*"');
      overview(
        'queue overview',
        'queue-overview',
        [
          rows.queueMetrics,
        ],
        vars.multiInstance + [queueVar]
      ),

    topicsOverview:
      // The trailing `|` in each regex also excludes series with an empty label value.
      local topicVar = labelValuesVar('topic', 'Topic', 'ibmmq_topic_subscriber_count', 'topic!~"SYSTEM.*|\\\\$SYS.*|"');
      // NOTE(review): 'messsages' (three s) is reproduced exactly from the original query;
      // presumably it matches the metric name as exported — confirm before "correcting" it.
      local subscriptionVar = labelValuesVar('subscription', 'Subscription', 'ibmmq_subscription_messsages_received', 'subscription!~"SYSTEM.*|"');
      overview(
        'topics overview',
        'topics-overview',
        [
          rows.topics,
          rows.subscriptions,
        ],
        vars.multiInstance + [topicVar, subscriptionVar]
      ),
  },

  // Dashboard mixin with the settings shared by every dashboard: tags, uid,
  // links, timezone, refresh interval, time range start, variables and
  // annotations. `links` and `annotations` are objects; their values are used.
  applyCommon(vars, uid, tags, links, annotations, timezone, refresh, period):
    g.dashboard.withTags(tags)
    + g.dashboard.withUid(uid)
    + g.dashboard.withLinks(std.objectValues(links))
    + g.dashboard.withTimezone(timezone)
    + g.dashboard.withRefresh(refresh)
    + g.dashboard.time.withFrom(period)
    + g.dashboard.withVariables(vars)
    + g.dashboard.withAnnotations(std.objectValues(annotations)),
}
Loading
Loading