Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions kafka-observ-lib/.pint.hcl
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
// Disable the promql/fragile check: the JVM memory alerts use PromQL selectors
// that it reports as fragile.
// NOTE(review): as written, the check is listed as disabled for this whole
// config rather than only for the JVM memory alerts — confirm that scope is intended.
checks {
disabled = ["promql/fragile"]
}
291 changes: 257 additions & 34 deletions kafka-observ-lib/alerts.libsonnet

Large diffs are not rendered by default.

122 changes: 61 additions & 61 deletions kafka-observ-lib/dashboards_out/kafka-overview-dashboard.json

Large diffs are not rendered by default.

14 changes: 7 additions & 7 deletions kafka-observ-lib/dashboards_out/kafka-topic-dashboard.json

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

4 changes: 2 additions & 2 deletions kafka-observ-lib/panels/replicaManager.libsonnet
Original file line number Diff line number Diff line change
Expand Up @@ -25,8 +25,8 @@ local commonlib = import 'common-lib/common/main.libsonnet';
uncleanLeaderElectionStat:
signals.clusterReplicaManager.uncleanLeaderElection.asStat()
+ commonlib.panels.generic.stat.base.stylize(),
preferredReplicaInbalanceStat:
signals.clusterReplicaManager.preferredReplicaInbalance.asStat()
preferredReplicaImbalanceStat:
signals.clusterReplicaManager.preferredReplicaImbalance.asStat()
+ commonlib.panels.generic.stat.base.stylize(),

onlinePartitionsStat:
Expand Down
235 changes: 221 additions & 14 deletions kafka-observ-lib/prometheus_rules_out/prometheus_alerts.yaml

Large diffs are not rendered by default.

4 changes: 2 additions & 2 deletions kafka-observ-lib/rows.libsonnet
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ local commonlib = import 'common-lib/common/main.libsonnet';
panels.cluster.activeControllers { gridPos+: { w: 3, h: 4 } },
panels.cluster.brokersCount { gridPos+: { w: 3, h: 4 } },
panels.replicaManager.uncleanLeaderElectionStat { gridPos+: { w: 3, h: 4 } },
panels.replicaManager.preferredReplicaInbalanceStat { gridPos+: { w: 3, h: 4 } },
panels.replicaManager.preferredReplicaImbalanceStat { gridPos+: { w: 3, h: 4 } },
panels.cluster.clusterBytesBothPerSec { gridPos+: { w: 6, h: 8 } },
panels.cluster.clusterMessagesPerSec { gridPos+: { w: 6, h: 8 } },
//next row
Expand All @@ -19,7 +19,7 @@ local commonlib = import 'common-lib/common/main.libsonnet';
panels.replicaManager.underReplicatedPartitionsStat { gridPos+: { w: 3, h: 4 } },
panels.replicaManager.underMinISRPartitionsStat { gridPos+: { w: 3, h: 4 } },
// status rows
panels.cluster.clusterRoles { gridPos+: { w: 24, h: 7 } },
panels.cluster.clusterRoles { gridPos+: { w: 24, h: 10 } },
]
),
throughput:
Expand Down
15 changes: 12 additions & 3 deletions kafka-observ-lib/signals/broker.libsonnet
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,10 @@ function(this)
signals: {
brokerMessagesInPerSec: {
name: 'Broker messages in',
description: 'Broker messages in.',
description: |||
Rate of incoming messages published to this broker across all topics.
Tracks producer throughput and write workload.
|||,
type: 'counter',
unit: 'mps',
sources: {
Expand All @@ -32,7 +35,10 @@ function(this)
},
brokerBytesInPerSec: {
name: 'Broker bytes in',
description: 'Broker bytes in rate.',
description: |||
Rate of incoming data in bytes published to this broker from producers.
Measures network and storage write load.
|||,
type: 'counter',
unit: 'Bps',
sources: {
Expand All @@ -49,7 +55,10 @@ function(this)
},
brokerBytesOutPerSec: {
name: 'Broker bytes out',
description: 'Broker bytes out rate.',
description: |||
Rate of outgoing data in bytes sent from this broker to consumers and followers.
Measures network read load and consumer throughput.
|||,
type: 'counter',
unit: 'Bps',
sources: {
Expand Down
42 changes: 34 additions & 8 deletions kafka-observ-lib/signals/brokerReplicaManager.libsonnet
Original file line number Diff line number Diff line change
Expand Up @@ -80,7 +80,10 @@ function(this)
onlinePartitions: {
name: 'Online partitions',
description: |||
Online partitions.
Number of partitions that are currently online and available on this broker. This includes
partitions where this broker is either the leader or a follower replica. The total count
reflects the broker's share of the topic partitions across the cluster. A sudden drop in
online partitions may indicate broker issues, partition reassignments, or cluster rebalancing.
|||,
type: 'gauge',
unit: 'short',
Expand All @@ -103,7 +106,11 @@ function(this)
offlinePartitions: {
name: 'Offline partitions',
description: |||
Number of partitions that dont have an active leader and are hence not writable or readable.
Number of partitions that don't have an active leader and are hence not writable or readable.
Offline partitions indicate a critical availability issue as producers cannot write to these
partitions and consumers cannot read from them. This typically occurs when all replicas for
a partition are down or when there are not enough in-sync replicas to elect a new leader.
Any non-zero value requires immediate investigation and remediation to restore service availability.
|||,
type: 'gauge',
unit: 'short',
Expand All @@ -125,7 +132,11 @@ function(this)
underReplicatedPartitions: {
name: 'Under replicated partitions',
description: |||
Number of under replicated partitions (| ISR | < | all replicas |).
Number of partitions that have fewer in-sync replicas (ISR) than the configured replication factor.
Under-replicated partitions indicate potential data availability issues, as there are fewer copies
of the data than desired. This could be caused by broker failures, network issues, or brokers
falling behind in replication. A high number of under-replicated partitions poses a risk to
data durability and availability, as the loss of additional brokers could result in data loss.
|||,
type: 'gauge',
unit: 'short',
Expand All @@ -145,7 +156,11 @@ function(this)
underMinISRPartitions: {
name: 'Under min ISR partitions',
description: |||
Under min ISR(In-Sync replicas) partitions.
Number of partitions that have fewer in-sync replicas (ISR) than the configured minimum ISR threshold.
When the number of in-sync replicas for a partition falls below the min.insync.replicas setting, the
partition rejects writes from producers using acks=all, which helps prevent data loss but impacts
availability. This metric indicates potential issues with broker availability, network connectivity,
or replication lag that need immediate attention to restore write availability.
|||,
type: 'gauge',
unit: 'short',
Expand All @@ -165,7 +180,12 @@ function(this)
uncleanLeaderElection: {
name: 'Unclean leader election',
description: |||
Unclean leader election rate.
Rate of unclean leader elections occurring in the cluster. An unclean leader election happens
when a partition leader fails and a replica that was not fully in-sync (not in the ISR) is
elected as the new leader. This results in potential data loss as the new leader may be missing
messages that were committed to the previous leader. Unclean elections occur when unclean.leader.election.enable
is set to true and there are no in-sync replicas available. Any occurrence of unclean elections
indicates a serious problem with cluster availability and replication health that risks data integrity.
|||,
type: 'raw',
unit: 'short',
Expand All @@ -182,10 +202,16 @@ function(this)
},
},
},
preferredReplicaInbalance: {
name: 'Preferred replica inbalance',
preferredReplicaImbalance: {
name: 'Preferred replica imbalance',
description: |||
The count of topic partitions for which the leader is not the preferred leader.
The count of topic partitions for which the leader is not the preferred leader. In Kafka,
each partition has a preferred leader replica (typically the first replica in the replica list).
When leadership is not on the preferred replica, the cluster may experience uneven load distribution
across brokers, leading to performance imbalances. This can occur after broker failures and restarts,
or during cluster maintenance. Running the preferred replica election can help rebalance leadership
and optimize cluster performance. A consistently high imbalance may indicate issues with automatic
leader rebalancing or the need for manual intervention.
|||,
type: 'gauge',
unit: 'short',
Expand Down
30 changes: 23 additions & 7 deletions kafka-observ-lib/signals/cluster.libsonnet
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,9 @@ function(this)
activeControllers: {
name: 'Active kafka controllers',
description: |||
Active kafka controllers count.
Number of active controllers in the cluster. Should always be exactly 1.
Zero indicates no controller elected, preventing cluster operations.
More than one indicates split-brain requiring immediate attention.
|||,
type: 'gauge',
unit: 'short',
Expand All @@ -41,7 +43,10 @@ function(this)
role: {
name: 'Current role',
description: |||
0 - follower, 1 - controller.
Broker's current controller role: 0 indicates follower, 1 indicates active controller.
Only one broker should have value 1 at any time.
Used to identify which broker is managing cluster metadata and leadership.
|||,
type: 'gauge',
unit: 'short',
Expand Down Expand Up @@ -119,7 +124,9 @@ function(this)
kraftBrokerRole: {
name: 'Current role (kraft)',
description: |||
Any value - broker in kraft.
Broker state in KRaft mode (Kafka without ZooKeeper).
The value itself is not significant: any value indicates the broker is running in KRaft mode.
Used to identify KRaft-enabled brokers in the cluster.
|||,
type: 'gauge',
unit: 'short',
Expand Down Expand Up @@ -155,7 +162,7 @@ function(this)
brokersCount: {
name: 'Brokers count',
description: |||
Active brokers count.
Total number of active brokers currently registered and reporting in the cluster.
|||,
type: 'gauge',
unit: 'short',
Expand All @@ -178,7 +185,10 @@ function(this)

clusterMessagesInPerSec: {
name: 'Cluster messages in',
description: 'Cluster messages in.',
description: |||
Aggregate rate of incoming messages across all brokers and topics in the cluster.
Represents total producer throughput and write workload.
|||,
type: 'counter',
unit: 'mps',
sources: {
Expand All @@ -195,7 +205,10 @@ function(this)
},
clusterBytesInPerSec: {
name: 'Cluster bytes in',
description: 'Cluster bytes in rate.',
description: |||
Aggregate rate of incoming data in bytes across all brokers from producers.
Measures total network ingress and storage write load.
|||,
type: 'counter',
unit: 'Bps',
sources: {
Expand All @@ -212,7 +225,10 @@ function(this)
},
clusterBytesOutPerSec: {
name: 'Cluster bytes out',
description: 'Cluster bytes out rate.',
description: |||
Aggregate rate of outgoing data in bytes across all brokers to consumers and followers.
Measures total network egress load and consumer throughput.
|||,
type: 'counter',
unit: 'Bps',
sources: {
Expand Down
18 changes: 15 additions & 3 deletions kafka-observ-lib/signals/consumerGroup.libsonnet
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,11 @@ function(this)
signals: {
consumerGroupLag: {
name: 'Consumer group lag',
description: 'Current approximate lag of a ConsumerGroup at Topic/Partition.',
description: |||
Number of messages a consumer group is behind the latest available offset for a topic partition.
High or growing lag indicates consumers can't keep up with producer throughput.
Critical metric for consumer health and real-time processing requirements.
|||,
type: 'gauge',
unit: 'short',
aggFunction: 'sum',
Expand All @@ -35,7 +39,11 @@ function(this)

consumerGroupLagTime: {
name: 'Consumer group lag in ms',
description: 'Current approximate lag of a ConsumerGroup at Topic/Partition.',
description: |||
Time lag in milliseconds between message production and consumption for a consumer group.
Represents real-time delay in message processing.
More intuitive than message count lag for understanding business impact of delays.
|||,
type: 'gauge',
unit: 'ms',
optional: true,
Expand All @@ -50,7 +58,11 @@ function(this)

consumerGroupConsumeRate: {
name: 'Consumer group consume rate',
description: 'Consumer group consume rate.',
description: |||
Rate at which a consumer group is consuming and committing offsets for a topic.
Measures consumer throughput and processing speed.
Should match or exceed producer rate to prevent growing lag.
|||,
type: 'counter',
unit: 'mps',
sources: {
Expand Down
12 changes: 10 additions & 2 deletions kafka-observ-lib/signals/conversion.libsonnet
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,11 @@ function(this)
signals: {
producerConversion: {
name: 'Message conversion (producer)',
description: 'The number of messages produced converted to match the log.message.format.version.',
description: |||
Rate of producer messages requiring format conversion to match the broker's log.message.format.version.
Conversions add CPU overhead and latency.
Non-zero values suggest producer and broker version mismatches requiring alignment.
|||,
type: 'counter',
unit: 'mps',
sources: {
Expand All @@ -32,7 +36,11 @@ function(this)
},
consumerConversion: {
name: 'Message conversion (consumer)',
description: 'The number of messages consumed converted at consumer to match the log.message.format.version.',
description: |||
Rate of messages requiring format conversion during consumer fetch to match log.message.format.version.
Conversions impact broker CPU and consumer latency.
Indicates version mismatch between stored messages and consumer expectations.
|||,
type: 'counter',
unit: 'mps',
sources: {
Expand Down
Loading