grafana · v-zhuravlev · Oct 21, 2025 · Oct 21, 2025 · Oct 21, 2025 · Oct 21, 2025
@@ -0,0 +1,4 @@
+// Ignore fragile PromQL selectors for JVM memory alerts.
+checks {
+  disabled = ["promql/fragile"]
+}
@@ -25,8 +25,8 @@ local commonlib = import 'common-lib/common/main.libsonnet';
     uncleanLeaderElectionStat:
       signals.clusterReplicaManager.uncleanLeaderElection.asStat()
       + commonlib.panels.generic.stat.base.stylize(),
-    preferredReplicaInbalanceStat:
-      signals.clusterReplicaManager.preferredReplicaInbalance.asStat()
+    preferredReplicaImbalanceStat:
+      signals.clusterReplicaManager.preferredReplicaImbalance.asStat()
       + commonlib.panels.generic.stat.base.stylize(),
 
     onlinePartitionsStat:

@@ -10,7 +10,7 @@ local commonlib = import 'common-lib/common/main.libsonnet';
           panels.cluster.activeControllers { gridPos+: { w: 3, h: 4 } },
           panels.cluster.brokersCount { gridPos+: { w: 3, h: 4 } },
           panels.replicaManager.uncleanLeaderElectionStat { gridPos+: { w: 3, h: 4 } },
-          panels.replicaManager.preferredReplicaInbalanceStat { gridPos+: { w: 3, h: 4 } },
+          panels.replicaManager.preferredReplicaImbalanceStat { gridPos+: { w: 3, h: 4 } },
           panels.cluster.clusterBytesBothPerSec { gridPos+: { w: 6, h: 8 } },
           panels.cluster.clusterMessagesPerSec { gridPos+: { w: 6, h: 8 } },
           //next row
@@ -19,7 +19,7 @@ local commonlib = import 'common-lib/common/main.libsonnet';
           panels.replicaManager.underReplicatedPartitionsStat { gridPos+: { w: 3, h: 4 } },
           panels.replicaManager.underMinISRPartitionsStat { gridPos+: { w: 3, h: 4 } },
           // status rows
-          panels.cluster.clusterRoles { gridPos+: { w: 24, h: 7 } },
+          panels.cluster.clusterRoles { gridPos+: { w: 24, h: 10 } },
         ]
       ),
     throughput:

@@ -15,7 +15,10 @@ function(this)
     signals: {
       brokerMessagesInPerSec: {
         name: 'Broker messages in',
-        description: 'Broker messages in.',
+        description: |||
+          Rate of incoming messages published to this broker across all topics.  
+          Tracks producer throughput and write workload.  
+        |||,
         type: 'counter',
         unit: 'mps',
         sources: {
@@ -32,7 +35,10 @@ function(this)
       },
       brokerBytesInPerSec: {
         name: 'Broker bytes in',
-        description: 'Broker bytes in rate.',
+        description: |||
+          Rate of incoming data in bytes published to this broker from producers.  
+          Measures network and storage write load.  
+        |||,
         type: 'counter',
         unit: 'Bps',
         sources: {
@@ -49,7 +55,10 @@ function(this)
       },
       brokerBytesOutPerSec: {
         name: 'Broker bytes out',
-        description: 'Broker bytes out rate.',
+        description: |||
+          Rate of outgoing data in bytes sent from this broker to consumers and followers.  
+          Measures network read load and consumer throughput.  
+        |||,
         type: 'counter',
         unit: 'Bps',
         sources: {

@@ -80,7 +80,10 @@ function(this)
       onlinePartitions: {
         name: 'Online partitions',
         description: |||
-          Online partitions.
+          Number of partitions that are currently online and available on this broker. This includes
+          partitions where this broker is either the leader or a follower replica. The total count
+          reflects the broker's share of the topic partitions across the cluster. A sudden drop in
+          online partitions may indicate broker issues, partition reassignments, or cluster rebalancing.
         |||,
         type: 'gauge',
         unit: 'short',
@@ -103,7 +106,11 @@ function(this)
       offlinePartitions: {
         name: 'Offline partitions',
         description: |||
-          Number of partitions that dont have an active leader and are hence not writable or readable.
+          Number of partitions that don't have an active leader and are hence not writable or readable.
+          Offline partitions indicate a critical availability issue as producers cannot write to these
+          partitions and consumers cannot read from them. This typically occurs when all replicas for
+          a partition are down or when there are not enough in-sync replicas to elect a new leader.
+          Any non-zero value requires immediate investigation and remediation to restore service availability.
         |||,
         type: 'gauge',
         unit: 'short',
@@ -125,7 +132,11 @@ function(this)
       underReplicatedPartitions: {
         name: 'Under replicated partitions',
         description: |||
-          Number of under replicated partitions (| ISR | < | all replicas |).
+          Number of partitions that have fewer in-sync replicas (ISR) than the configured replication factor.
+          Under-replicated partitions indicate potential data availability issues, as there are fewer copies
+          of the data than desired. This could be caused by broker failures, network issues, or brokers
+          falling behind in replication. A high number of under-replicated partitions poses a risk to
+          data durability and availability, as the loss of additional brokers could result in data loss.
         |||,
         type: 'gauge',
         unit: 'short',
@@ -145,7 +156,11 @@ function(this)
       underMinISRPartitions: {
         name: 'Under min ISR partitions',
         description: |||
-          Under min ISR(In-Sync replicas) partitions.
+          Number of partitions that have fewer in-sync replicas (ISR) than the configured minimum ISR threshold.
+          When the number of ISRs for a partition falls below the min.insync.replicas setting, the partition
+          becomes unavailable for writes (if acks=all is configured), which helps prevent data loss but impacts
+          availability. This metric indicates potential issues with broker availability, network connectivity,
+          or replication lag that need immediate attention to restore write availability.
         |||,
         type: 'gauge',
         unit: 'short',
@@ -165,7 +180,12 @@ function(this)
       uncleanLeaderElection: {
         name: 'Unclean leader election',
         description: |||
-          Unclean leader election rate.
+          Rate of unclean leader elections occurring in the cluster. An unclean leader election happens
+          when a partition leader fails and a replica that was not fully in-sync (not in the ISR) is
+          elected as the new leader. This results in potential data loss as the new leader may be missing
+          messages that were committed to the previous leader. Unclean elections occur when unclean.leader.election.enable
+          is set to true and there are no in-sync replicas available. Any occurrence of unclean elections
+          indicates a serious problem with cluster availability and replication health that risks data integrity.
         |||,
         type: 'raw',
         unit: 'short',
@@ -182,10 +202,16 @@ function(this)
             },
         },
       },
-      preferredReplicaInbalance: {
-        name: 'Preferred replica inbalance',
+      preferredReplicaImbalance: {
+        name: 'Preferred replica imbalance',
         description: |||
-          The count of topic partitions for which the leader is not the preferred leader.
+          The count of topic partitions for which the leader is not the preferred leader. In Kafka,
+          each partition has a preferred leader replica (typically the first replica in the replica list).
+          When leadership is not on the preferred replica, the cluster may experience uneven load distribution
+          across brokers, leading to performance imbalances. This can occur after broker failures and restarts,
+          or during cluster maintenance. Running the preferred replica election can help rebalance leadership
+          and optimize cluster performance. A consistently high imbalance may indicate issues with automatic
+          leader rebalancing or the need for manual intervention.
         |||,
         type: 'gauge',
         unit: 'short',

@@ -17,7 +17,9 @@ function(this)
       activeControllers: {
         name: 'Active kafka controllers',
         description: |||
-          Active kafka controllers count.
+          Number of active controllers in the cluster. Should always be exactly 1.  
+          Zero indicates no controller elected, preventing cluster operations.  
+          More than one indicates split-brain requiring immediate attention.
         |||,
         type: 'gauge',
         unit: 'short',
@@ -41,7 +43,10 @@ function(this)
       role: {
         name: 'Current role',
         description: |||
-          0 - follower, 1 - controller.
+          Broker's current controller role: 0 indicates follower, 1 indicates active controller.  
+          Only one broker should have value 1 at any time.  
+          Used to identify which broker is managing cluster metadata
+          and leadership.
         |||,
         type: 'gauge',
         unit: 'short',
@@ -119,7 +124,9 @@ function(this)
       kraftBrokerRole: {
         name: 'Current role (kraft)',
         description: |||
-          Any value - broker in kraft.
+          Broker state in KRaft mode (Kafka without ZooKeeper).  
+          Any value indicates the broker is running in KRaft mode.  
+          Used to identify KRaft-enabled brokers in the cluster.
         |||,
         type: 'gauge',
         unit: 'short',
@@ -155,7 +162,7 @@ function(this)
       brokersCount: {
         name: 'Brokers count',
         description: |||
-          Active brokers count.
+          Total number of active brokers currently registered and reporting in the cluster.  
         |||,
         type: 'gauge',
         unit: 'short',
@@ -178,7 +185,10 @@ function(this)
 
       clusterMessagesInPerSec: {
         name: 'Cluster messages in',
-        description: 'Cluster messages in.',
+        description: |||
+          Aggregate rate of incoming messages across all brokers and topics in the cluster.  
+          Represents total producer throughput and write workload.  
+        |||,
         type: 'counter',
         unit: 'mps',
         sources: {
@@ -195,7 +205,10 @@ function(this)
       },
       clusterBytesInPerSec: {
         name: 'Cluster bytes in',
-        description: 'Cluster bytes in rate.',
+        description: |||
+          Aggregate rate of incoming data in bytes across all brokers from producers.  
+          Measures total network ingress and storage write load.  
+        |||,
         type: 'counter',
         unit: 'Bps',
         sources: {
@@ -212,7 +225,10 @@ function(this)
       },
       clusterBytesOutPerSec: {
         name: 'Cluster bytes out',
-        description: 'Cluster bytes out rate.',
+        description: |||
+          Aggregate rate of outgoing data in bytes across all brokers to consumers and followers.  
+          Measures total network egress load and consumer throughput.  
+        |||,
         type: 'counter',
         unit: 'Bps',
         sources: {

@@ -16,7 +16,11 @@ function(this)
     signals: {
       consumerGroupLag: {
         name: 'Consumer group lag',
-        description: 'Current approximate lag of a ConsumerGroup at Topic/Partition.',
+        description: |||
+          Number of messages a consumer group is behind the latest available offset for a topic partition.  
+          High or growing lag indicates consumers can't keep up with producer throughput.  
+          Critical metric for consumer health and real-time processing requirements.
+        |||,
         type: 'gauge',
         unit: 'short',
         aggFunction: 'sum',
@@ -35,7 +39,11 @@ function(this)
 
       consumerGroupLagTime: {
         name: 'Consumer group lag in ms',
-        description: 'Current approximate lag of a ConsumerGroup at Topic/Partition.',
+        description: |||
+          Time lag in milliseconds between message production and consumption for a consumer group.  
+          Represents real-time delay in message processing.  
+          More intuitive than message count lag for understanding business impact of delays.
+        |||,
         type: 'gauge',
         unit: 'ms',
         optional: true,
@@ -50,7 +58,11 @@ function(this)
 
       consumerGroupConsumeRate: {
         name: 'Consumer group consume rate',
-        description: 'Consumer group consume rate.',
+        description: |||
+          Rate at which a consumer group is consuming and committing offsets for a topic.  
+          Measures consumer throughput and processing speed.  
+          Should match or exceed producer rate to prevent growing lag.
+        |||,
         type: 'counter',
         unit: 'mps',
         sources: {

@@ -15,7 +15,11 @@ function(this)
     signals: {
       producerConversion: {
         name: 'Message conversion (producer)',
-        description: 'The number of messages produced converted to match the log.message.format.version.',
+        description: |||
+          Rate of producer messages requiring format conversion to match broker's log.message.format.version.  
+          Conversions add CPU overhead and latency.  
+          Non-zero values suggest producer and broker version mismatches requiring alignment.
+        |||,
         type: 'counter',
         unit: 'mps',
         sources: {
@@ -32,7 +36,11 @@ function(this)
       },
       consumerConversion: {
         name: 'Message conversion (consumer)',
-        description: 'The number of messages consumed converted at consumer to match the log.message.format.version.',
+        description: |||
+          Rate of messages requiring format conversion during consumer fetch to match log.message.format.version.  
+          Conversions impact broker CPU and consumer latency.  
+          Indicates version mismatch between stored messages and consumer expectations.
+        |||,
         type: 'counter',
         unit: 'mps',
         sources: {