diff --git a/kafka-observ-lib/.pint.hcl b/kafka-observ-lib/.pint.hcl new file mode 100644 index 000000000..b4017b815 --- /dev/null +++ b/kafka-observ-lib/.pint.hcl @@ -0,0 +1,4 @@ +//ignore fragile promql selectors for JVM memory alerts +checks { + disabled = ["promql/fragile"] +} diff --git a/kafka-observ-lib/alerts.libsonnet b/kafka-observ-lib/alerts.libsonnet index 3ec08e010..88b55a001 100644 --- a/kafka-observ-lib/alerts.libsonnet +++ b/kafka-observ-lib/alerts.libsonnet @@ -65,12 +65,23 @@ local xtd = import 'github.com/jsonnet-libs/xtd/main.libsonnet'; severity: 'warning', }, annotations: { - summary: 'Kafka ISR expansion rate is expanding.', - description: 'Kafka broker {{ $labels.%s }} in cluster {{ $labels.%s }} In-Sync Replica (ISR) is expanding by {{ $value }} per second. If a broker goes down, ISR for some of the partitions shrink. When that broker is up again, ISRs are expanded once the replicas are fully caught up. Other than that, the expected value for ISR expansion rate is 0. If ISR is expanding and shrinking frequently, adjust Allowed replica lag.' - % [ - instanceLabel, - groupLabel, - ], + summary: 'Kafka ISR expansion detected.', + description: ||| + Kafka broker {{ $labels.%s }} in cluster {{ $labels.%s }} has In-Sync Replica (ISR) expanding at {{ printf "%%.2f" $value }} per second. + + ISR expansion typically occurs when a broker recovers and its replicas catch up to the leader. The expected steady-state value for ISR expansion rate is 0. + + Frequent ISR expansion and shrinkage indicates instability and may suggest: + - Brokers frequently going offline/online + - Network connectivity issues + - Replica lag configuration too tight (adjust replica.lag.max.messages or replica.socket.timeout.ms) + - Insufficient broker resources causing replicas to fall behind + + If this alert fires frequently without corresponding broker outages, investigate broker health and adjust replica lag settings. + ||| % [ + instanceLabel, + groupLabel, + ], }, }, { @@ -87,16 +98,31 @@ local xtd = import 'github.com/jsonnet-libs/xtd/main.libsonnet'; severity: 'warning', }, annotations: { - summary: 'Kafka ISR expansion rate is shrinking.', - description: 'Kafka broker {{ $labels.%s }} in cluster {{ $labels.%s }} In-Sync Replica (ISR) is shrinking by {{ $value }} per second. If a broker goes down, ISR for some of the partitions shrink. When that broker is up again, ISRs are expanded once the replicas are fully caught up. Other than that, the expected value for ISR shrink rate is 0. If ISR is expanding and shrinking frequently, adjust Allowed replica lag.' - % [ - instanceLabel, - groupLabel, - ], + summary: 'Kafka ISR shrinkage detected.', + description: ||| + Kafka broker {{ $labels.%s }} in cluster {{ $labels.%s }} has In-Sync Replica (ISR) shrinking at {{ printf "%%.2f" $value }} per second. + + ISR shrinkage occurs when a replica falls too far behind the leader and is removed from the ISR set. This reduces fault tolerance as fewer replicas are in-sync. + The expected steady-state value for ISR shrink rate is 0. 
+ + Common causes include: + - Broker failures or restarts + - Network latency or connectivity issues + - Replica lag exceeding replica.lag.max.messages threshold + - Replica not contacting leader within replica.socket.timeout.ms + - Insufficient broker resources (CPU, disk I/O, memory) + - High producer throughput overwhelming broker capacity + + If ISR is shrinking without corresponding expansion shortly after, investigate broker health, network connectivity, and resource utilization. + Consider adjusting replica.lag.max.messages or replica.socket.timeout.ms if shrinkage is frequent but brokers are healthy. + ||| % [ + instanceLabel, + groupLabel, + ], }, }, { - alert: 'KafkaOfflinePartitonCount', + alert: 'KafkaOfflinePartitionCount', expr: ||| sum by (%s) (%s) > 0 ||| % [ @@ -104,13 +130,33 @@ local xtd = import 'github.com/jsonnet-libs/xtd/main.libsonnet'; this.signals.brokerReplicaManager.offlinePartitions.asRuleExpression(), ], 'for': '5m', + keep_firing_for: '5m', labels: { severity: 'critical', }, annotations: { - summary: 'Kafka has offline partitons.', - description: 'Kafka cluster {{ $labels.%s }} has {{ $value }} offline partitions. After successful leader election, if the leader for partition dies, then the partition moves to the OfflinePartition state. Offline partitions are not available for reading and writing. Restart the brokers, if needed, and check the logs for errors.' - % groupLabel, + summary: 'Kafka has offline partitions.', + description: ||| + Kafka cluster {{ $labels.%s }} has {{ printf "%%.0f" $value }} offline partitions. + + Offline partitions have no active leader, making them completely unavailable for both reads and writes. This directly impacts application functionality. + + Common causes include: + - All replicas for the partition are down + - No in-sync replicas available for leader election + - Cluster controller issues preventing leader election + - Insufficient replica count for the replication factor + + Immediate actions: + 1. Check broker status - identify which brokers are down + 2. Review broker logs for errors and exceptions + 3. Restart failed brokers if needed + 4. Verify ZooKeeper connectivity + 5. Check for disk space or I/O issues on broker hosts + 6. Monitor ISR status to ensure replicas are catching up + + Until resolved, affected topics cannot serve traffic for these partitions. + ||| % groupLabel, }, }, { @@ -122,16 +168,120 @@ local xtd = import 'github.com/jsonnet-libs/xtd/main.libsonnet'; this.signals.brokerReplicaManager.underReplicatedPartitions.asRuleExpression(), ], 'for': '5m', + keep_firing_for: '5m', + labels: { + severity: 'critical', + }, + annotations: { + summary: 'Kafka has under-replicated partitions.', + description: ||| + Kafka broker {{ $labels.%s }} in cluster {{ $labels.%s }} has {{ printf "%%.0f" $value }} under-replicated partitions. + + Under-replicated partitions have fewer in-sync replicas (ISR) than the configured replication factor, reducing fault tolerance and risking data loss. + + Impact: + - Reduced data durability (fewer backup copies) + - Increased risk of data loss if additional brokers fail + - Lower fault tolerance for partition availability + + Common causes: + - Broker failures or network connectivity issues + - Brokers unable to keep up with replication (resource constraints) + - High producer throughput overwhelming replica capacity + - Disk I/O saturation on replica brokers + - Network partition between brokers + + Actions: + 1. 
Identify which brokers are lagging (check ISR for affected partitions) + 2. Review broker resource utilization (CPU, memory, disk I/O) + 3. Check network connectivity between brokers + 4. Verify broker logs for replication errors + 5. Consider adding broker capacity if consistently under-replicated + 6. Reassign partitions if specific brokers are problematic + ||| % [ + instanceLabel, + groupLabel, + ], + }, + }, + { + alert: 'KafkaUnderMinISRPartitionCount', + expr: ||| + sum by (%s) (%s) > 0 + ||| % [ + std.join(',', this.config.groupLabels), + this.signals.brokerReplicaManager.underMinISRPartitions.asRuleExpression(), + ], + 'for': '2m', + keep_firing_for: '5m', labels: { severity: 'critical', }, annotations: { - summary: 'Kafka has under replicated partitons.', - description: 'Kafka broker {{ $labels.%s }} in cluster {{ $labels.%s }} has {{ $value }} under replicated partitons' - % [ - instanceLabel, - groupLabel, - ], + summary: 'Kafka partitions below minimum ISR - writes unavailable.', + description: ||| + Kafka cluster {{ $labels.%s }} has {{ printf "%%.0f" $value }} partitions with fewer in-sync replicas than min.insync.replicas configuration. + + CRITICAL IMPACT: These partitions are UNAVAILABLE FOR WRITES when producers use acks=all, directly impacting application availability. + + This configuration prevents data loss by refusing writes when not enough replicas are in-sync, but at the cost of availability. + + Common causes: + - Broker failures reducing available replicas below threshold + - Network issues preventing replicas from staying in-sync + - Brokers overwhelmed and unable to keep up with replication + - Recent partition reassignment or broker maintenance + + Immediate actions: + 1. Identify affected partitions and their current ISR status + 2. Check broker health and availability + 3. Review network connectivity between brokers + 4. Investigate broker resource utilization (CPU, disk I/O, memory) + 5. Restart failed brokers or resolve broker issues + 6. Monitor ISR recovery as brokers catch up + + Producers will receive NOT_ENOUGH_REPLICAS errors until ISR count recovers above min.insync.replicas threshold. + ||| % groupLabel, + }, + }, + { + alert: 'KafkaPreferredReplicaImbalance', + expr: ||| + sum by (%s) (%s) > 0 + ||| % [ + std.join(',', this.config.groupLabels), + this.signals.brokerReplicaManager.preferredReplicaImbalance.asRuleExpression(), + ], + 'for': '30m', + keep_firing_for: '5m', + labels: { + severity: 'warning', + }, + annotations: { + summary: 'Kafka has preferred replica imbalance.', + description: ||| + Kafka cluster {{ $labels.%s }} has {{ $value }} partitions where the leader is not the preferred replica. + + Impact: + Uneven load distribution across brokers can result in some brokers handling significantly more client requests (produce/consume) than others, leading to hotspots, degraded performance, and potential resource exhaustion on overloaded brokers. This prevents optimal cluster utilization and can impact latency and throughput. + + Common causes: + - Broker restarts or failures causing leadership to shift to non-preferred replicas + - Manual partition reassignments or replica movements + - Recent broker additions to the cluster + - Failed automatic preferred replica election + - Auto leader rebalancing disabled (auto.leader.rebalance.enable=false) + + Actions: + 1. Verify auto.leader.rebalance.enable is set to true in broker configuration + 2. Check leader.imbalance.check.interval.seconds (default 300s) configuration + 3. 
Manually trigger preferred replica election using kafka-preferred-replica-election tool + 4. Monitor broker resource utilization (CPU, network) for imbalance + 5. Review broker logs for leadership election errors + 6. Verify all brokers are healthy and reachable + + If the imbalance persists for extended periods, consider running manual preferred replica election to redistribute leadership and restore balanced load across the cluster. + ||| % groupLabel, }, }, { @@ -143,21 +293,73 @@ local xtd = import 'github.com/jsonnet-libs/xtd/main.libsonnet'; }, annotations: { summary: 'Kafka has no active controller.', - description: 'Kafka cluster {{ $labels.%s }} has {{ $value }} broker(s) reporting as the active controller in the last 5 minute interval. During steady state there should be only one active controller per cluster.' - % groupLabel, + description: ||| + Kafka cluster {{ $labels.%s }} has {{ $value }} broker(s) reporting as the active controller. Expected exactly 1 active controller. + + CRITICAL impact: + The Kafka controller is responsible for cluster-wide administrative operations including partition leader election, broker failure detection, topic creation/deletion, and partition reassignment. Without an active controller (value=0) or with multiple controllers (value>1), the cluster cannot perform these critical operations, potentially causing: + - Inability to elect new partition leaders when brokers fail + - Topic creation/deletion operations hang indefinitely + - Partition reassignments cannot be executed + - Cluster metadata inconsistencies + - Split-brain scenarios if multiple controllers exist + + Common causes: + - Zookeeper connectivity issues or Zookeeper cluster instability + - Network partitions between brokers and Zookeeper + - Controller broker crash or unclean shutdown + - Long garbage collection pauses on controller broker + - Zookeeper session timeout (zookeeper.session.timeout.ms exceeded) + - Controller election conflicts during network splits + + This is a critical cluster-wide issue requiring immediate attention to restore normal operations. + ||| % groupLabel, }, }, { alert: 'KafkaUncleanLeaderElection', expr: '(%s) != 0' % this.signals.brokerReplicaManager.uncleanLeaderElection.asRuleExpression(), 'for': '5m', + keep_firing_for: '5m', labels: { severity: 'critical', }, annotations: { summary: 'Kafka has unclean leader elections.', - description: 'Kafka cluster {{ $labels.%s }} has {{ $value }} unclean partition leader elections reported in the last 5 minute interval. When unclean leader election is held among out-of-sync replicas, there is a possibility of data loss if any messages were not synced prior to the loss of the former leader. So if the number of unclean elections is greater than 0, investigate broker logs to determine why leaders were re-elected, and look for WARN or ERROR messages. Consider setting the broker configuration parameter unclean.leader.election.enable to false so that a replica outside of the set of in-sync replicas is never elected leader.' - % groupLabel, + description: ||| + Kafka cluster {{ $labels.%s }} has {{ $value }} unclean partition leader elections reported in the last 5 minutes. + + CRITICAL Impact - DATA LOSS RISK: + Unclean leader election occurs when no in-sync replica (ISR) is available to become the leader, forcing Kafka to elect an out-of-sync replica. This WILL result in data loss for any messages that were committed to the previous leader but not replicated to the new leader. 
This compromises data durability guarantees and can cause: + - Permanent loss of committed messages + - Consumer offset inconsistencies + - Duplicate message processing + - Data inconsistencies between producers and consumers + - Violation of at-least-once or exactly-once semantics + + Common causes: + - All ISR replicas failed simultaneously (broker crashes, hardware failures) + - Network partitions isolating all ISR members + - Extended broker downtime exceeding replica lag tolerance + - Insufficient replication factor (RF < 3) combined with broker failures + - min.insync.replicas set too low relative to replication factor + - Disk failures on multiple replicas simultaneously + - Aggressive unclean.leader.election.enable=true configuration + + Immediate actions: + 1. Review broker logs to identify which partitions had unclean elections + 2. Investigate root cause of ISR replica failures (check broker health, hardware, network) + 3. Assess data loss impact by comparing producer and consumer offsets for affected partitions + 4. Alert application teams to potential data loss in affected partitions + 5. Bring failed ISR replicas back online as quickly as possible + 6. Consider resetting consumer offsets if data loss is unacceptable + 7. Review and increase replication factor for critical topics (minimum RF=3) + 8. Set unclean.leader.election.enable=false to prevent future unclean elections (availability vs. durability trade-off) + 9. Increase min.insync.replicas to strengthen durability guarantees + 10. Implement better monitoring for ISR shrinkage to detect issues before unclean elections occur + + This indicates a serious reliability event that requires immediate investigation and remediation to prevent future data loss. + ||| % groupLabel, }, }, { @@ -169,7 +371,17 @@ local xtd = import 'github.com/jsonnet-libs/xtd/main.libsonnet'; }, annotations: { summary: 'Kafka has no brokers online.', - description: 'Kafka cluster {{ $labels.%s }} broker count is 0.' % groupLabel, + description: ||| + Kafka cluster {{ $labels.%s }} has zero brokers reporting metrics. + + No brokers are online or reporting metrics, indicating complete cluster failure. This results in: + - Total unavailability of all topics and partitions + - All produce and consume operations failing + - Complete loss of cluster functionality + - Potential data loss if unclean shutdown occurred + - Application downtime for all services depending on Kafka + + ||| % groupLabel, }, }, { @@ -180,13 +392,24 @@ local xtd = import 'github.com/jsonnet-libs/xtd/main.libsonnet'; severity: 'critical', }, annotations: { - summary: 'Kafka Zookeeper sync disconected.', - description: - 'Kafka broker {{ $labels.%s }} in cluster {{ $labels.%s }} has disconected from Zookeeper.' - % [ - instanceLabel, - groupLabel, - ], + summary: 'Kafka Zookeeper sync disconnected.', + description: ||| + Kafka broker {{ $labels.%s }} in cluster {{ $labels.%s }} has lost connection to Zookeeper. + + Zookeeper connectivity is essential for Kafka broker operation. A disconnected broker cannot: + - Participate in controller elections + - Register or maintain its broker metadata + - Receive cluster state updates + - Serve as partition leader (will be removed from ISR) + - Handle leadership changes or partition reassignments + + This will cause the broker to become isolated from the cluster, leading to under-replicated partitions and potential service degradation for any topics hosted on this broker. 
+ + Prolonged Zookeeper disconnection will result in the broker being ejected from the cluster and leadership reassignments. + ||| % [ + instanceLabel, + groupLabel, + ], }, }, diff --git a/kafka-observ-lib/dashboards_out/kafka-overview-dashboard.json b/kafka-observ-lib/dashboards_out/kafka-overview-dashboard.json index ce55798fd..3ef3e1b90 100644 --- a/kafka-observ-lib/dashboards_out/kafka-overview-dashboard.json +++ b/kafka-observ-lib/dashboards_out/kafka-overview-dashboard.json @@ -29,7 +29,7 @@ "type": "prometheus", "uid": "${datasource}" }, - "description": "Active kafka controllers count.\n", + "description": "Number of active controllers in the cluster. Should always be exactly 1. \nZero indicates no controller elected, preventing cluster operations. \nMore than one indicates split-brain requiring immediate attention.\n", "fieldConfig": { "defaults": { "color": { @@ -89,7 +89,7 @@ "type": "prometheus", "uid": "${datasource}" }, - "description": "Active brokers count.\n", + "description": "Total number of active brokers currently registered and reporting in the cluster. \n", "fieldConfig": { "defaults": { "color": { @@ -149,7 +149,7 @@ "type": "prometheus", "uid": "${datasource}" }, - "description": "Unclean leader election rate.\n", + "description": "Rate of unclean leader elections occurring in the cluster. An unclean leader election happens\nwhen a partition leader fails and a replica that was not fully in-sync (not in the ISR) is\nelected as the new leader. This results in potential data loss as the new leader may be missing\nmessages that were committed to the previous leader. Unclean elections occur when unclean.leader.election.enable\nis set to true and there are no in-sync replicas available. Any occurrence of unclean elections\nindicates a serious problem with cluster availability and replication health that risks data integrity.\n", "fieldConfig": { "defaults": { "color": { @@ -200,7 +200,7 @@ "type": "prometheus", "uid": "${datasource}" }, - "description": "The count of topic partitions for which the leader is not the preferred leader.\n", + "description": "The count of topic partitions for which the leader is not the preferred leader. In Kafka,\neach partition has a preferred leader replica (typically the first replica in the replica list).\nWhen leadership is not on the preferred replica, the cluster may experience uneven load distribution\nacross brokers, leading to performance imbalances. This can occur after broker failures and restarts,\nor during cluster maintenance. Running the preferred replica election can help rebalance leadership\nand optimize cluster performance. 
A consistently high imbalance may indicate issues with automatic\nleader rebalancing or the need for manual intervention.\n", "fieldConfig": { "defaults": { "color": { @@ -212,7 +212,7 @@ { "matcher": { "id": "byFrameRefID", - "options": "Preferred replica inbalance" + "options": "Preferred replica imbalance" }, "properties": [ { @@ -239,11 +239,11 @@ "expr": "sum by (kafka_cluster) (\n kafka_controller_kafkacontroller_preferredreplicaimbalancecount_value{kafka_cluster=~\"$kafka_cluster\",instance=~\"$instance\"}\n)\nor\nsum by (kafka_cluster) (\n kafka_controller_kafkacontroller_preferredreplicaimbalancecount{kafka_cluster=~\"$kafka_cluster\",instance=~\"$instance\"}\n)", "format": "time_series", "instant": false, - "legendFormat": "{{kafka_cluster}}: Preferred replica inbalance", - "refId": "Preferred replica inbalance" + "legendFormat": "{{kafka_cluster}}: Preferred replica imbalance", + "refId": "Preferred replica imbalance" } ], - "title": "Preferred replica inbalance", + "title": "Preferred replica imbalance", "type": "stat" }, { @@ -420,7 +420,7 @@ "type": "prometheus", "uid": "${datasource}" }, - "description": "Online partitions.\n", + "description": "Number of partitions that are currently online and available on this broker. This includes\npartitions where this broker is either the leader or a follower replica. The total count\nreflects the broker's share of the topic partitions across the cluster. A sudden drop in\nonline partitions may indicate broker issues, partition reassignments, or cluster rebalancing.\n", "fieldConfig": { "defaults": { "color": { @@ -471,7 +471,7 @@ "type": "prometheus", "uid": "${datasource}" }, - "description": "Number of partitions that dont have an active leader and are hence not writable or readable.\n", + "description": "Number of partitions that don't have an active leader and are hence not writable or readable.\nOffline partitions indicate a critical availability issue as producers cannot write to these\npartitions and consumers cannot read from them. This typically occurs when all replicas for\na partition are down or when there are not enough in-sync replicas to elect a new leader.\nAny non-zero value requires immediate investigation and remediation to restore service availability.\n", "fieldConfig": { "defaults": { "color": { @@ -522,7 +522,7 @@ "type": "prometheus", "uid": "${datasource}" }, - "description": "Number of under replicated partitions (| ISR | < | all replicas |).\n", + "description": "Number of partitions that have fewer in-sync replicas (ISR) than the configured replication factor.\nUnder-replicated partitions indicate potential data availability issues, as there are fewer copies\nof the data than desired. This could be caused by broker failures, network issues, or brokers\nfalling behind in replication. A high number of under-replicated partitions poses a risk to\ndata durability and availability, as the loss of additional brokers could result in data loss.\n", "fieldConfig": { "defaults": { "color": { @@ -573,7 +573,7 @@ "type": "prometheus", "uid": "${datasource}" }, - "description": "Under min ISR(In-Sync replicas) partitions.\n", + "description": "Number of partitions that have fewer in-sync replicas (ISR) than the configured minimum ISR threshold.\nWhen the number of ISRs for a partition falls below the min.insync.replicas setting, the partition\nbecomes unavailable for writes (if acks=all is configured), which helps prevent data loss but impacts\navailability. 
This metric indicates potential issues with broker availability, network connectivity,\nor replication lag that need immediate attention to restore write availability.\n", "fieldConfig": { "defaults": { "color": { @@ -624,7 +624,7 @@ "type": "prometheus", "uid": "${datasource}" }, - "description": "0 - follower, 1 - controller.\n", + "description": "Broker's current controller role: 0 indicates follower, 1 indicates active controller. \nOnly one broker should have value 1 at any time. \nUsed to identify which broker is managing cluster metadata and leadership.\nCurrent controller role: 0 - follower, 1 - controller.\n", "fieldConfig": { "defaults": { "color": { @@ -728,10 +728,10 @@ ] }, "gridPos": { - "h": 7, + "h": 10, "w": 24, "x": 0, - "y": 12 + "y": 15 }, "maxDataPoints": 100, "options": { @@ -782,7 +782,7 @@ "h": 1, "w": 0, "x": 0, - "y": 20 + "y": 25 }, "panels": [ ], "title": "Throughput", @@ -851,7 +851,7 @@ "h": 8, "w": 12, "x": 0, - "y": 21 + "y": 26 }, "options": { "legend": { @@ -928,7 +928,7 @@ "h": 8, "w": 12, "x": 12, - "y": 21 + "y": 26 }, "options": { "legend": { @@ -963,7 +963,7 @@ "h": 1, "w": 0, "x": 0, - "y": 29 + "y": 34 }, "panels": [ ], "title": "Replication", @@ -974,7 +974,7 @@ "type": "prometheus", "uid": "${datasource}" }, - "description": "Online partitions.\n", + "description": "Number of partitions that are currently online and available on this broker. This includes\npartitions where this broker is either the leader or a follower replica. The total count\nreflects the broker's share of the topic partitions across the cluster. A sudden drop in\nonline partitions may indicate broker issues, partition reassignments, or cluster rebalancing.\n", "fieldConfig": { "defaults": { "custom": { @@ -1004,7 +1004,7 @@ "h": 6, "w": 12, "x": 0, - "y": 30 + "y": 35 }, "options": { "legend": { @@ -1038,7 +1038,7 @@ "type": "prometheus", "uid": "${datasource}" }, - "description": "Number of partitions that dont have an active leader and are hence not writable or readable.\n", + "description": "Number of partitions that don't have an active leader and are hence not writable or readable.\nOffline partitions indicate a critical availability issue as producers cannot write to these\npartitions and consumers cannot read from them. This typically occurs when all replicas for\na partition are down or when there are not enough in-sync replicas to elect a new leader.\nAny non-zero value requires immediate investigation and remediation to restore service availability.\n", "fieldConfig": { "defaults": { "color": { @@ -1077,7 +1077,7 @@ "h": 6, "w": 12, "x": 12, - "y": 30 + "y": 35 }, "maxDataPoints": 100, "options": { @@ -1112,7 +1112,7 @@ "type": "prometheus", "uid": "${datasource}" }, - "description": "Number of under replicated partitions (| ISR | < | all replicas |).\n", + "description": "Number of partitions that have fewer in-sync replicas (ISR) than the configured replication factor.\nUnder-replicated partitions indicate potential data availability issues, as there are fewer copies\nof the data than desired. This could be caused by broker failures, network issues, or brokers\nfalling behind in replication. 
A high number of under-replicated partitions poses a risk to\ndata durability and availability, as the loss of additional brokers could result in data loss.\n", "fieldConfig": { "defaults": { "color": { @@ -1151,7 +1151,7 @@ "h": 6, "w": 12, "x": 0, - "y": 36 + "y": 41 }, "maxDataPoints": 100, "options": { @@ -1186,7 +1186,7 @@ "type": "prometheus", "uid": "${datasource}" }, - "description": "Under min ISR(In-Sync replicas) partitions.\n", + "description": "Number of partitions that have fewer in-sync replicas (ISR) than the configured minimum ISR threshold.\nWhen the number of ISRs for a partition falls below the min.insync.replicas setting, the partition\nbecomes unavailable for writes (if acks=all is configured), which helps prevent data loss but impacts\navailability. This metric indicates potential issues with broker availability, network connectivity,\nor replication lag that need immediate attention to restore write availability.\n", "fieldConfig": { "defaults": { "color": { @@ -1225,7 +1225,7 @@ "h": 6, "w": 12, "x": 12, - "y": 36 + "y": 41 }, "maxDataPoints": 100, "options": { @@ -1290,7 +1290,7 @@ "h": 6, "w": 12, "x": 0, - "y": 42 + "y": 47 }, "options": { "legend": { @@ -1354,7 +1354,7 @@ "h": 6, "w": 12, "x": 12, - "y": 42 + "y": 47 }, "options": { "legend": { @@ -1389,7 +1389,7 @@ "h": 1, "w": 0, "x": 0, - "y": 48 + "y": 53 }, "panels": [ { @@ -1478,7 +1478,7 @@ "h": 6, "w": 8, "x": 0, - "y": 49 + "y": 54 }, "options": { "legend": { @@ -1639,7 +1639,7 @@ "h": 6, "w": 8, "x": 8, - "y": 49 + "y": 54 }, "options": { "legend": { @@ -1800,7 +1800,7 @@ "h": 6, "w": 8, "x": 16, - "y": 49 + "y": 54 }, "options": { "legend": { @@ -1885,7 +1885,7 @@ "h": 1, "w": 0, "x": 0, - "y": 55 + "y": 60 }, "panels": [ { @@ -1893,7 +1893,7 @@ "type": "prometheus", "uid": "${datasource}" }, - "description": "The number of messages produced converted to match the log.message.format.version.", + "description": "Rate of producer messages requiring format conversion to match broker's log.message.format.version. \nConversions add CPU overhead and latency. \nNon-zero values suggest producer and broker version mismatches requiring alignment.\n", "fieldConfig": { "defaults": { "custom": { @@ -1923,7 +1923,7 @@ "h": 6, "w": 12, "x": 0, - "y": 56 + "y": 61 }, "options": { "legend": { @@ -1957,7 +1957,7 @@ "type": "prometheus", "uid": "${datasource}" }, - "description": "The number of messages consumed converted at consumer to match the log.message.format.version.", + "description": "Rate of messages requiring format conversion during consumer fetch to match log.message.format.version. \nConversions impact broker CPU and consumer latency. \nIndicates version mismatch between stored messages and consumer expectations.\n", "fieldConfig": { "defaults": { "custom": { @@ -1987,7 +1987,7 @@ "h": 6, "w": 12, "x": 12, - "y": 56 + "y": 61 }, "options": { "legend": { @@ -2026,7 +2026,7 @@ "h": 1, "w": 0, "x": 0, - "y": 62 + "y": 67 }, "panels": [ { @@ -2034,7 +2034,7 @@ "type": "prometheus", "uid": "${datasource}" }, - "description": "Latency in millseconds for ZooKeeper requests from broker.", + "description": "Latency in milliseconds for ZooKeeper requests from broker to ZooKeeper ensemble. \nHigh latency indicates ZooKeeper performance issues or network problems. 
\nCritical for broker operations like leader election and metadata updates.\n", "fieldConfig": { "defaults": { "custom": { @@ -2064,7 +2064,7 @@ "h": 6, "w": 8, "x": 0, - "y": 63 + "y": 68 }, "options": { "legend": { @@ -2098,7 +2098,7 @@ "type": "prometheus", "uid": "${datasource}" }, - "description": "Zookeeper connections rate.", + "description": "Rate of successful ZooKeeper connections established by broker. \nFrequent connections may indicate session instability or network issues. \nShould be stable in healthy clusters with occasional reconnections during maintenance.\n", "fieldConfig": { "defaults": { "custom": { @@ -2164,7 +2164,7 @@ "h": 6, "w": 8, "x": 8, - "y": 63 + "y": 68 }, "options": { "legend": { @@ -2236,7 +2236,7 @@ "h": 1, "w": 0, "x": 0, - "y": 69 + "y": 74 }, "panels": [ ], "title": "Process overview", @@ -2289,7 +2289,7 @@ "h": 6, "w": 8, "x": 0, - "y": 70 + "y": 75 }, "options": { "colorMode": "value", @@ -2344,7 +2344,7 @@ "h": 6, "w": 8, "x": 8, - "y": 70 + "y": 75 }, "options": { "colorMode": "fixed", @@ -2438,7 +2438,7 @@ "h": 6, "w": 8, "x": 16, - "y": 70 + "y": 75 }, "options": { "legend": { @@ -2521,7 +2521,7 @@ "h": 6, "w": 8, "x": 0, - "y": 76 + "y": 81 }, "options": { "legend": { @@ -2627,7 +2627,7 @@ "h": 6, "w": 8, "x": 8, - "y": 76 + "y": 81 }, "options": { "legend": { @@ -2703,7 +2703,7 @@ "h": 6, "w": 8, "x": 16, - "y": 76 + "y": 81 }, "options": { "legend": { @@ -2749,7 +2749,7 @@ "h": 1, "w": 0, "x": 0, - "y": 82 + "y": 87 }, "panels": [ { @@ -2774,7 +2774,7 @@ "h": 4, "w": 4, "x": 0, - "y": 83 + "y": 88 }, "options": { "colorMode": "value", @@ -2823,7 +2823,7 @@ "h": 4, "w": 4, "x": 4, - "y": 83 + "y": 88 }, "options": { "colorMode": "value", @@ -2882,7 +2882,7 @@ "h": 4, "w": 4, "x": 8, - "y": 83 + "y": 88 }, "pluginVersion": "v11.0.0", "targets": [ @@ -2933,7 +2933,7 @@ "h": 4, "w": 4, "x": 12, - "y": 83 + "y": 88 }, "pluginVersion": "v11.0.0", "targets": [ @@ -2962,7 +2962,7 @@ "h": 1, "w": 0, "x": 0, - "y": 87 + "y": 92 }, "panels": [ { @@ -3055,7 +3055,7 @@ "h": 8, "w": 8, "x": 0, - "y": 88 + "y": 93 }, "h": 6, "options": { @@ -3198,7 +3198,7 @@ "h": 8, "w": 8, "x": 8, - "y": 88 + "y": 93 }, "h": 6, "options": { @@ -3261,7 +3261,7 @@ "h": 1, "w": 0, "x": 0, - "y": 96 + "y": 101 }, "panels": [ { @@ -3364,7 +3364,7 @@ "h": 8, "w": 8, "x": 0, - "y": 97 + "y": 102 }, "options": { "legend": { @@ -3461,7 +3461,7 @@ "h": 8, "w": 8, "x": 8, - "y": 97 + "y": 102 }, "options": { "legend": { diff --git a/kafka-observ-lib/dashboards_out/kafka-topic-dashboard.json b/kafka-observ-lib/dashboards_out/kafka-topic-dashboard.json index d4bff85c7..99ef39e29 100644 --- a/kafka-observ-lib/dashboards_out/kafka-topic-dashboard.json +++ b/kafka-observ-lib/dashboards_out/kafka-topic-dashboard.json @@ -179,7 +179,7 @@ "type": "prometheus", "uid": "${datasource}" }, - "description": "Messages in per second.", + "description": "Rate of messages produced to this topic across all partitions. \nIndicates topic write activity and producer throughput. \nUse to identify hot topics and understand data flow patterns.\n", "fieldConfig": { "defaults": { "custom": { @@ -243,7 +243,7 @@ "type": "prometheus", "uid": "${datasource}" }, - "description": "Topic bytes in rate.", + "description": "Rate of incoming data in bytes written to this topic from producers. 
\n", "fieldConfig": { "defaults": { "custom": { @@ -310,7 +310,7 @@ "type": "prometheus", "uid": "${datasource}" }, - "description": "Topic bytes out rate.", + "description": "Rate of outgoing data in bytes read from this topic by consumers. \n", "fieldConfig": { "defaults": { "custom": { @@ -389,7 +389,7 @@ "type": "prometheus", "uid": "${datasource}" }, - "description": "Consumer group consume rate.", + "description": "Rate at which a consumer group is consuming and committing offsets for a topic. \nMeasures consumer throughput and processing speed. \nShould match or exceed producer rate to prevent growing lag.\n", "fieldConfig": { "defaults": { "custom": { @@ -516,7 +516,7 @@ "type": "prometheus", "uid": "${datasource}" }, - "description": "Consumer group consume rate.", + "description": "Rate at which a consumer group is consuming and committing offsets for a topic. \nMeasures consumer throughput and processing speed. \nShould match or exceed producer rate to prevent growing lag.\n", "fieldConfig": { "defaults": { "custom": { @@ -580,7 +580,7 @@ "type": "prometheus", "uid": "${datasource}" }, - "description": "Current approximate lag of a ConsumerGroup at Topic/Partition.", + "description": "Number of messages a consumer group is behind the latest available offset for a topic partition. \nHigh or growing lag indicates consumers can't keep up with producer throughput. \nCritical metric for consumer health and real-time processing requirements.\n", "fieldConfig": { "defaults": { "custom": { @@ -644,7 +644,7 @@ "type": "prometheus", "uid": "${datasource}" }, - "description": "Current approximate lag of a ConsumerGroup at Topic/Partition.", + "description": "Time lag in milliseconds between message production and consumption for a consumer group. \nRepresents real-time delay in message processing. \nMore intuitive than message count lag for understanding business impact of delays.\n", "fieldConfig": { "defaults": { "custom": { diff --git a/kafka-observ-lib/panels/replicaManager.libsonnet b/kafka-observ-lib/panels/replicaManager.libsonnet index b9bc8d5cd..edec8ba8f 100644 --- a/kafka-observ-lib/panels/replicaManager.libsonnet +++ b/kafka-observ-lib/panels/replicaManager.libsonnet @@ -25,8 +25,8 @@ local commonlib = import 'common-lib/common/main.libsonnet'; uncleanLeaderElectionStat: signals.clusterReplicaManager.uncleanLeaderElection.asStat() + commonlib.panels.generic.stat.base.stylize(), - preferredReplicaInbalanceStat: - signals.clusterReplicaManager.preferredReplicaInbalance.asStat() + preferredReplicaImbalanceStat: + signals.clusterReplicaManager.preferredReplicaImbalance.asStat() + commonlib.panels.generic.stat.base.stylize(), onlinePartitionsStat: diff --git a/kafka-observ-lib/prometheus_rules_out/prometheus_alerts.yaml b/kafka-observ-lib/prometheus_rules_out/prometheus_alerts.yaml index 8e34db161..f54723cab 100644 --- a/kafka-observ-lib/prometheus_rules_out/prometheus_alerts.yaml +++ b/kafka-observ-lib/prometheus_rules_out/prometheus_alerts.yaml @@ -27,8 +27,19 @@ groups: severity: critical - alert: KafkaISRExpandRate annotations: - description: Kafka broker {{ $labels.instance }} in cluster {{ $labels.kafka_cluster }} In-Sync Replica (ISR) is expanding by {{ $value }} per second. If a broker goes down, ISR for some of the partitions shrink. When that broker is up again, ISRs are expanded once the replicas are fully caught up. Other than that, the expected value for ISR expansion rate is 0. If ISR is expanding and shrinking frequently, adjust Allowed replica lag. 
- summary: Kafka ISR expansion rate is expanding. + description: | + Kafka broker {{ $labels.instance }} in cluster {{ $labels.kafka_cluster }} has In-Sync Replica (ISR) expanding at {{ printf "%.2f" $value }} per second. + + ISR expansion typically occurs when a broker recovers and its replicas catch up to the leader. The expected steady-state value for ISR expansion rate is 0. + + Frequent ISR expansion and shrinkage indicates instability and may suggest: + - Brokers frequently going offline/online + - Network connectivity issues + - Replica lag configuration too tight (adjust replica.lag.max.messages or replica.socket.timeout.ms) + - Insufficient broker resources causing replicas to fall behind + + If this alert fires frequently without corresponding broker outages, investigate broker health and adjust replica lag settings. + summary: Kafka ISR expansion detected. expr: | sum by (kafka_cluster,instance) (sum by (kafka_cluster,instance) (kafka_server_replicamanager_isrexpandspersec{}) or @@ -41,8 +52,23 @@ groups: severity: warning - alert: KafkaISRShrinkRate annotations: - description: Kafka broker {{ $labels.instance }} in cluster {{ $labels.kafka_cluster }} In-Sync Replica (ISR) is shrinking by {{ $value }} per second. If a broker goes down, ISR for some of the partitions shrink. When that broker is up again, ISRs are expanded once the replicas are fully caught up. Other than that, the expected value for ISR shrink rate is 0. If ISR is expanding and shrinking frequently, adjust Allowed replica lag. - summary: Kafka ISR expansion rate is shrinking. + description: | + Kafka broker {{ $labels.instance }} in cluster {{ $labels.kafka_cluster }} has In-Sync Replica (ISR) shrinking at {{ printf "%.2f" $value }} per second. + + ISR shrinkage occurs when a replica falls too far behind the leader and is removed from the ISR set. This reduces fault tolerance as fewer replicas are in-sync. + The expected steady-state value for ISR shrink rate is 0. + + Common causes include: + - Broker failures or restarts + - Network latency or connectivity issues + - Replica lag exceeding replica.lag.max.messages threshold + - Replica not contacting leader within replica.socket.timeout.ms + - Insufficient broker resources (CPU, disk I/O, memory) + - High producer throughput overwhelming broker capacity + + If ISR is shrinking without corresponding expansion shortly after, investigate broker health, network connectivity, and resource utilization. + Consider adjusting replica.lag.max.messages or replica.socket.timeout.ms if shrinkage is frequent but brokers are healthy. + summary: Kafka ISR shrinkage detected. expr: | sum by (kafka_cluster,instance) (sum by (kafka_cluster,instance) (rate(kafka_server_replicamanager_isrshrinks_total{}[5m])) or @@ -53,29 +79,156 @@ groups: keep_firing_for: 15m labels: severity: warning - - alert: KafkaOfflinePartitonCount + - alert: KafkaOfflinePartitionCount annotations: - description: Kafka cluster {{ $labels.kafka_cluster }} has {{ $value }} offline partitions. After successful leader election, if the leader for partition dies, then the partition moves to the OfflinePartition state. Offline partitions are not available for reading and writing. Restart the brokers, if needed, and check the logs for errors. - summary: Kafka has offline partitons. + description: | + Kafka cluster {{ $labels.kafka_cluster }} has {{ printf "%.0f" $value }} offline partitions. + + Offline partitions have no active leader, making them completely unavailable for both reads and writes. 
This directly impacts application functionality. + + Common causes include: + - All replicas for the partition are down + - No in-sync replicas available for leader election + - Cluster controller issues preventing leader election + - Insufficient replica count for the replication factor + + Immediate actions: + 1. Check broker status - identify which brokers are down + 2. Review broker logs for errors and exceptions + 3. Restart failed brokers if needed + 4. Verify ZooKeeper connectivity + 5. Check for disk space or I/O issues on broker hosts + 6. Monitor ISR status to ensure replicas are catching up + + Until resolved, affected topics cannot serve traffic for these partitions. + summary: Kafka has offline partitions. expr: | sum by (kafka_cluster) (kafka_controller_kafkacontroller_offlinepartitionscount_value{} or kafka_controller_kafkacontroller_offlinepartitionscount{}) > 0 for: 5m + keep_firing_for: 5m labels: severity: critical - alert: KafkaUnderReplicatedPartitionCount annotations: - description: Kafka broker {{ $labels.instance }} in cluster {{ $labels.kafka_cluster }} has {{ $value }} under replicated partitons - summary: Kafka has under replicated partitons. + description: | + Kafka broker {{ $labels.instance }} in cluster {{ $labels.kafka_cluster }} has {{ printf "%.0f" $value }} under-replicated partitions. + + Under-replicated partitions have fewer in-sync replicas (ISR) than the configured replication factor, reducing fault tolerance and risking data loss. + + Impact: + - Reduced data durability (fewer backup copies) + - Increased risk of data loss if additional brokers fail + - Lower fault tolerance for partition availability + + Common causes: + - Broker failures or network connectivity issues + - Brokers unable to keep up with replication (resource constraints) + - High producer throughput overwhelming replica capacity + - Disk I/O saturation on replica brokers + - Network partition between brokers + + Actions: + 1. Identify which brokers are lagging (check ISR for affected partitions) + 2. Review broker resource utilization (CPU, memory, disk I/O) + 3. Check network connectivity between brokers + 4. Verify broker logs for replication errors + 5. Consider adding broker capacity if consistently under-replicated + 6. Reassign partitions if specific brokers are problematic + summary: Kafka has under-replicated partitions. expr: | sum by (kafka_cluster,instance) (kafka_cluster_partition_underreplicated{}) > 0 for: 5m + keep_firing_for: 5m + labels: + severity: critical + - alert: KafkaUnderMinISRPartitionCount + annotations: + description: | + Kafka cluster {{ $labels.kafka_cluster }} has {{ printf "%.0f" $value }} partitions with fewer in-sync replicas than min.insync.replicas configuration. + + CRITICAL IMPACT: These partitions are UNAVAILABLE FOR WRITES when producers use acks=all, directly impacting application availability. + + This configuration prevents data loss by refusing writes when not enough replicas are in-sync, but at the cost of availability. + + Common causes: + - Broker failures reducing available replicas below threshold + - Network issues preventing replicas from staying in-sync + - Brokers overwhelmed and unable to keep up with replication + - Recent partition reassignment or broker maintenance + + Immediate actions: + 1. Identify affected partitions and their current ISR status + 2. Check broker health and availability + 3. Review network connectivity between brokers + 4. Investigate broker resource utilization (CPU, disk I/O, memory) + 5. 
Restart failed brokers or resolve broker issues + 6. Monitor ISR recovery as brokers catch up + + Producers will receive NOT_ENOUGH_REPLICAS errors until ISR count recovers above min.insync.replicas threshold. + summary: Kafka partitions below minimum ISR - writes unavailable. + expr: | + sum by (kafka_cluster) (kafka_cluster_partition_underminisr{}) > 0 + for: 2m + keep_firing_for: 5m labels: severity: critical + - alert: KafkaPreferredReplicaImbalance + annotations: + description: | + Kafka cluster {{ $labels.kafka_cluster }} has {{ $value }} partitions where the leader is not the preferred replica. + + Impact: + Uneven load distribution across brokers can result in some brokers handling significantly more client requests (produce/consume) than others, leading to hotspots, degraded performance, and potential resource exhaustion on overloaded brokers. This prevents optimal cluster utilization and can impact latency and throughput. + + Common causes: + - Broker restarts or failures causing leadership to shift to non-preferred replicas + - Manual partition reassignments or replica movements + - Recent broker additions to the cluster + - Failed automatic preferred replica election + - Auto leader rebalancing disabled (auto.leader.rebalance.enable=false) + + Actions: + 1. Verify auto.leader.rebalance.enable is set to true in broker configuration + 2. Check leader.imbalance.check.interval.seconds (default 300s) configuration + 3. Manually trigger preferred replica election using kafka-preferred-replica-election tool + 4. Monitor broker resource utilization (CPU, network) for imbalance + 5. Review broker logs for leadership election errors + 6. Verify all brokers are healthy and reachable + + If the imbalance persists for extended periods, consider running manual preferred replica election to redistribute leadership and restore balanced load across the cluster. + summary: Kafka has preferred replica imbalance. + expr: | + sum by (kafka_cluster) (kafka_controller_kafkacontroller_preferredreplicaimbalancecount_value{} + or + kafka_controller_kafkacontroller_preferredreplicaimbalancecount{}) > 0 + for: 30m + keep_firing_for: 5m + labels: + severity: warning - alert: KafkaNoActiveController annotations: - description: Kafka cluster {{ $labels.kafka_cluster }} has {{ $value }} broker(s) reporting as the active controller in the last 5 minute interval. During steady state there should be only one active controller per cluster. + description: | + Kafka cluster {{ $labels.kafka_cluster }} has {{ $value }} broker(s) reporting as the active controller. Expected exactly 1 active controller. + + CRITICAL impact: + The Kafka controller is responsible for cluster-wide administrative operations including partition leader election, broker failure detection, topic creation/deletion, and partition reassignment. 
Without an active controller (value=0) or with multiple controllers (value>1), the cluster cannot perform these critical operations, potentially causing: + - Inability to elect new partition leaders when brokers fail + - Topic creation/deletion operations hang indefinitely + - Partition reassignments cannot be executed + - Cluster metadata inconsistencies + - Split-brain scenarios if multiple controllers exist + + Common causes: + - Zookeeper connectivity issues or Zookeeper cluster instability + - Network partitions between brokers and Zookeeper + - Controller broker crash or unclean shutdown + - Long garbage collection pauses on controller broker + - Zookeeper session timeout (zookeeper.session.timeout.ms exceeded) + - Controller election conflicts during network splits + + This is a critical cluster-wide issue requiring immediate attention to restore normal operations. summary: Kafka has no active controller. expr: |- sum by(kafka_cluster) (kafka_controller_kafkacontroller_activecontrollercount_value{} @@ -86,18 +239,60 @@ groups: severity: critical - alert: KafkaUncleanLeaderElection annotations: - description: Kafka cluster {{ $labels.kafka_cluster }} has {{ $value }} unclean partition leader elections reported in the last 5 minute interval. When unclean leader election is held among out-of-sync replicas, there is a possibility of data loss if any messages were not synced prior to the loss of the former leader. So if the number of unclean elections is greater than 0, investigate broker logs to determine why leaders were re-elected, and look for WARN or ERROR messages. Consider setting the broker configuration parameter unclean.leader.election.enable to false so that a replica outside of the set of in-sync replicas is never elected leader. + description: | + Kafka cluster {{ $labels.kafka_cluster }} has {{ $value }} unclean partition leader elections reported in the last 5 minutes. + + CRITICAL Impact - DATA LOSS RISK: + Unclean leader election occurs when no in-sync replica (ISR) is available to become the leader, forcing Kafka to elect an out-of-sync replica. This WILL result in data loss for any messages that were committed to the previous leader but not replicated to the new leader. This compromises data durability guarantees and can cause: + - Permanent loss of committed messages + - Consumer offset inconsistencies + - Duplicate message processing + - Data inconsistencies between producers and consumers + - Violation of at-least-once or exactly-once semantics + + Common causes: + - All ISR replicas failed simultaneously (broker crashes, hardware failures) + - Network partitions isolating all ISR members + - Extended broker downtime exceeding replica lag tolerance + - Insufficient replication factor (RF < 3) combined with broker failures + - min.insync.replicas set too low relative to replication factor + - Disk failures on multiple replicas simultaneously + - Aggressive unclean.leader.election.enable=true configuration + + Immediate actions: + 1. Review broker logs to identify which partitions had unclean elections + 2. Investigate root cause of ISR replica failures (check broker health, hardware, network) + 3. Assess data loss impact by comparing producer and consumer offsets for affected partitions + 4. Alert application teams to potential data loss in affected partitions + 5. Bring failed ISR replicas back online as quickly as possible + 6. Consider resetting consumer offsets if data loss is unacceptable + 7. 
Review and increase replication factor for critical topics (minimum RF=3) + 8. Set unclean.leader.election.enable=false to prevent future unclean elections (availability vs. durability trade-off) + 9. Increase min.insync.replicas to strengthen durability guarantees + 10. Implement better monitoring for ISR shrinkage to detect issues before unclean elections occur + + This indicates a serious reliability event that requires immediate investigation and remediation to prevent future data loss. summary: Kafka has unclean leader elections. expr: |- (sum by (kafka_cluster,instance) (kafka_controller_controllerstats_uncleanleaderelectionspersec{}) or sum by (kafka_cluster,instance) (rate(kafka_controller_controllerstats_uncleanleaderelections_total{}[5m]))) != 0 for: 5m + keep_firing_for: 5m labels: severity: critical - alert: KafkaBrokerCount annotations: - description: Kafka cluster {{ $labels.kafka_cluster }} broker count is 0. + description: |+ + Kafka cluster {{ $labels.kafka_cluster }} has zero brokers reporting metrics. + + No brokers are online or reporting metrics, indicating complete cluster failure. This results in: + - Total unavailability of all topics and partitions + - All produce and consume operations failing + - Complete loss of cluster functionality + - Potential data loss if unclean shutdown occurred + - Application downtime for all services depending on Kafka + summary: Kafka has no brokers online. expr: |- count by(kafka_cluster) (kafka_server_kafkaserver_brokerstate{} @@ -108,8 +303,20 @@ groups: severity: critical - alert: KafkaZookeeperSyncConnect annotations: - description: Kafka broker {{ $labels.instance }} in cluster {{ $labels.kafka_cluster }} has disconected from Zookeeper. - summary: Kafka Zookeeper sync disconected. + description: | + Kafka broker {{ $labels.instance }} in cluster {{ $labels.kafka_cluster }} has lost connection to Zookeeper. + + Zookeeper connectivity is essential for Kafka broker operation. A disconnected broker cannot: + - Participate in controller elections + - Register or maintain its broker metadata + - Receive cluster state updates + - Serve as partition leader (will be removed from ISR) + - Handle leadership changes or partition reassignments + + This will cause the broker to become isolated from the cluster, leading to under-replicated partitions and potential service degradation for any topics hosted on this broker. + + Prolonged Zookeeper disconnection will result in the broker being ejected from the cluster and leadership reassignments. + summary: Kafka Zookeeper sync disconnected. 
expr: |- avg by(kafka_cluster,instance) (rate(kafka_server_sessionexpirelistener_zookeepersyncconnects_total{quantile="0.95",}[5m]) or diff --git a/kafka-observ-lib/rows.libsonnet b/kafka-observ-lib/rows.libsonnet index 9a19ded02..27a46e0ac 100644 --- a/kafka-observ-lib/rows.libsonnet +++ b/kafka-observ-lib/rows.libsonnet @@ -10,7 +10,7 @@ local commonlib = import 'common-lib/common/main.libsonnet'; panels.cluster.activeControllers { gridPos+: { w: 3, h: 4 } }, panels.cluster.brokersCount { gridPos+: { w: 3, h: 4 } }, panels.replicaManager.uncleanLeaderElectionStat { gridPos+: { w: 3, h: 4 } }, - panels.replicaManager.preferredReplicaInbalanceStat { gridPos+: { w: 3, h: 4 } }, + panels.replicaManager.preferredReplicaImbalanceStat { gridPos+: { w: 3, h: 4 } }, panels.cluster.clusterBytesBothPerSec { gridPos+: { w: 6, h: 8 } }, panels.cluster.clusterMessagesPerSec { gridPos+: { w: 6, h: 8 } }, //next row @@ -19,7 +19,7 @@ local commonlib = import 'common-lib/common/main.libsonnet'; panels.replicaManager.underReplicatedPartitionsStat { gridPos+: { w: 3, h: 4 } }, panels.replicaManager.underMinISRPartitionsStat { gridPos+: { w: 3, h: 4 } }, // status rows - panels.cluster.clusterRoles { gridPos+: { w: 24, h: 7 } }, + panels.cluster.clusterRoles { gridPos+: { w: 24, h: 10 } }, ] ), throughput: diff --git a/kafka-observ-lib/signals/broker.libsonnet b/kafka-observ-lib/signals/broker.libsonnet index f19aaa1eb..790d7b4f3 100644 --- a/kafka-observ-lib/signals/broker.libsonnet +++ b/kafka-observ-lib/signals/broker.libsonnet @@ -15,7 +15,10 @@ function(this) signals: { brokerMessagesInPerSec: { name: 'Broker messages in', - description: 'Broker messages in.', + description: ||| + Rate of incoming messages published to this broker across all topics. + Tracks producer throughput and write workload. + |||, type: 'counter', unit: 'mps', sources: { @@ -32,7 +35,10 @@ function(this) }, brokerBytesInPerSec: { name: 'Broker bytes in', - description: 'Broker bytes in rate.', + description: ||| + Rate of incoming data in bytes published to this broker from producers. + Measures network and storage write load. + |||, type: 'counter', unit: 'Bps', sources: { @@ -49,7 +55,10 @@ function(this) }, brokerBytesOutPerSec: { name: 'Broker bytes out', - description: 'Broker bytes out rate.', + description: ||| + Rate of outgoing data in bytes sent from this broker to consumers and followers. + Measures network read load and consumer throughput. + |||, type: 'counter', unit: 'Bps', sources: { diff --git a/kafka-observ-lib/signals/brokerReplicaManager.libsonnet b/kafka-observ-lib/signals/brokerReplicaManager.libsonnet index cfd35b4b5..5773d030f 100644 --- a/kafka-observ-lib/signals/brokerReplicaManager.libsonnet +++ b/kafka-observ-lib/signals/brokerReplicaManager.libsonnet @@ -80,7 +80,10 @@ function(this) onlinePartitions: { name: 'Online partitions', description: ||| - Online partitions. + Number of partitions that are currently online and available on this broker. This includes + partitions where this broker is either the leader or a follower replica. The total count + reflects the broker's share of the topic partitions across the cluster. A sudden drop in + online partitions may indicate broker issues, partition reassignments, or cluster rebalancing. |||, type: 'gauge', unit: 'short', @@ -103,7 +106,11 @@ function(this) offlinePartitions: { name: 'Offline partitions', description: ||| - Number of partitions that dont have an active leader and are hence not writable or readable. 
+ Number of partitions that don't have an active leader and are hence not writable or readable. + Offline partitions indicate a critical availability issue as producers cannot write to these + partitions and consumers cannot read from them. This typically occurs when all replicas for + a partition are down or when there are not enough in-sync replicas to elect a new leader. + Any non-zero value requires immediate investigation and remediation to restore service availability. |||, type: 'gauge', unit: 'short', @@ -125,7 +132,11 @@ function(this) underReplicatedPartitions: { name: 'Under replicated partitions', description: ||| - Number of under replicated partitions (| ISR | < | all replicas |). + Number of partitions that have fewer in-sync replicas (ISR) than the configured replication factor. + Under-replicated partitions indicate potential data availability issues, as there are fewer copies + of the data than desired. This could be caused by broker failures, network issues, or brokers + falling behind in replication. A high number of under-replicated partitions poses a risk to + data durability and availability, as the loss of additional brokers could result in data loss. |||, type: 'gauge', unit: 'short', @@ -145,7 +156,11 @@ function(this) underMinISRPartitions: { name: 'Under min ISR partitions', description: ||| - Under min ISR(In-Sync replicas) partitions. + Number of partitions that have fewer in-sync replicas (ISR) than the configured minimum ISR threshold. + When the number of ISRs for a partition falls below the min.insync.replicas setting, the partition + becomes unavailable for writes (if acks=all is configured), which helps prevent data loss but impacts + availability. This metric indicates potential issues with broker availability, network connectivity, + or replication lag that need immediate attention to restore write availability. |||, type: 'gauge', unit: 'short', @@ -165,7 +180,12 @@ function(this) uncleanLeaderElection: { name: 'Unclean leader election', description: ||| - Unclean leader election rate. + Rate of unclean leader elections occurring in the cluster. An unclean leader election happens + when a partition leader fails and a replica that was not fully in-sync (not in the ISR) is + elected as the new leader. This results in potential data loss as the new leader may be missing + messages that were committed to the previous leader. Unclean elections occur when unclean.leader.election.enable + is set to true and there are no in-sync replicas available. Any occurrence of unclean elections + indicates a serious problem with cluster availability and replication health that risks data integrity. |||, type: 'raw', unit: 'short', @@ -182,10 +202,16 @@ function(this) }, }, }, - preferredReplicaInbalance: { - name: 'Preferred replica inbalance', + preferredReplicaImbalance: { + name: 'Preferred replica imbalance', description: ||| - The count of topic partitions for which the leader is not the preferred leader. + The count of topic partitions for which the leader is not the preferred leader. In Kafka, + each partition has a preferred leader replica (typically the first replica in the replica list). + When leadership is not on the preferred replica, the cluster may experience uneven load distribution + across brokers, leading to performance imbalances. This can occur after broker failures and restarts, + or during cluster maintenance. Running the preferred replica election can help rebalance leadership + and optimize cluster performance. 
A consistently high imbalance may indicate issues with automatic + leader rebalancing or the need for manual intervention. |||, type: 'gauge', unit: 'short', diff --git a/kafka-observ-lib/signals/cluster.libsonnet b/kafka-observ-lib/signals/cluster.libsonnet index e5f33b4db..95a9b5729 100644 --- a/kafka-observ-lib/signals/cluster.libsonnet +++ b/kafka-observ-lib/signals/cluster.libsonnet @@ -17,7 +17,9 @@ function(this) activeControllers: { name: 'Active kafka controllers', description: ||| - Active kafka controllers count. + Number of active controllers in the cluster. Should always be exactly 1. + Zero indicates no controller elected, preventing cluster operations. + More than one indicates split-brain requiring immediate attention. |||, type: 'gauge', unit: 'short', @@ -41,7 +43,10 @@ function(this) role: { name: 'Current role', description: ||| - 0 - follower, 1 - controller. + Broker's current controller role: 0 indicates follower, 1 indicates active controller. + Only one broker should have value 1 at any time. + Used to identify which broker is managing cluster metadata and leadership. + A change in which broker reports 1 indicates that a controller re-election has occurred. |||, type: 'gauge', unit: 'short', @@ -119,7 +124,9 @@ function(this) kraftBrokerRole: { name: 'Current role (kraft)', description: ||| - Any value - broker in kraft. + Broker state in KRaft mode (Kafka without ZooKeeper). + Any value indicates the broker is running in KRaft mode. + Used to identify KRaft-enabled brokers in the cluster. |||, type: 'gauge', unit: 'short', @@ -155,7 +162,7 @@ function(this) brokersCount: { name: 'Brokers count', description: ||| - Active brokers count. + Total number of active brokers currently registered and reporting in the cluster. |||, type: 'gauge', unit: 'short', @@ -178,7 +185,10 @@ clusterMessagesInPerSec: { name: 'Cluster messages in', - description: 'Cluster messages in.', + description: ||| + Aggregate rate of incoming messages across all brokers and topics in the cluster. + Represents total producer throughput and write workload. + |||, type: 'counter', unit: 'mps', sources: { @@ -195,7 +205,10 @@ }, clusterBytesInPerSec: { name: 'Cluster bytes in', - description: 'Cluster bytes in rate.', + description: ||| + Aggregate rate of incoming data in bytes across all brokers from producers. + Measures total network ingress and storage write load. + |||, type: 'counter', unit: 'Bps', sources: { @@ -212,7 +225,10 @@ }, clusterBytesOutPerSec: { name: 'Cluster bytes out', - description: 'Cluster bytes out rate.', + description: ||| + Aggregate rate of outgoing data in bytes across all brokers to consumers and followers. + Measures total network egress load and consumer throughput. + |||, type: 'counter', unit: 'Bps', sources: { diff --git a/kafka-observ-lib/signals/consumerGroup.libsonnet b/kafka-observ-lib/signals/consumerGroup.libsonnet index 8141b5cbd..5022ac060 100644 --- a/kafka-observ-lib/signals/consumerGroup.libsonnet +++ b/kafka-observ-lib/signals/consumerGroup.libsonnet @@ -16,7 +16,11 @@ function(this) signals: { consumerGroupLag: { name: 'Consumer group lag', - description: 'Current approximate lag of a ConsumerGroup at Topic/Partition.', + description: ||| + Number of messages a consumer group is behind the latest available offset for a topic partition. + High or growing lag indicates consumers can't keep up with producer throughput. + Critical metric for consumer health and real-time processing requirements.
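// Consumer group lag, as described above, is the gap between a partition's latest
// offset and the group's committed offset. A minimal sketch of that arithmetic
// (offsets are illustrative only):
local partition = { logEndOffset: 120500, committedOffset: 118200 };

// 2300 messages still waiting to be processed by the group on this partition:
partition.logEndOffset - partition.committedOffset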
+ |||, type: 'gauge', unit: 'short', aggFunction: 'sum', @@ -35,7 +39,11 @@ function(this) consumerGroupLagTime: { name: 'Consumer group lag in ms', - description: 'Current approximate lag of a ConsumerGroup at Topic/Partition.', + description: ||| + Time lag in milliseconds between message production and consumption for a consumer group. + Represents real-time delay in message processing. + More intuitive than message count lag for understanding business impact of delays. + |||, type: 'gauge', unit: 'ms', optional: true, @@ -50,7 +58,11 @@ function(this) consumerGroupConsumeRate: { name: 'Consumer group consume rate', - description: 'Consumer group consume rate.', + description: ||| + Rate at which a consumer group is consuming and committing offsets for a topic. + Measures consumer throughput and processing speed. + Should match or exceed producer rate to prevent growing lag. + |||, type: 'counter', unit: 'mps', sources: { diff --git a/kafka-observ-lib/signals/conversion.libsonnet b/kafka-observ-lib/signals/conversion.libsonnet index 3f46cdcd0..9486cfb22 100644 --- a/kafka-observ-lib/signals/conversion.libsonnet +++ b/kafka-observ-lib/signals/conversion.libsonnet @@ -15,7 +15,11 @@ function(this) signals: { producerConversion: { name: 'Message conversion (producer)', - description: 'The number of messages produced converted to match the log.message.format.version.', + description: ||| + Rate of producer messages requiring format conversion to match broker's log.message.format.version. + Conversions add CPU overhead and latency. + Non-zero values suggest producer and broker version mismatches requiring alignment. + |||, type: 'counter', unit: 'mps', sources: { @@ -32,7 +36,11 @@ function(this) }, consumerConversion: { name: 'Message conversion (consumer)', - description: 'The number of messages consumed converted at consumer to match the log.message.format.version.', + description: ||| + Rate of messages requiring format conversion during consumer fetch to match log.message.format.version. + Conversions impact broker CPU and consumer latency. + Indicates version mismatch between stored messages and consumer expectations. + |||, type: 'counter', unit: 'mps', sources: { diff --git a/kafka-observ-lib/signals/topic.libsonnet b/kafka-observ-lib/signals/topic.libsonnet index 53562f454..8619fa4e1 100644 --- a/kafka-observ-lib/signals/topic.libsonnet +++ b/kafka-observ-lib/signals/topic.libsonnet @@ -16,7 +16,11 @@ function(this) signals: { topicMessagesPerSec: { name: 'Messages in per second', - description: 'Messages in per second.', + description: ||| + Rate of messages produced to this topic across all partitions. + Indicates topic write activity and producer throughput. + Use to identify hot topics and understand data flow patterns. + |||, type: 'counter', unit: 'mps', sources: { @@ -31,7 +35,9 @@ function(this) // used in table: topicMessagesPerSecByPartition: { name: 'Messages in per second', - description: 'Messages in per second.', + description: ||| + Rate of messages produced to each partition within this topic. + |||, type: 'counter', unit: 'mps', legendCustomTemplate: '{{ topic }}/{{ partition }}', @@ -47,7 +53,9 @@ function(this) // JMX exporter extras topicBytesInPerSec: { name: 'Topic bytes in', - description: 'Topic bytes in rate.', + description: ||| + Rate of incoming data in bytes written to this topic from producers. 
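// The "should match or exceed producer rate" note on consumerGroupConsumeRate above
// is a simple balance: lag grows at (produce rate - consume rate). A sketch with
// illustrative numbers only:
local produceRatePerSec = 5000;
local consumeRatePerSec = 4200;

{
  lagGrowthPerSec: produceRatePerSec - consumeRatePerSec,            // 800 msg/s behind
  extraLagPerHour: (produceRatePerSec - consumeRatePerSec) * 3600,   // 2880000 messages
}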
+ |||, type: 'counter', unit: 'Bps', sources: { @@ -67,7 +75,9 @@ function(this) }, topicBytesOutPerSec: { name: 'Topic bytes out', - description: 'Topic bytes out rate.', + description: ||| + Rate of outgoing data in bytes read from this topic by consumers. + |||, type: 'counter', unit: 'Bps', sources: { @@ -87,7 +97,9 @@ function(this) }, topicLogStartOffset: { name: 'Topic start offset', - description: 'Topic start offset.', + description: ||| + Earliest available offset for each partition due to retention or deletion. + |||, type: 'gauge', unit: 'none', aggFunction: 'max', @@ -109,7 +121,11 @@ function(this) }, topicLogEndOffset: { name: 'Topic end offset', - description: 'Topic end offset.', + description: ||| + Latest offset (high water mark) for each partition representing newest available message. + Continuously increases as new messages arrive. + Difference between end and start offsets indicates total messages available. + |||, type: 'gauge', unit: 'none', aggFunction: 'max', @@ -126,7 +142,10 @@ function(this) }, topicLogSize: { name: 'Topic log size', - description: 'Size in bytes of the current topic-partition.', + description: ||| + Total size in bytes of data stored on disk for each topic partition. + Grows with incoming messages and shrinks with retention cleanup. + |||, type: 'gauge', unit: 'decbytes', aggFunction: 'max', diff --git a/kafka-observ-lib/signals/totalTime.libsonnet b/kafka-observ-lib/signals/totalTime.libsonnet index 8155d3738..0ecfb2b49 100644 --- a/kafka-observ-lib/signals/totalTime.libsonnet +++ b/kafka-observ-lib/signals/totalTime.libsonnet @@ -16,24 +16,27 @@ function(this) signals: { local commonRequestQueueDescription = ||| - A high value can imply there aren't enough IO threads or the CPU is a bottleneck, - or the request queue isnt large enough. The request queue size should match the number of connections. + High values indicate insufficient IO threads, CPU bottlenecks, or undersized request queue. + Queue size should match connection count. |||, local commonLocalDescription = ||| - In most cases, a high value can imply slow local storage or the storage is a bottleneck. One should also investigate LogFlushRateAndTimeMs to know how long page flushes are taking, which will also indicate a slow disk. In the case of FetchFollower requests, time spent in LocalTimeMs can be the result of a ZooKeeper write to change the ISR. + High values often indicate slow storage or disk bottlenecks. + Check LogFlushRateAndTimeMs for disk performance issues. |||, - local commonRemoteDesription = ||| - A high value can imply a slow network connection. For fetch request, if the remote time is high, it could be that there is not enough data to give in a fetch response. This can happen when the consumer or replica is caught up and there is no new incoming data. If this is the case, remote time will be close to the max wait time, which is normal. Max wait time is configured via replica.fetch.wait.max.ms and fetch.max.wait.ms. + local commonRemoteDescription = ||| + For fetch requests, high values may indicate caught-up consumers with no new data (normal if near max wait time). + Configure via replica.fetch.wait.max.ms and fetch.max.wait.ms. |||, local commonResponseQueueDescription = ||| - A high value can imply there aren't enough network threads or the network cant dequeue responses quickly enough, causing back pressure in the response queue. + High values indicate insufficient network threads or slow network dequeue causing backpressure. 
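// The per-phase request timings documented in totalTime.libsonnet above (request
// queue, local, remote, response queue, response send) roughly add up to the total
// request time. A sketch with illustrative numbers for a single fetch request:
local fetchPhasesMs = {
  requestQueue: 1.5,    // waiting for an IO thread
  'local': 4.0,         // reading from local log segments
  remote: 480.0,        // waiting for data or followers
  responseQueue: 0.8,   // waiting for a network thread
  responseSend: 2.2,    // writing the response to the socket
};

// ~488.5 ms total; for fetches, a large remote share is often just a caught-up
// consumer waiting out fetch.max.wait.ms, as noted above, rather than a problem.
fetchPhasesMs.requestQueue + fetchPhasesMs['local'] + fetchPhasesMs.remote
+ fetchPhasesMs.responseQueue + fetchPhasesMs.responseSend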
|||, local commonResponseDescription = ||| - A high value can imply the zero-copy from disk to the network is slow, or the network is the bottleneck because the network cant dequeue responses of the TCP socket as quickly as theyre being created. If the network buffer gets full, Kafka will block. + High values indicate slow zero-copy operations or network saturation. + Network buffer fullness can cause Kafka to block. |||, fetchQueueTime: { @@ -74,7 +77,7 @@ function(this) fetchRemoteTime: { name: 'Fetch-consumer remote time', description: "Time spent waiting for follower response (only when 'require acks' is set)." - + '\n' + commonRemoteDesription, + + '\n' + commonRemoteDescription, type: 'gauge', unit: 'ms', @@ -163,7 +166,7 @@ function(this) fetchFollowerRemoteTime: { name: 'Fetch-follower remote time', description: "Time spent waiting for follower response (only when 'require acks' is set)." - + '\n' + commonRemoteDesription, + + '\n' + commonRemoteDescription, type: 'gauge', unit: 'ms', sources: { @@ -251,7 +254,7 @@ function(this) producerRemoteTime: { name: 'Produce follower remote time', description: "Time spent waiting for follower response (only when 'require acks' is set)." - + '\n' + commonRemoteDesription, + + '\n' + commonRemoteDescription, type: 'gauge', unit: 'ms', sources: { diff --git a/kafka-observ-lib/signals/zookeeperClient.libsonnet b/kafka-observ-lib/signals/zookeeperClient.libsonnet index 430b81e8b..f11534a1f 100644 --- a/kafka-observ-lib/signals/zookeeperClient.libsonnet +++ b/kafka-observ-lib/signals/zookeeperClient.libsonnet @@ -15,7 +15,11 @@ function(this) signals: { zookeeperRequestLatency: { name: 'Zookeeper request latency', - description: 'Latency in millseconds for ZooKeeper requests from broker.', + description: ||| + Latency in milliseconds for ZooKeeper requests from broker to ZooKeeper ensemble. + High latency indicates ZooKeeper performance issues or network problems. + Critical for broker operations like leader election and metadata updates. + |||, type: 'gauge', unit: 'ms', sources: { @@ -30,7 +34,11 @@ function(this) }, zookeeperConnections: { name: 'Zookeeper connections', - description: 'Zookeeper connections rate.', + description: ||| + Rate of successful ZooKeeper connections established by broker. + Frequent connections may indicate session instability or network issues. + Should be stable in healthy clusters with occasional reconnections during maintenance. + |||, type: 'counter', unit: 'short', optional: true, @@ -45,7 +53,11 @@ function(this) }, zookeeperExpiredConnections: { name: 'Zookeeper expired connections', - description: 'Zookeeper expired connections rate.', + description: ||| + Rate of ZooKeeper session expirations from broker. + Expirations cause broker to lose cluster membership temporarily. + Indicates GC pauses, network issues, or ZooKeeper overload requiring investigation. + |||, type: 'counter', unit: 'short', optional: true, @@ -60,7 +72,11 @@ function(this) }, zookeeperDisconnects: { name: 'Zookeeper disconnects', - description: 'Zookeeper disconnects rate.', + description: ||| + Rate of ZooKeeper disconnections from broker. + Frequent disconnects indicate unstable ZooKeeper connectivity or network problems. + Can lead to ISR changes and performance degradation if persistent. 
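// zookeeperExpiredConnections above ties session expirations to GC pauses: a broker
// that cannot heartbeat ZooKeeper within the session timeout loses its session and
// temporarily drops out of the cluster. A sketch of that relationship; 18000 ms
// mirrors a common zookeeper.session.timeout.ms setting, not necessarily this
// cluster's value:
local sessionTimeoutMs = 18000;
local observedGcPauseMs = 21000;

// true here: a stop-the-world pause longer than the session timeout surfaces as an
// expired connection followed by the broker re-registering with the cluster.
observedGcPauseMs > sessionTimeoutMs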
+ |||, type: 'counter', unit: 'short', optional: true, @@ -75,7 +91,11 @@ function(this) }, zookeeperAuthFailures: { name: 'Zookeeper auth failures', - description: 'Zookeeper auth failures from Kafka.', + description: ||| + Rate of ZooKeeper authentication failures from broker. + Indicates incorrect credentials, ACL issues, or security configuration problems. + Prevents broker from accessing ZooKeeper data and requires immediate security review. + |||, type: 'counter', unit: 'short', optional: true,
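// For ad-hoc investigation of the ZooKeeper client signals above, the same rates can
// be queried directly in PromQL. The selector mirrors the
// kafka_server_sessionexpirelistener_zookeepersyncconnects_total series used earlier
// in this patch; the disconnects/authfailures metric names here are assumptions by
// analogy and may differ depending on the JMX exporter configuration in use.
local zkClientRate(metric) = |||
  sum by (kafka_cluster, instance) (
    rate(%s[5m])
  )
||| % metric;

{
  disconnects: zkClientRate('kafka_server_sessionexpirelistener_zookeeperdisconnects_total'),
  authFailures: zkClientRate('kafka_server_sessionexpirelistener_zookeeperauthfailures_total'),
}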