Skip to content

Commit 54c3c34

Browse files
authored
Merge pull request #155297 from cockroachdb/blathers/backport-release-25.4-155069
release-25.4: kvserver: improve WAL metric descriptions
2 parents d70350c + 4ae4e40 commit 54c3c34

File tree

2 files changed

+30
-20
lines changed

2 files changed

+30
-20
lines changed

docs/generated/metrics/metrics.yaml

Lines changed: 11 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -10669,13 +10669,13 @@ layers:
1066910669
essential: true
1067010670
- name: storage.wal.fsync.latency
1067110671
exported_name: storage_wal_fsync_latency
10672-
description: The write ahead log fsync latency
10672+
description: The fsync latency to the Write-Ahead Log device.
1067310673
y_axis_label: Fsync Latency
1067410674
type: HISTOGRAM
1067510675
unit: NANOSECONDS
1067610676
aggregation: AVG
1067710677
derivative: NONE
10678-
how_to_use: If this value is greater than `100ms`, it is an indication of a disk stall. To mitigate the effects of disk stalls, consider deploying your cluster with WAL failover configured.
10678+
how_to_use: If this value is greater than 100ms, it is an indication of a disk stall. To mitigate the effects of disk stalls, consider deploying your cluster with WAL failover configured. When WAL failover is configured, the more relevant metric is storage.wal.failover.write_and_sync.latency, as this metric reflects the fsync latency of the primary and/or the secondary WAL device.
1067910679
essential: true
1068010680
- name: storage.write-stalls
1068110681
exported_name: storage_write_stalls
@@ -17403,36 +17403,38 @@ layers:
1740317403
derivative: NON_NEGATIVE_DERIVATIVE
1740417404
- name: storage.wal.bytes_in
1740517405
exported_name: storage_wal_bytes_in
17406-
description: The number of logical bytes the storage engine has written to the WAL
17406+
description: The number of logical bytes the storage engine has written to the Write-Ahead Log.
1740717407
y_axis_label: Events
1740817408
type: COUNTER
1740917409
unit: COUNT
1741017410
aggregation: AVG
1741117411
derivative: NON_NEGATIVE_DERIVATIVE
1741217412
- name: storage.wal.bytes_written
1741317413
exported_name: storage_wal_bytes_written
17414-
description: The number of bytes the storage engine has written to the WAL
17414+
description: The number of bytes the storage engine has written to the Write-Ahead Log.
1741517415
y_axis_label: Events
1741617416
type: COUNTER
1741717417
unit: COUNT
1741817418
aggregation: AVG
1741917419
derivative: NON_NEGATIVE_DERIVATIVE
1742017420
- name: storage.wal.failover.primary.duration
1742117421
exported_name: storage_wal_failover_primary_duration
17422-
description: Cumulative time spent writing to the primary WAL directory. Only populated when WAL failover is configured
17422+
description: Cumulative time spent writing to the primary WAL directory.
1742317423
y_axis_label: Nanoseconds
1742417424
type: COUNTER
1742517425
unit: NANOSECONDS
1742617426
aggregation: AVG
1742717427
derivative: NON_NEGATIVE_DERIVATIVE
17428+
how_to_use: Only populated when WAL failover is configured.
1742817429
- name: storage.wal.failover.secondary.duration
1742917430
exported_name: storage_wal_failover_secondary_duration
17430-
description: Cumulative time spent writing to the secondary WAL directory. Only populated when WAL failover is configured
17431+
description: Cumulative time spent writing to the secondary WAL directory.
1743117432
y_axis_label: Nanoseconds
1743217433
type: COUNTER
1743317434
unit: NANOSECONDS
1743417435
aggregation: AVG
1743517436
derivative: NON_NEGATIVE_DERIVATIVE
17437+
how_to_use: Only populated when WAL failover is configured.
1743617438
- name: storage.wal.failover.switch.count
1743717439
exported_name: storage_wal_failover_switch_count
1743817440
description: Count of the number of times WAL writing has switched from primary to secondary and vice versa.
@@ -17441,14 +17443,16 @@ layers:
1744117443
unit: COUNT
1744217444
aggregation: AVG
1744317445
derivative: NON_NEGATIVE_DERIVATIVE
17446+
how_to_use: Only populated when WAL failover is configured. A high switch count indicates that many disk stalls were encountered.
1744417447
- name: storage.wal.failover.write_and_sync.latency
1744517448
exported_name: storage_wal_failover_write_and_sync_latency
17446-
description: The observed latency for writing and syncing to the write ahead log. Only populated when WAL failover is configured
17449+
description: The observed latency for writing and syncing to the logical Write-Ahead Log.
1744717450
y_axis_label: Nanoseconds
1744817451
type: HISTOGRAM
1744917452
unit: NANOSECONDS
1745017453
aggregation: AVG
1745117454
derivative: NONE
17455+
how_to_use: Only populated when WAL failover is configured. Without WAL failover, the relevant metric is storage.wal.fsync.latency.
1745217456
- name: storage.write-amplification
1745317457
exported_name: storage_write_amplification
1745417458
description: |-

pkg/kv/kvserver/metrics.go

Lines changed: 19 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -2767,52 +2767,58 @@ Note that the measurement does not include the duration for replicating the eval
27672767
}
27682768
metaWALBytesWritten = metric.Metadata{
27692769
Name: "storage.wal.bytes_written",
2770-
Help: "The number of bytes the storage engine has written to the WAL",
2770+
Help: "The number of bytes the storage engine has written to the Write-Ahead Log.",
27712771
Measurement: "Events",
27722772
Unit: metric.Unit_COUNT,
27732773
}
27742774
metaWALBytesIn = metric.Metadata{
27752775
Name: "storage.wal.bytes_in",
2776-
Help: "The number of logical bytes the storage engine has written to the WAL",
2776+
Help: "The number of logical bytes the storage engine has written to the Write-Ahead Log.",
27772777
Measurement: "Events",
27782778
Unit: metric.Unit_COUNT,
27792779
}
27802780
metaStorageFsyncLatency = metric.Metadata{
27812781
Name: "storage.wal.fsync.latency",
2782-
Help: "The write ahead log fsync latency",
2782+
Help: "The fsync latency to the Write-Ahead Log device.",
27832783
Measurement: "Fsync Latency",
27842784
Unit: metric.Unit_NANOSECONDS,
27852785
Essential: true,
27862786
Category: metric.Metadata_STORAGE,
2787-
HowToUse: "If this value is greater than `100ms`, it is an indication of a disk stall. To mitigate the effects of disk stalls, consider deploying your cluster with WAL failover configured.",
2787+
HowToUse: "If this value is greater than 100ms, it is an indication of a disk stall. " +
2788+
"To mitigate the effects of disk stalls, consider deploying your cluster with WAL failover configured. " +
2789+
"When WAL failover is configured, the more relevant metric is storage.wal.failover.write_and_sync.latency, as " +
2790+
"this metric reflects the fsync latency of the primary and/or the secondary WAL device.",
27882791
}
27892792
metaStorageWALFailoverSwitchCount = metric.Metadata{
27902793
Name: "storage.wal.failover.switch.count",
27912794
Help: "Count of the number of times WAL writing has switched from primary to secondary " +
27922795
"and vice versa.",
27932796
Measurement: "Events",
27942797
Unit: metric.Unit_COUNT,
2798+
HowToUse: "Only populated when WAL failover is configured. A high switch count indicates that " +
2799+
"many disk stalls were encountered.",
27952800
}
27962801
metaStorageWALFailoverPrimaryDuration = metric.Metadata{
2797-
Name: "storage.wal.failover.primary.duration",
2798-
Help: "Cumulative time spent writing to the primary WAL directory. Only populated " +
2799-
"when WAL failover is configured",
2802+
Name: "storage.wal.failover.primary.duration",
2803+
Help: "Cumulative time spent writing to the primary WAL directory.",
28002804
Measurement: "Nanoseconds",
28012805
Unit: metric.Unit_NANOSECONDS,
2806+
HowToUse: "Only populated when WAL failover is configured.",
28022807
}
28032808
metaStorageWALFailoverSecondaryDuration = metric.Metadata{
2804-
Name: "storage.wal.failover.secondary.duration",
2805-
Help: "Cumulative time spent writing to the secondary WAL directory. Only populated " +
2806-
"when WAL failover is configured",
2809+
Name: "storage.wal.failover.secondary.duration",
2810+
Help: "Cumulative time spent writing to the secondary WAL directory.",
28072811
Measurement: "Nanoseconds",
28082812
Unit: metric.Unit_NANOSECONDS,
2813+
HowToUse: "Only populated when WAL failover is configured.",
28092814
}
28102815
metaStorageWALFailoverWriteAndSyncLatency = metric.Metadata{
2811-
Name: "storage.wal.failover.write_and_sync.latency",
2812-
Help: "The observed latency for writing and syncing to the write ahead log. Only populated " +
2813-
"when WAL failover is configured",
2816+
Name: "storage.wal.failover.write_and_sync.latency",
2817+
Help: "The observed latency for writing and syncing to the logical Write-Ahead Log.",
28142818
Measurement: "Nanoseconds",
28152819
Unit: metric.Unit_NANOSECONDS,
2820+
HowToUse: "Only populated when WAL failover is configured. Without WAL failover, the relevant " +
2821+
"metric is storage.wal.fsync.latency.",
28162822
}
28172823
metaReplicaReadBatchDroppedLatchesBeforeEval = metric.Metadata{
28182824
Name: "kv.replica_read_batch_evaluate.dropped_latches_before_eval",

0 commit comments

Comments
 (0)