Skip to content

Commit 4bfc7b7

Browse files
authored
Merge pull request kmesh-net#1105 from LiZhenCheng9527/add-metric
add workload metrics
2 parents 35b7b22 + 510bcb1 commit 4bfc7b7

File tree

5 files changed

+41
-16
lines changed

5 files changed

+41
-16
lines changed

bpf/kmesh/probes/tcp_probe.h

Lines changed: 0 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -45,14 +45,7 @@ struct tcp_probe_info {
4545
__u32 protocol;
4646
__u32 srtt_us; /* smoothed round trip time << 3 in usecs */
4747
__u32 rtt_min;
48-
__u32 mss_cache; /* Cached effective mss, not including SACKS */
4948
__u32 total_retrans; /* Total retransmits for entire connection */
50-
__u32 segs_in; /* RFC4898 tcpEStatsPerfSegsIn
51-
* total number of segments in.
52-
*/
53-
__u32 segs_out; /* RFC4898 tcpEStatsPerfSegsOut
54-
* The total number of segments sent.
55-
*/
5649
__u32 lost_out; /* Lost packets */
5750
};
5851

@@ -108,10 +101,7 @@ static inline void get_tcp_probe_info(struct bpf_tcp_sock *tcp_sock, struct tcp_
108101
info->received_bytes = tcp_sock->bytes_received;
109102
info->srtt_us = tcp_sock->srtt_us;
110103
info->rtt_min = tcp_sock->rtt_min;
111-
info->mss_cache = tcp_sock->mss_cache;
112104
info->total_retrans = tcp_sock->total_retrans;
113-
info->segs_in = tcp_sock->segs_in;
114-
info->segs_out = tcp_sock->segs_out;
115105
info->lost_out = tcp_sock->lost_out;
116106
return;
117107
}

pkg/controller/telemetry/accesslog.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -88,7 +88,7 @@ func buildAccesslog(data requestMetric, accesslog logInfo) string {
8888
timeInfo := fmt.Sprintf("%v", uptime)
8989
sourceInfo := fmt.Sprintf("src.addr=%s, src.workload=%s, src.namespace=%s", accesslog.sourceAddress, accesslog.sourceWorkload, accesslog.sourceNamespace)
9090
destinationInfo := fmt.Sprintf("dst.addr=%s, dst.service=%s, dst.workload=%s, dst.namespace=%s", accesslog.destinationAddress, accesslog.destinationService, accesslog.destinationWorkload, accesslog.destinationNamespace)
91-
connectionInfo := fmt.Sprintf("direction=%s, sent_bytes=%d, received_bytes=%d, duration=%vms", accesslog.direction, data.sentBytes, data.receivedBytes, (float64(data.duration) / 1000000.0))
91+
connectionInfo := fmt.Sprintf("direction=%s, sent_bytes=%d, received_bytes=%d, srtt=%dus, min_rtt=%dus, duration=%vms", accesslog.direction, data.sentBytes, data.receivedBytes, data.srtt, data.minRtt, (float64(data.duration) / 1000000.0))
9292

9393
logResult := fmt.Sprintf("%s %s, %s, %s", timeInfo, sourceInfo, destinationInfo, connectionInfo)
9494
return logResult

pkg/controller/telemetry/accesslog_test.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -53,7 +53,7 @@ func Test_buildAccesslog(t *testing.T) {
5353
destinationNamespace: "kmesh-system",
5454
},
5555
},
56-
want: "2024-08-14 10:11:27.005837715 +0000 UTC src.addr=10.244.0.10:47667, src.workload=sleep-7656cf8794-9v2gv, src.namespace=kmesh-system, dst.addr=10.244.0.7:8080, dst.service=httpbin.ambient-demo.svc.cluster.local, dst.workload=httpbin-86b8ffc5ff-bhvxx, dst.namespace=kmesh-system, direction=INBOUND, sent_bytes=60, received_bytes=172, duration=2.236ms",
56+
want: "2024-08-14 10:11:27.005837715 +0000 UTC src.addr=10.244.0.10:47667, src.workload=sleep-7656cf8794-9v2gv, src.namespace=kmesh-system, dst.addr=10.244.0.7:8080, dst.service=httpbin.ambient-demo.svc.cluster.local, dst.workload=httpbin-86b8ffc5ff-bhvxx, dst.namespace=kmesh-system, direction=INBOUND, sent_bytes=60, received_bytes=172, srtt=0us, min_rtt=0us, duration=2.236ms",
5757
},
5858
}
5959
osStartTime = time.Date(2024, 7, 4, 20, 14, 0, 0, time.UTC)

pkg/controller/telemetry/metric.go

Lines changed: 22 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,8 @@ const (
4242

4343
connection_success = uint32(1)
4444

45+
MSG_LEN = 96
46+
4547
metricFlushInterval = 5 * time.Second
4648

4749
DEFAULT_UNKNOWN = "-"
@@ -66,6 +68,8 @@ type workloadMetricInfo struct {
6668
WorkloadConnSentBytes float64
6769
WorkloadConnReceivedBytes float64
6870
WorkloadConnFailed float64
71+
WorkloadConnTotalRetrans float64
72+
WorkloadConnPacketLost float64
6973
}
7074

7175
type serviceMetricInfo struct {
@@ -88,10 +92,7 @@ type statistics struct {
8892
Protocol uint32
8993
SRttTime uint32
9094
RttMin uint32
91-
MssCache uint32
9295
Retransmits uint32
93-
SegmentsIn uint32
94-
SegmentsOut uint32
9596
LostPackets uint32
9697
}
9798

@@ -135,6 +136,10 @@ type requestMetric struct {
135136
success uint32
136137
duration uint64
137138
closeTime uint64
139+
srtt uint32
140+
minRtt uint32
141+
totalRetrans uint32
142+
PacketLost uint32
138143
}
139144

140145
type workloadMetricLabels struct {
@@ -408,6 +413,10 @@ func buildV4Metric(buf *bytes.Buffer) (requestMetric, error) {
408413
data.success = connectData.ConnectSuccess
409414
data.duration = connectData.Duration
410415
data.closeTime = connectData.CloseTime
416+
data.srtt = connectData.statistics.SRttTime
417+
data.minRtt = connectData.statistics.RttMin
418+
data.totalRetrans = connectData.statistics.Retransmits
419+
data.PacketLost = connectData.statistics.LostPackets
411420

412421
return data, nil
413422
}
@@ -441,6 +450,10 @@ func buildV6Metric(buf *bytes.Buffer) (requestMetric, error) {
441450
data.success = connectData.ConnectSuccess
442451
data.duration = connectData.Duration
443452
data.closeTime = connectData.CloseTime
453+
data.srtt = connectData.statistics.SRttTime
454+
data.minRtt = connectData.statistics.RttMin
455+
data.totalRetrans = connectData.statistics.Retransmits
456+
data.PacketLost = connectData.statistics.LostPackets
444457

445458
return data, nil
446459
}
@@ -607,6 +620,8 @@ func (m *MetricController) updateWorkloadMetricCache(data requestMetric, labels
607620
}
608621
v.WorkloadConnReceivedBytes = v.WorkloadConnReceivedBytes + float64(data.receivedBytes)
609622
v.WorkloadConnSentBytes = v.WorkloadConnSentBytes + float64(data.sentBytes)
623+
v.WorkloadConnTotalRetrans = v.WorkloadConnTotalRetrans + float64(data.totalRetrans)
624+
v.WorkloadConnPacketLost = v.WorkloadConnPacketLost + float64(data.PacketLost)
610625
} else {
611626
newWorkloadMetricInfo := workloadMetricInfo{}
612627
if data.state == TCP_ESTABLISHED {
@@ -620,6 +635,8 @@ func (m *MetricController) updateWorkloadMetricCache(data requestMetric, labels
620635
}
621636
newWorkloadMetricInfo.WorkloadConnReceivedBytes = float64(data.receivedBytes)
622637
newWorkloadMetricInfo.WorkloadConnSentBytes = float64(data.sentBytes)
638+
newWorkloadMetricInfo.WorkloadConnTotalRetrans = float64(data.totalRetrans)
639+
newWorkloadMetricInfo.WorkloadConnPacketLost = float64(data.PacketLost)
623640
m.workloadMetricCache[labels] = &newWorkloadMetricInfo
624641
}
625642
}
@@ -670,6 +687,8 @@ func (m *MetricController) updatePrometheusMetric() {
670687
tcpSentBytesInWorkload.With(workloadLabels).Add(v.WorkloadConnSentBytes)
671688
tcpReceivedBytesInWorkload.With(workloadLabels).Add(v.WorkloadConnReceivedBytes)
672689
tcpConnectionFailedInWorkload.With(workloadLabels).Add(v.WorkloadConnFailed)
690+
tcpConnectionTotalRetransInWorkload.With(workloadLabels).Add(v.WorkloadConnTotalRetrans)
691+
tcpConnectionPacketLostInWorkload.With(workloadLabels).Add(v.WorkloadConnPacketLost)
673692
}
674693

675694
for k, v := range serviceInfoCache {

pkg/controller/telemetry/utils.go

Lines changed: 17 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -168,6 +168,18 @@ var (
168168
Help: "The total number of TCP connections failed to a workload.",
169169
}, workloadLabels)
170170

171+
tcpConnectionTotalRetransInWorkload = prometheus.NewGaugeVec(
172+
prometheus.GaugeOpts{
173+
Name: "kmesh_tcp_retrans_total",
174+
Help: "Total number of retransmissions of the workload over the TCP connection.",
175+
}, workloadLabels)
176+
177+
tcpConnectionPacketLostInWorkload = prometheus.NewGaugeVec(
178+
prometheus.GaugeOpts{
179+
Name: "kmesh_tcp_packet_loss_total",
180+
Help: "Tracks the total number of TCP packets lost between source and destination.",
181+
}, workloadLabels)
182+
171183
tcpConnectionOpenedInService = prometheus.NewGaugeVec(prometheus.GaugeOpts{
172184
Name: "kmesh_tcp_connections_opened_total",
173185
Help: "The total number of TCP connections opened to a service",
@@ -243,7 +255,7 @@ func runPrometheusClient(registry *prometheus.Registry) {
243255
// ensure the same request pattern (such as /status/metric) is not matched twice, which causes a panic in unit tests
244256
mu.Lock()
245257
defer mu.Unlock()
246-
registry.MustRegister(tcpConnectionOpenedInWorkload, tcpConnectionClosedInWorkload, tcpReceivedBytesInWorkload, tcpSentBytesInWorkload)
258+
registry.MustRegister(tcpConnectionOpenedInWorkload, tcpConnectionClosedInWorkload, tcpReceivedBytesInWorkload, tcpSentBytesInWorkload, tcpConnectionTotalRetransInWorkload, tcpConnectionPacketLostInWorkload)
247259
registry.MustRegister(tcpConnectionOpenedInService, tcpConnectionClosedInService, tcpReceivedBytesInService, tcpSentBytesInService)
248260
registry.MustRegister(bpfProgOpDuration, bpfProgOpCount)
249261
registry.MustRegister(mapEntryCount, mapCountInNode)
@@ -272,12 +284,16 @@ func deleteWorkloadMetricInPrometheus(workload *workloadapi.Workload) {
272284
_ = tcpConnectionOpenedInWorkload.DeletePartialMatch(prometheus.Labels{"destination_pod_name": workload.Name, "destination_pod_namespace": workload.Namespace})
273285
_ = tcpReceivedBytesInWorkload.DeletePartialMatch(prometheus.Labels{"destination_pod_name": workload.Name, "destination_pod_namespace": workload.Namespace})
274286
_ = tcpSentBytesInWorkload.DeletePartialMatch(prometheus.Labels{"destination_pod_name": workload.Name, "destination_pod_namespace": workload.Namespace})
287+
_ = tcpConnectionTotalRetransInWorkload.DeletePartialMatch(prometheus.Labels{"destination_pod_name": workload.Name, "destination_pod_namespace": workload.Namespace})
288+
_ = tcpConnectionPacketLostInWorkload.DeletePartialMatch(prometheus.Labels{"destination_pod_name": workload.Name, "destination_pod_namespace": workload.Namespace})
275289
// delete source workload metric labels
276290
_ = tcpConnectionClosedInWorkload.DeletePartialMatch(prometheus.Labels{"source_workload": workload.Name, "source_workload_namespace": workload.Namespace})
277291
_ = tcpConnectionFailedInWorkload.DeletePartialMatch(prometheus.Labels{"source_workload": workload.Name, "source_workload_namespace": workload.Namespace})
278292
_ = tcpConnectionOpenedInWorkload.DeletePartialMatch(prometheus.Labels{"source_workload": workload.Name, "source_workload_namespace": workload.Namespace})
279293
_ = tcpReceivedBytesInWorkload.DeletePartialMatch(prometheus.Labels{"source_workload": workload.Name, "source_workload_namespace": workload.Namespace})
280294
_ = tcpSentBytesInWorkload.DeletePartialMatch(prometheus.Labels{"source_workload": workload.Name, "source_workload_namespace": workload.Namespace})
295+
_ = tcpConnectionTotalRetransInWorkload.DeletePartialMatch(prometheus.Labels{"source_workload": workload.Name, "source_workload_namespace": workload.Namespace})
296+
_ = tcpConnectionPacketLostInWorkload.DeletePartialMatch(prometheus.Labels{"source_workload": workload.Name, "source_workload_namespace": workload.Namespace})
281297
}
282298

283299
func DeleteServiceMetric(serviceName string) {

0 commit comments

Comments
 (0)