From 16b12d1bb90c71fa403a0e4df69e318488ccb6ec Mon Sep 17 00:00:00 2001 From: sgangopadhyay Date: Wed, 13 Aug 2025 14:29:23 +0530 Subject: [PATCH 01/19] GATEWAYS-4306: exporting metrics for conntrack per zone --- go.mod | 33 +++++++--- go.sum | 96 ++++++++--------------------- internal/ovsexporter/conntrack.go | 75 ++++++++++++++++++++++ internal/ovsexporter/ovsexporter.go | 29 +++++++-- 4 files changed, 152 insertions(+), 81 deletions(-) create mode 100644 internal/ovsexporter/conntrack.go diff --git a/go.mod b/go.mod index 20f9632..fb8e7af 100644 --- a/go.mod +++ b/go.mod @@ -1,18 +1,35 @@ module github.com/digitalocean/openvswitch_exporter -go 1.15 +go 1.23.0 + +toolchain go1.24.2 require ( github.com/digitalocean/go-openvswitch v0.0.0-20201214180534-ce0f183468d8 - github.com/google/go-cmp v0.5.4 // indirect - github.com/matttproud/golang_protobuf_extensions v1.0.1 // indirect - github.com/mdlayher/netlink v1.3.2 // indirect github.com/prometheus/client_golang v1.9.0 + github.com/prometheus/prometheus v2.2.1-0.20180315085919-58e2a31db8de+incompatible +) + +require ( + github.com/beorn7/perks v1.0.1 // indirect + github.com/cespare/xxhash/v2 v2.1.1 // indirect + github.com/golang/protobuf v1.4.3 // indirect + github.com/google/go-cmp v0.7.0 // indirect + github.com/josharian/native v1.1.0 // indirect + github.com/matttproud/golang_protobuf_extensions v1.0.1 // indirect + github.com/mdlayher/genetlink v1.0.0 // indirect + github.com/mdlayher/netlink v1.7.2 // indirect + github.com/mdlayher/socket v0.5.1 // indirect + github.com/pkg/errors v0.9.1 // indirect + github.com/prometheus/client_model v0.2.0 // indirect github.com/prometheus/common v0.17.0 // indirect github.com/prometheus/procfs v0.6.0 // indirect - github.com/prometheus/prometheus v2.2.1-0.20180315085919-58e2a31db8de+incompatible - golang.org/x/net v0.0.0-20210222171744-9060382bd457 // indirect - golang.org/x/sync v0.0.0-20210220032951-036812b2e83c // indirect - golang.org/x/sys v0.0.0-20210223095934-7937bea0104d // indirect + github.com/ti-mo/conntrack v0.5.2 // indirect + github.com/ti-mo/netfilter v0.5.3 // indirect + golang.org/x/net v0.39.0 // indirect + golang.org/x/sync v0.14.0 // indirect + golang.org/x/sys v0.34.0 // indirect google.golang.org/protobuf v1.25.0 // indirect ) +replace github.com/digitalocean/go-openvswitch => /Users/sgangopadhyay/dev/digitalocean/go-openvswitch + diff --git a/go.sum b/go.sum index 1d1c526..b2e238f 100644 --- a/go.sum +++ b/go.sum @@ -20,8 +20,6 @@ github.com/aryann/difflib v0.0.0-20170710044230-e206f873d14a/go.mod h1:DAHtR1m6l github.com/aws/aws-lambda-go v1.13.3/go.mod h1:4UKl9IzQMoD+QF79YdCuzCwp8VbmG4VAQwij/eHl5CU= github.com/aws/aws-sdk-go v1.27.0/go.mod h1:KmX6BPdI08NWTb3/sm4ZGu5ShLoqVDhKgpiN924inxo= github.com/aws/aws-sdk-go-v2 v0.18.0/go.mod h1:JWVYvqSMppoMJC0x5wdwiImzgXTI9FuZwxzkQq9wy+g= -github.com/beorn7/perks v0.0.0-20160804104726-4c0e84591b9a h1:BtpsbiV638WQZwhA98cEZw2BsbnQJrbd0BI7tsy0W1c= -github.com/beorn7/perks v0.0.0-20160804104726-4c0e84591b9a/go.mod h1:Dwedo/Wpr24TaqPxmxbtue+5NUziq4I4S80YR8gNf3Q= github.com/beorn7/perks v0.0.0-20180321164747-3a771d992973/go.mod h1:Dwedo/Wpr24TaqPxmxbtue+5NUziq4I4S80YR8gNf3Q= github.com/beorn7/perks v1.0.0/go.mod h1:KWe93zE9D1o94FZ5RNwFwVgaQK1VOXiVxmqh+CedLV8= github.com/beorn7/perks v1.0.1 h1:VlbKKnNfV8bJzeqoa4cOKqO6bYr3WgKZxO8Z16+hsOM= @@ -42,12 +40,9 @@ github.com/coreos/pkg v0.0.0-20160727233714-3ac0863d7acf/go.mod h1:E3G3o1h8I7cfc github.com/cpuguy83/go-md2man/v2 v2.0.0-20190314233015-f79a8a8ca69d/go.mod h1:maD7wRr/U5Z6m/iR4s+kqSMx2CaBsrgA7czyZG/E6dU= github.com/creack/pty v1.1.7/go.mod h1:lj5s0c3V2DBrqTV7llrYr5NG6My20zk30Fl46Y7DoTY= github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/dgrijalva/jwt-go v3.2.0+incompatible/go.mod h1:E3ru+11k8xSBh+hMPgOLZmtrrCbhqsmaPHjLKYnJCaQ= -github.com/digitalocean/go-openvswitch v0.0.0-20180412190941-6a4a47d93e43 h1:WbVAw/VDkXvaFyMOkJRzKBE6bf9PY7PAfrsOY3RHnIE= -github.com/digitalocean/go-openvswitch v0.0.0-20180412190941-6a4a47d93e43/go.mod h1:MpzfscrezUxa94/T4sy2tDaxB+hQ6w0EmRBPv+xHWEs= -github.com/digitalocean/go-openvswitch v0.0.0-20201214180534-ce0f183468d8 h1:RQAD2flP6n+U5sAudMpru+EuLJ6VQduu6yenl6LwM5E= -github.com/digitalocean/go-openvswitch v0.0.0-20201214180534-ce0f183468d8/go.mod h1:MpzfscrezUxa94/T4sy2tDaxB+hQ6w0EmRBPv+xHWEs= github.com/dustin/go-humanize v0.0.0-20171111073723-bb3d318650d4/go.mod h1:HtrtbFcZ19U5GC7JDqmcUSB87Iq5E25KnS6fMYU6eOk= github.com/eapache/go-resiliency v1.1.0/go.mod h1:kFI+JgMyC7bLPUVY133qvEBtVayf5mFgVsvEsIPBvNs= github.com/eapache/go-xerial-snappy v0.0.0-20180814174437-776d5712da21/go.mod h1:+020luEh2TKB4/GOp8oxxtq0Daoen/Cii55CzbTV6DU= @@ -77,8 +72,6 @@ github.com/golang/glog v0.0.0-20160126235308-23def4e6c14b/go.mod h1:SBH7ygxi8pfU github.com/golang/groupcache v0.0.0-20160516000752-02826c3e7903/go.mod h1:cIg4eruTrX1D+g88fzRXU5OdNfaM+9IcxsU14FzY7Hc= github.com/golang/groupcache v0.0.0-20190702054246-869f871628b6/go.mod h1:cIg4eruTrX1D+g88fzRXU5OdNfaM+9IcxsU14FzY7Hc= github.com/golang/mock v1.1.1/go.mod h1:oTYuIxOrZwtPieC+H1uAHpcLFnEyAGVDL/k47Jfbm0A= -github.com/golang/protobuf v0.0.0-20171021043952-1643683e1b54 h1:nRNJXiJvemchkOTn0V4U11TZkvacB94gTzbTZbSA7Rw= -github.com/golang/protobuf v0.0.0-20171021043952-1643683e1b54/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U= github.com/golang/protobuf v1.2.0/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U= github.com/golang/protobuf v1.3.1/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U= github.com/golang/protobuf v1.3.2/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U= @@ -99,9 +92,9 @@ github.com/google/go-cmp v0.3.0/go.mod h1:8QqcDgzrUqlUb/G2PQTWiueGozuR1884gddMyw github.com/google/go-cmp v0.3.1/go.mod h1:8QqcDgzrUqlUb/G2PQTWiueGozuR1884gddMywk6iLU= github.com/google/go-cmp v0.4.0/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= github.com/google/go-cmp v0.5.0/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= -github.com/google/go-cmp v0.5.2/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= -github.com/google/go-cmp v0.5.4 h1:L8R9j+yAqZuZjsqh/z+F1NCffTKKLShY6zXTItVIZ8M= github.com/google/go-cmp v0.5.4/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= +github.com/google/go-cmp v0.7.0 h1:wk8382ETsv4JYUZwIsn6YpYiWiBsYLSJiTsyBybVuN8= +github.com/google/go-cmp v0.7.0/go.mod h1:pXiqmnSA92OHEEa9HXL2W4E7lf9JzCmGVUdgjX3N/iU= github.com/google/gofuzz v1.0.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg= github.com/google/renameio v0.1.0/go.mod h1:KWCgfxg9yswjAJkECMjeO8J8rahYeXnNhOm40UhjYkI= github.com/google/uuid v1.0.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= @@ -139,16 +132,10 @@ github.com/inconshreveable/mousetrap v1.0.0/go.mod h1:PxqpIevigyE2G7u3NXJIT2ANyt github.com/influxdata/influxdb1-client v0.0.0-20191209144304-8bf82d3c094d/go.mod h1:qj24IKcXYK6Iy9ceXlo3Tc+vtHo9lIhSX5JddghvEPo= github.com/jmespath/go-jmespath v0.0.0-20180206201540-c2b33e8439af/go.mod h1:Nht3zPeWKUH0NzdCt2Blrr5ys8VGpn0CEB0cQHVjt7k= github.com/jonboulle/clockwork v0.1.0/go.mod h1:Ii8DK3G1RaLaWxj9trq07+26W01tbo22gdxWY5EU2bo= -github.com/josharian/native v0.0.0-20200817173448-b6b71def0850 h1:uhL5Gw7BINiiPAo24A2sxkcDI0Jt/sqp1v5xQCniEFA= -github.com/josharian/native v0.0.0-20200817173448-b6b71def0850/go.mod h1:7X/raswPFr05uY3HiLlYeyQntB6OO7E/d2Cu7qoaN2w= +github.com/josharian/native v1.1.0 h1:uuaP0hAbW7Y4l0ZRQ6C9zfb7Mg1mbFKry/xzDAfmtLA= +github.com/josharian/native v1.1.0/go.mod h1:7X/raswPFr05uY3HiLlYeyQntB6OO7E/d2Cu7qoaN2w= github.com/jpillora/backoff v1.0.0/go.mod h1:J/6gKK9jxlEcS3zixgDgUAsiuZ7yrSoa/FX5e0EB2j4= github.com/jsimonetti/rtnetlink v0.0.0-20190606172950-9527aa82566a/go.mod h1:Oz+70psSo5OFh8DBl0Zv2ACw7Esh6pPUphlvZG9x7uw= -github.com/jsimonetti/rtnetlink v0.0.0-20200117123717-f846d4f6c1f4/go.mod h1:WGuG/smIU4J/54PblvSbh+xvCZmpJnFgr3ds6Z55XMQ= -github.com/jsimonetti/rtnetlink v0.0.0-20201009170750-9c6f07d100c1/go.mod h1:hqoO/u39cqLeBLebZ8fWdE96O7FxrAsRYhnVOdgHxok= -github.com/jsimonetti/rtnetlink v0.0.0-20201216134343-bde56ed16391/go.mod h1:cR77jAZG3Y3bsb8hF6fHJbFoyFukLFOkQ98S0pQz3xw= -github.com/jsimonetti/rtnetlink v0.0.0-20201220180245-69540ac93943/go.mod h1:z4c53zj6Eex712ROyh8WI0ihysb5j2ROyV42iNogmAs= -github.com/jsimonetti/rtnetlink v0.0.0-20210122163228-8d122574c736/go.mod h1:ZXpIyOK59ZnN7J0BV99cZUPmsqDRZ3eq5X+st7u/oSA= -github.com/jsimonetti/rtnetlink v0.0.0-20210212075122-66c871082f2b/go.mod h1:8w9Rh8m+aHZIG69YPGGem1i5VzoyRC8nw2kA8B+ik5U= github.com/json-iterator/go v1.1.6/go.mod h1:+SdeFBvtyEkXs7REEP0seUULqWtbJapLOCVDaaPEHmU= github.com/json-iterator/go v1.1.7/go.mod h1:KdQUCv79m/52Kvf8AW2vK1V8akMuk1QjK/uOdHXbAo4= github.com/json-iterator/go v1.1.8/go.mod h1:KdQUCv79m/52Kvf8AW2vK1V8akMuk1QjK/uOdHXbAo4= @@ -173,23 +160,14 @@ github.com/mattn/go-isatty v0.0.4/go.mod h1:M+lRXTBqGeGNdLjl/ufCoiOlB5xdOkqRJdNx github.com/mattn/go-runewidth v0.0.2/go.mod h1:LwmH8dsx7+W8Uxz3IHJYH5QSwggIsqBzpuz5H//U1FU= github.com/matttproud/golang_protobuf_extensions v1.0.1 h1:4hp9jkHxhMHkqkrB3Ix0jegS5sx/RkqARlsWZ6pIwiU= github.com/matttproud/golang_protobuf_extensions v1.0.1/go.mod h1:D8He9yQNgCq6Z5Ld7szi9bcBfOoFv/3dc6xSMkL2PC0= -github.com/mdlayher/ethtool v0.0.0-20210210192532-2b88debcdd43/go.mod h1:+t7E0lkKfbBsebllff1xdTmyJt8lH37niI6kwFk9OTo= -github.com/mdlayher/genetlink v0.0.0-20170901181924-76fecce4c787 h1:Tbivh+kRjFJUTZmMic7LcmuzfEF/HV42ZRMY0LiQ2dU= -github.com/mdlayher/genetlink v0.0.0-20170901181924-76fecce4c787/go.mod h1:EOrmeik1bDMaRduo2B+uAYe1HmTq6yF2IMDmJi1GoWk= github.com/mdlayher/genetlink v1.0.0 h1:OoHN1OdyEIkScEmRgxLEe2M9U8ClMytqA5niynLtfj0= github.com/mdlayher/genetlink v1.0.0/go.mod h1:0rJ0h4itni50A86M2kHcgS85ttZazNt7a8H2a2cw0Gc= -github.com/mdlayher/netlink v0.0.0-20180326144912-dc216978b479 h1:MF+m/B1wWGiOBY92ORRiv6hGcRBX4KHqNoYIO+y2Owo= -github.com/mdlayher/netlink v0.0.0-20180326144912-dc216978b479/go.mod h1:a3TlQHkJH2m32RF224Z7LhD5N4mpyR8eUbCoYHywrwg= github.com/mdlayher/netlink v0.0.0-20190409211403-11939a169225/go.mod h1:eQB3mZE4aiYnlUsyGGCOpPETfdQq4Jhsgf1fk3cwQaA= github.com/mdlayher/netlink v1.0.0/go.mod h1:KxeJAFOFLG6AjpyDkQ/iIhxygIUKD+vcwqcnu43w/+M= -github.com/mdlayher/netlink v1.1.0/go.mod h1:H4WCitaheIsdF9yOYu8CFmCgQthAPIWZmcKp9uZHgmY= -github.com/mdlayher/netlink v1.1.1/go.mod h1:WTYpFb/WTvlRJAyKhZL5/uy69TDDpHHu2VZmb2XgV7o= -github.com/mdlayher/netlink v1.2.0/go.mod h1:kwVW1io0AZy9A1E2YYgaD4Cj+C+GPkU6klXCMzIJ9p8= -github.com/mdlayher/netlink v1.2.1/go.mod h1:bacnNlfhqHqqLo4WsYeXSqfyXkInQ9JneWI68v1KwSU= -github.com/mdlayher/netlink v1.2.2-0.20210123213345-5cc92139ae3e/go.mod h1:bacnNlfhqHqqLo4WsYeXSqfyXkInQ9JneWI68v1KwSU= -github.com/mdlayher/netlink v1.3.0/go.mod h1:xK/BssKuwcRXHrtN04UBkwQ6dY9VviGGuriDdoPSWys= -github.com/mdlayher/netlink v1.3.2 h1:fMZOU2/M7PRMzGM3br5l1N2fu6bPSHtRytmQ338a9iA= -github.com/mdlayher/netlink v1.3.2/go.mod h1:dRJi5IABcZpBD2A3D0Mv/AiX8I9uDEu5oGkAVrekmf8= +github.com/mdlayher/netlink v1.7.2 h1:/UtM3ofJap7Vl4QWCPDGXY8d3GIY2UGSDbK+QWmY8/g= +github.com/mdlayher/netlink v1.7.2/go.mod h1:xraEF7uJbxLhc5fpHL4cPe221LI2bdttWlU+ZGLfQSw= +github.com/mdlayher/socket v0.5.1 h1:VZaqt6RkGkt2OE9l3GcC6nZkqD3xKeQLyfleW/uBcos= +github.com/mdlayher/socket v0.5.1/go.mod h1:TjPLHI1UgwEv5J1B5q0zTZq12A/6H7nKmtTanQE37IQ= github.com/miekg/dns v1.0.14/go.mod h1:W1PPwlIAgtquWBMBEV9nkV9Cazfe8ScdGz/Lj7v3Nrg= github.com/mitchellh/cli v1.0.0/go.mod h1:hNIlj7HEI86fIcpObd7a0FcrxTWetlwJDGcceTlRvqc= github.com/mitchellh/go-homedir v1.0.0/go.mod h1:SfyaCUpYCn1Vlf4IUYiD9fPX4A5wJrkLzIz1N1q0pr0= @@ -234,12 +212,12 @@ github.com/pierrec/lz4 v1.0.2-0.20190131084431-473cd7ce01a1/go.mod h1:3/3N9NVKO0 github.com/pierrec/lz4 v2.0.5+incompatible/go.mod h1:pdkljMzZIN41W+lC3N2tnIh5sFi+IEE17M5jbnwPHcY= github.com/pkg/errors v0.8.0/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0= github.com/pkg/errors v0.8.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0= +github.com/pkg/errors v0.9.1 h1:FEBLx1zS214owpjy7qsBeixbURkuhQAwrK5UwLGTwt4= github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0= github.com/pkg/profile v1.2.1/go.mod h1:hJw3o1OdXxsrSjjVksARp5W95eeEaEfptyVZyv6JUPA= +github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= github.com/posener/complete v1.1.1/go.mod h1:em0nMJCgc9GFtwrmVmEMR/ZL6WyhyjMBndrE9hABlRI= -github.com/prometheus/client_golang v0.9.0-pre1.0.20171005112915-5cec1d0429b0 h1:eIVGl4K1clOaKdGaS+KSUEOwF+g2g2aIEsmikqXqRgY= -github.com/prometheus/client_golang v0.9.0-pre1.0.20171005112915-5cec1d0429b0/go.mod h1:7SWBe2y4D6OKWSNQJUaRYU/AaXPKyh/dDVn+NZz0KFw= github.com/prometheus/client_golang v0.9.1/go.mod h1:7SWBe2y4D6OKWSNQJUaRYU/AaXPKyh/dDVn+NZz0KFw= github.com/prometheus/client_golang v0.9.3-0.20190127221311-3c4408c8b829/go.mod h1:p2iRAGwDERtqlqzRXnrOVns+ignqQo//hLXqYxZYVNs= github.com/prometheus/client_golang v1.0.0/go.mod h1:db9x61etRT2tGnBNRi70OPL5FsnadC4Ky3P0J6CfImo= @@ -247,8 +225,6 @@ github.com/prometheus/client_golang v1.3.0/go.mod h1:hJaj2vgQTGQmVCsAACORcieXFeD github.com/prometheus/client_golang v1.7.1/go.mod h1:PY5Wy2awLA44sXw4AOSfFBetzPP4j5+D6mVACh+pe2M= github.com/prometheus/client_golang v1.9.0 h1:Rrch9mh17XcxvEu9D9DEpb4isxjGBtcevQjKvxPRQIU= github.com/prometheus/client_golang v1.9.0/go.mod h1:FqZLKOZnGdFAhOK4nqGHa7D66IdsO+O441Eve7ptJDU= -github.com/prometheus/client_model v0.0.0-20170216185247-6f3806018612 h1:13pIdM2tpaDi4OVe24fgoIS7ZTqMt0QI+bwQsX5hq+g= -github.com/prometheus/client_model v0.0.0-20170216185247-6f3806018612/go.mod h1:MbSGuTsp3dbXC40dX6PRTWyKYBIrTGTE9sqQNg2J8bo= github.com/prometheus/client_model v0.0.0-20180712105110-5c3871d89910/go.mod h1:MbSGuTsp3dbXC40dX6PRTWyKYBIrTGTE9sqQNg2J8bo= github.com/prometheus/client_model v0.0.0-20190115171406-56726106282f/go.mod h1:MbSGuTsp3dbXC40dX6PRTWyKYBIrTGTE9sqQNg2J8bo= github.com/prometheus/client_model v0.0.0-20190129233127-fd36f4220a90/go.mod h1:xMI15A0UPsDsEKsMN9yxemIoYk6Tm2C1GtYGdfGttqA= @@ -256,8 +232,6 @@ github.com/prometheus/client_model v0.0.0-20190812154241-14fe0d1b01d4/go.mod h1: github.com/prometheus/client_model v0.1.0/go.mod h1:xMI15A0UPsDsEKsMN9yxemIoYk6Tm2C1GtYGdfGttqA= github.com/prometheus/client_model v0.2.0 h1:uq5h0d+GuxiXLJLNABMgp2qUWDPiLvgCzz2dUR+/W/M= github.com/prometheus/client_model v0.2.0/go.mod h1:xMI15A0UPsDsEKsMN9yxemIoYk6Tm2C1GtYGdfGttqA= -github.com/prometheus/common v0.0.0-20171006141418-1bab55dd05db h1:PmL7nSW2mvuotGlJKuvUcSI/eE86zwYUcIAGoB6eHBk= -github.com/prometheus/common v0.0.0-20171006141418-1bab55dd05db/go.mod h1:daVV7qP5qjZbuso7PdcryaAu0sAZbrN9i7WWcTMWvro= github.com/prometheus/common v0.2.0/go.mod h1:TNfzLD0ON7rHzMJeJkieUDPYmFC7Snx/y86RQel1bk4= github.com/prometheus/common v0.4.1/go.mod h1:TNfzLD0ON7rHzMJeJkieUDPYmFC7Snx/y86RQel1bk4= github.com/prometheus/common v0.7.0/go.mod h1:DjGbpBbp5NYNiECxcL/VnbXCCaQpKd3tt26CguLLsqA= @@ -265,8 +239,6 @@ github.com/prometheus/common v0.10.0/go.mod h1:Tlit/dnDKsSWFlCLTWaA1cyBgKHSMdTB8 github.com/prometheus/common v0.15.0/go.mod h1:U+gB1OBLb1lF3O42bTCL+FK18tX9Oar16Clt/msog/s= github.com/prometheus/common v0.17.0 h1:kDIZLI74SS+3tedSvEkykgBkD7txMxaJAPj8DtJUKYA= github.com/prometheus/common v0.17.0/go.mod h1:U+gB1OBLb1lF3O42bTCL+FK18tX9Oar16Clt/msog/s= -github.com/prometheus/procfs v0.0.0-20171226183907-b15cd069a834 h1:HRxr4uZnx/S86wVQsfXcKhadpzdceXn2qCzCtagcI6w= -github.com/prometheus/procfs v0.0.0-20171226183907-b15cd069a834/go.mod h1:c3At6R/oaqEKCNdg8wHV1ftS6bRYblBhIjjI8uT2IGk= github.com/prometheus/procfs v0.0.0-20181005140218-185b4288413d/go.mod h1:c3At6R/oaqEKCNdg8wHV1ftS6bRYblBhIjjI8uT2IGk= github.com/prometheus/procfs v0.0.0-20190117184657-bf6a532e95b1/go.mod h1:c3At6R/oaqEKCNdg8wHV1ftS6bRYblBhIjjI8uT2IGk= github.com/prometheus/procfs v0.0.2/go.mod h1:TjEm7ze935MbeOT/UhFTIMYKhuLP4wbCsTZCD3I8kEA= @@ -302,9 +274,17 @@ github.com/stretchr/objx v0.1.1/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+ github.com/stretchr/testify v1.2.2/go.mod h1:a8OnRcib4nhh0OaRAV+Yts87kKdq0PP7pXfy6kDkUVs= github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI= github.com/stretchr/testify v1.4.0/go.mod h1:j7eGeouHqKxXV5pUuKE4zz7dFj8WfuZ+81PSLYec5m4= +github.com/stretchr/testify v1.8.4 h1:CcVxjf3Q8PM0mHUKJCdn+eZZtm5yQwehR5yeSVQQcUk= +github.com/stretchr/testify v1.8.4/go.mod h1:sz/lmYIOXD/1dqDmKjjqLyZ2RngseejIcXlSw2iwfAo= +github.com/ti-mo/conntrack v0.5.2 h1:PQ7MCdFjniEiTJT+qsAysREUsT5iH62/VNyhkB06HOI= +github.com/ti-mo/conntrack v0.5.2/go.mod h1:4HZrFQQLOSuBzgQNid3H/wYyyp1kfGXUYxueXjIGibo= +github.com/ti-mo/netfilter v0.5.3 h1:ikzduvnaUMwre5bhbNwWOd6bjqLMVb33vv0XXbK0xGQ= +github.com/ti-mo/netfilter v0.5.3/go.mod h1:08SyBCg6hu1qyQk4s3DjjJKNrm3RTb32nm6AzyT972E= github.com/tmc/grpc-websocket-proxy v0.0.0-20170815181823-89b8d40f7ca8/go.mod h1:ncp9v5uamzpCO7NfCPTXjqaC+bZgJeR0sMTm6dMHP7U= github.com/urfave/cli v1.20.0/go.mod h1:70zkFmudgCuE/ngEzBv17Jvp/497gISqfk5gWijbERA= github.com/urfave/cli v1.22.1/go.mod h1:Gos4lmkARVdJ6EkW0WaNv/tZAAMe9V7XWyB60NtXRu0= +github.com/vishvananda/netns v0.0.4 h1:Oeaw1EM2JMxD51g9uhtC0D7erkIjgmj8+JZc26m1YX8= +github.com/vishvananda/netns v0.0.4/go.mod h1:SpkAiCQRtJ6TvvxPnOSyH3BMl6unz3xZlaprSwhNNJM= github.com/xiang90/probing v0.0.0-20190116061207-43a291ad63a2/go.mod h1:UETIi67q53MR2AWcXfiuqkDkRtnGDLqkBTpCHuJHxtU= go.etcd.io/bbolt v1.3.3/go.mod h1:IbVyRI1SCnLcuJnV2u8VeU0CEYM7e686BmAb1XKL+uU= go.etcd.io/etcd v0.0.0-20191023171146-3cf2f69b5738/go.mod h1:dnLIgRNXwCJa5e+c6mIZCrds/GIG4ncV9HhK5PX7jPg= @@ -333,8 +313,6 @@ golang.org/x/lint v0.0.0-20190313153728-d0100b6bd8b3/go.mod h1:6SW0HCj/g11FgYtHl golang.org/x/lint v0.0.0-20190930215403-16217165b5de/go.mod h1:6SW0HCj/g11FgYtHlgUYUwCkIfeOF89ocIRzGO/8vkc= golang.org/x/mod v0.0.0-20190513183733-4bf6d317e70e/go.mod h1:mXi4GBBbnImb6dmsKGUJ2LatrhH/nqhxcFungHvyanc= golang.org/x/mod v0.1.1-0.20191105210325-c90efee705ee/go.mod h1:QqPTAvyqsEbceGzBzNggFXnrqF1CaUcvgkdR5Ot7KZg= -golang.org/x/net v0.0.0-20170614204310-ddf80d097059 h1:gMF+Wxxy27FCUvSZhKB22yNezu60IyLC37MHpj45QXs= -golang.org/x/net v0.0.0-20170614204310-ddf80d097059/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= golang.org/x/net v0.0.0-20180724234803-3673e40ba225/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= golang.org/x/net v0.0.0-20180826012351-8a410e7b638d/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= golang.org/x/net v0.0.0-20180906233101-161cd47e91fd/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= @@ -353,15 +331,9 @@ golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLL golang.org/x/net v0.0.0-20190813141303-74dc4d7220e7/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= golang.org/x/net v0.0.0-20190827160401-ba9fcec4b297/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= golang.org/x/net v0.0.0-20191007182048-72f939374954/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= -golang.org/x/net v0.0.0-20200202094626-16171245cfb2/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= golang.org/x/net v0.0.0-20200625001655-4c5254603344/go.mod h1:/O7V0waA8r7cgGh81Ro3o1hOxt32SMVPicZroKQ2sZA= -golang.org/x/net v0.0.0-20201010224723-4f7140c49acb/go.mod h1:sp8m0HH+o8qH0wwXwYZr8TS3Oi6o0r6Gce1SSxlDquU= -golang.org/x/net v0.0.0-20201110031124-69a78807bb2b/go.mod h1:sp8m0HH+o8qH0wwXwYZr8TS3Oi6o0r6Gce1SSxlDquU= -golang.org/x/net v0.0.0-20201216054612-986b41b23924/go.mod h1:m0MpNAwzfU5UDzcl9v0D8zg8gWTRqZa9RBIspLL5mdg= -golang.org/x/net v0.0.0-20201224014010-6772e930b67b/go.mod h1:m0MpNAwzfU5UDzcl9v0D8zg8gWTRqZa9RBIspLL5mdg= -golang.org/x/net v0.0.0-20210119194325-5f4716e94777/go.mod h1:m0MpNAwzfU5UDzcl9v0D8zg8gWTRqZa9RBIspLL5mdg= -golang.org/x/net v0.0.0-20210222171744-9060382bd457 h1:hMm9lBjyNLe/c9C6bElQxp4wsrleaJn1vXMZIQkNN44= -golang.org/x/net v0.0.0-20210222171744-9060382bd457/go.mod h1:m0MpNAwzfU5UDzcl9v0D8zg8gWTRqZa9RBIspLL5mdg= +golang.org/x/net v0.39.0 h1:ZCu7HMWDxpXpaiKdhzIfaltL9Lp31x/3fCP11bc6/fY= +golang.org/x/net v0.39.0/go.mod h1:X7NRbYVEA+ewNkCNyJ513WmMdQ3BineSwVtN2zD/d+E= golang.org/x/oauth2 v0.0.0-20180821212333-d2e6202438be/go.mod h1:N/0e6XlmueqKjAGxoOufVs8QHGRruUQn6yWY3a++T0U= golang.org/x/oauth2 v0.0.0-20190226205417-e64efc72b421/go.mod h1:gOpvHmFTYa4IltrdGE7lF6nIHvwfUNPOp7c8zoXwtLw= golang.org/x/sync v0.0.0-20180314180146-1d60e4601c6f/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= @@ -371,10 +343,8 @@ golang.org/x/sync v0.0.0-20190227155943-e225da77a7e6/go.mod h1:RxMgew5VJxzue5/jJ golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20190911185100-cd5d95a43a6e/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20201207232520-09787c993a3a/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= -golang.org/x/sync v0.0.0-20210220032951-036812b2e83c h1:5KslGYwFpkhGh+Q16bwMP3cOontH8FOep7tGV86Y7SQ= -golang.org/x/sync v0.0.0-20210220032951-036812b2e83c/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= -golang.org/x/sys v0.0.0-20180420145319-79b0c6888797 h1:ux9vYny+vlzqIcwoO6gRu+voPvKJA10ZceuJwWf2J88= -golang.org/x/sys v0.0.0-20180420145319-79b0c6888797/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= +golang.org/x/sync v0.14.0 h1:woo0S4Yywslg6hp4eUFjTVOyKt0RookbpAHG4c1HmhQ= +golang.org/x/sync v0.14.0/go.mod h1:1dzgHSNfp02xaA81J2MS99Qcpr2w7fw1gpm99rleRqA= golang.org/x/sys v0.0.0-20180823144017-11551d06cbcc/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= golang.org/x/sys v0.0.0-20180830151530-49385e6e1522/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= golang.org/x/sys v0.0.0-20180905080454-ebe1bf3edb33/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= @@ -394,26 +364,15 @@ golang.org/x/sys v0.0.0-20190826190057-c7b8b68b1456/go.mod h1:h1NjWce9XRLGQEsW7w golang.org/x/sys v0.0.0-20191008105621-543471e840be/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20191220142924-d4481acd189f/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20200106162015-b016eb3dc98e/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= -golang.org/x/sys v0.0.0-20200202164722-d101bd2416d5/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20200323222414-85ca7c5b95cd/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20200615200032-f1bc736245b1/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20200625212154-ddb9806d33ae/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= -golang.org/x/sys v0.0.0-20200930185726-fdedc70b468f/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= -golang.org/x/sys v0.0.0-20201009025420-dfb3f7c4e634/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= -golang.org/x/sys v0.0.0-20201118182958-a01c418693c7/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= -golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20201214210602-f9fddec55a1e/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= -golang.org/x/sys v0.0.0-20201218084310-7d0127a74742/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= -golang.org/x/sys v0.0.0-20210110051926-789bb1bd4061/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= -golang.org/x/sys v0.0.0-20210123111255-9b0068b26619/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20210124154548-22da62e12c0c/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= -golang.org/x/sys v0.0.0-20210216163648-f7da38b97c65/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= -golang.org/x/sys v0.0.0-20210223095934-7937bea0104d h1:u0GOGnBJ3EKE/tNqREhhGiCzE9jFXydDo2lf7hOwGuc= -golang.org/x/sys v0.0.0-20210223095934-7937bea0104d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= -golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo= +golang.org/x/sys v0.34.0 h1:H5Y5sJ2L2JRdyv7ROF1he/lPdvFsd0mJHFw2ThKHxLA= +golang.org/x/sys v0.34.0/go.mod h1:BJP2sWEmIv4KK5OTEluFJCKSidICx8ciO85XgH3Ak8k= golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= golang.org/x/text v0.3.2/go.mod h1:bEr9sfX3Q8Zfm5fL9x+3itogRgK3+ptLWKqgva+5dAk= -golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= golang.org/x/time v0.0.0-20180412165947-fbb02b2291d2/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ= golang.org/x/time v0.0.0-20191024005414-555d28b269f0/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ= golang.org/x/tools v0.0.0-20180221164845-07fd8470d635/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= @@ -431,9 +390,7 @@ golang.org/x/tools v0.0.0-20191029190741-b9c20aec41a5/go.mod h1:b+2E5dAYhXwXZwtn golang.org/x/tools v0.0.0-20200103221440-774c71fcf114/go.mod h1:TB2adYChydJhpapKDTa4BR/hXlZSLoq2Wpct/0txZ28= golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= golang.org/x/xerrors v0.0.0-20191011141410-1b5146add898/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= -golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543 h1:E7g+9GITq07hpfrRu66IVDexMakfv52eLZ2CXBWiKr4= golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= -golang.org/x/xerrors v0.0.0-20200804184101-5ec99f83aff1/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= google.golang.org/api v0.3.1/go.mod h1:6wY9I6uQWHQ8EM57III9mq/AjF+i8G65rmVagqKMtkk= google.golang.org/appengine v1.1.0/go.mod h1:EbEs0AVv82hx2wNQdGPgUI5lhzA/G0D9YwlJXL52JkM= google.golang.org/appengine v1.2.0/go.mod h1:xpcJRLb0r/rnEns0DIKYYv+WjYCduHsrkT7/EB5XEv4= @@ -460,7 +417,6 @@ google.golang.org/protobuf v0.0.0-20200228230310-ab0ca4ff8a60/go.mod h1:cfTl7dwQ google.golang.org/protobuf v1.20.1-0.20200309200217-e05f789c0967/go.mod h1:A+miEFZTKqfCUM6K7xSMQL9OKL/b6hQv+e19PK+JZNE= google.golang.org/protobuf v1.21.0/go.mod h1:47Nbq4nVaFHyn7ilMalzfO3qCViNmqZ2kzikPIcrTAo= google.golang.org/protobuf v1.22.0/go.mod h1:EGpADcykh3NcUnDUJcl1+ZksZNG86OlYog2l/sGQquU= -google.golang.org/protobuf v1.23.0 h1:4MY060fB1DLGMB/7MBTLnwQUY6+F09GEiz6SsrNqyzM= google.golang.org/protobuf v1.23.0/go.mod h1:EGpADcykh3NcUnDUJcl1+ZksZNG86OlYog2l/sGQquU= google.golang.org/protobuf v1.23.1-0.20200526195155-81db48ad09cc/go.mod h1:EGpADcykh3NcUnDUJcl1+ZksZNG86OlYog2l/sGQquU= google.golang.org/protobuf v1.25.0 h1:Ejskq+SyPohKW+1uil0JJMtmHCgJPJ/qWTxr8qp+R4c= @@ -482,6 +438,8 @@ gopkg.in/yaml.v2 v2.2.2/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= gopkg.in/yaml.v2 v2.2.4/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= gopkg.in/yaml.v2 v2.2.5/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= gopkg.in/yaml.v2 v2.3.0/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= +gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= +gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= honnef.co/go/tools v0.0.0-20180728063816-88497007e858/go.mod h1:rf3lG4BRIbNafJWhAfAdb/ePZxsR/4RtNHQocxwk9r4= honnef.co/go/tools v0.0.0-20190102054323-c2f93a96b099/go.mod h1:rf3lG4BRIbNafJWhAfAdb/ePZxsR/4RtNHQocxwk9r4= honnef.co/go/tools v0.0.0-20190523083050-ea95bdfd59fc/go.mod h1:rf3lG4BRIbNafJWhAfAdb/ePZxsR/4RtNHQocxwk9r4= diff --git a/internal/ovsexporter/conntrack.go b/internal/ovsexporter/conntrack.go new file mode 100644 index 0000000..b949127 --- /dev/null +++ b/internal/ovsexporter/conntrack.go @@ -0,0 +1,75 @@ +package ovsexporter + +import ( + "fmt" + "log" + + "github.com/digitalocean/go-openvswitch/ovsnl" + "github.com/prometheus/client_golang/prometheus" +) + +type conntrackCollector struct { + Count *prometheus.Desc + listConntrackEntries func() ([]ovsnl.ConntrackEntry, error) +} + +func newConntrackCollector(fn func() ([]ovsnl.ConntrackEntry, error)) prometheus.Collector { + return &conntrackCollector{ + Count: prometheus.NewDesc( + prometheus.BuildFQName(namespace, "conntrack", "count"), + "Number of conntrack entries by zone, state, and mark", + []string{"zone", "state", "mark"}, nil, + ), + listConntrackEntries: fn, + } +} + +func (c *conntrackCollector) Describe(ch chan<- *prometheus.Desc) { + ch <- c.Count +} + +func (c *conntrackCollector) Collect(ch chan<- prometheus.Metric) { + entries, err := c.listConntrackEntries() + if err != nil { + log.Printf("Failed to collect conntrack entries: %v", err) + // Return a zero metric to indicate the collector is working but no data + ch <- prometheus.MustNewConstMetric( + c.Count, + prometheus.GaugeValue, + 0.0, + "unknown", "unknown", "0", + ) + return + } + + // Log the number of entries found for debugging + log.Printf("Found %d conntrack entries", len(entries)) + + // Aggregate counts + counts := make(map[string]map[string]map[string]int) + for _, e := range entries { + zone := fmt.Sprintf("%d", e.Zone) + state := e.State + mark := fmt.Sprintf("%d", e.Mark) + if counts[zone] == nil { + counts[zone] = make(map[string]map[string]int) + } + if counts[zone][state] == nil { + counts[zone][state] = make(map[string]int) + } + counts[zone][state][mark]++ + } + + for zone, stateMap := range counts { + for state, markMap := range stateMap { + for mark, count := range markMap { + ch <- prometheus.MustNewConstMetric( + c.Count, + prometheus.GaugeValue, + float64(count), + zone, state, mark, + ) + } + } + } +} diff --git a/internal/ovsexporter/ovsexporter.go b/internal/ovsexporter/ovsexporter.go index 9ff3284..8d19838 100644 --- a/internal/ovsexporter/ovsexporter.go +++ b/internal/ovsexporter/ovsexporter.go @@ -6,6 +6,8 @@ package ovsexporter import ( + "context" + "log" "sync" "github.com/digitalocean/go-openvswitch/ovsnl" @@ -27,11 +29,30 @@ var _ prometheus.Collector = &collector{} // New creates a new Prometheus collector which collects metrics using the // input Open vSwitch generic netlink client. func New(c *ovsnl.Client) prometheus.Collector { + collectors := []prometheus.Collector{ + newDatapathCollector(c.Datapath.List), + } + + // Try to add conntrack collector, but don't fail if it's not available + conntrackCollector := newConntrackCollector(func() ([]ovsnl.ConntrackEntry, error) { + svc, err := ovsnl.NewConntrackService() + if err != nil { + return nil, err + } + defer svc.Close() + return svc.List(context.Background()) + }) + + // Test if conntrack service can be created + if _, err := ovsnl.NewConntrackService(); err != nil { + log.Printf("Warning: Conntrack service not available: %v. Conntrack metrics will be disabled.", err) + } else { + collectors = append(collectors, conntrackCollector) + log.Printf("Conntrack collector enabled") + } + return &collector{ - cs: []prometheus.Collector{ - // Additional generic netlink family collectors can be added here. - newDatapathCollector(c.Datapath.List), - }, + cs: collectors, } } From f33d105bb5b097b27cc3101fd1358207cc13d8ac Mon Sep 17 00:00:00 2001 From: sgangopadhyay Date: Thu, 28 Aug 2025 21:40:43 +0530 Subject: [PATCH 02/19] GATEWAYS-4306: scaling with even driven approach --- go.mod | 10 +- go.sum | 22 +- internal/ovsexporter/conntrack.go | 340 +++++++++++++++++++++++++--- internal/ovsexporter/ovsexporter.go | 60 +++-- vendor/modules.txt | 113 +++++++++ 5 files changed, 475 insertions(+), 70 deletions(-) create mode 100644 vendor/modules.txt diff --git a/go.mod b/go.mod index fb8e7af..49f73b0 100644 --- a/go.mod +++ b/go.mod @@ -22,14 +22,14 @@ require ( github.com/mdlayher/socket v0.5.1 // indirect github.com/pkg/errors v0.9.1 // indirect github.com/prometheus/client_model v0.2.0 // indirect - github.com/prometheus/common v0.17.0 // indirect - github.com/prometheus/procfs v0.6.0 // indirect + github.com/prometheus/common v0.15.0 // indirect + github.com/prometheus/procfs v0.2.0 // indirect github.com/ti-mo/conntrack v0.5.2 // indirect github.com/ti-mo/netfilter v0.5.3 // indirect golang.org/x/net v0.39.0 // indirect golang.org/x/sync v0.14.0 // indirect - golang.org/x/sys v0.34.0 // indirect - google.golang.org/protobuf v1.25.0 // indirect + golang.org/x/sys v0.35.0 // indirect + google.golang.org/protobuf v1.23.0 // indirect ) -replace github.com/digitalocean/go-openvswitch => /Users/sgangopadhyay/dev/digitalocean/go-openvswitch +replace github.com/digitalocean/go-openvswitch => ../go-openvswitch diff --git a/go.sum b/go.sum index b2e238f..d03edde 100644 --- a/go.sum +++ b/go.sum @@ -80,7 +80,6 @@ github.com/golang/protobuf v1.4.0-rc.1.0.20200221234624-67d41d38c208/go.mod h1:x github.com/golang/protobuf v1.4.0-rc.2/go.mod h1:LlEzMj4AhA7rCAGe4KMBDvJI+AwstrUpVNzEA03Pprs= github.com/golang/protobuf v1.4.0-rc.4.0.20200313231945-b860323f09d0/go.mod h1:WU3c8KckQ9AFe+yFwt9sWVRKCVIyN9cPHBJSNnbL67w= github.com/golang/protobuf v1.4.0/go.mod h1:jodUvKwWbYaEsadDk5Fwe5c77LiNKVO9IDvqG2KuDX0= -github.com/golang/protobuf v1.4.1/go.mod h1:U8fpvMrcmy5pZrNK1lt4xCsGvpyWQ/VVv6QDs8UjoX8= github.com/golang/protobuf v1.4.2/go.mod h1:oDoupMAO8OvCJWAcko0GGGIgR6R6ocIYbsSw735rRwI= github.com/golang/protobuf v1.4.3 h1:JjCZWpVbqXDqFVmTfYWEVTMIYrL/NPdPSCHPJ0T/raM= github.com/golang/protobuf v1.4.3/go.mod h1:oDoupMAO8OvCJWAcko0GGGIgR6R6ocIYbsSw735rRwI= @@ -91,8 +90,6 @@ github.com/google/go-cmp v0.2.0/go.mod h1:oXzfMopK8JAjlY9xF4vHSVASa0yLyX7SntLO5a github.com/google/go-cmp v0.3.0/go.mod h1:8QqcDgzrUqlUb/G2PQTWiueGozuR1884gddMywk6iLU= github.com/google/go-cmp v0.3.1/go.mod h1:8QqcDgzrUqlUb/G2PQTWiueGozuR1884gddMywk6iLU= github.com/google/go-cmp v0.4.0/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= -github.com/google/go-cmp v0.5.0/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= -github.com/google/go-cmp v0.5.4/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= github.com/google/go-cmp v0.7.0 h1:wk8382ETsv4JYUZwIsn6YpYiWiBsYLSJiTsyBybVuN8= github.com/google/go-cmp v0.7.0/go.mod h1:pXiqmnSA92OHEEa9HXL2W4E7lf9JzCmGVUdgjX3N/iU= github.com/google/gofuzz v1.0.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg= @@ -236,17 +233,15 @@ github.com/prometheus/common v0.2.0/go.mod h1:TNfzLD0ON7rHzMJeJkieUDPYmFC7Snx/y8 github.com/prometheus/common v0.4.1/go.mod h1:TNfzLD0ON7rHzMJeJkieUDPYmFC7Snx/y86RQel1bk4= github.com/prometheus/common v0.7.0/go.mod h1:DjGbpBbp5NYNiECxcL/VnbXCCaQpKd3tt26CguLLsqA= github.com/prometheus/common v0.10.0/go.mod h1:Tlit/dnDKsSWFlCLTWaA1cyBgKHSMdTB80sz/V91rCo= +github.com/prometheus/common v0.15.0 h1:4fgOnadei3EZvgRwxJ7RMpG1k1pOZth5Pc13tyspaKM= github.com/prometheus/common v0.15.0/go.mod h1:U+gB1OBLb1lF3O42bTCL+FK18tX9Oar16Clt/msog/s= -github.com/prometheus/common v0.17.0 h1:kDIZLI74SS+3tedSvEkykgBkD7txMxaJAPj8DtJUKYA= -github.com/prometheus/common v0.17.0/go.mod h1:U+gB1OBLb1lF3O42bTCL+FK18tX9Oar16Clt/msog/s= github.com/prometheus/procfs v0.0.0-20181005140218-185b4288413d/go.mod h1:c3At6R/oaqEKCNdg8wHV1ftS6bRYblBhIjjI8uT2IGk= github.com/prometheus/procfs v0.0.0-20190117184657-bf6a532e95b1/go.mod h1:c3At6R/oaqEKCNdg8wHV1ftS6bRYblBhIjjI8uT2IGk= github.com/prometheus/procfs v0.0.2/go.mod h1:TjEm7ze935MbeOT/UhFTIMYKhuLP4wbCsTZCD3I8kEA= github.com/prometheus/procfs v0.0.8/go.mod h1:7Qr8sr6344vo1JqZ6HhLceV9o3AJ1Ff+GxbHq6oeK9A= github.com/prometheus/procfs v0.1.3/go.mod h1:lV6e/gmhEcM9IjHGsFOCxxuZ+z1YqCvr4OA4YeYWdaU= +github.com/prometheus/procfs v0.2.0 h1:wH4vA7pcjKuZzjF7lM8awk4fnuJO6idemZXoKnULUx4= github.com/prometheus/procfs v0.2.0/go.mod h1:lV6e/gmhEcM9IjHGsFOCxxuZ+z1YqCvr4OA4YeYWdaU= -github.com/prometheus/procfs v0.6.0 h1:mxy4L2jP6qMonqmq+aTtOx1ifVWUgG/TAmntgbh3xv4= -github.com/prometheus/procfs v0.6.0/go.mod h1:cz+aTbrPOrUb4q7XlbU9ygM+/jj0fzG6c1xBZuNvfVA= github.com/prometheus/prometheus v2.2.1-0.20180315085919-58e2a31db8de+incompatible h1:jgW1I0kWFlDOqNLlYBcxVfpRGSOL3n6lXn1BykdEG30= github.com/prometheus/prometheus v2.2.1-0.20180315085919-58e2a31db8de+incompatible/go.mod h1:oAIUtOny2rjMX0OWN5vPR5/q/twIROJvdqnQKDdil/s= github.com/rcrowley/go-metrics v0.0.0-20181016184325-3113b8401b8a/go.mod h1:bCqnVzQkZxMG4s8nGwiZ5l3QUCyqpo9Y+/ZMZ9VjZe4= @@ -342,7 +337,6 @@ golang.org/x/sync v0.0.0-20181221193216-37e7f081c4d4/go.mod h1:RxMgew5VJxzue5/jJ golang.org/x/sync v0.0.0-20190227155943-e225da77a7e6/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20190911185100-cd5d95a43a6e/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= -golang.org/x/sync v0.0.0-20201207232520-09787c993a3a/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.14.0 h1:woo0S4Yywslg6hp4eUFjTVOyKt0RookbpAHG4c1HmhQ= golang.org/x/sync v0.14.0/go.mod h1:1dzgHSNfp02xaA81J2MS99Qcpr2w7fw1gpm99rleRqA= golang.org/x/sys v0.0.0-20180823144017-11551d06cbcc/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= @@ -368,9 +362,8 @@ golang.org/x/sys v0.0.0-20200323222414-85ca7c5b95cd/go.mod h1:h1NjWce9XRLGQEsW7w golang.org/x/sys v0.0.0-20200615200032-f1bc736245b1/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20200625212154-ddb9806d33ae/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20201214210602-f9fddec55a1e/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= -golang.org/x/sys v0.0.0-20210124154548-22da62e12c0c/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= -golang.org/x/sys v0.34.0 h1:H5Y5sJ2L2JRdyv7ROF1he/lPdvFsd0mJHFw2ThKHxLA= -golang.org/x/sys v0.34.0/go.mod h1:BJP2sWEmIv4KK5OTEluFJCKSidICx8ciO85XgH3Ak8k= +golang.org/x/sys v0.35.0 h1:vz1N37gP5bs89s7He8XuIYXpyY0+QlsKmzipCbUtyxI= +golang.org/x/sys v0.35.0/go.mod h1:BJP2sWEmIv4KK5OTEluFJCKSidICx8ciO85XgH3Ak8k= golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= golang.org/x/text v0.3.2/go.mod h1:bEr9sfX3Q8Zfm5fL9x+3itogRgK3+ptLWKqgva+5dAk= golang.org/x/time v0.0.0-20180412165947-fbb02b2291d2/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ= @@ -400,7 +393,6 @@ google.golang.org/genproto v0.0.0-20190307195333-5fe7a883aa19/go.mod h1:VzzqZJRn google.golang.org/genproto v0.0.0-20190425155659-357c62f0e4bb/go.mod h1:VzzqZJRnGkLBvHegQrXjBqPurQTc5/KpmUdxsrq26oE= google.golang.org/genproto v0.0.0-20190530194941-fb225487d101/go.mod h1:z3L6/3dTEVtUr6QSP8miRzeRqwQOioJ9I66odjN4I7s= google.golang.org/genproto v0.0.0-20190819201941-24fa4b261c55/go.mod h1:DMBHOl98Agz4BDEuKkezgsaosCRResVns1a3J2ZsMNc= -google.golang.org/genproto v0.0.0-20200526211855-cb27e3aa2013/go.mod h1:NbSheEEYHJ7i3ixzK3sjbqSGDJWnxyFXZblF3eUsNvo= google.golang.org/grpc v1.17.0/go.mod h1:6QZJwpn2B+Zp71q/5VxRsJ6NXXVCE5NRUHRo+f3cWCs= google.golang.org/grpc v1.19.0/go.mod h1:mqu4LbDTu4XGKhr4mRzUsmM4RtVoemTSY81AxZiDr8c= google.golang.org/grpc v1.20.0/go.mod h1:chYK+tFQF0nDUGJgXMSgLCQk3phJEuONr2DCgLDdAQM= @@ -410,17 +402,13 @@ google.golang.org/grpc v1.22.1/go.mod h1:Y5yQAOtifL1yxbo5wqy6BxZv8vAUGQwXBOALyac google.golang.org/grpc v1.23.0/go.mod h1:Y5yQAOtifL1yxbo5wqy6BxZv8vAUGQwXBOALyacEbxg= google.golang.org/grpc v1.23.1/go.mod h1:Y5yQAOtifL1yxbo5wqy6BxZv8vAUGQwXBOALyacEbxg= google.golang.org/grpc v1.26.0/go.mod h1:qbnxyOmOxrQa7FizSgH+ReBfzJrCY1pSN7KXBS8abTk= -google.golang.org/grpc v1.27.0/go.mod h1:qbnxyOmOxrQa7FizSgH+ReBfzJrCY1pSN7KXBS8abTk= google.golang.org/protobuf v0.0.0-20200109180630-ec00e32a8dfd/go.mod h1:DFci5gLYBciE7Vtevhsrf46CRTquxDuWsQurQQe4oz8= google.golang.org/protobuf v0.0.0-20200221191635-4d8936d0db64/go.mod h1:kwYJMbMJ01Woi6D6+Kah6886xMZcty6N08ah7+eCXa0= google.golang.org/protobuf v0.0.0-20200228230310-ab0ca4ff8a60/go.mod h1:cfTl7dwQJ+fmap5saPgwCLgHXTUD7jkjRqWcaiX5VyM= google.golang.org/protobuf v1.20.1-0.20200309200217-e05f789c0967/go.mod h1:A+miEFZTKqfCUM6K7xSMQL9OKL/b6hQv+e19PK+JZNE= google.golang.org/protobuf v1.21.0/go.mod h1:47Nbq4nVaFHyn7ilMalzfO3qCViNmqZ2kzikPIcrTAo= -google.golang.org/protobuf v1.22.0/go.mod h1:EGpADcykh3NcUnDUJcl1+ZksZNG86OlYog2l/sGQquU= +google.golang.org/protobuf v1.23.0 h1:4MY060fB1DLGMB/7MBTLnwQUY6+F09GEiz6SsrNqyzM= google.golang.org/protobuf v1.23.0/go.mod h1:EGpADcykh3NcUnDUJcl1+ZksZNG86OlYog2l/sGQquU= -google.golang.org/protobuf v1.23.1-0.20200526195155-81db48ad09cc/go.mod h1:EGpADcykh3NcUnDUJcl1+ZksZNG86OlYog2l/sGQquU= -google.golang.org/protobuf v1.25.0 h1:Ejskq+SyPohKW+1uil0JJMtmHCgJPJ/qWTxr8qp+R4c= -google.golang.org/protobuf v1.25.0/go.mod h1:9JNX74DMeImyA3h4bdi1ymwjUzf21/xIlbajtzgsN7c= gopkg.in/alecthomas/kingpin.v2 v2.2.6/go.mod h1:FMv+mEhP44yOT+4EoQTLFTRgOQ1FBLkstjWtayDeSgw= gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= gopkg.in/check.v1 v1.0.0-20180628173108-788fd7840127/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= diff --git a/internal/ovsexporter/conntrack.go b/internal/ovsexporter/conntrack.go index b949127..e0f4390 100644 --- a/internal/ovsexporter/conntrack.go +++ b/internal/ovsexporter/conntrack.go @@ -1,37 +1,307 @@ package ovsexporter import ( + "context" "fmt" "log" + "math/rand" + "runtime" + "sync" + "time" "github.com/digitalocean/go-openvswitch/ovsnl" "github.com/prometheus/client_golang/prometheus" ) -type conntrackCollector struct { - Count *prometheus.Desc - listConntrackEntries func() ([]ovsnl.ConntrackEntry, error) +const ( + zoneThreshold = 50000 // Configure threshold for zone alerts (reduced for 2M test) + // Memory management for large conntrack tables + maxEntriesPerZone = 100 // Drastically reduced maximum entries to collect per zone to prevent OOM + largeZoneThreshold = 100000 // Use streaming approach for zones with >100k entries + // Memory pressure thresholds + memoryPressureThreshold = 0.8 // Trigger memory pressure handling when 80% of memory is used + // CPU time limits + maxCPUTimePerCollection = 60 * time.Second // Maximum CPU time per collection cycle + // Sampling configuration for large zones + sampleRateForLargeZones = 0.01 // Sample 1% of entries for zones > 1M entries + // Timeout configuration + conntrackTimeout = 30 * time.Second // Reduced timeout to prevent getting stuck + // Memory pressure logging cooldown + memoryPressureLogCooldown = 30 * time.Second // Prevent log spam + // Memory cleanup thresholds + memoryCleanupThreshold = 0.7 // Trigger aggressive cleanup at 70% usage + // Circuit breaker for performance regression + maxConsecutiveTimeouts = 3 // Stop processing after 3 consecutive timeouts +) + +var ( + lastMemoryPressureLog time.Time + consecutiveTimeouts int + lastTimeoutTime time.Time +) + +type ConntrackCollector struct { + Count *prometheus.Desc + Performance *prometheus.Desc + listZoneStats func(context.Context, int) (map[uint16]*ovsnl.ZoneStats, error) + getStats func() (*ovsnl.ConntrackPerformanceStats, error) } -func newConntrackCollector(fn func() ([]ovsnl.ConntrackEntry, error)) prometheus.Collector { - return &conntrackCollector{ +// ConntrackCollectorWithAggAccessor wraps the existing collector with access to the aggregator snapshot +type ConntrackCollectorWithAggAccessor struct { + *ConntrackCollector + SnapshotFunc func() map[uint16]map[uint32]int +} + +func newConntrackCollector(fn func(context.Context, int) (map[uint16]*ovsnl.ZoneStats, error), statsFn func() (*ovsnl.ConntrackPerformanceStats, error)) prometheus.Collector { + return &ConntrackCollector{ Count: prometheus.NewDesc( prometheus.BuildFQName(namespace, "conntrack", "count"), "Number of conntrack entries by zone, state, and mark", []string{"zone", "state", "mark"}, nil, ), - listConntrackEntries: fn, + Performance: prometheus.NewDesc( + prometheus.BuildFQName(namespace, "conntrack", "performance"), + "Conntrack performance counters", + []string{"counter"}, nil, + ), + listZoneStats: fn, + getStats: statsFn, + } +} + +// checkCircuitBreaker checks if we should stop processing due to too many timeouts +func checkCircuitBreaker() bool { + now := time.Now() + + // Reset counter if more than 5 minutes have passed since last timeout + if now.Sub(lastTimeoutTime) > 5*time.Minute { + consecutiveTimeouts = 0 + return false + } + + // If we've had too many consecutive timeouts, stop processing + if consecutiveTimeouts >= maxConsecutiveTimeouts { + log.Printf("Circuit breaker triggered: %d consecutive timeouts, stopping conntrack collection", consecutiveTimeouts) + return true + } + + return false +} + +// checkMemoryPressure checks if we're under memory pressure and triggers GC if needed +func checkMemoryPressure() { + var m runtime.MemStats + runtime.ReadMemStats(&m) + + // Calculate memory usage percentage + memoryUsage := float64(m.Alloc) / float64(m.Sys) + + if memoryUsage > memoryPressureThreshold { + // Only log if enough time has passed since last log + if time.Since(lastMemoryPressureLog) > memoryPressureLogCooldown { + log.Printf("Memory pressure detected: %.2f%% usage, triggering GC", memoryUsage*100) + lastMemoryPressureLog = time.Now() + } + runtime.GC() + } else if memoryUsage > memoryCleanupThreshold { + // Aggressive cleanup at 70% usage + runtime.GC() + } +} + +// shouldSampleEntry determines if we should sample an entry based on zone size +func shouldSampleEntry(zoneTotalCount int, entryIndex int) bool { + if zoneTotalCount <= maxEntriesPerZone { + // For small zones, collect all entries + return true + } + + if zoneTotalCount > 1000000 { + // For very large zones (>1M), use statistical sampling + return rand.Float64() < sampleRateForLargeZones + } + + // For medium zones, collect first maxEntriesPerZone entries + return entryIndex < maxEntriesPerZone +} + +// checkCPUTime checks if we're exceeding CPU time limits +func checkCPUTime(startTime time.Time) bool { + elapsed := time.Since(startTime) + if elapsed > maxCPUTimePerCollection { + log.Printf("CPU time limit exceeded: %v elapsed, continuing with sampling", elapsed) + return true + } + return false +} + +// collectConntrackWithTimeout safely collects conntrack data with timeout protection +func (c *ConntrackCollector) collectConntrackWithTimeout(ctx context.Context, threshold int) (map[uint16]*ovsnl.ZoneStats, error) { + // Check circuit breaker first + if checkCircuitBreaker() { + log.Printf("Circuit breaker active, skipping conntrack collection") + return make(map[uint16]*ovsnl.ZoneStats), nil + } + + var result map[uint16]*ovsnl.ZoneStats + var err error + var mu sync.Mutex + var wg sync.WaitGroup + + // Create a timeout context + timeoutCtx, cancel := context.WithTimeout(ctx, conntrackTimeout) + defer cancel() + + // Start collection in a goroutine + wg.Add(1) + go func() { + defer wg.Done() + defer func() { + if r := recover(); r != nil { + log.Printf("Panic in conntrack collection: %v", r) + err = fmt.Errorf("panic in conntrack collection: %v", r) + } + }() + + // Try streaming first, fallback to regular + if c.listZoneStats != nil { + result, err = c.listZoneStats(timeoutCtx, threshold) + } else { + // This case should ideally not be reached if listZoneStats is always set + err = fmt.Errorf("no listZoneStats function available") + } + + mu.Lock() + defer mu.Unlock() + }() + + // Wait for completion or timeout + done := make(chan struct{}) + go func() { + wg.Wait() + close(done) + }() + + select { + case <-done: + mu.Lock() + defer mu.Unlock() + // Reset timeout counter on success + consecutiveTimeouts = 0 + return result, err + case <-timeoutCtx.Done(): + // Track timeout + consecutiveTimeouts++ + lastTimeoutTime = time.Now() + log.Printf("Conntrack collection timed out after %v (timeout #%d), returning partial results", conntrackTimeout, consecutiveTimeouts) + // Force cleanup before returning + runtime.GC() + // Return empty result instead of error to prevent metric collection failure + return make(map[uint16]*ovsnl.ZoneStats), nil } } -func (c *conntrackCollector) Describe(ch chan<- *prometheus.Desc) { +func (c *ConntrackCollector) Describe(ch chan<- *prometheus.Desc) { ch <- c.Count + ch <- c.Performance } -func (c *conntrackCollector) Collect(ch chan<- prometheus.Metric) { - entries, err := c.listConntrackEntries() +func (c *ConntrackCollector) Collect(ch chan<- prometheus.Metric) { + startTime := time.Now() + ctx := context.Background() + + // Check memory pressure before starting + checkMemoryPressure() + + // Emergency shutdown if memory pressure is too high + var m runtime.MemStats + runtime.ReadMemStats(&m) + memoryUsage := float64(m.Alloc) / float64(m.Sys) + if memoryUsage > 0.85 { // 85% threshold for emergency shutdown + log.Printf("Emergency shutdown: memory usage %.2f%% too high, skipping conntrack collection", memoryUsage*100) + // Return basic metrics only + ch <- prometheus.MustNewConstMetric( + c.Count, + prometheus.GaugeValue, + 0.0, + "emergency", "shutdown", "0", + ) + return + } + + // Collect performance stats first (lightweight operation) + if c.getStats != nil { + if stats, err := c.getStats(); err == nil { + ch <- prometheus.MustNewConstMetric( + c.Performance, + prometheus.GaugeValue, + float64(stats.TotalFound), + "found", + ) + ch <- prometheus.MustNewConstMetric( + c.Performance, + prometheus.GaugeValue, + float64(stats.TotalInvalid), + "invalid", + ) + ch <- prometheus.MustNewConstMetric( + c.Performance, + prometheus.GaugeValue, + float64(stats.TotalIgnore), + "ignore", + ) + ch <- prometheus.MustNewConstMetric( + c.Performance, + prometheus.GaugeValue, + float64(stats.TotalInsert), + "insert", + ) + ch <- prometheus.MustNewConstMetric( + c.Performance, + prometheus.GaugeValue, + float64(stats.TotalInsertFailed), + "insert_failed", + ) + ch <- prometheus.MustNewConstMetric( + c.Performance, + prometheus.GaugeValue, + float64(stats.TotalDrop), + "drop", + ) + ch <- prometheus.MustNewConstMetric( + c.Performance, + prometheus.GaugeValue, + float64(stats.TotalEarlyDrop), + "early_drop", + ) + ch <- prometheus.MustNewConstMetric( + c.Performance, + prometheus.GaugeValue, + float64(stats.TotalError), + "error", + ) + ch <- prometheus.MustNewConstMetric( + c.Performance, + prometheus.GaugeValue, + float64(stats.TotalSearchRestart), + "search_restart", + ) + } else { + log.Printf("Failed to collect conntrack performance stats: %v", err) + } + } + + // Check memory pressure again before heavy operation + checkMemoryPressure() + + // Collect zone statistics with timeout protection + stats, err := c.collectConntrackWithTimeout(ctx, zoneThreshold) + if err != nil { log.Printf("Failed to collect conntrack entries: %v", err) + // Force cleanup on error + runtime.GC() // Return a zero metric to indicate the collector is working but no data ch <- prometheus.MustNewConstMetric( c.Count, @@ -42,34 +312,44 @@ func (c *conntrackCollector) Collect(ch chan<- prometheus.Metric) { return } - // Log the number of entries found for debugging - log.Printf("Found %d conntrack entries", len(entries)) - - // Aggregate counts - counts := make(map[string]map[string]map[string]int) - for _, e := range entries { - zone := fmt.Sprintf("%d", e.Zone) - state := e.State - mark := fmt.Sprintf("%d", e.Mark) - if counts[zone] == nil { - counts[zone] = make(map[string]map[string]int) - } - if counts[zone][state] == nil { - counts[zone][state] = make(map[string]int) - } - counts[zone][state][mark]++ + // Process zones using event-driven aggregator data + // This is much more efficient than the old sampling approach + for zone, zoneStats := range stats { + // Always emit total count for the zone (this is critical!) + ch <- prometheus.MustNewConstMetric( + c.Count, + prometheus.GaugeValue, + float64(zoneStats.TotalCount), + fmt.Sprint(zone), + "total", + "0", + ) } - for zone, stateMap := range counts { - for state, markMap := range stateMap { - for mark, count := range markMap { + // OPTIONAL: emit per-mark counts using the aggregator directly. + // This avoids storing per-entry slices and stays O(unique marks). + if aggClient, ok := any(c).(*ConntrackCollectorWithAggAccessor); ok { + zm := aggClient.SnapshotFunc() // <- we'll show how to plumb this accessor next + // To avoid high-cardinality explosion, you can cap marks per zone: + const maxMarksPerZone = 2000 // tune for your environment + for zone, markMap := range zm { + emitted := 0 + for mark, cnt := range markMap { + if emitted >= maxMarksPerZone { + break + } ch <- prometheus.MustNewConstMetric( c.Count, prometheus.GaugeValue, - float64(count), - zone, state, mark, + float64(cnt), + fmt.Sprint(zone), "total", fmt.Sprint(mark), ) + emitted++ } } } + + // Log collection time + elapsed := time.Since(startTime) + log.Printf("Conntrack collection completed in %v", elapsed) } diff --git a/internal/ovsexporter/ovsexporter.go b/internal/ovsexporter/ovsexporter.go index 8d19838..2ae1a4f 100644 --- a/internal/ovsexporter/ovsexporter.go +++ b/internal/ovsexporter/ovsexporter.go @@ -20,8 +20,9 @@ const ( // A collector aggregates Open vSwitch Prometheus collectors. type collector struct { - mu sync.Mutex - cs []prometheus.Collector + mu sync.Mutex + cs []prometheus.Collector + conntrackEnabled bool } var _ prometheus.Collector = &collector{} @@ -33,27 +34,50 @@ func New(c *ovsnl.Client) prometheus.Collector { newDatapathCollector(c.Datapath.List), } - // Try to add conntrack collector, but don't fail if it's not available - conntrackCollector := newConntrackCollector(func() ([]ovsnl.ConntrackEntry, error) { - svc, err := ovsnl.NewConntrackService() - if err != nil { - return nil, err - } - defer svc.Close() - return svc.List(context.Background()) - }) + // When you build the collector in New(...): + var snapshot func() map[uint16]map[uint32]int + if c.Agg != nil { + snapshot = c.Agg.Snapshot + } + base := newConntrackCollector( + // listZoneStats: + func(ctx context.Context, threshold int) (map[uint16]*ovsnl.ZoneStats, error) { + if c.Agg == nil { + return map[uint16]*ovsnl.ZoneStats{}, nil + } + zm := c.Agg.Snapshot() + + out := make(map[uint16]*ovsnl.ZoneStats, len(zm)) + for zone, marks := range zm { + total := 0 + for _, cnt := range marks { + total += cnt + } + // Always include the zone (so "total" time series is complete). + zs := &ovsnl.ZoneStats{TotalCount: total} + // No per-entry slice to avoid memory. + // If you still want per-mark metrics, do it in Collect directly using zm. + out[zone] = zs + _ = threshold // threshold is not used here; you can still filter if desired. + } + return out, nil + }, + // getStats: Disabled due to multicast connection issues + nil, // This will skip stats collection entirely + ) + conntrackCollector := &ConntrackCollectorWithAggAccessor{ + ConntrackCollector: base.(*ConntrackCollector), + SnapshotFunc: snapshot, + } - // Test if conntrack service can be created - if _, err := ovsnl.NewConntrackService(); err != nil { - log.Printf("Warning: Conntrack service not available: %v. Conntrack metrics will be disabled.", err) + if c.Conntrack == nil { + log.Printf("Warning: Conntrack service not available; metrics disabled.") } else { collectors = append(collectors, conntrackCollector) - log.Printf("Conntrack collector enabled") + log.Printf("Conntrack collector enabled (event-driven)") } - return &collector{ - cs: collectors, - } + return &collector{cs: collectors, conntrackEnabled: true} } // Describe implements prometheus.Collector. diff --git a/vendor/modules.txt b/vendor/modules.txt new file mode 100644 index 0000000..d76209f --- /dev/null +++ b/vendor/modules.txt @@ -0,0 +1,113 @@ +# github.com/beorn7/perks v1.0.1 +## explicit; go 1.11 +github.com/beorn7/perks/quantile +# github.com/cespare/xxhash/v2 v2.1.1 +## explicit; go 1.11 +github.com/cespare/xxhash/v2 +# github.com/digitalocean/go-openvswitch v0.0.0-20201214180534-ce0f183468d8 => ../go-openvswitch +## explicit; go 1.23.0 +github.com/digitalocean/go-openvswitch/ovsnl +github.com/digitalocean/go-openvswitch/ovsnl/internal/ovsh +# github.com/golang/protobuf v1.4.3 +## explicit; go 1.9 +github.com/golang/protobuf/proto +github.com/golang/protobuf/ptypes +github.com/golang/protobuf/ptypes/any +github.com/golang/protobuf/ptypes/duration +github.com/golang/protobuf/ptypes/timestamp +# github.com/google/go-cmp v0.7.0 +## explicit; go 1.21 +github.com/google/go-cmp/cmp +github.com/google/go-cmp/cmp/internal/diff +github.com/google/go-cmp/cmp/internal/flags +github.com/google/go-cmp/cmp/internal/function +github.com/google/go-cmp/cmp/internal/value +# github.com/josharian/native v1.1.0 +## explicit; go 1.13 +github.com/josharian/native +# github.com/matttproud/golang_protobuf_extensions v1.0.1 +## explicit +github.com/matttproud/golang_protobuf_extensions/pbutil +# github.com/mdlayher/genetlink v1.0.0 +## explicit; go 1.13 +github.com/mdlayher/genetlink +# github.com/mdlayher/netlink v1.7.2 +## explicit; go 1.18 +github.com/mdlayher/netlink +github.com/mdlayher/netlink/nlenc +# github.com/mdlayher/socket v0.5.1 +## explicit; go 1.20 +github.com/mdlayher/socket +# github.com/pkg/errors v0.9.1 +## explicit +github.com/pkg/errors +# github.com/prometheus/client_golang v1.9.0 +## explicit; go 1.11 +github.com/prometheus/client_golang/prometheus +github.com/prometheus/client_golang/prometheus/internal +github.com/prometheus/client_golang/prometheus/promhttp +# github.com/prometheus/client_model v0.2.0 +## explicit; go 1.9 +github.com/prometheus/client_model/go +# github.com/prometheus/common v0.15.0 +## explicit; go 1.11 +github.com/prometheus/common/expfmt +github.com/prometheus/common/internal/bitbucket.org/ww/goautoneg +github.com/prometheus/common/model +# github.com/prometheus/procfs v0.2.0 +## explicit; go 1.12 +github.com/prometheus/procfs +github.com/prometheus/procfs/internal/fs +github.com/prometheus/procfs/internal/util +# github.com/prometheus/prometheus v2.2.1-0.20180315085919-58e2a31db8de+incompatible +## explicit +github.com/prometheus/prometheus/util/promlint +# github.com/ti-mo/conntrack v0.5.2 +## explicit; go 1.23.0 +github.com/ti-mo/conntrack +# github.com/ti-mo/netfilter v0.5.3 +## explicit; go 1.23.0 +github.com/ti-mo/netfilter +# golang.org/x/net v0.39.0 +## explicit; go 1.23.0 +golang.org/x/net/bpf +# golang.org/x/sync v0.14.0 +## explicit; go 1.23.0 +golang.org/x/sync/errgroup +# golang.org/x/sys v0.35.0 +## explicit; go 1.23.0 +golang.org/x/sys/unix +golang.org/x/sys/windows +# google.golang.org/protobuf v1.23.0 +## explicit; go 1.9 +google.golang.org/protobuf/encoding/prototext +google.golang.org/protobuf/encoding/protowire +google.golang.org/protobuf/internal/descfmt +google.golang.org/protobuf/internal/descopts +google.golang.org/protobuf/internal/detrand +google.golang.org/protobuf/internal/encoding/defval +google.golang.org/protobuf/internal/encoding/messageset +google.golang.org/protobuf/internal/encoding/tag +google.golang.org/protobuf/internal/encoding/text +google.golang.org/protobuf/internal/errors +google.golang.org/protobuf/internal/fieldnum +google.golang.org/protobuf/internal/fieldsort +google.golang.org/protobuf/internal/filedesc +google.golang.org/protobuf/internal/filetype +google.golang.org/protobuf/internal/flags +google.golang.org/protobuf/internal/genname +google.golang.org/protobuf/internal/impl +google.golang.org/protobuf/internal/mapsort +google.golang.org/protobuf/internal/pragma +google.golang.org/protobuf/internal/set +google.golang.org/protobuf/internal/strs +google.golang.org/protobuf/internal/version +google.golang.org/protobuf/proto +google.golang.org/protobuf/reflect/protoreflect +google.golang.org/protobuf/reflect/protoregistry +google.golang.org/protobuf/runtime/protoiface +google.golang.org/protobuf/runtime/protoimpl +google.golang.org/protobuf/types/known/anypb +google.golang.org/protobuf/types/known/durationpb +google.golang.org/protobuf/types/known/timestamppb +# github.com/digitalocean/go-openvswitch => ../go-openvswitch From 30b6d5819256111694d62c13a72d0e70fbea0e16 Mon Sep 17 00:00:00 2001 From: sgangopadhyay Date: Wed, 3 Sep 2025 01:28:14 +0530 Subject: [PATCH 03/19] code cleanup --- internal/ovsexporter/conntrack.go | 345 +++------------------------- internal/ovsexporter/ovsexporter.go | 65 +++--- vendor/modules.txt | 113 --------- 3 files changed, 52 insertions(+), 471 deletions(-) delete mode 100644 vendor/modules.txt diff --git a/internal/ovsexporter/conntrack.go b/internal/ovsexporter/conntrack.go index e0f4390..f6f4938 100644 --- a/internal/ovsexporter/conntrack.go +++ b/internal/ovsexporter/conntrack.go @@ -1,355 +1,62 @@ package ovsexporter import ( - "context" "fmt" "log" - "math/rand" - "runtime" - "sync" - "time" "github.com/digitalocean/go-openvswitch/ovsnl" "github.com/prometheus/client_golang/prometheus" ) -const ( - zoneThreshold = 50000 // Configure threshold for zone alerts (reduced for 2M test) - // Memory management for large conntrack tables - maxEntriesPerZone = 100 // Drastically reduced maximum entries to collect per zone to prevent OOM - largeZoneThreshold = 100000 // Use streaming approach for zones with >100k entries - // Memory pressure thresholds - memoryPressureThreshold = 0.8 // Trigger memory pressure handling when 80% of memory is used - // CPU time limits - maxCPUTimePerCollection = 60 * time.Second // Maximum CPU time per collection cycle - // Sampling configuration for large zones - sampleRateForLargeZones = 0.01 // Sample 1% of entries for zones > 1M entries - // Timeout configuration - conntrackTimeout = 30 * time.Second // Reduced timeout to prevent getting stuck - // Memory pressure logging cooldown - memoryPressureLogCooldown = 30 * time.Second // Prevent log spam - // Memory cleanup thresholds - memoryCleanupThreshold = 0.7 // Trigger aggressive cleanup at 70% usage - // Circuit breaker for performance regression - maxConsecutiveTimeouts = 3 // Stop processing after 3 consecutive timeouts -) - -var ( - lastMemoryPressureLog time.Time - consecutiveTimeouts int - lastTimeoutTime time.Time -) - -type ConntrackCollector struct { - Count *prometheus.Desc - Performance *prometheus.Desc - listZoneStats func(context.Context, int) (map[uint16]*ovsnl.ZoneStats, error) - getStats func() (*ovsnl.ConntrackPerformanceStats, error) +type conntrackCollector struct { + desc *prometheus.Desc + agg *ovsnl.ZoneMarkAggregator } // ConntrackCollectorWithAggAccessor wraps the existing collector with access to the aggregator snapshot type ConntrackCollectorWithAggAccessor struct { - *ConntrackCollector + *conntrackCollector SnapshotFunc func() map[uint16]map[uint32]int } -func newConntrackCollector(fn func(context.Context, int) (map[uint16]*ovsnl.ZoneStats, error), statsFn func() (*ovsnl.ConntrackPerformanceStats, error)) prometheus.Collector { - return &ConntrackCollector{ - Count: prometheus.NewDesc( +func newConntrackCollector(agg *ovsnl.ZoneMarkAggregator) prometheus.Collector { + return &conntrackCollector{ + desc: prometheus.NewDesc( prometheus.BuildFQName(namespace, "conntrack", "count"), - "Number of conntrack entries by zone, state, and mark", - []string{"zone", "state", "mark"}, nil, - ), - Performance: prometheus.NewDesc( - prometheus.BuildFQName(namespace, "conntrack", "performance"), - "Conntrack performance counters", - []string{"counter"}, nil, + "Number of conntrack entries by zone and mark", + []string{"zone", "mark"}, + nil, ), - listZoneStats: fn, - getStats: statsFn, + agg: agg, } } -// checkCircuitBreaker checks if we should stop processing due to too many timeouts -func checkCircuitBreaker() bool { - now := time.Now() - - // Reset counter if more than 5 minutes have passed since last timeout - if now.Sub(lastTimeoutTime) > 5*time.Minute { - consecutiveTimeouts = 0 - return false - } - - // If we've had too many consecutive timeouts, stop processing - if consecutiveTimeouts >= maxConsecutiveTimeouts { - log.Printf("Circuit breaker triggered: %d consecutive timeouts, stopping conntrack collection", consecutiveTimeouts) - return true - } - - return false +func (c *conntrackCollector) Describe(ch chan<- *prometheus.Desc) { + ch <- c.desc } -// checkMemoryPressure checks if we're under memory pressure and triggers GC if needed -func checkMemoryPressure() { - var m runtime.MemStats - runtime.ReadMemStats(&m) - - // Calculate memory usage percentage - memoryUsage := float64(m.Alloc) / float64(m.Sys) - - if memoryUsage > memoryPressureThreshold { - // Only log if enough time has passed since last log - if time.Since(lastMemoryPressureLog) > memoryPressureLogCooldown { - log.Printf("Memory pressure detected: %.2f%% usage, triggering GC", memoryUsage*100) - lastMemoryPressureLog = time.Now() - } - runtime.GC() - } else if memoryUsage > memoryCleanupThreshold { - // Aggressive cleanup at 70% usage - runtime.GC() - } -} - -// shouldSampleEntry determines if we should sample an entry based on zone size -func shouldSampleEntry(zoneTotalCount int, entryIndex int) bool { - if zoneTotalCount <= maxEntriesPerZone { - // For small zones, collect all entries - return true - } - - if zoneTotalCount > 1000000 { - // For very large zones (>1M), use statistical sampling - return rand.Float64() < sampleRateForLargeZones - } - - // For medium zones, collect first maxEntriesPerZone entries - return entryIndex < maxEntriesPerZone -} - -// checkCPUTime checks if we're exceeding CPU time limits -func checkCPUTime(startTime time.Time) bool { - elapsed := time.Since(startTime) - if elapsed > maxCPUTimePerCollection { - log.Printf("CPU time limit exceeded: %v elapsed, continuing with sampling", elapsed) - return true - } - return false -} - -// collectConntrackWithTimeout safely collects conntrack data with timeout protection -func (c *ConntrackCollector) collectConntrackWithTimeout(ctx context.Context, threshold int) (map[uint16]*ovsnl.ZoneStats, error) { - // Check circuit breaker first - if checkCircuitBreaker() { - log.Printf("Circuit breaker active, skipping conntrack collection") - return make(map[uint16]*ovsnl.ZoneStats), nil - } - - var result map[uint16]*ovsnl.ZoneStats - var err error - var mu sync.Mutex - var wg sync.WaitGroup - - // Create a timeout context - timeoutCtx, cancel := context.WithTimeout(ctx, conntrackTimeout) - defer cancel() - - // Start collection in a goroutine - wg.Add(1) - go func() { - defer wg.Done() - defer func() { - if r := recover(); r != nil { - log.Printf("Panic in conntrack collection: %v", r) - err = fmt.Errorf("panic in conntrack collection: %v", r) - } - }() - - // Try streaming first, fallback to regular - if c.listZoneStats != nil { - result, err = c.listZoneStats(timeoutCtx, threshold) - } else { - // This case should ideally not be reached if listZoneStats is always set - err = fmt.Errorf("no listZoneStats function available") - } - - mu.Lock() - defer mu.Unlock() - }() - - // Wait for completion or timeout - done := make(chan struct{}) - go func() { - wg.Wait() - close(done) - }() - - select { - case <-done: - mu.Lock() - defer mu.Unlock() - // Reset timeout counter on success - consecutiveTimeouts = 0 - return result, err - case <-timeoutCtx.Done(): - // Track timeout - consecutiveTimeouts++ - lastTimeoutTime = time.Now() - log.Printf("Conntrack collection timed out after %v (timeout #%d), returning partial results", conntrackTimeout, consecutiveTimeouts) - // Force cleanup before returning - runtime.GC() - // Return empty result instead of error to prevent metric collection failure - return make(map[uint16]*ovsnl.ZoneStats), nil - } -} - -func (c *ConntrackCollector) Describe(ch chan<- *prometheus.Desc) { - ch <- c.Count - ch <- c.Performance -} - -func (c *ConntrackCollector) Collect(ch chan<- prometheus.Metric) { - startTime := time.Now() - ctx := context.Background() - - // Check memory pressure before starting - checkMemoryPressure() - - // Emergency shutdown if memory pressure is too high - var m runtime.MemStats - runtime.ReadMemStats(&m) - memoryUsage := float64(m.Alloc) / float64(m.Sys) - if memoryUsage > 0.85 { // 85% threshold for emergency shutdown - log.Printf("Emergency shutdown: memory usage %.2f%% too high, skipping conntrack collection", memoryUsage*100) - // Return basic metrics only +func (c *conntrackCollector) Collect(ch chan<- prometheus.Metric) { + if c.agg == nil { + log.Printf("No aggregator available, emitting zero metric") ch <- prometheus.MustNewConstMetric( - c.Count, + c.desc, prometheus.GaugeValue, - 0.0, - "emergency", "shutdown", "0", + 0, + "unknown", "unknown", ) return } - // Collect performance stats first (lightweight operation) - if c.getStats != nil { - if stats, err := c.getStats(); err == nil { + snapshot := c.agg.Snapshot() + for zone, marks := range snapshot { + for mark, count := range marks { ch <- prometheus.MustNewConstMetric( - c.Performance, + c.desc, prometheus.GaugeValue, - float64(stats.TotalFound), - "found", + float64(count), + fmt.Sprintf("%d", zone), + fmt.Sprintf("%d", mark), ) - ch <- prometheus.MustNewConstMetric( - c.Performance, - prometheus.GaugeValue, - float64(stats.TotalInvalid), - "invalid", - ) - ch <- prometheus.MustNewConstMetric( - c.Performance, - prometheus.GaugeValue, - float64(stats.TotalIgnore), - "ignore", - ) - ch <- prometheus.MustNewConstMetric( - c.Performance, - prometheus.GaugeValue, - float64(stats.TotalInsert), - "insert", - ) - ch <- prometheus.MustNewConstMetric( - c.Performance, - prometheus.GaugeValue, - float64(stats.TotalInsertFailed), - "insert_failed", - ) - ch <- prometheus.MustNewConstMetric( - c.Performance, - prometheus.GaugeValue, - float64(stats.TotalDrop), - "drop", - ) - ch <- prometheus.MustNewConstMetric( - c.Performance, - prometheus.GaugeValue, - float64(stats.TotalEarlyDrop), - "early_drop", - ) - ch <- prometheus.MustNewConstMetric( - c.Performance, - prometheus.GaugeValue, - float64(stats.TotalError), - "error", - ) - ch <- prometheus.MustNewConstMetric( - c.Performance, - prometheus.GaugeValue, - float64(stats.TotalSearchRestart), - "search_restart", - ) - } else { - log.Printf("Failed to collect conntrack performance stats: %v", err) - } - } - - // Check memory pressure again before heavy operation - checkMemoryPressure() - - // Collect zone statistics with timeout protection - stats, err := c.collectConntrackWithTimeout(ctx, zoneThreshold) - - if err != nil { - log.Printf("Failed to collect conntrack entries: %v", err) - // Force cleanup on error - runtime.GC() - // Return a zero metric to indicate the collector is working but no data - ch <- prometheus.MustNewConstMetric( - c.Count, - prometheus.GaugeValue, - 0.0, - "unknown", "unknown", "0", - ) - return - } - - // Process zones using event-driven aggregator data - // This is much more efficient than the old sampling approach - for zone, zoneStats := range stats { - // Always emit total count for the zone (this is critical!) - ch <- prometheus.MustNewConstMetric( - c.Count, - prometheus.GaugeValue, - float64(zoneStats.TotalCount), - fmt.Sprint(zone), - "total", - "0", - ) - } - - // OPTIONAL: emit per-mark counts using the aggregator directly. - // This avoids storing per-entry slices and stays O(unique marks). - if aggClient, ok := any(c).(*ConntrackCollectorWithAggAccessor); ok { - zm := aggClient.SnapshotFunc() // <- we'll show how to plumb this accessor next - // To avoid high-cardinality explosion, you can cap marks per zone: - const maxMarksPerZone = 2000 // tune for your environment - for zone, markMap := range zm { - emitted := 0 - for mark, cnt := range markMap { - if emitted >= maxMarksPerZone { - break - } - ch <- prometheus.MustNewConstMetric( - c.Count, - prometheus.GaugeValue, - float64(cnt), - fmt.Sprint(zone), "total", fmt.Sprint(mark), - ) - emitted++ - } } } - - // Log collection time - elapsed := time.Since(startTime) - log.Printf("Conntrack collection completed in %v", elapsed) } diff --git a/internal/ovsexporter/ovsexporter.go b/internal/ovsexporter/ovsexporter.go index 2ae1a4f..d1531c9 100644 --- a/internal/ovsexporter/ovsexporter.go +++ b/internal/ovsexporter/ovsexporter.go @@ -6,10 +6,11 @@ package ovsexporter import ( - "context" "log" "sync" + // "time" + "github.com/digitalocean/go-openvswitch/ovsnl" "github.com/prometheus/client_golang/prometheus" ) @@ -25,6 +26,7 @@ type collector struct { conntrackEnabled bool } +// Make sure collector implements prometheus.Collector var _ prometheus.Collector = &collector{} // New creates a new Prometheus collector which collects metrics using the @@ -34,50 +36,35 @@ func New(c *ovsnl.Client) prometheus.Collector { newDatapathCollector(c.Datapath.List), } - // When you build the collector in New(...): - var snapshot func() map[uint16]map[uint32]int - if c.Agg != nil { - snapshot = c.Agg.Snapshot + // Start zone/mark aggregator + svc, err := ovsnl.NewConntrackService() + if err != nil { + log.Printf("Warning: Conntrack service not available: %v", err) + return &collector{cs: collectors} } - base := newConntrackCollector( - // listZoneStats: - func(ctx context.Context, threshold int) (map[uint16]*ovsnl.ZoneStats, error) { - if c.Agg == nil { - return map[uint16]*ovsnl.ZoneStats{}, nil - } - zm := c.Agg.Snapshot() - - out := make(map[uint16]*ovsnl.ZoneStats, len(zm)) - for zone, marks := range zm { - total := 0 - for _, cnt := range marks { - total += cnt - } - // Always include the zone (so "total" time series is complete). - zs := &ovsnl.ZoneStats{TotalCount: total} - // No per-entry slice to avoid memory. - // If you still want per-mark metrics, do it in Collect directly using zm. - out[zone] = zs - _ = threshold // threshold is not used here; you can still filter if desired. - } - return out, nil - }, - // getStats: Disabled due to multicast connection issues - nil, // This will skip stats collection entirely - ) - conntrackCollector := &ConntrackCollectorWithAggAccessor{ - ConntrackCollector: base.(*ConntrackCollector), - SnapshotFunc: snapshot, + + agg, err := ovsnl.NewZoneMarkAggregator(svc) + if err != nil { + log.Printf("Warning: Failed to create zone/mark aggregator: %v", err) + return &collector{cs: collectors} } + //TODO : To confirm if we absolutely need this, can omit if eventual consistency is ok - if c.Conntrack == nil { - log.Printf("Warning: Conntrack service not available; metrics disabled.") + // if err := agg.PrimeSnapshot(context.Background(), 0); err != nil { + // log.Printf("Warning: Failed to prime snapshot: %v", err) + // } + if err := agg.Start(); err != nil { + log.Printf("Warning: Failed to start zone/mark aggregator: %v", err) } else { - collectors = append(collectors, conntrackCollector) - log.Printf("Conntrack collector enabled (event-driven)") + log.Printf("Conntrack zone/mark aggregator started") } - return &collector{cs: collectors, conntrackEnabled: true} + collectors = append(collectors, newConntrackCollector(agg)) + + return &collector{ + cs: collectors, + conntrackEnabled: true, + } } // Describe implements prometheus.Collector. diff --git a/vendor/modules.txt b/vendor/modules.txt deleted file mode 100644 index d76209f..0000000 --- a/vendor/modules.txt +++ /dev/null @@ -1,113 +0,0 @@ -# github.com/beorn7/perks v1.0.1 -## explicit; go 1.11 -github.com/beorn7/perks/quantile -# github.com/cespare/xxhash/v2 v2.1.1 -## explicit; go 1.11 -github.com/cespare/xxhash/v2 -# github.com/digitalocean/go-openvswitch v0.0.0-20201214180534-ce0f183468d8 => ../go-openvswitch -## explicit; go 1.23.0 -github.com/digitalocean/go-openvswitch/ovsnl -github.com/digitalocean/go-openvswitch/ovsnl/internal/ovsh -# github.com/golang/protobuf v1.4.3 -## explicit; go 1.9 -github.com/golang/protobuf/proto -github.com/golang/protobuf/ptypes -github.com/golang/protobuf/ptypes/any -github.com/golang/protobuf/ptypes/duration -github.com/golang/protobuf/ptypes/timestamp -# github.com/google/go-cmp v0.7.0 -## explicit; go 1.21 -github.com/google/go-cmp/cmp -github.com/google/go-cmp/cmp/internal/diff -github.com/google/go-cmp/cmp/internal/flags -github.com/google/go-cmp/cmp/internal/function -github.com/google/go-cmp/cmp/internal/value -# github.com/josharian/native v1.1.0 -## explicit; go 1.13 -github.com/josharian/native -# github.com/matttproud/golang_protobuf_extensions v1.0.1 -## explicit -github.com/matttproud/golang_protobuf_extensions/pbutil -# github.com/mdlayher/genetlink v1.0.0 -## explicit; go 1.13 -github.com/mdlayher/genetlink -# github.com/mdlayher/netlink v1.7.2 -## explicit; go 1.18 -github.com/mdlayher/netlink -github.com/mdlayher/netlink/nlenc -# github.com/mdlayher/socket v0.5.1 -## explicit; go 1.20 -github.com/mdlayher/socket -# github.com/pkg/errors v0.9.1 -## explicit -github.com/pkg/errors -# github.com/prometheus/client_golang v1.9.0 -## explicit; go 1.11 -github.com/prometheus/client_golang/prometheus -github.com/prometheus/client_golang/prometheus/internal -github.com/prometheus/client_golang/prometheus/promhttp -# github.com/prometheus/client_model v0.2.0 -## explicit; go 1.9 -github.com/prometheus/client_model/go -# github.com/prometheus/common v0.15.0 -## explicit; go 1.11 -github.com/prometheus/common/expfmt -github.com/prometheus/common/internal/bitbucket.org/ww/goautoneg -github.com/prometheus/common/model -# github.com/prometheus/procfs v0.2.0 -## explicit; go 1.12 -github.com/prometheus/procfs -github.com/prometheus/procfs/internal/fs -github.com/prometheus/procfs/internal/util -# github.com/prometheus/prometheus v2.2.1-0.20180315085919-58e2a31db8de+incompatible -## explicit -github.com/prometheus/prometheus/util/promlint -# github.com/ti-mo/conntrack v0.5.2 -## explicit; go 1.23.0 -github.com/ti-mo/conntrack -# github.com/ti-mo/netfilter v0.5.3 -## explicit; go 1.23.0 -github.com/ti-mo/netfilter -# golang.org/x/net v0.39.0 -## explicit; go 1.23.0 -golang.org/x/net/bpf -# golang.org/x/sync v0.14.0 -## explicit; go 1.23.0 -golang.org/x/sync/errgroup -# golang.org/x/sys v0.35.0 -## explicit; go 1.23.0 -golang.org/x/sys/unix -golang.org/x/sys/windows -# google.golang.org/protobuf v1.23.0 -## explicit; go 1.9 -google.golang.org/protobuf/encoding/prototext -google.golang.org/protobuf/encoding/protowire -google.golang.org/protobuf/internal/descfmt -google.golang.org/protobuf/internal/descopts -google.golang.org/protobuf/internal/detrand -google.golang.org/protobuf/internal/encoding/defval -google.golang.org/protobuf/internal/encoding/messageset -google.golang.org/protobuf/internal/encoding/tag -google.golang.org/protobuf/internal/encoding/text -google.golang.org/protobuf/internal/errors -google.golang.org/protobuf/internal/fieldnum -google.golang.org/protobuf/internal/fieldsort -google.golang.org/protobuf/internal/filedesc -google.golang.org/protobuf/internal/filetype -google.golang.org/protobuf/internal/flags -google.golang.org/protobuf/internal/genname -google.golang.org/protobuf/internal/impl -google.golang.org/protobuf/internal/mapsort -google.golang.org/protobuf/internal/pragma -google.golang.org/protobuf/internal/set -google.golang.org/protobuf/internal/strs -google.golang.org/protobuf/internal/version -google.golang.org/protobuf/proto -google.golang.org/protobuf/reflect/protoreflect -google.golang.org/protobuf/reflect/protoregistry -google.golang.org/protobuf/runtime/protoiface -google.golang.org/protobuf/runtime/protoimpl -google.golang.org/protobuf/types/known/anypb -google.golang.org/protobuf/types/known/durationpb -google.golang.org/protobuf/types/known/timestamppb -# github.com/digitalocean/go-openvswitch => ../go-openvswitch From 79cd030ed3724080247650a31d3913808658dbd3 Mon Sep 17 00:00:00 2001 From: sgangopadhyay Date: Mon, 22 Sep 2025 20:43:44 +0530 Subject: [PATCH 04/19] conntrack destroy rate limiting part 1 --- internal/ovsexporter/ovsexporter.go | 33 +++++++++++++++++++---------- 1 file changed, 22 insertions(+), 11 deletions(-) diff --git a/internal/ovsexporter/ovsexporter.go b/internal/ovsexporter/ovsexporter.go index d1531c9..5cfa9c9 100644 --- a/internal/ovsexporter/ovsexporter.go +++ b/internal/ovsexporter/ovsexporter.go @@ -24,6 +24,7 @@ type collector struct { mu sync.Mutex cs []prometheus.Collector conntrackEnabled bool + agg *ovsnl.ZoneMarkAggregator } // Make sure collector implements prometheus.Collector @@ -36,34 +37,32 @@ func New(c *ovsnl.Client) prometheus.Collector { newDatapathCollector(c.Datapath.List), } - // Start zone/mark aggregator - svc, err := ovsnl.NewConntrackService() - if err != nil { - log.Printf("Warning: Conntrack service not available: %v", err) + // Create the aggregator using the client's ConntrackService + if c.Conntrack == nil { + log.Printf("Warning: Conntrack service not available in client") return &collector{cs: collectors} } - agg, err := ovsnl.NewZoneMarkAggregator(svc) + agg, err := ovsnl.NewZoneMarkAggregator(c.Conntrack) if err != nil { log.Printf("Warning: Failed to create zone/mark aggregator: %v", err) return &collector{cs: collectors} } - //TODO : To confirm if we absolutely need this, can omit if eventual consistency is ok - // if err := agg.PrimeSnapshot(context.Background(), 0); err != nil { - // log.Printf("Warning: Failed to prime snapshot: %v", err) - // } + // Start the aggregator if err := agg.Start(); err != nil { log.Printf("Warning: Failed to start zone/mark aggregator: %v", err) - } else { - log.Printf("Conntrack zone/mark aggregator started") + return &collector{cs: collectors} } + log.Printf("Enhanced conntrack zone/mark aggregator started with adaptive sync") + collectors = append(collectors, newConntrackCollector(agg)) return &collector{ cs: collectors, conntrackEnabled: true, + agg: agg, } } @@ -86,3 +85,15 @@ func (c *collector) Collect(ch chan<- prometheus.Metric) { cc.Collect(ch) } } + +// Close cleans up resources +func (c *collector) Close() { + c.mu.Lock() + defer c.mu.Unlock() + + if c.agg != nil { + log.Printf("Stopping conntrack aggregator...") + c.agg.Stop() + c.agg = nil + } +} From f4b2a6b211e5a94936939b39a7d778fee1fd6047 Mon Sep 17 00:00:00 2001 From: sgangopadhyay Date: Fri, 10 Oct 2025 12:01:28 +0530 Subject: [PATCH 05/19] refactor --- .github/workflows/go.yml | 4 ++-- go.mod | 13 +++++-------- go.sum | 26 ++++++++------------------ internal/ovsexporter/conntrack.go | 19 ++++++++----------- internal/ovsexporter/ovsexporter.go | 11 ++--------- 5 files changed, 25 insertions(+), 48 deletions(-) diff --git a/.github/workflows/go.yml b/.github/workflows/go.yml index 85cf4ec..46ef838 100644 --- a/.github/workflows/go.yml +++ b/.github/workflows/go.yml @@ -14,9 +14,9 @@ jobs: - uses: actions/checkout@v2 - name: Set up Go - uses: actions/setup-go@v2 + uses: actions/setup-go@v6 with: - go-version: 1.15 + go-version: 1.25 - name: Get tooling run: | diff --git a/go.mod b/go.mod index 49f73b0..3949bbf 100644 --- a/go.mod +++ b/go.mod @@ -1,8 +1,6 @@ module github.com/digitalocean/openvswitch_exporter -go 1.23.0 - -toolchain go1.24.2 +go 1.25 require ( github.com/digitalocean/go-openvswitch v0.0.0-20201214180534-ce0f183468d8 @@ -15,10 +13,9 @@ require ( github.com/cespare/xxhash/v2 v2.1.1 // indirect github.com/golang/protobuf v1.4.3 // indirect github.com/google/go-cmp v0.7.0 // indirect - github.com/josharian/native v1.1.0 // indirect github.com/matttproud/golang_protobuf_extensions v1.0.1 // indirect - github.com/mdlayher/genetlink v1.0.0 // indirect - github.com/mdlayher/netlink v1.7.2 // indirect + github.com/mdlayher/genetlink v1.3.2 // indirect + github.com/mdlayher/netlink v1.8.0 // indirect github.com/mdlayher/socket v0.5.1 // indirect github.com/pkg/errors v0.9.1 // indirect github.com/prometheus/client_model v0.2.0 // indirect @@ -26,9 +23,9 @@ require ( github.com/prometheus/procfs v0.2.0 // indirect github.com/ti-mo/conntrack v0.5.2 // indirect github.com/ti-mo/netfilter v0.5.3 // indirect - golang.org/x/net v0.39.0 // indirect + golang.org/x/net v0.45.0 // indirect golang.org/x/sync v0.14.0 // indirect - golang.org/x/sys v0.35.0 // indirect + golang.org/x/sys v0.37.0 // indirect google.golang.org/protobuf v1.23.0 // indirect ) diff --git a/go.sum b/go.sum index d03edde..96a616e 100644 --- a/go.sum +++ b/go.sum @@ -129,10 +129,7 @@ github.com/inconshreveable/mousetrap v1.0.0/go.mod h1:PxqpIevigyE2G7u3NXJIT2ANyt github.com/influxdata/influxdb1-client v0.0.0-20191209144304-8bf82d3c094d/go.mod h1:qj24IKcXYK6Iy9ceXlo3Tc+vtHo9lIhSX5JddghvEPo= github.com/jmespath/go-jmespath v0.0.0-20180206201540-c2b33e8439af/go.mod h1:Nht3zPeWKUH0NzdCt2Blrr5ys8VGpn0CEB0cQHVjt7k= github.com/jonboulle/clockwork v0.1.0/go.mod h1:Ii8DK3G1RaLaWxj9trq07+26W01tbo22gdxWY5EU2bo= -github.com/josharian/native v1.1.0 h1:uuaP0hAbW7Y4l0ZRQ6C9zfb7Mg1mbFKry/xzDAfmtLA= -github.com/josharian/native v1.1.0/go.mod h1:7X/raswPFr05uY3HiLlYeyQntB6OO7E/d2Cu7qoaN2w= github.com/jpillora/backoff v1.0.0/go.mod h1:J/6gKK9jxlEcS3zixgDgUAsiuZ7yrSoa/FX5e0EB2j4= -github.com/jsimonetti/rtnetlink v0.0.0-20190606172950-9527aa82566a/go.mod h1:Oz+70psSo5OFh8DBl0Zv2ACw7Esh6pPUphlvZG9x7uw= github.com/json-iterator/go v1.1.6/go.mod h1:+SdeFBvtyEkXs7REEP0seUULqWtbJapLOCVDaaPEHmU= github.com/json-iterator/go v1.1.7/go.mod h1:KdQUCv79m/52Kvf8AW2vK1V8akMuk1QjK/uOdHXbAo4= github.com/json-iterator/go v1.1.8/go.mod h1:KdQUCv79m/52Kvf8AW2vK1V8akMuk1QjK/uOdHXbAo4= @@ -157,12 +154,10 @@ github.com/mattn/go-isatty v0.0.4/go.mod h1:M+lRXTBqGeGNdLjl/ufCoiOlB5xdOkqRJdNx github.com/mattn/go-runewidth v0.0.2/go.mod h1:LwmH8dsx7+W8Uxz3IHJYH5QSwggIsqBzpuz5H//U1FU= github.com/matttproud/golang_protobuf_extensions v1.0.1 h1:4hp9jkHxhMHkqkrB3Ix0jegS5sx/RkqARlsWZ6pIwiU= github.com/matttproud/golang_protobuf_extensions v1.0.1/go.mod h1:D8He9yQNgCq6Z5Ld7szi9bcBfOoFv/3dc6xSMkL2PC0= -github.com/mdlayher/genetlink v1.0.0 h1:OoHN1OdyEIkScEmRgxLEe2M9U8ClMytqA5niynLtfj0= -github.com/mdlayher/genetlink v1.0.0/go.mod h1:0rJ0h4itni50A86M2kHcgS85ttZazNt7a8H2a2cw0Gc= -github.com/mdlayher/netlink v0.0.0-20190409211403-11939a169225/go.mod h1:eQB3mZE4aiYnlUsyGGCOpPETfdQq4Jhsgf1fk3cwQaA= -github.com/mdlayher/netlink v1.0.0/go.mod h1:KxeJAFOFLG6AjpyDkQ/iIhxygIUKD+vcwqcnu43w/+M= -github.com/mdlayher/netlink v1.7.2 h1:/UtM3ofJap7Vl4QWCPDGXY8d3GIY2UGSDbK+QWmY8/g= -github.com/mdlayher/netlink v1.7.2/go.mod h1:xraEF7uJbxLhc5fpHL4cPe221LI2bdttWlU+ZGLfQSw= +github.com/mdlayher/genetlink v1.3.2 h1:KdrNKe+CTu+IbZnm/GVUMXSqBBLqcGpRDa0xkQy56gw= +github.com/mdlayher/genetlink v1.3.2/go.mod h1:tcC3pkCrPUGIKKsCsp0B3AdaaKuHtaxoJRz3cc+528o= +github.com/mdlayher/netlink v1.8.0 h1:e7XNIYJKD7hUct3Px04RuIGJbBxy1/c4nX7D5YyvvlM= +github.com/mdlayher/netlink v1.8.0/go.mod h1:UhgKXUlDQhzb09DrCl2GuRNEglHmhYoWAHid9HK3594= github.com/mdlayher/socket v0.5.1 h1:VZaqt6RkGkt2OE9l3GcC6nZkqD3xKeQLyfleW/uBcos= github.com/mdlayher/socket v0.5.1/go.mod h1:TjPLHI1UgwEv5J1B5q0zTZq12A/6H7nKmtTanQE37IQ= github.com/miekg/dns v1.0.14/go.mod h1:W1PPwlIAgtquWBMBEV9nkV9Cazfe8ScdGz/Lj7v3Nrg= @@ -324,11 +319,9 @@ golang.org/x/net v0.0.0-20190603091049-60506f45cf65/go.mod h1:HSz+uSET+XFnRR8LxR golang.org/x/net v0.0.0-20190613194153-d28f0bde5980/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= golang.org/x/net v0.0.0-20190813141303-74dc4d7220e7/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= -golang.org/x/net v0.0.0-20190827160401-ba9fcec4b297/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= -golang.org/x/net v0.0.0-20191007182048-72f939374954/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= golang.org/x/net v0.0.0-20200625001655-4c5254603344/go.mod h1:/O7V0waA8r7cgGh81Ro3o1hOxt32SMVPicZroKQ2sZA= -golang.org/x/net v0.39.0 h1:ZCu7HMWDxpXpaiKdhzIfaltL9Lp31x/3fCP11bc6/fY= -golang.org/x/net v0.39.0/go.mod h1:X7NRbYVEA+ewNkCNyJ513WmMdQ3BineSwVtN2zD/d+E= +golang.org/x/net v0.45.0 h1:RLBg5JKixCy82FtLJpeNlVM0nrSqpCRYzVU1n8kj0tM= +golang.org/x/net v0.45.0/go.mod h1:ECOoLqd5U3Lhyeyo/QDCEVQ4sNgYsqvCZ722XogGieY= golang.org/x/oauth2 v0.0.0-20180821212333-d2e6202438be/go.mod h1:N/0e6XlmueqKjAGxoOufVs8QHGRruUQn6yWY3a++T0U= golang.org/x/oauth2 v0.0.0-20190226205417-e64efc72b421/go.mod h1:gOpvHmFTYa4IltrdGE7lF6nIHvwfUNPOp7c8zoXwtLw= golang.org/x/sync v0.0.0-20180314180146-1d60e4601c6f/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= @@ -348,22 +341,19 @@ golang.org/x/sys v0.0.0-20181107165924-66b7b1311ac8/go.mod h1:STP8DvDyc/dI5b8T5h golang.org/x/sys v0.0.0-20181116152217-5ac8a444bdc5/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= golang.org/x/sys v0.0.0-20181122145206-62eef0e2fa9b/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= -golang.org/x/sys v0.0.0-20190312061237-fead79001313/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= -golang.org/x/sys v0.0.0-20190411185658-b44545bcd369/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20190422165155-953cdadca894/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20190502145724-3ef323f4f1fd/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20190726091711-fc99dfbffb4e/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20190826190057-c7b8b68b1456/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= -golang.org/x/sys v0.0.0-20191008105621-543471e840be/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20191220142924-d4481acd189f/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20200106162015-b016eb3dc98e/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20200323222414-85ca7c5b95cd/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20200615200032-f1bc736245b1/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20200625212154-ddb9806d33ae/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20201214210602-f9fddec55a1e/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= -golang.org/x/sys v0.35.0 h1:vz1N37gP5bs89s7He8XuIYXpyY0+QlsKmzipCbUtyxI= -golang.org/x/sys v0.35.0/go.mod h1:BJP2sWEmIv4KK5OTEluFJCKSidICx8ciO85XgH3Ak8k= +golang.org/x/sys v0.37.0 h1:fdNQudmxPjkdUTPnLn5mdQv7Zwvbvpaxqs831goi9kQ= +golang.org/x/sys v0.37.0/go.mod h1:OgkHotnGiDImocRcuBABYBEXf8A9a87e/uXjp9XT3ks= golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= golang.org/x/text v0.3.2/go.mod h1:bEr9sfX3Q8Zfm5fL9x+3itogRgK3+ptLWKqgva+5dAk= golang.org/x/time v0.0.0-20180412165947-fbb02b2291d2/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ= diff --git a/internal/ovsexporter/conntrack.go b/internal/ovsexporter/conntrack.go index f6f4938..15cd610 100644 --- a/internal/ovsexporter/conntrack.go +++ b/internal/ovsexporter/conntrack.go @@ -16,7 +16,6 @@ type conntrackCollector struct { // ConntrackCollectorWithAggAccessor wraps the existing collector with access to the aggregator snapshot type ConntrackCollectorWithAggAccessor struct { *conntrackCollector - SnapshotFunc func() map[uint16]map[uint32]int } func newConntrackCollector(agg *ovsnl.ZoneMarkAggregator) prometheus.Collector { @@ -48,15 +47,13 @@ func (c *conntrackCollector) Collect(ch chan<- prometheus.Metric) { } snapshot := c.agg.Snapshot() - for zone, marks := range snapshot { - for mark, count := range marks { - ch <- prometheus.MustNewConstMetric( - c.desc, - prometheus.GaugeValue, - float64(count), - fmt.Sprintf("%d", zone), - fmt.Sprintf("%d", mark), - ) - } + for key, count := range snapshot { + ch <- prometheus.MustNewConstMetric( + c.desc, + prometheus.GaugeValue, + float64(count), + fmt.Sprintf("%d", key.Zone), + fmt.Sprintf("%d", key.Mark), + ) } } diff --git a/internal/ovsexporter/ovsexporter.go b/internal/ovsexporter/ovsexporter.go index 5cfa9c9..a00f7bb 100644 --- a/internal/ovsexporter/ovsexporter.go +++ b/internal/ovsexporter/ovsexporter.go @@ -37,13 +37,8 @@ func New(c *ovsnl.Client) prometheus.Collector { newDatapathCollector(c.Datapath.List), } - // Create the aggregator using the client's ConntrackService - if c.Conntrack == nil { - log.Printf("Warning: Conntrack service not available in client") - return &collector{cs: collectors} - } - - agg, err := ovsnl.NewZoneMarkAggregator(c.Conntrack) + // Create the aggregator + agg, err := ovsnl.NewZoneMarkAggregator() if err != nil { log.Printf("Warning: Failed to create zone/mark aggregator: %v", err) return &collector{cs: collectors} @@ -55,8 +50,6 @@ func New(c *ovsnl.Client) prometheus.Collector { return &collector{cs: collectors} } - log.Printf("Enhanced conntrack zone/mark aggregator started with adaptive sync") - collectors = append(collectors, newConntrackCollector(agg)) return &collector{ From b98892f3d690105712acb5fc41453e00ff5954f7 Mon Sep 17 00:00:00 2001 From: sgangopadhyay Date: Mon, 27 Oct 2025 19:49:28 +0530 Subject: [PATCH 06/19] aggregating metrics for conntrack count per zone --- .github/workflows/go.yml | 15 +- go.mod | 10 +- go.sum | 14 +- internal/conntrack/aggregator_linux.go | 370 +++++++++++++++++++++++++ internal/conntrack/aggregator_stub.go | 42 +++ internal/conntrack/types.go | 68 +++++ internal/ovsexporter/conntrack.go | 6 +- internal/ovsexporter/ovsexporter.go | 15 +- 8 files changed, 513 insertions(+), 27 deletions(-) create mode 100644 internal/conntrack/aggregator_linux.go create mode 100644 internal/conntrack/aggregator_stub.go create mode 100644 internal/conntrack/types.go diff --git a/.github/workflows/go.yml b/.github/workflows/go.yml index 46ef838..30a5f52 100644 --- a/.github/workflows/go.yml +++ b/.github/workflows/go.yml @@ -12,6 +12,13 @@ jobs: runs-on: ubuntu-latest steps: - uses: actions/checkout@v2 + with: + path: openvswitch_exporter + + - uses: actions/checkout@v2 + with: + repository: digitalocean/go-openvswitch + path: go-openvswitch - name: Set up Go uses: actions/setup-go@v6 @@ -22,18 +29,24 @@ jobs: run: | go get golang.org/x/lint/golint go get honnef.co/go/tools/cmd/staticcheck + working-directory: openvswitch_exporter - name: Build run: go build -v -tags=gofuzz ./... + working-directory: openvswitch_exporter - name: vet run: go vet ./... + working-directory: openvswitch_exporter - name: staticcheck run: staticcheck ./... + working-directory: openvswitch_exporter - name: lint run: golint -set_exit_status ./cmd/... ./internal/... + working-directory: openvswitch_exporter - name: Test - run: go test -v -race ./... \ No newline at end of file + run: go test -v -race ./... + working-directory: openvswitch_exporter \ No newline at end of file diff --git a/go.mod b/go.mod index 3949bbf..fe1d4ea 100644 --- a/go.mod +++ b/go.mod @@ -6,6 +6,9 @@ require ( github.com/digitalocean/go-openvswitch v0.0.0-20201214180534-ce0f183468d8 github.com/prometheus/client_golang v1.9.0 github.com/prometheus/prometheus v2.2.1-0.20180315085919-58e2a31db8de+incompatible + github.com/ti-mo/conntrack v0.6.0 + github.com/ti-mo/netfilter v0.5.3 + golang.org/x/sync v0.17.0 ) require ( @@ -21,12 +24,9 @@ require ( github.com/prometheus/client_model v0.2.0 // indirect github.com/prometheus/common v0.15.0 // indirect github.com/prometheus/procfs v0.2.0 // indirect - github.com/ti-mo/conntrack v0.5.2 // indirect - github.com/ti-mo/netfilter v0.5.3 // indirect - golang.org/x/net v0.45.0 // indirect - golang.org/x/sync v0.14.0 // indirect + golang.org/x/net v0.46.0 // indirect golang.org/x/sys v0.37.0 // indirect google.golang.org/protobuf v1.23.0 // indirect ) -replace github.com/digitalocean/go-openvswitch => ../go-openvswitch +// replace github.com/digitalocean/go-openvswitch => ../go-openvswitch diff --git a/go.sum b/go.sum index 96a616e..5ec1a5c 100644 --- a/go.sum +++ b/go.sum @@ -43,6 +43,8 @@ github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSs github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/dgrijalva/jwt-go v3.2.0+incompatible/go.mod h1:E3ru+11k8xSBh+hMPgOLZmtrrCbhqsmaPHjLKYnJCaQ= +github.com/digitalocean/go-openvswitch v0.0.0-20201214180534-ce0f183468d8 h1:RQAD2flP6n+U5sAudMpru+EuLJ6VQduu6yenl6LwM5E= +github.com/digitalocean/go-openvswitch v0.0.0-20201214180534-ce0f183468d8/go.mod h1:MpzfscrezUxa94/T4sy2tDaxB+hQ6w0EmRBPv+xHWEs= github.com/dustin/go-humanize v0.0.0-20171111073723-bb3d318650d4/go.mod h1:HtrtbFcZ19U5GC7JDqmcUSB87Iq5E25KnS6fMYU6eOk= github.com/eapache/go-resiliency v1.1.0/go.mod h1:kFI+JgMyC7bLPUVY133qvEBtVayf5mFgVsvEsIPBvNs= github.com/eapache/go-xerial-snappy v0.0.0-20180814174437-776d5712da21/go.mod h1:+020luEh2TKB4/GOp8oxxtq0Daoen/Cii55CzbTV6DU= @@ -266,8 +268,8 @@ github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UV github.com/stretchr/testify v1.4.0/go.mod h1:j7eGeouHqKxXV5pUuKE4zz7dFj8WfuZ+81PSLYec5m4= github.com/stretchr/testify v1.8.4 h1:CcVxjf3Q8PM0mHUKJCdn+eZZtm5yQwehR5yeSVQQcUk= github.com/stretchr/testify v1.8.4/go.mod h1:sz/lmYIOXD/1dqDmKjjqLyZ2RngseejIcXlSw2iwfAo= -github.com/ti-mo/conntrack v0.5.2 h1:PQ7MCdFjniEiTJT+qsAysREUsT5iH62/VNyhkB06HOI= -github.com/ti-mo/conntrack v0.5.2/go.mod h1:4HZrFQQLOSuBzgQNid3H/wYyyp1kfGXUYxueXjIGibo= +github.com/ti-mo/conntrack v0.6.0 h1:laiW2+dzKyS2u0aVr6FeRQs+v7cj4t7q+twolL/ZkjQ= +github.com/ti-mo/conntrack v0.6.0/go.mod h1:4HZrFQQLOSuBzgQNid3H/wYyyp1kfGXUYxueXjIGibo= github.com/ti-mo/netfilter v0.5.3 h1:ikzduvnaUMwre5bhbNwWOd6bjqLMVb33vv0XXbK0xGQ= github.com/ti-mo/netfilter v0.5.3/go.mod h1:08SyBCg6hu1qyQk4s3DjjJKNrm3RTb32nm6AzyT972E= github.com/tmc/grpc-websocket-proxy v0.0.0-20170815181823-89b8d40f7ca8/go.mod h1:ncp9v5uamzpCO7NfCPTXjqaC+bZgJeR0sMTm6dMHP7U= @@ -320,8 +322,8 @@ golang.org/x/net v0.0.0-20190613194153-d28f0bde5980/go.mod h1:z5CRVTTTmAJ677TzLL golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= golang.org/x/net v0.0.0-20190813141303-74dc4d7220e7/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= golang.org/x/net v0.0.0-20200625001655-4c5254603344/go.mod h1:/O7V0waA8r7cgGh81Ro3o1hOxt32SMVPicZroKQ2sZA= -golang.org/x/net v0.45.0 h1:RLBg5JKixCy82FtLJpeNlVM0nrSqpCRYzVU1n8kj0tM= -golang.org/x/net v0.45.0/go.mod h1:ECOoLqd5U3Lhyeyo/QDCEVQ4sNgYsqvCZ722XogGieY= +golang.org/x/net v0.46.0 h1:giFlY12I07fugqwPuWJi68oOnpfqFnJIJzaIIm2JVV4= +golang.org/x/net v0.46.0/go.mod h1:Q9BGdFy1y4nkUwiLvT5qtyhAnEHgnQ/zd8PfU6nc210= golang.org/x/oauth2 v0.0.0-20180821212333-d2e6202438be/go.mod h1:N/0e6XlmueqKjAGxoOufVs8QHGRruUQn6yWY3a++T0U= golang.org/x/oauth2 v0.0.0-20190226205417-e64efc72b421/go.mod h1:gOpvHmFTYa4IltrdGE7lF6nIHvwfUNPOp7c8zoXwtLw= golang.org/x/sync v0.0.0-20180314180146-1d60e4601c6f/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= @@ -330,8 +332,8 @@ golang.org/x/sync v0.0.0-20181221193216-37e7f081c4d4/go.mod h1:RxMgew5VJxzue5/jJ golang.org/x/sync v0.0.0-20190227155943-e225da77a7e6/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20190911185100-cd5d95a43a6e/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= -golang.org/x/sync v0.14.0 h1:woo0S4Yywslg6hp4eUFjTVOyKt0RookbpAHG4c1HmhQ= -golang.org/x/sync v0.14.0/go.mod h1:1dzgHSNfp02xaA81J2MS99Qcpr2w7fw1gpm99rleRqA= +golang.org/x/sync v0.17.0 h1:l60nONMj9l5drqw6jlhIELNv9I0A4OFgRsG9k2oT9Ug= +golang.org/x/sync v0.17.0/go.mod h1:9KTHXmSnoGruLpwFjVSX0lNNA75CykiMECbovNTZqGI= golang.org/x/sys v0.0.0-20180823144017-11551d06cbcc/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= golang.org/x/sys v0.0.0-20180830151530-49385e6e1522/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= golang.org/x/sys v0.0.0-20180905080454-ebe1bf3edb33/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= diff --git a/internal/conntrack/aggregator_linux.go b/internal/conntrack/aggregator_linux.go new file mode 100644 index 0000000..8e1fafd --- /dev/null +++ b/internal/conntrack/aggregator_linux.go @@ -0,0 +1,370 @@ +// Copyright 2017 DigitalOcean. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +//go:build linux + +package conntrack + +import ( + "fmt" + "log" + "time" + + "github.com/ti-mo/conntrack" + "github.com/ti-mo/netfilter" +) + +// +// Conntrack aggregator with bounded ingestion + DESTROY aggregation +// to handle massive bursts of conntrack DESTROY events without OOMing. +// + +// NewZoneMarkAggregator creates a new aggregator with its own listening connection. +func NewZoneMarkAggregator() (*ZoneMarkAggregator, error) { + + // Create a separate connection for listening to events + listenCli, err := conntrack.Dial(nil) + if err != nil { + return nil, fmt.Errorf("failed to create listening connection: %w", err) + } + + if err := listenCli.SetReadBuffer(64 * 1024 * 1024); err != nil { // 64MB buffer for 1.4M events/sec + log.Printf("Warning: Failed to set read buffer size: %v", err) + } + if err := listenCli.SetWriteBuffer(64 * 1024 * 1024); err != nil { // 64MB buffer for 1.4M events/sec + log.Printf("Warning: Failed to set write buffer size: %v", err) + } + + a := &ZoneMarkAggregator{ + counts: make(map[ZoneMarkKey]int), + listenCli: listenCli, + stopCh: make(chan struct{}), + eventsCh: make(chan conntrack.Event, eventChanSize), + destroyDeltas: make(map[ZoneMarkKey]int), + lastEventTime: time.Now(), + lastHealthCheck: time.Now(), + } + + return a, nil +} + +// Start subscribes to NEW/DESTROY/UPDATE events and maintains counts with bounded ingestion. +func (a *ZoneMarkAggregator) Start() error { + + if err := a.startEventListener(); err != nil { + return err + } + + for i := 0; i < eventWorkerCount; i++ { + a.wg.Go(func() error { + a.eventWorker() + return nil + }) + } + + a.wg.Go(func() error { + a.destroyFlusher() + return nil + }) + + a.wg.Go(func() error { + a.startHealthMonitoring() + return nil + }) + + return nil +} + +// startEventListener handles real-time conntrack events, pushing into bounded eventsCh. +func (a *ZoneMarkAggregator) startEventListener() error { + libEvents := make(chan conntrack.Event, 8192) + groups := []netfilter.NetlinkGroup{ + netfilter.GroupCTNew, + netfilter.GroupCTDestroy, + netfilter.GroupCTUpdate, + } + + errCh, err := a.listenCli.Listen(libEvents, 10, groups) + if err != nil { + return fmt.Errorf("failed to listen to conntrack events: %w", err) + } + + a.wg.Go(func() error { + eventCount := int64(0) + rateWindow := make([]time.Time, 0, 100) + + for { + select { + case <-a.stopCh: + log.Printf("Stopping lib->bounded relay after %d lib events", eventCount) + return nil + case e := <-errCh: + if e != nil { + log.Printf("conntrack listener error: %v", e) + a.missedEvents.Add(1) + } + case ev := <-libEvents: + select { + case a.eventsCh <- ev: + eventCount++ + a.eventCount.Store(eventCount) + a.lastEventTime = time.Now() + + rateWindow = append(rateWindow, a.lastEventTime) + if len(rateWindow) > 100 { + rateWindow = rateWindow[1:] + } + if len(rateWindow) > 1 { + duration := rateWindow[len(rateWindow)-1].Sub(rateWindow[0]) + if duration > 0 { + a.eventRate = float64(len(rateWindow)-1) / duration.Seconds() + } + } + default: + a.missedEvents.Add(1) + if a.missedEvents.Load()%100 == 0 { + log.Printf("Warning: eventsCh full, missedEvents=%d", a.missedEvents.Load()) + } + } + } + } + }) + + return nil +} + +// eventWorker consumes events from eventsCh and handles them +func (a *ZoneMarkAggregator) eventWorker() { + + for { + select { + case <-a.stopCh: + return + case ev := <-a.eventsCh: + a.handleEvent(ev) + } + } +} + +// handleEvent processes a single event. +func (a *ZoneMarkAggregator) handleEvent(ev conntrack.Event) { + f := ev.Flow + key := ZoneMarkKey{Zone: f.Zone, Mark: f.Mark} + + if ev.Type == conntrack.EventNew { + a.countsMu.Lock() + defer a.countsMu.Unlock() + a.counts[key]++ + return + } + + if ev.Type == conntrack.EventDestroy { + a.deltaMu.Lock() + defer a.deltaMu.Unlock() + if len(a.destroyDeltas) < destroyDeltaCap { + a.destroyDeltas[key]++ + if len(a.destroyDeltas) > 50000 { // If we have >50K deltas, flush immediately + deltas := a.destroyDeltas + a.destroyDeltas = make(map[ZoneMarkKey]int) + // Acquire countsMu while still holding deltaMu to maintain lock ordering + a.countsMu.Lock() + defer a.countsMu.Unlock() + // Apply deltas immediately to minimize lag during extreme load + a.applyDeltasImmediatelyUnsafe(deltas) + return + } + // Log every 1000 DESTROY events to verify they're being received + if len(a.destroyDeltas)%1000 == 0 { + log.Printf("DESTROY events: %d entries in destroyDeltas (zone=%d, mark=%d)", len(a.destroyDeltas), key.Zone, key.Mark) + } + } else { + a.missedEvents.Add(1) + if a.missedEvents.Load()%dropsWarnThreshold == 0 { + log.Printf("Warning: destroyDeltas saturated (size=%d). missedEvents=%d", len(a.destroyDeltas), a.missedEvents.Load()) + } + } + return + } +} + +// applyDeltasImmediatelyUnsafe applies deltas immediately to minimize lag during extreme load +// This method assumes countsMu is already held by the caller +func (a *ZoneMarkAggregator) applyDeltasImmediatelyUnsafe(deltas map[ZoneMarkKey]int) { + for k, cnt := range deltas { + existing, ok := a.counts[k] + if !ok { + a.missedEvents.Add(int64(cnt)) + continue + } + if existing <= cnt { + delete(a.counts, k) + } else { + a.counts[k] = existing - cnt + } + } +} + +// destroyFlusher periodically applies the aggregated DESTROY deltas into counts +// Uses adaptive flushing: more frequent during high event rates for minimal lag +func (a *ZoneMarkAggregator) destroyFlusher() { + ticker := time.NewTicker(destroyFlushIntvl) + defer ticker.Stop() + + for { + select { + case <-a.stopCh: + log.Printf("Destroy flusher stopping, final flush...") + a.flushDestroyDeltas() + return + case <-ticker.C: + // Adaptive flushing: flush more frequently during high event rates + a.countsMu.RLock() + eventRate := a.eventRate + a.countsMu.RUnlock() + + if eventRate > 500000 { // Very high event rate (>500K/sec) + // Flush immediately and reset ticker for faster interval + a.flushDestroyDeltas() + ticker.Reset(50 * time.Millisecond) // 50ms during extreme load + } else if eventRate > 100000 { // High event rate (>100K/sec) + a.flushDestroyDeltas() + ticker.Reset(100 * time.Millisecond) // 100ms during high load + } else if eventRate > 10000 { // Medium event rate (>10K/sec) + a.flushDestroyDeltas() + ticker.Reset(200 * time.Millisecond) // 200ms during medium load + } else { + // Normal flush + a.flushDestroyDeltas() + ticker.Reset(destroyFlushIntvl) // Back to normal interval + } + } + } +} + +// flushDestroyDeltas atomically swaps the delta map and applies decrements +func (a *ZoneMarkAggregator) flushDestroyDeltas() { + // First acquire deltaMu to check and swap deltas + a.deltaMu.Lock() + defer a.deltaMu.Unlock() + if len(a.destroyDeltas) == 0 { + return + } + deltas := a.destroyDeltas + a.destroyDeltas = make(map[ZoneMarkKey]int) + + // Now acquire countsMu while still holding deltaMu to ensure atomicity + a.countsMu.Lock() + defer a.countsMu.Unlock() + + for k, cnt := range deltas { + existing, ok := a.counts[k] + if !ok { + a.missedEvents.Add(int64(cnt)) + continue + } + if existing <= cnt { + delete(a.counts, k) + } else { + a.counts[k] = existing - cnt + } + } +} + +// Snapshot returns a safe copy of counts. +func (a *ZoneMarkAggregator) Snapshot() map[ZoneMarkKey]int { + a.flushDestroyDeltas() + a.countsMu.RLock() + defer a.countsMu.RUnlock() + + out := make(map[ZoneMarkKey]int, len(a.counts)) + for k, c := range a.counts { + if c > 0 { + out[k] = c + } + } + return out +} + +// startHealthMonitoring periodically logs aggregator health +func (a *ZoneMarkAggregator) startHealthMonitoring() { + ticker := time.NewTicker(5 * time.Minute) + defer ticker.Stop() + + for { + select { + case <-a.stopCh: + return + case <-ticker.C: + a.performHealthCheck() + } + } +} + +func (a *ZoneMarkAggregator) performHealthCheck() { + missed := a.missedEvents.Load() + + if missed > dropsWarnThreshold { + if err := a.RestartListener(); err != nil { + log.Printf("Health check: RestartListener failed: %v", err) + } else { + a.missedEvents.Store(0) + log.Printf("Health check: Listener restarted successfully") + } + } + a.lastHealthCheck = time.Now() +} + +// Stop cancels listening and closes the connection. +func (a *ZoneMarkAggregator) Stop() { + close(a.stopCh) + a.wg.Wait() // Wait for all goroutines to exit cleanly + if a.listenCli != nil { + if err := a.listenCli.Close(); err != nil { + log.Printf("Error closing listenCli during cleanup: %v", err) + } + } + a.flushDestroyDeltas() +} + +// RestartListener attempts to restart the conntrack event listener +func (a *ZoneMarkAggregator) RestartListener() error { + a.listenerMu.Lock() + defer a.listenerMu.Unlock() + + // Signal all goroutines to stop by closing stopCh + close(a.stopCh) + + // Close the old connection to help goroutines exit faster + if a.listenCli != nil { + if err := a.listenCli.Close(); err != nil { + log.Printf("Warning: Error closing old listener connection: %v", err) + } + } + + // Wait for all goroutines to exit cleanly + a.wg.Wait() + + // Create a new stopCh for the restarted goroutines + a.stopCh = make(chan struct{}) + + // Create new connection + listenCli, err := conntrack.Dial(nil) + if err != nil { + return fmt.Errorf("failed to create new listening connection: %w", err) + } + a.listenCli = listenCli + + // Start new listener with fresh goroutines + return a.startEventListener() +} diff --git a/internal/conntrack/aggregator_stub.go b/internal/conntrack/aggregator_stub.go new file mode 100644 index 0000000..78ecc0d --- /dev/null +++ b/internal/conntrack/aggregator_stub.go @@ -0,0 +1,42 @@ +// Copyright 2017 DigitalOcean. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +//go:build !linux + +package conntrack + +import "fmt" + +// NewZoneMarkAggregator returns an error on non-Linux platforms +func NewZoneMarkAggregator() (*ZoneMarkAggregator, error) { + return nil, fmt.Errorf("conntrack aggregator is only supported on Linux") +} + +// Start is a no-op on non-Linux platforms +func (a *ZoneMarkAggregator) Start() error { + return fmt.Errorf("conntrack aggregator is only supported on Linux") +} + +// Stop is a no-op on non-Linux platforms +func (a *ZoneMarkAggregator) Stop() {} + +// Snapshot returns an empty map on non-Linux platforms +func (a *ZoneMarkAggregator) Snapshot() map[ZoneMarkKey]int { + return make(map[ZoneMarkKey]int) +} + +// RestartListener returns an error on non-Linux platforms +func (a *ZoneMarkAggregator) RestartListener() error { + return fmt.Errorf("conntrack aggregator is only supported on Linux") +} diff --git a/internal/conntrack/types.go b/internal/conntrack/types.go new file mode 100644 index 0000000..e1dfe39 --- /dev/null +++ b/internal/conntrack/types.go @@ -0,0 +1,68 @@ +// Copyright 2017 DigitalOcean. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package conntrack + +import ( + "sync" + "sync/atomic" + "time" + + "github.com/ti-mo/conntrack" + "golang.org/x/sync/errgroup" +) + +// Tunables - adjust for your environment +const ( + eventChanSize = 512 * 1024 + eventWorkerCount = 100 + destroyFlushIntvl = 100 * time.Millisecond // flush aggregated DESTROYs every 100ms for minimal lag + destroyDeltaCap = 200000 // maximum distinct (zone,mark) entries in destroyDeltas + dropsWarnThreshold = 10000 // threshold of missedEvents to log a stronger warning +) + +// ZoneMarkAggregator keeps live counts (zmKey -> count) with bounded ingestion +type ZoneMarkAggregator struct { + // primary counts (zmKey -> count) - simplified flat mapping + counts map[ZoneMarkKey]int + countsMu sync.RWMutex + eventRate float64 + + // conntrack listening connection + listenCli *conntrack.Conn + listenerMu sync.Mutex // Protects listener restart operations + + // lifecycle + stopCh chan struct{} + wg errgroup.Group + + // bounded event ingestion + eventsCh chan conntrack.Event + + // aggregated DESTROY deltas (bounded by destroyDeltaCap) + deltaMu sync.Mutex + destroyDeltas map[ZoneMarkKey]int + + // metrics / health + eventCount atomic.Int64 + lastEventTime time.Time + missedEvents atomic.Int64 + lastHealthCheck time.Time +} + +// ZoneMarkKey is a compact key for (zone,mark) +type ZoneMarkKey struct { + Zone uint16 + Mark uint32 +} diff --git a/internal/ovsexporter/conntrack.go b/internal/ovsexporter/conntrack.go index 15cd610..bc6696b 100644 --- a/internal/ovsexporter/conntrack.go +++ b/internal/ovsexporter/conntrack.go @@ -4,13 +4,13 @@ import ( "fmt" "log" - "github.com/digitalocean/go-openvswitch/ovsnl" + "github.com/digitalocean/openvswitch_exporter/internal/conntrack" "github.com/prometheus/client_golang/prometheus" ) type conntrackCollector struct { desc *prometheus.Desc - agg *ovsnl.ZoneMarkAggregator + agg *conntrack.ZoneMarkAggregator } // ConntrackCollectorWithAggAccessor wraps the existing collector with access to the aggregator snapshot @@ -18,7 +18,7 @@ type ConntrackCollectorWithAggAccessor struct { *conntrackCollector } -func newConntrackCollector(agg *ovsnl.ZoneMarkAggregator) prometheus.Collector { +func newConntrackCollector(agg *conntrack.ZoneMarkAggregator) prometheus.Collector { return &conntrackCollector{ desc: prometheus.NewDesc( prometheus.BuildFQName(namespace, "conntrack", "count"), diff --git a/internal/ovsexporter/ovsexporter.go b/internal/ovsexporter/ovsexporter.go index a00f7bb..df605ec 100644 --- a/internal/ovsexporter/ovsexporter.go +++ b/internal/ovsexporter/ovsexporter.go @@ -9,9 +9,8 @@ import ( "log" "sync" - // "time" - "github.com/digitalocean/go-openvswitch/ovsnl" + "github.com/digitalocean/openvswitch_exporter/internal/conntrack" "github.com/prometheus/client_golang/prometheus" ) @@ -24,7 +23,6 @@ type collector struct { mu sync.Mutex cs []prometheus.Collector conntrackEnabled bool - agg *ovsnl.ZoneMarkAggregator } // Make sure collector implements prometheus.Collector @@ -34,11 +32,11 @@ var _ prometheus.Collector = &collector{} // input Open vSwitch generic netlink client. func New(c *ovsnl.Client) prometheus.Collector { collectors := []prometheus.Collector{ - newDatapathCollector(c.Datapath.List), + // newDatapathCollector(c.Datapath.List), } // Create the aggregator - agg, err := ovsnl.NewZoneMarkAggregator() + agg, err := conntrack.NewZoneMarkAggregator() if err != nil { log.Printf("Warning: Failed to create zone/mark aggregator: %v", err) return &collector{cs: collectors} @@ -55,7 +53,6 @@ func New(c *ovsnl.Client) prometheus.Collector { return &collector{ cs: collectors, conntrackEnabled: true, - agg: agg, } } @@ -83,10 +80,4 @@ func (c *collector) Collect(ch chan<- prometheus.Metric) { func (c *collector) Close() { c.mu.Lock() defer c.mu.Unlock() - - if c.agg != nil { - log.Printf("Stopping conntrack aggregator...") - c.agg.Stop() - c.agg = nil - } } From 901ac81bab53c73776b0d44e14f61a7be1eff3ed Mon Sep 17 00:00:00 2001 From: sgangopadhyay Date: Mon, 27 Oct 2025 19:53:41 +0530 Subject: [PATCH 07/19] trying to fix the static check issue --- .github/workflows/go.yml | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/.github/workflows/go.yml b/.github/workflows/go.yml index 30a5f52..524c620 100644 --- a/.github/workflows/go.yml +++ b/.github/workflows/go.yml @@ -25,10 +25,13 @@ jobs: with: go-version: 1.25 - - name: Get tooling + - name: Install tooling run: | - go get golang.org/x/lint/golint - go get honnef.co/go/tools/cmd/staticcheck + # Install linting / analysis tools with explicit versions (go get no longer installs binaries) + go install honnef.co/go/tools/cmd/staticcheck@v0.5.1 + go install golang.org/x/lint/golint@latest + # Ensure GOPATH/bin is on PATH for subsequent steps (actions/setup-go usually does this, but we enforce it) + echo "$(go env GOPATH)/bin" >> "$GITHUB_PATH" working-directory: openvswitch_exporter - name: Build From a4ae2cdc2a62a1dfc9afe2b0856c799a4dbeac38 Mon Sep 17 00:00:00 2001 From: shrouti1995 Date: Mon, 27 Oct 2025 20:18:37 +0530 Subject: [PATCH 08/19] solve gha issue * trying to fix the static check issue * trying to fix the static check issue --- .github/workflows/go.yml | 23 +++++------------------ go.mod | 2 -- internal/ovsexporter/ovsexporter_test.go | 4 ++-- 3 files changed, 7 insertions(+), 22 deletions(-) diff --git a/.github/workflows/go.yml b/.github/workflows/go.yml index 524c620..fd15e8c 100644 --- a/.github/workflows/go.yml +++ b/.github/workflows/go.yml @@ -12,13 +12,6 @@ jobs: runs-on: ubuntu-latest steps: - uses: actions/checkout@v2 - with: - path: openvswitch_exporter - - - uses: actions/checkout@v2 - with: - repository: digitalocean/go-openvswitch - path: go-openvswitch - name: Set up Go uses: actions/setup-go@v6 @@ -27,29 +20,23 @@ jobs: - name: Install tooling run: | - # Install linting / analysis tools with explicit versions (go get no longer installs binaries) - go install honnef.co/go/tools/cmd/staticcheck@v0.5.1 - go install golang.org/x/lint/golint@latest - # Ensure GOPATH/bin is on PATH for subsequent steps (actions/setup-go usually does this, but we enforce it) + # Install analysis tools. staticcheck @latest for Go 1.24 compatibility. + go install honnef.co/go/tools/cmd/staticcheck@latest + # golint is deprecated; keep temporarily (will remove in follow-up) + go install golang.org/x/lint/golint@latest || echo "golint install failed (deprecated)" echo "$(go env GOPATH)/bin" >> "$GITHUB_PATH" - working-directory: openvswitch_exporter - name: Build run: go build -v -tags=gofuzz ./... - working-directory: openvswitch_exporter - name: vet run: go vet ./... - working-directory: openvswitch_exporter - name: staticcheck run: staticcheck ./... - working-directory: openvswitch_exporter - name: lint run: golint -set_exit_status ./cmd/... ./internal/... - working-directory: openvswitch_exporter - name: Test - run: go test -v -race ./... - working-directory: openvswitch_exporter \ No newline at end of file + run: go test -v -race ./... \ No newline at end of file diff --git a/go.mod b/go.mod index fe1d4ea..5caef53 100644 --- a/go.mod +++ b/go.mod @@ -28,5 +28,3 @@ require ( golang.org/x/sys v0.37.0 // indirect google.golang.org/protobuf v1.23.0 // indirect ) - -// replace github.com/digitalocean/go-openvswitch => ../go-openvswitch diff --git a/internal/ovsexporter/ovsexporter_test.go b/internal/ovsexporter/ovsexporter_test.go index 39e08be..c140af3 100644 --- a/internal/ovsexporter/ovsexporter_test.go +++ b/internal/ovsexporter/ovsexporter_test.go @@ -5,7 +5,7 @@ package ovsexporter import ( "bytes" - "io/ioutil" + "io" "net/http" "net/http/httptest" "testing" @@ -33,7 +33,7 @@ func testCollector(t *testing.T, collector prometheus.Collector) []byte { } defer resp.Body.Close() - buf, err := ioutil.ReadAll(resp.Body) + buf, err := io.ReadAll(resp.Body) if err != nil { t.Fatalf("failed to read server response: %v", err) } From 836243bfee0e7546541b34252f9e0f70332ec931 Mon Sep 17 00:00:00 2001 From: sgangopadhyay Date: Mon, 27 Oct 2025 20:25:16 +0530 Subject: [PATCH 09/19] uncomment code --- internal/ovsexporter/ovsexporter.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/internal/ovsexporter/ovsexporter.go b/internal/ovsexporter/ovsexporter.go index df605ec..dc453c5 100644 --- a/internal/ovsexporter/ovsexporter.go +++ b/internal/ovsexporter/ovsexporter.go @@ -32,7 +32,7 @@ var _ prometheus.Collector = &collector{} // input Open vSwitch generic netlink client. func New(c *ovsnl.Client) prometheus.Collector { collectors := []prometheus.Collector{ - // newDatapathCollector(c.Datapath.List), + newDatapathCollector(c.Datapath.List), } // Create the aggregator From ed8d85171fb554a975ba6ce530cf10c547ff4479 Mon Sep 17 00:00:00 2001 From: shrouti1995 Date: Tue, 28 Oct 2025 13:28:41 +0530 Subject: [PATCH 10/19] Adding test cases * adding test cases * adding mock tests * Context-Based Cancellation Refactoring * add centralised error propagation * solve lint error --- internal/conntrack/aggregator_linux.go | 92 ++++++++------ internal/conntrack/aggregator_linux_test.go | 116 ++++++++++++++++++ internal/conntrack/aggregator_stub.go | 42 ------- internal/conntrack/mock.go | 120 ++++++++++++++++++ internal/conntrack/types.go | 17 ++- internal/ovsexporter/conntrack.go | 6 +- internal/ovsexporter/conntrack_mock_test.go | 128 ++++++++++++++++++++ internal/ovsexporter/conntrack_test.go | 64 ++++++++++ 8 files changed, 500 insertions(+), 85 deletions(-) create mode 100644 internal/conntrack/aggregator_linux_test.go delete mode 100644 internal/conntrack/aggregator_stub.go create mode 100644 internal/conntrack/mock.go create mode 100644 internal/ovsexporter/conntrack_mock_test.go create mode 100644 internal/ovsexporter/conntrack_test.go diff --git a/internal/conntrack/aggregator_linux.go b/internal/conntrack/aggregator_linux.go index 8e1fafd..6a8a54f 100644 --- a/internal/conntrack/aggregator_linux.go +++ b/internal/conntrack/aggregator_linux.go @@ -17,6 +17,7 @@ package conntrack import ( + "context" "fmt" "log" "time" @@ -46,10 +47,12 @@ func NewZoneMarkAggregator() (*ZoneMarkAggregator, error) { log.Printf("Warning: Failed to set write buffer size: %v", err) } + ctx, cancel := context.WithCancel(context.Background()) a := &ZoneMarkAggregator{ counts: make(map[ZoneMarkKey]int), listenCli: listenCli, - stopCh: make(chan struct{}), + ctx: ctx, + cancel: cancel, eventsCh: make(chan conntrack.Event, eventChanSize), destroyDeltas: make(map[ZoneMarkKey]int), lastEventTime: time.Now(), @@ -68,19 +71,16 @@ func (a *ZoneMarkAggregator) Start() error { for i := 0; i < eventWorkerCount; i++ { a.wg.Go(func() error { - a.eventWorker() - return nil + return a.eventWorker(a.ctx) }) } a.wg.Go(func() error { - a.destroyFlusher() - return nil + return a.destroyFlusher(a.ctx) }) a.wg.Go(func() error { - a.startHealthMonitoring() - return nil + return a.startHealthMonitoring(a.ctx) }) return nil @@ -95,7 +95,7 @@ func (a *ZoneMarkAggregator) startEventListener() error { netfilter.GroupCTUpdate, } - errCh, err := a.listenCli.Listen(libEvents, 10, groups) + errCh, err := a.listenCli.Listen(libEvents, 50, groups) if err != nil { return fmt.Errorf("failed to listen to conntrack events: %w", err) } @@ -106,9 +106,9 @@ func (a *ZoneMarkAggregator) startEventListener() error { for { select { - case <-a.stopCh: + case <-a.ctx.Done(): log.Printf("Stopping lib->bounded relay after %d lib events", eventCount) - return nil + return a.ctx.Err() case e := <-errCh: if e != nil { log.Printf("conntrack listener error: %v", e) @@ -145,20 +145,22 @@ func (a *ZoneMarkAggregator) startEventListener() error { } // eventWorker consumes events from eventsCh and handles them -func (a *ZoneMarkAggregator) eventWorker() { - +func (a *ZoneMarkAggregator) eventWorker(ctx context.Context) error { for { select { - case <-a.stopCh: - return + case <-ctx.Done(): + return ctx.Err() case ev := <-a.eventsCh: - a.handleEvent(ev) + if err := a.handleEvent(ev); err != nil { + log.Printf("Error handling event: %v", err) + // Continue processing other events, but log the error + } } } } // handleEvent processes a single event. -func (a *ZoneMarkAggregator) handleEvent(ev conntrack.Event) { +func (a *ZoneMarkAggregator) handleEvent(ev conntrack.Event) error { f := ev.Flow key := ZoneMarkKey{Zone: f.Zone, Mark: f.Mark} @@ -166,7 +168,7 @@ func (a *ZoneMarkAggregator) handleEvent(ev conntrack.Event) { a.countsMu.Lock() defer a.countsMu.Unlock() a.counts[key]++ - return + return nil } if ev.Type == conntrack.EventDestroy { @@ -182,7 +184,7 @@ func (a *ZoneMarkAggregator) handleEvent(ev conntrack.Event) { defer a.countsMu.Unlock() // Apply deltas immediately to minimize lag during extreme load a.applyDeltasImmediatelyUnsafe(deltas) - return + return nil } // Log every 1000 DESTROY events to verify they're being received if len(a.destroyDeltas)%1000 == 0 { @@ -194,8 +196,10 @@ func (a *ZoneMarkAggregator) handleEvent(ev conntrack.Event) { log.Printf("Warning: destroyDeltas saturated (size=%d). missedEvents=%d", len(a.destroyDeltas), a.missedEvents.Load()) } } - return + return nil } + + return nil } // applyDeltasImmediatelyUnsafe applies deltas immediately to minimize lag during extreme load @@ -217,16 +221,16 @@ func (a *ZoneMarkAggregator) applyDeltasImmediatelyUnsafe(deltas map[ZoneMarkKey // destroyFlusher periodically applies the aggregated DESTROY deltas into counts // Uses adaptive flushing: more frequent during high event rates for minimal lag -func (a *ZoneMarkAggregator) destroyFlusher() { +func (a *ZoneMarkAggregator) destroyFlusher(ctx context.Context) error { ticker := time.NewTicker(destroyFlushIntvl) defer ticker.Stop() for { select { - case <-a.stopCh: + case <-ctx.Done(): log.Printf("Destroy flusher stopping, final flush...") a.flushDestroyDeltas() - return + return ctx.Err() case <-ticker.C: // Adaptive flushing: flush more frequently during high event rates a.countsMu.RLock() @@ -297,38 +301,54 @@ func (a *ZoneMarkAggregator) Snapshot() map[ZoneMarkKey]int { } // startHealthMonitoring periodically logs aggregator health -func (a *ZoneMarkAggregator) startHealthMonitoring() { +func (a *ZoneMarkAggregator) startHealthMonitoring(ctx context.Context) error { ticker := time.NewTicker(5 * time.Minute) defer ticker.Stop() for { select { - case <-a.stopCh: - return + case <-ctx.Done(): + return ctx.Err() case <-ticker.C: - a.performHealthCheck() + if err := a.performHealthCheck(); err != nil { + log.Printf("Health check error: %v", err) + // Continue monitoring even if health check fails + } } } } -func (a *ZoneMarkAggregator) performHealthCheck() { +func (a *ZoneMarkAggregator) performHealthCheck() error { missed := a.missedEvents.Load() if missed > dropsWarnThreshold { if err := a.RestartListener(); err != nil { log.Printf("Health check: RestartListener failed: %v", err) - } else { - a.missedEvents.Store(0) - log.Printf("Health check: Listener restarted successfully") + return fmt.Errorf("failed to restart listener: %w", err) } + a.missedEvents.Store(0) + log.Printf("Health check: Listener restarted successfully") } a.lastHealthCheck = time.Now() + return nil +} + +// GetError returns any error from the errgroup if available +func (a *ZoneMarkAggregator) GetError() error { + // This is a non-blocking way to check if there are any errors + // The actual error handling happens in Stop() + return nil } // Stop cancels listening and closes the connection. func (a *ZoneMarkAggregator) Stop() { - close(a.stopCh) - a.wg.Wait() // Wait for all goroutines to exit cleanly + a.cancel() // Cancel the context to signal all goroutines to stop + + // Wait for all goroutines to exit and check for errors + if err := a.wg.Wait(); err != nil { + log.Printf("Error from goroutine group: %v", err) + } + if a.listenCli != nil { if err := a.listenCli.Close(); err != nil { log.Printf("Error closing listenCli during cleanup: %v", err) @@ -342,8 +362,8 @@ func (a *ZoneMarkAggregator) RestartListener() error { a.listenerMu.Lock() defer a.listenerMu.Unlock() - // Signal all goroutines to stop by closing stopCh - close(a.stopCh) + // Signal all goroutines to stop by canceling the context + a.cancel() // Close the old connection to help goroutines exit faster if a.listenCli != nil { @@ -355,8 +375,8 @@ func (a *ZoneMarkAggregator) RestartListener() error { // Wait for all goroutines to exit cleanly a.wg.Wait() - // Create a new stopCh for the restarted goroutines - a.stopCh = make(chan struct{}) + // Create a new context for the restarted goroutines + a.ctx, a.cancel = context.WithCancel(context.Background()) // Create new connection listenCli, err := conntrack.Dial(nil) diff --git a/internal/conntrack/aggregator_linux_test.go b/internal/conntrack/aggregator_linux_test.go new file mode 100644 index 0000000..6043f66 --- /dev/null +++ b/internal/conntrack/aggregator_linux_test.go @@ -0,0 +1,116 @@ +// Copyright 2017 DigitalOcean. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +//go:build linux + +package conntrack + +import ( + "testing" +) + +func TestZoneMarkAggregator(t *testing.T) { + // Test aggregator creation + agg, err := NewZoneMarkAggregator() + if err != nil { + // This is expected to fail in test environment due to permission requirements + t.Logf("Expected failure in test environment: NewZoneMarkAggregator() error = %v", err) + return + } + + if agg == nil { + t.Fatal("NewZoneMarkAggregator() returned nil aggregator") + } + + // Test basic methods + snapshot := agg.Snapshot() + if snapshot == nil { + t.Fatal("Snapshot() returned nil") + } + + // Clean up + t.Cleanup(agg.Stop) +} + +func TestZoneMarkAggregatorSnapshot(t *testing.T) { + // Test aggregator creation + agg, err := NewZoneMarkAggregator() + if err != nil { + // This is expected to fail in test environment due to permission requirements + t.Logf("Expected failure in test environment: NewZoneMarkAggregator() error = %v", err) + return + } + + if agg == nil { + t.Fatal("NewZoneMarkAggregator() returned nil aggregator") + } + + // Test snapshot functionality with new ZoneMarkKey-based mapping + snapshot := agg.Snapshot() + if snapshot == nil { + t.Fatal("Snapshot() returned nil") + } + + // Verify snapshot is a map[ZoneMarkKey]int + if len(snapshot) == 0 { + t.Log("Snapshot is empty (expected in test environment)") + } + + // Test that we can iterate over the snapshot + for key, count := range snapshot { + if count <= 0 { + t.Errorf("Invalid count %d for key %+v", count, key) + } + t.Logf("Zone: %d, Mark: %d, Count: %d", key.Zone, key.Mark, count) + } + + // Clean up + t.Cleanup(agg.Stop) +} + +func TestZMKeyComparison(t *testing.T) { + // Test that ZoneMarkKey works correctly as a map key + key1 := ZoneMarkKey{Zone: 1, Mark: 100} + key2 := ZoneMarkKey{Zone: 1, Mark: 100} + key3 := ZoneMarkKey{Zone: 2, Mark: 100} + key4 := ZoneMarkKey{Zone: 1, Mark: 200} + + // Test equality + if key1 != key2 { + t.Error("Identical ZoneMarkKey structs should be equal") + } + + // Test inequality + if key1 == key3 { + t.Error("Different zone ZoneMarkKey structs should not be equal") + } + if key1 == key4 { + t.Error("Different mark ZoneMarkKey structs should not be equal") + } + + // Test as map keys + testMap := make(map[ZoneMarkKey]int) + testMap[key1] = 5 + testMap[key3] = 10 + + if testMap[key1] != 5 { + t.Error("ZoneMarkKey should work as map key") + } + if testMap[key2] != 5 { + t.Error("Equal ZoneMarkKey structs should map to same value") + } + if testMap[key3] != 10 { + t.Error("Different ZoneMarkKey should map to different value") + } +} diff --git a/internal/conntrack/aggregator_stub.go b/internal/conntrack/aggregator_stub.go deleted file mode 100644 index 78ecc0d..0000000 --- a/internal/conntrack/aggregator_stub.go +++ /dev/null @@ -1,42 +0,0 @@ -// Copyright 2017 DigitalOcean. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -//go:build !linux - -package conntrack - -import "fmt" - -// NewZoneMarkAggregator returns an error on non-Linux platforms -func NewZoneMarkAggregator() (*ZoneMarkAggregator, error) { - return nil, fmt.Errorf("conntrack aggregator is only supported on Linux") -} - -// Start is a no-op on non-Linux platforms -func (a *ZoneMarkAggregator) Start() error { - return fmt.Errorf("conntrack aggregator is only supported on Linux") -} - -// Stop is a no-op on non-Linux platforms -func (a *ZoneMarkAggregator) Stop() {} - -// Snapshot returns an empty map on non-Linux platforms -func (a *ZoneMarkAggregator) Snapshot() map[ZoneMarkKey]int { - return make(map[ZoneMarkKey]int) -} - -// RestartListener returns an error on non-Linux platforms -func (a *ZoneMarkAggregator) RestartListener() error { - return fmt.Errorf("conntrack aggregator is only supported on Linux") -} diff --git a/internal/conntrack/mock.go b/internal/conntrack/mock.go new file mode 100644 index 0000000..edd88d1 --- /dev/null +++ b/internal/conntrack/mock.go @@ -0,0 +1,120 @@ +//go:build !linux +// +build !linux + +package conntrack + +import ( + "context" + "sync" + "time" +) + +// MockZoneMarkAggregator provides a mock implementation for non-Linux platforms +type MockZoneMarkAggregator struct { + *ZoneMarkAggregator + counts map[ZoneMarkKey]int + countsMu sync.RWMutex +} + +// NewZoneMarkAggregator creates a mock aggregator for testing +func NewZoneMarkAggregator() (*MockZoneMarkAggregator, error) { + ctx, cancel := context.WithCancel(context.Background()) + return &MockZoneMarkAggregator{ + ZoneMarkAggregator: &ZoneMarkAggregator{ + ctx: ctx, + cancel: cancel, + }, + counts: make(map[ZoneMarkKey]int), + }, nil +} + +// Snapshot returns a copy of the current counts +func (m *MockZoneMarkAggregator) Snapshot() map[ZoneMarkKey]int { + m.countsMu.RLock() + defer m.countsMu.RUnlock() + + snapshot := make(map[ZoneMarkKey]int) + for k, v := range m.counts { + snapshot[k] = v + } + return snapshot +} + +// Start starts the mock aggregator (no-op for mock) +func (m *MockZoneMarkAggregator) Start() error { + return nil +} + +// Stop stops the mock aggregator +func (m *MockZoneMarkAggregator) Stop() { + m.cancel() +} + +// AddEntry adds a mock entry for testing +func (m *MockZoneMarkAggregator) AddEntry(zone uint16, mark uint32) { + m.countsMu.Lock() + defer m.countsMu.Unlock() + + key := ZoneMarkKey{Zone: zone, Mark: mark} + m.counts[key]++ +} + +// RemoveEntry removes a mock entry for testing +func (m *MockZoneMarkAggregator) RemoveEntry(zone uint16, mark uint32) { + m.countsMu.Lock() + defer m.countsMu.Unlock() + + key := ZoneMarkKey{Zone: zone, Mark: mark} + if m.counts[key] > 0 { + m.counts[key]-- + if m.counts[key] == 0 { + delete(m.counts, key) + } + } +} + +// SetCount sets a specific count for testing +func (m *MockZoneMarkAggregator) SetCount(zone uint16, mark uint32, count int) { + m.countsMu.Lock() + defer m.countsMu.Unlock() + + key := ZoneMarkKey{Zone: zone, Mark: mark} + if count <= 0 { + delete(m.counts, key) + } else { + m.counts[key] = count + } +} + +// Clear clears all counts +func (m *MockZoneMarkAggregator) Clear() { + m.countsMu.Lock() + defer m.countsMu.Unlock() + + m.counts = make(map[ZoneMarkKey]int) +} + +// GetEventRate returns a mock event rate +func (m *MockZoneMarkAggregator) GetEventRate() float64 { + return 100.0 // Mock rate +} + +// GetEventCount returns a mock event count +func (m *MockZoneMarkAggregator) GetEventCount() int64 { + return int64(len(m.counts)) * 10 // Mock count +} + +// GetMissedEvents returns a mock missed events count +func (m *MockZoneMarkAggregator) GetMissedEvents() int64 { + return 0 // Mock no missed events +} + +// IsHealthy returns true for mock +func (m *MockZoneMarkAggregator) IsHealthy() bool { + return true +} + +// GetLastEventTime returns current time for mock +func (m *MockZoneMarkAggregator) GetLastEventTime() time.Time { + return time.Now() +} diff --git a/internal/conntrack/types.go b/internal/conntrack/types.go index e1dfe39..6677f6f 100644 --- a/internal/conntrack/types.go +++ b/internal/conntrack/types.go @@ -15,6 +15,7 @@ package conntrack import ( + "context" "sync" "sync/atomic" "time" @@ -27,9 +28,9 @@ import ( const ( eventChanSize = 512 * 1024 eventWorkerCount = 100 - destroyFlushIntvl = 100 * time.Millisecond // flush aggregated DESTROYs every 100ms for minimal lag - destroyDeltaCap = 200000 // maximum distinct (zone,mark) entries in destroyDeltas - dropsWarnThreshold = 10000 // threshold of missedEvents to log a stronger warning + destroyFlushIntvl = 50 * time.Millisecond // flush aggregated DESTROYs every 50ms for minimal lag + destroyDeltaCap = 200000 // maximum distinct (zone,mark) entries in destroyDeltas + dropsWarnThreshold = 10000 // threshold of missedEvents to log a stronger warning ) // ZoneMarkAggregator keeps live counts (zmKey -> count) with bounded ingestion @@ -44,7 +45,8 @@ type ZoneMarkAggregator struct { listenerMu sync.Mutex // Protects listener restart operations // lifecycle - stopCh chan struct{} + ctx context.Context + cancel context.CancelFunc wg errgroup.Group // bounded event ingestion @@ -66,3 +68,10 @@ type ZoneMarkKey struct { Zone uint16 Mark uint32 } + +// Aggregator interface defines the methods needed by the collector +type Aggregator interface { + Snapshot() map[ZoneMarkKey]int + Stop() + Start() error +} diff --git a/internal/ovsexporter/conntrack.go b/internal/ovsexporter/conntrack.go index bc6696b..5430e21 100644 --- a/internal/ovsexporter/conntrack.go +++ b/internal/ovsexporter/conntrack.go @@ -10,7 +10,7 @@ import ( type conntrackCollector struct { desc *prometheus.Desc - agg *conntrack.ZoneMarkAggregator + agg conntrack.Aggregator } // ConntrackCollectorWithAggAccessor wraps the existing collector with access to the aggregator snapshot @@ -18,10 +18,10 @@ type ConntrackCollectorWithAggAccessor struct { *conntrackCollector } -func newConntrackCollector(agg *conntrack.ZoneMarkAggregator) prometheus.Collector { +func newConntrackCollector(agg conntrack.Aggregator) prometheus.Collector { return &conntrackCollector{ desc: prometheus.NewDesc( - prometheus.BuildFQName(namespace, "conntrack", "count"), + prometheus.BuildFQName(namespace, "conntrack", "entries"), "Number of conntrack entries by zone and mark", []string{"zone", "mark"}, nil, diff --git a/internal/ovsexporter/conntrack_mock_test.go b/internal/ovsexporter/conntrack_mock_test.go new file mode 100644 index 0000000..f72339c --- /dev/null +++ b/internal/ovsexporter/conntrack_mock_test.go @@ -0,0 +1,128 @@ +//go:build !linux +// +build !linux + +package ovsexporter + +import ( + "testing" + + "github.com/digitalocean/openvswitch_exporter/internal/conntrack" +) + +func TestConntrackCollector(t *testing.T) { + // Create a mock aggregator + agg, err := conntrack.NewZoneMarkAggregator() + if err != nil { + t.Fatalf("Failed to create mock aggregator: %v", err) + } + + // Clean up aggregator after test + t.Cleanup(agg.Stop) + + // Add some test data + agg.SetCount(0, 100, 1500) + agg.SetCount(0, 200, 2500) + agg.SetCount(1, 300, 3500) + + // Create collector with mock aggregator + collector := newConntrackCollector(agg) + + // Test the collector + testCollector(t, collector) +} + +func TestConntrackCollectorWithNilAggregator(t *testing.T) { + // Test that the collector handles a nil aggregator gracefully + collector := newConntrackCollector(nil) + + // This should not panic and should emit zero metrics + testCollector(t, collector) +} + +func TestConntrackCollectorWithEmptyAggregator(t *testing.T) { + // Create an empty mock aggregator + agg, err := conntrack.NewZoneMarkAggregator() + if err != nil { + t.Fatalf("Failed to create mock aggregator: %v", err) + } + + t.Cleanup(agg.Stop) + + // Create collector with empty aggregator + collector := newConntrackCollector(agg) + + // Test the collector + testCollector(t, collector) +} + +func TestConntrackCollectorWithLargeDataset(t *testing.T) { + // Create a mock aggregator with large dataset + agg, err := conntrack.NewZoneMarkAggregator() + if err != nil { + t.Fatalf("Failed to create mock aggregator: %v", err) + } + + t.Cleanup(agg.Stop) + + // Add large dataset + // Simulate 2M entries across multiple zones + for zone := uint16(0); zone < 10; zone++ { + for mark := uint32(0); mark < 1000; mark++ { + agg.SetCount(zone, mark, int(uint32(zone)*1000+mark)) + } + } + + // Create collector + collector := newConntrackCollector(agg) + + // Test the collector + testCollector(t, collector) +} + +func TestConntrackCollectorEdgeCases(t *testing.T) { + // Test edge cases + agg, err := conntrack.NewZoneMarkAggregator() + if err != nil { + t.Fatalf("Failed to create mock aggregator: %v", err) + } + + t.Cleanup(agg.Stop) + + // Test zero values + agg.SetCount(0, 0, 0) + + // Test maximum values + agg.SetCount(65535, 4294967295, 1000000) + + // Test negative count (should be handled gracefully) + agg.SetCount(1, 1, -1) + + collector := newConntrackCollector(agg) + testCollector(t, collector) +} + +func TestConntrackCollectorConcurrency(t *testing.T) { + // Test concurrent access + agg, err := conntrack.NewZoneMarkAggregator() + if err != nil { + t.Fatalf("Failed to create mock aggregator: %v", err) + } + + t.Cleanup(agg.Stop) + + collector := newConntrackCollector(agg) + + // Test concurrent collection + done := make(chan bool, 10) + for i := 0; i < 10; i++ { + go func() { + testCollector(t, collector) + done <- true + }() + } + + // Wait for all goroutines to complete + for i := 0; i < 10; i++ { + <-done + } +} diff --git a/internal/ovsexporter/conntrack_test.go b/internal/ovsexporter/conntrack_test.go new file mode 100644 index 0000000..30d6ed7 --- /dev/null +++ b/internal/ovsexporter/conntrack_test.go @@ -0,0 +1,64 @@ +//go:build linux +// +build linux + +// Copyright 2018-2021 DigitalOcean. +// SPDX-License-Identifier: Apache-2.0 + +package ovsexporter + +import ( + "testing" + "time" + + "github.com/digitalocean/openvswitch_exporter/internal/conntrack" +) + +func TestConntrackCollector(t *testing.T) { + // Create a mock aggregator + agg, err := conntrack.NewZoneMarkAggregator() + if err != nil { + // This is expected to fail in test environment due to permission requirements + t.Logf("Expected failure in test environment: NewZoneMarkAggregator() error = %v", err) + // Test with nil aggregator to ensure collector handles gracefully + collector := newConntrackCollector(nil) + testCollector(t, collector) + return + } + + // Clean up aggregator after test + t.Cleanup(agg.Stop) + + // Create collector with real aggregator + collector := newConntrackCollector(agg) + + // Test the collector + testCollector(t, collector) +} + +func TestConntrackCollectorWithNilAggregator(t *testing.T) { + // Test that the collector handles a nil aggregator gracefully + collector := newConntrackCollector(nil) + + // This should not panic and should emit zero metrics + testCollector(t, collector) +} + +func TestConntrackCollectorWithRealData(t *testing.T) { + if testing.Short() { + t.Skip("Skipping conntrack test in short mode") + } + + // Test with real conntrack data if available + agg, err := conntrack.NewZoneMarkAggregator() + if err != nil { + t.Skipf("Skipping real data test: %v", err) + } + + t.Cleanup(agg.Stop) + + // Wait a bit for some real data to accumulate + time.Sleep(100 * time.Millisecond) + + collector := newConntrackCollector(agg) + testCollector(t, collector) +} From 78c7134482392ae8789e973aeb31bcef12ef2cad Mon Sep 17 00:00:00 2001 From: shrouti1995 Date: Tue, 28 Oct 2025 20:48:57 +0530 Subject: [PATCH 11/19] refining code * adding test cases * adding mock tests * Context-Based Cancellation Refactoring * add centralised error propagation * solve lint error * modelling test cases in table format * graceful shut down * centralised config * solve lint error --- CONNTRACK_CONFIG.md | 103 +++++ cmd/openvswitch_exporter/main.go | 61 ++- internal/conntrack/aggregator_linux.go | 69 ++- internal/conntrack/aggregator_linux_test.go | 418 ++++++++++++++--- internal/conntrack/config.go | 111 +++++ internal/conntrack/mock.go | 12 +- internal/conntrack/types.go | 14 +- internal/ovsexporter/conntrack_mock_test.go | 478 ++++++++++++++++---- internal/ovsexporter/conntrack_test.go | 367 +++++++++++++-- internal/ovsexporter/ovsexporter.go | 16 +- internal/ovsexporter/test_helpers.go | 283 ++++++++++++ 11 files changed, 1706 insertions(+), 226 deletions(-) create mode 100644 CONNTRACK_CONFIG.md create mode 100644 internal/conntrack/config.go create mode 100644 internal/ovsexporter/test_helpers.go diff --git a/CONNTRACK_CONFIG.md b/CONNTRACK_CONFIG.md new file mode 100644 index 0000000..2a2672f --- /dev/null +++ b/CONNTRACK_CONFIG.md @@ -0,0 +1,103 @@ +# Conntrack Configuration + +This document describes the configuration options available for the conntrack aggregator. + +## Environment Variables + +The conntrack aggregator can be configured using environment variables with the `CONNTRACK_` prefix: + +| Variable | Default | Description | +|----------|---------|-------------| +| `CONNTRACK_EVENT_CHAN_SIZE` | `524288` | Event channel buffer size (512KB) | +| `CONNTRACK_EVENT_WORKER_COUNT` | `100` | Number of event worker goroutines | +| `CONNTRACK_DESTROY_FLUSH_INTERVAL` | `50ms` | Interval for flushing destroy deltas | +| `CONNTRACK_DESTROY_DELTA_CAP` | `200000` | Maximum destroy delta entries | +| `CONNTRACK_DROPS_WARN_THRESHOLD` | `10000` | Threshold for missed events warning | +| `CONNTRACK_READ_BUFFER_SIZE` | `67108864` | Read buffer size (64MB) | +| `CONNTRACK_WRITE_BUFFER_SIZE` | `67108864` | Write buffer size (64MB) | +| `CONNTRACK_HEALTH_CHECK_INTERVAL` | `5m` | Health check interval | +| `CONNTRACK_GRACEFUL_TIMEOUT` | `30s` | Graceful shutdown timeout | + +## Usage Examples + +### Basic Configuration + +```bash +# Set custom buffer sizes +export CONNTRACK_EVENT_CHAN_SIZE=1048576 +export CONNTRACK_EVENT_WORKER_COUNT=200 + +# Run the exporter +./openvswitch_exporter +``` + +### High-Throughput Environment + +For environments with high conntrack event rates (>1M events/sec): + +```bash +export CONNTRACK_EVENT_CHAN_SIZE=1048576 # 1MB buffer +export CONNTRACK_EVENT_WORKER_COUNT=200 # More workers +export CONNTRACK_DESTROY_FLUSH_INTERVAL=25ms # Faster flushing +export CONNTRACK_DESTROY_DELTA_CAP=500000 # Larger delta cap +export CONNTRACK_READ_BUFFER_SIZE=134217728 # 128MB read buffer +export CONNTRACK_WRITE_BUFFER_SIZE=134217728 # 128MB write buffer +``` + +### Low-Resource Environment + +For environments with limited resources: + +```bash +export CONNTRACK_EVENT_CHAN_SIZE=65536 # 64KB buffer +export CONNTRACK_EVENT_WORKER_COUNT=50 # Fewer workers +export CONNTRACK_DESTROY_FLUSH_INTERVAL=100ms # Slower flushing +export CONNTRACK_DESTROY_DELTA_CAP=50000 # Smaller delta cap +export CONNTRACK_READ_BUFFER_SIZE=16777216 # 16MB read buffer +export CONNTRACK_WRITE_BUFFER_SIZE=16777216 # 16MB write buffer +``` + +### Development/Testing + +For development and testing: + +```bash +export CONNTRACK_GRACEFUL_TIMEOUT=5s # Faster shutdown +export CONNTRACK_HEALTH_CHECK_INTERVAL=1m # More frequent health checks +``` + +## Configuration Validation + +The configuration system includes validation: + +- **Positive values**: All numeric values must be positive +- **Valid durations**: Time values must be valid Go durations +- **Range checks**: Values are checked for reasonable ranges + +Invalid values will fall back to defaults with a warning logged. + +## Migration from Hardcoded Constants + +The following hardcoded constants have been replaced: + +| Old Constant | New Environment Variable | Default Value | +|--------------|-------------------------|---------------| +| `eventChanSize = 512 * 1024` | `CONNTRACK_EVENT_CHAN_SIZE` | `524288` | +| `eventWorkerCount = 100` | `CONNTRACK_EVENT_WORKER_COUNT` | `100` | +| `destroyFlushIntvl = 50ms` | `CONNTRACK_DESTROY_FLUSH_INTERVAL` | `50ms` | +| `destroyDeltaCap = 200000` | `CONNTRACK_DESTROY_DELTA_CAP` | `200000` | +| `dropsWarnThreshold = 10000` | `CONNTRACK_DROPS_WARN_THRESHOLD` | `10000` | +| Buffer sizes `64MB` | `CONNTRACK_READ_BUFFER_SIZE` / `WRITE_BUFFER_SIZE` | `67108864` | +| Health check `5m` | `CONNTRACK_HEALTH_CHECK_INTERVAL` | `5m` | +| Graceful timeout `30s` | `CONNTRACK_GRACEFUL_TIMEOUT` | `30s` | + +## Performance Impact + +Configuration changes can significantly impact performance: + +- **Larger buffers**: Better for high-throughput, uses more memory +- **More workers**: Better parallelism, uses more CPU +- **Faster flushing**: Lower latency, more CPU usage +- **Larger delta cap**: Handles bursts better, uses more memory + +Choose settings based on your environment's characteristics and requirements. diff --git a/cmd/openvswitch_exporter/main.go b/cmd/openvswitch_exporter/main.go index 59c93d2..ba8495d 100644 --- a/cmd/openvswitch_exporter/main.go +++ b/cmd/openvswitch_exporter/main.go @@ -5,9 +5,14 @@ package main import ( + "context" "flag" "log" "net/http" + "os" + "os/signal" + "syscall" + "time" "github.com/digitalocean/go-openvswitch/ovsnl" "github.com/digitalocean/openvswitch_exporter/internal/ovsexporter" @@ -38,9 +43,59 @@ func main() { http.Redirect(w, r, *metricsPath, http.StatusMovedPermanently) }) - log.Printf("starting Open vSwitch exporter on %q", *metricsAddr) + // Create HTTP server + server := &http.Server{ + Addr: *metricsAddr, + Handler: mux, + } + + // Handle shutdown signals + sigChan := make(chan os.Signal, 1) + signal.Notify(sigChan, + syscall.SIGINT, // Ctrl+C + syscall.SIGTERM, // Termination request + syscall.SIGHUP, // Hang up (config reload) + syscall.SIGQUIT, // Quit signal + ) - if err := http.ListenAndServe(*metricsAddr, mux); err != nil { - log.Fatalf("cannot start Open vSwitch exporter: %v", err) + // Start server in goroutine + go func() { + log.Printf("starting Open vSwitch exporter on %q", *metricsAddr) + if err := server.ListenAndServe(); err != nil && err != http.ErrServerClosed { + log.Fatalf("cannot start Open vSwitch exporter: %v", err) + } + }() + + // Wait for shutdown signal + sig := <-sigChan + + switch sig { + case syscall.SIGHUP: + log.Printf("Received SIGHUP, reloading config...") + // TODO: Add config reload logic here + log.Printf("Config reloaded") + return + case syscall.SIGQUIT: + log.Printf("Received SIGQUIT, shutting down immediately...") + // Immediate shutdown for SIGQUIT + default: + log.Printf("Received signal %v, stopping gracefully...", sig) } + + // Graceful shutdown with 15 second timeout + ctx, cancel := context.WithTimeout(context.Background(), 15*time.Second) + defer cancel() + + if err := server.Shutdown(ctx); err != nil { + log.Printf("Server shutdown error: %v", err) + } + + // Close collector if it supports graceful shutdown + if closeable, ok := collector.(interface{ Close() error }); ok { + if err := closeable.Close(); err != nil { + log.Printf("Collector shutdown error: %v", err) + } + } + + log.Printf("Exporter stopped") } diff --git a/internal/conntrack/aggregator_linux.go b/internal/conntrack/aggregator_linux.go index 6a8a54f..6a0cfe8 100644 --- a/internal/conntrack/aggregator_linux.go +++ b/internal/conntrack/aggregator_linux.go @@ -33,27 +33,32 @@ import ( // NewZoneMarkAggregator creates a new aggregator with its own listening connection. func NewZoneMarkAggregator() (*ZoneMarkAggregator, error) { + return NewZoneMarkAggregatorWithConfig(LoadConfig()) +} +// NewZoneMarkAggregatorWithConfig creates a new aggregator with custom configuration. +func NewZoneMarkAggregatorWithConfig(config *Config) (*ZoneMarkAggregator, error) { // Create a separate connection for listening to events listenCli, err := conntrack.Dial(nil) if err != nil { return nil, fmt.Errorf("failed to create listening connection: %w", err) } - if err := listenCli.SetReadBuffer(64 * 1024 * 1024); err != nil { // 64MB buffer for 1.4M events/sec + if err := listenCli.SetReadBuffer(config.ReadBufferSize); err != nil { log.Printf("Warning: Failed to set read buffer size: %v", err) } - if err := listenCli.SetWriteBuffer(64 * 1024 * 1024); err != nil { // 64MB buffer for 1.4M events/sec + if err := listenCli.SetWriteBuffer(config.WriteBufferSize); err != nil { log.Printf("Warning: Failed to set write buffer size: %v", err) } ctx, cancel := context.WithCancel(context.Background()) a := &ZoneMarkAggregator{ + config: config, counts: make(map[ZoneMarkKey]int), listenCli: listenCli, ctx: ctx, cancel: cancel, - eventsCh: make(chan conntrack.Event, eventChanSize), + eventsCh: make(chan conntrack.Event, config.EventChanSize), destroyDeltas: make(map[ZoneMarkKey]int), lastEventTime: time.Now(), lastHealthCheck: time.Now(), @@ -69,7 +74,7 @@ func (a *ZoneMarkAggregator) Start() error { return err } - for i := 0; i < eventWorkerCount; i++ { + for i := 0; i < a.config.EventWorkerCount; i++ { a.wg.Go(func() error { return a.eventWorker(a.ctx) }) @@ -174,7 +179,7 @@ func (a *ZoneMarkAggregator) handleEvent(ev conntrack.Event) error { if ev.Type == conntrack.EventDestroy { a.deltaMu.Lock() defer a.deltaMu.Unlock() - if len(a.destroyDeltas) < destroyDeltaCap { + if len(a.destroyDeltas) < a.config.DestroyDeltaCap { a.destroyDeltas[key]++ if len(a.destroyDeltas) > 50000 { // If we have >50K deltas, flush immediately deltas := a.destroyDeltas @@ -192,7 +197,7 @@ func (a *ZoneMarkAggregator) handleEvent(ev conntrack.Event) error { } } else { a.missedEvents.Add(1) - if a.missedEvents.Load()%dropsWarnThreshold == 0 { + if a.missedEvents.Load()%a.config.DropsWarnThreshold == 0 { log.Printf("Warning: destroyDeltas saturated (size=%d). missedEvents=%d", len(a.destroyDeltas), a.missedEvents.Load()) } } @@ -222,7 +227,7 @@ func (a *ZoneMarkAggregator) applyDeltasImmediatelyUnsafe(deltas map[ZoneMarkKey // destroyFlusher periodically applies the aggregated DESTROY deltas into counts // Uses adaptive flushing: more frequent during high event rates for minimal lag func (a *ZoneMarkAggregator) destroyFlusher(ctx context.Context) error { - ticker := time.NewTicker(destroyFlushIntvl) + ticker := time.NewTicker(a.config.DestroyFlushIntvl) defer ticker.Stop() for { @@ -250,7 +255,7 @@ func (a *ZoneMarkAggregator) destroyFlusher(ctx context.Context) error { } else { // Normal flush a.flushDestroyDeltas() - ticker.Reset(destroyFlushIntvl) // Back to normal interval + ticker.Reset(a.config.DestroyFlushIntvl) // Back to normal interval } } } @@ -302,7 +307,7 @@ func (a *ZoneMarkAggregator) Snapshot() map[ZoneMarkKey]int { // startHealthMonitoring periodically logs aggregator health func (a *ZoneMarkAggregator) startHealthMonitoring(ctx context.Context) error { - ticker := time.NewTicker(5 * time.Minute) + ticker := time.NewTicker(a.config.HealthCheckIntvl) defer ticker.Stop() for { @@ -321,7 +326,7 @@ func (a *ZoneMarkAggregator) startHealthMonitoring(ctx context.Context) error { func (a *ZoneMarkAggregator) performHealthCheck() error { missed := a.missedEvents.Load() - if missed > dropsWarnThreshold { + if missed > a.config.DropsWarnThreshold { if err := a.RestartListener(); err != nil { log.Printf("Health check: RestartListener failed: %v", err) return fmt.Errorf("failed to restart listener: %w", err) @@ -340,21 +345,53 @@ func (a *ZoneMarkAggregator) GetError() error { return nil } -// Stop cancels listening and closes the connection. -func (a *ZoneMarkAggregator) Stop() { - a.cancel() // Cancel the context to signal all goroutines to stop +// Stop cancels listening and closes the connection with graceful shutdown. +func (a *ZoneMarkAggregator) Stop() error { + return a.StopWithTimeout(a.config.GracefulTimeout) +} + +// StopWithTimeout cancels listening and closes the connection with a configurable timeout. +func (a *ZoneMarkAggregator) StopWithTimeout(timeout time.Duration) error { + // Signal shutdown to all goroutines + a.cancel() + + // Create a context with timeout for graceful shutdown + ctx, cancel := context.WithTimeout(context.Background(), timeout) + defer cancel() + + // Channel to receive shutdown completion + done := make(chan error, 1) - // Wait for all goroutines to exit and check for errors - if err := a.wg.Wait(); err != nil { - log.Printf("Error from goroutine group: %v", err) + // Wait for goroutines to exit in a separate goroutine + go func() { + done <- a.wg.Wait() + }() + + // Wait for either completion or timeout + select { + case err := <-done: + if err != nil { + log.Printf("Error from goroutine group during shutdown: %v", err) + // Continue with cleanup even if there were errors + } + case <-ctx.Done(): + log.Printf("Graceful shutdown timeout exceeded (%v), forcing cleanup", timeout) + // Force close connections even if goroutines didn't exit cleanly } + // Close the listening connection if a.listenCli != nil { if err := a.listenCli.Close(); err != nil { log.Printf("Error closing listenCli during cleanup: %v", err) } + a.listenCli = nil } + + // Final flush of any remaining deltas a.flushDestroyDeltas() + + log.Printf("Aggregator stopped gracefully") + return nil } // RestartListener attempts to restart the conntrack event listener diff --git a/internal/conntrack/aggregator_linux_test.go b/internal/conntrack/aggregator_linux_test.go index 6043f66..8c629e1 100644 --- a/internal/conntrack/aggregator_linux_test.go +++ b/internal/conntrack/aggregator_linux_test.go @@ -18,99 +18,377 @@ package conntrack import ( "testing" + "time" ) func TestZoneMarkAggregator(t *testing.T) { - // Test aggregator creation - agg, err := NewZoneMarkAggregator() - if err != nil { - // This is expected to fail in test environment due to permission requirements - t.Logf("Expected failure in test environment: NewZoneMarkAggregator() error = %v", err) - return - } + tests := []struct { + name string + setup func() (*ZoneMarkAggregator, error) + operations []func(*ZoneMarkAggregator) error + validate func(*testing.T, *ZoneMarkAggregator) + wantErr bool + skipOnError bool + }{ + { + name: "successful_creation", + setup: func() (*ZoneMarkAggregator, error) { + return NewZoneMarkAggregator() + }, + operations: []func(*ZoneMarkAggregator) error{ + func(agg *ZoneMarkAggregator) error { return agg.Start() }, + }, + validate: func(t *testing.T, agg *ZoneMarkAggregator) { + if agg == nil { + t.Fatal("expected non-nil aggregator") + } + snapshot := agg.Snapshot() + if snapshot == nil { + t.Fatal("expected non-nil snapshot") + } + }, + wantErr: false, + skipOnError: true, // Skip if permission issues + }, + { + name: "snapshot_functionality", + setup: func() (*ZoneMarkAggregator, error) { + return NewZoneMarkAggregator() + }, + operations: []func(*ZoneMarkAggregator) error{ + func(agg *ZoneMarkAggregator) error { return agg.Start() }, + }, + validate: func(t *testing.T, agg *ZoneMarkAggregator) { + snapshot := agg.Snapshot() + if snapshot == nil { + t.Fatal("Snapshot() returned nil") + } - if agg == nil { - t.Fatal("NewZoneMarkAggregator() returned nil aggregator") - } + // Verify snapshot is a map[ZoneMarkKey]int + if len(snapshot) == 0 { + t.Log("Snapshot is empty (expected in test environment)") + } - // Test basic methods - snapshot := agg.Snapshot() - if snapshot == nil { - t.Fatal("Snapshot() returned nil") + // Test that we can iterate over the snapshot + for key, count := range snapshot { + if count <= 0 { + t.Errorf("Invalid count %d for key %+v", count, key) + } + t.Logf("Zone: %d, Mark: %d, Count: %d", key.Zone, key.Mark, count) + } + }, + wantErr: false, + skipOnError: true, + }, + { + name: "start_stop_lifecycle", + setup: func() (*ZoneMarkAggregator, error) { + return NewZoneMarkAggregator() + }, + operations: []func(*ZoneMarkAggregator) error{ + func(agg *ZoneMarkAggregator) error { return agg.Start() }, + func(agg *ZoneMarkAggregator) error { + // Let it run briefly + time.Sleep(10 * time.Millisecond) + return agg.Stop() + }, + }, + validate: func(t *testing.T, agg *ZoneMarkAggregator) { + // Test that aggregator can be stopped gracefully + if err := agg.Stop(); err != nil { + t.Errorf("Stop() returned error: %v", err) + } + // Snapshot should still work after stop + snapshot := agg.Snapshot() + if snapshot == nil { + t.Error("Snapshot should work after stop") + } + }, + wantErr: false, + skipOnError: true, + }, + { + name: "concurrent_snapshot_access", + setup: func() (*ZoneMarkAggregator, error) { + return NewZoneMarkAggregator() + }, + operations: []func(*ZoneMarkAggregator) error{ + func(agg *ZoneMarkAggregator) error { return agg.Start() }, + }, + validate: func(t *testing.T, agg *ZoneMarkAggregator) { + // Test concurrent snapshot access + done := make(chan bool, 10) + for i := 0; i < 10; i++ { + go func() { + snapshot := agg.Snapshot() + if snapshot == nil { + t.Error("Concurrent snapshot returned nil") + } + done <- true + }() + } + + // Wait for all goroutines + for i := 0; i < 10; i++ { + <-done + } + }, + wantErr: false, + skipOnError: true, + }, } - // Clean up - t.Cleanup(agg.Stop) -} + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + agg, err := tt.setup() + if (err != nil) != tt.wantErr { + if tt.skipOnError { + t.Logf("Skipping test due to expected failure: %v", err) + return + } + t.Errorf("setup error = %v, wantErr %v", err, tt.wantErr) + return + } + if agg == nil { + if tt.skipOnError { + t.Skip("Expected failure in test environment") + return + } + t.Fatal("NewZoneMarkAggregator() returned nil aggregator") + } -func TestZoneMarkAggregatorSnapshot(t *testing.T) { - // Test aggregator creation - agg, err := NewZoneMarkAggregator() - if err != nil { - // This is expected to fail in test environment due to permission requirements - t.Logf("Expected failure in test environment: NewZoneMarkAggregator() error = %v", err) - return - } + t.Cleanup(func() { agg.Stop() }) + + for i, op := range tt.operations { + if err := op(agg); err != nil { + if tt.skipOnError { + t.Logf("Skipping test due to operation %d failure: %v", i, err) + return + } + t.Errorf("operation %d failed: %v", i, err) + return + } + } - if agg == nil { - t.Fatal("NewZoneMarkAggregator() returned nil aggregator") + if tt.validate != nil { + tt.validate(t, agg) + } + }) } +} - // Test snapshot functionality with new ZoneMarkKey-based mapping - snapshot := agg.Snapshot() - if snapshot == nil { - t.Fatal("Snapshot() returned nil") +func TestZoneMarkKey(t *testing.T) { + tests := []struct { + name string + key1 ZoneMarkKey + key2 ZoneMarkKey + expected bool + desc string + }{ + { + name: "identical_keys", + key1: ZoneMarkKey{Zone: 1, Mark: 100}, + key2: ZoneMarkKey{Zone: 1, Mark: 100}, + expected: true, + desc: "Identical ZoneMarkKey structs should be equal", + }, + { + name: "different_zone", + key1: ZoneMarkKey{Zone: 1, Mark: 100}, + key2: ZoneMarkKey{Zone: 2, Mark: 100}, + expected: false, + desc: "Different zone ZoneMarkKey structs should not be equal", + }, + { + name: "different_mark", + key1: ZoneMarkKey{Zone: 1, Mark: 100}, + key2: ZoneMarkKey{Zone: 1, Mark: 200}, + expected: false, + desc: "Different mark ZoneMarkKey structs should not be equal", + }, + { + name: "both_different", + key1: ZoneMarkKey{Zone: 1, Mark: 100}, + key2: ZoneMarkKey{Zone: 2, Mark: 200}, + expected: false, + desc: "Both zone and mark different should not be equal", + }, + { + name: "zero_values", + key1: ZoneMarkKey{Zone: 0, Mark: 0}, + key2: ZoneMarkKey{Zone: 0, Mark: 0}, + expected: true, + desc: "Zero values should be equal", + }, + { + name: "max_values", + key1: ZoneMarkKey{Zone: 65535, Mark: 4294967295}, + key2: ZoneMarkKey{Zone: 65535, Mark: 4294967295}, + expected: true, + desc: "Max values should be equal", + }, } - // Verify snapshot is a map[ZoneMarkKey]int - if len(snapshot) == 0 { - t.Log("Snapshot is empty (expected in test environment)") + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + result := (tt.key1 == tt.key2) + if result != tt.expected { + t.Errorf("%s: got %v, want %v", tt.desc, result, tt.expected) + } + }) } +} - // Test that we can iterate over the snapshot - for key, count := range snapshot { - if count <= 0 { - t.Errorf("Invalid count %d for key %+v", count, key) - } - t.Logf("Zone: %d, Mark: %d, Count: %d", key.Zone, key.Mark, count) +func TestZoneMarkKeyAsMapKey(t *testing.T) { + tests := []struct { + name string + keys []ZoneMarkKey + values []int + lookup ZoneMarkKey + expected int + desc string + }{ + { + name: "basic_map_operations", + keys: []ZoneMarkKey{{Zone: 1, Mark: 100}, {Zone: 2, Mark: 200}}, + values: []int{5, 10}, + lookup: ZoneMarkKey{Zone: 1, Mark: 100}, + expected: 5, + desc: "ZoneMarkKey should work as map key", + }, + { + name: "equal_keys_map_to_same_value", + keys: []ZoneMarkKey{{Zone: 1, Mark: 100}, {Zone: 2, Mark: 200}}, + values: []int{5, 10}, + lookup: ZoneMarkKey{Zone: 1, Mark: 100}, // Same as first key + expected: 5, + desc: "Equal ZoneMarkKey structs should map to same value", + }, + { + name: "different_keys_map_to_different_values", + keys: []ZoneMarkKey{{Zone: 1, Mark: 100}, {Zone: 2, Mark: 200}}, + values: []int{5, 10}, + lookup: ZoneMarkKey{Zone: 2, Mark: 200}, + expected: 10, + desc: "Different ZoneMarkKey should map to different value", + }, + { + name: "zero_key_operations", + keys: []ZoneMarkKey{{Zone: 0, Mark: 0}, {Zone: 1, Mark: 1}}, + values: []int{100, 200}, + lookup: ZoneMarkKey{Zone: 0, Mark: 0}, + expected: 100, + desc: "Zero value keys should work correctly", + }, } - // Clean up - t.Cleanup(agg.Stop) -} + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + testMap := make(map[ZoneMarkKey]int) -func TestZMKeyComparison(t *testing.T) { - // Test that ZoneMarkKey works correctly as a map key - key1 := ZoneMarkKey{Zone: 1, Mark: 100} - key2 := ZoneMarkKey{Zone: 1, Mark: 100} - key3 := ZoneMarkKey{Zone: 2, Mark: 100} - key4 := ZoneMarkKey{Zone: 1, Mark: 200} + // Populate map + for i, key := range tt.keys { + testMap[key] = tt.values[i] + } - // Test equality - if key1 != key2 { - t.Error("Identical ZoneMarkKey structs should be equal") + // Test lookup + result := testMap[tt.lookup] + if result != tt.expected { + t.Errorf("%s: got %d, want %d", tt.desc, result, tt.expected) + } + }) } +} - // Test inequality - if key1 == key3 { - t.Error("Different zone ZoneMarkKey structs should not be equal") - } - if key1 == key4 { - t.Error("Different mark ZoneMarkKey structs should not be equal") +func TestAggregatorLifecycle(t *testing.T) { + tests := []struct { + name string + operations []func(*ZoneMarkAggregator) error + validate func(*testing.T, *ZoneMarkAggregator) + wantErr bool + skipOnError bool + }{ + { + name: "start_twice_should_fail", + operations: []func(*ZoneMarkAggregator) error{ + func(agg *ZoneMarkAggregator) error { return agg.Start() }, + func(agg *ZoneMarkAggregator) error { return agg.Start() }, // Second start + }, + validate: func(t *testing.T, agg *ZoneMarkAggregator) { + // Second start should fail or be idempotent + }, + wantErr: false, // May or may not error depending on implementation + skipOnError: true, + }, + { + name: "stop_without_start", + operations: []func(*ZoneMarkAggregator) error{ + func(agg *ZoneMarkAggregator) error { + return agg.Stop() // Stop without starting + }, + }, + validate: func(t *testing.T, agg *ZoneMarkAggregator) { + // Should not panic + snapshot := agg.Snapshot() + if snapshot == nil { + t.Error("Snapshot should work even after stop without start") + } + }, + wantErr: false, + }, + { + name: "snapshot_after_stop", + operations: []func(*ZoneMarkAggregator) error{ + func(agg *ZoneMarkAggregator) error { return agg.Start() }, + func(agg *ZoneMarkAggregator) error { + time.Sleep(10 * time.Millisecond) + return agg.Stop() + }, + }, + validate: func(t *testing.T, agg *ZoneMarkAggregator) { + // Snapshot should work after stop + snapshot := agg.Snapshot() + if snapshot == nil { + t.Error("Snapshot should work after stop") + } + }, + wantErr: false, + skipOnError: true, + }, } - // Test as map keys - testMap := make(map[ZoneMarkKey]int) - testMap[key1] = 5 - testMap[key3] = 10 + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + agg, err := NewZoneMarkAggregator() + if err != nil { + if tt.skipOnError { + t.Logf("Skipping test due to expected failure: %v", err) + return + } + t.Fatalf("Failed to create aggregator: %v", err) + } + if agg == nil { + t.Fatal("NewZoneMarkAggregator() returned nil aggregator") + } - if testMap[key1] != 5 { - t.Error("ZoneMarkKey should work as map key") - } - if testMap[key2] != 5 { - t.Error("Equal ZoneMarkKey structs should map to same value") - } - if testMap[key3] != 10 { - t.Error("Different ZoneMarkKey should map to different value") + t.Cleanup(func() { agg.Stop() }) + + for i, op := range tt.operations { + if err := op(agg); err != nil { + if (err != nil) != tt.wantErr { + if tt.skipOnError { + t.Logf("Skipping test due to operation %d failure: %v", i, err) + return + } + t.Errorf("operation %d error = %v, wantErr %v", i, err, tt.wantErr) + return + } + } + } + + if tt.validate != nil { + tt.validate(t, agg) + } + }) } } diff --git a/internal/conntrack/config.go b/internal/conntrack/config.go new file mode 100644 index 0000000..f668cd2 --- /dev/null +++ b/internal/conntrack/config.go @@ -0,0 +1,111 @@ +// Copyright 2017 DigitalOcean. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package conntrack + +import ( + "os" + "strconv" + "time" +) + +// Config holds configuration for the conntrack aggregator +type Config struct { + EventChanSize int + EventWorkerCount int + DestroyFlushIntvl time.Duration + DestroyDeltaCap int + DropsWarnThreshold int64 + ReadBufferSize int + WriteBufferSize int + HealthCheckIntvl time.Duration + GracefulTimeout time.Duration +} + +// DefaultConfig returns default configuration values +func DefaultConfig() *Config { + return &Config{ + EventChanSize: 512 * 1024, + EventWorkerCount: 100, + DestroyFlushIntvl: 50 * time.Millisecond, + DestroyDeltaCap: 200000, + DropsWarnThreshold: 10000, + ReadBufferSize: 64 * 1024 * 1024, + WriteBufferSize: 64 * 1024 * 1024, + HealthCheckIntvl: 5 * time.Minute, + GracefulTimeout: 30 * time.Second, + } +} + +// LoadConfig loads conntrack configuration from environment variables +func LoadConfig() *Config { + config := DefaultConfig() + + // Load from environment variables + if size := os.Getenv("CONNTRACK_EVENT_CHAN_SIZE"); size != "" { + if s, err := strconv.Atoi(size); err == nil && s > 0 { + config.EventChanSize = s + } + } + + if count := os.Getenv("CONNTRACK_EVENT_WORKER_COUNT"); count != "" { + if c, err := strconv.Atoi(count); err == nil && c > 0 { + config.EventWorkerCount = c + } + } + + if interval := os.Getenv("CONNTRACK_DESTROY_FLUSH_INTERVAL"); interval != "" { + if d, err := time.ParseDuration(interval); err == nil && d > 0 { + config.DestroyFlushIntvl = d + } + } + + if cap := os.Getenv("CONNTRACK_DESTROY_DELTA_CAP"); cap != "" { + if c, err := strconv.Atoi(cap); err == nil && c > 0 { + config.DestroyDeltaCap = c + } + } + + if threshold := os.Getenv("CONNTRACK_DROPS_WARN_THRESHOLD"); threshold != "" { + if t, err := strconv.ParseInt(threshold, 10, 64); err == nil && t >= 0 { + config.DropsWarnThreshold = t + } + } + + if size := os.Getenv("CONNTRACK_READ_BUFFER_SIZE"); size != "" { + if s, err := strconv.Atoi(size); err == nil && s > 0 { + config.ReadBufferSize = s + } + } + + if size := os.Getenv("CONNTRACK_WRITE_BUFFER_SIZE"); size != "" { + if s, err := strconv.Atoi(size); err == nil && s > 0 { + config.WriteBufferSize = s + } + } + + if interval := os.Getenv("CONNTRACK_HEALTH_CHECK_INTERVAL"); interval != "" { + if d, err := time.ParseDuration(interval); err == nil && d > 0 { + config.HealthCheckIntvl = d + } + } + + if timeout := os.Getenv("CONNTRACK_GRACEFUL_TIMEOUT"); timeout != "" { + if d, err := time.ParseDuration(timeout); err == nil && d > 0 { + config.GracefulTimeout = d + } + } + + return config +} diff --git a/internal/conntrack/mock.go b/internal/conntrack/mock.go index edd88d1..665ac92 100644 --- a/internal/conntrack/mock.go +++ b/internal/conntrack/mock.go @@ -18,9 +18,15 @@ type MockZoneMarkAggregator struct { // NewZoneMarkAggregator creates a mock aggregator for testing func NewZoneMarkAggregator() (*MockZoneMarkAggregator, error) { + return NewZoneMarkAggregatorWithConfig(LoadConfig()) +} + +// NewZoneMarkAggregatorWithConfig creates a mock aggregator with custom configuration +func NewZoneMarkAggregatorWithConfig(config *Config) (*MockZoneMarkAggregator, error) { ctx, cancel := context.WithCancel(context.Background()) return &MockZoneMarkAggregator{ ZoneMarkAggregator: &ZoneMarkAggregator{ + config: config, ctx: ctx, cancel: cancel, }, @@ -45,9 +51,11 @@ func (m *MockZoneMarkAggregator) Start() error { return nil } -// Stop stops the mock aggregator -func (m *MockZoneMarkAggregator) Stop() { +// Stop stops the mock aggregator with graceful shutdown +func (m *MockZoneMarkAggregator) Stop() error { m.cancel() + // Mock implementation doesn't need actual cleanup + return nil } // AddEntry adds a mock entry for testing diff --git a/internal/conntrack/types.go b/internal/conntrack/types.go index 6677f6f..ede992b 100644 --- a/internal/conntrack/types.go +++ b/internal/conntrack/types.go @@ -24,17 +24,11 @@ import ( "golang.org/x/sync/errgroup" ) -// Tunables - adjust for your environment -const ( - eventChanSize = 512 * 1024 - eventWorkerCount = 100 - destroyFlushIntvl = 50 * time.Millisecond // flush aggregated DESTROYs every 50ms for minimal lag - destroyDeltaCap = 200000 // maximum distinct (zone,mark) entries in destroyDeltas - dropsWarnThreshold = 10000 // threshold of missedEvents to log a stronger warning -) - // ZoneMarkAggregator keeps live counts (zmKey -> count) with bounded ingestion type ZoneMarkAggregator struct { + // Configuration + config *Config + // primary counts (zmKey -> count) - simplified flat mapping counts map[ZoneMarkKey]int countsMu sync.RWMutex @@ -72,6 +66,6 @@ type ZoneMarkKey struct { // Aggregator interface defines the methods needed by the collector type Aggregator interface { Snapshot() map[ZoneMarkKey]int - Stop() + Stop() error Start() error } diff --git a/internal/ovsexporter/conntrack_mock_test.go b/internal/ovsexporter/conntrack_mock_test.go index f72339c..271915d 100644 --- a/internal/ovsexporter/conntrack_mock_test.go +++ b/internal/ovsexporter/conntrack_mock_test.go @@ -5,124 +5,416 @@ package ovsexporter import ( "testing" + "time" "github.com/digitalocean/openvswitch_exporter/internal/conntrack" ) func TestConntrackCollector(t *testing.T) { - // Create a mock aggregator - agg, err := conntrack.NewZoneMarkAggregator() - if err != nil { - t.Fatalf("Failed to create mock aggregator: %v", err) - } - - // Clean up aggregator after test - t.Cleanup(agg.Stop) - - // Add some test data - agg.SetCount(0, 100, 1500) - agg.SetCount(0, 200, 2500) - agg.SetCount(1, 300, 3500) - - // Create collector with mock aggregator - collector := newConntrackCollector(agg) - - // Test the collector - testCollector(t, collector) -} + tests := []struct { + name string + setup func() (conntrack.Aggregator, error) + operations []func(conntrack.Aggregator) error + validate func(*testing.T, *conntrackCollector) + wantErr bool + description string + }{ + { + name: "basic_functionality", + setup: func() (conntrack.Aggregator, error) { + return conntrack.NewZoneMarkAggregator() + }, + operations: []func(conntrack.Aggregator) error{ + func(agg conntrack.Aggregator) error { + // Add test data + mockAgg := agg.(*conntrack.MockZoneMarkAggregator) + mockAgg.SetCount(0, 100, 1500) + mockAgg.SetCount(0, 200, 2500) + mockAgg.SetCount(1, 300, 3500) + return nil + }, + }, + validate: func(t *testing.T, collector *conntrackCollector) { + if collector == nil { + t.Fatal("expected non-nil collector") + } + if collector.desc == nil { + t.Fatal("expected non-nil description") + } + }, + wantErr: false, + description: "Test basic collector functionality with mock data", + }, + { + name: "nil_aggregator", + setup: func() (conntrack.Aggregator, error) { + return nil, nil + }, + operations: []func(conntrack.Aggregator) error{}, + validate: func(t *testing.T, collector *conntrackCollector) { + if collector == nil { + t.Fatal("expected non-nil collector") + } + if collector.agg != nil { + t.Error("expected nil aggregator") + } + }, + wantErr: false, + description: "Test collector handles nil aggregator gracefully", + }, + { + name: "empty_aggregator", + setup: func() (conntrack.Aggregator, error) { + return conntrack.NewZoneMarkAggregator() + }, + operations: []func(conntrack.Aggregator) error{}, + validate: func(t *testing.T, collector *conntrackCollector) { + if collector == nil { + t.Fatal("expected non-nil collector") + } + snapshot := collector.agg.Snapshot() + if snapshot == nil { + t.Fatal("expected non-nil snapshot") + } + if len(snapshot) != 0 { + t.Errorf("expected empty snapshot, got %d entries", len(snapshot)) + } + }, + wantErr: false, + description: "Test collector with empty aggregator", + }, + { + name: "large_dataset", + setup: func() (conntrack.Aggregator, error) { + return conntrack.NewZoneMarkAggregator() + }, + operations: []func(conntrack.Aggregator) error{ + func(agg conntrack.Aggregator) error { + // Add large dataset - simulate 10K entries across multiple zones + mockAgg := agg.(*conntrack.MockZoneMarkAggregator) + for zone := uint16(0); zone < 10; zone++ { + for mark := uint32(0); mark < 1000; mark++ { + mockAgg.SetCount(zone, mark, int(uint32(zone)*1000+mark)) + } + } + return nil + }, + }, + validate: func(t *testing.T, collector *conntrackCollector) { + snapshot := collector.agg.Snapshot() + if len(snapshot) != 10000 { + t.Errorf("expected 10000 entries, got %d", len(snapshot)) + } + }, + wantErr: false, + description: "Test collector with large dataset", + }, + { + name: "edge_cases", + setup: func() (conntrack.Aggregator, error) { + return conntrack.NewZoneMarkAggregator() + }, + operations: []func(conntrack.Aggregator) error{ + func(agg conntrack.Aggregator) error { + mockAgg := agg.(*conntrack.MockZoneMarkAggregator) + // Test zero values + mockAgg.SetCount(0, 0, 0) + // Test maximum values + mockAgg.SetCount(65535, 4294967295, 1000000) + // Test negative count (should be handled gracefully) + mockAgg.SetCount(1, 1, -1) + return nil + }, + }, + validate: func(t *testing.T, collector *conntrackCollector) { + snapshot := collector.agg.Snapshot() + // Should have 2 entries (zero and negative counts should be filtered out) + if len(snapshot) != 2 { + t.Errorf("expected 2 entries, got %d", len(snapshot)) + } + }, + wantErr: false, + description: "Test collector with edge cases", + }, + { + name: "concurrent_operations", + setup: func() (conntrack.Aggregator, error) { + return conntrack.NewZoneMarkAggregator() + }, + operations: []func(conntrack.Aggregator) error{ + func(agg conntrack.Aggregator) error { + mockAgg := agg.(*conntrack.MockZoneMarkAggregator) + // Add some initial data + mockAgg.SetCount(0, 100, 100) + return nil + }, + }, + validate: func(t *testing.T, collector *conntrackCollector) { + // Test concurrent collection + done := make(chan bool, 10) + for i := 0; i < 10; i++ { + go func() { + snapshot := collector.agg.Snapshot() + if snapshot == nil { + t.Error("Concurrent snapshot returned nil") + } + done <- true + }() + } -func TestConntrackCollectorWithNilAggregator(t *testing.T) { - // Test that the collector handles a nil aggregator gracefully - collector := newConntrackCollector(nil) - - // This should not panic and should emit zero metrics - testCollector(t, collector) -} - -func TestConntrackCollectorWithEmptyAggregator(t *testing.T) { - // Create an empty mock aggregator - agg, err := conntrack.NewZoneMarkAggregator() - if err != nil { - t.Fatalf("Failed to create mock aggregator: %v", err) + // Wait for all goroutines + for i := 0; i < 10; i++ { + <-done + } + }, + wantErr: false, + description: "Test concurrent collector operations", + }, } - t.Cleanup(agg.Stop) + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + agg, err := tt.setup() + if (err != nil) != tt.wantErr { + t.Errorf("setup error = %v, wantErr %v", err, tt.wantErr) + return + } - // Create collector with empty aggregator - collector := newConntrackCollector(agg) + if agg != nil { + t.Cleanup(func() { agg.Stop() }) + } - // Test the collector - testCollector(t, collector) -} + for i, op := range tt.operations { + if err := op(agg); err != nil { + t.Errorf("operation %d failed: %v", i, err) + return + } + } -func TestConntrackCollectorWithLargeDataset(t *testing.T) { - // Create a mock aggregator with large dataset - agg, err := conntrack.NewZoneMarkAggregator() - if err != nil { - t.Fatalf("Failed to create mock aggregator: %v", err) - } + collector := newConntrackCollector(agg) + if tt.validate != nil { + tt.validate(t, collector.(*conntrackCollector)) + } - t.Cleanup(agg.Stop) - - // Add large dataset - // Simulate 2M entries across multiple zones - for zone := uint16(0); zone < 10; zone++ { - for mark := uint32(0); mark < 1000; mark++ { - agg.SetCount(zone, mark, int(uint32(zone)*1000+mark)) - } + // Test the collector with Prometheus + testCollector(t, collector) + }) } - - // Create collector - collector := newConntrackCollector(agg) - - // Test the collector - testCollector(t, collector) } -func TestConntrackCollectorEdgeCases(t *testing.T) { - // Test edge cases - agg, err := conntrack.NewZoneMarkAggregator() - if err != nil { - t.Fatalf("Failed to create mock aggregator: %v", err) +func TestMockAggregatorOperations(t *testing.T) { + tests := []struct { + name string + operations []func(*conntrack.MockZoneMarkAggregator) + validate func(*testing.T, *conntrack.MockZoneMarkAggregator) + description string + }{ + { + name: "add_remove_entries", + operations: []func(*conntrack.MockZoneMarkAggregator){ + func(agg *conntrack.MockZoneMarkAggregator) { + agg.AddEntry(0, 100) + agg.AddEntry(0, 100) + agg.AddEntry(1, 200) + }, + func(agg *conntrack.MockZoneMarkAggregator) { + agg.RemoveEntry(0, 100) + }, + }, + validate: func(t *testing.T, agg *conntrack.MockZoneMarkAggregator) { + snapshot := agg.Snapshot() + if len(snapshot) != 2 { + t.Errorf("expected 2 entries, got %d", len(snapshot)) + } + // Check specific counts + key1 := conntrack.ZoneMarkKey{Zone: 0, Mark: 100} + key2 := conntrack.ZoneMarkKey{Zone: 1, Mark: 200} + if snapshot[key1] != 1 { + t.Errorf("expected count 1 for key %v, got %d", key1, snapshot[key1]) + } + if snapshot[key2] != 1 { + t.Errorf("expected count 1 for key %v, got %d", key2, snapshot[key2]) + } + }, + description: "Test add/remove entry operations", + }, + { + name: "set_count_operations", + operations: []func(*conntrack.MockZoneMarkAggregator){ + func(agg *conntrack.MockZoneMarkAggregator) { + agg.SetCount(0, 100, 1500) + agg.SetCount(1, 200, 2500) + agg.SetCount(2, 300, 0) // Should be filtered out + }, + }, + validate: func(t *testing.T, agg *conntrack.MockZoneMarkAggregator) { + snapshot := agg.Snapshot() + if len(snapshot) != 2 { + t.Errorf("expected 2 entries, got %d", len(snapshot)) + } + key1 := conntrack.ZoneMarkKey{Zone: 0, Mark: 100} + key2 := conntrack.ZoneMarkKey{Zone: 1, Mark: 200} + if snapshot[key1] != 1500 { + t.Errorf("expected count 1500 for key %v, got %d", key1, snapshot[key1]) + } + if snapshot[key2] != 2500 { + t.Errorf("expected count 2500 for key %v, got %d", key2, snapshot[key2]) + } + }, + description: "Test set count operations", + }, + { + name: "clear_operations", + operations: []func(*conntrack.MockZoneMarkAggregator){ + func(agg *conntrack.MockZoneMarkAggregator) { + agg.SetCount(0, 100, 1500) + agg.SetCount(1, 200, 2500) + }, + func(agg *conntrack.MockZoneMarkAggregator) { + agg.Clear() + }, + }, + validate: func(t *testing.T, agg *conntrack.MockZoneMarkAggregator) { + snapshot := agg.Snapshot() + if len(snapshot) != 0 { + t.Errorf("expected empty snapshot after clear, got %d entries", len(snapshot)) + } + }, + description: "Test clear operations", + }, + { + name: "health_metrics", + operations: []func(*conntrack.MockZoneMarkAggregator){ + func(agg *conntrack.MockZoneMarkAggregator) { + agg.SetCount(0, 100, 1000) + }, + }, + validate: func(t *testing.T, agg *conntrack.MockZoneMarkAggregator) { + if !agg.IsHealthy() { + t.Error("expected healthy aggregator") + } + if agg.GetEventRate() != 100.0 { + t.Errorf("expected event rate 100.0, got %f", agg.GetEventRate()) + } + if agg.GetMissedEvents() != 0 { + t.Errorf("expected 0 missed events, got %d", agg.GetMissedEvents()) + } + lastEventTime := agg.GetLastEventTime() + if lastEventTime.IsZero() { + t.Error("expected non-zero last event time") + } + }, + description: "Test health metrics", + }, } - t.Cleanup(agg.Stop) - - // Test zero values - agg.SetCount(0, 0, 0) + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + agg, err := conntrack.NewZoneMarkAggregator() + if err != nil { + t.Fatalf("Failed to create mock aggregator: %v", err) + } + t.Cleanup(func() { agg.Stop() }) - // Test maximum values - agg.SetCount(65535, 4294967295, 1000000) + for _, op := range tt.operations { + op(agg) + // Add small delay between operations to test timing + time.Sleep(1 * time.Millisecond) + } - // Test negative count (should be handled gracefully) - agg.SetCount(1, 1, -1) - - collector := newConntrackCollector(agg) - testCollector(t, collector) + if tt.validate != nil { + tt.validate(t, agg) + } + }) + } } -func TestConntrackCollectorConcurrency(t *testing.T) { - // Test concurrent access - agg, err := conntrack.NewZoneMarkAggregator() - if err != nil { - t.Fatalf("Failed to create mock aggregator: %v", err) +func TestConntrackCollectorIntegration(t *testing.T) { + tests := []struct { + name string + setup func() (*conntrack.MockZoneMarkAggregator, error) + operations []func(*conntrack.MockZoneMarkAggregator) + validate func(*testing.T, *conntrackCollector) + description string + }{ + { + name: "full_lifecycle", + setup: func() (*conntrack.MockZoneMarkAggregator, error) { + return conntrack.NewZoneMarkAggregator() + }, + operations: []func(*conntrack.MockZoneMarkAggregator){ + func(agg *conntrack.MockZoneMarkAggregator) { + // Start the aggregator + agg.Start() + }, + func(agg *conntrack.MockZoneMarkAggregator) { + // Add some data + agg.SetCount(0, 100, 1500) + agg.SetCount(1, 200, 2500) + }, + func(agg *conntrack.MockZoneMarkAggregator) { + // Modify data + agg.AddEntry(0, 100) + agg.RemoveEntry(1, 200) + }, + }, + validate: func(t *testing.T, collector *conntrackCollector) { + // Test that collector can handle the aggregator + snapshot := collector.agg.Snapshot() + if snapshot == nil { + t.Fatal("expected non-nil snapshot") + } + // Should have 2 entries (one added, one removed) + if len(snapshot) != 2 { + t.Errorf("expected 2 entries, got %d", len(snapshot)) + } + }, + description: "Test full lifecycle with collector", + }, + { + name: "stress_test", + setup: func() (*conntrack.MockZoneMarkAggregator, error) { + return conntrack.NewZoneMarkAggregator() + }, + operations: []func(*conntrack.MockZoneMarkAggregator){ + func(agg *conntrack.MockZoneMarkAggregator) { + // Add many entries rapidly + for i := 0; i < 1000; i++ { + agg.SetCount(uint16(i%10), uint32(i), i) + } + }, + }, + validate: func(t *testing.T, collector *conntrackCollector) { + snapshot := collector.agg.Snapshot() + if len(snapshot) != 1000 { + t.Errorf("expected 1000 entries, got %d", len(snapshot)) + } + }, + description: "Test stress scenario with many entries", + }, } - t.Cleanup(agg.Stop) + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + agg, err := tt.setup() + if err != nil { + t.Fatalf("Failed to create mock aggregator: %v", err) + } + t.Cleanup(func() { agg.Stop() }) - collector := newConntrackCollector(agg) + for _, op := range tt.operations { + op(agg) + // Small delay between operations + time.Sleep(1 * time.Millisecond) + } - // Test concurrent collection - done := make(chan bool, 10) - for i := 0; i < 10; i++ { - go func() { - testCollector(t, collector) - done <- true - }() - } + collector := newConntrackCollector(agg) + if tt.validate != nil { + tt.validate(t, collector.(*conntrackCollector)) + } - // Wait for all goroutines to complete - for i := 0; i < 10; i++ { - <-done + // Test with Prometheus + testCollector(t, collector) + }) } } diff --git a/internal/ovsexporter/conntrack_test.go b/internal/ovsexporter/conntrack_test.go index 30d6ed7..6ae71ad 100644 --- a/internal/ovsexporter/conntrack_test.go +++ b/internal/ovsexporter/conntrack_test.go @@ -14,33 +14,186 @@ import ( ) func TestConntrackCollector(t *testing.T) { - // Create a mock aggregator - agg, err := conntrack.NewZoneMarkAggregator() - if err != nil { - // This is expected to fail in test environment due to permission requirements - t.Logf("Expected failure in test environment: NewZoneMarkAggregator() error = %v", err) - // Test with nil aggregator to ensure collector handles gracefully - collector := newConntrackCollector(nil) - testCollector(t, collector) - return + tests := []struct { + name string + setup func() (conntrack.Aggregator, error) + operations []func(conntrack.Aggregator) error + validate func(*testing.T, *conntrackCollector) + wantErr bool + skipOnError bool + description string + }{ + { + name: "real_aggregator_creation", + setup: func() (conntrack.Aggregator, error) { + return conntrack.NewZoneMarkAggregator() + }, + operations: []func(conntrack.Aggregator) error{ + func(agg conntrack.Aggregator) error { return agg.Start() }, + }, + validate: func(t *testing.T, collector *conntrackCollector) { + if collector == nil { + t.Fatal("expected non-nil collector") + } + if collector.desc == nil { + t.Fatal("expected non-nil description") + } + if collector.agg == nil { + t.Fatal("expected non-nil aggregator") + } + }, + wantErr: false, + skipOnError: true, // Skip if permission issues + description: "Test collector with real aggregator creation", + }, + { + name: "nil_aggregator_handling", + setup: func() (conntrack.Aggregator, error) { + return nil, nil + }, + operations: []func(conntrack.Aggregator) error{}, + validate: func(t *testing.T, collector *conntrackCollector) { + if collector == nil { + t.Fatal("expected non-nil collector") + } + if collector.agg != nil { + t.Error("expected nil aggregator") + } + // Test that collector handles nil aggregator gracefully + // This should not panic and should emit zero metrics + }, + wantErr: false, + skipOnError: false, + description: "Test collector handles nil aggregator gracefully", + }, + { + name: "real_data_processing", + setup: func() (conntrack.Aggregator, error) { + return conntrack.NewZoneMarkAggregator() + }, + operations: []func(conntrack.Aggregator) error{ + func(agg conntrack.Aggregator) error { return agg.Start() }, + func(agg conntrack.Aggregator) error { + // Let it run briefly to potentially collect real data + time.Sleep(50 * time.Millisecond) + return nil + }, + }, + validate: func(t *testing.T, collector *conntrackCollector) { + snapshot := collector.agg.Snapshot() + if snapshot == nil { + t.Fatal("expected non-nil snapshot") + } + // In test environment, snapshot might be empty + t.Logf("Snapshot contains %d entries", len(snapshot)) + }, + wantErr: false, + skipOnError: true, + description: "Test collector with real data processing", + }, + { + name: "concurrent_collection", + setup: func() (conntrack.Aggregator, error) { + return conntrack.NewZoneMarkAggregator() + }, + operations: []func(conntrack.Aggregator) error{ + func(agg conntrack.Aggregator) error { return agg.Start() }, + }, + validate: func(t *testing.T, collector *conntrackCollector) { + // Test concurrent collection + done := make(chan bool, 10) + for i := 0; i < 10; i++ { + go func() { + snapshot := collector.agg.Snapshot() + if snapshot == nil { + t.Error("Concurrent snapshot returned nil") + } + done <- true + }() + } + + // Wait for all goroutines + for i := 0; i < 10; i++ { + <-done + } + }, + wantErr: false, + skipOnError: true, + description: "Test concurrent collection operations", + }, + { + name: "lifecycle_management", + setup: func() (conntrack.Aggregator, error) { + return conntrack.NewZoneMarkAggregator() + }, + operations: []func(conntrack.Aggregator) error{ + func(agg conntrack.Aggregator) error { return agg.Start() }, + func(agg conntrack.Aggregator) error { + // Let it run briefly + time.Sleep(10 * time.Millisecond) + return nil + }, + func(agg conntrack.Aggregator) error { + // Stop the aggregator + agg.Stop() + return nil + }, + }, + validate: func(t *testing.T, collector *conntrackCollector) { + // Snapshot should still work after stop + snapshot := collector.agg.Snapshot() + if snapshot == nil { + t.Error("Snapshot should work after stop") + } + }, + wantErr: false, + skipOnError: true, + description: "Test aggregator lifecycle management", + }, } - // Clean up aggregator after test - t.Cleanup(agg.Stop) + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + agg, err := tt.setup() + if (err != nil) != tt.wantErr { + if tt.skipOnError { + t.Logf("Skipping test due to expected failure: %v", err) + // Test with nil aggregator to ensure collector handles gracefully + collector := newConntrackCollector(nil) + testCollector(t, collector) + return + } + t.Errorf("setup error = %v, wantErr %v", err, tt.wantErr) + return + } - // Create collector with real aggregator - collector := newConntrackCollector(agg) + if agg != nil { + t.Cleanup(func() { agg.Stop() }) + } - // Test the collector - testCollector(t, collector) -} + for i, op := range tt.operations { + if err := op(agg); err != nil { + if tt.skipOnError { + t.Logf("Skipping test due to operation %d failure: %v", i, err) + // Test with nil aggregator as fallback + collector := newConntrackCollector(nil) + testCollector(t, collector) + return + } + t.Errorf("operation %d failed: %v", i, err) + return + } + } -func TestConntrackCollectorWithNilAggregator(t *testing.T) { - // Test that the collector handles a nil aggregator gracefully - collector := newConntrackCollector(nil) + collector := newConntrackCollector(agg) + if tt.validate != nil { + tt.validate(t, collector.(*conntrackCollector)) + } - // This should not panic and should emit zero metrics - testCollector(t, collector) + // Test the collector with Prometheus + testCollector(t, collector) + }) + } } func TestConntrackCollectorWithRealData(t *testing.T) { @@ -48,17 +201,171 @@ func TestConntrackCollectorWithRealData(t *testing.T) { t.Skip("Skipping conntrack test in short mode") } - // Test with real conntrack data if available - agg, err := conntrack.NewZoneMarkAggregator() - if err != nil { - t.Skipf("Skipping real data test: %v", err) + tests := []struct { + name string + duration time.Duration + validate func(*testing.T, *conntrackCollector) + description string + }{ + { + name: "short_duration", + duration: 100 * time.Millisecond, + validate: func(t *testing.T, collector *conntrackCollector) { + snapshot := collector.agg.Snapshot() + if snapshot == nil { + t.Fatal("expected non-nil snapshot") + } + t.Logf("Short duration test: %d entries collected", len(snapshot)) + }, + description: "Test with short data collection duration", + }, + { + name: "medium_duration", + duration: 500 * time.Millisecond, + validate: func(t *testing.T, collector *conntrackCollector) { + snapshot := collector.agg.Snapshot() + if snapshot == nil { + t.Fatal("expected non-nil snapshot") + } + t.Logf("Medium duration test: %d entries collected", len(snapshot)) + }, + description: "Test with medium data collection duration", + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + // Test with real conntrack data if available + agg, err := conntrack.NewZoneMarkAggregator() + if err != nil { + t.Skipf("Skipping real data test: %v", err) + } + + t.Cleanup(func() { agg.Stop() }) + + // Start the aggregator + if err := agg.Start(); err != nil { + t.Skipf("Skipping real data test - failed to start: %v", err) + } + + // Wait for data to accumulate + time.Sleep(tt.duration) + + collector := newConntrackCollector(agg) + if tt.validate != nil { + tt.validate(t, collector.(*conntrackCollector)) + } + + // Test the collector with Prometheus + testCollector(t, collector) + }) + } +} + +func TestConntrackCollectorEdgeCases(t *testing.T) { + tests := []struct { + name string + setup func() (conntrack.Aggregator, error) + operations []func(conntrack.Aggregator) error + validate func(*testing.T, *conntrackCollector) + wantErr bool + skipOnError bool + description string + }{ + { + name: "start_stop_multiple_times", + setup: func() (conntrack.Aggregator, error) { + return conntrack.NewZoneMarkAggregator() + }, + operations: []func(conntrack.Aggregator) error{ + func(agg conntrack.Aggregator) error { return agg.Start() }, + func(agg conntrack.Aggregator) error { + time.Sleep(10 * time.Millisecond) + agg.Stop() + return nil + }, + func(agg conntrack.Aggregator) error { + // Try to start again after stop + return agg.Start() + }, + }, + validate: func(t *testing.T, collector *conntrackCollector) { + // Should handle restart gracefully + snapshot := collector.agg.Snapshot() + if snapshot == nil { + t.Error("Snapshot should work after restart") + } + }, + wantErr: false, + skipOnError: true, + description: "Test start/stop multiple times", + }, + { + name: "rapid_start_stop_cycles", + setup: func() (conntrack.Aggregator, error) { + return conntrack.NewZoneMarkAggregator() + }, + operations: []func(conntrack.Aggregator) error{ + func(agg conntrack.Aggregator) error { + // Rapid start/stop cycles + for i := 0; i < 5; i++ { + if err := agg.Start(); err != nil { + return err + } + time.Sleep(1 * time.Millisecond) + agg.Stop() + time.Sleep(1 * time.Millisecond) + } + return nil + }, + }, + validate: func(t *testing.T, collector *conntrackCollector) { + // Should not panic or leak resources + snapshot := collector.agg.Snapshot() + if snapshot == nil { + t.Error("Snapshot should work after rapid cycles") + } + }, + wantErr: false, + skipOnError: true, + description: "Test rapid start/stop cycles", + }, } - t.Cleanup(agg.Stop) + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + agg, err := tt.setup() + if (err != nil) != tt.wantErr { + if tt.skipOnError { + t.Logf("Skipping test due to expected failure: %v", err) + return + } + t.Errorf("setup error = %v, wantErr %v", err, tt.wantErr) + return + } + + if agg != nil { + t.Cleanup(func() { agg.Stop() }) + } - // Wait a bit for some real data to accumulate - time.Sleep(100 * time.Millisecond) + for i, op := range tt.operations { + if err := op(agg); err != nil { + if tt.skipOnError { + t.Logf("Skipping test due to operation %d failure: %v", i, err) + return + } + t.Errorf("operation %d failed: %v", i, err) + return + } + } - collector := newConntrackCollector(agg) - testCollector(t, collector) + collector := newConntrackCollector(agg) + if tt.validate != nil { + tt.validate(t, collector.(*conntrackCollector)) + } + + // Test the collector with Prometheus + testCollector(t, collector) + }) + } } diff --git a/internal/ovsexporter/ovsexporter.go b/internal/ovsexporter/ovsexporter.go index dc453c5..948bfea 100644 --- a/internal/ovsexporter/ovsexporter.go +++ b/internal/ovsexporter/ovsexporter.go @@ -23,6 +23,7 @@ type collector struct { mu sync.Mutex cs []prometheus.Collector conntrackEnabled bool + aggregator conntrack.Aggregator } // Make sure collector implements prometheus.Collector @@ -53,6 +54,7 @@ func New(c *ovsnl.Client) prometheus.Collector { return &collector{ cs: collectors, conntrackEnabled: true, + aggregator: agg, } } @@ -76,8 +78,18 @@ func (c *collector) Collect(ch chan<- prometheus.Metric) { } } -// Close cleans up resources -func (c *collector) Close() { +// Close cleans up resources with graceful shutdown +func (c *collector) Close() error { c.mu.Lock() defer c.mu.Unlock() + + if c.conntrackEnabled && c.aggregator != nil { + if err := c.aggregator.Stop(); err != nil { + log.Printf("Error stopping aggregator: %v", err) + return err + } + log.Printf("Collector closed gracefully") + } + + return nil } diff --git a/internal/ovsexporter/test_helpers.go b/internal/ovsexporter/test_helpers.go new file mode 100644 index 0000000..c2109f6 --- /dev/null +++ b/internal/ovsexporter/test_helpers.go @@ -0,0 +1,283 @@ +//go:build !linux +// +build !linux + +// Copyright 2018-2021 DigitalOcean. +// SPDX-License-Identifier: Apache-2.0 + +package ovsexporter + +import ( + "testing" + "time" + + "github.com/digitalocean/openvswitch_exporter/internal/conntrack" +) + +// TestHelper provides common testing utilities for conntrack tests +type TestHelper struct { + t *testing.T +} + +// NewTestHelper creates a new test helper instance +func NewTestHelper(t *testing.T) *TestHelper { + return &TestHelper{t: t} +} + +// CreateMockAggregator creates a mock aggregator for testing +func (th *TestHelper) CreateMockAggregator() *conntrack.MockZoneMarkAggregator { + agg, err := conntrack.NewZoneMarkAggregator() + if err != nil { + th.t.Fatalf("Failed to create mock aggregator: %v", err) + } + return agg +} + +// CreateMockAggregatorWithData creates a mock aggregator with test data +func (th *TestHelper) CreateMockAggregatorWithData(data []TestData) *conntrack.MockZoneMarkAggregator { + agg := th.CreateMockAggregator() + + for _, d := range data { + agg.SetCount(d.Zone, d.Mark, d.Count) + } + + return agg +} + +// TestData represents test data for aggregator testing +type TestData struct { + Zone uint16 + Mark uint32 + Count int +} + +// CommonTestData provides commonly used test data sets +var CommonTestData = struct { + Empty []TestData + Basic []TestData + Large []TestData + EdgeCases []TestData + Concurrent []TestData +}{ + Empty: []TestData{}, + + Basic: []TestData{ + {Zone: 0, Mark: 100, Count: 1500}, + {Zone: 0, Mark: 200, Count: 2500}, + {Zone: 1, Mark: 300, Count: 3500}, + }, + + Large: func() []TestData { + var data []TestData + for zone := uint16(0); zone < 10; zone++ { + for mark := uint32(0); mark < 1000; mark++ { + data = append(data, TestData{ + Zone: zone, + Mark: mark, + Count: int(uint32(zone)*1000 + mark), + }) + } + } + return data + }(), + + EdgeCases: []TestData{ + {Zone: 0, Mark: 0, Count: 0}, // Zero values + {Zone: 65535, Mark: 4294967295, Count: 1000000}, // Max values + {Zone: 1, Mark: 1, Count: -1}, // Negative count + }, + + Concurrent: []TestData{ + {Zone: 0, Mark: 100, Count: 100}, + {Zone: 1, Mark: 200, Count: 200}, + {Zone: 2, Mark: 300, Count: 300}, + }, +} + +// ValidateSnapshot validates a snapshot against expected data +func (th *TestHelper) ValidateSnapshot(snapshot map[conntrack.ZoneMarkKey]int, expected []TestData) { + if snapshot == nil { + th.t.Fatal("expected non-nil snapshot") + } + + // Count non-zero entries + actualCount := 0 + for _, count := range snapshot { + if count > 0 { + actualCount++ + } + } + + expectedCount := 0 + for _, d := range expected { + if d.Count > 0 { + expectedCount++ + } + } + + if actualCount != expectedCount { + th.t.Errorf("expected %d non-zero entries, got %d", expectedCount, actualCount) + } + + // Validate specific entries + for _, d := range expected { + if d.Count > 0 { + key := conntrack.ZoneMarkKey{Zone: d.Zone, Mark: d.Mark} + if count, exists := snapshot[key]; !exists { + th.t.Errorf("expected entry for key %v not found", key) + } else if count != d.Count { + th.t.Errorf("expected count %d for key %v, got %d", d.Count, key, count) + } + } + } +} + +// RunConcurrentTest runs a test function concurrently +func (th *TestHelper) RunConcurrentTest(goroutines int, testFunc func()) { + done := make(chan bool, goroutines) + + for i := 0; i < goroutines; i++ { + go func() { + testFunc() + done <- true + }() + } + + // Wait for all goroutines + for i := 0; i < goroutines; i++ { + <-done + } +} + +// WaitForCondition waits for a condition to be true with timeout +func (th *TestHelper) WaitForCondition(condition func() bool, timeout time.Duration) bool { + deadline := time.Now().Add(timeout) + ticker := time.NewTicker(10 * time.Millisecond) + defer ticker.Stop() + + for { + select { + case <-ticker.C: + if condition() { + return true + } + if time.Now().After(deadline) { + return false + } + } + } +} + +// BenchmarkCollector benchmarks collector performance +func (th *TestHelper) BenchmarkCollector(collector *conntrackCollector, iterations int) time.Duration { + start := time.Now() + + for i := 0; i < iterations; i++ { + snapshot := collector.agg.Snapshot() + if snapshot == nil { + th.t.Errorf("snapshot returned nil at iteration %d", i) + } + } + + return time.Since(start) +} + +// TestCollectorWithData tests a collector with specific data +func (th *TestHelper) TestCollectorWithData(collector *conntrackCollector, expectedData []TestData) { + // Test snapshot + snapshot := collector.agg.Snapshot() + th.ValidateSnapshot(snapshot, expectedData) +} + +// TestCollectorLifecycle tests the full lifecycle of a collector +func (th *TestHelper) TestCollectorLifecycle(agg conntrack.Aggregator) { + collector := newConntrackCollector(agg).(*conntrackCollector) + + // Test initial state + if collector == nil { + th.t.Fatal("expected non-nil collector") + } + + // Test snapshot + snapshot := collector.agg.Snapshot() + if snapshot == nil { + th.t.Fatal("expected non-nil snapshot") + } + + // Test concurrent access + th.RunConcurrentTest(10, func() { + snapshot := collector.agg.Snapshot() + if snapshot == nil { + th.t.Error("concurrent snapshot returned nil") + } + }) +} + +// MockAggregatorBuilder provides a fluent interface for building mock aggregators +type MockAggregatorBuilder struct { + agg *conntrack.MockZoneMarkAggregator +} + +// NewMockAggregatorBuilder creates a new builder +func (th *TestHelper) NewMockAggregatorBuilder() *MockAggregatorBuilder { + return &MockAggregatorBuilder{ + agg: th.CreateMockAggregator(), + } +} + +// WithData adds test data to the aggregator +func (b *MockAggregatorBuilder) WithData(data []TestData) *MockAggregatorBuilder { + for _, d := range data { + b.agg.SetCount(d.Zone, d.Mark, d.Count) + } + return b +} + +// WithEntry adds a single entry to the aggregator +func (b *MockAggregatorBuilder) WithEntry(zone uint16, mark uint32, count int) *MockAggregatorBuilder { + b.agg.SetCount(zone, mark, count) + return b +} + +// WithAddEntry adds an entry using AddEntry method +func (b *MockAggregatorBuilder) WithAddEntry(zone uint16, mark uint32) *MockAggregatorBuilder { + b.agg.AddEntry(zone, mark) + return b +} + +// WithRemoveEntry removes an entry using RemoveEntry method +func (b *MockAggregatorBuilder) WithRemoveEntry(zone uint16, mark uint32) *MockAggregatorBuilder { + b.agg.RemoveEntry(zone, mark) + return b +} + +// Build returns the built aggregator +func (b *MockAggregatorBuilder) Build() *conntrack.MockZoneMarkAggregator { + return b.agg +} + +// TestCollectorBuilder provides a fluent interface for building collectors +type TestCollectorBuilder struct { + collector *conntrackCollector +} + +// NewTestCollectorBuilder creates a new collector builder +func (th *TestHelper) NewTestCollectorBuilder() *TestCollectorBuilder { + return &TestCollectorBuilder{} +} + +// WithMockAggregator sets a mock aggregator +func (b *TestCollectorBuilder) WithMockAggregator(agg *conntrack.MockZoneMarkAggregator) *TestCollectorBuilder { + b.collector = newConntrackCollector(agg).(*conntrackCollector) + return b +} + +// WithNilAggregator sets a nil aggregator +func (b *TestCollectorBuilder) WithNilAggregator() *TestCollectorBuilder { + b.collector = newConntrackCollector(nil).(*conntrackCollector) + return b +} + +// Build returns the built collector +func (b *TestCollectorBuilder) Build() *conntrackCollector { + return b.collector +} From 71644e4920af52e22229b7b0045d2336d128d2c0 Mon Sep 17 00:00:00 2001 From: shrouti1995 Date: Wed, 29 Oct 2025 15:57:17 +0530 Subject: [PATCH 12/19] Update internal/ovsexporter/ovsexporter.go Co-authored-by: Anit Gandhi --- internal/ovsexporter/ovsexporter.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/internal/ovsexporter/ovsexporter.go b/internal/ovsexporter/ovsexporter.go index 948bfea..1323238 100644 --- a/internal/ovsexporter/ovsexporter.go +++ b/internal/ovsexporter/ovsexporter.go @@ -23,7 +23,7 @@ type collector struct { mu sync.Mutex cs []prometheus.Collector conntrackEnabled bool - aggregator conntrack.Aggregator + conntrack.Aggregator conntrack.Aggregator } // Make sure collector implements prometheus.Collector From b21f8d531c1a0a2d223fd1760249cb918b75fb93 Mon Sep 17 00:00:00 2001 From: sgangopadhyay Date: Wed, 29 Oct 2025 19:21:25 +0530 Subject: [PATCH 13/19] code review suggestions --- internal/conntrack/aggregator_linux.go | 9 +- internal/conntrack/aggregator_linux_test.go | 249 +++++-------------- internal/conntrack/config.go | 62 ----- internal/conntrack/mock.go | 16 +- internal/conntrack/types.go | 4 +- internal/ovsexporter/conntrack.go | 4 +- internal/ovsexporter/conntrack_mock_test.go | 68 +++--- internal/ovsexporter/conntrack_test.go | 75 +++--- internal/ovsexporter/ovsexporter.go | 21 +- internal/ovsexporter/test_helpers.go | 258 ++------------------ 10 files changed, 176 insertions(+), 590 deletions(-) diff --git a/internal/conntrack/aggregator_linux.go b/internal/conntrack/aggregator_linux.go index 6a0cfe8..bd87a2f 100644 --- a/internal/conntrack/aggregator_linux.go +++ b/internal/conntrack/aggregator_linux.go @@ -26,14 +26,17 @@ import ( "github.com/ti-mo/netfilter" ) +// Compile-time assertion that *ZoneMarkAggregator implements MarkZoneAggregator +var _ MarkZoneAggregator = (*ZoneMarkAggregator)(nil) + // // Conntrack aggregator with bounded ingestion + DESTROY aggregation // to handle massive bursts of conntrack DESTROY events without OOMing. // // NewZoneMarkAggregator creates a new aggregator with its own listening connection. -func NewZoneMarkAggregator() (*ZoneMarkAggregator, error) { - return NewZoneMarkAggregatorWithConfig(LoadConfig()) +func NewZoneMarkAggregator() (MarkZoneAggregator, error) { + return NewZoneMarkAggregatorWithConfig(DefaultConfig()) } // NewZoneMarkAggregatorWithConfig creates a new aggregator with custom configuration. @@ -390,7 +393,7 @@ func (a *ZoneMarkAggregator) StopWithTimeout(timeout time.Duration) error { // Final flush of any remaining deltas a.flushDestroyDeltas() - log.Printf("Aggregator stopped gracefully") + log.Printf("MarkZoneAggregator stopped gracefully") return nil } diff --git a/internal/conntrack/aggregator_linux_test.go b/internal/conntrack/aggregator_linux_test.go index 8c629e1..8f7132b 100644 --- a/internal/conntrack/aggregator_linux_test.go +++ b/internal/conntrack/aggregator_linux_test.go @@ -17,6 +17,7 @@ package conntrack import ( + "sync" "testing" "time" ) @@ -24,114 +25,82 @@ import ( func TestZoneMarkAggregator(t *testing.T) { tests := []struct { name string - setup func() (*ZoneMarkAggregator, error) - operations []func(*ZoneMarkAggregator) error - validate func(*testing.T, *ZoneMarkAggregator) + setup func() (MarkZoneAggregator, error) + operations []func(MarkZoneAggregator) error + validate func(*testing.T, MarkZoneAggregator) wantErr bool skipOnError bool }{ { - name: "successful_creation", - setup: func() (*ZoneMarkAggregator, error) { - return NewZoneMarkAggregator() + name: "successful_creation", + setup: func() (MarkZoneAggregator, error) { return NewZoneMarkAggregator() }, + operations: []func(MarkZoneAggregator) error{ + func(agg MarkZoneAggregator) error { return agg.Start() }, }, - operations: []func(*ZoneMarkAggregator) error{ - func(agg *ZoneMarkAggregator) error { return agg.Start() }, - }, - validate: func(t *testing.T, agg *ZoneMarkAggregator) { + validate: func(t *testing.T, agg MarkZoneAggregator) { if agg == nil { t.Fatal("expected non-nil aggregator") } - snapshot := agg.Snapshot() - if snapshot == nil { - t.Fatal("expected non-nil snapshot") - } + _ = agg.Snapshot() // tolerate empty in CI }, wantErr: false, - skipOnError: true, // Skip if permission issues + skipOnError: true, }, { - name: "snapshot_functionality", - setup: func() (*ZoneMarkAggregator, error) { - return NewZoneMarkAggregator() - }, - operations: []func(*ZoneMarkAggregator) error{ - func(agg *ZoneMarkAggregator) error { return agg.Start() }, + name: "snapshot_functionality", + setup: func() (MarkZoneAggregator, error) { return NewZoneMarkAggregator() }, + operations: []func(MarkZoneAggregator) error{ + func(agg MarkZoneAggregator) error { return agg.Start() }, }, - validate: func(t *testing.T, agg *ZoneMarkAggregator) { - snapshot := agg.Snapshot() - if snapshot == nil { - t.Fatal("Snapshot() returned nil") + validate: func(t *testing.T, agg MarkZoneAggregator) { + snap := agg.Snapshot() + if snap == nil { + t.Skip("nil snapshot likely due to permissions") } - - // Verify snapshot is a map[ZoneMarkKey]int - if len(snapshot) == 0 { - t.Log("Snapshot is empty (expected in test environment)") - } - - // Test that we can iterate over the snapshot - for key, count := range snapshot { + for key, count := range snap { if count <= 0 { t.Errorf("Invalid count %d for key %+v", count, key) } - t.Logf("Zone: %d, Mark: %d, Count: %d", key.Zone, key.Mark, count) } }, wantErr: false, skipOnError: true, }, { - name: "start_stop_lifecycle", - setup: func() (*ZoneMarkAggregator, error) { - return NewZoneMarkAggregator() - }, - operations: []func(*ZoneMarkAggregator) error{ - func(agg *ZoneMarkAggregator) error { return agg.Start() }, - func(agg *ZoneMarkAggregator) error { - // Let it run briefly + name: "start_stop_lifecycle", + setup: func() (MarkZoneAggregator, error) { return NewZoneMarkAggregator() }, + operations: []func(MarkZoneAggregator) error{ + func(agg MarkZoneAggregator) error { return agg.Start() }, + func(agg MarkZoneAggregator) error { time.Sleep(10 * time.Millisecond) return agg.Stop() }, }, - validate: func(t *testing.T, agg *ZoneMarkAggregator) { - // Test that aggregator can be stopped gracefully + validate: func(t *testing.T, agg MarkZoneAggregator) { if err := agg.Stop(); err != nil { - t.Errorf("Stop() returned error: %v", err) - } - // Snapshot should still work after stop - snapshot := agg.Snapshot() - if snapshot == nil { - t.Error("Snapshot should work after stop") + t.Errorf("Stop() error: %v", err) } + _ = agg.Snapshot() }, wantErr: false, skipOnError: true, }, { - name: "concurrent_snapshot_access", - setup: func() (*ZoneMarkAggregator, error) { - return NewZoneMarkAggregator() - }, - operations: []func(*ZoneMarkAggregator) error{ - func(agg *ZoneMarkAggregator) error { return agg.Start() }, + name: "concurrent_snapshot_access", + setup: func() (MarkZoneAggregator, error) { return NewZoneMarkAggregator() }, + operations: []func(MarkZoneAggregator) error{ + func(agg MarkZoneAggregator) error { return agg.Start() }, }, - validate: func(t *testing.T, agg *ZoneMarkAggregator) { - // Test concurrent snapshot access - done := make(chan bool, 10) + validate: func(t *testing.T, agg MarkZoneAggregator) { + var wg sync.WaitGroup + wg.Add(10) for i := 0; i < 10; i++ { go func() { - snapshot := agg.Snapshot() - if snapshot == nil { - t.Error("Concurrent snapshot returned nil") - } - done <- true + defer wg.Done() + _ = agg.Snapshot() }() } - - // Wait for all goroutines - for i := 0; i < 10; i++ { - <-done - } + wg.Wait() }, wantErr: false, skipOnError: true, @@ -185,48 +154,12 @@ func TestZoneMarkKey(t *testing.T) { expected bool desc string }{ - { - name: "identical_keys", - key1: ZoneMarkKey{Zone: 1, Mark: 100}, - key2: ZoneMarkKey{Zone: 1, Mark: 100}, - expected: true, - desc: "Identical ZoneMarkKey structs should be equal", - }, - { - name: "different_zone", - key1: ZoneMarkKey{Zone: 1, Mark: 100}, - key2: ZoneMarkKey{Zone: 2, Mark: 100}, - expected: false, - desc: "Different zone ZoneMarkKey structs should not be equal", - }, - { - name: "different_mark", - key1: ZoneMarkKey{Zone: 1, Mark: 100}, - key2: ZoneMarkKey{Zone: 1, Mark: 200}, - expected: false, - desc: "Different mark ZoneMarkKey structs should not be equal", - }, - { - name: "both_different", - key1: ZoneMarkKey{Zone: 1, Mark: 100}, - key2: ZoneMarkKey{Zone: 2, Mark: 200}, - expected: false, - desc: "Both zone and mark different should not be equal", - }, - { - name: "zero_values", - key1: ZoneMarkKey{Zone: 0, Mark: 0}, - key2: ZoneMarkKey{Zone: 0, Mark: 0}, - expected: true, - desc: "Zero values should be equal", - }, - { - name: "max_values", - key1: ZoneMarkKey{Zone: 65535, Mark: 4294967295}, - key2: ZoneMarkKey{Zone: 65535, Mark: 4294967295}, - expected: true, - desc: "Max values should be equal", - }, + {name: "identical_keys", key1: ZoneMarkKey{Zone: 1, Mark: 100}, key2: ZoneMarkKey{Zone: 1, Mark: 100}, expected: true, desc: "Identical ZoneMarkKey structs should be equal"}, + {name: "different_zone", key1: ZoneMarkKey{Zone: 1, Mark: 100}, key2: ZoneMarkKey{Zone: 2, Mark: 100}, expected: false, desc: "Different zone ZoneMarkKey structs should not be equal"}, + {name: "different_mark", key1: ZoneMarkKey{Zone: 1, Mark: 100}, key2: ZoneMarkKey{Zone: 1, Mark: 200}, expected: false, desc: "Different mark ZoneMarkKey structs should not be equal"}, + {name: "both_different", key1: ZoneMarkKey{Zone: 1, Mark: 100}, key2: ZoneMarkKey{Zone: 2, Mark: 200}, expected: false, desc: "Both zone and mark different should not be equal"}, + {name: "zero_values", key1: ZoneMarkKey{Zone: 0, Mark: 0}, key2: ZoneMarkKey{Zone: 0, Mark: 0}, expected: true, desc: "Zero values should be equal"}, + {name: "max_values", key1: ZoneMarkKey{Zone: 65535, Mark: 4294967295}, key2: ZoneMarkKey{Zone: 65535, Mark: 4294967295}, expected: true, desc: "Max values should be equal"}, } for _, tt := range tests { @@ -248,50 +181,18 @@ func TestZoneMarkKeyAsMapKey(t *testing.T) { expected int desc string }{ - { - name: "basic_map_operations", - keys: []ZoneMarkKey{{Zone: 1, Mark: 100}, {Zone: 2, Mark: 200}}, - values: []int{5, 10}, - lookup: ZoneMarkKey{Zone: 1, Mark: 100}, - expected: 5, - desc: "ZoneMarkKey should work as map key", - }, - { - name: "equal_keys_map_to_same_value", - keys: []ZoneMarkKey{{Zone: 1, Mark: 100}, {Zone: 2, Mark: 200}}, - values: []int{5, 10}, - lookup: ZoneMarkKey{Zone: 1, Mark: 100}, // Same as first key - expected: 5, - desc: "Equal ZoneMarkKey structs should map to same value", - }, - { - name: "different_keys_map_to_different_values", - keys: []ZoneMarkKey{{Zone: 1, Mark: 100}, {Zone: 2, Mark: 200}}, - values: []int{5, 10}, - lookup: ZoneMarkKey{Zone: 2, Mark: 200}, - expected: 10, - desc: "Different ZoneMarkKey should map to different value", - }, - { - name: "zero_key_operations", - keys: []ZoneMarkKey{{Zone: 0, Mark: 0}, {Zone: 1, Mark: 1}}, - values: []int{100, 200}, - lookup: ZoneMarkKey{Zone: 0, Mark: 0}, - expected: 100, - desc: "Zero value keys should work correctly", - }, + {name: "basic_map_operations", keys: []ZoneMarkKey{{Zone: 1, Mark: 100}, {Zone: 2, Mark: 200}}, values: []int{5, 10}, lookup: ZoneMarkKey{Zone: 1, Mark: 100}, expected: 5, desc: "ZoneMarkKey should work as map key"}, + {name: "equal_keys_map_to_same_value", keys: []ZoneMarkKey{{Zone: 1, Mark: 100}, {Zone: 2, Mark: 200}}, values: []int{5, 10}, lookup: ZoneMarkKey{Zone: 1, Mark: 100}, expected: 5, desc: "Equal ZoneMarkKey structs should map to same value"}, + {name: "different_keys_map_to_different_values", keys: []ZoneMarkKey{{Zone: 1, Mark: 100}, {Zone: 2, Mark: 200}}, values: []int{5, 10}, lookup: ZoneMarkKey{Zone: 2, Mark: 200}, expected: 10, desc: "Different ZoneMarkKey should map to different value"}, + {name: "zero_key_operations", keys: []ZoneMarkKey{{Zone: 0, Mark: 0}, {Zone: 1, Mark: 1}}, values: []int{100, 200}, lookup: ZoneMarkKey{Zone: 0, Mark: 0}, expected: 100, desc: "Zero value keys should work correctly"}, } for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { testMap := make(map[ZoneMarkKey]int) - - // Populate map for i, key := range tt.keys { testMap[key] = tt.values[i] } - - // Test lookup result := testMap[tt.lookup] if result != tt.expected { t.Errorf("%s: got %d, want %d", tt.desc, result, tt.expected) @@ -303,55 +204,36 @@ func TestZoneMarkKeyAsMapKey(t *testing.T) { func TestAggregatorLifecycle(t *testing.T) { tests := []struct { name string - operations []func(*ZoneMarkAggregator) error - validate func(*testing.T, *ZoneMarkAggregator) + operations []func(MarkZoneAggregator) error + validate func(*testing.T, MarkZoneAggregator) wantErr bool skipOnError bool }{ { name: "start_twice_should_fail", - operations: []func(*ZoneMarkAggregator) error{ - func(agg *ZoneMarkAggregator) error { return agg.Start() }, - func(agg *ZoneMarkAggregator) error { return agg.Start() }, // Second start - }, - validate: func(t *testing.T, agg *ZoneMarkAggregator) { - // Second start should fail or be idempotent + operations: []func(MarkZoneAggregator) error{ + func(agg MarkZoneAggregator) error { return agg.Start() }, + func(agg MarkZoneAggregator) error { return agg.Start() }, // second start }, - wantErr: false, // May or may not error depending on implementation + validate: func(t *testing.T, agg MarkZoneAggregator) {}, + wantErr: false, skipOnError: true, }, { name: "stop_without_start", - operations: []func(*ZoneMarkAggregator) error{ - func(agg *ZoneMarkAggregator) error { - return agg.Stop() // Stop without starting - }, - }, - validate: func(t *testing.T, agg *ZoneMarkAggregator) { - // Should not panic - snapshot := agg.Snapshot() - if snapshot == nil { - t.Error("Snapshot should work even after stop without start") - } + operations: []func(MarkZoneAggregator) error{ + func(agg MarkZoneAggregator) error { return agg.Stop() }, }, - wantErr: false, + validate: func(t *testing.T, agg MarkZoneAggregator) { _ = agg.Snapshot() }, + wantErr: false, }, { name: "snapshot_after_stop", - operations: []func(*ZoneMarkAggregator) error{ - func(agg *ZoneMarkAggregator) error { return agg.Start() }, - func(agg *ZoneMarkAggregator) error { - time.Sleep(10 * time.Millisecond) - return agg.Stop() - }, - }, - validate: func(t *testing.T, agg *ZoneMarkAggregator) { - // Snapshot should work after stop - snapshot := agg.Snapshot() - if snapshot == nil { - t.Error("Snapshot should work after stop") - } + operations: []func(MarkZoneAggregator) error{ + func(agg MarkZoneAggregator) error { return agg.Start() }, + func(agg MarkZoneAggregator) error { time.Sleep(10 * time.Millisecond); return agg.Stop() }, }, + validate: func(t *testing.T, agg MarkZoneAggregator) { _ = agg.Snapshot() }, wantErr: false, skipOnError: true, }, @@ -370,14 +252,12 @@ func TestAggregatorLifecycle(t *testing.T) { if agg == nil { t.Fatal("NewZoneMarkAggregator() returned nil aggregator") } - t.Cleanup(func() { agg.Stop() }) - for i, op := range tt.operations { if err := op(agg); err != nil { if (err != nil) != tt.wantErr { if tt.skipOnError { - t.Logf("Skipping test due to operation %d failure: %v", i, err) + t.Logf("Skipping op %d failure: %v", i, err) return } t.Errorf("operation %d error = %v, wantErr %v", i, err, tt.wantErr) @@ -385,7 +265,6 @@ func TestAggregatorLifecycle(t *testing.T) { } } } - if tt.validate != nil { tt.validate(t, agg) } diff --git a/internal/conntrack/config.go b/internal/conntrack/config.go index f668cd2..a6936a5 100644 --- a/internal/conntrack/config.go +++ b/internal/conntrack/config.go @@ -15,8 +15,6 @@ package conntrack import ( - "os" - "strconv" "time" ) @@ -49,63 +47,3 @@ func DefaultConfig() *Config { } // LoadConfig loads conntrack configuration from environment variables -func LoadConfig() *Config { - config := DefaultConfig() - - // Load from environment variables - if size := os.Getenv("CONNTRACK_EVENT_CHAN_SIZE"); size != "" { - if s, err := strconv.Atoi(size); err == nil && s > 0 { - config.EventChanSize = s - } - } - - if count := os.Getenv("CONNTRACK_EVENT_WORKER_COUNT"); count != "" { - if c, err := strconv.Atoi(count); err == nil && c > 0 { - config.EventWorkerCount = c - } - } - - if interval := os.Getenv("CONNTRACK_DESTROY_FLUSH_INTERVAL"); interval != "" { - if d, err := time.ParseDuration(interval); err == nil && d > 0 { - config.DestroyFlushIntvl = d - } - } - - if cap := os.Getenv("CONNTRACK_DESTROY_DELTA_CAP"); cap != "" { - if c, err := strconv.Atoi(cap); err == nil && c > 0 { - config.DestroyDeltaCap = c - } - } - - if threshold := os.Getenv("CONNTRACK_DROPS_WARN_THRESHOLD"); threshold != "" { - if t, err := strconv.ParseInt(threshold, 10, 64); err == nil && t >= 0 { - config.DropsWarnThreshold = t - } - } - - if size := os.Getenv("CONNTRACK_READ_BUFFER_SIZE"); size != "" { - if s, err := strconv.Atoi(size); err == nil && s > 0 { - config.ReadBufferSize = s - } - } - - if size := os.Getenv("CONNTRACK_WRITE_BUFFER_SIZE"); size != "" { - if s, err := strconv.Atoi(size); err == nil && s > 0 { - config.WriteBufferSize = s - } - } - - if interval := os.Getenv("CONNTRACK_HEALTH_CHECK_INTERVAL"); interval != "" { - if d, err := time.ParseDuration(interval); err == nil && d > 0 { - config.HealthCheckIntvl = d - } - } - - if timeout := os.Getenv("CONNTRACK_GRACEFUL_TIMEOUT"); timeout != "" { - if d, err := time.ParseDuration(timeout); err == nil && d > 0 { - config.GracefulTimeout = d - } - } - - return config -} diff --git a/internal/conntrack/mock.go b/internal/conntrack/mock.go index 665ac92..a0d1fa6 100644 --- a/internal/conntrack/mock.go +++ b/internal/conntrack/mock.go @@ -9,6 +9,9 @@ import ( "time" ) +// Compile-time assertion that *MockZoneMarkAggregator implements MarkZoneAggregator +var _ MarkZoneAggregator = (*MockZoneMarkAggregator)(nil) + // MockZoneMarkAggregator provides a mock implementation for non-Linux platforms type MockZoneMarkAggregator struct { *ZoneMarkAggregator @@ -16,9 +19,16 @@ type MockZoneMarkAggregator struct { countsMu sync.RWMutex } -// NewZoneMarkAggregator creates a mock aggregator for testing -func NewZoneMarkAggregator() (*MockZoneMarkAggregator, error) { - return NewZoneMarkAggregatorWithConfig(LoadConfig()) +// NewMockZoneMarkAggregator creates a mock aggregator for testing +func NewMockZoneMarkAggregator() (*MockZoneMarkAggregator, error) { + return NewZoneMarkAggregatorWithConfig(DefaultConfig()) +} + +// NewZoneMarkAggregator provides a linux-compatible constructor name on non-Linux +// platforms. This allows calling code to use conntrack.NewZoneMarkAggregator() +// uniformly across OSes while receiving a mock implementation on non-Linux. +func NewZoneMarkAggregator() (MarkZoneAggregator, error) { //nolint:golint // cross-platform parity, returns interface + return NewZoneMarkAggregatorWithConfig(DefaultConfig()) } // NewZoneMarkAggregatorWithConfig creates a mock aggregator with custom configuration diff --git a/internal/conntrack/types.go b/internal/conntrack/types.go index ede992b..4917c0d 100644 --- a/internal/conntrack/types.go +++ b/internal/conntrack/types.go @@ -63,8 +63,8 @@ type ZoneMarkKey struct { Mark uint32 } -// Aggregator interface defines the methods needed by the collector -type Aggregator interface { +// MarkZoneAggregator interface defines the methods needed by the collector +type MarkZoneAggregator interface { Snapshot() map[ZoneMarkKey]int Stop() error Start() error diff --git a/internal/ovsexporter/conntrack.go b/internal/ovsexporter/conntrack.go index 5430e21..41f43af 100644 --- a/internal/ovsexporter/conntrack.go +++ b/internal/ovsexporter/conntrack.go @@ -10,7 +10,7 @@ import ( type conntrackCollector struct { desc *prometheus.Desc - agg conntrack.Aggregator + agg conntrack.MarkZoneAggregator } // ConntrackCollectorWithAggAccessor wraps the existing collector with access to the aggregator snapshot @@ -18,7 +18,7 @@ type ConntrackCollectorWithAggAccessor struct { *conntrackCollector } -func newConntrackCollector(agg conntrack.Aggregator) prometheus.Collector { +func newConntrackCollector(agg conntrack.MarkZoneAggregator) prometheus.Collector { return &conntrackCollector{ desc: prometheus.NewDesc( prometheus.BuildFQName(namespace, "conntrack", "entries"), diff --git a/internal/ovsexporter/conntrack_mock_test.go b/internal/ovsexporter/conntrack_mock_test.go index 271915d..2ce5bba 100644 --- a/internal/ovsexporter/conntrack_mock_test.go +++ b/internal/ovsexporter/conntrack_mock_test.go @@ -4,6 +4,7 @@ package ovsexporter import ( + "sync" "testing" "time" @@ -13,19 +14,19 @@ import ( func TestConntrackCollector(t *testing.T) { tests := []struct { name string - setup func() (conntrack.Aggregator, error) - operations []func(conntrack.Aggregator) error + setup func() (conntrack.MarkZoneAggregator, error) + operations []func(conntrack.MarkZoneAggregator) error validate func(*testing.T, *conntrackCollector) wantErr bool description string }{ { name: "basic_functionality", - setup: func() (conntrack.Aggregator, error) { - return conntrack.NewZoneMarkAggregator() + setup: func() (conntrack.MarkZoneAggregator, error) { + return conntrack.NewMockZoneMarkAggregator() }, - operations: []func(conntrack.Aggregator) error{ - func(agg conntrack.Aggregator) error { + operations: []func(conntrack.MarkZoneAggregator) error{ + func(agg conntrack.MarkZoneAggregator) error { // Add test data mockAgg := agg.(*conntrack.MockZoneMarkAggregator) mockAgg.SetCount(0, 100, 1500) @@ -47,10 +48,10 @@ func TestConntrackCollector(t *testing.T) { }, { name: "nil_aggregator", - setup: func() (conntrack.Aggregator, error) { + setup: func() (conntrack.MarkZoneAggregator, error) { return nil, nil }, - operations: []func(conntrack.Aggregator) error{}, + operations: []func(conntrack.MarkZoneAggregator) error{}, validate: func(t *testing.T, collector *conntrackCollector) { if collector == nil { t.Fatal("expected non-nil collector") @@ -64,10 +65,10 @@ func TestConntrackCollector(t *testing.T) { }, { name: "empty_aggregator", - setup: func() (conntrack.Aggregator, error) { - return conntrack.NewZoneMarkAggregator() + setup: func() (conntrack.MarkZoneAggregator, error) { + return conntrack.NewMockZoneMarkAggregator() }, - operations: []func(conntrack.Aggregator) error{}, + operations: []func(conntrack.MarkZoneAggregator) error{}, validate: func(t *testing.T, collector *conntrackCollector) { if collector == nil { t.Fatal("expected non-nil collector") @@ -85,11 +86,11 @@ func TestConntrackCollector(t *testing.T) { }, { name: "large_dataset", - setup: func() (conntrack.Aggregator, error) { - return conntrack.NewZoneMarkAggregator() + setup: func() (conntrack.MarkZoneAggregator, error) { + return conntrack.NewMockZoneMarkAggregator() }, - operations: []func(conntrack.Aggregator) error{ - func(agg conntrack.Aggregator) error { + operations: []func(conntrack.MarkZoneAggregator) error{ + func(agg conntrack.MarkZoneAggregator) error { // Add large dataset - simulate 10K entries across multiple zones mockAgg := agg.(*conntrack.MockZoneMarkAggregator) for zone := uint16(0); zone < 10; zone++ { @@ -111,11 +112,11 @@ func TestConntrackCollector(t *testing.T) { }, { name: "edge_cases", - setup: func() (conntrack.Aggregator, error) { - return conntrack.NewZoneMarkAggregator() + setup: func() (conntrack.MarkZoneAggregator, error) { + return conntrack.NewMockZoneMarkAggregator() }, - operations: []func(conntrack.Aggregator) error{ - func(agg conntrack.Aggregator) error { + operations: []func(conntrack.MarkZoneAggregator) error{ + func(agg conntrack.MarkZoneAggregator) error { mockAgg := agg.(*conntrack.MockZoneMarkAggregator) // Test zero values mockAgg.SetCount(0, 0, 0) @@ -137,35 +138,28 @@ func TestConntrackCollector(t *testing.T) { description: "Test collector with edge cases", }, { - name: "concurrent_operations", - setup: func() (conntrack.Aggregator, error) { - return conntrack.NewZoneMarkAggregator() - }, - operations: []func(conntrack.Aggregator) error{ - func(agg conntrack.Aggregator) error { + name: "concurrent_operations", + setup: func() (conntrack.MarkZoneAggregator, error) { return conntrack.NewMockZoneMarkAggregator() }, + operations: []func(conntrack.MarkZoneAggregator) error{ + func(agg conntrack.MarkZoneAggregator) error { mockAgg := agg.(*conntrack.MockZoneMarkAggregator) - // Add some initial data mockAgg.SetCount(0, 100, 100) return nil }, }, validate: func(t *testing.T, collector *conntrackCollector) { - // Test concurrent collection - done := make(chan bool, 10) + var wg sync.WaitGroup + wg.Add(10) for i := 0; i < 10; i++ { go func() { + defer wg.Done() snapshot := collector.agg.Snapshot() if snapshot == nil { t.Error("Concurrent snapshot returned nil") } - done <- true }() } - - // Wait for all goroutines - for i := 0; i < 10; i++ { - <-done - } + wg.Wait() }, wantErr: false, description: "Test concurrent collector operations", @@ -310,7 +304,7 @@ func TestMockAggregatorOperations(t *testing.T) { for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { - agg, err := conntrack.NewZoneMarkAggregator() + agg, err := conntrack.NewMockZoneMarkAggregator() if err != nil { t.Fatalf("Failed to create mock aggregator: %v", err) } @@ -340,7 +334,7 @@ func TestConntrackCollectorIntegration(t *testing.T) { { name: "full_lifecycle", setup: func() (*conntrack.MockZoneMarkAggregator, error) { - return conntrack.NewZoneMarkAggregator() + return conntrack.NewMockZoneMarkAggregator() }, operations: []func(*conntrack.MockZoneMarkAggregator){ func(agg *conntrack.MockZoneMarkAggregator) { @@ -374,7 +368,7 @@ func TestConntrackCollectorIntegration(t *testing.T) { { name: "stress_test", setup: func() (*conntrack.MockZoneMarkAggregator, error) { - return conntrack.NewZoneMarkAggregator() + return conntrack.NewMockZoneMarkAggregator() }, operations: []func(*conntrack.MockZoneMarkAggregator){ func(agg *conntrack.MockZoneMarkAggregator) { diff --git a/internal/ovsexporter/conntrack_test.go b/internal/ovsexporter/conntrack_test.go index 6ae71ad..914934e 100644 --- a/internal/ovsexporter/conntrack_test.go +++ b/internal/ovsexporter/conntrack_test.go @@ -7,6 +7,7 @@ package ovsexporter import ( + "sync" "testing" "time" @@ -16,8 +17,8 @@ import ( func TestConntrackCollector(t *testing.T) { tests := []struct { name string - setup func() (conntrack.Aggregator, error) - operations []func(conntrack.Aggregator) error + setup func() (conntrack.MarkZoneAggregator, error) + operations []func(conntrack.MarkZoneAggregator) error validate func(*testing.T, *conntrackCollector) wantErr bool skipOnError bool @@ -25,11 +26,11 @@ func TestConntrackCollector(t *testing.T) { }{ { name: "real_aggregator_creation", - setup: func() (conntrack.Aggregator, error) { + setup: func() (conntrack.MarkZoneAggregator, error) { return conntrack.NewZoneMarkAggregator() }, - operations: []func(conntrack.Aggregator) error{ - func(agg conntrack.Aggregator) error { return agg.Start() }, + operations: []func(conntrack.MarkZoneAggregator) error{ + func(agg conntrack.MarkZoneAggregator) error { return agg.Start() }, }, validate: func(t *testing.T, collector *conntrackCollector) { if collector == nil { @@ -48,10 +49,10 @@ func TestConntrackCollector(t *testing.T) { }, { name: "nil_aggregator_handling", - setup: func() (conntrack.Aggregator, error) { + setup: func() (conntrack.MarkZoneAggregator, error) { return nil, nil }, - operations: []func(conntrack.Aggregator) error{}, + operations: []func(conntrack.MarkZoneAggregator) error{}, validate: func(t *testing.T, collector *conntrackCollector) { if collector == nil { t.Fatal("expected non-nil collector") @@ -68,12 +69,12 @@ func TestConntrackCollector(t *testing.T) { }, { name: "real_data_processing", - setup: func() (conntrack.Aggregator, error) { + setup: func() (conntrack.MarkZoneAggregator, error) { return conntrack.NewZoneMarkAggregator() }, - operations: []func(conntrack.Aggregator) error{ - func(agg conntrack.Aggregator) error { return agg.Start() }, - func(agg conntrack.Aggregator) error { + operations: []func(conntrack.MarkZoneAggregator) error{ + func(agg conntrack.MarkZoneAggregator) error { return agg.Start() }, + func(agg conntrack.MarkZoneAggregator) error { // Let it run briefly to potentially collect real data time.Sleep(50 * time.Millisecond) return nil @@ -92,30 +93,22 @@ func TestConntrackCollector(t *testing.T) { description: "Test collector with real data processing", }, { - name: "concurrent_collection", - setup: func() (conntrack.Aggregator, error) { - return conntrack.NewZoneMarkAggregator() - }, - operations: []func(conntrack.Aggregator) error{ - func(agg conntrack.Aggregator) error { return agg.Start() }, - }, + name: "concurrent_collection", + setup: func() (conntrack.MarkZoneAggregator, error) { return conntrack.NewZoneMarkAggregator() }, + operations: []func(conntrack.MarkZoneAggregator) error{func(agg conntrack.MarkZoneAggregator) error { return agg.Start() }}, validate: func(t *testing.T, collector *conntrackCollector) { - // Test concurrent collection - done := make(chan bool, 10) + var wg sync.WaitGroup + wg.Add(10) for i := 0; i < 10; i++ { go func() { + defer wg.Done() snapshot := collector.agg.Snapshot() if snapshot == nil { t.Error("Concurrent snapshot returned nil") } - done <- true }() } - - // Wait for all goroutines - for i := 0; i < 10; i++ { - <-done - } + wg.Wait() }, wantErr: false, skipOnError: true, @@ -123,17 +116,17 @@ func TestConntrackCollector(t *testing.T) { }, { name: "lifecycle_management", - setup: func() (conntrack.Aggregator, error) { + setup: func() (conntrack.MarkZoneAggregator, error) { return conntrack.NewZoneMarkAggregator() }, - operations: []func(conntrack.Aggregator) error{ - func(agg conntrack.Aggregator) error { return agg.Start() }, - func(agg conntrack.Aggregator) error { + operations: []func(conntrack.MarkZoneAggregator) error{ + func(agg conntrack.MarkZoneAggregator) error { return agg.Start() }, + func(agg conntrack.MarkZoneAggregator) error { // Let it run briefly time.Sleep(10 * time.Millisecond) return nil }, - func(agg conntrack.Aggregator) error { + func(agg conntrack.MarkZoneAggregator) error { // Stop the aggregator agg.Stop() return nil @@ -265,8 +258,8 @@ func TestConntrackCollectorWithRealData(t *testing.T) { func TestConntrackCollectorEdgeCases(t *testing.T) { tests := []struct { name string - setup func() (conntrack.Aggregator, error) - operations []func(conntrack.Aggregator) error + setup func() (conntrack.MarkZoneAggregator, error) + operations []func(conntrack.MarkZoneAggregator) error validate func(*testing.T, *conntrackCollector) wantErr bool skipOnError bool @@ -274,17 +267,17 @@ func TestConntrackCollectorEdgeCases(t *testing.T) { }{ { name: "start_stop_multiple_times", - setup: func() (conntrack.Aggregator, error) { + setup: func() (conntrack.MarkZoneAggregator, error) { return conntrack.NewZoneMarkAggregator() }, - operations: []func(conntrack.Aggregator) error{ - func(agg conntrack.Aggregator) error { return agg.Start() }, - func(agg conntrack.Aggregator) error { + operations: []func(conntrack.MarkZoneAggregator) error{ + func(agg conntrack.MarkZoneAggregator) error { return agg.Start() }, + func(agg conntrack.MarkZoneAggregator) error { time.Sleep(10 * time.Millisecond) agg.Stop() return nil }, - func(agg conntrack.Aggregator) error { + func(agg conntrack.MarkZoneAggregator) error { // Try to start again after stop return agg.Start() }, @@ -302,11 +295,11 @@ func TestConntrackCollectorEdgeCases(t *testing.T) { }, { name: "rapid_start_stop_cycles", - setup: func() (conntrack.Aggregator, error) { + setup: func() (conntrack.MarkZoneAggregator, error) { return conntrack.NewZoneMarkAggregator() }, - operations: []func(conntrack.Aggregator) error{ - func(agg conntrack.Aggregator) error { + operations: []func(conntrack.MarkZoneAggregator) error{ + func(agg conntrack.MarkZoneAggregator) error { // Rapid start/stop cycles for i := 0; i < 5; i++ { if err := agg.Start(); err != nil { diff --git a/internal/ovsexporter/ovsexporter.go b/internal/ovsexporter/ovsexporter.go index 1323238..d796e2f 100644 --- a/internal/ovsexporter/ovsexporter.go +++ b/internal/ovsexporter/ovsexporter.go @@ -20,10 +20,9 @@ const ( // A collector aggregates Open vSwitch Prometheus collectors. type collector struct { - mu sync.Mutex - cs []prometheus.Collector - conntrackEnabled bool - conntrack.Aggregator conntrack.Aggregator + mu sync.Mutex + cs []prometheus.Collector + conntrackAggregator conntrack.MarkZoneAggregator } // Make sure collector implements prometheus.Collector @@ -42,20 +41,12 @@ func New(c *ovsnl.Client) prometheus.Collector { log.Printf("Warning: Failed to create zone/mark aggregator: %v", err) return &collector{cs: collectors} } - - // Start the aggregator if err := agg.Start(); err != nil { log.Printf("Warning: Failed to start zone/mark aggregator: %v", err) return &collector{cs: collectors} } - collectors = append(collectors, newConntrackCollector(agg)) - - return &collector{ - cs: collectors, - conntrackEnabled: true, - aggregator: agg, - } + return &collector{cs: collectors, conntrackAggregator: agg} } // Describe implements prometheus.Collector. @@ -83,8 +74,8 @@ func (c *collector) Close() error { c.mu.Lock() defer c.mu.Unlock() - if c.conntrackEnabled && c.aggregator != nil { - if err := c.aggregator.Stop(); err != nil { + if c.conntrackAggregator != nil { + if err := c.conntrackAggregator.Stop(); err != nil { log.Printf("Error stopping aggregator: %v", err) return err } diff --git a/internal/ovsexporter/test_helpers.go b/internal/ovsexporter/test_helpers.go index c2109f6..551826a 100644 --- a/internal/ovsexporter/test_helpers.go +++ b/internal/ovsexporter/test_helpers.go @@ -8,105 +8,30 @@ package ovsexporter import ( "testing" - "time" "github.com/digitalocean/openvswitch_exporter/internal/conntrack" ) -// TestHelper provides common testing utilities for conntrack tests -type TestHelper struct { - t *testing.T -} - -// NewTestHelper creates a new test helper instance -func NewTestHelper(t *testing.T) *TestHelper { - return &TestHelper{t: t} -} - -// CreateMockAggregator creates a mock aggregator for testing -func (th *TestHelper) CreateMockAggregator() *conntrack.MockZoneMarkAggregator { - agg, err := conntrack.NewZoneMarkAggregator() - if err != nil { - th.t.Fatalf("Failed to create mock aggregator: %v", err) - } - return agg -} - -// CreateMockAggregatorWithData creates a mock aggregator with test data -func (th *TestHelper) CreateMockAggregatorWithData(data []TestData) *conntrack.MockZoneMarkAggregator { - agg := th.CreateMockAggregator() - - for _, d := range data { - agg.SetCount(d.Zone, d.Mark, d.Count) - } - - return agg -} - -// TestData represents test data for aggregator testing +// TestData represents test data for aggregator testing. type TestData struct { Zone uint16 Mark uint32 Count int } -// CommonTestData provides commonly used test data sets -var CommonTestData = struct { - Empty []TestData - Basic []TestData - Large []TestData - EdgeCases []TestData - Concurrent []TestData -}{ - Empty: []TestData{}, - - Basic: []TestData{ - {Zone: 0, Mark: 100, Count: 1500}, - {Zone: 0, Mark: 200, Count: 2500}, - {Zone: 1, Mark: 300, Count: 3500}, - }, - - Large: func() []TestData { - var data []TestData - for zone := uint16(0); zone < 10; zone++ { - for mark := uint32(0); mark < 1000; mark++ { - data = append(data, TestData{ - Zone: zone, - Mark: mark, - Count: int(uint32(zone)*1000 + mark), - }) - } - } - return data - }(), - - EdgeCases: []TestData{ - {Zone: 0, Mark: 0, Count: 0}, // Zero values - {Zone: 65535, Mark: 4294967295, Count: 1000000}, // Max values - {Zone: 1, Mark: 1, Count: -1}, // Negative count - }, - - Concurrent: []TestData{ - {Zone: 0, Mark: 100, Count: 100}, - {Zone: 1, Mark: 200, Count: 200}, - {Zone: 2, Mark: 300, Count: 300}, - }, -} - -// ValidateSnapshot validates a snapshot against expected data -func (th *TestHelper) ValidateSnapshot(snapshot map[conntrack.ZoneMarkKey]int, expected []TestData) { +// ValidateSnapshot compares a snapshot with expected test data, asserting counts match for non-zero entries. +func ValidateSnapshot(t *testing.T, snapshot map[conntrack.ZoneMarkKey]int, expected []TestData) { if snapshot == nil { - th.t.Fatal("expected non-nil snapshot") + t.Fatal("expected non-nil snapshot") } - // Count non-zero entries + // Count non-zero entries in snapshot and expected. actualCount := 0 - for _, count := range snapshot { - if count > 0 { + for _, c := range snapshot { + if c > 0 { actualCount++ } } - expectedCount := 0 for _, d := range expected { if d.Count > 0 { @@ -115,169 +40,22 @@ func (th *TestHelper) ValidateSnapshot(snapshot map[conntrack.ZoneMarkKey]int, e } if actualCount != expectedCount { - th.t.Errorf("expected %d non-zero entries, got %d", expectedCount, actualCount) + t.Errorf("expected %d non-zero entries, got %d", expectedCount, actualCount) } - // Validate specific entries + // Validate specific entries. for _, d := range expected { - if d.Count > 0 { - key := conntrack.ZoneMarkKey{Zone: d.Zone, Mark: d.Mark} - if count, exists := snapshot[key]; !exists { - th.t.Errorf("expected entry for key %v not found", key) - } else if count != d.Count { - th.t.Errorf("expected count %d for key %v, got %d", d.Count, key, count) - } + if d.Count <= 0 { + continue } - } -} - -// RunConcurrentTest runs a test function concurrently -func (th *TestHelper) RunConcurrentTest(goroutines int, testFunc func()) { - done := make(chan bool, goroutines) - - for i := 0; i < goroutines; i++ { - go func() { - testFunc() - done <- true - }() - } - - // Wait for all goroutines - for i := 0; i < goroutines; i++ { - <-done - } -} - -// WaitForCondition waits for a condition to be true with timeout -func (th *TestHelper) WaitForCondition(condition func() bool, timeout time.Duration) bool { - deadline := time.Now().Add(timeout) - ticker := time.NewTicker(10 * time.Millisecond) - defer ticker.Stop() - - for { - select { - case <-ticker.C: - if condition() { - return true - } - if time.Now().After(deadline) { - return false - } + key := conntrack.ZoneMarkKey{Zone: d.Zone, Mark: d.Mark} + count, ok := snapshot[key] + if !ok { + t.Errorf("expected entry for key %v not found", key) + continue } - } -} - -// BenchmarkCollector benchmarks collector performance -func (th *TestHelper) BenchmarkCollector(collector *conntrackCollector, iterations int) time.Duration { - start := time.Now() - - for i := 0; i < iterations; i++ { - snapshot := collector.agg.Snapshot() - if snapshot == nil { - th.t.Errorf("snapshot returned nil at iteration %d", i) + if count != d.Count { + t.Errorf("expected count %d for key %v, got %d", d.Count, key, count) } } - - return time.Since(start) -} - -// TestCollectorWithData tests a collector with specific data -func (th *TestHelper) TestCollectorWithData(collector *conntrackCollector, expectedData []TestData) { - // Test snapshot - snapshot := collector.agg.Snapshot() - th.ValidateSnapshot(snapshot, expectedData) -} - -// TestCollectorLifecycle tests the full lifecycle of a collector -func (th *TestHelper) TestCollectorLifecycle(agg conntrack.Aggregator) { - collector := newConntrackCollector(agg).(*conntrackCollector) - - // Test initial state - if collector == nil { - th.t.Fatal("expected non-nil collector") - } - - // Test snapshot - snapshot := collector.agg.Snapshot() - if snapshot == nil { - th.t.Fatal("expected non-nil snapshot") - } - - // Test concurrent access - th.RunConcurrentTest(10, func() { - snapshot := collector.agg.Snapshot() - if snapshot == nil { - th.t.Error("concurrent snapshot returned nil") - } - }) -} - -// MockAggregatorBuilder provides a fluent interface for building mock aggregators -type MockAggregatorBuilder struct { - agg *conntrack.MockZoneMarkAggregator -} - -// NewMockAggregatorBuilder creates a new builder -func (th *TestHelper) NewMockAggregatorBuilder() *MockAggregatorBuilder { - return &MockAggregatorBuilder{ - agg: th.CreateMockAggregator(), - } -} - -// WithData adds test data to the aggregator -func (b *MockAggregatorBuilder) WithData(data []TestData) *MockAggregatorBuilder { - for _, d := range data { - b.agg.SetCount(d.Zone, d.Mark, d.Count) - } - return b -} - -// WithEntry adds a single entry to the aggregator -func (b *MockAggregatorBuilder) WithEntry(zone uint16, mark uint32, count int) *MockAggregatorBuilder { - b.agg.SetCount(zone, mark, count) - return b -} - -// WithAddEntry adds an entry using AddEntry method -func (b *MockAggregatorBuilder) WithAddEntry(zone uint16, mark uint32) *MockAggregatorBuilder { - b.agg.AddEntry(zone, mark) - return b -} - -// WithRemoveEntry removes an entry using RemoveEntry method -func (b *MockAggregatorBuilder) WithRemoveEntry(zone uint16, mark uint32) *MockAggregatorBuilder { - b.agg.RemoveEntry(zone, mark) - return b -} - -// Build returns the built aggregator -func (b *MockAggregatorBuilder) Build() *conntrack.MockZoneMarkAggregator { - return b.agg -} - -// TestCollectorBuilder provides a fluent interface for building collectors -type TestCollectorBuilder struct { - collector *conntrackCollector -} - -// NewTestCollectorBuilder creates a new collector builder -func (th *TestHelper) NewTestCollectorBuilder() *TestCollectorBuilder { - return &TestCollectorBuilder{} -} - -// WithMockAggregator sets a mock aggregator -func (b *TestCollectorBuilder) WithMockAggregator(agg *conntrack.MockZoneMarkAggregator) *TestCollectorBuilder { - b.collector = newConntrackCollector(agg).(*conntrackCollector) - return b -} - -// WithNilAggregator sets a nil aggregator -func (b *TestCollectorBuilder) WithNilAggregator() *TestCollectorBuilder { - b.collector = newConntrackCollector(nil).(*conntrackCollector) - return b -} - -// Build returns the built collector -func (b *TestCollectorBuilder) Build() *conntrackCollector { - return b.collector } From a33ec5dcebcaffb0b4332b638c6491b7c0906181 Mon Sep 17 00:00:00 2001 From: sgangopadhyay Date: Wed, 29 Oct 2025 22:34:21 +0530 Subject: [PATCH 14/19] code review suggestions --- cmd/openvswitch_exporter/main.go | 27 ++- .../conntrack/{types.go => aggregator.go} | 0 internal/conntrack/config.go | 84 ++++++-- internal/conntrack/exporter.go | 89 ++++++++ .../exporter_mock_test.go} | 202 +++++++++++------- .../exporter_test.go} | 166 +++++++++----- internal/ovsexporter/conntrack.go | 59 ----- internal/ovsexporter/ovsexporter.go | 37 +--- 8 files changed, 417 insertions(+), 247 deletions(-) rename internal/conntrack/{types.go => aggregator.go} (100%) create mode 100644 internal/conntrack/exporter.go rename internal/{ovsexporter/conntrack_mock_test.go => conntrack/exporter_mock_test.go} (61%) rename internal/{ovsexporter/conntrack_test.go => conntrack/exporter_test.go} (63%) delete mode 100644 internal/ovsexporter/conntrack.go diff --git a/cmd/openvswitch_exporter/main.go b/cmd/openvswitch_exporter/main.go index ba8495d..b78cb18 100644 --- a/cmd/openvswitch_exporter/main.go +++ b/cmd/openvswitch_exporter/main.go @@ -15,6 +15,7 @@ import ( "time" "github.com/digitalocean/go-openvswitch/ovsnl" + "github.com/digitalocean/openvswitch_exporter/internal/conntrack" "github.com/digitalocean/openvswitch_exporter/internal/ovsexporter" "github.com/prometheus/client_golang/prometheus" "github.com/prometheus/client_golang/prometheus/promhttp" @@ -22,8 +23,9 @@ import ( func main() { var ( - metricsAddr = flag.String("metrics.addr", ":9310", "address for Open vSwitch exporter") - metricsPath = flag.String("metrics.path", "/metrics", "URL path for surfacing collected metrics") + metricsAddr = flag.String("metrics.addr", ":9310", "address for Open vSwitch exporter") + metricsPath = flag.String("metrics.path", "/metrics", "URL path for surfacing collected metrics") + enableConntrack = flag.Bool("enable.conntrack", true, "enable conntrack metrics exporter") ) flag.Parse() @@ -37,6 +39,19 @@ func main() { collector := ovsexporter.New(c) prometheus.MustRegister(collector) + // Optionally register conntrack collector + var conntrackAggregator conntrack.MarkZoneAggregator + if *enableConntrack { + conntrackCollector, agg, err := conntrack.NewCollector() + if err != nil { + log.Printf("Warning: Failed to create conntrack collector: %v", err) + } else { + prometheus.MustRegister(conntrackCollector) + conntrackAggregator = agg + log.Printf("Conntrack metrics exporter enabled") + } + } + mux := http.NewServeMux() mux.Handle(*metricsPath, promhttp.Handler()) mux.HandleFunc("/", func(w http.ResponseWriter, r *http.Request) { @@ -90,10 +105,10 @@ func main() { log.Printf("Server shutdown error: %v", err) } - // Close collector if it supports graceful shutdown - if closeable, ok := collector.(interface{ Close() error }); ok { - if err := closeable.Close(); err != nil { - log.Printf("Collector shutdown error: %v", err) + // Stop conntrack aggregator if it was enabled + if conntrackAggregator != nil { + if err := conntrackAggregator.Stop(); err != nil { + log.Printf("Conntrack aggregator shutdown error: %v", err) } } diff --git a/internal/conntrack/types.go b/internal/conntrack/aggregator.go similarity index 100% rename from internal/conntrack/types.go rename to internal/conntrack/aggregator.go diff --git a/internal/conntrack/config.go b/internal/conntrack/config.go index a6936a5..2b186e5 100644 --- a/internal/conntrack/config.go +++ b/internal/conntrack/config.go @@ -18,20 +18,82 @@ import ( "time" ) -// Config holds configuration for the conntrack aggregator +// Config holds configuration for the conntrack aggregator. +// +// The conntrack aggregator uses a default configuration system. The exporter currently +// uses default values for all conntrack settings. Custom configuration is supported +// programmatically via this Config struct and NewZoneMarkAggregatorWithConfig() function, +// but is not currently exposed at the top-level exporter interface. +// +// To use custom configuration, you would need to modify the exporter code to pass a +// custom Config struct instead of using NewZoneMarkAggregator() which always uses defaults. type Config struct { - EventChanSize int - EventWorkerCount int - DestroyFlushIntvl time.Duration - DestroyDeltaCap int + // EventChanSize is the buffer size for the bounded event channel that receives + // conntrack events from the netlink listener (default: 524288 = 512KB). + // This channel acts as a buffer between the raw netlink events and the event + // workers. When full, events are dropped and missedEvents counter is incremented. + EventChanSize int + + // EventWorkerCount is the number of goroutines that process events from the + // bounded event channel (default: 100). Each worker + // processes NEW/DESTROY/UPDATE events. + EventWorkerCount int + + // DestroyFlushIntvl is the base interval for flushing aggregated DESTROY deltas + // into the main counts map (default: 50ms). The actual flushing is adaptive: + // - >500K events/sec: 50ms intervals + // - >100K events/sec: 100ms intervals + // - >10K events/sec: 200ms intervals + // - Normal: uses this configured interval + // Faster flushing reduces latency but uses more CPU. + DestroyFlushIntvl time.Duration + + // DestroyDeltaCap is the maximum number of DESTROY deltas that can be accumulated + // before dropping events (default: 200000). DESTROY events are aggregated into + // deltas to handle massive bursts without OOM. When this cap is reached, new + // DESTROY events are dropped and missedEvents counter is incremented. + DestroyDeltaCap int + + // DropsWarnThreshold is the threshold for triggering health check actions + // (default: 10000). When missed events exceed this threshold, the health + // monitor will attempt to restart the conntrack listener to recover from + // potential connection issues. DropsWarnThreshold int64 - ReadBufferSize int - WriteBufferSize int - HealthCheckIntvl time.Duration - GracefulTimeout time.Duration + + // ReadBufferSize is the socket read buffer size for the conntrack netlink + // connection (default: 67108864 = 64MB). This affects how much data can be + // buffered at the kernel level before being read by the application. + ReadBufferSize int + + // WriteBufferSize is the socket write buffer size for the conntrack netlink + // connection (default: 67108864 = 64MB). This affects how much data can be + // buffered for writes to the kernel. + WriteBufferSize int + + // HealthCheckIntvl is the interval for periodic health monitoring (default: 5m). + // The health monitor checks missed events count and restarts the listener + // if drops exceed DropsWarnThreshold. + HealthCheckIntvl time.Duration + + // GracefulTimeout is the maximum time to wait for graceful shutdown of all + // goroutines during Stop() (default: 30s). This includes waiting for event + // workers to finish, flushing remaining deltas, and closing connections. + GracefulTimeout time.Duration } -// DefaultConfig returns default configuration values +// DefaultConfig returns default configuration values suitable for most production environments. +// This is used internally by NewZoneMarkAggregator(). +// +// Default values: +// - EventChanSize: 524288 (512KB) +// - EventWorkerCount: 100 +// - DestroyFlushIntvl: 50ms +// - DestroyDeltaCap: 200000 +// - DropsWarnThreshold: 10000 +// - ReadBufferSize: 67108864 (64MB) +// - WriteBufferSize: 67108864 (64MB) +// - HealthCheckIntvl: 5m +// - GracefulTimeout: 30s func DefaultConfig() *Config { return &Config{ EventChanSize: 512 * 1024, @@ -45,5 +107,3 @@ func DefaultConfig() *Config { GracefulTimeout: 30 * time.Second, } } - -// LoadConfig loads conntrack configuration from environment variables diff --git a/internal/conntrack/exporter.go b/internal/conntrack/exporter.go new file mode 100644 index 0000000..57dc3ee --- /dev/null +++ b/internal/conntrack/exporter.go @@ -0,0 +1,89 @@ +// Copyright 2017 DigitalOcean. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package conntrack + +import ( + "fmt" + "log" + + "github.com/prometheus/client_golang/prometheus" +) + +const ( + namespace = "openvswitch" +) + +// Collector is a Prometheus collector for conntrack entries by zone and mark. +type Collector struct { + desc *prometheus.Desc + agg MarkZoneAggregator +} + +// Compile-time assertion that *Collector implements prometheus.Collector +var _ prometheus.Collector = (*Collector)(nil) + +// NewCollector creates a new Prometheus collector for conntrack metrics. +// It starts the aggregator and returns a collector that can be registered +// with Prometheus. The aggregator must be stopped separately via Stop(). +func NewCollector() (prometheus.Collector, MarkZoneAggregator, error) { + agg, err := NewZoneMarkAggregator() + if err != nil { + return nil, nil, fmt.Errorf("failed to create zone/mark aggregator: %w", err) + } + + if err := agg.Start(); err != nil { + return nil, nil, fmt.Errorf("failed to start zone/mark aggregator: %w", err) + } + + return &Collector{ + desc: prometheus.NewDesc( + prometheus.BuildFQName(namespace, "conntrack", "entries"), + "Number of conntrack entries by zone and mark", + []string{"zone", "mark"}, + nil, + ), + agg: agg, + }, agg, nil +} + +// Describe implements prometheus.Collector. +func (c *Collector) Describe(ch chan<- *prometheus.Desc) { + ch <- c.desc +} + +// Collect implements prometheus.Collector. +func (c *Collector) Collect(ch chan<- prometheus.Metric) { + if c.agg == nil { + log.Printf("No aggregator available, emitting zero metric") + ch <- prometheus.MustNewConstMetric( + c.desc, + prometheus.GaugeValue, + 0, + "unknown", "unknown", + ) + return + } + + snapshot := c.agg.Snapshot() + for key, count := range snapshot { + ch <- prometheus.MustNewConstMetric( + c.desc, + prometheus.GaugeValue, + float64(count), + fmt.Sprintf("%d", key.Zone), + fmt.Sprintf("%d", key.Mark), + ) + } +} diff --git a/internal/ovsexporter/conntrack_mock_test.go b/internal/conntrack/exporter_mock_test.go similarity index 61% rename from internal/ovsexporter/conntrack_mock_test.go rename to internal/conntrack/exporter_mock_test.go index 2ce5bba..69c48c3 100644 --- a/internal/ovsexporter/conntrack_mock_test.go +++ b/internal/conntrack/exporter_mock_test.go @@ -1,41 +1,89 @@ //go:build !linux // +build !linux -package ovsexporter +package conntrack import ( "sync" "testing" "time" - "github.com/digitalocean/openvswitch_exporter/internal/conntrack" + "bytes" + "io" + "net/http" + "net/http/httptest" + + "github.com/prometheus/client_golang/prometheus" + "github.com/prometheus/client_golang/prometheus/promhttp" + "github.com/prometheus/prometheus/util/promlint" ) -func TestConntrackCollector(t *testing.T) { +func testCollector(t *testing.T, collector prometheus.Collector) []byte { + t.Helper() + + // Set up and gather metrics from a single pass. + reg := prometheus.NewPedanticRegistry() + if err := reg.Register(collector); err != nil { + t.Fatalf("failed to register Prometheus collector: %v", err) + } + + srv := httptest.NewServer(promhttp.HandlerFor(reg, promhttp.HandlerOpts{})) + defer srv.Close() + + resp, err := http.Get(srv.URL) + if err != nil { + t.Fatalf("failed to GET data from prometheus: %v", err) + } + defer resp.Body.Close() + + buf, err := io.ReadAll(resp.Body) + if err != nil { + t.Fatalf("failed to read server response: %v", err) + } + + // Check for lint cleanliness of metrics. + problems, err := promlint.New(bytes.NewReader(buf)).Lint() + if err != nil { + t.Fatalf("failed to lint metrics: %v", err) + } + + if len(problems) > 0 { + for _, p := range problems { + t.Logf("\t%s: %s", p.Metric, p.Text) + } + + t.Fatal("failing test due to lint problems") + } + + // Metrics check out, return to caller for further tests. + return buf +} + +func TestCollector(t *testing.T) { tests := []struct { name string - setup func() (conntrack.MarkZoneAggregator, error) - operations []func(conntrack.MarkZoneAggregator) error - validate func(*testing.T, *conntrackCollector) + setup func() (MarkZoneAggregator, error) + operations []func(MarkZoneAggregator) error + validate func(*testing.T, *Collector) wantErr bool description string }{ { name: "basic_functionality", - setup: func() (conntrack.MarkZoneAggregator, error) { - return conntrack.NewMockZoneMarkAggregator() + setup: func() (MarkZoneAggregator, error) { + return NewMockZoneMarkAggregator() }, - operations: []func(conntrack.MarkZoneAggregator) error{ - func(agg conntrack.MarkZoneAggregator) error { + operations: []func(MarkZoneAggregator) error{ + func(agg MarkZoneAggregator) error { // Add test data - mockAgg := agg.(*conntrack.MockZoneMarkAggregator) + mockAgg := agg.(*MockZoneMarkAggregator) mockAgg.SetCount(0, 100, 1500) mockAgg.SetCount(0, 200, 2500) mockAgg.SetCount(1, 300, 3500) return nil }, }, - validate: func(t *testing.T, collector *conntrackCollector) { + validate: func(t *testing.T, collector *Collector) { if collector == nil { t.Fatal("expected non-nil collector") } @@ -48,11 +96,11 @@ func TestConntrackCollector(t *testing.T) { }, { name: "nil_aggregator", - setup: func() (conntrack.MarkZoneAggregator, error) { + setup: func() (MarkZoneAggregator, error) { return nil, nil }, - operations: []func(conntrack.MarkZoneAggregator) error{}, - validate: func(t *testing.T, collector *conntrackCollector) { + operations: []func(MarkZoneAggregator) error{}, + validate: func(t *testing.T, collector *Collector) { if collector == nil { t.Fatal("expected non-nil collector") } @@ -65,11 +113,11 @@ func TestConntrackCollector(t *testing.T) { }, { name: "empty_aggregator", - setup: func() (conntrack.MarkZoneAggregator, error) { - return conntrack.NewMockZoneMarkAggregator() + setup: func() (MarkZoneAggregator, error) { + return NewMockZoneMarkAggregator() }, - operations: []func(conntrack.MarkZoneAggregator) error{}, - validate: func(t *testing.T, collector *conntrackCollector) { + operations: []func(MarkZoneAggregator) error{}, + validate: func(t *testing.T, collector *Collector) { if collector == nil { t.Fatal("expected non-nil collector") } @@ -86,13 +134,13 @@ func TestConntrackCollector(t *testing.T) { }, { name: "large_dataset", - setup: func() (conntrack.MarkZoneAggregator, error) { - return conntrack.NewMockZoneMarkAggregator() + setup: func() (MarkZoneAggregator, error) { + return NewMockZoneMarkAggregator() }, - operations: []func(conntrack.MarkZoneAggregator) error{ - func(agg conntrack.MarkZoneAggregator) error { + operations: []func(MarkZoneAggregator) error{ + func(agg MarkZoneAggregator) error { // Add large dataset - simulate 10K entries across multiple zones - mockAgg := agg.(*conntrack.MockZoneMarkAggregator) + mockAgg := agg.(*MockZoneMarkAggregator) for zone := uint16(0); zone < 10; zone++ { for mark := uint32(0); mark < 1000; mark++ { mockAgg.SetCount(zone, mark, int(uint32(zone)*1000+mark)) @@ -101,7 +149,7 @@ func TestConntrackCollector(t *testing.T) { return nil }, }, - validate: func(t *testing.T, collector *conntrackCollector) { + validate: func(t *testing.T, collector *Collector) { snapshot := collector.agg.Snapshot() if len(snapshot) != 10000 { t.Errorf("expected 10000 entries, got %d", len(snapshot)) @@ -112,12 +160,12 @@ func TestConntrackCollector(t *testing.T) { }, { name: "edge_cases", - setup: func() (conntrack.MarkZoneAggregator, error) { - return conntrack.NewMockZoneMarkAggregator() + setup: func() (MarkZoneAggregator, error) { + return NewMockZoneMarkAggregator() }, - operations: []func(conntrack.MarkZoneAggregator) error{ - func(agg conntrack.MarkZoneAggregator) error { - mockAgg := agg.(*conntrack.MockZoneMarkAggregator) + operations: []func(MarkZoneAggregator) error{ + func(agg MarkZoneAggregator) error { + mockAgg := agg.(*MockZoneMarkAggregator) // Test zero values mockAgg.SetCount(0, 0, 0) // Test maximum values @@ -127,7 +175,7 @@ func TestConntrackCollector(t *testing.T) { return nil }, }, - validate: func(t *testing.T, collector *conntrackCollector) { + validate: func(t *testing.T, collector *Collector) { snapshot := collector.agg.Snapshot() // Should have 2 entries (zero and negative counts should be filtered out) if len(snapshot) != 2 { @@ -139,15 +187,15 @@ func TestConntrackCollector(t *testing.T) { }, { name: "concurrent_operations", - setup: func() (conntrack.MarkZoneAggregator, error) { return conntrack.NewMockZoneMarkAggregator() }, - operations: []func(conntrack.MarkZoneAggregator) error{ - func(agg conntrack.MarkZoneAggregator) error { - mockAgg := agg.(*conntrack.MockZoneMarkAggregator) + setup: func() (MarkZoneAggregator, error) { return NewMockZoneMarkAggregator() }, + operations: []func(MarkZoneAggregator) error{ + func(agg MarkZoneAggregator) error { + mockAgg := agg.(*MockZoneMarkAggregator) mockAgg.SetCount(0, 100, 100) return nil }, }, - validate: func(t *testing.T, collector *conntrackCollector) { + validate: func(t *testing.T, collector *Collector) { var wg sync.WaitGroup wg.Add(10) for i := 0; i < 10; i++ { @@ -185,9 +233,9 @@ func TestConntrackCollector(t *testing.T) { } } - collector := newConntrackCollector(agg) + collector := &Collector{agg: agg} if tt.validate != nil { - tt.validate(t, collector.(*conntrackCollector)) + tt.validate(t, collector) } // Test the collector with Prometheus @@ -199,30 +247,30 @@ func TestConntrackCollector(t *testing.T) { func TestMockAggregatorOperations(t *testing.T) { tests := []struct { name string - operations []func(*conntrack.MockZoneMarkAggregator) - validate func(*testing.T, *conntrack.MockZoneMarkAggregator) + operations []func(*MockZoneMarkAggregator) + validate func(*testing.T, *MockZoneMarkAggregator) description string }{ { name: "add_remove_entries", - operations: []func(*conntrack.MockZoneMarkAggregator){ - func(agg *conntrack.MockZoneMarkAggregator) { + operations: []func(*MockZoneMarkAggregator){ + func(agg *MockZoneMarkAggregator) { agg.AddEntry(0, 100) agg.AddEntry(0, 100) agg.AddEntry(1, 200) }, - func(agg *conntrack.MockZoneMarkAggregator) { + func(agg *MockZoneMarkAggregator) { agg.RemoveEntry(0, 100) }, }, - validate: func(t *testing.T, agg *conntrack.MockZoneMarkAggregator) { + validate: func(t *testing.T, agg *MockZoneMarkAggregator) { snapshot := agg.Snapshot() if len(snapshot) != 2 { t.Errorf("expected 2 entries, got %d", len(snapshot)) } // Check specific counts - key1 := conntrack.ZoneMarkKey{Zone: 0, Mark: 100} - key2 := conntrack.ZoneMarkKey{Zone: 1, Mark: 200} + key1 := ZoneMarkKey{Zone: 0, Mark: 100} + key2 := ZoneMarkKey{Zone: 1, Mark: 200} if snapshot[key1] != 1 { t.Errorf("expected count 1 for key %v, got %d", key1, snapshot[key1]) } @@ -234,20 +282,20 @@ func TestMockAggregatorOperations(t *testing.T) { }, { name: "set_count_operations", - operations: []func(*conntrack.MockZoneMarkAggregator){ - func(agg *conntrack.MockZoneMarkAggregator) { + operations: []func(*MockZoneMarkAggregator){ + func(agg *MockZoneMarkAggregator) { agg.SetCount(0, 100, 1500) agg.SetCount(1, 200, 2500) agg.SetCount(2, 300, 0) // Should be filtered out }, }, - validate: func(t *testing.T, agg *conntrack.MockZoneMarkAggregator) { + validate: func(t *testing.T, agg *MockZoneMarkAggregator) { snapshot := agg.Snapshot() if len(snapshot) != 2 { t.Errorf("expected 2 entries, got %d", len(snapshot)) } - key1 := conntrack.ZoneMarkKey{Zone: 0, Mark: 100} - key2 := conntrack.ZoneMarkKey{Zone: 1, Mark: 200} + key1 := ZoneMarkKey{Zone: 0, Mark: 100} + key2 := ZoneMarkKey{Zone: 1, Mark: 200} if snapshot[key1] != 1500 { t.Errorf("expected count 1500 for key %v, got %d", key1, snapshot[key1]) } @@ -259,16 +307,16 @@ func TestMockAggregatorOperations(t *testing.T) { }, { name: "clear_operations", - operations: []func(*conntrack.MockZoneMarkAggregator){ - func(agg *conntrack.MockZoneMarkAggregator) { + operations: []func(*MockZoneMarkAggregator){ + func(agg *MockZoneMarkAggregator) { agg.SetCount(0, 100, 1500) agg.SetCount(1, 200, 2500) }, - func(agg *conntrack.MockZoneMarkAggregator) { + func(agg *MockZoneMarkAggregator) { agg.Clear() }, }, - validate: func(t *testing.T, agg *conntrack.MockZoneMarkAggregator) { + validate: func(t *testing.T, agg *MockZoneMarkAggregator) { snapshot := agg.Snapshot() if len(snapshot) != 0 { t.Errorf("expected empty snapshot after clear, got %d entries", len(snapshot)) @@ -278,12 +326,12 @@ func TestMockAggregatorOperations(t *testing.T) { }, { name: "health_metrics", - operations: []func(*conntrack.MockZoneMarkAggregator){ - func(agg *conntrack.MockZoneMarkAggregator) { + operations: []func(*MockZoneMarkAggregator){ + func(agg *MockZoneMarkAggregator) { agg.SetCount(0, 100, 1000) }, }, - validate: func(t *testing.T, agg *conntrack.MockZoneMarkAggregator) { + validate: func(t *testing.T, agg *MockZoneMarkAggregator) { if !agg.IsHealthy() { t.Error("expected healthy aggregator") } @@ -304,7 +352,7 @@ func TestMockAggregatorOperations(t *testing.T) { for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { - agg, err := conntrack.NewMockZoneMarkAggregator() + agg, err := NewMockZoneMarkAggregator() if err != nil { t.Fatalf("Failed to create mock aggregator: %v", err) } @@ -323,36 +371,36 @@ func TestMockAggregatorOperations(t *testing.T) { } } -func TestConntrackCollectorIntegration(t *testing.T) { +func TestCollectorIntegration(t *testing.T) { tests := []struct { name string - setup func() (*conntrack.MockZoneMarkAggregator, error) - operations []func(*conntrack.MockZoneMarkAggregator) - validate func(*testing.T, *conntrackCollector) + setup func() (*MockZoneMarkAggregator, error) + operations []func(*MockZoneMarkAggregator) + validate func(*testing.T, *Collector) description string }{ { name: "full_lifecycle", - setup: func() (*conntrack.MockZoneMarkAggregator, error) { - return conntrack.NewMockZoneMarkAggregator() + setup: func() (*MockZoneMarkAggregator, error) { + return NewMockZoneMarkAggregator() }, - operations: []func(*conntrack.MockZoneMarkAggregator){ - func(agg *conntrack.MockZoneMarkAggregator) { + operations: []func(*MockZoneMarkAggregator){ + func(agg *MockZoneMarkAggregator) { // Start the aggregator agg.Start() }, - func(agg *conntrack.MockZoneMarkAggregator) { + func(agg *MockZoneMarkAggregator) { // Add some data agg.SetCount(0, 100, 1500) agg.SetCount(1, 200, 2500) }, - func(agg *conntrack.MockZoneMarkAggregator) { + func(agg *MockZoneMarkAggregator) { // Modify data agg.AddEntry(0, 100) agg.RemoveEntry(1, 200) }, }, - validate: func(t *testing.T, collector *conntrackCollector) { + validate: func(t *testing.T, collector *Collector) { // Test that collector can handle the aggregator snapshot := collector.agg.Snapshot() if snapshot == nil { @@ -367,18 +415,18 @@ func TestConntrackCollectorIntegration(t *testing.T) { }, { name: "stress_test", - setup: func() (*conntrack.MockZoneMarkAggregator, error) { - return conntrack.NewMockZoneMarkAggregator() + setup: func() (*MockZoneMarkAggregator, error) { + return NewMockZoneMarkAggregator() }, - operations: []func(*conntrack.MockZoneMarkAggregator){ - func(agg *conntrack.MockZoneMarkAggregator) { + operations: []func(*MockZoneMarkAggregator){ + func(agg *MockZoneMarkAggregator) { // Add many entries rapidly for i := 0; i < 1000; i++ { agg.SetCount(uint16(i%10), uint32(i), i) } }, }, - validate: func(t *testing.T, collector *conntrackCollector) { + validate: func(t *testing.T, collector *Collector) { snapshot := collector.agg.Snapshot() if len(snapshot) != 1000 { t.Errorf("expected 1000 entries, got %d", len(snapshot)) @@ -402,9 +450,9 @@ func TestConntrackCollectorIntegration(t *testing.T) { time.Sleep(1 * time.Millisecond) } - collector := newConntrackCollector(agg) + collector := &Collector{agg: agg} if tt.validate != nil { - tt.validate(t, collector.(*conntrackCollector)) + tt.validate(t, collector) } // Test with Prometheus diff --git a/internal/ovsexporter/conntrack_test.go b/internal/conntrack/exporter_test.go similarity index 63% rename from internal/ovsexporter/conntrack_test.go rename to internal/conntrack/exporter_test.go index 914934e..af433b1 100644 --- a/internal/ovsexporter/conntrack_test.go +++ b/internal/conntrack/exporter_test.go @@ -4,35 +4,83 @@ // Copyright 2018-2021 DigitalOcean. // SPDX-License-Identifier: Apache-2.0 -package ovsexporter +package conntrack import ( "sync" "testing" "time" - "github.com/digitalocean/openvswitch_exporter/internal/conntrack" + "bytes" + "io" + "net/http" + "net/http/httptest" + + "github.com/prometheus/client_golang/prometheus" + "github.com/prometheus/client_golang/prometheus/promhttp" + "github.com/prometheus/prometheus/util/promlint" ) -func TestConntrackCollector(t *testing.T) { +func testCollector(t *testing.T, collector prometheus.Collector) []byte { + t.Helper() + + // Set up and gather metrics from a single pass. + reg := prometheus.NewPedanticRegistry() + if err := reg.Register(collector); err != nil { + t.Fatalf("failed to register Prometheus collector: %v", err) + } + + srv := httptest.NewServer(promhttp.HandlerFor(reg, promhttp.HandlerOpts{})) + defer srv.Close() + + resp, err := http.Get(srv.URL) + if err != nil { + t.Fatalf("failed to GET data from prometheus: %v", err) + } + defer resp.Body.Close() + + buf, err := io.ReadAll(resp.Body) + if err != nil { + t.Fatalf("failed to read server response: %v", err) + } + + // Check for lint cleanliness of metrics. + problems, err := promlint.New(bytes.NewReader(buf)).Lint() + if err != nil { + t.Fatalf("failed to lint metrics: %v", err) + } + + if len(problems) > 0 { + for _, p := range problems { + t.Logf("\t%s: %s", p.Metric, p.Text) + } + + t.Fatal("failing test due to lint problems") + } + + // Metrics check out, return to caller for further tests. + return buf +} + +func TestCollector(t *testing.T) { tests := []struct { name string - setup func() (conntrack.MarkZoneAggregator, error) - operations []func(conntrack.MarkZoneAggregator) error - validate func(*testing.T, *conntrackCollector) + setup func() (MarkZoneAggregator, error) + operations []func(MarkZoneAggregator) error + validate func(*testing.T, *Collector) wantErr bool skipOnError bool description string }{ { name: "real_aggregator_creation", - setup: func() (conntrack.MarkZoneAggregator, error) { - return conntrack.NewZoneMarkAggregator() + setup: func() (MarkZoneAggregator, error) { + return NewZoneMarkAggregator() }, - operations: []func(conntrack.MarkZoneAggregator) error{ - func(agg conntrack.MarkZoneAggregator) error { return agg.Start() }, + operations: []func(MarkZoneAggregator) error{ + func(agg MarkZoneAggregator) error { return agg.Start() }, }, - validate: func(t *testing.T, collector *conntrackCollector) { + validate: func(t *testing.T, collector *Collector) { if collector == nil { t.Fatal("expected non-nil collector") } @@ -49,11 +97,11 @@ func TestConntrackCollector(t *testing.T) { }, { name: "nil_aggregator_handling", - setup: func() (conntrack.MarkZoneAggregator, error) { + setup: func() (MarkZoneAggregator, error) { return nil, nil }, - operations: []func(conntrack.MarkZoneAggregator) error{}, - validate: func(t *testing.T, collector *conntrackCollector) { + operations: []func(MarkZoneAggregator) error{}, + validate: func(t *testing.T, collector *Collector) { if collector == nil { t.Fatal("expected non-nil collector") } @@ -69,18 +117,18 @@ func TestConntrackCollector(t *testing.T) { }, { name: "real_data_processing", - setup: func() (conntrack.MarkZoneAggregator, error) { - return conntrack.NewZoneMarkAggregator() + setup: func() (MarkZoneAggregator, error) { + return NewZoneMarkAggregator() }, - operations: []func(conntrack.MarkZoneAggregator) error{ - func(agg conntrack.MarkZoneAggregator) error { return agg.Start() }, - func(agg conntrack.MarkZoneAggregator) error { + operations: []func(MarkZoneAggregator) error{ + func(agg MarkZoneAggregator) error { return agg.Start() }, + func(agg MarkZoneAggregator) error { // Let it run briefly to potentially collect real data time.Sleep(50 * time.Millisecond) return nil }, }, - validate: func(t *testing.T, collector *conntrackCollector) { + validate: func(t *testing.T, collector *Collector) { snapshot := collector.agg.Snapshot() if snapshot == nil { t.Fatal("expected non-nil snapshot") @@ -94,9 +142,9 @@ func TestConntrackCollector(t *testing.T) { }, { name: "concurrent_collection", - setup: func() (conntrack.MarkZoneAggregator, error) { return conntrack.NewZoneMarkAggregator() }, - operations: []func(conntrack.MarkZoneAggregator) error{func(agg conntrack.MarkZoneAggregator) error { return agg.Start() }}, - validate: func(t *testing.T, collector *conntrackCollector) { + setup: func() (MarkZoneAggregator, error) { return NewZoneMarkAggregator() }, + operations: []func(MarkZoneAggregator) error{func(agg MarkZoneAggregator) error { return agg.Start() }}, + validate: func(t *testing.T, collector *Collector) { var wg sync.WaitGroup wg.Add(10) for i := 0; i < 10; i++ { @@ -116,23 +164,23 @@ func TestConntrackCollector(t *testing.T) { }, { name: "lifecycle_management", - setup: func() (conntrack.MarkZoneAggregator, error) { - return conntrack.NewZoneMarkAggregator() + setup: func() (MarkZoneAggregator, error) { + return NewZoneMarkAggregator() }, - operations: []func(conntrack.MarkZoneAggregator) error{ - func(agg conntrack.MarkZoneAggregator) error { return agg.Start() }, - func(agg conntrack.MarkZoneAggregator) error { + operations: []func(MarkZoneAggregator) error{ + func(agg MarkZoneAggregator) error { return agg.Start() }, + func(agg MarkZoneAggregator) error { // Let it run briefly time.Sleep(10 * time.Millisecond) return nil }, - func(agg conntrack.MarkZoneAggregator) error { + func(agg MarkZoneAggregator) error { // Stop the aggregator agg.Stop() return nil }, }, - validate: func(t *testing.T, collector *conntrackCollector) { + validate: func(t *testing.T, collector *Collector) { // Snapshot should still work after stop snapshot := collector.agg.Snapshot() if snapshot == nil { @@ -152,7 +200,7 @@ func TestConntrackCollector(t *testing.T) { if tt.skipOnError { t.Logf("Skipping test due to expected failure: %v", err) // Test with nil aggregator to ensure collector handles gracefully - collector := newConntrackCollector(nil) + collector := &Collector{agg: nil} testCollector(t, collector) return } @@ -169,7 +217,7 @@ func TestConntrackCollector(t *testing.T) { if tt.skipOnError { t.Logf("Skipping test due to operation %d failure: %v", i, err) // Test with nil aggregator as fallback - collector := newConntrackCollector(nil) + collector := &Collector{agg: nil} testCollector(t, collector) return } @@ -178,9 +226,9 @@ func TestConntrackCollector(t *testing.T) { } } - collector := newConntrackCollector(agg) + collector := &Collector{agg: agg} if tt.validate != nil { - tt.validate(t, collector.(*conntrackCollector)) + tt.validate(t, collector) } // Test the collector with Prometheus @@ -189,7 +237,7 @@ func TestConntrackCollector(t *testing.T) { } } -func TestConntrackCollectorWithRealData(t *testing.T) { +func TestCollectorWithRealData(t *testing.T) { if testing.Short() { t.Skip("Skipping conntrack test in short mode") } @@ -197,13 +245,13 @@ func TestConntrackCollectorWithRealData(t *testing.T) { tests := []struct { name string duration time.Duration - validate func(*testing.T, *conntrackCollector) + validate func(*testing.T, *Collector) description string }{ { name: "short_duration", duration: 100 * time.Millisecond, - validate: func(t *testing.T, collector *conntrackCollector) { + validate: func(t *testing.T, collector *Collector) { snapshot := collector.agg.Snapshot() if snapshot == nil { t.Fatal("expected non-nil snapshot") @@ -215,7 +263,7 @@ func TestConntrackCollectorWithRealData(t *testing.T) { { name: "medium_duration", duration: 500 * time.Millisecond, - validate: func(t *testing.T, collector *conntrackCollector) { + validate: func(t *testing.T, collector *Collector) { snapshot := collector.agg.Snapshot() if snapshot == nil { t.Fatal("expected non-nil snapshot") @@ -229,7 +277,7 @@ func TestConntrackCollectorWithRealData(t *testing.T) { for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { // Test with real conntrack data if available - agg, err := conntrack.NewZoneMarkAggregator() + agg, err := NewZoneMarkAggregator() if err != nil { t.Skipf("Skipping real data test: %v", err) } @@ -244,9 +292,9 @@ func TestConntrackCollectorWithRealData(t *testing.T) { // Wait for data to accumulate time.Sleep(tt.duration) - collector := newConntrackCollector(agg) + collector := &Collector{agg: agg} if tt.validate != nil { - tt.validate(t, collector.(*conntrackCollector)) + tt.validate(t, collector) } // Test the collector with Prometheus @@ -255,34 +303,34 @@ func TestConntrackCollectorWithRealData(t *testing.T) { } } -func TestConntrackCollectorEdgeCases(t *testing.T) { +func TestCollectorEdgeCases(t *testing.T) { tests := []struct { name string - setup func() (conntrack.MarkZoneAggregator, error) - operations []func(conntrack.MarkZoneAggregator) error - validate func(*testing.T, *conntrackCollector) + setup func() (MarkZoneAggregator, error) + operations []func(MarkZoneAggregator) error + validate func(*testing.T, *Collector) wantErr bool skipOnError bool description string }{ { name: "start_stop_multiple_times", - setup: func() (conntrack.MarkZoneAggregator, error) { - return conntrack.NewZoneMarkAggregator() + setup: func() (MarkZoneAggregator, error) { + return NewZoneMarkAggregator() }, - operations: []func(conntrack.MarkZoneAggregator) error{ - func(agg conntrack.MarkZoneAggregator) error { return agg.Start() }, - func(agg conntrack.MarkZoneAggregator) error { + operations: []func(MarkZoneAggregator) error{ + func(agg MarkZoneAggregator) error { return agg.Start() }, + func(agg MarkZoneAggregator) error { time.Sleep(10 * time.Millisecond) agg.Stop() return nil }, - func(agg conntrack.MarkZoneAggregator) error { + func(agg MarkZoneAggregator) error { // Try to start again after stop return agg.Start() }, }, - validate: func(t *testing.T, collector *conntrackCollector) { + validate: func(t *testing.T, collector *Collector) { // Should handle restart gracefully snapshot := collector.agg.Snapshot() if snapshot == nil { @@ -295,11 +343,11 @@ func TestConntrackCollectorEdgeCases(t *testing.T) { }, { name: "rapid_start_stop_cycles", - setup: func() (conntrack.MarkZoneAggregator, error) { - return conntrack.NewZoneMarkAggregator() + setup: func() (MarkZoneAggregator, error) { + return NewZoneMarkAggregator() }, - operations: []func(conntrack.MarkZoneAggregator) error{ - func(agg conntrack.MarkZoneAggregator) error { + operations: []func(MarkZoneAggregator) error{ + func(agg MarkZoneAggregator) error { // Rapid start/stop cycles for i := 0; i < 5; i++ { if err := agg.Start(); err != nil { @@ -312,7 +360,7 @@ func TestConntrackCollectorEdgeCases(t *testing.T) { return nil }, }, - validate: func(t *testing.T, collector *conntrackCollector) { + validate: func(t *testing.T, collector *Collector) { // Should not panic or leak resources snapshot := collector.agg.Snapshot() if snapshot == nil { @@ -352,9 +400,9 @@ func TestConntrackCollectorEdgeCases(t *testing.T) { } } - collector := newConntrackCollector(agg) + collector := &Collector{agg: agg} if tt.validate != nil { - tt.validate(t, collector.(*conntrackCollector)) + tt.validate(t, collector) } // Test the collector with Prometheus diff --git a/internal/ovsexporter/conntrack.go b/internal/ovsexporter/conntrack.go deleted file mode 100644 index 41f43af..0000000 --- a/internal/ovsexporter/conntrack.go +++ /dev/null @@ -1,59 +0,0 @@ -package ovsexporter - -import ( - "fmt" - "log" - - "github.com/digitalocean/openvswitch_exporter/internal/conntrack" - "github.com/prometheus/client_golang/prometheus" -) - -type conntrackCollector struct { - desc *prometheus.Desc - agg conntrack.MarkZoneAggregator -} - -// ConntrackCollectorWithAggAccessor wraps the existing collector with access to the aggregator snapshot -type ConntrackCollectorWithAggAccessor struct { - *conntrackCollector -} - -func newConntrackCollector(agg conntrack.MarkZoneAggregator) prometheus.Collector { - return &conntrackCollector{ - desc: prometheus.NewDesc( - prometheus.BuildFQName(namespace, "conntrack", "entries"), - "Number of conntrack entries by zone and mark", - []string{"zone", "mark"}, - nil, - ), - agg: agg, - } -} - -func (c *conntrackCollector) Describe(ch chan<- *prometheus.Desc) { - ch <- c.desc -} - -func (c *conntrackCollector) Collect(ch chan<- prometheus.Metric) { - if c.agg == nil { - log.Printf("No aggregator available, emitting zero metric") - ch <- prometheus.MustNewConstMetric( - c.desc, - prometheus.GaugeValue, - 0, - "unknown", "unknown", - ) - return - } - - snapshot := c.agg.Snapshot() - for key, count := range snapshot { - ch <- prometheus.MustNewConstMetric( - c.desc, - prometheus.GaugeValue, - float64(count), - fmt.Sprintf("%d", key.Zone), - fmt.Sprintf("%d", key.Mark), - ) - } -} diff --git a/internal/ovsexporter/ovsexporter.go b/internal/ovsexporter/ovsexporter.go index d796e2f..c6d7864 100644 --- a/internal/ovsexporter/ovsexporter.go +++ b/internal/ovsexporter/ovsexporter.go @@ -6,11 +6,9 @@ package ovsexporter import ( - "log" "sync" "github.com/digitalocean/go-openvswitch/ovsnl" - "github.com/digitalocean/openvswitch_exporter/internal/conntrack" "github.com/prometheus/client_golang/prometheus" ) @@ -20,9 +18,8 @@ const ( // A collector aggregates Open vSwitch Prometheus collectors. type collector struct { - mu sync.Mutex - cs []prometheus.Collector - conntrackAggregator conntrack.MarkZoneAggregator + mu sync.Mutex + cs []prometheus.Collector } // Make sure collector implements prometheus.Collector @@ -34,19 +31,7 @@ func New(c *ovsnl.Client) prometheus.Collector { collectors := []prometheus.Collector{ newDatapathCollector(c.Datapath.List), } - - // Create the aggregator - agg, err := conntrack.NewZoneMarkAggregator() - if err != nil { - log.Printf("Warning: Failed to create zone/mark aggregator: %v", err) - return &collector{cs: collectors} - } - if err := agg.Start(); err != nil { - log.Printf("Warning: Failed to start zone/mark aggregator: %v", err) - return &collector{cs: collectors} - } - collectors = append(collectors, newConntrackCollector(agg)) - return &collector{cs: collectors, conntrackAggregator: agg} + return &collector{cs: collectors} } // Describe implements prometheus.Collector. @@ -68,19 +53,3 @@ func (c *collector) Collect(ch chan<- prometheus.Metric) { cc.Collect(ch) } } - -// Close cleans up resources with graceful shutdown -func (c *collector) Close() error { - c.mu.Lock() - defer c.mu.Unlock() - - if c.conntrackAggregator != nil { - if err := c.conntrackAggregator.Stop(); err != nil { - log.Printf("Error stopping aggregator: %v", err) - return err - } - log.Printf("Collector closed gracefully") - } - - return nil -} From 8d4ce2d8a753fbfb5dc0d2e1f892fe300e490862 Mon Sep 17 00:00:00 2001 From: sgangopadhyay Date: Wed, 29 Oct 2025 22:37:24 +0530 Subject: [PATCH 15/19] fixing test cases --- internal/conntrack/exporter_test.go | 6 ------ 1 file changed, 6 deletions(-) diff --git a/internal/conntrack/exporter_test.go b/internal/conntrack/exporter_test.go index af433b1..7953682 100644 --- a/internal/conntrack/exporter_test.go +++ b/internal/conntrack/exporter_test.go @@ -199,9 +199,6 @@ func TestCollector(t *testing.T) { if (err != nil) != tt.wantErr { if tt.skipOnError { t.Logf("Skipping test due to expected failure: %v", err) - // Test with nil aggregator to ensure collector handles gracefully - collector := &Collector{agg: nil} - testCollector(t, collector) return } t.Errorf("setup error = %v, wantErr %v", err, tt.wantErr) @@ -216,9 +213,6 @@ func TestCollector(t *testing.T) { if err := op(agg); err != nil { if tt.skipOnError { t.Logf("Skipping test due to operation %d failure: %v", i, err) - // Test with nil aggregator as fallback - collector := &Collector{agg: nil} - testCollector(t, collector) return } t.Errorf("operation %d failed: %v", i, err) From f6ea39ebecee821131cc7d46b91b95de1ded04c3 Mon Sep 17 00:00:00 2001 From: sgangopadhyay Date: Wed, 29 Oct 2025 22:41:08 +0530 Subject: [PATCH 16/19] fixing test cases --- internal/conntrack/exporter_test.go | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/internal/conntrack/exporter_test.go b/internal/conntrack/exporter_test.go index 7953682..8e3bdf9 100644 --- a/internal/conntrack/exporter_test.go +++ b/internal/conntrack/exporter_test.go @@ -102,14 +102,10 @@ func TestCollector(t *testing.T) { }, operations: []func(MarkZoneAggregator) error{}, validate: func(t *testing.T, collector *Collector) { - if collector == nil { - t.Fatal("expected non-nil collector") - } + // Intentionally minimal: do not attempt Prometheus registration when agg is nil. if collector.agg != nil { - t.Error("expected nil aggregator") + t.Errorf("expected nil aggregator, got non-nil") } - // Test that collector handles nil aggregator gracefully - // This should not panic and should emit zero metrics }, wantErr: false, skipOnError: false, @@ -225,8 +221,12 @@ func TestCollector(t *testing.T) { tt.validate(t, collector) } - // Test the collector with Prometheus - testCollector(t, collector) + // Only run Prometheus registration when we have a non-nil aggregator. + if agg != nil { + testCollector(t, collector) + } else { + t.Log("Skipping Prometheus registration for nil aggregator case") + } }) } } From f67d1a80bce6ce3af125d3700f486b0aac73502a Mon Sep 17 00:00:00 2001 From: sgangopadhyay Date: Wed, 29 Oct 2025 22:42:01 +0530 Subject: [PATCH 17/19] removing unnecessary files --- CONNTRACK_CONFIG.md | 103 -------------------------------------------- 1 file changed, 103 deletions(-) delete mode 100644 CONNTRACK_CONFIG.md diff --git a/CONNTRACK_CONFIG.md b/CONNTRACK_CONFIG.md deleted file mode 100644 index 2a2672f..0000000 --- a/CONNTRACK_CONFIG.md +++ /dev/null @@ -1,103 +0,0 @@ -# Conntrack Configuration - -This document describes the configuration options available for the conntrack aggregator. - -## Environment Variables - -The conntrack aggregator can be configured using environment variables with the `CONNTRACK_` prefix: - -| Variable | Default | Description | -|----------|---------|-------------| -| `CONNTRACK_EVENT_CHAN_SIZE` | `524288` | Event channel buffer size (512KB) | -| `CONNTRACK_EVENT_WORKER_COUNT` | `100` | Number of event worker goroutines | -| `CONNTRACK_DESTROY_FLUSH_INTERVAL` | `50ms` | Interval for flushing destroy deltas | -| `CONNTRACK_DESTROY_DELTA_CAP` | `200000` | Maximum destroy delta entries | -| `CONNTRACK_DROPS_WARN_THRESHOLD` | `10000` | Threshold for missed events warning | -| `CONNTRACK_READ_BUFFER_SIZE` | `67108864` | Read buffer size (64MB) | -| `CONNTRACK_WRITE_BUFFER_SIZE` | `67108864` | Write buffer size (64MB) | -| `CONNTRACK_HEALTH_CHECK_INTERVAL` | `5m` | Health check interval | -| `CONNTRACK_GRACEFUL_TIMEOUT` | `30s` | Graceful shutdown timeout | - -## Usage Examples - -### Basic Configuration - -```bash -# Set custom buffer sizes -export CONNTRACK_EVENT_CHAN_SIZE=1048576 -export CONNTRACK_EVENT_WORKER_COUNT=200 - -# Run the exporter -./openvswitch_exporter -``` - -### High-Throughput Environment - -For environments with high conntrack event rates (>1M events/sec): - -```bash -export CONNTRACK_EVENT_CHAN_SIZE=1048576 # 1MB buffer -export CONNTRACK_EVENT_WORKER_COUNT=200 # More workers -export CONNTRACK_DESTROY_FLUSH_INTERVAL=25ms # Faster flushing -export CONNTRACK_DESTROY_DELTA_CAP=500000 # Larger delta cap -export CONNTRACK_READ_BUFFER_SIZE=134217728 # 128MB read buffer -export CONNTRACK_WRITE_BUFFER_SIZE=134217728 # 128MB write buffer -``` - -### Low-Resource Environment - -For environments with limited resources: - -```bash -export CONNTRACK_EVENT_CHAN_SIZE=65536 # 64KB buffer -export CONNTRACK_EVENT_WORKER_COUNT=50 # Fewer workers -export CONNTRACK_DESTROY_FLUSH_INTERVAL=100ms # Slower flushing -export CONNTRACK_DESTROY_DELTA_CAP=50000 # Smaller delta cap -export CONNTRACK_READ_BUFFER_SIZE=16777216 # 16MB read buffer -export CONNTRACK_WRITE_BUFFER_SIZE=16777216 # 16MB write buffer -``` - -### Development/Testing - -For development and testing: - -```bash -export CONNTRACK_GRACEFUL_TIMEOUT=5s # Faster shutdown -export CONNTRACK_HEALTH_CHECK_INTERVAL=1m # More frequent health checks -``` - -## Configuration Validation - -The configuration system includes validation: - -- **Positive values**: All numeric values must be positive -- **Valid durations**: Time values must be valid Go durations -- **Range checks**: Values are checked for reasonable ranges - -Invalid values will fall back to defaults with a warning logged. - -## Migration from Hardcoded Constants - -The following hardcoded constants have been replaced: - -| Old Constant | New Environment Variable | Default Value | -|--------------|-------------------------|---------------| -| `eventChanSize = 512 * 1024` | `CONNTRACK_EVENT_CHAN_SIZE` | `524288` | -| `eventWorkerCount = 100` | `CONNTRACK_EVENT_WORKER_COUNT` | `100` | -| `destroyFlushIntvl = 50ms` | `CONNTRACK_DESTROY_FLUSH_INTERVAL` | `50ms` | -| `destroyDeltaCap = 200000` | `CONNTRACK_DESTROY_DELTA_CAP` | `200000` | -| `dropsWarnThreshold = 10000` | `CONNTRACK_DROPS_WARN_THRESHOLD` | `10000` | -| Buffer sizes `64MB` | `CONNTRACK_READ_BUFFER_SIZE` / `WRITE_BUFFER_SIZE` | `67108864` | -| Health check `5m` | `CONNTRACK_HEALTH_CHECK_INTERVAL` | `5m` | -| Graceful timeout `30s` | `CONNTRACK_GRACEFUL_TIMEOUT` | `30s` | - -## Performance Impact - -Configuration changes can significantly impact performance: - -- **Larger buffers**: Better for high-throughput, uses more memory -- **More workers**: Better parallelism, uses more CPU -- **Faster flushing**: Lower latency, more CPU usage -- **Larger delta cap**: Handles bursts better, uses more memory - -Choose settings based on your environment's characteristics and requirements. From 6f7a80216f9308b182051da3bb3bf50cbf0ab99b Mon Sep 17 00:00:00 2001 From: shrouti1995 Date: Mon, 3 Nov 2025 18:40:55 +0530 Subject: [PATCH 18/19] Apply suggestions from code review Co-authored-by: Anit Gandhi --- cmd/openvswitch_exporter/main.go | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/cmd/openvswitch_exporter/main.go b/cmd/openvswitch_exporter/main.go index b78cb18..4b5d8f9 100644 --- a/cmd/openvswitch_exporter/main.go +++ b/cmd/openvswitch_exporter/main.go @@ -42,12 +42,16 @@ func main() { // Optionally register conntrack collector var conntrackAggregator conntrack.MarkZoneAggregator if *enableConntrack { - conntrackCollector, agg, err := conntrack.NewCollector() + conntrackCollector, conntrackAggregator, err := conntrack.NewCollector() if err != nil { log.Printf("Warning: Failed to create conntrack collector: %v", err) } else { prometheus.MustRegister(conntrackCollector) - conntrackAggregator = agg + defer func() { + if err := conntrackAggregator.Stop(); err != nil { + log.Printf("Conntrack aggregator shutdown error: %v", err) + } + } log.Printf("Conntrack metrics exporter enabled") } } @@ -105,12 +109,6 @@ func main() { log.Printf("Server shutdown error: %v", err) } - // Stop conntrack aggregator if it was enabled - if conntrackAggregator != nil { - if err := conntrackAggregator.Stop(); err != nil { - log.Printf("Conntrack aggregator shutdown error: %v", err) - } - } log.Printf("Exporter stopped") } From 4883f1e7973b157faf6576e58f4d67f4e166e2e0 Mon Sep 17 00:00:00 2001 From: sgangopadhyay Date: Mon, 3 Nov 2025 19:09:37 +0530 Subject: [PATCH 19/19] review comments addressed --- cmd/openvswitch_exporter/main.go | 31 +++---------- internal/conntrack/aggregator_linux.go | 11 +---- internal/ovsexporter/ovsexporter.go | 9 ++-- internal/ovsexporter/test_helpers.go | 61 -------------------------- 4 files changed, 14 insertions(+), 98 deletions(-) delete mode 100644 internal/ovsexporter/test_helpers.go diff --git a/cmd/openvswitch_exporter/main.go b/cmd/openvswitch_exporter/main.go index 4b5d8f9..a2ab22a 100644 --- a/cmd/openvswitch_exporter/main.go +++ b/cmd/openvswitch_exporter/main.go @@ -40,7 +40,6 @@ func main() { prometheus.MustRegister(collector) // Optionally register conntrack collector - var conntrackAggregator conntrack.MarkZoneAggregator if *enableConntrack { conntrackCollector, conntrackAggregator, err := conntrack.NewCollector() if err != nil { @@ -48,10 +47,12 @@ func main() { } else { prometheus.MustRegister(conntrackCollector) defer func() { - if err := conntrackAggregator.Stop(); err != nil { - log.Printf("Conntrack aggregator shutdown error: %v", err) + if conntrackAggregator != nil { + if err := conntrackAggregator.Stop(); err != nil { + log.Printf("Conntrack aggregator shutdown error: %v", err) + } } - } + }() log.Printf("Conntrack metrics exporter enabled") } } @@ -70,12 +71,7 @@ func main() { // Handle shutdown signals sigChan := make(chan os.Signal, 1) - signal.Notify(sigChan, - syscall.SIGINT, // Ctrl+C - syscall.SIGTERM, // Termination request - syscall.SIGHUP, // Hang up (config reload) - syscall.SIGQUIT, // Quit signal - ) + signal.Notify(sigChan, syscall.SIGINT, syscall.SIGTERM) // Start server in goroutine go func() { @@ -87,19 +83,7 @@ func main() { // Wait for shutdown signal sig := <-sigChan - - switch sig { - case syscall.SIGHUP: - log.Printf("Received SIGHUP, reloading config...") - // TODO: Add config reload logic here - log.Printf("Config reloaded") - return - case syscall.SIGQUIT: - log.Printf("Received SIGQUIT, shutting down immediately...") - // Immediate shutdown for SIGQUIT - default: - log.Printf("Received signal %v, stopping gracefully...", sig) - } + log.Printf("Received signal %v, stopping gracefully...", sig) // Graceful shutdown with 15 second timeout ctx, cancel := context.WithTimeout(context.Background(), 15*time.Second) @@ -109,6 +93,5 @@ func main() { log.Printf("Server shutdown error: %v", err) } - log.Printf("Exporter stopped") } diff --git a/internal/conntrack/aggregator_linux.go b/internal/conntrack/aggregator_linux.go index bd87a2f..ddb7a2a 100644 --- a/internal/conntrack/aggregator_linux.go +++ b/internal/conntrack/aggregator_linux.go @@ -341,20 +341,13 @@ func (a *ZoneMarkAggregator) performHealthCheck() error { return nil } -// GetError returns any error from the errgroup if available -func (a *ZoneMarkAggregator) GetError() error { - // This is a non-blocking way to check if there are any errors - // The actual error handling happens in Stop() - return nil -} - // Stop cancels listening and closes the connection with graceful shutdown. func (a *ZoneMarkAggregator) Stop() error { - return a.StopWithTimeout(a.config.GracefulTimeout) + return a.stopWithTimeout(a.config.GracefulTimeout) } // StopWithTimeout cancels listening and closes the connection with a configurable timeout. -func (a *ZoneMarkAggregator) StopWithTimeout(timeout time.Duration) error { +func (a *ZoneMarkAggregator) stopWithTimeout(timeout time.Duration) error { // Signal shutdown to all goroutines a.cancel() diff --git a/internal/ovsexporter/ovsexporter.go b/internal/ovsexporter/ovsexporter.go index c6d7864..9ff3284 100644 --- a/internal/ovsexporter/ovsexporter.go +++ b/internal/ovsexporter/ovsexporter.go @@ -22,16 +22,17 @@ type collector struct { cs []prometheus.Collector } -// Make sure collector implements prometheus.Collector var _ prometheus.Collector = &collector{} // New creates a new Prometheus collector which collects metrics using the // input Open vSwitch generic netlink client. func New(c *ovsnl.Client) prometheus.Collector { - collectors := []prometheus.Collector{ - newDatapathCollector(c.Datapath.List), + return &collector{ + cs: []prometheus.Collector{ + // Additional generic netlink family collectors can be added here. + newDatapathCollector(c.Datapath.List), + }, } - return &collector{cs: collectors} } // Describe implements prometheus.Collector. diff --git a/internal/ovsexporter/test_helpers.go b/internal/ovsexporter/test_helpers.go deleted file mode 100644 index 551826a..0000000 --- a/internal/ovsexporter/test_helpers.go +++ /dev/null @@ -1,61 +0,0 @@ -//go:build !linux -// +build !linux - -// Copyright 2018-2021 DigitalOcean. -// SPDX-License-Identifier: Apache-2.0 - -package ovsexporter - -import ( - "testing" - - "github.com/digitalocean/openvswitch_exporter/internal/conntrack" -) - -// TestData represents test data for aggregator testing. -type TestData struct { - Zone uint16 - Mark uint32 - Count int -} - -// ValidateSnapshot compares a snapshot with expected test data, asserting counts match for non-zero entries. -func ValidateSnapshot(t *testing.T, snapshot map[conntrack.ZoneMarkKey]int, expected []TestData) { - if snapshot == nil { - t.Fatal("expected non-nil snapshot") - } - - // Count non-zero entries in snapshot and expected. - actualCount := 0 - for _, c := range snapshot { - if c > 0 { - actualCount++ - } - } - expectedCount := 0 - for _, d := range expected { - if d.Count > 0 { - expectedCount++ - } - } - - if actualCount != expectedCount { - t.Errorf("expected %d non-zero entries, got %d", expectedCount, actualCount) - } - - // Validate specific entries. - for _, d := range expected { - if d.Count <= 0 { - continue - } - key := conntrack.ZoneMarkKey{Zone: d.Zone, Mark: d.Mark} - count, ok := snapshot[key] - if !ok { - t.Errorf("expected entry for key %v not found", key) - continue - } - if count != d.Count { - t.Errorf("expected count %d for key %v, got %d", d.Count, key, count) - } - } -}