
Commit eb1fca1

replicaset: deduplicate error logs in master search and failover
The failover and master search services used to emit a warning on every iteration while a replicaset was unreachable, flooding the log with the same message. Now the same error is reported only once instead of being printed again on every iteration. The collect idle connections service does not need such deduplication, since it does not log any errors.

Closes #588

NO_DOC=bugfix
1 parent e58f08e commit eb1fca1
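
The deduplication relies on a small per-service log rate limiter: each failover and master search step now creates a limiter with lratelimit.create{name = ...} and reports errors through limiter:log_error(err, msg) instead of calling log.error() directly, as the vshard/replicaset.lua diff below shows. The limiter internals are not part of this commit; the following is only a minimal sketch of the idea, assuming the goal is to suppress consecutive repeats of the same error. The names limiter_create, limiter_log_error and last_err are illustrative and are not the real vshard.log_ratelimit API.

-- Minimal sketch of error-log deduplication (Tarantool Lua). The helper
-- names below are illustrative, not the vshard.log_ratelimit API.
local log = require('log')

local function limiter_create(opts)
    -- One limiter per service per replicaset, remembering the last
    -- reported error, so different services do not affect each other.
    return {name = opts.name, last_err = nil}
end

local function limiter_log_error(limiter, err, msg)
    local key = err ~= nil and tostring(err) or msg
    if key == limiter.last_err then
        -- The same error as last time: stay silent instead of flooding
        -- the log on every service iteration.
        return
    end
    limiter.last_err = key
    log.error('%s: %s', limiter.name, msg)
end

-- The same timeout reported twice produces a single log record.
local lim = limiter_create({name = 'master_search.replicaset_1'})
limiter_log_error(lim, 'TimedOut', 'Error during master search: TimedOut')
limiter_log_error(lim, 'TimedOut', 'Error during master search: TimedOut') -- suppressed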

File tree

3 files changed: +92 -19 lines changed


test/failover/config.lua

Lines changed: 1 addition & 0 deletions
@@ -131,5 +131,6 @@ return {
     weights = weights,
     sharding = sharding,
     replication_connect_quorum = 0,
+    log_level = 6, -- verbose
 }

test/router-luatest/router_2_2_test.lua

Lines changed: 74 additions & 8 deletions
@@ -588,21 +588,25 @@ g.test_named_config_identification = function(g)
     vtest.router_cfg(g.router, global_cfg)
 end

---
--- gh-ee-4: when a known master is disconnected, the router wouldn't try to find
--- the new one and would keep trying to hit the old instance. It should instead
--- try to locate a new master when the old one is disconnected.
---
-g.test_master_discovery_on_disconnect = function(g)
+local function get_auto_master_global_cfg()
     local new_cfg_template = table.deepcopy(cfg_template)
-    -- Auto-master simplifies master switch which is very needed in this test.
     for _, rs in pairs(new_cfg_template.sharding) do
         rs.master = 'auto'
         for _, r in pairs(rs.replicas) do
             r.master = nil
         end
     end
-    local new_cluster_cfg = vtest.config_new(new_cfg_template)
+    return vtest.config_new(new_cfg_template)
+end
+
+--
+-- gh-ee-4: when a known master is disconnected, the router wouldn't try to find
+-- the new one and would keep trying to hit the old instance. It should instead
+-- try to locate a new master when the old one is disconnected.
+--
+g.test_master_discovery_on_disconnect = function(g)
+    -- Auto-master simplifies master switch which is very needed in this test.
+    local new_cluster_cfg = get_auto_master_global_cfg()
     vtest.cluster_cfg(g, new_cluster_cfg)
     g.replica_1_a:update_box_cfg{read_only = false}
     g.replica_1_b:update_box_cfg{read_only = true}
@@ -1199,3 +1203,65 @@ g.test_discovery_not_spam_same_errors = function(g)
         _G.old_interval = nil
     end)
 end
+
+g.test_replicaset_services_do_not_spam_same_errors = function(g)
+    local new_cluster_cfg = get_auto_master_global_cfg()
+    vtest.router_cfg(g.router, new_cluster_cfg)
+
+    --
+    -- Replica failover service. Replicaset failover fails only on assertion,
+    -- which should not normally happen.
+    --
+    g.replica_2_a:exec(function()
+        rawset(_G, 'old_call', ivshard.storage._call)
+        ivshard.storage._call = function(service_name, ...)
+            if service_name == 'info' then
+                error('TimedOut')
+            end
+            return _G.old_call(service_name, ...)
+        end
+    end)
+    local msg = "Ping error from .*replica_2_a"
+    g.router:wait_log_exactly_once(msg, {timeout = 0.1, on_yield = function()
+        _G.failover_wakeup()
+    end})
+
+    --
+    -- Master search services.
+    --
+    g.router:exec(function(uuid)
+        ivshard.router.internal.static_router.replicasets[uuid].master = nil
+    end, {g.replica_2_a:replicaset_uuid()})
+    msg = "Error during master search"
+    g.router:wait_log_exactly_once(msg, {timeout = 0.1, on_yield = function()
+        ivshard.router.master_search_wakeup()
+    end})
+
+    g.replica_2_a:exec(function()
+        ivshard.storage._call = _G.old_call
+        _G.old_call = nil
+    end)
+    vtest.router_cfg(g.router, global_cfg)
+end
+
+g.test_log_ratelimiter_is_dropped_on_disable_of_the_service = function(g)
+    local new_cluster_cfg = get_auto_master_global_cfg()
+    vtest.router_cfg(g.router, new_cluster_cfg)
+
+    -- Limiters exists and are flushed, when auto master is enabled.
+    local name = g.router:exec(function(uuid)
+        rawset(_G, 'ratelimit', require('vshard.log_ratelimit'))
+        local rs = ivshard.router.internal.static_router.replicasets[uuid]
+        local name = 'master_search.' .. rs.id
+        ilt.assert(_G.ratelimit.internal.limiters[name])
+        return name
+    end, {g.replica_2_a:replicaset_uuid()})
+
+    -- This is not the case, when it's disabled.
+    vtest.router_cfg(g.router, global_cfg)
+    g.router:exec(function(name)
+        collectgarbage()
+        ilt.assert_not(_G.ratelimit.internal.limiters[name])
+        _G.ratelimit = nil
+    end, {name})
+end
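
The key assertion in both new router tests is g.router:wait_log_exactly_once(), a test-harness helper whose implementation is not part of this diff: it keeps waking the service up for the given timeout and then requires the pattern to appear exactly once in the router log. A rough, self-contained approximation of that check, with the log modeled as a plain array of lines (this is not the real luatest/vtest helper):

-- Hypothetical approximation of the "exactly once" check used above.
local function count_matches(log_lines, pattern)
    local n = 0
    for _, line in ipairs(log_lines) do
        if line:match(pattern) then
            n = n + 1
        end
    end
    return n
end

-- With deduplication in place the repeated ping failures leave a single
-- record, so the count must be exactly 1, not one per iteration.
local log_lines = {
    'E> Ping error from replica_2_a: perhaps a connection is down: TimedOut',
    'I> some unrelated line',
}
assert(count_matches(log_lines, 'Ping error from .*replica_2_a') == 1)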

vshard/replicaset.lua

Lines changed: 17 additions & 11 deletions
@@ -1787,8 +1787,10 @@ end

 local function replica_failover_service_step(replica, data)
     if not data.info then
+        local name = 'replica_failover'
         -- Service info is recreated on every reload.
-        data.info = lservice_info.new('replica_failover')
+        data.info = lservice_info.new(name)
+        data.limiter = lratelimit.create{name = name .. '.' .. replica.id}
     end
     if replica.errinj.ERRINJ_REPLICA_FAILOVER_DELAY then
         replica.errinj.ERRINJ_REPLICA_FAILOVER_DELAY = 'in'
@@ -1807,9 +1809,9 @@ local function replica_failover_service_step(replica, data)
     data.info:set_activity('pinging')
     local net_status, _, err = replica_failover_ping(replica, opts)
     if not net_status then
-        log.error(data.info:set_status_error(
-            'Ping error from %s: perhaps a connection is down: %s',
-            replica, err))
+        data.limiter:log_error(err, data.info:set_status_error(
+            'Ping error from %s: perhaps a connection is down: %s',
+            replica, err))
         -- Connection hangs. Recreate it to be able to
         -- fail over to a replica next by priority. The
         -- old connection is not closed in case if it just
@@ -1921,8 +1923,10 @@ end
 --
 local function replicaset_failover_service_step(replicaset, data)
     if not data.info then
+        local name = 'replicaset_failover'
         -- Service info is recreated on every reload.
-        data.info = lservice_info.new('replicaset_failover')
+        data.info = lservice_info.new(name)
+        data.limiter = lratelimit.create{name = name .. '.' .. replicaset.id}
         -- This flag is used to avoid logging like:
         -- 'All is ok ... All is ok ... All is ok ...'
         -- each replicaset.failover_interval seconds.
@@ -1939,9 +1943,9 @@ local function replicaset_failover_service_step(replicaset, data)
     data.info:set_activity('updating replicas')
     local ok, replica_is_changed = pcall(replicaset_failover_step, replicaset)
     if not ok then
-        log.error(data.info:set_status_error(
-            'Error during failovering: %s',
-            lerror.make(replica_is_changed)))
+        data.limiter:log_error(replica_is_changed, data.info:set_status_error(
+            'Error during failovering: %s',
+            lerror.make(replica_is_changed)))
         replica_is_changed = true
     else
         data.info:set_status_ok()
@@ -2024,7 +2028,9 @@ end
 --
 local function replicaset_master_search_service_step(replicaset, data)
     if not data.info then
-        data.info = lservice_info.new('master_search')
+        local name = 'master_search'
+        data.info = lservice_info.new(name)
+        data.limiter = lratelimit.create{name = name .. '.' .. replicaset.id}
         data.is_in_progress = false
     end
     data.info:next_iter()
@@ -2041,8 +2047,8 @@ local function replicaset_master_search_service_step(replicaset, data)
     local is_done, is_nop, err =
         replicaset_master_search_step(replicaset, {mode = mode})
     if err then
-        log.error(data.info:set_status_error(
-            'Error during master search: %s', lerror.make(err)))
+        data.limiter:log_error(err, data.info:set_status_error(
+            'Error during master search: %s', lerror.make(err)))
     end
     if is_done then
         timeout = consts.MASTER_SEARCH_IDLE_INTERVAL
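
The same pattern repeats in each service step above: the limiter is created once per service instance, keyed by the service name plus the replica or replicaset id, and every error goes through data.limiter:log_error() instead of log.error(). Condensed into one place it looks roughly like the sketch below; the service name 'my_service', the do_step callback and the simplified data table are stand-ins, only lratelimit.create and log_error come from the diff.

-- Condensed sketch of the per-step pattern above; 'my_service' and
-- do_step() are placeholders, not vshard internals.
local lratelimit = require('vshard.log_ratelimit')

local function service_step(replicaset, data, do_step)
    if not data.limiter then
        -- One limiter per service per replicaset, so errors from
        -- different replicasets do not suppress each other.
        data.limiter = lratelimit.create{name = 'my_service.' .. replicaset.id}
    end
    local ok, err = pcall(do_step, replicaset)
    if not ok then
        -- Repeats of the same error are dropped by the limiter instead
        -- of being logged on every service iteration.
        data.limiter:log_error(err, 'Error in my_service step: ' .. tostring(err))
    end
end

In the real code the message passed to log_error() is the one returned by data.info:set_status_error(), so the service status reported in router info and the log stay in sync.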
