
Commit eb1fca1

replicaset: deduplicate error logs in master search and failover
The failover and master search services used to emit a warning on every iteration while a replicaset was unreachable, flooding the log with the same message. Now the same error is reported only once instead of being printed again on every iteration. The collect idle connections service does not need such deduplication, since it does not log any errors.

Closes #588

NO_DOC=bugfix
1 parent e58f08e commit eb1fca1
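
The deduplication relies on a small per-service log rate limiter: each failover and master search step now creates a limiter with lratelimit.create{name = ...} and reports errors through limiter:log_error(err, msg) instead of calling log.error() directly, as the vshard/replicaset.lua diff below shows. The limiter internals are not part of this commit; the following is only a minimal sketch of the idea, assuming the goal is to suppress consecutive repeats of the same error. The names limiter_create, limiter_log_error and last_err are illustrative and are not the real vshard.log_ratelimit API.

-- Minimal sketch of error-log deduplication (Tarantool Lua). The helper
-- names below are illustrative, not the vshard.log_ratelimit API.
local log = require('log')

local function limiter_create(opts)
    -- One limiter per service per replicaset, remembering the last
    -- reported error, so different services do not affect each other.
    return {name = opts.name, last_err = nil}
end

local function limiter_log_error(limiter, err, msg)
    local key = err ~= nil and tostring(err) or msg
    if key == limiter.last_err then
        -- The same error as last time: stay silent instead of flooding
        -- the log on every service iteration.
        return
    end
    limiter.last_err = key
    log.error('%s: %s', limiter.name, msg)
end

-- The same timeout reported twice produces a single log record.
local lim = limiter_create({name = 'master_search.replicaset_1'})
limiter_log_error(lim, 'TimedOut', 'Error during master search: TimedOut')
limiter_log_error(lim, 'TimedOut', 'Error during master search: TimedOut') -- suppressed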

File tree

3 files changed: +92 -19 lines changed


test/failover/config.lua

Lines changed: 1 addition & 0 deletions
@@ -131,5 +131,6 @@ return {
     weights = weights,
     sharding = sharding,
     replication_connect_quorum = 0,
+    log_level = 6, -- verbose
 }

test/router-luatest/router_2_2_test.lua

Lines changed: 74 additions & 8 deletions
@@ -588,21 +588,25 @@ g.test_named_config_identification = function(g)
     vtest.router_cfg(g.router, global_cfg)
 end

---
--- gh-ee-4: when a known master is disconnected, the router wouldn't try to find
--- the new one and would keep trying to hit the old instance. It should instead
--- try to locate a new master when the old one is disconnected.
---
-g.test_master_discovery_on_disconnect = function(g)
+local function get_auto_master_global_cfg()
     local new_cfg_template = table.deepcopy(cfg_template)
-    -- Auto-master simplifies master switch which is very needed in this test.
     for _, rs in pairs(new_cfg_template.sharding) do
         rs.master = 'auto'
         for _, r in pairs(rs.replicas) do
             r.master = nil
         end
     end
-    local new_cluster_cfg = vtest.config_new(new_cfg_template)
+    return vtest.config_new(new_cfg_template)
+end
+
+--
+-- gh-ee-4: when a known master is disconnected, the router wouldn't try to find
+-- the new one and would keep trying to hit the old instance. It should instead
+-- try to locate a new master when the old one is disconnected.
+--
+g.test_master_discovery_on_disconnect = function(g)
+    -- Auto-master simplifies master switch which is very needed in this test.
+    local new_cluster_cfg = get_auto_master_global_cfg()
     vtest.cluster_cfg(g, new_cluster_cfg)
     g.replica_1_a:update_box_cfg{read_only = false}
     g.replica_1_b:update_box_cfg{read_only = true}
@@ -1199,3 +1203,65 @@ g.test_discovery_not_spam_same_errors = function(g)
         _G.old_interval = nil
     end)
 end
+
+g.test_replicaset_services_do_not_spam_same_errors = function(g)
+    local new_cluster_cfg = get_auto_master_global_cfg()
+    vtest.router_cfg(g.router, new_cluster_cfg)
+
+    --
+    -- Replica failover service. Replicaset failover fails only on assertion,
+    -- which should not normally happen.
+    --
+    g.replica_2_a:exec(function()
+        rawset(_G, 'old_call', ivshard.storage._call)
+        ivshard.storage._call = function(service_name, ...)
+            if service_name == 'info' then
+                error('TimedOut')
+            end
+            return _G.old_call(service_name, ...)
+        end
+    end)
+    local msg = "Ping error from .*replica_2_a"
+    g.router:wait_log_exactly_once(msg, {timeout = 0.1, on_yield = function()
+        _G.failover_wakeup()
+    end})
+
+    --
+    -- Master search services.
+    --
+    g.router:exec(function(uuid)
+        ivshard.router.internal.static_router.replicasets[uuid].master = nil
+    end, {g.replica_2_a:replicaset_uuid()})
+    msg = "Error during master search"
+    g.router:wait_log_exactly_once(msg, {timeout = 0.1, on_yield = function()
+        ivshard.router.master_search_wakeup()
+    end})
+
+    g.replica_2_a:exec(function()
+        ivshard.storage._call = _G.old_call
+        _G.old_call = nil
+    end)
+    vtest.router_cfg(g.router, global_cfg)
+end
+
+g.test_log_ratelimiter_is_dropped_on_disable_of_the_service = function(g)
+    local new_cluster_cfg = get_auto_master_global_cfg()
+    vtest.router_cfg(g.router, new_cluster_cfg)
+
+    -- Limiters exists and are flushed, when auto master is enabled.
+    local name = g.router:exec(function(uuid)
+        rawset(_G, 'ratelimit', require('vshard.log_ratelimit'))
+        local rs = ivshard.router.internal.static_router.replicasets[uuid]
+        local name = 'master_search.' .. rs.id
+        ilt.assert(_G.ratelimit.internal.limiters[name])
+        return name
+    end, {g.replica_2_a:replicaset_uuid()})
+
+    -- This is not the case, when it's disabled.
+    vtest.router_cfg(g.router, global_cfg)
+    g.router:exec(function(name)
+        collectgarbage()
+        ilt.assert_not(_G.ratelimit.internal.limiters[name])
+        _G.ratelimit = nil
+    end, {name})
+end
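
The key assertion in both new router tests is g.router:wait_log_exactly_once(), a test-harness helper whose implementation is not part of this diff: it keeps waking the service up for the given timeout and then requires the pattern to appear exactly once in the router log. A rough, self-contained approximation of that check, with the log modeled as a plain array of lines (this is not the real luatest/vtest helper):

-- Hypothetical approximation of the "exactly once" check used above.
local function count_matches(log_lines, pattern)
    local n = 0
    for _, line in ipairs(log_lines) do
        if line:match(pattern) then
            n = n + 1
        end
    end
    return n
end

-- With deduplication in place the repeated ping failures leave a single
-- record, so the count must be exactly 1, not one per iteration.
local log_lines = {
    'E> Ping error from replica_2_a: perhaps a connection is down: TimedOut',
    'I> some unrelated line',
}
assert(count_matches(log_lines, 'Ping error from .*replica_2_a') == 1)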

vshard/replicaset.lua

Lines changed: 17 additions & 11 deletions
@@ -1787,8 +1787,10 @@ end

 local function replica_failover_service_step(replica, data)
     if not data.info then
+        local name = 'replica_failover'
         -- Service info is recreated on every reload.
-        data.info = lservice_info.new('replica_failover')
+        data.info = lservice_info.new(name)
+        data.limiter = lratelimit.create{name = name .. '.' .. replica.id}
     end
     if replica.errinj.ERRINJ_REPLICA_FAILOVER_DELAY then
         replica.errinj.ERRINJ_REPLICA_FAILOVER_DELAY = 'in'
@@ -1807,9 +1809,9 @@ local function replica_failover_service_step(replica, data)
     data.info:set_activity('pinging')
     local net_status, _, err = replica_failover_ping(replica, opts)
     if not net_status then
-        log.error(data.info:set_status_error(
-            'Ping error from %s: perhaps a connection is down: %s',
-            replica, err))
+        data.limiter:log_error(err, data.info:set_status_error(
+            'Ping error from %s: perhaps a connection is down: %s',
+            replica, err))
         -- Connection hangs. Recreate it to be able to
         -- fail over to a replica next by priority. The
         -- old connection is not closed in case if it just
@@ -1921,8 +1923,10 @@ end
 --
 local function replicaset_failover_service_step(replicaset, data)
     if not data.info then
+        local name = 'replicaset_failover'
         -- Service info is recreated on every reload.
-        data.info = lservice_info.new('replicaset_failover')
+        data.info = lservice_info.new(name)
+        data.limiter = lratelimit.create{name = name .. '.' .. replicaset.id}
         -- This flag is used to avoid logging like:
         -- 'All is ok ... All is ok ... All is ok ...'
         -- each replicaset.failover_interval seconds.
@@ -1939,9 +1943,9 @@ local function replicaset_failover_service_step(replicaset, data)
     data.info:set_activity('updating replicas')
     local ok, replica_is_changed = pcall(replicaset_failover_step, replicaset)
     if not ok then
-        log.error(data.info:set_status_error(
-            'Error during failovering: %s',
-            lerror.make(replica_is_changed)))
+        data.limiter:log_error(replica_is_changed, data.info:set_status_error(
+            'Error during failovering: %s',
+            lerror.make(replica_is_changed)))
         replica_is_changed = true
     else
         data.info:set_status_ok()
@@ -2024,7 +2028,9 @@ end
 --
 local function replicaset_master_search_service_step(replicaset, data)
     if not data.info then
-        data.info = lservice_info.new('master_search')
+        local name = 'master_search'
+        data.info = lservice_info.new(name)
+        data.limiter = lratelimit.create{name = name .. '.' .. replicaset.id}
         data.is_in_progress = false
     end
     data.info:next_iter()
@@ -2041,8 +2047,8 @@ local function replicaset_master_search_service_step(replicaset, data)
     local is_done, is_nop, err =
         replicaset_master_search_step(replicaset, {mode = mode})
     if err then
-        log.error(data.info:set_status_error(
-            'Error during master search: %s', lerror.make(err)))
+        data.limiter:log_error(err, data.info:set_status_error(
+            'Error during master search: %s', lerror.make(err)))
     end
     if is_done then
         timeout = consts.MASTER_SEARCH_IDLE_INTERVAL
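
The same pattern repeats in each service step above: the limiter is created once per service instance, keyed by the service name plus the replica or replicaset id, and every error goes through data.limiter:log_error() instead of log.error(). Condensed into one place it looks roughly like the sketch below; the service name 'my_service', the do_step callback and the simplified data table are stand-ins, only lratelimit.create and log_error come from the diff.

-- Condensed sketch of the per-step pattern above; 'my_service' and
-- do_step() are placeholders, not vshard internals.
local lratelimit = require('vshard.log_ratelimit')

local function service_step(replicaset, data, do_step)
    if not data.limiter then
        -- One limiter per service per replicaset, so errors from
        -- different replicasets do not suppress each other.
        data.limiter = lratelimit.create{name = 'my_service.' .. replicaset.id}
    end
    local ok, err = pcall(do_step, replicaset)
    if not ok then
        -- Repeats of the same error are dropped by the limiter instead
        -- of being logged on every service iteration.
        data.limiter:log_error(err, 'Error in my_service step: ' .. tostring(err))
    end
end

In the real code the message passed to log_error() is the one returned by data.info:set_status_error(), so the service status reported in router info and the log stay in sync.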
