Skip to content

Commit 01fa577

Browse files
committed
router: fix discovery service being nullified too late
If a user reconfigures vshard very frequently, the old discovery service may not notice that it has been cancelled before the new one is started. In that case the new fiber may hit an assertion failure and be restarted several times. The chosen approach is to nullify the service only when the current one is the one stored globally, because blocking on service cancellation could make vshard reconfiguration take a huge amount of time. Part of #9 NO_DOC=bugfix
1 parent ac902ae commit 01fa577

File tree

2 files changed

+30
-4
lines changed

2 files changed

+30
-4
lines changed

test/router-luatest/router_2_2_test.lua

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1140,3 +1140,28 @@ g.test_retryable_transfer_in_progress = function(g)
11401140
_G.bucket_gc_wait()
11411141
end, {bid, g.replica_2_a:replicaset_uuid()})
11421142
end
1143+
1144+
--
1145+
-- gh-ee-9: service objects are nullified too late.
1146+
--
1147+
g.test_discovery_can_be_restarted_fast = function(g)
1148+
local new_cfg_template = table.deepcopy(cfg_template)
1149+
new_cfg_template.discovery_mode = 'once'
1150+
local new_global_cfg = vtest.config_new(new_cfg_template)
1151+
vtest.clear_test_cfg_options(new_global_cfg)
1152+
1153+
g.router:exec(function(cfg)
1154+
local internal = ivshard.router.internal
1155+
internal.errinj.ERRINJ_LONG_DISCOVERY = true
1156+
ilt.helpers.retrying({timeout = 20}, function()
1157+
ivshard.router.discovery_wakeup()
1158+
ilt.assert_equals(internal.errinj.ERRINJ_LONG_DISCOVERY, 'waiting')
1159+
end)
1160+
ivshard.router._bucket_reset(1)
1161+
ivshard.router.cfg(cfg)
1162+
internal.errinj.ERRINJ_LONG_DISCOVERY = false
1163+
end, {new_global_cfg})
1164+
1165+
t.assert_not(g.router:grep_log('assertion failed'))
1166+
vtest.router_cfg(g.router, global_cfg)
1167+
end

vshard/router/init.lua

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -403,19 +403,20 @@ local function discovery_service_f(router, service)
403403
end
404404
while M.errinj.ERRINJ_LONG_DISCOVERY do
405405
M.errinj.ERRINJ_LONG_DISCOVERY = 'waiting'
406-
lfiber.sleep(0.01)
406+
-- Note, that the errinj won't throw FiberIsCancelled error.
407+
pcall(lfiber.sleep, 0.01)
407408
end
408409
until next(router.replicasets)
409410
end
410411
end
411412

412413
local function discovery_f(router)
413-
assert(not router.discovery_service)
414414
local service = lservice_info.new('discovery')
415415
router.discovery_service = service
416416
local ok, err = pcall(discovery_service_f, router, service)
417-
assert(router.discovery_service == service)
418-
router.discovery_service = nil
417+
if router.discovery_service == service then
418+
router.discovery_service = nil
419+
end
419420
if not ok then
420421
error(err)
421422
end

0 commit comments

Comments
 (0)