Skip to content

Commit 9a0cea2

Browse files
committed
recovery: reduce spam of "Finish bucket recovery step" logs
Before this patch "Finish bucket recovery step ..." logs were printed at the end of recovery even if no buckets were successfully recovered. It led to unnecessary log records. This patch fixes the issue by adding an additional check for the number of recovered buckets. Also we print id of every bucket which was successfully recovered in order to track buckets' duplication. Part of #212 NO_DOC=bugfix
1 parent 3a5ddef commit 9a0cea2

File tree

2 files changed

+96
-1
lines changed

2 files changed

+96
-1
lines changed

test/storage-luatest/storage_1_1_1_test.lua

Lines changed: 92 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -101,3 +101,95 @@ test_group.test_manual_bucket_send_doubled_buckets = function(g)
101101
ilt.assert_equals(box.space._bucket:get(bid), nil)
102102
end, {bid})
103103
end
104+
105+
local rebalancer_recovery_group = t.group('rebalancer-recovery-logging')
106+
107+
local function start_bucket_move(src_storage, dest_storage, bucket_id)
108+
src_storage:exec(function(bucket_id, replicaset_id)
109+
ivshard.storage.bucket_send(bucket_id, replicaset_id)
110+
end, {bucket_id, dest_storage:replicaset_uuid()})
111+
112+
dest_storage:exec(function(bucket_id)
113+
t.helpers.retrying({timeout = 10}, function()
114+
t.assert(box.space._bucket:select(bucket_id))
115+
end)
116+
end, {bucket_id})
117+
end
118+
119+
local function assert_bucket_is_transferred(src_storage, dest_storage,
120+
bucket_id)
121+
src_storage:exec(function(bucket_id)
122+
t.helpers.retrying({}, function()
123+
t.assert_equals(box.space._bucket:select(bucket_id), {})
124+
end)
125+
end, {bucket_id})
126+
dest_storage:exec(function(bucket_id)
127+
t.helpers.retrying({}, function()
128+
t.assert_equals(box.space._bucket:select(bucket_id)[1].status,
129+
'active')
130+
end)
131+
end, {bucket_id})
132+
end
133+
134+
rebalancer_recovery_group.before_all(function(g)
135+
global_cfg = vtest.config_new(cfg_template)
136+
vtest.cluster_new(g, global_cfg)
137+
g.router = vtest.router_new(g, 'router', global_cfg)
138+
vtest.cluster_bootstrap(g, global_cfg)
139+
vtest.cluster_wait_vclock_all(g)
140+
141+
vtest.cluster_exec_each_master(g, function()
142+
box.schema.create_space('test_space')
143+
box.space.test_space:format({
144+
{name = 'pk', type = 'unsigned'},
145+
{name = 'bucket_id', type = 'unsigned'},
146+
})
147+
box.space.test_space:create_index('primary', {parts = {'pk'}})
148+
box.space.test_space:create_index(
149+
'bucket_id', {parts = {'bucket_id'}, unique = false})
150+
end)
151+
end)
152+
153+
rebalancer_recovery_group.after_all(function(g)
154+
g.cluster:drop()
155+
end)
156+
157+
--
158+
-- Improve logging of rebalancer and recovery (gh-212).
159+
--
160+
rebalancer_recovery_group.test_no_logs_while_unsuccess_recovery = function(g)
161+
g.replica_2_a:exec(function()
162+
ivshard.storage.internal.errinj.ERRINJ_RECEIVE_PARTIALLY = true
163+
rawset(_G, 'old_call', ivshard.storage._call)
164+
ivshard.storage._call = function(service_name, ...)
165+
if service_name == 'recovery_bucket_stat' then
166+
return error('TimedOut')
167+
end
168+
return _G.old_call(service_name, ...)
169+
end
170+
end)
171+
local hanged_bucket_id_1 = vtest.storage_first_bucket(g.replica_1_a)
172+
start_bucket_move(g.replica_1_a, g.replica_2_a, hanged_bucket_id_1)
173+
local hanged_bucket_id_2 = vtest.storage_first_bucket(g.replica_1_a)
174+
start_bucket_move(g.replica_1_a, g.replica_2_a, hanged_bucket_id_2)
175+
t.helpers.retrying({}, function()
176+
t.assert(g.replica_1_a:grep_log('Error during recovery of bucket 1'))
177+
end)
178+
t.assert_not(g.replica_1_a:grep_log('Finish bucket recovery step, 0'))
179+
g.replica_2_a:exec(function()
180+
ivshard.storage.internal.errinj.ERRINJ_RECEIVE_PARTIALLY = false
181+
ivshard.storage._call = _G.old_call
182+
ivshard.storage.garbage_collector_wakeup()
183+
ivshard.storage.recovery_wakeup()
184+
end)
185+
g.replica_1_a:exec(function() ivshard.storage.recovery_wakeup() end)
186+
t.helpers.retrying({}, function()
187+
t.assert(g.replica_1_a:grep_log('Finish bucket recovery step, 2 ' ..
188+
'sending buckets are recovered among'))
189+
t.assert(g.replica_1_a:grep_log('Recovered buckets: %[1,2%]'))
190+
end)
191+
assert_bucket_is_transferred(g.replica_2_a, g.replica_1_a,
192+
hanged_bucket_id_1)
193+
assert_bucket_is_transferred(g.replica_2_a, g.replica_1_a,
194+
hanged_bucket_id_2)
195+
end

vshard/storage/init.lua

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -931,6 +931,7 @@ local function recovery_step_by_type(type)
931931
local recovered = 0
932932
local total = 0
933933
local start_format = 'Starting %s buckets recovery step'
934+
local recovered_buckets = {}
934935
for _, bucket in _bucket.index.status:pairs(type) do
935936
lfiber.testcancel()
936937
total = total + 1
@@ -1001,11 +1002,13 @@ local function recovery_step_by_type(type)
10011002
bucket_id, bucket.status, remote_bucket.status, peer_id)
10021003
end
10031004
is_step_empty = false
1005+
table.insert(recovered_buckets, bucket_id)
10041006
::continue::
10051007
end
1006-
if not is_step_empty then
1008+
if recovered > 0 then
10071009
log.info('Finish bucket recovery step, %d %s buckets are recovered '..
10081010
'among %d', recovered, type, total)
1011+
log.info('Recovered buckets: %s', recovered_buckets)
10091012
end
10101013
return total, recovered
10111014
end

0 commit comments

Comments
 (0)