Skip to content

Commit b9e5153

Browse files
committed
recovery: reduce spam of "Finish bucket recovery step" logs
Before this patch "Finish bucket recovery step ..." logs were printed at the end of recovery even if no buckets were successfully recovered. It led to unnecessary log records. This patch fixes the issue by adding an additional check for the number of recovered buckets. Also we print id of every bucket which was successfully recovered in order to track buckets' duplication. Part of #212 NO_DOC=bugfix
1 parent 3a5ddef commit b9e5153

File tree

2 files changed

+97
-1
lines changed

2 files changed

+97
-1
lines changed

test/storage-luatest/storage_1_1_1_test.lua

Lines changed: 92 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -101,3 +101,95 @@ test_group.test_manual_bucket_send_doubled_buckets = function(g)
101101
ilt.assert_equals(box.space._bucket:get(bid), nil)
102102
end, {bid})
103103
end
104+
105+
local rebalancer_recovery_group = t.group('rebalancer-recovery-logging')
106+
107+
local function start_bucket_move(src_storage, dest_storage, bucket_id)
108+
src_storage:exec(function(bucket_id, replicaset_id)
109+
ivshard.storage.bucket_send(bucket_id, replicaset_id)
110+
end, {bucket_id, dest_storage:replicaset_uuid()})
111+
112+
dest_storage:exec(function(bucket_id)
113+
t.helpers.retrying({timeout = 10}, function()
114+
t.assert(box.space._bucket:select(bucket_id))
115+
end)
116+
end, {bucket_id})
117+
end
118+
119+
local function assert_bucket_is_transferred(src_storage, dest_storage,
120+
bucket_id)
121+
src_storage:exec(function(bucket_id)
122+
t.helpers.retrying({}, function()
123+
t.assert_equals(box.space._bucket:select(bucket_id), {})
124+
end)
125+
end, {bucket_id})
126+
dest_storage:exec(function(bucket_id)
127+
t.helpers.retrying({}, function()
128+
t.assert_equals(box.space._bucket:select(bucket_id)[1].status,
129+
'active')
130+
end)
131+
end, {bucket_id})
132+
end
133+
134+
rebalancer_recovery_group.before_all(function(g)
135+
global_cfg = vtest.config_new(cfg_template)
136+
vtest.cluster_new(g, global_cfg)
137+
g.router = vtest.router_new(g, 'router', global_cfg)
138+
vtest.cluster_bootstrap(g, global_cfg)
139+
vtest.cluster_wait_vclock_all(g)
140+
141+
vtest.cluster_exec_each_master(g, function()
142+
box.schema.create_space('test_space')
143+
box.space.test_space:format({
144+
{name = 'pk', type = 'unsigned'},
145+
{name = 'bucket_id', type = 'unsigned'},
146+
})
147+
box.space.test_space:create_index('primary', {parts = {'pk'}})
148+
box.space.test_space:create_index(
149+
'bucket_id', {parts = {'bucket_id'}, unique = false})
150+
end)
151+
end)
152+
153+
rebalancer_recovery_group.after_all(function(g)
154+
g.cluster:drop()
155+
end)
156+
157+
--
158+
-- Improve logging of rebalancer and recovery (gh-212).
159+
--
160+
rebalancer_recovery_group.test_no_logs_while_unsuccess_recovery = function(g)
161+
g.replica_2_a:exec(function()
162+
ivshard.storage.internal.errinj.ERRINJ_RECEIVE_PARTIALLY = true
163+
rawset(_G, 'old_call', ivshard.storage._call)
164+
ivshard.storage._call = function(service_name, ...)
165+
if service_name == 'recovery_bucket_stat' then
166+
return error('TimedOut')
167+
end
168+
return _G.old_call(service_name, ...)
169+
end
170+
end)
171+
local hanged_bucket_id_1 = vtest.storage_first_bucket(g.replica_1_a)
172+
start_bucket_move(g.replica_1_a, g.replica_2_a, hanged_bucket_id_1)
173+
local hanged_bucket_id_2 = vtest.storage_first_bucket(g.replica_1_a)
174+
start_bucket_move(g.replica_1_a, g.replica_2_a, hanged_bucket_id_2)
175+
t.helpers.retrying({}, function()
176+
t.assert(g.replica_1_a:grep_log('Error during recovery of bucket 1'))
177+
end)
178+
t.assert_not(g.replica_1_a:grep_log('Finish bucket recovery step, 0'))
179+
g.replica_2_a:exec(function()
180+
ivshard.storage.internal.errinj.ERRINJ_RECEIVE_PARTIALLY = false
181+
ivshard.storage._call = _G.old_call
182+
ivshard.storage.garbage_collector_wakeup()
183+
ivshard.storage.recovery_wakeup()
184+
end)
185+
g.replica_1_a:exec(function() ivshard.storage.recovery_wakeup() end)
186+
t.helpers.retrying({}, function()
187+
t.assert(g.replica_1_a:grep_log('Finish bucket recovery step, 2 ' ..
188+
'sending buckets are recovered among'))
189+
t.assert(g.replica_1_a:grep_log('Recovered buckets: %[1,2%]'))
190+
end)
191+
assert_bucket_is_transferred(g.replica_2_a, g.replica_1_a,
192+
hanged_bucket_id_1)
193+
assert_bucket_is_transferred(g.replica_2_a, g.replica_1_a,
194+
hanged_bucket_id_2)
195+
end

vshard/storage/init.lua

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@ local lmsgpack = require('msgpack')
55
local netbox = require('net.box') -- for net.box:self()
66
local trigger = require('internal.trigger')
77
local ffi = require('ffi')
8+
local json_encode = require('json').encode
89
local yaml_encode = require('yaml').encode
910
local fiber_clock = lfiber.clock
1011
local fiber_yield = lfiber.yield
@@ -931,6 +932,7 @@ local function recovery_step_by_type(type)
931932
local recovered = 0
932933
local total = 0
933934
local start_format = 'Starting %s buckets recovery step'
935+
local recovered_buckets = {}
934936
for _, bucket in _bucket.index.status:pairs(type) do
935937
lfiber.testcancel()
936938
total = total + 1
@@ -1001,11 +1003,13 @@ local function recovery_step_by_type(type)
10011003
bucket_id, bucket.status, remote_bucket.status, peer_id)
10021004
end
10031005
is_step_empty = false
1006+
table.insert(recovered_buckets, bucket_id)
10041007
::continue::
10051008
end
1006-
if not is_step_empty then
1009+
if recovered > 0 then
10071010
log.info('Finish bucket recovery step, %d %s buckets are recovered '..
10081011
'among %d', recovered, type, total)
1012+
log.info('Recovered buckets: %s', json_encode(recovered_buckets))
10091013
end
10101014
return total, recovered
10111015
end

0 commit comments

Comments
 (0)