Skip to content

Commit 4674bee

Browse files
committed
recovery: reduce spam in logs
Before this patch "Finish bucket recovery step ..." logs were printed at the end of recovery even if no buckets were successfully recovered, it led to unnecessary log entries. This patch fixes the issue by adding an additional check for the number of recovered buckets. Closes #212 NO_DOC=bugfix
1 parent 3a5ddef commit 4674bee

File tree

2 files changed

+124
-1
lines changed

2 files changed

+124
-1
lines changed

test/storage-luatest/storage_1_1_1_test.lua

Lines changed: 123 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,7 @@ local cfg_template = {
4141
},
4242
bucket_count = 15,
4343
replication_timeout = 0.1,
44+
rebalancer_mode = 'auto',
4445
}
4546
local global_cfg
4647

@@ -101,3 +102,125 @@ test_group.test_manual_bucket_send_doubled_buckets = function(g)
101102
ilt.assert_equals(box.space._bucket:get(bid), nil)
102103
end, {bid})
103104
end
105+
106+
local rebalancer_recovery_group = t.group('rebalancer-recovery-logging')
107+
108+
local function get_first_storage_bucket_id(storage)
109+
return storage:exec(function()
110+
return box.space._bucket:select()[1].id
111+
end)
112+
end
113+
114+
local function move_bucket(src_storage, dest_storage, bucket_id)
115+
src_storage:exec(function(bucket_id, replicaset_id)
116+
ivshard.storage.bucket_send(bucket_id, replicaset_id)
117+
end, {bucket_id, dest_storage:replicaset_uuid()})
118+
119+
dest_storage:exec(function(bucket_id)
120+
t.helpers.retrying({timeout = 10}, function()
121+
t.assert(box.space._bucket:select(bucket_id))
122+
end)
123+
end, {bucket_id})
124+
end
125+
126+
--
127+
-- The string should be strictly in format: YEAR-MONTH-DAY HOUR:MIN:SEC.MS
128+
--
129+
local function convert_string_to_datetime(str)
130+
local date, time = string.match(str, "(%d+-%d+-%d+) (%d+:%d+:%d+.%d+)")
131+
t.assert(date)
132+
local year, month, day = string.match(date, "^(%d+)-(%d+)-(%d+)")
133+
t.assert(year, "Unable to parse 'date' field of string")
134+
local hour, min, sec, ms = string.match(time, "(%d+):(%d+):(%d+).(%d+)$")
135+
t.assert(hour, "Unable to parse 'time' field of string")
136+
137+
return require("datetime").new{
138+
year = tonumber(year), month = tonumber(month), day = tonumber(day),
139+
hour = tonumber(hour), min = tonumber(min), sec = tonumber(sec),
140+
nsec = tonumber(ms)}
141+
end
142+
143+
local function test_only_one_record_appears_in_logs(server, record, wait_time)
144+
local first_log_record = nil
145+
t.helpers.retrying({}, function()
146+
first_log_record = server:grep_log(record)
147+
t.assert(first_log_record)
148+
end)
149+
local first_log_time = convert_string_to_datetime(first_log_record)
150+
-- We need to wait a bit in order to catch how much as possible
151+
-- spam in server's logs.
152+
require("fiber").sleep(wait_time)
153+
local last_log_record = server:grep_log(record)
154+
t.assert(last_log_record)
155+
local last_log_time = convert_string_to_datetime(last_log_record)
156+
t.assert_equals(first_log_time, last_log_time,
157+
"There are two identical records in logs")
158+
end
159+
160+
rebalancer_recovery_group.before_all(function(g)
161+
global_cfg = vtest.config_new(cfg_template)
162+
vtest.cluster_new(g, global_cfg)
163+
g.router = vtest.router_new(g, 'router', global_cfg)
164+
vtest.cluster_bootstrap(g, global_cfg)
165+
vtest.cluster_wait_vclock_all(g)
166+
167+
for _, storage in pairs({g.replica_1_a, g.replica_2_a, g.replica_3_a}) do
168+
storage:exec(function()
169+
rawset(_G, 'create_test_space', function()
170+
box.schema.create_space('test_space')
171+
box.space.test_space:format({
172+
{name = 'pk', type = 'uuid'},
173+
{name = 'bucket_id', type = 'unsigned'},
174+
})
175+
box.space.test_space:create_index('primary', {parts = {'pk'}})
176+
box.space.test_space:create_index(
177+
'bucket_id', {parts = {'bucket_id'}, unique = false})
178+
end)
179+
end)
180+
g.router:exec(function(bucket_id)
181+
ivshard.router.call(bucket_id, 'write', 'create_test_space')
182+
end, {get_first_storage_bucket_id(storage)})
183+
end
184+
185+
g.buckets_not_active_log = 'Some buckets are not active'
186+
g.buckets_not_active_pattern = "%d+-%d+-%d+ %d+:%d+:%d+.%d+ .* " ..
187+
g.buckets_not_active_log
188+
g.rebalancer_wait_interval = 0.01
189+
end)
190+
191+
rebalancer_recovery_group.after_all(function(g)
192+
g.cluster:drop()
193+
end)
194+
195+
--
196+
-- Improve logging of rebalancer and recovery (gh-212).
197+
--
198+
rebalancer_recovery_group.test_no_logs_while_unsuccess_recovery = function(g)
199+
g.replica_2_a:exec(function()
200+
ivshard.storage.internal.errinj.ERRINJ_RECEIVE_PARTIALLY = true
201+
rawset(_G, 'old_call', ivshard.storage._call)
202+
ivshard.storage._call = function(service_name, ...)
203+
if service_name == 'recovery_bucket_stat' then
204+
return error('TimedOut')
205+
end
206+
return _G.old_call(service_name, ...)
207+
end
208+
end)
209+
local hanged_bucket_id = get_first_storage_bucket_id(g.replica_1_a)
210+
move_bucket(g.replica_1_a, g.replica_2_a, hanged_bucket_id)
211+
t.helpers.retrying({}, function()
212+
t.assert(g.replica_1_a:grep_log('Error during recovery of bucket 1'))
213+
end)
214+
t.assert_not(g.replica_1_a:grep_log('Finish bucket recovery step, 0'))
215+
g.replica_2_a:exec(function()
216+
ivshard.storage.internal.errinj.ERRINJ_RECEIVE_PARTIALLY = false
217+
ivshard.storage._call = _G.old_call
218+
ivshard.storage.recovery_wakeup()
219+
end)
220+
g.replica_1_a:exec(function(bucket_id)
221+
t.helpers.retrying({}, function()
222+
t.assert_equals(box.space._bucket:select(bucket_id)[1].status,
223+
'active')
224+
end)
225+
end, {hanged_bucket_id})
226+
end

vshard/storage/init.lua

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1003,7 +1003,7 @@ local function recovery_step_by_type(type)
10031003
is_step_empty = false
10041004
::continue::
10051005
end
1006-
if not is_step_empty then
1006+
if not is_step_empty and recovered > 0 then
10071007
log.info('Finish bucket recovery step, %d %s buckets are recovered '..
10081008
'among %d', recovered, type, total)
10091009
end

0 commit comments

Comments
 (0)