From 769a8ab8ba3bb267fc2a10d71ec45db4c647b860 Mon Sep 17 00:00:00 2001
From: Roman Gromov
Date: Wed, 17 Sep 2025 12:53:05 +0300
Subject: [PATCH 1/5] recovery: reduce spam of "Finish bucket recovery step"
 logs

Before this patch, "Finish bucket recovery step ..." records were
printed at the end of a recovery step even if no buckets were
successfully recovered, which produced unnecessary log records. This
patch fixes the issue by logging only when the number of recovered
buckets is greater than zero.

Part of tarantool/vshard#212

NO_DOC=bugfix
---
 test/storage-luatest/storage_1_1_1_test.lua | 94 +++++++++++++++++++++
 vshard/storage/init.lua                     |  2 +-
 2 files changed, 95 insertions(+), 1 deletion(-)

diff --git a/test/storage-luatest/storage_1_1_1_test.lua b/test/storage-luatest/storage_1_1_1_test.lua
index 694baa79..2ea32b1e 100644
--- a/test/storage-luatest/storage_1_1_1_test.lua
+++ b/test/storage-luatest/storage_1_1_1_test.lua
@@ -101,3 +101,97 @@ test_group.test_manual_bucket_send_doubled_buckets = function(g)
         ilt.assert_equals(box.space._bucket:get(bid), nil)
     end, {bid})
 end
+
+local rebalancer_recovery_group = t.group('rebalancer-recovery-logging')
+
+local function start_bucket_move(src_storage, dest_storage, bucket_id)
+    src_storage:exec(function(bucket_id, replicaset_id)
+        ivshard.storage.bucket_send(bucket_id, replicaset_id)
+    end, {bucket_id, dest_storage:replicaset_uuid()})
+
+    dest_storage:exec(function(bucket_id)
+        t.helpers.retrying({timeout = 10}, function()
+            t.assert(box.space._bucket:get(bucket_id))
+        end)
+    end, {bucket_id})
+end
+
+local function wait_for_bucket_is_transferred(src_storage, dest_storage,
+                                              bucket_id)
+    src_storage:exec(function(bucket_id)
+        t.helpers.retrying({}, function()
+            t.assert_equals(box.space._bucket:select(bucket_id), {})
+        end)
+    end, {bucket_id})
+    dest_storage:exec(function(bucket_id)
+        t.helpers.retrying({}, function()
+            t.assert_equals(box.space._bucket:select(bucket_id)[1].status,
+                            'active')
+        end)
+    end, {bucket_id})
+end
+
+rebalancer_recovery_group.before_all(function(g)
+    global_cfg = vtest.config_new(cfg_template)
+    vtest.cluster_new(g, global_cfg)
+    g.router = vtest.router_new(g, 'router', global_cfg)
+    vtest.cluster_bootstrap(g, global_cfg)
+    vtest.cluster_wait_vclock_all(g)
+
+    vtest.cluster_exec_each_master(g, function()
+        box.schema.create_space('test_space')
+        box.space.test_space:format({
+            {name = 'pk', type = 'unsigned'},
+            {name = 'bucket_id', type = 'unsigned'},
+        })
+        box.space.test_space:create_index('primary', {parts = {'pk'}})
+        box.space.test_space:create_index(
+            'bucket_id', {parts = {'bucket_id'}, unique = false})
+    end)
+end)
+
+rebalancer_recovery_group.after_all(function(g)
+    g.cluster:drop()
+end)
+
+--
+-- Improve logging of rebalancer and recovery (gh-212).
+--
+rebalancer_recovery_group.test_no_logs_while_unsuccess_recovery = function(g)
+    g.replica_2_a:exec(function()
+        ivshard.storage.internal.errinj.ERRINJ_RECEIVE_PARTIALLY = true
+        rawset(_G, 'old_call', ivshard.storage._call)
+        ivshard.storage._call = function(service_name, ...)
+            if service_name == 'recovery_bucket_stat' then
+                return error('TimedOut')
+            end
+            return _G.old_call(service_name, ...)
+        end
+    end)
+    local hanged_bucket_id_1 = vtest.storage_first_bucket(g.replica_1_a)
+    start_bucket_move(g.replica_1_a, g.replica_2_a, hanged_bucket_id_1)
+    local hanged_bucket_id_2 = vtest.storage_first_bucket(g.replica_1_a)
+    start_bucket_move(g.replica_1_a, g.replica_2_a, hanged_bucket_id_2)
+    t.helpers.retrying({}, function()
+        g.replica_1_a:exec(function() ivshard.storage.recovery_wakeup() end)
+        t.assert(g.replica_1_a:grep_log('Error during recovery of bucket 1'))
+    end)
+    t.assert_not(g.replica_1_a:grep_log('Finish bucket recovery step, 0'))
+    g.replica_2_a:exec(function()
+        ivshard.storage.internal.errinj.ERRINJ_RECEIVE_PARTIALLY = false
+        ivshard.storage._call = _G.old_call
+    end)
+    t.helpers.retrying({timeout = 60}, function()
+        g.replica_2_a:exec(function()
+            ivshard.storage.garbage_collector_wakeup()
+            ivshard.storage.recovery_wakeup()
+        end)
+        g.replica_1_a:exec(function() ivshard.storage.recovery_wakeup() end)
+        t.assert(g.replica_1_a:grep_log('Finish bucket recovery step, 2 ' ..
+                                        'sending buckets are recovered among'))
+    end)
+    wait_for_bucket_is_transferred(g.replica_2_a, g.replica_1_a,
+                                   hanged_bucket_id_1)
+    wait_for_bucket_is_transferred(g.replica_2_a, g.replica_1_a,
+                                   hanged_bucket_id_2)
+end
diff --git a/vshard/storage/init.lua b/vshard/storage/init.lua
index 97642250..869751fb 100644
--- a/vshard/storage/init.lua
+++ b/vshard/storage/init.lua
@@ -1005,7 +1005,7 @@ local function recovery_step_by_type(type, limiter)
         is_step_empty = false
 ::continue::
     end
-    if not is_step_empty then
+    if recovered > 0 then
         log.info('Finish bucket recovery step, %d %s buckets are recovered '..
                  'among %d', recovered, type, total)
     end

From f8031bf080a462658c9174b202d7e5ae0a087f06 Mon Sep 17 00:00:00 2001
From: Roman Gromov
Date: Wed, 17 Sep 2025 12:53:21 +0300
Subject: [PATCH 2/5] recovery: add logging of recovered buckets

This patch adds logging of the ids of the buckets that were recovered
during the storage recovery step.

Part of tarantool/vshard#212

NO_DOC=bugfix
---
 test/storage-luatest/storage_1_1_1_test.lua | 9 +++++++--
 vshard/storage/init.lua                     | 8 +++++++-
 2 files changed, 14 insertions(+), 3 deletions(-)

diff --git a/test/storage-luatest/storage_1_1_1_test.lua b/test/storage-luatest/storage_1_1_1_test.lua
index 2ea32b1e..04ba357f 100644
--- a/test/storage-luatest/storage_1_1_1_test.lua
+++ b/test/storage-luatest/storage_1_1_1_test.lua
@@ -187,8 +187,13 @@ rebalancer_recovery_group.test_no_logs_while_unsuccess_recovery = function(g)
             ivshard.storage.recovery_wakeup()
         end)
         g.replica_1_a:exec(function() ivshard.storage.recovery_wakeup() end)
-        t.assert(g.replica_1_a:grep_log('Finish bucket recovery step, 2 ' ..
-                                        'sending buckets are recovered among'))
+        -- In some rare cases the recovery service can recover the buckets
+        -- one by one. As a result, we get multiple "Finish bucket recovery"
+        -- and "Recovered buckets" records with different bucket ids and
+        -- bucket counts. That is why we grep generic messages, without
+        -- bucket counts and bucket ids, to avoid flakiness.
+        t.assert(g.replica_1_a:grep_log('Finish bucket recovery step'))
+        t.assert(g.replica_1_a:grep_log('Recovered buckets'))
     end)
     wait_for_bucket_is_transferred(g.replica_2_a, g.replica_1_a,
                                    hanged_bucket_id_1)
diff --git a/vshard/storage/init.lua b/vshard/storage/init.lua
index 869751fb..9d3a10db 100644
--- a/vshard/storage/init.lua
+++ b/vshard/storage/init.lua
@@ -5,6 +5,7 @@ local lmsgpack = require('msgpack')
 local netbox = require('net.box') -- for net.box:self()
 local trigger = require('internal.trigger')
 local ffi = require('ffi')
+local json_encode = require('json').encode
 local yaml_encode = require('yaml').encode
 local fiber_clock = lfiber.clock
 local fiber_yield = lfiber.yield
@@ -932,6 +933,7 @@ local function recovery_step_by_type(type, limiter)
     local recovered = 0
     local total = 0
     local start_format = 'Starting %s buckets recovery step'
+    local recovered_buckets = {SENT = {}, GARBAGE = {}, ACTIVE = {}}
     for _, bucket in _bucket.index.status:pairs(type) do
         lfiber.testcancel()
         total = total + 1
@@ -992,12 +994,15 @@ local function recovery_step_by_type(type, limiter)
         if recovery_local_bucket_is_sent(bucket, remote_bucket) then
            _bucket:update({bucket_id}, {{'=', 2, BSENT}})
            recovered = recovered + 1
+           table.insert(recovered_buckets['SENT'], bucket_id)
         elseif recovery_local_bucket_is_garbage(bucket, remote_bucket) then
            _bucket:update({bucket_id}, {{'=', 2, BGARBAGE}})
            recovered = recovered + 1
+           table.insert(recovered_buckets['GARBAGE'], bucket_id)
         elseif recovery_local_bucket_is_active(bucket, remote_bucket) then
            _bucket:replace({bucket_id, BACTIVE})
            recovered = recovered + 1
+           table.insert(recovered_buckets['ACTIVE'], bucket_id)
         elseif is_step_empty then
            log.info('Bucket %s is %s local and %s on replicaset %s, waiting',
                     bucket_id, bucket.status, remote_bucket.status, peer_id)
@@ -1007,7 +1012,8 @@
     end
     if recovered > 0 then
         log.info('Finish bucket recovery step, %d %s buckets are recovered '..
-                 'among %d', recovered, type, total)
+                 'among %d. Recovered buckets: %s', recovered, type, total,
+                 json_encode(recovered_buckets))
     end
     return total, recovered
 end

From 18585367c3c211aeb73332fa4e5d23f792372781 Mon Sep 17 00:00:00 2001
From: Roman Gromov
Date: Tue, 9 Sep 2025 16:28:55 +0300
Subject: [PATCH 3/5] rebalancer: add logging of routes

This patch adds logging of the rebalancer routes. The log now includes
the source storage, the number of buckets to move, and the destination
storage to which the buckets will be moved.

Since the wording of the rebalancer's "routes are sent" message has
changed, the `rebalancer/rebalancer.test.lua` and
`rebalancer/stress_add_remove_several_rs.test.lua` tests are updated
accordingly.

Part of tarantool/vshard#212

NO_DOC=bugfix
---
 test/rebalancer/rebalancer.result           |  4 ++--
 test/rebalancer/rebalancer.test.lua         |  4 ++--
 .../stress_add_remove_several_rs.result     |  4 ++--
 .../stress_add_remove_several_rs.test.lua   |  4 ++--
 test/storage-luatest/storage_1_1_1_test.lua | 20 +++++++++++++++++++
 vshard/storage/init.lua                     |  5 +++--
 6 files changed, 31 insertions(+), 10 deletions(-)

diff --git a/test/rebalancer/rebalancer.result b/test/rebalancer/rebalancer.result
index 8a6ebf9f..148ddbfc 100644
--- a/test/rebalancer/rebalancer.result
+++ b/test/rebalancer/rebalancer.result
@@ -149,7 +149,7 @@ test_run:switch('box_1_a')
 vshard.storage.rebalancer_enable()
 ---
 ...
-wait_rebalancer_state("Rebalance routes are sent", test_run)
+wait_rebalancer_state("The following rebalancer routes were sent", test_run)
 ---
 ...
 wait_rebalancer_state('The cluster is balanced ok', test_run)
@@ -239,7 +239,7 @@ cfg.rebalancer_disbalance_threshold = 0.01
 vshard.storage.cfg(cfg, util.name_to_uuid.box_1_a)
 ---
 ...
-wait_rebalancer_state('Rebalance routes are sent', test_run)
+wait_rebalancer_state('The following rebalancer routes were sent', test_run)
 ---
 ...
 wait_rebalancer_state('The cluster is balanced ok', test_run)
diff --git a/test/rebalancer/rebalancer.test.lua b/test/rebalancer/rebalancer.test.lua
index ec7ebcf2..bc67145e 100644
--- a/test/rebalancer/rebalancer.test.lua
+++ b/test/rebalancer/rebalancer.test.lua
@@ -78,7 +78,7 @@ util.map_bucket_protection(test_run, {REPLICASET_1}, true)

 test_run:switch('box_1_a')
 vshard.storage.rebalancer_enable()
-wait_rebalancer_state("Rebalance routes are sent", test_run)
+wait_rebalancer_state("The following rebalancer routes were sent", test_run)
 wait_rebalancer_state('The cluster is balanced ok', test_run)

 _bucket.index.status:count({vshard.consts.BUCKET.ACTIVE})
@@ -118,7 +118,7 @@ _bucket.index.status:count({vshard.consts.BUCKET.ACTIVE})
 -- Return 1%.
 cfg.rebalancer_disbalance_threshold = 0.01
 vshard.storage.cfg(cfg, util.name_to_uuid.box_1_a)
-wait_rebalancer_state('Rebalance routes are sent', test_run)
+wait_rebalancer_state('The following rebalancer routes were sent', test_run)
 wait_rebalancer_state('The cluster is balanced ok', test_run)
 _bucket.index.status:count({vshard.consts.BUCKET.ACTIVE})
 _bucket.index.status:min({vshard.consts.BUCKET.ACTIVE})
diff --git a/test/rebalancer/stress_add_remove_several_rs.result b/test/rebalancer/stress_add_remove_several_rs.result
index 6a9b0ffb..194c99c4 100644
--- a/test/rebalancer/stress_add_remove_several_rs.result
+++ b/test/rebalancer/stress_add_remove_several_rs.result
@@ -175,7 +175,7 @@ add_replicaset()
 vshard.storage.cfg(cfg, util.name_to_uuid.box_1_a)
 ---
 ...
-wait_rebalancer_state('Rebalance routes are sent', test_run)
+wait_rebalancer_state('The following rebalancer routes were sent', test_run)
 ---
 ...
 -- Now, add a second replicaset.
@@ -422,7 +422,7 @@ remove_second_replicaset_first_stage()
 vshard.storage.cfg(cfg, util.name_to_uuid.box_1_a)
 ---
 ...
-wait_rebalancer_state('Rebalance routes are sent', test_run)
+wait_rebalancer_state('The following rebalancer routes were sent', test_run)
 ---
 ...
 -- Rebalancing has been started - now remove second replicaset.
diff --git a/test/rebalancer/stress_add_remove_several_rs.test.lua b/test/rebalancer/stress_add_remove_several_rs.test.lua
index f62400f2..7c1ae3bc 100644
--- a/test/rebalancer/stress_add_remove_several_rs.test.lua
+++ b/test/rebalancer/stress_add_remove_several_rs.test.lua
@@ -71,7 +71,7 @@ fiber.sleep(0.5)
 test_run:switch('box_1_a')
 add_replicaset()
 vshard.storage.cfg(cfg, util.name_to_uuid.box_1_a)
-wait_rebalancer_state('Rebalance routes are sent', test_run)
+wait_rebalancer_state('The following rebalancer routes were sent', test_run)

 -- Now, add a second replicaset.

@@ -153,7 +153,7 @@ fiber.sleep(0.5)
 test_run:switch('box_1_a')
 remove_second_replicaset_first_stage()
 vshard.storage.cfg(cfg, util.name_to_uuid.box_1_a)
-wait_rebalancer_state('Rebalance routes are sent', test_run)
+wait_rebalancer_state('The following rebalancer routes were sent', test_run)
 -- Rebalancing has been started - now remove second replicaset.
 remove_replicaset_first_stage()
 vshard.storage.cfg(cfg, util.name_to_uuid.box_1_a)
diff --git a/test/storage-luatest/storage_1_1_1_test.lua b/test/storage-luatest/storage_1_1_1_test.lua
index 04ba357f..cc891b18 100644
--- a/test/storage-luatest/storage_1_1_1_test.lua
+++ b/test/storage-luatest/storage_1_1_1_test.lua
@@ -200,3 +200,23 @@ rebalancer_recovery_group.test_no_logs_while_unsuccess_recovery = function(g)
     wait_for_bucket_is_transferred(g.replica_2_a, g.replica_1_a,
                                    hanged_bucket_id_2)
 end
+
+rebalancer_recovery_group.test_rebalancer_routes_logging = function(g)
+    local moved_bucket_from_2 = vtest.storage_first_bucket(g.replica_2_a)
+    start_bucket_move(g.replica_2_a, g.replica_1_a, moved_bucket_from_2)
+    local moved_bucket_from_3 = vtest.storage_first_bucket(g.replica_3_a)
+    start_bucket_move(g.replica_3_a, g.replica_1_a, moved_bucket_from_3)
+    t.helpers.retrying({timeout = 60}, function()
+        g.replica_1_a:exec(function() ivshard.storage.rebalancer_wakeup() end)
+        t.assert(g.replica_1_a:grep_log('Apply rebalancer routes with 1 ' ..
+                                        'workers'))
+    end)
+    local rebalancer_routes_msg = string.format(
+        "{\"%s\":{\"%s\":1,\"%s\":1}}", g.replica_1_a:replicaset_uuid(),
+        g.replica_3_a:replicaset_uuid(), g.replica_2_a:replicaset_uuid())
+    t.helpers.retrying({}, function()
+        t.assert(g.replica_1_a:grep_log(rebalancer_routes_msg))
+        g.replica_1_a:exec(function() ivshard.storage.rebalancer_wakeup() end)
+        t.assert(g.replica_1_a:grep_log('The cluster is balanced ok.'))
+    end)
+end
diff --git a/vshard/storage/init.lua b/vshard/storage/init.lua
index 9d3a10db..bf26fa2a 100644
--- a/vshard/storage/init.lua
+++ b/vshard/storage/init.lua
@@ -2909,8 +2909,9 @@ local function rebalancer_service_f(service, limiter)
                 goto continue
             end
         end
-        log.info('Rebalance routes are sent. Schedule next wakeup after '..
-                 '%f seconds', consts.REBALANCER_WORK_INTERVAL)
+        log.info('The following rebalancer routes were sent: %s. ' ..
+                 'Schedule next wakeup after %f seconds', json_encode(routes),
+                 consts.REBALANCER_WORK_INTERVAL)
         service:set_activity('idling')
         lfiber.testcancel()
         lfiber.sleep(consts.REBALANCER_WORK_INTERVAL)

From 71947f467fb26f5e186872ba1b2bd1298c86bd6a Mon Sep 17 00:00:00 2001
From: Roman Gromov
Date: Fri, 7 Nov 2025 19:10:38 +0300
Subject: [PATCH 4/5] rebalancer: refactor rebalancer_request_state

Before this patch the function `rebalancer_request_state` returned only
nil in case of errors (e.g. the presence of SENDING, RECEIVING or
GARBAGE buckets, or an inactive rebalancer). This made it inconsistent
with the other rebalancer functions. Now, in case of an error, we
return `(nil, err)` instead of a bare `nil`, which lets the rebalancer
service propagate a meaningful error.

In addition, we introduce a new vshard error, `BUCKET_INVALID_STATE`,
for the case when a storage has non-active buckets during rebalancing.

We also remove the "Some buckets are not active" message from
`rebalancer_service_f`, because a meaningful error about non-active
buckets is already returned from `rebalancer_download_states`.
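
For illustration, a caller of the internal API can now see why the state
was not collected. A minimal sketch (the `log` call and the calling
context are illustrative, not part of this patch;
`_rebalancer_request_state` is the internal alias exercised by the unit
tests):

    local log = require('log')
    local state, err = vshard.storage._rebalancer_request_state()
    if state == nil then
        -- Instead of a silent bare nil, `err` is now an error object,
        -- e.g. BUCKET_INVALID_STATE with the `replica_id` and
        -- `buckets_state` fields.
        log.info('Cannot collect rebalancer state: %s', err)
    end
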
NO_TEST=refactoring
NO_DOC=refactoring
---
 test/rebalancer/rebalancer.result   |  9 +++++----
 test/rebalancer/rebalancer.test.lua |  6 +++++-
 test/unit/rebalancer.result         | 16 +++++++++++++---
 test/unit/rebalancer.test.lua       |  6 +++++-
 vshard/error.lua                    |  5 +++++
 vshard/storage/init.lua             | 22 ++++++++++++----------
 6 files changed, 45 insertions(+), 19 deletions(-)

diff --git a/test/rebalancer/rebalancer.result b/test/rebalancer/rebalancer.result
index 148ddbfc..59728cd2 100644
--- a/test/rebalancer/rebalancer.result
+++ b/test/rebalancer/rebalancer.result
@@ -318,12 +318,13 @@ _bucket:update({150}, {{'=', 2, vshard.consts.BUCKET.RECEIVING}})
 ---
 - [150, 'receiving']
 ...
-wait_rebalancer_state("Some buckets are not active", test_run)
----
-...
+-- We should not check for a specific bucket status (e.g. receiving), because
+-- in rare cases we can accidentally get the wrong one. For example, we may
+-- wait for "receiving" but get "garbage" due to a previous rebalancer error.
+wait_rebalancer_state('Error during downloading rebalancer states:.*' .. \
+'BUCKET_INVALID_STATE', test_run) \
 _bucket:update({150}, {{'=', 2, vshard.consts.BUCKET.ACTIVE}})
 ---
-- [150, 'active']
 ...
 vshard.storage.sync()
 ---
 ...
diff --git a/test/rebalancer/rebalancer.test.lua b/test/rebalancer/rebalancer.test.lua
index bc67145e..dfe97198 100644
--- a/test/rebalancer/rebalancer.test.lua
+++ b/test/rebalancer/rebalancer.test.lua
@@ -156,7 +156,11 @@ util.map_bucket_protection(test_run, {REPLICASET_1}, false)
 test_run:switch('box_1_a')
 vshard.storage.rebalancer_enable()
 _bucket:update({150}, {{'=', 2, vshard.consts.BUCKET.RECEIVING}})
-wait_rebalancer_state("Some buckets are not active", test_run)
+-- We should not check for a specific bucket status (e.g. receiving), because
+-- in rare cases we can accidentally get the wrong one. For example, we may
+-- wait for "receiving" but get "garbage" due to a previous rebalancer error.
+wait_rebalancer_state('Error during downloading rebalancer states:.*' .. \
+'BUCKET_INVALID_STATE', test_run) \
 _bucket:update({150}, {{'=', 2, vshard.consts.BUCKET.ACTIVE}})
 vshard.storage.sync()

diff --git a/test/unit/rebalancer.result b/test/unit/rebalancer.result
index d312a411..236fc0ec 100644
--- a/test/unit/rebalancer.result
+++ b/test/unit/rebalancer.result
@@ -290,9 +290,11 @@ build_routes(replicasets)
 vshard.storage.internal.is_master = true
 ---
 ...
-get_state = vshard.storage._rebalancer_request_state
----
-...
+get_state = function() \
+    local res, err = vshard.storage._rebalancer_request_state() \
+    if res == nil then err.trace = nil end \
+    return res, err \
+end \
 _bucket = box.schema.create_space('_bucket')
 ---
 ...
@@ -318,6 +320,7 @@ get_state()
 ---
 - bucket_active_count: 2
   bucket_pinned_count: 0
+- null
 ...
 _bucket:replace{1, consts.BUCKET.RECEIVING}
 ---
@@ -324,6 +327,13 @@
 ...
 get_state()
 ---
+- null
+- buckets_state: receiving
+  code: 42
+  replica_id: _
+  type: ShardingError
+  message: Replica _ has receiving buckets
+  name: BUCKET_INVALID_STATE
 ...
 vshard.storage.internal.is_master = false
 ---
 ...
diff --git a/test/unit/rebalancer.test.lua b/test/unit/rebalancer.test.lua
index e6d54b81..4a6d556c 100644
--- a/test/unit/rebalancer.test.lua
+++ b/test/unit/rebalancer.test.lua
@@ -76,7 +76,11 @@ build_routes(replicasets)
 -- Test rebalancer local state.
 --
 vshard.storage.internal.is_master = true
-get_state = vshard.storage._rebalancer_request_state
+get_state = function() \
+    local res, err = vshard.storage._rebalancer_request_state() \
+    if res == nil then err.trace = nil end \
+    return res, err \
+end \
 _bucket = box.schema.create_space('_bucket')
 pk = _bucket:create_index('pk')
 status = _bucket:create_index('status', {parts = {{2, 'string'}}, unique = false})
diff --git a/vshard/error.lua b/vshard/error.lua
index 0892758c..5ffebfc5 100644
--- a/vshard/error.lua
+++ b/vshard/error.lua
@@ -207,6 +207,11 @@ local error_message_template = {
         msg = 'Mismatch server name: expected "%s", but got "%s"',
         args = {'expected_name', 'actual_name'},
     },
+    [42] = {
+        name = 'BUCKET_INVALID_STATE',
+        msg = 'Replica %s has %s buckets',
+        args = {'replica_id', 'buckets_state'},
+    },
 }

 --
diff --git a/vshard/storage/init.lua b/vshard/storage/init.lua
index bf26fa2a..b842b588 100644
--- a/vshard/storage/init.lua
+++ b/vshard/storage/init.lua
@@ -2851,12 +2851,10 @@ local function rebalancer_service_f(service, limiter)
             return
         end
         if not status or replicasets == nil then
-            local err = status and total_bucket_active_count or replicasets
-            if err then
-                limiter:log_error(err, service:set_status_error(
-                    'Error during downloading rebalancer states: %s', err))
-            end
-            log.info('Some buckets are not active, retry rebalancing later')
+            local err = total_bucket_active_count
+            limiter:log_error(err, service:set_status_error(
+                'Error during downloading rebalancer states: %s, ' ..
+                'retry rebalancing later', err))
             service:set_activity('idling')
             lfiber.testcancel()
             lfiber.sleep(consts.REBALANCER_WORK_INTERVAL)
@@ -2945,18 +2943,22 @@ local function rebalancer_request_state()
         return nil, err
     end
     if not M.is_rebalancer_active or rebalancing_is_in_progress() then
-        return
+        return nil, lerror.make('Rebalancer is not active or is in progress')
     end
     local _bucket = box.space._bucket
     local status_index = _bucket.index.status
+    local repl_id = M.this_replica and M.this_replica.id or '_'
     if #status_index:select({BSENDING}, {limit = 1}) > 0 then
-        return
+        return nil, lerror.vshard(lerror.code.BUCKET_INVALID_STATE,
+                                  repl_id, 'sending')
     end
     if #status_index:select({BRECEIVING}, {limit = 1}) > 0 then
-        return
+        return nil, lerror.vshard(lerror.code.BUCKET_INVALID_STATE,
+                                  repl_id, 'receiving')
     end
     if #status_index:select({BGARBAGE}, {limit = 1}) > 0 then
-        return
+        return nil, lerror.vshard(lerror.code.BUCKET_INVALID_STATE,
+                                  repl_id, 'garbage')
     end
     return {
         bucket_active_count = status_index:count({BACTIVE}),

From 16b39a2ec479e737b358236abed02eb59840407d Mon Sep 17 00:00:00 2001
From: Roman Gromov
Date: Fri, 7 Nov 2025 19:10:42 +0300
Subject: [PATCH 5/5] rebalancer: add replicaset.id to
 rebalancer_request_state errors

Before this patch the function `rebalancer_download_states` didn't
return any information about the replicaset from which the states could
not be downloaded. As a result, the errors returned from
`rebalancer_download_states` lacked valuable information about the
unhealthy replicaset. Now we add replicaset.id to the error returned
from `rebalancer_download_states`, which lets the rebalancer service
report the problematic replicaset.
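
For example, on the consuming side the enriched error now pinpoints the
replicaset. A simplified sketch (the `log.error` call and the
surrounding control flow are illustrative; the real call site in
`rebalancer_service_f` goes through the service status and the log rate
limiter):

    local log = require('log')
    -- On failure `rebalancer_download_states()` returns `nil, err`,
    -- and `err.replicaset_id` now names the replicaset whose state
    -- could not be downloaded.
    local replicasets, err = rebalancer_download_states()
    if replicasets == nil and err ~= nil then
        log.error('Error during downloading rebalancer states: %s', err)
    end
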
Closes #212

NO_DOC=bugfix
---
 test/storage-luatest/storage_1_1_1_test.lua | 17 +++++++++++++++++
 vshard/storage/init.lua                     |  1 +
 2 files changed, 18 insertions(+)

diff --git a/test/storage-luatest/storage_1_1_1_test.lua b/test/storage-luatest/storage_1_1_1_test.lua
index cc891b18..dc03a180 100644
--- a/test/storage-luatest/storage_1_1_1_test.lua
+++ b/test/storage-luatest/storage_1_1_1_test.lua
@@ -220,3 +220,20 @@ rebalancer_recovery_group.test_rebalancer_routes_logging = function(g)
         g.replica_1_a:exec(function() ivshard.storage.rebalancer_wakeup() end)
         t.assert(g.replica_1_a:grep_log('The cluster is balanced ok.'))
     end)
 end
+
+rebalancer_recovery_group.test_no_log_spam_when_buckets_no_active = function(g)
+    local moved_bucket = vtest.storage_first_bucket(g.replica_2_a)
+    start_bucket_move(g.replica_1_a, g.replica_2_a, moved_bucket)
+    wait_for_bucket_is_transferred(g.replica_1_a, g.replica_2_a, moved_bucket)
+    vtest.storage_stop(g.replica_2_a)
+    local err_log = string.format('Error during downloading rebalancer ' ..
+                                  'states:.*"replicaset_id":"%s"',
+                                  g.replica_2_a:replicaset_uuid())
+    t.helpers.retrying({timeout = 60}, function()
+        g.replica_1_a:exec(function() ivshard.storage.rebalancer_wakeup() end)
+        t.assert(g.replica_1_a:grep_log(err_log))
+    end)
+    vtest.storage_start(g.replica_2_a, global_cfg)
+    start_bucket_move(g.replica_2_a, g.replica_1_a, moved_bucket)
+    wait_for_bucket_is_transferred(g.replica_2_a, g.replica_1_a, moved_bucket)
+end
diff --git a/vshard/storage/init.lua b/vshard/storage/init.lua
index b842b588..213eef23 100644
--- a/vshard/storage/init.lua
+++ b/vshard/storage/init.lua
@@ -2806,6 +2806,7 @@ local function rebalancer_download_states()
             replicaset, 'vshard.storage.rebalancer_request_state', {},
             {timeout = consts.REBALANCER_GET_STATE_TIMEOUT})
         if state == nil then
+            err.replicaset_id = replicaset.id
             return nil, err
         end
         local bucket_count = state.bucket_active_count +