From 769a8ab8ba3bb267fc2a10d71ec45db4c647b860 Mon Sep 17 00:00:00 2001
From: Roman Gromov
Date: Wed, 17 Sep 2025 12:53:05 +0300
Subject: [PATCH 1/5] recovery: reduce spam of "Finish bucket recovery step"
 logs

Before this patch, "Finish bucket recovery step ..." records were
printed at the end of a recovery step even if no buckets were
successfully recovered, which produced unnecessary log records. This
patch fixes the issue by logging only when the number of recovered
buckets is greater than zero.

Part of tarantool/vshard#212

NO_DOC=bugfix
---
 test/storage-luatest/storage_1_1_1_test.lua | 94 +++++++++++++++++++++
 vshard/storage/init.lua                     |  2 +-
 2 files changed, 95 insertions(+), 1 deletion(-)

diff --git a/test/storage-luatest/storage_1_1_1_test.lua b/test/storage-luatest/storage_1_1_1_test.lua
index 694baa79..2ea32b1e 100644
--- a/test/storage-luatest/storage_1_1_1_test.lua
+++ b/test/storage-luatest/storage_1_1_1_test.lua
@@ -101,3 +101,97 @@ test_group.test_manual_bucket_send_doubled_buckets = function(g)
         ilt.assert_equals(box.space._bucket:get(bid), nil)
     end, {bid})
 end
+
+local rebalancer_recovery_group = t.group('rebalancer-recovery-logging')
+
+local function start_bucket_move(src_storage, dest_storage, bucket_id)
+    src_storage:exec(function(bucket_id, replicaset_id)
+        ivshard.storage.bucket_send(bucket_id, replicaset_id)
+    end, {bucket_id, dest_storage:replicaset_uuid()})
+
+    dest_storage:exec(function(bucket_id)
+        t.helpers.retrying({timeout = 10}, function()
+            t.assert(box.space._bucket:get(bucket_id))
+        end)
+    end, {bucket_id})
+end
+
+local function wait_for_bucket_is_transferred(src_storage, dest_storage,
+                                              bucket_id)
+    src_storage:exec(function(bucket_id)
+        t.helpers.retrying({}, function()
+            t.assert_equals(box.space._bucket:select(bucket_id), {})
+        end)
+    end, {bucket_id})
+    dest_storage:exec(function(bucket_id)
+        t.helpers.retrying({}, function()
+            t.assert_equals(box.space._bucket:select(bucket_id)[1].status,
+                            'active')
+        end)
+    end, {bucket_id})
+end
+
+rebalancer_recovery_group.before_all(function(g)
+    global_cfg = vtest.config_new(cfg_template)
+    vtest.cluster_new(g, global_cfg)
+    g.router = vtest.router_new(g, 'router', global_cfg)
+    vtest.cluster_bootstrap(g, global_cfg)
+    vtest.cluster_wait_vclock_all(g)
+
+    vtest.cluster_exec_each_master(g, function()
+        box.schema.create_space('test_space')
+        box.space.test_space:format({
+            {name = 'pk', type = 'unsigned'},
+            {name = 'bucket_id', type = 'unsigned'},
+        })
+        box.space.test_space:create_index('primary', {parts = {'pk'}})
+        box.space.test_space:create_index(
+            'bucket_id', {parts = {'bucket_id'}, unique = false})
+    end)
+end)
+
+rebalancer_recovery_group.after_all(function(g)
+    g.cluster:drop()
+end)
+
+--
+-- Improve logging of rebalancer and recovery (gh-212).
+--
+rebalancer_recovery_group.test_no_logs_while_unsuccess_recovery = function(g)
+    g.replica_2_a:exec(function()
+        ivshard.storage.internal.errinj.ERRINJ_RECEIVE_PARTIALLY = true
+        rawset(_G, 'old_call', ivshard.storage._call)
+        ivshard.storage._call = function(service_name, ...)
+            if service_name == 'recovery_bucket_stat' then
+                return error('TimedOut')
+            end
+            return _G.old_call(service_name, ...)
+        end
+    end)
+    local hanged_bucket_id_1 = vtest.storage_first_bucket(g.replica_1_a)
+    start_bucket_move(g.replica_1_a, g.replica_2_a, hanged_bucket_id_1)
+    local hanged_bucket_id_2 = vtest.storage_first_bucket(g.replica_1_a)
+    start_bucket_move(g.replica_1_a, g.replica_2_a, hanged_bucket_id_2)
+    t.helpers.retrying({}, function()
+        g.replica_1_a:exec(function() ivshard.storage.recovery_wakeup() end)
+        t.assert(g.replica_1_a:grep_log('Error during recovery of bucket 1'))
+    end)
+    t.assert_not(g.replica_1_a:grep_log('Finish bucket recovery step, 0'))
+    g.replica_2_a:exec(function()
+        ivshard.storage.internal.errinj.ERRINJ_RECEIVE_PARTIALLY = false
+        ivshard.storage._call = _G.old_call
+    end)
+    t.helpers.retrying({timeout = 60}, function()
+        g.replica_2_a:exec(function()
+            ivshard.storage.garbage_collector_wakeup()
+            ivshard.storage.recovery_wakeup()
+        end)
+        g.replica_1_a:exec(function() ivshard.storage.recovery_wakeup() end)
+        t.assert(g.replica_1_a:grep_log('Finish bucket recovery step, 2 ' ..
+                                        'sending buckets are recovered among'))
+    end)
+    wait_for_bucket_is_transferred(g.replica_2_a, g.replica_1_a,
+                                   hanged_bucket_id_1)
+    wait_for_bucket_is_transferred(g.replica_2_a, g.replica_1_a,
+                                   hanged_bucket_id_2)
+end
diff --git a/vshard/storage/init.lua b/vshard/storage/init.lua
index 97642250..869751fb 100644
--- a/vshard/storage/init.lua
+++ b/vshard/storage/init.lua
@@ -1005,7 +1005,7 @@ local function recovery_step_by_type(type, limiter)
         is_step_empty = false
 ::continue::
     end
-    if not is_step_empty then
+    if recovered > 0 then
         log.info('Finish bucket recovery step, %d %s buckets are recovered '..
                  'among %d', recovered, type, total)
     end

From f8031bf080a462658c9174b202d7e5ae0a087f06 Mon Sep 17 00:00:00 2001
From: Roman Gromov
Date: Wed, 17 Sep 2025 12:53:21 +0300
Subject: [PATCH 2/5] recovery: add logging of recovered buckets

This patch adds logging of the ids of the buckets that were recovered
during the storage recovery step.

Part of tarantool/vshard#212

NO_DOC=bugfix
---
 test/storage-luatest/storage_1_1_1_test.lua | 9 +++++++--
 vshard/storage/init.lua                     | 8 +++++++-
 2 files changed, 14 insertions(+), 3 deletions(-)

diff --git a/test/storage-luatest/storage_1_1_1_test.lua b/test/storage-luatest/storage_1_1_1_test.lua
index 2ea32b1e..04ba357f 100644
--- a/test/storage-luatest/storage_1_1_1_test.lua
+++ b/test/storage-luatest/storage_1_1_1_test.lua
@@ -187,8 +187,13 @@ rebalancer_recovery_group.test_no_logs_while_unsuccess_recovery = function(g)
             ivshard.storage.recovery_wakeup()
         end)
         g.replica_1_a:exec(function() ivshard.storage.recovery_wakeup() end)
-        t.assert(g.replica_1_a:grep_log('Finish bucket recovery step, 2 ' ..
-                                        'sending buckets are recovered among'))
+        -- In some rare cases the recovery service can recover the buckets
+        -- one by one. As a result, we get multiple "Finish bucket recovery"
+        -- and "Recovered buckets" records with different bucket ids and
+        -- bucket counts. That is why we grep generic messages, without
+        -- bucket counts and bucket ids, to avoid flakiness.
+        t.assert(g.replica_1_a:grep_log('Finish bucket recovery step'))
+        t.assert(g.replica_1_a:grep_log('Recovered buckets'))
     end)
     wait_for_bucket_is_transferred(g.replica_2_a, g.replica_1_a,
                                    hanged_bucket_id_1)
diff --git a/vshard/storage/init.lua b/vshard/storage/init.lua
index 869751fb..9d3a10db 100644
--- a/vshard/storage/init.lua
+++ b/vshard/storage/init.lua
@@ -5,6 +5,7 @@ local lmsgpack = require('msgpack')
 local netbox = require('net.box') -- for net.box:self()
 local trigger = require('internal.trigger')
 local ffi = require('ffi')
+local json_encode = require('json').encode
 local yaml_encode = require('yaml').encode
 local fiber_clock = lfiber.clock
 local fiber_yield = lfiber.yield
@@ -932,6 +933,7 @@ local function recovery_step_by_type(type, limiter)
     local recovered = 0
     local total = 0
     local start_format = 'Starting %s buckets recovery step'
+    local recovered_buckets = {SENT = {}, GARBAGE = {}, ACTIVE = {}}
     for _, bucket in _bucket.index.status:pairs(type) do
         lfiber.testcancel()
         total = total + 1
@@ -992,12 +994,15 @@ local function recovery_step_by_type(type, limiter)
         if recovery_local_bucket_is_sent(bucket, remote_bucket) then
            _bucket:update({bucket_id}, {{'=', 2, BSENT}})
            recovered = recovered + 1
+           table.insert(recovered_buckets['SENT'], bucket_id)
         elseif recovery_local_bucket_is_garbage(bucket, remote_bucket) then
            _bucket:update({bucket_id}, {{'=', 2, BGARBAGE}})
            recovered = recovered + 1
+           table.insert(recovered_buckets['GARBAGE'], bucket_id)
         elseif recovery_local_bucket_is_active(bucket, remote_bucket) then
            _bucket:replace({bucket_id, BACTIVE})
            recovered = recovered + 1
+           table.insert(recovered_buckets['ACTIVE'], bucket_id)
         elseif is_step_empty then
            log.info('Bucket %s is %s local and %s on replicaset %s, waiting',
                     bucket_id, bucket.status, remote_bucket.status, peer_id)
@@ -1007,7 +1012,8 @@
     end
     if recovered > 0 then
         log.info('Finish bucket recovery step, %d %s buckets are recovered '..
-                 'among %d', recovered, type, total)
+                 'among %d. Recovered buckets: %s', recovered, type, total,
+                 json_encode(recovered_buckets))
     end
     return total, recovered
 end

From 18585367c3c211aeb73332fa4e5d23f792372781 Mon Sep 17 00:00:00 2001
From: Roman Gromov
Date: Tue, 9 Sep 2025 16:28:55 +0300
Subject: [PATCH 3/5] rebalancer: add logging of routes

This patch adds logging of the rebalancer routes. The log now includes
the source storage, the number of buckets to move, and the destination
storage to which the buckets will be moved.

Since the wording of the rebalancer's "routes are sent" message has
changed, the `rebalancer/rebalancer.test.lua` and
`rebalancer/stress_add_remove_several_rs.test.lua` tests are updated
accordingly.

Part of tarantool/vshard#212

NO_DOC=bugfix
---
 test/rebalancer/rebalancer.result           |  4 ++--
 test/rebalancer/rebalancer.test.lua         |  4 ++--
 .../stress_add_remove_several_rs.result     |  4 ++--
 .../stress_add_remove_several_rs.test.lua   |  4 ++--
 test/storage-luatest/storage_1_1_1_test.lua | 20 +++++++++++++++++++
 vshard/storage/init.lua                     |  5 +++--
 6 files changed, 31 insertions(+), 10 deletions(-)

diff --git a/test/rebalancer/rebalancer.result b/test/rebalancer/rebalancer.result
index 8a6ebf9f..148ddbfc 100644
--- a/test/rebalancer/rebalancer.result
+++ b/test/rebalancer/rebalancer.result
@@ -149,7 +149,7 @@ test_run:switch('box_1_a')
 vshard.storage.rebalancer_enable()
 ---
 ...
-wait_rebalancer_state("Rebalance routes are sent", test_run)
+wait_rebalancer_state("The following rebalancer routes were sent", test_run)
 ---
 ...
 wait_rebalancer_state('The cluster is balanced ok', test_run)
@@ -239,7 +239,7 @@ cfg.rebalancer_disbalance_threshold = 0.01
 vshard.storage.cfg(cfg, util.name_to_uuid.box_1_a)
 ---
 ...
-wait_rebalancer_state('Rebalance routes are sent', test_run)
+wait_rebalancer_state('The following rebalancer routes were sent', test_run)
 ---
 ...
 wait_rebalancer_state('The cluster is balanced ok', test_run)
diff --git a/test/rebalancer/rebalancer.test.lua b/test/rebalancer/rebalancer.test.lua
index ec7ebcf2..bc67145e 100644
--- a/test/rebalancer/rebalancer.test.lua
+++ b/test/rebalancer/rebalancer.test.lua
@@ -78,7 +78,7 @@ util.map_bucket_protection(test_run, {REPLICASET_1}, true)

 test_run:switch('box_1_a')
 vshard.storage.rebalancer_enable()
-wait_rebalancer_state("Rebalance routes are sent", test_run)
+wait_rebalancer_state("The following rebalancer routes were sent", test_run)
 wait_rebalancer_state('The cluster is balanced ok', test_run)

 _bucket.index.status:count({vshard.consts.BUCKET.ACTIVE})
@@ -118,7 +118,7 @@ _bucket.index.status:count({vshard.consts.BUCKET.ACTIVE})
 -- Return 1%.
 cfg.rebalancer_disbalance_threshold = 0.01
 vshard.storage.cfg(cfg, util.name_to_uuid.box_1_a)
-wait_rebalancer_state('Rebalance routes are sent', test_run)
+wait_rebalancer_state('The following rebalancer routes were sent', test_run)
 wait_rebalancer_state('The cluster is balanced ok', test_run)
 _bucket.index.status:count({vshard.consts.BUCKET.ACTIVE})
 _bucket.index.status:min({vshard.consts.BUCKET.ACTIVE})
diff --git a/test/rebalancer/stress_add_remove_several_rs.result b/test/rebalancer/stress_add_remove_several_rs.result
index 6a9b0ffb..194c99c4 100644
--- a/test/rebalancer/stress_add_remove_several_rs.result
+++ b/test/rebalancer/stress_add_remove_several_rs.result
@@ -175,7 +175,7 @@ add_replicaset()
 vshard.storage.cfg(cfg, util.name_to_uuid.box_1_a)
 ---
 ...
-wait_rebalancer_state('Rebalance routes are sent', test_run)
+wait_rebalancer_state('The following rebalancer routes were sent', test_run)
 ---
 ...
 -- Now, add a second replicaset.
@@ -422,7 +422,7 @@ remove_second_replicaset_first_stage()
 vshard.storage.cfg(cfg, util.name_to_uuid.box_1_a)
 ---
 ...
-wait_rebalancer_state('Rebalance routes are sent', test_run)
+wait_rebalancer_state('The following rebalancer routes were sent', test_run)
 ---
 ...
 -- Rebalancing has been started - now remove second replicaset.
diff --git a/test/rebalancer/stress_add_remove_several_rs.test.lua b/test/rebalancer/stress_add_remove_several_rs.test.lua
index f62400f2..7c1ae3bc 100644
--- a/test/rebalancer/stress_add_remove_several_rs.test.lua
+++ b/test/rebalancer/stress_add_remove_several_rs.test.lua
@@ -71,7 +71,7 @@ fiber.sleep(0.5)
 test_run:switch('box_1_a')
 add_replicaset()
 vshard.storage.cfg(cfg, util.name_to_uuid.box_1_a)
-wait_rebalancer_state('Rebalance routes are sent', test_run)
+wait_rebalancer_state('The following rebalancer routes were sent', test_run)

 -- Now, add a second replicaset.

@@ -153,7 +153,7 @@ fiber.sleep(0.5)
 test_run:switch('box_1_a')
 remove_second_replicaset_first_stage()
 vshard.storage.cfg(cfg, util.name_to_uuid.box_1_a)
-wait_rebalancer_state('Rebalance routes are sent', test_run)
+wait_rebalancer_state('The following rebalancer routes were sent', test_run)
 -- Rebalancing has been started - now remove second replicaset.
 remove_replicaset_first_stage()
 vshard.storage.cfg(cfg, util.name_to_uuid.box_1_a)
diff --git a/test/storage-luatest/storage_1_1_1_test.lua b/test/storage-luatest/storage_1_1_1_test.lua
index 04ba357f..cc891b18 100644
--- a/test/storage-luatest/storage_1_1_1_test.lua
+++ b/test/storage-luatest/storage_1_1_1_test.lua
@@ -200,3 +200,23 @@ rebalancer_recovery_group.test_no_logs_while_unsuccess_recovery = function(g)
     wait_for_bucket_is_transferred(g.replica_2_a, g.replica_1_a,
                                    hanged_bucket_id_2)
 end
+
+rebalancer_recovery_group.test_rebalancer_routes_logging = function(g)
+    local moved_bucket_from_2 = vtest.storage_first_bucket(g.replica_2_a)
+    start_bucket_move(g.replica_2_a, g.replica_1_a, moved_bucket_from_2)
+    local moved_bucket_from_3 = vtest.storage_first_bucket(g.replica_3_a)
+    start_bucket_move(g.replica_3_a, g.replica_1_a, moved_bucket_from_3)
+    t.helpers.retrying({timeout = 60}, function()
+        g.replica_1_a:exec(function() ivshard.storage.rebalancer_wakeup() end)
+        t.assert(g.replica_1_a:grep_log('Apply rebalancer routes with 1 ' ..
+                                        'workers'))
+    end)
+    local rebalancer_routes_msg = string.format(
+        "{\"%s\":{\"%s\":1,\"%s\":1}}", g.replica_1_a:replicaset_uuid(),
+        g.replica_3_a:replicaset_uuid(), g.replica_2_a:replicaset_uuid())
+    t.helpers.retrying({}, function()
+        t.assert(g.replica_1_a:grep_log(rebalancer_routes_msg))
+        g.replica_1_a:exec(function() ivshard.storage.rebalancer_wakeup() end)
+        t.assert(g.replica_1_a:grep_log('The cluster is balanced ok.'))
+    end)
+end
diff --git a/vshard/storage/init.lua b/vshard/storage/init.lua
index 9d3a10db..bf26fa2a 100644
--- a/vshard/storage/init.lua
+++ b/vshard/storage/init.lua
@@ -2909,8 +2909,9 @@ local function rebalancer_service_f(service, limiter)
                 goto continue
             end
         end
-        log.info('Rebalance routes are sent. Schedule next wakeup after '..
-                 '%f seconds', consts.REBALANCER_WORK_INTERVAL)
+        log.info('The following rebalancer routes were sent: %s. ' ..
+                 'Schedule next wakeup after %f seconds', json_encode(routes),
+                 consts.REBALANCER_WORK_INTERVAL)
         service:set_activity('idling')
         lfiber.testcancel()
         lfiber.sleep(consts.REBALANCER_WORK_INTERVAL)

From 71947f467fb26f5e186872ba1b2bd1298c86bd6a Mon Sep 17 00:00:00 2001
From: Roman Gromov
Date: Fri, 7 Nov 2025 19:10:38 +0300
Subject: [PATCH 4/5] rebalancer: refactor rebalancer_request_state

Before this patch the function `rebalancer_request_state` returned only
nil in case of errors (e.g. the presence of SENDING, RECEIVING or
GARBAGE buckets, or an inactive rebalancer). This made it inconsistent
with the other rebalancer functions. Now, in case of an error, we
return `(nil, err)` instead of a bare `nil`, which lets the rebalancer
service propagate a meaningful error.

In addition, we introduce a new vshard error, `BUCKET_INVALID_STATE`,
for the case when a storage has non-active buckets during rebalancing.

We also remove the "Some buckets are not active" message from
`rebalancer_service_f`, because a meaningful error about non-active
buckets is already returned from `rebalancer_download_states`.
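
For illustration, a caller of the internal API can now see why the state
was not collected. A minimal sketch (the `log` call and the calling
context are illustrative, not part of this patch;
`_rebalancer_request_state` is the internal alias exercised by the unit
tests):

    local log = require('log')
    local state, err = vshard.storage._rebalancer_request_state()
    if state == nil then
        -- Instead of a silent bare nil, `err` is now an error object,
        -- e.g. BUCKET_INVALID_STATE with the `replica_id` and
        -- `buckets_state` fields.
        log.info('Cannot collect rebalancer state: %s', err)
    end
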
NO_TEST=refactoring
NO_DOC=refactoring
---
 test/rebalancer/rebalancer.result   |  9 +++++----
 test/rebalancer/rebalancer.test.lua |  6 +++++-
 test/unit/rebalancer.result         | 16 +++++++++++++---
 test/unit/rebalancer.test.lua       |  6 +++++-
 vshard/error.lua                    |  5 +++++
 vshard/storage/init.lua             | 22 ++++++++++++----------
 6 files changed, 45 insertions(+), 19 deletions(-)

diff --git a/test/rebalancer/rebalancer.result b/test/rebalancer/rebalancer.result
index 148ddbfc..59728cd2 100644
--- a/test/rebalancer/rebalancer.result
+++ b/test/rebalancer/rebalancer.result
@@ -318,12 +318,13 @@ _bucket:update({150}, {{'=', 2, vshard.consts.BUCKET.RECEIVING}})
 ---
 - [150, 'receiving']
 ...
-wait_rebalancer_state("Some buckets are not active", test_run)
----
-...
+-- We should not check for a specific bucket status (e.g. receiving), because
+-- in rare cases we can accidentally get the wrong one. For example, we may
+-- wait for "receiving" but get "garbage" due to a previous rebalancer error.
+wait_rebalancer_state('Error during downloading rebalancer states:.*' .. \
+'BUCKET_INVALID_STATE', test_run) \
 _bucket:update({150}, {{'=', 2, vshard.consts.BUCKET.ACTIVE}})
 ---
-- [150, 'active']
 ...
 vshard.storage.sync()
 ---
 ...
diff --git a/test/rebalancer/rebalancer.test.lua b/test/rebalancer/rebalancer.test.lua
index bc67145e..dfe97198 100644
--- a/test/rebalancer/rebalancer.test.lua
+++ b/test/rebalancer/rebalancer.test.lua
@@ -156,7 +156,11 @@ util.map_bucket_protection(test_run, {REPLICASET_1}, false)
 test_run:switch('box_1_a')
 vshard.storage.rebalancer_enable()
 _bucket:update({150}, {{'=', 2, vshard.consts.BUCKET.RECEIVING}})
-wait_rebalancer_state("Some buckets are not active", test_run)
+-- We should not check for a specific bucket status (e.g. receiving), because
+-- in rare cases we can accidentally get the wrong one. For example, we may
+-- wait for "receiving" but get "garbage" due to a previous rebalancer error.
+wait_rebalancer_state('Error during downloading rebalancer states:.*' .. \
+'BUCKET_INVALID_STATE', test_run) \
 _bucket:update({150}, {{'=', 2, vshard.consts.BUCKET.ACTIVE}})
 vshard.storage.sync()

diff --git a/test/unit/rebalancer.result b/test/unit/rebalancer.result
index d312a411..236fc0ec 100644
--- a/test/unit/rebalancer.result
+++ b/test/unit/rebalancer.result
@@ -290,9 +290,11 @@ build_routes(replicasets)
 vshard.storage.internal.is_master = true
 ---
 ...
-get_state = vshard.storage._rebalancer_request_state
----
-...
+get_state = function() \
+    local res, err = vshard.storage._rebalancer_request_state() \
+    if res == nil then err.trace = nil end \
+    return res, err \
+end \
 _bucket = box.schema.create_space('_bucket')
 ---
 ...
@@ -318,6 +320,7 @@ get_state()
 ---
 - bucket_active_count: 2
   bucket_pinned_count: 0
+- null
 ...
 _bucket:replace{1, consts.BUCKET.RECEIVING}
 ---
@@ -324,6 +327,13 @@
 ...
 get_state()
 ---
+- null
+- buckets_state: receiving
+  code: 42
+  replica_id: _
+  type: ShardingError
+  message: Replica _ has receiving buckets
+  name: BUCKET_INVALID_STATE
 ...
 vshard.storage.internal.is_master = false
 ---
 ...
diff --git a/test/unit/rebalancer.test.lua b/test/unit/rebalancer.test.lua
index e6d54b81..4a6d556c 100644
--- a/test/unit/rebalancer.test.lua
+++ b/test/unit/rebalancer.test.lua
@@ -76,7 +76,11 @@ build_routes(replicasets)
 -- Test rebalancer local state.
 --
 vshard.storage.internal.is_master = true
-get_state = vshard.storage._rebalancer_request_state
+get_state = function() \
+    local res, err = vshard.storage._rebalancer_request_state() \
+    if res == nil then err.trace = nil end \
+    return res, err \
+end \
 _bucket = box.schema.create_space('_bucket')
 pk = _bucket:create_index('pk')
 status = _bucket:create_index('status', {parts = {{2, 'string'}}, unique = false})
diff --git a/vshard/error.lua b/vshard/error.lua
index 0892758c..5ffebfc5 100644
--- a/vshard/error.lua
+++ b/vshard/error.lua
@@ -207,6 +207,11 @@ local error_message_template = {
         msg = 'Mismatch server name: expected "%s", but got "%s"',
         args = {'expected_name', 'actual_name'},
     },
+    [42] = {
+        name = 'BUCKET_INVALID_STATE',
+        msg = 'Replica %s has %s buckets',
+        args = {'replica_id', 'buckets_state'},
+    },
 }

 --
diff --git a/vshard/storage/init.lua b/vshard/storage/init.lua
index bf26fa2a..b842b588 100644
--- a/vshard/storage/init.lua
+++ b/vshard/storage/init.lua
@@ -2851,12 +2851,10 @@ local function rebalancer_service_f(service, limiter)
             return
         end
         if not status or replicasets == nil then
-            local err = status and total_bucket_active_count or replicasets
-            if err then
-                limiter:log_error(err, service:set_status_error(
-                    'Error during downloading rebalancer states: %s', err))
-            end
-            log.info('Some buckets are not active, retry rebalancing later')
+            local err = total_bucket_active_count
+            limiter:log_error(err, service:set_status_error(
+                'Error during downloading rebalancer states: %s, ' ..
+                'retry rebalancing later', err))
             service:set_activity('idling')
             lfiber.testcancel()
             lfiber.sleep(consts.REBALANCER_WORK_INTERVAL)
@@ -2945,18 +2943,22 @@ local function rebalancer_request_state()
         return nil, err
     end
     if not M.is_rebalancer_active or rebalancing_is_in_progress() then
-        return
+        return nil, lerror.make('Rebalancer is not active or is in progress')
     end
     local _bucket = box.space._bucket
     local status_index = _bucket.index.status
+    local repl_id = M.this_replica and M.this_replica.id or '_'
     if #status_index:select({BSENDING}, {limit = 1}) > 0 then
-        return
+        return nil, lerror.vshard(lerror.code.BUCKET_INVALID_STATE,
+                                  repl_id, 'sending')
     end
     if #status_index:select({BRECEIVING}, {limit = 1}) > 0 then
-        return
+        return nil, lerror.vshard(lerror.code.BUCKET_INVALID_STATE,
+                                  repl_id, 'receiving')
     end
     if #status_index:select({BGARBAGE}, {limit = 1}) > 0 then
-        return
+        return nil, lerror.vshard(lerror.code.BUCKET_INVALID_STATE,
+                                  repl_id, 'garbage')
     end
     return {
         bucket_active_count = status_index:count({BACTIVE}),

From 16b39a2ec479e737b358236abed02eb59840407d Mon Sep 17 00:00:00 2001
From: Roman Gromov
Date: Fri, 7 Nov 2025 19:10:42 +0300
Subject: [PATCH 5/5] rebalancer: add replicaset.id to
 rebalancer_request_state errors

Before this patch the function `rebalancer_download_states` didn't
return any information about the replicaset from which the states could
not be downloaded. As a result, the errors returned from
`rebalancer_download_states` lacked valuable information about the
unhealthy replicaset. Now we add replicaset.id to the error returned
from `rebalancer_download_states`, which lets the rebalancer service
report the problematic replicaset.
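
For example, on the consuming side the enriched error now pinpoints the
replicaset. A simplified sketch (the `log.error` call and the
surrounding control flow are illustrative; the real call site in
`rebalancer_service_f` goes through the service status and the log rate
limiter):

    local log = require('log')
    -- On failure `rebalancer_download_states()` returns `nil, err`,
    -- and `err.replicaset_id` now names the replicaset whose state
    -- could not be downloaded.
    local replicasets, err = rebalancer_download_states()
    if replicasets == nil and err ~= nil then
        log.error('Error during downloading rebalancer states: %s', err)
    end
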
Closes #212

NO_DOC=bugfix
---
 test/storage-luatest/storage_1_1_1_test.lua | 17 +++++++++++++++++
 vshard/storage/init.lua                     |  1 +
 2 files changed, 18 insertions(+)

diff --git a/test/storage-luatest/storage_1_1_1_test.lua b/test/storage-luatest/storage_1_1_1_test.lua
index cc891b18..dc03a180 100644
--- a/test/storage-luatest/storage_1_1_1_test.lua
+++ b/test/storage-luatest/storage_1_1_1_test.lua
@@ -220,3 +220,20 @@ rebalancer_recovery_group.test_rebalancer_routes_logging = function(g)
         g.replica_1_a:exec(function() ivshard.storage.rebalancer_wakeup() end)
         t.assert(g.replica_1_a:grep_log('The cluster is balanced ok.'))
     end)
 end
+
+rebalancer_recovery_group.test_no_log_spam_when_buckets_no_active = function(g)
+    local moved_bucket = vtest.storage_first_bucket(g.replica_2_a)
+    start_bucket_move(g.replica_1_a, g.replica_2_a, moved_bucket)
+    wait_for_bucket_is_transferred(g.replica_1_a, g.replica_2_a, moved_bucket)
+    vtest.storage_stop(g.replica_2_a)
+    local err_log = string.format('Error during downloading rebalancer ' ..
+                                  'states:.*"replicaset_id":"%s"',
+                                  g.replica_2_a:replicaset_uuid())
+    t.helpers.retrying({timeout = 60}, function()
+        g.replica_1_a:exec(function() ivshard.storage.rebalancer_wakeup() end)
+        t.assert(g.replica_1_a:grep_log(err_log))
+    end)
+    vtest.storage_start(g.replica_2_a, global_cfg)
+    start_bucket_move(g.replica_2_a, g.replica_1_a, moved_bucket)
+    wait_for_bucket_is_transferred(g.replica_2_a, g.replica_1_a, moved_bucket)
+end
diff --git a/vshard/storage/init.lua b/vshard/storage/init.lua
index b842b588..213eef23 100644
--- a/vshard/storage/init.lua
+++ b/vshard/storage/init.lua
@@ -2806,6 +2806,7 @@ local function rebalancer_download_states()
             replicaset, 'vshard.storage.rebalancer_request_state', {},
             {timeout = consts.REBALANCER_GET_STATE_TIMEOUT})
         if state == nil then
+            err.replicaset_id = replicaset.id
             return nil, err
         end
         local bucket_count = state.bucket_active_count +