recovery: reduce spam of "Finish bucket recovery step" logs

mrForza · mrForza · commit b9e515358b87 · 2025-09-10T10:45:44.000+03:00
Before this patch, "Finish bucket recovery step ..." logs were printed at the end of recovery even if no buckets were successfully recovered, which led to unnecessary log records. This patch fixes the issue by adding a check for the number of recovered buckets. We also print the id of every successfully recovered bucket in order to track bucket duplication. Part of #212 NO_DOC=bugfix
diff --git a/test/storage-luatest/storage_1_1_1_test.lua b/test/storage-luatest/storage_1_1_1_test.lua
@@ -101,3 +101,95 @@ test_group.test_manual_bucket_send_doubled_buckets = function(g)
         ilt.assert_equals(box.space._bucket:get(bid), nil)
     end, {bid})
 end
+
+local rebalancer_recovery_group = t.group('rebalancer-recovery-logging')
+
+--
+-- Start sending the bucket from the source storage to the destination one and
+-- wait until the bucket's record shows up in the destination's _bucket space.
+-- The result of bucket_send() is not checked here.
+--
+local function start_bucket_move(src_storage, dest_storage, bucket_id)
+    src_storage:exec(function(bucket_id, replicaset_id)
+        ivshard.storage.bucket_send(bucket_id, replicaset_id)
+    end, {bucket_id, dest_storage:replicaset_uuid()})
+
+    dest_storage:exec(function(bucket_id)
+        t.helpers.retrying({timeout = 10}, function()
+            -- select() returns a table even when nothing is found, so a plain
+            -- truthiness check on it can never fail. Compare with an empty
+            -- result instead to really wait for the record.
+            t.assert_not_equals(box.space._bucket:select(bucket_id), {})
+        end)
+    end, {bucket_id})
+end
+
+--
+-- Check that the bucket eventually has no record in the _bucket space of
+-- src_storage and has the 'active' status on dest_storage.
+--
+local function assert_bucket_is_transferred(src_storage, dest_storage,
+                                            bucket_id)
+    src_storage:exec(function(bid)
+        t.helpers.retrying({}, function()
+            local found = box.space._bucket:select(bid)
+            t.assert_equals(found, {})
+        end)
+    end, {bucket_id})
+    dest_storage:exec(function(bid)
+        t.helpers.retrying({}, function()
+            local found = box.space._bucket:select(bid)
+            -- found[1] is nil while the bucket is absent, so this line raises
+            -- and the check is attempted again by retrying().
+            t.assert_equals(found[1].status, 'active')
+        end)
+    end, {bucket_id})
+end
+
+--
+-- Create and bootstrap a cluster with a router for this group, then create
+-- 'test_space' with a non-unique 'bucket_id' index on every master.
+--
+rebalancer_recovery_group.before_all(function(g)
+    global_cfg = vtest.config_new(cfg_template)
+    vtest.cluster_new(g, global_cfg)
+    g.router = vtest.router_new(g, 'router', global_cfg)
+    vtest.cluster_bootstrap(g, global_cfg)
+    vtest.cluster_wait_vclock_all(g)
+
+    vtest.cluster_exec_each_master(g, function()
+        local space = box.schema.create_space('test_space')
+        space:format({
+            {name = 'pk', type = 'unsigned'},
+            {name = 'bucket_id', type = 'unsigned'},
+        })
+        space:create_index('primary', {parts = {'pk'}})
+        space:create_index('bucket_id', {
+            parts = {'bucket_id'},
+            unique = false,
+        })
+    end)
+end)
+
+-- Drop the cluster stored in the group context once the group is finished.
+rebalancer_recovery_group.after_all(function(g)
+    g.cluster:drop()
+end)
+
+--
+-- Improve logging of rebalancer and recovery (gh-212).
+--
+-- While nothing can be recovered, 'Finish bucket recovery step' must not be
+-- logged. Once the buckets are recovered, the message is expected together
+-- with the list of the recovered bucket ids.
+--
+rebalancer_recovery_group.test_no_logs_while_unsuccess_recovery = function(g)
+    g.replica_2_a:exec(function()
+        -- Make 'recovery_bucket_stat' requests fail on the receiver. The
+        -- error injection presumably interrupts receiving of a bucket in the
+        -- middle - confirm against the storage code.
+        ivshard.storage.internal.errinj.ERRINJ_RECEIVE_PARTIALLY = true
+        rawset(_G, 'old_call', ivshard.storage._call)
+        ivshard.storage._call = function(service_name, ...)
+            if service_name == 'recovery_bucket_stat' then
+                return error('TimedOut')
+            end
+            return _G.old_call(service_name, ...)
+        end
+    end)
+    local hanged_bucket_id_1 = vtest.storage_first_bucket(g.replica_1_a)
+    start_bucket_move(g.replica_1_a, g.replica_2_a, hanged_bucket_id_1)
+    local hanged_bucket_id_2 = vtest.storage_first_bucket(g.replica_1_a)
+    start_bucket_move(g.replica_1_a, g.replica_2_a, hanged_bucket_id_2)
+    -- Build the log patterns from the actual bucket ids instead of relying on
+    -- them being 1 and 2.
+    local error_pattern = 'Error during recovery of bucket ' ..
+                          hanged_bucket_id_1
+    -- NOTE(review): assumes the ids are logged in the order the buckets were
+    -- picked above, as the previously hard-coded '[1,2]' did.
+    local recovered_pattern = 'Recovered buckets: %[' .. hanged_bucket_id_1 ..
+                              ',' .. hanged_bucket_id_2 .. '%]'
+    t.helpers.retrying({}, function()
+        t.assert(g.replica_1_a:grep_log(error_pattern))
+    end)
+    t.assert_not(g.replica_1_a:grep_log('Finish bucket recovery step, 0'))
+    -- Remove the injections and wake the background services up.
+    g.replica_2_a:exec(function()
+        ivshard.storage.internal.errinj.ERRINJ_RECEIVE_PARTIALLY = false
+        ivshard.storage._call = _G.old_call
+        ivshard.storage.garbage_collector_wakeup()
+        ivshard.storage.recovery_wakeup()
+    end)
+    g.replica_1_a:exec(function() ivshard.storage.recovery_wakeup() end)
+    t.helpers.retrying({}, function()
+        t.assert(g.replica_1_a:grep_log('Finish bucket recovery step, 2 ' ..
+                                        'sending buckets are recovered among'))
+        t.assert(g.replica_1_a:grep_log(recovered_pattern))
+    end)
+    -- Both buckets must end up absent on replica_2_a and active on
+    -- replica_1_a, hence the swapped storage arguments.
+    assert_bucket_is_transferred(g.replica_2_a, g.replica_1_a,
+                                 hanged_bucket_id_1)
+    assert_bucket_is_transferred(g.replica_2_a, g.replica_1_a,
+                                 hanged_bucket_id_2)
+end
diff --git a/vshard/storage/init.lua b/vshard/storage/init.lua
@@ -5,6 +5,7 @@ local lmsgpack = require('msgpack')
 local netbox = require('net.box') -- for net.box:self()
 local trigger = require('internal.trigger')
 local ffi = require('ffi')
+local json_encode = require('json').encode
 local yaml_encode = require('yaml').encode
 local fiber_clock = lfiber.clock
 local fiber_yield = lfiber.yield
@@ -931,6 +932,7 @@ local function recovery_step_by_type(type)
     local recovered = 0
     local total = 0
     local start_format = 'Starting %s buckets recovery step'
+    local recovered_buckets = {}
     for _, bucket in _bucket.index.status:pairs(type) do
         lfiber.testcancel()
         total = total + 1
@@ -1001,11 +1003,13 @@ local function recovery_step_by_type(type)
                      bucket_id, bucket.status, remote_bucket.status, peer_id)
         end
         is_step_empty = false
+        table.insert(recovered_buckets, bucket_id)
 ::continue::
     end
-    if not is_step_empty then
+    if recovered > 0 then
         log.info('Finish bucket recovery step, %d %s buckets are recovered '..
                  'among %d', recovered, type, total)
+        log.info('Recovered buckets: %s', json_encode(recovered_buckets))
     end
     return total, recovered
 end