@@ -41,6 +41,7 @@ local cfg_template = {
4141 },
4242 bucket_count = 15 ,
4343 replication_timeout = 0.1 ,
44+ rebalancer_mode = ' auto' ,
4445}
4546local global_cfg
4647
@@ -101,3 +102,125 @@ test_group.test_manual_bucket_send_doubled_buckets = function(g)
101102 ilt .assert_equals (box .space ._bucket :get (bid ), nil )
102103 end , {bid })
103104end
105+
106+ local rebalancer_recovery_group = t .group (' rebalancer-recovery-logging' )
107+
108+ local function get_first_storage_bucket_id (storage )
109+ return storage :exec (function ()
110+ return box .space ._bucket :select ()[1 ].id
111+ end )
112+ end
113+
114+ local function move_bucket (src_storage , dest_storage , bucket_id )
115+ src_storage :exec (function (bucket_id , replicaset_id )
116+ ivshard .storage .bucket_send (bucket_id , replicaset_id )
117+ end , {bucket_id , dest_storage :replicaset_uuid ()})
118+
119+ dest_storage :exec (function (bucket_id )
120+ t .helpers .retrying ({timeout = 10 }, function ()
121+ t .assert (box .space ._bucket :select (bucket_id ))
122+ end )
123+ end , {bucket_id })
124+ end
125+
126+ --
127+ -- The string should be strictly in format: YEAR-MONTH-DAY HOUR:MIN:SEC.MS
128+ --
129+ local function convert_string_to_datetime (str )
130+ local date , time = string.match (str , " (%d+-%d+-%d+) (%d+:%d+:%d+.%d+)" )
131+ t .assert (date )
132+ local year , month , day = string.match (date , " ^(%d+)-(%d+)-(%d+)" )
133+ t .assert (year , " Unable to parse 'date' field of string" )
134+ local hour , min , sec , ms = string.match (time , " (%d+):(%d+):(%d+).(%d+)$" )
135+ t .assert (hour , " Unable to parse 'time' field of string" )
136+
137+ return require (" datetime" ).new {
138+ year = tonumber (year ), month = tonumber (month ), day = tonumber (day ),
139+ hour = tonumber (hour ), min = tonumber (min ), sec = tonumber (sec ),
140+ nsec = tonumber (ms )}
141+ end
142+
143+ local function test_only_one_record_appears_in_logs (server , record , wait_time )
144+ local first_log_record = nil
145+ t .helpers .retrying ({}, function ()
146+ first_log_record = server :grep_log (record )
147+ t .assert (first_log_record )
148+ end )
149+ local first_log_time = convert_string_to_datetime (first_log_record )
150+ -- We need to wait a bit in order to catch how much as possible
151+ -- spam in server's logs.
152+ require (" fiber" ).sleep (wait_time )
153+ local last_log_record = server :grep_log (record )
154+ t .assert (last_log_record )
155+ local last_log_time = convert_string_to_datetime (last_log_record )
156+ t .assert_equals (first_log_time , last_log_time ,
157+ " There are two identical records in logs" )
158+ end
159+
160+ rebalancer_recovery_group .before_all (function (g )
161+ global_cfg = vtest .config_new (cfg_template )
162+ vtest .cluster_new (g , global_cfg )
163+ g .router = vtest .router_new (g , ' router' , global_cfg )
164+ vtest .cluster_bootstrap (g , global_cfg )
165+ vtest .cluster_wait_vclock_all (g )
166+
167+ for _ , storage in pairs ({g .replica_1_a , g .replica_2_a , g .replica_3_a }) do
168+ storage :exec (function ()
169+ rawset (_G , ' create_test_space' , function ()
170+ box .schema .create_space (' test_space' )
171+ box .space .test_space :format ({
172+ {name = ' pk' , type = ' uuid' },
173+ {name = ' bucket_id' , type = ' unsigned' },
174+ })
175+ box .space .test_space :create_index (' primary' , {parts = {' pk' }})
176+ box .space .test_space :create_index (
177+ ' bucket_id' , {parts = {' bucket_id' }, unique = false })
178+ end )
179+ end )
180+ g .router :exec (function (bucket_id )
181+ ivshard .router .call (bucket_id , ' write' , ' create_test_space' )
182+ end , {get_first_storage_bucket_id (storage )})
183+ end
184+
185+ g .buckets_not_active_log = ' Some buckets are not active'
186+ g .buckets_not_active_pattern = " %d+-%d+-%d+ %d+:%d+:%d+.%d+ .* " ..
187+ g .buckets_not_active_log
188+ g .rebalancer_wait_interval = 0.01
189+ end )
190+
191+ rebalancer_recovery_group .after_all (function (g )
192+ g .cluster :drop ()
193+ end )
194+
195+ --
196+ -- Improve logging of rebalancer and recovery (gh-212).
197+ --
198+ rebalancer_recovery_group .test_no_logs_while_unsuccess_recovery = function (g )
199+ g .replica_2_a :exec (function ()
200+ ivshard .storage .internal .errinj .ERRINJ_RECEIVE_PARTIALLY = true
201+ rawset (_G , ' old_call' , ivshard .storage ._call )
202+ ivshard .storage ._call = function (service_name , ...)
203+ if service_name == ' recovery_bucket_stat' then
204+ return error (' TimedOut' )
205+ end
206+ return _G .old_call (service_name , ... )
207+ end
208+ end )
209+ local hanged_bucket_id = get_first_storage_bucket_id (g .replica_1_a )
210+ move_bucket (g .replica_1_a , g .replica_2_a , hanged_bucket_id )
211+ t .helpers .retrying ({}, function ()
212+ t .assert (g .replica_1_a :grep_log (' Error during recovery of bucket 1' ))
213+ end )
214+ t .assert_not (g .replica_1_a :grep_log (' Finish bucket recovery step, 0' ))
215+ g .replica_2_a :exec (function ()
216+ ivshard .storage .internal .errinj .ERRINJ_RECEIVE_PARTIALLY = false
217+ ivshard .storage ._call = _G .old_call
218+ ivshard .storage .recovery_wakeup ()
219+ end )
220+ g .replica_1_a :exec (function (bucket_id )
221+ t .helpers .retrying ({}, function ()
222+ t .assert_equals (box .space ._bucket :select (bucket_id )[1 ].status ,
223+ ' active' )
224+ end )
225+ end , {hanged_bucket_id })
226+ end
0 commit comments