@@ -103,6 +103,7 @@ StorageType == [
103103 gcAck : [ BucketIds -> SUBSET Storages ] ,
104104 sendWaitTarget : [ BucketIds -> [ Storages -> Nat ] ] ,
105105 sendingBuckets : SUBSET BucketIds ,
106+ masterWaitTarget : [ Storages -> Nat ] ,
106107 \* For limiting tests, it cannot be done outside of the module, since
107108 \* a test doesn't know, whether the bucket send or ref actually happened.
108109 errinj : [
@@ -158,6 +159,7 @@ StorageInit ==
158159 gcAck |-> [ b \in BucketIds |-> { } ] ,
159160 sendWaitTarget |-> [ b \in BucketIds |-> [ s \in Storages |-> 0 ] ] ,
160161 sendingBuckets |-> { } ,
162+ masterWaitTarget |-> [ s \in Storages |-> 0 ] ,
161163 errinj |-> [
162164 bucketSendCount |-> 0 ,
163165 bucketRWRefCount |-> 0 ,
@@ -188,6 +190,7 @@ StorageState(i) == [
188190 gcAck |-> storages [ i ] . gcAck ,
189191 sendWaitTarget |-> storages [ i ] . sendWaitTarget ,
190192 sendingBuckets |-> storages [ i ] . sendingBuckets ,
193+ masterWaitTarget |-> storages [ i ] . masterWaitTarget ,
191194 errinj |-> storages [ i ] . errinj
192195]
193196
@@ -200,7 +203,8 @@ StorageStateApply(i, state) ==
200203 VarSet ( i , "gcAck" , state . gcAck ,
201204 VarSet ( i , "sendWaitTarget" , state . sendWaitTarget ,
202205 VarSet ( i , "sendingBuckets" , state . sendingBuckets ,
203- VarSet ( i , "errinj" , state . errinj , storages ) ) ) ) ) ) ) ) )
206+ VarSet ( i , "masterWaitTarget" , state . masterWaitTarget ,
207+ VarSet ( i , "errinj" , state . errinj , storages ) ) ) ) ) ) ) ) ) )
204208 /\ network ' =
205209 [ s \in Storages |->
206210 [ t \in Storages |->
@@ -256,8 +260,28 @@ ProcessReplicationBucket(state, j) ==
256260(* Failover master change *)
257261(************************************************************************** *)
258262
263+ \* True iff this node's current vclock has reached the stored target.
264+ MasterSyncDone ( state ) ==
265+ \A s \in Storages : state . vclock [ s ] >= state . masterWaitTarget [ s ]
266+
267+ \* Max of a non-empty set of Nats.
268+ MaxNat ( S ) == CHOOSE m \in S : \A x \in S : m >= x
269+
270+ \* Compute the one-shot target when i becomes master:
271+ \* for every source s, the maximum vclock[s] seen among peers at that instant.
272+ MasterCatchupTarget ( i ) ==
273+ [ s \in Storages |->
274+ IF OtherReplicasInReplicaset ( i ) = { } THEN 0
275+ ELSE MaxNat ( { storages [ p ] . vclock [ s ] : p \in OtherReplicasInReplicaset ( i ) } )
276+ ]
277+
278+ \* Until master synchronizes with other repliacas, rebalancer and recovery
279+ \* are not allowed.
259280BecomeMaster ( state ) ==
260- IF ~ IsMaster ( state ) THEN [ state EXCEPT ! . status = "master" ] ELSE state
281+ IF ~ IsMaster ( state ) THEN
282+ LET target == MasterCatchupTarget ( state . id ) IN
283+ [ state EXCEPT ! . status = "master" , ! . masterWaitTarget = target ]
284+ ELSE state
261285
262286BecomeReplica ( state ) ==
263287 IF IsMaster ( state ) THEN [ state EXCEPT ! . status = "replica" ] ELSE state
@@ -315,6 +339,7 @@ BucketSendWaitAndSend(state, b, j) ==
315339 /\ b \in state . sendingBuckets
316340 /\ j # state . id
317341 /\ AllReplicasCaughtUp ( state , b )
342+ /\ MasterSyncDone ( state )
318343 THEN
319344 LET msg == [ type |-> "BUCKET_RECV" ,
320345 content |-> [ bucket |-> b , final |-> FALSE ] ]
@@ -333,7 +358,7 @@ BucketDropFromSending(state, b) ==
333358BucketSendStart ( state , b , j ) ==
334359 IF IsMaster ( state ) /\ state . buckets [ b ] . status = "ACTIVE" /\
335360 state . bucketRefs [ b ] . rw = 0 /\ ~ state . bucketRefs [ b ] . rw_lock /\
336- b \notin state . transferingBuckets /\ j # state . id
361+ b \notin state . transferingBuckets /\ j # state . id /\ MasterSyncDone ( state )
337362 THEN LET state1 == [ state EXCEPT
338363 ! . bucketRefs [ b ] . rw_lock = TRUE ,
339364 ! . transferingBuckets = @ \union { b } ,
@@ -348,7 +373,7 @@ BucketSendStart(state, b, j) ==
348373
349374BucketRecvStart ( state , j ) ==
350375 LET msg == Head ( state . networkReceive [ j ] ) IN
351- IF msg . type = "BUCKET_RECV" /\ ~ msg . content . final THEN
376+ IF msg . type = "BUCKET_RECV" /\ ~ msg . content . final /\ MasterSyncDone ( state ) THEN
352377 LET b == msg . content . bucket IN
353378 IF IsMaster ( state ) /\ state . buckets [ b ] . status = NULL /\
354379 b \notin state . transferingBuckets
@@ -376,7 +401,7 @@ BucketSendFinish(state, j) ==
376401 LET b == msg . content . bucket
377402 ok == msg . content . status
378403 IN
379- IF IsMaster ( state ) /\ b \in state . transferingBuckets THEN
404+ IF IsMaster ( state ) /\ b \in state . transferingBuckets /\ MasterSyncDone ( state ) THEN
380405 LET state1 == [ state EXCEPT ! . networkReceive [ j ] = Tail ( @ ) ] IN
381406 IF ok THEN
382407 \* Receiver accepted the bucket - mark as SENT
@@ -406,7 +431,7 @@ BucketRecvFinish(state, j) ==
406431 IF msg . type = "BUCKET_RECV" /\ msg . content . final THEN
407432 LET b == msg . content . bucket IN
408433 IF IsMaster ( state ) /\ state . buckets [ b ] . status = "RECEIVING" /\
409- b \notin state . transferingBuckets
434+ b \notin state . transferingBuckets /\ MasterSyncDone ( state )
410435 THEN LET state1 == [ state EXCEPT
411436 ! . networkReceive [ j ] = Tail ( @ ) ] IN
412437 LET state2 ==
@@ -432,6 +457,7 @@ RecoverySendStatRequest(state, b) ==
432457 /\ state . buckets [ b ] . status \in { "SENDING" , "RECEIVING" }
433458 /\ ~ ( b \in state . transferingBuckets )
434459 /\ dest_rs # NULL
460+ /\ MasterSyncDone ( state )
435461 THEN
436462 \* Choose any storage in the destination replicaset.
437463 LET candidates == { s \in Storages :
@@ -447,7 +473,7 @@ ProcessRecoveryStatRequest(state, j) ==
447473 LET msg == Head ( state . networkReceive [ j ] ) IN
448474 IF msg . type # "RECOVERY_BUCKET_STAT" THEN state
449475 ELSE
450- IF ~ IsMaster ( state ) THEN
476+ IF ~ IsMaster ( state ) \/ ~ MasterSyncDone ( state ) THEN
451477 \* Drop message if this node is not a master.
452478 [ state EXCEPT ! . networkReceive [ j ] = Tail ( @ ) ]
453479 ELSE
@@ -466,7 +492,7 @@ ProcessRecoveryStatResponse(state, j) ==
466492 LET msg == Head ( state . networkReceive [ j ] ) IN
467493 IF msg . type # "RECOVERY_BUCKET_STAT_RESPONSE" THEN state
468494 ELSE
469- IF ~ IsMaster ( state ) THEN
495+ IF ~ IsMaster ( state ) \/ ~ MasterSyncDone ( state ) THEN
470496 \* Drop message if this node is not a master.
471497 [ state EXCEPT ! . networkReceive [ j ] = Tail ( @ ) ]
472498 ELSE
0 commit comments