@@ -406,7 +406,11 @@ type FaultConfig struct {
406406 // RemoveNetworkPartition is an operation that simulates healing a network
407407 // partition.
408408 RemoveNetworkPartition int
409- // Disk stalls and node crashes belong here.
409+ // StopNode is an operation that stops a randomly chosen node.
410+ StopNode int
411+ // RestartNode is an operation that restarts a randomly chosen node.
412+ RestartNode int
413+ // Disk stalls and other faults belong here.
410414}
411415
412416// newAllOperationsConfig returns a GeneratorConfig that exercises *all*
@@ -531,6 +535,8 @@ func newAllOperationsConfig() GeneratorConfig {
531535 Fault : FaultConfig {
532536 AddNetworkPartition : 1 ,
533537 RemoveNetworkPartition : 1 ,
538+ StopNode : 1 ,
539+ RestartNode : 1 ,
534540 },
535541 }}
536542}
@@ -625,10 +631,12 @@ func NewDefaultConfig() GeneratorConfig {
625631 config .Ops .ClosureTxn .TxnClientOps .FlushLockTable = 0
626632 config .Ops .ClosureTxn .TxnBatchOps .Ops .FlushLockTable = 0
627633
628- // Network partitions can result in unavailability and need to be enabled with
629- // care by specific test variants.
634+ // Network partitions and node restarts can result in unavailability and need
635+ // to be enabled with care by specific test variants.
630636 config .Ops .Fault .AddNetworkPartition = 0
631637 config .Ops .Fault .RemoveNetworkPartition = 0
638+ config .Ops .Fault .StopNode = 0
639+ config .Ops .Fault .RestartNode = 0
632640 return config
633641}
634642
@@ -679,7 +687,7 @@ type Generator struct {
679687
680688// MakeGenerator constructs a Generator.
681689func MakeGenerator (
682- config GeneratorConfig , replicasFn GetReplicasFn , mode TestMode ,
690+ config GeneratorConfig , replicasFn GetReplicasFn , mode TestMode , n * nodes ,
683691) (* Generator , error ) {
684692 if config .NumNodes <= 0 {
685693 return nil , errors .Errorf (`NumNodes must be positive got: %d` , config .NumNodes )
@@ -716,6 +724,7 @@ func MakeGenerator(
716724 currentSplits : make (map [string ]struct {}),
717725 historicalSplits : make (map [string ]struct {}),
718726 partitions : p ,
727+ nodes : n ,
719728 mode : mode ,
720729 }
721730 return g , nil
@@ -755,6 +764,9 @@ type generator struct {
755764 // between nodes.
756765 partitions
757766
767+ // nodes contains the sets of running and stopped nodes.
768+ nodes * nodes
769+
758770 // mode is the test mode (e.g. Liveness or Safety). The generator needs it in
759771 // order to set a timeout for range lookups under safety mode.
760772 mode TestMode
@@ -770,6 +782,52 @@ type partitions struct {
770782 partitioned map [connection ]struct {}
771783}
772784
785+ // nodes contains the sets of running and stopped nodes. This struct is shared
786+ // between the generator and the applier to make sure nodes are promptly marked
787+ // as running/stopped when operations are generated/applied. The generator uses
788+ // removeRandRunning and removeRandStopped to pick nodes to stop/restart, and
789+ // the applier uses setRunning and setStopped to update the sets when operations
790+ // are actually applied. This is important because there could be a gap of
791+ // multiple seconds between generating a stop/restart operation and a node fully
792+ // stopping/restarting.
793+ type nodes struct {
794+ mu syncutil.RWMutex
795+ running map [int ]struct {}
796+ stopped map [int ]struct {}
797+ }
798+
799+ func randNodeFromMap (m map [int ]struct {}, rng * rand.Rand ) int {
800+ return maps .Keys (m )[rng .Intn (len (m ))]
801+ }
802+
803+ func (n * nodes ) removeRandRunning (rng * rand.Rand ) int {
804+ n .mu .Lock ()
805+ defer n .mu .Unlock ()
806+ nodeID := randNodeFromMap (n .running , rng )
807+ delete (n .running , nodeID )
808+ return nodeID
809+ }
810+
811+ func (n * nodes ) removeRandStopped (rng * rand.Rand ) int {
812+ n .mu .Lock ()
813+ defer n .mu .Unlock ()
814+ nodeID := randNodeFromMap (n .stopped , rng )
815+ delete (n .stopped , nodeID )
816+ return nodeID
817+ }
818+
819+ func (n * nodes ) setRunning (nodeID int ) {
820+ n .mu .Lock ()
821+ defer n .mu .Unlock ()
822+ n .running [nodeID ] = struct {}{}
823+ }
824+
825+ func (n * nodes ) setStopped (nodeID int ) {
826+ n .mu .Lock ()
827+ defer n .mu .Unlock ()
828+ n .stopped [nodeID ] = struct {}{}
829+ }
830+
773831// RandStep returns a single randomly generated next operation to execute.
774832//
775833// RandStep is not concurrency safe.
@@ -851,6 +909,12 @@ func (g *generator) RandStep(rng *rand.Rand) Step {
851909 addOpGen (& allowed , toggleGlobalReads , g .Config .Ops .ChangeZone .ToggleGlobalReads )
852910 addOpGen (& allowed , addRandNetworkPartition , g .Config .Ops .Fault .AddNetworkPartition )
853911 addOpGen (& allowed , removeRandNetworkPartition , g .Config .Ops .Fault .RemoveNetworkPartition )
912+ if len (g .nodes .running ) > 0 {
913+ addOpGen (& allowed , stopRandNode , g .Config .Ops .Fault .StopNode )
914+ }
915+ if len (g .nodes .stopped ) > 0 {
916+ addOpGen (& allowed , restartRandNode , g .Config .Ops .Fault .RestartNode )
917+ }
854918
855919 return step (g .selectOp (rng , allowed ))
856920}
@@ -1759,6 +1823,16 @@ func removeRandNetworkPartition(g *generator, rng *rand.Rand) Operation {
17591823 return removeNetworkPartition (randConn .from , randConn .to )
17601824}
17611825
1826+ func stopRandNode (g * generator , rng * rand.Rand ) Operation {
1827+ randNode := g .nodes .removeRandRunning (rng )
1828+ return stopNode (randNode )
1829+ }
1830+
1831+ func restartRandNode (g * generator , rng * rand.Rand ) Operation {
1832+ randNode := g .nodes .removeRandStopped (rng )
1833+ return restartNode (randNode )
1834+ }
1835+
17621836func makeRandBatch (c * ClientOperationConfig ) opGenFunc {
17631837 return func (g * generator , rng * rand.Rand ) Operation {
17641838 var allowed []opGen
@@ -2380,6 +2454,14 @@ func removeNetworkPartition(from int, to int) Operation {
23802454 }
23812455}
23822456
2457+ func stopNode (nodeID int ) Operation {
2458+ return Operation {StopNode : & StopNodeOperation {NodeId : int32 (nodeID )}}
2459+ }
2460+
2461+ func restartNode (nodeID int ) Operation {
2462+ return Operation {RestartNode : & RestartNodeOperation {NodeId : int32 (nodeID )}}
2463+ }
2464+
23832465type countingRandSource struct {
23842466 count atomic.Uint64
23852467 inner rand.Source64
0 commit comments