@@ -60,6 +60,7 @@ type OperationConfig struct {
6060 ChangeLease ChangeLeaseConfig
6161 ChangeSetting ChangeSettingConfig
6262 ChangeZone ChangeZoneConfig
63+ Fault FaultConfig
6364}
6465
6566// ClosureTxnConfig configures the relative probability of running some
@@ -392,6 +393,20 @@ type SavepointConfig struct {
392393 SavepointRollback int
393394}
394395
// FaultConfig configures the relative probability of generating each kind of
// fault operation. A network partition may be symmetric or asymmetric, and
// partial or full, and setting one up can take more than one operation: for
// example, a symmetric partition between node A and node B requires two
// partitions, one from A to B and one from B to A.
type FaultConfig struct {
	// AddNetworkPartition is the relative probability of the operation that
	// simulates a network partition.
	AddNetworkPartition int
	// RemoveNetworkPartition is the relative probability of the operation
	// that simulates healing a network partition.
	RemoveNetworkPartition int
	// Other fault types, such as disk stalls and node crashes, belong here.
}
409+
395410// newAllOperationsConfig returns a GeneratorConfig that exercises *all*
396411// options. You probably want NewDefaultConfig. Most of the time, these will be
397412// the same, but having both allows us to merge code for operations that do not
@@ -511,6 +526,10 @@ func newAllOperationsConfig() GeneratorConfig {
511526 ChangeZone : ChangeZoneConfig {
512527 ToggleGlobalReads : 1 ,
513528 },
529+ Fault : FaultConfig {
530+ AddNetworkPartition : 1 ,
531+ RemoveNetworkPartition : 1 ,
532+ },
514533 }}
515534}
516535
@@ -603,6 +622,11 @@ func NewDefaultConfig() GeneratorConfig {
603622 config .Ops .ClosureTxn .CommitBatchOps .FlushLockTable = 0
604623 config .Ops .ClosureTxn .TxnClientOps .FlushLockTable = 0
605624 config .Ops .ClosureTxn .TxnBatchOps .Ops .FlushLockTable = 0
625+
626+ // Network partitions can result in unavailability and need to be enabled with
627+ // care by specific test variants.
628+ config .Ops .Fault .AddNetworkPartition = 0
629+ config .Ops .Fault .RemoveNetworkPartition = 0
606630 return config
607631}
608632
@@ -663,13 +687,27 @@ func MakeGenerator(config GeneratorConfig, replicasFn GetReplicasFn) (*Generator
663687 return nil , errors .Errorf (`NumReplicas (%d) must <= NumNodes (%d)` ,
664688 config .NumReplicas , config .NumNodes )
665689 }
690+ p := partitions {
691+ healthy : make (map [connection ]struct {}),
692+ partitioned : make (map [connection ]struct {}),
693+ }
694+ for i := 1 ; i <= config .NumNodes ; i ++ {
695+ for j := 1 ; j <= config .NumNodes ; j ++ {
696+ if i == j {
697+ continue
698+ }
699+ conn := connection {from : i , to : j }
700+ p .healthy [conn ] = struct {}{}
701+ }
702+ }
666703 g := & Generator {}
667704 g .mu .generator = generator {
668705 Config : config ,
669706 replicasFn : replicasFn ,
670707 keys : make (map [string ]string ),
671708 currentSplits : make (map [string ]struct {}),
672709 historicalSplits : make (map [string ]struct {}),
710+ partitions : p ,
673711 }
674712 return g , nil
675713}
@@ -703,6 +741,20 @@ type generator struct {
703741 // emitted, regardless of whether the split has since been applied or been
704742 // merged away again.
705743 historicalSplits map [string ]struct {}
744+
745+ // partitions contains the sets of healthy and partitioned connections
746+ // between nodes.
747+ partitions
748+ }
749+
// connection identifies a directed link between two nodes: traffic going from
// node `from` to node `to`. The reverse direction is a distinct connection.
type connection struct {
	from, to int // node IDs
}
754+
755+ type partitions struct {
756+ healthy map [connection ]struct {}
757+ partitioned map [connection ]struct {}
706758}
707759
708760// RandStep returns a single randomly generated next operation to execute.
@@ -763,6 +815,8 @@ func (g *generator) RandStep(rng *rand.Rand) Step {
763815
764816 addOpGen (& allowed , setLeaseType , g .Config .Ops .ChangeSetting .SetLeaseType )
765817 addOpGen (& allowed , toggleGlobalReads , g .Config .Ops .ChangeZone .ToggleGlobalReads )
818+ addOpGen (& allowed , addRandNetworkPartition , g .Config .Ops .Fault .AddNetworkPartition )
819+ addOpGen (& allowed , removeRandNetworkPartition , g .Config .Ops .Fault .RemoveNetworkPartition )
766820
767821 return step (g .selectOp (rng , allowed ))
768822}
@@ -1643,6 +1697,34 @@ func toggleGlobalReads(_ *generator, _ *rand.Rand) Operation {
16431697 return changeZone (ChangeZoneType_ToggleGlobalReads )
16441698}
16451699
1700+ func addRandNetworkPartition (g * generator , rng * rand.Rand ) Operation {
1701+ if len (g .partitions .healthy ) == 0 {
1702+ return addNetworkPartition (0 , 0 )
1703+ }
1704+ all := make ([]connection , 0 , len (g .partitions .healthy ))
1705+ for conn := range g .partitions .healthy {
1706+ all = append (all , conn )
1707+ }
1708+ randConn := all [rng .Intn (len (all ))]
1709+ delete (g .partitions .healthy , randConn )
1710+ g .partitions .partitioned [randConn ] = struct {}{}
1711+ return addNetworkPartition (randConn .from , randConn .to )
1712+ }
1713+
1714+ func removeRandNetworkPartition (g * generator , rng * rand.Rand ) Operation {
1715+ if len (g .partitions .partitioned ) == 0 {
1716+ return removeNetworkPartition (0 , 0 )
1717+ }
1718+ all := make ([]connection , 0 , len (g .partitions .partitioned ))
1719+ for conn := range g .partitions .partitioned {
1720+ all = append (all , conn )
1721+ }
1722+ randConn := all [rng .Intn (len (all ))]
1723+ delete (g .partitions .partitioned , randConn )
1724+ g .partitions .healthy [randConn ] = struct {}{}
1725+ return removeNetworkPartition (randConn .from , randConn .to )
1726+ }
1727+
16461728func makeRandBatch (c * ClientOperationConfig ) opGenFunc {
16471729 return func (g * generator , rng * rand.Rand ) Operation {
16481730 var allowed []opGen
@@ -2252,6 +2334,18 @@ func rollbackSavepoint(id int) Operation {
22522334 return Operation {SavepointRollback : & SavepointRollbackOperation {ID : int32 (id )}}
22532335}
22542336
2337+ func addNetworkPartition (from int , to int ) Operation {
2338+ return Operation {
2339+ AddNetworkPartition : & AddNetworkPartitionOperation {FromNode : int32 (from ), ToNode : int32 (to )},
2340+ }
2341+ }
2342+
2343+ func removeNetworkPartition (from int , to int ) Operation {
2344+ return Operation {
2345+ RemoveNetworkPartition : & RemoveNetworkPartitionOperation {FromNode : int32 (from ), ToNode : int32 (to )},
2346+ }
2347+ }
2348+
22552349type countingRandSource struct {
22562350 count atomic.Uint64
22572351 inner rand.Source64
0 commit comments