Skip to content

Commit 9ffa0d1

Browse files
craig[bot]DarrylWong
andcommitted
Merge #150172
150172: failureinjection/roachtest: add helper to spin up artificial MR cluster r=DarrylWong a=DarrylWong The current network latency failure mode requires the user to specify latencies between every single node. However, one use case of this failure mode may be to simulate a MR deployment, where specifying the latency pairings for every node can be burdensome. This change adds some helpers to easily create an artificial MR cluster. Release note: none Informs: #138970 ------ A while ago, it was [suggested that metamorphically simulating MR](https://cockroachlabs.slack.com/archives/C023S0V4YEB/p1739990528631999) clusters could provide more coverage. I was able to reproduce the bug found in #140967 with this approach by running the existing mixed version backup test with no other changes. I figured it might be interesting for `@Dev-Kyle` to try and get this working for the mixed version framework (time permitting). I also did a comparison of our simulated MR cluster vs an actual geo distributed cluster and saw they both achieved roughly the same TPCC/YCSB performance. Co-authored-by: DarrylWong <darryl@cockroachlabs.com>
2 parents 42cc603 + 722da26 commit 9ffa0d1

File tree

5 files changed

+217
-6
lines changed

5 files changed

+217
-6
lines changed

pkg/cmd/roachtest/roachtestutil/utils.go

Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,13 +15,15 @@ import (
1515
"regexp"
1616
"strconv"
1717
"strings"
18+
"sync"
1819
"time"
1920

2021
"github.com/cockroachdb/cockroach/pkg/cmd/roachtest/cluster"
2122
"github.com/cockroachdb/cockroach/pkg/cmd/roachtest/option"
2223
"github.com/cockroachdb/cockroach/pkg/cmd/roachtest/test"
2324
"github.com/cockroachdb/cockroach/pkg/roachprod"
2425
"github.com/cockroachdb/cockroach/pkg/roachprod/config"
26+
"github.com/cockroachdb/cockroach/pkg/roachprod/failureinjection/failures"
2527
"github.com/cockroachdb/cockroach/pkg/roachprod/install"
2628
"github.com/cockroachdb/cockroach/pkg/roachprod/logger"
2729
"github.com/cockroachdb/cockroach/pkg/util"
@@ -284,3 +286,53 @@ func PrefixCmdOutputWithTimestamp(cmd string) string {
284286
awkCmd := `awk 'NF { cmd="date +\"%H:%M:%S\""; cmd | getline ts; close(cmd); print ts ":", $0; next } { print }'`
285287
return fmt.Sprintf(`bash -c '%s' 2>&1 |`, cmd) + awkCmd
286288
}
289+
290+
// SimulateMultiRegionCluster injects artificial latency between nodes
291+
// to simulate a multi-region cluster with nodes distributed according
292+
// to regionToNodeMap. The actual cluster itself must have been provisioned
293+
// in the same region/zone.
294+
func SimulateMultiRegionCluster(
295+
ctx context.Context,
296+
t test.Test,
297+
c cluster.Cluster,
298+
regionToNodeMap failures.RegionToNodes,
299+
l *logger.Logger,
300+
) (func(), error) {
301+
latencyFailer, err := c.GetFailer(l, c.All(), failures.NetworkLatencyName)
302+
if err != nil {
303+
return nil, err
304+
}
305+
306+
args, err := failures.MakeNetworkLatencyArgs(regionToNodeMap)
307+
if err != nil {
308+
return nil, err
309+
}
310+
311+
failerLogger, fileName, err := LoggerForCmd(l, c.All(), "MultiRegionClusterSetup")
312+
if err != nil {
313+
return nil, err
314+
}
315+
l.Printf("Simulating Multi Region cluster; details in %s.log", fileName)
316+
317+
if err = latencyFailer.Setup(ctx, failerLogger, args); err != nil {
318+
return nil, err
319+
}
320+
321+
// We want to inject the failure mode after service registration but before
322+
// the cluster actually starts. We also want to only inject it the first time,
323+
// as it will persist through restarts.
324+
var once sync.Once
325+
c.RegisterClusterHook("inject multi-region latency", option.PreStartHook, time.Minute, func(ctx context.Context) error {
326+
once.Do(func() {
327+
err = latencyFailer.Inject(ctx, failerLogger, args)
328+
})
329+
return err
330+
})
331+
cleanupFunc := func() {
332+
if err = latencyFailer.Cleanup(context.Background(), failerLogger); err != nil {
333+
t.Fatal(err)
334+
}
335+
}
336+
337+
return cleanupFunc, nil
338+
}

pkg/cmd/roachtest/tests/tpcc.go

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@ import (
2828
"github.com/cockroachdb/cockroach/pkg/cmd/roachtest/roachtestutil/task"
2929
"github.com/cockroachdb/cockroach/pkg/cmd/roachtest/spec"
3030
"github.com/cockroachdb/cockroach/pkg/cmd/roachtest/test"
31+
"github.com/cockroachdb/cockroach/pkg/roachprod/failureinjection/failures"
3132
"github.com/cockroachdb/cockroach/pkg/roachprod/install"
3233
"github.com/cockroachdb/cockroach/pkg/roachprod/logger"
3334
"github.com/cockroachdb/cockroach/pkg/roachprod/prometheus"
@@ -920,6 +921,43 @@ func registerTPCC(r registry.Registry) {
920921
},
921922
})
922923

924+
// This test runs TPCC on a simulated multi-region cluster by injecting network latency
925+
// failure modes. This tests that our artificial latency failure mode used to simulate MR works,
926+
// as well as offers a cheaper way to test MR TPCC without spinning up a distributed cluster.
927+
r.Add(registry.TestSpec{
928+
Name: fmt.Sprintf("tpcc/headroom/%s/artificial-multi-region", headroomSpec.String()),
929+
Owner: registry.OwnerTestEng,
930+
Benchmark: true,
931+
CompatibleClouds: registry.AllClouds,
932+
Suites: registry.ManualOnly,
933+
Cluster: headroomSpec,
934+
Timeout: 4 * time.Hour,
935+
EncryptionSupport: registry.EncryptionMetamorphic,
936+
Leases: registry.MetamorphicLeases,
937+
Run: func(ctx context.Context, t test.Test, c cluster.Cluster) {
938+
// Note that the workload node is placed in us-east as it's in the middle.
939+
regionToNodes := failures.RegionToNodes{
940+
failures.USEast: {1, 4},
941+
failures.USWest: {2},
942+
failures.EuropeWest: {3},
943+
}
944+
945+
cleanup, err := roachtestutil.SimulateMultiRegionCluster(ctx, t, c, regionToNodes, t.L())
946+
if err != nil {
947+
t.Fatal(err)
948+
}
949+
defer cleanup()
950+
maxWarehouses := maxSupportedTPCCWarehouses(*t.BuildVersion(), c.Cloud(), c.Spec())
951+
headroomWarehouses := int(float64(maxWarehouses) * 0.7)
952+
t.L().Printf("computed headroom warehouses of %d\n", headroomWarehouses)
953+
runTPCC(ctx, t, t.L(), c, tpccOptions{
954+
Warehouses: headroomWarehouses,
955+
Duration: 120 * time.Minute,
956+
SetupType: usingImport,
957+
})
958+
},
959+
})
960+
923961
// Setup multi-region tests.
924962
{
925963
mrSetup := []struct {

pkg/roachprod/failureinjection/failures/BUILD.bazel

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@ go_library(
77
"failer.go",
88
"failure.go",
99
"latency.go",
10+
"multiregion_latency.go",
1011
"network_partition.go",
1112
"noop.go",
1213
"option.go",

pkg/roachprod/failureinjection/failures/latency.go

Lines changed: 9 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -39,8 +39,11 @@ type ArtificialLatency struct {
3939
Delay time.Duration
4040
}
4141

42-
func (l *ArtificialLatency) String() string {
43-
return fmt.Sprintf("%d-%d-%s", l.Source, l.Destination, l.Delay)
42+
// FilterName returns the unique name for each filter. Note that the filter name maps
43+
// many source nodes to single destination node. Each node to node latency needs its
44+
// own filter, but we can inject the same rule on multiple source nodes in one shot.
45+
func (l *ArtificialLatency) FilterName(destNode install.Node) string {
46+
return fmt.Sprintf("%d-%d-%s", l.Source, destNode, l.Delay)
4447
}
4548

4649
type NetworkLatencyArgs struct {
@@ -179,10 +182,10 @@ func (f *NetworkLatency) Inject(ctx context.Context, l *logger.Logger, args Fail
179182
// Enforce we don't have duplicate rules, as it complicates the removal process of filters
180183
// and is something the user likely didn't intend.
181184
class := f.findNextOpenClass()
182-
if _, ok := f.filterNameToClassMap[latency.String()]; ok {
185+
if _, ok := f.filterNameToClassMap[latency.FilterName(dest)]; ok {
183186
return errors.Newf("failed trying to inject ArtificialLatency, rule already exists: %+v", latency)
184187
}
185-
f.filterNameToClassMap[latency.String()] = class
188+
f.filterNameToClassMap[latency.FilterName(dest)] = class
186189
handle := 10 * class
187190

188191
cmd := failScriptEarlyCmd
@@ -210,7 +213,7 @@ func (f *NetworkLatency) Recover(ctx context.Context, l *logger.Logger, args Fai
210213
return err
211214
}
212215

213-
class, ok := f.filterNameToClassMap[latency.String()]
216+
class, ok := f.filterNameToClassMap[latency.FilterName(dest)]
214217
if !ok {
215218
return errors.New("failed trying to recover latency failure, ArtificialLatency rule was not found: %+v")
216219
}
@@ -229,7 +232,7 @@ func (f *NetworkLatency) Recover(ctx context.Context, l *logger.Logger, args Fai
229232
if class < f.nextAvailableClass {
230233
f.nextAvailableClass = class
231234
}
232-
delete(f.filterNameToClassMap, latency.String())
235+
delete(f.filterNameToClassMap, latency.FilterName(dest))
233236
}
234237
}
235238
return nil
Lines changed: 117 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,117 @@
1+
// Copyright 2025 The Cockroach Authors.
2+
//
3+
// Use of this software is governed by the CockroachDB Software License
4+
// included in the /LICENSE file.
5+
6+
package failures
7+
8+
import (
	"fmt"
	"sort"
	"time"

	"github.com/cockroachdb/cockroach/pkg/roachprod/install"
)
14+
15+
type (
16+
// Region represents a region in which a node can be located.
17+
Region string
18+
// RegionToNodes is a convenience type that maps regions to their nodes.
19+
RegionToNodes map[Region][]install.Node
20+
// LatencyMap is a mapping of all the one way latencies between regions,
21+
// i.e. 0.5 * RTT.
22+
LatencyMap map[Region]map[Region]time.Duration
23+
// roundTripLatency represents the RTT from RegionA to RegionB.
24+
roundTripLatency struct {
25+
RegionA Region
26+
RegionB Region
27+
Latency time.Duration
28+
}
29+
)
30+
31+
const (
32+
USEast Region = "us-east"
33+
USWest Region = "us-west"
34+
EuropeWest Region = "europe-west"
35+
)
36+
37+
// createLatencyMap creates a LatencyMap from a slice of roundTripLatency.
38+
// N.B. Latencies are assumed to be symmetric. The latency from region A
39+
// to region B is the same as the latency from region B to region A, i.e.
40+
// 0.5 * RTT.
41+
func createLatencyMap(roundTripLatency []roundTripLatency) LatencyMap {
42+
latencyMap := make(map[Region]map[Region]time.Duration)
43+
for _, rtt := range roundTripLatency {
44+
if _, ok := latencyMap[rtt.RegionA]; !ok {
45+
latencyMap[rtt.RegionA] = make(map[Region]time.Duration)
46+
}
47+
if _, ok := latencyMap[rtt.RegionB]; !ok {
48+
latencyMap[rtt.RegionB] = make(map[Region]time.Duration)
49+
}
50+
latencyMap[rtt.RegionA][rtt.RegionB] = rtt.Latency / 2
51+
latencyMap[rtt.RegionB][rtt.RegionA] = rtt.Latency / 2
52+
}
53+
return latencyMap
54+
}
55+
56+
// defaultLatencyMap returns a LatencyMap with estimated latencies between
57+
// VMs in different regions. Numbers are copied from the GCP console at
58+
// of the time of this comment:
59+
// https://console.cloud.google.com/net-intelligence/performance/global-dashboard
60+
//
61+
// N.B. Assumes the actual roachprod cluster was created in the same region/zone, i.e.
62+
// the original latency is negligible (gce claims sub 1ms for intra zone).
63+
var defaultLatencyMap = func() LatencyMap {
64+
regionLatencies := []roundTripLatency{
65+
{
66+
RegionA: USEast,
67+
RegionB: USWest,
68+
Latency: 64 * time.Millisecond,
69+
},
70+
{
71+
RegionA: USEast,
72+
RegionB: EuropeWest,
73+
Latency: 86 * time.Millisecond,
74+
},
75+
{
76+
RegionA: USWest,
77+
RegionB: EuropeWest,
78+
Latency: 132 * time.Millisecond,
79+
},
80+
}
81+
return createLatencyMap(regionLatencies)
82+
}()
83+
84+
// MakeNetworkLatencyArgs returns the NetworkLatencyArgs to simulate the latency
85+
// of a multiregion cluster with the provided mapping.
86+
func MakeNetworkLatencyArgs(regionToNodeMap RegionToNodes) (NetworkLatencyArgs, error) {
87+
artificialLatencies := make([]ArtificialLatency, 0)
88+
LatencyMappingNotFoundErr := func(regionA, regionB Region) error {
89+
return fmt.Errorf("no latency mapping found from region %s to %s", regionA, regionB)
90+
}
91+
for regionA, srcNodes := range regionToNodeMap {
92+
for regionB, destNodes := range regionToNodeMap {
93+
if regionA == regionB {
94+
continue
95+
}
96+
if defaultLatencyMap[regionA] == nil {
97+
return NetworkLatencyArgs{}, LatencyMappingNotFoundErr(regionA, regionB)
98+
}
99+
delay, ok := defaultLatencyMap[regionA][regionB]
100+
if !ok {
101+
return NetworkLatencyArgs{}, LatencyMappingNotFoundErr(regionA, regionB)
102+
}
103+
104+
artificialLatencies = append(artificialLatencies, ArtificialLatency{
105+
Source: srcNodes,
106+
Destination: destNodes,
107+
Delay: delay,
108+
})
109+
}
110+
}
111+
112+
args := NetworkLatencyArgs{
113+
ArtificialLatencies: artificialLatencies,
114+
}
115+
116+
return args, nil
117+
}

0 commit comments

Comments
 (0)