
Commit 2d56ca7

craig[bot] and tbg committed
150564: ui: rework network page r=tbg a=tbg

Trying to make it a bit more functional. First, we put bytes sent/received first. We drop packets sent/received, which largely duplicates this (I could be talked out of that). Packet errors and drops are jammed into a single chart. Step changes are (hopefully) still visible, and it avoids having to scroll through a set of empty graphs on the way down in the common case of no issues. (I might be wrong about this, but I've never seen errors, for example, only drops.)

I added the new TCP retransmits chart from #149928.

The proxy charts are probably not useful most of the time, but I kept them around. _If_ they do something, I assume we want to know.

| Old Chart Layout | New Chart Layout |
| :---------------------------------------- | :--------------------------------------- |
| Network Bytes Received | Network Bytes Sent |
| Network Packets Received | Network Bytes Received |
| Network Packet Errors on Receive | RPC Heartbeat Latency: 50th percentile |
| Network Packet Drops on Receive | RPC Heartbeat Latency: 99th percentile |
| Network Bytes Sent | Unhealthy RPC Connections |
| Network Packets Sent | Network Packet Errors and Drops |
| Network Packet Errors on Send | TCP Retransmits |
| Network Packet Drops on Send | Proxy requests |
| RPC Heartbeat Latency: 50th percentile | Proxy request errors |
| RPC Heartbeat Latency: 99th percentile | Proxy forwards |
| Unhealthy RPC Connections | Proxy forward errors |
| Proxy requests | |
| Proxy request errors | |
| Proxy forwards | |

Epic: none

153145: kvserver: exec-trace TestReplicateQueueRebalanceMultiStore r=tbg a=tbg

It failed in a way that looked like network latency, but there is no non-local networking or artificial delay in this test. Enable execution tracing in this test so we can maybe understand this better next time.

Closes #153137.

Epic: none

153326: allocatorimpl: rm TestAllocatorFullDisks r=tbg a=tbg

It's been flaky forever, and we have asim coverage for full disks (asim/tests/testdata/non_rand/example_fulldisk[^1]).

[^1]: https://github.com/cockroachdb/cockroach/blob/8b1e1a9e399ca84a87c686d0106279e0c5904871/pkg/kv/kvserver/asim/tests/testdata/non_rand/example_fulldisk.txt#L5

This and its backports will:

Close #153225.
Close #152897.
Close #152317.
Close #152056.
Close #150456.

Epic: none

Co-authored-by: Tobias Grieger <tobias.b.grieger@gmail.com>
4 parents: b9835d8 + 0a03a0f + 19fe46d + 6573ed4 (commit 2d56ca7)

File tree

3 files changed: +63 / -242 lines


pkg/kv/kvserver/allocator/allocatorimpl/allocator_test.go

Lines changed: 0 additions & 154 deletions
@@ -8785,160 +8785,6 @@ func (ts *testStore) rebalance(ots *testStore, bytes int64, qps float64, do Disk
 	ots.Capacity.LogicalBytes += bytes
 }
 
-func (ts *testStore) compact() {
-	ts.Capacity.Used = ts.Capacity.LogicalBytes
-	ts.Capacity.Available = ts.Capacity.Capacity - ts.Capacity.Used
-}
-
-func TestAllocatorFullDisks(t *testing.T) {
-	defer leaktest.AfterTest(t)()
-	defer log.Scope(t).Close(t)
-
-	ctx := context.Background()
-	stopper := stop.NewStopper()
-	defer stopper.Stop(ctx)
-
-	st := cluster.MakeTestingClusterSettings()
-	tr := tracing.NewTracer()
-	clock := hlc.NewClockForTesting(nil)
-
-	g := gossip.NewTest(1, stopper, metric.NewRegistry())
-
-	liveness.TimeUntilNodeDead.Override(ctx, &st.SV, liveness.TestTimeUntilNodeDeadOff)
-
-	const generations = 100
-	const nodes = 20
-	const capacity = (1 << 30) + 1
-	const rangeSize = 16 << 20
-
-	mockNodeLiveness := storepool.NewMockNodeLiveness(livenesspb.NodeLivenessStatus_LIVE)
-	sp := storepool.NewStorePool(
-		log.MakeTestingAmbientContext(tr),
-		st,
-		g,
-		clock,
-		func() int {
-			return nodes
-		},
-		mockNodeLiveness.NodeLivenessFunc,
-		false, /* deterministic */
-	)
-	alloc := MakeAllocator(st, false /* deterministic */, func(id roachpb.NodeID) (time.Duration, bool) {
-		return 0, false
-	}, nil)
-
-	var wg sync.WaitGroup
-	g.RegisterCallback(gossip.MakePrefixPattern(gossip.KeyStoreDescPrefix),
-		func(_ string, _ roachpb.Value, _ int64) { wg.Done() },
-		// Redundant callbacks are required by this test.
-		gossip.Redundant)
-
-	do := makeDiskCapacityOptions(&st.SV)
-
-	// Each range is equally sized (16mb), we want the number of ranges per node,
-	// when their size is added, to be no greater than the full disk rebalance
-	// threshold (0.925%) e.g for below:
-	// capacity = 1024mb
-	// rangeSize = 16mb
-	// threshold = 0.925
-	// rangesPerNode = ⌊1024mb * 0.925 / 16mb⌋ = 59
-	rangesPerNode := int(math.Floor(capacity * do.RebalanceToThreshold / rangeSize))
-	rangesToAdd := rangesPerNode * nodes
-
-	// Initialize testStores.
-	var testStores [nodes]testStore
-	for i := 0; i < len(testStores); i++ {
-		// Don't immediately reclaim disk space from removed ranges. This mimics
-		// range deletions don't immediately reclaim disk space in rocksdb.
-		testStores[i].immediateCompaction = false
-		testStores[i].StoreID = roachpb.StoreID(i)
-		testStores[i].Node = roachpb.NodeDescriptor{NodeID: roachpb.NodeID(i)}
-		testStores[i].Capacity = roachpb.StoreCapacity{Capacity: capacity, Available: capacity}
-	}
-	// Initialize the cluster with a single range.
-	testStores[0].add(rangeSize, 0)
-	rangesAdded := 1
-
-	for i := 0; i < generations; i++ {
-		// First loop through test stores and randomly add data.
-		for j := 0; j < len(testStores); j++ {
-			if mockNodeLiveness.NodeLivenessFunc(roachpb.NodeID(j)) == livenesspb.NodeLivenessStatus_DEAD {
-				continue
-			}
-			ts := &testStores[j]
-			// Add [0,2) ranges to the node, simulating splits and data growth.
-			toAdd := alloc.randGen.Intn(2)
-			for k := 0; k < toAdd; k++ {
-				if rangesAdded < rangesToAdd {
-					ts.add(rangeSize, 0)
-					rangesAdded++
-				}
-			}
-			if ts.Capacity.Available <= 0 {
-				t.Errorf("testStore %d ran out of space during generation %d (rangesAdded=%d/%d): %+v",
-					j, i, rangesAdded, rangesToAdd, ts.Capacity)
-				mockNodeLiveness.SetNodeStatus(roachpb.NodeID(j), livenesspb.NodeLivenessStatus_DEAD)
-			}
-			wg.Add(1)
-			if err := g.AddInfoProto(gossip.MakeStoreDescKey(roachpb.StoreID(j)), &ts.StoreDescriptor, 0); err != nil {
-				t.Fatal(err)
-			}
-		}
-		wg.Wait()
-
-		// Loop through each store a number of times and maybe rebalance.
-		for j := 0; j < 10; j++ {
-			for k := 0; k < len(testStores); k++ {
-				if mockNodeLiveness.NodeLivenessFunc(roachpb.NodeID(k)) == livenesspb.NodeLivenessStatus_DEAD {
-					continue
-				}
-				ts := &testStores[k]
-				// Rebalance until there's no more rebalancing to do.
-				if ts.Capacity.RangeCount > 0 {
-					var rangeUsageInfo allocator.RangeUsageInfo
-					target, _, details, ok := alloc.RebalanceVoter(
-						ctx,
-						sp,
-						emptySpanConfig(),
-						nil,
-						[]roachpb.ReplicaDescriptor{{NodeID: ts.Node.NodeID, StoreID: ts.StoreID}},
-						nil,
-						rangeUsageInfo,
-						storepool.StoreFilterThrottled,
-						alloc.ScorerOptions(ctx),
-					)
-					if ok {
-						if log.V(1) {
-							log.Dev.Infof(ctx, "rebalancing to %v; details: %s", target, details)
-						}
-						testStores[k].rebalance(&testStores[int(target.StoreID)], rangeSize, 0 /* qps */, do)
-					}
-				}
-				// Gossip occasionally, as real Stores do when replicas move around.
-				if j%3 == 2 {
-					wg.Add(1)
-					if err := g.AddInfoProto(gossip.MakeStoreDescKey(roachpb.StoreID(j)), &ts.StoreDescriptor, 0); err != nil {
-						t.Fatal(err)
-					}
-				}
-			}
-		}
-
-		// Simulate rocksdb compactions freeing up disk space.
-		for j := 0; j < len(testStores); j++ {
-			if mockNodeLiveness.NodeLivenessFunc(roachpb.NodeID(j)) != livenesspb.NodeLivenessStatus_DEAD {
-				ts := &testStores[j]
-				if ts.Capacity.Available <= 0 {
-					t.Errorf("testStore %d ran out of space during generation %d: %+v", j, i, ts.Capacity)
-					mockNodeLiveness.SetNodeStatus(roachpb.NodeID(j), livenesspb.NodeLivenessStatus_DEAD)
-				} else {
-					ts.compact()
-				}
-			}
-		}
-	}
-}
-
 func Example_rangeCountRebalancing() {
 	testStores := make([]testStore, 20)
 	rebalanceFn := func(ctx context.Context, ts *testStore, testStores []testStore, alloc *Allocator, storePool *storepool.StorePool) {
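For context on the capacity arithmetic in the deleted test's comment, rangesPerNode = ⌊capacity * RebalanceToThreshold / rangeSize⌋. The following is a minimal standalone sketch of that calculation only; it hard-codes the 0.925 threshold quoted in the comment instead of reading it from makeDiskCapacityOptions, so the constant and the package/main scaffolding are illustrative, not part of the test.

package main

import (
	"fmt"
	"math"
)

func main() {
	// Constants mirroring the deleted TestAllocatorFullDisks.
	const (
		nodes     = 20
		capacity  = (1 << 30) + 1 // ~1 GiB of disk per store
		rangeSize = 16 << 20      // 16 MiB per range
	)
	// Rebalance-to threshold; the test read this from cluster settings via
	// makeDiskCapacityOptions, here the 0.925 from the comment is assumed.
	const rebalanceToThreshold = 0.925

	// Ranges a store can hold before crossing the full-disk rebalance
	// threshold: ⌊1024 MiB * 0.925 / 16 MiB⌋ = 59.
	rangesPerNode := int(math.Floor(capacity * rebalanceToThreshold / rangeSize))
	rangesToAdd := rangesPerNode * nodes

	fmt.Println(rangesPerNode, rangesToAdd) // 59 1180
}

Running this prints 59 1180, matching the 59 ranges per node worked out in the comment, with 1180 ranges spread across the 20 simulated stores.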

pkg/kv/kvserver/replicate_queue_test.go

Lines changed: 9 additions & 4 deletions
@@ -176,10 +176,15 @@ func TestReplicateQueueRebalance(t *testing.T) {
 // rebalances the replicas and leases.
 func TestReplicateQueueRebalanceMultiStore(t *testing.T) {
 	defer leaktest.AfterTest(t)()
-	defer log.Scope(t).Close(t)
-	skip.UnderRace(t)
-	skip.UnderShort(t)
-	skip.UnderDeadlock(t)
+	skip.UnderDuress(t) // eight stores is too much under duress
+	scope := log.Scope(t)
+	defer scope.Close(t)
+
+	// The test exhibited an interesting failure mode that we want
+	// to be able to better investigate should it reoccur.
+	// See: https://github.com/cockroachdb/cockroach/issues/153137
+	// and https://cockroachlabs.slack.com/archives/G01G8LK77DK/p1757330830964639.
+	defer testutils.StartExecTrace(t, scope.GetDirectory()).Finish(t)
 
 	testCases := []struct {
 		name string
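The added defer testutils.StartExecTrace(t, scope.GetDirectory()).Finish(t) is what captures a Go execution trace alongside the test's log directory. As a rough sketch of the pattern only, not CockroachDB's actual helper (the ExecTrace struct, its field, and the error handling are assumptions), such a helper can be built on the standard library's runtime/trace package:

package testutils // hypothetical package name for this sketch

import (
	"os"
	"path/filepath"
	"runtime/trace"
	"testing"
)

// ExecTrace is a hypothetical handle for an in-flight execution trace.
type ExecTrace struct {
	f *os.File
}

// StartExecTrace begins writing a runtime execution trace to a file in dir.
// Sketch of the pattern only; the real helper may differ.
func StartExecTrace(t testing.TB, dir string) *ExecTrace {
	f, err := os.Create(filepath.Join(dir, "exec.trace"))
	if err != nil {
		t.Fatalf("creating trace file: %v", err)
	}
	if err := trace.Start(f); err != nil {
		t.Fatalf("starting execution trace: %v", err)
	}
	return &ExecTrace{f: f}
}

// Finish stops the trace and closes the output file. The result can be
// inspected with `go tool trace exec.trace`.
func (e *ExecTrace) Finish(t testing.TB) {
	trace.Stop()
	if err := e.f.Close(); err != nil {
		t.Errorf("closing trace file: %v", err)
	}
}

With a helper like this, a flaky test defers the Start/Finish pair at the top, and a slow or hung run leaves behind a trace that shows goroutine scheduling and blocking intervals, which is the kind of evidence the commit message says was missing for the latency-like failure.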

0 commit comments
