
Commit df1585e

craig[bot] and sumeerbhola committed
Merge #157066
157066: mma: more cleanup of pending changes r=tbg,wenyihu6 a=sumeerbhola

The existing PendingRangeChange is now used in all the exported Allocator methods. This makes it clear that the set of changes is being treated as an atomic unit in the interface.

The MMA code also reduces the switching between pendingReplicaChange, ChangeID and ReplicaChange, which was confusing. It now uses pendingReplicaChange in more cases.

Since we use PendingRangeChange to describe the change, and only later annotate it with the change IDs, start time, etc., there is some refactoring that involves producing a change using MakePendingRangeChange and later changing its state using clusterState.addPendingRangeChange.

Minor cleanup: the Allocator interface now includes integration methods implemented by allocatorState, as originally intended.

Informs #157049

Epic: CRDB-55052

Release note: None

Co-authored-by: sumeerbhola <sumeer@cockroachlabs.com>
2 parents ddeef36 + 972d746 commit df1585e
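The two-step pattern described in the message (describe the change with MakePendingRangeChange, then let clusterState.addPendingRangeChange annotate it with change IDs and a start time and begin tracking it) can be pictured with the minimal, self-contained Go sketch below. The lower-cased types, fields, and the main function are simplified stand-ins for illustration only; the real mmaprototype definitions carry replica descriptors, load deltas, and more state.

package main

import (
	"fmt"
	"time"
)

// Simplified stand-ins for the mmaprototype types.
type rangeID int
type changeID int

type replicaChange struct{ desc string }

type pendingReplicaChange struct {
	id        changeID
	change    replicaChange
	startTime time.Time
}

// pendingRangeChange groups the replica changes for one range so they are
// treated as an atomic unit.
type pendingRangeChange struct {
	rangeID rangeID
	changes []pendingReplicaChange
}

// makePendingRangeChange only describes the change (step 1); IDs and start
// time are filled in later.
func makePendingRangeChange(r rangeID, rcs []replicaChange) pendingRangeChange {
	prc := pendingRangeChange{rangeID: r}
	for _, rc := range rcs {
		prc.changes = append(prc.changes, pendingReplicaChange{change: rc})
	}
	return prc
}

type clusterState struct {
	nextID  changeID
	pending map[changeID]*pendingReplicaChange
}

// addPendingRangeChange annotates the change with IDs and a start time and
// starts tracking it (step 2).
func (cs *clusterState) addPendingRangeChange(prc *pendingRangeChange) {
	now := time.Now()
	for i := range prc.changes {
		cs.nextID++
		prc.changes[i].id = cs.nextID
		prc.changes[i].startTime = now
		cs.pending[prc.changes[i].id] = &prc.changes[i]
	}
}

func main() {
	cs := &clusterState{pending: map[changeID]*pendingReplicaChange{}}
	prc := makePendingRangeChange(7, []replicaChange{
		{desc: "remove lease from s1"}, {desc: "add lease to s2"},
	})
	cs.addPendingRangeChange(&prc)
	fmt.Printf("tracking %d pending changes for r%d\n", len(cs.pending), prc.rangeID)
}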

File tree

8 files changed (+237, -207 lines)


pkg/kv/kvserver/allocator/mmaprototype/allocator.go

Lines changed: 33 additions & 21 deletions
@@ -43,34 +43,29 @@ type Allocator interface {
 	// associated node in the cluster.
 	ProcessStoreLoadMsg(ctx context.Context, msg *StoreLoadMsg)
 
-	// TODO(sumeer): only a subset of the fields in
-	// pendingReplicaChange/PendingRangeChange are relevant to the caller. Hide
-	// the remaining.
-
 	// Methods related to making changes.
 
-	// AdjustPendingChangesDisposition is optional feedback to inform the
-	// allocator of success or failure of proposed changes. For successful
-	// changes, this is a faster way to know about success than waiting for the
-	// next ProcessNodeLoadResponse from the local node. For failed changes, in
-	// the absence of this feedback, proposed changes that have not been enacted
-	// in N seconds will be garbage collected and assumed to have failed.
+	// AdjustPendingChangeDisposition is optional feedback to inform the
+	// allocator of success or failure of proposed changes to a range. For
+	// successful changes, this is a faster way to know about success than
+	// waiting for the next ProcessNodeLoadResponse from the local node. For
+	// failed changes, in the absence of this feedback, proposed changes that
+	// have not been enacted in N seconds will be garbage collected and assumed
+	// to have failed.
 	//
-	// Calls to AdjustPendingChangesDisposition must be correctly sequenced with
+	// Calls to AdjustPendingChangeDisposition must be correctly sequenced with
 	// full state updates from the local node provided in
 	// ProcessNodeLoadResponse.
-	//
-	// REQUIRES: len(changes) > 0 and all changes are to the same range.
-	AdjustPendingChangesDisposition(changes []ChangeID, success bool)
+	AdjustPendingChangeDisposition(change PendingRangeChange, success bool)
 
-	// RegisterExternalChanges informs this allocator about yet to complete
+	// RegisterExternalChange informs this allocator about yet to complete
 	// changes to the cluster which were not initiated by this allocator. The
-	// caller is returned a list of ChangeIDs, corresponding 1:1 to each replica
-	// change provided as an argument. The returned list of ChangeIDs should then
-	// be used to call AdjustPendingChangesDisposition when the changes are
-	// completed, either successfully or not. All changes should correspond to the
-	// same range.
-	RegisterExternalChanges(changes []ReplicaChange) []ChangeID
+	// ownership of all state inside change is handed off to the callee. If ok
+	// is true, the change was registered, and the caller should subsequently
+	// use the same change in a subsequent call to
+	// AdjustPendingChangeDisposition when the changes are completed, either
+	// successfully or not. If ok is false, the change was not registered.
+	RegisterExternalChange(change PendingRangeChange) (ok bool)
 
 	// ComputeChanges is called periodically and frequently, say every 10s.
 	//
@@ -152,6 +147,23 @@ type Allocator interface {
 	//
 	// TODO(sumeer): remove once the integration is properly done.
 	KnownStores() map[roachpb.StoreID]struct{}
+
+	// BuildMMARebalanceAdvisor is called by the allocator sync to build a
+	// MMARebalanceAdvisor for the given existing store and candidates. The
+	// advisor should be later passed to IsInConflictWithMMA to determine if a
+	// given candidate is in conflict with the existing store.
+	//
+	// TODO(sumeer): merge the above comment with the comment in the
+	// implementation.
+	BuildMMARebalanceAdvisor(existing roachpb.StoreID, cands []roachpb.StoreID) *MMARebalanceAdvisor
+
+	// IsInConflictWithMMA is called by the allocator sync to determine if the
+	// given candidate is in conflict with the existing store.
+	//
+	// TODO(sumeer): merge the above comment with the comment in the
+	// implementation.
+	IsInConflictWithMMA(
+		ctx context.Context, cand roachpb.StoreID, advisor *MMARebalanceAdvisor, cpuOnly bool) bool
 }
 
 // Avoid unused lint errors.
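Per the updated comments on RegisterExternalChange and AdjustPendingChangeDisposition, the caller-side contract is: hand an entire PendingRangeChange to RegisterExternalChange, and only if it reports ok, reuse that same value in a later AdjustPendingChangeDisposition call once the change has succeeded or failed. The sketch below illustrates that call sequence under stated assumptions: the PendingRangeChange struct, the externalChangeRegistrar interface, fakeAllocator, and applyExternalChange are local stand-ins for illustration, not the real mmaprototype API.

package main

import "fmt"

// PendingRangeChange is a stand-in for the real mmaprototype type; it is
// treated as opaque by the caller in this sketch.
type PendingRangeChange struct{ rangeID int }

// externalChangeRegistrar is a local stand-in naming just the two Allocator
// methods exercised here, with the signatures shown in the diff above.
type externalChangeRegistrar interface {
	RegisterExternalChange(change PendingRangeChange) (ok bool)
	AdjustPendingChangeDisposition(change PendingRangeChange, success bool)
}

// applyExternalChange shows the intended call sequence: register first, and
// only if registration succeeded, report the eventual outcome using the same
// PendingRangeChange value.
func applyExternalChange(a externalChangeRegistrar, change PendingRangeChange, enact func() error) {
	if !a.RegisterExternalChange(change) {
		// Not registered; the allocator will not be tracking this change.
		return
	}
	err := enact()
	a.AdjustPendingChangeDisposition(change, err == nil)
}

// fakeAllocator is a trivial in-memory implementation used only to make the
// sketch runnable.
type fakeAllocator struct{ pending map[int]bool }

func (f *fakeAllocator) RegisterExternalChange(c PendingRangeChange) bool {
	f.pending[c.rangeID] = true
	return true
}

func (f *fakeAllocator) AdjustPendingChangeDisposition(c PendingRangeChange, success bool) {
	delete(f.pending, c.rangeID)
	fmt.Printf("r%d disposition: success=%t\n", c.rangeID, success)
}

func main() {
	a := &fakeAllocator{pending: map[int]bool{}}
	applyExternalChange(a, PendingRangeChange{rangeID: 7}, func() error { return nil })
}

Registering and later adjusting the whole PendingRangeChange, rather than individual ChangeIDs, is what keeps a range's replica changes an atomic unit at the interface boundary, per the commit message.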

pkg/kv/kvserver/allocator/mmaprototype/allocator_state.go

Lines changed: 52 additions & 76 deletions
@@ -544,16 +544,14 @@ func (cs *clusterState) rebalanceStores(
 				NodeID:  ss.NodeID,
 				StoreID: ss.StoreID,
 			}
-			leaseChanges := MakeLeaseTransferChanges(
+			replicaChanges := MakeLeaseTransferChanges(
 				rangeID, rstate.replicas, rstate.load, addTarget, removeTarget)
-			if err := cs.preCheckOnApplyReplicaChanges(leaseChanges[:]); err != nil {
-				panic(errors.Wrapf(err, "pre-check failed for lease transfer %v", leaseChanges))
+			leaseChange := MakePendingRangeChange(rangeID, replicaChanges[:])
+			if err := cs.preCheckOnApplyReplicaChanges(leaseChange.pendingReplicaChanges); err != nil {
+				panic(errors.Wrapf(err, "pre-check failed for lease transfer %v", leaseChange))
 			}
-			pendingChanges := cs.createPendingChanges(leaseChanges[:]...)
-			changes = append(changes, PendingRangeChange{
-				RangeID:               rangeID,
-				pendingReplicaChanges: pendingChanges[:],
-			})
+			cs.addPendingRangeChange(leaseChange)
+			changes = append(changes, leaseChange)
 			leaseTransferCount++
 			if changes[len(changes)-1].IsChangeReplicas() || !changes[len(changes)-1].IsTransferLease() {
 				panic(fmt.Sprintf("lease transfer is invalid: %v", changes[len(changes)-1]))
@@ -763,15 +761,13 @@ func (cs *clusterState) rebalanceStores(
 			}
 			replicaChanges := makeRebalanceReplicaChanges(
 				rangeID, rstate.replicas, rstate.load, addTarget, removeTarget)
-			if err = cs.preCheckOnApplyReplicaChanges(replicaChanges[:]); err != nil {
+			rangeChange := MakePendingRangeChange(rangeID, replicaChanges[:])
+			if err = cs.preCheckOnApplyReplicaChanges(rangeChange.pendingReplicaChanges); err != nil {
 				panic(errors.Wrapf(err, "pre-check failed for replica changes: %v for %v",
 					replicaChanges, rangeID))
 			}
-			pendingChanges := cs.createPendingChanges(replicaChanges[:]...)
-			changes = append(changes, PendingRangeChange{
-				RangeID:               rangeID,
-				pendingReplicaChanges: pendingChanges[:],
-			})
+			cs.addPendingRangeChange(rangeChange)
+			changes = append(changes, rangeChange)
 			rangeMoveCount++
 			log.KvDistribution.VInfof(ctx, 2,
 				"result(success): rebalancing r%v from s%v to s%v [change: %v] with resulting loads source: %v target: %v",
@@ -807,93 +803,73 @@ func (a *allocatorState) SetStore(store StoreAttributesAndLocality) {
 	a.cs.setStore(store)
 }
 
-// ProcessStoreLeaseholderMsg implements the Allocator interface.
+// ProcessStoreLoadMsg implements the Allocator interface.
 func (a *allocatorState) ProcessStoreLoadMsg(ctx context.Context, msg *StoreLoadMsg) {
 	a.mu.Lock()
 	defer a.mu.Unlock()
 	a.cs.processStoreLoadMsg(ctx, msg)
 }
 
-// AdjustPendingChangesDisposition implements the Allocator interface.
-func (a *allocatorState) AdjustPendingChangesDisposition(changeIDs []ChangeID, success bool) {
+// AdjustPendingChangeDisposition implements the Allocator interface.
+func (a *allocatorState) AdjustPendingChangeDisposition(change PendingRangeChange, success bool) {
 	a.mu.Lock()
 	defer a.mu.Unlock()
-	// NB: It is possible that some of the changeIDs have already been enacted
-	// via StoreLeaseholderMsg, and even been garbage collected. So no
-	// assumption can be made about whether these changeIDs will be found in the
-	// allocator's state.
-	if !success {
-		// Gather the changes that are found and need to be undone.
-		replicaChanges := make([]ReplicaChange, 0, len(changeIDs))
-		for _, changeID := range changeIDs {
-			change, ok := a.cs.pendingChanges[changeID]
-			if !ok {
-				continue
-			}
-			rs, ok := a.cs.ranges[change.rangeID]
-			if !ok {
-				panic(errors.AssertionFailedf("range %v not found in cluster state", change.rangeID))
-			}
-			if rs.pendingChangeNoRollback {
-				// All the changes are to the same range, so return.
-				return
-			}
-			replicaChanges = append(replicaChanges, change.ReplicaChange)
-		}
-		if len(replicaChanges) == 0 {
-			return
+	rs, ok := a.cs.ranges[change.RangeID]
+	if !ok {
+		// Range no longer exists. This can happen if the StoreLeaseholderMsg
+		// which included the effect of the change that transferred the lease away
+		// was already processed, causing the range to no longer be tracked by the
+		// allocator.
+		return
+	}
+	if !success && rs.pendingChangeNoRollback {
+		// Not allowed to undo.
+		return
+	}
+	// NB: It is possible that some of the changes have already been enacted via
+	// StoreLeaseholderMsg, and even been garbage collected. So no assumption
+	// can be made about whether these changes will be found in the allocator's
+	// state. We gather the found changes.
+	var changes []*pendingReplicaChange
+	for _, c := range change.pendingReplicaChanges {
+		ch, ok := a.cs.pendingChanges[c.ChangeID]
+		if !ok {
+			continue
 		}
-		// Check that we can undo these changes. If not, log and return.
-		if err := a.cs.preCheckOnUndoReplicaChanges(replicaChanges); err != nil {
-			// TODO(sumeer): we should be able to panic here, once the interface
-			// contract says that all the proposed changes must be included in
-			// changeIDs. Without that contract, there may be a pair of changes
-			// (remove replica and lease from s1), (add replica and lease to s2),
-			// and the caller can provide the first changeID only, and the undo
-			// would cause two leaseholders. The pre-check would catch that here.
-			log.KvDistribution.Infof(context.Background(), "did not undo change %v: due to %v", changeIDs, err)
-			return
+		changes = append(changes, ch)
+	}
+	if len(changes) == 0 {
+		return
+	}
+	if !success {
+		// Check that we can undo these changes.
+		if err := a.cs.preCheckOnUndoReplicaChanges(changes); err != nil {
+			panic(err)
 		}
 	}
-
-	for _, changeID := range changeIDs {
-		// We set !requireFound, since some of these pending changes may no longer
-		// exist in the allocator's state. For example, a StoreLeaseholderMsg that
-		// happened after the pending change was created and before this call to
-		// AdjustPendingChangesDisposition may have already removed the pending
-		// change.
+	for _, c := range changes {
 		if success {
-			// TODO(sumeer): this code is implicitly assuming that all the changes
-			// on the rangeState are being enacted. And that is true of the current
-			// callers. We should explicitly state the assumption in the interface.
-			// Because if only some are being enacted, we ought to set
-			// pendingChangeNoRollback, and we don't bother to.
-			a.cs.pendingChangeEnacted(changeID, a.cs.ts.Now(), false)
+			a.cs.pendingChangeEnacted(c.ChangeID, a.cs.ts.Now())
 		} else {
-			a.cs.undoPendingChange(changeID, false)
+			a.cs.undoPendingChange(c.ChangeID)
 		}
 	}
 }
 
-// RegisterExternalChanges implements the Allocator interface. All changes should
-// correspond to the same range, panic otherwise.
-func (a *allocatorState) RegisterExternalChanges(changes []ReplicaChange) []ChangeID {
+// RegisterExternalChange implements the Allocator interface.
+func (a *allocatorState) RegisterExternalChange(change PendingRangeChange) (ok bool) {
	a.mu.Lock()
 	defer a.mu.Unlock()
-	if err := a.cs.preCheckOnApplyReplicaChanges(changes); err != nil {
+	if err := a.cs.preCheckOnApplyReplicaChanges(change.pendingReplicaChanges); err != nil {
 		a.mmaMetrics.ExternalFailedToRegister.Inc(1)
 		log.KvDistribution.Infof(context.Background(),
 			"did not register external changes: due to %v", err)
-		return nil
+		return false
 	} else {
 		a.mmaMetrics.ExternaRegisterSuccess.Inc(1)
 	}
-	pendingChanges := a.cs.createPendingChanges(changes...)
-	changeIDs := make([]ChangeID, len(pendingChanges))
-	for i, pendingChange := range pendingChanges {
-		changeIDs[i] = pendingChange.ChangeID
-	}
-	return changeIDs
+	a.cs.addPendingRangeChange(change)
+	return true
 }
 
 // ComputeChanges implements the Allocator interface.
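The rewritten AdjustPendingChangeDisposition above reduces to: bail out if the range is no longer tracked or is marked no-rollback, gather whichever per-replica changes are still pending, then enact or undo each one. A compressed model of that control flow is sketched below, assuming heavily simplified stand-in types and omitting the undo pre-check (preCheckOnUndoReplicaChanges) that the real code performs.

package main

import "fmt"

// Heavily simplified stand-ins for the cluster-state bookkeeping; the real
// types live in pkg/kv/kvserver/allocator/mmaprototype.
type changeID int

type pendingReplicaChange struct{ id changeID }

type pendingRangeChange struct {
	rangeID int
	changes []pendingReplicaChange
}

type rangeState struct{ pendingChangeNoRollback bool }

type clusterState struct {
	ranges  map[int]*rangeState
	pending map[changeID]*pendingReplicaChange
}

func (cs *clusterState) enact(id changeID) {
	delete(cs.pending, id)
	fmt.Println("enacted", id)
}

func (cs *clusterState) undo(id changeID) {
	delete(cs.pending, id)
	fmt.Println("undone", id)
}

// adjustDisposition mirrors the control flow of the rewritten
// AdjustPendingChangeDisposition: bail out if the range is gone or undo is
// disallowed, gather the still-tracked per-replica changes, then enact or
// undo each one.
func (cs *clusterState) adjustDisposition(change pendingRangeChange, success bool) {
	rs, ok := cs.ranges[change.rangeID]
	if !ok {
		return // range no longer tracked
	}
	if !success && rs.pendingChangeNoRollback {
		return // not allowed to undo
	}
	var found []changeID
	for _, c := range change.changes {
		if _, ok := cs.pending[c.id]; ok {
			found = append(found, c.id)
		}
	}
	for _, id := range found {
		if success {
			cs.enact(id)
		} else {
			cs.undo(id)
		}
	}
}

func main() {
	cs := &clusterState{
		ranges:  map[int]*rangeState{7: {}},
		pending: map[changeID]*pendingReplicaChange{1: {id: 1}, 2: {id: 2}},
	}
	cs.adjustDisposition(pendingRangeChange{
		rangeID: 7,
		changes: []pendingReplicaChange{{id: 1}, {id: 2}},
	}, true)
}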
