Skip to content

Commit 42a6d73

Browse files
committed
Refine VMG member update to avoid race condition
Signed-off-by: Gong Zhang <gong.zhang@broadcom.com>
1 parent b981168 commit 42a6d73

File tree

2 files changed

+100
-87
lines changed

2 files changed

+100
-87
lines changed

controllers/vmware/virtualmachinegroup_reconciler.go

Lines changed: 49 additions & 54 deletions
Original file line numberDiff line numberDiff line change
@@ -227,31 +227,32 @@ func (r *VirtualMachineGroupReconciler) reconcileVirtualMachineGroup(ctx context
227227
return err
228228
}
229229

230-
// Member Update:
231-
// The VirtualMachineGroup's BootOrder.Members list, is only allowed to be set or added
232-
// during two phases to maintain control over VM placement:
233-
//
234-
// 1. Initial Creation: When the VirtualMachineGroup object does not yet exist.
235-
// 2. Post-Placement: After the VirtualMachineGroup exists AND is marked Ready which means all members are placed successfully,
236-
// and critically, all MachineDeployments have a corresponding zone placement annotation recorded on the VMG.
237-
//
238-
// For member removal, this is always allowed since it doesn't impact ongoing placement or rely on the placement annotation.
230+
// The core purpose of isMemberUpdateAllowed is to prevent the VirtualMachineGroup from being updated with new members
231+
// that require placement, unless the VirtualMachineGroup
232+
// has successfully completed its initial placement and added the required
233+
// placement annotations. This stabilizes placement decisions before allowing new VMs
234+
// to be created under the group.
235+
//s
236+
// The update is allowed if:
237+
// 1. The VirtualMachineGroup is being initially created.
238+
// 2. The update is a scale-down operation (fewer target members).
239+
// 3. The VirtualMachineGroup is placement Ready.
240+
// 4. The new member's underlying CAPI Machine has a FailureDomain set (will skip placement process).
241+
// 5. The new member requires placement annotation AND the VirtualMachineGroup has the corresponding
242+
// placement annotation for the member's MachineDeployment.
239243
//
240244
// This prevents member updates that could lead to new VMs being created
241245
// without necessary zone labels, resulting in undesired placement, such as VM within a MachineDeployment but are
242246
// placed to different Zones.
243-
244-
isMemberUpdateAllowed, err := isMemberUpdateAllowed(ctx, r.Client, members, vmg)
247+
err := isMemberUpdateAllowed(ctx, r.Client, members, vmg)
245248
if err != nil {
246249
return err
247250
}
248251

249-
if isMemberUpdateAllowed {
250-
vmg.Spec.BootOrder = []vmoprv1.VirtualMachineGroupBootOrderGroup{
251-
{
252-
Members: members,
253-
},
254-
}
252+
vmg.Spec.BootOrder = []vmoprv1.VirtualMachineGroupBootOrderGroup{
253+
{
254+
Members: members,
255+
},
255256
}
256257

257258
// Set the owner reference
@@ -262,9 +263,8 @@ func (r *VirtualMachineGroupReconciler) reconcileVirtualMachineGroup(ctx context
262263
return nil
263264
}
264265

265-
// isMemberUpdateAllowed determines if the BootOrder.Members field can be safely updated on the VirtualMachineGroup.
266-
// It allows updates only during initial creation or after all member placement are completed successfully.
267-
func isMemberUpdateAllowed(ctx context.Context, kubeClient client.Client, targetMember []vmoprv1.GroupMember, vmg *vmoprv1.VirtualMachineGroup) (bool, error) {
266+
// isMemberUpdateAllowed checks if a VirtualMachineGroup's BootOrder.Members update is allowed.
267+
func isMemberUpdateAllowed(ctx context.Context, kubeClient client.Client, targetMember []vmoprv1.GroupMember, vmg *vmoprv1.VirtualMachineGroup) error {
268268
logger := log.FromContext(ctx)
269269
key := client.ObjectKey{
270270
Namespace: vmg.Namespace,
@@ -275,38 +275,43 @@ func isMemberUpdateAllowed(ctx context.Context, kubeClient client.Client, target
275275
currentVMG := &vmoprv1.VirtualMachineGroup{}
276276
if err := kubeClient.Get(ctx, key, currentVMG); err != nil {
277277
if apierrors.IsNotFound(err) {
278-
// If VirtualMachineGroup is not found, allow member update as it should be in initial creation phase.
279-
logger.V(5).Info("VirtualMachineGroup not found, allowing member update for initial creation.")
280-
return true, nil
278+
// 1. If VirtualMachineGroup is not found, allow member update as it should be in initial creation phase.
279+
logger.V(5).Info("VirtualMachineGroup not created yet, allowing member update for initial creation.")
280+
return nil
281281
}
282-
return false, errors.Wrapf(err, "failed to get VirtualMachineGroup %s/%s", vmg.Namespace, vmg.Name)
282+
return errors.Wrapf(err, "failed to get VirtualMachineGroup %s/%s, blocking member update", vmg.Namespace, vmg.Name)
283283
}
284-
// Copy retrieved data back to the input pointer for consistency
284+
// Copy retrieved data back to the input pointer for consistency.
285285
*vmg = *currentVMG
286286

287-
// Get current member names from VirtualMachineGroup Spec.BootOrder
287+
// Get current member names from VirtualMachineGroup Spec.BootOrder.
288288
currentMemberNames := make(map[string]struct{})
289289
if len(vmg.Spec.BootOrder) > 0 {
290290
for _, m := range vmg.Spec.BootOrder[0].Members {
291291
currentMemberNames[m.Name] = struct{}{}
292292
}
293293
}
294294

295-
// 1. If removing members, allow immediately since it doesn't impact placement or placement annotation set.
295+
// 2. If removing members, allow immediately since it doesn't impact placement or placement annotation set.
296296
if len(targetMember) < len(currentMemberNames) {
297297
logger.V(5).Info("Scaling down detected (fewer target members), allowing member update.")
298-
return true, nil
298+
return nil
299299
}
300300

301-
// 2. If adding members, continue following checks.
301+
// 3. If initial placement is still in progress, block adding new member.
302+
if !conditions.IsTrue(vmg, vmoprv1.ReadyConditionType) {
303+
return fmt.Errorf("waiting for VirtualMachineGroup %s to get condition %s to true, temporarily blocking member update", klog.KObj(vmg), vmoprv1.ReadyConditionType)
304+
}
305+
306+
// If adding members, continue following checks.
302307
var newMembers []vmoprv1.GroupMember
303308
for _, m := range targetMember {
304309
if _, exists := currentMemberNames[m.Name]; !exists {
305310
newMembers = append(newMembers, m)
306311
}
307312
}
308313

309-
// 3. Check newly added members for Machine.Spec.FailureDomain via VSphereMachine.If a member belongs to a Machine
314+
// 4. Check newly added members for Machine.Spec.FailureDomain via VSphereMachine.If a member belongs to a Machine
310315
// which has failureDomain specified, allow it since it will skip the placement
311316
// process. If not, continue to check if the belonging MachineDeployment has got placement annotation.
312317
for _, newMember := range newMembers {
@@ -317,10 +322,9 @@ func isMemberUpdateAllowed(ctx context.Context, kubeClient client.Client, target
317322
vsphereMachine := &vmwarev1.VSphereMachine{}
318323
if err := kubeClient.Get(ctx, vsphereMachineKey, vsphereMachine); err != nil {
319324
if apierrors.IsNotFound(err) {
320-
logger.V(5).Info("VSphereMachine for new member not found, temporarily blocking update.", "VSphereMachineName", newMember.Name)
321-
return false, nil
325+
return errors.Wrapf(err, "VSphereMachine for new member %s not found, temporarily blocking member update", newMember.Name)
322326
}
323-
return false, errors.Wrapf(err, "failed to get VSphereMachine %s", klog.KRef(newMember.Name, vmg.Namespace))
327+
return errors.Wrapf(err, "failed to get VSphereMachine %s", klog.KRef(newMember.Name, vmg.Namespace))
324328
}
325329

326330
var machineOwnerName string
@@ -333,8 +337,7 @@ func isMemberUpdateAllowed(ctx context.Context, kubeClient client.Client, target
333337

334338
if machineOwnerName == "" {
335339
// VSphereMachine found but owner Machine reference is missing
336-
logger.V(5).Info("VSphereMachine found but owner Machine reference is missing, temporarily blocking update.", "VSphereMachineName", newMember.Name)
337-
return false, nil
340+
return fmt.Errorf("VSphereMachine %s found but owner Machine reference is missing, temporarily blocking member update", newMember.Name)
338341
}
339342

340343
machineKey := types.NamespacedName{
@@ -345,46 +348,38 @@ func isMemberUpdateAllowed(ctx context.Context, kubeClient client.Client, target
345348

346349
if err := kubeClient.Get(ctx, machineKey, machine); err != nil {
347350
if apierrors.IsNotFound(err) {
348-
logger.V(5).Info("CAPI Machine not found via owner reference, temporarily blocking update.", "Machine", klog.KRef(machineOwnerName, vmg.Namespace))
349-
return false, nil
351+
return errors.Wrapf(err, "Machine %s not found via owner reference, temporarily blocking member update", klog.KRef(machineOwnerName, vmg.Namespace))
350352
}
351-
return false, errors.Wrapf(err, "failed to get CAPI Machine %s", klog.KRef(machineOwnerName, vmg.Namespace))
353+
return errors.Wrapf(err, "failed to get CAPI Machine %s", klog.KRef(machineOwnerName, vmg.Namespace))
352354
}
353355

354-
// If FailureDomain is set on CAPI Machine, placement process will be skipped. Allow update.
356+
// If FailureDomain is set on CAPI Machine, placement process will be skipped. Allow update for this member.
355357
fd := machine.Spec.FailureDomain
356358
if fd != "" {
357-
logger.V(5).Info("New member's Machine has FailureDomain specified. Allowing VMG update for this member.")
359+
logger.V(5).Info("New member's Machine has FailureDomain specified. Allowing VMG update for this member.", "Member", newMember.Name)
358360
continue
359361
}
360362

361-
// If FailureDomain is NOT set. Requires placement or placement Annotation. Fall through to full VMG Annotation check.
362-
logger.V(5).Info("New member's CAPI Machine lacks FailureDomain. Falling through to VMG Annotation check.", "MachineName", machineOwnerName)
363-
364-
// If no Placement Annotations, skip member update and wait for it.
363+
// 5. If FailureDomain is NOT set. Requires placement or placement Annotation. Fall through to Annotation check.
364+
// If no Placement Annotations, block member update and wait for it.
365365
annotations := vmg.GetAnnotations()
366366
if len(annotations) == 0 {
367-
return false, nil
367+
return fmt.Errorf("waiting for placement annotation to update VMG member %s, temporarily blocking member update", newMember.Name)
368368
}
369369

370370
mdLabelName := vsphereMachine.Labels[clusterv1.MachineDeploymentNameLabel]
371371
if mdLabelName == "" {
372-
return false, errors.Wrapf(nil, "VSphereMachine doesn't have MachineDeployment name label %s", klog.KObj(vsphereMachine))
372+
return fmt.Errorf("VSphereMachine doesn't have MachineDeployment name label %s, blocking member update", klog.KObj(vsphereMachine))
373373
}
374374

375375
annotationKey := fmt.Sprintf("%s/%s", ZoneAnnotationPrefix, mdLabelName)
376-
377376
if _, found := annotations[annotationKey]; !found {
378-
logger.V(5).Info("Required placement annotation is missing.",
379-
"Member", newMember, "Annotation", annotationKey)
380-
return false, nil
377+
return fmt.Errorf("waiting for placement annotation %s to update VMG member %s, temporarily blocking member update", annotationKey, newMember.Name)
381378
}
382-
383-
logger.V(5).Info("New member requires placement annotation and it is present. Allowing this member.", "Member", newMember)
384379
}
385380

386-
logger.V(5).Info("Either no new members, or all newly added members existed or have satisfied placement requirements, allowing update.")
387-
return true, nil
381+
logger.V(5).Info("All newly added members either existed or have satisfied placement requirements, allowing member update.")
382+
return nil
388383
}
389384

390385
// getExpectedVSphereMachineCount get expected total count of Machines belonging to the Cluster.

0 commit comments

Comments
 (0)