Skip to content

Commit 3ed2e93

Browse files
committed
Refine VMG member update to avoid race condition
Signed-off-by: Gong Zhang <gong.zhang@broadcom.com>
1 parent b981168 commit 3ed2e93

File tree

2 files changed

+151
-136
lines changed

2 files changed

+151
-136
lines changed

controllers/vmware/virtualmachinegroup_reconciler.go

Lines changed: 62 additions & 63 deletions
Original file line numberDiff line numberDiff line change
@@ -199,6 +199,27 @@ func (r *VirtualMachineGroupReconciler) createOrUpdateVirtualMachineGroup(ctx co
199199
}
200200
}
201201

202+
// The core purpose of isCreateOrPatchAllowed is to prevent the VirtualMachineGroup from being updated with new members
203+
// that require placement, unless the VirtualMachineGroup
204+
// has successfully completed its initial placement and added the required
205+
// placement annotations. This stabilizes placement decisions before allowing new VMs
206+
// to be added under the group.
207+
//s
208+
// The CreateOrPatch is allowed if:
209+
// 1. The VirtualMachineGroup is being initially created.
210+
// 2. The update is a scale-down operation.
211+
// 3. The VirtualMachineGroup is placement Ready.
212+
// 4. The new member's underlying CAPI Machine has a FailureDomain set (will skip placement process).
213+
// 5. The new member requires placement annotation AND the VirtualMachineGroup has the corresponding
214+
// placement annotation for the member's MachineDeployment.
215+
//
216+
// This prevents member updates that could lead to new VMs being created
217+
// without necessary zone labels, resulting in undesired placement.
218+
err = isCreateOrPatchAllowed(ctx, r.Client, members, vmg)
219+
if err != nil {
220+
return reconcile.Result{}, err
221+
}
222+
202223
// Use CreateOrPatch to create or update the VirtualMachineGroup.
203224
_, err = controllerutil.CreateOrPatch(ctx, r.Client, vmg, func() error {
204225
return r.reconcileVirtualMachineGroup(ctx, vmg, cluster, members, mdNames)
@@ -227,31 +248,10 @@ func (r *VirtualMachineGroupReconciler) reconcileVirtualMachineGroup(ctx context
227248
return err
228249
}
229250

230-
// Member Update:
231-
// The VirtualMachineGroup's BootOrder.Members list, is only allowed to be set or added
232-
// during two phases to maintain control over VM placement:
233-
//
234-
// 1. Initial Creation: When the VirtualMachineGroup object does not yet exist.
235-
// 2. Post-Placement: After the VirtualMachineGroup exists AND is marked Ready which means all members are placed successfully,
236-
// and critically, all MachineDeployments have a corresponding zone placement annotation recorded on the VMG.
237-
//
238-
// For member removal, this is always allowed since it doesn't impact ongoing placement or rely on the placement annotation.
239-
//
240-
// This prevents member updates that could lead to new VMs being created
241-
// without necessary zone labels, resulting in undesired placement, such as VM within a MachineDeployment but are
242-
// placed to different Zones.
243-
244-
isMemberUpdateAllowed, err := isMemberUpdateAllowed(ctx, r.Client, members, vmg)
245-
if err != nil {
246-
return err
247-
}
248-
249-
if isMemberUpdateAllowed {
250-
vmg.Spec.BootOrder = []vmoprv1.VirtualMachineGroupBootOrderGroup{
251-
{
252-
Members: members,
253-
},
254-
}
251+
vmg.Spec.BootOrder = []vmoprv1.VirtualMachineGroupBootOrderGroup{
252+
{
253+
Members: members,
254+
},
255255
}
256256

257257
// Set the owner reference
@@ -262,9 +262,8 @@ func (r *VirtualMachineGroupReconciler) reconcileVirtualMachineGroup(ctx context
262262
return nil
263263
}
264264

265-
// isMemberUpdateAllowed determines if the BootOrder.Members field can be safely updated on the VirtualMachineGroup.
266-
// It allows updates only during initial creation or after all member placement are completed successfully.
267-
func isMemberUpdateAllowed(ctx context.Context, kubeClient client.Client, targetMember []vmoprv1.GroupMember, vmg *vmoprv1.VirtualMachineGroup) (bool, error) {
265+
// isCreateOrPatchAllowed checks if a VirtualMachineGroup is allowd to create or patch by check if BootOrder.Members update is allowed.
266+
func isCreateOrPatchAllowed(ctx context.Context, kubeClient client.Client, targetMember []vmoprv1.GroupMember, vmg *vmoprv1.VirtualMachineGroup) error {
268267
logger := log.FromContext(ctx)
269268
key := client.ObjectKey{
270269
Namespace: vmg.Namespace,
@@ -275,38 +274,48 @@ func isMemberUpdateAllowed(ctx context.Context, kubeClient client.Client, target
275274
currentVMG := &vmoprv1.VirtualMachineGroup{}
276275
if err := kubeClient.Get(ctx, key, currentVMG); err != nil {
277276
if apierrors.IsNotFound(err) {
278-
// If VirtualMachineGroup is not found, allow member update as it should be in initial creation phase.
279-
logger.V(5).Info("VirtualMachineGroup not found, allowing member update for initial creation.")
280-
return true, nil
277+
// 1. If VirtualMachineGroup is not found, allow CreateOrPatch as it should be in initial creation phase.
278+
logger.V(6).Info("VirtualMachineGroup not created yet, allowing create")
279+
return nil
281280
}
282-
return false, errors.Wrapf(err, "failed to get VirtualMachineGroup %s/%s", vmg.Namespace, vmg.Name)
281+
return errors.Wrapf(err, "failed to get VirtualMachineGroup %s/%s, blocking patch", vmg.Namespace, vmg.Name)
283282
}
284-
// Copy retrieved data back to the input pointer for consistency
283+
// Copy retrieved data back to the input pointer for consistency.
285284
*vmg = *currentVMG
286285

287-
// Get current member names from VirtualMachineGroup Spec.BootOrder
286+
// Get current member names from VirtualMachineGroup Spec.BootOrder.
288287
currentMemberNames := make(map[string]struct{})
289288
if len(vmg.Spec.BootOrder) > 0 {
290289
for _, m := range vmg.Spec.BootOrder[0].Members {
291290
currentMemberNames[m.Name] = struct{}{}
292291
}
293292
}
294293

295-
// 1. If removing members, allow immediately since it doesn't impact placement or placement annotation set.
294+
// 2. If removing members, allow immediately since it doesn't impact placement or placement annotation set.
296295
if len(targetMember) < len(currentMemberNames) {
297-
logger.V(5).Info("Scaling down detected (fewer target members), allowing member update.")
298-
return true, nil
296+
logger.V(6).Info("Scaling down detected (fewer target members), allowing patch.")
297+
return nil
299298
}
300299

301-
// 2. If adding members, continue following checks.
302300
var newMembers []vmoprv1.GroupMember
303301
for _, m := range targetMember {
304302
if _, exists := currentMemberNames[m.Name]; !exists {
305303
newMembers = append(newMembers, m)
306304
}
307305
}
308306

309-
// 3. Check newly added members for Machine.Spec.FailureDomain via VSphereMachine.If a member belongs to a Machine
307+
// If no new member added, allow patch.
308+
if len(newMembers) == 0 {
309+
logger.V(6).Info("No new member detected, allowing patch.")
310+
return nil
311+
}
312+
313+
// 3. If initial placement is still in progress, block adding new member.
314+
if !conditions.IsTrue(vmg, vmoprv1.ReadyConditionType) {
315+
return fmt.Errorf("waiting for VirtualMachineGroup %s to get condition %s to true, temporarily blocking patch", klog.KObj(vmg), vmoprv1.ReadyConditionType)
316+
}
317+
318+
// 4. Check newly added members for Machine.Spec.FailureDomain via VSphereMachine.If a member belongs to a Machine
310319
// which has failureDomain specified, allow it since it will skip the placement
311320
// process. If not, continue to check if the belonging MachineDeployment has got placement annotation.
312321
for _, newMember := range newMembers {
@@ -317,10 +326,9 @@ func isMemberUpdateAllowed(ctx context.Context, kubeClient client.Client, target
317326
vsphereMachine := &vmwarev1.VSphereMachine{}
318327
if err := kubeClient.Get(ctx, vsphereMachineKey, vsphereMachine); err != nil {
319328
if apierrors.IsNotFound(err) {
320-
logger.V(5).Info("VSphereMachine for new member not found, temporarily blocking update.", "VSphereMachineName", newMember.Name)
321-
return false, nil
329+
return errors.Wrapf(err, "VSphereMachine for new member %s not found, temporarily blocking patch", newMember.Name)
322330
}
323-
return false, errors.Wrapf(err, "failed to get VSphereMachine %s", klog.KRef(newMember.Name, vmg.Namespace))
331+
return errors.Wrapf(err, "failed to get VSphereMachine %s", klog.KRef(newMember.Name, vmg.Namespace))
324332
}
325333

326334
var machineOwnerName string
@@ -333,8 +341,7 @@ func isMemberUpdateAllowed(ctx context.Context, kubeClient client.Client, target
333341

334342
if machineOwnerName == "" {
335343
// VSphereMachine found but owner Machine reference is missing
336-
logger.V(5).Info("VSphereMachine found but owner Machine reference is missing, temporarily blocking update.", "VSphereMachineName", newMember.Name)
337-
return false, nil
344+
return fmt.Errorf("VSphereMachine %s found but owner Machine reference is missing, temporarily blocking patch", newMember.Name)
338345
}
339346

340347
machineKey := types.NamespacedName{
@@ -345,46 +352,38 @@ func isMemberUpdateAllowed(ctx context.Context, kubeClient client.Client, target
345352

346353
if err := kubeClient.Get(ctx, machineKey, machine); err != nil {
347354
if apierrors.IsNotFound(err) {
348-
logger.V(5).Info("CAPI Machine not found via owner reference, temporarily blocking update.", "Machine", klog.KRef(machineOwnerName, vmg.Namespace))
349-
return false, nil
355+
return errors.Wrapf(err, "Machine %s not found via owner reference, temporarily blocking patch", klog.KRef(machineOwnerName, vmg.Namespace))
350356
}
351-
return false, errors.Wrapf(err, "failed to get CAPI Machine %s", klog.KRef(machineOwnerName, vmg.Namespace))
357+
return errors.Wrapf(err, "failed to get CAPI Machine %s", klog.KRef(machineOwnerName, vmg.Namespace))
352358
}
353359

354-
// If FailureDomain is set on CAPI Machine, placement process will be skipped. Allow update.
360+
// If FailureDomain is set on CAPI Machine, placement process will be skipped. Allow update for this member.
355361
fd := machine.Spec.FailureDomain
356362
if fd != "" {
357-
logger.V(5).Info("New member's Machine has FailureDomain specified. Allowing VMG update for this member.")
363+
logger.V(6).Info("New member's Machine has FailureDomain specified. Allowing patch", "Member", newMember.Name)
358364
continue
359365
}
360366

361-
// If FailureDomain is NOT set. Requires placement or placement Annotation. Fall through to full VMG Annotation check.
362-
logger.V(5).Info("New member's CAPI Machine lacks FailureDomain. Falling through to VMG Annotation check.", "MachineName", machineOwnerName)
363-
364-
// If no Placement Annotations, skip member update and wait for it.
367+
// 5. If FailureDomain is NOT set. Requires placement or placement Annotation. Fall through to Annotation check.
368+
// If no Placement Annotations, block member update and wait for it.
365369
annotations := vmg.GetAnnotations()
366370
if len(annotations) == 0 {
367-
return false, nil
371+
return fmt.Errorf("waiting for placement annotation to add VMG member %s, temporarily blocking patch", newMember.Name)
368372
}
369373

370374
mdLabelName := vsphereMachine.Labels[clusterv1.MachineDeploymentNameLabel]
371375
if mdLabelName == "" {
372-
return false, errors.Wrapf(nil, "VSphereMachine doesn't have MachineDeployment name label %s", klog.KObj(vsphereMachine))
376+
return fmt.Errorf("VSphereMachine doesn't have MachineDeployment name label %s, blocking patch", klog.KObj(vsphereMachine))
373377
}
374378

375379
annotationKey := fmt.Sprintf("%s/%s", ZoneAnnotationPrefix, mdLabelName)
376-
377380
if _, found := annotations[annotationKey]; !found {
378-
logger.V(5).Info("Required placement annotation is missing.",
379-
"Member", newMember, "Annotation", annotationKey)
380-
return false, nil
381+
return fmt.Errorf("waiting for placement annotation %s to add VMG member %s, temporarily blocking patch", annotationKey, newMember.Name)
381382
}
382-
383-
logger.V(5).Info("New member requires placement annotation and it is present. Allowing this member.", "Member", newMember)
384383
}
385384

386-
logger.V(5).Info("Either no new members, or all newly added members existed or have satisfied placement requirements, allowing update.")
387-
return true, nil
385+
logger.V(6).Info("All newly added members either existed or have satisfied placement requirements, allowing patch")
386+
return nil
388387
}
389388

390389
// getExpectedVSphereMachineCount get expected total count of Machines belonging to the Cluster.

0 commit comments

Comments
 (0)