fix: updates binding transition times (#906)
update binding transition time available condition & add retry
britaniar committed Sep 6, 2024
1 parent de6689a commit dcad29d
Showing 10 changed files with 800 additions and 12 deletions.
5 changes: 5 additions & 0 deletions apis/placement/v1/binding_types.go
@@ -164,6 +164,11 @@ func (m *ClusterResourceBinding) SetConditions(conditions ...metav1.Condition) {
     }
 }
 
+// RemoveCondition removes the condition of the given ClusterResourceBinding.
+func (m *ClusterResourceBinding) RemoveCondition(conditionType string) {
+    meta.RemoveStatusCondition(&m.Status.Conditions, conditionType)
+}
+
 // GetCondition returns the condition of the given ClusterResourceBinding.
 func (m *ClusterResourceBinding) GetCondition(conditionType string) *metav1.Condition {
     return meta.FindStatusCondition(m.Status.Conditions, conditionType)
5 changes: 5 additions & 0 deletions apis/placement/v1beta1/binding_types.go
@@ -164,6 +164,11 @@ func (b *ClusterResourceBinding) SetConditions(conditions ...metav1.Condition) {
     }
 }
 
+// RemoveCondition removes the condition of the given ClusterResourceBinding.
+func (b *ClusterResourceBinding) RemoveCondition(conditionType string) {
+    meta.RemoveStatusCondition(&b.Status.Conditions, conditionType)
+}
+
 // GetCondition returns the condition of the given ClusterResourceBinding.
 func (b *ClusterResourceBinding) GetCondition(conditionType string) *metav1.Condition {
     return meta.FindStatusCondition(b.Status.Conditions, conditionType)
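
Why a remove-then-set helper (the same addition appears in both the v1 and v1beta1 APIs): meta.SetStatusCondition keeps the old LastTransitionTime whenever a condition's status value is unchanged, so merely re-setting a condition never refreshes its transition time. A minimal standalone sketch of that behavior, using only the apimachinery helpers the new method wraps (the condition type and reason are illustrative):

package main

import (
    "fmt"

    "k8s.io/apimachinery/pkg/api/meta"
    metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)

func main() {
    cond := metav1.Condition{Type: "Available", Status: metav1.ConditionTrue, Reason: "WorkAvailable"}
    var conds []metav1.Condition

    meta.SetStatusCondition(&conds, cond)
    first := meta.FindStatusCondition(conds, "Available").LastTransitionTime

    // Setting the same status again preserves the original timestamp.
    meta.SetStatusCondition(&conds, cond)
    unchanged := meta.FindStatusCondition(conds, "Available").LastTransitionTime
    fmt.Println(first.Equal(&unchanged)) // true: transition time was not refreshed

    // Removing first (what RemoveCondition does) forces a fresh timestamp on the next set.
    meta.RemoveStatusCondition(&conds, "Available")
    meta.SetStatusCondition(&conds, cond)
    refreshed := meta.FindStatusCondition(conds, "Available").LastTransitionTime
    fmt.Println(first.Equal(&refreshed)) // false: transition time was refreshed
}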
2 changes: 1 addition & 1 deletion pkg/controllers/clusterresourcebindingwatcher/watcher.go
@@ -115,7 +115,7 @@ func isBindingStatusUpdated(oldBinding, newBinding *fleetv1beta1.ClusterResource
         oldCond := oldBinding.GetCondition(string(i.ResourceBindingConditionType()))
         newCond := newBinding.GetCondition(string(i.ResourceBindingConditionType()))
         if !condition.EqualCondition(oldCond, newCond) {
-            klog.V(2).InfoS("The binding condition has changed, need to update the corresponding CRP", "oldBinding", klog.KObj(oldBinding), "newBinding", klog.KObj(newBinding))
+            klog.V(2).InfoS("The binding condition has changed, need to update the corresponding CRP", "oldBinding", klog.KObj(oldBinding), "newBinding", klog.KObj(newBinding), "type", i.ResourceBindingConditionType())
             return true
         }
     }
59 changes: 52 additions & 7 deletions pkg/controllers/workgenerator/controller.go
@@ -120,6 +120,11 @@ func (r *Reconciler) Reconcile(ctx context.Context, req controllerruntime.Reques
 
     workUpdated := false
     overrideSucceeded := false
+    // Reset the conditions and failed placements.
+    for i := condition.OverriddenCondition; i < condition.TotalCondition; i++ {
+        resourceBinding.RemoveCondition(string(i.ResourceBindingConditionType()))
+    }
+    resourceBinding.Status.FailedPlacements = nil
     // list all the corresponding works
     works, syncErr := r.listAllWorksAssociated(ctx, &resourceBinding)
     if syncErr == nil {
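
The reset loop above leans on the ordered enum in the condition package: condition types are iota-ordered with a Total sentinel last, so the half-open range [OverriddenCondition, TotalCondition) clears every condition the work generator owns while skipping RolloutStarted, which the rollout controller owns. A tiny sketch of the pattern with hypothetical names (the real enum lives in fleet's condition package):

package main

import "fmt"

// bindingConditionType mirrors the iota-ordered enum; the names are illustrative.
type bindingConditionType int

const (
    rolloutStartedCond bindingConditionType = iota // owned by the rollout controller; never reset here
    overriddenCond                                 // first condition owned by the work generator
    workSynchronizedCond
    appliedCond
    availableCond
    totalCond // sentinel: must stay last for the range iteration to work
)

func main() {
    // Half-open iteration visits every work-generator-owned condition exactly once.
    for t := overriddenCond; t < totalCond; t++ {
        fmt.Printf("resetting condition type %d\n", t)
    }
}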
@@ -152,8 +157,6 @@ func (r *Reconciler) Reconcile(ctx context.Context, req controllerruntime.Reques
         if err := errors.Unwrap(syncErr); err != nil && len(err.Error()) > 2 {
             errorMessage = errorMessage[len(err.Error())+2:]
         }
-        // remove all the failedPlacement as it does not reflect the latest status
-        resourceBinding.Status.FailedPlacements = nil
         if !overrideSucceeded {
             resourceBinding.SetConditions(metav1.Condition{
                 Status: metav1.ConditionFalse,
@@ -180,8 +183,7 @@ func (r *Reconciler) Reconcile(ctx context.Context, req controllerruntime.Reques
             Message: "All of the works are synchronized to the latest",
         })
         if workUpdated {
-            // revert the applied condition and failedPlacement if we made any changes to the work
-            resourceBinding.Status.FailedPlacements = nil
+            // revert the applied condition if we made any changes to the work
             resourceBinding.SetConditions(metav1.Condition{
                 Status: metav1.ConditionFalse,
                 Type:   string(fleetv1beta1.ResourceBindingApplied),
Expand All @@ -195,9 +197,8 @@ func (r *Reconciler) Reconcile(ctx context.Context, req controllerruntime.Reques
}

// update the resource binding status
if updateErr := r.Client.Status().Update(ctx, &resourceBinding); updateErr != nil {
klog.ErrorS(updateErr, "Failed to update the resourceBinding status", "resourceBinding", bindingRef)
return controllerruntime.Result{}, controller.NewUpdateIgnoreConflictError(updateErr)
if updateErr := r.updateBindingStatusWithRetry(ctx, &resourceBinding); updateErr != nil {
return controllerruntime.Result{}, updateErr
}
if errors.Is(syncErr, controller.ErrUserError) {
// Stop retry when the error is caused by user error
@@ -220,6 +221,50 @@ func (r *Reconciler) Reconcile(ctx context.Context, req controllerruntime.Reques
     return controllerruntime.Result{}, syncErr
 }
 
+// updateBindingStatusWithRetry sends the update request to the API server with retry.
+func (r *Reconciler) updateBindingStatusWithRetry(ctx context.Context, resourceBinding *fleetv1beta1.ClusterResourceBinding) error {
+    // Retry only on conflicts; rebuild the update on top of the latest binding.
+    err := r.Client.Status().Update(ctx, resourceBinding)
+    if err != nil {
+        klog.ErrorS(err, "Failed to update the resourceBinding status, will retry", "resourceBinding", klog.KObj(resourceBinding), "resourceBindingStatus", resourceBinding.Status)
+        errAfterRetries := retry.RetryOnConflict(retry.DefaultBackoff, func() error {
+            var latestBinding fleetv1beta1.ClusterResourceBinding
+            if err := r.Client.Get(ctx, client.ObjectKeyFromObject(resourceBinding), &latestBinding); err != nil {
+                return err
+            }
+            // The work generator is the only controller that updates these conditions, except for RolloutStarted, which the rollout controller owns, so carry the latest RolloutStarted condition over.
+            if rolloutCond := latestBinding.GetCondition(string(fleetv1beta1.ResourceBindingRolloutStarted)); rolloutCond != nil {
+                found := false
+                for i := range resourceBinding.Status.Conditions {
+                    if resourceBinding.Status.Conditions[i].Type == rolloutCond.Type {
+                        // Replace the existing condition.
+                        resourceBinding.Status.Conditions[i] = *rolloutCond
+                        found = true
+                        break
+                    }
+                }
+                if !found {
+                    // Prepend the new condition if it wasn't found.
+                    resourceBinding.Status.Conditions = append([]metav1.Condition{*rolloutCond}, resourceBinding.Status.Conditions...)
+                }
+            }
+
+            if err := r.Client.Status().Update(ctx, resourceBinding); err != nil {
+                klog.ErrorS(err, "Failed to update the resourceBinding status on retry", "resourceBinding", klog.KObj(resourceBinding), "resourceBindingStatus", resourceBinding.Status)
+                return err
+            }
+            klog.V(2).InfoS("Successfully updated the resourceBinding status", "resourceBinding", klog.KObj(resourceBinding), "resourceBindingStatus", resourceBinding.Status)
+            return nil
+        })
+        if errAfterRetries != nil {
+            klog.ErrorS(errAfterRetries, "Failed to update binding status after retries", "resourceBinding", klog.KObj(resourceBinding))
+            return errAfterRetries
+        }
+        return nil
+    }
+    return err
+}
+
 // handleDelete handles a deleting binding.
 func (r *Reconciler) handleDelete(ctx context.Context, resourceBinding *fleetv1beta1.ClusterResourceBinding) (controllerruntime.Result, error) {
     klog.V(4).InfoS("Start to handle deleting resource binding", "resourceBinding", klog.KObj(resourceBinding))
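
For reference, updateBindingStatusWithRetry is a variation on client-go's standard conflict-retry recipe: retry.RetryOnConflict re-runs the closure with a short exponential backoff for as long as the closure returns a 409 Conflict, and gives up immediately on any other error. A generic, self-contained sketch of the recipe (the Pod type and the mutate callback are illustrative, not this controller's code):

package example

import (
    "context"

    corev1 "k8s.io/api/core/v1"
    "k8s.io/client-go/util/retry"
    "sigs.k8s.io/controller-runtime/pkg/client"
)

// updateStatusWithRetry re-reads the latest copy of the object on every
// attempt so the write carries a current resourceVersion, re-applies the
// caller's changes on top, and lets RetryOnConflict decide whether the
// resulting error is a conflict worth retrying.
func updateStatusWithRetry(ctx context.Context, c client.Client, pod *corev1.Pod, mutate func(*corev1.Pod)) error {
    return retry.RetryOnConflict(retry.DefaultBackoff, func() error {
        latest := &corev1.Pod{}
        if err := c.Get(ctx, client.ObjectKeyFromObject(pod), latest); err != nil {
            return err
        }
        mutate(latest) // re-apply our changes to the freshest copy
        return c.Status().Update(ctx, latest)
    })
}

The committed version inverts the direction of the merge: it keeps the status it has already computed and copies only the RolloutStarted condition over from the fresh read, because that is the one piece of binding status another controller may have written in the meantime.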
127 changes: 126 additions & 1 deletion pkg/controllers/workgenerator/controller_integration_test.go
@@ -47,7 +47,8 @@ var (
     validResourceOverrideSnapshot          placementv1alpha1.ResourceOverrideSnapshot
     invalidClusterResourceOverrideSnapshot placementv1alpha1.ClusterResourceOverrideSnapshot
 
-    cmpConditionOption = cmp.Options{cmpopts.SortSlices(utils.LessFuncFailedResourcePlacements), utils.IgnoreConditionLTTAndMessageFields, cmpopts.EquateEmpty()}
+    cmpConditionOption        = cmp.Options{cmpopts.SortSlices(utils.LessFuncFailedResourcePlacements), utils.IgnoreConditionLTTAndMessageFields, cmpopts.EquateEmpty()}
+    cmpConditionOptionWithLTT = cmp.Options{cmpopts.SortSlices(utils.LessFuncFailedResourcePlacements), cmpopts.EquateEmpty()}
 
     fakeFailedAppliedReason  = "fakeApplyFailureReason"
     fakeFailedAppliedMessage = "fake apply failure message"
Expand Down Expand Up @@ -1308,6 +1309,124 @@ var _ = Describe("Test Work Generator Controller", func() {
}, timeout, interval).Should(BeEmpty(), fmt.Sprintf("binding(%s) mismatch (-want +got)", binding.Name))
})
})

Context("Should not touch/reset RolloutStarted condition when the binding is updated", func() {
var masterSnapshot *placementv1beta1.ClusterResourceSnapshot

BeforeEach(func() {
masterSnapshot = generateResourceSnapshot(1, 1, 0, [][]byte{
testResourceCRD, testNameSpace, testResource,
})
Expect(k8sClient.Create(ctx, masterSnapshot)).Should(Succeed())
By(fmt.Sprintf("master resource snapshot %s created", masterSnapshot.Name))
spec := placementv1beta1.ResourceBindingSpec{
State: placementv1beta1.BindingStateBound,
ResourceSnapshotName: masterSnapshot.Name,
TargetCluster: memberClusterName,
}
binding = generateClusterResourceBinding(spec)
Expect(k8sClient.Create(ctx, binding)).Should(Succeed())
By(fmt.Sprintf("resource binding %s created", binding.Name))
})

AfterEach(func() {
By("Deleting master clusterResourceSnapshot")
Expect(k8sClient.Delete(ctx, masterSnapshot)).Should(SatisfyAny(Succeed(), utils.NotFoundMatcher{}))
})

It("Should create all the work in the target namespace after the resource snapshot is created", func() {
// check the binding status till the bound condition is true
Eventually(func() bool {
if err := k8sClient.Get(ctx, types.NamespacedName{Name: binding.Name}, binding); err != nil {
return false
}
// only check the work created status as the applied status reason changes depends on where the reconcile logic is
return condition.IsConditionStatusTrue(
meta.FindStatusCondition(binding.Status.Conditions, string(placementv1beta1.ResourceBindingWorkSynchronized)), binding.GetGeneration())
}, timeout, interval).Should(BeTrue(), fmt.Sprintf("binding(%s) condition should be true", binding.Name))
// check the work is created by now
work := placementv1beta1.Work{}
Eventually(func() error {
return k8sClient.Get(ctx, types.NamespacedName{Name: fmt.Sprintf(placementv1beta1.FirstWorkNameFmt, testCRPName), Namespace: memberClusterNamespaceName}, &work)
}, timeout, interval).Should(Succeed(), "Failed to get the expected work in hub cluster")
By(fmt.Sprintf("work %s is created in %s", work.Name, work.Namespace))
//inspect the work
wantWork := placementv1beta1.Work{
ObjectMeta: metav1.ObjectMeta{
Name: fmt.Sprintf(placementv1beta1.FirstWorkNameFmt, testCRPName),
Namespace: memberClusterNamespaceName,
OwnerReferences: []metav1.OwnerReference{
{
APIVersion: placementv1beta1.GroupVersion.String(),
Kind: "ClusterResourceBinding",
Name: binding.Name,
UID: binding.UID,
BlockOwnerDeletion: ptr.To(true),
},
},
Labels: map[string]string{
placementv1beta1.CRPTrackingLabel: testCRPName,
placementv1beta1.ParentBindingLabel: binding.Name,
placementv1beta1.ParentResourceSnapshotIndexLabel: "1",
},
},
Spec: placementv1beta1.WorkSpec{
Workload: placementv1beta1.WorkloadTemplate{
Manifests: []placementv1beta1.Manifest{
{RawExtension: runtime.RawExtension{Raw: testResourceCRD}},
{RawExtension: runtime.RawExtension{Raw: testNameSpace}},
{RawExtension: runtime.RawExtension{Raw: testResource}},
},
},
},
}
diff := cmp.Diff(wantWork, work, ignoreWorkOption, ignoreTypeMeta)
Expect(diff).Should(BeEmpty(), fmt.Sprintf("work(%s) mismatch (-want +got):\n%s", work.Name, diff))
// check the binding status that it should be marked as work not applied eventually
verifyBindingStatusSyncedNotApplied(binding, false, true)
Expect(k8sClient.Get(ctx, types.NamespacedName{Name: binding.Name}, binding)).Should(Succeed())
rolloutCond := binding.GetCondition(string(placementv1beta1.ResourceBindingRolloutStarted))
// mark the work applied
markWorkApplied(&work)
// check the binding status that it should be marked as applied true eventually
verifyBindStatusAppliedNotAvailable(binding, false)
checkRolloutStartedNotUpdated(rolloutCond, binding)
// mark the work available
markWorkAvailable(&work)
// check the binding status that it should be marked as available true eventually
verifyBindStatusAvail(binding, false)
checkRolloutStartedNotUpdated(rolloutCond, binding)
})

It("Should treat the unscheduled binding as bound and not remove work", func() {
// check the work is created
work := placementv1beta1.Work{}
Eventually(func() error {
return k8sClient.Get(ctx, types.NamespacedName{Name: fmt.Sprintf(placementv1beta1.FirstWorkNameFmt, testCRPName), Namespace: memberClusterNamespaceName}, &work)
}, timeout, interval).Should(Succeed(), "Failed to get the expected work in hub cluster")
By(fmt.Sprintf("work %s is created in %s", work.Name, work.Namespace))
// update binding to be unscheduled
Expect(k8sClient.Get(ctx, types.NamespacedName{Name: binding.Name}, binding)).Should(Succeed())
rolloutCond := binding.GetCondition(string(placementv1beta1.ResourceBindingRolloutStarted))
binding.Spec.State = placementv1beta1.BindingStateUnscheduled
Expect(k8sClient.Update(ctx, binding)).Should(Succeed())
By(fmt.Sprintf("resource binding %s updated to be unscheduled", binding.Name))
Consistently(func() error {
return k8sClient.Get(ctx, types.NamespacedName{Name: fmt.Sprintf(placementv1beta1.FirstWorkNameFmt, testCRPName), Namespace: memberClusterNamespaceName}, &work)
}, duration, interval).Should(Succeed(), "controller should not remove work in hub cluster for unscheduled binding")
//inspect the work manifest to make sure it still has the same content
expectedManifest := []placementv1beta1.Manifest{
{RawExtension: runtime.RawExtension{Raw: testResourceCRD}},
{RawExtension: runtime.RawExtension{Raw: testNameSpace}},
{RawExtension: runtime.RawExtension{Raw: testResource}},
}
diff := cmp.Diff(expectedManifest, work.Spec.Workload.Manifests)
Expect(diff).Should(BeEmpty(), fmt.Sprintf("work manifest(%s) mismatch (-want +got):\n%s", work.Name, diff))
// check the binding status
verifyBindingStatusSyncedNotApplied(binding, false, false)
checkRolloutStartedNotUpdated(rolloutCond, binding)
})
})
})

Context("Test Bound ClusterResourceBinding with not found cluster", func() {
Expand Down Expand Up @@ -1946,3 +2065,9 @@ func markOneManifestAvailable(work *placementv1beta1.Work) {
Expect(k8sClient.Status().Update(ctx, work)).Should(Succeed())
By(fmt.Sprintf("resource work `%s` is marked as available", work.Name))
}

func checkRolloutStartedNotUpdated(rolloutCond *metav1.Condition, binding *placementv1beta1.ClusterResourceBinding) {
Expect(k8sClient.Get(ctx, types.NamespacedName{Name: binding.Name}, binding)).Should(Succeed())
diff := cmp.Diff(rolloutCond, binding.GetCondition(string(placementv1beta1.ResourceBindingRolloutStarted)), cmpConditionOptionWithLTT)
Expect(diff).Should(BeEmpty(), fmt.Sprintf("binding(%s) mismatch (-want +got)", binding.Name), diff)
}
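
The helper depends on the new cmpConditionOptionWithLTT options set, which, unlike cmpConditionOption, does not ignore LastTransitionTime: the assertion passes only if the RolloutStarted condition, timestamp included, is unchanged. A standalone sketch of the distinction (the condition values are illustrative; go-cmp resolves metav1.Time equality through its Equal method):

package main

import (
    "fmt"

    "github.com/google/go-cmp/cmp"
    metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)

func main() {
    now := metav1.Now()
    want := metav1.Condition{Type: "RolloutStarted", Status: metav1.ConditionTrue, Reason: "RolloutStarted", LastTransitionTime: now}
    got := want
    got.LastTransitionTime = metav1.NewTime(now.Add(1)) // one nanosecond later

    // With LastTransitionTime included in the comparison, the diff is
    // non-empty; this is how checkRolloutStartedNotUpdated catches a
    // condition that was rewritten with a fresh timestamp.
    fmt.Println(cmp.Diff(want, got) != "")
}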