Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

ssa: Improve wait error reporting #722

Merged
merged 1 commit into from
Jan 23, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
55 changes: 29 additions & 26 deletions ssa/manager_wait.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ package ssa

import (
"context"
"errors"
"fmt"
"strings"
"time"
Expand Down Expand Up @@ -83,7 +84,6 @@ func (m *ResourceManager) WaitForSet(set object.ObjMetadataSet, opts WaitOptions
eventsChan := m.poller.Poll(ctx, set, pollingOpts)

lastStatus := make(map[object.ObjMetadata]*event.ResourceStatus)
var failedResources int

done := statusCollector.ListenWithObserver(eventsChan, collector.ObserverFunc(
func(statusCollector *collector.ResourceStatusCollector, e event.Event) {
Expand All @@ -96,7 +96,7 @@ func (m *ResourceManager) WaitForSet(set object.ObjMetadataSet, opts WaitOptions
// skip DeadlineExceeded errors because kstatus emits that error
// for every resource it's monitoring even when only one of them
// actually fails.
if rs.Error != context.DeadlineExceeded {
if !errors.Is(rs.Error, context.DeadlineExceeded) {
lastStatus[rs.Identifier] = rs
}

Expand All @@ -105,7 +105,6 @@ func (m *ResourceManager) WaitForSet(set object.ObjMetadataSet, opts WaitOptions
}
rss = append(rss, rs)
}
failedResources = countFailed

desired := status.CurrentStatus
aggStatus := aggregator.AggregateStatus(rss, desired)
Expand All @@ -122,32 +121,36 @@ func (m *ResourceManager) WaitForSet(set object.ObjMetadataSet, opts WaitOptions
return statusCollector.Error
}

if ctx.Err() == context.DeadlineExceeded || (opts.FailFast && failedResources > 0) {
msg := "failed early due to stalled resources"
if ctx.Err() == context.DeadlineExceeded {
msg = "timeout waiting for"
}

var errors = []string{}
for id, rs := range statusCollector.ResourceStatuses {
if rs == nil {
errors = append(errors, fmt.Sprintf("can't determine status for %s", utils.FmtObjMetadata(id)))
continue
var errs []string
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

wdyt think about

 var errs []error
	for id, rs := range statusCollector.ResourceStatuses {
		switch {
		case rs == nil || lastStatus[id] == nil:
			errs = append(errs, fmt.Errorf("resource %s not found", utils.FmtObjMetadata(id)))
		case lastStatus[id].Status == status.FailedStatus:
			var builder strings.Builder
			builder.WriteString(fmt.Sprintf("%s status: '%s'",
				utils.FmtObjMetadata(rs.Identifier), lastStatus[id].Status))
			if rs.Error != nil {
				builder.WriteString(fmt.Sprintf(": %s", rs.Error))
			}
			errs = append(errs, errors.New(builder.String()))
...
	if len(errs) > 0 {
		msg := "failed early due to stalled resources"
		if errors.Is(ctx.Err(), context.DeadlineExceeded) {
			msg = "timeout waiting for"
		}
		return fmt.Errorf("%s: %w", msg, errors.Join(errs...))
	}

Wrapping the errors can make it easier to inspect for calling funtions.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is more about humans than code, keep in mind that the resulting error ends up in Slack, PagerDuty, etc. Is error wrapping with 100+ errors easier to read and parse than an array?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

e.Error() will return a string still. The difference is that for Join. It says The error formats as the concatenation of the strings obtained by calling the Error method of each element of errs, with a newline between each string.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ok but this will change the format and if people rely on the current array, their parser will fail. I would consider change it in Flux minor release instead of a patch like now.

for id, rs := range statusCollector.ResourceStatuses {
switch {
case rs == nil || lastStatus[id] == nil:
errs = append(errs, fmt.Sprintf("can't determine status for %s", utils.FmtObjMetadata(id)))
case lastStatus[id].Status == status.FailedStatus:
var builder strings.Builder
builder.WriteString(fmt.Sprintf("%s status: '%s'",
utils.FmtObjMetadata(rs.Identifier), lastStatus[id].Status))
if rs.Error != nil {
builder.WriteString(fmt.Sprintf(": %s", rs.Error))
}
if lastStatus[id] == nil {
// this is only nil in the rare case where no status can be determined for the resource at all
errors = append(errors, fmt.Sprintf("%s (unknown status)", utils.FmtObjMetadata(rs.Identifier)))
} else if lastStatus[id].Status != status.CurrentStatus {
var builder strings.Builder
builder.WriteString(fmt.Sprintf("%s status: '%s'",
utils.FmtObjMetadata(rs.Identifier), lastStatus[id].Status))
if rs.Error != nil {
builder.WriteString(fmt.Sprintf(": %s", rs.Error))
}
errors = append(errors, builder.String())
errs = append(errs, builder.String())
case errors.Is(ctx.Err(), context.DeadlineExceeded) && lastStatus[id].Status != status.CurrentStatus:
var builder strings.Builder
builder.WriteString(fmt.Sprintf("%s status: '%s'",
utils.FmtObjMetadata(rs.Identifier), lastStatus[id].Status))
if rs.Error != nil {
builder.WriteString(fmt.Sprintf(": %s", rs.Error))
Comment on lines +129 to +142
Copy link
Contributor

@darkowlzz darkowlzz Jan 23, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Feel free to ignore it but the two cases above seem identical.
Maybe there are two cases to keep the conditionals simple but maybe it'd be simpler to maintain in the long run if the duplicate code can be put together in one case and the case expression can be made a little complex like:

case (lastStatus[id].Status == status.FailedStatus) || (errors.Is(ctx.Err(), context.DeadlineExceeded) && lastStatus[id].Status != status.CurrentStatus):
        ...

I may be missing something if they aren't identical. Please tell me if they aren't.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I did like this so it's easier to read, failure vs timeout.

}
errs = append(errs, builder.String())
}
}

if len(errs) > 0 {
msg := "failed early due to stalled resources"
if errors.Is(ctx.Err(), context.DeadlineExceeded) {
msg = "timeout waiting for"
}
return fmt.Errorf("%s: [%s]", msg, strings.Join(errors, ", "))
return fmt.Errorf("%s: [%s]", msg, strings.Join(errs, ", "))
}

return nil
Expand Down
63 changes: 15 additions & 48 deletions ssa/manager_wait_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -139,7 +139,7 @@ func TestWaitForSet_failFast(t *testing.T) {
t.Fatal(err)
}

// Set Progressing Condition to false and reason to ProgressDeadlineExceeded
// Set Progressing Condition to false and reason to ProgressDeadlineExceeded.
// This tells kstatus that the deployment has stalled.
cond := appsv1.DeploymentCondition{
Type: appsv1.DeploymentProgressing,
Expand All @@ -160,6 +160,8 @@ func TestWaitForSet_failFast(t *testing.T) {
t.Fatal(err)
}

// Set PVC phase to Pending.
// This tells kstatus that the PVC is in progress.
clusterPvc := &unstructured.Unstructured{}
clusterPvc.SetGroupVersionKind(schema.GroupVersionKind{
Group: "",
Expand All @@ -170,7 +172,7 @@ func TestWaitForSet_failFast(t *testing.T) {
t.Fatal(err)
}

if err := unstructured.SetNestedField(clusterPvc.Object, "Bound", "status", "phase"); err != nil {
if err := unstructured.SetNestedField(clusterPvc.Object, "Pending", "status", "phase"); err != nil {
t.Fatal(err)
}

Expand All @@ -185,7 +187,7 @@ func TestWaitForSet_failFast(t *testing.T) {
t.Fatal(err)
}

t.Run("timeout when failfast is set to false", func(t *testing.T) {
t.Run("timeout when fail fast is disabled", func(t *testing.T) {
err = manager.WaitForSet(cs.ToObjMetadataSet(), WaitOptions{
Interval: interval,
Timeout: timeout,
Expand All @@ -199,54 +201,15 @@ func TestWaitForSet_failFast(t *testing.T) {
}

if !strings.Contains(err.Error(), deployFailedMsg) {
t.Fatal("expected error to contain status of failed deployment")
}
})

t.Run("return early when failfast is set to true", func(t *testing.T) {
err = manager.WaitForSet(cs.ToObjMetadataSet(), WaitOptions{
Interval: interval,
Timeout: timeout,
FailFast: true,
})

deployFailedMsg := fmt.Sprintf("%s status: '%s'", utils.FmtObjMetadata(deployObjMeta), status.FailedStatus)

if err == nil || !strings.Contains(err.Error(), "failed early") {
t.Fatal("expected WaitForSet to fail early due to stalled deployment")
t.Fatal("expected error to contain status of failed deployment", err.Error())
}

if !strings.Contains(err.Error(), deployFailedMsg) {
t.Fatal("expected error to contain status of failed deployment")
if !strings.Contains(err.Error(), "InProgress") {
t.Fatal("expected error to contain InProgress deployment", err.Error())
}
})

t.Run("fail early even if there are still Progressing resources", func(t *testing.T) {
// change status to Pending to have an 'InProgress' resource
clusterPvc := &unstructured.Unstructured{}
clusterPvc.SetGroupVersionKind(schema.GroupVersionKind{
Group: "",
Kind: "PersistentVolumeClaim",
Version: "v1",
})
if err := manager.client.Get(ctx, client.ObjectKeyFromObject(pvc), clusterPvc); err != nil {
t.Fatal(err)
}

if err := unstructured.SetNestedField(clusterPvc.Object, "Pending", "status", "phase"); err != nil {
t.Fatal(err)
}
opts := &client.SubResourcePatchOptions{
PatchOptions: client.PatchOptions{
FieldManager: manager.owner.Field,
},
}

clusterPvc.SetManagedFields(nil)
if err := manager.client.Status().Patch(ctx, clusterPvc, client.Apply, opts); err != nil {
t.Fatal(err)
}

t.Run("fail early even if there are still progressing resources", func(t *testing.T) {
err = manager.WaitForSet(cs.ToObjMetadataSet(), WaitOptions{
Interval: interval,
Timeout: timeout,
Expand All @@ -256,11 +219,15 @@ func TestWaitForSet_failFast(t *testing.T) {
deployFailedMsg := fmt.Sprintf("%s status: '%s'", utils.FmtObjMetadata(deployObjMeta), status.FailedStatus)

if err == nil || !strings.Contains(err.Error(), "failed early") {
t.Fatal("expected WaitForSet to fail early due to stalled deployment")
t.Fatal("expected to fail early due to stalled deployment", err.Error())
}

if !strings.Contains(err.Error(), deployFailedMsg) {
t.Fatal("expected error to contain status of failed deployment")
t.Fatal("expected error to contain status of failed deployment", err.Error())
}

if strings.Contains(err.Error(), "InProgress") {
t.Fatal("expected error to not contain InProgress resources", err.Error())
}
})
}
Loading