From 481d609de353b29888040ab4cfbd9f28cc362152 Mon Sep 17 00:00:00 2001
From: Peter Wilson <peter.wilson@hashicorp.com>
Date: Thu, 5 Sep 2024 12:55:27 +0100
Subject: [PATCH 1/3] always use derived context for audit logging, increase
 timeout to 10secs max

---
 audit/broker.go      | 94 +++++++++++---------------------------------
 audit/broker_test.go | 79 ++++++++++++++++++-------------------
 2 files changed, 61 insertions(+), 112 deletions(-)

diff --git a/audit/broker.go b/audit/broker.go
index 601831eab067..13845267362e 100644
--- a/audit/broker.go
+++ b/audit/broker.go
@@ -22,7 +22,7 @@ import (
 
 const (
 	// timeout is the duration which should be used for context related timeouts.
-	timeout = 5 * time.Second
+	timeout = 10 * time.Second
 )
 
 var (
@@ -277,29 +277,12 @@ func (b *Broker) LogRequest(ctx context.Context, in *logical.LogInput) (retErr e
 
 	e.Data = in
 
-	// Evaluate whether we need a new context for auditing.
-	var auditContext context.Context
-	if isContextViable(ctx) {
-		auditContext = ctx
-	} else {
-		// In cases where we are trying to audit the request, and the existing
-		// context is not viable due to a short deadline, we detach ourselves from
-		// the original context (keeping only the namespace).
-		// This is so that we get a fair run at writing audit entries if Vault
-		// has taken up a lot of time handling the request before audit (request)
-		// is triggered. Pipeline nodes and the eventlogger.Broker may check for a
-		// cancelled context and refuse to process the nodes further.
-		ns, err := nshelper.FromContext(ctx)
-		if err != nil {
-			return fmt.Errorf("namespace missing from context: %w", err)
-		}
-
-		tempContext, auditCancel := context.WithTimeout(context.Background(), timeout)
-		defer auditCancel()
-		auditContext = nshelper.ContextWithNamespace(tempContext, ns)
-
-		b.logger.Trace("log request requires a derived context (original context was not viable)", "namespace ID", ns.ID, "namespace path", ns.Path, "timeout", timeout)
+	// Get a context to use for auditing.
+	auditContext, auditCancel, err := getAuditContext(ctx)
+	if err != nil {
+		return err
 	}
+	defer auditCancel()
 
 	var status eventlogger.Status
 	if hasAuditPipelines(b.broker) {
@@ -361,29 +344,12 @@ func (b *Broker) LogResponse(ctx context.Context, in *logical.LogInput) (retErr
 
 	e.Data = in
 
-	// Evaluate whether we need a new context for auditing.
-	var auditContext context.Context
-	if isContextViable(ctx) {
-		auditContext = ctx
-	} else {
-		// In cases where we are trying to audit the response, and the existing
-		// context is not viable due to a short deadline, we detach ourselves from
-		// the original context (keeping only the namespace).
-		// This is so that we get a fair run at writing audit entries if Vault
-		// has taken up a lot of time handling the request before audit (response)
-		// is triggered. Pipeline nodes and the eventlogger.Broker may check for a
-		// cancelled context and refuse to process the nodes further.
-		ns, err := nshelper.FromContext(ctx)
-		if err != nil {
-			return fmt.Errorf("namespace missing from context: %w", err)
-		}
-
-		tempContext, auditCancel := context.WithTimeout(context.Background(), timeout)
-		defer auditCancel()
-		auditContext = nshelper.ContextWithNamespace(tempContext, ns)
-
-		b.logger.Trace("log response requires a derived context (original context was not viable)", "namespace ID", ns.ID, "namespace path", ns.Path, "timeout", timeout)
+	// Get a context to use for auditing.
+	auditContext, auditCancel, err := getAuditContext(ctx)
+	if err != nil {
+		return err
 	}
+	defer auditCancel()
 
 	var status eventlogger.Status
 	if hasAuditPipelines(b.broker) {
@@ -463,33 +429,19 @@ func (b *Broker) IsRegistered(name string) bool {
 	return b.isRegisteredByName(name)
 }
 
-// isContextViable examines the supplied context to see if its own deadline would
-// occur later than a newly created context with a specific timeout.
-// Additionally, whether the supplied context is already cancelled, thus making it
-// unviable.
-// If the existing context is viable it can be used 'as-is', if not, the caller
-// should consider creating a new context with the relevant deadline and associated
-// context values (e.g. namespace) in order to reduce the likelihood that the
-// audit system believes there is a failure in audit (and updating its metrics)
-// when the root cause is elsewhere.
-func isContextViable(ctx context.Context) bool {
-	if ctx == nil {
-		return false
-	}
-
-	select {
-	case <-ctx.Done():
-		return false
-	default:
+// getAuditContext extracts the namespace from the specified context and returns
+// a new context and cancelation function, completely detached from the original
+// with a timeout.
+// NOTE: The context.CancelFunc returned from this function should be deferred
+// immediately by the caller to prevent resource leaks.
+func getAuditContext(ctx context.Context) (context.Context, context.CancelFunc, error) {
+	ns, err := nshelper.FromContext(ctx)
+	if err != nil {
+		return nil, nil, fmt.Errorf("namespace missing from context: %w", err)
 	}
 
-	deadline, hasDeadline := ctx.Deadline()
-
-	// If there's no deadline on the context then we don't need to worry about
-	// it being cancelled due to a timeout.
-	if !hasDeadline {
-		return true
-	}
+	tempContext := nshelper.ContextWithNamespace(context.Background(), ns)
+	auditContext, auditCancel := context.WithTimeout(tempContext, timeout)
 
-	return deadline.After(time.Now().Add(timeout))
+	return auditContext, auditCancel, nil
 }
diff --git a/audit/broker_test.go b/audit/broker_test.go
index 1fd6d545771c..a7bdc960d128 100644
--- a/audit/broker_test.go
+++ b/audit/broker_test.go
@@ -159,54 +159,51 @@ func BenchmarkAuditBroker_File_Request_DevNull(b *testing.B) {
 	})
 }
 
-// TestBroker_isContextViable_basics checks the expected result of isContextViable
-// for basic inputs such as nil, cancelled context and a never-ending context.
-func TestBroker_isContextViable_basics(t *testing.T) {
+// TestBroker_getAuditContext_NoNamespace checks that we get the right error when
+// trying to get an audit context with no namespace.
+func TestBroker_getAuditContext_NoNamespace(t *testing.T) {
 	t.Parallel()
 
-	require.False(t, isContextViable(nil))
-	ctx, cancel := context.WithCancel(context.Background())
-	cancel()
-	require.False(t, isContextViable(ctx))
-	require.True(t, isContextViable(context.Background()))
+	_, _, err := getAuditContext(context.Background())
+	require.Error(t, err)
+	require.EqualError(t, err, "namespace missing from context: no namespace")
 }
 
-// TestBroker_isContextViable_timeouts checks the expected result of isContextViable
-// for various timeout durations.
-func TestBroker_isContextViable_timeouts(t *testing.T) {
+// TestBroker_getAuditContext checks that we get a context back which isn't linked
+// to the original context, and contains our namespace.
+func TestBroker_getAuditContext(t *testing.T) {
 	t.Parallel()
 
-	tests := map[string]struct {
-		timeout  time.Duration
-		expected bool
-	}{
-		"2s-smaller-deadline": {
-			timeout:  timeout - 2*time.Second,
-			expected: false,
-		},
-		"same-deadline": {
-			timeout:  timeout,
-			expected: false, // Expected as a near miss
-		},
-		"same-deadline-plus": {
-			timeout:  timeout + 100*time.Millisecond,
-			expected: true,
-		},
-		"2x-longer-deadline": {
-			timeout:  timeout * 2,
-			expected: true,
-		},
+	// context with namespace
+	ns := &nshelper.Namespace{
+		ID:   "foo",
+		Path: "foo/",
 	}
 
-	for name, tc := range tests {
-		name := name
-		tc := tc
-		t.Run(name, func(t *testing.T) {
-			t.Parallel()
+	// Create a context with a namespace.
+	originalContext, originalCancel := context.WithCancel(context.Background())
+	t.Cleanup(originalCancel)
+	nsContext := nshelper.ContextWithNamespace(originalContext, ns)
 
-			ctx, cancel := context.WithTimeout(context.Background(), tc.timeout)
-			t.Cleanup(func() { cancel() })
-			require.Equal(t, tc.expected, isContextViable(ctx))
-		})
-	}
+	// Get the audit context
+	auditContext, auditCancel, err := getAuditContext(nsContext)
+	t.Cleanup(auditCancel)
+
+	require.NoError(t, err)
+	require.NotNil(t, auditContext)
+	require.NotNil(t, auditCancel)
+
+	// Ensure the namespace is there too.
+	val, err := nshelper.FromContext(auditContext)
+	require.NoError(t, err)
+	require.Equal(t, ns, val)
+
+	// Now cancel the original context and ensure it is done but audit context isn't.
+	originalCancel()
+	require.NotNil(t, originalContext.Err())
+	require.Nil(t, auditContext.Err())
+
+	// Now cancel the audit context and ensure that it is done.
+	auditCancel()
+	require.NotNil(t, auditContext.Err())
 }

From 3bb65977cd6e596f3f543b9bf77aea7cd67af052 Mon Sep 17 00:00:00 2001
From: Peter Wilson <peter.wilson@hashicorp.com>
Date: Thu, 5 Sep 2024 12:56:42 +0100
Subject: [PATCH 2/3] improve go doc

---
 audit/broker.go | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/audit/broker.go b/audit/broker.go
index 13845267362e..1f9a859d3881 100644
--- a/audit/broker.go
+++ b/audit/broker.go
@@ -432,8 +432,8 @@ func (b *Broker) IsRegistered(name string) bool {
 // getAuditContext extracts the namespace from the specified context and returns
 // a new context and cancelation function, completely detached from the original
 // with a timeout.
-// NOTE: The context.CancelFunc returned from this function should be deferred
-// immediately by the caller to prevent resource leaks.
+// NOTE: When error is nil, the context.CancelFunc returned from this function
+// should be deferred immediately by the caller to prevent resource leaks.
 func getAuditContext(ctx context.Context) (context.Context, context.CancelFunc, error) {
 	ns, err := nshelper.FromContext(ctx)
 	if err != nil {

From 1c1b55eb2e241c76aa3aba9843b46bd0a0897317 Mon Sep 17 00:00:00 2001
From: Peter Wilson <peter.wilson@hashicorp.com>
Date: Thu, 5 Sep 2024 13:22:46 +0100
Subject: [PATCH 3/3] changelog

---
 changelog/28286.txt | 3 +++
 1 file changed, 3 insertions(+)
 create mode 100644 changelog/28286.txt

diff --git a/changelog/28286.txt b/changelog/28286.txt
new file mode 100644
index 000000000000..c7010a199241
--- /dev/null
+++ b/changelog/28286.txt
@@ -0,0 +1,3 @@
+```release-note:improvement
+audit: Internal implementation changes to the audit subsystem which improve relability.
+```
\ No newline at end of file