From 01565f88cbc1a7d7465d804c60330009c7894c81 Mon Sep 17 00:00:00 2001 From: Joshua Hesketh Date: Wed, 31 Jan 2024 12:52:44 +1100 Subject: [PATCH 01/11] querytee: Add support to log comparatively slow queries This adds `proxy.log-slow-query-response-threshold` which when enabled will log any queries that take longer than the configured threshold on one backend compared to the fastest backend. --- tools/querytee/proxy.go | 4 +- tools/querytee/proxy_endpoint.go | 66 +++++++++++++--- tools/querytee/proxy_endpoint_test.go | 110 +++++++++++++++++++++++++- tools/querytee/proxy_metrics.go | 2 + 4 files changed, 167 insertions(+), 15 deletions(-) diff --git a/tools/querytee/proxy.go b/tools/querytee/proxy.go index a3263a939ad..b0a05b22bb1 100644 --- a/tools/querytee/proxy.go +++ b/tools/querytee/proxy.go @@ -34,6 +34,7 @@ type ProxyConfig struct { PreferredBackend string BackendReadTimeout time.Duration CompareResponses bool + LogSlowQueryResponseThreshold time.Duration ValueComparisonTolerance float64 UseRelativeError bool PassThroughNonRegisteredRoutes bool @@ -51,6 +52,7 @@ func (cfg *ProxyConfig) RegisterFlags(f *flag.FlagSet) { f.StringVar(&cfg.PreferredBackend, "backend.preferred", "", "The hostname of the preferred backend when selecting the response to send back to the client. If no preferred backend is configured then the query-tee will send back to the client the first successful response received without waiting for other backends.") f.DurationVar(&cfg.BackendReadTimeout, "backend.read-timeout", 150*time.Second, "The timeout when reading the response from a backend.") f.BoolVar(&cfg.CompareResponses, "proxy.compare-responses", false, "Compare responses between preferred and secondary endpoints for supported routes.") + f.DurationVar(&cfg.LogSlowQueryResponseThreshold, "proxy.log-slow-query-response-threshold", 10*time.Second, "The relative elapsed time between backends for when to log queries. 0 to disable.") f.Float64Var(&cfg.ValueComparisonTolerance, "proxy.value-comparison-tolerance", 0.000001, "The tolerance to apply when comparing floating point values in the responses. 0 to disable tolerance and require exact match (not recommended).") f.BoolVar(&cfg.UseRelativeError, "proxy.compare-use-relative-error", false, "Use relative error tolerance when comparing floating point values.") f.DurationVar(&cfg.SkipRecentSamples, "proxy.compare-skip-recent-samples", 2*time.Minute, "The window from now to skip comparing samples. 0 to disable.") @@ -204,7 +206,7 @@ func (p *Proxy) Start() error { if p.cfg.CompareResponses { comparator = route.ResponseComparator } - router.Path(route.Path).Methods(route.Methods...).Handler(NewProxyEndpoint(p.backends, route.RouteName, p.metrics, p.logger, comparator)) + router.Path(route.Path).Methods(route.Methods...).Handler(NewProxyEndpoint(p.backends, route.RouteName, p.metrics, p.logger, comparator, p.cfg.LogSlowQueryResponseThreshold)) } if p.cfg.PassThroughNonRegisteredRoutes { diff --git a/tools/querytee/proxy_endpoint.go b/tools/querytee/proxy_endpoint.go index b8aff2b718f..56430012fe6 100644 --- a/tools/querytee/proxy_endpoint.go +++ b/tools/querytee/proxy_endpoint.go @@ -23,10 +23,11 @@ type ResponsesComparator interface { } type ProxyEndpoint struct { - backends []*ProxyBackend - metrics *ProxyMetrics - logger log.Logger - comparator ResponsesComparator + backends []*ProxyBackend + metrics *ProxyMetrics + logger log.Logger + comparator ResponsesComparator + slowResponseThreshold time.Duration // Whether for this endpoint there's a preferred backend configured. hasPreferredBackend bool @@ -35,7 +36,7 @@ type ProxyEndpoint struct { routeName string } -func NewProxyEndpoint(backends []*ProxyBackend, routeName string, metrics *ProxyMetrics, logger log.Logger, comparator ResponsesComparator) *ProxyEndpoint { +func NewProxyEndpoint(backends []*ProxyBackend, routeName string, metrics *ProxyMetrics, logger log.Logger, comparator ResponsesComparator, slowResponseThreshold time.Duration) *ProxyEndpoint { hasPreferredBackend := false for _, backend := range backends { if backend.preferred { @@ -45,12 +46,13 @@ func NewProxyEndpoint(backends []*ProxyBackend, routeName string, metrics *Proxy } return &ProxyEndpoint{ - backends: backends, - routeName: routeName, - metrics: metrics, - logger: logger, - comparator: comparator, - hasPreferredBackend: hasPreferredBackend, + backends: backends, + routeName: routeName, + metrics: metrics, + logger: logger, + comparator: comparator, + slowResponseThreshold: slowResponseThreshold, + hasPreferredBackend: hasPreferredBackend, } } @@ -82,6 +84,7 @@ func (p *ProxyEndpoint) executeBackendRequests(req *http.Request, resCh chan *ba body []byte responses = make([]*backendResponse, 0, len(p.backends)) responsesMtx = sync.Mutex{} + timingMtx = sync.Mutex{} query = req.URL.RawQuery ) @@ -104,6 +107,14 @@ func (p *ProxyEndpoint) executeBackendRequests(req *http.Request, resCh chan *ba level.Debug(p.logger).Log("msg", "Received request", "path", req.URL.Path, "query", query) + // Keep track of the fastest and slowest backends + var ( + fastestDuration time.Duration + fastestBackend *ProxyBackend + slowestDuration time.Duration + slowestBackend *ProxyBackend + ) + wg.Add(len(p.backends)) for _, b := range p.backends { b := b @@ -122,6 +133,19 @@ func (p *ProxyEndpoint) executeBackendRequests(req *http.Request, resCh chan *ba elapsed := time.Since(start) contentType := "" + if p.slowResponseThreshold > 0 { + timingMtx.Lock() + if elapsed > slowestDuration { + slowestDuration = elapsed + slowestBackend = b + } + if fastestDuration == 0 || elapsed < fastestDuration { + fastestDuration = elapsed + fastestBackend = b + } + timingMtx.Unlock() + } + if resp != nil { contentType = resp.Header.Get("Content-Type") } @@ -132,6 +156,7 @@ func (p *ProxyEndpoint) executeBackendRequests(req *http.Request, resCh chan *ba contentType: contentType, body: body, err: err, + elapsedTime: elapsed, } // Log with a level based on the backend response. @@ -174,6 +199,8 @@ func (p *ProxyEndpoint) executeBackendRequests(req *http.Request, resCh chan *ba "query", query, "user", req.Header.Get("X-Scope-OrgID"), "err", err, + "expected-response-duration", expectedResponse.elapsedTime, + "actual-response-duration", actualResponse.elapsedTime, ) } else if result == ComparisonSkipped { level.Warn(p.logger).Log( @@ -182,6 +209,22 @@ func (p *ProxyEndpoint) executeBackendRequests(req *http.Request, resCh chan *ba "query", query, "user", req.Header.Get("X-Scope-OrgID"), "err", err, + "expected-response-duration", expectedResponse.elapsedTime, + "actual-response-duration", actualResponse.elapsedTime, + ) + } + + // Log queries that are slower in some backends than others + if p.slowResponseThreshold > 0 && slowestDuration-fastestDuration >= p.slowResponseThreshold { + level.Warn(p.logger).Log( + "msg", "response time between backends exceeded threshold", + "route-name", p.routeName, + "query", query, + "user", req.Header.Get("X-Scope-OrgID"), + "slowest-duration", slowestDuration, + "slowest-backend", slowestBackend.name, + "fastest-duration", fastestDuration, + "fastest-backend", fastestBackend.name, ) } @@ -254,6 +297,7 @@ type backendResponse struct { contentType string body []byte err error + elapsedTime time.Duration } func (r *backendResponse) succeeded() bool { diff --git a/tools/querytee/proxy_endpoint_test.go b/tools/querytee/proxy_endpoint_test.go index 707384c799a..d193a4079e8 100644 --- a/tools/querytee/proxy_endpoint_test.go +++ b/tools/querytee/proxy_endpoint_test.go @@ -104,7 +104,7 @@ func Test_ProxyEndpoint_waitBackendResponseForDownstream(t *testing.T) { testData := testData t.Run(testName, func(t *testing.T) { - endpoint := NewProxyEndpoint(testData.backends, "test", NewProxyMetrics(nil), log.NewNopLogger(), nil) + endpoint := NewProxyEndpoint(testData.backends, "test", NewProxyMetrics(nil), log.NewNopLogger(), nil, 0) // Send the responses from a dedicated goroutine. resCh := make(chan *backendResponse) @@ -148,7 +148,7 @@ func Test_ProxyEndpoint_Requests(t *testing.T) { NewProxyBackend("backend-1", backendURL1, time.Second, true, false), NewProxyBackend("backend-2", backendURL2, time.Second, false, false), } - endpoint := NewProxyEndpoint(backends, "test", NewProxyMetrics(nil), log.NewNopLogger(), nil) + endpoint := NewProxyEndpoint(backends, "test", NewProxyMetrics(nil), log.NewNopLogger(), nil, 0) for _, tc := range []struct { name string @@ -322,7 +322,7 @@ func Test_ProxyEndpoint_Comparison(t *testing.T) { comparisonError: scenario.comparatorError, } - endpoint := NewProxyEndpoint(backends, "test", NewProxyMetrics(reg), logger, comparator) + endpoint := NewProxyEndpoint(backends, "test", NewProxyMetrics(reg), logger, comparator, 0) resp := httptest.NewRecorder() req, err := http.NewRequest("GET", "http://test/api/v1/test", nil) @@ -350,6 +350,97 @@ func Test_ProxyEndpoint_Comparison(t *testing.T) { } } +func Test_ProxyEndpoint_LogSlowQueries(t *testing.T) { + scenarios := map[string]struct { + slowResponseThreshold time.Duration + preferredResponseLatency time.Duration + secondaryResponseLatency time.Duration + }{ + "responses are below threshold": { + slowResponseThreshold: 100 * time.Millisecond, + preferredResponseLatency: 0 * time.Millisecond, + secondaryResponseLatency: 0 * time.Millisecond, + }, + "one response above threshold": { + slowResponseThreshold: 100 * time.Millisecond, + preferredResponseLatency: 0 * time.Millisecond, + secondaryResponseLatency: 101 * time.Millisecond, + }, + "responses are both above threshold, but lower than threshold between themselves": { + slowResponseThreshold: 100 * time.Millisecond, + preferredResponseLatency: 101 * time.Millisecond, + secondaryResponseLatency: 150 * time.Millisecond, + }, + "responses are both above threshold, and above threshold between themselves": { + slowResponseThreshold: 100 * time.Millisecond, + preferredResponseLatency: 101 * time.Millisecond, + secondaryResponseLatency: 202 * time.Millisecond, + }, + } + + for name, scenario := range scenarios { + t.Run(name, func(t *testing.T) { + preferredBackend := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + w.Header().Set("Content-Type", "application/json") + w.WriteHeader(200) + time.Sleep(scenario.preferredResponseLatency) + _, err := w.Write([]byte("preferred response")) + require.NoError(t, err) + })) + + defer preferredBackend.Close() + preferredBackendURL, err := url.Parse(preferredBackend.URL) + require.NoError(t, err) + + secondaryBackend := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + w.Header().Set("Content-Type", "application/json") + w.WriteHeader(200) + time.Sleep(scenario.secondaryResponseLatency) + _, err := w.Write([]byte("preferred response")) + require.NoError(t, err) + })) + + defer secondaryBackend.Close() + secondaryBackendURL, err := url.Parse(secondaryBackend.URL) + require.NoError(t, err) + + backends := []*ProxyBackend{ + NewProxyBackend("preferred-backend", preferredBackendURL, time.Second, true, false), + NewProxyBackend("secondary-backend", secondaryBackendURL, time.Second, false, false), + } + + logger := newMockLogger() + reg := prometheus.NewPedanticRegistry() + comparator := &mockComparator{ + comparisonResult: ComparisonSuccess, + } + + endpoint := NewProxyEndpoint(backends, "test", NewProxyMetrics(reg), logger, comparator, scenario.slowResponseThreshold) + + resp := httptest.NewRecorder() + req, err := http.NewRequest("GET", "http://test/api/v1/test", nil) + require.NoError(t, err) + endpoint.ServeHTTP(resp, req) + + // The HTTP request above will return as soon as the primary response is received, but this doesn't guarantee that the response comparison has been completed. + // Wait for the response comparison to complete before checking the logged messages. + waitForResponseComparisonMetric(t, reg, ComparisonSuccess) + + for _, m := range logger.messages { + fmt.Println(m) + } + if (scenario.preferredResponseLatency >= scenario.secondaryResponseLatency && + scenario.preferredResponseLatency-scenario.secondaryResponseLatency < scenario.slowResponseThreshold) || + (scenario.secondaryResponseLatency >= scenario.preferredResponseLatency && + scenario.secondaryResponseLatency-scenario.preferredResponseLatency < scenario.slowResponseThreshold) { + requireNoLogMessages(t, logger.messages, "response time between backends exceeded threshold") + } else { + requireLogMessage(t, logger.messages, "response time between backends exceeded threshold") + } + }) + } +} + func waitForResponseComparisonMetric(t *testing.T, g prometheus.Gatherer, expectedResult ComparisonResult) { started := time.Now() timeoutAt := started.Add(2 * time.Second) @@ -374,6 +465,19 @@ func waitForResponseComparisonMetric(t *testing.T, g prometheus.Gatherer, expect } } +func requireLogMessage(t *testing.T, messages []map[string]interface{}, expectedMessage string) { + sawMessage := false + + for _, m := range messages { + if m["msg"] == expectedMessage { + sawMessage = true + break + } + } + + require.True(t, sawMessage, "expected to find a '%s' message logged, but only these messages were logged: %v", expectedMessage, messages) +} + func requireNoLogMessages(t *testing.T, messages []map[string]interface{}, forbiddenMessages ...string) { for _, m := range messages { msg := m["msg"] diff --git a/tools/querytee/proxy_metrics.go b/tools/querytee/proxy_metrics.go index 019c9dea1d0..e91dd15ca11 100644 --- a/tools/querytee/proxy_metrics.go +++ b/tools/querytee/proxy_metrics.go @@ -43,6 +43,8 @@ func NewProxyMetrics(registerer prometheus.Registerer) *ProxyMetrics { Name: "responses_compared_total", Help: "Total number of responses compared per route name by result.", }, []string{"route", "result"}), + /// add in metrics for latency + // can have a label per backend } return m From f692dfeb78005cf063b000c9ee79f43e14948799 Mon Sep 17 00:00:00 2001 From: Joshua Hesketh Date: Fri, 9 Feb 2024 21:34:28 +1100 Subject: [PATCH 02/11] Add docs --- docs/sources/mimir/manage/tools/query-tee.md | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/docs/sources/mimir/manage/tools/query-tee.md b/docs/sources/mimir/manage/tools/query-tee.md index c81726ab275..51453bc2a14 100644 --- a/docs/sources/mimir/manage/tools/query-tee.md +++ b/docs/sources/mimir/manage/tools/query-tee.md @@ -125,6 +125,13 @@ When the query results comparison is enabled, the query-tee compares the respons > If either Mimir cluster is running with a non-default value of `-ruler.evaluation-delay-duration`, we recommend setting `-proxy.compare-skip-recent-samples` to 1 minute more than the > value of `-ruler.evaluation-delay-duration`. +### Slow query log + +The query-tee can optionally log requests that take longer on one backend than it did compared to the fastest backend. +The logging is enabled by setting the flag `proxy.log-slow-query-response-threshold` to a duration whereby if two backends differ by the specified amount the query is logged. + +> **Note**: Slow query logging it disabled by default (by setting to 0). + ### Exported metrics The query-tee exposes the following Prometheus metrics at the `/metrics` endpoint listening on the port configured via the flag `-server.metrics-port`: From 20aeaf7d2b4b4027172b8fc2e54d7a29b11a8a1e Mon Sep 17 00:00:00 2001 From: Joshua Hesketh Date: Fri, 9 Feb 2024 21:34:34 +1100 Subject: [PATCH 03/11] Add changelog --- CHANGELOG.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 39e2ac28fd8..4ffe776ac34 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -167,6 +167,8 @@ ### Query-tee +* [ENHANCEMENT] Log queries that take longer than `proxy.log-slow-query-response-threshold` when compared to other backends. #7346 + ### Documentation * [ENHANCEMENT] Added runbook for `KubePersistentVolumeFillingUp` alert. #7297 From 480b36cea26425917262f94688e6a4661911b4b8 Mon Sep 17 00:00:00 2001 From: Joshua Hesketh Date: Mon, 19 Feb 2024 22:39:37 +1100 Subject: [PATCH 04/11] Apply suggestions from code review Co-authored-by: George Krajcsovits --- tools/querytee/proxy.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/querytee/proxy.go b/tools/querytee/proxy.go index b0a05b22bb1..55162505dd0 100644 --- a/tools/querytee/proxy.go +++ b/tools/querytee/proxy.go @@ -52,7 +52,7 @@ func (cfg *ProxyConfig) RegisterFlags(f *flag.FlagSet) { f.StringVar(&cfg.PreferredBackend, "backend.preferred", "", "The hostname of the preferred backend when selecting the response to send back to the client. If no preferred backend is configured then the query-tee will send back to the client the first successful response received without waiting for other backends.") f.DurationVar(&cfg.BackendReadTimeout, "backend.read-timeout", 150*time.Second, "The timeout when reading the response from a backend.") f.BoolVar(&cfg.CompareResponses, "proxy.compare-responses", false, "Compare responses between preferred and secondary endpoints for supported routes.") - f.DurationVar(&cfg.LogSlowQueryResponseThreshold, "proxy.log-slow-query-response-threshold", 10*time.Second, "The relative elapsed time between backends for when to log queries. 0 to disable.") + f.DurationVar(&cfg.LogSlowQueryResponseThreshold, "proxy.log-slow-query-response-threshold", 10*time.Second, "The minimum difference in response time between slowest and fastest back-end over which to log the query. 0 to disable.") f.Float64Var(&cfg.ValueComparisonTolerance, "proxy.value-comparison-tolerance", 0.000001, "The tolerance to apply when comparing floating point values in the responses. 0 to disable tolerance and require exact match (not recommended).") f.BoolVar(&cfg.UseRelativeError, "proxy.compare-use-relative-error", false, "Use relative error tolerance when comparing floating point values.") f.DurationVar(&cfg.SkipRecentSamples, "proxy.compare-skip-recent-samples", 2*time.Minute, "The window from now to skip comparing samples. 0 to disable.") From 54d22523a6dc4da1bc26a2230851f8affcb6df26 Mon Sep 17 00:00:00 2001 From: Joshua Hesketh Date: Mon, 19 Feb 2024 22:40:50 +1100 Subject: [PATCH 05/11] Remove accidental comment --- tools/querytee/proxy_metrics.go | 2 -- 1 file changed, 2 deletions(-) diff --git a/tools/querytee/proxy_metrics.go b/tools/querytee/proxy_metrics.go index e91dd15ca11..019c9dea1d0 100644 --- a/tools/querytee/proxy_metrics.go +++ b/tools/querytee/proxy_metrics.go @@ -43,8 +43,6 @@ func NewProxyMetrics(registerer prometheus.Registerer) *ProxyMetrics { Name: "responses_compared_total", Help: "Total number of responses compared per route name by result.", }, []string{"route", "result"}), - /// add in metrics for latency - // can have a label per backend } return m From df2ea8dee964139913620a1f6a2bc3d553a0093e Mon Sep 17 00:00:00 2001 From: Joshua Hesketh Date: Tue, 20 Feb 2024 13:04:39 +1100 Subject: [PATCH 06/11] Update docs/sources/mimir/manage/tools/query-tee.md Co-authored-by: Jack Baldry --- docs/sources/mimir/manage/tools/query-tee.md | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/docs/sources/mimir/manage/tools/query-tee.md b/docs/sources/mimir/manage/tools/query-tee.md index 51453bc2a14..3a1eda28950 100644 --- a/docs/sources/mimir/manage/tools/query-tee.md +++ b/docs/sources/mimir/manage/tools/query-tee.md @@ -127,10 +127,11 @@ When the query results comparison is enabled, the query-tee compares the respons ### Slow query log -The query-tee can optionally log requests that take longer on one backend than it did compared to the fastest backend. -The logging is enabled by setting the flag `proxy.log-slow-query-response-threshold` to a duration whereby if two backends differ by the specified amount the query is logged. +You can configure query-tee to log requests that take longer than the fastest backend by by setting the flag `-proxy.log-slow-query-response-threshold`. -> **Note**: Slow query logging it disabled by default (by setting to 0). +The default value is `10s` which logs requests that are ten seconds slower than the fastest backend. + +To disable slow query logging, set `-proxy.log-slow-query-response-threshold` to `0`. ### Exported metrics From 4eb115a1d31290aa89f6f5d55bf87de72f4ca425 Mon Sep 17 00:00:00 2001 From: Joshua Hesketh Date: Fri, 8 Mar 2024 13:27:56 +1100 Subject: [PATCH 07/11] Apply suggestions from code review Co-authored-by: Arve Knudsen --- docs/sources/mimir/manage/tools/query-tee.md | 2 +- tools/querytee/proxy_endpoint.go | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/sources/mimir/manage/tools/query-tee.md b/docs/sources/mimir/manage/tools/query-tee.md index 23055548a4d..8ca1bb2e5b6 100644 --- a/docs/sources/mimir/manage/tools/query-tee.md +++ b/docs/sources/mimir/manage/tools/query-tee.md @@ -137,7 +137,7 @@ If either Mimir cluster is running with a non-default value of `-ruler.evaluatio ### Slow query log -You can configure query-tee to log requests that take longer than the fastest backend by by setting the flag `-proxy.log-slow-query-response-threshold`. +You can configure query-tee to log requests that take longer than the fastest backend by setting the flag `-proxy.log-slow-query-response-threshold`. The default value is `10s` which logs requests that are ten seconds slower than the fastest backend. diff --git a/tools/querytee/proxy_endpoint.go b/tools/querytee/proxy_endpoint.go index 56430012fe6..ed0413b33b8 100644 --- a/tools/querytee/proxy_endpoint.go +++ b/tools/querytee/proxy_endpoint.go @@ -217,7 +217,7 @@ func (p *ProxyEndpoint) executeBackendRequests(req *http.Request, resCh chan *ba // Log queries that are slower in some backends than others if p.slowResponseThreshold > 0 && slowestDuration-fastestDuration >= p.slowResponseThreshold { level.Warn(p.logger).Log( - "msg", "response time between backends exceeded threshold", + "msg", "response time difference between backends exceeded threshold", "route-name", p.routeName, "query", query, "user", req.Header.Get("X-Scope-OrgID"), From b0b198f08cd4f8db02dc32c152ddbbc359d126ff Mon Sep 17 00:00:00 2001 From: Joshua Hesketh Date: Fri, 8 Mar 2024 13:44:08 +1100 Subject: [PATCH 08/11] Use snake_case on logs Note: this changes the pre-existing `route-name` to `route_name` --- tools/querytee/proxy_endpoint.go | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/tools/querytee/proxy_endpoint.go b/tools/querytee/proxy_endpoint.go index ed0413b33b8..6bc54df452a 100644 --- a/tools/querytee/proxy_endpoint.go +++ b/tools/querytee/proxy_endpoint.go @@ -195,22 +195,22 @@ func (p *ProxyEndpoint) executeBackendRequests(req *http.Request, resCh chan *ba if result == ComparisonFailed { level.Error(p.logger).Log( "msg", "response comparison failed", - "route-name", p.routeName, + "route_name", p.routeName, "query", query, "user", req.Header.Get("X-Scope-OrgID"), "err", err, - "expected-response-duration", expectedResponse.elapsedTime, - "actual-response-duration", actualResponse.elapsedTime, + "expected_response_duration", expectedResponse.elapsedTime, + "actual_response_duration", actualResponse.elapsedTime, ) } else if result == ComparisonSkipped { level.Warn(p.logger).Log( "msg", "response comparison skipped", - "route-name", p.routeName, + "route_name", p.routeName, "query", query, "user", req.Header.Get("X-Scope-OrgID"), "err", err, - "expected-response-duration", expectedResponse.elapsedTime, - "actual-response-duration", actualResponse.elapsedTime, + "expected_response_duration", expectedResponse.elapsedTime, + "actual_response_duration", actualResponse.elapsedTime, ) } @@ -218,13 +218,13 @@ func (p *ProxyEndpoint) executeBackendRequests(req *http.Request, resCh chan *ba if p.slowResponseThreshold > 0 && slowestDuration-fastestDuration >= p.slowResponseThreshold { level.Warn(p.logger).Log( "msg", "response time difference between backends exceeded threshold", - "route-name", p.routeName, + "route_name", p.routeName, "query", query, "user", req.Header.Get("X-Scope-OrgID"), - "slowest-duration", slowestDuration, - "slowest-backend", slowestBackend.name, - "fastest-duration", fastestDuration, - "fastest-backend", fastestBackend.name, + "slowest_duration", slowestDuration, + "slowest_backend", slowestBackend.name, + "fastest_duration", fastestDuration, + "fastest_backend", fastestBackend.name, ) } From 41c3a6dd455b00c3a29ac8babf8e06afb9f29d03 Mon Sep 17 00:00:00 2001 From: Joshua Hesketh Date: Fri, 8 Mar 2024 13:44:22 +1100 Subject: [PATCH 09/11] Add extra test and make simplify logic --- tools/querytee/proxy_endpoint_test.go | 59 ++++++++++++++++----------- 1 file changed, 35 insertions(+), 24 deletions(-) diff --git a/tools/querytee/proxy_endpoint_test.go b/tools/querytee/proxy_endpoint_test.go index d193a4079e8..40335ab802e 100644 --- a/tools/querytee/proxy_endpoint_test.go +++ b/tools/querytee/proxy_endpoint_test.go @@ -352,29 +352,46 @@ func Test_ProxyEndpoint_Comparison(t *testing.T) { func Test_ProxyEndpoint_LogSlowQueries(t *testing.T) { scenarios := map[string]struct { - slowResponseThreshold time.Duration - preferredResponseLatency time.Duration - secondaryResponseLatency time.Duration + slowResponseThreshold time.Duration + preferredResponseLatency time.Duration + secondaryResponseLatency time.Duration + expectLatencyExceedsThreshold bool }{ "responses are below threshold": { - slowResponseThreshold: 100 * time.Millisecond, - preferredResponseLatency: 0 * time.Millisecond, - secondaryResponseLatency: 0 * time.Millisecond, + slowResponseThreshold: 100 * time.Millisecond, + preferredResponseLatency: 0 * time.Millisecond, + secondaryResponseLatency: 0 * time.Millisecond, + expectLatencyExceedsThreshold: false, }, "one response above threshold": { - slowResponseThreshold: 100 * time.Millisecond, - preferredResponseLatency: 0 * time.Millisecond, - secondaryResponseLatency: 101 * time.Millisecond, + slowResponseThreshold: 100 * time.Millisecond, + preferredResponseLatency: 0 * time.Millisecond, + secondaryResponseLatency: 101 * time.Millisecond, + expectLatencyExceedsThreshold: true, }, "responses are both above threshold, but lower than threshold between themselves": { - slowResponseThreshold: 100 * time.Millisecond, - preferredResponseLatency: 101 * time.Millisecond, - secondaryResponseLatency: 150 * time.Millisecond, + slowResponseThreshold: 100 * time.Millisecond, + preferredResponseLatency: 101 * time.Millisecond, + secondaryResponseLatency: 150 * time.Millisecond, + expectLatencyExceedsThreshold: false, }, "responses are both above threshold, and above threshold between themselves": { - slowResponseThreshold: 100 * time.Millisecond, - preferredResponseLatency: 101 * time.Millisecond, - secondaryResponseLatency: 202 * time.Millisecond, + slowResponseThreshold: 100 * time.Millisecond, + preferredResponseLatency: 101 * time.Millisecond, + secondaryResponseLatency: 202 * time.Millisecond, + expectLatencyExceedsThreshold: true, + }, + "secondary latency is faster than primary, and difference is below threshold": { + slowResponseThreshold: 100 * time.Millisecond, + preferredResponseLatency: 50 * time.Millisecond, + secondaryResponseLatency: 0 * time.Millisecond, + expectLatencyExceedsThreshold: false, + }, + "secondary latency is faster than primary, and difference is above threshold": { + slowResponseThreshold: 100 * time.Millisecond, + preferredResponseLatency: 101 * time.Millisecond, + secondaryResponseLatency: 0 * time.Millisecond, + expectLatencyExceedsThreshold: true, }, } @@ -426,16 +443,10 @@ func Test_ProxyEndpoint_LogSlowQueries(t *testing.T) { // Wait for the response comparison to complete before checking the logged messages. waitForResponseComparisonMetric(t, reg, ComparisonSuccess) - for _, m := range logger.messages { - fmt.Println(m) - } - if (scenario.preferredResponseLatency >= scenario.secondaryResponseLatency && - scenario.preferredResponseLatency-scenario.secondaryResponseLatency < scenario.slowResponseThreshold) || - (scenario.secondaryResponseLatency >= scenario.preferredResponseLatency && - scenario.secondaryResponseLatency-scenario.preferredResponseLatency < scenario.slowResponseThreshold) { - requireNoLogMessages(t, logger.messages, "response time between backends exceeded threshold") + if scenario.expectLatencyExceedsThreshold { + requireLogMessage(t, logger.messages, "response time difference between backends exceeded threshold") } else { - requireLogMessage(t, logger.messages, "response time between backends exceeded threshold") + requireNoLogMessages(t, logger.messages, "response time difference between backends exceeded threshold") } }) } From 2ebe02834d3a72c1a909eefbbd4bcbeedd3c7ac6 Mon Sep 17 00:00:00 2001 From: Joshua Hesketh Date: Fri, 15 Mar 2024 20:53:56 +1100 Subject: [PATCH 10/11] Move changelog to correct position After merging in main, we need to put the changelog in unreleased. --- CHANGELOG.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 30f026a1fd9..35f16fb1c02 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -22,6 +22,8 @@ ### Query-tee +* [ENHANCEMENT] Log queries that take longer than `proxy.log-slow-query-response-threshold` when compared to other backends. #7346 + ### Documentation ### Tools @@ -242,7 +244,6 @@ * [BUGFIX] Fix issue where `Host` HTTP header was not being correctly changed for the proxy targets. #7386 * [ENHANCEMENT] Allow using the value of X-Scope-OrgID for basic auth username in the forwarded request if URL username is set as `__REQUEST_HEADER_X_SCOPE_ORGID__`. #7452 -* [ENHANCEMENT] Log queries that take longer than `proxy.log-slow-query-response-threshold` when compared to other backends. #7346 ### Documentation From 3b82265051b9c77f0eb9210a86c8b6124e1bfd6d Mon Sep 17 00:00:00 2001 From: Joshua Hesketh Date: Fri, 15 Mar 2024 21:19:53 +1100 Subject: [PATCH 11/11] Check the slowest/fastest backends are logged correctly --- tools/querytee/proxy_endpoint_test.go | 32 +++++++++++++++++++++------ 1 file changed, 25 insertions(+), 7 deletions(-) diff --git a/tools/querytee/proxy_endpoint_test.go b/tools/querytee/proxy_endpoint_test.go index 40335ab802e..97bf3f0b36e 100644 --- a/tools/querytee/proxy_endpoint_test.go +++ b/tools/querytee/proxy_endpoint_test.go @@ -356,6 +356,8 @@ func Test_ProxyEndpoint_LogSlowQueries(t *testing.T) { preferredResponseLatency time.Duration secondaryResponseLatency time.Duration expectLatencyExceedsThreshold bool + fastestBackend string + slowestBackend string }{ "responses are below threshold": { slowResponseThreshold: 100 * time.Millisecond, @@ -368,6 +370,8 @@ func Test_ProxyEndpoint_LogSlowQueries(t *testing.T) { preferredResponseLatency: 0 * time.Millisecond, secondaryResponseLatency: 101 * time.Millisecond, expectLatencyExceedsThreshold: true, + fastestBackend: "preferred-backend", + slowestBackend: "secondary-backend", }, "responses are both above threshold, but lower than threshold between themselves": { slowResponseThreshold: 100 * time.Millisecond, @@ -380,6 +384,8 @@ func Test_ProxyEndpoint_LogSlowQueries(t *testing.T) { preferredResponseLatency: 101 * time.Millisecond, secondaryResponseLatency: 202 * time.Millisecond, expectLatencyExceedsThreshold: true, + fastestBackend: "preferred-backend", + slowestBackend: "secondary-backend", }, "secondary latency is faster than primary, and difference is below threshold": { slowResponseThreshold: 100 * time.Millisecond, @@ -392,6 +398,8 @@ func Test_ProxyEndpoint_LogSlowQueries(t *testing.T) { preferredResponseLatency: 101 * time.Millisecond, secondaryResponseLatency: 0 * time.Millisecond, expectLatencyExceedsThreshold: true, + fastestBackend: "secondary-backend", + slowestBackend: "preferred-backend", }, } @@ -444,7 +452,11 @@ func Test_ProxyEndpoint_LogSlowQueries(t *testing.T) { waitForResponseComparisonMetric(t, reg, ComparisonSuccess) if scenario.expectLatencyExceedsThreshold { - requireLogMessage(t, logger.messages, "response time difference between backends exceeded threshold") + requireLogKeyValues(t, logger.messages, map[string]string{ + "msg": "response time difference between backends exceeded threshold", + "slowest_backend": scenario.slowestBackend, + "fastest_backend": scenario.fastestBackend, + }) } else { requireNoLogMessages(t, logger.messages, "response time difference between backends exceeded threshold") } @@ -476,17 +488,23 @@ func waitForResponseComparisonMetric(t *testing.T, g prometheus.Gatherer, expect } } -func requireLogMessage(t *testing.T, messages []map[string]interface{}, expectedMessage string) { - sawMessage := false - +func requireLogKeyValues(t *testing.T, messages []map[string]interface{}, targetKeyValues map[string]string) { + allKeyValuesMatch := false for _, m := range messages { - if m["msg"] == expectedMessage { - sawMessage = true + allKeyValuesMatch = true + for targetKey, targetValue := range targetKeyValues { + if value, exists := m[targetKey]; !exists || value != targetValue { + // Key does not exist or value does not match + allKeyValuesMatch = false + break + } + } + if allKeyValuesMatch { break } } - require.True(t, sawMessage, "expected to find a '%s' message logged, but only these messages were logged: %v", expectedMessage, messages) + require.True(t, allKeyValuesMatch, "expected to find a message logged with specific key-values: %s, but only these messages were logged: %v", targetKeyValues, messages) } func requireNoLogMessages(t *testing.T, messages []map[string]interface{}, forbiddenMessages ...string) {