From 12599824175ffe2da60003b9a2189b3a0fe0463e Mon Sep 17 00:00:00 2001 From: Coenen Benjamin Date: Tue, 30 May 2023 10:19:14 +0200 Subject: [PATCH] feat(telemetry): add configurable histogram buckets for metrics (#3098) You can customize the buckets for all generated histograms: ```yaml title="router.yaml" telemetry: metrics: common: buckets: - 0.05 - 0.10 - 0.25 - 0.50 - 1.00 - 2.50 - 5.00 - 10.00 - 20.00 ``` Fixes #2333 **Checklist** Complete the checklist (and note appropriate exceptions) before a final PR is raised. - [x] Changes are compatible[^1] - [x] Documentation[^2] completed - [ ] Performance impact assessed and acceptable - Tests added and passing[^3] - [x] Unit Tests - [ ] Integration Tests - [x] Manual Tests **Exceptions** *Note any exceptions here* **Notes** [^1]. It may be appropriate to bring upcoming changes to the attention of other (impacted) groups. Please endeavour to do this before seeking PR approval. The mechanism for doing this will vary considerably, so use your judgement as to how and when to do this. [^2]. Configuration is an important part of many changes. Where applicable please try to document configuration examples. [^3]. Tick whichever testing boxes are applicable. If you are adding Manual Tests: - please document the manual testing (extensively) in the Exceptions. - please raise a separate issue to automate the test and label it (or ask for it to be labeled) as `manual test` --------- Signed-off-by: Benjamin Coenen <5719034+bnjjj@users.noreply.github.com> --- .changesets/feat_swan_cub_foot_audience.md | 21 ++ ...nfiguration__tests__schema_generation.snap | 24 ++ apollo-router/src/plugins/telemetry/config.rs | 12 +- .../src/plugins/telemetry/metrics/otlp.rs | 4 +- .../plugins/telemetry/metrics/prometheus.rs | 4 +- apollo-router/src/plugins/telemetry/mod.rs | 281 ++++++++++++++++++ ...est_prometheus_metrics_custom_buckets.snap | 20 ++ docs/source/configuration/metrics.mdx | 20 ++ 8 files changed, 378 insertions(+), 8 deletions(-) create mode 100644 .changesets/feat_swan_cub_foot_audience.md create mode 100644 apollo-router/src/plugins/telemetry/snapshots/apollo_router__plugins__telemetry__tests__it_test_prometheus_metrics_custom_buckets.snap diff --git a/.changesets/feat_swan_cub_foot_audience.md b/.changesets/feat_swan_cub_foot_audience.md new file mode 100644 index 0000000000..89408bc37f --- /dev/null +++ b/.changesets/feat_swan_cub_foot_audience.md @@ -0,0 +1,21 @@ +### Configurable histogram buckets for metrics ([Issue #2333](https://github.com/apollographql/router/issues/2333)) + +You can customize the buckets for all generated histograms: + +```yaml title="router.yaml" +telemetry: + metrics: + common: + buckets: + - 0.05 + - 0.10 + - 0.25 + - 0.50 + - 1.00 + - 2.50 + - 5.00 + - 10.00 + - 20.00 +``` + +By [@bnjjj](https://github.com/bnjjj) in https://github.com/apollographql/router/pull/3098 \ No newline at end of file diff --git a/apollo-router/src/configuration/snapshots/apollo_router__configuration__tests__schema_generation.snap b/apollo-router/src/configuration/snapshots/apollo_router__configuration__tests__schema_generation.snap index f0cae7a315..9048ab187a 100644 --- a/apollo-router/src/configuration/snapshots/apollo_router__configuration__tests__schema_generation.snap +++ b/apollo-router/src/configuration/snapshots/apollo_router__configuration__tests__schema_generation.snap @@ -3711,6 +3711,28 @@ expression: "&schema" "additionalProperties": false, "nullable": true }, + "buckets": { + "description": "Custom buckets for histograms", + "default": [ + 0.001, + 0.005, + 0.015, + 0.05, + 0.1, + 0.2, + 0.3, + 0.4, + 0.5, + 1.0, + 5.0, + 10.0 + ], + "type": "array", + "items": { + "type": "number", + "format": "double" + } + }, "resources": { "description": "Resources", "default": {}, @@ -3721,11 +3743,13 @@ expression: "&schema" }, "service_name": { "description": "Set a service.name resource in your metrics", + "default": null, "type": "string", "nullable": true }, "service_namespace": { "description": "Set a service.namespace attribute in your metrics", + "default": null, "type": "string", "nullable": true } diff --git a/apollo-router/src/plugins/telemetry/config.rs b/apollo-router/src/plugins/telemetry/config.rs index 21bb70387f..afc99d28a1 100644 --- a/apollo-router/src/plugins/telemetry/config.rs +++ b/apollo-router/src/plugins/telemetry/config.rs @@ -80,7 +80,7 @@ pub(crate) struct Metrics { } #[derive(Clone, Default, Debug, Deserialize, JsonSchema)] -#[serde(deny_unknown_fields, rename_all = "snake_case")] +#[serde(deny_unknown_fields, rename_all = "snake_case", default)] pub(crate) struct MetricsCommon { /// Configuration to add custom labels/attributes to metrics pub(crate) attributes: Option, @@ -88,9 +88,17 @@ pub(crate) struct MetricsCommon { pub(crate) service_name: Option, /// Set a service.namespace attribute in your metrics pub(crate) service_namespace: Option, - #[serde(default)] /// Resources pub(crate) resources: HashMap, + /// Custom buckets for histograms + #[serde(default = "default_buckets")] + pub(crate) buckets: Vec, +} + +fn default_buckets() -> Vec { + vec![ + 0.001, 0.005, 0.015, 0.05, 0.1, 0.2, 0.3, 0.4, 0.5, 1.0, 5.0, 10.0, + ] } /// Tracing configuration diff --git a/apollo-router/src/plugins/telemetry/metrics/otlp.rs b/apollo-router/src/plugins/telemetry/metrics/otlp.rs index d4883a56be..de18f0f9f1 100644 --- a/apollo-router/src/plugins/telemetry/metrics/otlp.rs +++ b/apollo-router/src/plugins/telemetry/metrics/otlp.rs @@ -42,9 +42,7 @@ impl MetricsConfigurator for super::super::otlp::Config { Some(exporter) => { let exporter = opentelemetry_otlp::new_pipeline() .metrics( - selectors::simple::histogram([ - 0.001, 0.005, 0.015, 0.05, 0.1, 0.2, 0.3, 0.4, 0.5, 1.0, 5.0, 10.0, - ]), + selectors::simple::histogram(metrics_config.buckets.clone()), aggregation::stateless_temporality_selector(), opentelemetry::runtime::Tokio, ) diff --git a/apollo-router/src/plugins/telemetry/metrics/prometheus.rs b/apollo-router/src/plugins/telemetry/metrics/prometheus.rs index f830b31152..5d6e6c84bb 100644 --- a/apollo-router/src/plugins/telemetry/metrics/prometheus.rs +++ b/apollo-router/src/plugins/telemetry/metrics/prometheus.rs @@ -85,9 +85,7 @@ impl MetricsConfigurator for Config { if self.enabled { let mut controller = controllers::basic( processors::factory( - selectors::simple::histogram([ - 0.001, 0.005, 0.015, 0.05, 0.1, 0.2, 0.3, 0.4, 0.5, 1.0, 5.0, 10.0, - ]), + selectors::simple::histogram(metrics_config.buckets.clone()), aggregation::stateless_temporality_selector(), ) .with_memory(true), diff --git a/apollo-router/src/plugins/telemetry/mod.rs b/apollo-router/src/plugins/telemetry/mod.rs index 05f50f9005..ec2850d262 100644 --- a/apollo-router/src/plugins/telemetry/mod.rs +++ b/apollo-router/src/plugins/telemetry/mod.rs @@ -2098,6 +2098,287 @@ mod tests { assert_snapshot!(prom_metrics); } + #[tokio::test(flavor = "multi_thread")] + async fn it_test_prometheus_metrics_custom_buckets() { + let mut mock_service = MockSupergraphService::new(); + mock_service + .expect_call() + .times(1) + .returning(move |req: SupergraphRequest| { + Ok(SupergraphResponse::fake_builder() + .context(req.context) + .header("x-custom", "coming_from_header") + .data(json!({"data": {"my_value": 2usize}})) + .build() + .unwrap()) + }); + + let mut mock_bad_request_service = MockSupergraphService::new(); + mock_bad_request_service + .expect_call() + .times(1) + .returning(move |req: SupergraphRequest| { + Ok(SupergraphResponse::fake_builder() + .context(req.context) + .status_code(StatusCode::BAD_REQUEST) + .data(json!({"errors": [{"message": "nope"}]})) + .build() + .unwrap()) + }); + + let mut mock_subgraph_service = MockSubgraphService::new(); + mock_subgraph_service + .expect_call() + .times(1) + .returning(move |req: SubgraphRequest| { + let mut extension = Object::new(); + extension.insert( + serde_json_bytes::ByteString::from("status"), + serde_json_bytes::Value::String(ByteString::from("INTERNAL_SERVER_ERROR")), + ); + let _ = req + .context + .insert("my_key", "my_custom_attribute_from_context".to_string()) + .unwrap(); + Ok(SubgraphResponse::fake_builder() + .context(req.context) + .error( + Error::builder() + .message(String::from("an error occured")) + .extensions(extension) + .extension_code("FETCH_ERROR") + .build(), + ) + .build()) + }); + + let mut mock_subgraph_service_in_error = MockSubgraphService::new(); + mock_subgraph_service_in_error + .expect_call() + .times(1) + .returning(move |_req: SubgraphRequest| { + Err(Box::new(FetchError::SubrequestHttpError { + status_code: None, + service: String::from("my_subgraph_name_error"), + reason: String::from("cannot contact the subgraph"), + })) + }); + + let dyn_plugin: Box = crate::plugin::plugins() + .find(|factory| factory.name == "apollo.telemetry") + .expect("Plugin not found") + .create_instance( + &Value::from_str( + r#"{ + "apollo": { + "client_name_header": "name_header", + "client_version_header": "version_header", + "schema_id": "schema_sha" + }, + "metrics": { + "common": { + "service_name": "apollo-router", + "buckets": [5.0, 10.0, 20.0], + "attributes": { + "supergraph": { + "static": [ + { + "name": "myname", + "value": "label_value" + } + ], + "request": { + "header": [ + { + "named": "test", + "default": "default_value", + "rename": "renamed_value" + }, + { + "named": "another_test", + "default": "my_default_value" + } + ] + }, + "response": { + "header": [{ + "named": "x-custom" + }], + "body": [{ + "path": ".data.data.my_value", + "name": "my_value" + }] + } + }, + "subgraph": { + "all": { + "errors": { + "include_messages": true, + "extensions": [{ + "name": "subgraph_error_extended_code", + "path": ".code" + }, { + "name": "message", + "path": ".reason" + }] + } + }, + "subgraphs": { + "my_subgraph_name": { + "request": { + "body": [{ + "path": ".query", + "name": "query_from_request" + }, { + "path": ".data", + "name": "unknown_data", + "default": "default_value" + }, { + "path": ".data2", + "name": "unknown_data_bis" + }] + }, + "response": { + "body": [{ + "path": ".errors[0].extensions.status", + "name": "error" + }] + }, + "context": [ + { + "named": "my_key" + } + ] + } + } + } + } + }, + "prometheus": { + "enabled": true + } + } + }"#, + ) + .unwrap(), + Default::default(), + ) + .await + .unwrap(); + let mut supergraph_service = dyn_plugin.supergraph_service(BoxService::new(mock_service)); + let router_req = SupergraphRequest::fake_builder().header("test", "my_value_set"); + + let _router_response = supergraph_service + .ready() + .await + .unwrap() + .call(router_req.build().unwrap()) + .await + .unwrap() + .next_response() + .await + .unwrap(); + + let mut bad_request_supergraph_service = + dyn_plugin.supergraph_service(BoxService::new(mock_bad_request_service)); + let router_req = SupergraphRequest::fake_builder().header("test", "my_value_set"); + + let _router_response = bad_request_supergraph_service + .ready() + .await + .unwrap() + .call(router_req.build().unwrap()) + .await + .unwrap() + .next_response() + .await + .unwrap(); + + let mut subgraph_service = + dyn_plugin.subgraph_service("my_subgraph_name", BoxService::new(mock_subgraph_service)); + let subgraph_req = SubgraphRequest::fake_builder() + .subgraph_request( + http_ext::Request::fake_builder() + .header("test", "my_value_set") + .body( + Request::fake_builder() + .query(String::from("query { test }")) + .build(), + ) + .build() + .unwrap(), + ) + .build(); + let _subgraph_response = subgraph_service + .ready() + .await + .unwrap() + .call(subgraph_req) + .await + .unwrap(); + // Another subgraph + let mut subgraph_service = dyn_plugin.subgraph_service( + "my_subgraph_name_error", + BoxService::new(mock_subgraph_service_in_error), + ); + let subgraph_req = SubgraphRequest::fake_builder() + .subgraph_request( + http_ext::Request::fake_builder() + .header("test", "my_value_set") + .body( + Request::fake_builder() + .query(String::from("query { test }")) + .build(), + ) + .build() + .unwrap(), + ) + .build(); + let _subgraph_response = subgraph_service + .ready() + .await + .unwrap() + .call(subgraph_req) + .await + .expect_err("Must be in error"); + + let http_req_prom = http::Request::get("http://localhost:9090/WRONG/URL/metrics") + .body(Default::default()) + .unwrap(); + let mut web_endpoint = dyn_plugin + .web_endpoints() + .into_iter() + .next() + .unwrap() + .1 + .into_iter() + .next() + .unwrap() + .into_router(); + let resp = web_endpoint + .ready() + .await + .unwrap() + .call(http_req_prom) + .await + .unwrap(); + assert_eq!(resp.status(), StatusCode::NOT_FOUND); + + let http_req_prom = http::Request::get("http://localhost:9090/metrics") + .body(Default::default()) + .unwrap(); + let mut resp = web_endpoint.oneshot(http_req_prom).await.unwrap(); + assert_eq!(resp.status(), StatusCode::OK); + let body = hyper::body::to_bytes(resp.body_mut()).await.unwrap(); + let prom_metrics = String::from_utf8_lossy(&body) + .to_string() + .split('\n') + .filter(|l| l.contains("bucket") && !l.contains("apollo_router_span_count")) + .sorted() + .join("\n"); + assert_snapshot!(prom_metrics); + } + #[test] fn it_test_send_headers_to_studio() { let fw_headers = ForwardHeaders::Only(vec![ diff --git a/apollo-router/src/plugins/telemetry/snapshots/apollo_router__plugins__telemetry__tests__it_test_prometheus_metrics_custom_buckets.snap b/apollo-router/src/plugins/telemetry/snapshots/apollo_router__plugins__telemetry__tests__it_test_prometheus_metrics_custom_buckets.snap new file mode 100644 index 0000000000..a973b2aaae --- /dev/null +++ b/apollo-router/src/plugins/telemetry/snapshots/apollo_router__plugins__telemetry__tests__it_test_prometheus_metrics_custom_buckets.snap @@ -0,0 +1,20 @@ +--- +source: apollo-router/src/plugins/telemetry/mod.rs +expression: prom_metrics +--- +apollo_router_http_request_duration_seconds_bucket{another_test="my_default_value",error="400 Bad Request",myname="label_value",renamed_value="my_value_set",service_name="apollo-router",status="400",le="+Inf"} 1 +apollo_router_http_request_duration_seconds_bucket{another_test="my_default_value",error="400 Bad Request",myname="label_value",renamed_value="my_value_set",service_name="apollo-router",status="400",le="10"} 1 +apollo_router_http_request_duration_seconds_bucket{another_test="my_default_value",error="400 Bad Request",myname="label_value",renamed_value="my_value_set",service_name="apollo-router",status="400",le="20"} 1 +apollo_router_http_request_duration_seconds_bucket{another_test="my_default_value",error="400 Bad Request",myname="label_value",renamed_value="my_value_set",service_name="apollo-router",status="400",le="5"} 1 +apollo_router_http_request_duration_seconds_bucket{another_test="my_default_value",my_value="2",myname="label_value",renamed_value="my_value_set",service_name="apollo-router",status="200",x_custom="coming_from_header",le="+Inf"} 1 +apollo_router_http_request_duration_seconds_bucket{another_test="my_default_value",my_value="2",myname="label_value",renamed_value="my_value_set",service_name="apollo-router",status="200",x_custom="coming_from_header",le="10"} 1 +apollo_router_http_request_duration_seconds_bucket{another_test="my_default_value",my_value="2",myname="label_value",renamed_value="my_value_set",service_name="apollo-router",status="200",x_custom="coming_from_header",le="20"} 1 +apollo_router_http_request_duration_seconds_bucket{another_test="my_default_value",my_value="2",myname="label_value",renamed_value="my_value_set",service_name="apollo-router",status="200",x_custom="coming_from_header",le="5"} 1 +apollo_router_http_request_duration_seconds_bucket{error="INTERNAL_SERVER_ERROR",my_key="my_custom_attribute_from_context",query_from_request="query { test }",service_name="apollo-router",status="200",subgraph="my_subgraph_name",unknown_data="default_value",le="+Inf"} 1 +apollo_router_http_request_duration_seconds_bucket{error="INTERNAL_SERVER_ERROR",my_key="my_custom_attribute_from_context",query_from_request="query { test }",service_name="apollo-router",status="200",subgraph="my_subgraph_name",unknown_data="default_value",le="10"} 1 +apollo_router_http_request_duration_seconds_bucket{error="INTERNAL_SERVER_ERROR",my_key="my_custom_attribute_from_context",query_from_request="query { test }",service_name="apollo-router",status="200",subgraph="my_subgraph_name",unknown_data="default_value",le="20"} 1 +apollo_router_http_request_duration_seconds_bucket{error="INTERNAL_SERVER_ERROR",my_key="my_custom_attribute_from_context",query_from_request="query { test }",service_name="apollo-router",status="200",subgraph="my_subgraph_name",unknown_data="default_value",le="5"} 1 +apollo_router_http_request_duration_seconds_bucket{message="cannot contact the subgraph",service_name="apollo-router",status="500",subgraph="my_subgraph_name_error",subgraph_error_extended_code="SUBREQUEST_HTTP_ERROR",le="+Inf"} 1 +apollo_router_http_request_duration_seconds_bucket{message="cannot contact the subgraph",service_name="apollo-router",status="500",subgraph="my_subgraph_name_error",subgraph_error_extended_code="SUBREQUEST_HTTP_ERROR",le="10"} 1 +apollo_router_http_request_duration_seconds_bucket{message="cannot contact the subgraph",service_name="apollo-router",status="500",subgraph="my_subgraph_name_error",subgraph_error_extended_code="SUBREQUEST_HTTP_ERROR",le="20"} 1 +apollo_router_http_request_duration_seconds_bucket{message="cannot contact the subgraph",service_name="apollo-router",status="500",subgraph="my_subgraph_name_error",subgraph_error_extended_code="SUBREQUEST_HTTP_ERROR",le="5"} 1 diff --git a/docs/source/configuration/metrics.mdx b/docs/source/configuration/metrics.mdx index f68a8f1d77..8abef3d50e 100644 --- a/docs/source/configuration/metrics.mdx +++ b/docs/source/configuration/metrics.mdx @@ -233,6 +233,26 @@ To fetch the value of the field `x`, the corresponding path is `.items[0].wanted JSON path queries always begin with a period `.` +## Changing default buckets for histograms + +You can customize the buckets for all generated histograms: + +```yaml title="router.yaml" +telemetry: + metrics: + common: + buckets: + - 0.05 + - 0.10 + - 0.25 + - 0.50 + - 1.00 + - 2.50 + - 5.00 + - 10.00 + - 20.00 +``` + ## Adding custom resources Resources are similar to [attributes](#adding-custom-attributeslabels), but there are more globals. They're configured directly on the metrics exporter, which means they're always present on each of your metrics.