diff --git a/.changesets/feat_swan_cub_foot_audience.md b/.changesets/feat_swan_cub_foot_audience.md new file mode 100644 index 0000000000..89408bc37f --- /dev/null +++ b/.changesets/feat_swan_cub_foot_audience.md @@ -0,0 +1,21 @@ +### Configurable histogram buckets for metrics ([Issue #2333](https://github.com/apollographql/router/issues/2333)) + +You can customize the buckets for all generated histograms: + +```yaml title="router.yaml" +telemetry: + metrics: + common: + buckets: + - 0.05 + - 0.10 + - 0.25 + - 0.50 + - 1.00 + - 2.50 + - 5.00 + - 10.00 + - 20.00 +``` + +By [@bnjjj](https://github.com/bnjjj) in https://github.com/apollographql/router/pull/3098 \ No newline at end of file diff --git a/apollo-router/src/configuration/snapshots/apollo_router__configuration__tests__schema_generation.snap b/apollo-router/src/configuration/snapshots/apollo_router__configuration__tests__schema_generation.snap index f0cae7a315..9048ab187a 100644 --- a/apollo-router/src/configuration/snapshots/apollo_router__configuration__tests__schema_generation.snap +++ b/apollo-router/src/configuration/snapshots/apollo_router__configuration__tests__schema_generation.snap @@ -3711,6 +3711,28 @@ expression: "&schema" "additionalProperties": false, "nullable": true }, + "buckets": { + "description": "Custom buckets for histograms", + "default": [ + 0.001, + 0.005, + 0.015, + 0.05, + 0.1, + 0.2, + 0.3, + 0.4, + 0.5, + 1.0, + 5.0, + 10.0 + ], + "type": "array", + "items": { + "type": "number", + "format": "double" + } + }, "resources": { "description": "Resources", "default": {}, @@ -3721,11 +3743,13 @@ expression: "&schema" }, "service_name": { "description": "Set a service.name resource in your metrics", + "default": null, "type": "string", "nullable": true }, "service_namespace": { "description": "Set a service.namespace attribute in your metrics", + "default": null, "type": "string", "nullable": true } diff --git a/apollo-router/src/plugins/telemetry/config.rs 
/// Returns the histogram bucket boundaries used when
/// `telemetry.metrics.common.buckets` is absent from the configuration.
///
/// The values are strictly increasing upper bounds. They are the same
/// boundaries the OTLP and Prometheus exporters previously hard-coded, so
/// leaving `buckets` unset keeps the earlier behaviour.
///
/// NOTE(review): serde invokes this through `#[serde(default = "default_buckets")]`
/// when the field is missing during deserialization. A `MetricsCommon` built
/// through its derived `Default` impl gets an empty `Vec` instead of these
/// boundaries; confirm no caller depends on `MetricsCommon::default()` for
/// its buckets.
fn default_buckets() -> Vec<f64> {
    const DEFAULT_BUCKETS: [f64; 12] = [
        0.001, 0.005, 0.015, 0.05, 0.1, 0.2, 0.3, 0.4, 0.5, 1.0, 5.0, 10.0,
    ];
    DEFAULT_BUCKETS.to_vec()
}
b/apollo-router/src/plugins/telemetry/metrics/prometheus.rs @@ -85,9 +85,7 @@ impl MetricsConfigurator for Config { if self.enabled { let mut controller = controllers::basic( processors::factory( - selectors::simple::histogram([ - 0.001, 0.005, 0.015, 0.05, 0.1, 0.2, 0.3, 0.4, 0.5, 1.0, 5.0, 10.0, - ]), + selectors::simple::histogram(metrics_config.buckets.clone()), aggregation::stateless_temporality_selector(), ) .with_memory(true), diff --git a/apollo-router/src/plugins/telemetry/mod.rs b/apollo-router/src/plugins/telemetry/mod.rs index 05f50f9005..ec2850d262 100644 --- a/apollo-router/src/plugins/telemetry/mod.rs +++ b/apollo-router/src/plugins/telemetry/mod.rs @@ -2098,6 +2098,287 @@ mod tests { assert_snapshot!(prom_metrics); } + #[tokio::test(flavor = "multi_thread")] + async fn it_test_prometheus_metrics_custom_buckets() { + let mut mock_service = MockSupergraphService::new(); + mock_service + .expect_call() + .times(1) + .returning(move |req: SupergraphRequest| { + Ok(SupergraphResponse::fake_builder() + .context(req.context) + .header("x-custom", "coming_from_header") + .data(json!({"data": {"my_value": 2usize}})) + .build() + .unwrap()) + }); + + let mut mock_bad_request_service = MockSupergraphService::new(); + mock_bad_request_service + .expect_call() + .times(1) + .returning(move |req: SupergraphRequest| { + Ok(SupergraphResponse::fake_builder() + .context(req.context) + .status_code(StatusCode::BAD_REQUEST) + .data(json!({"errors": [{"message": "nope"}]})) + .build() + .unwrap()) + }); + + let mut mock_subgraph_service = MockSubgraphService::new(); + mock_subgraph_service + .expect_call() + .times(1) + .returning(move |req: SubgraphRequest| { + let mut extension = Object::new(); + extension.insert( + serde_json_bytes::ByteString::from("status"), + serde_json_bytes::Value::String(ByteString::from("INTERNAL_SERVER_ERROR")), + ); + let _ = req + .context + .insert("my_key", "my_custom_attribute_from_context".to_string()) + .unwrap(); + 
Ok(SubgraphResponse::fake_builder() + .context(req.context) + .error( + Error::builder() + .message(String::from("an error occured")) + .extensions(extension) + .extension_code("FETCH_ERROR") + .build(), + ) + .build()) + }); + + let mut mock_subgraph_service_in_error = MockSubgraphService::new(); + mock_subgraph_service_in_error + .expect_call() + .times(1) + .returning(move |_req: SubgraphRequest| { + Err(Box::new(FetchError::SubrequestHttpError { + status_code: None, + service: String::from("my_subgraph_name_error"), + reason: String::from("cannot contact the subgraph"), + })) + }); + + let dyn_plugin: Box = crate::plugin::plugins() + .find(|factory| factory.name == "apollo.telemetry") + .expect("Plugin not found") + .create_instance( + &Value::from_str( + r#"{ + "apollo": { + "client_name_header": "name_header", + "client_version_header": "version_header", + "schema_id": "schema_sha" + }, + "metrics": { + "common": { + "service_name": "apollo-router", + "buckets": [5.0, 10.0, 20.0], + "attributes": { + "supergraph": { + "static": [ + { + "name": "myname", + "value": "label_value" + } + ], + "request": { + "header": [ + { + "named": "test", + "default": "default_value", + "rename": "renamed_value" + }, + { + "named": "another_test", + "default": "my_default_value" + } + ] + }, + "response": { + "header": [{ + "named": "x-custom" + }], + "body": [{ + "path": ".data.data.my_value", + "name": "my_value" + }] + } + }, + "subgraph": { + "all": { + "errors": { + "include_messages": true, + "extensions": [{ + "name": "subgraph_error_extended_code", + "path": ".code" + }, { + "name": "message", + "path": ".reason" + }] + } + }, + "subgraphs": { + "my_subgraph_name": { + "request": { + "body": [{ + "path": ".query", + "name": "query_from_request" + }, { + "path": ".data", + "name": "unknown_data", + "default": "default_value" + }, { + "path": ".data2", + "name": "unknown_data_bis" + }] + }, + "response": { + "body": [{ + "path": ".errors[0].extensions.status", + "name": 
"error" + }] + }, + "context": [ + { + "named": "my_key" + } + ] + } + } + } + } + }, + "prometheus": { + "enabled": true + } + } + }"#, + ) + .unwrap(), + Default::default(), + ) + .await + .unwrap(); + let mut supergraph_service = dyn_plugin.supergraph_service(BoxService::new(mock_service)); + let router_req = SupergraphRequest::fake_builder().header("test", "my_value_set"); + + let _router_response = supergraph_service + .ready() + .await + .unwrap() + .call(router_req.build().unwrap()) + .await + .unwrap() + .next_response() + .await + .unwrap(); + + let mut bad_request_supergraph_service = + dyn_plugin.supergraph_service(BoxService::new(mock_bad_request_service)); + let router_req = SupergraphRequest::fake_builder().header("test", "my_value_set"); + + let _router_response = bad_request_supergraph_service + .ready() + .await + .unwrap() + .call(router_req.build().unwrap()) + .await + .unwrap() + .next_response() + .await + .unwrap(); + + let mut subgraph_service = + dyn_plugin.subgraph_service("my_subgraph_name", BoxService::new(mock_subgraph_service)); + let subgraph_req = SubgraphRequest::fake_builder() + .subgraph_request( + http_ext::Request::fake_builder() + .header("test", "my_value_set") + .body( + Request::fake_builder() + .query(String::from("query { test }")) + .build(), + ) + .build() + .unwrap(), + ) + .build(); + let _subgraph_response = subgraph_service + .ready() + .await + .unwrap() + .call(subgraph_req) + .await + .unwrap(); + // Another subgraph + let mut subgraph_service = dyn_plugin.subgraph_service( + "my_subgraph_name_error", + BoxService::new(mock_subgraph_service_in_error), + ); + let subgraph_req = SubgraphRequest::fake_builder() + .subgraph_request( + http_ext::Request::fake_builder() + .header("test", "my_value_set") + .body( + Request::fake_builder() + .query(String::from("query { test }")) + .build(), + ) + .build() + .unwrap(), + ) + .build(); + let _subgraph_response = subgraph_service + .ready() + .await + .unwrap() + 
.call(subgraph_req) + .await + .expect_err("Must be in error"); + + let http_req_prom = http::Request::get("http://localhost:9090/WRONG/URL/metrics") + .body(Default::default()) + .unwrap(); + let mut web_endpoint = dyn_plugin + .web_endpoints() + .into_iter() + .next() + .unwrap() + .1 + .into_iter() + .next() + .unwrap() + .into_router(); + let resp = web_endpoint + .ready() + .await + .unwrap() + .call(http_req_prom) + .await + .unwrap(); + assert_eq!(resp.status(), StatusCode::NOT_FOUND); + + let http_req_prom = http::Request::get("http://localhost:9090/metrics") + .body(Default::default()) + .unwrap(); + let mut resp = web_endpoint.oneshot(http_req_prom).await.unwrap(); + assert_eq!(resp.status(), StatusCode::OK); + let body = hyper::body::to_bytes(resp.body_mut()).await.unwrap(); + let prom_metrics = String::from_utf8_lossy(&body) + .to_string() + .split('\n') + .filter(|l| l.contains("bucket") && !l.contains("apollo_router_span_count")) + .sorted() + .join("\n"); + assert_snapshot!(prom_metrics); + } + #[test] fn it_test_send_headers_to_studio() { let fw_headers = ForwardHeaders::Only(vec![ diff --git a/apollo-router/src/plugins/telemetry/snapshots/apollo_router__plugins__telemetry__tests__it_test_prometheus_metrics_custom_buckets.snap b/apollo-router/src/plugins/telemetry/snapshots/apollo_router__plugins__telemetry__tests__it_test_prometheus_metrics_custom_buckets.snap new file mode 100644 index 0000000000..a973b2aaae --- /dev/null +++ b/apollo-router/src/plugins/telemetry/snapshots/apollo_router__plugins__telemetry__tests__it_test_prometheus_metrics_custom_buckets.snap @@ -0,0 +1,20 @@ +--- +source: apollo-router/src/plugins/telemetry/mod.rs +expression: prom_metrics +--- +apollo_router_http_request_duration_seconds_bucket{another_test="my_default_value",error="400 Bad Request",myname="label_value",renamed_value="my_value_set",service_name="apollo-router",status="400",le="+Inf"} 1 
+apollo_router_http_request_duration_seconds_bucket{another_test="my_default_value",error="400 Bad Request",myname="label_value",renamed_value="my_value_set",service_name="apollo-router",status="400",le="10"} 1 +apollo_router_http_request_duration_seconds_bucket{another_test="my_default_value",error="400 Bad Request",myname="label_value",renamed_value="my_value_set",service_name="apollo-router",status="400",le="20"} 1 +apollo_router_http_request_duration_seconds_bucket{another_test="my_default_value",error="400 Bad Request",myname="label_value",renamed_value="my_value_set",service_name="apollo-router",status="400",le="5"} 1 +apollo_router_http_request_duration_seconds_bucket{another_test="my_default_value",my_value="2",myname="label_value",renamed_value="my_value_set",service_name="apollo-router",status="200",x_custom="coming_from_header",le="+Inf"} 1 +apollo_router_http_request_duration_seconds_bucket{another_test="my_default_value",my_value="2",myname="label_value",renamed_value="my_value_set",service_name="apollo-router",status="200",x_custom="coming_from_header",le="10"} 1 +apollo_router_http_request_duration_seconds_bucket{another_test="my_default_value",my_value="2",myname="label_value",renamed_value="my_value_set",service_name="apollo-router",status="200",x_custom="coming_from_header",le="20"} 1 +apollo_router_http_request_duration_seconds_bucket{another_test="my_default_value",my_value="2",myname="label_value",renamed_value="my_value_set",service_name="apollo-router",status="200",x_custom="coming_from_header",le="5"} 1 +apollo_router_http_request_duration_seconds_bucket{error="INTERNAL_SERVER_ERROR",my_key="my_custom_attribute_from_context",query_from_request="query { test }",service_name="apollo-router",status="200",subgraph="my_subgraph_name",unknown_data="default_value",le="+Inf"} 1 +apollo_router_http_request_duration_seconds_bucket{error="INTERNAL_SERVER_ERROR",my_key="my_custom_attribute_from_context",query_from_request="query { test 
}",service_name="apollo-router",status="200",subgraph="my_subgraph_name",unknown_data="default_value",le="10"} 1 +apollo_router_http_request_duration_seconds_bucket{error="INTERNAL_SERVER_ERROR",my_key="my_custom_attribute_from_context",query_from_request="query { test }",service_name="apollo-router",status="200",subgraph="my_subgraph_name",unknown_data="default_value",le="20"} 1 +apollo_router_http_request_duration_seconds_bucket{error="INTERNAL_SERVER_ERROR",my_key="my_custom_attribute_from_context",query_from_request="query { test }",service_name="apollo-router",status="200",subgraph="my_subgraph_name",unknown_data="default_value",le="5"} 1 +apollo_router_http_request_duration_seconds_bucket{message="cannot contact the subgraph",service_name="apollo-router",status="500",subgraph="my_subgraph_name_error",subgraph_error_extended_code="SUBREQUEST_HTTP_ERROR",le="+Inf"} 1 +apollo_router_http_request_duration_seconds_bucket{message="cannot contact the subgraph",service_name="apollo-router",status="500",subgraph="my_subgraph_name_error",subgraph_error_extended_code="SUBREQUEST_HTTP_ERROR",le="10"} 1 +apollo_router_http_request_duration_seconds_bucket{message="cannot contact the subgraph",service_name="apollo-router",status="500",subgraph="my_subgraph_name_error",subgraph_error_extended_code="SUBREQUEST_HTTP_ERROR",le="20"} 1 +apollo_router_http_request_duration_seconds_bucket{message="cannot contact the subgraph",service_name="apollo-router",status="500",subgraph="my_subgraph_name_error",subgraph_error_extended_code="SUBREQUEST_HTTP_ERROR",le="5"} 1 diff --git a/docs/source/configuration/metrics.mdx b/docs/source/configuration/metrics.mdx index f68a8f1d77..8abef3d50e 100644 --- a/docs/source/configuration/metrics.mdx +++ b/docs/source/configuration/metrics.mdx @@ -233,6 +233,26 @@ To fetch the value of the field `x`, the corresponding path is `.items[0].wanted JSON path queries always begin with a period `.` +## Changing default buckets for histograms + +You can 
customize the buckets for all generated histograms: + +```yaml title="router.yaml" +telemetry: + metrics: + common: + buckets: + - 0.05 + - 0.10 + - 0.25 + - 0.50 + - 1.00 + - 2.50 + - 5.00 + - 10.00 + - 20.00 +``` + ## Adding custom resources Resources are similar to [attributes](#adding-custom-attributeslabels), but there are more globals. They're configured directly on the metrics exporter, which means they're always present on each of your metrics.