From 3908d7d3a4fc86356cf133c122b7f546c0952142 Mon Sep 17 00:00:00 2001 From: Adam Boguszewski <108867528+aboguszewski-sumo@users.noreply.github.com> Date: Thu, 17 Nov 2022 22:34:50 +0100 Subject: [PATCH] [receiver/elasticsearch]: add cluster health metrics for two more shard types (#14875) * feat: add cluster health metrics for two more shard types --- .../elasticsearch-cluster-health-shards.yaml | 16 ++++++++ receiver/elasticsearchreceiver/README.md | 16 ++++++++ .../elasticsearchreceiver/documentation.md | 2 +- .../internal/metadata/generated_metrics.go | 16 ++++++-- .../internal/model/clusterhealth.go | 22 ++++++----- receiver/elasticsearchreceiver/metadata.yaml | 2 + receiver/elasticsearchreceiver/scraper.go | 39 +++++++++++++++++-- .../elasticsearchreceiver/scraper_test.go | 8 ++++ .../testdata/expected_metrics/full.json | 26 +++++++++++++ .../testdata/expected_metrics/noNodes.json | 26 +++++++++++++ .../testdata/sample_payloads/health.json | 2 +- 11 files changed, 155 insertions(+), 20 deletions(-) create mode 100644 .chloggen/elasticsearch-cluster-health-shards.yaml diff --git a/.chloggen/elasticsearch-cluster-health-shards.yaml b/.chloggen/elasticsearch-cluster-health-shards.yaml new file mode 100644 index 000000000000..0353d8f3e616 --- /dev/null +++ b/.chloggen/elasticsearch-cluster-health-shards.yaml @@ -0,0 +1,16 @@ +# One of 'breaking', 'deprecation', 'new_component', 'enhancement', 'bug_fix' +change_type: enhancement + +# The name of the component, or a single word describing the area of concern, (e.g. filelogreceiver) +component: elasticsearchreceiver + +# A brief description of the change. Surround your text with quotes ("") if it needs to start with a backtick (`). +note: Add cluster health metrics for two more shards types + +# One or more tracking issues related to the change +issues: [14635] + +# (Optional) One or more lines of additional information to render under the primary note. +# These lines will be padded with 2 spaces and then inserted directly into the document. +# Use pipe (|) for multiline entries. +subtext: diff --git a/receiver/elasticsearchreceiver/README.md b/receiver/elasticsearchreceiver/README.md index d47a59459d29..7ca81bb58873 100644 --- a/receiver/elasticsearchreceiver/README.md +++ b/receiver/elasticsearchreceiver/README.md @@ -55,5 +55,21 @@ The following metric are available with versions: - `elasticsearch.cluster.state_update.time` >= [7.16.0](https://www.elastic.co/guide/en/elasticsearch/reference/7.16/release-notes-7.16.0.html) Details about the metrics produced by this receiver can be found in [metadata.yaml](./metadata.yaml) + +## Feature gate configurations + +See the [Collector feature gates](https://github.com/open-telemetry/opentelemetry-collector/blob/main/featuregate/README.md#collector-feature-gates) for an overview of feature gates in the collector. + +**ALPHA**: `receiver.elasticsearch.emitClusterHealthDetailedShardMetrics` + +The feature gate `receiver.elasticsearch.emitClusterHealthDetailedShardMetrics` once enabled starts emitting the metric `elasticsearch.cluster.shards` +with two additional data points - one with `state` equal to `active_primary` and one with `state` equal to `unassigned_delayed`. + +This is considered a breaking change for existing users of this receiver, and it is recommended to migrate to the new implementation when possible. Any new users planning to adopt this receiver should enable this feature gate to avoid having to migrate any visualisations or alerts. + +This feature gate will eventually be enabled by default, and eventually the old implementation will be removed. It aims +to give users time to migrate to the new implementation. The target release for this featuregate to be enabled by default +is 0.68.0. + [beta]:https://github.com/open-telemetry/opentelemetry-collector#beta [contrib]:https://github.com/open-telemetry/opentelemetry-collector-releases/tree/main/distributions/otelcol-contrib diff --git a/receiver/elasticsearchreceiver/documentation.md b/receiver/elasticsearchreceiver/documentation.md index 4b3545ca80c7..a05eb9f8f562 100644 --- a/receiver/elasticsearchreceiver/documentation.md +++ b/receiver/elasticsearchreceiver/documentation.md @@ -136,7 +136,7 @@ metrics: | operation (operation) | The type of operation. | index, delete, get, query, fetch, scroll, suggest, merge, refresh, flush, warmer | | query_cache_count_type (type) | Type of query cache count | hit, miss | | segments_memory_object_type (object) | Type of object in segment | term, doc_value, index_writer, fixed_bit_set | -| shard_state (state) | The state of the shard. | active, relocating, initializing, unassigned | +| shard_state (state) | The state of the shard. | active, active_primary, relocating, initializing, unassigned, unassigned_delayed | | task_state (state) | The state of the task. | rejected, completed | | thread_pool_name | The name of the thread pool. | | | thread_state (state) | The state of the thread. | active, idle | diff --git a/receiver/elasticsearchreceiver/internal/metadata/generated_metrics.go b/receiver/elasticsearchreceiver/internal/metadata/generated_metrics.go index a28571425cb4..81c323dc6cd0 100644 --- a/receiver/elasticsearchreceiver/internal/metadata/generated_metrics.go +++ b/receiver/elasticsearchreceiver/internal/metadata/generated_metrics.go @@ -874,9 +874,11 @@ type AttributeShardState int const ( _ AttributeShardState = iota AttributeShardStateActive + AttributeShardStateActivePrimary AttributeShardStateRelocating AttributeShardStateInitializing AttributeShardStateUnassigned + AttributeShardStateUnassignedDelayed ) // String returns the string representation of the AttributeShardState. @@ -884,22 +886,28 @@ func (av AttributeShardState) String() string { switch av { case AttributeShardStateActive: return "active" + case AttributeShardStateActivePrimary: + return "active_primary" case AttributeShardStateRelocating: return "relocating" case AttributeShardStateInitializing: return "initializing" case AttributeShardStateUnassigned: return "unassigned" + case AttributeShardStateUnassignedDelayed: + return "unassigned_delayed" } return "" } // MapAttributeShardState is a helper map of string to AttributeShardState attribute value. var MapAttributeShardState = map[string]AttributeShardState{ - "active": AttributeShardStateActive, - "relocating": AttributeShardStateRelocating, - "initializing": AttributeShardStateInitializing, - "unassigned": AttributeShardStateUnassigned, + "active": AttributeShardStateActive, + "active_primary": AttributeShardStateActivePrimary, + "relocating": AttributeShardStateRelocating, + "initializing": AttributeShardStateInitializing, + "unassigned": AttributeShardStateUnassigned, + "unassigned_delayed": AttributeShardStateUnassignedDelayed, } // AttributeTaskState specifies the a value task_state attribute. diff --git a/receiver/elasticsearchreceiver/internal/model/clusterhealth.go b/receiver/elasticsearchreceiver/internal/model/clusterhealth.go index 7e05e1acb97a..9f383b441440 100644 --- a/receiver/elasticsearchreceiver/internal/model/clusterhealth.go +++ b/receiver/elasticsearchreceiver/internal/model/clusterhealth.go @@ -18,14 +18,16 @@ package model // import "github.com/open-telemetry/opentelemetry-collector-contr // The struct is not exhaustive; It does not provide all values returned by elasticsearch, // only the ones relevant to the metrics retrieved by the scraper. type ClusterHealth struct { - ClusterName string `json:"cluster_name"` - ActiveShards int64 `json:"active_shards"` - RelocatingShards int64 `json:"relocating_shards"` - InitializingShards int64 `json:"initializing_shards"` - UnassignedShards int64 `json:"unassigned_shards"` - NodeCount int64 `json:"number_of_nodes"` - DataNodeCount int64 `json:"number_of_data_nodes"` - PendingTasksCount int64 `json:"number_of_pending_tasks"` - InFlightFetchCount int64 `json:"number_of_in_flight_fetch"` - Status string `json:"status"` + ClusterName string `json:"cluster_name"` + ActiveShards int64 `json:"active_shards"` + ActivePrimaryShards int64 `json:"active_primary_shards"` + RelocatingShards int64 `json:"relocating_shards"` + InitializingShards int64 `json:"initializing_shards"` + UnassignedShards int64 `json:"unassigned_shards"` + DelayedUnassignedShards int64 `json:"delayed_unassigned_shards"` + NodeCount int64 `json:"number_of_nodes"` + DataNodeCount int64 `json:"number_of_data_nodes"` + PendingTasksCount int64 `json:"number_of_pending_tasks"` + InFlightFetchCount int64 `json:"number_of_in_flight_fetch"` + Status string `json:"status"` } diff --git a/receiver/elasticsearchreceiver/metadata.yaml b/receiver/elasticsearchreceiver/metadata.yaml index c8b8c0e66a9b..57d32d94318b 100644 --- a/receiver/elasticsearchreceiver/metadata.yaml +++ b/receiver/elasticsearchreceiver/metadata.yaml @@ -45,9 +45,11 @@ attributes: description: The state of the shard. enum: - active + - active_primary - relocating - initializing - unassigned + - unassigned_delayed operation: value: operation description: The type of operation. diff --git a/receiver/elasticsearchreceiver/scraper.go b/receiver/elasticsearchreceiver/scraper.go index fc52a7fdef73..6fcfad6113d3 100644 --- a/receiver/elasticsearchreceiver/scraper.go +++ b/receiver/elasticsearchreceiver/scraper.go @@ -22,6 +22,7 @@ import ( "github.com/hashicorp/go-version" "go.opentelemetry.io/collector/component" + "go.opentelemetry.io/collector/featuregate" "go.opentelemetry.io/collector/pdata/pcommon" "go.opentelemetry.io/collector/pdata/pmetric" "go.opentelemetry.io/collector/receiver/scrapererror" @@ -41,6 +42,19 @@ var ( }() ) +const ( + readmeURL = "https://github.com/open-telemetry/opentelemetry-collector-contrib/blob/main/receiver/elasticsearchreceiver/README.md" + emitClusterHealthDetailedShardMetricsID = "receiver.elasticsearch.emitClusterHealthDetailedShardMetrics" +) + +func init() { + featuregate.GetRegistry().MustRegisterID( + emitClusterHealthDetailedShardMetricsID, + featuregate.StageAlpha, + featuregate.WithRegisterDescription("When enabled, the elasticsearch.cluster.shards metric will be emitted with two more datapoints."), + ) +} + var errUnknownClusterStatus = errors.New("unknown cluster status") type elasticsearchScraper struct { @@ -50,17 +64,29 @@ type elasticsearchScraper struct { mb *metadata.MetricsBuilder version *version.Version clusterName string + + // Feature gates + emitClusterHealthDetailedShardMetrics bool } func newElasticSearchScraper( settings component.ReceiverCreateSettings, cfg *Config, ) *elasticsearchScraper { - return &elasticsearchScraper{ - settings: settings.TelemetrySettings, - cfg: cfg, - mb: metadata.NewMetricsBuilder(cfg.Metrics, settings.BuildInfo), + e := &elasticsearchScraper{ + settings: settings.TelemetrySettings, + cfg: cfg, + mb: metadata.NewMetricsBuilder(cfg.Metrics, settings.BuildInfo), + emitClusterHealthDetailedShardMetrics: featuregate.GetRegistry().IsEnabled(emitClusterHealthDetailedShardMetricsID), } + + if !e.emitClusterHealthDetailedShardMetrics { + settings.Logger.Warn( + fmt.Sprintf("Feature gate %s is not enabled. Please see the README for more information: %s", emitClusterHealthDetailedShardMetricsID, readmeURL), + ) + } + + return e } func (r *elasticsearchScraper) start(_ context.Context, host component.Host) (err error) { @@ -323,6 +349,11 @@ func (r *elasticsearchScraper) scrapeClusterMetrics(ctx context.Context, now pco r.mb.RecordElasticsearchClusterShardsDataPoint(now, clusterHealth.RelocatingShards, metadata.AttributeShardStateRelocating) r.mb.RecordElasticsearchClusterShardsDataPoint(now, clusterHealth.UnassignedShards, metadata.AttributeShardStateUnassigned) + if r.emitClusterHealthDetailedShardMetrics { + r.mb.RecordElasticsearchClusterShardsDataPoint(now, clusterHealth.ActivePrimaryShards, metadata.AttributeShardStateActivePrimary) + r.mb.RecordElasticsearchClusterShardsDataPoint(now, clusterHealth.DelayedUnassignedShards, metadata.AttributeShardStateUnassignedDelayed) + } + r.mb.RecordElasticsearchClusterPendingTasksDataPoint(now, clusterHealth.PendingTasksCount) r.mb.RecordElasticsearchClusterInFlightFetchDataPoint(now, clusterHealth.InFlightFetchCount) diff --git a/receiver/elasticsearchreceiver/scraper_test.go b/receiver/elasticsearchreceiver/scraper_test.go index c0fde98fb0d8..e09f523c13bd 100644 --- a/receiver/elasticsearchreceiver/scraper_test.go +++ b/receiver/elasticsearchreceiver/scraper_test.go @@ -26,6 +26,7 @@ import ( "go.opentelemetry.io/collector/component/componenttest" "go.opentelemetry.io/collector/config/confighttp" "go.opentelemetry.io/collector/config/configtls" + "go.opentelemetry.io/collector/featuregate" "go.opentelemetry.io/collector/receiver/scrapererror" "github.com/open-telemetry/opentelemetry-collector-contrib/internal/scrapertest" @@ -38,6 +39,13 @@ const fullExpectedMetricsPath = "./testdata/expected_metrics/full.json" const skipClusterExpectedMetricsPath = "./testdata/expected_metrics/clusterSkip.json" const noNodesExpectedMetricsPath = "./testdata/expected_metrics/noNodes.json" +func TestMain(m *testing.M) { + // Enable the feature gates before all tests to avoid flaky tests. + _ = featuregate.GetRegistry().Apply(map[string]bool{emitClusterHealthDetailedShardMetricsID: true}) + code := m.Run() + os.Exit(code) +} + func TestScraper(t *testing.T) { t.Parallel() diff --git a/receiver/elasticsearchreceiver/testdata/expected_metrics/full.json b/receiver/elasticsearchreceiver/testdata/expected_metrics/full.json index 267b0936742a..2af62881cb1e 100644 --- a/receiver/elasticsearchreceiver/testdata/expected_metrics/full.json +++ b/receiver/elasticsearchreceiver/testdata/expected_metrics/full.json @@ -2515,6 +2515,32 @@ ], "startTimeUnixNano": "1661811689941624000", "timeUnixNano": "1661811689943245000" + }, + { + "asInt": "23", + "attributes": [ + { + "key": "state", + "value": { + "stringValue": "active_primary" + } + } + ], + "startTimeUnixNano": "1661811689941624000", + "timeUnixNano": "1661811689943245000" + }, + { + "asInt": "1", + "attributes": [ + { + "key": "state", + "value": { + "stringValue": "unassigned_delayed" + } + } + ], + "startTimeUnixNano": "1661811689941624000", + "timeUnixNano": "1661811689943245000" } ] }, diff --git a/receiver/elasticsearchreceiver/testdata/expected_metrics/noNodes.json b/receiver/elasticsearchreceiver/testdata/expected_metrics/noNodes.json index 7efecefdfa2e..c9930d7c4d38 100644 --- a/receiver/elasticsearchreceiver/testdata/expected_metrics/noNodes.json +++ b/receiver/elasticsearchreceiver/testdata/expected_metrics/noNodes.json @@ -180,6 +180,32 @@ ], "startTimeUnixNano": "1662458370557980000", "timeUnixNano": "1662458370559258000" + }, + { + "asInt": "23", + "attributes": [ + { + "key": "state", + "value": { + "stringValue": "active_primary" + } + } + ], + "startTimeUnixNano": "1661811689941624000", + "timeUnixNano": "1661811689943245000" + }, + { + "asInt": "1", + "attributes": [ + { + "key": "state", + "value": { + "stringValue": "unassigned_delayed" + } + } + ], + "startTimeUnixNano": "1661811689941624000", + "timeUnixNano": "1661811689943245000" } ] }, diff --git a/receiver/elasticsearchreceiver/testdata/sample_payloads/health.json b/receiver/elasticsearchreceiver/testdata/sample_payloads/health.json index 35f14df3362d..ac526ca34c3e 100644 --- a/receiver/elasticsearchreceiver/testdata/sample_payloads/health.json +++ b/receiver/elasticsearchreceiver/testdata/sample_payloads/health.json @@ -9,7 +9,7 @@ "relocating_shards": 10, "initializing_shards": 2, "unassigned_shards": 3, - "delayed_unassigned_shards": 0, + "delayed_unassigned_shards": 1, "number_of_pending_tasks": 0, "number_of_in_flight_fetch": 0, "task_max_waiting_in_queue_millis": 0,