-
Notifications
You must be signed in to change notification settings - Fork 42
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Fix otel-collector logging repeat error
The pod log shows repeated duplicate-timeseries errors that take up logging quota. Although the metrics are being sent correctly, the error message can be confusing to users. The removal of the `type` tag causes one timeseries to be exported multiple times in a batch, both with and without the tag. This change uses aggregation in the metricstransform processor to eliminate the `type` tag from the Monarch pipeline. Format follows otel-collector-contrib 0.54.0. Tested locally e2e. b/290678742
- Loading branch information
1 parent
a386344
commit 5154183
Showing
4 changed files
with
381 additions
and
3 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,150 @@ | ||
// Copyright 2022 Google LLC | ||
// | ||
// Licensed under the Apache License, Version 2.0 (the "License"); | ||
// you may not use this file except in compliance with the License. | ||
// You may obtain a copy of the License at | ||
// | ||
// http://www.apache.org/licenses/LICENSE-2.0 | ||
// | ||
// Unless required by applicable law or agreed to in writing, software | ||
// distributed under the License is distributed on an "AS IS" BASIS, | ||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
// See the License for the specific language governing permissions and | ||
// limitations under the License. | ||
|
||
package e2e | ||
|
||
import ( | ||
"fmt" | ||
"strings" | ||
"testing" | ||
"time" | ||
|
||
"kpt.dev/configsync/e2e" | ||
"kpt.dev/configsync/e2e/nomostest" | ||
"kpt.dev/configsync/e2e/nomostest/ntopts" | ||
"kpt.dev/configsync/e2e/nomostest/taskgroup" | ||
nomostesting "kpt.dev/configsync/e2e/nomostest/testing" | ||
"kpt.dev/configsync/pkg/kinds" | ||
ocmetrics "kpt.dev/configsync/pkg/metrics" | ||
) | ||
|
||
const (
	// DefaultMonitorKSA is the name of the Kubernetes service account in the
	// monitoring namespace that gets annotated for Workload Identity.
	DefaultMonitorKSA = "default"

	// MonitorGSA is the name (without the domain suffix) of the Google
	// service account referenced by the Workload Identity annotation.
	MonitorGSA = "e2e-test-metric-writer"
)
|
||
func TestOtelCollectorDeployment(t *testing.T) { | ||
nt := nomostest.New(t, nomostesting.Reconciliation1, ntopts.RequireGKE(t)) | ||
nt.T.Cleanup(func() { | ||
nt.MustKubectl("delete", "-f", "../testdata/otel-collector/otel-cm-duplicate-timeseries.yaml", "--ignore-not-found") | ||
if t.Failed() { | ||
nt.PodLogs("config-management-monitoring", ocmetrics.OtelCollectorName, "", false) | ||
} | ||
}) | ||
|
||
// If Workload Identity enabled on cluster, setup KSA to GSA annotation | ||
if workloadPool, err := getWorkloadPool(nt); err != nil { | ||
nt.T.Fatal(err) | ||
} else if workloadPool != "" { | ||
nt.T.Log("Workload identity enabled, adding KSA annotation to use %s service account", MonitorGSA) | ||
nt.MustKubectl("annotate", "serviceaccount", "--namespace", ocmetrics.MonitoringNamespace, DefaultMonitorKSA, | ||
fmt.Sprintf("iam.gke.io/gcp-service-account=%s@%s.iam.gserviceaccount.com", MonitorGSA, *e2e.GCPProject)) | ||
} | ||
|
||
nt.T.Log("Restart otel-collector pod to refresh the ConfigMap, log and IAM") | ||
nomostest.DeletePodByLabel(nt, "app", "opentelemetry", false) | ||
if err := nt.WatchForAllSyncs(); err != nil { | ||
nt.T.Fatal(err) | ||
} | ||
if err := validateOtelCollectorStatusCurrent(nt, false); err != nil { | ||
nt.T.Fatal() | ||
} | ||
|
||
nt.T.Log("Wait 1 minute for metric exporter to settle") | ||
time.Sleep(1 * time.Minute) | ||
nt.T.Log("Checking the otel-collector log contains no failure...") | ||
err := CheckDeploymentLogHasNoFailure(nt, ocmetrics.OtelCollectorName, ocmetrics.MonitoringNamespace) | ||
if err != nil { | ||
nt.T.Fatal(err) | ||
} | ||
|
||
nt.T.Log("Apply custom otel-collector ConfigMap that could cause duplicate time series error") | ||
nt.MustKubectl("apply", "-f", "../testdata/otel-collector/otel-cm-duplicate-timeseries.yaml") | ||
nt.T.Log("Restart otel-collector pod to refresh the ConfigMap and log") | ||
nomostest.DeletePodByLabel(nt, "app", "opentelemetry", false) | ||
if err := nt.WatchForAllSyncs(); err != nil { | ||
nt.T.Fatal(err) | ||
} | ||
if err := validateOtelCollectorStatusCurrent(nt, true); err != nil { | ||
nt.T.Fatal(err) | ||
} | ||
|
||
nt.T.Log("wait 1 minute for metric exporter to settle") | ||
time.Sleep(1 * time.Minute) | ||
nt.T.Log("Checking the otel-collector log contains failure...") | ||
if err := CheckDeploymentLogHasFailure(nt, ocmetrics.OtelCollectorName, ocmetrics.MonitoringNamespace); err != nil { | ||
nt.T.Fatal(err) | ||
} | ||
} | ||
|
||
func CheckDeploymentLogHasFailure(nt *nomostest.NT, deployment, namespace string) error { | ||
nt.T.Helper() | ||
|
||
args := []string{"logs", fmt.Sprintf("deployment/%s", deployment), "-n", namespace} | ||
cmd := fmt.Sprintf("kubectl %s", strings.Join(args, " ")) | ||
out, err := nt.Shell.Kubectl(args...) | ||
if err != nil { | ||
nt.T.Logf("failed to run %q: %v\n%s", cmd, err, out) | ||
return err | ||
} | ||
|
||
entry := strings.Split(string(out), "\n") | ||
for _, m := range entry { | ||
if strings.Contains(m, "failed to export time series to GCM") { | ||
return nil | ||
} | ||
} | ||
return fmt.Errorf("error expected in the log of deployment %s, namespace %s but found none", deployment, namespace) | ||
} | ||
|
||
func CheckDeploymentLogHasNoFailure(nt *nomostest.NT, deployment, namespace string) error { | ||
nt.T.Helper() | ||
|
||
args := []string{"logs", fmt.Sprintf("deployment/%s", deployment), "-n", namespace} | ||
cmd := fmt.Sprintf("kubectl %s", strings.Join(args, " ")) | ||
out, err := nt.Shell.Kubectl(args...) | ||
if err != nil { | ||
nt.T.Logf("failed to run %q: %v\n%s", cmd, err, out) | ||
return err | ||
} | ||
|
||
entry := strings.Split(string(out), "\n") | ||
for _, m := range entry { | ||
if strings.Contains(m, "failed to export time series to GCM") { | ||
return fmt.Errorf("failure found in the log of deployment %s, namespace %s: %s", deployment, namespace, m) | ||
} | ||
} | ||
return nil | ||
} | ||
|
||
// func validateDeploymentstatuscurrent | ||
func validateOtelCollectorStatusCurrent(nt *nomostest.NT, customConfigMap bool) error { | ||
tg := taskgroup.New() | ||
tg.Go(func() error { | ||
return nt.Watcher.WatchForCurrentStatus(kinds.Deployment(), ocmetrics.OtelCollectorName, ocmetrics.MonitoringNamespace) | ||
}) | ||
tg.Go(func() error { | ||
return nt.Watcher.WatchForCurrentStatus(kinds.ConfigMap(), ocmetrics.OtelCollectorName, ocmetrics.MonitoringNamespace) | ||
}) | ||
if customConfigMap { | ||
tg.Go(func() error { | ||
return nt.Watcher.WatchForCurrentStatus(kinds.ConfigMap(), ocmetrics.OtelCollectorCustomCM, ocmetrics.MonitoringNamespace) | ||
}) | ||
} else { | ||
tg.Go(func() error { | ||
return nt.Watcher.WatchForCurrentStatus(kinds.ConfigMap(), ocmetrics.OtelCollectorGooglecloud, ocmetrics.MonitoringNamespace) | ||
}) | ||
} | ||
return tg.Wait() | ||
} |
211 changes: 211 additions & 0 deletions
211
e2e/testdata/otel-collector/otel-cm-duplicate-timeseries.yaml
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,211 @@ | ||
# Copyright 2022 Google LLC | ||
# | ||
# Licensed under the Apache License, Version 2.0 (the "License"); | ||
# you may not use this file except in compliance with the License. | ||
# You may obtain a copy of the License at | ||
# | ||
# http://www.apache.org/licenses/LICENSE-2.0 | ||
# | ||
# Unless required by applicable law or agreed to in writing, software | ||
# distributed under the License is distributed on an "AS IS" BASIS, | ||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
# See the License for the specific language governing permissions and | ||
# limitations under the License. | ||
|
||
apiVersion: v1 | ||
data: | ||
otel-collector-config.yaml: |- | ||
receivers: | ||
opencensus: | ||
exporters: | ||
prometheus: | ||
endpoint: :8675 | ||
namespace: config_sync | ||
resource_to_telemetry_conversion: | ||
enabled: true | ||
googlecloud: | ||
metric: | ||
prefix: "custom.googleapis.com/opencensus/config_sync/" | ||
# The exporter would always fail at sending metric descriptor. Skipping | ||
# creation of metric descriptors until the error from upstream is resolved | ||
# The metric streaming data is not affected | ||
# https://github.com/GoogleCloudPlatform/opentelemetry-operations-go/issues/529 | ||
skip_create_descriptor: true | ||
# resource_filters looks for metric resource attributes by prefix and converts | ||
# them into custom metric labels, so they become visible and can be accessed | ||
# under the GroupBy dropdown list in Cloud Monitoring | ||
resource_filters: | ||
- prefix: "cloud.account.id" | ||
- prefix: "cloud.availability.zone" | ||
- prefix: "cloud.platform" | ||
- prefix: "cloud.provider" | ||
- prefix: "k8s.pod.ip" | ||
- prefix: "k8s.pod.namespace" | ||
- prefix: "k8s.pod.uid" | ||
- prefix: "k8s.container.name" | ||
- prefix: "host.id" | ||
- prefix: "host.name" | ||
- prefix: "k8s.deployment.name" | ||
- prefix: "k8s.node.name" | ||
retry_on_failure: | ||
enabled: false | ||
sending_queue: | ||
enabled: false | ||
googlecloud/kubernetes: | ||
metric: | ||
prefix: "kubernetes.io/internal/addons/config_sync/" | ||
# skip_create_descriptor: Metrics that start with 'kubernetes.io/' already | ||
# have descriptors defined internally. Skip sending duplicated metric | ||
# descriptors here to prevent errors or conflicts. | ||
skip_create_descriptor: true | ||
# instrumentation_library_labels: Otel Collector by default attaches | ||
# 'instrumentation_version' and 'instrumentation_source' labels that are | ||
# not specified in our Cloud Monarch definitions, thus skipping them here | ||
instrumentation_library_labels: false | ||
# create_service_timeseries: This is a recommended configuration for | ||
# 'service metrics' that start with the 'kubernetes.io/' prefix. It uses the | ||
# CreateTimeSeries API and has its own quotas, so that custom metric writes | ||
# will not break this ingestion pipeline | ||
create_service_timeseries: true | ||
service_resource_labels: false | ||
retry_on_failure: | ||
enabled: false | ||
sending_queue: | ||
enabled: false | ||
processors: | ||
batch: | ||
# resourcedetection: This processor is needed to correctly mirror resource | ||
# labels from OpenCensus to OpenTelemetry. We also want to keep this same | ||
# processor in Otel Agent configuration as the resource labels are added from | ||
# there | ||
resourcedetection: | ||
detectors: [env, gcp] | ||
filter/cloudmonitoring: | ||
metrics: | ||
include: | ||
match_type: regexp | ||
metric_names: | ||
- reconciler_errors | ||
- apply_duration_seconds | ||
- reconcile_duration_seconds | ||
- rg_reconcile_duration_seconds | ||
- last_sync_timestamp | ||
- pipeline_error_observed | ||
- declared_resources | ||
- apply_operations_total | ||
- resource_fights_total | ||
- internal_errors_total | ||
- kcc_resource_count | ||
- resource_count | ||
- ready_resource_count | ||
- cluster_scoped_resource_count | ||
- resource_ns_count | ||
- api_duration_seconds | ||
filter/kubernetes: | ||
metrics: | ||
include: | ||
match_type: regexp | ||
metric_names: | ||
- kustomize.* | ||
- api_duration_seconds | ||
- reconciler_errors | ||
- pipeline_error_observed | ||
- reconcile_duration_seconds | ||
- rg_reconcile_duration_seconds | ||
- parser_duration_seconds | ||
- declared_resources | ||
- apply_operations_total | ||
- apply_duration_seconds | ||
- resource_fights_total | ||
- remediate_duration_seconds | ||
- resource_conflicts_total | ||
- internal_errors_total | ||
- rendering_count_total | ||
- skip_rendering_count_total | ||
- resource_override_count_total | ||
- git_sync_depth_override_count_total | ||
- no_ssl_verify_count_total | ||
- kcc_resource_count | ||
- last_sync_timestamp | ||
# Remove custom configsync metric labels that are not registered with Monarch | ||
# This action applies to all metrics that are sent through the pipeline that | ||
# is using this processor | ||
attributes/kubernetes: | ||
actions: | ||
# Remove custom configsync metric labels that are not registered with Monarch | ||
- key: configsync.sync.kind | ||
action: delete | ||
- key: configsync.sync.name | ||
action: delete | ||
- key: configsync.sync.namespace | ||
action: delete | ||
# Remove high cardinality configsync metric labels when sending to Monarch. | ||
# These labels are useful to users, but too noisy for global aggregation. | ||
- key: commit | ||
action: delete | ||
- key: type | ||
action: delete | ||
metricstransform/kubernetes: | ||
transforms: | ||
- include: declared_resources | ||
action: update | ||
new_name: current_declared_resources | ||
- include: reconciler_errors | ||
action: update | ||
new_name: last_reconciler_errors | ||
- include: pipeline_error_observed | ||
action: update | ||
new_name: last_pipeline_error_observed | ||
- include: apply_operations_total | ||
action: update | ||
new_name: apply_operations_count | ||
- include: resource_fights_total | ||
action: update | ||
new_name: resource_fights_count | ||
- include: resource_conflicts_total | ||
action: update | ||
new_name: resource_conflicts_count | ||
- include: internal_errors_total | ||
action: update | ||
new_name: internal_errors_count | ||
- include: rendering_count_total | ||
action: update | ||
new_name: rendering_count | ||
- include: skip_rendering_count_total | ||
action: update | ||
new_name: skip_rendering_count | ||
- include: resource_override_count_total | ||
action: update | ||
new_name: resource_override_count | ||
- include: git_sync_depth_override_count_total | ||
action: update | ||
new_name: git_sync_depth_override_count | ||
- include: no_ssl_verify_count_total | ||
action: update | ||
new_name: no_ssl_verify_count | ||
extensions: | ||
health_check: | ||
service: | ||
extensions: [health_check] | ||
pipelines: | ||
metrics/cloudmonitoring: | ||
receivers: [opencensus] | ||
processors: [batch, filter/cloudmonitoring, resourcedetection] | ||
exporters: [googlecloud] | ||
metrics/prometheus: | ||
receivers: [opencensus] | ||
processors: [batch] | ||
exporters: [prometheus] | ||
metrics/kubernetes: | ||
receivers: [opencensus] | ||
processors: [batch, filter/kubernetes, attributes/kubernetes, metricstransform/kubernetes, resourcedetection] | ||
exporters: [googlecloud/kubernetes] | ||
kind: ConfigMap | ||
metadata: | ||
labels: | ||
app: opentelemetry | ||
component: otel-collector | ||
configmanagement.gke.io/arch: csmr | ||
configmanagement.gke.io/system: "true" | ||
name: otel-collector-custom | ||
namespace: config-management-monitoring |
Oops, something went wrong.