Skip to content

Commit

Permalink
Fix otel-collector logging repeat error
Browse files Browse the repository at this point in the history
The pod log repeatedly shows a duplicate timeseries error that uses up logging quota. Although the metrics are being sent correctly, the error message can be confusing to users.

The removal of the `type` tag causes one timeseries to be exported multiple times in a batch, with and without the tag.

This change uses aggregation in the metricstransform processor to eliminate the `type` tag from the Monarch pipeline. The format follows otel-collector-contrib 0.54.0.

Tested locally e2e.

b/290678742
  • Loading branch information
tiffanny29631 committed Jul 28, 2023
1 parent a520198 commit 21226a4
Show file tree
Hide file tree
Showing 4 changed files with 367 additions and 3 deletions.
150 changes: 150 additions & 0 deletions e2e/testcases/otel_collector_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,150 @@
// Copyright 2022 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package e2e

import (
"fmt"
"strings"
"testing"
"time"

"kpt.dev/configsync/e2e"
"kpt.dev/configsync/e2e/nomostest"
"kpt.dev/configsync/e2e/nomostest/ntopts"
"kpt.dev/configsync/e2e/nomostest/taskgroup"
nomostesting "kpt.dev/configsync/e2e/nomostest/testing"
"kpt.dev/configsync/pkg/kinds"
ocmetrics "kpt.dev/configsync/pkg/metrics"
)

// DefaultMonitorKSA is the name of the Kubernetes service account in the
// monitoring namespace that gets annotated with the Google service account
// when Workload Identity is enabled on the cluster.
const DefaultMonitorKSA = "default"

// MonitorGSA is the name of the Google service account (without the
// "@<project>.iam.gserviceaccount.com" suffix) that DefaultMonitorKSA is
// annotated to use.
const MonitorGSA = "e2e-test-metric-writer"

// TestOtelCollectorDeployment checks the otel-collector log for metric export
// failures under two configurations.
//
// It first restarts the otel-collector pod, waits for the exporter to settle,
// and requires that the log contains no export failure. It then applies the
// custom ConfigMap from otel-cm-duplicate-timeseries.yaml, restarts the pod
// again, and requires that the export failure does appear in the log.
func TestOtelCollectorDeployment(t *testing.T) {
	// settleTime is how long to wait after a restart before inspecting the
	// collector log, so the metric exporter has had a chance to run.
	const settleTime = 1 * time.Minute

	nt := nomostest.New(t, nomostesting.Reconciliation1, ntopts.RequireGKE(t))
	nt.T.Cleanup(func() {
		// Remove the custom ConfigMap applied by this test; --ignore-not-found
		// keeps cleanup quiet when the test failed before applying it.
		nt.MustKubectl("delete", "-f", "../testdata/otel-collector/otel-cm-duplicate-timeseries.yaml", "--ignore-not-found")
		if t.Failed() {
			nt.PodLogs("config-management-monitoring", ocmetrics.OtelCollectorName, "", false)
		}
	})

	// If Workload Identity enabled on cluster, setup KSA to GSA annotation
	if workloadPool, err := getWorkloadPool(nt); err != nil {
		nt.T.Fatal(err)
	} else if workloadPool != "" {
		// Logf (not Log) so that the %s verb is actually expanded.
		nt.T.Logf("Workload identity enabled, adding KSA annotation to use %s service account", MonitorGSA)
		nt.MustKubectl("annotate", "serviceaccount", "--namespace", ocmetrics.MonitoringNamespace, DefaultMonitorKSA,
			fmt.Sprintf("iam.gke.io/gcp-service-account=%s@%s.iam.gserviceaccount.com", MonitorGSA, *e2e.GCPProject))
	}

	nt.T.Log("Restart otel-collector pod to refresh the ConfigMap, log and IAM")
	nomostest.DeletePodByLabel(nt, "app", "opentelemetry", false)
	if err := nt.WatchForAllSyncs(); err != nil {
		nt.T.Fatal(err)
	}
	if err := validateOtelCollectorStatusCurrent(nt, false); err != nil {
		// Report the actual error instead of failing with an empty message.
		nt.T.Fatal(err)
	}

	nt.T.Logf("Wait %v for metric exporter to settle", settleTime)
	time.Sleep(settleTime)
	nt.T.Log("Checking the otel-collector log contains no failure...")
	if err := CheckDeploymentLogHasNoFailure(nt, ocmetrics.OtelCollectorName, ocmetrics.MonitoringNamespace); err != nil {
		nt.T.Fatal(err)
	}

	nt.T.Log("Apply custom otel-collector ConfigMap that could cause duplicate time series error")
	nt.MustKubectl("apply", "-f", "../testdata/otel-collector/otel-cm-duplicate-timeseries.yaml")
	nt.T.Log("Restart otel-collector pod to refresh the ConfigMap and log")
	nomostest.DeletePodByLabel(nt, "app", "opentelemetry", false)
	if err := nt.WatchForAllSyncs(); err != nil {
		nt.T.Fatal(err)
	}
	if err := validateOtelCollectorStatusCurrent(nt, true); err != nil {
		nt.T.Fatal(err)
	}

	nt.T.Logf("Wait %v for metric exporter to settle", settleTime)
	time.Sleep(settleTime)
	nt.T.Log("Checking the otel-collector log contains failure...")
	if err := CheckDeploymentLogHasFailure(nt, ocmetrics.OtelCollectorName, ocmetrics.MonitoringNamespace); err != nil {
		nt.T.Fatal(err)
	}
}

// CheckDeploymentLogHasFailure fetches the log of the given deployment with
// `kubectl logs` and returns nil when the log contains a
// "failed to export time series to GCM" entry.
//
// It returns the kubectl error (after logging the command and its output) if
// the log cannot be fetched, and a descriptive error if no such entry exists.
func CheckDeploymentLogHasFailure(nt *nomostest.NT, deployment, namespace string) error {
	nt.T.Helper()

	kubectlArgs := []string{"logs", "deployment/" + deployment, "-n", namespace}
	logs, err := nt.Shell.Kubectl(kubectlArgs...)
	if err != nil {
		nt.T.Logf("failed to run %q: %v\n%s", "kubectl "+strings.Join(kubectlArgs, " "), err, logs)
		return err
	}

	// The marker holds no newline, so scanning the whole output at once finds
	// exactly the same matches as scanning it line by line.
	if strings.Contains(string(logs), "failed to export time series to GCM") {
		return nil
	}
	return fmt.Errorf("error expected in the log of deployment %s, namespace %s but found none", deployment, namespace)
}

// CheckDeploymentLogHasNoFailure fetches the log of the given deployment with
// `kubectl logs` and returns nil when no line contains
// "failed to export time series to GCM".
//
// It returns the kubectl error (after logging the command and its output) if
// the log cannot be fetched, and an error quoting the first offending line if
// the failure marker is present.
func CheckDeploymentLogHasNoFailure(nt *nomostest.NT, deployment, namespace string) error {
	nt.T.Helper()

	kubectlArgs := []string{"logs", "deployment/" + deployment, "-n", namespace}
	logs, err := nt.Shell.Kubectl(kubectlArgs...)
	if err != nil {
		nt.T.Logf("failed to run %q: %v\n%s", "kubectl "+strings.Join(kubectlArgs, " "), err, logs)
		return err
	}

	for _, line := range strings.Split(string(logs), "\n") {
		if !strings.Contains(line, "failed to export time series to GCM") {
			continue
		}
		// Report the first matching line so the failure is visible in the test output.
		return fmt.Errorf("failure found in the log of deployment %s, namespace %s: %s", deployment, namespace, line)
	}
	return nil
}

// validateOtelCollectorStatusCurrent waits for the otel-collector Deployment
// and its ConfigMaps in the monitoring namespace to reach Current status.
//
// The otel-collector ConfigMap is always watched. When customConfigMap is true
// the custom ConfigMap is watched as well; otherwise the googlecloud ConfigMap
// is watched instead. The first error reported by the task group is returned.
func validateOtelCollectorStatusCurrent(nt *nomostest.NT, customConfigMap bool) error {
	// Pick the exporter ConfigMap that is expected to exist for this scenario.
	exporterConfigMap := ocmetrics.OtelCollectorGooglecloud
	if customConfigMap {
		exporterConfigMap = ocmetrics.OtelCollectorCustomCM
	}

	watches := taskgroup.New()
	watches.Go(func() error {
		return nt.Watcher.WatchForCurrentStatus(kinds.Deployment(), ocmetrics.OtelCollectorName, ocmetrics.MonitoringNamespace)
	})
	watches.Go(func() error {
		return nt.Watcher.WatchForCurrentStatus(kinds.ConfigMap(), ocmetrics.OtelCollectorName, ocmetrics.MonitoringNamespace)
	})
	watches.Go(func() error {
		return nt.Watcher.WatchForCurrentStatus(kinds.ConfigMap(), exporterConfigMap, ocmetrics.MonitoringNamespace)
	})
	return watches.Wait()
}
197 changes: 197 additions & 0 deletions e2e/testdata/otel-collector/otel-cm-duplicate-timeseries.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,197 @@
apiVersion: v1
data:
otel-collector-config.yaml: |-
receivers:
opencensus:
exporters:
prometheus:
endpoint: :8675
namespace: config_sync
resource_to_telemetry_conversion:
enabled: true
googlecloud:
metric:
prefix: "custom.googleapis.com/opencensus/config_sync/"
# The exporter would always fail at sending metric descriptor. Skipping
# creation of metric descriptors until the error from upstream is resolved
# The metric streaming data is not affected
# https://github.com/GoogleCloudPlatform/opentelemetry-operations-go/issues/529
skip_create_descriptor: true
# resource_filters looks for metric resource attributes by prefix and converts
# them into custom metric labels, so they become visible and can be accessed
# under the GroupBy dropdown list in Cloud Monitoring
resource_filters:
- prefix: "cloud.account.id"
- prefix: "cloud.availability.zone"
- prefix: "cloud.platform"
- prefix: "cloud.provider"
- prefix: "k8s.pod.ip"
- prefix: "k8s.pod.namespace"
- prefix: "k8s.pod.uid"
- prefix: "k8s.container.name"
- prefix: "host.id"
- prefix: "host.name"
- prefix: "k8s.deployment.name"
- prefix: "k8s.node.name"
retry_on_failure:
enabled: false
sending_queue:
enabled: false
googlecloud/kubernetes:
metric:
prefix: "kubernetes.io/internal/addons/config_sync/"
# skip_create_descriptor: Metrics that start with 'kubernetes.io/' already
# have descriptors defined internally. Skip sending duplicated metric
# descriptors here to prevent errors or conflicts.
skip_create_descriptor: true
# instrumentation_library_labels: Otel Collector by default attaches
# 'instrumentation_version' and 'instrumentation_source' labels that are
# not specified in our Cloud Monarch definitions, thus skipping them here
instrumentation_library_labels: false
# create_service_timeseries: This is the recommended configuration for
# 'service metrics' that start with the 'kubernetes.io/' prefix. It uses the
# CreateTimeSeries API and has its own quotas, so that custom metric writes
# will not break this ingestion pipeline
create_service_timeseries: true
service_resource_labels: false
retry_on_failure:
enabled: false
sending_queue:
enabled: false
processors:
batch:
# resourcedetection: This processor is needed to correctly mirror resource
# labels from OpenCensus to OpenTelemetry. We also want to keep this same
# processor in Otel Agent configuration as the resource labels are added from
# there
resourcedetection:
detectors: [env, gcp]
filter/cloudmonitoring:
metrics:
include:
match_type: regexp
metric_names:
- reconciler_errors
- apply_duration_seconds
- reconcile_duration_seconds
- rg_reconcile_duration_seconds
- last_sync_timestamp
- pipeline_error_observed
- declared_resources
- apply_operations_total
- resource_fights_total
- internal_errors_total
- kcc_resource_count
- resource_count
- ready_resource_count
- cluster_scoped_resource_count
- resource_ns_count
- api_duration_seconds
filter/kubernetes:
metrics:
include:
match_type: regexp
metric_names:
- kustomize.*
- api_duration_seconds
- reconciler_errors
- pipeline_error_observed
- reconcile_duration_seconds
- rg_reconcile_duration_seconds
- parser_duration_seconds
- declared_resources
- apply_operations_total
- apply_duration_seconds
- resource_fights_total
- remediate_duration_seconds
- resource_conflicts_total
- internal_errors_total
- rendering_count_total
- skip_rendering_count_total
- resource_override_count_total
- git_sync_depth_override_count_total
- no_ssl_verify_count_total
- kcc_resource_count
- last_sync_timestamp
# Remove custom configsync metric labels that are not registered with Monarch
# This action applies to all metrics that are sent through the pipeline that
# is using this processor
attributes/kubernetes:
actions:
# Remove custom configsync metric labels that are not registered with Monarch
- key: configsync.sync.kind
action: delete
- key: configsync.sync.name
action: delete
- key: configsync.sync.namespace
action: delete
# Remove high cardinality configsync metric labels when sending to Monarch.
# These labels are useful to users, but too noisy for global aggregation.
- key: commit
action: delete
- key: type
action: delete
metricstransform/kubernetes:
transforms:
- include: declared_resources
action: update
new_name: current_declared_resources
- include: reconciler_errors
action: update
new_name: last_reconciler_errors
- include: pipeline_error_observed
action: update
new_name: last_pipeline_error_observed
- include: apply_operations_total
action: update
new_name: apply_operations_count
- include: resource_fights_total
action: update
new_name: resource_fights_count
- include: resource_conflicts_total
action: update
new_name: resource_conflicts_count
- include: internal_errors_total
action: update
new_name: internal_errors_count
- include: rendering_count_total
action: update
new_name: rendering_count
- include: skip_rendering_count_total
action: update
new_name: skip_rendering_count
- include: resource_override_count_total
action: update
new_name: resource_override_count
- include: git_sync_depth_override_count_total
action: update
new_name: git_sync_depth_override_count
- include: no_ssl_verify_count_total
action: update
new_name: no_ssl_verify_count
extensions:
health_check:
service:
extensions: [health_check]
pipelines:
metrics/cloudmonitoring:
receivers: [opencensus]
processors: [batch, filter/cloudmonitoring, resourcedetection]
exporters: [googlecloud]
metrics/prometheus:
receivers: [opencensus]
processors: [batch]
exporters: [prometheus]
metrics/kubernetes:
receivers: [opencensus]
processors: [batch, filter/kubernetes, attributes/kubernetes, metricstransform/kubernetes, resourcedetection]
exporters: [googlecloud/kubernetes]
kind: ConfigMap
metadata:
labels:
app: opentelemetry
component: otel-collector
configmanagement.gke.io/arch: csmr
configmanagement.gke.io/system: "true"
name: otel-collector-custom
namespace: config-management-monitoring
21 changes: 19 additions & 2 deletions pkg/metrics/otel.go
Original file line number Diff line number Diff line change
Expand Up @@ -163,10 +163,15 @@ processors:
# These labels are useful to users, but too noisy for global aggregation.
- key: commit
action: delete
- key: type
action: delete
metricstransform/kubernetes:
transforms:
- include: api_duration_seconds
action: update
operations:
# Eliminate 'type' tag from metric using aggregation
- action: aggregate_labels
label_set: [status, operation]
aggregation_type: sum
- include: declared_resources
action: update
new_name: current_declared_resources
Expand All @@ -179,6 +184,11 @@ processors:
- include: apply_operations_total
action: update
new_name: apply_operations_count
operations:
# Eliminate 'type' tag from metric using aggregation
- action: aggregate_labels
label_set: [controller, operation, status]
aggregation_type: sum
- include: resource_fights_total
action: update
new_name: resource_fights_count
Expand All @@ -191,6 +201,13 @@ processors:
- include: rendering_count_total
action: update
new_name: rendering_count
- include: remediate_duration_seconds
action: update
operations:
# Eliminate 'type' tag from metric using aggregation
- action: aggregate_labels
label_set: [status]
aggregation_type: sum
- include: skip_rendering_count_total
action: update
new_name: skip_rendering_count
Expand Down
Loading

0 comments on commit 21226a4

Please sign in to comment.