diff --git a/changelog/fragments/1729235281-kube-stack-helm-chart.yaml b/changelog/fragments/1729235281-kube-stack-helm-chart.yaml
new file mode 100644
index 00000000000..44e36929e6d
--- /dev/null
+++ b/changelog/fragments/1729235281-kube-stack-helm-chart.yaml
@@ -0,0 +1,32 @@
+# Kind can be one of:
+# - breaking-change: a change to previously-documented behavior
+# - deprecation: functionality that is being removed in a later release
+# - bug-fix: fixes a problem in a previous version
+# - enhancement: extends functionality but does not break or fix existing behavior
+# - feature: new functionality
+# - known-issue: problems that we are aware of in a given version
+# - security: impacts on the security of a product or a user’s deployment.
+# - upgrade: important information for someone upgrading from a prior version
+# - other: does not fit into any of the other categories
+kind: other
+
+# Change summary; a 80ish characters long description of the change.
+summary: add EDOT collector kube-stack Helm values
+
+# Long description; in case the summary is not enough to describe the change
+# this field accommodate a description without length limits.
+# NOTE: This field will be rendered only for breaking-change and known-issue kinds at the moment.
+#description:
+
+# Affected component; usually one of "elastic-agent", "fleet-server", "filebeat", "metricbeat", "auditbeat", "all", etc.
+component: "elastic-agent"
+
+# PR URL; optional; the PR number that added the changeset.
+# If not present is automatically filled by the tooling finding the PR where this changelog fragment has been added.
+# NOTE: the tooling supports backports, so it's able to fill the original PR number instead of the backport PR number.
+# Please provide it if you are adding a fragment for a different PR.
+#pr: https://github.com/owner/repo/1234
+
+# Issue URL; optional; the GitHub issue related to this changeset (either closes or is part of).
+# If not present is automatically filled by the tooling with the issue linked to the PR number.
+#issue: https://github.com/owner/repo/1234
diff --git a/deploy/helm/edot-collector/kube-stack/README.md b/deploy/helm/edot-collector/kube-stack/README.md
new file mode 100644
index 00000000000..b11585240b7
--- /dev/null
+++ b/deploy/helm/edot-collector/kube-stack/README.md
@@ -0,0 +1,63 @@
+## Kube-stack Helm Chart
+
+**More detailed documentation can be found [here](https://github.com/elastic/opentelemetry/blob/main/docs/kubernetes/operator/README.md).**
+
+The [kube-stack Helm Chart](https://github.com/open-telemetry/opentelemetry-helm-charts/tree/main/charts/opentelemetry-kube-stack) is used to manage the installation of the OpenTelemetry operator (including its CRDs) and to configure a suite of EDOT collectors, which instrument various Kubernetes components to enable comprehensive observability and monitoring.
+
+The chart is installed with the provided default `values.yaml` file, which can be customized as needed.
+
+### DaemonSet collectors
+
+The OpenTelemetry components deployed within the DaemonSet EDOT collectors are responsible for observing signals specific to each node. To ensure complete data collection, these components must be deployed on every node in the cluster; failing to do so will result in partial and potentially incomplete data.
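+
+For reference, the bundled `values.yaml` in this chart directory declares the node-level collector under `collectors.daemon`; the abridged excerpt below shows the settings that make per-node collection possible (the full file also contains the complete receiver and pipeline configuration):
+
+```
+collectors:
+  daemon:
+    presets:
+      logsCollection:
+        enabled: true          # tail Pod log files from the node's filesystem
+        storeCheckpoints: true # keep file offsets so log collection resumes after restarts
+    hostNetwork: true          # share the node's network namespace
+    securityContext:           # run as root so host-level logs and metrics are readable
+      runAsUser: 0
+      runAsGroup: 0
+```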
+
+The DaemonSet collectors handle the following data:
+
+- Host Metrics: Collects host metrics specific to each node, utilizing the [hostmetrics receiver](https://github.com/open-telemetry/opentelemetry-collector-contrib/blob/main/receiver/hostmetricsreceiver/README.md).
+- Kubernetes Metrics: Captures metrics related to the Kubernetes infrastructure on each node, utilizing the [kubeletstats receiver](https://github.com/open-telemetry/opentelemetry-collector-contrib/blob/main/receiver/kubeletstatsreceiver/README.md).
+- Logs: Utilizes the [filelog receiver](https://github.com/open-telemetry/opentelemetry-collector-contrib/tree/main/receiver/filelogreceiver) to gather logs from all Pods running on the respective node.
+- OTLP Traces: Utilizes the [OTLP receiver](https://github.com/open-telemetry/opentelemetry-collector/blob/main/receiver/otlpreceiver/README.md), which configures both HTTP and gRPC endpoints on the node to receive OTLP trace data.
+
+### Deployment collector
+
+The OpenTelemetry components deployed within the Deployment collector focus on gathering data at the cluster level rather than from individual nodes. Unlike the DaemonSet collectors, which run on every node, the Deployment collector operates as a single standalone instance.
+
+The Deployment collector handles the following data:
+
+- Kubernetes Events: Monitors and collects events occurring across the entire Kubernetes cluster, utilizing the [k8sobjects receiver](https://github.com/open-telemetry/opentelemetry-collector-contrib/tree/main/receiver/k8sobjectsreceiver).
+- Cluster Metrics: Captures metrics that provide insights into the overall health and performance of the Kubernetes cluster, utilizing the [k8s_cluster receiver](https://github.com/open-telemetry/opentelemetry-collector-contrib/tree/main/receiver/k8sclusterreceiver).
+
+### Auto-instrumentation
+
+The Helm Chart is configured to enable zero-code instrumentation using the [Operator's Instrumentation resource](https://github.com/open-telemetry/opentelemetry-operator/?tab=readme-ov-file#opentelemetry-auto-instrumentation-injection) for the following programming languages:
+
+- Go
+- Java
+- Node.js
+- Python
+- .NET
+
+### Installation
+
+1. Create the `opentelemetry-operator-system` Kubernetes namespace:
+```
+$ kubectl create namespace opentelemetry-operator-system
+```
+
+2. Create a secret in Kubernetes with the following command:
+   ```
+   kubectl create -n opentelemetry-operator-system secret generic elastic-secret-otel \
+     --from-literal=elastic_endpoint='YOUR_ELASTICSEARCH_ENDPOINT' \
+     --from-literal=elastic_api_key='YOUR_ELASTICSEARCH_API_KEY'
+   ```
+   Don't forget to replace:
+   - `YOUR_ELASTICSEARCH_ENDPOINT`: your Elasticsearch endpoint (*with* the `https://` prefix, for example `https://1234567.us-west2.gcp.elastic-cloud.com:443`).
+   - `YOUR_ELASTICSEARCH_API_KEY`: your Elasticsearch API key.
+
+3. Execute the following commands to deploy the Helm Chart.
+ +``` +$ helm repo add open-telemetry https://open-telemetry.github.io/opentelemetry-helm-charts +$ helm repo update +$ helm upgrade --install --namespace opentelemetry-operator-system opentelemetry-kube-stack open-telemetry/opentelemetry-kube-stack --values ./values.yaml --version 0.3.0 +``` diff --git a/deploy/helm/edot-collector/kube-stack/values.yaml b/deploy/helm/edot-collector/kube-stack/values.yaml new file mode 100644 index 00000000000..092ad3c7600 --- /dev/null +++ b/deploy/helm/edot-collector/kube-stack/values.yaml @@ -0,0 +1,827 @@ +opentelemetry-operator: + manager: + extraArgs: + - --enable-go-instrumentation + admissionWebhooks: + certManager: + enabled: false + +autoGenerateCert: + enabled: true + recreate: true + +crds: + create: true + +defaultCRConfig: + image: + repository: "docker.elastic.co/beats/elastic-agent" + tag: "8.16.0-SNAPSHOT" + targetAllocator: + enabled: false + env: + - name: ELASTIC_AGENT_OTEL + value: '"true"' + - name: ELASTIC_ENDPOINT + valueFrom: + secretKeyRef: + name: elastic-secret-otel + key: elastic_endpoint + - name: ELASTIC_API_KEY + valueFrom: + secretKeyRef: + name: elastic-secret-otel + key: elastic_api_key + +clusterRole: + rules: + - apiGroups: [ "" ] + resources: ["configmaps"] + verbs: ["get"] + +# `clusterName` specifies the name of the kubernetes cluster +# It set the 'k8s.cluster.name' field, should be used for kubernetes environments, where cluster name can not be detected using resourcedetection +# Cluster Name is detected automatically for EKS/GKE/AKS +# clusterName: myClusterName +collectors: + cluster: + config: + exporters: + debug: + verbosity: basic + elasticsearch/otel: + endpoints: + - ${env:ELASTIC_ENDPOINT} + api_key: ${env:ELASTIC_API_KEY} + logs_dynamic_index: + enabled: true + # tls: + # insecure_skip_verify: true + mapping: + mode: otel + processors: + resourcedetection/eks: + detectors: [env, eks] + timeout: 15s + override: true + eks: + resource_attributes: + k8s.cluster.name: + enabled: true + resourcedetection/gcp: + detectors: [env, gcp] + timeout: 2s + override: false + resourcedetection/aks: + detectors: [env, aks] + timeout: 2s + override: false + aks: + resource_attributes: + k8s.cluster.name: + enabled: true + resource/k8s: + attributes: + - key: service.name + from_attribute: app.label.name + action: insert + - key: service.name + from_attribute: k8s.container.name + action: insert + - key: app.label.name + action: delete + - key: service.version + from_attribute: app.label.version + action: insert + - key: app.label.version + action: delete + k8sattributes: + passthrough: false + pod_association: + - sources: + - from: resource_attribute + name: k8s.pod.ip + - sources: + - from: resource_attribute + name: k8s.pod.uid + - sources: + - from: connection + extract: + metadata: + - "k8s.namespace.name" + - "k8s.deployment.name" + - "k8s.replicaset.name" + - "k8s.statefulset.name" + - "k8s.daemonset.name" + - "k8s.cronjob.name" + - "k8s.job.name" + - "k8s.node.name" + - "k8s.pod.name" + - "k8s.pod.ip" + - "k8s.pod.uid" + - "k8s.pod.start_time" + labels: + - tag_name: app.label.name + key: app.kubernetes.io/name + from: pod + - tag_name: app.label.version + key: app.kubernetes.io/version + from: pod + receivers: + k8s_cluster: + auth_type: serviceAccount + node_conditions_to_report: + - Ready + - MemoryPressure + allocatable_types_to_report: + - cpu + - memory + metrics: + k8s.pod.status_reason: + enabled: true + resource_attributes: + k8s.kubelet.version: + enabled: true + os.description: + enabled: true + 
os.type: + enabled: true + k8s.container.status.last_terminated_reason: + enabled: true + + service: + pipelines: + metrics: + exporters: + - debug + - elasticsearch/otel + processors: + - k8sattributes + - resourcedetection/eks + - resourcedetection/gcp + - resourcedetection/aks + - resource/k8s + receivers: + - k8s_cluster + logs: + receivers: + - k8sobjects + processors: + - resourcedetection/eks + - resourcedetection/gcp + - resourcedetection/aks + exporters: + - debug + - elasticsearch/otel + daemon: + presets: + logsCollection: + enabled: true + storeCheckpoints: true + hostNetwork: true + securityContext: + runAsUser: 0 + runAsGroup: 0 + scrape_configs_file: "" + config: + connectors: + signaltometrics: + logs: + - name: service_summary + include_resource_attributes: + - key: service.name + - key: deployment.environment # service.environment + - key: telemetry.sdk.language # service.language.name + - key: agent.name # set via elastictraceprocessor + attributes: + - key: metricset.name + default_value: service_summary + sum: + value: "1" + datapoints: + - name: service_summary + include_resource_attributes: + - key: service.name + - key: deployment.environment # service.environment + - key: telemetry.sdk.language # service.language.name + - key: agent.name # set via elastictraceprocessor + attributes: + - key: metricset.name + default_value: service_summary + sum: + value: "1" + spans: + - name: service_summary + include_resource_attributes: + - key: service.name + - key: deployment.environment # service.environment + - key: telemetry.sdk.language # service.language.name + - key: agent.name # set via elastictraceprocessor + attributes: + - key: metricset.name + default_value: service_summary + sum: + value: Int(AdjustedCount()) + - name: transaction.duration.histogram + description: APM service transaction aggregated metrics as histogram + include_resource_attributes: + - key: service.name + - key: deployment.environment # service.environment + - key: telemetry.sdk.language # service.language.name + - key: agent.name # set via elastictraceprocessor + attributes: + - key: transaction.root + - key: transaction.type + - key: metricset.name + default_value: service_transaction + - key: elasticsearch.mapping.hints + default_value: [_doc_count] + unit: us + exponential_histogram: + value: Microseconds(end_time - start_time) + - name: transaction.duration.summary + description: APM service transaction aggregated metrics as summary + include_resource_attributes: + - key: service.name + - key: deployment.environment # service.environment + - key: telemetry.sdk.language # service.language.name + - key: agent.name # set via elastictraceprocessor + attributes: + - key: transaction.root + - key: transaction.type + - key: metricset.name + default_value: service_transaction + - key: elasticsearch.mapping.hints + default_value: [aggregate_metric_double] + unit: us + histogram: + buckets: [1] + value: Microseconds(end_time - start_time) + - name: transaction.duration.histogram + description: APM transaction aggregated metrics as histogram + ephemeral_resource_attribute: true + include_resource_attributes: + - key: service.name + - key: deployment.environment # service.environment + - key: telemetry.sdk.language # service.language.name + - key: agent.name # set via elastictraceprocessor + - key: container.id + - key: k8s.pod.name + - key: service.version + - key: service.instance.id # service.node.name + - key: process.runtime.name # service.runtime.name + - key: process.runtime.version # 
service.runtime.version + - key: telemetry.sdk.version # service.language.version?? + - key: host.name + - key: os.type # host.os.platform + - key: faas.instance + - key: faas.name + - key: faas.version + - key: cloud.provider + - key: cloud.region + - key: cloud.availability_zone + - key: cloud.platform # cloud.servicename + - key: cloud.account.id + attributes: + - key: transaction.root + - key: transaction.name + - key: transaction.type + - key: transaction.result + - key: event.outcome + - key: metricset.name + default_value: transaction + - key: elasticsearch.mapping.hints + default_value: [_doc_count] + unit: us + exponential_histogram: + value: Microseconds(end_time - start_time) + - name: transaction.duration.summary + description: APM transaction aggregated metrics as summary + ephemeral_resource_attribute: true + include_resource_attributes: + - key: service.name + - key: deployment.environment # service.environment + - key: telemetry.sdk.language # service.language.name + - key: agent.name # set via elastictraceprocessor + - key: container.id + - key: k8s.pod.name + - key: service.version + - key: service.instance.id # service.node.name + - key: process.runtime.name # service.runtime.name + - key: process.runtime.version # service.runtime.version + - key: telemetry.sdk.version # service.language.version?? + - key: host.name + - key: os.type # host.os.platform + - key: faas.instance + - key: faas.name + - key: faas.version + - key: cloud.provider + - key: cloud.region + - key: cloud.availability_zone + - key: cloud.platform # cloud.servicename + - key: cloud.account.id + attributes: + - key: transaction.root + - key: transaction.name + - key: transaction.type + - key: transaction.result + - key: event.outcome + - key: metricset.name + default_value: transaction + - key: elasticsearch.mapping.hints + default_value: [aggregate_metric_double] + unit: us + histogram: + buckets: [1] + value: Microseconds(end_time - start_time) + - name: span.destination.service.response_time.sum.us + description: APM span destination metrics + ephemeral_resource_attribute: true + include_resource_attributes: + - key: service.name + - key: deployment.environment # service.environment + - key: telemetry.sdk.language # service.language.name + - key: agent.name # set via elastictraceprocessor + attributes: + - key: span.name + - key: event.outcome + - key: service.target.type + - key: service.target.name + - key: span.destination.service.resource + - key: metricset.name + default_value: service_destination + unit: us + sum: + value: Double(Microseconds(end_time - start_time)) + - name: span.destination.service.response_time.count + description: APM span destination metrics + ephemeral_resource_attribute: true + include_resource_attributes: + - key: service.name + - key: deployment.environment # service.environment + - key: telemetry.sdk.language # service.language.name + - key: agent.name # set via elastictraceprocessor + attributes: + - key: span.name + - key: event.outcome + - key: service.target.type + - key: service.target.name + - key: span.destination.service.resource + - key: metricset.name + default_value: service_destination + sum: + value: Int(AdjustedCount()) + # event.success_count is populated using 2 metric definition with different conditions + # and value for the histogram bucket based on event outcome. Both metric definition + # are created using same name and attribute and will result in a single histogram. 
+ # We use mapping hint of aggregate_metric_double, so, only the sum and the count + # values are required and the actual histogram bucket is ignored. + - name: event.success_count + description: Success count as a metric for service transaction + include_resource_attributes: + - key: service.name + - key: deployment.environment # service.environment + - key: telemetry.sdk.language # service.language.name + - key: agent.name # set via elastictraceprocessor + attributes: + - key: transaction.root + - key: transaction.type + - key: metricset.name + default_value: service_transaction + - key: elasticsearch.mapping.hints + default_value: [aggregate_metric_double] + conditions: + - attributes["event.outcome"] != nil and attributes["event.outcome"] == "success" + unit: us + histogram: + buckets: [1] + count: Int(AdjustedCount()) + value: Int(AdjustedCount()) + - name: event.success_count + description: Success count as a metric for service transaction + include_resource_attributes: + - key: service.name + - key: deployment.environment # service.environment + - key: telemetry.sdk.language # service.language.name + - key: agent.name # set via elastictraceprocessor + attributes: + - key: transaction.root + - key: transaction.type + - key: metricset.name + default_value: service_transaction + - key: elasticsearch.mapping.hints + default_value: [aggregate_metric_double] + conditions: + - attributes["event.outcome"] != nil and attributes["event.outcome"] != "success" + unit: us + histogram: + buckets: [0] + count: Int(AdjustedCount()) + value: Double(0) + exporters: + debug: + verbosity: basic + elasticsearch/otel: + endpoints: + - ${env:ELASTIC_ENDPOINT} + api_key: ${env:ELASTIC_API_KEY} + metrics_dynamic_index: + enabled: true + logs_dynamic_index: + enabled: true + traces_dynamic_index: + enabled: true + flush: + interval: 10s + # tls: + # insecure_skip_verify: true + mapping: + mode: otel + elasticsearch/ecs: + endpoints: + - ${env:ELASTIC_ENDPOINT} + api_key: ${env:ELASTIC_API_KEY} + # tls: + # insecure_skip_verify: true + mapping: + mode: ecs + processors: + batch: {} + elastictrace: + lsminterval: + intervals: + - duration: 1m + statements: + - set(resource.attributes["metricset.interval"], "1m") + - set(attributes["data_stream.dataset"], Concat([attributes["metricset.name"], "1m"], ".")) + - set(attributes["processor.event"], "metric") + - duration: 10m + statements: + - set(resource.attributes["metricset.interval"], "10m") + - set(attributes["data_stream.dataset"], Concat([attributes["metricset.name"], "10m"], ".")) + - set(attributes["processor.event"], "metric") + - duration: 60m + statements: + - set(resource.attributes["metricset.interval"], "60m") + - set(attributes["data_stream.dataset"], Concat([attributes["metricset.name"], "60m"], ".")) + - set(attributes["processor.event"], "metric") + elasticinframetrics: + add_system_metrics: true + add_k8s_metrics: true + drop_original: true + resourcedetection/eks: + detectors: [env, eks] + timeout: 15s + override: true + eks: + resource_attributes: + k8s.cluster.name: + enabled: true + resourcedetection/gcp: + detectors: [env, gcp] + timeout: 2s + override: false + resourcedetection/aks: + detectors: [env, aks] + timeout: 2s + override: false + aks: + resource_attributes: + k8s.cluster.name: + enabled: true + resource/k8s: + attributes: + - key: service.name + from_attribute: app.label.name + action: insert + - key: service.name + from_attribute: k8s.container.name + action: insert + - key: app.label.name + action: delete + - key: 
service.version + from_attribute: app.label.version + action: insert + - key: app.label.version + action: delete + attributes/dataset: + actions: + - key: event.dataset + from_attribute: data_stream.dataset + action: upsert + resource/cloud: + attributes: + - key: cloud.instance.id + from_attribute: host.id + action: insert + resource/process: + attributes: + - key: process.executable.name + action: delete + - key: process.executable.path + action: delete + resourcedetection/system: + detectors: ["system", "ec2"] + system: + hostname_sources: [ "os" ] + resource_attributes: + host.name: + enabled: true + host.id: + enabled: false + host.arch: + enabled: true + host.ip: + enabled: true + host.mac: + enabled: true + host.cpu.vendor.id: + enabled: true + host.cpu.family: + enabled: true + host.cpu.model.id: + enabled: true + host.cpu.model.name: + enabled: true + host.cpu.stepping: + enabled: true + host.cpu.cache.l2.size: + enabled: true + os.description: + enabled: true + os.type: + enabled: true + ec2: + resource_attributes: + host.name: + enabled: false + host.id: + enabled: true + k8sattributes: + filter: + node_from_env_var: OTEL_K8S_NODE_NAME + passthrough: false + pod_association: + - sources: + - from: resource_attribute + name: k8s.pod.ip + - sources: + - from: resource_attribute + name: k8s.pod.uid + - sources: + - from: connection + extract: + metadata: + - "k8s.namespace.name" + - "k8s.deployment.name" + - "k8s.replicaset.name" + - "k8s.statefulset.name" + - "k8s.daemonset.name" + - "k8s.cronjob.name" + - "k8s.job.name" + - "k8s.node.name" + - "k8s.pod.name" + - "k8s.pod.ip" + - "k8s.pod.uid" + - "k8s.pod.start_time" + labels: + - tag_name: app.label.name + key: app.kubernetes.io/name + from: pod + - tag_name: app.label.version + key: app.kubernetes.io/version + from: pod + receivers: + otlp: + protocols: + grpc: + endpoint: 0.0.0.0:4317 + http: + endpoint: 0.0.0.0:4318 + filelog: + retry_on_failure: + enabled: true + start_at: end + exclude: + # exlude collector logs + - /var/log/pods/opentelemetry-operator-system_opentelemetry-kube-stack*/*/*.log + include: + - /var/log/pods/*/*/*.log + include_file_name: false + include_file_path: true + operators: + - id: container-parser + type: container + hostmetrics: + collection_interval: 10s + root_path: /hostfs + scrapers: + cpu: + metrics: + system.cpu.utilization: + enabled: true + system.cpu.logical.count: + enabled: true + memory: + metrics: + system.memory.utilization: + enabled: true + process: + mute_process_exe_error: true + mute_process_io_error: true + mute_process_user_error: true + metrics: + process.threads: + enabled: true + process.open_file_descriptors: + enabled: true + process.memory.utilization: + enabled: true + process.disk.operations: + enabled: true + network: {} + processes: {} + load: {} + disk: {} + filesystem: + exclude_mount_points: + mount_points: + - /dev/* + - /proc/* + - /sys/* + - /run/k3s/containerd/* + - /var/lib/docker/* + - /var/lib/kubelet/* + - /snap/* + match_type: regexp + exclude_fs_types: + fs_types: + - autofs + - binfmt_misc + - bpf + - cgroup2 + - configfs + - debugfs + - devpts + - devtmpfs + - fusectl + - hugetlbfs + - iso9660 + - mqueue + - nsfs + - overlay + - proc + - procfs + - pstore + - rpc_pipefs + - securityfs + - selinuxfs + - squashfs + - sysfs + - tracefs + match_type: strict + kubeletstats: + auth_type: serviceAccount + collection_interval: 20s + endpoint: ${env:OTEL_K8S_NODE_NAME}:10250 + node: '${env:OTEL_K8S_NODE_NAME}' + # Required to work for all CSPs without an issue + 
insecure_skip_verify: true + k8s_api_config: + auth_type: serviceAccount + metrics: + k8s.pod.memory.node.utilization: + enabled: true + k8s.pod.cpu.node.utilization: + enabled: true + k8s.container.cpu_limit_utilization: + enabled: true + k8s.pod.cpu_limit_utilization: + enabled: true + k8s.container.cpu_request_utilization: + enabled: true + k8s.container.memory_limit_utilization: + enabled: true + k8s.pod.memory_limit_utilization: + enabled: true + k8s.container.memory_request_utilization: + enabled: true + k8s.node.uptime: + enabled: true + k8s.node.cpu.usage: + enabled: true + k8s.pod.cpu.usage: + enabled: true + extra_metadata_labels: + - container.id + service: + pipelines: + logs/node: + receivers: + - filelog + processors: + - batch + - k8sattributes + - resourcedetection/system + - resourcedetection/eks + - resourcedetection/gcp + - resourcedetection/aks + - resource/k8s + - resource/cloud + exporters: + - debug + - elasticsearch/otel + metrics/node/otel: + receivers: + - kubeletstats + processors: + - batch + - k8sattributes + - resourcedetection/system + - resourcedetection/eks + - resourcedetection/gcp + - resourcedetection/aks + - resource/k8s + - resource/cloud + exporters: + - debug + - elasticsearch/otel + metrics/node/ecs: + receivers: + - hostmetrics + - kubeletstats + processors: + - elasticinframetrics + - batch + - k8sattributes + - resourcedetection/system + - resourcedetection/eks + - resourcedetection/gcp + - resourcedetection/aks + - resource/k8s + - resource/cloud + - attributes/dataset + - resource/process + exporters: + - debug + - elasticsearch/ecs + metrics/otel-apm: + receivers: + - otlp + processors: + - batch + exporters: + - debug + - signaltometrics + - elasticsearch/otel + logs/apm: + receivers: + - otlp + processors: + - batch + exporters: + - debug + - signaltometrics + - elasticsearch/otel + traces/apm: + receivers: + - otlp + processors: + - batch + - elastictrace + exporters: + - debug + - signaltometrics + - elasticsearch/otel + metrics/aggregated-otel-metrics: + receivers: + - signaltometrics + processors: + - batch + - lsminterval + exporters: + - debug + - elasticsearch/otel + +instrumentation: + name: elastic-instrumentation + enabled: true + exporter: + endpoint: http://opentelemetry-kube-stack-daemon-collector.opentelemetry-operator-system.svc.cluster.local:4318 + propagators: + - tracecontext + - baggage + - b3 + sampler: + type: parentbased_traceidratio + argument: "1.0" + java: + image: docker.elastic.co/observability/elastic-otel-javaagent:1.0.0 + nodejs: + image: docker.elastic.co/observability/elastic-otel-node:edge + dotnet: + image: docker.elastic.co/observability/elastic-otel-dotnet:edge + python: + image: docker.elastic.co/observability/elastic-otel-python:edge + go: + image: ghcr.io/open-telemetry/opentelemetry-go-instrumentation/autoinstrumentation-go:v0.14.0-alpha diff --git a/testing/integration/otel_helm_test.go b/testing/integration/otel_helm_test.go new file mode 100644 index 00000000000..86db08bc262 --- /dev/null +++ b/testing/integration/otel_helm_test.go @@ -0,0 +1,168 @@ +// Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one +// or more contributor license agreements. Licensed under the Elastic License 2.0; +// you may not use this file except in compliance with the Elastic License 2.0. 
+ +//go:build integration + +package integration + +import ( + "context" + "crypto/sha256" + "encoding/base64" + "fmt" + "log" + "os" + "path/filepath" + "strings" + "testing" + "time" + + "github.com/elastic/elastic-agent/pkg/testing/define" + "github.com/stretchr/testify/require" + "gopkg.in/yaml.v2" + "helm.sh/helm/v3/pkg/action" + "helm.sh/helm/v3/pkg/chart/loader" + "helm.sh/helm/v3/pkg/cli" + corev1 "k8s.io/api/core/v1" +) + +var ( + kubeStackChartURL = "https://github.com/open-telemetry/opentelemetry-helm-charts/releases/download/opentelemetry-kube-stack-0.3.2/opentelemetry-kube-stack-0.3.2.tgz" + kubeStackChartVersion = "0.3.2" +) + +func TestOtelKubeStackHelm(t *testing.T) { + info := define.Require(t, define.Requirements{ + Stack: &define.Stack{}, + Local: false, + Sudo: false, + OS: []define.OS{ + // only test the basic and the wolfi container with otel + {Type: define.Kubernetes, DockerVariant: "basic"}, + {Type: define.Kubernetes, DockerVariant: "wolfi"}, + }, + Group: define.Kubernetes, + }) + + client, err := info.KubeClient() + require.NoError(t, err) + require.NotNil(t, client) + + testLogsBasePath := os.Getenv("K8S_TESTS_POD_LOGS_BASE") + require.NotEmpty(t, testLogsBasePath, "K8S_TESTS_POD_LOGS_BASE must be set") + + err = os.MkdirAll(filepath.Join(testLogsBasePath, t.Name()), 0o755) + require.NoError(t, err, "failed to create test logs directory") + + namespace := info.Namespace + + esHost := os.Getenv("ELASTICSEARCH_HOST") + require.NotEmpty(t, esHost, "ELASTICSEARCH_HOST must be set") + + esAPIKey, err := generateESAPIKey(info.ESClient, namespace) + require.NoError(t, err, "failed to generate ES API key") + require.NotEmpty(t, esAPIKey, "failed to generate ES API key") + + chartOptions := &action.ChartPathOptions{ + RepoURL: "https://github.com/open-telemetry/opentelemetry-helm-charts/releases/download/opentelemetry-kube-stack-0.3.2/opentelemetry-kube-stack-0.3.2.tgz", + Version: "0.3.0", + } + + chartLocation, err := action.NewPull().LocateChart(chartOptions.RepoURL, cli.New()) + if err != nil { + panic(err) + } + + testCases := []struct { + name string + valuesFile string + atLeastValidatedPodsNumber int + }{ + { + name: "helm standalone agent default kubernetes privileged", + valuesFile: "../../deploy/helm/edot-collector/kube-stack/values.yaml", + // - perNode Daemonset (at least 1 agent pod) + // - clusterWide Deployment (1 agent pod) + // - operator Deployment (1 agent pod) + atLeastValidatedPodsNumber: 3, + }, + } + + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { + ctx := context.Background() + hasher := sha256.New() + hasher.Write([]byte(tc.name)) + testNamespace := strings.ToLower(base64.URLEncoding.EncodeToString(hasher.Sum(nil))) + testNamespace = noSpecialCharsRegexp.ReplaceAllString(testNamespace, "") + + settings := cli.New() + settings.SetNamespace(testNamespace) + actionConfig := &action.Configuration{} + + helmChart, err := loader.Load(chartLocation) + require.NoError(t, err, "failed to load helm chart") + + err = actionConfig.Init(settings.RESTClientGetter(), settings.Namespace(), "", + func(format string, v ...interface{}) {}) + require.NoError(t, err, "failed to init helm action config") + + yamlFile, err := os.ReadFile(tc.valuesFile) + if err != nil { + require.NoError(t, err, "failed to read helm chart values file") + } + + // Initialize a map to hold the parsed data + helmValues := make(map[string]any) + + // Unmarshal the YAML into the map + err = yaml.Unmarshal(yamlFile, &helmValues) + if err != nil { + log.Fatalf("Error 
unmarshalling YAML: %v", err) + } + + t.Cleanup(func() { + if t.Failed() { + dumpLogs(t, ctx, client, testNamespace, testLogsBasePath) + } + + uninstallAction := action.NewUninstall(actionConfig) + uninstallAction.Wait = true + + _, err = uninstallAction.Run("helm-agent") + if err != nil { + require.NoError(t, err, "failed to uninstall helm chart") + } + }) + + installAction := action.NewInstall(actionConfig) + installAction.Namespace = testNamespace + installAction.CreateNamespace = true + installAction.UseReleaseName = true + installAction.ReleaseName = "helm-agent" + installAction.Timeout = 2 * time.Minute + installAction.Wait = true + installAction.WaitForJobs = true + _, err = installAction.Run(helmChart, helmValues) + require.NoError(t, err, "failed to install helm chart") + + podList := &corev1.PodList{} + err = client.Resources(testNamespace).List(ctx, podList) + require.NoError(t, err, fmt.Sprintf("failed to list pods in namespace %s", testNamespace)) + + checkedAgentContainers := 0 + + for _, pod := range podList.Items { + if !strings.HasPrefix(pod.GetName(), "kube-stack-") { + continue + } + + checkedAgentContainers++ + } + + require.GreaterOrEqual(t, checkedAgentContainers, tc.atLeastValidatedPodsNumber, + fmt.Sprintf("at least %d agent containers should be checked", tc.atLeastValidatedPodsNumber)) + }) + } +}