Skip to content

Commit

Permalink
Add measurement for cilium endpoint propagation delay
Browse files Browse the repository at this point in the history
  • Loading branch information
dlapcevic committed Jan 14, 2022
1 parent 3e11fa1 commit 7c7c4b1
Show file tree
Hide file tree
Showing 5 changed files with 313 additions and 0 deletions.
Original file line number Diff line number Diff line change
@@ -0,0 +1,159 @@
/*
Copyright 2022 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package common

import (
"fmt"
"math"
"strconv"
"time"

"github.com/prometheus/common/model"
"k8s.io/klog"
"k8s.io/perf-tests/clusterloader2/pkg/errors"
"k8s.io/perf-tests/clusterloader2/pkg/measurement"
"k8s.io/perf-tests/clusterloader2/pkg/util"
)

// Constants configuring the CiliumEndpointPropagationDelay measurement.
const (
// cepPropagationDelayMeasurementName is the name the measurement is
// registered under and the name given to the summary it produces.
cepPropagationDelayMeasurementName = "CiliumEndpointPropagationDelay"
// cepPropagationDelayQuery sums the histogram bucket series, grouped by
// their "le" (bucket boundary) label.
// The metric definition and bucket sizes for cilium_endpoint_propagation_delay_seconds:
// https://github.com/cilium/cilium/blob/v1.11/pkg/metrics/metrics.go#L1263
cepPropagationDelayQuery = `sum(cilium_endpoint_propagation_delay_seconds_bucket) by (le)`
// queryInterval is the spacing between consecutive snapshot queries
// issued over the measured time range.
queryInterval = 10 * time.Minute

// bucketAllEntries is the "le" label of the default Prometheus bucket
// that includes all entries of a histogram snapshot.
bucketAllEntries = "+Inf"
// defaultBucketSLO and defaultPercentileSLO together decide whether the
// measurement passes when the bucketSLO / percentileSLO params are
// invalid or zero. A snapshot passes when the number of entries in the
// defaultBucketSLO bucket is at least defaultPercentileSLO percent of
// the number of entries in the bucketAllEntries bucket.
defaultBucketSLO = 600
defaultPercentileSLO float64 = 95
)

// init registers the CiliumEndpointPropagationDelay measurement, backed by a
// Prometheus measurement wrapping cepPropagationDelayGatherer. A registration
// failure is reported through klog.Fatalf.
func init() {
	err := measurement.Register(cepPropagationDelayMeasurementName, func() measurement.Measurement {
		return CreatePrometheusMeasurement(&cepPropagationDelayGatherer{})
	})
	if err != nil {
		klog.Fatalf("Cannot register %s: %v", cepPropagationDelayMeasurementName, err)
	}
}

// cepPropagationDelayGatherer gathers snapshots of the Cilium endpoint
// propagation delay histogram and validates them against the configured SLO.
// It has no fields and therefore carries no state between calls.
type cepPropagationDelayGatherer struct{}

// cepPropagationDelayMetricMap holds one histogram snapshot per query time.
// The outer key is the sample timestamp rendered as a string; the inner map
// goes from a bucket's "le" label to that bucket's size, rounded to an int.
type cepPropagationDelayMetricMap map[string]map[string]int

// Gather queries the propagation delay histogram between startTime and
// endTime, renders the snapshots as a JSON summary and validates them against
// the SLO taken from config.
//
// If gathering or JSON rendering fails, no summary is returned. If only the
// validation fails, the summary is still returned together with the
// validation error.
func (c *cepPropagationDelayGatherer) Gather(executor QueryExecutor, startTime, endTime time.Time, config *measurement.Config) ([]measurement.Summary, error) {
	snapshots, err := c.gatherCepPropagationDelay(executor, startTime, endTime)
	if err != nil {
		return nil, err
	}

	rendered, err := util.PrettyPrintJSON(snapshots)
	if err != nil {
		return nil, err
	}

	summary := measurement.CreateSummary(cepPropagationDelayMeasurementName, "json", rendered)
	violation := validateCEPPropagationDelay(snapshots, config)
	return []measurement.Summary{summary}, violation
}

// gatherCepPropagationDelay takes a histogram snapshot every queryInterval,
// starting at startTime+queryInterval and stopping before endTime, and groups
// the returned samples by timestamp and "le" bucket.
//
// A failed query is only logged and its snapshot is left out, so the returned
// error is always nil.
func (c *cepPropagationDelayGatherer) gatherCepPropagationDelay(executor QueryExecutor, startTime, endTime time.Time) (cepPropagationDelayMetricMap, error) {
	// Sample on fixed intervals so the result reflects several snapshots
	// instead of a single point in time.
	var collected []*model.Sample
	for at := startTime.Add(queryInterval); at.Before(endTime); at = at.Add(queryInterval) {
		batch, err := executor.Query(cepPropagationDelayQuery, at)
		if err != nil {
			klog.V(2).Infof("Got error querying Prometheus: %v", err)
			continue
		}
		collected = append(collected, batch...)
	}

	snapshots := make(cepPropagationDelayMetricMap)
	for _, s := range collected {
		ts := s.Timestamp.String()
		le := string(s.Metric["le"])
		size := int(math.Round(float64(s.Value)))

		perBucket, seen := snapshots[ts]
		if !seen {
			perBucket = make(map[string]int)
			snapshots[ts] = perBucket
		}
		perBucket[le] = size
	}
	return snapshots, nil
}

// validateCEPPropagationDelay checks every gathered histogram snapshot against
// the propagation delay SLO.
//
// The SLO is read from config.Params: "bucketSLO" selects the histogram bucket
// by its "le" label and "percentileSLO" is the minimum percentage of all
// events that must fall into that bucket. A param that cannot be read or that
// equals 0 falls back to defaultBucketSLO / defaultPercentileSLO.
//
// Snapshots whose bucketAllEntries bucket is empty are skipped. The first
// snapshot found below the required percentage is returned as a metric
// violation error; snapshots are visited in map iteration order, so which
// violating snapshot gets reported is unspecified. A bucket that is absent
// from a snapshot counts as 0 events and is called out explicitly in the
// error. Returns nil when no snapshot violates the SLO.
func validateCEPPropagationDelay(result cepPropagationDelayMetricMap, config *measurement.Config) error {
	bucketNumSLO, err := util.GetFloat64OrDefault(config.Params, "bucketSLO", defaultBucketSLO)
	if err != nil || bucketNumSLO == 0 {
		klog.V(2).Infof("Using defaultBucketSLO: %d, because bucketSLO param is invalid: %v", int(math.Floor(defaultBucketSLO)), err)
		bucketNumSLO = defaultBucketSLO
	}
	// Buckets are looked up by their "le" label, so render the number in its
	// shortest form (600 -> "600", 0.5 -> "0.5").
	bucketSLO := strconv.FormatFloat(bucketNumSLO, 'g', -1, 64)

	percentileSLO, err := util.GetFloat64OrDefault(config.Params, "percentileSLO", defaultPercentileSLO)
	if err != nil || percentileSLO == 0 {
		// Log the default that is actually applied, not the rejected value.
		klog.V(2).Infof("Using defaultPercentileSLO: %f, because percentileSLO param is invalid: %v", defaultPercentileSLO, err)
		percentileSLO = defaultPercentileSLO
	}

	for timestamp, buckets := range result {
		totalEvents := buckets[bucketAllEntries]
		if totalEvents == 0 {
			// Nothing was observed in this snapshot, so there is nothing to judge.
			continue
		}

		acceptedDelayEvents, bucketFound := buckets[bucketSLO]
		perc := (float64(acceptedDelayEvents) / float64(totalEvents)) * 100
		if perc < percentileSLO {
			var detail string
			if bucketFound {
				// Round the measured value down to two decimals so the
				// message never shows it as having reached the target.
				detail = fmt.Sprintf("%s: updates for %ss delay is within %.2f%%, expected %.2f%%, buckets:\n%v",
					timestamp,
					bucketSLO,
					math.Floor(perc*100)/100,
					percentileSLO,
					buckets,
				)
			} else {
				// A missing bucket usually means bucketSLO does not match any
				// histogram boundary; say so instead of reporting a bare 0%.
				detail = fmt.Sprintf("%s: bucket le=%q is missing from the histogram snapshot, so 0%% of updates count as within %ss delay, expected %.2f%%, buckets:\n%v",
					timestamp,
					bucketSLO,
					bucketSLO,
					percentileSLO,
					buckets,
				)
			}
			return errors.NewMetricViolationError("Cilium endpoint propagation delay", detail)
		}
	}
	return nil
}

// Configure is a no-op: it ignores config and always returns nil.
func (c *cepPropagationDelayGatherer) Configure(config *measurement.Config) error {
return nil
}

// IsEnabled always reports true; config is not consulted.
func (c *cepPropagationDelayGatherer) IsEnabled(config *measurement.Config) bool {
return true
}

// String returns the measurement name, cepPropagationDelayMeasurementName.
func (*cepPropagationDelayGatherer) String() string {
return cepPropagationDelayMeasurementName
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,106 @@
/*
Copyright 2022 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package common

import (
"fmt"
"os"
"testing"
"time"

"github.com/stretchr/testify/assert"

"k8s.io/perf-tests/clusterloader2/pkg/measurement"
"k8s.io/perf-tests/clusterloader2/pkg/measurement/common/executors"
)

// TestCiliumEndpointPropagationDelayMeasurement replays recorded histogram
// series through a PromQL executor and checks whether Gather reports an SLO
// violation, both with the default SLO and with custom bucketSLO /
// percentileSLO params.
//
// The custom cases deliberately run against the series file on which the
// default SLO gives the opposite outcome, so they only succeed if the custom
// params are actually honored.
func TestCiliumEndpointPropagationDelayMeasurement(t *testing.T) {
	cases := []struct {
		name               string
		config             *measurement.Config
		hasError           bool
		testSeriesFile     string
		testSeriesDuration time.Duration
	}{
		{
			name:               "default_slo_pass",
			hasError:           false,
			testSeriesFile:     "default_slo_pass.yaml",
			testSeriesDuration: 100 * time.Minute,
			config: &measurement.Config{
				Params: map[string]interface{}{},
			},
		},
		{
			name:               "default_slo_fail",
			hasError:           true,
			testSeriesFile:     "default_slo_fail.yaml",
			testSeriesDuration: 100 * time.Minute,
			config: &measurement.Config{
				Params: map[string]interface{}{},
			},
		},
		{
			// In default_slo_fail.yaml 80% of the updates are within 600s:
			// below the default 95%, but above the custom 75%.
			name:               "custom_slo_pass",
			hasError:           false,
			testSeriesFile:     "default_slo_fail.yaml",
			testSeriesDuration: 100 * time.Minute,
			config: &measurement.Config{
				Params: map[string]interface{}{
					"bucketSLO":     float64(600),
					"percentileSLO": float64(75),
				},
			},
		},
		{
			// In default_slo_pass.yaml every update is within the default
			// 600s bucket, but only 70% are within the custom 120s bucket.
			name:               "custom_slo_fail",
			hasError:           true,
			testSeriesFile:     "default_slo_pass.yaml",
			testSeriesDuration: 100 * time.Minute,
			config: &measurement.Config{
				Params: map[string]interface{}{
					"bucketSLO":     float64(120),
					"percentileSLO": float64(99),
				},
			},
		},
	}

	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			f, err := createRulesFile("../../prometheus/manifests/prometheus-rules.yaml")
			if err != nil {
				t.Fatalf("Failed to create rules file: %v", err)
			}
			defer os.Remove(f.Name())

			executor, err := executors.NewPromqlExecutor(fmt.Sprintf("slos/testdata/cilium_endpoint_propagation_delay/%s", tc.testSeriesFile), f.Name())
			if err != nil {
				t.Fatalf("Failed to create PromQL executor: %v", err)
			}
			defer executor.Close()

			gatherer := &cepPropagationDelayGatherer{}
			start := time.Unix(0, 0).UTC()
			end := start.Add(tc.testSeriesDuration)
			_, err = gatherer.Gather(executor, start, end, tc.config)
			if tc.hasError {
				assert.Error(t, err, "Wanted error, but got none")
			} else {
				assert.NoError(t, err, "Wanted no error")
			}
		})
	}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
interval: 10m
input_series:
- series: cilium_endpoint_propagation_delay_seconds_bucket{container="cilium-agent", endpoint="advdpmetrics", instance="10.120.32.100:9990", job="cilium", le="1", namespace="kube-system", pod="anetd-bbbb2"}
values: 0 0 0 0 0 10 25 25 25 25
- series: cilium_endpoint_propagation_delay_seconds_bucket{container="cilium-agent", endpoint="advdpmetrics", instance="10.120.32.100:9990", job="cilium", le="120", namespace="kube-system", pod="anetd-bbbb2"}
values: 0 0 0 0 0 10 50 50 50 50
- series: cilium_endpoint_propagation_delay_seconds_bucket{container="cilium-agent", endpoint="advdpmetrics", instance="10.120.32.100:9990", job="cilium", le="600", namespace="kube-system", pod="anetd-bbbb2"}
values: 0 0 0 0 0 10 80 80 80 80
- series: cilium_endpoint_propagation_delay_seconds_bucket{container="cilium-agent", endpoint="advdpmetrics", instance="10.120.32.100:9990", job="cilium", le="+Inf", namespace="kube-system", pod="anetd-bbbb2"}
values: 0 0 0 0 0 10 100 100 100 100
- series: cilium_endpoint_propagation_delay_seconds_bucket{container="cilium-agent", endpoint="advdpmetrics", instance="10.120.32.100:9990", job="cilium", le="1", namespace="kube-system", pod="anetd-cccc2"}
values: 0 0 0 0 0 10 25 25 25 25
- series: cilium_endpoint_propagation_delay_seconds_bucket{container="cilium-agent", endpoint="advdpmetrics", instance="10.120.32.100:9990", job="cilium", le="120", namespace="kube-system", pod="anetd-cccc2"}
values: 0 0 0 0 0 10 50 50 50 50
- series: cilium_endpoint_propagation_delay_seconds_bucket{container="cilium-agent", endpoint="advdpmetrics", instance="10.120.32.100:9990", job="cilium", le="600", namespace="kube-system", pod="anetd-cccc2"}
values: 0 0 0 0 0 10 80 80 80 80
- series: cilium_endpoint_propagation_delay_seconds_bucket{container="cilium-agent", endpoint="advdpmetrics", instance="10.120.32.100:9990", job="cilium", le="+Inf", namespace="kube-system", pod="anetd-cccc2"}
values: 0 0 0 0 0 10 100 100 100 100
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
interval: 10m
input_series:
- series: cilium_endpoint_propagation_delay_seconds_bucket{container="cilium-agent", endpoint="advdpmetrics", instance="10.120.32.100:9990", job="cilium", le="1", namespace="kube-system", pod="anetd-bbbb2"}
values: 0 0 0 0 0 10 25 25 25 25
- series: cilium_endpoint_propagation_delay_seconds_bucket{container="cilium-agent", endpoint="advdpmetrics", instance="10.120.32.100:9990", job="cilium", le="120", namespace="kube-system", pod="anetd-bbbb2"}
values: 0 0 0 0 0 10 70 70 70 70
- series: cilium_endpoint_propagation_delay_seconds_bucket{container="cilium-agent", endpoint="advdpmetrics", instance="10.120.32.100:9990", job="cilium", le="600", namespace="kube-system", pod="anetd-bbbb2"}
values: 0 0 0 0 0 10 100 100 100 100
- series: cilium_endpoint_propagation_delay_seconds_bucket{container="cilium-agent", endpoint="advdpmetrics", instance="10.120.32.100:9990", job="cilium", le="+Inf", namespace="kube-system", pod="anetd-bbbb2"}
values: 0 0 0 0 0 10 100 100 100 100
- series: cilium_endpoint_propagation_delay_seconds_bucket{container="cilium-agent", endpoint="advdpmetrics", instance="10.120.32.100:9990", job="cilium", le="1", namespace="kube-system", pod="anetd-cccc2"}
values: 0 0 0 0 0 10 25 25 25 25
- series: cilium_endpoint_propagation_delay_seconds_bucket{container="cilium-agent", endpoint="advdpmetrics", instance="10.120.32.100:9990", job="cilium", le="120", namespace="kube-system", pod="anetd-cccc2"}
values: 0 0 0 0 0 10 70 70 70 70
- series: cilium_endpoint_propagation_delay_seconds_bucket{container="cilium-agent", endpoint="advdpmetrics", instance="10.120.32.100:9990", job="cilium", le="600", namespace="kube-system", pod="anetd-cccc2"}
values: 0 0 0 0 0 10 100 100 100 100
- series: cilium_endpoint_propagation_delay_seconds_bucket{container="cilium-agent", endpoint="advdpmetrics", instance="10.120.32.100:9990", job="cilium", le="+Inf", namespace="kube-system", pod="anetd-cccc2"}
values: 0 0 0 0 0 10 100 100 100 100
12 changes: 12 additions & 0 deletions clusterloader2/testing/load/modules/measurements.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,9 @@
{{$ENABLE_SYSTEM_POD_METRICS:= DefaultParam .ENABLE_SYSTEM_POD_METRICS true}}
{{$ENABLE_VIOLATIONS_FOR_API_CALL_PROMETHEUS := DefaultParam .CL2_ENABLE_VIOLATIONS_FOR_API_CALL_PROMETHEUS false}}
{{$ENABLE_VIOLATIONS_FOR_API_CALL_PROMETHEUS_SIMPLE := DefaultParam .CL2_ENABLE_VIOLATIONS_FOR_API_CALL_PROMETHEUS_SIMPLE true}}
{{$ENABLE_CEP_PROPAGATION_DELAY_MEASUREMENT := DefaultParam .CL2_ENABLE_CEP_PROPAGATION_DELAY_MEASUREMENT false}}
{{$CEP_PROPAGATION_DELAY_SLO_BUCKET := DefaultParam .CL2_CEP_PROPAGATION_DELAY_SLO_BUCKET 600}}
{{$CEP_PROPAGATION_DELAY_SLO_PERCENTILE := DefaultParam .CL2_CEP_PROPAGATION_DELAY_SLO_PERCENTILE 95.0}}
{{$ENABLE_CONTAINER_RESTARTS_MEASUREMENT := DefaultParam .CL2_ENABLE_CONTAINER_RESTARTS_MEASUREMENT false}}
{{$ALLOWED_CONTAINER_RESTARTS := DefaultParam .CL2_ALLOWED_CONTAINER_RESTARTS 1}}
{{$CUSTOM_ALLOWED_CONTAINER_RESTARTS := DefaultParam .CL2_CUSTOM_ALLOWED_CONTAINER_RESTARTS ""}}
Expand Down Expand Up @@ -111,6 +114,15 @@ steps:
action: {{$action}}
defaultAllowedRestarts: {{$ALLOWED_CONTAINER_RESTARTS}}
customAllowedRestarts: {{YamlQuote $CUSTOM_ALLOWED_CONTAINER_RESTARTS 4}}
{{end}}
{{if $ENABLE_CEP_PROPAGATION_DELAY_MEASUREMENT}}
- Identifier: CiliumEndpointPropagationDelay
Method: CiliumEndpointPropagationDelay
Params:
action: {{$action}}
bucketSLO: {{$CEP_PROPAGATION_DELAY_SLO_BUCKET}}
percentileSLO: {{$CEP_PROPAGATION_DELAY_SLO_PERCENTILE}}
enableViolations: true
{{end}}
- Identifier: TestMetrics
Method: TestMetrics
Expand Down

0 comments on commit 7c7c4b1

Please sign in to comment.