From 9e0cafcf89aac8e4a89f7b21b22838ddad05d172 Mon Sep 17 00:00:00 2001
From: Ying WANG <ying.wang@grafana.com>
Date: Wed, 5 Jun 2024 15:58:50 -0500
Subject: [PATCH] add metrics for both otlp and remote write

---
 CHANGELOG.md                   |  2 +-
 pkg/distributor/distributor.go | 45 +++++++++++++++++++++-------------
 pkg/distributor/otel.go        |  2 --
 3 files changed, 29 insertions(+), 20 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index ccb3de9e730..f17bc5968f3 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -24,7 +24,7 @@
 * [FEATURE] Query-frontend, querier: new experimental `/cardinality/active_native_histogram_metrics` API to get active native histogram metric names with statistics about active native histogram buckets. #7982 #7986 #8008
 * [FEATURE] Alertmanager: Added `-alertmanager.max-silences-count` and `-alertmanager.max-silence-size-bytes` to set limits on per tenant silences. Disabled by default. #6898
 * [FEATURE] Ingester: add experimental support for the server-side circuit breakers when writing to ingesters. This can be enabled using `-ingester.circuit-breaker.enabled` option. Further `-ingester.circuit-breaker.*` options for configuring circuit-breaker are available. Added metrics `cortex_ingester_circuit_breaker_results_total`,  `cortex_ingester_circuit_breaker_transitions_total` and `cortex_ingester_circuit_breaker_current_state`. #8180
-* [ENHANCEMENT] Distributor: add metrics `cortex_distributor_otlp_samples_per_batch` to track samples per batch in otlp request. #8265
+* [ENHANCEMENT] Distributor: add metrics `cortex_distributor_samples_per_request` and `cortex_distributor_exemplars_per_request` to track samples per request. #8265
 * [ENHANCEMENT] Reduced memory allocations in functions used to propagate contextual information between gRPC calls. #7529
 * [ENHANCEMENT] Distributor: add experimental limit for exemplars per series per request, enabled with `-distributor.max-exemplars-per-series-per-request`, the number of discarded exemplars are tracked with `cortex_discarded_exemplars_total{reason="too_many_exemplars_per_series_per_request"}` #7989 #8010
 * [ENHANCEMENT] Store-gateway: merge series from different blocks concurrently. #7456
diff --git a/pkg/distributor/distributor.go b/pkg/distributor/distributor.go
index c95c9e85a98..2f22c71dd29 100644
--- a/pkg/distributor/distributor.go
+++ b/pkg/distributor/distributor.go
@@ -136,6 +136,8 @@ type Distributor struct {
 	dedupedSamples                   *prometheus.CounterVec
 	labelsHistogram                  prometheus.Histogram
 	sampleDelayHistogram             prometheus.Histogram
+	incomingSamplesPerRequest        *prometheus.HistogramVec
+	incomingExemplarsPerRequest      *prometheus.HistogramVec
 	latestSeenSampleTimestampPerUser *prometheus.GaugeVec
 	hashCollisionCount               prometheus.Counter
 
@@ -264,9 +266,8 @@ const (
 )
 
 type PushMetrics struct {
-	otlpRequestCounter          *prometheus.CounterVec
-	uncompressedBodySize        *prometheus.HistogramVec
-	otlpIncomingSamplesPerBatch *prometheus.HistogramVec
+	otlpRequestCounter   *prometheus.CounterVec
+	uncompressedBodySize *prometheus.HistogramVec
 }
 
 func newPushMetrics(reg prometheus.Registerer) *PushMetrics {
@@ -282,13 +283,6 @@ func newPushMetrics(reg prometheus.Registerer) *PushMetrics {
 			NativeHistogramMinResetDuration: 1 * time.Hour,
 			NativeHistogramMaxBucketNumber:  100,
 		}, []string{"user"}),
-		otlpIncomingSamplesPerBatch: promauto.With(reg).NewHistogramVec(prometheus.HistogramOpts{
-			Name:                            "cortex_distributor_otlp_samples_per_batch",
-			Help:                            "Number of samples per batch in otlp request.",
-			NativeHistogramBucketFactor:     2,
-			NativeHistogramMinResetDuration: 1 * time.Hour,
-			NativeHistogramMaxBucketNumber:  100,
-		}, []string{"user"}),
 	}
 }
 
@@ -304,16 +298,9 @@ func (m *PushMetrics) ObserveUncompressedBodySize(user string, size float64) {
 	}
 }
 
-func (m *PushMetrics) ObserveOtlpIncomingSamplesPerBatch(user string, count float64) {
-	if m != nil {
-		m.otlpIncomingSamplesPerBatch.WithLabelValues(user).Observe(count)
-	}
-}
-
 func (m *PushMetrics) deleteUserMetrics(user string) {
 	m.otlpRequestCounter.DeleteLabelValues(user)
 	m.uncompressedBodySize.DeleteLabelValues(user)
-	m.otlpIncomingSamplesPerBatch.DeleteLabelValues(user)
 }
 
 // New constructs a new Distributor
@@ -425,6 +412,20 @@ func New(cfg Config, clientConfig ingester_client.Config, limits *validation.Ove
 				60 * 60 * 24, // 24h
 			},
 		}),
+		incomingSamplesPerRequest: promauto.With(reg).NewHistogramVec(prometheus.HistogramOpts{
+			Name:                            "cortex_distributor_samples_per_request",
+			Help:                            "Number of samples per request.",
+			NativeHistogramBucketFactor:     2,
+			NativeHistogramMinResetDuration: 1 * time.Hour,
+			NativeHistogramMaxBucketNumber:  100,
+		}, []string{"user"}),
+		incomingExemplarsPerRequest: promauto.With(reg).NewHistogramVec(prometheus.HistogramOpts{
+			Name:                            "cortex_distributor_exemplars_per_request",
+			Help:                            "Number of exemplars per request.",
+			NativeHistogramBucketFactor:     2,
+			NativeHistogramMinResetDuration: 1 * time.Hour,
+			NativeHistogramMaxBucketNumber:  100,
+		}, []string{"user"}),
 		latestSeenSampleTimestampPerUser: promauto.With(reg).NewGaugeVec(prometheus.GaugeOpts{
 			Name: "cortex_distributor_latest_seen_sample_timestamp_seconds",
 			Help: "Unix timestamp of latest received sample per user.",
@@ -664,6 +665,8 @@ func (d *Distributor) cleanupInactiveUser(userID string) {
 	d.incomingSamples.DeleteLabelValues(userID)
 	d.incomingExemplars.DeleteLabelValues(userID)
 	d.incomingMetadata.DeleteLabelValues(userID)
+	d.incomingSamplesPerRequest.DeleteLabelValues(userID)
+	d.incomingExemplarsPerRequest.DeleteLabelValues(userID)
 	d.nonHASamples.DeleteLabelValues(userID)
 	d.latestSeenSampleTimestampPerUser.DeleteLabelValues(userID)
 
@@ -1042,7 +1045,11 @@ func (d *Distributor) prePushValidationMiddleware(next PushFunc) PushFunc {
 
 		var firstPartialErr error
 		var removeIndexes []int
+		totalSamples, totalExemplars := 0, 0
+
 		for tsIdx, ts := range req.Timeseries {
+			totalSamples += len(ts.Samples)
+			totalExemplars += len(ts.Exemplars)
 			if len(ts.Labels) == 0 {
 				removeIndexes = append(removeIndexes, tsIdx)
 				continue
@@ -1068,6 +1075,10 @@ func (d *Distributor) prePushValidationMiddleware(next PushFunc) PushFunc {
 			validatedSamples += len(ts.Samples) + len(ts.Histograms)
 			validatedExemplars += len(ts.Exemplars)
 		}
+
+		d.incomingSamplesPerRequest.WithLabelValues(userID).Observe(float64(totalSamples))
+		d.incomingExemplarsPerRequest.WithLabelValues(userID).Observe(float64(totalExemplars))
+
 		if len(removeIndexes) > 0 {
 			for _, removeIndex := range removeIndexes {
 				mimirpb.ReusePreallocTimeseries(&req.Timeseries[removeIndex])
diff --git a/pkg/distributor/otel.go b/pkg/distributor/otel.go
index 80ee7621de2..d01ce91c2b9 100644
--- a/pkg/distributor/otel.go
+++ b/pkg/distributor/otel.go
@@ -187,8 +187,6 @@ func OTLPHandler(
 			"exemplar_count", exemplarCount,
 		)
 
-		pushMetrics.ObserveOtlpIncomingSamplesPerBatch(tenantID, float64(sampleCount))
-
 		req.Timeseries = metrics
 
 		if enableOtelMetadataStorage {