Skip to content

Commit

Permalink
[connector/servicegraph] Coalesce different attr sets into single ScopeMetrics metric entry (#34070)
Browse files Browse the repository at this point in the history

Tests in `connector/servicegraph` were failing because the servicegraph
`buildMetrics()` code was creating multiple metric entries for the same
metric name within a single `ScopeMetrics`. Although this may not be
forbidden by the OpenTelemetry specification, I think there is a general
assumption that a metric name does not appear more than once within the
same `ScopeMetrics`.
Instead, different values (e.g. with different sets of attribute values)
should be created as separate datapoints within the same metric.

The `pmetrictest.CompareScopeMetrics` test functionality is not designed
to handle multiple metric entries with the same `Name()`. Instead, it is
assumed that in cases where Order is ignored, the [first entry found in
the actual metrics which matches the name of the expected
metrics](https://github.com/open-telemetry/opentelemetry-collector-contrib/blob/ca9a8d14df471ed6a7dda7562ddfe659ec52127d/pkg/pdatatest/pmetrictest/metrics.go#L186-L195)
*must* be the metric to compare.

This fix changes the `buildMetrics()` code to create a single metric
entry per metric name within a scope, and to emit one datapoint under
that metric for each entry with a unique datapoint attribute set (i.e.
all entries in the internal maps
`serviceGraphConnector.req{Total,FailedTotal,ServerDurationSecondsCount}`
are coalesced into a single named metric as appropriate).

_Note_: I don't have any past experience working with
servicegraphconnector, but I observed that
`collectClientLatencyMetrics()` and `collectServerLatencyMetrics()` both
range over the same map, `p.reqServerDurationSecondsCount`, even though
the values collected in `collectClientLatencyMetrics()` come from
`p.reqServerDurationSeconds{Count,Sum,BucketCounts}` and the values
collected in `collectServerLatencyMetrics()` come from
`p.reqClientDurationSeconds{Count,Sum,BucketCounts}`.

This seems a little asymmetrical, but I don't have enough experience to
say whether it is an error or not.

**Description:**  Fixes #33998 

**Testing:** All other unit tests now complete, and the previously
failing unit test now works reliably.

**Documentation:** No documentation added. This is a unit test fix.
  • Loading branch information
evantorrie committed Jul 17, 2024
1 parent 7e3fc98 commit f2cfc2d
Show file tree
Hide file tree
Showing 3 changed files with 73 additions and 80 deletions.
107 changes: 57 additions & 50 deletions connector/servicegraphconnector/connector.go
Original file line number Diff line number Diff line change
Expand Up @@ -472,44 +472,48 @@ func (p *serviceGraphConnector) buildMetrics() (pmetric.Metrics, error) {
}

func (p *serviceGraphConnector) collectCountMetrics(ilm pmetric.ScopeMetrics) error {
for key, c := range p.reqTotal {
if len(p.reqTotal) > 0 {
mCount := ilm.Metrics().AppendEmpty()
mCount.SetName("traces_service_graph_request_total")
mCount.SetEmptySum().SetIsMonotonic(true)
// TODO: Support other aggregation temporalities
mCount.Sum().SetAggregationTemporality(pmetric.AggregationTemporalityCumulative)

dpCalls := mCount.Sum().DataPoints().AppendEmpty()
dpCalls.SetStartTimestamp(pcommon.NewTimestampFromTime(p.startTime))
dpCalls.SetTimestamp(pcommon.NewTimestampFromTime(time.Now()))
dpCalls.SetIntValue(c)
for key, c := range p.reqTotal {
dpCalls := mCount.Sum().DataPoints().AppendEmpty()
dpCalls.SetStartTimestamp(pcommon.NewTimestampFromTime(p.startTime))
dpCalls.SetTimestamp(pcommon.NewTimestampFromTime(time.Now()))
dpCalls.SetIntValue(c)

dimensions, ok := p.dimensionsForSeries(key)
if !ok {
return fmt.Errorf("failed to find dimensions for key %s", key)
}
dimensions, ok := p.dimensionsForSeries(key)
if !ok {
return fmt.Errorf("failed to find dimensions for key %s", key)
}

dimensions.CopyTo(dpCalls.Attributes())
dimensions.CopyTo(dpCalls.Attributes())
}
}

for key, c := range p.reqFailedTotal {
if len(p.reqFailedTotal) > 0 {
mCount := ilm.Metrics().AppendEmpty()
mCount.SetName("traces_service_graph_request_failed_total")
mCount.SetEmptySum().SetIsMonotonic(true)
// TODO: Support other aggregation temporalities
mCount.Sum().SetAggregationTemporality(pmetric.AggregationTemporalityCumulative)

dpCalls := mCount.Sum().DataPoints().AppendEmpty()
dpCalls.SetStartTimestamp(pcommon.NewTimestampFromTime(p.startTime))
dpCalls.SetTimestamp(pcommon.NewTimestampFromTime(time.Now()))
dpCalls.SetIntValue(c)
for key, c := range p.reqFailedTotal {
dpCalls := mCount.Sum().DataPoints().AppendEmpty()
dpCalls.SetStartTimestamp(pcommon.NewTimestampFromTime(p.startTime))
dpCalls.SetTimestamp(pcommon.NewTimestampFromTime(time.Now()))
dpCalls.SetIntValue(c)

dimensions, ok := p.dimensionsForSeries(key)
if !ok {
return fmt.Errorf("failed to find dimensions for key %s", key)
}
dimensions, ok := p.dimensionsForSeries(key)
if !ok {
return fmt.Errorf("failed to find dimensions for key %s", key)
}

dimensions.CopyTo(dpCalls.Attributes())
dimensions.CopyTo(dpCalls.Attributes())
}
}

return nil
Expand All @@ -529,57 +533,60 @@ func (p *serviceGraphConnector) collectLatencyMetrics(ilm pmetric.ScopeMetrics)
}

func (p *serviceGraphConnector) collectClientLatencyMetrics(ilm pmetric.ScopeMetrics) error {
for key := range p.reqServerDurationSecondsCount {
if len(p.reqServerDurationSecondsCount) > 0 {
mDuration := ilm.Metrics().AppendEmpty()
mDuration.SetName("traces_service_graph_request_client_seconds")
// TODO: Support other aggregation temporalities
mDuration.SetEmptyHistogram().SetAggregationTemporality(pmetric.AggregationTemporalityCumulative)

timestamp := pcommon.NewTimestampFromTime(time.Now())

dpDuration := mDuration.Histogram().DataPoints().AppendEmpty()
dpDuration.SetStartTimestamp(pcommon.NewTimestampFromTime(p.startTime))
dpDuration.SetTimestamp(timestamp)
dpDuration.ExplicitBounds().FromRaw(p.reqDurationBounds)
dpDuration.BucketCounts().FromRaw(p.reqServerDurationSecondsBucketCounts[key])
dpDuration.SetCount(p.reqServerDurationSecondsCount[key])
dpDuration.SetSum(p.reqServerDurationSecondsSum[key])
for key := range p.reqServerDurationSecondsCount {
dpDuration := mDuration.Histogram().DataPoints().AppendEmpty()
dpDuration.SetStartTimestamp(pcommon.NewTimestampFromTime(p.startTime))
dpDuration.SetTimestamp(timestamp)
dpDuration.ExplicitBounds().FromRaw(p.reqDurationBounds)
dpDuration.BucketCounts().FromRaw(p.reqServerDurationSecondsBucketCounts[key])
dpDuration.SetCount(p.reqServerDurationSecondsCount[key])
dpDuration.SetSum(p.reqServerDurationSecondsSum[key])

// TODO: Support exemplars
dimensions, ok := p.dimensionsForSeries(key)
if !ok {
return fmt.Errorf("failed to find dimensions for key %s", key)
}

// TODO: Support exemplars
dimensions, ok := p.dimensionsForSeries(key)
if !ok {
return fmt.Errorf("failed to find dimensions for key %s", key)
dimensions.CopyTo(dpDuration.Attributes())
}

dimensions.CopyTo(dpDuration.Attributes())
}
return nil
}

func (p *serviceGraphConnector) collectServerLatencyMetrics(ilm pmetric.ScopeMetrics, mName string) error {
for key := range p.reqServerDurationSecondsCount {
if len(p.reqServerDurationSecondsCount) > 0 {
mDuration := ilm.Metrics().AppendEmpty()
mDuration.SetName(mName)
// TODO: Support other aggregation temporalities
mDuration.SetEmptyHistogram().SetAggregationTemporality(pmetric.AggregationTemporalityCumulative)

timestamp := pcommon.NewTimestampFromTime(time.Now())

dpDuration := mDuration.Histogram().DataPoints().AppendEmpty()
dpDuration.SetStartTimestamp(pcommon.NewTimestampFromTime(p.startTime))
dpDuration.SetTimestamp(timestamp)
dpDuration.ExplicitBounds().FromRaw(p.reqDurationBounds)
dpDuration.BucketCounts().FromRaw(p.reqClientDurationSecondsBucketCounts[key])
dpDuration.SetCount(p.reqClientDurationSecondsCount[key])
dpDuration.SetSum(p.reqClientDurationSecondsSum[key])
for key := range p.reqServerDurationSecondsCount {

// TODO: Support exemplars
dimensions, ok := p.dimensionsForSeries(key)
if !ok {
return fmt.Errorf("failed to find dimensions for key %s", key)
}
dpDuration := mDuration.Histogram().DataPoints().AppendEmpty()
dpDuration.SetStartTimestamp(pcommon.NewTimestampFromTime(p.startTime))
dpDuration.SetTimestamp(timestamp)
dpDuration.ExplicitBounds().FromRaw(p.reqDurationBounds)
dpDuration.BucketCounts().FromRaw(p.reqClientDurationSecondsBucketCounts[key])
dpDuration.SetCount(p.reqClientDurationSecondsCount[key])
dpDuration.SetSum(p.reqClientDurationSecondsSum[key])

// TODO: Support exemplars
dimensions, ok := p.dimensionsForSeries(key)
if !ok {
return fmt.Errorf("failed to find dimensions for key %s", key)
}

dimensions.CopyTo(dpDuration.Attributes())
dimensions.CopyTo(dpDuration.Attributes())
}
}
return nil
}
Expand Down
1 change: 0 additions & 1 deletion connector/servicegraphconnector/connector_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -94,7 +94,6 @@ func TestConnectorConsume(t *testing.T) {
assert.NoError(t, conn.Shutdown(context.Background()))
})
t.Run("test fix failed label not work", func(t *testing.T) {
t.Skip("https://github.com/open-telemetry/opentelemetry-collector-contrib/issues/33998 skip flaky test")
cfg := &Config{
Store: StoreConfig{MaxItems: 10},
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -22,11 +22,6 @@ resourceMetrics:
stringValue: bar
startTimeUnixNano: "1000000"
timeUnixNano: "2000000"
isMonotonic: true
- name: traces_service_graph_request_total
sum:
aggregationTemporality: 2
dataPoints:
- asInt: "1"
attributes:
- key: client
Expand Down Expand Up @@ -77,7 +72,7 @@ resourceMetrics:
stringValue: ""
- key: failed
value:
boolValue: false
boolValue: true
- key: server
value:
stringValue: bar
Expand All @@ -92,14 +87,14 @@ resourceMetrics:
- "0"
- "0"
- "0"
- "2"
- "1"
- "0"
- "0"
- "0"
- "0"
- "0"
- "0"
count: "2"
count: "1"
explicitBounds:
- 0.002
- 0.004
Expand All @@ -118,12 +113,8 @@ resourceMetrics:
- 10
- 15
startTimeUnixNano: "1000000"
sum: 2
sum: 1
timeUnixNano: "2000000"
name: traces_service_graph_request_server_seconds
- histogram:
aggregationTemporality: 2
dataPoints:
- attributes:
- key: client
value:
Expand All @@ -133,7 +124,7 @@ resourceMetrics:
stringValue: ""
- key: failed
value:
boolValue: true
boolValue: false
- key: server
value:
stringValue: bar
Expand All @@ -148,14 +139,14 @@ resourceMetrics:
- "0"
- "0"
- "0"
- "1"
- "2"
- "0"
- "0"
- "0"
- "0"
- "0"
- "0"
count: "1"
count: "2"
explicitBounds:
- 0.002
- 0.004
Expand All @@ -174,7 +165,7 @@ resourceMetrics:
- 10
- 15
startTimeUnixNano: "1000000"
sum: 1
sum: 2
timeUnixNano: "2000000"
name: traces_service_graph_request_server_seconds
- histogram:
Expand All @@ -189,7 +180,7 @@ resourceMetrics:
stringValue: ""
- key: failed
value:
boolValue: false
boolValue: true
- key: server
value:
stringValue: bar
Expand All @@ -204,14 +195,14 @@ resourceMetrics:
- "0"
- "0"
- "0"
- "2"
- "1"
- "0"
- "0"
- "0"
- "0"
- "0"
- "0"
count: "2"
count: "1"
explicitBounds:
- 0.002
- 0.004
Expand All @@ -230,12 +221,8 @@ resourceMetrics:
- 10
- 15
startTimeUnixNano: "1000000"
sum: 2
sum: 1
timeUnixNano: "2000000"
name: traces_service_graph_request_client_seconds
- histogram:
aggregationTemporality: 2
dataPoints:
- attributes:
- key: client
value:
Expand All @@ -245,7 +232,7 @@ resourceMetrics:
stringValue: ""
- key: failed
value:
boolValue: true
boolValue: false
- key: server
value:
stringValue: bar
Expand All @@ -260,14 +247,14 @@ resourceMetrics:
- "0"
- "0"
- "0"
- "1"
- "2"
- "0"
- "0"
- "0"
- "0"
- "0"
- "0"
count: "1"
count: "2"
explicitBounds:
- 0.002
- 0.004
Expand All @@ -286,7 +273,7 @@ resourceMetrics:
- 10
- 15
startTimeUnixNano: "1000000"
sum: 1
sum: 2
timeUnixNano: "2000000"
name: traces_service_graph_request_client_seconds
scope:
Expand Down

0 comments on commit f2cfc2d

Please sign in to comment.