Skip to content

Commit

Permalink
Merge pull request #123 from armon/dont-expire-defined-prom-metrics
Browse files Browse the repository at this point in the history
Don't expire Prometheus metrics that have been explicitly defined
  • Loading branch information
banks committed Apr 22, 2021
2 parents 8ca742b + 60a4d2a commit 111a475
Show file tree
Hide file tree
Showing 2 changed files with 87 additions and 18 deletions.
34 changes: 16 additions & 18 deletions prometheus/prometheus.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@ package prometheus
import (
"fmt"
"log"
"math"
"regexp"
"strings"
"sync"
Expand All @@ -31,17 +30,16 @@ type PrometheusOpts struct {
Expiration time.Duration
Registerer prometheus.Registerer

// Gauges, Summaries, and Counters allow us to pre-declare metrics by giving their Name, Help, and ConstLabels to
// the PrometheusSink when it is created. Metrics declared in this way will be initialized at zero and will not be
// deleted when their expiry is reached.
// - Gauges and Summaries will be set to NaN when they expire.
// - Counters continue to Collect their last known value.
// Ex:
// PrometheusOpts{
// Gauges, Summaries, and Counters allow us to pre-declare metrics by giving
// their Name, Help, and ConstLabels to the PrometheusSink when it is created.
// Metrics declared in this way will be initialized at zero and will not be
// deleted or altered when their expiry is reached.
//
// Ex: PrometheusOpts{
// Expiration: 10 * time.Second,
// Gauges: []GaugeDefinition{
// {
// Name: []string{ "application", "component", "measurement"},
// Name: []string{ "application", "component", "measurement"},
// Help: "application_component_measurement provides an example of how to declare static metrics",
// ConstLabels: []metrics.Label{ { Name: "my_label", Value: "does_not_change" }, },
// },
Expand Down Expand Up @@ -139,21 +137,24 @@ func (p *PrometheusSink) Describe(c chan<- *prometheus.Desc) {
// logic to clean up ephemeral metrics if their value haven't been set for a
// duration exceeding our allowed expiration time.
func (p *PrometheusSink) Collect(c chan<- prometheus.Metric) {
p.collectAtTime(c, time.Now())
}

// collectAtTime allows internal testing of the expiry based logic here without
// mocking clocks or making tests timing sensitive.
func (p *PrometheusSink) collectAtTime(c chan<- prometheus.Metric, t time.Time) {
expire := p.expiration != 0
now := time.Now()
p.gauges.Range(func(k, v interface{}) bool {
if v == nil {
return true
}
g := v.(*gauge)
lastUpdate := g.updatedAt
if expire && lastUpdate.Add(p.expiration).Before(now) {
if expire && lastUpdate.Add(p.expiration).Before(t) {
if g.canDelete {
p.gauges.Delete(k)
return true
}
// We have not observed the gauge this interval so we don't know its value.
g.Set(math.NaN())
}
g.Collect(c)
return true
Expand All @@ -164,13 +165,11 @@ func (p *PrometheusSink) Collect(c chan<- prometheus.Metric) {
}
s := v.(*summary)
lastUpdate := s.updatedAt
if expire && lastUpdate.Add(p.expiration).Before(now) {
if expire && lastUpdate.Add(p.expiration).Before(t) {
if s.canDelete {
p.summaries.Delete(k)
return true
}
// We have observed nothing in this interval.
s.Observe(math.NaN())
}
s.Collect(c)
return true
Expand All @@ -181,12 +180,11 @@ func (p *PrometheusSink) Collect(c chan<- prometheus.Metric) {
}
count := v.(*counter)
lastUpdate := count.updatedAt
if expire && lastUpdate.Add(p.expiration).Before(now) {
if expire && lastUpdate.Add(p.expiration).Before(t) {
if count.canDelete {
p.counters.Delete(k)
return true
}
// Counters remain at their previous value when not observed, so we do not set it to NaN.
}
count.Collect(c)
return true
Expand Down
71 changes: 71 additions & 0 deletions prometheus/prometheus_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -107,6 +107,77 @@ func TestDefinitions(t *testing.T) {
}
return true
})

// Set a bunch of values
sink.SetGauge(gaugeDef.Name, 42)
sink.AddSample(summaryDef.Name, 42)
sink.IncrCounter(counterDef.Name, 1)

// Test that the expiry behavior works as expected. First pick a time which
// is after all the actual updates above.
timeAfterUpdates := time.Now()
// Buffer the chan to make sure it doesn't block. We expect only 3 metrics to
// be produced but give some extra room as this will hang the test if we don't
// have a big enough buffer.
ch := make(chan prometheus.Metric, 10)

// Collect the metrics as if it's some time in the future, way beyond the 5
// second expiry.
sink.collectAtTime(ch, timeAfterUpdates.Add(10*time.Second))

// We should see all the metrics desired Expiry behavior
expectedNum := 3
for i := 0; i < expectedNum; i++ {
select {
case m := <-ch:
// m is a prometheus.Metric the only thing we can do is Write it to a
// protobuf type and read from there.
var pb dto.Metric
if err := m.Write(&pb); err != nil {
t.Fatalf("unexpected error reading metric: %s", err)
}
desc := m.Desc().String()
switch {
case pb.Counter != nil:
if !strings.Contains(desc, counterDef.Help) {
t.Fatalf("expected counter to include correct help=%s, but was %s", counterDef.Help, m.Desc().String())
}
// Counters should _not_ reset. We could assert not nil too but that
// would be a bug in prometheus client code so assume it's never nil...
if *pb.Counter.Value != float64(1) {
t.Fatalf("expected defined counter to have value 42 after expiring, got %f", *pb.Counter.Value)
}
case pb.Gauge != nil:
if !strings.Contains(desc, gaugeDef.Help) {
t.Fatalf("expected gauge to include correct help=%s, but was %s", gaugeDef.Help, m.Desc().String())
}
// Gauges should _not_ reset. We could assert not nil too but that
// would be a bug in prometheus client code so assume it's never nil...
if *pb.Gauge.Value != float64(42) {
t.Fatalf("expected defined gauge to have value 42 after expiring, got %f", *pb.Gauge.Value)
}
case pb.Summary != nil:
if !strings.Contains(desc, summaryDef.Help) {
t.Fatalf("expected summary to include correct help=%s, but was %s", summaryDef.Help, m.Desc().String())
}
// Summaries should not be reset. Previous behavior here did attempt to
// reset them by calling Observe(NaN) which results in all values being
// set to NaN but doesn't actually clear the time window of data
// predictably so future observations could also end up as NaN until the
// NaN sample has aged out of the window. Since the summary is already
// aging out a fixed time window (we fix it a 10 seconds currently for
// all summaries and it's not affected by Expiration option), there's no
// point in trying to reset it after "expiry".
if *pb.Summary.SampleSum != float64(42) {
t.Fatalf("expected defined summary sum to have value 42 after expiring, got %f", *pb.Summary.SampleSum)
}
default:
t.Fatalf("unexpected metric type %v", pb)
}
case <-time.After(100 * time.Millisecond):
t.Fatalf("Timed out waiting to collect expected metric. Got %d, want %d", i, expectedNum)
}
}
}

func MockGetHostname() string {
Expand Down

0 comments on commit 111a475

Please sign in to comment.