Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Apply error in loop should not kill the sidecar #2996

Merged
merged 4 commits into from
Aug 7, 2020
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@ We use *breaking* word for marking changes that are not backward compatible (rel
- [#2892](https://github.com/thanos-io/thanos/pull/2892) Receive: Receiver fails when the initial upload fails.
- [#2865](https://github.com/thanos-io/thanos/pull/2865) ui: Migrate Thanos Ruler UI to React
- [#2964](https://github.com/thanos-io/thanos/pull/2964) Query: Add time range parameters to label APIs. Add `start` and `end` fields to Store API `LabelNamesRequest` and `LabelValuesRequest`.
- [#2996](https://github.com/thanos-io/thanos/pull/2996) Sidecar: Add `reloader_config_errors_total` metric. Add new flags `--reloader.watch-interval`, and `--reloader.retry-interval`.
mneverov marked this conversation as resolved.
Show resolved Hide resolved

### Changed

Expand Down
9 changes: 9 additions & 0 deletions cmd/thanos/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -98,6 +98,8 @@ type reloaderConfig struct {
confFile string
envVarConfFile string
ruleDirectories []string
watchInterval time.Duration
retryInterval time.Duration
}

func (rc *reloaderConfig) registerFlag(cmd *kingpin.CmdClause) *reloaderConfig {
Expand All @@ -110,6 +112,13 @@ func (rc *reloaderConfig) registerFlag(cmd *kingpin.CmdClause) *reloaderConfig {
cmd.Flag("reloader.rule-dir",
"Rule directories for the reloader to refresh (repeated field).").
StringsVar(&rc.ruleDirectories)
cmd.Flag("reloader.watch-interval",
"Controls how often reloader re-reads config and rules.").
Default("3m").DurationVar(&rc.watchInterval)
cmd.Flag("reloader.retry-interval",
"Controls how often reloader retries config reload in case of error.").
Default("5s").DurationVar(&rc.retryInterval)

return rc
}

Expand Down
16 changes: 9 additions & 7 deletions cmd/thanos/sidecar.go
Original file line number Diff line number Diff line change
Expand Up @@ -49,14 +49,16 @@ func registerSidecar(m map[string]setupFunc, app *kingpin.Application) {
conf.registerFlag(cmd)

m[component.Sidecar.String()] = func(g *run.Group, logger log.Logger, reg *prometheus.Registry, tracer opentracing.Tracer, _ <-chan struct{}, _ bool) error {
rl := reloader.New(
log.With(logger, "component", "reloader"),
rl := reloader.New(log.With(logger, "component", "reloader"),
extprom.WrapRegistererWithPrefix("thanos_sidecar_", reg),
reloader.ReloadURLFromBase(conf.prometheus.url),
conf.reloader.confFile,
conf.reloader.envVarConfFile,
conf.reloader.ruleDirectories,
)
&reloader.Options{
ReloadURL: reloader.ReloadURLFromBase(conf.prometheus.url),
CfgFile: conf.reloader.confFile,
CfgOutputFile: conf.reloader.envVarConfFile,
RuleDirs: conf.reloader.ruleDirectories,
WatchInterval: conf.reloader.watchInterval,
RetryInterval: conf.reloader.retryInterval,
})

return runSidecar(
g,
Expand Down
6 changes: 6 additions & 0 deletions docs/components/sidecar.md
Original file line number Diff line number Diff line change
Expand Up @@ -140,6 +140,12 @@ Flags:
--reloader.rule-dir=RELOADER.RULE-DIR ...
Rule directories for the reloader to refresh
(repeated field).
--reloader.watch-interval=3m
Controls how often reloader re-reads config and
rules.
--reloader.retry-interval=5s
Controls how often reloader retries config
reload in case of error.
--objstore.config-file=<file-path>
Path to YAML file that contains object store
configuration. See format details:
Expand Down
17 changes: 9 additions & 8 deletions pkg/reloader/example_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ import (
"log"
"net/http"
"net/url"
"time"

"github.com/thanos-io/thanos/pkg/reloader"
)
Expand All @@ -19,14 +20,14 @@ func ExampleReloader() {
if err != nil {
log.Fatal(err)
}
rl := reloader.New(
nil,
nil,
reloader.ReloadURLFromBase(u),
"/path/to/cfg",
"/path/to/cfg.out",
[]string{"/path/to/dirs"},
)
rl := reloader.New(nil, nil, &reloader.Options{
ReloadURL: reloader.ReloadURLFromBase(u),
CfgFile: "/path/to/cfg",
CfgOutputFile: "/path/to/cfg.out",
RuleDirs: []string{"/path/to/dirs"},
WatchInterval: 3 * time.Minute,
RetryInterval: 5 * time.Second,
})

ctx, cancel := context.WithCancel(context.Background())
go func() {
Expand Down
67 changes: 45 additions & 22 deletions pkg/reloader/reloader.go
Original file line number Diff line number Diff line change
Expand Up @@ -16,13 +16,14 @@
// This and below for reloader:
//
// u, _ := url.Parse("http://localhost:9090")
// rl := reloader.New(
// nil,
// reloader.ReloadURLFromBase(u),
// "/path/to/cfg",
// "/path/to/cfg.out",
// []string{"/path/to/dirs"},
// )
// rl := reloader.New(nil, nil, &reloader.Options{
// ReloadURL: reloader.ReloadURLFromBase(u),
// CfgFile: "/path/to/cfg",
// CfgOutputFile: "/path/to/cfg.out",
// RuleDirs: []string{"/path/to/dirs"},
// WatchInterval: 3 * time.Minute,
// RetryInterval: 5 * time.Second,
// })
//
// The url of reloads can be generated with function ReloadURLFromBase().
// It will append the default path of reload into the given url:
Expand All @@ -41,7 +42,7 @@
// // ...
// cancel()
//
// By default, reloader will make a schedule to check the given config files and dirs of sum of hash with the last result,
// Reloader will make a schedule to check the given config files and dirs of sum of hash with the last result,
// even if it is no changes.
//
// A basic example of configuration template with environment variables:
Expand Down Expand Up @@ -97,27 +98,44 @@ type Reloader struct {
watches prometheus.Gauge
watchEvents prometheus.Counter
watchErrors prometheus.Counter
configErrors prometheus.Counter
}

// Options bundles options for the Reloader.
type Options struct {
// ReloadURL is a prometheus URL to trigger reloads.
ReloadURL *url.URL
// CfgFile is a path to the prometheus config file to watch.
CfgFile string
// CfgOutputFile is a path for the output config file.
// If cfgOutputFile is not empty the config file will be decompressed if needed, environment variables
// will be substituted and the output written into the given path. Prometheus should then use
// cfgOutputFile as its config file path.
CfgOutputFile string
// RuleDirs is a collection of paths for this reloader to watch over.
RuleDirs []string
// WatchInterval controls how often reloader re-reads config and rules.
WatchInterval time.Duration
// RetryInterval controls how often reloader retries config reload in case of error.
RetryInterval time.Duration
}

var firstGzipBytes = []byte{0x1f, 0x8b, 0x08}

// New creates a new reloader that watches the given config file and rule directory
// and triggers a Prometheus reload upon changes.
// If cfgOutputFile is not empty the config file will be decompressed if needed, environment variables
// will be substituted and the output written into the given path. Prometheus should then use
// cfgOutputFile as its config file path.
func New(logger log.Logger, reg prometheus.Registerer, reloadURL *url.URL, cfgFile string, cfgOutputFile string, ruleDirs []string) *Reloader {
func New(logger log.Logger, reg prometheus.Registerer, o *Options) *Reloader {
if logger == nil {
logger = log.NewNopLogger()
}
r := &Reloader{
logger: logger,
reloadURL: reloadURL,
cfgFile: cfgFile,
cfgOutputFile: cfgOutputFile,
ruleDirs: ruleDirs,
watchInterval: 3 * time.Minute,
retryInterval: 5 * time.Second,
reloadURL: o.ReloadURL,
cfgFile: o.CfgFile,
cfgOutputFile: o.CfgOutputFile,
ruleDirs: o.RuleDirs,
watchInterval: o.WatchInterval,
retryInterval: o.RetryInterval,

reloads: promauto.With(reg).NewCounter(
prometheus.CounterOpts{
Expand All @@ -131,6 +149,12 @@ func New(logger log.Logger, reg prometheus.Registerer, reloadURL *url.URL, cfgFi
Help: "Total number of reload requests that failed.",
},
),
configErrors: promauto.With(reg).NewCounter(
prometheus.CounterOpts{
Name: "reloader_config_errors_total",
mneverov marked this conversation as resolved.
Show resolved Hide resolved
Help: "Total number of config reads that failed.",
mneverov marked this conversation as resolved.
Show resolved Hide resolved
},
),
watches: promauto.With(reg).NewGauge(
prometheus.GaugeOpts{
Name: "reloader_watches",
Expand Down Expand Up @@ -196,7 +220,7 @@ func (r *Reloader) Watch(ctx context.Context) error {

r.watches.Set(float64(len(watchables)))
level.Info(r.logger).Log(
"msg", "started watching config file and non-recursively rule dirs for changes",
"msg", "started watching config file and recursively rule dirs for changes",
"cfg", r.cfgFile,
"out", r.cfgOutputFile,
"dirs", strings.Join(r.ruleDirs, ","))
Expand All @@ -218,9 +242,8 @@ func (r *Reloader) Watch(ctx context.Context) error {
}

if err := r.apply(ctx); err != nil {
// Critical error.
// TODO(bwplotka): There is no need to get process down in this case and decrease availability, handle the error in different way.
return err
r.configErrors.Inc()
level.Error(r.logger).Log("msg", "apply error", "err", err)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can we have metric for that as well? (:

}
}
}
Expand Down
43 changes: 28 additions & 15 deletions pkg/reloader/reloader_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -64,9 +64,14 @@ func TestReloader_ConfigApply(t *testing.T) {
input = filepath.Join(dir, "in", "cfg.yaml.tmpl")
output = filepath.Join(dir, "out", "cfg.yaml")
)
reloader := New(nil, nil, reloadURL, input, output, nil)
reloader.watchInterval = 9999 * time.Hour // Disable interval to test watch logic only.
reloader.retryInterval = 100 * time.Millisecond
reloader := New(nil, nil, &Options{
ReloadURL: reloadURL,
CfgFile: input,
CfgOutputFile: output,
RuleDirs: nil,
WatchInterval: 9999 * time.Hour, // Disable interval to test watch logic only.
RetryInterval: 100 * time.Millisecond,
})

// Fail without config.
err = reloader.Watch(ctx)
Expand Down Expand Up @@ -97,6 +102,7 @@ config:
}()

reloadsSeen := 0
attemptsCnt := 0
for {
select {
case <-ctx.Done():
Expand All @@ -105,12 +111,9 @@ config:
}

rel := reloads.Load().(int)
if rel <= reloadsSeen {
// Nothing new.
continue
}
reloadsSeen = rel

if reloadsSeen == 0 {
if reloadsSeen == 1 {
// Initial apply seen (without doing nothing).
f, err := ioutil.ReadFile(output)
testutil.Ok(t, err)
Expand All @@ -128,7 +131,7 @@ config:
b: $(TEST_RELOADER_THANOS_ENV)
c: $(TEST_RELOADER_THANOS_ENV2)
`), os.ModePerm))
} else {
} else if reloadsSeen == 2 {
// Another apply, ensure we see change.
f, err := ioutil.ReadFile(output)
testutil.Ok(t, err)
Expand All @@ -138,10 +141,15 @@ config:
b: 2
c: 3
`, string(f))
// All good, break
break

// Change the mode so reloader can't read the file.
testutil.Ok(t, os.Chmod(input, os.ModeDir))
attemptsCnt += 1
// That was the second attempt to reload config. All good, break.
if attemptsCnt == 2 {
break
}
}
reloadsSeen = rel
}
cancel2()
g.Wait()
Expand Down Expand Up @@ -191,9 +199,14 @@ func TestReloader_RuleApply(t *testing.T) {
testutil.Ok(t, os.Mkdir(path.Join(dir2, "rule-dir"), os.ModePerm))
testutil.Ok(t, os.Symlink(path.Join(dir2, "rule-dir"), path.Join(dir, "rule-dir")))

reloader := New(nil, nil, reloadURL, "", "", []string{dir, path.Join(dir, "rule-dir")})
reloader.watchInterval = 100 * time.Millisecond
reloader.retryInterval = 100 * time.Millisecond
reloader := New(nil, nil, &Options{
ReloadURL: reloadURL,
CfgFile: "",
CfgOutputFile: "",
RuleDirs: []string{dir, path.Join(dir, "rule-dir")},
WatchInterval: 100 * time.Millisecond,
RetryInterval: 100 * time.Millisecond,
})

// Some initial state.
testutil.Ok(t, ioutil.WriteFile(path.Join(dir, "rule1.yaml"), []byte("rule"), os.ModePerm))
Expand Down