Skip to content

Commit

Permalink
Added thanos bucket replicate (#2113)
Browse files Browse the repository at this point in the history
* bucket: Implement replicate

Signed-off-by: Xiang Dai <764524258@qq.com>

* Document replicate

Signed-off-by: Xiang Dai <764524258@qq.com>

* feedback

Signed-off-by: Xiang Dai <764524258@qq.com>

* feedback

Signed-off-by: Xiang Dai <764524258@qq.com>

* remove version check

Signed-off-by: Xiang Dai <764524258@qq.com>

* update CHANGELOG

Signed-off-by: Xiang Dai <764524258@qq.com>

* add chan for SIGHUP

Signed-off-by: Xiang Dai <764524258@qq.com>

* add existence check

Signed-off-by: Xiang Dai <764524258@qq.com>

* Add mixin

Signed-off-by: Xiang Dai <764524258@qq.com>

* rename as replicate

Signed-off-by: Xiang Dai <764524258@qq.com>

* feedback

Signed-off-by: Xiang Dai <764524258@qq.com>

* update mixin

Signed-off-by: Xiang Dai <764524258@qq.com>

* update CHANGELOG

Signed-off-by: Xiang Dai <764524258@qq.com>

* add bucket prefix

Signed-off-by: Xiang Dai <764524258@qq.com>
  • Loading branch information
daixiang0 authored Feb 26, 2020
1 parent a46b9c3 commit 0f6df8b
Show file tree
Hide file tree
Showing 21 changed files with 1,800 additions and 3 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ We use *breaking* word for marking changes that are not backward compatible (rel
- [#2049](https://github.com/thanos-io/thanos/pull/2049) Tracing: Support sampling on Elastic APM with new sample_rate setting.
- [#2008](https://github.com/thanos-io/thanos/pull/2008) Querier, Receiver, Sidecar, Store: Add gRPC [health check](https://github.com/grpc/grpc/blob/master/doc/health-checking.md) endpoints.
- [#2145](https://github.com/thanos-io/thanos/pull/2145) Tracing: track query sent to prometheus via remote read api.
- [#2113](https://github.com/thanos-io/thanos/pull/2113) Bucket: Added `thanos bucket replicate`.

### Changed

Expand Down
47 changes: 47 additions & 0 deletions cmd/thanos/bucket.go
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ import (
"fmt"
"os"
"sort"
"strconv"
"strings"
"text/template"
"time"
Expand All @@ -26,13 +27,15 @@ import (
"github.com/thanos-io/thanos/pkg/block"
"github.com/thanos-io/thanos/pkg/block/metadata"
"github.com/thanos-io/thanos/pkg/compact"
"github.com/thanos-io/thanos/pkg/compact/downsample"
"github.com/thanos-io/thanos/pkg/component"
"github.com/thanos-io/thanos/pkg/extflag"
"github.com/thanos-io/thanos/pkg/extprom"
extpromhttp "github.com/thanos-io/thanos/pkg/extprom/http"
"github.com/thanos-io/thanos/pkg/objstore"
"github.com/thanos-io/thanos/pkg/objstore/client"
"github.com/thanos-io/thanos/pkg/prober"
"github.com/thanos-io/thanos/pkg/replicate"
"github.com/thanos-io/thanos/pkg/runutil"
httpserver "github.com/thanos-io/thanos/pkg/server/http"
"github.com/thanos-io/thanos/pkg/ui"
Expand Down Expand Up @@ -69,6 +72,7 @@ func registerBucket(m map[string]setupFunc, app *kingpin.Application, name strin
registerBucketLs(m, cmd, name, objStoreConfig)
registerBucketInspect(m, cmd, name, objStoreConfig)
registerBucketWeb(m, cmd, name, objStoreConfig)
registerBucketReplicate(m, cmd, name, objStoreConfig)
}

func registerBucketVerify(m map[string]setupFunc, root *kingpin.CmdClause, name string, objStoreConfig *extflag.PathOrContent) {
Expand Down Expand Up @@ -377,6 +381,49 @@ func registerBucketWeb(m map[string]setupFunc, root *kingpin.CmdClause, name str
}
}

// listResLevel returns the known downsampling resolution levels formatted as
// base-10 strings. It serves as the hint action for the --resolution flag,
// which is parsed as an int64 and therefore cannot be declared as an Enum flag.
func listResLevel() []string {
	levels := []int64{
		downsample.ResLevel0,
		downsample.ResLevel1,
		downsample.ResLevel2,
	}

	hints := make([]string, 0, len(levels))
	for _, lvl := range levels {
		hints = append(hints, strconv.FormatInt(lvl, 10))
	}
	return hints
}

// registerBucketReplicate declares the "replicate" subcommand under root,
// registers its flags, and stores the command's setup function in m under the
// key "<name> replicate".
//
// The setup function parses the --matcher values and hands all flag values,
// together with objStoreConfig and the "-to" object store config, over to
// replicate.RunReplicate.
func registerBucketReplicate(m map[string]setupFunc, root *kingpin.CmdClause, name string, objStoreConfig *extflag.PathOrContent) {
	description := fmt.Sprintf("Replicate data from one object storage to another. NOTE: Currently it works only with Thanos blocks (%v has to have Thanos metadata).", block.MetaFilename)
	replicateCmd := root.Command("replicate", description)

	// Flags are declared in the same order as before so the command keeps its registration order.
	httpBindAddr, httpGracePeriod := regHTTPFlags(replicateCmd)
	// Object store config registered with the "-to" suffix (see its help text).
	dstObjStoreConfig := regCommonObjStoreFlags(replicateCmd, "-to", false, "The object storage which replicate data to.")

	// TODO(bwplotka): Allow to replicate many resolution levels.
	defaultResolution := strconv.FormatInt(downsample.ResLevel0, 10)
	resolution := replicateCmd.Flag("resolution", "Only blocks with this resolution will be replicated.").
		Default(defaultResolution).
		HintAction(listResLevel).
		Int64()

	// TODO(bwplotka): Allow to replicate many compaction levels.
	compaction := replicateCmd.Flag("compaction", "Only blocks with this compaction level will be replicated.").
		Default("1").
		Int()

	matcherStrs := replicateCmd.Flag("matcher", "Only blocks whose external labels exactly match this matcher will be replicated.").
		PlaceHolder("key=\"value\"").
		Strings()

	singleRun := replicateCmd.Flag("single-run", "Run replication only one time, then exit.").
		Default("false").
		Bool()

	// Flag pointers are dereferenced inside the closure, i.e. only once the
	// setup function is invoked.
	setup := func(g *run.Group, logger log.Logger, reg *prometheus.Registry, tracer opentracing.Tracer, _ <-chan struct{}, _ bool) error {
		blockMatchers, parseErr := replicate.ParseFlagMatchers(*matcherStrs)
		if parseErr != nil {
			return errors.Wrap(parseErr, "parse block label matchers")
		}

		gracePeriod := time.Duration(*httpGracePeriod)
		resolutionLevel := compact.ResolutionLevel(*resolution)

		return replicate.RunReplicate(
			g,
			logger,
			reg,
			tracer,
			*httpBindAddr,
			gracePeriod,
			blockMatchers,
			resolutionLevel,
			*compaction,
			objStoreConfig,
			dstObjStoreConfig,
			*singleRun,
		)
	}

	m[name+" replicate"] = setup
}

// refresh periodically reloads metadata from remote storage and updates the UI.
func refresh(ctx context.Context, logger log.Logger, bucketUI *ui.Bucket, duration time.Duration, timeout time.Duration, name string, reg *prometheus.Registry, objStoreConfig *extflag.PathOrContent) error {
confContentYaml, err := objStoreConfig.Content()
Expand Down
74 changes: 74 additions & 0 deletions docs/components/bucket.md
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,10 @@ Subcommands:
bucket web [<flags>]
Web interface for remote storage bucket
bucket replicate [<flags>]
Replicate data from one object storage to another. NOTE: Currently it works
only with Thanos blocks (meta.json has to have Thanos metadata).
```

Expand Down Expand Up @@ -315,3 +319,73 @@ Flags:
--timeout=5m Timeout to download metadata from remote storage
```

### replicate

`bucket replicate` is used to replicate buckets from one object storage to another.

NOTE: Currently it works only with Thanos blocks (meta.json has to have Thanos metadata).

Example:
```
$ thanos bucket replicate --objstore.config-file="..." --objstore-to.config="..."
```

[embedmd]:# (flags/bucket_replicate.txt)
```txt
usage: thanos bucket replicate [<flags>]
Replicate data from one object storage to another. NOTE: Currently it works only
with Thanos blocks (meta.json has to have Thanos metadata).
Flags:
-h, --help Show context-sensitive help (also try
--help-long and --help-man).
--version Show application version.
--log.level=info Log filtering level.
--log.format=logfmt Log format to use. Possible options: logfmt or
json.
--tracing.config-file=<file-path>
Path to YAML file with tracing configuration.
See format details:
https://thanos.io/tracing.md/#configuration
--tracing.config=<content>
Alternative to 'tracing.config-file' flag
(lower priority). Content of YAML file with
tracing configuration. See format details:
https://thanos.io/tracing.md/#configuration
--objstore.config-file=<file-path>
Path to YAML file that contains object store
configuration. See format details:
https://thanos.io/storage.md/#configuration
--objstore.config=<content>
Alternative to 'objstore.config-file' flag
(lower priority). Content of YAML file that
contains object store configuration. See format
details:
https://thanos.io/storage.md/#configuration
--http-address="0.0.0.0:10902"
Listen host:port for HTTP endpoints.
--http-grace-period=2m Time to wait after an interrupt received for
HTTP Server.
--objstore-to.config-file=<file-path>
Path to YAML file that contains object store-to
configuration. See format details:
https://thanos.io/storage.md/#configuration The
object storage which replicate data to.
--objstore-to.config=<content>
Alternative to 'objstore-to.config-file' flag
(lower priority). Content of YAML file that
contains object store-to configuration. See
format details:
https://thanos.io/storage.md/#configuration The
object storage which replicate data to.
--resolution=0 Only blocks with this resolution will be
replicated.
--compaction=1 Only blocks with this compaction level will be
replicated.
--matcher=key="value" ... Only blocks whose external labels exactly match
this matcher will be replicated.
--single-run Run replication only one time, then exit.
```
42 changes: 42 additions & 0 deletions examples/alerts/alerts.md
Original file line number Diff line number Diff line change
Expand Up @@ -428,6 +428,48 @@ rules:
severity: warning
```
## Replicate
[embedmd]:# (../tmp/thanos-bucket-replicate.rules.yaml yaml)
```yaml
name: thanos-bucket-replicate.rules
rules:
- alert: ThanosBucketReplicateIsDown
annotations:
message: Thanos Replicate has disappeared from Prometheus target discovery.
expr: |
absent(up{job=~"thanos-bucket-replicate.*"})
for: 5m
labels:
severity: critical
- alert: ThanosBucketReplicateErrorRate
annotations:
message: Thanos Replicate failing to run, {{ $value | humanize }}% of attempts
failed.
expr: |
(
sum(rate(thanos_replicate_replication_runs_total{result="error", job=~"thanos-bucket-replicate.*"}[5m]))
/ on (namespace) group_left
sum(rate(thanos_replicate_replication_runs_total{job=~"thanos-bucket-replicate.*"}[5m]))
) * 100 >= 10
for: 5m
labels:
severity: critical
- alert: ThanosBucketReplicateRunLatency
annotations:
message: Thanos Replicate {{$labels.job}} has a 90th percentile latency of {{
$value }} seconds for the replicate operations.
expr: |
(
histogram_quantile(0.9, sum by (job, le) (thanos_replicate_replication_run_duration_seconds_bucket{job=~"thanos-bucket-replicate.*"})) > 120
and
sum by (job) (rate(thanos_replicate_replication_run_duration_seconds_bucket{job=~"thanos-bucket-replicate.*"}[5m])) > 0
)
for: 5m
labels:
severity: critical
```
## Extras
### Absent Rules
Expand Down
36 changes: 36 additions & 0 deletions examples/alerts/alerts.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -439,3 +439,39 @@ groups:
for: 5m
labels:
severity: critical
- name: thanos-bucket-replicate.rules
rules:
- alert: ThanosBucketReplicateIsDown
annotations:
message: Thanos Replicate has disappeared from Prometheus target discovery.
expr: |
absent(up{job=~"thanos-bucket-replicate.*"})
for: 5m
labels:
severity: critical
- alert: ThanosBucketReplicateErrorRate
annotations:
message: Thanos Replicate failing to run, {{ $value | humanize }}% of attempts
failed.
expr: |
(
sum(rate(thanos_replicate_replication_runs_total{result="error", job=~"thanos-bucket-replicate.*"}[5m]))
/ on (namespace) group_left
sum(rate(thanos_replicate_replication_runs_total{job=~"thanos-bucket-replicate.*"}[5m]))
) * 100 >= 10
for: 5m
labels:
severity: critical
- alert: ThanosBucketReplicateRunLatency
annotations:
message: Thanos Replicate {{$labels.job}} has a 90th percentile latency of {{
$value }} seconds for the replicate operations.
expr: |
(
histogram_quantile(0.9, sum by (job, le) (thanos_replicate_replication_run_duration_seconds_bucket{job=~"thanos-bucket-replicate.*"})) > 120
and
sum by (job) (rate(thanos_replicate_replication_run_duration_seconds_bucket{job=~"thanos-bucket-replicate.*"}[5m])) > 0
)
for: 5m
labels:
severity: critical
2 changes: 2 additions & 0 deletions examples/alerts/rules.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -121,3 +121,5 @@ groups:
labels:
quantile: "0.99"
record: :thanos_objstore_bucket_operation_duration_seconds:histogram_quantile
- name: thanos-bucket-replicate.rules
rules: []
Loading

0 comments on commit 0f6df8b

Please sign in to comment.