diff --git a/cmd/validate.go b/cmd/validate.go index 638ca0502..86ebcc9e0 100644 --- a/cmd/validate.go +++ b/cmd/validate.go @@ -9,6 +9,7 @@ import ( "encoding/hex" "encoding/json" "fmt" + "strings" "github.com/gogo/protobuf/proto" "github.com/pingcap/errors" @@ -20,6 +21,7 @@ import ( "github.com/tikv/pd/pkg/mock/mockid" "go.uber.org/zap" + "github.com/pingcap/br/pkg/conn" "github.com/pingcap/br/pkg/restore" "github.com/pingcap/br/pkg/rtree" "github.com/pingcap/br/pkg/task" @@ -45,6 +47,7 @@ func NewValidateCommand() *cobra.Command { meta.AddCommand(newBackupMetaCommand()) meta.AddCommand(decodeBackupMetaCommand()) meta.AddCommand(encodeBackupMetaCommand()) + meta.AddCommand(setPDConfigCommand()) meta.Hidden = true return meta @@ -323,3 +326,43 @@ func encodeBackupMetaCommand() *cobra.Command { } return encodeBackupMetaCmd } + +func setPDConfigCommand() *cobra.Command { + pdConfigCmd := &cobra.Command{ + Use: "reset-pd-config-as-default", + Short: "reset pd scheduler and config adjusted by BR to default value", + RunE: func(cmd *cobra.Command, args []string) error { + ctx, cancel := context.WithCancel(GetDefaultContext()) + defer cancel() + + var cfg task.Config + if err := cfg.ParseFromFlags(cmd.Flags()); err != nil { + return err + } + + mgr, err := task.NewMgr(ctx, tidbGlue, cfg.PD, cfg.TLS, cfg.CheckRequirements) + if err != nil { + return err + } + defer mgr.Close() + + for scheduler := range conn.Schedulers { + if strings.HasPrefix(scheduler, "balance") { + err := mgr.AddScheduler(ctx, scheduler) + if err != nil { + return err + } + log.Info("add pd schedulers succeed", + zap.String("schedulers", scheduler)) + } + } + + if err := mgr.UpdatePDScheduleConfig(ctx, conn.DefaultPDCfg); err != nil { + return errors.Annotate(err, "fail to update PD merge config") + } + log.Info("add pd configs succeed", zap.Any("config", conn.DefaultPDCfg)) + return nil + }, + } + return pdConfigCmd +} diff --git a/pkg/conn/scheduler_utils.go b/pkg/conn/scheduler_utils.go index 56b9e12b5..b31f3415b 100644 --- a/pkg/conn/scheduler_utils.go +++ b/pkg/conn/scheduler_utils.go @@ -21,7 +21,8 @@ type clusterConfig struct { } var ( - schedulers = map[string]struct{}{ + // Schedulers represent region/leader schedulers which can impact on performance. + Schedulers = map[string]struct{}{ "balance-leader-scheduler": {}, "balance-hot-region-scheduler": {}, "balance-region-scheduler": {}, @@ -30,6 +31,7 @@ var ( "shuffle-region-scheduler": {}, "shuffle-hot-region-scheduler": {}, } + pdRegionMergeCfg = []string{ "max-merge-region-keys", "max-merge-region-size", @@ -39,6 +41,15 @@ var ( "region-schedule-limit", "max-snapshot-count", } + + // DefaultPDCfg find by https://github.com/tikv/pd/blob/master/conf/config.toml. + DefaultPDCfg = map[string]interface{}{ + "max-merge-region-keys": 200000, + "max-merge-region-size": 20, + "leader-schedule-limit": 4, + "region-schedule-limit": 2048, + "max-snapshot-count": 3, + } ) func addPDLeaderScheduler(ctx context.Context, mgr *Mgr, removedSchedulers []string) error { @@ -101,7 +112,7 @@ func (mgr *Mgr) RemoveSchedulers(ctx context.Context) (undo utils.UndoFunc, err } needRemoveSchedulers := make([]string, 0, len(existSchedulers)) for _, s := range existSchedulers { - if _, ok := schedulers[s]; ok { + if _, ok := Schedulers[s]; ok { needRemoveSchedulers = append(needRemoveSchedulers, s) } } diff --git a/pkg/task/backup.go b/pkg/task/backup.go index 321bb16e1..ae724d49a 100644 --- a/pkg/task/backup.go +++ b/pkg/task/backup.go @@ -175,7 +175,7 @@ func RunBackup(c context.Context, g glue.Glue, cmdName string, cfg *BackupConfig if err != nil { return err } - mgr, err := newMgr(ctx, g, cfg.PD, cfg.TLS, cfg.CheckRequirements) + mgr, err := NewMgr(ctx, g, cfg.PD, cfg.TLS, cfg.CheckRequirements) if err != nil { return err } diff --git a/pkg/task/backup_raw.go b/pkg/task/backup_raw.go index fa6a15e72..be89fa456 100644 --- a/pkg/task/backup_raw.go +++ b/pkg/task/backup_raw.go @@ -125,7 +125,7 @@ func RunBackupRaw(c context.Context, g glue.Glue, cmdName string, cfg *RawKvConf if err != nil { return err } - mgr, err := newMgr(ctx, g, cfg.PD, cfg.TLS, cfg.CheckRequirements) + mgr, err := NewMgr(ctx, g, cfg.PD, cfg.TLS, cfg.CheckRequirements) if err != nil { return err } diff --git a/pkg/task/common.go b/pkg/task/common.go index 6f21bf359..67cf8e20e 100644 --- a/pkg/task/common.go +++ b/pkg/task/common.go @@ -277,8 +277,8 @@ func (cfg *Config) ParseFromFlags(flags *pflag.FlagSet) error { return cfg.TLS.ParseFromFlags(flags) } -// newMgr creates a new mgr at the given PD address. -func newMgr(ctx context.Context, +// NewMgr creates a new mgr at the given PD address. +func NewMgr(ctx context.Context, g glue.Glue, pds []string, tlsConfig TLSConfig, checkRequirements bool) (*conn.Mgr, error) { diff --git a/pkg/task/restore.go b/pkg/task/restore.go index c91e56abf..3cdeae7a8 100644 --- a/pkg/task/restore.go +++ b/pkg/task/restore.go @@ -93,7 +93,7 @@ func RunRestore(c context.Context, g glue.Glue, cmdName string, cfg *RestoreConf ctx, cancel := context.WithCancel(c) defer cancel() - mgr, err := newMgr(ctx, g, cfg.PD, cfg.TLS, cfg.CheckRequirements) + mgr, err := NewMgr(ctx, g, cfg.PD, cfg.TLS, cfg.CheckRequirements) if err != nil { return err } @@ -369,7 +369,7 @@ func RunRestoreTiflashReplica(c context.Context, g glue.Glue, cmdName string, cf ctx, cancel := context.WithCancel(c) defer cancel() - mgr, err := newMgr(ctx, g, cfg.PD, cfg.TLS, cfg.CheckRequirements) + mgr, err := NewMgr(ctx, g, cfg.PD, cfg.TLS, cfg.CheckRequirements) if err != nil { return err } diff --git a/pkg/task/restore_log.go b/pkg/task/restore_log.go index 5c2854e43..986d9a48a 100644 --- a/pkg/task/restore_log.go +++ b/pkg/task/restore_log.go @@ -97,7 +97,7 @@ func RunLogRestore(c context.Context, g glue.Glue, cfg *LogRestoreConfig) error ctx, cancel := context.WithCancel(c) defer cancel() - mgr, err := newMgr(ctx, g, cfg.PD, cfg.TLS, cfg.CheckRequirements) + mgr, err := NewMgr(ctx, g, cfg.PD, cfg.TLS, cfg.CheckRequirements) if err != nil { return err } diff --git a/pkg/task/restore_raw.go b/pkg/task/restore_raw.go index 61bd020b4..397bad04e 100644 --- a/pkg/task/restore_raw.go +++ b/pkg/task/restore_raw.go @@ -58,7 +58,7 @@ func RunRestoreRaw(c context.Context, g glue.Glue, cmdName string, cfg *RestoreR ctx, cancel := context.WithCancel(c) defer cancel() - mgr, err := newMgr(ctx, g, cfg.PD, cfg.TLS, cfg.CheckRequirements) + mgr, err := NewMgr(ctx, g, cfg.PD, cfg.TLS, cfg.CheckRequirements) if err != nil { return err } diff --git a/tests/br_other/run.sh b/tests/br_other/run.sh index 7d968610e..c4e5f3cf6 100644 --- a/tests/br_other/run.sh +++ b/tests/br_other/run.sh @@ -95,8 +95,34 @@ fi # make sure we won't stuck in non-scheduler state, even we send a SIGTERM to it. # give enough time to BR so it can gracefully stop. -sleep 10 -! curl http://$PD_ADDR/pd/api/v1/config/schedule | grep '"disable": true' +sleep 5 +if curl http://$PD_ADDR/pd/api/v1/config/schedule | jq '[."schedulers-v2"][0][0]' | grep -q '"disable": false' +then + echo "TEST: [$TEST_NAME] failed because scheduler has not been removed" + exit 1 +fi + +pd_settings=5 +# we need reset pd scheduler/config to default +# until pd has the solution to temporary set these scheduler/configs. +run_br validate reset-pd-config-as-default + +# max-merge-region-size set to default 20 +curl http://$PD_ADDR/pd/api/v1/config/schedule | jq '."max-merge-region-size"' | grep "20" || ((pd_settings--)) + +# max-merge-region-keys set to default 200000 +curl http://$PD_ADDR/pd/api/v1/config/schedule | jq '."max-merge-region-keys"' | grep "200000" || ((pd_settings--)) +# balance-region scheduler enabled +curl http://$PD_ADDR/pd/api/v1/config/schedule | jq '."schedulers-v2"[] | {disable: .disable, type: ."type" | select (.=="balance-region")}' | grep '"disable": false' || ((pd_settings--)) +# balance-leader scheduler enabled +curl http://$PD_ADDR/pd/api/v1/config/schedule | jq '."schedulers-v2"[] | {disable: .disable, type: ."type" | select (.=="balance-leader")}' | grep '"disable": false' || ((pd_settings--)) +# hot region scheduler enabled +curl http://$PD_ADDR/pd/api/v1/config/schedule | jq '."schedulers-v2"[] | {disable: .disable, type: ."type" | select (.=="hot-region")}' | grep '"disable": false' || ((pd_settings--)) + +if [ "$pd_settings" -ne "5" ];then + echo "TEST: [$TEST_NAME] test validate reset pd config failed!" + exit 1 +fi run_sql "DROP DATABASE $DB;"