cli: don't fail drain cmd if cluster settings aren't available
This makes the command more robust, since it should still work even if
the settings cannot be fetched. If the cluster is not fully available,
then this step may fail, but it should not prevent a drain command on a
specific node.

Release note: None
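
In essence, the change wraps the best-effort cluster-settings lookup in its own short timeout and downgrades any failure to a warning, so an unreachable or degraded cluster cannot block the drain itself. A minimal, self-contained sketch of that shape using only the Go standard library — the helper name fetchDrainWait, the durations, and the single-setting handling are illustrative stand-ins, not the CockroachDB API:

package main

import (
	"context"
	"fmt"
	"os"
	"time"
)

// fetchDrainWait stands in for the cluster-settings RPC; it blocks until the
// context expires or a simulated response arrives.
func fetchDrainWait(ctx context.Context) (time.Duration, error) {
	select {
	case <-ctx.Done():
		return 0, ctx.Err()
	case <-time.After(100 * time.Millisecond): // simulated RPC latency
		return 30 * time.Second, nil
	}
}

func main() {
	drainWait := 10 * time.Second // stands in for the --drain-wait flag

	// Best-effort: give the lookup its own short budget, and turn any failure
	// into a warning instead of an error so it cannot block the drain itself.
	ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
	defer cancel()
	if fromCluster, err := fetchDrainWait(ctx); err != nil {
		fmt.Fprintf(os.Stderr, "warning: could not check drain related cluster settings: %v\n", err)
	} else if fromCluster > drainWait {
		// Use the larger value, mirroring the warning the real command prints.
		drainWait = fromCluster
	}

	fmt.Println("proceeding to drain with wait =", drainWait)
}

The actual change below applies the same idea by wrapping the Settings RPC in contextutil.RunWithTimeout and printing a warning on failure.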
rafiss committed Mar 23, 2023
1 parent c2460f1 commit 5becbc8
Showing 2 changed files with 79 additions and 31 deletions.
58 changes: 31 additions & 27 deletions pkg/cli/rpc_node_shutdown.go
@@ -64,37 +64,41 @@ func doDrain(
 		return doDrainNoTimeout(ctx, c, targetNode)
 	}
 
-	shutdownSettings, err := c.Settings(ctx, &serverpb.SettingsRequest{
-		Keys: []string{
-			"server.shutdown.drain_wait",
-			"server.shutdown.connection_wait",
-			"server.shutdown.query_wait",
-			"server.shutdown.lease_transfer_wait",
-		},
-		UnredactedValues: true,
-	})
-	if err != nil {
-		return false, true, err
-	}
-
-	// Add an extra buffer of 10 seconds for the timeout.
-	minWait := 10 * time.Second
-	for k, v := range shutdownSettings.KeyValues {
-		wait, err := time.ParseDuration(v.Value)
+	if err := contextutil.RunWithTimeout(ctx, "get-drain-settings", 5*time.Second, func(ctx context.Context) error {
+		shutdownSettings, err := c.Settings(ctx, &serverpb.SettingsRequest{
+			Keys: []string{
+				"server.shutdown.drain_wait",
+				"server.shutdown.connection_wait",
+				"server.shutdown.query_wait",
+				"server.shutdown.lease_transfer_wait",
+			},
+			UnredactedValues: true,
+		})
 		if err != nil {
-			return false, true, err
+			return err
 		}
-		minWait += wait
-		// query_wait is used twice during draining, so count it twice here.
-		if k == "server.shutdown.query_wait" {
+		// Add an extra buffer of 10 seconds for the timeout.
+		minWait := 10 * time.Second
+		for k, v := range shutdownSettings.KeyValues {
+			wait, err := time.ParseDuration(v.Value)
+			if err != nil {
+				return err
+			}
 			minWait += wait
+			// query_wait is used twice during draining, so count it twice here.
+			if k == "server.shutdown.query_wait" {
+				minWait += wait
+			}
 		}
-	}
-	if minWait > drainCtx.drainWait {
-		fmt.Fprintf(stderr, "warning: --drain-wait is %s, but the server.shutdown.{drain,query,connection,lease_transfer}_wait "+
-			"cluster settings require a value of at least %s; using the larger value\n",
-			drainCtx.drainWait, minWait)
-		drainCtx.drainWait = minWait
+		if minWait > drainCtx.drainWait {
+			fmt.Fprintf(stderr, "warning: --drain-wait is %s, but the server.shutdown.{drain,query,connection,lease_transfer}_wait "+
+				"cluster settings require a value of at least %s; using the larger value\n",
+				drainCtx.drainWait, minWait)
+			drainCtx.drainWait = minWait
+		}
+		return nil
+	}); err != nil {
+		fmt.Fprintf(stderr, "warning: could not check drain related cluster settings: %v\n", err)
 	}
 
 	err = contextutil.RunWithTimeout(ctx, "drain", drainCtx.drainWait, func(ctx context.Context) (err error) {
52 changes: 48 additions & 4 deletions pkg/cmd/roachtest/tests/drain.go
@@ -50,7 +50,16 @@ func registerDrain(r registry.Registry) {
 		Owner:   registry.OwnerSQLSessions,
 		Cluster: r.MakeClusterSpec(1),
 		Run: func(ctx context.Context, t test.Test, c cluster.Cluster) {
-			runTestWarningForConnWait(ctx, t, c)
+			runWarningForConnWait(ctx, t, c)
+		},
+	})
+
+	r.Add(registry.TestSpec{
+		Name:    "drain/not-at-quorum",
+		Owner:   registry.OwnerSQLSessions,
+		Cluster: r.MakeClusterSpec(3),
+		Run: func(ctx context.Context, t test.Test, c cluster.Cluster) {
+			runClusterNotAtQuorum(ctx, t, c)
 		},
 	})
 }
@@ -203,9 +212,9 @@ func runEarlyExitInConnectionWait(ctx context.Context, t test.Test, c cluster.Cluster) {
 
 }
 
-// runTestWarningForConnWait is to verify a warning exists in the case that
+// runWarningForConnWait is to verify a warning exists in the case that
 // connectionWait expires.
-func runTestWarningForConnWait(ctx context.Context, t test.Test, c cluster.Cluster) {
+func runWarningForConnWait(ctx context.Context, t test.Test, c cluster.Cluster) {
 	var err error
 	const (
 		// Set the duration of the draining period.
Expand Down Expand Up @@ -296,6 +305,39 @@ func runTestWarningForConnWait(ctx context.Context, t test.Test, c cluster.Clust
require.NoError(t, err, "warning is not logged in the log file")
}

// runClusterNotAtQuorum is to verify that draining works even when the cluster
// is not at quorum.
func runClusterNotAtQuorum(ctx context.Context, t test.Test, c cluster.Cluster) {
err := c.PutE(ctx, t.L(), t.Cockroach(), "./cockroach", c.All())
if err != nil {
t.Fatal(err)
}

c.Start(ctx, t.L(), option.DefaultStartOpts(), install.MakeClusterSettings(), c.All())
db := c.Conn(ctx, t.L(), 1)
defer func() { _ = db.Close() }()

err = WaitFor3XReplication(ctx, t, db)
require.NoError(t, err)

stopOpts := option.DefaultStopOpts()
stopOpts.RoachprodOpts.Sig = 9 // SIGKILL

c.Stop(ctx, t.L(), stopOpts, c.Node(1))
c.Stop(ctx, t.L(), stopOpts, c.Node(2))

t.Status("start draining node 3")
// Ignore the error, since the command is expected to time out.
results, _ := c.RunWithDetailsSingleNode(
ctx,
t.L(),
c.Node(3),
"./cockroach node drain --self --insecure --drain-wait=10s",
)
t.L().Printf("drain output:\n%s\n%s\n", results.Stdout, results.Stderr)
require.Contains(t, results.Stderr, "could not check drain related cluster settings")
}

// prepareCluster is to start the server on nodes in the given cluster, and set
// the cluster setting for duration of each phase of the draining process.
func prepareCluster(
@@ -308,7 +350,9 @@ func prepareCluster(
 ) {
 	var err error
 	err = c.PutE(ctx, t.L(), t.Cockroach(), "./cockroach", c.All())
-	require.NoError(t, err, "cannot mount cockroach binary")
+	if err != nil {
+		t.Fatal(err)
+	}
 
 	c.Start(ctx, t.L(), option.DefaultStartOpts(), install.MakeClusterSettings(), c.All())
 
