From 93e620eccaa48387e57ecb1587df736164d6f69c Mon Sep 17 00:00:00 2001 From: Marius Posta Date: Mon, 7 Nov 2022 10:50:58 -0500 Subject: [PATCH] schemachanger: improve panic- and error handling Previously, the declarative schema changer would only recover from runtime errors in certain subsystems like building the targets or planning the execution of operations. Consequently an implementation bug leading to a runtime error in the execution layer would trigger a panic which would not be recovered and which would cause the whole process to crash. This commit fixes this by introducing a common error handler in the form of scerrors.EventLogger, which recovers from panics, wraps errors, and prints informative log messages in a uniform way, to be used at the top of (or near the top of) the declarative schema changer call stack. Fixes #91400. Release note (bug fix): fixed a bug in which panics triggered by certain DDL statements were not properly recovered, leading to the cluster node crashing. --- pkg/sql/schemachanger/scbuild/BUILD.bazel | 2 - pkg/sql/schemachanger/scbuild/build.go | 32 +++--------- pkg/sql/schemachanger/scerrors/BUILD.bazel | 3 ++ pkg/sql/schemachanger/scerrors/errors.go | 58 ++++++++++++++++++++++ pkg/sql/schemachanger/scpb/state.go | 13 +++++ pkg/sql/schemachanger/scplan/BUILD.bazel | 2 + pkg/sql/schemachanger/scplan/plan.go | 19 ++++--- pkg/sql/schemachanger/scrun/BUILD.bazel | 3 +- pkg/sql/schemachanger/scrun/scrun.go | 30 +++++------ pkg/sql/testdata/telemetry/error | 2 +- 10 files changed, 108 insertions(+), 56 deletions(-) diff --git a/pkg/sql/schemachanger/scbuild/BUILD.bazel b/pkg/sql/schemachanger/scbuild/BUILD.bazel index b152068161a0..75a7c4a9a84a 100644 --- a/pkg/sql/schemachanger/scbuild/BUILD.bazel +++ b/pkg/sql/schemachanger/scbuild/BUILD.bazel @@ -47,8 +47,6 @@ go_library( "//pkg/sql/sessiondatapb", "//pkg/sql/sqlerrors", "//pkg/sql/types", - "//pkg/util/log", - "//pkg/util/timeutil", "//pkg/util/uuid", "@com_github_cockroachdb_errors//:errors", "@com_github_cockroachdb_redact//:redact", diff --git a/pkg/sql/schemachanger/scbuild/build.go b/pkg/sql/schemachanger/scbuild/build.go index 2c76fc3c9da6..ca68a3cdbb77 100644 --- a/pkg/sql/schemachanger/scbuild/build.go +++ b/pkg/sql/schemachanger/scbuild/build.go @@ -12,7 +12,6 @@ package scbuild import ( "context" - "runtime" "github.com/cockroachdb/cockroach/pkg/settings/cluster" "github.com/cockroachdb/cockroach/pkg/sql/catalog" @@ -27,9 +26,7 @@ import ( "github.com/cockroachdb/cockroach/pkg/sql/sem/eval" "github.com/cockroachdb/cockroach/pkg/sql/sem/tree" "github.com/cockroachdb/cockroach/pkg/sql/sessiondatapb" - "github.com/cockroachdb/cockroach/pkg/util/log" - "github.com/cockroachdb/cockroach/pkg/util/timeutil" - "github.com/cockroachdb/errors" + "github.com/cockroachdb/redact" ) // Build constructs a new state from an initial state and a statement. @@ -39,13 +36,11 @@ import ( func Build( ctx context.Context, dependencies Dependencies, initial scpb.CurrentState, n tree.Statement, ) (_ scpb.CurrentState, err error) { - start := timeutil.Now() - defer func() { - if err != nil || !log.ExpensiveLogEnabled(ctx, 2) { - return - } - log.Infof(ctx, "build for %s took %v", n.StatementTag(), timeutil.Since(start)) - }() + defer scerrors.StartEventf( + ctx, + "building declarative schema change targets for %s", + redact.Safe(n.StatementTag()), + ).HandlePanicAndLogError(ctx, &err) initial = initial.DeepCopy() bs := newBuilderState(ctx, dependencies, initial) els := newEventLogState(dependencies, initial, n) @@ -64,21 +59,6 @@ func Build( TreeAnnotator: an, SchemaFeatureChecker: dependencies.FeatureChecker(), } - defer func() { - switch recErr := recover().(type) { - case nil: - // No error. - case runtime.Error: - err = errors.WithAssertionFailure(recErr) - case error: - err = recErr - default: - err = errors.AssertionFailedf( - "unexpected error encountered while building schema change plan %s", - recErr, - ) - } - }() scbuildstmt.Process(b, an.GetStatement()) an.ValidateAnnotations() els.statements[len(els.statements)-1].RedactedStatement = diff --git a/pkg/sql/schemachanger/scerrors/BUILD.bazel b/pkg/sql/schemachanger/scerrors/BUILD.bazel index 54658eb0a929..5b62de479367 100644 --- a/pkg/sql/schemachanger/scerrors/BUILD.bazel +++ b/pkg/sql/schemachanger/scerrors/BUILD.bazel @@ -10,7 +10,10 @@ go_library( "//pkg/sql/catalog", "//pkg/sql/catalog/descpb", "//pkg/sql/sem/tree", + "//pkg/util/log", + "//pkg/util/timeutil", "@com_github_cockroachdb_errors//:errors", + "@com_github_cockroachdb_redact//:redact", ], ) diff --git a/pkg/sql/schemachanger/scerrors/errors.go b/pkg/sql/schemachanger/scerrors/errors.go index 785b5db3978b..a3c183cd160a 100644 --- a/pkg/sql/schemachanger/scerrors/errors.go +++ b/pkg/sql/schemachanger/scerrors/errors.go @@ -11,15 +11,73 @@ package scerrors import ( + "context" "fmt" + "runtime" "strings" + "time" "github.com/cockroachdb/cockroach/pkg/sql/catalog" "github.com/cockroachdb/cockroach/pkg/sql/catalog/descpb" "github.com/cockroachdb/cockroach/pkg/sql/sem/tree" + "github.com/cockroachdb/cockroach/pkg/util/log" + "github.com/cockroachdb/cockroach/pkg/util/timeutil" "github.com/cockroachdb/errors" + "github.com/cockroachdb/redact" ) +// EventLogger is a convenience object used for logging schema changer events. +type EventLogger struct { + msg redact.SafeValue + start time.Time +} + +// StartEventf logs the start of a schema change event, and returns an object +// with a HandlePanicAndLogError method to handle panics and log errors at the +// end of the event. +// +// Typical usage is along the lines of: +// - defer StartEventf(...).HandlePanicAndLogError(...) +func StartEventf(ctx context.Context, format string, args ...interface{}) EventLogger { + msg := redact.Safe(fmt.Sprintf(format, args...)) + log.InfofDepth(ctx, 1, "%s", msg) + return EventLogger{ + msg: msg, + start: timeutil.Now(), + } +} + +// HandlePanicAndLogError handles panics by recovering them in an error, +// which it then also logs. See also StartEventf. +func (el EventLogger) HandlePanicAndLogError(ctx context.Context, err *error) { + switch recErr := recover().(type) { + case nil: + // No panicked error. + case runtime.Error: + *err = errors.WithAssertionFailure(recErr) + case error: + *err = recErr + default: + *err = errors.AssertionFailedf("recovered from uncategorizable panic: %v", recErr) + } + if *err == nil { + *err = ctx.Err() + } + if errors.Is(*err, context.Canceled) { + return + } + if *err == nil { + if log.ExpensiveLogEnabled(ctx, 2) { + log.InfofDepth(ctx, 1, "done %s in %s", el.msg, redact.Safe(timeutil.Since(el.start))) + } + return + } + log.WarningfDepth(ctx, 1, "failed %s with error: %v", el.msg, *err) + if errors.HasAssertionFailure(*err) { + *err = errors.Wrapf(*err, "%s", el.msg) + } +} + type notImplementedError struct { n tree.NodeFormatter detail string diff --git a/pkg/sql/schemachanger/scpb/state.go b/pkg/sql/schemachanger/scpb/state.go index e2c23daae2c5..edf948444171 100644 --- a/pkg/sql/schemachanger/scpb/state.go +++ b/pkg/sql/schemachanger/scpb/state.go @@ -12,6 +12,7 @@ package scpb import ( "sort" + "strings" "github.com/cockroachdb/cockroach/pkg/util/protoutil" "github.com/cockroachdb/errors" @@ -69,6 +70,18 @@ func (s *CurrentState) Rollback() { s.InRollback = true } +// StatementTags returns the concatenated statement tags in the current state. +func (s CurrentState) StatementTags() string { + var sb strings.Builder + for i, stmt := range s.Statements { + if i > 0 { + sb.WriteString("; ") + } + sb.WriteString(stmt.StatementTag) + } + return sb.String() +} + // NumStatus is the number of values which Status may take on. var NumStatus = len(Status_name) diff --git a/pkg/sql/schemachanger/scplan/BUILD.bazel b/pkg/sql/schemachanger/scplan/BUILD.bazel index 7efd047612b5..f8d5b08823fd 100644 --- a/pkg/sql/schemachanger/scplan/BUILD.bazel +++ b/pkg/sql/schemachanger/scplan/BUILD.bazel @@ -12,6 +12,7 @@ go_library( deps = [ "//pkg/clusterversion", "//pkg/jobs/jobspb", + "//pkg/sql/schemachanger/scerrors", "//pkg/sql/schemachanger/scop", "//pkg/sql/schemachanger/scpb", "//pkg/sql/schemachanger/scplan/internal/opgen", @@ -25,6 +26,7 @@ go_library( "//pkg/util/timeutil", "//pkg/util/treeprinter", "@com_github_cockroachdb_errors//:errors", + "@com_github_cockroachdb_redact//:redact", "@in_gopkg_yaml_v2//:yaml_v2", ], ) diff --git a/pkg/sql/schemachanger/scplan/plan.go b/pkg/sql/schemachanger/scplan/plan.go index 0bb0e47b1b4c..c28eee82c8bf 100644 --- a/pkg/sql/schemachanger/scplan/plan.go +++ b/pkg/sql/schemachanger/scplan/plan.go @@ -15,6 +15,7 @@ import ( "github.com/cockroachdb/cockroach/pkg/clusterversion" "github.com/cockroachdb/cockroach/pkg/jobs/jobspb" + "github.com/cockroachdb/cockroach/pkg/sql/schemachanger/scerrors" "github.com/cockroachdb/cockroach/pkg/sql/schemachanger/scop" "github.com/cockroachdb/cockroach/pkg/sql/schemachanger/scpb" "github.com/cockroachdb/cockroach/pkg/sql/schemachanger/scplan/internal/opgen" @@ -24,6 +25,7 @@ import ( "github.com/cockroachdb/cockroach/pkg/util/log" "github.com/cockroachdb/cockroach/pkg/util/timeutil" "github.com/cockroachdb/errors" + "github.com/cockroachdb/redact" ) // Params holds the arguments for planning. @@ -79,6 +81,13 @@ func (p Plan) StagesForCurrentPhase() []scstage.Stage { // MakePlan generates a Plan for a particular phase of a schema change, given // the initial state for a set of targets. Returns an error when planning fails. func MakePlan(ctx context.Context, initial scpb.CurrentState, params Params) (p Plan, err error) { + defer scerrors.StartEventf( + ctx, + "building declarative schema changer plan in %s (rollback=%v) for %s", + redact.Safe(params.ExecutionPhase), + redact.Safe(params.InRollback), + redact.Safe(initial.StatementTags()), + ).HandlePanicAndLogError(ctx, &err) p = Plan{ CurrentState: initial, Params: params, @@ -91,16 +100,6 @@ func MakePlan(ctx context.Context, initial scpb.CurrentState, params Params) (p } func makePlan(ctx context.Context, p *Plan) (err error) { - defer func() { - if r := recover(); r != nil { - rAsErr, ok := r.(error) - if !ok { - rAsErr = errors.Errorf("panic during MakePlan: %v", r) - } - err = rAsErr - } - err = errors.WithAssertionFailure(err) - }() { start := timeutil.Now() p.Graph = buildGraph(ctx, p.Params.ActiveVersion, p.CurrentState) diff --git a/pkg/sql/schemachanger/scrun/BUILD.bazel b/pkg/sql/schemachanger/scrun/BUILD.bazel index 2255f7cccce5..db75cea6687d 100644 --- a/pkg/sql/schemachanger/scrun/BUILD.bazel +++ b/pkg/sql/schemachanger/scrun/BUILD.bazel @@ -21,9 +21,8 @@ go_library( "//pkg/sql/schemachanger/scop", "//pkg/sql/schemachanger/scpb", "//pkg/sql/schemachanger/scplan", - "//pkg/util/log", - "//pkg/util/timeutil", "@com_github_cockroachdb_errors//:errors", + "@com_github_cockroachdb_redact//:redact", ], ) diff --git a/pkg/sql/schemachanger/scrun/scrun.go b/pkg/sql/schemachanger/scrun/scrun.go index 0df19a0194db..602f48fe01fd 100644 --- a/pkg/sql/schemachanger/scrun/scrun.go +++ b/pkg/sql/schemachanger/scrun/scrun.go @@ -24,9 +24,8 @@ import ( "github.com/cockroachdb/cockroach/pkg/sql/schemachanger/scop" "github.com/cockroachdb/cockroach/pkg/sql/schemachanger/scpb" "github.com/cockroachdb/cockroach/pkg/sql/schemachanger/scplan" - "github.com/cockroachdb/cockroach/pkg/util/log" - "github.com/cockroachdb/cockroach/pkg/util/timeutil" "github.com/cockroachdb/errors" + "github.com/cockroachdb/redact" ) // RunStatementPhase executes in-transaction schema changes for the targeted @@ -158,22 +157,18 @@ func executeStage( stageIdx int, stage scplan.Stage, ) (err error) { - - log.Infof(ctx, "executing %s (rollback=%v)", stage, p.InRollback) - + defer scerrors.StartEventf( + ctx, + "executing declarative schema change %s (rollback=%v) for %s", + redact.Safe(stage), + redact.Safe(p.InRollback), + redact.Safe(p.StatementTags()), + ).HandlePanicAndLogError(ctx, &err) if knobs != nil && knobs.BeforeStage != nil { if err := knobs.BeforeStage(p, stageIdx); err != nil { return err } } - - start := timeutil.Now() - defer func() { - if log.ExpensiveLogEnabled(ctx, 2) { - log.Infof(ctx, "executing %s (rollback=%v) took %v: err = %v", - stage, p.InRollback, timeutil.Since(start), err) - } - }() if err := scexec.ExecuteStage(ctx, deps, stage.Ops()); err != nil { // Don't go through the effort to wrap the error if it's a retry or it's a // cancelation. @@ -209,7 +204,12 @@ func makeState( descriptorIDs []descpb.ID, rollback bool, withCatalog withCatalogFunc, -) (scpb.CurrentState, error) { +) (state scpb.CurrentState, err error) { + defer scerrors.StartEventf( + ctx, + "rebuilding declarative schema change state from descriptors %v", + redact.Safe(descriptorIDs), + ).HandlePanicAndLogError(ctx, &err) descError := func(desc catalog.Descriptor, err error) error { return errors.Wrapf(err, "descriptor %q (%d)", desc.GetName(), desc.GetID()) } @@ -273,7 +273,7 @@ func makeState( }); err != nil { return scpb.CurrentState{}, err } - state, err := scpb.MakeCurrentStateFromDescriptors(descriptorStates) + state, err = scpb.MakeCurrentStateFromDescriptors(descriptorStates) if err != nil { return scpb.CurrentState{}, err } diff --git a/pkg/sql/testdata/telemetry/error b/pkg/sql/testdata/telemetry/error index bbc50567cd51..4933d6529360 100644 --- a/pkg/sql/testdata/telemetry/error +++ b/pkg/sql/testdata/telemetry/error @@ -43,7 +43,7 @@ SELECT crdb_internal.unsafe_upsert_descriptor(id, crdb_internal.json_to_pb('desc feature-usage DROP TABLE tbl CASCADE; ---- -error: pq: internal error: relation "tbl" (...): missing fk back reference "tbl_customer_fkey" to "tbl" from "fktbl" +error: pq: internal error: building declarative schema change targets for DROP TABLE: relation "tbl" (...): missing fk back reference "tbl_customer_fkey" to "tbl" from "fktbl" errorcodes.XX000 sql.schema.validation_errors.read.backward_references.relation