Skip to content

Commit

Permalink
inspect: resurrect the inspect command (tendermint#9655)
Browse files Browse the repository at this point in the history
resurrect the inspect command from tendermint#6785

Co-authored-by: Sam Kleinman <garen@tychoish.com>
Co-authored-by: Thane Thomson <connect@thanethomson.com>
Co-authored-by: Callum Waters <cmwaters19@gmail.com>
  • Loading branch information
4 people authored Dec 7, 2022
1 parent ac48630 commit 3324f49
Show file tree
Hide file tree
Showing 53 changed files with 1,506 additions and 362 deletions.
3 changes: 3 additions & 0 deletions CHANGELOG_PENDING.md
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,8 @@

- Go API
- [p2p] \#9625 Remove unused p2p/trust package (@cmwaters)
- [rpc] \#9655 Remove global environment and replace with constructor. (@williambanfield,@tychoish)
- [node] \#9655 Move DBContext and DBProvider from the node package to the config package. (@williambanfield,@tychoish)

- Blockchain Protocol

Expand All @@ -22,6 +24,7 @@
- [tools/tm-signer-harness] \#6498 Set OS home dir to instead of the hardcoded PATH. (@JayT106)
- [metrics] \#9682 move state-syncing and block-syncing metrics to their respective packages (@cmwaters)
labels have moved from block_syncing -> blocksync_syncing and state_syncing -> statesync_syncing
- [inspect] \#9655 Add a new `inspect` command for introspecting the state and block store of a crashed tendermint node. (@williambanfield)

### FEATURES

Expand Down
87 changes: 87 additions & 0 deletions cmd/tendermint/commands/inspect.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,87 @@
package commands

import (
"context"
"os"
"os/signal"
"syscall"

"github.com/spf13/cobra"

cfg "github.com/tendermint/tendermint/config"
"github.com/tendermint/tendermint/inspect"
"github.com/tendermint/tendermint/state"
"github.com/tendermint/tendermint/state/indexer/block"
"github.com/tendermint/tendermint/store"
"github.com/tendermint/tendermint/types"
)

// InspectCmd is the command for starting an inspect server.
var InspectCmd = &cobra.Command{
Use: "inspect",
Short: "Run an inspect server for investigating Tendermint state",
Long: `
inspect runs a subset of Tendermint's RPC endpoints that are useful for debugging
issues with Tendermint.
When the Tendermint consensus engine detects inconsistent state, it will crash the
Tendermint process. Tendermint will not start up while in this inconsistent state.
The inspect command can be used to query the block and state store using Tendermint
RPC calls to debug issues of inconsistent state.
`,

RunE: runInspect,
}

func init() {
InspectCmd.Flags().
String("rpc.laddr",
config.RPC.ListenAddress, "RPC listenener address. Port required")
InspectCmd.Flags().
String("db-backend",
config.DBBackend, "database backend: goleveldb | cleveldb | boltdb | rocksdb | badgerdb")
InspectCmd.Flags().
String("db-dir", config.DBPath, "database directory")
}

func runInspect(cmd *cobra.Command, args []string) error {
ctx, cancel := context.WithCancel(cmd.Context())
defer cancel()

c := make(chan os.Signal, 1)
signal.Notify(c, syscall.SIGTERM, syscall.SIGINT)
go func() {
<-c
cancel()
}()

blockStoreDB, err := cfg.DefaultDBProvider(&cfg.DBContext{ID: "blockstore", Config: config})
if err != nil {
return err
}
blockStore := store.NewBlockStore(blockStoreDB)
defer blockStore.Close()

stateDB, err := cfg.DefaultDBProvider(&cfg.DBContext{ID: "state", Config: config})
if err != nil {
return err
}
stateStore := state.NewStore(stateDB, state.StoreOptions{DiscardABCIResponses: false})
defer stateStore.Close()

genDoc, err := types.GenesisDocFromFile(config.GenesisFile())
if err != nil {
return err
}
txIndexer, blockIndexer, err := block.IndexerFromConfig(config, cfg.DefaultDBProvider, genDoc.ChainID)
if err != nil {
return err
}
ins := inspect.New(config.RPC, blockStore, stateStore, txIndexer, blockIndexer, logger)

logger.Info("starting inspect server")
if err := ins.Run(ctx); err != nil {
return err
}
return nil
}
1 change: 1 addition & 0 deletions cmd/tendermint/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ func main() {
cmd.VersionCmd,
cmd.RollbackStateCmd,
cmd.CompactGoLevelDBCmd,
cmd.InspectCmd,
debug.DebugCmd,
cli.NewCompletionCmd(rootCmd, true),
)
Expand Down
30 changes: 30 additions & 0 deletions config/db.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
package config

import (
"context"

dbm "github.com/tendermint/tm-db"

"github.com/tendermint/tendermint/libs/log"
"github.com/tendermint/tendermint/libs/service"
)

// ServiceProvider takes a config and a logger and returns a ready to go Node.
type ServiceProvider func(context.Context, *Config, log.Logger) (service.Service, error)

// DBContext specifies config information for loading a new DB.
type DBContext struct {
ID string
Config *Config
}

// DBProvider takes a DBContext and returns an instantiated DB.
type DBProvider func(*DBContext) (dbm.DB, error)

// DefaultDBProvider returns a database using the DBBackend and DBDir
// specified in the Config.
func DefaultDBProvider(ctx *DBContext) (dbm.DB, error) {
dbType := dbm.BackendType(ctx.Config.DBBackend)

return dbm.NewDB(ctx.ID, dbType, ctx.Config.DBDir())
}
2 changes: 1 addition & 1 deletion consensus/replay.go
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,7 @@ func (cs *State) readReplayMessage(msg *TimedWALMessage, newStepSub types.Subscr
if m.Height != m2.Height || m.Round != m2.Round || m.Step != m2.Step {
return fmt.Errorf("roundState mismatch. Got %v; Expected %v", m2, m)
}
case <-newStepSub.Cancelled():
case <-newStepSub.Canceled():
return fmt.Errorf("failed to read off newStepSub.Out(). newStepSub was canceled")
case <-ticker:
return fmt.Errorf("failed to read off newStepSub.Out()")
Expand Down
3 changes: 2 additions & 1 deletion consensus/replay_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -97,7 +97,7 @@ func startNewStateAndWaitForBlock(t *testing.T, consensusReplayConfig *cfg.Confi
require.NoError(t, err)
select {
case <-newBlockSub.Out():
case <-newBlockSub.Cancelled():
case <-newBlockSub.Canceled():
t.Fatal("newBlockSub was canceled")
case <-time.After(120 * time.Second):
t.Fatal("Timed out waiting for new block (see trace above)")
Expand Down Expand Up @@ -1198,6 +1198,7 @@ func (bs *mockBlockStore) PruneBlocks(height int64, state sm.State) (uint64, int
}

func (bs *mockBlockStore) DeleteLatestBlock() error { return nil }
func (bs *mockBlockStore) Close() error { return nil }

//---------------------------------------
// Test handshake/init chain
Expand Down
44 changes: 44 additions & 0 deletions docs/tools/debugging.md
Original file line number Diff line number Diff line change
Expand Up @@ -55,3 +55,47 @@ given destination directory. Each archive will contain:

Note: goroutine.out and heap.out will only be written if a profile address is
provided and is operational. This command is blocking and will log any error.

## Tendermint Inspect

Tendermint includes an `inspect` command for querying Tendermint's state store and block
store over Tendermint RPC.

When the Tendermint consensus engine detects inconsistent state, it will crash the
entire Tendermint process.
While in this inconsistent state, a node running Tendermint's consensus engine will not start up.
The `inspect` command runs only a subset of Tendermint's RPC endpoints for querying the block store
and state store.
`inspect` allows operators to query a read-only view of the stage.
`inspect` does not run the consensus engine at all and can therefore be used to debug
processes that have crashed due to inconsistent state.

### Running inspect

Start up the `inspect` tool on the machine where Tendermint crashed using:
```bash
tendermint inspect --home=</path/to/app.d>
```

`inspect` will use the data directory specified in your Tendermint configuration file.
`inspect` will also run the RPC server at the address specified in your Tendermint configuration file.

### Using inspect

With the `inspect` server running, you can access RPC endpoints that are critically important
for debugging.
Calling the `/status`, `/consensus_state` and `/dump_consensus_state` RPC endpoint
will return useful information about the Tendermint consensus state.

To start the `inspect` process, run
```bash
tendermint inspect
```

### RPC endpoints

The list of available RPC endpoints can be found by making a request to the RPC port.
For an `inspect` process running on `127.0.0.1:26657`, navigate your browser to
`http://127.0.0.1:26657/` to retrieve the list of enabled RPC endpoints.

Additional information on the Tendermint RPC endpoints can be found in the [rpc documentation](https://docs.tendermint.com/master/rpc).
1 change: 1 addition & 0 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -1361,6 +1361,7 @@ golang.org/x/sync v0.0.0-20200317015054-43a5402ce75a/go.mod h1:RxMgew5VJxzue5/jJ
golang.org/x/sync v0.0.0-20200625203802-6e8e738ad208/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sync v0.0.0-20201020160332-67f06af15bc9/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sync v0.0.0-20201207232520-09787c993a3a/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sync v0.0.0-20210220032951-036812b2e83c h1:5KslGYwFpkhGh+Q16bwMP3cOontH8FOep7tGV86Y7SQ=
golang.org/x/sync v0.0.0-20210220032951-036812b2e83c/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sync v0.0.0-20220722155255-886fb9371eb4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sync v0.1.0 h1:wsuoTGHzEhffawBOhz5CYhcrV4IdKZbEyZjBMuTp12o=
Expand Down
36 changes: 36 additions & 0 deletions inspect/doc.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
/*
Package inspect provides a tool for investigating the state of a
failed Tendermint node.
This package provides the Inspector type. The Inspector type runs a subset of the Tendermint
RPC endpoints that are useful for debugging issues with Tendermint consensus.
When a node running the Tendermint consensus engine detects an inconsistent consensus state,
the entire node will crash. The Tendermint consensus engine cannot run in this
inconsistent state so the node will not be able to start up again.
The RPC endpoints provided by the Inspector type allow for a node operator to inspect
the block store and state store to better understand what may have caused the inconsistent state.
The Inspector type's lifecycle is controlled by a context.Context
ins := inspect.NewFromConfig(rpcConfig)
ctx, cancelFunc:= context.WithCancel(context.Background())
// Run blocks until the Inspector server is shut down.
go ins.Run(ctx)
...
// calling the cancel function will stop the running inspect server
cancelFunc()
Inspector serves its RPC endpoints on the address configured in the RPC configuration
rpcConfig.ListenAddress = "tcp://127.0.0.1:26657"
ins := inspect.NewFromConfig(rpcConfig)
go ins.Run(ctx)
The list of available RPC endpoints can then be viewed by navigating to
http://127.0.0.1:26657/ in the web browser.
*/
package inspect
Loading

0 comments on commit 3324f49

Please sign in to comment.