From a7a91464ecdf3527e583d61042bad7e2a2ddfdc1 Mon Sep 17 00:00:00 2001
From: Peter Mattis
Date: Mon, 10 Feb 2020 12:36:07 -0500
Subject: [PATCH] cmd/roachtest: deflake gossip/chaos roachtest

Deflake `gossip/chaos` by adding a missing `waitForFullReplication`. This
test loops, killing a node and then verifying that the remaining nodes in
the cluster stabilize on the same view of gossip connectivity. Periodically
the test was failing because gossip wasn't stabilizing. The root issue was
that the SQL query to retrieve the gossip connectivity from one node was
hanging. And that query was hanging due to unavailability of a range. Logs
show that the leaseholder for that range was on a down node and that the
range only seemed to contain a single replica. This could happen near the
start of the test if we started killing nodes before full replication was
achieved.

Fixes #38829

Release note: None
---
 pkg/cmd/roachtest/gossip.go | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/pkg/cmd/roachtest/gossip.go b/pkg/cmd/roachtest/gossip.go
index e097a6ce5593..aaca4c4f6d5c 100644
--- a/pkg/cmd/roachtest/gossip.go
+++ b/pkg/cmd/roachtest/gossip.go
@@ -32,8 +32,10 @@ import (
 
 func registerGossip(r *testRegistry) {
 	runGossipChaos := func(ctx context.Context, t *test, c *cluster) {
+		args := startArgs("--args=--vmodule=*=1")
 		c.Put(ctx, cockroach, "./cockroach", c.All())
-		c.Start(ctx, t, c.All())
+		c.Start(ctx, t, c.All(), args)
+		waitForFullReplication(t, c.Conn(ctx, 1))
 
 		gossipNetwork := func(node int) string {
 			const query = `
@@ -65,6 +67,7 @@ SELECT string_agg(source_id::TEXT || ':' || target_id::TEXT, ',')
 				if i == deadNode {
 					continue
 				}
+				c.l.Printf("%d: checking gossip\n", i)
 				s := gossipNetwork(i)
 				if !initialized {
 					deadNodeStr := fmt.Sprint(deadNode)
@@ -88,7 +91,7 @@ SELECT string_agg(source_id::TEXT || ':' || target_id::TEXT, ',')
 					return false
 				}
 			}
-			fmt.Printf("gossip ok: %s (%0.0fs)\n", expected, timeutil.Since(start).Seconds())
+			c.l.Printf("gossip ok: %s (%0.0fs)\n", expected, timeutil.Since(start).Seconds())
 			return true
 		}
 
@@ -109,7 +112,7 @@ SELECT string_agg(source_id::TEXT || ':' || target_id::TEXT, ',')
 			deadNode = nodes.randNode()[0]
 			c.Stop(ctx, c.Node(deadNode))
 			waitForGossip()
-			c.Start(ctx, t, c.Node(deadNode))
+			c.Start(ctx, t, c.Node(deadNode), args)
 		}
 	}
 