From 2cfa65edec409c6ec3506cdf0bb007f6896fd807 Mon Sep 17 00:00:00 2001 From: Robert Klotzner Date: Mon, 5 Jul 2021 07:42:43 +0200 Subject: [PATCH 1/3] Warn on low connectivity. --- node/network/gossip-support/src/lib.rs | 33 +++++++++++++++++++++++--- 1 file changed, 30 insertions(+), 3 deletions(-) diff --git a/node/network/gossip-support/src/lib.rs b/node/network/gossip-support/src/lib.rs index 29ba32b6163b..ddb8f39454a1 100644 --- a/node/network/gossip-support/src/lib.rs +++ b/node/network/gossip-support/src/lib.rs @@ -52,6 +52,12 @@ const LOG_TARGET: &str = "parachain::gossip-support"; // since the last authority discovery resolution failure. const BACKOFF_DURATION: Duration = Duration::from_secs(5); +/// Duration after which we consider low connectivity a problem. +/// +/// Especially at startup low connectivity is expected (authority discovery cache needs to be +/// populated). After one minute of low connectivity a warning seems to be justified. +const STARTUP_PERIOD: Duration = Duration::from_secs(60); + /// The Gossip Support subsystem. pub struct GossipSupport { keystore: SyncCryptoStorePtr, @@ -64,6 +70,11 @@ struct State { // at least a third of authorities the last time. // `None` otherwise. last_failure: Option, + + /// First time we did not reach our connectivity treshold. + /// + /// Will be cleared once we reached it. + first_failure: Option, } impl GossipSupport { @@ -313,10 +324,26 @@ impl State { // issue another request for the same session // if at least a third of the authorities were not resolved - self.last_failure = if failures >= num / 3 { - Some(Instant::now()) + if failures >= num / 3 { + let timestamp = Instant::now(); + match self.first_failure { + None => self.first_failure = Some(timestamp), + Some(first) if first.elapsed() >= STARTUP_PERIOD => { + tracing::warn!( + target: LOG_TARGET, + connected = ?(num - failures), + target = ?num, + "Low connectivity" + ); + + } + Some(_) => {} + } + + self.last_failure = Some(timestamp); } else { - None + self.last_failure = None; + self.first_failure = None; }; Ok(()) From 3763dd94b83f526e15566873f0d3ec10ba2ce072 Mon Sep 17 00:00:00 2001 From: Robert Klotzner Date: Mon, 5 Jul 2021 11:14:28 +0200 Subject: [PATCH 2/3] Better timeout and docs. --- node/network/gossip-support/src/lib.rs | 22 ++++++++++++++-------- 1 file changed, 14 insertions(+), 8 deletions(-) diff --git a/node/network/gossip-support/src/lib.rs b/node/network/gossip-support/src/lib.rs index ddb8f39454a1..9605ca4d32ac 100644 --- a/node/network/gossip-support/src/lib.rs +++ b/node/network/gossip-support/src/lib.rs @@ -55,8 +55,12 @@ const BACKOFF_DURATION: Duration = Duration::from_secs(5); /// Duration after which we consider low connectivity a problem. /// /// Especially at startup low connectivity is expected (authority discovery cache needs to be -/// populated). After one minute of low connectivity a warning seems to be justified. -const STARTUP_PERIOD: Duration = Duration::from_secs(60); +/// populated). Authority discovery on Kusama takes around 8 minutes, so warning after 10 minutes +/// should be fine: +/// +/// https://github.com/paritytech/substrate/blob/fc49802f263529160635471c8a17888846035f5d/client/authority-discovery/src/lib.rs#L88 +/// +const LOW_CONNECTIVITY_WARN_DELAY: Duration = Duration::from_secs(600); /// The Gossip Support subsystem. pub struct GossipSupport { @@ -73,8 +77,10 @@ struct State { /// First time we did not reach our connectivity treshold. /// - /// Will be cleared once we reached it. - first_failure: Option, + /// This is the time of the first failed attempt to connect to >2/3 of all validators in a + /// potential sequence of failed attempts. It will be cleared once we reached >2/3 + /// connectivity. + failure_start: Option, } impl GossipSupport { @@ -326,9 +332,9 @@ impl State { // if at least a third of the authorities were not resolved if failures >= num / 3 { let timestamp = Instant::now(); - match self.first_failure { - None => self.first_failure = Some(timestamp), - Some(first) if first.elapsed() >= STARTUP_PERIOD => { + match self.failure_start { + None => self.failure_start = Some(timestamp), + Some(first) if first.elapsed() >= LOW_CONNECTIVITY_WARN_DELAY => { tracing::warn!( target: LOG_TARGET, connected = ?(num - failures), @@ -343,7 +349,7 @@ impl State { self.last_failure = Some(timestamp); } else { self.last_failure = None; - self.first_failure = None; + self.failure_start = None; }; Ok(()) From ea2d232115881b23076e19e95ac579d54c3b7bb6 Mon Sep 17 00:00:00 2001 From: Robert Klotzner Date: Mon, 5 Jul 2021 12:09:40 +0200 Subject: [PATCH 3/3] Review remarks. --- node/network/gossip-support/src/lib.rs | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/node/network/gossip-support/src/lib.rs b/node/network/gossip-support/src/lib.rs index 9605ca4d32ac..adb9fa6e34f8 100644 --- a/node/network/gossip-support/src/lib.rs +++ b/node/network/gossip-support/src/lib.rs @@ -75,7 +75,7 @@ struct State { // `None` otherwise. last_failure: Option, - /// First time we did not reach our connectivity treshold. + /// First time we did not reach our connectivity threshold. /// /// This is the time of the first failed attempt to connect to >2/3 of all validators in a /// potential sequence of failed attempts. It will be cleared once we reached >2/3 @@ -339,13 +339,19 @@ impl State { target: LOG_TARGET, connected = ?(num - failures), target = ?num, - "Low connectivity" + "Low connectivity - authority lookup failed for too many validators." ); } - Some(_) => {} + Some(_) => { + tracing::debug!( + target: LOG_TARGET, + connected = ?(num - failures), + target = ?num, + "Low connectivity (due to authority lookup failures) - expected on startup." + ); + } } - self.last_failure = Some(timestamp); } else { self.last_failure = None;