From ad1fc9464b9564d7d9566e0b31a9ee0934808a0f Mon Sep 17 00:00:00 2001 From: Alexandru Gheorghe Date: Wed, 21 Feb 2024 12:00:39 +0200 Subject: [PATCH 01/12] Add more debug logs to understand statement-distribution misbehaving Signed-off-by: Alexandru Gheorghe --- .../network/statement-distribution/src/lib.rs | 3 ++- .../statement-distribution/src/v2/mod.rs | 23 ++++++++++++++++--- .../statement-distribution/src/v2/requests.rs | 18 ++++++++++++++- 3 files changed, 39 insertions(+), 5 deletions(-) diff --git a/polkadot/node/network/statement-distribution/src/lib.rs b/polkadot/node/network/statement-distribution/src/lib.rs index 7e91d2849120..f75708a6344f 100644 --- a/polkadot/node/network/statement-distribution/src/lib.rs +++ b/polkadot/node/network/statement-distribution/src/lib.rs @@ -211,6 +211,7 @@ impl StatementDistributionSubsystem { .boxed(), ) .map_err(FatalError::SpawnTask)?; + let mut request_throttle_freq = gum::Freq::new(); loop { // Wait for the next message. @@ -293,7 +294,7 @@ impl StatementDistributionSubsystem { }, }; - v2::dispatch_requests(&mut ctx, &mut state).await; + v2::dispatch_requests(&mut ctx, &mut state, &mut request_throttle_freq).await; } Ok(()) } diff --git a/polkadot/node/network/statement-distribution/src/v2/mod.rs b/polkadot/node/network/statement-distribution/src/v2/mod.rs index dc29c19a48e3..de6f49e401b5 100644 --- a/polkadot/node/network/statement-distribution/src/v2/mod.rs +++ b/polkadot/node/network/statement-distribution/src/v2/mod.rs @@ -1294,7 +1294,7 @@ async fn circulate_statement( let is_confirmed = candidates.is_confirmed(&candidate_hash); let originator = statement.validator_index(); - let (local_validator, targets) = { + let (local_validator, targets, cluster_relevant) = { let local_validator = match relay_parent_state.local_validator.as_mut() { Some(v) => v, None => return, // sanity: nothing to propagate if not a validator. @@ -1350,10 +1350,11 @@ async fn circulate_statement( }) .collect::>(); - (local_validator, targets) + (local_validator, targets, cluster_relevant) }; let mut statement_to_peers: Vec<(PeerId, ProtocolVersion)> = Vec::new(); + let targets_len = targets.len(); for (target, authority_id, kind) in targets { // Find peer ID based on authority ID, and also filter to connected. let peer_id: (PeerId, ProtocolVersion) = match authorities.get(&authority_id) { @@ -1397,6 +1398,17 @@ async fn circulate_statement( } } + if cluster_relevant && targets_len > 0 && statement_to_peers.is_empty() { + gum::warn!( + target: LOG_TARGET, + ?cluster_relevant, + ?targets_len, + groups = ?local_validator.active.as_ref().map(|active| active.group.0), + "Node does not participate in backing, because it can not resolve its group peer ids.\n + Restart might be needed if error persists for more than 3-4 consecutive sessions" + ); + } + let statement_to_v2_peers = filter_by_peer_version(&statement_to_peers, ValidationVersion::V2.into()); @@ -2866,7 +2878,11 @@ async fn apply_post_confirmation( /// Dispatch pending requests for candidate data & statements. #[overseer::contextbounds(StatementDistribution, prefix=self::overseer)] -pub(crate) async fn dispatch_requests(ctx: &mut Context, state: &mut State) { +pub(crate) async fn dispatch_requests( + ctx: &mut Context, + state: &mut State, + request_throttle_freq: &mut gum::Freq, +) { if !state.request_manager.has_pending_requests() { return } @@ -2946,6 +2962,7 @@ pub(crate) async fn dispatch_requests(ctx: &mut Context, state: &mut St &mut state.response_manager, request_props, peer_advertised, + request_throttle_freq, ) { // Peer is supposedly connected. ctx.send_message(NetworkBridgeTxMessage::SendRequests( diff --git a/polkadot/node/network/statement-distribution/src/v2/requests.rs b/polkadot/node/network/statement-distribution/src/v2/requests.rs index bed3d5c18ae2..523353e857de 100644 --- a/polkadot/node/network/statement-distribution/src/v2/requests.rs +++ b/polkadot/node/network/statement-distribution/src/v2/requests.rs @@ -314,8 +314,24 @@ impl RequestManager { response_manager: &mut ResponseManager, request_props: impl Fn(&CandidateIdentifier) -> Option, peer_advertised: impl Fn(&CandidateIdentifier, &PeerId) -> Option, + request_throttle_freq: &mut gum::Freq, ) -> Option> { - if response_manager.len() >= MAX_PARALLEL_ATTESTED_CANDIDATE_REQUESTS as usize { + // The number of parallel requests a node can answer is limited by + // `MAX_PARALLEL_ATTESTED_CANDIDATE_REQUESTS`, however there is no + // need for the current node to limit itself to the same amount the + // requests, because the requests are going to different nodes anyways. + // While looking at https://github.com/paritytech/polkadot-sdk/issues/3314, + // found out that this requests take around 100ms to fullfill, so it + // would make sense to try to request things as early as we can, given + // we would need to request it for each candidate, around 25 right now + // on kusama. + if response_manager.len() >= 2 * MAX_PARALLEL_ATTESTED_CANDIDATE_REQUESTS as usize { + gum::warn_if_frequent!( + freq: request_throttle_freq, + max_rate: gum::Times::PerSecond(5), + target: LOG_TARGET, + "Too many requests in parallel, statement-distribution might be slow" + ); return None } From d86127e335daa37864433daf223bfe26a3f7f4e2 Mon Sep 17 00:00:00 2001 From: Alexandru Gheorghe Date: Thu, 22 Feb 2024 08:47:55 +0200 Subject: [PATCH 02/12] Make clippy happy Signed-off-by: Alexandru Gheorghe --- .../statement-distribution/src/v2/requests.rs | 36 ++++++++++++++++--- 1 file changed, 31 insertions(+), 5 deletions(-) diff --git a/polkadot/node/network/statement-distribution/src/v2/requests.rs b/polkadot/node/network/statement-distribution/src/v2/requests.rs index 523353e857de..b54e4d6e14ef 100644 --- a/polkadot/node/network/statement-distribution/src/v2/requests.rs +++ b/polkadot/node/network/statement-distribution/src/v2/requests.rs @@ -328,7 +328,7 @@ impl RequestManager { if response_manager.len() >= 2 * MAX_PARALLEL_ATTESTED_CANDIDATE_REQUESTS as usize { gum::warn_if_frequent!( freq: request_throttle_freq, - max_rate: gum::Times::PerSecond(5), + max_rate: gum::Times::PerSecond(20), target: LOG_TARGET, "Too many requests in parallel, statement-distribution might be slow" ); @@ -1043,12 +1043,24 @@ mod tests { let peer_advertised = |_identifier: &CandidateIdentifier, _peer: &_| { Some(StatementFilter::full(group_size)) }; + let mut request_throttle_freq = gum::Freq::new(); + let outgoing = request_manager - .next_request(&mut response_manager, request_props, peer_advertised) + .next_request( + &mut response_manager, + request_props, + peer_advertised, + &mut request_throttle_freq, + ) .unwrap(); assert_eq!(outgoing.payload.candidate_hash, candidate); let outgoing = request_manager - .next_request(&mut response_manager, request_props, peer_advertised) + .next_request( + &mut response_manager, + request_props, + peer_advertised, + &mut request_throttle_freq, + ) .unwrap(); assert_eq!(outgoing.payload.candidate_hash, candidate); } @@ -1164,8 +1176,15 @@ mod tests { { let request_props = |_identifier: &CandidateIdentifier| Some((&request_properties).clone()); + let mut request_throttle_freq = gum::Freq::new(); + let outgoing = request_manager - .next_request(&mut response_manager, request_props, peer_advertised) + .next_request( + &mut response_manager, + request_props, + peer_advertised, + &mut request_throttle_freq, + ) .unwrap(); assert_eq!(outgoing.payload.candidate_hash, candidate); } @@ -1246,8 +1265,15 @@ mod tests { { let request_props = |_identifier: &CandidateIdentifier| Some((&request_properties).clone()); + let mut request_throttle_freq = gum::Freq::new(); + let outgoing = request_manager - .next_request(&mut response_manager, request_props, peer_advertised) + .next_request( + &mut response_manager, + request_props, + peer_advertised, + &mut request_throttle_freq, + ) .unwrap(); assert_eq!(outgoing.payload.candidate_hash, candidate); } From 6adf1fb0aef8a30cb1e72fad76db7f6ea010aa7b Mon Sep 17 00:00:00 2001 From: Alexandru Gheorghe Date: Thu, 22 Feb 2024 10:40:36 +0200 Subject: [PATCH 03/12] Add more logs Signed-off-by: Alexandru Gheorghe --- .../statement-distribution/src/v2/cluster.rs | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/polkadot/node/network/statement-distribution/src/v2/cluster.rs b/polkadot/node/network/statement-distribution/src/v2/cluster.rs index 619114de9670..48a3c781fae1 100644 --- a/polkadot/node/network/statement-distribution/src/v2/cluster.rs +++ b/polkadot/node/network/statement-distribution/src/v2/cluster.rs @@ -57,6 +57,7 @@ use polkadot_primitives::{CandidateHash, CompactStatement, ValidatorIndex}; +use crate::LOG_TARGET; use std::collections::{HashMap, HashSet}; #[derive(Hash, PartialEq, Eq)] @@ -424,6 +425,23 @@ impl ClusterTracker { fn is_in_group(&self, validator: ValidatorIndex) -> bool { self.validators.contains(&validator) } + + /// Dumps pending statement for this cluster. + /// + /// Normally we should not have any pending statements for our cluster, + /// but if we do for long periods of time something bad happened which + /// needs to be investigated. + pub fn dump_pending_statements(&self) { + if !self.pending.is_empty() { + gum::warn!( + target: LOG_TARGET, + num_pending = ?self.pending.len(), + validators = ?self.pending.keys().collect::>(), + "Cluster has pending statements, something wrong with our connection to our group peers \n + If error persists accross multiple consecutive sessions, validator might need restart" + ); + } + } } /// Incoming statement was accepted. From 3f803e00f057dcf7fedb38de694ed1baba46fba1 Mon Sep 17 00:00:00 2001 From: Alexandru Gheorghe Date: Thu, 22 Feb 2024 10:49:06 +0200 Subject: [PATCH 04/12] Add more logs Signed-off-by: Alexandru Gheorghe --- .../statement-distribution/src/v2/mod.rs | 31 +++++++++++-------- 1 file changed, 18 insertions(+), 13 deletions(-) diff --git a/polkadot/node/network/statement-distribution/src/v2/mod.rs b/polkadot/node/network/statement-distribution/src/v2/mod.rs index de6f49e401b5..aa5b9d6a2529 100644 --- a/polkadot/node/network/statement-distribution/src/v2/mod.rs +++ b/polkadot/node/network/statement-distribution/src/v2/mod.rs @@ -768,7 +768,13 @@ pub(crate) fn handle_deactivate_leaves(state: &mut State, leaves: &[Hash]) { let pruned = state.implicit_view.deactivate_leaf(*leaf); for pruned_rp in pruned { // clean up per-relay-parent data based on everything removed. - state.per_relay_parent.remove(&pruned_rp); + state + .per_relay_parent + .remove(&pruned_rp) + .as_ref() + .and_then(|pruned| pruned.active_validator_state()) + .map(|active_state| active_state.cluster_tracker.dump_pending_statements()); + // clean up requests related to this relay parent. state.request_manager.remove_by_relay_parent(*leaf); } @@ -1354,7 +1360,6 @@ async fn circulate_statement( }; let mut statement_to_peers: Vec<(PeerId, ProtocolVersion)> = Vec::new(); - let targets_len = targets.len(); for (target, authority_id, kind) in targets { // Find peer ID based on authority ID, and also filter to connected. let peer_id: (PeerId, ProtocolVersion) = match authorities.get(&authority_id) { @@ -1366,6 +1371,17 @@ async fn circulate_statement( .protocol_version .into(), ), + None if cluster_relevant => { + gum::warn!( + target: LOG_TARGET, + ?cluster_relevant, + ?target, + groups = ?local_validator.active.as_ref().map(|active| active.group.0), + "Node can not resolve some of its backing group peer ids.\n + Restart might be needed if validator get 0 backing rewards for more than 3-4 consecutive sessions" + ); + continue + }, None | Some(_) => continue, }; @@ -1398,17 +1414,6 @@ async fn circulate_statement( } } - if cluster_relevant && targets_len > 0 && statement_to_peers.is_empty() { - gum::warn!( - target: LOG_TARGET, - ?cluster_relevant, - ?targets_len, - groups = ?local_validator.active.as_ref().map(|active| active.group.0), - "Node does not participate in backing, because it can not resolve its group peer ids.\n - Restart might be needed if error persists for more than 3-4 consecutive sessions" - ); - } - let statement_to_v2_peers = filter_by_peer_version(&statement_to_peers, ValidationVersion::V2.into()); From 63d99efbeb621ad04b28afe0a8ca8972703d1ac7 Mon Sep 17 00:00:00 2001 From: Alexandru Gheorghe Date: Thu, 22 Feb 2024 10:51:42 +0200 Subject: [PATCH 05/12] Add more logs Signed-off-by: Alexandru Gheorghe --- polkadot/node/network/statement-distribution/src/v2/cluster.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/polkadot/node/network/statement-distribution/src/v2/cluster.rs b/polkadot/node/network/statement-distribution/src/v2/cluster.rs index 48a3c781fae1..bf9f10812002 100644 --- a/polkadot/node/network/statement-distribution/src/v2/cluster.rs +++ b/polkadot/node/network/statement-distribution/src/v2/cluster.rs @@ -438,7 +438,7 @@ impl ClusterTracker { num_pending = ?self.pending.len(), validators = ?self.pending.keys().collect::>(), "Cluster has pending statements, something wrong with our connection to our group peers \n - If error persists accross multiple consecutive sessions, validator might need restart" + Restart might be needed if validator gets 0 backing rewards for more than 3-4 consecutive sessions" ); } } From 55e565246a1684a4e1a438feec532d1dc2437b8e Mon Sep 17 00:00:00 2001 From: Alexandru Gheorghe Date: Thu, 22 Feb 2024 11:24:17 +0200 Subject: [PATCH 06/12] Fixup warning in parallel requests Signed-off-by: Alexandru Gheorghe --- .../statement-distribution/src/v2/requests.rs | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/polkadot/node/network/statement-distribution/src/v2/requests.rs b/polkadot/node/network/statement-distribution/src/v2/requests.rs index b54e4d6e14ef..42b756c5edda 100644 --- a/polkadot/node/network/statement-distribution/src/v2/requests.rs +++ b/polkadot/node/network/statement-distribution/src/v2/requests.rs @@ -285,6 +285,11 @@ impl RequestManager { false } + /// Returns the number of pending requests. + pub fn count_pending_requests(&self) -> u32 { + self.requests.iter().filter(|(_, entry)| entry.is_pending()).count() as u32 + } + /// Returns an instant at which the next request to be retried will be ready. pub fn next_retry_time(&mut self) -> Option { let mut next = None; @@ -326,12 +331,14 @@ impl RequestManager { // we would need to request it for each candidate, around 25 right now // on kusama. if response_manager.len() >= 2 * MAX_PARALLEL_ATTESTED_CANDIDATE_REQUESTS as usize { - gum::warn_if_frequent!( - freq: request_throttle_freq, - max_rate: gum::Times::PerSecond(20), - target: LOG_TARGET, - "Too many requests in parallel, statement-distribution might be slow" - ); + if self.count_pending_requests() > 2 * MAX_PARALLEL_ATTESTED_CANDIDATE_REQUESTS { + gum::warn_if_frequent!( + freq: request_throttle_freq, + max_rate: gum::Times::PerSecond(20), + target: LOG_TARGET, + "Too many requests in parallel, statement-distribution might be slow" + ); + } return None } From fb0a9c2f70a8e321f84275a127c3a66620a051ce Mon Sep 17 00:00:00 2001 From: Alexandru Gheorghe Date: Thu, 22 Feb 2024 16:58:19 +0200 Subject: [PATCH 07/12] Add more debug logs Signed-off-by: Alexandru Gheorghe --- polkadot/node/network/statement-distribution/src/v2/mod.rs | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/polkadot/node/network/statement-distribution/src/v2/mod.rs b/polkadot/node/network/statement-distribution/src/v2/mod.rs index aa5b9d6a2529..066f7132d6ca 100644 --- a/polkadot/node/network/statement-distribution/src/v2/mod.rs +++ b/polkadot/node/network/statement-distribution/src/v2/mod.rs @@ -251,6 +251,13 @@ impl PerSessionState { if local_index.is_some() { self.local_validator.get_or_insert(LocalValidatorIndex::Inactive); } + + gum::info!( + target: LOG_TARGET, + index_in_gossip_topology = ?local_index, + index_in_parachain_authorities = ?self.local_validator, + "Node uses the following topology indicies " + ); } /// Returns `true` if local is neither active or inactive validator node. From 7998c98d923adafa86fe7a81d014186ddb57a1b9 Mon Sep 17 00:00:00 2001 From: Alexandru Gheorghe Date: Fri, 23 Feb 2024 11:52:52 +0200 Subject: [PATCH 08/12] Add more debug logs Signed-off-by: Alexandru Gheorghe --- .../statement-distribution/src/v2/cluster.rs | 14 ++++++++------ .../network/statement-distribution/src/v2/mod.rs | 4 +++- 2 files changed, 11 insertions(+), 7 deletions(-) diff --git a/polkadot/node/network/statement-distribution/src/v2/cluster.rs b/polkadot/node/network/statement-distribution/src/v2/cluster.rs index bf9f10812002..886a162e409d 100644 --- a/polkadot/node/network/statement-distribution/src/v2/cluster.rs +++ b/polkadot/node/network/statement-distribution/src/v2/cluster.rs @@ -55,7 +55,7 @@ //! and to keep track of what we have sent to other validators in the group and what we may //! continue to send them. -use polkadot_primitives::{CandidateHash, CompactStatement, ValidatorIndex}; +use polkadot_primitives::{CandidateHash, CompactStatement, Hash, ValidatorIndex}; use crate::LOG_TARGET; use std::collections::{HashMap, HashSet}; @@ -431,13 +431,15 @@ impl ClusterTracker { /// Normally we should not have any pending statements for our cluster, /// but if we do for long periods of time something bad happened which /// needs to be investigated. - pub fn dump_pending_statements(&self) { - if !self.pending.is_empty() { + pub fn dump_pending_statements(&self, parent_hash: Hash) { + if self.pending.iter().filter(|pending| !pending.1.is_empty()).count() >= + self.validators.len() + { gum::warn!( target: LOG_TARGET, - num_pending = ?self.pending.len(), - validators = ?self.pending.keys().collect::>(), - "Cluster has pending statements, something wrong with our connection to our group peers \n + pending_statements = ?self.pending, + ?parent_hash, + "Cluster has too many pending statements, something wrong with our connection to our group peers \n Restart might be needed if validator gets 0 backing rewards for more than 3-4 consecutive sessions" ); } diff --git a/polkadot/node/network/statement-distribution/src/v2/mod.rs b/polkadot/node/network/statement-distribution/src/v2/mod.rs index 066f7132d6ca..ca900478f212 100644 --- a/polkadot/node/network/statement-distribution/src/v2/mod.rs +++ b/polkadot/node/network/statement-distribution/src/v2/mod.rs @@ -780,7 +780,9 @@ pub(crate) fn handle_deactivate_leaves(state: &mut State, leaves: &[Hash]) { .remove(&pruned_rp) .as_ref() .and_then(|pruned| pruned.active_validator_state()) - .map(|active_state| active_state.cluster_tracker.dump_pending_statements()); + .map(|active_state| { + active_state.cluster_tracker.dump_pending_statements(pruned_rp) + }); // clean up requests related to this relay parent. state.request_manager.remove_by_relay_parent(*leaf); From 73a6db74a33c511bdabfb39b0a647914f3149855 Mon Sep 17 00:00:00 2001 From: Alexandru Gheorghe Date: Fri, 23 Feb 2024 11:57:24 +0200 Subject: [PATCH 09/12] Address warnings Signed-off-by: Alexandru Gheorghe --- .../network/statement-distribution/src/lib.rs | 3 +- .../statement-distribution/src/v2/mod.rs | 22 ++--------- .../statement-distribution/src/v2/requests.rs | 38 ++----------------- 3 files changed, 7 insertions(+), 56 deletions(-) diff --git a/polkadot/node/network/statement-distribution/src/lib.rs b/polkadot/node/network/statement-distribution/src/lib.rs index f75708a6344f..7e91d2849120 100644 --- a/polkadot/node/network/statement-distribution/src/lib.rs +++ b/polkadot/node/network/statement-distribution/src/lib.rs @@ -211,7 +211,6 @@ impl StatementDistributionSubsystem { .boxed(), ) .map_err(FatalError::SpawnTask)?; - let mut request_throttle_freq = gum::Freq::new(); loop { // Wait for the next message. @@ -294,7 +293,7 @@ impl StatementDistributionSubsystem { }, }; - v2::dispatch_requests(&mut ctx, &mut state, &mut request_throttle_freq).await; + v2::dispatch_requests(&mut ctx, &mut state).await; } Ok(()) } diff --git a/polkadot/node/network/statement-distribution/src/v2/mod.rs b/polkadot/node/network/statement-distribution/src/v2/mod.rs index ca900478f212..f8bc7400f38e 100644 --- a/polkadot/node/network/statement-distribution/src/v2/mod.rs +++ b/polkadot/node/network/statement-distribution/src/v2/mod.rs @@ -1309,7 +1309,7 @@ async fn circulate_statement( let is_confirmed = candidates.is_confirmed(&candidate_hash); let originator = statement.validator_index(); - let (local_validator, targets, cluster_relevant) = { + let (local_validator, targets) = { let local_validator = match relay_parent_state.local_validator.as_mut() { Some(v) => v, None => return, // sanity: nothing to propagate if not a validator. @@ -1365,7 +1365,7 @@ async fn circulate_statement( }) .collect::>(); - (local_validator, targets, cluster_relevant) + (local_validator, targets) }; let mut statement_to_peers: Vec<(PeerId, ProtocolVersion)> = Vec::new(); @@ -1380,17 +1380,6 @@ async fn circulate_statement( .protocol_version .into(), ), - None if cluster_relevant => { - gum::warn!( - target: LOG_TARGET, - ?cluster_relevant, - ?target, - groups = ?local_validator.active.as_ref().map(|active| active.group.0), - "Node can not resolve some of its backing group peer ids.\n - Restart might be needed if validator get 0 backing rewards for more than 3-4 consecutive sessions" - ); - continue - }, None | Some(_) => continue, }; @@ -2892,11 +2881,7 @@ async fn apply_post_confirmation( /// Dispatch pending requests for candidate data & statements. #[overseer::contextbounds(StatementDistribution, prefix=self::overseer)] -pub(crate) async fn dispatch_requests( - ctx: &mut Context, - state: &mut State, - request_throttle_freq: &mut gum::Freq, -) { +pub(crate) async fn dispatch_requests(ctx: &mut Context, state: &mut State) { if !state.request_manager.has_pending_requests() { return } @@ -2976,7 +2961,6 @@ pub(crate) async fn dispatch_requests( &mut state.response_manager, request_props, peer_advertised, - request_throttle_freq, ) { // Peer is supposedly connected. ctx.send_message(NetworkBridgeTxMessage::SendRequests( diff --git a/polkadot/node/network/statement-distribution/src/v2/requests.rs b/polkadot/node/network/statement-distribution/src/v2/requests.rs index 42b756c5edda..84b2df9b6f1e 100644 --- a/polkadot/node/network/statement-distribution/src/v2/requests.rs +++ b/polkadot/node/network/statement-distribution/src/v2/requests.rs @@ -285,11 +285,6 @@ impl RequestManager { false } - /// Returns the number of pending requests. - pub fn count_pending_requests(&self) -> u32 { - self.requests.iter().filter(|(_, entry)| entry.is_pending()).count() as u32 - } - /// Returns an instant at which the next request to be retried will be ready. pub fn next_retry_time(&mut self) -> Option { let mut next = None; @@ -319,7 +314,6 @@ impl RequestManager { response_manager: &mut ResponseManager, request_props: impl Fn(&CandidateIdentifier) -> Option, peer_advertised: impl Fn(&CandidateIdentifier, &PeerId) -> Option, - request_throttle_freq: &mut gum::Freq, ) -> Option> { // The number of parallel requests a node can answer is limited by // `MAX_PARALLEL_ATTESTED_CANDIDATE_REQUESTS`, however there is no @@ -331,14 +325,6 @@ impl RequestManager { // we would need to request it for each candidate, around 25 right now // on kusama. if response_manager.len() >= 2 * MAX_PARALLEL_ATTESTED_CANDIDATE_REQUESTS as usize { - if self.count_pending_requests() > 2 * MAX_PARALLEL_ATTESTED_CANDIDATE_REQUESTS { - gum::warn_if_frequent!( - freq: request_throttle_freq, - max_rate: gum::Times::PerSecond(20), - target: LOG_TARGET, - "Too many requests in parallel, statement-distribution might be slow" - ); - } return None } @@ -1050,15 +1036,9 @@ mod tests { let peer_advertised = |_identifier: &CandidateIdentifier, _peer: &_| { Some(StatementFilter::full(group_size)) }; - let mut request_throttle_freq = gum::Freq::new(); let outgoing = request_manager - .next_request( - &mut response_manager, - request_props, - peer_advertised, - &mut request_throttle_freq, - ) + .next_request(&mut response_manager, request_props, peer_advertised) .unwrap(); assert_eq!(outgoing.payload.candidate_hash, candidate); let outgoing = request_manager @@ -1183,15 +1163,9 @@ mod tests { { let request_props = |_identifier: &CandidateIdentifier| Some((&request_properties).clone()); - let mut request_throttle_freq = gum::Freq::new(); let outgoing = request_manager - .next_request( - &mut response_manager, - request_props, - peer_advertised, - &mut request_throttle_freq, - ) + .next_request(&mut response_manager, request_props, peer_advertised) .unwrap(); assert_eq!(outgoing.payload.candidate_hash, candidate); } @@ -1272,15 +1246,9 @@ mod tests { { let request_props = |_identifier: &CandidateIdentifier| Some((&request_properties).clone()); - let mut request_throttle_freq = gum::Freq::new(); let outgoing = request_manager - .next_request( - &mut response_manager, - request_props, - peer_advertised, - &mut request_throttle_freq, - ) + .next_request(&mut response_manager, request_props, peer_advertised) .unwrap(); assert_eq!(outgoing.payload.candidate_hash, candidate); } From fc29130f36f5d3f4533f52fbc3b93913ab11c7e9 Mon Sep 17 00:00:00 2001 From: Alexandru Gheorghe Date: Fri, 23 Feb 2024 11:58:38 +0200 Subject: [PATCH 10/12] Something else Signed-off-by: Alexandru Gheorghe --- .../node/network/statement-distribution/src/v2/requests.rs | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/polkadot/node/network/statement-distribution/src/v2/requests.rs b/polkadot/node/network/statement-distribution/src/v2/requests.rs index 84b2df9b6f1e..bbcb268415e6 100644 --- a/polkadot/node/network/statement-distribution/src/v2/requests.rs +++ b/polkadot/node/network/statement-distribution/src/v2/requests.rs @@ -1042,12 +1042,7 @@ mod tests { .unwrap(); assert_eq!(outgoing.payload.candidate_hash, candidate); let outgoing = request_manager - .next_request( - &mut response_manager, - request_props, - peer_advertised, - &mut request_throttle_freq, - ) + .next_request(&mut response_manager, request_props, peer_advertised) .unwrap(); assert_eq!(outgoing.payload.candidate_hash, candidate); } From f13a4c1b7176cb0001d26a50d06587b4b5f5e6f6 Mon Sep 17 00:00:00 2001 From: Alexandru Gheorghe Date: Sun, 25 Feb 2024 19:50:38 +0200 Subject: [PATCH 11/12] Address review feedback Signed-off-by: Alexandru Gheorghe --- .../network/statement-distribution/src/v2/cluster.rs | 11 +++++++---- .../node/network/statement-distribution/src/v2/mod.rs | 2 +- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/polkadot/node/network/statement-distribution/src/v2/cluster.rs b/polkadot/node/network/statement-distribution/src/v2/cluster.rs index 886a162e409d..c09916e56201 100644 --- a/polkadot/node/network/statement-distribution/src/v2/cluster.rs +++ b/polkadot/node/network/statement-distribution/src/v2/cluster.rs @@ -428,10 +428,13 @@ impl ClusterTracker { /// Dumps pending statement for this cluster. /// - /// Normally we should not have any pending statements for our cluster, - /// but if we do for long periods of time something bad happened which - /// needs to be investigated. - pub fn dump_pending_statements(&self, parent_hash: Hash) { + /// Normally we should not have pending statements to validators in our cluster, + /// but if we do for all validators in our cluster, then we don't participate + /// in backing. Ocasional pending statements are expected if two authorities + /// can't detect each otehr or after restart, where it takes a while to discover + /// the whole network. + + pub fn warn_if_too_many_pending_statements(&self, parent_hash: Hash) { if self.pending.iter().filter(|pending| !pending.1.is_empty()).count() >= self.validators.len() { diff --git a/polkadot/node/network/statement-distribution/src/v2/mod.rs b/polkadot/node/network/statement-distribution/src/v2/mod.rs index f8bc7400f38e..e8a7fce5287e 100644 --- a/polkadot/node/network/statement-distribution/src/v2/mod.rs +++ b/polkadot/node/network/statement-distribution/src/v2/mod.rs @@ -781,7 +781,7 @@ pub(crate) fn handle_deactivate_leaves(state: &mut State, leaves: &[Hash]) { .as_ref() .and_then(|pruned| pruned.active_validator_state()) .map(|active_state| { - active_state.cluster_tracker.dump_pending_statements(pruned_rp) + active_state.cluster_tracker.warn_if_too_many_pending_statements(pruned_rp) }); // clean up requests related to this relay parent. From e698f2409295589bd17acbdbcb55bdad24d46ea2 Mon Sep 17 00:00:00 2001 From: Alexandru Gheorghe <49718502+alexggh@users.noreply.github.com> Date: Mon, 26 Feb 2024 07:41:44 +0200 Subject: [PATCH 12/12] Update polkadot/node/network/statement-distribution/src/v2/mod.rs Co-authored-by: ordian --- polkadot/node/network/statement-distribution/src/v2/mod.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/polkadot/node/network/statement-distribution/src/v2/mod.rs b/polkadot/node/network/statement-distribution/src/v2/mod.rs index e8a7fce5287e..2c9cdba4ea8e 100644 --- a/polkadot/node/network/statement-distribution/src/v2/mod.rs +++ b/polkadot/node/network/statement-distribution/src/v2/mod.rs @@ -256,7 +256,7 @@ impl PerSessionState { target: LOG_TARGET, index_in_gossip_topology = ?local_index, index_in_parachain_authorities = ?self.local_validator, - "Node uses the following topology indicies " + "Node uses the following topology indices" ); }