Fix Possible Congestion Scenario in SortPreservingMergeExec #12302

Merged · 13 commits · Sep 6, 2024
160 changes: 159 additions & 1 deletion datafusion/core/tests/fuzz_cases/merge_fuzz.rs
@@ -16,22 +16,41 @@
// under the License.

//! Fuzz Test for various corner cases merging streams of RecordBatches
use std::sync::Arc;

use std::any::Any;
use std::fmt::Formatter;
use std::pin::Pin;
use std::sync::{Arc, Mutex};
use std::task::{Context, Poll};
use std::time::Duration;

use arrow::{
array::{ArrayRef, Int32Array},
compute::SortOptions,
record_batch::RecordBatch,
};
use arrow_schema::{DataType, Field, Schema, SchemaRef};
use datafusion::physical_plan::{
collect,
expressions::{col, PhysicalSortExpr},
memory::MemoryExec,
sorts::sort_preserving_merge::SortPreservingMergeExec,
};
use datafusion::prelude::{SessionConfig, SessionContext};
use datafusion_common::{DataFusionError, Result};
use datafusion_common_runtime::SpawnedTask;
use datafusion_execution::{RecordBatchStream, SendableRecordBatchStream, TaskContext};
use datafusion_physical_expr::expressions::Column;
use datafusion_physical_expr::{EquivalenceProperties, Partitioning};
use datafusion_physical_expr_common::physical_expr::PhysicalExpr;
use datafusion_physical_plan::{
DisplayAs, DisplayFormatType, ExecutionMode, ExecutionPlan, PlanProperties,
};
use test_utils::{batches_to_vec, partitions_to_sorted_vec, stagger_batch_with_seed};

use futures::Stream;
use tokio::time::timeout;

#[tokio::test]
async fn test_merge_2() {
run_merge_test(vec![
@@ -160,3 +179,142 @@ fn concat(mut v1: Vec<RecordBatch>, v2: Vec<RecordBatch>) -> Vec<RecordBatch> {
v1.extend(v2);
v1
}

/// It returns `Poll::Pending` for the 1st partition until the 2nd partition is polled.
#[derive(Debug, Clone)]
struct CongestedExec {
schema: Schema,
cache: PlanProperties,
congestion_cleared: Arc<Mutex<bool>>,
}

impl CongestedExec {
fn compute_properties(schema: SchemaRef) -> PlanProperties {
let columns = schema
.fields
.iter()
.enumerate()
.map(|(i, f)| Arc::new(Column::new(f.name(), i)) as Arc<dyn PhysicalExpr>)
.collect::<Vec<_>>();
let mut eq_properties = EquivalenceProperties::new(schema);
eq_properties.add_new_orderings(vec![columns
.iter()
.map(|expr| PhysicalSortExpr::new(expr.clone(), SortOptions::default()))
.collect::<Vec<_>>()]);
let mode = ExecutionMode::Unbounded;
PlanProperties::new(eq_properties, Partitioning::Hash(columns, 2), mode)
}
}

impl ExecutionPlan for CongestedExec {
fn name(&self) -> &'static str {
Self::static_name()
}
fn as_any(&self) -> &dyn Any {
self
}
fn properties(&self) -> &PlanProperties {
&self.cache
}
fn children(&self) -> Vec<&Arc<dyn ExecutionPlan>> {
vec![]
}
fn with_new_children(
self: Arc<Self>,
_: Vec<Arc<dyn ExecutionPlan>>,
) -> Result<Arc<dyn ExecutionPlan>> {
Ok(self)
}
fn execute(
&self,
partition: usize,
_context: Arc<TaskContext>,
) -> Result<SendableRecordBatchStream> {
Ok(Box::pin(CongestedStream {
schema: Arc::new(self.schema.clone()),
congestion_cleared: self.congestion_cleared.clone(),
partition,
}))
}
}

impl DisplayAs for CongestedExec {
fn fmt_as(&self, t: DisplayFormatType, f: &mut Formatter) -> std::fmt::Result {
match t {
DisplayFormatType::Default | DisplayFormatType::Verbose => {
write!(f, "CongestedExec",).unwrap()
}
}
Ok(())
}
}

/// It returns `Poll::Pending` for the 1st partition until the 2nd partition is polled.
#[derive(Debug)]
pub struct CongestedStream {
schema: SchemaRef,
congestion_cleared: Arc<Mutex<bool>>,
partition: usize,
}

impl Stream for CongestedStream {
type Item = Result<RecordBatch>;
fn poll_next(
mut self: Pin<&mut Self>,
_cx: &mut Context<'_>,
) -> Poll<Option<Self::Item>> {
match self.partition {
0 => {
let cleared = self.congestion_cleared.lock().unwrap();
if *cleared {
Poll::Ready(None)
} else {
Poll::Pending
}
}
1 => {
let mut cleared = self.congestion_cleared.lock().unwrap();
*cleared = true;
Poll::Ready(None)
}
_ => unreachable!(),
}
}
}

impl RecordBatchStream for CongestedStream {
fn schema(&self) -> SchemaRef {
Arc::clone(&self.schema)
}
}

#[tokio::test]
async fn test_spm_congestion() -> Result<()> {
Contributor: I read this test a bit more -- it doesn't seem like it is actually a fuzz test (it doesn't seem to have any random inputs, for example). I think it would make more sense to put it with the other sort preserving merge tests.

let task_ctx = Arc::new(TaskContext::default());
let schema = Schema::new(vec![Field::new("c1", DataType::UInt64, false)]);
let source = CongestedExec {
schema: schema.clone(),
cache: CongestedExec::compute_properties(Arc::new(schema.clone())),
congestion_cleared: Arc::new(Mutex::new(false)),
};
let spm = SortPreservingMergeExec::new(
vec![PhysicalSortExpr::new(
Arc::new(Column::new("c1", 0)),
SortOptions::default(),
)],
Arc::new(source),
);
let spm_task = SpawnedTask::spawn(collect(Arc::new(spm), task_ctx));

let result = timeout(Duration::from_secs(3), spm_task.join()).await;
match result {
Ok(Ok(Ok(_batches))) => Ok(()),
Ok(Ok(Err(e))) => Err(e),
Ok(Err(_)) => Err(DataFusionError::Execution(
"SortPreservingMerge task panicked or was cancelled".to_string(),
)),
Err(_) => Err(DataFusionError::Execution(
"SortPreservingMerge caused a deadlock".to_string(),
)),
}
}
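
The hang this test guards against is the contract violation at the heart of the congestion scenario: a stream returns `Poll::Pending` without arranging for its waker to be invoked, so the runtime parks the task forever. A minimal standalone sketch of that failure mode (illustrative only, not part of this PR):

```rust
use std::future::Future;
use std::pin::Pin;
use std::task::{Context, Poll};
use std::time::Duration;

/// A future that reports `Pending` but never schedules a wakeup.
struct NeverWakes;

impl Future for NeverWakes {
    type Output = ();
    fn poll(self: Pin<&mut Self>, _cx: &mut Context<'_>) -> Poll<()> {
        // No `cx.waker().wake_by_ref()` here, so the executor has no reason
        // to poll this future again: the task is parked indefinitely.
        Poll::Pending
    }
}

#[tokio::main]
async fn main() {
    // Without the timeout, `.await` would hang forever, which is exactly
    // what `test_spm_congestion` detects with its 3-second bound.
    let result = tokio::time::timeout(Duration::from_millis(100), NeverWakes).await;
    assert!(result.is_err(), "only the timeout can fire");
}
```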
36 changes: 27 additions & 9 deletions datafusion/physical-plan/src/sorts/merge.rs
@@ -18,19 +18,22 @@
//! Merge that deals with an arbitrary size of streaming inputs.
//! This is an order-preserving merge.

use std::pin::Pin;
use std::sync::Arc;
use std::task::{ready, Context, Poll};

use crate::metrics::BaselineMetrics;
use crate::sorts::builder::BatchBuilder;
use crate::sorts::cursor::{Cursor, CursorValues};
use crate::sorts::stream::PartitionedStream;
use crate::RecordBatchStream;

use arrow::datatypes::SchemaRef;
use arrow::record_batch::RecordBatch;
use datafusion_common::Result;
use datafusion_execution::memory_pool::MemoryReservation;

use futures::Stream;
use std::pin::Pin;
use std::sync::Arc;
use std::task::{ready, Context, Poll};

/// A fallible [`PartitionedStream`] of [`Cursor`] and [`RecordBatch`]
type CursorStream<C> = Box<dyn PartitionedStream<Output = Result<(C, RecordBatch)>>>;
@@ -97,6 +100,10 @@ pub(crate) struct SortPreservingMergeStream<C: CursorValues> {

/// number of rows produced
produced: usize,

/// Uninitiated partitions. They are stored in a vector to keep them in
/// a priority order to visit the partitions in a round-robin fashion.
/// (A partition counts as uninitiated while its stream has so far
/// returned only `Poll::Pending` when polled.)
uninitiated_partitions: Vec<usize>,

Contributor: Can you please document what an "uninitiated partition" means in this context? I think it means partitions whose streams have been polled but haven't been ready yet.
}

impl<C: CursorValues> SortPreservingMergeStream<C> {
@@ -121,6 +128,7 @@ impl<C: CursorValues> SortPreservingMergeStream<C> {
batch_size,
fetch,
produced: 0,
uninitiated_partitions: (0..stream_count).collect(),
}
}

@@ -156,12 +164,22 @@
}
// try to initialize the loser tree
if self.loser_tree.is_empty() {
// Ensure all non-exhausted streams have a cursor from which
// rows can be pulled
for i in 0..self.streams.partitions() {
if let Err(e) = ready!(self.maybe_poll_stream(cx, i)) {
self.aborted = true;
return Poll::Ready(Some(Err(e)));
// Ensure all non-exhausted streams have a cursor from which rows can be pulled
Contributor: This comment implies to me that the code would / should poll all the streams. However, the code now seems to ensure that only streams that had not previously returned Ready for a poll are polled.

Contributor Author (@berkaysynnada, Sep 4, 2024): IMO, the behavior is more correct now. In the previous version, let's assume the 1st partition is exhausted and returns None without setting its cursor. Then, the 2nd partition returns Pending. When poll_next_inner() is polled again, the iteration starts from the 1st partition, which has already returned None (and AFAIK, polling exhausted streams could cause problems). Therefore, I track which streams have returned a result (either None or Some), and which ones have returned only Pending.

Contributor: See synnada-ai#34 for an alternate idea.

Contributor Author: I've tried to explain my concern with that: synnada-ai#34 (comment)

Contributor: 👍 -- response in synnada-ai#34 (comment)

let remaining_partitions = self.uninitiated_partitions.clone();
for i in remaining_partitions {
match self.maybe_poll_stream(cx, i) {
Poll::Ready(Err(e)) => {
self.aborted = true;
return Poll::Ready(Some(Err(e)));
}
Poll::Pending => {
self.uninitiated_partitions.rotate_left(1);
cx.waker().wake_by_ref();
Contributor Author: I am not sure if this usage has some side-effects or decreases performance, but I cannot wake the SPM poll again once it receives a Pending from its first partition.

Contributor: I did some research -- see https://github.com/synnada-ai/datafusion-upstream/pull/34/files#r1743621057. I think calling wake_by_ref effectively tells tokio to schedule this poll loop again after handling other tasks, which makes sense to me (I am not sure how else we would signal to tokio that the merge is ready to go). But I share your concern that this may cause some sort of performance issue.
return Poll::Pending;
}
_ => {
self.uninitiated_partitions.retain(|idx| *idx != i);
}
}
}
self.init_loser_tree();
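
To make the two threads above concrete, here is a self-contained sketch of the round-robin initialization pattern in this hunk (illustrative only; `poll_partition` is a stand-in for `maybe_poll_stream`, and the real loop lives inside `poll_next_inner`): ready partitions are dropped from the uninitiated list, the first pending partition is rotated to the back, and `wake_by_ref` asks the runtime to schedule another poll pass.

```rust
use std::task::{Context, Poll};

/// Stand-in for `maybe_poll_stream`: reports which partitions are ready.
fn poll_partition(ready: &[bool], i: usize) -> Poll<()> {
    if ready[i] {
        Poll::Ready(())
    } else {
        Poll::Pending
    }
}

/// One initialization pass in the style of the new `poll_next_inner` loop.
fn init_pass(
    uninitiated: &mut Vec<usize>,
    ready: &[bool],
    cx: &mut Context<'_>,
) -> Poll<()> {
    for i in uninitiated.clone() {
        match poll_partition(ready, i) {
            Poll::Pending => {
                // Every earlier entry was just retained out, so `i` is now
                // at the front; rotating moves it to the back of the queue.
                uninitiated.rotate_left(1);
                // Tell the runtime to schedule this poll loop again.
                cx.waker().wake_by_ref();
                return Poll::Pending;
            }
            Poll::Ready(()) => {
                // This partition produced a result; stop tracking it.
                uninitiated.retain(|idx| *idx != i);
            }
        }
    }
    Poll::Ready(())
}

fn main() {
    let waker = futures::task::noop_waker();
    let mut cx = Context::from_waker(&waker);
    let mut uninitiated = vec![0, 1, 2];
    // Partition 0 stays pending; 1 and 2 are ready.
    let ready = [false, true, true];

    // First pass yields on partition 0 and rotates it to the back.
    assert!(init_pass(&mut uninitiated, &ready, &mut cx).is_pending());
    assert_eq!(uninitiated, vec![1, 2, 0]);

    // On the next pass, 1 and 2 are retained out; 0 yields again.
    assert!(init_pass(&mut uninitiated, &ready, &mut cx).is_pending());
    assert_eq!(uninitiated, vec![0]);
}
```

This also mirrors the performance concern raised in the thread: `wake_by_ref` immediately re-schedules the task, so a partition that stays pending for a long time keeps the initialization loop being re-polled rather than sleeping until a genuine wakeup.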