
Commit 3dc9515

Emit in chunks
1 parent 0bd127f commit 3dc9515

6 files changed: +341 -35 lines

datafusion/physical-plan/src/aggregates/group_values/mod.rs

Lines changed: 3 additions & 0 deletions
@@ -111,6 +111,9 @@ pub trait GroupValues: Send {
     /// Emits the group values
     fn emit(&mut self, emit_to: EmitTo) -> Result<Vec<ArrayRef>>;
 
+    /// Signals that input is complete and drain mode should be activated
+    fn input_done(&mut self) {}
+
     /// Clear the contents and shrink the capacity to the size of the batch (free up memory usage)
     fn clear_shrink(&mut self, batch: &RecordBatch);
 }
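For orientation, the sketch below shows how a caller might drive the new input_done hook together with the existing emit API. The stream-side changes live in other files of this commit and are not shown in this excerpt, so this loop is only an illustrative assumption (the helper name drain_groups is made up), and it assumes the module's existing imports (GroupValues, EmitTo, ArrayRef, Result).

// Hypothetical driver, not part of this commit: once the last input batch has
// been interned, signal completion, then emit group keys in bounded chunks.
fn drain_groups(
    group_values: &mut dyn GroupValues,
    batch_size: usize,
) -> Result<Vec<Vec<ArrayRef>>> {
    group_values.input_done(); // no-op by default; implementations may switch to drain mode
    let mut chunks = Vec::new();
    while group_values.len() > 0 {
        // Each call returns the key columns for at most `batch_size` groups
        chunks.push(group_values.emit(EmitTo::First(batch_size))?);
    }
    Ok(chunks)
}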

datafusion/physical-plan/src/aggregates/group_values/row.rs

Lines changed: 66 additions & 23 deletions
@@ -76,6 +76,13 @@ pub struct GroupValuesRows {
 
     /// Random state for creating hashes
     random_state: RandomState,
+
+    /// State for iterative emission (activated after input is complete)
+    /// When true, emit() uses offset-based slicing instead of copying remaining rows
+    drain_mode: bool,
+
+    /// Current offset for drain mode emission (number of rows already emitted)
+    emission_offset: usize,
 }
 
 impl GroupValuesRows {
@@ -107,11 +114,19 @@ impl GroupValuesRows {
             hashes_buffer: Default::default(),
             rows_buffer,
             random_state: crate::aggregates::AGGREGATION_HASH_SEED,
+            drain_mode: false,
+            emission_offset: 0,
         })
     }
 }
 
 impl GroupValues for GroupValuesRows {
+    fn input_done(&mut self) {
+        self.drain_mode = true;
+        self.map.clear();
+        self.map_size = 0;
+    }
+
     fn intern(&mut self, cols: &[ArrayRef], groups: &mut Vec<usize>) -> Result<()> {
         // Convert the group keys into the row format
         let group_rows = &mut self.rows_buffer;
@@ -185,10 +200,22 @@ impl GroupValues for GroupValuesRows {
         self.len() == 0
     }
 
+    /// Returns the number of group values.
+    ///
+    /// In drain mode (after `input_done()`), returns remaining groups not yet emitted,
+    /// which matches the accumulator state size for consistency.
    fn len(&self) -> usize {
         self.group_values
             .as_ref()
-            .map(|group_values| group_values.num_rows())
+            .map(|group_values| {
+                let total_rows = group_values.num_rows();
+                if self.drain_mode {
+                    // In drain mode, return remaining rows (not yet emitted)
+                    total_rows.saturating_sub(self.emission_offset)
+                } else {
+                    total_rows
+                }
+            })
             .unwrap_or(0)
     }
 
@@ -206,29 +233,43 @@ impl GroupValues for GroupValuesRows {
                 output
             }
             EmitTo::First(n) => {
-                let groups_rows = group_values.iter().take(n);
-                let output = self.row_converter.convert_rows(groups_rows)?;
-                // Clear out first n group keys by copying them to a new Rows.
-                // TODO file some ticket in arrow-rs to make this more efficient?
-                let mut new_group_values = self.row_converter.empty_rows(0, 0);
-                for row in group_values.iter().skip(n) {
-                    new_group_values.push(row);
-                }
-                std::mem::swap(&mut new_group_values, &mut group_values);
-
-                self.map.retain(|(_exists_hash, group_idx)| {
-                    // Decrement group index by n
-                    match group_idx.checked_sub(n) {
-                        // Group index was >= n, shift value down
-                        Some(sub) => {
-                            *group_idx = sub;
-                            true
-                        }
-                        // Group index was < n, so remove from table
-                        None => false,
+                if self.drain_mode {
+                    let start = self.emission_offset;
+                    let end = std::cmp::min(start + n, group_values.num_rows());
+                    let iter = group_values.iter().skip(start).take(end - start);
+                    let output = self.row_converter.convert_rows(iter)?;
+                    self.emission_offset = end;
+                    if self.emission_offset == group_values.num_rows() {
+                        group_values.clear();
+                        self.emission_offset = 0;
                     }
-                });
-                output
+                    output
+                } else {
+                    let groups_rows = group_values.iter().take(n);
+                    let output = self.row_converter.convert_rows(groups_rows)?;
+
+                    // Clear out first n group keys by copying them to a new Rows.
+                    // TODO file some ticket in arrow-rs to make this more efficient?
+                    let mut new_group_values = self.row_converter.empty_rows(0, 0);
+                    for row in group_values.iter().skip(n) {
+                        new_group_values.push(row);
+                    }
+                    std::mem::swap(&mut new_group_values, &mut group_values);
+
+                    self.map.retain(|(_exists_hash, group_idx)| {
+                        // Decrement group index by n
+                        match group_idx.checked_sub(n) {
+                            // Group index was >= n, shift value down
+                            Some(sub) => {
+                                *group_idx = sub;
+                                true
+                            }
+                            // Group index was < n, so remove from table
+                            None => false,
+                        }
+                    });
+                    output
+                }
             }
         };
 
@@ -255,6 +296,8 @@ impl GroupValues for GroupValuesRows {
         self.map_size = self.map.capacity() * size_of::<(u64, usize)>();
         self.hashes_buffer.clear();
         self.hashes_buffer.shrink_to(count);
+        self.drain_mode = false;
+        self.emission_offset = 0;
     }
 }
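To make the offset arithmetic above easier to follow, here is a minimal standalone model of the drain-mode path (plain Rust, no Arrow or DataFusion types; the function name drain_ranges is illustrative). In drain mode each emit(EmitTo::First(n)) converts rows [emission_offset, end) and only advances the offset, instead of rebuilding the remaining rows and rewriting the hash table as the non-drain path does.

// Standalone model of the offset-based drain: report the [start, end) row
// ranges that successive EmitTo::First(n) calls would convert, advancing
// `emission_offset` instead of copying the remaining rows.
fn drain_ranges(total_rows: usize, n: usize) -> Vec<(usize, usize)> {
    let mut ranges = Vec::new();
    let mut emission_offset = 0;
    while emission_offset < total_rows {
        let end = std::cmp::min(emission_offset + n, total_rows);
        ranges.push((emission_offset, end));
        // Remaining groups, as len() reports them in drain mode:
        // total_rows.saturating_sub(end)
        emission_offset = end;
    }
    ranges
}

fn main() {
    // 250 groups drained 100 at a time -> three slices, the last one short
    assert_eq!(drain_ranges(250, 100), vec![(0, 100), (100, 200), (200, 250)]);
    println!("{:?}", drain_ranges(250, 100));
}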

datafusion/physical-plan/src/aggregates/mod.rs

Lines changed: 225 additions & 3 deletions
@@ -1550,8 +1550,8 @@ mod tests {
     use crate::RecordBatchStream;
 
     use arrow::array::{
-        DictionaryArray, Float32Array, Float64Array, Int32Array, StructArray,
-        UInt32Array, UInt64Array,
+        DictionaryArray, Float32Array, Float64Array, Int32Array, Int64Builder,
+        LargeListBuilder, StringArray, StructArray, UInt32Array, UInt64Array,
     };
     use arrow::compute::{concat_batches, SortOptions};
     use arrow::datatypes::{DataType, Int32Type};
@@ -1572,7 +1572,7 @@
     use datafusion_physical_expr::Partitioning;
     use datafusion_physical_expr::PhysicalSortExpr;
 
-    use futures::{FutureExt, Stream};
+    use futures::{FutureExt, Stream, StreamExt};
     use insta::{allow_duplicates, assert_snapshot};
 
     // Generate a schema which consists of 5 columns (a, b, c, d, e)
@@ -3145,4 +3145,226 @@
         run_test_with_spill_pool_if_necessary(20_000, false).await?;
         Ok(())
     }
+
+    #[tokio::test]
+    async fn test_chunked_group_emission() -> Result<()> {
+        let schema = Arc::new(Schema::new(vec![
+            Field::new("group_id", DataType::UInt32, false),
+            Field::new("value", DataType::Float64, false),
+        ]));
+
+        let num_groups = 100_000;
+        let group_ids: Vec<u32> = (0..num_groups).collect();
+        let values: Vec<f64> = (0..num_groups).map(|i| i as f64).collect();
+
+        let batch = RecordBatch::try_new(
+            Arc::clone(&schema),
+            vec![
+                Arc::new(UInt32Array::from(group_ids)),
+                Arc::new(Float64Array::from(values)),
+            ],
+        )?;
+
+        let input =
+            TestMemoryExec::try_new_exec(&[vec![batch]], Arc::clone(&schema), None)?;
+
+        let group_by = PhysicalGroupBy::new_single(vec![(
+            col("group_id", &schema)?,
+            "group_id".to_string(),
+        )]);
+
+        let aggregates = vec![Arc::new(
+            AggregateExprBuilder::new(count_udaf(), vec![col("value", &schema)?])
+                .schema(Arc::clone(&schema))
+                .alias("COUNT(value)")
+                .build()?,
+        )];
+
+        // Use a small batch size to force chunked emission
+        let batch_size = 100;
+        let session_config = SessionConfig::new().with_batch_size(batch_size);
+
+        let task_ctx =
+            Arc::new(TaskContext::default().with_session_config(session_config));
+
+        let aggregate = Arc::new(AggregateExec::try_new(
+            AggregateMode::Single,
+            group_by,
+            aggregates,
+            vec![None],
+            input,
+            Arc::clone(&schema),
+        )?);
+
+        let mut stream = aggregate.execute(0, task_ctx)?;
+        let mut total_rows = 0;
+        let mut batch_count = 0;
+        let mut max_batch_size = 0;
+
+        // Collect all batches and verify they are chunked
+        while let Some(result) = stream.next().await {
+            let batch = result?;
+            let batch_rows = batch.num_rows();
+            total_rows += batch_rows;
+            batch_count += 1;
+            max_batch_size = max_batch_size.max(batch_rows);
+
+            // Each batch should be <= batch_size (except possibly the last one)
+            assert!(
+                batch_rows <= batch_size || batch_count == 1,
+                "Batch {batch_count} has {batch_rows} rows, expected <= {batch_size}"
+            );
+        }
+
+        // Verify we got all groups
+        assert_eq!(total_rows, num_groups as usize, "Should emit all groups");
+
+        // Verify chunking happened (should have multiple batches)
+        assert!(
+            batch_count > 1,
+            "Expected multiple batches for chunked emission, got {batch_count}"
+        );
+
+        // Verify no single huge batch was emitted
+        assert!(
+            max_batch_size <= batch_size,
+            "Max batch size {max_batch_size} should be <= {batch_size}"
+        );
+
+        Ok(())
+    }
+
+    /// Reproducer for the "long poll" issue in group by aggregations.
+    ///
+    /// This test demonstrates the difference between:
+    /// 1. OLD BEHAVIOR (simulated with very large batch_size): Emits all groups at once,
+    ///    causing a long blocking operation before the first batch is returned
+    /// 2. NEW BEHAVIOR (with small batch_size): Emits groups in chunks, allowing
+    ///    incremental output without blocking the async runtime
+    #[tokio::test]
+    async fn test_long_poll_reproducer() -> Result<()> {
+        use datafusion_common::instant::Instant;
+        use std::time::Duration;
+
+        let num_groups = 1_000_000;
+        let schema = Arc::new(Schema::new(vec![
+            Field::new("group_id", DataType::UInt32, false),
+            Field::new("group_name", DataType::Utf8, false),
+            Field::new(
+                "group_list",
+                DataType::LargeList(Arc::new(Field::new("item", DataType::Int64, true))),
+                false,
+            ),
+            Field::new("value", DataType::Float64, false),
+        ]));
+
+        // Generate test data
+        let group_ids: Vec<u32> = (0..num_groups).collect();
+        let group_names: Vec<String> =
+            (0..num_groups).map(|i| format!("group_{i}")).collect();
+
+        let mut list_builder = LargeListBuilder::new(Int64Builder::new());
+        for i in 0..num_groups {
+            list_builder.append_value([Some(i as i64), Some((i + 1) as i64)]);
+        }
+        let group_lists = list_builder.finish();
+        let values: Vec<f64> = (0..num_groups).map(|i| i as f64).collect();
+
+        let batch = RecordBatch::try_new(
+            Arc::clone(&schema),
+            vec![
+                Arc::new(UInt32Array::from(group_ids)),
+                Arc::new(StringArray::from(group_names)),
+                Arc::new(group_lists),
+                Arc::new(Float64Array::from(values)),
+            ],
+        )?;
+
+        let group_by = PhysicalGroupBy::new_single(vec![
+            (col("group_id", &schema)?, "group_id".to_string()),
+            (col("group_name", &schema)?, "group_name".to_string()),
+            (col("group_list", &schema)?, "group_list".to_string()),
+        ]);
+
+        let aggregates = vec![Arc::new(
+            AggregateExprBuilder::new(count_udaf(), vec![col("value", &schema)?])
+                .schema(Arc::clone(&schema))
+                .alias("COUNT(value)")
+                .build()?,
+        )];
+
+        println!("Testing with {num_groups} groups (UInt32 + String + LargeList keys)");
+
+        // Helper to run the aggregation with a specific batch size
+        // Returns (time_to_first_emission, total_batch_count)
+        let run_scenario = |batch_size: usize| {
+            let schema = Arc::clone(&schema);
+            let batch = batch.clone();
+            let group_by = group_by.clone();
+            let aggregates = aggregates.clone();
+
+            async move {
+                let input = TestMemoryExec::try_new_exec(
+                    &[vec![batch]],
+                    Arc::clone(&schema),
+                    None,
+                )?;
+
+                let session_config = SessionConfig::new().with_batch_size(batch_size);
+                let task_ctx =
+                    Arc::new(TaskContext::default().with_session_config(session_config));
+
+                let aggregate = Arc::new(AggregateExec::try_new(
+                    AggregateMode::Single,
+                    group_by,
+                    aggregates,
+                    vec![None],
+                    input,
+                    schema,
+                )?);
+
+                let mut stream = aggregate.execute(0, task_ctx)?;
+                let start = Instant::now();
+                let mut first_emission = None;
+                let mut batch_count = 0;
+
+                while let Some(result) = stream.next().await {
+                    if first_emission.is_none() {
+                        first_emission = Some(start.elapsed());
+                    }
+                    result?;
+                    batch_count += 1;
+                }
+
+                Ok::<(Duration, usize), DataFusionError>((
+                    first_emission.unwrap_or_default(),
+                    batch_count,
+                ))
+            }
+        };
+
+        // Case 1: Chunked emission (small batch size)
+        let (time_chunked, count_chunked) = run_scenario(1024).await?;
+        println!("Chunked emission (1024): {time_chunked:?} ({count_chunked} batches)");
+
+        // Case 2: Blocking emission (large batch size)
+        let (time_blocking, count_blocking) =
+            run_scenario(num_groups as usize + 1000).await?;
+        println!("Blocking emission (all): {time_blocking:?} ({count_blocking} batches)");
+
+        assert!(
+            count_chunked > 1,
+            "Chunked emission should produce multiple batches"
+        );
+        assert_eq!(
+            count_blocking, 1,
+            "Blocking emission should produce single batch"
+        );
+
+        // Example output:
+        // Testing with 1000000 groups (UInt32 + String + LargeList keys)
+        // Chunked emission (1024): 2.1316265s (977 batches)
+        // Blocking emission (all): 2.815402s (1 batches)
+        Ok(())
+    }
 }
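As a sanity check on the example output above, the batch counts follow from a ceiling division of the group count by the configured batch size, assuming every emitted batch except possibly the last holds exactly batch_size groups. A tiny standalone check (plain Rust, not part of the commit; expected_chunks is an illustrative name):

// Expected chunk counts for the two tests above, by ceiling division.
fn expected_chunks(num_groups: usize, batch_size: usize) -> usize {
    num_groups.div_ceil(batch_size)
}

fn main() {
    // test_chunked_group_emission: 100_000 groups at batch size 100 -> 1000 batches
    assert_eq!(expected_chunks(100_000, 100), 1000);
    // test_long_poll_reproducer: 1_000_000 groups at batch size 1024 -> 977 batches,
    // matching the "977 batches" in the example output
    assert_eq!(expected_chunks(1_000_000, 1024), 977);
}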
