Skip to content

Commit 8ab8342

Browse files
authored
Implement Substrait Support for GROUPING SET CUBE (#18798)
## Which issue does this PR close? * Closes #16294. ## Rationale for this change This change adds complete Substrait round‑trip support for `GROUPING SET CUBE`, allowing logical plans containing cubes to successfully convert to and from Substrait. Previously, cube handling in the Substrait producer returned a hard `NotImplemented` error, preventing several SQLLogicTest cases from running under round‑trip mode. Supporting `CUBE` brings consistency with existing `ROLLUP` and `GROUPING SETS` handling, ensures correct logical plan serialization, and enables successful execution of tests that rely on cube semantics. ### Before ``` ❯ cargo test --test sqllogictests -- --substrait-round-trip grouping.slt:52 Finished `test` profile [unoptimized + debuginfo] target(s) in 0.60s Running bin/sqllogictests.rs (target/debug/deps/sqllogictests-917e139464eeea33) Completed 1 test files in 0 seconds External error: 1 errors in file /Users/kosiew/GitHub/datafusion/datafusion/sqllogictest/test_files/grouping.slt 1. query failed: DataFusion error: This feature is not implemented: GroupingSet CUBE is not yet supported ``` ### After ``` ❯ cargo test --test sqllogictests -- --substrait-round-trip grouping.slt:52 Finished `test` profile [unoptimized + debuginfo] target(s) in 0.65s Running bin/sqllogictests.rs (target/debug/deps/sqllogictests-917e139464eeea33) Completed 1 test files in 0 seconds ``` ## What changes are included in this PR? * Introduces a shared internal helper `powerset_indices` for efficient subset generation. * Refactors `powerset` to use DataFusion error types and removes the string‑based error. * Adds a new `powerset_cloned` function for owned‑value subsets needed in the Substrait adapter. * Implements full Substrait producer support for `GroupingSet::Cube` using `powerset_cloned`. * Updates aggregate Substrait translation to correctly assemble grouping sets derived from cube expansions. * Adds a new Substrait round‑trip test case for `GROUP BY CUBE`. ## Are these changes tested? Yes. A new Substrait round‑trip test (`aggregate_grouping_cube`) validates the logical plan after translation. Existing grouping/aggregate tests continue to pass, covering other grouping‑set variants. ## Are there any user-facing changes? There are no user‑facing API changes. The behavior of `GROUP BY CUBE` is now consistent under Substrait round‑trip mode, which may allow previously failing queries to succeed. ## LLM-generated code disclosure This PR includes LLM‑generated code and comments. All LLM‑generated content has been manually reviewed and tested.
1 parent 0871741 commit 8ab8342

File tree

3 files changed

+66
-30
lines changed

3 files changed

+66
-30
lines changed

datafusion/expr/src/utils.rs

Lines changed: 25 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -34,8 +34,8 @@ use datafusion_common::tree_node::{
3434
};
3535
use datafusion_common::utils::get_at_indices;
3636
use datafusion_common::{
37-
internal_err, plan_datafusion_err, plan_err, Column, DFSchema, DFSchemaRef, HashMap,
38-
Result, TableReference,
37+
internal_err, plan_err, Column, DFSchema, DFSchemaRef, HashMap, Result,
38+
TableReference,
3939
};
4040

4141
#[cfg(not(feature = "sql"))]
@@ -66,6 +66,23 @@ pub fn grouping_set_expr_count(group_expr: &[Expr]) -> Result<usize> {
6666
}
6767
}
6868

69+
/// Internal helper that generates indices for powerset subsets using bitset iteration.
70+
/// Returns an iterator of index vectors, where each vector contains the indices
71+
/// of elements to include in that subset.
72+
fn powerset_indices(len: usize) -> impl Iterator<Item = Vec<usize>> {
73+
(0..(1 << len)).map(move |mask| {
74+
let mut indices = vec![];
75+
let mut bitset = mask;
76+
while bitset > 0 {
77+
let rightmost: u64 = bitset & !(bitset - 1);
78+
let idx = rightmost.trailing_zeros() as usize;
79+
indices.push(idx);
80+
bitset &= bitset - 1;
81+
}
82+
indices
83+
})
84+
}
85+
6986
/// The [power set] (or powerset) of a set S is the set of all subsets of S, \
7087
/// including the empty set and S itself.
7188
///
@@ -83,26 +100,14 @@ pub fn grouping_set_expr_count(group_expr: &[Expr]) -> Result<usize> {
83100
/// and hence the power set of S is {{}, {x}, {y}, {z}, {x, y}, {x, z}, {y, z}, {x, y, z}}.
84101
///
85102
/// [power set]: https://en.wikipedia.org/wiki/Power_set
86-
fn powerset<T>(slice: &[T]) -> Result<Vec<Vec<&T>>, String> {
103+
pub fn powerset<T>(slice: &[T]) -> Result<Vec<Vec<&T>>> {
87104
if slice.len() >= 64 {
88-
return Err("The size of the set must be less than 64.".into());
105+
return plan_err!("The size of the set must be less than 64");
89106
}
90107

91-
let mut v = Vec::new();
92-
for mask in 0..(1 << slice.len()) {
93-
let mut ss = vec![];
94-
let mut bitset = mask;
95-
while bitset > 0 {
96-
let rightmost: u64 = bitset & !(bitset - 1);
97-
let idx = rightmost.trailing_zeros();
98-
let item = slice.get(idx as usize).unwrap();
99-
ss.push(item);
100-
// zero the trailing bit
101-
bitset &= bitset - 1;
102-
}
103-
v.push(ss);
104-
}
105-
Ok(v)
108+
Ok(powerset_indices(slice.len())
109+
.map(|indices| indices.iter().map(|&idx| &slice[idx]).collect())
110+
.collect())
106111
}
107112

108113
/// check the number of expressions contained in the grouping_set
@@ -207,8 +212,7 @@ pub fn enumerate_grouping_sets(group_expr: Vec<Expr>) -> Result<Vec<Expr>> {
207212
grouping_sets.iter().map(|e| e.iter().collect()).collect()
208213
}
209214
Expr::GroupingSet(GroupingSet::Cube(group_exprs)) => {
210-
let grouping_sets = powerset(group_exprs)
211-
.map_err(|e| plan_datafusion_err!("{}", e))?;
215+
let grouping_sets = powerset(group_exprs)?;
212216
check_grouping_sets_size_limit(grouping_sets.len())?;
213217
grouping_sets
214218
}

datafusion/substrait/src/logical_plan/producer/rel/aggregate_rel.rs

Lines changed: 21 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -18,8 +18,9 @@
1818
use crate::logical_plan::producer::{
1919
from_aggregate_function, substrait_field_ref, SubstraitProducer,
2020
};
21-
use datafusion::common::{internal_err, not_impl_err, DFSchemaRef, DataFusionError};
21+
use datafusion::common::{internal_err, not_impl_err, DFSchemaRef};
2222
use datafusion::logical_expr::expr::Alias;
23+
use datafusion::logical_expr::utils::powerset;
2324
use datafusion::logical_expr::{Aggregate, Distinct, Expr, GroupingSet};
2425
use substrait::proto::aggregate_rel::{Grouping, Measure};
2526
use substrait::proto::rel::RelType;
@@ -91,10 +92,22 @@ pub fn to_substrait_groupings(
9192
let groupings = match exprs.len() {
9293
1 => match &exprs[0] {
9394
Expr::GroupingSet(gs) => match gs {
94-
GroupingSet::Cube(_) => Err(DataFusionError::NotImplemented(
95-
"GroupingSet CUBE is not yet supported".to_string(),
96-
)),
97-
GroupingSet::GroupingSets(sets) => Ok(sets
95+
GroupingSet::Cube(set) => {
96+
// Generate power set of grouping expressions
97+
let cube_sets = powerset(set)?;
98+
cube_sets
99+
.iter()
100+
.map(|set| {
101+
parse_flat_grouping_exprs(
102+
producer,
103+
&set.iter().map(|v| (*v).clone()).collect::<Vec<_>>(),
104+
schema,
105+
&mut ref_group_exprs,
106+
)
107+
})
108+
.collect::<datafusion::common::Result<Vec<_>>>()
109+
}
110+
GroupingSet::GroupingSets(sets) => sets
98111
.iter()
99112
.map(|set| {
100113
parse_flat_grouping_exprs(
@@ -104,14 +117,13 @@ pub fn to_substrait_groupings(
104117
&mut ref_group_exprs,
105118
)
106119
})
107-
.collect::<datafusion::common::Result<Vec<_>>>()?),
120+
.collect::<datafusion::common::Result<Vec<_>>>(),
108121
GroupingSet::Rollup(set) => {
109122
let mut sets: Vec<Vec<Expr>> = vec![vec![]];
110123
for i in 0..set.len() {
111124
sets.push(set[..=i].to_vec());
112125
}
113-
Ok(sets
114-
.iter()
126+
sets.iter()
115127
.rev()
116128
.map(|set| {
117129
parse_flat_grouping_exprs(
@@ -121,7 +133,7 @@ pub fn to_substrait_groupings(
121133
&mut ref_group_exprs,
122134
)
123135
})
124-
.collect::<datafusion::common::Result<Vec<_>>>()?)
136+
.collect::<datafusion::common::Result<Vec<_>>>()
125137
}
126138
},
127139
_ => Ok(vec![parse_flat_grouping_exprs(

datafusion/substrait/tests/cases/roundtrip_logical_plan.rs

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -322,6 +322,26 @@ async fn aggregate_grouping_rollup() -> Result<()> {
322322
Ok(())
323323
}
324324

325+
#[tokio::test]
326+
async fn aggregate_grouping_cube() -> Result<()> {
327+
let plan = generate_plan_from_sql(
328+
"SELECT a, c, avg(b) FROM data GROUP BY CUBE (a, c)",
329+
true,
330+
true,
331+
)
332+
.await?;
333+
334+
assert_snapshot!(
335+
plan,
336+
@r#"
337+
Projection: data.a, data.c, avg(data.b)
338+
Aggregate: groupBy=[[GROUPING SETS ((), (data.a), (data.c), (data.a, data.c))]], aggr=[[avg(data.b)]]
339+
TableScan: data projection=[a, b, c]
340+
"#
341+
);
342+
Ok(())
343+
}
344+
325345
#[tokio::test]
326346
async fn multilayer_aggregate() -> Result<()> {
327347
let plan = generate_plan_from_sql(

0 commit comments

Comments
 (0)