-
Notifications
You must be signed in to change notification settings - Fork 1.8k
fix: partition pruning stats pruning when multiple values are present #18923
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change | ||||||
|---|---|---|---|---|---|---|---|---|
|
|
@@ -245,7 +245,7 @@ impl PruningStatistics for PartitionPruningStatistics { | |||||||
| match acc { | ||||||||
| None => Some(Some(eq_result)), | ||||||||
| Some(acc_array) => { | ||||||||
| arrow::compute::kernels::boolean::and(&acc_array, &eq_result) | ||||||||
| arrow::compute::kernels::boolean::or_kleene(&acc_array, &eq_result) | ||||||||
| .map(Some) | ||||||||
| .ok() | ||||||||
| } | ||||||||
|
|
@@ -560,6 +560,79 @@ mod tests { | |||||||
| assert_eq!(partition_stats.num_containers(), 2); | ||||||||
| } | ||||||||
|
|
||||||||
| #[test] | ||||||||
| fn test_partition_pruning_statistics_multiple_positive_values() { | ||||||||
| let partition_values = vec![ | ||||||||
| vec![ScalarValue::from(1i32), ScalarValue::from(2i32)], | ||||||||
| vec![ScalarValue::from(3i32), ScalarValue::from(4i32)], | ||||||||
| ]; | ||||||||
| let partition_fields = vec![ | ||||||||
| Arc::new(Field::new("a", DataType::Int32, false)), | ||||||||
| Arc::new(Field::new("b", DataType::Int32, false)), | ||||||||
| ]; | ||||||||
| let partition_stats = | ||||||||
| PartitionPruningStatistics::try_new(partition_values, partition_fields) | ||||||||
| .unwrap(); | ||||||||
|
|
||||||||
| let column_a = Column::new_unqualified("a"); | ||||||||
|
|
||||||||
| let values = HashSet::from([ScalarValue::from(1i32), ScalarValue::from(3i32)]); | ||||||||
| let contained_a = partition_stats.contained(&column_a, &values).unwrap(); | ||||||||
| let expected_contained_a = BooleanArray::from(vec![true, true]); | ||||||||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The stats:
Running I wonder if we need some cases that return
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The earlier case covers |
||||||||
| assert_eq!(contained_a, expected_contained_a); | ||||||||
| } | ||||||||
|
|
||||||||
| #[test] | ||||||||
| fn test_partition_pruning_statistics_null_in_values() { | ||||||||
| let partition_values = vec![ | ||||||||
| vec![ | ||||||||
| ScalarValue::from(1i32), | ||||||||
| ScalarValue::from(2i32), | ||||||||
| ScalarValue::from(3i32), | ||||||||
| ], | ||||||||
| vec![ | ||||||||
| ScalarValue::from(4i32), | ||||||||
| ScalarValue::from(5i32), | ||||||||
| ScalarValue::from(6i32), | ||||||||
| ], | ||||||||
| ]; | ||||||||
| let partition_fields = vec![ | ||||||||
| Arc::new(Field::new("a", DataType::Int32, false)), | ||||||||
| Arc::new(Field::new("b", DataType::Int32, false)), | ||||||||
| Arc::new(Field::new("c", DataType::Int32, false)), | ||||||||
| ]; | ||||||||
| let partition_stats = | ||||||||
| PartitionPruningStatistics::try_new(partition_values, partition_fields) | ||||||||
| .unwrap(); | ||||||||
|
|
||||||||
| let column_a = Column::new_unqualified("a"); | ||||||||
| let column_b = Column::new_unqualified("b"); | ||||||||
| let column_c = Column::new_unqualified("c"); | ||||||||
|
|
||||||||
| let values_a = HashSet::from([ScalarValue::from(1i32), ScalarValue::Int32(None)]); | ||||||||
| let contained_a = partition_stats.contained(&column_a, &values_a).unwrap(); | ||||||||
| let mut builder = BooleanArray::builder(2); | ||||||||
| builder.append_value(true); | ||||||||
| builder.append_null(); | ||||||||
| let expected_contained_a = builder.finish(); | ||||||||
| assert_eq!(contained_a, expected_contained_a); | ||||||||
|
|
||||||||
| // First match creates a NULL boolean array | ||||||||
| // The accumulator should update the value to true for the second value | ||||||||
| let values_b = HashSet::from([ScalarValue::Int32(None), ScalarValue::from(5i32)]); | ||||||||
| let contained_b = partition_stats.contained(&column_b, &values_b).unwrap(); | ||||||||
| let mut builder = BooleanArray::builder(2); | ||||||||
| builder.append_null(); | ||||||||
| builder.append_value(true); | ||||||||
| let expected_contained_b = builder.finish(); | ||||||||
| assert_eq!(contained_b, expected_contained_b); | ||||||||
|
|
||||||||
| // All matches are null, contained should return None | ||||||||
| let values_c = HashSet::from([ScalarValue::Int32(None)]); | ||||||||
| let contained_c = partition_stats.contained(&column_c, &values_c); | ||||||||
| assert!(contained_c.is_none()); | ||||||||
| } | ||||||||
|
|
||||||||
| #[test] | ||||||||
| fn test_partition_pruning_statistics_empty() { | ||||||||
| let partition_values = vec![]; | ||||||||
|
|
||||||||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This doesn't seem right to me 🤔
According to
https://docs.rs/datafusion/latest/datafusion/common/pruning/trait.PruningStatistics.html#tymethod.contained
This test I think has
awith values1and2. So the result ofcontainsfor values(1,3)should beNULLas3is not in values...Maybe I am missing something here
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Given no other tests fail, we clearly have some sort of test coverage gap
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I see now that @dqkqd had the same question here: #18922 (comment)
Uh oh!
There was an error while loading. Please reload this page.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Given the partition columns are
(a, b), the value ofpartition_valuesin the test represent two partitions(a=1, b=2)and(a=3, b=4).The
containedis done on an array of columnavalues[1, 3], and not a single tuple(a=1, b=2). Which is why the result is[true, true]in this caseThere was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I see -- I thought the stats represented this (which is wrong).
I'll try and make a follow on PR with some more comments to try and make this clearer