Skip to content

Commit 114beec

Browse files
authored
Fix array_has simplification with null argument (apache#18186)
## Which issue does this PR close? <!-- We generally require a GitHub issue to be filed for all bug fixes and enhancements and this helps us generate change logs for our releases. You can link an issue to this PR using the GitHub syntax. For example `Closes apache#123` indicates that this PR will close issue apache#123. --> - Closes #. ## Rationale for this change According to three-valued logic, `array_has` with a `null` haystack should return `null`; that is also what already happens when the argument is not a constant, as seen in the test. <!-- Why are you proposing this change? If this is already explained clearly in the issue then this section is not needed. Explaining clearly why changes are proposed helps reviewers understand your changes and offer better suggestions for fixes. --> ## What changes are included in this PR? Updated `ArrayHas::simplify` to explicitly handle a `null` haystack. <!-- There is no need to duplicate the description in the issue here but it is sometimes worth providing a summary of the individual changes in this PR. --> ## Are these changes tested? Updated the `array_has` SQL test and added unit tests. <!-- We typically require tests for all PRs in order to: 1. Prevent the code from being accidentally broken by subsequent changes 2. Serve as another way to document the expected behavior of the code If tests are not included in your PR, please explain why (for example, are they covered by existing tests)? --> ## Are there any user-facing changes? Yes, a minor change in behaviour with respect to `null`: `array_has(null, x)` now evaluates to `null`. <!-- If there are user-facing changes then we may require documentation to be updated before approving the PR. --> <!-- If there are any breaking changes to public APIs, please add the `api change` label. -->
1 parent b7a10ad commit 114beec

File tree

2 files changed

+86
-17
lines changed

2 files changed

+86
-17
lines changed

datafusion/functions-nested/src/array_has.rs

Lines changed: 81 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -132,23 +132,26 @@ impl ScalarUDFImpl for ArrayHas {
132132
// if the haystack is a constant list, we can use an inlist expression which is more
133133
// efficient because the haystack is not varying per-row
134134
match haystack {
135+
Expr::Literal(scalar, _) if scalar.is_null() => {
136+
return Ok(ExprSimplifyResult::Simplified(Expr::Literal(
137+
ScalarValue::Boolean(None),
138+
None,
139+
)))
140+
}
135141
Expr::Literal(
136142
// FixedSizeList gets coerced to List
137143
scalar @ ScalarValue::List(_) | scalar @ ScalarValue::LargeList(_),
138144
_,
139145
) => {
140-
let array = scalar.to_array().unwrap(); // guarantee of ScalarValue
141146
if let Ok(scalar_values) =
142-
ScalarValue::convert_array_to_scalar_vec(&array)
147+
ScalarValue::convert_array_to_scalar_vec(&scalar.to_array()?)
143148
{
144149
assert_eq!(scalar_values.len(), 1);
145150
let list = scalar_values
146151
.into_iter()
147-
// If the vec is a singular null, `list` will be empty due to this flatten().
148-
// It would be more clear if we handled the None separately, but this is more performant.
149152
.flatten()
150153
.flatten()
151-
.map(|v| Expr::Literal(v.clone(), None))
154+
.map(|v| Expr::Literal(v, None))
152155
.collect();
153156

154157
return Ok(ExprSimplifyResult::Simplified(in_list(
@@ -178,6 +181,12 @@ impl ScalarUDFImpl for ArrayHas {
178181
args: datafusion_expr::ScalarFunctionArgs,
179182
) -> Result<ColumnarValue> {
180183
let [first_arg, second_arg] = take_function_args(self.name(), &args.args)?;
184+
if first_arg.data_type().is_null() {
185+
// Always return null if the first argument is null
186+
// i.e. array_has(null, element) -> null
187+
return Ok(ColumnarValue::Scalar(ScalarValue::Boolean(None)));
188+
}
189+
181190
match &second_arg {
182191
ColumnarValue::Array(array_needle) => {
183192
// the needle is already an array, convert the haystack to an array of the same length
@@ -663,6 +672,7 @@ fn general_array_has_all_and_any_kernel(
663672
mod tests {
664673
use std::sync::Arc;
665674

675+
use arrow::datatypes::Int32Type;
666676
use arrow::{
667677
array::{create_array, Array, ArrayRef, AsArray, Int32Array, ListArray},
668678
buffer::OffsetBuffer,
@@ -733,6 +743,40 @@ mod tests {
733743
);
734744
}
735745

746+
#[test]
747+
fn test_simplify_array_has_with_null_to_null() {
748+
let haystack = Expr::Literal(ScalarValue::Null, None);
749+
let needle = col("c");
750+
751+
let props = ExecutionProps::new();
752+
let context = datafusion_expr::simplify::SimplifyContext::new(&props);
753+
let Ok(ExprSimplifyResult::Simplified(simplified)) =
754+
ArrayHas::new().simplify(vec![haystack, needle], &context)
755+
else {
756+
panic!("Expected simplified expression");
757+
};
758+
759+
assert_eq!(simplified, Expr::Literal(ScalarValue::Boolean(None), None));
760+
}
761+
762+
#[test]
763+
fn test_simplify_array_has_with_null_list_to_null() {
764+
let haystack =
765+
ListArray::from_iter_primitive::<Int32Type, [Option<i32>; 0], _>([None]);
766+
let haystack = Expr::Literal(ScalarValue::List(Arc::new(haystack)), None);
767+
let needle = col("c");
768+
769+
let props = ExecutionProps::new();
770+
let context = datafusion_expr::simplify::SimplifyContext::new(&props);
771+
let Ok(ExprSimplifyResult::Simplified(simplified)) =
772+
ArrayHas::new().simplify(vec![haystack, needle], &context)
773+
else {
774+
panic!("Expected simplified expression");
775+
};
776+
777+
assert_eq!(simplified, Expr::Literal(ScalarValue::Boolean(None), None));
778+
}
779+
736780
#[test]
737781
fn test_array_has_complex_list_not_simplified() {
738782
let haystack = col("c1");
@@ -757,13 +801,9 @@ mod tests {
757801
Field::new_list("", Field::new("", DataType::Int32, true), true),
758802
true,
759803
));
760-
let needle_field = Arc::new(Field::new("needle", DataType::Int32, true));
761-
let return_field = Arc::new(Field::new_list(
762-
"return",
763-
Field::new("", DataType::Boolean, true),
764-
true,
765-
));
766804

805+
let needle_field = Arc::new(Field::new("needle", DataType::Int32, true));
806+
let return_field = Arc::new(Field::new("return", DataType::Boolean, true));
767807
let haystack = ListArray::new(
768808
Field::new_list_field(DataType::Int32, true).into(),
769809
OffsetBuffer::new(vec![0, 0].into()),
@@ -773,7 +813,6 @@ mod tests {
773813

774814
let haystack = ColumnarValue::Array(Arc::new(haystack));
775815
let needle = ColumnarValue::Scalar(ScalarValue::Int32(Some(1)));
776-
777816
let result = ArrayHas::new().invoke_with_args(ScalarFunctionArgs {
778817
args: vec![haystack, needle],
779818
arg_fields: vec![haystack_field, needle_field],
@@ -789,4 +828,34 @@ mod tests {
789828

790829
Ok(())
791830
}
831+
832+
#[test]
833+
fn test_array_has_list_null_haystack() -> Result<(), DataFusionError> {
834+
let haystack_field = Arc::new(Field::new("haystack", DataType::Null, true));
835+
let needle_field = Arc::new(Field::new("needle", DataType::Int32, true));
836+
let return_field = Arc::new(Field::new("return", DataType::Boolean, true));
837+
let haystack =
838+
ListArray::from_iter_primitive::<Int32Type, [Option<i32>; 0], _>([
839+
None, None, None,
840+
]);
841+
842+
let haystack = ColumnarValue::Array(Arc::new(haystack));
843+
let needle = ColumnarValue::Scalar(ScalarValue::Int32(Some(1)));
844+
let result = ArrayHas::new().invoke_with_args(ScalarFunctionArgs {
845+
args: vec![haystack, needle],
846+
arg_fields: vec![haystack_field, needle_field],
847+
number_rows: 1,
848+
return_field,
849+
config_options: Arc::new(ConfigOptions::default()),
850+
})?;
851+
852+
let output = result.into_array(1)?;
853+
let output = output.as_boolean();
854+
assert_eq!(output.len(), 3);
855+
for i in 0..3 {
856+
assert!(output.is_null(i));
857+
}
858+
859+
Ok(())
860+
}
792861
}

datafusion/sqllogictest/test_files/array.slt

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -6040,13 +6040,13 @@ false
60406040
# array_has([1, 3, 5], 1) -> true (array contains element)
60416041
# array_has([], 1) -> false (empty array, not null)
60426042
# array_has(null, 1) -> null (null array)
6043-
query B
6044-
select array_has(column1, column2)
6043+
query BB
6044+
select array_has(column1, column2), array_has(null, column2)
60456045
from array_has_table_empty;
60466046
----
6047-
true
6048-
false
6049-
NULL
6047+
true NULL
6048+
false NULL
6049+
NULL NULL
60506050

60516051
# Test for issue: array_has should return false for empty arrays, not null
60526052
# This test demonstrates the correct behavior with COALESCE to show the distinction

0 commit comments

Comments
 (0)