Skip to content

Commit 197fc79

Browse files
authored
Support nested types for the new public Arrow format (#28413)
1 parent fc2c18b commit 197fc79

14 files changed

+4323
-3462
lines changed

ydb/core/formats/arrow/arrow_batch_builder.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
#include <ydb/core/formats/arrow/arrow_helpers_minikql.h>
44
#include <ydb/core/formats/arrow/switch/switch_type.h>
55
#include <ydb/core/kqp/common/kqp_types.h>
6-
#include <ydb/core/kqp/common/result_set_format/kqp_result_set_arrow.h>
6+
#include <ydb/core/kqp/common/result_set_format/kqp_formats_arrow.h>
77

88
#include <ydb/library/actors/core/log.h>
99

ydb/core/formats/arrow/arrow_helpers_minikql.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
#include "arrow_helpers_minikql.h"
22

3-
#include <ydb/core/kqp/common/result_set_format/kqp_result_set_arrow.h>
3+
#include <ydb/core/kqp/common/result_set_format/kqp_formats_arrow.h>
44
#include <util/string/join.h>
55

66
namespace NKikimr::NArrow {

ydb/core/kqp/common/result_set_format/kqp_formats_arrow.cpp

Lines changed: 826 additions & 0 deletions
Large diffs are not rendered by default.
Lines changed: 175 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,175 @@
1+
#pragma once
2+
3+
#include <contrib/libs/apache/arrow/cpp/src/arrow/api.h>
4+
5+
#include <yql/essentials/minikql/mkql_node.h>
6+
7+
/**
8+
* @file kqp_formats_arrow.h
9+
* @brief Utilities for converting MiniKQL types to Apache Arrow types and vice versa.
10+
*
11+
* This module provides a comprehensive mapping between YQL internal type system (MiniKQL)
12+
* and Apache Arrow format. It handles conversion of both simple data types
13+
* (integers, strings, etc.) and complex types (structs, lists, optionals, etc.).
14+
*/
15+
16+
namespace NKikimr::NKqp::NFormats {
17+
18+
constexpr size_t MAX_VARIANT_FLATTEN_SIZE = static_cast<size_t>(arrow::UnionType::kMaxTypeCode) + 1;
19+
constexpr size_t MAX_VARIANT_NESTED_SIZE = MAX_VARIANT_FLATTEN_SIZE * MAX_VARIANT_FLATTEN_SIZE;
20+
constexpr size_t MAX_VARIANT_DEPTH = 2;
21+
22+
/**
23+
* @brief Dispatches MiniKQL data type to corresponding Arrow type via compile-time callback.
24+
*
25+
* This template function provides a type-safe way to map MiniKQL primitive data types
26+
* to their Arrow counterparts. The callback receives the Arrow type as a template parameter,
27+
* so the callback is instantiated once per Arrow type; the data slot itself is selected by a runtime switch.
28+
*
29+
* Type mapping overview:
30+
* - Integer types: Int8/16/32/64, UInt8/16/32/64 (Bool is also mapped to arrow::UInt8Type)
31+
* - Floating point: Float, Double
32+
* - Temporal types: Date -> UInt16, Datetime -> UInt32, Timestamp -> UInt64, Interval -> Int64; extended variants: Date32 -> Int32, Datetime64/Timestamp64/Interval64 -> Int64
33+
* - String types: Utf8, Json, JsonDocument (serialized to string), DyNumber (serialized to string) -> arrow::StringType
34+
* - Binary types: String, Yson -> arrow::BinaryType
35+
* - Fixed-size binary: Decimal, Uuid -> arrow::FixedSizeBinaryType
36+
* - Timezone-aware: TzDate, TzDatetime, TzTimestamp, TzDate32, TzDatetime64, TzTimestamp64 -> arrow::StructType<datetimeType, arrow::StringType (serialized name of timezone)>
37+
*
38+
* @tparam TFunc Callable type accepting a single template parameter (Arrow type)
39+
* @param typeId The MiniKQL data slot to convert
40+
* @param callback A callable object with signature: template<typename TArrowType> bool operator()()
41+
* @return The callback's return value when typeId is one of the handled data slots; false when typeId matches none of them
42+
*/
43+
template <typename TFunc>
44+
bool SwitchMiniKQLDataTypeToArrowType(NUdf::EDataSlot typeId, TFunc&& callback) { // Selects an Arrow type for typeId and invokes callback.operator()<ArrowType>(), returning its result.
45+
switch (typeId) {
46+
case NUdf::EDataSlot::Int8:
47+
return callback.template operator()<arrow::Int8Type>();
48+
49+
case NUdf::EDataSlot::Uint8:
50+
case NUdf::EDataSlot::Bool: // Bool is dispatched as UInt8Type, same as Uint8.
51+
return callback.template operator()<arrow::UInt8Type>();
52+
53+
case NUdf::EDataSlot::Int16:
54+
return callback.template operator()<arrow::Int16Type>();
55+
56+
case NUdf::EDataSlot::Date: // Date is dispatched as UInt16Type, same as Uint16.
57+
case NUdf::EDataSlot::Uint16:
58+
return callback.template operator()<arrow::UInt16Type>();
59+
60+
case NUdf::EDataSlot::Int32:
61+
case NUdf::EDataSlot::Date32: // Date32 is dispatched as Int32Type, same as Int32.
62+
return callback.template operator()<arrow::Int32Type>();
63+
64+
case NUdf::EDataSlot::Datetime: // Datetime is dispatched as UInt32Type, same as Uint32.
65+
case NUdf::EDataSlot::Uint32:
66+
return callback.template operator()<arrow::UInt32Type>();
67+
68+
case NUdf::EDataSlot::Int64:
69+
case NUdf::EDataSlot::Interval: // Interval, Datetime64, Timestamp64 and Interval64 are all dispatched as Int64Type.
70+
case NUdf::EDataSlot::Datetime64:
71+
case NUdf::EDataSlot::Timestamp64:
72+
case NUdf::EDataSlot::Interval64:
73+
return callback.template operator()<arrow::Int64Type>();
74+
75+
case NUdf::EDataSlot::Uint64:
76+
case NUdf::EDataSlot::Timestamp: // Timestamp is dispatched as UInt64Type, same as Uint64.
77+
return callback.template operator()<arrow::UInt64Type>();
78+
79+
case NUdf::EDataSlot::Float:
80+
return callback.template operator()<arrow::FloatType>();
81+
82+
case NUdf::EDataSlot::Double:
83+
return callback.template operator()<arrow::DoubleType>();
84+
85+
case NUdf::EDataSlot::Utf8: // Utf8, Json, DyNumber and JsonDocument are all dispatched as StringType.
86+
case NUdf::EDataSlot::Json:
87+
case NUdf::EDataSlot::DyNumber:
88+
case NUdf::EDataSlot::JsonDocument:
89+
return callback.template operator()<arrow::StringType>();
90+
91+
case NUdf::EDataSlot::String: // String and Yson are dispatched as BinaryType.
92+
case NUdf::EDataSlot::Yson:
93+
return callback.template operator()<arrow::BinaryType>();
94+
95+
case NUdf::EDataSlot::Decimal: // Decimal and Uuid are dispatched as FixedSizeBinaryType.
96+
case NUdf::EDataSlot::Uuid:
97+
return callback.template operator()<arrow::FixedSizeBinaryType>();
98+
99+
case NUdf::EDataSlot::TzDate: // All six Tz* slots are dispatched as StructType.
100+
case NUdf::EDataSlot::TzDatetime:
101+
case NUdf::EDataSlot::TzTimestamp:
102+
case NUdf::EDataSlot::TzDate32:
103+
case NUdf::EDataSlot::TzDatetime64:
104+
case NUdf::EDataSlot::TzTimestamp64:
105+
return callback.template operator()<arrow::StructType>();
106+
}
107+
return false; // Reached only when typeId matches none of the cases above; callback is not invoked.
108+
}
109+
110+
/**
111+
* @brief Determines if a type requires wrapping in an external Optional layer.
112+
*
113+
* Some MiniKQL types don't have a native validity bitmap in Arrow representation
114+
* (e.g., Variant, Null). These types need to be wrapped in an additional
115+
* struct layer when used as optional values to properly represent NULL states.
116+
*
117+
* @param type The MiniKQL type to check
118+
* @return true if the type needs external Optional wrapping, false otherwise
119+
*
120+
* @note Types that need wrapping: Void, Null, Variant, Optional, EmptyList, EmptyDict
121+
*/
122+
bool NeedWrapByExternalOptional(const NMiniKQL::TType* type);
123+
124+
/**
125+
* @brief Converts a MiniKQL type to its corresponding Arrow DataType.
126+
*
127+
* This function recursively converts complex MiniKQL types (Struct, Tuple, List, Dict,
128+
* Variant, Optional, Tagged) to their Arrow equivalents. The conversion preserves the structure
129+
* and nullability information.
130+
*
131+
* Conversion rules:
132+
* - Data types: mapped according to SwitchMiniKQLDataTypeToArrowType
133+
* - Struct/Tuple: converted to arrow::StructType
134+
* - List: converted to arrow::ListType
135+
* - Dict: converted to arrow::ListType of arrow::StructType<Key, Value>
136+
* - Variant: converted to arrow::DenseUnionType
137+
* - Optional: nested optionals are flattened and represented via struct wrapping
138+
* - Tagged: converted to inner type
139+
*
140+
* @param type The MiniKQL type to convert
141+
* @return Shared pointer to corresponding Arrow DataType, or arrow::NullType if unsupported
142+
*/
143+
std::shared_ptr<arrow::DataType> GetArrowType(const NMiniKQL::TType* type);
144+
145+
/**
146+
* @brief Checks if a MiniKQL type can be represented in Arrow format.
147+
*
148+
* Not all MiniKQL types are compatible with Arrow. For example, Callable, Stream,
149+
* and Flow types cannot be represented. This function recursively checks complex
150+
* types (Struct, List, etc.) to ensure all nested types are compatible.
151+
*
152+
* @param type The MiniKQL type to validate
153+
* @return true if the type can be converted to Arrow format, false otherwise
154+
*
155+
* @note Incompatible types: Type, Stream, Callable, Any, Resource, Flow, Block, Pg, Multi, Linear
156+
*/
157+
bool IsArrowCompatible(const NMiniKQL::TType* type);
158+
159+
/**
160+
* @brief Appends a MiniKQL UnboxedValue to an Arrow ArrayBuilder.
161+
*
162+
* This function is the core serialization routine for converting MiniKQL values
163+
* to Arrow format. It handles all supported MiniKQL types, including
164+
* complex nested structures, and properly manages NULL values.
165+
*
166+
* The builder must be pre-configured with the correct Arrow type matching the
167+
* provided MiniKQL type. Type mismatches will result in assertion failures.
168+
*
169+
* @param value The MiniKQL value to append (may be NULL/empty)
170+
* @param builder The Arrow builder to append to (must match the type)
171+
* @param type The MiniKQL type descriptor for the value
172+
*/
173+
void AppendElement(NUdf::TUnboxedValue value, arrow::ArrayBuilder* builder, const NMiniKQL::TType* type);
174+
175+
} // namespace NKikimr::NKqp::NFormats

0 commit comments

Comments
 (0)