|
| 1 | +#pragma once |
| 2 | + |
| 3 | +#include <contrib/libs/apache/arrow/cpp/src/arrow/api.h> |
| 4 | + |
| 5 | +#include <yql/essentials/minikql/mkql_node.h> |
| 6 | + |
| 7 | +/** |
| 8 | + * @file kqp_formats_arrow.h |
| 9 | + * @brief Utilities for converting MiniKQL types to Apache Arrow types and vice versa. |
| 10 | + * |
| 11 | + * This module provides a comprehensive mapping between YQL internal type system (MiniKQL) |
| 12 | + * and Apache Arrow format. It handles conversion of both simple data types |
| 13 | + * (integers, strings, etc.) and complex types (structs, lists, optionals, etc.). |
| 14 | + */ |
| 15 | + |
| 16 | +namespace NKikimr::NKqp::NFormats { |
| 17 | + |
| 18 | +constexpr size_t MAX_VARIANT_FLATTEN_SIZE = static_cast<size_t>(arrow::UnionType::kMaxTypeCode) + 1; |
| 19 | +constexpr size_t MAX_VARIANT_NESTED_SIZE = MAX_VARIANT_FLATTEN_SIZE * MAX_VARIANT_FLATTEN_SIZE; |
| 20 | +constexpr size_t MAX_VARIANT_DEPTH = 2; |
| 21 | + |
| 22 | +/** |
| 23 | + * @brief Dispatches MiniKQL data type to corresponding Arrow type via compile-time callback. |
| 24 | + * |
| 25 | + * This template function provides a type-safe way to map MiniKQL primitive data types |
| 26 | + * to their Arrow counterparts. The callback receives the Arrow type as a template parameter, |
| 27 | + * allowing for compile-time type dispatch without runtime overhead. |
| 28 | + * |
| 29 | + * Type mapping overview: |
| 30 | + * - Integer types: Int8/16/32/64, UInt8/16/32/64 |
| 31 | + * - Floating point: Float, Double |
| 32 | + * - Temporal types: Date, Datetime, Timestamp, Interval (and their extended variants) |
| 33 | + * - String types: Utf8, Json, JsonDocument (serialized to string), DyNumber (serialized to string) -> arrow::StringType |
| 34 | + * - Binary types: String, Yson -> arrow::BinaryType |
| 35 | + * - Fixed-size binary: Decimal, Uuid -> arrow::FixedSizeBinaryType |
| 36 | + * - Timezone-aware: TzDate, TzDatetime, TzTimestamp -> arrow::StructType<datetimeType, arrow::StringType (serialized name of timezone)> |
| 37 | + * |
| 38 | + * @tparam TFunc Callable type accepting a single template parameter (Arrow type) |
| 39 | + * @param typeId The MiniKQL data slot to convert |
| 40 | + * @param callback A callable object with signature: template<typename TArrowType> bool operator()() |
| 41 | + * @return true if the type is supported and callback executed successfully, false otherwise |
| 42 | + */ |
| 43 | +template <typename TFunc> |
| 44 | +bool SwitchMiniKQLDataTypeToArrowType(NUdf::EDataSlot typeId, TFunc&& callback) { |
| 45 | + switch (typeId) { |
| 46 | + case NUdf::EDataSlot::Int8: |
| 47 | + return callback.template operator()<arrow::Int8Type>(); |
| 48 | + |
| 49 | + case NUdf::EDataSlot::Uint8: |
| 50 | + case NUdf::EDataSlot::Bool: |
| 51 | + return callback.template operator()<arrow::UInt8Type>(); |
| 52 | + |
| 53 | + case NUdf::EDataSlot::Int16: |
| 54 | + return callback.template operator()<arrow::Int16Type>(); |
| 55 | + |
| 56 | + case NUdf::EDataSlot::Date: |
| 57 | + case NUdf::EDataSlot::Uint16: |
| 58 | + return callback.template operator()<arrow::UInt16Type>(); |
| 59 | + |
| 60 | + case NUdf::EDataSlot::Int32: |
| 61 | + case NUdf::EDataSlot::Date32: |
| 62 | + return callback.template operator()<arrow::Int32Type>(); |
| 63 | + |
| 64 | + case NUdf::EDataSlot::Datetime: |
| 65 | + case NUdf::EDataSlot::Uint32: |
| 66 | + return callback.template operator()<arrow::UInt32Type>(); |
| 67 | + |
| 68 | + case NUdf::EDataSlot::Int64: |
| 69 | + case NUdf::EDataSlot::Interval: |
| 70 | + case NUdf::EDataSlot::Datetime64: |
| 71 | + case NUdf::EDataSlot::Timestamp64: |
| 72 | + case NUdf::EDataSlot::Interval64: |
| 73 | + return callback.template operator()<arrow::Int64Type>(); |
| 74 | + |
| 75 | + case NUdf::EDataSlot::Uint64: |
| 76 | + case NUdf::EDataSlot::Timestamp: |
| 77 | + return callback.template operator()<arrow::UInt64Type>(); |
| 78 | + |
| 79 | + case NUdf::EDataSlot::Float: |
| 80 | + return callback.template operator()<arrow::FloatType>(); |
| 81 | + |
| 82 | + case NUdf::EDataSlot::Double: |
| 83 | + return callback.template operator()<arrow::DoubleType>(); |
| 84 | + |
| 85 | + case NUdf::EDataSlot::Utf8: |
| 86 | + case NUdf::EDataSlot::Json: |
| 87 | + case NUdf::EDataSlot::DyNumber: |
| 88 | + case NUdf::EDataSlot::JsonDocument: |
| 89 | + return callback.template operator()<arrow::StringType>(); |
| 90 | + |
| 91 | + case NUdf::EDataSlot::String: |
| 92 | + case NUdf::EDataSlot::Yson: |
| 93 | + return callback.template operator()<arrow::BinaryType>(); |
| 94 | + |
| 95 | + case NUdf::EDataSlot::Decimal: |
| 96 | + case NUdf::EDataSlot::Uuid: |
| 97 | + return callback.template operator()<arrow::FixedSizeBinaryType>(); |
| 98 | + |
| 99 | + case NUdf::EDataSlot::TzDate: |
| 100 | + case NUdf::EDataSlot::TzDatetime: |
| 101 | + case NUdf::EDataSlot::TzTimestamp: |
| 102 | + case NUdf::EDataSlot::TzDate32: |
| 103 | + case NUdf::EDataSlot::TzDatetime64: |
| 104 | + case NUdf::EDataSlot::TzTimestamp64: |
| 105 | + return callback.template operator()<arrow::StructType>(); |
| 106 | + } |
| 107 | + return false; |
| 108 | +} |
| 109 | + |
| 110 | +/** |
| 111 | + * @brief Determines if a type requires wrapping in an external Optional layer. |
| 112 | + * |
| 113 | + * Some MiniKQL types don't have a native validity bitmap in Arrow representation |
| 114 | + * (e.g., Variant, Null). These types need to be wrapped in an additional |
| 115 | + * struct layer when used as optional values to properly represent NULL states. |
| 116 | + * |
| 117 | + * @param type The MiniKQL type to check |
| 118 | + * @return true if the type needs external Optional wrapping, false otherwise |
| 119 | + * |
| 120 | + * @note Types that need wrapping: Void, Null, Variant, Optional, EmptyList, EmptyDict |
| 121 | + */ |
| 122 | +bool NeedWrapByExternalOptional(const NMiniKQL::TType* type); |
| 123 | + |
| 124 | +/** |
| 125 | + * @brief Converts a MiniKQL type to its corresponding Arrow DataType. |
| 126 | + * |
| 127 | + * This function recursively converts complex MiniKQL types (Struct, Tuple, List, Dict, |
| 128 | + * Variant, Optional, Tagged) to their Arrow equivalents. The conversion preserves the structure |
| 129 | + * and nullability information. |
| 130 | + * |
| 131 | + * Conversion rules: |
| 132 | + * - Data types: mapped according to SwitchMiniKQLDataTypeToArrowType |
| 133 | + * - Struct/Tuple: converted to arrow::StructType |
| 134 | + * - List: converted to arrow::ListType |
| 135 | + * - Dict: converted to arrow::ListType of arrow::StructType<Key, Value> |
| 136 | + * - Variant: converted to arrow::DenseUnionType |
| 137 | + * - Optional: nested optionals are flattened and represented via struct wrapping |
| 138 | + * - Tagged: converted to inner type |
| 139 | + * |
| 140 | + * @param type The MiniKQL type to convert |
| 141 | + * @return Shared pointer to corresponding Arrow DataType, or arrow::NullType if unsupported |
| 142 | + */ |
| 143 | +std::shared_ptr<arrow::DataType> GetArrowType(const NMiniKQL::TType* type); |
| 144 | + |
| 145 | +/** |
| 146 | + * @brief Checks if a MiniKQL type can be represented in Arrow format. |
| 147 | + * |
| 148 | + * Not all MiniKQL types are compatible with Arrow. For example, Callable, Stream, |
| 149 | + * and Flow types cannot be represented. This function recursively checks complex |
| 150 | + * types (Struct, List, etc.) to ensure all nested types are compatible. |
| 151 | + * |
| 152 | + * @param type The MiniKQL type to validate |
| 153 | + * @return true if the type can be converted to Arrow format, false otherwise |
| 154 | + * |
| 155 | + * @note Incompatible types: Type, Stream, Callable, Any, Resource, Flow, Block, Pg, Multi, Linear |
| 156 | + */ |
| 157 | +bool IsArrowCompatible(const NMiniKQL::TType* type); |
| 158 | + |
/**
 * @brief Appends one MiniKQL UnboxedValue to an Arrow ArrayBuilder.
 *
 * Core serialization routine for converting MiniKQL values to Arrow format.
 * It handles all supported MiniKQL types, including complex nested structures,
 * and properly manages NULL values.
 *
 * The builder must already be configured with the Arrow type that matches the
 * given MiniKQL type. Type mismatches will result in assertion failures.
 *
 * @param value The MiniKQL value to append (may be NULL/empty); taken by value
 * @param builder The Arrow builder to append to (must match the type).
 *                NOTE(review): passed as a raw pointer, presumably non-owning and non-null - confirm.
 * @param type The MiniKQL type descriptor for the value
 */
void AppendElement(NUdf::TUnboxedValue value, arrow::ArrayBuilder* builder, const NMiniKQL::TType* type);
| 174 | + |
| 175 | +} // namespace NKikimr::NKqp::NFormats |
0 commit comments