diff --git a/ydb/core/formats/arrow/arrow_batch_builder.cpp b/ydb/core/formats/arrow/arrow_batch_builder.cpp index cdb7d79913f8..b0cc3cd4ceff 100644 --- a/ydb/core/formats/arrow/arrow_batch_builder.cpp +++ b/ydb/core/formats/arrow/arrow_batch_builder.cpp @@ -3,7 +3,7 @@ #include #include #include -#include +#include #include #include diff --git a/ydb/core/formats/arrow/arrow_helpers_minikql.cpp b/ydb/core/formats/arrow/arrow_helpers_minikql.cpp index 6d846347e599..351a47cfd8e6 100644 --- a/ydb/core/formats/arrow/arrow_helpers_minikql.cpp +++ b/ydb/core/formats/arrow/arrow_helpers_minikql.cpp @@ -1,6 +1,6 @@ #include "arrow_helpers_minikql.h" -#include +#include #include namespace NKikimr::NArrow { diff --git a/ydb/core/kqp/common/result_set_format/kqp_formats_arrow.cpp b/ydb/core/kqp/common/result_set_format/kqp_formats_arrow.cpp new file mode 100644 index 000000000000..551306b2222f --- /dev/null +++ b/ydb/core/kqp/common/result_set_format/kqp_formats_arrow.cpp @@ -0,0 +1,826 @@ +#include "kqp_formats_arrow.h" + +#include + +#include +#include +#include +#include +#include +#include +#include + +namespace NKikimr::NKqp::NFormats { + +namespace { + +template +std::shared_ptr BuildArrowType(NUdf::EDataSlot slot) { + Y_UNUSED(slot); + return std::make_shared(); +} + +template <> +std::shared_ptr BuildArrowType(NUdf::EDataSlot slot) { + Y_UNUSED(slot); + return arrow::fixed_size_binary(NScheme::FSB_SIZE); +} + +template <> +std::shared_ptr BuildArrowType(NUdf::EDataSlot slot) { + std::shared_ptr type; + switch (slot) { + case NUdf::EDataSlot::TzDate: + type = NYql::NUdf::MakeTzLayoutArrowType(); + break; + case NUdf::EDataSlot::TzDatetime: + type = NYql::NUdf::MakeTzLayoutArrowType(); + break; + case NUdf::EDataSlot::TzTimestamp: + type = NYql::NUdf::MakeTzLayoutArrowType(); + break; + case NUdf::EDataSlot::TzDate32: + type = NYql::NUdf::MakeTzLayoutArrowType(); + break; + case NUdf::EDataSlot::TzDatetime64: + type = NYql::NUdf::MakeTzLayoutArrowType(); + break; + case 
NUdf::EDataSlot::TzTimestamp64: + type = NYql::NUdf::MakeTzLayoutArrowType(); + break; + default: + YQL_ENSURE(false, "Unexpected timezone datetime slot"); + return std::make_shared(); + } + + arrow::FieldVector fields{ + std::make_shared("datetime", type, false), + std::make_shared("timezone", arrow::utf8(), false), + }; + return arrow::struct_(fields); +} + +std::shared_ptr GetArrowType(const NMiniKQL::TDataType* dataType) { + std::shared_ptr result; + bool success = SwitchMiniKQLDataTypeToArrowType(*dataType->GetDataSlot().Get(), + [&]() { + result = BuildArrowType(*dataType->GetDataSlot().Get()); + return true; + }); + if (success) { + return result; + } + return std::make_shared(); +} + +std::shared_ptr GetArrowType(const NMiniKQL::TStructType* structType) { + arrow::FieldVector fields; + fields.reserve(structType->GetMembersCount()); + for (ui32 index = 0; index < structType->GetMembersCount(); ++index) { + auto memberType = structType->GetMemberType(index); + auto memberName = std::string(structType->GetMemberName(index)); + auto memberArrowType = NFormats::GetArrowType(memberType); + + fields.emplace_back(std::make_shared(memberName, memberArrowType, memberType->IsOptional())); + } + return arrow::struct_(fields); +} + +std::shared_ptr GetArrowType(const NMiniKQL::TTupleType* tupleType) { + arrow::FieldVector fields; + fields.reserve(tupleType->GetElementsCount()); + for (ui32 index = 0; index < tupleType->GetElementsCount(); ++index) { + auto elementName = "field" + std::to_string(index); + auto elementType = tupleType->GetElementType(index); + auto elementArrowType = NFormats::GetArrowType(elementType); + + fields.emplace_back(std::make_shared(elementName, elementArrowType, elementType->IsOptional())); + } + return arrow::struct_(fields); +} + +std::shared_ptr GetArrowType(const NMiniKQL::TListType* listType) { + auto itemType = listType->GetItemType(); + auto itemArrowType = NFormats::GetArrowType(itemType); + auto field = std::make_shared("item", 
itemArrowType, itemType->IsOptional()); + return arrow::list(field); +} + +std::shared_ptr GetArrowType(const NMiniKQL::TDictType* dictType) { + auto keyType = dictType->GetKeyType(); + auto payloadType = dictType->GetPayloadType(); + + auto structType = arrow::struct_({ + std::make_shared("key", NFormats::GetArrowType(keyType), keyType->IsOptional()), + std::make_shared("payload", NFormats::GetArrowType(payloadType), payloadType->IsOptional()) + }); + return arrow::list(structType); +} + +std::shared_ptr GetArrowType(const NMiniKQL::TVariantType* variantType) { + NMiniKQL::TType* innerType = variantType->GetUnderlyingType(); + NMiniKQL::TStructType* structType = nullptr; + NMiniKQL::TTupleType* tupleType = nullptr; + + if (innerType->IsStruct()) { + structType = static_cast(innerType); + } else { + YQL_ENSURE(innerType->IsTuple(), "Unexpected underlying variant type: " << innerType->GetKindAsStr()); + tupleType = static_cast(innerType); + } + + YQL_ENSURE(variantType->GetAlternativesCount() <= MAX_VARIANT_NESTED_SIZE, "Variant type has more than " << MAX_VARIANT_NESTED_SIZE << " alternatives"); + + arrow::FieldVector fields; + if (variantType->GetAlternativesCount() > MAX_VARIANT_FLATTEN_SIZE) { + ui32 numberOfGroups = ((variantType->GetAlternativesCount() - 1) / MAX_VARIANT_FLATTEN_SIZE) + 1; + fields.reserve(numberOfGroups); + + for (ui32 group = 0; group < numberOfGroups; ++group) { + ui32 beginIndex = group * MAX_VARIANT_FLATTEN_SIZE; + ui32 endIndex = std::min((group + 1) * MAX_VARIANT_FLATTEN_SIZE, variantType->GetAlternativesCount()); + + arrow::FieldVector groupFields; + groupFields.reserve(endIndex - beginIndex); + + for (ui32 i = beginIndex; i < endIndex; ++i) { + auto itemName = (structType == nullptr) ? std::string("field" + ToString(i)) : std::string(structType->GetMemberName(i)); + auto itemType = (structType == nullptr) ? 
tupleType->GetElementType(i) : structType->GetMemberType(i); + auto itemArrowType = NFormats::GetArrowType(itemType); + + groupFields.emplace_back(std::make_shared( itemName, itemArrowType, itemType->IsOptional())); + } + + std::vector typeCodes(groupFields.size()); + std::iota(typeCodes.begin(), typeCodes.end(), 0); + + auto fieldName = std::string("field" + ToString(group)); + fields.emplace_back(std::make_shared(fieldName, arrow::dense_union(groupFields, typeCodes), false)); + } + + return arrow::dense_union(fields); + } + + fields.reserve(variantType->GetAlternativesCount()); + for (ui32 index = 0; index < variantType->GetAlternativesCount(); ++index) { + auto itemName = (structType == nullptr) ? std::string("field" + ToString(index)) : std::string(structType->GetMemberName(index)); + auto itemType = (structType == nullptr) ? tupleType->GetElementType(index) : structType->GetMemberType(index); + auto itemArrowType = NFormats::GetArrowType(itemType); + + fields.emplace_back(std::make_shared(itemName, itemArrowType, itemType->IsOptional())); + } + + std::vector typeCodes(fields.size()); + std::iota(typeCodes.begin(), typeCodes.end(), 0); + return arrow::dense_union(fields, typeCodes); +} + +std::shared_ptr GetArrowType(const NMiniKQL::TOptionalType* optionalType) { + auto currentType = SkipTaggedType(optionalType->GetItemType()); + ui32 depth = 1; + + while (currentType->IsOptional()) { + currentType = SkipTaggedType(static_cast(currentType)->GetItemType()); + ++depth; + } + + // For types without native validity bitmap (e.g., Variant, Null) we need to wrap them in an additional struct layer + // Furthermore, other singular types (e.g., Void, EmptyList, EmptyDict) also need to wrap (from YQL-15332) + // Thus, the depth == 2 for Optional<Variant<T>> type + if (NeedWrapByExternalOptional(currentType)) { + ++depth; + } + + std::shared_ptr innerArrowType = NFormats::GetArrowType(currentType); + while (depth > 1) { + innerArrowType = arrow::struct_({std::make_shared("opt",
innerArrowType, true)}); + --depth; + } + return innerArrowType; +} + +template +void AppendDataValue(arrow::ArrayBuilder* builder, NUdf::TUnboxedValue value, NUdf::EDataSlot dataSlot) { + Y_UNUSED(dataSlot); + auto typedBuilder = reinterpret_cast::BuilderType*>(builder); + arrow::Status status; + if (!value.HasValue()) { + status = typedBuilder->AppendNull(); + } else { + status = typedBuilder->Append(value.Get()); + } + YQL_ENSURE(status.ok(), "Failed to append data value: " << status.ToString()); +} + +template <> +void AppendDataValue(arrow::ArrayBuilder* builder, NUdf::TUnboxedValue value, NUdf::EDataSlot dataSlot) { + Y_UNUSED(dataSlot); + YQL_ENSURE(builder->type()->id() == arrow::Type::UINT64, "Unexpected builder type"); + auto typedBuilder = reinterpret_cast(builder); + arrow::Status status; + if (!value.HasValue()) { + status = typedBuilder->AppendNull(); + } else { + status = typedBuilder->Append(value.Get()); + } + YQL_ENSURE(status.ok(), "Failed to append data value: " << status.ToString()); +} + +template <> +void AppendDataValue(arrow::ArrayBuilder* builder, NUdf::TUnboxedValue value, NUdf::EDataSlot dataSlot) { + Y_UNUSED(dataSlot); + YQL_ENSURE(builder->type()->id() == arrow::Type::INT64, "Unexpected builder type"); + auto typedBuilder = reinterpret_cast(builder); + arrow::Status status; + if (!value.HasValue()) { + status = typedBuilder->AppendNull(); + } else { + status = typedBuilder->Append(value.Get()); + } + YQL_ENSURE(status.ok(), "Failed to append data value: " << status.ToString()); +} + +template <> +void AppendDataValue(arrow::ArrayBuilder* builder, NUdf::TUnboxedValue value, NUdf::EDataSlot dataSlot) { + YQL_ENSURE(builder->type()->id() == arrow::Type::STRING, "Unexpected builder type"); + auto typedBuilder = reinterpret_cast(builder); + arrow::Status status; + if (!value.HasValue()) { + status = typedBuilder->AppendNull(); + } else { + switch (dataSlot) { + case NUdf::EDataSlot::Utf8: + case NUdf::EDataSlot::Json: { + auto data = 
value.AsStringRef(); + status = typedBuilder->Append(data.Data(), data.Size()); + break; + } + + case NUdf::EDataSlot::JsonDocument: { + YQL_ENSURE(NBinaryJson::IsValidBinaryJson(value.AsStringRef())); + auto textJson = NBinaryJson::SerializeToJson(value.AsStringRef()); + status = typedBuilder->Append(textJson.data(), textJson.size()); + break; + } + + case NUdf::EDataSlot::DyNumber: { + auto number = NDyNumber::DyNumberToString(value.AsStringRef()); + YQL_ENSURE(number.Defined(), "Failed to convert DyNumber to string"); + status = typedBuilder->Append(number->data(), number->size()); + break; + } + + default: { + YQL_ENSURE(false, "Unexpected data slot"); + } + } + } + YQL_ENSURE(status.ok(), "Failed to append data value: " << status.ToString()); +} + +template <> +void AppendDataValue(arrow::ArrayBuilder* builder, NUdf::TUnboxedValue value, NUdf::EDataSlot dataSlot) { + Y_UNUSED(dataSlot); + YQL_ENSURE(builder->type()->id() == arrow::Type::BINARY, "Unexpected builder type"); + auto typedBuilder = reinterpret_cast(builder); + arrow::Status status; + if (!value.HasValue()) { + status = typedBuilder->AppendNull(); + } else { + auto data = value.AsStringRef(); + status = typedBuilder->Append(data.Data(), data.Size()); + } + YQL_ENSURE(status.ok(), "Failed to append data value: " << status.ToString()); +} + +template <> +void AppendDataValue(arrow::ArrayBuilder* builder, NUdf::TUnboxedValue value, NUdf::EDataSlot dataSlot) { + Y_UNUSED(dataSlot); + YQL_ENSURE(builder->type()->id() == arrow::Type::STRUCT, "Unexpected builder type"); + auto typedBuilder = reinterpret_cast(builder); + YQL_ENSURE(typedBuilder->num_fields() == 2, "StructBuilder of timezone datetime types should have 2 fields"); + + if (!value.HasValue()) { + auto status = typedBuilder->AppendNull(); + YQL_ENSURE(status.ok(), "Failed to append data value: " << status.ToString()); + return; + } + + auto status = typedBuilder->Append(); + YQL_ENSURE(status.ok(), "Failed to append data value: " << 
status.ToString()); + + auto datetimeArray = typedBuilder->field_builder(0); + auto timezoneArray = reinterpret_cast(typedBuilder->field_builder(1)); + + switch (dataSlot) { + case NUdf::EDataSlot::TzDate: { + YQL_ENSURE(datetimeArray->type()->id() == arrow::Type::UINT16); + status = reinterpret_cast(datetimeArray)->Append(value.Get()); + break; + } + + case NUdf::EDataSlot::TzDatetime: { + YQL_ENSURE(datetimeArray->type()->id() == arrow::Type::UINT32); + status = reinterpret_cast(datetimeArray)->Append(value.Get()); + break; + } + + case NUdf::EDataSlot::TzTimestamp: { + YQL_ENSURE(datetimeArray->type()->id() == arrow::Type::UINT64); + status = reinterpret_cast(datetimeArray)->Append(value.Get()); + break; + } + + case NUdf::EDataSlot::TzDate32: { + YQL_ENSURE(datetimeArray->type()->id() == arrow::Type::INT32); + status = reinterpret_cast(datetimeArray)->Append(value.Get()); + break; + } + + case NUdf::EDataSlot::TzDatetime64: + case NUdf::EDataSlot::TzTimestamp64: { + YQL_ENSURE(datetimeArray->type()->id() == arrow::Type::INT64); + status = reinterpret_cast(datetimeArray)->Append(value.Get()); + break; + } + + default: { + YQL_ENSURE(false, "Unexpected timezone datetime slot"); + return; + } + } + YQL_ENSURE(status.ok(), "Failed to append data value: " << status.ToString()); + + auto tzName = NMiniKQL::GetTimezoneIANAName(value.GetTimezoneId()); + status = timezoneArray->Append(tzName.Data(), tzName.size()); + YQL_ENSURE(status.ok(), "Failed to append data value: " << status.ToString()); +} + +template <> +void AppendDataValue(arrow::ArrayBuilder* builder, NUdf::TUnboxedValue value, NUdf::EDataSlot dataSlot) { + YQL_ENSURE(builder->type()->id() == arrow::Type::FIXED_SIZE_BINARY, "Unexpected builder type"); + auto typedBuilder = reinterpret_cast(builder); + arrow::Status status; + + if (!value.HasValue()) { + status = typedBuilder->AppendNull(); + YQL_ENSURE(status.ok(), "Failed to append data value: " << status.ToString()); + return; + } + + switch (dataSlot) { + 
case NUdf::EDataSlot::Uuid: { + auto data = value.AsStringRef(); + status = typedBuilder->Append(data.Data()); + break; + } + + case NUdf::EDataSlot::Decimal: { + auto intVal = value.GetInt128(); + status = typedBuilder->Append(reinterpret_cast(&intVal)); + break; + } + + default: { + YQL_ENSURE(false, "Unexpected data slot"); + } + } + + YQL_ENSURE(status.ok(), "Failed to append data value: " << status.ToString()); +} + +void AppendElement(NUdf::TUnboxedValue value, arrow::ArrayBuilder* builder, const NMiniKQL::TDataType* dataType) { + auto slot = *dataType->GetDataSlot().Get(); + bool success = SwitchMiniKQLDataTypeToArrowType(slot, [&]() { + AppendDataValue(builder, value, slot); + return true; + }); + YQL_ENSURE(success, "Failed to append data value to arrow builder"); +} + +void AppendElement(NUdf::TUnboxedValue value, arrow::ArrayBuilder* builder, const NMiniKQL::TOptionalType* optionalType) { + auto innerType = SkipTaggedType(optionalType->GetItemType()); + ui32 depth = 1; + + while (innerType->IsOptional()) { + innerType = SkipTaggedType(static_cast(innerType) ->GetItemType()); + ++depth; + } + + // For types without native validity bitmap (e.g., Variant, Null) we need to wrap them in an additional struct layer + // Furthermore, other singular types (e.g., Void, EmptyList, EmptyDict) also need to wrap (from YQL-15332) + // Thus, the depth == 2 for Optional<Variant<T>> type + if (NeedWrapByExternalOptional(innerType)) { + ++depth; + } + + auto innerBuilder = builder; + auto innerValue = value; + + for (ui32 i = 1; i < depth; ++i) { + YQL_ENSURE(innerBuilder->type()->id() == arrow::Type::STRUCT, "Unexpected builder type"); + auto structBuilder = reinterpret_cast(innerBuilder); + YQL_ENSURE(structBuilder->num_fields() == 1, "Unexpected number of fields"); + + if (!innerValue) { + auto status = innerBuilder->AppendNull(); + YQL_ENSURE(status.ok(), "Failed to append null optional value: " << status.ToString()); + return; + } + + auto status = structBuilder->Append(); +
YQL_ENSURE(status.ok(), "Failed to append optional value: " << status.ToString()); + + innerValue = innerValue.GetOptionalValue(); + innerBuilder = structBuilder->field_builder(0); + } + + if (innerValue) { + NFormats::AppendElement(innerValue.GetOptionalValue(), innerBuilder, innerType); + } else { + auto status = innerBuilder->AppendNull(); + YQL_ENSURE(status.ok(), "Failed to append null optional value: " << status.ToString()); + } +} + +void AppendElement(NUdf::TUnboxedValue value, arrow::ArrayBuilder* builder, const NMiniKQL::TListType* listType) { + auto itemType = listType->GetItemType(); + + YQL_ENSURE(builder->type()->id() == arrow::Type::LIST, "Unexpected builder type"); + auto listBuilder = reinterpret_cast(builder); + + auto status = listBuilder->Append(); + YQL_ENSURE(status.ok(), "Failed to append list value: " << status.ToString()); + + auto innerBuilder = listBuilder->value_builder(); + if (auto item = value.GetElements()) { + auto length = value.GetListLength(); + while (length > 0) { + NFormats::AppendElement(*item++, innerBuilder, itemType); + --length; + } + } else { + const auto iter = value.GetListIterator(); + for (NUdf::TUnboxedValue item; iter.Next(item);) { + NFormats::AppendElement(item, innerBuilder, itemType); + } + } +} + +void AppendElement(NUdf::TUnboxedValue value, arrow::ArrayBuilder* builder, const NMiniKQL::TStructType* structType) { + YQL_ENSURE(builder->type()->id() == arrow::Type::STRUCT, "Unexpected builder type"); + auto structBuilder = reinterpret_cast(builder); + + auto status = structBuilder->Append(); + YQL_ENSURE(status.ok(), "Failed to append struct value: " << status.ToString()); + + YQL_ENSURE(static_cast(structBuilder->num_fields()) == structType->GetMembersCount(), "Unexpected number of fields"); + for (ui32 index = 0; index < structType->GetMembersCount(); ++index) { + auto innerBuilder = structBuilder->field_builder(index); + auto memberType = structType->GetMemberType(index); + 
NFormats::AppendElement(value.GetElement(index), innerBuilder, memberType); + } +} + +void AppendElement(NUdf::TUnboxedValue value, arrow::ArrayBuilder* builder, const NMiniKQL::TTupleType* tupleType) { + YQL_ENSURE(builder->type()->id() == arrow::Type::STRUCT, "Unexpected builder type"); + auto structBuilder = reinterpret_cast(builder); + + auto status = structBuilder->Append(); + YQL_ENSURE(status.ok(), "Failed to append tuple value: " << status.ToString()); + + YQL_ENSURE(static_cast(structBuilder->num_fields()) == tupleType->GetElementsCount(), "Unexpected number of fields"); + for (ui32 index = 0; index < tupleType->GetElementsCount(); ++index) { + auto innerBuilder = structBuilder->field_builder(index); + auto elementType = tupleType->GetElementType(index); + NFormats::AppendElement(value.GetElement(index), innerBuilder, elementType); + } +} + +void AppendElement(NUdf::TUnboxedValue value, arrow::ArrayBuilder* builder, const NMiniKQL::TDictType* dictType) { + auto keyType = dictType->GetKeyType(); + auto payloadType = dictType->GetPayloadType(); + + YQL_ENSURE(builder->type()->id() == arrow::Type::LIST, "Unexpected builder type"); + auto listBuilder = reinterpret_cast(builder); + + auto status = listBuilder->Append(); + YQL_ENSURE(status.ok(), "Failed to append dict value: " << status.ToString()); + + YQL_ENSURE(listBuilder->value_builder()->type()->id() == arrow::Type::STRUCT, "Unexpected builder type"); + auto structBuilder = reinterpret_cast(listBuilder->value_builder()); + YQL_ENSURE(structBuilder->num_fields() == 2, "Unexpected number of fields"); + + auto keyBuilder = structBuilder->field_builder(0); + auto itemBuilder = structBuilder->field_builder(1); + + const auto iter = value.GetDictIterator(); + for (NUdf::TUnboxedValue key, payload; iter.NextPair(key, payload);) { + auto status = structBuilder->Append(); + YQL_ENSURE(status.ok(), "Failed to append dict value: " << status.ToString()); + + NFormats::AppendElement(key, keyBuilder, keyType); + 
NFormats::AppendElement(payload, itemBuilder, payloadType); + } +} + +void AppendElement(NUdf::TUnboxedValue value, arrow::ArrayBuilder* builder, const NMiniKQL::TVariantType* variantType) { + YQL_ENSURE(builder->type()->id() == arrow::Type::DENSE_UNION, "Unexpected builder type"); + auto unionBuilder = reinterpret_cast(builder); + + ui32 variantIndex = value.GetVariantIndex(); + NMiniKQL::TType* innerType = variantType->GetUnderlyingType(); + + if (innerType->IsStruct()) { + innerType = static_cast(innerType)->GetMemberType(variantIndex); + } else { + YQL_ENSURE(innerType->IsTuple(), "Unexpected underlying variant type: " << innerType->GetKindAsStr()); + innerType = static_cast(innerType)->GetElementType(variantIndex); + } + + YQL_ENSURE(variantType->GetAlternativesCount() <= MAX_VARIANT_NESTED_SIZE, "Variant type has more than " << MAX_VARIANT_NESTED_SIZE << " alternatives"); + + if (variantType->GetAlternativesCount() > MAX_VARIANT_FLATTEN_SIZE) { + ui32 numberOfGroups = ((variantType->GetAlternativesCount() - 1) / MAX_VARIANT_FLATTEN_SIZE) + 1; + YQL_ENSURE(static_cast(unionBuilder->num_children()) == numberOfGroups, "Unexpected variant number of groups"); + + ui32 groupIndex = variantIndex / MAX_VARIANT_FLATTEN_SIZE; + auto status = unionBuilder->Append(groupIndex); + YQL_ENSURE(status.ok(), "Failed to append variant value: " << status.ToString()); + + auto innerBuilder = unionBuilder->child_builder(groupIndex); + YQL_ENSURE(innerBuilder->type()->id() == arrow::Type::DENSE_UNION, "Unexpected builder type"); + auto innerUnionBuilder = reinterpret_cast(innerBuilder.get()); + + ui32 innerVariantIndex = variantIndex % MAX_VARIANT_FLATTEN_SIZE; + status = innerUnionBuilder->Append(innerVariantIndex); + YQL_ENSURE(status.ok(), "Failed to append variant value: " << status.ToString()); + + auto doubleInnerBuilder = innerUnionBuilder->child_builder(innerVariantIndex); + NFormats::AppendElement(value.GetVariantItem(), doubleInnerBuilder.get(), innerType); + } else { + 
auto status = unionBuilder->Append(variantIndex); + YQL_ENSURE(status.ok(), "Failed to append variant value: " << status.ToString()); + + auto innerBuilder = unionBuilder->child_builder(variantIndex); + NFormats::AppendElement(value.GetVariantItem(), innerBuilder.get(), innerType); + } +} + +} // namespace + +bool NeedWrapByExternalOptional(const NMiniKQL::TType* type) { + switch (type->GetKind()) { + case NMiniKQL::TType::EKind::Null: + case NMiniKQL::TType::EKind::Void: + case NMiniKQL::TType::EKind::EmptyList: + case NMiniKQL::TType::EKind::EmptyDict: + case NMiniKQL::TType::EKind::Optional: + case NMiniKQL::TType::EKind::Variant: { + return true; + } + + case NMiniKQL::TType::EKind::Data: + case NMiniKQL::TType::EKind::Struct: + case NMiniKQL::TType::EKind::Tuple: + case NMiniKQL::TType::EKind::List: + case NMiniKQL::TType::EKind::Dict: + case NMiniKQL::TType::EKind::Tagged: { + return false; + } + + case NMiniKQL::TType::EKind::Type: + case NMiniKQL::TType::EKind::Stream: + case NMiniKQL::TType::EKind::Callable: + case NMiniKQL::TType::EKind::Any: + case NMiniKQL::TType::EKind::Resource: + case NMiniKQL::TType::EKind::Flow: + case NMiniKQL::TType::EKind::ReservedKind: + case NMiniKQL::TType::EKind::Block: + case NMiniKQL::TType::EKind::Pg: + case NMiniKQL::TType::EKind::Multi: + case NMiniKQL::TType::EKind::Linear: { + YQL_ENSURE(false, "Unsupported type: " << type->GetKindAsStr()); + } + } + return false; +} + +std::shared_ptr GetArrowType(const NMiniKQL::TType* type) { + YQL_ENSURE(IsArrowCompatible(type)); + switch (type->GetKind()) { + case NMiniKQL::TType::EKind::Null: { + return arrow::null(); + } + + case NMiniKQL::TType::EKind::Void: + case NMiniKQL::TType::EKind::EmptyList: + case NMiniKQL::TType::EKind::EmptyDict: { + return arrow::struct_({}); + } + + case NMiniKQL::TType::EKind::Data: { + return GetArrowType(static_cast(type)); + } + + case NMiniKQL::TType::EKind::Optional: { + return GetArrowType(static_cast(type)); + } + + case 
NMiniKQL::TType::EKind::Struct: { + return GetArrowType(static_cast(type)); + } + + case NMiniKQL::TType::EKind::Tuple: { + return GetArrowType(static_cast(type)); + } + + case NMiniKQL::TType::EKind::List: { + return GetArrowType(static_cast(type)); + } + + case NMiniKQL::TType::EKind::Dict: { + return GetArrowType(static_cast(type)); + } + + case NMiniKQL::TType::EKind::Variant: { + return GetArrowType(static_cast(type)); + } + + case NMiniKQL::TType::EKind::Tagged: { + return GetArrowType(static_cast(type)->GetBaseType()); + } + + case NMiniKQL::TType::EKind::Type: + case NMiniKQL::TType::EKind::Stream: + case NMiniKQL::TType::EKind::Callable: + case NMiniKQL::TType::EKind::Any: + case NMiniKQL::TType::EKind::Resource: + case NMiniKQL::TType::EKind::Flow: + case NMiniKQL::TType::EKind::ReservedKind: + case NMiniKQL::TType::EKind::Block: + case NMiniKQL::TType::EKind::Pg: + case NMiniKQL::TType::EKind::Multi: + case NMiniKQL::TType::EKind::Linear: { + YQL_ENSURE(false, "Unsupported type: " << type->GetKindAsStr()); + } + } + return arrow::null(); +} + +bool IsArrowCompatible(const NKikimr::NMiniKQL::TType* type) { + switch (type->GetKind()) { + case NMiniKQL::TType::EKind::Null: + case NMiniKQL::TType::EKind::Void: + case NMiniKQL::TType::EKind::EmptyList: + case NMiniKQL::TType::EKind::EmptyDict: + case NMiniKQL::TType::EKind::Data: { + return true; + } + + case NMiniKQL::TType::EKind::Optional: { + auto optionalType = static_cast(type); + return IsArrowCompatible(optionalType->GetItemType()); + } + + case NMiniKQL::TType::EKind::Struct: { + auto structType = static_cast(type); + bool isCompatible = true; + for (ui32 index = 0; index < structType->GetMembersCount(); ++index) { + auto memberType = structType->GetMemberType(index); + isCompatible = isCompatible && IsArrowCompatible(memberType); + } + return isCompatible; + } + + case NMiniKQL::TType::EKind::Tuple: { + auto tupleType = static_cast(type); + bool isCompatible = true; + for (ui32 index = 0; index < 
tupleType->GetElementsCount(); ++index) { + auto elementType = tupleType->GetElementType(index); + isCompatible = isCompatible && IsArrowCompatible(elementType); + } + return isCompatible; + } + + case NMiniKQL::TType::EKind::List: { + auto listType = static_cast(type); + auto itemType = listType->GetItemType(); + return IsArrowCompatible(itemType); + } + + case NMiniKQL::TType::EKind::Dict: { + auto dictType = static_cast(type); + auto keyType = dictType->GetKeyType(); + auto payloadType = dictType->GetPayloadType(); + return IsArrowCompatible(keyType) && IsArrowCompatible(payloadType); + } + + case NMiniKQL::TType::EKind::Variant: { + auto variantType = static_cast(type); + if (variantType->GetAlternativesCount() > MAX_VARIANT_NESTED_SIZE) { + return false; + } + + NMiniKQL::TType* innerType = variantType->GetUnderlyingType(); + return (innerType->IsStruct() || innerType->IsTuple()) && IsArrowCompatible(innerType); + } + + case NMiniKQL::TType::EKind::Tagged: { + auto taggedType = static_cast(type); + return IsArrowCompatible(taggedType->GetBaseType()); + } + + case NMiniKQL::TType::EKind::Type: + case NMiniKQL::TType::EKind::Stream: + case NMiniKQL::TType::EKind::Callable: + case NMiniKQL::TType::EKind::Any: + case NMiniKQL::TType::EKind::Resource: + case NMiniKQL::TType::EKind::Flow: + case NMiniKQL::TType::EKind::ReservedKind: + case NMiniKQL::TType::EKind::Block: + case NMiniKQL::TType::EKind::Pg: + case NMiniKQL::TType::EKind::Multi: + case NMiniKQL::TType::EKind::Linear: { + return false; + } + } + return true; +} + +void AppendElement(NUdf::TUnboxedValue value, arrow::ArrayBuilder* builder, const NMiniKQL::TType* type) { + switch (type->GetKind()) { + case NMiniKQL::TType::EKind::Null: { + YQL_ENSURE(builder->type()->id() == arrow::Type::NA, "Unexpected builder type"); + auto status = builder->AppendNull(); + YQL_ENSURE(status.ok(), "Failed to append null value: " << status.ToString()); + break; + } + + case NMiniKQL::TType::EKind::Void: + case 
NMiniKQL::TType::EKind::EmptyList: + case NMiniKQL::TType::EKind::EmptyDict: { + YQL_ENSURE(builder->type()->id() == arrow::Type::STRUCT, "Unexpected builder type"); + auto structBuilder = reinterpret_cast(builder); + auto status = structBuilder->Append(); + YQL_ENSURE(status.ok(), "Failed to append struct value of a singular type: " << status.ToString()); + break; + } + + case NMiniKQL::TType::EKind::Data: { + AppendElement(value, builder, static_cast(type)); + break; + } + + case NMiniKQL::TType::EKind::Optional: { + AppendElement(value, builder, static_cast(type)); + break; + } + + case NMiniKQL::TType::EKind::Struct: { + AppendElement(value, builder, static_cast(type)); + break; + } + + case NMiniKQL::TType::EKind::Tuple: { + AppendElement(value, builder, static_cast(type)); + break; + } + + case NMiniKQL::TType::EKind::List: { + AppendElement(value, builder, static_cast(type)); + break; + } + + case NMiniKQL::TType::EKind::Dict: { + AppendElement(value, builder, static_cast(type)); + break; + } + + case NMiniKQL::TType::EKind::Variant: { + AppendElement(value, builder, static_cast(type)); + break; + } + + case NMiniKQL::TType::EKind::Tagged: { + AppendElement(value, builder, static_cast(type)->GetBaseType()); + break; + } + + case NMiniKQL::TType::EKind::Type: + case NMiniKQL::TType::EKind::Stream: + case NMiniKQL::TType::EKind::Callable: + case NMiniKQL::TType::EKind::Any: + case NMiniKQL::TType::EKind::Resource: + case NMiniKQL::TType::EKind::Flow: + case NMiniKQL::TType::EKind::ReservedKind: + case NMiniKQL::TType::EKind::Block: + case NMiniKQL::TType::EKind::Pg: + case NMiniKQL::TType::EKind::Multi: + case NMiniKQL::TType::EKind::Linear: { + YQL_ENSURE(false, "Unsupported type: " << type->GetKindAsStr()); + } + } +} + +} // namespace NKikimr::NKqp::NFormats diff --git a/ydb/core/kqp/common/result_set_format/kqp_formats_arrow.h b/ydb/core/kqp/common/result_set_format/kqp_formats_arrow.h new file mode 100644 index 000000000000..a7d9beb44ae9 --- /dev/null +++ 
b/ydb/core/kqp/common/result_set_format/kqp_formats_arrow.h @@ -0,0 +1,175 @@ +#pragma once + +#include + +#include + +/** + * @file kqp_formats_arrow.h + * @brief Utilities for converting MiniKQL types to Apache Arrow types and vice versa. + * + * This module provides a comprehensive mapping between YQL internal type system (MiniKQL) + * and Apache Arrow format. It handles conversion of both simple data types + * (integers, strings, etc.) and complex types (structs, lists, optionals, etc.). + */ + +namespace NKikimr::NKqp::NFormats { + +constexpr size_t MAX_VARIANT_FLATTEN_SIZE = static_cast(arrow::UnionType::kMaxTypeCode) + 1; /**< Alternatives that fit into a single dense union level; variants with more alternatives are split into groups of this size. */ +constexpr size_t MAX_VARIANT_NESTED_SIZE = MAX_VARIANT_FLATTEN_SIZE * MAX_VARIANT_FLATTEN_SIZE; /**< Largest number of Variant alternatives accepted; bigger variants are reported as not Arrow-compatible. */ +constexpr size_t MAX_VARIANT_DEPTH = 2; /**< NOTE(review): not referenced in the visible part of this change; presumably the number of nested union levels - confirm. */ + +/** + * @brief Dispatches MiniKQL data type to corresponding Arrow type via compile-time callback. + * + * This template function provides a type-safe way to map MiniKQL primitive data types + * to their Arrow counterparts. The callback receives the Arrow type as a template parameter, + * allowing for compile-time type dispatch without runtime overhead.
+ * + * Type mapping overview: + * - Integer types: Int8/16/32/64, UInt8/16/32/64 + * - Bool: dispatched together with Uint8 + * - Floating point: Float, Double + * - Temporal types: Date, Datetime, Timestamp, Interval (and their extended variants); each one is dispatched + * together with an integer slot: Date with Uint16, Date32 with Int32, Datetime with Uint32, Timestamp with Uint64, + * Interval, Datetime64, Timestamp64 and Interval64 with Int64 + * - String types: Utf8, Json, JsonDocument (serialized to string), DyNumber (serialized to string) -> arrow::StringType + * - Binary types: String, Yson -> arrow::BinaryType + * - Fixed-size binary: Decimal, Uuid -> arrow::FixedSizeBinaryType + * - Timezone-aware: TzDate, TzDatetime, TzTimestamp, TzDate32, TzDatetime64, TzTimestamp64 -> arrow::StructType + * + * @tparam TFunc Callable type accepting a single template parameter (Arrow type) + * @param typeId The MiniKQL data slot to convert + * @param callback A callable object with signature: template <typename TArrowType> bool operator()() + * @return The value returned by the callback when the slot is handled by the switch, false if no case matches + */ +template +bool SwitchMiniKQLDataTypeToArrowType(NUdf::EDataSlot typeId, TFunc&& callback) { + switch (typeId) { + case NUdf::EDataSlot::Int8: + return callback.template operator()(); + + case NUdf::EDataSlot::Uint8: + case NUdf::EDataSlot::Bool: + return callback.template operator()(); + + case NUdf::EDataSlot::Int16: + return callback.template operator()(); + + case NUdf::EDataSlot::Date: + case NUdf::EDataSlot::Uint16: + return callback.template operator()(); + + case NUdf::EDataSlot::Int32: + case NUdf::EDataSlot::Date32: + return callback.template operator()(); + + case NUdf::EDataSlot::Datetime: + case NUdf::EDataSlot::Uint32: + return callback.template operator()(); + + case NUdf::EDataSlot::Int64: + case NUdf::EDataSlot::Interval: + case NUdf::EDataSlot::Datetime64: + case NUdf::EDataSlot::Timestamp64: + case NUdf::EDataSlot::Interval64: + return callback.template operator()(); + + case NUdf::EDataSlot::Uint64: + case NUdf::EDataSlot::Timestamp: + return callback.template operator()(); + + case NUdf::EDataSlot::Float: + return callback.template operator()(); + + case NUdf::EDataSlot::Double: + return callback.template
operator()(); + + case NUdf::EDataSlot::Utf8: + case NUdf::EDataSlot::Json: + case NUdf::EDataSlot::DyNumber: + case NUdf::EDataSlot::JsonDocument: + return callback.template operator()(); + + case NUdf::EDataSlot::String: + case NUdf::EDataSlot::Yson: + return callback.template operator()(); + + case NUdf::EDataSlot::Decimal: + case NUdf::EDataSlot::Uuid: + return callback.template operator()(); + + case NUdf::EDataSlot::TzDate: + case NUdf::EDataSlot::TzDatetime: + case NUdf::EDataSlot::TzTimestamp: + case NUdf::EDataSlot::TzDate32: + case NUdf::EDataSlot::TzDatetime64: + case NUdf::EDataSlot::TzTimestamp64: + return callback.template operator()(); + } + return false; +} + +/** + * @brief Determines if a type requires wrapping in an external Optional layer. + * + * Some MiniKQL types don't have a native validity bitmap in Arrow representation + * (e.g., Variant, Null). These types need to be wrapped in an additional + * struct layer when used as optional values to properly represent NULL states. + * + * @param type The MiniKQL type to check + * @return true if the type needs external Optional wrapping, false otherwise + * + * @note Types that need wrapping: Void, Null, Variant, Optional, EmptyList, EmptyDict + * @note Kinds that have no Arrow representation (Type, Stream, Callable, Any, Resource, Flow, ReservedKind, + * Block, Pg, Multi, Linear) are rejected by a YQL_ENSURE check in the implementation + */ +bool NeedWrapByExternalOptional(const NMiniKQL::TType* type); + +/** + * @brief Converts a MiniKQL type to its corresponding Arrow DataType. + * + * This function recursively converts complex MiniKQL types (Struct, Tuple, List, Dict, + * Variant, Optional, Tagged) to their Arrow equivalents. The conversion preserves the structure + * and nullability information.
+ * + * Conversion rules: + * - Data types: mapped according to SwitchMiniKQLDataTypeToArrowType + * - Struct/Tuple: converted to arrow::StructType + * - List: converted to arrow::ListType + * - Dict: converted to arrow::ListType of arrow::StructType + * - Variant: converted to arrow::DenseUnionType + * - Optional: nested optionals are flattened and represented via struct wrapping + * - Tagged: converted to inner type + * + * @param type The MiniKQL type to convert + * @return Shared pointer to the corresponding Arrow DataType. The implementation first enforces + * IsArrowCompatible(type) with YQL_ENSURE, so incompatible kinds fail that check instead of + * being mapped to arrow::NullType; arrow::null() is what the Null kind itself converts to + */ +std::shared_ptr GetArrowType(const NMiniKQL::TType* type); + +/** + * @brief Checks if a MiniKQL type can be represented in Arrow format. + * + * Not all MiniKQL types are compatible with Arrow. For example, Callable, Stream, + * and Flow types cannot be represented. This function recursively checks complex + * types (Struct, List, etc.) to ensure all nested types are compatible. + * + * @param type The MiniKQL type to validate + * @return true if the type can be converted to Arrow format, false otherwise + * + * @note Incompatible types: Type, Stream, Callable, Any, Resource, Flow, ReservedKind, Block, Pg, Multi, Linear + * @note A Variant is also reported as incompatible when it has more than MAX_VARIANT_NESTED_SIZE alternatives + * or when its underlying type is neither Struct nor Tuple + */ +bool IsArrowCompatible(const NMiniKQL::TType* type); + +/** + * @brief Appends a MiniKQL UnboxedValue to an Arrow ArrayBuilder. + * + * This function is the core serialization routine for converting MiniKQL values + * to Arrow format. It handles all supported MiniKQL types, including + * complex nested structures, and properly manages NULL values. + * + * The builder must be pre-configured with the correct Arrow type matching the + * provided MiniKQL type. Type mismatches will result in assertion failures.
+ * + * @param value The MiniKQL value to append (may be NULL/empty) + * @param builder The Arrow builder to append to (must match the type) + * @param type The MiniKQL type descriptor for the value + */ +void AppendElement(NUdf::TUnboxedValue value, arrow::ArrayBuilder* builder, const NMiniKQL::TType* type); + +} // namespace NKikimr::NKqp::NFormats diff --git a/ydb/core/kqp/common/result_set_format/kqp_result_set_arrow.cpp b/ydb/core/kqp/common/result_set_format/kqp_result_set_arrow.cpp deleted file mode 100644 index 51db4942a44a..000000000000 --- a/ydb/core/kqp/common/result_set_format/kqp_result_set_arrow.cpp +++ /dev/null @@ -1,1286 +0,0 @@ -#include "kqp_result_set_arrow.h" - -#include -#include -#include -#include -#include -#include - -namespace NKikimr::NKqp::NFormats { - -namespace { - -template -struct TTypeWrapper { - using T = TArrowType; -}; - -/** - * @brief Function to switch MiniKQL DataType correctly and uniformly converting - * it to arrow type using callback - * - * @tparam TFunc Callback type - * @param typeId Type callback work with. - * @param callback Template function of signature (TTypeWrapper) -> bool - * @return Result of execution of callback or false if the type typeId is not - * supported. 
- */ -template -bool SwitchMiniKQLDataTypeToArrowType(NUdf::EDataSlot type, TFunc &&callback) { - switch (type) { - case NUdf::EDataSlot::Int8: - return callback(TTypeWrapper()); - case NUdf::EDataSlot::Uint8: - case NUdf::EDataSlot::Bool: - return callback(TTypeWrapper()); - case NUdf::EDataSlot::Int16: - return callback(TTypeWrapper()); - case NUdf::EDataSlot::Date: - case NUdf::EDataSlot::Uint16: - return callback(TTypeWrapper()); - case NUdf::EDataSlot::Int32: - case NUdf::EDataSlot::Date32: - return callback(TTypeWrapper()); - case NUdf::EDataSlot::Datetime: - case NUdf::EDataSlot::Uint32: - return callback(TTypeWrapper()); - case NUdf::EDataSlot::Int64: - case NUdf::EDataSlot::Interval: - case NUdf::EDataSlot::Datetime64: - case NUdf::EDataSlot::Timestamp64: - case NUdf::EDataSlot::Interval64: - return callback(TTypeWrapper()); - case NUdf::EDataSlot::Uint64: - case NUdf::EDataSlot::Timestamp: - return callback(TTypeWrapper()); - case NUdf::EDataSlot::Float: - return callback(TTypeWrapper()); - case NUdf::EDataSlot::Double: - return callback(TTypeWrapper()); - case NUdf::EDataSlot::Utf8: - case NUdf::EDataSlot::Json: - case NUdf::EDataSlot::DyNumber: - case NUdf::EDataSlot::JsonDocument: - return callback(TTypeWrapper()); - case NUdf::EDataSlot::String: - case NUdf::EDataSlot::Yson: - return callback(TTypeWrapper()); - case NUdf::EDataSlot::Decimal: - case NUdf::EDataSlot::Uuid: - return callback(TTypeWrapper()); - case NUdf::EDataSlot::TzDate: - case NUdf::EDataSlot::TzDatetime: - case NUdf::EDataSlot::TzTimestamp: - case NUdf::EDataSlot::TzDate32: - case NUdf::EDataSlot::TzDatetime64: - case NUdf::EDataSlot::TzTimestamp64: - return callback(TTypeWrapper()); - } -} - -bool NeedWrapByExternalOptional(const NMiniKQL::TType* type) { - switch (type->GetKind()) { - case NMiniKQL::TType::EKind::Void: - case NMiniKQL::TType::EKind::Null: - case NMiniKQL::TType::EKind::Variant: - case NMiniKQL::TType::EKind::Optional: - case NMiniKQL::TType::EKind::EmptyList: - case 
NMiniKQL::TType::EKind::EmptyDict: - return true; - case NMiniKQL::TType::EKind::Data: - case NMiniKQL::TType::EKind::Struct: - case NMiniKQL::TType::EKind::Tuple: - case NMiniKQL::TType::EKind::List: - case NMiniKQL::TType::EKind::Dict: - case NMiniKQL::TType::EKind::Tagged: - return false; - case NMiniKQL::TType::EKind::Type: - case NMiniKQL::TType::EKind::Stream: - case NMiniKQL::TType::EKind::Callable: - case NMiniKQL::TType::EKind::Any: - case NMiniKQL::TType::EKind::Resource: - case NMiniKQL::TType::EKind::Flow: - case NMiniKQL::TType::EKind::ReservedKind: - case NMiniKQL::TType::EKind::Block: - case NMiniKQL::TType::EKind::Pg: - case NMiniKQL::TType::EKind::Multi: - case NMiniKQL::TType::EKind::Linear: - YQL_ENSURE(false, "Unsupported type: " << type->GetKindAsStr()); - return false; - } - return false; -} - -template -std::shared_ptr CreateEmptyArrowImpl(NUdf::EDataSlot slot) { - Y_UNUSED(slot); - return std::make_shared(); -} - -template <> -std::shared_ptr CreateEmptyArrowImpl(NUdf::EDataSlot slot) { - Y_UNUSED(slot); - return arrow::fixed_size_binary(NScheme::FSB_SIZE); -} - -template <> -std::shared_ptr CreateEmptyArrowImpl(NUdf::EDataSlot slot) { - std::shared_ptr type; - switch (slot) { - case NUdf::EDataSlot::TzDate: - type = NYql::NUdf::MakeTzLayoutArrowType(); - break; - case NUdf::EDataSlot::TzDatetime: - type = NYql::NUdf::MakeTzLayoutArrowType(); - break; - case NUdf::EDataSlot::TzTimestamp: - type = NYql::NUdf::MakeTzLayoutArrowType(); - break; - case NUdf::EDataSlot::TzDate32: - type = NYql::NUdf::MakeTzLayoutArrowType(); - break; - case NUdf::EDataSlot::TzDatetime64: - type = NYql::NUdf::MakeTzLayoutArrowType(); - break; - case NUdf::EDataSlot::TzTimestamp64: - type = NYql::NUdf::MakeTzLayoutArrowType(); - break; - default: - YQL_ENSURE(false, "Unexpected timezone datetime slot"); - return std::make_shared(); - } - - std::vector> fields{ - std::make_shared("datetime", type, false), - std::make_shared("timezone", arrow::utf8(), false), - }; - 
return arrow::struct_(fields); -} - -std::shared_ptr GetArrowType(const NMiniKQL::TDataType* dataType) { - std::shared_ptr result; - bool success = SwitchMiniKQLDataTypeToArrowType(*dataType->GetDataSlot().Get(), - [&](TTypeWrapper typeHolder) { - Y_UNUSED(typeHolder); - result = CreateEmptyArrowImpl(*dataType->GetDataSlot().Get()); - return true; - }); - if (success) { - return result; - } - return std::make_shared(); -} - -std::shared_ptr GetArrowType(const NMiniKQL::TStructType* structType) { - std::vector> fields; - fields.reserve(structType->GetMembersCount()); - for (ui32 index = 0; index < structType->GetMembersCount(); ++index) { - auto memberType = structType->GetMemberType(index); - auto memberName = std::string(structType->GetMemberName(index)); - auto memberArrowType = NFormats::GetArrowType(memberType); - - fields.emplace_back(std::make_shared(memberName, memberArrowType, memberType->IsOptional())); - } - return arrow::struct_(fields); -} - -std::shared_ptr GetArrowType(const NMiniKQL::TTupleType* tupleType) { - std::vector> fields; - fields.reserve(tupleType->GetElementsCount()); - for (ui32 index = 0; index < tupleType->GetElementsCount(); ++index) { - auto elementName = std::string("field" + ToString(index)); - auto elementType = tupleType->GetElementType(index); - auto elementArrowType = NFormats::GetArrowType(elementType); - - fields.push_back(std::make_shared(elementName, elementArrowType, elementType->IsOptional())); - } - return arrow::struct_(fields); -} - -std::shared_ptr GetArrowType(const NMiniKQL::TListType* listType) { - auto itemType = listType->GetItemType(); - auto itemArrowType = NFormats::GetArrowType(itemType); - auto field = std::make_shared("item", itemArrowType, itemType->IsOptional()); - return arrow::list(field); -} - -std::shared_ptr GetArrowType(const NMiniKQL::TDictType* dictType) { - auto keyType = dictType->GetKeyType(); - auto payloadType = dictType->GetPayloadType(); - - auto keyArrowType = 
NFormats::GetArrowType(keyType); - auto payloadArrowType = NFormats::GetArrowType(payloadType); - - auto custom =std::make_shared("custom", arrow::uint64(), false); - - if (keyType->GetKind() == NMiniKQL::TType::EKind::Optional) { - std::vector> items; - items.emplace_back(std::make_shared("key", keyArrowType, true)); - items.emplace_back(std::make_shared("payload", payloadArrowType, payloadType->IsOptional())); - - auto fieldMap = std::make_shared("map", arrow::list(arrow::struct_(items)), false); - return arrow::struct_({fieldMap, custom}); - } - - auto fieldMap = std::make_shared("map", arrow::map(keyArrowType, payloadArrowType), false); - return arrow::struct_({fieldMap, custom}); -} - -std::shared_ptr GetArrowType(const NMiniKQL::TVariantType* variantType) { - NMiniKQL::TType* innerType = variantType->GetUnderlyingType(); - arrow::FieldVector types; - NMiniKQL::TStructType* structType = nullptr; - NMiniKQL::TTupleType* tupleType = nullptr; - - if (innerType->IsStruct()) { - structType = static_cast(innerType); - } else { - YQL_ENSURE(innerType->IsTuple(), "Unexpected underlying variant type: " << innerType->GetKindAsStr()); - tupleType = static_cast(innerType); - } - - // Create Union of unions if there are more types then arrow::dense_union supports. - if (variantType->GetAlternativesCount() > arrow::UnionType::kMaxTypeCode) { - ui32 numberOfGroups = (variantType->GetAlternativesCount() - 1) / arrow::UnionType::kMaxTypeCode + 1; - types.reserve(numberOfGroups); - - for (ui32 groupIndex = 0; groupIndex < numberOfGroups; ++groupIndex) { - ui32 beginIndex = groupIndex * arrow::UnionType::kMaxTypeCode; - ui32 endIndex = std::min((groupIndex + 1) * arrow::UnionType::kMaxTypeCode, variantType->GetAlternativesCount()); - - arrow::FieldVector groupTypes; - groupTypes.reserve(endIndex - beginIndex); - - for (ui32 index = beginIndex; index < endIndex; ++index) { - auto itemName = (structType == nullptr) - ? 
std::string("field" + ToString(index)) - : std::string(structType->GetMemberName(index)); - auto itemType = (structType == nullptr) - ? tupleType->GetElementType(index) - : structType->GetMemberType(index); - auto itemArrowType = NFormats::GetArrowType(itemType); - - groupTypes.emplace_back(std::make_shared( itemName, itemArrowType, itemType->IsOptional())); - } - - auto fieldName = std::string("field" + ToString(groupIndex)); - types.emplace_back(std::make_shared(fieldName, arrow::dense_union(groupTypes), false)); - } - - return arrow::dense_union(types); - } - - // Else put all types in one arrow::dense_union - types.reserve(variantType->GetAlternativesCount()); - for (ui32 index = 0; index < variantType->GetAlternativesCount(); ++index) { - auto itemName = (structType == nullptr) - ? std::string("field" + ToString(index)) - : std::string(structType->GetMemberName(index)); - auto itemType = (structType == nullptr) ? tupleType->GetElementType(index) : structType->GetMemberType(index); - auto itemArrowType = NFormats::GetArrowType(itemType); - - types.emplace_back(std::make_shared(itemName, itemArrowType, itemType->IsOptional())); - } - - return arrow::dense_union(types); -} - -std::shared_ptr GetArrowType(const NMiniKQL::TOptionalType* optionalType) { - auto currentType = optionalType->GetItemType(); - ui32 depth = 1; - - while (currentType->IsOptional()) { - currentType = static_cast(currentType)->GetItemType(); - ++depth; - } - - if (NeedWrapByExternalOptional(currentType)) { - ++depth; - } - - std::shared_ptr innerArrowType = NFormats::GetArrowType(currentType); - - for (ui32 i = 1; i < depth; ++i) { - auto field = std::make_shared("opt", innerArrowType, false); - innerArrowType = std::make_shared(std::vector>{field}); - } - - return innerArrowType; -} - -template -void AppendDataValue(arrow::ArrayBuilder* builder, NUdf::TUnboxedValue value, NUdf::EDataSlot dataSlot) { - Y_UNUSED(dataSlot); - auto typedBuilder = reinterpret_cast::BuilderType*>(builder); - 
arrow::Status status; - if (!value.HasValue()) { - status = typedBuilder->AppendNull(); - } else { - status = typedBuilder->Append(value.Get()); - } - YQL_ENSURE(status.ok(), "Failed to append data value: " << status.ToString()); -} - -template <> -void AppendDataValue(arrow::ArrayBuilder* builder, NUdf::TUnboxedValue value, NUdf::EDataSlot dataSlot) { - Y_UNUSED(dataSlot); - YQL_ENSURE(builder->type()->id() == arrow::Type::UINT64, "Unexpected builder type"); - auto typedBuilder = reinterpret_cast(builder); - arrow::Status status; - if (!value.HasValue()) { - status = typedBuilder->AppendNull(); - } else { - status = typedBuilder->Append(value.Get()); - } - YQL_ENSURE(status.ok(), "Failed to append data value: " << status.ToString()); -} - -template <> -void AppendDataValue(arrow::ArrayBuilder* builder, NUdf::TUnboxedValue value, NUdf::EDataSlot dataSlot) { - Y_UNUSED(dataSlot); - YQL_ENSURE(builder->type()->id() == arrow::Type::INT64, "Unexpected builder type"); - auto typedBuilder = reinterpret_cast(builder); - arrow::Status status; - if (!value.HasValue()) { - status = typedBuilder->AppendNull(); - } else { - status = typedBuilder->Append(value.Get()); - } - YQL_ENSURE(status.ok(), "Failed to append data value: " << status.ToString()); -} - -template <> -void AppendDataValue(arrow::ArrayBuilder* builder, NUdf::TUnboxedValue value, NUdf::EDataSlot dataSlot) { - YQL_ENSURE(builder->type()->id() == arrow::Type::STRING, "Unexpected builder type"); - auto typedBuilder = reinterpret_cast(builder); - arrow::Status status; - if (!value.HasValue()) { - status = typedBuilder->AppendNull(); - } else { - switch (dataSlot) { - case NUdf::EDataSlot::Utf8: - case NUdf::EDataSlot::Json: { - auto data = value.AsStringRef(); - status = typedBuilder->Append(data.Data(), data.Size()); - break; - } - - case NUdf::EDataSlot::JsonDocument: { - YQL_ENSURE(NBinaryJson::IsValidBinaryJson(value.AsStringRef())); - auto textJson = NBinaryJson::SerializeToJson(value.AsStringRef()); - status 
= typedBuilder->Append(textJson.data(), textJson.size()); - break; - } - - case NUdf::EDataSlot::DyNumber: { - auto number = NDyNumber::DyNumberToString(value.AsStringRef()); - YQL_ENSURE(number.Defined(), "Failed to convert DyNumber to string"); - status = typedBuilder->Append(number->data(), number->size()); - break; - } - - default: { - YQL_ENSURE(false, "Unexpected data slot"); - } - } - } - YQL_ENSURE(status.ok(), "Failed to append data value: " << status.ToString()); -} - -template <> -void AppendDataValue(arrow::ArrayBuilder* builder, NUdf::TUnboxedValue value, NUdf::EDataSlot dataSlot) { - Y_UNUSED(dataSlot); - YQL_ENSURE(builder->type()->id() == arrow::Type::BINARY, "Unexpected builder type"); - auto typedBuilder = reinterpret_cast(builder); - arrow::Status status; - if (!value.HasValue()) { - status = typedBuilder->AppendNull(); - } else { - auto data = value.AsStringRef(); - status = typedBuilder->Append(data.Data(), data.Size()); - } - YQL_ENSURE(status.ok(), "Failed to append data value: " << status.ToString()); -} - -// Only for timezone datetime types -template <> -void AppendDataValue(arrow::ArrayBuilder* builder, NUdf::TUnboxedValue value, NUdf::EDataSlot dataSlot) { - Y_UNUSED(dataSlot); - YQL_ENSURE(builder->type()->id() == arrow::Type::STRUCT, "Unexpected builder type"); - auto typedBuilder = reinterpret_cast(builder); - YQL_ENSURE(typedBuilder->num_fields() == 2, "StructBuilder of timezone datetime types should have 2 fields"); - - if (!value.HasValue()) { - auto status = typedBuilder->AppendNull(); - YQL_ENSURE(status.ok(), "Failed to append data value: " << status.ToString()); - return; - } - - auto status = typedBuilder->Append(); - YQL_ENSURE(status.ok(), "Failed to append data value: " << status.ToString()); - - auto datetimeArray = typedBuilder->field_builder(0); - auto timezoneArray = reinterpret_cast(typedBuilder->field_builder(1)); - - switch (dataSlot) { - case NUdf::EDataSlot::TzDate: { - YQL_ENSURE(datetimeArray->type()->id() == 
arrow::Type::UINT16); - status = reinterpret_cast(datetimeArray)->Append(value.Get()); - break; - } - - case NUdf::EDataSlot::TzDatetime: { - YQL_ENSURE(datetimeArray->type()->id() == arrow::Type::UINT32); - status = reinterpret_cast(datetimeArray)->Append(value.Get()); - break; - } - - case NUdf::EDataSlot::TzTimestamp: { - YQL_ENSURE(datetimeArray->type()->id() == arrow::Type::UINT64); - status = reinterpret_cast(datetimeArray)->Append(value.Get()); - break; - } - - case NUdf::EDataSlot::TzDate32: { - YQL_ENSURE(datetimeArray->type()->id() == arrow::Type::INT32); - status = reinterpret_cast(datetimeArray)->Append(value.Get()); - break; - } - - case NUdf::EDataSlot::TzDatetime64: - case NUdf::EDataSlot::TzTimestamp64: { - YQL_ENSURE(datetimeArray->type()->id() == arrow::Type::INT64); - status = reinterpret_cast(datetimeArray)->Append(value.Get()); - break; - } - - default: { - YQL_ENSURE(false, "Unexpected timezone datetime slot"); - return; - } - } - YQL_ENSURE(status.ok(), "Failed to append data value: " << status.ToString()); - - auto tzName = NMiniKQL::GetTimezoneIANAName(value.GetTimezoneId()); - status = timezoneArray->Append(tzName.Data(), tzName.size()); - YQL_ENSURE(status.ok(), "Failed to append data value: " << status.ToString()); -} - -template <> -void AppendDataValue(arrow::ArrayBuilder* builder, NUdf::TUnboxedValue value, NUdf::EDataSlot dataSlot) { - YQL_ENSURE(builder->type()->id() == arrow::Type::FIXED_SIZE_BINARY, "Unexpected builder type"); - auto typedBuilder = reinterpret_cast(builder); - arrow::Status status; - - if (!value.HasValue()) { - status = typedBuilder->AppendNull(); - } else { - if (dataSlot == NUdf::EDataSlot::Uuid) { - auto data = value.AsStringRef(); - status = typedBuilder->Append(data.Data()); - } else if (dataSlot == NUdf::EDataSlot::Decimal) { - auto intVal = value.GetInt128(); - status = typedBuilder->Append(reinterpret_cast(&intVal)); - } else { - YQL_ENSURE(false, "Unexpected data slot"); - } - } - YQL_ENSURE(status.ok(), 
"Failed to append data value: " << status.ToString()); -} - -} // namespace - -std::shared_ptr GetArrowType(const NMiniKQL::TType* type) { - switch (type->GetKind()) { - case NMiniKQL::TType::EKind::Null: { - return arrow::null(); - } - - case NMiniKQL::TType::EKind::Void: - case NMiniKQL::TType::EKind::EmptyList: - case NMiniKQL::TType::EKind::EmptyDict: { - return arrow::struct_({}); - } - - case NMiniKQL::TType::EKind::Data: { - auto dataType = static_cast(type); - return GetArrowType(dataType); - } - - case NMiniKQL::TType::EKind::Struct: { - auto structType = static_cast(type); - return GetArrowType(structType); - } - - case NMiniKQL::TType::EKind::Tuple: { - auto tupleType = static_cast(type); - return GetArrowType(tupleType); - } - - case NMiniKQL::TType::EKind::Optional: { - auto optionalType = static_cast(type); - return GetArrowType(optionalType); - } - - case NMiniKQL::TType::EKind::List: { - auto listType = static_cast(type); - return GetArrowType(listType); - } - - case NMiniKQL::TType::EKind::Dict: { - auto dictType = static_cast(type); - return GetArrowType(dictType); - } - - case NMiniKQL::TType::EKind::Variant: { - auto variantType = static_cast(type); - return GetArrowType(variantType); - } - - default: { - YQL_ENSURE(false, "Unsupported type: " << type->GetKindAsStr()); - } - } - return arrow::null(); -} - -bool IsArrowCompatible(const NKikimr::NMiniKQL::TType* type) { - switch (type->GetKind()) { - case NMiniKQL::TType::EKind::Void: - case NMiniKQL::TType::EKind::Null: - case NMiniKQL::TType::EKind::EmptyList: - case NMiniKQL::TType::EKind::EmptyDict: - case NMiniKQL::TType::EKind::Data: { - return true; - } - - case NMiniKQL::TType::EKind::Struct: { - auto structType = static_cast(type); - bool isCompatible = true; - for (ui32 index = 0; index < structType->GetMembersCount(); ++index) { - auto memberType = structType->GetMemberType(index); - isCompatible = isCompatible && IsArrowCompatible(memberType); - } - return isCompatible; - } - - case 
NMiniKQL::TType::EKind::Tuple: { - auto tupleType = static_cast(type); - bool isCompatible = true; - for (ui32 index = 0; index < tupleType->GetElementsCount(); ++index) { - auto elementType = tupleType->GetElementType(index); - isCompatible = isCompatible && IsArrowCompatible(elementType); - } - return isCompatible; - } - - case NMiniKQL::TType::EKind::Optional: { - auto optionalType = static_cast(type); - return IsArrowCompatible(optionalType->GetItemType()); - } - - case NMiniKQL::TType::EKind::List: { - auto listType = static_cast(type); - auto itemType = listType->GetItemType(); - return IsArrowCompatible(itemType); - } - - case NMiniKQL::TType::EKind::Variant: { - auto variantType = static_cast(type); - ui32 maxTypesCount = (arrow::UnionType::kMaxTypeCode + 1) * (arrow::UnionType::kMaxTypeCode + 1); - if (variantType->GetAlternativesCount() > maxTypesCount) { - return false; - } - - NMiniKQL::TType* innerType = variantType->GetUnderlyingType(); - if (innerType->IsStruct() || innerType->IsTuple()) { - return IsArrowCompatible(innerType); - } - - YQL_ENSURE(false, "Unexpected underlying variant type: " << innerType->GetKindAsStr()); - return false; - } - - case NMiniKQL::TType::EKind::Dict: { - auto dictType = static_cast(type); - auto keyType = dictType->GetKeyType(); - auto payloadType = dictType->GetPayloadType(); - return IsArrowCompatible(keyType) && IsArrowCompatible(payloadType); - } - - case NMiniKQL::TType::EKind::Tagged: { - auto taggedType = static_cast(type); - return IsArrowCompatible(taggedType->GetBaseType()); - } - - case NMiniKQL::TType::EKind::Type: - case NMiniKQL::TType::EKind::Stream: - case NMiniKQL::TType::EKind::Callable: - case NMiniKQL::TType::EKind::Any: - case NMiniKQL::TType::EKind::Resource: - case NMiniKQL::TType::EKind::Flow: - case NMiniKQL::TType::EKind::ReservedKind: - case NMiniKQL::TType::EKind::Block: - case NMiniKQL::TType::EKind::Pg: - case NMiniKQL::TType::EKind::Multi: - case NMiniKQL::TType::EKind::Linear: { - return 
false; - } - } - return true; -} - -void AppendElement(NUdf::TUnboxedValue value, arrow::ArrayBuilder* builder, const NMiniKQL::TType* type) { - switch (type->GetKind()) { - case NMiniKQL::TType::EKind::Null: { - YQL_ENSURE(builder->type()->id() == arrow::Type::NA, "Unexpected builder type"); - auto status = builder->AppendNull(); - YQL_ENSURE(status.ok(), "Failed to append null value: " << status.ToString()); - break; - } - - case NMiniKQL::TType::EKind::Void: - case NMiniKQL::TType::EKind::EmptyList: - case NMiniKQL::TType::EKind::EmptyDict: { - YQL_ENSURE(builder->type()->id() == arrow::Type::STRUCT, "Unexpected builder type"); - auto structBuilder = reinterpret_cast(builder); - auto status = structBuilder->Append(); - YQL_ENSURE(status.ok(), "Failed to append struct value of a singular type: " << status.ToString()); - break; - } - - case NMiniKQL::TType::EKind::Data: { - auto dataType = static_cast(type); - auto slot = *dataType->GetDataSlot().Get(); - bool success = SwitchMiniKQLDataTypeToArrowType( slot, [&](TTypeWrapper typeHolder) { - Y_UNUSED(typeHolder); - AppendDataValue(builder, value, slot); - return true; - }); - YQL_ENSURE(success, "Failed to append data value to arrow builder"); - break; - } - - case NMiniKQL::TType::EKind::Optional: { - auto innerType = static_cast(type)->GetItemType(); - ui32 depth = 1; - - while (innerType->IsOptional()) { - innerType = static_cast(innerType) ->GetItemType(); - ++depth; - } - - if (NeedWrapByExternalOptional(innerType)) { - ++depth; - } - - auto innerBuilder = builder; - auto innerValue = value; - - for (ui32 i = 1; i < depth; ++i) { - YQL_ENSURE(innerBuilder->type()->id() == arrow::Type::STRUCT, "Unexpected builder type"); - auto structBuilder = reinterpret_cast(innerBuilder); - YQL_ENSURE(structBuilder->num_fields() == 1, "Unexpected number of fields"); - - if (!innerValue) { - auto status = innerBuilder->AppendNull(); - YQL_ENSURE(status.ok(), "Failed to append null optional value: " << status.ToString()); - 
return; - } - - auto status = structBuilder->Append(); - YQL_ENSURE(status.ok(), "Failed to append optional value: " << status.ToString()); - - innerValue = innerValue.GetOptionalValue(); - innerBuilder = structBuilder->field_builder(0); - } - - if (innerValue) { - AppendElement(innerValue.GetOptionalValue(), innerBuilder, innerType); - } else { - auto status = innerBuilder->AppendNull(); - YQL_ENSURE(status.ok(), "Failed to append null optional value: " << status.ToString()); - } - break; - } - - case NMiniKQL::TType::EKind::List: { - auto listType = static_cast(type); - auto itemType = listType->GetItemType(); - - YQL_ENSURE(builder->type()->id() == arrow::Type::LIST, "Unexpected builder type"); - auto listBuilder = reinterpret_cast(builder); - - auto status = listBuilder->Append(); - YQL_ENSURE(status.ok(), "Failed to append list value: " << status.ToString()); - - auto innerBuilder = listBuilder->value_builder(); - if (auto item = value.GetElements()) { - auto length = value.GetListLength(); - while (length > 0) { - AppendElement(*item++, innerBuilder, itemType); - --length; - } - } else { - const auto iter = value.GetListIterator(); - for (NUdf::TUnboxedValue item; iter.Next(item);) { - AppendElement(item, innerBuilder, itemType); - } - } - break; - } - - case NMiniKQL::TType::EKind::Struct: { - auto structType = static_cast(type); - - YQL_ENSURE(builder->type()->id() == arrow::Type::STRUCT, "Unexpected builder type"); - auto structBuilder = reinterpret_cast(builder); - - auto status = structBuilder->Append(); - YQL_ENSURE(status.ok(), "Failed to append struct value: " << status.ToString()); - - YQL_ENSURE(static_cast(structBuilder->num_fields()) == structType->GetMembersCount(), "Unexpected number of fields"); - for (ui32 index = 0; index < structType->GetMembersCount(); ++index) { - auto innerBuilder = structBuilder->field_builder(index); - auto memberType = structType->GetMemberType(index); - AppendElement(value.GetElement(index), innerBuilder, memberType); 
- } - break; - } - - case NMiniKQL::TType::EKind::Tuple: { - auto tupleType = static_cast(type); - - YQL_ENSURE(builder->type()->id() == arrow::Type::STRUCT, "Unexpected builder type"); - auto structBuilder = reinterpret_cast(builder); - - auto status = structBuilder->Append(); - YQL_ENSURE(status.ok(), "Failed to append tuple value: " << status.ToString()); - - YQL_ENSURE(static_cast(structBuilder->num_fields()) == tupleType->GetElementsCount(), "Unexpected number of fields"); - for (ui32 index = 0; index < tupleType->GetElementsCount(); ++index) { - auto innerBuilder = structBuilder->field_builder(index); - auto elementType = tupleType->GetElementType(index); - AppendElement(value.GetElement(index), innerBuilder, elementType); - } - break; - } - - case NMiniKQL::TType::EKind::Dict: { - auto dictType = static_cast(type); - auto keyType = dictType->GetKeyType(); - auto payloadType = dictType->GetPayloadType(); - - arrow::ArrayBuilder* keyBuilder = nullptr; - arrow::ArrayBuilder* itemBuilder = nullptr; - arrow::StructBuilder* structBuilder = nullptr; - - YQL_ENSURE(builder->type()->id() == arrow::Type::STRUCT, "Unexpected builder type"); - arrow::StructBuilder* wrapBuilder = reinterpret_cast(builder); - YQL_ENSURE(wrapBuilder->num_fields() == 2, "Unexpected number of fields"); - - auto status = wrapBuilder->Append(); - YQL_ENSURE(status.ok(), "Failed to append dict value: " << status.ToString()); - - if (keyType->GetKind() == NMiniKQL::TType::EKind::Optional) { - YQL_ENSURE(wrapBuilder->field_builder(0)->type()->id() == arrow::Type::LIST, "Unexpected builder type"); - auto listBuilder = reinterpret_cast(wrapBuilder->field_builder(0)); - - auto status = listBuilder->Append(); - YQL_ENSURE(status.ok(), "Failed to append dict value: " << status.ToString()); - - YQL_ENSURE(listBuilder->value_builder()->type()->id() == arrow::Type::STRUCT, "Unexpected builder type"); - structBuilder = reinterpret_cast( - listBuilder->value_builder()); - 
YQL_ENSURE(structBuilder->num_fields() == 2, "Unexpected number of fields"); - - keyBuilder = structBuilder->field_builder(0); - itemBuilder = structBuilder->field_builder(1); - } else { - YQL_ENSURE(wrapBuilder->field_builder(0)->type()->id() == arrow::Type::MAP, "Unexpected builder type"); - auto mapBuilder = reinterpret_cast(wrapBuilder->field_builder(0)); - - auto status = mapBuilder->Append(); - YQL_ENSURE(status.ok(), "Failed to append dict value: " << status.ToString()); - - keyBuilder = mapBuilder->key_builder(); - itemBuilder = mapBuilder->item_builder(); - } - - arrow::UInt64Builder* customBuilder = reinterpret_cast(wrapBuilder->field_builder(1)); - status = customBuilder->Append(0); - YQL_ENSURE(status.ok(), "Failed to append dict value: " << status.ToString()); - - // We do not sort dictionary before appending it to builder. - const auto iter = value.GetDictIterator(); - for (NUdf::TUnboxedValue key, payload; iter.NextPair(key, payload);) { - if (structBuilder != nullptr) { - status = structBuilder->Append(); - YQL_ENSURE(status.ok(), "Failed to append dict value: " << status.ToString()); - } - - AppendElement(key, keyBuilder, keyType); - AppendElement(payload, itemBuilder, payloadType); - } - break; - } - - case NMiniKQL::TType::EKind::Variant: { - // TODO Need to properly convert variants containing more than 127*127 - // types? 
- auto variantType = static_cast(type); - - YQL_ENSURE(builder->type()->id() == arrow::Type::DENSE_UNION, "Unexpected builder type"); - auto unionBuilder = reinterpret_cast(builder); - - ui32 variantIndex = value.GetVariantIndex(); - NMiniKQL::TType* innerType = variantType->GetUnderlyingType(); - - if (innerType->IsStruct()) { - innerType = static_cast(innerType)->GetMemberType(variantIndex); - } else { - YQL_ENSURE(innerType->IsTuple(), "Unexpected underlying variant type: " << innerType->GetKindAsStr()); - innerType = static_cast(innerType)->GetElementType(variantIndex); - } - - if (variantType->GetAlternativesCount() > arrow::UnionType::kMaxTypeCode) { - ui32 numberOfGroups = (variantType->GetAlternativesCount() - 1) / arrow::UnionType::kMaxTypeCode + 1; - YQL_ENSURE(static_cast(unionBuilder->num_children()) == numberOfGroups, "Unexpected variant number of groups"); - - ui32 groupIndex = variantIndex / arrow::UnionType::kMaxTypeCode; - auto status = unionBuilder->Append(groupIndex); - YQL_ENSURE(status.ok(), "Failed to append variant value: " << status.ToString()); - - auto innerBuilder = unionBuilder->child_builder(groupIndex); - YQL_ENSURE(innerBuilder->type()->id() == arrow::Type::DENSE_UNION, "Unexpected builder type"); - auto innerUnionBuilder = reinterpret_cast(innerBuilder.get()); - - ui32 innerVariantIndex = variantIndex % arrow::UnionType::kMaxTypeCode; - status = innerUnionBuilder->Append(innerVariantIndex); - YQL_ENSURE(status.ok(), "Failed to append variant value: " << status.ToString()); - - auto doubleInnerBuilder = innerUnionBuilder->child_builder(innerVariantIndex); - AppendElement(value.GetVariantItem(), doubleInnerBuilder.get(), innerType); - } else { - auto status = unionBuilder->Append(variantIndex); - YQL_ENSURE(status.ok(), "Failed to append variant value: " << status.ToString()); - - auto innerBuilder = unionBuilder->child_builder(variantIndex); - AppendElement(value.GetVariantItem(), innerBuilder.get(), innerType); - } - break; - } - - 
default: { - YQL_ENSURE(false, "Unsupported type: " << type->GetKindAsStr()); - } - } -} - -namespace NTestUtils { - -namespace { - -template -NUdf::TUnboxedValue GetUnboxedValue(std::shared_ptr column, ui32 row, NUdf::EDataSlot dataSlot) { - Y_UNUSED(dataSlot); - using TArrayType = typename arrow::TypeTraits::ArrayType; - auto array = std::static_pointer_cast(column); - return NUdf::TUnboxedValuePod(static_cast(array->Value(row))); -} - -template <> // For darwin build -NUdf::TUnboxedValue GetUnboxedValue(std::shared_ptr column, ui32 row, NUdf::EDataSlot dataSlot) { - Y_UNUSED(dataSlot); - auto array = std::static_pointer_cast(column); - return NUdf::TUnboxedValuePod(static_cast(array->Value(row))); -} - -template <> // For darwin build -NUdf::TUnboxedValue GetUnboxedValue(std::shared_ptr column, ui32 row, NUdf::EDataSlot dataSlot) { - Y_UNUSED(dataSlot); - auto array = std::static_pointer_cast(column); - return NUdf::TUnboxedValuePod(static_cast(array->Value(row))); -} - -template <> -NUdf::TUnboxedValue GetUnboxedValue(std::shared_ptr column, ui32 row, NUdf::EDataSlot dataSlot) { - auto array = std::static_pointer_cast(column); - YQL_ENSURE(array->num_fields() == 2, "StructArray of some TzDate type should have 2 fields"); - - auto datetimeArray = array->field(0); - auto timezoneArray = std::static_pointer_cast(array->field(1)); - - NUdf::TUnboxedValuePod value; - auto typeId = datetimeArray->type_id(); - - switch (dataSlot) { - case NUdf::EDataSlot::TzDate: { - YQL_ENSURE(typeId == arrow::Type::UINT16); - value = NUdf::TUnboxedValuePod(static_cast( - std::static_pointer_cast(datetimeArray)->Value(row))); - break; - } - - case NUdf::EDataSlot::TzDatetime: { - YQL_ENSURE(typeId == arrow::Type::UINT32); - value = NUdf::TUnboxedValuePod(static_cast( - std::static_pointer_cast(datetimeArray)->Value(row))); - break; - } - - case NUdf::EDataSlot::TzTimestamp: { - YQL_ENSURE(typeId == arrow::Type::UINT64); - value = NUdf::TUnboxedValuePod(static_cast( - 
std::static_pointer_cast(datetimeArray)->Value(row))); - break; - } - - case NUdf::EDataSlot::TzDate32: { - YQL_ENSURE(typeId == arrow::Type::INT32); - value = NUdf::TUnboxedValuePod(static_cast( - std::static_pointer_cast(datetimeArray)->Value(row))); - break; - } - - case NUdf::EDataSlot::TzDatetime64: - case NUdf::EDataSlot::TzTimestamp64: { - YQL_ENSURE(typeId == arrow::Type::INT64); - value = NUdf::TUnboxedValuePod(static_cast( - std::static_pointer_cast(datetimeArray)->Value(row))); - break; - } - - default: { - YQL_ENSURE(false, "Unexpected timezone datetime data type"); - return NUdf::TUnboxedValuePod(); - } - } - - auto view = timezoneArray->Value(row); - value.SetTimezoneId(NMiniKQL::GetTimezoneId(NUdf::TStringRef(view.data(), view.size()))); - return value; -} - -template <> -NUdf::TUnboxedValue GetUnboxedValue(std::shared_ptr column, ui32 row, NUdf::EDataSlot dataSlot) { - Y_UNUSED(dataSlot); - auto array = std::static_pointer_cast(column); - auto data = array->GetView(row); - return NMiniKQL::MakeString(NUdf::TStringRef(data.data(), data.size())); -} - -template <> -NUdf::TUnboxedValue GetUnboxedValue(std::shared_ptr column, ui32 row, NUdf::EDataSlot dataSlot) { - auto array = std::static_pointer_cast(column); - auto data = array->GetView(row); - - switch (dataSlot) { - case NUdf::EDataSlot::Utf8: - case NUdf::EDataSlot::Json: { - return NMiniKQL::MakeString(NUdf::TStringRef(data.data(), data.size())); - } - - case NUdf::EDataSlot::JsonDocument: { - auto variant = NBinaryJson::SerializeToBinaryJson(TStringBuf(data.data(), data.size())); - if (std::holds_alternative(variant)) { - const auto& json = std::get(variant); - return NMiniKQL::MakeString(NUdf::TStringRef(json.Data(), json.Size())); - } - - YQL_ENSURE(false, "Cannot serialize to binary json"); - break; - } - - case NUdf::EDataSlot::DyNumber: { - auto number = NDyNumber::ParseDyNumberString(TStringBuf(data.data(), data.size())); - if (number.Defined()) { - return NMiniKQL::MakeString(*number); - 
} - - YQL_ENSURE(false, "Failed to convert string to DyNumber"); - break; - } - - default: { - YQL_ENSURE(false, "Unexpected data slot"); - } - } - return NUdf::TUnboxedValuePod(); -} - -template <> -NUdf::TUnboxedValue GetUnboxedValue(std::shared_ptr column, ui32 row, NUdf::EDataSlot dataSlot) { - auto array = std::static_pointer_cast(column); - auto data = array->GetView(row); - - switch (dataSlot) { - case NUdf::EDataSlot::Uuid: { - return NMiniKQL::MakeString(NUdf::TStringRef(data.data(), data.size())); - } - - case NUdf::EDataSlot::Decimal: { - NYql::NDecimal::TInt128 value; - std::memcpy(&value, data.data(), data.size()); - return NUdf::TUnboxedValuePod(value); - } - - default: { - YQL_ENSURE(false, "Unexpected data slot"); - } - } - return NUdf::TUnboxedValuePod(); -} - -} // namespace - -std::unique_ptr MakeArrowBuilder(const NMiniKQL::TType* type) { - auto arrayType = GetArrowType(type); - std::unique_ptr builder; - auto status = arrow::MakeBuilder(arrow::default_memory_pool(), arrayType, &builder); - YQL_ENSURE(status.ok(), "Failed to make arrow builder: " << status.ToString()); - return builder; -} - -std::shared_ptr MakeArray(NMiniKQL::TUnboxedValueVector& values, const NMiniKQL::TType* itemType) { - auto builder = MakeArrowBuilder(itemType); - auto status = builder->Reserve(values.size()); - YQL_ENSURE(status.ok(), "Failed to reserve space for array: " << status.ToString()); - for (auto& value : values) { - AppendElement(value, builder.get(), itemType); - } - std::shared_ptr result; - status = builder->Finish(&result); - YQL_ENSURE(status.ok(), "Failed to finish array: " << status.ToString()); - return result; -} - -NUdf::TUnboxedValue ExtractUnboxedValue(const std::shared_ptr& array, ui64 row, const NMiniKQL::TType* itemType, - const NMiniKQL::THolderFactory& holderFactory) -{ - if (array->IsNull(row)) { - return NUdf::TUnboxedValuePod(); - } - - switch (itemType->GetKind()) { - case NMiniKQL::TType::EKind::Void: - case NMiniKQL::TType::EKind::Null: - 
case NMiniKQL::TType::EKind::EmptyList: - case NMiniKQL::TType::EKind::EmptyDict: { - break; - } - - case NMiniKQL::TType::EKind::Data: { - auto dataType = static_cast(itemType); - NUdf::TUnboxedValue result; - auto dataSlot = *dataType->GetDataSlot().Get(); - bool success = SwitchMiniKQLDataTypeToArrowType(dataSlot, - [&](TTypeWrapper typeHolder) { - Y_UNUSED(typeHolder); - result = GetUnboxedValue(array, row, dataSlot); - return true; - }); - YQL_ENSURE(success, "Failed to extract unboxed value from arrow array"); - return result; - } - - case NMiniKQL::TType::EKind::Struct: { - auto structType = static_cast(itemType); - - YQL_ENSURE(array->type_id() == arrow::Type::STRUCT, "Unexpected array type"); - auto typedArray = static_pointer_cast(array); - YQL_ENSURE(static_cast(typedArray->num_fields()) == structType->GetMembersCount(), "Unexpected count of fields"); - - NUdf::TUnboxedValue* itemsPtr = nullptr; - auto result = holderFactory.CreateDirectArrayHolder(structType->GetMembersCount(), itemsPtr); - - for (ui32 index = 0; index < structType->GetMembersCount(); ++index) { - auto memberType = structType->GetMemberType(index); - itemsPtr[index] = ExtractUnboxedValue(typedArray->field(index), row, memberType, holderFactory); - } - return result; - } - - case NMiniKQL::TType::EKind::Tuple: { - auto tupleType = static_cast(itemType); - - YQL_ENSURE(array->type_id() == arrow::Type::STRUCT, "Unexpected array type"); - auto typedArray = static_pointer_cast(array); - YQL_ENSURE(static_cast(typedArray->num_fields()) == tupleType->GetElementsCount(), "Unexpected count of fields"); - - NUdf::TUnboxedValue* itemsPtr = nullptr; - auto result = holderFactory.CreateDirectArrayHolder(tupleType->GetElementsCount(), itemsPtr); - - for (ui32 index = 0; index < tupleType->GetElementsCount(); ++index) { - auto elementType = tupleType->GetElementType(index); - itemsPtr[index] = ExtractUnboxedValue(typedArray->field(index), row, elementType, holderFactory); - } - return result; - } - - 
case NMiniKQL::TType::EKind::Optional: { - auto optionalType = static_cast(itemType); - auto innerOptionalType = optionalType->GetItemType(); - - if (NeedWrapByExternalOptional(innerOptionalType)) { - YQL_ENSURE(array->type_id() == arrow::Type::STRUCT, "Unexpected array type"); - - auto innerArray = array; - auto innerType = itemType; - - NUdf::TUnboxedValue value; - int depth = 0; - - while (innerArray->type_id() == arrow::Type::STRUCT) { - auto structArray = static_pointer_cast(innerArray); - YQL_ENSURE(structArray->num_fields() == 1, "Unexpected count of fields"); - - if (structArray->IsNull(row)) { - value = NUdf::TUnboxedValuePod(); - break; - } - - innerType = static_cast(innerType)->GetItemType(); - innerArray = structArray->field(0); - ++depth; - } - - auto wrap = NeedWrapByExternalOptional(innerType); - if (wrap || !innerArray->IsNull(row)) { - value = ExtractUnboxedValue(innerArray, row, innerType, holderFactory); - if (wrap) { - --depth; - } - } - - for (int i = 0; i < depth; ++i) { - value = value.MakeOptional(); - } - return value; - } - - return ExtractUnboxedValue(array, row, innerOptionalType, holderFactory).Release().MakeOptional(); - } - - case NMiniKQL::TType::EKind::List: { - auto listType = static_cast(itemType); - - YQL_ENSURE(array->type_id() == arrow::Type::LIST, "Unexpected array type"); - auto typedArray = static_pointer_cast(array); - - auto arraySlice = typedArray->value_slice(row); - auto itemType = listType->GetItemType(); - const auto len = arraySlice->length(); - - NUdf::TUnboxedValue* items = nullptr; - auto list = holderFactory.CreateDirectArrayHolder(len, items); - for (ui64 i = 0; i < static_cast(len); ++i) { - *items++ = ExtractUnboxedValue(arraySlice, i, itemType, holderFactory); - } - return list; - } - - case NMiniKQL::TType::EKind::Dict: { - auto dictType = static_cast(itemType); - - auto keyType = dictType->GetKeyType(); - auto payloadType = dictType->GetPayloadType(); - auto dictBuilder = holderFactory.NewDict(dictType, 
NUdf::TDictFlags::EDictKind::Hashed); - - std::shared_ptr keyArray = nullptr; - std::shared_ptr payloadArray = nullptr; - ui64 dictLength = 0; - ui64 offset = 0; - - YQL_ENSURE(array->type_id() == arrow::Type::STRUCT, "Unexpected array type"); - auto wrapArray = static_pointer_cast(array); - YQL_ENSURE(wrapArray->num_fields() == 2, "Unexpected count of fields"); - - auto dictSlice = wrapArray->field(0); - - if (keyType->GetKind() == NMiniKQL::TType::EKind::Optional) { - YQL_ENSURE(dictSlice->type_id() == arrow::Type::LIST, "Unexpected array type"); - auto listArray = static_pointer_cast(dictSlice); - - auto arraySlice = listArray->value_slice(row); - YQL_ENSURE(arraySlice->type_id() == arrow::Type::STRUCT, "Unexpected array type"); - auto structArray = static_pointer_cast(arraySlice); - YQL_ENSURE(structArray->num_fields() == 2, "Unexpected count of fields"); - - dictLength = arraySlice->length(); - keyArray = structArray->field(0); - payloadArray = structArray->field(1); - } else { - YQL_ENSURE(dictSlice->type_id() == arrow::Type::MAP, "Unexpected array type"); - auto mapArray = static_pointer_cast(dictSlice); - - dictLength = mapArray->value_length(row); - offset = mapArray->value_offset(row); - keyArray = mapArray->keys(); - payloadArray = mapArray->items(); - } - - for (ui64 i = offset; i < offset + static_cast(dictLength); ++i) { - auto key = ExtractUnboxedValue(keyArray, i, keyType, holderFactory); - auto payload = ExtractUnboxedValue(payloadArray, i, payloadType, holderFactory); - dictBuilder->Add(std::move(key), std::move(payload)); - } - return dictBuilder->Build(); - } - - case NMiniKQL::TType::EKind::Variant: { - // TODO Need to properly convert variants containing more than 127*127 - // types? 
- auto variantType = static_cast(itemType); - - YQL_ENSURE(array->type_id() == arrow::Type::DENSE_UNION, "Unexpected array type"); - auto unionArray = static_pointer_cast(array); - - auto variantIndex = unionArray->child_id(row); - auto rowInChild = unionArray->value_offset(row); - std::shared_ptr valuesArray = unionArray->field(variantIndex); - - if (variantType->GetAlternativesCount() > arrow::UnionType::kMaxTypeCode) { - // Go one step deeper - YQL_ENSURE(valuesArray->type_id() == arrow::Type::DENSE_UNION, "Unexpected array type"); - auto innerUnionArray = static_pointer_cast(valuesArray); - auto innerVariantIndex = innerUnionArray->child_id(rowInChild); - - rowInChild = innerUnionArray->value_offset(rowInChild); - valuesArray = innerUnionArray->field(innerVariantIndex); - variantIndex =variantIndex * arrow::UnionType::kMaxTypeCode + innerVariantIndex; - } - - NMiniKQL::TType* innerType = variantType->GetUnderlyingType(); - if (innerType->IsStruct()) { - innerType =static_cast(innerType)->GetMemberType(variantIndex); - } else { - YQL_ENSURE(innerType->IsTuple(), "Unexpected underlying variant type: " << innerType->GetKindAsStr()); - innerType = static_cast(innerType)->GetElementType(variantIndex); - } - - NUdf::TUnboxedValue value = ExtractUnboxedValue(valuesArray, rowInChild, innerType, holderFactory); - return holderFactory.CreateVariantHolder(value.Release(), variantIndex); - } - default: { - YQL_ENSURE(false, "Unsupported type: " << itemType->GetKindAsStr()); - } - } - return NUdf::TUnboxedValuePod(); -} - -NMiniKQL::TUnboxedValueVector ExtractUnboxedValues(const std::shared_ptr& array, const NMiniKQL::TType* itemType, - const NMiniKQL::THolderFactory& holderFactory) -{ - NMiniKQL::TUnboxedValueVector values; - values.reserve(array->length()); - for (auto i = 0; i < array->length(); ++i) { - values.push_back(ExtractUnboxedValue(array, i, itemType, holderFactory)); - } - return values; -} - -} // namespace NTestUtils - -} // namespace NKikimr::NKqp::NFormats 
diff --git a/ydb/core/kqp/common/result_set_format/kqp_result_set_arrow.h b/ydb/core/kqp/common/result_set_format/kqp_result_set_arrow.h deleted file mode 100644 index 0c3c62a0e2da..000000000000 --- a/ydb/core/kqp/common/result_set_format/kqp_result_set_arrow.h +++ /dev/null @@ -1,91 +0,0 @@ -#pragma once - -#include - -#include -#include - -namespace NKikimr::NKqp::NFormats { - -/** - * @brief Convert TType to the arrow::DataType object - * - * The logic of this conversion is from YQL-15332: - * - * Void, Null => NullType - * Bool => Uint8 - * Integral => Uint8..Uint64, Int8..Int64 - * Floats => Float, Double - * Date => Uint16 - * Datetime => Uint32 - * Timestamp => Uint64 - * Interval => Int64 - * Date32 => Int32 - * Interval64, Timestamp64, Datetime64 => Int64 - * Utf8, Json => String - * String, Yson, JsonDocument => Binary - * Decimal, UUID => FixedSizeBinary(16) - * Timezone datetime type => StructArray - * DyNumber => BinaryArray - * - * Struct, Tuple, EmptyList, EmptyDict => StructArray - * Names of fields constructed from tuple are just empty strings. 
- * - * List => ListArray - * - * Variant => DenseUnionArray - * If variant contains more than 127 items then we map - * Variant => DenseUnionArray - * TODO Implement convertion of data to DenseUnionArray and - * back - * - * Optional => StructArray if T is Variant - * Because DenseUnionArray does not have validity bitmap - * Optional => T for other types - * By default, other types have a validity bitmap - * - * Optional...>> => - * StructArray...>> For example: - * - Optional> => StructArray - * Int32 has validity bitmap, so we wrap it in StructArray N - 1 times, where - * N is the number of Optional levels - * - Optional>> => - * StructArray>> DenseUnionArray does - * not have validity bitmap, so we wrap it in StructArray N times, where N is - * the number of Optional levels - * - * Dict => StructArray, - * Uint64Array (on demand, default: 0)> We do not use arrow::DictArray because - * it must be used for encoding not for mapping keys to values. - * (https://arrow.apache.org/docs/cpp/api/array.html#classarrow_1_1_dictionary_array) - * If the type of dict key is optional then we map - * Dict, ValueType> => - * StructArray, Uint64Array (on - * demand, default: 0)> because keys of MapArray can not be nullable - * - * - * @param type Yql type to parse - * @return std::shared_ptr arrow type of the same structure as - * type - */ -std::shared_ptr GetArrowType(const NMiniKQL::TType* type); - -bool IsArrowCompatible(const NMiniKQL::TType* type); - -void AppendElement(NUdf::TUnboxedValue value, arrow::ArrayBuilder* builder, const NMiniKQL::TType* type); - -namespace NTestUtils { - -std::unique_ptr MakeArrowBuilder(const NMiniKQL::TType* type); - -std::shared_ptr MakeArray(NMiniKQL::TUnboxedValueVector& values, const NMiniKQL::TType* itemType); - -NUdf::TUnboxedValue ExtractUnboxedValue(const std::shared_ptr& array, ui64 row, - const NMiniKQL::TType* itemType, const NMiniKQL::THolderFactory& holderFactory); - -NMiniKQL::TUnboxedValueVector ExtractUnboxedValues(const 
std::shared_ptr& array, - const NMiniKQL::TType* itemType, const NMiniKQL::THolderFactory& holderFactory); - -} // namespace NTestUtils - -} // namespace NKikimr::NKqp::NFormats diff --git a/ydb/core/kqp/common/result_set_format/kqp_result_set_builders.cpp b/ydb/core/kqp/common/result_set_format/kqp_result_set_builders.cpp index e67360a0f4e6..8dd8edf4b2c5 100644 --- a/ydb/core/kqp/common/result_set_format/kqp_result_set_builders.cpp +++ b/ydb/core/kqp/common/result_set_format/kqp_result_set_builders.cpp @@ -4,7 +4,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/ydb/core/kqp/common/result_set_format/ut/kqp_formats_arrow_ut.cpp b/ydb/core/kqp/common/result_set_format/ut/kqp_formats_arrow_ut.cpp new file mode 100644 index 000000000000..4ddc892f6c80 --- /dev/null +++ b/ydb/core/kqp/common/result_set_format/ut/kqp_formats_arrow_ut.cpp @@ -0,0 +1,2793 @@ +#include + +#include + +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#include + +using namespace NKikimr::NMiniKQL; +using namespace NYql; + +inline static constexpr size_t TEST_ARRAY_DATATYPE_SIZE = 1 << 16; +inline static constexpr size_t TEST_ARRAY_NESTED_SIZE = 1 << 8; +inline static constexpr ui8 DECIMAL_PRECISION = 35; +inline static constexpr ui8 DECIMAL_SCALE = 10; +inline static constexpr ui32 VARIANT_NESTED_SIZE = 260; +inline static constexpr ui32 VARIANT_OVER_LIMIT_SIZE = NKikimr::NKqp::NFormats::MAX_VARIANT_NESTED_SIZE + 1; + +static_assert(DECIMAL_PRECISION >= DECIMAL_SCALE, "Decimal precision must be greater than or equal to scale"); +static_assert(VARIANT_NESTED_SIZE <= NKikimr::NKqp::NFormats::MAX_VARIANT_NESTED_SIZE, "VARIANT_NESTED_SIZE must be less than or equal to MAX_VARIANT_NESTED_SIZE"); +static_assert(VARIANT_OVER_LIMIT_SIZE > NKikimr::NKqp::NFormats::MAX_VARIANT_NESTED_SIZE, "VARIANT_OVER_LIMIT_SIZE must be greater than MAX_VARIANT_NESTED_SIZE"); + +namespace { + +ui16 
GetTimezoneIdSkipEmpty(ui16 index) { + const auto& timezones = NTi::GetTimezones(); + auto name = timezones[index % timezones.size()]; + return GetTimezoneId(name.empty() ? "Europe/Moscow" : name); +} + +std::string SerializeToBinaryJson(const TStringBuf json) { + auto variant = NKikimr::NBinaryJson::SerializeToBinaryJson(json); + if (std::holds_alternative(variant)) { + const auto binaryJson = std::get(variant); + return std::string(binaryJson.Data(), binaryJson.Size()); + } + UNIT_ASSERT_C(false, "Cannot serialize binary json"); + return {}; +} + +NUdf::TUnboxedValue GetValueOfBasicType(TType* type, ui64 value) { + Y_ABORT_UNLESS(type->GetKind() == TType::EKind::Data); + auto dataType = static_cast(type); + auto slot = *dataType->GetDataSlot().Get(); + switch (slot) { + case NUdf::EDataSlot::Bool: + return NUdf::TUnboxedValuePod(static_cast(value % 2 == 0)); + case NUdf::EDataSlot::Int8: + return NUdf::TUnboxedValuePod(static_cast(-(value % ((1 << 7) - 1)))); + case NUdf::EDataSlot::Uint8: + return NUdf::TUnboxedValuePod(static_cast(value % ((1 << 8)))); + case NUdf::EDataSlot::Int16: + return NUdf::TUnboxedValuePod(static_cast(-(value % ((1 << 15) - 1)))); + case NUdf::EDataSlot::Uint16: + return NUdf::TUnboxedValuePod(static_cast(value % (1 << 15))); + case NUdf::EDataSlot::Int32: + return NUdf::TUnboxedValuePod(static_cast(-(value % ((1ULL << 31) - 1)))); + case NUdf::EDataSlot::Uint32: + return NUdf::TUnboxedValuePod(static_cast(value % (1ULL << 31))); + case NUdf::EDataSlot::Int64: + return NUdf::TUnboxedValuePod(static_cast(-(value % ((1ULL << 63) - 1)))); + case NUdf::EDataSlot::Uint64: + return NUdf::TUnboxedValuePod(static_cast(value % (1ULL << 63))); + case NUdf::EDataSlot::Float: + return NUdf::TUnboxedValuePod(static_cast(value) / 1234); + case NUdf::EDataSlot::Double: + return NUdf::TUnboxedValuePod(static_cast(value) / 12345); + case NUdf::EDataSlot::Decimal: { + auto decimal = NDecimal::FromString(TStringBuilder() << value << ".123", 
DECIMAL_PRECISION, DECIMAL_SCALE); + return NUdf::TUnboxedValuePod(decimal); + } + case NUdf::EDataSlot::DyNumber: { + auto number = NKikimr::NDyNumber::ParseDyNumberString(TStringBuilder() << value); + UNIT_ASSERT_C(number.Defined(), "Failed to convert string to DyNumber"); + return MakeString(*number); + } + case NUdf::EDataSlot::Date: + return NUdf::TUnboxedValuePod(static_cast(value % NUdf::MAX_DATE)); + case NUdf::EDataSlot::Datetime: + return NUdf::TUnboxedValuePod(static_cast(value % NUdf::MAX_DATETIME)); + case NUdf::EDataSlot::Timestamp: + return NUdf::TUnboxedValuePod(static_cast(value % NUdf::MAX_TIMESTAMP)); + case NUdf::EDataSlot::Interval: + return NUdf::TUnboxedValuePod(static_cast(value / 2 - 1)); + case NUdf::EDataSlot::TzDate: { + auto ret = NUdf::TUnboxedValuePod(static_cast(value % NUdf::MAX_DATE)); + ret.SetTimezoneId(GetTimezoneIdSkipEmpty(value)); + return ret; + } + case NUdf::EDataSlot::TzDatetime: { + auto ret = NUdf::TUnboxedValuePod(static_cast(value % NUdf::MAX_DATETIME)); + ret.SetTimezoneId(GetTimezoneIdSkipEmpty(value)); + return ret; + } + case NUdf::EDataSlot::TzTimestamp: { + auto ret = NUdf::TUnboxedValuePod(static_cast(value % NUdf::MAX_TIMESTAMP)); + ret.SetTimezoneId(GetTimezoneIdSkipEmpty(value)); + return ret; + } + case NUdf::EDataSlot::Date32: + return NUdf::TUnboxedValuePod(static_cast(value % NUdf::MAX_DATE32)); + case NUdf::EDataSlot::Datetime64: + return NUdf::TUnboxedValuePod(static_cast(value % NUdf::MAX_DATETIME64)); + case NUdf::EDataSlot::Timestamp64: + return NUdf::TUnboxedValuePod(static_cast(value % NUdf::MAX_TIMESTAMP64)); + case NUdf::EDataSlot::Interval64: + return NUdf::TUnboxedValuePod(static_cast(value % NUdf::MAX_INTERVAL64)); + case NUdf::EDataSlot::TzDate32: { + auto ret = NUdf::TUnboxedValuePod(static_cast(value % NUdf::MAX_DATE32)); + ret.SetTimezoneId(GetTimezoneIdSkipEmpty(value)); + return ret; + } + case NUdf::EDataSlot::TzDatetime64: { + auto ret = NUdf::TUnboxedValuePod(static_cast(value % 
NUdf::MAX_DATETIME64)); + ret.SetTimezoneId(GetTimezoneIdSkipEmpty(value)); + return ret; + } + case NUdf::EDataSlot::TzTimestamp64: { + auto ret = NUdf::TUnboxedValuePod(static_cast(value % NUdf::MAX_TIMESTAMP64)); + ret.SetTimezoneId(GetTimezoneIdSkipEmpty(value)); + return ret; + } + case NUdf::EDataSlot::String: { + std::string string = TStringBuilder() << value; + return MakeString(NUdf::TStringRef(string.data(), string.size())); + } + case NUdf::EDataSlot::Utf8: { + std::string string = TStringBuilder() << value << "utf8"; + return MakeString(NUdf::TStringRef(string.data(), string.size())); + } + case NUdf::EDataSlot::Yson: { + std::string yson = TStringBuilder() << '[' << value << ']'; + return MakeString(NUdf::TStringRef(yson.data(), yson.size())); + } + case NUdf::EDataSlot::Json: { + std::string json = TStringBuilder() << '[' << value << ']'; + return MakeString(NUdf::TStringRef(json.data(), json.size())); + } + case NUdf::EDataSlot::JsonDocument: { + std::string json = SerializeToBinaryJson(TStringBuilder() << "{\"b\": " << value << ", \"a\": " << value / 2 << "}"); + return MakeString(NUdf::TStringRef(json.data(), json.size())); + } + case NUdf::EDataSlot::Uuid: { + std::string uuid; + for (size_t i = 0; i < NKikimr::NScheme::FSB_SIZE / 2; ++i) { + uuid += "a" + std::to_string((i + value) % 10); + } + return MakeString(NUdf::TStringRef(uuid)); + } + } + + return NUdf::TUnboxedValuePod(); +} + +struct TTestContext { + TScopedAlloc Alloc; + TTypeEnvironment TypeEnv; + TMemoryUsageInfo MemInfo; + THolderFactory HolderFactory; + TDefaultValueBuilder Vb; + + TVector BasicTypes = { + TDataType::Create(NUdf::TDataType::Id, TypeEnv), + TDataType::Create(NUdf::TDataType::Id, TypeEnv), + TDataType::Create(NUdf::TDataType::Id, TypeEnv), + TDataType::Create(NUdf::TDataType::Id, TypeEnv), + TDataType::Create(NUdf::TDataType::Id, TypeEnv), + TDataType::Create(NUdf::TDataType::Id, TypeEnv), + TDataType::Create(NUdf::TDataType::Id, TypeEnv), + 
TDataType::Create(NUdf::TDataType::Id, TypeEnv), + TDataType::Create(NUdf::TDataType::Id, TypeEnv), + TDataType::Create(NUdf::TDataType::Id, TypeEnv), + TDataType::Create(NUdf::TDataType::Id, TypeEnv), + TDataDecimalType::Create(DECIMAL_PRECISION, DECIMAL_SCALE, TypeEnv), + TDataType::Create(NUdf::TDataType::Id, TypeEnv), + TDataType::Create(NUdf::TDataType::Id, TypeEnv), + TDataType::Create(NUdf::TDataType::Id, TypeEnv), + TDataType::Create(NUdf::TDataType::Id, TypeEnv), + TDataType::Create(NUdf::TDataType::Id, TypeEnv), + TDataType::Create(NUdf::TDataType::Id, TypeEnv), + TDataType::Create(NUdf::TDataType::Id, TypeEnv), + TDataType::Create(NUdf::TDataType::Id, TypeEnv), + TDataType::Create(NUdf::TDataType::Id, TypeEnv), + TDataType::Create(NUdf::TDataType::Id, TypeEnv), + TDataType::Create(NUdf::TDataType::Id, TypeEnv), + TDataType::Create(NUdf::TDataType::Id, TypeEnv), + TDataType::Create(NUdf::TDataType::Id, TypeEnv), + TDataType::Create(NUdf::TDataType::Id, TypeEnv), + TDataType::Create(NUdf::TDataType::Id, TypeEnv), + TDataType::Create(NUdf::TDataType::Id, TypeEnv), + TDataType::Create(NUdf::TDataType::Id, TypeEnv), + TDataType::Create(NUdf::TDataType::Id, TypeEnv), + TDataType::Create(NUdf::TDataType::Id, TypeEnv), + TDataType::Create(NUdf::TDataType::Id, TypeEnv), + TDataType::Create(NUdf::TDataType::Id, TypeEnv) + }; + + TTestContext() + : Alloc(__LOCATION__) + , TypeEnv(Alloc) + , MemInfo("TestMem") + , HolderFactory(Alloc.Ref(), MemInfo) + , Vb(HolderFactory) + { + } + + TType* GetStructType() { + std::vector members = { + {"ABC", TDataType::Create(NUdf::TDataType::Id, TypeEnv)}, + {"DEF", TDataType::Create(NUdf::TDataType::Id, TypeEnv)}, + {"GHI", TDataType::Create(NUdf::TDataType::Id, TypeEnv)}, + {"JKL", TDataType::Create(NUdf::TDataType::Id, TypeEnv)}, + {"MNO", TDataType::Create(NUdf::TDataType::Id, TypeEnv)}, + }; + return TStructType::Create(5, members.data(), TypeEnv); + } + + TUnboxedValueVector CreateStructs(ui32 quantity) { + 
TUnboxedValueVector values; + for (ui32 value = 0; value < quantity; ++value) { + NUdf::TUnboxedValue* items; + auto structValue = Vb.NewArray(5, items); + + std::string string = TStringBuilder() << value; + items[0] = MakeString(NUdf::TStringRef(string.data(), string.size())); + items[1] = NUdf::TUnboxedValuePod(static_cast(-value)); + items[2] = NUdf::TUnboxedValuePod((ui64) (value)); + items[3] = NUdf::TUnboxedValuePod(static_cast(-value)); + items[4] = NUdf::TUnboxedValuePod(MakeString(NUdf::TStringRef(string.data(), string.size()))); + + values.emplace_back(std::move(structValue)); + } + return values; + } + + TType* GetStructNestedValueType() { + auto listType = GetListType(); + std::vector innerMembers = { + {"12", TDataType::Create(NUdf::TDataType::Id, TypeEnv)}, + {"34", TDataType::Create(NUdf::TDataType::Id, TypeEnv)}, + }; + auto innerStructType = TStructType::Create(2, innerMembers.data(), TypeEnv); + + std::vector members = { + {"56", listType}, + {"78", innerStructType}, + {"910", TDataType::Create(NUdf::TDataType::Id, TypeEnv)}, + }; + return TStructType::Create(3, members.data(), TypeEnv); + } + + TUnboxedValueVector CreateStructsNestedValue(ui32 quantity) { + TUnboxedValueVector values; + auto lists = CreateLists(quantity); + + for (ui32 value = 0; value < quantity; ++value) { + NUdf::TUnboxedValue* items; + auto structValue = Vb.NewArray(3, items); + + items[0] = lists[value]; + + NUdf::TUnboxedValue* innerItems; + auto innerStructValue = Vb.NewArray(2, innerItems); + innerItems[0] = NUdf::TUnboxedValuePod(static_cast(value)); + innerItems[1] = NUdf::TUnboxedValuePod(static_cast(-value)); + items[1] = std::move(innerStructValue); + + items[2] = NUdf::TUnboxedValuePod(static_cast(value)); + + values.emplace_back(std::move(structValue)); + } + return values; + } + + TType* GetStructOptionalValueType() { + std::vector members = { + {"opt1", TOptionalType::Create(TDataType::Create(NUdf::TDataType::Id, TypeEnv), TypeEnv)}, + {"opt2", 
TOptionalType::Create(GetTypeOfSingular(TypeEnv), TypeEnv)}, + {"opt3", TOptionalType::Create(TOptionalType::Create(TDataType::Create(NUdf::TDataType::Id, TypeEnv), TypeEnv), TypeEnv)}, + {"optless", TDataType::Create(NUdf::TDataType::Id, TypeEnv)}, + }; + return TStructType::Create(4, members.data(), TypeEnv); + } + + TUnboxedValueVector CreateStructsOptionalValue(ui32 quantity) { + TUnboxedValueVector values; + for (ui32 value = 0; value < quantity; ++value) { + NUdf::TUnboxedValue* items; + auto structValue = Vb.NewArray(4, items); + + if (value % 2 == 0) { + items[0] = NUdf::TUnboxedValuePod(static_cast(-value)).MakeOptional(); + items[1] = NUdf::TUnboxedValuePod().MakeOptional(); + } else { + items[0] = NUdf::TUnboxedValuePod(); + items[1] = NUdf::TUnboxedValuePod(); + } + + if (value % 3 == 0) { + items[2] = NUdf::TUnboxedValuePod(static_cast(-value)).MakeOptional().MakeOptional(); + } else if (value % 3 == 1) { + items[2] = NUdf::TUnboxedValuePod(static_cast(-value)).MakeOptional(); + } else { + items[2] = NUdf::TUnboxedValuePod(); + } + + items[3] = NUdf::TUnboxedValuePod(static_cast(value)); + + values.emplace_back(std::move(structValue)); + } + return values; + } + + TType* GetStructTaggedValueType() { + std::vector members = { + {"1", TTaggedType::Create(TDataType::Create(NUdf::TDataType::Id, TypeEnv), "test", TypeEnv)}, + {"2", TDataType::Create(NUdf::TDataType::Id, TypeEnv)}, + {"3", TTaggedType::Create(TDataType::Create(NUdf::TDataType::Id, TypeEnv), "tag2", TypeEnv)}, + }; + return TStructType::Create(3, members.data(), TypeEnv); + } + + TUnboxedValueVector CreateStructsTaggedValue(ui32 quantity) { + TUnboxedValueVector values; + for (ui32 value = 0; value < quantity; ++value) { + NUdf::TUnboxedValue* items; + auto structValue = Vb.NewArray(3, items); + items[0] = NUdf::TUnboxedValuePod(static_cast(-value)); + items[1] = NUdf::TUnboxedValuePod(static_cast(value)); + items[2] = NUdf::TUnboxedValuePod(static_cast(value)); + 
values.emplace_back(std::move(structValue)); + } + return values; + } + + TType* GetTupleType() { + TType* members[3] = { + TDataType::Create(NUdf::TDataType::Id, TypeEnv), + TDataType::Create(NUdf::TDataType::Id, TypeEnv), + TDataType::Create(NUdf::TDataType::Id, TypeEnv) + }; + return TTupleType::Create(3, members, TypeEnv); + } + + TUnboxedValueVector CreateTuples(ui32 quantity) { + TUnboxedValueVector values; + for (ui32 value = 0; value < quantity; ++value) { + NUdf::TUnboxedValue* items; + auto tupleValue = Vb.NewArray(3, items); + items[0] = NUdf::TUnboxedValuePod(value % 3 == 0); + items[1] = NUdf::TUnboxedValuePod(static_cast(-value)); + items[2] = NUdf::TUnboxedValuePod(static_cast(value)); + values.push_back(std::move(tupleValue)); + } + return values; + } + + TType* GetTupleNestedValueType() { + auto listType = GetListType(); + auto structType = GetStructType(); + TType* members[2] = { + listType, + structType, + }; + return TTupleType::Create(2, members, TypeEnv); + } + + TUnboxedValueVector CreateTuplesNestedValue(ui32 quantity) { + TUnboxedValueVector values; + + auto lists = CreateLists(quantity); + auto structs = CreateStructs(quantity); + for (ui32 value = 0; value < quantity; ++value) { + NUdf::TUnboxedValue* items; + auto tupleValue = Vb.NewArray(2, items); + items[0] = lists[value]; + items[1] = structs[value]; + values.push_back(std::move(tupleValue)); + } + return values; + } + + TType* GetTupleOptionalValueType() { + TType* members[3] = { + TOptionalType::Create(TDataType::Create(NUdf::TDataType::Id, TypeEnv), TypeEnv), + TOptionalType::Create(GetTypeOfSingular(TypeEnv), TypeEnv), + TOptionalType::Create(TOptionalType::Create(TDataType::Create(NUdf::TDataType::Id, TypeEnv), TypeEnv), TypeEnv) + }; + return TTupleType::Create(3, members, TypeEnv); + } + + TUnboxedValueVector CreateTuplesOptionalValue(ui32 quantity) { + TUnboxedValueVector values; + for (ui32 value = 0; value < quantity; ++value) { + NUdf::TUnboxedValue* items; + auto 
tupleValue = Vb.NewArray(3, items); + + if (value % 2 == 0) { + items[0] = NUdf::TUnboxedValuePod(static_cast(-value)).MakeOptional(); + items[1] = NUdf::TUnboxedValuePod().MakeOptional(); + } else { + items[0] = NUdf::TUnboxedValuePod(); + items[1] = NUdf::TUnboxedValuePod(); + } + + if (value % 3 == 0) { + items[2] = NUdf::TUnboxedValuePod(static_cast(-value)).MakeOptional().MakeOptional(); + } else if (value % 3 == 1) { + items[2] = NUdf::TUnboxedValuePod(static_cast(-value)).MakeOptional(); + } else { + items[2] = NUdf::TUnboxedValuePod(); + } + + values.push_back(std::move(tupleValue)); + } + return values; + } + + TType* GetTupleTaggedValueType() { + TType* members[2] = { + TTaggedType::Create(TDataType::Create(NUdf::TDataType::Id, TypeEnv), "test", TypeEnv), + TDataType::Create(NUdf::TDataType::Id, TypeEnv) + }; + return TTupleType::Create(2, members, TypeEnv); + } + + TUnboxedValueVector CreateTuplesTaggedValue(ui32 quantity) { + TUnboxedValueVector values; + for (ui32 value = 0; value < quantity; ++value) { + NUdf::TUnboxedValue* items; + auto tupleValue = Vb.NewArray(2, items); + items[0] = NUdf::TUnboxedValuePod(static_cast(-value)); + items[1] = NUdf::TUnboxedValuePod(static_cast(-value)); + values.push_back(std::move(tupleValue)); + } + return values; + } + + TType* GetListType() { + auto itemType = TDataType::Create(NUdf::TDataType::Id, TypeEnv); + return TListType::Create(itemType, TypeEnv); + } + + TUnboxedValueVector CreateLists(ui32 quantity) { + TUnboxedValueVector values; + values.reserve(quantity); + for (ui64 value = 0; value < quantity; ++value) { + TUnboxedValueVector items; + items.reserve(value); + for (ui64 i = 0; i < value; ++i) { + items.push_back(NUdf::TUnboxedValuePod(static_cast(-i))); + } + auto listValue = Vb.NewList(items.data(), value); + values.emplace_back(std::move(listValue)); + } + return values; + } + + TType* GetListNestedValueType() { + std::vector members = { + {"first", TDataType::Create(NUdf::TDataType::Id, TypeEnv)}, 
+ {"second", TDataType::Create(NUdf::TDataType::Id, TypeEnv)}, + }; + auto itemType = TStructType::Create(2, members.data(), TypeEnv); + return TListType::Create(itemType, TypeEnv); + } + + TUnboxedValueVector CreateListsNestedValue(ui32 quantity) { + TUnboxedValueVector values; + values.reserve(quantity); + for (ui64 value = 0; value < quantity; ++value) { + TUnboxedValueVector items; + items.reserve(value); + for (ui64 i = 0; i < value; ++i) { + NUdf::TUnboxedValue* structItem; + auto structItemValue = Vb.NewArray(2, structItem); + structItem[0] = NUdf::TUnboxedValuePod(static_cast(i)); + structItem[1] = NUdf::TUnboxedValuePod(static_cast(-i)); + items.push_back(std::move(structItemValue)); + } + auto listValue = Vb.NewList(items.data(), value); + values.emplace_back(std::move(listValue)); + } + return values; + } + + TType* GetListOptionalValueType() { + auto itemType = TOptionalType::Create(TDataType::Create(NUdf::TDataType::Id, TypeEnv), TypeEnv); + return TListType::Create(itemType, TypeEnv); + } + + TUnboxedValueVector CreateListsOptionalValue(ui32 quantity) { + TUnboxedValueVector values; + values.reserve(quantity); + for (ui64 value = 0; value < quantity; ++value) { + TUnboxedValueVector items; + items.reserve(value); + for (ui64 i = 0; i < value; ++i) { + items.push_back((i % 2 == 0) ? 
NUdf::TUnboxedValuePod(static_cast(-i)).MakeOptional() : NUdf::TUnboxedValuePod()); + } + auto listValue = Vb.NewList(items.data(), value); + values.emplace_back(std::move(listValue)); + } + return values; + } + + TType* GetListOptionalVariantValueType() { + auto itemType = TOptionalType::Create(GetVariantOverTupleType(), TypeEnv); + return TListType::Create(itemType, TypeEnv); + } + + TUnboxedValueVector CreateListsOptionalVariantValue(ui32 quantity) { + TUnboxedValueVector values; + values.reserve(quantity); + for (ui64 value = 0; value < quantity; ++value) { + TUnboxedValueVector items; + items.reserve(value); + for (ui64 i = 0; i < value; ++i) { + auto typeIndex = i % 4; + NUdf::TUnboxedValue item; + if (typeIndex == 0) { + item = NUdf::TUnboxedValuePod(i % 3 == 0); + } else if (typeIndex == 1) { + item = NUdf::TUnboxedValuePod(static_cast(-i)); + } else if (typeIndex == 2) { + item = NUdf::TUnboxedValuePod(static_cast(i)); + } else if (typeIndex == 3) { + item = NUdf::TUnboxedValuePod(static_cast(-i)); + } + auto wrapped = Vb.NewVariant(typeIndex, std::move(item)); + items.emplace_back(std::move(wrapped)); + } + auto listValue = Vb.NewList(items.data(), value); + values.emplace_back(std::move(listValue)); + } + return values; + } + + TType* GetListTaggedValueType() { + auto itemType = TTaggedType::Create(TDataType::Create(NUdf::TDataType::Id, TypeEnv), "test", TypeEnv); + return TListType::Create(itemType, TypeEnv); + } + + TUnboxedValueVector CreateListsTaggedValue(ui32 quantity) { + return CreateLists(quantity); + } + + TType* GetDictType() { + TType* keyType = TDataType::Create(NUdf::TDataType::Id, TypeEnv); + TType* payloadType = TDataType::Create(NUdf::TDataType::Id, TypeEnv); + return TDictType::Create(keyType, payloadType, TypeEnv); + } + + TUnboxedValueVector CreateDicts(ui32 quantity) { + TUnboxedValueVector values; + for (ui64 value = 0; value < quantity; ++value) { + auto dictBuilder = Vb.NewDict(GetDictType(), 0); + for (ui64 i = 0; i < value; ++i) 
{ + NUdf::TUnboxedValue key = NUdf::TUnboxedValuePod(static_cast(i)); + NUdf::TUnboxedValue payload = NUdf::TUnboxedValuePod(static_cast(i * value)); + dictBuilder->Add(std::move(key), std::move(payload)); + } + auto dictValue = dictBuilder->Build(); + values.emplace_back(std::move(dictValue)); + } + return values; + } + + TType* GetDictNestedKeyType() { + TType* tupleItems[2] = { + TDataType::Create(NUdf::TDataType::Id, TypeEnv), + TDataType::Create(NUdf::TDataType::Id, TypeEnv), + }; + TType* keyType = TTupleType::Create(2, tupleItems, TypeEnv); + TType* payloadType = TDataType::Create(NUdf::TDataType::Id, TypeEnv); + return TDictType::Create(keyType, payloadType, TypeEnv); + } + + TUnboxedValueVector CreateDictsNestedKey(ui32 quantity) { + TUnboxedValueVector values; + for (ui64 value = 0; value < quantity; ++value) { + auto dictBuilder = Vb.NewDict(GetDictNestedKeyType(), 0); + for (ui64 i = 0; i < value; ++i) { + NUdf::TUnboxedValue* keyItems; + auto keyValue = Vb.NewArray(2, keyItems); + keyItems[0] = NUdf::TUnboxedValuePod(static_cast(i)); + keyItems[1] = NUdf::TUnboxedValuePod(static_cast(-i)); + NUdf::TUnboxedValue payload = NUdf::TUnboxedValuePod(static_cast(i * value)); + dictBuilder->Add(std::move(keyValue), std::move(payload)); + } + auto dictValue = dictBuilder->Build(); + values.emplace_back(std::move(dictValue)); + } + return values; + } + + TType* GetDictOptionalKeyType() { + TType* keyType = TOptionalType::Create(TOptionalType::Create(TDataType::Create(NUdf::TDataType::Id, TypeEnv), TypeEnv), TypeEnv); + TType* payloadType = TDataType::Create(NUdf::TDataType::Id, TypeEnv); + return TDictType::Create(keyType, payloadType, TypeEnv); + } + + TUnboxedValueVector CreateDictsOptionalKey(ui32 quantity) { + TUnboxedValueVector values; + for (ui64 value = 0; value < quantity; ++value) { + auto dictBuilder = Vb.NewDict(GetDictOptionalKeyType(), 0); + for (ui64 i = 0; i < value; ++i) { + NUdf::TUnboxedValue key; + if (i % 3 == 0) { + key = 
NUdf::TUnboxedValuePod(static_cast(i)).MakeOptional().MakeOptional(); + } else if (i % 3 == 1) { + key = NUdf::TUnboxedValuePod().MakeOptional(); + } else { + key = NUdf::TUnboxedValuePod(); + } + NUdf::TUnboxedValue payload = NUdf::TUnboxedValuePod(static_cast(i * value)); + dictBuilder->Add(std::move(key), std::move(payload)); + } + auto dictValue = dictBuilder->Build(); + values.emplace_back(std::move(dictValue)); + } + return values; + } + + TType* GetDictTaggedKeyType() { + TType* keyType = TTaggedType::Create(TDataType::Create(NUdf::TDataType::Id, TypeEnv), "key_tag", TypeEnv); + TType* payloadType = TDataType::Create(NUdf::TDataType::Id, TypeEnv); + return TDictType::Create(keyType, payloadType, TypeEnv); + } + + TUnboxedValueVector CreateDictsTaggedKey(ui32 quantity) { + TUnboxedValueVector values; + for (ui64 value = 0; value < quantity; ++value) { + auto dictBuilder = Vb.NewDict(GetDictTaggedKeyType(), 0); + for (ui64 i = 0; i < value; ++i) { + NUdf::TUnboxedValue key = NUdf::TUnboxedValuePod(static_cast(i)); + NUdf::TUnboxedValue payload = NUdf::TUnboxedValuePod(static_cast(i * value)); + dictBuilder->Add(std::move(key), std::move(payload)); + } + auto dictValue = dictBuilder->Build(); + values.emplace_back(std::move(dictValue)); + } + return values; + } + + TType* GetDictOptionalVariantKeyType() { + TType* variantMembers[2] = { + TDataType::Create(NUdf::TDataType::Id, TypeEnv), + TDataType::Create(NUdf::TDataType::Id, TypeEnv) + }; + auto tupleType = TTupleType::Create(2, variantMembers, TypeEnv); + auto variantType = TVariantType::Create(tupleType, TypeEnv); + TType* keyType = TOptionalType::Create(TOptionalType::Create(variantType, TypeEnv), TypeEnv); + TType* payloadType = TDataType::Create(NUdf::TDataType::Id, TypeEnv); + return TDictType::Create(keyType, payloadType, TypeEnv); + } + + TUnboxedValueVector CreateDictsOptionalVariantKey(ui32 quantity) { + TUnboxedValueVector values; + for (ui64 value = 0; value < quantity; ++value) { + auto 
dictBuilder = Vb.NewDict(GetDictOptionalVariantKeyType(), 0); + for (ui64 i = 0; i < value; ++i) { + NUdf::TUnboxedValue key; + if (i % 3 == 0) { + auto typeIndex = i % 2; + NUdf::TUnboxedValue variantItem; + if (typeIndex == 0) { + variantItem = NUdf::TUnboxedValuePod(static_cast(i)); + } else { + variantItem = NUdf::TUnboxedValuePod(static_cast(i)); + } + auto variantValue = Vb.NewVariant(typeIndex, std::move(variantItem)); + key = variantValue.MakeOptional().MakeOptional(); + } else if (i % 3 == 1) { + key = NUdf::TUnboxedValuePod().MakeOptional(); + } else { + key = NUdf::TUnboxedValuePod(); + } + NUdf::TUnboxedValue payload = NUdf::TUnboxedValuePod(static_cast(i * value)); + dictBuilder->Add(std::move(key), std::move(payload)); + } + auto dictValue = dictBuilder->Build(); + values.emplace_back(std::move(dictValue)); + } + return values; + } + + TType* GetOptionalDataValueType() { + return TOptionalType::Create(TDataType::Create(NUdf::TDataType::Id, TypeEnv), TypeEnv); + } + + TUnboxedValueVector CreateOptionalsDataValue(ui32 quantity) { + TUnboxedValueVector values; + for (ui64 value = 0; value < quantity; ++value) { + if (value % 2 == 0) { + values.push_back(NUdf::TUnboxedValuePod(static_cast(value)).MakeOptional()); + } else { + values.emplace_back(); + } + } + return values; + } + + TType* GetOptionalSingularValueType() { + return TOptionalType::Create(GetTypeOfSingular(TypeEnv), TypeEnv); + } + + TUnboxedValueVector CreateOptionalsSingularValueType(ui32 quantity) { + TUnboxedValueVector values; + for (ui64 value = 0; value < quantity; ++value) { + if (value % 2 == 0) { + values.push_back(NUdf::TUnboxedValuePod().MakeOptional()); + } else { + values.emplace_back(); + } + } + return values; + } + + TType* GetOptionalStructValueType() { + return TOptionalType::Create(GetStructType(), TypeEnv); + } + + TUnboxedValueVector CreateOptionalsStructValue(ui32 quantity) { + TUnboxedValueVector values = CreateStructs(quantity); + for (size_t i = 0; i < values.size(); 
++i) { + values[i] = (i % 2 == 0) ? values[i].MakeOptional() : NUdf::TUnboxedValuePod(); + } + return values; + } + + TType* GetOptionalTupleValueType() { + return TOptionalType::Create(GetTupleType(), TypeEnv); + } + + TUnboxedValueVector CreateOptionalsTupleValue(ui32 quantity) { + TUnboxedValueVector values = CreateTuples(quantity); + for (size_t i = 0; i < values.size(); ++i) { + values[i] = (i % 2 == 0) ? values[i].MakeOptional() : NUdf::TUnboxedValuePod(); + } + return values; + } + + TType* GetOptionalListValueType() { + return TOptionalType::Create(GetListType(), TypeEnv); + } + + TUnboxedValueVector CreateOptionalsValueList(ui32 quantity) { + TUnboxedValueVector values = CreateLists(quantity); + for (size_t i = 0; i < values.size(); ++i) { + values[i] = (i % 2 == 0) ? values[i].MakeOptional() : NUdf::TUnboxedValuePod(); + } + return values; + } + + TType* GetOptionalDictValueType() { + return TOptionalType::Create(GetDictType(), TypeEnv); + } + + TUnboxedValueVector CreateOptionalsDictValue(ui32 quantity) { + TUnboxedValueVector values = CreateDicts(quantity); + for (size_t i = 0; i < values.size(); ++i) { + values[i] = (i % 2 == 0) ? values[i].MakeOptional() : NUdf::TUnboxedValuePod(); + } + return values; + } + + TType* GetOptionalVariantValueType() { + return TOptionalType::Create(GetVariantOverStructType(), TypeEnv); + } + + TUnboxedValueVector CreateOptionalsVariantValue(ui32 quantity) { + TUnboxedValueVector values = CreateVariantsOverStruct(quantity); + for (size_t i = 0; i < values.size(); ++i) { + values[i] = (i % 2 == 0) ? values[i].MakeOptional() : NUdf::TUnboxedValuePod(); + } + return values; + } + + TType* GetOptionalTaggedValueType() { + return TOptionalType::Create(GetTaggedType(), TypeEnv); + } + + TUnboxedValueVector CreateOptionalsValueTagged(ui32 quantity) { + TUnboxedValueVector values = CreateTaggeds(quantity); + for (size_t i = 0; i < values.size(); ++i) { + values[i] = (i % 2 == 0) ? 
values[i].MakeOptional() : NUdf::TUnboxedValuePod(); + } + return values; + } + + TType* GetOptionalOptionalValueType() { + return TOptionalType::Create(GetOptionalDataValueType(), TypeEnv); + } + + TUnboxedValueVector CreateOptionalsOptionalValue(ui32 quantity) { + TUnboxedValueVector values; + for (ui64 value = 0; value < quantity; ++value) { + if (value % 3 == 0) { + values.push_back(NUdf::TUnboxedValuePod(static_cast(value)).MakeOptional().MakeOptional()); + } else if (value % 3 == 1) { + values.push_back(NUdf::TUnboxedValuePod().MakeOptional()); + } else { + values.emplace_back(); + } + } + return values; + } + + TType* GetOptionalOptionalVariantType() { + return TOptionalType::Create(GetOptionalVariantValueType(), TypeEnv); + } + + TUnboxedValueVector CreateOptionalsOptionalVariantValue(ui32 quantity) { + TUnboxedValueVector values = CreateOptionalsVariantValue(quantity); + for (size_t i = 0; i < values.size(); ++i) { + values[i] = (i % 4 != 0) ? values[i].MakeOptional() : NUdf::TUnboxedValuePod(); + } + return values; + } + + TType* GetTaggedType() { + return TTaggedType::Create(TDataType::Create(NUdf::TDataType::Id, TypeEnv), "tag", TypeEnv); + } + + TUnboxedValueVector CreateTaggeds(ui32 quantity) { + TUnboxedValueVector values; + for (ui64 value = 0; value < quantity; ++value) { + values.push_back(NUdf::TUnboxedValuePod(static_cast(value))); + } + return values; + } + + TType* GetTaggedStructValueType() { + return TTaggedType::Create(GetStructType(), "struct_tag", TypeEnv); + } + + TUnboxedValueVector CreateTaggedsStructValue(ui32 quantity) { + return CreateStructs(quantity); + } + + TType* GetTaggedTupleValueype() { + return TTaggedType::Create(GetTupleType(), "tuple_tag", TypeEnv); + } + + TUnboxedValueVector CreateTaggedsTupleValue(ui32 quantity) { + return CreateTuples(quantity); + } + + TType* GetTaggedListValueType() { + return TTaggedType::Create(GetListType(), "list_tag", TypeEnv); + } + + TUnboxedValueVector CreateTaggedsValueList(ui32 quantity) 
{ + return CreateLists(quantity); + } + + TType* GetTaggedDictValueType() { + return TTaggedType::Create(GetDictType(), "dict_tag", TypeEnv); + } + + TUnboxedValueVector CreateTaggedsDictValue(ui32 quantity) { + return CreateDicts(quantity); + } + + TType* GetTaggedOptionalValueType() { + auto optionalType = TOptionalType::Create(TDataType::Create(NUdf::TDataType::Id, TypeEnv), TypeEnv); + return TTaggedType::Create(optionalType, "opt_tag", TypeEnv); + } + + TUnboxedValueVector CreateTaggedsOptionalValue(ui32 quantity) { + TUnboxedValueVector values; + for (ui64 value = 0; value < quantity; ++value) { + if (value % 2 == 0) { + values.push_back(NUdf::TUnboxedValuePod(static_cast(value)).MakeOptional()); + } else { + values.emplace_back(); + } + } + return values; + } + + TType* GetTaggedOptionalOptionalValueType() { + auto innerOptional = TOptionalType::Create(TDataType::Create(NUdf::TDataType::Id, TypeEnv), TypeEnv); + auto outerOptional = TOptionalType::Create(innerOptional, TypeEnv); + return TTaggedType::Create(outerOptional, "opt_opt_tag", TypeEnv); + } + + TUnboxedValueVector CreateTaggedsOptionalOptionalValue(ui32 quantity) { + TUnboxedValueVector values; + for (ui64 value = 0; value < quantity; ++value) { + if (value % 3 == 0) { + values.push_back(NUdf::TUnboxedValuePod(static_cast(value)).MakeOptional().MakeOptional()); + } else if (value % 3 == 1) { + values.push_back(NUdf::TUnboxedValuePod().MakeOptional()); + } else { + values.emplace_back(); + } + } + return values; + } + + TType* GetTaggedTaggedValueType() { + auto innerTagged = TTaggedType::Create(TDataType::Create(NUdf::TDataType::Id, TypeEnv), "inner_tag", TypeEnv); + return TTaggedType::Create(innerTagged, "outer_tag", TypeEnv); + } + + TUnboxedValueVector CreateTaggedsTaggedValue(ui32 quantity) { + TUnboxedValueVector values; + for (ui64 value = 0; value < quantity; ++value) { + values.push_back(NUdf::TUnboxedValuePod(static_cast(value))); + } + return values; + } + + TType* 
GetTaggedOptionalTaggedValueType() { + auto innerTagged = TTaggedType::Create(TDataType::Create(NUdf::TDataType::Id, TypeEnv), "inner_tag", TypeEnv); + auto optional = TOptionalType::Create(innerTagged, TypeEnv); + return TTaggedType::Create(optional, "outer_tag", TypeEnv); + } + + TUnboxedValueVector CreateTaggedsOptionalTaggedValue(ui32 quantity) { + TUnboxedValueVector values; + for (ui64 value = 0; value < quantity; ++value) { + if (value % 2 == 0) { + values.push_back(NUdf::TUnboxedValuePod(static_cast(value)).MakeOptional()); + } else { + values.emplace_back(); + } + } + return values; + } + + TType* GetTaggedTaggedOptionalTaggedTaggedValueType() { + auto baseType = TDataType::Create(NUdf::TDataType::Id, TypeEnv); + auto innerTagged1 = TTaggedType::Create(baseType, "inner1", TypeEnv); + auto innerTagged2 = TTaggedType::Create(innerTagged1, "inner2", TypeEnv); + auto optional = TOptionalType::Create(innerTagged2, TypeEnv); + auto outerTagged1 = TTaggedType::Create(optional, "outer1", TypeEnv); + auto outerTagged2 = TTaggedType::Create(outerTagged1, "outer2", TypeEnv); + return outerTagged2; + } + + TUnboxedValueVector CreateTaggedsTaggedOptionalTaggedTaggedValue(ui32 quantity) { + TUnboxedValueVector values; + for (ui64 value = 0; value < quantity; ++value) { + if (value % 2 == 0) { + values.push_back(NUdf::TUnboxedValuePod(static_cast(value)).MakeOptional()); + } else { + values.emplace_back(); + } + } + return values; + } + + TType* GetVariantOverStructType() { + TStructMember members[4] = { + {"0_i32", TDataType::Create(NUdf::TDataType::Id, TypeEnv)}, + {"1_string", TDataType::Create(NUdf::TDataType::Id, TypeEnv)}, + {"2_float", TDataType::Create(NUdf::TDataType::Id, TypeEnv)}, + {"3_bool", TDataType::Create(NUdf::TDataType::Id, TypeEnv)} + }; + auto structType = TStructType::Create(4, members, TypeEnv); + return TVariantType::Create(structType, TypeEnv); + } + + TUnboxedValueVector CreateVariantsOverStruct(ui32 quantity) { + TUnboxedValueVector values; + 
for (ui64 value = 0; value < quantity; ++value) { + auto typeIndex = value % 4; + NUdf::TUnboxedValue item; + if (typeIndex == 0) { + item = NUdf::TUnboxedValuePod(static_cast(value)); + } else if (typeIndex == 1) { + item = MakeString(TStringBuilder() << "value=" << value); + } else if (typeIndex == 2) { + item = NUdf::TUnboxedValuePod(static_cast(value) / 4); + } else if (typeIndex == 3) { + item = NUdf::TUnboxedValuePod(value % 2 == 0); + } + auto wrapped = Vb.NewVariant(typeIndex, std::move(item)); + values.push_back(std::move(wrapped)); + } + return values; + } + + TType* GetVariantOverTupleType() { + TType* members[4] = { + TDataType::Create(NUdf::TDataType::Id, TypeEnv), + TDataType::Create(NUdf::TDataType::Id, TypeEnv), + TDataType::Create(NUdf::TDataType::Id, TypeEnv), + TDataType::Create(NUdf::TDataType::Id, TypeEnv) + }; + auto tupleType = TTupleType::Create(4, members, TypeEnv); + return TVariantType::Create(tupleType, TypeEnv); + } + + TUnboxedValueVector CreateVariantsOverTuple(ui32 quantity) { + TUnboxedValueVector values; + for (ui64 value = 0; value < quantity; ++value) { + auto typeIndex = value % 4; + NUdf::TUnboxedValue item; + if (typeIndex == 0) { + item = NUdf::TUnboxedValuePod(value % 3 == 0); + } else if (typeIndex == 1) { + item = NUdf::TUnboxedValuePod(static_cast(-value)); + } else if (typeIndex == 2) { + item = NUdf::TUnboxedValuePod(static_cast(value)); + } else if (typeIndex == 3) { + item = NUdf::TUnboxedValuePod(static_cast(-value)); + } + auto wrapped = Vb.NewVariant(typeIndex, std::move(item)); + values.emplace_back(std::move(wrapped)); + } + return values; + } + + TType* GetVariantNestedType() { + TVector members(VARIANT_NESTED_SIZE, nullptr); + for (ui32 i = 0; i < VARIANT_NESTED_SIZE; ++i) { + if (i % 3 == 0) { + members[i] = TDataType::Create(NUdf::TDataType::Id, TypeEnv); + } else if (i % 3 == 1) { + members[i] = TDataType::Create(NUdf::TDataType::Id, TypeEnv); + } else { + members[i] = TDataType::Create(NUdf::TDataType::Id, 
TypeEnv); + } + } + auto tupleType = TTupleType::Create(VARIANT_NESTED_SIZE, members.data(), TypeEnv); + return TVariantType::Create(tupleType, TypeEnv); + } + + TUnboxedValueVector CreateVariantsNested(ui32 quantity) { + TUnboxedValueVector values; + for (ui64 value = 0; value < quantity; ++value) { + auto typeIndex = value % VARIANT_NESTED_SIZE; + NUdf::TUnboxedValue item = NUdf::TUnboxedValuePod(static_cast(value)); + auto wrapped = Vb.NewVariant(typeIndex, std::move(item)); + values.emplace_back(std::move(wrapped)); + } + return values; + } + + TType* GetVariantOverLimitType() { + TVector members(VARIANT_OVER_LIMIT_SIZE, nullptr); + for (ui32 i = 0; i < VARIANT_OVER_LIMIT_SIZE; ++i) { + members[i] = TDataType::Create(NUdf::TDataType::Id, TypeEnv); + } + auto tupleType = TTupleType::Create(VARIANT_OVER_LIMIT_SIZE, members.data(), TypeEnv); + return TVariantType::Create(tupleType, TypeEnv); + } + + TUnboxedValueVector CreateVariantsOverLimit(ui32 quantity) { + TUnboxedValueVector values; + for (ui64 value = 0; value < quantity; ++value) { + auto typeIndex = value % VARIANT_OVER_LIMIT_SIZE; + NUdf::TUnboxedValue item = NUdf::TUnboxedValuePod(static_cast(value)); + auto wrapped = Vb.NewVariant(typeIndex, std::move(item)); + values.emplace_back(std::move(wrapped)); + } + return values; + } + + TType* GetVariantComprehensiveType() { + // Variant over Tuple containing all type categories: + // Data, Optional, Optional, Singular, Struct, Tuple, List, Dict, Variant, Tagged + TType* members[10] = { + TDataType::Create(NUdf::TDataType::Id, TypeEnv), + TOptionalType::Create(TDataType::Create(NUdf::TDataType::Id, TypeEnv), TypeEnv), + TOptionalType::Create(TOptionalType::Create(TDataType::Create(NUdf::TDataType::Id, TypeEnv), TypeEnv), TypeEnv), + GetTypeOfSingular(TypeEnv), + GetStructType(), + GetTupleType(), + GetListType(), + GetDictType(), + GetVariantOverTupleType(), + TTaggedType::Create(TDataType::Create(NUdf::TDataType::Id, TypeEnv), "tag", TypeEnv) + }; + auto 
tupleType = TTupleType::Create(10, members, TypeEnv); + return TVariantType::Create(tupleType, TypeEnv); + } + + TUnboxedValueVector CreateVariantsComprehensive(ui32 quantity) { + TUnboxedValueVector values; + auto structs = CreateStructs(5); + auto tuples = CreateTuples(5); + auto lists = CreateLists(5); + auto dicts = CreateDicts(5); + auto variants = CreateVariantsOverTuple(5); + + for (ui64 value = 0; value < quantity; ++value) { + auto typeIndex = value % 10; + NUdf::TUnboxedValue item; + + switch (typeIndex) { + case 0: + item = NUdf::TUnboxedValuePod(static_cast(value)); + break; + case 1: + if (value % 2 == 0) { + item = NUdf::TUnboxedValuePod(static_cast(value)).MakeOptional(); + } else { + item = NUdf::TUnboxedValuePod(); + } + break; + case 2: + if (value % 3 == 0) { + item = NUdf::TUnboxedValuePod(static_cast(value)).MakeOptional().MakeOptional(); + } else if (value % 3 == 1) { + item = NUdf::TUnboxedValuePod().MakeOptional(); + } else { + item = NUdf::TUnboxedValuePod(); + } + break; + case 3: + item = NUdf::TUnboxedValuePod(); + break; + case 4: + item = structs[value % structs.size()]; + break; + case 5: + item = tuples[value % tuples.size()]; + break; + case 6: + item = lists[value % lists.size()]; + break; + case 7: + item = dicts[value % dicts.size()]; + break; + case 8: + item = variants[value % variants.size()]; + break; + case 9: + item = NUdf::TUnboxedValuePod(static_cast(value)); + break; + } + + auto wrapped = Vb.NewVariant(typeIndex, std::move(item)); + values.emplace_back(std::move(wrapped)); + } + return values; + } +}; + +void AssertUnboxedValuesAreEqual(NUdf::TUnboxedValue& left, NUdf::TUnboxedValue& right, TType* type) { + switch (type->GetKind()) { + case TType::EKind::Void: + case TType::EKind::Null: + case TType::EKind::EmptyList: + case TType::EKind::EmptyDict: { + UNIT_ASSERT(!left.HasValue()); + UNIT_ASSERT(!right.HasValue()); + break; + } + + case TType::EKind::Data: { + auto dataType = static_cast(type); + auto dataSlot = 
*dataType->GetDataSlot().Get(); + + switch (dataSlot) { + case NUdf::EDataSlot::JsonDocument: + /* Re-serialize both sides with NBinaryJson::SerializeToJson, then deliberately share the string comparison of the Json/Yson case below. */ + left = MakeString(NKikimr::NBinaryJson::SerializeToJson(left.AsStringRef())); + right = MakeString(NKikimr::NBinaryJson::SerializeToJson(right.AsStringRef())); + [[fallthrough]]; + case NUdf::EDataSlot::Json: + case NUdf::EDataSlot::Yson: { + UNIT_ASSERT_VALUES_EQUAL(std::string(left.AsStringRef()), std::string(right.AsStringRef())); + break; + } + + default: { + UNIT_ASSERT(NUdf::EquateValues(dataSlot, left, right)); + } + } + break; + } + + case TType::EKind::Optional: { + /* Both sides must agree on presence before the payloads are compared. */ + UNIT_ASSERT_VALUES_EQUAL(left.HasValue(), right.HasValue()); + if (left.HasValue()) { + auto innerType = static_cast(type)->GetItemType(); + NUdf::TUnboxedValue leftInner = left.GetOptionalValue(); + NUdf::TUnboxedValue rightInner = right.GetOptionalValue(); + AssertUnboxedValuesAreEqual(leftInner, rightInner, innerType); + } + break; + } + + case TType::EKind::List: { + auto listType = static_cast(type); + auto itemType = listType->GetItemType(); + + auto leftPtr = left.GetElements(); + auto rightPtr = right.GetElements(); + UNIT_ASSERT_VALUES_EQUAL(leftPtr != nullptr, rightPtr != nullptr); + + if (leftPtr != nullptr) { + /* Both lists expose direct element pointers: compare lengths, then walk the two arrays in lockstep. */ + auto leftLen = left.GetListLength(); + auto rightLen = right.GetListLength(); + UNIT_ASSERT_VALUES_EQUAL(leftLen, rightLen); + + while (leftLen > 0) { + NUdf::TUnboxedValue leftItem = *leftPtr++; + NUdf::TUnboxedValue rightItem = *rightPtr++; + AssertUnboxedValuesAreEqual(leftItem, rightItem, itemType); + --leftLen; + } + } else { + /* Neither list exposes element pointers: walk both through their iterators. */ + const auto leftIter = left.GetListIterator(); + const auto rightIter = right.GetListIterator(); + + NUdf::TUnboxedValue leftItem; + NUdf::TUnboxedValue rightItem; + /* Each iterator must fill its own slot; writing the right-hand element into leftItem left rightItem empty and clobbered the left element. */ + bool leftHasValue = leftIter.Next(leftItem); + bool rightHasValue = rightIter.Next(rightItem); + + while (leftHasValue && rightHasValue) { + AssertUnboxedValuesAreEqual(leftItem, rightItem, itemType); + leftHasValue = leftIter.Next(leftItem); + rightHasValue = rightIter.Next(rightItem); + } + 
UNIT_ASSERT_VALUES_EQUAL(leftHasValue, rightHasValue); + } + break; + } + + case TType::EKind::Struct: { + auto structType = static_cast(type); + UNIT_ASSERT_EQUAL(left.GetListLength(), structType->GetMembersCount()); + UNIT_ASSERT_EQUAL(right.GetListLength(), structType->GetMembersCount()); + for (ui32 index = 0; index < structType->GetMembersCount(); ++index) { + auto memberType = structType->GetMemberType(index); + NUdf::TUnboxedValue leftMember = left.GetElement(index); + NUdf::TUnboxedValue rightMember = right.GetElement(index); + AssertUnboxedValuesAreEqual(leftMember, rightMember, memberType); + } + break; + } + + case TType::EKind::Tuple: { + auto tupleType = static_cast(type); + + UNIT_ASSERT_EQUAL(left.GetListLength(), tupleType->GetElementsCount()); + UNIT_ASSERT_EQUAL(right.GetListLength(), tupleType->GetElementsCount()); + + for (ui32 index = 0; index < tupleType->GetElementsCount(); ++index) { + auto elementType = tupleType->GetElementType(index); + NUdf::TUnboxedValue leftMember = left.GetElement(index); + NUdf::TUnboxedValue rightMember = right.GetElement(index); + AssertUnboxedValuesAreEqual(leftMember, rightMember, elementType); + } + break; + } + + case TType::EKind::Dict: { + auto dictType = static_cast(type); + UNIT_ASSERT_VALUES_EQUAL(left.GetDictLength(), right.GetDictLength()); + + const auto leftIter = left.GetDictIterator(); + for (NUdf::TUnboxedValue key, leftPayload; leftIter.NextPair(key, leftPayload);) { + UNIT_ASSERT(right.Contains(key)); + NUdf::TUnboxedValue rightPayload = right.Lookup(key); + AssertUnboxedValuesAreEqual(leftPayload, rightPayload, dictType->GetPayloadType()); + } + break; + } + + case TType::EKind::Variant: { + auto variantType = static_cast(type); + UNIT_ASSERT_EQUAL(left.GetVariantIndex(), right.GetVariantIndex()); + ui32 variantIndex = left.GetVariantIndex(); + TType* innerType = variantType->GetUnderlyingType(); + if (innerType->IsStruct()) { + innerType = static_cast(innerType)->GetMemberType(variantIndex); + } 
else { + UNIT_ASSERT_C(innerType->IsTuple(), "Unexpected underlying variant type: " << innerType->GetKindAsStr()); + innerType = static_cast(innerType)->GetElementType(variantIndex); + } + NUdf::TUnboxedValue leftValue = left.GetVariantItem(); + NUdf::TUnboxedValue rightValue = right.GetVariantItem(); + AssertUnboxedValuesAreEqual(leftValue, rightValue, innerType); + break; + } + + case TType::EKind::Tagged: { + auto taggedType = static_cast(type); + AssertUnboxedValuesAreEqual(left, right, taggedType->GetBaseType()); + break; + } + + default: { + UNIT_ASSERT_C(false, TStringBuilder() << "Unsupported type: " << type->GetKindAsStr()); + } + } +} + +} // namespace + +namespace NKikimr::NKqp::NFormats { + +namespace { + +template +void TestDataTypeConversion(arrow::Type::type arrowTypeId) { + TTestContext context; + + auto type = TDataType::Create(NUdf::TDataType::Id, context.TypeEnv); + UNIT_ASSERT(IsArrowCompatible(type)); + + TUnboxedValueVector values; + values.reserve(TEST_ARRAY_DATATYPE_SIZE); + + for (size_t i = 0; i < TEST_ARRAY_DATATYPE_SIZE; ++i) { + values.emplace_back(GetValueOfBasicType(type, i)); + } + + auto array = MakeArrowArray(values, type); + UNIT_ASSERT_C(array->ValidateFull().ok(), array->ValidateFull().ToString()); + UNIT_ASSERT_VALUES_EQUAL(array->length(), values.size()); + + std::shared_ptr typedArray; + std::shared_ptr timezoneArray; + + if constexpr (IsTimezoneType) { + UNIT_ASSERT(array->type_id() == arrow::Type::STRUCT); + auto structArray = static_pointer_cast(array); + UNIT_ASSERT_VALUES_EQUAL(structArray->num_fields(), 2); + + UNIT_ASSERT(structArray->field(0)->type_id() == arrowTypeId); + UNIT_ASSERT(structArray->field(1)->type_id() == arrow::Type::STRING); + + typedArray = static_pointer_cast(structArray->field(0)); + timezoneArray = static_pointer_cast(structArray->field(1)); + } else { + UNIT_ASSERT(array->type_id() == arrowTypeId); + typedArray = static_pointer_cast(array); + } + + for (size_t i = 0; i < TEST_ARRAY_DATATYPE_SIZE; 
++i) { + auto arrowValue = ExtractUnboxedValue(array, i, type, context.HolderFactory); + AssertUnboxedValuesAreEqual(arrowValue, values[i], type); + } +} + +template +void TestFixedSizeBinaryDataTypeConversion() { + TTestContext context; + TType* type; + + if constexpr (IsDecimalType) { + type = TDataDecimalType::Create(35, 10, context.TypeEnv); + } else { + type = TDataType::Create(NUdf::TDataType::Id, context.TypeEnv); + } + + UNIT_ASSERT(IsArrowCompatible(type)); + + TUnboxedValueVector values; + values.reserve(TEST_ARRAY_DATATYPE_SIZE); + + for (size_t i = 0; i < TEST_ARRAY_DATATYPE_SIZE; ++i) { + values.emplace_back(GetValueOfBasicType(type, i)); + } + + auto array = MakeArrowArray(values, type); + UNIT_ASSERT_C(array->ValidateFull().ok(), array->ValidateFull().ToString()); + UNIT_ASSERT_VALUES_EQUAL(array->length(), values.size()); + + std::shared_ptr typedArray; + + UNIT_ASSERT(array->type_id() == arrow::Type::FIXED_SIZE_BINARY); + typedArray = static_pointer_cast(array); + UNIT_ASSERT_VALUES_EQUAL(typedArray->byte_width(), NScheme::FSB_SIZE); + + for (size_t i = 0; i < TEST_ARRAY_DATATYPE_SIZE; ++i) { + auto arrowValue = ExtractUnboxedValue(array, i, type, context.HolderFactory); + AssertUnboxedValuesAreEqual(arrowValue, values[i], type); + } +} + +template +void TestSingularTypeConversion() { + TTestContext context; + + TType* type = GetTypeOfSingular(context.TypeEnv); + UNIT_ASSERT(IsArrowCompatible(type)); + + TUnboxedValueVector values; + values.reserve(TEST_ARRAY_DATATYPE_SIZE); + + for (size_t i = 0; i < TEST_ARRAY_DATATYPE_SIZE; ++i) { + values.emplace_back(); + } + + auto array = MakeArrowArray(values, type); + UNIT_ASSERT_C(array->ValidateFull().ok(), array->ValidateFull().ToString()); + UNIT_ASSERT_VALUES_EQUAL(array->length(), TEST_ARRAY_DATATYPE_SIZE); + + if (SingularKind == TType::EKind::Null) { + UNIT_ASSERT(array->type_id() == arrow::Type::NA); + } else { + UNIT_ASSERT(array->type_id() == arrow::Type::STRUCT); + auto structArray = 
static_pointer_cast(array); + UNIT_ASSERT_VALUES_EQUAL(structArray->num_fields(), 0); + } + + for (size_t i = 0; i < TEST_ARRAY_DATATYPE_SIZE; ++i) { + auto arrowValue = ExtractUnboxedValue(array, i, type, context.HolderFactory); + AssertUnboxedValuesAreEqual(arrowValue, values[i], type); + } +} + +} // namespace + /* Conversion suite: every DataType_* case that takes an argument passes the Arrow type id its helper asserts on the converted array. */ +Y_UNIT_TEST_SUITE(KqpFormats_Arrow_Conversion) { + + // Integral types + Y_UNIT_TEST(DataType_Bool) { + TestDataTypeConversion(arrow::Type::UINT8); + } + + Y_UNIT_TEST(DataType_Int8) { + TestDataTypeConversion(arrow::Type::INT8); + } + + Y_UNIT_TEST(DataType_UInt8) { + TestDataTypeConversion(arrow::Type::UINT8); + } + + Y_UNIT_TEST(DataType_Int16) { + TestDataTypeConversion(arrow::Type::INT16); + } + + Y_UNIT_TEST(DataType_UInt16) { + TestDataTypeConversion(arrow::Type::UINT16); + } + + Y_UNIT_TEST(DataType_Int32) { + TestDataTypeConversion(arrow::Type::INT32); + } + + Y_UNIT_TEST(DataType_UInt32) { + TestDataTypeConversion(arrow::Type::UINT32); + } + + Y_UNIT_TEST(DataType_Int64) { + TestDataTypeConversion(arrow::Type::INT64); + } + + Y_UNIT_TEST(DataType_UInt64) { + TestDataTypeConversion(arrow::Type::UINT64); + } + + // Binary number types + Y_UNIT_TEST(DataType_Decimal) { + TestFixedSizeBinaryDataTypeConversion(); + } + + Y_UNIT_TEST(DataType_DyNumber) { + TestDataTypeConversion(arrow::Type::STRING); + } + + // Floating point types + Y_UNIT_TEST(DataType_Float) { + TestDataTypeConversion(arrow::Type::FLOAT); + } + + Y_UNIT_TEST(DataType_Double) { + TestDataTypeConversion(arrow::Type::DOUBLE); + } + + // Datetime types + Y_UNIT_TEST(DataType_Date) { + TestDataTypeConversion(arrow::Type::UINT16); + } + + Y_UNIT_TEST(DataType_Datetime) { + TestDataTypeConversion(arrow::Type::UINT32); + } + + Y_UNIT_TEST(DataType_Timestamp) { + TestDataTypeConversion(arrow::Type::UINT64); + } + + Y_UNIT_TEST(DataType_Interval) { + TestDataTypeConversion(arrow::Type::INT64); + } + /* NOTE(review): the Tz* cases presumably instantiate the helper with its timezone flag set, in which case the id given here is checked against field 0 of the resulting struct - confirm against the template arguments. */ + Y_UNIT_TEST(DataType_TzDate) { + TestDataTypeConversion(arrow::Type::UINT16); + } + + 
Y_UNIT_TEST(DataType_TzDatetime) { + TestDataTypeConversion(arrow::Type::UINT32); + } + + Y_UNIT_TEST(DataType_TzTimestamp) { + TestDataTypeConversion(arrow::Type::UINT64); + } + + Y_UNIT_TEST(DataType_Date32) { + TestDataTypeConversion(arrow::Type::INT32); + } + + Y_UNIT_TEST(DataType_Datetime64) { + TestDataTypeConversion(arrow::Type::INT64); + } + + Y_UNIT_TEST(DataType_Timestamp64) { + TestDataTypeConversion(arrow::Type::INT64); + } + + Y_UNIT_TEST(DataType_Interval64) { + TestDataTypeConversion(arrow::Type::INT64); + } + + Y_UNIT_TEST(DataType_TzDate32) { + TestDataTypeConversion(arrow::Type::INT32); + } + + Y_UNIT_TEST(DataType_TzDatetime64) { + TestDataTypeConversion(arrow::Type::INT64); + } + + Y_UNIT_TEST(DataType_TzTimestamp64) { + TestDataTypeConversion(arrow::Type::INT64); + } + + // String types + Y_UNIT_TEST(DataType_String) { + TestDataTypeConversion(arrow::Type::BINARY); + } + + Y_UNIT_TEST(DataType_Utf8) { + TestDataTypeConversion(arrow::Type::STRING); + } + + Y_UNIT_TEST(DataType_Yson) { + TestDataTypeConversion(arrow::Type::BINARY); + } + + Y_UNIT_TEST(DataType_Json) { + TestDataTypeConversion(arrow::Type::STRING); + } + + Y_UNIT_TEST(DataType_JsonDocument) { + TestDataTypeConversion(arrow::Type::STRING); + } + /* Uuid is checked through the fixed-size-binary helper, not TestDataTypeConversion. */ + Y_UNIT_TEST(DataType_Uuid) { + TestFixedSizeBinaryDataTypeConversion(); + } + + // Singular types + Y_UNIT_TEST(DataType_Null) { + TestSingularTypeConversion(); + } + + Y_UNIT_TEST(DataType_Void) { + TestSingularTypeConversion(); + } + + Y_UNIT_TEST(DataType_EmptyList) { + TestSingularTypeConversion(); + } + + Y_UNIT_TEST(DataType_EmptyDict) { + TestSingularTypeConversion(); + } + + // Nested types + /* List of data values: expects an Arrow LIST with a single child whose value type is INT32; every row must read back equal to its source value. */ Y_UNIT_TEST(NestedType_List_DataValue) { + TTestContext context; + + auto listType = context.GetListType(); + auto values = context.CreateLists(TEST_ARRAY_NESTED_SIZE); + + UNIT_ASSERT(IsArrowCompatible(listType)); + + auto array = MakeArrowArray(values, listType); + UNIT_ASSERT_C(array->ValidateFull().ok(), 
array->ValidateFull().ToString()); + UNIT_ASSERT_VALUES_EQUAL(array->length(), values.size()); + + UNIT_ASSERT(array->type_id() == arrow::Type::LIST); + auto listArray = static_pointer_cast(array); + UNIT_ASSERT_VALUES_EQUAL(listArray->num_fields(), 1); + UNIT_ASSERT(listArray->value_type()->id() == arrow::Type::INT32); + + for (size_t i = 0; i < values.size(); ++i) { + auto arrowValue = ExtractUnboxedValue(array, i, listType, context.HolderFactory); + AssertUnboxedValuesAreEqual(arrowValue, values[i], listType); + } + } + /* List of nested values: expects LIST of STRUCT, and each row's value slice must be a 2-field struct (UINT8, INT8) before the row is compared with its source value. */ + Y_UNIT_TEST(NestedType_List_NestedValue) { + TTestContext context; + + auto listType = context.GetListNestedValueType(); + auto values = context.CreateListsNestedValue(TEST_ARRAY_NESTED_SIZE); + + UNIT_ASSERT(IsArrowCompatible(listType)); + + auto array = MakeArrowArray(values, listType); + UNIT_ASSERT_C(array->ValidateFull().ok(), array->ValidateFull().ToString()); + UNIT_ASSERT_VALUES_EQUAL(array->length(), values.size()); + + UNIT_ASSERT(array->type_id() == arrow::Type::LIST); + auto listArray = static_pointer_cast(array); + UNIT_ASSERT_VALUES_EQUAL(listArray->num_fields(), 1); + UNIT_ASSERT(listArray->value_type()->id() == arrow::Type::STRUCT); + + for (size_t i = 0; i < values.size(); ++i) { + auto structArray = static_pointer_cast(listArray->value_slice(i)); + UNIT_ASSERT_VALUES_EQUAL(structArray->num_fields(), 2); + UNIT_ASSERT(structArray->field(0)->type_id() == arrow::Type::UINT8); + UNIT_ASSERT(structArray->field(1)->type_id() == arrow::Type::INT8); + + auto arrowValue = ExtractUnboxedValue(array, i, listType, context.HolderFactory); + AssertUnboxedValuesAreEqual(arrowValue, values[i], listType); + } + } + /* List of optional values: expects a LIST with a single child whose value type is INT32; every row must read back equal to its source value. */ + Y_UNIT_TEST(NestedType_List_OptionalValue) { + TTestContext context; + + auto listType = context.GetListOptionalValueType(); + auto values = context.CreateListsOptionalValue(TEST_ARRAY_NESTED_SIZE); + + UNIT_ASSERT(IsArrowCompatible(listType)); + + auto array = MakeArrowArray(values, listType); + 
UNIT_ASSERT_C(array->ValidateFull().ok(), array->ValidateFull().ToString()); + UNIT_ASSERT_VALUES_EQUAL(array->length(), values.size()); + + UNIT_ASSERT(array->type_id() == arrow::Type::LIST); + auto listArray = static_pointer_cast(array); + UNIT_ASSERT_VALUES_EQUAL(listArray->num_fields(), 1); + UNIT_ASSERT(listArray->value_type()->id() == arrow::Type::INT32); + + for (size_t i = 0; i < values.size(); ++i) { + auto arrowValue = ExtractUnboxedValue(array, i, listType, context.HolderFactory); + AssertUnboxedValuesAreEqual(arrowValue, values[i], listType); + } + } + /* List of optional variants: expects LIST of STRUCT; each row's value slice must be a 1-field struct wrapping a DENSE_UNION with four children (UINT8, INT16, UINT16, INT32). */ + Y_UNIT_TEST(NestedType_List_OptionalVariantValue) { + TTestContext context; + + auto listType = context.GetListOptionalVariantValueType(); + auto values = context.CreateListsOptionalVariantValue(TEST_ARRAY_NESTED_SIZE); + + UNIT_ASSERT(IsArrowCompatible(listType)); + + auto array = MakeArrowArray(values, listType); + UNIT_ASSERT_C(array->ValidateFull().ok(), array->ValidateFull().ToString()); + UNIT_ASSERT_VALUES_EQUAL(array->length(), values.size()); + UNIT_ASSERT(array->type_id() == arrow::Type::LIST); + + auto listArray = static_pointer_cast(array); + UNIT_ASSERT_VALUES_EQUAL(listArray->num_fields(), 1); + UNIT_ASSERT(listArray->value_type()->id() == arrow::Type::STRUCT); + + for (size_t i = 0; i < values.size(); ++i) { + auto structArray = static_pointer_cast(listArray->value_slice(i)); + UNIT_ASSERT_VALUES_EQUAL(structArray->num_fields(), 1); + UNIT_ASSERT(structArray->field(0)->type_id() == arrow::Type::DENSE_UNION); + + auto unionArray = static_pointer_cast(structArray->field(0)); + UNIT_ASSERT_VALUES_EQUAL(unionArray->num_fields(), 4); + UNIT_ASSERT(unionArray->field(0)->type_id() == arrow::Type::UINT8); + UNIT_ASSERT(unionArray->field(1)->type_id() == arrow::Type::INT16); + UNIT_ASSERT(unionArray->field(2)->type_id() == arrow::Type::UINT16); + UNIT_ASSERT(unionArray->field(3)->type_id() == arrow::Type::INT32); + + auto arrowValue = ExtractUnboxedValue(array, i, listType, context.HolderFactory); 
+ AssertUnboxedValuesAreEqual(arrowValue, values[i], listType); + }; + } + /* List of tagged values: expects a LIST with a single child whose value type is INT32; every row must read back equal to its source value. */ + Y_UNIT_TEST(NestedType_List_TaggedValue) { + TTestContext context; + + auto listType = context.GetListTaggedValueType(); + auto values = context.CreateListsTaggedValue(TEST_ARRAY_NESTED_SIZE); + + UNIT_ASSERT(IsArrowCompatible(listType)); + + auto array = MakeArrowArray(values, listType); + UNIT_ASSERT_C(array->ValidateFull().ok(), array->ValidateFull().ToString()); + UNIT_ASSERT_VALUES_EQUAL(array->length(), values.size()); + + UNIT_ASSERT(array->type_id() == arrow::Type::LIST); + auto listArray = static_pointer_cast(array); + UNIT_ASSERT_VALUES_EQUAL(listArray->num_fields(), 1); + UNIT_ASSERT(listArray->value_type()->id() == arrow::Type::INT32); + + for (size_t i = 0; i < values.size(); ++i) { + auto arrowValue = ExtractUnboxedValue(array, i, listType, context.HolderFactory); + AssertUnboxedValuesAreEqual(arrowValue, values[i], listType); + } + } + /* Tuple of data values: expects a 3-field STRUCT (UINT8, INT8, UINT8) whose children each hold one entry per input row. */ + Y_UNIT_TEST(NestedType_Tuple_DataValue) { + TTestContext context; + + auto tupleType = context.GetTupleType(); + auto values = context.CreateTuples(TEST_ARRAY_NESTED_SIZE); + + UNIT_ASSERT(IsArrowCompatible(tupleType)); + + auto array = MakeArrowArray(values, tupleType); + UNIT_ASSERT_C(array->ValidateFull().ok(), array->ValidateFull().ToString()); + UNIT_ASSERT_VALUES_EQUAL(array->length(), values.size()); + + UNIT_ASSERT(array->type_id() == arrow::Type::STRUCT); + auto structArray = static_pointer_cast(array); + UNIT_ASSERT_VALUES_EQUAL(structArray->num_fields(), 3); + + UNIT_ASSERT(structArray->field(0)->type_id() == arrow::Type::UINT8); + UNIT_ASSERT(structArray->field(1)->type_id() == arrow::Type::INT8); + UNIT_ASSERT(structArray->field(2)->type_id() == arrow::Type::UINT8); + + UNIT_ASSERT_VALUES_EQUAL(static_cast(structArray->field(0)->length()), values.size()); + UNIT_ASSERT_VALUES_EQUAL(static_cast(structArray->field(1)->length()), values.size()); + UNIT_ASSERT_VALUES_EQUAL(static_cast(structArray->field(2)->length()), 
values.size()); + + for (size_t i = 0; i < values.size(); ++i) { + auto arrowValue = ExtractUnboxedValue(array, i, tupleType, context.HolderFactory); + AssertUnboxedValuesAreEqual(arrowValue, values[i], tupleType); + } + } + /* Tuple of nested values: expects a 2-field STRUCT whose children are a LIST and a STRUCT; every row must read back equal to its source value. */ + Y_UNIT_TEST(NestedType_Tuple_NestedValue) { + TTestContext context; + + auto tupleType = context.GetTupleNestedValueType(); + auto values = context.CreateTuplesNestedValue(TEST_ARRAY_NESTED_SIZE); + + UNIT_ASSERT(IsArrowCompatible(tupleType)); + + auto array = MakeArrowArray(values, tupleType); + UNIT_ASSERT_C(array->ValidateFull().ok(), array->ValidateFull().ToString()); + UNIT_ASSERT_VALUES_EQUAL(array->length(), values.size()); + + UNIT_ASSERT(array->type_id() == arrow::Type::STRUCT); + auto structArray = static_pointer_cast(array); + UNIT_ASSERT_VALUES_EQUAL(structArray->num_fields(), 2); + + UNIT_ASSERT(structArray->field(0)->type_id() == arrow::Type::LIST); + UNIT_ASSERT(structArray->field(1)->type_id() == arrow::Type::STRUCT); + + for (size_t i = 0; i < values.size(); ++i) { + auto arrowValue = ExtractUnboxedValue(array, i, tupleType, context.HolderFactory); + AssertUnboxedValuesAreEqual(arrowValue, values[i], tupleType); + } + } + /* Tuple of optional values: expects a 3-field STRUCT (INT32, STRUCT, STRUCT); field 1 must be a 1-field struct holding NA and field 2 a 1-field struct holding INT32. */ + Y_UNIT_TEST(NestedType_Tuple_OptionalValue) { + TTestContext context; + + auto tupleType = context.GetTupleOptionalValueType(); + auto values = context.CreateTuplesOptionalValue(TEST_ARRAY_NESTED_SIZE); + + UNIT_ASSERT(IsArrowCompatible(tupleType)); + + auto array = MakeArrowArray(values, tupleType); + UNIT_ASSERT_C(array->ValidateFull().ok(), array->ValidateFull().ToString()); + UNIT_ASSERT_VALUES_EQUAL(array->length(), values.size()); + + UNIT_ASSERT(array->type_id() == arrow::Type::STRUCT); + auto structArray = static_pointer_cast(array); + UNIT_ASSERT_VALUES_EQUAL(structArray->num_fields(), 3); + + UNIT_ASSERT(structArray->field(0)->type_id() == arrow::Type::INT32); + UNIT_ASSERT(structArray->field(1)->type_id() == arrow::Type::STRUCT); + UNIT_ASSERT(structArray->field(2)->type_id() == 
arrow::Type::STRUCT); + + auto secondStructArray = static_pointer_cast(structArray->field(1)); + UNIT_ASSERT_VALUES_EQUAL(secondStructArray->num_fields(), 1); + UNIT_ASSERT(secondStructArray->field(0)->type_id() == arrow::Type::NA); + + auto thirdStructArray = static_pointer_cast(structArray->field(2)); + UNIT_ASSERT_VALUES_EQUAL(thirdStructArray->num_fields(), 1); + UNIT_ASSERT(thirdStructArray->field(0)->type_id() == arrow::Type::INT32); + + for (size_t i = 0; i < values.size(); ++i) { + auto arrowValue = ExtractUnboxedValue(array, i, tupleType, context.HolderFactory); + AssertUnboxedValuesAreEqual(arrowValue, values[i], tupleType); + } + } + /* Tuple of tagged values: expects a 2-field STRUCT (INT32, INT32); every row must read back equal to its source value. */ + Y_UNIT_TEST(NestedType_Tuple_TaggedValue) { + TTestContext context; + + auto tupleType = context.GetTupleTaggedValueType(); + auto values = context.CreateTuplesTaggedValue(TEST_ARRAY_NESTED_SIZE); + + UNIT_ASSERT(IsArrowCompatible(tupleType)); + + auto array = MakeArrowArray(values, tupleType); + UNIT_ASSERT_C(array->ValidateFull().ok(), array->ValidateFull().ToString()); + UNIT_ASSERT_VALUES_EQUAL(array->length(), values.size()); + + UNIT_ASSERT(array->type_id() == arrow::Type::STRUCT); + auto structArray = static_pointer_cast(array); + UNIT_ASSERT_VALUES_EQUAL(structArray->num_fields(), 2); + + UNIT_ASSERT(structArray->field(0)->type_id() == arrow::Type::INT32); + UNIT_ASSERT(structArray->field(1)->type_id() == arrow::Type::INT32); + + for (size_t i = 0; i < values.size(); ++i) { + auto arrowValue = ExtractUnboxedValue(array, i, tupleType, context.HolderFactory); + AssertUnboxedValuesAreEqual(arrowValue, values[i], tupleType); + } + } + /* Struct of data values: expects a 5-field STRUCT with members ABC, DEF, GHI, JKL, MNO at positions 0-4, typed BINARY, INT32, UINT64, INT64, STRING, each child holding one entry per input row. */ + Y_UNIT_TEST(NestedType_Struct_DataValue) { + TTestContext context; + + auto structType = context.GetStructType(); + auto values = context.CreateStructs(TEST_ARRAY_NESTED_SIZE); + + UNIT_ASSERT(IsArrowCompatible(structType)); + + auto array = MakeArrowArray(values, structType); + UNIT_ASSERT_C(array->ValidateFull().ok(), array->ValidateFull().ToString()); + 
UNIT_ASSERT_VALUES_EQUAL(array->length(), values.size()); + + UNIT_ASSERT(array->type_id() == arrow::Type::STRUCT); + auto structArray = static_pointer_cast(array); + UNIT_ASSERT_VALUES_EQUAL(structArray->num_fields(), 5); + + UNIT_ASSERT(structArray->GetFieldByName("ABC") && structArray->GetFieldByName("ABC") == structArray->field(0)); + UNIT_ASSERT(structArray->GetFieldByName("DEF") && structArray->GetFieldByName("DEF") == structArray->field(1)); + UNIT_ASSERT(structArray->GetFieldByName("GHI") && structArray->GetFieldByName("GHI") == structArray->field(2)); + UNIT_ASSERT(structArray->GetFieldByName("JKL") && structArray->GetFieldByName("JKL") == structArray->field(3)); + UNIT_ASSERT(structArray->GetFieldByName("MNO") && structArray->GetFieldByName("MNO") == structArray->field(4)); + + UNIT_ASSERT(structArray->field(0)->type_id() == arrow::Type::BINARY); + UNIT_ASSERT(structArray->field(1)->type_id() == arrow::Type::INT32); + UNIT_ASSERT(structArray->field(2)->type_id() == arrow::Type::UINT64); + UNIT_ASSERT(structArray->field(3)->type_id() == arrow::Type::INT64); + UNIT_ASSERT(structArray->field(4)->type_id() == arrow::Type::STRING); + + for (int i = 0; i < structArray->num_fields(); ++i) { + UNIT_ASSERT_VALUES_EQUAL(structArray->field(i)->length(), values.size()); + } + + for (size_t i = 0; i < values.size(); ++i) { + auto arrowValue = ExtractUnboxedValue(array, i, structType, context.HolderFactory); + AssertUnboxedValuesAreEqual(arrowValue, values[i], structType); + } + } + /* Struct of nested values: expects a 3-field STRUCT with members 56, 78, 910 at positions 0-2, typed LIST, STRUCT, INT32; the inner struct must have members 12 (UINT8) and 34 (INT8). */ + Y_UNIT_TEST(NestedType_Struct_NestedValue) { + TTestContext context; + + auto structType = context.GetStructNestedValueType(); + auto values = context.CreateStructsNestedValue(TEST_ARRAY_NESTED_SIZE); + + UNIT_ASSERT(IsArrowCompatible(structType)); + + auto array = MakeArrowArray(values, structType); + UNIT_ASSERT_C(array->ValidateFull().ok(), array->ValidateFull().ToString()); + UNIT_ASSERT_VALUES_EQUAL(array->length(), values.size()); + + UNIT_ASSERT(array->type_id() == 
arrow::Type::STRUCT); + auto structArray = static_pointer_cast(array); + UNIT_ASSERT_VALUES_EQUAL(structArray->num_fields(), 3); + + UNIT_ASSERT(structArray->GetFieldByName("56") && structArray->GetFieldByName("56") == structArray->field(0)); + UNIT_ASSERT(structArray->GetFieldByName("78") && structArray->GetFieldByName("78") == structArray->field(1)); + UNIT_ASSERT(structArray->GetFieldByName("910") && structArray->GetFieldByName("910") == structArray->field(2)); + + UNIT_ASSERT(structArray->field(0)->type_id() == arrow::Type::LIST); + UNIT_ASSERT(structArray->field(1)->type_id() == arrow::Type::STRUCT); + UNIT_ASSERT(structArray->field(2)->type_id() == arrow::Type::INT32); + + auto innerStructArray = static_pointer_cast(structArray->field(1)); + UNIT_ASSERT_VALUES_EQUAL(innerStructArray->num_fields(), 2); + UNIT_ASSERT(innerStructArray->GetFieldByName("12") && innerStructArray->GetFieldByName("12") == innerStructArray->field(0)); + UNIT_ASSERT(innerStructArray->GetFieldByName("34") && innerStructArray->GetFieldByName("34") == innerStructArray->field(1)); + UNIT_ASSERT(innerStructArray->field(0)->type_id() == arrow::Type::UINT8); + UNIT_ASSERT(innerStructArray->field(1)->type_id() == arrow::Type::INT8); + + for (size_t i = 0; i < values.size(); ++i) { + auto arrowValue = ExtractUnboxedValue(array, i, structType, context.HolderFactory); + AssertUnboxedValuesAreEqual(arrowValue, values[i], structType); + } + } + /* Struct of optional values: expects a 4-field STRUCT with members opt1, opt2, opt3, optless at positions 0-3, typed INT32, STRUCT, STRUCT, UINT64; opt2 must be a 1-field struct holding NA and opt3 a 1-field struct holding INT32. */ + Y_UNIT_TEST(NestedType_Struct_OptionalValue) { + TTestContext context; + + auto structType = context.GetStructOptionalValueType(); + auto values = context.CreateStructsOptionalValue(TEST_ARRAY_NESTED_SIZE); + + UNIT_ASSERT(IsArrowCompatible(structType)); + + auto array = MakeArrowArray(values, structType); + UNIT_ASSERT_C(array->ValidateFull().ok(), array->ValidateFull().ToString()); + UNIT_ASSERT_VALUES_EQUAL(array->length(), values.size()); + + UNIT_ASSERT(array->type_id() == arrow::Type::STRUCT); + auto structArray = static_pointer_cast(array); + 
UNIT_ASSERT_VALUES_EQUAL(structArray->num_fields(), 4); + + UNIT_ASSERT(structArray->GetFieldByName("opt1") && structArray->GetFieldByName("opt1") == structArray->field(0)); + UNIT_ASSERT(structArray->GetFieldByName("opt2") && structArray->GetFieldByName("opt2") == structArray->field(1)); + UNIT_ASSERT(structArray->GetFieldByName("opt3") && structArray->GetFieldByName("opt3") == structArray->field(2)); + UNIT_ASSERT(structArray->GetFieldByName("optless") && structArray->GetFieldByName("optless") == structArray->field(3)); + + UNIT_ASSERT(structArray->field(0)->type_id() == arrow::Type::INT32); + UNIT_ASSERT(structArray->field(1)->type_id() == arrow::Type::STRUCT); + UNIT_ASSERT(structArray->field(2)->type_id() == arrow::Type::STRUCT); + UNIT_ASSERT(structArray->field(3)->type_id() == arrow::Type::UINT64); + + auto optNullStructArray = static_pointer_cast(structArray->field(1)); + UNIT_ASSERT_VALUES_EQUAL(optNullStructArray->num_fields(), 1); + UNIT_ASSERT(optNullStructArray->field(0)->type_id() == arrow::Type::NA); + + auto optOptStructArray = static_pointer_cast(structArray->field(2)); + UNIT_ASSERT_VALUES_EQUAL(optOptStructArray->num_fields(), 1); + UNIT_ASSERT(optOptStructArray->field(0)->type_id() == arrow::Type::INT32); + + for (size_t i = 0; i < values.size(); ++i) { + auto arrowValue = ExtractUnboxedValue(array, i, structType, context.HolderFactory); + AssertUnboxedValuesAreEqual(arrowValue, values[i], structType); + } + } + /* Struct of tagged values: expects a 3-field STRUCT with members 1, 2, 3 at positions 0-2, typed INT32, INT32, UINT64. */ + Y_UNIT_TEST(NestedType_Struct_TaggedValue) { + TTestContext context; + + auto structType = context.GetStructTaggedValueType(); + auto values = context.CreateStructsTaggedValue(TEST_ARRAY_NESTED_SIZE); + + UNIT_ASSERT(IsArrowCompatible(structType)); + + auto array = MakeArrowArray(values, structType); + UNIT_ASSERT_C(array->ValidateFull().ok(), array->ValidateFull().ToString()); + UNIT_ASSERT_VALUES_EQUAL(array->length(), values.size()); + + UNIT_ASSERT(array->type_id() == arrow::Type::STRUCT); + auto structArray = 
static_pointer_cast(array); + UNIT_ASSERT_VALUES_EQUAL(structArray->num_fields(), 3); + + UNIT_ASSERT(structArray->GetFieldByName("1") && structArray->GetFieldByName("1") == structArray->field(0)); + UNIT_ASSERT(structArray->GetFieldByName("2") && structArray->GetFieldByName("2") == structArray->field(1)); + UNIT_ASSERT(structArray->GetFieldByName("3") && structArray->GetFieldByName("3") == structArray->field(2)); + + UNIT_ASSERT(structArray->field(0)->type_id() == arrow::Type::INT32); + UNIT_ASSERT(structArray->field(1)->type_id() == arrow::Type::INT32); + UNIT_ASSERT(structArray->field(2)->type_id() == arrow::Type::UINT64); + + for (size_t i = 0; i < values.size(); ++i) { + auto arrowValue = ExtractUnboxedValue(array, i, structType, context.HolderFactory); + AssertUnboxedValuesAreEqual(arrowValue, values[i], structType); + } + } + /* Dict with data keys: expects LIST of STRUCT; each row's value slice must be a 2-field struct (DOUBLE, INT32) before the row is compared with its source value. */ + Y_UNIT_TEST(NestedType_Dict_DataKey) { + TTestContext context; + + auto dictType = context.GetDictType(); + auto values = context.CreateDicts(TEST_ARRAY_NESTED_SIZE); + + UNIT_ASSERT(IsArrowCompatible(dictType)); + + auto array = MakeArrowArray(values, dictType); + UNIT_ASSERT_C(array->ValidateFull().ok(), array->ValidateFull().ToString()); + UNIT_ASSERT_VALUES_EQUAL(array->length(), values.size()); + + UNIT_ASSERT(array->type_id() == arrow::Type::LIST); + auto listArray = static_pointer_cast(array); + UNIT_ASSERT_VALUES_EQUAL(listArray->num_fields(), 1); + UNIT_ASSERT(listArray->value_type()->id() == arrow::Type::STRUCT); + + for (size_t i = 0; i < values.size(); ++i) { + auto structArray = static_pointer_cast(listArray->value_slice(i)); + UNIT_ASSERT_VALUES_EQUAL(structArray->num_fields(), 2); + UNIT_ASSERT(structArray->field(0)->type_id() == arrow::Type::DOUBLE); + UNIT_ASSERT(structArray->field(1)->type_id() == arrow::Type::INT32); + + auto arrowValue = ExtractUnboxedValue(array, i, dictType, context.HolderFactory); + AssertUnboxedValuesAreEqual(arrowValue, values[i], dictType); + } + } + /* Dict with nested keys: expects LIST of STRUCT where field 0 of each entry struct is itself a 2-field struct (UINT8, INT8) and field 1 is INT32. */ + Y_UNIT_TEST(NestedType_Dict_NestedKey) { 
+ TTestContext context; + + auto dictType = context.GetDictNestedKeyType(); + auto values = context.CreateDictsNestedKey(TEST_ARRAY_NESTED_SIZE); + + UNIT_ASSERT(IsArrowCompatible(dictType)); + + auto array = MakeArrowArray(values, dictType); + UNIT_ASSERT_C(array->ValidateFull().ok(), array->ValidateFull().ToString()); + UNIT_ASSERT_VALUES_EQUAL(array->length(), values.size()); + + UNIT_ASSERT(array->type_id() == arrow::Type::LIST); + auto listArray = static_pointer_cast(array); + UNIT_ASSERT_VALUES_EQUAL(listArray->num_fields(), 1); + UNIT_ASSERT(listArray->value_type()->id() == arrow::Type::STRUCT); + + for (size_t i = 0; i < values.size(); ++i) { + auto structArray = static_pointer_cast(listArray->value_slice(i)); + UNIT_ASSERT_VALUES_EQUAL(structArray->num_fields(), 2); + UNIT_ASSERT(structArray->field(0)->type_id() == arrow::Type::STRUCT); + UNIT_ASSERT(structArray->field(1)->type_id() == arrow::Type::INT32); + + auto keyStructArray = static_pointer_cast(structArray->field(0)); + UNIT_ASSERT_VALUES_EQUAL(keyStructArray->num_fields(), 2); + + UNIT_ASSERT(keyStructArray->field(0)->type_id() == arrow::Type::UINT8); + UNIT_ASSERT(keyStructArray->field(1)->type_id() == arrow::Type::INT8); + + auto arrowValue = ExtractUnboxedValue(array, i, dictType, context.HolderFactory); + AssertUnboxedValuesAreEqual(arrowValue, values[i], dictType); + } + } + /* Dict with optional keys: expects LIST of STRUCT where field 0 of each entry struct is a 1-field struct holding INT32 and field 1 is INT32. */ + Y_UNIT_TEST(NestedType_Dict_OptionalKey) { + TTestContext context; + + auto dictType = context.GetDictOptionalKeyType(); + auto values = context.CreateDictsOptionalKey(TEST_ARRAY_NESTED_SIZE); + + UNIT_ASSERT(IsArrowCompatible(dictType)); + + auto array = MakeArrowArray(values, dictType); + UNIT_ASSERT_C(array->ValidateFull().ok(), array->ValidateFull().ToString()); + UNIT_ASSERT_VALUES_EQUAL(array->length(), values.size()); + + UNIT_ASSERT(array->type_id() == arrow::Type::LIST); + auto listArray = static_pointer_cast(array); + UNIT_ASSERT_VALUES_EQUAL(listArray->num_fields(), 1); + 
UNIT_ASSERT(listArray->value_type()->id() == arrow::Type::STRUCT); + + for (size_t i = 0; i < values.size(); ++i) { + auto structArray = static_pointer_cast(listArray->value_slice(i)); + UNIT_ASSERT_VALUES_EQUAL(structArray->num_fields(), 2); + UNIT_ASSERT(structArray->field(0)->type_id() == arrow::Type::STRUCT); + UNIT_ASSERT(structArray->field(1)->type_id() == arrow::Type::INT32); + + auto keyStructArray = static_pointer_cast(structArray->field(0)); + UNIT_ASSERT_VALUES_EQUAL(keyStructArray->num_fields(), 1); + UNIT_ASSERT(keyStructArray->field(0)->type_id() == arrow::Type::INT32); + + auto arrowValue = ExtractUnboxedValue(array, i, dictType, context.HolderFactory); + AssertUnboxedValuesAreEqual(arrowValue, values[i], dictType); + } + } + /* Dict with tagged keys: expects LIST of STRUCT; each row's value slice must be a 2-field struct (INT32, INT32). */ + Y_UNIT_TEST(NestedType_Dict_TaggedKey) { + TTestContext context; + + auto dictType = context.GetDictTaggedKeyType(); + auto values = context.CreateDictsTaggedKey(TEST_ARRAY_NESTED_SIZE); + + UNIT_ASSERT(IsArrowCompatible(dictType)); + + auto array = MakeArrowArray(values, dictType); + UNIT_ASSERT_C(array->ValidateFull().ok(), array->ValidateFull().ToString()); + UNIT_ASSERT_VALUES_EQUAL(array->length(), values.size()); + + UNIT_ASSERT(array->type_id() == arrow::Type::LIST); + auto listArray = static_pointer_cast(array); + UNIT_ASSERT_VALUES_EQUAL(listArray->num_fields(), 1); + UNIT_ASSERT(listArray->value_type()->id() == arrow::Type::STRUCT); + + for (size_t i = 0; i < values.size(); ++i) { + auto structArray = static_pointer_cast(listArray->value_slice(i)); + UNIT_ASSERT_VALUES_EQUAL(structArray->num_fields(), 2); + UNIT_ASSERT(structArray->field(0)->type_id() == arrow::Type::INT32); + UNIT_ASSERT(structArray->field(1)->type_id() == arrow::Type::INT32); + + auto arrowValue = ExtractUnboxedValue(array, i, dictType, context.HolderFactory); + AssertUnboxedValuesAreEqual(arrowValue, values[i], dictType); + } + } + /* Dict with optional-variant keys: expects LIST of STRUCT where field 0 of each entry struct is two nested 1-field structs around a DENSE_UNION with children (INT32, UINT8), and field 1 is INT32. */ + Y_UNIT_TEST(NestedType_Dict_OptionalVariantKey) { + TTestContext context; + + auto dictType = 
context.GetDictOptionalVariantKeyType(); + auto values = context.CreateDictsOptionalVariantKey(TEST_ARRAY_NESTED_SIZE); + + UNIT_ASSERT(IsArrowCompatible(dictType)); + + auto array = MakeArrowArray(values, dictType); + UNIT_ASSERT_C(array->ValidateFull().ok(), array->ValidateFull().ToString()); + UNIT_ASSERT_VALUES_EQUAL(array->length(), values.size()); + + UNIT_ASSERT(array->type_id() == arrow::Type::LIST); + auto listArray = static_pointer_cast(array); + UNIT_ASSERT_VALUES_EQUAL(listArray->num_fields(), 1); + UNIT_ASSERT(listArray->value_type()->id() == arrow::Type::STRUCT); + + for (size_t i = 0; i < values.size(); ++i) { + auto structArray = static_pointer_cast(listArray->value_slice(i)); + UNIT_ASSERT_VALUES_EQUAL(structArray->num_fields(), 2); + UNIT_ASSERT(structArray->field(0)->type_id() == arrow::Type::STRUCT); + UNIT_ASSERT(structArray->field(1)->type_id() == arrow::Type::INT32); + + auto keyStructArray = static_pointer_cast(structArray->field(0)); + UNIT_ASSERT_VALUES_EQUAL(keyStructArray->num_fields(), 1); + UNIT_ASSERT(keyStructArray->field(0)->type_id() == arrow::Type::STRUCT); + + auto keyInnerStructArray = static_pointer_cast(keyStructArray->field(0)); + UNIT_ASSERT_VALUES_EQUAL(keyInnerStructArray->num_fields(), 1); + UNIT_ASSERT(keyInnerStructArray->field(0)->type_id() == arrow::Type::DENSE_UNION); + + auto unionArray = static_pointer_cast(keyInnerStructArray->field(0)); + UNIT_ASSERT_VALUES_EQUAL(unionArray->num_fields(), 2); + UNIT_ASSERT(unionArray->field(0)->type_id() == arrow::Type::INT32); + UNIT_ASSERT(unionArray->field(1)->type_id() == arrow::Type::UINT8); + + auto arrowValue = ExtractUnboxedValue(array, i, dictType, context.HolderFactory); + AssertUnboxedValuesAreEqual(arrowValue, values[i], dictType); + } + } + /* Optional of a data value: the converted array is expected to be a flat INT32 array; every row must read back equal to its source value. */ + Y_UNIT_TEST(NestedType_Optional_DataValue) { + TTestContext context; + + auto optionalType = context.GetOptionalDataValueType(); + auto values = context.CreateOptionalsDataValue(TEST_ARRAY_NESTED_SIZE); + + 
UNIT_ASSERT(IsArrowCompatible(optionalType)); + + auto array = MakeArrowArray(values, optionalType); + UNIT_ASSERT_C(array->ValidateFull().ok(), array->ValidateFull().ToString()); + UNIT_ASSERT_VALUES_EQUAL(array->length(), values.size()); + UNIT_ASSERT(array->type_id() == arrow::Type::INT32); + + for (size_t i = 0; i < values.size(); ++i) { + auto arrowValue = ExtractUnboxedValue(array, i, optionalType, context.HolderFactory); + AssertUnboxedValuesAreEqual(arrowValue, values[i], optionalType); + } + } + /* Optional of a singular value: expects a 1-field STRUCT whose only child has Arrow type NA. */ + Y_UNIT_TEST(NestedType_Optional_SingularValue) { + TTestContext context; + + auto optionalType = context.GetOptionalSingularValueType(); + auto values = context.CreateOptionalsSingularValueType(TEST_ARRAY_NESTED_SIZE); + + UNIT_ASSERT(IsArrowCompatible(optionalType)); + + auto array = MakeArrowArray(values, optionalType); + UNIT_ASSERT_C(array->ValidateFull().ok(), array->ValidateFull().ToString()); + UNIT_ASSERT_VALUES_EQUAL(array->length(), values.size()); + + UNIT_ASSERT(array->type_id() == arrow::Type::STRUCT); + auto structArray = static_pointer_cast(array); + UNIT_ASSERT_VALUES_EQUAL(structArray->num_fields(), 1); + UNIT_ASSERT(structArray->field(0)->type_id() == arrow::Type::NA); + + for (size_t i = 0; i < values.size(); ++i) { + auto arrowValue = ExtractUnboxedValue(array, i, optionalType, context.HolderFactory); + AssertUnboxedValuesAreEqual(arrowValue, values[i], optionalType); + } + } + /* Optional of a struct value: expects a 5-field STRUCT with members ABC, DEF, GHI, JKL, MNO at positions 0-4, typed BINARY, INT32, UINT64, INT64, STRING. */ + Y_UNIT_TEST(NestedType_Optional_StructValue) { + TTestContext context; + + auto optionalType = context.GetOptionalStructValueType(); + auto values = context.CreateOptionalsStructValue(TEST_ARRAY_NESTED_SIZE); + + UNIT_ASSERT(IsArrowCompatible(optionalType)); + + auto array = MakeArrowArray(values, optionalType); + UNIT_ASSERT_C(array->ValidateFull().ok(), array->ValidateFull().ToString()); + UNIT_ASSERT_VALUES_EQUAL(array->length(), values.size()); + + UNIT_ASSERT(array->type_id() == arrow::Type::STRUCT); + auto structArray = static_pointer_cast(array); + 
UNIT_ASSERT_VALUES_EQUAL(structArray->num_fields(), 5); + + UNIT_ASSERT(structArray->GetFieldByName("ABC") && structArray->GetFieldByName("ABC") == structArray->field(0)); + UNIT_ASSERT(structArray->GetFieldByName("DEF") && structArray->GetFieldByName("DEF") == structArray->field(1)); + UNIT_ASSERT(structArray->GetFieldByName("GHI") && structArray->GetFieldByName("GHI") == structArray->field(2)); + UNIT_ASSERT(structArray->GetFieldByName("JKL") && structArray->GetFieldByName("JKL") == structArray->field(3)); + UNIT_ASSERT(structArray->GetFieldByName("MNO") && structArray->GetFieldByName("MNO") == structArray->field(4)); + + UNIT_ASSERT(structArray->field(0)->type_id() == arrow::Type::BINARY); + UNIT_ASSERT(structArray->field(1)->type_id() == arrow::Type::INT32); + UNIT_ASSERT(structArray->field(2)->type_id() == arrow::Type::UINT64); + UNIT_ASSERT(structArray->field(3)->type_id() == arrow::Type::INT64); + UNIT_ASSERT(structArray->field(4)->type_id() == arrow::Type::STRING); + + for (size_t i = 0; i < values.size(); ++i) { + auto arrowValue = ExtractUnboxedValue(array, i, optionalType, context.HolderFactory); + AssertUnboxedValuesAreEqual(arrowValue, values[i], optionalType); + } + } + /* Optional of a tuple value: expects a 3-field STRUCT (UINT8, INT8, UINT8); every row must read back equal to its source value. */ + Y_UNIT_TEST(NestedType_Optional_TupleValue) { + TTestContext context; + + auto optionalType = context.GetOptionalTupleValueType(); + auto values = context.CreateOptionalsTupleValue(TEST_ARRAY_NESTED_SIZE); + + UNIT_ASSERT(IsArrowCompatible(optionalType)); + + auto array = MakeArrowArray(values, optionalType); + UNIT_ASSERT_C(array->ValidateFull().ok(), array->ValidateFull().ToString()); + UNIT_ASSERT_VALUES_EQUAL(array->length(), values.size()); + + UNIT_ASSERT(array->type_id() == arrow::Type::STRUCT); + auto structArray = static_pointer_cast(array); + UNIT_ASSERT_VALUES_EQUAL(structArray->num_fields(), 3); + + UNIT_ASSERT(structArray->field(0)->type_id() == arrow::Type::UINT8); + UNIT_ASSERT(structArray->field(1)->type_id() == arrow::Type::INT8); + 
UNIT_ASSERT(structArray->field(2)->type_id() == arrow::Type::UINT8); + + for (size_t i = 0; i < values.size(); ++i) { + auto arrowValue = ExtractUnboxedValue(array, i, optionalType, context.HolderFactory); + AssertUnboxedValuesAreEqual(arrowValue, values[i], optionalType); + } + } + /* Optional of a list value: expects a LIST whose value type is INT32; every row must read back equal to its source value. */ + Y_UNIT_TEST(NestedType_Optional_ListValue) { + TTestContext context; + + auto optionalType = context.GetOptionalListValueType(); + auto values = context.CreateOptionalsValueList(TEST_ARRAY_NESTED_SIZE); + + UNIT_ASSERT(IsArrowCompatible(optionalType)); + + auto array = MakeArrowArray(values, optionalType); + UNIT_ASSERT_C(array->ValidateFull().ok(), array->ValidateFull().ToString()); + UNIT_ASSERT_VALUES_EQUAL(array->length(), values.size()); + + UNIT_ASSERT(array->type_id() == arrow::Type::LIST); + auto listArray = static_pointer_cast(array); + UNIT_ASSERT(listArray->value_type()->id() == arrow::Type::INT32); + + for (size_t i = 0; i < values.size(); ++i) { + auto arrowValue = ExtractUnboxedValue(array, i, optionalType, context.HolderFactory); + AssertUnboxedValuesAreEqual(arrowValue, values[i], optionalType); + } + } + /* Optional of a dict value: expects LIST of STRUCT with a 2-field entry struct (DOUBLE, INT32). NOTE(review): only value_slice(0) is inspected for the entry layout, unlike the per-row checks in the Dict_* cases. */ + Y_UNIT_TEST(NestedType_Optional_DictValue) { + TTestContext context; + + auto optionalType = context.GetOptionalDictValueType(); + auto values = context.CreateOptionalsDictValue(TEST_ARRAY_NESTED_SIZE); + + UNIT_ASSERT(IsArrowCompatible(optionalType)); + + auto array = MakeArrowArray(values, optionalType); + UNIT_ASSERT_C(array->ValidateFull().ok(), array->ValidateFull().ToString()); + UNIT_ASSERT_VALUES_EQUAL(array->length(), values.size()); + + UNIT_ASSERT(array->type_id() == arrow::Type::LIST); + auto listArray = static_pointer_cast(array); + UNIT_ASSERT(listArray->value_type()->id() == arrow::Type::STRUCT); + + auto structArray = static_pointer_cast(listArray->value_slice(0)); + UNIT_ASSERT_VALUES_EQUAL(structArray->num_fields(), 2); + UNIT_ASSERT(structArray->field(0)->type_id() == arrow::Type::DOUBLE); + UNIT_ASSERT(structArray->field(1)->type_id() == 
arrow::Type::INT32); + + for (size_t i = 0; i < values.size(); ++i) { + auto arrowValue = ExtractUnboxedValue(array, i, optionalType, context.HolderFactory); + AssertUnboxedValuesAreEqual(arrowValue, values[i], optionalType); + } + } + + Y_UNIT_TEST(NestedType_Optional_VariantValue) { + TTestContext context; + + auto variantType = context.GetOptionalVariantValueType(); + auto values = context.CreateOptionalsVariantValue(TEST_ARRAY_NESTED_SIZE); + + UNIT_ASSERT(IsArrowCompatible(variantType)); + + auto array = MakeArrowArray(values, variantType); + UNIT_ASSERT_C(array->ValidateFull().ok(), array->ValidateFull().ToString()); + UNIT_ASSERT_VALUES_EQUAL(array->length(), values.size()); + UNIT_ASSERT(array->type_id() == arrow::Type::STRUCT); + + auto structArray = static_pointer_cast(array); + UNIT_ASSERT_VALUES_EQUAL(structArray->num_fields(), 1); + UNIT_ASSERT(structArray->field(0)->type_id() == arrow::Type::DENSE_UNION); + + auto unionArray = static_pointer_cast(structArray->field(0)); + UNIT_ASSERT_VALUES_EQUAL(unionArray->num_fields(), 4); + UNIT_ASSERT(unionArray->field(0)->type_id() == arrow::Type::INT32); + UNIT_ASSERT(unionArray->field(1)->type_id() == arrow::Type::BINARY); + UNIT_ASSERT(unionArray->field(2)->type_id() == arrow::Type::FLOAT); + UNIT_ASSERT(unionArray->field(3)->type_id() == arrow::Type::UINT8); + + for (size_t i = 0; i < values.size(); ++i) { + auto arrowValue = ExtractUnboxedValue(array, i, variantType, context.HolderFactory); + AssertUnboxedValuesAreEqual(arrowValue, values[i], variantType); + }; + } + + Y_UNIT_TEST(NestedType_Optional_TaggedValue) { + TTestContext context; + + auto optionalType = context.GetOptionalTaggedValueType(); + auto values = context.CreateOptionalsValueTagged(TEST_ARRAY_NESTED_SIZE); + + UNIT_ASSERT(IsArrowCompatible(optionalType)); + + auto array = MakeArrowArray(values, optionalType); + UNIT_ASSERT_C(array->ValidateFull().ok(), array->ValidateFull().ToString()); + UNIT_ASSERT_VALUES_EQUAL(array->length(), 
values.size()); + UNIT_ASSERT(array->type_id() == arrow::Type::INT32); + + for (size_t i = 0; i < values.size(); ++i) { + auto arrowValue = ExtractUnboxedValue(array, i, optionalType, context.HolderFactory); + AssertUnboxedValuesAreEqual(arrowValue, values[i], optionalType); + } + } + + Y_UNIT_TEST(NestedType_Optional_OptionalValue) { + TTestContext context; + + auto optionalType = context.GetOptionalOptionalValueType(); + auto values = context.CreateOptionalsOptionalValue(TEST_ARRAY_NESTED_SIZE); + + UNIT_ASSERT(IsArrowCompatible(optionalType)); + + auto array = MakeArrowArray(values, optionalType); + UNIT_ASSERT_C(array->ValidateFull().ok(), array->ValidateFull().ToString()); + UNIT_ASSERT_VALUES_EQUAL(array->length(), values.size()); + + UNIT_ASSERT(array->type_id() == arrow::Type::STRUCT); + auto structArray = static_pointer_cast(array); + UNIT_ASSERT_VALUES_EQUAL(structArray->num_fields(), 1); + UNIT_ASSERT(structArray->field(0)->type_id() == arrow::Type::INT32); + + for (size_t i = 0; i < values.size(); ++i) { + auto arrowValue = ExtractUnboxedValue(array, i, optionalType, context.HolderFactory); + AssertUnboxedValuesAreEqual(arrowValue, values[i], optionalType); + } + } + + Y_UNIT_TEST(NestedType_Optional_OptionalVariantValue) { + TTestContext context; + + auto optionalType = context.GetOptionalOptionalVariantType(); + auto values = context.CreateOptionalsOptionalVariantValue(TEST_ARRAY_NESTED_SIZE); + + UNIT_ASSERT(IsArrowCompatible(optionalType)); + + auto array = MakeArrowArray(values, optionalType); + UNIT_ASSERT_C(array->ValidateFull().ok(), array->ValidateFull().ToString()); + UNIT_ASSERT_VALUES_EQUAL(array->length(), values.size()); + + UNIT_ASSERT(array->type_id() == arrow::Type::STRUCT); + auto structArray = static_pointer_cast(array); + UNIT_ASSERT_VALUES_EQUAL(structArray->num_fields(), 1); + UNIT_ASSERT(structArray->field(0)->type_id() == arrow::Type::STRUCT); + + auto innerStructArray = static_pointer_cast(structArray->field(0)); + 
UNIT_ASSERT_VALUES_EQUAL(innerStructArray->num_fields(), 1); + UNIT_ASSERT(innerStructArray->field(0)->type_id() == arrow::Type::DENSE_UNION); + + auto innerUnionArray = static_pointer_cast(innerStructArray->field(0)); + UNIT_ASSERT_VALUES_EQUAL(innerUnionArray->num_fields(), 4); + UNIT_ASSERT(innerUnionArray->field(0)->type_id() == arrow::Type::INT32); + UNIT_ASSERT(innerUnionArray->field(1)->type_id() == arrow::Type::BINARY); + UNIT_ASSERT(innerUnionArray->field(2)->type_id() == arrow::Type::FLOAT); + UNIT_ASSERT(innerUnionArray->field(3)->type_id() == arrow::Type::UINT8); + + for (size_t i = 0; i < values.size(); ++i) { + auto arrowValue = ExtractUnboxedValue(array, i, optionalType, context.HolderFactory); + AssertUnboxedValuesAreEqual(arrowValue, values[i], optionalType); + } + } + + Y_UNIT_TEST(NestedType_Variant_Struct) { + TTestContext context; + + auto variantType = context.GetVariantOverStructType(); + auto values = context.CreateVariantsOverStruct(TEST_ARRAY_NESTED_SIZE); + + UNIT_ASSERT(IsArrowCompatible(variantType)); + + auto array = MakeArrowArray(values, variantType); + UNIT_ASSERT_C(array->ValidateFull().ok(), array->ValidateFull().ToString()); + UNIT_ASSERT_VALUES_EQUAL(array->length(), values.size()); + UNIT_ASSERT(array->type_id() == arrow::Type::DENSE_UNION); + + auto unionArray = static_pointer_cast(array); + UNIT_ASSERT_VALUES_EQUAL(unionArray->num_fields(), 4); + UNIT_ASSERT(unionArray->field(0)->type_id() == arrow::Type::INT32); + UNIT_ASSERT(unionArray->field(1)->type_id() == arrow::Type::BINARY); + UNIT_ASSERT(unionArray->field(2)->type_id() == arrow::Type::FLOAT); + UNIT_ASSERT(unionArray->field(3)->type_id() == arrow::Type::UINT8); + + for (size_t i = 0; i < values.size(); ++i) { + auto arrowValue = ExtractUnboxedValue(array, i, variantType, context.HolderFactory); + AssertUnboxedValuesAreEqual(arrowValue, values[i], variantType); + }; + } + + Y_UNIT_TEST(NestedType_Variant_Tuple) { + TTestContext context; + + auto variantType = 
context.GetVariantOverTupleType(); + auto values = context.CreateVariantsOverTuple(TEST_ARRAY_NESTED_SIZE); + + UNIT_ASSERT(IsArrowCompatible(variantType)); + + auto array = MakeArrowArray(values, variantType); + UNIT_ASSERT_C(array->ValidateFull().ok(), array->ValidateFull().ToString()); + UNIT_ASSERT_VALUES_EQUAL(array->length(), values.size()); + UNIT_ASSERT(array->type_id() == arrow::Type::DENSE_UNION); + + auto unionArray = static_pointer_cast(array); + UNIT_ASSERT_VALUES_EQUAL(unionArray->num_fields(), 4); + UNIT_ASSERT(unionArray->field(0)->type_id() == arrow::Type::UINT8); + UNIT_ASSERT(unionArray->field(1)->type_id() == arrow::Type::INT16); + UNIT_ASSERT(unionArray->field(2)->type_id() == arrow::Type::UINT16); + UNIT_ASSERT(unionArray->field(3)->type_id() == arrow::Type::INT32); + + for (size_t i = 0; i < values.size(); ++i) { + auto arrowValue = ExtractUnboxedValue(array, i, variantType, context.HolderFactory); + AssertUnboxedValuesAreEqual(arrowValue, values[i], variantType); + }; + } + + Y_UNIT_TEST(NestedType_Variant_Nested) { + TTestContext context; + + auto variantType = context.GetVariantNestedType(); + auto values = context.CreateVariantsNested(TEST_ARRAY_NESTED_SIZE * 3); + + UNIT_ASSERT(IsArrowCompatible(variantType)); + + auto array = MakeArrowArray(values, variantType); + UNIT_ASSERT_C(array->ValidateFull().ok(), array->ValidateFull().ToString()); + UNIT_ASSERT_VALUES_EQUAL(array->length(), values.size()); + UNIT_ASSERT(array->type_id() == arrow::Type::DENSE_UNION); + + auto unionArray = static_pointer_cast(array); + UNIT_ASSERT_VALUES_EQUAL(unionArray->num_fields(), ((VARIANT_NESTED_SIZE - 1) / MAX_VARIANT_FLATTEN_SIZE) + 1); + + for (ui32 i = 0; i < static_cast(unionArray->num_fields()); ++i) { + UNIT_ASSERT(unionArray->field(i)->type_id() == arrow::Type::DENSE_UNION); + auto innerUnionArray = static_pointer_cast(unionArray->field(i)); + + auto remainingSize = static_cast(variantType)->GetAlternativesCount() - i * MAX_VARIANT_FLATTEN_SIZE; + 
UNIT_ASSERT_VALUES_EQUAL(innerUnionArray->num_fields(), std::min(MAX_VARIANT_FLATTEN_SIZE, remainingSize)); + + for (ui32 j = 0; j < static_cast(innerUnionArray->num_fields()); ++j) { + auto idx = j + i * MAX_VARIANT_FLATTEN_SIZE; + if (idx % 3 == 0) { + UNIT_ASSERT(innerUnionArray->field(j)->type_id() == arrow::Type::INT32); + } else if (idx % 3 == 1) { + UNIT_ASSERT(innerUnionArray->field(j)->type_id() == arrow::Type::INT64); + } else { + UNIT_ASSERT(innerUnionArray->field(j)->type_id() == arrow::Type::UINT32); + } + } + } + + for (size_t i = 0; i < values.size(); ++i) { + auto arrowValue = ExtractUnboxedValue(array, i, variantType, context.HolderFactory); + AssertUnboxedValuesAreEqual(arrowValue, values[i], variantType); + }; + } + + Y_UNIT_TEST(NestedType_Variant_OverLimit) { + TTestContext context; + + auto variantType = context.GetVariantOverLimitType(); + auto values = context.CreateVariantsOverLimit(TEST_ARRAY_NESTED_SIZE * 3); + + UNIT_ASSERT(!IsArrowCompatible(variantType)); + + try { + Y_UNUSED(MakeArrowArray(values, variantType)); + UNIT_FAIL("Expected exception"); + } catch (...) 
{} + } + + Y_UNIT_TEST(NestedType_Variant_Comprehensive) { + TTestContext context; + + auto variantType = context.GetVariantComprehensiveType(); + auto values = context.CreateVariantsComprehensive(TEST_ARRAY_NESTED_SIZE); + + UNIT_ASSERT(IsArrowCompatible(variantType)); + + auto array = MakeArrowArray(values, variantType); + UNIT_ASSERT_C(array->ValidateFull().ok(), array->ValidateFull().ToString()); + UNIT_ASSERT_VALUES_EQUAL(array->length(), values.size()); + UNIT_ASSERT(array->type_id() == arrow::Type::DENSE_UNION); + + auto unionArray = static_pointer_cast(array); + UNIT_ASSERT_VALUES_EQUAL(unionArray->num_fields(), 10); + + // Field 0: Data (i32) + UNIT_ASSERT(unionArray->field(0)->type_id() == arrow::Type::INT32); + + // Field 1: Optional + UNIT_ASSERT(unionArray->field(1)->type_id() == arrow::Type::INT32); + + // Field 2: Optional> + UNIT_ASSERT(unionArray->field(2)->type_id() == arrow::Type::STRUCT); + auto optOptStructArray = static_pointer_cast(unionArray->field(2)); + UNIT_ASSERT_VALUES_EQUAL(optOptStructArray->num_fields(), 1); + UNIT_ASSERT(optOptStructArray->field(0)->type_id() == arrow::Type::INT32); + + // Field 3: Void + UNIT_ASSERT(unionArray->field(3)->type_id() == arrow::Type::STRUCT); + auto voidStructArray = static_pointer_cast(unionArray->field(3)); + UNIT_ASSERT_VALUES_EQUAL(voidStructArray->num_fields(), 0); + + // Field 4: Struct + UNIT_ASSERT(unionArray->field(4)->type_id() == arrow::Type::STRUCT); + auto structArray = static_pointer_cast(unionArray->field(4)); + UNIT_ASSERT_VALUES_EQUAL(structArray->num_fields(), 5); + UNIT_ASSERT(structArray->field(0)->type_id() == arrow::Type::BINARY); + UNIT_ASSERT(structArray->field(1)->type_id() == arrow::Type::INT32); + UNIT_ASSERT(structArray->field(2)->type_id() == arrow::Type::UINT64); + UNIT_ASSERT(structArray->field(3)->type_id() == arrow::Type::INT64); + UNIT_ASSERT(structArray->field(4)->type_id() == arrow::Type::STRING); + + // Field 5: Tuple + UNIT_ASSERT(unionArray->field(5)->type_id() == 
arrow::Type::STRUCT); + auto tupleArray = static_pointer_cast(unionArray->field(5)); + UNIT_ASSERT_VALUES_EQUAL(tupleArray->num_fields(), 3); + UNIT_ASSERT(tupleArray->field(0)->type_id() == arrow::Type::UINT8); + UNIT_ASSERT(tupleArray->field(1)->type_id() == arrow::Type::INT8); + UNIT_ASSERT(tupleArray->field(2)->type_id() == arrow::Type::UINT8); + + // Field 6: List + UNIT_ASSERT(unionArray->field(6)->type_id() == arrow::Type::LIST); + auto listArray = static_pointer_cast(unionArray->field(6)); + UNIT_ASSERT(listArray->value_type()->id() == arrow::Type::INT32); + + // Field 7: Dict + UNIT_ASSERT(unionArray->field(7)->type_id() == arrow::Type::LIST); + auto dictListArray = static_pointer_cast(unionArray->field(7)); + UNIT_ASSERT(dictListArray->value_type()->id() == arrow::Type::STRUCT); + + // Field 8: Variant + UNIT_ASSERT(unionArray->field(8)->type_id() == arrow::Type::DENSE_UNION); + auto nestedUnionArray = static_pointer_cast(unionArray->field(8)); + UNIT_ASSERT_VALUES_EQUAL(nestedUnionArray->num_fields(), 4); + UNIT_ASSERT(nestedUnionArray->field(0)->type_id() == arrow::Type::UINT8); + UNIT_ASSERT(nestedUnionArray->field(1)->type_id() == arrow::Type::INT16); + UNIT_ASSERT(nestedUnionArray->field(2)->type_id() == arrow::Type::UINT16); + UNIT_ASSERT(nestedUnionArray->field(3)->type_id() == arrow::Type::INT32); + + // Field 9: Tagged + UNIT_ASSERT(unionArray->field(9)->type_id() == arrow::Type::INT32); + + for (size_t i = 0; i < values.size(); ++i) { + auto arrowValue = ExtractUnboxedValue(array, i, variantType, context.HolderFactory); + AssertUnboxedValuesAreEqual(arrowValue, values[i], variantType); + }; + } + + Y_UNIT_TEST(NestedType_Tagged_DataValue) { + TTestContext context; + + auto taggedType = context.GetTaggedType(); + auto values = context.CreateTaggeds(TEST_ARRAY_NESTED_SIZE); + + UNIT_ASSERT(IsArrowCompatible(taggedType)); + + auto array = MakeArrowArray(values, taggedType); + UNIT_ASSERT_C(array->ValidateFull().ok(), 
array->ValidateFull().ToString()); + UNIT_ASSERT_VALUES_EQUAL(array->length(), values.size()); + UNIT_ASSERT(array->type_id() == arrow::Type::INT32); + + for (size_t i = 0; i < values.size(); ++i) { + auto arrowValue = ExtractUnboxedValue(array, i, taggedType, context.HolderFactory); + AssertUnboxedValuesAreEqual(arrowValue, values[i], taggedType); + } + } + + Y_UNIT_TEST(NestedType_Tagged_StructValue) { + TTestContext context; + + auto taggedType = context.GetTaggedStructValueType(); + auto values = context.CreateTaggedsStructValue(TEST_ARRAY_NESTED_SIZE); + + UNIT_ASSERT(IsArrowCompatible(taggedType)); + + auto array = MakeArrowArray(values, taggedType); + UNIT_ASSERT_C(array->ValidateFull().ok(), array->ValidateFull().ToString()); + UNIT_ASSERT_VALUES_EQUAL(array->length(), values.size()); + UNIT_ASSERT(array->type_id() == arrow::Type::STRUCT); + + auto structArray = static_pointer_cast(array); + UNIT_ASSERT_VALUES_EQUAL(structArray->num_fields(), 5); + UNIT_ASSERT(structArray->field(0)->type_id() == arrow::Type::BINARY); + UNIT_ASSERT(structArray->field(1)->type_id() == arrow::Type::INT32); + UNIT_ASSERT(structArray->field(2)->type_id() == arrow::Type::UINT64); + UNIT_ASSERT(structArray->field(3)->type_id() == arrow::Type::INT64); + UNIT_ASSERT(structArray->field(4)->type_id() == arrow::Type::STRING); + + for (size_t i = 0; i < values.size(); ++i) { + auto arrowValue = ExtractUnboxedValue(array, i, taggedType, context.HolderFactory); + AssertUnboxedValuesAreEqual(arrowValue, values[i], taggedType); + } + } + + Y_UNIT_TEST(NestedType_Tagged_TupleValue) { + TTestContext context; + + auto taggedType = context.GetTaggedTupleValueype(); + auto values = context.CreateTaggedsTupleValue(TEST_ARRAY_NESTED_SIZE); + + UNIT_ASSERT(IsArrowCompatible(taggedType)); + + auto array = MakeArrowArray(values, taggedType); + UNIT_ASSERT_C(array->ValidateFull().ok(), array->ValidateFull().ToString()); + UNIT_ASSERT_VALUES_EQUAL(array->length(), values.size()); + 
UNIT_ASSERT(array->type_id() == arrow::Type::STRUCT); + + auto structArray = static_pointer_cast(array); + UNIT_ASSERT_VALUES_EQUAL(structArray->num_fields(), 3); + UNIT_ASSERT(structArray->field(0)->type_id() == arrow::Type::UINT8); + UNIT_ASSERT(structArray->field(1)->type_id() == arrow::Type::INT8); + UNIT_ASSERT(structArray->field(2)->type_id() == arrow::Type::UINT8); + + for (size_t i = 0; i < values.size(); ++i) { + auto arrowValue = ExtractUnboxedValue(array, i, taggedType, context.HolderFactory); + AssertUnboxedValuesAreEqual(arrowValue, values[i], taggedType); + } + } + + Y_UNIT_TEST(NestedType_Tagged_ListValue) { + TTestContext context; + + auto taggedType = context.GetTaggedListValueType(); + auto values = context.CreateTaggedsValueList(TEST_ARRAY_NESTED_SIZE); + + UNIT_ASSERT(IsArrowCompatible(taggedType)); + + auto array = MakeArrowArray(values, taggedType); + UNIT_ASSERT_C(array->ValidateFull().ok(), array->ValidateFull().ToString()); + UNIT_ASSERT_VALUES_EQUAL(array->length(), values.size()); + UNIT_ASSERT(array->type_id() == arrow::Type::LIST); + + auto listArray = static_pointer_cast(array); + UNIT_ASSERT(listArray->value_type()->id() == arrow::Type::INT32); + + for (size_t i = 0; i < values.size(); ++i) { + auto arrowValue = ExtractUnboxedValue(array, i, taggedType, context.HolderFactory); + AssertUnboxedValuesAreEqual(arrowValue, values[i], taggedType); + } + } + + Y_UNIT_TEST(NestedType_Tagged_DictValue) { + TTestContext context; + + auto taggedType = context.GetTaggedDictValueType(); + auto values = context.CreateTaggedsDictValue(TEST_ARRAY_NESTED_SIZE); + + UNIT_ASSERT(IsArrowCompatible(taggedType)); + + auto array = MakeArrowArray(values, taggedType); + UNIT_ASSERT_C(array->ValidateFull().ok(), array->ValidateFull().ToString()); + UNIT_ASSERT_VALUES_EQUAL(array->length(), values.size()); + UNIT_ASSERT(array->type_id() == arrow::Type::LIST); + + auto listArray = static_pointer_cast(array); + UNIT_ASSERT(listArray->value_type()->id() == 
arrow::Type::STRUCT); + + for (size_t i = 0; i < values.size(); ++i) { + auto arrowValue = ExtractUnboxedValue(array, i, taggedType, context.HolderFactory); + AssertUnboxedValuesAreEqual(arrowValue, values[i], taggedType); + } + } + + Y_UNIT_TEST(NestedType_Tagged_OptionalValue) { + TTestContext context; + + auto taggedType = context.GetTaggedOptionalValueType(); + auto values = context.CreateTaggedsOptionalValue(TEST_ARRAY_NESTED_SIZE); + + UNIT_ASSERT(IsArrowCompatible(taggedType)); + + auto array = MakeArrowArray(values, taggedType); + UNIT_ASSERT_C(array->ValidateFull().ok(), array->ValidateFull().ToString()); + UNIT_ASSERT_VALUES_EQUAL(array->length(), values.size()); + UNIT_ASSERT(array->type_id() == arrow::Type::INT32); + + for (size_t i = 0; i < values.size(); ++i) { + auto arrowValue = ExtractUnboxedValue(array, i, taggedType, context.HolderFactory); + AssertUnboxedValuesAreEqual(arrowValue, values[i], taggedType); + } + } + + Y_UNIT_TEST(NestedType_Tagged_OptionalOptionalValue) { + TTestContext context; + + auto taggedType = context.GetTaggedOptionalOptionalValueType(); + auto values = context.CreateTaggedsOptionalOptionalValue(TEST_ARRAY_NESTED_SIZE); + + UNIT_ASSERT(IsArrowCompatible(taggedType)); + + auto array = MakeArrowArray(values, taggedType); + UNIT_ASSERT_C(array->ValidateFull().ok(), array->ValidateFull().ToString()); + UNIT_ASSERT_VALUES_EQUAL(array->length(), values.size()); + UNIT_ASSERT(array->type_id() == arrow::Type::STRUCT); + + auto structArray = static_pointer_cast(array); + UNIT_ASSERT_VALUES_EQUAL(structArray->num_fields(), 1); + UNIT_ASSERT(structArray->field(0)->type_id() == arrow::Type::INT32); + + for (size_t i = 0; i < values.size(); ++i) { + auto arrowValue = ExtractUnboxedValue(array, i, taggedType, context.HolderFactory); + AssertUnboxedValuesAreEqual(arrowValue, values[i], taggedType); + } + } + + Y_UNIT_TEST(NestedType_Tagged_TaggedValue) { + TTestContext context; + + auto taggedType = context.GetTaggedTaggedValueType(); + 
auto values = context.CreateTaggedsTaggedValue(TEST_ARRAY_NESTED_SIZE); + + UNIT_ASSERT(IsArrowCompatible(taggedType)); + + auto array = MakeArrowArray(values, taggedType); + UNIT_ASSERT_C(array->ValidateFull().ok(), array->ValidateFull().ToString()); + UNIT_ASSERT_VALUES_EQUAL(array->length(), values.size()); + UNIT_ASSERT(array->type_id() == arrow::Type::INT32); + + for (size_t i = 0; i < values.size(); ++i) { + auto arrowValue = ExtractUnboxedValue(array, i, taggedType, context.HolderFactory); + AssertUnboxedValuesAreEqual(arrowValue, values[i], taggedType); + } + } + + Y_UNIT_TEST(NestedType_Tagged_OptionalTaggedValue) { + TTestContext context; + + auto taggedType = context.GetTaggedOptionalTaggedValueType(); + auto values = context.CreateTaggedsOptionalTaggedValue(TEST_ARRAY_NESTED_SIZE); + + UNIT_ASSERT(IsArrowCompatible(taggedType)); + + auto array = MakeArrowArray(values, taggedType); + UNIT_ASSERT_C(array->ValidateFull().ok(), array->ValidateFull().ToString()); + UNIT_ASSERT_VALUES_EQUAL(array->length(), values.size()); + UNIT_ASSERT(array->type_id() == arrow::Type::INT32); + + for (size_t i = 0; i < values.size(); ++i) { + auto arrowValue = ExtractUnboxedValue(array, i, taggedType, context.HolderFactory); + AssertUnboxedValuesAreEqual(arrowValue, values[i], taggedType); + } + } + + Y_UNIT_TEST(NestedType_Tagged_TaggedOptionalTaggedTaggedValue) { + TTestContext context; + + auto taggedType = context.GetTaggedTaggedOptionalTaggedTaggedValueType(); + auto values = context.CreateTaggedsTaggedOptionalTaggedTaggedValue(TEST_ARRAY_NESTED_SIZE); + + UNIT_ASSERT(IsArrowCompatible(taggedType)); + + auto array = MakeArrowArray(values, taggedType); + UNIT_ASSERT_C(array->ValidateFull().ok(), array->ValidateFull().ToString()); + UNIT_ASSERT_VALUES_EQUAL(array->length(), values.size()); + UNIT_ASSERT(array->type_id() == arrow::Type::INT32); + + for (size_t i = 0; i < values.size(); ++i) { + auto arrowValue = ExtractUnboxedValue(array, i, taggedType, 
context.HolderFactory); + AssertUnboxedValuesAreEqual(arrowValue, values[i], taggedType); + } + } +} + +} // namespace NKikimr::NKqp::NFormats diff --git a/ydb/core/kqp/common/result_set_format/ut/kqp_formats_ut_helpers.cpp b/ydb/core/kqp/common/result_set_format/ut/kqp_formats_ut_helpers.cpp new file mode 100644 index 000000000000..cec436ecb94c --- /dev/null +++ b/ydb/core/kqp/common/result_set_format/ut/kqp_formats_ut_helpers.cpp @@ -0,0 +1,435 @@ +#include "kqp_formats_ut_helpers.h" + +#include + +#include +#include +#include +#include +#include + +namespace NKikimr::NKqp::NFormats { + +namespace { + +template +NUdf::TUnboxedValue ExtractDataValue(std::shared_ptr column, ui32 row, NUdf::EDataSlot dataSlot) { + Y_UNUSED(dataSlot); + using TArrayType = typename arrow::TypeTraits::ArrayType; + auto array = std::static_pointer_cast(column); + return NUdf::TUnboxedValuePod(static_cast(array->Value(row))); +} + +template <> // For darwin build +NUdf::TUnboxedValue ExtractDataValue(std::shared_ptr column, ui32 row, NUdf::EDataSlot dataSlot) { + Y_UNUSED(dataSlot); + auto array = std::static_pointer_cast(column); + return NUdf::TUnboxedValuePod(static_cast(array->Value(row))); +} + +template <> // For darwin build +NUdf::TUnboxedValue ExtractDataValue(std::shared_ptr column, ui32 row, NUdf::EDataSlot dataSlot) { + Y_UNUSED(dataSlot); + auto array = std::static_pointer_cast(column); + return NUdf::TUnboxedValuePod(static_cast(array->Value(row))); +} + +template <> +NUdf::TUnboxedValue ExtractDataValue(std::shared_ptr column, ui32 row, NUdf::EDataSlot dataSlot) { + auto array = std::static_pointer_cast(column); + YQL_ENSURE(array->num_fields() == 2, "StructArray of some TzDate type should have 2 fields"); + + auto datetimeArray = array->field(0); + auto timezoneArray = std::static_pointer_cast(array->field(1)); + + NUdf::TUnboxedValuePod value; + auto typeId = datetimeArray->type_id(); + + switch (dataSlot) { + case NUdf::EDataSlot::TzDate: { + YQL_ENSURE(typeId == 
arrow::Type::UINT16); + value = NUdf::TUnboxedValuePod(static_cast( + std::static_pointer_cast(datetimeArray)->Value(row))); + break; + } + + case NUdf::EDataSlot::TzDatetime: { + YQL_ENSURE(typeId == arrow::Type::UINT32); + value = NUdf::TUnboxedValuePod(static_cast( + std::static_pointer_cast(datetimeArray)->Value(row))); + break; + } + + case NUdf::EDataSlot::TzTimestamp: { + YQL_ENSURE(typeId == arrow::Type::UINT64); + value = NUdf::TUnboxedValuePod(static_cast( + std::static_pointer_cast(datetimeArray)->Value(row))); + break; + } + + case NUdf::EDataSlot::TzDate32: { + YQL_ENSURE(typeId == arrow::Type::INT32); + value = NUdf::TUnboxedValuePod(static_cast( + std::static_pointer_cast(datetimeArray)->Value(row))); + break; + } + + case NUdf::EDataSlot::TzDatetime64: + case NUdf::EDataSlot::TzTimestamp64: { + YQL_ENSURE(typeId == arrow::Type::INT64); + value = NUdf::TUnboxedValuePod(static_cast( + std::static_pointer_cast(datetimeArray)->Value(row))); + break; + } + + default: { + YQL_ENSURE(false, "Unexpected timezone datetime data type"); + return NUdf::TUnboxedValuePod(); + } + } + + auto view = timezoneArray->Value(row); + value.SetTimezoneId(NMiniKQL::GetTimezoneId(NUdf::TStringRef(view.data(), view.size()))); + return value; +} + +template <> +NUdf::TUnboxedValue ExtractDataValue(std::shared_ptr column, ui32 row, NUdf::EDataSlot dataSlot) { + Y_UNUSED(dataSlot); + auto array = std::static_pointer_cast(column); + auto data = array->GetView(row); + return NMiniKQL::MakeString(NUdf::TStringRef(data.data(), data.size())); +} + +template <> +NUdf::TUnboxedValue ExtractDataValue(std::shared_ptr column, ui32 row, NUdf::EDataSlot dataSlot) { + auto array = std::static_pointer_cast(column); + auto data = array->GetView(row); + + switch (dataSlot) { + case NUdf::EDataSlot::Utf8: + case NUdf::EDataSlot::Json: { + return NMiniKQL::MakeString(NUdf::TStringRef(data.data(), data.size())); + } + + case NUdf::EDataSlot::JsonDocument: { + auto variant = 
NBinaryJson::SerializeToBinaryJson(TStringBuf(data.data(), data.size())); + if (std::holds_alternative(variant)) { + const auto& json = std::get(variant); + return NMiniKQL::MakeString(NUdf::TStringRef(json.Data(), json.Size())); + } + + YQL_ENSURE(false, "Cannot serialize to binary json"); + break; + } + + case NUdf::EDataSlot::DyNumber: { + auto number = NDyNumber::ParseDyNumberString(TStringBuf(data.data(), data.size())); + if (number.Defined()) { + return NMiniKQL::MakeString(*number); + } + + YQL_ENSURE(false, "Failed to convert string to DyNumber"); + break; + } + + default: { + YQL_ENSURE(false, "Unexpected data slot"); + } + } + return NUdf::TUnboxedValuePod(); +} + +template <> +NUdf::TUnboxedValue ExtractDataValue(std::shared_ptr column, ui32 row, NUdf::EDataSlot dataSlot) { + auto array = std::static_pointer_cast(column); + auto data = array->GetView(row); + + switch (dataSlot) { + case NUdf::EDataSlot::Uuid: { + return NMiniKQL::MakeString(NUdf::TStringRef(data.data(), data.size())); + } + + case NUdf::EDataSlot::Decimal: { + NYql::NDecimal::TInt128 value; + std::memcpy(&value, data.data(), data.size()); + return NUdf::TUnboxedValuePod(value); + } + + default: { + YQL_ENSURE(false, "Unexpected data slot"); + } + } + return NUdf::TUnboxedValuePod(); +} + +NUdf::TUnboxedValue ExtractUnboxedValue(const std::shared_ptr& array, ui64 row, + const NMiniKQL::TDataType* dataType) +{ + NUdf::TUnboxedValue result; + auto dataSlot = *dataType->GetDataSlot().Get(); + bool success = SwitchMiniKQLDataTypeToArrowType(dataSlot, + [&]() { + result = ExtractDataValue(array, row, dataSlot); + return true; + }); + YQL_ENSURE(success, "Failed to extract unboxed value from arrow array"); + return result; +} + +NUdf::TUnboxedValue ExtractUnboxedValue(const std::shared_ptr& array, ui64 row, + const NMiniKQL::TOptionalType* optionalType, const NMiniKQL::THolderFactory& holderFactory) +{ + auto innerType = SkipTaggedType(optionalType->GetItemType()); + ui32 depth = 1; + + while 
(innerType->IsOptional()) { + innerType = SkipTaggedType(static_cast(innerType)->GetItemType()); + ++depth; + } + + // For types without native validity bitmap (e.g., Variant, Null) we need to wrap them in an additional struct layer + // Furthermore, other singular types (e.g., Void, EmptyList, EmptyDict) also need to wrap (from YQL-15332) + // Thus, the depth == 2 for Optional> type + if (NeedWrapByExternalOptional(innerType)) { + ++depth; + } + + auto innerArray = array; + NUdf::TUnboxedValue value; + + for (ui32 i = 1; i < depth; ++i) { + YQL_ENSURE(innerArray->type_id() == arrow::Type::STRUCT, "Unexpected array type"); + auto structArray = static_pointer_cast(innerArray); + YQL_ENSURE(structArray->num_fields() == 1, "Unexpected count of fields"); + + if (structArray->IsNull(row)) { + for (ui32 j = 1; j < i; ++j) { + value = value.MakeOptional(); + } + return value; + } + + innerArray = structArray->field(0); + } + + if (!innerArray->IsNull(row)) { + value = NFormats::ExtractUnboxedValue(innerArray, row, innerType, holderFactory); + } + + for (ui32 i = 1; i < depth; ++i) { + value = value.MakeOptional(); + } + return value; +} + +NUdf::TUnboxedValue ExtractUnboxedValue(const std::shared_ptr& array, ui64 row, + const NMiniKQL::TStructType* structType, const NMiniKQL::THolderFactory& holderFactory) +{ + YQL_ENSURE(array->type_id() == arrow::Type::STRUCT, "Unexpected array type"); + auto typedArray = static_pointer_cast(array); + YQL_ENSURE(static_cast(typedArray->num_fields()) == structType->GetMembersCount(), "Unexpected count of fields"); + + NUdf::TUnboxedValue* itemsPtr = nullptr; + auto result = holderFactory.CreateDirectArrayHolder(structType->GetMembersCount(), itemsPtr); + + for (ui32 index = 0; index < structType->GetMembersCount(); ++index) { + auto memberType = structType->GetMemberType(index); + itemsPtr[index] = NFormats::ExtractUnboxedValue(typedArray->field(index), row, memberType, holderFactory); + } + return result; +} + +NUdf::TUnboxedValue 
ExtractUnboxedValue(const std::shared_ptr& array, ui64 row, + const NMiniKQL::TTupleType* tupleType, const NMiniKQL::THolderFactory& holderFactory) +{ + YQL_ENSURE(array->type_id() == arrow::Type::STRUCT, "Unexpected array type"); + auto typedArray = static_pointer_cast(array); + YQL_ENSURE(static_cast(typedArray->num_fields()) == tupleType->GetElementsCount(), "Unexpected count of fields"); + + NUdf::TUnboxedValue* itemsPtr = nullptr; + auto result = holderFactory.CreateDirectArrayHolder(tupleType->GetElementsCount(), itemsPtr); + + for (ui32 index = 0; index < tupleType->GetElementsCount(); ++index) { + auto elementType = tupleType->GetElementType(index); + itemsPtr[index] = NFormats::ExtractUnboxedValue(typedArray->field(index), row, elementType, holderFactory); + } + return result; +} + +NUdf::TUnboxedValue ExtractUnboxedValue(const std::shared_ptr& array, ui64 row, + const NMiniKQL::TListType* listType, const NMiniKQL::THolderFactory& holderFactory) +{ + YQL_ENSURE(array->type_id() == arrow::Type::LIST, "Unexpected array type"); + auto typedArray = static_pointer_cast(array); + + auto arraySlice = typedArray->value_slice(row); + auto itemType = listType->GetItemType(); + const auto len = arraySlice->length(); + + NUdf::TUnboxedValue* items = nullptr; + auto list = holderFactory.CreateDirectArrayHolder(len, items); + for (ui64 i = 0; i < static_cast(len); ++i) { + *items++ = NFormats::ExtractUnboxedValue(arraySlice, i, itemType, holderFactory); + } + return list; +} + +NUdf::TUnboxedValue ExtractUnboxedValue(const std::shared_ptr& array, ui64 row, + const NMiniKQL::TDictType* dictType, const NMiniKQL::THolderFactory& holderFactory) +{ + auto keyType = dictType->GetKeyType(); + auto payloadType = dictType->GetPayloadType(); + auto dictBuilder = holderFactory.NewDict(dictType, 0); + + YQL_ENSURE(array->type_id() == arrow::Type::LIST, "Unexpected array type"); + auto listArray = static_pointer_cast(array); + YQL_ENSURE(listArray->value_type()->id() == 
arrow::Type::STRUCT, "Unexpected array type"); + + auto structArray = static_pointer_cast(listArray->value_slice(row)); + YQL_ENSURE(static_cast(structArray->num_fields()) == 2, "Unexpected count of fields"); + + std::shared_ptr keyArray = structArray->field(0); + std::shared_ptr payloadArray = structArray->field(1); + + for (ui64 i = 0; i < static_cast(structArray->length()); ++i) { + auto key = NFormats::ExtractUnboxedValue(keyArray, i, keyType, holderFactory); + auto payload = NFormats::ExtractUnboxedValue(payloadArray, i, payloadType, holderFactory); + dictBuilder->Add(std::move(key), std::move(payload)); + } + return dictBuilder->Build(); +} + +NUdf::TUnboxedValue ExtractUnboxedValue(const std::shared_ptr& array, ui64 row, + const NMiniKQL::TVariantType* variantType, const NMiniKQL::THolderFactory& holderFactory) +{ + YQL_ENSURE(array->type_id() == arrow::Type::DENSE_UNION, "Unexpected array type"); + auto unionArray = static_pointer_cast(array); + + auto variantIndex = unionArray->child_id(row); + auto rowInChild = unionArray->value_offset(row); + auto valuesArray = unionArray->field(variantIndex); + + YQL_ENSURE(variantType->GetAlternativesCount() <= MAX_VARIANT_NESTED_SIZE, "Variant type has more than " << MAX_VARIANT_NESTED_SIZE << " alternatives"); + + if (variantType->GetAlternativesCount() > MAX_VARIANT_FLATTEN_SIZE) { + YQL_ENSURE(valuesArray->type_id() == arrow::Type::DENSE_UNION, "Unexpected array type"); + auto innerUnionArray = static_pointer_cast(valuesArray); + auto innerVariantIndex = innerUnionArray->child_id(rowInChild); + + rowInChild = innerUnionArray->value_offset(rowInChild); + valuesArray = innerUnionArray->field(innerVariantIndex); + variantIndex = variantIndex * MAX_VARIANT_FLATTEN_SIZE + innerVariantIndex; + } + + NMiniKQL::TType* innerType = variantType->GetUnderlyingType(); + if (innerType->IsStruct()) { + innerType = static_cast(innerType)->GetMemberType(variantIndex); + } else { + YQL_ENSURE(innerType->IsTuple(), "Unexpected 
underlying variant type: " << innerType->GetKindAsStr()); + innerType = static_cast(innerType)->GetElementType(variantIndex); + } + + auto value = NFormats::ExtractUnboxedValue(valuesArray, rowInChild, innerType, holderFactory); + return holderFactory.CreateVariantHolder(value.Release(), variantIndex); +} + +} // namespace + +std::unique_ptr MakeArrowBuilder(const NMiniKQL::TType* type) { + auto arrayType = GetArrowType(type); + std::unique_ptr builder; + auto status = arrow::MakeBuilder(arrow::default_memory_pool(), arrayType, &builder); + YQL_ENSURE(status.ok(), "Failed to make arrow builder: " << status.ToString()); + return builder; +} + +std::shared_ptr MakeArrowArray(NMiniKQL::TUnboxedValueVector& values, const NMiniKQL::TType* itemType) { + auto builder = MakeArrowBuilder(itemType); + auto status = builder->Reserve(values.size()); + YQL_ENSURE(status.ok(), "Failed to reserve space for array: " << status.ToString()); + for (auto& value : values) { + AppendElement(value, builder.get(), itemType); + } + std::shared_ptr result; + status = builder->Finish(&result); + YQL_ENSURE(status.ok(), "Failed to finish array: " << status.ToString()); + return result; +} + +NUdf::TUnboxedValue ExtractUnboxedValue(const std::shared_ptr& array, ui64 row, const NMiniKQL::TType* itemType, + const NMiniKQL::THolderFactory& holderFactory) +{ + if (array->IsNull(row)) { + return NUdf::TUnboxedValuePod(); + } + + switch (itemType->GetKind()) { + case NMiniKQL::TType::EKind::Null: + case NMiniKQL::TType::EKind::Void: + case NMiniKQL::TType::EKind::EmptyList: + case NMiniKQL::TType::EKind::EmptyDict: { + break; + } + + case NMiniKQL::TType::EKind::Data: { + return ExtractUnboxedValue(array, row, static_cast(itemType)); + } + + case NMiniKQL::TType::EKind::Optional: { + return ExtractUnboxedValue(array, row, static_cast(itemType), holderFactory); + } + + case NMiniKQL::TType::EKind::Struct: { + return ExtractUnboxedValue(array, row, static_cast(itemType), holderFactory); + } + + case 
NMiniKQL::TType::EKind::Tuple: { + return ExtractUnboxedValue(array, row, static_cast(itemType), holderFactory); + } + + case NMiniKQL::TType::EKind::List: { + return ExtractUnboxedValue(array, row, static_cast(itemType), holderFactory); + } + + case NMiniKQL::TType::EKind::Dict: { + return ExtractUnboxedValue(array, row, static_cast(itemType), holderFactory); + } + + case NMiniKQL::TType::EKind::Variant: { + return ExtractUnboxedValue(array, row, static_cast(itemType), holderFactory); + } + + case NMiniKQL::TType::EKind::Tagged: { + return ExtractUnboxedValue(array, row, static_cast(itemType)->GetBaseType(), holderFactory); + } + + case NMiniKQL::TType::EKind::Type: + case NMiniKQL::TType::EKind::Stream: + case NMiniKQL::TType::EKind::Callable: + case NMiniKQL::TType::EKind::Any: + case NMiniKQL::TType::EKind::Resource: + case NMiniKQL::TType::EKind::Flow: + case NMiniKQL::TType::EKind::ReservedKind: + case NMiniKQL::TType::EKind::Block: + case NMiniKQL::TType::EKind::Pg: + case NMiniKQL::TType::EKind::Multi: + case NMiniKQL::TType::EKind::Linear: { + YQL_ENSURE(false, "Unsupported type: " << itemType->GetKindAsStr()); + } + } + return NUdf::TUnboxedValuePod(); +} + +NMiniKQL::TUnboxedValueVector ExtractUnboxedVector(const std::shared_ptr& array, const NMiniKQL::TType* itemType, + const NMiniKQL::THolderFactory& holderFactory) +{ + NMiniKQL::TUnboxedValueVector values; + values.reserve(array->length()); + for (auto i = 0; i < array->length(); ++i) { + values.push_back(ExtractUnboxedValue(array, i, itemType, holderFactory)); + } + return values; +} + +} // namespace NKikimr::NKqp::NFormats diff --git a/ydb/core/kqp/common/result_set_format/ut/kqp_formats_ut_helpers.h b/ydb/core/kqp/common/result_set_format/ut/kqp_formats_ut_helpers.h new file mode 100644 index 000000000000..028183709c9b --- /dev/null +++ b/ydb/core/kqp/common/result_set_format/ut/kqp_formats_ut_helpers.h @@ -0,0 +1,62 @@ +#pragma once + +#include + +#include +#include + +/** + * @file 
kqp_formats_ut_helpers.h + * @brief Utilities for testing KQP formats. + * + * This module provides utilities for testing KQP formats. + * It includes functions for making arrow arrays and extracting unboxed values from arrow arrays. + */ + +namespace NKikimr::NKqp::NFormats { + +/** + * @brief Make arrow array builder for given type. + * The type is converted to arrow type by NKqp::NFormats::GetArrowType function. + * + * @param type type to make builder for + * @return unique pointer to arrow array builder + */ +std::unique_ptr MakeArrowBuilder(const NMiniKQL::TType* type); + +/** + * @brief Make arrow array for given values and type. + * The type is converted to arrow type by NKqp::NFormats::GetArrowType function. + * + * @param values values to make array for + * @param itemType type of each element to parse it and to construct corresponding arrow type + * @return shared pointer to arrow array + */ +std::shared_ptr MakeArrowArray(NMiniKQL::TUnboxedValueVector& values, const NMiniKQL::TType* itemType); + +/** + * @brief Extract unboxed value from arrow array for given row and type. + * The type of the item and the arrow array type must be the same by NKqp::NFormats::GetArrowType function. + * + * @param array arrow array to extract value from + * @param row row to extract value from + * @param itemType type of each element to parse it and to construct corresponding arrow type + * @param holderFactory holder factory to use + * @return unboxed value + */ +NUdf::TUnboxedValue ExtractUnboxedValue(const std::shared_ptr& array, ui64 row, + const NMiniKQL::TType* itemType, const NMiniKQL::THolderFactory& holderFactory); + +/** + * @brief Extract unboxed values from arrow array for given type. + * The type of items and the arrow array type must be the same by NKqp::NFormats::GetArrowType function. 
+ * + * @param array arrow array to extract values from + * @param itemType type of each element to parse it and to construct corresponding arrow type + * @param holderFactory holder factory to use + * @return vector of unboxed values + */ +NMiniKQL::TUnboxedValueVector ExtractUnboxedVector(const std::shared_ptr& array, + const NMiniKQL::TType* itemType, const NMiniKQL::THolderFactory& holderFactory); + +} // namespace NKikimr::NKqp::NFormats diff --git a/ydb/core/kqp/common/result_set_format/ut/kqp_result_set_arrow_ut.cpp b/ydb/core/kqp/common/result_set_format/ut/kqp_result_set_arrow_ut.cpp deleted file mode 100644 index 2339e1fdd98e..000000000000 --- a/ydb/core/kqp/common/result_set_format/ut/kqp_result_set_arrow_ut.cpp +++ /dev/null @@ -1,1993 +0,0 @@ -#include - -#include - -#include - -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include -#include - -#include - -using namespace NKikimr::NMiniKQL; -using namespace NKikimr::NArrow; -using namespace NYql; - -inline static constexpr size_t TEST_ARRAY_SIZE = 1 << 16; -inline static constexpr ui8 DECIMAL_PRECISION = 35; -inline static constexpr ui8 DECIMAL_SCALE = 10; - -static_assert(DECIMAL_PRECISION >= DECIMAL_SCALE, "Decimal precision must be greater than or equal to scale"); - -namespace { - -ui16 GetTimezoneIdSkipEmpty(ui16 index) { - auto size = NTi::GetTimezones().size(); - while (NTi::GetTimezones()[index % size].empty()) { - index = (index + 1) % size; - } - return GetTimezoneId(NTi::GetTimezones()[index % size]); -} - -std::string SerializeToBinaryJson(const TStringBuf json) { - auto variant = NKikimr::NBinaryJson::SerializeToBinaryJson(json); - if (std::holds_alternative(variant)) { - const auto binaryJson = std::get(variant); - return std::string(binaryJson.Data(), binaryJson.Size()); - } - UNIT_ASSERT_C(false, "Cannot serialize binary json"); - return {}; -} - -NUdf::TUnboxedValue GetValueOfBasicType(TType* type, ui64 value) { - 
Y_ABORT_UNLESS(type->GetKind() == TType::EKind::Data); - auto dataType = static_cast(type); - auto slot = *dataType->GetDataSlot().Get(); - switch (slot) { - case NUdf::EDataSlot::Bool: - return NUdf::TUnboxedValuePod(static_cast(value % 2 == 0)); - case NUdf::EDataSlot::Int8: - return NUdf::TUnboxedValuePod(static_cast(-(value % ((1 << 7) - 1)))); - case NUdf::EDataSlot::Uint8: - return NUdf::TUnboxedValuePod(static_cast(value % ((1 << 8)))); - case NUdf::EDataSlot::Int16: - return NUdf::TUnboxedValuePod(static_cast(-(value % ((1 << 15) - 1)))); - case NUdf::EDataSlot::Uint16: - return NUdf::TUnboxedValuePod(static_cast(value % (1 << 15))); - case NUdf::EDataSlot::Int32: - return NUdf::TUnboxedValuePod(static_cast(-(value % ((1ULL << 31) - 1)))); - case NUdf::EDataSlot::Uint32: - return NUdf::TUnboxedValuePod(static_cast(value % (1ULL << 31))); - case NUdf::EDataSlot::Int64: - return NUdf::TUnboxedValuePod(static_cast(-(value % ((1ULL << 63) - 1)))); - case NUdf::EDataSlot::Uint64: - return NUdf::TUnboxedValuePod(static_cast(value % (1ULL << 63))); - case NUdf::EDataSlot::Float: - return NUdf::TUnboxedValuePod(static_cast(value) / 1234); - case NUdf::EDataSlot::Double: - return NUdf::TUnboxedValuePod(static_cast(value) / 12345); - case NUdf::EDataSlot::Decimal: { - auto decimal = NYql::NDecimal::FromString(TStringBuilder() << value << ".123", DECIMAL_PRECISION, DECIMAL_SCALE); - return NUdf::TUnboxedValuePod(decimal); - } - case NUdf::EDataSlot::DyNumber: { - auto number = NKikimr::NDyNumber::ParseDyNumberString(TStringBuilder() << value); - UNIT_ASSERT_C(number.Defined(), "Failed to convert string to DyNumber"); - return MakeString(*number); - } - case NUdf::EDataSlot::Date: - return NUdf::TUnboxedValuePod(static_cast(value % NUdf::MAX_DATE)); - case NUdf::EDataSlot::Datetime: - return NUdf::TUnboxedValuePod(static_cast(value % NUdf::MAX_DATETIME)); - case NUdf::EDataSlot::Timestamp: - return NUdf::TUnboxedValuePod(static_cast(value % NUdf::MAX_TIMESTAMP)); - 
case NUdf::EDataSlot::Interval: - return NUdf::TUnboxedValuePod(static_cast(value / 2 - 1)); - case NUdf::EDataSlot::TzDate: { - auto ret = NUdf::TUnboxedValuePod(static_cast(value % NUdf::MAX_DATE)); - ret.SetTimezoneId(GetTimezoneIdSkipEmpty(value)); - return ret; - } - case NUdf::EDataSlot::TzDatetime: { - auto ret = NUdf::TUnboxedValuePod(static_cast(value % NUdf::MAX_DATETIME)); - ret.SetTimezoneId(GetTimezoneIdSkipEmpty(value)); - return ret; - } - case NUdf::EDataSlot::TzTimestamp: { - auto ret = NUdf::TUnboxedValuePod(static_cast(value % NUdf::MAX_TIMESTAMP)); - ret.SetTimezoneId(GetTimezoneIdSkipEmpty(value)); - return ret; - } - case NUdf::EDataSlot::Date32: - return NUdf::TUnboxedValuePod(static_cast(value % NUdf::MAX_DATE32)); - case NUdf::EDataSlot::Datetime64: - return NUdf::TUnboxedValuePod(static_cast(value % NUdf::MAX_DATETIME64)); - case NUdf::EDataSlot::Timestamp64: - return NUdf::TUnboxedValuePod(static_cast(value % NUdf::MAX_TIMESTAMP64)); - case NUdf::EDataSlot::Interval64: - return NUdf::TUnboxedValuePod(static_cast(value % NUdf::MAX_INTERVAL64)); - case NUdf::EDataSlot::TzDate32: { - auto ret = NUdf::TUnboxedValuePod(static_cast(value % NUdf::MAX_DATE32)); - ret.SetTimezoneId(GetTimezoneIdSkipEmpty(value)); - return ret; - } - case NUdf::EDataSlot::TzDatetime64: { - auto ret = NUdf::TUnboxedValuePod(static_cast(value % NUdf::MAX_DATETIME64)); - ret.SetTimezoneId(GetTimezoneIdSkipEmpty(value)); - return ret; - } - case NUdf::EDataSlot::TzTimestamp64: { - auto ret = NUdf::TUnboxedValuePod(static_cast(value % NUdf::MAX_TIMESTAMP64)); - ret.SetTimezoneId(GetTimezoneIdSkipEmpty(value)); - return ret; - } - case NUdf::EDataSlot::String: { - std::string string = TStringBuilder() << value; - return MakeString(NUdf::TStringRef(string.data(), string.size())); - } - case NUdf::EDataSlot::Utf8: { - std::string string = TStringBuilder() << value << "utf8"; - return MakeString(NUdf::TStringRef(string.data(), string.size())); - } - case 
NUdf::EDataSlot::Yson: { - std::string yson = TStringBuilder() << '[' << value << ']'; - return MakeString(NUdf::TStringRef(yson.data(), yson.size())); - } - case NUdf::EDataSlot::Json: { - std::string json = TStringBuilder() << '[' << value << ']'; - return MakeString(NUdf::TStringRef(json.data(), json.size())); - } - case NUdf::EDataSlot::JsonDocument: { - std::string json = SerializeToBinaryJson(TStringBuilder() << "{\"b\": " << value << ", \"a\": " << value / 2 << "}"); - return MakeString(NUdf::TStringRef(json.data(), json.size())); - } - case NUdf::EDataSlot::Uuid: { - std::string uuid; - for (size_t i = 0; i < NKikimr::NScheme::FSB_SIZE / 2; ++i) { - uuid += "a" + std::to_string((i + value) % 10); - } - return MakeString(NUdf::TStringRef(uuid)); - } - } - - return NUdf::TUnboxedValuePod(); -} - -struct TTestContext { - TScopedAlloc Alloc; - TTypeEnvironment TypeEnv; - TMemoryUsageInfo MemInfo; - THolderFactory HolderFactory; - TDefaultValueBuilder Vb; - ui16 VariantSize = 0; - - TVector BasicTypes = { - TDataType::Create(NUdf::TDataType::Id, TypeEnv), - TDataType::Create(NUdf::TDataType::Id, TypeEnv), - TDataType::Create(NUdf::TDataType::Id, TypeEnv), - TDataType::Create(NUdf::TDataType::Id, TypeEnv), - TDataType::Create(NUdf::TDataType::Id, TypeEnv), - TDataType::Create(NUdf::TDataType::Id, TypeEnv), - TDataType::Create(NUdf::TDataType::Id, TypeEnv), - TDataType::Create(NUdf::TDataType::Id, TypeEnv), - TDataType::Create(NUdf::TDataType::Id, TypeEnv), - TDataType::Create(NUdf::TDataType::Id, TypeEnv), - TDataType::Create(NUdf::TDataType::Id, TypeEnv), - TDataDecimalType::Create(DECIMAL_PRECISION, DECIMAL_SCALE, TypeEnv), - TDataType::Create(NUdf::TDataType::Id, TypeEnv), - TDataType::Create(NUdf::TDataType::Id, TypeEnv), - TDataType::Create(NUdf::TDataType::Id, TypeEnv), - TDataType::Create(NUdf::TDataType::Id, TypeEnv), - TDataType::Create(NUdf::TDataType::Id, TypeEnv), - TDataType::Create(NUdf::TDataType::Id, TypeEnv), - 
TDataType::Create(NUdf::TDataType::Id, TypeEnv), - TDataType::Create(NUdf::TDataType::Id, TypeEnv), - TDataType::Create(NUdf::TDataType::Id, TypeEnv), - TDataType::Create(NUdf::TDataType::Id, TypeEnv), - TDataType::Create(NUdf::TDataType::Id, TypeEnv), - TDataType::Create(NUdf::TDataType::Id, TypeEnv), - TDataType::Create(NUdf::TDataType::Id, TypeEnv), - TDataType::Create(NUdf::TDataType::Id, TypeEnv), - TDataType::Create(NUdf::TDataType::Id, TypeEnv), - TDataType::Create(NUdf::TDataType::Id, TypeEnv), - TDataType::Create(NUdf::TDataType::Id, TypeEnv), - TDataType::Create(NUdf::TDataType::Id, TypeEnv), - TDataType::Create(NUdf::TDataType::Id, TypeEnv), - TDataType::Create(NUdf::TDataType::Id, TypeEnv), - TDataType::Create(NUdf::TDataType::Id, TypeEnv) - }; - - TTestContext() - : Alloc(__LOCATION__) - , TypeEnv(Alloc) - , MemInfo("TestMem") - , HolderFactory(Alloc.Ref(), MemInfo) - , Vb(HolderFactory) - { - } - - TType* GetStructType() { - TStructMember members[3] = { - {"s", TDataType::Create(NUdf::TDataType::Id, TypeEnv)}, - {"x", TDataType::Create(NUdf::TDataType::Id, TypeEnv)}, - {"y", TDataType::Create(NUdf::TDataType::Id, TypeEnv)} - }; - return TStructType::Create(3, members, TypeEnv); - } - - TUnboxedValueVector CreateStructs(ui32 quantity) { - TUnboxedValueVector values; - for (ui32 value = 0; value < quantity; ++value) { - NUdf::TUnboxedValue* items; - auto structValue = Vb.NewArray(3, items); - std::string string = TStringBuilder() << value; - items[0] = MakeString(NUdf::TStringRef(string.data(), string.size())); - items[1] = NUdf::TUnboxedValuePod(static_cast(-value)); - items[2] = NUdf::TUnboxedValuePod((ui64) (value * value)); - values.emplace_back(std::move(structValue)); - } - return values; - } - - TType* GetTupleType() { - TType* members[3] = { - TDataType::Create(NUdf::TDataType::Id, TypeEnv), - TDataType::Create(NUdf::TDataType::Id, TypeEnv), - TDataType::Create(NUdf::TDataType::Id, TypeEnv) - }; - return TTupleType::Create(3, members, TypeEnv); 
- } - - TUnboxedValueVector CreateTuples(ui32 quantity) { - NKikimr::NMiniKQL::TUnboxedValueVector values; - for (ui32 value = 0; value < quantity; ++value) { - NUdf::TUnboxedValue* items; - auto tupleValue = Vb.NewArray(3, items); - items[0] = NUdf::TUnboxedValuePod(value % 3 == 0); - items[1] = NUdf::TUnboxedValuePod(static_cast(-value)); - items[2] = NUdf::TUnboxedValuePod(static_cast(value)); - values.push_back(std::move(tupleValue)); - } - return values; - } - - TType* GetDictUtf8ToIntervalType() { - TType* keyType = TDataType::Create(NUdf::TDataType::Id, TypeEnv); - TType* payloadType = TDataType::Create(NUdf::TDataType::Id, TypeEnv); - return TDictType::Create(keyType, payloadType, TypeEnv); - } - - TUnboxedValueVector CreateDictUtf8ToInterval(ui32 quantity) { - NKikimr::NMiniKQL::TUnboxedValueVector values; - auto dictType = GetDictUtf8ToIntervalType(); - for (ui32 value = 0; value < quantity; ++value) { - auto dictBuilder = Vb.NewDict(dictType, 0); - for (ui32 i = 0; i < value * value; ++i) { - std::string string = TStringBuilder() << "This is a long string #" << i; - NUdf::TUnboxedValue key = MakeString(NUdf::TStringRef(string.data(), string.size())); - NUdf::TUnboxedValue payload = NUdf::TUnboxedValuePod(static_cast(value * i)); - dictBuilder->Add(std::move(key), std::move(payload)); - } - auto dictValue = dictBuilder->Build(); - values.emplace_back(std::move(dictValue)); - } - return values; - } - - TType* GetListOfJsonsType() { - TType* itemType = TDataType::Create(NUdf::TDataType::Id, TypeEnv); - return TListType::Create(itemType, TypeEnv); - } - - TUnboxedValueVector CreateListOfJsons(ui32 quantity) { - TUnboxedValueVector values; - for (ui64 value = 0; value < quantity; ++value) { - TUnboxedValueVector items; - items.reserve(value); - for (ui64 i = 0; i < value; ++i) { - std::string json = TStringBuilder() << "{'item':" << i << "}"; - items.push_back(MakeString(NUdf::TStringRef(json.data(), json.size()))); - } - auto listValue = 
Vb.NewList(items.data(), value); - values.emplace_back(std::move(listValue)); - } - return values; - } - - TType* GetOptionalListOfOptional() { - TType* itemType = TOptionalType::Create(TDataType::Create(NUdf::TDataType::Id, TypeEnv), TypeEnv); - return TOptionalType::Create(TListType::Create(itemType, TypeEnv), TypeEnv); - } - - TUnboxedValueVector CreateOptionalListOfOptional(ui32 quantity) { - TUnboxedValueVector values; - for (ui64 value = 0; value < quantity; ++value) { - if (value % 2 == 0) { - values.emplace_back(NUdf::TUnboxedValuePod()); - continue; - } - - TUnboxedValueVector items; - items.reserve(value); - for (ui64 i = 0; i < value; ++i) { - NUdf::TUnboxedValue item = ((value + i) % 2 == 0) ? NUdf::TUnboxedValuePod() : NUdf::TUnboxedValuePod(i); - items.push_back(std::move(item).MakeOptional()); - } - - auto listValue = Vb.NewList(items.data(), value); - values.emplace_back(std::move(listValue).MakeOptional()); - } - return values; - } - - TType* GetVariantOverStructType() { - TStructMember members[4] = { - {"0_yson", TDataType::Create(NUdf::TDataType::Id, TypeEnv)}, - {"1_json-document", TDataType::Create(NUdf::TDataType::Id, TypeEnv)}, - {"2_uuid", TDataType::Create(NUdf::TDataType::Id, TypeEnv)}, - {"3_float", TDataType::Create(NUdf::TDataType::Id, TypeEnv)} - }; - auto structType = TStructType::Create(4, members, TypeEnv); - return TVariantType::Create(structType, TypeEnv); - } - - TUnboxedValueVector CreateVariantOverStruct(ui32 quantity) { - TUnboxedValueVector values; - for (ui64 value = 0; value < quantity; ++value) { - auto typeIndex = value % 4; - NUdf::TUnboxedValue item; - if (typeIndex == 0) { - std::string data = TStringBuilder() << "{value=" << value << "}"; - item = MakeString(NUdf::TStringRef(data.data(), data.size())); - } else if (typeIndex == 1) { - std::string data = TStringBuilder() << "{\"value\":" << value << "}"; - item = MakeString(SerializeToBinaryJson(data)); - } else if (typeIndex == 2) { - std::string sample = 
"7856341212905634789012345678901"; - std::string data = TStringBuilder() << HexDecode(sample + static_cast('0' + (value % 10))); - item = MakeString(NUdf::TStringRef(data.data(), data.size())); - } else if (typeIndex == 3) { - item = NUdf::TUnboxedValuePod(static_cast(value) / 4); - } - auto wrapped = Vb.NewVariant(typeIndex, std::move(item)); - values.push_back(std::move(wrapped)); - } - return values; - } - - TType* GetOptionalVariantOverStructType() { - return TOptionalType::Create(GetVariantOverStructType(), TypeEnv); - } - - TUnboxedValueVector CreateOptionalVariantOverStruct(ui32 quantity) { - TUnboxedValueVector values; - for (ui64 value = 0; value < quantity; ++value) { - auto typeIndex = value % 4; - NUdf::TUnboxedValue item; - - if (value % 2 == 0) { - values.push_back(NUdf::TUnboxedValuePod()); - continue; - } - - if (typeIndex == 0) { - std::string data = TStringBuilder() << "{value=" << value << "}"; - item = MakeString(NUdf::TStringRef(data.data(), data.size())); - } else if (typeIndex == 1) { - std::string data = TStringBuilder() << "{\"value\":" << value << "}"; - item = MakeString(SerializeToBinaryJson(data)); - } else if (typeIndex == 2) { - std::string sample = "7856341212905634789012345678901"; - std::string data = TStringBuilder() << HexDecode(sample + static_cast('0' + (value % 10))); - item = MakeString(NUdf::TStringRef(data.data(), data.size())); - } else if (typeIndex == 3) { - item = NUdf::TUnboxedValuePod(static_cast(value) / 4); - } - auto wrapped = Vb.NewVariant(typeIndex, std::move(item)).MakeOptional(); - values.push_back(std::move(wrapped)); - } - return values; - } - - TType* GetDoubleOptionalVariantOverStructType() { - return TOptionalType::Create(GetOptionalVariantOverStructType(), TypeEnv); - } - - TUnboxedValueVector CreateDoubleOptionalVariantOverStruct(ui32 quantity) { - TUnboxedValueVector values; - for (ui64 value = 0; value < quantity; ++value) { - auto typeIndex = value % 4; - NUdf::TUnboxedValue item; - - if (value % 3 == 
0) { - if (typeIndex == 0) { - std::string data = TStringBuilder() << "{value=" << value << "}"; - item = MakeString(NUdf::TStringRef(data.data(), data.size())); - } else if (typeIndex == 1) { - std::string data = TStringBuilder() << "{\"value\":" << value << "}"; - item = MakeString(SerializeToBinaryJson(data)); - } else if (typeIndex == 2) { - std::string sample = "7856341212905634789012345678901"; - std::string data = TStringBuilder() << HexDecode(sample + static_cast('0' + (value % 10))); - item = MakeString(NUdf::TStringRef(data.data(), data.size())); - } else if (typeIndex == 3) { - item = NUdf::TUnboxedValuePod(static_cast(value) / 4); - } - - item = Vb.NewVariant(typeIndex, std::move(item)).MakeOptional(); - } else { - item = NUdf::TUnboxedValuePod(); - } - - if (value % 3 != 2) { - item = item.MakeOptional(); - } - - values.push_back(std::move(item)); - } - return values; - } - - TType* GetVariantOverTupleWithOptionalsType() { - TType* members[5] = { - TDataType::Create(NUdf::TDataType::Id, TypeEnv), - TDataType::Create(NUdf::TDataType::Id, TypeEnv), - TDataType::Create(NUdf::TDataType::Id, TypeEnv), - TDataType::Create(NUdf::TDataType::Id, TypeEnv), - TOptionalType::Create(TDataType::Create(NUdf::TDataType::Id, TypeEnv), TypeEnv) - }; - auto tupleType = TTupleType::Create(5, members, TypeEnv); - return TVariantType::Create(tupleType, TypeEnv); - } - - TUnboxedValueVector CreateVariantOverTupleWithOptionals(ui32 quantity) { - NKikimr::NMiniKQL::TUnboxedValueVector values; - for (ui64 value = 0; value < quantity; ++value) { - auto typeIndex = value % 5; - NUdf::TUnboxedValue item; - if (typeIndex == 0) { - item = NUdf::TUnboxedValuePod(value % 3 == 0); - } else if (typeIndex == 1) { - item = NUdf::TUnboxedValuePod(static_cast(-value)); - } else if (typeIndex == 2) { - item = NUdf::TUnboxedValuePod(static_cast(value)); - } else if (typeIndex == 3) { - item = NUdf::TUnboxedValuePod(static_cast(-value)); - } else if (typeIndex == 4) { - NUdf::TUnboxedValue 
innerItem; - innerItem = value % 2 == 0 - ? NUdf::TUnboxedValuePod(static_cast(value)) - : NUdf::TUnboxedValuePod(); - item = innerItem.MakeOptional(); - } - auto wrapped = Vb.NewVariant(typeIndex, std::move(item)); - values.emplace_back(std::move(wrapped)); - } - return values; - } - - TType* GetOptionalVariantOverTupleWithOptionalsType() { - return TOptionalType::Create(GetVariantOverTupleWithOptionalsType(), TypeEnv); - } - - TUnboxedValueVector CreateOptionalVariantOverTupleWithOptionals(ui32 quantity) { - NKikimr::NMiniKQL::TUnboxedValueVector values; - for (ui64 value = 0; value < quantity; ++value) { - - if (value % 2 == 0) { - values.push_back(NUdf::TUnboxedValuePod()); - continue; - } - - auto typeIndex = value % 5; - NUdf::TUnboxedValue item; - if (typeIndex == 0) { - item = NUdf::TUnboxedValuePod(value % 3 == 0); - } else if (typeIndex == 1) { - item = NUdf::TUnboxedValuePod(static_cast(-value)); - } else if (typeIndex == 2) { - item = NUdf::TUnboxedValuePod(static_cast(value)); - } else if (typeIndex == 3) { - item = NUdf::TUnboxedValuePod(static_cast(-value)); - } else if (typeIndex == 4) { - NUdf::TUnboxedValue innerItem; - innerItem = value % 2 == 0 - ? 
NUdf::TUnboxedValuePod(static_cast(value)) - : NUdf::TUnboxedValuePod(); - item = innerItem.MakeOptional(); - } - auto wrapped = Vb.NewVariant(typeIndex, std::move(item)).MakeOptional(); - values.emplace_back(std::move(wrapped)); - } - return values; - } - - TType* GetDoubleOptionalVariantOverTupleWithOptionalsType() { - return TOptionalType::Create(GetOptionalVariantOverTupleWithOptionalsType(), TypeEnv); - } - - TUnboxedValueVector CreateDoubleOptionalVariantOverTupleWithOptionals(ui32 quantity) { - NKikimr::NMiniKQL::TUnboxedValueVector values; - for (ui64 value = 0; value < quantity; ++value) { - auto typeIndex = value % 5; - NUdf::TUnboxedValue item; - - if (value % 3 == 0) { - if (typeIndex == 0) { - item = NUdf::TUnboxedValuePod(value % 3 == 0); - } else if (typeIndex == 1) { - item = NUdf::TUnboxedValuePod(static_cast(-value)); - } else if (typeIndex == 2) { - item = NUdf::TUnboxedValuePod(static_cast(value)); - } else if (typeIndex == 3) { - item = NUdf::TUnboxedValuePod(static_cast(-value)); - } else if (typeIndex == 4) { - NUdf::TUnboxedValue innerItem; - innerItem = value % 2 == 0 - ? 
NUdf::TUnboxedValuePod(static_cast(value)) - : NUdf::TUnboxedValuePod(); - item = innerItem.MakeOptional(); - } - - item = Vb.NewVariant(typeIndex, std::move(item)); - } else { - item = NUdf::TUnboxedValuePod(); - } - - if (value % 3 != 2) { - item = item.MakeOptional(); - } - - values.emplace_back(std::move(item)); - } - return values; - } - - TType* GetDictOptionalToTupleType() { - TType* keyType = TOptionalType::Create(TDataType::Create(NUdf::TDataType::Id, TypeEnv), TypeEnv); - TType* members[2] = { - TDataType::Create(NUdf::TDataType::Id, TypeEnv), - TDataType::Create(NUdf::TDataType::Id, TypeEnv), - }; - TType* payloadType = TTupleType::Create(2, members, TypeEnv); - return TDictType::Create(keyType, payloadType, TypeEnv); - } - - TUnboxedValueVector CreateDictOptionalToTuple(ui32 quantity) { - NKikimr::NMiniKQL::TUnboxedValueVector values; - for (ui64 value = 0; value < quantity; ++value) { - auto dictBuilder = Vb.NewDict(GetDictOptionalToTupleType(), 0); - for (ui64 i = 0; i < value * value; ++i) { - NUdf::TUnboxedValue key; - if (i == 0) { - key = NUdf::TUnboxedValuePod(); - } else { - key = NUdf::TUnboxedValuePod(value / 4).MakeOptional(); - } - NUdf::TUnboxedValue* items; - auto payload = Vb.NewArray(2, items); - items[0] = NUdf::TUnboxedValuePod(static_cast(-value)); - items[1] = NUdf::TUnboxedValuePod(static_cast(value)); - dictBuilder->Add(std::move(key), std::move(payload)); - } - auto dictValue = dictBuilder->Build(); - values.emplace_back(std::move(dictValue)); - } - return values; - } - - TType* GetOptionalOfOptionalType() { - return TOptionalType::Create( - TOptionalType::Create( - TDataType::Create(NUdf::TDataType::Id, TypeEnv), - TypeEnv), - TypeEnv); - } - - TUnboxedValueVector CreateOptionalOfOptional(ui32 quantity) { - TUnboxedValueVector values; - for (ui64 value = 0; value < quantity; ++value) { - NUdf::TUnboxedValue element = value % 3 == 0 - ? 
NUdf::TUnboxedValuePod(value).MakeOptional() - : NUdf::TUnboxedValuePod(); - if (value % 3 != 2) { - element = element.MakeOptional(); - } - values.emplace_back(std::move(element)); - } - return values; - } - - TType* GetLargeVariantType(const ui16 variantSize) { - VariantSize = variantSize; - TVector tupleTypes; - tupleTypes.reserve(variantSize); - for (ui64 index = 0; index < variantSize; ++index) { - tupleTypes.push_back(TTupleType::Create(BasicTypes.size(), BasicTypes.data(), TypeEnv)); - } - auto tupleOfTuplesType = TTupleType::Create(variantSize, tupleTypes.data(), TypeEnv); - return TVariantType::Create(tupleOfTuplesType, TypeEnv); - } - - TUnboxedValueVector CreateLargeVariant(ui32 quantity) { - TUnboxedValueVector values; - for (ui64 index = 0; index < quantity; ++index) { - NUdf::TUnboxedValue item; - auto typeIndex = index % VariantSize; - TUnboxedValueVector tupleItems; - for (ui64 i = 0; i < BasicTypes.size(); ++i) { - tupleItems.push_back(GetValueOfBasicType(BasicTypes[i], i + typeIndex)); - } - auto wrapped = Vb.NewVariant(typeIndex, HolderFactory.VectorAsArray(tupleItems)); - values.emplace_back(std::move(wrapped)); - } - return values; - } -}; - -void AssertUnboxedValuesAreEqual(NUdf::TUnboxedValue& left, NUdf::TUnboxedValue& right, TType* type) { - switch (type->GetKind()) { - case TType::EKind::Void: - case TType::EKind::Null: - case TType::EKind::EmptyList: - case TType::EKind::EmptyDict: { - UNIT_ASSERT(!left.HasValue()); - UNIT_ASSERT(!right.HasValue()); - break; - } - - case TType::EKind::Data: { - auto dataType = static_cast(type); - auto dataSlot = *dataType->GetDataSlot().Get(); - // Json-like type are not comparable so just skip them - if (dataSlot != NUdf::EDataSlot::Json && dataSlot != NUdf::EDataSlot::Yson && dataSlot != NUdf::EDataSlot::JsonDocument) { - UNIT_ASSERT(NUdf::EquateValues(dataSlot, left, right)); - } - break; - } - - case TType::EKind::Optional: { - UNIT_ASSERT_EQUAL(left.HasValue(), right.HasValue()); - if 
(left.HasValue()) { - auto innerType = static_cast(type)->GetItemType(); - NUdf::TUnboxedValue leftInner = left.GetOptionalValue(); - NUdf::TUnboxedValue rightInner = right.GetOptionalValue(); - AssertUnboxedValuesAreEqual(leftInner, rightInner, innerType); - } - break; - } - - case TType::EKind::List: { - auto listType = static_cast(type); - auto itemType = listType->GetItemType(); - auto leftPtr = left.GetElements(); - auto rightPtr = right.GetElements(); - UNIT_ASSERT_EQUAL(leftPtr != nullptr, rightPtr != nullptr); - if (leftPtr != nullptr) { - auto leftLen = left.GetListLength(); - auto rightLen = right.GetListLength(); - UNIT_ASSERT_EQUAL(leftLen, rightLen); - while (leftLen > 0) { - NUdf::TUnboxedValue leftItem = *leftPtr++; - NUdf::TUnboxedValue rightItem = *rightPtr++; - AssertUnboxedValuesAreEqual(leftItem, rightItem, itemType); - --leftLen; - } - } else { - const auto leftIter = left.GetListIterator(); - const auto rightIter = right.GetListIterator(); - NUdf::TUnboxedValue leftItem; - NUdf::TUnboxedValue rightItem; - bool leftHasValue = leftIter.Next(leftItem); - bool rightHasValue = rightIter.Next(leftItem); - while (leftHasValue && rightHasValue) { - AssertUnboxedValuesAreEqual(leftItem, rightItem, itemType); - leftHasValue = leftIter.Next(leftItem); - rightHasValue = rightIter.Next(leftItem); - } - UNIT_ASSERT_EQUAL(leftHasValue, rightHasValue); - } - break; - } - - case TType::EKind::Struct: { - auto structType = static_cast(type); - UNIT_ASSERT_EQUAL(left.GetListLength(), structType->GetMembersCount()); - UNIT_ASSERT_EQUAL(right.GetListLength(), structType->GetMembersCount()); - for (ui32 index = 0; index < structType->GetMembersCount(); ++index) { - auto memberType = structType->GetMemberType(index); - NUdf::TUnboxedValue leftMember = left.GetElement(index); - NUdf::TUnboxedValue rightMember = right.GetElement(index); - AssertUnboxedValuesAreEqual(leftMember, rightMember, memberType); - } - break; - } - - case TType::EKind::Tuple: { - auto tupleType 
= static_cast(type); - UNIT_ASSERT_EQUAL(left.GetListLength(), tupleType->GetElementsCount()); - UNIT_ASSERT_EQUAL(right.GetListLength(), tupleType->GetElementsCount()); - for (ui32 index = 0; index < tupleType->GetElementsCount(); ++index) { - auto elementType = tupleType->GetElementType(index); - NUdf::TUnboxedValue leftMember = left.GetElement(index); - NUdf::TUnboxedValue rightMember = right.GetElement(index); - AssertUnboxedValuesAreEqual(leftMember, rightMember, elementType); - } - break; - } - - case TType::EKind::Dict: { - auto dictType = static_cast(type); - auto payloadType = dictType->GetPayloadType(); - - UNIT_ASSERT_EQUAL(left.GetDictLength(), right.GetDictLength()); - const auto leftIter = left.GetDictIterator(); - for (NUdf::TUnboxedValue key, leftPayload; leftIter.NextPair(key, leftPayload);) { - UNIT_ASSERT(right.Contains(key)); - NUdf::TUnboxedValue rightPayload = right.Lookup(key); - AssertUnboxedValuesAreEqual(leftPayload, rightPayload, payloadType); - } - break; - } - - case TType::EKind::Variant: { - auto variantType = static_cast(type); - UNIT_ASSERT_EQUAL(left.GetVariantIndex(), right.GetVariantIndex()); - ui32 variantIndex = left.GetVariantIndex(); - TType* innerType = variantType->GetUnderlyingType(); - if (innerType->IsStruct()) { - innerType = static_cast(innerType)->GetMemberType(variantIndex); - } else { - Y_VERIFY_S(innerType->IsTuple(), "Unexpected underlying variant type: " << innerType->GetKindAsStr()); - innerType = static_cast(innerType)->GetElementType(variantIndex); - } - NUdf::TUnboxedValue leftValue = left.GetVariantItem(); - NUdf::TUnboxedValue rightValue = right.GetVariantItem(); - AssertUnboxedValuesAreEqual(leftValue, rightValue, innerType); - break; - } - - default: - THROW yexception() << "Unsupported type: " << type->GetKindAsStr(); - } -} - -} // namespace - -namespace NKikimr::NKqp::NFormats { - -namespace { - -template -void TestDataTypeConversion(arrow::Type::type arrowTypeId) { - TTestContext context; - - auto 
type = TDataType::Create(NUdf::TDataType::Id, context.TypeEnv); - UNIT_ASSERT(IsArrowCompatible(type)); - - TUnboxedValueVector values; - values.reserve(TEST_ARRAY_SIZE); - - for (size_t i = 0; i < TEST_ARRAY_SIZE; ++i) { - values.emplace_back(GetValueOfBasicType(type, i)); - } - - auto array = NTestUtils::MakeArray(values, type); - UNIT_ASSERT_C(array->ValidateFull().ok(), array->ValidateFull().ToString()); - UNIT_ASSERT(array->length() == static_cast(values.size())); - - std::shared_ptr typedArray; - std::shared_ptr timezoneArray; - - if constexpr (IsTimezoneType) { - UNIT_ASSERT(array->type_id() == arrow::Type::STRUCT); - auto structArray = static_pointer_cast(array); - UNIT_ASSERT(structArray->num_fields() == 2); - UNIT_ASSERT(structArray->field(0)->type_id() == arrowTypeId); - UNIT_ASSERT(structArray->field(1)->type_id() == arrow::Type::STRING); - - typedArray = static_pointer_cast(structArray->field(0)); - timezoneArray = static_pointer_cast(structArray->field(1)); - } else { - UNIT_ASSERT(array->type_id() == arrowTypeId); - typedArray = static_pointer_cast(array); - } - - for (size_t i = 0; i < TEST_ARRAY_SIZE; ++i) { - if constexpr (IsStringType) { - if constexpr (std::is_same_v) { - auto val = NBinaryJson::SerializeToJson(values[i].AsStringRef()); - UNIT_ASSERT(static_cast(typedArray->Value(i)) == val); - } else { - auto value = NTestUtils::ExtractUnboxedValue(array, i, type, context.HolderFactory); - AssertUnboxedValuesAreEqual(value, values[i], type); - } - } else { - UNIT_ASSERT(static_cast(typedArray->Value(i)) == values[i].Get()); - } - - if constexpr (IsTimezoneType) { - auto view = timezoneArray->Value(i); - UNIT_ASSERT(values[i].GetTimezoneId() == GetTimezoneId(NUdf::TStringRef(view.data(), view.size()))); - } - } -} - -template -void TestFixedSizeBinaryDataTypeConversion() { - TTestContext context; - TType* type; - - if constexpr (IsDecimalType) { - type = TDataDecimalType::Create(35, 10, context.TypeEnv); - } else { - type = 
TDataType::Create(NUdf::TDataType::Id, context.TypeEnv); - } - - UNIT_ASSERT(IsArrowCompatible(type)); - - TUnboxedValueVector values; - values.reserve(TEST_ARRAY_SIZE); - - for (size_t i = 0; i < TEST_ARRAY_SIZE; ++i) { - values.emplace_back(GetValueOfBasicType(type, i)); - } - - auto array = NTestUtils::MakeArray(values, type); - UNIT_ASSERT_C(array->ValidateFull().ok(), array->ValidateFull().ToString()); - UNIT_ASSERT(array->length() == static_cast(values.size())); - - std::shared_ptr typedArray; - - UNIT_ASSERT(array->type_id() == arrow::Type::FIXED_SIZE_BINARY); - typedArray = static_pointer_cast(array); - UNIT_ASSERT(typedArray->byte_width() == NScheme::FSB_SIZE); - - for (size_t i = 0; i < TEST_ARRAY_SIZE; ++i) { - auto view = typedArray->GetView(i); - if constexpr (IsDecimalType) { - NYql::NDecimal::TInt128 actual; - std::memcpy(&actual, view.data(), view.size()); - - NYql::NDecimal::TInt128 expected = values[i].GetInt128(); - UNIT_ASSERT(actual == expected); - } else { - auto expected = values[i].AsStringRef(); - UNIT_ASSERT_STRINGS_EQUAL(std::string(view.data(), view.size()), std::string(expected.Data(), expected.Size())); - } - } -} - -template -void TestSingularTypeConversion() { - TTestContext context; - - TType* type = GetTypeOfSingular(context.TypeEnv); - UNIT_ASSERT(IsArrowCompatible(type)); - - TUnboxedValueVector values; - values.reserve(TEST_ARRAY_SIZE); - - for (size_t i = 0; i < TEST_ARRAY_SIZE; ++i) { - values.emplace_back(); - } - - auto array = NTestUtils::MakeArray(values, type); - UNIT_ASSERT_C(array->ValidateFull().ok(), array->ValidateFull().ToString()); - UNIT_ASSERT(array->length() == static_cast(TEST_ARRAY_SIZE)); - - if (SingularKind == TType::EKind::Null) { - UNIT_ASSERT(array->type_id() == arrow::Type::NA); - } else { - UNIT_ASSERT(array->type_id() == arrow::Type::STRUCT); - auto structArray = static_pointer_cast(array); - UNIT_ASSERT(structArray->num_fields() == 0); - } -} - -} // namespace - 
-Y_UNIT_TEST_SUITE(KqpFormats_Arrow_Conversion) { - - // Integral types - Y_UNIT_TEST(DataType_Bool) { - TestDataTypeConversion(arrow::Type::UINT8); - } - - Y_UNIT_TEST(DataType_Int8) { - TestDataTypeConversion(arrow::Type::INT8); - } - - Y_UNIT_TEST(DataType_UInt8) { - TestDataTypeConversion(arrow::Type::UINT8); - } - - Y_UNIT_TEST(DataType_Int16) { - TestDataTypeConversion(arrow::Type::INT16); - } - - Y_UNIT_TEST(DataType_UInt16) { - TestDataTypeConversion(arrow::Type::UINT16); - } - - Y_UNIT_TEST(DataType_Int32) { - TestDataTypeConversion(arrow::Type::INT32); - } - - Y_UNIT_TEST(DataType_UInt32) { - TestDataTypeConversion(arrow::Type::UINT32); - } - - Y_UNIT_TEST(DataType_Int64) { - TestDataTypeConversion(arrow::Type::INT64); - } - - Y_UNIT_TEST(DataType_UInt64) { - TestDataTypeConversion(arrow::Type::UINT64); - } - - // Binary number types - Y_UNIT_TEST(DataType_Decimal) { - TestFixedSizeBinaryDataTypeConversion(); - } - - Y_UNIT_TEST(DataType_DyNumber) { - TestDataTypeConversion(arrow::Type::STRING); - } - - // Floating point types - Y_UNIT_TEST(DataType_Float) { - TestDataTypeConversion(arrow::Type::FLOAT); - } - - Y_UNIT_TEST(DataType_Double) { - TestDataTypeConversion(arrow::Type::DOUBLE); - } - - // Datetime types - Y_UNIT_TEST(DataType_Date) { - TestDataTypeConversion(arrow::Type::UINT16); - } - - Y_UNIT_TEST(DataType_Datetime) { - TestDataTypeConversion(arrow::Type::UINT32); - } - - Y_UNIT_TEST(DataType_Timestamp) { - TestDataTypeConversion(arrow::Type::UINT64); - } - - Y_UNIT_TEST(DataType_Interval) { - TestDataTypeConversion(arrow::Type::INT64); - } - - Y_UNIT_TEST(DataType_TzDate) { - TestDataTypeConversion(arrow::Type::UINT16); - } - - Y_UNIT_TEST(DataType_TzDatetime) { - TestDataTypeConversion(arrow::Type::UINT32); - } - - Y_UNIT_TEST(DataType_TzTimestamp) { - TestDataTypeConversion(arrow::Type::UINT64); - } - - Y_UNIT_TEST(DataType_Date32) { - TestDataTypeConversion(arrow::Type::INT32); - } - - Y_UNIT_TEST(DataType_Datetime64) { - 
TestDataTypeConversion(arrow::Type::INT64); - } - - Y_UNIT_TEST(DataType_Timestamp64) { - TestDataTypeConversion(arrow::Type::INT64); - } - - Y_UNIT_TEST(DataType_Interval64) { - TestDataTypeConversion(arrow::Type::INT64); - } - - Y_UNIT_TEST(DataType_TzDate32) { - TestDataTypeConversion(arrow::Type::INT32); - } - - Y_UNIT_TEST(DataType_TzDatetime64) { - TestDataTypeConversion(arrow::Type::INT64); - } - - Y_UNIT_TEST(DataType_TzTimestamp64) { - TestDataTypeConversion(arrow::Type::INT64); - } - - // String types - Y_UNIT_TEST(DataType_String) { - TestDataTypeConversion(arrow::Type::BINARY); - } - - Y_UNIT_TEST(DataType_Utf8) { - TestDataTypeConversion(arrow::Type::STRING); - } - - Y_UNIT_TEST(DataType_Yson) { - TestDataTypeConversion(arrow::Type::BINARY); - } - - Y_UNIT_TEST(DataType_Json) { - TestDataTypeConversion(arrow::Type::STRING); - } - - Y_UNIT_TEST(DataType_JsonDocument) { - TestDataTypeConversion(arrow::Type::STRING); - } - - Y_UNIT_TEST(DataType_Uuid) { - TestFixedSizeBinaryDataTypeConversion(); - } - - // Singular types - Y_UNIT_TEST(DataType_Null) { - TestSingularTypeConversion(); - } - - Y_UNIT_TEST(DataType_Void) { - TestSingularTypeConversion(); - } - - Y_UNIT_TEST(DataType_EmptyList) { - TestSingularTypeConversion(); - } - - Y_UNIT_TEST(DataType_EmptyDict) { - TestSingularTypeConversion(); - } -} - -Y_UNIT_TEST_SUITE(DqUnboxedValueToNativeArrowConversion) { - Y_UNIT_TEST(Struct) { - TTestContext context; - - auto structType = context.GetStructType(); - UNIT_ASSERT(NFormats::IsArrowCompatible(structType)); - - auto values = context.CreateStructs(100); - auto array = NFormats::NTestUtils::MakeArray(values, structType); - - UNIT_ASSERT(array->ValidateFull().ok()); - UNIT_ASSERT(array->length() == static_cast(values.size())); - UNIT_ASSERT(array->type_id() == arrow::Type::STRUCT); - auto structArray = static_pointer_cast(array); - UNIT_ASSERT(structArray->num_fields() == 3); - UNIT_ASSERT(structArray->field(0)->type_id() == arrow::Type::BINARY); - 
UNIT_ASSERT(structArray->field(1)->type_id() == arrow::Type::INT32); - UNIT_ASSERT(structArray->field(2)->type_id() == arrow::Type::UINT64); - UNIT_ASSERT(static_cast(structArray->field(0)->length()) == values.size()); - UNIT_ASSERT(static_cast(structArray->field(1)->length()) == values.size()); - UNIT_ASSERT(static_cast(structArray->field(2)->length()) == values.size()); - auto binaryArray = static_pointer_cast(structArray->field(0)); - auto int32Array = static_pointer_cast(structArray->field(1)); - auto uint64Array = static_pointer_cast(structArray->field(2)); - auto index = 0; - for (const auto& value: values) { - auto stringValue = value.GetElement(0); - auto stringRef = stringValue.AsStringRef(); - auto stringView = binaryArray->GetView(index); - UNIT_ASSERT_EQUAL(std::string(stringRef.Data(), stringRef.Size()), std::string(stringView)); - - auto intValue = value.GetElement(1).Get(); - auto intArrow = int32Array->Value(index); - UNIT_ASSERT_EQUAL(intValue, intArrow); - - auto uIntValue = value.GetElement(2).Get(); - auto uIntArrow = uint64Array->Value(index); - UNIT_ASSERT_EQUAL(uIntValue, uIntArrow); - ++index; - } - } - - Y_UNIT_TEST(Tuple) { - TTestContext context; - - auto tupleType = context.GetTupleType(); - UNIT_ASSERT(NFormats::IsArrowCompatible(tupleType)); - - auto values = context.CreateTuples(100); - auto array = NFormats::NTestUtils::MakeArray(values, tupleType); - UNIT_ASSERT(array->ValidateFull().ok()); - - UNIT_ASSERT(array->length() == static_cast(values.size())); - UNIT_ASSERT(array->type_id() == arrow::Type::STRUCT); - auto structArray = static_pointer_cast(array); - UNIT_ASSERT(structArray->num_fields() == 3); - UNIT_ASSERT(structArray->field(0)->type_id() == arrow::Type::UINT8); - UNIT_ASSERT(structArray->field(1)->type_id() == arrow::Type::INT8); - UNIT_ASSERT(structArray->field(2)->type_id() == arrow::Type::UINT8); - UNIT_ASSERT(static_cast(structArray->field(0)->length()) == values.size()); - 
UNIT_ASSERT(static_cast(structArray->field(1)->length()) == values.size()); - UNIT_ASSERT(static_cast(structArray->field(2)->length()) == values.size()); - auto boolArray = static_pointer_cast(structArray->field(0)); - auto int8Array = static_pointer_cast(structArray->field(1)); - auto uint8Array = static_pointer_cast(structArray->field(2)); - auto index = 0; - for (const auto& value: values) { - auto boolValue = value.GetElement(0).Get(); - auto boolArrow = boolArray->Value(index); - UNIT_ASSERT(boolValue == boolArrow); - - auto intValue = value.GetElement(1).Get(); - auto intArrow = int8Array->Value(index); - UNIT_ASSERT(intValue == intArrow); - - auto uIntValue = value.GetElement(2).Get(); - auto uIntArrow = uint8Array->Value(index); - UNIT_ASSERT(uIntValue == uIntArrow); - ++index; - } - } - - Y_UNIT_TEST(ListOfJsons) { - TTestContext context; - - auto listType = context.GetListOfJsonsType(); - Y_ABORT_UNLESS(NFormats::IsArrowCompatible(listType)); - - auto values = context.CreateListOfJsons(100); - auto array = NFormats::NTestUtils::MakeArray(values, listType); - UNIT_ASSERT(array->ValidateFull().ok()); - UNIT_ASSERT(static_cast(array->length()) == values.size()); - UNIT_ASSERT(array->type_id() == arrow::Type::LIST); - auto listArray = static_pointer_cast(array); - - UNIT_ASSERT(listArray->num_fields() == 1); - UNIT_ASSERT(listArray->value_type()->id() == arrow::Type::STRING); - auto jsonArray = static_pointer_cast(listArray->values()); - auto index = 0; - auto innerIndex = 0; - for (const auto& value: values) { - UNIT_ASSERT(value.GetListLength() == static_cast(listArray->value_length(index))); - const auto iter = value.GetListIterator(); - for (NUdf::TUnboxedValue item; iter.Next(item);) { - auto view = jsonArray->GetView(innerIndex); - std::string itemArrow(view.data(), view.size()); - auto stringRef = item.AsStringRef(); - std::string itemList(stringRef.Data(), stringRef.Size()); - UNIT_ASSERT(itemList == itemArrow); - ++innerIndex; - } - ++index; - } - } 
- - Y_UNIT_TEST(OptionalListOfOptional) { - TTestContext context; - - auto listType = context.GetOptionalListOfOptional(); - Y_ABORT_UNLESS(NFormats::IsArrowCompatible(listType)); - - auto values = context.CreateOptionalListOfOptional(100); - auto array = NFormats::NTestUtils::MakeArray(values, listType); - UNIT_ASSERT(array->ValidateFull().ok()); - UNIT_ASSERT(static_cast(array->length()) == values.size()); - UNIT_ASSERT(array->type_id() == arrow::Type::LIST); - - auto listArray = static_pointer_cast(array); - UNIT_ASSERT(listArray->num_fields() == 1); - UNIT_ASSERT(listArray->value_type()->id() == arrow::Type::INT32); - - auto i32Array = static_pointer_cast(listArray->values()); - auto index = 0; - auto innerIndex = 0; - for (const auto& value: values) { - if (!value.HasValue()) { - UNIT_ASSERT(listArray->IsNull(index)); - ++index; - continue; - } - - auto listValue = value.GetOptionalValue(); - - UNIT_ASSERT_VALUES_EQUAL(listValue.GetListLength(), static_cast(listArray->value_length(index))); - const auto iter = listValue.GetListIterator(); - for (NUdf::TUnboxedValue item; iter.Next(item);) { - if (!item.HasValue()) { - UNIT_ASSERT(i32Array->IsNull(innerIndex)); - } else { - UNIT_ASSERT(i32Array->Value(innerIndex) == item.GetOptionalValue().Get()); - } - ++innerIndex; - } - ++index; - } - } - - // Y_UNIT_TEST(VariantOverStruct) { - // TTestContext context; - - // auto variantType = context.GetVariantOverStructType(); - // UNIT_ASSERT(NFormats::IsArrowCompatible(variantType)); - - // auto values = context.CreateVariantOverStruct(100); - // auto array = NFormats::NTestUtils::MakeArray(values, variantType); - // UNIT_ASSERT(array->ValidateFull().ok()); - // UNIT_ASSERT(static_cast(array->length()) == values.size()); - // UNIT_ASSERT(array->type_id() == arrow::Type::DENSE_UNION); - // auto unionArray = static_pointer_cast(array); - - // UNIT_ASSERT(unionArray->num_fields() == 4); - // UNIT_ASSERT(unionArray->field(0)->type_id() == arrow::Type::BINARY); - // 
UNIT_ASSERT(unionArray->field(1)->type_id() == arrow::Type::STRING); - // UNIT_ASSERT(unionArray->field(2)->type_id() == arrow::Type::FIXED_SIZE_BINARY); - // UNIT_ASSERT(unionArray->field(3)->type_id() == arrow::Type::FLOAT); - - // auto ysonArray = static_pointer_cast(unionArray->field(0)); - // auto jsonDocArray = static_pointer_cast(unionArray->field(1)); - // auto uuidArray = static_pointer_cast(unionArray->field(2)); - // auto floatArray = static_pointer_cast(unionArray->field(3)); - - // for (ui64 index = 0; index < values.size(); ++index) { - // auto value = values[index]; - // UNIT_ASSERT(value.GetVariantIndex() == static_cast(unionArray->child_id(index))); - // auto fieldIndex = unionArray->value_offset(index); - // if (value.GetVariantIndex() == 3) { - // auto valueArrow = floatArray->Value(fieldIndex); - // auto valueInner = value.GetVariantItem().Get(); - // UNIT_ASSERT(valueArrow == valueInner); - // } else { - // arrow::util::string_view viewArrow; - // if (value.GetVariantIndex() == 0) { - // viewArrow = ysonArray->GetView(fieldIndex); - // } else if (value.GetVariantIndex() == 1) { - // viewArrow = jsonDocArray->GetView(fieldIndex); - // } else if (value.GetVariantIndex() == 2) { - // viewArrow = uuidArray->GetView(fieldIndex); - // } - // std::string valueArrow(viewArrow.data(), viewArrow.size()); - // auto innerItem = value.GetVariantItem(); - // auto refInner = innerItem.AsStringRef(); - // std::string valueInner(refInner.Data(), refInner.Size()); - // UNIT_ASSERT(valueArrow == valueInner); - // } - // } - // } - - // Y_UNIT_TEST(OptionalVariantOverStruct) { - // TTestContext context; - - // auto variantType = context.GetOptionalVariantOverStructType(); - // UNIT_ASSERT(NFormats::IsArrowCompatible(variantType)); - - // auto values = context.CreateOptionalVariantOverStruct(100); - // auto array = NFormats::NTestUtils::MakeArray(values, variantType); - // UNIT_ASSERT(array->ValidateFull().ok()); - // UNIT_ASSERT(static_cast(array->length()) == 
values.size()); - // UNIT_ASSERT(array->type_id() == arrow::Type::STRUCT); - - // auto structArray = static_pointer_cast(array); - // UNIT_ASSERT(structArray->num_fields() == 1); - // UNIT_ASSERT(structArray->field(0)->type_id() == arrow::Type::DENSE_UNION); - - // auto unionArray = static_pointer_cast(structArray->field(0)); - - // UNIT_ASSERT(unionArray->num_fields() == 4); - // UNIT_ASSERT(unionArray->field(0)->type_id() == arrow::Type::BINARY); - // UNIT_ASSERT(unionArray->field(1)->type_id() == arrow::Type::STRING); - // UNIT_ASSERT(unionArray->field(2)->type_id() == arrow::Type::FIXED_SIZE_BINARY); - // UNIT_ASSERT(unionArray->field(3)->type_id() == arrow::Type::FLOAT); - - // auto ysonArray = static_pointer_cast(unionArray->field(0)); - // auto jsonDocArray = static_pointer_cast(unionArray->field(1)); - // auto uuidArray = static_pointer_cast(unionArray->field(2)); - // auto floatArray = static_pointer_cast(unionArray->field(3)); - - // for (ui64 index = 0; index < values.size(); ++index) { - // auto value = values[index]; - // if (!value.HasValue()) { - // // NULL - // UNIT_ASSERT(structArray->IsNull(index)); - // continue; - // } - - // UNIT_ASSERT(!structArray->IsNull(index)); - - // UNIT_ASSERT(value.GetVariantIndex() == static_cast(unionArray->child_id(index))); - // auto fieldIndex = unionArray->value_offset(index); - // if (value.GetVariantIndex() == 3) { - // auto valueArrow = floatArray->Value(fieldIndex); - // auto valueInner = value.GetVariantItem().Get(); - // UNIT_ASSERT(valueArrow == valueInner); - // } else { - // arrow::util::string_view viewArrow; - // if (value.GetVariantIndex() == 0) { - // viewArrow = ysonArray->GetView(fieldIndex); - // } else if (value.GetVariantIndex() == 1) { - // viewArrow = jsonDocArray->GetView(fieldIndex); - // } else if (value.GetVariantIndex() == 2) { - // viewArrow = uuidArray->GetView(fieldIndex); - // } - // std::string valueArrow(viewArrow.data(), viewArrow.size()); - // auto innerItem = 
value.GetVariantItem(); - // auto refInner = innerItem.AsStringRef(); - // std::string valueInner(refInner.Data(), refInner.Size()); - // UNIT_ASSERT(valueArrow == valueInner); - // } - // } - // } - - // Y_UNIT_TEST(DoubleOptionalVariantOverStruct) { - // TTestContext context; - - // auto variantType = context.GetDoubleOptionalVariantOverStructType(); - // UNIT_ASSERT(NFormats::IsArrowCompatible(variantType)); - - // auto values = context.CreateDoubleOptionalVariantOverStruct(100); - // auto array = NFormats::NTestUtils::MakeArray(values, variantType); - // UNIT_ASSERT(array->ValidateFull().ok()); - // UNIT_ASSERT(static_cast(array->length()) == values.size()); - // UNIT_ASSERT(array->type_id() == arrow::Type::STRUCT); - - // auto firstStructArray = static_pointer_cast(array); - // UNIT_ASSERT(firstStructArray->num_fields() == 1); - // UNIT_ASSERT(firstStructArray->field(0)->type_id() == arrow::Type::STRUCT); - - // auto secondStructArray = static_pointer_cast(firstStructArray->field(0)); - // UNIT_ASSERT(secondStructArray->num_fields() == 1); - // UNIT_ASSERT(secondStructArray->field(0)->type_id() == arrow::Type::DENSE_UNION); - - // auto unionArray = static_pointer_cast(secondStructArray->field(0)); - - // UNIT_ASSERT(unionArray->num_fields() == 4); - // UNIT_ASSERT(unionArray->field(0)->type_id() == arrow::Type::BINARY); - // UNIT_ASSERT(unionArray->field(1)->type_id() == arrow::Type::STRING); - // UNIT_ASSERT(unionArray->field(2)->type_id() == arrow::Type::FIXED_SIZE_BINARY); - // UNIT_ASSERT(unionArray->field(3)->type_id() == arrow::Type::FLOAT); - - // auto ysonArray = static_pointer_cast(unionArray->field(0)); - // auto jsonDocArray = static_pointer_cast(unionArray->field(1)); - // auto uuidArray = static_pointer_cast(unionArray->field(2)); - // auto floatArray = static_pointer_cast(unionArray->field(3)); - - // for (ui64 index = 0; index < values.size(); ++index) { - // auto value = values[index]; - // if (!value.HasValue()) { - // if (value) { - // // 
Optional(NULL) - // UNIT_ASSERT(secondStructArray->IsNull(index)); - // } else { - // // NULL - // UNIT_ASSERT(firstStructArray->IsNull(index)); - // } - // continue; - // } - - // UNIT_ASSERT(!firstStructArray->IsNull(index) && !secondStructArray->IsNull(index)); - - // UNIT_ASSERT(value.GetVariantIndex() == static_cast(unionArray->child_id(index))); - // auto fieldIndex = unionArray->value_offset(index); - // if (value.GetVariantIndex() == 3) { - // auto valueArrow = floatArray->Value(fieldIndex); - // auto valueInner = value.GetVariantItem().Get(); - // UNIT_ASSERT_VALUES_EQUAL(valueArrow, valueInner); - // } else { - // arrow::util::string_view viewArrow; - // if (value.GetVariantIndex() == 0) { - // viewArrow = ysonArray->GetView(fieldIndex); - // } else if (value.GetVariantIndex() == 1) { - // viewArrow = jsonDocArray->GetView(fieldIndex); - // } else if (value.GetVariantIndex() == 2) { - // viewArrow = uuidArray->GetView(fieldIndex); - // } - // std::string valueArrow(viewArrow.data(), viewArrow.size()); - // auto innerItem = value.GetVariantItem(); - // auto refInner = innerItem.AsStringRef(); - // std::string valueInner(refInner.Data(), refInner.Size()); - // UNIT_ASSERT_VALUES_EQUAL(valueArrow, valueInner); - // } - // } - // } - - Y_UNIT_TEST(VariantOverTupleWithOptionals) { - TTestContext context; - - auto variantType = context.GetVariantOverTupleWithOptionalsType(); - UNIT_ASSERT(NFormats::IsArrowCompatible(variantType)); - - auto values = context.CreateVariantOverTupleWithOptionals(100); - auto array = NFormats::NTestUtils::MakeArray(values, variantType); - UNIT_ASSERT(array->ValidateFull().ok()); - UNIT_ASSERT(static_cast(array->length()) == values.size()); - UNIT_ASSERT(array->type_id() == arrow::Type::DENSE_UNION); - auto unionArray = static_pointer_cast(array); - - UNIT_ASSERT(unionArray->num_fields() == 5); - UNIT_ASSERT(unionArray->field(0)->type_id() == arrow::Type::UINT8); - UNIT_ASSERT(unionArray->field(1)->type_id() == arrow::Type::INT16); - 
UNIT_ASSERT(unionArray->field(2)->type_id() == arrow::Type::UINT16); - UNIT_ASSERT(unionArray->field(3)->type_id() == arrow::Type::INT32); - UNIT_ASSERT(unionArray->field(4)->type_id() == arrow::Type::UINT32); - auto boolArray = static_pointer_cast(unionArray->field(0)); - auto i16Array = static_pointer_cast(unionArray->field(1)); - auto ui16Array = static_pointer_cast(unionArray->field(2)); - auto i32Array = static_pointer_cast(unionArray->field(3)); - auto ui32Array = static_pointer_cast(unionArray->field(4)); - for (ui64 index = 0; index < values.size(); ++index) { - auto value = values[index]; - UNIT_ASSERT(value.GetVariantIndex() == static_cast(unionArray->child_id(index))); - auto fieldIndex = unionArray->value_offset(index); - if (value.GetVariantIndex() == 0) { - bool valueArrow = boolArray->Value(fieldIndex); - auto valueInner = value.GetVariantItem().Get(); - UNIT_ASSERT_VALUES_EQUAL(valueArrow, valueInner); - } else if (value.GetVariantIndex() == 1) { - auto valueArrow = i16Array->Value(fieldIndex); - auto valueInner = value.GetVariantItem().Get(); - UNIT_ASSERT_VALUES_EQUAL(valueArrow, valueInner); - } else if (value.GetVariantIndex() == 2) { - auto valueArrow = ui16Array->Value(fieldIndex); - auto valueInner = value.GetVariantItem().Get(); - UNIT_ASSERT_VALUES_EQUAL(valueArrow, valueInner); - } else if (value.GetVariantIndex() == 3) { - auto valueArrow = i32Array->Value(fieldIndex); - auto valueInner = value.GetVariantItem().Get(); - UNIT_ASSERT_VALUES_EQUAL(valueArrow, valueInner); - } else if (value.GetVariantIndex() == 4) { - if (!value.GetVariantItem().HasValue()) { - UNIT_ASSERT(ui32Array->IsNull(fieldIndex)); - } else { - auto valueArrow = ui32Array->Value(fieldIndex); - auto valueInner = value.GetVariantItem().Get(); - UNIT_ASSERT_VALUES_EQUAL(valueArrow, valueInner); - } - } - } - } - - Y_UNIT_TEST(OptionalVariantOverTupleWithOptionals) { - // DenseUnionArray does not support NULL values, so we wrap it in a StructArray - - TTestContext context; 
- - auto variantType = context.GetOptionalVariantOverTupleWithOptionalsType(); - UNIT_ASSERT(NFormats::IsArrowCompatible(variantType)); - - auto values = context.CreateOptionalVariantOverTupleWithOptionals(100); - auto array = NFormats::NTestUtils::MakeArray(values, variantType); - UNIT_ASSERT(array->ValidateFull().ok()); - UNIT_ASSERT(static_cast(array->length()) == values.size()); - UNIT_ASSERT(array->type_id() == arrow::Type::STRUCT); - - auto structArray = static_pointer_cast(array); - UNIT_ASSERT(structArray->num_fields() == 1); - UNIT_ASSERT(structArray->field(0)->type_id() == arrow::Type::DENSE_UNION); - - auto unionArray = static_pointer_cast(structArray->field(0)); - UNIT_ASSERT(unionArray->num_fields() == 5); - UNIT_ASSERT(unionArray->field(0)->type_id() == arrow::Type::UINT8); - UNIT_ASSERT(unionArray->field(1)->type_id() == arrow::Type::INT16); - UNIT_ASSERT(unionArray->field(2)->type_id() == arrow::Type::UINT16); - UNIT_ASSERT(unionArray->field(3)->type_id() == arrow::Type::INT32); - UNIT_ASSERT(unionArray->field(4)->type_id() == arrow::Type::UINT32); - auto boolArray = static_pointer_cast(unionArray->field(0)); - auto i16Array = static_pointer_cast(unionArray->field(1)); - auto ui16Array = static_pointer_cast(unionArray->field(2)); - auto i32Array = static_pointer_cast(unionArray->field(3)); - auto ui32Array = static_pointer_cast(unionArray->field(4)); - for (ui64 index = 0; index < values.size(); ++index) { - auto value = values[index]; - if (!value) { - // NULL - UNIT_ASSERT(structArray->IsNull(index)); - continue; - } - - UNIT_ASSERT(!structArray->IsNull(index)); - - UNIT_ASSERT(value.GetVariantIndex() == static_cast(unionArray->child_id(index))); - auto fieldIndex = unionArray->value_offset(index); - if (value.GetVariantIndex() == 0) { - bool valueArrow = boolArray->Value(fieldIndex); - auto valueInner = value.GetVariantItem().Get(); - UNIT_ASSERT_VALUES_EQUAL(valueArrow, valueInner); - } else if (value.GetVariantIndex() == 1) { - auto valueArrow 
= i16Array->Value(fieldIndex); - auto valueInner = value.GetVariantItem().Get(); - UNIT_ASSERT_VALUES_EQUAL(valueArrow, valueInner); - } else if (value.GetVariantIndex() == 2) { - auto valueArrow = ui16Array->Value(fieldIndex); - auto valueInner = value.GetVariantItem().Get(); - UNIT_ASSERT_VALUES_EQUAL(valueArrow, valueInner); - } else if (value.GetVariantIndex() == 3) { - auto valueArrow = i32Array->Value(fieldIndex); - auto valueInner = value.GetVariantItem().Get(); - UNIT_ASSERT_VALUES_EQUAL(valueArrow, valueInner); - } else if (value.GetVariantIndex() == 4) { - if (!value.GetVariantItem().HasValue()) { - UNIT_ASSERT(ui32Array->IsNull(fieldIndex)); - } else { - auto valueArrow = ui32Array->Value(fieldIndex); - auto valueInner = value.GetVariantItem().Get(); - UNIT_ASSERT_VALUES_EQUAL(valueArrow, valueInner); - } - } - } - } - - Y_UNIT_TEST(DoubleOptionalVariantOverTupleWithOptionals) { - // DenseUnionArray does not support NULL values, so we wrap it in a StructArray - - TTestContext context; - - auto variantType = context.GetDoubleOptionalVariantOverTupleWithOptionalsType(); - UNIT_ASSERT(NFormats::IsArrowCompatible(variantType)); - - auto values = context.CreateDoubleOptionalVariantOverTupleWithOptionals(100); - auto array = NFormats::NTestUtils::MakeArray(values, variantType); - UNIT_ASSERT(array->ValidateFull().ok()); - UNIT_ASSERT(static_cast(array->length()) == values.size()); - UNIT_ASSERT(array->type_id() == arrow::Type::STRUCT); - - auto firstStructArray = static_pointer_cast(array); - UNIT_ASSERT(firstStructArray->num_fields() == 1); - UNIT_ASSERT(firstStructArray->field(0)->type_id() == arrow::Type::STRUCT); - - auto secondStructArray = static_pointer_cast(firstStructArray->field(0)); - UNIT_ASSERT(secondStructArray->num_fields() == 1); - UNIT_ASSERT(secondStructArray->field(0)->type_id() == arrow::Type::DENSE_UNION); - - auto unionArray = static_pointer_cast(secondStructArray->field(0)); - UNIT_ASSERT(unionArray->num_fields() == 5); - 
UNIT_ASSERT(unionArray->field(0)->type_id() == arrow::Type::UINT8); - UNIT_ASSERT(unionArray->field(1)->type_id() == arrow::Type::INT16); - UNIT_ASSERT(unionArray->field(2)->type_id() == arrow::Type::UINT16); - UNIT_ASSERT(unionArray->field(3)->type_id() == arrow::Type::INT32); - UNIT_ASSERT(unionArray->field(4)->type_id() == arrow::Type::UINT32); - auto boolArray = static_pointer_cast(unionArray->field(0)); - auto i16Array = static_pointer_cast(unionArray->field(1)); - auto ui16Array = static_pointer_cast(unionArray->field(2)); - auto i32Array = static_pointer_cast(unionArray->field(3)); - auto ui32Array = static_pointer_cast(unionArray->field(4)); - for (ui64 index = 0; index < values.size(); ++index) { - auto value = values[index]; - if (!value.HasValue()) { - if (value && !value.GetOptionalValue()) { - // Optional(NULL) - UNIT_ASSERT(secondStructArray->IsNull(index)); - } else if (!value) { - // NULL - UNIT_ASSERT(firstStructArray->IsNull(index)); - } - continue; - } - - UNIT_ASSERT(!firstStructArray->IsNull(index) && !secondStructArray->IsNull(index)); - - UNIT_ASSERT(value.GetVariantIndex() == static_cast(unionArray->child_id(index))); - auto fieldIndex = unionArray->value_offset(index); - if (value.GetVariantIndex() == 0) { - bool valueArrow = boolArray->Value(fieldIndex); - auto valueInner = value.GetVariantItem().Get(); - UNIT_ASSERT_VALUES_EQUAL(valueArrow, valueInner); - } else if (value.GetVariantIndex() == 1) { - auto valueArrow = i16Array->Value(fieldIndex); - auto valueInner = value.GetVariantItem().Get(); - UNIT_ASSERT_VALUES_EQUAL(valueArrow, valueInner); - } else if (value.GetVariantIndex() == 2) { - auto valueArrow = ui16Array->Value(fieldIndex); - auto valueInner = value.GetVariantItem().Get(); - UNIT_ASSERT_VALUES_EQUAL(valueArrow, valueInner); - } else if (value.GetVariantIndex() == 3) { - auto valueArrow = i32Array->Value(fieldIndex); - auto valueInner = value.GetVariantItem().Get(); - UNIT_ASSERT_VALUES_EQUAL(valueArrow, valueInner); - } 
else if (value.GetVariantIndex() == 4) { - if (!value.GetVariantItem().HasValue()) { - UNIT_ASSERT(ui32Array->IsNull(fieldIndex)); - } else { - auto valueArrow = ui32Array->Value(fieldIndex); - auto valueInner = value.GetVariantItem().Get(); - UNIT_ASSERT_VALUES_EQUAL(valueArrow, valueInner); - } - } - } - } -} - -Y_UNIT_TEST_SUITE(DqUnboxedValueDoNotFitToArrow) { - Y_UNIT_TEST(DictUtf8ToInterval) { - TTestContext context; - - auto dictType = context.GetDictUtf8ToIntervalType(); - UNIT_ASSERT(NFormats::IsArrowCompatible(dictType)); - - auto values = context.CreateDictUtf8ToInterval(100); - auto array = NFormats::NTestUtils::MakeArray(values, dictType); - UNIT_ASSERT(array->ValidateFull().ok()); - - UNIT_ASSERT(array->type_id() == arrow::Type::STRUCT); - auto wrapArray = static_pointer_cast(array); - UNIT_ASSERT_VALUES_EQUAL(wrapArray->num_fields(), 2); - UNIT_ASSERT_VALUES_EQUAL(static_cast(wrapArray->length()), values.size()); - - UNIT_ASSERT(wrapArray->field(0)->type_id() == arrow::Type::MAP); - auto mapArray = static_pointer_cast(wrapArray->field(0)); - UNIT_ASSERT_VALUES_EQUAL(static_cast(mapArray->length()), values.size()); - - UNIT_ASSERT(wrapArray->field(1)->type_id() == arrow::Type::UINT64); - auto customArray = static_pointer_cast(wrapArray->field(1)); - UNIT_ASSERT_VALUES_EQUAL(static_cast(customArray->length()), values.size()); - - UNIT_ASSERT_VALUES_EQUAL(mapArray->num_fields(), 1); - - UNIT_ASSERT(mapArray->keys()->type_id() == arrow::Type::STRING); - auto utf8Array = static_pointer_cast(mapArray->keys()); - - UNIT_ASSERT(mapArray->items()->type_id() == arrow::Type::INT64); - auto intervalArray = static_pointer_cast(mapArray->items()); - - ui64 index = 0; - for (const auto& value: values) { - UNIT_ASSERT_VALUES_EQUAL(value.GetDictLength(), static_cast(mapArray->value_length(index))); - for (auto subindex = mapArray->value_offset(index); subindex < mapArray->value_offset(index + 1); ++subindex) { - auto keyArrow = utf8Array->GetView(subindex); - 
NUdf::TUnboxedValue key = MakeString(NUdf::TStringRef(keyArrow.data(), keyArrow.size())); - UNIT_ASSERT(value.Contains(key)); - NUdf::TUnboxedValue payloadValue = value.Lookup(key); - UNIT_ASSERT_VALUES_EQUAL(intervalArray->Value(subindex), payloadValue.Get()); - } - ++index; - } - } - - Y_UNIT_TEST(DictOptionalToTuple) { - TTestContext context; - - auto dictType = context.GetDictOptionalToTupleType(); - UNIT_ASSERT(NFormats::IsArrowCompatible(dictType)); - - auto values = context.CreateDictOptionalToTuple(100); - auto array = NFormats::NTestUtils::MakeArray(values, dictType); - UNIT_ASSERT(array->ValidateFull().ok()); - UNIT_ASSERT_EQUAL(static_cast(array->length()), values.size()); - UNIT_ASSERT_EQUAL(array->type_id(), arrow::Type::STRUCT); - - auto wrapArray = static_pointer_cast(array); - UNIT_ASSERT_EQUAL(wrapArray->num_fields(), 2); - UNIT_ASSERT_EQUAL(wrapArray->field(0)->type_id(), arrow::Type::LIST); - - UNIT_ASSERT_EQUAL(wrapArray->field(1)->type_id(), arrow::Type::UINT64); - auto listArray = static_pointer_cast(wrapArray->field(0)); - UNIT_ASSERT_EQUAL(static_cast(listArray->length()), values.size()); - - UNIT_ASSERT_EQUAL(wrapArray->field(1)->type_id(), arrow::Type::UINT64); - auto customArray = static_pointer_cast(wrapArray->field(1)); - UNIT_ASSERT_EQUAL(static_cast(customArray->length()), values.size()); - - UNIT_ASSERT_EQUAL(listArray->value_type()->id(), arrow::Type::STRUCT); - auto structArray = static_pointer_cast(listArray->values()); - - UNIT_ASSERT_EQUAL(listArray->num_fields(), 1); - UNIT_ASSERT_EQUAL(structArray->num_fields(), 2); - UNIT_ASSERT_EQUAL(structArray->field(0)->type_id(), arrow::Type::DOUBLE); - UNIT_ASSERT_EQUAL(structArray->field(1)->type_id(), arrow::Type::STRUCT); - auto keysArray = static_pointer_cast(structArray->field(0)); - auto itemsArray = static_pointer_cast(structArray->field(1)); - UNIT_ASSERT_EQUAL(itemsArray->num_fields(), 2); - UNIT_ASSERT_EQUAL(itemsArray->field(0)->type_id(), arrow::Type::INT32); - 
UNIT_ASSERT_EQUAL(itemsArray->field(1)->type_id(), arrow::Type::UINT32); - auto i32Array = static_pointer_cast(itemsArray->field(0)); - auto ui32Array = static_pointer_cast(itemsArray->field(1)); - - ui64 index = 0; - for (const auto& value: values) { - UNIT_ASSERT(value.GetDictLength() == static_cast(listArray->value_length(index))); - for (auto subindex = listArray->value_offset(index); subindex < listArray->value_offset(index + 1); ++subindex) { - NUdf::TUnboxedValue key = keysArray->IsNull(subindex) - ? NUdf::TUnboxedValuePod() - : NUdf::TUnboxedValuePod(keysArray->Value(subindex)); - UNIT_ASSERT(value.Contains(key)); - NUdf::TUnboxedValue payloadValue = value.Lookup(key); - UNIT_ASSERT_EQUAL(payloadValue.GetElement(0).Get(), i32Array->Value(subindex)); - UNIT_ASSERT_EQUAL(payloadValue.GetElement(1).Get(), ui32Array->Value(subindex)); - } - ++index; - } - } - - Y_UNIT_TEST(OptionalOfOptional) { - TTestContext context; - - auto doubleOptionalType = context.GetOptionalOfOptionalType(); - UNIT_ASSERT(NFormats::IsArrowCompatible(doubleOptionalType)); - - auto values = context.CreateOptionalOfOptional(100); - auto array = NFormats::NTestUtils::MakeArray(values, doubleOptionalType); - UNIT_ASSERT(array->ValidateFull().ok()); - UNIT_ASSERT_EQUAL(static_cast(array->length()), values.size()); - - auto index = 0; - for (auto value: values) { - std::shared_ptr currentArray = array; - int depth = 0; - - while (currentArray->type()->id() == arrow::Type::STRUCT) { - auto structArray = static_pointer_cast(currentArray); - UNIT_ASSERT_EQUAL(structArray->num_fields(), 1); - - if (structArray->IsNull(index)) { - break; - } - - ++depth; - - auto childArray = structArray->field(0); - if (childArray->type()->id() == arrow::Type::DENSE_UNION) { - break; - } - - currentArray = childArray; - } - - while (depth--) { - UNIT_ASSERT(value); - value = value.GetOptionalValue(); - } - - if (value.HasValue()) { - if (currentArray->type()->id() == arrow::Type::INT32) { - 
UNIT_ASSERT_EQUAL(value.Get(), static_pointer_cast(currentArray)->Value(index)); - } else { - UNIT_ASSERT(!currentArray->IsNull(index)); - } - } else { - UNIT_ASSERT(currentArray->IsNull(index)); - } - - ++index; - } - } - - Y_UNIT_TEST(LargeVariant) { - TTestContext context; - - ui32 numberOfTypes = 500; - auto variantType = context.GetLargeVariantType(numberOfTypes); - UNIT_ASSERT(NFormats::IsArrowCompatible(variantType)); - - auto values = context.CreateLargeVariant(1000); - auto array = NFormats::NTestUtils::MakeArray(values, variantType); - UNIT_ASSERT(array->ValidateFull().ok()); - UNIT_ASSERT_EQUAL(static_cast(array->length()), values.size()); - UNIT_ASSERT_EQUAL(array->type_id(), arrow::Type::DENSE_UNION); - auto unionArray = static_pointer_cast(array); - ui32 numberOfGroups = (numberOfTypes - 1) / arrow::UnionType::kMaxTypeCode + 1; - UNIT_ASSERT_EQUAL(numberOfGroups, static_cast(unionArray->num_fields())); - ui32 typesInArrow = 0; - for (auto i = 0 ; i < unionArray->num_fields(); ++i) { - UNIT_ASSERT_EQUAL(unionArray->field(i)->type_id(), arrow::Type::DENSE_UNION); - typesInArrow += unionArray->field(i)->num_fields(); - } - UNIT_ASSERT_EQUAL(numberOfTypes, typesInArrow); - } -} - -Y_UNIT_TEST_SUITE(ConvertUnboxedValueToArrowAndBack){ - Y_UNIT_TEST(Struct) { - TTestContext context; - - auto structType = context.GetStructType(); - auto values = context.CreateStructs(100); - auto array = NFormats::NTestUtils::MakeArray(values, structType); - auto restoredValues = NFormats::NTestUtils::ExtractUnboxedValues(array, structType, context.HolderFactory); - UNIT_ASSERT_EQUAL(values.size(), restoredValues.size()); - for (ui64 index = 0; index < values.size(); ++index) { - AssertUnboxedValuesAreEqual(values[index], restoredValues[index], structType); - } - } - - Y_UNIT_TEST(Tuple) { - TTestContext context; - - auto tupleType = context.GetTupleType(); - UNIT_ASSERT(NFormats::IsArrowCompatible(tupleType)); - - auto values = context.CreateTuples(100); - auto array = 
NFormats::NTestUtils::MakeArray(values, tupleType); - auto restoredValues = NFormats::NTestUtils::ExtractUnboxedValues(array, tupleType, context.HolderFactory); - UNIT_ASSERT_EQUAL(values.size(), restoredValues.size()); - for (ui64 index = 0; index < values.size(); ++index) { - AssertUnboxedValuesAreEqual(values[index], restoredValues[index], tupleType); - } - } - - Y_UNIT_TEST(DictUtf8ToInterval) { - TTestContext context; - - auto dictType = context.GetDictUtf8ToIntervalType(); - UNIT_ASSERT(NFormats::IsArrowCompatible(dictType)); - - auto values = context.CreateDictUtf8ToInterval(100); - auto array = NFormats::NTestUtils::MakeArray(values, dictType); - auto restoredValues = NFormats::NTestUtils::ExtractUnboxedValues(array, dictType, context.HolderFactory); - UNIT_ASSERT_EQUAL(values.size(), restoredValues.size()); - for (ui64 index = 0; index < values.size(); ++index) { - AssertUnboxedValuesAreEqual(values[index], restoredValues[index], dictType); - } - } - - Y_UNIT_TEST(ListOfJsons) { - TTestContext context; - - auto listType = context.GetListOfJsonsType(); - Y_ABORT_UNLESS(NFormats::IsArrowCompatible(listType)); - - auto values = context.CreateListOfJsons(100); - auto array = NFormats::NTestUtils::MakeArray(values, listType); - auto restoredValues = NFormats::NTestUtils::ExtractUnboxedValues(array, listType, context.HolderFactory); - UNIT_ASSERT_EQUAL(values.size(), restoredValues.size()); - for (ui64 index = 0; index < values.size(); ++index) { - AssertUnboxedValuesAreEqual(values[index], restoredValues[index], listType); - } - } - - Y_UNIT_TEST(OptionalListOfOptional) { - TTestContext context; - - auto listType = context.GetOptionalListOfOptional(); - Y_ABORT_UNLESS(NFormats::IsArrowCompatible(listType)); - - auto values = context.CreateOptionalListOfOptional(100); - auto array = NFormats::NTestUtils::MakeArray(values, listType); - auto restoredValues = NFormats::NTestUtils::ExtractUnboxedValues(array, listType, context.HolderFactory); - 
UNIT_ASSERT_EQUAL(values.size(), restoredValues.size()); - for (ui64 index = 0; index < values.size(); ++index) { - AssertUnboxedValuesAreEqual(values[index], restoredValues[index], listType); - } - } - - // Y_UNIT_TEST(VariantOverStruct) { - // TTestContext context; - - // auto variantType = context.GetVariantOverStructType(); - // UNIT_ASSERT(NFormats::IsArrowCompatible(variantType)); - - // auto values = context.CreateVariantOverStruct(100); - // auto array = NFormats::NTestUtils::MakeArray(values, variantType); - // auto restoredValues = NFormats::NTestUtils::ExtractUnboxedValues(array, variantType, context.HolderFactory); - // UNIT_ASSERT_EQUAL(values.size(), restoredValues.size()); - // for (ui64 index = 0; index < values.size(); ++index) { - // AssertUnboxedValuesAreEqual(values[index], restoredValues[index], variantType); - // } - // } - - // Y_UNIT_TEST(OptionalVariantOverStruct) { - // TTestContext context; - - // auto optionalVariantType = context.GetOptionalVariantOverStructType(); - // UNIT_ASSERT(NFormats::IsArrowCompatible(optionalVariantType)); - - // auto values = context.CreateOptionalVariantOverStruct(100); - // auto array = NFormats::NTestUtils::MakeArray(values, optionalVariantType); - // auto restoredValues = NFormats::NTestUtils::ExtractUnboxedValues(array, optionalVariantType, context.HolderFactory); - // UNIT_ASSERT_EQUAL(values.size(), restoredValues.size()); - // for (ui64 index = 0; index < values.size(); ++index) { - // AssertUnboxedValuesAreEqual(values[index], restoredValues[index], optionalVariantType); - // } - // } - - // Y_UNIT_TEST(DoubleOptionalVariantOverStruct) { - // TTestContext context; - - // auto doubleOptionalVariantType = context.GetDoubleOptionalVariantOverStructType(); - // UNIT_ASSERT(NFormats::IsArrowCompatible(doubleOptionalVariantType)); - - // auto values = context.CreateDoubleOptionalVariantOverStruct(100); - // auto array = NFormats::NTestUtils::MakeArray(values, doubleOptionalVariantType); - // auto 
restoredValues = NFormats::NTestUtils::ExtractUnboxedValues(array, doubleOptionalVariantType, context.HolderFactory); - // UNIT_ASSERT_EQUAL(values.size(), restoredValues.size()); - // for (ui64 index = 0; index < values.size(); ++index) { - // AssertUnboxedValuesAreEqual(values[index], restoredValues[index], doubleOptionalVariantType); - // } - // } - - Y_UNIT_TEST(VariantOverTupleWithOptionals) { - TTestContext context; - - auto variantType = context.GetVariantOverTupleWithOptionalsType(); - UNIT_ASSERT(NFormats::IsArrowCompatible(variantType)); - - auto values = context.CreateVariantOverTupleWithOptionals(100); - auto array = NFormats::NTestUtils::MakeArray(values, variantType); - auto restoredValues = NFormats::NTestUtils::ExtractUnboxedValues(array, variantType, context.HolderFactory); - UNIT_ASSERT_EQUAL(values.size(), restoredValues.size()); - for (ui64 index = 0; index < values.size(); ++index) { - AssertUnboxedValuesAreEqual(values[index], restoredValues[index], variantType); - } - } - - Y_UNIT_TEST(OptionalVariantOverTupleWithOptionals) { - TTestContext context; - - auto optionalVariantType = context.GetOptionalVariantOverTupleWithOptionalsType(); - UNIT_ASSERT(NFormats::IsArrowCompatible(optionalVariantType)); - - auto values = context.CreateOptionalVariantOverTupleWithOptionals(100); - auto array = NFormats::NTestUtils::MakeArray(values, optionalVariantType); - auto restoredValues = NFormats::NTestUtils::ExtractUnboxedValues(array, optionalVariantType, context.HolderFactory); - UNIT_ASSERT_EQUAL(values.size(), restoredValues.size()); - for (ui64 index = 0; index < values.size(); ++index) { - AssertUnboxedValuesAreEqual(values[index], restoredValues[index], optionalVariantType); - } - } - - Y_UNIT_TEST(DoubleOptionalVariantOverTupleWithOptionals) { - TTestContext context; - - auto doubleOptionalVariantType = context.GetDoubleOptionalVariantOverTupleWithOptionalsType(); - UNIT_ASSERT(NFormats::IsArrowCompatible(doubleOptionalVariantType)); - - auto values 
= context.CreateDoubleOptionalVariantOverTupleWithOptionals(100); - auto array = NFormats::NTestUtils::MakeArray(values, doubleOptionalVariantType); - auto restoredValues = NFormats::NTestUtils::ExtractUnboxedValues(array, doubleOptionalVariantType, context.HolderFactory); - UNIT_ASSERT_EQUAL(values.size(), restoredValues.size()); - for (ui64 index = 0; index < values.size(); ++index) { - AssertUnboxedValuesAreEqual(values[index], restoredValues[index], doubleOptionalVariantType); - } - } - - Y_UNIT_TEST(DictOptionalToTuple) { - TTestContext context; - - auto dictType = context.GetDictOptionalToTupleType(); - UNIT_ASSERT(NFormats::IsArrowCompatible(dictType)); - - auto values = context.CreateDictOptionalToTuple(100); - auto array = NFormats::NTestUtils::MakeArray(values, dictType); - auto restoredValues = NFormats::NTestUtils::ExtractUnboxedValues(array, dictType, context.HolderFactory); - UNIT_ASSERT_EQUAL(values.size(), restoredValues.size()); - for (ui64 index = 0; index < values.size(); ++index) { - AssertUnboxedValuesAreEqual(values[index], restoredValues[index], dictType); - } - } - - Y_UNIT_TEST(OptionalOfOptional) { - TTestContext context; - - auto doubleOptionalType = context.GetOptionalOfOptionalType(); - UNIT_ASSERT(NFormats::IsArrowCompatible(doubleOptionalType)); - - auto values = context.CreateOptionalOfOptional(100); - auto array = NFormats::NTestUtils::MakeArray(values, doubleOptionalType); - auto restoredValues = NFormats::NTestUtils::ExtractUnboxedValues(array, doubleOptionalType, context.HolderFactory); - UNIT_ASSERT_EQUAL(values.size(), restoredValues.size()); - for (ui64 index = 0; index < values.size(); ++index) { - AssertUnboxedValuesAreEqual(values[index], restoredValues[index], doubleOptionalType); - } - } - - Y_UNIT_TEST(LargeVariant) { - TTestContext context; - - auto variantType = context.GetLargeVariantType(500); - UNIT_ASSERT(NFormats::IsArrowCompatible(variantType)); - - auto values = context.CreateLargeVariant(1000); - auto array = 
NFormats::NTestUtils::MakeArray(values, variantType); - auto restoredValues = NFormats::NTestUtils::ExtractUnboxedValues(array, variantType, context.HolderFactory); - UNIT_ASSERT_EQUAL(values.size(), restoredValues.size()); - for (ui64 index = 0; index < values.size(); ++index) { - AssertUnboxedValuesAreEqual(values[index], restoredValues[index], variantType); - } - } -} - -} // namespace NKikimr::NKqp::NFormats diff --git a/ydb/core/kqp/common/result_set_format/ut/ya.make b/ydb/core/kqp/common/result_set_format/ut/ya.make index e53b7c288dd1..0af51ab0a206 100644 --- a/ydb/core/kqp/common/result_set_format/ut/ya.make +++ b/ydb/core/kqp/common/result_set_format/ut/ya.make @@ -5,7 +5,8 @@ FORK_SUBTESTS() SIZE(MEDIUM) SRCS( - kqp_result_set_arrow_ut.cpp + kqp_formats_ut_helpers.cpp + kqp_formats_arrow_ut.cpp ) YQL_LAST_ABI_VERSION() diff --git a/ydb/core/kqp/common/result_set_format/ya.make b/ydb/core/kqp/common/result_set_format/ya.make index 4c390964fe9d..6fc718eb9b67 100644 --- a/ydb/core/kqp/common/result_set_format/ya.make +++ b/ydb/core/kqp/common/result_set_format/ya.make @@ -1,7 +1,7 @@ LIBRARY() SRCS( - kqp_result_set_arrow.cpp + kqp_formats_arrow.cpp kqp_result_set_builders.cpp ) diff --git a/ydb/core/kqp/ut/arrow/kqp_result_set_formats_ut.cpp b/ydb/core/kqp/ut/arrow/kqp_result_set_formats_ut.cpp index f2d15a3a21ff..58916dbee26d 100644 --- a/ydb/core/kqp/ut/arrow/kqp_result_set_formats_ut.cpp +++ b/ydb/core/kqp/ut/arrow/kqp_result_set_formats_ut.cpp @@ -1482,7 +1482,7 @@ UuidNotNullValue: [ /** * More tests for different types with correctness and convertations between Arrow and UV : - * ydb/library/yql/dq/runtime/dq_arrow_helpers_ut.cpp + * ydb/core/kqp/common/result_set_format/ut/kqp_formats_arrow_ut.cpp */ // Optional @@ -1565,58 +1565,8 @@ column1: -- is_valid: all not null } } - // Optional>>> - Y_UNIT_TEST(ArrowFormat_Types_Optional_3) { - auto kikimr = CreateKikimrRunner(/* withSampleTables */ true); - auto client = kikimr.GetQueryClient(); - - { - 
auto batches = ExecuteAndCombineBatches(client, R"( - SELECT Just(Just(Just(Key1))), Just(Just(Just(Name))) FROM Join2 - WHERE Key1 IN [104, 106, 108] - ORDER BY Key1; - )", /* assertSize */ false, 1); - - UNIT_ASSERT_C(!batches.empty(), "Batches must not be empty"); - - const auto& batch = batches.front(); - - UNIT_ASSERT_VALUES_EQUAL(batch->num_rows(), 3); - UNIT_ASSERT_VALUES_EQUAL(batch->num_columns(), 2); - - ValidateOptionalColumn(batch->column(0), 3, false); - ValidateOptionalColumn(batch->column(1), 3, false); - - const TString expected = -R"(column0: -- is_valid: all not null - -- child 0 type: struct not null> - -- is_valid: all not null - -- child 0 type: struct - -- is_valid: all not null - -- child 0 type: uint32 - [ - 104, - 106, - 108 - ] -column1: -- is_valid: all not null - -- child 0 type: struct not null> - -- is_valid: all not null - -- child 0 type: struct - -- is_valid: all not null - -- child 0 type: binary - [ - 4E616D6533, - 4E616D6533, - null - ] -)"; - UNIT_ASSERT_VALUES_EQUAL(batch->ToString(), expected); - } - } - // Optional> - Y_UNIT_TEST(ArrowFormat_Types_Optional_4) { + Y_UNIT_TEST(ArrowFormat_Types_Optional_3) { auto kikimr = CreateKikimrRunner(/* withSampleTables */ false); auto client = kikimr.GetQueryClient(); @@ -1656,7 +1606,7 @@ R"(column0: -- is_valid: all not null } // Optional>> - Y_UNIT_TEST(ArrowFormat_Types_Optional_5) { + Y_UNIT_TEST(ArrowFormat_Types_Optional_4) { auto kikimr = CreateKikimrRunner(/* withSampleTables */ false); auto client = kikimr.GetQueryClient(); @@ -1676,7 +1626,7 @@ R"(column0: -- is_valid: all not null const TString expected = R"(column0: -- is_valid: all not null - -- child 0 type: struct not null> + -- child 0 type: struct> -- is_valid: all not null -- child 0 type: dense_union -- is_valid: all not null @@ -1899,29 +1849,24 @@ R"(column0: -- is_valid: all not null UNIT_ASSERT_VALUES_EQUAL(batch->num_rows(), 1); UNIT_ASSERT_VALUES_EQUAL(batch->num_columns(), 1); - 
UNIT_ASSERT_C(batch->column(0)->type()->id() == arrow::Type::STRUCT, "Column type must be arrow::Type::STRUCT"); + UNIT_ASSERT_C(batch->column(0)->type()->id() == arrow::Type::LIST, "Column type must be arrow::Type::LIST"); const TString expected = -R"(column0: -- is_valid: all not null - -- child 0 type: map - [ - keys: +R"(column0: [ + -- is_valid: all not null + -- child 0 type: binary [ 61, 63, 62 ] - values: + -- child 1 type: int32 [ 1, 3, 2 ] - ] - -- child 1 type: uint64 - [ - 0 - ] + ] )"; UNIT_ASSERT_VALUES_EQUAL(batch->ToString(), expected); @@ -1944,30 +1889,24 @@ R"(column0: -- is_valid: all not null UNIT_ASSERT_VALUES_EQUAL(batch->num_rows(), 1); UNIT_ASSERT_VALUES_EQUAL(batch->num_columns(), 1); - UNIT_ASSERT_C(batch->column(0)->type()->id() == arrow::Type::STRUCT, "Column type must be arrow::Type::STRUCT"); + UNIT_ASSERT_C(batch->column(0)->type()->id() == arrow::Type::LIST, "Column type must be arrow::Type::LIST"); const TString expected = -R"(column0: -- is_valid: all not null - -- child 0 type: list> - [ - -- is_valid: all not null - -- child 0 type: binary - [ - 61, - 62, - null - ] - -- child 1 type: int32 - [ - 1, - 2, - 3 - ] - ] - -- child 1 type: uint64 - [ - 0 - ] +R"(column0: [ + -- is_valid: all not null + -- child 0 type: binary + [ + 61, + 62, + null + ] + -- child 1 type: int32 + [ + 1, + 2, + 3 + ] + ] )"; UNIT_ASSERT_VALUES_EQUAL(batch->ToString(), expected);