Skip to content

Commit ebd1e39

Browse files
committed
Specify CSV parameters to correctly support TSV
1 parent 3f40f7a commit ebd1e39

File tree

2 files changed

+44
-4
lines changed

2 files changed

+44
-4
lines changed

ydb/library/workload/vector/vector_data_generator.cpp

Lines changed: 43 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,10 @@
11
#include "vector_data_generator.h"
22

3+
#include <ydb/library/formats/arrow/csv/converter/csv_arrow.h>
34
#include <ydb/library/yql/udfs/common/knn/knn-serializer-shared.h>
45

6+
#include <ydb/public/api/protos/ydb_formats.pb.h>
7+
58
#include <contrib/libs/apache/arrow/cpp/src/arrow/array/array_binary.h>
69
#include <contrib/libs/apache/arrow/cpp/src/arrow/array/array_nested.h>
710
#include <contrib/libs/apache/arrow/cpp/src/arrow/array/array_primitive.h>
@@ -44,14 +47,50 @@ class TTransformingDataGenerator final: public IBulkDataGenerator {
4447
return std::make_pair(schema, recordBatch);
4548
}
4649

47-
static std::shared_ptr<arrow::Table> Deserialize(TDataPortion::TCsv* data) {
50+
std::shared_ptr<arrow::Table> Deserialize(TDataPortion::TCsv* data) {
51+
Ydb::Formats::CsvSettings csvSettings;
52+
if (Y_UNLIKELY(!csvSettings.ParseFromString(data->FormatString))) {
53+
ythrow yexception() << "Unable to parse CsvSettings";
54+
}
55+
56+
arrow::csv::ReadOptions readOptions = arrow::csv::ReadOptions::Defaults();
57+
readOptions.skip_rows = csvSettings.skip_rows();
58+
if (data->Data.size() > NKikimr::NFormats::TArrowCSV::DEFAULT_BLOCK_SIZE) {
59+
ui32 blockSize = NKikimr::NFormats::TArrowCSV::DEFAULT_BLOCK_SIZE;
60+
blockSize *= data->Data.size() / blockSize + 1;
61+
readOptions.block_size = blockSize;
62+
}
63+
64+
arrow::csv::ParseOptions parseOptions = arrow::csv::ParseOptions::Defaults();
65+
const auto& quoting = csvSettings.quoting();
66+
if (Y_UNLIKELY(quoting.quote_char().length() > 1)) {
67+
ythrow yexception() << "Cannot read CSV: Wrong quote char '" << quoting.quote_char() << "'";
68+
}
69+
const char qchar = quoting.quote_char().empty() ? '"' : quoting.quote_char().front();
70+
parseOptions.quoting = false;
71+
parseOptions.quote_char = qchar;
72+
parseOptions.double_quote = !quoting.double_quote_disabled();
73+
if (csvSettings.delimiter()) {
74+
if (Y_UNLIKELY(csvSettings.delimiter().size() != 1)) {
75+
ythrow yexception() << "Cannot read CSV: Invalid delimitr in csv: " << csvSettings.delimiter();
76+
}
77+
parseOptions.delimiter = csvSettings.delimiter().front();
78+
}
79+
80+
arrow::csv::ConvertOptions convertOptions = arrow::csv::ConvertOptions::Defaults();
81+
if (csvSettings.null_value()) {
82+
convertOptions.null_values = { std::string(csvSettings.null_value().data(), csvSettings.null_value().size()) };
83+
convertOptions.strings_can_be_null = true;
84+
convertOptions.quoted_strings_can_be_null = false;
85+
}
86+
4887
auto bufferReader = std::make_shared<arrow::io::BufferReader>(arrow::util::string_view(data->Data.data(), data->Data.size()));
4988
auto csvReader = arrow::csv::TableReader::Make(
5089
arrow::io::default_io_context(),
5190
bufferReader,
52-
arrow::csv::ReadOptions::Defaults(),
53-
arrow::csv::ParseOptions::Defaults(),
54-
arrow::csv::ConvertOptions::Defaults()
91+
readOptions,
92+
parseOptions,
93+
convertOptions
5594
).ValueOrDie();
5695

5796
return csvReader->Read().ValueOrDie();

ydb/library/workload/vector/ya.make

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@ SRCS(
1414
PEERDIR(
1515
contrib/libs/apache/arrow
1616
ydb/library/workload/abstract
17+
ydb/public/api/protos
1718
)
1819

1920
GENERATE_ENUM_SERIALIZATION_WITH_HEADER(vector_enums.h)

0 commit comments

Comments
 (0)