Skip to content

Commit bd3e408

Browse files
committed
type-safer ColumnString::EstimatedValueSize, better validation, more tests
- Added validation for ColumnString::EstimatedValueSize value in ColumnString - Minor refactoring of block allocation in ColumnString - Moved test code for ColumnString and ColumnLowCardinality to separate files, - Added more test cases
1 parent 90414cc commit bd3e408

File tree

9 files changed

+357
-405
lines changed

9 files changed

+357
-405
lines changed

clickhouse/columns/string.cpp

Lines changed: 53 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -23,10 +23,37 @@ size_t ComputeTotalSize(const Container & strings, size_t begin = 0, size_t len
2323
}
2424

2525
// based on https://stackoverflow.com/a/9194117
26-
size_t RoundUp(size_t numToRound, size_t multiple)
27-
{
28-
size_t isPositive = static_cast<size_t>(numToRound >= 0);
29-
return ((numToRound + isPositive * (multiple - 1)) / multiple) * multiple;
26+
size_t RoundUp(size_t numToRound, size_t multiple) {
27+
return ((numToRound + (multiple - 1)) / multiple) * multiple;
28+
}
29+
30+
size_t ComputeValueSizeEstimation(size_t total_size, size_t number_of_items) {
31+
number_of_items = number_of_items ? number_of_items : 1; // just to avoid divide by zero
32+
size_t estimation = std::ceil(static_cast<double>(total_size) / number_of_items);
33+
34+
return estimation == 0;
35+
}
36+
37+
size_t EstimateBlockSize(size_t value_size_estimation) {
38+
size_t estimated_number_of_items_per_block = 32; // just arbitrary value
39+
40+
// do not pre-allocate too big blocks when expected values are big to minimize waste or when user explicitly requested not to
41+
if (value_size_estimation > DEFAULT_BLOCK_SIZE || value_size_estimation == static_cast<size_t>(clickhouse::ColumnString::NO_PREALLOCATE)) {
42+
// for really big items do not pre-allocate blocks, and allowing later code to put 1 item per block
43+
return 0;
44+
} else if (value_size_estimation > static_cast<size_t>(clickhouse::ColumnString::EstimatedValueSize::MEDIUM)) {
45+
// for not so big items, create blocks that fit smaller number of items, reducing produced block size.
46+
estimated_number_of_items_per_block = ceil(DEFAULT_BLOCK_SIZE / static_cast<double>(value_size_estimation));
47+
}
48+
49+
return std::max<size_t>(DEFAULT_BLOCK_SIZE, RoundUp(value_size_estimation * estimated_number_of_items_per_block, DEFAULT_BLOCK_SIZE));
50+
}
51+
52+
inline auto Validate(clickhouse::ColumnString::EstimatedValueSize value_size_estimation) {
53+
if (static_cast<int>(value_size_estimation) < 0)
54+
throw clickhouse::ValidationError("ColumnString received negative number as value size estimation");
55+
56+
return value_size_estimation;
3057
}
3158

3259
}
@@ -132,22 +159,6 @@ ItemView ColumnFixedString::GetItem(size_t index) const {
132159
return ItemView{Type::FixedString, this->At(index)};
133160
}
134161

135-
namespace {
136-
137-
size_t ComputeValueSizeEstimation(size_t total_size, size_t number_of_items) {
138-
number_of_items = number_of_items ? number_of_items : 1; // just to avoid divide by zero
139-
size_t estimation = std::ceil(static_cast<double>(total_size) / number_of_items);
140-
141-
return estimation == 0 ? ColumnString::DEFAULT_ESTIMATION : estimation;
142-
}
143-
144-
size_t EstimateNextBlockSize(size_t value_size_estimation) {
145-
const size_t estimated_number_of_items_per_block = 32; // just arbitrary value
146-
return std::max<size_t>(DEFAULT_BLOCK_SIZE, value_size_estimation * estimated_number_of_items_per_block);
147-
}
148-
149-
}
150-
151162
struct ColumnString::Block
152163
{
153164
using CharT = typename std::string::value_type;
@@ -188,11 +199,9 @@ struct ColumnString::Block
188199

189200
ColumnString::ColumnString(EstimatedValueSize value_size_estimation)
190201
: Column(Type::CreateString())
191-
, value_size_estimation_(value_size_estimation)
202+
, value_size_estimation_(static_cast<size_t>(Validate(value_size_estimation)))
192203
, next_block_size_(DEFAULT_BLOCK_SIZE)
193204
{
194-
if (value_size_estimation < 0)
195-
throw ValidationError("ColumnString received negative number as value size estimation");
196205
}
197206

198207
ColumnString::ColumnString(size_t element_count, EstimatedValueSize value_size_estimation)
@@ -237,25 +246,23 @@ void ColumnString::Reserve(size_t new_cap) {
237246
items_.reserve(new_cap);
238247

239248
if (blocks_.empty() || blocks_.back().GetAvailable() < value_size_estimation_) {
240-
blocks_.emplace_back(new_cap * value_size_estimation_);
249+
if (value_size_estimation_ != static_cast<size_t>(NO_PREALLOCATE))
250+
blocks_.emplace_back(new_cap * value_size_estimation_);
241251
} else {
242-
// make sure that next block will have enought space for all remaining items.
252+
// Estimate space required for items that woudn't fit into current Block.
243253
const size_t estimated_items_in_next_block = value_size_estimation_ ? new_cap - blocks_.back().GetAvailable() / value_size_estimation_ : new_cap;
244254
next_block_size_ = std::max(DEFAULT_BLOCK_SIZE, estimated_items_in_next_block * value_size_estimation_);
245255
}
246256
}
247257

248258
void ColumnString::SetEstimatedValueSize(EstimatedValueSize value_size_estimation) {
249-
value_size_estimation_ = value_size_estimation;
259+
value_size_estimation_ = static_cast<size_t>(Validate(value_size_estimation));
250260
}
251261

252262
void ColumnString::Append(std::string_view str) {
253-
if (blocks_.empty() || blocks_.back().GetAvailable() < str.length()) {
254-
blocks_.emplace_back(std::max(next_block_size_, str.size()));
255-
next_block_size_ = EstimateNextBlockSize(value_size_estimation_);
256-
}
263+
auto & block = PrepareBlockWithSpaceForAtLeast(str.length());
257264

258-
items_.emplace_back(blocks_.back().AppendUnsafe(str));
265+
items_.emplace_back(block.AppendUnsafe(str));
259266
}
260267

261268
void ColumnString::Append(const char* str) {
@@ -276,6 +283,18 @@ void ColumnString::AppendUnsafe(std::string_view str) {
276283
items_.emplace_back(blocks_.back().AppendUnsafe(str));
277284
}
278285

286+
ColumnString::Block & ColumnString::PrepareBlockWithSpaceForAtLeast(size_t minimum_required_bytes) {
287+
if (blocks_.empty() || blocks_.back().GetAvailable() < minimum_required_bytes) {
288+
if (next_block_size_ == 0)
289+
next_block_size_ = DEFAULT_BLOCK_SIZE;
290+
291+
blocks_.emplace_back(std::max(next_block_size_, minimum_required_bytes));
292+
next_block_size_ = EstimateBlockSize(value_size_estimation_);
293+
}
294+
295+
return blocks_.back();
296+
}
297+
279298
void ColumnString::Clear() {
280299
items_.clear();
281300
blocks_.clear();
@@ -292,10 +311,7 @@ void ColumnString::Append(ColumnRef column) {
292311
const auto total_size = ComputeTotalSize(col->items_);
293312

294313
// TODO: fill up existing block with some items and then add a new one for the rest of items
295-
if (blocks_.size() == 0 || blocks_.back().GetAvailable() < total_size) {
296-
blocks_.emplace_back(std::max(next_block_size_, total_size));
297-
next_block_size_ = EstimateNextBlockSize(value_size_estimation_);
298-
}
314+
PrepareBlockWithSpaceForAtLeast(total_size);
299315

300316
// Intentionally not doing items_.reserve() since that cripples performance.
301317
for (size_t i = 0; i < column->Size(); ++i) {
@@ -377,10 +393,10 @@ ColumnRef ColumnString::Slice(size_t begin, size_t len) const {
377393
auto result = std::make_shared<ColumnString>(EstimatedValueSize(value_size_estimation_));
378394

379395
result->items_.reserve(len);
380-
result->blocks_.emplace_back(std::max(DEFAULT_BLOCK_SIZE, ComputeTotalSize(items_, begin, len)));
396+
result->PrepareBlockWithSpaceForAtLeast(ComputeTotalSize(items_, begin, len));
381397

382398
for (size_t i = begin; i < begin + len; ++i) {
383-
result->Append(items_[i]);
399+
result->AppendUnsafe(items_[i]);
384400
}
385401

386402
return result;

clickhouse/columns/string.h

Lines changed: 12 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -88,17 +88,18 @@ class ColumnString : public Column {
8888
// int32_t to be able to validate againts (unintentional) negative values in ColumnString c-tor.
8989
// Otherwise those just silently underflow unsigned type,
9090
// resulting in attempt to allocate enormous amount of memory at run time.
91-
enum EstimatedValueSize : int32_t {
91+
enum class EstimatedValueSize : int32_t {
9292
TINY = 8,
9393
SMALL = 32,
9494
MEDIUM = 128,
9595
LARGE = 512,
96-
HUGE = 2048,
9796
};
98-
static constexpr auto DEFAULT_ESTIMATION = EstimatedValueSize::MEDIUM;
9997

100-
explicit ColumnString(EstimatedValueSize value_size_estimation = DEFAULT_ESTIMATION);
101-
explicit ColumnString(size_t element_count, EstimatedValueSize value_size_estimation = DEFAULT_ESTIMATION);
98+
// Memory for item storage is not pre-allocated on Reserve(), same as old behaviour.
99+
static constexpr auto NO_PREALLOCATE = EstimatedValueSize(0);
100+
101+
explicit ColumnString(EstimatedValueSize value_size_estimation = NO_PREALLOCATE);
102+
explicit ColumnString(size_t element_count, EstimatedValueSize value_size_estimation = NO_PREALLOCATE);
102103
explicit ColumnString(const std::vector<std::string> & data);
103104
explicit ColumnString(std::vector<std::string>&& data);
104105

@@ -157,16 +158,19 @@ class ColumnString : public Column {
157158
ItemView GetItem(size_t) const override;
158159

159160
private:
161+
struct Block;
162+
160163
void AppendUnsafe(std::string_view);
164+
Block & PrepareBlockWithSpaceForAtLeast(size_t minimum_required_bytes);
161165

162166
private:
163-
struct Block;
164167

165168
std::vector<std::string_view> items_;
166169
std::vector<Block> blocks_;
167170
std::deque<std::string> append_data_;
168-
uint32_t value_size_estimation_ = DEFAULT_ESTIMATION;
169-
size_t next_block_size_;
171+
172+
uint32_t value_size_estimation_ = 0;
173+
size_t next_block_size_ = 0;
170174
};
171175

172176
}

ut/CMakeLists.txt

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,9 @@ SET ( clickhouse-cpp-ut-src
2626
utils.cpp
2727
value_generators.cpp
2828
low_cardinality_nullable_tests.cpp
29+
30+
ColumnString_ut.cpp
31+
ColumnLowCardinalityT_ut.cpp
2932
)
3033

3134
IF (WITH_OPENSSL)

0 commit comments

Comments
 (0)