@@ -23,10 +23,37 @@ size_t ComputeTotalSize(const Container & strings, size_t begin = 0, size_t len
2323}
2424
2525// based on https://stackoverflow.com/a/9194117
26- size_t RoundUp (size_t numToRound, size_t multiple)
27- {
28- size_t isPositive = static_cast <size_t >(numToRound >= 0 );
29- return ((numToRound + isPositive * (multiple - 1 )) / multiple) * multiple;
// Rounds numToRound up to the nearest multiple of `multiple`.
//
// numToRound - value to round; returned unchanged when it already is a multiple.
// multiple   - granularity to round to. 0 is treated as "no rounding" and numToRound
//              is returned as is, instead of dividing by zero.
//
// The remainder form is used on purpose: unlike `(numToRound + multiple - 1) / multiple * multiple`
// it has no intermediate sum that can wrap around for values close to the maximum of size_t.
// The result itself still wraps if the rounded value is not representable in size_t.
size_t RoundUp(size_t numToRound, size_t multiple) {
    if (multiple == 0)
        return numToRound;

    const size_t remainder = numToRound % multiple;
    return remainder == 0 ? numToRound : numToRound + (multiple - remainder);
}
29+
30+ size_t ComputeValueSizeEstimation (size_t total_size, size_t number_of_items) {
31+ number_of_items = number_of_items ? number_of_items : 1 ; // just to avoid divide by zero
32+ size_t estimation = std::ceil (static_cast <double >(total_size) / number_of_items);
33+
34+ return estimation == 0 ;
35+ }
36+
37+ size_t EstimateBlockSize (size_t value_size_estimation) {
38+ size_t estimated_number_of_items_per_block = 32 ; // just arbitrary value
39+
40+ // do not pre-allocate too big blocks when expected values are big to minimize waste or when user explicitly requested not to
41+ if (value_size_estimation > DEFAULT_BLOCK_SIZE || value_size_estimation == static_cast <size_t >(clickhouse::ColumnString::NO_PREALLOCATE)) {
42+ // for really big items do not pre-allocate blocks, and allowing later code to put 1 item per block
43+ return 0 ;
44+ } else if (value_size_estimation > static_cast <size_t >(clickhouse::ColumnString::EstimatedValueSize::MEDIUM)) {
45+ // for not so big items, create blocks that fit smaller number of items, reducing produced block size.
46+ estimated_number_of_items_per_block = ceil (DEFAULT_BLOCK_SIZE / static_cast <double >(value_size_estimation));
47+ }
48+
49+ return std::max<size_t >(DEFAULT_BLOCK_SIZE, RoundUp (value_size_estimation * estimated_number_of_items_per_block, DEFAULT_BLOCK_SIZE));
50+ }
51+
// Checks a value size estimation before it is stored by ColumnString.
// Throws clickhouse::ValidationError when the estimation, converted to int, is negative;
// otherwise returns the estimation unchanged so the call can be used inside an expression.
52+ inline auto Validate (clickhouse::ColumnString::EstimatedValueSize value_size_estimation) {
53+ if (static_cast <int >(value_size_estimation) < 0 )
54+ throw clickhouse::ValidationError (" ColumnString received negative number as value size estimation" );
55+
56+ return value_size_estimation;
3057}
3158
3259}
@@ -132,22 +159,6 @@ ItemView ColumnFixedString::GetItem(size_t index) const {
132159 return ItemView{Type::FixedString, this ->At (index)};
133160}
134161
135- namespace {
136-
137- size_t ComputeValueSizeEstimation (size_t total_size, size_t number_of_items) {
138- number_of_items = number_of_items ? number_of_items : 1 ; // just to avoid divide by zero
139- size_t estimation = std::ceil (static_cast <double >(total_size) / number_of_items);
140-
141- return estimation == 0 ? ColumnString::DEFAULT_ESTIMATION : estimation;
142- }
143-
144- size_t EstimateNextBlockSize (size_t value_size_estimation) {
145- const size_t estimated_number_of_items_per_block = 32 ; // just arbitrary value
146- return std::max<size_t >(DEFAULT_BLOCK_SIZE, value_size_estimation * estimated_number_of_items_per_block);
147- }
148-
149- }
150-
151162struct ColumnString ::Block
152163{
153164 using CharT = typename std::string::value_type;
@@ -188,11 +199,9 @@ struct ColumnString::Block
188199
// Constructs a String column from a value size estimation.
// The added (+) initializer passes the estimation through Validate() and stores the result as size_t;
// this replaces the removed (-) negative-value check that used to live in the constructor body.
// The size hint for the next block starts at DEFAULT_BLOCK_SIZE.
189200ColumnString::ColumnString (EstimatedValueSize value_size_estimation)
190201 : Column(Type::CreateString())
191- , value_size_estimation_(value_size_estimation)
202+ , value_size_estimation_(static_cast < size_t >(Validate( value_size_estimation)) )
192203 , next_block_size_(DEFAULT_BLOCK_SIZE)
193204{
194- if (value_size_estimation < 0 )
195- throw ValidationError (" ColumnString received negative number as value size estimation" );
196205}
197206
198207ColumnString::ColumnString (size_t element_count, EstimatedValueSize value_size_estimation)
@@ -237,25 +246,23 @@ void ColumnString::Reserve(size_t new_cap) {
237246 items_.reserve (new_cap);
238247
239248 if (blocks_.empty () || blocks_.back ().GetAvailable () < value_size_estimation_) {
240- blocks_.emplace_back (new_cap * value_size_estimation_);
249+ if (value_size_estimation_ != static_cast <size_t >(NO_PREALLOCATE))
250+ blocks_.emplace_back (new_cap * value_size_estimation_);
241251 } else {
242- // make sure that next block will have enought space for all remaining items .
252+ // Estimate space required for items that wouldn't fit into current Block.
243253 const size_t estimated_items_in_next_block = value_size_estimation_ ? new_cap - blocks_.back ().GetAvailable () / value_size_estimation_ : new_cap;
244254 next_block_size_ = std::max (DEFAULT_BLOCK_SIZE, estimated_items_in_next_block * value_size_estimation_);
245255 }
246256}
247257
// Replaces the stored value size estimation.
// The added (+) line runs the new value through Validate() before storing it as size_t;
// the removed (-) line assigned it without any check.
248258void ColumnString::SetEstimatedValueSize (EstimatedValueSize value_size_estimation) {
249- value_size_estimation_ = value_size_estimation;
259+ value_size_estimation_ = static_cast < size_t >( Validate ( value_size_estimation)) ;
250260}
251261
// Appends `str` to the column as a new item.
// The added (+) lines obtain a block with at least str.length() free bytes from
// PrepareBlockWithSpaceForAtLeast(), write the string into it with AppendUnsafe() and record the
// returned value in items_. They replace the removed (-) inline block allocation.
252262void ColumnString::Append (std::string_view str) {
253- if (blocks_.empty () || blocks_.back ().GetAvailable () < str.length ()) {
254- blocks_.emplace_back (std::max (next_block_size_, str.size ()));
255- next_block_size_ = EstimateNextBlockSize (value_size_estimation_);
256- }
263+ auto & block = PrepareBlockWithSpaceForAtLeast (str.length ());
257264
258- items_.emplace_back (blocks_. back () .AppendUnsafe (str));
265+ items_.emplace_back (block .AppendUnsafe (str));
259266}
260267
261268void ColumnString::Append (const char * str) {
@@ -276,6 +283,18 @@ void ColumnString::AppendUnsafe(std::string_view str) {
276283 items_.emplace_back (blocks_.back ().AppendUnsafe (str));
277284}
278285
286+ ColumnString::Block & ColumnString::PrepareBlockWithSpaceForAtLeast (size_t minimum_required_bytes) {
287+ if (blocks_.empty () || blocks_.back ().GetAvailable () < minimum_required_bytes) {
288+ if (next_block_size_ == 0 )
289+ next_block_size_ = DEFAULT_BLOCK_SIZE;
290+
291+ blocks_.emplace_back (std::max (next_block_size_, minimum_required_bytes));
292+ next_block_size_ = EstimateBlockSize (value_size_estimation_);
293+ }
294+
295+ return blocks_.back ();
296+ }
297+
279298void ColumnString::Clear () {
280299 items_.clear ();
281300 blocks_.clear ();
@@ -292,10 +311,7 @@ void ColumnString::Append(ColumnRef column) {
292311 const auto total_size = ComputeTotalSize (col->items_ );
293312
294313 // TODO: fill up existing block with some items and then add a new one for the rest of items
295- if (blocks_.size () == 0 || blocks_.back ().GetAvailable () < total_size) {
296- blocks_.emplace_back (std::max (next_block_size_, total_size));
297- next_block_size_ = EstimateNextBlockSize (value_size_estimation_);
298- }
314+ PrepareBlockWithSpaceForAtLeast (total_size);
299315
300316 // Intentionally not doing items_.reserve() since that cripples performance.
301317 for (size_t i = 0 ; i < column->Size (); ++i) {
@@ -377,10 +393,10 @@ ColumnRef ColumnString::Slice(size_t begin, size_t len) const {
377393 auto result = std::make_shared<ColumnString>(EstimatedValueSize (value_size_estimation_));
378394
379395 result->items_ .reserve (len);
380- result->blocks_ . emplace_back ( std::max (DEFAULT_BLOCK_SIZE, ComputeTotalSize (items_, begin, len) ));
396+ result->PrepareBlockWithSpaceForAtLeast ( ComputeTotalSize (items_, begin, len));
381397
382398 for (size_t i = begin; i < begin + len; ++i) {
383- result->Append (items_[i]);
399+ result->AppendUnsafe (items_[i]);
384400 }
385401
386402 return result;
0 commit comments