Skip to content

Commit d1b129e

Browse files
yizhang82
authored and inikep committed
Improve count(*) performance and fix index merge bug
Summary: Today in `SELECT count(*)` MyRocks would still decode every single column due to this check, despite the read set being empty: ``` // bitmap is cleared on index merge, but it still needs to decode columns bool field_requested = decode_all_fields || m_verify_row_debug_checksums || bitmap_is_set(field_map, m_table->field[i]->field_index); ``` As a result, MyRocks is significantly slower than InnoDB in this particular scenario. It turns out that in index merge, when it tries to reset, it calls ha_index_init with an empty column_bitmap, so our field decoders didn't know they needed to decode anything, and the entire query would return nothing. This is discussed in [this commit](facebook@70f2bcd), [issue 624](facebook#624), and [PR 626](facebook#626). The workaround we had at that time was to simply treat an empty map as implicitly meaning everything, and the side effect is a massively slowed down count(*). We have a few options to address this: 1. Fix the index merge optimizer - looking at the code in QUICK_RANGE_SELECT::init_ror_merged_scan, it actually fixes up the column_bitmap properly, but only after init/reset, so the fix would simply be moving the bitmap-setting code up. For secondary keys, prepare_for_position will automatically call `mark_columns_used_by_index_no_reset(s->primary_key, read_set)` if HA_PRIMARY_KEY_REQUIRED_FOR_POSITION is set (true for both InnoDB and MyRocks), so we would know correctly that we need to unpack the PK when walking the SK during index merge. 2. Override `column_bitmaps_signal` and set up decoders whenever the bitmap changes - however, this doesn't work by itself: because no storage engine today actually uses handler::column_bitmaps_signal, this path hasn't been tested properly in index merge. 
In this case, QUICK_RANGE_SELECT::init_ror_merged_scan should call set_column_bitmaps_no_signal to avoid resetting the correct read/write set of head, since head is used as the first handler (reuse_handler=true) and as subsequent placeholders for read/write set updates (reuse_handler=false). 3. Follow InnoDB's solution - InnoDB delays actually initializing its template until index_read is called for the 2nd time (relying on `prebuilt->sql_stat_start`), and by that point in index_read, `QUICK_RANGE_SELECT::column_bitmap` is already fixed up and the table read/write set has been switched to it, so the new template is built correctly. In order to make it easier to maintain and port, after discussing with Manuel, I'm going with a simplified version of #3 that delays decoder creation until the first read operation (index_*, rnd_*, range_read_*, multi_range_read_*), with the delay flag set in index_init / rnd_init / multi_range_read_init. Also, I ran into a bug with truncation_partition where Rdb_converter's tbl_def is stale (we only update ha_rocksdb::m_tbl_def); previously this was fine because it was not being used after table open. But my change moves the lookup_bitmap initialization into Rdb_converter, which takes a dependency on Rdb_converter::m_tbl_def, so now we need to reset it properly. Reference Patch: facebook@44d6a8d --------- Porting Note: Due to 8.0's new counting infra (handler::record & handler::record_with_index), this only helps PK counting. I will send out a better fix that works with 8.0's new counting infra. Reviewed By: Pushapgl Differential Revision: D26265470
1 parent 078b9ea commit d1b129e

File tree

5 files changed

+84
-38
lines changed

5 files changed

+84
-38
lines changed

storage/rocksdb/ha_rocksdb.cc

Lines changed: 50 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -7115,7 +7115,8 @@ ha_rocksdb::ha_rocksdb(my_core::handlerton *const hton,
71157115
mrr_enabled_keyread(false),
71167116
mrr_used_cpk(false),
71177117
m_in_rpl_delete_rows(false),
7118-
m_in_rpl_update_rows(false) {}
7118+
m_in_rpl_update_rows(false),
7119+
m_need_build_decoder(false) {}
71197120

71207121
ha_rocksdb::~ha_rocksdb() {
71217122
int err MY_ATTRIBUTE((__unused__));
@@ -8588,8 +8589,7 @@ int ha_rocksdb::create(const char *const name, TABLE *const table_arg,
85888589
*/
85898590
int ha_rocksdb::truncate_table(Rdb_tbl_def *tbl_def_arg,
85908591
const std::string &actual_user_table_name,
8591-
const TABLE *table_arg,
8592-
ulonglong auto_increment_value,
8592+
TABLE *table_arg, ulonglong auto_increment_value,
85938593
dd::Table *table_def) {
85948594
DBUG_ENTER_FUNC();
85958595

@@ -8677,6 +8677,7 @@ int ha_rocksdb::truncate_table(Rdb_tbl_def *tbl_def_arg,
86778677

86788678
/* Update the local m_tbl_def reference */
86798679
m_tbl_def = ddl_manager.find(orig_tablename);
8680+
m_converter.reset(new Rdb_converter(ha_thd(), m_tbl_def, table_arg));
86808681
DBUG_RETURN(err);
86818682
}
86828683

@@ -8972,8 +8973,9 @@ int ha_rocksdb::read_row_from_secondary_key(uchar *const buf,
89728973
#endif
89738974
DBUG_EXECUTE_IF("dbug.rocksdb.HA_EXTRA_KEYREAD", { m_keyread_only = true; });
89748975

8975-
bool covered_lookup = (m_keyread_only && kd.can_cover_lookup()) ||
8976-
kd.covers_lookup(&value, &m_lookup_bitmap);
8976+
bool covered_lookup =
8977+
(m_keyread_only && kd.can_cover_lookup()) ||
8978+
kd.covers_lookup(&value, m_converter->get_lookup_bitmap());
89778979

89788980
#ifndef NDEBUG
89798981
m_keyread_only = save_keyread_only;
@@ -9098,7 +9100,8 @@ int ha_rocksdb::secondary_index_read(const int keyno, uchar *const buf) {
90989100
rocksdb::Slice value = m_scan_it->value();
90999101
bool covered_lookup =
91009102
(m_keyread_only && m_key_descr_arr[keyno]->can_cover_lookup()) ||
9101-
m_key_descr_arr[keyno]->covers_lookup(&value, &m_lookup_bitmap);
9103+
m_key_descr_arr[keyno]->covers_lookup(
9104+
&value, m_converter->get_lookup_bitmap());
91029105
if (covered_lookup && m_lock_rows == RDB_LOCK_NONE) {
91039106
rc = m_key_descr_arr[keyno]->unpack_record(
91049107
table, buf, &key, &value,
@@ -9143,6 +9146,8 @@ int ha_rocksdb::index_read_map(uchar *const buf, const uchar *const key,
91439146
enum ha_rkey_function find_flag) {
91449147
DBUG_ENTER_FUNC();
91459148

9149+
check_build_decoder();
9150+
91469151
DBUG_EXECUTE_IF(
91479152
"myrocks_busy_loop_on_row_read", volatile int debug_i = 0;
91489153
while (1) { debug_i++; });
@@ -9759,6 +9764,8 @@ int ha_rocksdb::get_row_by_rowid(uchar *const buf, const char *const rowid,
97599764
int ha_rocksdb::index_next(uchar *const buf) {
97609765
DBUG_ENTER_FUNC();
97619766

9767+
check_build_decoder();
9768+
97629769
bool moves_forward = true;
97639770
ha_statistic_increment(&System_status_var::ha_read_next_count);
97649771
if (m_key_descr_arr[active_index]->m_is_reverse_cf) {
@@ -9779,6 +9786,8 @@ int ha_rocksdb::index_next(uchar *const buf) {
97799786
int ha_rocksdb::index_prev(uchar *const buf) {
97809787
DBUG_ENTER_FUNC();
97819788

9789+
check_build_decoder();
9790+
97829791
bool moves_forward = false;
97839792
ha_statistic_increment(&System_status_var::ha_read_prev_count);
97849793
if (m_key_descr_arr[active_index]->m_is_reverse_cf) {
@@ -9839,6 +9848,8 @@ int ha_rocksdb::index_next_with_direction(uchar *const buf, bool move_forward) {
98399848
int ha_rocksdb::index_first(uchar *const buf) {
98409849
DBUG_ENTER_FUNC();
98419850

9851+
check_build_decoder();
9852+
98429853
m_sk_match_prefix = nullptr;
98439854
ha_statistic_increment(&System_status_var::ha_read_first_count);
98449855
int rc = index_read_intern(buf, true /* first */);
@@ -9855,6 +9866,8 @@ int ha_rocksdb::index_first(uchar *const buf) {
98559866
int ha_rocksdb::index_last(uchar *const buf) {
98569867
DBUG_ENTER_FUNC();
98579868

9869+
check_build_decoder();
9870+
98589871
m_sk_match_prefix = nullptr;
98599872
ha_statistic_increment(&System_status_var::ha_read_last_count);
98609873
int rc = index_read_intern(buf, false /* first */);
@@ -11226,17 +11239,16 @@ void ha_rocksdb::setup_iterator_for_rnd_scan() {
1122611239
int ha_rocksdb::rnd_init(bool scan) {
1122711240
DBUG_ENTER_FUNC();
1122811241

11242+
m_need_build_decoder = true;
11243+
active_index = table->s->primary_key;
11244+
1122911245
THD *thd = ha_thd();
1123011246
if (thd && thd->killed) {
1123111247
DBUG_RETURN(HA_ERR_QUERY_INTERRUPTED);
1123211248
}
1123311249

1123411250
Rdb_transaction *const tx = get_or_create_tx(table->in_use);
1123511251

11236-
// when this table is being updated, decode all fields
11237-
m_converter->setup_field_decoders(table->read_set,
11238-
m_lock_rows == RDB_LOCK_WRITE);
11239-
1124011252
if (scan) {
1124111253
m_rnd_scan_is_new_snapshot = !tx->has_snapshot();
1124211254
setup_iterator_for_rnd_scan();
@@ -11260,6 +11272,8 @@ int ha_rocksdb::rnd_init(bool scan) {
1126011272
int ha_rocksdb::rnd_next(uchar *const buf) {
1126111273
DBUG_ENTER_FUNC();
1126211274

11275+
check_build_decoder();
11276+
1126311277
int rc;
1126411278
ha_statistic_increment(&System_status_var::ha_read_rnd_next_count);
1126511279
for (;;) {
@@ -11400,11 +11414,26 @@ int ha_rocksdb::rnd_next_with_direction(uchar *const buf, bool move_forward) {
1140011414
int ha_rocksdb::rnd_end() {
1140111415
DBUG_ENTER_FUNC();
1140211416

11417+
m_need_build_decoder = false;
11418+
1140311419
release_scan_iterator();
1140411420

1140511421
DBUG_RETURN(HA_EXIT_SUCCESS);
1140611422
}
1140711423

11424+
void ha_rocksdb::build_decoder() {
11425+
m_converter->setup_field_decoders(table->read_set, active_index,
11426+
m_keyread_only,
11427+
m_lock_rows == RDB_LOCK_WRITE);
11428+
}
11429+
11430+
void ha_rocksdb::check_build_decoder() {
11431+
if (m_need_build_decoder) {
11432+
build_decoder();
11433+
m_need_build_decoder = false;
11434+
}
11435+
}
11436+
1140811437
/**
1140911438
@return
1141011439
HA_EXIT_SUCCESS OK
@@ -11413,6 +11442,9 @@ int ha_rocksdb::rnd_end() {
1141311442
int ha_rocksdb::index_init(uint idx, bool sorted MY_ATTRIBUTE((__unused__))) {
1141411443
DBUG_ENTER_FUNC();
1141511444

11445+
m_need_build_decoder = true;
11446+
active_index = idx;
11447+
1141611448
THD *thd = ha_thd();
1141711449
if (thd && thd->killed) {
1141811450
DBUG_RETURN(HA_ERR_QUERY_INTERRUPTED);
@@ -11421,21 +11453,11 @@ int ha_rocksdb::index_init(uint idx, bool sorted MY_ATTRIBUTE((__unused__))) {
1142111453
Rdb_transaction *const tx = get_or_create_tx(table->in_use);
1142211454
assert(tx != nullptr);
1142311455

11424-
// when this table is being updated, decode all fields
11425-
m_converter->setup_field_decoders(table->read_set,
11426-
m_lock_rows == RDB_LOCK_WRITE);
11427-
11428-
if (!m_keyread_only) {
11429-
m_key_descr_arr[idx]->get_lookup_bitmap(table, &m_lookup_bitmap);
11430-
}
11431-
1143211456
// If m_lock_rows is not RDB_LOCK_NONE then we will be doing a get_for_update
1143311457
// when accessing the index, so don't acquire the snapshot right away.
1143411458
// Otherwise acquire the snapshot immediately.
1143511459
tx->acquire_snapshot(m_lock_rows == RDB_LOCK_NONE);
1143611460

11437-
active_index = idx;
11438-
1143911461
DBUG_RETURN(HA_EXIT_SUCCESS);
1144011462
}
1144111463

@@ -11446,9 +11468,9 @@ int ha_rocksdb::index_init(uint idx, bool sorted MY_ATTRIBUTE((__unused__))) {
1144611468
int ha_rocksdb::index_end() {
1144711469
DBUG_ENTER_FUNC();
1144811470

11449-
release_scan_iterator();
11471+
m_need_build_decoder = false;
1145011472

11451-
bitmap_free(&m_lookup_bitmap);
11473+
release_scan_iterator();
1145211474

1145311475
active_index = MAX_KEY;
1145411476
in_range_check_pushed_down = false;
@@ -11801,6 +11823,8 @@ void ha_rocksdb::position(const uchar *const record) {
1180111823
int ha_rocksdb::rnd_pos(uchar *const buf, uchar *const pos) {
1180211824
DBUG_ENTER_FUNC();
1180311825

11826+
check_build_decoder();
11827+
1180411828
int rc;
1180511829
size_t len;
1180611830

@@ -16445,6 +16469,8 @@ class Mrr_sec_key_rowid_source : public Mrr_rowid_source {
1644516469
int ha_rocksdb::multi_range_read_init(RANGE_SEQ_IF *seq, void *seq_init_param,
1644616470
uint n_ranges, uint mode,
1644716471
HANDLER_BUFFER *buf) {
16472+
m_need_build_decoder = true;
16473+
1644816474
int res;
1644916475

1645016476
if (!current_thd->optimizer_switch_flag(OPTIMIZER_SWITCH_MRR) ||
@@ -16691,6 +16717,8 @@ void ha_rocksdb::mrr_free_rows() {
1669116717
}
1669216718

1669316719
int ha_rocksdb::multi_range_read_next(char **range_info) {
16720+
check_build_decoder();
16721+
1669416722
if (mrr_uses_default_impl) {
1669516723
return handler::multi_range_read_next(range_info);
1669616724
}

storage/rocksdb/ha_rocksdb.h

Lines changed: 7 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -380,13 +380,6 @@ class ha_rocksdb : public my_core::handler {
380380

381381
void set_last_rowkey(const uchar *const old_data);
382382

383-
/*
384-
For the active index, indicates which columns must be covered for the
385-
current lookup to be covered. If the bitmap field is null, that means this
386-
index does not cover the current lookup for any record.
387-
*/
388-
MY_BITMAP m_lookup_bitmap;
389-
390383
int alloc_key_buffers(const TABLE *const table_arg,
391384
const Rdb_tbl_def *const tbl_def_arg,
392385
bool alloc_alter_buffers = false)
@@ -953,7 +946,7 @@ class ha_rocksdb : public my_core::handler {
953946
dd::Table *table_def);
954947
int truncate_table(Rdb_tbl_def *tbl_def,
955948
const std::string &actual_user_table_name,
956-
const TABLE *table_arg, ulonglong auto_increment_value,
949+
TABLE *table_arg, ulonglong auto_increment_value,
957950
dd::Table *table_def);
958951
bool check_if_incompatible_data(HA_CREATE_INFO *const info,
959952
uint table_changes) override
@@ -1009,6 +1002,9 @@ class ha_rocksdb : public my_core::handler {
10091002
void update_row_read(ulonglong count);
10101003
static void inc_covered_sk_lookup();
10111004

1005+
void build_decoder();
1006+
void check_build_decoder();
1007+
10121008
public:
10131009
virtual void rpl_before_delete_rows() override;
10141010
virtual void rpl_after_delete_rows() override;
@@ -1041,6 +1037,9 @@ class ha_rocksdb : public my_core::handler {
10411037
/* Flags tracking if we are inside different replication operation */
10421038
bool m_in_rpl_delete_rows;
10431039
bool m_in_rpl_update_rows;
1040+
1041+
/* Need to build decoder on next read operation */
1042+
bool m_need_build_decoder;
10441043
};
10451044

10461045
/*

storage/rocksdb/nosql_access.cc

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1306,7 +1306,8 @@ void INLINE_ATTR select_exec::scan_value() {
13061306
m_key_def->get_lookup_bitmap(m_table, &m_lookup_bitmap);
13071307
}
13081308

1309-
m_converter->setup_field_decoders(m_table->read_set, m_index);
1309+
m_converter->setup_field_decoders(m_table->read_set, m_index,
1310+
false /* keyread_only */);
13101311
}
13111312

13121313
bool INLINE_ATTR select_exec::run() {

storage/rocksdb/rdb_converter.cc

Lines changed: 15 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -313,6 +313,7 @@ Rdb_converter::~Rdb_converter() {
313313
m_encoder_arr = nullptr;
314314
// These are needed to suppress valgrind errors in rocksdb.partition
315315
m_storage_record.mem_free();
316+
bitmap_free(&m_lookup_bitmap);
316317
}
317318

318319
/*
@@ -337,31 +338,30 @@ void Rdb_converter::get_storage_type(Rdb_field_encoder *const encoder,
337338
Setup which fields will be unpacked when reading rows
338339
339340
@detail
340-
Three special cases when we still unpack all fields:
341+
Two special cases when we still unpack all fields:
341342
- When client requires decode_all_fields, such as this table is being
342343
updated (m_lock_rows==RDB_LOCK_WRITE).
343344
- When @@rocksdb_verify_row_debug_checksums is ON (In this mode, we need
344345
to read all fields to find whether there is a row checksum at the end. We
345346
could skip the fields instead of decoding them, but currently we do
346347
decoding.)
347-
- On index merge as bitmap is cleared during that operation
348348
349349
@seealso
350350
Rdb_converter::setup_field_encoders()
351351
Rdb_converter::convert_record_from_storage_format()
352352
*/
353353
void Rdb_converter::setup_field_decoders(const MY_BITMAP *field_map,
354+
uint active_index, bool keyread_only,
354355
bool decode_all_fields) {
355356
m_key_requested = false;
356357
m_decoders_vect.clear();
358+
bitmap_free(&m_lookup_bitmap);
357359
int last_useful = 0;
358360
int skip_size = 0;
359361

360362
for (uint i = 0; i < m_table->s->fields; i++) {
361-
// bitmap is cleared on index merge, but it still needs to decode columns
362363
bool field_requested =
363364
decode_all_fields || m_verify_row_debug_checksums ||
364-
bitmap_is_clear_all(field_map) ||
365365
bitmap_is_set(field_map, m_table->field[i]->field_index());
366366

367367
// We only need the decoder if the whole record is stored.
@@ -397,6 +397,11 @@ void Rdb_converter::setup_field_decoders(const MY_BITMAP *field_map,
397397
// skipping. Remove them.
398398
m_decoders_vect.erase(m_decoders_vect.begin() + last_useful,
399399
m_decoders_vect.end());
400+
401+
if (!keyread_only && active_index != m_table->s->primary_key) {
402+
m_tbl_def->m_key_descr_arr[active_index]->get_lookup_bitmap(
403+
m_table, &m_lookup_bitmap);
404+
}
400405
}
401406

402407
void Rdb_converter::setup_field_encoders() {
@@ -584,6 +589,11 @@ int Rdb_converter::convert_record_from_storage_format(
584589
const rocksdb::Slice *const key_slice,
585590
const rocksdb::Slice *const value_slice, uchar *const dst,
586591
bool decode_value = true) {
592+
bool skip_value = !decode_value || get_decode_fields()->size() == 0;
593+
if (!m_key_requested && skip_value) {
594+
return HA_EXIT_SUCCESS;
595+
}
596+
587597
int err = HA_EXIT_SUCCESS;
588598

589599
Rdb_string_reader value_slice_reader(value_slice);
@@ -605,7 +615,7 @@ int Rdb_converter::convert_record_from_storage_format(
605615
}
606616
}
607617

608-
if (!decode_value || get_decode_fields()->size() == 0) {
618+
if (skip_value) {
609619
// We are done
610620
return HA_EXIT_SUCCESS;
611621
}

storage/rocksdb/rdb_converter.h

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -134,8 +134,8 @@ class Rdb_converter {
134134
Rdb_converter &operator=(const Rdb_converter &decoder) = delete;
135135
~Rdb_converter();
136136

137-
void setup_field_decoders(const MY_BITMAP *field_map,
138-
bool decode_all_fields = false);
137+
void setup_field_decoders(const MY_BITMAP *field_map, uint active_index,
138+
bool keyread_only, bool decode_all_fields = false);
139139

140140
int decode(const std::shared_ptr<Rdb_key_def> &key_def, uchar *dst,
141141
const rocksdb::Slice *key_slice, const rocksdb::Slice *value_slice,
@@ -172,6 +172,8 @@ class Rdb_converter {
172172
return &m_decoders_vect;
173173
}
174174

175+
const MY_BITMAP *get_lookup_bitmap() { return &m_lookup_bitmap; }
176+
175177
int decode_value_header_for_pk(Rdb_string_reader *reader,
176178
const std::shared_ptr<Rdb_key_def> &pk_def,
177179
rocksdb::Slice *unpack_slice);
@@ -241,5 +243,11 @@ class Rdb_converter {
241243
my_core::ha_rows m_row_checksums_checked;
242244
// buffer to hold data during encode_value_slice
243245
String m_storage_record;
246+
/*
247+
For the active index, indicates which columns must be covered for the
248+
current lookup to be covered. If the bitmap field is null, that means this
249+
index does not cover the current lookup for any record.
250+
*/
251+
MY_BITMAP m_lookup_bitmap;
244252
};
245253
} // namespace myrocks

0 commit comments

Comments
 (0)