Skip to content

Commit 44b1a31

Browse files
yizhang82 authored and inikep committed
Skip decoding completely during count and improve SK count perf by 20x
Summary: MySQL 8.0 takes a new approach to counting rows - it calls handler::records / handler::records_from_index, which then use rnd_init+rnd_next / index_init+index_next to iterate through everything. If you try count(*) you'll see that MyRocks 8.0 is actually much slower than 5.6 when iterating a table with secondary keys (the optimizer picks a secondary key for whatever reason). This is because MySQL no longer calls `handler::extra(KEY_READ)`, so we no longer get the hint to decode only the key - instead we see that the SK is not covering and fall back to the PK, which slows the scan down dramatically. This change fixes it by adding an m_iteration_only = true hint - with this hint we just iterate quickly through everything, without any decoding or falling back to the PK. Note that even with this fix, if you compare MyRocks 8.0 with InnoDB 8.0, you'll see that InnoDB 8.0 is still much faster - this is because InnoDB overrides the implementation and iterates over the records directly in parallel (with a default parallelism of 4). This is something we can do as well. Reviewed By: Pushapgl Differential Revision: D26272733
1 parent d1b129e commit 44b1a31

File tree

2 files changed

+36
-1
lines changed

2 files changed

+36
-1
lines changed

storage/rocksdb/ha_rocksdb.cc

Lines changed: 29 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,7 @@
5050
#include "my_bit.h"
5151
#include "my_stacktrace.h"
5252
#include "my_sys.h"
53+
#include "scope_guard.h"
5354
#include "sql/binlog.h"
5455
#include "sql/debug_sync.h"
5556
#include "sql-common/json_dom.h"
@@ -7108,6 +7109,7 @@ ha_rocksdb::ha_rocksdb(my_core::handlerton *const hton,
71087109
m_pack_buffer(nullptr),
71097110
m_lock_rows(RDB_LOCK_NONE),
71107111
m_keyread_only(false),
7112+
m_iteration_only(false),
71117113
m_insert_with_update(false),
71127114
m_dup_key_found(false),
71137115
mrr_rowid_reader(nullptr),
@@ -9085,6 +9087,11 @@ int ha_rocksdb::secondary_index_read(const int keyno, uchar *const buf) {
90859087

90869088
/* Check if we've ran out of records of this index */
90879089
if (m_key_descr_arr[keyno]->covers_key(key)) {
9090+
if (m_iteration_only) {
9091+
table->m_status = 0;
9092+
return 0;
9093+
}
9094+
90889095
int rc = 0;
90899096

90909097
// TODO: We could here check if we have ran out of range we're scanning
@@ -9756,6 +9763,22 @@ int ha_rocksdb::get_row_by_rowid(uchar *const buf, const char *const rowid,
97569763
DBUG_RETURN(rc);
97579764
}
97589765

9766+
/**
  Override of handler::records() that runs the base-class implementation
  with the m_iteration_only hint switched on.

  While the hint is set, rnd_next_with_direction() marks the row as found
  and leaves its loop without taking the locking/decoding path that follows.

  @param num_rows  passed through unchanged to handler::records()
  @return          the value returned by handler::records()
*/
int ha_rocksdb::records(ha_rows *num_rows) {
9767+
/* Enable the iterate-without-decoding hint for the base-class call. */
m_iteration_only = true;
9768+
/*
  The lambda handed to create_scope_guard() clears the hint again;
  presumably it runs when iteration_guard goes out of scope (see
  scope_guard.h) -- confirm.
*/
auto iteration_guard =
9769+
create_scope_guard([this]() { m_iteration_only = false; });
9770+
/*
  NOTE(review): although named `count`, this holds the int result of the
  base call -- likely a status code rather than a row count; confirm.
*/
int count = handler::records(num_rows);
9771+
return count;
9772+
}
9773+
9774+
/**
  Override of handler::records_from_index() that runs the base-class
  implementation with the m_iteration_only hint switched on.

  While the hint is set, secondary_index_read() returns success for a key
  covered by the index without going on to the rest of its per-row work.

  @param num_rows  passed through unchanged to handler::records_from_index()
  @param index     passed through unchanged to handler::records_from_index()
  @return          the value returned by handler::records_from_index()
*/
int ha_rocksdb::records_from_index(ha_rows *num_rows, uint index) {
9775+
/* Enable the iterate-without-decoding hint for the base-class call. */
m_iteration_only = true;
9776+
/*
  The lambda handed to create_scope_guard() clears the hint again;
  presumably it runs when iteration_guard goes out of scope (see
  scope_guard.h) -- confirm.
*/
auto iteration_guard =
9777+
create_scope_guard([this]() { m_iteration_only = false; });
9778+
/*
  NOTE(review): although named `count`, this holds the int result of the
  base call -- likely a status code rather than a row count; confirm.
*/
int count = handler::records_from_index(num_rows, index);
9779+
return count;
9780+
}
9781+
97599782
/**
97609783
@return
97619784
HA_EXIT_SUCCESS OK
@@ -11301,7 +11324,7 @@ int ha_rocksdb::rnd_next(uchar *const buf) {
1130111324
int ha_rocksdb::rnd_next_with_direction(uchar *const buf, bool move_forward) {
1130211325
DBUG_ENTER_FUNC();
1130311326

11304-
int rc;
11327+
int rc = 0;
1130511328
THD *thd = ha_thd();
1130611329

1130711330
table->m_status = STATUS_NOT_FOUND;
@@ -11349,6 +11372,11 @@ int ha_rocksdb::rnd_next_with_direction(uchar *const buf, bool move_forward) {
1134911372
break;
1135011373
}
1135111374

11375+
if (m_iteration_only) {
11376+
table->m_status = 0;
11377+
break;
11378+
}
11379+
1135211380
if (m_lock_rows != RDB_LOCK_NONE) {
1135311381
/*
1135411382
Lock the row we've just read.

storage/rocksdb/ha_rocksdb.h

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -272,6 +272,9 @@ class ha_rocksdb : public my_core::handler {
272272
/* true means we're doing an index-only read. false means otherwise. */
273273
bool m_keyread_only;
274274

275+
/* We only iterate but don't need to decode anything */
276+
bool m_iteration_only;
277+
275278
bool m_skip_scan_it_next_call;
276279

277280
/* true means we are accessing the first row after a snapshot was created */
@@ -1005,6 +1008,10 @@ class ha_rocksdb : public my_core::handler {
10051008
void build_decoder();
10061009
void check_build_decoder();
10071010

1011+
protected:
1012+
int records(ha_rows *num_rows) override;
1013+
int records_from_index(ha_rows *num_rows, uint index) override;
1014+
10081015
public:
10091016
virtual void rpl_before_delete_rows() override;
10101017
virtual void rpl_after_delete_rows() override;

0 commit comments

Comments
 (0)