From 307be4df492cc84097afb47dab711535930d88a5 Mon Sep 17 00:00:00 2001 From: Laurynas Biveinis Date: Fri, 26 Apr 2024 14:47:00 +0300 Subject: [PATCH 1/3] Add range locking support for MyRocks - Add new sysvar, rocksdb_use_range_locking. When it's on: - RocksDB is initialized to use the range-locking lock manager - For all DML operations (including SELECT ... FOR UPDATE), the scanned range will be locked before reading/modifying rows - For all DML operations, MyRocks will read and modify the latest committed data, just like InnoDB does, because there is no snapshot checking if range locking is used. - Introduce a RocksDB locking iterator, which reads the rows, locks the range, and re-reads the rows. It is used for queries that do not have a finite range to scan, like UPDATE t1 ... ORDER BY t1.key LIMIT n - Add another new sysvar, rocksdb_use_range_lock_manager_as_point, which uses the point locking algorithm but the lock manager used is the range one. - Add enabled rocksdb_use_range_locking as another MTR rocksdb suite test combination. 
Co-authored-by: Sergei Petrunia Co-authored-by: Laurynas Biveinis --- mysql-test/r/mysqld--help-notwin.result | 10 + mysql-test/suite/rocksdb/combinations | 3 + .../rocksdb/include/have_range_locking.inc | 3 + .../rocksdb/include/not_range_locking.inc | 5 + .../include/select_from_is_rowlocks.inc | 99 +++ .../rocksdb/r/hermitage-range_locking.result | 652 ++++++++++++++++ ...243_transactionStatus-range_locking.result | 182 +++++ ...level_repeatable_read-range_locking.result | 106 +++ .../suite/rocksdb/r/range_locking.result | 596 +++++++++++++++ .../rocksdb/r/range_locking_conc_test.result | 4 + .../r/range_locking_deadlock_tracking.result | 453 +++++++++++ .../rocksdb/r/range_locking_escalation.result | 27 + .../r/range_locking_partial_index.result | 202 +++++ .../r/range_locking_refresh_iter.result | 50 ++ .../rocksdb/r/range_locking_rev_cf.result | 556 ++++++++++++++ .../r/range_locking_seek_for_update.result | 291 +++++++ .../r/range_locking_seek_for_update2.result | 142 ++++ ...nge_locking_seek_for_update2_rev_cf.result | 142 ++++ .../r/range_locking_shared_locks.result | 251 ++++++ mysql-test/suite/rocksdb/r/rocksdb.result | 3 + .../rocksdb/r/rocksdb_read_free_rpl.result | 2 +- .../rocksdb/r/rocksdb_timeout_rollback.result | 3 + .../rocksdb/r/select_count_for_update.result | 4 +- mysql-test/suite/rocksdb/r/trx_info.result | 7 +- mysql-test/suite/rocksdb/r/unique_sec.result | 4 + .../suite/rocksdb/r/unique_sec_rev_cf.result | 4 + .../suite/rocksdb/t/deadlock_tracking.test | 7 +- .../t/drop_cf_before_show_deadlock_info.test | 4 + .../rocksdb/t/hermitage-range_locking.test | 21 + mysql-test/suite/rocksdb/t/hermitage.inc | 14 +- mysql-test/suite/rocksdb/t/hermitage.test | 3 + mysql-test/suite/rocksdb/t/i_s_deadlock.test | 4 + mysql-test/suite/rocksdb/t/issue111.test | 3 + ...ue243_transactionStatus-range_locking.test | 10 + .../rocksdb/t/issue243_transactionStatus.test | 4 + .../level_repeatable_read-range_locking.test | 9 + 
.../rocksdb/t/level_repeatable_read.test | 3 + mysql-test/suite/rocksdb/t/lock_info.test | 3 + .../suite/rocksdb/t/locking_issues.test | 3 + mysql-test/suite/rocksdb/t/max_row_locks.test | 1 + mysql-test/suite/rocksdb/t/range_locking.inc | 612 +++++++++++++++ mysql-test/suite/rocksdb/t/range_locking.test | 6 + .../t/range_locking_conc_test-master.opt | 1 + .../rocksdb/t/range_locking_conc_test.py | 520 +++++++++++++ .../rocksdb/t/range_locking_conc_test.test | 18 + .../rocksdb/t/range_locking_conc_test.txt | 91 +++ .../t/range_locking_deadlock_tracking.test | 196 +++++ .../t/range_locking_escalation-master.opt | 1 + .../rocksdb/t/range_locking_escalation.test | 39 + .../t/range_locking_partial_index.test | 120 +++ .../rocksdb/t/range_locking_refresh_iter.test | 70 ++ .../suite/rocksdb/t/range_locking_rev_cf.test | 12 + .../t/range_locking_seek_for_update.test | 308 ++++++++ .../t/range_locking_seek_for_update2.inc | 55 ++ .../t/range_locking_seek_for_update2.test | 4 + ...range_locking_seek_for_update2_rev_cf.test | 4 + ...range_locking_seek_for_update_iter_end.inc | 41 + .../rocksdb/t/range_locking_shared_locks.test | 200 +++++ mysql-test/suite/rocksdb/t/rocksdb.test | 3 + .../rocksdb/t/rocksdb_concurrent_delete.test | 4 + mysql-test/suite/rocksdb/t/rocksdb_locks.test | 3 + .../rocksdb/t/rocksdb_read_free_rpl.test | 2 +- .../rocksdb/t/rocksdb_timeout_rollback.test | 2 + .../suite/rocksdb/t/rpl_row_not_found.inc | 2 + .../rocksdb/t/select_count_for_update.test | 14 + .../rocksdb/t/select_lock_in_share_mode.test | 3 + .../suite/rocksdb/t/skip_locked_nowait.test | 3 + mysql-test/suite/rocksdb/t/trx_info.test | 6 +- mysql-test/suite/rocksdb/t/unique_check.test | 5 + mysql-test/suite/rocksdb/t/unique_sec.inc | 10 +- .../suite/rocksdb/t/varbinary_format.test | 4 + .../suite/rocksdb/t/varchar_format.test | 2 + .../r/rocksdb_max_lock_memory_basic.result | 7 + ...e_range_lock_manager_as_point_basic.result | 7 + .../r/rocksdb_use_range_locking_basic.result | 7 + 
.../t/rocksdb_max_lock_memory_basic.test | 5 + ...use_range_lock_manager_as_point_basic.test | 5 + .../t/rocksdb_use_range_locking_basic.test | 5 + storage/rocksdb/CMakeLists.txt | 1 + storage/rocksdb/get_rocksdb_files.sh | 4 +- storage/rocksdb/ha_rocksdb.cc | 715 ++++++++++++++++-- storage/rocksdb/ha_rocksdb.h | 39 +- .../locking-iterator-partial-index.txt | 120 +++ storage/rocksdb/nosql_access.cc | 4 +- storage/rocksdb/rdb_i_s.cc | 89 ++- storage/rocksdb/rdb_iterator.cc | 35 +- storage/rocksdb/rdb_iterator.h | 16 +- storage/rocksdb/rdb_locking_iter.cc | 280 +++++++ storage/rocksdb/rdb_locking_iter.h | 143 ++++ storage/rocksdb/rdb_utils.cc | 24 + storage/rocksdb/rdb_utils.h | 14 + 91 files changed, 7654 insertions(+), 117 deletions(-) create mode 100644 mysql-test/suite/rocksdb/include/have_range_locking.inc create mode 100644 mysql-test/suite/rocksdb/include/not_range_locking.inc create mode 100644 mysql-test/suite/rocksdb/include/select_from_is_rowlocks.inc create mode 100644 mysql-test/suite/rocksdb/r/hermitage-range_locking.result create mode 100644 mysql-test/suite/rocksdb/r/issue243_transactionStatus-range_locking.result create mode 100644 mysql-test/suite/rocksdb/r/level_repeatable_read-range_locking.result create mode 100644 mysql-test/suite/rocksdb/r/range_locking.result create mode 100644 mysql-test/suite/rocksdb/r/range_locking_conc_test.result create mode 100644 mysql-test/suite/rocksdb/r/range_locking_deadlock_tracking.result create mode 100644 mysql-test/suite/rocksdb/r/range_locking_escalation.result create mode 100644 mysql-test/suite/rocksdb/r/range_locking_partial_index.result create mode 100644 mysql-test/suite/rocksdb/r/range_locking_refresh_iter.result create mode 100644 mysql-test/suite/rocksdb/r/range_locking_rev_cf.result create mode 100644 mysql-test/suite/rocksdb/r/range_locking_seek_for_update.result create mode 100644 mysql-test/suite/rocksdb/r/range_locking_seek_for_update2.result create mode 100644 
mysql-test/suite/rocksdb/r/range_locking_seek_for_update2_rev_cf.result create mode 100644 mysql-test/suite/rocksdb/r/range_locking_shared_locks.result create mode 100644 mysql-test/suite/rocksdb/t/hermitage-range_locking.test create mode 100644 mysql-test/suite/rocksdb/t/issue243_transactionStatus-range_locking.test create mode 100644 mysql-test/suite/rocksdb/t/level_repeatable_read-range_locking.test create mode 100644 mysql-test/suite/rocksdb/t/range_locking.inc create mode 100644 mysql-test/suite/rocksdb/t/range_locking.test create mode 100644 mysql-test/suite/rocksdb/t/range_locking_conc_test-master.opt create mode 100644 mysql-test/suite/rocksdb/t/range_locking_conc_test.py create mode 100644 mysql-test/suite/rocksdb/t/range_locking_conc_test.test create mode 100644 mysql-test/suite/rocksdb/t/range_locking_conc_test.txt create mode 100644 mysql-test/suite/rocksdb/t/range_locking_deadlock_tracking.test create mode 100644 mysql-test/suite/rocksdb/t/range_locking_escalation-master.opt create mode 100644 mysql-test/suite/rocksdb/t/range_locking_escalation.test create mode 100644 mysql-test/suite/rocksdb/t/range_locking_partial_index.test create mode 100644 mysql-test/suite/rocksdb/t/range_locking_refresh_iter.test create mode 100644 mysql-test/suite/rocksdb/t/range_locking_rev_cf.test create mode 100644 mysql-test/suite/rocksdb/t/range_locking_seek_for_update.test create mode 100644 mysql-test/suite/rocksdb/t/range_locking_seek_for_update2.inc create mode 100644 mysql-test/suite/rocksdb/t/range_locking_seek_for_update2.test create mode 100644 mysql-test/suite/rocksdb/t/range_locking_seek_for_update2_rev_cf.test create mode 100644 mysql-test/suite/rocksdb/t/range_locking_seek_for_update_iter_end.inc create mode 100644 mysql-test/suite/rocksdb/t/range_locking_shared_locks.test create mode 100644 mysql-test/suite/rocksdb_sys_vars/r/rocksdb_max_lock_memory_basic.result create mode 100644 
mysql-test/suite/rocksdb_sys_vars/r/rocksdb_use_range_lock_manager_as_point_basic.result create mode 100644 mysql-test/suite/rocksdb_sys_vars/r/rocksdb_use_range_locking_basic.result create mode 100644 mysql-test/suite/rocksdb_sys_vars/t/rocksdb_max_lock_memory_basic.test create mode 100644 mysql-test/suite/rocksdb_sys_vars/t/rocksdb_use_range_lock_manager_as_point_basic.test create mode 100644 mysql-test/suite/rocksdb_sys_vars/t/rocksdb_use_range_locking_basic.test create mode 100644 storage/rocksdb/locking-iterator-partial-index.txt create mode 100644 storage/rocksdb/rdb_locking_iter.cc create mode 100644 storage/rocksdb/rdb_locking_iter.h diff --git a/mysql-test/r/mysqld--help-notwin.result b/mysql-test/r/mysqld--help-notwin.result index 5031dfadf92f..0b39c928d6c6 100644 --- a/mysql-test/r/mysqld--help-notwin.result +++ b/mysql-test/r/mysqld--help-notwin.result @@ -2356,6 +2356,9 @@ The following options may be given as the first argument: and newtransaction will be started. --rocksdb-max-latest-deadlocks=# Maximum number of recent deadlocks to store + --rocksdb-max-lock-memory=# + Range-locking mode: Maximum amount of memory that locks + from all transactions can use at a time --rocksdb-max-log-file-size=# DBOptions::max_log_file_size for RocksDB --rocksdb-max-manifest-file-size=# @@ -2594,6 +2597,10 @@ The following options may be given as the first argument: RocksDB --rocksdb-use-io-uring Use io_uring for RocksDB + --rocksdb-use-range-lock-manager-as-point + Use Range Lock Manager as point + --rocksdb-use-range-locking + Use Range Locking --rocksdb-use-write-buffer-manager For experiment only. 
Use write buffer manager --rocksdb-validate-tables=# @@ -3932,6 +3939,7 @@ rocksdb-max-compaction-history 64 rocksdb-max-file-opening-threads 16 rocksdb-max-intrinsic-tmp-table-write-count 1000 rocksdb-max-latest-deadlocks 5 +rocksdb-max-lock-memory 1073741824 rocksdb-max-log-file-size 0 rocksdb-max-manifest-file-size 1073741824 rocksdb-max-manual-compactions 10 @@ -4011,6 +4019,8 @@ rocksdb-use-direct-reads FALSE rocksdb-use-fsync FALSE rocksdb-use-hyper-clock-cache FALSE rocksdb-use-io-uring FALSE +rocksdb-use-range-lock-manager-as-point FALSE +rocksdb-use-range-locking FALSE rocksdb-use-write-buffer-manager FALSE rocksdb-validate-tables 1 rocksdb-vector-index ON diff --git a/mysql-test/suite/rocksdb/combinations b/mysql-test/suite/rocksdb/combinations index 86d1afc696c7..ed9f9a0e103a 100644 --- a/mysql-test/suite/rocksdb/combinations +++ b/mysql-test/suite/rocksdb/combinations @@ -2,3 +2,6 @@ [rocksdb_ddse] default_dd_system_storage_engine = rocksdb + +[range_locking] +rocksdb_use_range_locking=1 diff --git a/mysql-test/suite/rocksdb/include/have_range_locking.inc b/mysql-test/suite/rocksdb/include/have_range_locking.inc new file mode 100644 index 000000000000..a8600daea77a --- /dev/null +++ b/mysql-test/suite/rocksdb/include/have_range_locking.inc @@ -0,0 +1,3 @@ +if (`select count(*) = 0 from performance_schema.session_variables where variable_name = 'rocksdb_use_range_locking' and variable_value = 'ON';`) { + --skip Test requires range locking +} diff --git a/mysql-test/suite/rocksdb/include/not_range_locking.inc b/mysql-test/suite/rocksdb/include/not_range_locking.inc new file mode 100644 index 000000000000..62c26b134bce --- /dev/null +++ b/mysql-test/suite/rocksdb/include/not_range_locking.inc @@ -0,0 +1,5 @@ +--let $_use_range_locking= `select @@rocksdb_use_range_locking` +if ($_use_range_locking == 1) +{ + --skip Test doesn't support range locking +} diff --git a/mysql-test/suite/rocksdb/include/select_from_is_rowlocks.inc 
b/mysql-test/suite/rocksdb/include/select_from_is_rowlocks.inc new file mode 100644 index 000000000000..cc6a328e566b --- /dev/null +++ b/mysql-test/suite/rocksdb/include/select_from_is_rowlocks.inc @@ -0,0 +1,99 @@ +--echo # select * from information_schema.rocksdb_locks; # With replacements by select_from_is_rowlocks.inc +# +# An include to print contents of I_S.ROCKSB_LOCKS +# +# Implicit "parameters" +# - Currently it prints locks on t1.PRIMARY +# +# Explicit "parameter" variables: +# - $TRX1_ID - print this transaction as "TRX1" +# - $TRX2_ID - print this transaction as "TRX2" +# +# - $select_from_is_rowlocks_current_trx_only +# - $order_by_rowkey +# +# - $SECOND_INDEX_NAME + +--disable_query_log +set @cf_id=(select column_family from information_schema.rocksdb_ddl + where table_name='t1' and index_name='PRIMARY'); +set @rtrx_id=(select transaction_id from information_schema.rocksdb_trx + where thread_id=connection_id()); +set @indexnr= (select lower(lpad(hex(index_number),8,'0')) from information_schema.rocksdb_ddl + where table_name='t1' and index_name='PRIMARY'); + +set @indexnr_next= (select lower(lpad(hex(index_number+1),8,'0')) + from information_schema.rocksdb_ddl + where table_name='t1' and index_name='PRIMARY'); + +let $extra_where = where 1; + +if ($select_from_is_rowlocks_current_trx_only) +{ + let $extra_where = where transaction_id=(select transaction_id from information_schema.rocksdb_trx where connection_id()=thread_id); +} + +## transaction column + +# If TRX1_ID is not specified, get the current transaction: +let $transaction_col= replace(transaction_id, @rtrx_id, "\$trx_id"); +if ($TRX1_ID) +{ + let $transaction_col = replace(transaction_id, '$TRX1_ID', "\$TRX1_ID"); +} + +if ($TRX2_ID) +{ + let $transaction_col = replace($transaction_col, '$TRX2_ID', "\$TRX2_ID"); +} + +## CF_ID column +let $cf_id_col= column_family_id; + +if ($SECOND_INDEX_NAME) +{ + eval set @cf2_id=(select column_family from information_schema.rocksdb_ddl + where 
table_name='t1' and index_name='$SECOND_INDEX_NAME'); + + let $cf_id_col= replace($cf_id_col, @cf2_id, "\$cf2_id"); +} +let $cf_id_col= replace($cf_id_col, @cf_id, "\$cf_id"); + +## KEY column +let $key_col= (`key`); +if ($SECOND_INDEX_NAME) +{ + eval set @indexnr2= (select lower(lpad(hex(index_number),8,'0')) + from information_schema.rocksdb_ddl + where table_name='t1' and index_name='$SECOND_INDEX_NAME'); + + eval set @indexnr2_next= (select lower(lpad(hex(index_number+1),8,'0')) + from information_schema.rocksdb_ddl + where table_name='t1' and index_name='$SECOND_INDEX_NAME'); + + let $key_col = replace($key_col, @indexnr2, '\${indexnr2}'); + let $key_col = replace($key_col, @indexnr2_next, '\${indexnr2+1}'); +} + +let $key_col = replace($key_col, @indexnr, '\${indexnr}'); +let $key_col = replace($key_col, @indexnr_next, '\${indexnr+1}'); + +## ORDER BY +if ($order_by_rowkey) +{ + let $extra_order_by = ORDER BY 3,2; +} + +if (!$order_by_rowkey) +{ + --sorted_result +} + +eval select + $cf_id_col as COLUMN_FAMILY_ID, + $transaction_col as TRANSACTION_ID, + $key_col as `KEY`, + mode +from information_schema.rocksdb_locks $extra_where $extra_order_by; + +--enable_query_log diff --git a/mysql-test/suite/rocksdb/r/hermitage-range_locking.result b/mysql-test/suite/rocksdb/r/hermitage-range_locking.result new file mode 100644 index 000000000000..3938fa38b6cb --- /dev/null +++ b/mysql-test/suite/rocksdb/r/hermitage-range_locking.result @@ -0,0 +1,652 @@ +DROP TABLE IF EXISTS test; +connect con1,localhost,root,,; +SET SESSION TRANSACTION ISOLATION LEVEL READ COMMITTED; +connect con2,localhost,root,,; +SET SESSION TRANSACTION ISOLATION LEVEL READ COMMITTED; +connect con3,localhost,root,,; +SET SESSION TRANSACTION ISOLATION LEVEL READ COMMITTED; +connection con1; +create table test (id int primary key, value int) engine=rocksdb; +connection con1; +truncate table test; +insert into test (id, value) values (1, 10), (2, 20); +begin; +connection con2; +begin; +connection 
con3; +begin; +connection con1; +select * from test; +id value +1 10 +2 20 +update test set value = 101 where id = 1; +connection con2; +select * from test; +id value +1 10 +2 20 +connection con1; +rollback; +connection con2; +select * from test; +id value +1 10 +2 20 +commit; +connection con1; +truncate table test; +insert into test (id, value) values (1, 10), (2, 20); +begin; +connection con2; +begin; +connection con3; +begin; +connection con1; +update test set value = 101 where id = 1; +connection con2; +select * from test; +id value +1 10 +2 20 +connection con1; +update test set value = 11 where id = 1; +commit; +connection con2; +select * from test; +id value +1 11 +2 20 +commit; +connection con1; +truncate table test; +insert into test (id, value) values (1, 10), (2, 20); +begin; +connection con2; +begin; +connection con3; +begin; +connection con1; +update test set value = 11 where id = 1; +connection con2; +update test set value = 22 where id = 2; +connection con1; +select * from test where id = 2; +id value +2 20 +connection con2; +select * from test where id = 1; +id value +1 10 +connection con1; +commit; +connection con2; +commit; +connection con1; +truncate table test; +insert into test (id, value) values (1, 10), (2, 20); +begin; +connection con2; +begin; +connection con3; +begin; +connection con1; +update test set value = 11 where id = 1; +update test set value = 19 where id = 2; +connection con2; +update test set value = 12 where id = 1; +connection con1; +commit; +connection con2; +connection con3; +select * from test; +id value +1 11 +2 19 +connection con2; +update test set value = 18 where id = 2; +connection con3; +select * from test; +id value +1 11 +2 19 +connection con2; +commit; +connection con3; +select * from test; +id value +1 12 +2 18 +commit; +connection con1; +truncate table test; +insert into test (id, value) values (1, 10), (2, 20); +begin; +connection con2; +begin; +connection con3; +begin; +connection con1; +select * from test where 
value = 30; +id value +connection con2; +insert into test (id, value) values(3, 30); +commit; +connection con1; +select * from test where value % 3 = 0; +id value +3 30 +commit; +connection con1; +truncate table test; +insert into test (id, value) values (1, 10), (2, 20); +begin; +connection con2; +begin; +connection con3; +begin; +connection con1; +update test set value = value + 10; +connection con2; +select variable_value into @a from performance_schema.global_status where variable_name='rocksdb_snapshot_conflict_errors'; +select * from test; +id value +1 10 +2 20 +delete from test where value = 20; +connection con1; +commit; +connection con2; +select * from test; +id value +2 30 +commit; +connection con1; +truncate table test; +insert into test (id, value) values (1, 10), (2, 20); +begin; +connection con2; +begin; +connection con3; +begin; +connection con1; +select * from test where id = 1; +id value +1 10 +connection con2; +select * from test where id = 1; +id value +1 10 +connection con1; +update test set value = 11 where id = 1; +connection con2; +update test set value = 12 where id = 1; +connection con1; +commit; +connection con2; +select * from test; +id value +1 12 +2 20 +commit; +connection con1; +truncate table test; +insert into test (id, value) values (1, 10), (2, 20); +begin; +connection con2; +begin; +connection con3; +begin; +connection con1; +select * from test where id = 1; +id value +1 10 +connection con2; +select * from test where id = 1; +id value +1 10 +select * from test where id = 2; +id value +2 20 +update test set value = 12 where id = 1; +update test set value = 18 where id = 2; +commit; +connection con1; +select * from test where id = 2; +id value +2 18 +commit; +connection con1; +truncate table test; +insert into test (id, value) values (1, 10), (2, 20); +begin; +connection con2; +begin; +connection con3; +begin; +connection con1; +select * from test where value % 5 = 0; +id value +1 10 +2 20 +connection con2; +update test set value = 
12 where value = 10; +commit; +connection con1; +select * from test where value % 3 = 0; +id value +1 12 +commit; +connection con1; +truncate table test; +insert into test (id, value) values (1, 10), (2, 20); +begin; +connection con2; +begin; +connection con3; +begin; +connection con1; +select * from test where id = 1; +id value +1 10 +connection con2; +select * from test; +id value +1 10 +2 20 +update test set value = 12 where id = 1; +update test set value = 18 where id = 2; +commit; +connection con1; +delete from test where value = 20; +select * from test where id = 2; +id value +2 18 +commit; +connection con1; +truncate table test; +insert into test (id, value) values (1, 10), (2, 20); +begin; +connection con2; +begin; +connection con3; +begin; +connection con1; +select * from test where id in (1,2); +id value +1 10 +2 20 +connection con2; +select * from test where id in (1,2); +id value +1 10 +2 20 +connection con1; +update test set value = 11 where id = 1; +connection con2; +update test set value = 21 where id = 2; +connection con1; +commit; +connection con2; +commit; +connection con1; +truncate table test; +insert into test (id, value) values (1, 10), (2, 20); +begin; +connection con2; +begin; +connection con3; +begin; +connection con1; +select * from test where value % 3 = 0; +id value +connection con2; +select * from test where value % 3 = 0; +id value +connection con1; +insert into test (id, value) values(3, 30); +connection con2; +insert into test (id, value) values(4, 42); +connection con1; +commit; +connection con2; +commit; +select * from test where value % 3 = 0; +id value +3 30 +4 42 +connection con1; +select * from test where value % 3 = 0; +id value +3 30 +4 42 +connection default; +drop table test; +disconnect con1; +disconnect con2; +disconnect con3; +DROP TABLE IF EXISTS test; +connect con1,localhost,root,,; +SET SESSION TRANSACTION ISOLATION LEVEL REPEATABLE READ; +connect con2,localhost,root,,; +SET SESSION TRANSACTION ISOLATION LEVEL 
REPEATABLE READ; +connect con3,localhost,root,,; +SET SESSION TRANSACTION ISOLATION LEVEL REPEATABLE READ; +connection con1; +create table test (id int primary key, value int) engine=rocksdb; +connection con1; +truncate table test; +insert into test (id, value) values (1, 10), (2, 20); +begin; +connection con2; +begin; +connection con3; +begin; +connection con1; +select * from test; +id value +1 10 +2 20 +update test set value = 101 where id = 1; +connection con2; +select * from test; +id value +1 10 +2 20 +connection con1; +rollback; +connection con2; +select * from test; +id value +1 10 +2 20 +commit; +connection con1; +truncate table test; +insert into test (id, value) values (1, 10), (2, 20); +begin; +connection con2; +begin; +connection con3; +begin; +connection con1; +update test set value = 101 where id = 1; +connection con2; +select * from test; +id value +1 10 +2 20 +connection con1; +update test set value = 11 where id = 1; +commit; +connection con2; +select * from test; +id value +1 10 +2 20 +commit; +connection con1; +truncate table test; +insert into test (id, value) values (1, 10), (2, 20); +begin; +connection con2; +begin; +connection con3; +begin; +connection con1; +update test set value = 11 where id = 1; +connection con2; +update test set value = 22 where id = 2; +connection con1; +select * from test where id = 2; +id value +2 20 +connection con2; +select * from test where id = 1; +id value +1 10 +connection con1; +commit; +connection con2; +commit; +connection con1; +truncate table test; +insert into test (id, value) values (1, 10), (2, 20); +begin; +connection con2; +begin; +connection con3; +begin; +connection con1; +update test set value = 11 where id = 1; +update test set value = 19 where id = 2; +connection con2; +update test set value = 12 where id = 1; +connection con1; +commit; +connection con2; +connection con3; +select * from test; +id value +1 11 +2 19 +connection con2; +update test set value = 18 where id = 2; +connection con3; 
+select * from test; +id value +1 11 +2 19 +connection con2; +commit; +connection con3; +select * from test; +id value +1 11 +2 19 +commit; +connection con1; +truncate table test; +insert into test (id, value) values (1, 10), (2, 20); +begin; +connection con2; +begin; +connection con3; +begin; +connection con1; +select * from test where value = 30; +id value +connection con2; +insert into test (id, value) values(3, 30); +commit; +connection con1; +select * from test where value % 3 = 0; +id value +commit; +connection con1; +truncate table test; +insert into test (id, value) values (1, 10), (2, 20); +begin; +connection con2; +begin; +connection con3; +begin; +connection con1; +update test set value = value + 10; +connection con2; +select variable_value into @a from performance_schema.global_status where variable_name='rocksdb_snapshot_conflict_errors'; +select * from test; +id value +1 10 +2 20 +delete from test where value = 20; +connection con1; +commit; +connection con2; +select * from test; +id value +2 20 +commit; +connection con1; +truncate table test; +insert into test (id, value) values (1, 10), (2, 20); +begin; +connection con2; +begin; +connection con3; +begin; +connection con1; +select * from test where id = 1; +id value +1 10 +connection con2; +select * from test where id = 1; +id value +1 10 +connection con1; +update test set value = 11 where id = 1; +connection con2; +update test set value = 12 where id = 1; +connection con1; +commit; +connection con2; +select * from test; +id value +1 12 +2 20 +commit; +connection con1; +truncate table test; +insert into test (id, value) values (1, 10), (2, 20); +begin; +connection con2; +begin; +connection con3; +begin; +connection con1; +select * from test where id = 1; +id value +1 10 +connection con2; +select * from test where id = 1; +id value +1 10 +select * from test where id = 2; +id value +2 20 +update test set value = 12 where id = 1; +update test set value = 18 where id = 2; +commit; +connection con1; 
+select * from test where id = 2; +id value +2 20 +commit; +connection con1; +truncate table test; +insert into test (id, value) values (1, 10), (2, 20); +begin; +connection con2; +begin; +connection con3; +begin; +connection con1; +select * from test where value % 5 = 0; +id value +1 10 +2 20 +connection con2; +update test set value = 12 where value = 10; +commit; +connection con1; +select * from test where value % 3 = 0; +id value +commit; +connection con1; +truncate table test; +insert into test (id, value) values (1, 10), (2, 20); +begin; +connection con2; +begin; +connection con3; +begin; +connection con1; +select * from test where id = 1; +id value +1 10 +connection con2; +select * from test; +id value +1 10 +2 20 +update test set value = 12 where id = 1; +update test set value = 18 where id = 2; +commit; +connection con1; +delete from test where value = 20; +select * from test where id = 2; +id value +2 20 +commit; +connection con1; +truncate table test; +insert into test (id, value) values (1, 10), (2, 20); +begin; +connection con2; +begin; +connection con3; +begin; +connection con1; +select * from test where id in (1,2); +id value +1 10 +2 20 +connection con2; +select * from test where id in (1,2); +id value +1 10 +2 20 +connection con1; +update test set value = 11 where id = 1; +connection con2; +update test set value = 21 where id = 2; +connection con1; +commit; +connection con2; +commit; +connection con1; +truncate table test; +insert into test (id, value) values (1, 10), (2, 20); +begin; +connection con2; +begin; +connection con3; +begin; +connection con1; +select * from test where value % 3 = 0; +id value +connection con2; +select * from test where value % 3 = 0; +id value +connection con1; +insert into test (id, value) values(3, 30); +connection con2; +insert into test (id, value) values(4, 42); +connection con1; +commit; +connection con2; +commit; +select * from test where value % 3 = 0; +id value +3 30 +4 42 +connection con1; +select * from test 
where value % 3 = 0; +id value +3 30 +4 42 +connection default; +drop table test; +disconnect con1; +disconnect con2; +disconnect con3; diff --git a/mysql-test/suite/rocksdb/r/issue243_transactionStatus-range_locking.result b/mysql-test/suite/rocksdb/r/issue243_transactionStatus-range_locking.result new file mode 100644 index 000000000000..b48535c5ee6c --- /dev/null +++ b/mysql-test/suite/rocksdb/r/issue243_transactionStatus-range_locking.result @@ -0,0 +1,182 @@ +DROP TABLE IF EXISTS t1; +CREATE TABLE t1 ( +id INT, +val1 INT, +val2 INT, +PRIMARY KEY (id) +) ENGINE=rocksdb; +INSERT INTO t1 VALUES(1,1,1),(2,1,2); +SELECT * FROM t1; +id val1 val2 +1 1 1 +2 1 2 +UPDATE t1 SET val1=2 WHERE id=2; +SELECT * FROM t1; +id val1 val2 +1 1 1 +2 2 2 +SHOW ENGINE rocksdb TRANSACTION STATUS; +Type Name Status +rocksdb +============================================================ +TIMESTAMP ROCKSDB TRANSACTION MONITOR OUTPUT +============================================================ +--------- +SNAPSHOTS +--------- +LIST OF SNAPSHOTS FOR EACH SESSION: +----------LATEST DETECTED DEADLOCKS---------- +----------------------------------------- +END OF ROCKSDB TRANSACTION MONITOR OUTPUT +========================================= + +SET AUTOCOMMIT=0; +START TRANSACTION; +INSERT INTO t1 VALUES(20,1,1),(30,30,30); +SELECT * FROM t1; +id val1 val2 +1 1 1 +2 2 2 +20 1 1 +30 30 30 +UPDATE t1 SET val1=20, val2=20 WHERE id=20; +SELECT * FROM t1; +id val1 val2 +1 1 1 +2 2 2 +20 20 20 +30 30 30 +DELETE FROM t1 WHERE id=30; +SHOW ENGINE rocksdb TRANSACTION STATUS; +Type Name Status +rocksdb +============================================================ +TIMESTAMP ROCKSDB TRANSACTION MONITOR OUTPUT +============================================================ +--------- +SNAPSHOTS +--------- +LIST OF SNAPSHOTS FOR EACH SESSION: +---SNAPSHOT, ACTIVE NUM sec +MySQL thread id TID, OS thread handle PTR, query id QID localhost root ACTION +SHOW ENGINE rocksdb TRANSACTION STATUS +lock count 4, write 
count 4 +insert count 2, update count 1, delete count 1 +----------LATEST DETECTED DEADLOCKS---------- +----------------------------------------- +END OF ROCKSDB TRANSACTION MONITOR OUTPUT +========================================= + +ROLLBACK; +SHOW ENGINE rocksdb TRANSACTION STATUS; +Type Name Status +rocksdb +============================================================ +TIMESTAMP ROCKSDB TRANSACTION MONITOR OUTPUT +============================================================ +--------- +SNAPSHOTS +--------- +LIST OF SNAPSHOTS FOR EACH SESSION: +----------LATEST DETECTED DEADLOCKS---------- +----------------------------------------- +END OF ROCKSDB TRANSACTION MONITOR OUTPUT +========================================= + +START TRANSACTION; +INSERT INTO t1 VALUES(40,40,40); +SHOW ENGINE rocksdb TRANSACTION STATUS; +Type Name Status +rocksdb +============================================================ +TIMESTAMP ROCKSDB TRANSACTION MONITOR OUTPUT +============================================================ +--------- +SNAPSHOTS +--------- +LIST OF SNAPSHOTS FOR EACH SESSION: +----------LATEST DETECTED DEADLOCKS---------- +----------------------------------------- +END OF ROCKSDB TRANSACTION MONITOR OUTPUT +========================================= + +COMMIT; +SHOW ENGINE rocksdb TRANSACTION STATUS; +Type Name Status +rocksdb +============================================================ +TIMESTAMP ROCKSDB TRANSACTION MONITOR OUTPUT +============================================================ +--------- +SNAPSHOTS +--------- +LIST OF SNAPSHOTS FOR EACH SESSION: +----------LATEST DETECTED DEADLOCKS---------- +----------------------------------------- +END OF ROCKSDB TRANSACTION MONITOR OUTPUT +========================================= + +SET AUTOCOMMIT=1; +DROP TABLE t1; +DROP TABLE IF EXISTS t2; +CREATE TABLE t2 ( +id1 INT, +id2 INT, +value INT, +PRIMARY KEY (id1), +KEY (id2) +) ENGINE=rocksdb; +SET AUTOCOMMIT=0; +START TRANSACTION; +INSERT INTO t2 
VALUES(1,2,0),(10,20,30); +UPDATE t2 SET value=3 WHERE id2=2; +DELETE FROM t2 WHERE id1=10; +SHOW ENGINE rocksdb TRANSACTION STATUS; +Type Name Status +rocksdb +============================================================ +TIMESTAMP ROCKSDB TRANSACTION MONITOR OUTPUT +============================================================ +--------- +SNAPSHOTS +--------- +LIST OF SNAPSHOTS FOR EACH SESSION: +----------LATEST DETECTED DEADLOCKS---------- +----------------------------------------- +END OF ROCKSDB TRANSACTION MONITOR OUTPUT +========================================= + +ROLLBACK; +SET AUTOCOMMIT=1; +DROP TABLE t2; +DROP TABLE IF EXISTS t2; +CREATE TABLE t2 ( +id1 INT, +id2 INT, +value INT, +PRIMARY KEY (id1), +UNIQUE KEY (id2) +) ENGINE=rocksdb; +SET AUTOCOMMIT=0; +START TRANSACTION; +INSERT INTO t2 VALUES(1,2,0),(10,20,30); +UPDATE t2 SET value=3 WHERE id2=2; +DELETE FROM t2 WHERE id1=10; +SHOW ENGINE rocksdb TRANSACTION STATUS; +Type Name Status +rocksdb +============================================================ +TIMESTAMP ROCKSDB TRANSACTION MONITOR OUTPUT +============================================================ +--------- +SNAPSHOTS +--------- +LIST OF SNAPSHOTS FOR EACH SESSION: +----------LATEST DETECTED DEADLOCKS---------- +----------------------------------------- +END OF ROCKSDB TRANSACTION MONITOR OUTPUT +========================================= + +ROLLBACK; +SET AUTOCOMMIT=1; +DROP TABLE t2; diff --git a/mysql-test/suite/rocksdb/r/level_repeatable_read-range_locking.result b/mysql-test/suite/rocksdb/r/level_repeatable_read-range_locking.result new file mode 100644 index 000000000000..0592b0992385 --- /dev/null +++ b/mysql-test/suite/rocksdb/r/level_repeatable_read-range_locking.result @@ -0,0 +1,106 @@ +DROP TABLE IF EXISTS t1; +connect con1,localhost,root,,; +SET SESSION TRANSACTION ISOLATION LEVEL REPEATABLE READ; +connect con2,localhost,root,,; +SET SESSION TRANSACTION ISOLATION LEVEL REPEATABLE READ; +connection con1; +CREATE TABLE t1 (a 
INT, pk INT AUTO_INCREMENT PRIMARY KEY) ENGINE=rocksdb; +START TRANSACTION; +SELECT a FROM t1; +a +connection con2; +BEGIN; +INSERT INTO t1 (a) VALUES(1); +connection con1; +SELECT a FROM t1; +a +connection con2; +INSERT INTO t1 (a) VALUES (2); +connection con1; +SELECT a FROM t1; +a +INSERT INTO t1 (a) SELECT a+100 FROM t1; +SELECT a FROM t1; +a +connection con2; +SELECT a FROM t1; +a +1 +2 +COMMIT; +SELECT a FROM t1; +a +1 +2 +connection con1; +SELECT a FROM t1; +a +INSERT INTO t1 (a) SELECT a+200 FROM t1; +SELECT a FROM t1; +a +201 +202 +COMMIT; +SELECT a FROM t1; +a +1 +2 +201 +202 +connection con2; +SELECT a FROM t1; +a +1 +2 +201 +202 +connection default; +CREATE TABLE t2 (a INT PRIMARY KEY) ENGINE=rocksdb; +INSERT INTO t2 (a) VALUES (1); +COMMIT; +connection con1; +BEGIN; +SELECT a from t2; +a +1 +INSERT INTO t2 (a) VALUES (1), (3); +ERROR 23000: Duplicate entry '1' for key 't2.PRIMARY' +connection con2; +INSERT INTO t2 (a) VALUES (2); +COMMIT; +connection con1; +SELECT a from t2; +a +1 +COMMIT; +connection default; +disconnect con1; +disconnect con2; +DROP TABLE t1; +DROP TABLE t2; +CREATE TABLE t3 ( +pk int unsigned PRIMARY KEY, +count int unsigned DEFAULT '0' +) ENGINE=ROCKSDB; +connect con1,localhost,root,,; +SET SESSION TRANSACTION ISOLATION LEVEL REPEATABLE READ; +connect con2,localhost,root,,; +SET SESSION TRANSACTION ISOLATION LEVEL REPEATABLE READ; +connection con1; +BEGIN; +SELECT * FROM t3; +pk count +connection con2; +BEGIN; +INSERT INTO t3 (pk) VALUES(1) ON DUPLICATE KEY UPDATE count=count+1; +COMMIT; +connection con1; +INSERT INTO t3 (pk) VALUES(1) ON DUPLICATE KEY UPDATE count=count+1; +COMMIT; +SELECT count FROM t3; +count +1 +connection default; +disconnect con1; +disconnect con2; +DROP TABLE t3; diff --git a/mysql-test/suite/rocksdb/r/range_locking.result b/mysql-test/suite/rocksdb/r/range_locking.result new file mode 100644 index 000000000000..4a2f99f86fc7 --- /dev/null +++ b/mysql-test/suite/rocksdb/r/range_locking.result @@ -0,0 +1,596 
@@ +show variables like 'rocksdb_use_range_locking'; +Variable_name Value +rocksdb_use_range_locking ON +create table t1 ( +pk int, +a int, +primary key (pk) comment 'default' +) engine=rocksdb; +insert into t1 values +(10,10),(20,20),(30,30); +connect con1,localhost,root,,; +connect con2,localhost,root,,; +### Test: check that range lock inhibits a point lock +connection con1; +begin; +select * from t1 where pk between 5 and 25 for update; +pk a +10 10 +20 20 +connection con2; +insert into t1 values (15,15); +ERROR HY000: Lock wait timeout exceeded; try restarting transaction: Timeout on index: test.t1.PRIMARY +connection con1; +rollback; +## Test: check that range lock inhibits another range lock +connection con1; +begin; +select * from t1 where pk between 5 and 25 for update; +pk a +10 10 +20 20 +connection con2; +begin; +select * from t1 where pk between 15 and 35 for update; +ERROR HY000: Lock wait timeout exceeded; try restarting transaction: Timeout on index: test.t1.PRIMARY +rollback; +connection con1; +rollback; +## Test: check that regular read does not get a range lock +connection con1; +begin; +select * from t1 where pk between 5 and 25; +pk a +10 10 +20 20 +connection con2; +begin; +select * from t1 where pk between 15 and 35 for update; +pk a +20 20 +30 30 +rollback; +connection con1; +rollback; +## Test that locks are not released when a statement inside +## a transaction is rolled back +create table t2 ( +pk int, +a int, +primary key (pk) comment 'default', +unique key(a) comment 'default' +) engine=rocksdb; +insert into t2 values (1,1),(2,2); +begin; +insert into t2 values (3,3); +insert into t2 values (10,2); +ERROR 23000: Duplicate entry '2' for key 't2.a' +connection con2; +begin; +select * from t2 where pk=3 for update; +ERROR HY000: Lock wait timeout exceeded; try restarting transaction: Timeout on index: test.t2.PRIMARY +rollback; +connection con1; +rollback; +drop table t2; +connection default; +disconnect con1; +disconnect con2; +drop table 
t1; +# +# Test INFORMATION_SCHEMA.lock_info in range-locking mode +# +connect con1,localhost,root,,; +connection con1; +create table t0 (a int primary key); +begin; +insert into t0 values (1); +connection default; +create table t1 ( +pk int, +a int, +primary key (pk) comment 'default' +) engine=rocksdb; +insert into t1 values +(10,10),(20,20),(30,30); +begin; +select * from t1 where pk=10 for update; +pk a +10 10 +# select * from information_schema.rocksdb_locks; # With replacements by select_from_is_rowlocks.inc +COLUMN_FAMILY_ID TRANSACTION_ID KEY mode +$cf_id $trx_id ${indexnr}8000000a X +delete from t1 where pk between 25 and 40; +# select * from information_schema.rocksdb_locks; # With replacements by select_from_is_rowlocks.inc +COLUMN_FAMILY_ID TRANSACTION_ID KEY mode +$cf_id $trx_id ${indexnr}8000000a X +$cf_id $trx_id ${indexnr}80000019-${indexnr}80000028:1 X +rollback; +begin; +# The following will show a range lock on 2-9 and also a point lock on 10. +# This is how things currently work. 
(after MDEV-21314, not anymore) +select * from t1 where pk between 2 and 9 for update; +pk a +# select * from information_schema.rocksdb_locks; # With replacements by select_from_is_rowlocks.inc +COLUMN_FAMILY_ID TRANSACTION_ID KEY mode +$cf_id $trx_id ${indexnr}80000002-${indexnr}80000009:1 X +rollback; +drop table t1; +connection con1; +rollback; +drop table t0; +connection default; +disconnect con1; +# +# MDEV-18104: MyRocks-Gap-Lock: range locking bounds are incorrect for multi-part keys +# +create table t0(a int); +insert into t0 values (0),(1),(2),(3),(4),(5),(6),(7),(8),(9); +create table t1 ( +kp1 int not null, +kp2 int not null, +a int, +primary key(kp1, kp2) comment 'default' +) engine=rocksdb; +insert into t1 select 1, a, 1234 from t0; +insert into t1 select 2, a, 1234 from t0; +insert into t1 select 3, a, 1234 from t0; +connect con1,localhost,root,,; +connection con1; +begin; +select * from t1 where kp1=2 for update; +kp1 kp2 a +2 0 1234 +2 1 1234 +2 2 1234 +2 3 1234 +2 4 1234 +2 5 1234 +2 6 1234 +2 7 1234 +2 8 1234 +2 9 1234 +connection default; +# The lock on kp1=2 should inhibit the following INSERT: +insert into t1 values ( 2,5,9999); +ERROR HY000: Lock wait timeout exceeded; try restarting transaction: Timeout on index: test.t1.PRIMARY +rollback; +connection con1; +rollback; +connection default; +disconnect con1; +drop table t0,t1; +# +# Test that locks on ranges on non-unique secondary keys inhibit +# modifications of the contents of these ranges +# +create table t0(a int); +insert into t0 values (0),(1),(2),(3),(4),(5),(6),(7),(8),(9); +create table t1 ( +kp1 int not null, +kp2 int not null, +a int, +key(kp1, kp2) comment 'default' +) engine=rocksdb; +insert into t1 select 1, a, 1234 from t0; +insert into t1 values (2, 3, 1234); +insert into t1 values (2, 5, 1234); +insert into t1 values (2, 7, 1234); +insert into t1 select 3, a, 1234 from t0; +connect con1,localhost,root,,; +connection con1; +begin; +explain +select * from t1 where kp1=2 for 
update; +id select_type table partitions type possible_keys key key_len ref rows filtered Extra +1 SIMPLE t1 NULL ref kp1 kp1 4 const # 100.00 NULL +Warnings: +Note 1003 /* select#1 */ select `test`.`t1`.`kp1` AS `kp1`,`test`.`t1`.`kp2` AS `kp2`,`test`.`t1`.`a` AS `a` from `test`.`t1` where (`test`.`t1`.`kp1` = 2) +select * from t1 where kp1=2 for update; +kp1 kp2 a +2 3 1234 +2 5 1234 +2 7 1234 +connection default; +begin; +insert into t1 values (2, 9, 9999); +ERROR HY000: Lock wait timeout exceeded; try restarting transaction: Timeout on index: test.t1.kp1 +delete from t1 where kp1=2 and kp2=5; +ERROR HY000: Lock wait timeout exceeded; try restarting transaction: Timeout on index: test.t1.kp1 +update t1 set kp1=333 where kp1=2 and kp2=3; +ERROR HY000: Lock wait timeout exceeded; try restarting transaction: Timeout on index: test.t1.kp1 +update t1 set kp1=2 where kp1=1 and kp2=8; +ERROR HY000: Lock wait timeout exceeded; try restarting transaction: Timeout on index: test.t1.kp1 +rollback; +connection con1; +rollback; +disconnect con1; +connection default; +drop table t0,t1; +# +# Transaction isolation test +# +create table t1 (pk int primary key, a int) engine=rocksdb; +insert into t1 values (1,1),(2,2),(3,3); +connect con1,localhost,root,,; +# TRX1: Start, Allocate a snapshot +connection con1; +begin; +select * from t1; +pk a +1 1 +2 2 +3 3 +# TRX2: Make a change that TRX1 will not see +connection default; +update t1 set a=2222 where pk=2; +# TRX1: Now, make a change that would overwrite TRX2'x change and commit +connection con1; +update t1 set a=a+1 where pk=2; +commit; +# Examine the result: +# pk=2, a=2223 means UPDATE in TRX1 used "read committed" (InnoDB-like isolation) +# pk=2, a=3 means UPDATE in TRX1 silently overwrote TRX2 +# (and with key tracking, one would get an error on the second UPDATE) +connection default; +select * from t1; +pk a +1 1 +2 2223 +3 3 +disconnect con1; +connection default; +drop table t1; +# +# Same test as above, but check the 
range scan +# +create table t1 ( +pk int, +a int, +primary key (pk) comment 'default' +) engine=rocksdb; +insert into t1 values (1,1),(2,2),(3,3),(4,4),(5,5),(6,6); +connect con1,localhost,root,,; +# TRX1: Start, Allocate a snapshot +connection con1; +begin; +select * from t1; +pk a +1 1 +2 2 +3 3 +4 4 +5 5 +6 6 +# TRX2: Make a change that TRX1 will not see +connection default; +update t1 set a=2222 where pk between 3 and 5; +# TRX1: Now, make a change that would overwrite TRX2'x change and commit +connection con1; +update t1 set a=a+1 where pk between 3 and 5; +commit; +# Examine the result: +# pk={3,4,5} a=2223 means UPDATE in TRX1 used "read committed" (InnoDB-like isolation) +connection default; +select * from t1; +pk a +1 1 +2 2 +3 2223 +4 2223 +5 2223 +6 6 +disconnect con1; +connection default; +drop table t1; +# +# Same as above, but test SELECT FOR UPDATE. +# +create table t1 ( +pk int, +a int, +primary key (pk) comment 'default' +) engine=rocksdb; +insert into t1 values (1,1),(2,2),(3,3),(4,4),(5,5),(6,6); +connect con1,localhost,root,,; +# TRX1: Start, Allocate a snapshot +connection con1; +begin; +select * from t1; +pk a +1 1 +2 2 +3 3 +4 4 +5 5 +6 6 +# TRX2: Make a change that TRX1 will not see +connection default; +update t1 set a=222 where pk=2; +update t1 set a=333 where pk=3; +# TRX1: Check what select [FOR UPDATE] sees +connection con1; +select * from t1 where pk in (2,3); +pk a +2 2 +3 3 +select * from t1 where pk=2 for update; +pk a +2 222 +select * from t1 where pk=2 lock in share mode; +pk a +2 222 +select * from t1 where pk=2; +pk a +2 2 +commit; +disconnect con1; +connection default; +drop table t1; +# +# Another no-snapshot-checking test, this time for single-statement +# transaction +# +create table t1 ( +pk int, +a int, +name varchar(16), +primary key(pk) comment 'default' +) engine=rocksdb; +insert into t1 values (1,1, 'row1'), (2,2,'row2'); +connect con1,localhost,root,,; +connection con1; +select get_lock('row1', 100); +get_lock('row1', 
100) +1 +connection default; +# The following will read the first row (1,1,'row1'), and stop. +update t1 set a=a+100 where get_lock(name, 1000)=1; +connection con1; +update t1 set a=5 where pk=2; +select release_lock('row1'); +release_lock('row1') +1 +connection default; +# Look at the row with pk=2: +# 2, 105, row2 - means the UPDATE was reading current data (Correct) +# 2, 102, row - means the UPDATE read the snapshot (incorrect) +select * from t1; +pk a name +1 101 row1 +2 105 row2 +# Try releasing both locks (in 5.6, we will be holding only the second one) +select release_lock(name) from t1; +release_lock(name) +1 +1 +disconnect con1; +connection default; +drop table t1; +# +# Check that I_S.processlist.state is set correctly now. +# +create table t1( +pk int, +a int, +primary key(pk) comment 'default' +) engine=rocksdb; +insert into t1 values (1,1),(2,2),(3,3); +begin; +select * from t1 where pk=2 for update; +pk a +2 2 +connect con1,localhost,root,,; +begin; +set rocksdb_lock_wait_timeout=300; +select * from t1 where pk=2 for update; +connection default; +# Now, will wait until we see con1 have state="Waiting for row lock" +rollback; +connection con1; +pk a +2 2 +rollback; +disconnect con1; +connection default; +drop table t1; +# +# Test range locking for ranges with HA_READ_PREFIX_LAST +# +create table t0(a int) engine=rocksdb; +insert into t0 values (0),(1),(2),(3),(4),(5),(6),(7),(8),(9); +create table t1 ( +pk1 int, +pk2 int, +a int, +primary key(pk1, pk2) comment 'default' +) engine=rocksdb; +insert into t1 +select +A.a, B.a, A.a*10+B.a +from +t0 A, t0 B; +connect con1,localhost,root,,; +connection con1; +begin; +insert into t1 values (0x1112222,0x1112222,0); +connection default; +begin; +# Should use ref access w/o filesort: +explain +select * from t1 +where pk1=3 +order by pk1 desc, pk2 desc +for update; +id select_type table partitions type possible_keys key key_len ref rows filtered Extra +1 SIMPLE t1 NULL ref PRIMARY PRIMARY 4 const # 100.00 
Backward index scan +Warnings: +Note 1003 /* select#1 */ select `test`.`t1`.`pk1` AS `pk1`,`test`.`t1`.`pk2` AS `pk2`,`test`.`t1`.`a` AS `a` from `test`.`t1` where (`test`.`t1`.`pk1` = 3) order by `test`.`t1`.`pk1` desc,`test`.`t1`.`pk2` desc +select * from t1 +where pk1=3 +order by pk1 desc, pk2 desc +for update; +pk1 pk2 a +3 9 39 +3 8 38 +3 7 37 +3 6 36 +3 5 35 +3 4 34 +3 3 33 +3 2 32 +3 1 31 +3 0 30 +# select * from information_schema.rocksdb_locks; # With replacements by select_from_is_rowlocks.inc +COLUMN_FAMILY_ID TRANSACTION_ID KEY mode +$cf_id $trx_id ${indexnr}80000003-${indexnr}80000003:1 X +rollback; +# +# Test range locking for ranges with HA_READ_PREFIX_LAST_OR_PREV +# +begin; +# Should use range access with 2 keyparts and w/o filesort: +explain +select * from t1 +where pk1=4 and pk2 between 5 and 8 +order by pk1 desc, pk2 desc +for update; +id select_type table partitions type possible_keys key key_len ref rows filtered Extra +1 SIMPLE t1 NULL range PRIMARY PRIMARY 8 NULL # 100.00 Using where; Backward index scan +Warnings: +Note 1003 /* select#1 */ select `test`.`t1`.`pk1` AS `pk1`,`test`.`t1`.`pk2` AS `pk2`,`test`.`t1`.`a` AS `a` from `test`.`t1` where ((`test`.`t1`.`pk1` = 4) and (`test`.`t1`.`pk2` between 5 and 8)) order by `test`.`t1`.`pk1` desc,`test`.`t1`.`pk2` desc +select * from t1 +where pk1=4 and pk2 between 5 and 8 +order by pk1 desc, pk2 desc +for update; +pk1 pk2 a +4 8 48 +4 7 47 +4 6 46 +4 5 45 +# select * from information_schema.rocksdb_locks; # With replacements by select_from_is_rowlocks.inc +COLUMN_FAMILY_ID TRANSACTION_ID KEY mode +$cf_id $trx_id ${indexnr}8000000480000005-${indexnr}8000000480000008:1 X +rollback; +connection con1; +rollback; +connection default; +drop table t0, t1; +# +# A bug: range locking was not used when scan started at table start or end +# +create table t0(a int); +insert into t0 values (0),(1),(2),(3),(4),(5),(6),(7),(8),(9); +create table t10(a int); +insert into t10 select A.a + B.a* 10 + C.a * 100 
from t0 A, t0 B, t0 C; +create table t1 ( +pk int not null, +a int, +primary key(pk) +) engine=rocksdb; +insert into t1 select a*2,a*2 from t10; +connection con1; +begin; +select * from t1 where pk=500 for update; +pk a +500 500 +connection default; +begin; +select * from t1 where pk<10 order by pk limit 10 for update; +pk a +0 0 +2 2 +4 4 +6 6 +8 8 +# select * from information_schema.rocksdb_locks; # With replacements by select_from_is_rowlocks.inc +COLUMN_FAMILY_ID TRANSACTION_ID KEY mode +$cf_id $trx_id ${indexnr}-${indexnr}8000000a X +rollback; +begin; +select * from t1 where pk>1990 order by pk desc limit 10 for update; +pk a +1998 1998 +1996 1996 +1994 1994 +1992 1992 +# select * from information_schema.rocksdb_locks; # With replacements by select_from_is_rowlocks.inc +COLUMN_FAMILY_ID TRANSACTION_ID KEY mode +$cf_id $trx_id ${indexnr}800007c6-${indexnr+1} X +rollback; +connection con1; +rollback; +disconnect con1; +connection default; +drop table t0,t10,t1; +# +# Range locking and READ-COMMITTED isolation level +# +connect con1,localhost,root,,; +connection con1; +set session transaction isolation level read committed; +create table t1 ( +pk int not null, +a int, +primary key(pk) +) engine=rocksdb; +insert into t1(pk) values (0),(1),(2),(3),(4),(5),(6),(7),(8),(9); +begin; +select * from t1 where pk between 2 and 5 for update; +pk a +2 NULL +3 NULL +4 NULL +5 NULL +# Below should show individual row locks, not locked range: +# select * from information_schema.rocksdb_locks; # With replacements by select_from_is_rowlocks.inc +COLUMN_FAMILY_ID TRANSACTION_ID KEY mode +$cf_id $trx_id ${indexnr}80000002 X +$cf_id $trx_id ${indexnr}80000003 X +$cf_id $trx_id ${indexnr}80000004 X +$cf_id $trx_id ${indexnr}80000005 X +$cf_id $trx_id ${indexnr}80000006 X +rollback; +begin; +update t1 set a=a+1 where pk between 2 and 5; +# Below should show individual row locks, not locked range: +# select * from information_schema.rocksdb_locks; # With replacements by 
select_from_is_rowlocks.inc +COLUMN_FAMILY_ID TRANSACTION_ID KEY mode +$cf_id $trx_id ${indexnr}80000002 X +$cf_id $trx_id ${indexnr}80000003 X +$cf_id $trx_id ${indexnr}80000004 X +$cf_id $trx_id ${indexnr}80000005 X +$cf_id $trx_id ${indexnr}80000006 X +rollback; +drop table t1; +disconnect con1; +connection default; +# +# Range Locking and READ-COMMITTED, another test +# +create table t1 ( +pk int, +a int, +b int, +primary key (pk), +key(a) +) engine=rocksdb; +insert into t1 values +(1, 100, 1000), +(2, 200, 2000), +(3, 300, 3000); +set transaction isolation level repeatable read; +begin; +update t1 set b = b + 1 where a > 200; +connect con1,localhost,root,,; +connection con1; +set transaction isolation level read committed; +begin; +insert into t1 values (4, 150, 1500); +insert into t1 values (5, 250, 1500); +ERROR HY000: Lock wait timeout exceeded; try restarting transaction: Timeout on index: test.t1.a +rollback; +disconnect con1; +connection default; +rollback; +drop table t1; diff --git a/mysql-test/suite/rocksdb/r/range_locking_conc_test.result b/mysql-test/suite/rocksdb/r/range_locking_conc_test.result new file mode 100644 index 000000000000..a70152da808f --- /dev/null +++ b/mysql-test/suite/rocksdb/r/range_locking_conc_test.result @@ -0,0 +1,4 @@ +set @save_rlwt=@@rocksdb_lock_wait_timeout; +# Run range_locking_conc_test.py +set global rocksdb_lock_wait_timeout= @save_rlwt; +DROP TABLE t1; diff --git a/mysql-test/suite/rocksdb/r/range_locking_deadlock_tracking.result b/mysql-test/suite/rocksdb/r/range_locking_deadlock_tracking.result new file mode 100644 index 000000000000..00fd1788dfd6 --- /dev/null +++ b/mysql-test/suite/rocksdb/r/range_locking_deadlock_tracking.result @@ -0,0 +1,453 @@ +set @prior_lock_wait_timeout = @@rocksdb_lock_wait_timeout; +set @prior_deadlock_detect = @@rocksdb_deadlock_detect; +set @prior_max_latest_deadlocks = @@rocksdb_max_latest_deadlocks; +set global rocksdb_deadlock_detect = on; +set global rocksdb_lock_wait_timeout = 
10000; +# Clears deadlock buffer of any prior deadlocks. +set global rocksdb_max_latest_deadlocks = 0; +set global rocksdb_max_latest_deadlocks = @prior_max_latest_deadlocks; +create table t (i int primary key) engine=rocksdb; +insert into t values (1), (2), (3); +show engine rocksdb transaction status; +Type Name Status +rocksdb +============================================================ +TIMESTAMP ROCKSDB TRANSACTION MONITOR OUTPUT +============================================================ +--------- +SNAPSHOTS +--------- +LIST OF SNAPSHOTS FOR EACH SESSION: +----------LATEST DETECTED DEADLOCKS---------- +----------------------------------------- +END OF ROCKSDB TRANSACTION MONITOR OUTPUT +========================================= + +Deadlock #1 +begin; +select * from t where i=1 for update; +i +1 +begin; +select * from t where i=2 for update; +i +2 +select * from t where i=2 for update; +select * from t where i=1 for update; +ERROR 40001: Deadlock found when trying to get lock; try restarting transaction +rollback; +i +2 +rollback; +show engine rocksdb transaction status; +Type Name Status +rocksdb +============================================================ +TIMESTAMP ROCKSDB TRANSACTION MONITOR OUTPUT +============================================================ +--------- +SNAPSHOTS +--------- +LIST OF SNAPSHOTS FOR EACH SESSION: +----------LATEST DETECTED DEADLOCKS---------- + +*** DEADLOCK PATH +========================================= +TSTAMP +TXN_ID +COLUMN FAMILY NAME: default +KEY +LOCK TYPE: EXCLUSIVE +INDEX NAME: PRIMARY +TABLE NAME: test.t +---------------WAITING FOR--------------- +TSTAMP +TXN_ID +COLUMN FAMILY NAME: default +KEY +LOCK TYPE: EXCLUSIVE +INDEX NAME: PRIMARY +TABLE NAME: test.t + +--------TXN_ID GOT DEADLOCK--------- +----------------------------------------- +END OF ROCKSDB TRANSACTION MONITOR OUTPUT +========================================= + +Deadlock #2 +begin; +select * from t where i=1 for update; +i +1 +begin; +select * 
from t where i=2 for update; +i +2 +select * from t where i=2 for update; +select * from t where i=1 for update; +ERROR 40001: Deadlock found when trying to get lock; try restarting transaction +rollback; +i +2 +rollback; +show engine rocksdb transaction status; +Type Name Status +rocksdb +============================================================ +TIMESTAMP ROCKSDB TRANSACTION MONITOR OUTPUT +============================================================ +--------- +SNAPSHOTS +--------- +LIST OF SNAPSHOTS FOR EACH SESSION: +----------LATEST DETECTED DEADLOCKS---------- + +*** DEADLOCK PATH +========================================= +TSTAMP +TXN_ID +COLUMN FAMILY NAME: default +KEY +LOCK TYPE: EXCLUSIVE +INDEX NAME: PRIMARY +TABLE NAME: test.t +---------------WAITING FOR--------------- +TSTAMP +TXN_ID +COLUMN FAMILY NAME: default +KEY +LOCK TYPE: EXCLUSIVE +INDEX NAME: PRIMARY +TABLE NAME: test.t + +--------TXN_ID GOT DEADLOCK--------- + +*** DEADLOCK PATH +========================================= +TSTAMP +TXN_ID +COLUMN FAMILY NAME: default +KEY +LOCK TYPE: EXCLUSIVE +INDEX NAME: PRIMARY +TABLE NAME: test.t +---------------WAITING FOR--------------- +TSTAMP +TXN_ID +COLUMN FAMILY NAME: default +KEY +LOCK TYPE: EXCLUSIVE +INDEX NAME: PRIMARY +TABLE NAME: test.t + +--------TXN_ID GOT DEADLOCK--------- +----------------------------------------- +END OF ROCKSDB TRANSACTION MONITOR OUTPUT +========================================= + +set global rocksdb_max_latest_deadlocks = 10; +Deadlock #3 +begin; +select * from t where i=1 for update; +i +1 +begin; +select * from t where i=2 for update; +i +2 +select * from t where i=2 for update; +select * from t where i=1 for update; +ERROR 40001: Deadlock found when trying to get lock; try restarting transaction +rollback; +i +2 +rollback; +show engine rocksdb transaction status; +Type Name Status +rocksdb +============================================================ +TIMESTAMP ROCKSDB TRANSACTION MONITOR OUTPUT 
+============================================================ +--------- +SNAPSHOTS +--------- +LIST OF SNAPSHOTS FOR EACH SESSION: +----------LATEST DETECTED DEADLOCKS---------- + +*** DEADLOCK PATH +========================================= +TSTAMP +TXN_ID +COLUMN FAMILY NAME: default +KEY +LOCK TYPE: EXCLUSIVE +INDEX NAME: PRIMARY +TABLE NAME: test.t +---------------WAITING FOR--------------- +TSTAMP +TXN_ID +COLUMN FAMILY NAME: default +KEY +LOCK TYPE: EXCLUSIVE +INDEX NAME: PRIMARY +TABLE NAME: test.t + +--------TXN_ID GOT DEADLOCK--------- + +*** DEADLOCK PATH +========================================= +TSTAMP +TXN_ID +COLUMN FAMILY NAME: default +KEY +LOCK TYPE: EXCLUSIVE +INDEX NAME: PRIMARY +TABLE NAME: test.t +---------------WAITING FOR--------------- +TSTAMP +TXN_ID +COLUMN FAMILY NAME: default +KEY +LOCK TYPE: EXCLUSIVE +INDEX NAME: PRIMARY +TABLE NAME: test.t + +--------TXN_ID GOT DEADLOCK--------- + +*** DEADLOCK PATH +========================================= +TSTAMP +TXN_ID +COLUMN FAMILY NAME: default +KEY +LOCK TYPE: EXCLUSIVE +INDEX NAME: PRIMARY +TABLE NAME: test.t +---------------WAITING FOR--------------- +TSTAMP +TXN_ID +COLUMN FAMILY NAME: default +KEY +LOCK TYPE: EXCLUSIVE +INDEX NAME: PRIMARY +TABLE NAME: test.t + +--------TXN_ID GOT DEADLOCK--------- +----------------------------------------- +END OF ROCKSDB TRANSACTION MONITOR OUTPUT +========================================= + +set global rocksdb_max_latest_deadlocks = 1; +show engine rocksdb transaction status; +Type Name Status +rocksdb +============================================================ +TIMESTAMP ROCKSDB TRANSACTION MONITOR OUTPUT +============================================================ +--------- +SNAPSHOTS +--------- +LIST OF SNAPSHOTS FOR EACH SESSION: +----------LATEST DETECTED DEADLOCKS---------- + +*** DEADLOCK PATH +========================================= +TSTAMP +TXN_ID +COLUMN FAMILY NAME: default +KEY +LOCK TYPE: EXCLUSIVE +INDEX NAME: PRIMARY +TABLE NAME: 
test.t +---------------WAITING FOR--------------- +TSTAMP +TXN_ID +COLUMN FAMILY NAME: default +KEY +LOCK TYPE: EXCLUSIVE +INDEX NAME: PRIMARY +TABLE NAME: test.t + +--------TXN_ID GOT DEADLOCK--------- +----------------------------------------- +END OF ROCKSDB TRANSACTION MONITOR OUTPUT +========================================= + +set rocksdb_deadlock_detect_depth = 2; +# Range locking code will report deadlocks, because it doesn't honor +# rocksdb_deadlock_detect_depth: +Deadlock #4 +begin; +select * from t where i=1 for update; +i +1 +begin; +select * from t where i=2 for update; +i +2 +begin; +select * from t where i=3 for update; +i +3 +select * from t where i=2 for update; +select * from t where i=3 for update; +select variable_value into @a from performance_schema.global_status where variable_name='rocksdb_row_lock_deadlocks'; +select * from t where i=1 for update; +ERROR 40001: Deadlock found when trying to get lock; try restarting transaction +select case when variable_value-@a = 1 then 'true' else 'false' end as deadlocks from performance_schema.global_status where variable_name='rocksdb_row_lock_deadlocks'; +deadlocks +true +rollback; +i +3 +rollback; +i +2 +rollback; +set global rocksdb_max_latest_deadlocks = 5; +show engine rocksdb transaction status; +Type Name Status +rocksdb +============================================================ +TIMESTAMP ROCKSDB TRANSACTION MONITOR OUTPUT +============================================================ +--------- +SNAPSHOTS +--------- +LIST OF SNAPSHOTS FOR EACH SESSION: +----------LATEST DETECTED DEADLOCKS---------- + +*** DEADLOCK PATH +========================================= +TSTAMP +TXN_ID +COLUMN FAMILY NAME: default +KEY +LOCK TYPE: EXCLUSIVE +INDEX NAME: PRIMARY +TABLE NAME: test.t +---------------WAITING FOR--------------- +TSTAMP +TXN_ID +COLUMN FAMILY NAME: default +KEY +LOCK TYPE: EXCLUSIVE +INDEX NAME: PRIMARY +TABLE NAME: test.t +---------------WAITING FOR--------------- +TSTAMP +TXN_ID +COLUMN 
FAMILY NAME: default +KEY +LOCK TYPE: EXCLUSIVE +INDEX NAME: PRIMARY +TABLE NAME: test.t + +--------TXN_ID GOT DEADLOCK--------- +----------------------------------------- +END OF ROCKSDB TRANSACTION MONITOR OUTPUT +========================================= + +Deadlock #6 +create table t1 (id int primary key, value int) engine=rocksdb; +insert into t1 values (1,1),(2,2),(3,3),(4,4),(5,5); +begin; +update t1 set value=value+100 where id=1; +update t1 set value=value+100 where id=2; +begin; +update t1 set value=value+200 where id=3; +update t1 set value=value+100 where id=3; +update t1 set value=value+200 where id=1; +ERROR 40001: Deadlock found when trying to get lock; try restarting transaction +select * from t1; +id value +1 101 +2 102 +3 103 +4 4 +5 5 +drop table t1; +set global rocksdb_lock_wait_timeout = @prior_lock_wait_timeout; +set global rocksdb_deadlock_detect = @prior_deadlock_detect; +drop table t; +show engine rocksdb transaction status; +Type Name Status +rocksdb +============================================================ +TIMESTAMP ROCKSDB TRANSACTION MONITOR OUTPUT +============================================================ +--------- +SNAPSHOTS +--------- +LIST OF SNAPSHOTS FOR EACH SESSION: +----------LATEST DETECTED DEADLOCKS---------- + +*** DEADLOCK PATH +========================================= +TSTAMP +TXN_ID +COLUMN FAMILY NAME: default +KEY +LOCK TYPE: EXCLUSIVE +INDEX NAME: NOT FOUND; IDX_ID +TABLE NAME: NOT FOUND; IDX_ID +---------------WAITING FOR--------------- +TSTAMP +TXN_ID +COLUMN FAMILY NAME: default +KEY +LOCK TYPE: EXCLUSIVE +INDEX NAME: NOT FOUND; IDX_ID +TABLE NAME: NOT FOUND; IDX_ID + +--------TXN_ID GOT DEADLOCK--------- + +*** DEADLOCK PATH +========================================= +TSTAMP +TXN_ID +COLUMN FAMILY NAME: default +KEY +LOCK TYPE: EXCLUSIVE +INDEX NAME: NOT FOUND; IDX_ID +TABLE NAME: NOT FOUND; IDX_ID +---------------WAITING FOR--------------- +TSTAMP +TXN_ID +COLUMN FAMILY NAME: default +KEY +LOCK TYPE: 
EXCLUSIVE +INDEX NAME: NOT FOUND; IDX_ID +TABLE NAME: NOT FOUND; IDX_ID +---------------WAITING FOR--------------- +TSTAMP +TXN_ID +COLUMN FAMILY NAME: default +KEY +LOCK TYPE: EXCLUSIVE +INDEX NAME: NOT FOUND; IDX_ID +TABLE NAME: NOT FOUND; IDX_ID + +--------TXN_ID GOT DEADLOCK--------- +----------------------------------------- +END OF ROCKSDB TRANSACTION MONITOR OUTPUT +========================================= + +set global rocksdb_max_latest_deadlocks = 0; +# Clears deadlock buffer of any existent deadlocks. +set global rocksdb_max_latest_deadlocks = @prior_max_latest_deadlocks; +show engine rocksdb transaction status; +Type Name Status +rocksdb +============================================================ +TIMESTAMP ROCKSDB TRANSACTION MONITOR OUTPUT +============================================================ +--------- +SNAPSHOTS +--------- +LIST OF SNAPSHOTS FOR EACH SESSION: +----------LATEST DETECTED DEADLOCKS---------- +----------------------------------------- +END OF ROCKSDB TRANSACTION MONITOR OUTPUT +========================================= + diff --git a/mysql-test/suite/rocksdb/r/range_locking_escalation.result b/mysql-test/suite/rocksdb/r/range_locking_escalation.result new file mode 100644 index 000000000000..dd19d728ef24 --- /dev/null +++ b/mysql-test/suite/rocksdb/r/range_locking_escalation.result @@ -0,0 +1,27 @@ +show variables like 'rocksdb_use_range_locking'; +Variable_name Value +rocksdb_use_range_locking ON +show variables like 'rocksdb_max_lock_memory'; +Variable_name Value +rocksdb_max_lock_memory 1024 +show status like 'rocksdb_locktree_escalation_count'; +Variable_name Value +rocksdb_locktree_escalation_count 0 +create table t0(a int); +insert into t0 values (0),(1),(2),(3),(4),(5),(6),(7),(8),(9); +create table t1 ( +pk int primary key, +a int +) engine=rocksdb; +insert into t1 +select +A.a + B.a*10 + C.a*100 + D.a*1000, +12345 +from t0 A, t0 B, t0 C, t0 D; +select count(*) from t1; +count(*) +10000 +show status like 
'rocksdb_locktree_escalation_count'; +Variable_name Value +rocksdb_locktree_escalation_count 127 +drop table t0,t1; diff --git a/mysql-test/suite/rocksdb/r/range_locking_partial_index.result b/mysql-test/suite/rocksdb/r/range_locking_partial_index.result new file mode 100644 index 000000000000..0c3e85fe8fef --- /dev/null +++ b/mysql-test/suite/rocksdb/r/range_locking_partial_index.result @@ -0,0 +1,202 @@ +create table t0(a int primary key) engine=rocksdb; +insert into t0 values (0),(1),(2),(3),(4),(5),(6),(7),(8),(9); +create table t1 ( +pk1 int, +pk2 int, +a int not null, +b int, +primary key (pk1, pk2), +key key1(pk1, a) comment 'partial_group_keyparts=1;partial_group_threshold=5' +) engine=rocksdb; +insert into t1 select +1, +A.a, +100 + A.a, +123456 +from t0 A; +select * from t1 force index (key1) where pk1=1; +pk1 pk2 a b +1 0 100 123456 +1 1 101 123456 +1 2 102 123456 +1 3 103 123456 +1 4 104 123456 +1 5 105 123456 +1 6 106 123456 +1 7 107 123456 +1 8 108 123456 +1 9 109 123456 +insert into t1 select +2, +A.a, +100 + A.a, +123456 +from t0 A limit 3; +insert into t1 select +10000 + A.a +10 *B.a +100*C.a, +A.a, +100 + A.a, +123456 +from t0 A, t0 B, t0 C; +create table t3(pk int primary key); +connect con2,localhost,root,,; +connection con2; +begin; +insert into t3 values(3333333); +connection default; +# +# First, test a query with range lock +# +explain +select * from t1 force index (key1) where pk1>=1 and pk1<=10; +id select_type table partitions type possible_keys key key_len ref rows filtered Extra +1 SIMPLE t1 NULL range key1 key1 4 NULL # 100.00 Using index condition +Warnings: +Note 1003 /* select#1 */ select `test`.`t1`.`pk1` AS `pk1`,`test`.`t1`.`pk2` AS `pk2`,`test`.`t1`.`a` AS `a`,`test`.`t1`.`b` AS `b` from `test`.`t1` FORCE INDEX (`key1`) where ((`test`.`t1`.`pk1` >= 1) and (`test`.`t1`.`pk1` <= 10)) +connect con1,localhost,root,,; +connection con1; +begin; +# Allocate a snapshot +select * from t0 where a=3; +a +3 +connection default; +# Make some 
modifications not visible in the snapshot +insert into t1 values (1,11, 99999, 99999); +insert into t1 values (2,11, 99999, 99999); +connection con1; +# This doesn't see the modifications +select * from t1 force index (key1) where pk1>=1 and pk1<=10; +pk1 pk2 a b +1 0 100 123456 +1 1 101 123456 +1 2 102 123456 +1 3 103 123456 +1 4 104 123456 +1 5 105 123456 +1 6 106 123456 +1 7 107 123456 +1 8 108 123456 +1 9 109 123456 +2 0 100 123456 +2 1 101 123456 +2 2 102 123456 +# This DOES see the modifications +select * from t1 force index (key1) where pk1>=1 and pk1<=10 for update; +pk1 pk2 a b +1 0 100 123456 +1 1 101 123456 +1 2 102 123456 +1 3 103 123456 +1 4 104 123456 +1 5 105 123456 +1 6 106 123456 +1 7 107 123456 +1 8 108 123456 +1 9 109 123456 +1 11 99999 99999 +2 0 100 123456 +2 1 101 123456 +2 2 102 123456 +2 11 99999 99999 +# select * from information_schema.rocksdb_locks; # With replacements by select_from_is_rowlocks.inc +COLUMN_FAMILY_ID TRANSACTION_ID KEY mode +$cf2_id $trx_id ${indexnr2}80000001-${indexnr2}8000000a:1 X +$cf_id $trx_id ${indexnr}8000000180000000 X +$cf_id $trx_id ${indexnr}8000000180000001 X +$cf_id $trx_id ${indexnr}8000000180000002 X +$cf_id $trx_id ${indexnr}8000000180000003 X +$cf_id $trx_id ${indexnr}8000000180000004 X +$cf_id $trx_id ${indexnr}8000000180000005 X +$cf_id $trx_id ${indexnr}8000000180000006 X +$cf_id $trx_id ${indexnr}8000000180000007 X +$cf_id $trx_id ${indexnr}8000000180000008 X +$cf_id $trx_id ${indexnr}8000000180000009 X +$cf_id $trx_id ${indexnr}800000018000000b X +$cf_id $trx_id ${indexnr}8000000280000000 X +$cf_id $trx_id ${indexnr}8000000280000001 X +$cf_id $trx_id ${indexnr}8000000280000002 X +$cf_id $trx_id ${indexnr}800000028000000b X +rollback; +# +# Now, test a query with LockingIterator +# +delete from t1 where b=99999; +begin; +# Allocate a snapshot +select * from t0 where a=3; +a +3 +connection default; +# Make some modifications not visible in the snapshot +insert into t1 values (1,11, 99999, 99999); 
+insert into t1 values (2,11, 99999, 99999); +connection con1; +# This doesn't see the modifications: +select * from t1 force index (key1) where pk1>=1 order by pk1 limit 15; +pk1 pk2 a b +1 0 100 123456 +1 1 101 123456 +1 2 102 123456 +1 3 103 123456 +1 4 104 123456 +1 5 105 123456 +1 6 106 123456 +1 7 107 123456 +1 8 108 123456 +1 9 109 123456 +2 0 100 123456 +2 1 101 123456 +2 2 102 123456 +10000 0 100 123456 +10001 1 101 123456 +# This DOES see the modifications: +select * from t1 force index (key1) where pk1>=1 order by pk1 limit 15 for update; +pk1 pk2 a b +1 0 100 123456 +1 1 101 123456 +1 2 102 123456 +1 3 103 123456 +1 4 104 123456 +1 5 105 123456 +1 6 106 123456 +1 7 107 123456 +1 8 108 123456 +1 9 109 123456 +1 11 99999 99999 +2 0 100 123456 +2 1 101 123456 +2 2 102 123456 +2 11 99999 99999 +# select * from information_schema.rocksdb_locks; # With replacements by select_from_is_rowlocks.inc +COLUMN_FAMILY_ID TRANSACTION_ID KEY mode +$cf2_id $trx_id ${indexnr2}80000001 X +$cf2_id $trx_id ${indexnr2}80000001-${indexnr2}800000018000006480000000 X +$cf2_id $trx_id ${indexnr2}800000018000006480000000-${indexnr2}800000018000006580000001 X +$cf2_id $trx_id ${indexnr2}800000018000006580000001-${indexnr2}800000018000006680000002 X +$cf2_id $trx_id ${indexnr2}800000018000006680000002-${indexnr2}800000018000006780000003 X +$cf2_id $trx_id ${indexnr2}800000018000006780000003-${indexnr2}800000018000006880000004 X +$cf2_id $trx_id ${indexnr2}800000018000006880000004-${indexnr2}800000018000006980000005 X +$cf2_id $trx_id ${indexnr2}800000018000006980000005-${indexnr2}800000018000006a80000006 X +$cf2_id $trx_id ${indexnr2}800000018000006a80000006-${indexnr2}800000018000006b80000007 X +$cf2_id $trx_id ${indexnr2}800000018000006b80000007-${indexnr2}800000018000006c80000008 X +$cf2_id $trx_id ${indexnr2}800000018000006c80000008-${indexnr2}800000018000006d80000009 X +$cf2_id $trx_id ${indexnr2}800000018000006d80000009-${indexnr2}800000018001869f8000000b X +$cf2_id $trx_id 
${indexnr2}800000018001869f8000000b-${indexnr2+1} X +$cf_id $trx_id ${indexnr}8000000180000000 X +$cf_id $trx_id ${indexnr}8000000180000001 X +$cf_id $trx_id ${indexnr}8000000180000002 X +$cf_id $trx_id ${indexnr}8000000180000003 X +$cf_id $trx_id ${indexnr}8000000180000004 X +$cf_id $trx_id ${indexnr}8000000180000005 X +$cf_id $trx_id ${indexnr}8000000180000006 X +$cf_id $trx_id ${indexnr}8000000180000007 X +$cf_id $trx_id ${indexnr}8000000180000008 X +$cf_id $trx_id ${indexnr}8000000180000009 X +$cf_id $trx_id ${indexnr}800000018000000b X +$cf_id $trx_id ${indexnr}80000002-${indexnr}8000271080000000 X +rollback; +disconnect con1; +connection default; +disconnect con2; +drop table t0, t1,t3; diff --git a/mysql-test/suite/rocksdb/r/range_locking_refresh_iter.result b/mysql-test/suite/rocksdb/r/range_locking_refresh_iter.result new file mode 100644 index 000000000000..1067087e8165 --- /dev/null +++ b/mysql-test/suite/rocksdb/r/range_locking_refresh_iter.result @@ -0,0 +1,50 @@ +select @@rocksdb_use_range_locking; +@@rocksdb_use_range_locking +1 +set debug_sync='RESET'; +create table ten(a int primary key); +insert into ten values (0),(1),(2),(3),(4),(5),(6),(7),(8),(9); +create table one_k(a int primary key); +insert into one_k select A.a + B.a* 10 + C.a * 100 from ten A, ten B, ten C; +create table t1 ( +pk int primary key, +a int +) engine=rocksdb; +insert into t1 select a,a from ten; +insert into t1 select a+40, a+40 from ten; +insert into t1 select a+100, a+100 from one_k; +delete from t1 where pk=44; +set global rocksdb_force_flush_memtable_and_lzero_now=1; +begin; +set debug_sync='rocksdb.check_flags_iri SIGNAL con1_stopped WAIT_FOR con1_cont'; +update t1 set a=a+100 where pk < 3 or pk between 10 and 50; +set debug_sync='now WAIT_FOR con1_stopped'; +insert into t1 values (44,5000); +delete from t1 where pk= 42; +update t1 set a=5000 where pk between 40 and 45; +set global rocksdb_force_flush_memtable_and_lzero_now=1; +set debug_sync='now SIGNAL con1_cont'; 
+select * from t1 where pk<100; +pk a +0 100 +1 101 +2 102 +3 3 +4 4 +5 5 +6 6 +7 7 +8 8 +9 9 +40 5100 +41 5100 +43 5100 +44 5100 +45 5100 +46 146 +47 147 +48 148 +49 149 +commit; +set debug_sync='RESET'; +drop table t1, ten, one_k; diff --git a/mysql-test/suite/rocksdb/r/range_locking_rev_cf.result b/mysql-test/suite/rocksdb/r/range_locking_rev_cf.result new file mode 100644 index 000000000000..e39c6bb3339d --- /dev/null +++ b/mysql-test/suite/rocksdb/r/range_locking_rev_cf.result @@ -0,0 +1,556 @@ +show variables like 'rocksdb_use_range_locking'; +Variable_name Value +rocksdb_use_range_locking ON +create table t1 ( +pk int, +a int, +primary key (pk) comment 'rev:cf1' +) engine=rocksdb; +insert into t1 values +(10,10),(20,20),(30,30); +connect con1,localhost,root,,; +connect con2,localhost,root,,; +### Test: check that range lock inhibits a point lock +connection con1; +begin; +select * from t1 where pk between 5 and 25 for update; +pk a +10 10 +20 20 +connection con2; +insert into t1 values (15,15); +ERROR HY000: Lock wait timeout exceeded; try restarting transaction: Timeout on index: test.t1.PRIMARY +connection con1; +rollback; +## Test: check that range lock inhibits another range lock +connection con1; +begin; +select * from t1 where pk between 5 and 25 for update; +pk a +10 10 +20 20 +connection con2; +begin; +select * from t1 where pk between 15 and 35 for update; +ERROR HY000: Lock wait timeout exceeded; try restarting transaction: Timeout on index: test.t1.PRIMARY +rollback; +connection con1; +rollback; +## Test: check that regular read does not get a range lock +connection con1; +begin; +select * from t1 where pk between 5 and 25; +pk a +10 10 +20 20 +connection con2; +begin; +select * from t1 where pk between 15 and 35 for update; +pk a +20 20 +30 30 +rollback; +connection con1; +rollback; +## Test that locks are not released when a statement inside +## a transaction is rolled back +create table t2 ( +pk int, +a int, +primary key (pk) comment 'rev:cf1', 
+unique key(a) comment '' +) engine=rocksdb; +insert into t2 values (1,1),(2,2); +begin; +insert into t2 values (3,3); +insert into t2 values (10,2); +ERROR 23000: Duplicate entry '2' for key 't2.a' +connection con2; +begin; +select * from t2 where pk=3 for update; +ERROR HY000: Lock wait timeout exceeded; try restarting transaction: Timeout on index: test.t2.PRIMARY +rollback; +connection con1; +rollback; +drop table t2; +connection default; +disconnect con1; +disconnect con2; +drop table t1; +# +# Test INFORMATION_SCHEMA.lock_info in range-locking mode +# +connect con1,localhost,root,,; +connection con1; +create table t0 (a int primary key); +begin; +insert into t0 values (1); +connection default; +create table t1 ( +pk int, +a int, +primary key (pk) comment 'rev:cf1' +) engine=rocksdb; +insert into t1 values +(10,10),(20,20),(30,30); +begin; +select * from t1 where pk=10 for update; +pk a +10 10 +# select * from information_schema.rocksdb_locks; # With replacements by select_from_is_rowlocks.inc +COLUMN_FAMILY_ID TRANSACTION_ID KEY mode +$cf_id $trx_id ${indexnr}8000000a X +delete from t1 where pk between 25 and 40; +# select * from information_schema.rocksdb_locks; # With replacements by select_from_is_rowlocks.inc +COLUMN_FAMILY_ID TRANSACTION_ID KEY mode +$cf_id $trx_id ${indexnr}8000000a X +$cf_id $trx_id ${indexnr}80000028-${indexnr}80000019:1 X +rollback; +begin; +# The following will show a range lock on 2-9 and also a point lock on 10. +# This is how things currently work. 
(after MDEV-21314, not anymore) +select * from t1 where pk between 2 and 9 for update; +pk a +# select * from information_schema.rocksdb_locks; # With replacements by select_from_is_rowlocks.inc +COLUMN_FAMILY_ID TRANSACTION_ID KEY mode +$cf_id $trx_id ${indexnr}80000009-${indexnr}80000002:1 X +rollback; +drop table t1; +connection con1; +rollback; +drop table t0; +connection default; +disconnect con1; +# +# MDEV-18104: MyRocks-Gap-Lock: range locking bounds are incorrect for multi-part keys +# +create table t0(a int); +insert into t0 values (0),(1),(2),(3),(4),(5),(6),(7),(8),(9); +create table t1 ( +kp1 int not null, +kp2 int not null, +a int, +primary key(kp1, kp2) comment 'rev:cf1' +) engine=rocksdb; +insert into t1 select 1, a, 1234 from t0; +insert into t1 select 2, a, 1234 from t0; +insert into t1 select 3, a, 1234 from t0; +connect con1,localhost,root,,; +connection con1; +begin; +select * from t1 where kp1=2 for update; +kp1 kp2 a +2 0 1234 +2 1 1234 +2 2 1234 +2 3 1234 +2 4 1234 +2 5 1234 +2 6 1234 +2 7 1234 +2 8 1234 +2 9 1234 +connection default; +# The lock on kp1=2 should inhibit the following INSERT: +insert into t1 values ( 2,5,9999); +ERROR HY000: Lock wait timeout exceeded; try restarting transaction: Timeout on index: test.t1.PRIMARY +rollback; +connection con1; +rollback; +connection default; +disconnect con1; +drop table t0,t1; +# +# Test that locks on ranges on non-unique secondary keys inhibit +# modifications of the contents of these ranges +# +create table t0(a int); +insert into t0 values (0),(1),(2),(3),(4),(5),(6),(7),(8),(9); +create table t1 ( +kp1 int not null, +kp2 int not null, +a int, +key(kp1, kp2) comment 'rev:cf1' +) engine=rocksdb; +insert into t1 select 1, a, 1234 from t0; +insert into t1 values (2, 3, 1234); +insert into t1 values (2, 5, 1234); +insert into t1 values (2, 7, 1234); +insert into t1 select 3, a, 1234 from t0; +connect con1,localhost,root,,; +connection con1; +begin; +explain +select * from t1 where kp1=2 for 
update; +id select_type table partitions type possible_keys key key_len ref rows filtered Extra +1 SIMPLE t1 NULL ref kp1 kp1 4 const # 100.00 NULL +Warnings: +Note 1003 /* select#1 */ select `test`.`t1`.`kp1` AS `kp1`,`test`.`t1`.`kp2` AS `kp2`,`test`.`t1`.`a` AS `a` from `test`.`t1` where (`test`.`t1`.`kp1` = 2) +select * from t1 where kp1=2 for update; +kp1 kp2 a +2 3 1234 +2 5 1234 +2 7 1234 +connection default; +begin; +insert into t1 values (2, 9, 9999); +ERROR HY000: Lock wait timeout exceeded; try restarting transaction: Timeout on index: test.t1.kp1 +delete from t1 where kp1=2 and kp2=5; +ERROR HY000: Lock wait timeout exceeded; try restarting transaction: Timeout on index: test.t1.kp1 +update t1 set kp1=333 where kp1=2 and kp2=3; +ERROR HY000: Lock wait timeout exceeded; try restarting transaction: Timeout on index: test.t1.kp1 +update t1 set kp1=2 where kp1=1 and kp2=8; +ERROR HY000: Lock wait timeout exceeded; try restarting transaction: Timeout on index: test.t1.kp1 +rollback; +connection con1; +rollback; +disconnect con1; +connection default; +drop table t0,t1; +# +# Transaction isolation test +# +create table t1 (pk int primary key, a int) engine=rocksdb; +insert into t1 values (1,1),(2,2),(3,3); +connect con1,localhost,root,,; +# TRX1: Start, Allocate a snapshot +connection con1; +begin; +select * from t1; +pk a +1 1 +2 2 +3 3 +# TRX2: Make a change that TRX1 will not see +connection default; +update t1 set a=2222 where pk=2; +# TRX1: Now, make a change that would overwrite TRX2'x change and commit +connection con1; +update t1 set a=a+1 where pk=2; +commit; +# Examine the result: +# pk=2, a=2223 means UPDATE in TRX1 used "read committed" (InnoDB-like isolation) +# pk=2, a=3 means UPDATE in TRX1 silently overwrote TRX2 +# (and with key tracking, one would get an error on the second UPDATE) +connection default; +select * from t1; +pk a +1 1 +2 2223 +3 3 +disconnect con1; +connection default; +drop table t1; +# +# Same test as above, but check the 
range scan +# +create table t1 ( +pk int, +a int, +primary key (pk) comment 'rev:cf1' +) engine=rocksdb; +insert into t1 values (1,1),(2,2),(3,3),(4,4),(5,5),(6,6); +connect con1,localhost,root,,; +# TRX1: Start, Allocate a snapshot +connection con1; +begin; +select * from t1; +pk a +6 6 +5 5 +4 4 +3 3 +2 2 +1 1 +# TRX2: Make a change that TRX1 will not see +connection default; +update t1 set a=2222 where pk between 3 and 5; +# TRX1: Now, make a change that would overwrite TRX2'x change and commit +connection con1; +update t1 set a=a+1 where pk between 3 and 5; +commit; +# Examine the result: +# pk={3,4,5} a=2223 means UPDATE in TRX1 used "read committed" (InnoDB-like isolation) +connection default; +select * from t1; +pk a +6 6 +5 2223 +4 2223 +3 2223 +2 2 +1 1 +disconnect con1; +connection default; +drop table t1; +# +# Same as above, but test SELECT FOR UPDATE. +# +create table t1 ( +pk int, +a int, +primary key (pk) comment 'rev:cf1' +) engine=rocksdb; +insert into t1 values (1,1),(2,2),(3,3),(4,4),(5,5),(6,6); +connect con1,localhost,root,,; +# TRX1: Start, Allocate a snapshot +connection con1; +begin; +select * from t1; +pk a +6 6 +5 5 +4 4 +3 3 +2 2 +1 1 +# TRX2: Make a change that TRX1 will not see +connection default; +update t1 set a=222 where pk=2; +update t1 set a=333 where pk=3; +# TRX1: Check what select [FOR UPDATE] sees +connection con1; +select * from t1 where pk in (2,3); +pk a +2 2 +3 3 +select * from t1 where pk=2 for update; +pk a +2 222 +select * from t1 where pk=2 lock in share mode; +pk a +2 222 +select * from t1 where pk=2; +pk a +2 2 +commit; +disconnect con1; +connection default; +drop table t1; +# +# Check that I_S.processlist.state is set correctly now. 
+# +create table t1( +pk int, +a int, +primary key(pk) comment 'rev:cf1' +) engine=rocksdb; +insert into t1 values (1,1),(2,2),(3,3); +begin; +select * from t1 where pk=2 for update; +pk a +2 2 +connect con1,localhost,root,,; +begin; +set rocksdb_lock_wait_timeout=300; +select * from t1 where pk=2 for update; +connection default; +# Now, will wait until we see con1 have state="Waiting for row lock" +rollback; +connection con1; +pk a +2 2 +rollback; +disconnect con1; +connection default; +drop table t1; +# +# Test range locking for ranges with HA_READ_PREFIX_LAST +# +create table t0(a int) engine=rocksdb; +insert into t0 values (0),(1),(2),(3),(4),(5),(6),(7),(8),(9); +create table t1 ( +pk1 int, +pk2 int, +a int, +primary key(pk1, pk2) comment 'rev:cf1' +) engine=rocksdb; +insert into t1 +select +A.a, B.a, A.a*10+B.a +from +t0 A, t0 B; +connect con1,localhost,root,,; +connection con1; +begin; +insert into t1 values (0x1112222,0x1112222,0); +connection default; +begin; +# Should use ref access w/o filesort: +explain +select * from t1 +where pk1=3 +order by pk1 desc, pk2 desc +for update; +id select_type table partitions type possible_keys key key_len ref rows filtered Extra +1 SIMPLE t1 NULL ref PRIMARY PRIMARY 4 const # 100.00 Backward index scan +Warnings: +Note 1003 /* select#1 */ select `test`.`t1`.`pk1` AS `pk1`,`test`.`t1`.`pk2` AS `pk2`,`test`.`t1`.`a` AS `a` from `test`.`t1` where (`test`.`t1`.`pk1` = 3) order by `test`.`t1`.`pk1` desc,`test`.`t1`.`pk2` desc +select * from t1 +where pk1=3 +order by pk1 desc, pk2 desc +for update; +pk1 pk2 a +3 9 39 +3 8 38 +3 7 37 +3 6 36 +3 5 35 +3 4 34 +3 3 33 +3 2 32 +3 1 31 +3 0 30 +# select * from information_schema.rocksdb_locks; # With replacements by select_from_is_rowlocks.inc +COLUMN_FAMILY_ID TRANSACTION_ID KEY mode +$cf_id $trx_id ${indexnr}80000003-${indexnr}80000003:1 X +rollback; +# +# Test range locking for ranges with HA_READ_PREFIX_LAST_OR_PREV +# +begin; +# Should use range access with 2 keyparts and w/o 
filesort: +explain +select * from t1 +where pk1=4 and pk2 between 5 and 8 +order by pk1 desc, pk2 desc +for update; +id select_type table partitions type possible_keys key key_len ref rows filtered Extra +1 SIMPLE t1 NULL range PRIMARY PRIMARY 8 NULL # 100.00 Using where; Backward index scan +Warnings: +Note 1003 /* select#1 */ select `test`.`t1`.`pk1` AS `pk1`,`test`.`t1`.`pk2` AS `pk2`,`test`.`t1`.`a` AS `a` from `test`.`t1` where ((`test`.`t1`.`pk1` = 4) and (`test`.`t1`.`pk2` between 5 and 8)) order by `test`.`t1`.`pk1` desc,`test`.`t1`.`pk2` desc +select * from t1 +where pk1=4 and pk2 between 5 and 8 +order by pk1 desc, pk2 desc +for update; +pk1 pk2 a +4 8 48 +4 7 47 +4 6 46 +4 5 45 +# select * from information_schema.rocksdb_locks; # With replacements by select_from_is_rowlocks.inc +COLUMN_FAMILY_ID TRANSACTION_ID KEY mode +$cf_id $trx_id ${indexnr}8000000480000008-${indexnr}8000000480000005:1 X +rollback; +connection con1; +rollback; +connection default; +drop table t0, t1; +# +# A bug: range locking was not used when scan started at table start or end +# +create table t0(a int); +insert into t0 values (0),(1),(2),(3),(4),(5),(6),(7),(8),(9); +create table t10(a int); +insert into t10 select A.a + B.a* 10 + C.a * 100 from t0 A, t0 B, t0 C; +create table t1 ( +pk int not null, +a int, +primary key(pk) +) engine=rocksdb; +insert into t1 select a*2,a*2 from t10; +connection con1; +begin; +select * from t1 where pk=500 for update; +pk a +500 500 +connection default; +begin; +select * from t1 where pk<10 order by pk limit 10 for update; +pk a +0 0 +2 2 +4 4 +6 6 +8 8 +# select * from information_schema.rocksdb_locks; # With replacements by select_from_is_rowlocks.inc +COLUMN_FAMILY_ID TRANSACTION_ID KEY mode +$cf_id $trx_id ${indexnr}-${indexnr}8000000a X +rollback; +begin; +select * from t1 where pk>1990 order by pk desc limit 10 for update; +pk a +1998 1998 +1996 1996 +1994 1994 +1992 1992 +# select * from information_schema.rocksdb_locks; # With replacements 
by select_from_is_rowlocks.inc +COLUMN_FAMILY_ID TRANSACTION_ID KEY mode +$cf_id $trx_id ${indexnr}800007c6-${indexnr+1} X +rollback; +connection con1; +rollback; +disconnect con1; +connection default; +drop table t0,t10,t1; +# +# Range locking and READ-COMMITTED isolation level +# +connect con1,localhost,root,,; +connection con1; +set session transaction isolation level read committed; +create table t1 ( +pk int not null, +a int, +primary key(pk) +) engine=rocksdb; +insert into t1(pk) values (0),(1),(2),(3),(4),(5),(6),(7),(8),(9); +begin; +select * from t1 where pk between 2 and 5 for update; +pk a +2 NULL +3 NULL +4 NULL +5 NULL +# Below should show individual row locks, not locked range: +# select * from information_schema.rocksdb_locks; # With replacements by select_from_is_rowlocks.inc +COLUMN_FAMILY_ID TRANSACTION_ID KEY mode +$cf_id $trx_id ${indexnr}80000002 X +$cf_id $trx_id ${indexnr}80000003 X +$cf_id $trx_id ${indexnr}80000004 X +$cf_id $trx_id ${indexnr}80000005 X +$cf_id $trx_id ${indexnr}80000006 X +rollback; +begin; +update t1 set a=a+1 where pk between 2 and 5; +# Below should show individual row locks, not locked range: +# select * from information_schema.rocksdb_locks; # With replacements by select_from_is_rowlocks.inc +COLUMN_FAMILY_ID TRANSACTION_ID KEY mode +$cf_id $trx_id ${indexnr}80000002 X +$cf_id $trx_id ${indexnr}80000003 X +$cf_id $trx_id ${indexnr}80000004 X +$cf_id $trx_id ${indexnr}80000005 X +$cf_id $trx_id ${indexnr}80000006 X +rollback; +drop table t1; +disconnect con1; +connection default; +# +# Range Locking and READ-COMMITTED, another test +# +create table t1 ( +pk int, +a int, +b int, +primary key (pk), +key(a) +) engine=rocksdb; +insert into t1 values +(1, 100, 1000), +(2, 200, 2000), +(3, 300, 3000); +set transaction isolation level repeatable read; +begin; +update t1 set b = b + 1 where a > 200; +connect con1,localhost,root,,; +connection con1; +set transaction isolation level read committed; +begin; +insert into t1 values 
(4, 150, 1500); +insert into t1 values (5, 250, 1500); +ERROR HY000: Lock wait timeout exceeded; try restarting transaction: Timeout on index: test.t1.a +rollback; +disconnect con1; +connection default; +rollback; +drop table t1; diff --git a/mysql-test/suite/rocksdb/r/range_locking_seek_for_update.result b/mysql-test/suite/rocksdb/r/range_locking_seek_for_update.result new file mode 100644 index 000000000000..bf1cb916a5b6 --- /dev/null +++ b/mysql-test/suite/rocksdb/r/range_locking_seek_for_update.result @@ -0,0 +1,291 @@ +show variables like 'rocksdb_use_range_locking'; +Variable_name Value +rocksdb_use_range_locking ON +create table t0(a int primary key); +insert into t0 values (0),(1),(2),(3),(4),(5),(6),(7),(8),(9); +create table t1 ( +pk int, +a int, +primary key (pk) +) engine=rocksdb; +insert into t1 select +A.a + B.a*10 + C.a*100, +A.a + B.a*10 + C.a*100 +from +t0 A, t0 B, t0 C; +# Make another connection to get the lock tree out of the STO-mode +connect con1,localhost,root,,; +connection con1; +begin; +select * from t1 where pk=10 for update; +pk a +10 10 +connection default; +begin; +select * from t1 where pk=11 for update; +pk a +11 11 +# Now, we will just see locks on 10=0xA and 11=0xB: +# select * from information_schema.rocksdb_locks; # With replacements by select_from_is_rowlocks.inc +COLUMN_FAMILY_ID TRANSACTION_ID KEY mode +$cf_id $trx_id ${indexnr}8000000b X +# +# SeekForUpdate Test #1: A query with type=range (without upper bound) and LIMIT +# +explain +select * from t1 where pk>=500 order by pk limit 3 for update; +id select_type table partitions type possible_keys key key_len ref rows filtered Extra +1 SIMPLE t1 NULL range PRIMARY PRIMARY 4 NULL # 100.00 Using where +Warnings: +Note 1003 /* select#1 */ select `test`.`t1`.`pk` AS `pk`,`test`.`t1`.`a` AS `a` from `test`.`t1` where (`test`.`t1`.`pk` >= 500) order by `test`.`t1`.`pk` limit 3 +select * from t1 where pk>=500 order by pk limit 3 for update; +pk a +500 500 +501 501 +502 502 +# select 
* from information_schema.rocksdb_locks; # With replacements by select_from_is_rowlocks.inc +COLUMN_FAMILY_ID TRANSACTION_ID KEY mode +$cf_id $trx_id ${indexnr}8000000b X +$cf_id $trx_id ${indexnr}800001f4-${indexnr}800001f6 X +rollback; +begin; +select * from t1 where pk=11 for update; +pk a +11 11 +explain +select * from t1 order by pk limit 3 for update; +id select_type table partitions type possible_keys key key_len ref rows filtered Extra +1 SIMPLE t1 NULL index NULL PRIMARY 4 NULL 3 100.00 NULL +Warnings: +Note 1003 /* select#1 */ select `test`.`t1`.`pk` AS `pk`,`test`.`t1`.`a` AS `a` from `test`.`t1` order by `test`.`t1`.`pk` limit 3 +select * from t1 order by pk limit 3 for update; +pk a +0 0 +1 1 +2 2 +# select * from information_schema.rocksdb_locks; # With replacements by select_from_is_rowlocks.inc +COLUMN_FAMILY_ID TRANSACTION_ID KEY mode +$cf_id $trx_id ${indexnr}-${indexnr}80000002 X +$cf_id $trx_id ${indexnr}8000000b X +rollback; +connection con1; +rollback; +disconnect con1; +connection default; +drop table t0, t1; +# +# Concurrent tests: let one thread do SeekForUpdate and the other +# interfere by committing modifications +# +create table t0(a int primary key); +insert into t0 values (0),(1),(2),(3),(4),(5),(6),(7),(8),(9); +create table t1 ( +pk int, +a int, +primary key (pk) +) engine=rocksdb; +insert into t1 select +A.a + B.a*10 + C.a*100, +A.a + B.a*10 + C.a*100 +from +t0 A, t0 B, t0 C; +select * from t1 where pk<10; +pk a +0 0 +1 1 +2 2 +3 3 +4 4 +5 5 +6 6 +7 7 +8 8 +9 9 +delete from t1 where pk<10; +select * from t1 where pk<10; +pk a +# Test what happens when another transaction commits a row +# right before the range we are about to lock (nothing) +explain +select * from t1 where pk >=5 order by pk limit 3 for update; +id select_type table partitions type possible_keys key key_len ref rows filtered Extra +1 SIMPLE t1 NULL range PRIMARY PRIMARY 4 NULL # 100.00 Using where +Warnings: +Note 1003 /* select#1 */ select `test`.`t1`.`pk` AS 
`pk`,`test`.`t1`.`a` AS `a` from `test`.`t1` where (`test`.`t1`.`pk` >= 5) order by `test`.`t1`.`pk` limit 3 +begin; +set debug_sync='rocksdb.locking_iter_scan SIGNAL about_to_lock_range WAIT_FOR spoiler_inserted'; +select * from t1 where pk >=5 order by pk limit 3 for update; +connect con1,localhost,root,,; +connection con1; +set debug_sync='now WAIT_FOR about_to_lock_range'; +insert into t1 values (3,3); +set debug_sync='now SIGNAL spoiler_inserted'; +connection default; +pk a +10 10 +11 11 +12 12 +rollback; +delete from t1 where pk=3; +# +# Now, repeat the test but let the other transaction insert the row into +# the range we are locking +explain +select * from t1 where pk >=5 order by pk limit 1 for update; +id select_type table partitions type possible_keys key key_len ref rows filtered Extra +1 SIMPLE t1 NULL range PRIMARY PRIMARY 4 NULL # 100.00 Using where +Warnings: +Note 1003 /* select#1 */ select `test`.`t1`.`pk` AS `pk`,`test`.`t1`.`a` AS `a` from `test`.`t1` where (`test`.`t1`.`pk` >= 5) order by `test`.`t1`.`pk` limit 1 +begin; +set debug_sync='rocksdb.locking_iter_scan SIGNAL about_to_lock_range WAIT_FOR spoiler_inserted'; +select * from t1 where pk >=5 order by pk limit 1 for update; +connection con1; +set debug_sync='now WAIT_FOR about_to_lock_range'; +insert into t1 values (8,8); +set debug_sync='now SIGNAL spoiler_inserted'; +connection default; +pk a +8 8 +# select * from information_schema.rocksdb_locks; # With replacements by select_from_is_rowlocks.inc +COLUMN_FAMILY_ID TRANSACTION_ID KEY mode +$cf_id $trx_id ${indexnr}80000005-${indexnr}8000000a X +rollback; +delete from t1 where pk=8; +# +# Repeat the third time, this time deleting the row that SeekForUpdate saw +# +insert into t1 values (7,7); +begin; +set debug_sync='rocksdb.locking_iter_scan SIGNAL about_to_lock_range WAIT_FOR spoiler_inserted'; +select * from t1 where pk >=5 order by pk limit 1 for update; +connection con1; +set debug_sync='now WAIT_FOR about_to_lock_range'; +delete 
from t1 where pk=7; +set debug_sync='now SIGNAL spoiler_inserted'; +connection default; +pk a +10 10 +rollback; +# +# Repeat the above test, but let the read fail with ER_LOCK_WAIT_TIMEOUT +# error. MyRocks code should now be prepared that data reads cause this +# error +# +insert into t1 values (7,7); +begin; +set debug_sync='rocksdb.locking_iter_scan SIGNAL about_to_lock_range WAIT_FOR spoiler_inserted'; +select * from t1 where pk >=5 order by pk limit 1 for update; +connection con1; +set debug_sync='now WAIT_FOR about_to_lock_range'; +begin; +delete from t1 where pk=7; +set debug_sync='now SIGNAL spoiler_inserted'; +connection default; +ERROR HY000: Lock wait timeout exceeded; try restarting transaction: Timeout on index: test.t1.PRIMARY +rollback; +connection con1; +rollback; +connection default; +# +# Test the thd_killed check in the iterator +# +begin; +set debug_sync='rocksdb.locking_iter_scan SIGNAL about_to_lock_range WAIT_FOR go_get_killed'; +select * from t1 where pk >=5 order by pk limit 1 for update; +connection con1; +set debug_sync='now WAIT_FOR about_to_lock_range'; +kill query CONN_ID; +connection default; +ERROR HY000: Got error 10 'Operation aborted: ' from ROCKSDB +rollback; +# +# Backward scan test +# +connection con1; +begin; +select * from t1 where pk=500 for update; +pk a +500 500 +connection default; +insert into t1 values +(1001, 1001), +(1005, 1005), +(1007, 1007), +(1010, 1010); +begin; +select * from t1 order by pk desc limit 2 for update; +pk a +1010 1010 +1007 1007 +# The below will lock from pk=1007 (0x3ef) till the end of the table: +# select * from information_schema.rocksdb_locks; # With replacements by select_from_is_rowlocks.inc +COLUMN_FAMILY_ID TRANSACTION_ID KEY mode +$cf_id $trx_id ${indexnr}800003ef-${indexnr+1} X +rollback; +begin; +select * from t1 where pk <1007 order by pk desc limit 2 for update; +pk a +1005 1005 +1001 1001 +# select * from information_schema.rocksdb_locks; # With replacements by 
select_from_is_rowlocks.inc +COLUMN_FAMILY_ID TRANSACTION_ID KEY mode +$cf_id $trx_id ${indexnr}800003e9-${indexnr}800003ef X +connection con1; +rollback; +connection default; +rollback; +# +# Backward scan test 2: error condition +# +connection con1; +begin; +select * from t1 where pk=1010 for update; +pk a +1010 1010 +connection default; +begin; +select * from t1 order by pk desc limit 2 for update; +ERROR HY000: Lock wait timeout exceeded; try restarting transaction: Timeout on index: test.t1.PRIMARY +rollback; +connection con1; +rollback; +begin; +select * from t1 where pk=1007 for update; +pk a +1007 1007 +connection default; +begin; +select * from t1 order by pk desc limit 2 for update; +ERROR HY000: Lock wait timeout exceeded; try restarting transaction: Timeout on index: test.t1.PRIMARY +rollback; +connection con1; +rollback; +disconnect con1; +connection default; +drop table t0,t1; +# +# A test: full table scan doesn't lock gaps +# +create table t1 ( +pk int primary key, +a int +) engine=rocksdb; +insert into t1 values (10,10),(20,20),(30,30); +connect con1,localhost,root,,; +connect con2,localhost,root,,; +connection con1; +begin; +select * from t1 for update; +pk a +10 10 +20 20 +30 30 +connection con2; +insert into t1 values (5,5); +ERROR HY000: Lock wait timeout exceeded; try restarting transaction: Timeout on index: test.t1.PRIMARY +connection con1; +rollback; +disconnect con1; +disconnect con2; +connection default; +drop table t1; diff --git a/mysql-test/suite/rocksdb/r/range_locking_seek_for_update2.result b/mysql-test/suite/rocksdb/r/range_locking_seek_for_update2.result new file mode 100644 index 000000000000..2afb2eea5895 --- /dev/null +++ b/mysql-test/suite/rocksdb/r/range_locking_seek_for_update2.result @@ -0,0 +1,142 @@ +show variables like 'rocksdb_use_range_locking'; +Variable_name Value +rocksdb_use_range_locking ON +create table t0(a int primary key); +insert into t0 values (0),(1),(2),(3),(4),(5),(6),(7),(8),(9); +create table t1 ( +pk 
int, +a int, +primary key (pk) comment 'rlsfu_test' +) engine=rocksdb; +insert into t1 (pk) +select +A.a + B.a*10 + C.a*100 +from +t0 A, t0 B, t0 C; +delete from t1 where pk<100; +connect con1,localhost,root,,; +connection con1; +begin; +set debug_sync='rocksdb.locking_iter_scan SIGNAL about_to_lock_range WAIT_FOR spoiler_inserted'; +select * from t1 where pk >=5 order by pk limit 5 for update; +connection default; +set debug_sync='now WAIT_FOR about_to_lock_range'; +insert into t1 (pk) values +(10),(20),(30),(40),(50); +set debug_sync='now SIGNAL spoiler_inserted'; +connection con1; +pk a +10 NULL +20 NULL +30 NULL +40 NULL +50 NULL +# This must return 1, no 5: +select lock_count from information_schema.rocksdb_trx +where thread_id=CONNECTION_ID(); +lock_count +1 +rollback; +disconnect con1; +connection default; +drop table t0, t1; +# +# A testcase for locking at the end of the scan +# +create table t1 ( +pk int, +primary key (pk) comment 'rlsfu_test' +) engine=rocksdb; +connect con1,localhost,root,,; +connection con1; +insert into t1 values (1), (10), (100); +begin; +select * from t1 for update; +pk +1 +10 +100 +connection default; +select * from t1; +pk +1 +10 +100 +insert into t1 values (150); +ERROR HY000: Lock wait timeout exceeded; try restarting transaction: Timeout on index: test.t1.PRIMARY +connection con1; +rollback; +begin; +explain +select * from t1 order by pk desc for update; +id select_type table partitions type possible_keys key key_len ref rows filtered Extra +1 SIMPLE t1 NULL index NULL PRIMARY 4 NULL # 100.00 Backward index scan; Using index +Warnings: +Note 1003 /* select#1 */ select `test`.`t1`.`pk` AS `pk` from `test`.`t1` order by `test`.`t1`.`pk` desc +select * from t1 order by pk desc for update; +pk +100 +10 +1 +connection default; +select * from t1; +pk +1 +10 +100 +insert into t1 values (0); +ERROR HY000: Lock wait timeout exceeded; try restarting transaction: Timeout on index: test.t1.PRIMARY +disconnect con1; +connection default; 
+drop table t1; +set global rocksdb_enable_iterate_bounds=off; +# +# A testcase for locking at the end of the scan +# +create table t1 ( +pk int, +primary key (pk) comment 'rlsfu_test' +) engine=rocksdb; +connect con1,localhost,root,,; +connection con1; +insert into t1 values (1), (10), (100); +begin; +select * from t1 for update; +pk +1 +10 +100 +connection default; +select * from t1; +pk +1 +10 +100 +insert into t1 values (150); +ERROR HY000: Lock wait timeout exceeded; try restarting transaction: Timeout on index: test.t1.PRIMARY +connection con1; +rollback; +begin; +explain +select * from t1 order by pk desc for update; +id select_type table partitions type possible_keys key key_len ref rows filtered Extra +1 SIMPLE t1 NULL index NULL PRIMARY 4 NULL # 100.00 Backward index scan; Using index +Warnings: +Note 1003 /* select#1 */ select `test`.`t1`.`pk` AS `pk` from `test`.`t1` order by `test`.`t1`.`pk` desc +select * from t1 order by pk desc for update; +pk +100 +10 +1 +connection default; +select * from t1; +pk +1 +10 +100 +insert into t1 values (0); +ERROR HY000: Lock wait timeout exceeded; try restarting transaction: Timeout on index: test.t1.PRIMARY +disconnect con1; +connection default; +drop table t1; +set global rocksdb_enable_iterate_bounds=on; diff --git a/mysql-test/suite/rocksdb/r/range_locking_seek_for_update2_rev_cf.result b/mysql-test/suite/rocksdb/r/range_locking_seek_for_update2_rev_cf.result new file mode 100644 index 000000000000..fef9e0ef6a0b --- /dev/null +++ b/mysql-test/suite/rocksdb/r/range_locking_seek_for_update2_rev_cf.result @@ -0,0 +1,142 @@ +show variables like 'rocksdb_use_range_locking'; +Variable_name Value +rocksdb_use_range_locking ON +create table t0(a int primary key); +insert into t0 values (0),(1),(2),(3),(4),(5),(6),(7),(8),(9); +create table t1 ( +pk int, +a int, +primary key (pk) comment 'rev:rlsfu_test' +) engine=rocksdb; +insert into t1 (pk) +select +A.a + B.a*10 + C.a*100 +from +t0 A, t0 B, t0 C; +delete from t1 where 
pk<100; +connect con1,localhost,root,,; +connection con1; +begin; +set debug_sync='rocksdb.locking_iter_scan SIGNAL about_to_lock_range WAIT_FOR spoiler_inserted'; +select * from t1 where pk >=5 order by pk limit 5 for update; +connection default; +set debug_sync='now WAIT_FOR about_to_lock_range'; +insert into t1 (pk) values +(10),(20),(30),(40),(50); +set debug_sync='now SIGNAL spoiler_inserted'; +connection con1; +pk a +10 NULL +20 NULL +30 NULL +40 NULL +50 NULL +# This must return 1, no 5: +select lock_count from information_schema.rocksdb_trx +where thread_id=CONNECTION_ID(); +lock_count +1 +rollback; +disconnect con1; +connection default; +drop table t0, t1; +# +# A testcase for locking at the end of the scan +# +create table t1 ( +pk int, +primary key (pk) comment 'rev:rlsfu_test' +) engine=rocksdb; +connect con1,localhost,root,,; +connection con1; +insert into t1 values (1), (10), (100); +begin; +select * from t1 for update; +pk +1 +10 +100 +connection default; +select * from t1; +pk +1 +10 +100 +insert into t1 values (150); +ERROR HY000: Lock wait timeout exceeded; try restarting transaction: Timeout on index: test.t1.PRIMARY +connection con1; +rollback; +begin; +explain +select * from t1 order by pk desc for update; +id select_type table partitions type possible_keys key key_len ref rows filtered Extra +1 SIMPLE t1 NULL index NULL PRIMARY 4 NULL # 100.00 Backward index scan; Using index +Warnings: +Note 1003 /* select#1 */ select `test`.`t1`.`pk` AS `pk` from `test`.`t1` order by `test`.`t1`.`pk` desc +select * from t1 order by pk desc for update; +pk +100 +10 +1 +connection default; +select * from t1; +pk +1 +10 +100 +insert into t1 values (0); +ERROR HY000: Lock wait timeout exceeded; try restarting transaction: Timeout on index: test.t1.PRIMARY +disconnect con1; +connection default; +drop table t1; +set global rocksdb_enable_iterate_bounds=off; +# +# A testcase for locking at the end of the scan +# +create table t1 ( +pk int, +primary key (pk) comment 
'rev:rlsfu_test' +) engine=rocksdb; +connect con1,localhost,root,,; +connection con1; +insert into t1 values (1), (10), (100); +begin; +select * from t1 for update; +pk +1 +10 +100 +connection default; +select * from t1; +pk +1 +10 +100 +insert into t1 values (150); +ERROR HY000: Lock wait timeout exceeded; try restarting transaction: Timeout on index: test.t1.PRIMARY +connection con1; +rollback; +begin; +explain +select * from t1 order by pk desc for update; +id select_type table partitions type possible_keys key key_len ref rows filtered Extra +1 SIMPLE t1 NULL index NULL PRIMARY 4 NULL # 100.00 Backward index scan; Using index +Warnings: +Note 1003 /* select#1 */ select `test`.`t1`.`pk` AS `pk` from `test`.`t1` order by `test`.`t1`.`pk` desc +select * from t1 order by pk desc for update; +pk +100 +10 +1 +connection default; +select * from t1; +pk +1 +10 +100 +insert into t1 values (0); +ERROR HY000: Lock wait timeout exceeded; try restarting transaction: Timeout on index: test.t1.PRIMARY +disconnect con1; +connection default; +drop table t1; +set global rocksdb_enable_iterate_bounds=on; diff --git a/mysql-test/suite/rocksdb/r/range_locking_shared_locks.result b/mysql-test/suite/rocksdb/r/range_locking_shared_locks.result new file mode 100644 index 000000000000..580108de6f6b --- /dev/null +++ b/mysql-test/suite/rocksdb/r/range_locking_shared_locks.result @@ -0,0 +1,251 @@ +select @@rocksdb_use_range_locking; +@@rocksdb_use_range_locking +1 +create table t0 (a int); +insert into t0 values (0),(1),(2),(3),(4),(5),(6),(7),(8),(9); +create table t1 ( +pk int primary key, +a int +) engine=rocksdb; +insert into t1 select a,a from t0; +# A basic test for shared locks +begin; +select * from t1 where pk=3 for update; +pk a +3 3 +select * from t1 where pk=5 lock in share mode; +pk a +5 5 +connect con1,localhost,root,,; +connection con1; +begin; +select * from t1 where pk=5 lock in share mode; +pk a +5 5 +# Now for pk=5 we should see two locks by TRX1 and TRX2 with mode=S: 
+# select * from information_schema.rocksdb_locks; # With replacements by select_from_is_rowlocks.inc +COLUMN_FAMILY_ID TRANSACTION_ID KEY mode +$cf_id $TRX1_ID ${indexnr}80000003 X +$cf_id $TRX1_ID ${indexnr}80000005 S +$cf_id $TRX2_ID ${indexnr}80000005 S +rollback; +# Now, TRX2_ID should be gone: +# select * from information_schema.rocksdb_locks; # With replacements by select_from_is_rowlocks.inc +COLUMN_FAMILY_ID TRANSACTION_ID KEY mode +$cf_id $TRX1_ID ${indexnr}80000003 X +$cf_id $TRX1_ID ${indexnr}80000005 S +connection default; +# Get a read lock on pk=3 (where we have a write lock). +# The result should be that we will still have a write lock +select * from t1 where pk=3 for update; +pk a +3 3 +# select * from information_schema.rocksdb_locks; # With replacements by select_from_is_rowlocks.inc +COLUMN_FAMILY_ID TRANSACTION_ID KEY mode +$cf_id $TRX1_ID ${indexnr}80000003 X +$cf_id $TRX1_ID ${indexnr}80000005 S +# Get a write lock on pk=5 (where we have a read lock). +# The result should be that we will have a write lock. 
+select * from t1 where pk=5 for update; +pk a +5 5 +# select * from information_schema.rocksdb_locks; # With replacements by select_from_is_rowlocks.inc +COLUMN_FAMILY_ID TRANSACTION_ID KEY mode +$cf_id $TRX1_ID ${indexnr}80000003 X +$cf_id $TRX1_ID ${indexnr}80000005 X +connection default; +rollback; +# +# Test if a read lock inhibits write locks +# +begin; +select * from t1 where pk=2 lock in share mode; +pk a +2 2 +select * from t1 where pk=8 for update; +pk a +8 8 +connection con1; +begin; +select * from t1 where pk=2 for update; +ERROR HY000: Lock wait timeout exceeded; try restarting transaction: Timeout on index: test.t1.PRIMARY +select * from t1 where pk between 0 and 4 for update; +ERROR HY000: Lock wait timeout exceeded; try restarting transaction: Timeout on index: test.t1.PRIMARY +delete from t1 where pk=2; +ERROR HY000: Lock wait timeout exceeded; try restarting transaction: Timeout on index: test.t1.PRIMARY +# Get a shared lock +select * from t1 where pk=2 lock in share mode; +pk a +2 2 +# But this should still prevent us from acquiring a write lock on that value: +select * from t1 where pk=2 for update; +ERROR HY000: Lock wait timeout exceeded; try restarting transaction: Timeout on index: test.t1.PRIMARY +rollback; +connection default; +rollback; +drop table t1; +create table t1 ( +pk int not null primary key, +a int not null, +key(a) +) engine=rocksdb; +insert into t1 +select +A.a+10*B.a+100*C.a+1000*D.a, A.a+10*B.a+100*C.a+1000*D.a +from +t0 A, t0 B, t0 C, t0 D; +set global rocksdb_force_flush_memtable_now=1; +connection con1; +begin; +select * from t1 where pk=900 for update; +pk a +900 900 +connection default; +begin; +explain +select * from t1 where a between 2 and 5 lock in share mode; +id select_type table partitions type possible_keys key key_len ref rows filtered Extra +1 SIMPLE t1 NULL range a a 4 NULL # 100.00 Using where; Using index +Warnings: +Note 1003 /* select#1 */ select `test`.`t1`.`pk` AS `pk`,`test`.`t1`.`a` AS `a` from 
`test`.`t1` where (`test`.`t1`.`a` between 2 and 5) +select * from t1 where a between 2 and 5 lock in share mode; +pk a +2 2 +3 3 +4 4 +5 5 +# select * from information_schema.rocksdb_locks; # With replacements by select_from_is_rowlocks.inc +COLUMN_FAMILY_ID TRANSACTION_ID KEY mode +$cf_id $TRX1_ID ${indexnr+1}80000002-${indexnr+1}80000005:1 X +$cf_id $TRX1_ID ${indexnr}80000002 S +$cf_id $TRX1_ID ${indexnr}80000003 S +$cf_id $TRX1_ID ${indexnr}80000004 S +$cf_id $TRX1_ID ${indexnr}80000005 S +$cf_id $TRX1_ID ${indexnr}80000006 S +$cf_id $TRX2_ID ${indexnr}80000384 X +rollback; +disconnect con1; +drop table t0,t1; +# +# Test shared point locks and lock escalation +# +create table t0 (a int); +insert into t0 values (0),(1),(2),(3),(4),(5),(6),(7),(8),(9); +create table t1 ( +pk int primary key, +a int +) engine=rocksdb; +insert into t1 +select 1000 + 100*A.a + 10*B.a + C.a, 12345 from t0 A, t0 B, t0 C; +show status like 'rocksdb_locktree_current_lock_memory'; +Variable_name Value +rocksdb_locktree_current_lock_memory 0 +connect con1,localhost,root,,; +connection con1; +begin; +# CON1: get some shared locks +select * from t1 where pk=1001 lock in share mode; +pk a +1001 12345 +select * from t1 where pk=1100 lock in share mode; +pk a +1100 12345 +select * from t1 where pk=1200 lock in share mode; +pk a +1200 12345 +select * from t1 where pk=2500 lock in share mode; +pk a +connection default; +begin; +# DEFAULT: get the same locks so we have locks with multiple owners +select * from t1 where pk=1001 lock in share mode; +pk a +1001 12345 +select * from t1 where pk=1100 lock in share mode; +pk a +1100 12345 +select * from t1 where pk=1200 lock in share mode; +pk a +1200 12345 +# DEFAULT: get shared locks with one owner: +select * from t1 where pk=2510 lock in share mode; +pk a +# DEFAULT: exclusive locks on 0-10: +insert into t1 select A.a, 0 from t0 A; +connection con1; +# CON1: exclusive locks on 2000-2010: +insert into t1 select 2000+A.a, 0 from t0 A; +# select * 
from information_schema.rocksdb_locks; # With replacements by select_from_is_rowlocks.inc +COLUMN_FAMILY_ID TRANSACTION_ID KEY mode +$cf_id $TRX2_ID ${indexnr}80000000 X +$cf_id $TRX2_ID ${indexnr}80000001 X +$cf_id $TRX2_ID ${indexnr}80000002 X +$cf_id $TRX2_ID ${indexnr}80000003 X +$cf_id $TRX2_ID ${indexnr}80000004 X +$cf_id $TRX2_ID ${indexnr}80000005 X +$cf_id $TRX2_ID ${indexnr}80000006 X +$cf_id $TRX2_ID ${indexnr}80000007 X +$cf_id $TRX2_ID ${indexnr}80000008 X +$cf_id $TRX2_ID ${indexnr}80000009 X +$cf_id $TRX1_ID ${indexnr}800003e9 S +$cf_id $TRX2_ID ${indexnr}800003e9 S +$cf_id $TRX1_ID ${indexnr}8000044c S +$cf_id $TRX2_ID ${indexnr}8000044c S +$cf_id $TRX1_ID ${indexnr}800004b0 S +$cf_id $TRX2_ID ${indexnr}800004b0 S +$cf_id $TRX1_ID ${indexnr}800007d0 X +$cf_id $TRX1_ID ${indexnr}800007d1 X +$cf_id $TRX1_ID ${indexnr}800007d2 X +$cf_id $TRX1_ID ${indexnr}800007d3 X +$cf_id $TRX1_ID ${indexnr}800007d4 X +$cf_id $TRX1_ID ${indexnr}800007d5 X +$cf_id $TRX1_ID ${indexnr}800007d6 X +$cf_id $TRX1_ID ${indexnr}800007d7 X +$cf_id $TRX1_ID ${indexnr}800007d8 X +$cf_id $TRX1_ID ${indexnr}800007d9 X +$cf_id $TRX1_ID ${indexnr}800009c4 S +$cf_id $TRX2_ID ${indexnr}800009ce S +connection default; +show status like 'rocksdb_locktree_current_lock_memory'; +Variable_name Value +rocksdb_locktree_current_lock_memory 8792 +set @save_mlm= @@rocksdb_max_lock_memory; +# Set the limit to cause lock escalation: +set @cur_mem_usage= (select +variable_value +from +performance_schema.global_status +where +variable_name='rocksdb_locktree_current_lock_memory'); +set global rocksdb_max_lock_memory = cast(@cur_mem_usage+4 as SIGNED); +connection con1; +insert into t1 select 3000+A.a, 0 from t0 A; +# select * from information_schema.rocksdb_locks; # With replacements by select_from_is_rowlocks.inc +COLUMN_FAMILY_ID TRANSACTION_ID KEY mode +$cf_id $TRX2_ID ${indexnr}80000000-${indexnr}80000009 X +$cf_id $TRX1_ID ${indexnr}800003e9 S +$cf_id $TRX2_ID ${indexnr}800003e9 S +$cf_id 
$TRX1_ID ${indexnr}8000044c S +$cf_id $TRX2_ID ${indexnr}8000044c S +$cf_id $TRX1_ID ${indexnr}800004b0 S +$cf_id $TRX2_ID ${indexnr}800004b0 S +$cf_id $TRX1_ID ${indexnr}800007d0-${indexnr}800007d9 X +$cf_id $TRX1_ID ${indexnr}800009c4 S +$cf_id $TRX2_ID ${indexnr}800009ce S +$cf_id $TRX1_ID ${indexnr}80000bb8 X +$cf_id $TRX1_ID ${indexnr}80000bb9 X +$cf_id $TRX1_ID ${indexnr}80000bba X +$cf_id $TRX1_ID ${indexnr}80000bbb X +$cf_id $TRX1_ID ${indexnr}80000bbc X +$cf_id $TRX1_ID ${indexnr}80000bbd X +$cf_id $TRX1_ID ${indexnr}80000bbe X +$cf_id $TRX1_ID ${indexnr}80000bbf X +$cf_id $TRX1_ID ${indexnr}80000bc0 X +$cf_id $TRX1_ID ${indexnr}80000bc1 X +connection con1; +rollback; +connection default; +rollback; +disconnect con1; +set global rocksdb_max_lock_memory= cast(@save_mlm as SIGNED); +drop table t0, t1; diff --git a/mysql-test/suite/rocksdb/r/rocksdb.result b/mysql-test/suite/rocksdb/r/rocksdb.result index 72312d0edaea..a6eaaddcd720 100644 --- a/mysql-test/suite/rocksdb/r/rocksdb.result +++ b/mysql-test/suite/rocksdb/r/rocksdb.result @@ -1018,6 +1018,7 @@ rocksdb_max_compaction_history 64 rocksdb_max_file_opening_threads 16 rocksdb_max_intrinsic_tmp_table_write_count 1000 rocksdb_max_latest_deadlocks 5 +rocksdb_max_lock_memory 1073741824 rocksdb_max_log_file_size 0 rocksdb_max_manifest_file_size 1073741824 rocksdb_max_manual_compactions 10 @@ -1091,6 +1092,8 @@ rocksdb_use_direct_reads OFF rocksdb_use_fsync OFF rocksdb_use_hyper_clock_cache OFF rocksdb_use_io_uring OFF +rocksdb_use_range_lock_manager_as_point OFF +rocksdb_use_range_locking OFF rocksdb_use_write_buffer_manager OFF rocksdb_validate_tables 1 rocksdb_verify_row_debug_checksums OFF diff --git a/mysql-test/suite/rocksdb/r/rocksdb_read_free_rpl.result b/mysql-test/suite/rocksdb/r/rocksdb_read_free_rpl.result index 58329d03ebc9..71e6ff4d30b2 100644 --- a/mysql-test/suite/rocksdb/r/rocksdb_read_free_rpl.result +++ b/mysql-test/suite/rocksdb/r/rocksdb_read_free_rpl.result @@ -72,7 +72,7 @@ update t1 set 
c2=100 where c1=3; delete from t1 where c1 <= 2; include/sync_slave_sql_with_master.inc [connection slave] -select case when variable_value-@up > 0 then 'false' else 'true' end as read_free from performance_schema.global_status where variable_name='rocksdb_num_get_for_update_calls'; +select case when (@@rocksdb_use_range_locking=1 OR variable_value-@up > 0) then 'false' else 'true' end as read_free from performance_schema.global_status where variable_name='rocksdb_num_get_for_update_calls'; read_free false select * from t1; diff --git a/mysql-test/suite/rocksdb/r/rocksdb_timeout_rollback.result b/mysql-test/suite/rocksdb/r/rocksdb_timeout_rollback.result index 1e253a9974b3..08a0a2f59426 100644 --- a/mysql-test/suite/rocksdb/r/rocksdb_timeout_rollback.result +++ b/mysql-test/suite/rocksdb/r/rocksdb_timeout_rollback.result @@ -36,6 +36,9 @@ rocksdb_rollback_on_timeout OFF begin work; insert into t1 values (9); insert into t1 values (10); +# Fix for Range Locking: force a snapshot to be taken: +select * from t1 where a=100; +a update t1 set a = a + 1 where a = 2; begin work; insert into t1 values (11); diff --git a/mysql-test/suite/rocksdb/r/select_count_for_update.result b/mysql-test/suite/rocksdb/r/select_count_for_update.result index 1107aa2f6cb0..6672d43eb438 100644 --- a/mysql-test/suite/rocksdb/r/select_count_for_update.result +++ b/mysql-test/suite/rocksdb/r/select_count_for_update.result @@ -35,9 +35,9 @@ SELECT COUNT(*) FROM t1 FORCE INDEX (sk); COUNT(*) 3 SELECT COUNT(*) FROM t1 FORCE INDEX (sk) LOCK IN SHARE MODE; -ERROR HY000: Lock wait timeout exceeded; try restarting transaction: Timeout on index: test.t1.PRIMARY +ERROR HY000: Lock wait timeout exceeded; try restarting transaction: Timeout on index: $FAILING_INDEX SELECT COUNT(*) FROM t1 FORCE INDEX (sk) FOR UPDATE; -ERROR HY000: Lock wait timeout exceeded; try restarting transaction: Timeout on index: test.t1.PRIMARY +ERROR HY000: Lock wait timeout exceeded; try restarting transaction: Timeout on index: 
$FAILING_INDEX connection con1; COMMIT; SELECT COUNT(*) FROM t1 FORCE INDEX (sk); diff --git a/mysql-test/suite/rocksdb/r/trx_info.result b/mysql-test/suite/rocksdb/r/trx_info.result index ada2e127021e..562c1d7022ed 100644 --- a/mysql-test/suite/rocksdb/r/trx_info.result +++ b/mysql-test/suite/rocksdb/r/trx_info.result @@ -9,5 +9,10 @@ a 2 select * from information_schema.rocksdb_trx; TRANSACTION_ID STATE NAME WRITE_COUNT LOCK_COUNT TIMEOUT_SEC WAITING_KEY WAITING_COLUMN_FAMILY_ID IS_REPLICATION SKIP_TRX_API READ_ONLY HAS_DEADLOCK_DETECTION NUM_ONGOING_BULKLOAD THREAD_ID QUERY -_TRX_ID_ STARTED _NAME_ 0 2 1 _KEY_ 0 0 0 0 0 0 _THREAD_ID_ select * from information_schema.rocksdb_trx +_TRX_ID_ STARTED _NAME_ 0 2_or_3 1 _KEY_ 0 0 0 0 0 0 _THREAD_ID_ select * from information_schema.rocksdb_trx +select +if(@@rocksdb_use_range_locking=1, LOCK_COUNT=3, LOCK_COUNT=2) as LOCK_COUNT_IS_CORRECT +from information_schema.rocksdb_trx; +LOCK_COUNT_IS_CORRECT +1 DROP TABLE t1; diff --git a/mysql-test/suite/rocksdb/r/unique_sec.result b/mysql-test/suite/rocksdb/r/unique_sec.result index 1da78db24b1c..d4ef2e0ff2ee 100644 --- a/mysql-test/suite/rocksdb/r/unique_sec.result +++ b/mysql-test/suite/rocksdb/r/unique_sec.result @@ -114,6 +114,10 @@ ERROR 23000: Duplicate entry '37' for key 't1.id5' UPDATE t1 SET id5=34 WHERE id1=38; ERROR HY000: Lock wait timeout exceeded; try restarting transaction: Timeout on index: test.t1.id5 # NULL values are unique +# (Note: the following UPDATE reads through the whole table without +# finding anything to update. 
With point locking, this is fine, +# but with range locking it will time out while waiting on a row lock +# that the other transaction is holding) UPDATE t1 SET id5=NULL WHERE value1 > 37; COMMIT; COMMIT; diff --git a/mysql-test/suite/rocksdb/r/unique_sec_rev_cf.result b/mysql-test/suite/rocksdb/r/unique_sec_rev_cf.result index d6d06f6ece5a..0e71e6481aa7 100644 --- a/mysql-test/suite/rocksdb/r/unique_sec_rev_cf.result +++ b/mysql-test/suite/rocksdb/r/unique_sec_rev_cf.result @@ -114,6 +114,10 @@ ERROR 23000: Duplicate entry '37' for key 't1.id5' UPDATE t1 SET id5=34 WHERE id1=38; ERROR HY000: Lock wait timeout exceeded; try restarting transaction: Timeout on index: test.t1.id5 # NULL values are unique +# (Note: the following UPDATE reads through the whole table without +# finding anything to update. With point locking, this is fine, +# but with range locking it will time out while waiting on a row lock +# that the other transaction is holding) UPDATE t1 SET id5=NULL WHERE value1 > 37; COMMIT; COMMIT; diff --git a/mysql-test/suite/rocksdb/t/deadlock_tracking.test b/mysql-test/suite/rocksdb/t/deadlock_tracking.test index 42e46bb0f28f..55e6502c079b 100644 --- a/mysql-test/suite/rocksdb/t/deadlock_tracking.test +++ b/mysql-test/suite/rocksdb/t/deadlock_tracking.test @@ -1,3 +1,9 @@ +# Deadlock #5 uses SELECT ... LOCK IN SHARE MODE; +# SHOW ENGINE ROCKSDB TRANSACTION status prints information about deadlocks. 
+# A part of this test that works with range locking is in +# range_locking_deadlock_tracking.test +--source suite/rocksdb/include/not_range_locking.inc + set @prior_lock_wait_timeout = @@rocksdb_lock_wait_timeout; set @prior_deadlock_detect = @@rocksdb_deadlock_detect; set @prior_max_latest_deadlocks = @@rocksdb_max_latest_deadlocks; @@ -137,7 +143,6 @@ rollback; connection default; --replace_regex /[0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}:[0-9]{2}/TIMESTAMP/ /WAITING KEY: [0-9a-f]{16}/KEY/ /TRANSACTION ID: [0-9]*/TXN_ID/ /TIMESTAMP: [0-9]*/TSTAMP/ show engine rocksdb transaction status; - echo Deadlock #6; connection con1; create table t1 (id int primary key, value int) engine=rocksdb; diff --git a/mysql-test/suite/rocksdb/t/drop_cf_before_show_deadlock_info.test b/mysql-test/suite/rocksdb/t/drop_cf_before_show_deadlock_info.test index f7eb8151f40d..05ae30f2dddd 100644 --- a/mysql-test/suite/rocksdb/t/drop_cf_before_show_deadlock_info.test +++ b/mysql-test/suite/rocksdb/t/drop_cf_before_show_deadlock_info.test @@ -3,6 +3,10 @@ --source include/have_rocksdb.inc --source include/count_sessions.inc +# Doesn't work with range locking because range locking +# does not provide info in rocksdb_deadlock. +--source suite/rocksdb/include/not_range_locking.inc + --disable_query_log call mtr.add_suppression("Column family '[a-z_]+' not found"); --enable_query_log diff --git a/mysql-test/suite/rocksdb/t/hermitage-range_locking.test b/mysql-test/suite/rocksdb/t/hermitage-range_locking.test new file mode 100644 index 000000000000..b62eb74fa44e --- /dev/null +++ b/mysql-test/suite/rocksdb/t/hermitage-range_locking.test @@ -0,0 +1,21 @@ +--source include/have_rocksdb.inc + +# Range locking uses InnoDB-like transaction isolation, which +# means the results differ from "true" Repeatable Read. +--source suite/rocksdb/include/have_range_locking.inc + + +# Hermitage is an attempt to test transaction isolation levels. 
+# https://github.com/ept/hermitage + +let $trx_isolation = READ COMMITTED; +let $skip_snapshot_validation = 0; +--source hermitage.inc + +let $trx_isolation = REPEATABLE READ; +let $skip_snapshot_validation = 0; +--source hermitage.inc + +let $trx_isolation = REPEATABLE READ; +let $skip_snapshot_validation = 1; +--source hermitage.inc diff --git a/mysql-test/suite/rocksdb/t/hermitage.inc b/mysql-test/suite/rocksdb/t/hermitage.inc index f7f8ef1f9a8d..fc26d0f51596 100644 --- a/mysql-test/suite/rocksdb/t/hermitage.inc +++ b/mysql-test/suite/rocksdb/t/hermitage.inc @@ -111,6 +111,8 @@ select * from test where value % 3 = 0; commit; --source hermitage_init.inc +let $RC_OR_RANGE_LOCKING=`select @@tx_isolation='READ-COMMITTED' OR @@rocksdb_use_range_locking=1`; +let $RR_AND_NOT_RANGE_LOCKING=`select @@tx_isolation='REPEATABLE-READ' AND @@rocksdb_use_range_locking=0`; connection con1; update test set value = value + 10; connection con2; @@ -120,13 +122,13 @@ send delete from test where value = 20; connection con1; commit; connection con2; -if ($trx_isolation == "READ COMMITTED") +if ($RC_OR_RANGE_LOCKING) { reap; # RC: Returns 2 => 30 select * from test; } -if ($trx_isolation == "REPEATABLE READ") +if ($RR_AND_NOT_RANGE_LOCKING) { if ($skip_snapshot_validation == 0) { @@ -157,13 +159,13 @@ send update test set value = 12 where id = 1; connection con1; commit; connection con2; -if ($trx_isolation == "READ COMMITTED") +if ($RC_OR_RANGE_LOCKING) { reap; # RC: Returns 1 => 12 select * from test; } -if ($trx_isolation == "REPEATABLE READ") +if ($RR_AND_NOT_RANGE_LOCKING) { if ($skip_snapshot_validation == 0) { @@ -218,12 +220,12 @@ update test set value = 12 where id = 1; update test set value = 18 where id = 2; commit; connection con1; -if ($trx_isolation == "READ COMMITTED") +if ($RC_OR_RANGE_LOCKING) { delete from test where value = 20; # doesn't delete anything select * from test where id = 2; # shows 2 => 18 } -if ($trx_isolation == "REPEATABLE READ") +if 
($RR_AND_NOT_RANGE_LOCKING) { if ($skip_snapshot_validation == 0) { diff --git a/mysql-test/suite/rocksdb/t/hermitage.test b/mysql-test/suite/rocksdb/t/hermitage.test index 35842e9f29b4..14fab7dc348d 100644 --- a/mysql-test/suite/rocksdb/t/hermitage.test +++ b/mysql-test/suite/rocksdb/t/hermitage.test @@ -1,5 +1,8 @@ --source include/have_rocksdb.inc +# See hermitage-range_locking variant +--source suite/rocksdb/include/not_range_locking.inc + # Hermitage is an attempt to test transaction isolation levels. # https://github.com/ept/hermitage diff --git a/mysql-test/suite/rocksdb/t/i_s_deadlock.test b/mysql-test/suite/rocksdb/t/i_s_deadlock.test index e0479d6a3370..82fa9fc6bbdc 100644 --- a/mysql-test/suite/rocksdb/t/i_s_deadlock.test +++ b/mysql-test/suite/rocksdb/t/i_s_deadlock.test @@ -1,5 +1,9 @@ --source include/have_rocksdb.inc +# Uses LOCK IN SHARE MODE and so will hang in range-locking mode. The part that +# doesn't hang is in rocksdb.range_locking_i_s_deadlock.test +--source suite/rocksdb/include/not_range_locking.inc + set @prior_lock_wait_timeout = @@rocksdb_lock_wait_timeout; set @prior_deadlock_detect = @@rocksdb_deadlock_detect; set @prior_max_latest_deadlocks = @@rocksdb_max_latest_deadlocks; diff --git a/mysql-test/suite/rocksdb/t/issue111.test b/mysql-test/suite/rocksdb/t/issue111.test index 85b5b019e756..a08a6f4e0ba7 100644 --- a/mysql-test/suite/rocksdb/t/issue111.test +++ b/mysql-test/suite/rocksdb/t/issue111.test @@ -1,4 +1,7 @@ --source include/have_rocksdb.inc +# The testcase here assumes key tracking is present +# (and range locking uses InnoDB-like approach, "DMLs use Read Commited") +--source suite/rocksdb/include/not_range_locking.inc # Save the initial number of concurrent sessions --source include/count_sessions.inc diff --git a/mysql-test/suite/rocksdb/t/issue243_transactionStatus-range_locking.test b/mysql-test/suite/rocksdb/t/issue243_transactionStatus-range_locking.test new file mode 100644 index 000000000000..465fb9099dac --- 
/dev/null +++ b/mysql-test/suite/rocksdb/t/issue243_transactionStatus-range_locking.test @@ -0,0 +1,10 @@ +# +# A range-locking variant of issue243_transactionStatus.test + +--source include/have_rocksdb.inc +--source suite/rocksdb/include/have_range_locking.inc + +let $forced_range_locking=1; +--source issue243_transactionStatus.test + + diff --git a/mysql-test/suite/rocksdb/t/issue243_transactionStatus.test b/mysql-test/suite/rocksdb/t/issue243_transactionStatus.test index 1e2f0b41226b..5c1948ebe817 100644 --- a/mysql-test/suite/rocksdb/t/issue243_transactionStatus.test +++ b/mysql-test/suite/rocksdb/t/issue243_transactionStatus.test @@ -1,5 +1,9 @@ --source include/have_rocksdb.inc +if (!$forced_range_locking) { +--source suite/rocksdb/include/not_range_locking.inc +} + --disable_warnings DROP TABLE IF EXISTS t1; --enable_warnings diff --git a/mysql-test/suite/rocksdb/t/level_repeatable_read-range_locking.test b/mysql-test/suite/rocksdb/t/level_repeatable_read-range_locking.test new file mode 100644 index 000000000000..6c42c7be12cf --- /dev/null +++ b/mysql-test/suite/rocksdb/t/level_repeatable_read-range_locking.test @@ -0,0 +1,9 @@ +--source include/have_rocksdb.inc + +# Range locking uses InnoDB-like transaction isolation, which +# means the results differ from "true" Repeatable Read. 
+--source suite/rocksdb/include/have_range_locking.inc + +let $trx_isolation = REPEATABLE READ; +--source transaction_isolation.inc + diff --git a/mysql-test/suite/rocksdb/t/level_repeatable_read.test b/mysql-test/suite/rocksdb/t/level_repeatable_read.test index cf29073f69ec..b81dcf31ab1b 100644 --- a/mysql-test/suite/rocksdb/t/level_repeatable_read.test +++ b/mysql-test/suite/rocksdb/t/level_repeatable_read.test @@ -1,5 +1,8 @@ --source include/have_rocksdb.inc +# See level_repeatable_read-range_locking variant +--source suite/rocksdb/include/not_range_locking.inc + let $trx_isolation = REPEATABLE READ; --source transaction_isolation.inc diff --git a/mysql-test/suite/rocksdb/t/lock_info.test b/mysql-test/suite/rocksdb/t/lock_info.test index 1b624cf38c05..a277c1b8d8de 100644 --- a/mysql-test/suite/rocksdb/t/lock_info.test +++ b/mysql-test/suite/rocksdb/t/lock_info.test @@ -1,5 +1,8 @@ --source include/have_rocksdb.inc +# Range Locking supports I_S.lock_info but its printout is different (see range_locking.test) +--source suite/rocksdb/include/not_range_locking.inc + --disable_warnings DROP TABLE IF EXISTS t1; DROP TABLE IF EXISTS t2; diff --git a/mysql-test/suite/rocksdb/t/locking_issues.test b/mysql-test/suite/rocksdb/t/locking_issues.test index 035046ae368d..95a6676f78a2 100644 --- a/mysql-test/suite/rocksdb/t/locking_issues.test +++ b/mysql-test/suite/rocksdb/t/locking_issues.test @@ -1,5 +1,8 @@ --source include/have_rocksdb.inc +# A lot of tests below assume point locking, not range. 
+--source suite/rocksdb/include/not_range_locking.inc + let $isolation_level = REPEATABLE READ; --source suite/rocksdb/include/locking_issues_case1_1.inc diff --git a/mysql-test/suite/rocksdb/t/max_row_locks.test b/mysql-test/suite/rocksdb/t/max_row_locks.test index 694b93802829..18a846dd8990 100644 --- a/mysql-test/suite/rocksdb/t/max_row_locks.test +++ b/mysql-test/suite/rocksdb/t/max_row_locks.test @@ -1,4 +1,5 @@ --source include/have_rocksdb.inc +--source suite/rocksdb/include/not_range_locking.inc create table t1 (id1 bigint, id2 bigint, c1 bigint, c2 bigint, c3 bigint, c4 bigint, c5 bigint, c6 bigint, c7 bigint, primary key (id1, id2), index i(c1, c2)); --disable_query_log diff --git a/mysql-test/suite/rocksdb/t/range_locking.inc b/mysql-test/suite/rocksdb/t/range_locking.inc new file mode 100644 index 000000000000..b3ec0ae7367b --- /dev/null +++ b/mysql-test/suite/rocksdb/t/range_locking.inc @@ -0,0 +1,612 @@ +# +# Range locking tests. +# + +--source include/have_rocksdb.inc +--source suite/rocksdb/include/have_range_locking.inc + +--enable_connect_log + + +show variables like 'rocksdb_use_range_locking'; + +eval create table t1 ( + pk int, + a int, + primary key (pk) comment '$pk_cf' +) engine=rocksdb; + +insert into t1 values +(10,10),(20,20),(30,30); + +connect (con1,localhost,root,,); +connect (con2,localhost,root,,); + +--echo ### Test: check that range lock inhibits a point lock +connection con1; +begin; +select * from t1 where pk between 5 and 25 for update; + +connection con2; +--error ER_LOCK_WAIT_TIMEOUT +insert into t1 values (15,15); + +connection con1; +rollback; + +--echo ## Test: check that range lock inhibits another range lock +connection con1; +begin; +select * from t1 where pk between 5 and 25 for update; + +connection con2; +begin; +--error ER_LOCK_WAIT_TIMEOUT +select * from t1 where pk between 15 and 35 for update; +rollback; + +connection con1; +rollback; + +--echo ## Test: check that regular read does not get a range lock +connection 
con1; +begin; +select * from t1 where pk between 5 and 25; + +connection con2; +begin; +# This must not block +select * from t1 where pk between 15 and 35 for update; +rollback; + +connection con1; +rollback; + +--echo ## Test that locks are not released when a statement inside +--echo ## a transaction is rolled back +eval +create table t2 ( + pk int, + a int, + primary key (pk) comment '$pk_cf', + unique key(a) comment '$sk_cf' +) engine=rocksdb; + +insert into t2 values (1,1),(2,2); + +begin; +insert into t2 values (3,3); +--error ER_DUP_ENTRY +insert into t2 values (10,2); + +connection con2; +begin; +# This must time out: +--error ER_LOCK_WAIT_TIMEOUT +select * from t2 where pk=3 for update; + +rollback; +connection con1; +rollback; +drop table t2; + +# cleanup +connection default; +disconnect con1; +disconnect con2; +drop table t1; + +--echo # +--echo # Test INFORMATION_SCHEMA.lock_info in range-locking mode +--echo # + +connect (con1,localhost,root,,); +connection con1; +eval create table t0 (a int primary key); +begin; +insert into t0 values (1); +connection default; + + +eval +create table t1 ( + pk int, + a int, + primary key (pk) comment '$pk_cf' +) engine=rocksdb; + +insert into t1 values +(10,10),(20,20),(30,30); + +begin; +select * from t1 where pk=10 for update; + +#let TRX1_ID=`(select transaction_id from information_schema.rocksdb_trx where thread_id=connection_id())` ; +let $select_from_is_rowlocks_current_trx_only=1; +--source suite/rocksdb/include/select_from_is_rowlocks.inc + +delete from t1 where pk between 25 and 40; + +--source suite/rocksdb/include/select_from_is_rowlocks.inc + +rollback; +begin; +--echo # The following will show a range lock on 2-9 and also a point lock on 10. +--echo # This is how things currently work. 
(after MDEV-21314, not anymore) +select * from t1 where pk between 2 and 9 for update; +--source suite/rocksdb/include/select_from_is_rowlocks.inc +rollback; + +drop table t1; +connection con1; +rollback; +drop table t0; +connection default; +disconnect con1; + +--echo # +--echo # MDEV-18104: MyRocks-Gap-Lock: range locking bounds are incorrect for multi-part keys +--echo # + +create table t0(a int); +insert into t0 values (0),(1),(2),(3),(4),(5),(6),(7),(8),(9); +eval +create table t1 ( + kp1 int not null, + kp2 int not null, + a int, + primary key(kp1, kp2) comment '$pk_cf' +) engine=rocksdb; + +insert into t1 select 1, a, 1234 from t0; +insert into t1 select 2, a, 1234 from t0; +insert into t1 select 3, a, 1234 from t0; + +connect (con1,localhost,root,,); +connection con1; + +begin; +select * from t1 where kp1=2 for update; + +connection default; +--echo # The lock on kp1=2 should inhibit the following INSERT: +--error ER_LOCK_WAIT_TIMEOUT +insert into t1 values ( 2,5,9999); +rollback; + +connection con1; +rollback; +connection default; +disconnect con1; +drop table t0,t1; + +--echo # +--echo # Test that locks on ranges on non-unique secondary keys inhibit +--echo # modifications of the contents of these ranges +--echo # + +create table t0(a int); +insert into t0 values (0),(1),(2),(3),(4),(5),(6),(7),(8),(9); +eval +create table t1 ( + kp1 int not null, + kp2 int not null, + a int, + key(kp1, kp2) comment '$pk_cf' +) engine=rocksdb; + +insert into t1 select 1, a, 1234 from t0; +insert into t1 values (2, 3, 1234); +insert into t1 values (2, 5, 1234); +insert into t1 values (2, 7, 1234); +insert into t1 select 3, a, 1234 from t0; + +connect (con1,localhost,root,,); +connection con1; +begin; +--replace_column 10 # +explain +select * from t1 where kp1=2 for update; +select * from t1 where kp1=2 for update; + +connection default; +begin; +--error ER_LOCK_WAIT_TIMEOUT +insert into t1 values (2, 9, 9999); + +--error ER_LOCK_WAIT_TIMEOUT +delete from t1 where kp1=2 and 
kp2=5; + +# Update that "moves a row away" from the locked range +--error ER_LOCK_WAIT_TIMEOUT +update t1 set kp1=333 where kp1=2 and kp2=3; + +# Update that "moves a row into" the locked range +--error ER_LOCK_WAIT_TIMEOUT +update t1 set kp1=2 where kp1=1 and kp2=8; + +rollback; + +connection con1; +rollback; +disconnect con1; +connection default; +drop table t0,t1; + +--echo # +--echo # Transaction isolation test +--echo # + +create table t1 (pk int primary key, a int) engine=rocksdb; +insert into t1 values (1,1),(2,2),(3,3); + +connect (con1,localhost,root,,); + +--echo # TRX1: Start, Allocate a snapshot +connection con1; +begin; +select * from t1; + +--echo # TRX2: Make a change that TRX1 will not see +connection default; +update t1 set a=2222 where pk=2; + +--echo # TRX1: Now, make a change that would overwrite TRX2'x change and commit +connection con1; +update t1 set a=a+1 where pk=2; +commit; + +--echo # Examine the result: +--echo # pk=2, a=2223 means UPDATE in TRX1 used "read committed" (InnoDB-like isolation) +--echo # pk=2, a=3 means UPDATE in TRX1 silently overwrote TRX2 +--echo # (and with key tracking, one would get an error on the second UPDATE) +connection default; +select * from t1; + +disconnect con1; +connection default; +drop table t1; + +--echo # +--echo # Same test as above, but check the range scan +--echo # + +eval +create table t1 ( + pk int, + a int, + primary key (pk) comment '$pk_cf' +) engine=rocksdb; +insert into t1 values (1,1),(2,2),(3,3),(4,4),(5,5),(6,6); + +connect (con1,localhost,root,,); + +--echo # TRX1: Start, Allocate a snapshot +connection con1; +begin; +select * from t1; + +--echo # TRX2: Make a change that TRX1 will not see +connection default; +update t1 set a=2222 where pk between 3 and 5; + +--echo # TRX1: Now, make a change that would overwrite TRX2'x change and commit +connection con1; +update t1 set a=a+1 where pk between 3 and 5; +commit; + +--echo # Examine the result: +--echo # pk={3,4,5} a=2223 means UPDATE in 
TRX1 used "read committed" (InnoDB-like isolation) +connection default; +select * from t1; + +disconnect con1; +connection default; +drop table t1; + +--echo # +--echo # Same as above, but test SELECT FOR UPDATE. +--echo # +eval +create table t1 ( + pk int, + a int, + primary key (pk) comment '$pk_cf' +) engine=rocksdb; +insert into t1 values (1,1),(2,2),(3,3),(4,4),(5,5),(6,6); + +connect (con1,localhost,root,,); + +--echo # TRX1: Start, Allocate a snapshot +connection con1; +begin; +select * from t1; + +--echo # TRX2: Make a change that TRX1 will not see +connection default; +update t1 set a=222 where pk=2; +update t1 set a=333 where pk=3; + +--echo # TRX1: Check what select [FOR UPDATE] sees +connection con1; +select * from t1 where pk in (2,3); +select * from t1 where pk=2 for update; +select * from t1 where pk=2 lock in share mode; +select * from t1 where pk=2; + +commit; + +disconnect con1; +connection default; +drop table t1; + +if (!$PK_USES_REVERSE_CF) { +--echo # +--echo # Another no-snapshot-checking test, this time for single-statement +--echo # transaction +--echo # +eval +create table t1 ( + pk int, + a int, + name varchar(16), + primary key(pk) comment '$pk_cf' +) engine=rocksdb; +insert into t1 values (1,1, 'row1'), (2,2,'row2'); + +connect (con1,localhost,root,,); +connection con1; +select get_lock('row1', 100); + +connection default; + +--echo # The following will read the first row (1,1,'row1'), and stop. 
+ +send update t1 set a=a+100 where get_lock(name, 1000)=1; + +# Wait till the default connection has stopped: +connection con1; + +let $wait_condition= + SELECT COUNT(*) = 1 FROM INFORMATION_SCHEMA.PROCESSLIST WHERE STATE = "User lock" + AND INFO = "update t1 set a=a+100 where get_lock(name, 1000)=1"; +--source include/wait_condition.inc + +# Update the second row +update t1 set a=5 where pk=2; + +select release_lock('row1'); + +connection default; +reap; + +--echo # Look at the row with pk=2: +--echo # 2, 105, row2 - means the UPDATE was reading current data (Correct) +--echo # 2, 102, row - means the UPDATE read the snapshot (incorrect) +select * from t1; + +--echo # Try releasing both locks (in 5.6, we will be holding only the second one) +select release_lock(name) from t1; + +disconnect con1; +connection default; +drop table t1; +} + +--echo # +--echo # Check that I_S.processlist.state is set correctly now. +--echo # +eval +create table t1( + pk int, + a int, + primary key(pk) comment '$pk_cf' +) engine=rocksdb; +insert into t1 values (1,1),(2,2),(3,3); + +begin; +select * from t1 where pk=2 for update; + +--connect (con1,localhost,root,,) +begin; +set rocksdb_lock_wait_timeout=300; +send select * from t1 where pk=2 for update; + +connection default; +--echo # Now, will wait until we see con1 have state="Waiting for row lock" +let $wait_condition= + SELECT COUNT(*) = 1 FROM INFORMATION_SCHEMA.PROCESSLIST WHERE STATE = "Waiting for row lock" + AND INFO = "select * from t1 where pk=2 for update"; +--source include/wait_condition.inc + +rollback; +connection con1; +--reap +rollback; + +disconnect con1; +connection default; +drop table t1; + +--echo # +--echo # Test range locking for ranges with HA_READ_PREFIX_LAST +--echo # +create table t0(a int) engine=rocksdb; +insert into t0 values (0),(1),(2),(3),(4),(5),(6),(7),(8),(9); + +eval +create table t1 ( + pk1 int, + pk2 int, + a int, + primary key(pk1, pk2) comment '$pk_cf' +) engine=rocksdb; + +insert into t1 
+select + A.a, B.a, A.a*10+B.a +from + t0 A, t0 B; + + +# Get a lock in another connection so that the primary transaction is not using +# STO optimization, and its locks can be seen in I_S.rocksdb_locks +--connect (con1,localhost,root,,) +connection con1; +begin; +insert into t1 values (0x1112222,0x1112222,0); + +connection default; +begin; +--echo # Should use ref access w/o filesort: +--replace_column 10 # +explain +select * from t1 +where pk1=3 +order by pk1 desc, pk2 desc +for update; + +select * from t1 +where pk1=3 +order by pk1 desc, pk2 desc +for update; + +let $select_from_is_rowlocks_current_trx_only=1; +--source suite/rocksdb/include/select_from_is_rowlocks.inc +rollback; + +--echo # +--echo # Test range locking for ranges with HA_READ_PREFIX_LAST_OR_PREV +--echo # + +begin; +--echo # Should use range access with 2 keyparts and w/o filesort: +--replace_column 10 # +explain +select * from t1 +where pk1=4 and pk2 between 5 and 8 +order by pk1 desc, pk2 desc +for update; + +select * from t1 +where pk1=4 and pk2 between 5 and 8 +order by pk1 desc, pk2 desc +for update; + +let $select_from_is_rowlocks_current_trx_only=1; +--source suite/rocksdb/include/select_from_is_rowlocks.inc +rollback; + +connection con1; +rollback; + +connection default; +drop table t0, t1; + +--echo # +--echo # A bug: range locking was not used when scan started at table start or end +--echo # +create table t0(a int); +insert into t0 values (0),(1),(2),(3),(4),(5),(6),(7),(8),(9); +create table t10(a int); +insert into t10 select A.a + B.a* 10 + C.a * 100 from t0 A, t0 B, t0 C; + +create table t1 ( + pk int not null, + a int, + primary key(pk) +) engine=rocksdb; + +insert into t1 select a*2,a*2 from t10; + +connection con1; +begin; +select * from t1 where pk=500 for update; +connection default; + +begin; +select * from t1 where pk<10 order by pk limit 10 for update; + +let $select_from_is_rowlocks_current_trx_only=1; +--source suite/rocksdb/include/select_from_is_rowlocks.inc 
+rollback; + +begin; +select * from t1 where pk>1990 order by pk desc limit 10 for update; +let $select_from_is_rowlocks_current_trx_only=1; +--source suite/rocksdb/include/select_from_is_rowlocks.inc +rollback; + +connection con1; +rollback; +disconnect con1; + +connection default; +drop table t0,t10,t1; + +--echo # +--echo # Range locking and READ-COMMITTED isolation level +--echo # +connect (con1,localhost,root,,); +connection con1; +set session transaction isolation level read committed; +create table t1 ( + pk int not null, + a int, + primary key(pk) +) engine=rocksdb; + +insert into t1(pk) values (0),(1),(2),(3),(4),(5),(6),(7),(8),(9); + +begin; +select * from t1 where pk between 2 and 5 for update; +let $select_from_is_rowlocks_current_trx_only=1; +--echo # Below should show individual row locks, not locked range: +--source suite/rocksdb/include/select_from_is_rowlocks.inc +rollback; + +begin; +update t1 set a=a+1 where pk between 2 and 5; +let $select_from_is_rowlocks_current_trx_only=1; +--echo # Below should show individual row locks, not locked range: +--source suite/rocksdb/include/select_from_is_rowlocks.inc +rollback; + +drop table t1; +disconnect con1; +connection default; + +--echo # +--echo # Range Locking and READ-COMMITTED, another test +--echo # +create table t1 ( + pk int, + a int, + b int, + primary key (pk), + key(a) +) engine=rocksdb; + +insert into t1 values +(1, 100, 1000), +(2, 200, 2000), +(3, 300, 3000); + +set transaction isolation level repeatable read; +begin; +update t1 set b = b + 1 where a > 200; + +connect (con1,localhost,root,,); +connection con1; +set transaction isolation level read committed; +begin; +insert into t1 values (4, 150, 1500); +--error ER_LOCK_WAIT_TIMEOUT +insert into t1 values (5, 250, 1500); + +rollback; + +disconnect con1; +connection default; +rollback; +drop table t1; diff --git a/mysql-test/suite/rocksdb/t/range_locking.test b/mysql-test/suite/rocksdb/t/range_locking.test new file mode 100644 index 
000000000000..5c599238a0a7 --- /dev/null +++ b/mysql-test/suite/rocksdb/t/range_locking.test @@ -0,0 +1,6 @@ + +--let pk_cf=default +--let sk_cf=default + +--source range_locking.inc + diff --git a/mysql-test/suite/rocksdb/t/range_locking_conc_test-master.opt b/mysql-test/suite/rocksdb/t/range_locking_conc_test-master.opt new file mode 100644 index 000000000000..b20b064d5af7 --- /dev/null +++ b/mysql-test/suite/rocksdb/t/range_locking_conc_test-master.opt @@ -0,0 +1 @@ +--rocksdb_deadlock_detect=1 --rocksdb_lock_wait_timeout=10 diff --git a/mysql-test/suite/rocksdb/t/range_locking_conc_test.py b/mysql-test/suite/rocksdb/t/range_locking_conc_test.py new file mode 100644 index 000000000000..d048f8823834 --- /dev/null +++ b/mysql-test/suite/rocksdb/t/range_locking_conc_test.py @@ -0,0 +1,520 @@ +""" +This script tests Range Locking. See + mysql-test/suite/rocksdb/t/range_locking_conc_test.txt for details. + +Usage: + + python3 suite/rocksdb/t/range_locking_conc_test.py \ + [--progress] [--verbose] \ + root 127.0.0.1 $MASTER_MYPORT test t1 \ + num_inserts num_insert_threads \ + num_group_ops num_group_threads + + + For Example: + + time python3 suite/rocksdb/t/range_locking_conc_test.py --progress root 127.0.0.1 3314 test t1 2000 20 10000 40 + +""" + +import hashlib +import MySQLdb +from MySQLdb.constants import ER +import os +import random +import signal +import sys +import threading +import time +import string +import traceback + +# MAX_PK_VAL = 1000*1000*1000 +MAX_PK_VAL = 1000 * 1000 + +show_progress = False +verbose_output = False + +counter_n_inserted = 0 +counter_n_deleted = 0 +counter_n_insert_failed = 0 +counter_n_groups_created = 0 +counter_n_group_create_fails = 0 +counter_n_groups_verified = 0 +counter_n_groups_deleted = 0 + + +def is_lock_error(exc): + error_code = exc.args[0] + return error_code == ER.LOCK_WAIT_TIMEOUT or error_code == ER.LOCK_DEADLOCK + + +# +# Watcher prints the test progress and some status variables once per second +# +class 
Watcher(threading.Thread): + Instance = None + + def __init__(self, con): + threading.Thread.__init__(self) + self.should_stop = False + self.finished = False + self.con = con + self.start() + + def run(self): + global counter_n_inserted + global counter_n_deleted + global counter_n_insert_failed + global counter_n_groups_created + global counter_n_group_create_fails + global counter_n_groups_verified + global counter_n_groups_deleted + event = threading.Event() + + save_counter_n_inserted = 0 + save_counter_n_deleted = 0 + save_counter_n_insert_failed = 0 + save_counter_n_groups_created = 0 + save_counter_n_group_create_fails = 0 + save_counter_n_groups_verified = 0 + save_counter_n_groups_deleted = 0 + save_wait_count = 0 + n = 0 + cur = self.con.cursor() + try: + while not self.should_stop: + event.wait(1) + + cur.execute("show status like '%rocksdb_locktree_wait_count%'") + row = cur.fetchone() + wait_count = int(row[1]) + + print("== %d ========================" % n) + print( + "counter_n_inserted=%d" + % (counter_n_inserted - save_counter_n_inserted) + ) + print( + "counter_n_deleted=%d" + % (counter_n_deleted - save_counter_n_deleted) + ) + print( + "counter_n_insert_failed=%d" + % (counter_n_insert_failed - save_counter_n_insert_failed) + ) + print( + "counter_n_groups_created=%d" + % (counter_n_groups_created - save_counter_n_groups_created) + ) + print( + "counter_n_group_create_fails=%d" + % (counter_n_group_create_fails - save_counter_n_group_create_fails) + ) + print( + "counter_n_groups_verified=%d" + % (counter_n_groups_verified - save_counter_n_groups_verified) + ) + print( + "counter_n_groups_deleted=%d" + % (counter_n_groups_deleted - save_counter_n_groups_deleted) + ) + print("wait_count=%d" % (wait_count - save_wait_count)) + + save_counter_n_inserted = counter_n_inserted + save_counter_n_deleted = counter_n_deleted + save_counter_n_insert_failed = counter_n_insert_failed + save_counter_n_groups_created = counter_n_groups_created + 
save_counter_n_group_create_fails = counter_n_group_create_fails + save_counter_n_groups_verified = counter_n_groups_verified + save_counter_n_groups_deleted = counter_n_groups_deleted + save_wait_count = wait_count + n += 1 + + except Exception as e: + self.exception = traceback.format_exc() + print("Watcher caught (%s)" % (e)) + + finally: + self.finish() + + def finish(self): + n = 0 + # Do nothing + + +# +# A worker is one client thread producing the benchmark workload +# +class Worker(threading.Thread): + Instance = None + + def __init__(self, con, table_name, worker_type_arg, num_inserts): + threading.Thread.__init__(self) + self.finished = False + self.num_inserts = num_inserts + con.autocommit(False) + self.con = con + self.rand = random.Random() + self.exception = None + self.table_name = table_name + self.worker_type = worker_type_arg + Worker.Instance = self + self.start() + + def run(self): + self.rand.seed(threading.get_ident()) + my_id = threading.get_ident() + try: + self.cur = self.con.cursor() + if self.worker_type == "insert": + self.run_inserts() + if self.worker_type == "join_group": + self.run_create_groups() + except Exception as e: + print(e) + self.exception = traceback.format_exc() + print("THR %d caught (%s)" % (my_id, e)) + + finally: + self.finish() + + # + # Insert one row, making sure this doesn't break any groups + # + def run_one_insert(self): + global counter_n_inserted + global counter_n_deleted + global counter_n_insert_failed + cur = self.cur + # pk = self.rand.randint(1, MAX_PK_VAL) + # Note: here the distribution is intentionally 2x wider than the grouping + # thread has. 
+ pk = int(self.rand.normalvariate(MAX_PK_VAL / 2, MAX_PK_VAL / 50.0)) + + cur.execute("begin") + do_commit = False + cur.execute( + "select pk,parent_id,group_list from t1 where pk>=%s limit 1 for update", + (pk,), + ) + row = cur.fetchone() + group_to_delete = None + if row is None: + # print("No row found, inserting %d" % (pk+1000*1000)) + cur.execute("insert into t1 (pk) values(%s)", (pk + 1000 * 1000,)) + do_commit = True + else: + if (row[0] - pk) > 2 and row[1] is None: + # print("Row found, inserting into gap, %d" % pk) + cur.execute("insert into t1 (pk) values(%s)", (pk,)) + do_commit = True + else: + # print("Row found, grouped or too tight") + if row[2]: + # if parent_id is present, use it + group_to_delete = row[0] + # print("About to delete %d" % group_to_delete) + do_commit = False + + if do_commit: + cur.execute("commit") + counter_n_inserted += 1 + return 1 + else: + counter_n_insert_failed += 1 + if group_to_delete: + counter_n_deleted += 5 + self.delete_group(group_to_delete, True) + cur.execute("commit") + else: + cur.execute("rollback") + return 0 + + def run_one_create_group(self): + global counter_n_groups_created + global counter_n_group_create_fails + global counter_n_groups_deleted + cur = self.cur + # pk = self.rand.randint(1, MAX_PK_VAL) + pk = int(self.rand.normalvariate(MAX_PK_VAL / 2, MAX_PK_VAL / 100)) + + n_rows = 0 + n_groups_deleted = 0 + first_pk = None + cur.execute( + "select pk,parent_id,group_list from t1 where pk>=%s limit 5 for update", + (pk,), + ) + row = cur.fetchone() + while row is not None: + if first_pk is None: + first_pk = row[0] + group_list = str(first_pk) + else: + group_list = group_list + "," + str(row[0]) + + last_pk = row[0] + if row[1] is not None: + # Found a row in a group + # Continue until group end. 
+ found_next_group = False + row = cur.fetchone() + while row is not None: + if row[1] is None: + found_next_group = True + first_pk = row[0] + group_list = str(first_pk) + break + row = cur.fetchone() + + if not found_next_group: + break + + if row[2] is not None: + # Found a group leader row. + ungrouped_ids = self.delete_group(row[0], False) + n_groups_deleted += 1 + i = 1 + n_rows += 1 + while n_rows < 5: + group_list = group_list + "," + str(ungrouped_ids[i]) + last_pk = ungrouped_ids[i] + i += 1 + n_rows += 1 + break + n_rows += 1 + row = cur.fetchone() + + if n_rows == 5 or n_groups_deleted > 0: + # Ok we got 5 rows in a row and they are all standalone + # Create a group. + # print("Creating group %d" % first_pk) + cur.execute( + "update t1 set group_list=%s where pk=%s", + ( + group_list, + first_pk, + ), + ) + cur.execute( + "update t1 set parent_id=%s where pk > %s and pk <=%s", + (first_pk, first_pk, last_pk), + ) + cur.execute("commit") + counter_n_groups_created += 1 + counter_n_groups_deleted += n_groups_deleted + return 1 + else: + # print("Failed to join a group") + counter_n_group_create_fails += 1 + cur.execute("rollback") + return 0 + + # + # Verify and delete the group + # @return An array listing the deleted PKs + # + def delete_group(self, group_id, delete_rows): + global counter_n_groups_verified + cur = self.con.cursor() + cur.execute( + "select pk,parent_id,group_list from t1 where pk>=%s limit 5 for update", + (group_id,), + ) + first_pk = None + n_rows = 0 + + row = cur.fetchone() + while row is not None: + if first_pk is None: + first_pk = row[0] + group_list = str(first_pk) + group_arr = [] + group_arr.append(first_pk) + group_list_base = row[2] + if first_pk != group_id: + self.raise_error("First row is not the group leader!") + else: + group_list = group_list + "," + str(row[0]) + group_arr.append(row[0]) + + last_pk = row[0] + if row[0] != first_pk and row[1] != first_pk: + self.raise_error( + "Row in group has wrong parent_id 
(expect %s got %s)" + % (first_pk, row[1]) + ) + break + if row[0] != first_pk and row[2] is not None: + self.raise_error("Row in group is a group leader?") + break + n_rows += 1 + row = cur.fetchone() + + if n_rows != 5: + self.raise_error( + "Expected %d rows got %d" + % ( + 5, + n_rows, + ) + ) + if group_list != group_list_base: + self.raise_error( + "Group contents mismatch: expected '%s' got '%s'" + % (group_list_base, group_list) + ) + # Ok, everything seems to be in order. + if delete_rows: + cur.execute( + "delete from t1 where pk>=%s and pk<=%s", + ( + group_id, + last_pk, + ), + ) + else: + cur.execute( + "update t1 set parent_id=NULL, group_list=NULL where pk>=%s and pk<=%s", + ( + group_id, + last_pk, + ), + ) + + counter_n_groups_verified += 1 + return group_arr + + def raise_error(self, msg): + print("Data corruption detected: " + msg) + sys.exit("Failed!") + + def run_inserts(self): + # print("Worker.run_inserts") + i = 0 + while i < self.num_inserts: + try: + i += self.run_one_insert() + except MySQLdb.OperationalError as e: + self.con.rollback() + cur = self.con.cursor() + if not is_lock_error(e): + raise e + + def run_create_groups(self): + # print("Worker.run_create_groups") + i = 0 + while i < self.num_inserts: + try: + i += self.run_one_create_group() + except MySQLdb.OperationalError as e: + self.con.rollback() + cur = self.con.cursor() + if not is_lock_error(e): + raise e + + def finish(self): + self.finished = True + + +if __name__ == "__main__": + if len(sys.argv) != 10 and len(sys.argv) != 11: + print( + "Usage: range_locking_conc_test.py " + "[--progress] " + "user host port db_name table_name " + "num_inserts num_insert_threads " + "num_grp_ops num_group_threads" + ) + sys.exit(1) + i = 1 + if sys.argv[i] == "--progress": + show_progress = True + i += 1 + + if sys.argv[i] == "--verbose": + verbose_output = True + i += 1 + + user = sys.argv[i] + i += 1 + + host = sys.argv[i] + i += 1 + + port = int(sys.argv[i]) + i += 1 + + db = 
sys.argv[i] + i += 1 + + table_name = sys.argv[i] + i += 1 + + num_inserts = int(sys.argv[i]) + i += 1 + + num_insert_workers = int(sys.argv[i]) + i += 1 + + num_group_ops = int(sys.argv[i]) + i += 1 + + num_group_workers = int(sys.argv[i]) + i += 1 + + con = MySQLdb.connect(user=user, host=host, port=port, db=db) + con.cursor().execute("set global rocksdb_lock_wait_timeout=20") + con.cursor().execute("drop table if exists t1") + con.cursor().execute( + "create table t1 ( " + " pk bigint primary key, " + " group_list varchar(128), " + " parent_id bigint " + ") engine=rocksdb;" + ) + + worker_failed = False + workers = [] + worker_type = "insert" + num_loops = num_inserts + for i in range(num_insert_workers + num_group_workers): + worker = Worker( + MySQLdb.connect(user=user, host=host, port=port, db=db), + table_name, + worker_type, + num_loops, + ) + workers.append(worker) + if i == num_insert_workers - 1: + worker_type = "join_group" + num_loops = num_group_ops + + # A watcher thread to print the statistics periodically + if show_progress: + watcher = Watcher(MySQLdb.connect(user=user, host=host, port=port, db=db)) + + for w in workers: + w.join() + if w.exception: + print("Worker hit an exception:\n%s\n" % w.exception) + worker_failed = True + + # Stop the watcher + if show_progress: + watcher.should_stop = True + watcher.join() + + if verbose_output: + print("\n") + print("rows_inserted: %d" % counter_n_inserted) + print("rows_deleted: %d" % counter_n_deleted) + print("rows_insert_failed: %d" % counter_n_insert_failed) + print("groups_created: %d" % counter_n_groups_created) + print("groups_verified: %d" % counter_n_groups_verified) + print("groups_deleted: %d" % counter_n_groups_deleted) + print("group_create_fails: %d" % counter_n_group_create_fails) + + if worker_failed: + sys.exit(1) diff --git a/mysql-test/suite/rocksdb/t/range_locking_conc_test.test b/mysql-test/suite/rocksdb/t/range_locking_conc_test.test new file mode 100644 index 
000000000000..0b0d4681bdb8 --- /dev/null +++ b/mysql-test/suite/rocksdb/t/range_locking_conc_test.test @@ -0,0 +1,18 @@ + +# +# Test Range Locking behavior. See +# ./suite/rocksdb/t/range_locking_conc_test.txt for details +# + +--source include/have_rocksdb.inc +--source suite/rocksdb/include/have_range_locking.inc + +set @save_rlwt=@@rocksdb_lock_wait_timeout; +let $SERVER_PORT=`select @@port`; +let $exec = /usr/bin/python3 suite/rocksdb/t/range_locking_conc_test.py root 127.0.0.1 $SERVER_PORT test t1 2000 20 30000 10; + +--echo # Run range_locking_conc_test.py +exec $exec; + +set global rocksdb_lock_wait_timeout= @save_rlwt; +DROP TABLE t1; diff --git a/mysql-test/suite/rocksdb/t/range_locking_conc_test.txt b/mysql-test/suite/rocksdb/t/range_locking_conc_test.txt new file mode 100644 index 000000000000..f10bfe33925f --- /dev/null +++ b/mysql-test/suite/rocksdb/t/range_locking_conc_test.txt @@ -0,0 +1,91 @@ + + +A concurrent test for Range Locking code + +== Requirements == + +The idea is to have a concurrent workload, where +- Some clients make modifications to the database that use range locking (let's + denote these RL-UPDATES) + +- There is some concurrent data modification activity, which is likely to + make RL-UPDATES corrupt the data if transaction isolation does not perform + proper range locking + +- There is a way to detect this data corruption. First suggestion: have some + invariants that must remain true regardless of any action done by + RL-UPDATES. We can run the workload, then stop it and verify the invariants + still hold. + +== Basic idea == + +Rows and groups. + +Consider a table: + +create table t1 ( + pk bigint primary key, + group_list varchar(128), + parent_id bigint +) engine=rocksdb; + + +We allow the following to be stored: + +1. Individual rows. An individual row has group_list=NULL, parent_id=NULL. + +2. Groups. + +A group is 5 (potentially could be some other number) of rows with adjacent +PK values. 
+ +The row with the smallest PK value is the "group leader" and has +group_list=(comma-separated-list-of-pks-of-group members). + +The rest of the rows are "followers" and have parent_id=$GROUP_LEADER.pk + +Example of a group: + +mysql> select * from t1 where pk>=720418192 order by pk limit 5; ++-----------+---------------------------------------------------+-----------+ +| pk | group_list | parent_id | ++-----------+---------------------------------------------------+-----------+ +| 720418192 | 720418192,721972360,730798130,741595383,742883456 | NULL | +| 721972360 | NULL | 720418192 | +| 730798130 | NULL | 720418192 | +| 741595383 | NULL | 720418192 | +| 742883456 | NULL | 720418192 | ++-----------+---------------------------------------------------+-----------+ +5 rows in set (0.01 sec) + + +Operations: +- Insert an individual row. It is obvious we may not insert a row into a group. + +- Convert 5 individual rows into a group. One needs range locking to prevent + other threads from deleting these rows or putting them into another group. + +- Disband a group (convert it back into 5 individual rows). + When we are disbanding a group, we verify it to be valid. + +- Delete a group. If we attempt to insert a row and hit a group leader, we + don't insert the row and delete the whole group we've hit, instead. (when + deleting the group, we also verify it to be valid) + This provides some balance between inserts and deletes. + +=== Making sure lock contention happens === + +We use normal distribution (rand.normalvariate) to pick random PK values, +which are then used to make an attempt to insert a row or create a group. + +This creates a much greater contention than the uniform distribution. + +With sufficiently small sigma parameter, the contention seems to be +sufficiently high. + +=== Implementation === + +range_locking_conc_test.py implements the above operations. 
+ + + diff --git a/mysql-test/suite/rocksdb/t/range_locking_deadlock_tracking.test b/mysql-test/suite/rocksdb/t/range_locking_deadlock_tracking.test new file mode 100644 index 000000000000..2a5966b65c3a --- /dev/null +++ b/mysql-test/suite/rocksdb/t/range_locking_deadlock_tracking.test @@ -0,0 +1,196 @@ +--source suite/rocksdb/include/have_range_locking.inc + +# +# This is deadlock_tracking.test, variant for running with Range Locking: +# - Deadlock #5 is disabled, it requires LOCK IN SHARE MODE tests +# - In the result file, SHOW ENGINE ROCKSDB TRANSACTION STATUS does not print +# deadlock information. +# +set @prior_lock_wait_timeout = @@rocksdb_lock_wait_timeout; +set @prior_deadlock_detect = @@rocksdb_deadlock_detect; +set @prior_max_latest_deadlocks = @@rocksdb_max_latest_deadlocks; +set global rocksdb_deadlock_detect = on; +set global rocksdb_lock_wait_timeout = 10000; +--echo # Clears deadlock buffer of any prior deadlocks. +set global rocksdb_max_latest_deadlocks = 0; +set global rocksdb_max_latest_deadlocks = @prior_max_latest_deadlocks; +let $engine = rocksdb; + +--source include/count_sessions.inc +connect (con1,localhost,root,,); +let $con1= `SELECT CONNECTION_ID()`; + +connect (con2,localhost,root,,); +let $con2= `SELECT CONNECTION_ID()`; + +connect (con3,localhost,root,,); +let $con3= `SELECT CONNECTION_ID()`; + +connection default; +eval create table t (i int primary key) engine=$engine; +insert into t values (1), (2), (3); +--replace_regex /[0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}:[0-9]{2}/TIMESTAMP/ /WAITING KEY: [0-9a-f]{16}/KEY/ /TRANSACTION ID: [0-9]*/TXN_ID/ /TIMESTAMP: [0-9]*/TSTAMP/ +show engine rocksdb transaction status; + +echo Deadlock #1; +--source include/simple_deadlock.inc +connection default; +--replace_regex /[0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}:[0-9]{2}/TIMESTAMP/ /WAITING KEY: [0-9a-f]{16}/KEY/ /TRANSACTION ID: [0-9]*/TXN_ID/ /TIMESTAMP: [0-9]*/TSTAMP/ +show engine rocksdb transaction status; + +echo Deadlock #2; 
+--source include/simple_deadlock.inc +connection default; +--replace_regex /[0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}:[0-9]{2}/TIMESTAMP/ /WAITING KEY: [0-9a-f]{16}/KEY/ /TRANSACTION ID: [0-9]*/TXN_ID/ /TIMESTAMP: [0-9]*/TSTAMP/ +show engine rocksdb transaction status; +set global rocksdb_max_latest_deadlocks = 10; + +echo Deadlock #3; +--source include/simple_deadlock.inc +connection default; +--replace_regex /[0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}:[0-9]{2}/TIMESTAMP/ /WAITING KEY: [0-9a-f]{16}/KEY/ /TRANSACTION ID: [0-9]*/TXN_ID/ /TIMESTAMP: [0-9]*/TSTAMP/ +show engine rocksdb transaction status; +set global rocksdb_max_latest_deadlocks = 1; +--replace_regex /[0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}:[0-9]{2}/TIMESTAMP/ /WAITING KEY: [0-9a-f]{16}/KEY/ /TRANSACTION ID: [0-9]*/TXN_ID/ /TIMESTAMP: [0-9]*/TSTAMP/ +show engine rocksdb transaction status; + +connection con3; +set rocksdb_deadlock_detect_depth = 2; + +--echo # Range locking code will report deadlocks, because it doesn't honor +--echo # rocksdb_deadlock_detect_depth: +echo Deadlock #4; +connection con1; +begin; +select * from t where i=1 for update; + +connection con2; +begin; +select * from t where i=2 for update; + +connection con3; +begin; +select * from t where i=3 for update; + +connection con1; +send select * from t where i=2 for update; + +connection con2; +let $wait_condition = select count(*) = 1 from information_schema.rocksdb_trx +where thread_id = $con1 and waiting_key != ""; +--source include/wait_condition.inc + +send select * from t where i=3 for update; + +connection con3; +let $wait_condition = select count(*) = 1 from information_schema.rocksdb_trx +where thread_id = $con2 and waiting_key != ""; +--source include/wait_condition.inc + +select variable_value into @a from performance_schema.global_status where variable_name='rocksdb_row_lock_deadlocks'; +--error ER_LOCK_DEADLOCK +select * from t where i=1 for update; +select case when variable_value-@a = 1 then 'true' else 
'false' end as deadlocks from performance_schema.global_status where variable_name='rocksdb_row_lock_deadlocks'; +rollback; + +connection con2; +reap; +rollback; + +connection con1; +reap; +rollback; + +connection default; +set global rocksdb_max_latest_deadlocks = 5; +--replace_regex /[0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}:[0-9]{2}/TIMESTAMP/ /WAITING KEY: [0-9a-f]{16}/KEY/ /TRANSACTION ID: [0-9]*/TXN_ID/ /TIMESTAMP: [0-9]*/TSTAMP/ +show engine rocksdb transaction status; + +--disable_testcase BUG#0000 +echo Deadlock #5; +connection con1; +begin; +select * from t where i=1 for update; + +connection con2; +begin; +select * from t where i=2 for update; + +connection con3; +begin; +select * from t where i=3 lock in share mode; + +connection con1; +select * from t where i=100 for update; +select * from t where i=101 for update; +send select * from t where i=2 for update; + +connection con2; +let $wait_condition = select count(*) = 1 from information_schema.rocksdb_trx +where thread_id = $con1 and waiting_key != ""; +--source include/wait_condition.inc + +select * from t where i=3 lock in share mode; +select * from t where i=200 for update; +select * from t where i=201 for update; + +--error ER_LOCK_DEADLOCK +select * from t where i=1 lock in share mode; +rollback; + +connection con1; +reap; +rollback; + +connection con3; +rollback; + +connection default; +--replace_regex /[0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}:[0-9]{2}/TIMESTAMP/ /WAITING KEY: [0-9a-f]{16}/KEY/ /TRANSACTION ID: [0-9]*/TXN_ID/ /TIMESTAMP: [0-9]*/TSTAMP/ +show engine rocksdb transaction status; +--enable_testcase +echo Deadlock #6; +connection con1; +create table t1 (id int primary key, value int) engine=rocksdb; +insert into t1 values (1,1),(2,2),(3,3),(4,4),(5,5); +begin; +update t1 set value=value+100 where id=1; +update t1 set value=value+100 where id=2; + +connection con2; +begin; +update t1 set value=value+200 where id=3; + +connection con1; +send update t1 set value=value+100 where 
id=3; + +connection con2; +let $wait_condition = select count(*) = 1 from information_schema.rocksdb_trx +where thread_id = $con1 and waiting_key != ""; +--source include/wait_condition.inc +--error ER_LOCK_DEADLOCK +update t1 set value=value+200 where id=1; + +# con2 tx is automatically rolled back +connection con1; +reap; +select * from t1; +drop table t1; + +connection default; + +disconnect con1; +disconnect con2; +disconnect con3; + +set global rocksdb_lock_wait_timeout = @prior_lock_wait_timeout; +set global rocksdb_deadlock_detect = @prior_deadlock_detect; +drop table t; +--replace_regex /[0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}:[0-9]{2}/TIMESTAMP/ /WAITING KEY: [0-9a-f]{16}/KEY/ /TRANSACTION ID: [0-9]*/TXN_ID/ /INDEX_ID: [0-9a-f]*/IDX_ID/ /TIMESTAMP: [0-9]*/TSTAMP/ +show engine rocksdb transaction status; +set global rocksdb_max_latest_deadlocks = 0; +--echo # Clears deadlock buffer of any existent deadlocks. +set global rocksdb_max_latest_deadlocks = @prior_max_latest_deadlocks; +--replace_regex /[0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}:[0-9]{2}/TIMESTAMP/ /WAITING KEY: [0-9a-f]{16}/KEY/ /TRANSACTION ID: [0-9]*/TXN_ID/ /INDEX_ID: [0-9a-f]*/IDX_ID/ /TIMESTAMP: [0-9]*/TSTAMP/ +show engine rocksdb transaction status; +--source include/wait_until_count_sessions.inc diff --git a/mysql-test/suite/rocksdb/t/range_locking_escalation-master.opt b/mysql-test/suite/rocksdb/t/range_locking_escalation-master.opt new file mode 100644 index 000000000000..d0087e2a77b0 --- /dev/null +++ b/mysql-test/suite/rocksdb/t/range_locking_escalation-master.opt @@ -0,0 +1 @@ +--rocksdb_use_range_locking=1 --rocksdb_max_lock_memory=1024 diff --git a/mysql-test/suite/rocksdb/t/range_locking_escalation.test b/mysql-test/suite/rocksdb/t/range_locking_escalation.test new file mode 100644 index 000000000000..5a6e9fa6616f --- /dev/null +++ b/mysql-test/suite/rocksdb/t/range_locking_escalation.test @@ -0,0 +1,39 @@ +# +# Range Locking - Lock Escalation Tests. 
+# + +--source include/have_rocksdb.inc +--source suite/rocksdb/include/have_range_locking.inc +--enable_connect_log + + +show variables like 'rocksdb_use_range_locking'; +show variables like 'rocksdb_max_lock_memory'; +show status like 'rocksdb_locktree_escalation_count'; +create table t0(a int); +insert into t0 values (0),(1),(2),(3),(4),(5),(6),(7),(8),(9); + +create table t1 ( + pk int primary key, + a int +) engine=rocksdb; + +#begin; +#insert into t1 values (1000111,100011); +#connect (con1,localhost,root,,); +#connection con1; + +insert into t1 +select + A.a + B.a*10 + C.a*100 + D.a*1000, + 12345 +from t0 A, t0 B, t0 C, t0 D; + +select count(*) from t1; + +#connection default; +#disconnect con1; +show status like 'rocksdb_locktree_escalation_count'; + +drop table t0,t1; + diff --git a/mysql-test/suite/rocksdb/t/range_locking_partial_index.test b/mysql-test/suite/rocksdb/t/range_locking_partial_index.test new file mode 100644 index 000000000000..ce49737b2f61 --- /dev/null +++ b/mysql-test/suite/rocksdb/t/range_locking_partial_index.test @@ -0,0 +1,120 @@ +# +# Range Locking and Partial Indexes +# + +--source include/have_rocksdb.inc +--source suite/rocksdb/include/have_range_locking.inc +--source include/have_debug_sync.inc +--enable_connect_log + +create table t0(a int primary key) engine=rocksdb; +insert into t0 values (0),(1),(2),(3),(4),(5),(6),(7),(8),(9); + +create table t1 ( + pk1 int, + pk2 int, + a int not null, + b int, + primary key (pk1, pk2), + key key1(pk1, a) comment 'partial_group_keyparts=1;partial_group_threshold=5' +) engine=rocksdb; + +# pk1=1 IS materialized prefix (N=10) +insert into t1 select + 1, + A.a, + 100 + A.a, + 123456 +from t0 A; +# Run a SELECT so that it is actually materialized: +select * from t1 force index (key1) where pk1=1; + +# pk1=2 IS NOT materialized (N=3) +insert into t1 select + 2, + A.a, + 100 + A.a, + 123456 +from t0 A limit 3; + +# and some more rows +insert into t1 select + 10000 + A.a +10 *B.a +100*C.a, + A.a, + 
100 + A.a, + 123456 +from t0 A, t0 B, t0 C; + +create table t3(pk int primary key); + +connect (con2,localhost,root,,); +connection con2; +begin; +insert into t3 values(3333333); +connection default; + +--echo # +--echo # First, test a query with range lock +--echo # + +--replace_column 10 # +explain +select * from t1 force index (key1) where pk1>=1 and pk1<=10; + +connect (con1,localhost,root,,); +connection con1; +begin; +--echo # Allocate a snapshot +select * from t0 where a=3; + +connection default; +--echo # Make some modifications not visible in the snapshot +insert into t1 values (1,11, 99999, 99999); +insert into t1 values (2,11, 99999, 99999); + +connection con1; +--echo # This doesn't see the modifications +select * from t1 force index (key1) where pk1>=1 and pk1<=10; +--echo # This DOES see the modifications +select * from t1 force index (key1) where pk1>=1 and pk1<=10 for update; + +let $order_by_rowkey=1; +let $SECOND_INDEX_NAME=key1; +let $select_from_is_rowlocks_current_trx_only=1; +--source suite/rocksdb/include/select_from_is_rowlocks.inc + +rollback; + +--echo # +--echo # Now, test a query with LockingIterator +--echo # +delete from t1 where b=99999; + +begin; +--echo # Allocate a snapshot +select * from t0 where a=3; + +connection default; +--echo # Make some modifications not visible in the snapshot +insert into t1 values (1,11, 99999, 99999); +insert into t1 values (2,11, 99999, 99999); + +connection con1; +--echo # This doesn't see the modifications: +select * from t1 force index (key1) where pk1>=1 order by pk1 limit 15; +--echo # This DOES see the modifications: +select * from t1 force index (key1) where pk1>=1 order by pk1 limit 15 for update; + +let $order_by_rowkey=1; +let $SECOND_INDEX_NAME=key1; +let $select_from_is_rowlocks_current_trx_only=1; +--source suite/rocksdb/include/select_from_is_rowlocks.inc + +rollback; + +disconnect con1; +connection default; + +disconnect con2; + +drop table t0, t1,t3; diff --git 
a/mysql-test/suite/rocksdb/t/range_locking_refresh_iter.test b/mysql-test/suite/rocksdb/t/range_locking_refresh_iter.test new file mode 100644 index 000000000000..9bbb1b9b3927 --- /dev/null +++ b/mysql-test/suite/rocksdb/t/range_locking_refresh_iter.test @@ -0,0 +1,70 @@ +--source include/have_rocksdb.inc +--source suite/rocksdb/include/have_range_locking.inc +--source include/have_debug_sync.inc + +select @@rocksdb_use_range_locking; + +--disable_warnings +set debug_sync='RESET'; +--enable_warnings +# +# Testcase for iterator snapshot refresh +# +create table ten(a int primary key); +insert into ten values (0),(1),(2),(3),(4),(5),(6),(7),(8),(9); + +create table one_k(a int primary key); +insert into one_k select A.a + B.a* 10 + C.a * 100 from ten A, ten B, ten C; + +create table t1 ( + pk int primary key, + a int +) engine=rocksdb; + +insert into t1 select a,a from ten; +insert into t1 select a+40, a+40 from ten; +insert into t1 select a+100, a+100 from one_k; +delete from t1 where pk=44; +set global rocksdb_force_flush_memtable_and_lzero_now=1; + +# Ok, now the table has these PK ranges: +# 0..9 40..49 100...1000 +# and all rows have pk=a +connect (con1,localhost,root,,); +connect (con2,localhost,root,,); + +connection con1; +begin; +set debug_sync='rocksdb.check_flags_iri SIGNAL con1_stopped WAIT_FOR con1_cont'; +send +update t1 set a=a+100 where pk < 3 or pk between 10 and 50; + +# The query is now stuck at the start of the second range. 
+ + +## con2> +connection con2; +set debug_sync='now WAIT_FOR con1_stopped'; + +# Make some changes to check if the iterator is reading current data or +# snapshot +insert into t1 values (44,5000); +delete from t1 where pk= 42; +update t1 set a=5000 where pk between 40 and 45; +set global rocksdb_force_flush_memtable_and_lzero_now=1; + +set debug_sync='now SIGNAL con1_cont'; + +connection con1; +#--error ER_GET_ERRMSG +reap; +select * from t1 where pk<100; + +commit; +disconnect con1; +disconnect con2; +connection default; +set debug_sync='RESET'; + +drop table t1, ten, one_k; + diff --git a/mysql-test/suite/rocksdb/t/range_locking_rev_cf.test b/mysql-test/suite/rocksdb/t/range_locking_rev_cf.test new file mode 100644 index 000000000000..8b993764235a --- /dev/null +++ b/mysql-test/suite/rocksdb/t/range_locking_rev_cf.test @@ -0,0 +1,12 @@ +# +# Range locking tests. +# + +--source include/have_rocksdb.inc +--source suite/rocksdb/include/have_range_locking.inc + +--let pk_cf=rev:cf1 +--let PK_USES_REVERSE_CF=1 + +--source range_locking.inc + diff --git a/mysql-test/suite/rocksdb/t/range_locking_seek_for_update.test b/mysql-test/suite/rocksdb/t/range_locking_seek_for_update.test new file mode 100644 index 000000000000..542d76661e29 --- /dev/null +++ b/mysql-test/suite/rocksdb/t/range_locking_seek_for_update.test @@ -0,0 +1,308 @@ +# +# Range Locking : tests for SeekForUpdate feature +# + +--source include/have_rocksdb.inc +--source include/have_debug_sync.inc +--source suite/rocksdb/include/have_range_locking.inc +--enable_connect_log +show variables like 'rocksdb_use_range_locking'; + +create table t0(a int primary key); +insert into t0 values (0),(1),(2),(3),(4),(5),(6),(7),(8),(9); + +create table t1 ( + pk int, + a int, + primary key (pk) +) engine=rocksdb; + +insert into t1 select + A.a + B.a*10 + C.a*100, + A.a + B.a*10 + C.a*100 +from + t0 A, t0 B, t0 C; + +--echo # Make another connection to get the lock tree out of the STO-mode +connect 
(con1,localhost,root,,); +connection con1; +begin; +select * from t1 where pk=10 for update; + +connection default; +begin; +select * from t1 where pk=11 for update; + +let $select_from_is_rowlocks_current_trx_only=1; +--echo # Now, we will just see locks on 10=0xA and 11=0xB: +--source suite/rocksdb/include/select_from_is_rowlocks.inc + +--echo # +--echo # SeekForUpdate Test #1: A query with type=range (without upper bound) and LIMIT +--echo # +--replace_column 10 # +explain +select * from t1 where pk>=500 order by pk limit 3 for update; +select * from t1 where pk>=500 order by pk limit 3 for update; +--source suite/rocksdb/include/select_from_is_rowlocks.inc +rollback; + + +begin; +select * from t1 where pk=11 for update; +explain +select * from t1 order by pk limit 3 for update; +select * from t1 order by pk limit 3 for update; + +--source suite/rocksdb/include/select_from_is_rowlocks.inc + +rollback; +connection con1; +rollback; +disconnect con1; +connection default; +drop table t0, t1; + + +--echo # +--echo # Concurrent tests: let one thread do SeekForUpdate and the other +--echo # interfere by committing modifications +--echo # + +create table t0(a int primary key); +insert into t0 values (0),(1),(2),(3),(4),(5),(6),(7),(8),(9); + +create table t1 ( + pk int, + a int, + primary key (pk) +) engine=rocksdb; + +insert into t1 select + A.a + B.a*10 + C.a*100, + A.a + B.a*10 + C.a*100 +from + t0 A, t0 B, t0 C; + +select * from t1 where pk<10; +delete from t1 where pk<10; +select * from t1 where pk<10; + + +--echo # Test what happens when another transaction commits a row +--echo # right before the range we are about to lock (nothing) + +--replace_column 10 # +explain +select * from t1 where pk >=5 order by pk limit 3 for update; + +begin; + +set debug_sync='rocksdb.locking_iter_scan SIGNAL about_to_lock_range WAIT_FOR spoiler_inserted'; +send select * from t1 where pk >=5 order by pk limit 3 for update; + +connect (con1,localhost,root,,); +connection con1; +set 
debug_sync='now WAIT_FOR about_to_lock_range'; +insert into t1 values (3,3); +set debug_sync='now SIGNAL spoiler_inserted'; + +connection default; +reap; +rollback; + +delete from t1 where pk=3; + +--echo # +--echo # Now, repeat the test but let the other transaction insert the row into +--echo # the range we are locking + +--replace_column 10 # +explain +select * from t1 where pk >=5 order by pk limit 1 for update; + +begin; + +set debug_sync='rocksdb.locking_iter_scan SIGNAL about_to_lock_range WAIT_FOR spoiler_inserted'; +send +select * from t1 where pk >=5 order by pk limit 1 for update; + +connection con1; +set debug_sync='now WAIT_FOR about_to_lock_range'; +insert into t1 values (8,8); +set debug_sync='now SIGNAL spoiler_inserted'; + +connection default; +reap; + +--source suite/rocksdb/include/select_from_is_rowlocks.inc + +rollback; +delete from t1 where pk=8; + +--echo # +--echo # Repeat the third time, this time deleting the row that SeekForUpdate saw +--echo # +insert into t1 values (7,7); + +begin; + +set debug_sync='rocksdb.locking_iter_scan SIGNAL about_to_lock_range WAIT_FOR spoiler_inserted'; +send +select * from t1 where pk >=5 order by pk limit 1 for update; + +connection con1; +set debug_sync='now WAIT_FOR about_to_lock_range'; +delete from t1 where pk=7; +set debug_sync='now SIGNAL spoiler_inserted'; + +connection default; +reap; + +rollback; + +--echo # +--echo # Repeat the above test, but let the read fail with ER_LOCK_WAIT_TIMEOUT +--echo # error. 
MyRocks code should now be prepared that data reads cause this +--echo # error +--echo # +insert into t1 values (7,7); + +begin; + +set debug_sync='rocksdb.locking_iter_scan SIGNAL about_to_lock_range WAIT_FOR spoiler_inserted'; +send +select * from t1 where pk >=5 order by pk limit 1 for update; + +connection con1; +set debug_sync='now WAIT_FOR about_to_lock_range'; +begin; +delete from t1 where pk=7; +set debug_sync='now SIGNAL spoiler_inserted'; + +connection default; +--error ER_LOCK_WAIT_TIMEOUT +reap; + +rollback; + +connection con1; +rollback; +connection default; + +--echo # +--echo # Test the thd_killed check in the iterator +--echo # +let $conn_id=`select connection_id()`; +begin; +set debug_sync='rocksdb.locking_iter_scan SIGNAL about_to_lock_range WAIT_FOR go_get_killed'; +send +select * from t1 where pk >=5 order by pk limit 1 for update; + +connection con1; +set debug_sync='now WAIT_FOR about_to_lock_range'; +--disable_query_log +eval kill query $conn_id; +--enable_query_log +--echo kill query CONN_ID; +connection default; +--error ER_GET_ERRMSG +reap; +rollback; + +--echo # +--echo # Backward scan test +--echo # +connection con1; +begin; +select * from t1 where pk=500 for update; +connection default; + +insert into t1 values + (1001, 1001), + (1005, 1005), + (1007, 1007), + (1010, 1010); + +begin; +select * from t1 order by pk desc limit 2 for update; + +let $select_from_is_rowlocks_current_trx_only=1; + +--echo # The below will lock from pk=1007 (0x3ef) till the end of the table: +--source suite/rocksdb/include/select_from_is_rowlocks.inc + +rollback; + +begin; +select * from t1 where pk <1007 order by pk desc limit 2 for update; +--source suite/rocksdb/include/select_from_is_rowlocks.inc + +connection con1; +rollback; + +connection default; +rollback; + +--echo # +--echo # Backward scan test 2: error condition +--echo # +connection con1; +begin; +select * from t1 where pk=1010 for update; + +connection default; +begin; +--error ER_LOCK_WAIT_TIMEOUT 
+select * from t1 order by pk desc limit 2 for update; +rollback; + +connection con1; +rollback; +begin; +select * from t1 where pk=1007 for update; + +connection default; +begin; +--error ER_LOCK_WAIT_TIMEOUT +select * from t1 order by pk desc limit 2 for update; +rollback; + +connection con1; +rollback; + +disconnect con1; +connection default; +drop table t0,t1; + +--echo # +--echo # A test: full table scan doesn't lock gaps +--echo # + +create table t1 ( + pk int primary key, + a int +) engine=rocksdb; + +insert into t1 values (10,10),(20,20),(30,30); + +connect (con1,localhost,root,,); +connect (con2,localhost,root,,); + +connection con1; +begin; + +select * from t1 for update; + +connection con2; + +--error ER_LOCK_WAIT_TIMEOUT +insert into t1 values (5,5); + +connection con1; +rollback; + +disconnect con1; +disconnect con2; +connection default; +drop table t1; diff --git a/mysql-test/suite/rocksdb/t/range_locking_seek_for_update2.inc b/mysql-test/suite/rocksdb/t/range_locking_seek_for_update2.inc new file mode 100644 index 000000000000..6ab2f31cae58 --- /dev/null +++ b/mysql-test/suite/rocksdb/t/range_locking_seek_for_update2.inc @@ -0,0 +1,55 @@ + + +--source include/have_rocksdb.inc +--source include/have_debug_sync.inc +--source suite/rocksdb/include/have_range_locking.inc +--enable_connect_log +show variables like 'rocksdb_use_range_locking'; + + +create table t0(a int primary key); +insert into t0 values (0),(1),(2),(3),(4),(5),(6),(7),(8),(9); + +eval create table t1 ( + pk int, + a int, + primary key (pk) comment '$cf' +) engine=rocksdb; + +insert into t1 (pk) +select + A.a + B.a*10 + C.a*100 +from + t0 A, t0 B, t0 C; +delete from t1 where pk<100; + +connect (con1,localhost,root,,); +connection con1; + +begin; +set debug_sync='rocksdb.locking_iter_scan SIGNAL about_to_lock_range WAIT_FOR spoiler_inserted'; +send +select * from t1 where pk >=5 order by pk limit 5 for update; + +connection default; +set debug_sync='now WAIT_FOR about_to_lock_range'; 
+insert into t1 (pk) values +(10),(20),(30),(40),(50); +set debug_sync='now SIGNAL spoiler_inserted'; + +connection con1; +reap; +--echo # This must return 1, no 5: +select lock_count from information_schema.rocksdb_trx +where thread_id=CONNECTION_ID(); + +rollback; +disconnect con1; +connection default; +drop table t0, t1; + +--source range_locking_seek_for_update_iter_end.inc +set global rocksdb_enable_iterate_bounds=off; +--source range_locking_seek_for_update_iter_end.inc +set global rocksdb_enable_iterate_bounds=on; + diff --git a/mysql-test/suite/rocksdb/t/range_locking_seek_for_update2.test b/mysql-test/suite/rocksdb/t/range_locking_seek_for_update2.test new file mode 100644 index 000000000000..703331cab9a2 --- /dev/null +++ b/mysql-test/suite/rocksdb/t/range_locking_seek_for_update2.test @@ -0,0 +1,4 @@ + +--let cf=rlsfu_test +--source range_locking_seek_for_update2.inc + diff --git a/mysql-test/suite/rocksdb/t/range_locking_seek_for_update2_rev_cf.test b/mysql-test/suite/rocksdb/t/range_locking_seek_for_update2_rev_cf.test new file mode 100644 index 000000000000..0620593b4e78 --- /dev/null +++ b/mysql-test/suite/rocksdb/t/range_locking_seek_for_update2_rev_cf.test @@ -0,0 +1,4 @@ + +--let cf=rev:rlsfu_test + +--source range_locking_seek_for_update2.inc diff --git a/mysql-test/suite/rocksdb/t/range_locking_seek_for_update_iter_end.inc b/mysql-test/suite/rocksdb/t/range_locking_seek_for_update_iter_end.inc new file mode 100644 index 000000000000..3b0fb6c53b37 --- /dev/null +++ b/mysql-test/suite/rocksdb/t/range_locking_seek_for_update_iter_end.inc @@ -0,0 +1,41 @@ +--echo # +--echo # A testcase for locking at the end of the scan +--echo # +eval create table t1 ( + pk int, + primary key (pk) comment '$cf' +) engine=rocksdb; + +connect (con1,localhost,root,,); +connection con1; + +insert into t1 values (1), (10), (100); + +begin; +select * from t1 for update; + +connection default; +select * from t1; + +--error ER_LOCK_WAIT_TIMEOUT +insert into t1 values 
(150); + +connection con1; +rollback; + +begin; +--replace_column 10 # +explain +select * from t1 order by pk desc for update; +select * from t1 order by pk desc for update; + +connection default; +select * from t1; + +--error ER_LOCK_WAIT_TIMEOUT +insert into t1 values (0); + +disconnect con1; +connection default; +drop table t1; + diff --git a/mysql-test/suite/rocksdb/t/range_locking_shared_locks.test b/mysql-test/suite/rocksdb/t/range_locking_shared_locks.test new file mode 100644 index 000000000000..8d9b5e41a117 --- /dev/null +++ b/mysql-test/suite/rocksdb/t/range_locking_shared_locks.test @@ -0,0 +1,200 @@ +# +# Test for shared lock support for range locking +# +--source include/have_rocksdb.inc +--source suite/rocksdb/include/have_range_locking.inc +--enable_connect_log + +select @@rocksdb_use_range_locking; + +create table t0 (a int); +insert into t0 values (0),(1),(2),(3),(4),(5),(6),(7),(8),(9); + +create table t1 ( + pk int primary key, + a int +) engine=rocksdb; + + +insert into t1 select a,a from t0; + +--echo # A basic test for shared locks + +begin; +select * from t1 where pk=3 for update; +select * from t1 where pk=5 lock in share mode; +let $TRX1_ID=`select transaction_id from information_schema.rocksdb_trx where thread_id=connection_id()`; + +connect (con1,localhost,root,,); +connection con1; +begin; +select * from t1 where pk=5 lock in share mode; +let $TRX2_ID=`select transaction_id from information_schema.rocksdb_trx where thread_id=connection_id()`; +--echo # Now for pk=5 we should see two locks by TRX1 and TRX2 with mode=S: +--source suite/rocksdb/include/select_from_is_rowlocks.inc + +rollback; +--echo # Now, TRX2_ID should be gone: +--source suite/rocksdb/include/select_from_is_rowlocks.inc + +connection default; + +--echo # Get a read lock on pk=3 (where we have a write lock). 
+--echo # The result should be that we will still have a write lock +select * from t1 where pk=3 for update; +--source suite/rocksdb/include/select_from_is_rowlocks.inc + +--echo # Get a write lock on pk=5 (where we have a read lock). +--echo # The result should be that we will have a write lock. +select * from t1 where pk=5 for update; +--source suite/rocksdb/include/select_from_is_rowlocks.inc + +connection default; +rollback; + +--echo # +--echo # Test if a read lock inhibits write locks +--echo # + +begin; +select * from t1 where pk=2 lock in share mode; +select * from t1 where pk=8 for update; + +connection con1; +begin; + +--error ER_LOCK_WAIT_TIMEOUT +select * from t1 where pk=2 for update; + +--error ER_LOCK_WAIT_TIMEOUT +select * from t1 where pk between 0 and 4 for update; + +--error ER_LOCK_WAIT_TIMEOUT +delete from t1 where pk=2; + +--echo # Get a shared lock +select * from t1 where pk=2 lock in share mode; + +--echo # But this should still prevent us from acquiring a write lock on that value: +--error ER_LOCK_WAIT_TIMEOUT +select * from t1 where pk=2 for update; + +rollback; +connection default; +rollback; + +drop table t1; +create table t1 ( + pk int not null primary key, + a int not null, + key(a) +) engine=rocksdb; + +insert into t1 +select + A.a+10*B.a+100*C.a+1000*D.a, A.a+10*B.a+100*C.a+1000*D.a +from + t0 A, t0 B, t0 C, t0 D; +set global rocksdb_force_flush_memtable_now=1; + +connection con1; +begin; +select * from t1 where pk=900 for update; +let $TRX2_ID=`select transaction_id from information_schema.rocksdb_trx where thread_id=connection_id()`; + +connection default; +begin; +--replace_column 10 # +explain +select * from t1 where a between 2 and 5 lock in share mode; +select * from t1 where a between 2 and 5 lock in share mode; +let $TRX1_ID=`select transaction_id from information_schema.rocksdb_trx where thread_id=connection_id()`; + +--source suite/rocksdb/include/select_from_is_rowlocks.inc + +rollback; + +disconnect con1; + +drop table 
t0,t1; + +--echo # +--echo # Test shared point locks and lock escalation +--echo # +create table t0 (a int); +insert into t0 values (0),(1),(2),(3),(4),(5),(6),(7),(8),(9); + +create table t1 ( + pk int primary key, + a int +) engine=rocksdb; + +insert into t1 +select 1000 + 100*A.a + 10*B.a + C.a, 12345 from t0 A, t0 B, t0 C; + +show status like 'rocksdb_locktree_current_lock_memory'; + +connect (con1,localhost,root,,); +connection con1; + +begin; +--echo # CON1: get some shared locks +select * from t1 where pk=1001 lock in share mode; +select * from t1 where pk=1100 lock in share mode; +select * from t1 where pk=1200 lock in share mode; + +select * from t1 where pk=2500 lock in share mode; +let $TRX1_ID=`select transaction_id from information_schema.rocksdb_trx where thread_id=connection_id()`; + +connection default; +begin; +--echo # DEFAULT: get the same locks so we have locks with multiple owners +select * from t1 where pk=1001 lock in share mode; +select * from t1 where pk=1100 lock in share mode; +select * from t1 where pk=1200 lock in share mode; + +--echo # DEFAULT: get shared locks with one owner: +select * from t1 where pk=2510 lock in share mode; +let $TRX2_ID=`select transaction_id from information_schema.rocksdb_trx where thread_id=connection_id()`; + + +--echo # DEFAULT: exclusive locks on 0-10: +insert into t1 select A.a, 0 from t0 A; + +connection con1; +--echo # CON1: exclusive locks on 2000-2010: +insert into t1 select 2000+A.a, 0 from t0 A; + +let $order_by_rowkey=1; +#select * from information_schema.rocksdb_locks; +--source suite/rocksdb/include/select_from_is_rowlocks.inc + +connection default; +show status like 'rocksdb_locktree_current_lock_memory'; +set @save_mlm= @@rocksdb_max_lock_memory; + +--echo # Set the limit to cause lock escalation: +set @cur_mem_usage= (select + variable_value + from + performance_schema.global_status + where + variable_name='rocksdb_locktree_current_lock_memory'); + +set global rocksdb_max_lock_memory = 
cast(@cur_mem_usage+4 as SIGNED);
+
+connection con1;
+insert into t1 select 3000+A.a, 0 from t0 A;
+
+#select * from information_schema.rocksdb_locks;
+--source suite/rocksdb/include/select_from_is_rowlocks.inc
+
+connection con1;
+rollback;
+connection default;
+rollback;
+
+disconnect con1;
+set global rocksdb_max_lock_memory= cast(@save_mlm as SIGNED);
+
+drop table t0, t1;
diff --git a/mysql-test/suite/rocksdb/t/rocksdb.test b/mysql-test/suite/rocksdb/t/rocksdb.test
index 0f45700ebc69..2a318b0e4abc 100644
--- a/mysql-test/suite/rocksdb/t/rocksdb.test
+++ b/mysql-test/suite/rocksdb/t/rocksdb.test
@@ -3,6 +3,9 @@
 --source include/have_compact_range_for_drop_table.inc
 --source include/count_sessions.inc
 
+# Does SHOW WARNINGS and SHOW STATUS which change in Range Locking mode
+--source suite/rocksdb/include/not_range_locking.inc
+
 #
 # RocksDB Storage Engine tests
 #
diff --git a/mysql-test/suite/rocksdb/t/rocksdb_concurrent_delete.test b/mysql-test/suite/rocksdb/t/rocksdb_concurrent_delete.test
index cafd2e83668f..8fb61346f2ca 100644
--- a/mysql-test/suite/rocksdb/t/rocksdb_concurrent_delete.test
+++ b/mysql-test/suite/rocksdb/t/rocksdb_concurrent_delete.test
@@ -27,6 +27,10 @@
 # In all cases, RR gets snapshot conflict errors if non-first rows get
 # deleted by another transaction after scanning.
 
+# The tests do not work with range locking as it locks the rows it is
+# about to read, first.
+--source suite/rocksdb/include/not_range_locking.inc
+
 --source include/have_rocksdb.inc
 --source include/have_debug_sync.inc
 
diff --git a/mysql-test/suite/rocksdb/t/rocksdb_locks.test b/mysql-test/suite/rocksdb/t/rocksdb_locks.test
index 1c98e9a4cb3a..c050e261ab34 100644
--- a/mysql-test/suite/rocksdb/t/rocksdb_locks.test
+++ b/mysql-test/suite/rocksdb/t/rocksdb_locks.test
@@ -8,6 +8,9 @@
 #
 --source include/have_debug.inc
 
+# Range locking requests locks before doing snapshot checking.
+--source suite/rocksdb/include/not_range_locking.inc + --enable_connect_log create table t1 (pk int not null primary key) engine=rocksdb; diff --git a/mysql-test/suite/rocksdb/t/rocksdb_read_free_rpl.test b/mysql-test/suite/rocksdb/t/rocksdb_read_free_rpl.test index 9d2f43c03694..a3172b2065c5 100644 --- a/mysql-test/suite/rocksdb/t/rocksdb_read_free_rpl.test +++ b/mysql-test/suite/rocksdb/t/rocksdb_read_free_rpl.test @@ -78,7 +78,7 @@ update t1 set c2=100 where c1=3; delete from t1 where c1 <= 2; --source include/sync_slave_sql_with_master.inc --source include/rpl_connection_slave.inc -select case when variable_value-@up > 0 then 'false' else 'true' end as read_free from performance_schema.global_status where variable_name='rocksdb_num_get_for_update_calls'; +select case when (@@rocksdb_use_range_locking=1 OR variable_value-@up > 0) then 'false' else 'true' end as read_free from performance_schema.global_status where variable_name='rocksdb_num_get_for_update_calls'; select * from t1; --echo diff --git a/mysql-test/suite/rocksdb/t/rocksdb_timeout_rollback.test b/mysql-test/suite/rocksdb/t/rocksdb_timeout_rollback.test index 694594efd70f..1273a2b6f70a 100644 --- a/mysql-test/suite/rocksdb/t/rocksdb_timeout_rollback.test +++ b/mysql-test/suite/rocksdb/t/rocksdb_timeout_rollback.test @@ -46,6 +46,8 @@ begin work; insert into t1 values (9); insert into t1 values (10); +--echo # Fix for Range Locking: force a snapshot to be taken: +select * from t1 where a=100; update t1 set a = a + 1 where a = 2; connection con1; diff --git a/mysql-test/suite/rocksdb/t/rpl_row_not_found.inc b/mysql-test/suite/rocksdb/t/rpl_row_not_found.inc index 5a78979f0487..63b72ce5c5ae 100644 --- a/mysql-test/suite/rocksdb/t/rpl_row_not_found.inc +++ b/mysql-test/suite/rocksdb/t/rpl_row_not_found.inc @@ -3,6 +3,8 @@ --source include/have_debug.inc --source include/have_debug_sync.inc +--source suite/rocksdb/include/not_range_locking.inc + connection master; --disable_warnings drop table if exists 
t1; diff --git a/mysql-test/suite/rocksdb/t/select_count_for_update.test b/mysql-test/suite/rocksdb/t/select_count_for_update.test index 2c6f5d474a1f..aa7059dfc7e6 100644 --- a/mysql-test/suite/rocksdb/t/select_count_for_update.test +++ b/mysql-test/suite/rocksdb/t/select_count_for_update.test @@ -52,9 +52,23 @@ SET lock_wait_timeout = 1; SELECT COUNT(*) FROM t1 FORCE INDEX (sk); # ... but not with LOCK IN SHARE MODE / FOR UPDATE +let $uses_range_locking=`select @@rocksdb_use_range_locking`; + +if ($uses_range_locking == "0") { +--replace_regex /test.t1.PRIMARY/$FAILING_INDEX/ +} +if ($uses_range_locking == "1") { +--replace_regex /test.t1.sk/$FAILING_INDEX/ +} --error ER_LOCK_WAIT_TIMEOUT SELECT COUNT(*) FROM t1 FORCE INDEX (sk) LOCK IN SHARE MODE; +if ($uses_range_locking == "0") { +--replace_regex /test.t1.PRIMARY/$FAILING_INDEX/ +} +if ($uses_range_locking == "1") { +--replace_regex /test.t1.sk/$FAILING_INDEX/ +} --error ER_LOCK_WAIT_TIMEOUT SELECT COUNT(*) FROM t1 FORCE INDEX (sk) FOR UPDATE; diff --git a/mysql-test/suite/rocksdb/t/select_lock_in_share_mode.test b/mysql-test/suite/rocksdb/t/select_lock_in_share_mode.test index 23ce6d452344..cf9d53ff88ae 100644 --- a/mysql-test/suite/rocksdb/t/select_lock_in_share_mode.test +++ b/mysql-test/suite/rocksdb/t/select_lock_in_share_mode.test @@ -1,5 +1,8 @@ --source include/have_rocksdb.inc +# Range locking only supports exclusive locks currently. +--source suite/rocksdb/include/not_range_locking.inc + # # SELECT .. LOCK IN SHARE MODE # diff --git a/mysql-test/suite/rocksdb/t/skip_locked_nowait.test b/mysql-test/suite/rocksdb/t/skip_locked_nowait.test index bfa36714816b..3b8bcb033c07 100644 --- a/mysql-test/suite/rocksdb/t/skip_locked_nowait.test +++ b/mysql-test/suite/rocksdb/t/skip_locked_nowait.test @@ -2,5 +2,8 @@ # wl#8919 Implement NOWAIT and SKIP LOCKED # +# Range locking cannot support SKIP LOCKED? 
(TODO: but can support NOWAIT) +--source suite/rocksdb/include/not_range_locking.inc + --let $engine=ROCKSDB --source include/skip_locked_nowait.inc diff --git a/mysql-test/suite/rocksdb/t/trx_info.test b/mysql-test/suite/rocksdb/t/trx_info.test index 975bed6132c8..009c0ce67d49 100644 --- a/mysql-test/suite/rocksdb/t/trx_info.test +++ b/mysql-test/suite/rocksdb/t/trx_info.test @@ -11,7 +11,11 @@ insert into t1 values (2); set autocommit=0; select * from t1 for update; ---replace_column 1 _TRX_ID_ 3 _NAME_ 7 _KEY_ 14 _THREAD_ID_ +--replace_column 1 _TRX_ID_ 3 _NAME_ 5 2_or_3 7 _KEY_ 14 _THREAD_ID_ select * from information_schema.rocksdb_trx; +select + if(@@rocksdb_use_range_locking=1, LOCK_COUNT=3, LOCK_COUNT=2) as LOCK_COUNT_IS_CORRECT +from information_schema.rocksdb_trx; + DROP TABLE t1; diff --git a/mysql-test/suite/rocksdb/t/unique_check.test b/mysql-test/suite/rocksdb/t/unique_check.test index 47ca74d0e5eb..9814d89448d8 100644 --- a/mysql-test/suite/rocksdb/t/unique_check.test +++ b/mysql-test/suite/rocksdb/t/unique_check.test @@ -2,6 +2,11 @@ --source include/have_debug_sync.inc --source include/count_sessions.inc +# Doesn't work with range locking because lock tree waits do not set +# state="Waiting for row lock" in I_S.PROCESSLIST. See MDEV-17873 for +# details. +--source suite/rocksdb/include/not_range_locking.inc + # For GitHub issue#167 -- Unique key check doesn't work connect (con1, localhost, root,,); diff --git a/mysql-test/suite/rocksdb/t/unique_sec.inc b/mysql-test/suite/rocksdb/t/unique_sec.inc index ce0bb1e39a90..508816e6ace1 100644 --- a/mysql-test/suite/rocksdb/t/unique_sec.inc +++ b/mysql-test/suite/rocksdb/t/unique_sec.inc @@ -144,8 +144,16 @@ UPDATE t1 SET id5=37 WHERE id1=38; UPDATE t1 SET id5=34 WHERE id1=38; --echo # NULL values are unique +--echo # (Note: the following UPDATE reads through the whole table without +--echo # finding anything to update. 
With point locking, this is fine, +--echo # but with range locking it will time out while waiting on a row lock +--echo # that the other transaction is holding) +if (`select @@rocksdb_use_range_locking=0`) { UPDATE t1 SET id5=NULL WHERE value1 > 37; - +} +if (`select @@rocksdb_use_range_locking=1`) { +-- echo UPDATE t1 SET id5=NULL WHERE value1 > 37; +} connection con1; COMMIT; diff --git a/mysql-test/suite/rocksdb/t/varbinary_format.test b/mysql-test/suite/rocksdb/t/varbinary_format.test index fbebfeac85a7..0d8a35a1321c 100644 --- a/mysql-test/suite/rocksdb/t/varbinary_format.test +++ b/mysql-test/suite/rocksdb/t/varbinary_format.test @@ -1,6 +1,10 @@ --source include/have_debug.inc --source include/have_rocksdb.inc +# The test uses SELECT .. FOR UPDATE and examines which locks it acquires +# Range Locking will use different locks from point locking +--source suite/rocksdb/include/not_range_locking.inc + # Create a table with a varbinary key with the current format and validate # that it sorts correctly CREATE TABLE t1( diff --git a/mysql-test/suite/rocksdb/t/varchar_format.test b/mysql-test/suite/rocksdb/t/varchar_format.test index 3ea1a1a60b31..985b2c0c8e7d 100644 --- a/mysql-test/suite/rocksdb/t/varchar_format.test +++ b/mysql-test/suite/rocksdb/t/varchar_format.test @@ -1,6 +1,8 @@ --source include/have_debug.inc --source include/have_rocksdb.inc +--source suite/rocksdb/include/not_range_locking.inc + #################### # Create a table with a varchar key with the current format and validate # that it sorts correctly diff --git a/mysql-test/suite/rocksdb_sys_vars/r/rocksdb_max_lock_memory_basic.result b/mysql-test/suite/rocksdb_sys_vars/r/rocksdb_max_lock_memory_basic.result new file mode 100644 index 000000000000..614737fcfbc6 --- /dev/null +++ b/mysql-test/suite/rocksdb_sys_vars/r/rocksdb_max_lock_memory_basic.result @@ -0,0 +1,7 @@ +SET @start_global_value = @@global.ROCKSDB_USE_RANGE_LOCKING; +SELECT @start_global_value; +@start_global_value +0 +"Trying 
to set variable @@global.ROCKSDB_USE_RANGE_LOCKING to 444. It should fail because it is readonly." +SET @@global.ROCKSDB_USE_RANGE_LOCKING = 444; +ERROR HY000: Variable 'rocksdb_use_range_locking' is a read only variable diff --git a/mysql-test/suite/rocksdb_sys_vars/r/rocksdb_use_range_lock_manager_as_point_basic.result b/mysql-test/suite/rocksdb_sys_vars/r/rocksdb_use_range_lock_manager_as_point_basic.result new file mode 100644 index 000000000000..b2c00bccf799 --- /dev/null +++ b/mysql-test/suite/rocksdb_sys_vars/r/rocksdb_use_range_lock_manager_as_point_basic.result @@ -0,0 +1,7 @@ +SET @start_global_value = @@global.ROCKSDB_USE_RANGE_LOCK_MANAGER_AS_POINT; +SELECT @start_global_value; +@start_global_value +0 +"Trying to set variable @@global.ROCKSDB_USE_RANGE_LOCK_MANAGER_AS_POINT to 444. It should fail because it is readonly." +SET @@global.ROCKSDB_USE_RANGE_LOCK_MANAGER_AS_POINT = 444; +ERROR HY000: Variable 'rocksdb_use_range_lock_manager_as_point' is a read only variable diff --git a/mysql-test/suite/rocksdb_sys_vars/r/rocksdb_use_range_locking_basic.result b/mysql-test/suite/rocksdb_sys_vars/r/rocksdb_use_range_locking_basic.result new file mode 100644 index 000000000000..614737fcfbc6 --- /dev/null +++ b/mysql-test/suite/rocksdb_sys_vars/r/rocksdb_use_range_locking_basic.result @@ -0,0 +1,7 @@ +SET @start_global_value = @@global.ROCKSDB_USE_RANGE_LOCKING; +SELECT @start_global_value; +@start_global_value +0 +"Trying to set variable @@global.ROCKSDB_USE_RANGE_LOCKING to 444. It should fail because it is readonly." 
+SET @@global.ROCKSDB_USE_RANGE_LOCKING = 444; +ERROR HY000: Variable 'rocksdb_use_range_locking' is a read only variable diff --git a/mysql-test/suite/rocksdb_sys_vars/t/rocksdb_max_lock_memory_basic.test b/mysql-test/suite/rocksdb_sys_vars/t/rocksdb_max_lock_memory_basic.test new file mode 100644 index 000000000000..ee185aba6600 --- /dev/null +++ b/mysql-test/suite/rocksdb_sys_vars/t/rocksdb_max_lock_memory_basic.test @@ -0,0 +1,5 @@ +--source include/have_rocksdb.inc +--let $sys_var=ROCKSDB_USE_RANGE_LOCKING +--let $read_only=1 +--let $session=0 +--source ../include/rocksdb_sys_var.inc diff --git a/mysql-test/suite/rocksdb_sys_vars/t/rocksdb_use_range_lock_manager_as_point_basic.test b/mysql-test/suite/rocksdb_sys_vars/t/rocksdb_use_range_lock_manager_as_point_basic.test new file mode 100644 index 000000000000..ee6eda473fd4 --- /dev/null +++ b/mysql-test/suite/rocksdb_sys_vars/t/rocksdb_use_range_lock_manager_as_point_basic.test @@ -0,0 +1,5 @@ +--source include/have_rocksdb.inc +--let $sys_var=ROCKSDB_USE_RANGE_LOCK_MANAGER_AS_POINT +--let $read_only=1 +--let $session=0 +--source ../include/rocksdb_sys_var.inc diff --git a/mysql-test/suite/rocksdb_sys_vars/t/rocksdb_use_range_locking_basic.test b/mysql-test/suite/rocksdb_sys_vars/t/rocksdb_use_range_locking_basic.test new file mode 100644 index 000000000000..ee185aba6600 --- /dev/null +++ b/mysql-test/suite/rocksdb_sys_vars/t/rocksdb_use_range_locking_basic.test @@ -0,0 +1,5 @@ +--source include/have_rocksdb.inc +--let $sys_var=ROCKSDB_USE_RANGE_LOCKING +--let $read_only=1 +--let $session=0 +--source ../include/rocksdb_sys_var.inc diff --git a/storage/rocksdb/CMakeLists.txt b/storage/rocksdb/CMakeLists.txt index 18f4df7d6c57..3a4722de3cc9 100644 --- a/storage/rocksdb/CMakeLists.txt +++ b/storage/rocksdb/CMakeLists.txt @@ -159,6 +159,7 @@ SET(ROCKSDB_SE_SOURCES logger.h rdb_datadic.cc rdb_datadic.h rdb_iterator.cc rdb_iterator.h + rdb_locking_iter.cc rdb_locking_iter.h rdb_sst_partitioner_factory.h 
rdb_cf_options.cc rdb_cf_options.h
     rdb_cf_manager.cc rdb_cf_manager.h
diff --git a/storage/rocksdb/get_rocksdb_files.sh b/storage/rocksdb/get_rocksdb_files.sh
index 756bc9ec25bb..1f45a9528f7b 100755
--- a/storage/rocksdb/get_rocksdb_files.sh
+++ b/storage/rocksdb/get_rocksdb_files.sh
@@ -11,9 +11,9 @@ FOLLY_DIR = ./third-party/folly
 all:" > $MKFILE
 if [ -z $1 ]; then
-  echo " @echo \"\$(LIB_SOURCES)\"" >> $MKFILE
+  echo " @echo \"\$(LIB_SOURCES)\" \"\$(RANGE_TREE_SOURCES)\"" >> $MKFILE
 else
-  echo " @echo \"\$(LIB_SOURCES)\" \"\$(FOLLY_SOURCES)\"" >> $MKFILE
+  echo " @echo \"\$(LIB_SOURCES)\" \"\$(RANGE_TREE_SOURCES)\" \"\$(FOLLY_SOURCES)\"" >> $MKFILE
 fi
 for f in `make --makefile $MKFILE`
 do
diff --git a/storage/rocksdb/ha_rocksdb.cc b/storage/rocksdb/ha_rocksdb.cc
index 9b8d3dfbb81d..4cfc1ce49dea 100644
--- a/storage/rocksdb/ha_rocksdb.cc
+++ b/storage/rocksdb/ha_rocksdb.cc
@@ -105,6 +105,7 @@
 #include "./rdb_i_s.h"
 #include "./rdb_index_merge.h"
 #include "./rdb_iterator.h"
+#include "./rdb_locking_iter.h"
 #include "./rdb_mutex_wrapper.h"
 #include "./rdb_native_dd.h"
 #include "./rdb_psi.h"
@@ -792,6 +793,9 @@ static void rocksdb_set_delayed_write_rate(THD *thd, struct SYS_VAR *var,
 static void rocksdb_set_max_latest_deadlocks(THD *thd, struct SYS_VAR *var,
                                              void *var_ptr, const void *save);
 
+static void rocksdb_set_max_lock_memory(THD *thd, struct SYS_VAR *, void *,
+                                        const void *save);
+
 static void rdb_set_collation_exception_list(const char *exception_list);
 static void rocksdb_set_collation_exception_list(THD *thd, struct SYS_VAR *var,
                                                  void *var_ptr,
@@ -1000,6 +1004,17 @@ enum file_checksums_type {
 static ulong rocksdb_file_checksums = file_checksums_type::CHECKSUMS_OFF;
 static std::time_t last_binlog_ttl_compaction_ts = std::time(nullptr);
 
+// Range Locking: how much memory can be used for the lock data structure,
+// which holds the locks acquired by all clients.
+// MySQL and RocksDB data types do not match. 
Check that they are the same +// underlying type. +static_assert(sizeof(ulonglong) == sizeof(std::size_t)); +static ulonglong rocksdb_max_lock_memory; + +bool rocksdb_use_range_locking = false; +static bool rocksdb_use_range_lock_manager_as_point = false; +std::shared_ptr range_lock_mgr; + static std::atomic rocksdb_row_lock_deadlocks(0); static std::atomic rocksdb_row_lock_wait_timeouts(0); static std::atomic rocksdb_snapshot_conflict_errors(0); @@ -1888,6 +1903,13 @@ static MYSQL_SYSVAR_UINT(max_latest_deadlocks, rocksdb_max_latest_deadlocks, nullptr, rocksdb_set_max_latest_deadlocks, rocksdb::kInitialMaxDeadlocks, 0, UINT32_MAX, 0); +static MYSQL_SYSVAR_ULONGLONG( + max_lock_memory, rocksdb_max_lock_memory, PLUGIN_VAR_RQCMDARG, + "If range locking is used, the maximum amount of memory that locks from " + "all transactions can use at a time", + nullptr, rocksdb_set_max_lock_memory, + /* initial */ 1073741824, 0, UINT64_MAX, 0); + static MYSQL_SYSVAR_ENUM( info_log_level, rocksdb_info_log_level, PLUGIN_VAR_RQCMDARG, "Filter level for info logs to be written mysqld error log. 
" @@ -2816,6 +2838,18 @@ static MYSQL_SYSVAR_BOOL(table_stats_use_table_scan, rocksdb_update_table_stats_use_table_scan, rocksdb_table_stats_use_table_scan); +static MYSQL_SYSVAR_BOOL(use_range_locking, rocksdb_use_range_locking, + PLUGIN_VAR_OPCMDARG | PLUGIN_VAR_READONLY, + "Use range locking", nullptr, nullptr, + rocksdb_use_range_locking); + +static MYSQL_SYSVAR_BOOL(use_range_lock_manager_as_point, + rocksdb_use_range_lock_manager_as_point, + PLUGIN_VAR_OPCMDARG | PLUGIN_VAR_READONLY, + "Use range locking, but only for point locking", + nullptr, nullptr, + rocksdb_use_range_lock_manager_as_point); + static MYSQL_SYSVAR_BOOL( allow_to_start_after_corruption, rocksdb_allow_to_start_after_corruption, PLUGIN_VAR_OPCMDARG | PLUGIN_VAR_READONLY, @@ -3280,6 +3314,10 @@ static struct SYS_VAR *rocksdb_system_variables[] = { MYSQL_SYSVAR(manual_compaction_bottommost_level), MYSQL_SYSVAR(rollback_on_timeout), + MYSQL_SYSVAR(use_range_locking), + MYSQL_SYSVAR(use_range_lock_manager_as_point), + MYSQL_SYSVAR(max_lock_memory), + MYSQL_SYSVAR(enable_insert_with_update_caching), MYSQL_SYSVAR(trace_block_cache_access), MYSQL_SYSVAR(trace_queries), @@ -3590,7 +3628,7 @@ class Rdb_transaction { ulonglong m_insert_count = 0; ulonglong m_update_count = 0; ulonglong m_delete_count = 0; - // per row data + // Per row data. 
With range locking, each locked range is counted here, too ulonglong m_row_lock_count = 0; std::unordered_map m_auto_incr_map; @@ -3671,8 +3709,12 @@ class Rdb_transaction { virtual rocksdb::Status do_pop_savepoint() = 0; virtual void do_rollback_to_savepoint() = 0; + [[nodiscard]] virtual bool in_snapshot_ignore_mode() const = 0; + virtual void end_ignore_snapshot_if_needed() = 0; + public: rocksdb::ReadOptions m_read_opts[2]; + const char *m_mysql_log_file_name; my_off_t m_mysql_log_offset; const char *m_mysql_max_gtid; @@ -3918,6 +3960,18 @@ class Rdb_transaction { virtual void release_lock(const Rdb_key_def &key_descr, const std::string &rowkey, bool force = false) = 0; + [[nodiscard]] virtual rocksdb::Status lock_range( + rocksdb::ColumnFamilyHandle &cf, const rocksdb::Endpoint &start, + const rocksdb::Endpoint &end) = 0; + + [[nodiscard]] rocksdb::Status lock_singlepoint_range( + rocksdb::ColumnFamilyHandle &cf, const rocksdb::Slice &point) { + // Normally, one needs to "flip" the endpoint type for reverse-ordered CFs. + // But here we are locking just one point so this is not necessary. 
+ rocksdb::Endpoint endp(point, false); + return lock_range(cf, endp, endp); + } + virtual bool prepare() = 0; bool commit_or_rollback() { @@ -3932,6 +3986,8 @@ class Rdb_transaction { } bool commit() { + end_ignore_snapshot_if_needed(); + if (get_write_count() == 0) { rollback(); return false; @@ -3989,12 +4045,20 @@ class Rdb_transaction { m_is_delayed_snapshot = false; } + protected: + void locking_iter_created() { + if (!m_snapshot_timestamp) + rdb->GetEnv()->GetCurrentTime(&m_snapshot_timestamp); + } + + public: virtual void acquire_snapshot(bool acquire_now, TABLE_TYPE table_type) = 0; virtual void release_snapshot(TABLE_TYPE table_type) = 0; bool has_snapshot(TABLE_TYPE table_type) const { if (table_type == INTRINSIC_TMP) return false; - return m_read_opts[table_type].snapshot != nullptr; + return m_read_opts[table_type].snapshot != nullptr || + in_snapshot_ignore_mode(); } private: @@ -4731,7 +4795,8 @@ class Rdb_transaction { [[nodiscard]] virtual std::unique_ptr get_iterator( const rocksdb::ReadOptions &options, - rocksdb::ColumnFamilyHandle &column_family, TABLE_TYPE table_type) = 0; + rocksdb::ColumnFamilyHandle &column_family, TABLE_TYPE table_type, + const Rdb_key_def &kd, bool use_locking_iterator = false) = 0; virtual void multi_get(rocksdb::ColumnFamilyHandle &column_family, size_t num_keys, const rocksdb::Slice *keys, @@ -4740,14 +4805,16 @@ class Rdb_transaction { bool sorted_input) const = 0; [[nodiscard]] std::unique_ptr get_iterator( - rocksdb::ColumnFamilyHandle &column_family, bool skip_bloom_filter, - const rocksdb::Slice &eq_cond_lower_bound, + rocksdb::ColumnFamilyHandle &column_family, const Rdb_key_def &kd, + bool skip_bloom_filter, const rocksdb::Slice &eq_cond_lower_bound, const rocksdb::Slice &eq_cond_upper_bound, TABLE_TYPE table_type, - bool read_current = false, bool create_snapshot = true) { + bool read_current = false, bool create_snapshot = true, + bool use_locking_iterator = false) { // Make sure we are not doing both 
read_current (which implies we don't // want a snapshot) and create_snapshot which makes sure we create // a snapshot assert(!read_current || !create_snapshot); + assert(!use_locking_iterator || table_type == TABLE_TYPE::USER_TABLE); if (create_snapshot) acquire_snapshot(true, table_type); @@ -4772,12 +4839,14 @@ class Rdb_transaction { if (read_current) { options.snapshot = nullptr; } - return get_iterator(options, column_family, table_type); + return get_iterator(options, column_family, table_type, kd, + use_locking_iterator); } virtual bool is_tx_started(TABLE_TYPE table_type) const = 0; virtual void start_tx(TABLE_TYPE table_type) = 0; - virtual void start_stmt() = 0; + virtual void start_stmt(bool is_dml_statement) = 0; + virtual void start_autocommit_stmt(bool is_dml_statement) = 0; virtual void set_name() = 0; protected: @@ -4875,6 +4944,8 @@ class Rdb_transaction { m_writes_at_last_savepoint = m_write_count[USER_TABLE]; } + end_ignore_snapshot_if_needed(); + return HA_EXIT_SUCCESS; } @@ -5001,6 +5072,55 @@ class Rdb_transaction_impl : public Rdb_transaction { std::vector m_rocksdb_tx{nullptr, nullptr}; std::vector m_rocksdb_reuse_tx{nullptr, nullptr}; + /** If true, the current statement should not use a snapshot for reading. Note + that in a multi-statement transaction, the snapshot may have been allocated by + one of the previous statements. This flag cannot be replaced with an + m_saved_snapshot != nullptr check because the read snapshot does not have to + exist. */ + bool m_stmt_ignores_snapshot = false; + + /** Snapshot-ignore mode will put away m_reads_opts.snapshot here. 
*/ + const rocksdb::Snapshot *m_saved_snapshot = nullptr; + + void start_ignore_snapshot() { + // This may be called several times for the same statement + if (in_snapshot_ignore_mode()) { + assert(m_read_opts[TABLE_TYPE::USER_TABLE].snapshot == nullptr); + return; + } + + assert(m_saved_snapshot == nullptr); + + m_saved_snapshot = m_read_opts[TABLE_TYPE::USER_TABLE].snapshot; + m_read_opts[TABLE_TYPE::USER_TABLE].snapshot = nullptr; + m_stmt_ignores_snapshot = true; + + // For repeatable-read AUTOCOMMIT statements on systems using range locking + // and ttl, there is a possibility of no snapshot and m_snapshot_time + // assigned. Compaction relies on the oldest snapshot time found in RocksDB + // to determine what is visible to transactions, so it's possible for + // compaction to remove rows that were already read by an autocommit + // statement (e.g. insert into .. select union select) since no snapshot was + // created to hold them. + // + // There will be further fixes for TTL which should resolve this case. + if (!m_snapshot_timestamp) + rdb->GetEnv()->GetCurrentTime(&m_snapshot_timestamp); + } + + void maybe_start_ignore_snapshot(bool is_dml_statement) { + if (rocksdb_use_range_locking && is_dml_statement) { + /* + In Range Locking mode, RocksDB does not do "key tracking". + Use InnoDB-like concurrency mode: make the DML statements always read + the latest data (instead of using transaction's snapshot). + This "downgrades" the transaction isolation to READ-COMMITTED on the + primary, but in return the actions can be replayed on the replica. 
+ */ + start_ignore_snapshot(); + } + } + public: void set_lock_timeout(int timeout_sec_arg, TABLE_TYPE table_type) override { assert(!is_ac_nl_ro_rc_transaction()); @@ -5034,6 +5154,16 @@ class Rdb_transaction_impl : public Rdb_transaction { bool is_writebatch_trx() const override { return false; } + rocksdb::Status lock_range(rocksdb::ColumnFamilyHandle &cf, + const rocksdb::Endpoint &start_endp, + const rocksdb::Endpoint &end_endp) override { + assert(!is_ac_nl_ro_rc_transaction()); + + incr_row_lock_count(TABLE_TYPE::USER_TABLE); + return m_rocksdb_tx[TABLE_TYPE::USER_TABLE]->GetRangeLock(&cf, start_endp, + end_endp); + } + private: void release_tx(void) { // We are done with the current active transaction object. Preserve it @@ -5138,6 +5268,8 @@ class Rdb_transaction_impl : public Rdb_transaction { public: void rollback() override { + assert(!in_snapshot_ignore_mode()); + on_rollback(); m_write_count[USER_TABLE] = 0; m_write_count[INTRINSIC_TMP] = 0; @@ -5171,7 +5303,7 @@ class Rdb_transaction_impl : public Rdb_transaction { return; } - if (m_read_opts[table_type].snapshot == nullptr) { + if (has_snapshot(table_type)) { const auto thd_ss = std::static_pointer_cast( m_thd->get_explicit_snapshot()); if (thd_ss) { @@ -5379,13 +5511,18 @@ class Rdb_transaction_impl : public Rdb_transaction { [[nodiscard]] std::unique_ptr get_iterator( const rocksdb::ReadOptions &options, - rocksdb::ColumnFamilyHandle &column_family, - TABLE_TYPE table_type) override { + rocksdb::ColumnFamilyHandle &column_family, TABLE_TYPE table_type, + const Rdb_key_def &kd, bool use_locking_iterator) override { if (table_type == USER_TABLE) { global_stats.queries[QUERIES_RANGE].inc(); } - return std::unique_ptr( - m_rocksdb_tx[table_type]->GetIterator(options, &column_family)); + if (use_locking_iterator) { + locking_iter_created(); + return GetLockingIterator(*m_rocksdb_tx[TABLE_TYPE::USER_TABLE], options, + column_family, kd, &m_row_lock_count); + } else + return std::unique_ptr( + 
m_rocksdb_tx[table_type]->GetIterator(options, &column_family)); } const rocksdb::Transaction *get_rdb_trx() const { @@ -5477,26 +5614,48 @@ class Rdb_transaction_impl : public Rdb_transaction { m_rocksdb_tx[TABLE_TYPE::USER_TABLE]->RollbackToSavePoint(); } + [[nodiscard]] bool in_snapshot_ignore_mode() const override { + return m_stmt_ignores_snapshot; + } + + void end_ignore_snapshot_if_needed() override { + if (!in_snapshot_ignore_mode()) return; + + if (m_read_opts[TABLE_TYPE::USER_TABLE].snapshot != nullptr) { + // FIXME(laurynas): how to free this snapshot? + } + + m_stmt_ignores_snapshot = false; + m_read_opts[TABLE_TYPE::USER_TABLE].snapshot = m_saved_snapshot; + m_saved_snapshot = nullptr; + } + /* Start a statement inside a multi-statement transaction. - @todo: are we sure this is called once (and not several times) per - statement start? + @note: If a statement uses N tables, this function will be called N times, + for each TABLE object that is used. For hooking to start of statement that is its own transaction, see ha_rocksdb::external_lock(). 
*/ - void start_stmt() override { + void start_stmt(bool is_dml_statement) override { + maybe_start_ignore_snapshot(is_dml_statement); // Set the snapshot to delayed acquisition (SetSnapshotOnNextOperation) acquire_snapshot(can_acquire_snapshot_without_conflicts(), TABLE_TYPE::USER_TABLE); } + void start_autocommit_stmt(bool is_dml_statement) override { + maybe_start_ignore_snapshot(is_dml_statement); + } + /* This must be called when last statement is rolled back, but the transaction continues */ void rollback_stmt() override { + end_ignore_snapshot_if_needed(); /* TODO: here we must release the locks taken since the start_stmt() call */ if (m_rocksdb_tx[TABLE_TYPE::USER_TABLE]) { const rocksdb::Snapshot *const org_snapshot = @@ -5527,6 +5686,8 @@ class Rdb_transaction_impl : public Rdb_transaction { } virtual ~Rdb_transaction_impl() override { + assert(!in_snapshot_ignore_mode()); + // Remove from the global list before all other processing is started. // Otherwise, information_schema.rocksdb_trx can crash on this object. Rdb_transaction::remove_from_global_trx_list(); @@ -5618,6 +5779,10 @@ class Rdb_writebatch_impl : public Rdb_transaction { void do_rollback_to_savepoint() override { m_batch->RollbackToSavePoint(); } + [[nodiscard]] bool in_snapshot_ignore_mode() const override { return false; } + + void end_ignore_snapshot_if_needed() override {} + public: bool is_writebatch_trx() const override { return true; } @@ -5638,6 +5803,12 @@ class Rdb_writebatch_impl : public Rdb_transaction { // Nothing to do here since we don't hold any row locks. 
} + rocksdb::Status lock_range(rocksdb::ColumnFamilyHandle &, + const rocksdb::Endpoint &, + const rocksdb::Endpoint &) override { + return rocksdb::Status::OK(); + } + void rollback() override { on_rollback(); m_write_count[TABLE_TYPE::USER_TABLE] = 0; @@ -5794,7 +5965,7 @@ class Rdb_writebatch_impl : public Rdb_transaction { [[nodiscard]] std::unique_ptr get_iterator( const rocksdb::ReadOptions &options, rocksdb::ColumnFamilyHandle &, - TABLE_TYPE table_type) override { + TABLE_TYPE table_type, const Rdb_key_def &, bool) override { if (table_type == INTRINSIC_TMP) { assert(false); return nullptr; @@ -5830,7 +6001,8 @@ class Rdb_writebatch_impl : public Rdb_transaction { void set_name() override {} - void start_stmt() override {} + void start_stmt(bool) override {} + void start_autocommit_stmt(bool) override {} void rollback_stmt() override { if (m_batch) rollback_to_stmt_savepoint(); @@ -6631,8 +6803,9 @@ class Rdb_snapshot_status : public Rdb_tx_list_walker { "=========================================\n"; } - static Rdb_deadlock_info::Rdb_dl_trx_info get_dl_txn_info( - const rocksdb::DeadlockInfo &txn, const GL_INDEX_ID &gl_index_id) { + template + [[nodiscard]] static Rdb_deadlock_info::Rdb_dl_trx_info get_dl_txn_info( + const PathStruct &txn, GL_INDEX_ID gl_index_id) { Rdb_deadlock_info::Rdb_dl_trx_info txn_data; txn_data.trx_id = txn.m_txn_id; @@ -6656,24 +6829,51 @@ class Rdb_snapshot_status : public Rdb_tx_list_walker { txn_data.cf_name = (cfh) ? cfh->GetName() : "NOT FOUND; CF_ID: " + std::to_string(txn.m_cf_id); - - txn_data.waiting_key = - rdb_hexdump(txn.m_waiting_key.data(), txn.m_waiting_key.length(), 0); - + txn_data.waiting_key = format_wait_key(txn); txn_data.exclusive_lock = txn.m_exclusive; return txn_data; } - static Rdb_deadlock_info get_dl_path_trx_info( - const rocksdb::DeadlockPath &path_entry) { + // Get the key to use to find the index number (and then, index name). 
Two + // functions with matching signatures so get_dl_path_trx_info() template can + // be used with both point and range locking. + static const std::string &get_key_for_indexnr( + const rocksdb::DeadlockInfo &info) { + return info.m_waiting_key; + } + + static const std::string &get_key_for_indexnr( + const rocksdb::RangeDeadlockInfo &info) { + // Range locks do not span across indexes, so take the left bound + return info.m_start.slice; + } + + // Print the locked key (or range) in hex. Two functions with matching + // signatures so get_dl_path_trx_info() template can be used with both point + // and range locking. + static std::string format_wait_key(const rocksdb::DeadlockInfo &info) { + return rdb_hexdump(info.m_waiting_key.c_str(), info.m_waiting_key.length(), + 0); + } + + static std::string format_wait_key(const rocksdb::RangeDeadlockInfo &info) { + // FIXME(laurynas): limit? length? + return rdb_hexdump_range(info.m_start, info.m_end); + } + + // Get deadlock path info. A templated function so one can use it with both + // point and range locking. 
+ template + static Rdb_deadlock_info get_dl_path_trx_info(const PathStruct &path_entry) { Rdb_deadlock_info deadlock_info; for (auto it = path_entry.path.begin(); it != path_entry.path.end(); it++) { const auto &txn = *it; + auto waiting_key = get_key_for_indexnr(txn); const GL_INDEX_ID gl_index_id = { txn.m_cf_id, rdb_netbuf_to_uint32(reinterpret_cast( - txn.m_waiting_key.c_str()))}; + waiting_key.c_str()))}; deadlock_info.path.push_back(get_dl_txn_info(txn, gl_index_id)); } assert_IFF(path_entry.limit_exceeded, path_entry.path.empty()); @@ -6698,7 +6898,7 @@ class Rdb_snapshot_status : public Rdb_tx_list_walker { /* Calculate the duration the snapshot has existed */ int64_t snapshot_timestamp = tx->m_snapshot_timestamp; - if (snapshot_timestamp != 0) { + if (snapshot_timestamp != 0 && tx->has_snapshot(TABLE_TYPE::USER_TABLE)) { int64_t curr_time; rdb->GetEnv()->GetCurrentTime(&curr_time); @@ -6717,8 +6917,8 @@ class Rdb_snapshot_status : public Rdb_tx_list_walker { } } - void populate_deadlock_buffer() { - auto dlock_buffer = rdb->GetDeadlockInfoBuffer(); + template + void populate_deadlock_buffer_tmpl(PathStruct &dlock_buffer) { m_data += "----------LATEST DETECTED DEADLOCKS----------\n"; for (const auto &path_entry : dlock_buffer) { @@ -6758,12 +6958,32 @@ class Rdb_snapshot_status : public Rdb_tx_list_walker { } } - std::vector get_deadlock_info() { + void populate_deadlock_buffer() { + if (rocksdb_use_range_locking) { + auto dlock_buffer = range_lock_mgr->GetRangeDeadlockInfoBuffer(); + populate_deadlock_buffer_tmpl(dlock_buffer); + } else { + auto dlock_buffer = rdb->GetDeadlockInfoBuffer(); + populate_deadlock_buffer_tmpl(dlock_buffer); + } + } + + [[nodiscard]] std::vector get_deadlock_info() { std::vector deadlock_info; - auto dlock_buffer = rdb->GetDeadlockInfoBuffer(); - for (const auto &path_entry : dlock_buffer) { - if (!path_entry.limit_exceeded) { - deadlock_info.push_back(get_dl_path_trx_info(path_entry)); + + if (rocksdb_use_range_locking) { + 
auto dlock_buffer = range_lock_mgr->GetRangeDeadlockInfoBuffer(); + for (const auto &path_entry : dlock_buffer) { + if (!path_entry.limit_exceeded) { + deadlock_info.push_back(get_dl_path_trx_info(path_entry)); + } + } + } else { + auto dlock_buffer = rdb->GetDeadlockInfoBuffer(); + for (const auto &path_entry : dlock_buffer) { + if (!path_entry.limit_exceeded) { + deadlock_info.push_back(get_dl_path_trx_info(path_entry)); + } } } return deadlock_info; @@ -7284,9 +7504,12 @@ static bool rocksdb_collect_hton_log_info(handlerton *const /* unused */, return ret_val; } -static inline void rocksdb_register_tx( - handlerton *const hton MY_ATTRIBUTE((__unused__)), THD *const thd, - Rdb_transaction *const tx) { +/* + @param is_dml_stmt If true, we are in a DML statement +*/ +static inline void rocksdb_register_tx(handlerton *const, THD *const thd, + Rdb_transaction *const tx, + bool is_dml_stmt) { assert(tx != nullptr); trans_register_ha(thd, false, rocksdb_hton, NULL); @@ -7301,8 +7524,10 @@ static inline void rocksdb_register_tx( } } if (!is_autocommit(*thd)) { - tx->start_stmt(); + tx->start_stmt(is_dml_stmt); trans_register_ha(thd, true, rocksdb_hton, NULL); + } else { + tx->start_autocommit_stmt(is_dml_stmt); + } } @@ -7389,7 +7614,7 @@ static int rocksdb_start_tx_and_assign_read_view( assert(!tx->has_snapshot(TABLE_TYPE::USER_TABLE)); tx->set_tx_read_only(true); - rocksdb_register_tx(hton, thd, tx); + rocksdb_register_tx(hton, thd, tx, false); tx->acquire_snapshot(true, TABLE_TYPE::USER_TABLE); if (!ss_info) { (void)tx->get_or_create_ttl_read_filtering_ts(); @@ -7451,7 +7676,7 @@ static int rocksdb_start_tx_with_shared_read_view( assert(!tx->has_snapshot(TABLE_TYPE::USER_TABLE)); tx->set_tx_read_only(true); - rocksdb_register_tx(hton, thd, tx); + rocksdb_register_tx(hton, thd, tx, false); tx->acquire_snapshot(true, TABLE_TYPE::USER_TABLE); // case: an explicit snapshot was not assigned to this transaction @@ -7715,6 +7940,21 @@ static void
move_wals_to_target_dir() { } } +/* + Range locking escalation barrier function + The bytes used for the index number are compared. Endpoints + that span indexes are not merged together. +*/ +static bool rocksdb_escalation_barrier(const rocksdb::Endpoint &a, + const rocksdb::Endpoint &b) { + assert(a.slice.size() >= Rdb_key_def::INDEX_NUMBER_SIZE); + assert(b.slice.size() >= Rdb_key_def::INDEX_NUMBER_SIZE); + + return a.slice.size() >= Rdb_key_def::INDEX_NUMBER_SIZE && + b.slice.size() >= Rdb_key_def::INDEX_NUMBER_SIZE && + memcmp(a.slice.data(), b.slice.data(), Rdb_key_def::INDEX_NUMBER_SIZE); +} + /* Storage Engine initialization function, invoked when plugin is loaded. */ @@ -8327,6 +8567,20 @@ static int rocksdb_init_internal(void *const p) { tx_db_options.write_policy = static_cast(rocksdb_write_policy); + if (rocksdb_use_range_locking && rocksdb_use_range_lock_manager_as_point) { + rdb_log_status_error( + status, + "Can't enable both range_locking and range_lock_manager_as_point"); + DBUG_RETURN(HA_EXIT_FAILURE); + } + + if (rocksdb_use_range_locking || rocksdb_use_range_lock_manager_as_point) { + range_lock_mgr.reset( + rocksdb::NewRangeLockManager(tx_db_options.custom_mutex_factory)); + tx_db_options.lock_mgr_handle = range_lock_mgr; + range_lock_mgr->SetEscalationBarrierFunc(rocksdb_escalation_barrier); + } + status = check_rocksdb_options_compatibility(rocksdb_datadir, main_opts, cf_descr); @@ -8376,6 +8630,15 @@ static int rocksdb_init_internal(void *const p) { LogPluginErrMsg(INFORMATION_LEVEL, ER_LOG_PRINTF_MSG, "...done"); } + if (rocksdb_use_range_locking) { + range_lock_mgr->SetMaxLockMemory(rocksdb_max_lock_memory); + sql_print_information("RocksDB: using range locking\n"); + sql_print_information("RocksDB: maximum lock memory = %llu", + rocksdb_max_lock_memory); + } else { + sql_print_information("RocksDB: using point locking"); + } + // NO_LINT_DEBUG LogPluginErrMsg(INFORMATION_LEVEL, ER_LOG_PRINTF_MSG, "RocksDB:Init column families..."); @@ 
-9515,6 +9778,7 @@ int ha_rocksdb::open(const char *const name, } m_lock_rows = RDB_LOCK_NONE; + assert(!m_use_range_locking); m_locked_row_action = THR_WAIT; m_key_descr_arr = m_tbl_def->m_key_descr_arr; @@ -11279,6 +11543,10 @@ int ha_rocksdb::index_read_intern(uchar *const buf, const uchar *const key, Rdb_transaction *const tx = get_or_create_tx(table->in_use, m_tbl_def->get_table_type()); + + if ((rc = set_range_lock(*tx, kd, find_flag, slice, end_slice, end_range))) + DBUG_RETURN(rc); + const bool is_new_snapshot = !tx->has_snapshot(m_tbl_def->get_table_type()); // Loop as long as we get a deadlock error AND we end up creating the @@ -11327,6 +11595,196 @@ int ha_rocksdb::index_read_intern(uchar *const buf, const uchar *const key, DBUG_RETURN(rc); } +/** + @brief + Compute the range lock endpoints and set the range lock, if necessary + + @param use_locking_iter OUT If true, locks are not set and LockingIterator + should be used instead + + @detail + If the scanned range doesn't have the endpoint we're scanning towards, + don't set the lock, it will be too coarse. Indicate that LockingIterator + should be used, instead. + + == RangeFlagsShouldBeFlippedForRevCF == + When using reverse column families, the value of Endpoint::inf_suffix has + the reverse meaning. + + Let's consider a forward-ordered CF and some keys and endpoints in it: + + key=a, inf_suffix=false + key=ab + key=az + key=a, inf_suffix=true + + Now, let's put the same data and endpoints into a reverse-ordered CF. The + physical order of the data will be the reverse of the above: + + key=a, inf_suffix=true + key=az + key=ab + key=a, inf_suffix=false + + Note that inf_suffix=false comes *before* any values with the same prefix. + And inf_suffix=true comes *after* all values with the same prefix. + + The Endpoint comparison function in RocksDB doesn't "know" if the CF is + reverse-ordered or not. 
It uses the Key Comparator for key values, and + then it assumes that Endpoint(key=$VAL, inf_suffix=false) comes before + the row with key=$VAL. + + The only way to achieve the required ordering is to flip the endpoint + flag value before passing class Endpoint to RocksDB. This function does + it before the lock_range() call. + + @return + 0 Ok + Other Error acquiring the lock (wait timeout, deadlock, etc) +*/ + +int ha_rocksdb::set_range_lock(Rdb_transaction &tx, const Rdb_key_def &kd, + enum ha_rkey_function find_flag, + rocksdb::Slice slice_arg, + rocksdb::Slice end_slice_arg, + const key_range *const end_key) { + if (!m_use_range_locking) { + return 0; + } + + assert(m_lock_rows != RDB_LOCK_NONE); + assert(m_tbl_def->get_table_type() == TABLE_TYPE::USER_TABLE); + + bool start_has_inf_suffix = false, end_has_inf_suffix = false; + rocksdb::Slice slice(slice_arg); + rocksdb::Slice end_slice(end_slice_arg); + bool use_locking_iterator = false; + + /* + The 'slice' has the left endpoint of the range to lock. + Figure out the right endpoint. + */ + + if (find_flag == HA_READ_KEY_EXACT || find_flag == HA_READ_PREFIX_LAST) { + if (slice.size() == Rdb_key_def::INDEX_NUMBER_SIZE) { + // This is a full table/index scan + // (in case of HA_READ_PREFIX_LAST, a reverse-ordered one) + use_locking_iterator = true; + } else { + /* + HA_READ_KEY_EXACT: + This is "key_part= const" interval. We need to lock this range: + (lookup_value, -inf) < key < (lookup_value, +inf) + HA_READ_PREFIX_LAST: + We get here for queries like: + + select * from t1 where pk1=const order by pk1 desc for update + + assuming this uses an index on (pk1, ...). + We get end_key=nullptr. + */ + end_has_inf_suffix = true; + end_slice = slice; + } + } else if (find_flag == HA_READ_PREFIX_LAST_OR_PREV) { + /* + We get here for queries like: + + select * from t1 where pk1=const1 and pk2 between const2 and const3 + order by pk1 desc + for update + + assuming this uses an index on (pk1, pk2). 
+ The slice has the right endpoint: {const1, const3} + the end_key has the left endpoint: {const1, const2}. + */ + + std::swap(slice, end_slice); + end_has_inf_suffix = true; + } else if (find_flag == HA_READ_BEFORE_KEY) { + /* + We get here for queries like + select * from t1 + where pk <1007 order by pk desc limit 2 for update + select * from t1 + where pk >=800 and pk <1007 order by pk desc limit 2 for update + */ + + if (end_key) { + std::swap(slice, end_slice); + // end_has_inf_suffix is false, because we're looking keyflag == HA_READ_AFTER_KEY) { + // this is "key_part <= const". + end_has_inf_suffix = true; + } else if (end_key->flag == HA_READ_BEFORE_KEY) { + // this is "key_part < const", non-inclusive. + } else { + // Unknown type of range, shouldn't happen + assert(0); + } + } else { + assert(find_flag == HA_READ_AFTER_KEY || find_flag == HA_READ_KEY_OR_NEXT); + + use_locking_iterator = true; + } + + /* + RocksDB's iterator is reading the snapshot of the data that was taken at + the time the iterator was created. + + After we've got a lock on the range, we'll need to refresh the iterator + to read the latest contents. (If we use the iterator created before the + lock_range() call, we may miss the changes that were made/committed after + the iterator was created but before the lock_range() call was made). + + RocksDB has Iterator::Refresh() method, but alas, it is not implemented for + the iterator returned by Transaction object (Transaction object returns + BaseDeltaIterator which allows one to see the transactions's own changes). + + Our solution to this is to release the iterator and create the new one. + We release it here, it will be created as soon as there's a need to read + records. 
+ */ + m_iterator->reset(use_locking_iterator); + + if (use_locking_iterator) { + return 0; + } + + rocksdb::Endpoint start_endp; + rocksdb::Endpoint end_endp; + + if (kd.m_is_reverse_cf) { + // Flip the endpoint flag values, as explained in the + // RangeFlagsShouldBeFlippedForRevCF comment above. + start_endp = rocksdb::Endpoint(end_slice, !end_has_inf_suffix); + end_endp = rocksdb::Endpoint(slice, !start_has_inf_suffix); + } else { + start_endp = rocksdb::Endpoint(slice, start_has_inf_suffix); + end_endp = rocksdb::Endpoint(end_slice, end_has_inf_suffix); + } + + const auto s = tx.lock_range(kd.get_cf(), start_endp, end_endp); + if (!s.ok()) { + return tx.set_status_error(table->in_use, s, kd, m_tbl_def, + m_table_handler); + } + return 0; +} + /* See storage/rocksdb/rocksdb-range-access.txt for description of how MySQL index navigation commands are converted into RocksDB lookup commands. @@ -11886,7 +12344,7 @@ int ha_rocksdb::index_next_with_direction_intern(uchar *const buf, } } - if (rc) { + if (rc != HA_EXIT_SUCCESS) { break; } @@ -11897,7 +12355,7 @@ int ha_rocksdb::index_next_with_direction_intern(uchar *const buf, table->m_status = 0; rc = 0; } else if (active_index == table->s->primary_key) { - if (m_lock_rows != RDB_LOCK_NONE) { + if (m_lock_rows != RDB_LOCK_NONE && !m_use_range_locking) { DEBUG_SYNC(ha_thd(), "rocksdb_concurrent_delete"); /* We need to put a lock and re-read */ bool skip_row = false; @@ -12103,7 +12561,9 @@ void ha_rocksdb::unlock_row() { DBUG_VOID_RETURN; } - if (m_lock_rows != RDB_LOCK_NONE) { + // Don't release the lock when using range locking. + // This breaks m_row_lock_count + if (m_lock_rows != RDB_LOCK_NONE && !m_use_range_locking) { Rdb_transaction *const tx = get_or_create_tx(table->in_use, m_tbl_def->get_table_type()); tx->release_lock(*m_pk_descr, @@ -12671,10 +13131,17 @@ int ha_rocksdb::check_and_lock_sk( /* If there are no uniqueness requirements, there's no need to obtain a - lock for this key. 
+ lock for this key unless we use range locking. */ if (!(key_info->flags & HA_NOSAME)) { - return HA_EXIT_SUCCESS; + // FIXME(laurynas): rocksdb_use_range_locking might be too broad for temp + // tables, DD tables, etc. Adjust asserts in check_and_lock_non_unique_sk + // too. + // m_use_range_locking is not used here because RC writes still need to be + // blocked by RR range locks + return rocksdb_use_range_locking && m_lock_rows != RDB_LOCK_NONE + ? check_and_lock_non_unique_sk(kd, row_info) + : HA_EXIT_SUCCESS; } /* @@ -12796,6 +13263,47 @@ int ha_rocksdb::check_and_lock_sk( return rc; } +/** + @brief + Lock the non-unique sk for range locking +*/ +int ha_rocksdb::check_and_lock_non_unique_sk(const Rdb_key_def &kd, + const update_row_info &row_info) { + assert(m_lock_rows != RDB_LOCK_NONE); + assert(rocksdb_use_range_locking); + + if (row_info.old_data != nullptr) { + const auto old_packed_size = kd.pack_record( + table, m_pack_buffer, row_info.old_data, m_sk_packed_tuple_old, nullptr, + should_store_row_debug_checksums(), row_info.hidden_pk_id, 0, nullptr, + m_ttl_bytes); + const auto old_key_slice = rocksdb::Slice( + reinterpret_cast(m_sk_packed_tuple_old), old_packed_size); + + const auto s = + row_info.tx->lock_singlepoint_range(kd.get_cf(), old_key_slice); + if (!s.ok()) { + return row_info.tx->set_status_error(table->in_use, s, kd, m_tbl_def, + m_table_handler); + } + } + + const auto new_packed_size = kd.pack_record( + table, m_pack_buffer, row_info.new_data, m_sk_packed_tuple, nullptr, 0, + row_info.hidden_pk_id, 0, nullptr, m_ttl_bytes); + const auto new_key_slice = rocksdb::Slice( + reinterpret_cast(m_sk_packed_tuple), new_packed_size); + + const auto s = + row_info.tx->lock_singlepoint_range(kd.get_cf(), new_key_slice); + if (!s.ok()) { + return (row_info.tx->set_status_error(table->in_use, s, kd, m_tbl_def, + m_table_handler)); + } + + return HA_EXIT_SUCCESS; +} + /** Enumerate all keys to check their uniquess and also lock it @@ -13916,6 +14424,20 
@@ int ha_rocksdb::delete_row(const uchar *const buf) { nullptr, false, hidden_pk_id); rocksdb::Slice secondary_key_slice( reinterpret_cast(m_sk_packed_tuple), packed_size); + + /* + For point locking, deleting on secondary key doesn't need any locks. + Range locking must get a lock. + */ + if (rocksdb_use_range_locking) { + const auto s = + tx->lock_singlepoint_range(kd.get_cf(), secondary_key_slice); + if (!s.ok()) { + DBUG_RETURN(tx->set_status_error(table->in_use, s, kd, m_tbl_def, + m_table_handler)); + } + } + s = tx->get_indexed_write_batch(m_tbl_def->get_table_type()) ->SingleDelete(&kd.get_cf(), secondary_key_slice); if (!s.ok()) { @@ -14626,8 +15148,12 @@ int ha_rocksdb::external_lock(THD *const thd, int lock_type) { } } tx->m_n_mysql_tables_in_use++; - rocksdb_register_tx(rocksdb_hton, thd, tx); + rocksdb_register_tx(rocksdb_hton, thd, tx, (m_lock_rows != RDB_LOCK_NONE)); tx->io_perf_start(&m_io_perf); + + m_use_range_locking = rocksdb_use_range_locking && + m_lock_rows != RDB_LOCK_NONE && + isolation >= ISO_REPEATABLE_READ; } DBUG_RETURN(res); @@ -14655,7 +15181,7 @@ int ha_rocksdb::start_stmt(THD *const thd, Rdb_transaction *const tx = get_or_create_tx(thd, m_tbl_def->get_table_type()); read_thd_vars(thd); - rocksdb_register_tx(ht, thd, tx); + rocksdb_register_tx(ht, thd, tx, (m_lock_rows != RDB_LOCK_NONE)); tx->io_perf_start(&m_io_perf); DBUG_RETURN(HA_EXIT_SUCCESS); @@ -17624,6 +18150,39 @@ static int show_rocksdb_stall_vars(THD *thd MY_ATTRIBUTE((unused)), return 0; } +// +// Lock Tree Status variables +// +static longlong rocksdb_locktree_escalation_count = 0; +static longlong rocksdb_locktree_current_lock_memory = 0; +static longlong rocksdb_locktree_lock_wait_count = 0; + +static SHOW_VAR rocksdb_locktree_status_variables[] = { + DEF_STATUS_VAR_FUNC("escalation_count", &rocksdb_locktree_escalation_count, + SHOW_LONGLONG), + DEF_STATUS_VAR_FUNC("current_lock_memory", + &rocksdb_locktree_current_lock_memory, SHOW_LONGLONG), + 
DEF_STATUS_VAR_FUNC("lock_wait_count", &rocksdb_locktree_lock_wait_count, + SHOW_LONGLONG), + // end of the array marker + {NullS, NullS, SHOW_LONG, SHOW_SCOPE_GLOBAL}}; + +static SHOW_VAR rocksdb_empty_status_variables[] = { + {NullS, NullS, SHOW_LONG, SHOW_SCOPE_GLOBAL}}; + +static void show_rocksdb_locktree_vars(THD *, SHOW_VAR *var, char *) { + var->type = SHOW_ARRAY; + if (range_lock_mgr) { + const auto status = range_lock_mgr->GetStatus(); + rocksdb_locktree_escalation_count = status.escalation_count; + rocksdb_locktree_current_lock_memory = status.current_lock_memory; + rocksdb_locktree_lock_wait_count = status.lock_wait_count; + var->value = reinterpret_cast(&rocksdb_locktree_status_variables); + } else { + var->value = reinterpret_cast(&rocksdb_empty_status_variables); + } +} + static SHOW_VAR rocksdb_status_vars[] = { DEF_STATUS_VAR(block_cache_miss), DEF_STATUS_VAR(block_cache_hit), @@ -17763,6 +18322,8 @@ static SHOW_VAR rocksdb_status_vars[] = { SHOW_SCOPE_GLOBAL}, {"rocksdb_stall", reinterpret_cast(&show_rocksdb_stall_vars), SHOW_FUNC, SHOW_SCOPE_GLOBAL}, + {"rocksdb_locktree", reinterpret_cast(show_rocksdb_locktree_vars), + SHOW_FUNC, SHOW_SCOPE_GLOBAL}, {NullS, NullS, SHOW_LONG, SHOW_SCOPE_GLOBAL}}; /* @@ -18761,6 +19322,37 @@ static void rocksdb_set_delayed_write_rate( RDB_MUTEX_UNLOCK_CHECK(rdb_sysvars_mutex); } +static void rocksdb_set_max_lock_memory(THD *thd, struct SYS_VAR *, void *, + const void *save) { + const auto new_val = *static_cast(save); + + if (rocksdb_max_lock_memory == new_val) return; + + const auto ret = range_lock_mgr->SetMaxLockMemory(new_val); + switch (ret) { + case 0: + // Succeeded + rocksdb_max_lock_memory = new_val; + return; + case EDOM: + // FIXME(laurynas): test + sql_print_warning( + "MyRocks: cannot set rocksdb_max_lock_memory to a lower value"); + push_warning_printf( + thd, Sql_condition::SL_WARNING, ER_ERROR_WHEN_EXECUTING_COMMAND, + "Cannot set max_lock_memory to size below currently used"); + return; + default: 
+ assert(0); + sql_print_warning( + "MyRocks: failed to set rocksdb_max_lock_memory, unknown error"); + push_warning_printf( + thd, Sql_condition::SL_WARNING, ER_ERROR_WHEN_EXECUTING_COMMAND, + "Failed to set rocksdb_max_lock_memory, unknown error"); + return; + } +} + static void rocksdb_set_max_latest_deadlocks( THD *thd MY_ATTRIBUTE((unused)), struct SYS_VAR *var MY_ATTRIBUTE((unused)), void *var_ptr MY_ATTRIBUTE((unused)), const void *save) { @@ -18768,7 +19360,12 @@ static void rocksdb_set_max_latest_deadlocks( const uint32_t new_val = *static_cast(save); if (rocksdb_max_latest_deadlocks != new_val) { rocksdb_max_latest_deadlocks = new_val; - rdb->SetDeadlockInfoBufferSize(rocksdb_max_latest_deadlocks); + if (range_lock_mgr) { + range_lock_mgr->SetRangeDeadlockInfoBufferSize( + rocksdb_max_latest_deadlocks); + } else { + rdb->SetDeadlockInfoBufferSize(rocksdb_max_latest_deadlocks); + } } RDB_MUTEX_UNLOCK_CHECK(rdb_sysvars_mutex); } @@ -19512,11 +20109,11 @@ const rocksdb::ReadOptions &rdb_tx_acquire_snapshot(Rdb_transaction *tx) { } std::unique_ptr rdb_tx_get_iterator( - THD *thd, rocksdb::ColumnFamilyHandle &cf, bool skip_bloom_filter, - const rocksdb::Slice &eq_cond_lower_bound, - const rocksdb::Slice &eq_cond_upper_bound, - const rocksdb::Snapshot **snapshot, TABLE_TYPE table_type, - bool read_current, bool create_snapshot) { + THD *thd, rocksdb::ColumnFamilyHandle &cf, const Rdb_key_def &kd, + bool skip_bloom_filter, const rocksdb::Slice &eq_cond_lower_bound, + const rocksdb::Slice &eq_cond_upper_bound, const rocksdb::Snapshot **snapshot, + TABLE_TYPE table_type, bool read_current, bool create_snapshot, + bool use_locking_iter) { if (commit_in_the_middle(thd)) { assert(snapshot && *snapshot == nullptr); if (snapshot) { @@ -19532,9 +20129,9 @@ std::unique_ptr rdb_tx_get_iterator( } } else { Rdb_transaction *tx = get_tx_from_thd(thd); - return tx->get_iterator(cf, skip_bloom_filter, eq_cond_lower_bound, + return tx->get_iterator(cf, kd, skip_bloom_filter, 
eq_cond_lower_bound, eq_cond_upper_bound, table_type, read_current, - create_snapshot); + create_snapshot, use_locking_iter); } } @@ -19573,6 +20170,12 @@ void rdb_tx_release_lock(Rdb_transaction *tx, const Rdb_key_def &kd, tx->release_lock(kd, std::string(key.data(), key.size()), force); } +rocksdb::Status rdb_tx_lock_range(Rdb_transaction &tx, const Rdb_key_def &kd, + const rocksdb::Endpoint &start_key, + const rocksdb::Endpoint &end_key) { + return tx.lock_range(kd.get_cf(), start_key, end_key); +} + void rdb_tx_multi_get(Rdb_transaction *tx, rocksdb::ColumnFamilyHandle &column_family, const size_t num_keys, const rocksdb::Slice *keys, diff --git a/storage/rocksdb/ha_rocksdb.h b/storage/rocksdb/ha_rocksdb.h index e5c522eeb166..d15350642a84 100644 --- a/storage/rocksdb/ha_rocksdb.h +++ b/storage/rocksdb/ha_rocksdb.h @@ -39,6 +39,7 @@ /* RocksDB header files */ #include "rocksdb/merge_operator.h" +#include "rocksdb/utilities/transaction.h" #include "rocksdb/utilities/write_batch_with_index.h" /* MyRocks header files */ @@ -60,6 +61,18 @@ refinements (@see /storage/rocksdb/README file). */ +// Forward declarations + +#ifdef ROCKSDB_CUSTOM_NAMESPACE +namespace ROCKSDB_CUSTOM_NAMESPACE { +#else +namespace rocksdb { +#endif + +class RangeLockManagerHandle; + +} // namespace ROCKSDB_CUSTOM_NAMESPACE / rocksdb + namespace myrocks { class Rdb_converter; @@ -78,6 +91,9 @@ extern PSI_rwlock_key key_rwlock_read_free_rpl_tables; #endif extern Regex_list_handler rdb_read_free_regex_handler; static bool rocksdb_column_default_value_as_expression = true; + +extern bool rocksdb_use_range_locking; + /** @brief Rdb_table_handler is a reference-counted structure storing information for @@ -282,6 +298,8 @@ class ha_rocksdb : public my_core::handler, public blob_buffer { /* Type of locking to apply to rows */ Rdb_lock_type m_lock_rows; + bool m_use_range_locking{false}; + thr_locked_row_action m_locked_row_action; /* true means we're doing an index-only read. false means otherwise. 
*/ @@ -366,6 +384,12 @@ class ha_rocksdb : public my_core::handler, public blob_buffer { int fill_virtual_columns(); + [[nodiscard]] int set_range_lock(Rdb_transaction &tx, const Rdb_key_def &kd, + enum ha_rkey_function find_flag, + rocksdb::Slice slice_arg, + rocksdb::Slice end_slice_arg, + const key_range *const end_key); + int get_row_by_rowid(uchar *const buf, const char *const rowid, const uint rowid_size, bool *skip_row = nullptr, const bool skip_lookup = false, @@ -844,6 +868,8 @@ class ha_rocksdb : public my_core::handler, public blob_buffer { [[nodiscard]] int check_and_lock_sk(const uint key_id, const struct update_row_info &row_info, bool *const found); + [[nodiscard]] int check_and_lock_non_unique_sk( + const Rdb_key_def &kd, const update_row_info &row_info); [[nodiscard]] int check_uniqueness_and_lock( const struct update_row_info &row_info, bool pk_changed); bool over_bulk_load_threshold(int *err) @@ -1213,11 +1239,12 @@ void remove_tmp_table_handler(THD *const thd, ha_rocksdb *rocksdb_handler); const rocksdb::ReadOptions &rdb_tx_acquire_snapshot(Rdb_transaction *tx); [[nodiscard]] std::unique_ptr rdb_tx_get_iterator( - THD *thd, rocksdb::ColumnFamilyHandle &cf, bool skip_bloom_filter, - const rocksdb::Slice &eq_cond_lower_bound, + THD *thd, rocksdb::ColumnFamilyHandle &cf, const Rdb_key_def &kd, + bool skip_bloom_filter, const rocksdb::Slice &eq_cond_lower_bound, const rocksdb::Slice &eq_cond_upper_bound, const rocksdb::Snapshot **snapshot, TABLE_TYPE table_type, - bool read_current = false, bool create_snapshot = true); + bool read_current = false, bool create_snapshot = true, + bool use_locking_iter = false); [[nodiscard]] rocksdb::Status rdb_tx_get( Rdb_transaction *tx, rocksdb::ColumnFamilyHandle &column_family, @@ -1231,6 +1258,10 @@ rocksdb::Status rdb_tx_get_for_update(Rdb_transaction *tx, TABLE_TYPE table_type, bool exclusive, bool skip_wait); +[[nodiscard]] rocksdb::Status rdb_tx_lock_range( + Rdb_transaction &tx, const Rdb_key_def &kd, + 
const rocksdb::Endpoint &start_key, const rocksdb::Endpoint &end_key); + void rdb_tx_release_lock(Rdb_transaction *tx, const Rdb_key_def &kd, const rocksdb::Slice &key, bool force); @@ -1309,6 +1340,8 @@ extern uint rocksdb_clone_checkpoint_max_count; extern unsigned long long rocksdb_converter_record_cached_length; +extern std::shared_ptr range_lock_mgr; + [[nodiscard]] inline bool is_wal_dir_separate() noexcept { return rocksdb_wal_dir && *rocksdb_wal_dir && // Prefer cheapness over accuracy by doing lexicographic path diff --git a/storage/rocksdb/locking-iterator-partial-index.txt b/storage/rocksdb/locking-iterator-partial-index.txt new file mode 100644 index 000000000000..d3ffb41d8f00 --- /dev/null +++ b/storage/rocksdb/locking-iterator-partial-index.txt @@ -0,0 +1,120 @@ + +This document describes how Locking Reads are implemented with partial index +iterator (Rdb_iterator_partial) + +== Closed ranges == +ha_rocksdb will make the lock_range() call before making any reads. + +We just need to make sure that after that, the iterator reads the latest +committed data (not the data from the read snapshot). + +== Open ranges and LockingIterator == + +With Open Ranges, regular indexes use LockingIterator. + +How does one make Rdb_iterator_partial use the LockingIterator? + +=== Background info: Rdb_iterator_Partial === + +Partial iterator is used for partial secondary keys. + +PK and SK share the same key prefix (see 'partial_group_keyparts=N' parameter). + +For a given value of key prefix: + +* if the number of rows exceeds 'partial_group_threshold' parameter, one reads + the secondary index, as usual (the SK is "materialized" for this prefix) + +* if the number of rows is less than 'partial_group_threshold', then the SK is + "not materialized" and has no rows for the prefix. The rows are obtained by + reading the records using PK, buffering+sorting them in memory, and then + returning them. 
+ +* Switching from non-materialized to materialized is done "lazily" on-read. + The write locks the group prefix in SK (see + Rdb_iterator_partial::materialize_prefix()) + +* Update/Delete operations also lock the group prefix in SK (see + ha_rocksdb::acquire_prefix_lock) + +* De-materialization is currently not performed. + +=== Regular reads with Rdb_iterator_partial === + +Regular (that is, non locking) reads are done as follows: + +Step #1: first, we need to figure out the key prefix we're going to read. +There are two possibilities: + +A. It can be inferred from the lookup key value. +B. It cannot be inferred (e.g. we scan from the start of the table). + In this case, we read from the PK to find out the first key prefix. + +See Rdb_iterator_partial::get_prefix_from_start(). + +Step #2 is to read rows within this prefix. + +We first try to read through the SK. If it has a row within the prefix, it +means this prefix is materialized and we continue to read from the SK within +the bounds of the key prefix. + +If the SK has no data we read all rows through the PK, buffer, sort them, and +return. (See Rdb_iterator_partial::read_prefix_from_pk()) + +Step #3: +When we have exhausted rows in this key prefix, we check if we need to continue +the scan. +If we do, we take the current prefix value and try to get the next row after +it using the approach in Step #1. See Rdb_iterator_partial::get_next_prefix(). + +=== Locking Reads with Rdb_iterator_partial === + +This section describes how one can perform locking reads for Steps 1-3 from the +previous section. + +Step #1: +for Case A, there's no need to lock anything. + +for case B, we need to lock the range from the lookup key to the start of +the first key prefix. This can be achieved by making m_iterator_pk use +a LockingIterator. + +(PK iterator uses a prefix of the lookup key, so we may end up with a +coarser-grained lock, but this is still correct). 
+ +Step #2: + +Quoting the previous section: + +> We first try to read through the SK. +> If it has a row within the prefix, it means this prefix is materialized and we +> continue to read from the SK within the bounds of the key prefix. + +If we use a LockingIterator for scanning the SK, we are doing a locking read +and have achieved our goal. + +> If the SK has no data we read all rows through the PK + +Suppose we use a LockingIterator to try read through the SK. The read is done +with this call in Rdb_iterator_partial::seek_next_prefix: + + rc = Rdb_iterator_base::seek( + direction ? HA_READ_KEY_EXACT : HA_READ_PREFIX_LAST, cur_prefix_key, + false, empty_end_key); + +If the SK has no data for the cur_prefix_key, the LockingIterator will lock the +range before returning from this call. +Note that it will lock just this key prefix: this is achieved by use of iterator +bounds. + +If there is a Materialization operation in progress, the locking read will +eventually come into conflict with it. To avoid hitting the conflict at the +end, Rdb_iterator_partial::materialize_prefix() is made to acquire a lock +on the whole prefix when Range Locking is used. + +Step #3: The same approach as in step #1. If we use LockingIterator for the PK +it will correctly lock the gap between the prefixes (or just the gap after the +last prefix if there are no further prefixes). + +This way, one can provide LockingIterator semantics with Rdb_iterator_partial. 
+ diff --git a/storage/rocksdb/nosql_access.cc b/storage/rocksdb/nosql_access.cc index 876579d275f0..60922369abeb 100644 --- a/storage/rocksdb/nosql_access.cc +++ b/storage/rocksdb/nosql_access.cc @@ -1506,9 +1506,9 @@ class select_exec { } [[nodiscard]] std::unique_ptr get_iterator( - rocksdb::ColumnFamilyHandle &cf, bool use_bloom, + rocksdb::ColumnFamilyHandle &cf, const Rdb_key_def &kd, bool use_bloom, const rocksdb::Slice &lower_bound, const rocksdb::Slice &upper_bound) { - return rdb_tx_get_iterator(m_thd, cf, !use_bloom, lower_bound, + return rdb_tx_get_iterator(m_thd, cf, kd, !use_bloom, lower_bound, upper_bound, nullptr, m_table_type); } diff --git a/storage/rocksdb/rdb_i_s.cc b/storage/rocksdb/rdb_i_s.cc index 11ebe72a5916..695171d11c34 100644 --- a/storage/rocksdb/rdb_i_s.cc +++ b/storage/rocksdb/rdb_i_s.cc @@ -2182,6 +2182,60 @@ static ST_FIELD_INFO rdb_i_s_lock_info_fields_info[] = { ROCKSDB_FIELD_INFO("MODE", 32, MYSQL_TYPE_STRING, 0), ROCKSDB_FIELD_INFO_END}; +// Dump the locked key (or range) into a string. +template +std::string dump_key(const LockInfo &info); + +// Specialization for point lock manager. +template <> +std::string dump_key(const rocksdb::KeyLockInfo &info) { + return rdb_hexdump(info.key.c_str(), info.key.length()); +} + +// Specialization for Range Lock manager. +template <> +std::string dump_key( + const rocksdb::RangeLockInfo &info) { + return rdb_hexdump_range(info.start, info.end); +} + +// +// A template that walks the Lock info data structure and dumps its contents. 
+// +template +int dump_locks(my_core::THD *thd, my_core::TABLE *table, + const LockInfo &lock_info) { + for (const auto &lock : lock_info) { + const auto cf_id = lock.first; + const auto &one_lock_info = lock.second; + const auto key_hexstr = dump_key(one_lock_info); + + for (const auto &id : one_lock_info.ids) { + table->field[RDB_LOCKS_FIELD::COLUMN_FAMILY_ID]->store(cf_id, true); + table->field[RDB_LOCKS_FIELD::TRANSACTION_ID]->store(id, true); + + table->field[RDB_LOCKS_FIELD::KEY]->store( + key_hexstr.data(), + // If the key range is too long, truncation will happen here instead + // of stopping the dump in rdb_hexdump_range / rdb_hexdump. Do that + // there if that becomes a problem. + std::min(key_hexstr.size(), FN_REFLEN), + system_charset_info); + table->field[RDB_LOCKS_FIELD::MODE]->store( + one_lock_info.exclusive ? "X" : "S", 1, system_charset_info); + + /* Tell MySQL about this row in the virtual table */ + const auto ret = + static_cast(my_core::schema_table_store_record(thd, table)); + + if (ret != 0) { + return ret; + } + } + } + return 0; +} + /* Fill the information_schema.rocksdb_locks virtual table */ static int rdb_i_s_lock_info_fill_table( my_core::THD *const thd, my_core::Table_ref *const tables, @@ -2201,36 +2255,13 @@ static int rdb_i_s_lock_info_fill_table( DBUG_RETURN(ret); } - /* cf id -> rocksdb::KeyLockInfo */ - std::unordered_multimap lock_info = - rdb->GetLockStatusData(); - - for (const auto &lock : lock_info) { - const uint32_t cf_id = lock.first; - const auto &key_lock_info = lock.second; - const auto key_hexstr = - rdb_hexdump(key_lock_info.key.data(), key_lock_info.key.length()); - - for (const auto &id : key_lock_info.ids) { - tables->table->field[RDB_LOCKS_FIELD::COLUMN_FAMILY_ID]->store(cf_id, - true); - tables->table->field[RDB_LOCKS_FIELD::TRANSACTION_ID]->store(id, true); - - tables->table->field[RDB_LOCKS_FIELD::KEY]->store( - key_hexstr.c_str(), key_hexstr.size(), system_charset_info); - 
tables->table->field[RDB_LOCKS_FIELD::MODE]->store( - key_lock_info.exclusive ? "X" : "S", 1, system_charset_info); - - /* Tell MySQL about this row in the virtual table */ - ret = static_cast( - my_core::schema_table_store_record(thd, tables->table)); - - if (ret != 0) { - break; - } - } + if (range_lock_mgr) { + const auto lock_info = range_lock_mgr->GetRangeLockStatusData(); + ret = dump_locks(thd, tables->table, lock_info); + } else { + const auto lock_info = rdb->GetLockStatusData(); + ret = dump_locks(thd, tables->table, lock_info); } - DBUG_RETURN(ret); } diff --git a/storage/rocksdb/rdb_iterator.cc b/storage/rocksdb/rdb_iterator.cc index d9393e9eed16..e5e66f013e2c 100644 --- a/storage/rocksdb/rdb_iterator.cc +++ b/storage/rocksdb/rdb_iterator.cc @@ -199,9 +199,9 @@ void Rdb_iterator_base::setup_scan_iterator( */ if (!m_scan_it) { m_scan_it = rdb_tx_get_iterator( - m_thd, m_kd.get_cf(), skip_bloom, m_scan_it_lower_bound_slice, + m_thd, m_kd.get_cf(), m_kd, skip_bloom, m_scan_it_lower_bound_slice, m_scan_it_upper_bound_slice, &m_scan_it_snapshot, m_table_type, - read_current, !read_current); + read_current, !read_current, m_scan_it_is_locking); m_scan_it_skips_bloom = skip_bloom; } } @@ -712,6 +712,8 @@ int Rdb_iterator_partial::seek_next_prefix(bool direction) { * group. */ int Rdb_iterator_partial::materialize_prefix() { + assert(m_table_type == TABLE_TYPE::USER_TABLE); + uint tmp; int rc = HA_EXIT_SUCCESS; Rdb_transaction *const tx = get_tx_from_thd(m_thd); @@ -724,9 +726,17 @@ int Rdb_iterator_partial::materialize_prefix() { // It is possible that someone else has already materialized this group // before we locked. Double check by doing a locking read on the sentinel. 
- rocksdb::PinnableSlice value; - auto s = rdb_tx_get_for_update(tx, m_kd, cur_prefix_key, &value, m_table_type, - true, false); + rocksdb::Status s; + if (rocksdb_use_range_locking) { + rocksdb::Endpoint start_endp(cur_prefix_key, false); + rocksdb::Endpoint end_endp(cur_prefix_key, true); + s = rdb_tx_lock_range(*tx, m_kd, start_endp, end_endp); + } else { + rocksdb::PinnableSlice value; + s = rdb_tx_get_for_update(tx, m_kd, cur_prefix_key, &value, m_table_type, + true, false); + } + if (s.ok()) { rdb_tx_release_lock(tx, m_kd, cur_prefix_key, true /* force */); thd_proc_info(m_thd, old_proc_info); @@ -929,7 +939,7 @@ int Rdb_iterator_partial::seek(enum ha_rkey_function find_flag, return HA_ERR_INTERNAL_ERROR; } - reset(); + reset(m_scan_it_is_locking); Rdb_iterator_base::setup_prefix_buffer(find_flag, start_key); bool direction = (find_flag == HA_READ_KEY_EXACT) || @@ -947,6 +957,15 @@ int Rdb_iterator_partial::seek(enum ha_rkey_function find_flag, m_kd.get_infimum_key(m_cur_prefix_key, &tmp); rocksdb::PinnableSlice value; + // + // Range Locking note: When using a locking iterator (i.e. + // m_scan_it_is_locking=true), this will read (and lock) + // the value space up to the next prefix. + // If the next prefix is not materialized, it will lock the whole + // prefix in the secondary key. It will not lock more than that, + // because the iterator uses the iterator bounds to limit the scan + // to the prefix specified.
+ // rc = Rdb_iterator_base::get(&cur_prefix_key, &value, RDB_LOCK_NONE, true /* skip ttl check*/); @@ -1179,14 +1198,14 @@ int Rdb_iterator_partial::prev() { return rc; } -void Rdb_iterator_partial::reset() { +void Rdb_iterator_partial::reset(bool become_locking) { m_partial_valid = false; m_materialized = false; m_mem_root.ClearForReuse(); m_iterator_pk_position = Iterator_position::UNKNOWN; m_records.clear(); m_iterator_pk.reset(); - Rdb_iterator_base::reset(); + Rdb_iterator_base::reset(become_locking); } rocksdb::Slice Rdb_iterator_partial::key() { diff --git a/storage/rocksdb/rdb_iterator.h b/storage/rocksdb/rdb_iterator.h index f7f7ca61bf8a..43aacfb06dc2 100644 --- a/storage/rocksdb/rdb_iterator.h +++ b/storage/rocksdb/rdb_iterator.h @@ -77,7 +77,7 @@ class Rdb_iterator { virtual int prev() = 0; virtual rocksdb::Slice key() = 0; virtual rocksdb::Slice value() = 0; - virtual void reset() = 0; + virtual void reset(bool become_locking = false) = 0; virtual bool is_valid() = 0; }; @@ -127,9 +127,12 @@ class Rdb_iterator_base : public Rdb_iterator { rocksdb::Slice value() override { return m_scan_it->value(); } - void reset() override { + void reset(bool become_locking = false) override { + assert(!become_locking || m_table_type == TABLE_TYPE::USER_TABLE); + release_scan_iterator(); m_valid = false; + m_scan_it_is_locking = become_locking; } bool is_valid() override { return m_valid; } @@ -155,6 +158,8 @@ class Rdb_iterator_base : public Rdb_iterator { /* Iterator used for range scans and for full table/index scans */ std::unique_ptr m_scan_it; + bool m_scan_it_is_locking{false}; + /* Whether m_scan_it was created with skip_bloom=true */ bool m_scan_it_skips_bloom; @@ -179,6 +184,11 @@ class Rdb_iterator_base : public Rdb_iterator { Rdb_iterator_base &operator=(Rdb_iterator_base &&) = delete; }; +/* + Iterator for reading partial secondary indexes + + It can do locking reads, see locking_iterator_partial_index.txt for details. 
+*/ class Rdb_iterator_partial : public Rdb_iterator_base { private: TABLE *m_table; @@ -266,7 +276,7 @@ class Rdb_iterator_partial : public Rdb_iterator_base { int prev() override; rocksdb::Slice key() override; rocksdb::Slice value() override; - void reset() override; + void reset(bool become_locking = false) override; bool is_valid() override { // This function only used for intrinsic temp tables. assert(false); diff --git a/storage/rocksdb/rdb_locking_iter.cc b/storage/rocksdb/rdb_locking_iter.cc new file mode 100644 index 000000000000..9bf554da8f38 --- /dev/null +++ b/storage/rocksdb/rdb_locking_iter.cc @@ -0,0 +1,280 @@ +/* + Copyright (C) 2022, 2023, 2024 Meta Platforms, Inc. + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ + +/* This C++ file's header file */ +#include "./rdb_locking_iter.h" + +#include +#include + +#include "sql/debug_sync.h" + +namespace myrocks { + +/** + @brief + Seek to the first key K that is equal or greater than target, + locking the range [target; K]. 
+*/ + +void LockingIterator::Seek(const rocksdb::Slice &target) { + m_locked_until.clear(); + m_iter.reset(m_txn.GetIterator(m_read_opts, &m_cfh)); + m_iter->Seek(target); + ScanForward(target, true); +} + +void LockingIterator::SeekForPrev(const rocksdb::Slice &target) { + m_locked_until.clear(); + m_iter.reset(m_txn.GetIterator(m_read_opts, &m_cfh)); + m_iter->SeekForPrev(target); + ScanBackward(target, true); +} + +/** + @brief + Move the iterator to the next key, locking the range between the current + and the next key. + + @detail + Implementation is similar to Seek(next_key). Since we don't know what the + next_key is, we reach it by calling { Seek(current_key); Next(); } +*/ +void LockingIterator::Next() { + DEBUG_SYNC(my_core::thd_get_current_thd(), "rocksdb.LockingIterator.Next"); + assert(Valid()); + // Save the current key value. We need it as the left endpoint + // of the range lock we're going to acquire + const auto current_key = m_iter->key().ToString(); + + m_iter->Next(); + ScanForward(rocksdb::Slice(current_key), false); +} + +/* + @brief + Move the iterator to the previous key, locking the range between the current + and the previous key. +*/ + +void LockingIterator::Prev() { + assert(Valid()); + + const auto current_key = m_iter->key().ToString(); + m_iter->Prev(); + ScanBackward(rocksdb::Slice(current_key), false); +} + +/* + @brief + Lock range from target to end_key. + + @detail + In forward-ordered scan, target < end_key. In backward-ordered scan, it's + other way around. + + We might have already locked a subset of this range, a subrange that + starts from target and extends to some point between target and end_key. +*/ +void LockingIterator::lock_up_to(bool scan_forward, + const rocksdb::Slice &target, + const rocksdb::Slice &end_key) { + const auto inv = scan_forward ? 
1 : -1; + const auto *const cmp = m_cfh.GetComparator(); + const auto endp_arg = m_kd.m_is_reverse_cf; + + if (!m_locked_until.empty() && + cmp->Compare(end_key, rocksdb::Slice(m_locked_until)) * inv <= 0) { + // We've already locked this range. The following has happened: + // - m_iter->key() returned $KEY + // - other transaction(s) have inserted row $ROW before the $KEY. + // - we got a range lock on [range_start, $KEY] + // - we've read $ROW and returned. + // Now, we're looking to lock [$ROW, $KEY] but we don't need to, + // we already have a lock on this range. + } else { + m_status = m_txn.GetRangeLock( + &m_cfh, rocksdb::Endpoint(target, endp_arg), + rocksdb::Endpoint((scan_forward ? end_key : target), endp_arg)); + + if (!m_status.ok()) return; + + // Save the bound where we locked until: + assert(!end_key.empty()); + m_locked_until.assign(end_key.data(), end_key.size()); + if (m_lock_count) (*m_lock_count)++; + } +} + +/* + Lock the range from target till the iterator end point that we are scanning
If there's no iterator bound, use index start (or end, depending + on the scan direction) +*/ +void LockingIterator::lock_till_iterator_end(bool scan_forward, + const rocksdb::Slice &target) { + rocksdb::Slice end; + uchar buf[Rdb_key_def::INDEX_NUMBER_SIZE]; + uint size; + if (scan_forward) { + if (m_read_opts.iterate_upper_bound) + end = *m_read_opts.iterate_upper_bound; + else { + if (m_kd.m_is_reverse_cf) + m_kd.get_infimum_key(buf, &size); + else + m_kd.get_supremum_key(buf, &size); + + assert(size == Rdb_key_def::INDEX_NUMBER_SIZE); + end = rocksdb::Slice(reinterpret_cast(buf), size); + } + } else { + if (m_read_opts.iterate_lower_bound) + end = *m_read_opts.iterate_lower_bound; + else { + if (m_kd.m_is_reverse_cf) + m_kd.get_supremum_key(buf, &size); + else + m_kd.get_infimum_key(buf, &size); + + assert(size == Rdb_key_def::INDEX_NUMBER_SIZE); + end = rocksdb::Slice(reinterpret_cast(buf), size); + } + } + // This will set m_status accordingly + lock_up_to(scan_forward, target, end); +} + +/** + Lock the range between [target, (current m_iter position)] and position + the iterator on the first record in it. + + @param skip_next true means current iterator position is achieved by + calling Seek(target). + false means one also needs to call Next() +*/ +void LockingIterator::Scan(bool scan_forward, const rocksdb::Slice &target, + bool skip_next) { + if (!m_iter->Valid()) { + m_status = m_iter->status(); + m_valid = false; + if (m_status.ok()) { + // m_iter has reached EOF + lock_till_iterator_end(scan_forward, target); + } + return; + } + + auto *const thd = my_core::thd_get_current_thd(); + const auto inv = scan_forward ? 
1 : -1; + const auto *const cmp = m_cfh.GetComparator(); + + while (1) { + DEBUG_SYNC(thd, "rocksdb.locking_iter_scan"); + + if (my_core::thd_killed(thd)) { + m_status = rocksdb::Status::Aborted(); + m_valid = false; + return; + } + + const auto &end_key = m_iter->key(); + const auto end_key_copy = end_key.ToString(); + + lock_up_to(scan_forward, target, end_key); + if (!m_status.ok()) { + // Failed to get a lock (most likely lock wait timeout) + m_valid = false; + return; + } + + // Ok, now we have a lock which is inhibiting modifications in the range + // Somebody might have done external modifications, though: + // - removed the key we've found + // - added a key before that key. + + // First, refresh the iterator: + m_iter.reset(m_txn.GetIterator(m_read_opts, &m_cfh)); + + // Then, try seeking to the same row + if (scan_forward) + m_iter->Seek(target); + else + m_iter->SeekForPrev(target); + + if (!skip_next && m_iter->Valid() && !cmp->Compare(m_iter->key(), target)) { + if (scan_forward) + m_iter->Next(); + else + m_iter->Prev(); + } + + if (m_iter->Valid()) { + if (cmp->Compare(m_iter->key(), rocksdb::Slice(end_key_copy)) * inv <= + 0) { + // Ok, the found key is within the locked range. + m_status = rocksdb::Status::OK(); + m_valid = true; + break; + } else { + // We've got a key but it is outside the range we've locked. + // Re-try the lock-and-read step. + continue; + } + } else { + m_valid = false; + m_status = m_iter->status(); + if (m_status.ok()) { + // m_iter has reached EOF + lock_till_iterator_end(scan_forward, target); + } + break; + } + } +} + +/* + @detail + Ideally, this function should + - find the first key $first_key + - lock the range [-inf; $first_key] + - return, the iterator is positioned on $first_key + + The problem here is that we cannot have "-infinity" bound. 
+ + Note: we don't have a practical use for this function - MyRocks always + searches within one index_name.table_name, which means we are only looking + at the keys with index_number as the prefix. +*/ + +void LockingIterator::SeekToFirst() { + assert(0); + m_status = rocksdb::Status::NotSupported("Not implemented"); + m_valid = false; +} + +/* + @detail + See SeekToFirst. +*/ + +void LockingIterator::SeekToLast() { + assert(0); + m_status = rocksdb::Status::NotSupported("Not implemented"); + m_valid = false; +} + +} // namespace myrocks diff --git a/storage/rocksdb/rdb_locking_iter.h b/storage/rocksdb/rdb_locking_iter.h new file mode 100644 index 000000000000..8e1d35bb7cb2 --- /dev/null +++ b/storage/rocksdb/rdb_locking_iter.h @@ -0,0 +1,143 @@ +/* + Copyright (C) 2022, 2023, 2024 Meta Platforms, Inc. + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ + +#ifndef RDB_LOCKING_ITER_H_ +#define RDB_LOCKING_ITER_H_ + +// C++ header files +#include +#include + +// RocksDB header files +#include "rocksdb/iterator.h" + +// MySQL header files + +// MyRocks header files +#include "./ha_rocksdb.h" +#include "./rdb_datadic.h" + +namespace myrocks { + +// +// LockingIterator is an iterator that locks the rows before returning, as well +// as scanned gaps between the rows. 
+// +// Example: +// lock_iter= trx->GetLockingIterator(); +// lock_iter->Seek('abc'); +// lock_iter->Valid() && lock_iter->key() == 'bcd'; +// +// After the above, the returned record 'bcd' is locked by transaction trx. +// Also, the range between ['abc'..'bcd'] is empty and locked by trx. +// +// lock_iter->Next(); +// lock_iter->Valid() && lock_iter->key() == 'efg' +// +// Now, the range ['bcd'.. 'efg'] (bounds inclusive) is also locked, and there +// are no records between 'bcd' and 'efg'. +// +class [[nodiscard]] LockingIterator : public rocksdb::Iterator { + rocksdb::Transaction &m_txn; + rocksdb::ColumnFamilyHandle &m_cfh; + const Rdb_key_def &m_kd; + + rocksdb::ReadOptions m_read_opts; + std::unique_ptr m_iter; + + // Status for either m_iter or m_txn operation + rocksdb::Status m_status; + + // note: an iterator that has reached EOF has status()==OK && m_valid==false + bool m_valid; + + ulonglong *m_lock_count; + + // The key value up to which we've locked the range. That is, we have a range + // lock on [current_position ... m_locked_until]. + // This is used to avoid making extra GetRangeLock() calls. + std::string m_locked_until; + + public: + LockingIterator(rocksdb::Transaction &txn, rocksdb::ColumnFamilyHandle &cfh, + const Rdb_key_def &kd, const rocksdb::ReadOptions &opts, + ulonglong *lock_count = nullptr) + : m_txn(txn), + m_cfh(cfh), + m_kd(kd), + m_read_opts(opts), + m_status(rocksdb::Status::InvalidArgument()), + m_valid(false), + m_lock_count(lock_count) {} + + bool Valid() const override { return m_valid; } + + // Note: MyRocks doesn't ever call these: + void SeekToFirst() override; + void SeekToLast() override; + + void Seek(const rocksdb::Slice &target) override; + + // Position at the last key in the source that is at or before target. + // The iterator is Valid() after this call iff the source contains + // an entry that comes at or before target.
+ void SeekForPrev(const rocksdb::Slice &target) override; + + void Next() override; + void Prev() override; + + rocksdb::Slice key() const override { + assert(Valid()); + return m_iter->key(); + } + + rocksdb::Slice value() const override { + assert(Valid()); + return m_iter->value(); + } + + rocksdb::Status status() const override { return m_status; } + + private: + void lock_up_to(bool scan_forward, const rocksdb::Slice &target, + const rocksdb::Slice &end_key); + void lock_till_iterator_end(bool scan_forward, const rocksdb::Slice &target); + void Scan(bool scan_forward, const rocksdb::Slice &target, bool skip_next); + + inline void ScanForward(const rocksdb::Slice &target, bool skip_next) { + Scan(true, target, skip_next); + } + + inline void ScanBackward(const rocksdb::Slice &target, bool skip_next) { + Scan(false, target, skip_next); + } + LockingIterator(const LockingIterator &) = delete; + LockingIterator &operator=(const LockingIterator &) = delete; + LockingIterator(LockingIterator &&) = delete; + LockingIterator &operator=(LockingIterator &&) = delete; +}; + +[[nodiscard]] inline std::unique_ptr GetLockingIterator( + rocksdb::Transaction &trx, const rocksdb::ReadOptions &read_options, + rocksdb::ColumnFamilyHandle &column_family, const Rdb_key_def &kd, + ulonglong *counter) { + return std::make_unique(trx, column_family, kd, read_options, + counter); +} + +} // namespace myrocks + +#endif // RDB_LOCKING_ITER_H_ diff --git a/storage/rocksdb/rdb_utils.cc b/storage/rocksdb/rdb_utils.cc index f05191b58e33..466452b66a87 100644 --- a/storage/rocksdb/rdb_utils.cc +++ b/storage/rocksdb/rdb_utils.cc @@ -40,6 +40,7 @@ #include "rocksdb/env.h" #include "rocksdb/file_system.h" #include "rocksdb/io_status.h" +#include "rocksdb/utilities/transaction_db.h" namespace myrocks { @@ -273,6 +274,29 @@ std::string rdb_concat_paths(std::string_view dir, std::string_view file) { return result; } +/* + Print the range in hex, in "start_endpoint-end_endpoint" form +*/ + 
+std::string rdb_hexdump_range(const rocksdb::EndpointWithString &start, + const rocksdb::EndpointWithString &end) { + auto res = rdb_hexdump(start.slice.data(), start.slice.length()); + + // For keys: :0 keys should look like point keys + if (!start.inf_suffix && !end.inf_suffix && (start.slice == end.slice)) + // This is a single-point range, show it like a key + return res; + + if (start.inf_suffix) res.append(":1"); + + res.append("-"); + + const auto key2 = rdb_hexdump(end.slice.c_str(), end.slice.length()); + res.append(key2); + if (end.inf_suffix) res.append(":1"); + return res; +} + /* Attempt to access the database subdirectory to see if it exists */ diff --git a/storage/rocksdb/rdb_utils.h b/storage/rocksdb/rdb_utils.h index fed38034b2be..98d4e04d7c7c 100644 --- a/storage/rocksdb/rdb_utils.h +++ b/storage/rocksdb/rdb_utils.h @@ -40,8 +40,19 @@ #include #endif +// Forward declarations struct fileinfo; +#ifdef ROCKSDB_CUSTOM_NAMESPACE +namespace ROCKSDB_CUSTOM_NAMESPACE { +#else +namespace rocksdb { +#endif + +struct EndpointWithString; + +} // namespace ROCKSDB_CUSTOM_NAMESPACE / rocksdb + namespace myrocks { /* @@ -345,6 +356,9 @@ const std::vector parse_into_tokens(const std::string &s, const char *data, std::size_t data_len, std::size_t maxsize = RDB_MAX_HEXDUMP_LEN); +[[nodiscard]] std::string rdb_hexdump_range( + const rocksdb::EndpointWithString &left, + const rocksdb::EndpointWithString &right); /* Helper function to return dir + '/' + file */ From 5f7e03607eae1c3f685c6e85aafc83c84d77645d Mon Sep 17 00:00:00 2001 From: Laurynas Biveinis Date: Fri, 26 Apr 2024 16:24:21 +0300 Subject: [PATCH 2/3] TODO: PR this separately --- storage/rocksdb/rdb_datadic.cc | 4 +-- storage/rocksdb/rdb_vector_db.cc | 45 ++++++++++++++++++-------------- storage/rocksdb/rdb_vector_db.h | 4 ++- 3 files changed, 31 insertions(+), 22 deletions(-) diff --git a/storage/rocksdb/rdb_datadic.cc b/storage/rocksdb/rdb_datadic.cc index be9a785469ce..7c136228de6a 100644 --- 
a/storage/rocksdb/rdb_datadic.cc +++ b/storage/rocksdb/rdb_datadic.cc @@ -3557,8 +3557,8 @@ uint Rdb_key_def::setup_vector_index(const TABLE &tbl, } return create_vector_index(cmd_srv_helper, tbl_def.base_dbname(), - m_vector_index_config, m_cf_handle, m_index_number, - m_vector_index); + m_vector_index_config, m_cf_handle, *this, + m_index_number, m_vector_index); } // See Rdb_charset_space_info::spaces_xfrm diff --git a/storage/rocksdb/rdb_vector_db.cc b/storage/rocksdb/rdb_vector_db.cc index d06fb364ecb5..88d9de3ed139 100644 --- a/storage/rocksdb/rdb_vector_db.cc +++ b/storage/rocksdb/rdb_vector_db.cc @@ -177,7 +177,7 @@ class Rdb_vector_iterator : public faiss::InvertedListsIterator { public: Rdb_vector_iterator(Rdb_faiss_inverted_list_context *context, Index_id index_id, rocksdb::ColumnFamilyHandle &cf, - const uint code_size, size_t list_id) + const Rdb_key_def &kd, uint code_size, size_t list_id) : m_context(context), m_index_id(index_id), m_list_id(list_id), @@ -190,7 +190,7 @@ class Rdb_vector_iterator : public faiss::InvertedListsIterator { write_inverted_list_key(upper_key_writer, index_id, list_id + 1); m_iterator_upper_bound_key.PinSelf(upper_key_writer.to_slice()); m_iterator = rdb_tx_get_iterator( - context->m_thd, cf, /* skip_bloom_filter */ true, + context->m_thd, cf, kd, /* skip_bloom_filter */ true, m_iterator_lower_bound_key, m_iterator_upper_bound_key, /* snapshot */ nullptr, TABLE_TYPE::USER_TABLE); m_iterator->SeekToFirst(); @@ -270,8 +270,11 @@ class Rdb_vector_iterator : public faiss::InvertedListsIterator { class Rdb_faiss_inverted_list : public faiss::InvertedLists { public: Rdb_faiss_inverted_list(Index_id index_id, rocksdb::ColumnFamilyHandle &cf, - uint nlist, uint code_size) - : InvertedLists(nlist, code_size), m_index_id(index_id), m_cf(cf) { + const Rdb_key_def &kd, uint nlist, uint code_size) + : InvertedLists(nlist, code_size), + m_index_id(index_id), + m_cf(cf), + m_kd(kd) { use_iterator = true; } ~Rdb_faiss_inverted_list() 
override = default; @@ -293,7 +296,7 @@ class Rdb_faiss_inverted_list : public faiss::InvertedLists { return new Rdb_vector_iterator( reinterpret_cast( inverted_list_context), - m_index_id, m_cf, code_size, list_no); + m_index_id, m_cf, m_kd, code_size, list_no); } const uint8_t *get_codes(size_t list_no) const override { @@ -353,14 +356,18 @@ class Rdb_faiss_inverted_list : public faiss::InvertedLists { private: Index_id m_index_id; rocksdb::ColumnFamilyHandle &m_cf; + const Rdb_key_def &m_kd; }; class Rdb_vector_index_ivf : public Rdb_vector_index { public: Rdb_vector_index_ivf(const FB_vector_index_config index_def, std::shared_ptr cf_handle, - const Index_id index_id) - : m_index_id{index_id}, m_index_def{index_def}, m_cf_handle{cf_handle} {} + const Rdb_key_def &kd, const Index_id index_id) + : m_index_id{index_id}, + m_index_def{index_def}, + m_cf_handle{cf_handle}, + m_kd{kd} {} virtual ~Rdb_vector_index_ivf() override = default; @@ -446,8 +453,8 @@ class Rdb_vector_index_ivf : public Rdb_vector_index { for (std::size_t i = 0; i < m_list_size_stats.size(); i++) { std::size_t list_size = 0; Rdb_faiss_inverted_list_context context(thd); - Rdb_vector_iterator vector_iter(&context, m_index_id, *m_cf_handle, - m_index_l2->code_size, i); + Rdb_vector_iterator vector_iter(&context, m_index_id, *m_cf_handle.get(), + m_kd, m_index_l2->code_size, i); while (vector_iter.is_available()) { uint rtn = vector_iter.get_pk_and_codes(pk, codes); if (rtn) { @@ -524,7 +531,8 @@ class Rdb_vector_index_ivf : public Rdb_vector_index { // create inverted list m_inverted_list = std::make_unique( - m_index_id, *m_cf_handle, m_index_l2->nlist, m_index_l2->code_size); + m_index_id, *m_cf_handle.get(), m_kd, m_index_l2->nlist, + m_index_l2->code_size); m_index_l2->replace_invlists(m_inverted_list.get()); m_index_ip->replace_invlists(m_inverted_list.get()); @@ -589,6 +597,7 @@ class Rdb_vector_index_ivf : public Rdb_vector_index { Index_id m_index_id; FB_vector_index_config m_index_def; 
std::shared_ptr m_cf_handle; + const Rdb_key_def &m_kd; std::atomic m_hit{0}; std::unique_ptr m_quantizer; std::unique_ptr m_index_l2; @@ -683,13 +692,14 @@ uint create_vector_index(Rdb_cmd_srv_helper &cmd_srv_helper, const std::string &db_name, const FB_vector_index_config index_def, std::shared_ptr cf_handle, + const Rdb_key_def &kd, const Index_id index_id, std::unique_ptr &index) { if (index_def.type() == FB_VECTOR_INDEX_TYPE::FLAT || index_def.type() == FB_VECTOR_INDEX_TYPE::IVFFLAT || index_def.type() == FB_VECTOR_INDEX_TYPE::IVFPQ) { index = - std::make_unique(index_def, cf_handle, index_id); + std::make_unique(index_def, cf_handle, kd, index_id); } else { assert(false); return HA_ERR_UNSUPPORTED; @@ -700,14 +710,11 @@ uint create_vector_index(Rdb_cmd_srv_helper &cmd_srv_helper, #else // dummy implementation for non-fbvectordb builds -uint create_vector_index(Rdb_cmd_srv_helper &cmd_srv_helper [[maybe_unused]], - const std::string &db_name [[maybe_unused]], - const FB_vector_index_config index_def - [[maybe_unused]], - std::shared_ptr cf_handle - [[maybe_unused]], - const Index_id index_id [[maybe_unused]], - std::unique_ptr &index) { +uint create_vector_index(Rdb_cmd_srv_helper &, const std::string &, + const FB_vector_index_config, + std::shared_ptr, + const Rdb_key_def, const Index_id, + std::unique_ptr &) { index = nullptr; return HA_ERR_UNSUPPORTED; } diff --git a/storage/rocksdb/rdb_vector_db.h b/storage/rocksdb/rdb_vector_db.h index d710f857b328..0a0b60470116 100644 --- a/storage/rocksdb/rdb_vector_db.h +++ b/storage/rocksdb/rdb_vector_db.h @@ -30,6 +30,8 @@ namespace myrocks { +class Rdb_key_def; + /** for infomation schema */ class Rdb_vector_index_info { public: @@ -112,7 +114,7 @@ uint create_vector_index(Rdb_cmd_srv_helper &cmd_srv_helper, const std::string &db_name, const FB_vector_index_config index_def, std::shared_ptr cf_handle, - const Index_id index_id, + const Rdb_key_def &kd, const Index_id index_id, std::unique_ptr &index); /** From 
3b35dabb41aa9c252328c86cad37de8d287f956b Mon Sep 17 00:00:00 2001 From: Laurynas Biveinis Date: Mon, 29 Apr 2024 16:49:01 +0300 Subject: [PATCH 3/3] TODO fixes --- storage/rocksdb/ha_rocksdb.cc | 7 ++----- storage/rocksdb/rdb_vector_db.cc | 2 +- 2 files changed, 3 insertions(+), 6 deletions(-) diff --git a/storage/rocksdb/ha_rocksdb.cc b/storage/rocksdb/ha_rocksdb.cc index 4cfc1ce49dea..3e2bf2328dc6 100644 --- a/storage/rocksdb/ha_rocksdb.cc +++ b/storage/rocksdb/ha_rocksdb.cc @@ -5084,10 +5084,7 @@ class Rdb_transaction_impl : public Rdb_transaction { void start_ignore_snapshot() { // This may be called several times for the same statement - if (in_snapshot_ignore_mode()) { - assert(m_read_opts[TABLE_TYPE::USER_TABLE].snapshot == nullptr); - return; - } + if (in_snapshot_ignore_mode()) return; assert(m_saved_snapshot == nullptr); @@ -5303,7 +5300,7 @@ class Rdb_transaction_impl : public Rdb_transaction { return; } - if (has_snapshot(table_type)) { + if (!has_snapshot(table_type)) { const auto thd_ss = std::static_pointer_cast( m_thd->get_explicit_snapshot()); if (thd_ss) { diff --git a/storage/rocksdb/rdb_vector_db.cc b/storage/rocksdb/rdb_vector_db.cc index 88d9de3ed139..bd0528ee1e09 100644 --- a/storage/rocksdb/rdb_vector_db.cc +++ b/storage/rocksdb/rdb_vector_db.cc @@ -714,7 +714,7 @@ uint create_vector_index(Rdb_cmd_srv_helper &, const std::string &, const FB_vector_index_config, std::shared_ptr, const Rdb_key_def, const Index_id, - std::unique_ptr &) { + std::unique_ptr &index) { index = nullptr; return HA_ERR_UNSUPPORTED; }