Commit b8db04e
Author: Jan Kunzmann
Message: check-mysql-replication-status: add lag flapping protection
Parent: 3883967

4 files changed: 99 additions, 17 deletions
CHANGELOG.md
Lines changed: 3 additions & 0 deletions

````diff
@@ -10,6 +10,9 @@ This CHANGELOG follows the format listed [here](https://github.com/sensu-plugins
 ### Changed
 - check-mysql-replication-status: refactoring & spec tests (@DrMurx)
 
+### Added
+- check-mysql-replication-status: added protection against `SHOW SLAVE STATUS` high lag reporting bug (@DrMurx)
+
 ## [3.1.0] - 2018-12-15
 ### Added
 - metrics-mysql-multiple-select-count script (@nagyt234)
````

README.md
Lines changed: 9 additions & 0 deletions

````diff
@@ -80,6 +80,15 @@ $ /opt/sensu/embedded/bin/check-mysql-threads.rb --host=<DBHOST> --ini=/etc/sens
 $ /opt/sensu/embedded/bin/check-mysql-replication-status.rb --host=<SLAVE> --ini=/etc/sensu/my.ini
 ```
 
+**check-mysql-replication-status** example with flapping protection
+
+MariaDB/MySQL sometimes wrongly reports a very high replication lag for a short moment. Flapping protection helps mitigate this issue
+better than setting `occurrences` in Sensu's `checks` definition because you don't lose any alerting granularity.
+
+```bash
+$ /opt/sensu/embedded/bin/check-mysql-replication-status.rb --host=<SLAVE> --ini=/etc/sensu/my.ini --flapping-retry=1 --flapping-lag=86400 --flapping-sleep=2
+```
+
 **check-mysql-msr-replication-status** example
 ```bash
 $ /opt/sensu/embedded/bin/check-mysql-replication-status.rb --host=<SLAVE> --ini=/etc/sensu/my.ini
````
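To make the example flags concrete: with `--flapping-retry=1 --flapping-lag=86400 --flapping-sleep=2`, a reported lag of 86400 seconds or more is treated as suspect, so the check sleeps 2 seconds and queries `SHOW SLAVE STATUS` once more before comparing the last reading against the warn/crit thresholds; with the default `--flapping-retry=0` the protection is effectively off. A minimal sketch of that behaviour, assuming a hypothetical `read_lag` callable in place of the plugin's real query:

```ruby
# Minimal sketch of the retry behaviour implied by the flags above.
# `read_lag` is a hypothetical stand-in for the plugin's SHOW SLAVE STATUS query.
def lag_with_flapping_protection(read_lag, flapping_lag: 86_400, flapping_retry: 1, flapping_sleep: 2)
  lag = read_lag.call               # first reading
  flapping_retry.times do
    break if lag < flapping_lag     # plausible value: keep it
    sleep flapping_sleep            # suspiciously high: wait briefly...
    lag = read_lag.call             # ...and query again
  end
  lag                               # this value is compared against the warn/crit thresholds
end

# A spurious spike followed by a sane reading is smoothed away:
readings = [999_999, 12]
puts lag_with_flapping_protection(-> { readings.shift }, flapping_sleep: 0)  # => 12
```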

bin/check-mysql-replication-status.rb
Lines changed: 44 additions & 13 deletions

````diff
@@ -94,6 +94,27 @@ class CheckMysqlReplicationStatus < Sensu::Plugin::Check::CLI
          # #YELLOW
          proc: lambda { |s| s.to_i } # rubocop:disable Lambda
 
+  option :flapping_lag,
+         short: '-l',
+         long: '--flapping-lag=VALUE',
+         description: 'Lag threshold to trigger flapping protection',
+         default: 100000,
+         proc: lambda { |s| s.to_i } # rubocop:disable Lambda
+
+  option :flapping_retry,
+         short: '-r',
+         long: '--flapping-retry=VALUE',
+         description: 'Number of retries when lag flapping protection is triggered',
+         default: 0,
+         proc: lambda { |s| s.to_i } # rubocop:disable Lambda
+
+  option :flapping_sleep,
+         long: '--flapping-sleep=VALUE',
+         description: 'Sleep between flapping protection retries',
+         default: 1,
+         proc: lambda { |s| s.to_i } # rubocop:disable Lambda
+
+
   def detect_replication_status?(row)
     %w[
       Slave_IO_State
@@ -176,19 +197,29 @@ def ok_slave_message
   def run
     db = open_connection
 
-    row = query_slave_status(db)
-    ok 'show slave status was nil. This server is not a slave.' if row.nil?
-    warn "couldn't detect replication status" unless detect_replication_status?(row)
-
-    slave_running = slave_running?(row)
-    critical broken_slave_message(row) unless slave_running
-
-    replication_delay = row['Seconds_Behind_Master'].to_i
-    message = "replication delayed by #{replication_delay}"
-    # TODO (breaking change): Thresholds are exclusive which is not consistent with all other checks
-    critical message if replication_delay > config[:crit]
-    warning message if replication_delay > config[:warn]
-    ok "#{ok_slave_message}, #{message}"
+    retries = config[:flapping_retry]
+    while retries >= 0
+      row = query_slave_status(db)
+      ok 'show slave status was nil. This server is not a slave.' if row.nil?
+      warn "couldn't detect replication status" unless detect_replication_status?(row)
+
+      slave_running = slave_running?(row)
+      critical broken_slave_message(row) unless slave_running
+
+      replication_delay = row['Seconds_Behind_Master'].to_i
+      retries -= 1
+      if replication_delay >= config[:flapping_lag] && retries >= 0
+        sleep config[:flapping_sleep]
+        next
+      end
+
+      message = "replication delayed by #{replication_delay}"
+      # TODO (breaking change): Thresholds are exclusive which is not consistent with all other checks
+      critical message if replication_delay > config[:crit]
+      warning message if replication_delay > config[:warn]
+      ok "#{ok_slave_message}, #{message}"
+    end
+    unknown "unable to retrieve slave status"
   rescue Mysql::Error => e
     errstr = "Error code: #{e.errno} Error message: #{e.error}"
     critical "#{errstr} SQLSTATE: #{e.sqlstate}" if e.respond_to?('sqlstate')
````
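A note on control flow in `run`: the `ok`, `warning`, `critical`, and `unknown` helpers come from `Sensu::Plugin::Check::CLI` and each prints a status line and terminates the check, so the first helper reached inside the `while` loop ends the run; the trailing `unknown` is only a safety net in case the loop ever falls through without emitting a status. Simplified stand-ins, purely for illustration (not the sensu-plugin gem's actual implementation):

```ruby
# Illustrative stand-ins for the Sensu check status helpers used above.
# Each prints a message and exits with the conventional Sensu/Nagios code.
def ok(message)
  puts "OK: #{message}"
  exit 0
end

def warning(message)
  puts "WARNING: #{message}"
  exit 1
end

def critical(message)
  puts "CRITICAL: #{message}"
  exit 2
end

def unknown(message)
  puts "UNKNOWN: #{message}"
  exit 3
end
```

Because `exit` raises `SystemExit`, the spec below wraps `checker.run` in `begin ... rescue SystemExit` and reads the resulting status code from the exception.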

test/check-mysql-replication-status_spec.rb
Lines changed: 43 additions & 4 deletions

````diff
@@ -52,10 +52,10 @@ def checker.critical(*_args)
     ['No', 'Yes', nil, 2, 'critical'],
     ['Yes', 'No', nil, 2, 'critical'],
     ['No', 'No', nil, 2, 'critical'],
-    ['Yes', 'Yes', 899, 0, 'ok'],
-    ['Yes', 'Yes', 900, 1, 'warning'],
-    ['Yes', 'Yes', 1799, 1, 'warning'],
-    ['Yes', 'Yes', 1800, 2, 'critical'],
+    ['Yes', 'Yes', 900, 0, 'ok'],
+    ['Yes', 'Yes', 901, 1, 'warning'],
+    ['Yes', 'Yes', 1800, 1, 'warning'],
+    ['Yes', 'Yes', 1801, 2, 'critical'],
   ].each do |testdata|
     it "returns #{testdata[4]} for default thresholds" do
       slave_status_row = {
@@ -76,4 +76,43 @@ def checker.critical(*_args)
       expect(exit_code).to eq testdata[3]
     end
   end
+
+  [
+    [    0, 0, 'ok'],
+    [99999, 2, 'critical'],
+  ].each do |testdata|
+    it "sleeps with flapping protection and returns #{testdata[2]} for default thresholds" do
+      checker.config[:flapping_retry] = 1
+      checker.config[:flapping_sleep] = 10
+
+      slave_status_row = [
+        {
+          "Slave_IO_State" => '',
+          "Slave_IO_Running" => 'Yes',
+          "Slave_SQL_Running" => 'Yes',
+          "Last_IO_Error" => '',
+          "Last_SQL_Error" => '',
+          "Seconds_Behind_Master" => 100000
+        },
+        {
+          "Slave_IO_State" => '',
+          "Slave_IO_Running" => 'Yes',
+          "Slave_SQL_Running" => 'Yes',
+          "Last_IO_Error" => '',
+          "Last_SQL_Error" => '',
+          "Seconds_Behind_Master" => testdata[0]
+        }
+      ]
+
+      begin
+        allow(checker).to receive(:open_connection) # do nothing
+        allow(checker).to receive(:query_slave_status).and_return slave_status_row[0], slave_status_row[1]
+        expect(checker).to receive(:sleep).with(10)
+        checker.run
+      rescue SystemExit => e
+        exit_code = e.status
+      end
+      expect(exit_code).to eq testdata[1]
+    end
+  end
 end
````
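For readers less familiar with RSpec stubs: `and_return` with several arguments makes successive calls return successive values, which is how the spec simulates a transient lag spike (100000 seconds) followed by a normal reading, while `expect(checker).to receive(:sleep).with(10)` both asserts that the retry path ran and keeps the test from actually sleeping. A tiny standalone illustration of the multi-value stub, independent of the plugin:

```ruby
require 'rspec/autorun'

RSpec.describe 'and_return with multiple values' do
  it 'returns the stubbed values on successive calls' do
    source = double('lag source')
    allow(source).to receive(:lag).and_return(100_000, 12)

    expect(source.lag).to eq 100_000  # first call: the spike
    expect(source.lag).to eq 12       # second call: back to normal
  end
end
```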
