
Commit 787474c

Jan Kunzmann committed
check-mysql-replication-status: add lag flapping protection
1 parent 1429737 commit 787474c

File tree

- CHANGELOG.md
- README.md
- bin/check-mysql-replication-status.rb
- test/check-mysql-replication-status_spec.rb

4 files changed: 92 additions, 17 deletions


CHANGELOG.md

Lines changed: 1 addition & 0 deletions
```diff
@@ -7,6 +7,7 @@ This CHANGELOG follows the format listed [here](https://github.com/sensu-plugins
 ### Changed
 - check-mysql-replication-status: fix code flow if server is not a slave (@DrMurx)
 - check-mysql-replication-status: refactoring & spec tests (@DrMurx)
+- check-mysql-replication-status: added flapping protection (@DrMurx)
 
 ## [3.1.0] - 2018-12-15
 ### Added
```

README.md

Lines changed: 9 additions & 0 deletions
````diff
@@ -80,6 +80,15 @@ $ /opt/sensu/embedded/bin/check-mysql-threads.rb --host=<DBHOST> --ini=/etc/sens
 $ /opt/sensu/embedded/bin/check-mysql-replication-status.rb --host=<SLAVE> --ini=/etc/sensu/my.ini
 ```
 
+**check-mysql-replication-status** example with flapping protection
+
+MariaDB/MySQL sometimes wrongly reports a very high replication lag for a short moment. Flapping protection mitigates this issue
+better than setting `occurrences` in Sensu's `checks` definition because you don't lose any alerting granularity.
+
+```bash
+$ /opt/sensu/embedded/bin/check-mysql-replication-status.rb --host=<SLAVE> --ini=/etc/sensu/my.ini --flapping-retry=1 --flapping-lag=86400 --flapping-sleep=2
+```
+
 **check-mysql-msr-replication-status** example
 ```bash
 $ /opt/sensu/embedded/bin/check-mysql-replication-status.rb --host=<SLAVE> --ini=/etc/sensu/my.ini
````
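To make the granularity claim above concrete: `occurrences` delays every alert by whole check intervals, while flapping protection only delays a suspect reading by `flapping_sleep` seconds. A back-of-the-envelope sketch in Ruby, assuming a hypothetical 60-second check interval and a typical `occurrences` value; only `flapping_sleep=2` comes from the example above:

```ruby
# Rough comparison of worst-case extra alert latency (assumptions noted inline).
interval       = 60 # seconds between check runs -- an assumption, not from the repo
occurrences    = 3  # hypothetical Sensu `occurrences` setting
flapping_sleep = 2  # from the --flapping-sleep=2 example above

# `occurrences` suppresses the first N-1 results of *every* incident,
# including genuine lag:
delay_via_occurrences = interval * (occurrences - 1) # => 120 seconds

# Flapping protection re-reads only a suspect spike, once per retry:
delay_via_flapping = flapping_sleep # => 2 seconds

puts "occurrences adds ~#{delay_via_occurrences}s; flapping adds #{delay_via_flapping}s"
```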

bin/check-mysql-replication-status.rb

Lines changed: 44 additions & 13 deletions
```diff
@@ -94,6 +94,27 @@ class CheckMysqlReplicationStatus < Sensu::Plugin::Check::CLI
          # #YELLOW
          proc: lambda { |s| s.to_i } # rubocop:disable Lambda
 
+  option :flapping_lag,
+         short: '-l',
+         long: '--flapping-lag=VALUE',
+         description: 'Lag threshold to trigger flapping protection',
+         default: 100000,
+         proc: lambda { |s| s.to_i } # rubocop:disable Lambda
+
+  option :flapping_retry,
+         short: '-r',
+         long: '--flapping-retry=VALUE',
+         description: 'Number of retries when lag flapping protection is triggered',
+         default: 0,
+         proc: lambda { |s| s.to_i } # rubocop:disable Lambda
+
+  option :flapping_sleep,
+         long: '--flapping-sleep=VALUE',
+         description: 'Sleep between flapping protection retries',
+         default: 1,
+         proc: lambda { |s| s.to_i } # rubocop:disable Lambda
+
+
   def detect_replication_status?(row)
     %w[
       Slave_IO_State
@@ -175,19 +196,29 @@ def ok_slave_message
   def run
     db = open_connection
 
-    row = query_slave_status(db)
-    ok 'show slave status was nil. This server is not a slave.' if row.nil?
-    warn "couldn't detect replication status" unless detect_replication_status?(row)
-
-    slave_running = slave_running?(row)
-    critical broken_slave_message(row) unless slave_running
-
-    replication_delay = row['Seconds_Behind_Master'].to_i
-    message = "replication delayed by #{replication_delay}"
-    # TODO (breaking change): Thresholds are exclusive which is not consistent with all other checks
-    critical message if replication_delay > config[:crit]
-    warning message if replication_delay > config[:warn]
-    ok "#{ok_slave_message}, #{message}"
+    retries = config[:flapping_retry]
+    # Note: Endless loop will exit via `ok`, `warning` or `critical`, or by Exception
+    loop do
+      row = query_slave_status(db)
+      ok 'show slave status was nil. This server is not a slave.' if row.nil?
+      warn "couldn't detect replication status" unless detect_replication_status?(row)
+
+      slave_running = slave_running?(row)
+      critical broken_slave_message(row) unless slave_running
+
+      replication_delay = row['Seconds_Behind_Master'].to_i
+      retries -= 1
+      if replication_delay >= config[:flapping_lag] && retries >= 0
+        sleep config[:flapping_sleep]
+        next
+      end
+
+      message = "replication delayed by #{replication_delay}"
+      # TODO (breaking change): Thresholds are exclusive which is not consistent with all other checks
+      critical message if replication_delay > config[:crit]
+      warning message if replication_delay > config[:warn]
+      ok "#{ok_slave_message}, #{message}"
+    end
   rescue Mysql::Error => e
     errstr = "Error code: #{e.errno} Error message: #{e.error}"
     critical "#{errstr} SQLSTATE: #{e.sqlstate}" if e.respond_to?('sqlstate')
```

test/check-mysql-replication-status_spec.rb

Lines changed: 38 additions & 4 deletions
```diff
@@ -52,10 +52,10 @@ def checker.critical(*_args)
     ['No', 'Yes', nil, 2, 'critical'],
     ['Yes', 'No', nil, 2, 'critical'],
     ['No', 'No', nil, 2, 'critical'],
-    ['Yes', 'Yes', 899, 0, 'ok'],
-    ['Yes', 'Yes', 900, 1, 'warning'],
-    ['Yes', 'Yes', 1799, 1, 'warning'],
-    ['Yes', 'Yes', 1800, 2, 'critical'],
+    ['Yes', 'Yes', 900, 0, 'ok'],
+    ['Yes', 'Yes', 901, 1, 'warning'],
+    ['Yes', 'Yes', 1800, 1, 'warning'],
+    ['Yes', 'Yes', 1801, 2, 'critical'],
   ].each do |testdata|
     it "returns #{testdata[4]} for default thresholds" do
       slave_status_row = {
@@ -76,4 +76,38 @@ def checker.critical(*_args)
       expect(exit_code).to eq testdata[3]
     end
   end
+
+  it "sleeps with flapping protection for default thresholds" do
+    checker.config[:flapping_retry] = 1
+    checker.config[:flapping_sleep] = 10
+
+    slave_status_row = [
+      {
+        "Slave_IO_State" => '',
+        "Slave_IO_Running" => 'Yes',
+        "Slave_SQL_Running" => 'Yes',
+        "Last_IO_Error" => '',
+        "Last_SQL_Error" => '',
+        "Seconds_Behind_Master" => 100000
+      },
+      {
+        "Slave_IO_State" => '',
+        "Slave_IO_Running" => 'Yes',
+        "Slave_SQL_Running" => 'Yes',
+        "Last_IO_Error" => '',
+        "Last_SQL_Error" => '',
+        "Seconds_Behind_Master" => 99999
+      }
+    ]
+
+    begin
+      allow(checker).to receive(:open_connection) # do nothing
+      allow(checker).to receive(:query_slave_status).and_return slave_status_row[0], slave_status_row[1]
+      expect(checker).to receive(:sleep).with(10)
+      checker.run
+    rescue SystemExit => e
+      exit_code = e.status
+    end
+    expect(exit_code).to eq 2
+  end
 end
```
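The exit code asserted by the new spec follows directly from the defaults: `flapping_lag` is 100000 (from the option definition above) and `crit` is 1800 (implied by the threshold table at the top of this file, with exclusive comparison per the TODO). The first stubbed reading sits exactly at the flapping threshold, so it triggers the single `sleep(10)` and a re-query; the second reading passes the flapping gate but still exceeds `crit`. A quick sanity check of that arithmetic:

```ruby
# Why the spec above expects exit code 2 (plugin defaults assumed).
flapping_lag = 100_000            # default --flapping-lag
crit         = 1_800              # default --crit (exclusive threshold)
readings     = [100_000, 99_999]  # the two stubbed Seconds_Behind_Master values

raise unless readings[0] >= flapping_lag # 1st pass: suspect -> sleep(10), retry
raise unless readings[1] <  flapping_lag # 2nd pass: passes the flapping gate...
raise unless readings[1] >  crit         # ...but still critical -> exit code 2
```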
