Commit 230d4e0

feat: get time diff btw replica and master from pg_stat_replication

1 parent 94cf15b · commit 230d4e0

File tree: 1 file changed

mamonsu/plugins/pgsql/xlog.py

Lines changed: 107 additions & 3 deletions
@@ -7,12 +7,36 @@
 
 class Xlog(Plugin):
     DEFAULT_CONFIG = {'lag_more_than_in_sec': str(60 * 5)}
-    query_wal_lsn_diff = " select pg_catalog.pg_wal_lsn_diff " \
+
+    # get amount of WAL since '0/00000000'
+    query_wal_lsn_diff = " SELECT pg_catalog.pg_wal_lsn_diff " \
         "(pg_catalog.pg_current_wal_lsn(), '0/00000000');"
-    query_xlog_lsn_diff = "select pg_catalog.pg_xlog_location_diff " \
+
+    query_xlog_lsn_diff = "SELECT pg_catalog.pg_xlog_location_diff " \
         "(pg_catalog.pg_current_xlog_location(), '0/00000000');"
+
+    # get time of replication lag
     query_agent_replication_lag = "SELECT CASE WHEN extract(epoch from now()-pg_last_xact_replay_timestamp()) " \
         "IS NULL THEN 0 ELSE extract(epoch from now()-pg_last_xact_replay_timestamp()) END"
+
+    # get diff in lsn for replica (for version 10 and higher also write, flush and replay lag)
+
+    query_wal_lag_lsn = "SELECT application_name, " \
+        " flush_lag, replay_lag, write_lag, " \
+        " pg_wal_lsn_diff(pg_current_wal_lsn(), replay_lsn) AS total_lag " \
+        " FROM pg_stat_replication;"
+    query_xlog_lag_lsn = "SELECT application_name, " \
+        "pg_xlog_location_diff(pg_current_xlog_location(), replay_location) AS total_lag " \
+        "FROM pg_stat_replication;"
+
+    # for discovery rule for name of each replica
+    key_lsn_replication_discovery = "pgsql.replication.discovery{0}"
+    key_total_lag = 'pgsql.replication.total_lag{0}'
+    # for PG 10 and higher
+    key_flush = 'pgsql.replication.flush_lag{0}'
+    key_replay = 'pgsql.replication.replay_lag{0}'
+    key_write = 'pgsql.replication.write_lag{0}'
+
     key_wall = 'pgsql.wal.write{0}'
     key_count_wall = "pgsql.wal.count{0}"
     key_replication = "pgsql.replication_lag{0}"
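
For orientation, the following standalone sketch runs the same pg_stat_replication query that query_wal_lag_lsn wraps. It assumes a PostgreSQL 10+ primary with connected standbys and uses psycopg2 with a placeholder DSN; neither assumption is part of mamonsu, which goes through its own Pooler helper instead.

    import psycopg2  # illustration only; mamonsu itself does not use psycopg2

    QUERY_WAL_LAG_LSN = (
        "SELECT application_name, "
        " flush_lag, replay_lag, write_lag, "
        " pg_wal_lsn_diff(pg_current_wal_lsn(), replay_lsn) AS total_lag "
        " FROM pg_stat_replication;"
    )

    conn = psycopg2.connect("host=primary.example dbname=postgres user=monitoring")  # placeholder DSN
    try:
        with conn.cursor() as cur:
            cur.execute(QUERY_WAL_LAG_LSN)
            for name, flush_lag, replay_lag, write_lag, total_lag in cur.fetchall():
                # flush/replay/write lag arrive as intervals (or NULL when idle);
                # total_lag is the byte distance between the primary's current LSN and the standby's replay_lsn
                print(name, flush_lag, replay_lag, write_lag, total_lag)
    finally:
        conn.close()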
@@ -27,12 +51,39 @@ def run(self, zbx):
                 zbx.send('pgsql.replication_lag[sec]', float(lag[0][0]))
         else:
             Pooler.run_sql_type('replication_lag_master_query')
-        # xlog location
+
         if Pooler.server_version_greater('10.0'):
             result = Pooler.query(self.query_wal_lsn_diff)
+            result_lags = Pooler.query(self.query_wal_lag_lsn)
+            if result_lags:
+                lags = []
+                for info in result_lags:
+                    lags.append({'{#APPLICATION_NAME}': info[0]})
+                    zbx.send('pgsql.replication.flush_lag[{0}]'.format(
+                        info[0]), info[1])
+                    zbx.send('pgsql.replication.replay_lag[{0}]'.format(
+                        info[0]), info[2])
+                    zbx.send('pgsql.replication.write_lag[{0}]'.format(
+                        info[0]), info[3])
+                    zbx.send('pgsql.replication.total_lag[{0}]'.format(
+                        info[0]), float(info[4]), self.DELTA_SPEED)
+                zbx.send('pgsql.replication.discovery[]', zbx.json({'data': lags}))
+                del lags
         else:
             result = Pooler.query(self.query_xlog_lsn_diff)
+            result_lags = Pooler.query(self.query_xlog_lag_lsn)
+            if result_lags:
+                lags = []
+                for info in result_lags:
+                    lags.append({'{#APPLICATION_NAME}': info[0]})
+
+                    zbx.send('pgsql.replication.total_lag[{0}]'.format(
+                        info[0]), float(info[1]), self.DELTA_SPEED)
+                zbx.send('pgsql.replication.discovery[]', zbx.json({'data': lags}))
+                del lags
+
         zbx.send(self.key_wall.format("[]"), float(result[0][0]), self.DELTA_SPEED)
+
         # count of xlog files
         if Pooler.server_version_greater('10.0'):
             result = Pooler.run_sql_type('count_wal_files')
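
A note on the discovery item: zbx.send('pgsql.replication.discovery[]', zbx.json({'data': lags})) emits a Zabbix low-level-discovery payload keyed by the {#APPLICATION_NAME} macro. A minimal sketch of the equivalent JSON, built with json.dumps and two hypothetical standby names, looks like this:

    import json

    # Hypothetical standby names; in run() they come from application_name in pg_stat_replication.
    lags = [
        {'{#APPLICATION_NAME}': 'standby1'},
        {'{#APPLICATION_NAME}': 'standby2'},
    ]

    payload = json.dumps({'data': lags})
    print(payload)
    # {"data": [{"{#APPLICATION_NAME}": "standby1"}, {"{#APPLICATION_NAME}": "standby2"}]}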
@@ -89,6 +140,59 @@ def triggers(self, template):
                 self.plugin_config('lag_more_than_in_sec')
         })
 
+    def discovery_rules(self, template):
+        rule = {
+            'name': 'Replication lag discovery',
+            'key': self.key_lsn_replication_discovery.format('[{0}]'.format(self.Macros[self.Type])),
+
+        }
+        if Plugin.old_zabbix:
+            conditions = []
+            rule['filter'] = '{#APPLICATION_NAME}:.*'
+        else:
+            conditions = [
+                {
+                    'condition': [
+                        {'macro': '{#APPLICATION_NAME}',
+                         'value': '.*',
+                         'operator': None,
+                         'formulaid': 'A'}
+                    ]
+                }
+
+            ]
+        items = [
+            {'key': self.right_type(self.key_flush, var_discovery="{#APPLICATION_NAME},"),
+             'name': 'Time elapsed between flushing recent WAL locally and receiving notification that '
+                     'this standby server {#APPLICATION_NAME} has written and flushed it',
+             'value_type': Plugin.VALUE_TYPE.text,
+             'delay': self.plugin_config('interval')},
+            {'key': self.right_type(self.key_replay, var_discovery="{#APPLICATION_NAME},"),
+             'name': 'Time elapsed between flushing recent WAL locally and receiving notification that '
+                     'this standby server {#APPLICATION_NAME} has written, flushed and applied it',
+             'value_type': Plugin.VALUE_TYPE.text,
+             'delay': self.plugin_config('interval')},
+            {'key': self.right_type(self.key_write, var_discovery="{#APPLICATION_NAME},"),
+             'name': 'Time elapsed between flushing recent WAL locally and receiving notification that '
+                     'this standby server {#APPLICATION_NAME} has written it',
+             'value_type': Plugin.VALUE_TYPE.text,
+             'delay': self.plugin_config('interval')},
+            {'key': self.right_type(self.key_total_lag, var_discovery="{#APPLICATION_NAME},"),
+             'name': 'Delta of total lag for {#APPLICATION_NAME}',
+             'value_type': Plugin.VALUE_TYPE.numeric_float,
+             'delay': self.plugin_config('interval')}
+        ]
+        graphs = [
+            {
+                'name': 'Delta of total lag for {#APPLICATION_NAME}',
+                'items': [
+                    {'color': 'CC0000',
+                     'key': self.right_type(self.key_total_lag, var_discovery="{#APPLICATION_NAME},")},
+                ]
+            }
+        ]
+        return template.discovery_rule(rule=rule, conditions=conditions, items=items, graphs=graphs)
+
     def keys_and_queries(self, template_zabbix):
         result = []
         if LooseVersion(self.VersionPG) < LooseVersion('10'):

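As a usage note, run() formats the per-replica keys inline, while discovery_rules() derives item prototypes from the key_* templates via right_type(). A rough sketch of the key names that result for one standby (the name 'standby1' is hypothetical):

    # Key templates from the class above; 'standby1' stands in for a real application_name.
    key_total_lag = 'pgsql.replication.total_lag{0}'
    key_flush = 'pgsql.replication.flush_lag{0}'

    app = 'standby1'
    print(key_total_lag.format('[{0}]'.format(app)))  # pgsql.replication.total_lag[standby1]
    print(key_flush.format('[{0}]'.format(app)))      # pgsql.replication.flush_lag[standby1]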