Monday, September 23, 2013

PostgreSQL 9.2 Master – Slave Monitoring

Nagios plugin script written in Bash to check the master-slave replication in PostgreSQL (tested on PostgreSQL 9.2.4) (executed on the slave).
The script will report how many bytes the slave server is behind, and how many seconds ago the last replay of data occurred.
The script must be executed as ‘postgres’ user.
?
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
#!/bin/bash
 
# $Id: check_slave_replication.sh 3421 2013-08-09 07:52:44Z jmorano $
 
STATE_OK=0
STATE_WARNING=1
STATE_CRITICAL=2
STATE_UNKNOWN=3
  
## Master (p_) and Slave (s_) DB Server Information
export s_host=$1
export s_port=$2
export p_db=$3
export p_host=$4
export p_port=$5
  
export psql=/opt/postgresql/bin/psql
export bc=/usr/bin/bc
  
## Limits
export  critical_limit=83886080 # 5 * 16MB, size of 5 WAL files
export   warning_limit=16777216 # 16 MB, size of 1 WAL file
  
master_lag=$($psql -U postgres -h$p_host -p$p_port -A -t -c "SELECT pg_xlog_location_diff(pg_current_xlog_location(), '0/0') AS offset" $p_db)
slave_lag=$($psql -U postgres  -h$s_host -p$s_port -A -t -c "SELECT pg_xlog_location_diff(pg_last_xlog_receive_location(), '0/0') AS receive" $p_db)
replay_lag=$($psql -U postgres -h$s_host -p$s_port -A -t -c "SELECT pg_xlog_location_diff(pg_last_xlog_replay_location(), '0/0') AS replay" $p_db)
replay_timediff=$($psql -U postgres -h$s_host -p$s_port -A -t -c "SELECT NOW() - pg_last_xact_replay_timestamp() AS replication_delay" $p_db)
  
if [[ $master_lag -eq '' || $slave_lag -eq '' || $replay_lag -eq '' ]]; then
    echo "CRITICAL: Stream has no value to compare (is replication configured or connectivity problem?)"
    exit $STATE_CRITICAL
else
    if [[ $master_lag -eq $slave_lag && $master_lag -eq $replay_lag && $slave_lag -eq $replay_lag ]] ; then
        echo "OK: Stream: MASTER:$master_lag Slave:$slave_lag Replay:$replay_lag"
        exit $STATE_OK
    else
        if [[ $master_lag -eq $slave_lag ]] ; then
            if [[ $master_lag -ne $replay_lag ]] ; then
                if [ $(bc <<< $master_lag-$replay_lag) -lt $warning_limit ]; then
                    echo "OK: Stream: MASTER:$master_lag Replay:$replay_lag :: REPLAY BEHIND"
                    exit $STATE_OK
                else
                    echo "WARNING: Stream: MASTER:$master_lag Replay:$replay_lag :: REPLAY $(bc <<< $master_lag-$replay_lag)bytes BEHIND (${replay_timediff}seconds)"
                    exit $STATE_WARNING
                fi
            fi
        else
            if [ $(bc <<< $master_lag-$slave_lag) -gt $critical_limit ]; then
                echo "CRITICAL: Stream: MASTER:$master_lag Slave:$slave_lag :: STREAM BEYOND CRITICAL LIMIT ($(bc <<< $master_lag-$slave_lag)bytes)"
                exit $STATE_CRITICAL
            else
                if [ $(bc <<< $master_lag-$slave_lag) -lt $warning_limit ]; then
                    echo "OK: Stream: MASTER:$master_lag Slave:$slave_lag Replay:$replay_lag :: STREAM BEHIND"
                    exit $STATE_OK
                else
                    echo "WARNING: Stream: MASTER:$master_lag Slave:$slave_lag :: STREAM BEYOND WARNING LIMIT ($(bc <<< $master_lag-$replay_lag)bytes)"
                    exit $STATE_WARNING
                fi
            fi
        fi
        echo "UNKNOWN: Stream: MASTER: $master_lag Slave: $slave_lag Replay: $replay_lag"
        exit $STATE_UNKNOWN
    fi
fi
Possible outputs:
?
1
2
3
4
5
6
7
8
$ bash check_slave_replication.sh 192.168.0.1 5432 live 192.168.0.2 5432
WARNING: Stream: MASTER:1907958306184 Replay:1907878056888 :: REPLAY 80249296bytes BEHIND (00:03:14.056747seconds)
$ bash check_slave_replication.sh 192.168.0.1 5432 live 192.168.0.2 5432
OK: Stream: MASTER:2055690128376 Slave:2055690143144 Replay:2055690193744 :: STREAM BEHIND
$ bash check_slave_replication.sh 192.168.0.1 5432 live 192.168.0.2 5432
OK: Stream: MASTER:2055690497120 Replay:2055690497328 :: REPLAY BEHIND
$ bash check_slave_replication.sh 192.168.0.1 5432 live 192.168.0.2 5432
OK: Stream: MASTER:2055691704672 Slave:2055691704672 Replay:2055691704672

No comments:

Post a Comment