Redis哨兵节点管理以及高可用redis集群的容灾演练

发布 : 2017-06-15 分类 : 大数据 浏览 :

容灾演练

演练环境准备

1
2
(1).Redis集群服务已启动
(2).Redis哨兵集群服务已启动

进入哨兵命令行

1
[root@matrix-cache01 ~]# redis-cli -h 192.168.31.231 -p 5000

通过哨兵查看当前的master

1
2
3
192.168.31.231:5000> SENTINEL get-master-addr-by-name mymaster
1) "192.168.31.231"
2) "6379"

查看哨兵集群

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
192.168.31.231:5000> sentinel sentinels mymaster
1) 1) "name"
2) "707fb3721cfa9fd1edd4059da134614a41840745"
3) "ip"
4) "192.168.31.233"
5) "port"
6) "5000"
7) "runid"
8) "707fb3721cfa9fd1edd4059da134614a41840745"
9) "flags"
10) "sentinel"
11) "link-pending-commands"
12) "0"
13) "link-refcount"
14) "1"
15) "last-ping-sent"
16) "0"
17) "last-ok-ping-reply"
18) "555"
19) "last-ping-reply"
20) "555"
21) "down-after-milliseconds"
22) "1000"
23) "last-hello-message"
24) "220"
25) "voted-leader"
26) "?"
27) "voted-leader-epoch"
28) "0"
2) 1) "name"
2) "61e7c8a7ba924ee22fab0d611f928139e4c9dbbf"
3) "ip"
4) "192.168.31.232"
5) "port"
6) "5000"
7) "runid"
8) "61e7c8a7ba924ee22fab0d611f928139e4c9dbbf"
9) "flags"
10) "sentinel"
11) "link-pending-commands"
12) "0"
13) "link-refcount"
14) "1"
15) "last-ping-sent"
16) "0"
17) "last-ok-ping-reply"
18) "555"
19) "last-ping-reply"
20) "555"
21) "down-after-milliseconds"
22) "1000"
23) "last-hello-message"
24) "483"
25) "voted-leader"
26) "?"
27) "voted-leader-epoch"
28) "0"

kill -9掉Redis master,同时删除pid文件

1
2
3
4
[root@matrix-cache01 ~]# ps aux | grep redis
root 880 0.1 0.2 33192 2152 ? Ssl 11:23 0:01 /usr/local/bin/redis-server 192.168.31.231:6379
root 952 0.2 0.2 31608 2268 pts/0 Sl+ 11:31 0:01 redis-sentinel 192.168.31.231:5000 [sentinel]
root 973 0.0 0.0 5976 764 pts/1 S+ 11:38 0:00 grep redis
1
2
3
4
5
[root@matrix-cache01 ~]# kill -9 880
[root@matrix-cache01 ~]# ps aux | grep redis
root 952 0.2 0.2 31608 2280 pts/0 Sl+ 11:31 0:01 redis-sentinel 192.168.31.231:5000 [sentinel]
root 975 0.0 0.0 5976 768 pts/1 S+ 11:39 0:00 grep redis
[root@matrix-cache01 ~]# rm -rf /var/run/redis_6379.pid

查看sentinel的日志

1
2
出现+sdown字样,识别出了master的宕机问题
然后出现+odown字样,说明达到quorum指定数量的哨兵都认为master宕机了

matrix-cache01

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
952:X 24 Jun 11:39:17.430 # +sdown master mymaster 192.168.31.231 6379
952:X 24 Jun 11:39:17.483 # +odown master mymaster 192.168.31.231 6379 #quorum 2/2
952:X 24 Jun 11:39:17.483 # +new-epoch 1
952:X 24 Jun 11:39:17.483 # +try-failover master mymaster 192.168.31.231 6379
952:X 24 Jun 11:39:17.498 # +vote-for-leader 398099b3bd68a6ac6d264d6d7c651e9d3b25a61c 1
952:X 24 Jun 11:39:17.512 # 707fb3721cfa9fd1edd4059da134614a41840745 voted for 398099b3bd68a6ac6d264d6d7c651e9d3b25a61c 1
952:X 24 Jun 11:39:17.512 # 61e7c8a7ba924ee22fab0d611f928139e4c9dbbf voted for 398099b3bd68a6ac6d264d6d7c651e9d3b25a61c 1
952:X 24 Jun 11:39:17.599 # +elected-leader master mymaster 192.168.31.231 6379
952:X 24 Jun 11:39:17.599 # +failover-state-select-slave master mymaster 192.168.31.231 6379
952:X 24 Jun 11:39:17.666 # +selected-slave slave 192.168.31.233:6379 192.168.31.233 6379 @ mymaster 192.168.31.231 6379
952:X 24 Jun 11:39:17.666 * +failover-state-send-slaveof-noone slave 192.168.31.233:6379 192.168.31.233 6379 @ mymaster 192.168.31.231 6379
952:X 24 Jun 11:39:17.721 * +failover-state-wait-promotion slave 192.168.31.233:6379 192.168.31.233 6379 @ mymaster 192.168.31.231 6379
952:X 24 Jun 11:39:18.544 # +promoted-slave slave 192.168.31.233:6379 192.168.31.233 6379 @ mymaster 192.168.31.231 6379
952:X 24 Jun 11:39:18.544 # +failover-state-reconf-slaves master mymaster 192.168.31.231 6379
952:X 24 Jun 11:39:18.618 * +slave-reconf-sent slave 192.168.31.232:6379 192.168.31.232 6379 @ mymaster 192.168.31.231 6379
952:X 24 Jun 11:39:19.589 * +slave-reconf-inprog slave 192.168.31.232:6379 192.168.31.232 6379 @ mymaster 192.168.31.231 6379
952:X 24 Jun 11:39:19.589 * +slave-reconf-done slave 192.168.31.232:6379 192.168.31.232 6379 @ mymaster 192.168.31.231 6379
952:X 24 Jun 11:39:19.672 # -odown master mymaster 192.168.31.231 6379
952:X 24 Jun 11:39:19.672 # +failover-end master mymaster 192.168.31.231 6379
952:X 24 Jun 11:39:19.672 # +switch-master mymaster 192.168.31.231 6379 192.168.31.233 6379
952:X 24 Jun 11:39:19.672 * +slave slave 192.168.31.232:6379 192.168.31.232 6379 @ mymaster 192.168.31.233 6379
952:X 24 Jun 11:39:19.672 * +slave slave 192.168.31.231:6379 192.168.31.231 6379 @ mymaster 192.168.31.233 6379
952:X 24 Jun 11:39:20.739 # +sdown slave 192.168.31.231:6379 192.168.31.231 6379 @ mymaster 192.168.31.233 6379

matrix-cache02

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
953:X 24 Jun 11:31:22.280 # WARNING: The TCP backlog setting of 511 cannot be enforced because /proc/sys/net/core/somaxconn is set to the lower value of 128.
953:X 24 Jun 11:31:22.280 # Sentinel ID is 61e7c8a7ba924ee22fab0d611f928139e4c9dbbf
953:X 24 Jun 11:31:22.280 # +monitor master mymaster 192.168.31.231 6379 quorum 2
953:X 24 Jun 11:31:23.286 # +sdown sentinel 707fb3721cfa9fd1edd4059da134614a41840745 192.168.31.233 5000 @ mymaster 192.168.31.231 6379
953:X 24 Jun 11:31:25.486 # -sdown sentinel 707fb3721cfa9fd1edd4059da134614a41840745 192.168.31.233 5000 @ mymaster 192.168.31.231 6379
953:X 24 Jun 11:39:17.417 # +sdown master mymaster 192.168.31.231 6379
953:X 24 Jun 11:39:17.511 # +new-epoch 1
953:X 24 Jun 11:39:17.513 # +vote-for-leader 398099b3bd68a6ac6d264d6d7c651e9d3b25a61c 1
953:X 24 Jun 11:39:18.528 # +odown master mymaster 192.168.31.231 6379 #quorum 3/2
953:X 24 Jun 11:39:18.528 # Next failover delay: I will not start a failover before Sat Jun 24 11:39:27 2017
953:X 24 Jun 11:39:18.621 # +config-update-from sentinel 398099b3bd68a6ac6d264d6d7c651e9d3b25a61c 192.168.31.231 5000 @ mymaster 192.168.31.231 6379
953:X 24 Jun 11:39:18.621 # +switch-master mymaster 192.168.31.231 6379 192.168.31.233 6379
953:X 24 Jun 11:39:18.621 * +slave slave 192.168.31.232:6379 192.168.31.232 6379 @ mymaster 192.168.31.233 6379
953:X 24 Jun 11:39:18.621 * +slave slave 192.168.31.231:6379 192.168.31.231 6379 @ mymaster 192.168.31.233 6379
953:X 24 Jun 11:39:19.622 # +sdown slave 192.168.31.231:6379 192.168.31.231 6379 @ mymaster 192.168.31.233 6379

matrix-cache03

1
2
3
4
5
6
7
8
9
10
11
12
13
937:X 24 Jun 11:31:22.965 # WARNING: The TCP backlog setting of 511 cannot be enforced because /proc/sys/net/core/somaxconn is set to the lower value of 128.
937:X 24 Jun 11:31:22.966 # Sentinel ID is 707fb3721cfa9fd1edd4059da134614a41840745
937:X 24 Jun 11:31:22.966 # +monitor master mymaster 192.168.31.231 6379 quorum 2
937:X 24 Jun 11:39:16.141 # +sdown master mymaster 192.168.31.231 6379
937:X 24 Jun 11:39:16.180 # +new-epoch 1
937:X 24 Jun 11:39:16.182 # +vote-for-leader 398099b3bd68a6ac6d264d6d7c651e9d3b25a61c 1
937:X 24 Jun 11:39:16.217 # +odown master mymaster 192.168.31.231 6379 #quorum 3/2
937:X 24 Jun 11:39:16.217 # Next failover delay: I will not start a failover before Sat Jun 24 11:39:26 2017
937:X 24 Jun 11:39:17.289 # +config-update-from sentinel 398099b3bd68a6ac6d264d6d7c651e9d3b25a61c 192.168.31.231 5000 @ mymaster 192.168.31.231 6379
937:X 24 Jun 11:39:17.289 # +switch-master mymaster 192.168.31.231 6379 192.168.31.233 6379
937:X 24 Jun 11:39:17.290 * +slave slave 192.168.31.232:6379 192.168.31.232 6379 @ mymaster 192.168.31.233 6379
937:X 24 Jun 11:39:17.290 * +slave slave 192.168.31.231:6379 192.168.31.231 6379 @ mymaster 192.168.31.233 6379
937:X 24 Jun 11:39:18.326 # +sdown slave 192.168.31.231:6379 192.168.31.231 6379 @ mymaster 192.168.31.233 6379

通过哨兵查看现在的master

1
2
3
4
[root@matrix-cache01 ~]# redis-cli -h 192.168.31.231 -p 5000
192.168.31.231:5000> SENTINEL get-master-addr-by-name mymaster
1) "192.168.31.233"
2) "6379"
1
可以发现当192.168.31.231宕机后,哨兵集群选举了192.168.31.233为master

查看新master

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
[root@matrix-cache03 ~]# redis-cli -h 192.168.31.233 -p 5000
192.168.31.233:5000> sentinel master mymaster
1) "name"
2) "mymaster"
3) "ip"
4) "192.168.31.233"
5) "port"
6) "6379"
7) "runid"
8) "7945069d6501eeaeaa59a15c6de658f16a9d8f65"
9) "flags"
10) "master"
11) "link-pending-commands"
12) "0"
13) "link-refcount"
14) "1"
15) "last-ping-sent"
16) "0"
17) "last-ok-ping-reply"
18) "972"
19) "last-ping-reply"
20) "972"
21) "down-after-milliseconds"
22) "1000"
23) "info-refresh"
24) "88"
25) "role-reported"
26) "master"
27) "role-reported-time"
28) "539597"
29) "config-epoch"
30) "1"
31) "num-slaves"
32) "2"
33) "num-other-sentinels"
34) "2"
35) "quorum"
36) "2"
37) "failover-timeout"
38) "5000"
39) "parallel-syncs"
40) "1"

将旧的master重新启动,查看是否被哨兵自动切换成slave节点

1
2
3
[root@matrix-cache01 ~]# cd /etc/init.d
[root@matrix-cache01 init.d]# ./redis_6379 start
Starting Redis server...
1
2
3
4
[root@matrix-cache01 ~]# redis-cli -h 192.168.31.231 -p 5000
192.168.31.231:5000> SENTINEL get-master-addr-by-name mymaster
1) "192.168.31.233"
2) "6379"
1
2
3
4
5
6
7
8
9
(1).三个哨兵进程都认为master是sdown了
(2).超过quorum指定的哨兵进程都认为sdown之后,就变为odown
(3).哨兵1是被选举为要执行后续的主备切换的那个哨兵
(4).哨兵1去新的master(slave)获取了一个新的config version
(5).尝试执行failover
(6).投票选举出一个slave去切换成master,每个哨兵都会执行一次投票
(7).让slave执行slaveof no one,不让它去做任何节点的slave了; 把slave提拔成master; 旧的master认为不再是master了
(8).哨兵就自动认为之前的192.168.31.231:6379变成了slave了,192.168.31.233:6379变成了master了
(9).哨兵去探查了一下192.168.31.231:6379这个slave的状态,认为它sdown了
本文作者 : Matrix
原文链接 : https://matrixsparse.github.io/2017/06/15/Redis哨兵节点管理以及高可用redis集群的容灾演练/
版权声明 : 本博客所有文章除特别声明外,均采用 CC BY-NC-SA 4.0 许可协议。转载请注明出处!

知识 & 情怀 | 二者兼得

微信扫一扫, 向我投食

微信扫一扫, 向我投食

支付宝扫一扫, 向我投食

支付宝扫一扫, 向我投食

留下足迹