start_server {tags {"psync2"}} {
start_server {} {
start_server {} {
start_server {} {
start_server {} {
    set master_id 0                 ; # Current master
    set start_time [clock seconds]  ; # Test start time
    set counter_value 0             ; # Current value of the Redis counter "x"

    # Config
    set debug_msg 0                 ; # Enable additional debug messages

    set no_exit 0                   ; # Do not exit at end of the test

    set duration 20                 ; # Total test seconds

    set genload 1                   ; # Load master with writes at every cycle

    set genload_time 5000           ; # Duration of the write load in ms

    set disconnect 1                ; # Break the replication link between random
                                      # master and slave instances while the
                                      # master is loaded with writes.

    set disconnect_period 1000      ; # Disconnect repl link every N ms.

    # Collect client handles, hosts and ports for the five servers.
    for {set j 0} {$j < 5} {incr j} {
        set R($j) [srv [expr 0-$j] client]
        set R_host($j) [srv [expr 0-$j] host]
        set R_port($j) [srv [expr 0-$j] port]
        if {$debug_msg} {puts "Log file: [srv [expr 0-$j] stdout]"}
    }

    set cycle 1
    while {([clock seconds]-$start_time) < $duration} {
        test "PSYNC2: --- CYCLE $cycle ---" {}
        incr cycle

        # Create a random replication layout.
        # Start with switching master (this simulates a failover).

        # 1) Select the new master.
        set master_id [randomInt 5]
        set used [list $master_id]
        test "PSYNC2: \[NEW LAYOUT\] Set #$master_id as master" {
            $R($master_id) slaveof no one
            if {$counter_value == 0} {
                $R($master_id) set x $counter_value
            }
        }

        # 2) Attach all the slaves to a random instance.
        while {[llength $used] != 5} {
            while 1 {
                set slave_id [randomInt 5]
                if {[lsearch -exact $used $slave_id] == -1} break
            }
            set rand [randomInt [llength $used]]
            set mid [lindex $used $rand]
            set master_host $R_host($mid)
            set master_port $R_port($mid)

            test "PSYNC2: Set #$slave_id to replicate from #$mid" {
                $R($slave_id) slaveof $master_host $master_port
            }
            lappend used $slave_id
        }

        # 3) Increment the counter and wait for all the instances
        # to converge.
        test "PSYNC2: cluster is consistent after failover" {
            $R($master_id) incr x; incr counter_value
            for {set j 0} {$j < 5} {incr j} {
                wait_for_condition 50 1000 {
                    [$R($j) get x] == $counter_value
                } else {
                    fail "Instance #$j x variable is inconsistent"
                }
            }
        }

        # 4) Generate load while breaking the connection of random
        # slave-master pairs.
84 test "PSYNC2: generate load while killing replication links" { 85 set t [clock milliseconds] 86 set next_break [expr {$t+$disconnect_period}] 87 while {[clock milliseconds]-$t < $genload_time} { 88 if {$genload} { 89 $R($master_id) incr x; incr counter_value 90 } 91 if {[clock milliseconds] == $next_break} { 92 set next_break \ 93 [expr {[clock milliseconds]+$disconnect_period}] 94 set slave_id [randomInt 5] 95 if {$disconnect} { 96 $R($slave_id) client kill type master 97 if {$debug_msg} { 98 puts "+++ Breaking link for replica #$slave_id" 99 } 100 } 101 } 102 } 103 } 104 105 # 5) Increment the counter and wait for all the instances 106 set x [$R($master_id) get x] 107 test "PSYNC2: cluster is consistent after load (x = $x)" { 108 for {set j 0} {$j < 5} {incr j} { 109 wait_for_condition 50 1000 { 110 [$R($j) get x] == $counter_value 111 } else { 112 fail "Instance #$j x variable is inconsistent" 113 } 114 } 115 } 116 117 # Put down the old master so that it cannot generate more 118 # replication stream, this way in the next master switch, the time at 119 # which we move slaves away is not important, each will have full 120 # history (otherwise PINGs will make certain slaves have more history), 121 # and sometimes a full resync will be needed. 122 $R($master_id) slaveof 127.0.0.1 0 ;# We use port zero to make it fail. 123 124 if {$debug_msg} { 125 for {set j 0} {$j < 5} {incr j} { 126 puts "$j: sync_full: [status $R($j) sync_full]" 127 puts "$j: id1 : [status $R($j) master_replid]:[status $R($j) master_repl_offset]" 128 puts "$j: id2 : [status $R($j) master_replid2]:[status $R($j) second_repl_offset]" 129 puts "$j: backlog : firstbyte=[status $R($j) repl_backlog_first_byte_offset] len=[status $R($j) repl_backlog_histlen]" 130 puts "---" 131 } 132 } 133 134 test "PSYNC2: total sum of full synchronizations is exactly 4" { 135 set sum 0 136 for {set j 0} {$j < 5} {incr j} { 137 incr sum [status $R($j) sync_full] 138 } 139 assert {$sum == 4} 140 } 141 142 # Limit anyway the maximum number of cycles. This is useful when the 143 # test is skipped via --only option of the test suite. In that case 144 # we don't want to see many seconds of this test being just skipped. 
        if {$cycle > 50} break
    }

    test "PSYNC2: Bring the master back again for next test" {
        $R($master_id) slaveof no one
        set master_host $R_host($master_id)
        set master_port $R_port($master_id)
        for {set j 0} {$j < 5} {incr j} {
            if {$j == $master_id} continue
            $R($j) slaveof $master_host $master_port
        }

        # Wait for the slaves to sync.
        wait_for_condition 50 1000 {
            [status $R($master_id) connected_slaves] == 4
        } else {
            fail "Replica not reconnecting"
        }
    }

    test "PSYNC2: Partial resync after restart using RDB aux fields" {
        # Pick a random slave
        set slave_id [expr {($master_id+1)%5}]
        set sync_count [status $R($master_id) sync_full]
        set sync_partial [status $R($master_id) sync_partial_ok]
        catch {
            $R($slave_id) config rewrite
            $R($slave_id) debug restart
        }
        # Note: just waiting for connected_slaves==4 would be racy, since we
        # might run the check before the master has realized that the slave
        # disconnected.
        wait_for_condition 50 1000 {
            [status $R($master_id) sync_partial_ok] == $sync_partial + 1
        } else {
            fail "Replica not reconnecting"
        }
        set new_sync_count [status $R($master_id) sync_full]
        assert {$sync_count == $new_sync_count}
    }

    test "PSYNC2: Replica RDB restart with EVALSHA in backlog issue #4483" {
        # Pick a random slave
        set slave_id [expr {($master_id+1)%5}]
        set sync_count [status $R($master_id) sync_full]

        # Make sure to replicate the first EVAL while the slave is online,
        # so that the script becomes part of the set the master believes is
        # safe to propagate as EVALSHA.
        $R($master_id) EVAL {return redis.call("incr","__mycounter")} 0
        # SHA1 of the script sent with the EVAL above.
        $R($master_id) EVALSHA e6e0b547500efcec21eddb619ac3724081afee89 0

        # Wait for the two to sync.
        wait_for_condition 50 1000 {
            [$R($master_id) debug digest] == [$R($slave_id) debug digest]
        } else {
            fail "Replica not reconnecting"
        }

        # Prevent the slave from receiving master updates, and at the same
        # time send the script several times to the master, so that we end
        # up with EVALSHA in the backlog.
        $R($slave_id) slaveof 127.0.0.1 0

        $R($master_id) EVALSHA e6e0b547500efcec21eddb619ac3724081afee89 0
        $R($master_id) EVALSHA e6e0b547500efcec21eddb619ac3724081afee89 0
        $R($master_id) EVALSHA e6e0b547500efcec21eddb619ac3724081afee89 0

        catch {
            $R($slave_id) config rewrite
            $R($slave_id) debug restart
        }

        # Reconfigure the slave correctly again once it's back online.
        set retry 50
        while {$retry} {
            if {[catch {
                $R($slave_id) slaveof $master_host $master_port
            }]} {
                after 1000
            } else {
                break
            }
            incr retry -1
        }

        # The master should be back at 4 slaves eventually.
        wait_for_condition 50 1000 {
            [status $R($master_id) connected_slaves] == 4
        } else {
            fail "Replica not reconnecting"
        }
        set new_sync_count [status $R($master_id) sync_full]
        assert {$sync_count == $new_sync_count}

        # However, if the slave restarted with the full state of the
        # scripting engine, we should now have the same digest.
        wait_for_condition 50 1000 {
            [$R($master_id) debug digest] == [$R($slave_id) debug digest]
        } else {
            fail "Debug digest mismatch between master and replica in post-restart handshake"
        }
    }

    if {$no_exit} {
        while 1 { puts -nonewline .; flush stdout; after 1000}
    }

}}}}}