@@ -82,6 +82,7 @@ type RaftNode interface {
8282 Delete ()
8383 RecreateInternalSubs () error
8484 IsSystemAccount () bool
85+ Quiesce () error
8586}
8687
8788type WAL interface {
@@ -225,6 +226,9 @@ type raft struct {
225226 observer bool // The node is observing, i.e. not able to become leader
226227 initializing bool // The node is new, and "empty log" checks can be temporarily relaxed.
227228 scaleUp bool // The node is part of a scale up, puts us in observer mode until the log contains data.
229+
230+ quiesce chan bool // Channel to notify leader loop to quiesce
231+ quiesced bool // The node is quiesced
228232}
229233
230234type proposedEntry struct {
@@ -260,6 +264,7 @@ const (
260264 lostQuorumCheckIntervalDefault = hbIntervalDefault * 10 // 10 seconds
261265 observerModeIntervalDefault = 48 * time .Hour
262266 peerRemoveTimeoutDefault = 5 * time .Minute
267+ quiesceIntervalDefault = 15 * time .Minute
263268)
264269
265270var (
@@ -272,6 +277,7 @@ var (
272277 lostQuorumCheck = lostQuorumCheckIntervalDefault
273278 observerModeInterval = observerModeIntervalDefault
274279 peerRemoveTimeout = peerRemoveTimeoutDefault
280+ quiesceInterval = quiesceIntervalDefault
275281)
276282
277283type RaftConfig struct {
@@ -426,6 +432,7 @@ func (s *Server) initRaftNode(accName string, cfg *RaftConfig, labels pprofLabel
426432 leadc : make (chan bool , 32 ),
427433 observer : cfg .Observer ,
428434 extSt : ps .domainExt ,
435+ quiesce : make (chan bool ),
429436 }
430437
431438 // Setup our internal subscriptions for proposals, votes and append entries.
@@ -1601,6 +1608,44 @@ func (n *raft) selectNextLeader() string {
16011608 return nextLeader
16021609}
16031610
1611+ func (n * raft ) Quiesce () error {
1612+ if n .State () != Leader {
1613+ return errNotLeader
1614+ }
1615+ n .quiesce <- true
1616+ return nil
1617+ }
1618+
1619+ // Return true if the node can be quiesced
1620+ func (n * raft ) mayQuiesce () bool {
1621+ n .RLock ()
1622+ defer n .RUnlock ()
1623+ // TODO this test should be strengthened:
1624+ // must check that followers are up-to-date
1625+ return ! n .quiesced && n .State () == Leader && n .hasQuorumLocked ()
1626+ }
1627+
1628+ func (n * raft ) doQuiesce () bool {
1629+ if n .mayQuiesce () {
1630+ n .sendQuiesce ()
1631+ n .setQuiesced (true )
1632+ return true
1633+ }
1634+ return false
1635+ }
1636+
1637+ func (n * raft ) isQuiesced () bool {
1638+ n .RLock ()
1639+ defer n .RUnlock ()
1640+ return n .quiesced
1641+ }
1642+
1643+ func (n * raft ) setQuiesced (quiesced bool ) {
1644+ n .Lock ()
1645+ defer n .Unlock ()
1646+ n .quiesced = quiesced
1647+ }
1648+
16041649// StepDown will have a leader stepdown and optionally do a leader transfer.
16051650func (n * raft ) StepDown (preferred ... string ) error {
16061651 if n .State () != Leader {
@@ -2140,8 +2185,13 @@ func (n *raft) runAsFollower() {
21402185
21412186 select {
21422187 case <- n .entry .ch :
2188+ wasQuiesced := n .isQuiesced ()
21432189 // New append entries have arrived over the network.
21442190 n .processAppendEntries ()
2191+ if ! wasQuiesced && n .isQuiesced () {
2192+ // Avoid unquiescing immediately
2193+ continue
2194+ }
21452195 case <- n .s .quitCh :
21462196 // The server is shutting down.
21472197 return
@@ -2188,6 +2238,11 @@ func (n *raft) runAsFollower() {
21882238 n .processVoteRequest (voteReq )
21892239 }
21902240 }
2241+
2242+ if n .isQuiesced () {
2243+ n .setQuiesced (false )
2244+ n .debug ("Follower unquiesced" )
2245+ }
21912246 }
21922247}
21932248
@@ -2308,6 +2363,7 @@ const (
23082363 EntryRemovePeer
23092364 EntryLeaderTransfer
23102365 EntrySnapshot
2366+ EntryQuiesce
23112367)
23122368
23132369func (t EntryType ) String () string {
@@ -2326,6 +2382,8 @@ func (t EntryType) String() string {
23262382 return "LeaderTransfer"
23272383 case EntrySnapshot :
23282384 return "Snapshot"
2385+ case EntryQuiesce :
2386+ return "Quiesce"
23292387 }
23302388 return fmt .Sprintf ("Unknown [%d]" , uint8 (t ))
23312389}
@@ -2585,10 +2643,15 @@ func (n *raft) runAsLeader() {
25852643 n .sendPeerState ()
25862644
25872645 hb := time .NewTicker (hbInterval )
2588- defer hb .Stop ()
2589-
25902646 lq := time .NewTicker (lostQuorumCheck )
2591- defer lq .Stop ()
2647+ qu := time .NewTicker (quiesceInterval )
2648+
2649+ stopTicking := func () {
2650+ hb .Stop ()
2651+ lq .Stop ()
2652+ qu .Stop ()
2653+ }
2654+ defer stopTicking ()
25922655
25932656 for n .State () == Leader {
25942657 select {
@@ -2602,6 +2665,12 @@ func (n *raft) runAsLeader() {
26022665 n .processAppendEntryResponse (ar )
26032666 }
26042667 n .resp .recycle (& ars )
2668+ // TODO follower could avoid sending a response
2669+ // for EntryQuiesce
2670+ if n .isQuiesced () {
2671+ // Avoid unquiescing immediately
2672+ continue
2673+ }
26052674 case <- n .prop .ch :
26062675 const maxBatch = 256 * 1024
26072676 const maxEntries = 512
@@ -2664,15 +2733,31 @@ func (n *raft) runAsLeader() {
26642733 }
26652734 case <- n .entry .ch :
26662735 n .processAppendEntries ()
2736+ case <- qu .C :
2737+ if time .Since (n .active ) > quiesceInterval && n .doQuiesce () {
2738+ stopTicking ()
2739+ continue
2740+ }
2741+ case <- n .quiesce :
2742+ if n .doQuiesce () {
2743+ stopTicking ()
2744+ continue
2745+ }
2746+ }
2747+
2748+ // Any interaction unquiesces the leader
2749+ if n .isQuiesced () {
2750+ hb .Reset (hbInterval )
2751+ lq .Reset (lostQuorumInterval )
2752+ qu .Reset (quiesceInterval )
2753+ n .setQuiesced (false )
2754+ n .debug ("Leader unquiesced" )
26672755 }
26682756 }
26692757}
26702758
2671- // Quorum reports the quorum status. Will be called on former leaders.
2672- func (n * raft ) Quorum () bool {
2673- n .RLock ()
2674- defer n .RUnlock ()
2675-
2759+ // Return true if leader believes it still has a quorum.
2760+ func (n * raft ) hasQuorumLocked () bool {
26762761 nc := 0
26772762 for id , peer := range n .peers {
26782763 if id == n .id || time .Since (peer .ts ) < lostQuorumInterval {
@@ -2684,6 +2769,13 @@ func (n *raft) Quorum() bool {
26842769 return false
26852770}
26862771
2772+ // Quorum reports the quorum status. Will be called on former leaders.
2773+ func (n * raft ) Quorum () bool {
2774+ n .RLock ()
2775+ defer n .RUnlock ()
2776+ return n .hasQuorumLocked ()
2777+ }
2778+
26872779func (n * raft ) lostQuorum () bool {
26882780 n .RLock ()
26892781 defer n .RUnlock ()
@@ -2698,15 +2790,7 @@ func (n *raft) lostQuorumLocked() bool {
26982790 return false
26992791 }
27002792
2701- nc := 0
2702- for id , peer := range n .peers {
2703- if id == n .id || time .Since (peer .ts ) < lostQuorumInterval {
2704- if nc ++ ; nc >= n .qn {
2705- return false
2706- }
2707- }
2708- }
2709- return true
2793+ return ! n .hasQuorumLocked ()
27102794}
27112795
27122796// Check for being not active in terms of sending entries.
@@ -3719,6 +3803,11 @@ CONTINUE:
37193803 // Check to see if we have any related entries to process here.
37203804 for _ , e := range ae .entries {
37213805 switch e .Type {
3806+ case EntryQuiesce :
3807+ if isNew && n .State () == Follower {
3808+ n .elect .Stop ()
3809+ n .quiesced = true
3810+ }
37223811 case EntryLeaderTransfer :
37233812 // Only process these if they are new, so no replays or catchups.
37243813 if isNew {
@@ -3870,6 +3959,11 @@ func (n *raft) buildAppendEntry(entries []*Entry) *appendEntry {
38703959// Determine if we should store an entry. This stops us from storing
38713960// heartbeat messages.
38723961func (ae * appendEntry ) shouldStore () bool {
3962+ if len (ae .entries ) == 1 {
3963+ if e := ae .entries [0 ]; e .Type == EntryQuiesce {
3964+ return false
3965+ }
3966+ }
38733967 return ae != nil && len (ae .entries ) > 0
38743968}
38753969
@@ -4033,6 +4127,11 @@ func (n *raft) sendHeartbeat() {
40334127 n .sendAppendEntry (nil )
40344128}
40354129
4130+ // Tell the cluster to quiesce the current term
4131+ func (n * raft ) sendQuiesce () {
4132+ n .sendAppendEntry ([]* Entry {{EntryQuiesce , nil }})
4133+ }
4134+
40364135type voteRequest struct {
40374136 term uint64
40384137 lastTerm uint64
0 commit comments