@@ -6,8 +6,10 @@ import (
66 "log"
77 "log/slog"
88 "os"
9+ "sync"
910 "time"
1011
12+ "github.com/cenkalti/backoff/v4"
1113 "github.com/skupperproject/skupper/api/types"
1214 "github.com/skupperproject/skupper/internal/config"
1315 "github.com/skupperproject/skupper/internal/flow"
@@ -47,20 +49,6 @@ func (s *StatusSyncClient) Update(ctx context.Context, latest *corev1.ConfigMap)
 	return err
 }
 
-func updateLockOwner(lockname, namespace string, owner *metav1.OwnerReference, cli *internalclient.KubeClient) error {
-	current, err := cli.Kube.CoordinationV1().Leases(namespace).Get(context.TODO(), lockname, metav1.GetOptions{})
-	if err != nil {
-		return err
-	}
-	if owner != nil {
-		current.ObjectMeta.OwnerReferences = []metav1.OwnerReference{
-			*owner,
-		}
-	}
-	_, err = cli.Kube.CoordinationV1().Leases(namespace).Update(context.TODO(), current, metav1.UpdateOptions{})
-	return err
-}
-
 func siteCollector(ctx context.Context, cli *internalclient.KubeClient) {
 	siteData := map[string]string{}
 	platform := config.GetPlatform()
@@ -84,11 +72,6 @@ func siteCollector(ctx context.Context, cli *internalclient.KubeClient) {
 		log.Fatal("Failed to create site status config map ", err.Error())
 	}
 
-	err = updateLockOwner(types.SiteLeaderLockName, cli.Namespace, &owner, cli)
-	if err != nil {
-		log.Println("Update lock error", err.Error())
-	}
-
 	factory := session.NewContainerFactory("amqp://localhost:5672", session.ContainerConfig{ContainerID: "kube-flow-collector"})
 	statusSyncClient := &StatusSyncClient{
 		client: cli.Kube.CoreV1().ConfigMaps(cli.Namespace),
@@ -136,37 +119,65 @@ func startFlowController(ctx context.Context, cli *internalclient.KubeClient) er
 }
 
 func runLeaderElection(lock *resourcelock.LeaseLock, id string, cli *internalclient.KubeClient) {
-	ctx := context.Background()
-	begin := time.Now()
-	podname, _ := os.Hostname()
-	leaderelection.RunOrDie(ctx, leaderelection.LeaderElectionConfig{
-		Lock:            lock,
-		ReleaseOnCancel: true,
-		LeaseDuration:   15 * time.Second,
-		RenewDeadline:   10 * time.Second,
-		RetryPeriod:     2 * time.Second,
-		Callbacks: leaderelection.LeaderCallbacks{
-			OnStartedLeading: func(c context.Context) {
-				log.Printf("COLLECTOR: Leader %s starting site collection after %s\n", podname, time.Since(begin))
-				siteCollector(ctx, cli)
-				if err := startFlowController(ctx, cli); err != nil {
-					log.Printf("COLLECTOR: Failed to start controller for emitting site events: %s", err)
-				}
-			},
-			OnStoppedLeading: func() {
-				// we held the lock but lost it. This indicates that something
-				// went wrong. Exit and restart.
-				log.Fatalf("COLLECTOR: Lost leader lock after %s", time.Since(begin))
-			},
-			OnNewLeader: func(current_id string) {
-				if current_id == id {
-					// Remain as the leader
-					return
-				}
-				log.Printf("COLLECTOR: New leader for site collection is %s\n", current_id)
+	var (
+		mu              sync.Mutex
+		leaderCtx       context.Context
+		leaderCtxCancel func()
+	)
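+	// mu guards leaderCtx and leaderCtxCancel, which hand the leader context
+	// from OnStartedLeading to OnStoppedLeading.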
+	// attempt to run leader election forever
+	strategy := backoff.NewExponentialBackOff(backoff.WithMaxElapsedTime(0))
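+	// WithMaxElapsedTime(0) disables the retry deadline, so the backoff never
+	// gives up between election attempts.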
+	backoff.RetryNotify(func() error {
+		leaderelection.RunOrDie(context.Background(), leaderelection.LeaderElectionConfig{
+			Lock:            lock,
+			ReleaseOnCancel: true,
+			LeaseDuration:   15 * time.Second,
+			RenewDeadline:   10 * time.Second,
+			RetryPeriod:     2 * time.Second,
+			Callbacks: leaderelection.LeaderCallbacks{
+				OnStartedLeading: func(ctx context.Context) {
+					mu.Lock()
+					defer mu.Unlock()
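+					// Derive a cancellable context from the election context so
+					// that losing leadership can stop the collector and flow controller.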
+					leaderCtx, leaderCtxCancel = context.WithCancel(ctx)
+					log.Printf("COLLECTOR: Became leader. Starting status sync and site controller after %s.", strategy.GetElapsedTime())
+					siteCollector(leaderCtx, cli)
+					if err := startFlowController(leaderCtx, cli); err != nil {
+						log.Printf("COLLECTOR: Failed to start controller for emitting site events: %s", err)
+					}
+				},
+				OnStoppedLeading: func() {
+					log.Printf("COLLECTOR: Lost leader lock after %s. Stopping status sync and site controller.", strategy.GetElapsedTime())
+					mu.Lock()
+					defer mu.Unlock()
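+					// Leadership can be lost before OnStartedLeading ran; nothing to cancel.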
+					if leaderCtxCancel == nil {
+						return
+					}
+					leaderCtxCancel()
+					leaderCtx, leaderCtxCancel = nil, nil
+				},
+				OnNewLeader: func(current_id string) {
+					if current_id == id {
+						// Remain as the leader
+						return
+					}
+					log.Printf("COLLECTOR: New leader for site collection is %s\n", current_id)
+				},
 			},
-		},
-	})
+		})
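+		// RunOrDie only returns once the election loop exits; report an error
+		// so backoff.RetryNotify schedules another attempt.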
+		return fmt.Errorf("leader election died")
+	},
+		strategy,
+		func(_ error, d time.Duration) {
+			log.Printf("COLLECTOR: leader election failed. retrying after %s", d)
+		})
 }
 
 func StartCollector(cli *internalclient.KubeClient) {