Skip to content

Commit 6135fa9

Browse files
Retry on delete backend failure (#330) (#333)
* Retry create/delete of backend on failure and ignore reconcile if lb is not active
1 parent 2724252 commit 6135fa9

6 files changed

+270
-238
lines changed

cloud/scope/load_balancer_reconciler.go

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,9 @@ func (s *ClusterScope) ReconcileApiServerLB(ctx context.Context) error {
3737
return err
3838
}
3939
if lb != nil {
40+
if lb.LifecycleState != loadbalancer.LoadBalancerLifecycleStateActive {
41+
return errors.New(fmt.Sprintf("load balancer is in %s state. Waiting for ACTIVE state.", lb.LifecycleState))
42+
}
4043
lbIP, err := s.getLoadbalancerIp(*lb)
4144
if err != nil {
4245
return err

cloud/scope/load_balancer_reconciler_test.go

Lines changed: 64 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -97,11 +97,12 @@ func TestLBReconciliation(t *testing.T) {
9797
})).
9898
Return(loadbalancer.GetLoadBalancerResponse{
9999
LoadBalancer: loadbalancer.LoadBalancer{
100-
Id: common.String("lb-id"),
101-
FreeformTags: tags,
102-
DefinedTags: make(map[string]map[string]interface{}),
103-
IsPrivate: common.Bool(false),
104-
DisplayName: common.String(fmt.Sprintf("%s-%s", "cluster", "apiserver")),
100+
Id: common.String("lb-id"),
101+
LifecycleState: loadbalancer.LoadBalancerLifecycleStateActive,
102+
FreeformTags: tags,
103+
DefinedTags: make(map[string]map[string]interface{}),
104+
IsPrivate: common.Bool(false),
105+
DisplayName: common.String(fmt.Sprintf("%s-%s", "cluster", "apiserver")),
105106
IpAddresses: []loadbalancer.IpAddress{
106107
{
107108
IpAddress: common.String("2.2.2.2"),
@@ -123,11 +124,12 @@ func TestLBReconciliation(t *testing.T) {
123124
})).
124125
Return(loadbalancer.GetLoadBalancerResponse{
125126
LoadBalancer: loadbalancer.LoadBalancer{
126-
Id: common.String("lb-id"),
127-
FreeformTags: tags,
128-
DefinedTags: make(map[string]map[string]interface{}),
129-
IsPrivate: common.Bool(false),
130-
DisplayName: common.String(fmt.Sprintf("%s-%s", "cluster", "apiserver")),
127+
Id: common.String("lb-id"),
128+
LifecycleState: loadbalancer.LoadBalancerLifecycleStateActive,
129+
FreeformTags: tags,
130+
DefinedTags: make(map[string]map[string]interface{}),
131+
IsPrivate: common.Bool(false),
132+
DisplayName: common.String(fmt.Sprintf("%s-%s", "cluster", "apiserver")),
131133
},
132134
}, nil)
133135
},
@@ -143,11 +145,12 @@ func TestLBReconciliation(t *testing.T) {
143145
})).
144146
Return(loadbalancer.GetLoadBalancerResponse{
145147
LoadBalancer: loadbalancer.LoadBalancer{
146-
Id: common.String("lb-id"),
147-
FreeformTags: tags,
148-
DefinedTags: make(map[string]map[string]interface{}),
149-
IsPrivate: common.Bool(false),
150-
DisplayName: common.String(fmt.Sprintf("%s-%s", "cluster", "apiserver")),
148+
Id: common.String("lb-id"),
149+
LifecycleState: loadbalancer.LoadBalancerLifecycleStateActive,
150+
FreeformTags: tags,
151+
DefinedTags: make(map[string]map[string]interface{}),
152+
IsPrivate: common.Bool(false),
153+
DisplayName: common.String(fmt.Sprintf("%s-%s", "cluster", "apiserver")),
151154
IpAddresses: []loadbalancer.IpAddress{
152155
{
153156
IpAddress: common.String("2.2.2.2"),
@@ -189,11 +192,12 @@ func TestLBReconciliation(t *testing.T) {
189192
})).
190193
Return(loadbalancer.GetLoadBalancerResponse{
191194
LoadBalancer: loadbalancer.LoadBalancer{
192-
Id: common.String("lb-id"),
193-
FreeformTags: tags,
194-
DefinedTags: make(map[string]map[string]interface{}),
195-
IsPrivate: common.Bool(false),
196-
DisplayName: common.String(fmt.Sprintf("%s-%s", "cluster", "apiserver")),
195+
Id: common.String("lb-id"),
196+
LifecycleState: loadbalancer.LoadBalancerLifecycleStateActive,
197+
FreeformTags: tags,
198+
DefinedTags: make(map[string]map[string]interface{}),
199+
IsPrivate: common.Bool(false),
200+
DisplayName: common.String(fmt.Sprintf("%s-%s", "cluster", "apiserver")),
197201
IpAddresses: []loadbalancer.IpAddress{
198202
{
199203
IpAddress: common.String("2.2.2.2"),
@@ -537,11 +541,12 @@ func TestLBReconciliation(t *testing.T) {
537541
})).
538542
Return(loadbalancer.GetLoadBalancerResponse{
539543
LoadBalancer: loadbalancer.LoadBalancer{
540-
Id: common.String("lb-id"),
541-
FreeformTags: tags,
542-
DefinedTags: make(map[string]map[string]interface{}),
543-
IsPrivate: common.Bool(false),
544-
DisplayName: common.String(fmt.Sprintf("%s-%s", "cluster", "apiserver-test")),
544+
Id: common.String("lb-id"),
545+
LifecycleState: loadbalancer.LoadBalancerLifecycleStateActive,
546+
FreeformTags: tags,
547+
DefinedTags: make(map[string]map[string]interface{}),
548+
IsPrivate: common.Bool(false),
549+
DisplayName: common.String(fmt.Sprintf("%s-%s", "cluster", "apiserver-test")),
545550
IpAddresses: []loadbalancer.IpAddress{
546551
{
547552
IpAddress: common.String("2.2.2.2"),
@@ -570,6 +575,34 @@ func TestLBReconciliation(t *testing.T) {
570575
}, nil)
571576
},
572577
},
578+
{
579+
name: "lb not active",
580+
errorExpected: true,
581+
errorSubStringMatch: true,
582+
matchError: errors.New(fmt.Sprintf("load balancer is in %s state. Waiting for ACTIVE state.", loadbalancer.LoadBalancerLifecycleStateCreating)),
583+
testSpecificSetup: func(clusterScope *ClusterScope, lbClient *mock_lb.MockLoadBalancerClient) {
584+
ociClusterAccessor.OCICluster.Spec.NetworkSpec.APIServerLB.LoadBalancerId = common.String("lb-id")
585+
lbClient.EXPECT().GetLoadBalancer(gomock.Any(), gomock.Eq(loadbalancer.GetLoadBalancerRequest{
586+
LoadBalancerId: common.String("lb-id"),
587+
})).
588+
Return(loadbalancer.GetLoadBalancerResponse{
589+
LoadBalancer: loadbalancer.LoadBalancer{
590+
Id: common.String("lb-id"),
591+
LifecycleState: loadbalancer.LoadBalancerLifecycleStateCreating,
592+
FreeformTags: tags,
593+
DefinedTags: make(map[string]map[string]interface{}),
594+
IsPrivate: common.Bool(false),
595+
DisplayName: common.String(fmt.Sprintf("%s-%s", "cluster", "apiserver-test")),
596+
IpAddresses: []loadbalancer.IpAddress{
597+
{
598+
IpAddress: common.String("2.2.2.2"),
599+
IsPublic: common.Bool(true),
600+
},
601+
},
602+
},
603+
}, nil)
604+
},
605+
},
573606
{
574607
name: "lb update request failed",
575608
errorExpected: true,
@@ -582,11 +615,12 @@ func TestLBReconciliation(t *testing.T) {
582615
})).
583616
Return(loadbalancer.GetLoadBalancerResponse{
584617
LoadBalancer: loadbalancer.LoadBalancer{
585-
Id: common.String("lb-id"),
586-
FreeformTags: tags,
587-
DefinedTags: make(map[string]map[string]interface{}),
588-
IsPrivate: common.Bool(false),
589-
DisplayName: common.String(fmt.Sprintf("%s-%s", "cluster", "apiserver-test")),
618+
Id: common.String("lb-id"),
619+
LifecycleState: loadbalancer.LoadBalancerLifecycleStateActive,
620+
FreeformTags: tags,
621+
DefinedTags: make(map[string]map[string]interface{}),
622+
IsPrivate: common.Bool(false),
623+
DisplayName: common.String(fmt.Sprintf("%s-%s", "cluster", "apiserver-test")),
590624
IpAddresses: []loadbalancer.IpAddress{
591625
{
592626
IpAddress: common.String("2.2.2.2"),

cloud/scope/machine.go

Lines changed: 89 additions & 107 deletions
Original file line numberDiff line numberDiff line change
@@ -520,33 +520,29 @@ func (m *MachineScope) ReconcileCreateInstanceOnLB(ctx context.Context) error {
520520
backendName := instanceIp + ":" + strconv.Itoa(int(m.OCICluster.Spec.ControlPlaneEndpoint.Port))
521521
if !m.containsLBBackend(backendSet, backendName) {
522522
logger := m.Logger.WithValues("backend-set", *backendSet.Name)
523-
workRequest := m.OCIMachine.Status.CreateBackendWorkRequestId
524523
logger.Info("Checking work request status for create backend")
525-
if workRequest != "" {
526-
_, err = ociutil.AwaitLBWorkRequest(ctx, m.LoadBalancerClient, &workRequest)
527-
if err != nil {
528-
return err
529-
}
530-
} else {
531-
resp, err := m.LoadBalancerClient.CreateBackend(ctx, loadbalancer.CreateBackendRequest{
532-
LoadBalancerId: loadbalancerId,
533-
BackendSetName: backendSet.Name,
534-
CreateBackendDetails: loadbalancer.CreateBackendDetails{
535-
IpAddress: common.String(instanceIp),
536-
Port: common.Int(int(m.OCICluster.Spec.ControlPlaneEndpoint.Port)),
537-
},
538-
OpcRetryToken: ociutil.GetOPCRetryToken("%s-%s", "create-backend", string(m.OCIMachine.UID)),
539-
})
540-
if err != nil {
541-
return err
542-
}
543-
m.OCIMachine.Status.CreateBackendWorkRequestId = *resp.OpcWorkRequestId
544-
logger.Info("Add instance to LB backend-set", "WorkRequestId", resp.OpcWorkRequestId)
545-
logger.Info("Waiting for LB work request to be complete")
546-
_, err = ociutil.AwaitLBWorkRequest(ctx, m.LoadBalancerClient, resp.OpcWorkRequestId)
547-
if err != nil {
548-
return err
549-
}
524+
// we always try to create the backend if it exists during a reconcile loop and wait for the work request
525+
// to complete. If there is a work request in progress, in the rare case, CAPOCI pod restarts during the
526+
// work request, the create backend call may throw an error which is ok, as reconcile will go into
527+
// an exponential backoff
528+
resp, err := m.LoadBalancerClient.CreateBackend(ctx, loadbalancer.CreateBackendRequest{
529+
LoadBalancerId: loadbalancerId,
530+
BackendSetName: backendSet.Name,
531+
CreateBackendDetails: loadbalancer.CreateBackendDetails{
532+
IpAddress: common.String(instanceIp),
533+
Port: common.Int(int(m.OCICluster.Spec.ControlPlaneEndpoint.Port)),
534+
},
535+
OpcRetryToken: ociutil.GetOPCRetryToken("%s-%s", "create-backend", string(m.OCIMachine.UID)),
536+
})
537+
if err != nil {
538+
return err
539+
}
540+
m.OCIMachine.Status.CreateBackendWorkRequestId = *resp.OpcWorkRequestId
541+
logger.Info("Add instance to LB backend-set", "WorkRequestId", resp.OpcWorkRequestId)
542+
logger.Info("Waiting for LB work request to be complete")
543+
_, err = ociutil.AwaitLBWorkRequest(ctx, m.LoadBalancerClient, resp.OpcWorkRequestId)
544+
if err != nil {
545+
return err
550546
}
551547
}
552548

@@ -561,37 +557,32 @@ func (m *MachineScope) ReconcileCreateInstanceOnLB(ctx context.Context) error {
561557
if !m.containsNLBBackend(backendSet, m.Name()) {
562558
logger := m.Logger.WithValues("backend-set", *backendSet.Name)
563559
logger.Info("Checking work request status for create backend")
564-
workRequest := m.OCIMachine.Status.CreateBackendWorkRequestId
565-
if workRequest != "" {
566-
_, err = ociutil.AwaitNLBWorkRequest(ctx, m.NetworkLoadBalancerClient, &workRequest)
567-
if err != nil {
568-
return err
569-
}
570-
} else {
571-
resp, err := m.NetworkLoadBalancerClient.CreateBackend(ctx, networkloadbalancer.CreateBackendRequest{
572-
NetworkLoadBalancerId: loadbalancerId,
573-
BackendSetName: backendSet.Name,
574-
CreateBackendDetails: networkloadbalancer.CreateBackendDetails{
575-
IpAddress: common.String(instanceIp),
576-
Port: common.Int(int(m.OCICluster.Spec.ControlPlaneEndpoint.Port)),
577-
Name: common.String(m.Name()),
578-
},
579-
OpcRetryToken: ociutil.GetOPCRetryToken("%s-%s", "create-backend", string(m.OCIMachine.UID)),
580-
})
581-
if err != nil {
582-
return err
583-
}
584-
m.OCIMachine.Status.CreateBackendWorkRequestId = *resp.OpcWorkRequestId
585-
logger.Info("Add instance to NLB backend-set", "WorkRequestId", resp.OpcWorkRequestId)
586-
logger.Info("Waiting for NLB work request to be complete")
587-
_, err = ociutil.AwaitNLBWorkRequest(ctx, m.NetworkLoadBalancerClient, resp.OpcWorkRequestId)
588-
if err != nil {
589-
return err
590-
}
591-
logger.Info("NLB Backend addition work request is complete")
560+
// we always try to create the backend if it exists during a reconcile loop and wait for the work request
561+
// to complete. If there is a work request in progress, in the rare case, CAPOCI pod restarts during the
562+
// work request, the create backend call may throw an error which is ok, as reconcile will go into
563+
// an exponential backoff
564+
resp, err := m.NetworkLoadBalancerClient.CreateBackend(ctx, networkloadbalancer.CreateBackendRequest{
565+
NetworkLoadBalancerId: loadbalancerId,
566+
BackendSetName: backendSet.Name,
567+
CreateBackendDetails: networkloadbalancer.CreateBackendDetails{
568+
IpAddress: common.String(instanceIp),
569+
Port: common.Int(int(m.OCICluster.Spec.ControlPlaneEndpoint.Port)),
570+
Name: common.String(m.Name()),
571+
},
572+
OpcRetryToken: ociutil.GetOPCRetryToken("%s-%s", "create-backend", string(m.OCIMachine.UID)),
573+
})
574+
if err != nil {
575+
return err
576+
}
577+
m.OCIMachine.Status.CreateBackendWorkRequestId = *resp.OpcWorkRequestId
578+
logger.Info("Add instance to NLB backend-set", "WorkRequestId", resp.OpcWorkRequestId)
579+
logger.Info("Waiting for NLB work request to be complete")
580+
_, err = ociutil.AwaitNLBWorkRequest(ctx, m.NetworkLoadBalancerClient, resp.OpcWorkRequestId)
581+
if err != nil {
582+
return err
592583
}
584+
logger.Info("NLB Backend addition work request is complete")
593585
}
594-
595586
}
596587
return nil
597588
}
@@ -628,36 +619,31 @@ func (m *MachineScope) ReconcileDeleteInstanceOnLB(ctx context.Context) error {
628619
backendName := instanceIp + ":" + strconv.Itoa(int(m.OCICluster.Spec.ControlPlaneEndpoint.Port))
629620
if m.containsLBBackend(backendSet, backendName) {
630621
logger := m.Logger.WithValues("backend-set", *backendSet.Name)
631-
workRequest := m.OCIMachine.Status.DeleteBackendWorkRequestId
632-
if workRequest != "" {
633-
_, err = ociutil.AwaitLBWorkRequest(ctx, m.LoadBalancerClient, &workRequest)
634-
if err != nil {
635-
return err
636-
}
637-
} else {
638-
// in OCI CLI, the colon in the backend name is replaced by %3A
639-
// replace the colon in the backend name by %3A to avoid the error in PCA
640-
escapedBackendName := url.QueryEscape(backendName)
641-
642-
resp, err := m.LoadBalancerClient.DeleteBackend(ctx, loadbalancer.DeleteBackendRequest{
643-
LoadBalancerId: loadbalancerId,
644-
BackendSetName: backendSet.Name,
645-
BackendName: common.String(escapedBackendName),
646-
})
647-
if err != nil {
648-
logger.Error(err, "Delete instance from LB backend-set failed",
649-
"backendSetName", *backendSet.Name,
650-
"backendName", escapedBackendName,
651-
)
652-
return err
653-
}
654-
m.OCIMachine.Status.DeleteBackendWorkRequestId = *resp.OpcWorkRequestId
655-
logger.Info("Delete instance from LB backend-set", "WorkRequestId", resp.OpcWorkRequestId)
656-
logger.Info("Waiting for LB work request to be complete")
657-
_, err = ociutil.AwaitLBWorkRequest(ctx, m.LoadBalancerClient, resp.OpcWorkRequestId)
658-
if err != nil {
659-
return err
660-
}
622+
// in OCI CLI, the colon in the backend name is replaced by %3A
623+
// replace the colon in the backend name by %3A to avoid the error in PCA
624+
escapedBackendName := url.QueryEscape(backendName)
625+
// we always try to delete the backend if it exists during a reconcile loop and wait for the work request
626+
// to complete. If there is a work request in progress, in the rare case, CAPOCI pod restarts during the
627+
// work request, the delete backend call may throw an error which is ok, as reconcile will go into
628+
// an exponential backoff
629+
resp, err := m.LoadBalancerClient.DeleteBackend(ctx, loadbalancer.DeleteBackendRequest{
630+
LoadBalancerId: loadbalancerId,
631+
BackendSetName: backendSet.Name,
632+
BackendName: common.String(escapedBackendName),
633+
})
634+
if err != nil {
635+
logger.Error(err, "Delete instance from LB backend-set failed",
636+
"backendSetName", *backendSet.Name,
637+
"backendName", escapedBackendName,
638+
)
639+
return err
640+
}
641+
m.OCIMachine.Status.DeleteBackendWorkRequestId = *resp.OpcWorkRequestId
642+
logger.Info("Delete instance from LB backend-set", "WorkRequestId", resp.OpcWorkRequestId)
643+
logger.Info("Waiting for LB work request to be complete")
644+
_, err = ociutil.AwaitLBWorkRequest(ctx, m.LoadBalancerClient, resp.OpcWorkRequestId)
645+
if err != nil {
646+
return err
661647
}
662648
logger.Info("LB Backend addition work request is complete")
663649
}
@@ -671,28 +657,24 @@ func (m *MachineScope) ReconcileDeleteInstanceOnLB(ctx context.Context) error {
671657
backendSet := lb.BackendSets[APIServerLBBackendSetName]
672658
if m.containsNLBBackend(backendSet, m.Name()) {
673659
logger := m.Logger.WithValues("backend-set", *backendSet.Name)
674-
workRequest := m.OCIMachine.Status.DeleteBackendWorkRequestId
675-
if workRequest != "" {
676-
_, err = ociutil.AwaitNLBWorkRequest(ctx, m.NetworkLoadBalancerClient, &workRequest)
677-
if err != nil {
678-
return err
679-
}
680-
} else {
681-
resp, err := m.NetworkLoadBalancerClient.DeleteBackend(ctx, networkloadbalancer.DeleteBackendRequest{
682-
NetworkLoadBalancerId: loadbalancerId,
683-
BackendSetName: backendSet.Name,
684-
BackendName: common.String(m.Name()),
685-
})
686-
if err != nil {
687-
return err
688-
}
689-
m.OCIMachine.Status.DeleteBackendWorkRequestId = *resp.OpcWorkRequestId
690-
logger.Info("Delete instance from LB backend-set", "WorkRequestId", resp.OpcWorkRequestId)
691-
logger.Info("Waiting for LB work request to be complete")
692-
_, err = ociutil.AwaitNLBWorkRequest(ctx, m.NetworkLoadBalancerClient, resp.OpcWorkRequestId)
693-
if err != nil {
694-
return err
695-
}
660+
// we always try to delete the backend if it exists during a reconcile loop and wait for the work request
661+
// to complete. If there is a work request in progress, in the rare case, CAPOCI pod restarts during the
662+
// work request, the delete backend call may throw an error which is ok, as reconcile will go into
663+
// an exponential backoff
664+
resp, err := m.NetworkLoadBalancerClient.DeleteBackend(ctx, networkloadbalancer.DeleteBackendRequest{
665+
NetworkLoadBalancerId: loadbalancerId,
666+
BackendSetName: backendSet.Name,
667+
BackendName: common.String(m.Name()),
668+
})
669+
if err != nil {
670+
return err
671+
}
672+
m.OCIMachine.Status.DeleteBackendWorkRequestId = *resp.OpcWorkRequestId
673+
logger.Info("Delete instance from LB backend-set", "WorkRequestId", resp.OpcWorkRequestId)
674+
logger.Info("Waiting for LB work request to be complete")
675+
_, err = ociutil.AwaitNLBWorkRequest(ctx, m.NetworkLoadBalancerClient, resp.OpcWorkRequestId)
676+
if err != nil {
677+
return err
696678
}
697679
}
698680
}

0 commit comments

Comments
 (0)