diff --git a/charts/aws-ebs-csi-driver/templates/controller.yaml b/charts/aws-ebs-csi-driver/templates/controller.yaml index 7296f28856..a6d61cf9af 100644 --- a/charts/aws-ebs-csi-driver/templates/controller.yaml +++ b/charts/aws-ebs-csi-driver/templates/controller.yaml @@ -188,15 +188,15 @@ spec: path: /healthz port: healthz initialDelaySeconds: 10 - timeoutSeconds: 3 + timeoutSeconds: 60 periodSeconds: 10 - failureThreshold: 5 + failureThreshold: 10 readinessProbe: httpGet: path: /healthz port: healthz initialDelaySeconds: 10 - timeoutSeconds: 3 + timeoutSeconds: 60 periodSeconds: 10 failureThreshold: 5 {{- with .Values.controller.resources }} @@ -535,6 +535,7 @@ spec: imagePullPolicy: {{ default .Values.image.pullPolicy .Values.sidecars.livenessProbe.image.pullPolicy }} args: - --csi-address=/csi/csi.sock + - --probe-timeout=60s {{- if and (.Values.controller.enableMetrics) (not (regexMatch "(-metrics-address)" (join " " .Values.sidecars.livenessProbe.additionalArgs))) }} - --metrics-address=0.0.0.0:3305 {{- end}} diff --git a/deploy/kubernetes/base/controller.yaml b/deploy/kubernetes/base/controller.yaml index 8964d8bc56..996d3ae599 100644 --- a/deploy/kubernetes/base/controller.yaml +++ b/deploy/kubernetes/base/controller.yaml @@ -116,15 +116,15 @@ spec: path: /healthz port: healthz initialDelaySeconds: 10 - timeoutSeconds: 3 + timeoutSeconds: 60 periodSeconds: 10 - failureThreshold: 5 + failureThreshold: 10 readinessProbe: httpGet: path: /healthz port: healthz initialDelaySeconds: 10 - timeoutSeconds: 3 + timeoutSeconds: 60 periodSeconds: 10 failureThreshold: 5 resources: @@ -266,6 +266,7 @@ spec: imagePullPolicy: IfNotPresent args: - --csi-address=/csi/csi.sock + - --probe-timeout=60s volumeMounts: - name: socket-dir mountPath: /csi diff --git a/pkg/cloud/cloud.go b/pkg/cloud/cloud.go index fbaf1da99a..99fd68cd90 100644 --- a/pkg/cloud/cloud.go +++ b/pkg/cloud/cloud.go @@ -26,6 +26,7 @@ import ( "strconv" "strings" "sync" + "sync/atomic" "time" "github.com/aws/aws-sdk-go-v2/aws" @@ -89,8 +90,11 @@ var ( ) const ( - cacheForgetDelay = 1 * time.Hour - volInitCacheForgetDelay = 6 * time.Hour + cacheForgetDelay = 1 * time.Hour + volInitCacheForgetDelay = 6 * time.Hour + + dryRunInterval = 3 * time.Hour + getCallerIdentityRetryDelay = 30 * time.Second ) @@ -337,6 +341,7 @@ type cloud struct { volumeInitializations expiringcache.ExpiringCache[string, volumeInitialization] accountID string accountIDOnce sync.Once + attemptDryRun atomic.Bool } var _ Cloud = &cloud{} @@ -395,7 +400,7 @@ func NewCloud(region string, awsSdkDebugLog bool, userAgentExtra string, batchin bm = newBatcherManager(svc) } - return &cloud{ + c := &cloud{ awsConfig: cfg, region: region, dm: dm.NewDeviceManager(), @@ -408,6 +413,16 @@ func NewCloud(region string, awsSdkDebugLog bool, userAgentExtra string, batchin latestClientTokens: expiringcache.New[string, int](cacheForgetDelay), volumeInitializations: expiringcache.New[string, volumeInitialization](volInitCacheForgetDelay), } + + // Ensure an EC2 Dry-run API call is made on startup and every dryRunInterval + c.attemptDryRun.Store(true) + go func() { + for range time.Tick(dryRunInterval) { + c.attemptDryRun.Store(true) + } + }() + + return c } // newBatcherManager initializes a new instance of batcherManager. @@ -1774,6 +1789,29 @@ func (c *cloud) EnableFastSnapshotRestores(ctx context.Context, availabilityZone return response, nil } +// DryRun will make a dry-run EC2 API call. Nil return value means we successfully received EC2 DryRunOperation error code. +func (c *cloud) DryRun(ctx context.Context) error { + if c.attemptDryRun.Load() { + // Rely on EC2 DAZ because it is required in ebs controller IAM role, but not in instance default role. + _, apiErr := c.ec2.DescribeAvailabilityZones(ctx, + &ec2.DescribeAvailabilityZonesInput{DryRun: aws.Bool(true)}, + func(o *ec2.Options) { + o.Retryer = aws.NopRetryer{} // Don't retry so we can catch network failures. CO should retry liveness check multiple times. + o.APIOptions = nil // Don't add our logging/metrics middleware because we expect errors. + }) + if apiErr != nil { + var awsErr smithy.APIError + if errors.As(apiErr, &awsErr) && awsErr.ErrorCode() == "DryRunOperation" { + c.attemptDryRun.Store(false) + return nil + } + return fmt.Errorf("dry-run EC2 API call failed: %w", apiErr) + } + } + + return nil +} + func describeVolumes(ctx context.Context, svc EC2API, request *ec2.DescribeVolumesInput) ([]types.Volume, error) { var volumes []types.Volume var nextToken *string diff --git a/pkg/cloud/cloud_test.go b/pkg/cloud/cloud_test.go index c17e461a0b..62b76f2628 100644 --- a/pkg/cloud/cloud_test.go +++ b/pkg/cloud/cloud_test.go @@ -4099,6 +4099,78 @@ func TestIsVolumeInitialized(t *testing.T) { } } +func TestDryRun(t *testing.T) { + testCases := []struct { + name string + errCode string + attemptDryRunBefore bool + attemptDryRunAfter bool + dryRunAttempts int + expectedErr bool + }{ + { + name: "Successful DryRunOperation", + errCode: "DryRunOperation", + attemptDryRunBefore: true, + attemptDryRunAfter: false, + dryRunAttempts: 1, + expectedErr: false, + }, + { + name: "Skip DryRun", + attemptDryRunBefore: false, + attemptDryRunAfter: false, + dryRunAttempts: 1, + expectedErr: false, + }, + { + name: "Failed DryRun", + errCode: "UnexpectedErr", + attemptDryRunBefore: true, + attemptDryRunAfter: true, + dryRunAttempts: 1, + expectedErr: true, + }, + { + name: "Fail, fail, successful DryRun", + errCode: "UnexpectedErr", + attemptDryRunBefore: true, + attemptDryRunAfter: true, + dryRunAttempts: 3, + expectedErr: true, + }, + } + + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { + mockCtrl := gomock.NewController(t) + mockEC2 := NewMockEC2API(mockCtrl) + c := &cloud{ + region: "test-region", + ec2: mockEC2, + } + c.attemptDryRun.Store(tc.attemptDryRunBefore) + + if tc.attemptDryRunBefore { + mockEC2.EXPECT().DescribeAvailabilityZones(gomock.Any(), gomock.Any(), gomock.Any()).Return(nil, &smithy.GenericAPIError{ + Code: tc.errCode, + }).Times(tc.dryRunAttempts) + } + + var err error + for range tc.dryRunAttempts { + err = c.DryRun(context.Background()) + } + + assert.Equal(t, tc.attemptDryRunAfter, c.attemptDryRun.Load()) + + if tc.expectedErr { + assert.Error(t, err) + } + }) + } +} + func confirmInitializationCacheUpdated(tb testing.TB, cache expiringcache.ExpiringCache[string, volumeInitialization], volID string, dvsOutput types.VolumeStatusItem) { tb.Helper() diff --git a/pkg/cloud/interface.go b/pkg/cloud/interface.go index b977379a18..0a6d43b0dd 100644 --- a/pkg/cloud/interface.go +++ b/pkg/cloud/interface.go @@ -39,4 +39,5 @@ type Cloud interface { ListSnapshots(ctx context.Context, volumeID string, maxResults int32, nextToken string) (listSnapshotsResponse *ListSnapshotsResponse, err error) EnableFastSnapshotRestores(ctx context.Context, availabilityZones []string, snapshotID string) (*ec2.EnableFastSnapshotRestoresOutput, error) AvailabilityZones(ctx context.Context) (map[string]struct{}, error) + DryRun(ctx context.Context) error } diff --git a/pkg/cloud/mock_cloud.go b/pkg/cloud/mock_cloud.go index e99e18037f..6c2ade3a63 100644 --- a/pkg/cloud/mock_cloud.go +++ b/pkg/cloud/mock_cloud.go @@ -140,6 +140,20 @@ func (mr *MockCloudMockRecorder) DetachDisk(ctx, volumeID, nodeID interface{}) * return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "DetachDisk", reflect.TypeOf((*MockCloud)(nil).DetachDisk), ctx, volumeID, nodeID) } +// DryRun mocks base method. +func (m *MockCloud) DryRun(ctx context.Context) error { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "DryRun", ctx) + ret0, _ := ret[0].(error) + return ret0 +} + +// DryRun indicates an expected call of DryRun. +func (mr *MockCloudMockRecorder) DryRun(ctx interface{}) *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "DryRun", reflect.TypeOf((*MockCloud)(nil).DryRun), ctx) +} + // EnableFastSnapshotRestores mocks base method. func (m *MockCloud) EnableFastSnapshotRestores(ctx context.Context, availabilityZones []string, snapshotID string) (*ec2.EnableFastSnapshotRestoresOutput, error) { m.ctrl.T.Helper() diff --git a/pkg/driver/identity.go b/pkg/driver/identity.go index 7a8c198412..aeb52abcf3 100644 --- a/pkg/driver/identity.go +++ b/pkg/driver/identity.go @@ -21,6 +21,8 @@ import ( csi "github.com/container-storage-interface/spec/lib/go/csi" "github.com/kubernetes-sigs/aws-ebs-csi-driver/pkg/util" + "google.golang.org/grpc/codes" + "google.golang.org/grpc/status" "k8s.io/klog/v2" ) @@ -59,6 +61,13 @@ func (d *Driver) GetPluginCapabilities(ctx context.Context, req *csi.GetPluginCa } func (d *Driver) Probe(ctx context.Context, req *csi.ProbeRequest) (*csi.ProbeResponse, error) { - klog.V(6).InfoS("Probe: called", "args", req) + // Controller Service will make dry-run EC2 call to ensure proper auth/networking + if d.controller != nil && d.controller.cloud != nil { + err := d.controller.cloud.DryRun(ctx) + if err != nil { + return &csi.ProbeResponse{}, status.Errorf(codes.FailedPrecondition, "Failed health check (verify network connection and IAM credentials): %v", err) + } + } + return &csi.ProbeResponse{}, nil } diff --git a/tests/sanity/fake_sanity_cloud.go b/tests/sanity/fake_sanity_cloud.go index 886e498793..5b47a655ad 100644 --- a/tests/sanity/fake_sanity_cloud.go +++ b/tests/sanity/fake_sanity_cloud.go @@ -221,3 +221,7 @@ func (d *fakeCloud) WaitForAttachmentState(ctx context.Context, expectedState ty func (d *fakeCloud) IsVolumeInitialized(ctx context.Context, volumeID string) (bool, error) { return true, nil } + +func (d *fakeCloud) DryRun(ctx context.Context) error { + return nil +}