Skip to content

Commit 182c43d

Browse files
committed
feat(aws-connection): Improves retry logic and makes timeout configurable
With the changes in this commit, users of the Terraform provider can configure the timeout for the `create` operation for the `role_arn` resource by adding the following block to the resource: ``` timeouts { create = "<x>m" } `` where <x> is the max amount of minutes the user wants to wait.
1 parent e876a36 commit 182c43d

File tree

4 files changed

+162
-7
lines changed

4 files changed

+162
-7
lines changed

dynatrace/api/builtin/hyperscalerauthentication/connections/aws/role_arn/service.go

Lines changed: 67 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@ package role_arn
2020
import (
2121
"context"
2222
"errors"
23+
"fmt"
2324
"time"
2425

2526
"github.com/dynatrace-oss/terraform-provider-dynatrace/dynatrace/api"
@@ -29,8 +30,12 @@ import (
2930
"github.com/dynatrace-oss/terraform-provider-dynatrace/dynatrace/rest"
3031
"github.com/dynatrace-oss/terraform-provider-dynatrace/dynatrace/settings"
3132
"github.com/dynatrace-oss/terraform-provider-dynatrace/dynatrace/settings/services/settings20"
33+
"github.com/hashicorp/terraform-plugin-sdk/v2/helper/retry"
3234
)
3335

36+
const (
37+
timeoutDeadlineBuffer = time.Minute
38+
)
3439
const SchemaID = "builtin:hyperscaler-authentication.connections.aws"
3540
const SchemaVersion = "0.0.15"
3641

@@ -80,16 +85,71 @@ func (me *service) Create(ctx context.Context, v *role_arn.Settings) (*api.Stub,
8085
connValue.AWSWebIdentity.RoleARN = v.RoleARN
8186
}
8287

83-
var err error
84-
var retry = 10
85-
for i := 0; i < retry; i++ {
86-
if err = me.connService.Update(ctx, v.AWSConnectionID, &connValue); err == nil {
87-
return &api.Stub{ID: v.AWSConnectionID, Name: v.AWSConnectionID}, nil
88+
ctxRetry, cancel, retryTimeout, err := computeRetryContext(ctx, timeoutDeadlineBuffer, role_arn.DefaultCreateTimeout)
89+
if err != nil {
90+
return nil, err
91+
}
92+
defer cancel()
93+
94+
if err = retry.RetryContext(ctxRetry, retryTimeout, func() *retry.RetryError {
95+
return classifyRetryError(me.connService.Update(ctxRetry, v.AWSConnectionID, &connValue))
96+
}); err != nil {
97+
return nil, err
98+
}
99+
100+
return &api.Stub{ID: v.AWSConnectionID, Name: v.AWSConnectionID}, nil
101+
}
102+
103+
// computeRetryContext computes a safe retry timeout based on the incoming ctx deadline.
104+
// - timeoutDeadlineBuffer: amount of time to reserve for finalization (e.g. 1 minute).
105+
// - defaultTimeout: fallback when caller didn't provide a deadline.
106+
// Returns the derived ctx (with timeout), its cancel func, the retryTimeout, or an error
107+
// if the caller's deadline already expired.
108+
func computeRetryContext(ctx context.Context, timeoutDeadlineBuffer time.Duration, defaultTimeout time.Duration) (context.Context, context.CancelFunc, time.Duration, error) {
109+
if dl, ok := ctx.Deadline(); ok {
110+
remaining := time.Until(dl)
111+
if remaining <= 0 {
112+
return nil, nil, 0, context.DeadlineExceeded
113+
}
114+
var retryTimeout time.Duration
115+
if remaining > timeoutDeadlineBuffer {
116+
retryTimeout = remaining - timeoutDeadlineBuffer
117+
} else {
118+
retryTimeout = remaining
88119
}
120+
ctxRetry, cancel := context.WithTimeout(ctx, retryTimeout)
121+
return ctxRetry, cancel, retryTimeout, nil
122+
}
123+
// no deadline: use conservative default
124+
ctxRetry, cancel := context.WithTimeout(ctx, defaultTimeout)
125+
return ctxRetry, cancel, defaultTimeout, nil
126+
}
89127

90-
time.Sleep(time.Second * 2)
128+
// classifyRetryError encapsulates which errors should be retried.
129+
// - 400 and 404 are considered retryable due to eventual consistency.
130+
// - other 4xx are non-retryable.
131+
// - 5xx and non-HTTP (network) errors are retryable.
132+
func classifyRetryError(err error) *retry.RetryError {
133+
if err == nil {
134+
return nil
135+
}
136+
137+
var restError rest.Error
138+
if errors.As(err, &restError) {
139+
code := restError.Code
140+
// Retry on specific client errors that can be transient (eventual consistency).
141+
if code == 400 || code == 404 {
142+
return retry.RetryableError(fmt.Errorf("IAM role not yet usable (HTTP %d): %w", code, err))
143+
}
144+
// Treat other 4xx as non-retryable client errors.
145+
if code >= 400 && code < 500 {
146+
return retry.NonRetryableError(fmt.Errorf("IAM role unusable (HTTP %d): %w", code, err))
147+
}
148+
// 5xx and others -> retryable
149+
return retry.RetryableError(fmt.Errorf("IAM role not yet usable (HTTP %d): %w", code, err))
91150
}
92-
return nil, err
151+
// Non-HTTP errors (network, timeouts, context) -> retryable
152+
return retry.RetryableError(fmt.Errorf("IAM role not yet usable: %w", err))
93153
}
94154

95155
func (me *service) Update(_ context.Context, _ string, _ *role_arn.Settings) error {
Lines changed: 73 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,73 @@
1+
package role_arn
2+
3+
import (
4+
"context"
5+
"errors"
6+
"testing"
7+
"time"
8+
9+
"github.com/dynatrace-oss/terraform-provider-dynatrace/dynatrace/rest"
10+
"github.com/stretchr/testify/assert"
11+
)
12+
13+
func approx(d1, d2 time.Duration) bool {
14+
delta := d1 - d2
15+
if delta < 0 {
16+
delta = -delta
17+
}
18+
return delta <= 200*time.Millisecond
19+
}
20+
21+
func TestComputeRetryContext_NoDeadline(t *testing.T) {
22+
ctx := context.Background()
23+
_, cancel, retryTimeout, _ := computeRetryContext(ctx, time.Minute, 2*time.Minute)
24+
defer cancel()
25+
26+
assert.Equal(t, 2*time.Minute, retryTimeout)
27+
}
28+
29+
func TestComputeRetryContext_WithDeadline_Plenty(t *testing.T) {
30+
// caller provides 5 minutes; timeoutDeadlineBuffer is 1 minute -> retryTimeout should be ~4 minutes
31+
parent := context.Background()
32+
deadline := time.Now().Add(5 * time.Minute)
33+
ctxWithDL, cancelParent := context.WithDeadline(parent, deadline)
34+
defer cancelParent()
35+
36+
_, cancel, retryTimeout, _ := computeRetryContext(ctxWithDL, time.Minute, 2*time.Minute)
37+
defer cancel()
38+
39+
// expect ~4 minutes
40+
expected := 4 * time.Minute
41+
assert.True(t, approx(expected, retryTimeout), "expected approximately %v, got %v", expected, retryTimeout)
42+
}
43+
44+
func TestComputeRetryContext_ExpiredDeadline(t *testing.T) {
45+
ctxWithExpiredDL, cancel := context.WithDeadline(context.Background(), time.Now().Add(-time.Second))
46+
defer cancel()
47+
48+
_, _, _, err := computeRetryContext(ctxWithExpiredDL, time.Minute, 2*time.Minute)
49+
50+
assert.ErrorIs(t, err, context.DeadlineExceeded)
51+
}
52+
53+
func TestClassifyRetryError(t *testing.T) {
54+
tests := []struct {
55+
err error
56+
expectedSubstr string
57+
}{
58+
{err: rest.Error{Code: 400, Message: "bad"}, expectedSubstr: "not yet usable (HTTP 400)"},
59+
{err: rest.Error{Code: 404, Message: "notfound"}, expectedSubstr: "not yet usable (HTTP 404)"},
60+
{err: rest.Error{Code: 401, Message: "unauth"}, expectedSubstr: "unusable (HTTP 401)"},
61+
{err: rest.Error{Code: 500, Message: "server"}, expectedSubstr: "not yet usable (HTTP 500)"},
62+
{err: errors.New("network failure"), expectedSubstr: "not yet usable"},
63+
}
64+
65+
for _, tc := range tests {
66+
t.Run(tc.expectedSubstr, func(t *testing.T) {
67+
re := classifyRetryError(tc.err)
68+
msg := re.Err.Error()
69+
70+
assert.Contains(t, msg, tc.expectedSubstr)
71+
})
72+
}
73+
}

dynatrace/api/builtin/hyperscalerauthentication/connections/aws/role_arn/settings/settings.go

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,17 +18,29 @@
1818
package role_arn
1919

2020
import (
21+
"time"
22+
2123
"github.com/dynatrace-oss/terraform-provider-dynatrace/terraform/hcl"
2224
"github.com/hashicorp/terraform-plugin-sdk/v2/helper/schema"
2325
"github.com/hashicorp/terraform-plugin-sdk/v2/helper/validation"
2426
)
2527

28+
const (
29+
DefaultCreateTimeout = 20 * time.Minute
30+
)
31+
2632
type Settings struct {
2733
Name string
2834
AWSConnectionID string
2935
RoleARN string `json:"roleArn"` // The ARN of the AWS role that should be assumed
3036
}
3137

38+
func (me *Settings) Timeouts() *schema.ResourceTimeout {
39+
return &schema.ResourceTimeout{
40+
Create: schema.DefaultTimeout(DefaultCreateTimeout),
41+
}
42+
}
43+
3244
func (me *Settings) Schema() map[string]*schema.Schema {
3345
return map[string]*schema.Schema{
3446
"aws_connection_id": {

resources/generic.go

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -120,6 +120,10 @@ type DiffCustomizer interface {
120120
CustomizeDiff(ctx context.Context, rd *schema.ResourceDiff, i any) error
121121
}
122122

123+
type TimeoutProvider interface {
124+
Timeouts() *schema.ResourceTimeout
125+
}
126+
123127
func (me *Generic) Resource() *schema.Resource {
124128
stngs := me.Descriptor.NewSettings()
125129
sch := VisitSchemaMap(stngs.Schema())
@@ -146,6 +150,9 @@ func (me *Generic) Resource() *schema.Resource {
146150
if dc, ok := stngs.(DiffCustomizer); ok {
147151
resRes.CustomizeDiff = dc.CustomizeDiff
148152
}
153+
if tp, ok := stngs.(TimeoutProvider); ok {
154+
resRes.Timeouts = tp.Timeouts()
155+
}
149156
return resRes
150157
}
151158

@@ -162,6 +169,9 @@ func (me *Generic) Resource() *schema.Resource {
162169
if dc, ok := stngs.(DiffCustomizer); ok {
163170
resRes.CustomizeDiff = dc.CustomizeDiff
164171
}
172+
if tp, ok := stngs.(TimeoutProvider); ok {
173+
resRes.Timeouts = tp.Timeouts()
174+
}
165175
return resRes
166176
}
167177

0 commit comments

Comments
 (0)