Skip to content

Commit eec49c5

Browse files
Merge pull request #369 from openshift-cherrypick-robot/cherry-pick-365-to-release-4.18
[release-4.18] OCPBUGS-54304: Intermittent authentication issues when accessing OpenShift registry
2 parents 5bfc808 + c04839f commit eec49c5

File tree

2 files changed

+309
-31
lines changed

2 files changed

+309
-31
lines changed

pkg/internalregistry/controllers/image_pull_secret_controller.go

Lines changed: 42 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -7,9 +7,9 @@ import (
77
"fmt"
88
"reflect"
99
"sync"
10+
"sync/atomic"
1011
"time"
1112

12-
"go.uber.org/atomic"
1313
"golang.org/x/exp/slices"
1414
"gopkg.in/go-jose/go-jose.v2/jwt"
1515
authenticationv1 "k8s.io/api/authentication/v1"
@@ -52,8 +52,8 @@ func NewImagePullSecretController(kubeclient kubernetes.Interface, secrets infor
5252
serviceAccounts: serviceAccounts.Lister(),
5353
cacheSyncs: []cache.InformerSynced{secrets.Informer().HasSynced, serviceAccounts.Informer().HasSynced},
5454
queue: workqueue.NewNamedRateLimitingQueue(workqueue.DefaultControllerRateLimiter(), "bound-token-managed-image-pull-secrets"),
55-
kids: atomic.NewPointer[[]string](nil),
56-
urls: atomic.NewPointer[[]string](nil),
55+
kids: &atomic.Pointer[[]string]{},
56+
urls: &atomic.Pointer[[]string]{},
5757
kidsC: make(chan []string),
5858
urlsC: make(chan []string),
5959
}
@@ -147,10 +147,10 @@ func (c *imagePullSecretController) sync(ctx context.Context, key string) (error
147147
return c.cleanupOrphanedManagedImagePullSecret(ctx, secret), 0
148148
}
149149

150-
refreshNow, refreshAt := c.isSecretRefreshNeeded(secret, *urls, *kids)
151-
if !refreshNow {
150+
now := time.Now()
152151

153-
// the annotation is missing or incorrect, fix it
152+
if refreshNow, refreshAt := isSecretRefreshNeeded(secret, *urls, *kids, now); !refreshNow {
153+
// if the annotation is missing or incorrect, fix it
154154
if secret.Annotations[InternalRegistryAuthTokenTypeAnnotation] != AuthTokenTypeBound {
155155
patch, err := applycorev1.ExtractSecret(secret, imagePullSecretControllerFieldManager)
156156
if err != nil {
@@ -167,9 +167,9 @@ func (c *imagePullSecretController) sync(ctx context.Context, key string) (error
167167
}
168168

169169
// token is not expired and not expiring soon, requeue when expected to need a refresh
170-
refreshAfter := refreshAt.Sub(time.Now())
171-
klog.V(4).InfoS(key, "refreshAfter", refreshAfter)
172-
return nil, refreshAfter
170+
requeueAfter := refreshAt.Sub(now)
171+
klog.V(4).InfoS(key, "requeueAfter", requeueAfter, "refreshed", false)
172+
return nil, requeueAfter
173173
}
174174

175175
var serviceAccountName = serviceAccountNameForManagedSecret(secret)
@@ -202,13 +202,14 @@ func (c *imagePullSecretController) sync(ctx context.Context, key string) (error
202202
// the service account controller is responsible for re-creating the initial secret.
203203
WithUID(secret.UID)
204204
_, err = c.client.CoreV1().Secrets(secret.Namespace).Apply(ctx, patch, metav1.ApplyOptions{Force: true, FieldManager: imagePullSecretControllerFieldManager})
205-
206205
if err != nil {
207206
return err, 0
208207
}
209208

210-
refreshAfter := refreshAt.Sub(time.Now())
211-
return nil, refreshAfter
209+
// assume `now` as the value of nbf as to not have to parse the token, it should be close enough
210+
requeueAfter := refreshThresholdTime(now, tokenRequest.Status.ExpirationTimestamp.Time).Sub(now)
211+
klog.V(4).InfoS(key, "requeueAfter", requeueAfter, "refreshed", true)
212+
return nil, requeueAfter
212213
}
213214

214215
func (c *imagePullSecretController) cleanupOrphanedManagedImagePullSecret(ctx context.Context, secret *corev1.Secret) error {
@@ -267,31 +268,31 @@ func dockerConfig(token string, urls []string) any {
267268
return auth
268269
}
269270

270-
func (c *imagePullSecretController) isSecretRefreshNeeded(secret *corev1.Secret, urls, kids []string) (bool, time.Time) {
271-
valid, refreshAt := c.registryAuthenticationFileValid(secret, urls, kids)
271+
func isSecretRefreshNeeded(secret *corev1.Secret, urls, kids []string, now time.Time) (bool, *time.Time) {
272+
valid, refreshAt := registryAuthenticationFileValid(secret, urls, kids, now)
272273
return !valid, refreshAt
273274
}
274275

275-
func (c *imagePullSecretController) registryAuthenticationFileValid(imagePullSecret *corev1.Secret, urls, kids []string) (bool, time.Time) {
276+
func registryAuthenticationFileValid(imagePullSecret *corev1.Secret, urls, kids []string, now time.Time) (bool, *time.Time) {
276277
if imagePullSecret.Type != corev1.SecretTypeDockercfg {
277278
klog.V(2).InfoS("Internal registry pull secret type is incorrect.", "ns", imagePullSecret.Namespace, "name", imagePullSecret.Name, "type", imagePullSecret.Type)
278-
return false, time.Now()
279+
return false, nil
279280
}
280281
// registry authentication file must exist
281282
if _, ok := imagePullSecret.Data[corev1.DockerConfigKey]; !ok {
282283
klog.V(2).InfoS("Internal registry pull secret does not contain the expected key", "ns", imagePullSecret.Namespace, "name", imagePullSecret.Name, "keys", reflect.ValueOf(imagePullSecret.Data).MapKeys())
283-
return false, time.Now()
284+
return false, nil
284285
}
285286
// parse the registry authentication file
286287
auth := credentialprovider.DockerConfig{}
287288
if err := json.Unmarshal(imagePullSecret.Data[corev1.DockerConfigKey], &auth); err != nil {
288289
klog.V(2).InfoS("Internal registry pull secret auth data cannot be parsed", "ns", imagePullSecret.Namespace, "name", imagePullSecret.Name)
289-
return false, time.Now()
290+
return false, nil
290291
}
291292
// there should be an entries for each internal registry url
292293
if len(auth) != len(urls) {
293294
klog.V(2).InfoS("Internal registry pull secret auth data does not contain the correct number of entries", "ns", imagePullSecret.Namespace, "name", imagePullSecret.Name, "expected", len(urls), "actual", len(auth))
294-
return false, time.Now()
295+
return false, nil
295296
}
296297
matches := 0
297298
CheckUrl:
@@ -305,7 +306,7 @@ CheckUrl:
305306
}
306307
if matches != len(urls) {
307308
klog.V(2).InfoS("Internal registry pull secret needs to be refreshed", "reason", "auth data does not contain the correct entries", "ns", imagePullSecret.Namespace, "name", imagePullSecret.Name, "expected", urls, "actual", reflect.ValueOf(auth).MapKeys())
308-
return false, time.Now()
309+
return false, nil
309310
}
310311

311312
// track the earliest refresh time of the token (they should all be the same, but check anyway)
@@ -316,10 +317,10 @@ CheckUrl:
316317
token, err := jwt.ParseSigned(entry.Password)
317318
if err != nil {
318319
klog.V(2).InfoS("Internal registry pull secret needs to be refreshed", "reason", "auth token cannot be parsed", "ns", imagePullSecret.Namespace, "name", imagePullSecret.Name, "url", url, "error", err)
319-
return false, time.Now()
320+
return false, nil
320321
}
321322

322-
// was token created with previoud token signing cert?
323+
// was token created with previous token signing cert?
323324
var validKeyID bool
324325
for _, kid := range kids {
325326
if token.Headers[0].KeyID == kid {
@@ -329,29 +330,39 @@ CheckUrl:
329330
}
330331
if !validKeyID {
331332
klog.V(2).InfoS("Internal registry pull secret needs to be refreshed", "reason", "auth token was not signed by a current signer", "ns", imagePullSecret.Namespace, "name", imagePullSecret.Name, "url", url, "error", err)
332-
return false, time.Now()
333+
return false, nil
333334
}
334335

335336
var claims jwt.Claims
336337
// "unsafe" in the following API just means we are not validating the signature
337338
err = token.UnsafeClaimsWithoutVerification(&claims)
338339
if err != nil {
339340
klog.V(2).InfoS("Internal registry pull secret needs to be refreshed", "reason", "auth token claim cannot be parsed", "ns", imagePullSecret.Namespace, "name", imagePullSecret.Name, "url", url, "error", err)
340-
return false, time.Now()
341+
return false, nil
341342
}
342-
// if token is expired or will only be valid less than 40% of its remaining duration we want to trigger a new token request
343-
refreshTime := claims.Expiry.Time().Add(time.Duration(-int64(float64(time.Now().Sub(claims.Expiry.Time())) * 0.4)))
344-
klog.V(4).InfoS("Token expiration check.", "ns", imagePullSecret.Namespace, "name", imagePullSecret.Name, "url", url, "expirtyTime", claims.Expiry.Time(), "refreshTime", refreshTime)
345-
if time.Now().After(refreshTime) {
346-
klog.V(2).InfoS("Internal registry pull secret needs to be refreshed", "reason", "auth token needs to be refreshed", "ns", imagePullSecret.Namespace, "name", imagePullSecret.Name, "url", url, "expirtyTime", claims.Expiry.Time(), "refreshTime", refreshTime)
347-
return false, time.Now()
343+
// if token is expired or less than 40% of its valid duration is left, we want to trigger a new token request
344+
refreshTime := refreshThresholdTime(claims.NotBefore.Time(), claims.Expiry.Time())
345+
klog.V(4).InfoS("Token expiration check.", "ns", imagePullSecret.Namespace, "name", imagePullSecret.Name, "url", url, "exp", claims.Expiry.Time(), "refreshTime", refreshTime)
346+
if now.After(refreshTime) {
347+
klog.V(2).InfoS("Internal registry pull secret needs to be refreshed", "reason", "auth token needs to be refreshed", "ns", imagePullSecret.Namespace, "name", imagePullSecret.Name, "url", url, "exp", claims.Expiry.Time(), "refreshTime", refreshTime)
348+
return false, nil
348349
}
349350
if requeueAt.IsZero() || requeueAt.After(refreshTime) {
350351
requeueAt = refreshTime
351352
}
352353
}
353354
klog.V(4).InfoS("Internal registry pull secret does not need to be refreshed.", "ns", imagePullSecret.Namespace, "name", imagePullSecret.Name)
354-
return true, requeueAt
355+
return true, &requeueAt
356+
}
357+
358+
func refreshThresholdTime(nbf, exp time.Time) time.Time {
359+
// calculate the time at which only 40% of the valid duration would be left
360+
validDuration := exp.Sub(nbf)
361+
if validDuration < 0 {
362+
// this should not happen, but let's not get stuck if it ever does
363+
return time.Time{}
364+
}
365+
return exp.Add(-time.Duration(int64(float64(validDuration) * 0.4)))
355366
}
356367

357368
func (c *imagePullSecretController) Run(ctx context.Context, workers int) {

0 commit comments

Comments
 (0)