Skip to content

Commit 6748c47

Browse files
committed
fix bug cant get status when worker pod spec is invalid
1 parent 4a63d3c commit 6748c47

File tree

1 file changed

+18
-2
lines changed

1 file changed

+18
-2
lines changed

pkg/controller/mpi_job_controller.go

Lines changed: 18 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -961,8 +961,13 @@ func (c *MPIJobController) getOrCreateWorker(mpiJob *kubeflow.MPIJob) ([]*corev1
961961
// If an error occurs during Get/Create, we'll requeue the item so we
962962
// can attempt processing again later. This could have been caused by a
963963
// temporary network failure, or any other transient reason.
964+
// But, if err is about pod spec invalid, retrying would be
965+
// futile, the status of job should turn to failed.
964966
if err != nil {
965967
c.recorder.Eventf(mpiJob, corev1.EventTypeWarning, mpiJobFailedReason, "worker pod created failed: %v", err)
968+
if errors.IsInvalid(err) {
969+
return workerPods, nil
970+
}
966971
return nil, err
967972
}
968973
// If the worker is not controlled by this MPIJob resource, we should log
@@ -1076,7 +1081,6 @@ func (c *MPIJobController) updateMPIJobStatus(mpiJob *kubeflow.MPIJob, launcher
10761081
running = 0
10771082
evict = 0
10781083
)
1079-
10801084
initializeMPIJobStatuses(mpiJob, kubeflow.MPIReplicaTypeWorker)
10811085
//spec := mpiJob.Spec.MPIReplicaSpecs[kubeflow.MPIReplicaTypeWorker]
10821086
for i := 0; i < len(worker); i++ {
@@ -1100,7 +1104,19 @@ func (c *MPIJobController) updateMPIJobStatus(mpiJob *kubeflow.MPIJob, launcher
11001104
c.recorder.Event(mpiJob, corev1.EventTypeWarning, mpiJobEvict, msg)
11011105
}
11021106

1103-
if isMPIJobSuspended(mpiJob) {
1107+
// When workerSpec != nil and workerSpec.Replicas != 0 and len(worker) == 0,
1108+
// pod spec must be wrong, job failed.
1109+
workerSpec := mpiJob.Spec.MPIReplicaSpecs[kubeflow.MPIReplicaTypeWorker]
1110+
if workerSpec != nil && len(worker) == 0 && *workerSpec.Replicas != 0 {
1111+
msg := "invalid pod spec"
1112+
c.recorder.Event(mpiJob, corev1.EventTypeWarning, mpiJobFailedReason, msg)
1113+
if mpiJob.Status.CompletionTime == nil {
1114+
now := metav1.Now()
1115+
mpiJob.Status.CompletionTime = &now
1116+
}
1117+
updateMPIJobConditions(mpiJob, kubeflow.JobFailed, corev1.ConditionTrue, mpiJobFailedReason, msg)
1118+
mpiJobsFailureCount.Inc()
1119+
} else if isMPIJobSuspended(mpiJob) {
11041120
msg := fmt.Sprintf("MPIJob %s/%s is suspended.", mpiJob.Namespace, mpiJob.Name)
11051121
updateMPIJobConditions(mpiJob, kubeflow.JobRunning, corev1.ConditionFalse, mpiJobSuspendedReason, msg)
11061122
} else if launcher != nil && launcherPodsCnt >= 1 && running == len(worker) {

0 commit comments

Comments
 (0)