pytorch · EquationWalker · Sep 14, 2025
@@ -448,3 +448,25 @@ def _clip_grad_norm_with_ep(
     torch.nn.utils.clip_grads_with_norm_(non_ep_params, max_norm, total_norm, foreach)
 
     return total_norm
+
+
+@contextlib.contextmanager
+def _no_grad_sync(model: torch.nn.Module):
+    """Turn gradient synchronization off on ``model`` for the managed block.
+
+    Calls ``model.set_requires_gradient_sync(False)`` on entry and
+    ``model.set_requires_gradient_sync(True)`` on exit, including when the
+    managed block raises.
+
+    NOTE: the flag is always reset to ``True`` on exit; whatever value it
+    held before entry is not recorded or restored.
+
+    Args:
+        model: Module exposing ``set_requires_gradient_sync`` (presumably an
+            FSDP-wrapped module -- only the method itself is required here).
+    """
+    with contextlib.ExitStack() as cleanup:
+        model.set_requires_gradient_sync(False)
+        # Registered only after the flag was switched off, so a failure while
+        # disabling does not trigger a spurious re-enable.
+        cleanup.callback(lambda: model.set_requires_gradient_sync(True))
+        yield
+
+
+@contextlib.contextmanager
+def no_grad_sync(models: list[torch.nn.Module], enable: bool = False):
+    """Optionally suspend gradient synchronization on several modules.
+
+    When ``enable`` is false the managed block runs unchanged and ``models``
+    is not iterated. When it is true, every module that has a
+    ``set_requires_gradient_sync`` attribute is wrapped in ``_no_grad_sync``
+    for the duration of the block; modules without that attribute are left
+    untouched.
+
+    Args:
+        models: Modules to consider.
+        enable: Whether to actually suspend gradient sync. Defaults to
+            ``False``, which makes this context manager a no-op.
+    """
+    with contextlib.ExitStack() as stack:
+        # An empty tuple stands in for ``models`` when disabled, so nothing is
+        # entered and the stack unwinds trivially.
+        for module in models if enable else ():
+            if hasattr(module, "set_requires_gradient_sync"):
+                stack.enter_context(_no_grad_sync(module))
+        yield
@@ -482,7 +482,9 @@ def train_step(
         # entire step will not be executed.
         for _microbatch in range(self.gradient_accumulation_steps):
             input_dict, labels = next(data_iterator)
-            loss = self.forward_backward_step(input_dict, labels)
+            no_grad_sync = _microbatch < self.gradient_accumulation_steps - 1
+            with dist_utils.no_grad_sync(self.model_parts, no_grad_sync):
+                loss = self.forward_backward_step(input_dict, labels)
             accumulated_losses.append(loss.detach())
 
         grad_norm = dist_utils.clip_grad_norm_(