misc

fegin · fegin · commit 743ab4acaffb · 2025-10-28T12:27:34.000-07:00
diff --git a/torchtitan/config/job_config.py b/torchtitan/config/job_config.py
@@ -397,19 +397,7 @@ class Parallelism:
     """
     Expert parallelism degree. 1 means disabled. No effect for non-MoE models.
 
-    Currently, it is supported with the following constraints:
-
-    - when etp = tp:
-
-      - cp <= ep <= dp_shard * cp
-      - ep % cp == 0
-      - dp_shard * cp % ep == 0
-
-    - when etp = 1:
-
-      - cp * tp <= ep <= dp_shard * cp * tp
-      - ep % (cp * tp) == 0
-      - dp_shard * cp * tp % ep == 0
+    Currently, etp is either 1 or is the same as tp.
 
     Note that this is still an experimental feature. Some constraints will be
     relaxed soon when we have more flexible DeviceMesh support.
diff --git a/torchtitan/distributed/parallel_dims.py b/torchtitan/distributed/parallel_dims.py
@@ -57,12 +57,6 @@ def _validate(self):
 
         if ep > 1:
             assert etp == tp or etp == 1, "Currently we only support ETP=TP or ETP=1"
-            if etp == tp:
-                # EP would borrow all cp and some dp_shard degree
-                assert ep % cp == 0 and (dp_shard * cp) % ep == 0
-            elif etp == 1:
-                # EP would borrow all cp and tp and some dp_shard degree
-                assert ep % (cp * tp) == 0 and (dp_shard * cp * tp) % ep == 0
 
     def build_mesh(self) -> DeviceMesh:
         """
@@ -71,15 +65,14 @@ def build_mesh(self) -> DeviceMesh:
         The following mesh dimensions will be created:
 
             pp:      Pipeline Parallelism (PP).
-            spmd:    Used by SPMD DTensor RNG seed.
             batch:   Used by data loading to determine the global batch size and which
                      part of the data each rank should read. This dimension includes both
                      ``dp_replicate`` and ``dp_shard``. The backend is set to ``fake`` for
                      this dimension to avoid unnecessary process group creation.
             loss:    Used by all-reduce when computing the loss. Includes ``dp_replicate``,
                      ``dp_shard``, and ``cp`` degrees, as all are data parallelisms.
             dp_replicate: For DDP or HSDP replicate dimension.
-            fsdp:    For FSDP dimension. This includes ``cp``.
+            fsdp:    For FSDP dimension. This includes ``dp_shard`` and ``cp``.
             cp:      Context Parallelism (CP).
             tp:      Tensor Parallelism (TP).
             ep:      Expert Parallelism (EP).
@@ -89,7 +82,6 @@ def build_mesh(self) -> DeviceMesh:
         Note: All the dimensions above are created by unflattening the world mesh.
         This API performs the following unflatten operations:
 
-            ["pp", "spmd"]
             ["pp", "batch", "cp", "tp"]
             ["pp", "loss", "tp"]
             ["pp", "dp_replicate", "fsdp", "tp"]
@@ -127,20 +119,16 @@ def unflatten_mesh(
         loss = self.dp_replicate * self.dp_shard * self.cp
         fsdp = self.dp_shard * self.cp
         efsdp = fsdp * self.tp // (self.etp * self.ep)
-        spmd = self.world_size // self.pp
 
         self._world_mesh = init_device_mesh(
             device_type, (self.world_size,), mesh_dim_names=("world",)
         )
-        pp_spmd_mesh = unflatten_mesh(self._world_mesh, ("pp", "spmd"), (self.pp, spmd))
-        data_mesh = unflatten_mesh(
+        dataloading_mesh = unflatten_mesh(
             self._world_mesh,
             ("pp", "batch", "cp", "tp"),
             (self.pp, batch, self.cp, self.tp),
         )
-        loss_mesh = unflatten_mesh(
-            self._world_mesh, ("pp", "loss", "tp"), (self.pp, loss, self.tp)
-        )
+        loss_mesh = dataloading_mesh["batch", "cp"].flatten("loss_mesh")
         dense_mesh = unflatten_mesh(
             self._world_mesh,
             ("pp", "dp_replicate", "fsdp", "tp"),
@@ -153,14 +141,13 @@ def unflatten_mesh(
         )
 
         self._meshes = {
-            "pp": pp_spmd_mesh["pp"],
-            "spmd": pp_spmd_mesh["spmd"],
-            "batch": data_mesh["batch"],
+            "pp": dataloading_mesh["pp"],
+            "batch": dataloading_mesh["batch"],
             "loss": loss_mesh["loss"],
             "dp_replicate": dense_mesh["dp_replicate"],
             "fsdp": dense_mesh["fsdp"],
-            "cp": data_mesh["cp"],
-            "tp": data_mesh["tp"],
+            "cp": dataloading_mesh["cp"],
+            "tp": dataloading_mesh["tp"],
             "ep": sparse_mesh["ep"],
             "efsdp": sparse_mesh["efsdp"],
             "etp": sparse_mesh["etp"],
@@ -180,7 +167,6 @@ def _validate_meshes(self):
         """Validate that created meshes have the expected sizes."""
         expected_sizes = {
             "pp": self.pp,
-            "spmd": self.world_size // self.pp,
             "batch": self.dp_replicate * self.dp_shard,
             "loss": self.dp_replicate * self.dp_shard * self.cp,
             "dp_replicate": self.dp_replicate,
@@ -199,34 +185,38 @@ def _validate_meshes(self):
                 f"expected {expected_size}, got {actual_size}"
             )
 
-    def get_mesh(self, dim: str) -> DeviceMesh | None:
-        """Get a device mesh by dimension name.
+    def get_mesh(self, dims: str | list[str]) -> DeviceMesh | None:
+        """Get a device mesh by dimension names.
 
         Args:
-            dim: Name of the mesh dimension. Valid options include:
-                 'pp', 'spmd', 'batch', 'loss', 'dp_replicate', 'fsdp',
+            dims: Name(s) of the mesh dimension(s). Valid options include:
+                 'pp', 'batch', 'loss', 'dp_replicate', 'fsdp',
                  'cp', 'tp', 'ep', 'etp', 'efsdp'
 
         Returns:
-            DeviceMesh for the requested dimension, or None if the dimension
-            has size 1 (i.e., parallelism is disabled for that dimension).
+            DeviceMesh for the requested dimension(s), or None if any of the
+            requested dimensions has size 1 (i.e., parallelism is disabled for it).
 
         Raises:
-            ValueError: If the requested dimension name is not valid.
+            ValueError: If any of the requested dimension names is not valid.
         """
         if not self._meshes:
             self.build_mesh()
 
-        if dim not in self._meshes:
+        if isinstance(dims, str):
+            dims = [dims]
+
+        if not all(dim in self._meshes for dim in dims):
             valid_dims = sorted(self._meshes.keys())
             raise ValueError(
-                f"Invalid mesh dim: '{dim}'. Valid dimensions are: {valid_dims}"
+                f"Invalid mesh dim: '{dims}'. Valid dimensions are: {valid_dims}"
             )
 
-        if self._meshes[dim].size() == 1:
+        if any(self._meshes[dim].size() == 1 for dim in dims):
             return None
 
-        return self._meshes[dim]
+        meshes = [self._meshes[dim] for dim in dims]
+        return meshes[0] if len(meshes) == 1 else DeviceMesh._concatenate(meshes)
 
     def get_all_meshes(self) -> dict[str, DeviceMesh]:
         if not self._meshes:
@@ -256,7 +246,7 @@ def cp_enabled(self):
         return self.cp > 1
 
     @property
-    def batch_enabled(self):
+    def dp_cp_enabled(self):
         return self.dp_enabled or self.cp_enabled
 
     @property
diff --git a/torchtitan/distributed/utils.py b/torchtitan/distributed/utils.py
@@ -86,18 +86,29 @@ def set_determinism(
     device: torch.device,
     seed: int | None = None,
     deterministic: bool = False,
-    distinct_seed_mesh_dim: str = "pp",
+    distinct_seed_mesh_dims: list[str] | None = None,
 ) -> None:
     """
     Set the same DTensor manual seed for all dimensions in world mesh, but only different seeds
-    across dimension denoted by `distinct_seed_mesh_dim`. An example use case is pipeline parallelism,
+    across dimensions denoted by `distinct_seed_mesh_dims`. An example use case is pipeline parallelism,
     where we want to have the same seed across SPMD groups, but different seeds across PP groups.
 
     Currently, does not set seeds for the CUDA RNG since TorchTitan always uses DTensor for SPMD parallelisms,
     and DTensor manages its own RNG tracker, but we could extend to support both if needed.
 
     Set Determinism flags for increased reproducibility with loss of performance.
+
+    Args:
+        world_mesh: Device mesh for distributed training
+        device: Device to use
+        seed: Base seed value (if None, will be determined automatically)
+        deterministic: Whether to enable deterministic algorithms
+        distinct_seed_mesh_dims: List of mesh dimension names to have distinct seeds across.
+            If None, defaults to ["pp"] for backward compatibility.
     """
+    if distinct_seed_mesh_dims is None:
+        distinct_seed_mesh_dims = ["pp"]
+
     if deterministic:
         logger.info("Deterministic algorithm enabled (expect perf degradation).")
         torch.use_deterministic_algorithms(True)
@@ -115,7 +126,7 @@ def set_determinism(
 
         FlexAttentionWrapper._compiled_flex_attn = torch.compile(flex_attention)
 
-    if parallel_dims.world_size == 1:
+    if not parallel_dims.world_size == 1:
         if seed is not None:
             torch.manual_seed(seed)
             os.environ["PYTHONHASHSEED"] = str(seed % 2**32)
@@ -131,19 +142,46 @@ def set_determinism(
         torch.distributed.broadcast(seed_tensor, src=0)
         seed = seed_tensor.to("cpu").view(torch.uint64).item()
 
-    # Set distinct seed for each rank in mesh dimensions, with dimension name provided by `distinct_seed_mesh_dim`
+    # Set distinct seed for each rank in mesh dimensions, with dimension names provided by `distinct_seed_mesh_dims`
     # For PP + SPMD cases, we want to separate the world into the SPMD mesh and the PP mesh,
     # and choose a unique seed for each rank on the PP mesh.
-    # TODO(jianiw): We could further extend this to support multiple distinct dimensions instead of just one.
-    duplicate_seed_mesh = parallel_dims.get_mesh("spmd")
-    logger.debug(f"Global Rank {c10d.get_rank()} using seed: {seed}")
-    all_meshes = parallel_dims.get_all_meshes()
-    if distinct_seed_mesh_dim in all_meshes.keys():
-        distinct_mesh = all_meshes[distinct_seed_mesh_dim]
-        seed += distinct_mesh.get_local_rank()
+    # We support multiple distinct dimensions by adding a mixed-radix offset, built from each distinct dimension's local rank, to the seed.
+    distinct_seed_meshes = [
+        parallel_dims.get_mesh(dim) for dim in distinct_seed_mesh_dims
+    ]
+    distinct_seed_meshes = [mesh for mesh in distinct_seed_meshes if mesh is not None]
+
+    if distinct_seed_meshes:
+        # Use mixed-radix positional system to ensure unique seed per coordinate
+        # Each dimension contributes: local_rank * (product of all previous dimension sizes)
+        # This guarantees uniqueness like multi-dimensional array indexing
+        seed_offset = 0
+        cumulative_size = 1
+
+        for distinct_mesh in distinct_seed_meshes:
+            local_rank = distinct_mesh.get_local_rank()
+            # Add contribution from this dimension
+            seed_offset += local_rank * cumulative_size
+
+            # Update cumulative size for next dimension
+            cumulative_size *= distinct_mesh.size()
+
+        seed += seed_offset
         seed %= 2**64
 
-        logger.debug(f"{distinct_seed_mesh_dim} rank {distinct_mesh.get_local_rank()}")
+        logger.debug(
+            f"Distinct dims {distinct_dims_in_mesh}, Global rank {c10d.get_rank()} using seed: {seed}"
+        )
+
+        # Filter out all distinct dimensions to get duplicate_seed_meshes
+        duplicate_seed_meshes = [
+            v
+            for k, v in parallel_dims.get_all_meshes()
+            if k not in distinct_dims_in_mesh
+        ]
+    else:
+        duplicate_seed_meshes = [parallel_dims.world_mesh]
+        logger.debug(f"Global Rank {c10d.get_rank()} using seed: {seed}")
 
     # The native RNGs and python RNG may not be important, except for the 1-D PP case, but we seed them for consistency.
     torch.manual_seed(seed)
@@ -152,8 +190,8 @@ def set_determinism(
 
     # As long as we are not in the 1-D (PP-only) case, we will have a seed to use for all ranks of the SPMD mesh.
     # IF PP is also used, this seed is unique per PP rank.
-    if duplicate_seed_mesh:
-        torch.distributed.tensor._random.manual_seed(seed, duplicate_seed_mesh)
+    if duplicate_seed_meshes:
+        torch.distributed.tensor._random.manual_seed(seed, duplicate_seed_meshes[0])
 
 
 def create_context_parallel_ctx(
diff --git a/torchtitan/models/llama3/infra/parallelize.py b/torchtitan/models/llama3/infra/parallelize.py
@@ -111,12 +111,10 @@ def parallelize_llama(
 
     if parallel_dims.fsdp_enabled:
         # dp_mesh is the mesh for FSDP/HSDP
-        if parallel_dims.dp_replicate_enabled:
-            dp_mesh = DeviceMesh._concatenate(
-                [parallel_dims.get_mesh("dp_replicate"), parallel_dims.get_mesh("fsdp")]
-            )
-        else:
-            dp_mesh = parallel_dims.get_mesh("fsdp")
+        names = (
+            ["dp_replicate", "fsdp"] if parallel_dims.dp_replicate_enabled else ["fsdp"]
+        )
+        dp_mesh = parallel_dims.get_mesh(names)
         apply_fsdp(
             model,
             dp_mesh,
diff --git a/torchtitan/models/llama4/infra/parallelize.py b/torchtitan/models/llama4/infra/parallelize.py
@@ -101,7 +101,7 @@ def parallelize_llama(
             tp_mesh=tp_mesh,
             ep_mesh=parallel_dims.get_mesh("ep"),
             etp_mesh=parallel_dims.get_mesh("etp"),
-            etp_enabled=parallel_dims.etp_enabled,
+            ep_etp_mesh=parallel_dims.get_mesh(["ep", "etp"]),
         )
 
     model_compile_enabled = (
@@ -123,23 +123,16 @@ def parallelize_llama(
 
     if parallel_dims.fsdp_enabled or parallel_dims.ep_enabled:
         # dp_mesh is the mesh for FSDP/HSDP
-        if parallel_dims.dp_replicate_enabled:
-            dp_mesh = DeviceMesh._concatenate(
-                [parallel_dims.get_mesh("dp_replicate"), parallel_dims.get_mesh("fsdp")]
-            )
-        else:
-            dp_mesh = parallel_dims.get_mesh("fsdp")
+        names = (
+            ["dp_replicate", "fsdp"] if parallel_dims.dp_replicate_enabled else ["fsdp"]
+        )
+        dp_mesh = parallel_dims.get_mesh(names)
 
         # the mesh dim names of which the MoE params are sharded on via FSDP/HSDP
         dp_mod_ep_mesh = None
         if parallel_dims.ep_enabled:
             if parallel_dims.dp_replicate_enabled:
-                dp_mod_ep_mesh = DeviceMesh._concatenate(
-                    [
-                        parallel_dims.get_mesh("dp_replicate"),
-                        parallel_dims.get_mesh("efsdp"),
-                    ]
-                )
+                dp_mod_ep_mesh = parallel_dims.get_mesh(["dp_replicate", "efsdp"])
             else:
                 dp_mod_ep_mesh = parallel_dims.get_mesh("efsdp")
 
@@ -434,6 +427,7 @@ def apply_moe_ep_tp(
     tp_mesh: DeviceMesh | None,
     ep_mesh: DeviceMesh | None,
     etp_mesh: DeviceMesh | None,
+    ep_etp_mesh: DeviceMesh | None,
 ):
     assert ep_mesh is not None or tp_mesh is not None
 
@@ -477,17 +471,19 @@ def apply_moe_ep_tp(
                 parallelize_plan=moe_layer_plan,
             )
 
-        experts_mesh, experts_plan = None, None
+        expert_mesh, experts_plan = None, None
         if ep_mesh is None:
+            assert ep_etp_mesh is None
             experts_mesh = tp_mesh
             # input Replicate, output Partial
             experts_plan = TensorParallel()
         elif tp_mesh is None or etp_mesh is None:
+            assert ep_etp_mesh is None
             experts_mesh = ep_mesh
             # input / output sharding on the batch / tokens dim
             experts_plan = ExpertParallel()
         else:
-            experts_mesh = DeviceMesh._concatenate([ep_mesh, etp_mesh])
+            experts_mesh = ep_etp_mesh
             experts_plan = ExpertTensorParallel()
 
         parallelize_module(
diff --git a/torchtitan/train.py b/torchtitan/train.py
@@ -526,7 +526,7 @@ def train_step(
         if not self.metrics_processor.should_log(self.step):
             return
 
-        if parallel_dims.batch_enabled:
+        if parallel_dims.dp_cp_enabled:
             loss = loss.detach()
             ft_pg = self.ft_manager.loss_sync_pg
             batch_mesh = parallel_dims.get_mesh("batch")