
Commit ff07852

[Flux] Enable unique random seed for multiple ranks (#1946)
Currently, when enabling HSDP, all ranks along the `dp_replicate` dim share the same seed, which is bad for noise generation in Flux training. Fix this by changing set_determinism.
1 parent 29eb910 commit ff07852
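The core of the change is how the per-rank seed offset is computed. Previously `set_determinism` accepted a single `distinct_seed_mesh_dim`, so with HSDP every rank along `dp_replicate` received the same seed and therefore generated identical noise. The new `distinct_seed_mesh_dims` list combines the local ranks of several mesh dimensions into one offset, mixed-radix style. A minimal standalone sketch of that arithmetic (plain Python with illustrative mesh sizes, not the torchtitan code itself):

def seed_offset(local_ranks, dim_sizes):
    # local_ranks[i] is this rank's coordinate on the i-th distinct dim,
    # dim_sizes[i] is that dimension's size
    offset, cumulative_size = 0, 1
    for rank, size in zip(local_ranks, dim_sizes):
        offset += rank * cumulative_size   # contribution of this dimension
        cumulative_size *= size            # radix for the next dimension
    return offset

# dp_shard=3, dp_replicate=2: every (shard, replicate) pair gets a distinct
# offset in 0..5, while ranks that differ only along dims not listed as
# distinct (e.g. tp) share the same offset and hence the same seed.
assert {seed_offset([s, r], [3, 2]) for s in range(3) for r in range(2)} == set(range(6))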

File tree

5 files changed: +260 −19 lines


scripts/generate/test_generate.py

Lines changed: 6 additions & 1 deletion
@@ -134,7 +134,12 @@ def test_generate(
         apply_tp_minus_sp(model, parallel_dims.world_mesh["tp"])

     debug_config = DebugConfig(seed=seed, deterministic=deterministic)
-    dist_utils.set_determinism(world_mesh, device, debug_config)
+    dist_utils.set_determinism(
+        world_mesh=world_mesh,
+        device=device,
+        debug_config=debug_config,
+        distinct_seed_mesh_dims=["pp"],
+    )

     # materalize model
     model.to_empty(device=device_type)
Lines changed: 213 additions & 0 deletions
@@ -0,0 +1,213 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

import os
import unittest
from unittest.mock import MagicMock, patch

import torch
from torchtitan.config import Debug as DebugConfig
from torchtitan.distributed.utils import set_determinism


class FakeDeviceMesh:
    """Fake DeviceMesh for testing seed uniqueness.

    Args:
        mesh_dim_names: List of dimension names (e.g., ["dp", "pp", "tp"])
        mesh_sizes: List of sizes for each dimension (e.g., [4, 2, 8])
        rank_coords: Tuple of coordinates for this rank (e.g., (2, 1, 5))
    """

    def __init__(self, mesh_dim_names, mesh_sizes, rank_coords):
        self.mesh_dim_names = mesh_dim_names
        self.mesh_sizes = dict(zip(mesh_dim_names, mesh_sizes))
        self.rank_coords = dict(zip(mesh_dim_names, rank_coords))

    def __getitem__(self, key):
        """Return a submesh for the given dimension(s)."""
        if isinstance(key, str):
            # Single dimension
            submesh = MagicMock()
            submesh.get_local_rank.return_value = self.rank_coords[key]
            submesh.size.return_value = self.mesh_sizes[key]
            submesh.get_coordinate.return_value = self.rank_coords[key]
            return submesh
        elif isinstance(key, list):
            # Multiple dimensions
            submesh = MagicMock()
            # For multiple dimensions, get_coordinate should return None
            # since we're not testing this path
            submesh.get_coordinate.return_value = None
            return submesh
        else:
            raise ValueError(f"Unsupported key type: {type(key)}")

    def get_coordinate(self):
        """Return the coordinate tuple for this rank."""
        return tuple(self.rank_coords[dim] for dim in self.mesh_dim_names)


class TestSetDeterminismWithFakeMesh(unittest.TestCase):
    """Test set_determinism with fake mesh to verify seed uniqueness."""

    def setUp(self):
        """Set up test fixtures."""
        self.device = torch.device("cpu")

    def tearDown(self):
        """Clean up after tests."""
        torch.use_deterministic_algorithms(False)
        if "PYTHONHASHSEED" in os.environ:
            del os.environ["PYTHONHASHSEED"]
        if "CUBLAS_WORKSPACE_CONFIG" in os.environ:
            del os.environ["CUBLAS_WORKSPACE_CONFIG"]

    @patch("torch.distributed.distributed_c10d.get_world_size")
    @patch("torch.distributed.distributed_c10d.get_rank")
    def test_seed_uniqueness_2d_mesh(self, mock_get_rank, mock_get_world_size):
        """Test that different PP ranks get unique seeds, same DP ranks share seeds."""
        mock_get_world_size.return_value = 8  # 4 * 2

        mesh_dim_names = ["dp", "pp"]
        mesh_sizes = [4, 2]
        base_seed = 1000

        seeds_by_coord = {}

        # Test all possible rank coordinates
        for dp_rank in range(mesh_sizes[0]):
            for pp_rank in range(mesh_sizes[1]):
                mock_get_rank.return_value = dp_rank * mesh_sizes[1] + pp_rank

                # Create fake mesh for this rank
                rank_coords = (dp_rank, pp_rank)
                fake_mesh = FakeDeviceMesh(mesh_dim_names, mesh_sizes, rank_coords)

                # Call set_determinism with distinct seeds only on PP dimension
                debug_config = DebugConfig(seed=base_seed, deterministic=False)
                set_determinism(
                    world_mesh=fake_mesh,
                    device=self.device,
                    debug_config=debug_config,
                    distinct_seed_mesh_dims=["pp"],
                )

                # Capture the seed that was set
                rng_state = torch.get_rng_state()
                actual_seed = rng_state[:8].view(torch.int64).item()

                # Store for verification
                coord_key = (dp_rank, pp_rank)
                seeds_by_coord[coord_key] = actual_seed

        # Verify that coordinates with same PP but different DP have the same seed
        for pp_rank in range(mesh_sizes[1]):
            # All DP ranks should have the same seed for this PP rank
            seeds_for_this_pp = [
                seeds_by_coord[(dp_rank, pp_rank)] for dp_rank in range(mesh_sizes[0])
            ]
            self.assertEqual(
                len(set(seeds_for_this_pp)),
                1,
                f"Different DP ranks at pp={pp_rank} should have same seed, "
                f"got {seeds_for_this_pp}",
            )

        # Verify that different PP ranks have different seeds
        unique_pp_seeds = set()
        for pp_rank in range(mesh_sizes[1]):
            seed = seeds_by_coord[(0, pp_rank)]  # Just check first DP rank
            self.assertNotIn(seed, unique_pp_seeds, f"Duplicate seed for pp={pp_rank}")
            unique_pp_seeds.add(seed)

        self.assertEqual(
            len(unique_pp_seeds),
            mesh_sizes[1],
            f"Expected {mesh_sizes[1]} unique seeds for PP dimension",
        )

    @patch("torch.distributed.distributed_c10d.get_world_size")
    @patch("torch.distributed.distributed_c10d.get_rank")
    def test_seed_uniqueness_3d_mesh(self, mock_get_rank, mock_get_world_size):
        """Test that different dp_shard and dp_replicate get unique seeds, TP shares seeds."""
        mesh_dim_names = ["dp_shard", "dp_replicate", "tp"]
        mesh_sizes = [3, 2, 4]
        mock_get_world_size.return_value = 3 * 2 * 4
        base_seed = 2000

        seeds_by_coord = {}

        # Test all possible rank coordinates
        for dp_shard_rank in range(mesh_sizes[0]):
            for dp_replicate_rank in range(mesh_sizes[1]):
                for tp_rank in range(mesh_sizes[2]):
                    global_rank = (
                        dp_shard_rank * (mesh_sizes[1] * mesh_sizes[2])
                        + dp_replicate_rank * mesh_sizes[2]
                        + tp_rank
                    )
                    mock_get_rank.return_value = global_rank

                    # Create fake mesh for this rank
                    rank_coords = (dp_shard_rank, dp_replicate_rank, tp_rank)
                    fake_mesh = FakeDeviceMesh(mesh_dim_names, mesh_sizes, rank_coords)

                    # Call set_determinism with distinct seeds on dp_shard and dp_replicate only
                    debug_config = DebugConfig(seed=base_seed, deterministic=False)
                    set_determinism(
                        world_mesh=fake_mesh,
                        device=self.device,
                        debug_config=debug_config,
                        distinct_seed_mesh_dims=["dp_shard", "dp_replicate"],
                    )

                    # Capture the seed that was set
                    rng_state = torch.get_rng_state()
                    actual_seed = rng_state[:8].view(torch.int64).item()

                    # Store for verification
                    coord_key = (dp_shard_rank, dp_replicate_rank, tp_rank)
                    seeds_by_coord[coord_key] = actual_seed

        # Verify that coordinates with same (dp_shard, dp_replicate) but different TP have the same seed
        for dp_shard_rank in range(mesh_sizes[0]):
            for dp_replicate_rank in range(mesh_sizes[1]):
                # All TP ranks should have the same seed for this (dp_shard, dp_replicate)
                seeds_for_this_dp = [
                    seeds_by_coord[(dp_shard_rank, dp_replicate_rank, tp_rank)]
                    for tp_rank in range(mesh_sizes[2])
                ]
                self.assertEqual(
                    len(set(seeds_for_this_dp)),
                    1,
                    f"Different TP ranks at (dp_shard={dp_shard_rank}, dp_replicate={dp_replicate_rank}) "
                    f"should have same seed, got {seeds_for_this_dp}",
                )

        # Verify that different (dp_shard, dp_replicate) combinations have different seeds
        unique_dp_seeds = set()
        for dp_shard_rank in range(mesh_sizes[0]):
            for dp_replicate_rank in range(mesh_sizes[1]):
                seed = seeds_by_coord[
                    (dp_shard_rank, dp_replicate_rank, 0)
                ]  # Just check first TP rank
                self.assertNotIn(
                    seed,
                    unique_dp_seeds,
                    f"Duplicate seed for (dp_shard={dp_shard_rank}, dp_replicate={dp_replicate_rank})",
                )
                unique_dp_seeds.add(seed)

        self.assertEqual(
            len(unique_dp_seeds),
            mesh_sizes[0] * mesh_sizes[1],
            f"Expected {mesh_sizes[0] * mesh_sizes[1]} unique seeds for (dp_shard, dp_replicate) combinations",
        )


if __name__ == "__main__":
    unittest.main()
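The tests above recover the seed that `set_determinism` applied by reinterpreting the first eight bytes of `torch.get_rng_state()` as an int64. An equivalent check (a sketch, not part of this commit) could use `torch.initial_seed()`, which reports the seed most recently passed to `torch.manual_seed` on the default CPU generator:

import torch

torch.manual_seed(1000 + 1)          # stand-in for what set_determinism ends up doing
assert torch.initial_seed() == 1001  # base seed plus this rank's offset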

torchtitan/distributed/utils.py

Lines changed: 39 additions & 17 deletions
@@ -84,17 +84,24 @@ def set_determinism(
     world_mesh: DeviceMesh | None,
     device: torch.device,
     debug_config: DebugConfig,
-    distinct_seed_mesh_dim: str = "pp",
+    distinct_seed_mesh_dims: list[str],
 ) -> None:
     """
     Set the same DTensor manual seed for all dimensions in world mesh, but only different seeds
-    across dimension denoted by `distinct_seed_mesh_dim`. An example use case is pipeline parallelism,
+    across dimensions denoted by `distinct_seed_mesh_dims`. An example use case is pipeline parallelism,
     where we want to have the same seed across SPMD groups, but different seeds across PP groups.

     Currently, does not set seeds for the CUDA RNG since TorchTitan always uses DTensor for SPMD parallelisms,
     and DTensor manages its own RNG tracker, but we could extend to support both if needed.

     Set Determinism flags for increased reproducibility with loss of performance.
+
+    Args:
+        world_mesh: Device mesh for distributed training
+        device: Device to use
+        distinct_seed_mesh_dims: List of mesh dimension names to have distinct seeds across.
+        seed: Base seed value (if None, will be determined automatically)
+        deterministic: Whether to enable deterministic algorithms
     """
     if debug_config.deterministic:
         logger.info("Deterministic algorithm enabled (expect perf degradation).")
@@ -133,28 +140,43 @@ def set_determinism(
         torch.distributed.broadcast(seed_tensor, src=0)
         seed = seed_tensor.to("cpu").view(torch.uint64).item()

-    # Set distinct seed for each rank in mesh dimensions, with dimension name provided by `distinct_seed_mesh_dim`
+    # Set distinct seed for each rank in mesh dimensions, with dimension names provided by `distinct_seed_mesh_dims`
     # For PP + SPMD cases, we want to separate the world into the SPMD mesh and the PP mesh,
     # and choose a unique seed for each rank on the PP mesh.
-    # TODO(jianiw): We could further extend this to support multiple distinct dimensions instead of just one.
-    if (
-        c10d.get_world_size() > 1
-        and distinct_seed_mesh_dim in world_mesh.mesh_dim_names
-    ):
-        distinct_mesh = world_mesh[distinct_seed_mesh_dim]
-        seed += distinct_mesh.get_local_rank()
+    # We support multiple distinct dimensions by adding each distinct dimension's local rank to the seed.
+    distinct_dims_in_mesh = [
+        dim for dim in distinct_seed_mesh_dims if dim in world_mesh.mesh_dim_names
+    ]
+
+    if c10d.get_world_size() > 1 and distinct_dims_in_mesh:
+        # Each dimension contributes: local_rank * (product of all previous dimension sizes)
+        # This guarantees uniqueness like multi-dimensional array indexing
+        seed_offset = 0
+        cumulative_size = 1
+
+        for dim in distinct_dims_in_mesh:
+            distinct_mesh = world_mesh[dim]
+            local_rank = distinct_mesh.get_local_rank()
+            # Add contribution from this dimension
+            seed_offset += local_rank * cumulative_size
+            # Update cumulative size for next dimension
+            cumulative_size *= distinct_mesh.size()
+
+        seed += seed_offset
         seed %= 2**64

         logger.debug(
-            f"{distinct_seed_mesh_dim} rank {distinct_mesh.get_local_rank()}, Global rank {c10d.get_rank()} using seed: {seed}"
-        )
-        duplicate_seed_mesh = list(
-            filter(
-                lambda name: name != distinct_seed_mesh_dim, world_mesh.mesh_dim_names
-            )
+            f"Distinct dims {distinct_dims_in_mesh}, Global rank {c10d.get_rank()} using seed: {seed}"
         )
+
+        # Filter out all distinct dimensions to get duplicate_seed_mesh
+        duplicate_seed_mesh_dims = [
+            name
+            for name in world_mesh.mesh_dim_names
+            if name not in distinct_dims_in_mesh
+        ]
         duplicate_seed_mesh = (
-            world_mesh[duplicate_seed_mesh] if len(duplicate_seed_mesh) else None
+            world_mesh[duplicate_seed_mesh_dims] if duplicate_seed_mesh_dims else None
         )
     else:
         duplicate_seed_mesh = world_mesh
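With this in place, a trainer that wants distinct noise both across FSDP shards and across HSDP replicas passes both dimension names, which is exactly what the Flux trainer below now does. A minimal call sketch, assuming `parallel_dims`, `device`, and `job_config` from the surrounding trainer and a world mesh that actually contains those dims (dims missing from the mesh are simply filtered out):

from torchtitan.distributed.utils import set_determinism

# Seeds differ across dp_shard and dp_replicate; ranks that differ only on
# the remaining mesh dims (e.g. tp) keep the same seed within their group.
set_determinism(
    world_mesh=parallel_dims.world_mesh,
    device=device,
    debug_config=job_config.debug,
    distinct_seed_mesh_dims=["dp_shard", "dp_replicate"],
)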

torchtitan/models/flux/train.py

Lines changed: 1 addition & 1 deletion
@@ -35,7 +35,7 @@ def __init__(self, job_config: JobConfig):
             self.parallel_dims.world_mesh,
             self.device,
             job_config.debug,
-            distinct_seed_mesh_dim="dp_shard",
+            distinct_seed_mesh_dims=["dp_shard", "dp_replicate"],
         )

         # NOTE: self._dtype is the data type used for encoders (image encoder, T5 text encoder, CLIP text encoder).

torchtitan/train.py

Lines changed: 1 addition & 0 deletions
@@ -120,6 +120,7 @@ def __init__(self, job_config: JobConfig):
             world_mesh,
             self.device,
             job_config.debug,
+            distinct_seed_mesh_dims=["pp"],
         )
         self.train_spec = train_spec_module.get_train_spec(job_config.model.name)

