misc

fegin · fegin · commit d6eae5863d6d · 2025-11-06T23:20:00.000-08:00
diff --git a/torchtitan/experiments/simple_fsdp/tests/test_numerics.py b/torchtitan/experiments/simple_fsdp/tests/test_numerics.py
@@ -20,13 +20,13 @@ def init_test(self):
         self.loss_fn = cross_entropy_loss
         data_parallel_shard_degree = -1
         if self.mode == "replicate":
-            self.dp_mesh_dim_names = ("dp_replicate",)
+            self.dp_mesh_dim_names = ["dp_replicate"]
             data_parallel_replicate_degree = self.world_size
         elif self.mode == "fully_shard":
-            self.dp_mesh_dim_names = ("dp_shard_cp",)
+            self.dp_mesh_dim_names = ["fsdp"]
             data_parallel_replicate_degree = 1
         elif self.mode == "hybrid_shard":
-            self.dp_mesh_dim_names = ("dp_replicate", "dp_shard_cp")
+            self.dp_mesh_dim_names = ["dp_replicate", "fsdp"]
             data_parallel_replicate_degree = self.world_size // 2
         else:
             raise ValueError(f"Unsupported mode {self.mode}")
@@ -41,7 +41,6 @@ def init_test(self):
             etp=1,
             world_size=self.world_size,
         )
-        self.device_mesh = self.parallel_dims.world_mesh
 
     def get_input(self):
         inputs = torch.randn(8, 8).cuda()
@@ -50,7 +49,7 @@ def get_input(self):
         return model, inputs, labels
 
     def run_fsdp2(self, model, inputs, labels, epoch=20):
-        fully_shard(model, mesh=self.device_mesh[tuple(self.dp_mesh_dim_names)])
+        fully_shard(model, mesh=self.parallel_dims.get_mesh(self.dp_mesh_dim_names))
         optim = self.optimizer(model.parameters(), lr=1e-4)
         losses = []
         for _ in range(epoch):
@@ -65,7 +64,7 @@ def run_fsdp2(self, model, inputs, labels, epoch=20):
     def run_simple_fsdp(self, model, inputs, labels, epoch=20):
         model = data_parallel(
             model,
-            device_mesh=self.device_mesh[tuple(self.dp_mesh_dim_names)],
+            device_mesh=self.parallel_dims.get_mesh(self.dp_mesh_dim_names),
             mode=self.mode,
         )
         optim = self.optimizer(model.parameters(), lr=1e-4)
@@ -82,7 +81,7 @@ def run_simple_fsdp(self, model, inputs, labels, epoch=20):
     def run_simple_fsdp_compiled_aot_eager(self, model, inputs, labels, epoch=20):
         model = data_parallel(
             model,
-            device_mesh=self.device_mesh[tuple(self.dp_mesh_dim_names)],
+            device_mesh=self.parallel_dims.get_mesh(self.dp_mesh_dim_names),
             mode=self.mode,
         )
         # TODO: Add "inductor" backend when it's numerical issues are fixed
diff --git a/torchtitan/models/flux/train.py b/torchtitan/models/flux/train.py
@@ -35,7 +35,7 @@ def __init__(self, job_config: JobConfig):
             self.parallel_dims,
             self.device,
             job_config.debug,
-            distinct_seed_mesh_dims=["dp_shard", "dp_replicate"],
+            distinct_seed_mesh_dims=["fsdp", "dp_replicate"],
         )
 
         # NOTE: self._dtype is the data type used for encoders (image encoder, T5 text encoder, CLIP text encoder).

Original file line number	Diff line number	Diff line change
`@@ -35,7 +35,7 @@ def __init__(self, job_config: JobConfig):`
`35`	`35`	`self.parallel_dims,`
`36`	`36`	`self.device,`
`37`	`37`	`job_config.debug,`
`38`		`- distinct_seed_mesh_dims=["dp_shard", "dp_replicate"],`
	`38`	`+ distinct_seed_mesh_dims=["fsdp", "dp_replicate"],`
`39`	`39`	`)`
`40`	`40`
`41`	`41`	`# NOTE: self._dtype is the data type used for encoders (image encoder, T5 text encoder, CLIP text encoder).`