Commit e99e16c
[RFC] Support full bf16 training (#1646)
This PR adds support for full bfloat16 training. In SFT it is pretty common to store everything in bfloat16 to save memory, with select tensors (logits, RoPE buffers, and activations) kept in higher precision to preserve numerical accuracy. Separately, I think having this supported more generally would be useful for faster iteration -- e.g. it lets me run Llama3 70B on a single node of H100s, which is otherwise not possible with the default config.

Assuming this is generally useful, I would like feedback on:

1) Acceptable loss convergence: in the first 100 steps on Llama3 8B, full bf16 training goes from 12.25 -> 8, as opposed to 12.25 -> 7 with fp32 training. Is this a concern? (As mentioned, for SFT this is less of an issue; happy to validate that statement if that's helpful.)
2) Interaction with mixed precision training -- where is the right place to validate that these are not both set at once?
3) Where to put the `set_default_dtype` API.
1 parent 40a8725 commit e99e16c
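For intuition, here is a minimal, self-contained sketch (plain PyTorch, not torchtitan code) of the pattern described above: parameters, activations, and gradients all live in bfloat16, and only the logits are upcast to fp32 before the loss.

import torch
import torch.nn as nn
import torch.nn.functional as F

# Toy model held entirely in bfloat16 (a hypothetical stand-in for the real model).
model = nn.Linear(16, 8, dtype=torch.bfloat16)
x = torch.randn(4, 16, dtype=torch.bfloat16)
target = torch.randint(0, 8, (4,))

logits = model(x)                                # bf16 parameters and activations
loss = F.cross_entropy(logits.float(), target)   # upcast logits to fp32 for a numerically stable loss
loss.backward()                                  # gradients are stored in bf16, matching the parameters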

File tree

3 files changed (+41, -3 lines)

torchtitan/config/job_config.py

Lines changed: 7 additions & 0 deletions
@@ -201,6 +201,13 @@ class Training:
     Whether to apply CPU offloading of parameters, gradients, and optimizer states in FSDP
     """
 
+    dtype: Literal["bfloat16", "float32"] = "float32"
+    """
+    torch dtype for training. In contrast to mixed precision training, setting training_dtype=bfloat16 will
+    put all parameters, gradients, and optimizer states in bfloat16, without an extra copy of fp32 weights.
+    In the case of full bf16 training, RoPE calculations and logits will still be in fp32.
+    """
+
     mixed_precision_param: Literal["bfloat16", "float32"] = "bfloat16"
     """
     torch dtype to use for parameters when applying mixed precision via fully_shard or torch.autocast.
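For context, enabling the new option from a job config would look roughly like the snippet below, assuming the usual mapping of the Training dataclass onto a [training] table in the job TOML file:

[training]
# Full bf16: parameters, gradients, and optimizer states all in bfloat16
# (see question 2 in the PR description about combining this with mixed precision).
dtype = "bfloat16"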

torchtitan/tools/utils.py

Lines changed: 29 additions & 1 deletion
@@ -4,11 +4,12 @@
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
 
+import contextlib
 import gc
 import subprocess
 import time
 from dataclasses import dataclass
-from typing import Optional
+from typing import Generator, Optional
 
 import torch
 from torch._utils import _get_available_device_type, _get_device_module
@@ -174,3 +175,30 @@ def check_if_feature_in_pytorch(
         f"{min_nightly_version}. Please upgrade a newer version to include the "
         f"change in ({pull_request}) for correct {feature_name}."
     )
+
+
+@contextlib.contextmanager
+def set_default_dtype(dtype: torch.dtype) -> Generator[None, None, None]:
+    """
+    Context manager to set torch's default dtype.
+
+    Args:
+        dtype (torch.dtype): The desired default dtype inside the context manager.
+
+    Returns:
+        ContextManager: context manager for setting default dtype.
+
+    Example:
+        >>> with set_default_dtype(torch.bfloat16):
+        >>>     x = torch.tensor([1.0, 2.0, 3.0])
+        >>>     x.dtype
+        torch.bfloat16
+
+
+    """
+    old_dtype = torch.get_default_dtype()
+    torch.set_default_dtype(dtype)
+    try:
+        yield
+    finally:
+        torch.set_default_dtype(old_dtype)
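A quick usage sketch of the helper above, assuming torchtitan is installed so the function can be imported from torchtitan.tools.utils as added in this diff: tensors and parameters created inside the context pick up the new default dtype, and the previous default is restored on exit.

import torch
import torch.nn as nn

from torchtitan.tools.utils import set_default_dtype

# In a fresh process torch's default dtype is float32.
assert torch.get_default_dtype() == torch.float32

with set_default_dtype(torch.bfloat16):
    layer = nn.Linear(8, 8)                      # factory calls now follow the bf16 default
    assert layer.weight.dtype == torch.bfloat16

# The finally block restores the previous default on exit.
assert torch.get_default_dtype() == torch.float32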

torchtitan/train.py

Lines changed: 5 additions & 2 deletions
@@ -22,7 +22,7 @@
     build_metrics_processor,
     ensure_pp_loss_visible,
 )
-from torchtitan.config import ConfigManager, JobConfig
+from torchtitan.config import ConfigManager, JobConfig, TORCH_DTYPE_MAP
 from torchtitan.distributed import ParallelDims, utils as dist_utils
 from torchtitan.models.attention import init_attention_mask
 from torchtitan.protocols.model_converter import build_model_converters
@@ -154,7 +154,10 @@ def __init__(self, job_config: JobConfig):
         logger.info(
             f"Building {self.train_spec.name} {job_config.model.flavor} with {model_args}"
         )
-        with torch.device("meta"):
+        with (
+            torch.device("meta"),
+            utils.set_default_dtype(TORCH_DTYPE_MAP[job_config.training.dtype]),
+        ):
             model = self.train_spec.model_cls(model_args)
 
         # Build the collection of model converters. No-op if `model.converters` empty
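To illustrate why the two context managers compose, here is a small sketch with plain torch modules (nn.Linear standing in for the real model class, and the helper imported from torchtitan.tools.utils): construction on the meta device allocates no memory but records bf16 as the parameter dtype, so later materialization keeps bf16.

import torch
import torch.nn as nn

from torchtitan.tools.utils import set_default_dtype

# Build on the meta device with a bf16 default, mirroring the diff above.
with torch.device("meta"), set_default_dtype(torch.bfloat16):
    model = nn.Linear(1024, 1024)

assert model.weight.is_meta                      # no parameter memory allocated yet
assert model.weight.dtype == torch.bfloat16      # but the dtype is already recorded as bf16

# Materializing later (here via to_empty) keeps the recorded bf16 dtype.
model = model.to_empty(device="cpu")
assert model.weight.dtype == torch.bfloat16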

0 comments