It is also recommended to use the HPU-optimized version of transformers:

```python
from optimum.habana.transformers.modeling_utils import adapt_transformers_to_gaudi

# Patch transformers classes with their Gaudi-optimized counterparts
adapt_transformers_to_gaudi()
```

## Bucketing

The Multipack sampler produces a wide range of batches with different sample lengths and numbers of samples. Each such combination triggers a graph recompilation, and recompilation takes time and slows down training. To reduce the number of recompilations, the HPU implementation uses a bucketing approach: the maximum sample length in the batch is aligned to a predefined value. It is similar to padding, except that all samples in the batch are padded not to the longest sample but to a slightly larger value.

To compute the bucket size, we use the following algorithm:
- First, we find the MSB of the longest sample in the batch; call it S.
- Then we slice the range [2 ** S, 2 ** (S+1)] into 16 buckets of the same size.
- Finally, we use the top boundary of the smallest suitable bucket as the padding value.

This approach limits the bucketing overhead to at most 1/16th of the longest sample and significantly reduces the number of recompilations.
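
A minimal Python sketch of this computation, assuming a hypothetical helper `bucket_size` (illustrative only, not the actual implementation):

```python
def bucket_size(max_len: int, num_buckets: int = 16) -> int:
    """Round max_len up to the top boundary of its bucket."""
    s = max_len.bit_length() - 1            # S = index of the MSB of max_len
    step = max(2**s // num_buckets, 1)      # bucket width within [2**S, 2**(S+1)]
    # Top boundary of the smallest bucket that fits max_len:
    return ((max_len + step - 1) // step) * step

# A longest sample of 1000 tokens lies in [512, 1024); buckets are 32 tokens
# wide, so the whole batch is padded to 1024 rather than to 1000.
assert bucket_size(1000) == 1024
assert bucket_size(513) == 544
```

Since a bucket is at most `2 ** S / 16` wide and the longest sample is at least `2 ** S` tokens, the extra padding never exceeds 1/16th of that sample's length.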
## How to run
To run training, make the following changes to the config file:
```yaml
train:
  device: hpu
  distributed_backend: fsdp
  fsdp_cpu_offload_optimizer: false
  is_padding_free: true
  pipeline: accelerated
  disable_flash_attn: true
```
And use this command line:
```bash
ilab --config=./config.yaml model train --pipeline accelerated --data-path ./data.jsonl
```