
Commit 6e6c6eb

Merge pull request #35 from aws/release-1.4.0
Sagemaker Hyperpod Recipes Release 1.4.0
2 parents: 8e4e29c + d77a6f0


6 files changed: +372 -1 lines changed


README.md

Lines changed: 1 addition & 1 deletion
@@ -14,7 +14,7 @@ Amazon SageMaker HyperPod recipes include built-in support for:
 - Automated distributed checkpointing
 - Distributed optimizer
 - Accelerators: NVIDIA H100 (ml.p5), NVIDIA A100 (ml.p4), and AWS Trainium (ml.trn1)
-- Fine-tuning: Full, QLoRA, LoRA
+- Fine-tuning: Full, QLoRA, LoRA, DPO
 - AWS Instances: ml.p5.48xlarge, ml.p4d.24xlarge, and ml.trn1.32xlarge instance families
 - Supported Models: DeepSeek R1, DeepSeek R1 Distill Llama, DeepSeek R1 Distill Qwen, Llama, Mistral, Mixtral models
 - Model Evaluation: [Tensorboard](https://lightning.ai/docs/pytorch/stable/api/lightning.pytorch.loggers.tensorboard.html#module-lightning.pytorch.loggers.tensorboard), [MLflow](https://lightning.ai/docs/pytorch/stable/api/lightning.pytorch.loggers.mlflow.html), [Wandb](https://lightning.ai/docs/pytorch/stable/extensions/generated/lightning.pytorch.loggers.WandbLogger.html) - feel free to add any key word arguments to the Logger classes by using their associated kwargs config

launcher/nemo/stages.py

Lines changed: 10 additions & 0 deletions
@@ -299,6 +299,9 @@ def _make_launch_docker_container_text(self):
         if OmegaConf.select(self.cfg, "recipes.model.model_type", default=None) == "deepseek_r1":
             transformers_upgrade_cmd = "pip install transformers==4.48.2"
             post_launch_commands.append(transformers_upgrade_cmd)
+        if OmegaConf.select(self.cfg, "recipes.model.model_type", default=None) == "llama_v4":
+            transformers_upgrade_cmd = "pip install transformers==4.51.1"
+            post_launch_commands.append(transformers_upgrade_cmd)
 
         launch_docker_container_text.append(f' "{image}" sleep infinity')
         launch_docker_container_text.append("")

@@ -421,6 +424,10 @@ def _make_train_script_text(self, stage_cfg_path=None, port=41000) -> str:
             transformers_upgrade_cmd = "pip install transformers==4.48.2"
             script_text.append("")
             script_text.append(transformers_upgrade_cmd)
+        if OmegaConf.select(self.cfg, "recipes.model.model_type", default=None) == "llama_v4":
+            transformers_upgrade_cmd = "pip install transformers==4.51.1"
+            script_text.append("")
+            script_text.append(transformers_upgrade_cmd)
 
         script_text.append("")
         script_text.append(self._make_custom_call_string(stage_cfg_path))

@@ -757,6 +764,9 @@ def update_stage_specific_k8s_values(self, values_template):
         if OmegaConf.select(self.cfg, "recipes.model.model_type", default=False) == "deepseek_r1":
             transformers_upgrade_cmd = "pip install transformers==4.48.2"
             values_template.trainingConfig.pre_script.append(transformers_upgrade_cmd)
+        if OmegaConf.select(self.cfg, "recipes.model.model_type", default=None) == "llama_v4":
+            transformers_upgrade_cmd = "pip install transformers==4.51.1"
+            values_template.trainingConfig.pre_script.append(transformers_upgrade_cmd)
 
         return values_template
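The same model_type check now appears three times in stages.py, once per launch path (Docker launch, train script, Kubernetes values). Below is a minimal sketch of how the version pins could be consolidated into a single lookup; the mapping and helper name are hypothetical, not code from this commit, and the versions are taken from the diff above.

from typing import Optional

from omegaconf import OmegaConf

# Hypothetical mapping; versions copied from the diff above.
TRANSFORMERS_PIN_BY_MODEL_TYPE = {
    "deepseek_r1": "4.48.2",
    "llama_v4": "4.51.1",
}


def transformers_upgrade_cmd(cfg) -> Optional[str]:
    """Return the pip pin command for the recipe's model_type, or None if no pin applies."""
    model_type = OmegaConf.select(cfg, "recipes.model.model_type", default=None)
    version = TRANSFORMERS_PIN_BY_MODEL_TYPE.get(model_type)
    return f"pip install transformers=={version}" if version else None


# Each call site (docker launch, train script, k8s values) would then do:
# cmd = transformers_upgrade_cmd(self.cfg)
# if cmd:
#     post_launch_commands.append(cmd)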

Lines changed: 28 additions & 0 deletions
@@ -0,0 +1,28 @@
#!/bin/bash

# Original Copyright (c), NVIDIA CORPORATION. Modifications © Amazon.com

# Users should set up their cluster type in /recipes_collection/config.yaml

SAGEMAKER_TRAINING_LAUNCHER_DIR=${SAGEMAKER_TRAINING_LAUNCHER_DIR:-"$(pwd)"}

HF_MODEL_NAME_OR_PATH="${HF_MODEL_NAME_OR_PATH}" # HuggingFace pretrained model name or path
HF_ACCESS_TOKEN="${HF_ACCESS_TOKEN}" # Optional HuggingFace access token

TRAIN_DIR="${TRAIN_DIR}" # Location of training dataset
VAL_DIR="${VAL_DIR}" # Location of validation dataset

EXP_DIR="${EXP_DIR}" # Location to save experiment info including logging, checkpoints, etc.

HYDRA_FULL_ERROR=1 python3 "${SAGEMAKER_TRAINING_LAUNCHER_DIR}/main.py" \
    recipes=fine-tuning/llama/hf_llama3_8b_seq8k_gpu_dpo \
    base_results_dir="${SAGEMAKER_TRAINING_LAUNCHER_DIR}/results" \
    recipes.run.name="hf-llama3-8b-dpo" \
    recipes.exp_manager.exp_dir="$EXP_DIR" \
    recipes.trainer.num_nodes=1 \
    recipes.model.train_batch_size=2 \
    recipes.model.data.train_dir="$TRAIN_DIR" \
    recipes.model.data.val_dir="$VAL_DIR" \
    recipes.model.hf_model_name_or_path="$HF_MODEL_NAME_OR_PATH" \
    recipes.model.hf_access_token="$HF_ACCESS_TOKEN" \
Lines changed: 28 additions & 0 deletions
@@ -0,0 +1,28 @@
#!/bin/bash

# Original Copyright (c), NVIDIA CORPORATION. Modifications © Amazon.com

# Users should set up their cluster type in /recipes_collection/config.yaml

SAGEMAKER_TRAINING_LAUNCHER_DIR=${SAGEMAKER_TRAINING_LAUNCHER_DIR:-"$(pwd)"}

HF_MODEL_NAME_OR_PATH="${HF_MODEL_NAME_OR_PATH}" # HuggingFace pretrained model name or path
HF_ACCESS_TOKEN="${HF_ACCESS_TOKEN}" # Optional HuggingFace access token

TRAIN_DIR="${TRAIN_DIR}" # Location of training dataset
VAL_DIR="${VAL_DIR}" # Location of validation dataset

EXP_DIR="${EXP_DIR}" # Location to save experiment info including logging, checkpoints, etc.

HYDRA_FULL_ERROR=1 python3 "${SAGEMAKER_TRAINING_LAUNCHER_DIR}/main.py" \
    recipes=fine-tuning/llama/hf_llama4_17b_16e_seq8k_gpu_lora_text_to_text \
    base_results_dir="${SAGEMAKER_TRAINING_LAUNCHER_DIR}/results" \
    recipes.run.name="hf-llama-4-17b-16e-lora" \
    recipes.exp_manager.exp_dir="$EXP_DIR" \
    recipes.trainer.num_nodes=1 \
    recipes.model.train_batch_size=1 \
    recipes.model.data.train_dir="$TRAIN_DIR" \
    recipes.model.data.val_dir="$VAL_DIR" \
    recipes.model.hf_model_name_or_path="$HF_MODEL_NAME_OR_PATH" \
    recipes.model.hf_access_token="$HF_ACCESS_TOKEN" \
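Both new launcher scripts follow the same pattern: export a handful of environment variables, then call main.py with Hydra overrides that select the recipe. The sketch below shows one way to drive the DPO launcher from Python; the script path, model id, and data locations are placeholders for illustration, not values from this commit.

import os
import subprocess

# Placeholder values; substitute your own model id, paths, and token.
env = dict(os.environ)
env.update({
    "HF_MODEL_NAME_OR_PATH": "meta-llama/Meta-Llama-3-8B",  # example HF model id
    "HF_ACCESS_TOKEN": "<your-hf-token>",                   # optional
    "TRAIN_DIR": "/fsx/data/dpo/train",
    "VAL_DIR": "/fsx/data/dpo/val",
    "EXP_DIR": "/fsx/experiments/llama3-8b-dpo",
})

# Hypothetical path to the DPO launcher script added in this commit;
# use the actual location in the repository.
subprocess.run(
    ["bash", "launcher_scripts/llama/run_hf_llama3_8b_seq8k_gpu_dpo.sh"],
    env=env,
    check=True,
)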
Lines changed: 155 additions & 0 deletions
@@ -0,0 +1,155 @@
# Original Copyright (c), NVIDIA CORPORATION. Modifications © Amazon.com

# Basic run information configs
run:
  name: llama-8b
  results_dir: ${base_results_dir}/${.name}
  time_limit: "6-00:00:00"
  model_type: hf # huggingface for our recipes

# Basic pytorch lightning trainer config
trainer:
  devices: 8
  num_nodes: 1
  accelerator: gpu
  precision: bf16
  max_steps: 50
  log_every_n_steps: 1
  val_check_interval: 1
  limit_val_batches: 0 # Number of batches per each validation run, set to 0 to disable validation.

# Basic pytorch lightning experiment config
# Config for checkpoint/tensorboard etc
exp_manager:
  exp_dir: null
  name: experiment
  # experiment loggers
  create_tensorboard_logger: False
  summary_writer_kwargs: {"save_dir" : "${recipes.exp_manager.exp_dir}/tensorboard"}
  create_mlflow_logger: False
  mlflow_logger_kwargs: {"tracking_uri" : "${recipes.exp_manager.exp_dir}/mlflow"}
  create_wandb_logger: False
  wandb_logger_kwargs: {"save_dir" : "${recipes.exp_manager.exp_dir}"} # wandb creates a wandb folder by default
  create_checkpoint_callback: True
  # Configs to save checkpoint with a fixed interval
  # Note: These configs will not work with auto checkpoint mode
  checkpoint_callback_params:
    # Set save_top_k = 0 to disable sharded checkpointing
    save_top_k: 0
    every_n_train_steps: 10
    monitor: "step"
    mode: "max"
    save_last: False
  checkpoint_dir: ${recipes.exp_manager.exp_dir}/checkpoints/
  resume_from_checkpoint: null
  # Enable auto_checkpoint to automatically calculate the checkpoint interval and resume from checkpoint
  auto_checkpoint:
    enabled: False
  export_full_model:
    # Set every_n_train_steps = 0 to disable full checkpointing
    every_n_train_steps: 0
    save_last: True

################# Predefined configs ##########################
use_smp_model: True # Enable sagemaker model parallelism
distributed_backend: nccl

# Model training configs
model:
  model_type: llama_v3
  # Base configs
  train_batch_size: 2
  val_batch_size: 1
  seed: 12345
  grad_clip: 1.0
  log_reduced_training_loss: True

  # Memory saving / distributed training configs
  tensor_model_parallel_degree: 1
  expert_model_parallel_degree: 1
  context_parallel_degree: 1
  moe: False
  activation_checkpointing: True
  activation_loading_horizon: 2
  delayed_param: True
  offload_activations: False

  # FSDP Configs
  sharding_strategy: hybrid_shard
  forward_prefetch: True
  shard_degree: 8
  backward_fetch_policy: backward_pre
  auto_wrap_policy: transformer_auto_wrap_policy
  limit_all_gathers: true
  use_orig_param: False

  # FP8 config
  fp8: False

  # Model architecture
  max_context_width: 8192
  max_position_embeddings: ${.max_context_width}
  num_hidden_layers: 32
  hidden_size: 4096
  num_attention_heads: 32
  intermediate_size: 14336
  initializer_range: 0.02
  layernorm_epsilon: 1e-5
  vocab_size: 128256
  num_key_value_heads: 8
  use_flash_attention: True
  rope_theta: 500000.0

  # rope scaling for llama3
  rope_scaling:
    rope_type: llama3
    factor: 8.0
    high_freq_factor: 4.0
    low_freq_factor: 1.0
    original_max_position_embeddings: 8192

  # Finetuning config
  do_finetune: True
  # The path to resume from, needs to be HF compatible
  hf_model_name_or_path: null
  hf_access_token: null
  # PEFT config
  peft:
    peft_type: null # lora
  # DPO config
  dpo:
    enabled: True
    beta: 0.1
    label_smoothing: 0.0

  precision: ${recipes.trainer.precision}
  ################# End of Predefined configs ##########################

  # Learning rate and optimizer configs
  lr_decay_iters: ${recipes.trainer.max_steps}
  # Optimizer
  optim:
    name: adamw
    lr: 1e-6
    weight_decay: 0.01
    betas:
      - 0.9
      - 0.98
    sched:
      name: CosineAnnealing
      warmup_steps: 0
      constant_steps: 0
      min_lr: 1e-7

  # Data configs
  data:
    train_dir: null
    val_dir: null
    dataset_type: hf
    use_synthetic_data: False

  # Profiling configs
  # Viztracer profiling options
  viztracer:
    enabled: false
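The recipe relies on OmegaConf interpolation: ${.max_context_width} points at a sibling key, while ${base_results_dir} and ${recipes.trainer.precision} resolve from the root of the config the launcher composes. The small illustration below shows how such references resolve, assuming the recipe is mounted under a recipes node next to base_results_dir; the literal values here are made up.

from omegaconf import OmegaConf

cfg = OmegaConf.create({
    "base_results_dir": "/fsx/results",  # assumed value normally supplied by the launcher
    "recipes": {
        "run": {
            "name": "llama-8b",
            # ${.name} is relative to the enclosing `run` node;
            # ${base_results_dir} resolves from the config root.
            "results_dir": "${base_results_dir}/${.name}",
        },
        "model": {
            "max_context_width": 8192,
            "max_position_embeddings": "${.max_context_width}",  # sibling reference
        },
    },
})

print(cfg.recipes.run.results_dir)                # /fsx/results/llama-8b
print(cfg.recipes.model.max_position_embeddings)  # 8192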
