Merge pull request #36 from aws/release-1.3.3

rohithn1 · web-flow · commit fe7bdb50b0d0 · 2025-04-17T16:03:20.000-07:00
Sagemaker Hyperpod Recipes Release 1.3.3
diff --git a/README.md b/README.md
@@ -69,6 +69,8 @@ All model sources are from Hugging Face.
 
 | Model     | Method | Size | Sequence length | Nodes | Instance       | Accelerator | Recipe | Script |
 | --------- | ------ | ---- | ----------------| ----- | -------------- | ----------- | ------ | ------ |
+| Llama 4 Scout | LoRA (multi-modal)  | 17B 16E (109B)   | 1024          | 1     | ml.p5.48xlarge    | GPU H100    | [link](recipes_collection/recipes/fine-tuning/llama/hf_llama4_17b_16e_seq8k_gpu_lora_multimodal_finetuning.yaml) | [link](launcher_scripts/llama/run_hf_llama4_17b_16e_seq8k_gpu_lora_multimodal_finetuning.sh) |
+| Llama 4 Scout | LoRA (text-only)   | 17B 16E (109B)   | 1024          | 1     | ml.p5.48xlarge    | GPU H100    | [link](recipes_collection/recipes/fine-tuning/llama/hf_llama4_17b_16e_seq8k_gpu_lora_text_to_text.yaml) | [link](launcher_scripts/llama/run_hf_llama4_17b_16e_seq8k_gpu_lora_text_to_text.sh) |
 | DeepSeek R1 | QLoRA  | 671b   | 8192          | 2     | ml.p5.48xlarge    | GPU H100    | [link](recipes_collection/recipes/fine-tuning/deepseek/hf_deepseek_r1_671b_seq8k_gpu_qlora.yaml) | [link](launcher_scripts/deepseek/run_hf_deepseek_r1_671b_seq8k_gpu_qlora.sh) |
 | DeepSeek R1 | LoRA   | 671b   | 8192          | 5     | ml.p5.48xlarge    | GPU H100    | [link](recipes_collection/recipes/fine-tuning/deepseek/hf_deepseek_r1_671b_seq8k_gpu_lora.yaml) | [link](launcher_scripts/deepseek/run_hf_deepseek_r1_671b_seq8k_gpu_lora.sh) |
 | DeepSeek R1 Distill Llama 3 | SFT  | 8b   | 8192          | 1     | ml.p5.48xlarge    | GPU H100    | [link](recipes_collection/recipes/fine-tuning/deepseek/hf_deepseek_r1_distilled_llama_8b_seq8k_gpu_fine_tuning.yaml) | [link](launcher_scripts/deepseek/run_hf_deepseek_r1_llama_8b_seq8k_gpu_fine_tuning.sh) |
diff --git a/launcher/nemo/stages.py b/launcher/nemo/stages.py
@@ -300,7 +300,7 @@ def _make_launch_docker_container_text(self):
                 transformers_upgrade_cmd = "pip install transformers==4.48.2"
                 post_launch_commands.append(transformers_upgrade_cmd)
             if OmegaConf.select(self.cfg, "recipes.model.model_type", default=None) == "llama_v4":
-                transformers_upgrade_cmd = "pip install transformers==4.51.1"
+                transformers_upgrade_cmd = "pip install transformers==4.51.3"
                 post_launch_commands.append(transformers_upgrade_cmd)
 
         launch_docker_container_text.append(f'  "{image}" sleep infinity')
@@ -425,7 +425,7 @@ def _make_train_script_text(self, stage_cfg_path=None, port=41000) -> str:
                 script_text.append("")
                 script_text.append(transformers_upgrade_cmd)
             if OmegaConf.select(self.cfg, "recipes.model.model_type", default=None) == "llama_v4":
-                transformers_upgrade_cmd = "pip install transformers==4.51.1"
+                transformers_upgrade_cmd = "pip install transformers==4.51.3"
                 script_text.append("")
                 script_text.append(transformers_upgrade_cmd)
 
@@ -765,7 +765,7 @@ def update_stage_specific_k8s_values(self, values_template):
             transformers_upgrade_cmd = "pip install transformers==4.48.2"
             values_template.trainingConfig.pre_script.append(transformers_upgrade_cmd)
         if OmegaConf.select(self.cfg, "recipes.model.model_type", default=None) == "llama_v4":
-            transformers_upgrade_cmd = "pip install transformers==4.51.1"
+            transformers_upgrade_cmd = "pip install transformers==4.51.3"
             values_template.trainingConfig.pre_script.append(transformers_upgrade_cmd)
 
         return values_template
diff --git a/launcher_scripts/llama/run_hf_llama4_17b_16e_seq8k_gpu_lora_multimodal_finetuning.sh b/launcher_scripts/llama/run_hf_llama4_17b_16e_seq8k_gpu_lora_multimodal_finetuning.sh
@@ -0,0 +1,28 @@
+#!/bin/bash
+
+# Original Copyright (c), NVIDIA CORPORATION. Modifications © Amazon.com
+
+# Users should set up their cluster type in /recipes_collection/config.yaml
+
+SAGEMAKER_TRAINING_LAUNCHER_DIR=${SAGEMAKER_TRAINING_LAUNCHER_DIR:-"$(pwd)"}
+
+HF_MODEL_NAME_OR_PATH="${HF_MODEL_NAME_OR_PATH}" # HuggingFace pretrained model name or path
+HF_ACCESS_TOKEN="${HF_ACCESS_TOKEN}" # Optional HuggingFace access token
+
+TRAIN_DIR="${TRAIN_DIR}" # Location of training dataset
+VAL_DIR="${VAL_DIR}" # Location of validation dataset
+
+EXP_DIR="${EXP_DIR}" # Location to save experiment info including logging, checkpoints, etc.
+
+
+HYDRA_FULL_ERROR=1 python3 "${SAGEMAKER_TRAINING_LAUNCHER_DIR}/main.py" \
+    recipes=fine-tuning/llama/hf_llama4_17b_16e_seq8k_gpu_lora_multimodal_finetuning \
+    base_results_dir="${SAGEMAKER_TRAINING_LAUNCHER_DIR}/results" \
+    recipes.run.name="hf-llama-4-17b-16e-lora" \
+    recipes.exp_manager.exp_dir="$EXP_DIR" \
+    recipes.trainer.num_nodes=1 \
+    recipes.model.train_batch_size=1 \
+    recipes.model.data.train_dir="$TRAIN_DIR" \
+    recipes.model.data.val_dir="$VAL_DIR" \
+    recipes.model.hf_model_name_or_path="$HF_MODEL_NAME_OR_PATH" \
+    recipes.model.hf_access_token="$HF_ACCESS_TOKEN" \
diff --git a/recipes_collection/recipes/fine-tuning/llama/hf_llama4_17b_16e_seq8k_gpu_lora_multimodal_finetuning.yaml b/recipes_collection/recipes/fine-tuning/llama/hf_llama4_17b_16e_seq8k_gpu_lora_multimodal_finetuning.yaml
@@ -0,0 +1,151 @@
+# Original Copyright (c), NVIDIA CORPORATION. Modifications © Amazon.com
+
+# Basic run information configs
+run:
+  name: hf-llama-4-17b-16e-lora
+  results_dir: ${base_results_dir}/${.name}
+  time_limit: "6-00:00:00"
+  model_type: hf # huggingface for our recipes
+
+# Basic pytorch lightning trainer config
+trainer:
+  devices: 8
+  num_nodes: 1
+  accelerator: gpu
+  precision: bf16
+  max_steps: 50
+  log_every_n_steps: 1
+  val_check_interval: 1
+  limit_val_batches: 0 # Number of batches per validation run; set to 0 to disable validation.
+
+# Basic pytorch lightning experiment config
+# Config for checkpoint/tensorboard etc
+exp_manager:
+  exp_dir: null
+  name: experiment
+  # experiment loggers
+  create_tensorboard_logger: False
+  summary_writer_kwargs: {"save_dir" : "${recipes.exp_manager.exp_dir}/tensorboard"}
+  create_mlflow_logger: False
+  mlflow_logger_kwargs: {"tracking_uri" : "${recipes.exp_manager.exp_dir}/mlflow"}
+  create_wandb_logger: False
+  wandb_logger_kwargs: {"save_dir" : "${recipes.exp_manager.exp_dir}"} # wandb creates a wandb folder by default
+  create_checkpoint_callback: True
+  # Configs to save checkpoint with a fixed interval
+  # Note: These configs will not work with auto checkpoint mode
+  checkpoint_callback_params:
+    # Set save_top_k = 0 to disable sharded checkpointing
+    save_top_k: 0
+    every_n_train_steps: 10
+    monitor: "step"
+    mode: "max"
+    save_last: False
+  checkpoint_dir: ${recipes.exp_manager.exp_dir}/checkpoints/
+  resume_from_checkpoint: null
+  # Enable auto_checkpoint to automatically calculate the checkpoint interval and resume from checkpoint
+  auto_checkpoint:
+    enabled: False
+  export_full_model:
+    # Set every_n_train_steps = 0 to disable full checkpointing
+    every_n_train_steps: 0
+    save_last: True
+
+################# Predefined configs ##########################
+use_smp_model: False # Disable sagemaker model parallelism for PEFT
+distributed_backend: nccl
+
+
+# Model training configs
+model:
+  model_type: llama_v4
+  # Base configs
+  train_batch_size: 1
+  seed: 12345
+  grad_clip: 1.0
+  log_reduced_training_loss: True
+
+  # Memory saving / distributed training configs
+  context_parallel_degree: 1
+  moe: False
+  activation_checkpointing: True
+  activation_loading_horizon: 2
+  delayed_param: True
+  offload_activations: False
+  multi_modal: True
+
+  # FSDP Configs
+  sharding_strategy: hybrid_shard
+  forward_prefetch: True
+  shard_degree: 8
+  backward_fetch_policy: backward_pre
+  auto_wrap_policy: transformer_auto_wrap_policy
+  limit_all_gathers: True
+  use_orig_param: False
+
+  # FP8 config
+  fp8: False # PEFT does not support fp8
+
+  # Model architecture
+  max_context_width: 1024
+  max_position_embeddings: ${.max_context_width}
+  num_hidden_layers: 48 # text_config.num_hidden_layers
+  hidden_size: 5120 # text_config.hidden_size
+  num_attention_heads: 40 # text_config.num_attention_heads
+  intermediate_size: 8192 # text_config.intermediate_size
+  initializer_range: 0.02 # text_config.initializer_range
+  vocab_size: 202048 # text_config.vocab_size
+  num_key_value_heads: 8 # text_config.num_key_value_heads
+  rope_theta: 500000.0  # text_config.rope_theta
+  use_flash_attention: True
+
+  # rope scaling
+  rope_scaling:
+    rope_type: llama3
+    factor: 8.0
+    high_freq_factor: 4.0
+    low_freq_factor: 1.0
+    original_max_position_embeddings: 8192
+
+  # Finetuning config
+  do_finetune: True
+  # The path to resume from, needs to be HF compatible
+  hf_model_name_or_path: null
+  hf_access_token: null
+  # PEFT config
+  peft:
+    peft_type: lora
+    target_modules: ["q_proj", "k_proj", "v_proj", "o_proj"]
+    rank: 16
+    alpha: 16
+    dropout: 0.1
+
+  precision: ${recipes.trainer.precision}
+  ################# End of Predefined configs ##########################
+
+  # Learning rate and optimizer configs
+  lr_decay_iters: ${recipes.trainer.max_steps}
+  # Optimizer
+  optim:
+    name: adamw
+    lr: 0.0001
+    weight_decay: 0.01
+    betas:
+    - 0.9
+    - 0.95
+    sched:
+      name: CosineAnnealing
+      warmup_steps: 0
+      constant_steps: 0
+      min_lr: 0.000001
+
+  # Data configs
+  data:
+    train_dir: null
+    val_dir: null
+    dataset_type: hf
+    use_synthetic_data: False
+
+  # Profiling configs
+  # Viztracer profiling options
+  viztracer:
+    enabled: false
diff --git a/recipes_collection/recipes/fine-tuning/llama/hf_llama4_17b_16e_seq8k_gpu_lora_text_to_text.yaml b/recipes_collection/recipes/fine-tuning/llama/hf_llama4_17b_16e_seq8k_gpu_lora_text_to_text.yaml
@@ -86,7 +86,7 @@ model:
   fp8: False # PEFT does not support fp8
 
   # Model architecture
-  max_context_width: 4096
+  max_context_width: 1024
   max_position_embeddings: ${.max_context_width}
   num_hidden_layers: 48 # text_config.num_hidden_layers
   hidden_size: 5120 # text_config.hidden_size
@@ -98,7 +98,7 @@ model:
   rope_theta: 500000.0  # text_config.rope_theta
   use_flash_attention: True
 
-  # rope scaling for llama3
+  # rope scaling
   rope_scaling:
     rope_type: llama3
     factor: 8.0
@@ -114,6 +114,7 @@ model:
   # PEFT config
   peft:
     peft_type: lora
+    target_modules: ["language_model.model.layers.*self_attn.(q_proj|k_proj|v_proj|o_proj)"]
     rank: 16
     alpha: 16
     dropout: 0.1