Release 1.5.1

zachgk · zachgk · commit 0413a713597b · 2025-08-13T23:28:05.000-07:00
diff --git a/README.md b/README.md
@@ -19,7 +19,8 @@ Amazon SageMaker HyperPod recipes include built-in support for:
 - Supported Models: DeepSeek R1, DeepSeek R1 Distill Llama, DeepSeek R1 Distill Qwen, Llama, Mistral, Mixtral models, Nova Micro, Nova Lite, Nova Pro.
 - Model Evaluation: [Tensorboard](https://lightning.ai/docs/pytorch/stable/api/lightning.pytorch.loggers.tensorboard.html#module-lightning.pytorch.loggers.tensorboard), [MLflow](https://lightning.ai/docs/pytorch/stable/api/lightning.pytorch.loggers.mlflow.html), [Wandb](https://lightning.ai/docs/pytorch/stable/extensions/generated/lightning.pytorch.loggers.WandbLogger.html) - feel free to add any key word arguments to the Logger classes by using their associated kwargs config
 
-###### ***Note: For DeepSeek R1 671b customers must ensure that their model repository contains weights of type bf16. DeepSeek's [HuggingFace repository](https://huggingface.co/deepseek-ai/DeepSeek-R1) contains the model in dtype fp8 by default. In order to convert a model repository from fp8 to bf16 we recommend using [this script](https://github.com/aws/sagemaker-hyperpod-training-adapter-for-nemo/blob/main/src/hyperpod_nemo_adapter/scripts/fp8_cast_bf16.py) and pointing your recipe to the output directory.
+###### ***Note: DeepSeek R1 671b customers must ensure that their model repository contains weights of type bf16. DeepSeek's [HuggingFace repository](https://huggingface.co/deepseek-ai/DeepSeek-R1) contains the model in dtype fp8 by default. In order to convert a model repository from fp8 to bf16 we recommend using [this script](https://github.com/aws/sagemaker-hyperpod-training-adapter-for-nemo/blob/main/src/hyperpod_nemo_adapter/scripts/fp8_cast_bf16.py) and pointing your recipe to the output directory.
+###### ***Note: We recommend that GPT OSS customers use the gpt-oss-patch image `658645717510.dkr.ecr.us-west-2.amazonaws.com/smdistributed-modelparallel:sm-pytorch_gpt_oss_patch_pt-2.7_cuda12.8` to support vllm-flash-attn3 and run the recipe as written. Per-device batch sizes > 1 are not currently supported.
 
 ## Model Support
 
@@ -116,6 +117,8 @@ Nova Pro | Model Distillation for Post-Training | -   | -            | 1    | ml
 | DeepSeek R1 Distill Qwen 2 | LoRA | 32b    | 8192          | 2     | ml.p5.48xlarge    | GPU H100    | [link](recipes_collection/recipes/fine-tuning/deepseek/hf_deepseek_r1_distilled_qwen_32b_seq8k_gpu_lora.yaml) | [link](launcher_scripts/deepseek/run_hf_deepseek_r1_qwen_32b_seq8k_gpu_lora.sh) |
 | DeepSeek R1 Distill Qwen 2 | SFT  | 32b    | 16384         | 6     | ml.p5.48xlarge    | GPU H100    | [link](recipes_collection/recipes/fine-tuning/deepseek/hf_deepseek_r1_distilled_qwen_32b_seq16k_gpu_fine_tuning.yaml) | [link](launcher_scripts/deepseek/run_hf_deepseek_r1_qwen_32b_seq16k_gpu_fine_tuning.sh) |
 | DeepSeek R1 Distill Qwen 2 | LoRA | 32b    | 16384         | 2     | ml.p5.48xlarge    | GPU H100    | [link](recipes_collection/recipes/fine-tuning/deepseek/hf_deepseek_r1_distilled_qwen_32b_seq16k_gpu_lora.yaml) | [link](launcher_scripts/deepseek/run_hf_deepseek_r1_qwen_32b_seq16k_gpu_lora.sh) |
+| GPT OSS | LoRA   | 20b | 16384            | 1     | ml.p5.48xlarge    | GPU H100    | [link](recipes_collection/recipes/fine-tuning/gpt_oss/hf_gpt_oss_20b_seq16k_gpu_lora.yaml) | [link](launcher_scripts/gpt_oss/run_hf_gpt_oss_20b_seq16k_gpu_lora.sh) |
+| GPT OSS | LoRA   | 120b | 4096           | 1     | ml.p5.48xlarge    | GPU H100    | [link](recipes_collection/recipes/fine-tuning/gpt_oss/hf_gpt_oss_120b_seq4k_gpu_lora.yaml) | [link](launcher_scripts/gpt_oss/run_hf_gpt_oss_120b_seq4k_gpu_lora.sh) |
 | Llama 3.1 | QLoRA  | 405b | 131072          | 2     | ml.p5.48xlarge    | GPU H100    | [link](recipes_collection/recipes/fine-tuning/llama/hf_llama3_405b_seq128k_gpu_qlora.yaml) | [link](launcher_scripts/llama/run_hf_llama3_405b_seq128k_gpu_qlora.sh) |
 | Llama 3.1 | QLoRA  | 405b | 32768           | 2     | ml.p5.48xlarge    | GPU H100    | [link](recipes_collection/recipes/fine-tuning/llama/hf_llama3_405b_seq32k_gpu_qlora.yaml) | [link](launcher_scripts/llama/run_hf_llama3_405b_seq32k_gpu_qlora.sh) |
 | Llama 3.1 | LoRA   | 405b | 16384           | 6     | ml.p5.48xlarge    | GPU H100    | [link](recipes_collection/recipes/fine-tuning/llama/hf_llama3_405b_seq16k_gpu_lora.yaml) | [link](launcher_scripts/llama/run_hf_llama3_405b_seq16k_gpu_lora.sh) |
diff --git a/launcher/nemo/stages.py b/launcher/nemo/stages.py
@@ -303,6 +303,9 @@ def _make_launch_docker_container_text(self):
             if OmegaConf.select(self.cfg, "recipes.model.model_type", default=None) == "llama_v4":
                 transformers_upgrade_cmd = "pip install transformers==4.51.3"
                 post_launch_commands.append(transformers_upgrade_cmd)
+            if OmegaConf.select(self.cfg, "recipes.model.model_type", default=None) == "gpt_oss":
+                transformers_upgrade_cmd = "pip install transformers==4.55.0"
+                post_launch_commands.append(transformers_upgrade_cmd)
 
         launch_docker_container_text.append(f'  "{image}" sleep infinity')
         launch_docker_container_text.append("")
@@ -429,6 +432,10 @@ def _make_train_script_text(self, stage_cfg_path=None, port=41000) -> str:
                 transformers_upgrade_cmd = "pip install transformers==4.51.3"
                 script_text.append("")
                 script_text.append(transformers_upgrade_cmd)
+            if OmegaConf.select(self.cfg, "recipes.model.model_type", default=None) == "gpt_oss":
+                transformers_upgrade_cmd = "pip install transformers==4.55.0"
+                script_text.append("")
+                script_text.append(transformers_upgrade_cmd)
 
         script_text.append("")
         script_text.append(self._make_custom_call_string(stage_cfg_path))
@@ -768,6 +775,9 @@ def update_stage_specific_k8s_values(self, values_template):
         if OmegaConf.select(self.cfg, "recipes.model.model_type", default=None) == "llama_v4":
             transformers_upgrade_cmd = "pip install transformers==4.51.3"
             values_template.trainingConfig.pre_script.append(transformers_upgrade_cmd)
+        if OmegaConf.select(self.cfg, "recipes.model.model_type", default=None) == "gpt_oss":
+            transformers_upgrade_cmd = "pip install transformers==4.55.0"
+            values_template.trainingConfig.pre_script.append(transformers_upgrade_cmd)
 
         return values_template
 
diff --git a/launcher_scripts/gpt_oss/run_hf_gpt_oss_120b_seq4k_gpu_lora.sh b/launcher_scripts/gpt_oss/run_hf_gpt_oss_120b_seq4k_gpu_lora.sh
@@ -0,0 +1,28 @@
+#!/bin/bash
+
+# Original Copyright (c), NVIDIA CORPORATION. Modifications © Amazon.com
+
+# Users should set up their cluster type in /recipes_collection/config.yaml
+
+SAGEMAKER_TRAINING_LAUNCHER_DIR=${SAGEMAKER_TRAINING_LAUNCHER_DIR:-"$(pwd)"}
+
+HF_MODEL_NAME_OR_PATH="${HF_MODEL_NAME_OR_PATH}" # HuggingFace pretrained model name or path
+HF_ACCESS_TOKEN="${HF_ACCESS_TOKEN}" # Optional HuggingFace access token
+
+TRAIN_DIR="${TRAIN_DIR}" # Location of training dataset
+VAL_DIR="${VAL_DIR}" # Location of validation dataset
+
+EXP_DIR="${EXP_DIR}" # Location to save experiment info including logging, checkpoints, etc.
+
+
+HYDRA_FULL_ERROR=1 python3 "${SAGEMAKER_TRAINING_LAUNCHER_DIR}/main.py" \
+    recipes=fine-tuning/gpt_oss/hf_gpt_oss_120b_seq4k_gpu_lora \
+    container="658645717510.dkr.ecr.us-west-2.amazonaws.com/smdistributed-modelparallel:sm-pytorch_gpt_oss_patch_pt-2.7_cuda12.8" \
+    base_results_dir="${SAGEMAKER_TRAINING_LAUNCHER_DIR}/results" \
+    recipes.run.name="hf-gpt-oss-120b-lora" \
+    recipes.exp_manager.exp_dir="$EXP_DIR" \
+    recipes.trainer.num_nodes=1 \
+    recipes.model.data.train_dir="$TRAIN_DIR" \
+    recipes.model.data.val_dir="$VAL_DIR" \
+    recipes.model.hf_model_name_or_path="$HF_MODEL_NAME_OR_PATH" \
+    recipes.model.hf_access_token="$HF_ACCESS_TOKEN" \
diff --git a/launcher_scripts/gpt_oss/run_hf_gpt_oss_20b_seq16k_gpu_lora.sh b/launcher_scripts/gpt_oss/run_hf_gpt_oss_20b_seq16k_gpu_lora.sh
@@ -0,0 +1,28 @@
+#!/bin/bash
+
+# Original Copyright (c), NVIDIA CORPORATION. Modifications © Amazon.com
+
+# Users should set up their cluster type in /recipes_collection/config.yaml
+
+SAGEMAKER_TRAINING_LAUNCHER_DIR=${SAGEMAKER_TRAINING_LAUNCHER_DIR:-"$(pwd)"}
+
+HF_MODEL_NAME_OR_PATH="${HF_MODEL_NAME_OR_PATH}" # HuggingFace pretrained model name or path
+HF_ACCESS_TOKEN="${HF_ACCESS_TOKEN}" # Optional HuggingFace access token
+
+TRAIN_DIR="${TRAIN_DIR}" # Location of training dataset
+VAL_DIR="${VAL_DIR}" # Location of validation dataset
+
+EXP_DIR="${EXP_DIR}" # Location to save experiment info including logging, checkpoints, etc.
+
+
+HYDRA_FULL_ERROR=1 python3 "${SAGEMAKER_TRAINING_LAUNCHER_DIR}/main.py" \
+    recipes=fine-tuning/gpt_oss/hf_gpt_oss_20b_seq16k_gpu_lora \
+    container="658645717510.dkr.ecr.us-west-2.amazonaws.com/smdistributed-modelparallel:sm-pytorch_gpt_oss_patch_pt-2.7_cuda12.8" \
+    base_results_dir="${SAGEMAKER_TRAINING_LAUNCHER_DIR}/results" \
+    recipes.run.name="hf-gpt-oss-20b-lora" \
+    recipes.exp_manager.exp_dir="$EXP_DIR" \
+    recipes.trainer.num_nodes=1 \
+    recipes.model.data.train_dir="$TRAIN_DIR" \
+    recipes.model.data.val_dir="$VAL_DIR" \
+    recipes.model.hf_model_name_or_path="$HF_MODEL_NAME_OR_PATH" \
+    recipes.model.hf_access_token="$HF_ACCESS_TOKEN" \
diff --git a/recipes_collection/recipes/fine-tuning/gpt_oss/hf_gpt_oss_120b_seq4k_gpu_lora.yaml b/recipes_collection/recipes/fine-tuning/gpt_oss/hf_gpt_oss_120b_seq4k_gpu_lora.yaml
@@ -0,0 +1,156 @@
+# Original Copyright (c), NVIDIA CORPORATION. Modifications © Amazon.com
+
+# Basic run information configs
+run:
+  name: gpt-oss-120b
+  results_dir: ${base_results_dir}/${.name}
+  time_limit: "6-00:00:00"
+  model_type: hf # huggingface for our recipes
+
+# Basic pytorch lightning trainer config
+trainer:
+  devices: 8
+  num_nodes: 1
+  accelerator: gpu
+  precision: bf16
+  max_steps: 50
+  log_every_n_steps: 1
+  val_check_interval: 1
+  limit_val_batches: 0 # Number of batches per validation run; set to 0 to disable validation.
+
+# Basic pytorch lightning experiment config
+# Config for checkpoint/tensorboard etc
+exp_manager:
+  exp_dir: null
+  name: experiment
+  # experiment loggers
+  create_tensorboard_logger: False
+  summary_writer_kwargs: {"save_dir" : "${recipes.exp_manager.exp_dir}/tensorboard"}
+  create_mlflow_logger: False
+  mlflow_logger_kwargs: {"tracking_uri" : "${recipes.exp_manager.exp_dir}/mlflow"}
+  create_wandb_logger: False
+  wandb_logger_kwargs: {"save_dir" : "${recipes.exp_manager.exp_dir}"} # wandb creates a wandb folder by default
+  create_checkpoint_callback: True
+  # Configs to save checkpoint with a fixed interval
+  # Note: These configs will not work with auto checkpoint mode
+  checkpoint_callback_params:
+    # Set save_top_k = 0 to disable sharded checkpointing
+    save_top_k: 0
+    every_n_train_steps: 10
+    monitor: "step"
+    mode: "max"
+    save_last: False
+  checkpoint_dir: ${recipes.exp_manager.exp_dir}/checkpoints/
+  resume_from_checkpoint: null
+  # Enable auto_checkpoint to automatically calculate the checkpoint interval and resume from checkpoint
+  auto_checkpoint:
+    enabled: False
+  export_full_model:
+    # Set every_n_train_steps = 0 to disable full checkpointing
+    every_n_train_steps: 0
+    save_last: True
+
+################# Predefined configs ##########################
+use_smp_model: False # Enable sagemaker model parallelism
+distributed_backend: nccl
+
+# Model training configs
+model:
+  model_type: gpt_oss
+  # Base configs
+  train_batch_size: 1 # Batch sizes > 1 are not currently supported
+  val_batch_size: 1
+  seed: 12345
+  grad_clip: 1.0
+  log_reduced_training_loss: True
+
+  # Memory saving / distributed training configs
+  tensor_model_parallel_degree: 1
+  expert_model_parallel_degree: 1
+  context_parallel_degree: 1
+  moe: False
+  activation_checkpointing: True
+  activation_loading_horizon: 2
+  delayed_param: True
+  offload_activations: False
+
+  # FSDP Configs
+  sharding_strategy: hybrid_shard
+  forward_prefetch: True
+  shard_degree: 8
+  backward_fetch_policy: backward_pre
+  auto_wrap_policy: transformer_auto_wrap_policy
+  limit_all_gathers: true
+  use_orig_param: False
+
+  # FP8 config
+  fp8: False
+  fp8_amax_history_len: 1024
+  fp8_amax_compute_algo: max
+
+  # Model architecture
+  max_context_width: 4096
+  max_position_embeddings: ${.max_context_width} # 131072
+  num_hidden_layers: 36
+  hidden_size: 2880
+  num_attention_heads: 64
+  intermediate_size: 2880
+  initializer_range: 0.02
+  layernorm_epsilon: 1e-5
+  vocab_size: 201088
+  num_key_value_heads: 8
+  rms_norm_eps: 1e-05
+  use_flash_attention: False # Use the gpt-oss-patch container for kernels-community/vllm-flash-attn3
+  sliding_window: 128
+  use_sliding_window: True
+  num_experts_per_tok: 4
+  num_local_experts: 128
+  moe_load_balancing: 'sinkhorn'
+  global_token_shuffle: True
+  moe_all_to_all_dispatcher: False
+  rope_theta: 150000.0
+  tie_word_embeddings: False
+
+  # Finetuning config
+  do_finetune: True
+  # The path to resume from, needs to be HF compatible
+  hf_model_name_or_path: null
+  hf_access_token: null
+  # PEFT config
+  peft:
+    peft_type: lora
+    rank: 16
+    alpha: 32
+    dropout: 0.1
+    target_modules: ["q_proj", "k_proj", "v_proj", "o_proj"]
+
+  precision: ${recipes.trainer.precision}
+  ################# End of Predefined configs ##########################
+
+  # Learning rate and optimizer configs
+  lr_decay_iters: ${recipes.trainer.max_steps}
+  # Optimizer
+  optim:
+    name: adamw
+    lr: 2e-4
+    weight_decay: 0.01
+    betas:
+    - 0.9
+    - 0.95
+    sched:
+      name: CosineAnnealing
+      warmup_steps: 0
+      constant_steps: 0
+      min_lr: 2e-6
+
+  # Data configs
+  data:
+    train_dir: null
+    val_dir: null
+    dataset_type: hf
+    use_synthetic_data: False
+
+  # Profiling configs
+  # Viztracer profiling options
+  viztracer:
+    enabled: false
diff --git a/recipes_collection/recipes/fine-tuning/gpt_oss/hf_gpt_oss_20b_seq16k_gpu_lora.yaml b/recipes_collection/recipes/fine-tuning/gpt_oss/hf_gpt_oss_20b_seq16k_gpu_lora.yaml