Commit ce96b51

SageMaker HyperPod Recipes Release 1.3.3
1 parent fe7bdb5 commit ce96b51

7 files changed: +365 -6 lines changed

README.md

Lines changed: 3 additions & 2 deletions
@@ -69,8 +69,9 @@ All model sources are from Hugging Face.

 | Model | Method | Size | Sequence length | Nodes | Instance | Accelerator | Recipe | Script |
 | --------- | ------ | ---- | ----------------| ----- | -------------- | ----------- | ------ | ------ |
-| LLama 4 Scout | LoRA (multi-modal) | 17B 16E (109B) | 1024 | 1 | ml.p5.48xlarge | GPU H100 | [link](recipes_collection/recipes/fine-tuning/llama/hf_llama4_17b_16e_seq8k_gpu_lora_multimodal_finetuning.yaml) | [link](launcher_scripts/llama/run_hf_llama4_17b_16e_seq8k_gpu_lora_multimodal_finetuning.sh) |
-| LLama 4 Scout | LoRA (text-only) | 17B 16E (109B) | 1024 | 1 | ml.p5.48xlarge | GPU H100 | [link](recipes_collection/recipes/fine-tuning/llama/hf_llama4_17b_16e_seq8k_gpu_lora_text_to_text.yaml) | [link](launcher_scripts/llama/run_hf_llama4_17b_16e_seq8k_gpu_lora_text_to_text.sh) |
+| LLama 4 Scout | LoRA (multi-modal) | 17B 16E (109B) | 8192 | 2 | ml.p5.48xlarge | GPU H100 | [link](recipes_collection/recipes/fine-tuning/llama/hf_llama4_17b_16e_seq8k_gpu_lora_multimodal_finetuning.yaml) | [link](launcher_scripts/llama/run_hf_llama4_17b_16e_seq8k_gpu_lora_multimodal_finetuning.sh) |
+| LLama 4 Scout | LoRA (multi-modal) | 17B 16E (109B) | 4096 | 1 | ml.p5.48xlarge | GPU H100 | [link](recipes_collection/recipes/fine-tuning/llama/hf_llama4_17b_16e_seq4k_gpu_lora_multimodal_finetuning.yaml) | [link](launcher_scripts/llama/run_hf_llama4_17b_16e_seq4k_gpu_lora_multimodal_finetuning.sh) |
+| LLama 4 Scout | LoRA (text-only) | 17B 16E (109B) | 4096 | 1 | ml.p5.48xlarge | GPU H100 | [link](recipes_collection/recipes/fine-tuning/llama/hf_llama4_17b_16e_seq4k_gpu_lora_text_to_text.yaml) | [link](launcher_scripts/llama/run_hf_llama4_17b_16e_seq4k_gpu_lora_text_to_text.sh) |
 | DeepSeek R1 | QLoRA | 671b | 8192 | 2 | ml.p5.48xlarge | GPU H100 | [link](recipes_collection/recipes/fine-tuning/deepseek/hf_deepseek_r1_671b_seq8k_gpu_qlora.yaml) | [link](launcher_scripts/deepseek/run_hf_deepseek_r1_671b_seq8k_gpu_qlora.sh) |
 | DeepSeek R1 | LoRA | 671b | 8192 | 5 | ml.p5.48xlarge | GPU H100 | [link](recipes_collection/recipes/fine-tuning/deepseek/hf_deepseek_r1_671b_seq8k_gpu_lora.yaml) | [link](launcher_scripts/deepseek/run_hf_deepseek_r1_671b_seq8k_gpu_lora.sh) |
 | DeepSeek R1 Distill Llama 3 | SFT | 8b | 8192 | 1 | ml.p5.48xlarge | GPU H100 | [link](recipes_collection/recipes/fine-tuning/deepseek/hf_deepseek_r1_distilled_llama_8b_seq8k_gpu_fine_tuning.yaml) | [link](launcher_scripts/deepseek/run_hf_deepseek_r1_llama_8b_seq8k_gpu_fine_tuning.sh) |
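Note: a minimal sketch of how one of the new seq4k launchers might be invoked, assuming the environment variables the launcher scripts read are exported first. The model id and paths below are hypothetical placeholders, not values from this commit.

```bash
# Hypothetical values for illustration only; substitute real locations on your cluster.
export HF_MODEL_NAME_OR_PATH="meta-llama/Llama-4-Scout-17B-16E"  # assumed Hub id or local path
export HF_ACCESS_TOKEN="hf_xxx"                 # optional HuggingFace access token
export TRAIN_DIR="/fsx/datasets/llama4/train"   # training dataset location
export VAL_DIR="/fsx/datasets/llama4/val"       # validation dataset location
export EXP_DIR="/fsx/experiments/llama4-lora"   # logs, checkpoints, experiment info

bash launcher_scripts/llama/run_hf_llama4_17b_16e_seq4k_gpu_lora_multimodal_finetuning.sh
```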
launcher_scripts/llama/run_hf_llama4_17b_16e_seq4k_gpu_lora_multimodal_finetuning.sh (new file)

Lines changed: 28 additions & 0 deletions

@@ -0,0 +1,28 @@
#!/bin/bash

# Original Copyright (c), NVIDIA CORPORATION. Modifications © Amazon.com

#Users should setup their cluster type in /recipes_collection/config.yaml

SAGEMAKER_TRAINING_LAUNCHER_DIR=${SAGEMAKER_TRAINING_LAUNCHER_DIR:-"$(pwd)"}

HF_MODEL_NAME_OR_PATH="${HF_MODEL_NAME_OR_PATH}" # HuggingFace pretrained model name or path
HF_ACCESS_TOKEN="${HF_ACCESS_TOKEN}" # Optional HuggingFace access token

TRAIN_DIR="${TRAIN_DIR}" # Location of training dataset
VAL_DIR="${VAL_DIR}" # Location of validation dataset

EXP_DIR="${EXP_DIR}" # Location to save experiment info including logging, checkpoints, etc.


HYDRA_FULL_ERROR=1 python3 "${SAGEMAKER_TRAINING_LAUNCHER_DIR}/main.py" \
    recipes=fine-tuning/llama/hf_llama4_17b_16e_seq4k_gpu_lora_multimodal_finetuning \
    base_results_dir="${SAGEMAKER_TRAINING_LAUNCHER_DIR}/results" \
    recipes.run.name="hf-llama-4-17b-16e-lora" \
    recipes.exp_manager.exp_dir="$EXP_DIR" \
    recipes.trainer.num_nodes=1 \
    recipes.model.train_batch_size=1 \
    recipes.model.data.train_dir="$TRAIN_DIR" \
    recipes.model.data.val_dir="$VAL_DIR" \
    recipes.model.hf_model_name_or_path="$HF_MODEL_NAME_OR_PATH" \
    recipes.model.hf_access_token="$HF_ACCESS_TOKEN" \
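The launcher above is a thin wrapper that forwards Hydra-style overrides to main.py, so other recipe fields can in principle be overridden the same way without editing the YAML. A hedged sketch below, assuming the same environment variables as the launcher and using keys that appear in the seq4k recipe later in this diff; the specific values (200 steps, rank 32) are arbitrary examples, not recommendations from this commit.

```bash
# Illustrative only: same invocation pattern as the launcher, with extra recipe overrides appended.
HYDRA_FULL_ERROR=1 python3 "${SAGEMAKER_TRAINING_LAUNCHER_DIR}/main.py" \
    recipes=fine-tuning/llama/hf_llama4_17b_16e_seq4k_gpu_lora_multimodal_finetuning \
    base_results_dir="${SAGEMAKER_TRAINING_LAUNCHER_DIR}/results" \
    recipes.exp_manager.exp_dir="$EXP_DIR" \
    recipes.model.data.train_dir="$TRAIN_DIR" \
    recipes.model.data.val_dir="$VAL_DIR" \
    recipes.model.hf_model_name_or_path="$HF_MODEL_NAME_OR_PATH" \
    recipes.trainer.max_steps=200 \
    recipes.model.peft.rank=32 \
    recipes.model.peft.alpha=32
```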
launcher_scripts/llama/run_hf_llama4_17b_16e_seq4k_gpu_lora_text_to_text.sh (new file)

Lines changed: 28 additions & 0 deletions

@@ -0,0 +1,28 @@
#!/bin/bash

# Original Copyright (c), NVIDIA CORPORATION. Modifications © Amazon.com

#Users should setup their cluster type in /recipes_collection/config.yaml

SAGEMAKER_TRAINING_LAUNCHER_DIR=${SAGEMAKER_TRAINING_LAUNCHER_DIR:-"$(pwd)"}

HF_MODEL_NAME_OR_PATH="${HF_MODEL_NAME_OR_PATH}" # HuggingFace pretrained model name or path
HF_ACCESS_TOKEN="${HF_ACCESS_TOKEN}" # Optional HuggingFace access token

TRAIN_DIR="${TRAIN_DIR}" # Location of training dataset
VAL_DIR="${VAL_DIR}" # Location of validation dataset

EXP_DIR="${EXP_DIR}" # Location to save experiment info including logging, checkpoints, etc.


HYDRA_FULL_ERROR=1 python3 "${SAGEMAKER_TRAINING_LAUNCHER_DIR}/main.py" \
    recipes=fine-tuning/llama/hf_llama4_17b_16e_seq4k_gpu_lora_text_to_text \
    base_results_dir="${SAGEMAKER_TRAINING_LAUNCHER_DIR}/results" \
    recipes.run.name="hf-llama-4-17b-16e-lora" \
    recipes.exp_manager.exp_dir="$EXP_DIR" \
    recipes.trainer.num_nodes=1 \
    recipes.model.train_batch_size=1 \
    recipes.model.data.train_dir="$TRAIN_DIR" \
    recipes.model.data.val_dir="$VAL_DIR" \
    recipes.model.hf_model_name_or_path="$HF_MODEL_NAME_OR_PATH" \
    recipes.model.hf_access_token="$HF_ACCESS_TOKEN" \
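Both new launchers reuse the same recipes.run.name ("hf-llama-4-17b-16e-lora"), and the recipes derive results_dir from that name, so multimodal and text-only runs would share a results folder under base_results_dir. A hedged sketch of keeping them apart by overriding the run name; the alternative name is made up for illustration.

```bash
# Hypothetical: same command as the text-only launcher, but with a distinct run name
# so ${base_results_dir}/${.name} does not collide with the multimodal run.
HYDRA_FULL_ERROR=1 python3 "${SAGEMAKER_TRAINING_LAUNCHER_DIR}/main.py" \
    recipes=fine-tuning/llama/hf_llama4_17b_16e_seq4k_gpu_lora_text_to_text \
    base_results_dir="${SAGEMAKER_TRAINING_LAUNCHER_DIR}/results" \
    recipes.run.name="hf-llama-4-17b-16e-lora-text-only" \
    recipes.exp_manager.exp_dir="$EXP_DIR" \
    recipes.model.data.train_dir="$TRAIN_DIR" \
    recipes.model.data.val_dir="$VAL_DIR" \
    recipes.model.hf_model_name_or_path="$HF_MODEL_NAME_OR_PATH"
```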

launcher_scripts/llama/run_hf_llama4_17b_16e_seq8k_gpu_lora_multimodal_finetuning.sh

Lines changed: 1 addition & 1 deletion

@@ -20,7 +20,7 @@ HYDRA_FULL_ERROR=1 python3 "${SAGEMAKER_TRAINING_LAUNCHER_DIR}/main.py" \
     base_results_dir="${SAGEMAKER_TRAINING_LAUNCHER_DIR}/results" \
     recipes.run.name="hf-llama-4-17b-16e-lora" \
     recipes.exp_manager.exp_dir="$EXP_DIR" \
-    recipes.trainer.num_nodes=1 \
+    recipes.trainer.num_nodes=2 \
     recipes.model.train_batch_size=1 \
     recipes.model.data.train_dir="$TRAIN_DIR" \
     recipes.model.data.val_dir="$VAL_DIR" \
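This bumps the seq8k multimodal launcher from 1 node to 2, matching the updated README row (2 x ml.p5.48xlarge). Since the wrapper script does not forward extra command-line arguments, changing the node count means editing the override in the script or calling main.py directly. A hedged example of the former; the choice of 4 nodes is arbitrary and only illustrates the edit.

```bash
# Illustrative only: raise the node count baked into the seq8k multimodal launcher from 2 to 4.
sed -i 's/recipes.trainer.num_nodes=2/recipes.trainer.num_nodes=4/' \
    launcher_scripts/llama/run_hf_llama4_17b_16e_seq8k_gpu_lora_multimodal_finetuning.sh
```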
recipes_collection/recipes/fine-tuning/llama/hf_llama4_17b_16e_seq4k_gpu_lora_multimodal_finetuning.yaml (new file)

Lines changed: 151 additions & 0 deletions

@@ -0,0 +1,151 @@
# Original Copyright (c), NVIDIA CORPORATION. Modifications © Amazon.com

# Basic run information configs
run:
  name: hf-llama-4-17b-16e-lora
  results_dir: ${base_results_dir}/${.name}
  time_limit: "6-00:00:00"
  model_type: hf # huggingface for our recipes

# Basic pytorch lightning trainer config
trainer:
  devices: 8
  num_nodes: 1
  accelerator: gpu
  precision: bf16
  max_steps: 50
  log_every_n_steps: 1
  val_check_interval: 1
  limit_val_batches: 0 # Number of batches per each validation run, set to 0 to disable validation.

# Basic pytorch lightning experiment config
# Config for checkpoint/tensorboard etc
exp_manager:
  exp_dir: null
  name: experiment
  # experiment loggers
  create_tensorboard_logger: False
  summary_writer_kwargs: {"save_dir" : "${recipes.exp_manager.exp_dir}/tensorboard"}
  create_mlflow_logger: False
  mlflow_logger_kwargs: {"tracking_uri" : "${recipes.exp_manager.exp_dir}/mlflow"}
  create_wandb_logger: False
  wandb_logger_kwargs: {"save_dir" : "${recipes.exp_manager.exp_dir}"} # wandb creates a wandb folder by default
  create_checkpoint_callback: True
  # Configs to save checkpoint with a fixed interval
  # Note: These config will not work with auto checkpoint mode
  checkpoint_callback_params:
    # Set save_top_k = 0 to disable sharded checkpointing
    save_top_k: 0
    every_n_train_steps: 10
    monitor: "step"
    mode: "max"
    save_last: False
  checkpoint_dir: ${recipes.exp_manager.exp_dir}/checkpoints/
  resume_from_checkpoint: null
  # Enable auto_checkpoint to automatically calculate the checkpoint interval and resume from checkpoint
  auto_checkpoint:
    enabled: False
  export_full_model:
    # Set every_n_train_steps = 0 to disable full checkpointing
    every_n_train_steps: 0
    save_last: True

################# Predefined configs ##########################
use_smp_model: False # Disable sagemaker model parallelism for PEFT
distributed_backend: nccl


# Model training configs
model:
  model_type: llama_v4
  # Base configs
  train_batch_size: 1
  seed: 12345
  grad_clip: 1.0
  log_reduced_training_loss: True

  # Memory saving / distributed training configs
  context_parallel_degree: 1
  moe: False
  activation_checkpointing: True
  activation_loading_horizon: 2
  delayed_param: True
  offload_activations: False
  multi_modal: True

  # FSDP Configs
  sharding_strategy: hybrid_shard
  forward_prefetch: True
  shard_degree: 8
  backward_fetch_policy: backward_pre
  auto_wrap_policy: transformer_auto_wrap_policy
  limit_all_gathers: True
  use_orig_param: False

  # FP8 config
  fp8: False # PEFT does not support fp8

  # Model architecture
  max_context_width: 4096
  max_position_embeddings: ${.max_context_width}
  num_hidden_layers: 48 # text_config.num_hidden_layers
  hidden_size: 5120 # text_config.hidden_size
  num_attention_heads: 40 # text_config.num_attention_heads
  intermediate_size: 8192 # text_config.intermediate_size
  initializer_range: 0.02 # text_config.initializer_range
  vocab_size: 202048 # text_config.vocab_size
  num_key_value_heads: 8 # text_config.num_key_value_heads
  rope_theta: 500000.0 # text_config.rope_theta
  use_flash_attention: True

  # rope scaling
  rope_scaling:
    rope_type: llama3
    factor: 8.0
    high_freq_factor: 4.0
    low_freq_factor: 1.0
    original_max_position_embeddings: 8192

  # Finetuning config
  do_finetune: True
  # The path to resume from, needs to be HF compatible
  hf_model_name_or_path: null
  hf_access_token: null
  # PEFT config
  peft:
    peft_type: lora
    target_modules: ["q_proj", "k_proj", "v_proj", "o_proj"]
    rank: 16
    alpha: 16
    dropout: 0.1

  precision: ${recipes.trainer.precision}
  ################# End of Predefined configs ##########################

  # Learning rate and optimizer configs
  lr_decay_iters: ${recipes.trainer.max_steps}
  # Optimizer
  optim:
    name: adamw
    lr: 0.0001
    weight_decay: 0.01
    betas:
      - 0.9
      - 0.95
    sched:
      name: CosineAnnealing
      warmup_steps: 0
      constant_steps: 0
      min_lr: 0.000001

  # Data configs
  data:
    train_dir: null
    val_dir: null
    dataset_type: hf
    use_synthetic_data: False

  # Profiling configs
  # Viztracer profiling options
  viztracer:
    enabled: false
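A reading note on this recipe's checkpoint settings: save_top_k: 0 turns off sharded checkpointing and export_full_model.every_n_train_steps: 0 turns off periodic full exports, so by default only the final full-model export (save_last: True) is written. A hedged sketch of overrides that would enable both during training, assuming the same environment variables as the launchers and the comment semantics above; the interval values are arbitrary.

```bash
# Illustrative only: enable intermediate sharded checkpoints and periodic full-model exports.
HYDRA_FULL_ERROR=1 python3 "${SAGEMAKER_TRAINING_LAUNCHER_DIR}/main.py" \
    recipes=fine-tuning/llama/hf_llama4_17b_16e_seq4k_gpu_lora_multimodal_finetuning \
    base_results_dir="${SAGEMAKER_TRAINING_LAUNCHER_DIR}/results" \
    recipes.exp_manager.exp_dir="$EXP_DIR" \
    recipes.model.data.train_dir="$TRAIN_DIR" \
    recipes.model.data.val_dir="$VAL_DIR" \
    recipes.model.hf_model_name_or_path="$HF_MODEL_NAME_OR_PATH" \
    recipes.exp_manager.checkpoint_callback_params.save_top_k=1 \
    recipes.exp_manager.export_full_model.every_n_train_steps=25
```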
recipes_collection/recipes/fine-tuning/llama/hf_llama4_17b_16e_seq4k_gpu_lora_text_to_text.yaml (new file)

Lines changed: 151 additions & 0 deletions

@@ -0,0 +1,151 @@
# Original Copyright (c), NVIDIA CORPORATION. Modifications © Amazon.com

# Basic run information configs
run:
  name: hf-llama-4-17b-16e-lora
  results_dir: ${base_results_dir}/${.name}
  time_limit: "6-00:00:00"
  model_type: hf # huggingface for our recipes

# Basic pytorch lightning trainer config
trainer:
  devices: 8
  num_nodes: 1
  accelerator: gpu
  precision: bf16
  max_steps: 50
  log_every_n_steps: 1
  val_check_interval: 1
  limit_val_batches: 0 # Number of batches per each validation run, set to 0 to disable validation.

# Basic pytorch lightning experiment config
# Config for checkpoint/tensorboard etc
exp_manager:
  exp_dir: null
  name: experiment
  # experiment loggers
  create_tensorboard_logger: False
  summary_writer_kwargs: {"save_dir" : "${recipes.exp_manager.exp_dir}/tensorboard"}
  create_mlflow_logger: False
  mlflow_logger_kwargs: {"tracking_uri" : "${recipes.exp_manager.exp_dir}/mlflow"}
  create_wandb_logger: False
  wandb_logger_kwargs: {"save_dir" : "${recipes.exp_manager.exp_dir}"} # wandb creates a wandb folder by default
  create_checkpoint_callback: True
  # Configs to save checkpoint with a fixed interval
  # Note: These config will not work with auto checkpoint mode
  checkpoint_callback_params:
    # Set save_top_k = 0 to disable sharded checkpointing
    save_top_k: 0
    every_n_train_steps: 10
    monitor: "step"
    mode: "max"
    save_last: False
  checkpoint_dir: ${recipes.exp_manager.exp_dir}/checkpoints/
  resume_from_checkpoint: null
  # Enable auto_checkpoint to automatically calculate the checkpoint interval and resume from checkpoint
  auto_checkpoint:
    enabled: False
  export_full_model:
    # Set every_n_train_steps = 0 to disable full checkpointing
    every_n_train_steps: 0
    save_last: True

################# Predefined configs ##########################
use_smp_model: False # Disable sagemaker model parallelism for PEFT
distributed_backend: nccl


# Model training configs
model:
  model_type: llama_v4
  # Base configs
  train_batch_size: 1
  seed: 12345
  grad_clip: 1.0
  log_reduced_training_loss: True

  # Memory saving / distributed training configs
  context_parallel_degree: 1
  moe: False
  activation_checkpointing: True
  activation_loading_horizon: 2
  delayed_param: False
  offload_activations: False
  multi_modal: False # this recipe is for text based finetuning only, in other words, the vision model is left untouched

  # FSDP Configs
  sharding_strategy: hybrid_shard
  forward_prefetch: True
  shard_degree: 8
  backward_fetch_policy: backward_pre
  auto_wrap_policy: transformer_auto_wrap_policy
  limit_all_gathers: True
  use_orig_param: False

  # FP8 config
  fp8: False # PEFT does not support fp8

  # Model architecture
  max_context_width: 4096
  max_position_embeddings: ${.max_context_width}
  num_hidden_layers: 48 # text_config.num_hidden_layers
  hidden_size: 5120 # text_config.hidden_size
  num_attention_heads: 40 # text_config.num_attention_heads
  intermediate_size: 8192 # text_config.intermediate_size
  initializer_range: 0.02 # text_config.initializer_range
  vocab_size: 202048 # text_config.vocab_size
  num_key_value_heads: 8 # text_config.num_key_value_heads
  rope_theta: 500000.0 # text_config.rope_theta
  use_flash_attention: True

  # rope scaling
  rope_scaling:
    rope_type: llama3
    factor: 8.0
    high_freq_factor: 4.0
    low_freq_factor: 1.0
    original_max_position_embeddings: 8192

  # Finetuning config
  do_finetune: True
  # The path to resume from, needs to be HF compatible
  hf_model_name_or_path: null
  hf_access_token: null
  # PEFT config
  peft:
    peft_type: lora
    target_modules: ["language_model.model.layers.*self_attn.(q_proj|k_proj|v_proj|o_proj)"]
    rank: 16
    alpha: 16
    dropout: 0.1

  precision: ${recipes.trainer.precision}
  ################# End of Predefined configs ##########################

  # Learning rate and optimizer configs
  lr_decay_iters: ${recipes.trainer.max_steps}
  # Optimizer
  optim:
    name: adamw
    lr: 0.0001
    weight_decay: 0.01
    betas:
      - 0.9
      - 0.95
    sched:
      name: CosineAnnealing
      warmup_steps: 0
      constant_steps: 0
      min_lr: 0.000001

  # Data configs
  data:
    train_dir: null
    val_dir: null
    dataset_type: hf
    use_synthetic_data: False

  # Profiling configs
  # Viztracer profiling options
  viztracer:
    enabled: false
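Compared to the multimodal recipe, this text-only variant sets multi_modal: False and delayed_param: False, and scopes the LoRA target_modules to the language model's attention projections via a regex, leaving the vision tower untouched. Like its sibling, it ships with validation disabled (limit_val_batches: 0). If VAL_DIR points at a real validation set, overrides along the following lines could presumably re-enable a periodic validation pass; the batch count and interval are placeholder values, not tested settings.

```bash
# Illustrative only: run the text-only recipe with a small validation pass every 25 steps.
HYDRA_FULL_ERROR=1 python3 "${SAGEMAKER_TRAINING_LAUNCHER_DIR}/main.py" \
    recipes=fine-tuning/llama/hf_llama4_17b_16e_seq4k_gpu_lora_text_to_text \
    base_results_dir="${SAGEMAKER_TRAINING_LAUNCHER_DIR}/results" \
    recipes.exp_manager.exp_dir="$EXP_DIR" \
    recipes.model.data.train_dir="$TRAIN_DIR" \
    recipes.model.data.val_dir="$VAL_DIR" \
    recipes.model.hf_model_name_or_path="$HF_MODEL_NAME_OR_PATH" \
    recipes.trainer.limit_val_batches=10 \
    recipes.trainer.val_check_interval=25
```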
