Skip to content

Commit f9f37ea

Browse files
authored
Merge pull request #22 from aws/release-1-2-1
Sagemaker Hyperpod Recipes Release 1.2.1
2 parents f95303f + 4bc0379 commit f9f37ea

File tree

13 files changed

+189
-12
lines changed

13 files changed

+189
-12
lines changed

launcher/nemo/k8s_templates/training/train-script-gpu.yaml

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@ data:
1515
git clone {{ $config.git.repo_url_or_path }} $GIT_CLONE_DIR
1616
GIT_CLONE_DIR=${GIT_CLONE_DIR}/
1717
cd $GIT_CLONE_DIR
18+
rm -rf __pycache__
1819
1920
{{- if $config.git.branch }}
2021
git checkout {{ $config.git.branch }}
@@ -24,6 +25,11 @@ data:
2425
git fetch origin {{ $config.git.commit }}
2526
git reset --hard {{ $config.git.commit }}
2627
{{- end }}
28+
{{- if $config.git.update_adapter }}
29+
30+
pip install . --force-reinstall --no-deps
31+
32+
{{- end }}
2733
{{- else }}
2834
GIT_CLONE_DIR=""
2935
{{- end }}

launcher/nemo/k8s_templates/training/train-script-trn.yaml

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@ data:
1717
git clone {{ $config.git.repo_url_or_path }} $GIT_CLONE_DIR
1818
GIT_CLONE_DIR=${GIT_CLONE_DIR}/
1919
cd $GIT_CLONE_DIR
20+
rm -rf __pycache__
2021
2122
{{- if $config.git.branch }}
2223
git checkout {{ $config.git.branch }}
@@ -26,6 +27,11 @@ data:
2627
git fetch origin {{ $config.git.commit }}
2728
git reset --hard {{ $config.git.commit }}
2829
{{- end }}
30+
{{- if $config.git.update_adapter }}
31+
32+
pip install . --force-reinstall --no-deps
33+
34+
{{- end }}
2935
{{- else }}
3036
GIT_CLONE_DIR=""
3137
{{- end }}

launcher/nemo/k8s_templates/training/values.yaml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -70,6 +70,7 @@ trainingConfig:
7070
branch: null
7171
commit: null
7272
token: null
73+
update_adapter: null
7374

7475
# Commands to run before training
7576
pre_script: []

launcher/nemo/stages.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -386,6 +386,9 @@ def _make_train_script_text(self, stage_cfg_path=None, port=41000) -> str:
386386
f"git clone {repo_url_or_path} $GIT_CLONE_DIR",
387387
"GIT_CLONE_DIR=${GIT_CLONE_DIR}/",
388388
"cd $GIT_CLONE_DIR",
389+
# a stale __pycache__ can lead to unexpected behavior when the user
390+
# clones the Adapter and then modifies its source
391+
"rm -rf __pycache__",
389392
]
390393
)
391394
else:
@@ -397,6 +400,8 @@ def _make_train_script_text(self, stage_cfg_path=None, port=41000) -> str:
397400
if self.cfg.get("git", None) is not None and self.cfg.git.get("commit", None) is not None:
398401
script_text.append(f"git fetch origin {self.cfg.git.commit}")
399402
script_text.append(f"git reset --hard {self.cfg.git.commit}")
403+
if OmegaConf.select(self.cfg, "git.update_adapter", default=False):
404+
script_text.append("\npip install . --force-reinstall --no-deps")
400405
else:
401406
script_text.append('GIT_CLONE_DIR=""')
402407

@@ -703,6 +708,8 @@ def generate_default_k8s_value_template(self, template_root, cluster_parameters,
703708
values_template.trainingConfig.git.branch = self.cfg.git.branch
704709
if self.cfg.git.get("commit", None) is not None:
705710
values_template.trainingConfig.git.commit = self.cfg.git.commit
711+
if self.cfg.git.get("update_adapter", None) is not None:
712+
values_template.trainingConfig.git.update_adapter = self.cfg.git.update_adapter
706713

707714
values_template.trainingConfig.device = self.device
708715
values_template.trainingConfig.scriptArgs = self.get_script_args_str(stage_cfg_path)

recipes_collection/config.yaml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@ git:
2727
commit: null
2828
entry_script: null
2929
token: null
30+
update_adapter: false # if true, re-installs the cloned Adapter code (pip install . --force-reinstall --no-deps) without re-installing its dependencies
3031

3132
env_vars:
3233
NCCL_DEBUG: WARN # Logging level for NCCL. Set to "INFO" for debug information

tests/k8s_workflow/k8s_baseline_artifacts/llama-8b/k8s_template/values.yaml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@ trainingConfig:
2727
branch: test_branch
2828
commit: test_commit
2929
token: null
30+
update_adapter: false
3031
pre_script: []
3132
post_script: []
3233
labelSelector:

tests/k8s_workflow/k8s_baseline_artifacts/test_custom/k8s_template/values.yaml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@ trainingConfig:
2626
branch: null
2727
commit: null
2828
token: null
29+
update_adapter: null
2930
pre_script: []
3031
post_script: []
3132
labelSelector:

tests/slurm_workflow/slurm_baseline_artifacts/hf-llama3-8b/train_script.sh

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@ GIT_CLONE_DIR="$HOME/tmp/$LAUNCHER_HOSTNAME"
2525
git clone https://github.com/aws/sagemaker-hyperpod-training-adapter-for-nemo.git $GIT_CLONE_DIR
2626
GIT_CLONE_DIR=${GIT_CLONE_DIR}/
2727
cd $GIT_CLONE_DIR
28+
rm -rf __pycache__
2829

2930
unset SLURM_NTASKS
3031

tests/slurm_workflow/slurm_baseline_artifacts/llama-8b/train_script.sh

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@ GIT_CLONE_DIR="$HOME/tmp/$LAUNCHER_HOSTNAME"
2525
git clone https://github.com/aws/sagemaker-hyperpod-training-adapter-for-nemo.git $GIT_CLONE_DIR
2626
GIT_CLONE_DIR=${GIT_CLONE_DIR}/
2727
cd $GIT_CLONE_DIR
28+
rm -rf __pycache__
2829

2930
unset SLURM_NTASKS
3031

tests/slurm_workflow/slurm_baseline_artifacts/test_custom/train_script.sh

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@ GIT_CLONE_DIR="$HOME/tmp/$LAUNCHER_HOSTNAME"
2525
git clone https://github.com/example $GIT_CLONE_DIR
2626
GIT_CLONE_DIR=${GIT_CLONE_DIR}/
2727
cd $GIT_CLONE_DIR
28+
rm -rf __pycache__
2829

2930
unset SLURM_NTASKS
3031

0 commit comments

Comments
 (0)