Skip to content

Commit a15395f

Browse files
authored
Support deep seek distill model (#50)
* support deepseek distilled model
1 parent d7cbc77 commit a15395f

File tree

3 files changed

+44
-4
lines changed

3 files changed

+44
-4
lines changed

README.md

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -161,6 +161,37 @@ hyperpod start-job --job-name <job-name> [--namespace <namespace>] [--job-kind <
161161
* `persistent-volume-claims` (list[string]) - Optional. The pre-created persistent volume claims (PVCs) that the data scientist can choose to mount to the containers. The cluster admin users should create the PVCs and provide them to the data scientist users.
162162
* `results-dir` (string) - Optional. The location to store the results, checkpoints, and logs. The cluster admin users should set this up and provide it to the data scientist users. The default value is `./results`.
163163
* `service-account-name` (string) - Optional. The Kubernetes service account that allows Pods to access resources based on the permissions granted to that service account. The cluster admin users should create the Kubernetes service account.
164+
* `recipe` (string) - Optional. The recipe to use for the job. The recipe is a predefined set of parameters for the job.
165+
* `override-parameters` (string) - Optional. The parameters to override for the job. The parameters are in JSON format.
166+
Example:
167+
```
168+
hyperpod start-job --recipe <recipe-name>
169+
```
170+
171+
Below is an example of how to use the `override-parameters` option with a DeepSeek recipe.
172+
173+
```
174+
hyperpod start-job --recipe fine-tuning/deepseek/hf_deepseek_r1_distilled_llama_8b_seq8k_gpu_fine_tuning --override-parameters \
175+
'{
176+
"cluster":"k8s",
177+
"cluster_type":"k8s",
178+
"container":"658645717510.dkr.ecr.us-west-2.amazonaws.com/smdistributed-modelparallel:2.4.1-gpu-py311-cu121",
179+
"+cluster.persistent_volume_claims.0.claimName":"fsx-claim-large",
180+
"+cluster.persistent_volume_claims.0.mountPath":"data",
181+
"cluster.service_account_name":"",
182+
"recipes.run.name":"deepseek",
183+
"recipes.model.train_batch_size":"1",
184+
"instance_type":"p4d.24xlarge",
185+
"recipes.model.data.use_synthetic_data":"True",
186+
"recipes.model.fp8":"False",
187+
"recipes.exp_manager.auto_checkpoint.enabled":"False",
188+
"recipes.exp_manager.export_full_model.save_last":"False",
189+
"recipes.exp_manager.checkpoint_callback_params.save_last":"False",
190+
"recipes.model.hf_model_name_or_path":"deepseek-ai/DeepSeek-R1-Distill-Llama-8B",
191+
"recipes.model.hf_access_token":"<your-access-token>",
192+
"recipes.exp_manager.exp_dir":""
193+
}'
194+
```
164195
165196
166197
### Getting Job Details

src/hyperpod_cli/commands/job.py

Lines changed: 12 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -473,6 +473,14 @@ def cancel_job(
473473
fine-tuning/llama/hf_llama3_8b_seq16k_gpu_fine_tuning \n
474474
fine-tuning/llama/hf_llama3_8b_seq8k_gpu_lora \n
475475
fine-tuning/llama/hf_llama3_70b_seq16k_gpu_fine_tuning \n
476+
fine-tuning/deepseek/hf_deepseek_r1_distilled_llama_8b_seq16k_gpu_fine_tuning \n
477+
fine-tuning/deepseek/hf_deepseek_r1_distilled_llama_8b_seq8k_gpu_fine_tuning \n
478+
fine-tuning/deepseek/hf_deepseek_r1_distilled_llama_8b_seq8k_gpu_lora \n
479+
fine-tuning/deepseek/hf_deepseek_r1_distilled_llama_8b_seq16k_gpu_lora \n
480+
fine-tuning/deepseek/hf_deepseek_r1_distilled_llama_70b_seq16k_gpu_fine_tuning \n
481+
fine-tuning/deepseek/hf_deepseek_r1_distilled_llama_70b_seq8k_gpu_fine_tuning \n
482+
fine-tuning/deepseek/hf_deepseek_r1_distilled_llama_70b_seq8k_gpu_lora \n
483+
fine-tuning/deepseek/hf_deepseek_r1_distilled_llama_70b_seq16k_gpu_lora \n
476484
"""
477485
)
478486
@click.option(
@@ -776,7 +784,7 @@ def start_job(
776784
max_retry=max_retry,
777785
deep_health_check_passed_nodes_only=deep_health_check_passed_nodes_only,
778786
)
779-
787+
# TODO: Unblock this after fixing the issue for customers using an EKS cluster.
780788
console_link = utils.get_cluster_console_url()
781789
print(json.dumps({"Console URL": console_link}, indent=1, sort_keys=False))
782790

@@ -808,8 +816,9 @@ def patch_job(patch_type: str, job_name: str, namespace: Optional[str]):
808816
group=KUEUE_CUSTOM_OBJECT_GROUP,
809817
resource=WORKLOAD_CUSTOM_OBJECT_PLURAL,
810818
)
811-
namespace = DiscoverNamespaces().discover_accessible_namespace(resource_attributes_template)
812-
819+
# TODO: Re-enable namespace discovery once the customer onboarding experience for Crescendo is improved.
820+
#namespace = DiscoverNamespaces().discover_accessible_namespace(resource_attributes_template)
821+
namespace = "default"
813822

814823
patch_type_enum = JobPatchType(patch_type)
815824
k8s_client = KubernetesClient()

0 commit comments

Comments
 (0)