Support DeepSeek distilled model (#50)

xiaoxshe · web-flow · commit a15395f42f95 · 2025-01-31T11:53:13.000-08:00
* support deepseek distilled model
diff --git a/README.md b/README.md
@@ -161,6 +161,37 @@ hyperpod start-job --job-name <job-name> [--namespace <namespace>] [--job-kind <
 * `persistent-volume-claims` (list[string]) - Optional. The pre-created persistent volume claims (PVCs) that the data scientist can choose to mount to the containers. The cluster admin users should create PVCs and provide it to the data scientist users.
 * `results-dir` (string) - Optional. The location to store the results, checkpoints, and logs. The cluster admin users should set this up and provide it to the data scientist users. The default value is `./results`.
 * `service-account-name` - Optional. The Kubernetes service account that allows Pods to access resources based on the permissions granted to that service account. The cluster admin users should create the Kubernetes service account.
+* `recipe` (string) - Optional. The recipe to use for the job. The recipe is a predefined set of parameters for the job.
+* `override-parameters` (string) - Optional. The recipe parameters to override for the job, given as a JSON object string that maps parameter names to values.
+Example:
+```
+hyperpod start-job --recipe <recipe-name>
+```
+
+Below is an example of how to use the `override-parameters` option with a DeepSeek recipe.
+
+```
+hyperpod start-job --recipe fine-tuning/deepseek/hf_deepseek_r1_distilled_llama_8b_seq8k_gpu_fine_tuning --override-parameters \
+'{
+    "cluster":"k8s",
+    "cluster_type":"k8s",
+    "container":"658645717510.dkr.ecr.us-west-2.amazonaws.com/smdistributed-modelparallel:2.4.1-gpu-py311-cu121",
+    "+cluster.persistent_volume_claims.0.claimName":"fsx-claim-large",
+    "+cluster.persistent_volume_claims.0.mountPath":"data",
+    "cluster.service_account_name":"",
+    "recipes.run.name":"deepseek",
+    "recipes.model.train_batch_size":"1",
+    "instance_type":"p4d.24xlarge",
+    "recipes.model.data.use_synthetic_data":"True",
+    "recipes.model.fp8":"False",
+    "recipes.exp_manager.auto_checkpoint.enabled":"False",
+    "recipes.exp_manager.export_full_model.save_last":"False",
+    "recipes.exp_manager.checkpoint_callback_params.save_last":"False",
+    "recipes.model.hf_model_name_or_path":"deepseek-ai/DeepSeek-R1-Distill-Llama-8B",
+    "recipes.model.hf_access_token":"<your-access-token>",
+    "recipes.exp_manager.exp_dir":""   
+}'
+```
 
 
 ### Getting Job Details
diff --git a/src/hyperpod_cli/commands/job.py b/src/hyperpod_cli/commands/job.py
@@ -473,6 +473,14 @@ def cancel_job(
 fine-tuning/llama/hf_llama3_8b_seq16k_gpu_fine_tuning \n
 fine-tuning/llama/hf_llama3_8b_seq8k_gpu_lora \n
 fine-tuning/llama/hf_llama3_70b_seq16k_gpu_fine_tuning \n
+fine-tuning/deepseek/hf_deepseek_r1_distilled_llama_8b_seq16k_gpu_fine_tuning \n
+fine-tuning/deepseek/hf_deepseek_r1_distilled_llama_8b_seq8k_gpu_fine_tuning \n
+fine-tuning/deepseek/hf_deepseek_r1_distilled_llama_8b_seq8k_gpu_lora \n
+fine-tuning/deepseek/hf_deepseek_r1_distilled_llama_8b_seq16k_gpu_lora \n
+fine-tuning/deepseek/hf_deepseek_r1_distilled_llama_70b_seq16k_gpu_fine_tuning \n
+fine-tuning/deepseek/hf_deepseek_r1_distilled_llama_70b_seq8k_gpu_fine_tuning \n
+fine-tuning/deepseek/hf_deepseek_r1_distilled_llama_70b_seq8k_gpu_lora \n
+fine-tuning/deepseek/hf_deepseek_r1_distilled_llama_70b_seq16k_gpu_lora \n
             """
 )
 @click.option(
@@ -776,7 +784,7 @@ def start_job(
         max_retry=max_retry,
         deep_health_check_passed_nodes_only=deep_health_check_passed_nodes_only,
     )
-
+    # TODO: Unblock this after fixing the issue for customers using an EKS cluster.
     console_link = utils.get_cluster_console_url()
     print(json.dumps({"Console URL": console_link}, indent=1, sort_keys=False))
 
@@ -808,8 +816,9 @@ def patch_job(patch_type: str, job_name: str, namespace: Optional[str]):
             group=KUEUE_CUSTOM_OBJECT_GROUP,
             resource=WORKLOAD_CUSTOM_OBJECT_PLURAL,
         )
-        namespace = DiscoverNamespaces().discover_accessible_namespace(resource_attributes_template)
-
+        # TODO: Re-enable the namespace discovery commented out below once the customer onboarding experience for Crescendo is improved.
+        #namespace = DiscoverNamespaces().discover_accessible_namespace(resource_attributes_template)
+        namespace = "default"
     
     patch_type_enum = JobPatchType(patch_type)
     k8s_client = KubernetesClient()
diff --git a/src/hyperpod_cli/sagemaker_hyperpod_recipes b/src/hyperpod_cli/sagemaker_hyperpod_recipes
@@ -1 +1 @@
-Subproject commit 66e49e0a86bc3602ae8db5ea8f01e249328475b6
+Subproject commit 6bd77d3b0917bbed1a311e8f7fafa2cdce45b10e