From 5a6e2206d16f6d6ea75c8d1890d20b15f311c94a Mon Sep 17 00:00:00 2001
From: Alvaro Bartolome <36760800+alvarobartt@users.noreply.github.com>
Date: Thu, 14 Aug 2025 11:44:54 +0200
Subject: [PATCH 1/5] Add `exec` & fix formatting in `sagemaker-entrypoint.sh`

Use `exec` so that the router process replaces the shell and runs as
PID 1, allowing it to receive signals directly and therefore to be shut
down gracefully
---
 sagemaker-entrypoint.sh | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/sagemaker-entrypoint.sh b/sagemaker-entrypoint.sh
index b6cec7bb..1f3a4a9d 100644
--- a/sagemaker-entrypoint.sh
+++ b/sagemaker-entrypoint.sh
@@ -1,13 +1,13 @@
 #!/bin/bash
 
 if [[ -z "${HF_MODEL_ID}" ]]; then
-  echo "HF_MODEL_ID must be set"
-  exit 1
+    echo "HF_MODEL_ID must be set"
+    exit 1
 fi
 export MODEL_ID="${HF_MODEL_ID}"
 
 if [[ -n "${HF_MODEL_REVISION}" ]]; then
-  export REVISION="${HF_MODEL_REVISION}"
+    export REVISION="${HF_MODEL_REVISION}"
 fi
 
-text-embeddings-router --port 8080 --json-output
+exec text-embeddings-router --port 8080 --json-output

From c910de37576a5df87c89841e73c4864e686e344e Mon Sep 17 00:00:00 2001
From: Alvaro Bartolome <36760800+alvarobartt@users.noreply.github.com>
Date: Thu, 14 Aug 2025 11:45:30 +0200
Subject: [PATCH 2/5] Fix `compute_cap` parsing and formatting in
 `sagemaker-entrypoint-cuda-all.sh`

---
 sagemaker-entrypoint-cuda-all.sh | 81 +++++++++-----------------------
 1 file changed, 22 insertions(+), 59 deletions(-)

diff --git a/sagemaker-entrypoint-cuda-all.sh b/sagemaker-entrypoint-cuda-all.sh
index a3c63cbb..0d89ce28 100644
--- a/sagemaker-entrypoint-cuda-all.sh
+++ b/sagemaker-entrypoint-cuda-all.sh
@@ -1,14 +1,21 @@
 #!/bin/bash
 
+if ! command -v nvidia-smi &>/dev/null; then
+    echo "Error: 'nvidia-smi' command not found."
+    exit 1
+fi
+
+# Function to compare version numbers
 verlte() {
     [ "$1" = "$2" ] && return 1 || [ "$2" = "$(echo -e "$1\n$2" | sort -V | head -n1)" ]
 }
 
+# CUDA compat libs logic
 if [ -f /usr/local/cuda/compat/libcuda.so.1 ]; then
     CUDA_COMPAT_MAX_DRIVER_VERSION=$(readlink /usr/local/cuda/compat/libcuda.so.1 | cut -d"." -f 3-)
     echo "CUDA compat package requires Nvidia driver ≤${CUDA_COMPAT_MAX_DRIVER_VERSION}"
     cat /proc/driver/nvidia/version
-    NVIDIA_DRIVER_VERSION=$(sed -n 's/^NVRM.*Kernel Module *\([0-9.]*\).*$/\1/p' /proc/driver/nvidia/version 2>/dev/null || true)
+    NVIDIA_DRIVER_VERSION=$(sed -n 's/^NVRM.*Kernel Module \([0-9.]*\).*$/\1/p' /proc/driver/nvidia/version 2>/dev/null || true)
     echo "Current installed Nvidia driver version is ${NVIDIA_DRIVER_VERSION}"
     if [ $(verlte "$CUDA_COMPAT_MAX_DRIVER_VERSION" "$NVIDIA_DRIVER_VERSION") ]; then
         echo "Setup CUDA compatibility libs path to LD_LIBRARY_PATH"
@@ -21,71 +28,27 @@ else
     echo "Skip CUDA compat libs setup as package not found"
 fi
 
+# Model variables check
 if [[ -z "${HF_MODEL_ID}" ]]; then
-  echo "HF_MODEL_ID must be set"
-  exit 1
+    echo "HF_MODEL_ID must be set"
+    exit 1
 fi
 export MODEL_ID="${HF_MODEL_ID}"
 
 if [[ -n "${HF_MODEL_REVISION}" ]]; then
-  export REVISION="${HF_MODEL_REVISION}"
-fi
-
-if ! command -v nvidia-smi &> /dev/null; then
-    echo "Error: 'nvidia-smi' command not found."
-    exit 1
-fi
-
-# Query GPU name using nvidia-smi
-gpu_name=$(nvidia-smi --query-gpu=gpu_name --format=csv | awk 'NR==2')
-if [ $? -ne 0 ]; then
-    echo "Error: $gpu_name"
-    echo "Query gpu_name failed"
-else
-    echo "Query gpu_name succeeded. Printing output: $gpu_name"
+    export REVISION="${HF_MODEL_REVISION}"
 fi
 
-# Function to get compute capability based on GPU name
-get_compute_cap() {
-    gpu_name="$1"
-
-    # Check if the GPU name contains "A10G"
-    if [[ "$gpu_name" == *"A10G"* ]]; then
-        echo "86"
-    # Check if the GPU name contains "A100"
-    elif [[ "$gpu_name" == *"A100"* ]]; then
-        echo "80"
-    # Check if the GPU name contains "H100"
-    elif [[ "$gpu_name" == *"H100"* ]]; then
-        echo "90"
-    # Cover Nvidia T4
-    elif [[ "$gpu_name" == *"T4"* ]]; then
-        echo "75"
-    # Cover Nvidia L4
-    elif [[ "$gpu_name" == *"L4"* ]]; then
-        echo "89"
-    else
-        echo "80"  # Default compute capability
-    fi
-}
-
-if [[ -z "${CUDA_COMPUTE_CAP}" ]]
-then
-    compute_cap=$(get_compute_cap "$gpu_name")
-    echo "the compute_cap is $compute_cap"
-else
-    compute_cap=$CUDA_COMPUTE_CAP
-fi
+compute_cap=$(nvidia-smi --query-gpu=compute_cap --format=csv | sed -n '2p' | sed 's/\.//g')
 
-if [[ ${compute_cap} -eq 75 ]]
-then
-    text-embeddings-router-75 --port 8080 --json-output
-elif [[ ${compute_cap} -ge 80 && ${compute_cap} -lt 90 ]]
-then
-    text-embeddings-router-80 --port 8080 --json-output
-elif [[ ${compute_cap} -eq 90 ]]
-then
-    text-embeddings-router-90 --port 8080 --json-output
+# Router selection logic
+if [ ${compute_cap} -eq 75 ]; then
+    exec text-embeddings-router-75 --port 8080 --json-output
+elif [ ${compute_cap} -ge 80 -a ${compute_cap} -lt 90 ]; then
+    exec text-embeddings-router-80 --port 8080 --json-output
+elif [ ${compute_cap} -eq 90 ]; then
+    exec text-embeddings-router-90 --port 8080 --json-output
 else
-    echo "cuda compute cap ${compute_cap} is not supported"; exit 1
+    echo "cuda compute cap ${compute_cap} is not supported"
+    exit 1
 fi

From feec6a4177c27c3662fa60979aad0f6b42f20a6c Mon Sep 17 00:00:00 2001
From: Alvaro Bartolome <36760800+alvarobartt@users.noreply.github.com>
Date: Thu, 14 Aug 2025 11:45:48 +0200
Subject: [PATCH 3/5] Fix formatting in `cuda-all-entrypoint.sh`

---
 cuda-all-entrypoint.sh | 14 ++++++--------
 1 file changed, 6 insertions(+), 8 deletions(-)

diff --git a/cuda-all-entrypoint.sh b/cuda-all-entrypoint.sh
index d9be21ea..6f1c909b 100644
--- a/cuda-all-entrypoint.sh
+++ b/cuda-all-entrypoint.sh
@@ -1,21 +1,19 @@
 #!/bin/bash
 
-if ! command -v nvidia-smi &> /dev/null; then
+if ! command -v nvidia-smi &>/dev/null; then
     echo "Error: 'nvidia-smi' command not found."
     exit 1
 fi
 
 compute_cap=$(nvidia-smi --query-gpu=compute_cap --format=csv | sed -n '2p' | sed 's/\.//g')
 
-if [ ${compute_cap} -eq 75 ]
-then
+if [ ${compute_cap} -eq 75 ]; then
     exec text-embeddings-router-75 "$@"
-elif [ ${compute_cap} -ge 80 -a ${compute_cap} -lt 90 ]
-then
+elif [ ${compute_cap} -ge 80 -a ${compute_cap} -lt 90 ]; then
     exec text-embeddings-router-80 "$@"
-elif [ ${compute_cap} -eq 90 ]
-then
+elif [ ${compute_cap} -eq 90 ]; then
     exec text-embeddings-router-90 "$@"
 else
-    echo "cuda compute cap ${compute_cap} is not supported"; exit 1
+    echo "cuda compute cap ${compute_cap} is not supported"
+    exit 1
 fi

From b55218ff131d37e1a6ff828462ba14a3b177519e Mon Sep 17 00:00:00 2001
From: Alvaro Bartolome <36760800+alvarobartt@users.noreply.github.com>
Date: Thu, 14 Aug 2025 11:47:34 +0200
Subject: [PATCH 4/5] Update `Dockerfile` to exclude AWS SageMaker

Removed the AWS SageMaker stage, as it is no longer used directly: the
`Dockerfile` is ported as-is into
https://github.com/awslabs/llm-hosting-container/tree/main/huggingface/pytorch/tei/docker
along with its respective entrypoint, and re-built there, so there is no
need for it to live here (most likely the entrypoint could also be removed)
---
 Dockerfile | 9 ---------
 1 file changed, 9 deletions(-)

diff --git a/Dockerfile b/Dockerfile
index 97d3c4e9..2bd8e491 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -110,14 +110,5 @@ FROM base AS http
 
 COPY --from=http-builder /usr/src/target/release/text-embeddings-router /usr/local/bin/text-embeddings-router
 
-# Amazon SageMaker compatible image
-FROM http AS sagemaker
-COPY --chmod=775 sagemaker-entrypoint.sh entrypoint.sh
-
-ENTRYPOINT ["./entrypoint.sh"]
-
-# Default image
-FROM http
-
 ENTRYPOINT ["text-embeddings-router"]
 CMD ["--json-output"]

From f84347781c7cc8cc9e61a019eb7590090cc40557 Mon Sep 17 00:00:00 2001
From: Alvaro Bartolome <36760800+alvarobartt@users.noreply.github.com>
Date: Thu, 14 Aug 2025 11:49:41 +0200
Subject: [PATCH 5/5] Remove both Google Vertex & AWS SageMaker from
 `Dockerfile-cuda-all`

Neither of those is required, since both Google Cloud and AWS SageMaker
port the `Dockerfile-cuda-all` as-is and then re-build it on their side,
meaning that neither the `VERTEX` build argument nor the AWS SageMaker
stage is actually used. For more information, check the repositories
https://github.com/huggingface/Google-Cloud-Containers/tree/main/containers/tei,
and
https://github.com/awslabs/llm-hosting-container/tree/main/huggingface/pytorch/tei/docker,
respectively for Google Cloud and AWS SageMaker.
---
 Dockerfile-cuda-all | 60 ++++++---------------------------------------
 1 file changed, 7 insertions(+), 53 deletions(-)

diff --git a/Dockerfile-cuda-all b/Dockerfile-cuda-all
index 5dca432a..c1e9e2ec 100644
--- a/Dockerfile-cuda-all
+++ b/Dockerfile-cuda-all
@@ -35,7 +35,6 @@ FROM base-builder AS builder
 
 ARG GIT_SHA
 ARG DOCKER_LABEL
-ARG VERTEX="false"
 
 # sccache specific variables
 ARG SCCACHE_GHA_ENABLED
@@ -51,39 +50,19 @@ COPY --from=planner /usr/src/recipe.json recipe.json
 
 RUN --mount=type=secret,id=actions_results_url,env=ACTIONS_RESULTS_URL \
     --mount=type=secret,id=actions_runtime_token,env=ACTIONS_RUNTIME_TOKEN \
-    if [ $VERTEX = "true" ]; \
-    then \
-      cargo chef cook --release --features google --recipe-path recipe.json && sccache -s; \
-    else \
-      cargo chef cook --release --recipe-path recipe.json && sccache -s; \
-    fi;
+    cargo chef cook --release --recipe-path recipe.json && sccache -s;
 
 RUN --mount=type=secret,id=actions_results_url,env=ACTIONS_RESULTS_URL \
     --mount=type=secret,id=actions_runtime_token,env=ACTIONS_RUNTIME_TOKEN \
-    if [ $VERTEX = "true" ]; \
-    then \
-      CUDA_COMPUTE_CAP=75 cargo chef cook --release --features google --features candle-cuda-turing --recipe-path recipe.json && sccache -s; \
-    else \
-      CUDA_COMPUTE_CAP=75 cargo chef cook --release --features candle-cuda-turing --recipe-path recipe.json && sccache -s; \
-    fi;
+    CUDA_COMPUTE_CAP=75 cargo chef cook --release --features candle-cuda-turing --recipe-path recipe.json && sccache -s;
 
 RUN --mount=type=secret,id=actions_results_url,env=ACTIONS_RESULTS_URL \
     --mount=type=secret,id=actions_runtime_token,env=ACTIONS_RUNTIME_TOKEN \
-    if [ $VERTEX = "true" ]; \
-    then \
-      CUDA_COMPUTE_CAP=80 cargo chef cook --release --features google --features candle-cuda --recipe-path recipe.json && sccache -s; \
-    else \
-      CUDA_COMPUTE_CAP=80 cargo chef cook --release --features candle-cuda --recipe-path recipe.json && sccache -s; \
-    fi;
+    CUDA_COMPUTE_CAP=80 cargo chef cook --release --features candle-cuda --recipe-path recipe.json && sccache -s;
 
 RUN --mount=type=secret,id=actions_results_url,env=ACTIONS_RESULTS_URL \
     --mount=type=secret,id=actions_runtime_token,env=ACTIONS_RUNTIME_TOKEN \
-    if [ $VERTEX = "true" ]; \
-    then \
-      CUDA_COMPUTE_CAP=90 cargo chef cook --release --features google --features candle-cuda --recipe-path recipe.json && sccache -s; \
-    else \
-      CUDA_COMPUTE_CAP=90 cargo chef cook --release --features candle-cuda --recipe-path recipe.json && sccache -s; \
-    fi;
+    CUDA_COMPUTE_CAP=90 cargo chef cook --release --features candle-cuda --recipe-path recipe.json && sccache -s;
 
 COPY backends backends
 COPY core core
@@ -93,34 +72,19 @@ COPY Cargo.lock ./
 
 RUN --mount=type=secret,id=actions_results_url,env=ACTIONS_RESULTS_URL \
     --mount=type=secret,id=actions_runtime_token,env=ACTIONS_RUNTIME_TOKEN \
-    if [ $VERTEX = "true" ]; \
-    then \
-        CUDA_COMPUTE_CAP=75 cargo build --release --bin text-embeddings-router -F candle-cuda-turing -F google  && sccache -s; \
-    else \
-        CUDA_COMPUTE_CAP=75 cargo build --release --bin text-embeddings-router -F candle-cuda-turing && sccache -s; \
-    fi;
+    CUDA_COMPUTE_CAP=75 cargo build --release --bin text-embeddings-router -F candle-cuda-turing && sccache -s;
 
 RUN mv /usr/src/target/release/text-embeddings-router /usr/src/target/release/text-embeddings-router-75
 
 RUN --mount=type=secret,id=actions_results_url,env=ACTIONS_RESULTS_URL \
     --mount=type=secret,id=actions_runtime_token,env=ACTIONS_RUNTIME_TOKEN \
-    if [ $VERTEX = "true" ]; \
-    then \
-        CUDA_COMPUTE_CAP=80 cargo build --release --bin text-embeddings-router -F candle-cuda -F google  && sccache -s; \
-    else \
-        CUDA_COMPUTE_CAP=80 cargo build --release --bin text-embeddings-router -F candle-cuda && sccache -s; \
-    fi;
+    CUDA_COMPUTE_CAP=80 cargo build --release --bin text-embeddings-router -F candle-cuda && sccache -s;
 
 RUN mv /usr/src/target/release/text-embeddings-router /usr/src/target/release/text-embeddings-router-80
 
 RUN --mount=type=secret,id=actions_results_url,env=ACTIONS_RESULTS_URL \
     --mount=type=secret,id=actions_runtime_token,env=ACTIONS_RUNTIME_TOKEN \
-    if [ $VERTEX = "true" ]; \
-    then \
-        CUDA_COMPUTE_CAP=90 cargo build --release --bin text-embeddings-router -F candle-cuda -F google  && sccache -s; \
-    else \
-        CUDA_COMPUTE_CAP=90 cargo build --release --bin text-embeddings-router -F candle-cuda && sccache -s; \
-    fi;
+    CUDA_COMPUTE_CAP=90 cargo build --release --bin text-embeddings-router -F candle-cuda && sccache -s;
 
 RUN mv /usr/src/target/release/text-embeddings-router /usr/src/target/release/text-embeddings-router-90
 
@@ -142,16 +106,6 @@ COPY --from=builder /usr/src/target/release/text-embeddings-router-75 /usr/local
 COPY --from=builder /usr/src/target/release/text-embeddings-router-80 /usr/local/bin/text-embeddings-router-80
 COPY --from=builder /usr/src/target/release/text-embeddings-router-90 /usr/local/bin/text-embeddings-router-90
 
-# Amazon SageMaker compatible image
-FROM base AS sagemaker
-
-COPY --chmod=775 sagemaker-entrypoint-cuda-all.sh entrypoint.sh
-
-ENTRYPOINT ["./entrypoint.sh"]
-
-# Default image
-FROM base
-
 COPY --chmod=775 cuda-all-entrypoint.sh entrypoint.sh
 
 ENTRYPOINT ["./entrypoint.sh"]