From 5a6e2206d16f6d6ea75c8d1890d20b15f311c94a Mon Sep 17 00:00:00 2001 From: Alvaro Bartolome <36760800+alvarobartt@users.noreply.github.com> Date: Thu, 14 Aug 2025 11:44:54 +0200 Subject: [PATCH 1/5] Add `exec` & fix formatting in `sagemaker-entrypoint.sh` Use `exec` so that the process runs with PID 1, allowing it to receive signals directly and therefore be shut down gracefully --- sagemaker-entrypoint.sh | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/sagemaker-entrypoint.sh b/sagemaker-entrypoint.sh index b6cec7bb..1f3a4a9d 100644 --- a/sagemaker-entrypoint.sh +++ b/sagemaker-entrypoint.sh @@ -1,13 +1,13 @@ #!/bin/bash if [[ -z "${HF_MODEL_ID}" ]]; then - echo "HF_MODEL_ID must be set" - exit 1 + echo "HF_MODEL_ID must be set" + exit 1 fi export MODEL_ID="${HF_MODEL_ID}" if [[ -n "${HF_MODEL_REVISION}" ]]; then - export REVISION="${HF_MODEL_REVISION}" + export REVISION="${HF_MODEL_REVISION}" fi -text-embeddings-router --port 8080 --json-output +exec text-embeddings-router --port 8080 --json-output From c910de37576a5df87c89841e73c4864e686e344e Mon Sep 17 00:00:00 2001 From: Alvaro Bartolome <36760800+alvarobartt@users.noreply.github.com> Date: Thu, 14 Aug 2025 11:45:30 +0200 Subject: [PATCH 2/5] Fix `compute_cap` parsing and formatting in `sagemaker-entrypoint-cuda-all.sh` --- sagemaker-entrypoint-cuda-all.sh | 81 +++++++++----------------------- 1 file changed, 22 insertions(+), 59 deletions(-) diff --git a/sagemaker-entrypoint-cuda-all.sh b/sagemaker-entrypoint-cuda-all.sh index a3c63cbb..0d89ce28 100644 --- a/sagemaker-entrypoint-cuda-all.sh +++ b/sagemaker-entrypoint-cuda-all.sh @@ -1,14 +1,21 @@ #!/bin/bash +if ! command -v nvidia-smi &>/dev/null; then + echo "Error: 'nvidia-smi' command not found."
+ exit 1 +fi + +# Function to compare version numbers verlte() { [ "$1" = "$2" ] && return 1 || [ "$2" = "$(echo -e "$1\n$2" | sort -V | head -n1)" ] } +# CUDA compat libs logic if [ -f /usr/local/cuda/compat/libcuda.so.1 ]; then CUDA_COMPAT_MAX_DRIVER_VERSION=$(readlink /usr/local/cuda/compat/libcuda.so.1 | cut -d"." -f 3-) echo "CUDA compat package requires Nvidia driver ≤${CUDA_COMPAT_MAX_DRIVER_VERSION}" cat /proc/driver/nvidia/version - NVIDIA_DRIVER_VERSION=$(sed -n 's/^NVRM.*Kernel Module *\([0-9.]*\).*$/\1/p' /proc/driver/nvidia/version 2>/dev/null || true) + NVIDIA_DRIVER_VERSION=$(sed -n 's/^NVRM.*Kernel Module \([0-9.]*\).*$/\1/p' /proc/driver/nvidia/version 2>/dev/null || true) echo "Current installed Nvidia driver version is ${NVIDIA_DRIVER_VERSION}" if [ $(verlte "$CUDA_COMPAT_MAX_DRIVER_VERSION" "$NVIDIA_DRIVER_VERSION") ]; then echo "Setup CUDA compatibility libs path to LD_LIBRARY_PATH" @@ -21,71 +28,27 @@ else echo "Skip CUDA compat libs setup as package not found" fi +# Model variables check if [[ -z "${HF_MODEL_ID}" ]]; then - echo "HF_MODEL_ID must be set" - exit 1 + echo "HF_MODEL_ID must be set" + exit 1 fi export MODEL_ID="${HF_MODEL_ID}" if [[ -n "${HF_MODEL_REVISION}" ]]; then - export REVISION="${HF_MODEL_REVISION}" -fi - -if ! command -v nvidia-smi &> /dev/null; then - echo "Error: 'nvidia-smi' command not found." - exit 1 -fi - -# Query GPU name using nvidia-smi -gpu_name=$(nvidia-smi --query-gpu=gpu_name --format=csv | awk 'NR==2') -if [ $? -ne 0 ]; then - echo "Error: $gpu_name" - echo "Query gpu_name failed" -else - echo "Query gpu_name succeeded. 
Printing output: $gpu_name" + export REVISION="${HF_MODEL_REVISION}" fi -# Function to get compute capability based on GPU name -get_compute_cap() { - gpu_name="$1" - - # Check if the GPU name contains "A10G" - if [[ "$gpu_name" == *"A10G"* ]]; then - echo "86" - # Check if the GPU name contains "A100" - elif [[ "$gpu_name" == *"A100"* ]]; then - echo "80" - # Check if the GPU name contains "H100" - elif [[ "$gpu_name" == *"H100"* ]]; then - echo "90" - # Cover Nvidia T4 - elif [[ "$gpu_name" == *"T4"* ]]; then - echo "75" - # Cover Nvidia L4 - elif [[ "$gpu_name" == *"L4"* ]]; then - echo "89" - else - echo "80" # Default compute capability - fi -} - -if [[ -z "${CUDA_COMPUTE_CAP}" ]] -then - compute_cap=$(get_compute_cap "$gpu_name") - echo "the compute_cap is $compute_cap" -else - compute_cap=$CUDA_COMPUTE_CAP -fi +compute_cap=$(nvidia-smi --query-gpu=compute_cap --format=csv | sed -n '2p' | sed 's/\.//g') -if [[ ${compute_cap} -eq 75 ]] -then - text-embeddings-router-75 --port 8080 --json-output -elif [[ ${compute_cap} -ge 80 && ${compute_cap} -lt 90 ]] -then - text-embeddings-router-80 --port 8080 --json-output -elif [[ ${compute_cap} -eq 90 ]] -then - text-embeddings-router-90 --port 8080 --json-output +# Router selection logic +if [ ${compute_cap} -eq 75 ]; then + exec text-embeddings-router-75 --port 8080 --json-output +elif [ ${compute_cap} -ge 80 -a ${compute_cap} -lt 90 ]; then + exec text-embeddings-router-80 --port 8080 --json-output +elif [ ${compute_cap} -eq 90 ]; then + exec text-embeddings-router-90 --port 8080 --json-output else - echo "cuda compute cap ${compute_cap} is not supported"; exit 1 + echo "cuda compute cap ${compute_cap} is not supported" + exit 1 fi From feec6a4177c27c3662fa60979aad0f6b42f20a6c Mon Sep 17 00:00:00 2001 From: Alvaro Bartolome <36760800+alvarobartt@users.noreply.github.com> Date: Thu, 14 Aug 2025 11:45:48 +0200 Subject: [PATCH 3/5] Fix formatting in `cuda-all-entrypoint.sh` --- cuda-all-entrypoint.sh | 14 ++++++-------- 
1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/cuda-all-entrypoint.sh b/cuda-all-entrypoint.sh index d9be21ea..6f1c909b 100644 --- a/cuda-all-entrypoint.sh +++ b/cuda-all-entrypoint.sh @@ -1,21 +1,19 @@ #!/bin/bash -if ! command -v nvidia-smi &> /dev/null; then +if ! command -v nvidia-smi &>/dev/null; then echo "Error: 'nvidia-smi' command not found." exit 1 fi compute_cap=$(nvidia-smi --query-gpu=compute_cap --format=csv | sed -n '2p' | sed 's/\.//g') -if [ ${compute_cap} -eq 75 ] -then +if [ ${compute_cap} -eq 75 ]; then exec text-embeddings-router-75 "$@" -elif [ ${compute_cap} -ge 80 -a ${compute_cap} -lt 90 ] -then +elif [ ${compute_cap} -ge 80 -a ${compute_cap} -lt 90 ]; then exec text-embeddings-router-80 "$@" -elif [ ${compute_cap} -eq 90 ] -then +elif [ ${compute_cap} -eq 90 ]; then exec text-embeddings-router-90 "$@" else - echo "cuda compute cap ${compute_cap} is not supported"; exit 1 + echo "cuda compute cap ${compute_cap} is not supported" + exit 1 fi From b55218ff131d37e1a6ff828462ba14a3b177519e Mon Sep 17 00:00:00 2001 From: Alvaro Bartolome <36760800+alvarobartt@users.noreply.github.com> Date: Thu, 14 Aug 2025 11:47:34 +0200 Subject: [PATCH 4/5] Update `Dockerfile` to exclude AWS SageMaker Removed because it is no longer used directly: the `Dockerfile` is ported as-is into https://github.com/awslabs/llm-hosting-container/tree/main/huggingface/pytorch/tei/docker along with its respective entrypoint, and re-built there, so there is no need for it to live here (most likely the entrypoint could also be removed) --- Dockerfile | 9 --------- 1 file changed, 9 deletions(-) diff --git a/Dockerfile b/Dockerfile index 97d3c4e9..2bd8e491 100644 --- a/Dockerfile +++ b/Dockerfile @@ -110,14 +110,5 @@ FROM base AS http COPY --from=http-builder /usr/src/target/release/text-embeddings-router /usr/local/bin/text-embeddings-router -# Amazon SageMaker compatible image -FROM http AS sagemaker -COPY --chmod=775 sagemaker-entrypoint.sh entrypoint.sh -
-ENTRYPOINT ["./entrypoint.sh"] - -# Default image -FROM http - ENTRYPOINT ["text-embeddings-router"] CMD ["--json-output"] From f84347781c7cc8cc9e61a019eb7590090cc40557 Mon Sep 17 00:00:00 2001 From: Alvaro Bartolome <36760800+alvarobartt@users.noreply.github.com> Date: Thu, 14 Aug 2025 11:49:41 +0200 Subject: [PATCH 5/5] Remove both Google Vertex & AWS SageMaker from `Dockerfile-cuda-all` Neither of those is required, since both Google Cloud and AWS SageMaker port the `Dockerfile-cuda-all` as-is and then re-build it there, meaning that neither the BUILD_ARG for VERTEX nor the AWS SageMaker stage is actually being used. For more information check the repositories https://github.com/huggingface/Google-Cloud-Containers/tree/main/containers/tei and https://github.com/awslabs/llm-hosting-container/tree/main/huggingface/pytorch/tei/docker, for Google Cloud and AWS SageMaker respectively. --- Dockerfile-cuda-all | 60 ++++++--------------------------------------- 1 file changed, 7 insertions(+), 53 deletions(-) diff --git a/Dockerfile-cuda-all b/Dockerfile-cuda-all index 5dca432a..c1e9e2ec 100644 --- a/Dockerfile-cuda-all +++ b/Dockerfile-cuda-all @@ -35,7 +35,6 @@ FROM base-builder AS builder ARG GIT_SHA ARG DOCKER_LABEL -ARG VERTEX="false" # sccache specific variables ARG SCCACHE_GHA_ENABLED @@ -51,39 +50,19 @@ COPY --from=planner /usr/src/recipe.json recipe.json RUN --mount=type=secret,id=actions_results_url,env=ACTIONS_RESULTS_URL \ --mount=type=secret,id=actions_runtime_token,env=ACTIONS_RUNTIME_TOKEN \ - if [ $VERTEX = "true" ]; \ - then \ - cargo chef cook --release --features google --recipe-path recipe.json && sccache -s; \ - else \ - cargo chef cook --release --recipe-path recipe.json && sccache -s; \ - fi; + cargo chef cook --release --recipe-path recipe.json && sccache -s; RUN --mount=type=secret,id=actions_results_url,env=ACTIONS_RESULTS_URL \ --mount=type=secret,id=actions_runtime_token,env=ACTIONS_RUNTIME_TOKEN \ - if [ $VERTEX = "true" ]; \ -
then \ - CUDA_COMPUTE_CAP=75 cargo chef cook --release --features google --features candle-cuda-turing --recipe-path recipe.json && sccache -s; \ - else \ - CUDA_COMPUTE_CAP=75 cargo chef cook --release --features candle-cuda-turing --recipe-path recipe.json && sccache -s; \ - fi; + CUDA_COMPUTE_CAP=75 cargo chef cook --release --features candle-cuda-turing --recipe-path recipe.json && sccache -s; RUN --mount=type=secret,id=actions_results_url,env=ACTIONS_RESULTS_URL \ --mount=type=secret,id=actions_runtime_token,env=ACTIONS_RUNTIME_TOKEN \ - if [ $VERTEX = "true" ]; \ - then \ - CUDA_COMPUTE_CAP=80 cargo chef cook --release --features google --features candle-cuda --recipe-path recipe.json && sccache -s; \ - else \ - CUDA_COMPUTE_CAP=80 cargo chef cook --release --features candle-cuda --recipe-path recipe.json && sccache -s; \ - fi; + CUDA_COMPUTE_CAP=80 cargo chef cook --release --features candle-cuda --recipe-path recipe.json && sccache -s; RUN --mount=type=secret,id=actions_results_url,env=ACTIONS_RESULTS_URL \ --mount=type=secret,id=actions_runtime_token,env=ACTIONS_RUNTIME_TOKEN \ - if [ $VERTEX = "true" ]; \ - then \ - CUDA_COMPUTE_CAP=90 cargo chef cook --release --features google --features candle-cuda --recipe-path recipe.json && sccache -s; \ - else \ - CUDA_COMPUTE_CAP=90 cargo chef cook --release --features candle-cuda --recipe-path recipe.json && sccache -s; \ - fi; + CUDA_COMPUTE_CAP=90 cargo chef cook --release --features candle-cuda --recipe-path recipe.json && sccache -s; COPY backends backends COPY core core @@ -93,34 +72,19 @@ COPY Cargo.lock ./ RUN --mount=type=secret,id=actions_results_url,env=ACTIONS_RESULTS_URL \ --mount=type=secret,id=actions_runtime_token,env=ACTIONS_RUNTIME_TOKEN \ - if [ $VERTEX = "true" ]; \ - then \ - CUDA_COMPUTE_CAP=75 cargo build --release --bin text-embeddings-router -F candle-cuda-turing -F google && sccache -s; \ - else \ - CUDA_COMPUTE_CAP=75 cargo build --release --bin text-embeddings-router -F 
candle-cuda-turing && sccache -s; \ - fi; + CUDA_COMPUTE_CAP=75 cargo build --release --bin text-embeddings-router -F candle-cuda-turing && sccache -s; RUN mv /usr/src/target/release/text-embeddings-router /usr/src/target/release/text-embeddings-router-75 RUN --mount=type=secret,id=actions_results_url,env=ACTIONS_RESULTS_URL \ --mount=type=secret,id=actions_runtime_token,env=ACTIONS_RUNTIME_TOKEN \ - if [ $VERTEX = "true" ]; \ - then \ - CUDA_COMPUTE_CAP=80 cargo build --release --bin text-embeddings-router -F candle-cuda -F google && sccache -s; \ - else \ - CUDA_COMPUTE_CAP=80 cargo build --release --bin text-embeddings-router -F candle-cuda && sccache -s; \ - fi; + CUDA_COMPUTE_CAP=80 cargo build --release --bin text-embeddings-router -F candle-cuda && sccache -s; RUN mv /usr/src/target/release/text-embeddings-router /usr/src/target/release/text-embeddings-router-80 RUN --mount=type=secret,id=actions_results_url,env=ACTIONS_RESULTS_URL \ --mount=type=secret,id=actions_runtime_token,env=ACTIONS_RUNTIME_TOKEN \ - if [ $VERTEX = "true" ]; \ - then \ - CUDA_COMPUTE_CAP=90 cargo build --release --bin text-embeddings-router -F candle-cuda -F google && sccache -s; \ - else \ - CUDA_COMPUTE_CAP=90 cargo build --release --bin text-embeddings-router -F candle-cuda && sccache -s; \ - fi; + CUDA_COMPUTE_CAP=90 cargo build --release --bin text-embeddings-router -F candle-cuda && sccache -s; RUN mv /usr/src/target/release/text-embeddings-router /usr/src/target/release/text-embeddings-router-90 @@ -142,16 +106,6 @@ COPY --from=builder /usr/src/target/release/text-embeddings-router-75 /usr/local COPY --from=builder /usr/src/target/release/text-embeddings-router-80 /usr/local/bin/text-embeddings-router-80 COPY --from=builder /usr/src/target/release/text-embeddings-router-90 /usr/local/bin/text-embeddings-router-90 -# Amazon SageMaker compatible image -FROM base AS sagemaker - -COPY --chmod=775 sagemaker-entrypoint-cuda-all.sh entrypoint.sh - -ENTRYPOINT ["./entrypoint.sh"] - -# 
Default image -FROM base - COPY --chmod=775 cuda-all-entrypoint.sh entrypoint.sh ENTRYPOINT ["./entrypoint.sh"]