diff --git a/src/emd/models/__init__.py b/src/emd/models/__init__.py index b2b50dd..b1fdada 100644 --- a/src/emd/models/__init__.py +++ b/src/emd/models/__init__.py @@ -17,6 +17,7 @@ vlms, comfyui, asr, + audio, embeddings, reranks, custom, diff --git a/src/emd/models/audio/__init__.py b/src/emd/models/audio/__init__.py new file mode 100644 index 0000000..d6e3578 --- /dev/null +++ b/src/emd/models/audio/__init__.py @@ -0,0 +1 @@ +from . import higgs_audio diff --git a/src/emd/models/audio/higgs_audio.py b/src/emd/models/audio/higgs_audio.py new file mode 100644 index 0000000..d90de54 --- /dev/null +++ b/src/emd/models/audio/higgs_audio.py @@ -0,0 +1,38 @@ +from .. import Model +from ..engines import vllm_higgs_audio_engine091 +from ..model_series import HIGGS_AUDIO_SERIES +from ..instances import ( + g5d48xlarge_instance, + local_instance +) +from ..services import ( + sagemaker_service, + sagemaker_async_service, + ecs_service, + local_service +) +from ..frameworks import fastapi_framework +from emd.models.utils.constants import ModelType + +Model.register( + dict( + model_id="bosonai-higgs-audio-v2-generation-3B-base", + model_type=ModelType.AUDIO, + description="Higgs Audio v2 Generation 3B Base is a powerful multimodal audio generation model that supports voice cloning, smart voice generation, and multi-speaker synthesis. 
Built on vLLM engine with OpenAI-compatible API for text-to-speech and audio generation tasks.", + application_scenario="voice cloning, text-to-speech, audio generation, multi-speaker synthesis, smart voice generation", + supported_engines=[vllm_higgs_audio_engine091], + supported_instances=[ + g5d48xlarge_instance, local_instance + ], + supported_services=[ + sagemaker_service, local_service + ], + supported_frameworks=[ + fastapi_framework + ], + allow_china_region=True, + huggingface_model_id="bosonai/higgs-audio-v2-generation-3B-base", + require_huggingface_token=False, + need_prepare_model=False, + ) +) diff --git a/src/emd/models/engines.py b/src/emd/models/engines.py index 6e972d1..5a596b2 100644 --- a/src/emd/models/engines.py +++ b/src/emd/models/engines.py @@ -555,6 +555,17 @@ class KtransformersEngine(OpenAICompitableEngine): "description": "VLLM v0.9.1 engine for dots.ocr multilingual document parsing model with flash-attn support and eager execution for custom models" }) +# VLLM Engine v0.9.1 for Higgs Audio +vllm_higgs_audio_engine091 = VllmEngine(**{ + **vllm_engine064.model_dump(), + "engine_dockerfile_config": {"VERSION":"v0.9.1"}, + "dockerfile_name": "Dockerfile_higgs_audio", + "engine_cls": "vllm.higgs_audio_backend.HiggsAudioBackend", + "environment_variables": "export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True", + "default_cli_args": " --shm-size=30gb", + "description": "VLLM v0.9.1 engine for Higgs Audio v2 Generation 3B Base multimodal audio generation model using native Docker entrypoint" +}) + custom_engine = Engine(**{ "engine_type":EngineType.CUSTOM, }) diff --git a/src/emd/models/model_series.py b/src/emd/models/model_series.py index 32ab1e5..6846fdd 100644 --- a/src/emd/models/model_series.py +++ b/src/emd/models/model_series.py @@ -163,3 +163,9 @@ description="dots.ocr is a powerful, multilingual document parser that unifies layout detection and content recognition within a single vision-language model while maintaining good 
reading order. Despite its compact 1.7B-parameter LLM foundation, it achieves state-of-the-art(SOTA) performance on text, tables, and reading order tasks with multilingual support for over 100 languages.", reference_link="https://github.com/rednote-hilab/dots.ocr" ) + +HIGGS_AUDIO_SERIES = ModelSeries( + model_series_name=ModelSeriesType.HIGGS_AUDIO, + description="Higgs Audio v2 Generation is a powerful multimodal audio generation model that supports voice cloning, smart voice generation, and multi-speaker synthesis. Built on advanced neural architectures, it provides high-quality text-to-speech capabilities with support for various audio generation tasks including voice cloning and multi-speaker scenarios.", + reference_link="https://huggingface.co/bosonai/higgs-audio-v2-generation-3B-base" +) diff --git a/src/emd/models/utils/constants.py b/src/emd/models/utils/constants.py index 06039d2..c13cfb9 100644 --- a/src/emd/models/utils/constants.py +++ b/src/emd/models/utils/constants.py @@ -142,6 +142,7 @@ class ModelType(ConstantBase): VLM = "vlm" EMBEDDING = "embedding" VIDEO = "video" + AUDIO = "audio" class ServiceCode(ConstantBase): SAGEMAKER = "sagemaker" @@ -236,3 +237,4 @@ class ModelSeriesType(ConstantBase): DEEPSEEK_v3 = "deepseek v3" BAICHUAN = "baichuan" DOTS_OCR = "dots_ocr" + HIGGS_AUDIO = "higgs_audio" diff --git a/src/pipeline/backend/vllm/Dockerfile_higgs_audio b/src/pipeline/backend/vllm/Dockerfile_higgs_audio new file mode 100644 index 0000000..2572c5d --- /dev/null +++ b/src/pipeline/backend/vllm/Dockerfile_higgs_audio @@ -0,0 +1,10 @@ +FROM public.ecr.aws/aws-gcr-solutions/dmaa/higgs-audio-vllm:latest AS base + +WORKDIR /opt/ml/code + +# NOTE(review): COPY resolves against the build context, not the base image; verify /vllm-workspace exists in the context, or copy via a multi-stage COPY --from +COPY /vllm-workspace/ /opt/ml/code/ + +EXPOSE 8080 + +ENTRYPOINT ["python3", "-m", "vllm.entrypoints.bosonai.api_server"] diff --git a/src/pipeline/backend/vllm/__init__.py b/src/pipeline/backend/vllm/__init__.py new file mode 100644 index 0000000..54de697 --- 
/dev/null +++ b/src/pipeline/backend/vllm/__init__.py @@ -0,0 +1,5 @@ +# VLLM Backend Module +from .vllm_backend import VLLMBackend +from .higgs_audio_backend import HiggsAudioBackend + +__all__ = ['VLLMBackend', 'HiggsAudioBackend'] diff --git a/src/pipeline/backend/vllm/higgs_audio_backend.py b/src/pipeline/backend/vllm/higgs_audio_backend.py new file mode 100644 index 0000000..a750d28 --- /dev/null +++ b/src/pipeline/backend/vllm/higgs_audio_backend.py @@ -0,0 +1,86 @@ +import httpx +import sys +import os +from emd.models.utils.constants import ModelType +import inspect +from backend.backend import OpenAICompitableProxyBackendBase +from emd.utils.logger_utils import get_logger + +logger = get_logger(__name__) + +class HiggsAudioBackend(OpenAICompitableProxyBackendBase): + """ + Higgs Audio Backend that uses the Docker image's native entrypoint + instead of the standard vLLM serve command. + + This backend is specifically designed for the Higgs Audio v2 Generation 3B Base model + which provides its own API server via the pre-built Docker image with entrypoint: + ["python3", "-m", "vllm.entrypoints.bosonai.api_server"] + """ + + def before_start(self,model_dir=None): + logger.info("HiggsAudioBackend.before_start called; no model preparation required") + + def create_proxy_server_start_command(self, model_path): + return f'python3 -m vllm.entrypoints.bosonai.api_server --served-model-name higgs-audio-v2-generation-3B-base --model bosonai/higgs-audio-v2-generation-3B-base --audio-tokenizer-type bosonai/higgs-audio-v2-tokenizer --limit-mm-per-prompt audio=50 --max-model-len 8192 --tensor-parallel-size 8 --pipeline-parallel-size 1 --port 8000 --gpu-memory-utilization 0.65 --disable-mm-preprocessor-cache' + + def openai_create_helper(self, fn: callable, request: dict): + """ + Helper method to handle OpenAI-compatible API calls with extra parameters. 
+ """ + sig = inspect.signature(fn) + extra_body = request.get("extra_body", {}) + extra_params = {k: request.pop(k) for k in list(request.keys()) if k not in sig.parameters} + extra_body.update(extra_params) + request['extra_body'] = extra_body + return fn(**request) + + def invoke(self, request): + """ + Invoke the Higgs Audio model with OpenAI-compatible API. + Supports audio modalities for voice cloning, smart voice generation, and multi-speaker synthesis. + """ + # Transform input to Higgs Audio format + request = self._transform_request(request) + + logger.info(f"Higgs Audio request: {request}") + + # Handle different model types - Higgs Audio is primarily for audio generation + if self.model_type == ModelType.AUDIO: + # Use chat completions endpoint for audio generation + response = self.openai_create_helper(self.client.chat.completions.create, request) + else: + # Fallback to standard chat completions + response = self.openai_create_helper(self.client.chat.completions.create, request) + + logger.info(f"Higgs Audio response: {response}, request: {request}") + + if request.get("stream", False): + return self._transform_streaming_response(response) + else: + return self._transform_response(response) + + async def ainvoke(self, request): + """ + Async invoke the Higgs Audio model with OpenAI-compatible API. 
+ """ + # Transform input to Higgs Audio format + request = self._transform_request(request) + + logger.info(f"Higgs Audio async request: {request}") + + # Handle different model types - Higgs Audio is primarily for audio generation + if self.model_type == ModelType.AUDIO: + # Use chat completions endpoint for audio generation + response = await self.openai_create_helper(self.async_client.chat.completions.create, request) + else: + # Fallback to standard chat completions + response = await self.openai_create_helper(self.async_client.chat.completions.create, request) + + logger.info(f"Higgs Audio async response: {response}, request: {request}") + + if request.get("stream", False): + logger.info(f"Higgs Audio streaming response: {response}") + return await self._atransform_streaming_response(response) + else: + return await self._atransform_response(response)