Skip to content
This repository was archived by the owner on Sep 20, 2025. It is now read-only.
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions src/emd/models/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
vlms,
comfyui,
asr,
audio,
embeddings,
reranks,
custom,
Expand Down
1 change: 1 addition & 0 deletions src/emd/models/audio/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
from . import higgs_audio
38 changes: 38 additions & 0 deletions src/emd/models/audio/higgs_audio.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
from .. import Model
from ..engines import vllm_higgs_audio_engine091
from ..model_series import HIGGS_AUDIO_SERIES
from ..instances import (
g5d48xlarge_instance,
local_instance
)
from ..services import (
sagemaker_service,
sagemaker_async_service,
ecs_service,
local_service
)
from ..frameworks import fastapi_framework
from emd.models.utils.constants import ModelType

# Register the Higgs Audio v2 Generation 3B Base model with the EMD model
# registry. The engine uses a purpose-built vLLM image whose entrypoint
# serves an OpenAI-compatible audio-generation API (see Dockerfile_higgs_audio).
Model.register(
    dict(
        model_id="bosonai-higgs-audio-v2-generation-3B-base",
        model_type=ModelType.AUDIO,
        description="Higgs Audio v2 Generation 3B Base is a powerful multimodal audio generation model that supports voice cloning, smart voice generation, and multi-speaker synthesis. Built on vLLM engine with OpenAI-compatible API for text-to-speech and audio generation tasks.",
        application_scenario="voice cloning, text-to-speech, audio generation, multi-speaker synthesis, smart voice generation",
        # Fix: HIGGS_AUDIO_SERIES was imported but never attached, leaving the
        # model without a series grouping in the registry.
        model_series=HIGGS_AUDIO_SERIES,
        supported_engines=[vllm_higgs_audio_engine091],
        # g5.48xlarge provides the 8 GPUs the engine's --tensor-parallel-size
        # expects; local_instance enables local development.
        supported_instances=[
            g5d48xlarge_instance, local_instance
        ],
        supported_services=[
            sagemaker_service, local_service
        ],
        supported_frameworks=[
            fastapi_framework
        ],
        allow_china_region=True,
        huggingface_model_id="bosonai/higgs-audio-v2-generation-3B-base",
        require_huggingface_token=False,
        # The serving image bundles the model weights, so no prepare step.
        need_prepare_model=False,
    )
)
11 changes: 11 additions & 0 deletions src/emd/models/engines.py
Original file line number Diff line number Diff line change
Expand Up @@ -555,6 +555,17 @@ class KtransformersEngine(OpenAICompitableEngine):
"description": "VLLM v0.9.1 engine for dots.ocr multilingual document parsing model with flash-attn support and eager execution for custom models"
})

# VLLM Engine v0.9.1 for Higgs Audio: start from the v0.6.4 engine's config
# and override only the fields that differ for the Higgs Audio image.
_higgs_audio_engine_config = vllm_engine064.model_dump()
_higgs_audio_engine_config.update({
    "engine_dockerfile_config": {"VERSION": "v0.9.1"},
    "dockerfile_name": "Dockerfile_higgs_audio",
    "engine_cls": "vllm.higgs_audio_backend.HiggsAudioBackend",
    "environment_variables": "export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True",
    "default_cli_args": " --shm-size=30gb",
    "description": "VLLM v0.9.1 engine for Higgs Audio v2 Generation 3B Base multimodal audio generation model using native Docker entrypoint",
})
vllm_higgs_audio_engine091 = VllmEngine(**_higgs_audio_engine_config)

custom_engine = Engine(**{
"engine_type":EngineType.CUSTOM,
})
6 changes: 6 additions & 0 deletions src/emd/models/model_series.py
Original file line number Diff line number Diff line change
Expand Up @@ -163,3 +163,9 @@
description="dots.ocr is a powerful, multilingual document parser that unifies layout detection and content recognition within a single vision-language model while maintaining good reading order. Despite its compact 1.7B-parameter LLM foundation, it achieves state-of-the-art(SOTA) performance on text, tables, and reading order tasks with multilingual support for over 100 languages.",
reference_link="https://github.com/rednote-hilab/dots.ocr"
)

# Model-series entry grouping the Higgs Audio family of audio-generation
# models; referenced by model registrations in src/emd/models/audio/.
HIGGS_AUDIO_SERIES = ModelSeries(
    model_series_name=ModelSeriesType.HIGGS_AUDIO,
    description="Higgs Audio v2 Generation is a powerful multimodal audio generation model that supports voice cloning, smart voice generation, and multi-speaker synthesis. Built on advanced neural architectures, it provides high-quality text-to-speech capabilities with support for various audio generation tasks including voice cloning and multi-speaker scenarios.",
    reference_link="https://huggingface.co/bosonai/higgs-audio-v2-generation-3B-base"
)
2 changes: 2 additions & 0 deletions src/emd/models/utils/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -142,6 +142,7 @@ class ModelType(ConstantBase):
VLM = "vlm"
EMBEDDING = "embedding"
VIDEO = "video"
AUDIO = "audio"

class ServiceCode(ConstantBase):
SAGEMAKER = "sagemaker"
Expand Down Expand Up @@ -236,3 +237,4 @@ class ModelSeriesType(ConstantBase):
DEEPSEEK_v3 = "deepseek v3"
BAICHUAN = "baichuan"
DOTS_OCR = "dots_ocr"
HIGGS_AUDIO = "higgs_audio"
10 changes: 10 additions & 0 deletions src/pipeline/backend/vllm/Dockerfile_higgs_audio
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
FROM public.ecr.aws/aws-gcr-solutions/dmaa/higgs-audio-vllm:latest AS base

WORKDIR /opt/ml/code

# Fix: COPY source paths resolve against the build CONTEXT, not the base
# image, so `COPY /vllm-workspace/ ...` could never copy files shipped inside
# the image. Since this is the same stage, the base image's filesystem is
# already present — duplicate the workspace with a plain cp instead.
RUN cp -r /vllm-workspace/. /opt/ml/code/

# NOTE(review): the backend launches the api_server with --port 8000 —
# confirm whether 8080 is remapped by the hosting layer or this EXPOSE is stale.
EXPOSE 8080

ENTRYPOINT ["python3", "-m", "vllm.entrypoints.bosonai.api_server"]
5 changes: 5 additions & 0 deletions src/pipeline/backend/vllm/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
# VLLM Backend Module
from .vllm_backend import VLLMBackend
from .higgs_audio_backend import HiggsAudioBackend

__all__ = ['VLLMBackend', 'HiggsAudioBackend']
86 changes: 86 additions & 0 deletions src/pipeline/backend/vllm/higgs_audio_backend.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,86 @@
import httpx
import sys
import os
from emd.models.utils.constants import ModelType
import inspect
from backend.backend import OpenAICompitableProxyBackendBase
from emd.utils.logger_utils import get_logger

logger = get_logger(__name__)

class HiggsAudioBackend(OpenAICompitableProxyBackendBase):
    """
    Backend for the Higgs Audio v2 Generation 3B Base model.

    Unlike the standard vLLM backend, the Higgs Audio Docker image ships its
    own OpenAI-compatible API server (``vllm.entrypoints.bosonai.api_server``),
    so the proxy start command launches that entrypoint directly instead of
    the usual ``vllm serve`` command.
    """

    def before_start(self, model_dir=None):
        """No model preparation is needed: the image bundles the server.

        Only logs for traceability. (Replaces a leftover debug log line.)
        """
        logger.info("HiggsAudioBackend.before_start called (model_dir=%s)", model_dir)

    def create_proxy_server_start_command(self, model_path):
        """Return the shell command that starts the bosonai API server.

        NOTE(review): the model id, audio tokenizer and --tensor-parallel-size 8
        are hard-coded to match the 8-GPU g5.48xlarge instance this model is
        registered for — confirm before enabling other instance types.
        """
        return f'python3 -m vllm.entrypoints.bosonai.api_server --served-model-name higgs-audio-v2-generation-3B-base --model bosonai/higgs-audio-v2-generation-3B-base --audio-tokenizer-type bosonai/higgs-audio-v2-tokenizer --limit-mm-per-prompt audio=50 --max-model-len 8192 --tensor-parallel-size 8 --pipeline-parallel-size 1 --port 8000 --gpu-memory-utilization 0.65 --disable-mm-preprocessor-cache'

    def openai_create_helper(self, fn: callable, request: dict):
        """Call *fn* with *request*, routing unknown keys through ``extra_body``.

        The OpenAI SDK rejects parameters that are not in the method signature;
        server-specific parameters (e.g. audio options) must be passed inside
        ``extra_body`` instead. Mutates *request* in place.

        Args:
            fn: an OpenAI client method such as ``chat.completions.create``.
            request: keyword arguments for *fn*, possibly with extra keys.

        Returns:
            Whatever *fn* returns (a response, stream, or coroutine).
        """
        sig = inspect.signature(fn)
        extra_body = request.get("extra_body", {})
        # Move every key the SDK method does not accept into extra_body.
        extra_params = {k: request.pop(k) for k in list(request.keys()) if k not in sig.parameters}
        extra_body.update(extra_params)
        request['extra_body'] = extra_body
        return fn(**request)

    def invoke(self, request):
        """Invoke the Higgs Audio model synchronously.

        Transforms the request, forwards it to the OpenAI-compatible
        chat-completions endpoint, and transforms the (possibly streaming)
        response back.
        """
        request = self._transform_request(request)
        logger.info("Higgs Audio request: %s", request)

        # Fix: the original branched on self.model_type == ModelType.AUDIO but
        # both branches were identical — Higgs Audio always uses the
        # chat-completions endpoint, so the dead branch is removed.
        response = self.openai_create_helper(self.client.chat.completions.create, request)

        logger.info("Higgs Audio response: %s, request: %s", response, request)

        if request.get("stream", False):
            return self._transform_streaming_response(response)
        return self._transform_response(response)

    async def ainvoke(self, request):
        """Async variant of :meth:`invoke` using the async OpenAI client."""
        request = self._transform_request(request)
        logger.info("Higgs Audio async request: %s", request)

        # Same dead-branch removal as in invoke(): always chat-completions.
        response = await self.openai_create_helper(self.async_client.chat.completions.create, request)

        logger.info("Higgs Audio async response: %s, request: %s", response, request)

        if request.get("stream", False):
            return await self._atransform_streaming_response(response)
        return await self._atransform_response(response)