#
# Copyright (c) 2024–2025, Daily
#
# SPDX-License-Identifier: BSD 2-Clause License
#

| 7 | +"""Beyond Presence implementation for Pipecat. |
| 8 | +
|
| 9 | +This module provides integration with the Beyond Presence API to generate avatar videos |
| 10 | +starting from voice agents. |
| 11 | +""" |

import asyncio
import os
from typing import Optional

import aiohttp

from pipecat.audio.utils import create_stream_resampler
from pipecat.audio.vad.silero import SileroVADAnalyzer
from pipecat.frames.frames import (
    BotStartedSpeakingFrame,
    Frame,
    SpeechOutputAudioRawFrame,
    StartFrame,
    StartInterruptionFrame,
    TransportMessageFrame,
    TTSAudioRawFrame,
    TTSStartedFrame,
)
from pipecat.processors.frame_processor import FrameDirection, FrameProcessorSetup
from pipecat.services.ai_service import AIService
from pipecat.transports.services.daily import DailyParams, DailyTransport, DailyTransportClient
from pipecat.transports.services.helpers.daily_rest import DailyRESTHelper

BASE_API_URL = "https://api.bey.dev/v1"
# Video frame rate used to size audio chunks (one audio buffer per video frame).
FRAME_RATE = 25


class BeyVideoService(AIService):
    """A service that integrates Beyond Presence's avatar video generation into the pipeline.

    Converts the pipeline's audio stream into an avatar video stream published directly
    to a Daily room by an external worker managed by Beyond Presence.
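
    Example (a minimal sketch; ``BEY_API_KEY`` is a hypothetical environment variable
    holding the Beyond Presence API key, and ``room_url`` must point at an existing
    Daily room)::

        async with aiohttp.ClientSession() as session:
            avatar = BeyVideoService(
                api_key=os.getenv("BEY_API_KEY"),
                avatar_id="your-avatar-id",
                room_url=room_url,
                session=session,
            )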
    """

    def __init__(
        self,
        api_key: str,
        avatar_id: str,
        # TODO: Is it possible to elegantly infer this from the pipeline's transport?
        # NOTE: Unlike other providers, bey posts video directly to the room,
        # likely resulting in lower latency
        room_url: str,
        session: aiohttp.ClientSession,
        **kwargs,
    ) -> None:
        """Initialize the Beyond Presence speech-to-video service.

        Args:
            api_key: Beyond Presence API key used for authentication.
            avatar_id: ID of the Beyond Presence avatar to use for video synthesis.
            room_url: URL of the Daily room the speech-to-video service will connect to.
            session: Async HTTP session used for communication with Beyond Presence.
            **kwargs: Additional arguments passed to the parent AIService class.
        """
        super().__init__(**kwargs)

        self._api_key = api_key
        self._room_url = room_url
        self._avatar_id = avatar_id
        self._session = session

        self._client: Optional[DailyTransportClient] = None

        self._resampler = create_stream_resampler()
        self._queue = asyncio.Queue()
        # Audio sent to the avatar worker is resampled to 16 kHz.
        self._out_sample_rate = 16000
        self._audio_buffer = bytearray()
        self._transport_destination: str = "bey-custom-track"
        self._http_session: Optional[aiohttp.ClientSession] = None

    async def setup(self, setup: FrameProcessorSetup):
        """Set up the Beyond Presence video service.

        Args:
            setup: Frame processor setup configuration.
        """
        await super().setup(setup)

        daily_rest_helper = DailyRESTHelper(
            # NOTE: assumes the Daily API key is available in the environment;
            # adjust if your deployment provides it differently.
            daily_api_key=os.getenv("DAILY_API_KEY", ""),
            aiohttp_session=self._session,
        )
        token_expiry_time: float = 60 * 60  # 1 hour
        token = await daily_rest_helper.get_token(self._room_url, token_expiry_time)
        # TODO: Fix this hacky way of obtaining the DailyTransportClient
        self._client = DailyTransport(
            self._room_url,
            token,
            "Bey example Bot",
            DailyParams(
                audio_in_enabled=True,
                video_out_enabled=False,
                video_out_is_live=False,
                microphone_out_enabled=False,
                vad_analyzer=SileroVADAnalyzer(),
            ),
        )._client
        await self._client.setup(setup)

    async def process_frame(self, frame: Frame, direction: FrameDirection):
        """Process frames through the service.

        Args:
            frame: The frame to process.
            direction: The direction of frame processing.
        """
        await super().process_frame(frame, direction)

        if isinstance(frame, StartFrame):
            await self._start_session(room_url=self._client.room_url, token=self._client._token)
            await self._client.register_audio_destination(self._transport_destination)
            await self.push_frame(frame, direction)
        elif isinstance(frame, StartInterruptionFrame):
            # Notify the Beyond Presence worker so it can stop the current avatar utterance.
            transport_frame = TransportMessageFrame(message="interrupt")
            transport_frame.transport_destination = self._transport_destination
            await self._client.send_message(transport_frame)
            await self.push_frame(frame, direction)
        elif isinstance(frame, TTSAudioRawFrame):
            in_sample_rate = frame.sample_rate
            # One video frame's worth of 16-bit mono audio (2 bytes per sample).
            chunk_size = int((self._out_sample_rate * 2) / FRAME_RATE)

            resampled = await self._resampler.resample(
                frame.audio, in_sample_rate, self._out_sample_rate
            )
            self._audio_buffer.extend(resampled)
            while len(self._audio_buffer) >= chunk_size:
                chunk = SpeechOutputAudioRawFrame(
                    bytes(self._audio_buffer[:chunk_size]),
                    sample_rate=self._out_sample_rate,
                    num_channels=frame.num_channels,
                )
                chunk.transport_destination = self._transport_destination
                self._audio_buffer = self._audio_buffer[chunk_size:]
                await self._client.write_audio_frame(chunk)
        elif isinstance(frame, TTSStartedFrame):
            await self.start_ttfb_metrics()
        elif isinstance(frame, BotStartedSpeakingFrame):
            # We constantly receive audio through WebRTC, but most of the time it is silence.
            # As soon as we receive actual audio, the base output transport will create a
            # BotStartedSpeakingFrame, which we can use as a signal for the TTFB metrics.
            await self.stop_ttfb_metrics()
        else:
            await self.push_frame(frame, direction)

    def can_generate_metrics(self) -> bool:
        """Check if the service can generate metrics.

        Returns:
            True if metrics generation is supported.
        """
        return True

    async def _start_session(self, room_url: str, token: str) -> None:
        async with self._session.post(
            f"{BASE_API_URL}/session",
            headers={
                "x-api-key": self._api_key,
            },
            json={
                "avatar_id": self._avatar_id,
                "transport_type": "pipecat",
                # TODO: we might want to rename these to just url and token
                "pipecat_url": room_url,
                "pipecat_token": token,
            },
        ) as response:
            if not response.ok:
                text = await response.text()
                raise Exception(
                    f"Server returned an error (status {response.status}): {text}"
                )
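

# A minimal wiring sketch (hypothetical, not part of the service). `stt`, `llm`, `tts`,
# and `transport` are assumed to be configured elsewhere, and `avatar` is a
# BeyVideoService instance constructed as in the class docstring example:
#
#     pipeline = Pipeline(
#         [
#             transport.input(),
#             stt,
#             llm,
#             tts,
#             avatar,  # the avatar worker publishes audio/video directly to the room
#         ]
#     )
#     await PipelineRunner().run(PipelineTask(pipeline))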