diff --git a/src/elevenlabs/client.py b/src/elevenlabs/client.py index d5c3ee73..4e4c3d77 100644 --- a/src/elevenlabs/client.py +++ b/src/elevenlabs/client.py @@ -5,7 +5,7 @@ import httpx from typing import Iterator, Optional, Union, \ - Optional, AsyncIterator + Optional, AsyncIterator, Tuple from .base_client import \ BaseElevenLabs, AsyncBaseElevenLabs @@ -130,7 +130,7 @@ def generate( typing.Sequence[PronunciationDictionaryVersionLocator] ] = OMIT, request_options: typing.Optional[RequestOptions] = None - ) -> Iterator[bytes]: + ) -> Tuple[str, Iterator[bytes]]: """ - text: Union[str, Iterator[str]]. The string or stream of strings that will get converted into speech. @@ -310,7 +310,7 @@ async def generate( typing.Sequence[PronunciationDictionaryVersionLocator] ] = OMIT, request_options: typing.Optional[RequestOptions] = None - ) -> AsyncIterator[bytes]: + ) -> Tuple[str, AsyncIterator[bytes]]: """ This is a manually mnaintained helper function that generates a voice from provided text. @@ -383,7 +383,7 @@ async def generate( model_id = model.model_id if stream: - return self.text_to_speech.convert_as_stream( + return await self.text_to_speech.convert_as_stream( voice_id=voice_id, model_id=model_id, voice_settings=voice_settings, @@ -396,7 +396,7 @@ async def generate( else: if not isinstance(text, str): raise ApiError(body="Text must be a string when stream is False.") - return self.text_to_speech.convert( + return await self.text_to_speech.convert( voice_id=voice_id, model_id=model_id, voice_settings=voice_settings, diff --git a/src/elevenlabs/text_to_speech/client.py b/src/elevenlabs/text_to_speech/client.py index b99902cb..a16cd0bd 100644 --- a/src/elevenlabs/text_to_speech/client.py +++ b/src/elevenlabs/text_to_speech/client.py @@ -28,6 +28,7 @@ from .types.text_to_speech_stream_with_timestamps_response import TextToSpeechStreamWithTimestampsResponse import json from ..core.client_wrapper import AsyncClientWrapper +from typing import Tuple # this is used as the default value for optional parameters OMIT = typing.cast(typing.Any, ...) @@ -61,9 +62,9 @@ def convert( BodyTextToSpeechV1TextToSpeechVoiceIdPostApplyTextNormalization ] = OMIT, request_options: typing.Optional[RequestOptions] = None, - ) -> typing.Iterator[bytes]: + ) -> Tuple[str, typing.Iterator[bytes]]: """ - Converts text into speech using a voice of your choice and returns audio. + Converts text into speech using a voice of your choice and returns the request ID and audio stream. Parameters ---------- @@ -126,9 +127,11 @@ def convert( Request-specific configuration. You can pass in configuration such as `chunk_size`, and more to customize the request and response. Yields - ------ - typing.Iterator[bytes] - Successful Response + ------- + tuple[str, typing.Iterator[bytes]] + A tuple containing: + - request_id: The ID of the request + - audio_stream: Iterator of audio bytes chunks Examples -------- @@ -180,10 +183,20 @@ def convert( ) as _response: try: if 200 <= _response.status_code < 300: - _chunk_size = request_options.get("chunk_size", 1024) if request_options is not None else 1024 - for _chunk in _response.iter_bytes(chunk_size=_chunk_size): - yield _chunk - return + request_id = _response.headers.get('request-id') + if not request_id: + raise ApiError( + status_code=_response.status_code, + body="Missing request-id in response headers." + ) + + def audio_iterator(): + _chunk_size = request_options.get("chunk_size", 1024) if request_options is not None else 1024 + for _chunk in _response.iter_bytes(chunk_size=_chunk_size): + yield _chunk + + return request_id, audio_iterator() + _response.read() if _response.status_code == 422: raise UnprocessableEntityError( @@ -224,7 +237,7 @@ def convert_with_timestamps( BodyTextToSpeechWithTimestampsV1TextToSpeechVoiceIdWithTimestampsPostApplyTextNormalization ] = OMIT, request_options: typing.Optional[RequestOptions] = None, - ) -> typing.Optional[typing.Any]: + ) -> Tuple[str, typing.Optional[typing.Any]]: """ Converts text into speech using a voice of your choice and returns JSON containing audio as a base64 encoded string together with information on when which character was spoken. @@ -389,7 +402,7 @@ def convert_as_stream( BodyTextToSpeechStreamingV1TextToSpeechVoiceIdStreamPostApplyTextNormalization ] = OMIT, request_options: typing.Optional[RequestOptions] = None, - ) -> typing.Iterator[bytes]: + ) -> Tuple[str, typing.Iterator[bytes]]: """ Converts text into speech using a voice of your choice and returns audio as an audio stream. @@ -508,10 +521,20 @@ def convert_as_stream( ) as _response: try: if 200 <= _response.status_code < 300: - _chunk_size = request_options.get("chunk_size", 1024) if request_options is not None else 1024 - for _chunk in _response.iter_bytes(chunk_size=_chunk_size): - yield _chunk - return + request_id = _response.headers.get('request-id') + if not request_id: + raise ApiError( + status_code=_response.status_code, + body="Missing request-id in response headers." + ) + + def audio_iterator(): + _chunk_size = request_options.get("chunk_size", 1024) if request_options is not None else 1024 + for _chunk in _response.iter_bytes(chunk_size=_chunk_size): + yield _chunk + + return request_id, audio_iterator() + _response.read() if _response.status_code == 422: raise UnprocessableEntityError( @@ -552,7 +575,7 @@ def stream_with_timestamps( BodyTextToSpeechStreamingWithTimestampsV1TextToSpeechVoiceIdStreamWithTimestampsPostApplyTextNormalization ] = OMIT, request_options: typing.Optional[RequestOptions] = None, - ) -> typing.Iterator[TextToSpeechStreamWithTimestampsResponse]: + ) -> Tuple[str, typing.Iterator[TextToSpeechStreamWithTimestampsResponse]]: """ Converts text into speech using a voice of your choice and returns a stream of JSONs containing audio as a base64 encoded string together with information on when which character was spoken. @@ -673,20 +696,30 @@ def stream_with_timestamps( ) as _response: try: if 200 <= _response.status_code < 300: - for _text in _response.iter_lines(): - try: - if len(_text) == 0: - continue - yield typing.cast( - TextToSpeechStreamWithTimestampsResponse, - construct_type( - type_=TextToSpeechStreamWithTimestampsResponse, # type: ignore - object_=json.loads(_text), - ), - ) - except: - pass - return + request_id = _response.headers.get('request-id') + if not request_id: + raise ApiError( + status_code=_response.status_code, + body="Missing request-id in response headers." + ) + + def response_iterator(): + for _text in _response.iter_lines(): + try: + if len(_text) == 0: + continue + yield typing.cast( + TextToSpeechStreamWithTimestampsResponse, + construct_type( + type_=TextToSpeechStreamWithTimestampsResponse, # type: ignore + object_=json.loads(_text), + ), + ) + except: + pass + + return request_id, response_iterator() + _response.read() if _response.status_code == 422: raise UnprocessableEntityError( @@ -732,9 +765,9 @@ async def convert( BodyTextToSpeechV1TextToSpeechVoiceIdPostApplyTextNormalization ] = OMIT, request_options: typing.Optional[RequestOptions] = None, - ) -> typing.AsyncIterator[bytes]: + ) -> Tuple[str, typing.AsyncIterator[bytes]]: """ - Converts text into speech using a voice of your choice and returns audio. + Converts text into speech using a voice of your choice and returns the request ID and audio stream. Parameters ---------- @@ -798,9 +831,11 @@ async def convert( Yields ------ - typing.AsyncIterator[bytes] - Successful Response - + tuple[str, typing.AsyncIterator[bytes]] + A tuple containing: + - request_id: The ID of the request + - audio_stream: Iterator of audio bytes chunks + Examples -------- import asyncio @@ -859,10 +894,20 @@ async def main() -> None: ) as _response: try: if 200 <= _response.status_code < 300: - _chunk_size = request_options.get("chunk_size", 1024) if request_options is not None else 1024 - async for _chunk in _response.aiter_bytes(chunk_size=_chunk_size): - yield _chunk - return + request_id = _response.headers.get('request-id') + if not request_id: + raise ApiError( + status_code=_response.status_code, + body="Missing request-id in response headers." + ) + + async def audio_iterator(): + _chunk_size = request_options.get("chunk_size", 1024) if request_options is not None else 1024 + async for _chunk in _response.aiter_bytes(chunk_size=_chunk_size): + yield _chunk + + return request_id, audio_iterator() + await _response.aread() if _response.status_code == 422: raise UnprocessableEntityError( @@ -903,7 +948,7 @@ async def convert_with_timestamps( BodyTextToSpeechWithTimestampsV1TextToSpeechVoiceIdWithTimestampsPostApplyTextNormalization ] = OMIT, request_options: typing.Optional[RequestOptions] = None, - ) -> typing.Optional[typing.Any]: + ) -> Tuple[str, typing.Optional[typing.Any]]: """ Converts text into speech using a voice of your choice and returns JSON containing audio as a base64 encoded string together with information on when which character was spoken. @@ -1030,7 +1075,14 @@ async def main() -> None: ) try: if 200 <= _response.status_code < 300: - return typing.cast( + request_id = _response.headers.get('request-id') + if not request_id: + raise ApiError( + status_code=_response.status_code, + body="Missing request-id in response headers." + ) + + return request_id, typing.cast( typing.Optional[typing.Any], construct_type( type_=typing.Optional[typing.Any], # type: ignore @@ -1076,7 +1128,7 @@ async def convert_as_stream( BodyTextToSpeechStreamingV1TextToSpeechVoiceIdStreamPostApplyTextNormalization ] = OMIT, request_options: typing.Optional[RequestOptions] = None, - ) -> typing.AsyncIterator[bytes]: + ) -> Tuple[str, typing.AsyncIterator[bytes]]: """ Converts text into speech using a voice of your choice and returns audio as an audio stream. @@ -1203,10 +1255,20 @@ async def main() -> None: ) as _response: try: if 200 <= _response.status_code < 300: - _chunk_size = request_options.get("chunk_size", 1024) if request_options is not None else 1024 - async for _chunk in _response.aiter_bytes(chunk_size=_chunk_size): - yield _chunk - return + request_id = _response.headers.get('request-id') + if not request_id: + raise ApiError( + status_code=_response.status_code, + body="Missing request-id in response headers." + ) + + async def audio_iterator(): + _chunk_size = request_options.get("chunk_size", 1024) if request_options is not None else 1024 + async for _chunk in _response.aiter_bytes(chunk_size=_chunk_size): + yield _chunk + + return request_id, audio_iterator() + await _response.aread() if _response.status_code == 422: raise UnprocessableEntityError( @@ -1247,7 +1309,7 @@ async def stream_with_timestamps( BodyTextToSpeechStreamingWithTimestampsV1TextToSpeechVoiceIdStreamWithTimestampsPostApplyTextNormalization ] = OMIT, request_options: typing.Optional[RequestOptions] = None, - ) -> typing.AsyncIterator[TextToSpeechStreamWithTimestampsResponse]: + ) -> Tuple[str,typing.AsyncIterator[TextToSpeechStreamWithTimestampsResponse]]: """ Converts text into speech using a voice of your choice and returns a stream of JSONs containing audio as a base64 encoded string together with information on when which character was spoken. @@ -1376,20 +1438,29 @@ async def main() -> None: ) as _response: try: if 200 <= _response.status_code < 300: - async for _text in _response.aiter_lines(): - try: - if len(_text) == 0: - continue - yield typing.cast( - TextToSpeechStreamWithTimestampsResponse, - construct_type( - type_=TextToSpeechStreamWithTimestampsResponse, # type: ignore - object_=json.loads(_text), - ), - ) - except: - pass - return + request_id = _response.headers.get('request-id') + if not request_id: + raise ApiError( + status_code=_response.status_code, + body="Missing request-id in response headers." + ) + + async def response_iterator(): + async for _text in _response.aiter_lines(): + try: + if len(_text) == 0: + continue + yield typing.cast( + TextToSpeechStreamWithTimestampsResponse, + construct_type( + type_=TextToSpeechStreamWithTimestampsResponse, # type: ignore + object_=json.loads(_text), + ), + ) + except: + pass + + return request_id, response_iterator() await _response.aread() if _response.status_code == 422: raise UnprocessableEntityError( diff --git a/tests/test_tts.py b/tests/test_tts.py index 0baf8312..9a71a389 100644 --- a/tests/test_tts.py +++ b/tests/test_tts.py @@ -10,7 +10,7 @@ def test_tts_generate() -> None: """Test basic text-to-speech generation w/ custom generate.""" client = ElevenLabs() - audio_generator = client.generate(text=DEFAULT_TEXT, voice="Brian", model=DEFAULT_MODEL) + req_id, audio_generator = client.generate(text=DEFAULT_TEXT, voice="Brian", model=DEFAULT_MODEL) audio = b"".join(audio_generator) assert isinstance(audio, bytes), "TTS should return bytes" if not IN_GITHUB: @@ -20,7 +20,7 @@ def test_tts_generate() -> None: def test_tts_generate_with_voice_settings() -> None: """Test basic text-to-speech generation.""" client = ElevenLabs() - audio_generator = client.generate( + req_id, audio_generator = client.generate( text=DEFAULT_TEXT, model=DEFAULT_MODEL, voice=Voice( @@ -37,7 +37,7 @@ def test_tts_generate_with_voice_settings() -> None: def test_tts_generate_stream() -> None: """Test streaming text-to-speech generation.""" client = ElevenLabs() - audio_generator = client.generate( + req_id, audio_generator = client.generate( stream=True, text=DEFAULT_TEXT, model=DEFAULT_MODEL, @@ -51,7 +51,7 @@ def test_tts_generate_stream() -> None: def test_tts_convert() -> None: """Test basic text-to-speech generation.""" client = ElevenLabs() - audio_generator = client.text_to_speech.convert(text=DEFAULT_TEXT, voice_id=DEFAULT_VOICE, model_id=DEFAULT_MODEL) + req_id, audio_generator = client.text_to_speech.convert(text=DEFAULT_TEXT, voice_id=DEFAULT_VOICE, model_id=DEFAULT_MODEL) audio = b"".join(audio_generator) assert isinstance(audio, bytes), "TTS should return bytes" if not IN_GITHUB: @@ -61,7 +61,7 @@ def test_tts_convert() -> None: def test_tts_convert_with_voice_settings() -> None: """Test TTS with custom voice settings.""" client = ElevenLabs() - audio_generator = client.text_to_speech.convert( + req_id, audio_generator = client.text_to_speech.convert( text=DEFAULT_TEXT, voice_id=DEFAULT_VOICE, model_id=DEFAULT_MODEL, @@ -76,7 +76,7 @@ def test_tts_convert_with_voice_settings() -> None: def test_tts_convert_as_stream(): async def main(): async_client = AsyncElevenLabs() - results = async_client.text_to_speech.convert_as_stream( + req_id, results = await async_client.text_to_speech.convert_as_stream( text=DEFAULT_TEXT, voice_id=DEFAULT_VOICE, model_id=DEFAULT_MODEL ) out = b"" @@ -92,7 +92,7 @@ async def main(): def test_tts_convert_with_timestamps() -> None: """Test TTS generation with timestamps.""" client = ElevenLabs() - result = client.text_to_speech.convert_with_timestamps( + req_id, result = client.text_to_speech.convert_with_timestamps( text=DEFAULT_TEXT, voice_id=DEFAULT_VOICE, model_id=DEFAULT_MODEL ) @@ -108,7 +108,7 @@ def test_tts_stream_with_timestamps(): async def main(): async_client = AsyncElevenLabs() audio_data = b"" - async_stream = async_client.text_to_speech.stream_with_timestamps( + req_id, async_stream = await async_client.text_to_speech.stream_with_timestamps( voice_id=DEFAULT_VOICE, text=DEFAULT_TEXT, model_id=DEFAULT_MODEL,