feat(stt): add new sad_module param to recognize functions

apaparazzi0329 · apaparazzi0329 · commit 99d4fdd1e7f0 · 2025-11-11T12:32:44.000-05:00
diff --git a/ibm_watson/speech_to_text_v1.py b/ibm_watson/speech_to_text_v1.py
@@ -218,6 +218,7 @@ def recognize(
         end_of_phrase_silence_time: Optional[float] = None,
         split_transcript_at_phrase_end: Optional[bool] = None,
         speech_detector_sensitivity: Optional[float] = None,
+        sad_module: Optional[int] = None,
         background_audio_suppression: Optional[float] = None,
         low_latency: Optional[bool] = None,
         character_insertion_bias: Optional[float] = None,
@@ -351,8 +352,9 @@ def recognize(
                activity is detected in the stream. This can be used both in standard and
                low latency mode. This feature enables client applications to know that
                some words/speech has been detected and the service is in the process of
-               decoding. This can be used in lieu of interim results in standard mode. See
-               [Using speech recognition
+               decoding. This can be used in lieu of interim results in standard mode. Use
+               `sad_module: 2` to increase accuracy and performance in detecting speech
+               boundaries within the audio stream. See [Using speech recognition
                parameters](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-service-features#features-parameters).
         :param str language_customization_id: (optional) The customization ID
                (GUID) of a custom language model that is to be used with the recognition
@@ -555,6 +557,12 @@ def recognize(
                sensitivity](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-detection#detection-parameters-sensitivity)
                and [Language model
                support](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-detection#detection-support).
+        :param int sad_module: (optional) Detects speech boundaries within the
+               audio stream with better performance, improved noise suppression, faster
+               responsiveness, and increased accuracy.
+               Specify `sad_module: 2`.
+               See [Speech Activity Detection
+               (SAD)](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-detection#sad).
         :param float background_audio_suppression: (optional) The level to which
                the service is to suppress background audio based on its volume to prevent
                it from being transcribed as speech. Use the parameter to suppress side
@@ -647,6 +655,7 @@ def recognize(
             'end_of_phrase_silence_time': end_of_phrase_silence_time,
             'split_transcript_at_phrase_end': split_transcript_at_phrase_end,
             'speech_detector_sensitivity': speech_detector_sensitivity,
+            'sad_module': sad_module,
             'background_audio_suppression': background_audio_suppression,
             'low_latency': low_latency,
             'character_insertion_bias': character_insertion_bias,
@@ -845,6 +854,7 @@ def create_job(
         end_of_phrase_silence_time: Optional[float] = None,
         split_transcript_at_phrase_end: Optional[bool] = None,
         speech_detector_sensitivity: Optional[float] = None,
+        sad_module: Optional[int] = None,
         background_audio_suppression: Optional[float] = None,
         low_latency: Optional[bool] = None,
         character_insertion_bias: Optional[float] = None,
@@ -1244,6 +1254,12 @@ def create_job(
                sensitivity](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-detection#detection-parameters-sensitivity)
                and [Language model
                support](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-detection#detection-support).
+        :param int sad_module: (optional) Detects speech boundaries within the
+               audio stream with better performance, improved noise suppression, faster
+               responsiveness, and increased accuracy.
+               Specify `sad_module: 2`.
+               See [Speech Activity Detection
+               (SAD)](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-detection#sad).
         :param float background_audio_suppression: (optional) The level to which
                the service is to suppress background audio based on its volume to prevent
                it from being transcribed as speech. Use the parameter to suppress side
@@ -1341,6 +1357,7 @@ def create_job(
             'end_of_phrase_silence_time': end_of_phrase_silence_time,
             'split_transcript_at_phrase_end': split_transcript_at_phrase_end,
             'speech_detector_sensitivity': speech_detector_sensitivity,
+            'sad_module': sad_module,
             'background_audio_suppression': background_audio_suppression,
             'low_latency': low_latency,
             'character_insertion_bias': character_insertion_bias,
diff --git a/ibm_watson/speech_to_text_v1_adapter.py b/ibm_watson/speech_to_text_v1_adapter.py
@@ -57,6 +57,7 @@ def recognize_using_websocket(self,
                                   background_audio_suppression=None,
                                   low_latency=None,
                                   character_insertion_bias=None,
+                                  sad_module=None,
                                   **kwargs):
         """
         Sends audio for speech recognition using web sockets.
@@ -309,6 +310,12 @@ def recognize_using_websocket(self,
                `Narrowband` models.
                See [Character insertion
                bias](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-parsing#insertion-bias).
+        :param int sad_module: (optional) Detects speech boundaries within the
+               audio stream with better performance, improved noise suppression, faster
+               responsiveness, and increased accuracy.
+               Specify `sad_module: 2`.
+               See [Speech Activity Detection
+               (SAD)](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-detection#sad).
         :param dict headers: A `dict` containing the request headers
         :return: A `dict` containing the `SpeechRecognitionResults` response.
         :rtype: dict
@@ -377,6 +384,7 @@ def recognize_using_websocket(self,
             'background_audio_suppression': background_audio_suppression,
             'character_insertion_bias': character_insertion_bias,
             'low_latency': low_latency,
+            'sad_module': sad_module,
         }
         options = {k: v for k, v in options.items() if v is not None}
         request['options'] = options
diff --git a/test/unit/test_speech_to_text_v1.py b/test/unit/test_speech_to_text_v1.py
@@ -1,5 +1,5 @@
 # -*- coding: utf-8 -*-
-# (C) Copyright IBM Corp. 2024.
+# (C) Copyright IBM Corp. 2025.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -239,6 +239,7 @@ def test_recognize_all_params(self):
         end_of_phrase_silence_time = 0.8
         split_transcript_at_phrase_end = False
         speech_detector_sensitivity = 0.5
+        sad_module = 1
         background_audio_suppression = 0.0
         low_latency = False
         character_insertion_bias = 0.0
@@ -270,6 +271,7 @@ def test_recognize_all_params(self):
             end_of_phrase_silence_time=end_of_phrase_silence_time,
             split_transcript_at_phrase_end=split_transcript_at_phrase_end,
             speech_detector_sensitivity=speech_detector_sensitivity,
+            sad_module=sad_module,
             background_audio_suppression=background_audio_suppression,
             low_latency=low_latency,
             character_insertion_bias=character_insertion_bias,
@@ -302,6 +304,7 @@ def test_recognize_all_params(self):
         assert 'audio_metrics={}'.format('true' if audio_metrics else 'false') in query_string
         assert 'end_of_phrase_silence_time={}'.format(end_of_phrase_silence_time) in query_string
         assert 'split_transcript_at_phrase_end={}'.format('true' if split_transcript_at_phrase_end else 'false') in query_string
+        assert 'sad_module={}'.format(sad_module) in query_string
         assert 'low_latency={}'.format('true' if low_latency else 'false') in query_string
         # Validate body params
 
@@ -663,6 +666,7 @@ def test_create_job_all_params(self):
         end_of_phrase_silence_time = 0.8
         split_transcript_at_phrase_end = False
         speech_detector_sensitivity = 0.5
+        sad_module = 1
         background_audio_suppression = 0.0
         low_latency = False
         character_insertion_bias = 0.0
@@ -699,6 +703,7 @@ def test_create_job_all_params(self):
             end_of_phrase_silence_time=end_of_phrase_silence_time,
             split_transcript_at_phrase_end=split_transcript_at_phrase_end,
             speech_detector_sensitivity=speech_detector_sensitivity,
+            sad_module=sad_module,
             background_audio_suppression=background_audio_suppression,
             low_latency=low_latency,
             character_insertion_bias=character_insertion_bias,
@@ -735,6 +740,7 @@ def test_create_job_all_params(self):
         assert 'audio_metrics={}'.format('true' if audio_metrics else 'false') in query_string
         assert 'end_of_phrase_silence_time={}'.format(end_of_phrase_silence_time) in query_string
         assert 'split_transcript_at_phrase_end={}'.format('true' if split_transcript_at_phrase_end else 'false') in query_string
+        assert 'sad_module={}'.format(sad_module) in query_string
         assert 'low_latency={}'.format('true' if low_latency else 'false') in query_string
         # Validate body params