Skip to content

Commit 99d4fdd

Browse files
feat(stt): add new sad_module param to recognize functions
1 parent b291b53 commit 99d4fdd

File tree

3 files changed

+34
-3
lines changed

3 files changed

+34
-3
lines changed

ibm_watson/speech_to_text_v1.py

Lines changed: 19 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -218,6 +218,7 @@ def recognize(
218218
end_of_phrase_silence_time: Optional[float] = None,
219219
split_transcript_at_phrase_end: Optional[bool] = None,
220220
speech_detector_sensitivity: Optional[float] = None,
221+
sad_module: Optional[int] = None,
221222
background_audio_suppression: Optional[float] = None,
222223
low_latency: Optional[bool] = None,
223224
character_insertion_bias: Optional[float] = None,
@@ -351,8 +352,9 @@ def recognize(
351352
activity is detected in the stream. This can be used both in standard and
352353
low latency mode. This feature enables client applications to know that
353354
some words/speech has been detected and the service is in the process of
354-
decoding. This can be used in lieu of interim results in standard mode. See
355-
[Using speech recognition
355+
decoding. This can be used in lieu of interim results in standard mode. Use
356+
`sad_module: 2` to increase accuracy and performance in detecting speech
357+
boundaries within the audio stream. See [Using speech recognition
356358
parameters](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-service-features#features-parameters).
357359
:param str language_customization_id: (optional) The customization ID
358360
(GUID) of a custom language model that is to be used with the recognition
@@ -555,6 +557,12 @@ def recognize(
555557
sensitivity](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-detection#detection-parameters-sensitivity)
556558
and [Language model
557559
support](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-detection#detection-support).
560+
:param int sad_module: (optional) Detects speech boundaries within the
561+
audio stream with better performance, improved noise suppression, faster
562+
responsiveness, and increased accuracy.
563+
Specify `sad_module: 2`.
564+
See [Speech Activity Detection
565+
(SAD)](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-detection#sad).
558566
:param float background_audio_suppression: (optional) The level to which
559567
the service is to suppress background audio based on its volume to prevent
560568
it from being transcribed as speech. Use the parameter to suppress side
@@ -647,6 +655,7 @@ def recognize(
647655
'end_of_phrase_silence_time': end_of_phrase_silence_time,
648656
'split_transcript_at_phrase_end': split_transcript_at_phrase_end,
649657
'speech_detector_sensitivity': speech_detector_sensitivity,
658+
'sad_module': sad_module,
650659
'background_audio_suppression': background_audio_suppression,
651660
'low_latency': low_latency,
652661
'character_insertion_bias': character_insertion_bias,
@@ -845,6 +854,7 @@ def create_job(
845854
end_of_phrase_silence_time: Optional[float] = None,
846855
split_transcript_at_phrase_end: Optional[bool] = None,
847856
speech_detector_sensitivity: Optional[float] = None,
857+
sad_module: Optional[int] = None,
848858
background_audio_suppression: Optional[float] = None,
849859
low_latency: Optional[bool] = None,
850860
character_insertion_bias: Optional[float] = None,
@@ -1244,6 +1254,12 @@ def create_job(
12441254
sensitivity](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-detection#detection-parameters-sensitivity)
12451255
and [Language model
12461256
support](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-detection#detection-support).
1257+
:param int sad_module: (optional) Detects speech boundaries within the
1258+
audio stream with better performance, improved noise suppression, faster
1259+
responsiveness, and increased accuracy.
1260+
Specify `sad_module: 2`.
1261+
See [Speech Activity Detection
1262+
(SAD)](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-detection#sad).
12471263
:param float background_audio_suppression: (optional) The level to which
12481264
the service is to suppress background audio based on its volume to prevent
12491265
it from being transcribed as speech. Use the parameter to suppress side
@@ -1341,6 +1357,7 @@ def create_job(
13411357
'end_of_phrase_silence_time': end_of_phrase_silence_time,
13421358
'split_transcript_at_phrase_end': split_transcript_at_phrase_end,
13431359
'speech_detector_sensitivity': speech_detector_sensitivity,
1360+
'sad_module': sad_module,
13441361
'background_audio_suppression': background_audio_suppression,
13451362
'low_latency': low_latency,
13461363
'character_insertion_bias': character_insertion_bias,

ibm_watson/speech_to_text_v1_adapter.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -57,6 +57,7 @@ def recognize_using_websocket(self,
5757
background_audio_suppression=None,
5858
low_latency=None,
5959
character_insertion_bias=None,
60+
sad_module=None,
6061
**kwargs):
6162
"""
6263
Sends audio for speech recognition using web sockets.
@@ -309,6 +310,12 @@ def recognize_using_websocket(self,
309310
`Narrowband` models.
310311
See [Character insertion
311312
bias](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-parsing#insertion-bias).
313+
:param int sad_module: (optional) Detects speech boundaries within the
314+
audio stream with better performance, improved noise suppression, faster
315+
responsiveness, and increased accuracy.
316+
Specify `sad_module: 2`.
317+
See [Speech Activity Detection
318+
(SAD)](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-detection#sad).
312319
:param dict headers: A `dict` containing the request headers
313320
:return: A `dict` containing the `SpeechRecognitionResults` response.
314321
:rtype: dict
@@ -377,6 +384,7 @@ def recognize_using_websocket(self,
377384
'background_audio_suppression': background_audio_suppression,
378385
'character_insertion_bias': character_insertion_bias,
379386
'low_latency': low_latency,
387+
'sad_module': sad_module,
380388
}
381389
options = {k: v for k, v in options.items() if v is not None}
382390
request['options'] = options

test/unit/test_speech_to_text_v1.py

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
# -*- coding: utf-8 -*-
2-
# (C) Copyright IBM Corp. 2024.
2+
# (C) Copyright IBM Corp. 2025.
33
#
44
# Licensed under the Apache License, Version 2.0 (the "License");
55
# you may not use this file except in compliance with the License.
@@ -239,6 +239,7 @@ def test_recognize_all_params(self):
239239
end_of_phrase_silence_time = 0.8
240240
split_transcript_at_phrase_end = False
241241
speech_detector_sensitivity = 0.5
242+
sad_module = 1
242243
background_audio_suppression = 0.0
243244
low_latency = False
244245
character_insertion_bias = 0.0
@@ -270,6 +271,7 @@ def test_recognize_all_params(self):
270271
end_of_phrase_silence_time=end_of_phrase_silence_time,
271272
split_transcript_at_phrase_end=split_transcript_at_phrase_end,
272273
speech_detector_sensitivity=speech_detector_sensitivity,
274+
sad_module=sad_module,
273275
background_audio_suppression=background_audio_suppression,
274276
low_latency=low_latency,
275277
character_insertion_bias=character_insertion_bias,
@@ -302,6 +304,7 @@ def test_recognize_all_params(self):
302304
assert 'audio_metrics={}'.format('true' if audio_metrics else 'false') in query_string
303305
assert 'end_of_phrase_silence_time={}'.format(end_of_phrase_silence_time) in query_string
304306
assert 'split_transcript_at_phrase_end={}'.format('true' if split_transcript_at_phrase_end else 'false') in query_string
307+
assert 'sad_module={}'.format(sad_module) in query_string
305308
assert 'low_latency={}'.format('true' if low_latency else 'false') in query_string
306309
# Validate body params
307310

@@ -663,6 +666,7 @@ def test_create_job_all_params(self):
663666
end_of_phrase_silence_time = 0.8
664667
split_transcript_at_phrase_end = False
665668
speech_detector_sensitivity = 0.5
669+
sad_module = 1
666670
background_audio_suppression = 0.0
667671
low_latency = False
668672
character_insertion_bias = 0.0
@@ -699,6 +703,7 @@ def test_create_job_all_params(self):
699703
end_of_phrase_silence_time=end_of_phrase_silence_time,
700704
split_transcript_at_phrase_end=split_transcript_at_phrase_end,
701705
speech_detector_sensitivity=speech_detector_sensitivity,
706+
sad_module=sad_module,
702707
background_audio_suppression=background_audio_suppression,
703708
low_latency=low_latency,
704709
character_insertion_bias=character_insertion_bias,
@@ -735,6 +740,7 @@ def test_create_job_all_params(self):
735740
assert 'audio_metrics={}'.format('true' if audio_metrics else 'false') in query_string
736741
assert 'end_of_phrase_silence_time={}'.format(end_of_phrase_silence_time) in query_string
737742
assert 'split_transcript_at_phrase_end={}'.format('true' if split_transcript_at_phrase_end else 'false') in query_string
743+
assert 'sad_module={}'.format(sad_module) in query_string
738744
assert 'low_latency={}'.format('true' if low_latency else 'false') in query_string
739745
# Validate body params
740746

0 commit comments

Comments
 (0)