From 5cc77be0c8becc4b742d6e14201c27f3ecd67e61 Mon Sep 17 00:00:00 2001 From: Vladimir Bataev Date: Fri, 26 Sep 2025 23:46:16 +0400 Subject: [PATCH 01/13] Add tests for streaming buffered and cache-aware transducer models Signed-off-by: Vladimir Bataev --- .github/workflows/cicd-main-speech.yml | 4 ++++ ...n_Speech_to_Text_Cache_Aware_Transcribe.sh | 18 ++++++++++++++++ ...ion_Speech_to_Text_Streaming_Transcribe.sh | 21 +++++++++++++++++++ 3 files changed, 43 insertions(+) create mode 100755 tests/functional_tests/L2_Speech_Transcription_Speech_to_Text_Cache_Aware_Transcribe.sh create mode 100755 tests/functional_tests/L2_Speech_Transcription_Speech_to_Text_Streaming_Transcribe.sh diff --git a/.github/workflows/cicd-main-speech.yml b/.github/workflows/cicd-main-speech.yml index a9febfc115f9..5465e14bf8db 100644 --- a/.github/workflows/cicd-main-speech.yml +++ b/.github/workflows/cicd-main-speech.yml @@ -123,6 +123,10 @@ jobs: is-optional: true - runner: self-hosted-azure script: L2_Speech_Transcription_Speech_to_Text_Transcribe + - runner: self-hosted-azure + script: L2_Speech_Transcription_Speech_to_Text_Streaming_Transcribe + - runner: self-hosted-azure + script: L2_Speech_Transcription_Speech_to_Text_Cache_Aware_Transcribe - runner: self-hosted-azure script: L2_Speech_Transcription_Canary_Transcribe_Full_Manifest - runner: self-hosted-azure diff --git a/tests/functional_tests/L2_Speech_Transcription_Speech_to_Text_Cache_Aware_Transcribe.sh b/tests/functional_tests/L2_Speech_Transcription_Speech_to_Text_Cache_Aware_Transcribe.sh new file mode 100755 index 000000000000..ee0384a8e7c9 --- /dev/null +++ b/tests/functional_tests/L2_Speech_Transcription_Speech_to_Text_Cache_Aware_Transcribe.sh @@ -0,0 +1,18 @@ +# Copyright (c) 2020-2025, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +coverage run -a --data-file=/workspace/.coverage --source=/workspace/nemo \ + examples/asr/asr_cache_aware_streaming/speech_to_text_cache_aware_streaming_infer.py \ + pretrained_name="stt_en_fastconformer_hybrid_large_streaming_multi" \ + audio_dir="/home/TestData/an4_transcribe/test_subset/" \ + output_filename="/tmp/stt_cache_aware_streaming_test_res.json" diff --git a/tests/functional_tests/L2_Speech_Transcription_Speech_to_Text_Streaming_Transcribe.sh b/tests/functional_tests/L2_Speech_Transcription_Speech_to_Text_Streaming_Transcribe.sh new file mode 100755 index 000000000000..0a96c328e47a --- /dev/null +++ b/tests/functional_tests/L2_Speech_Transcription_Speech_to_Text_Streaming_Transcribe.sh @@ -0,0 +1,21 @@ +# Copyright (c) 2020-2025, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+coverage run -a --data-file=/workspace/.coverage --source=/workspace/nemo \ + examples/asr/asr_chunked_inference/rnnt/speech_to_text_streaming_infer_rnnt.py \ + pretrained_name="nvidia/stt_en_fastconformer_transducer_large" \ + audio_dir="/home/TestData/an4_transcribe/test_subset/" \ + chunk_secs=2.0 \ + left_context_secs=10.0 \ + right_context_secs=2.0 \ + output_filename="/tmp/stt_streaming_test_res.json" From 674d66ba4caab885463b4832036e5e03253f14b9 Mon Sep 17 00:00:00 2001 From: Vladimir Bataev Date: Sat, 27 Sep 2025 00:02:57 +0400 Subject: [PATCH 02/13] Support audio_dir in cache-aware streaming script Signed-off-by: Vladimir Bataev --- ...ech_to_text_cache_aware_streaming_infer.py | 45 +++++++++++-------- 1 file changed, 26 insertions(+), 19 deletions(-) diff --git a/examples/asr/asr_cache_aware_streaming/speech_to_text_cache_aware_streaming_infer.py b/examples/asr/asr_cache_aware_streaming/speech_to_text_cache_aware_streaming_infer.py index 9fddf368d23f..c6674ad6d053 100644 --- a/examples/asr/asr_cache_aware_streaming/speech_to_text_cache_aware_streaming_infer.py +++ b/examples/asr/asr_cache_aware_streaming/speech_to_text_cache_aware_streaming_infer.py @@ -97,6 +97,7 @@ """ +import glob import json import os import time @@ -128,7 +129,8 @@ class TranscriptionConfig: # Required configs model_path: Optional[str] = None # Path to a .nemo file pretrained_name: Optional[str] = None # Name of a pretrained model - # audio_dir: Optional[str] = None # Path to a directory which contains audio files + audio_dir: Optional[str] = None # Path to a directory which contains audio files + audio_type: str = "wav" # type of audio file if audio_dir passed audio_file: Optional[str] = None # Path to an audio file to perform streaming dataset_manifest: Optional[str] = None # Path to dataset's JSON manifest output_path: Optional[str] = None # Path to output file when manifest is used as input @@ -329,10 +331,8 @@ def main(cfg: TranscriptionConfig): f"Compute dtype {compute_dtype} 
is not yet supported for cache-aware models, use float32 instead" ) - if (cfg.audio_file is None and cfg.dataset_manifest is None) or ( - cfg.audio_file is not None and cfg.dataset_manifest is not None - ): - raise ValueError("One of the audio_file and dataset_manifest should be non-empty!") + if sum((cfg.audio_file is not None, cfg.dataset_manifest is not None, cfg.audio_dir is not None)): + raise ValueError("Exactly one of the `audio_file`, `dataset_manifest` or `audio_dir` should be non-empty!") asr_model, model_name = setup_model(cfg=cfg, map_location=device) @@ -414,15 +414,26 @@ def main(cfg: TranscriptionConfig): all_refs_text = [] batch_size = cfg.batch_size - manifest_dir = Path(cfg.dataset_manifest).parent - samples = read_manifest(cfg.dataset_manifest) - # fix relative paths - for item in samples: - audio_filepath = Path(item["audio_filepath"]) - if not audio_filepath.is_absolute(): - item["audio_filepath"] = str(manifest_dir / audio_filepath) - - logging.info(f"Loaded {len(samples)} from the manifest at {cfg.dataset_manifest}.") + if cfg.dataset_manifest is not None: + manifest_dir = Path(cfg.dataset_manifest).parent + samples = read_manifest(cfg.dataset_manifest) + # fix relative paths + for item in samples: + audio_filepath = Path(item["audio_filepath"]) + if not audio_filepath.is_absolute(): + item["audio_filepath"] = str(manifest_dir / audio_filepath) + + logging.info(f"Loaded {len(samples)} from the manifest at {cfg.dataset_manifest}.") + dataset_title = os.path.splitext(os.path.basename(cfg.dataset_manifest))[0] + else: + assert cfg.audio_dir is not None + samples = [ + {"audio_filepath": audio_filepath} + for audio_filepath in ( + glob.glob(os.path.join(cfg.audio_dir, f"**/*.{cfg.audio_type}"), recursive=True) + ) + ] + dataset_title = os.path.basename(cfg.audio_dir) start_time = time.time() for sample_idx, sample in enumerate(samples): @@ -461,11 +472,7 @@ def main(cfg: TranscriptionConfig): # stores the results including the transcriptions of 
the streaming inference in a json file if cfg.output_path is not None and len(all_refs_text) == len(all_streaming_tran): fname = ( - "streaming_out_" - + os.path.splitext(os.path.basename(model_name))[0] - + "_" - + os.path.splitext(os.path.basename(cfg.dataset_manifest))[0] - + ".json" + "streaming_out_" + os.path.splitext(os.path.basename(model_name))[0] + "_" + dataset_title + ".json" ) hyp_json = os.path.join(cfg.output_path, fname) From be7965ccb22382740a0f15f676af2ee77674a845 Mon Sep 17 00:00:00 2001 From: Vladimir Bataev Date: Mon, 6 Oct 2025 15:38:43 +0400 Subject: [PATCH 03/13] Fix script names. Fix parameters Signed-off-by: Vladimir Bataev --- .github/workflows/cicd-main-speech.yml | 4 ++-- ..._Speech_Transcription_Speech_to_Text_Cache_Aware_Infer.sh} | 2 +- ...L2_Speech_Transcription_Speech_to_Text_Streaming_Infer.sh} | 0 3 files changed, 3 insertions(+), 3 deletions(-) rename tests/functional_tests/{L2_Speech_Transcription_Speech_to_Text_Cache_Aware_Transcribe.sh => L2_Speech_Transcription_Speech_to_Text_Cache_Aware_Infer.sh} (93%) rename tests/functional_tests/{L2_Speech_Transcription_Speech_to_Text_Streaming_Transcribe.sh => L2_Speech_Transcription_Speech_to_Text_Streaming_Infer.sh} (100%) diff --git a/.github/workflows/cicd-main-speech.yml b/.github/workflows/cicd-main-speech.yml index 5465e14bf8db..d87700f840e2 100644 --- a/.github/workflows/cicd-main-speech.yml +++ b/.github/workflows/cicd-main-speech.yml @@ -124,9 +124,9 @@ jobs: - runner: self-hosted-azure script: L2_Speech_Transcription_Speech_to_Text_Transcribe - runner: self-hosted-azure - script: L2_Speech_Transcription_Speech_to_Text_Streaming_Transcribe + script: L2_Speech_Transcription_Speech_to_Text_Streaming_Infer - runner: self-hosted-azure - script: L2_Speech_Transcription_Speech_to_Text_Cache_Aware_Transcribe + script: L2_Speech_Transcription_Speech_to_Text_Cache_Aware_Infer - runner: self-hosted-azure script: L2_Speech_Transcription_Canary_Transcribe_Full_Manifest - runner: 
self-hosted-azure diff --git a/tests/functional_tests/L2_Speech_Transcription_Speech_to_Text_Cache_Aware_Transcribe.sh b/tests/functional_tests/L2_Speech_Transcription_Speech_to_Text_Cache_Aware_Infer.sh similarity index 93% rename from tests/functional_tests/L2_Speech_Transcription_Speech_to_Text_Cache_Aware_Transcribe.sh rename to tests/functional_tests/L2_Speech_Transcription_Speech_to_Text_Cache_Aware_Infer.sh index ee0384a8e7c9..7afcc08397af 100755 --- a/tests/functional_tests/L2_Speech_Transcription_Speech_to_Text_Cache_Aware_Transcribe.sh +++ b/tests/functional_tests/L2_Speech_Transcription_Speech_to_Text_Cache_Aware_Infer.sh @@ -15,4 +15,4 @@ coverage run -a --data-file=/workspace/.coverage --source=/workspace/nemo \ examples/asr/asr_cache_aware_streaming/speech_to_text_cache_aware_streaming_infer.py \ pretrained_name="stt_en_fastconformer_hybrid_large_streaming_multi" \ audio_dir="/home/TestData/an4_transcribe/test_subset/" \ - output_filename="/tmp/stt_cache_aware_streaming_test_res.json" + output_path="/tmp/stt_cache_aware_streaming_test_res" diff --git a/tests/functional_tests/L2_Speech_Transcription_Speech_to_Text_Streaming_Transcribe.sh b/tests/functional_tests/L2_Speech_Transcription_Speech_to_Text_Streaming_Infer.sh similarity index 100% rename from tests/functional_tests/L2_Speech_Transcription_Speech_to_Text_Streaming_Transcribe.sh rename to tests/functional_tests/L2_Speech_Transcription_Speech_to_Text_Streaming_Infer.sh From 6150bdc47458949dfeaebabe1bb663e6b9f5b05c Mon Sep 17 00:00:00 2001 From: Vladimir Bataev Date: Mon, 6 Oct 2025 15:38:53 +0400 Subject: [PATCH 04/13] Comment out extra tests Signed-off-by: Vladimir Bataev --- .github/workflows/cicd-main-speech.yml | 186 ++++++++++++------------- 1 file changed, 93 insertions(+), 93 deletions(-) diff --git a/.github/workflows/cicd-main-speech.yml b/.github/workflows/cicd-main-speech.yml index d87700f840e2..b6f881a71f1d 100644 --- a/.github/workflows/cicd-main-speech.yml +++ 
b/.github/workflows/cicd-main-speech.yml @@ -84,43 +84,43 @@ jobs: fail-fast: false matrix: include: - - runner: self-hosted-azure-gpus-1 - script: ASR_dev_run_Speech_to_Text - - runner: self-hosted-azure-gpus-1 - script: ASR_dev_run_Speech_to_Text_WPE_CitriNet - - runner: self-hosted-azure-gpus-1 - script: ASR_dev_run_Speech_Pre-training_-_CitriNet - - runner: self-hosted-azure-gpus-1 - script: Optional_ASR_dev_run_Speech_To_Text_Finetuning - is-optional: true - - runner: self-hosted-azure-gpus-1 - script: Optional_ASR_dev_run_Speech_To_Text_HF_Finetuning - is-optional: true - - runner: self-hosted-azure-gpus-1 - script: ASR_dev_run_Speech_to_Text_WPE_-_Conformer - - runner: self-hosted-azure-gpus-1 - script: ASR_dev_run-part_two_Speech_to_Text_WPE_-_Squeezeformer - - runner: self-hosted-azure-gpus-1 - script: L2_ASR_Multi-dataloader_dev_run_Speech_to_Text_multi-dataloader - - runner: self-hosted-azure-gpus-1 - script: L2_ASR_Multi-dataloader_dev_run_Speech_to_Label_multi-dataloader - - runner: self-hosted-azure-gpus-1 - script: L2_ASR_Adapters_Linear_Adapters - - runner: self-hosted-azure-gpus-1 - script: L2_ASR_Adapters_RelPos_MHA_Adapters - - runner: self-hosted-azure - script: L2_Speech_to_Text_EMA - - runner: self-hosted-azure-gpus-1 - script: L2_Speech_to_Text_AED - - runner: self-hosted-azure-gpus-1 - script: L2_Speaker_dev_run_Speech_to_Label - - runner: self-hosted-azure - script: L2_Speech_Estimate_Duration_Bins - - runner: self-hosted-azure - script: L2_Speech_Batch_Size_OOMptimizer - - runner: self-hosted-azure - script: Optional_L2_Speech_Batch_Size_OOMptimizer_Canary - is-optional: true +# - runner: self-hosted-azure-gpus-1 +# script: ASR_dev_run_Speech_to_Text +# - runner: self-hosted-azure-gpus-1 +# script: ASR_dev_run_Speech_to_Text_WPE_CitriNet +# - runner: self-hosted-azure-gpus-1 +# script: ASR_dev_run_Speech_Pre-training_-_CitriNet +# - runner: self-hosted-azure-gpus-1 +# script: Optional_ASR_dev_run_Speech_To_Text_Finetuning +# is-optional: 
true +# - runner: self-hosted-azure-gpus-1 +# script: Optional_ASR_dev_run_Speech_To_Text_HF_Finetuning +# is-optional: true +# - runner: self-hosted-azure-gpus-1 +# script: ASR_dev_run_Speech_to_Text_WPE_-_Conformer +# - runner: self-hosted-azure-gpus-1 +# script: ASR_dev_run-part_two_Speech_to_Text_WPE_-_Squeezeformer +# - runner: self-hosted-azure-gpus-1 +# script: L2_ASR_Multi-dataloader_dev_run_Speech_to_Text_multi-dataloader +# - runner: self-hosted-azure-gpus-1 +# script: L2_ASR_Multi-dataloader_dev_run_Speech_to_Label_multi-dataloader +# - runner: self-hosted-azure-gpus-1 +# script: L2_ASR_Adapters_Linear_Adapters +# - runner: self-hosted-azure-gpus-1 +# script: L2_ASR_Adapters_RelPos_MHA_Adapters +# - runner: self-hosted-azure +# script: L2_Speech_to_Text_EMA +# - runner: self-hosted-azure-gpus-1 +# script: L2_Speech_to_Text_AED +# - runner: self-hosted-azure-gpus-1 +# script: L2_Speaker_dev_run_Speech_to_Label +# - runner: self-hosted-azure +# script: L2_Speech_Estimate_Duration_Bins +# - runner: self-hosted-azure +# script: L2_Speech_Batch_Size_OOMptimizer +# - runner: self-hosted-azure +# script: Optional_L2_Speech_Batch_Size_OOMptimizer_Canary +# is-optional: true - runner: self-hosted-azure script: L2_Speech_Transcription_Speech_to_Text_Transcribe - runner: self-hosted-azure @@ -128,62 +128,62 @@ jobs: - runner: self-hosted-azure script: L2_Speech_Transcription_Speech_to_Text_Cache_Aware_Infer - runner: self-hosted-azure - script: L2_Speech_Transcription_Canary_Transcribe_Full_Manifest - - runner: self-hosted-azure - script: L2_Speech_Transcription_Canary_Transcribe_With_Prompt - - runner: self-hosted-azure - script: L2_Speech_Transcription_Canary_Transcribe_Audio_Dir - - runner: self-hosted-azure - script: L2_Longform_Speech_Transcription_Canary_Chunked_Infer_from_Audio_Dir - - runner: self-hosted-azure - script: L2_Longform_Speech_Transcription_with_TimeStamps_Canary_Chunked_Infer_from_Audio_Dir - - runner: self-hosted-azure - script: 
L2_Longform_Speech_Transcription_with_TimeStamps_Canary_Chunked_Infer_from_Manifest - - runner: self-hosted-azure-gpus-1 - script: Speech_Checkpoints_tests - timeout: 20 - - runner: self-hosted-azure-gpus-1 - script: L2_Speaker_dev_run_Speaker_Recognition - - runner: self-hosted-azure-gpus-1 - script: L2_Speaker_dev_run_Speaker_Diarization - - runner: self-hosted-azure-gpus-1 - script: L2_Speaker_dev_run_EndtoEnd_Speaker_Diarization_Sortformer - - runner: self-hosted-azure - script: L2_Speaker_dev_run_EndtoEnd_Diarizer_Inference - - runner: self-hosted-azure - script: L2_Speaker_dev_run_Speaker_Diarization_with_ASR_Inference - - runner: self-hosted-azure - script: L2_Speaker_dev_run_Clustering_Diarizer_Inference - - runner: self-hosted-azure - script: L2_Speaker_dev_run_Neural_Diarizer_Inference - - runner: self-hosted-azure - script: L2_Speaker_dev_run_Multispeaker_ASR_Data_Simulation - - runner: self-hosted-azure - script: L2_Segmentation_Tool_Parallel_ctc_segmentation_test_L2_Eng_CitriNet_with_wav - - runner: self-hosted-azure - script: L2_Segmentation_Tool_Parallel_ctc_segmentation_test_L2_Ru_QN_with_mp3 - - script: L2_HF_Transformer_SpeechLM_SFT_2gpu - runner: self-hosted-azure - - script: L2_SpeechLM_LoRA_TP1PP1_MBS2 - runner: self-hosted-azure - - runner: self-hosted-azure-gpus-1 - script: L2_TTS_Fast_dev_runs_1_Tacotron_2 - - runner: self-hosted-azure - script: L2_TTS_Fast_dev_runs_1_WaveGlow - - runner: self-hosted-azure - script: L2_TTS_Fast_dev_runs_1_FastPitch - - runner: self-hosted-azure - script: L2_TTS_Fast_dev_runs_1_Hifigan - - runner: self-hosted-azure - script: L2_G2P_Models_G2P_Conformer_training_evaluation_and_inference - - runner: self-hosted-azure - script: L2_G2P_Models_HeteronymClassificationModel_training_evaluation_and_inference - - runner: self-hosted-azure - script: SPEECHLM_HF_Training_DuplexS2S - - runner: self-hosted-azure - script: SPEECHLM_HF_Training_DuplexS2SSpeechDecoder - - runner: self-hosted-azure - script: 
SPEECHLM_HF_Training_SALM +# script: L2_Speech_Transcription_Canary_Transcribe_Full_Manifest +# - runner: self-hosted-azure +# script: L2_Speech_Transcription_Canary_Transcribe_With_Prompt +# - runner: self-hosted-azure +# script: L2_Speech_Transcription_Canary_Transcribe_Audio_Dir +# - runner: self-hosted-azure +# script: L2_Longform_Speech_Transcription_Canary_Chunked_Infer_from_Audio_Dir +# - runner: self-hosted-azure +# script: L2_Longform_Speech_Transcription_with_TimeStamps_Canary_Chunked_Infer_from_Audio_Dir +# - runner: self-hosted-azure +# script: L2_Longform_Speech_Transcription_with_TimeStamps_Canary_Chunked_Infer_from_Manifest +# - runner: self-hosted-azure-gpus-1 +# script: Speech_Checkpoints_tests +# timeout: 20 +# - runner: self-hosted-azure-gpus-1 +# script: L2_Speaker_dev_run_Speaker_Recognition +# - runner: self-hosted-azure-gpus-1 +# script: L2_Speaker_dev_run_Speaker_Diarization +# - runner: self-hosted-azure-gpus-1 +# script: L2_Speaker_dev_run_EndtoEnd_Speaker_Diarization_Sortformer +# - runner: self-hosted-azure +# script: L2_Speaker_dev_run_EndtoEnd_Diarizer_Inference +# - runner: self-hosted-azure +# script: L2_Speaker_dev_run_Speaker_Diarization_with_ASR_Inference +# - runner: self-hosted-azure +# script: L2_Speaker_dev_run_Clustering_Diarizer_Inference +# - runner: self-hosted-azure +# script: L2_Speaker_dev_run_Neural_Diarizer_Inference +# - runner: self-hosted-azure +# script: L2_Speaker_dev_run_Multispeaker_ASR_Data_Simulation +# - runner: self-hosted-azure +# script: L2_Segmentation_Tool_Parallel_ctc_segmentation_test_L2_Eng_CitriNet_with_wav +# - runner: self-hosted-azure +# script: L2_Segmentation_Tool_Parallel_ctc_segmentation_test_L2_Ru_QN_with_mp3 +# - script: L2_HF_Transformer_SpeechLM_SFT_2gpu +# runner: self-hosted-azure +# - script: L2_SpeechLM_LoRA_TP1PP1_MBS2 +# runner: self-hosted-azure +# - runner: self-hosted-azure-gpus-1 +# script: L2_TTS_Fast_dev_runs_1_Tacotron_2 +# - runner: self-hosted-azure +# script: 
L2_TTS_Fast_dev_runs_1_WaveGlow +# - runner: self-hosted-azure +# script: L2_TTS_Fast_dev_runs_1_FastPitch +# - runner: self-hosted-azure +# script: L2_TTS_Fast_dev_runs_1_Hifigan +# - runner: self-hosted-azure +# script: L2_G2P_Models_G2P_Conformer_training_evaluation_and_inference +# - runner: self-hosted-azure +# script: L2_G2P_Models_HeteronymClassificationModel_training_evaluation_and_inference +# - runner: self-hosted-azure +# script: SPEECHLM_HF_Training_DuplexS2S +# - runner: self-hosted-azure +# script: SPEECHLM_HF_Training_DuplexS2SSpeechDecoder +# - runner: self-hosted-azure +# script: SPEECHLM_HF_Training_SALM needs: [unit-tests] runs-on: ${{ matrix.runner }} name: ${{ matrix.is-optional && 'PLEASEFIXME_' || '' }}${{ matrix.script }} From 876b6090091e4c850011ccd7cb439b77e29351e2 Mon Sep 17 00:00:00 2001 From: Vladimir Bataev Date: Tue, 7 Oct 2025 20:01:05 +0400 Subject: [PATCH 05/13] Fix script Signed-off-by: Vladimir Bataev --- .../speech_to_text_cache_aware_streaming_infer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/asr/asr_cache_aware_streaming/speech_to_text_cache_aware_streaming_infer.py b/examples/asr/asr_cache_aware_streaming/speech_to_text_cache_aware_streaming_infer.py index c6674ad6d053..37fbd065e00a 100644 --- a/examples/asr/asr_cache_aware_streaming/speech_to_text_cache_aware_streaming_infer.py +++ b/examples/asr/asr_cache_aware_streaming/speech_to_text_cache_aware_streaming_infer.py @@ -331,7 +331,7 @@ def main(cfg: TranscriptionConfig): f"Compute dtype {compute_dtype} is not yet supported for cache-aware models, use float32 instead" ) - if sum((cfg.audio_file is not None, cfg.dataset_manifest is not None, cfg.audio_dir is not None)): + if sum((cfg.audio_file is not None, cfg.dataset_manifest is not None, cfg.audio_dir is not None)) != 1: raise ValueError("Exactly one of the `audio_file`, `dataset_manifest` or `audio_dir` should be non-empty!") asr_model, model_name = setup_model(cfg=cfg, 
map_location=device) From 2768defc6b6343117396d4c27d9e877fc7226f6a Mon Sep 17 00:00:00 2001 From: Vladimir Bataev Date: Tue, 7 Oct 2025 20:53:40 +0400 Subject: [PATCH 06/13] Add test with timestamps Signed-off-by: Vladimir Bataev --- .github/workflows/cicd-main-speech.yml | 4 +++- ...eech_to_Text_Streaming_Infer_Timestamps.sh | 22 +++++++++++++++++++ 2 files changed, 25 insertions(+), 1 deletion(-) create mode 100755 tests/functional_tests/L2_Speech_Transcription_Speech_to_Text_Streaming_Infer_Timestamps.sh diff --git a/.github/workflows/cicd-main-speech.yml b/.github/workflows/cicd-main-speech.yml index b6f881a71f1d..da9861008b63 100644 --- a/.github/workflows/cicd-main-speech.yml +++ b/.github/workflows/cicd-main-speech.yml @@ -126,8 +126,10 @@ jobs: - runner: self-hosted-azure script: L2_Speech_Transcription_Speech_to_Text_Streaming_Infer - runner: self-hosted-azure - script: L2_Speech_Transcription_Speech_to_Text_Cache_Aware_Infer + script: L2_Speech_Transcription_Speech_to_Text_Streaming_Infer_Timestamps - runner: self-hosted-azure + script: L2_Speech_Transcription_Speech_to_Text_Cache_Aware_Infer +# - runner: self-hosted-azure # script: L2_Speech_Transcription_Canary_Transcribe_Full_Manifest # - runner: self-hosted-azure # script: L2_Speech_Transcription_Canary_Transcribe_With_Prompt diff --git a/tests/functional_tests/L2_Speech_Transcription_Speech_to_Text_Streaming_Infer_Timestamps.sh b/tests/functional_tests/L2_Speech_Transcription_Speech_to_Text_Streaming_Infer_Timestamps.sh new file mode 100755 index 000000000000..148164e6fdd2 --- /dev/null +++ b/tests/functional_tests/L2_Speech_Transcription_Speech_to_Text_Streaming_Infer_Timestamps.sh @@ -0,0 +1,22 @@ +# Copyright (c) 2020-2025, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +coverage run -a --data-file=/workspace/.coverage --source=/workspace/nemo \ + examples/asr/asr_chunked_inference/rnnt/speech_to_text_streaming_infer_rnnt.py \ + pretrained_name="nvidia/stt_en_fastconformer_transducer_large" \ + audio_dir="/home/TestData/an4_transcribe/test_subset/" \ + chunk_secs=2.0 \ + left_context_secs=10.0 \ + right_context_secs=2.0 \ + timestamps=true \ + output_filename="/tmp/stt_streaming_test_res.json" From 0f6a37ff8df93ec9fe1da540a3e0b3d8d8013c44 Mon Sep 17 00:00:00 2001 From: Vladimir Bataev Date: Tue, 7 Oct 2025 20:55:58 +0400 Subject: [PATCH 07/13] Use local checkpoint Signed-off-by: Vladimir Bataev --- .../L2_Speech_Transcription_Speech_to_Text_Streaming_Infer.sh | 2 +- ...h_Transcription_Speech_to_Text_Streaming_Infer_Timestamps.sh | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/functional_tests/L2_Speech_Transcription_Speech_to_Text_Streaming_Infer.sh b/tests/functional_tests/L2_Speech_Transcription_Speech_to_Text_Streaming_Infer.sh index 0a96c328e47a..3654ffd7aac4 100755 --- a/tests/functional_tests/L2_Speech_Transcription_Speech_to_Text_Streaming_Infer.sh +++ b/tests/functional_tests/L2_Speech_Transcription_Speech_to_Text_Streaming_Infer.sh @@ -13,7 +13,7 @@ # limitations under the License. 
coverage run -a --data-file=/workspace/.coverage --source=/workspace/nemo \ examples/asr/asr_chunked_inference/rnnt/speech_to_text_streaming_infer_rnnt.py \ - pretrained_name="nvidia/stt_en_fastconformer_transducer_large" \ + model_path="/home/TestData/asr/stt_en_fastconformer_transducer_large.nemo" \ audio_dir="/home/TestData/an4_transcribe/test_subset/" \ chunk_secs=2.0 \ left_context_secs=10.0 \ diff --git a/tests/functional_tests/L2_Speech_Transcription_Speech_to_Text_Streaming_Infer_Timestamps.sh b/tests/functional_tests/L2_Speech_Transcription_Speech_to_Text_Streaming_Infer_Timestamps.sh index 148164e6fdd2..0e2dc5061e9e 100755 --- a/tests/functional_tests/L2_Speech_Transcription_Speech_to_Text_Streaming_Infer_Timestamps.sh +++ b/tests/functional_tests/L2_Speech_Transcription_Speech_to_Text_Streaming_Infer_Timestamps.sh @@ -13,7 +13,7 @@ # limitations under the License. coverage run -a --data-file=/workspace/.coverage --source=/workspace/nemo \ examples/asr/asr_chunked_inference/rnnt/speech_to_text_streaming_infer_rnnt.py \ - pretrained_name="nvidia/stt_en_fastconformer_transducer_large" \ + model_path="/home/TestData/asr/stt_en_fastconformer_transducer_large.nemo" \ audio_dir="/home/TestData/an4_transcribe/test_subset/" \ chunk_secs=2.0 \ left_context_secs=10.0 \ From a75c90b87e3994fa53ea4538d42abc5f50ea4efd Mon Sep 17 00:00:00 2001 From: Vladimir Bataev Date: Tue, 7 Oct 2025 22:22:03 +0400 Subject: [PATCH 08/13] Use local checkpoint Signed-off-by: Vladimir Bataev --- .../L2_Speech_Transcription_Speech_to_Text_Cache_Aware_Infer.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/functional_tests/L2_Speech_Transcription_Speech_to_Text_Cache_Aware_Infer.sh b/tests/functional_tests/L2_Speech_Transcription_Speech_to_Text_Cache_Aware_Infer.sh index 7afcc08397af..3240b2dfdaf7 100755 --- a/tests/functional_tests/L2_Speech_Transcription_Speech_to_Text_Cache_Aware_Infer.sh +++ 
b/tests/functional_tests/L2_Speech_Transcription_Speech_to_Text_Cache_Aware_Infer.sh @@ -13,6 +13,6 @@ # limitations under the License. coverage run -a --data-file=/workspace/.coverage --source=/workspace/nemo \ examples/asr/asr_cache_aware_streaming/speech_to_text_cache_aware_streaming_infer.py \ - pretrained_name="stt_en_fastconformer_hybrid_large_streaming_multi" \ + pretrained_name="/home/TestData/asr/stt_en_fastconformer_hybrid_large_streaming_multi.nemo" \ audio_dir="/home/TestData/an4_transcribe/test_subset/" \ output_path="/tmp/stt_cache_aware_streaming_test_res" From bca66a5ac31223b640a7d6febf5d51761e84a183 Mon Sep 17 00:00:00 2001 From: Vladimir Bataev Date: Wed, 8 Oct 2025 14:20:19 +0400 Subject: [PATCH 09/13] Fix param Signed-off-by: Vladimir Bataev --- .../L2_Speech_Transcription_Speech_to_Text_Cache_Aware_Infer.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/functional_tests/L2_Speech_Transcription_Speech_to_Text_Cache_Aware_Infer.sh b/tests/functional_tests/L2_Speech_Transcription_Speech_to_Text_Cache_Aware_Infer.sh index 3240b2dfdaf7..32df36b8a02c 100755 --- a/tests/functional_tests/L2_Speech_Transcription_Speech_to_Text_Cache_Aware_Infer.sh +++ b/tests/functional_tests/L2_Speech_Transcription_Speech_to_Text_Cache_Aware_Infer.sh @@ -13,6 +13,6 @@ # limitations under the License. 
coverage run -a --data-file=/workspace/.coverage --source=/workspace/nemo \ examples/asr/asr_cache_aware_streaming/speech_to_text_cache_aware_streaming_infer.py \ - pretrained_name="/home/TestData/asr/stt_en_fastconformer_hybrid_large_streaming_multi.nemo" \ + model_path="/home/TestData/asr/stt_en_fastconformer_hybrid_large_streaming_multi.nemo" \ audio_dir="/home/TestData/an4_transcribe/test_subset/" \ output_path="/tmp/stt_cache_aware_streaming_test_res" From 1ce73abdc1e99ab0a76d8972c1498215eac5a9d8 Mon Sep 17 00:00:00 2001 From: Vladimir Bataev Date: Mon, 13 Oct 2025 23:42:03 +0400 Subject: [PATCH 10/13] Clean up Signed-off-by: Vladimir Bataev --- .github/workflows/cicd-main-speech.yml | 2 -- ...cription_Speech_to_Text_Streaming_Infer.sh | 3 +++ ...eech_to_Text_Streaming_Infer_Timestamps.sh | 22 ------------------- 3 files changed, 3 insertions(+), 24 deletions(-) delete mode 100755 tests/functional_tests/L2_Speech_Transcription_Speech_to_Text_Streaming_Infer_Timestamps.sh diff --git a/.github/workflows/cicd-main-speech.yml b/.github/workflows/cicd-main-speech.yml index 3e807cefb9c5..c690c4426391 100644 --- a/.github/workflows/cicd-main-speech.yml +++ b/.github/workflows/cicd-main-speech.yml @@ -125,8 +125,6 @@ jobs: script: L2_Speech_Transcription_Speech_to_Text_Transcribe - runner: self-hosted-azure script: L2_Speech_Transcription_Speech_to_Text_Streaming_Infer - - runner: self-hosted-azure - script: L2_Speech_Transcription_Speech_to_Text_Streaming_Infer_Timestamps - runner: self-hosted-azure script: L2_Speech_Transcription_Speech_to_Text_Cache_Aware_Infer - runner: self-hosted-azure diff --git a/tests/functional_tests/L2_Speech_Transcription_Speech_to_Text_Streaming_Infer.sh b/tests/functional_tests/L2_Speech_Transcription_Speech_to_Text_Streaming_Infer.sh index 3654ffd7aac4..06f7625b6989 100755 --- a/tests/functional_tests/L2_Speech_Transcription_Speech_to_Text_Streaming_Infer.sh +++ 
b/tests/functional_tests/L2_Speech_Transcription_Speech_to_Text_Streaming_Infer.sh @@ -11,6 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +# TODO(vbataev): fix decoding with CUDA graphs on CI for this test coverage run -a --data-file=/workspace/.coverage --source=/workspace/nemo \ examples/asr/asr_chunked_inference/rnnt/speech_to_text_streaming_infer_rnnt.py \ model_path="/home/TestData/asr/stt_en_fastconformer_transducer_large.nemo" \ @@ -18,4 +19,6 @@ coverage run -a --data-file=/workspace/.coverage --source=/workspace/nemo \ chunk_secs=2.0 \ left_context_secs=10.0 \ right_context_secs=2.0 \ + timestamps=true \ + decoding.greedy.use_cuda_graph_decoder=false \ output_filename="/tmp/stt_streaming_test_res.json" diff --git a/tests/functional_tests/L2_Speech_Transcription_Speech_to_Text_Streaming_Infer_Timestamps.sh b/tests/functional_tests/L2_Speech_Transcription_Speech_to_Text_Streaming_Infer_Timestamps.sh deleted file mode 100755 index 0e2dc5061e9e..000000000000 --- a/tests/functional_tests/L2_Speech_Transcription_Speech_to_Text_Streaming_Infer_Timestamps.sh +++ /dev/null @@ -1,22 +0,0 @@ -# Copyright (c) 2020-2025, NVIDIA CORPORATION. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-coverage run -a --data-file=/workspace/.coverage --source=/workspace/nemo \ - examples/asr/asr_chunked_inference/rnnt/speech_to_text_streaming_infer_rnnt.py \ - model_path="/home/TestData/asr/stt_en_fastconformer_transducer_large.nemo" \ - audio_dir="/home/TestData/an4_transcribe/test_subset/" \ - chunk_secs=2.0 \ - left_context_secs=10.0 \ - right_context_secs=2.0 \ - timestamps=true \ - output_filename="/tmp/stt_streaming_test_res.json" From 97a9746ead600768badbabb492b90f985bc4b68d Mon Sep 17 00:00:00 2001 From: Vladimir Bataev Date: Mon, 13 Oct 2025 23:49:35 +0400 Subject: [PATCH 11/13] Clean up Signed-off-by: Vladimir Bataev --- .../speech_to_text_cache_aware_streaming_infer.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/examples/asr/asr_cache_aware_streaming/speech_to_text_cache_aware_streaming_infer.py b/examples/asr/asr_cache_aware_streaming/speech_to_text_cache_aware_streaming_infer.py index 37fbd065e00a..858feda00fa0 100644 --- a/examples/asr/asr_cache_aware_streaming/speech_to_text_cache_aware_streaming_infer.py +++ b/examples/asr/asr_cache_aware_streaming/speech_to_text_cache_aware_streaming_infer.py @@ -471,9 +471,7 @@ def main(cfg: TranscriptionConfig): # stores the results including the transcriptions of the streaming inference in a json file if cfg.output_path is not None and len(all_refs_text) == len(all_streaming_tran): - fname = ( - "streaming_out_" + os.path.splitext(os.path.basename(model_name))[0] + "_" + dataset_title + ".json" - ) + fname = "streaming_out_" + os.path.splitext(os.path.basename(model_name))[0] + f"_{dataset_title}.json" hyp_json = os.path.join(cfg.output_path, fname) os.makedirs(cfg.output_path, exist_ok=True) From 47afc24ca3d36417e17801913b3c818905417021 Mon Sep 17 00:00:00 2001 From: Vladimir Bataev Date: Mon, 13 Oct 2025 23:51:08 +0400 Subject: [PATCH 12/13] Clean up Signed-off-by: Vladimir Bataev --- .../L2_Speech_Transcription_Speech_to_Text_Cache_Aware_Infer.sh | 1 + 
.../L2_Speech_Transcription_Speech_to_Text_Streaming_Infer.sh | 1 + 2 files changed, 2 insertions(+) diff --git a/tests/functional_tests/L2_Speech_Transcription_Speech_to_Text_Cache_Aware_Infer.sh b/tests/functional_tests/L2_Speech_Transcription_Speech_to_Text_Cache_Aware_Infer.sh index 32df36b8a02c..7ab36d8b3cd8 100755 --- a/tests/functional_tests/L2_Speech_Transcription_Speech_to_Text_Cache_Aware_Infer.sh +++ b/tests/functional_tests/L2_Speech_Transcription_Speech_to_Text_Cache_Aware_Infer.sh @@ -11,6 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. + coverage run -a --data-file=/workspace/.coverage --source=/workspace/nemo \ examples/asr/asr_cache_aware_streaming/speech_to_text_cache_aware_streaming_infer.py \ model_path="/home/TestData/asr/stt_en_fastconformer_hybrid_large_streaming_multi.nemo" \ diff --git a/tests/functional_tests/L2_Speech_Transcription_Speech_to_Text_Streaming_Infer.sh b/tests/functional_tests/L2_Speech_Transcription_Speech_to_Text_Streaming_Infer.sh index 06f7625b6989..596e8ffd6b5f 100755 --- a/tests/functional_tests/L2_Speech_Transcription_Speech_to_Text_Streaming_Infer.sh +++ b/tests/functional_tests/L2_Speech_Transcription_Speech_to_Text_Streaming_Infer.sh @@ -11,6 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
+ # TODO(vbataev): fix decoding with CUDA graphs on CI for this test coverage run -a --data-file=/workspace/.coverage --source=/workspace/nemo \ examples/asr/asr_chunked_inference/rnnt/speech_to_text_streaming_infer_rnnt.py \ From 441a85a2c855b4b8bffc6058a74fe9db415e188f Mon Sep 17 00:00:00 2001 From: Vladimir Bataev Date: Tue, 14 Oct 2025 13:03:21 +0400 Subject: [PATCH 13/13] Temporarily avoid CUDA graphs in tests Signed-off-by: Vladimir Bataev --- .../L2_Speech_Transcription_Speech_to_Text_Cache_Aware_Infer.sh | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/functional_tests/L2_Speech_Transcription_Speech_to_Text_Cache_Aware_Infer.sh b/tests/functional_tests/L2_Speech_Transcription_Speech_to_Text_Cache_Aware_Infer.sh index 7ab36d8b3cd8..a56ccdb46d6c 100755 --- a/tests/functional_tests/L2_Speech_Transcription_Speech_to_Text_Cache_Aware_Infer.sh +++ b/tests/functional_tests/L2_Speech_Transcription_Speech_to_Text_Cache_Aware_Infer.sh @@ -12,8 +12,10 @@ # See the License for the specific language governing permissions and # limitations under the License. +# TODO(vbataev): fix decoding with CUDA graphs on CI for this test coverage run -a --data-file=/workspace/.coverage --source=/workspace/nemo \ examples/asr/asr_cache_aware_streaming/speech_to_text_cache_aware_streaming_infer.py \ model_path="/home/TestData/asr/stt_en_fastconformer_hybrid_large_streaming_multi.nemo" \ + rnnt_decoding.greedy.use_cuda_graph_decoder=false \ audio_dir="/home/TestData/an4_transcribe/test_subset/" \ output_path="/tmp/stt_cache_aware_streaming_test_res"