Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
192 changes: 99 additions & 93 deletions .github/workflows/cicd-main-speech.yml
Original file line number Diff line number Diff line change
Expand Up @@ -84,102 +84,108 @@ jobs:
fail-fast: false
matrix:
include:
- runner: self-hosted-azure-gpus-1
script: ASR_dev_run_Speech_to_Text
- runner: self-hosted-azure-gpus-1
script: ASR_dev_run_Speech_to_Text_WPE_CitriNet
- runner: self-hosted-azure-gpus-1
script: ASR_dev_run_Speech_Pre-training_-_CitriNet
- runner: self-hosted-azure-gpus-1
script: Optional_ASR_dev_run_Speech_To_Text_Finetuning
is-optional: true
- runner: self-hosted-azure-gpus-1
script: Optional_ASR_dev_run_Speech_To_Text_HF_Finetuning
is-optional: true
- runner: self-hosted-azure-gpus-1
script: ASR_dev_run_Speech_to_Text_WPE_-_Conformer
- runner: self-hosted-azure-gpus-1
script: ASR_dev_run-part_two_Speech_to_Text_WPE_-_Squeezeformer
- runner: self-hosted-azure-gpus-1
script: L2_ASR_Multi-dataloader_dev_run_Speech_to_Text_multi-dataloader
- runner: self-hosted-azure-gpus-1
script: L2_ASR_Multi-dataloader_dev_run_Speech_to_Label_multi-dataloader
- runner: self-hosted-azure-gpus-1
script: L2_ASR_Adapters_Linear_Adapters
- runner: self-hosted-azure-gpus-1
script: L2_ASR_Adapters_RelPos_MHA_Adapters
- runner: self-hosted-azure
script: L2_Speech_to_Text_EMA
- runner: self-hosted-azure-gpus-1
script: L2_Speech_to_Text_AED
- runner: self-hosted-azure-gpus-1
script: L2_Speaker_dev_run_Speech_to_Label
- runner: self-hosted-azure
script: L2_Speech_Estimate_Duration_Bins
- runner: self-hosted-azure
script: L2_Speech_Batch_Size_OOMptimizer
- runner: self-hosted-azure
script: Optional_L2_Speech_Batch_Size_OOMptimizer_Canary
is-optional: true
# - runner: self-hosted-azure-gpus-1
# script: ASR_dev_run_Speech_to_Text
# - runner: self-hosted-azure-gpus-1
# script: ASR_dev_run_Speech_to_Text_WPE_CitriNet
# - runner: self-hosted-azure-gpus-1
# script: ASR_dev_run_Speech_Pre-training_-_CitriNet
# - runner: self-hosted-azure-gpus-1
# script: Optional_ASR_dev_run_Speech_To_Text_Finetuning
# is-optional: true
# - runner: self-hosted-azure-gpus-1
# script: Optional_ASR_dev_run_Speech_To_Text_HF_Finetuning
# is-optional: true
# - runner: self-hosted-azure-gpus-1
# script: ASR_dev_run_Speech_to_Text_WPE_-_Conformer
# - runner: self-hosted-azure-gpus-1
# script: ASR_dev_run-part_two_Speech_to_Text_WPE_-_Squeezeformer
# - runner: self-hosted-azure-gpus-1
# script: L2_ASR_Multi-dataloader_dev_run_Speech_to_Text_multi-dataloader
# - runner: self-hosted-azure-gpus-1
# script: L2_ASR_Multi-dataloader_dev_run_Speech_to_Label_multi-dataloader
# - runner: self-hosted-azure-gpus-1
# script: L2_ASR_Adapters_Linear_Adapters
# - runner: self-hosted-azure-gpus-1
# script: L2_ASR_Adapters_RelPos_MHA_Adapters
# - runner: self-hosted-azure
# script: L2_Speech_to_Text_EMA
# - runner: self-hosted-azure-gpus-1
# script: L2_Speech_to_Text_AED
# - runner: self-hosted-azure-gpus-1
# script: L2_Speaker_dev_run_Speech_to_Label
# - runner: self-hosted-azure
# script: L2_Speech_Estimate_Duration_Bins
# - runner: self-hosted-azure
# script: L2_Speech_Batch_Size_OOMptimizer
# - runner: self-hosted-azure
# script: Optional_L2_Speech_Batch_Size_OOMptimizer_Canary
# is-optional: true
- runner: self-hosted-azure
script: L2_Speech_Transcription_Speech_to_Text_Transcribe
- runner: self-hosted-azure
script: L2_Speech_Transcription_Canary_Transcribe_Full_Manifest
- runner: self-hosted-azure
script: L2_Speech_Transcription_Canary_Transcribe_With_Prompt
- runner: self-hosted-azure
script: L2_Speech_Transcription_Canary_Transcribe_Audio_Dir
- runner: self-hosted-azure
script: L2_Longform_Speech_Transcription_Canary_Chunked_Infer_from_Audio_Dir
- runner: self-hosted-azure
script: L2_Longform_Speech_Transcription_with_TimeStamps_Canary_Chunked_Infer_from_Audio_Dir
- runner: self-hosted-azure
script: L2_Longform_Speech_Transcription_with_TimeStamps_Canary_Chunked_Infer_from_Manifest
- runner: self-hosted-azure-gpus-1
script: Speech_Checkpoints_tests
timeout: 20
- runner: self-hosted-azure-gpus-1
script: L2_Speaker_dev_run_Speaker_Recognition
- runner: self-hosted-azure-gpus-1
script: L2_Speaker_dev_run_Speaker_Diarization
- runner: self-hosted-azure-gpus-1
script: L2_Speaker_dev_run_EndtoEnd_Speaker_Diarization_Sortformer
- runner: self-hosted-azure
script: L2_Speaker_dev_run_EndtoEnd_Diarizer_Inference
- runner: self-hosted-azure
script: L2_Speaker_dev_run_Speaker_Diarization_with_ASR_Inference
- runner: self-hosted-azure
script: L2_Speaker_dev_run_Clustering_Diarizer_Inference
- runner: self-hosted-azure
script: L2_Speaker_dev_run_Neural_Diarizer_Inference
- runner: self-hosted-azure
script: L2_Speaker_dev_run_Multispeaker_ASR_Data_Simulation
- runner: self-hosted-azure
script: L2_Segmentation_Tool_Parallel_ctc_segmentation_test_L2_Eng_CitriNet_with_wav
- runner: self-hosted-azure
script: L2_Segmentation_Tool_Parallel_ctc_segmentation_test_L2_Ru_QN_with_mp3
- script: L2_HF_Transformer_SpeechLM_SFT_2gpu
runner: self-hosted-azure
- script: L2_SpeechLM_LoRA_TP1PP1_MBS2
runner: self-hosted-azure
- runner: self-hosted-azure-gpus-1
script: L2_TTS_Fast_dev_runs_1_Tacotron_2
- runner: self-hosted-azure
script: L2_TTS_Fast_dev_runs_1_WaveGlow
- runner: self-hosted-azure
script: L2_TTS_Fast_dev_runs_1_FastPitch
- runner: self-hosted-azure
script: L2_TTS_Fast_dev_runs_1_Hifigan
- runner: self-hosted-azure
script: L2_G2P_Models_G2P_Conformer_training_evaluation_and_inference
- runner: self-hosted-azure
script: L2_G2P_Models_HeteronymClassificationModel_training_evaluation_and_inference
- runner: self-hosted-azure
script: SPEECHLM_HF_Training_DuplexS2S
- runner: self-hosted-azure
script: SPEECHLM_HF_Training_DuplexS2SSpeechDecoder
- runner: self-hosted-azure
script: SPEECHLM_HF_Training_SALM
script: L2_Speech_Transcription_Speech_to_Text_Streaming_Infer
- runner: self-hosted-azure
script: L2_Speech_Transcription_Speech_to_Text_Streaming_Infer_Timestamps
- runner: self-hosted-azure
script: L2_Speech_Transcription_Speech_to_Text_Cache_Aware_Infer
# - runner: self-hosted-azure
# script: L2_Speech_Transcription_Canary_Transcribe_Full_Manifest
# - runner: self-hosted-azure
# script: L2_Speech_Transcription_Canary_Transcribe_With_Prompt
# - runner: self-hosted-azure
# script: L2_Speech_Transcription_Canary_Transcribe_Audio_Dir
# - runner: self-hosted-azure
# script: L2_Longform_Speech_Transcription_Canary_Chunked_Infer_from_Audio_Dir
# - runner: self-hosted-azure
# script: L2_Longform_Speech_Transcription_with_TimeStamps_Canary_Chunked_Infer_from_Audio_Dir
# - runner: self-hosted-azure
# script: L2_Longform_Speech_Transcription_with_TimeStamps_Canary_Chunked_Infer_from_Manifest
# - runner: self-hosted-azure-gpus-1
# script: Speech_Checkpoints_tests
# timeout: 20
# - runner: self-hosted-azure-gpus-1
# script: L2_Speaker_dev_run_Speaker_Recognition
# - runner: self-hosted-azure-gpus-1
# script: L2_Speaker_dev_run_Speaker_Diarization
# - runner: self-hosted-azure-gpus-1
# script: L2_Speaker_dev_run_EndtoEnd_Speaker_Diarization_Sortformer
# - runner: self-hosted-azure
# script: L2_Speaker_dev_run_EndtoEnd_Diarizer_Inference
# - runner: self-hosted-azure
# script: L2_Speaker_dev_run_Speaker_Diarization_with_ASR_Inference
# - runner: self-hosted-azure
# script: L2_Speaker_dev_run_Clustering_Diarizer_Inference
# - runner: self-hosted-azure
# script: L2_Speaker_dev_run_Neural_Diarizer_Inference
# - runner: self-hosted-azure
# script: L2_Speaker_dev_run_Multispeaker_ASR_Data_Simulation
# - runner: self-hosted-azure
# script: L2_Segmentation_Tool_Parallel_ctc_segmentation_test_L2_Eng_CitriNet_with_wav
# - runner: self-hosted-azure
# script: L2_Segmentation_Tool_Parallel_ctc_segmentation_test_L2_Ru_QN_with_mp3
# - script: L2_HF_Transformer_SpeechLM_SFT_2gpu
# runner: self-hosted-azure
# - script: L2_SpeechLM_LoRA_TP1PP1_MBS2
# runner: self-hosted-azure
# - runner: self-hosted-azure-gpus-1
# script: L2_TTS_Fast_dev_runs_1_Tacotron_2
# - runner: self-hosted-azure
# script: L2_TTS_Fast_dev_runs_1_WaveGlow
# - runner: self-hosted-azure
# script: L2_TTS_Fast_dev_runs_1_FastPitch
# - runner: self-hosted-azure
# script: L2_TTS_Fast_dev_runs_1_Hifigan
# - runner: self-hosted-azure
# script: L2_G2P_Models_G2P_Conformer_training_evaluation_and_inference
# - runner: self-hosted-azure
# script: L2_G2P_Models_HeteronymClassificationModel_training_evaluation_and_inference
# - runner: self-hosted-azure
# script: SPEECHLM_HF_Training_DuplexS2S
# - runner: self-hosted-azure
# script: SPEECHLM_HF_Training_DuplexS2SSpeechDecoder
# - runner: self-hosted-azure
# script: SPEECHLM_HF_Training_SALM
needs: [unit-tests]
runs-on: ${{ matrix.runner }}
name: ${{ matrix.is-optional && 'PLEASEFIXME_' || '' }}${{ matrix.script }}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -97,6 +97,7 @@
"""


import glob
import json
import os
import time
Expand Down Expand Up @@ -128,7 +129,8 @@ class TranscriptionConfig:
# Required configs
model_path: Optional[str] = None # Path to a .nemo file
pretrained_name: Optional[str] = None # Name of a pretrained model
# audio_dir: Optional[str] = None # Path to a directory which contains audio files
audio_dir: Optional[str] = None # Path to a directory which contains audio files
audio_type: str = "wav" # type of audio file if audio_dir passed
audio_file: Optional[str] = None # Path to an audio file to perform streaming
dataset_manifest: Optional[str] = None # Path to dataset's JSON manifest
output_path: Optional[str] = None # Path to output file when manifest is used as input
Expand Down Expand Up @@ -329,10 +331,8 @@ def main(cfg: TranscriptionConfig):
f"Compute dtype {compute_dtype} is not yet supported for cache-aware models, use float32 instead"
)

if (cfg.audio_file is None and cfg.dataset_manifest is None) or (
cfg.audio_file is not None and cfg.dataset_manifest is not None
):
raise ValueError("One of the audio_file and dataset_manifest should be non-empty!")
if sum((cfg.audio_file is not None, cfg.dataset_manifest is not None, cfg.audio_dir is not None)) != 1:
raise ValueError("Exactly one of the `audio_file`, `dataset_manifest` or `audio_dir` should be non-empty!")

asr_model, model_name = setup_model(cfg=cfg, map_location=device)

Expand Down Expand Up @@ -414,15 +414,26 @@ def main(cfg: TranscriptionConfig):
all_refs_text = []
batch_size = cfg.batch_size

manifest_dir = Path(cfg.dataset_manifest).parent
samples = read_manifest(cfg.dataset_manifest)
# fix relative paths
for item in samples:
audio_filepath = Path(item["audio_filepath"])
if not audio_filepath.is_absolute():
item["audio_filepath"] = str(manifest_dir / audio_filepath)

logging.info(f"Loaded {len(samples)} from the manifest at {cfg.dataset_manifest}.")
if cfg.dataset_manifest is not None:
manifest_dir = Path(cfg.dataset_manifest).parent
samples = read_manifest(cfg.dataset_manifest)
# fix relative paths
for item in samples:
audio_filepath = Path(item["audio_filepath"])
if not audio_filepath.is_absolute():
item["audio_filepath"] = str(manifest_dir / audio_filepath)

logging.info(f"Loaded {len(samples)} from the manifest at {cfg.dataset_manifest}.")
dataset_title = os.path.splitext(os.path.basename(cfg.dataset_manifest))[0]
else:
assert cfg.audio_dir is not None
samples = [
{"audio_filepath": audio_filepath}
for audio_filepath in (
glob.glob(os.path.join(cfg.audio_dir, f"**/*.{cfg.audio_type}"), recursive=True)
)
]
dataset_title = os.path.basename(cfg.audio_dir)

start_time = time.time()
for sample_idx, sample in enumerate(samples):
Expand Down Expand Up @@ -461,11 +472,7 @@ def main(cfg: TranscriptionConfig):
# stores the results including the transcriptions of the streaming inference in a json file
if cfg.output_path is not None and len(all_refs_text) == len(all_streaming_tran):
fname = (
"streaming_out_"
+ os.path.splitext(os.path.basename(model_name))[0]
+ "_"
+ os.path.splitext(os.path.basename(cfg.dataset_manifest))[0]
+ ".json"
"streaming_out_" + os.path.splitext(os.path.basename(model_name))[0] + "_" + dataset_title + ".json"
)

hyp_json = os.path.join(cfg.output_path, fname)
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
# Copyright (c) 2020-2025, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
coverage run -a --data-file=/workspace/.coverage --source=/workspace/nemo \
examples/asr/asr_cache_aware_streaming/speech_to_text_cache_aware_streaming_infer.py \
pretrained_name="stt_en_fastconformer_hybrid_large_streaming_multi" \
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please add this audio subset to the TestData directory and reference it from there.

audio_dir="/home/TestData/an4_transcribe/test_subset/" \
output_path="/tmp/stt_cache_aware_streaming_test_res"
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
# Copyright (c) 2020-2025, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# CI smoke test: chunked streaming RNNT transcription over a directory of audio
# files, run under coverage.py so line data accumulates in the shared
# /workspace/.coverage data file (-a appends across CI test scripts).
# Inputs are machine-local CI fixtures: a .nemo FastConformer transducer
# checkpoint and the an4 test-subset audio directory.
# chunk_secs / left_context_secs / right_context_secs select a 2s chunk with
# 10s left and 2s right context for the streaming decoder.
coverage run -a --data-file=/workspace/.coverage --source=/workspace/nemo \
examples/asr/asr_chunked_inference/rnnt/speech_to_text_streaming_infer_rnnt.py \
model_path="/home/TestData/asr/stt_en_fastconformer_transducer_large.nemo" \
audio_dir="/home/TestData/an4_transcribe/test_subset/" \
chunk_secs=2.0 \
left_context_secs=10.0 \
right_context_secs=2.0 \
output_filename="/tmp/stt_streaming_test_res.json"
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
# Copyright (c) 2020-2025, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# CI smoke test: same chunked streaming RNNT transcription as the non-timestamp
# variant, but with timestamps=true to exercise word/segment timestamp output.
# Runs under coverage.py; -a appends into the shared /workspace/.coverage file.
# NOTE(review): output_filename is identical to the non-timestamp test's path
# (/tmp/stt_streaming_test_res.json) — harmless if each CI job runs in its own
# container, but the two would overwrite each other on a shared host; confirm.
coverage run -a --data-file=/workspace/.coverage --source=/workspace/nemo \
examples/asr/asr_chunked_inference/rnnt/speech_to_text_streaming_infer_rnnt.py \
model_path="/home/TestData/asr/stt_en_fastconformer_transducer_large.nemo" \
audio_dir="/home/TestData/an4_transcribe/test_subset/" \
chunk_secs=2.0 \
left_context_secs=10.0 \
right_context_secs=2.0 \
timestamps=true \
output_filename="/tmp/stt_streaming_test_res.json"
Loading