From 5cc77be0c8becc4b742d6e14201c27f3ecd67e61 Mon Sep 17 00:00:00 2001
From: Vladimir Bataev <vbataev@nvidia.com>
Date: Fri, 26 Sep 2025 23:46:16 +0400
Subject: [PATCH 01/13] Add tests for streaming buffered and cache-aware
 transducer models

Signed-off-by: Vladimir Bataev <vbataev@nvidia.com>
---
 .github/workflows/cicd-main-speech.yml        |  4 ++++
 ...n_Speech_to_Text_Cache_Aware_Transcribe.sh | 18 ++++++++++++++++
 ...ion_Speech_to_Text_Streaming_Transcribe.sh | 21 +++++++++++++++++++
 3 files changed, 43 insertions(+)
 create mode 100755 tests/functional_tests/L2_Speech_Transcription_Speech_to_Text_Cache_Aware_Transcribe.sh
 create mode 100755 tests/functional_tests/L2_Speech_Transcription_Speech_to_Text_Streaming_Transcribe.sh

diff --git a/.github/workflows/cicd-main-speech.yml b/.github/workflows/cicd-main-speech.yml
index a9febfc115f9..5465e14bf8db 100644
--- a/.github/workflows/cicd-main-speech.yml
+++ b/.github/workflows/cicd-main-speech.yml
@@ -123,6 +123,10 @@ jobs:
             is-optional: true
           - runner: self-hosted-azure
             script: L2_Speech_Transcription_Speech_to_Text_Transcribe
+          - runner: self-hosted-azure
+            script: L2_Speech_Transcription_Speech_to_Text_Streaming_Transcribe
+          - runner: self-hosted-azure
+            script: L2_Speech_Transcription_Speech_to_Text_Cache_Aware_Transcribe
           - runner: self-hosted-azure
             script: L2_Speech_Transcription_Canary_Transcribe_Full_Manifest
           - runner: self-hosted-azure
diff --git a/tests/functional_tests/L2_Speech_Transcription_Speech_to_Text_Cache_Aware_Transcribe.sh b/tests/functional_tests/L2_Speech_Transcription_Speech_to_Text_Cache_Aware_Transcribe.sh
new file mode 100755
index 000000000000..ee0384a8e7c9
--- /dev/null
+++ b/tests/functional_tests/L2_Speech_Transcription_Speech_to_Text_Cache_Aware_Transcribe.sh
@@ -0,0 +1,18 @@
+# Copyright (c) 2020-2025, NVIDIA CORPORATION.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+coverage run -a --data-file=/workspace/.coverage --source=/workspace/nemo \
+    examples/asr/asr_cache_aware_streaming/speech_to_text_cache_aware_streaming_infer.py \
+    pretrained_name="stt_en_fastconformer_hybrid_large_streaming_multi" \
+    audio_dir="/home/TestData/an4_transcribe/test_subset/" \
+    output_filename="/tmp/stt_cache_aware_streaming_test_res.json"
diff --git a/tests/functional_tests/L2_Speech_Transcription_Speech_to_Text_Streaming_Transcribe.sh b/tests/functional_tests/L2_Speech_Transcription_Speech_to_Text_Streaming_Transcribe.sh
new file mode 100755
index 000000000000..0a96c328e47a
--- /dev/null
+++ b/tests/functional_tests/L2_Speech_Transcription_Speech_to_Text_Streaming_Transcribe.sh
@@ -0,0 +1,21 @@
+# Copyright (c) 2020-2025, NVIDIA CORPORATION.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+coverage run -a --data-file=/workspace/.coverage --source=/workspace/nemo \
+    examples/asr/asr_chunked_inference/rnnt/speech_to_text_streaming_infer_rnnt.py \
+    pretrained_name="nvidia/stt_en_fastconformer_transducer_large" \
+    audio_dir="/home/TestData/an4_transcribe/test_subset/" \
+    chunk_secs=2.0 \
+    left_context_secs=10.0 \
+    right_context_secs=2.0 \
+    output_filename="/tmp/stt_streaming_test_res.json"

From 674d66ba4caab885463b4832036e5e03253f14b9 Mon Sep 17 00:00:00 2001
From: Vladimir Bataev <vbataev@nvidia.com>
Date: Sat, 27 Sep 2025 00:02:57 +0400
Subject: [PATCH 02/13] Support audio_dir in cache-aware streaming script

Signed-off-by: Vladimir Bataev <vbataev@nvidia.com>
---
 ...ech_to_text_cache_aware_streaming_infer.py | 45 +++++++++++--------
 1 file changed, 26 insertions(+), 19 deletions(-)

diff --git a/examples/asr/asr_cache_aware_streaming/speech_to_text_cache_aware_streaming_infer.py b/examples/asr/asr_cache_aware_streaming/speech_to_text_cache_aware_streaming_infer.py
index 9fddf368d23f..c6674ad6d053 100644
--- a/examples/asr/asr_cache_aware_streaming/speech_to_text_cache_aware_streaming_infer.py
+++ b/examples/asr/asr_cache_aware_streaming/speech_to_text_cache_aware_streaming_infer.py
@@ -97,6 +97,7 @@
 """
 
 
+import glob
 import json
 import os
 import time
@@ -128,7 +129,8 @@ class TranscriptionConfig:
     # Required configs
     model_path: Optional[str] = None  # Path to a .nemo file
     pretrained_name: Optional[str] = None  # Name of a pretrained model
-    # audio_dir: Optional[str] = None  # Path to a directory which contains audio files
+    audio_dir: Optional[str] = None  # Path to a directory which contains audio files
+    audio_type: str = "wav"  # type of audio file if audio_dir passed
     audio_file: Optional[str] = None  # Path to an audio file to perform streaming
     dataset_manifest: Optional[str] = None  # Path to dataset's JSON manifest
     output_path: Optional[str] = None  # Path to output file when manifest is used as input
@@ -329,10 +331,8 @@ def main(cfg: TranscriptionConfig):
             f"Compute dtype {compute_dtype} is not yet supported for cache-aware models, use float32 instead"
         )
 
-    if (cfg.audio_file is None and cfg.dataset_manifest is None) or (
-        cfg.audio_file is not None and cfg.dataset_manifest is not None
-    ):
-        raise ValueError("One of the audio_file and dataset_manifest should be non-empty!")
+    if sum((cfg.audio_file is not None, cfg.dataset_manifest is not None, cfg.audio_dir is not None)):
+        raise ValueError("Exactly one of the `audio_file`, `dataset_manifest` or `audio_dir` should be non-empty!")
 
     asr_model, model_name = setup_model(cfg=cfg, map_location=device)
 
@@ -414,15 +414,26 @@ def main(cfg: TranscriptionConfig):
             all_refs_text = []
             batch_size = cfg.batch_size
 
-            manifest_dir = Path(cfg.dataset_manifest).parent
-            samples = read_manifest(cfg.dataset_manifest)
-            # fix relative paths
-            for item in samples:
-                audio_filepath = Path(item["audio_filepath"])
-                if not audio_filepath.is_absolute():
-                    item["audio_filepath"] = str(manifest_dir / audio_filepath)
-
-            logging.info(f"Loaded {len(samples)} from the manifest at {cfg.dataset_manifest}.")
+            if cfg.dataset_manifest is not None:
+                manifest_dir = Path(cfg.dataset_manifest).parent
+                samples = read_manifest(cfg.dataset_manifest)
+                # fix relative paths
+                for item in samples:
+                    audio_filepath = Path(item["audio_filepath"])
+                    if not audio_filepath.is_absolute():
+                        item["audio_filepath"] = str(manifest_dir / audio_filepath)
+
+                logging.info(f"Loaded {len(samples)} from the manifest at {cfg.dataset_manifest}.")
+                dataset_title = os.path.splitext(os.path.basename(cfg.dataset_manifest))[0]
+            else:
+                assert cfg.audio_dir is not None
+                samples = [
+                    {"audio_filepath": audio_filepath}
+                    for audio_filepath in (
+                        glob.glob(os.path.join(cfg.audio_dir, f"**/*.{cfg.audio_type}"), recursive=True)
+                    )
+                ]
+                dataset_title = os.path.basename(cfg.audio_dir)
 
             start_time = time.time()
             for sample_idx, sample in enumerate(samples):
@@ -461,11 +472,7 @@ def main(cfg: TranscriptionConfig):
         # stores the results including the transcriptions of the streaming inference in a json file
         if cfg.output_path is not None and len(all_refs_text) == len(all_streaming_tran):
             fname = (
-                "streaming_out_"
-                + os.path.splitext(os.path.basename(model_name))[0]
-                + "_"
-                + os.path.splitext(os.path.basename(cfg.dataset_manifest))[0]
-                + ".json"
+                "streaming_out_" + os.path.splitext(os.path.basename(model_name))[0] + "_" + dataset_title + ".json"
             )
 
             hyp_json = os.path.join(cfg.output_path, fname)

From be7965ccb22382740a0f15f676af2ee77674a845 Mon Sep 17 00:00:00 2001
From: Vladimir Bataev <vbataev@nvidia.com>
Date: Mon, 6 Oct 2025 15:38:43 +0400
Subject: [PATCH 03/13] Fix script names. Fix parameters

Signed-off-by: Vladimir Bataev <vbataev@nvidia.com>
---
 .github/workflows/cicd-main-speech.yml                        | 4 ++--
 ..._Speech_Transcription_Speech_to_Text_Cache_Aware_Infer.sh} | 2 +-
 ...L2_Speech_Transcription_Speech_to_Text_Streaming_Infer.sh} | 0
 3 files changed, 3 insertions(+), 3 deletions(-)
 rename tests/functional_tests/{L2_Speech_Transcription_Speech_to_Text_Cache_Aware_Transcribe.sh => L2_Speech_Transcription_Speech_to_Text_Cache_Aware_Infer.sh} (93%)
 rename tests/functional_tests/{L2_Speech_Transcription_Speech_to_Text_Streaming_Transcribe.sh => L2_Speech_Transcription_Speech_to_Text_Streaming_Infer.sh} (100%)

diff --git a/.github/workflows/cicd-main-speech.yml b/.github/workflows/cicd-main-speech.yml
index 5465e14bf8db..d87700f840e2 100644
--- a/.github/workflows/cicd-main-speech.yml
+++ b/.github/workflows/cicd-main-speech.yml
@@ -124,9 +124,9 @@ jobs:
           - runner: self-hosted-azure
             script: L2_Speech_Transcription_Speech_to_Text_Transcribe
           - runner: self-hosted-azure
-            script: L2_Speech_Transcription_Speech_to_Text_Streaming_Transcribe
+            script: L2_Speech_Transcription_Speech_to_Text_Streaming_Infer
           - runner: self-hosted-azure
-            script: L2_Speech_Transcription_Speech_to_Text_Cache_Aware_Transcribe
+            script: L2_Speech_Transcription_Speech_to_Text_Cache_Aware_Infer
           - runner: self-hosted-azure
             script: L2_Speech_Transcription_Canary_Transcribe_Full_Manifest
           - runner: self-hosted-azure
diff --git a/tests/functional_tests/L2_Speech_Transcription_Speech_to_Text_Cache_Aware_Transcribe.sh b/tests/functional_tests/L2_Speech_Transcription_Speech_to_Text_Cache_Aware_Infer.sh
similarity index 93%
rename from tests/functional_tests/L2_Speech_Transcription_Speech_to_Text_Cache_Aware_Transcribe.sh
rename to tests/functional_tests/L2_Speech_Transcription_Speech_to_Text_Cache_Aware_Infer.sh
index ee0384a8e7c9..7afcc08397af 100755
--- a/tests/functional_tests/L2_Speech_Transcription_Speech_to_Text_Cache_Aware_Transcribe.sh
+++ b/tests/functional_tests/L2_Speech_Transcription_Speech_to_Text_Cache_Aware_Infer.sh
@@ -15,4 +15,4 @@ coverage run -a --data-file=/workspace/.coverage --source=/workspace/nemo \
     examples/asr/asr_cache_aware_streaming/speech_to_text_cache_aware_streaming_infer.py \
     pretrained_name="stt_en_fastconformer_hybrid_large_streaming_multi" \
     audio_dir="/home/TestData/an4_transcribe/test_subset/" \
-    output_filename="/tmp/stt_cache_aware_streaming_test_res.json"
+    output_path="/tmp/stt_cache_aware_streaming_test_res"
diff --git a/tests/functional_tests/L2_Speech_Transcription_Speech_to_Text_Streaming_Transcribe.sh b/tests/functional_tests/L2_Speech_Transcription_Speech_to_Text_Streaming_Infer.sh
similarity index 100%
rename from tests/functional_tests/L2_Speech_Transcription_Speech_to_Text_Streaming_Transcribe.sh
rename to tests/functional_tests/L2_Speech_Transcription_Speech_to_Text_Streaming_Infer.sh

From 6150bdc47458949dfeaebabe1bb663e6b9f5b05c Mon Sep 17 00:00:00 2001
From: Vladimir Bataev <vbataev@nvidia.com>
Date: Mon, 6 Oct 2025 15:38:53 +0400
Subject: [PATCH 04/13] Comment out extra tests

Signed-off-by: Vladimir Bataev <vbataev@nvidia.com>
---
 .github/workflows/cicd-main-speech.yml | 186 ++++++++++++-------------
 1 file changed, 93 insertions(+), 93 deletions(-)

diff --git a/.github/workflows/cicd-main-speech.yml b/.github/workflows/cicd-main-speech.yml
index d87700f840e2..b6f881a71f1d 100644
--- a/.github/workflows/cicd-main-speech.yml
+++ b/.github/workflows/cicd-main-speech.yml
@@ -84,43 +84,43 @@ jobs:
       fail-fast: false
       matrix:
         include:
-          - runner: self-hosted-azure-gpus-1
-            script: ASR_dev_run_Speech_to_Text
-          - runner: self-hosted-azure-gpus-1
-            script: ASR_dev_run_Speech_to_Text_WPE_CitriNet
-          - runner: self-hosted-azure-gpus-1
-            script: ASR_dev_run_Speech_Pre-training_-_CitriNet
-          - runner: self-hosted-azure-gpus-1
-            script: Optional_ASR_dev_run_Speech_To_Text_Finetuning
-            is-optional: true
-          - runner: self-hosted-azure-gpus-1
-            script: Optional_ASR_dev_run_Speech_To_Text_HF_Finetuning
-            is-optional: true
-          - runner: self-hosted-azure-gpus-1
-            script: ASR_dev_run_Speech_to_Text_WPE_-_Conformer
-          - runner: self-hosted-azure-gpus-1
-            script: ASR_dev_run-part_two_Speech_to_Text_WPE_-_Squeezeformer
-          - runner: self-hosted-azure-gpus-1
-            script: L2_ASR_Multi-dataloader_dev_run_Speech_to_Text_multi-dataloader
-          - runner: self-hosted-azure-gpus-1
-            script: L2_ASR_Multi-dataloader_dev_run_Speech_to_Label_multi-dataloader
-          - runner: self-hosted-azure-gpus-1
-            script: L2_ASR_Adapters_Linear_Adapters
-          - runner: self-hosted-azure-gpus-1
-            script: L2_ASR_Adapters_RelPos_MHA_Adapters
-          - runner: self-hosted-azure
-            script: L2_Speech_to_Text_EMA
-          - runner: self-hosted-azure-gpus-1
-            script: L2_Speech_to_Text_AED
-          - runner: self-hosted-azure-gpus-1
-            script: L2_Speaker_dev_run_Speech_to_Label
-          - runner: self-hosted-azure
-            script: L2_Speech_Estimate_Duration_Bins
-          - runner: self-hosted-azure
-            script: L2_Speech_Batch_Size_OOMptimizer
-          - runner: self-hosted-azure
-            script: Optional_L2_Speech_Batch_Size_OOMptimizer_Canary
-            is-optional: true
+#          - runner: self-hosted-azure-gpus-1
+#            script: ASR_dev_run_Speech_to_Text
+#          - runner: self-hosted-azure-gpus-1
+#            script: ASR_dev_run_Speech_to_Text_WPE_CitriNet
+#          - runner: self-hosted-azure-gpus-1
+#            script: ASR_dev_run_Speech_Pre-training_-_CitriNet
+#          - runner: self-hosted-azure-gpus-1
+#            script: Optional_ASR_dev_run_Speech_To_Text_Finetuning
+#            is-optional: true
+#          - runner: self-hosted-azure-gpus-1
+#            script: Optional_ASR_dev_run_Speech_To_Text_HF_Finetuning
+#            is-optional: true
+#          - runner: self-hosted-azure-gpus-1
+#            script: ASR_dev_run_Speech_to_Text_WPE_-_Conformer
+#          - runner: self-hosted-azure-gpus-1
+#            script: ASR_dev_run-part_two_Speech_to_Text_WPE_-_Squeezeformer
+#          - runner: self-hosted-azure-gpus-1
+#            script: L2_ASR_Multi-dataloader_dev_run_Speech_to_Text_multi-dataloader
+#          - runner: self-hosted-azure-gpus-1
+#            script: L2_ASR_Multi-dataloader_dev_run_Speech_to_Label_multi-dataloader
+#          - runner: self-hosted-azure-gpus-1
+#            script: L2_ASR_Adapters_Linear_Adapters
+#          - runner: self-hosted-azure-gpus-1
+#            script: L2_ASR_Adapters_RelPos_MHA_Adapters
+#          - runner: self-hosted-azure
+#            script: L2_Speech_to_Text_EMA
+#          - runner: self-hosted-azure-gpus-1
+#            script: L2_Speech_to_Text_AED
+#          - runner: self-hosted-azure-gpus-1
+#            script: L2_Speaker_dev_run_Speech_to_Label
+#          - runner: self-hosted-azure
+#            script: L2_Speech_Estimate_Duration_Bins
+#          - runner: self-hosted-azure
+#            script: L2_Speech_Batch_Size_OOMptimizer
+#          - runner: self-hosted-azure
+#            script: Optional_L2_Speech_Batch_Size_OOMptimizer_Canary
+#            is-optional: true
           - runner: self-hosted-azure
             script: L2_Speech_Transcription_Speech_to_Text_Transcribe
           - runner: self-hosted-azure
@@ -128,62 +128,62 @@ jobs:
           - runner: self-hosted-azure
             script: L2_Speech_Transcription_Speech_to_Text_Cache_Aware_Infer
           - runner: self-hosted-azure
-            script: L2_Speech_Transcription_Canary_Transcribe_Full_Manifest
-          - runner: self-hosted-azure
-            script: L2_Speech_Transcription_Canary_Transcribe_With_Prompt
-          - runner: self-hosted-azure
-            script: L2_Speech_Transcription_Canary_Transcribe_Audio_Dir
-          - runner: self-hosted-azure
-            script: L2_Longform_Speech_Transcription_Canary_Chunked_Infer_from_Audio_Dir
-          - runner: self-hosted-azure
-            script: L2_Longform_Speech_Transcription_with_TimeStamps_Canary_Chunked_Infer_from_Audio_Dir
-          - runner: self-hosted-azure
-            script: L2_Longform_Speech_Transcription_with_TimeStamps_Canary_Chunked_Infer_from_Manifest
-          - runner: self-hosted-azure-gpus-1
-            script: Speech_Checkpoints_tests
-            timeout: 20
-          - runner: self-hosted-azure-gpus-1
-            script: L2_Speaker_dev_run_Speaker_Recognition
-          - runner: self-hosted-azure-gpus-1
-            script: L2_Speaker_dev_run_Speaker_Diarization
-          - runner: self-hosted-azure-gpus-1
-            script: L2_Speaker_dev_run_EndtoEnd_Speaker_Diarization_Sortformer
-          - runner: self-hosted-azure
-            script: L2_Speaker_dev_run_EndtoEnd_Diarizer_Inference
-          - runner: self-hosted-azure
-            script: L2_Speaker_dev_run_Speaker_Diarization_with_ASR_Inference
-          - runner: self-hosted-azure
-            script: L2_Speaker_dev_run_Clustering_Diarizer_Inference
-          - runner: self-hosted-azure
-            script: L2_Speaker_dev_run_Neural_Diarizer_Inference
-          - runner: self-hosted-azure
-            script: L2_Speaker_dev_run_Multispeaker_ASR_Data_Simulation
-          - runner: self-hosted-azure
-            script: L2_Segmentation_Tool_Parallel_ctc_segmentation_test_L2_Eng_CitriNet_with_wav
-          - runner: self-hosted-azure
-            script: L2_Segmentation_Tool_Parallel_ctc_segmentation_test_L2_Ru_QN_with_mp3
-          - script: L2_HF_Transformer_SpeechLM_SFT_2gpu
-            runner: self-hosted-azure
-          - script: L2_SpeechLM_LoRA_TP1PP1_MBS2
-            runner: self-hosted-azure
-          - runner: self-hosted-azure-gpus-1
-            script: L2_TTS_Fast_dev_runs_1_Tacotron_2
-          - runner: self-hosted-azure
-            script: L2_TTS_Fast_dev_runs_1_WaveGlow
-          - runner: self-hosted-azure
-            script: L2_TTS_Fast_dev_runs_1_FastPitch
-          - runner: self-hosted-azure
-            script: L2_TTS_Fast_dev_runs_1_Hifigan
-          - runner: self-hosted-azure
-            script: L2_G2P_Models_G2P_Conformer_training_evaluation_and_inference
-          - runner: self-hosted-azure
-            script: L2_G2P_Models_HeteronymClassificationModel_training_evaluation_and_inference
-          - runner: self-hosted-azure
-            script: SPEECHLM_HF_Training_DuplexS2S
-          - runner: self-hosted-azure
-            script: SPEECHLM_HF_Training_DuplexS2SSpeechDecoder
-          - runner: self-hosted-azure
-            script: SPEECHLM_HF_Training_SALM
+#            script: L2_Speech_Transcription_Canary_Transcribe_Full_Manifest
+#          - runner: self-hosted-azure
+#            script: L2_Speech_Transcription_Canary_Transcribe_With_Prompt
+#          - runner: self-hosted-azure
+#            script: L2_Speech_Transcription_Canary_Transcribe_Audio_Dir
+#          - runner: self-hosted-azure
+#            script: L2_Longform_Speech_Transcription_Canary_Chunked_Infer_from_Audio_Dir
+#          - runner: self-hosted-azure
+#            script: L2_Longform_Speech_Transcription_with_TimeStamps_Canary_Chunked_Infer_from_Audio_Dir
+#          - runner: self-hosted-azure
+#            script: L2_Longform_Speech_Transcription_with_TimeStamps_Canary_Chunked_Infer_from_Manifest
+#          - runner: self-hosted-azure-gpus-1
+#            script: Speech_Checkpoints_tests
+#            timeout: 20
+#          - runner: self-hosted-azure-gpus-1
+#            script: L2_Speaker_dev_run_Speaker_Recognition
+#          - runner: self-hosted-azure-gpus-1
+#            script: L2_Speaker_dev_run_Speaker_Diarization
+#          - runner: self-hosted-azure-gpus-1
+#            script: L2_Speaker_dev_run_EndtoEnd_Speaker_Diarization_Sortformer
+#          - runner: self-hosted-azure
+#            script: L2_Speaker_dev_run_EndtoEnd_Diarizer_Inference
+#          - runner: self-hosted-azure
+#            script: L2_Speaker_dev_run_Speaker_Diarization_with_ASR_Inference
+#          - runner: self-hosted-azure
+#            script: L2_Speaker_dev_run_Clustering_Diarizer_Inference
+#          - runner: self-hosted-azure
+#            script: L2_Speaker_dev_run_Neural_Diarizer_Inference
+#          - runner: self-hosted-azure
+#            script: L2_Speaker_dev_run_Multispeaker_ASR_Data_Simulation
+#          - runner: self-hosted-azure
+#            script: L2_Segmentation_Tool_Parallel_ctc_segmentation_test_L2_Eng_CitriNet_with_wav
+#          - runner: self-hosted-azure
+#            script: L2_Segmentation_Tool_Parallel_ctc_segmentation_test_L2_Ru_QN_with_mp3
+#          - script: L2_HF_Transformer_SpeechLM_SFT_2gpu
+#            runner: self-hosted-azure
+#          - script: L2_SpeechLM_LoRA_TP1PP1_MBS2
+#            runner: self-hosted-azure
+#          - runner: self-hosted-azure-gpus-1
+#            script: L2_TTS_Fast_dev_runs_1_Tacotron_2
+#          - runner: self-hosted-azure
+#            script: L2_TTS_Fast_dev_runs_1_WaveGlow
+#          - runner: self-hosted-azure
+#            script: L2_TTS_Fast_dev_runs_1_FastPitch
+#          - runner: self-hosted-azure
+#            script: L2_TTS_Fast_dev_runs_1_Hifigan
+#          - runner: self-hosted-azure
+#            script: L2_G2P_Models_G2P_Conformer_training_evaluation_and_inference
+#          - runner: self-hosted-azure
+#            script: L2_G2P_Models_HeteronymClassificationModel_training_evaluation_and_inference
+#          - runner: self-hosted-azure
+#            script: SPEECHLM_HF_Training_DuplexS2S
+#          - runner: self-hosted-azure
+#            script: SPEECHLM_HF_Training_DuplexS2SSpeechDecoder
+#          - runner: self-hosted-azure
+#            script: SPEECHLM_HF_Training_SALM
     needs: [unit-tests]
     runs-on: ${{ matrix.runner }}
     name: ${{ matrix.is-optional && 'PLEASEFIXME_' || '' }}${{ matrix.script }}

From 876b6090091e4c850011ccd7cb439b77e29351e2 Mon Sep 17 00:00:00 2001
From: Vladimir Bataev <vbataev@nvidia.com>
Date: Tue, 7 Oct 2025 20:01:05 +0400
Subject: [PATCH 05/13] Fix input source check in cache-aware streaming script

Signed-off-by: Vladimir Bataev <vbataev@nvidia.com>
---
 .../speech_to_text_cache_aware_streaming_infer.py               | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/asr/asr_cache_aware_streaming/speech_to_text_cache_aware_streaming_infer.py b/examples/asr/asr_cache_aware_streaming/speech_to_text_cache_aware_streaming_infer.py
index c6674ad6d053..37fbd065e00a 100644
--- a/examples/asr/asr_cache_aware_streaming/speech_to_text_cache_aware_streaming_infer.py
+++ b/examples/asr/asr_cache_aware_streaming/speech_to_text_cache_aware_streaming_infer.py
@@ -331,7 +331,7 @@ def main(cfg: TranscriptionConfig):
             f"Compute dtype {compute_dtype} is not yet supported for cache-aware models, use float32 instead"
         )
 
-    if sum((cfg.audio_file is not None, cfg.dataset_manifest is not None, cfg.audio_dir is not None)):
+    if sum((cfg.audio_file is not None, cfg.dataset_manifest is not None, cfg.audio_dir is not None)) != 1:
         raise ValueError("Exactly one of the `audio_file`, `dataset_manifest` or `audio_dir` should be non-empty!")
 
     asr_model, model_name = setup_model(cfg=cfg, map_location=device)

From 2768defc6b6343117396d4c27d9e877fc7226f6a Mon Sep 17 00:00:00 2001
From: Vladimir Bataev <vbataev@nvidia.com>
Date: Tue, 7 Oct 2025 20:53:40 +0400
Subject: [PATCH 06/13] Add test with timestamps

Signed-off-by: Vladimir Bataev <vbataev@nvidia.com>
---
 .github/workflows/cicd-main-speech.yml        |  4 +++-
 ...eech_to_Text_Streaming_Infer_Timestamps.sh | 22 +++++++++++++++++++
 2 files changed, 25 insertions(+), 1 deletion(-)
 create mode 100755 tests/functional_tests/L2_Speech_Transcription_Speech_to_Text_Streaming_Infer_Timestamps.sh

diff --git a/.github/workflows/cicd-main-speech.yml b/.github/workflows/cicd-main-speech.yml
index b6f881a71f1d..da9861008b63 100644
--- a/.github/workflows/cicd-main-speech.yml
+++ b/.github/workflows/cicd-main-speech.yml
@@ -126,8 +126,10 @@ jobs:
           - runner: self-hosted-azure
             script: L2_Speech_Transcription_Speech_to_Text_Streaming_Infer
           - runner: self-hosted-azure
-            script: L2_Speech_Transcription_Speech_to_Text_Cache_Aware_Infer
+            script: L2_Speech_Transcription_Speech_to_Text_Streaming_Infer_Timestamps
           - runner: self-hosted-azure
+            script: L2_Speech_Transcription_Speech_to_Text_Cache_Aware_Infer
+#          - runner: self-hosted-azure
 #            script: L2_Speech_Transcription_Canary_Transcribe_Full_Manifest
 #          - runner: self-hosted-azure
 #            script: L2_Speech_Transcription_Canary_Transcribe_With_Prompt
diff --git a/tests/functional_tests/L2_Speech_Transcription_Speech_to_Text_Streaming_Infer_Timestamps.sh b/tests/functional_tests/L2_Speech_Transcription_Speech_to_Text_Streaming_Infer_Timestamps.sh
new file mode 100755
index 000000000000..148164e6fdd2
--- /dev/null
+++ b/tests/functional_tests/L2_Speech_Transcription_Speech_to_Text_Streaming_Infer_Timestamps.sh
@@ -0,0 +1,22 @@
+# Copyright (c) 2020-2025, NVIDIA CORPORATION.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+coverage run -a --data-file=/workspace/.coverage --source=/workspace/nemo \
+    examples/asr/asr_chunked_inference/rnnt/speech_to_text_streaming_infer_rnnt.py \
+    pretrained_name="nvidia/stt_en_fastconformer_transducer_large" \
+    audio_dir="/home/TestData/an4_transcribe/test_subset/" \
+    chunk_secs=2.0 \
+    left_context_secs=10.0 \
+    right_context_secs=2.0 \
+    timestamps=true \
+    output_filename="/tmp/stt_streaming_test_res.json"

From 0f6a37ff8df93ec9fe1da540a3e0b3d8d8013c44 Mon Sep 17 00:00:00 2001
From: Vladimir Bataev <vbataev@nvidia.com>
Date: Tue, 7 Oct 2025 20:55:58 +0400
Subject: [PATCH 07/13] Use local checkpoint in streaming infer tests

Signed-off-by: Vladimir Bataev <vbataev@nvidia.com>
---
 .../L2_Speech_Transcription_Speech_to_Text_Streaming_Infer.sh   | 2 +-
 ...h_Transcription_Speech_to_Text_Streaming_Infer_Timestamps.sh | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/functional_tests/L2_Speech_Transcription_Speech_to_Text_Streaming_Infer.sh b/tests/functional_tests/L2_Speech_Transcription_Speech_to_Text_Streaming_Infer.sh
index 0a96c328e47a..3654ffd7aac4 100755
--- a/tests/functional_tests/L2_Speech_Transcription_Speech_to_Text_Streaming_Infer.sh
+++ b/tests/functional_tests/L2_Speech_Transcription_Speech_to_Text_Streaming_Infer.sh
@@ -13,7 +13,7 @@
 # limitations under the License.
 coverage run -a --data-file=/workspace/.coverage --source=/workspace/nemo \
     examples/asr/asr_chunked_inference/rnnt/speech_to_text_streaming_infer_rnnt.py \
-    pretrained_name="nvidia/stt_en_fastconformer_transducer_large" \
+    model_path="/home/TestData/asr/stt_en_fastconformer_transducer_large.nemo" \
     audio_dir="/home/TestData/an4_transcribe/test_subset/" \
     chunk_secs=2.0 \
     left_context_secs=10.0 \
diff --git a/tests/functional_tests/L2_Speech_Transcription_Speech_to_Text_Streaming_Infer_Timestamps.sh b/tests/functional_tests/L2_Speech_Transcription_Speech_to_Text_Streaming_Infer_Timestamps.sh
index 148164e6fdd2..0e2dc5061e9e 100755
--- a/tests/functional_tests/L2_Speech_Transcription_Speech_to_Text_Streaming_Infer_Timestamps.sh
+++ b/tests/functional_tests/L2_Speech_Transcription_Speech_to_Text_Streaming_Infer_Timestamps.sh
@@ -13,7 +13,7 @@
 # limitations under the License.
 coverage run -a --data-file=/workspace/.coverage --source=/workspace/nemo \
     examples/asr/asr_chunked_inference/rnnt/speech_to_text_streaming_infer_rnnt.py \
-    pretrained_name="nvidia/stt_en_fastconformer_transducer_large" \
+    model_path="/home/TestData/asr/stt_en_fastconformer_transducer_large.nemo" \
     audio_dir="/home/TestData/an4_transcribe/test_subset/" \
     chunk_secs=2.0 \
     left_context_secs=10.0 \

From a75c90b87e3994fa53ea4538d42abc5f50ea4efd Mon Sep 17 00:00:00 2001
From: Vladimir Bataev <vbataev@nvidia.com>
Date: Tue, 7 Oct 2025 22:22:03 +0400
Subject: [PATCH 08/13] Use local checkpoint in cache-aware infer test

Signed-off-by: Vladimir Bataev <vbataev@nvidia.com>
---
 .../L2_Speech_Transcription_Speech_to_Text_Cache_Aware_Infer.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/functional_tests/L2_Speech_Transcription_Speech_to_Text_Cache_Aware_Infer.sh b/tests/functional_tests/L2_Speech_Transcription_Speech_to_Text_Cache_Aware_Infer.sh
index 7afcc08397af..3240b2dfdaf7 100755
--- a/tests/functional_tests/L2_Speech_Transcription_Speech_to_Text_Cache_Aware_Infer.sh
+++ b/tests/functional_tests/L2_Speech_Transcription_Speech_to_Text_Cache_Aware_Infer.sh
@@ -13,6 +13,6 @@
 # limitations under the License.
 coverage run -a --data-file=/workspace/.coverage --source=/workspace/nemo \
     examples/asr/asr_cache_aware_streaming/speech_to_text_cache_aware_streaming_infer.py \
-    pretrained_name="stt_en_fastconformer_hybrid_large_streaming_multi" \
+    pretrained_name="/home/TestData/asr/stt_en_fastconformer_hybrid_large_streaming_multi.nemo" \
     audio_dir="/home/TestData/an4_transcribe/test_subset/" \
     output_path="/tmp/stt_cache_aware_streaming_test_res"

From bca66a5ac31223b640a7d6febf5d51761e84a183 Mon Sep 17 00:00:00 2001
From: Vladimir Bataev <vbataev@nvidia.com>
Date: Wed, 8 Oct 2025 14:20:19 +0400
Subject: [PATCH 09/13] Pass local checkpoint via model_path in cache-aware test

Signed-off-by: Vladimir Bataev <vbataev@nvidia.com>
---
 .../L2_Speech_Transcription_Speech_to_Text_Cache_Aware_Infer.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/functional_tests/L2_Speech_Transcription_Speech_to_Text_Cache_Aware_Infer.sh b/tests/functional_tests/L2_Speech_Transcription_Speech_to_Text_Cache_Aware_Infer.sh
index 3240b2dfdaf7..32df36b8a02c 100755
--- a/tests/functional_tests/L2_Speech_Transcription_Speech_to_Text_Cache_Aware_Infer.sh
+++ b/tests/functional_tests/L2_Speech_Transcription_Speech_to_Text_Cache_Aware_Infer.sh
@@ -13,6 +13,6 @@
 # limitations under the License.
 coverage run -a --data-file=/workspace/.coverage --source=/workspace/nemo \
     examples/asr/asr_cache_aware_streaming/speech_to_text_cache_aware_streaming_infer.py \
-    pretrained_name="/home/TestData/asr/stt_en_fastconformer_hybrid_large_streaming_multi.nemo" \
+    model_path="/home/TestData/asr/stt_en_fastconformer_hybrid_large_streaming_multi.nemo" \
     audio_dir="/home/TestData/an4_transcribe/test_subset/" \
     output_path="/tmp/stt_cache_aware_streaming_test_res"

From 1ce73abdc1e99ab0a76d8972c1498215eac5a9d8 Mon Sep 17 00:00:00 2001
From: Vladimir Bataev <vbataev@nvidia.com>
Date: Mon, 13 Oct 2025 23:42:03 +0400
Subject: [PATCH 10/13] Clean up

Signed-off-by: Vladimir Bataev <vbataev@nvidia.com>
---
 .github/workflows/cicd-main-speech.yml        |  2 --
 ...cription_Speech_to_Text_Streaming_Infer.sh |  3 +++
 ...eech_to_Text_Streaming_Infer_Timestamps.sh | 22 -------------------
 3 files changed, 3 insertions(+), 24 deletions(-)
 delete mode 100755 tests/functional_tests/L2_Speech_Transcription_Speech_to_Text_Streaming_Infer_Timestamps.sh

diff --git a/.github/workflows/cicd-main-speech.yml b/.github/workflows/cicd-main-speech.yml
index 3e807cefb9c5..c690c4426391 100644
--- a/.github/workflows/cicd-main-speech.yml
+++ b/.github/workflows/cicd-main-speech.yml
@@ -125,8 +125,6 @@ jobs:
             script: L2_Speech_Transcription_Speech_to_Text_Transcribe
           - runner: self-hosted-azure
             script: L2_Speech_Transcription_Speech_to_Text_Streaming_Infer
-          - runner: self-hosted-azure
-            script: L2_Speech_Transcription_Speech_to_Text_Streaming_Infer_Timestamps
           - runner: self-hosted-azure
             script: L2_Speech_Transcription_Speech_to_Text_Cache_Aware_Infer
           - runner: self-hosted-azure
diff --git a/tests/functional_tests/L2_Speech_Transcription_Speech_to_Text_Streaming_Infer.sh b/tests/functional_tests/L2_Speech_Transcription_Speech_to_Text_Streaming_Infer.sh
index 3654ffd7aac4..06f7625b6989 100755
--- a/tests/functional_tests/L2_Speech_Transcription_Speech_to_Text_Streaming_Infer.sh
+++ b/tests/functional_tests/L2_Speech_Transcription_Speech_to_Text_Streaming_Infer.sh
@@ -11,6 +11,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+# TODO(vbataev): fix decoding with CUDA graphs on CI for this test
 coverage run -a --data-file=/workspace/.coverage --source=/workspace/nemo \
     examples/asr/asr_chunked_inference/rnnt/speech_to_text_streaming_infer_rnnt.py \
     model_path="/home/TestData/asr/stt_en_fastconformer_transducer_large.nemo" \
@@ -18,4 +19,6 @@ coverage run -a --data-file=/workspace/.coverage --source=/workspace/nemo \
     chunk_secs=2.0 \
     left_context_secs=10.0 \
     right_context_secs=2.0 \
+    timestamps=true \
+    decoding.greedy.use_cuda_graph_decoder=false \
     output_filename="/tmp/stt_streaming_test_res.json"
diff --git a/tests/functional_tests/L2_Speech_Transcription_Speech_to_Text_Streaming_Infer_Timestamps.sh b/tests/functional_tests/L2_Speech_Transcription_Speech_to_Text_Streaming_Infer_Timestamps.sh
deleted file mode 100755
index 0e2dc5061e9e..000000000000
--- a/tests/functional_tests/L2_Speech_Transcription_Speech_to_Text_Streaming_Infer_Timestamps.sh
+++ /dev/null
@@ -1,22 +0,0 @@
-# Copyright (c) 2020-2025, NVIDIA CORPORATION.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-coverage run -a --data-file=/workspace/.coverage --source=/workspace/nemo \
-    examples/asr/asr_chunked_inference/rnnt/speech_to_text_streaming_infer_rnnt.py \
-    model_path="/home/TestData/asr/stt_en_fastconformer_transducer_large.nemo" \
-    audio_dir="/home/TestData/an4_transcribe/test_subset/" \
-    chunk_secs=2.0 \
-    left_context_secs=10.0 \
-    right_context_secs=2.0 \
-    timestamps=true \
-    output_filename="/tmp/stt_streaming_test_res.json"

From 97a9746ead600768badbabb492b90f985bc4b68d Mon Sep 17 00:00:00 2001
From: Vladimir Bataev <vbataev@nvidia.com>
Date: Mon, 13 Oct 2025 23:49:35 +0400
Subject: [PATCH 11/13] Clean up

Signed-off-by: Vladimir Bataev <vbataev@nvidia.com>
---
 .../speech_to_text_cache_aware_streaming_infer.py             | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/examples/asr/asr_cache_aware_streaming/speech_to_text_cache_aware_streaming_infer.py b/examples/asr/asr_cache_aware_streaming/speech_to_text_cache_aware_streaming_infer.py
index 37fbd065e00a..858feda00fa0 100644
--- a/examples/asr/asr_cache_aware_streaming/speech_to_text_cache_aware_streaming_infer.py
+++ b/examples/asr/asr_cache_aware_streaming/speech_to_text_cache_aware_streaming_infer.py
@@ -471,9 +471,7 @@ def main(cfg: TranscriptionConfig):
 
         # stores the results including the transcriptions of the streaming inference in a json file
         if cfg.output_path is not None and len(all_refs_text) == len(all_streaming_tran):
-            fname = (
-                "streaming_out_" + os.path.splitext(os.path.basename(model_name))[0] + "_" + dataset_title + ".json"
-            )
+            fname = "streaming_out_" + os.path.splitext(os.path.basename(model_name))[0] + f"_{dataset_title}.json"
 
             hyp_json = os.path.join(cfg.output_path, fname)
             os.makedirs(cfg.output_path, exist_ok=True)

From 47afc24ca3d36417e17801913b3c818905417021 Mon Sep 17 00:00:00 2001
From: Vladimir Bataev <vbataev@nvidia.com>
Date: Mon, 13 Oct 2025 23:51:08 +0400
Subject: [PATCH 12/13] Clean up

Signed-off-by: Vladimir Bataev <vbataev@nvidia.com>
---
 .../L2_Speech_Transcription_Speech_to_Text_Cache_Aware_Infer.sh  | 1 +
 .../L2_Speech_Transcription_Speech_to_Text_Streaming_Infer.sh    | 1 +
 2 files changed, 2 insertions(+)

diff --git a/tests/functional_tests/L2_Speech_Transcription_Speech_to_Text_Cache_Aware_Infer.sh b/tests/functional_tests/L2_Speech_Transcription_Speech_to_Text_Cache_Aware_Infer.sh
index 32df36b8a02c..7ab36d8b3cd8 100755
--- a/tests/functional_tests/L2_Speech_Transcription_Speech_to_Text_Cache_Aware_Infer.sh
+++ b/tests/functional_tests/L2_Speech_Transcription_Speech_to_Text_Cache_Aware_Infer.sh
@@ -11,6 +11,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+
 coverage run -a --data-file=/workspace/.coverage --source=/workspace/nemo \
     examples/asr/asr_cache_aware_streaming/speech_to_text_cache_aware_streaming_infer.py \
     model_path="/home/TestData/asr/stt_en_fastconformer_hybrid_large_streaming_multi.nemo" \
diff --git a/tests/functional_tests/L2_Speech_Transcription_Speech_to_Text_Streaming_Infer.sh b/tests/functional_tests/L2_Speech_Transcription_Speech_to_Text_Streaming_Infer.sh
index 06f7625b6989..596e8ffd6b5f 100755
--- a/tests/functional_tests/L2_Speech_Transcription_Speech_to_Text_Streaming_Infer.sh
+++ b/tests/functional_tests/L2_Speech_Transcription_Speech_to_Text_Streaming_Infer.sh
@@ -11,6 +11,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+
 # TODO(vbataev): fix decoding with CUDA graphs on CI for this test
 coverage run -a --data-file=/workspace/.coverage --source=/workspace/nemo \
     examples/asr/asr_chunked_inference/rnnt/speech_to_text_streaming_infer_rnnt.py \

From 441a85a2c855b4b8bffc6058a74fe9db415e188f Mon Sep 17 00:00:00 2001
From: Vladimir Bataev <vbataev@nvidia.com>
Date: Tue, 14 Oct 2025 13:03:21 +0400
Subject: [PATCH 13/13] Temporarily avoid CUDA graphs in tests

Signed-off-by: Vladimir Bataev <vbataev@nvidia.com>
---
 .../L2_Speech_Transcription_Speech_to_Text_Cache_Aware_Infer.sh | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/tests/functional_tests/L2_Speech_Transcription_Speech_to_Text_Cache_Aware_Infer.sh b/tests/functional_tests/L2_Speech_Transcription_Speech_to_Text_Cache_Aware_Infer.sh
index 7ab36d8b3cd8..a56ccdb46d6c 100755
--- a/tests/functional_tests/L2_Speech_Transcription_Speech_to_Text_Cache_Aware_Infer.sh
+++ b/tests/functional_tests/L2_Speech_Transcription_Speech_to_Text_Cache_Aware_Infer.sh
@@ -12,8 +12,10 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+# TODO(vbataev): fix decoding with CUDA graphs on CI for this test
 coverage run -a --data-file=/workspace/.coverage --source=/workspace/nemo \
     examples/asr/asr_cache_aware_streaming/speech_to_text_cache_aware_streaming_infer.py \
     model_path="/home/TestData/asr/stt_en_fastconformer_hybrid_large_streaming_multi.nemo" \
+    rnnt_decoding.greedy.use_cuda_graph_decoder=false \
     audio_dir="/home/TestData/an4_transcribe/test_subset/" \
     output_path="/tmp/stt_cache_aware_streaming_test_res"