diff --git a/examples/recipes/llama/pretrain_llama3_8b.py b/examples/recipes/llama/pretrain_llama3_8b.py index 9757d747b..76ebde762 100644 --- a/examples/recipes/llama/pretrain_llama3_8b.py +++ b/examples/recipes/llama/pretrain_llama3_8b.py @@ -21,7 +21,7 @@ Examples: Basic usage with default configuration: - $ torchrun --nproc_per_node=8 pretrain_llama3_8b.py + $ torchrun --nproc_per_node=8 examples/recipes/llama/pretrain_llama3_8b.py Using a custom YAML config file: $ torchrun --nproc_per_node=8 pretrain_llama3_8b.py --config-file my_custom_config.yaml diff --git a/examples/recipes/qwen_vl/conf/qwen25_vl_pretrain_override_example.yaml b/examples/recipes/qwen_vl/conf/qwen25_vl_pretrain_override_example.yaml new file mode 100644 index 000000000..c2fbc78af --- /dev/null +++ b/examples/recipes/qwen_vl/conf/qwen25_vl_pretrain_override_example.yaml @@ -0,0 +1,53 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Example override file for Qwen2.5-VL + +model: + seq_length: 4096 + +train: + train_iters: 20 + global_batch_size: 8 + micro_batch_size: 1 + eval_iters: 5 + +optimizer: + lr: 0.00025 + min_lr: 0.000025 + +scheduler: + lr_warmup_iters: 10 + +checkpoint: + # Directory to save to. If null, no checkpoint will be saved. + save: null + +dist: + use_megatron_fsdp: false + use_torch_fsdp2: false + +logger: + log_interval: 1 + +dataset: + sequence_length: 4096 + +rng: + seed: 42 + +ddp: + grad_reduce_in_fp32: true + + diff --git a/examples/recipes/qwen_vl/finetune_qwen25_vl.py b/examples/recipes/qwen_vl/finetune_qwen25_vl.py new file mode 100644 index 000000000..0c5501471 --- /dev/null +++ b/examples/recipes/qwen_vl/finetune_qwen25_vl.py @@ -0,0 +1,199 @@ +#!/usr/bin/env python3 +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Qwen2.5-VL Finetuning Script with YAML and CLI Configuration Overrides. + +This mirrors the Llama example flow and uses the Qwen-VL recipe helpers. +You can pick a specific recipe via `--recipe`, e.g., `qwen25_vl_3b_finetune_config`, +`qwen25_vl_7b_finetune_config`, etc. 
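+
+Dataset selection: when `--data-path` points to a JSON/JSONL conversation dataset, the
+preloaded dataset provider is enabled automatically; without it, the mock provider is
+used by default. Pass `--dataset-type` (`mock`, `preloaded`, or `hf`) to override the
+auto-detection explicitly.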
+ +Examples: + Loading pretrained weights (recommended for finetune): + 1) Import HF checkpoint to Megatron format: + $ python examples/conversion/convert_checkpoints.py import \ + --hf-model Qwen/Qwen2.5-VL-3B-Instruct \ + --megatron-path /path/to/megatron_ckpt + + 2) Run finetune using the imported checkpoint: + $ torchrun --nproc_per_node=8 examples/recipes/qwen_vl/finetune_qwen25_vl.py \ + --pretrained-checkpoint /path/to/megatron_ckpt + + Using a custom YAML config file: + $ torchrun --nproc_per_node=8 finetune_qwen25_vl.py --config-file conf/qwen25_vl_pretrain_override_example.yaml + + CLI overrides: + $ torchrun --nproc_per_node=8 finetune_qwen25_vl.py model.tensor_model_parallel_size=4 train.train_iters=100000 + + Selecting a specific recipe: + $ torchrun --nproc_per_node=8 finetune_qwen25_vl.py --recipe qwen25_vl_7b_finetune_config +""" + +import argparse +import logging +import os +import sys +from pathlib import Path +from typing import Tuple + +from omegaconf import OmegaConf + +from megatron.bridge.recipes.qwen_vl import qwen25_vl as qwen_vl_recipes +from megatron.bridge.training.config import ConfigContainer +from megatron.bridge.training.pretrain import pretrain +from megatron.bridge.training.utils.omegaconf_utils import ( + apply_overrides, + create_omegaconf_dict_config, + parse_hydra_overrides, +) +from megatron.bridge.training.vlm_step import forward_step +from megatron.bridge.utils.common_utils import get_rank_safe + + +logger: logging.Logger = logging.getLogger(__name__) + + +SCRIPT_DIR: Path = Path(__file__).parent.resolve() +DEFAULT_CONFIG_FILENAME: str = "qwen25_vl_pretrain_override_example.yaml" +DEFAULT_CONFIG_FILE_PATH: Path = SCRIPT_DIR / "conf" / DEFAULT_CONFIG_FILENAME + + +def parse_cli_args() -> Tuple[argparse.Namespace, list[str]]: + """Parse known script args and return remaining as Hydra-style overrides.""" + parser = argparse.ArgumentParser( + description="Finetune Qwen2.5-VL with YAML and CLI overrides", + formatter_class=argparse.RawTextHelpFormatter, + ) + parser.add_argument( + "--config-file", + type=str, + default=str(DEFAULT_CONFIG_FILE_PATH), + help="Path to the YAML OmegaConf override file. Default: conf/qwen25_vl_pretrain_override_example.yaml", + ) + parser.add_argument( + "--data-path", + type=str, + default=None, + help="Path to JSON/JSONL dataset (preloaded conversation or legacy messages format).", + ) + parser.add_argument( + "--image-folder", + type=str, + default=None, + help="Optional root for resolving relative image/video paths in dataset records.", + ) + parser.add_argument( + "--dataset-type", + type=str, + choices=["mock", "preloaded", "hf"], + default=None, + help=( + "Dataset type to use: 'mock', 'preloaded', or 'hf'. " + "If not set, auto-detects based on --data-path/--use-preloaded." + ), + ) + parser.add_argument( + "--recipe", + type=str, + default="qwen25_vl_3b_finetune_config", + help=( + "Name of the recipe function in megatron.bridge.recipes.qwen_vl.qwen25_vl to use, " + "e.g., qwen25_vl_3b_finetune_config, qwen25_vl_7b_finetune_config." + ), + ) + parser.add_argument( + "--pretrained-checkpoint", + type=str, + default=None, + help=( + "Path to imported Megatron checkpoint directory to load before finetuning. " + "Generate it with scripts/import_hf_ckpt.py." 
+ ), + ) + parser.add_argument( + "--use-preloaded", + action="store_true", + help="Use preloaded dataset provider (enabled automatically when --data-path is set).", + ) + parser.add_argument("--debug", action="store_true", help="Enable debug logging") + args, cli_dotlist_overrides = parser.parse_known_args() + return args, cli_dotlist_overrides + + +def main() -> None: + """ + Load the base VLM recipe config, apply YAML/CLI overrides, and start pretraining. + """ + args, cli_overrides = parse_cli_args() + + logger.info("Megatron-Bridge Qwen2.5-VL Finetuning Script with YAML & CLI Overrides") + logger.info("-----------------------------------------------------------------------") + + # Resolve the recipe function from the provided name + recipe_name = getattr(args, "recipe", "qwen25_vl_3b_finetune_config") + available_recipes = [name for name in dir(qwen_vl_recipes) if name.endswith("_finetune_config")] + if not hasattr(qwen_vl_recipes, recipe_name): + logger.error( + "Unknown recipe '%s'. Available recipes: %s", + recipe_name, + ", ".join(sorted(available_recipes)), + ) + sys.exit(2) + pretrain_config = getattr(qwen_vl_recipes, recipe_name) + + # Determine dataset type based on CLI flag (overrides) or fall back to auto-detect + use_preloaded_flag = bool(args.data_path) or bool(getattr(args, "use_preloaded", False)) + dataset_type = args.dataset_type or ("preloaded" if use_preloaded_flag else "mock") + + cfg: ConfigContainer = pretrain_config( + dataset_type=dataset_type, + train_data_path=args.data_path, + valid_data_path=None, + test_data_path=None, + image_folder=args.image_folder, + pretrained_checkpoint=args.pretrained_checkpoint, + ) + logger.info("Loaded base configuration") + + if get_rank_safe() == 0: + cfg.print_yaml() + + merged_omega_conf, excluded_fields = create_omegaconf_dict_config(cfg) + + if args.config_file: + logger.debug(f"Loading YAML overrides from: {args.config_file}") + if not os.path.exists(args.config_file): + logger.error(f"Override YAML file not found: {args.config_file}") + sys.exit(1) + yaml_overrides_omega = OmegaConf.load(args.config_file) + merged_omega_conf = OmegaConf.merge(merged_omega_conf, yaml_overrides_omega) + + if cli_overrides: + logger.debug(f"Applying Hydra-style command-line overrides: {cli_overrides}") + merged_omega_conf = parse_hydra_overrides(merged_omega_conf, cli_overrides) + + final_overrides_as_dict = OmegaConf.to_container(merged_omega_conf, resolve=True) + apply_overrides(cfg, final_overrides_as_dict, excluded_fields) + + if get_rank_safe() == 0: + logger.info("--- Final Merged Configuration ---") + cfg.print_yaml() + logger.info("----------------------------------") + + pretrain(config=cfg, forward_step_func=forward_step) + + +if __name__ == "__main__": + main() diff --git a/pyproject.toml b/pyproject.toml index 12259c989..c011fa384 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -78,6 +78,7 @@ dependencies = [ "tqdm>=4.67.1", "hydra-core>1.3,<=1.3.2", "megatron-core[dev,mlm]>=0.14.0a0,<0.16.0", + "qwen-vl-utils", ] diff --git a/src/megatron/bridge/data/vlm_datasets/__init__.py b/src/megatron/bridge/data/vlm_datasets/__init__.py new file mode 100644 index 000000000..054de6537 --- /dev/null +++ b/src/megatron/bridge/data/vlm_datasets/__init__.py @@ -0,0 +1,58 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +VLM dataset utilities. + +Public API re-exports: +- Makers: functions to build conversation examples from HF datasets +- Providers: classes that build PyTorch datasets bound to HF processors +- Collate fns: model-specific batch builders +""" + +from megatron.bridge.data.vlm_datasets.collate import ( + COLLATE_FNS, + default_collate_fn, + phi4_mm_collate_fn, + qwen2_5_collate_fn, +) +from megatron.bridge.data.vlm_datasets.conversation_dataset import VLMConversationDataset +from megatron.bridge.data.vlm_datasets.hf_dataset_makers import ( + make_cord_v2_dataset, + make_cv17_dataset, + make_medpix_dataset, + make_rdr_dataset, +) +from megatron.bridge.data.vlm_datasets.hf_provider import HFDatasetConversationProvider +from megatron.bridge.data.vlm_datasets.mock_provider import MockVLMConversationProvider +from megatron.bridge.data.vlm_datasets.preloaded_provider import PreloadedVLMConversationProvider + + +__all__ = [ + # Makers + "make_rdr_dataset", + "make_cord_v2_dataset", + "make_medpix_dataset", + "make_cv17_dataset", + # Dataset types/providers + "VLMConversationDataset", + "HFDatasetConversationProvider", + "PreloadedVLMConversationProvider", + "MockVLMConversationProvider", + # Collation utilities + "COLLATE_FNS", + "default_collate_fn", + "qwen2_5_collate_fn", + "phi4_mm_collate_fn", +] diff --git a/src/megatron/bridge/data/vlm_datasets/collate.py b/src/megatron/bridge/data/vlm_datasets/collate.py new file mode 100644 index 000000000..e0079f776 --- /dev/null +++ b/src/megatron/bridge/data/vlm_datasets/collate.py @@ -0,0 +1,317 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Collation utilities for building VLM training batches from conversation examples. +""" + +import torch +import torch.nn.functional as F +from PIL import Image # noqa: F401 # may be used downstream by processors + +from megatron.bridge.data.vlm_datasets.token_utils import extract_skipped_token_ids +from megatron.bridge.training.utils.visual_inputs import Qwen2_5_VLVisualInputs + + +# Local message used when optional qwen_vl_utils dependency is missing +MISSING_QWEN_VL_UTILS_MSG = ( + "qwen_vl_utils is required for Qwen2.5 VL processing. Please `pip install qwen-vl-utils` or" + " provide compatible vision preprocessing." +) + +try: + from qwen_vl_utils import process_vision_info + + HAVE_QWEN_VL_UTILS = True +except ImportError: + HAVE_QWEN_VL_UTILS = False + + +def _gather_assistant_text_segments(example: dict) -> list[str]: + """Extract assistant text segments from the structured conversation example. 
+ + The example schema is expected to be {"conversation": [{"role": ..., "content": [...]} ...]} where + content is a list of items like {"type": "text"|"image"|..., "text": "..."}. + Returns a list of concatenated text strings, one per assistant turn. + """ + texts: list[str] = [] + for turn in example.get("conversation", []): + if turn.get("role") != "assistant": + continue + parts = turn.get("content", []) + buf = [] + if isinstance(parts, list): + for p in parts: + if isinstance(p, dict) and p.get("type") == "text" and isinstance(p.get("text"), str): + buf.append(p["text"]) + elif isinstance(parts, str): + buf.append(parts) + if buf: + texts.append("".join(buf)) + return texts + + +def create_multiturn_loss_mask_by_search( + example: dict, input_ids, processor, skipped_tokens: torch.Tensor +) -> list[int]: + """Tokenizer-agnostic masking via substring search of assistant texts. + + - Tokenize full conversation with processor already done -> input_ids + - Extract assistant text strings from the structured example + - For each assistant text, tokenize without special tokens and search sequentially + - On success, unmask that span; otherwise leave masked + """ + tokenizer = getattr(processor, "tokenizer", processor) + ids = input_ids.tolist() + mask = [0] * len(ids) + + def try_mark(span_text: str, start_from: int) -> int: + """Tokenize a span and mark its occurrence if found. Returns new search start index.""" + variants = [span_text, span_text + "\n"] + for text in variants: + span_tokens = tokenizer(text, add_special_tokens=False)["input_ids"] + if not span_tokens: + continue + # naive sequential search from start_from + for i in range(start_from, len(ids) - len(span_tokens) + 1): + if ids[i : i + len(span_tokens)] == span_tokens: + for j in range(i, i + len(span_tokens)): + mask[j] = 1 + return i + len(span_tokens) + return start_from + + search_start = 0 + for asst_text in _gather_assistant_text_segments(example): + search_start = try_mark(asst_text, search_start) + + # Ensure pad/skipped tokens are masked + ids_t = torch.tensor(ids) + for k, t in enumerate(ids_t): + if t in skipped_tokens: + mask[k] = 0 + return mask + + +def phi4_mm_collate_fn(examples, processor): + """Collate function for Phi-4 MM model audio input""" + + # Extract conversations and audio data + conversations = [example["conversation"] for example in examples] + audios = [example["audio"] for example in examples] + texts = [processor.apply_chat_template(conversation, tokenize=False) for conversation in conversations] + audio_inputs = [(audio["array"], audio["sampling_rate"]) if isinstance(audio, dict) else audio for audio in audios] + batch = processor( + text=texts, audios=audio_inputs, return_tensors="pt", padding=True, truncation=True, max_length=1024 + ) + labels = batch["input_ids"].clone()[:, 1:] + labels = torch.cat([labels, -100 * torch.ones_like(labels[:, :1])], dim=1) + + loss_masks = [] + for i, conversation in enumerate(conversations): + input_ids = batch["input_ids"][i].tolist() + + assistant_content = conversation[1]["content"] + assistant_tokens = processor.tokenizer(assistant_content, add_special_tokens=False)["input_ids"] + + loss_mask = [0] * len(input_ids) + for start_idx in range(len(input_ids) - len(assistant_tokens) + 1): + if input_ids[start_idx : start_idx + len(assistant_tokens)] == assistant_tokens: + for j in range(len(assistant_tokens)): + loss_mask[start_idx + j] = 1 + break + loss_masks.append(loss_mask) + + max_len = max(len(mask) for mask in loss_masks) + padded_loss_masks = [mask 
+ [0] * (max_len - len(mask)) for mask in loss_masks] + batch["loss_mask"] = torch.tensor(padded_loss_masks, dtype=torch.float) + + labels[batch["loss_mask"] == 0] = -100 + batch["labels"] = labels + + # Remove specified batch features if present + for key in ["input_image_embeds", "image_sizes", "image_attention_mask"]: + if key in batch: + del batch[key] + return batch + + +def qwen2_5_collate_fn(examples: list, processor) -> dict[str, torch.Tensor]: + """Collate function for Qwen2.5 VL model.""" + if not HAVE_QWEN_VL_UTILS: + raise ImportError(MISSING_QWEN_VL_UTILS_MSG) + + skipped_tokens = extract_skipped_token_ids(processor) + + texts = [processor.apply_chat_template(example["conversation"], tokenize=False) for example in examples] + # Build per-example images (list) and split by presence + per_example_images = [] + has_images = [] + for example in examples: + imgs = process_vision_info(example["conversation"])[0] + if imgs is None: + imgs = [] + elif not isinstance(imgs, list): + imgs = [imgs] + per_example_images.append(imgs) + has_images.append(len(imgs) > 0) + + idx_with = [i for i, h in enumerate(has_images) if h] + idx_without = [i for i, h in enumerate(has_images) if not h] + + batch_with = None + batch_without = None + + if idx_with: + texts_with = [texts[i] for i in idx_with] + images_with = [per_example_images[i] for i in idx_with] + batch_with = processor( + text=texts_with, + images=images_with, + padding=True, + return_tensors="pt", + min_pixels=200704, # 256*28*28 + max_pixels=1003520, # 1280*28*28 + ) + + if idx_without: + texts_without = [texts[i] for i in idx_without] + batch_without = processor( + text=texts_without, + padding=True, + return_tensors="pt", + ) + + # Merge batches back to original order + if batch_with is not None and batch_without is None: + batch = batch_with + elif batch_with is None and batch_without is not None: + batch = batch_without + else: + # Both exist: pad to common max length and interleave rows + pad_id = getattr(processor.tokenizer, "pad_token_id", 0) or 0 + in_with = batch_with["input_ids"] + in_without = batch_without["input_ids"] + max_len = max(in_with.shape[1], in_without.shape[1]) + + def pad_to(x, tgt_len): + if x.shape[1] == tgt_len: + return x + pad_len = tgt_len - x.shape[1] + return F.pad(x, (0, pad_len), value=pad_id) + + in_with = pad_to(in_with, max_len) + in_without = pad_to(in_without, max_len) + + input_ids = torch.full((len(examples), max_len), pad_id, dtype=in_with.dtype) + # Place rows + for row, i in enumerate(idx_with): + input_ids[i] = in_with[row] + for row, i in enumerate(idx_without): + input_ids[i] = in_without[row] + + batch = {"input_ids": input_ids} + # Carry over vision tensors if present + if "pixel_values" in batch_with: + batch["pixel_values"] = batch_with["pixel_values"] + if "image_grid_thw" in batch_with: + batch["image_grid_thw"] = batch_with["image_grid_thw"] + + labels = batch["input_ids"].clone()[:, 1:] + labels = torch.cat([labels, -100 * torch.ones_like(labels[:, :1])], dim=1) + labels[torch.isin(labels, skipped_tokens)] = -100 + batch["labels"] = labels + # Ensure position_ids exist for the model + if "position_ids" not in batch: + batch_size, seq_len = batch["input_ids"].shape + batch["position_ids"] = ( + torch.arange(seq_len, device=batch["input_ids"].device).unsqueeze(0).expand(batch_size, -1) + ) + # Prefer general search-based masking using structured example content (not template-specific) + loss_masks = [ + create_multiturn_loss_mask_by_search(example, input_ids, processor, 
skipped_tokens) + for example, input_ids in zip(examples, batch["input_ids"]) # type: ignore[arg-type] + ] + loss_mask_t = torch.tensor(loss_masks, dtype=torch.float, device=batch["input_ids"].device) + # Shift loss mask to align with next-token labels timeline + loss_mask_t = torch.cat([loss_mask_t[:, 1:], torch.zeros_like(loss_mask_t[:, :1])], dim=1) + # Enforce label masking to match shifted loss_mask + batch["labels"] = batch["labels"].masked_fill(loss_mask_t == 0, -100) + batch["loss_mask"] = loss_mask_t + # Build Qwen2VL visual inputs object and attach to batch; remove raw keys + visual_inputs = Qwen2_5_VLVisualInputs( + pixel_values=batch.get("pixel_values"), + image_grid_thw=batch.get("image_grid_thw"), + ) + if "pixel_values" in batch: + del batch["pixel_values"] + if "image_grid_thw" in batch: + del batch["image_grid_thw"] + batch["visual_inputs"] = visual_inputs + return batch + + +def default_collate_fn(examples: list, processor) -> dict[str, torch.Tensor]: + """Default collate function for VLM models.""" + if not HAVE_QWEN_VL_UTILS: + raise ImportError(MISSING_QWEN_VL_UTILS_MSG) + + skipped_tokens = extract_skipped_token_ids(processor) + + batch = processor.apply_chat_template( + [example["conversation"] for example in examples], + tokenize=True, + padding=True, + truncation=True, + return_tensors="pt", + return_dict=True, + ) + + if "position_ids" not in batch: + batch_size, seq_len = batch["input_ids"].shape + batch["position_ids"] = ( + torch.arange(seq_len, device=batch["input_ids"].device).unsqueeze(0).expand(batch_size, -1) + ) + + batch["pixel_values"] = batch["pixel_values"].to(torch.bfloat16) + labels = batch["input_ids"].clone()[:, 1:] + labels = torch.cat([labels, -100 * torch.ones_like(labels[:, :1])], dim=1) + labels[torch.isin(labels, skipped_tokens)] = -100 + batch["labels"] = labels + loss_masks = [ + create_multiturn_loss_mask_by_search(example, input_ids, processor, skipped_tokens) + for example, input_ids in zip(examples, batch["input_ids"]) # type: ignore[arg-type] + ] + loss_mask_t = torch.tensor(loss_masks, dtype=torch.float, device=batch["input_ids"].device) + # Shift loss mask to align with next-token labels timeline + loss_mask_t = torch.cat([loss_mask_t[:, 1:], torch.zeros_like(loss_mask_t[:, :1])], dim=1) + batch["labels"] = batch["labels"].masked_fill(loss_mask_t == 0, -100) + batch["loss_mask"] = loss_mask_t + # Build Qwen2VL visual inputs object and attach to batch; remove raw keys + visual_inputs = Qwen2_5_VLVisualInputs( + pixel_values=batch.get("pixel_values"), + image_grid_thw=batch.get("image_grid_thw"), + ) + if "pixel_values" in batch: + del batch["pixel_values"] + if "image_grid_thw" in batch: + del batch["image_grid_thw"] + batch["visual_inputs"] = visual_inputs + return batch + + +# Mapping of processor types to their collate functions +COLLATE_FNS = { + "Qwen2_5_VLProcessor": qwen2_5_collate_fn, + "default": default_collate_fn, +} diff --git a/src/megatron/bridge/data/vlm_datasets/conversation_dataset.py b/src/megatron/bridge/data/vlm_datasets/conversation_dataset.py new file mode 100644 index 000000000..7157118b5 --- /dev/null +++ b/src/megatron/bridge/data/vlm_datasets/conversation_dataset.py @@ -0,0 +1,65 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Core dataset types for conversation-style VLM examples. +""" + +from typing import Any, Callable, Dict, List, Optional + +import torch + +from megatron.bridge.data.vlm_datasets.collate import COLLATE_FNS + + +class VLMConversationDataset(torch.utils.data.Dataset): + """Repeating wrapper over a list of HF-style conversation examples. + + - Each base example is expected to contain a "conversation" key following + processor.apply_chat_template conventions. Optional modality fields like + "audio" are passed through and consumed by the collate function. + - Dataset length is set to a target length and indexes wrap around the + underlying list to meet the requested size. + - A `collate_fn` attribute is exposed so the framework can pass it to the + DataLoader. + """ + + def __init__( + self, + base_examples: List[Dict[str, Any]], + target_length: int, + processor: Any, + collate_impl: Optional[Callable[[list, Any], Dict[str, torch.Tensor]]] = None, + ) -> None: + assert isinstance(base_examples, list) and len(base_examples) > 0, "base_examples must be a non-empty list" + self._base_examples = base_examples + self._length = int(max(0, target_length)) + self._processor = processor + # Choose collate implementation by processor type name when not provided + collate_key = type(processor).__name__ if processor is not None else "default" + selected_impl = collate_impl or COLLATE_FNS.get(collate_key, COLLATE_FNS["default"]) # type: ignore[index] + + def _bound_collate(batch: list) -> Dict[str, torch.Tensor]: + return selected_impl(batch, self._processor) # type: ignore[call-arg] + + self.collate_fn = _bound_collate + + def __len__(self) -> int: + return self._length + + def __getitem__(self, idx: int) -> Dict[str, Any]: + if self._length == 0: + raise IndexError("Empty dataset") + base = self._base_examples[idx % len(self._base_examples)] + return base diff --git a/src/megatron/bridge/data/vlm_datasets/hf_dataset_makers.py b/src/megatron/bridge/data/vlm_datasets/hf_dataset_makers.py new file mode 100644 index 000000000..782406156 --- /dev/null +++ b/src/megatron/bridge/data/vlm_datasets/hf_dataset_makers.py @@ -0,0 +1,138 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Built-in maker functions that transform HuggingFace datasets into +conversation-style examples consumable by VLM processors. 
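+
+For reference, each maker returns a list of examples shaped roughly as follows
+(illustrative values; the "image" entry may be a PIL image or a dataset-provided id):
+
+    {
+        "conversation": [
+            {"role": "user", "content": [
+                {"type": "image", "image": ...},
+                {"type": "text", "text": "Describe this image."},
+            ]},
+            {"role": "assistant", "content": [{"type": "text", "text": "..."}]},
+        ],
+    }
+
+Audio makers such as `make_cv17_dataset` use plain-string content instead and attach an
+"audio" field that the collate function consumes.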
+""" + +import json +import random +from typing import Any, Dict, List + +from datasets import load_dataset + +from megatron.bridge.data.vlm_datasets.token_utils import json2token + + +def make_rdr_dataset( + path_or_dataset: str = "quintend/rdr-items", split: str = "train", **kwargs +) -> List[Dict[str, Any]]: + """Load and preprocess the RDR dataset for image-to-text fine-tuning. + + Returns a list of examples with a "conversation" field that includes an image and text. + """ + dataset = load_dataset(path_or_dataset, split=split) + + def format(example): + return { + "conversation": [ + { + "role": "user", + "content": [ + {"type": "image", "image": example["image"]}, + {"type": "text", "text": "Describe this image."}, + ], + }, + { + "role": "assistant", + "content": [{"type": "text", "text": example["text"]}], + }, + ], + } + + return [format(example) for example in dataset] + + +def make_cord_v2_dataset( + path_or_dataset: str = "naver-clova-ix/cord-v2", split: str = "train", **kwargs +) -> List[Dict[str, Any]]: + """Load and preprocess the CORD-V2 dataset for image-to-text fine-tuning.""" + dataset = load_dataset(path_or_dataset, split=split) + + def format(example): + ground_truth = json.loads(example["ground_truth"]) + if "gt_parses" in ground_truth: + assert isinstance(ground_truth["gt_parses"], list) + gt_jsons = ground_truth["gt_parses"] + else: + assert "gt_parse" in ground_truth and isinstance(ground_truth["gt_parse"], dict) + gt_jsons = [ground_truth["gt_parse"]] + + text = random.choice([json2token(gt_json, sort_json_key=True) for gt_json in gt_jsons]) + + return { + "conversation": [ + { + "role": "user", + "content": [ + {"type": "image", "image": example["image"]}, + {"type": "text", "text": "Describe this image."}, + ], + }, + {"role": "assistant", "content": [{"type": "text", "text": text}]}, + ], + } + + return [format(example) for example in dataset] + + +def make_medpix_dataset( + path_or_dataset: str = "mmoukouba/MedPix-VQA", split: str = "train", **kwargs +) -> List[Dict[str, Any]]: + """Load and preprocess the MedPix dataset for image-to-text fine-tuning.""" + dataset = load_dataset(path_or_dataset, split=split) + + def format(example): + return { + "conversation": [ + { + "role": "user", + "content": [ + {"type": "image", "image": example["image_id"]}, + {"type": "text", "text": example["question"]}, + ], + }, + {"role": "assistant", "content": [{"type": "text", "text": example["answer"]}]}, + ], + } + + return [format(example) for example in dataset] + + +def make_cv17_dataset( + path_or_dataset: str = "ysdede/commonvoice_17_tr_fixed", split: str = "train", **kwargs +) -> List[Dict[str, Any]]: + """Load and preprocess the CommonVoice 17 dataset for audio-to-text fine-tuning.""" + dataset = load_dataset(path_or_dataset, split=split) + # Be robust to simple list-like datasets used in tests without `column_names` attr + try: + all_columns = dataset.column_names # type: ignore[attr-defined] + except Exception: + first_example = dataset[0] if len(dataset) > 0 else {} + all_columns = list(first_example.keys()) if isinstance(first_example, dict) else [] + if hasattr(dataset, "remove_columns"): + columns_to_remove = [col for col in all_columns if col not in ["audio", "transcription"]] + dataset = dataset.remove_columns(columns_to_remove) + + def format(example): + return { + "conversation": [ + {"role": "user", "content": "<|audio_1|>Transcribe the Turkish audio clip."}, + {"role": "assistant", "content": example["transcription"]}, + ], + "audio": 
(example["audio"]["array"], example["audio"]["sampling_rate"]), + } + + return [format(example) for example in dataset] diff --git a/src/megatron/bridge/data/vlm_datasets/hf_provider.py b/src/megatron/bridge/data/vlm_datasets/hf_provider.py new file mode 100644 index 000000000..9ca1d43c7 --- /dev/null +++ b/src/megatron/bridge/data/vlm_datasets/hf_provider.py @@ -0,0 +1,116 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Provider that builds conversation datasets from HuggingFace datasets. +""" + +from dataclasses import dataclass +from typing import Any, Callable, Dict, List, Literal, Optional, Tuple + +import torch +from transformers import AutoProcessor + +from megatron.bridge.data.vlm_datasets.conversation_dataset import VLMConversationDataset +from megatron.bridge.data.vlm_datasets.hf_dataset_makers import ( + make_cord_v2_dataset, + make_cv17_dataset, + make_medpix_dataset, + make_rdr_dataset, +) +from megatron.bridge.training.config import DatasetBuildContext, DatasetProvider + + +@dataclass(kw_only=True) +class HFDatasetConversationProvider(DatasetProvider): + """DatasetProvider that builds VLM conversation datasets from HF datasets. + + This provider leverages simple maker functions that return lists of examples + with a "conversation" schema understood by model processors. It binds a + HuggingFace `AutoProcessor` for the specified model and selects an + appropriate collate function for batching. + """ + + # Required to match model.seq_length (enforced by ConfigContainer.validate) + sequence_length: int + + # HF processor/model identifier (e.g., "Qwen/Qwen2.5-VL-3B-Instruct") + hf_processor_path: str + + # Select which maker to use. Must match a function defined in makers module + # like `make_rdr_dataset`, `make_cord_v2_dataset`, `make_medpix_dataset`, `make_cv17_dataset`. + maker_name: str + + # Optional parameters forwarded to the selected maker + maker_kwargs: Optional[Dict[str, Any]] = None + + # Optional collate override. If None, inferred from processor type. + collate_impl: Optional[Callable[[list, Any], Dict[str, torch.Tensor]]] = None + + # Keep parity with GPTDatasetConfig usage in batching utilities + skip_getting_attention_mask_from_dataset: bool = True + + # DataloaderConfig fields are inherited (num_workers, dataloader_type, etc.) 
+ dataloader_type: Optional[Literal["single", "cyclic", "external"]] = "single" + + def _get_maker(self) -> Callable[..., List[Dict[str, Any]]]: + registry: Dict[str, Callable[..., List[Dict[str, Any]]]] = { + "make_rdr_dataset": make_rdr_dataset, + "make_cord_v2_dataset": make_cord_v2_dataset, + "make_medpix_dataset": make_medpix_dataset, + "make_cv17_dataset": make_cv17_dataset, + } + if self.maker_name in registry: + return registry[self.maker_name] + # Allow passing function name alias without prefix, e.g., "rdr" -> make_rdr_dataset + alias_map = { + "rdr": "make_rdr_dataset", + "cord_v2": "make_cord_v2_dataset", + "medpix": "make_medpix_dataset", + "cv17": "make_cv17_dataset", + } + if self.maker_name in alias_map and alias_map[self.maker_name] in registry: + return registry[alias_map[self.maker_name]] + raise ValueError(f"Unknown maker_name: {self.maker_name}") + + def _build_split_dataset( + self, + split: str, + target_length: int, + processor: Any, + ) -> Optional[VLMConversationDataset]: + if target_length <= 0: + return None + maker = self._get_maker() + kwargs = dict(self.maker_kwargs or {}) + kwargs.setdefault("split", split) + base_examples = maker(**kwargs) # type: ignore[misc] + if not isinstance(base_examples, list) or len(base_examples) == 0: + raise ValueError(f"Maker '{self.maker_name}' returned no examples for split='{split}'") + return VLMConversationDataset( + base_examples=base_examples, + target_length=target_length, + processor=processor, + collate_impl=self.collate_impl, + ) + + def build_datasets(self, context: DatasetBuildContext) -> Tuple[Optional[Any], Optional[Any], Optional[Any]]: + # Bind processor for the requested model + processor = AutoProcessor.from_pretrained(self.hf_processor_path, trust_remote_code=True) + + train_ds = self._build_split_dataset("train", context.train_samples, processor) + valid_ds = self._build_split_dataset("validation", context.valid_samples, processor) + test_ds = self._build_split_dataset("test", context.test_samples, processor) + + return train_ds, valid_ds, test_ds diff --git a/src/megatron/bridge/data/vlm_datasets/mock_provider.py b/src/megatron/bridge/data/vlm_datasets/mock_provider.py new file mode 100644 index 000000000..54297c95f --- /dev/null +++ b/src/megatron/bridge/data/vlm_datasets/mock_provider.py @@ -0,0 +1,113 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Generic mock conversation-style VLM dataset and provider. + +This module produces synthetic image(s) and minimal conversations that are +compatible with HF `AutoProcessor.apply_chat_template` and the collate +functions defined in `collate.py`. It is processor-agnostic and can be used +with any multimodal model whose processor supports the standard conversation +schema and optional `images` argument. 
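+
+A minimal construction sketch (values are placeholders; additional DatasetProvider
+fields may be required by a full training config):
+
+    provider = MockVLMConversationProvider(
+        sequence_length=4096,
+        hf_processor_path="Qwen/Qwen2.5-VL-3B-Instruct",
+    )
+    train_ds, valid_ds, test_ds = provider.build_datasets(context)  # DatasetBuildContext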
+""" + +from dataclasses import dataclass +from typing import Any, Dict, List, Literal, Optional, Tuple + +import numpy +from PIL import Image + +from megatron.bridge.data.vlm_datasets.conversation_dataset import VLMConversationDataset +from megatron.bridge.training.config import DatasetBuildContext, DatasetProvider + + +@dataclass(kw_only=True) +class MockVLMConversationProvider(DatasetProvider): + """DatasetProvider for generic mock VLM conversation datasets. + + Builds train/valid/test datasets using a HF AutoProcessor and the + `MockVLMConversationDataset` implementation. Intended to work across + different VLM models whose processors support the conversation schema. + """ + + # Required to match model.seq_length + sequence_length: int + + # HF processor/model ID (e.g., Qwen/Qwen2.5-VL-3B-Instruct or other VLMs) + hf_processor_path: str + + # Sample generation options + prompt: str = "Describe this image." + random_seed: int = 0 + image_size: Tuple[int, int] = (256, 256) + pad_to_max_length: bool = True + create_attention_mask: bool = True + + # Keep parity with GPTDatasetConfig usage in batching utilities + skip_getting_attention_mask_from_dataset: bool = True + + # Number of images per sample + num_images: int = 1 + + # Default dataloader type for VLM providers + dataloader_type: Optional[Literal["single", "cyclic", "external"]] = "single" + + # HF AutoProcessor instance will be set during build + _processor: Optional[Any] = None + + def _make_base_examples(self) -> List[Dict[str, Any]]: + # Single minimal conversation example; dataset will repeat to target length + num_images = max(0, int(getattr(self, "num_images", 1))) + w, h = self.image_size + rng = numpy.random.default_rng(seed=self.random_seed) + images = None + if num_images > 0: + # Embed in-memory PIL images directly in the conversation so that + # qwen_vl_utils.process_vision_info can discover them. + images = [ + Image.fromarray(rng.integers(low=0, high=256, size=(h, w, 3), dtype=numpy.uint8), mode="RGB") + for _ in range(num_images) + ] + + content = [{"type": "image", "image": img} for img in images] if images is not None else [] + content.append({"type": "text", "text": self.prompt}) + messages = [ + {"role": "user", "content": content}, + {"role": "assistant", "content": [{"type": "text", "text": "dummy assistant response"}]}, + ] + return [{"conversation": messages}] + + def build_datasets(self, context: DatasetBuildContext): + from transformers import AutoProcessor + + # Initialize and store processor + self._processor = AutoProcessor.from_pretrained(self.hf_processor_path, trust_remote_code=True) + + base_examples = self._make_base_examples() + + def _maybe_make(size: int) -> Optional[VLMConversationDataset]: + if not size or size <= 0: + return None + return VLMConversationDataset( + base_examples=base_examples, + target_length=size, + processor=self._processor, + collate_impl=None, # infer collate from processor type (qwen2_5_collate_fn) + ) + + train_ds = _maybe_make(context.train_samples) + valid_ds = _maybe_make(context.valid_samples) + test_ds = _maybe_make(context.test_samples) + + return train_ds, valid_ds, test_ds diff --git a/src/megatron/bridge/data/vlm_datasets/preloaded_provider.py b/src/megatron/bridge/data/vlm_datasets/preloaded_provider.py new file mode 100644 index 000000000..e34be8967 --- /dev/null +++ b/src/megatron/bridge/data/vlm_datasets/preloaded_provider.py @@ -0,0 +1,230 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Provider for datasets preloaded from JSON/JSONL files into conversation schema. +""" + +import json +import logging +import os +import re +from dataclasses import dataclass +from typing import Any, Dict, List, Literal, Optional, Tuple + +from transformers import AutoProcessor + +from megatron.bridge.data.vlm_datasets.conversation_dataset import VLMConversationDataset +from megatron.bridge.training.config import DatasetBuildContext, DatasetProvider + + +def _split_text_by_placeholders( + text: str, image_paths: List[str], video_paths: Optional[List[str]] = None +) -> List[Dict[str, Any]]: + """ + Split legacy text containing ""/"