docling-project · PeterStaar-IBM · Jul 8, 2025 · Jul 8, 2025 · Jul 8, 2025 · Jul 8, 2025
diff --git a/docling/cli/main.py b/docling/cli/main.py
@@ -63,6 +63,7 @@
     GRANITE_VISION_TRANSFORMERS,
     SMOLDOCLING_MLX,
     SMOLDOCLING_TRANSFORMERS,
+    VLM2STAGE,
     VlmModelType,
 )
 from docling.document_converter import (
@@ -627,6 +628,12 @@
                             "To run SmolDocling faster, please install mlx-vlm:\n"
                             "pip install mlx-vlm"
                         )
+            elif vlm_model == VlmModelType.VLM2STAGE:
+                pipeline_options.vlm_options = VLM2STAGE
+            else:
+                raise ValueError(
+                    f"{vlm_model} is not of type GRANITE_VISION, GRANITE_VISION_OLLAMA, SMOLDOCLING_TRANSFORMERS or VLM2STAGE"
+                )
 
             pdf_format_option = PdfFormatOption(
                 pipeline_cls=VlmPipeline, pipeline_options=pipeline_options

diff --git a/docling/datamodel/asr_model_specs.py b/docling/datamodel/asr_model_specs.py
@@ -11,12 +11,13 @@
     # ApiAsrOptions,
     InferenceAsrFramework,
     InlineAsrNativeWhisperOptions,
-    TransformersModelType,
+    InlineAsrOptions,
+    # TransformersModelType,
 )
 
 _log = logging.getLogger(__name__)
 
-WHISPER_TINY = InlineAsrNativeWhisperOptions(
+WHISPER_TINY: InlineAsrOptions = InlineAsrNativeWhisperOptions(
     repo_id="tiny",
     inference_framework=InferenceAsrFramework.WHISPER,
     verbose=True,
@@ -27,7 +28,7 @@
     max_time_chunk=30.0,
 )
 
-WHISPER_SMALL = InlineAsrNativeWhisperOptions(
+WHISPER_SMALL: InlineAsrOptions = InlineAsrNativeWhisperOptions(
     repo_id="small",
     inference_framework=InferenceAsrFramework.WHISPER,
     verbose=True,
@@ -38,7 +39,7 @@
     max_time_chunk=30.0,
 )
 
-WHISPER_MEDIUM = InlineAsrNativeWhisperOptions(
+WHISPER_MEDIUM: InlineAsrOptions = InlineAsrNativeWhisperOptions(
     repo_id="medium",
     inference_framework=InferenceAsrFramework.WHISPER,
     verbose=True,
@@ -49,7 +50,7 @@
     max_time_chunk=30.0,
 )
 
-WHISPER_BASE = InlineAsrNativeWhisperOptions(
+WHISPER_BASE: InlineAsrOptions = InlineAsrNativeWhisperOptions(
     repo_id="base",
     inference_framework=InferenceAsrFramework.WHISPER,
     verbose=True,
@@ -60,7 +61,7 @@
     max_time_chunk=30.0,
 )
 
-WHISPER_LARGE = InlineAsrNativeWhisperOptions(
+WHISPER_LARGE: InlineAsrOptions = InlineAsrNativeWhisperOptions(
     repo_id="large",
     inference_framework=InferenceAsrFramework.WHISPER,
     verbose=True,
@@ -71,7 +72,7 @@
     max_time_chunk=30.0,
 )
 
-WHISPER_TURBO = InlineAsrNativeWhisperOptions(
+WHISPER_TURBO: InlineAsrOptions = InlineAsrNativeWhisperOptions(
     repo_id="turbo",
     inference_framework=InferenceAsrFramework.WHISPER,
     verbose=True,

diff --git a/docling/datamodel/layout_model_specs.py b/docling/datamodel/layout_model_specs.py
@@ -26,8 +26,6 @@ def model_repo_folder(self) -> str:
         return self.repo_id.replace("/", "--")
 
 
-# HuggingFace Layout Models
-
 # Default Docling Layout Model
 DOCLING_LAYOUT_V2 = LayoutModelConfig(
     name="docling_layout_v2",

diff --git a/docling/datamodel/pipeline_options.py b/docling/datamodel/pipeline_options.py
@@ -12,10 +12,16 @@
 )
 from typing_extensions import deprecated
 
-from docling.datamodel import asr_model_specs
-
 # Import the following for backwards compatibility
 from docling.datamodel.accelerator_options import AcceleratorDevice, AcceleratorOptions
+from docling.datamodel.asr_model_specs import (
+    WHISPER_BASE,
+    WHISPER_LARGE,
+    WHISPER_MEDIUM,
+    WHISPER_SMALL,
+    WHISPER_TINY,
+    WHISPER_TURBO,
+)
 from docling.datamodel.layout_model_specs import (
     DOCLING_LAYOUT_EGRET_LARGE,
     DOCLING_LAYOUT_EGRET_MEDIUM,
@@ -33,6 +39,7 @@
     InferenceFramework,
     InlineVlmOptions,
     ResponseFormat,
+    TwoStageVlmOptions,
 )
 from docling.datamodel.vlm_model_specs import (
     GRANITE_VISION_OLLAMA as granite_vision_vlm_ollama_conversion_options,
@@ -270,8 +277,9 @@ class VlmPipelineOptions(PaginatedPipelineOptions):
         False  # (To be used with vlms, or other generative models)
     )
     # If True, text from backend will be used instead of generated text
-    vlm_options: Union[InlineVlmOptions, ApiVlmOptions] = (
+    vlm_options: Union[InlineVlmOptions, ApiVlmOptions, TwoStageVlmOptions] = (
         smoldocling_vlm_conversion_options
+        # SMOLDOCLING_TRANSFORMERS
     )
 
 
@@ -283,7 +291,7 @@ class LayoutOptions(BaseModel):
 
 
 class AsrPipelineOptions(PipelineOptions):
-    asr_options: Union[InlineAsrOptions] = asr_model_specs.WHISPER_TINY
+    asr_options: Union[InlineAsrOptions] = WHISPER_TINY  # defaults to the WHISPER_TINY preset imported from asr_model_specs
     artifacts_path: Optional[Union[Path, str]] = None
 
 

diff --git a/docling/datamodel/pipeline_options_asr_model.py b/docling/datamodel/pipeline_options_asr_model.py
@@ -5,10 +5,11 @@
 from typing_extensions import deprecated
 
 from docling.datamodel.accelerator_options import AcceleratorDevice
-from docling.datamodel.pipeline_options_vlm_model import (
-    # InferenceFramework,
-    TransformersModelType,
-)
+
+# from docling.datamodel.pipeline_options_vlm_model import (
+# InferenceFramework,
+# TransformersModelType,
+# )
 
 
 class BaseAsrOptions(BaseModel):

diff --git a/docling/datamodel/pipeline_options_vlm_model.py b/docling/datamodel/pipeline_options_vlm_model.py
@@ -6,6 +6,9 @@
 from typing_extensions import deprecated
 
 from docling.datamodel.accelerator_options import AcceleratorDevice
+from docling.datamodel.layout_model_specs import (
+    LayoutModelConfig,
+)
 
 
 class BaseVlmOptions(BaseModel):
@@ -87,3 +90,12 @@ class ApiVlmOptions(BaseVlmOptions):
     timeout: float = 60
     concurrency: int = 1
     response_format: ResponseFormat
+
+
+class TwoStageVlmOptions(BaseModel):  # bundles a layout model config with a VLM config; subclasses BaseModel directly, not BaseVlmOptions
+    kind: Literal["inline_two_stage_model_options"] = "inline_two_stage_model_options"  # fixed tag value; NOTE(review): presumably used to tell option types apart — confirm
+
+    response_format: ResponseFormat  # format of the final response of the VLM
+
+    layout_options: LayoutModelConfig  # required, no default here (apparent candidate default left commented out: DOCLING_LAYOUT_V2)
+    vlm_options: Union[InlineVlmOptions, ApiVlmOptions]  # required, no default here (apparent candidate default left commented out: SMOLDOCLING_TRANSFORMERS)
diff --git a/docling/datamodel/vlm_model_specs.py b/docling/datamodel/vlm_model_specs.py
@@ -6,12 +6,17 @@
 )
 
 from docling.datamodel.accelerator_options import AcceleratorDevice
+from docling.datamodel.layout_model_specs import (
+    DOCLING_LAYOUT_HERON,
+    DOCLING_LAYOUT_V2,
+)
 from docling.datamodel.pipeline_options_vlm_model import (
     ApiVlmOptions,
     InferenceFramework,
     InlineVlmOptions,
     ResponseFormat,
     TransformersModelType,
+    TwoStageVlmOptions,
 )
 
 _log = logging.getLogger(__name__)
@@ -137,8 +142,15 @@
     temperature=0.0,
 )
 
+VLM2STAGE = TwoStageVlmOptions(  # two-stage preset: a layout model config paired with a VLM config
+    vlm_options=SMOLDOCLING_MLX,  # NOTE(review): this is the MLX variant — presumably needs mlx-vlm installed; confirm the preset is usable on non-MLX platforms
+    layout_options=DOCLING_LAYOUT_HERON,
+    response_format=SMOLDOCLING_MLX.response_format,  # reuse the response format of the wrapped VLM options
+)
+
 
 class VlmModelType(str, Enum):
     SMOLDOCLING = "smoldocling"
     GRANITE_VISION = "granite_vision"
     GRANITE_VISION_OLLAMA = "granite_vision_ollama"
+    VLM2STAGE = "vlm2stage"  # the CLI maps this member to the VLM2STAGE two-stage preset
diff --git a/docling/models/base_model.py b/docling/models/base_model.py
@@ -3,9 +3,16 @@
 from typing import Generic, Optional, Protocol, Type
 
 from docling_core.types.doc import BoundingBox, DocItem, DoclingDocument, NodeItem
+from PIL import Image
 from typing_extensions import TypeVar
 
-from docling.datamodel.base_models import ItemAndImageEnrichmentElement, Page
+from docling.datamodel.base_models import (
+    Cluster,
+    ItemAndImageEnrichmentElement,
+    Page,
+    TextCell,
+    VlmPredictionToken,
+)
 from docling.datamodel.document import ConversionResult
 from docling.datamodel.pipeline_options import BaseOptions
 from docling.datamodel.settings import settings
@@ -19,13 +26,40 @@
 
 
 class BasePageModel(ABC):
+    scale: float  # scale at which the page image needs to be created (dpi = 72 * scale); annotation only, no value is assigned here
+    max_size: int  # maximum width/height of the page image; annotation only, no value is assigned here
+
     @abstractmethod
     def __call__(
         self, conv_res: ConversionResult, page_batch: Iterable[Page]
     ) -> Iterable[Page]:
         pass
 
 
+class BaseLayoutModel(BasePageModel):  # abstract page model that also exposes layout prediction and post-processing as separate steps
+    @abstractmethod
+    def predict_on_page_image(self, *, page_image: Image.Image) -> list[Cluster]:  # keyword-only; returns the clusters predicted for one page image
+        pass
+
+    @abstractmethod
+    def postprocess_on_page_image(
+        self, *, page: Page, clusters: list[Cluster]
+    ) -> tuple[Page, list[Cluster], list[TextCell]]:  # keyword-only; returns (page, processed clusters, processed cells)
+        pass
+
+
+class BaseVlmModel(BasePageModel):  # abstract page model that can also be prompted directly on a single page image
+    @abstractmethod
+    def get_user_prompt(self, page: Optional[Page]) -> str:  # returns the prompt string; `page` may be None
+        pass
+
+    @abstractmethod
+    def predict_on_page_image(
+        self, *, page_image: Image.Image, prompt: str, output_tokens: bool = False
+    ) -> tuple[str, Optional[list[VlmPredictionToken]]]:  # (presumably the generated text, token list or None); NOTE(review): tokens presumably only when output_tokens=True — confirm
+        pass
+
+
 EnrichElementT = TypeVar("EnrichElementT", default=NodeItem)
 
 

diff --git a/docling/models/layout_model.py b/docling/models/layout_model.py
@@ -7,6 +7,7 @@
 
 import numpy as np
 from docling_core.types.doc import DocItemLabel
+from docling_core.types.doc.page import TextCell
 from PIL import Image
 
 from docling.datamodel.accelerator_options import AcceleratorOptions
@@ -15,7 +16,7 @@
 from docling.datamodel.layout_model_specs import DOCLING_LAYOUT_V2, LayoutModelConfig
 from docling.datamodel.pipeline_options import LayoutOptions
 from docling.datamodel.settings import settings
-from docling.models.base_model import BasePageModel
+from docling.models.base_model import BaseLayoutModel, BasePageModel
 from docling.models.utils.hf_model_download import download_hf_model
 from docling.utils.accelerator_utils import decide_device
 from docling.utils.layout_postprocessor import LayoutPostprocessor
@@ -25,7 +26,7 @@
 _log = logging.getLogger(__name__)
 
 
-class LayoutModel(BasePageModel):
+class LayoutModel(BaseLayoutModel):
     TEXT_ELEM_LABELS = [
         DocItemLabel.TEXT,
         DocItemLabel.FOOTNOTE,
@@ -158,6 +159,7 @@ def __call__(
                     page_image = page.get_image(scale=1.0)
                     assert page_image is not None
 
+                    """
                     clusters = []
                     for ix, pred_item in enumerate(
                         self.layout_predictor.predict(page_image)
@@ -176,14 +178,18 @@ def __call__(
                             cells=[],
                         )
                         clusters.append(cluster)
+                    """
+                    predicted_clusters = self.predict_on_page_image(
+                        page_image=page_image
+                    )
 
                     if settings.debug.visualize_raw_layout:
                         self.draw_clusters_and_cells_side_by_side(
-                            conv_res, page, clusters, mode_prefix="raw"
+                            conv_res, page, predicted_clusters, mode_prefix="raw"
                         )
 
                     # Apply postprocessing
-
+                    """
                     processed_clusters, processed_cells = LayoutPostprocessor(
                         page, clusters, self.options
                     ).postprocess()
@@ -210,10 +216,65 @@ def __call__(
                     page.predictions.layout = LayoutPrediction(
                         clusters=processed_clusters
                     )
+                    """
+                    page, processed_clusters, processed_cells = (
+                        self.postprocess_on_page_image(
+                            page=page, clusters=predicted_clusters
+                        )
+                    )
+
+                    with warnings.catch_warnings():
+                        warnings.filterwarnings(
+                            "ignore",
+                            "Mean of empty slice|invalid value encountered in scalar divide",
+                            RuntimeWarning,
+                            "numpy",
+                        )
+
+                        conv_res.confidence.pages[page.page_no].layout_score = float(
+                            np.mean([c.confidence for c in processed_clusters])
+                        )
+
+                        conv_res.confidence.pages[page.page_no].ocr_score = float(
+                            np.mean(
+                                [c.confidence for c in processed_cells if c.from_ocr]
+                            )
+                        )
 
                 if settings.debug.visualize_layout:
                     self.draw_clusters_and_cells_side_by_side(
                         conv_res, page, processed_clusters, mode_prefix="postprocessed"
                     )
 
                 yield page
+
+    def predict_on_page_image(self, *, page_image: Image.Image) -> list[Cluster]:  # run the layout predictor on one page image and wrap each prediction in a Cluster
+        pred_items = self.layout_predictor.predict(page_image)
+
+        clusters = []
+        for ix, pred_item in enumerate(pred_items):
+            label = DocItemLabel(
+                pred_item["label"].lower().replace(" ", "_").replace("-", "_")  # lowercase the label and turn spaces/hyphens into underscores
+            )  # Temporary, until docling-ibm-model uses docling-core types
+            cluster = Cluster(
+                id=ix,  # id is the prediction's position in the predictor output
+                label=label,
+                confidence=pred_item["confidence"],
+                bbox=BoundingBox.model_validate(pred_item),  # bbox is validated from the prediction item itself, not from a sub-field
+                cells=[],  # clusters are created without any cells
+            )
+            clusters.append(cluster)
+
+        return clusters
+
+    def postprocess_on_page_image(
+        self, *, page: Page, clusters: list[Cluster]
+    ) -> tuple[Page, list[Cluster], list[TextCell]]:  # returns (the same page object, processed clusters, processed cells)
+        processed_clusters, processed_cells = LayoutPostprocessor(
+            page, clusters, self.options
+        ).postprocess()
+        # Note: LayoutPostprocessor updates page.cells and page.parsed_page internally
+
+        page.predictions.layout = LayoutPrediction(clusters=processed_clusters)  # stores the processed clusters on the page that was passed in
+
+        return page, processed_clusters, processed_cells