Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions docling/cli/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,7 @@
GRANITE_VISION_TRANSFORMERS,
SMOLDOCLING_MLX,
SMOLDOCLING_TRANSFORMERS,
VLM2STAGE,
VlmModelType,
)
from docling.document_converter import (
Expand Down Expand Up @@ -627,6 +628,12 @@
"To run SmolDocling faster, please install mlx-vlm:\n"
"pip install mlx-vlm"
)
elif vlm_model == VlmModelType.VLM2STAGE:
pipeline_options.vlm_options = VLM2STAGE

Check warning on line 632 in docling/cli/main.py

View check run for this annotation

Codecov / codecov/patch

docling/cli/main.py#L631-L632

Added lines #L631 - L632 were not covered by tests
else:
raise ValueError(

Check warning on line 634 in docling/cli/main.py

View check run for this annotation

Codecov / codecov/patch

docling/cli/main.py#L634

Added line #L634 was not covered by tests
f"{vlm_model} is not of type GRANITE_VISION, GRANITE_VISION_OLLAMA, SMOLDOCLING_TRANSFORMERS or VLM2STAGE"
)

pdf_format_option = PdfFormatOption(
pipeline_cls=VlmPipeline, pipeline_options=pipeline_options
Expand Down
15 changes: 8 additions & 7 deletions docling/datamodel/asr_model_specs.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,12 +11,13 @@
# ApiAsrOptions,
InferenceAsrFramework,
InlineAsrNativeWhisperOptions,
TransformersModelType,
InlineAsrOptions,
# TransformersModelType,
)

_log = logging.getLogger(__name__)

WHISPER_TINY = InlineAsrNativeWhisperOptions(
WHISPER_TINY: InlineAsrOptions = InlineAsrNativeWhisperOptions(
repo_id="tiny",
inference_framework=InferenceAsrFramework.WHISPER,
verbose=True,
Expand All @@ -27,7 +28,7 @@
max_time_chunk=30.0,
)

WHISPER_SMALL = InlineAsrNativeWhisperOptions(
WHISPER_SMALL: InlineAsrOptions = InlineAsrNativeWhisperOptions(
repo_id="small",
inference_framework=InferenceAsrFramework.WHISPER,
verbose=True,
Expand All @@ -38,7 +39,7 @@
max_time_chunk=30.0,
)

WHISPER_MEDIUM = InlineAsrNativeWhisperOptions(
WHISPER_MEDIUM: InlineAsrOptions = InlineAsrNativeWhisperOptions(
repo_id="medium",
inference_framework=InferenceAsrFramework.WHISPER,
verbose=True,
Expand All @@ -49,7 +50,7 @@
max_time_chunk=30.0,
)

WHISPER_BASE = InlineAsrNativeWhisperOptions(
WHISPER_BASE: InlineAsrOptions = InlineAsrNativeWhisperOptions(
repo_id="base",
inference_framework=InferenceAsrFramework.WHISPER,
verbose=True,
Expand All @@ -60,7 +61,7 @@
max_time_chunk=30.0,
)

WHISPER_LARGE = InlineAsrNativeWhisperOptions(
WHISPER_LARGE: InlineAsrOptions = InlineAsrNativeWhisperOptions(
repo_id="large",
inference_framework=InferenceAsrFramework.WHISPER,
verbose=True,
Expand All @@ -71,7 +72,7 @@
max_time_chunk=30.0,
)

WHISPER_TURBO = InlineAsrNativeWhisperOptions(
WHISPER_TURBO: InlineAsrOptions = InlineAsrNativeWhisperOptions(
repo_id="turbo",
inference_framework=InferenceAsrFramework.WHISPER,
verbose=True,
Expand Down
2 changes: 0 additions & 2 deletions docling/datamodel/layout_model_specs.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,8 +26,6 @@ def model_repo_folder(self) -> str:
return self.repo_id.replace("/", "--")


# HuggingFace Layout Models

# Default Docling Layout Model
DOCLING_LAYOUT_V2 = LayoutModelConfig(
name="docling_layout_v2",
Expand Down
16 changes: 12 additions & 4 deletions docling/datamodel/pipeline_options.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,10 +12,16 @@
)
from typing_extensions import deprecated

from docling.datamodel import asr_model_specs

# Import the following for backwards compatibility
from docling.datamodel.accelerator_options import AcceleratorDevice, AcceleratorOptions
from docling.datamodel.asr_model_specs import (
WHISPER_BASE,
WHISPER_LARGE,
WHISPER_MEDIUM,
WHISPER_SMALL,
WHISPER_TINY,
WHISPER_TURBO,
)
from docling.datamodel.layout_model_specs import (
DOCLING_LAYOUT_EGRET_LARGE,
DOCLING_LAYOUT_EGRET_MEDIUM,
Expand All @@ -33,6 +39,7 @@
InferenceFramework,
InlineVlmOptions,
ResponseFormat,
TwoStageVlmOptions,
)
from docling.datamodel.vlm_model_specs import (
GRANITE_VISION_OLLAMA as granite_vision_vlm_ollama_conversion_options,
Expand Down Expand Up @@ -270,8 +277,9 @@ class VlmPipelineOptions(PaginatedPipelineOptions):
False # (To be used with vlms, or other generative models)
)
# If True, text from backend will be used instead of generated text
vlm_options: Union[InlineVlmOptions, ApiVlmOptions] = (
vlm_options: Union[InlineVlmOptions, ApiVlmOptions, TwoStageVlmOptions] = (
smoldocling_vlm_conversion_options
# SMOLDOCLING_TRANSFORMERS
)


Expand All @@ -283,7 +291,7 @@ class LayoutOptions(BaseModel):


class AsrPipelineOptions(PipelineOptions):
asr_options: Union[InlineAsrOptions] = asr_model_specs.WHISPER_TINY
asr_options: Union[InlineAsrOptions] = WHISPER_TINY
artifacts_path: Optional[Union[Path, str]] = None


Expand Down
9 changes: 5 additions & 4 deletions docling/datamodel/pipeline_options_asr_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,10 +5,11 @@
from typing_extensions import deprecated

from docling.datamodel.accelerator_options import AcceleratorDevice
from docling.datamodel.pipeline_options_vlm_model import (
# InferenceFramework,
TransformersModelType,
)

# from docling.datamodel.pipeline_options_vlm_model import (
# InferenceFramework,
# TransformersModelType,
# )


class BaseAsrOptions(BaseModel):
Expand Down
12 changes: 12 additions & 0 deletions docling/datamodel/pipeline_options_vlm_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,9 @@
from typing_extensions import deprecated

from docling.datamodel.accelerator_options import AcceleratorDevice
from docling.datamodel.layout_model_specs import (
LayoutModelConfig,
)


class BaseVlmOptions(BaseModel):
Expand Down Expand Up @@ -87,3 +90,12 @@ class ApiVlmOptions(BaseVlmOptions):
timeout: float = 60
concurrency: int = 1
response_format: ResponseFormat


# Options bundle for a two-stage setup that combines a layout model
# configuration with a VLM configuration.
class TwoStageVlmOptions(BaseModel):
    # Fixed tag identifying this options variant.
    kind: Literal["inline_two_stage_model_options"] = "inline_two_stage_model_options"

    # Format of the final response of the VLM.
    response_format: ResponseFormat

    # Layout model configuration; required, no default is set here.
    layout_options: LayoutModelConfig  # = DOCLING_LAYOUT_V2
    # VLM configuration (inline or API-backed); required, no default is set here.
    vlm_options: Union[InlineVlmOptions, ApiVlmOptions]  # = SMOLDOCLING_TRANSFORMERS
12 changes: 12 additions & 0 deletions docling/datamodel/vlm_model_specs.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,12 +6,17 @@
)

from docling.datamodel.accelerator_options import AcceleratorDevice
from docling.datamodel.layout_model_specs import (
DOCLING_LAYOUT_HERON,
DOCLING_LAYOUT_V2,
)
from docling.datamodel.pipeline_options_vlm_model import (
ApiVlmOptions,
InferenceFramework,
InlineVlmOptions,
ResponseFormat,
TransformersModelType,
TwoStageVlmOptions,
)

_log = logging.getLogger(__name__)
Expand Down Expand Up @@ -137,8 +142,15 @@
temperature=0.0,
)

# Two-stage preset combining the Heron layout model with the SmolDocling MLX
# VLM; the final response format is taken from the VLM preset itself.
# NOTE(review): this pins the MLX variant, which presumably requires mlx-vlm at
# runtime — confirm whether a non-MLX fallback is needed on other platforms.
VLM2STAGE = TwoStageVlmOptions(
    response_format=SMOLDOCLING_MLX.response_format,
    layout_options=DOCLING_LAYOUT_HERON,
    vlm_options=SMOLDOCLING_MLX,
)


# Identifiers of the selectable VLM presets. Members are also plain strings
# because the enum mixes in ``str``.
class VlmModelType(str, Enum):
    SMOLDOCLING = "smoldocling"
    GRANITE_VISION = "granite_vision"
    GRANITE_VISION_OLLAMA = "granite_vision_ollama"
    # Selects the two-stage (layout + VLM) preset.
    VLM2STAGE = "vlm2stage"
36 changes: 35 additions & 1 deletion docling/models/base_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,16 @@
from typing import Generic, Optional, Protocol, Type

from docling_core.types.doc import BoundingBox, DocItem, DoclingDocument, NodeItem
from PIL import Image
from typing_extensions import TypeVar

from docling.datamodel.base_models import ItemAndImageEnrichmentElement, Page
from docling.datamodel.base_models import (
Cluster,
ItemAndImageEnrichmentElement,
Page,
TextCell,
VlmPredictionToken,
)
from docling.datamodel.document import ConversionResult
from docling.datamodel.pipeline_options import BaseOptions
from docling.datamodel.settings import settings
Expand All @@ -19,13 +26,40 @@


class BasePageModel(ABC):
    """Abstract base for models that are called on a batch of pages.

    Subclasses implement ``__call__``, which takes the conversion result and
    an iterable of pages and returns an iterable of pages.
    """

    # Scale with which the page image needs to be created (dpi = 72 * scale).
    scale: float
    # Maximum size of the width/height of the page image.
    max_size: int

    @abstractmethod
    def __call__(
        self, conv_res: ConversionResult, page_batch: Iterable[Page]
    ) -> Iterable[Page]:
        """Process ``page_batch`` in the context of ``conv_res`` and return pages."""


class BaseLayoutModel(BasePageModel):
@abstractmethod
def predict_on_page_image(self, *, page_image: Image.Image) -> list[Cluster]:
    """Return the layout clusters predicted for ``page_image``.

    ``page_image`` is keyword-only. Implementations must override this.
    """

Check warning on line 42 in docling/models/base_model.py

View check run for this annotation

Codecov / codecov/patch

docling/models/base_model.py#L42

Added line #L42 was not covered by tests

@abstractmethod
def postprocess_on_page_image(
    self, *, page: Page, clusters: list[Cluster]
) -> tuple[Page, list[Cluster], list[TextCell]]:
    """Post-process ``clusters`` for ``page``.

    Returns a ``(page, clusters, text cells)`` tuple. Both arguments are
    keyword-only. Implementations must override this.
    """

Check warning on line 48 in docling/models/base_model.py

View check run for this annotation

Codecov / codecov/patch

docling/models/base_model.py#L48

Added line #L48 was not covered by tests


class BaseVlmModel(BasePageModel):
@abstractmethod
def get_user_prompt(self, page: Optional[Page]) -> str:
    """Return the user prompt string for ``page`` (which may be ``None``).

    Implementations must override this.
    """

Check warning on line 54 in docling/models/base_model.py

View check run for this annotation

Codecov / codecov/patch

docling/models/base_model.py#L54

Added line #L54 was not covered by tests

@abstractmethod
def predict_on_page_image(
    self, *, page_image: Image.Image, prompt: str, output_tokens: bool = False
) -> tuple[str, Optional[list[VlmPredictionToken]]]:
    """Run the VLM on ``page_image`` with ``prompt``.

    Returns a ``(text, tokens)`` tuple where ``tokens`` may be ``None``;
    presumably tokens are only populated when ``output_tokens`` is true —
    confirm against the concrete implementations. All arguments are
    keyword-only. Implementations must override this.
    """

Check warning on line 60 in docling/models/base_model.py

View check run for this annotation

Codecov / codecov/patch

docling/models/base_model.py#L60

Added line #L60 was not covered by tests


EnrichElementT = TypeVar("EnrichElementT", default=NodeItem)


Expand Down
69 changes: 65 additions & 4 deletions docling/models/layout_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@

import numpy as np
from docling_core.types.doc import DocItemLabel
from docling_core.types.doc.page import TextCell
from PIL import Image

from docling.datamodel.accelerator_options import AcceleratorOptions
Expand All @@ -15,7 +16,7 @@
from docling.datamodel.layout_model_specs import DOCLING_LAYOUT_V2, LayoutModelConfig
from docling.datamodel.pipeline_options import LayoutOptions
from docling.datamodel.settings import settings
from docling.models.base_model import BasePageModel
from docling.models.base_model import BaseLayoutModel, BasePageModel
from docling.models.utils.hf_model_download import download_hf_model
from docling.utils.accelerator_utils import decide_device
from docling.utils.layout_postprocessor import LayoutPostprocessor
Expand All @@ -25,7 +26,7 @@
_log = logging.getLogger(__name__)


class LayoutModel(BasePageModel):
class LayoutModel(BaseLayoutModel):
TEXT_ELEM_LABELS = [
DocItemLabel.TEXT,
DocItemLabel.FOOTNOTE,
Expand Down Expand Up @@ -158,6 +159,7 @@ def __call__(
page_image = page.get_image(scale=1.0)
assert page_image is not None

"""
clusters = []
for ix, pred_item in enumerate(
self.layout_predictor.predict(page_image)
Expand All @@ -176,14 +178,18 @@ def __call__(
cells=[],
)
clusters.append(cluster)
"""
predicted_clusters = self.predict_on_page_image(
page_image=page_image
)

if settings.debug.visualize_raw_layout:
self.draw_clusters_and_cells_side_by_side(
conv_res, page, clusters, mode_prefix="raw"
conv_res, page, predicted_clusters, mode_prefix="raw"
)

# Apply postprocessing

"""
processed_clusters, processed_cells = LayoutPostprocessor(
page, clusters, self.options
).postprocess()
Expand All @@ -210,10 +216,65 @@ def __call__(
page.predictions.layout = LayoutPrediction(
clusters=processed_clusters
)
"""
page, processed_clusters, processed_cells = (
self.postprocess_on_page_image(
page=page, clusters=predicted_clusters
)
)

with warnings.catch_warnings():
warnings.filterwarnings(
"ignore",
"Mean of empty slice|invalid value encountered in scalar divide",
RuntimeWarning,
"numpy",
)

conv_res.confidence.pages[page.page_no].layout_score = float(
np.mean([c.confidence for c in processed_clusters])
)

conv_res.confidence.pages[page.page_no].ocr_score = float(
np.mean(
[c.confidence for c in processed_cells if c.from_ocr]
)
)

if settings.debug.visualize_layout:
self.draw_clusters_and_cells_side_by_side(
conv_res, page, processed_clusters, mode_prefix="postprocessed"
)

yield page

def predict_on_page_image(self, *, page_image: Image.Image) -> list[Cluster]:
    """Run the layout predictor on ``page_image`` and wrap each hit in a Cluster.

    Each cluster's ``id`` is the 0-based position of the prediction in the
    predictor's output, and every cluster starts with an empty ``cells`` list.
    """

    def _as_label(raw_label: str) -> DocItemLabel:
        # Temporary, until docling-ibm-model uses docling-core types
        return DocItemLabel(raw_label.lower().replace(" ", "_").replace("-", "_"))

    predictions = self.layout_predictor.predict(page_image)

    return [
        Cluster(
            id=position,
            label=_as_label(prediction["label"]),
            confidence=prediction["confidence"],
            # The prediction mapping itself is validated as the bounding box.
            bbox=BoundingBox.model_validate(prediction),
            cells=[],
        )
        for position, prediction in enumerate(predictions)
    ]

def postprocess_on_page_image(
    self, *, page: Page, clusters: list[Cluster]
) -> tuple[Page, list[Cluster], list[TextCell]]:
    """Post-process raw ``clusters`` for ``page`` and store the layout prediction.

    Runs ``LayoutPostprocessor`` with this model's options, assigns the
    resulting clusters to ``page.predictions.layout`` and returns
    ``(page, processed clusters, processed cells)``.
    """
    postprocessor = LayoutPostprocessor(page, clusters, self.options)
    # Note: LayoutPostprocessor updates page.cells and page.parsed_page internally
    final_clusters, final_cells = postprocessor.postprocess()

    page.predictions.layout = LayoutPrediction(clusters=final_clusters)

    return page, final_clusters, final_cells
Loading
Loading