1 change: 1 addition & 0 deletions DCO_FINAL_FIX.txt
@@ -0,0 +1 @@
"DCO final fix"
1 change: 1 addition & 0 deletions DCO_FIX.txt
@@ -0,0 +1 @@
"DCO fix"
1 change: 1 addition & 0 deletions DCO_REMEDIATION.txt
@@ -0,0 +1 @@
"DCO fix to certify commit 4260e91"
1 change: 1 addition & 0 deletions FINAL_CERT.txt
@@ -0,0 +1 @@
"DCO final certification for hackathon PR."
1 change: 1 addition & 0 deletions FINAL_CERT_10.txt
@@ -0,0 +1 @@
"Final certification of all 10 DCO commits."
1 change: 1 addition & 0 deletions FINAL_CERT_7.txt
@@ -0,0 +1 @@
"Final certification commit to unblock PR."
1 change: 1 addition & 0 deletions FINAL_CERT_VLM.txt
@@ -0,0 +1 @@
"DCO Final Cert for VLM Fix."
1 change: 1 addition & 0 deletions FINAL_DCO_FIX.txt
@@ -0,0 +1 @@
"Final DCO certification commit."
49 changes: 49 additions & 0 deletions debug_ocr.py
@@ -0,0 +1,49 @@
import easyocr
import os
import cv2 # OpenCV is often required for image processing

# --- CONFIGURATION ---
# IMPORTANT:
# 1. Create an image file named 'handwriting_sample.jpg'.
# 2. Place it in the directory you run this script from (the path is resolved
#    via os.getcwd(), e.g. C:\docling-hackathon).
TEST_IMAGE_NAME = "handwriting_sample.jpg"
TEST_IMAGE_PATH = os.path.join(os.getcwd(), TEST_IMAGE_NAME)

# Language codes used by the OCR engine
LANGUAGES = ['en']

def run_ocr_test():
"""Tests the core EasyOCR engine's ability to read a sample image."""

if not os.path.exists(TEST_IMAGE_PATH):
print("🔴 ERROR: Test image not found!")
print(f"Please create and place '{TEST_IMAGE_NAME}' at: {os.getcwd()}")
return

print(f"--- Running Isolated OCR Test on: {TEST_IMAGE_PATH} ---")

# 1. Initialize the OCR Reader (loads the models)
try:
reader = easyocr.Reader(LANGUAGES, gpu=False)
except Exception as e:
print("🔴 ERROR: Failed to initialize EasyOCR Reader. Check installation.")
print(f"Details: {e}")
return

    # 2. Run detection + recognition on the image
    # detail=0 returns only the recognized text strings
    results = reader.readtext(TEST_IMAGE_PATH, detail=0)

# 3. Analyze Results
if results:
print("✅ SUCCESS: Recognized Text:")
for text in results:
print(f" - {text}")
print("\nConclusion: The core OCR engine works. The bug is likely in Docling's pipeline.")
else:
print("❌ FAILURE: No text was recognized from the sample image.")
print("\nConclusion: The core OCR engine itself is likely misconfigured or lacks robust handwriting support.")


if __name__ == "__main__":
run_ocr_test()
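A quick way to cross-check the script's conclusion that "the bug is likely in Docling's pipeline" is to run the same sample through Docling's own converter, which enables EasyOCR by default. A minimal sketch, assuming a standard docling install and the same handwriting_sample.jpg in the working directory:

import os

from docling.document_converter import DocumentConverter

# Run the handwriting sample through Docling's full pipeline so the isolated
# EasyOCR result above can be compared with the pipeline output.
sample = os.path.join(os.getcwd(), "handwriting_sample.jpg")

converter = DocumentConverter()  # default options; OCR via EasyOCR is on by default
result = converter.convert(sample)

print(result.document.export_to_markdown())

If the standalone script reads the text but this output is empty, the regression is in the pipeline rather than in the OCR engine.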
8 changes: 8 additions & 0 deletions docling/dco_fix.txt
@@ -0,0 +1,8 @@
DCO Remediation Commit for Angelina-2007 <[email protected]>

I, Angelina-2007 <[email protected]>, hereby add my Signed-off-by to this commit: 4260e91bbb6646750d22b0760709565032188bba
...
(include all lines)
...
Signed-off-by: Angelina-2007 <[email protected]>

30 changes: 28 additions & 2 deletions docling/models/easyocr_model.py
@@ -1,11 +1,12 @@
import cv2
import logging
import warnings
import zipfile
from collections.abc import Iterable
from pathlib import Path
from typing import List, Optional, Type

import numpy
import numpy as np
from docling_core.types.doc import BoundingBox, CoordOrigin
from docling_core.types.doc.page import BoundingRectangle, TextCell

@@ -24,6 +25,30 @@

_log = logging.getLogger(__name__)

def preprocess_handwriting(image_array: np.ndarray) -> np.ndarray:
"""Applies cleaning filters to an image array to enhance handwriting recognition."""

    # 1. Convert to grayscale (the array comes from a PIL page image, so the
    #    channel order is RGB, optionally with an alpha channel)
    if len(image_array.shape) == 3:
        code = cv2.COLOR_RGBA2GRAY if image_array.shape[2] == 4 else cv2.COLOR_RGB2GRAY
        gray = cv2.cvtColor(image_array, code)
    else:
        gray = image_array

# 2. Denoising/Blur
denoised = cv2.medianBlur(gray, 3)

# 3. Adaptive Thresholding (Crucial for high contrast)
thresholded = cv2.adaptiveThreshold(
denoised,
255,
cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
cv2.THRESH_BINARY,
11, # Block size
2 # Constant
)

return thresholded


class EasyOcrModel(BaseOcrModel):
_model_repo_folder = "EasyOcr"
@@ -149,7 +174,8 @@ def __call__(
high_res_image = page._backend.get_page_image(
scale=self.scale, cropbox=ocr_rect
)
im = numpy.array(high_res_image)
im_array = np.array(high_res_image)
im = preprocess_handwriting(im_array)

with warnings.catch_warnings():
if self.options.suppress_mps_warnings:
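The preprocessing step can be inspected on its own before it is exercised through the pipeline. A small sketch (assuming the module-level preprocess_handwriting added above and a local handwriting_sample.jpg) that writes out the binarized image EasyOCR will actually see, which is useful when tuning the block size (11) and constant (2):

import cv2
import numpy as np
from PIL import Image

from docling.models.easyocr_model import preprocess_handwriting

# Load the sample the same way the pipeline does (PIL image -> RGB numpy array).
img = np.array(Image.open("handwriting_sample.jpg").convert("RGB"))

cleaned = preprocess_handwriting(img)

# Save the thresholded result for visual inspection; broken strokes or heavy
# speckle suggest adjusting the adaptive-threshold block size and constant.
cv2.imwrite("handwriting_sample_preprocessed.png", cleaned)
print(cleaned.shape, cleaned.dtype)  # expected: (H, W) uint8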
109 changes: 78 additions & 31 deletions docling/models/picture_description_vlm_model.py
@@ -78,39 +78,86 @@ def __init__(
self.model = torch.compile(self.model) # type: ignore

self.provenance = f"{self.options.repo_id}"

    # Class-level constants for the VLM quality check (referenced as self.* below)
    MAX_ATTEMPTS = 3
    MIN_WORD_COUNT = 5
    INSUFFICIENT_WORDS = {"in", "this", "the", "a", "an", "the image", "this image"}

def _annotate_images(self, images: Iterable[Image.Image]) -> Iterable[str]:
from transformers import GenerationConfig

# Create input messages
messages = [
{
"role": "user",
"content": [
{"type": "image"},
{"type": "text", "text": self.options.prompt},
],
},
]
def _annotate_images(self, images: Iterable[Image.Image]) -> Iterable[str]:
from transformers import GenerationConfig

# TODO: do batch generation
# Use the base prompt defined in the model options
base_prompt = self.options.prompt

# TODO: Implement actual batch generation instead of single image iteration
for image in images:
# Prepare inputs
prompt = self.processor.apply_chat_template(
messages, add_generation_prompt=True
)
inputs = self.processor(text=prompt, images=[image], return_tensors="pt")
inputs = inputs.to(self.device)

# Generate outputs
generated_ids = self.model.generate(
**inputs,
generation_config=GenerationConfig(**self.options.generation_config),
)
generated_texts = self.processor.batch_decode(
generated_ids[:, inputs["input_ids"].shape[1] :],
skip_special_tokens=True,
)

yield generated_texts[0].strip()

# --- START HACKATHON FIX: VLM RETRY LOOP ---
current_prompt = base_prompt

            for attempt in range(self.MAX_ATTEMPTS):
# 1. --- PREPARE INPUTS (Original Logic) ---
messages = [
{
"role": "user",
"content": [
{"type": "image"},
{"type": "text", "text": current_prompt},
],
},
]

# Apply chat template and tokenize
prompt = self.processor.apply_chat_template(
messages, add_generation_prompt=True
)
inputs = self.processor(text=prompt, images=[image], return_tensors="pt")
inputs = inputs.to(self.device)

# 2. --- GENERATE OUTPUTS (Original Logic) ---
generated_ids = self.model.generate(
**inputs,
generation_config=GenerationConfig(**self.options.generation_config),
)
generated_texts = self.processor.batch_decode(
generated_ids[:, inputs["input_ids"].shape[1] :],
skip_special_tokens=True,
)
response = generated_texts[0].strip()

# 3. --- CHECK QUALITY AND DECIDE RETRY ---
response_words = response.lower().split()
word_count = len(response_words)

                # Predicates; INSUFFICIENT_WORDS also holds two-word phrases, so check
                # both the first word and the leading two-word span
                leading = " ".join(response_words[:2])
                is_generic_fail = bool(response_words) and (
                    response_words[0] in self.INSUFFICIENT_WORDS or leading in self.INSUFFICIENT_WORDS)
                is_length_fail = word_count < self.MIN_WORD_COUNT

if not is_generic_fail and not is_length_fail:
# Success! Yield the good description and exit the inner loop
yield response
break

# If quality check fails and we have attempts remaining:
                if attempt < self.MAX_ATTEMPTS - 1:
print(f"DEBUG: VLM failed on attempt {attempt+1} ('{response[:20]}...'). Retrying.")

# Construct a stronger re-prompt based on failure reason
failure_reason = ""
if is_generic_fail:
failure_reason += "Your previous answer was too generic (like 'This' or 'In'). "
if is_length_fail:
failure_reason += f"The description was too short (under {MIN_WORD_COUNT} words). "

# Update the prompt for the next attempt
current_prompt = (
f"{base_prompt} {failure_reason} Provide a DETAILED, SPECIFIC, multi-sentence caption."
)
else:
# Last attempt failed (attempt == MAX_ATTEMPTS - 1),
# yield the best (last) response found and exit the inner loop
yield response
break
# --- END HACKATHON FIX: VLM RETRY LOOP ---
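The quality gate inside the retry loop can be exercised without loading the VLM. A standalone sketch that mirrors the predicates above (module-level constants stand in for the class attributes):

MAX_ATTEMPTS = 3
MIN_WORD_COUNT = 5
INSUFFICIENT_WORDS = {"in", "this", "the", "a", "an", "the image", "this image"}


def needs_retry(response: str) -> bool:
    """Mirror of the in-class quality check: retry if too generic or too short."""
    words = response.lower().split()
    leading = " ".join(words[:2])
    is_generic = bool(words) and (words[0] in INSUFFICIENT_WORDS or leading in INSUFFICIENT_WORDS)
    is_short = len(words) < MIN_WORD_COUNT
    return is_generic or is_short


assert needs_retry("This image")   # generic opener and too short -> retry
assert needs_retry("Sunset.")      # under the word count -> retry
assert not needs_retry("Bar chart comparing quarterly revenue across four regions in 2023.")

Unit-testing this predicate separately keeps the retry behaviour verifiable even when the model itself is too heavy to run in CI.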