1 change: 1 addition & 0 deletions DCO_FINAL_FIX.txt
@@ -0,0 +1 @@
"DCO final fix"
1 change: 1 addition & 0 deletions DCO_FIX.txt
@@ -0,0 +1 @@
"DCO fix"
1 change: 1 addition & 0 deletions DCO_REMEDIATION.txt
@@ -0,0 +1 @@
"DCO fix to certify commit 4260e91"
1 change: 1 addition & 0 deletions FINAL_CERT.txt
@@ -0,0 +1 @@
"DCO final certification for hackathon PR."
1 change: 1 addition & 0 deletions FINAL_CERT_10.txt
@@ -0,0 +1 @@
"Final certification of all 10 DCO commits."
1 change: 1 addition & 0 deletions FINAL_CERT_7.txt
@@ -0,0 +1 @@
"Final certification commit to unblock PR."
1 change: 1 addition & 0 deletions FINAL_CERT_VLM.txt
@@ -0,0 +1 @@
"DCO Final Cert for VLM Fix."
1 change: 1 addition & 0 deletions FINAL_DCO_FIX.txt
@@ -0,0 +1 @@
"Final DCO certification commit."
49 changes: 49 additions & 0 deletions debug_ocr.py
@@ -0,0 +1,49 @@
import easyocr
import os
import cv2 # OpenCV is often required for image processing

# --- CONFIGURATION ---
# IMPORTANT:
# 1. Create an image file named 'handwriting_sample.jpg'.
# 2. Place it in the directory you run this script from (the path is resolved
#    via os.getcwd(), e.g. C:\docling-hackathon).
TEST_IMAGE_NAME = "handwriting_sample.jpg"
TEST_IMAGE_PATH = os.path.join(os.getcwd(), TEST_IMAGE_NAME)

# Language codes used by the OCR engine
LANGUAGES = ['en']

def run_ocr_test():
"""Tests the core EasyOCR engine's ability to read a sample image."""

if not os.path.exists(TEST_IMAGE_PATH):
print("🔴 ERROR: Test image not found!")
print(f"Please create and place '{TEST_IMAGE_NAME}' at: {os.getcwd()}")
return

print(f"--- Running Isolated OCR Test on: {TEST_IMAGE_PATH} ---")

# 1. Initialize the OCR Reader (loads the models)
try:
reader = easyocr.Reader(LANGUAGES, gpu=False)
except Exception as e:
print("🔴 ERROR: Failed to initialize EasyOCR Reader. Check installation.")
print(f"Details: {e}")
return

    # 2. Run detection + recognition on the image
    # detail=0 returns only the recognized text strings
    results = reader.readtext(TEST_IMAGE_PATH, detail=0)

# 3. Analyze Results
if results:
print("✅ SUCCESS: Recognized Text:")
for text in results:
print(f" - {text}")
print("\nConclusion: The core OCR engine works. The bug is likely in Docling's pipeline.")
else:
print("❌ FAILURE: No text was recognized from the sample image.")
print("\nConclusion: The core OCR engine itself is likely misconfigured or lacks robust handwriting support.")


if __name__ == "__main__":
run_ocr_test()
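A quick way to cross-check the script's conclusion that "the bug is likely in Docling's pipeline" is to run the same sample through Docling's own converter, which enables EasyOCR by default. A minimal sketch, assuming a standard docling install and the same handwriting_sample.jpg in the working directory:

import os

from docling.document_converter import DocumentConverter

# Run the handwriting sample through Docling's full pipeline so the isolated
# EasyOCR result above can be compared with the pipeline output.
sample = os.path.join(os.getcwd(), "handwriting_sample.jpg")

converter = DocumentConverter()  # default options; OCR via EasyOCR is on by default
result = converter.convert(sample)

print(result.document.export_to_markdown())

If the standalone script reads the text but this output is empty, the regression is in the pipeline rather than in the OCR engine.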
8 changes: 8 additions & 0 deletions docling/dco_fix.txt
@@ -0,0 +1,8 @@
DCO Remediation Commit for Angelina-2007 <[email protected]>

I, Angelina-2007 <[email protected]>, hereby add my Signed-off-by to this commit: 4260e91bbb6646750d22b0760709565032188bba
...
(include all lines)
...
Signed-off-by: Angelina-2007 <[email protected]>

30 changes: 28 additions & 2 deletions docling/models/easyocr_model.py
@@ -1,11 +1,12 @@
import cv2
import logging
import warnings
import zipfile
from collections.abc import Iterable
from pathlib import Path
from typing import List, Optional, Type

import numpy
import numpy as np
from docling_core.types.doc import BoundingBox, CoordOrigin
from docling_core.types.doc.page import BoundingRectangle, TextCell

@@ -24,6 +25,30 @@

_log = logging.getLogger(__name__)

def preprocess_handwriting(image_array: np.ndarray) -> np.ndarray:
"""Applies cleaning filters to an image array to enhance handwriting recognition."""

    # 1. Convert to grayscale (the array comes from a PIL page image, so the
    #    channel order is RGB, optionally with an alpha channel)
    if len(image_array.shape) == 3:
        code = cv2.COLOR_RGBA2GRAY if image_array.shape[2] == 4 else cv2.COLOR_RGB2GRAY
        gray = cv2.cvtColor(image_array, code)
    else:
        gray = image_array

# 2. Denoising/Blur
denoised = cv2.medianBlur(gray, 3)

# 3. Adaptive Thresholding (Crucial for high contrast)
thresholded = cv2.adaptiveThreshold(
denoised,
255,
cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
cv2.THRESH_BINARY,
11, # Block size
2 # Constant
)

return thresholded


class EasyOcrModel(BaseOcrModel):
_model_repo_folder = "EasyOcr"
@@ -149,7 +174,8 @@ def __call__(
high_res_image = page._backend.get_page_image(
scale=self.scale, cropbox=ocr_rect
)
im = numpy.array(high_res_image)
im_array = np.array(high_res_image)
im = preprocess_handwriting(im_array)

with warnings.catch_warnings():
if self.options.suppress_mps_warnings:
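The preprocessing step can be inspected on its own before it is exercised through the pipeline. A small sketch (assuming the module-level preprocess_handwriting added above and a local handwriting_sample.jpg) that writes out the binarized image EasyOCR will actually see, which is useful when tuning the block size (11) and constant (2):

import cv2
import numpy as np
from PIL import Image

from docling.models.easyocr_model import preprocess_handwriting

# Load the sample the same way the pipeline does (PIL image -> RGB numpy array).
img = np.array(Image.open("handwriting_sample.jpg").convert("RGB"))

cleaned = preprocess_handwriting(img)

# Save the thresholded result for visual inspection; broken strokes or heavy
# speckle suggest adjusting the adaptive-threshold block size and constant.
cv2.imwrite("handwriting_sample_preprocessed.png", cleaned)
print(cleaned.shape, cleaned.dtype)  # expected: (H, W) uint8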
109 changes: 78 additions & 31 deletions docling/models/picture_description_vlm_model.py
@@ -78,39 +78,86 @@ def __init__(
self.model = torch.compile(self.model) # type: ignore

self.provenance = f"{self.options.repo_id}"

    # Class-level constants for the VLM quality check (referenced as self.* below)
    MAX_ATTEMPTS = 3
    MIN_WORD_COUNT = 5
    INSUFFICIENT_WORDS = {"in", "this", "the", "a", "an", "the image", "this image"}

def _annotate_images(self, images: Iterable[Image.Image]) -> Iterable[str]:
from transformers import GenerationConfig

# Create input messages
messages = [
{
"role": "user",
"content": [
{"type": "image"},
{"type": "text", "text": self.options.prompt},
],
},
]
def _annotate_images(self, images: Iterable[Image.Image]) -> Iterable[str]:
from transformers import GenerationConfig

# TODO: do batch generation
# Use the base prompt defined in the model options
base_prompt = self.options.prompt

# TODO: Implement actual batch generation instead of single image iteration
for image in images:
# Prepare inputs
prompt = self.processor.apply_chat_template(
messages, add_generation_prompt=True
)
inputs = self.processor(text=prompt, images=[image], return_tensors="pt")
inputs = inputs.to(self.device)

# Generate outputs
generated_ids = self.model.generate(
**inputs,
generation_config=GenerationConfig(**self.options.generation_config),
)
generated_texts = self.processor.batch_decode(
generated_ids[:, inputs["input_ids"].shape[1] :],
skip_special_tokens=True,
)

yield generated_texts[0].strip()

# --- START HACKATHON FIX: VLM RETRY LOOP ---
current_prompt = base_prompt

            for attempt in range(self.MAX_ATTEMPTS):
# 1. --- PREPARE INPUTS (Original Logic) ---
messages = [
{
"role": "user",
"content": [
{"type": "image"},
{"type": "text", "text": current_prompt},
],
},
]

# Apply chat template and tokenize
prompt = self.processor.apply_chat_template(
messages, add_generation_prompt=True
)
inputs = self.processor(text=prompt, images=[image], return_tensors="pt")
inputs = inputs.to(self.device)

# 2. --- GENERATE OUTPUTS (Original Logic) ---
generated_ids = self.model.generate(
**inputs,
generation_config=GenerationConfig(**self.options.generation_config),
)
generated_texts = self.processor.batch_decode(
generated_ids[:, inputs["input_ids"].shape[1] :],
skip_special_tokens=True,
)
response = generated_texts[0].strip()

# 3. --- CHECK QUALITY AND DECIDE RETRY ---
response_words = response.lower().split()
word_count = len(response_words)

                # Predicates; INSUFFICIENT_WORDS also holds two-word phrases, so check
                # both the first word and the leading two-word span
                leading = " ".join(response_words[:2])
                is_generic_fail = bool(response_words) and (
                    response_words[0] in self.INSUFFICIENT_WORDS or leading in self.INSUFFICIENT_WORDS)
                is_length_fail = word_count < self.MIN_WORD_COUNT

if not is_generic_fail and not is_length_fail:
# Success! Yield the good description and exit the inner loop
yield response
break

# If quality check fails and we have attempts remaining:
                if attempt < self.MAX_ATTEMPTS - 1:
print(f"DEBUG: VLM failed on attempt {attempt+1} ('{response[:20]}...'). Retrying.")

# Construct a stronger re-prompt based on failure reason
failure_reason = ""
if is_generic_fail:
failure_reason += "Your previous answer was too generic (like 'This' or 'In'). "
if is_length_fail:
failure_reason += f"The description was too short (under {MIN_WORD_COUNT} words). "

# Update the prompt for the next attempt
current_prompt = (
f"{base_prompt} {failure_reason} Provide a DETAILED, SPECIFIC, multi-sentence caption."
)
else:
# Last attempt failed (attempt == MAX_ATTEMPTS - 1),
# yield the best (last) response found and exit the inner loop
yield response
break
# --- END HACKATHON FIX: VLM RETRY LOOP ---
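The quality gate inside the retry loop can be exercised without loading the VLM. A standalone sketch that mirrors the predicates above (module-level constants stand in for the class attributes):

MAX_ATTEMPTS = 3
MIN_WORD_COUNT = 5
INSUFFICIENT_WORDS = {"in", "this", "the", "a", "an", "the image", "this image"}


def needs_retry(response: str) -> bool:
    """Mirror of the in-class quality check: retry if too generic or too short."""
    words = response.lower().split()
    leading = " ".join(words[:2])
    is_generic = bool(words) and (words[0] in INSUFFICIENT_WORDS or leading in INSUFFICIENT_WORDS)
    is_short = len(words) < MIN_WORD_COUNT
    return is_generic or is_short


assert needs_retry("This image")   # generic opener and too short -> retry
assert needs_retry("Sunset.")      # under the word count -> retry
assert not needs_retry("Bar chart comparing quarterly revenue across four regions in 2023.")

Unit-testing this predicate separately keeps the retry behaviour verifiable even when the model itself is too heavy to run in CI.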