diff --git a/README.md b/README.md index e030c408..20e3703e 100644 --- a/README.md +++ b/README.md @@ -94,6 +94,70 @@ You can also pipe content: cat path-to-file.pdf | markitdown ``` +### Bounding Boxes + +Use `--emit-bbox` to generate a sidecar JSON file with page, line, and word bounding boxes for PDF and image inputs: + +```bash +markitdown sample.pdf --emit-bbox +``` + +This writes `sample.bbox.json` alongside the Markdown output. The structure of the JSON file is: + +```json +{ + "version": "1.0", + "source": "sample.pdf", + "pages": [{ "page": 1, "width": 612, "height": 792 }], + "lines": [{ "page": 1, "text": "Hello", "bbox_norm": [0,0,0,0], "bbox_abs": [0,0,0,0], "confidence": null, "md_span": {"start": null, "end": null} }], + "words": [{ "page": 1, "text": "Hello", "bbox_norm": [0,0,0,0], "bbox_abs": [0,0,0,0], "confidence": null, "line_id": 0 }] +} +``` + +`bbox_abs` values are in pixel units of the page or image, with a top-left origin. `bbox_norm` values are normalized to the range `[0,1]`. + +For scanned PDFs or images without embedded text, MarkItDown falls back to Tesseract OCR when `--emit-bbox` is supplied. Set `MARKITDOWN_OCR_LANG` (or use `--ocr-lang`) to control OCR languages. Use `TESSDATA_PREFIX` if custom language packs are installed. + +For an example comparison with Docling outputs, see [docling_comparison.md](docling_comparison.md). +For a comprehensive evaluation on the Docling test dataset, see [docling_dataset_comparison.md](docling_dataset_comparison.md). +Across the 12 supported documents, MarkItDown's Markdown differed from the Docling ground truth by roughly **45%** on average, +with bounding box coordinates deviating by about **18%**. Right-to-left pages and scanned forms contributed most of the +discrepancies. 
+ +### Docling Test Data Timing + +The following table reports the time required by `markitdown` to convert each PDF, TIFF, and PNG file from the [Docling test dataset](https://github.com/docling-project/docling/tree/main/tests/data) into Markdown and to generate bounding boxes (`--emit-bbox`). The TIFF sample was first converted to PNG for processing. + +| File | Type | MD Time (s) | BBox Time (s) | +| --- | --- | --- | --- | +| 2305.03393v1-pg9-img.png | png | 2.51 | 5.56 | +| 2203.01017v2.pdf | pdf | 4.59 | 9.30 | +| 2206.01062.pdf | pdf | 4.94 | 11.21 | +| 2305.03393v1-pg9.pdf | pdf | 2.69 | 2.88 | +| 2305.03393v1.pdf | pdf | 3.71 | 6.70 | +| amt_handbook_sample.pdf | pdf | 3.14 | 3.99 | +| code_and_formula.pdf | pdf | 2.80 | 3.24 | +| multi_page.pdf | pdf | 2.89 | 3.93 | +| picture_classification.pdf | pdf | 2.68 | 2.92 | +| redp5110_sampled.pdf | pdf | 3.71 | 8.67 | +| right_to_left_01.pdf | pdf | 2.83 | 2.87 | +| right_to_left_02.pdf | pdf | 2.70 | 3.01 | +| right_to_left_03.pdf | pdf | 2.81 | 2.93 | +| 2206.01062.tif | tiff | 2.57 | 4.19 | + +#### Average Times by Type + +| Type | Avg MD Time (s) | Avg BBox Time (s) | +| --- | --- | --- | +| png | 2.51 | 5.56 | +| pdf | 3.29 | 5.14 | +| tiff | 2.57 | 4.19 | + +#### Overall Average Times + +* Average MD Time: 3.18 s +* Average BBox Time: 5.10 s + ### Optional Dependencies MarkItDown has optional dependencies for activating various file formats. Earlier in this document, we installed all optional dependencies with the `[all]` option. However, you can also install them individually for more control. For example: diff --git a/docling_comparison.md b/docling_comparison.md new file mode 100644 index 00000000..1d9c81d9 --- /dev/null +++ b/docling_comparison.md @@ -0,0 +1,27 @@ +# Docling vs MarkItDown on ocr_test.pdf + +This document compares the outputs of [Docling](https://github.com/docling-project/docling) and the current MarkItDown implementation on the sample `ocr_test.pdf`. 
+ +## Markdown comparison +- Normalized similarity ratio: 1.00 + +```diff +--- docling ++++ markitdown +@@ -1 +1,3 @@ +-Docling bundles PDF document conversion to JSON and Markdown in an easy self contained package ++Docling bundles PDF document conversion to ++JSON and Markdown in an easy self contained ++package +``` + +## Bounding box comparison (first line) +Page size (MarkItDown): 1654 x 2339 px + +| coordinate | Docling (scaled) | MarkItDown | abs diff | norm diff | +|-----------:|------------------:|-----------:|---------:|----------:| +| x1 | 193.63 | 205.00 | 11.37 | 0.0069 | +| y1 | 213.92 | 217.00 | 3.08 | 0.0013 | +| x2 | 1402.98 | 1398.00 | 4.98 | 0.0030 | +| y2 | 424.81 | 268.00 | 156.81 | 0.0670 | + diff --git a/docling_dataset_comparison.md b/docling_dataset_comparison.md new file mode 100644 index 00000000..de7bca64 --- /dev/null +++ b/docling_dataset_comparison.md @@ -0,0 +1,22 @@ +# Docling vs MarkItDown on Docling Test Dataset + +This report compares Docling ground truth outputs (docling_v2) with the current MarkItDown conversion on the PDF and TIFF files from the [docling test data](https://github.com/docling-project/docling/tree/main/tests/data) dataset. For each document, we compute the normalized similarity ratio between the Docling and MarkItDown Markdown outputs, and the absolute and normalized differences between their first-line bounding box coordinates. 
+ +| File | Markdown similarity | Markdown diff (%) | x1 abs | y1 abs | x2 abs | y2 abs | x1 norm | y1 norm | x2 norm | y2 norm | Avg bbox diff (%) | +|------|--------------------:|------------------:|-------:|-------:|-------:|-------:|--------:|--------:|--------:|--------:|------------------:| +| 2203.01017v2 | 0.68 | 32.00 | 0.00 | 1.44 | 0.00 | 0.00 | 0.0000 | 0.0018 | 0.0000 | 0.0000 | 0.04 | +| 2206.01062 | 0.55 | 45.00 | 0.00 | 1.77 | 0.00 | 0.18 | 0.0000 | 0.0022 | 0.0000 | 0.0002 | 0.06 | +| 2305.03393v1-pg9 | 0.78 | 22.00 | 0.00 | 0.16 | 33.04 | 0.74 | 0.0000 | 0.0002 | 0.0540 | 0.0009 | 1.38 | +| 2305.03393v1 | 0.77 | 23.00 | 0.00 | 1.67 | 0.00 | 0.00 | 0.0000 | 0.0021 | 0.0000 | 0.0000 | 0.05 | +| amt_handbook_sample | 0.48 | 52.00 | 44.91 | 658.38 | 438.61 | 656.85 | 0.0756 | 0.8506 | 0.7384 | 0.8486 | 62.83 | +| code_and_formula | 0.67 | 33.00 | 0.00 | 1.72 | 0.00 | 0.03 | 0.0000 | 0.0022 | 0.0000 | 0.0000 | 0.06 | +| multi_page | 0.97 | 3.00 | 0.00 | 1.47 | 0.00 | 0.66 | 0.0000 | 0.0017 | 0.0000 | 0.0008 | 0.06 | +| picture_classification | 0.98 | 2.00 | 0.00 | 1.72 | 0.01 | 0.03 | 0.0000 | 0.0022 | 0.0000 | 0.0000 | 0.06 | +| redp5110_sampled | 0.53 | 47.00 | 250.92 | 724.48 | 320.24 | 714.36 | 0.4100 | 0.9148 | 0.5233 | 0.9020 | 68.75 | +| right_to_left_01 | 0.05 | 95.00 | 63.72 | 1.45 | 0.00 | 0.70 | 0.1041 | 0.0018 | 0.0000 | 0.0009 | 2.67 | +| right_to_left_02 | 0.02 | 98.00 | 23.15 | 594.43 | 378.81 | 595.51 | 0.0389 | 0.7060 | 0.6364 | 0.7073 | 52.22 | +| right_to_left_03 | 0.08 | 92.00 | 419.00 | 48.07 | 238.12 | 51.77 | 0.7038 | 0.0571 | 0.4000 | 0.0615 | 30.56 | +| 2206.01062_tif | N/A | N/A | N/A | N/A | N/A | N/A | N/A | N/A | N/A | N/A | N/A | +| **Overall (avg)** | 0.55 | 45.33 | | | | | | | | | 18.23 | + +Overall, MarkItDown's Markdown output is about **54.7%** similar to the Docling ground truth (45.33% different) across the 12 supported documents. 
Bounding box coordinates diverge by an average of **18.23%**, with right-to-left samples and scanned forms contributing most of the error. diff --git a/funsd_bbox_comparison.md b/funsd_bbox_comparison.md new file mode 100644 index 00000000..7a6359b9 --- /dev/null +++ b/funsd_bbox_comparison.md @@ -0,0 +1,1039 @@ +# FUNSD Bounding Box Comparison + +MarkItDown version: 0.1.2 + +## File 82092117 + +Matched words: 11 / 223 + +| Coord | MAPE (%) | +|---|---:| +| x1 | 1.19 | +| y1 | 86.77 | +| x2 | 0.69 | +| y2 | 75.46 | + +Sample comparisons: + +| word | GT bbox | MD bbox | Δx1% | Δy1% | Δx2% | Δy2% | +|---|---|---|---:|---:|---:|---:| +| DATE: | [102, 406, 147, 423] | [105, 410, 146, 421] | 2.9 | 1.0 | -0.7 | -0.5 | +| 3 | [461, 440, 475, 455] | [465, 444, 471, 456] | 0.9 | 0.9 | -0.8 | 0.2 | +| OFFICE | [207, 84, 246, 98] | [214, 881, 240, 909] | 3.4 | 948.8 | -2.4 | 827.6 | +| Tower | [243, 888, 272, 902] | [246, 891, 273, 900] | 1.2 | 0.3 | 0.4 | -0.2 | +| / | [275, 888, 281, 899] | [277, 891, 279, 899] | 0.7 | 0.3 | -0.7 | 0.0 | + +## File 82200067_0069 + +Matched words: 18 / 167 + +| Coord | MAPE (%) | +|---|---:| +| x1 | 55.00 | +| y1 | 51.84 | +| x2 | 45.69 | +| y2 | 49.51 | + +Sample comparisons: + +| word | GT bbox | MD bbox | Δx1% | Δy1% | Δx2% | Δy2% | +|---|---|---|---:|---:|---:|---:| +| TO: | [75, 140, 93, 153] | [76, 132, 93, 160] | 1.3 | -5.7 | 0.0 | 4.6 | +| FROM: | [75, 168, 109, 181] | [72, 160, 108, 188] | -4.0 | -4.8 | -0.9 | 3.9 | +| DIVISION: | [74, 348, 124, 361] | [77, 350, 124, 359] | 4.1 | 0.6 | 0.0 | -0.6 | +| DIVISION: | [91, 374, 143, 388] | [94, 378, 141, 386] | 3.3 | 1.1 | -1.4 | -0.5 | +| DIVISION: | [92, 404, 142, 415] | [387, 378, 433, 387] | 320.7 | -6.4 | 204.9 | -6.7 | + +## File 82250337_0338 + +Matched words: 7 / 214 + +| Coord | MAPE (%) | +|---|---:| +| x1 | 3.73 | +| y1 | 87.09 | +| x2 | 4.06 | +| y2 | 82.22 | + +Sample comparisons: + +| word | GT bbox | MD bbox | Δx1% | Δy1% | Δx2% | Δy2% | +|---|---|---|---:|---:|---:|---:| 
+| TO: | [94, 200, 114, 214] | [94, 204, 112, 213] | 0.0 | 2.0 | -1.8 | -0.5 | +| FROM: | [92, 219, 128, 232] | [95, 222, 128, 230] | 3.3 | 1.4 | 0.0 | -0.9 | +| DATE: | [95, 236, 129, 250] | [95, 239, 126, 247] | 0.0 | 1.3 | -2.3 | -1.2 | +| Jan. | [553, 268, 575, 279] | [555, 270, 576, 278] | 0.4 | 0.7 | 0.2 | -0.4 | +| Kool | [405, 218, 427, 228] | [461, 868, 482, 877] | 13.8 | 298.2 | 12.9 | 284.6 | + +## File 82251504 + +Matched words: 5 / 222 + +| Coord | MAPE (%) | +|---|---:| +| x1 | 33.71 | +| y1 | 60.54 | +| x2 | 30.12 | +| y2 | 57.75 | + +Sample comparisons: + +| word | GT bbox | MD bbox | Δx1% | Δy1% | Δx2% | Δy2% | +|---|---|---|---:|---:|---:|---:| +| 17 | [362, 304, 379, 315] | [362, 306, 378, 319] | 0.0 | 0.7 | -0.3 | 1.3 | +| Excel | [265, 186, 307, 200] | [509, 340, 540, 350] | 92.1 | 82.8 | 75.9 | 75.0 | +| of | [525, 223, 536, 231] | [487, 599, 499, 610] | -7.2 | 168.6 | -6.9 | 164.1 | +| of | [494, 408, 507, 422] | [155, 614, 167, 625] | -68.6 | 50.5 | -67.1 | 48.1 | +| shields: | [169, 613, 216, 627] | [170, 614, 217, 625] | 0.6 | 0.2 | 0.5 | -0.3 | + +## File 82252956_2958 + +Matched words: 12 / 108 + +| Coord | MAPE (%) | +|---|---:| +| x1 | 64.83 | +| y1 | 102.56 | +| x2 | 55.01 | +| y2 | 95.33 | + +Sample comparisons: + +| word | GT bbox | MD bbox | Δx1% | Δy1% | Δx2% | Δy2% | +|---|---|---|---:|---:|---:|---:| +| REGION: | [35, 292, 89, 313] | [39, 300, 88, 309] | 11.4 | 2.7 | -1.1 | -1.3 | +| DIVISION: | [32, 360, 100, 378] | [39, 367, 94, 376] | 21.9 | 1.9 | -6.0 | -0.5 | +| OLD | [190, 215, 218, 227] | [245, 567, 273, 578] | 28.9 | 163.7 | 25.2 | 154.6 | +| GOLD | [219, 209, 252, 224] | [279, 567, 317, 578] | 27.4 | 171.3 | 25.8 | 158.0 | +| MENTHOL | [251, 210, 317, 225] | [323, 567, 390, 578] | 28.7 | 170.0 | 23.0 | 156.9 | + +## File 82253058_3059 + +Matched words: 12 / 187 + +| Coord | MAPE (%) | +|---|---:| +| x1 | 11.31 | +| y1 | 59.10 | +| x2 | 10.48 | +| y2 | 55.14 | + +Sample comparisons: + +| word | GT bbox | MD bbox | Δx1% | 
Δy1% | Δx2% | Δy2% | +|---|---|---|---:|---:|---:|---:| +| LORILLARD | [384, 63, 445, 76] | [385, 67, 444, 75] | 0.3 | 6.3 | -0.2 | -1.3 | +| TO: | [94, 203, 115, 217] | [95, 204, 114, 215] | 1.1 | 0.5 | -0.9 | -0.9 | +| FROM: | [94, 222, 133, 236] | [95, 224, 133, 234] | 1.1 | 0.9 | 0.0 | -0.8 | +| DATE: | [94, 242, 130, 256] | [95, 244, 129, 254] | 1.1 | 0.8 | -0.8 | -0.8 | +| AUG | [288, 279, 319, 293] | [292, 282, 318, 291] | 1.4 | 1.1 | -0.3 | -0.7 | + +## File 82253245_3247 + +Matched words: 8 / 234 + +| Coord | MAPE (%) | +|---|---:| +| x1 | 22.00 | +| y1 | 37.08 | +| x2 | 20.01 | +| y2 | 34.56 | + +Sample comparisons: + +| word | GT bbox | MD bbox | Δx1% | Δy1% | Δx2% | Δy2% | +|---|---|---|---:|---:|---:|---:| +| TO: | [36, 95, 57, 108] | [37, 86, 55, 114] | 2.8 | -9.5 | -3.5 | 5.6 | +| FROM: | [37, 123, 74, 138] | [41, 116, 71, 144] | 10.8 | -5.7 | -4.1 | 4.3 | +| SUBJECT: | [39, 182, 90, 193] | [41, 174, 93, 202] | 5.1 | -4.4 | 3.3 | 4.7 | +| 150 | [282, 561, 302, 573] | [283, 564, 299, 572] | 0.4 | 0.5 | -1.0 | -0.2 | +| 82 | [284, 742, 298, 753] | [285, 744, 296, 752] | 0.4 | 0.3 | -0.7 | -0.1 | + +## File 82253362_3364 + +Matched words: 9 / 228 + +| Coord | MAPE (%) | +|---|---:| +| x1 | 75.98 | +| y1 | 39.32 | +| x2 | 67.48 | +| y2 | 38.06 | + +Sample comparisons: + +| word | GT bbox | MD bbox | Δx1% | Δy1% | Δx2% | Δy2% | +|---|---|---|---:|---:|---:|---:| +| SUBJECT: | [59, 162, 106, 175] | [60, 156, 103, 184] | 1.7 | -3.7 | -2.8 | 5.1 | +| PARTIAL | [450, 209, 489, 224] | [451, 215, 489, 223] | 0.2 | 2.9 | 0.0 | -0.4 | +| PARTIAL | [448, 261, 490, 274] | [102, 240, 140, 248] | -77.2 | -8.0 | -71.4 | -9.5 | +| 15 | [279, 455, 292, 465] | [279, 669, 293, 677] | 0.0 | 47.0 | 0.3 | 45.6 | +| 27 | [536, 436, 549, 451] | [278, 704, 293, 712] | -48.1 | 61.5 | -46.6 | 57.9 | + +## File 82254765 + +Matched words: 4 / 116 + +| Coord | MAPE (%) | +|---|---:| +| x1 | 58.99 | +| y1 | 3.51 | +| x2 | 52.92 | +| y2 | 1.60 | + +Sample comparisons: + +| word | GT 
bbox | MD bbox | Δx1% | Δy1% | Δx2% | Δy2% | +|---|---|---|---:|---:|---:|---:| +| TO: | [78, 124, 100, 139] | [79, 128, 100, 137] | 1.3 | 3.2 | 0.0 | -1.4 | +| FROM: | [77, 152, 122, 166] | [79, 155, 119, 164] | 2.6 | 2.0 | -2.5 | -1.2 | +| 1/24/97 | [608, 126, 664, 139] | [607, 128, 664, 139] | -0.2 | 1.6 | 0.0 | 0.0 | +| 100 | [166, 496, 184, 507] | [551, 460, 569, 488] | 231.9 | -7.3 | 209.2 | -3.7 | + +## File 82491256 + +Matched words: 4 / 70 + +| Coord | MAPE (%) | +|---|---:| +| x1 | 0.93 | +| y1 | 26.48 | +| x2 | 0.78 | +| y2 | 24.37 | + +Sample comparisons: + +| word | GT bbox | MD bbox | Δx1% | Δy1% | Δx2% | Δy2% | +|---|---|---|---:|---:|---:|---:| +| COURT: | [105, 196, 157, 207] | [104, 188, 153, 216] | -1.0 | -4.1 | -2.5 | 4.3 | +| Asbestos | [233, 320, 279, 335] | [235, 324, 279, 333] | 0.9 | 1.2 | 0.0 | -0.6 | +| LORILLARD | [105, 223, 180, 237] | [107, 446, 179, 455] | 1.9 | 100.0 | -0.6 | 92.0 | +| DATE: | [147, 511, 183, 525] | [147, 514, 183, 522] | 0.0 | 0.6 | 0.0 | -0.6 | + +## File 82504862 + +Matched words: 11 / 63 + +| Coord | MAPE (%) | +|---|---:| +| x1 | 0.59 | +| y1 | 0.64 | +| x2 | 0.85 | +| y2 | 0.32 | + +Sample comparisons: + +| word | GT bbox | MD bbox | Δx1% | Δy1% | Δx2% | Δy2% | +|---|---|---|---:|---:|---:|---:| +| COURT: | [113, 198, 163, 211] | [115, 201, 161, 209] | 1.8 | 1.5 | -1.2 | -0.9 | +| Asbestos | [240, 327, 289, 337] | [242, 327, 286, 336] | 0.8 | 0.0 | -1.0 | -0.3 | +| Chaber, | [296, 366, 337, 377] | [297, 367, 337, 377] | 0.3 | 0.3 | 0.0 | 0.0 | +| Harowitz, | [338, 363, 391, 377] | [340, 367, 389, 377] | 0.6 | 1.1 | -0.5 | 0.0 | +| Smith& | [391, 365, 430, 378] | [393, 367, 432, 376] | 0.5 | 0.5 | 0.5 | -0.5 | + +## File 82562350 + +Matched words: 13 / 190 + +| Coord | MAPE (%) | +|---|---:| +| x1 | 19.44 | +| y1 | 28.83 | +| x2 | 18.69 | +| y2 | 27.91 | + +Sample comparisons: + +| word | GT bbox | MD bbox | Δx1% | Δy1% | Δx2% | Δy2% | +|---|---|---|---:|---:|---:|---:| +| Fax | [118, 249, 238, 309] | [124, 255, 
231, 301] | 5.1 | 2.4 | -2.9 | -2.6 | +| To: | [116, 331, 144, 348] | [121, 337, 142, 346] | 4.3 | 1.8 | -1.4 | -0.6 | +| CC: | [120, 385, 148, 402] | [121, 390, 144, 401] | 0.8 | 1.3 | -2.7 | -0.2 | +| Re: | [118, 414, 146, 432] | [121, 418, 142, 428] | 2.5 | 1.0 | -2.7 | -0.9 | +| Pages: | [364, 415, 411, 430] | [367, 418, 410, 429] | 0.8 | 0.7 | -0.2 | -0.2 | + +## File 82573104 + +Matched words: 5 / 135 + +| Coord | MAPE (%) | +|---|---:| +| x1 | 1.35 | +| y1 | 0.69 | +| x2 | 1.01 | +| y2 | 0.52 | + +Sample comparisons: + +| word | GT bbox | MD bbox | Δx1% | Δy1% | Δx2% | Δy2% | +|---|---|---|---:|---:|---:|---:| +| Date: | [165, 458, 207, 479] | [167, 450, 206, 484] | 1.2 | -1.7 | -0.5 | 1.0 | +| To: | [166, 500, 193, 517] | [169, 502, 191, 515] | 1.8 | 0.4 | -1.0 | -0.4 | +| From: | [163, 539, 209, 559] | [163, 542, 208, 556] | 0.0 | 0.6 | -0.5 | -0.5 | +| Room: | [162, 601, 214, 618] | [164, 602, 210, 615] | 1.2 | 0.2 | -1.9 | -0.5 | +| MESSAGE: | [161, 691, 255, 709] | [165, 695, 252, 710] | 2.5 | 0.6 | -1.2 | 0.1 | + +## File 82837252 + +Matched words: 10 / 112 + +| Coord | MAPE (%) | +|---|---:| +| x1 | 4.72 | +| y1 | 29.50 | +| x2 | 4.57 | +| y2 | 27.77 | + +Sample comparisons: + +| word | GT bbox | MD bbox | Δx1% | Δy1% | Δx2% | Δy2% | +|---|---|---|---:|---:|---:|---:| +| TIME: | [256, 186, 292, 201] | [257, 189, 290, 199] | 0.4 | 1.6 | -0.7 | -1.0 | +| IN | [561, 140, 578, 154] | [422, 239, 435, 247] | -24.8 | 70.7 | -24.7 | 60.4 | +| OF | [148, 230, 168, 243] | [129, 377, 144, 387] | -12.8 | 63.9 | -14.3 | 59.3 | +| INFORMATION: | [92, 246, 186, 259] | [94, 618, 184, 650] | 2.2 | 151.2 | -1.1 | 151.0 | +| SEE | [242, 627, 264, 640] | [243, 630, 263, 638] | 0.4 | 0.5 | -0.4 | -0.3 | + +## File 83443897 + +Matched words: 8 / 194 + +| Coord | MAPE (%) | +|---|---:| +| x1 | 58.79 | +| y1 | 227.82 | +| x2 | 38.21 | +| y2 | 192.69 | + +Sample comparisons: + +| word | GT bbox | MD bbox | Δx1% | Δy1% | Δx2% | Δy2% | +|---|---|---|---:|---:|---:|---:| +| 
DATE: | [91, 223, 127, 238] | [93, 217, 129, 245] | 2.2 | -2.7 | 1.6 | 2.9 | +| COMPANY: | [89, 313, 153, 330] | [92, 317, 152, 327] | 3.4 | 1.3 | -0.7 | -0.9 | +| PHONE: | [91, 366, 134, 384] | [88, 362, 134, 390] | -3.3 | -1.1 | 0.0 | 1.6 | +| FROM: | [88, 402, 129, 417] | [91, 398, 127, 426] | 3.4 | -1.0 | -1.6 | 2.2 | +| PHONE: | [91, 432, 136, 446] | [92, 426, 134, 454] | 1.1 | -1.4 | -1.5 | 1.8 | + +## File 83553333_3334 + +Matched words: 6 / 204 + +| Coord | MAPE (%) | +|---|---:| +| x1 | 17.21 | +| y1 | 110.16 | +| x2 | 17.04 | +| y2 | 102.87 | + +Sample comparisons: + +| word | GT bbox | MD bbox | Δx1% | Δy1% | Δx2% | Δy2% | +|---|---|---|---:|---:|---:|---:| +| Sender | [253, 154, 294, 169] | [255, 157, 292, 167] | 0.8 | 1.9 | -0.7 | -1.2 | +| Reference | [465, 268, 521, 281] | [468, 269, 520, 280] | 0.6 | 0.4 | -0.2 | -0.4 | +| & | [351, 112, 364, 123] | [467, 429, 477, 439] | 33.0 | 283.0 | 31.0 | 256.9 | +| of | [511, 190, 525, 205] | [371, 896, 378, 924] | -27.4 | 371.6 | -28.0 | 350.7 | +| the | [281, 897, 294, 905] | [274, 919, 284, 947] | -2.5 | 2.5 | -3.4 | 4.6 | + +## File 83573282 + +Matched words: 11 / 299 + +| Coord | MAPE (%) | +|---|---:| +| x1 | 25.47 | +| y1 | 51.90 | +| x2 | 23.53 | +| y2 | 50.86 | + +Sample comparisons: + +| word | GT bbox | MD bbox | Δx1% | Δy1% | Δx2% | Δy2% | +|---|---|---|---:|---:|---:|---:| +| Date: | [91, 279, 123, 294] | [92, 282, 119, 292] | 1.1 | 1.1 | -3.3 | -0.7 | +| From: | [92, 310, 130, 324] | [92, 307, 129, 328] | 0.0 | -1.0 | -0.8 | 1.2 | +| Recipient(s): | [94, 437, 168, 451] | [91, 435, 166, 456] | -3.2 | -0.5 | -1.2 | 1.1 | +| Urgent! 
| [274, 779, 316, 796] | [276, 783, 316, 796] | 0.7 | 0.5 | 0.0 | 0.0 | +| Deliver | [321, 779, 364, 794] | [324, 783, 364, 793] | 0.9 | 0.5 | 0.0 | -0.1 | + +## File 83594639 + +Matched words: 5 / 87 + +| Coord | MAPE (%) | +|---|---:| +| x1 | 0.85 | +| y1 | 1.47 | +| x2 | 0.42 | +| y2 | 1.04 | + +Sample comparisons: + +| word | GT bbox | MD bbox | Δx1% | Δy1% | Δx2% | Δy2% | +|---|---|---|---:|---:|---:|---:| +| Date: | [103, 268, 131, 282] | [104, 261, 131, 290] | 1.0 | -2.6 | 0.0 | 2.8 | +| To: | [102, 297, 120, 311] | [103, 300, 120, 311] | 1.0 | 1.0 | 0.0 | 0.0 | +| Company: | [103, 328, 159, 345] | [104, 322, 157, 351] | 1.0 | -1.8 | -1.3 | 1.7 | +| From: | [381, 297, 416, 314] | [384, 301, 414, 312] | 0.8 | 1.3 | -0.5 | -0.6 | +| 3 | [563, 360, 574, 373] | [566, 362, 572, 373] | 0.5 | 0.6 | -0.3 | 0.0 | + +## File 83624198 + +Matched words: 3 / 190 + +| Coord | MAPE (%) | +|---|---:| +| x1 | 14.13 | +| y1 | 169.19 | +| x2 | 13.92 | +| y2 | 155.47 | + +Sample comparisons: + +| word | GT bbox | MD bbox | Δx1% | Δy1% | Δx2% | Δy2% | +|---|---|---|---:|---:|---:|---:| +| TO | [448, 84, 462, 95] | [259, 510, 269, 538] | -42.2 | 507.1 | -41.8 | 466.3 | +| Tel | [392, 933, 406, 944] | [392, 935, 406, 943] | 0.0 | 0.2 | 0.0 | -0.1 | +| Fax | [482, 933, 499, 944] | [483, 935, 499, 944] | 0.2 | 0.2 | 0.0 | 0.0 | + +## File 83635935 + +Matched words: 10 / 141 + +| Coord | MAPE (%) | +|---|---:| +| x1 | 42.63 | +| y1 | 116.99 | +| x2 | 35.63 | +| y2 | 107.65 | + +Sample comparisons: + +| word | GT bbox | MD bbox | Δx1% | Δy1% | Δx2% | Δy2% | +|---|---|---|---:|---:|---:|---:| +| DATE: | [85, 366, 130, 383] | [87, 369, 129, 380] | 2.4 | 0.8 | -0.8 | -0.8 | +| TO: | [91, 399, 116, 414] | [90, 393, 117, 421] | -1.1 | -1.5 | 0.9 | 1.7 | +| FROM: | [84, 432, 134, 447] | [87, 434, 132, 445] | 3.6 | 0.5 | -1.5 | -0.4 | +| 6 | [605, 77, 612, 90] | [493, 466, 500, 477] | -18.5 | 505.2 | -18.3 | 430.0 | +| (212) | [374, 204, 399, 215] | [265, 498, 293, 510] | -29.1 | 144.1 | 
-26.6 | 137.2 | + +## File 83641919_1921 + +Matched words: 1 / 209 + +| Coord | MAPE (%) | +|---|---:| +| x1 | 0.68 | +| y1 | 74.44 | +| x2 | 0.00 | +| y2 | 72.22 | + +Sample comparisons: + +| word | GT bbox | MD bbox | Δx1% | Δy1% | Δx2% | Δy2% | +|---|---|---|---:|---:|---:|---:| +| 31 | [292, 493, 303, 504] | [294, 860, 303, 868] | 0.7 | 74.4 | 0.0 | 72.2 | + +## File 83772145 + +Matched words: 6 / 236 + +| Coord | MAPE (%) | +|---|---:| +| x1 | 34.98 | +| y1 | 49.54 | +| x2 | 28.51 | +| y2 | 46.84 | + +Sample comparisons: + +| word | GT bbox | MD bbox | Δx1% | Δy1% | Δx2% | Δy2% | +|---|---|---|---:|---:|---:|---:| +| TIME: | [420, 296, 463, 314] | [423, 289, 460, 321] | 0.7 | -2.4 | -0.6 | 2.2 | +| OPERATOR: | [110, 503, 191, 524] | [108, 493, 189, 529] | -1.8 | -2.0 | -1.0 | 1.0 | +| DOCUMENT | [232, 404, 317, 422] | [418, 654, 467, 686] | 80.2 | 61.9 | 47.3 | 62.6 | +| JIM | [163, 263, 192, 280] | [100, 766, 120, 777] | -38.7 | 191.3 | -37.5 | 177.5 | +| a | [349, 615, 362, 626] | [568, 850, 577, 857] | 62.8 | 38.2 | 59.4 | 36.9 | + +## File 83823750 + +Matched words: 9 / 162 + +| Coord | MAPE (%) | +|---|---:| +| x1 | 84.82 | +| y1 | 86.75 | +| x2 | 70.72 | +| y2 | 82.44 | + +Sample comparisons: + +| word | GT bbox | MD bbox | Δx1% | Δy1% | Δx2% | Δy2% | +|---|---|---|---:|---:|---:|---:| +| To | [258, 158, 273, 171] | [260, 160, 273, 170] | 0.8 | 1.3 | 0.0 | -0.6 | +| 2 | [455, 335, 465, 352] | [455, 339, 463, 350] | 0.0 | 1.2 | -0.4 | -0.6 | +| FAX | [179, 77, 199, 88] | [485, 360, 502, 382] | 170.9 | 367.5 | 152.3 | 334.1 | +| Fax | [60, 158, 80, 171] | [298, 411, 315, 433] | 396.7 | 160.1 | 293.8 | 153.2 | +| Number | [281, 263, 322, 274] | [322, 411, 361, 433] | 14.6 | 56.3 | 12.1 | 58.0 | + +## File 83996357 + +Matched words: 6 / 98 + +| Coord | MAPE (%) | +|---|---:| +| x1 | 3.57 | +| y1 | 106.92 | +| x2 | 4.02 | +| y2 | 93.67 | + +Sample comparisons: + +| word | GT bbox | MD bbox | Δx1% | Δy1% | Δx2% | Δy2% | +|---|---|---|---:|---:|---:|---:| +| 
TO: | [73, 147, 93, 160] | [74, 149, 92, 159] | 1.4 | 1.4 | -1.1 | -0.6 | +| FROM: | [73, 176, 109, 191] | [75, 179, 109, 189] | 2.7 | 1.7 | 0.0 | -1.0 | +| REQUIREMENTS | [142, 779, 229, 794] | [143, 772, 228, 802] | 0.7 | -0.9 | -0.4 | 1.0 | +| CODE | [124, 115, 155, 130] | [143, 847, 174, 857] | 15.3 | 636.5 | 12.3 | 559.2 | +| ASSIGNED | [176, 844, 232, 858] | [177, 838, 256, 857] | 0.6 | -0.7 | 10.3 | -0.1 | + +## File 85201976 + +Matched words: 6 / 80 + +| Coord | MAPE (%) | +|---|---:| +| x1 | 37.10 | +| y1 | 101.31 | +| x2 | 31.70 | +| y2 | 93.25 | + +Sample comparisons: + +| word | GT bbox | MD bbox | Δx1% | Δy1% | Δx2% | Δy2% | +|---|---|---|---:|---:|---:|---:| +| NAME | [149, 292, 188, 305] | [151, 295, 185, 303] | 1.3 | 1.0 | -1.6 | -0.7 | +| AGE | [149, 492, 178, 506] | [152, 495, 176, 504] | 2.0 | 0.6 | -1.1 | -0.4 | +| SMOKING | [373, 779, 433, 796] | [376, 786, 435, 795] | 0.8 | 0.9 | 0.5 | -0.1 | +| INFORMATION | [334, 189, 436, 206] | [170, 871, 264, 880] | -49.1 | 360.8 | -39.4 | 327.2 | +| TO | [149, 253, 169, 266] | [401, 871, 418, 880] | 169.1 | 244.3 | 147.3 | 230.8 | + +## File 85240939 + +Matched words: 8 / 154 + +| Coord | MAPE (%) | +|---|---:| +| x1 | 34.18 | +| y1 | 102.03 | +| x2 | 30.76 | +| y2 | 86.48 | + +Sample comparisons: + +| word | GT bbox | MD bbox | Δx1% | Δy1% | Δx2% | Δy2% | +|---|---|---|---:|---:|---:|---:| +| COMPANY: | [85, 275, 160, 295] | [88, 278, 159, 292] | 3.5 | 1.1 | -0.6 | -1.0 | +| THE | [295, 64, 327, 79] | [530, 421, 555, 433] | 79.7 | 557.8 | 69.7 | 448.1 | +| Please | [229, 406, 288, 426] | [82, 594, 135, 607] | -64.2 | 46.3 | -53.1 | 42.5 | +| a | [370, 412, 387, 427] | [211, 601, 224, 617] | -43.0 | 45.9 | -42.1 | 44.5 | +| at | [499, 419, 523, 434] | [290, 760, 307, 771] | -41.9 | 81.4 | -41.3 | 77.6 | + +## File 85540866 + +Matched words: 3 / 25 + +| Coord | MAPE (%) | +|---|---:| +| x1 | 4.38 | +| y1 | 1.34 | +| x2 | 4.62 | +| y2 | 1.43 | + +Sample comparisons: + +| word | GT bbox | MD bbox | Δx1% | 
Δy1% | Δx2% | Δy2% | +|---|---|---|---:|---:|---:|---:| +| (Name) | [60, 189, 94, 204] | [56, 191, 99, 203] | -6.7 | 1.1 | 5.3 | -0.5 | +| (Date) | [59, 225, 86, 236] | [56, 219, 92, 245] | -5.1 | -2.7 | 7.0 | 3.8 | +| (Name) | [218, 968, 254, 982] | [215, 971, 258, 982] | -1.4 | 0.3 | 1.6 | 0.0 | + +## File 85629964 + +Matched words: 6 / 94 + +| Coord | MAPE (%) | +|---|---:| +| x1 | 11.07 | +| y1 | 53.75 | +| x2 | 12.55 | +| y2 | 47.59 | + +Sample comparisons: + +| word | GT bbox | MD bbox | Δx1% | Δy1% | Δx2% | Δy2% | +|---|---|---|---:|---:|---:|---:| +| YEAR: | [110, 142, 156, 162] | [115, 148, 197, 163] | 4.5 | 4.2 | 26.3 | 0.6 | +| OVERWRAP: | [416, 297, 501, 315] | [423, 293, 501, 322] | 1.7 | -1.3 | 0.0 | 2.2 | +| NICOTINE: | [282, 385, 364, 406] | [286, 382, 364, 411] | 1.4 | -0.8 | 0.0 | 1.2 | +| (SEE | [552, 208, 587, 228] | [448, 497, 480, 509] | -18.8 | 138.9 | -18.2 | 123.2 | +| EXPLANATION) | [595, 208, 701, 229] | [491, 498, 596, 511] | -17.5 | 139.4 | -15.0 | 123.1 | + +## File 86075409_5410 + +Matched words: 5 / 125 + +| Coord | MAPE (%) | +|---|---:| +| x1 | 18.67 | +| y1 | 65.12 | +| x2 | 17.51 | +| y2 | 60.42 | + +Sample comparisons: + +| word | GT bbox | MD bbox | Δx1% | Δy1% | Δx2% | Δy2% | +|---|---|---|---:|---:|---:|---:| +| From: | [56, 122, 85, 137] | [57, 124, 83, 133] | 1.8 | 1.6 | -2.4 | -2.9 | +| To: | [57, 149, 72, 162] | [57, 152, 69, 160] | 0.0 | 2.0 | -4.2 | -1.2 | +| CC: | [56, 175, 76, 190] | [57, 179, 74, 187] | 1.8 | 2.3 | -2.6 | -1.6 | +| & | [317, 454, 325, 468] | [454, 220, 460, 251] | 43.2 | -51.5 | 41.5 | -46.4 | +| Applicable | [189, 229, 236, 244] | [101, 843, 149, 854] | -46.6 | 268.1 | -36.9 | 250.0 | + +## File 86079776_9777 + +Matched words: 7 / 210 + +| Coord | MAPE (%) | +|---|---:| +| x1 | 45.34 | +| y1 | 67.94 | +| x2 | 28.67 | +| y2 | 63.54 | + +Sample comparisons: + +| word | GT bbox | MD bbox | Δx1% | Δy1% | Δx2% | Δy2% | +|---|---|---|---:|---:|---:|---:| +| SPECIFICS | [95, 154, 183, 171] | [97, 156, 181, 
170] | 2.1 | 1.3 | -1.1 | -0.6 | +| Execution: | [223, 264, 291, 281] | [220, 260, 291, 287] | -1.3 | -1.5 | 0.0 | 2.1 | +| Media: | [387, 260, 441, 278] | [390, 258, 445, 286] | 0.8 | -0.8 | 0.9 | 2.9 | +| OBJECTIVE | [89, 300, 185, 314] | [93, 301, 184, 315] | 4.5 | 0.3 | -0.5 | 0.3 | +| TARGET | [103, 439, 171, 454] | [180, 712, 204, 740] | 74.8 | 62.2 | 19.3 | 63.0 | + +## File 86220490 + +Matched words: 5 / 75 + +| Coord | MAPE (%) | +|---|---:| +| x1 | 41.24 | +| y1 | 51.58 | +| x2 | 34.88 | +| y2 | 47.63 | + +Sample comparisons: + +| word | GT bbox | MD bbox | Δx1% | Δy1% | Δx2% | Δy2% | +|---|---|---|---:|---:|---:|---:| +| Mike, | [117, 560, 151, 575] | [119, 562, 151, 574] | 1.7 | 0.4 | 0.0 | -0.2 | +| Market | [198, 205, 245, 220] | [468, 695, 511, 705] | 136.4 | 239.0 | 108.6 | 220.5 | +| to | [448, 649, 461, 660] | [148, 765, 160, 774] | -67.0 | 17.9 | -65.3 | 17.3 | +| you | [163, 763, 187, 778] | [164, 766, 186, 777] | 0.6 | 0.4 | -0.5 | -0.1 | +| soon! | [190, 761, 225, 775] | [191, 763, 225, 774] | 0.5 | 0.3 | 0.0 | -0.1 | + +## File 86230203_0206 + +Matched words: 9 / 134 + +| Coord | MAPE (%) | +|---|---:| +| x1 | 1.74 | +| y1 | 5.04 | +| x2 | 1.37 | +| y2 | 4.59 | + +Sample comparisons: + +| word | GT bbox | MD bbox | Δx1% | Δy1% | Δx2% | Δy2% | +|---|---|---|---:|---:|---:|---:| +| FROM: | [61, 171, 100, 181] | [65, 172, 97, 180] | 6.6 | 0.6 | -3.0 | -0.6 | +| REGION: | [66, 331, 117, 344] | [67, 334, 114, 342] | 1.5 | 0.9 | -2.6 | -0.6 | +| FULL | [226, 328, 257, 340] | [224, 393, 250, 403] | -0.9 | 19.8 | -2.7 | 18.5 | +| PARTIAL | [367, 326, 412, 338] | [372, 390, 412, 398] | 1.4 | 19.6 | 0.0 | 17.8 | +| DISTRIBUTION | [106, 533, 194, 543] | [110, 535, 190, 544] | 3.8 | 0.4 | -2.1 | 0.2 | + +## File 86236474_6476 + +Matched words: 23 / 118 + +| Coord | MAPE (%) | +|---|---:| +| x1 | 115.58 | +| y1 | 23.67 | +| x2 | 57.39 | +| y2 | 21.75 | + +Sample comparisons: + +| word | GT bbox | MD bbox | Δx1% | Δy1% | Δx2% | Δy2% | 
+|---|---|---|---:|---:|---:|---:| +| TO: | [47, 102, 74, 120] | [51, 99, 73, 127] | 8.5 | -2.9 | -1.4 | 5.8 | +| FROM: | [47, 141, 96, 159] | [52, 143, 94, 155] | 10.6 | 1.4 | -2.1 | -2.5 | +| Ryan | [158, 138, 196, 158] | [161, 144, 192, 159] | 1.9 | 4.3 | -2.0 | 0.6 | +| OCT.7 | [511, 175, 561, 192] | [512, 180, 557, 191] | 0.2 | 2.9 | -0.7 | -0.5 | +| NEWPORT | [169, 212, 249, 232] | [172, 218, 247, 229] | 1.8 | 2.8 | -0.8 | -1.3 | + +## File 86244113 + +Matched words: 5 / 80 + +| Coord | MAPE (%) | +|---|---:| +| x1 | 3.59 | +| y1 | 1.79 | +| x2 | 2.28 | +| y2 | 1.64 | + +Sample comparisons: + +| word | GT bbox | MD bbox | Δx1% | Δy1% | Δx2% | Δy2% | +|---|---|---|---:|---:|---:|---:| +| DIVISION: | [39, 141, 92, 156] | [41, 136, 96, 164] | 5.1 | -3.5 | 4.3 | 5.1 | +| DATES: | [492, 142, 535, 155] | [493, 145, 533, 154] | 0.2 | 2.1 | -0.4 | -0.6 | +| BRAND | [39, 342, 80, 355] | [41, 334, 79, 362] | 5.1 | -2.3 | -1.2 | 2.0 | +| NOTE: | [39, 588, 80, 599] | [41, 591, 77, 600] | 5.1 | 0.5 | -3.8 | 0.2 | +| PLAN: | [85, 711, 120, 725] | [87, 714, 118, 723] | 2.4 | 0.4 | -1.7 | -0.3 | + +## File 86263525 + +Matched words: 6 / 90 + +| Coord | MAPE (%) | +|---|---:| +| x1 | 0.43 | +| y1 | 0.70 | +| x2 | 0.43 | +| y2 | 0.94 | + +Sample comparisons: + +| word | GT bbox | MD bbox | Δx1% | Δy1% | Δx2% | Δy2% | +|---|---|---|---:|---:|---:|---:| +| DATE | [466, 288, 507, 306] | [469, 290, 506, 300] | 0.6 | 0.7 | -0.2 | -2.0 | +| DATE | [469, 392, 510, 410] | [469, 396, 506, 406] | 0.0 | 1.0 | -0.8 | -1.0 | +| DATE | [469, 499, 507, 514] | [470, 501, 506, 511] | 0.2 | 0.4 | -0.2 | -0.6 | +| DATE | [468, 602, 509, 623] | [470, 608, 506, 617] | 0.4 | 1.0 | -0.6 | -1.0 | +| DATE | [466, 708, 507, 728] | [470, 713, 506, 723] | 0.9 | 0.7 | -0.2 | -0.7 | + +## File 86328049_8050 + +Matched words: 3 / 199 + +| Coord | MAPE (%) | +|---|---:| +| x1 | 15.90 | +| y1 | 6.77 | +| x2 | 13.45 | +| y2 | 6.20 | + +Sample comparisons: + +| word | GT bbox | MD bbox | Δx1% | Δy1% | Δx2% | 
Δy2% | +|---|---|---|---:|---:|---:|---:| +| Yes | [404, 860, 431, 874] | [407, 864, 429, 875] | 0.7 | 0.5 | -0.5 | 0.1 | +| Telecopier | [359, 219, 423, 234] | [527, 178, 591, 192] | 46.8 | -18.7 | 39.7 | -17.9 | +| Nos. | [594, 177, 622, 191] | [595, 179, 621, 190] | 0.2 | 1.1 | -0.2 | -0.5 | + +## File 87086073 + +Matched words: 8 / 56 + +| Coord | MAPE (%) | +|---|---:| +| x1 | 40.34 | +| y1 | 85.72 | +| x2 | 38.20 | +| y2 | 81.93 | + +Sample comparisons: + +| word | GT bbox | MD bbox | Δx1% | Δy1% | Δx2% | Δy2% | +|---|---|---|---:|---:|---:|---:| +| D. | [490, 602, 503, 617] | [139, 148, 150, 157] | -71.6 | -75.4 | -70.2 | -74.6 | +| R. | [469, 687, 484, 701] | [230, 147, 241, 156] | -51.0 | -78.6 | -50.2 | -77.7 | +| F. | [490, 686, 505, 700] | [251, 147, 262, 156] | -48.8 | -78.6 | -48.1 | -77.7 | +| Dufresne | [513, 687, 572, 700] | [271, 146, 326, 156] | -47.2 | -78.7 | -43.0 | -77.7 | +| Vanitrope | [115, 186, 182, 204] | [118, 189, 180, 201] | 2.6 | 1.6 | -1.1 | -1.5 | + +## File 87093315_87093318 + +Matched words: 15 / 194 + +| Coord | MAPE (%) | +|---|---:| +| x1 | 53.25 | +| y1 | 5.80 | +| x2 | 19.49 | +| y2 | 4.74 | + +Sample comparisons: + +| word | GT bbox | MD bbox | Δx1% | Δy1% | Δx2% | Δy2% | +|---|---|---|---:|---:|---:|---:| +| Date: | [476, 32, 513, 46] | [477, 36, 511, 45] | 0.2 | 12.5 | -0.4 | -2.2 | +| on | [299, 99, 315, 110] | [300, 102, 315, 109] | 0.3 | 3.0 | 0.0 | -0.9 | +| RECASING | [271, 166, 331, 183] | [268, 159, 333, 189] | -1.1 | -4.2 | 0.6 | 3.3 | +| Filters | [319, 211, 374, 223] | [321, 212, 372, 226] | 0.6 | 0.5 | -0.5 | 1.3 | +| Length | [72, 261, 118, 275] | [123, 283, 166, 296] | 70.8 | 8.4 | 40.7 | 7.6 | + +## File 87125460 + +Matched words: 2 / 72 + +| Coord | MAPE (%) | +|---|---:| +| x1 | 5.12 | +| y1 | 6.74 | +| x2 | 2.54 | +| y2 | 6.05 | + +Sample comparisons: + +| word | GT bbox | MD bbox | Δx1% | Δy1% | Δx2% | Δy2% | +|---|---|---|---:|---:|---:|---:| +| APPROVALS | [77, 683, 166, 703] | [82, 688, 163, 698] | 
6.5 | 0.7 | -1.8 | -0.7 | +| DATE | [80, 730, 122, 747] | [83, 823, 118, 832] | 3.8 | 12.7 | -3.3 | 11.4 | + +## File 87137840 + +Matched words: 5 / 76 + +| Coord | MAPE (%) | +|---|---:| +| x1 | 15.58 | +| y1 | 7.67 | +| x2 | 10.84 | +| y2 | 7.20 | + +Sample comparisons: + +| word | GT bbox | MD bbox | Δx1% | Δy1% | Δx2% | Δy2% | +|---|---|---|---:|---:|---:|---:| +| DATE: | [124, 761, 163, 776] | [215, 503, 250, 512] | 73.4 | -33.9 | 53.4 | -34.0 | +| HANDLING | [189, 545, 252, 559] | [195, 531, 253, 562] | 3.2 | -2.6 | 0.4 | 0.5 | +| Store | [372, 538, 408, 552] | [373, 542, 407, 549] | 0.3 | 0.7 | -0.2 | -0.5 | +| refrigerated | [415, 538, 500, 553] | [417, 539, 500, 550] | 0.5 | 0.2 | 0.0 | -0.5 | +| in | [506, 536, 523, 549] | [509, 541, 522, 547] | 0.6 | 0.9 | -0.2 | -0.4 | + +## File 87147607 + +Matched words: 8 / 204 + +| Coord | MAPE (%) | +|---|---:| +| x1 | 30.78 | +| y1 | 206.08 | +| x2 | 28.15 | +| y2 | 180.57 | + +Sample comparisons: + +| word | GT bbox | MD bbox | Δx1% | Δy1% | Δx2% | Δy2% | +|---|---|---|---:|---:|---:|---:| +| DESCRIPTION | [373, 200, 441, 214] | [375, 192, 438, 220] | 0.5 | -4.0 | -0.7 | 2.8 | +| UNIT | [641, 201, 668, 214] | [644, 204, 667, 212] | 0.5 | 1.5 | -0.1 | -0.9 | +| PRICE | [670, 203, 701, 216] | [671, 204, 699, 213] | 0.1 | 0.5 | -0.3 | -1.4 | +| FOR | [207, 75, 228, 83] | [401, 288, 424, 298] | 93.7 | 284.0 | 86.0 | 259.0 | +| OF | [349, 61, 366, 71] | [604, 378, 620, 387] | 73.1 | 519.7 | 69.4 | 445.1 | + +## File 87332450 + +Matched words: 11 / 158 + +| Coord | MAPE (%) | +|---|---:| +| x1 | 0.87 | +| y1 | 0.65 | +| x2 | 0.55 | +| y2 | 0.64 | + +Sample comparisons: + +| word | GT bbox | MD bbox | Δx1% | Δy1% | Δx2% | Δy2% | +|---|---|---|---:|---:|---:|---:| +| Name: | [108, 203, 149, 217] | [111, 204, 144, 214] | 2.8 | 0.5 | -3.4 | -1.4 | +| Institution: | [112, 244, 171, 259] | [112, 247, 170, 257] | 0.0 | 1.2 | -0.6 | -0.8 | +| Telephone: | [110, 363, 171, 380] | [113, 358, 171, 387] | 2.7 | -1.4 | 0.0 | 1.8 | 
+| TOTAL | [472, 561, 517, 575] | [474, 565, 514, 575] | 0.4 | 0.7 | -0.6 | 0.0 | +| DATE | [191, 655, 229, 669] | [189, 649, 229, 677] | -1.0 | -0.9 | 0.0 | 1.2 | + +## File 87428306 + +Matched words: 12 / 187 + +| Coord | MAPE (%) | +|---|---:| +| x1 | 48.66 | +| y1 | 90.75 | +| x2 | 42.04 | +| y2 | 84.48 | + +Sample comparisons: + +| word | GT bbox | MD bbox | Δx1% | Δy1% | Δx2% | Δy2% | +|---|---|---|---:|---:|---:|---:| +| A221 | [127, 162, 158, 173] | [356, 223, 382, 233] | 180.3 | 37.7 | 141.8 | 34.7 | +| NUMBER | [144, 152, 175, 160] | [363, 264, 403, 273] | 152.1 | 73.7 | 130.3 | 70.6 | +| A | [128, 190, 140, 202] | [204, 443, 210, 450] | 59.4 | 133.2 | 50.0 | 122.8 | +| at | [266, 189, 281, 201] | [362, 441, 374, 449] | 36.1 | 133.3 | 33.1 | 123.4 | +| room | [286, 190, 319, 201] | [150, 457, 176, 463] | -47.6 | 140.5 | -44.8 | 130.3 | + +## File 87528321 + +Matched words: 11 / 198 + +| Coord | MAPE (%) | +|---|---:| +| x1 | 33.80 | +| y1 | 866.86 | +| x2 | 22.21 | +| y2 | 416.52 | + +Sample comparisons: + +| word | GT bbox | MD bbox | Δx1% | Δy1% | Δx2% | Δy2% | +|---|---|---|---:|---:|---:|---:| +| Position | [29, 87, 95, 101] | [32, 91, 93, 101] | 10.3 | 4.6 | -2.1 | 0.0 | +| Change | [92, 180, 141, 191] | [95, 182, 140, 194] | 3.3 | 1.1 | -0.7 | 1.6 | +| of | [69, 180, 89, 192] | [125, 272, 140, 282] | 81.2 | 51.1 | 57.3 | 46.9 | +| Change | [75, 242, 127, 252] | [218, 347, 265, 359] | 190.7 | 43.4 | 108.7 | 42.5 | +| Glass | [185, 162, 225, 174] | [273, 347, 312, 359] | 47.6 | 114.2 | 38.7 | 106.3 | + +## File 87528380 + +Matched words: 17 / 411 + +| Coord | MAPE (%) | +|---|---:| +| x1 | 8.85 | +| y1 | 75.92 | +| x2 | 6.82 | +| y2 | 65.49 | + +Sample comparisons: + +| word | GT bbox | MD bbox | Δx1% | Δy1% | Δx2% | Δy2% | +|---|---|---|---:|---:|---:|---:| +| 10675 | [612, 59, 678, 81] | [615, 63, 676, 80] | 0.5 | 6.8 | -0.3 | -1.2 | +| PROPOSAL | [300, 186, 440, 204] | [304, 190, 438, 201] | 1.3 | 2.2 | -0.5 | -1.5 | +| TO | [34, 214, 52, 227] | 
[37, 219, 49, 226] | 8.8 | 2.3 | -5.8 | -0.4 | +| MATERIAL: | [47, 358, 120, 372] | [49, 359, 113, 370] | 4.3 | 0.3 | -5.8 | -0.5 | +| ALUMINUM | [232, 358, 300, 372] | [234, 359, 299, 370] | 0.9 | 0.3 | -0.3 | -0.5 | + +## File 87594142_87594144 + +Matched words: 12 / 433 + +| Coord | MAPE (%) | +|---|---:| +| x1 | 18.44 | +| y1 | 187.69 | +| x2 | 18.84 | +| y2 | 168.73 | + +Sample comparisons: + +| word | GT bbox | MD bbox | Δx1% | Δy1% | Δx2% | Δy2% | +|---|---|---|---:|---:|---:|---:| +| ADDRESS | [338, 367, 379, 381] | [419, 156, 457, 184] | 24.0 | -57.5 | 20.6 | -51.7 | +| Baltimore | [105, 384, 173, 399] | [109, 386, 171, 395] | 3.8 | 0.5 | -1.2 | -1.0 | +| (A) | [88, 440, 106, 457] | [88, 710, 100, 718] | 0.0 | 61.4 | -5.7 | 57.1 | +| FOR | [244, 112, 290, 136] | [152, 793, 167, 799] | -37.7 | 608.0 | -42.4 | 487.5 | +| INVESTIGATOR | [162, 163, 242, 177] | [131, 877, 189, 883] | -19.1 | 438.0 | -21.9 | 398.9 | + +## File 89856243 + +Matched words: 10 / 311 + +| Coord | MAPE (%) | +|---|---:| +| x1 | 1.75 | +| y1 | 253.13 | +| x2 | 1.60 | +| y2 | 164.27 | + +Sample comparisons: + +| word | GT bbox | MD bbox | Δx1% | Δy1% | Δx2% | Δy2% | +|---|---|---|---:|---:|---:|---:| +| PRODUCT | [104, 116, 157, 130] | [107, 108, 159, 137] | 2.9 | -6.9 | 1.3 | 5.4 | +| Date | [559, 24, 589, 38] | [617, 628, 642, 658] | 10.4 | 2516.7 | 9.0 | 1631.6 | +| yes | [260, 654, 280, 667] | [260, 673, 281, 681] | 0.0 | 2.9 | 0.4 | 2.1 | +| no | [314, 657, 333, 667] | [317, 673, 331, 680] | 1.0 | 2.4 | -0.6 | 1.9 | +| Approved | [106, 725, 165, 738] | [108, 729, 161, 740] | 1.9 | 0.6 | -2.4 | 0.3 | + +## File 91814768_91814769 + +Matched words: 9 / 354 + +| Coord | MAPE (%) | +|---|---:| +| x1 | 26.94 | +| y1 | 200.70 | +| x2 | 25.12 | +| y2 | 141.37 | + +Sample comparisons: + +| word | GT bbox | MD bbox | Δx1% | Δy1% | Δx2% | Δy2% | +|---|---|---|---:|---:|---:|---:| +| November | [204, 409, 277, 424] | [206, 412, 277, 423] | 1.0 | 0.7 | 0.0 | -0.2 | +| 3, | [285, 408, 302, 426] 
| [288, 411, 302, 424] | 1.1 | 0.7 | 0.0 | -0.5 | +| ballot | [697, 408, 732, 422] | [368, 456, 398, 466] | -47.2 | 11.8 | -45.6 | 10.4 | +| Address | [444, 679, 490, 696] | [571, 536, 644, 546] | 28.6 | -21.1 | 31.4 | -21.6 | +| Date | [480, 616, 509, 631] | [482, 618, 506, 628] | 0.4 | 0.3 | -0.6 | -0.5 | + +## File 92380595 + +Matched words: 13 / 270 + +| Coord | MAPE (%) | +|---|---:| +| x1 | 17.98 | +| y1 | 56.63 | +| x2 | 16.25 | +| y2 | 51.68 | + +Sample comparisons: + +| word | GT bbox | MD bbox | Δx1% | Δy1% | Δx2% | Δy2% | +|---|---|---|---:|---:|---:|---:| +| RECEIVED | [293, 152, 398, 176] | [296, 155, 398, 173] | 1.0 | 2.0 | 0.0 | -1.7 | +| TO: | [28, 183, 50, 197] | [29, 185, 49, 195] | 3.6 | 1.1 | -2.0 | -1.0 | +| x | [546, 240, 556, 255] | [548, 243, 555, 252] | 0.4 | 1.2 | -0.2 | -1.2 | +| via: | [433, 378, 453, 389] | [434, 381, 449, 388] | 0.2 | 0.8 | -0.9 | -0.3 | +| vs. | [31, 591, 49, 602] | [33, 593, 45, 600] | 6.5 | 0.3 | -8.2 | -0.3 | + +## File 93106788 + +Matched words: 10 / 310 + +| Coord | MAPE (%) | +|---|---:| +| x1 | 43.25 | +| y1 | 284.48 | +| x2 | 36.40 | +| y2 | 259.12 | + +Sample comparisons: + +| word | GT bbox | MD bbox | Δx1% | Δy1% | Δx2% | Δy2% | +|---|---|---|---:|---:|---:|---:| +| TO: | [99, 196, 114, 206] | [100, 186, 112, 214] | 1.0 | -5.1 | -1.8 | 3.9 | +| DATE: | [496, 191, 523, 202] | [497, 194, 519, 201] | 0.2 | 1.6 | -0.8 | -0.5 | +| ADVERTISER: | [464, 223, 523, 234] | [464, 226, 519, 233] | 0.0 | 1.3 | -0.8 | -0.4 | +| PRODUCT: | [475, 256, 522, 267] | [476, 257, 519, 265] | 0.2 | 0.4 | -0.6 | -0.7 | +| LORILLARD | [536, 223, 590, 233] | [450, 724, 575, 733] | -16.0 | 224.7 | -2.5 | 214.6 | + diff --git a/packages/markitdown/pyproject.toml b/packages/markitdown/pyproject.toml index afb5d319..52557523 100644 --- a/packages/markitdown/pyproject.toml +++ b/packages/markitdown/pyproject.toml @@ -47,7 +47,11 @@ all = [ "SpeechRecognition", "youtube-transcript-api~=1.0.0", "azure-ai-documentintelligence", - 
"azure-identity" + "azure-identity", + "pdfplumber", + "pytesseract", + "Pillow", + "jsonschema", ] pptx = ["python-pptx"] docx = ["mammoth", "lxml"] @@ -58,6 +62,7 @@ outlook = ["olefile"] audio-transcription = ["pydub", "SpeechRecognition"] youtube-transcription = ["youtube-transcript-api"] az-doc-intel = ["azure-ai-documentintelligence", "azure-identity"] +bbox = ["pdfplumber", "pytesseract", "Pillow", "jsonschema"] [project.urls] Documentation = "https://github.com/microsoft/markitdown#readme" diff --git a/packages/markitdown/src/markitdown/__init__.py b/packages/markitdown/src/markitdown/__init__.py index af356dd6..3616d784 100644 --- a/packages/markitdown/src/markitdown/__init__.py +++ b/packages/markitdown/src/markitdown/__init__.py @@ -9,6 +9,7 @@ PRIORITY_GENERIC_FILE_FORMAT, ) from ._base_converter import DocumentConverterResult, DocumentConverter +from .bbox import BBoxDoc from ._stream_info import StreamInfo from ._exceptions import ( MarkItDownException, @@ -23,6 +24,7 @@ "MarkItDown", "DocumentConverter", "DocumentConverterResult", + "BBoxDoc", "MarkItDownException", "MissingDependencyException", "FailedConversionAttempt", diff --git a/packages/markitdown/src/markitdown/__main__.py b/packages/markitdown/src/markitdown/__main__.py index 6085ad6b..b8dd93fa 100644 --- a/packages/markitdown/src/markitdown/__main__.py +++ b/packages/markitdown/src/markitdown/__main__.py @@ -4,6 +4,7 @@ import argparse import sys import codecs +import os from textwrap import dedent from importlib.metadata import entry_points from .__about__ import __version__ @@ -110,6 +111,16 @@ def main(): help="Keep data URIs (like base64-encoded images) in the output. 
By default, data URIs are truncated.", ) + parser.add_argument( + "--emit-bbox", + action="store_true", + help="Emit sidecar JSON with page/line/word bounding boxes for PDFs and images.", + ) + parser.add_argument( + "--ocr-lang", + help="Override MARKITDOWN_OCR_LANG for OCR (default 'eng').", + ) + parser.add_argument("filename", nargs="?") args = parser.parse_args() @@ -191,10 +202,16 @@ def main(): sys.stdin.buffer, stream_info=stream_info, keep_data_uris=args.keep_data_uris, + emit_bbox=args.emit_bbox, + ocr_lang=args.ocr_lang, ) else: result = markitdown.convert( - args.filename, stream_info=stream_info, keep_data_uris=args.keep_data_uris + args.filename, + stream_info=stream_info, + keep_data_uris=args.keep_data_uris, + emit_bbox=args.emit_bbox, + ocr_lang=args.ocr_lang, ) _handle_output(args, result) @@ -205,6 +222,18 @@ def _handle_output(args, result: DocumentConverterResult): if args.output: with open(args.output, "w", encoding="utf-8") as f: f.write(result.markdown) + if result.bbox is not None: + base = os.path.splitext(args.output)[0] + with open(base + ".bbox.json", "w", encoding="utf-8") as bf: + bf.write(result.bbox.to_json()) + elif args.emit_bbox and args.filename: + base = os.path.splitext(args.filename)[0] + md_path = base + ".md" + with open(md_path, "w", encoding="utf-8") as f: + f.write(result.markdown) + if result.bbox is not None: + with open(base + ".bbox.json", "w", encoding="utf-8") as bf: + bf.write(result.bbox.to_json()) else: # Handle stdout encoding errors more gracefully print( diff --git a/packages/markitdown/src/markitdown/_base_converter.py b/packages/markitdown/src/markitdown/_base_converter.py index a6f2a2d9..30258d00 100644 --- a/packages/markitdown/src/markitdown/_base_converter.py +++ b/packages/markitdown/src/markitdown/_base_converter.py @@ -1,5 +1,6 @@ from typing import Any, BinaryIO, Optional from ._stream_info import StreamInfo +from .bbox import BBoxDoc class DocumentConverterResult: @@ -10,6 +11,7 @@ def __init__( 
markdown: str, *, title: Optional[str] = None, + bbox: "BBoxDoc | None" = None, ): """ Initialize the DocumentConverterResult. @@ -23,6 +25,7 @@ def __init__( """ self.markdown = markdown self.title = title + self.bbox = bbox @property def text_content(self) -> str: diff --git a/packages/markitdown/src/markitdown/_markitdown.py b/packages/markitdown/src/markitdown/_markitdown.py index 3027efc6..ef2f9fd4 100644 --- a/packages/markitdown/src/markitdown/_markitdown.py +++ b/packages/markitdown/src/markitdown/_markitdown.py @@ -245,6 +245,8 @@ def convert( source: Union[str, requests.Response, Path, BinaryIO], *, stream_info: Optional[StreamInfo] = None, + emit_bbox: bool = False, + ocr_lang: Optional[str] = None, **kwargs: Any, ) -> DocumentConverterResult: # TODO: deal with kwargs """ @@ -269,22 +271,52 @@ def convert( _kwargs["mock_url"] = _kwargs["url"] del _kwargs["url"] - return self.convert_uri(source, stream_info=stream_info, **_kwargs) + return self.convert_uri( + source, + stream_info=stream_info, + emit_bbox=emit_bbox, + ocr_lang=ocr_lang, + **_kwargs, + ) else: - return self.convert_local(source, stream_info=stream_info, **kwargs) + return self.convert_local( + source, + stream_info=stream_info, + emit_bbox=emit_bbox, + ocr_lang=ocr_lang, + **kwargs, + ) # Path object elif isinstance(source, Path): - return self.convert_local(source, stream_info=stream_info, **kwargs) + return self.convert_local( + source, + stream_info=stream_info, + emit_bbox=emit_bbox, + ocr_lang=ocr_lang, + **kwargs, + ) # Request response elif isinstance(source, requests.Response): - return self.convert_response(source, stream_info=stream_info, **kwargs) + return self.convert_response( + source, + stream_info=stream_info, + emit_bbox=emit_bbox, + ocr_lang=ocr_lang, + **kwargs, + ) # Binary stream elif ( hasattr(source, "read") and callable(source.read) and not isinstance(source, io.TextIOBase) ): - return self.convert_stream(source, stream_info=stream_info, **kwargs) + return 
self.convert_stream( + source, + stream_info=stream_info, + emit_bbox=emit_bbox, + ocr_lang=ocr_lang, + **kwargs, + ) else: raise TypeError( f"Invalid source type: {type(source)}. Expected str, requests.Response, BinaryIO." @@ -297,6 +329,8 @@ def convert_local( stream_info: Optional[StreamInfo] = None, file_extension: Optional[str] = None, # Deprecated -- use stream_info url: Optional[str] = None, # Deprecated -- use stream_info + emit_bbox: bool = False, + ocr_lang: Optional[str] = None, **kwargs: Any, ) -> DocumentConverterResult: if isinstance(path, Path): @@ -325,7 +359,13 @@ def convert_local( guesses = self._get_stream_info_guesses( file_stream=fh, base_guess=base_guess ) - return self._convert(file_stream=fh, stream_info_guesses=guesses, **kwargs) + return self._convert( + file_stream=fh, + stream_info_guesses=guesses, + emit_bbox=emit_bbox, + ocr_lang=ocr_lang, + **kwargs, + ) def convert_stream( self, @@ -334,6 +374,8 @@ def convert_stream( stream_info: Optional[StreamInfo] = None, file_extension: Optional[str] = None, # Deprecated -- use stream_info url: Optional[str] = None, # Deprecated -- use stream_info + emit_bbox: bool = False, + ocr_lang: Optional[str] = None, **kwargs: Any, ) -> DocumentConverterResult: guesses: List[StreamInfo] = [] @@ -372,7 +414,13 @@ def convert_stream( guesses = self._get_stream_info_guesses( file_stream=stream, base_guess=base_guess or StreamInfo() ) - return self._convert(file_stream=stream, stream_info_guesses=guesses, **kwargs) + return self._convert( + file_stream=stream, + stream_info_guesses=guesses, + emit_bbox=emit_bbox, + ocr_lang=ocr_lang, + **kwargs, + ) def convert_url( self, @@ -381,6 +429,8 @@ def convert_url( stream_info: Optional[StreamInfo] = None, file_extension: Optional[str] = None, mock_url: Optional[str] = None, + emit_bbox: bool = False, + ocr_lang: Optional[str] = None, **kwargs: Any, ) -> DocumentConverterResult: """Alias for convert_uri()""" @@ -390,6 +440,8 @@ def convert_url( 
stream_info=stream_info, file_extension=file_extension, mock_url=mock_url, + emit_bbox=emit_bbox, + ocr_lang=ocr_lang, **kwargs, ) @@ -402,6 +454,8 @@ def convert_uri( mock_url: Optional[ str ] = None, # Mock the request as if it came from a different URL + emit_bbox: bool = False, + ocr_lang: Optional[str] = None, **kwargs: Any, ) -> DocumentConverterResult: uri = uri.strip() @@ -418,6 +472,8 @@ def convert_uri( stream_info=stream_info, file_extension=file_extension, url=mock_url, + emit_bbox=emit_bbox, + ocr_lang=ocr_lang, **kwargs, ) # Data URIs @@ -436,6 +492,8 @@ def convert_uri( stream_info=base_guess, file_extension=file_extension, url=mock_url, + emit_bbox=emit_bbox, + ocr_lang=ocr_lang, **kwargs, ) # HTTP/HTTPS URIs @@ -461,6 +519,8 @@ def convert_response( stream_info: Optional[StreamInfo] = None, file_extension: Optional[str] = None, # Deprecated -- use stream_info url: Optional[str] = None, # Deprecated -- use stream_info + emit_bbox: bool = False, + ocr_lang: Optional[str] = None, **kwargs: Any, ) -> DocumentConverterResult: # If there is a content-type header, get the mimetype and charset (if present) @@ -524,7 +584,13 @@ def convert_response( guesses = self._get_stream_info_guesses( file_stream=buffer, base_guess=base_guess ) - return self._convert(file_stream=buffer, stream_info_guesses=guesses, **kwargs) + return self._convert( + file_stream=buffer, + stream_info_guesses=guesses, + emit_bbox=emit_bbox, + ocr_lang=ocr_lang, + **kwargs, + ) def _convert( self, *, file_stream: BinaryIO, stream_info_guesses: List[StreamInfo], **kwargs diff --git a/packages/markitdown/src/markitdown/bbox.py b/packages/markitdown/src/markitdown/bbox.py new file mode 100644 index 00000000..75263266 --- /dev/null +++ b/packages/markitdown/src/markitdown/bbox.py @@ -0,0 +1,51 @@ +from __future__ import annotations + +"""Utilities and data structures for bounding box sidecar emission.""" + +from dataclasses import dataclass, asdict, field +from typing import List, Optional, 
Dict, Any +import json + + +@dataclass +class BBoxPage: + page: int + width: float + height: float + + +@dataclass +class BBoxLine: + page: int + text: str + bbox_norm: List[float] + bbox_abs: List[float] + confidence: Optional[float] + md_span: Optional[Dict[str, Optional[int]]] + + +@dataclass +class BBoxWord: + page: int + text: str + bbox_norm: List[float] + bbox_abs: List[float] + confidence: Optional[float] + line_id: int + + +@dataclass +class BBoxDoc: + """Container for bounding box information.""" + + version: str = "1.0" + source: str = "" + pages: List[BBoxPage] = field(default_factory=list) + lines: List[BBoxLine] = field(default_factory=list) + words: List[BBoxWord] = field(default_factory=list) + + def to_dict(self) -> Dict[str, Any]: + return asdict(self) + + def to_json(self) -> str: + return json.dumps(self.to_dict(), ensure_ascii=False, indent=2) diff --git a/packages/markitdown/src/markitdown/converters/_image_converter.py b/packages/markitdown/src/markitdown/converters/_image_converter.py index dd8fbac6..9298b36d 100644 --- a/packages/markitdown/src/markitdown/converters/_image_converter.py +++ b/packages/markitdown/src/markitdown/converters/_image_converter.py @@ -1,9 +1,13 @@ -from typing import BinaryIO, Any, Union +from typing import BinaryIO, Any, Union, Optional, Dict import base64 import mimetypes +import io +import os +from warnings import warn from ._exiftool import exiftool_metadata from .._base_converter import DocumentConverter, DocumentConverterResult from .._stream_info import StreamInfo +from ..bbox import BBoxDoc, BBoxPage, BBoxLine, BBoxWord ACCEPTED_MIME_TYPE_PREFIXES = [ "image/jpeg", @@ -40,6 +44,9 @@ def convert( self, file_stream: BinaryIO, stream_info: StreamInfo, + *, + emit_bbox: bool = False, + ocr_lang: Optional[str] = None, **kwargs: Any, # Options to pass to the converter ) -> DocumentConverterResult: md_content = "" @@ -80,9 +87,83 @@ def convert( if llm_description is not None: md_content += "\n# Description:\n" + 
llm_description.strip() + "\n" - return DocumentConverterResult( - markdown=md_content, - ) + bbox_doc: Optional[BBoxDoc] = None + if emit_bbox: + try: + from PIL import Image + import pytesseract + from pytesseract import Output + except Exception: + warn("emit_bbox requested but pytesseract/Pillow not installed; skipping bbox output") + else: + cur_pos = file_stream.tell() + file_stream.seek(0) + img = Image.open(file_stream) + file_stream.seek(cur_pos) + width, height = img.size + lang = ocr_lang or os.getenv("MARKITDOWN_OCR_LANG", "eng") + df = pytesseract.image_to_data(img, output_type=Output.DATAFRAME, lang=lang) + line_map: Dict[int, int] = {} + tmp: Dict[int, Dict[str, Any]] = {} + words: list[BBoxWord] = [] + for _, row in df[df.level == 5].iterrows(): + text = str(row["text"]).strip() + if not text: + continue + left, top, w, h = int(row.left), int(row.top), int(row.width), int(row.height) + x1, y1, x2, y2 = left, top, left + w, top + h + conf = float(row.conf) if row.conf != -1 else None + bbox_abs = [x1, y1, x2, y2] + bbox_norm = [x1 / width, y1 / height, w / width, h / height] + key = int(row.line_num) + line_id = line_map.setdefault(key, len(line_map)) + t = tmp.setdefault( + key, + {"page": 1, "words": [], "minx": x1, "miny": y1, "maxx": x2, "maxy": y2}, + ) + t["minx"] = min(t["minx"], x1) + t["miny"] = min(t["miny"], y1) + t["maxx"] = max(t["maxx"], x2) + t["maxy"] = max(t["maxy"], y2) + t["words"].append(text) + words.append( + BBoxWord( + page=1, + text=text, + bbox_norm=bbox_norm, + bbox_abs=bbox_abs, + confidence=conf, + line_id=line_id, + ) + ) + line_list = [None] * len(line_map) + for key, idx in line_map.items(): + t = tmp[key] + x1, y1, x2, y2 = t["minx"], t["miny"], t["maxx"], t["maxy"] + bbox_abs = [x1, y1, x2, y2] + bbox_norm = [ + x1 / width, + y1 / height, + (x2 - x1) / width, + (y2 - y1) / height, + ] + text_line = " ".join(t["words"]).strip() + line_list[idx] = BBoxLine( + page=1, + text=text_line, + bbox_norm=bbox_norm, + 
bbox_abs=bbox_abs, + confidence=None, + md_span={"start": None, "end": None}, + ) + bbox_doc = BBoxDoc( + source=stream_info.filename or "", + pages=[BBoxPage(page=1, width=width, height=height)], + lines=line_list, + words=words, + ) + + return DocumentConverterResult(markdown=md_content, bbox=bbox_doc) def _get_llm_description( self, diff --git a/packages/markitdown/src/markitdown/converters/_pdf_converter.py b/packages/markitdown/src/markitdown/converters/_pdf_converter.py index 63162d52..dd36f80c 100644 --- a/packages/markitdown/src/markitdown/converters/_pdf_converter.py +++ b/packages/markitdown/src/markitdown/converters/_pdf_converter.py @@ -1,11 +1,13 @@ -import sys import io - -from typing import BinaryIO, Any +import os +import sys +from typing import Any, BinaryIO, Dict, Optional +from warnings import warn from .._base_converter import DocumentConverter, DocumentConverterResult from .._stream_info import StreamInfo +from ..bbox import BBoxDoc, BBoxPage, BBoxLine, BBoxWord from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE @@ -55,6 +57,9 @@ def convert( self, file_stream: BinaryIO, stream_info: StreamInfo, + *, + emit_bbox: bool = False, + ocr_lang: Optional[str] = None, **kwargs: Any, # Options to pass to the converter ) -> DocumentConverterResult: # Check the dependencies @@ -72,6 +77,215 @@ def convert( ) assert isinstance(file_stream, io.IOBase) # for mypy - return DocumentConverterResult( - markdown=pdfminer.high_level.extract_text(file_stream), - ) + + data = file_stream.read() + markdown = pdfminer.high_level.extract_text(io.BytesIO(data)) + + bbox_doc: Optional[BBoxDoc] = None + if emit_bbox: + try: + import pdfplumber # type: ignore + except Exception: + warn( + "emit_bbox requested but pdfplumber is not installed; skipping bbox output", + ) + emit_bbox = False + + if emit_bbox: + with pdfplumber.open(io.BytesIO(data)) as doc: + pages: list[BBoxPage] = [] + lines: list[BBoxLine] = [] + words: list[BBoxWord] = [] + 
plain_lines: list[str] = [] + + for pno, page in enumerate(doc.pages): + wlist = page.extract_words(use_text_flow=True) + base_line_id = len(lines) + + if len(wlist) == 0: + try: + from PIL import Image + import pytesseract + from pytesseract import Output + except Exception: + warn( + "emit_bbox requested but pytesseract/Pillow not available; skipping bbox output", + ) + continue + img = page.to_image(resolution=200).original + width, height = img.width, img.height + pages.append(BBoxPage(page=pno + 1, width=width, height=height)) + lang = ocr_lang or os.getenv("MARKITDOWN_OCR_LANG", "eng") + df = pytesseract.image_to_data( + img, output_type=Output.DATAFRAME, lang=lang + ) + line_map: Dict[int, int] = {} + tmp: Dict[int, Dict[str, Any]] = {} + for _, row in df[df.level == 5].iterrows(): + text = str(row["text"]).strip() + if not text: + continue + left, top, widthw, heighth = ( + int(row.left), + int(row.top), + int(row.width), + int(row.height), + ) + conf = float(row.conf) if row.conf != -1 else None + x1, y1, x2, y2 = left, top, left + widthw, top + heighth + bbox_abs = [x1, y1, x2, y2] + bbox_norm = [ + x1 / width, + y1 / height, + widthw / width, + heighth / height, + ] + key = int(row.line_num) + line_id = line_map.setdefault( + key, base_line_id + len(line_map) + ) + t = tmp.setdefault( + line_id, + { + "page": pno + 1, + "words": [], + "minx": x1, + "miny": y1, + "maxx": x2, + "maxy": y2, + }, + ) + t["minx"] = min(t["minx"], x1) + t["miny"] = min(t["miny"], y1) + t["maxx"] = max(t["maxx"], x2) + t["maxy"] = max(t["maxy"], y2) + t["words"].append(text) + words.append( + BBoxWord( + page=pno + 1, + text=text, + bbox_norm=bbox_norm, + bbox_abs=bbox_abs, + confidence=conf, + line_id=line_id, + ) + ) + for idx in sorted(tmp.keys()): + t = tmp[idx] + x1, y1, x2, y2 = ( + t["minx"], + t["miny"], + t["maxx"], + t["maxy"], + ) + bbox_abs = [x1, y1, x2, y2] + bbox_norm = [ + x1 / width, + y1 / height, + (x2 - x1) / width, + (y2 - y1) / height, + ] + text_line = " 
".join(t["words"]).strip() + lines.append( + BBoxLine( + page=pno + 1, + text=text_line, + bbox_norm=bbox_norm, + bbox_abs=bbox_abs, + confidence=None, + md_span={"start": None, "end": None}, + ) + ) + plain_lines.append(text_line) + else: + width, height = float(page.width), float(page.height) + pages.append( + BBoxPage(page=pno + 1, width=width, height=height) + ) + sorted_words = sorted( + wlist, key=lambda w: (float(w["top"]), float(w["x0"])) + ) + tmp: Dict[int, Dict[str, Any]] = {} + current_line_id: Optional[int] = None + current_top: Optional[float] = None + line_tol = 2.0 + for w in sorted_words: + text = str(w.get("text", "")).strip() + if not text: + continue + x0 = float(w["x0"]) + top = float(w["top"]) + x1 = float(w["x1"]) + bottom = float(w["bottom"]) + if current_top is None or abs(top - current_top) > line_tol: + current_line_id = base_line_id + len(tmp) + tmp[current_line_id] = { + "page": pno + 1, + "words": [], + "minx": x0, + "miny": top, + "maxx": x1, + "maxy": bottom, + } + current_top = top + t = tmp[current_line_id] + t["minx"] = min(t["minx"], x0) + t["miny"] = min(t["miny"], top) + t["maxx"] = max(t["maxx"], x1) + t["maxy"] = max(t["maxy"], bottom) + t["words"].append(text) + bbox_abs = [x0, top, x1, bottom] + bbox_norm = [ + x0 / width, + top / height, + (x1 - x0) / width, + (bottom - top) / height, + ] + words.append( + BBoxWord( + page=pno + 1, + text=text, + bbox_norm=bbox_norm, + bbox_abs=bbox_abs, + confidence=None, + line_id=current_line_id, + ) + ) + for idx in sorted(tmp.keys()): + t = tmp[idx] + x1, y1, x2, y2 = ( + t["minx"], + t["miny"], + t["maxx"], + t["maxy"], + ) + bbox_abs = [x1, y1, x2, y2] + bbox_norm = [ + x1 / width, + y1 / height, + (x2 - x1) / width, + (y2 - y1) / height, + ] + text_line = " ".join(t["words"]).strip() + lines.append( + BBoxLine( + page=pno + 1, + text=text_line, + bbox_norm=bbox_norm, + bbox_abs=bbox_abs, + confidence=None, + md_span={"start": None, "end": None}, + ) + ) + 
plain_lines.append(text_line) + + bbox_doc = BBoxDoc( + source=stream_info.filename or "", + pages=pages, + lines=lines, + words=words, + ) + if not markdown.strip(): + markdown = "\n".join(plain_lines) + + return DocumentConverterResult(markdown=markdown, bbox=bbox_doc) diff --git a/packages/markitdown/tests/bbox/schema.json b/packages/markitdown/tests/bbox/schema.json new file mode 100644 index 00000000..93f51ff0 --- /dev/null +++ b/packages/markitdown/tests/bbox/schema.json @@ -0,0 +1,58 @@ +{ + "$schema": "http://json-schema.org/draft-07/schema#", + "type": "object", + "required": ["version", "source", "pages", "lines", "words"], + "properties": { + "version": {"type": "string"}, + "source": {"type": "string"}, + "pages": { + "type": "array", + "items": { + "type": "object", + "required": ["page", "width", "height"], + "properties": { + "page": {"type": "integer"}, + "width": {"type": "number"}, + "height": {"type": "number"} + } + } + }, + "lines": { + "type": "array", + "items": { + "type": "object", + "required": ["page", "text", "bbox_norm", "bbox_abs", "confidence", "md_span"], + "properties": { + "page": {"type": "integer"}, + "text": {"type": "string"}, + "bbox_norm": {"type": "array", "items": {"type": "number"}, "minItems": 4, "maxItems": 4}, + "bbox_abs": {"type": "array", "items": {"type": "number"}, "minItems": 4, "maxItems": 4}, + "confidence": {"type": ["number", "null"]}, + "md_span": { + "type": ["object"], + "required": ["start", "end"], + "properties": { + "start": {"type": ["integer", "null"]}, + "end": {"type": ["integer", "null"]} + } + } + } + } + }, + "words": { + "type": "array", + "items": { + "type": "object", + "required": ["page", "text", "bbox_norm", "bbox_abs", "confidence", "line_id"], + "properties": { + "page": {"type": "integer"}, + "text": {"type": "string"}, + "bbox_norm": {"type": "array", "items": {"type": "number"}, "minItems": 4, "maxItems": 4}, + "bbox_abs": {"type": "array", "items": {"type": "number"}, "minItems": 4, 
"maxItems": 4}, + "confidence": {"type": ["number", "null"]}, + "line_id": {"type": "integer"} + } + } + } + } +} diff --git a/packages/markitdown/tests/bbox/test_bbox_image_basic.py b/packages/markitdown/tests/bbox/test_bbox_image_basic.py new file mode 100644 index 00000000..9599d28e --- /dev/null +++ b/packages/markitdown/tests/bbox/test_bbox_image_basic.py @@ -0,0 +1,34 @@ +import io +from pathlib import Path +import io + +import pytest +from PIL import Image, ImageDraw + +from markitdown import MarkItDown, StreamInfo + + +def test_bbox_image_basic(tmp_path: Path): + pytesseract = pytest.importorskip("pytesseract") + try: + pytesseract.get_tesseract_version() + except Exception: + pytest.skip("tesseract not installed") + img = Image.new("RGB", (200, 60), color="white") + d = ImageDraw.Draw(img) + d.text((10, 10), "Hello 123", fill="black") + buf = io.BytesIO() + img.save(buf, format="PNG") + buf.seek(0) + + md = MarkItDown() + res = md.convert_stream( + buf, + stream_info=StreamInfo(extension=".png"), + emit_bbox=True, + ) + assert res.bbox is not None + bbox = res.bbox + assert bbox.words + for w in bbox.words: + assert all(0 <= v <= 1 for v in w.bbox_norm) diff --git a/packages/markitdown/tests/bbox/test_bbox_pdf_basic.py b/packages/markitdown/tests/bbox/test_bbox_pdf_basic.py new file mode 100644 index 00000000..90e30e18 --- /dev/null +++ b/packages/markitdown/tests/bbox/test_bbox_pdf_basic.py @@ -0,0 +1,42 @@ +import io +from pathlib import Path + +import json +import jsonschema +from reportlab.pdfgen import canvas + +from markitdown import MarkItDown, StreamInfo + + +def _make_pdf() -> bytes: + buf = io.BytesIO() + c = canvas.Canvas(buf) + c.drawString(100, 700, "Hello") + c.showPage() + c.drawString(100, 700, "World") + c.save() + return buf.getvalue() + + +def test_bbox_pdf_basic(tmp_path: Path): + pdf_bytes = _make_pdf() + md = MarkItDown() + res = md.convert_stream( + io.BytesIO(pdf_bytes), + stream_info=StreamInfo(extension=".pdf"), + emit_bbox=True, 
+ ) + assert res.bbox is not None + bbox = res.bbox + assert len(bbox.pages) == 2 + for p in bbox.pages: + assert p.width > 0 and p.height > 0 + assert bbox.words + for w in bbox.words: + assert all(0 <= v <= 1 for v in w.bbox_norm) + assert 0 <= w.line_id < len(bbox.lines) + for idx, line in enumerate(bbox.lines): + lw = [w.text for w in bbox.words if w.line_id == idx] + assert " ".join(lw).strip() == line.text.strip() + schema = json.load(open(Path(__file__).parent / "schema.json")) + jsonschema.validate(instance=bbox.to_dict(), schema=schema) diff --git a/packages/markitdown/tests/bbox/test_bbox_pdf_scanned_fallback.py b/packages/markitdown/tests/bbox/test_bbox_pdf_scanned_fallback.py new file mode 100644 index 00000000..78f41705 --- /dev/null +++ b/packages/markitdown/tests/bbox/test_bbox_pdf_scanned_fallback.py @@ -0,0 +1,40 @@ +import io +from pathlib import Path + +import pytest +from PIL import Image, ImageDraw +from reportlab.pdfgen import canvas + +from markitdown import MarkItDown, StreamInfo + + +def test_bbox_pdf_scanned_fallback(tmp_path: Path): + pytesseract = pytest.importorskip("pytesseract") + try: + pytesseract.get_tesseract_version() + except Exception: + pytest.skip("tesseract not installed") + img = Image.new("RGB", (200, 60), color="white") + d = ImageDraw.Draw(img) + d.text((10, 10), "OCR", fill="black") + img_bytes = io.BytesIO() + img.save(img_bytes, format="PNG") + img_bytes.seek(0) + + buf = io.BytesIO() + c = canvas.Canvas(buf) + c.drawInlineImage(Image.open(img_bytes), 0, 0) + c.save() + pdf_bytes = buf.getvalue() + + md = MarkItDown() + res = md.convert_stream( + io.BytesIO(pdf_bytes), + stream_info=StreamInfo(extension=".pdf"), + emit_bbox=True, + ) + assert res.bbox is not None + if not res.bbox.words: + pytest.skip("OCR produced no words") + for w in res.bbox.words: + assert all(0 <= v <= 1 for v in w.bbox_norm) diff --git a/packages/markitdown/tests/bbox/test_bbox_schema_validation.py 
b/packages/markitdown/tests/bbox/test_bbox_schema_validation.py
new file mode 100644
index 00000000..e63fdfe3
--- /dev/null
+++ b/packages/markitdown/tests/bbox/test_bbox_schema_validation.py
@@ -0,0 +1,65 @@
+import io
+import json
+from pathlib import Path
+import io
+import json
+import pytest
+
+import jsonschema
+from PIL import Image, ImageDraw
+from reportlab.pdfgen import canvas
+
+from markitdown import MarkItDown, StreamInfo
+
+
+def _make_pdf() -> bytes:
+    """Return the bytes of a PDF with the string "One" drawn on it."""
+    buf = io.BytesIO()
+    c = canvas.Canvas(buf)
+    c.drawString(100, 700, "One")
+    c.showPage()
+    c.save()
+    return buf.getvalue()
+
+
+def _make_png() -> bytes:
+    """Return the bytes of a white 100x40 PNG with the text "img" drawn on it."""
+    img = Image.new("RGB", (100, 40), color="white")
+    d = ImageDraw.Draw(img)
+    d.text((5, 5), "img", fill="black")
+    b = io.BytesIO()
+    img.save(b, format="PNG")
+    return b.getvalue()
+
+
+def _validate(bbox_dict, schema):
+    """Validate *bbox_dict* against *schema* with jsonschema."""
+    jsonschema.validate(instance=bbox_dict, schema=schema)
+
+
+def test_bbox_schema_validation(tmp_path: Path):
+    """Validate the bbox dicts produced for a PDF and a PNG against schema.json."""
+    pytesseract = pytest.importorskip("pytesseract")
+    try:
+        pytesseract.get_tesseract_version()
+    except Exception:
+        pytest.skip("tesseract not installed")
+    with open(Path(__file__).parent / "schema.json", encoding="utf-8") as fh:
+        schema = json.load(fh)
+    md = MarkItDown()
+
+    pdf_res = md.convert_stream(
+        io.BytesIO(_make_pdf()),
+        stream_info=StreamInfo(extension=".pdf"),
+        emit_bbox=True,
+    )
+    assert pdf_res.bbox is not None
+    _validate(pdf_res.bbox.to_dict(), schema)
+
+    png_res = md.convert_stream(
+        io.BytesIO(_make_png()),
+        stream_info=StreamInfo(extension=".png"),
+        emit_bbox=True,
+    )
+    assert png_res.bbox is not None
+    _validate(png_res.bbox.to_dict(), schema)
diff --git a/packages/markitdown/tests/bbox/test_cli_emits_sidecar.py b/packages/markitdown/tests/bbox/test_cli_emits_sidecar.py
new file mode 100644
index 00000000..e96cc5b3
--- /dev/null
+++ b/packages/markitdown/tests/bbox/test_cli_emits_sidecar.py
@@ -0,0 +1,38 @@
+import json
+import subprocess
+import sys
+from pathlib import Path
+
+from reportlab.pdfgen import canvas
+import jsonschema
+import os
+
+
+def _make_pdf(path: Path) -> None:
+    """Write a PDF with the string "Hi" drawn on it to *path*."""
+    c = canvas.Canvas(str(path))
+    c.drawString(100, 700, "Hi")
+    c.save()
+
+
+def test_cli_emits_sidecar(tmp_path: Path):
+    """Run the CLI with ``--emit-bbox`` and validate the sidecar JSON it writes."""
+    pdf_path = tmp_path / "sample.pdf"
+    _make_pdf(pdf_path)
+    env = os.environ.copy()
+    # parents[4]: bbox -> tests -> markitdown -> packages -> repository root.
+    repo_root = Path(__file__).resolve().parents[4]
+    env["PYTHONPATH"] = str(repo_root / "packages/markitdown/src")
+    subprocess.run(
+        [sys.executable, "-m", "markitdown", str(pdf_path), "--emit-bbox"],
+        check=True,
+        cwd=tmp_path,
+        env=env,
+    )
+    sidecar = pdf_path.with_suffix(".bbox.json")
+    assert sidecar.exists()
+    with open(Path(__file__).parent / "schema.json", encoding="utf-8") as fh:
+        schema = json.load(fh)
+    with open(sidecar, encoding="utf-8") as fh:
+        sidecar_data = json.load(fh)
+    jsonschema.validate(instance=sidecar_data, schema=schema)
diff --git a/packages/markitdown/tests/bbox/test_docling_dataset.py b/packages/markitdown/tests/bbox/test_docling_dataset.py
new file mode 100644
index 00000000..4ac0f600
--- /dev/null
+++ b/packages/markitdown/tests/bbox/test_docling_dataset.py
@@ -0,0 +1,79 @@
+import json
+import shutil
+import urllib.request
+from pathlib import Path
+
+import pytest
+
+from markitdown import MarkItDown
+
+DOC_BASE = "https://raw.githubusercontent.com/docling-project/docling/main/tests/data_scanned"
+PDF_URL = f"{DOC_BASE}/ocr_test.pdf"
+MD_URL = f"{DOC_BASE}/groundtruth/docling_v2/ocr_test.md"
+JSON_URL = f"{DOC_BASE}/groundtruth/docling_v2/ocr_test.json"
+
+
+def _fetch(url: str, dest: Path) -> None:
+    """Download *url* and write the response body to *dest*."""
+    # Close the response deterministically; the timeout bounds a stalled download.
+    with urllib.request.urlopen(url, timeout=30) as resp:
+        dest.write_bytes(resp.read())
+
+
+@pytest.mark.skipif(shutil.which("tesseract") is None, reason="tesseract not installed")
+def test_docling_ocr_pdf(tmp_path: Path) -> None:
+    """Compare the Markdown and first line bbox with the Docling ground truth."""
+    pdfplumber = pytest.importorskip("pdfplumber")
+    pytesseract = pytest.importorskip("pytesseract")
+    try:
+        pytesseract.get_tesseract_version()
+    except Exception:
+        pytest.skip("tesseract not installed")
+
+    pdf_path = tmp_path / "ocr_test.pdf"
+    md_path = tmp_path / "ocr_test.md"
+    json_path = tmp_path / "ocr_test.json"
+
+    _fetch(PDF_URL, pdf_path)
+    _fetch(MD_URL, md_path)
+    _fetch(JSON_URL, json_path)
+
+    md = MarkItDown()
+    result = md.convert_local(pdf_path, emit_bbox=True)
+
+    assert result.bbox is not None
+    # normalize whitespace since OCR may insert newlines
+    got_md = " ".join(result.markdown.split())
+    expect_md = " ".join(md_path.read_text(encoding="utf-8").split())
+    assert got_md == expect_md
+
+    gt = json.loads(json_path.read_text(encoding="utf-8"))
+    page_info = next(iter(gt["pages"].values()))
+    width = page_info["size"]["width"]
+    height = page_info["size"]["height"]
+    bbox_gt = gt["texts"][0]["prov"][0]["bbox"]
+    x1, y_top, x2, y_bottom = (
+        bbox_gt["l"],
+        bbox_gt["t"],
+        bbox_gt["r"],
+        bbox_gt["b"],
+    )
+    y1 = height - y_top
+    y2 = height - y_bottom
+    line = result.bbox.lines[0]
+    page_dims = result.bbox.pages[line.page - 1]
+    scale_x = page_dims.width / width
+    scale_y = page_dims.height / height
+    expected_abs = [x1 * scale_x, y1 * scale_y, x2 * scale_x, y2 * scale_y]
+    # top-left corner should be close to groundtruth when scaled
+    for got, exp in zip(line.bbox_abs[:2], expected_abs[:2]):
+        assert got == pytest.approx(exp, abs=20.0)
+
+    # width should roughly match after scaling
+    got_width = line.bbox_abs[2] - line.bbox_abs[0]
+    exp_width = expected_abs[2] - expected_abs[0]
+    assert got_width == pytest.approx(exp_width, abs=20.0)
+
+    # normalized coordinates should be in range
+    for v in line.bbox_norm:
+        assert 0 <= v <= 1
diff --git a/packages/markitdown/tests/bbox/test_no_overhead_when_disabled.py b/packages/markitdown/tests/bbox/test_no_overhead_when_disabled.py
new file mode 100644
index 00000000..ef0a2fa4
--- /dev/null
+++ b/packages/markitdown/tests/bbox/test_no_overhead_when_disabled.py
@@ -0,0 +1,28 @@
+import io
+import sys
+import time
+
+from markitdown import MarkItDown, StreamInfo
+
+
+def test_no_overhead_when_disabled():
+    """Convert plain text and check that pytesseract/pdfplumber are not imported."""
+    md = MarkItDown()
+    # Forget cached imports so the sys.modules assertions detect a fresh import.
+    sys.modules.pop("pytesseract", None)
+    sys.modules.pop("pdfplumber", None)
+    stream = io.BytesIO(b"hello world")
+    start = time.perf_counter()
+    md.convert_stream(stream, stream_info=StreamInfo(extension=".txt"), emit_bbox=False)
+    t_disabled = time.perf_counter() - start
+    assert "pytesseract" not in sys.modules
+    assert "pdfplumber" not in sys.modules
+    assert t_disabled < 0.5
+
+    stream = io.BytesIO(b"hello world")
+    start = time.perf_counter()
+    md.convert_stream(stream, stream_info=StreamInfo(extension=".txt"), emit_bbox=True)
+    t_enabled = time.perf_counter() - start
+    assert t_enabled < 0.5
+    assert "pytesseract" not in sys.modules
+    assert "pdfplumber" not in sys.modules