From bdfbab89deb708c65a2299205bbdf444b5eba3f9 Mon Sep 17 00:00:00 2001 From: brainylark Date: Wed, 16 Jul 2025 11:12:10 +0200 Subject: [PATCH] Incorporated options to embed or save images off of PDF --- packages/markitdown/pyproject.toml | 3 ++- .../src/markitdown/converters/_pdf_converter.py | 12 ++++++++++++ 2 files changed, 14 insertions(+), 1 deletion(-) diff --git a/packages/markitdown/pyproject.toml b/packages/markitdown/pyproject.toml index afb5d319..e0f2c154 100644 --- a/packages/markitdown/pyproject.toml +++ b/packages/markitdown/pyproject.toml @@ -42,6 +42,7 @@ all = [ "xlrd", "lxml", "pdfminer.six", + "pymupdf4llm", "olefile", "pydub", "SpeechRecognition", @@ -53,7 +54,7 @@ pptx = ["python-pptx"] docx = ["mammoth", "lxml"] xlsx = ["pandas", "openpyxl"] xls = ["pandas", "xlrd"] -pdf = ["pdfminer.six"] +pdf = ["pdfminer.six", "pymupdf4llm"] outlook = ["olefile"] audio-transcription = ["pydub", "SpeechRecognition"] youtube-transcription = ["youtube-transcript-api"] diff --git a/packages/markitdown/src/markitdown/converters/_pdf_converter.py b/packages/markitdown/src/markitdown/converters/_pdf_converter.py index 63162d52..9c1e4097 100644 --- a/packages/markitdown/src/markitdown/converters/_pdf_converter.py +++ b/packages/markitdown/src/markitdown/converters/_pdf_converter.py @@ -3,6 +3,7 @@ from typing import BinaryIO, Any +import pymupdf4llm from .._base_converter import DocumentConverter, DocumentConverterResult from .._stream_info import StreamInfo @@ -58,6 +59,7 @@ def convert( **kwargs: Any, # Options to pass to the converter ) -> DocumentConverterResult: # Check the dependencies + if _dependency_exc_info is not None: raise MissingDependencyException( MISSING_DEPENDENCY_MESSAGE.format( @@ -72,6 +74,16 @@ def convert( ) assert isinstance(file_stream, io.IOBase) # for mypy + + use_pdf4llm = kwargs.get("use_pdf4llm", False) + + if use_pdf4llm: + args_pdf4llm = kwargs.get("args_pdf4llm", {}) + + return DocumentConverterResult( + markdown=pymupdf4llm.to_markdown(file_stream, **args_pdf4llm) + ) + return DocumentConverterResult( markdown=pdfminer.high_level.extract_text(file_stream), )