diff --git a/packages/markitdown/pyproject.toml b/packages/markitdown/pyproject.toml index afb5d319..e25b0dd9 100644 --- a/packages/markitdown/pyproject.toml +++ b/packages/markitdown/pyproject.toml @@ -47,7 +47,8 @@ all = [ "SpeechRecognition", "youtube-transcript-api~=1.0.0", "azure-ai-documentintelligence", - "azure-identity" + "azure-identity", + "pdfplumber", ] pptx = ["python-pptx"] docx = ["mammoth", "lxml"] diff --git a/packages/markitdown/src/markitdown/_markitdown.py b/packages/markitdown/src/markitdown/_markitdown.py index 3027efc6..d00defba 100644 --- a/packages/markitdown/src/markitdown/_markitdown.py +++ b/packages/markitdown/src/markitdown/_markitdown.py @@ -115,6 +115,7 @@ def __init__( # TODO - remove these (see enable_builtins) self._llm_client: Any = None self._llm_model: Union[str | None] = None + self._llm_prompt: Union[str | None] = None self._exiftool_path: Union[str | None] = None self._style_map: Union[str | None] = None @@ -139,6 +140,7 @@ def enable_builtins(self, **kwargs) -> None: # TODO: Move these into converter constructors self._llm_client = kwargs.get("llm_client") self._llm_model = kwargs.get("llm_model") + self._llm_prompt = kwargs.get("llm_prompt") self._exiftool_path = kwargs.get("exiftool_path") self._style_map = kwargs.get("style_map") @@ -559,6 +561,9 @@ def _convert( if "llm_model" not in _kwargs and self._llm_model is not None: _kwargs["llm_model"] = self._llm_model + if "llm_prompt" not in _kwargs and self._llm_prompt is not None: + _kwargs["llm_prompt"] = self._llm_prompt + if "style_map" not in _kwargs and self._style_map is not None: _kwargs["style_map"] = self._style_map diff --git a/packages/markitdown/src/markitdown/converters/_pdf_converter.py b/packages/markitdown/src/markitdown/converters/_pdf_converter.py index 63162d52..1be3110d 100644 --- a/packages/markitdown/src/markitdown/converters/_pdf_converter.py +++ b/packages/markitdown/src/markitdown/converters/_pdf_converter.py @@ -4,6 +4,7 @@ from typing import 
BinaryIO, Any +from ._html_converter import HtmlConverter from .._base_converter import DocumentConverter, DocumentConverterResult from .._stream_info import StreamInfo from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE @@ -72,6 +73,84 @@ def convert( ) assert isinstance(file_stream, io.IOBase) # for mypy - return DocumentConverterResult( - markdown=pdfminer.high_level.extract_text(file_stream), - ) + try: + import base64 + from PIL import Image + + import pdfplumber + import markdownify + + def extract_text_from_pdf(pdf_path): + markdown_text = "" + with pdfplumber.open(pdf_path) as pdf: + for page in pdf.pages: + text = page.extract_text() + if text: + markdown_text += ( + markdownify.markdownify(text, heading_style="ATX") + + "\n\n" + ) + return markdown_text + + def extract_images_from_pdf(pdf_path): + image_list = [] + with pdfplumber.open(pdf_path) as pdf: + for i, page in enumerate(pdf.pages): + for img in page.images: + bbox = (img["x0"], img["top"], img["x1"], img["bottom"]) + cropped_image = page.to_image(resolution=150) + image_list.append((i + 1, cropped_image)) + return image_list + + def describe_image_with_llm(image: Image.Image, **kwargs) -> str: + try: + llm_client = kwargs.get("llm_client") + llm_model = kwargs.get("llm_model") + buffered = io.BytesIO() + image.save(buffered, format="PNG") + img_b64 = base64.b64encode(buffered.getvalue()).decode() + + response = llm_client.chat.completions.create( + model=llm_model, + messages=[ + { + "role": "user", + "content": [ + { + "type": "text", + "text": "Write a detailed caption for this image.", + }, + { + "type": "image_url", + "image_url": { + "url": f"data:image/png;base64,{img_b64}" + }, + }, + ], + } + ], + ) + return response.choices[0].message.content + except Exception: + pass + + def convert_pdf_to_markdown_with_images(pdf_path, **kwargs): + md_text = extract_text_from_pdf(pdf_path) + images = extract_images_from_pdf(pdf_path) + + for idx, (page_num, image) in 
enumerate(images): + description = describe_image_with_llm(image, **kwargs) + md_text += ( + f"\n\n![Image{idx+1} - Page {page_num}]\n\n> {description}\n" + ) + + return md_text + + return DocumentConverterResult( + markdown=convert_pdf_to_markdown_with_images(file_stream, **kwargs) + ) + + except Exception as e: + return DocumentConverterResult( + markdown=pdfminer.high_level.extract_text(file_stream), + )