Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion packages/markitdown/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,8 @@ all = [
"SpeechRecognition",
"youtube-transcript-api~=1.0.0",
"azure-ai-documentintelligence",
"azure-identity"
"azure-identity",
"pdfplumber",
]
pptx = ["python-pptx"]
docx = ["mammoth", "lxml"]
Expand Down
5 changes: 5 additions & 0 deletions packages/markitdown/src/markitdown/_markitdown.py
Original file line number Diff line number Diff line change
Expand Up @@ -115,6 +115,7 @@ def __init__(
# TODO - remove these (see enable_builtins)
self._llm_client: Any = None
self._llm_model: Union[str | None] = None
# Spelled `_llm_prompt` to match the attribute that enable_builtins assigns
# and _convert reads; a misspelling here leaves that attribute undefined
# until enable_builtins has run.
self._llm_prompt: Union[str | None] = None
self._exiftool_path: Union[str | None] = None
self._style_map: Union[str | None] = None

Expand All @@ -139,6 +140,7 @@ def enable_builtins(self, **kwargs) -> None:
# TODO: Move these into converter constructors
self._llm_client = kwargs.get("llm_client")
self._llm_model = kwargs.get("llm_model")
self._llm_prompt = kwargs.get("llm_prompt")
self._exiftool_path = kwargs.get("exiftool_path")
self._style_map = kwargs.get("style_map")

Expand Down Expand Up @@ -559,6 +561,9 @@ def _convert(
if "llm_model" not in _kwargs and self._llm_model is not None:
_kwargs["llm_model"] = self._llm_model

if "llm_prompt" not in _kwargs and self._llm_prompt is not None:
_kwargs["llm_prompt"] = self._llm_prompt

if "style_map" not in _kwargs and self._style_map is not None:
_kwargs["style_map"] = self._style_map

Expand Down
85 changes: 82 additions & 3 deletions packages/markitdown/src/markitdown/converters/_pdf_converter.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
from typing import BinaryIO, Any


from ._html_converter import HtmlConverter
from .._base_converter import DocumentConverter, DocumentConverterResult
from .._stream_info import StreamInfo
from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE
Expand Down Expand Up @@ -72,6 +73,84 @@ def convert(
)

assert isinstance(file_stream, io.IOBase) # for mypy
return DocumentConverterResult(
markdown=pdfminer.high_level.extract_text(file_stream),
)
try:
import base64
from PIL import Image

import pdfplumber
import markdownify

def extract_text_from_pdf(pdf_path):
    """Return the text of every page of a PDF as one Markdown string.

    Args:
        pdf_path: Anything accepted by ``pdfplumber.open``.

    Returns:
        The text of each page that yields any, passed through
        ``markdownify`` with ATX headings and followed by a blank line,
        concatenated in page order. Pages with no text contribute nothing,
        so the result is ``""`` when no page has text.
    """
    chunks = []
    with pdfplumber.open(pdf_path) as document:
        for current_page in document.pages:
            page_text = current_page.extract_text()
            if not page_text:
                # Nothing extractable on this page (None or empty string).
                continue
            converted = markdownify.markdownify(page_text, heading_style="ATX")
            chunks.append(converted + "\n\n")
    return "".join(chunks)

def extract_images_from_pdf(pdf_path):
    """Render each embedded image of a PDF as its own cropped picture.

    Args:
        pdf_path: Anything accepted by ``pdfplumber.open``.

    Returns:
        A list of ``(page_number, rendered_image)`` tuples in document
        order, where ``page_number`` is 1-based and ``rendered_image`` is
        the object returned by ``to_image(resolution=150)`` for the region
        of the page covered by that image.
    """
    image_list = []
    with pdfplumber.open(pdf_path) as pdf:
        for page_number, page in enumerate(pdf.pages, start=1):
            for img in page.images:
                bbox = (img["x0"], img["top"], img["x1"], img["bottom"])
                # A box with no area cannot be rendered; skip it rather than
                # let one degenerate entry abort the whole extraction.
                if bbox[2] <= bbox[0] or bbox[3] <= bbox[1]:
                    continue
                # Crop to the image's own bounding box so that each entry is
                # the picture itself rather than a render of the full page.
                # strict=False tolerates boxes that spill past the page edge.
                cropped_image = page.crop(bbox, strict=False).to_image(
                    resolution=150
                )
                image_list.append((page_number, cropped_image))
    return image_list

def describe_image_with_llm(image: Any, **kwargs) -> "str | None":
    """Ask an LLM for a caption of ``image``; return None when unavailable.

    Args:
        image: Object exposing ``save(fileobj, format="PNG")``. Nothing else
            is required of it, so it is annotated as ``Any``.
        **kwargs: Recognised keys are
            ``llm_client`` (object exposing ``chat.completions.create``),
            ``llm_model`` (model identifier passed as ``model=``) and
            ``llm_prompt`` (caption instruction; falls back to
            "Write a detailed caption for this image." when missing or empty).

    Returns:
        The content of the first choice of the completion, or ``None`` when
        no client/model is configured or the request fails for any reason.
    """
    llm_client = kwargs.get("llm_client")
    llm_model = kwargs.get("llm_model")
    if llm_client is None or llm_model is None:
        # Captioning is optional: without a configured client and model
        # there is nothing to call.
        return None

    prompt = kwargs.get("llm_prompt") or "Write a detailed caption for this image."

    try:
        buffered = io.BytesIO()
        image.save(buffered, format="PNG")
        img_b64 = base64.b64encode(buffered.getvalue()).decode()

        response = llm_client.chat.completions.create(
            model=llm_model,
            messages=[
                {
                    "role": "user",
                    "content": [
                        {"type": "text", "text": prompt},
                        {
                            "type": "image_url",
                            "image_url": {
                                "url": f"data:image/png;base64,{img_b64}"
                            },
                        },
                    ],
                }
            ],
        )
        return response.choices[0].message.content
    except Exception:
        # Deliberately best-effort: the client's exception types are not
        # known here, and a failed caption must not abort the conversion.
        return None

def convert_pdf_to_markdown_with_images(pdf_path, **kwargs):
    """Build Markdown from a PDF's text plus one entry per embedded image.

    Args:
        pdf_path: Passed unchanged to ``extract_text_from_pdf`` and
            ``extract_images_from_pdf``.
        **kwargs: Forwarded to ``describe_image_with_llm`` for each image.

    Returns:
        The extracted text followed, for every image, by a
        ``![Image<n> - Page <p>]`` marker (``n`` counts from 1) and, when a
        description was produced, a ``> <description>`` quote line.
    """
    parts = [extract_text_from_pdf(pdf_path)]
    images = extract_images_from_pdf(pdf_path)

    for image_number, (page_num, image) in enumerate(images, start=1):
        description = describe_image_with_llm(image, **kwargs)
        entry = f"\n\n![Image{image_number} - Page {page_num}]\n"
        if description:
            # Only quote a real caption; a missing one would otherwise be
            # rendered as the literal text "None".
            entry += f"\n> {description}\n"
        parts.append(entry)

    return "".join(parts)

return DocumentConverterResult(
markdown=convert_pdf_to_markdown_with_images(file_stream, **kwargs)
)

except Exception as e:
return DocumentConverterResult(
markdown=pdfminer.high_level.extract_text(file_stream),
)