Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions packages/markitdown/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,12 @@ From PyPI:
pip install markitdown[all]
```

To enable the optional PyMuPDF fallback, which can recover text that the primary PDF parsers truncate on some PDFs, install the `pymupdf` extra:

```bash
pip install markitdown[pymupdf]
```

From source:

```bash
Expand Down
2 changes: 2 additions & 0 deletions packages/markitdown/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@ all = [
"lxml",
"pdfminer.six>=20251230",
"pdfplumber>=0.11.9",
"pymupdf",
"olefile",
"pydub",
"SpeechRecognition",
Expand All @@ -55,6 +56,7 @@ docx = ["mammoth~=1.11.0", "lxml"]
xlsx = ["pandas", "openpyxl"]
xls = ["pandas", "xlrd"]
pdf = ["pdfminer.six>=20251230", "pdfplumber>=0.11.9"]
pymupdf = ["pymupdf"]
outlook = ["olefile"]
audio-transcription = ["pydub", "SpeechRecognition"]
youtube-transcription = ["youtube-transcript-api"]
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,12 @@ def convert(
reader = csv.reader(io.StringIO(content))
rows = list(reader)

if not rows:
return DocumentConverterResult(markdown="")

while rows and not any(cell.strip() for cell in rows[0]):
rows.pop(0)

if not rows:
return DocumentConverterResult(markdown="")

Expand Down
47 changes: 46 additions & 1 deletion packages/markitdown/src/markitdown/converters/_pdf_converter.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,23 @@ def _merge_partial_numbering_lines(text: str) -> str:
return "\n".join(result_lines)


def _extract_with_pymupdf(pdf_bytes: io.BytesIO) -> str | None:
    """Extract plain text from a PDF with PyMuPDF, if it is installed.

    Intended as a fallback for when the primary PDF parsers look truncated.

    Args:
        pdf_bytes: In-memory PDF document. When PyMuPDF is available the
            buffer is rewound and read in full, so its position is left at
            end-of-stream on return.

    Returns:
        The stripped text of every non-blank page, joined by blank lines, or
        ``None`` when PyMuPDF is unavailable or no page yields any text.
    """
    # ``fitz`` is bound at module level and is None when the optional
    # PyMuPDF dependency could not be imported.
    if fitz is None:
        return None

    pdf_bytes.seek(0)
    with fitz.open(stream=pdf_bytes.read(), filetype="pdf") as doc:
        # Strip each page once; pages with no text collapse to "".
        page_texts = [(page.get_text("text") or "").strip() for page in doc]

    markdown = "\n\n".join(text for text in page_texts if text)
    return markdown or None


# Load dependencies
_dependency_exc_info = None
try:
Expand All @@ -66,6 +83,11 @@ def _merge_partial_numbering_lines(text: str) -> str:
except ImportError:
_dependency_exc_info = sys.exc_info()

# PyMuPDF is an optional dependency. Prefer its canonical ``pymupdf`` module
# name: ``fitz`` is only the legacy alias, and an unrelated PyPI distribution
# also installs a module called ``fitz``. The alias is still tried so older
# PyMuPDF releases keep working. ``fitz`` is None when neither import succeeds.
try:
    import pymupdf as fitz
except ImportError:
    try:
        import fitz
    except ImportError:
        fitz = None


ACCEPTED_MIME_TYPE_PREFIXES = [
"application/pdf",
Expand Down Expand Up @@ -536,6 +558,9 @@ def convert(

assert isinstance(file_stream, io.IOBase)

markdown_chunks: list[str] = []
has_images = False

# Read file stream into BytesIO for compatibility with pdfplumber
pdf_bytes = io.BytesIO(file_stream.read())

Expand All @@ -545,12 +570,14 @@ def convert(
# pages are collected separately. page.close() is called
# after each page to free pdfplumber's cached objects and
# keep memory usage constant regardless of page count.
markdown_chunks: list[str] = []
form_page_count = 0
plain_page_indices: list[int] = []

with pdfplumber.open(pdf_bytes) as pdf:
for page_idx, page in enumerate(pdf.pages):
has_images = has_images or bool(getattr(page, "images", None))

# Try form-style word position extraction
page_content = _extract_form_content_from_words(page)

if page_content is not None:
Expand Down Expand Up @@ -583,6 +610,24 @@ def convert(
pdf_bytes.seek(0)
markdown = pdfminer.high_level.extract_text(pdf_bytes)

# Recover from inline-image truncation cases where the primary parsers
# return a much shorter body than an optional PyMuPDF pass.
if fitz is not None and has_images and markdown and len(markdown) < 2048:
try:
pymupdf_markdown = _extract_with_pymupdf(pdf_bytes)
except Exception:
pymupdf_markdown = None
else:
if pymupdf_markdown is not None:
primary_length = len(markdown.strip())
pymupdf_length = len(pymupdf_markdown.strip())
if pymupdf_length > primary_length and (
primary_length == 0
or pymupdf_length >= primary_length * 1.5
or pymupdf_length - primary_length >= 200
):
Comment thread
Muhtasim-Munif-Fahim marked this conversation as resolved.
markdown = pymupdf_markdown

# Post-process to merge MasterFormat-style partial numbering with following text
markdown = _merge_partial_numbering_lines(markdown)

Expand Down
81 changes: 81 additions & 0 deletions packages/markitdown/tests/test_module_misc.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,9 @@
import pytest
from unittest.mock import MagicMock

from markitdown.converters._csv_converter import CsvConverter
import markitdown.converters._pdf_converter as pdf_converter_module

from markitdown._uri_utils import parse_data_uri, file_uri_to_path

from markitdown import (
Expand Down Expand Up @@ -432,6 +435,84 @@ def test_exceptions() -> None:
assert type(exc_info.value.attempts[0].converter).__name__ == "PptxConverter"


def test_pdf_converter_prefers_pymupdf_when_primary_extraction_is_truncated(
    monkeypatch: pytest.MonkeyPatch,
) -> None:
    """The PyMuPDF result replaces a shorter primary extraction.

    pdfplumber and pdfminer are stubbed to return only the text before an
    image, while the stubbed PyMuPDF document also returns the text after it.
    The converter output must contain both parts.

    The module-level ``fitz`` name is replaced with a stub rather than
    patching ``fitz.open``, so the test also runs when the optional PyMuPDF
    dependency is not installed (``fitz`` is ``None`` in that case).
    """

    class _FakePage:
        # A non-empty ``images`` list marks the page as containing images.
        images = [object()]

        def extract_words(self, keep_blank_chars=True, x_tolerance=3, y_tolerance=3):
            return []

        def extract_text(self):
            return "BEFORE_IMAGE: this text should be extracted"

    class _FakePdf:
        pages = [_FakePage()]

        def __enter__(self):
            return self

        def __exit__(self, exc_type, exc, tb):
            return False

    class _FakePyMuPdfPage:
        def get_text(self, kind):
            assert kind == "text"
            return (
                "BEFORE_IMAGE: this text should be extracted\n"
                "AFTER_IMAGE: this text should also be extracted"
            )

    class _FakePyMuPdfDoc:
        def __enter__(self):
            return self

        def __exit__(self, exc_type, exc, tb):
            return False

        def __iter__(self):
            return iter([_FakePyMuPdfPage()])

    class _FakeFitz:
        """Stand-in for the PyMuPDF module; only ``open`` is needed."""

        @staticmethod
        def open(*args, **kwargs):
            return _FakePyMuPdfDoc()

    monkeypatch.setattr(pdf_converter_module.pdfplumber, "open", lambda _: _FakePdf())
    monkeypatch.setattr(
        pdf_converter_module.pdfminer.high_level,
        "extract_text",
        lambda _: "BEFORE_IMAGE: this text should be extracted",
    )
    # Swap the whole module attribute: it always exists (module or None), so
    # this works regardless of whether PyMuPDF is installed.
    monkeypatch.setattr(pdf_converter_module, "fitz", _FakeFitz)

    converter = pdf_converter_module.PdfConverter()
    result = converter.convert(
        io.BytesIO(b"%PDF-1.4 fake"),
        StreamInfo(mimetype="application/pdf", extension=".pdf"),
    )

    assert "BEFORE_IMAGE: this text should be extracted" in result.markdown
    assert "AFTER_IMAGE: this text should also be extracted" in result.markdown


def test_csv_converter_skips_leading_blank_rows() -> None:
    """A blank first line must not be taken as the CSV header row."""
    # Arrange: the payload starts with an empty line before the real header.
    payload = io.BytesIO(b"\nname,age\nbob,3\nalice,7\n")
    stream_info = StreamInfo(mimetype="text/csv", extension=".csv", charset="utf-8")
    expected_lines = [
        "| name | age |",
        "| --- | --- |",
        "| bob | 3 |",
        "| alice | 7 |",
    ]

    # Act
    outcome = CsvConverter().convert(payload, stream_info)

    # Assert: the first non-blank row became the table header.
    assert outcome.markdown == "\n".join(expected_lines)


@pytest.mark.skipif(
skip_exiftool,
reason="do not run if exiftool is not installed",
Expand Down