&;MWOa=mfc;>IZPV5~1HEQYzK6+rUt98KC-lAK3clpAZ-Vezb>fzwWFIAmM??k!F
z&2WwutlgGI4y3&AwRd{rAHqc1&o+uOv)PG4ZJd0iig=qcjozH~Ywh}seUt|3lCq|L
ziOrS8!|Ij)2chE+v2*=GGwm<*&z}r|;|)A>&HTBJOrOs0Pv-gOIt)!0`)}&%SQ;_@
zx(;ulf7h2l@%G=!rC}-ESbyD@Xx9I}Z?l!=%YMowvElhn{3EkA9c8`_oo3r2>ZsMU
z+bxX148yb?48JyT(=iFOZKJ(^E^+!Q{F3Z*|8cUfTFuyA8dZ`E_}}II%3oi|^mz?4
p-;qW9N`qRMWWFO+dM|nZ;-o(Rv7?PR`LPH@t3`Yj3gb2R^| RQ3P>
literal 0
HcmV?d00001
diff --git a/packages/markitdown/tests/test_module_misc.py b/packages/markitdown/tests/test_module_misc.py
index 4d62e4919..fff9ef614 100644
--- a/packages/markitdown/tests/test_module_misc.py
+++ b/packages/markitdown/tests/test_module_misc.py
@@ -3,7 +3,9 @@
import os
import re
import shutil
+import subprocess
import pytest
+import sys
from unittest.mock import MagicMock
from markitdown._uri_utils import parse_data_uri, file_uri_to_path
@@ -107,6 +109,16 @@ def validate_strings(result, expected_strings, exclude_strings=None):
assert string not in text_content
+def _has_pdf_dependencies() -> bool:
+ try:
+ import pdfminer # noqa: F401
+ import pdfplumber # noqa: F401
+
+ return True
+ except ModuleNotFoundError:
+ return False
+
+
def test_stream_info_operations() -> None:
"""Test operations performed on StreamInfo objects."""
@@ -220,6 +232,71 @@ def test_data_uris() -> None:
assert data == b"Hello, World!"
+@pytest.mark.skipif(
+ not _has_pdf_dependencies(),
+ reason="PDF optional dependencies not installed",
+)
+def test_pdf_extract_images_to_markdown(tmp_path) -> None:
+ pdf_path = os.path.join(TEST_FILES_DIR, "pdf_image_middle.pdf")
+ images_dir = tmp_path / "images"
+
+ result = MarkItDown().convert(
+ pdf_path,
+ extract_images=True,
+ images_dir=str(images_dir),
+ images_rel_dir="images",
+ )
+
+ markdown = result.markdown
+ assert "Here is some introductory text." in markdown
+ assert "
+ < markdown.index("
+ < markdown.index("Section 2: Details")
+ )
+
+ image_files = list(images_dir.glob("image_1.*"))
+ assert len(image_files) == 1
+ assert image_files[0].stat().st_size > 0
+
+
+@pytest.mark.skipif(
+ not _has_pdf_dependencies(),
+ reason="PDF optional dependencies not installed",
+)
+def test_cli_pdf_extract_images_uses_timestamped_dir(tmp_path) -> None:
+ pdf_path = os.path.join(TEST_FILES_DIR, "pdf_image_middle.pdf")
+ output_path = tmp_path / "out.md"
+
+ result = subprocess.run(
+ [
+ sys.executable,
+ "-m",
+ "markitdown",
+ pdf_path,
+ "-o",
+ str(output_path),
+ "--extract-images",
+ "--images-dir",
+ "assets",
+ ],
+ capture_output=True,
+ text=True,
+ )
+
+ assert result.returncode == 0, result.stderr
+ markdown = output_path.read_text(encoding="utf-8")
+ image_dirs = list(tmp_path.glob("assets_*"))
+ assert len(image_dirs) == 1
+ assert image_dirs[0].is_dir()
+ assert f")
+ assert len(image_files) == 1
+ assert image_files[0].stat().st_size > 0
+
+
def test_file_uris() -> None:
# Test file URI with an empty host
file_uri = "file:///path/to/file.txt"
|