diff --git a/packages/markitdown/src/markitdown/__main__.py b/packages/markitdown/src/markitdown/__main__.py
index ccb44b64b..3b4ffabff 100644
--- a/packages/markitdown/src/markitdown/__main__.py
+++ b/packages/markitdown/src/markitdown/__main__.py
@@ -3,7 +3,10 @@
# SPDX-License-Identifier: MIT
import argparse
import sys
+import os
import codecs
+import zipfile
+from datetime import datetime
from typing import Any, Dict
from textwrap import dedent
from importlib.metadata import entry_points
@@ -11,6 +14,27 @@
from ._markitdown import MarkItDown, StreamInfo, DocumentConverterResult
+def count_docx_images(filename: str) -> int:
+    """Quick pre-check: count the image files embedded in a DOCX archive.
+
+    A DOCX file is a ZIP archive; every member stored under ``word/media/``
+    is counted. Directory entries (names ending in ``/``) are skipped so an
+    explicit ``word/media/`` folder record is not mistaken for an image.
+
+    Args:
+        filename: Path of the DOCX file to inspect.
+
+    Returns:
+        The number of members under ``word/media/``, or ``0`` when the path
+        cannot be opened or is not a valid ZIP archive.
+    """
+    try:
+        with zipfile.ZipFile(filename) as archive:
+            return sum(
+                1
+                for name in archive.namelist()
+                if name.startswith("word/media/") and not name.endswith("/")
+            )
+    except (zipfile.BadZipFile, OSError):
+        # OSError covers FileNotFoundError as well as a directory or an
+        # unreadable path; this is only a pre-check, so never crash the CLI.
+        return 0
+
+
+def ask_extract_images(image_count: int) -> bool:
+    """Ask the user interactively whether images should be extracted.
+
+    Args:
+        image_count: Number of images reported in the prompt.
+
+    Returns:
+        ``True`` only when stdin is a terminal and the reply is ``y`` or
+        ``yes`` (case-insensitive, surrounding whitespace ignored). ``False``
+        when stdin is not a terminal, on any other reply, or when the prompt
+        is ended by EOF or Ctrl-C.
+    """
+    if sys.stdin.isatty():
+        print(f"\n📄 检测到文档中包含 {image_count} 张图片")
+        try:
+            reply = input(" 是否提取图片到本地文件?(y/n): ")
+        except (EOFError, KeyboardInterrupt):
+            return False
+        return reply.strip().lower() in ("y", "yes")
+
+    # Non-interactive stdin: never prompt.
+    return False
+
+
def main():
parser = argparse.ArgumentParser(
description="Convert various file formats to markdown.",
@@ -138,6 +162,24 @@ def main():
help="Keep data URIs (like base64-encoded images) in the output. By default, data URIs are truncated.",
)
+ parser.add_argument(
+ "--extract-images",
+ action="store_true",
+ help="Extract embedded images from DOCX/PDF to a local directory.",
+ )
+
+ parser.add_argument(
+ "--no-extract-images",
+ action="store_true",
+ help="Do not extract images (skip interactive prompt).",
+ )
+
+ parser.add_argument(
+ "--images-dir",
+ default="images",
+ help="Base directory name for extracted images (default: images). A timestamp suffix is added.",
+ )
+
parser.add_argument("filename", nargs="?")
args = parser.parse_args()
@@ -244,25 +286,62 @@ def main():
else:
markitdown = MarkItDown(enable_plugins=args.use_plugins)
+ # --- 图片提取逻辑 ---
+ extract_images = False
+ if args.extract_images:
+ extract_images = True
+ elif args.no_extract_images:
+ extract_images = False
+ elif args.filename and args.output and args.filename.lower().endswith(".docx"):
+ count = count_docx_images(args.filename)
+ if count > 0:
+ extract_images = ask_extract_images(count)
+
+ # 构建 kwargs
+ convert_kwargs: Dict[str, Any] = {
+ "keep_data_uris": args.keep_data_uris,
+ }
+
+ if extract_images and args.output:
+ images_dir_name = _timestamped_images_dir_name(args.images_dir or "images")
+ args._actual_images_dir = images_dir_name
+ abs_images_dir = os.path.join(
+ os.path.dirname(os.path.abspath(args.output)),
+ images_dir_name,
+ )
+ os.makedirs(abs_images_dir, exist_ok=True)
+ convert_kwargs["extract_images"] = True
+ convert_kwargs["images_dir"] = abs_images_dir
+ convert_kwargs["images_rel_dir"] = images_dir_name
+ # extract_images 优先于 keep_data_uris
+ convert_kwargs["keep_data_uris"] = False
+
+ # --- 转换 ---
if args.filename is None:
result = markitdown.convert_stream(
sys.stdin.buffer,
stream_info=stream_info,
- keep_data_uris=args.keep_data_uris,
+ **convert_kwargs,
)
else:
result = markitdown.convert(
- args.filename, stream_info=stream_info, keep_data_uris=args.keep_data_uris
+ args.filename,
+ stream_info=stream_info,
+ **convert_kwargs,
)
- _handle_output(args, result)
+ _handle_output(args, result, extract_images=extract_images)
-def _handle_output(args, result: DocumentConverterResult):
+def _handle_output(args, result: DocumentConverterResult, extract_images: bool = False):
"""Handle output to stdout or file"""
if args.output:
with open(args.output, "w", encoding="utf-8") as f:
f.write(result.markdown)
+ if extract_images:
+ images_dir = getattr(args, "_actual_images_dir", args.images_dir or "images")
+ print(f"[OK] Generated {args.output}")
+ print(f"[OK] Images extracted to ./{images_dir}/")
else:
# Handle stdout encoding errors more gracefully
print(
@@ -277,5 +356,10 @@ def _exit_with_error(message: str):
sys.exit(1)
+def _timestamped_images_dir_name(base_name: str) -> str:
+    """Return ``base_name`` with a ``_YYYYMMDD_HHMMSS`` suffix appended.
+
+    Trailing path separators are removed first so that ``"assets/"`` becomes
+    ``"assets_<timestamp>"`` rather than a ``_<timestamp>`` directory nested
+    inside ``assets``. Both ``os.sep`` and, where the platform defines one,
+    ``os.altsep`` are stripped.
+
+    Args:
+        base_name: Directory name requested by the user.
+
+    Returns:
+        The directory name with the local-time timestamp suffix.
+    """
+    separators = os.sep + (os.altsep or "")
+    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+    return f"{base_name.rstrip(separators)}_{timestamp}"
+
+
if __name__ == "__main__":
main()
diff --git a/packages/markitdown/src/markitdown/converters/_docx_converter.py b/packages/markitdown/src/markitdown/converters/_docx_converter.py
index 3975107b1..39cb1c346 100644
--- a/packages/markitdown/src/markitdown/converters/_docx_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_docx_converter.py
@@ -1,5 +1,8 @@
import sys
import io
+import os
+import re
+import zipfile
from warnings import warn
from typing import BinaryIO, Any
@@ -75,9 +78,106 @@ def convert(
_dependency_exc_info[2]
)
+ extract_images = kwargs.get("extract_images", False)
+ media_files: list[str] = []
+
+ # ① 提取阶段:从 ZIP 获取原图(按文档中出现顺序)
+ if extract_images:
+ file_stream.seek(0)
+ zip_bytes = io.BytesIO(file_stream.read())
+
+ with zipfile.ZipFile(zip_bytes) as z:
+ media_files = self._get_media_in_doc_order(z)
+
+ if media_files:
+ images_dir = kwargs["images_dir"]
+ os.makedirs(images_dir, exist_ok=True)
+
+ for i, media_file in enumerate(media_files, 1):
+ ext = os.path.splitext(media_file)[1]
+ if ext.lower() == ".jpeg":
+ ext = ".jpg"
+ if not ext:
+ data = z.read(media_file)
+ ext = self._detect_ext(data)
+ filename = f"image_{i}{ext}"
+ with open(os.path.join(images_dir, filename), "wb") as f:
+ f.write(z.read(media_file))
+
+ file_stream.seek(0)
+
+ # ② mammoth 转 HTML
style_map = kwargs.get("style_map", None)
pre_process_stream = pre_process_docx(file_stream)
- return self._html_converter.convert_string(
- mammoth.convert_to_html(pre_process_stream, style_map=style_map).value,
- **kwargs,
- )
+ html_value = mammoth.convert_to_html(
+ pre_process_stream, style_map=style_map
+ ).value
+
+ # ③ 替换 base64 → 相对路径
+ if extract_images and media_files:
+ images_rel = kwargs.get("images_rel_dir", "images")
+ for i, media_file in enumerate(media_files, 1):
+ ext = os.path.splitext(media_file)[1]
+ if ext.lower() == ".jpeg":
+ ext = ".jpg"
+ filename = f"image_{i}{ext}"
+ # 替换 mammoth 生成的 data: URI 为文件路径
+ html_value = re.sub(
+ r'
]*)src="data:image/[^"]+"',
+ f'
list[str]:
+ """从 DOCX 的 document.xml.rels 和 document.xml 解析图片在文档中的出现顺序"""
+ from xml.etree.ElementTree import fromstring
+
+ try:
+ # 1. rels: rId -> media 路径
+ rels_xml = z.read("word/_rels/document.xml.rels")
+ rels_root = fromstring(rels_xml)
+ rid_to_media: dict[str, str] = {}
+ for rel in rels_root:
+ target = rel.get("Target", "")
+ if target.startswith("media/"):
+ rid_to_media[rel.get("Id", "")] = target
+
+ # 2. document.xml: 按出现顺序收集 rId
+ doc_xml = z.read("word/document.xml")
+ doc_root = fromstring(doc_xml)
+ ns_a = "{http://schemas.openxmlformats.org/drawingml/2006/main}"
+ ns_r = "{http://schemas.openxmlformats.org/officeDocument/2006/relationships}"
+
+ ordered_media: list[str] = []
+ for blip in doc_root.iter(f"{ns_a}blip"):
+ rid = blip.get(f"{ns_r}embed")
+ if rid and rid in rid_to_media:
+ ordered_media.append(f"word/{rid_to_media[rid]}")
+
+ return ordered_media
+ except Exception:
+ # fallback: 按文件名数字自然排序
+ raw = [f for f in z.namelist() if f.startswith("word/media/") and not f.endswith("/")]
+ return sorted(raw, key=lambda p: int("".join(c for c in os.path.basename(p) if c.isdigit()) or "0"))
+
+    @staticmethod
+    def _detect_ext(data: bytes) -> str:
+        """Guess an image file extension from the leading magic bytes.
+
+        Recognises PNG, JPEG, GIF, WebP (RIFF container), BMP and ICO
+        signatures; anything else falls back to ``".png"``.
+        """
+        simple_signatures = (
+            (b"\x89PNG\r\n\x1a\n", ".png"),
+            (b"\xff\xd8", ".jpg"),
+            (b"GIF8", ".gif"),
+            (b"BM", ".bmp"),
+            (b"\x00\x00\x01\x00", ".ico"),
+        )
+        for magic, suffix in simple_signatures:
+            if data.startswith(magic):
+                return suffix
+
+        # WebP: "RIFF" at offset 0 and "WEBP" at offset 8.
+        is_riff = data.startswith(b"RIFF")
+        if is_riff and len(data) > 12 and data[8:12] == b"WEBP":
+            return ".webp"
+
+        return ".png"  # default when no signature matches
diff --git a/packages/markitdown/src/markitdown/converters/_pdf_converter.py b/packages/markitdown/src/markitdown/converters/_pdf_converter.py
index ffbcbd990..6ffa610f1 100644
--- a/packages/markitdown/src/markitdown/converters/_pdf_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_pdf_converter.py
@@ -1,5 +1,6 @@
import sys
import io
+import os
import re
from typing import BinaryIO, Any
@@ -492,6 +493,169 @@ def _extract_tables_from_words(page: Any) -> list[list[list[str]]]:
return [table_rows]
+def _detect_image_ext(data: bytes) -> str | None:
+    """Map the leading byte signature of ``data`` to an image extension.
+
+    Returns ``".png"``, ``".jpg"``, ``".gif"``, ``".webp"`` or ``".bmp"``
+    for the matching signature, and ``None`` when none of them matches.
+    """
+    ext: str | None = None
+    if data.startswith(b"\x89PNG\r\n\x1a\n"):
+        ext = ".png"
+    elif data.startswith(b"\xff\xd8"):
+        ext = ".jpg"
+    elif data.startswith(b"GIF8"):
+        ext = ".gif"
+    elif data.startswith(b"RIFF") and len(data) > 12 and data[8:12] == b"WEBP":
+        # RIFF container whose form type (bytes 8-11) is WEBP.
+        ext = ".webp"
+    elif data.startswith(b"BM"):
+        ext = ".bmp"
+    return ext
+
+
+def _write_pdf_image(
+    page: Any,
+    image: dict,
+    images_dir: str,
+    images_rel_dir: str,
+    image_index: int,
+) -> dict[str, Any] | None:
+    """Save one PDF image to disk and return its Markdown placeholder.
+
+    Two strategies are tried in order:
+
+    1. Use the bytes returned by ``image["stream"].get_data()``. They are
+       written unchanged when ``_detect_image_ext`` recognises their
+       signature; otherwise they are re-encoded to PNG with Pillow, first
+       by letting Pillow parse them and, failing that, by treating them as
+       raw grayscale/RGB samples of the declared size.
+    2. If that produced nothing, render the image's bounding box from
+       ``page`` at 150 DPI and save the rendering as PNG.
+
+    Args:
+        page: Page object providing ``within_bbox(...).to_image(...)``
+            (presumably a pdfplumber page -- confirm against the caller).
+        image: Image record. Keys read here: ``stream``, ``srcsize``,
+            ``width``, ``height``, ``colorspace``, ``x0``, ``x1``, ``top``
+            and ``bottom``.
+        images_dir: Directory the file is written to (created if missing).
+        images_rel_dir: Directory prefix used inside the Markdown link.
+        image_index: Number used in the ``image_<n><ext>`` file name.
+
+    Returns:
+        ``{"top": <image top>, "markdown": <Markdown image link>}``, or
+        ``None`` when the bounding box is empty or rendering fails.
+    """
+    image_bytes = b""
+    ext = ".png"
+    stream = image.get("stream")
+
+    if stream is not None and hasattr(stream, "get_data"):
+        # Deliberately best-effort: any failure while decoding the embedded
+        # stream (including Pillow being unavailable) falls through to the
+        # page-rendering fallback below.
+        try:
+            raw_bytes = stream.get_data()
+            raw_ext = _detect_image_ext(raw_bytes)
+            if raw_ext is not None:
+                image_bytes = raw_bytes
+                ext = raw_ext
+            else:
+                from PIL import Image  # type: ignore[import-not-found]
+
+                try:
+                    pil_image = Image.open(io.BytesIO(raw_bytes))
+                except Exception:
+                    # Not a format Pillow can parse: interpret the bytes as
+                    # raw pixel samples of the declared dimensions.
+                    width, height = image.get("srcsize") or (
+                        image.get("width"),
+                        image.get("height"),
+                    )
+                    colorspace = str(image.get("colorspace", "")).lower()
+                    mode = "L" if "gray" in colorspace else "RGB"
+                    pil_image = Image.frombytes(
+                        mode,
+                        (int(width), int(height)),
+                        raw_bytes,
+                    )
+
+                with pil_image:
+                    if pil_image.mode not in ("RGB", "L", "RGBA"):
+                        pil_image = pil_image.convert("RGB")
+                    out = io.BytesIO()
+                    pil_image.save(out, format="PNG")
+                image_bytes = out.getvalue()
+                ext = ".png"
+        except Exception:
+            image_bytes = b""
+            ext = ".png"
+
+    if not image_bytes:
+        try:
+            x0 = image.get("x0", 0)
+            x1 = image.get("x1", 0)
+            top = image.get("top", 0)
+            bottom = image.get("bottom", 0)
+            if x1 <= x0 or bottom <= top:
+                return None
+
+            cropped_page = page.within_bbox((x0, top, x1, bottom))
+            page_image = cropped_page.to_image(resolution=150)
+            out = io.BytesIO()
+            page_image.original.save(out, format="PNG")
+            image_bytes = out.getvalue()
+            ext = ".png"
+        except Exception:
+            return None
+
+    filename = f"image_{image_index}{ext}"
+    os.makedirs(images_dir, exist_ok=True)
+    with open(os.path.join(images_dir, filename), "wb") as image_file:
+        image_file.write(image_bytes)
+
+    # Markdown links always use forward slashes, independent of os.sep.
+    rel_prefix = images_rel_dir.rstrip("/")
+    rel_path = f"{rel_prefix}/{filename}" if rel_prefix else filename
+
+    return {
+        "top": image.get("top", 0),
+        # Must be a non-blank image reference: the caller discards items
+        # whose markdown is empty, which would silently drop the image.
+        "markdown": f"![{filename}]({rel_path})",
+    }
+
+
+def _extract_text_lines_with_positions(page: Any) -> list[dict[str, Any]]:
+    """Return the page's text as lines tagged with a vertical position.
+
+    Words from ``page.extract_words`` are bucketed by their ``top`` value
+    rounded to the nearest multiple of 5, sorted left-to-right by ``x0``
+    within each bucket, and joined with single spaces. When no words are
+    returned, ``page.extract_text()`` is split into lines instead and each
+    non-blank line uses its line index (as a float) as ``top``.
+
+    Args:
+        page: Object exposing ``extract_words`` and ``extract_text``
+            (presumably a pdfplumber page -- confirm against the caller).
+
+    Returns:
+        A list of ``{"top": ..., "text": ...}`` dicts ordered top to bottom;
+        empty when the page has no text.
+    """
+    words = page.extract_words(keep_blank_chars=True, x_tolerance=3, y_tolerance=3)
+
+    if words:
+        bucket_size = 5
+        buckets: dict[float, list[dict]] = {}
+        for entry in words:
+            bucket = round(entry["top"] / bucket_size) * bucket_size
+            if bucket not in buckets:
+                buckets[bucket] = []
+            buckets[bucket].append(entry)
+
+        grouped_lines: list[dict[str, Any]] = []
+        for bucket in sorted(buckets):
+            left_to_right = sorted(buckets[bucket], key=lambda w: w["x0"])
+            joined = " ".join(w["text"] for w in left_to_right).strip()
+            if joined:
+                grouped_lines.append({"top": bucket, "text": joined})
+        return grouped_lines
+
+    # No word boxes: fall back to plain text, using the line index as "top".
+    raw_text = page.extract_text()
+    if not (raw_text and raw_text.strip()):
+        return []
+
+    fallback_lines: list[dict[str, Any]] = []
+    for number, raw_line in enumerate(raw_text.splitlines()):
+        stripped = raw_line.strip()
+        if stripped:
+            fallback_lines.append({"top": float(number), "text": stripped})
+    return fallback_lines
+
+
+def _extract_pdf_with_images(
+    pdf_bytes: io.BytesIO,
+    images_dir: str,
+    images_rel_dir: str,
+) -> str:
+    """Convert a PDF to Markdown with image placeholders placed by position.
+
+    For every page, the text lines and the images written by
+    ``_write_pdf_image`` are merged, ordered by their ``top`` value, and
+    joined with blank lines. Images are numbered consecutively across the
+    whole document; the counter advances only when an image was written.
+
+    Args:
+        pdf_bytes: In-memory PDF passed to ``pdfplumber.open``.
+        images_dir: Directory handed to ``_write_pdf_image`` for the files.
+        images_rel_dir: Link prefix handed to ``_write_pdf_image``.
+
+    Returns:
+        The Markdown of all non-empty pages joined by blank lines, stripped.
+    """
+    page_chunks: list[str] = []
+    next_image_number = 1
+
+    with pdfplumber.open(pdf_bytes) as pdf:
+        for page in pdf.pages:
+            entries: list[dict[str, Any]] = []
+            for line in _extract_text_lines_with_positions(page):
+                entries.append({"top": line["top"], "markdown": line["text"]})
+
+            page_images = getattr(page, "images", []) or []
+            for image in page_images:
+                written = _write_pdf_image(
+                    page,
+                    image,
+                    images_dir,
+                    images_rel_dir,
+                    next_image_number,
+                )
+                if written is None:
+                    continue
+                entries.append(written)
+                next_image_number += 1
+
+            # list.sort is stable, so entries sharing a "top" keep the order
+            # in which they were collected (text first, then images).
+            entries.sort(key=lambda entry: entry["top"])
+            visible = [
+                entry["markdown"] for entry in entries if entry["markdown"].strip()
+            ]
+            page_text = "\n\n".join(visible)
+            if page_text.strip():
+                page_chunks.append(page_text.strip())
+
+            page.close()
+
+    combined = "\n\n".join(page_chunks)
+    return combined.strip()
+
+
class PdfConverter(DocumentConverter):
"""
Converts PDFs to Markdown.
@@ -539,6 +703,22 @@ def convert(
# Read file stream into BytesIO for compatibility with pdfplumber
pdf_bytes = io.BytesIO(file_stream.read())
+ if kwargs.get("extract_images", False):
+ images_dir = kwargs["images_dir"]
+ images_rel_dir = kwargs.get("images_rel_dir", "images")
+ try:
+ markdown = _extract_pdf_with_images(
+ pdf_bytes,
+ images_dir=images_dir,
+ images_rel_dir=images_rel_dir,
+ )
+ except Exception:
+ pdf_bytes.seek(0)
+ markdown = pdfminer.high_level.extract_text(pdf_bytes)
+
+ markdown = _merge_partial_numbering_lines(markdown)
+ return DocumentConverterResult(markdown=markdown)
+
try:
# Single pass: check every page for form-style content.
# Pages with tables/forms get rich extraction; plain-text
diff --git a/packages/markitdown/tests/test_files/pdf_image_middle.pdf b/packages/markitdown/tests/test_files/pdf_image_middle.pdf
new file mode 100644
index 000000000..d90bc9d3e
Binary files /dev/null and b/packages/markitdown/tests/test_files/pdf_image_middle.pdf differ
diff --git a/packages/markitdown/tests/test_module_misc.py b/packages/markitdown/tests/test_module_misc.py
index 4d62e4919..fff9ef614 100644
--- a/packages/markitdown/tests/test_module_misc.py
+++ b/packages/markitdown/tests/test_module_misc.py
@@ -3,7 +3,9 @@
import os
import re
import shutil
+import subprocess
import pytest
+import sys
from unittest.mock import MagicMock
from markitdown._uri_utils import parse_data_uri, file_uri_to_path
@@ -107,6 +109,16 @@ def validate_strings(result, expected_strings, exclude_strings=None):
assert string not in text_content
+def _has_pdf_dependencies() -> bool:
+    """Report whether the optional PDF packages can be imported.
+
+    Returns:
+        ``True`` when both ``pdfminer`` and ``pdfplumber`` import cleanly,
+        ``False`` otherwise.
+    """
+    try:
+        import pdfminer  # noqa: F401
+        import pdfplumber  # noqa: F401
+    except ImportError:
+        # ImportError also covers ModuleNotFoundError, so an installed but
+        # broken optional dependency skips the tests instead of erroring
+        # during collection.
+        return False
+    return True
+
+
def test_stream_info_operations() -> None:
"""Test operations performed on StreamInfo objects."""
@@ -220,6 +232,71 @@ def test_data_uris() -> None:
assert data == b"Hello, World!"
+@pytest.mark.skipif(
+ not _has_pdf_dependencies(),
+ reason="PDF optional dependencies not installed",
+)
+def test_pdf_extract_images_to_markdown(tmp_path) -> None:
+ pdf_path = os.path.join(TEST_FILES_DIR, "pdf_image_middle.pdf")
+ images_dir = tmp_path / "images"
+
+ result = MarkItDown().convert(
+ pdf_path,
+ extract_images=True,
+ images_dir=str(images_dir),
+ images_rel_dir="images",
+ )
+
+ markdown = result.markdown
+ assert "Here is some introductory text." in markdown
+ assert "
+ < markdown.index("
+ < markdown.index("Section 2: Details")
+ )
+
+ image_files = list(images_dir.glob("image_1.*"))
+ assert len(image_files) == 1
+ assert image_files[0].stat().st_size > 0
+
+
+@pytest.mark.skipif(
+ not _has_pdf_dependencies(),
+ reason="PDF optional dependencies not installed",
+)
+def test_cli_pdf_extract_images_uses_timestamped_dir(tmp_path) -> None:
+ pdf_path = os.path.join(TEST_FILES_DIR, "pdf_image_middle.pdf")
+ output_path = tmp_path / "out.md"
+
+ result = subprocess.run(
+ [
+ sys.executable,
+ "-m",
+ "markitdown",
+ pdf_path,
+ "-o",
+ str(output_path),
+ "--extract-images",
+ "--images-dir",
+ "assets",
+ ],
+ capture_output=True,
+ text=True,
+ )
+
+ assert result.returncode == 0, result.stderr
+ markdown = output_path.read_text(encoding="utf-8")
+ image_dirs = list(tmp_path.glob("assets_*"))
+ assert len(image_dirs) == 1
+ assert image_dirs[0].is_dir()
+ assert f")
+ assert len(image_files) == 1
+ assert image_files[0].stat().st_size > 0
+
+
def test_file_uris() -> None:
# Test file URI with an empty host
file_uri = "file:///path/to/file.txt"