microsoft · Craftr-X · Jun 11, 2026 · Jun 12, 2026 · Jun 12, 2026
diff --git a/packages/markitdown/src/markitdown/__main__.py b/packages/markitdown/src/markitdown/__main__.py
@@ -3,14 +3,38 @@
 # SPDX-License-Identifier: MIT
 import argparse
 import sys
+import os
 import codecs
+import zipfile
+from datetime import datetime
 from typing import Any, Dict
 from textwrap import dedent
 from importlib.metadata import entry_points
 from .__about__ import __version__
 from ._markitdown import MarkItDown, StreamInfo, DocumentConverterResult
 
 
+def count_docx_images(filename: str) -> int:
+    """Quick pre-check: count the image files embedded in a DOCX (0 if it cannot be read)."""
+    try:
+        with zipfile.ZipFile(filename) as z:
+            return sum(n.startswith("word/media/") and not n.endswith("/") for n in z.namelist())
+    except (zipfile.BadZipFile, OSError):  # not a ZIP, missing, unreadable, or a directory
+        return 0
+
+
+def ask_extract_images(image_count: int) -> bool:
+    """Interactively ask whether the embedded images should be extracted."""
+    if sys.stdin.isatty():
+        print(f"\n📄 检测到文档中包含 {image_count} 张图片")
+        try:
+            reply = input("   是否提取图片到本地文件？(y/n): ")
+            return reply.strip().lower() in ("y", "yes")
+        except (EOFError, KeyboardInterrupt):
+            pass
+    return False  # non-interactive stdin, or the prompt was aborted
+
+
 def main():
     parser = argparse.ArgumentParser(
         description="Convert various file formats to markdown.",
@@ -138,6 +162,24 @@ def main():
         help="Keep data URIs (like base64-encoded images) in the output. By default, data URIs are truncated.",
     )
 
+    parser.add_argument(
+        "--extract-images",
+        action="store_true",
+        help="Extract embedded images from DOCX/PDF to a local directory.",
+    )
+
+    parser.add_argument(
+        "--no-extract-images",
+        action="store_true",
+        help="Do not extract images (skip interactive prompt).",
+    )
+
+    parser.add_argument(
+        "--images-dir",
+        default="images",
+        help="Base directory name for extracted images (default: images). A timestamp suffix is added.",
+    )
+
     parser.add_argument("filename", nargs="?")
     args = parser.parse_args()
 
@@ -244,25 +286,62 @@ def main():
     else:
         markitdown = MarkItDown(enable_plugins=args.use_plugins)
 
+    # --- Image extraction: explicit flags win, else prompt for a .docx input when writing to a file ---
+    extract_images = False
+    if args.extract_images:
+        extract_images = True
+    elif args.no_extract_images:
+        extract_images = False
+    elif args.filename and args.output and args.filename.lower().endswith(".docx"):
+        count = count_docx_images(args.filename)
+        if count > 0:
+            extract_images = ask_extract_images(count)
+
+    # Build converter kwargs. NOTE(review): the extraction set-up below only runs when --output is given.
+    convert_kwargs: Dict[str, Any] = {
+        "keep_data_uris": args.keep_data_uris,
+    }
+
+    if extract_images and args.output:
+        images_dir_name = _timestamped_images_dir_name(args.images_dir or "images")
+        args._actual_images_dir = images_dir_name
+        abs_images_dir = os.path.join(
+            os.path.dirname(os.path.abspath(args.output)),
+            images_dir_name,
+        )
+        os.makedirs(abs_images_dir, exist_ok=True)
+        convert_kwargs["extract_images"] = True
+        convert_kwargs["images_dir"] = abs_images_dir
+        convert_kwargs["images_rel_dir"] = images_dir_name
+        # extract_images takes precedence over keep_data_uris
+        convert_kwargs["keep_data_uris"] = False
+
+    # --- Conversion ---
     if args.filename is None:
         result = markitdown.convert_stream(
             sys.stdin.buffer,
             stream_info=stream_info,
-            keep_data_uris=args.keep_data_uris,
+            **convert_kwargs,
         )
     else:
         result = markitdown.convert(
-            args.filename, stream_info=stream_info, keep_data_uris=args.keep_data_uris
+            args.filename,
+            stream_info=stream_info,
+            **convert_kwargs,
         )
 
-    _handle_output(args, result)
+    _handle_output(args, result, extract_images=extract_images)
 
 
-def _handle_output(args, result: DocumentConverterResult):
+def _handle_output(args, result: DocumentConverterResult, extract_images: bool = False):
     """Handle output to stdout or file"""
     if args.output:
         with open(args.output, "w", encoding="utf-8") as f:
             f.write(result.markdown)
+        if extract_images:
+            images_dir = getattr(args, "_actual_images_dir", args.images_dir or "images")
+            print(f"[OK] Generated {args.output}")
+            print(f"[OK] Images extracted to ./{images_dir}/")
     else:
         # Handle stdout encoding errors more gracefully
         print(
@@ -277,5 +356,10 @@ def _exit_with_error(message: str):
     sys.exit(1)
 
 
+def _timestamped_images_dir_name(base_name: str) -> str:
+    """Return *base_name* without trailing path separators, suffixed ``_YYYYmmdd_HHMMSS``."""
+    return f"{base_name.rstrip(os.sep + (os.altsep or ''))}_{datetime.now():%Y%m%d_%H%M%S}"
+
+
 if __name__ == "__main__":
     main()
diff --git a/packages/markitdown/src/markitdown/converters/_docx_converter.py b/packages/markitdown/src/markitdown/converters/_docx_converter.py
@@ -1,5 +1,8 @@
 import sys
 import io
+import os
+import re
+import zipfile
 from warnings import warn
 
 from typing import BinaryIO, Any
@@ -75,9 +78,106 @@ def convert(
                 _dependency_exc_info[2]
             )
 
+        extract_images = kwargs.get("extract_images", False)
+        # Files written to disk, in document order; step 3 links to exactly these names.
+        image_names: list[str] = []
+
+        # (1) Extraction: copy the original images out of the DOCX (ZIP) package.
+        if extract_images:
+            file_stream.seek(0)
+            zip_bytes = io.BytesIO(file_stream.read())
+
+            with zipfile.ZipFile(zip_bytes) as z:
+                media_files = self._get_media_in_doc_order(z)
+                if media_files:
+                    images_dir = kwargs["images_dir"]
+                    os.makedirs(images_dir, exist_ok=True)
+                    for i, media_file in enumerate(media_files, 1):
+                        data = z.read(media_file)  # read each entry once
+                        ext = os.path.splitext(media_file)[1]
+                        if ext.lower() == ".jpeg":
+                            ext = ".jpg"
+                        # No extension in the package: sniff it from the magic bytes.
+                        ext = ext or self._detect_ext(data)
+                        image_names.append(f"image_{i}{ext}")
+                        with open(os.path.join(images_dir, image_names[-1]), "wb") as f:
+                            f.write(data)
+
+            file_stream.seek(0)
+
+        # (2) mammoth: DOCX -> HTML
         style_map = kwargs.get("style_map", None)
         pre_process_stream = pre_process_docx(file_stream)
-        return self._html_converter.convert_string(
-            mammoth.convert_to_html(pre_process_stream, style_map=style_map).value,
-            **kwargs,
-        )
+        html_value = mammoth.convert_to_html(
+            pre_process_stream, style_map=style_map
+        ).value
+
+        # (3) Point the inlined base64 images at the extracted files, in order.
+        if image_names:
+            images_rel = kwargs.get("images_rel_dir", "images")
+            names = iter(image_names)
+
+            def _link(match: "re.Match[str]") -> str:
+                # A callable replacement is inserted verbatim, so backslashes in
+                # images_rel are not parsed as regex escapes.
+                return f'<img{match.group(1)}src="{images_rel}/{next(names)}"'
+
+            html_value = re.sub(
+                r'<img([^>]*)src="data:image/[^"]+"',
+                _link,
+                html_value,
+                count=len(image_names),
+            )
+
+        # (4) HTML -> Markdown
+        return self._html_converter.convert_string(html_value, **kwargs)
+
+    @staticmethod
+    def _get_media_in_doc_order(z: zipfile.ZipFile) -> list[str]:
+        """Return the archive paths of embedded images in document order.
+
+        Relationship ids from ``word/_rels/document.xml.rels`` are matched against
+        the DrawingML ``<a:blip r:embed>`` elements of ``word/document.xml``. If
+        either part cannot be read or parsed, every ``word/media/`` entry is
+        returned instead, sorted by the digits in its file name.
+        """
+        from xml.etree.ElementTree import fromstring
+
+        ns_a = "{http://schemas.openxmlformats.org/drawingml/2006/main}"
+        ns_r = "{http://schemas.openxmlformats.org/officeDocument/2006/relationships}"
+
+        try:
+            # 1. rels: rId -> archive path of the media part
+            rid_to_media: dict[str, str] = {}
+            for rel in fromstring(z.read("word/_rels/document.xml.rels")):
+                target = rel.get("Target", "")
+                # Targets are relative to word/ unless written as absolute part names.
+                path = target[1:] if target.startswith("/") else f"word/{target}"
+                if path.startswith("word/media/"):
+                    rid_to_media[rel.get("Id", "")] = path
+
+            # 2. document.xml: collect the referenced rIds in order of appearance
+            blips = fromstring(z.read("word/document.xml")).iter(f"{ns_a}blip")
+            rids = (blip.get(f"{ns_r}embed") for blip in blips)
+            return [rid_to_media[rid] for rid in rids if rid and rid in rid_to_media]
+        except Exception:
+            # Deliberate best-effort fallback: natural sort by the digits in the file name.
+            raw = [f for f in z.namelist() if f.startswith("word/media/") and not f.endswith("/")]
+            return sorted(raw, key=lambda p: int("".join(c for c in os.path.basename(p) if c.isdigit()) or "0"))
+
+    @staticmethod
+    def _detect_ext(data: bytes) -> str:
+        """Guess an image extension from magic bytes; ``.png`` when nothing matches."""
+        # RIFF is a generic container: WebP is identified by the form type at
+        # offset 8, so a bare 12-byte header is already enough to recognise it.
+        if data[:4] == b"RIFF" and data[8:12] == b"WEBP":
+            return ".webp"
+        signatures = (
+            (b"\x89PNG\r\n\x1a\n", ".png"),
+            (b"\xff\xd8", ".jpg"),
+            (b"GIF8", ".gif"),
+            (b"BM", ".bmp"),
+            (b"\x00\x00\x01\x00", ".ico"),
+        )
+        matches = (ext for magic, ext in signatures if data.startswith(magic))
+        return next(matches, ".png")  # default when no signature matches