Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
92 changes: 88 additions & 4 deletions packages/markitdown/src/markitdown/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,14 +3,38 @@
# SPDX-License-Identifier: MIT
import argparse
import sys
import os
import codecs
import zipfile
from datetime import datetime
from typing import Any, Dict
from textwrap import dedent
from importlib.metadata import entry_points
from .__about__ import __version__
from ._markitdown import MarkItDown, StreamInfo, DocumentConverterResult


def count_docx_images(filename: str) -> int:
    """Quick pre-check: count the media files embedded in a DOCX archive.

    A DOCX file is a ZIP archive; every regular entry stored under
    ``word/media/`` is counted. Directory entries (names ending in ``/``)
    are skipped so that an explicit ``word/media/`` folder record is not
    reported as an image.

    Args:
        filename: Path of the DOCX file to inspect.

    Returns:
        The number of entries under ``word/media/``, or ``0`` when the file
        cannot be opened or is not a valid ZIP archive.
    """
    try:
        with zipfile.ZipFile(filename) as archive:
            return sum(
                1
                for name in archive.namelist()
                if name.startswith("word/media/") and not name.endswith("/")
            )
    except (zipfile.BadZipFile, OSError):
        # Best-effort pre-check only: OSError covers a missing file as well
        # as a directory path or a permission problem. Returning 0 simply
        # means "nothing to ask about".
        return 0


def ask_extract_images(image_count: int) -> bool:
    """Interactively ask the user whether embedded images should be extracted.

    Args:
        image_count: Number of images detected in the document; shown in the
            prompt.

    Returns:
        ``True`` only when the user answers ``y`` or ``yes`` (case-insensitive,
        surrounding whitespace ignored). ``False`` when standard input is
        missing or not a terminal, on any other answer, or when the prompt is
        interrupted (EOF / Ctrl-C).
    """
    stdin = sys.stdin
    # sys.stdin is None when the interpreter starts with stdin closed; treat
    # that like any other non-interactive run instead of raising
    # AttributeError on ``None.isatty()``.
    if stdin is None or not stdin.isatty():
        return False  # non-interactive: never prompt

    print(f"\n📄 检测到文档中包含 {image_count} 张图片")
    try:
        answer = input(" 是否提取图片到本地文件?(y/n): ")
    except (EOFError, KeyboardInterrupt):
        return False
    return answer.strip().lower() in ("y", "yes")


def main():
parser = argparse.ArgumentParser(
description="Convert various file formats to markdown.",
Expand Down Expand Up @@ -138,6 +162,24 @@ def main():
help="Keep data URIs (like base64-encoded images) in the output. By default, data URIs are truncated.",
)

parser.add_argument(
"--extract-images",
action="store_true",
help="Extract embedded images from DOCX/PDF to a local directory.",
)

parser.add_argument(
"--no-extract-images",
action="store_true",
help="Do not extract images (skip interactive prompt).",
)

parser.add_argument(
"--images-dir",
default="images",
help="Base directory name for extracted images (default: images). A timestamp suffix is added.",
)

parser.add_argument("filename", nargs="?")
args = parser.parse_args()

Expand Down Expand Up @@ -244,25 +286,62 @@ def main():
else:
markitdown = MarkItDown(enable_plugins=args.use_plugins)

# --- 图片提取逻辑 ---
extract_images = False
if args.extract_images:
extract_images = True
elif args.no_extract_images:
extract_images = False
elif args.filename and args.output and args.filename.lower().endswith(".docx"):
count = count_docx_images(args.filename)
if count > 0:
extract_images = ask_extract_images(count)

# 构建 kwargs
convert_kwargs: Dict[str, Any] = {
"keep_data_uris": args.keep_data_uris,
}

if extract_images and args.output:
images_dir_name = _timestamped_images_dir_name(args.images_dir or "images")
args._actual_images_dir = images_dir_name
abs_images_dir = os.path.join(
os.path.dirname(os.path.abspath(args.output)),
images_dir_name,
)
os.makedirs(abs_images_dir, exist_ok=True)
convert_kwargs["extract_images"] = True
convert_kwargs["images_dir"] = abs_images_dir
convert_kwargs["images_rel_dir"] = images_dir_name
# extract_images 优先于 keep_data_uris
convert_kwargs["keep_data_uris"] = False

# --- 转换 ---
if args.filename is None:
result = markitdown.convert_stream(
sys.stdin.buffer,
stream_info=stream_info,
keep_data_uris=args.keep_data_uris,
**convert_kwargs,
)
else:
result = markitdown.convert(
args.filename, stream_info=stream_info, keep_data_uris=args.keep_data_uris
args.filename,
stream_info=stream_info,
**convert_kwargs,
)

_handle_output(args, result)
_handle_output(args, result, extract_images=extract_images)


def _handle_output(args, result: DocumentConverterResult):
def _handle_output(args, result: DocumentConverterResult, extract_images: bool = False):
"""Handle output to stdout or file"""
if args.output:
with open(args.output, "w", encoding="utf-8") as f:
f.write(result.markdown)
if extract_images:
images_dir = getattr(args, "_actual_images_dir", args.images_dir or "images")
print(f"[OK] Generated {args.output}")
print(f"[OK] Images extracted to ./{images_dir}/")
else:
# Handle stdout encoding errors more gracefully
print(
Expand All @@ -277,5 +356,10 @@ def _exit_with_error(message: str):
sys.exit(1)


def _timestamped_images_dir_name(base_name: str) -> str:
    """Build a directory name of the form ``<base_name>_<YYYYmmdd_HHMMSS>``.

    Trailing path separators are removed from ``base_name`` first so that a
    value such as ``images/`` yields ``images_<timestamp>`` rather than a
    directory named ``_<timestamp>`` nested inside ``images``.

    Args:
        base_name: Base directory name (may contain leading path components).

    Returns:
        ``base_name`` without trailing separators, an underscore, and the
        current local time formatted as ``%Y%m%d_%H%M%S``.
    """
    # On Windows both "\\" (os.sep) and "/" (os.altsep) are valid separators;
    # os.altsep is None on platforms with a single separator.
    separators = os.sep + (os.altsep or "")
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    return f"{base_name.rstrip(separators)}_{timestamp}"


if __name__ == "__main__":
main()
108 changes: 104 additions & 4 deletions packages/markitdown/src/markitdown/converters/_docx_converter.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,8 @@
import sys
import io
import os
import re
import zipfile
from warnings import warn

from typing import BinaryIO, Any
Expand Down Expand Up @@ -75,9 +78,106 @@ def convert(
_dependency_exc_info[2]
)

extract_images = kwargs.get("extract_images", False)
media_files: list[str] = []

# ① 提取阶段:从 ZIP 获取原图(按文档中出现顺序)
if extract_images:
file_stream.seek(0)
zip_bytes = io.BytesIO(file_stream.read())

with zipfile.ZipFile(zip_bytes) as z:
media_files = self._get_media_in_doc_order(z)

if media_files:
images_dir = kwargs["images_dir"]
os.makedirs(images_dir, exist_ok=True)

for i, media_file in enumerate(media_files, 1):
ext = os.path.splitext(media_file)[1]
if ext.lower() == ".jpeg":
ext = ".jpg"
if not ext:
data = z.read(media_file)
ext = self._detect_ext(data)
filename = f"image_{i}{ext}"
with open(os.path.join(images_dir, filename), "wb") as f:
f.write(z.read(media_file))

file_stream.seek(0)

# ② mammoth 转 HTML
style_map = kwargs.get("style_map", None)
pre_process_stream = pre_process_docx(file_stream)
return self._html_converter.convert_string(
mammoth.convert_to_html(pre_process_stream, style_map=style_map).value,
**kwargs,
)
html_value = mammoth.convert_to_html(
pre_process_stream, style_map=style_map
).value

# ③ 替换 base64 → 相对路径
if extract_images and media_files:
images_rel = kwargs.get("images_rel_dir", "images")
for i, media_file in enumerate(media_files, 1):
ext = os.path.splitext(media_file)[1]
if ext.lower() == ".jpeg":
ext = ".jpg"
filename = f"image_{i}{ext}"
# 替换 mammoth 生成的 data: URI 为文件路径
html_value = re.sub(
r'<img([^>]*)src="data:image/[^"]+"',
f'<img\\1src="{images_rel}/{filename}"',
html_value,
count=1,
)

# ④ HTML → Markdown
return self._html_converter.convert_string(html_value, **kwargs)

@staticmethod
def _get_media_in_doc_order(z: zipfile.ZipFile) -> list[str]:
    """Return the archive paths of embedded images in document order.

    The order is derived from two parts of the DOCX package:

    1. ``word/_rels/document.xml.rels`` maps relationship ids to targets.
       Only internal targets that resolve to a path under ``word/media/``
       are kept. Targets may be relative to ``word/`` (``media/x.png``) or
       package-absolute (``/word/media/x.png``).
    2. ``word/document.xml`` is scanned for DrawingML ``blip`` elements;
       each ``r:embed`` id that maps to a media file contributes one entry.

    An image referenced several times appears several times in the result,
    one entry per reference.

    Args:
        z: An open ZIP archive of the DOCX file.

    Returns:
        Archive member names (``word/media/...``) in order of appearance.
        If the relationship or document parts cannot be read or parsed, the
        fallback is every non-directory entry under ``word/media/`` sorted
        by the number formed from the digits in its file name.
    """
    import posixpath
    from xml.etree.ElementTree import fromstring

    ns_a = "{http://schemas.openxmlformats.org/drawingml/2006/main}"
    ns_r = "{http://schemas.openxmlformats.org/officeDocument/2006/relationships}"

    try:
        # 1. rels: relationship id -> archive path of the media file
        rels_root = fromstring(z.read("word/_rels/document.xml.rels"))
        rid_to_media: dict[str, str] = {}
        for rel in rels_root:
            if rel.get("TargetMode") == "External":
                # External targets live outside the package; nothing to read.
                continue
            target = rel.get("Target", "")
            if target.startswith("/"):
                # Package-absolute target, e.g. "/word/media/image1.png".
                member = posixpath.normpath(target.lstrip("/"))
            else:
                # Target relative to the part's folder ("word/").
                member = posixpath.normpath(posixpath.join("word", target))
            if member.startswith("word/media/"):
                rid_to_media[rel.get("Id", "")] = member

        # 2. document.xml: collect embed ids in order of appearance
        doc_root = fromstring(z.read("word/document.xml"))
        embed_ids = (
            blip.get(f"{ns_r}embed") for blip in doc_root.iter(f"{ns_a}blip")
        )
        return [rid_to_media[rid] for rid in embed_ids if rid and rid in rid_to_media]
    except Exception:
        # Deliberately broad best-effort fallback: whatever went wrong while
        # reading or parsing the XML parts, fall back to a natural sort of the
        # media file names. isdecimal() (not isdigit()) keeps only characters
        # that int() can actually parse.
        raw = [
            name
            for name in z.namelist()
            if name.startswith("word/media/") and not name.endswith("/")
        ]
        return sorted(
            raw,
            key=lambda p: int(
                "".join(ch for ch in os.path.basename(p) if ch.isdecimal()) or "0"
            ),
        )

@staticmethod
def _detect_ext(data: bytes) -> str:
    """Guess an image file extension from the leading magic bytes of ``data``.

    Recognised formats: PNG, JPEG, GIF, WebP, TIFF (both byte orders), BMP
    and ICO.

    Args:
        data: Raw file content (only the first few bytes are inspected).

    Returns:
        The extension including the leading dot (e.g. ``".jpg"``). Unknown or
        empty data falls back to ``".png"``.
    """
    # RIFF is a generic container; it is WebP only when the form type at
    # offset 8 says so. Slicing is safe on short input, so no length check
    # is needed.
    if data[:4] == b"RIFF" and data[8:12] == b"WEBP":
        return ".webp"

    # Checked in order; the first matching prefix wins.
    signatures = (
        (b"\x89PNG\r\n\x1a\n", ".png"),
        (b"\xff\xd8", ".jpg"),
        (b"GIF8", ".gif"),
        (b"II*\x00", ".tiff"),  # little-endian TIFF
        (b"MM\x00*", ".tiff"),  # big-endian TIFF
        (b"BM", ".bmp"),
        (b"\x00\x00\x01\x00", ".ico"),
    )
    for magic, ext in signatures:
        if data[: len(magic)] == magic:
            return ext
    return ".png"  # default when the format is not recognised
Loading