def count_docx_images(filename: str) -> int:
    """Return how many media files are embedded in a DOCX archive.

    The archive listing is inspected for entries stored under
    ``word/media/``; directory entries are not counted.

    Args:
        filename: Path to the DOCX file to inspect.

    Returns:
        The number of file entries below ``word/media/``, or ``0`` when the
        path cannot be opened (missing, unreadable, a directory, ...) or is
        not a valid ZIP archive.
    """
    try:
        with zipfile.ZipFile(filename) as archive:
            return sum(
                1
                for entry in archive.infolist()
                # Some writers store an explicit "word/media/" directory
                # entry; it is not an image and must not inflate the count.
                if entry.filename.startswith("word/media/") and not entry.is_dir()
            )
    except (zipfile.BadZipFile, OSError):
        # OSError covers FileNotFoundError as well as PermissionError and
        # IsADirectoryError: this is only a best-effort pre-check, so any
        # failure to open the file simply means "no images detected".
        return 0


def ask_extract_images(image_count: int) -> bool:
    """Ask the user interactively whether embedded images should be extracted.

    Args:
        image_count: Number of detected images, shown in the prompt.

    Returns:
        ``True`` only when the user answers ``y`` or ``yes`` (case-insensitive,
        surrounding whitespace ignored). ``False`` when stdin is missing or is
        not a terminal, or when the prompt is aborted with EOF / Ctrl-C.
    """
    stdin = sys.stdin
    # sys.stdin can be None (e.g. a process started without standard
    # streams); treat that like any other non-interactive session.
    if stdin is None or not stdin.isatty():
        return False

    print(f"\n📄 检测到文档中包含 {image_count} 张图片")
    try:
        answer = input(" 是否提取图片到本地文件?(y/n): ")
    except (EOFError, KeyboardInterrupt):
        return False
    return answer.strip().lower() in ("y", "yes")


def _timestamped_images_dir_name(base_name: str) -> str:
    """Build a directory name of the form ``<base_name>_<YYYYmmdd_HHMMSS>``.

    Args:
        base_name: Requested directory name; trailing path separators are
            removed before the suffix is appended.

    Returns:
        The base name followed by an underscore and the current local time.
    """
    # Strip the alternate separator too (``/`` on Windows); otherwise a value
    # such as "assets/" would produce the nested path "assets/_<timestamp>".
    separators = os.sep + (os.altsep or "")
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    return f"{base_name.rstrip(separators)}_{timestamp}"
pre_process_stream, style_map=style_map + ).value + + # ③ 替换 base64 → 相对路径 + if extract_images and media_files: + images_rel = kwargs.get("images_rel_dir", "images") + for i, media_file in enumerate(media_files, 1): + ext = os.path.splitext(media_file)[1] + if ext.lower() == ".jpeg": + ext = ".jpg" + filename = f"image_{i}{ext}" + # 替换 mammoth 生成的 data: URI 为文件路径 + html_value = re.sub( + r']*)src="data:image/[^"]+"', + f' list[str]: + """从 DOCX 的 document.xml.rels 和 document.xml 解析图片在文档中的出现顺序""" + from xml.etree.ElementTree import fromstring + + try: + # 1. rels: rId -> media 路径 + rels_xml = z.read("word/_rels/document.xml.rels") + rels_root = fromstring(rels_xml) + rid_to_media: dict[str, str] = {} + for rel in rels_root: + target = rel.get("Target", "") + if target.startswith("media/"): + rid_to_media[rel.get("Id", "")] = target + + # 2. document.xml: 按出现顺序收集 rId + doc_xml = z.read("word/document.xml") + doc_root = fromstring(doc_xml) + ns_a = "{http://schemas.openxmlformats.org/drawingml/2006/main}" + ns_r = "{http://schemas.openxmlformats.org/officeDocument/2006/relationships}" + + ordered_media: list[str] = [] + for blip in doc_root.iter(f"{ns_a}blip"): + rid = blip.get(f"{ns_r}embed") + if rid and rid in rid_to_media: + ordered_media.append(f"word/{rid_to_media[rid]}") + + return ordered_media + except Exception: + # fallback: 按文件名数字自然排序 + raw = [f for f in z.namelist() if f.startswith("word/media/") and not f.endswith("/")] + return sorted(raw, key=lambda p: int("".join(c for c in os.path.basename(p) if c.isdigit()) or "0")) + + @staticmethod + def _detect_ext(data: bytes) -> str: + """根据文件头 magic bytes 检测图片格式""" + if data[:8] == b"\x89PNG\r\n\x1a\n": + return ".png" + if data[:2] == b"\xff\xd8": + return ".jpg" + if data[:4] == b"GIF8": + return ".gif" + if data[:4] == b"RIFF" and len(data) > 12 and data[8:12] == b"WEBP": + return ".webp" + if data[:2] == b"BM": + return ".bmp" + if data[:4] == b"\x00\x00\x01\x00": + return ".ico" + return ".png" # 默认 diff 
--git a/packages/markitdown/src/markitdown/converters/_pdf_converter.py b/packages/markitdown/src/markitdown/converters/_pdf_converter.py index ffbcbd990..6ffa610f1 100644 --- a/packages/markitdown/src/markitdown/converters/_pdf_converter.py +++ b/packages/markitdown/src/markitdown/converters/_pdf_converter.py @@ -1,5 +1,6 @@ import sys import io +import os import re from typing import BinaryIO, Any @@ -492,6 +493,169 @@ def _extract_tables_from_words(page: Any) -> list[list[list[str]]]: return [table_rows] +def _detect_image_ext(data: bytes) -> str | None: + """Return a file extension for common image byte signatures.""" + if data[:8] == b"\x89PNG\r\n\x1a\n": + return ".png" + if data[:2] == b"\xff\xd8": + return ".jpg" + if data[:4] == b"GIF8": + return ".gif" + if data[:4] == b"RIFF" and len(data) > 12 and data[8:12] == b"WEBP": + return ".webp" + if data[:2] == b"BM": + return ".bmp" + return None + + +def _write_pdf_image( + page: Any, + image: dict, + images_dir: str, + images_rel_dir: str, + image_index: int, +) -> dict[str, Any] | None: + image_bytes = b"" + ext: str | None = None + stream = image.get("stream") + + if stream is not None and hasattr(stream, "get_data"): + try: + raw_bytes = stream.get_data() + raw_ext = _detect_image_ext(raw_bytes) + if raw_ext is not None: + image_bytes = raw_bytes + ext = raw_ext + else: + try: + from PIL import Image # type: ignore[import-not-found] + + try: + pil_image = Image.open(io.BytesIO(raw_bytes)) + except Exception: + width, height = image.get("srcsize") or ( + image.get("width"), + image.get("height"), + ) + colorspace = str(image.get("colorspace", "")).lower() + mode = "L" if "gray" in colorspace else "RGB" + pil_image = Image.frombytes( + mode, + (int(width), int(height)), + raw_bytes, + ) + + with pil_image: + if pil_image.mode not in ("RGB", "L", "RGBA"): + pil_image = pil_image.convert("RGB") + out = io.BytesIO() + pil_image.save(out, format="PNG") + image_bytes = out.getvalue() + ext = ".png" + except 
def _extract_text_lines_with_positions(page: Any) -> list[dict[str, Any]]:
    """Collect the text lines of *page* together with a vertical position.

    Words returned by ``page.extract_words`` are grouped into 5-unit vertical
    buckets based on their ``top`` value; each bucket becomes one line whose
    words are joined left-to-right (by ``x0``) with single spaces.

    When no words are returned, the result of ``page.extract_text()`` is split
    into lines instead and the line index (as a float) stands in for ``top``.

    Args:
        page: Object exposing ``extract_words(...)`` and ``extract_text()``
            (presumably a pdfplumber page -- confirm against the caller).

    Returns:
        A list of ``{"top": ..., "text": ...}`` dicts ordered top to bottom;
        empty when the page yields no non-blank text.
    """
    bucket_height = 5
    page_words = page.extract_words(keep_blank_chars=True, x_tolerance=3, y_tolerance=3)

    if page_words:
        buckets: dict[float, list[dict]] = {}
        for entry in page_words:
            bucket = round(entry["top"] / bucket_height) * bucket_height
            if bucket in buckets:
                buckets[bucket].append(entry)
            else:
                buckets[bucket] = [entry]

        grouped_lines: list[dict[str, Any]] = []
        for bucket, members in sorted(buckets.items(), key=lambda pair: pair[0]):
            left_to_right = sorted(members, key=lambda member: member["x0"])
            joined = " ".join(member["text"] for member in left_to_right).strip()
            if joined:
                grouped_lines.append({"top": bucket, "text": joined})
        return grouped_lines

    # No positioned words available: fall back to plain text extraction.
    raw_text = page.extract_text()
    if not raw_text:
        return []

    fallback_lines: list[dict[str, Any]] = []
    for line_no, raw_line in enumerate(raw_text.splitlines()):
        stripped = raw_line.strip()
        if stripped:
            fallback_lines.append({"top": float(line_no), "text": stripped})
    return fallback_lines
_extract_text_lines_with_positions(page) + ] + + for image in getattr(page, "images", []) or []: + image_item = _write_pdf_image( + page, + image, + images_dir, + images_rel_dir, + image_index, + ) + if image_item is not None: + items.append(image_item) + image_index += 1 + + page_markdown = "\n\n".join( + item["markdown"] + for item in sorted(items, key=lambda item: item["top"]) + if item["markdown"].strip() + ) + if page_markdown.strip(): + markdown_chunks.append(page_markdown.strip()) + + page.close() + + return "\n\n".join(markdown_chunks).strip() + + class PdfConverter(DocumentConverter): """ Converts PDFs to Markdown. @@ -539,6 +703,22 @@ def convert( # Read file stream into BytesIO for compatibility with pdfplumber pdf_bytes = io.BytesIO(file_stream.read()) + if kwargs.get("extract_images", False): + images_dir = kwargs["images_dir"] + images_rel_dir = kwargs.get("images_rel_dir", "images") + try: + markdown = _extract_pdf_with_images( + pdf_bytes, + images_dir=images_dir, + images_rel_dir=images_rel_dir, + ) + except Exception: + pdf_bytes.seek(0) + markdown = pdfminer.high_level.extract_text(pdf_bytes) + + markdown = _merge_partial_numbering_lines(markdown) + return DocumentConverterResult(markdown=markdown) + try: # Single pass: check every page for form-style content. 
def _has_pdf_dependencies() -> bool:
    """Report whether the optional PDF packages can be imported.

    Returns:
        ``True`` when both ``pdfminer`` and ``pdfplumber`` import cleanly,
        ``False`` otherwise.
    """
    try:
        import pdfminer  # noqa: F401
        import pdfplumber  # noqa: F401
    except ImportError:
        # ImportError is the parent of ModuleNotFoundError, so this also
        # covers a package that is installed but fails while importing.
        # This helper runs inside ``skipif`` decorators at import time; an
        # uncaught error here would abort collection instead of skipping.
        return False
    return True
@pytest.mark.skipif(
    not _has_pdf_dependencies(),
    reason="PDF optional dependencies not installed",
)
def test_cli_pdf_extract_images_uses_timestamped_dir(tmp_path) -> None:
    """Run the CLI with ``--extract-images`` and check the timestamped folder.

    The command is executed in a subprocess with ``--images-dir assets``; the
    test expects exactly one ``assets_*`` directory next to the output file,
    a Markdown link pointing into it, and one non-empty ``image_1.*`` file.
    """
    source_pdf = os.path.join(TEST_FILES_DIR, "pdf_image_middle.pdf")
    markdown_file = tmp_path / "out.md"
    command = [
        sys.executable,
        "-m",
        "markitdown",
        source_pdf,
        "-o",
        str(markdown_file),
        "--extract-images",
        "--images-dir",
        "assets",
    ]

    completed = subprocess.run(command, capture_output=True, text=True)
    assert completed.returncode == 0, completed.stderr

    rendered = markdown_file.read_text(encoding="utf-8")

    # The CLI appends a timestamp, so locate the directory by its prefix.
    candidates = list(tmp_path.glob("assets_*"))
    assert len(candidates) == 1
    extracted_dir = candidates[0]
    assert extracted_dir.is_dir()
    assert f"![image_1]({extracted_dir.name}/image_1." in rendered

    written = list(extracted_dir.glob("image_1.*"))
    assert len(written) == 1
    assert written[0].stat().st_size > 0