From 797e1ea985f2ac6ec6c41f1b552fe8d70251bedf Mon Sep 17 00:00:00 2001 From: CraftrX <1443608354@qq.com> Date: Thu, 11 Jun 2026 13:28:01 +0800 Subject: [PATCH 1/2] feat: extract embedded images from DOCX to local directory --- .../markitdown/src/markitdown/__main__.py | 85 +++++++++++++- .../markitdown/converters/_docx_converter.py | 108 +++++++++++++++++- 2 files changed, 185 insertions(+), 8 deletions(-) diff --git a/packages/markitdown/src/markitdown/__main__.py b/packages/markitdown/src/markitdown/__main__.py index ccb44b64b..632d0bb66 100644 --- a/packages/markitdown/src/markitdown/__main__.py +++ b/packages/markitdown/src/markitdown/__main__.py @@ -3,7 +3,9 @@ # SPDX-License-Identifier: MIT import argparse import sys +import os import codecs +import zipfile from typing import Any, Dict from textwrap import dedent from importlib.metadata import entry_points @@ -11,6 +13,27 @@ from ._markitdown import MarkItDown, StreamInfo, DocumentConverterResult +def count_docx_images(filename: str) -> int: + """快速预检:统计 DOCX 中嵌入的图片数量""" + try: + with zipfile.ZipFile(filename) as z: + return len([f for f in z.namelist() if f.startswith("word/media/")]) + except (zipfile.BadZipFile, FileNotFoundError): + return 0 + + +def ask_extract_images(image_count: int) -> bool: + """交互式询问是否提取图片""" + if not sys.stdin.isatty(): + return False # 非交互终端,不询问 + print(f"\n📄 检测到文档中包含 {image_count} 张图片") + try: + answer = input(" 是否提取图片到本地文件?(y/n): ").strip().lower() + return answer in ("y", "yes") + except (EOFError, KeyboardInterrupt): + return False + + def main(): parser = argparse.ArgumentParser( description="Convert various file formats to markdown.", @@ -138,6 +161,24 @@ def main(): help="Keep data URIs (like base64-encoded images) in the output. By default, data URIs are truncated.", ) + parser.add_argument( + "--extract-images", + action="store_true", + help="Extract embedded images from DOCX to a local directory.", + ) + + parser.add_argument( + "--no-extract-images", + action="store_true", + help="Do not extract images (skip interactive prompt).", + ) + + parser.add_argument( + "--images-dir", + default="images", + help="Directory name for extracted images (default: images).", + ) + parser.add_argument("filename", nargs="?") args = parser.parse_args() @@ -244,25 +285,61 @@ def main(): else: markitdown = MarkItDown(enable_plugins=args.use_plugins) + # --- 图片提取逻辑 --- + extract_images = False + if args.extract_images: + extract_images = True + elif args.no_extract_images: + extract_images = False + elif args.filename and args.output and args.filename.lower().endswith(".docx"): + count = count_docx_images(args.filename) + if count > 0: + extract_images = ask_extract_images(count) + + # 构建 kwargs + convert_kwargs: Dict[str, Any] = { + "keep_data_uris": args.keep_data_uris, + } + + if extract_images and args.output: + images_dir_name = args.images_dir or "images" + abs_images_dir = os.path.join( + os.path.dirname(os.path.abspath(args.output)), + images_dir_name, + ) + os.makedirs(abs_images_dir, exist_ok=True) + convert_kwargs["extract_images"] = True + convert_kwargs["images_dir"] = abs_images_dir + convert_kwargs["images_rel_dir"] = images_dir_name + # extract_images 优先于 keep_data_uris + convert_kwargs["keep_data_uris"] = False + + # --- 转换 --- if args.filename is None: result = markitdown.convert_stream( sys.stdin.buffer, stream_info=stream_info, - keep_data_uris=args.keep_data_uris, + **convert_kwargs, ) else: result = markitdown.convert( - args.filename, stream_info=stream_info, keep_data_uris=args.keep_data_uris + args.filename, + stream_info=stream_info, + **convert_kwargs, ) - _handle_output(args, result) + _handle_output(args, result, extract_images=extract_images) -def _handle_output(args, result: DocumentConverterResult): +def _handle_output(args, result: DocumentConverterResult, extract_images: bool = False): """Handle output to stdout or file""" if args.output: with open(args.output, "w", encoding="utf-8") as f: f.write(result.markdown) + if extract_images: + images_dir = args.images_dir or "images" + print(f"[OK] Generated {args.output}") + print(f"[OK] Images extracted to ./{images_dir}/") else: # Handle stdout encoding errors more gracefully print( diff --git a/packages/markitdown/src/markitdown/converters/_docx_converter.py b/packages/markitdown/src/markitdown/converters/_docx_converter.py index 3975107b1..39cb1c346 100644 --- a/packages/markitdown/src/markitdown/converters/_docx_converter.py +++ b/packages/markitdown/src/markitdown/converters/_docx_converter.py @@ -1,5 +1,8 @@ import sys import io +import os +import re +import zipfile from warnings import warn from typing import BinaryIO, Any @@ -75,9 +78,106 @@ def convert( _dependency_exc_info[2] ) + extract_images = kwargs.get("extract_images", False) + media_files: list[str] = [] + + # ① 提取阶段:从 ZIP 获取原图(按文档中出现顺序) + if extract_images: + file_stream.seek(0) + zip_bytes = io.BytesIO(file_stream.read()) + + with zipfile.ZipFile(zip_bytes) as z: + media_files = self._get_media_in_doc_order(z) + + if media_files: + images_dir = kwargs["images_dir"] + os.makedirs(images_dir, exist_ok=True) + + for i, media_file in enumerate(media_files, 1): + ext = os.path.splitext(media_file)[1] + if ext.lower() == ".jpeg": + ext = ".jpg" + if not ext: + data = z.read(media_file) + ext = self._detect_ext(data) + filename = f"image_{i}{ext}" + with open(os.path.join(images_dir, filename), "wb") as f: + f.write(z.read(media_file)) + + file_stream.seek(0) + + # ② mammoth 转 HTML style_map = kwargs.get("style_map", None) pre_process_stream = pre_process_docx(file_stream) - return self._html_converter.convert_string( - mammoth.convert_to_html(pre_process_stream, style_map=style_map).value, - **kwargs, - ) + html_value = mammoth.convert_to_html( + pre_process_stream, style_map=style_map + ).value + + # ③ 替换 base64 → 相对路径 + if extract_images and media_files: + images_rel = kwargs.get("images_rel_dir", "images") + for i, media_file in enumerate(media_files, 1): + ext = os.path.splitext(media_file)[1] + if ext.lower() == ".jpeg": + ext = ".jpg" + filename = f"image_{i}{ext}" + # 替换 mammoth 生成的 data: URI 为文件路径 + html_value = re.sub( + r']*)src="data:image/[^"]+"', + f' list[str]: + """从 DOCX 的 document.xml.rels 和 document.xml 解析图片在文档中的出现顺序""" + from xml.etree.ElementTree import fromstring + + try: + # 1. rels: rId -> media 路径 + rels_xml = z.read("word/_rels/document.xml.rels") + rels_root = fromstring(rels_xml) + rid_to_media: dict[str, str] = {} + for rel in rels_root: + target = rel.get("Target", "") + if target.startswith("media/"): + rid_to_media[rel.get("Id", "")] = target + + # 2. document.xml: 按出现顺序收集 rId + doc_xml = z.read("word/document.xml") + doc_root = fromstring(doc_xml) + ns_a = "{http://schemas.openxmlformats.org/drawingml/2006/main}" + ns_r = "{http://schemas.openxmlformats.org/officeDocument/2006/relationships}" + + ordered_media: list[str] = [] + for blip in doc_root.iter(f"{ns_a}blip"): + rid = blip.get(f"{ns_r}embed") + if rid and rid in rid_to_media: + ordered_media.append(f"word/{rid_to_media[rid]}") + + return ordered_media + except Exception: + # fallback: 按文件名数字自然排序 + raw = [f for f in z.namelist() if f.startswith("word/media/") and not f.endswith("/")] + return sorted(raw, key=lambda p: int("".join(c for c in os.path.basename(p) if c.isdigit()) or "0")) + + @staticmethod + def _detect_ext(data: bytes) -> str: + """根据文件头 magic bytes 检测图片格式""" + if data[:8] == b"\x89PNG\r\n\x1a\n": + return ".png" + if data[:2] == b"\xff\xd8": + return ".jpg" + if data[:4] == b"GIF8": + return ".gif" + if data[:4] == b"RIFF" and len(data) > 12 and data[8:12] == b"WEBP": + return ".webp" + if data[:2] == b"BM": + return ".bmp" + if data[:4] == b"\x00\x00\x01\x00": + return ".ico" + return ".png" # 默认 From 47342ed4db913ba69fad9f3ec755dd919a8cb351 Mon Sep 17 00:00:00 2001 From: CraftrX <1443608354@qq.com> Date: Fri, 12 Jun 2026 20:32:33 +0800 Subject: [PATCH 2/2] Add PDF image extraction support --- .../markitdown/src/markitdown/__main__.py | 15 +- .../markitdown/converters/_pdf_converter.py | 180 ++++++++++++++++++ .../tests/test_files/pdf_image_middle.pdf | Bin 0 -> 7291 bytes packages/markitdown/tests/test_module_misc.py | 77 ++++++++ 4 files changed, 268 insertions(+), 4 deletions(-) create mode 100644 packages/markitdown/tests/test_files/pdf_image_middle.pdf diff --git a/packages/markitdown/src/markitdown/__main__.py b/packages/markitdown/src/markitdown/__main__.py index 632d0bb66..3b4ffabff 100644 --- a/packages/markitdown/src/markitdown/__main__.py +++ b/packages/markitdown/src/markitdown/__main__.py @@ -6,6 +6,7 @@ import os import codecs import zipfile +from datetime import datetime from typing import Any, Dict from textwrap import dedent from importlib.metadata import entry_points @@ -164,7 +165,7 @@ def main(): parser.add_argument( "--extract-images", action="store_true", - help="Extract embedded images from DOCX to a local directory.", + help="Extract embedded images from DOCX/PDF to a local directory.", ) parser.add_argument( @@ -176,7 +177,7 @@ def main(): parser.add_argument( "--images-dir", default="images", - help="Directory name for extracted images (default: images).", + help="Base directory name for extracted images (default: images). A timestamp suffix is added.", ) parser.add_argument("filename", nargs="?") @@ -302,7 +303,8 @@ def main(): } if extract_images and args.output: - images_dir_name = args.images_dir or "images" + images_dir_name = _timestamped_images_dir_name(args.images_dir or "images") + args._actual_images_dir = images_dir_name abs_images_dir = os.path.join( os.path.dirname(os.path.abspath(args.output)), images_dir_name, @@ -337,7 +339,7 @@ def _handle_output(args, result: DocumentConverterResult, extract_images: bool = with open(args.output, "w", encoding="utf-8") as f: f.write(result.markdown) if extract_images: - images_dir = args.images_dir or "images" + images_dir = getattr(args, "_actual_images_dir", args.images_dir or "images") print(f"[OK] Generated {args.output}") print(f"[OK] Images extracted to ./{images_dir}/") else: @@ -354,5 +356,10 @@ def _exit_with_error(message: str): sys.exit(1) +def _timestamped_images_dir_name(base_name: str) -> str: + timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") + return f"{base_name.rstrip(os.sep)}_{timestamp}" + + if __name__ == "__main__": main() diff --git a/packages/markitdown/src/markitdown/converters/_pdf_converter.py b/packages/markitdown/src/markitdown/converters/_pdf_converter.py index ffbcbd990..6ffa610f1 100644 --- a/packages/markitdown/src/markitdown/converters/_pdf_converter.py +++ b/packages/markitdown/src/markitdown/converters/_pdf_converter.py @@ -1,5 +1,6 @@ import sys import io +import os import re from typing import BinaryIO, Any @@ -492,6 +493,169 @@ def _extract_tables_from_words(page: Any) -> list[list[list[str]]]: return [table_rows] +def _detect_image_ext(data: bytes) -> str | None: + """Return a file extension for common image byte signatures.""" + if data[:8] == b"\x89PNG\r\n\x1a\n": + return ".png" + if data[:2] == b"\xff\xd8": + return ".jpg" + if data[:4] == b"GIF8": + return ".gif" + if data[:4] == b"RIFF" and len(data) > 12 and data[8:12] == b"WEBP": + return ".webp" + if data[:2] == b"BM": + return ".bmp" + return None + + +def _write_pdf_image( + page: Any, + image: dict, + images_dir: str, + images_rel_dir: str, + image_index: int, +) -> dict[str, Any] | None: + image_bytes = b"" + ext: str | None = None + stream = image.get("stream") + + if stream is not None and hasattr(stream, "get_data"): + try: + raw_bytes = stream.get_data() + raw_ext = _detect_image_ext(raw_bytes) + if raw_ext is not None: + image_bytes = raw_bytes + ext = raw_ext + else: + try: + from PIL import Image # type: ignore[import-not-found] + + try: + pil_image = Image.open(io.BytesIO(raw_bytes)) + except Exception: + width, height = image.get("srcsize") or ( + image.get("width"), + image.get("height"), + ) + colorspace = str(image.get("colorspace", "")).lower() + mode = "L" if "gray" in colorspace else "RGB" + pil_image = Image.frombytes( + mode, + (int(width), int(height)), + raw_bytes, + ) + + with pil_image: + if pil_image.mode not in ("RGB", "L", "RGBA"): + pil_image = pil_image.convert("RGB") + out = io.BytesIO() + pil_image.save(out, format="PNG") + image_bytes = out.getvalue() + ext = ".png" + except Exception: + pass + except Exception: + pass + + if not image_bytes: + try: + x0 = image.get("x0", 0) + x1 = image.get("x1", 0) + top = image.get("top", 0) + bottom = image.get("bottom", 0) + if x1 <= x0 or bottom <= top: + return None + + cropped_page = page.within_bbox((x0, top, x1, bottom)) + page_image = cropped_page.to_image(resolution=150) + out = io.BytesIO() + page_image.original.save(out, format="PNG") + image_bytes = out.getvalue() + ext = ".png" + except Exception: + return None + + if ext is None: + ext = ".png" + + filename = f"image_{image_index}{ext}" + os.makedirs(images_dir, exist_ok=True) + with open(os.path.join(images_dir, filename), "wb") as image_file: + image_file.write(image_bytes) + + return { + "top": image.get("top", 0), + "markdown": f"![image_{image_index}]({images_rel_dir}/{filename})", + } + + +def _extract_text_lines_with_positions(page: Any) -> list[dict[str, Any]]: + words = page.extract_words(keep_blank_chars=True, x_tolerance=3, y_tolerance=3) + if not words: + text = page.extract_text() + if text and text.strip(): + return [ + {"top": float(idx), "text": line.strip()} + for idx, line in enumerate(text.splitlines()) + if line.strip() + ] + return [] + + y_tolerance = 5 + rows_by_y: dict[float, list[dict]] = {} + for word in words: + y_key = round(word["top"] / y_tolerance) * y_tolerance + rows_by_y.setdefault(y_key, []).append(word) + + lines: list[dict[str, Any]] = [] + for y_key in sorted(rows_by_y.keys()): + row_words = sorted(rows_by_y[y_key], key=lambda w: w["x0"]) + text = " ".join(word["text"] for word in row_words).strip() + if text: + lines.append({"top": y_key, "text": text}) + return lines + + +def _extract_pdf_with_images( + pdf_bytes: io.BytesIO, + images_dir: str, + images_rel_dir: str, +) -> str: + markdown_chunks: list[str] = [] + image_index = 1 + + with pdfplumber.open(pdf_bytes) as pdf: + for page in pdf.pages: + items: list[dict[str, Any]] = [ + {"top": line["top"], "markdown": line["text"]} + for line in _extract_text_lines_with_positions(page) + ] + + for image in getattr(page, "images", []) or []: + image_item = _write_pdf_image( + page, + image, + images_dir, + images_rel_dir, + image_index, + ) + if image_item is not None: + items.append(image_item) + image_index += 1 + + page_markdown = "\n\n".join( + item["markdown"] + for item in sorted(items, key=lambda item: item["top"]) + if item["markdown"].strip() + ) + if page_markdown.strip(): + markdown_chunks.append(page_markdown.strip()) + + page.close() + + return "\n\n".join(markdown_chunks).strip() + + class PdfConverter(DocumentConverter): """ Converts PDFs to Markdown. @@ -539,6 +703,22 @@ def convert( # Read file stream into BytesIO for compatibility with pdfplumber pdf_bytes = io.BytesIO(file_stream.read()) + if kwargs.get("extract_images", False): + images_dir = kwargs["images_dir"] + images_rel_dir = kwargs.get("images_rel_dir", "images") + try: + markdown = _extract_pdf_with_images( + pdf_bytes, + images_dir=images_dir, + images_rel_dir=images_rel_dir, + ) + except Exception: + pdf_bytes.seek(0) + markdown = pdfminer.high_level.extract_text(pdf_bytes) + + markdown = _merge_partial_numbering_lines(markdown) + return DocumentConverterResult(markdown=markdown) + try: # Single pass: check every page for form-style content. # Pages with tables/forms get rich extraction; plain-text diff --git a/packages/markitdown/tests/test_files/pdf_image_middle.pdf b/packages/markitdown/tests/test_files/pdf_image_middle.pdf new file mode 100644 index 0000000000000000000000000000000000000000..d90bc9d3edf3834f7e141f89ff0414e17ba0cdd3 GIT binary patch literal 7291 zcmdU!+19Gga=`Dur*J?;K>-y|P*GIG89@YbViZvjL_9+-@&d_S?(?kS+vmT}*=NW) z$z7sr3Ef>?U0vOj&94f=h^u0?##iA#{{7$nGb1`t7$*zE%uJl86B~(RXHtob9a^8j zc_x`#l0+1O{Qmd9uf;!PeZ#C-Veo}z&`fCFzdD^Sgu^oRv?zY{dSA}d{;vKm|7>QB zx5I_0GK6>h-#LkA85v~!v_ji^x*0_Ao`dJx`%A^j2%OBfQbhWg$seP%uuXr{^7kce zNp<})RO8p7yyPu7@h}Xc@cC_jD}xL}Ka8cwu+pw0=Xajvh!Zx|;`xaaXAT);APuL} z7Cv%Pi@w$SsT+PPoW5=L+*^{tP&9)qoTr=IGI$gH)wpS3xK8FP^V2?cZQdO#ovz~9 z-_j`hZS6aW9V7Ufm^tJzjoXuH-r4tQHhn4m|G9FVpj>kg(T}NeZV^Uad*o;Y^hJ%3-{8bBjR)oSltQm!jz1 zTJAJ;n9YZGO0Ew!9f|0#a3#E$Y~eJHU&6u&o)5vOt*7zDeTr;eeqPC76Smw|@7&|! zkcB91at|x-GBVY3z1C?b92*eKfap1Rl;_uFwjVY&d9jbY=g3wN8lJd=C?>0x>}jhW zMkEZG>a|maSL{;1cv$^_8|_+PTAr}zxWEDGbt?~#A5~}E-QYZf)f)^}qutvJ*^-3>`&qei z$dy5-U0)6tV8a-x?8hJ6N>wr-t4?xB9ksTjVa-;}8&~yGyg{hihq=s!KKJDwFr;N$jKiZNvYy}{`d zaLDvBgjC?sEup+e*8#k=d*pX!c(+uhU-IkG zkb=>ns8fwWz>J*NEye(h-DzqcN})~Je2tD|I?D0j7eUDU^Lau*a{?N27VD_0+swEVeEDC_JlKyU93 zRjojZ8@*m+ylxLTepwY!Ypc+tmu2i>jr-~+5A1iIt$qj|68ogRh3L|JZaIYPL zdexdXpt#7gB3@=TnMJcpi?m@vbRS!ljAGdlZ)&dtL)9_AdYIIC*ySInR zYdCWc416!6=J-+b032q0cSJ82t6F2*#kTsUj&CZX#}X*GwhMyQi=zA1Fgs%|98Y$x z$!>ER;yAwJO3bQMuqfVPuYk0rvW-yCE6|o*62ux~V_mCs$6kIM%?5+V^D~{A^Xiiw ziCPKiJaJcl{_T!X<4g2%H+ikReM`oRp)a@mAYOGqUD32|A@Dly zF<*8&x!2N&+DAoQ_w<5Ba8i|>EgKK?&K9_WJtW)yRO+-|=F?+MiCn+E1wP}p5J8&R z@d`Ym^J5QK<+prXp`!iB;2zD|@StC@Xz6gs!@mi+2_c!UGg4L`3`T)xXIjdlm0@>L z=10YwdAMA9PNQ)!hgHFER;H`SEMdM`-MR=Hq1$nfv2)wmwYe4?4PUO>$L(Sq?ZrC$ z^lp4SE}?C;mX$Y!`BN!uH=Yc3RwdW}mh6+sKfP!_ho%qq^C$$YaBAP&tXK(d+p#xE+ zZez+plTXr8(^++K7?+XqS=tR=^tsx_5qh$%1=|606&Y zgK}?Lciy{MwN+P9VRwOVx1kFJgGCT#NARPoGVOcajDcI~^KPufeidXuSDLz7iK! zwF`KinA`4hIlKZjvy0}WS+!*4&s79q*m>QZz`Mop-Bo>EE3*MT*U4&g*3wCylez$= zF2mghL!k95cjri}@~GaPD|;qhJMD|vd&B@FEFX-e*6aBlK3~lw$<8m8su&62s6j%T zz}32v8!zn}fT}P;H~eK(qb8BZ-HPXF3+iThK#1AN_=7|@5gK5p$7jES{QRdo&&DNQ zn4OMTF3kK5eJ*Sk2LY|!unsgs;DEjIG}sM&26 z?nY4`m>u~NR4&%^N|x%c%a(V{dT#uvN|B7*>r7_=SkgUU6JeK|k&|QVMZD!_r8u*? zs<0`K(aEgowqU$`^^zyBgibm5)G4+1#UuIdiCQ<^HmiQYY}OW{Ry28@d#v0ufJu*C zzGLg1KE4?YvuZMY^s~xl2yC^nh>RMq+!c)a4dJ1EUeeB;^{J;p<#_AWdwp6CJ$aQe z;*bWHK%F1??B_gQ`eIR0^DvtpBucunv^Va$zx&kSCjB~`P7PW>pG)h~??tuEKM>uF7zWlRDUTGy=~bZd6(o`&&jSOoY1W&Bi0TSziHT z+SCCdsLo5*MoZpqIdOm9V;z0xl(-dQQ^7Jo0s%mGz_imN3%0unYzNVUT$mf{Ou7ey z)sZ3zj#5P{Sx+!^fIf*BuD|lcVmiPz**EdG7wmmeqwtncwv8rj|} zchT!m0pTYReZ&FazJbJp`<`z7-1JzG!ho0CzB{QqO3~7U!7>}-qe=elze;me?P;~; zd5@y&wH@y9sM|WV19pE}xV1x7=&efwv0XaIt&95c6mTpiPoiFFMngltUazSltZ>a{ zo69*E&0~^m6;J8g5{XYaZKJebx%>e${NtLgmU_(*a0fdvEvv$O*?gIXc1{N zV;GZimABnr%y+x&Vc$m=UThHs8#-T{Gdx(bpl#ALh3WCa;*snHCmRP)Y19b3aM<)d zy=-;`p8M`g(Q`!1Sk)150TZo})C%$d;xPL(dIi zbGzY~n|p`%G2O}K=S8`^Z&VrjZneSD?LP6>^`$i!GYZ;@dts}f6q^gpB34DYjX5<6 z#CSPipRKn@LS?*MDF(w%*=F^N(QiZ)P3)eh>d0=y(tbJCsAbi7e%8s%^(Jq!o4)ok zOpi%?GCeUM*2T|`X0Q==gYk|u)Kj{w&bcm)>$_G&6+yvx^!nPW9vs`Cg^b78n4To> zUIY+R?{X?v6nTKjqgEN30oTiCpZ#3b7CR*!U7v5rDBzcc%A6M5wK8N4kRnI8f!D+ZC z1?GD9@uw5ti$B=h6sCRra~G^nb4QMkyb<&dM7z`-+x_4U1xg$a7sF)S=x~)RQ&yV> zoAn=*F(9)WY=0xSC!yhRGv*jN$XTd@oamea)w*?bZ!{vb|6rVx{Um z*1$l&%~ydw)mjiD2&YW|#(lDL#MJS4e%Y~SGJWA$2U4pYIOr<^_VqXD)~W<T7M6>?HKqu-d(~&Jw#^NiNoLt|)aZ^c!)vJbxFH>n+y_ zLB2g1&BehmbY$-e>({zDj##gh%Zg2Xx8P6kd>qcSl z9+;_m_w+pNWm380>YN}}v>+{?@$t*&XU*jyg!XJkAr(x)5n;HHBPT3J6Y7Ft6fUMwayh zJs{dLTwuuk|31Up$CBkWl2vYR=l->qg5%#1>2Cl)yPipLtzg98+?V{5oans6Pi#5w%nwfe4WYlm zb*e##LrZd!Z~prYypgT*hm_d<{M4_pyIt=1`)#~1RGs-u1PtycHfIIkkz9Yx-uAXD3LvFpqPiOUc zCf_$dYGNab(i8{68UFiMUw^ytr>}=$+9Ta2$lOb9KV-i5q4>Q8{^aBze*eYA-#hYW z*Z*lw;kP_O18aUO`k5Ip;;?gp3-JSt&4fkD9Bgy5{jGn4%qA`>uY)%+W9-G|^YPrC z1uJ&ba&?rEKb`b+_+U6HtO`qHiM`JIFl%=TqtSzcgMmYd%YE}G5n1BWHsG-ozWAJU zTZqKWYpIY3-IC_}_B2S$SD5&=vT7&;MWOa=mfc;>IZPV5~1HEQYzK6+rUt98KC-lAK3clpAZ-Vezb>fzwWFIAmM??k!F z&2WwutlgGI4y3&AwRd{rAHqc1&o+uOv)PG4ZJd0iig=qcjozH~Ywh}seUt|3lCq|L ziOrS8!|Ij)2chE+v2*=GGwm<*&z}r|;|)A>&HTBJOrOs0Pv-gOIt)!0`)}&%SQ;_@ zx(;ulf7h2l@%G=!rC}-ESbyD@Xx9I}Z?l!=%YMowvElhn{3EkA9c8`_oo3r2>ZsMU z+bxX148yb?48JyT(=iFOZKJ(^E^+!Q{F3Z*|8cUfTFuyA8dZ`E_}}II%3oi|^mz?4 p-;qW9N`qRMWWFO+dM|nZ;-o(Rv7?PR`LPH@t3`Yj3gb2R^RQ3P> literal 0 HcmV?d00001 diff --git a/packages/markitdown/tests/test_module_misc.py b/packages/markitdown/tests/test_module_misc.py index 4d62e4919..fff9ef614 100644 --- a/packages/markitdown/tests/test_module_misc.py +++ b/packages/markitdown/tests/test_module_misc.py @@ -3,7 +3,9 @@ import os import re import shutil +import subprocess import pytest +import sys from unittest.mock import MagicMock from markitdown._uri_utils import parse_data_uri, file_uri_to_path @@ -107,6 +109,16 @@ def validate_strings(result, expected_strings, exclude_strings=None): assert string not in text_content +def _has_pdf_dependencies() -> bool: + try: + import pdfminer # noqa: F401 + import pdfplumber # noqa: F401 + + return True + except ModuleNotFoundError: + return False + + def test_stream_info_operations() -> None: """Test operations performed on StreamInfo objects.""" @@ -220,6 +232,71 @@ def test_data_uris() -> None: assert data == b"Hello, World!" +@pytest.mark.skipif( + not _has_pdf_dependencies(), + reason="PDF optional dependencies not installed", +) +def test_pdf_extract_images_to_markdown(tmp_path) -> None: + pdf_path = os.path.join(TEST_FILES_DIR, "pdf_image_middle.pdf") + images_dir = tmp_path / "images" + + result = MarkItDown().convert( + pdf_path, + extract_images=True, + images_dir=str(images_dir), + images_rel_dir="images", + ) + + markdown = result.markdown + assert "Here is some introductory text." in markdown + assert "![image_1](images/image_1." in markdown + assert "Section 2: Details" in markdown + assert ( + markdown.index("Here is some introductory text.") + < markdown.index("![image_1](images/image_1.") + < markdown.index("Section 2: Details") + ) + + image_files = list(images_dir.glob("image_1.*")) + assert len(image_files) == 1 + assert image_files[0].stat().st_size > 0 + + +@pytest.mark.skipif( + not _has_pdf_dependencies(), + reason="PDF optional dependencies not installed", +) +def test_cli_pdf_extract_images_uses_timestamped_dir(tmp_path) -> None: + pdf_path = os.path.join(TEST_FILES_DIR, "pdf_image_middle.pdf") + output_path = tmp_path / "out.md" + + result = subprocess.run( + [ + sys.executable, + "-m", + "markitdown", + pdf_path, + "-o", + str(output_path), + "--extract-images", + "--images-dir", + "assets", + ], + capture_output=True, + text=True, + ) + + assert result.returncode == 0, result.stderr + markdown = output_path.read_text(encoding="utf-8") + image_dirs = list(tmp_path.glob("assets_*")) + assert len(image_dirs) == 1 + assert image_dirs[0].is_dir() + assert f"![image_1]({image_dirs[0].name}/image_1." in markdown + image_files = list(image_dirs[0].glob("image_1.*")) + assert len(image_files) == 1 + assert image_files[0].stat().st_size > 0 + + def test_file_uris() -> None: # Test file URI with an empty host file_uri = "file:///path/to/file.txt"