diff --git a/packages/markitdown/pyproject.toml b/packages/markitdown/pyproject.toml
index d4c20a402..ae9c3dd54 100644
--- a/packages/markitdown/pyproject.toml
+++ b/packages/markitdown/pyproject.toml
@@ -35,6 +35,7 @@ dependencies = [
 [project.optional-dependencies]
 all = [
   "python-pptx",
+  "unword",
   "mammoth~=1.11.0",
   "pandas",
   "openpyxl",
@@ -51,6 +52,7 @@ all = [
   "azure-identity",
 ]
 pptx = ["python-pptx"]
+doc = ["unword"]
 docx = ["mammoth~=1.11.0", "lxml"]
 xlsx = ["pandas", "openpyxl"]
 xls = ["pandas", "xlrd"]
diff --git a/packages/markitdown/src/markitdown/_markitdown.py b/packages/markitdown/src/markitdown/_markitdown.py
index f6aa4df0e..b351f4d4c 100644
--- a/packages/markitdown/src/markitdown/_markitdown.py
+++ b/packages/markitdown/src/markitdown/_markitdown.py
@@ -29,6 +29,7 @@
     BingSerpConverter,
     PdfConverter,
     DocxConverter,
+    DocConverter,
     XlsxConverter,
     XlsConverter,
     PptxConverter,
@@ -193,6 +194,7 @@ def enable_builtins(self, **kwargs) -> None:
             self.register_converter(YouTubeConverter())
             self.register_converter(BingSerpConverter())
             self.register_converter(DocxConverter())
+            self.register_converter(DocConverter())
             self.register_converter(XlsxConverter())
             self.register_converter(XlsConverter())
             self.register_converter(PptxConverter())
diff --git a/packages/markitdown/src/markitdown/converters/__init__.py b/packages/markitdown/src/markitdown/converters/__init__.py
index 77f8b1acd..46af968d7 100644
--- a/packages/markitdown/src/markitdown/converters/__init__.py
+++ b/packages/markitdown/src/markitdown/converters/__init__.py
@@ -11,6 +11,7 @@
 from ._bing_serp_converter import BingSerpConverter
 from ._pdf_converter import PdfConverter
 from ._docx_converter import DocxConverter
+from ._doc_converter import DocConverter
 from ._xlsx_converter import XlsxConverter, XlsConverter
 from ._pptx_converter import PptxConverter
 from ._image_converter import ImageConverter
@@ -38,6 +39,7 @@
     "BingSerpConverter",
     "PdfConverter",
     "DocxConverter",
+    "DocConverter",
     "XlsxConverter",
     "XlsConverter",
     "PptxConverter",
diff --git a/packages/markitdown/src/markitdown/converters/_doc_converter.py b/packages/markitdown/src/markitdown/converters/_doc_converter.py
new file mode 100644
index 000000000..ed05892f4
--- /dev/null
+++ b/packages/markitdown/src/markitdown/converters/_doc_converter.py
@@ -0,0 +1,110 @@
+# SPDX-FileCopyrightText: 2024-present Adam Fourney
+#
+# SPDX-License-Identifier: MIT
+
+import sys
+from typing import BinaryIO, Any
+
+from .._base_converter import DocumentConverter, DocumentConverterResult
+from .._stream_info import StreamInfo
+from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE
+
+# Try loading optional (but in this case, required) dependencies
+# Save reporting of any exceptions for later
+_dependency_exc_info = None
+try:
+    import unword
+except ImportError:
+    # Preserve the error and stack trace for later
+    _dependency_exc_info = sys.exc_info()
+
+ACCEPTED_MIME_TYPE_PREFIXES = [
+    "application/msword",
+    "application/vnd.ms-word",
+]
+
+# Macro-enabled OOXML documents (.docm/.dotm) are ZIP packages, not OLE files,
+# but their MIME types also begin with "application/vnd.ms-word".
+REJECTED_MIME_TYPE_PREFIXES = [
+    "application/vnd.ms-word.document.macroenabled",
+    "application/vnd.ms-word.template.macroenabled",
+]
+
+ACCEPTED_FILE_EXTENSIONS = [".doc"]
+
+
+class DocConverter(DocumentConverter):
+    """
+    Converts legacy DOC files (OLE/CFB format) to Markdown using the unword library.
+    No external dependencies such as LibreOffice or MS Word are required.
+    """
+
+    def accepts(
+        self,
+        file_stream: BinaryIO,
+        stream_info: StreamInfo,
+        **kwargs: Any,  # Options to pass to the converter
+    ) -> bool:
+        """
+        Decide from the stream metadata whether this is a legacy DOC file.
+
+        Only `stream_info` is consulted (extension first, then MIME type);
+        `file_stream` is not read.
+        """
+        mimetype = (stream_info.mimetype or "").lower()
+        extension = (stream_info.extension or "").lower()
+
+        if extension in ACCEPTED_FILE_EXTENSIONS:
+            return True
+
+        # Without this check the broad "application/vnd.ms-word" prefix would
+        # also claim macro-enabled OOXML files that cannot be parsed as DOC.
+        if mimetype.startswith(tuple(REJECTED_MIME_TYPE_PREFIXES)):
+            return False
+
+        return mimetype.startswith(tuple(ACCEPTED_MIME_TYPE_PREFIXES))
+
+    def convert(
+        self,
+        file_stream: BinaryIO,
+        stream_info: StreamInfo,
+        **kwargs: Any,  # Options to pass to the converter
+    ) -> DocumentConverterResult:
+        """
+        Read the whole stream, parse it with unword, and return its text.
+
+        The body text comes first, followed by each textbox; non-empty pieces
+        are joined with a blank line between them.
+
+        Raises:
+            MissingDependencyException: if the `unword` package failed to import.
+        """
+        # Check: the dependencies
+        if _dependency_exc_info is not None:
+            raise MissingDependencyException(
+                MISSING_DEPENDENCY_MESSAGE.format(
+                    converter=type(self).__name__,
+                    extension=".doc",
+                    feature="doc",
+                )
+            ) from _dependency_exc_info[
+                1
+            ].with_traceback(  # type: ignore[union-attr]
+                _dependency_exc_info[2]
+            )
+
+        data = file_stream.read()
+        doc = unword.parse_doc(data)
+
+        # Combine body text and textbox content. Every piece is stripped
+        # before the emptiness test so whitespace-only text (or a missing
+        # entry) never contributes a blank section to the output.
+        markdown_parts = []
+        for raw_text in (doc.body_text, *(doc.textboxes or ())):
+            text = (raw_text or "").strip()
+            if text:
+                markdown_parts.append(text)
+
+        markdown = "\n\n".join(markdown_parts)
+
+        return DocumentConverterResult(markdown=markdown)
diff --git a/packages/markitdown/tests/_test_vectors.py b/packages/markitdown/tests/_test_vectors.py
index 74fa9bd0a..144f2994a 100644
--- a/packages/markitdown/tests/_test_vectors.py
+++ b/packages/markitdown/tests/_test_vectors.py
@@ -31,6 +31,20 @@ class FileTestVector(object):
             "data:image/png;base64,iVBORw0KGgoAAAANSU",
         ],
     ),
+    FileTestVector(
+        filename="test.doc",
+        mimetype="application/msword",
+        charset=None,
+        url=None,
+        must_include=[
+            "314b0a30-5b04-470b-b9f7-eed2c2bec74a",
+            "d666f1f7-46cb-42bd-9a39-9a39cf2a509f",
+            "# Abstract",
+            "# Introduction",
+            "AutoGen: Enabling Next-Gen LLM Applications via Multi-Agent Conversation",
+        ],
+        must_not_include=[],
+    ),
     FileTestVector(
         filename="test.xlsx",
         mimetype="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
diff --git a/packages/markitdown/tests/test_files/test.doc b/packages/markitdown/tests/test_files/test.doc
new file mode 100644
index 000000000..ab60db59d
Binary files /dev/null and b/packages/markitdown/tests/test_files/test.doc differ
diff --git a/packages/markitdown/tests/test_files/test_sample.doc b/packages/markitdown/tests/test_files/test_sample.doc
new file mode 100644
index 000000000..9423c5a41
Binary files /dev/null and b/packages/markitdown/tests/test_files/test_sample.doc differ