diff --git a/packages/markitdown/src/markitdown/converter_utils/pdf/__init__.py b/packages/markitdown/src/markitdown/converter_utils/pdf/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/packages/markitdown/src/markitdown/converter_utils/pdf/cid_decoder.py b/packages/markitdown/src/markitdown/converter_utils/pdf/cid_decoder.py
new file mode 100644
index 000000000..b8b09d9d5
--- /dev/null
+++ b/packages/markitdown/src/markitdown/converter_utils/pdf/cid_decoder.py
@@ -0,0 +1,119 @@
+"""Resolve pdfminer ``(cid:N)`` tokens in extracted PDF text to Unicode.
+
+Two stages:
+
+* :func:`build_cid_map` makes a single low-level pdfminer pass over the document,
+  reading each unmapped glyph's ``fontname`` so the correct per-font encoding table
+  is used (see :mod:`.cid_fonts`). The result is a document-specific
+  ``cid -> Unicode`` map; a cid that resolves to conflicting glyphs across fonts in
+  the same document is dropped (ambiguous -> left for the fallback path).
+
+* :func:`decode_cids` substitutes those tokens in the already-extracted markdown.
+  Tokens are grouped into *clusters* (a cluster ≈ one math expression). Each cluster
+  gets a confidence score = resolved / total tokens; clusters below the threshold are
+  wrapped in an HTML comment instead of being partly mistranslated, so downstream
+  consumers still see the raw signal.
+"""
+
+import re
+from typing import BinaryIO
+
+from .cid_fonts import lookup
+
+# Matches a pdfminer unmapped-glyph token, e.g. "(cid:88)".
+_CID_TOKEN = re.compile(r"\(cid:(\d+)\)")
+# Same, anchored, for testing a single LTChar's rendered text.
+_CID_EXACT = re.compile(r"^\(cid:(\d+)\)$")
+
+# Max length of intervening (non-blank-line) text for two cid tokens to count as
+# the same cluster. Formula fragments like "Q−1 (Q(x))" sit between delimiters.
+_MAX_CLUSTER_GAP = 40
+
+
+def build_cid_map(pdf_bytes: BinaryIO) -> dict[int, str]:
+    """Build a document-specific ``cid -> Unicode`` map via a font-aware pass.
+
+    Reads ``LTChar.fontname`` for every unmapped ``(cid:N)`` glyph and resolves it
+    through the per-font tables. Conflicting resolutions for the same cid are
+    discarded so they are never guessed.
+    """
+    from pdfminer.high_level import extract_pages
+    from pdfminer.layout import LTChar
+
+    pdf_bytes.seek(0)
+
+    candidates: dict[int, set[str]] = {}
+
+    def walk(obj: object) -> None:
+        for element in obj:  # type: ignore[attr-defined]
+            if isinstance(element, LTChar):
+                match = _CID_EXACT.match(element.get_text())
+                if match:
+                    cid = int(match.group(1))
+                    resolved = lookup(element.fontname, cid)
+                    if resolved is not None:
+                        candidates.setdefault(cid, set()).add(resolved)
+            if hasattr(element, "__iter__"):
+                walk(element)
+
+    for page in extract_pages(pdf_bytes):
+        walk(page)
+
+    # Keep only unambiguous resolutions.
+    return {
+        cid: next(iter(chars)) for cid, chars in candidates.items() if len(chars) == 1
+    }
+
+
+def _same_cluster(gap: str) -> bool:
+    """True if the text between two cid tokens keeps them in one formula cluster."""
+    if "\n\n" in gap:  # a blank line ends the formula
+        return False
+    return len(gap.strip()) <= _MAX_CLUSTER_GAP
+
+
+def decode_cids(
+    markdown: str,
+    cid_map: dict[int, str],
+    *,
+    confidence_threshold: float = 0.6,
+) -> str:
+    """Replace ``(cid:N)`` tokens in *markdown* using *cid_map*.
+
+    High-confidence clusters are substituted (unresolved tokens within them are left
+    untouched). Clusters whose resolved ratio falls below *confidence_threshold* are
+    wrapped whole in an HTML comment (``<!-- ... -->``) rather than partially decoded.
+    """
+    matches = list(_CID_TOKEN.finditer(markdown))
+    if not matches:
+        return markdown
+
+    # Group token matches into clusters by proximity.
+    clusters: list[list[re.Match[str]]] = [[matches[0]]]
+    for match in matches[1:]:
+        gap = markdown[clusters[-1][-1].end() : match.start()]
+        if _same_cluster(gap):
+            clusters[-1].append(match)
+        else:
+            clusters.append([match])
+
+    out: list[str] = []
+    last = 0
+    for cluster in clusters:
+        start, end = cluster[0].start(), cluster[-1].end()
+        out.append(markdown[last:start])
+        span = markdown[start:end]
+
+        resolved = sum(1 for m in cluster if int(m.group(1)) in cid_map)
+        confidence = resolved / len(cluster)
+
+        if confidence >= confidence_threshold:
+            out.append(
+                _CID_TOKEN.sub(lambda m: cid_map.get(int(m.group(1)), m.group(0)), span)
+            )
+        else:  # keep the raw span (tokens + text between them) inside a comment
+            out.append(f"<!-- {span.replace('-->', '--&gt;')} -->")
+        last = end
+
+    out.append(markdown[last:])
+    return "".join(out)
diff --git a/packages/markitdown/src/markitdown/converter_utils/pdf/cid_fonts.py b/packages/markitdown/src/markitdown/converter_utils/pdf/cid_fonts.py
new file mode 100644
index 000000000..a18046c25
--- /dev/null
+++ b/packages/markitdown/src/markitdown/converter_utils/pdf/cid_fonts.py
@@ -0,0 +1,941 @@
+"""Font-keyed CID -> Unicode tables for LaTeX (Computer Modern / Latin Modern) PDFs.
+
+When a PDF embeds a math font without a ``ToUnicode`` CMap, pdfminer emits the
+glyph as the literal token ``(cid:N)``, where ``N`` is the glyph's code in the
+font's *native* encoding -- not Unicode. The same code means different glyphs in
+different fonts (e.g. code 12 is ``|`` in CMEX10 but ``circledot`` in CMSY10), so
+resolution must be keyed by font name.
+
+Tables are expressed as ``code -> glyph-name`` (the native font encoding) plus a
+shared ``glyph-name -> Unicode`` map. The ``code -> glyph-name`` encodings are the
+complete codes 0-127 of cmex10 / cmsy10 / cmmi10, taken verbatim from the
+Computer Modern AFM metrics (the authoritative source). These native encodings are
+stable across documents -- unlike subset *CID* fonts, CM/LM Type1 math fonts are
+not renumbered per document -- so a static table is reliable.
+ +Extensible delimiter pieces map so that a multi-row delimiter collapses to a single +character: the top cap carries the base glyph and the extender/bottom pieces map to +``""``. Glyphs with no clean single-codepoint Unicode (wide accents, brace tips) are +mapped to ``""``; any code not resolvable falls through to the decoder's +confidence/fallback path rather than being mistranslated. +""" + +import string + +# --- glyph name -> Unicode ------------------------------------------------- + +_GLYPH_TO_UNICODE: dict[str, str] = { + # delimiters (base; size variants are stripped to these by _build) + "parenleft": "(", + "parenright": ")", + "bracketleft": "[", + "bracketright": "]", + "braceleft": "{", + "braceright": "}", + "floorleft": "⌊", + "floorright": "⌋", + "ceilingleft": "⌈", + "ceilingright": "⌉", + "angbracketleft": "⟨", + "angbracketright": "⟩", + "slash": "/", + "backslash": "\\", + "bar": "|", + "bardbl": "‖", + "vextendsingle": "|", + "vextenddouble": "‖", + # big operators + "summation": "∑", + "product": "∏", + "coproduct": "∐", + "integral": "∫", + "contintegral": "∮", + "union": "∪", + "intersection": "∩", + "unionsq": "⊔", + "intersectionsq": "⊓", + "unionmulti": "⊎", + "logicaland": "⋀", + "logicalor": "⋁", + "radical": "√", + "circledot": "⊙", + "circleplus": "⊕", + "circlemultiply": "⊗", + "circleminus": "⊖", + "circledivide": "⊘", + "circlecopyrt": "©", + "openbullet": "◦", + "bullet": "•", + # uppercase Greek + "Gamma": "Γ", + "Delta": "Δ", + "Theta": "Θ", + "Lambda": "Λ", + "Xi": "Ξ", + "Pi": "Π", + "Sigma": "Σ", + "Upsilon": "Υ", + "Phi": "Φ", + "Psi": "Ψ", + "Omega": "Ω", + # lowercase Greek + "alpha": "α", + "beta": "β", + "gamma": "γ", + "delta": "δ", + "epsilon": "ϵ", + "zeta": "ζ", + "eta": "η", + "theta": "θ", + "iota": "ι", + "kappa": "κ", + "lambda": "λ", + "mu": "μ", + "nu": "ν", + "xi": "ξ", + "pi": "π", + "rho": "ρ", + "sigma": "σ", + "tau": "τ", + "upsilon": "υ", + "phi": "ϕ", + "chi": "χ", + "psi": "ψ", + "omega": "ω", + 
"epsilon1": "ε", + "theta1": "ϑ", + "pi1": "ϖ", + "rho1": "ϱ", + "sigma1": "ς", + "phi1": "φ", + # symbols / relations / arrows + "minus": "−", + "periodcentered": "·", + "multiply": "×", + "asteriskmath": "∗", + "divide": "÷", + "diamondmath": "⋄", + "plusminus": "±", + "minusplus": "∓", + "equivasymptotic": "≍", + "equivalence": "≡", + "reflexsubset": "⊆", + "reflexsuperset": "⊇", + "lessequal": "≤", + "greaterequal": "≥", + "precedesequal": "⪯", + "followsequal": "⪰", + "similar": "∼", + "approxequal": "≈", + "propersubset": "⊂", + "propersuperset": "⊃", + "lessmuch": "≪", + "greatermuch": "≫", + "precedes": "≺", + "follows": "≻", + "arrowleft": "←", + "arrowright": "→", + "arrowup": "↑", + "arrowdown": "↓", + "arrowboth": "↔", + "arrownortheast": "↗", + "arrowsoutheast": "↘", + "arrownorthwest": "↖", + "arrowsouthwest": "↙", + "similarequal": "≃", + "arrowdblleft": "⇐", + "arrowdblright": "⇒", + "arrowdblup": "⇑", + "arrowdbldown": "⇓", + "arrowdblboth": "⇔", + "arrowbothv": "↕", + "arrowdblbothv": "⇕", + "proportional": "∝", + "prime": "′", + "infinity": "∞", + "element": "∈", + "owner": "∋", + "triangle": "△", + "triangleinv": "▽", + "triangleright": "▹", + "triangleleft": "◃", + "negationslash": "̸", # combining long solidus overlay; renders on preceding glyph + "mapsto": "↦", + "universal": "∀", + "existential": "∃", + "logicalnot": "¬", + "emptyset": "∅", + "Rfractur": "ℜ", + "Ifractur": "ℑ", + "latticetop": "⊤", + "perpendicular": "⊥", + "aleph": "ℵ", + "turnstileleft": "⊢", + "turnstileright": "⊣", + "wreathproduct": "≀", + "nabla": "∇", + "subsetsqequal": "⊑", + "supersetsqequal": "⊒", + "section": "§", + "dagger": "†", + "daggerdbl": "‡", + "paragraph": "¶", + "club": "♣", + "diamond": "♦", + "heart": "♥", + "spade": "♠", + "partialdiff": "∂", + "star": "⋆", + "flat": "♭", + "natural": "♮", + "sharp": "♯", + "lscript": "ℓ", + "weierstrass": "℘", + "dotlessi": "ı", + "dotlessj": "ȷ", + "period": ".", + "comma": ",", + "less": "<", + "greater": ">", + 
"arrowhookleft": "↩", + "arrowhookright": "↪", + # half arrows (harpoons) + "arrowlefttophalf": "↼", + "arrowleftbothalf": "↽", + "arrowrighttophalf": "⇀", + "arrowrightbothalf": "⇁", + # extensible delimiter pieces: visible top cap -> base char, others dropped + "parenlefttp": "(", + "parenrighttp": ")", + "parenleftbt": "", + "parenrightbt": "", + "parenleftex": "", + "parenrightex": "", + "bracketlefttp": "[", + "bracketrighttp": "]", + "bracketleftbt": "", + "bracketrightbt": "", + "bracketleftex": "", + "bracketrightex": "", + "bracelefttp": "{", + "bracerighttp": "}", + "braceleftbt": "", + "bracerightbt": "", + "braceleftmid": "", + "bracerightmid": "", + "braceex": "", + "radicalbt": "√", + "radicaltp": "", + "radicalvertex": "", + "arrowtp": "↑", + "arrowbt": "↓", + "arrowvertex": "", + "arrowvertexdbl": "", + "arrowdbltp": "⇑", + "arrowdblbt": "⇓", + # accents / brace tips with no clean single codepoint -> dropped + "bracehtipdownleft": "", + "bracehtipdownright": "", + "bracehtipupleft": "", + "bracehtipupright": "", + "hatwide": "", + "hatwider": "", + "hatwidest": "", + "tildewide": "", + "tildewider": "", + "tildewidest": "", + "vector": "", # \vec accent: dropped, base letter is emitted separately + "tie": "", + "slurbelow": "", + "slurabove": "", +} + +# math-italic / calligraphic letters and old-style digits -> plain ASCII +_GLYPH_TO_UNICODE.update({c: c for c in string.ascii_letters}) +_GLYPH_TO_UNICODE.update( + { + name: digit + for name, digit in zip( + ( + "zerooldstyle", + "oneoldstyle", + "twooldstyle", + "threeoldstyle", + "fouroldstyle", + "fiveoldstyle", + "sixoldstyle", + "sevenoldstyle", + "eightoldstyle", + "nineoldstyle", + ), + "0123456789", + ) + } +) + + +# --- native font encodings: code -> glyph name (cmex10/cmsy10/cmmi10, codes 0-127, +# verbatim from the Computer Modern AFM metrics) ---------------------------- + +_CMEX10_CODES: dict[int, str] = { + 0: "parenleftbig", + 1: "parenrightbig", + 2: "bracketleftbig", + 3: 
"bracketrightbig", + 4: "floorleftbig", + 5: "floorrightbig", + 6: "ceilingleftbig", + 7: "ceilingrightbig", + 8: "braceleftbig", + 9: "bracerightbig", + 10: "angbracketleftbig", + 11: "angbracketrightbig", + 12: "vextendsingle", + 13: "vextenddouble", + 14: "slashbig", + 15: "backslashbig", + 16: "parenleftBig", + 17: "parenrightBig", + 18: "parenleftbigg", + 19: "parenrightbigg", + 20: "bracketleftbigg", + 21: "bracketrightbigg", + 22: "floorleftbigg", + 23: "floorrightbigg", + 24: "ceilingleftbigg", + 25: "ceilingrightbigg", + 26: "braceleftbigg", + 27: "bracerightbigg", + 28: "angbracketleftbigg", + 29: "angbracketrightbigg", + 30: "slashbigg", + 31: "backslashbigg", + 32: "parenleftBigg", + 33: "parenrightBigg", + 34: "bracketleftBigg", + 35: "bracketrightBigg", + 36: "floorleftBigg", + 37: "floorrightBigg", + 38: "ceilingleftBigg", + 39: "ceilingrightBigg", + 40: "braceleftBigg", + 41: "bracerightBigg", + 42: "angbracketleftBigg", + 43: "angbracketrightBigg", + 44: "slashBigg", + 45: "backslashBigg", + 46: "slashBig", + 47: "backslashBig", + 48: "parenlefttp", + 49: "parenrighttp", + 50: "bracketlefttp", + 51: "bracketrighttp", + 52: "bracketleftbt", + 53: "bracketrightbt", + 54: "bracketleftex", + 55: "bracketrightex", + 56: "bracelefttp", + 57: "bracerighttp", + 58: "braceleftbt", + 59: "bracerightbt", + 60: "braceleftmid", + 61: "bracerightmid", + 62: "braceex", + 63: "arrowvertex", + 64: "parenleftbt", + 65: "parenrightbt", + 66: "parenleftex", + 67: "parenrightex", + 68: "angbracketleftBig", + 69: "angbracketrightBig", + 70: "unionsqtext", + 71: "unionsqdisplay", + 72: "contintegraltext", + 73: "contintegraldisplay", + 74: "circledottext", + 75: "circledotdisplay", + 76: "circleplustext", + 77: "circleplusdisplay", + 78: "circlemultiplytext", + 79: "circlemultiplydisplay", + 80: "summationtext", + 81: "producttext", + 82: "integraltext", + 83: "uniontext", + 84: "intersectiontext", + 85: "unionmultitext", + 86: "logicalandtext", + 87: "logicalortext", + 
88: "summationdisplay", + 89: "productdisplay", + 90: "integraldisplay", + 91: "uniondisplay", + 92: "intersectiondisplay", + 93: "unionmultidisplay", + 94: "logicalanddisplay", + 95: "logicalordisplay", + 96: "coproducttext", + 97: "coproductdisplay", + 98: "hatwide", + 99: "hatwider", + 100: "hatwidest", + 101: "tildewide", + 102: "tildewider", + 103: "tildewidest", + 104: "bracketleftBig", + 105: "bracketrightBig", + 106: "floorleftBig", + 107: "floorrightBig", + 108: "ceilingleftBig", + 109: "ceilingrightBig", + 110: "braceleftBig", + 111: "bracerightBig", + 112: "radicalbig", + 113: "radicalBig", + 114: "radicalbigg", + 115: "radicalBigg", + 116: "radicalbt", + 117: "radicalvertex", + 118: "radicaltp", + 119: "arrowvertexdbl", + 120: "arrowtp", + 121: "arrowbt", + 122: "bracehtipdownleft", + 123: "bracehtipdownright", + 124: "bracehtipupleft", + 125: "bracehtipupright", + 126: "arrowdbltp", + 127: "arrowdblbt", +} + +_CMSY10_CODES: dict[int, str] = { + 0: "minus", + 1: "periodcentered", + 2: "multiply", + 3: "asteriskmath", + 4: "divide", + 5: "diamondmath", + 6: "plusminus", + 7: "minusplus", + 8: "circleplus", + 9: "circleminus", + 10: "circlemultiply", + 11: "circledivide", + 12: "circledot", + 13: "circlecopyrt", + 14: "openbullet", + 15: "bullet", + 16: "equivasymptotic", + 17: "equivalence", + 18: "reflexsubset", + 19: "reflexsuperset", + 20: "lessequal", + 21: "greaterequal", + 22: "precedesequal", + 23: "followsequal", + 24: "similar", + 25: "approxequal", + 26: "propersubset", + 27: "propersuperset", + 28: "lessmuch", + 29: "greatermuch", + 30: "precedes", + 31: "follows", + 32: "arrowleft", + 33: "arrowright", + 34: "arrowup", + 35: "arrowdown", + 36: "arrowboth", + 37: "arrownortheast", + 38: "arrowsoutheast", + 39: "similarequal", + 40: "arrowdblleft", + 41: "arrowdblright", + 42: "arrowdblup", + 43: "arrowdbldown", + 44: "arrowdblboth", + 45: "arrownorthwest", + 46: "arrowsouthwest", + 47: "proportional", + 48: "prime", + 49: "infinity", + 50: 
"element", + 51: "owner", + 52: "triangle", + 53: "triangleinv", + 54: "negationslash", + 55: "mapsto", + 56: "universal", + 57: "existential", + 58: "logicalnot", + 59: "emptyset", + 60: "Rfractur", + 61: "Ifractur", + 62: "latticetop", + 63: "perpendicular", + 64: "aleph", + 65: "A", + 66: "B", + 67: "C", + 68: "D", + 69: "E", + 70: "F", + 71: "G", + 72: "H", + 73: "I", + 74: "J", + 75: "K", + 76: "L", + 77: "M", + 78: "N", + 79: "O", + 80: "P", + 81: "Q", + 82: "R", + 83: "S", + 84: "T", + 85: "U", + 86: "V", + 87: "W", + 88: "X", + 89: "Y", + 90: "Z", + 91: "union", + 92: "intersection", + 93: "unionmulti", + 94: "logicaland", + 95: "logicalor", + 96: "turnstileleft", + 97: "turnstileright", + 98: "floorleft", + 99: "floorright", + 100: "ceilingleft", + 101: "ceilingright", + 102: "braceleft", + 103: "braceright", + 104: "angbracketleft", + 105: "angbracketright", + 106: "bar", + 107: "bardbl", + 108: "arrowbothv", + 109: "arrowdblbothv", + 110: "backslash", + 111: "wreathproduct", + 112: "radical", + 113: "coproduct", + 114: "nabla", + 115: "integral", + 116: "unionsq", + 117: "intersectionsq", + 118: "subsetsqequal", + 119: "supersetsqequal", + 120: "section", + 121: "dagger", + 122: "daggerdbl", + 123: "paragraph", + 124: "club", + 125: "diamond", + 126: "heart", + 127: "spade", +} + +_CMMI10_CODES: dict[int, str] = { + 0: "Gamma", + 1: "Delta", + 2: "Theta", + 3: "Lambda", + 4: "Xi", + 5: "Pi", + 6: "Sigma", + 7: "Upsilon", + 8: "Phi", + 9: "Psi", + 10: "Omega", + 11: "alpha", + 12: "beta", + 13: "gamma", + 14: "delta", + 15: "epsilon1", + 16: "zeta", + 17: "eta", + 18: "theta", + 19: "iota", + 20: "kappa", + 21: "lambda", + 22: "mu", + 23: "nu", + 24: "xi", + 25: "pi", + 26: "rho", + 27: "sigma", + 28: "tau", + 29: "upsilon", + 30: "phi", + 31: "chi", + 32: "psi", + 33: "omega", + 34: "epsilon", + 35: "theta1", + 36: "pi1", + 37: "rho1", + 38: "sigma1", + 39: "phi1", + 40: "arrowlefttophalf", + 41: "arrowleftbothalf", + 42: "arrowrighttophalf", + 43: 
"arrowrightbothalf", + 44: "arrowhookleft", + 45: "arrowhookright", + 46: "triangleright", + 47: "triangleleft", + 48: "zerooldstyle", + 49: "oneoldstyle", + 50: "twooldstyle", + 51: "threeoldstyle", + 52: "fouroldstyle", + 53: "fiveoldstyle", + 54: "sixoldstyle", + 55: "sevenoldstyle", + 56: "eightoldstyle", + 57: "nineoldstyle", + 58: "period", + 59: "comma", + 60: "less", + 61: "slash", + 62: "greater", + 63: "star", + 64: "partialdiff", + 65: "A", + 66: "B", + 67: "C", + 68: "D", + 69: "E", + 70: "F", + 71: "G", + 72: "H", + 73: "I", + 74: "J", + 75: "K", + 76: "L", + 77: "M", + 78: "N", + 79: "O", + 80: "P", + 81: "Q", + 82: "R", + 83: "S", + 84: "T", + 85: "U", + 86: "V", + 87: "W", + 88: "X", + 89: "Y", + 90: "Z", + 91: "flat", + 92: "natural", + 93: "sharp", + 94: "slurbelow", + 95: "slurabove", + 96: "lscript", + 97: "a", + 98: "b", + 99: "c", + 100: "d", + 101: "e", + 102: "f", + 103: "g", + 104: "h", + 105: "i", + 106: "j", + 107: "k", + 108: "l", + 109: "m", + 110: "n", + 111: "o", + 112: "p", + 113: "q", + 114: "r", + 115: "s", + 116: "t", + 117: "u", + 118: "v", + 119: "w", + 120: "x", + 121: "y", + 122: "z", + 123: "dotlessi", + 124: "dotlessj", + 125: "weierstrass", + 126: "vector", + 127: "tie", +} + +# Suffixes denoting graded-size variants of a base delimiter/operator glyph. +_SIZE_SUFFIXES = ("bigg", "Bigg", "big", "Big", "display", "text") + + +def _build(codes: dict[int, str]) -> dict[int, str]: + """Resolve a code->glyph-name table to code->Unicode. + + A glyph name is looked up directly, then (for graded-size variants such as + ``summationdisplay`` or ``parenleftBig``) with its size suffix stripped. Codes + whose glyph has no mapping are dropped, so they fall through to the decoder's + fallback path instead of being mistranslated. 
+ """ + table: dict[int, str] = {} + for code, glyph in codes.items(): + unicode_char = _GLYPH_TO_UNICODE.get(glyph) + if unicode_char is None: + for suffix in _SIZE_SUFFIXES: + if glyph.endswith(suffix): + unicode_char = _GLYPH_TO_UNICODE.get(glyph[: -len(suffix)]) + break + if unicode_char is not None: + table[code] = unicode_char + return table + + +CMEX = _build(_CMEX10_CODES) +CMMI = _build(_CMMI10_CODES) +CMSY = _build(_CMSY10_CODES) + +# AMS and LaTeX symbol fonts. These carry symbols (and MSBM's blackboard letters) +# with no shared base name, so they are authored as direct code -> Unicode tables +# (from the msam10/msbm10 AFM glyph names, and the gnuplot table for opaque LASY). +# Codes whose glyph has no clean single codepoint are omitted -> fallback path. + +# MSAM10 (AMS symbols "a"). +MSAM10: dict[int, str] = { + 0: "⊡", + 1: "⊞", + 2: "⊠", + 3: "□", + 4: "■", + 5: "▪", + 6: "◊", + 7: "◆", + 8: "↻", + 9: "↺", + 10: "⇋", + 11: "⇌", + 12: "⊟", + 13: "⊩", + 14: "⊪", + 15: "⊨", + 16: "↠", + 17: "↞", + 18: "⇚", + 19: "⇛", + 20: "⇈", + 21: "⇊", + 22: "⇀", + 23: "⇁", + 24: "↼", + 25: "↽", + 26: "↣", + 27: "↢", + 28: "⇄", + 29: "⇆", + 32: "⇝", + 33: "↭", + 34: "↶", + 35: "↷", + 36: "≗", + 37: "≽", + 38: "≳", + 39: "⪆", + 40: "⊸", + 41: "∴", + 42: "∵", + 45: "≼", + 46: "≲", + 47: "⪅", + 48: "⋜", + 49: "⋝", + 50: "⋞", + 51: "⋟", + 53: "≦", + 54: "⩽", + 55: "≶", + 61: "≧", + 62: "⩾", + 63: "≷", + 64: "⊏", + 65: "⊐", + 66: "▷", + 67: "◁", + 68: "⊵", + 69: "⊴", + 70: "⋆", + 71: "≬", + 77: "△", + 78: "▲", + 79: "▽", + 81: "⋚", + 82: "⋛", + 85: "¥", + 88: "✓", + 90: "⊼", + 92: "∠", + 93: "∡", + 94: "∢", + 95: "∝", + 96: "⌣", + 97: "⌢", + 98: "⋐", + 99: "⋑", + 100: "⋓", + 101: "⋒", + 106: "⫅", + 107: "⫆", + 110: "⋘", + 111: "⋙", + 116: "⋔", + 117: "∔", + 118: "∽", + 122: "✠", + 123: "∁", + 124: "⊺", + 125: "⊚", + 126: "⊛", + 127: "⊖", +} + +# MSBM10 (AMS symbols "b"): negated relations, blackboard bold, Hebrew letters. 
+MSBM10: dict[int, str] = { + 2: "≰", + 3: "≱", + 4: "≮", + 5: "≯", + 6: "⊀", + 7: "⊁", + 12: "≨", + 13: "≩", + 28: "≁", + 29: "≉", + 42: "⊈", + 43: "⊉", + 44: "∦", + 45: "∤", + 52: "⋭", + 53: "⋬", + 54: "⋪", + 55: "⋫", + 56: "↚", + 57: "↛", + 58: "⇍", + 59: "⇏", + 60: "⇎", + 61: "↮", + 63: "∅", + 64: "∄", + # Blackboard bold A-Z (codes 65-90). The seven letters that exist in the + # Letterlike Symbols block override the Mathematical Double-Struck block + # (U+1D538-U+1D551) -- those are exactly the reserved holes in that block. + 65: "𝔸", + 66: "𝔹", + 67: "ℂ", # U+2102 (exception) + 68: "𝔻", + 69: "𝔼", + 70: "𝔽", + 71: "𝔾", + 72: "ℍ", # U+210D (exception) + 73: "𝕀", + 74: "𝕁", + 75: "𝕂", + 76: "𝕃", + 77: "𝕄", + 78: "ℕ", # U+2115 (exception) + 79: "𝕆", + 80: "ℙ", # U+2119 (exception) + 81: "ℚ", # U+211A (exception) + 82: "ℝ", # U+211D (exception) + 83: "𝕊", + 84: "𝕋", + 85: "𝕌", + 86: "𝕍", + 87: "𝕎", + 88: "𝕏", + 89: "𝕐", + 90: "ℤ", # U+2124 (exception) + 103: "ð", + 104: "≂", + 105: "ℶ", + 106: "ℷ", + 107: "ℸ", + 108: "⋖", + 109: "⋗", + 115: "∼", + 116: "≈", + 122: "Ϝ", + 123: "ϰ", + 125: "ℏ", + 126: "ℏ", +} + +# LASY10 (LaTeX symbol font): opaque AFM names; mapped from the gnuplot table and +# the canonical lasy layout. Conservative -- only the well-known symbols. +LASY10: dict[int, str] = { + 0: "⊲", + 1: "⊳", + 2: "⊴", + 3: "⊵", + 4: "⋈", + 5: "□", + 6: "◇", + 7: "⇝", + 8: "⊏", + 9: "⊐", + 10: "℧", +} + +# Tables are keyed by *family*, not design size. A Computer Modern math font +# shares its encoding across every point size (CMSY10, CMSY8, CMSY7, ...) and +# with its Latin Modern equivalent (LMSY10, ...), so all collapse to one table. +FONT_TABLES: dict[str, dict[int, str]] = { + "CMEX": CMEX, + "CMMI": CMMI, + "CMSY": CMSY, + "MSAM": MSAM10, + "MSBM": MSBM10, + "LASY": LASY10, +} + +# Family prefix (after subset-prefix stripping) -> FONT_TABLES key. 
+_FAMILY_PREFIXES = { + "CMEX": "CMEX", + "LMEX": "CMEX", + "CMMI": "CMMI", # also matches CMMIB (bold math italic): same OML encoding + "LMMI": "CMMI", + "CMSY": "CMSY", + "LMSY": "CMSY", + "CMBSY": "CMSY", # bold math symbols: encoding identical to CMSY + "MSAM": "MSAM", + "MSBM": "MSBM", + "LASY": "LASY", +} + + +def normalize_fontname(fontname: str | None) -> str: + """Normalise a pdfminer fontname to a FONT_TABLES family key. + + Strips the 6-letter subset prefix (e.g. ``UXDKUK+CMEX10`` -> ``CMEX10``) and + maps any design-size/Latin-Modern variant to its family + (``CMSY8`` / ``LMSY10`` -> ``CMSY``). Returns ``""`` if no math family matches. + """ + if not fontname: + return "" + name = fontname.split("+", 1)[1] if "+" in fontname else fontname + name = name.upper() + for prefix, family in _FAMILY_PREFIXES.items(): + if name.startswith(prefix): + return family + return "" + + +def lookup(fontname: str | None, cid: int) -> str | None: + """Resolve a single ``(font, cid)`` to Unicode, or ``None`` if unknown.""" + table = FONT_TABLES.get(normalize_fontname(fontname)) + if table is None: + return None + return table.get(cid) diff --git a/packages/markitdown/src/markitdown/converters/_pdf_converter.py b/packages/markitdown/src/markitdown/converters/_pdf_converter.py index ffbcbd990..98ffa0447 100644 --- a/packages/markitdown/src/markitdown/converters/_pdf_converter.py +++ b/packages/markitdown/src/markitdown/converters/_pdf_converter.py @@ -6,6 +6,7 @@ from .._base_converter import DocumentConverter, DocumentConverterResult from .._stream_info import StreamInfo from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE +from ..converter_utils.pdf.cid_decoder import build_cid_map, decode_cids # Pattern for MasterFormat-style partial numbering (e.g., ".1", ".2", ".10") PARTIAL_NUMBERING_PATTERN = re.compile(r"^\.\d+$") @@ -586,4 +587,10 @@ def convert( # Post-process to merge MasterFormat-style partial numbering with following text markdown = 
_merge_partial_numbering_lines(markdown) + # Resolve (cid:N) tokens from LaTeX math fonts to Unicode. On by + # default; pass decode_cid=False to keep the raw pdfminer tokens. + if kwargs.get("decode_cid", True) and "(cid:" in markdown: + cid_map = build_cid_map(pdf_bytes) + markdown = decode_cids(markdown, cid_map) + return DocumentConverterResult(markdown=markdown) diff --git a/packages/markitdown/tests/_test_vectors.py b/packages/markitdown/tests/_test_vectors.py index 74fa9bd0a..d66db72fd 100644 --- a/packages/markitdown/tests/_test_vectors.py +++ b/packages/markitdown/tests/_test_vectors.py @@ -97,6 +97,14 @@ class FileTestVector(object): ], must_not_include=[], ), + FileTestVector( + filename="test_math_cid.pdf", + mimetype="application/pdf", + charset=None, + url=None, + must_include=["surveys spectral theory and variational calculus on Hilbert"], + must_not_include=["(cid:"], + ), FileTestVector( filename="test_blog.html", mimetype="text/html", diff --git a/packages/markitdown/tests/test_files/test_math_cid.pdf b/packages/markitdown/tests/test_files/test_math_cid.pdf new file mode 100644 index 000000000..f31940f17 Binary files /dev/null and b/packages/markitdown/tests/test_files/test_math_cid.pdf differ diff --git a/packages/markitdown/tests/test_pdf_cid.py b/packages/markitdown/tests/test_pdf_cid.py new file mode 100644 index 000000000..64d7cf828 --- /dev/null +++ b/packages/markitdown/tests/test_pdf_cid.py @@ -0,0 +1,140 @@ +#!/usr/bin/env python3 -m pytest +"""Tests for CID decoding of LaTeX math PDFs (on by default; decode_cid=False to opt out). + +LaTeX engines embed Computer Modern math glyphs without a ToUnicode CMap, so +pdfminer emits them as literal (cid:N) tokens. The decoder resolves them +font-aware. Fixtures: + +* test_math_cid.pdf - a pdflatex document whose math produces CMEX10 (cid:N). +* test.pdf - a clean Unicode PDF with no (cid:N) tokens (control). 
+ +The CMMI/CMSY tables are checked directly against the lookup tables, since a +freshly compiled document only emits (cid:N) for the CMEX delimiters/operators +(modern pdflatex attaches ToUnicode to the symbol and Greek fonts). +""" +import os + +import pytest + +from markitdown import MarkItDown +from markitdown.converter_utils.pdf.cid_fonts import lookup + +TEST_FILES_DIR = os.path.join(os.path.dirname(__file__), "test_files") +MATH_CID_PDF = os.path.join(TEST_FILES_DIR, "test_math_cid.pdf") +CLEAN_PDF = os.path.join(TEST_FILES_DIR, "test.pdf") + + +@pytest.fixture +def markitdown(): + return MarkItDown() + + +def test_math_cids_are_resolved(markitdown): + """decode_cid=True replaces every (cid:N) and yields the expected glyphs.""" + if not os.path.exists(MATH_CID_PDF): + pytest.skip(f"Test file not found: {MATH_CID_PDF}") + + result = markitdown.convert(MATH_CID_PDF, decode_cid=True) + text = result.text_content + + assert "(cid:" not in text + for glyph in ("∑", "∫", "√", "∏", "∪", "|", "⟨", "⟩"): + assert glyph in text, f"missing decoded glyph {glyph!r}" + + +def test_font_tables_resolve_cmsy_cmmi(): + """CMSY/CMMI tables map their glyphs, including point-size variants.""" + assert lookup("ABCDEF+CMMI10", 64) == "∂" # partialdiff + assert lookup("ABCDEF+CMMI10", 11) == "α" # alpha + assert lookup("ABCDEF+CMMI10", 15) == "ε" # epsilon1 (varepsilon) + assert lookup("ABCDEF+CMMI10", 34) == "ϵ" # epsilon (lunate) + assert lookup("ABCDEF+CMSY10", 114) == "∇" # nabla + assert lookup("ABCDEF+CMSY10", 11) == "⊘" # circledivide + assert lookup("ABCDEF+CMSY10", 10) == "⊗" # circlemultiply + # Design-size and Latin Modern variants share the family table. + assert lookup("ABCDEF+CMSY8", 48) == "′" # prime + assert lookup("ABCDEF+LMMI10", 11) == "α" + + +def test_bold_and_ams_fonts(): + """Bold-math (CMBSY/CMMIB) and AMS symbol fonts (MSAM/MSBM/LASY) resolve.""" + # Bold math fonts share the plain CM encodings. 
+ assert lookup("ABCDEF+CMBSY10", 0) == "−" # bold cmsy minus + assert lookup("ABCDEF+CMMIB10", 11) == "α" # bold math italic alpha + # AMS symbols. + assert lookup("ABCDEF+MSAM10", 3) == "□" # square + assert lookup("ABCDEF+MSBM10", 82) == "ℝ" # blackboard R (Letterlike U+211D) + assert lookup("ABCDEF+MSBM10", 65) == "𝔸" # blackboard A (1D5 block) + # Letterlike exception overrides the double-struck range: Z is U+2124, not 1D56B. + assert lookup("ABCDEF+MSBM10", 90) == "ℤ" + assert lookup("ABCDEF+LASY10", 0) == "⊲" # \lhd + # Fallback path stays active for codes outside an authored table. + assert lookup("ABCDEF+MSAM10", 200) is None + + +def test_cmex_full_coverage_and_operators(): + """Every cmex10 code 0-127 resolves, including the rarer big operators.""" + # Big operators / contour integrals added from the full AFM encoding. + assert lookup("ABCDEF+CMEX10", 72) == "∮" # contintegraltext + assert lookup("ABCDEF+CMEX10", 76) == "⊕" # circleplustext + assert lookup("ABCDEF+CMEX10", 96) == "∐" # coproducttext + assert lookup("ABCDEF+CMEX10", 106) == "⌊" # floorleftBig + # Extensible delimiter extenders collapse to nothing (single delimiter out). + assert lookup("ABCDEF+CMEX10", 66) == "" # parenleftex + # No cmex code in 0-127 is left unresolved. 
+ assert all(lookup("ABCDEF+CMEX10", c) is not None for c in range(128)) + + +def test_prose_pdf_unaffected_by_decode(markitdown): + """A CID-free prose PDF must convert identically with the flag on vs off.""" + if not os.path.exists(CLEAN_PDF): + pytest.skip(f"Test file not found: {CLEAN_PDF}") + + off = markitdown.convert(CLEAN_PDF).text_content + on = markitdown.convert(CLEAN_PDF, decode_cid=True).text_content + + assert on == off + assert "(cid:" not in on + + +def test_decode_on_by_default(markitdown): + """Decoding is the default: math glyphs resolve without passing the flag.""" + if not os.path.exists(MATH_CID_PDF): + pytest.skip(f"Test file not found: {MATH_CID_PDF}") + + text = markitdown.convert(MATH_CID_PDF).text_content + + assert "(cid:" not in text + assert "∑" in text + + +def test_decode_can_be_disabled(markitdown): + """decode_cid=False opts out, leaving the raw pdfminer (cid:N) tokens.""" + if not os.path.exists(MATH_CID_PDF): + pytest.skip(f"Test file not found: {MATH_CID_PDF}") + + text = markitdown.convert(MATH_CID_PDF, decode_cid=False).text_content + + assert "(cid:" in text + assert "∑" not in text + + +def test_clean_unicode_not_corrupted(markitdown): + """Running the decoder on a clean Unicode PDF must not mangle its text.""" + if not os.path.exists(CLEAN_PDF): + pytest.skip(f"Test file not found: {CLEAN_PDF}") + + result = markitdown.convert(CLEAN_PDF, decode_cid=True) + text = result.text_content + + # A known prose sentence from the document survives intact, no comments injected. + assert ( + "While there is contemporaneous exploration of multi-agent approaches" in text + ) + assert "