diff --git a/packages/markitdown/src/markitdown/converter_utils/pdf/__init__.py b/packages/markitdown/src/markitdown/converter_utils/pdf/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/packages/markitdown/src/markitdown/converter_utils/pdf/cid_decoder.py b/packages/markitdown/src/markitdown/converter_utils/pdf/cid_decoder.py
new file mode 100644
index 000000000..b8b09d9d5
--- /dev/null
+++ b/packages/markitdown/src/markitdown/converter_utils/pdf/cid_decoder.py
@@ -0,0 +1,119 @@
+"""Resolve pdfminer ``(cid:N)`` tokens in extracted PDF text to Unicode.
+
+Two stages:
+
+* :func:`build_cid_map` makes a single low-level pdfminer pass over the document,
+  reading each unmapped glyph's ``fontname`` so the correct per-font encoding table
+  is used (see :mod:`.cid_fonts`). The result is a document-specific
+  ``cid -> Unicode`` map; a cid that resolves to conflicting glyphs across fonts in
+  the same document is dropped (ambiguous -> left for the fallback path).
+
+* :func:`decode_cids` substitutes those tokens in the already-extracted markdown.
+  Tokens are grouped into *clusters* (a cluster ≈ one math expression). Each cluster
+  gets a confidence score = resolved / total tokens; clusters below the threshold are
+  wrapped in an HTML comment instead of being partly mistranslated, so downstream
+  consumers still see the raw signal.
+"""
+
+import re
+from typing import BinaryIO
+
+from .cid_fonts import lookup
+
+# One pdfminer unmapped-glyph token anywhere in text, e.g. "(cid:88)"; group 1 is N.
+_CID_TOKEN = re.compile(r"\(cid:(\d+)\)")
+# Same token, anchored at both ends, for testing a single LTChar's rendered text.
+_CID_EXACT = re.compile(r"^\(cid:(\d+)\)$")
+
+# Max length (after stripping whitespace) of the text between two cid tokens for
+# them to share a cluster. Formula fragments like "Q−1 (Q(x))" sit between delimiters.
+_MAX_CLUSTER_GAP = 40
+
+
+def build_cid_map(pdf_bytes: BinaryIO) -> dict[int, str]:
+    """Scan a PDF once and collect the unambiguous ``cid -> Unicode`` pairs.
+
+    Every ``LTChar`` whose text is exactly one ``(cid:N)`` token is resolved via
+    :func:`lookup` using that glyph's ``fontname``. A cid is kept only when all of
+    its resolutions in the document agree; otherwise it is left out of the map.
+    The stream is rewound to offset 0 before parsing and is not rewound after.
+    """
+    from pdfminer.high_level import extract_pages
+    from pdfminer.layout import LTChar
+
+    def iter_chars(container):
+        # Depth-first, document order: yield LTChar leaves, descend into the rest.
+        for node in container:
+            if isinstance(node, LTChar):
+                yield node
+            if hasattr(node, "__iter__"):
+                yield from iter_chars(node)
+
+    pdf_bytes.seek(0)
+    seen: dict[int, set[str]] = {}
+    for page in extract_pages(pdf_bytes):
+        for char in iter_chars(page):
+            token = _CID_EXACT.match(char.get_text())
+            if token is None:
+                continue
+            code = int(token.group(1))
+            glyph = lookup(char.fontname, code)
+            if glyph is not None:
+                seen.setdefault(code, set()).add(glyph)
+
+    # A single candidate means every font agreed on this cid.
+    return {cid: next(iter(found)) for cid, found in seen.items() if len(found) == 1}
+
+
+def _same_cluster(gap: str) -> bool:
+    """True if *gap* (text between two cid tokens) keeps them in one cluster."""
+    if re.search(r"\n[^\S\n]*\n", gap):  # blank line, even "\n \n" or CRLF
+        return False
+    return len(gap.strip()) <= _MAX_CLUSTER_GAP
+
+
+def decode_cids(
+    markdown: str,
+    cid_map: dict[int, str],
+    *,
+    confidence_threshold: float = 0.6,
+) -> str:
+    """Replace ``(cid:N)`` tokens in *markdown* using *cid_map*.
+
+    Tokens are grouped into clusters via :func:`_same_cluster`. A cluster whose
+    resolved/total ratio reaches *confidence_threshold* is substituted in place
+    (tokens missing from *cid_map* stay as-is); any other cluster is wrapped as
+    ``<!-- FORMULA: ... -->`` with embedded comment terminators escaped.
+    """
+    matches = list(_CID_TOKEN.finditer(markdown))
+    if not matches:
+        return markdown
+
+    # Group token matches into clusters by proximity.
+    clusters: list[list[re.Match[str]]] = [[matches[0]]]
+    for match in matches[1:]:
+        gap = markdown[clusters[-1][-1].end() : match.start()]
+        if _same_cluster(gap):
+            clusters[-1].append(match)
+        else:
+            clusters.append([match])
+
+    out: list[str] = []
+    last = 0
+    for cluster in clusters:
+        start, end = cluster[0].start(), cluster[-1].end()
+        out.append(markdown[last:start])
+        span = markdown[start:end]
+        resolved = sum(1 for m in cluster if int(m.group(1)) in cid_map)
+        if resolved / len(cluster) >= confidence_threshold:
+            out.append(
+                _CID_TOKEN.sub(lambda m: cid_map.get(int(m.group(1)), m.group(0)), span)
+            )
+        else:
+            # "-->" / "--!>" inside the span would close the comment early.
+            safe = re.sub(r"--(!?)>", r"--\1&gt;", span)
+            out.append(f"<!-- FORMULA: {safe} -->")
+        last = end
+
+    out.append(markdown[last:])
+    return "".join(out)
diff --git a/packages/markitdown/src/markitdown/converter_utils/pdf/cid_fonts.py b/packages/markitdown/src/markitdown/converter_utils/pdf/cid_fonts.py
new file mode 100644
index 000000000..a18046c25
--- /dev/null
+++ b/packages/markitdown/src/markitdown/converter_utils/pdf/cid_fonts.py
@@ -0,0 +1,941 @@
+"""Font-keyed CID -> Unicode tables for LaTeX (Computer Modern / Latin Modern) PDFs.
+
+When a PDF embeds a math font without a ``ToUnicode`` CMap, pdfminer emits the
+glyph as the literal token ``(cid:N)``, where ``N`` is the glyph's code in the
+font's *native* encoding -- not Unicode. The same code means different glyphs in
+different fonts (e.g. code 12 is ``|`` in CMEX10 but ``circledot`` in CMSY10), so
+resolution must be keyed by font name.
+
+Tables are expressed as ``code -> glyph-name`` (the native font encoding) plus a
+shared ``glyph-name -> Unicode`` map. The ``code -> glyph-name`` encodings are the
+complete codes 0-127 of cmex10 / cmsy10 / cmmi10, taken verbatim from the
+Computer Modern AFM metrics (the authoritative source). These native encodings are
+stable across documents -- unlike subset *CID* fonts, CM/LM Type1 math fonts are
+not renumbered per document -- so a static table is reliable.
+
+Extensible delimiter pieces map so that a multi-row delimiter collapses to a single
+character: the top cap carries the base glyph and the extender/bottom pieces map to
+``""``. Glyphs with no clean single-codepoint Unicode (wide accents, brace tips) are
+mapped to ``""``; any code not resolvable falls through to the decoder's
+confidence/fallback path rather than being mistranslated.
+"""
+
+import string
+
+# --- glyph name -> Unicode -------------------------------------------------
+
+_GLYPH_TO_UNICODE: dict[str, str] = {  # AFM glyph name -> Unicode text ("" = emit nothing)
+    # delimiters (base; size variants are stripped to these by _build)
+    "parenleft": "(",
+    "parenright": ")",
+    "bracketleft": "[",
+    "bracketright": "]",
+    "braceleft": "{",
+    "braceright": "}",
+    "floorleft": "⌊",
+    "floorright": "⌋",
+    "ceilingleft": "⌈",
+    "ceilingright": "⌉",
+    "angbracketleft": "⟨",
+    "angbracketright": "⟩",
+    "slash": "/",
+    "backslash": "\\",
+    "bar": "|",
+    "bardbl": "‖",
+    "vextendsingle": "|",
+    "vextenddouble": "‖",
+    # big operators
+    "summation": "∑",
+    "product": "∏",
+    "coproduct": "∐",
+    "integral": "∫",
+    "contintegral": "∮",
+    "union": "∪",
+    "intersection": "∩",
+    "unionsq": "⊔",
+    "intersectionsq": "⊓",
+    "unionmulti": "⊎",
+    "logicaland": "⋀",
+    "logicalor": "⋁",
+    "radical": "√",
+    "circledot": "⊙",
+    "circleplus": "⊕",
+    "circlemultiply": "⊗",
+    "circleminus": "⊖",
+    "circledivide": "⊘",
+    "circlecopyrt": "◯",  # cmsy10 slot 13 is TeX's \bigcirc (a bare large circle), not "©"
+    "openbullet": "◦",
+    "bullet": "•",
+    # uppercase Greek
+    "Gamma": "Γ",
+    "Delta": "Δ",
+    "Theta": "Θ",
+    "Lambda": "Λ",
+    "Xi": "Ξ",
+    "Pi": "Π",
+    "Sigma": "Σ",
+    "Upsilon": "Υ",
+    "Phi": "Φ",
+    "Psi": "Ψ",
+    "Omega": "Ω",
+    # lowercase Greek
+    "alpha": "α",
+    "beta": "β",
+    "gamma": "γ",
+    "delta": "δ",
+    "epsilon": "ϵ",  # NOTE(review): used for cmmi slot 34, TeX's \varepsilon ("ε" shape) — confirm
+    "zeta": "ζ",
+    "eta": "η",
+    "theta": "θ",
+    "iota": "ι",
+    "kappa": "κ",
+    "lambda": "λ",
+    "mu": "μ",
+    "nu": "ν",
+    "xi": "ξ",
+    "pi": "π",
+    "rho": "ρ",
+    "sigma": "σ",
+    "tau": "τ",
+    "upsilon": "υ",
+    "phi": "ϕ",
+    "chi": "χ",
+    "psi": "ψ",
+    "omega": "ω",
+    "epsilon1": "ε",  # NOTE(review): used for cmmi slot 15, TeX's \epsilon (lunate "ϵ") — confirm
+    "theta1": "ϑ",
+    "pi1": "ϖ",
+    "rho1": "ϱ",
+    "sigma1": "ς",
+    "phi1": "φ",
+    # symbols / relations / arrows
+    "minus": "−",
+    "periodcentered": "·",
+    "multiply": "×",
+    "asteriskmath": "∗",
+    "divide": "÷",
+    "diamondmath": "⋄",
+    "plusminus": "±",
+    "minusplus": "∓",
+    "equivasymptotic": "≍",
+    "equivalence": "≡",
+    "reflexsubset": "⊆",
+    "reflexsuperset": "⊇",
+    "lessequal": "≤",
+    "greaterequal": "≥",
+    "precedesequal": "⪯",
+    "followsequal": "⪰",
+    "similar": "∼",
+    "approxequal": "≈",
+    "propersubset": "⊂",
+    "propersuperset": "⊃",
+    "lessmuch": "≪",
+    "greatermuch": "≫",
+    "precedes": "≺",
+    "follows": "≻",
+    "arrowleft": "←",
+    "arrowright": "→",
+    "arrowup": "↑",
+    "arrowdown": "↓",
+    "arrowboth": "↔",
+    "arrownortheast": "↗",
+    "arrowsoutheast": "↘",
+    "arrownorthwest": "↖",
+    "arrowsouthwest": "↙",
+    "similarequal": "≃",
+    "arrowdblleft": "⇐",
+    "arrowdblright": "⇒",
+    "arrowdblup": "⇑",
+    "arrowdbldown": "⇓",
+    "arrowdblboth": "⇔",
+    "arrowbothv": "↕",
+    "arrowdblbothv": "⇕",
+    "proportional": "∝",
+    "prime": "′",
+    "infinity": "∞",
+    "element": "∈",
+    "owner": "∋",
+    "triangle": "△",
+    "triangleinv": "▽",
+    "triangleright": "▹",
+    "triangleleft": "◃",
+    "negationslash": "̸",  # U+0338 combines with the PRECEDING glyph. NOTE(review): TeX sets \not before the relation it negates — confirm order
+    "mapsto": "↦",
+    "universal": "∀",
+    "existential": "∃",
+    "logicalnot": "¬",
+    "emptyset": "∅",
+    "Rfractur": "ℜ",
+    "Ifractur": "ℑ",
+    "latticetop": "⊤",
+    "perpendicular": "⊥",
+    "aleph": "ℵ",
+    "turnstileleft": "⊢",
+    "turnstileright": "⊣",
+    "wreathproduct": "≀",
+    "nabla": "∇",
+    "subsetsqequal": "⊑",
+    "supersetsqequal": "⊒",
+    "section": "§",
+    "dagger": "†",
+    "daggerdbl": "‡",
+    "paragraph": "¶",
+    "club": "♣",
+    "diamond": "♦",
+    "heart": "♥",
+    "spade": "♠",
+    "partialdiff": "∂",
+    "star": "⋆",
+    "flat": "♭",
+    "natural": "♮",
+    "sharp": "♯",
+    "lscript": "ℓ",
+    "weierstrass": "℘",
+    "dotlessi": "ı",
+    "dotlessj": "ȷ",
+    "period": ".",
+    "comma": ",",
+    "less": "<",
+    "greater": ">",
+    "arrowhookleft": "↩",
+    "arrowhookright": "↪",
+    # half arrows (harpoons)
+    "arrowlefttophalf": "↼",
+    "arrowleftbothalf": "↽",
+    "arrowrighttophalf": "⇀",
+    "arrowrightbothalf": "⇁",
+    # extensible delimiter pieces: visible top cap -> base char, others dropped
+    "parenlefttp": "(",
+    "parenrighttp": ")",
+    "parenleftbt": "",
+    "parenrightbt": "",
+    "parenleftex": "",
+    "parenrightex": "",
+    "bracketlefttp": "[",
+    "bracketrighttp": "]",
+    "bracketleftbt": "",
+    "bracketrightbt": "",
+    "bracketleftex": "",
+    "bracketrightex": "",
+    "bracelefttp": "{",
+    "bracerighttp": "}",
+    "braceleftbt": "",
+    "bracerightbt": "",
+    "braceleftmid": "",
+    "bracerightmid": "",
+    "braceex": "",
+    "radicalbt": "√",
+    "radicaltp": "",
+    "radicalvertex": "",
+    "arrowtp": "↑",
+    "arrowbt": "↓",
+    "arrowvertex": "",
+    "arrowvertexdbl": "",
+    "arrowdbltp": "⇑",
+    "arrowdblbt": "⇓",
+    # accents / brace tips with no clean single codepoint -> dropped
+    "bracehtipdownleft": "",
+    "bracehtipdownright": "",
+    "bracehtipupleft": "",
+    "bracehtipupright": "",
+    "hatwide": "",
+    "hatwider": "",
+    "hatwidest": "",
+    "tildewide": "",
+    "tildewider": "",
+    "tildewidest": "",
+    "vector": "",  # \vec accent: dropped, base letter is emitted separately
+    "tie": "",
+    "slurbelow": "",
+    "slurabove": "",
+}
+
+# Math-italic / calligraphic letter glyphs are named after the letter itself, so
+# every ASCII letter maps to itself. Each old-style digit glyph maps to the plain
+# ASCII digit at the same position in "0123456789".
+_GLYPH_TO_UNICODE.update(zip(string.ascii_letters, string.ascii_letters))
+_GLYPH_TO_UNICODE.update(
+    zip(
+        (
+            "zerooldstyle",
+            "oneoldstyle",
+            "twooldstyle",
+            "threeoldstyle",
+            "fouroldstyle",
+            "fiveoldstyle",
+            "sixoldstyle",
+            "sevenoldstyle",
+            "eightoldstyle",
+            "nineoldstyle",
+        ),
+        "0123456789",
+    )
+)
+
+
+
+# --- native font encodings: code -> glyph name (cmex10/cmsy10/cmmi10, codes 0-127,
+# verbatim from the Computer Modern AFM metrics) ----------------------------
+
+_CMEX10_CODES: dict[int, str] = {  # cmex10 slot -> glyph name; size suffixes are stripped by _build
+    0: "parenleftbig",
+    1: "parenrightbig",
+    2: "bracketleftbig",
+    3: "bracketrightbig",
+    4: "floorleftbig",
+    5: "floorrightbig",
+    6: "ceilingleftbig",
+    7: "ceilingrightbig",
+    8: "braceleftbig",
+    9: "bracerightbig",
+    10: "angbracketleftbig",
+    11: "angbracketrightbig",
+    12: "vextendsingle",
+    13: "vextenddouble",
+    14: "slashbig",
+    15: "backslashbig",
+    16: "parenleftBig",
+    17: "parenrightBig",
+    18: "parenleftbigg",
+    19: "parenrightbigg",
+    20: "bracketleftbigg",
+    21: "bracketrightbigg",
+    22: "floorleftbigg",
+    23: "floorrightbigg",
+    24: "ceilingleftbigg",
+    25: "ceilingrightbigg",
+    26: "braceleftbigg",
+    27: "bracerightbigg",
+    28: "angbracketleftbigg",
+    29: "angbracketrightbigg",
+    30: "slashbigg",
+    31: "backslashbigg",
+    32: "parenleftBigg",
+    33: "parenrightBigg",
+    34: "bracketleftBigg",
+    35: "bracketrightBigg",
+    36: "floorleftBigg",
+    37: "floorrightBigg",
+    38: "ceilingleftBigg",
+    39: "ceilingrightBigg",
+    40: "braceleftBigg",
+    41: "bracerightBigg",
+    42: "angbracketleftBigg",
+    43: "angbracketrightBigg",
+    44: "slashBigg",
+    45: "backslashBigg",
+    46: "slashBig",
+    47: "backslashBig",
+    48: "parenlefttp",
+    49: "parenrighttp",
+    50: "bracketlefttp",
+    51: "bracketrighttp",
+    52: "bracketleftbt",
+    53: "bracketrightbt",
+    54: "bracketleftex",
+    55: "bracketrightex",
+    56: "bracelefttp",
+    57: "bracerighttp",
+    58: "braceleftbt",
+    59: "bracerightbt",
+    60: "braceleftmid",
+    61: "bracerightmid",
+    62: "braceex",
+    63: "arrowvertex",
+    64: "parenleftbt",
+    65: "parenrightbt",
+    66: "parenleftex",
+    67: "parenrightex",
+    68: "angbracketleftBig",
+    69: "angbracketrightBig",
+    70: "unionsqtext",
+    71: "unionsqdisplay",
+    72: "contintegraltext",
+    73: "contintegraldisplay",
+    74: "circledottext",
+    75: "circledotdisplay",
+    76: "circleplustext",
+    77: "circleplusdisplay",
+    78: "circlemultiplytext",
+    79: "circlemultiplydisplay",
+    80: "summationtext",
+    81: "producttext",
+    82: "integraltext",
+    83: "uniontext",
+    84: "intersectiontext",
+    85: "unionmultitext",
+    86: "logicalandtext",
+    87: "logicalortext",
+    88: "summationdisplay",
+    89: "productdisplay",
+    90: "integraldisplay",
+    91: "uniondisplay",
+    92: "intersectiondisplay",
+    93: "unionmultidisplay",
+    94: "logicalanddisplay",
+    95: "logicalordisplay",
+    96: "coproducttext",
+    97: "coproductdisplay",
+    98: "hatwide",
+    99: "hatwider",
+    100: "hatwidest",
+    101: "tildewide",
+    102: "tildewider",
+    103: "tildewidest",
+    104: "bracketleftBig",
+    105: "bracketrightBig",
+    106: "floorleftBig",
+    107: "floorrightBig",
+    108: "ceilingleftBig",
+    109: "ceilingrightBig",
+    110: "braceleftBig",
+    111: "bracerightBig",
+    112: "radicalbig",
+    113: "radicalBig",
+    114: "radicalbigg",
+    115: "radicalBigg",
+    116: "radicalbt",
+    117: "radicalvertex",
+    118: "radicaltp",
+    119: "arrowvertexdbl",
+    120: "arrowtp",
+    121: "arrowbt",
+    122: "bracehtipdownleft",
+    123: "bracehtipdownright",
+    124: "bracehtipupleft",
+    125: "bracehtipupright",
+    126: "arrowdbltp",
+    127: "arrowdblbt",
+}
+
+_CMSY10_CODES: dict[int, str] = {  # cmsy10 slot -> glyph name, resolved through _build
+    0: "minus",
+    1: "periodcentered",
+    2: "multiply",
+    3: "asteriskmath",
+    4: "divide",
+    5: "diamondmath",
+    6: "plusminus",
+    7: "minusplus",
+    8: "circleplus",
+    9: "circleminus",
+    10: "circlemultiply",
+    11: "circledivide",
+    12: "circledot",
+    13: "circlecopyrt",  # NOTE(review): TeX's \bigcirc lives in this slot — confirm the Unicode chosen for it
+    14: "openbullet",
+    15: "bullet",
+    16: "equivasymptotic",
+    17: "equivalence",
+    18: "reflexsubset",
+    19: "reflexsuperset",
+    20: "lessequal",
+    21: "greaterequal",
+    22: "precedesequal",
+    23: "followsequal",
+    24: "similar",
+    25: "approxequal",
+    26: "propersubset",
+    27: "propersuperset",
+    28: "lessmuch",
+    29: "greatermuch",
+    30: "precedes",
+    31: "follows",
+    32: "arrowleft",
+    33: "arrowright",
+    34: "arrowup",
+    35: "arrowdown",
+    36: "arrowboth",
+    37: "arrownortheast",
+    38: "arrowsoutheast",
+    39: "similarequal",
+    40: "arrowdblleft",
+    41: "arrowdblright",
+    42: "arrowdblup",
+    43: "arrowdbldown",
+    44: "arrowdblboth",
+    45: "arrownorthwest",
+    46: "arrowsouthwest",
+    47: "proportional",
+    48: "prime",
+    49: "infinity",
+    50: "element",
+    51: "owner",
+    52: "triangle",
+    53: "triangleinv",
+    54: "negationslash",  # NOTE(review): TeX's \not; it is typeset before the relation it strikes — confirm decode order
+    55: "mapsto",
+    56: "universal",
+    57: "existential",
+    58: "logicalnot",
+    59: "emptyset",
+    60: "Rfractur",
+    61: "Ifractur",
+    62: "latticetop",
+    63: "perpendicular",
+    64: "aleph",
+    65: "A",
+    66: "B",
+    67: "C",
+    68: "D",
+    69: "E",
+    70: "F",
+    71: "G",
+    72: "H",
+    73: "I",
+    74: "J",
+    75: "K",
+    76: "L",
+    77: "M",
+    78: "N",
+    79: "O",
+    80: "P",
+    81: "Q",
+    82: "R",
+    83: "S",
+    84: "T",
+    85: "U",
+    86: "V",
+    87: "W",
+    88: "X",
+    89: "Y",
+    90: "Z",
+    91: "union",
+    92: "intersection",
+    93: "unionmulti",
+    94: "logicaland",
+    95: "logicalor",
+    96: "turnstileleft",
+    97: "turnstileright",
+    98: "floorleft",
+    99: "floorright",
+    100: "ceilingleft",
+    101: "ceilingright",
+    102: "braceleft",
+    103: "braceright",
+    104: "angbracketleft",
+    105: "angbracketright",
+    106: "bar",
+    107: "bardbl",
+    108: "arrowbothv",
+    109: "arrowdblbothv",
+    110: "backslash",
+    111: "wreathproduct",
+    112: "radical",
+    113: "coproduct",
+    114: "nabla",
+    115: "integral",
+    116: "unionsq",
+    117: "intersectionsq",
+    118: "subsetsqequal",
+    119: "supersetsqequal",
+    120: "section",
+    121: "dagger",
+    122: "daggerdbl",
+    123: "paragraph",
+    124: "club",
+    125: "diamond",
+    126: "heart",
+    127: "spade",
+}
+
+_CMMI10_CODES: dict[int, str] = {  # cmmi10 slot -> glyph name, resolved through _build
+    0: "Gamma",
+    1: "Delta",
+    2: "Theta",
+    3: "Lambda",
+    4: "Xi",
+    5: "Pi",
+    6: "Sigma",
+    7: "Upsilon",
+    8: "Phi",
+    9: "Psi",
+    10: "Omega",
+    11: "alpha",
+    12: "beta",
+    13: "gamma",
+    14: "delta",
+    15: "epsilon1",  # NOTE(review): TeX's \epsilon (lunate shape) is slot 15 — confirm the Unicode chosen for it
+    16: "zeta",
+    17: "eta",
+    18: "theta",
+    19: "iota",
+    20: "kappa",
+    21: "lambda",
+    22: "mu",
+    23: "nu",
+    24: "xi",
+    25: "pi",
+    26: "rho",
+    27: "sigma",
+    28: "tau",
+    29: "upsilon",
+    30: "phi",
+    31: "chi",
+    32: "psi",
+    33: "omega",
+    34: "epsilon",  # NOTE(review): TeX's \varepsilon is slot 34 — confirm the Unicode chosen for it
+    35: "theta1",
+    36: "pi1",
+    37: "rho1",
+    38: "sigma1",
+    39: "phi1",
+    40: "arrowlefttophalf",
+    41: "arrowleftbothalf",
+    42: "arrowrighttophalf",
+    43: "arrowrightbothalf",
+    44: "arrowhookleft",
+    45: "arrowhookright",
+    46: "triangleright",
+    47: "triangleleft",
+    48: "zerooldstyle",
+    49: "oneoldstyle",
+    50: "twooldstyle",
+    51: "threeoldstyle",
+    52: "fouroldstyle",
+    53: "fiveoldstyle",
+    54: "sixoldstyle",
+    55: "sevenoldstyle",
+    56: "eightoldstyle",
+    57: "nineoldstyle",
+    58: "period",
+    59: "comma",
+    60: "less",
+    61: "slash",
+    62: "greater",
+    63: "star",
+    64: "partialdiff",
+    65: "A",
+    66: "B",
+    67: "C",
+    68: "D",
+    69: "E",
+    70: "F",
+    71: "G",
+    72: "H",
+    73: "I",
+    74: "J",
+    75: "K",
+    76: "L",
+    77: "M",
+    78: "N",
+    79: "O",
+    80: "P",
+    81: "Q",
+    82: "R",
+    83: "S",
+    84: "T",
+    85: "U",
+    86: "V",
+    87: "W",
+    88: "X",
+    89: "Y",
+    90: "Z",
+    91: "flat",
+    92: "natural",
+    93: "sharp",
+    94: "slurbelow",
+    95: "slurabove",
+    96: "lscript",
+    97: "a",
+    98: "b",
+    99: "c",
+    100: "d",
+    101: "e",
+    102: "f",
+    103: "g",
+    104: "h",
+    105: "i",
+    106: "j",
+    107: "k",
+    108: "l",
+    109: "m",
+    110: "n",
+    111: "o",
+    112: "p",
+    113: "q",
+    114: "r",
+    115: "s",
+    116: "t",
+    117: "u",
+    118: "v",
+    119: "w",
+    120: "x",
+    121: "y",
+    122: "z",
+    123: "dotlessi",
+    124: "dotlessj",
+    125: "weierstrass",
+    126: "vector",
+    127: "tie",
+}
+
+# Graded-size suffixes that _build strips; the first entry a name ends with is used.
+_SIZE_SUFFIXES = ("bigg", "Bigg", "big", "Big", "display", "text")
+
+
+def _build(codes: dict[int, str]) -> dict[int, str]:
+    """Turn a ``code -> glyph-name`` encoding into a ``code -> Unicode`` table.
+
+    Each name is tried verbatim first. On a miss, the first entry of
+    ``_SIZE_SUFFIXES`` that the name ends with is removed and the remainder is
+    tried once (``summationdisplay`` -> ``summation``). Names that still miss
+    are omitted from the result rather than guessed.
+    """
+    resolved: dict[int, str] = {}
+    for code, name in codes.items():
+        char = _GLYPH_TO_UNICODE.get(name)
+        if char is None:
+            # Only the first matching suffix is considered, in listed order.
+            tail = next((s for s in _SIZE_SUFFIXES if name.endswith(s)), None)
+            if tail is not None:
+                char = _GLYPH_TO_UNICODE.get(name[: -len(tail)])
+        if char is not None:
+            resolved[code] = char
+    return resolved
+
+
+CMEX = _build(_CMEX10_CODES)  # code -> Unicode for the cmex10 encoding
+CMMI = _build(_CMMI10_CODES)  # code -> Unicode for the cmmi10 encoding
+CMSY = _build(_CMSY10_CODES)  # code -> Unicode for the cmsy10 encoding
+
+# AMS and LaTeX symbol fonts. These carry symbols (and MSBM's blackboard letters)
+# with no shared base name, so they are authored as direct code -> Unicode tables
+# (from the msam10/msbm10 AFM glyph names, and the gnuplot table for opaque LASY).
+# Codes whose glyph has no clean single codepoint are omitted -> fallback path.
+
+# MSAM10 (AMS symbols "a"). Commented entries were corrected against amssymb's AMSa slots.
+MSAM10: dict[int, str] = {
+    0: "⊡",
+    1: "⊞",
+    2: "⊠",
+    3: "□",
+    4: "■",
+    5: "▪",
+    6: "◊",
+    7: "◆",
+    8: "↻",
+    9: "↺",
+    10: "⇌",  # \rightleftharpoons
+    11: "⇋",  # \leftrightharpoons
+    12: "⊟",
+    13: "⊩",
+    14: "⊪",
+    15: "⊨",
+    16: "↠",
+    17: "↞",
+    18: "⇇",  # \leftleftarrows
+    19: "⇉",  # \rightrightarrows
+    20: "⇈",
+    21: "⇊",
+    22: "↾",  # \upharpoonright
+    23: "⇂",  # \downharpoonright
+    24: "↿",  # \upharpoonleft
+    25: "⇃",  # \downharpoonleft
+    26: "↣",
+    27: "↢",
+    28: "⇆",  # \leftrightarrows
+    29: "⇄",  # \rightleftarrows
+    32: "⇝",
+    33: "↭",
+    34: "↫",  # \looparrowleft
+    35: "↬",  # \looparrowright
+    36: "≗",
+    37: "≿",  # \succsim
+    38: "≳",
+    39: "⪆",
+    40: "⊸",
+    41: "∴",
+    42: "∵",
+    45: "≾",  # \precsim
+    46: "≲",
+    47: "⪅",
+    48: "⪕",  # \eqslantless
+    49: "⪖",  # \eqslantgtr
+    50: "⋞",
+    51: "⋟",
+    53: "≦",
+    54: "⩽",
+    55: "≶",
+    61: "≧",
+    62: "⩾",
+    63: "≷",
+    64: "⊏",
+    65: "⊐",
+    66: "▷",
+    67: "◁",
+    68: "⊵",
+    69: "⊴",
+    70: "★",  # \bigstar
+    71: "≬",
+    77: "△",
+    78: "▲",
+    79: "▽",
+    81: "⋚",
+    82: "⋛",
+    85: "¥",
+    88: "✓",
+    90: "⊼",
+    92: "∠",
+    93: "∡",
+    94: "∢",
+    95: "∝",
+    96: "⌣",
+    97: "⌢",
+    98: "⋐",
+    99: "⋑",
+    100: "⋓",
+    101: "⋒",
+    106: "⫅",
+    107: "⫆",
+    110: "⋘",
+    111: "⋙",
+    116: "⋔",
+    117: "∔",
+    118: "∽",
+    122: "✠",
+    123: "∁",
+    124: "⊺",
+    125: "⊚",
+    126: "⊛",
+    127: "⊝",  # \circleddash
+}
+
+# MSBM10 (AMS symbols "b"): negated relations, blackboard bold, Hebrew letters.
+MSBM10: dict[int, str] = {
+    2: "≰",
+    3: "≱",
+    4: "≮",
+    5: "≯",
+    6: "⊀",
+    7: "⊁",
+    12: "⪇",  # \lneq (single-bar; the double-bar \lneqq is slot 8)
+    13: "⪈",  # \gneq (single-bar; the double-bar \gneqq is slot 9)
+    28: "≁",
+    29: "≇",  # \ncong
+    42: "⊈",
+    43: "⊉",
+    44: "∦",
+    45: "∤",
+    52: "⋭",
+    53: "⋬",
+    54: "⋪",
+    55: "⋫",
+    56: "↚",
+    57: "↛",
+    58: "⇍",
+    59: "⇏",
+    60: "⇎",
+    61: "↮",
+    63: "∅",
+    64: "∄",
+    # Blackboard bold A-Z (codes 65-90). The seven letters that exist in the
+    # Letterlike Symbols block override the Mathematical Double-Struck block
+    # (U+1D538-U+1D551) -- those are exactly the reserved holes in that block.
+    65: "𝔸",
+    66: "𝔹",
+    67: "ℂ",  # U+2102 (exception)
+    68: "𝔻",
+    69: "𝔼",
+    70: "𝔽",
+    71: "𝔾",
+    72: "ℍ",  # U+210D (exception)
+    73: "𝕀",
+    74: "𝕁",
+    75: "𝕂",
+    76: "𝕃",
+    77: "𝕄",
+    78: "ℕ",  # U+2115 (exception)
+    79: "𝕆",
+    80: "ℙ",  # U+2119 (exception)
+    81: "ℚ",  # U+211A (exception)
+    82: "ℝ",  # U+211D (exception)
+    83: "𝕊",
+    84: "𝕋",
+    85: "𝕌",
+    86: "𝕍",
+    87: "𝕎",
+    88: "𝕏",
+    89: "𝕐",
+    90: "ℤ",  # U+2124 (exception)
+    103: "ð",
+    104: "≂",
+    105: "ℶ",
+    106: "ℷ",
+    107: "ℸ",
+    108: "⋖",
+    109: "⋗",
+    115: "∼",
+    116: "≈",
+    122: "Ϝ",
+    123: "ϰ",
+    125: "ℏ",
+    126: "ℏ",
+}
+
+# LASY10 (LaTeX symbol font): opaque AFM names; mapped from the gnuplot table and
+# the canonical lasy layout. Conservative -- only the well-known symbols.
+LASY10: dict[int, str] = {  # NOTE(review): latexsym.sty appears to declare \lhd at "01 and \mho at "30 — verify these slot numbers
+    0: "⊲",
+    1: "⊳",
+    2: "⊴",
+    3: "⊵",
+    4: "⋈",
+    5: "□",
+    6: "◇",
+    7: "⇝",
+    8: "⊏",
+    9: "⊐",
+    10: "℧",
+}
+
+# Tables are keyed by *family*, not design size. A Computer Modern math font
+# shares its encoding across every point size (CMSY10, CMSY8, CMSY7, ...) and
+# with its Latin Modern equivalent (LMSY10, ...), so all collapse to one table.
+FONT_TABLES: dict[str, dict[int, str]] = {  # keys are the values normalize_fontname returns
+    "CMEX": CMEX,
+    "CMMI": CMMI,
+    "CMSY": CMSY,
+    "MSAM": MSAM10,
+    "MSBM": MSBM10,
+    "LASY": LASY10,
+}
+
+# Upper-cased name prefix (subset tag already stripped) -> FONT_TABLES key.
+_FAMILY_PREFIXES = {
+    "CMEX": "CMEX",
+    "LMEX": "CMEX",
+    "LMMATHEXTENSION": "CMEX",  # Latin Modern PostScript name (LMMathExtension10-Regular)
+    "CMMI": "CMMI",  # also matches CMMIB (bold math italic): same OML encoding
+    "LMMI": "CMMI",
+    "LMMATHITALIC": "CMMI",  # Latin Modern PostScript name (LMMathItalic10-Regular)
+    "CMSY": "CMSY",
+    "LMSY": "CMSY",
+    "LMMATHSYMBOLS": "CMSY",  # Latin Modern PostScript name (LMMathSymbols10-Regular)
+    "CMBSY": "CMSY",  # bold math symbols: encoding identical to CMSY
+    "MSAM": "MSAM",
+    "MSBM": "MSBM",
+    "LASY": "LASY",
+}
+
+
+def normalize_fontname(fontname: str | None) -> str:
+    """Map a pdfminer fontname to a FONT_TABLES key (``""`` if none matches).
+
+    Drops the ``TAG+`` subset prefix, upper-cases, then matches by prefix, so
+    design sizes and Latin Modern names collapse (``LMSY10`` -> ``CMSY``).
+    """
+    # ``None``/empty becomes "" here, which no prefix matches.
+    name = (fontname or "").split("+", 1)[-1].upper()
+    for prefix, family in _FAMILY_PREFIXES.items():
+        if name.startswith(prefix):
+            return family
+    return ""
+
+
+def lookup(fontname: str | None, cid: int) -> str | None:
+    """Return the Unicode text for *cid* in *fontname*'s family table.
+
+    ``None`` means the family is unknown or the code has no entry.
+    """
+    return FONT_TABLES.get(normalize_fontname(fontname), {}).get(cid)
diff --git a/packages/markitdown/src/markitdown/converters/_pdf_converter.py b/packages/markitdown/src/markitdown/converters/_pdf_converter.py
index ffbcbd990..98ffa0447 100644
--- a/packages/markitdown/src/markitdown/converters/_pdf_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_pdf_converter.py
@@ -6,6 +6,7 @@
 from .._base_converter import DocumentConverter, DocumentConverterResult
 from .._stream_info import StreamInfo
 from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE
+from ..converter_utils.pdf.cid_decoder import build_cid_map, decode_cids
 
 # Pattern for MasterFormat-style partial numbering (e.g., ".1", ".2", ".10")
 PARTIAL_NUMBERING_PATTERN = re.compile(r"^\.\d+$")
@@ -586,4 +587,10 @@ def convert(
         # Post-process to merge MasterFormat-style partial numbering with following text
         markdown = _merge_partial_numbering_lines(markdown)
 
+        # Resolve (cid:N) tokens from LaTeX math fonts to Unicode. On by
+        # default; pass decode_cid=False to keep the raw pdfminer tokens.
+        if kwargs.get("decode_cid", True) and "(cid:" in markdown:
+            cid_map = build_cid_map(pdf_bytes)
+            markdown = decode_cids(markdown, cid_map)
+
         return DocumentConverterResult(markdown=markdown)
diff --git a/packages/markitdown/tests/_test_vectors.py b/packages/markitdown/tests/_test_vectors.py
index 74fa9bd0a..d66db72fd 100644
--- a/packages/markitdown/tests/_test_vectors.py
+++ b/packages/markitdown/tests/_test_vectors.py
@@ -97,6 +97,14 @@ class FileTestVector(object):
         ],
         must_not_include=[],
     ),
+    FileTestVector(
+        filename="test_math_cid.pdf",
+        mimetype="application/pdf",
+        charset=None,
+        url=None,
+        must_include=["surveys spectral theory and variational calculus on Hilbert"],
+        must_not_include=["(cid:"],
+    ),
     FileTestVector(
         filename="test_blog.html",
         mimetype="text/html",
diff --git a/packages/markitdown/tests/test_files/test_math_cid.pdf b/packages/markitdown/tests/test_files/test_math_cid.pdf
new file mode 100644
index 000000000..f31940f17
Binary files /dev/null and b/packages/markitdown/tests/test_files/test_math_cid.pdf differ
diff --git a/packages/markitdown/tests/test_pdf_cid.py b/packages/markitdown/tests/test_pdf_cid.py
new file mode 100644
index 000000000..64d7cf828
--- /dev/null
+++ b/packages/markitdown/tests/test_pdf_cid.py
@@ -0,0 +1,140 @@
+#!/usr/bin/env python3 -m pytest
+"""Tests for CID decoding of LaTeX math PDFs (on by default; decode_cid=False to opt out).
+
+LaTeX engines embed Computer Modern math glyphs without a ToUnicode CMap, so
+pdfminer emits them as literal (cid:N) tokens. The decoder resolves them
+font-aware. Fixtures:
+
+* test_math_cid.pdf - a pdflatex document whose math produces CMEX10 (cid:N).
+* test.pdf          - a clean Unicode PDF with no (cid:N) tokens (control).
+
+The CMMI/CMSY tables are checked directly against the lookup tables, since a
+freshly compiled document only emits (cid:N) for the CMEX delimiters/operators
+(modern pdflatex attaches ToUnicode to the symbol and Greek fonts).
+"""
+import os
+
+import pytest
+
+from markitdown import MarkItDown
+from markitdown.converter_utils.pdf.cid_fonts import lookup
+
+TEST_FILES_DIR = os.path.join(os.path.dirname(__file__), "test_files")
+MATH_CID_PDF = os.path.join(TEST_FILES_DIR, "test_math_cid.pdf")
+CLEAN_PDF = os.path.join(TEST_FILES_DIR, "test.pdf")
+
+
+@pytest.fixture
+def markitdown():
+    return MarkItDown()  # default-constructed converter handed to each test
+
+
+def test_math_cids_are_resolved(markitdown):
+    """With decode_cid=True no (cid:N) token survives and the glyphs appear."""
+    if not os.path.exists(MATH_CID_PDF):
+        pytest.skip(f"Test file not found: {MATH_CID_PDF}")
+
+    text = markitdown.convert(MATH_CID_PDF, decode_cid=True).text_content
+    expected = ("∑", "∫", "√", "∏", "∪", "|", "⟨", "⟩")
+
+    assert "(cid:" not in text
+    for glyph in expected:
+        assert glyph in text, f"missing decoded glyph {glyph!r}"
+
+
+def test_font_tables_resolve_cmsy_cmmi():
+    """CMSY/CMMI tables map their glyphs, including point-size variants."""
+    assert lookup("ABCDEF+CMMI10", 64) == "∂"  # partialdiff
+    assert lookup("ABCDEF+CMMI10", 11) == "α"  # alpha
+    assert lookup("ABCDEF+CMMI10", 15) == "ε"  # epsilon1 — NOTE(review): TeX slot 15 is \epsilon (lunate), not \varepsilon; confirm
+    assert lookup("ABCDEF+CMMI10", 34) == "ϵ"  # epsilon — NOTE(review): TeX slot 34 is \varepsilon; confirm
+    assert lookup("ABCDEF+CMSY10", 114) == "∇"  # nabla
+    assert lookup("ABCDEF+CMSY10", 11) == "⊘"  # circledivide
+    assert lookup("ABCDEF+CMSY10", 10) == "⊗"  # circlemultiply
+    # Design-size and Latin Modern variants share the family table.
+    assert lookup("ABCDEF+CMSY8", 48) == "′"  # prime
+    assert lookup("ABCDEF+LMMI10", 11) == "α"
+
+
+def test_bold_and_ams_fonts():
+    """Bold-math (CMBSY/CMMIB) and AMS symbol fonts (MSAM/MSBM/LASY) resolve."""
+    # Bold math fonts share the plain CM encodings.
+    assert lookup("ABCDEF+CMBSY10", 0) == "−"  # bold cmsy minus
+    assert lookup("ABCDEF+CMMIB10", 11) == "α"  # bold math italic alpha
+    # AMS symbols.
+    assert lookup("ABCDEF+MSAM10", 3) == "□"  # square
+    assert lookup("ABCDEF+MSBM10", 82) == "ℝ"  # blackboard R (Letterlike U+211D)
+    assert lookup("ABCDEF+MSBM10", 65) == "𝔸"  # blackboard A (1D5 block)
+    # Letterlike exception overrides the double-struck range: Z is U+2124, not 1D56B.
+    assert lookup("ABCDEF+MSBM10", 90) == "ℤ"
+    assert lookup("ABCDEF+LASY10", 0) == "⊲"  # \lhd — NOTE(review): latexsym.sty appears to put \lhd at slot "01; confirm
+    # Fallback path stays active for codes outside an authored table.
+    assert lookup("ABCDEF+MSAM10", 200) is None
+
+
+def test_cmex_full_coverage_and_operators():
+    """cmex10 resolves across its whole 0-127 range, rare big operators included."""
+    font = "ABCDEF+CMEX10"
+    # Text-size operators and a Big-size delimiter reduce to their base glyph.
+    assert lookup(font, 72) == "∮"  # contintegraltext
+    assert lookup(font, 76) == "⊕"  # circleplustext
+    assert lookup(font, 96) == "∐"  # coproducttext
+    assert lookup(font, 106) == "⌊"  # floorleftBig
+    # An extender piece decodes to the empty string rather than to None.
+    assert lookup(font, 66) == ""  # parenleftex
+    assert all(lookup(font, code) is not None for code in range(128))
+
+
+def test_prose_pdf_unaffected_by_decode(markitdown):
+    """A CID-free prose PDF must convert identically with the flag on vs off."""
+    if not os.path.exists(CLEAN_PDF):
+        pytest.skip(f"Test file not found: {CLEAN_PDF}")
+
+    # decode_cid defaults to True, so "off" has to be requested explicitly.
+    off = markitdown.convert(CLEAN_PDF, decode_cid=False).text_content
+    on = markitdown.convert(CLEAN_PDF, decode_cid=True).text_content
+    assert on == off
+    assert "(cid:" not in on
+
+
+def test_decode_on_by_default(markitdown):
+    """Without any decode_cid argument the math fixture comes out decoded."""
+    if not os.path.exists(MATH_CID_PDF):
+        pytest.skip(f"Test file not found: {MATH_CID_PDF}")
+
+    converted = markitdown.convert(MATH_CID_PDF)
+    decoded = converted.text_content
+    assert "(cid:" not in decoded
+    assert "∑" in decoded
+
+
+def test_decode_can_be_disabled(markitdown):
+    """Passing decode_cid=False keeps pdfminer's raw (cid:N) tokens in the output."""
+    if not os.path.exists(MATH_CID_PDF):
+        pytest.skip(f"Test file not found: {MATH_CID_PDF}")
+
+    converted = markitdown.convert(MATH_CID_PDF, decode_cid=False)
+    raw = converted.text_content
+    assert "(cid:" in raw
+    assert "∑" not in raw
+
+
+def test_clean_unicode_not_corrupted(markitdown):
+    """A PDF that already extracts as Unicode passes through the decoder intact."""
+    if not os.path.exists(CLEAN_PDF):
+        pytest.skip(f"Test file not found: {CLEAN_PDF}")
+
+    sentence = "While there is contemporaneous exploration of multi-agent approaches"
+    text = markitdown.convert(CLEAN_PDF, decode_cid=True).text_content
+
+    # The known prose sentence from the document is still present ...
+    assert sentence in text
+    # ... and no low-confidence wrapper comment was injected.
+    assert "<!-- FORMULA:" not in text
+
+
+
+if __name__ == "__main__":
+    import sys
+
+    pytest.main([__file__] + sys.argv[1:])