diff --git a/packages/markitdown-mcp/pyproject.toml b/packages/markitdown-mcp/pyproject.toml index 746253be5..73cc55a5d 100644 --- a/packages/markitdown-mcp/pyproject.toml +++ b/packages/markitdown-mcp/pyproject.toml @@ -24,10 +24,16 @@ classifiers = [ "Programming Language :: Python :: Implementation :: PyPy", ] dependencies = [ - "mcp~=1.8.0", + "mcp>=1.8.0", "markitdown[all]>=0.1.1,<0.2.0", ] +[project.optional-dependencies] +ocr = [ + "markitdown-ocr>=0.1.0", + "openai>=1.0.0", +] + [project.urls] Documentation = "https://github.com/microsoft/markitdown#readme" Issues = "https://github.com/microsoft/markitdown/issues" diff --git a/packages/markitdown-mcp/src/markitdown_mcp/__main__.py b/packages/markitdown-mcp/src/markitdown_mcp/__main__.py index 89f89444e..7c62978d0 100644 --- a/packages/markitdown-mcp/src/markitdown_mcp/__main__.py +++ b/packages/markitdown-mcp/src/markitdown_mcp/__main__.py @@ -1,36 +1,450 @@ +import base64 import contextlib -import sys +import hashlib +import json import os +import re +import shutil +import sys +import tempfile +import time from collections.abc import AsyncIterator +from pathlib import Path + +import uvicorn +from mcp.server import Server from mcp.server.fastmcp import FastMCP -from starlette.applications import Starlette from mcp.server.sse import SseServerTransport +from mcp.server.streamable_http_manager import StreamableHTTPSessionManager +from starlette.applications import Starlette from starlette.requests import Request from starlette.routing import Mount, Route from starlette.types import Receive, Scope, Send -from mcp.server import Server -from mcp.server.streamable_http_manager import StreamableHTTPSessionManager + from markitdown import MarkItDown -import uvicorn -# Initialize FastMCP server for MarkItDown (SSE) + +# --------------------------------------------------------------------------- +# Temporary directory management (module-level) +# 
--------------------------------------------------------------------------- + +_temp_root: str = os.path.realpath(tempfile.mkdtemp(prefix="markitdown_mcp_")) + +# Image file extensions we recognise when scanning output directories +_IMAGE_EXTENSIONS = frozenset( + {".png", ".jpg", ".jpeg", ".gif", ".bmp", ".webp", ".tiff", ".tif", ".svg", ".ico"} +) + + +def _cleanup_old_dirs(root: str, max_age_seconds: int = 86400) -> None: + """Remove sub-directories under *root* that are older than *max_age_seconds*.""" + now = time.time() + try: + for entry in os.scandir(root): + if entry.is_dir(): + try: + if now - entry.stat().st_mtime > max_age_seconds: + shutil.rmtree(entry.path, ignore_errors=True) + except OSError: + pass + except FileNotFoundError: + pass + + +def _cleanup_all(root: str) -> None: + """Remove the entire *root* directory tree.""" + shutil.rmtree(root, ignore_errors=True) + + +def _new_doc_image_dir(path: str) -> str: + """Create and return a per-document image output directory.""" + doc_hash = hashlib.sha256(path.encode("utf-8", errors="replace")).hexdigest()[:16] + image_dir = os.path.join(_temp_root, doc_hash, "images") + os.makedirs(image_dir, exist_ok=True) + return image_dir + + +# --------------------------------------------------------------------------- +# LLM client configuration (optional — for fully automatic OCR mode) +# --------------------------------------------------------------------------- + +def _create_llm_client(): + """Create an OpenAI-compatible client from environment variables. + + Returns (client, model) tuple, or (None, None) if not configured. 
+ Environment variables: + MARKITDOWN_LLM_API_KEY — required to enable LLM mode + MARKITDOWN_LLM_BASE_URL — optional, defaults to OpenAI + MARKITDOWN_LLM_MODEL — optional, defaults to gpt-4o-mini + """ + api_key = os.getenv("MARKITDOWN_LLM_API_KEY", "").strip() + if not api_key: + return None, None + + try: + from openai import OpenAI + except ImportError: + print( + "WARNING: MARKITDOWN_LLM_API_KEY is set but 'openai' package is not installed. " + "Install with: pip install openai", + file=sys.stderr, + ) + return None, None + + base_url = os.getenv("MARKITDOWN_LLM_BASE_URL", "").strip() or None + model = os.getenv("MARKITDOWN_LLM_MODEL", "gpt-4o-mini").strip() + + kwargs = {"api_key": api_key} + if base_url: + kwargs["base_url"] = base_url + + client = OpenAI(**kwargs) + print(f"LLM vision enabled: model={model}" + (f", base_url={base_url}" if base_url else ""), file=sys.stderr) + return client, model + + +def _get_llm_prompt() -> str | None: + """Return custom LLM prompt from env, or None for default.""" + prompt = os.getenv("MARKITDOWN_LLM_PROMPT", "").strip() + return prompt or None + + +# --------------------------------------------------------------------------- +# MarkItDown factory +# --------------------------------------------------------------------------- + +def _make_markitdown(*, extract_only: bool = False, image_output_dir: str | None = None) -> MarkItDown: + """Create a MarkItDown instance with plugin and optional LLM support.""" + kwargs: dict = { + "enable_plugins": check_plugins_enabled(), + } + if extract_only: + kwargs["extract_only"] = True + if image_output_dir: + kwargs["image_output_dir"] = image_output_dir + + # Inject LLM client if configured (for automatic OCR / image description) + llm_client, llm_model = _create_llm_client() + if llm_client: + kwargs["llm_client"] = llm_client + kwargs["llm_model"] = llm_model + prompt = _get_llm_prompt() + if prompt: + kwargs["llm_prompt"] = prompt + + return MarkItDown(**kwargs) + + +# 
--------------------------------------------------------------------------- +# FastMCP server +# --------------------------------------------------------------------------- + mcp = FastMCP("markitdown") @mcp.tool() -async def convert_to_markdown(uri: str) -> str: - """Convert a resource described by an http:, https:, file: or data: URI to markdown""" - return MarkItDown(enable_plugins=check_plugins_enabled()).convert_uri(uri).markdown +async def convert_to_markdown(uri: str, extract_images: bool = False) -> str: + """Convert a document or URI resource to markdown. + + Args: + uri: An http:, https:, file: or data: URI pointing to the resource. + Local file paths are also accepted and will be converted to file: URIs. + extract_images: If True, extract embedded images to disk and replace + references with absolute file paths. Default False (original behavior). + + Returns: + The markdown representation of the document. When extract_images=True, + image references point to extracted files on disk that can be read directly. + """ + # Normalise bare file paths to file: URIs + if not re.match(r"^(https?|file|data):", uri): + abs_path = os.path.abspath(uri) + if os.path.exists(abs_path): + uri = Path(abs_path).as_uri() + + if extract_images: + image_dir = _new_doc_image_dir(uri) + md = _make_markitdown(image_output_dir=image_dir) + result = md.convert_uri(uri) + text = result.markdown or "" + # Rewrite relative image references to absolute disk paths + text = _resolve_image_refs(text, image_dir) + return text + else: + md = _make_markitdown() + return md.convert_uri(uri).markdown + + +@mcp.tool() +async def analyze_document(path: str) -> str: + """Analyze a document and extract its text skeleton together with embedded images. + + This is the primary tool for AI-assistant-driven document processing. 
+ It extracts the document structure (text skeleton) and all embedded images + to disk, so the AI assistant can read each image with its own vision capability + to perform OCR, chart understanding, or semantic analysis. + + Accepts a local file path or a URI (http:, https:, file:, data:). + + Returns a JSON string containing: + - text_skeleton: the markdown text with image references (absolute paths on disk) + - images: a list of extracted images, each with: + path: absolute file path on disk (readable by AI assistant) + uri: file:// URI for the image + position: surrounding context showing where the image appears + size_bytes: file size + width, height: pixel dimensions + - metadata: source path, image count, and ocr_mode indicator + + Workflow for AI assistants: + 1. Call analyze_document to get text + image list + 2. Use your vision capability to read each image path for OCR/understanding + 3. Insert the extracted text back into the text_skeleton at the image positions + """ + # ---- validate input ------------------------------------------------ + is_uri = re.match(r"^(https?|file|data):", path) is not None + if not is_uri and not os.path.exists(path): + return json.dumps({"error": f"File not found: {path}"}) + + # ---- prepare per-document temp directory ---------------------------- + image_dir = _new_doc_image_dir(path) + + # ---- run conversion ------------------------------------------------- + try: + md_instance = _make_markitdown(extract_only=True, image_output_dir=image_dir) + result = md_instance.convert(path) + text_skeleton: str = result.markdown or "" + except Exception as exc: + return json.dumps({"error": f"Conversion failed: {exc}"}) + + # ---- resolve image references to absolute paths --------------------- + text_skeleton = _resolve_image_refs(text_skeleton, image_dir) + + # ---- collect image files on disk ------------------------------------ + image_files: list[str] = [] + for fname in sorted(os.listdir(image_dir)): + ext = 
os.path.splitext(fname)[1].lower() + if ext in _IMAGE_EXTENSIONS: + image_files.append(os.path.join(image_dir, fname)) + + # ---- parse image references from text skeleton ---------------------- + img_ref_pattern = re.compile( + r"(?:\s*\n?)?" + r"!\[(?P[^\]]*)\]\((?P[^)]+)\)" + ) + + images_out: list[dict] = [] + for match in img_ref_pattern.finditer(text_skeleton): + src = match.group("src") + meta_str = match.group("meta") or "" + + # Resolve to absolute path + if os.path.isabs(src): + img_path = src + else: + base_dir = os.path.dirname(path) if not is_uri else "" + img_path = os.path.normpath(os.path.join(base_dir, src)) + + images_out.append(_build_image_info(img_path, meta_str, text_skeleton, match)) + + # Account for unreferenced images + referenced_paths = {img["path"] for img in images_out} + for fpath in image_files: + if fpath not in referenced_paths: + images_out.append( + _build_image_info(fpath, "", text_skeleton, None, position="[unreferenced image]") + ) + + # ---- determine OCR mode --------------------------------------------- + llm_client, _ = _create_llm_client() + ocr_mode = "llm_vision" if llm_client else "ai_assistant_driven" + + # ---- build response ------------------------------------------------- + response = { + "text_skeleton": text_skeleton, + "images": images_out, + "metadata": { + "source": path, + "image_count": len(images_out), + "ocr_mode": ocr_mode, + }, + } + return json.dumps(response, ensure_ascii=False) + + +@mcp.tool() +async def ocr_image(path: str, prompt: str = "") -> str: + """Extract text content from an image file. + + Args: + path: Local file path to the image (png, jpg, pdf, etc.) + prompt: Optional custom prompt for OCR extraction. + Default: "Extract all text from this image, maintaining layout and order." + + Returns: + JSON string with extracted text and image metadata. + If LLM is configured, uses vision model for automatic OCR. 
+ Otherwise, returns the image path and metadata for the AI assistant + to read directly with its own vision capability. + """ + abs_path = os.path.abspath(path) + if not os.path.exists(abs_path): + return json.dumps({"error": f"File not found: {abs_path}"}) + + # Basic image metadata + info: dict = {"path": abs_path, "uri": Path(abs_path).as_uri()} + + try: + from PIL import Image as PILImage + with PILImage.open(abs_path) as img: + info["width"], info["height"] = img.size + info["format"] = img.format + except Exception: + pass + + try: + info["size_bytes"] = os.path.getsize(abs_path) + except OSError: + pass + + # Try LLM-based OCR if configured + llm_client, llm_model = _create_llm_client() + if llm_client: + try: + ocr_prompt = prompt or ( + "Extract all text from this image. Return ONLY the extracted text, " + "maintaining the original layout and order. Do not add any commentary." + ) + text = _llm_vision_extract(llm_client, llm_model, abs_path, ocr_prompt) + info["text"] = text + info["ocr_mode"] = "llm_vision" + return json.dumps(info, ensure_ascii=False) + except Exception as exc: + info["error"] = str(exc) + + # No LLM — return metadata for AI assistant to process + info["text"] = None + info["ocr_mode"] = "ai_assistant_driven" + info["hint"] = ( + "No LLM configured. 
Use your vision capability to read this image directly: " + f"Read tool with path={abs_path}" + ) + return json.dumps(info, ensure_ascii=False) + + +# --------------------------------------------------------------------------- +# Helper functions +# --------------------------------------------------------------------------- + +def _resolve_image_refs(text: str, image_dir: str) -> str: + """Rewrite relative image references in markdown to absolute paths on disk.""" + def _replace(match: re.Match) -> str: + alt = match.group("alt") + src = match.group("src") + if os.path.isabs(src) or src.startswith(("http:", "https:", "data:")): + return match.group(0) + # Resolve relative path against image_dir + abs_src = os.path.normpath(os.path.join(image_dir, src)) + return f"![{alt}]({abs_src})" + + return re.sub(r"!\[(?P[^\]]*)\]\((?P[^)]+)\)", _replace, text) + + +def _build_image_info( + img_path: str, + meta_str: str, + text_skeleton: str, + match: re.Match | None, + position: str | None = None, +) -> dict: + """Build an image info dict for the response.""" + size_bytes: int | None = None + try: + size_bytes = os.path.getsize(img_path) + except OSError: + pass + + width: int | None = None + height: int | None = None + + # Try metadata comment first (e.g. 
"1920x1080, 239KB") + dim_match = re.search(r"(\d+)\s*x\s*(\d+)", meta_str) if meta_str else None + if dim_match: + width, height = int(dim_match.group(1)), int(dim_match.group(2)) + else: + try: + from PIL import Image as PILImage + with PILImage.open(img_path) as img: + width, height = img.size + except Exception: + pass + + # Build position context + if position is None and match is not None: + start = max(0, match.start() - 60) + end = min(len(text_skeleton), match.end() + 60) + ctx_before = text_skeleton[start:match.start()].strip().split("\n")[-1] + ctx_after = text_skeleton[match.end():end].strip().split("\n")[0] + parts: list[str] = [] + if ctx_before: + parts.append(f"...{ctx_before}") + parts.append("[image]") + if ctx_after: + parts.append(f"{ctx_after}...") + position = " ".join(parts) + + return { + "path": img_path, + "uri": Path(img_path).as_uri() if os.path.exists(img_path) else None, + "position": position or "", + "size_bytes": size_bytes, + "width": width, + "height": height, + } + + +def _llm_vision_extract(client, model: str, image_path: str, prompt: str) -> str: + """Use an OpenAI-compatible vision model to extract text from an image.""" + import mimetypes + + content_type, _ = mimetypes.guess_type(image_path) + if not content_type: + content_type = "image/png" + + with open(image_path, "rb") as f: + b64 = base64.b64encode(f.read()).decode("ascii") + + data_uri = f"data:{content_type};base64,{b64}" + + response = client.chat.completions.create( + model=model, + messages=[ + { + "role": "user", + "content": [ + {"type": "text", "text": prompt}, + {"type": "image_url", "image_url": {"url": data_uri}}, + ], + } + ], + ) + return response.choices[0].message.content.strip() def check_plugins_enabled() -> bool: - return os.getenv("MARKITDOWN_ENABLE_PLUGINS", "false").strip().lower() in ( - "true", - "1", - "yes", + """Check if plugins should be enabled. 
Default is True (load plugins if available).""" + return os.getenv("MARKITDOWN_ENABLE_PLUGINS", "true").strip().lower() not in ( + "false", + "0", + "no", + "off", ) +# --------------------------------------------------------------------------- +# HTTP/SSE transport +# --------------------------------------------------------------------------- + def create_starlette_app(mcp_server: Server, *, debug: bool = False) -> Starlette: sse = SseServerTransport("/messages/") session_manager = StreamableHTTPSessionManager( @@ -59,13 +473,15 @@ async def handle_streamable_http( @contextlib.asynccontextmanager async def lifespan(app: Starlette) -> AsyncIterator[None]: - """Context manager for session manager.""" + """Context manager for session manager with temp-dir lifecycle.""" + _cleanup_old_dirs(_temp_root, max_age_seconds=86400) async with session_manager.run(): print("Application started with StreamableHTTP session manager!") try: yield finally: print("Application shutting down...") + _cleanup_all(_temp_root) return Starlette( debug=debug, @@ -78,7 +494,10 @@ async def lifespan(app: Starlette) -> AsyncIterator[None]: ) +# --------------------------------------------------------------------------- # Main entry point +# --------------------------------------------------------------------------- + def main(): import argparse @@ -120,7 +539,7 @@ def main(): "WARNING: The server is being bound to a non-localhost interface " f"({host}).\n" "This exposes the server to other machines on the network or Internet.\n" - "The server has NO authentication and runs with your user's privileges.\n" + "The server has NO authentication and runs with the user's privileges.\n" "Any process or user that can reach this interface can read files and\n" "fetch network resources accessible to this user.\n" "Only proceed if you understand the security implications.\n", @@ -133,7 +552,12 @@ def main(): port=args.port if args.port else 3001, ) else: - mcp.run() + # STDIO mode: clean stale dirs on 
start, clean up on exit + _cleanup_old_dirs(_temp_root, max_age_seconds=86400) + try: + mcp.run() + finally: + _cleanup_all(_temp_root) if __name__ == "__main__": diff --git a/packages/markitdown-ocr/src/markitdown_ocr/_docx_converter_with_ocr.py b/packages/markitdown-ocr/src/markitdown_ocr/_docx_converter_with_ocr.py index f2463de11..087bc8354 100644 --- a/packages/markitdown-ocr/src/markitdown_ocr/_docx_converter_with_ocr.py +++ b/packages/markitdown-ocr/src/markitdown_ocr/_docx_converter_with_ocr.py @@ -4,8 +4,10 @@ """ import io +import os import re import sys +import tempfile from typing import Any, BinaryIO, Optional from markitdown.converters import HtmlConverter @@ -15,7 +17,7 @@ MissingDependencyException, MISSING_DEPENDENCY_MESSAGE, ) -from ._ocr_service import LLMVisionOCRService +from ._ocr_service import LLMVisionOCRService, format_image_reference # Try loading dependencies _dependency_exc_info = None @@ -82,6 +84,15 @@ def convert( kwargs.get("ocr_service") or self.ocr_service ) + # --- extract_only mode: skip OCR, emit image file references --- + if kwargs.get("extract_only", False): + image_output_dir = kwargs.get("image_output_dir") or tempfile.mkdtemp( + prefix="markitdown_ocr_" + ) + os.makedirs(image_output_dir, exist_ok=True) + _eo_kwargs = {k: v for k, v in kwargs.items() if k not in ("image_output_dir",)} + return self._convert_extract_only(file_stream, image_output_dir, **_eo_kwargs) + if ocr_service: # 1. Extract and OCR images — returns raw text per image file_stream.seek(0) @@ -187,3 +198,92 @@ def replace_img(match: re.Match) -> str: # type: ignore[type-arg] result += f"

{_PLACEHOLDER.format(i)}

" return result, ocr_texts + + def _convert_extract_only( + self, file_stream: BinaryIO, image_output_dir: str, **kwargs: Any + ) -> DocumentConverterResult: + """ + Extract-only mode: extract text via mammoth and save embedded images to disk. + No OCR is performed; images are referenced via file paths. + """ + from PIL import Image + + # 1. Extract images from DOCX and save to disk + file_stream.seek(0) + doc = Document(file_stream) + + image_paths: list[str] = [] # ordered list of saved image paths + img_idx = 0 + for rel in doc.part.rels.values(): + if "image" in rel.target_ref.lower(): + try: + image_bytes = rel.target_part.blob + + # Determine extension and dimensions + ext = "png" + width, height = None, None + try: + pil_img = Image.open(io.BytesIO(image_bytes)) + fmt = pil_img.format + if fmt: + ext = fmt.lower() + if ext == "jpeg": + ext = "jpg" + width, height = pil_img.size + except Exception: + pass + + filename = f"docx_image_{img_idx}.{ext}" + filepath = os.path.join(image_output_dir, filename) + with open(filepath, "wb") as f: + f.write(image_bytes) + + image_paths.append( + format_image_reference( + filepath, + width=width, + height=height, + size_bytes=len(image_bytes), + ) + ) + img_idx += 1 + except Exception: + continue + + # 2. Convert DOCX -> HTML via mammoth + file_stream.seek(0) + pre_process_stream = pre_process_docx(file_stream) + html_result = mammoth.convert_to_html( + pre_process_stream, style_map=kwargs.get("style_map") + ).value + + # 3. Replace tags with placeholders + _EO_PLACEHOLDER = "MARKITDOWNEXTRACTONLY{}" + used: list[int] = [] + + def replace_img(match: re.Match) -> str: # type: ignore[type-arg] + for i in range(len(image_paths)): + if i not in used: + used.append(i) + return f"

{_EO_PLACEHOLDER.format(i)}

" + return "" + + html_with_placeholders = re.sub(r"]*>", replace_img, html_result) + + # Any images that had no matching tag go at the end + for i in range(len(image_paths)): + if i not in used: + html_with_placeholders += f"

{_EO_PLACEHOLDER.format(i)}

" + + # 4. Convert HTML -> markdown + md_result = self._html_converter.convert_string( + html_with_placeholders, **kwargs + ) + md = md_result.markdown + + # 5. Swap placeholders for image references + for i, img_ref in enumerate(image_paths): + placeholder = _EO_PLACEHOLDER.format(i) + md = md.replace(placeholder, img_ref) + + return DocumentConverterResult(markdown=md) diff --git a/packages/markitdown-ocr/src/markitdown_ocr/_ocr_service.py b/packages/markitdown-ocr/src/markitdown_ocr/_ocr_service.py index 2885e1f47..1e25826ea 100644 --- a/packages/markitdown-ocr/src/markitdown_ocr/_ocr_service.py +++ b/packages/markitdown-ocr/src/markitdown_ocr/_ocr_service.py @@ -4,12 +4,56 @@ """ import base64 +import os from typing import Any, BinaryIO from dataclasses import dataclass from markitdown import StreamInfo +def format_image_reference( + image_path: str, + width: int | None = None, + height: int | None = None, + size_bytes: int | None = None, +) -> str: + """ + Format an annotated markdown image reference for extract-only mode. + + Args: + image_path: Filesystem path to the saved image. + width: Image width in pixels (optional). + height: Image height in pixels (optional). + size_bytes: Image file size in bytes (optional). + + Returns: + Markdown string with metadata comment and image link. 
+ """ + parts: list[str] = [] + + # Build metadata comment + meta_parts: list[str] = [] + if width is not None and height is not None: + meta_parts.append(f"{width}x{height}") + if size_bytes is not None: + size_kb = round(size_bytes / 1024, 1) + meta_parts.append(f"{size_kb}KB") + + if meta_parts: + parts.append(f"") + else: + # Fallback: try to get file size from disk + try: + file_size = os.path.getsize(image_path) + size_kb = round(file_size / 1024, 1) + parts.append(f"") + except Exception: + pass + + parts.append(f"![image]({image_path})") + return "\n".join(parts) + + @dataclass class OCRResult: """Result from OCR extraction.""" diff --git a/packages/markitdown-ocr/src/markitdown_ocr/_pdf_converter_with_ocr.py b/packages/markitdown-ocr/src/markitdown_ocr/_pdf_converter_with_ocr.py index c1dc0f613..fad9943c8 100644 --- a/packages/markitdown-ocr/src/markitdown_ocr/_pdf_converter_with_ocr.py +++ b/packages/markitdown-ocr/src/markitdown_ocr/_pdf_converter_with_ocr.py @@ -4,7 +4,9 @@ """ import io +import os import sys +import tempfile from typing import Any, BinaryIO, Optional from markitdown import DocumentConverter, DocumentConverterResult, StreamInfo @@ -12,7 +14,7 @@ MissingDependencyException, MISSING_DEPENDENCY_MESSAGE, ) -from ._ocr_service import LLMVisionOCRService +from ._ocr_service import LLMVisionOCRService, format_image_reference # Import dependencies _dependency_exc_info = None @@ -181,6 +183,15 @@ def convert( file_stream.seek(0) pdf_bytes = io.BytesIO(file_stream.read()) + # --- extract_only mode: skip OCR, emit image file references --- + extract_only = kwargs.get("extract_only", False) + if extract_only: + image_output_dir = kwargs.get("image_output_dir") or tempfile.mkdtemp( + prefix="markitdown_ocr_" + ) + os.makedirs(image_output_dir, exist_ok=True) + return self._convert_extract_only(pdf_bytes, image_output_dir) + markdown_content = [] try: @@ -310,6 +321,74 @@ def convert( return DocumentConverterResult(markdown=markdown) + def 
_convert_extract_only( + self, pdf_bytes: io.BytesIO, image_output_dir: str + ) -> DocumentConverterResult: + """ + Extract-only mode: extract text skeleton and save embedded images to disk. + No OCR is performed; images are referenced via file paths. + """ + markdown_content: list[str] = [] + + try: + with pdfplumber.open(pdf_bytes) as pdf: + for page_num, page in enumerate(pdf.pages, 1): + markdown_content.append(f"\n## Page {page_num}\n") + + # Extract regular text + text_content = page.extract_text() or "" + if text_content.strip(): + markdown_content.append(text_content.strip()) + + # Extract and save images + images_on_page = self._extract_page_images(pdf_bytes, page_num) + for idx, img_info in enumerate(images_on_page): + img_stream: io.BytesIO = img_info["stream"] + img_stream.seek(0) + img_data = img_stream.read() + + # Determine extension from image data (default png) + ext = "png" + try: + pil_img = Image.open(io.BytesIO(img_data)) + fmt = pil_img.format + if fmt: + ext = fmt.lower() + if ext == "jpeg": + ext = "jpg" + width, height = pil_img.size + except Exception: + width, height = None, None + + filename = f"page_{page_num}_{idx}.{ext}" + filepath = os.path.join(image_output_dir, filename) + with open(filepath, "wb") as f: + f.write(img_data) + + img_ref = format_image_reference( + filepath, + width=width, + height=height, + size_bytes=len(img_data), + ) + markdown_content.append(f"\n{img_ref}\n") + + markdown = "\n\n".join(markdown_content).strip() + + # Fallback to pdfminer if empty + if not markdown: + pdf_bytes.seek(0) + markdown = pdfminer.high_level.extract_text(pdf_bytes) + + except Exception: + try: + pdf_bytes.seek(0) + markdown = pdfminer.high_level.extract_text(pdf_bytes) + except Exception: + markdown = "" + + return DocumentConverterResult(markdown=markdown) + def _extract_page_images(self, pdf_bytes: io.BytesIO, page_num: int) -> list[dict]: """ Extract images from a PDF page using pdfplumber. 
diff --git a/packages/markitdown-ocr/src/markitdown_ocr/_pptx_converter_with_ocr.py b/packages/markitdown-ocr/src/markitdown_ocr/_pptx_converter_with_ocr.py index 7e91ed6b4..cdfff70b0 100644 --- a/packages/markitdown-ocr/src/markitdown_ocr/_pptx_converter_with_ocr.py +++ b/packages/markitdown-ocr/src/markitdown_ocr/_pptx_converter_with_ocr.py @@ -4,7 +4,9 @@ """ import io +import os import sys +import tempfile from typing import Any, BinaryIO, Optional from typing import BinaryIO, Any, Optional @@ -15,7 +17,7 @@ MissingDependencyException, MISSING_DEPENDENCY_MESSAGE, ) -from ._ocr_service import LLMVisionOCRService +from ._ocr_service import LLMVisionOCRService, format_image_reference _dependency_exc_info = None try: @@ -74,6 +76,16 @@ def convert( ) llm_client = kwargs.get("llm_client") + # --- extract_only mode: skip OCR, emit image file references --- + if kwargs.get("extract_only", False): + image_output_dir = kwargs.get("image_output_dir") or tempfile.mkdtemp( + prefix="markitdown_ocr_" + ) + os.makedirs(image_output_dir, exist_ok=True) + # Pop keys already consumed as positional args to avoid duplicate keyword + _eo_kwargs = {k: v for k, v in kwargs.items() if k not in ("image_output_dir",)} + return self._convert_extract_only(file_stream, image_output_dir, **_eo_kwargs) + presentation = pptx.Presentation(file_stream) md_content = "" slide_num = 0 @@ -185,6 +197,111 @@ def get_shape_content(shape, **kwargs): return DocumentConverterResult(markdown=md_content.strip()) + def _convert_extract_only( + self, file_stream: BinaryIO, image_output_dir: str, **kwargs: Any + ) -> DocumentConverterResult: + """ + Extract-only mode: extract text and save embedded images to disk. + No OCR or LLM description is performed; images are referenced via file paths. 
+ """ + from PIL import Image + + presentation = pptx.Presentation(file_stream) + md_content = "" + slide_num = 0 + global_img_idx = 0 + + for slide in presentation.slides: + slide_num += 1 + md_content += f"\n\n\n" + + title = slide.shapes.title + + def get_shape_content_extract_only(shape, **kw): + nonlocal md_content, global_img_idx + + # Pictures + if self._is_picture(shape): + try: + image_bytes = shape.image.blob + + ext = "png" + width, height = None, None + try: + pil_img = Image.open(io.BytesIO(image_bytes)) + fmt = pil_img.format + if fmt: + ext = fmt.lower() + if ext == "jpeg": + ext = "jpg" + width, height = pil_img.size + except Exception: + pass + + filename = f"slide_{slide_num}_{global_img_idx}.{ext}" + filepath = os.path.join(image_output_dir, filename) + with open(filepath, "wb") as f: + f.write(image_bytes) + + img_ref = format_image_reference( + filepath, + width=width, + height=height, + size_bytes=len(image_bytes), + ) + md_content += f"\n{img_ref}\n" + global_img_idx += 1 + except Exception: + pass + + # Tables + if self._is_table(shape): + md_content += self._convert_table_to_markdown(shape.table, **kw) + + # Charts + if shape.has_chart: + md_content += self._convert_chart_to_markdown(shape.chart) + + # Text areas + elif shape.has_text_frame: + if shape == title: + md_content += "# " + shape.text.lstrip() + "\n" + else: + md_content += shape.text + "\n" + + # Group Shapes + if shape.shape_type == pptx.enum.shapes.MSO_SHAPE_TYPE.GROUP: + sorted_shapes = sorted( + shape.shapes, + key=lambda x: ( + float("-inf") if not x.top else x.top, + float("-inf") if not x.left else x.left, + ), + ) + for subshape in sorted_shapes: + get_shape_content_extract_only(subshape, **kw) + + sorted_shapes = sorted( + slide.shapes, + key=lambda x: ( + float("-inf") if not x.top else x.top, + float("-inf") if not x.left else x.left, + ), + ) + for shape in sorted_shapes: + get_shape_content_extract_only(shape, **kwargs) + + md_content = md_content.strip() + + if 
def _convert_extract_only(
    self, file_stream: BinaryIO, image_output_dir: str, **kwargs: Any
) -> "DocumentConverterResult":
    """
    Extract-only mode: extract table data and save embedded images to disk.
    No OCR is performed; images are referenced via file paths.

    Args:
        file_stream: Seekable binary stream containing the workbook. It is
            rewound before the workbook is loaded and again before each
            per-sheet pandas read.
        image_output_dir: Existing directory that extracted images are
            written into.
        **kwargs: Forwarded to the HTML converter after ``ocr_service``,
            ``extract_only`` and ``image_output_dir`` are removed.

    Returns:
        A ``DocumentConverterResult`` whose markdown holds one ``## <name>``
        section per sheet: the sheet's table (when it could be converted)
        followed by one image reference per extracted image.
    """
    import re

    from PIL import Image

    file_stream.seek(0)
    wb = load_workbook(file_stream)

    # Filter out ocr_service / extract_only from kwargs passed to HTML converter
    html_kwargs = {
        k: v
        for k, v in kwargs.items()
        if k not in ("ocr_service", "extract_only", "image_output_dir")
    }

    # Sheet names end up in file names. Replace path separators, characters
    # reserved on Windows, and any whitespace/control character so the name
    # is usable on every platform.
    unsafe_filename_chars = re.compile(r'[\\/:*?"<>|\s\x00-\x1f]')

    md_parts = []

    for sheet_name in wb.sheetnames:
        sheet = wb[sheet_name]
        md_parts.append(f"## {sheet_name}\n\n")

        # Convert sheet data to markdown table
        file_stream.seek(0)
        try:
            df = pd.read_excel(
                file_stream, sheet_name=sheet_name, engine="openpyxl"
            )
            table_md = self._html_converter.convert_string(
                df.to_html(index=False), **html_kwargs
            ).markdown.strip()
            md_parts.append(table_md + "\n\n")
        except Exception:
            # Deliberate best effort: a sheet that cannot be tabulated keeps
            # its heading and may still contribute images below.
            pass

        # Extract images and save to disk
        img_idx = 0
        for img in getattr(sheet, "_images", None) or ():
            try:
                # Get image data
                if hasattr(img, "_data"):
                    image_data = img._data()
                elif hasattr(img, "image"):
                    image_data = img.image
                else:
                    continue

                # Only raw bytes can be written out. Checking before the file
                # is opened avoids leaving an empty file behind.
                if not isinstance(image_data, (bytes, bytearray)):
                    continue

                # Determine extension and dimensions
                ext = "png"
                width, height = None, None
                try:
                    with Image.open(io.BytesIO(image_data)) as pil_img:
                        fmt = pil_img.format
                        if fmt:
                            ext = fmt.lower()
                            if ext == "jpeg":
                                ext = "jpg"
                        width, height = pil_img.size
                except Exception:
                    # Unknown format: keep the default extension and omit
                    # dimensions rather than dropping the image.
                    pass

                filename = unsafe_filename_chars.sub(
                    "_", f"xlsx_{sheet_name}_{img_idx}.{ext}"
                )
                filepath = os.path.join(image_output_dir, filename)
                with open(filepath, "wb") as f:
                    f.write(image_data)

                img_ref = format_image_reference(
                    filepath,
                    width=width,
                    height=height,
                    size_bytes=len(image_data),
                )
                md_parts.append(f"\n{img_ref}\n\n")
                img_idx += 1
            except Exception:
                # One unreadable or unwritable image must not abort the sheet.
                continue

    return DocumentConverterResult(markdown="".join(md_parts).strip())
b/packages/markitdown/src/markitdown/converters/_image_converter.py @@ -1,6 +1,7 @@ from typing import BinaryIO, Any, Union import base64 import mimetypes +import os from ._exiftool import exiftool_metadata from .._base_converter import DocumentConverter, DocumentConverterResult from .._stream_info import StreamInfo @@ -42,6 +43,12 @@ def convert( stream_info: StreamInfo, **kwargs: Any, # Options to pass to the converter ) -> DocumentConverterResult: + extract_only = kwargs.get("extract_only", False) + + # --- Extract-only mode: save image to disk, return path reference --- + if extract_only: + return self._extract_only(file_stream, stream_info, **kwargs) + md_content = "" # Add metadata @@ -84,6 +91,101 @@ def convert( markdown=md_content, ) + def _extract_only( + self, + file_stream: BinaryIO, + stream_info: StreamInfo, + **kwargs: Any, + ) -> DocumentConverterResult: + """Extract image to disk and return a markdown reference with metadata.""" + import tempfile + import uuid + + # Determine output directory + image_output_dir = kwargs.get("image_output_dir") + if image_output_dir is None: + image_output_dir = tempfile.mkdtemp(prefix="markitdown_images_") + + os.makedirs(image_output_dir, exist_ok=True) + + # Determine file extension + extension = stream_info.extension or "" + if not extension: + ext = mimetypes.guess_extension(stream_info.mimetype or "") or ".png" + else: + ext = extension if extension.startswith(".") else "." 
+ extension + + # Generate unique filename + unique_id = uuid.uuid4().hex[:12] + filename = f"img_{unique_id}{ext}" + file_path = os.path.join(image_output_dir, filename) + + # Read image data and collect metadata + cur_pos = file_stream.tell() + try: + image_data = file_stream.read() + finally: + file_stream.seek(cur_pos) + + size_bytes = len(image_data) + + # Get image dimensions if possible + width, height = self._get_image_dimensions(image_data, ext) + + # Write image to disk + with open(file_path, "wb") as f: + f.write(image_data) + + # Build markdown output with metadata comment + md_parts = [] + if width and height: + md_parts.append(f"") + else: + md_parts.append(f"") + md_parts.append(f"![image]({file_path})") + + return DocumentConverterResult( + markdown="\n".join(md_parts), + ) + + def _get_image_dimensions( + self, image_data: bytes, ext: str + ) -> tuple: + """Try to get image dimensions without external dependencies.""" + try: + from PIL import Image + img = Image.open(__import__("io").BytesIO(image_data)) + return img.size # (width, height) + except ImportError: + pass + + # Fallback: try to parse PNG/JPEG headers + if ext.lower() in (".png",) and len(image_data) >= 24: + import struct + try: + w, h = struct.unpack(">II", image_data[16:24]) + return (w, h) + except Exception: + pass + + if ext.lower() in (".jpg", ".jpeg") and len(image_data) > 10: + try: + import struct + idx = 2 + while idx < len(image_data) - 9: + if image_data[idx] != 0xFF: + break + marker = image_data[idx + 1] + if marker in (0xC0, 0xC1, 0xC2): + h, w = struct.unpack(">HH", image_data[idx + 5:idx + 9]) + return (w, h) + length = struct.unpack(">H", image_data[idx + 2:idx + 4])[0] + idx += 2 + length + except Exception: + pass + + return (None, None) + def _get_llm_description( self, file_stream: BinaryIO, diff --git a/skills/markitdown-convert/SKILL.md b/skills/markitdown-convert/SKILL.md new file mode 100644 index 000000000..a708bb037 --- /dev/null +++ 
b/skills/markitdown-convert/SKILL.md @@ -0,0 +1,120 @@ +--- +name: markitdown-convert +description: Converts documents (PDF/PPTX/DOCX/XLSX) and images to structured Markdown using markitdown-mcp MCP server + AI vision OCR. Use when the user asks to convert any document or image file to markdown, or mentions markitdown. +version: 1.0.0 +--- + +# MarkItDown Document-to-Markdown Workflow + +## Prerequisites + +- markitdown-mcp MCP Server is configured and running (STDIO mode) +- Three MCP tools available: `convert_to_markdown`, `analyze_document`, `ocr_image` +- Path 1 (LLM): Optional — requires `MARKITDOWN_LLM_API_KEY` / `MARKITDOWN_LLM_BASE_URL` / `MARKITDOWN_LLM_MODEL` environment variables +- Path 2 (AI assistant-driven): No extra configuration needed — uses the AI assistant's own Read visual capability for OCR + +## Step 1: Format Routing + +After receiving a file, determine its type and select the appropriate path: + +| File Type | Extensions | Handling | +|---|---|---| +| Document files | .pdf .pptx .docx .xlsx .html .csv | Use MCP tools | +| Raw images | .png .jpg .jpeg .gif .bmp .webp | **Bypass MCP** — use the Read tool's visual OCR directly | +| Other | Unknown format | Try `convert_to_markdown` first; if it fails, notify the user | + +**Key limitation**: The three MCP tools only accept document formats and do not support raw image input. 
+ +## Step 2: Determine Document Type (for document files) + +Call the `analyze_document` tool and inspect the returned `text_skeleton`: + +- **text_skeleton contains rich text** → Pure text / semi-structured document (e.g., text-based PDF, DOCX) + - Use `convert_to_markdown` directly for a one-step conversion + - If the document contains images and their content needs to be OCR'd, proceed to Step 3 + +- **text_skeleton contains only image references or is nearly empty** → Scanned document (e.g., scanned PDF, image-based PPT) + - Must proceed to Step 3 for the OCR workflow + +## Step 3: Image OCR Workflow + +### 3.1 Collect Images + +The `analyze_document` result includes a list of images, each with path and dimension information. Images are typically saved in a temporary directory. + +### 3.2 Filter Out Decorative Images + +**Not all images need OCR**. Skip the following types: + +| Filter Condition | Description | +|---|---| +| Any dimension >= 2000px | Usually full-page backgrounds or decorative base images | +| Any dimension <= 15px | Divider lines, thin decorative elements | +| Width <= 72px **and** Height <= 72px **and** File size < 2KB | Small icons, decorative elements | + +**Note**: Do not use <= 120px as the threshold — it will incorrectly remove meaningful small images (e.g., a 220x64 "PRACTICE" badge, 112x112 step icons). The 72px + 2KB dual condition is a verified safe threshold. + +### 3.3 OCR Each Image + +For each remaining image after filtering, use the **Read tool** to read it, passing the image's absolute path as reported by `analyze_document`: + +``` +Read(file_path="/absolute/path/to/extracted/image.png") +``` + +The AI assistant uses its visual capability to directly "see" the image content and extract text. + +**Batch processing tip**: Read 5 images in parallel each time to avoid context overload from too many simultaneous requests. 
+ +### 3.4 OCR Extraction Guidelines + +When reading each image, ensure the following content is extracted: + +- **Body paragraphs**: Fully reconstruct the text without omissions +- **Headings/Subheadings**: Recognize the hierarchy and map them to `#`/`##`/`###` +- **Data tables**: Reconstruct in Markdown table format (`| col1 | col2 |`) +- **Footnotes/Annotations**: Mark with `*` and place after the relevant paragraph +- **Signatures and dates**: Preserve the original text +- **Lists**: Reconstruct as ordered/unordered lists + +## Step 4: Assemble Final Markdown + +Merge all OCR results with the structural information from text_skeleton: + +1. Use the section headings from text_skeleton as the skeleton +2. Embed the OCR-extracted body text into the corresponding sections +3. Use standard Markdown table syntax for tables +4. Mark footnotes with `*` and place them immediately after the relevant content +5. Preserve the original document's heading hierarchy (`#` for level 1, `##` for level 2, etc.) +6. Place signatures and dates at the end of the document + +**Output location**: Save to the same directory as the source file (e.g., `D:\`), with the filename being the source filename plus the `.md` extension. + +## Step 5: Verification + +- Confirm all sections have content (not just empty image references) +- Confirm table formatting is correct (renders properly) +- Confirm no pages are missing +- Use `present_files` to show the result to the user + +## Common Pitfalls + +1. **Windows short paths**: Image paths extracted by MCP may use 8.3 short names (e.g., `ADMINI~1`), which the Read tool cannot resolve. markitdown-mcp has fixed this with `os.path.realpath()`, but check for path issues if encountered. + +2. **Scanned PDFs having an empty text_skeleton is normal**: This is not an error — it means every page of the PDF is an image. Proceed directly to the OCR workflow. + +3. 
**Image filter thresholds must not be too aggressive**: A previous threshold of < 120px & < 3KB incorrectly removed meaningful content images. Strictly use the 72px + 2KB dual condition. + +4. **MCP Server must be restarted after dependency changes**: The old process will not load newly installed Python packages. + +5. **Do not use MCP for raw images**: The MCP tools do not support raw image input (PNG/JPG, etc.) — use the Read visual capability directly instead. + +## Path 1 vs Path 2 Selection Guide + +| Scenario | Recommended Path | +|---|---| +| User has configured LLM environment variables | Path 1: Fully automatic — MCP internally calls LLM to describe images | +| User has not configured LLM (default) | Path 2: AI assistant Read visual OCR | +| Pure text document (no images) | Use `convert_to_markdown` directly — neither path is needed | +| Batch document processing | Path 1 is more efficient (no per-page Read calls) | +| High quality requirements (tables, footnotes, signatures) | Path 2 is more flexible — the AI assistant can understand document structure |