diff --git a/.claude/commands/docs-lint.md b/.claude/commands/docs-lint.md index c90fb02890..64969eb26f 100644 --- a/.claude/commands/docs-lint.md +++ b/.claude/commands/docs-lint.md @@ -1,21 +1,21 @@ -# Linting Documentation Notebooks +# Linting Documentation Python Snippets -Lint documentation notebooks for syntax and type errors without executing them using `docs/_scripts/lint_notebooks.py`. +Lint Markdown and MDX Python fenced code blocks for syntax and type errors without executing them using `docs/_scripts/lint_python_snippets.py`. ## Basic Usage ```sh -uv run python docs/_scripts/lint_notebooks.py +uv run python docs/_scripts/lint_python_snippets.py ``` Examples: ```sh -# Lint a single notebook -uv run python docs/_scripts/lint_notebooks.py docs/run-inference/tutorials/deploy-llm-nims.md +# Lint a single doc page +uv run python docs/_scripts/lint_python_snippets.py docs/run-inference/tutorials/deploy-llm-nims.md -# Lint all notebooks in a directory -uv run python docs/_scripts/lint_notebooks.py docs/run-inference/ +# Lint all Markdown/MDX pages in a directory +uv run python docs/_scripts/lint_python_snippets.py docs/run-inference/ ``` ### Running in Cursor Agent Sandbox Mode @@ -24,10 +24,10 @@ When running through the Cursor agent in sandbox mode, the agent will set `UV_CA ## Type Checking -Add `--type-check` to run `ty` type checker on the combined notebook cells: +`ty` type checking runs by default on the combined snippets; pass `--no-type-check` for a syntax-only run. Passing `--type-check` explicitly is equivalent to the default: ```sh -uv run python docs/_scripts/lint_notebooks.py docs/run-inference/ --type-check +uv run python docs/_scripts/lint_python_snippets.py docs/run-inference/ --type-check ``` This catches: @@ -45,10 +45,10 @@ Note: `ty` is alpha software and reports some false positives for SDK attributes ## How It Works The script: -1. Finds notebooks with the `@nemo-nb: process` marker -2. Extracts all Python code cells -3. Combines them into a single file (so cross-cell context works) -4. Runs `ty check` and passes through the output +1. 
Finds Markdown and MDX files under the requested paths +2. Extracts all `python` and `py` fenced code blocks +3. Combines snippets from each page into a single file so earlier snippets can define later context +4. Runs `ty check` and maps diagnostics back to the original doc lines ## Fixing Linter Errors @@ -92,4 +92,4 @@ The SDK's gateway methods return `object` type, causing false positives when acc ## Markers -Only notebooks with the `@nemo-nb: process` marker will be linted. +All Markdown and MDX pages under the requested paths are scanned. To skip a block, put one of the script's skip markers (`SKIP_NEXT_BLOCK_MARKERS` in `docs/_scripts/lint_python_snippets.py`) on its own line before the block; the markers in `SKIP_NEXT_TYPE_CHECK_MARKERS` skip only type checking for the next block. diff --git a/.claude/commands/docs-test.md b/.claude/commands/docs-test.md index 0916ba0432..00d98778bc 100644 --- a/.claude/commands/docs-test.md +++ b/.claude/commands/docs-test.md @@ -1,12 +1,14 @@ # Testing Documentation Notebooks -You can test documentation from the repository root using `docs/_scripts/run_notebooks.py`, which executes notebooks marked with `@nemo-nb: process`. +You can test documentation from the repository root using `docs/fern/scripts/run_notebooks.py`, which executes notebooks marked with `@nemo-nb: process`. ## Basic Usage The script takes a file path or directory path: -uv run python docs/_scripts/run_notebooks.py +```sh +uv run python docs/fern/scripts/run_notebooks.py +``` Environment Variables The script automatically loads environment variables from a .env file in the repository root (if present). 
@@ -22,20 +24,20 @@ NGC_API_KEY=your-ngc-key-here You can also override environment variables inline: ```sh -NMP_BASE_URL=http://custom-url:8080 uv run python docs/_scripts/run_notebooks.py docs/run-inference/ +NMP_BASE_URL=http://custom-url:8080 uv run python docs/fern/scripts/run_notebooks.py docs/run-inference/ ``` ## Language filters ```sh # Run only Python cells (default) -uv run python docs/_scripts/run_notebooks.py docs/run-inference/ --language python +uv run python docs/fern/scripts/run_notebooks.py docs/run-inference/ --language python # Run only shell cells -uv run python docs/_scripts/run_notebooks.py docs/run-inference/ --language shell +uv run python docs/fern/scripts/run_notebooks.py docs/run-inference/ --language shell ``` ## Markers -Only notebooks with the @nemo-nb: process marker will be executed. +Only notebooks with the `@nemo-nb: process` marker will be executed. diff --git a/Makefile b/Makefile index 6d693f2b6d..8b07750f68 100644 --- a/Makefile +++ b/Makefile @@ -19,6 +19,8 @@ endif PYTEST_EXTRA ?= PYTHON_VERSION ?= 3.11 BOOTSTRAP_CREATE_VENV ?= 1 +BOOTSTRAP_EXPECTED_VIRTUAL_ENV := $(CURDIR)/.venv +BOOTSTRAP_ACTIVATION_REMINDER = if [ "$${VIRTUAL_ENV:-}" != "$(BOOTSTRAP_EXPECTED_VIRTUAL_ENV)" ]; then echo ""; echo "Next steps:"; echo " source .venv/bin/activate"; echo " nemo --help"; fi # Display platform info $(info local system architecture: $(PLATFORM)/$(ARCH)) @@ -120,6 +122,16 @@ docs-watch: ## Start Fern docs dev plus a repo-level watcher for docs/** changes docs-check: ## Validate the Fern docs (fern check + validate-mdx + gated-link check) cd docs/fern && npm run check +.PHONY: docs-check-python-snippets +docs-check-python-snippets: ## Syntax-check and type-check Python snippets in one doc (DOCS_PATH=...) 
+ @if [ -z "$(strip $(DOCS_PATH))" ]; then echo "Usage: make docs-check-python-snippets DOCS_PATH=docs/customizer/tutorials/import-hf-model.mdx" >&2; exit 2; fi + uv run --frozen python docs/_scripts/lint_python_snippets.py "$(DOCS_PATH)" + +.PHONY: docs-run-notebook +docs-run-notebook: ## Execute one Fern notebook source (DOCS_PATH=.mdx/.ipynb/.md, optional ARGS=...) + @if [ -z "$(strip $(DOCS_PATH))" ]; then echo "Usage: make docs-run-notebook DOCS_PATH=docs/customizer/tutorials/sft-customization-job.mdx" >&2; exit 2; fi + uv run --frozen python docs/fern/scripts/run_notebooks.py $(ARGS) "$(DOCS_PATH)" + .PHONY: docs-broken-links docs-broken-links: ## Report broken links across the built docs cd docs/fern && npm run broken-links @@ -179,6 +191,9 @@ bootstrap-python: ## Bootstrap Python dependencies. @if [ -n "$(strip $(BOOTSTRAP_LOCAL_PLUGIN_DIRS))" ]; then \ $(MAKE) bootstrap-plugins BOOTSTRAP_LOCAL_PLUGIN_DIRS="$(BOOTSTRAP_LOCAL_PLUGIN_DIRS)"; \ fi + @if [ "$(filter bootstrap-python,$(MAKECMDGOALS))" = "bootstrap-python" ]; then \ + $(BOOTSTRAP_ACTIVATION_REMINDER); \ + fi .PHONY: verify-node-version verify-node-version: ## Verify pnpm and Node.js satisfy Studio's package engine @@ -222,6 +237,7 @@ bootstrap: bootstrap-python ## Bootstrap the local dev environment, including St echo " make bootstrap-studio"; \ fi @echo "bootstrap completed" + @$(BOOTSTRAP_ACTIVATION_REMINDER) .PHONY: run run: build-policy ## Run the NeMo Platform locally with Docker job backend diff --git a/docs/.gitignore b/docs/.gitignore index d5403b578f..804a791d39 100644 --- a/docs/.gitignore +++ b/docs/.gitignore @@ -38,6 +38,7 @@ _build conf.py # temporary ipynb files +*.tmp.md *.tmp.ipynb # base64 content diff --git a/docs/AGENTS.md b/docs/AGENTS.md index ea3d83d37c..c71dd39149 100644 --- a/docs/AGENTS.md +++ b/docs/AGENTS.md @@ -12,6 +12,8 @@ Run these from the repo root (they wrap `cd docs/fern && npm run …`): | `make docs` | Local dev server (live preview) | | `make docs-watch` | 
Local dev server plus repo-level watcher for `docs/**` changes outside `docs/fern/` | | `make docs-check` | `fern check` + MDX validation + gated-link check (what CI runs) | +| `make docs-check-python-snippets DOCS_PATH=...` | Syntax-check and type-check Python fenced snippets in one doc | +| `make docs-run-notebook DOCS_PATH=...` | Execute the source notebook for one Fern `.mdx`/`.ipynb` doc using `nemo-nb` markers | | `make docs-broken-links` | Report broken links | | `make docs-fix-links` | Auto-delink references into gated pages | diff --git a/docs/_scripts/lint_python_snippets.py b/docs/_scripts/lint_python_snippets.py new file mode 100644 index 0000000000..73fae2263e --- /dev/null +++ b/docs/_scripts/lint_python_snippets.py @@ -0,0 +1,455 @@ +#!/usr/bin/env python3 +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +"""Validate Python fenced code blocks in Markdown and MDX docs. + +The checker is intentionally extraction-based instead of MDX-renderer-based: +Fern pages may contain JSX, imports, or custom components, but Python snippet +validation only needs fenced ``python``/``py`` blocks. + +By default this script syntax-checks every Python snippet and combines snippets +per page to run ``ty`` over the extracted source while mapping diagnostics back +to the original doc line numbers. Pass ``--no-type-check`` for syntax-only +audits. 
+""" + +import argparse +import ast +import os +import re +import subprocess +import sys +import tempfile +from dataclasses import dataclass +from pathlib import Path +from typing import Iterable + +DOC_SUFFIXES = {".md", ".mdx"} +PYTHON_LANGUAGES = {"python", "py"} +SKIP_DIRS = { + ".git", + ".mypy_cache", + ".pytest_cache", + ".ruff_cache", + ".venv", + ".venv-docs", + ".venv-mkdocs", + "_build", + "_generated", + "node_modules", + "site", +} + +FENCE_RE = re.compile(r"^([ \t]*)(`{3,}|~{3,})(.*)$") +TY_OUTPUT_RE = re.compile(r"^(?P<path>.+?):(?P<line>\d+):(?P<column>\d+): (?P<message>.+)$") + +SKIP_NEXT_BLOCK_MARKERS = { + "", +} +SKIP_NEXT_TYPE_CHECK_MARKERS = { + "", + "", +} +DEFAULT_IGNORED_TY_RULES = ("possibly-unbound-attribute",) + + +@dataclass(frozen=True) +class PythonSnippet: + path: Path + start_line: int + source: str + type_check: bool + + +@dataclass(frozen=True) +class SnippetDiagnostic: + path: Path + line: int + column: int | None + message: str + + def format(self) -> str: + if self.column is None: + return f"{self.path}:{self.line}: {self.message}" + return f"{self.path}:{self.line}:{self.column}: {self.message}" + + +@dataclass(frozen=True) +class PageResult: + path: Path + snippets: int + syntax_errors: tuple[SnippetDiagnostic, ...] + type_errors: tuple[str, ...] + + @property + def passed(self) -> bool: + return not self.syntax_errors and not self.type_errors + + +@dataclass(frozen=True) +class PreparedTypeCheckFile: + doc_path: Path + temp_path: Path + line_mapping: tuple[int, ...] 
+ + +def find_doc_files(paths: Iterable[Path]) -> list[Path]: + doc_files: list[Path] = [] + for path in paths: + if not path.exists(): + raise FileNotFoundError(f"{path} does not exist") + + if path.is_file(): + if path.suffix in DOC_SUFFIXES: + doc_files.append(path) + continue + expected = ", ".join(sorted(DOC_SUFFIXES)) + raise ValueError(f"Expected a Markdown/MDX file ({expected}) or directory: {path}") + + for root, dirs, files in os.walk(path): + dirs[:] = [directory for directory in dirs if directory not in SKIP_DIRS] + for filename in files: + file_path = Path(root) / filename + if file_path.suffix in DOC_SUFFIXES: + doc_files.append(file_path) + + return sorted(set(doc_files)) + + +def fence_closes(line: str, fence_marker: str) -> bool: + stripped = line.lstrip(" \t") + marker_char = fence_marker[0] + marker_len = len(fence_marker) + if not stripped.startswith(marker_char * marker_len): + return False + return stripped.strip(marker_char).strip() == "" + + +def get_language(info_string: str) -> str: + stripped = info_string.strip() + if not stripped: + return "" + return stripped.split(maxsplit=1)[0].lower() + + +def strip_markdown_indent(line: str, indent: str) -> str: + if indent and line.startswith(indent): + return line[len(indent) :] + return line + + +def source_for_static_check(source: str) -> str: + """Return Python-parseable source for notebook-style cells. + + IPython line magics and shell escapes are valid in notebooks, but not in + ``ast`` or ``ty``. Keep line numbers stable by replacing them with comments. 
+ """ + transformed_lines: list[str] = [] + for line in source.splitlines(): + stripped = line.lstrip(" \t") + if stripped.startswith("%") or stripped.startswith("!"): + indent = line[: len(line) - len(stripped)] + transformed_lines.append(f"{indent}# {stripped}") + continue + transformed_lines.append(line) + return "\n".join(transformed_lines) + + +def extract_python_snippets(path: Path) -> list[PythonSnippet]: + lines = path.read_text(encoding="utf-8").splitlines() + snippets: list[PythonSnippet] = [] + skip_next_block = False + skip_next_type_check = False + index = 0 + + while index < len(lines): + stripped = lines[index].strip() + if stripped in SKIP_NEXT_BLOCK_MARKERS: + skip_next_block = True + index += 1 + continue + if stripped in SKIP_NEXT_TYPE_CHECK_MARKERS: + skip_next_type_check = True + index += 1 + continue + + fence_match = FENCE_RE.match(lines[index]) + if not fence_match: + index += 1 + continue + + indent, fence_marker, info_string = fence_match.groups() + language = get_language(info_string) + code_start_line = index + 2 + index += 1 + + code_lines: list[str] = [] + while index < len(lines) and not fence_closes(lines[index], fence_marker): + code_lines.append(strip_markdown_indent(lines[index], indent)) + index += 1 + + if index < len(lines): + index += 1 + + if language not in PYTHON_LANGUAGES: + skip_next_block = False + skip_next_type_check = False + continue + + if not skip_next_block: + source = "\n".join(code_lines) + if source.strip(): + snippets.append( + PythonSnippet( + path=path, + start_line=code_start_line, + source=source, + type_check=not skip_next_type_check, + ) + ) + + skip_next_block = False + skip_next_type_check = False + + return snippets + + +def syntax_check(snippets: Iterable[PythonSnippet]) -> list[SnippetDiagnostic]: + diagnostics: list[SnippetDiagnostic] = [] + for snippet in snippets: + try: + ast.parse(source_for_static_check(snippet.source)) + except SyntaxError as error: + line_offset = error.lineno or 1 + 
diagnostics.append( + SnippetDiagnostic( + path=snippet.path, + line=snippet.start_line + line_offset - 1, + column=error.offset, + message=error.msg, + ) + ) + return diagnostics + + +def prepare_type_check_file( + doc_path: Path, + snippets: Iterable[PythonSnippet], + temp_dir: Path, +) -> PreparedTypeCheckFile | None: + source_lines: list[str] = [] + line_mapping: list[int] = [] + + for snippet in snippets: + if not snippet.type_check: + continue + + snippet_lines = source_for_static_check(snippet.source).splitlines() + for offset, line in enumerate(snippet_lines): + source_lines.append(line) + line_mapping.append(snippet.start_line + offset) + + source_lines.append("") + if snippet_lines: + line_mapping.append(snippet.start_line + len(snippet_lines) - 1) + else: + line_mapping.append(snippet.start_line) + + if not any(line.strip() for line in source_lines): + return None + + with tempfile.NamedTemporaryFile( + "w", + encoding="utf-8", + dir=temp_dir, + prefix="snippet-", + suffix=".py", + delete=False, + ) as temp_file: + temp_file.write("\n".join(source_lines)) + temp_path = Path(temp_file.name) + + return PreparedTypeCheckFile(doc_path=doc_path, temp_path=temp_path, line_mapping=tuple(line_mapping)) + + +def translate_line_number(line_in_combined: int, line_mapping: tuple[int, ...]) -> int: + index = line_in_combined - 1 + if 0 <= index < len(line_mapping): + return line_mapping[index] + return line_in_combined + + +def run_type_check( + snippets_by_path: dict[Path, list[PythonSnippet]], + project_root: Path, + timeout_seconds: int, +) -> dict[Path, tuple[str, ...]]: + results: dict[Path, list[str]] = {path: [] for path in snippets_by_path} + if not snippets_by_path: + return {} + + with tempfile.TemporaryDirectory(prefix="nemo-docs-snippets-") as temp_dir_name: + temp_dir = Path(temp_dir_name) + prepared_files = [ + prepared + for path, snippets in snippets_by_path.items() + if (prepared := prepare_type_check_file(path, snippets, temp_dir)) is not None + ] + 
if not prepared_files: + return {path: tuple(messages) for path, messages in results.items()} + + temp_to_prepared = {str(prepared.temp_path): prepared for prepared in prepared_files} + command = [ + "uv", + "run", + "--frozen", + "ty", + "check", + "--project", + str(project_root), + "--output-format", + "concise", + "--no-progress", + ] + for rule in DEFAULT_IGNORED_TY_RULES: + command.extend(["--ignore", rule]) + command.extend(str(prepared.temp_path) for prepared in prepared_files) + + try: + completed = subprocess.run( + command, + cwd=project_root, + text=True, + capture_output=True, + timeout=timeout_seconds, + check=False, + ) + except FileNotFoundError: + return {path: ("Type checking failed: `uv` was not found on PATH",) for path in snippets_by_path} + except subprocess.TimeoutExpired: + return {path: (f"Type checking timed out after {timeout_seconds} seconds",) for path in snippets_by_path} + + output = "\n".join(part for part in (completed.stdout, completed.stderr) if part) + unmatched_lines: list[str] = [] + + for line in output.splitlines(): + match = TY_OUTPUT_RE.match(line) + if match is None: + if line.strip(): + unmatched_lines.append(line) + continue + + matched_prepared = temp_to_prepared.get(match.group("path")) + if matched_prepared is None: + if line.strip(): + unmatched_lines.append(line) + continue + + combined_line = int(match.group("line")) + column = match.group("column") + message = match.group("message") + doc_line = translate_line_number(combined_line, matched_prepared.line_mapping) + results[matched_prepared.doc_path].append(f"{matched_prepared.doc_path}:{doc_line}:{column}: {message}") + + if completed.returncode != 0 and not any(results.values()) and unmatched_lines: + shared_output = tuple(unmatched_lines) + return {path: shared_output for path in snippets_by_path} + + return {path: tuple(messages) for path, messages in results.items()} + + +def check_paths( + paths: Iterable[Path], + type_check: bool, + project_root: Path, + 
timeout_seconds: int, +) -> list[PageResult]: + doc_files = find_doc_files(paths) + snippets_by_path = {path: extract_python_snippets(path) for path in doc_files} + snippets_by_path = {path: snippets for path, snippets in snippets_by_path.items() if snippets} + + type_results: dict[Path, tuple[str, ...]] = {} + if type_check: + type_results = run_type_check(snippets_by_path, project_root, timeout_seconds) + + return [ + PageResult( + path=path, + snippets=len(snippets), + syntax_errors=tuple(syntax_check(snippets)), + type_errors=type_results.get(path, ()), + ) + for path, snippets in snippets_by_path.items() + ] + + +def display_results(results: list[PageResult], type_check: bool) -> None: + if not results: + print("No Python snippets found.") + return + + total_snippets = sum(result.snippets for result in results) + checks = "syntax + type" if type_check else "syntax" + print(f"Checked {total_snippets} Python snippet(s) across {len(results)} doc file(s) ({checks}).\n") + + for result in results: + if result.passed: + print(f"✓ {result.path} ({result.snippets} snippet(s))") + continue + + print(f"✗ {result.path} ({result.snippets} snippet(s))") + for diagnostic in result.syntax_errors: + print(f" SYNTAX: {diagnostic.format()}") + for message in result.type_errors: + print(f" TYPE: {message}") + + +def main() -> int: + parser = argparse.ArgumentParser(description="Validate Python fenced snippets in Markdown/MDX docs.") + parser.add_argument("paths", nargs="+", type=Path, help="Markdown/MDX files or directories to check.") + type_check_group = parser.add_mutually_exclusive_group() + type_check_group.add_argument( + "--type-check", + dest="type_check", + action="store_true", + help="Run `ty check` over extracted snippets. 
This is the default.", + ) + type_check_group.add_argument( + "--no-type-check", + dest="type_check", + action="store_false", + help="Only run structural syntax checks; skip `ty check`.", + ) + parser.set_defaults(type_check=True) + parser.add_argument( + "--project-root", + type=Path, + default=Path.cwd(), + help="Project root passed to `ty --project` when type checking. Defaults to cwd.", + ) + parser.add_argument( + "--timeout-seconds", + type=int, + default=120, + help="Timeout for the `ty check` subprocess when --type-check is enabled.", + ) + args = parser.parse_args() + + try: + results = check_paths( + paths=args.paths, + type_check=args.type_check, + project_root=args.project_root.resolve(), + timeout_seconds=args.timeout_seconds, + ) + except (FileNotFoundError, ValueError) as error: + print(f"ERROR: {error}", file=sys.stderr) + return 2 + + display_results(results, args.type_check) + return 1 if any(not result.passed for result in results) else 0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/docs/_scripts/test_lint_python_snippets.py b/docs/_scripts/test_lint_python_snippets.py new file mode 100644 index 0000000000..647de382aa --- /dev/null +++ b/docs/_scripts/test_lint_python_snippets.py @@ -0,0 +1,227 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# SPDX-License-Identifier: Apache-2.0 + +from pathlib import Path + +import pytest + +from docs._scripts import lint_python_snippets +from docs._scripts.lint_python_snippets import ( + PreparedTypeCheckFile, + PythonSnippet, + extract_python_snippets, + find_doc_files, + prepare_type_check_file, + run_type_check, + syntax_check, + translate_line_number, +) + + +def test_find_doc_files_includes_mdx_and_skips_node_modules(tmp_path: Path) -> None: + docs_dir = tmp_path / "docs" + docs_dir.mkdir() + mdx = docs_dir / "page.mdx" + md = docs_dir / "page.md" + ignored = docs_dir / "node_modules" / "package" / "README.md" + ignored.parent.mkdir(parents=True) + mdx.write_text("# Page\n", encoding="utf-8") + md.write_text("# Page\n", encoding="utf-8") + ignored.write_text("# Ignored\n", encoding="utf-8") + + assert find_doc_files([docs_dir]) == [md, mdx] + + +def test_find_doc_files_rejects_non_doc_file(tmp_path: Path) -> None: + notebook = tmp_path / "page.ipynb" + notebook.write_text("{}", encoding="utf-8") + + with pytest.raises(ValueError, match="Expected a Markdown/MDX file"): + find_doc_files([notebook]) + + +def test_extract_python_snippets_supports_mdx_info_strings_and_indent(tmp_path: Path) -> None: + doc = tmp_path / "page.mdx" + doc.write_text( + """ + + ```python title="example.py" + value = 1 + ``` + +```py +print(value) +``` + +""", + encoding="utf-8", + ) + + snippets = extract_python_snippets(doc) + + assert [(snippet.start_line, snippet.source) for snippet in snippets] == [ + (4, "value = 1"), + (8, "print(value)"), + ] + + +def test_extract_python_snippets_skip_markers(tmp_path: Path) -> None: + doc = tmp_path / "page.mdx" + doc.write_text( + """ +```python +this is intentionally not python +``` + + +```python +from litellm import completion +completion(model="demo", messages=[]) +``` + +```python +print("kept") +``` +""", + encoding="utf-8", + ) + + snippets = extract_python_snippets(doc) + + assert [(snippet.start_line, snippet.type_check, snippet.source) 
for snippet in snippets] == [ + (8, False, 'from litellm import completion\ncompletion(model="demo", messages=[])'), + (13, True, 'print("kept")'), + ] + + +def test_syntax_check_reports_original_doc_line(tmp_path: Path) -> None: + doc = tmp_path / "page.mdx" + doc.write_text( + """# Page + +```python +print("ok") +if True + print("bad") +``` +""", + encoding="utf-8", + ) + + diagnostics = syntax_check(extract_python_snippets(doc)) + + assert len(diagnostics) == 1 + assert diagnostics[0].line == 5 + assert diagnostics[0].column == 8 + assert diagnostics[0].path == doc + + +def test_syntax_check_allows_ipython_line_magics(tmp_path: Path) -> None: + doc = tmp_path / "page.mdx" + doc.write_text( + """```python +%pip install -q datasets +!echo ready +value = 1 +``` +""", + encoding="utf-8", + ) + + assert syntax_check(extract_python_snippets(doc)) == [] + + +def test_prepare_type_check_file_preserves_line_mapping(tmp_path: Path) -> None: + doc = tmp_path / "page.mdx" + doc.write_text( + """```python +value = 1 +``` + + +```python +skipped = unknown +``` + +```python +print(value) +``` +""", + encoding="utf-8", + ) + snippets = extract_python_snippets(doc) + + prepared = prepare_type_check_file(doc, snippets, tmp_path) + + assert prepared is not None + assert prepared.temp_path.read_text(encoding="utf-8") == "value = 1\n\nprint(value)\n" + assert prepared.line_mapping == (2, 2, 11, 11) + assert translate_line_number(3, prepared.line_mapping) == 11 + + +def test_prepare_type_check_file_uses_unique_temp_paths_for_colliding_doc_names(tmp_path: Path) -> None: + first_doc = tmp_path / "docs" / "a_b.md" + second_doc = tmp_path / "docs" / "a" / "b.md" + first_doc.parent.mkdir(parents=True) + second_doc.parent.mkdir(parents=True) + for doc in (first_doc, second_doc): + doc.write_text("```python\nvalue = 1\n```\n", encoding="utf-8") + + first_prepared = prepare_type_check_file(first_doc, extract_python_snippets(first_doc), tmp_path) + second_prepared = 
prepare_type_check_file(second_doc, extract_python_snippets(second_doc), tmp_path) + + assert first_prepared is not None + assert second_prepared is not None + assert first_prepared.temp_path != second_prepared.temp_path + assert first_prepared.temp_path.read_text(encoding="utf-8") == "value = 1\n" + assert second_prepared.temp_path.read_text(encoding="utf-8") == "value = 1\n" + + +def test_run_type_check_matches_temp_paths_exactly(monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -> None: + doc = tmp_path / "doc.md" + doc_with_prefixed_temp_path = tmp_path / "doc-prefixed.md" + temp_path = tmp_path / "snippet.py" + prefixed_temp_path = tmp_path / "snippet.py-extra" + prepared_files = { + doc: PreparedTypeCheckFile(doc_path=doc, temp_path=temp_path, line_mapping=(10,)), + doc_with_prefixed_temp_path: PreparedTypeCheckFile( + doc_path=doc_with_prefixed_temp_path, + temp_path=prefixed_temp_path, + line_mapping=(20,), + ), + } + + def fake_prepare_type_check_file( + doc_path: Path, + snippets: list[PythonSnippet], + temp_dir: Path, + ) -> PreparedTypeCheckFile: + return prepared_files[doc_path] + + def fake_run( + command: list[str], + **kwargs: object, + ) -> lint_python_snippets.subprocess.CompletedProcess[str]: + return lint_python_snippets.subprocess.CompletedProcess( + command, + returncode=1, + stdout=f"{prefixed_temp_path}:1:5: exact match only\n", + stderr="", + ) + + monkeypatch.setattr(lint_python_snippets, "prepare_type_check_file", fake_prepare_type_check_file) + monkeypatch.setattr(lint_python_snippets.subprocess, "run", fake_run) + + results = run_type_check( + { + doc: [PythonSnippet(path=doc, start_line=1, source="value = 1", type_check=True)], + doc_with_prefixed_temp_path: [ + PythonSnippet(path=doc_with_prefixed_temp_path, start_line=1, source="value = 2", type_check=True) + ], + }, + project_root=tmp_path, + timeout_seconds=120, + ) + + assert results[doc] == () + assert results[doc_with_prefixed_temp_path] == (f"{doc_with_prefixed_temp_path}:20:5: 
exact match only",) diff --git a/docs/fern/scripts/README.md b/docs/fern/scripts/README.md index ef43050b23..cbd3de6053 100644 --- a/docs/fern/scripts/README.md +++ b/docs/fern/scripts/README.md @@ -56,3 +56,32 @@ uv run python docs/fern/scripts/ipynb-to-mdx.py \ ``` Re-run whenever the source `.ipynb` changes. + +## `run_notebooks.py` + +Runs the source notebook for a Fern `.mdx` page using `nemo_nb` marker semantics: +the source notebook must have `@nemo-nb: process`, and `@nemo-nb: skip-test` +opts it out of execution. When given a Fern `.mdx` page, the runner resolves the +adjacent source `.ipynb` or the notebook linked from the Colab URL. + +Dry-run notebook selection: + +```bash +uv run python docs/fern/scripts/run_notebooks.py \ + --dry-run \ + docs/customizer/tutorials/sft-customization-job.mdx +``` + +Execute Python cells only (default): + +```bash +uv run python docs/fern/scripts/run_notebooks.py \ + docs/customizer/tutorials/sft-customization-job.mdx +``` + +Execute through Make: + +```bash +make docs-run-notebook DOCS_PATH=docs/customizer/tutorials/sft-customization-job.mdx +make docs-run-notebook DOCS_PATH=docs/customizer/tutorials/sft-customization-job.mdx ARGS=--dry-run +``` diff --git a/docs/fern/scripts/run_notebooks.py b/docs/fern/scripts/run_notebooks.py new file mode 100644 index 0000000000..b06f9e4915 --- /dev/null +++ b/docs/fern/scripts/run_notebooks.py @@ -0,0 +1,341 @@ +#!/usr/bin/env python3 +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +"""Run Fern documentation notebooks using nemo-nb discovery semantics. + +Fern pages generated from notebooks are ``.mdx`` files, but the executable +source remains the adjacent ``.ipynb``. This wrapper resolves Fern ``.mdx`` +pages back to their source notebooks and then executes only notebooks marked +with ``@nemo-nb: process`` and not marked with ``@nemo-nb: skip-test``. 
+Hand-authored Fern pages without a source notebook are materialized as +temporary Markdown files and executed through the same snippet runner. +""" + +import argparse +import os +import re +import sys +import tempfile +import time +from dataclasses import dataclass +from pathlib import Path + +from dotenv import load_dotenv +from nemo_nb import ( + find_testable_notebooks, + has_process_marker_markdown, + has_process_marker_notebook, + has_skip_test_marker_markdown, + has_skip_test_marker_notebook, + print_conflicts_error, +) + +COLAB_NOTEBOOK_RE = re.compile( + r"https://colab\.research\.google\.com/github/[^/]+/[^/]+/blob/(?:[^/]+/)+(?P<path>docs/[^)\"'\s]+\.ipynb)" +) +FERN_NOTEBOOK_RE = re.compile(r"colabUrl=[\"'].*?/blob/(?:[^/]+/)+(?P<path>docs/[^\"']+\.ipynb)[\"']") +FERN_MARKDOWN_RE = re.compile(r"^[ \t]*<Markdown\s+src=[\"'](?P<src>[^\"']+)[\"']\s*/>[ \t]*$", re.MULTILINE) +EXECUTABLE_FENCE_RE = re.compile(r"^```(?P<language>[\w+-]*)\s*$", re.MULTILINE) +EXECUTABLE_FENCE_LANGUAGES = {"python", "py", "sh", "bash", "shell"} +TIMEOUT_SECONDS = 3600 + +load_dotenv() + + +@dataclass(frozen=True) +class NotebookSelection: + path: Path + source: Path + + +def resolve_repo_root() -> Path: + return Path(__file__).resolve().parents[3] + + +def resolve_mdx_notebook(mdx_path: Path, repo_root: Path) -> Path: + sibling = mdx_path.with_suffix(".ipynb") + if sibling.exists(): + return sibling + + text = mdx_path.read_text(encoding="utf-8") + for pattern in (COLAB_NOTEBOOK_RE, FERN_NOTEBOOK_RE): + match = pattern.search(text) + if match: + linked = repo_root / match.group("path") + if linked.exists(): + return linked + + raise FileNotFoundError(f"Could not find source .ipynb for Fern page: {mdx_path}") + + +def resolve_fern_markdown_src(src: str, mdx_path: Path, repo_root: Path) -> Path: + if src.startswith("/snippets/"): + return repo_root / "docs" / "fern" / src.removeprefix("/") + if src.startswith("/"): + return repo_root / "docs" / src.removeprefix("/") + return mdx_path.parent / src + + +def 
mdx_to_markdown_text(mdx_path: Path, repo_root: Path, seen: set[Path] | None = None) -> str: + seen = seen or set() + resolved_mdx_path = mdx_path.resolve() + if resolved_mdx_path in seen: + raise RuntimeError(f"Recursive Fern Markdown include detected: {mdx_path}") + seen.add(resolved_mdx_path) + + try: + text = mdx_path.read_text(encoding="utf-8") + + def replace_markdown(match: re.Match[str]) -> str: + include_path = resolve_fern_markdown_src(match.group("src"), mdx_path, repo_root) + if not include_path.exists(): + return match.group(0) + return mdx_to_markdown_text(include_path, repo_root, seen) + + return FERN_MARKDOWN_RE.sub(replace_markdown, text) + finally: + seen.remove(resolved_mdx_path) + + +def has_executable_snippets(mdx_path: Path, repo_root: Path) -> bool: + text = mdx_to_markdown_text(mdx_path, repo_root) + return any(match.group("language") in EXECUTABLE_FENCE_LANGUAGES for match in EXECUTABLE_FENCE_RE.finditer(text)) + + +def materialize_mdx_as_markdown(mdx_path: Path, repo_root: Path) -> Path: + with tempfile.NamedTemporaryFile( + "w", + encoding="utf-8", + dir=mdx_path.parent, + prefix=f"{mdx_path.stem}-", + suffix=".tmp.md", + delete=False, + ) as temp_file: + temp_file.write(mdx_to_markdown_text(mdx_path, repo_root)) + temp_md_path = Path(temp_file.name) + return temp_md_path + + +def is_processable(path: Path) -> bool: + if path.suffix == ".ipynb": + return has_process_marker_notebook(path) + if path.suffix in {".md", ".mdx"}: + return has_process_marker_markdown(path) + return False + + +def is_skip_test(path: Path) -> bool: + if path.suffix == ".ipynb": + return has_skip_test_marker_notebook(path) + if path.suffix in {".md", ".mdx"}: + return has_skip_test_marker_markdown(path) + return False + + +def select_single_file(path: Path, repo_root: Path) -> list[NotebookSelection]: + if path.suffix == ".mdx": + try: + notebook_path = resolve_mdx_notebook(path, repo_root) + except FileNotFoundError: + if is_skip_test(path): + print(f"Skipping 
{path}: has @nemo-nb: skip-test marker") + return [] + if not is_processable(path) and not has_executable_snippets(path, repo_root): + print(f"Skipping {path}: missing @nemo-nb: process marker and executable snippets") + return [] + return [NotebookSelection(path=path, source=path)] + else: + notebook_path = path + + if notebook_path.suffix not in {".ipynb", ".md"}: + raise ValueError(f"Expected .mdx, .ipynb, or .md file: {path}") + if not is_processable(notebook_path): + print(f"Skipping {notebook_path}: missing @nemo-nb: process marker") + return [] + if is_skip_test(notebook_path): + print(f"Skipping {notebook_path}: has @nemo-nb: skip-test marker") + return [] + return [NotebookSelection(path=notebook_path, source=path)] + + +def select_notebooks(paths: list[Path], repo_root: Path) -> list[NotebookSelection]: + selections: list[NotebookSelection] = [] + seen: set[Path] = set() + + for input_path in paths: + path = input_path.resolve() + if not path.exists(): + raise FileNotFoundError(f"{input_path} does not exist") + + if path.is_file(): + candidates = select_single_file(path, repo_root) + else: + result = find_testable_notebooks(str(path)) + if result.conflicts: + print_conflicts_error(result.conflicts) + raise RuntimeError("Found conflicting .md and .ipynb notebook sources") + candidates = [ + NotebookSelection(path=notebook, source=path) for notebook in [*result.ipynb_files, *result.md_files] + ] + + for candidate in candidates: + resolved = candidate.path.resolve() + if resolved not in seen: + seen.add(resolved) + selections.append(candidate) + + return selections + + +def cleanup_selection_temp_files(selection_path: Path) -> None: + for suffix in (".tmp.ipynb", ".executed.ipynb"): + stale = selection_path.with_suffix(suffix) + if stale.exists(): + stale.unlink() + for stale in (selection_path.with_suffix(".expanded.md"), selection_path.with_suffix(".tmp.md")): + if stale.exists(): + stale.unlink() + + +def create_kernel(use_temporary_venv: bool, 
requirements_file: str | None) -> tuple[str, str | None, str | None]: + if requirements_file and not use_temporary_venv: + print("Warning: --requirements requires --use-temporary-venv and will be ignored.") + requirements_file = None + + if not use_temporary_venv: + return "python3", None, None + + from nmp.testing.notebooks import create_temp_venv_with_kernel + + kernel_name, temp_venv_dir, temp_kernel_spec_dir = create_temp_venv_with_kernel(requirements_file) + os.environ["VIRTUAL_ENV"] = temp_venv_dir + bin_dir = "Scripts" if sys.platform == "win32" else "bin" + os.environ["PATH"] = str(Path(temp_venv_dir) / bin_dir) + os.pathsep + os.environ["PATH"] + return kernel_name, temp_venv_dir, temp_kernel_spec_dir + + +def cleanup_kernel(kernel_name: str, temp_venv_dir: str | None, temp_kernel_spec_dir: str | None) -> None: + if temp_venv_dir and temp_kernel_spec_dir: + from nmp.testing.notebooks import cleanup_temp_venv_and_kernel + + cleanup_temp_venv_and_kernel(kernel_name, temp_venv_dir, temp_kernel_spec_dir) + + +def run_selected_notebooks( + selections: list[NotebookSelection], + language: str, + keep_temp_files: bool, + use_temporary_venv: bool, + requirements_file: str | None, + execution_timeout: int | None, +) -> int: + if not selections: + print("No testable notebooks found.") + return 0 + + print(f"Found {len(selections)} testable notebook(s):") + for selection in selections: + print(f" [{selection.path.suffix.removeprefix('.')}] {selection.path}") + + start_time = time.monotonic() + kernel_name, temp_venv_dir, temp_kernel_spec_dir = create_kernel(use_temporary_venv, requirements_file) + failures: list[Path] = [] + + try: + from nmp.testing.notebooks import execute_notebook + + for selection in selections: + run_path = selection.path + temp_md_path: Path | None = None + elapsed = time.monotonic() - start_time + if elapsed > TIMEOUT_SECONDS: + raise TimeoutError(f"Timeout running notebooks after {TIMEOUT_SECONDS} seconds") + + print(f"\nRunning 
{selection.path}...") + if selection.path.suffix == ".mdx": + temp_md_path = materialize_mdx_as_markdown(selection.path, resolve_repo_root()) + run_path = temp_md_path + output_path = run_path.with_suffix(".executed.ipynb") + try: + execute_notebook( + run_path, + language_filter=language, + kernel_name=kernel_name, + execution_timeout=execution_timeout, + ) + print(f"SUCCESS: {selection.path}") + except Exception as error: + print(f"FAILURE: {selection.path}") + print(f"Error: {error}") + failures.append(selection.path) + finally: + if not keep_temp_files and output_path.exists(): + output_path.unlink() + if not keep_temp_files and temp_md_path: + cleanup_selection_temp_files(temp_md_path) + temp_md_path.unlink(missing_ok=True) + finally: + cleanup_kernel(kernel_name, temp_venv_dir, temp_kernel_spec_dir) + + if failures: + print(f"\nFAILED: {len(failures)} notebook(s) failed.") + for failure in failures: + print(f" - {failure}") + return 1 + + print("\nSUCCESS: All notebook(s) ran successfully.") + return 0 + + +def main() -> int: + parser = argparse.ArgumentParser(description="Run Fern source notebooks with nemo-nb marker semantics.") + parser.add_argument("paths", nargs="+", type=Path, help="Fern .mdx page, source .ipynb/.md, or directory.") + parser.add_argument( + "--language", + choices=["all", "python", "shell"], + default="python", + help="Which cells to execute. 
Defaults to Python cells only.", + ) + parser.add_argument("--dry-run", action="store_true", help="List selected notebooks without executing them.") + parser.add_argument("--keep-temp-files", action="store_true", help="Keep generated .executed.ipynb files.") + parser.add_argument("--use-temporary-venv", action="store_true", help="Run notebooks in a temporary venv.") + parser.add_argument("--requirements", help="Requirements file to install when using --use-temporary-venv.") + parser.add_argument("--execution-timeout", type=int, default=None, help="Per-cell execution timeout in seconds.") + args = parser.parse_args() + + repo_root = resolve_repo_root() + selections: list[NotebookSelection] = [] + try: + selections = select_notebooks(args.paths, repo_root) + except Exception as error: + print(f"ERROR: {error}", file=sys.stderr) + return 2 + + if args.dry_run: + if not selections: + print("No testable notebooks found.") + return 0 + for selection in selections: + print(selection.path) + return 0 + + try: + return run_selected_notebooks( + selections=selections, + language=args.language, + keep_temp_files=args.keep_temp_files, + use_temporary_venv=args.use_temporary_venv, + requirements_file=args.requirements, + execution_timeout=args.execution_timeout, + ) + except KeyboardInterrupt: + return 130 + finally: + if not args.keep_temp_files: + for selection in selections: + cleanup_selection_temp_files(selection.path) + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/docs/fern/scripts/test_run_notebooks.py b/docs/fern/scripts/test_run_notebooks.py new file mode 100644 index 0000000000..708978f06c --- /dev/null +++ b/docs/fern/scripts/test_run_notebooks.py @@ -0,0 +1,134 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# SPDX-License-Identifier: Apache-2.0 + +import json +from pathlib import Path + +from docs.fern.scripts.run_notebooks import materialize_mdx_as_markdown, resolve_mdx_notebook, select_notebooks + + +def _write_notebook(path: Path, marker: str = "") -> None: + path.write_text( + json.dumps( + { + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [marker], + } + ], + "metadata": {}, + "nbformat": 4, + "nbformat_minor": 5, + } + ), + encoding="utf-8", + ) + + +def test_resolve_mdx_notebook_prefers_sibling(tmp_path: Path) -> None: + mdx = tmp_path / "tutorial.mdx" + notebook = tmp_path / "tutorial.ipynb" + mdx.write_text("# Tutorial\n", encoding="utf-8") + _write_notebook(notebook) + + assert resolve_mdx_notebook(mdx, tmp_path) == notebook + + +def test_resolve_mdx_notebook_uses_colab_link(tmp_path: Path) -> None: + repo_root = tmp_path + notebook = repo_root / "docs" / "customizer" / "tutorials" / "tutorial.ipynb" + notebook.parent.mkdir(parents=True) + _write_notebook(notebook) + mdx = repo_root / "page.mdx" + mdx.write_text( + "[Run in Google Colab](https://colab.research.google.com/github/NVIDIA-NeMo/nemo-platform/blob/main/docs/customizer/tutorials/tutorial.ipynb)\n", + encoding="utf-8", + ) + + assert resolve_mdx_notebook(mdx, repo_root) == notebook + + +def test_resolve_mdx_notebook_accepts_slash_branch_refs(tmp_path: Path) -> None: + repo_root = tmp_path + notebook = repo_root / "docs" / "customizer" / "tutorials" / "tutorial.ipynb" + notebook.parent.mkdir(parents=True) + _write_notebook(notebook) + + colab_mdx = repo_root / "colab.mdx" + colab_mdx.write_text( + "[Run in Google Colab](https://colab.research.google.com/github/NVIDIA-NeMo/nemo-platform/blob/release/2026.06/docs/customizer/tutorials/tutorial.ipynb)\n", + encoding="utf-8", + ) + fern_mdx = repo_root / "fern.mdx" + fern_mdx.write_text( + '\n', + encoding="utf-8", + ) + + assert resolve_mdx_notebook(colab_mdx, repo_root) == notebook + assert resolve_mdx_notebook(fern_mdx, 
repo_root) == notebook + + +def test_select_notebooks_skips_skip_test_marker(tmp_path: Path) -> None: + notebook = tmp_path / "skip.ipynb" + _write_notebook(notebook, "\n") + mdx = tmp_path / "skip.mdx" + mdx.write_text("# Skip\n", encoding="utf-8") + + assert select_notebooks([mdx], tmp_path) == [] + + +def test_select_notebooks_resolves_fern_mdx_source(tmp_path: Path) -> None: + notebook = tmp_path / "tutorial.ipynb" + _write_notebook(notebook) + mdx = tmp_path / "tutorial.mdx" + mdx.write_text("# Tutorial\n", encoding="utf-8") + + selections = select_notebooks([mdx], tmp_path) + + assert len(selections) == 1 + assert selections[0].path == notebook + assert selections[0].source == mdx + + +def test_select_notebooks_falls_back_to_mdx_snippets_without_notebook(tmp_path: Path) -> None: + mdx = tmp_path / "tutorial.mdx" + mdx.write_text( + "# Tutorial\n\n```python\nprint('hello')\n```\n", + encoding="utf-8", + ) + + selections = select_notebooks([mdx], tmp_path) + + assert len(selections) == 1 + assert selections[0].path == mdx + assert selections[0].source == mdx + + +def test_select_notebooks_skips_mdx_without_notebook_or_snippets(tmp_path: Path) -> None: + mdx = tmp_path / "tutorial.mdx" + mdx.write_text("# Tutorial\n\nNo executable snippets.\n", encoding="utf-8") + + assert select_notebooks([mdx], tmp_path) == [] + + +def test_materialize_mdx_as_markdown_expands_fern_markdown_snippets(tmp_path: Path) -> None: + snippet = tmp_path / "docs" / "fern" / "snippets" / "_snippets" / "setup.mdx" + snippet.parent.mkdir(parents=True) + snippet.write_text("```python\nprint('from snippet')\n```\n", encoding="utf-8") + mdx = tmp_path / "tutorial.mdx" + mdx.write_text( + '# Tutorial\n\n<Markdown src="/snippets/_snippets/setup.mdx" />\n', + encoding="utf-8", + ) + + temp_md = materialize_mdx_as_markdown(mdx, tmp_path) + + try: + assert temp_md.parent == mdx.parent + assert temp_md != mdx.with_suffix(".tmp.md") + assert temp_md.read_text(encoding="utf-8") == "# Tutorial\n\n```python\nprint('from snippet')\n```\n\n" + finally: 
+ temp_md.unlink(missing_ok=True) diff --git a/pytest.ini b/pytest.ini index a884ce87c2..2782cd20bf 100644 --- a/pytest.ini +++ b/pytest.ini @@ -47,6 +47,8 @@ testpaths = services/intake/tests services/platform-seed/tests services/safe-synthesizer-api/tests + docs/_scripts + docs/fern/scripts tests/integration tests/unit @@ -133,7 +135,6 @@ norecursedirs = notebooks dist build - docs .venv __pycache__ *.egg-info