readmeio · eaglethrost · Jun 26, 2026 · Jun 26, 2026 · Jun 26, 2026 · Jun 29, 2026
diff --git a/__tests__/lib/compile-sanitize.test.tsx b/__tests__/lib/compile-sanitize.test.tsx
@@ -0,0 +1,52 @@
+import { render } from '@testing-library/react';
+import React from 'react';
+
+import { execute } from '../helpers';
+
+/**
+ * The `md` format sanitizes via rehype-sanitize (covered in run.test.tsx). The default
+ * MDX format keeps raw HTML as JSX nodes that the schema never sees, so these tests
+ * assert that the shared dangerous-HTML stripper closes that path too.
+ */
+describe('MDX (compile) sanitization', () => {
+  it('strips script-execution vectors in default MDX format', () => {
+    const md = [
+      '# Docs',
+      '',
+      '<script>window.__xss = 1</script>',
+      '',
+      '<a href="javascript:alert(1)">link</a>',
+      '',
+      '<img src="x" onerror="window.__xss = 1" />',
+      '',
+      '<iframe src="javascript:alert(1)"></iframe>',
+    ].join('\n');
+
+    const Component = execute(md, {}, {}); // no format => MDX
+    const { container } = render(<Component />);
+
+    // Elements that are removed wholesale.
+    expect(container.querySelector('script')).not.toBeInTheDocument();
+    expect(container.querySelector('iframe')).not.toBeInTheDocument();
+
+    // The link text still renders, but no anchor carries a javascript: href.
+    const hrefs = [...container.querySelectorAll('a')].map(a => a.getAttribute('href'));
+    // eslint-disable-next-line no-script-url
+    expect(hrefs.some(href => href?.startsWith('javascript:'))).toBe(false);
+    expect(container.textContent).toContain('link');
+
+    // Image still renders, but the onerror handler is gone. Presence is asserted
+    // explicitly so a missing <img> fails on its own line instead of surfacing as
+    // `undefined` failing the `toBeNull()` check below.
+    const image = container.querySelector('img');
+    expect(image).toBeInTheDocument();
+    expect(image?.getAttribute('onerror')).toBeNull();
+  });
+
+  it('strips the MathML namespace-confusion payload in default MDX format', () => {
+    const md = '# Docs\n\n<math><mtext><script>window.__xss = 1</script></mtext></math>';
+
+    const Component = execute(md, {}, {});
+    const { container } = render(<Component />);
+
+    // The whole foreign-content subtree goes away; the surrounding heading stays.
+    expect(container.querySelector('script')).not.toBeInTheDocument();
+    expect(container.querySelector('math')).not.toBeInTheDocument();
+    expect(container.querySelector('h1')).toBeInTheDocument();
+  });
+});
diff --git a/__tests__/lib/mdxish/sanitize-raw-html.test.ts b/__tests__/lib/mdxish/sanitize-raw-html.test.ts
@@ -0,0 +1,145 @@
+import type { RMDXModule } from '../../../types';
+
+import { visit } from 'unist-util-visit';
+
+import { mdxish } from '../../../lib';
+import { findAllElementsByTagName, findElementByTagName } from '../../helpers';
+
+/** Gathers the distinct property names found on every `element` node of the tree. */
+function allPropertyKeys(tree: ReturnType<typeof mdxish>): string[] {
+  const seen = new Set<string>();
+  visit(tree, 'element', element => {
+    const names = Object.keys(element.properties ?? {});
+    names.forEach(name => seen.add(name));
+  });
+  return Array.from(seen);
+}
+
+describe('mdxish raw HTML sanitization', () => {
+  describe('script execution vectors', () => {
+    it('strips the MathML namespace-confusion payload from the report', () => {
+      const source = '# Docs\n\n<math><mtext><script>window.__xssfired=1</script></mtext></math>\n';
+      const hast = mdxish(source);
+
+      // Neither the script nor its MathML wrappers may remain...
+      ['script', 'math', 'mtext'].forEach(tagName => {
+        expect(findElementByTagName(hast, tagName)).toBeNull();
+      });
+      // ...while the heading next to the payload is left alone.
+      expect(findElementByTagName(hast, 'h1')).not.toBeNull();
+    });
+
+    it('strips the exact String.fromCharCode exfil payload from the report', () => {
+      const script = '<script>fetch(String.fromCharCode(47,97,112,105)).then(function(r){return r.text()})</script>';
+      const hast = mdxish(`# Docs\n\n<math><mtext>${script}</mtext></math>\n`);
+      const serialized = JSON.stringify(hast);
+
+      expect(findElementByTagName(hast, 'script')).toBeNull();
+      // No trace of the script body is left anywhere in the tree.
+      expect(serialized).not.toContain('fromCharCode');
+    });
+
+    it('strips a bare top-level <script>', () => {
+      const hast = mdxish('<script>alert(1)</script>');
+
+      expect(findElementByTagName(hast, 'script')).toBeNull();
+    });
+
+    it('strips SVG foreign content carrying a script', () => {
+      const source = '<svg><foreignObject><script>alert(1)</script></foreignObject></svg>';
+      const hast = mdxish(source);
+
+      ['svg', 'script'].forEach(tagName => {
+        expect(findElementByTagName(hast, tagName)).toBeNull();
+      });
+    });
+
+    it('strips dangerous embedders (iframe/object)', () => {
+      const blocks = ['<iframe src="javascript:alert(1)"></iframe>', '<object data="x"></object>'];
+      const hast = mdxish(blocks.join('\n\n'));
+
+      ['iframe', 'object'].forEach(tagName => {
+        expect(findElementByTagName(hast, tagName)).toBeNull();
+      });
+    });
+  });
+
+  describe('attribute vectors', () => {
+    it('removes event-handler attributes but keeps the element', () => {
+      const tree = mdxish('<img src="x.png" onerror="alert(1)" alt="ok">');
+
+      const img = findElementByTagName(tree, 'img');
+      expect(img).not.toBeNull();
+      // `onError` is the hast property key this suite uses for the `onerror` attribute.
+      expect(allPropertyKeys(tree)).not.toContain('onError');
+      expect(img?.properties?.src).toBe('x.png');
+    });
+
+    it('removes javascript: hrefs but keeps the anchor text', () => {
+      const tree = mdxish('<a href="javascript:alert(1)">click me</a>');
+
+      const anchor = findElementByTagName(tree, 'a');
+      expect(anchor).not.toBeNull();
+      expect(anchor?.properties?.href).toBeUndefined();
+      expect(JSON.stringify(tree)).toContain('click me');
+    });
+
+    // `\t` puts a literal tab inside the scheme. The href must still be removed, so
+    // the title says "removes" like its sibling tests rather than "ignores".
+    it('removes whitespace/control-char obfuscated javascript: hrefs', () => {
+      const tree = mdxish('<a href="java\tscript:alert(1)">x</a>');
+
+      expect(findElementByTagName(tree, 'a')?.properties?.href).toBeUndefined();
+    });
+  });
+
+  describe('safe content is preserved', () => {
+    it('keeps benign formatting, links, and images', () => {
+      const note = '<div class="note"><strong>Bold</strong> and <a href="https://example.com">link</a></div>';
+      const picture = '<img src="https://example.com/a.png" alt="ok">';
+      const hast = mdxish(`${note}\n\n${picture}`);
+
+      const link = findElementByTagName(hast, 'a');
+      const image = findElementByTagName(hast, 'img');
+
+      expect(findElementByTagName(hast, 'strong')).not.toBeNull();
+      expect(link?.properties?.href).toBe('https://example.com');
+      expect(image?.properties?.src).toBe('https://example.com/a.png');
+    });
+
+    it('keeps relative and mailto links', () => {
+      const hast = mdxish('<a href="/docs/start">a</a> <a href="mailto:x@y.com">b</a>');
+
+      // Neither a relative path nor a mailto: scheme counts as dangerous.
+      const anchors = findAllElementsByTagName(hast, 'a');
+      expect(anchors.map(anchor => anchor.properties?.href)).toStrictEqual(['/docs/start', 'mailto:x@y.com']);
+    });
+  });
+
+  describe('custom components', () => {
+    // Minimal registry: only the component name matters for these tests.
+    const testComponents: Record<string, RMDXModule> = {
+      TestComponent: {} as RMDXModule,
+    };
+
+    it('preserves event-handler-named props on PascalCase components', () => {
+      const tree = mdxish('<TestComponent onClick="fn" href="javascript:alert(1)" />', {
+        components: testComponents,
+      });
+
+      // Props on a PascalCase component are left untouched by the sanitizer.
+      const component = findElementByTagName(tree, 'TestComponent');
+      expect(component?.properties?.onClick).toBe('fn');
+      // eslint-disable-next-line no-script-url
+      expect(component?.properties?.href).toBe('javascript:alert(1)');
+    });
+
+    it('still sanitizes raw HTML nested inside a component', () => {
+      const tree = mdxish('<TestComponent>\n\n<img src="x" onerror="alert(1)">\n\n</TestComponent>', {
+        components: testComponents,
+      });
+
+      // The host <img> inside the component loses its handler but stays in the tree.
+      expect(allPropertyKeys(tree)).not.toContain('onError');
+      expect(findElementByTagName(tree, 'img')).not.toBeNull();
+    });
+  });
+
+  describe('integration with other nodes', () => {
+    it('sanitizes raw HTML embedded inside a table cell', () => {
+      const rows = ['| A | B |', '| --- | --- |', '| <img src=x onerror=alert(1)> | ok |'];
+      const hast = mdxish(rows.join('\n'));
+
+      expect(allPropertyKeys(hast)).not.toContain('onError');
+      expect(findElementByTagName(hast, 'script')).toBeNull();
+    });
+
+    it('sanitizes raw HTML nested inside a callout', () => {
+      const hast = mdxish('> 📘 Title\n>\n> <script>alert(1)</script> body text');
+      const serialized = JSON.stringify(hast);
+
+      // The script is dropped; the text that followed it in the callout remains.
+      expect(findElementByTagName(hast, 'script')).toBeNull();
+      expect(serialized).toContain('body text');
+    });
+  });
+});
diff --git a/lib/compile.ts b/lib/compile.ts
@@ -11,6 +11,7 @@ import remarkGfm from 'remark-gfm';
 
 import MdxSyntaxError from '../errors/mdx-syntax-error';
 import { rehypeToc } from '../processor/plugin/toc';
+import rehypeStripDangerousHtml from '../processor/sanitize/rehype-strip-dangerous-html';
 import {
   defaultTransforms,
   tailwindTransformer,
@@ -81,6 +82,10 @@ const compile = (
     rehypePlugins.push([rehypeSanitize, sanitizeSchema]);
   }
 
+  // MDX (non-`md`) content keeps raw HTML as JSX nodes that the schema above never
+  // sees, so strip script-execution vectors regardless of format.
+  rehypePlugins.push(rehypeStripDangerousHtml);
+
   try {
     const vfile = mdxCompileSync(text, {
       outputFormat: 'function-body',

diff --git a/lib/mdxish.ts b/lib/mdxish.ts
@@ -23,6 +23,7 @@ import { mdxishCompilers } from '../processor/compile';
 import { rehypeFlattenTableCellParagraphs } from '../processor/plugin/flatten-table-cell-paragraphs';
 import { rehypeMdxishComponents } from '../processor/plugin/mdxish-components';
 import { mdxComponentHandlers } from '../processor/plugin/mdxish-handlers';
+import rehypeStripDangerousHtml from '../processor/sanitize/rehype-strip-dangerous-html';
 import calloutTransformer from '../processor/transform/callouts';
 import codeTabsTransformer from '../processor/transform/code-tabs';
 import embedTransformer from '../processor/transform/embeds';
@@ -286,6 +287,7 @@ export function mdxish(mdContent: string, opts: MdxishOpts = {}): Root {
     .use(restoreBooleanProperties)
     .use(safeMode ? undefined : resolveDeferredAttributeExpressionProps) // Evaluate deferred attribute expressions on mdx-jsx nodes (now past rehypeRaw's clone)
     .use(normalizeMdxJsxNodes) // Rewrite `mdx-jsx` back to standard `element` nodes for downstream plugins
+    .use(rehypeStripDangerousHtml) // Strip script/foreign-content/event-handler XSS vectors from raw HTML
     .use(rehypeFlattenTableCellParagraphs) // Remove <p> wrappers inside table cells to prevent margin issues
     .use(mdxishMermaidTransformer) // Add mermaid-render className to pre wrappers
     .use(generateSlugForHeadings)

diff --git a/processor/sanitize/dangerous-html.ts b/processor/sanitize/dangerous-html.ts
@@ -0,0 +1,153 @@
+import type { Root } from 'hast';
+
+/**
+ * Elements removed wholesale (subtree included) because they execute script,
+ * load remote resources, or open a foreign-content (MathML/SVG) parsing context
+ * that lets `<script>` survive namespace-confusion bypasses.
+ *
+ * Names are stored lowercase; callers lowercase the tag name before the lookup.
+ * `embed` is listed next to `object`/`applet`/`iframe` because it embeds external
+ * content in the same way.
+ */
+const DANGEROUS_TAG_NAMES = new Set([
+  'script',
+  'noscript',
+  'style',
+  'template',
+  'iframe',
+  'frame',
+  'frameset',
+  'object',
+  'embed',
+  'applet',
+  'base',
+  'link',
+  'meta',
+  'svg',
+  'math',
+]);
+
+/**
+ * Names of attributes whose value is a URL and may therefore hold a `javascript:`
+ * payload. Entries are kept in normalized form (lowercase, letters only), so a hast
+ * property such as `xLinkHref`/`formAction` and a raw JSX attribute such as
+ * `xlink:href`/`formaction` resolve to the same entry once the looked-up name has
+ * been normalized the same way.
+ */
+const URL_ATTRIBUTE_NAMES: readonly string[] = [
+  'href',
+  'src',
+  'srcset',
+  'xlinkhref',
+  'action',
+  'formaction',
+  'poster',
+  'background',
+  'cite',
+  'data',
+  'ping',
+  'longdesc',
+  'manifest',
+];
+
+const URL_ATTRIBUTES = new Set(URL_ATTRIBUTE_NAMES);
+
+// Matches any attribute/property name that begins with `on`, in any letter case
+// (e.g. `onerror`, `onClick`).
+const EVENT_HANDLER_ATTRIBUTE = /^on/i;
+
+// Control characters and spaces (U+0000 through U+0020) that HTML strips before
+// resolving a URL scheme; keeping them would let `java\tscript:` slip past the
+// protocol check below. The `g` flag makes `replace` remove every occurrence.
+// eslint-disable-next-line no-control-regex
+const IGNORED_URL_CHARS = /[\u0000-\u0020]/g;
+
+/** Canonical lookup form of an attribute name: lowercased, then every character outside `a-z` removed. */
+const normalizeAttributeName = (name: string): string => {
+  const lowered = name.toLowerCase();
+  return lowered.replace(/[^a-z]/g, '');
+};
+
+/** Whether the attribute name starts with `on` in any letter case (`onerror`, `onClick`, ...). */
+const isEventHandlerAttribute = (name: string): boolean => name.search(EVENT_HANDLER_ATTRIBUTE) !== -1;
+
+/** Whether the name, once normalized, is one of the known URL-valued attributes. */
+const isUrlAttribute = (name: string): boolean => {
+  const canonicalName = normalizeAttributeName(name);
+  return URL_ATTRIBUTES.has(canonicalName);
+};
+
+// A leading uppercase ASCII letter marks a custom React component (e.g. `<Callout>`)
+// rather than a host element, so its `on*`/url-like values are component props,
+// not DOM handlers.
+const isComponentName = (name: string): boolean => {
+  const leadingUppercase = /^[A-Z]/;
+  return leadingUppercase.test(name);
+};
+
+/**
+ * Reports whether a URL value would execute when navigated to or loaded.
+ *
+ * Non-string values are never flagged. The value is compared after removing the
+ * ignorable control/space characters and lowercasing. Only a leading scheme
+ * counts: when a `/`, `?`, or `#` precedes the first colon, that colon belongs to
+ * a relative path and the value is treated as safe.
+ *
+ * @param value - Raw attribute/property value of any type.
+ * @returns `true` for `javascript:`/`vbscript:` URLs and for `data:` URLs whose
+ *   media-type section (the text before the first comma) mentions html, xml,
+ *   script, or svg; `false` otherwise.
+ */
+const isDangerousUrl = (value: unknown): boolean => {
+  if (typeof value !== 'string') return false;
+
+  const compact = value.replace(IGNORED_URL_CHARS, '').toLowerCase();
+  const schemeEnd = compact.indexOf(':');
+  const firstDelimiter = compact.search(/[/?#]/);
+
+  // A scheme exists only if the colon comes before any path/query/fragment delimiter.
+  const hasLeadingScheme = schemeEnd !== -1 && (firstDelimiter === -1 || firstDelimiter > schemeEnd);
+  if (!hasLeadingScheme) return false;
+
+  switch (compact.slice(0, schemeEnd)) {
+    case 'javascript':
+    case 'vbscript':
+      return true;
+    case 'data':
+      // `data:` is only dangerous when it can render markup/script (e.g. data:text/html).
+      return /^[^,]*(?:html|xml|script|svg)/.test(compact.slice(schemeEnd + 1));
+    default:
+      return false;
+  }
+};
+
+// Two node shapes reach the sanitizer. MDX keeps JSX (including raw HTML) as
+// `mdxJsxFlowElement`/`mdxJsxTextElement` nodes carrying a `name` plus an
+// `attributes` array, whereas the mdxish/`md` pipelines produce hast `element`
+// nodes carrying `tagName` plus `properties`. Typing both loosely lets one walker
+// sanitize either shape so every engine reaches parity.
+
+/** One entry of an MDX JSX node's `attributes` array. */
+interface SanitizableAttribute {
+  name?: string | null;
+  type: string;
+  value?: unknown;
+}
+
+/** Loose union of the hast `element` and MDX JSX node shapes. */
+interface SanitizableNode {
+  attributes?: SanitizableAttribute[];
+  children?: SanitizableNode[];
+  name?: string | null;
+  properties?: Record<string, unknown> | null;
+  tagName?: string;
+  type: string;
+}
+
+/**
+ * Resolves the element/component name for either node shape: `tagName` for hast
+ * elements, `name` for MDX JSX elements. Returns null for every other node type
+ * (text, root, ...) and for nodes that carry no usable name.
+ */
+const elementName = (node: SanitizableNode): string | null => {
+  switch (node.type) {
+    case 'element':
+      return typeof node.tagName === 'string' ? node.tagName : null;
+    case 'mdxJsxFlowElement':
+    case 'mdxJsxTextElement':
+      return node.name ?? null;
+    default:
+      return null;
+  }
+};
+
+/**
+ * Strips script-bearing attributes from a single host element, in place: every
+ * event-handler attribute, plus every URL attribute whose value is dangerous.
+ * Covers both storage shapes, hast `properties` and the MDX JSX `attributes` array.
+ */
+const cleanHostElement = (node: SanitizableNode): void => {
+  // hast shape: delete the offending keys from the properties object.
+  const { properties: props } = node;
+  if (props) {
+    Object.keys(props)
+      .filter(key => isEventHandlerAttribute(key) || (isUrlAttribute(key) && isDangerousUrl(props[key])))
+      .forEach(key => {
+        delete props[key];
+      });
+  }
+
+  // MDX JSX shape: rebuild the attribute list without the offending entries.
+  if (node.attributes) {
+    node.attributes = node.attributes.filter(({ name, type, value }) => {
+      // Anything that is not a plainly named attribute (e.g. `{...spread}`) is kept.
+      if (type !== 'mdxJsxAttribute' || typeof name !== 'string') return true;
+
+      const isUnsafe = isEventHandlerAttribute(name) || (isUrlAttribute(name) && isDangerousUrl(value));
+      return !isUnsafe;
+    });
+  }
+};
+
+/**
+ * Walks the descendants of `node`, mutating the tree in place: dangerous host
+ * elements are removed together with their subtree, and the remaining host
+ * elements have their script-bearing attributes stripped. Custom components are
+ * left as they are, but the walk still descends into them so raw HTML nested
+ * inside gets cleaned. Children are visited last-to-first so that removing one
+ * never shifts an index that is still to be visited.
+ */
+const cleanChildren = (node: SanitizableNode): void => {
+  const { children } = node;
+  if (!children) return;
+
+  let cursor = children.length;
+  while (cursor > 0) {
+    cursor -= 1;
+    const current = children[cursor];
+    const tag = elementName(current);
+
+    if (tag === null || isComponentName(tag)) {
+      // Not a host element: nothing to strip here, only descend.
+      cleanChildren(current);
+    } else if (DANGEROUS_TAG_NAMES.has(tag.toLowerCase())) {
+      // Dangerous host element: drop it and everything beneath it.
+      children.splice(cursor, 1);
+    } else {
+      cleanHostElement(current);
+      cleanChildren(current);
+    }
+  }
+};
+
+/**
+ * Entry point of the sanitizer. Mutates the given HAST/MDX tree in place, removing
+ * script-execution vectors: `<script>`, MathML/SVG foreign content, event-handler
+ * attributes, and `javascript:`/`vbscript:` URLs. Both hast `element` nodes and MDX
+ * JSX nodes are handled. PascalCase custom components keep their props (React
+ * props, not DOM handlers); only host elements are sanitized.
+ *
+ * @param tree - Root of the tree to sanitize; its descendants are modified in place.
+ */
+export const stripDangerousHtml = (tree: Root): void => {
+  cleanChildren(tree);
+};