diff --git a/__tests__/components/HTMLBlock.test.tsx b/__tests__/components/HTMLBlock.test.tsx index 71e1a9625..15c87e71a 100644 --- a/__tests__/components/HTMLBlock.test.tsx +++ b/__tests__/components/HTMLBlock.test.tsx @@ -55,9 +55,7 @@ describe('HTML Block', () => { expect(view.indexOf('
in atag - // Rendering looks correct, so skip this for now until we decide if we want to fix this or not - it.skip.each(renderingEngines)('%s: renders the html in a `
` tag if safeMode={true}', (_label, renderContent) => { + it.each(renderingEngines)('%s: renders the html in a `` tag if safeMode={true}', (_label, renderContent) => { const md = '{``} '; const Component = renderContent(md); expect(renderToStaticMarkup()).toBe( diff --git a/__tests__/lib/compile-sanitize.test.tsx b/__tests__/lib/compile-sanitize.test.tsx new file mode 100644 index 000000000..a68fcd2da --- /dev/null +++ b/__tests__/lib/compile-sanitize.test.tsx @@ -0,0 +1,54 @@ +import { render } from '@testing-library/react'; +import React from 'react'; + +import { execute } from '../helpers'; + +// `md` format sanitizes via rehype-sanitize's allow-list (covered in run.test.tsx). +// Default MDX keeps raw HTML as JSX nodes that allow-list never sees, so these assert +// the deny-list stripper removes the known script-execution vectors on that path. +describe('MDX (compile) sanitization', () => { + it('strips script-execution vectors in default MDX format', () => { + const md = [ + '# Docs', + '', + '', + '', + 'link', + '', + 'link', + '', + 'link', + '', + ' ', + '', + '', + ].join('\n'); + + const Component = execute(md, {}, {}); // no format => MDX + const { container } = render(
); + + expect(container.querySelector('script')).not.toBeInTheDocument(); + expect(container.querySelector('iframe')).not.toBeInTheDocument(); + + // The link text still renders, but no anchor carries a script-executing href. + const dangerousScheme = /^\s*(?:javascript|vbscript|data):/i; + const hrefs = [...container.querySelectorAll('a')].map(a => a.getAttribute('href')); + expect(hrefs.some(href => href !== null && dangerousScheme.test(href))).toBe(false); + expect(container.textContent).toContain('link'); + + // Image still renders, but the onerror handler is gone. + const image = container.querySelector('img'); + expect(image?.getAttribute('onerror')).toBeNull(); + }); + + it('strips the MathML namespace-confusion payload in default MDX format', () => { + const md = '# Docs\n\n'; + + const Component = execute(md, {}, {}); + const { container } = render( ); + + expect(container.querySelector('script')).not.toBeInTheDocument(); + expect(container.querySelector('math')).not.toBeInTheDocument(); + expect(container.querySelector('h1')).toBeInTheDocument(); + }); +}); diff --git a/__tests__/lib/mdxish/sanitize-raw-html.test.ts b/__tests__/lib/mdxish/sanitize-raw-html.test.ts new file mode 100644 index 000000000..8220661bd --- /dev/null +++ b/__tests__/lib/mdxish/sanitize-raw-html.test.ts @@ -0,0 +1,172 @@ +import type { RMDXModule } from '../../../types'; + +import { visit } from 'unist-util-visit'; + +import { mdxish } from '../../../lib'; +import { findAllElementsByTagName, findElementByTagName } from '../../helpers'; + +/** Collects every property key present on any element in the tree. */ +function allPropertyKeys(tree: ReturnType ): string[] { + const keys = new Set (); + visit(tree, 'element', node => { + Object.keys(node.properties ?? {}).forEach(key => keys.add(key)); + }); + return [...keys]; +} + +describe('mdxish raw HTML sanitization', () => { + describe('script execution vectors', () => { + it('strips the MathML namespace-confusion payload from the report', () => { + const tree = mdxish('# Docs\n\n\n'); + + expect(findElementByTagName(tree, 'script')).toBeNull(); + expect(findElementByTagName(tree, 'math')).toBeNull(); + expect(findElementByTagName(tree, 'mtext')).toBeNull(); + // The heading and surrounding structure survive. + expect(findElementByTagName(tree, 'h1')).not.toBeNull(); + }); + + it('strips scripts containing String.fromCharCode payload', () => { + const payload = + ''; + const tree = mdxish(`# Docs\n\n${payload}\n`); + + expect(findElementByTagName(tree, 'script')).toBeNull(); + expect(JSON.stringify(tree)).not.toContain('fromCharCode'); + }); + + it('strips a bare top-level '); + + expect(findElementByTagName(tree, 'script')).toBeNull(); + }); + + it('strips SVG foreign content carrying a script', () => { + const tree = mdxish(''); + + expect(findElementByTagName(tree, 'svg')).toBeNull(); + expect(findElementByTagName(tree, 'script')).toBeNull(); + }); + + it('strips embedders (iframe/object)', () => { + const tree = mdxish('\n\n'); + + expect(findElementByTagName(tree, 'iframe')).toBeNull(); + expect(findElementByTagName(tree, 'object')).toBeNull(); + }); + }); + + describe('attribute vectors', () => { + it('removes event-handler attributes but keeps the element', () => { + const tree = mdxish(' '); + + const img = findElementByTagName(tree, 'img'); + expect(img).not.toBeNull(); + expect(allPropertyKeys(tree)).not.toContain('onError'); + expect(img?.properties?.src).toBe('x.png'); + }); + + it('removes javascript: hrefs but keeps the anchor text', () => { + const tree = mdxish('click me'); + + const anchor = findElementByTagName(tree, 'a'); + expect(anchor).not.toBeNull(); + expect(anchor?.properties?.href).toBeUndefined(); + expect(JSON.stringify(tree)).toContain('click me'); + }); + + it('ignores whitespace/control-char obfuscated javascript: URLs', () => { + const tree = mdxish('x'); + + expect(findElementByTagName(tree, 'a')?.properties?.href).toBeUndefined(); + }); + }); + + describe('safe content is preserved', () => { + it('keeps benign formatting, links, and images', () => { + const tree = mdxish( + '
Bold and link\n\n', + ); + + expect(findElementByTagName(tree, 'strong')).not.toBeNull(); + expect(findElementByTagName(tree, 'a')?.properties?.href).toBe('https://example.com'); + expect(findElementByTagName(tree, 'img')?.properties?.src).toBe('https://example.com/a.png'); + }); + + it('keeps relative and mailto links', () => { + const tree = mdxish('a b'); + + const hrefs = findAllElementsByTagName(tree, 'a').map(node => node.properties?.href); + expect(hrefs).toStrictEqual(['/docs/start', 'mailto:x@y.com']); + }); + }); + + describe('custom components', () => { + const testComponents: Record
= { + TestComponent: {} as RMDXModule + } + + it('keeps event-handler-named props but strips dangerous URL props on PascalCase components', () => { + const tree = mdxish(' ', { + components: testComponents, + }); + + const component = findElementByTagName(tree, 'TestComponent'); + // `on*` props are React props on a component, not DOM handlers, so they survive... + expect(component?.properties?.onClick).toBe('fn'); + // ...but a `javascript:` URL prop is stripped: a component may forward it to a host element. + expect(component?.properties?.href).toBeUndefined(); + }); + + it('keeps safe URL props on PascalCase components', () => { + const tree = mdxish(' ', { + components: testComponents, + }); + + const component = findElementByTagName(tree, 'TestComponent'); + expect(component?.properties?.href).toBe('https://example.com'); + }); + + it('still sanitizes raw HTML nested inside a component', () => { + const tree = mdxish(' \n\n ', { + components: testComponents, + }); + + expect(allPropertyKeys(tree)).not.toContain('onError'); + expect(findElementByTagName(tree, 'img')).not.toBeNull(); + }); + }); + + describe('integration with other nodes', () => { + it('sanitizes raw HTML embedded inside a table cell', () => { + const tree = mdxish('| A | B |\n| --- | --- |\n|\n\n
| ok |'); + + expect(allPropertyKeys(tree)).not.toContain('onError'); + expect(findElementByTagName(tree, 'script')).toBeNull(); + }); + + it('sanitizes raw HTML nested inside a callout', () => { + const tree = mdxish('> 📘 Title\n>\n> body text'); + + expect(findElementByTagName(tree, 'script')).toBeNull(); + expect(JSON.stringify(tree)).toContain('body text'); + }); + + it('sanitizes raw HTML nested inside a JSX table cell', () => { + const tree = mdxish(` +
+ +
+ `); + + expect(findElementByTagName(tree, 'script')).toBeNull(); + expect(findElementByTagName(tree, 'table')).not.toBeNull(); + }); + }); +}); diff --git a/__tests__/processor/plugin/dangerous-html.test.ts b/__tests__/processor/plugin/dangerous-html.test.ts new file mode 100644 index 000000000..6453ee297 --- /dev/null +++ b/__tests__/processor/plugin/dangerous-html.test.ts @@ -0,0 +1,174 @@ +/* eslint-disable no-script-url -- the `javascript:`/`vbscript:` URLs are intentional XSS fixtures */ +import type { Element, Root } from 'hast'; +import type { MdxJsxFlowElementHast } from 'mdast-util-mdx-jsx'; + +import { stripDangerousHtml } from '../../../processor/plugin/dangerous-html'; + +const root = (...children: Root['children']): Root => ({ type: 'root', children }); + +const el = (tagName: string, properties: Element['properties'] = {}, children: Element['children'] = []): Element => ({ + type: 'element', + tagName, + properties, + children, +}); + +const jsx = (name: string | null, attributes: MdxJsxFlowElementHast['attributes'] = []): MdxJsxFlowElementHast => ({ + type: 'mdxJsxFlowElement', + name, + attributes, + children: [], +}); + +describe('stripDangerousHtml', () => { + describe('dangerous tag removal', () => { + it.each([ + 'script', + 'noscript', + 'template', + 'iframe', + 'frame', + 'frameset', + 'object', + 'applet', + 'base', + 'link', + 'meta', + 'svg', + 'math', + ])('removes <%s> and its subtree', tagName => { + const tree = root(el('p'), el(tagName, {}, [el('span')]), el('div')); + + stripDangerousHtml(tree); + + const tags = tree.children.map(child => (child.type === 'element' ? child.tagName : child.type)); + expect(tags).toStrictEqual(['p', 'div']); + }); + + // Lowercase-leading names are host elements (uppercase-leading ones are custom + // components), so the deny-set lookup lowercases to also catch e.g. `iFrame`. + it('matches lowercase-leading dangerous tags case-insensitively', () => { + const tree = root(el('iFrame')); + + stripDangerousHtml(tree); + + expect(tree.children).toHaveLength(0); + }); + + it('removes consecutive dangerous siblings', () => { + const tree = root(el('script'), el('iframe'), el('p')); + + stripDangerousHtml(tree); + + expect(tree.children).toHaveLength(1); + expect((tree.children[0] as Element).tagName).toBe('p'); + }); + }); + + describe('host element attribute cleaning', () => { + it('drops event-handler attributes', () => { + const node = el('img', { src: 'x', onError: 'steal()', onClick: 'go()' }); + stripDangerousHtml(root(node)); + + expect(node.properties).toStrictEqual({ src: 'x' }); + }); + + it('drops javascript: and vbscript: URLs on url-valued attributes', () => { + const node = el('a', { href: 'javascript:alert(1)' }); + const node2 = el('a', { href: 'vbscript:msgbox(1)' }); + stripDangerousHtml(root(node, node2)); + + expect(node.properties).toStrictEqual({}); + expect(node2.properties).toStrictEqual({}); + }); + + it('keeps safe URLs', () => { + const node = el('a', { href: 'https://example.com/javascript:not-a-scheme' }); + stripDangerousHtml(root(node)); + + expect(node.properties?.href).toBe('https://example.com/javascript:not-a-scheme'); + }); + + it('drops dangerous data: URLs but keeps benign ones', () => { + const danger = el('a', { href: 'data:text/html,' }); + const safe = el('img', { src: 'data:image/png;base64,iVBOR' }); + stripDangerousHtml(root(danger, safe)); + + expect(danger.properties).toStrictEqual({}); + expect(safe.properties?.src).toBe('data:image/png;base64,iVBOR'); + }); + + it('ignores control characters when resolving the scheme', () => { + const node = el('a', { href: 'java\tscript:alert(1)' }); + stripDangerousHtml(root(node)); + + expect(node.properties).toStrictEqual({}); + }); + + it('keeps a normal srcset (treated as a single URL, no javascript: scheme)', () => { + const node = el('img', { srcSet: 'a.png 1x, b.png 2x' }); + stripDangerousHtml(root(node)); + + expect(node.properties?.srcSet).toBe('a.png 1x, b.png 2x'); + }); + + it('normalizes attribute names so xlink:href / formaction are checked', () => { + const node = el('a', { xLinkHref: 'javascript:alert(1)', formAction: 'javascript:alert(1)' }); + stripDangerousHtml(root(node)); + + expect(node.properties).toStrictEqual({}); + }); + }); + + describe('MDX JSX nodes', () => { + it('drops event-handler and javascript: attributes on host JSX elements', () => { + const node = jsx('a', [ + { type: 'mdxJsxAttribute', name: 'onClick', value: 'go()' }, + { type: 'mdxJsxAttribute', name: 'href', value: 'javascript:alert(1)' }, + { type: 'mdxJsxAttribute', name: 'id', value: 'keep' }, + ]); + stripDangerousHtml(root(node)); + + expect(node.attributes).toStrictEqual([{ type: 'mdxJsxAttribute', name: 'id', value: 'keep' }]); + }); + + it('drops expression-valued URL attributes on host JSX elements', () => { + // `href={'javascript:alert(1)'}` / `formAction={...}` — a non-string value could still + // resolve to a dangerous scheme at render time, so it can't be trusted like a literal. + const node = jsx('a', [ + { type: 'mdxJsxAttribute', name: 'href', value: { type: 'mdxJsxAttributeValueExpression', value: "'javascript:alert(1)'", data: {} } }, + { type: 'mdxJsxAttribute', name: 'formAction', value: { type: 'mdxJsxAttributeValueExpression', value: 'dynamicUrl', data: {} } }, + { type: 'mdxJsxAttribute', name: 'id', value: 'keep' }, + ]); + stripDangerousHtml(root(node)); + + expect(node.attributes).toStrictEqual([{ type: 'mdxJsxAttribute', name: 'id', value: 'keep' }]); + }); + + it('keeps spread expression attributes untouched', () => { + const spread = { type: 'mdxJsxExpressionAttribute', value: '...{ onClick: handler }' } as const; + const node = jsx('div', [spread]); + stripDangerousHtml(root(node)); + + expect(node.attributes).toStrictEqual([spread]); + }); + + it('preserves PascalCase components and their on* props, but strips URL props and cleans children', () => { + const child = el('img', { onError: 'steal()' }); + const component = jsx('Callout', [ + { type: 'mdxJsxAttribute', name: 'onClick', value: 'props-not-a-handler' }, + { type: 'mdxJsxAttribute', name: 'href', value: 'javascript:alert(1)' }, + ]); + component.children = [child]; + stripDangerousHtml(root(component)); + + // The `on*` prop survives (React prop, not a DOM handler), but the dangerous URL prop + // is stripped since a component may forward it to a host element... + expect(component.attributes).toStrictEqual([ + { type: 'mdxJsxAttribute', name: 'onClick', value: 'props-not-a-handler' }, + ]); + // ...and the nested raw+ + ++ + +handler is stripped. + expect(child.properties).toStrictEqual({}); + }); + }); +}); diff --git a/lib/compile.ts b/lib/compile.ts index 1ae9c7fdb..5374f2b9a 100644 --- a/lib/compile.ts +++ b/lib/compile.ts @@ -10,6 +10,7 @@ import remarkFrontmatter from 'remark-frontmatter'; import remarkGfm from 'remark-gfm'; import MdxSyntaxError from '../errors/mdx-syntax-error'; +import rehypeStripDangerousHtml from '../processor/plugin/dangerous-html'; import { rehypeToc } from '../processor/plugin/toc'; import { defaultTransforms, @@ -39,7 +40,18 @@ const sanitizeSchema = deepmerge(defaultSchema, { const compile = ( text: string, - { components = {}, missingComponents, copyButtons, useTailwind, hardBreaks, ...opts }: CompileOpts = {}, + { + components = {}, + missingComponents, + copyButtons, + useTailwind, + hardBreaks, + // Pulled out of `...opts` so the sanitizer below is always appended last: a caller's + // `rehypePlugins`/`remarkPlugins` must not replace the pipeline (and drop the stripper). + remarkPlugins: userRemarkPlugins = [], + rehypePlugins: userRehypePlugins = [], + ...opts + }: CompileOpts = {}, ) => { // Destructure at runtime to avoid circular dependency issues const { codeTabsTransformer, ...transforms } = defaultTransforms; @@ -54,14 +66,16 @@ const compile = ( handleMissingComponents, { components, missingComponents: ['ignore', 'throw'].includes(missingComponents) ? missingComponents : 'ignore' }, ], - [validateMCPIntro], + [validateMCPIntro] ]; if (useTailwind) { remarkPlugins.push([tailwindTransformer, { components }]); } - const rehypePlugins: PluggableList = [...defaultRehypePlugins, [rehypeToc, { components }]]; + remarkPlugins.push(...userRemarkPlugins); + + const rehypePlugins: PluggableList = [...defaultRehypePlugins, [rehypeToc, { components }], ...userRehypePlugins]; if (opts.format === 'md') { /** @@ -79,6 +93,11 @@ const compile = ( }, ]); rehypePlugins.push([rehypeSanitize, sanitizeSchema]); + } else { + // MDX keeps raw HTML as JSX nodes the `rehypeSanitize` allow-list never sees, and + // custom components must survive an allow-list anyway, so fall back to a deny-list. + // Narrower guarantee than the `md` path — see `dangerous-html.ts`. + rehypePlugins.push(rehypeStripDangerousHtml); } try { diff --git a/lib/mdxish.ts b/lib/mdxish.ts index b4054b62c..d1ea704c9 100644 --- a/lib/mdxish.ts +++ b/lib/mdxish.ts @@ -20,6 +20,7 @@ import { unified } from 'unified'; import { VFile } from 'vfile'; import { mdxishCompilers } from '../processor/compile'; +import rehypeStripDangerousHtml from '../processor/plugin/dangerous-html'; import { rehypeFlattenTableCellParagraphs } from '../processor/plugin/flatten-table-cell-paragraphs'; import { rehypeMdxishComponents } from '../processor/plugin/mdxish-components'; import { mdxComponentHandlers } from '../processor/plugin/mdxish-handlers'; @@ -286,6 +287,7 @@ export function mdxish(mdContent: string, opts: MdxishOpts = {}): Root { .use(restoreBooleanProperties) .use(safeMode ? undefined : resolveDeferredAttributeExpressionProps) // Evaluate deferred attribute expressions on mdx-jsx nodes (now past rehypeRaw's clone) .use(normalizeMdxJsxNodes) // Rewrite `mdx-jsx` back to standard `element` nodes for downstream plugins + .use(rehypeStripDangerousHtml) // Strip script/foreign-content/event-handler XSS vectors from raw HTML .use(rehypeFlattenTableCellParagraphs) // Remove
wrappers inside table cells to prevent margin issues .use(mdxishMermaidTransformer) // Add mermaid-render className to pre wrappers .use(generateSlugForHeadings) diff --git a/processor/plugin/dangerous-html.ts b/processor/plugin/dangerous-html.ts new file mode 100644 index 000000000..8086e4540 --- /dev/null +++ b/processor/plugin/dangerous-html.ts @@ -0,0 +1,194 @@ +import type { Root, Element } from 'hast'; +import type { MdxJsxAttribute } from 'mdast-util-mdx-jsx'; +import type { Node, Parent } from 'unist'; + +import { SKIP, visit } from 'unist-util-visit'; + +// A deny-list, deliberately weaker than the `md` pipeline's `rehype-sanitize` +// allow-list: those pipelines preserve arbitrary custom components, which an +// allow-list can't express. Anything not enumerated below passes through, so +// CSS exfiltration via `style` and similar lower-severity vectors are out of scope. + +/** + * Elements removed wholesale (subtree included) because they execute script, + * load remote resources, or open a foreign-content (MathML/SVG) parsing context + * that lets `