Skip to content
Open
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
52 changes: 52 additions & 0 deletions __tests__/lib/compile-sanitize.test.tsx
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
import { render } from '@testing-library/react';
import React from 'react';

import { execute } from '../helpers';

/**
* The `md` format sanitizes via rehype-sanitize (covered in run.test.tsx). The default
* MDX format keeps raw HTML as JSX nodes that the sanitize schema never sees, so these
* tests assert that the shared dangerous-HTML stripper closes that path too.
*/
describe('MDX (compile) sanitization', () => {
it('strips script-execution vectors in default MDX format', () => {
  // One document carrying each vector: inline script, script-URL anchor,
  // event-handler attribute, and a script-URL iframe.
  const md = [
    '# Docs',
    '',
    '<script>window.__xss = 1</script>',
    '',
    '<a href="javascript:alert(1)">link</a>',
    '',
    '<img src="x" onerror="window.__xss = 1" />',
    '',
    '<iframe src="javascript:alert(1)"></iframe>',
  ].join('\n');

  const Component = execute(md, {}, {}); // no format => MDX
  const { container } = render(<Component />);

  expect(container.querySelector('script')).not.toBeInTheDocument();
  expect(container.querySelector('iframe')).not.toBeInTheDocument();

  // The link text still renders, but no anchor carries a script-capable href.
  // All three script-capable schemes are checked (not just `javascript:`) so the
  // assertion is a complete scheme check; hrefs are trimmed and lowercased first
  // so padding or mixed case cannot hide a scheme from `startsWith`.
  // eslint-disable-next-line no-script-url
  const scriptCapableSchemes = ['javascript:', 'data:', 'vbscript:'];
  const hrefs = [...container.querySelectorAll('a')].map(a => a.getAttribute('href'));
  const hasScriptCapableHref = hrefs.some(href => {
    const normalized = (href ?? '').trim().toLowerCase();
    return scriptCapableSchemes.some(scheme => normalized.startsWith(scheme));
  });
  expect(hasScriptCapableHref).toBe(false);
  expect(container.textContent).toContain('link');

  // Image still renders, but the onerror handler is gone.
  const image = container.querySelector('img');
  expect(image?.getAttribute('onerror')).toBeNull();
});

it('strips the MathML namespace-confusion payload in default MDX format', () => {
  // A <script> wrapped in MathML foreign content, preceded by a normal heading.
  const source = '# Docs\n\n<math><mtext><script>window.__xss = 1</script></mtext></math>';

  const Rendered = execute(source, {}, {});
  const rendered = render(<Rendered />);
  const root = rendered.container;

  // The payload and its MathML wrapper are gone...
  expect(root.querySelector('script')).not.toBeInTheDocument();
  expect(root.querySelector('math')).not.toBeInTheDocument();
  // ...while the surrounding heading still renders.
  expect(root.querySelector('h1')).toBeInTheDocument();
});
});
145 changes: 145 additions & 0 deletions __tests__/lib/mdxish/sanitize-raw-html.test.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,145 @@
import type { RMDXModule } from '../../../types';

import { visit } from 'unist-util-visit';

import { mdxish } from '../../../lib';
import { findAllElementsByTagName, findElementByTagName } from '../../helpers';

/**
 * Gathers the distinct property names used by `element` nodes anywhere in the tree.
 *
 * @param tree - The tree returned by `mdxish`.
 * @returns Each property key once, in first-seen order.
 */
function allPropertyKeys(tree: ReturnType<typeof mdxish>): string[] {
  const seen = new Set<string>();

  visit(tree, 'element', element => {
    const { properties } = element;
    // Elements without a properties bag contribute nothing.
    if (properties) {
      Object.keys(properties).forEach(propertyName => {
        seen.add(propertyName);
      });
    }
  });

  return Array.from(seen);
}

describe('mdxish raw HTML sanitization', () => {
describe('script execution vectors', () => {
  it('strips the MathML namespace-confusion payload from the report', () => {
    const markdown = '# Docs\n\n<math><mtext><script>window.__xssfired=1</script></mtext></math>\n';
    const hast = mdxish(markdown);

    // Neither the script nor its MathML wrappers may remain.
    expect(findElementByTagName(hast, 'script')).toBeNull();
    expect(findElementByTagName(hast, 'math')).toBeNull();
    expect(findElementByTagName(hast, 'mtext')).toBeNull();
    // Content around the payload is left in place.
    expect(findElementByTagName(hast, 'h1')).not.toBeNull();
  });

  it('strips the exact String.fromCharCode exfil payload from the report', () => {
    const exfilPayload =
      '<math><mtext><script>fetch(String.fromCharCode(47,97,112,105)).then(function(r){return r.text()})</script></mtext></math>';
    const hast = mdxish(`# Docs\n\n${exfilPayload}\n`);

    expect(findElementByTagName(hast, 'script')).toBeNull();
    // The script body must not survive anywhere in the serialized tree either.
    expect(JSON.stringify(hast)).not.toContain('fromCharCode');
  });

  it('strips a bare top-level <script>', () => {
    const hast = mdxish('<script>alert(1)</script>');

    expect(findElementByTagName(hast, 'script')).toBeNull();
  });

  it('strips SVG foreign content carrying a script', () => {
    const markdown = '<svg><foreignObject><script>alert(1)</script></foreignObject></svg>';
    const hast = mdxish(markdown);

    expect(findElementByTagName(hast, 'svg')).toBeNull();
    expect(findElementByTagName(hast, 'script')).toBeNull();
  });

  it('strips dangerous embedders (iframe/object)', () => {
    const markdown = '<iframe src="javascript:alert(1)"></iframe>\n\n<object data="x"></object>';
    const hast = mdxish(markdown);

    expect(findElementByTagName(hast, 'iframe')).toBeNull();
    expect(findElementByTagName(hast, 'object')).toBeNull();
  });
});

describe('attribute vectors', () => {
  it('removes event-handler attributes but keeps the element', () => {
    const hast = mdxish('<img src="x.png" onerror="alert(1)" alt="ok">');
    const image = findElementByTagName(hast, 'img');

    // The element stays; only the handler property is dropped.
    expect(image).not.toBeNull();
    expect(allPropertyKeys(hast)).not.toContain('onError');
    expect(image?.properties?.src).toBe('x.png');
  });

  it('removes javascript: hrefs but keeps the anchor text', () => {
    const hast = mdxish('<a href="javascript:alert(1)">click me</a>');
    const link = findElementByTagName(hast, 'a');

    expect(link).not.toBeNull();
    expect(link?.properties?.href).toBeUndefined();
    expect(JSON.stringify(hast)).toContain('click me');
  });

  it('ignores whitespace/control-char obfuscated javascript: URLs', () => {
    // A tab splits the scheme name inside the href.
    const hast = mdxish('<a href="java\tscript:alert(1)">x</a>');
    const link = findElementByTagName(hast, 'a');

    expect(link?.properties?.href).toBeUndefined();
  });
});

describe('safe content is preserved', () => {
  it('keeps benign formatting, links, and images', () => {
    const markdown =
      '<div class="note"><strong>Bold</strong> and <a href="https://example.com">link</a></div>\n\n<img src="https://example.com/a.png" alt="ok">';
    const hast = mdxish(markdown);

    const link = findElementByTagName(hast, 'a');
    const image = findElementByTagName(hast, 'img');

    expect(findElementByTagName(hast, 'strong')).not.toBeNull();
    expect(link?.properties?.href).toBe('https://example.com');
    expect(image?.properties?.src).toBe('https://example.com/a.png');
  });

  it('keeps relative and mailto links', () => {
    const hast = mdxish('<a href="/docs/start">a</a> <a href="mailto:x@y.com">b</a>');

    // Both anchors keep their hrefs, in document order.
    const links = findAllElementsByTagName(hast, 'a');
    const linkTargets = links.map(link => link.properties?.href);
    expect(linkTargets).toStrictEqual(['/docs/start', 'mailto:x@y.com']);
  });
});

describe('custom components', () => {
// Component registry passed to `mdxish` by the tests in this suite; the module
// value is an empty stand-in.
const testComponents: Record<string, RMDXModule> = {
  TestComponent: {} as RMDXModule,
};

it('preserves event-handler-named props on PascalCase components', () => {
  const markdown = '<TestComponent onClick="fn" href="javascript:alert(1)" />';
  const hast = mdxish(markdown, { components: testComponents });

  // On a custom component both values are expected to pass through unchanged.
  const props = findElementByTagName(hast, 'TestComponent')?.properties;
  expect(props?.onClick).toBe('fn');
  // eslint-disable-next-line no-script-url
  expect(props?.href).toBe('javascript:alert(1)');
});
Comment thread
coderabbitai[bot] marked this conversation as resolved.
Outdated

it('still sanitizes raw HTML nested inside a component', () => {
  const markdown = '<TestComponent>\n\n<img src="x" onerror="alert(1)">\n\n</TestComponent>';
  const hast = mdxish(markdown, { components: testComponents });

  // The nested image survives, minus its event handler.
  expect(allPropertyKeys(hast)).not.toContain('onError');
  expect(findElementByTagName(hast, 'img')).not.toBeNull();
});
});

describe('integration with other nodes', () => {
  it('sanitizes raw HTML embedded inside a table cell', () => {
    const tableMarkdown = '| A | B |\n| --- | --- |\n| <img src=x onerror=alert(1)> | ok |';
    const hast = mdxish(tableMarkdown);

    expect(allPropertyKeys(hast)).not.toContain('onError');
    expect(findElementByTagName(hast, 'script')).toBeNull();
  });

  it('sanitizes raw HTML nested inside a callout', () => {
    const calloutMarkdown = '> 📘 Title\n>\n> <script>alert(1)</script> body text';
    const hast = mdxish(calloutMarkdown);

    // The script is removed while the text after it is kept.
    expect(findElementByTagName(hast, 'script')).toBeNull();
    expect(JSON.stringify(hast)).toContain('body text');
  });
});
});
5 changes: 5 additions & 0 deletions lib/compile.ts
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ import remarkGfm from 'remark-gfm';

import MdxSyntaxError from '../errors/mdx-syntax-error';
import { rehypeToc } from '../processor/plugin/toc';
import rehypeStripDangerousHtml from '../processor/sanitize/rehype-strip-dangerous-html';
import {
defaultTransforms,
tailwindTransformer,
Expand Down Expand Up @@ -81,6 +82,10 @@ const compile = (
rehypePlugins.push([rehypeSanitize, sanitizeSchema]);
}

// MDX (non-`md`) content keeps raw HTML as JSX nodes that the schema above never
// sees, so strip script-execution vectors regardless of format.
rehypePlugins.push(rehypeStripDangerousHtml);

try {
const vfile = mdxCompileSync(text, {
outputFormat: 'function-body',
Expand Down
2 changes: 2 additions & 0 deletions lib/mdxish.ts
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ import { mdxishCompilers } from '../processor/compile';
import { rehypeFlattenTableCellParagraphs } from '../processor/plugin/flatten-table-cell-paragraphs';
import { rehypeMdxishComponents } from '../processor/plugin/mdxish-components';
import { mdxComponentHandlers } from '../processor/plugin/mdxish-handlers';
import rehypeStripDangerousHtml from '../processor/sanitize/rehype-strip-dangerous-html';
import calloutTransformer from '../processor/transform/callouts';
import codeTabsTransformer from '../processor/transform/code-tabs';
import embedTransformer from '../processor/transform/embeds';
Expand Down Expand Up @@ -286,6 +287,7 @@ export function mdxish(mdContent: string, opts: MdxishOpts = {}): Root {
.use(restoreBooleanProperties)
.use(safeMode ? undefined : resolveDeferredAttributeExpressionProps) // Evaluate deferred attribute expressions on mdx-jsx nodes (now past rehypeRaw's clone)
.use(normalizeMdxJsxNodes) // Rewrite `mdx-jsx` back to standard `element` nodes for downstream plugins
.use(rehypeStripDangerousHtml) // Strip script/foreign-content/event-handler XSS vectors from raw HTML
.use(rehypeFlattenTableCellParagraphs) // Remove <p> wrappers inside table cells to prevent margin issues
.use(mdxishMermaidTransformer) // Add mermaid-render className to pre wrappers
.use(generateSlugForHeadings)
Expand Down
153 changes: 153 additions & 0 deletions processor/sanitize/dangerous-html.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,153 @@
import type { Root } from 'hast';

/**
 * Elements removed wholesale (subtree included) because they execute script,
 * load remote resources, or open a foreign-content (MathML/SVG) parsing context
 * that lets `<script>` survive namespace-confusion bypasses.
 *
 * Names are stored lowercase; callers lowercase the tag name before lookup.
 * `embed` is listed alongside `object`/`applet` because it loads external
 * content in the same way.
 */
const DANGEROUS_TAG_NAMES = new Set([
  'script',
  'noscript',
  'style',
  'template',
  'iframe',
  'frame',
  'frameset',
  'object',
  'embed',
  'applet',
  'base',
  'link',
  'meta',
  'svg',
  'math',
]);

/**
 * Attributes whose value is a URL and may therefore hold a `javascript:` payload.
 * Entries are stored in normalized form (lowercase, letters only), so a hast
 * property such as `xLinkHref` or `formAction` and a raw JSX attribute such as
 * `xlink:href` or `formaction` both resolve to the same entry once the attribute
 * name has been normalized the same way.
 */
const URL_ATTRIBUTES: ReadonlySet<string> = new Set([
  'action',
  'background',
  'cite',
  'data',
  'formaction',
  'href',
  'longdesc',
  'manifest',
  'ping',
  'poster',
  'src',
  'srcset',
  'xlinkhref',
]);

// Matches any attribute or property name that begins with `on`, in any letter
// case (e.g. `onerror`, `onClick`).
const EVENT_HANDLER_ATTRIBUTE = /^on/i;

// Every character from U+0000 through U+0020 (control characters plus the space).
// HTML strips these before resolving a URL scheme; keeping them would let
// `java\tscript:` slip past the scheme check in `isDangerousUrl`.
// eslint-disable-next-line no-control-regex
const IGNORED_URL_CHARS = /[\u0000-\u0020]/g;

/**
 * Reduces an attribute name to lowercase ASCII letters only, so that
 * `xLinkHref`, `xlink:href` and `XLINK-HREF` all become `xlinkhref`.
 */
const normalizeAttributeName = (name: string): string => {
  // Lowercase first: the strip below only recognizes `a`–`z`.
  const lowered = name.toLowerCase();
  return lowered.replace(/[^a-z]/g, '');
};

/** True when the attribute/property name starts with `on` (any letter case). */
const isEventHandlerAttribute = (name: string): boolean => name.search(EVENT_HANDLER_ATTRIBUTE) !== -1;

/** True when the name, once normalized, is one of the URL-valued attributes. */
const isUrlAttribute = (name: string): boolean => {
  const canonicalName = normalizeAttributeName(name);
  return URL_ATTRIBUTES.has(canonicalName);
};

// A name beginning with an ASCII capital letter denotes a custom React component
// (e.g. `<Callout>`) rather than a host element; `on*` and URL-like values on a
// component are props, not DOM handlers.
const isComponentName = (name: string): boolean => {
  // `charCodeAt(0)` is NaN for an empty name, which fails both comparisons.
  const firstCharCode = name.charCodeAt(0);
  return firstCharCode >= 65 && firstCharCode <= 90; // 'A'..'Z'
};

/**
 * Decides whether a URL value would execute script when navigated to or loaded.
 *
 * Non-string values are never dangerous. Ignored characters are removed and the
 * value is lowercased before inspection. Only a leading scheme counts: when a
 * `/`, `?` or `#` occurs before the first colon, that colon belongs to a relative
 * path rather than a scheme.
 *
 * @param value - The raw attribute/property value.
 * @returns `true` for `javascript:` and `vbscript:` URLs, and for `data:` URLs
 *   whose media-type portion (the text before the first comma) mentions `html`,
 *   `xml`, `script` or `svg`.
 */
const isDangerousUrl = (value: unknown): boolean => {
  if (typeof value !== 'string') return false;

  const compact = value.replace(IGNORED_URL_CHARS, '').toLowerCase();
  const schemeEnd = compact.indexOf(':');
  if (schemeEnd < 0) return false;

  const firstPathDelimiter = compact.search(/[/?#]/);
  const colonBelongsToPath = firstPathDelimiter >= 0 && firstPathDelimiter < schemeEnd;
  if (colonBelongsToPath) return false;

  switch (compact.slice(0, schemeEnd)) {
    case 'javascript':
    case 'vbscript':
      return true;
    case 'data':
      // `data:` is only dangerous when it can render markup/script (e.g. data:text/html).
      return /^[^,]*(?:html|xml|script|svg)/.test(compact.slice(schemeEnd + 1));
    default:
      return false;
  }
};

// Two node shapes reach the sanitizer. MDX keeps JSX (raw HTML included) as
// `mdxJsxFlowElement`/`mdxJsxTextElement` nodes carrying a `name` and an
// `attributes` array; the mdxish/`md` pipelines emit hast `element` nodes carrying
// `tagName` and `properties`. The deliberately loose types below cover both, so a
// single walker can sanitize either tree.

/** One entry of an MDX JSX node's `attributes` array. */
interface SanitizableAttribute {
  name?: string | null;
  type: string;
  value?: unknown;
}

/** Minimal structural view of a hast or MDX JSX node. */
interface SanitizableNode {
  attributes?: SanitizableAttribute[];
  children?: SanitizableNode[];
  name?: string | null;
  properties?: Record<string, unknown> | null;
  tagName?: string;
  type: string;
}

/**
 * Resolves the tag or component name of a node, whichever shape it has.
 *
 * @returns `tagName` for hast `element` nodes (when it is a string), `name` for
 *   MDX JSX element nodes, and `null` for every other node type or when the
 *   name is missing.
 */
const elementName = (node: SanitizableNode): string | null => {
  switch (node.type) {
    case 'element':
      return typeof node.tagName === 'string' ? node.tagName : null;
    case 'mdxJsxFlowElement':
    case 'mdxJsxTextElement':
      return node.name ?? null;
    default:
      return null;
  }
};

/**
 * Removes script-bearing attributes from a single host element, in place.
 * Handles both storage forms: the hast `properties` object and the MDX JSX
 * `attributes` array. An entry is dropped when it is an event handler, or when
 * it is a URL attribute holding a dangerous URL.
 */
const cleanHostElement = (node: SanitizableNode): void => {
  const isScriptBearing = (attributeName: string, attributeValue: unknown): boolean =>
    isEventHandlerAttribute(attributeName) || (isUrlAttribute(attributeName) && isDangerousUrl(attributeValue));

  const { properties } = node;
  if (properties) {
    Object.entries(properties).forEach(([propertyName, propertyValue]) => {
      if (isScriptBearing(propertyName, propertyValue)) {
        delete properties[propertyName];
      }
    });
  }

  if (node.attributes) {
    node.attributes = node.attributes.filter(({ name, type, value }) => {
      // Anything other than a named JSX attribute (e.g. `{...spread}`) is kept as-is.
      if (type !== 'mdxJsxAttribute' || typeof name !== 'string') return true;
      return !isScriptBearing(name, value);
    });
  }
};

/**
 * Walks the descendants of `node`, mutating the tree in place: dangerous host
 * elements are removed together with their subtree, and surviving host elements
 * have their script-bearing attributes stripped. Components and non-element nodes
 * are not cleaned themselves but are still descended into, so raw HTML nested
 * inside them is sanitized. Children are visited last-to-first so that removing
 * one never shifts an index that has yet to be visited.
 */
const cleanChildren = (node: SanitizableNode): void => {
  const { children } = node;
  if (!children) return;

  let position = children.length;
  while (position > 0) {
    position -= 1;
    const current = children[position];
    const tag = elementName(current);

    if (tag === null || isComponentName(tag)) {
      // Not a host element: leave its own attributes alone, only descend.
      cleanChildren(current);
    } else if (DANGEROUS_TAG_NAMES.has(tag.toLowerCase())) {
      // Drop the element and everything beneath it.
      children.splice(position, 1);
    } else {
      cleanHostElement(current);
      cleanChildren(current);
    }
  }
};

/**
 * Removes script-execution vectors from a HAST/MDX tree in place: dangerous
 * elements such as `<script>` and MathML/SVG foreign content, event-handler
 * attributes, and dangerous URLs (`javascript:`, `vbscript:`, markup-bearing
 * `data:`). Works on both hast `element` nodes and MDX JSX nodes. PascalCase
 * custom components keep their props (those are React props, not DOM handlers);
 * only host elements are sanitized.
 *
 * @param tree - Root of the tree to sanitize; mutated, nothing is returned.
 */
export const stripDangerousHtml = (tree: Root): void => {
  cleanChildren(tree);
};
Loading
Loading