-
Notifications
You must be signed in to change notification settings - Fork 18
fix: sanitize raw HTML in MDXISH & MDX renderers #1526
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Open
eaglethrost
wants to merge
10
commits into
next
Choose a base branch
from
dimas/rm-17024-stored-xss-in-hub-docs-renderer-via-mathml
base: next
Could not load branches
Branch not found: {{ refName }}
Loading
Could not load tags
Nothing to show
Loading
Are you sure you want to change the base?
Some commits from the old base branch may be removed from the timeline,
and old review comments may become outdated.
Open
Changes from 1 commit
Commits
Show all changes
10 commits
Select commit
Hold shift + click to select a range
15b76ac
feat: sanitize mdxish & mdx
eaglethrost c2d6449
fix: github test
eaglethrost d771030
fix: tests, remove style
eaglethrost f3957e3
test: enhance
eaglethrost 053eb40
chore: improvements
eaglethrost 31035c3
fix: tests & code structure comments
eaglethrost df6d12e
chore: coderabbit comments
eaglethrost 1c9ef15
Merge branch 'next' into dimas/rm-17024-stored-xss-in-hub-docs-render…
eaglethrost 26e7f0f
fix: remove embed
eaglethrost 1b2b2c9
fix: move remark plugins
eaglethrost File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Some comments aren't visible on the classic Files Changed page.
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,52 @@ | ||
| import { render } from '@testing-library/react'; | ||
| import React from 'react'; | ||
|
|
||
| import { execute } from '../helpers'; | ||
|
|
||
/**
 * The `md` format sanitizes via rehype-sanitize (covered in run.test.tsx). Default
 * MDX format keeps raw HTML as JSX nodes that schema never sees, so these assert the
 * shared dangerous-HTML stripper closes that path too.
 */
describe('MDX (compile) sanitization', () => {
  it('strips script-execution vectors in default MDX format', () => {
    const md = [
      '# Docs',
      '',
      '<script>window.__xss = 1</script>',
      '',
      '<a href="javascript:alert(1)">link</a>',
      '',
      '<img src="x" onerror="window.__xss = 1" />',
      '',
      '<iframe src="javascript:alert(1)"></iframe>',
    ].join('\n');

    const Component = execute(md, {}, {}); // no format => MDX
    const { container } = render(<Component />);

    expect(container.querySelector('script')).not.toBeInTheDocument();
    expect(container.querySelector('iframe')).not.toBeInTheDocument();

    // The link text still renders, but no anchor carries a script-capable href.
    // Code scanning (CodeQL "Incomplete URL scheme check") flagged a check that
    // looked for `javascript:` only, so `data:` and `vbscript:` are covered too.
    // Hrefs are trimmed and lowercased first so `JavaScript:` cannot slip through.
    // eslint-disable-next-line no-script-url
    const scriptSchemes = ['javascript:', 'data:', 'vbscript:'];
    const carriesScriptScheme = (href: string | null): boolean => {
      const normalized = href?.trim().toLowerCase() ?? '';
      return scriptSchemes.some(scheme => normalized.startsWith(scheme));
    };
    const hrefs = [...container.querySelectorAll('a')].map(a => a.getAttribute('href'));
    expect(hrefs.some(carriesScriptScheme)).toBe(false);
    expect(container.textContent).toContain('link');

    // Image still renders, but the onerror handler is gone.
    const image = container.querySelector('img');
    expect(image?.getAttribute('onerror')).toBeNull();
  });

  it('strips the MathML namespace-confusion payload in default MDX format', () => {
    const md = '# Docs\n\n<math><mtext><script>window.__xss = 1</script></mtext></math>';

    const Component = execute(md, {}, {});
    const { container } = render(<Component />);

    // The script and its MathML wrapper are removed; the heading is left alone.
    expect(container.querySelector('script')).not.toBeInTheDocument();
    expect(container.querySelector('math')).not.toBeInTheDocument();
    expect(container.querySelector('h1')).toBeInTheDocument();
  });
});
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,145 @@ | ||
| import type { RMDXModule } from '../../../types'; | ||
|
|
||
| import { visit } from 'unist-util-visit'; | ||
|
|
||
| import { mdxish } from '../../../lib'; | ||
| import { findAllElementsByTagName, findElementByTagName } from '../../helpers'; | ||
|
|
||
/**
 * Gathers the distinct property names used by `element` nodes anywhere in `tree`.
 *
 * @param tree - Tree returned by `mdxish`.
 * @returns Each property key once, in first-seen order.
 */
function allPropertyKeys(tree: ReturnType<typeof mdxish>): string[] {
  const seen = new Set<string>();

  visit(tree, 'element', element => {
    // Elements without a `properties` object contribute nothing.
    if (element.properties) {
      Object.keys(element.properties).forEach(propertyName => {
        seen.add(propertyName);
      });
    }
  });

  return Array.from(seen);
}
|
|
||
describe('mdxish raw HTML sanitization', () => {
  describe('script execution vectors', () => {
    it('strips the MathML namespace-confusion payload from the report', () => {
      const source = '# Docs\n\n<math><mtext><script>window.__xssfired=1</script></mtext></math>\n';
      const hast = mdxish(source);

      // The script and both MathML wrappers must be gone.
      expect(findElementByTagName(hast, 'script')).toBeNull();
      expect(findElementByTagName(hast, 'math')).toBeNull();
      expect(findElementByTagName(hast, 'mtext')).toBeNull();
      // Markdown around the payload is left intact.
      expect(findElementByTagName(hast, 'h1')).not.toBeNull();
    });

    it('strips the exact String.fromCharCode exfil payload from the report', () => {
      const exfilScript = 'fetch(String.fromCharCode(47,97,112,105)).then(function(r){return r.text()})';
      const hast = mdxish(`# Docs\n\n<math><mtext><script>${exfilScript}</script></mtext></math>\n`);

      expect(findElementByTagName(hast, 'script')).toBeNull();
      // No trace of the script body may remain anywhere in the serialized tree.
      expect(JSON.stringify(hast)).not.toContain('fromCharCode');
    });

    it('strips a bare top-level <script>', () => {
      const hast = mdxish('<script>alert(1)</script>');

      expect(findElementByTagName(hast, 'script')).toBeNull();
    });

    it('strips SVG foreign content carrying a script', () => {
      const hast = mdxish('<svg><foreignObject><script>alert(1)</script></foreignObject></svg>');

      expect(findElementByTagName(hast, 'svg')).toBeNull();
      expect(findElementByTagName(hast, 'script')).toBeNull();
    });

    it('strips dangerous embedders (iframe/object)', () => {
      const hast = mdxish('<iframe src="javascript:alert(1)"></iframe>\n\n<object data="x"></object>');

      expect(findElementByTagName(hast, 'iframe')).toBeNull();
      expect(findElementByTagName(hast, 'object')).toBeNull();
    });
  });

  describe('attribute vectors', () => {
    it('removes event-handler attributes but keeps the element', () => {
      const hast = mdxish('<img src="x.png" onerror="alert(1)" alt="ok">');
      const image = findElementByTagName(hast, 'img');

      expect(image).not.toBeNull();
      expect(allPropertyKeys(hast)).not.toContain('onError');
      expect(image?.properties?.src).toBe('x.png');
    });

    it('removes javascript: hrefs but keeps the anchor text', () => {
      const hast = mdxish('<a href="javascript:alert(1)">click me</a>');
      const link = findElementByTagName(hast, 'a');

      expect(link).not.toBeNull();
      expect(link?.properties?.href).toBeUndefined();
      expect(JSON.stringify(hast)).toContain('click me');
    });

    it('ignores whitespace/control-char obfuscated javascript: URLs', () => {
      const hast = mdxish('<a href="java\tscript:alert(1)">x</a>');
      const link = findElementByTagName(hast, 'a');

      expect(link?.properties?.href).toBeUndefined();
    });
  });

  describe('safe content is preserved', () => {
    it('keeps benign formatting, links, and images', () => {
      const hast = mdxish(
        '<div class="note"><strong>Bold</strong> and <a href="https://example.com">link</a></div>\n\n<img src="https://example.com/a.png" alt="ok">',
      );

      expect(findElementByTagName(hast, 'strong')).not.toBeNull();
      expect(findElementByTagName(hast, 'a')?.properties?.href).toBe('https://example.com');
      expect(findElementByTagName(hast, 'img')?.properties?.src).toBe('https://example.com/a.png');
    });

    it('keeps relative and mailto links', () => {
      const hast = mdxish('<a href="/docs/start">a</a> <a href="mailto:x@y.com">b</a>');
      const anchors = findAllElementsByTagName(hast, 'a');

      expect(anchors.map(anchor => anchor.properties?.href)).toStrictEqual(['/docs/start', 'mailto:x@y.com']);
    });
  });

  describe('custom components', () => {
    // A stub module is enough: these tests only inspect the tree, never render it.
    const testComponents: Record<string, RMDXModule> = {
      TestComponent: {} as RMDXModule,
    };

    it('preserves event-handler-named props on PascalCase components', () => {
      const hast = mdxish('<TestComponent onClick="fn" href="javascript:alert(1)" />', {
        components: testComponents,
      });
      const customElement = findElementByTagName(hast, 'TestComponent');

      // Props on a PascalCase component pass through unchanged.
      expect(customElement?.properties?.onClick).toBe('fn');
      // eslint-disable-next-line no-script-url
      expect(customElement?.properties?.href).toBe('javascript:alert(1)');
    });

    it('still sanitizes raw HTML nested inside a component', () => {
      const hast = mdxish('<TestComponent>\n\n<img src="x" onerror="alert(1)">\n\n</TestComponent>', {
        components: testComponents,
      });

      expect(allPropertyKeys(hast)).not.toContain('onError');
      expect(findElementByTagName(hast, 'img')).not.toBeNull();
    });
  });

  describe('integration with other nodes', () => {
    it('sanitizes raw HTML embedded inside a table cell', () => {
      const tableMarkdown = '| A | B |\n| --- | --- |\n| <img src=x onerror=alert(1)> | ok |';
      const hast = mdxish(tableMarkdown);

      expect(allPropertyKeys(hast)).not.toContain('onError');
      expect(findElementByTagName(hast, 'script')).toBeNull();
    });

    it('sanitizes raw HTML nested inside a callout', () => {
      const calloutMarkdown = '> 📘 Title\n>\n> <script>alert(1)</script> body text';
      const hast = mdxish(calloutMarkdown);

      expect(findElementByTagName(hast, 'script')).toBeNull();
      // Text that followed the stripped script is still present.
      expect(JSON.stringify(hast)).toContain('body text');
    });
  });
});
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,153 @@ | ||
| import type { Root } from 'hast'; | ||
|
|
||
// Script, style, and inert-content containers.
const scriptStyleAndTemplateTags = ['script', 'noscript', 'style', 'template'];
// Frames and plugin-style embedders.
const embedderTags = ['iframe', 'frame', 'frameset', 'object', 'applet'];
// Document-level metadata elements.
const documentMetadataTags = ['base', 'link', 'meta'];
// Roots of foreign (SVG/MathML) content.
const foreignContentTags = ['svg', 'math'];

/**
 * Tag names whose elements are dropped together with their whole subtree: they
 * execute script, load remote resources, or open a foreign-content (MathML/SVG)
 * parsing context that lets `<script>` survive namespace-confusion bypasses.
 * Names are stored lowercase.
 */
const DANGEROUS_TAG_NAMES = new Set([
  ...scriptStyleAndTemplateTags,
  ...embedderTags,
  ...documentMetadataTags,
  ...foreignContentTags,
]);
|
|
||
// Attributes that point at a navigation target or a referenced document.
const linkLikeAttributes = ['href', 'xlinkhref', 'ping', 'cite', 'longdesc', 'manifest'];
// Attributes that point at a resource to load.
const resourceAttributes = ['src', 'srcset', 'poster', 'background', 'data'];
// Attributes that point at a form submission target.
const formTargetAttributes = ['action', 'formaction'];

/**
 * Names of URL-valued attributes that can carry a `javascript:` payload. Entries
 * are stored in normalized form (lowercase, letters only); lookups must normalize
 * the attribute name the same way so hast properties (`xLinkHref`, `formAction`)
 * and raw JSX attributes (`xlink:href`, `formaction`) hit the same entry.
 */
const URL_ATTRIBUTES = new Set([...linkLikeAttributes, ...resourceAttributes, ...formTargetAttributes]);
|
|
||
// Matches any attribute name that begins with "on", in any letter case.
const EVENT_HANDLER_ATTRIBUTE = /^on/i;

// Characters U+0000 through U+0020 (control characters and the space). They are
// removed before the URL scheme is inspected; otherwise an obfuscated
// `java\tscript:` would slip past the protocol check.
// eslint-disable-next-line no-control-regex
const IGNORED_URL_CHARS = /[\u0000-\u0020]/g;

/** Lowercases `name` and drops every character that is not a–z. */
const normalizeAttributeName = (name: string): string => {
  const lowered = name.toLowerCase();
  return lowered.replace(/[^a-z]/g, '');
};

/** True when `name` starts with "on" (case-insensitive), e.g. `onerror` or `onClick`. */
const isEventHandlerAttribute = (name: string): boolean => {
  return EVENT_HANDLER_ATTRIBUTE.test(name);
};

/** True when the normalized form of `name` is listed in `URL_ATTRIBUTES`. */
const isUrlAttribute = (name: string): boolean => {
  const canonicalName = normalizeAttributeName(name);
  return URL_ATTRIBUTES.has(canonicalName);
};

/**
 * True when `name` starts with an uppercase ASCII letter. Such names are custom
 * React components (e.g. `<Callout>`), not host elements, so their `on*` and
 * URL-like values are component props rather than DOM handlers.
 */
const isComponentName = (name: string): boolean => {
  const firstChar = name.charAt(0);
  return firstChar >= 'A' && firstChar <= 'Z';
};
|
|
||
/**
 * True for attribute values that execute on navigation/load.
 *
 * Only a leading scheme matters: a colon that appears after a `/`, `?`, or `#` is
 * part of a relative path, not a scheme.
 *
 * @param value - Raw attribute/property value. Strings are inspected directly.
 *   Arrays (hast represents list-valued properties as arrays) are dangerous when
 *   any entry is. Every other type is treated as safe.
 * @returns `true` for `javascript:`/`vbscript:` URLs and for `data:` URLs whose
 *   media type (the part before the first comma) mentions html, xml, script, or svg.
 */
const isDangerousUrl = (value: unknown): boolean => {
  // Previously arrays always fell through to `false`, so list-valued URL
  // properties were never inspected at all.
  if (Array.isArray(value)) return value.some(entry => isDangerousUrl(entry));
  if (typeof value !== 'string') return false;

  // Drop ignorable characters first so `java\tscript:` reads as `javascript:`.
  const normalized = value.replace(IGNORED_URL_CHARS, '').toLowerCase();
  const colonIndex = normalized.indexOf(':');
  if (colonIndex === -1) return false;

  // A path, query, or fragment delimiter before the colon means there is no scheme.
  const pathDelimiterIndex = normalized.search(/[/?#]/);
  if (pathDelimiterIndex !== -1 && pathDelimiterIndex < colonIndex) return false;

  const scheme = normalized.slice(0, colonIndex);
  if (scheme === 'javascript' || scheme === 'vbscript') return true;

  // `data:` is only dangerous when it can render markup/script (e.g. data:text/html).
  return scheme === 'data' && /^[^,]*(?:html|xml|script|svg)/.test(normalized.slice(colonIndex + 1));
};
|
|
||
// One entry of an MDX JSX node's `attributes` array. Only `type` is always present.
type SanitizableAttribute = { name?: string | null; type: string; value?: unknown };

// Loose node shape that lets a single walker sanitize both tree flavours:
//  - MDX keeps JSX (including raw HTML) as `mdxJsxFlowElement`/`mdxJsxTextElement`
//    nodes carrying `name` + an `attributes` array;
//  - the mdxish/`md` pipelines produce hast `element` nodes carrying `tagName` +
//    `properties`.
// Every field except `type` is optional so either shape fits, giving every
// engine the same sanitization.
interface SanitizableNode {
  attributes?: SanitizableAttribute[];
  children?: SanitizableNode[];
  name?: string | null;
  properties?: Record<string, unknown> | null;
  tagName?: string;
  type: string;
}
|
|
||
/**
 * Resolves the element or component name for either supported node shape.
 *
 * @returns `tagName` for hast `element` nodes (when it is a string), `name` for
 *   MDX JSX flow/text elements, and `null` for everything else (text, root,
 *   fragments).
 */
const elementName = (node: SanitizableNode): string | null => {
  switch (node.type) {
    case 'element':
      return typeof node.tagName === 'string' ? node.tagName : null;
    case 'mdxJsxFlowElement':
    case 'mdxJsxTextElement':
      // A missing or null `name` is reported as null.
      return node.name ?? null;
    default:
      return null;
  }
};
|
|
||
/**
 * Removes script-bearing attributes from a single host element, in place.
 *
 * Handles both storage shapes: the hast `properties` object (offending keys are
 * deleted) and the MDX JSX `attributes` array (replaced by a filtered copy).
 * An attribute is removed when its name is an event handler, or when it is a
 * URL attribute holding a dangerous URL.
 */
const cleanHostElement = (node: SanitizableNode): void => {
  const isUnsafe = (attributeName: string, attributeValue: unknown): boolean =>
    isEventHandlerAttribute(attributeName) || (isUrlAttribute(attributeName) && isDangerousUrl(attributeValue));

  const props = node.properties;
  if (props) {
    Object.keys(props)
      .filter(propertyName => isUnsafe(propertyName, props[propertyName]))
      .forEach(propertyName => {
        delete props[propertyName];
      });
  }

  if (node.attributes) {
    node.attributes = node.attributes.filter(attribute => {
      const attributeName = attribute.name;
      // Anything that is not a plain named JSX attribute (e.g. `{...spread}`) is kept.
      if (attribute.type !== 'mdxJsxAttribute' || typeof attributeName !== 'string') return true;
      return !isUnsafe(attributeName, attribute.value);
    });
  }
};
|
|
||
/**
 * Sanitizes every descendant of `node` in place.
 *
 * Host elements whose lowercased name is in `DANGEROUS_TAG_NAMES` are removed
 * together with their subtree. Remaining host elements have their attributes
 * cleaned. Components and non-element nodes are left as they are but still
 * recursed into, so raw HTML nested inside them is cleaned as well. Children are
 * visited last-to-first so removing one never shifts an index still to be visited.
 */
const cleanChildren = (node: SanitizableNode): void => {
  const { children } = node;
  if (!children) return;

  let cursor = children.length;
  while (cursor > 0) {
    cursor -= 1;
    const current = children[cursor];
    const tag = elementName(current);

    if (tag === null || isComponentName(tag)) {
      // Not a host element: keep the node itself, sanitize what it contains.
      cleanChildren(current);
    } else if (DANGEROUS_TAG_NAMES.has(tag.toLowerCase())) {
      // Dangerous host element: remove it and everything beneath it.
      children.splice(cursor, 1);
    } else {
      cleanHostElement(current);
      cleanChildren(current);
    }
  }
};
|
|
||
/**
 * Strips script-execution vectors from a HAST/MDX tree, mutating it in place:
 * `<script>` and the other blocked tags, MathML/SVG foreign content,
 * event-handler attributes, and `javascript:`/`vbscript:` URLs (plus `data:`
 * URLs that can render markup or script).
 *
 * Works on both hast `element` nodes and MDX JSX nodes. PascalCase custom
 * components keep their props (they are React props, not DOM handlers); only
 * host elements are sanitized.
 *
 * @param tree - Root of the tree to sanitize.
 */
export const stripDangerousHtml = (tree: Root): void => {
  cleanChildren(tree);
};
Oops, something went wrong.
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
Uh oh!
There was an error while loading. Please reload this page.