From 6103285a5828ae845dcdeb5e3ddc7093cf332543 Mon Sep 17 00:00:00 2001 From: eagletrhost Date: Fri, 22 May 2026 00:34:48 +1000 Subject: [PATCH 01/15] fix: htmlblocks in tables by rehyping --- __tests__/lib/mdxish/html-blocks.test.ts | 123 ++++++++++++++++++ lib/mdxish.ts | 2 + .../mdxish/rehype-html-blocks-in-jsx.ts | 74 +++++++++++ 3 files changed, 199 insertions(+) create mode 100644 __tests__/lib/mdxish/html-blocks.test.ts create mode 100644 processor/transform/mdxish/rehype-html-blocks-in-jsx.ts diff --git a/__tests__/lib/mdxish/html-blocks.test.ts b/__tests__/lib/mdxish/html-blocks.test.ts new file mode 100644 index 000000000..2218930e1 --- /dev/null +++ b/__tests__/lib/mdxish/html-blocks.test.ts @@ -0,0 +1,123 @@ +import type { Element } from 'hast'; + +import { mdxish } from '../../../lib'; +import { findAllElementsByTagName, findElementByTagName } from '../../helpers'; + +describe('mdxish HTMLBlock', () => { + describe('standalone', () => { + it('renders as with the decoded html prop', () => { + const tree = mdxish('{`
Hello
`}
'); + + const htmlBlock = findElementByTagName(tree, 'html-block'); + expect(htmlBlock).toMatchObject({ + type: 'element', + tagName: 'html-block', + properties: { html: '
Hello
' }, + children: [], + }); + }); + }); + + describe('nested inside JSX blocks (RM-16726)', () => { + it('renders inside a cell as with the decoded html prop', () => { + const md = `
+ + + + + + + + + +
NameMarkup
Custom{\`
Hello
\`}
`; + + const tree = mdxish(md); + + const rawHtmlBlock = findElementByTagName(tree, 'HTMLBlock'); + expect(rawHtmlBlock).toBeNull(); + + const htmlBlock = findElementByTagName(tree, 'html-block'); + expect(htmlBlock).toMatchObject({ + type: 'element', + tagName: 'html-block', + properties: { html: '
Hello
' }, + children: [], + }); + }); + + it('renders inside a generic JSX block as with the decoded html prop', () => { + const md = '
{`

nested

`}
'; + + const tree = mdxish(md); + + const htmlBlock = findElementByTagName(tree, 'html-block'); + expect(htmlBlock).toMatchObject({ + type: 'element', + tagName: 'html-block', + properties: { html: '

nested

' }, + }); + }); + + it('preserves safeMode and runScripts attributes when nested', () => { + const md = ` + + + + + +
{\`
raw
\`}
`; + + const tree = mdxish(md); + + const htmlBlock = findElementByTagName(tree, 'html-block'); + expect(htmlBlock).toMatchObject({ + type: 'element', + tagName: 'html-block', + properties: { + html: '
raw
', + safeMode: 'true', + runScripts: 'false', + }, + }); + }); + + it('renders multiple HTMLBlocks inside the same Table', () => { + const md = ` + + + + + + +
{\`one\`}{\`two\`}
`; + + const tree = mdxish(md); + + const htmlBlocks = findAllElementsByTagName(tree, 'html-block'); + expect(htmlBlocks).toHaveLength(2); + expect(htmlBlocks[0].properties).toMatchObject({ html: 'one' }); + expect(htmlBlocks[1].properties).toMatchObject({ html: 'two' }); + }); + + it('leaves no RDMX_HTMLBLOCK markers or stray comment nodes in the tree', () => { + const md = ` + + + + + +
+ {\`
x
\`}
+
`; + + const tree = mdxish(md); + const serialized = JSON.stringify(tree); + + expect(serialized).not.toContain('RDMX_HTMLBLOCK'); + + const htmlBlock = findElementByTagName(tree, 'html-block') as Element; + expect(htmlBlock.children).toStrictEqual([]); + }); + }); +}); diff --git a/lib/mdxish.ts b/lib/mdxish.ts index a4f8bd873..01f1db56d 100644 --- a/lib/mdxish.ts +++ b/lib/mdxish.ts @@ -47,6 +47,7 @@ import { preprocessJSXExpressions, removeJSXComments, } from '../processor/transform/mdxish/preprocess-jsx-expressions'; +import rehypeHtmlBlocksInJsx from '../processor/transform/mdxish/rehype-html-blocks-in-jsx'; import restoreSnakeCaseComponentNames from '../processor/transform/mdxish/restore-snake-case-component-name'; import { preserveBooleanProperties, @@ -282,6 +283,7 @@ export function mdxish(mdContent: string, opts: MdxishOpts = {}): Root { .use(rehypeRaw, { passThrough: ['html-block', 'mdx-jsx'] }) // MDX JSX nodes bypass parse5's string-only HTML round-trip .use(restoreBooleanProperties) .use(normalizeMdxJsxNodes) // Rewrite `mdx-jsx` back to standard `element` nodes for downstream plugins + .use(rehypeHtmlBlocksInJsx) // Reattach HTMLBlock contents that survived rehypeRaw nested inside JSX blocks .use(rehypeFlattenTableCellParagraphs) // Remove

wrappers inside table cells to prevent margin issues .use(mdxishMermaidTransformer) // Add mermaid-render className to pre wrappers .use(generateSlugForHeadings) diff --git a/processor/transform/mdxish/rehype-html-blocks-in-jsx.ts b/processor/transform/mdxish/rehype-html-blocks-in-jsx.ts new file mode 100644 index 000000000..2a2e5a3a3 --- /dev/null +++ b/processor/transform/mdxish/rehype-html-blocks-in-jsx.ts @@ -0,0 +1,74 @@ +import type { Element, ElementContent, Properties, Root } from 'hast'; +import type { Transformer } from 'unified'; + +import { visit } from 'unist-util-visit'; + +import { formatHtmlForMdxish } from '../../utils'; + +import { base64Decode, HTML_BLOCK_CONTENT_END, HTML_BLOCK_CONTENT_START } from './preprocess-jsx-expressions'; + +const startEscaped = HTML_BLOCK_CONTENT_START.replace(/[.*+?^${}()|[\]\\]/g, '\\$&'); +const endEscaped = HTML_BLOCK_CONTENT_END.replace(/[.*+?^${}()|[\]\\]/g, '\\$&'); +// Marker emitted as an HTML comment by preprocessJSXExpressions. rehypeRaw parses +// the comment body (sans ), so we match the inner form here. 
+const COMMENT_MARKER_RE = new RegExp( + `^${startEscaped.replace(/^$/, '')}$`, +); + +const KNOWN_HTML_BLOCK_PROPS: Record = { + safemode: 'safeMode', + runscripts: 'runScripts', +}; + +function decodeProtectedComment(value: string): string | null { + const match = value.match(COMMENT_MARKER_RE); + if (!match) return null; + try { + return base64Decode(match[1]); + } catch { + return null; + } +} + +function findEncodedPayload(children: ElementContent[]): string | null { + return children.reduce((found, child) => { + if (found !== null) return found; + if (child.type !== 'comment') return null; + return decodeProtectedComment(child.value); + }, null); +} + +function normalizeHtmlBlockProperties(properties: Properties | undefined, html: string): Properties { + const normalized: Properties = { html }; + if (!properties) return normalized; + + Object.entries(properties).forEach(([key, value]) => { + if (key === 'html') return; + const canonical = KNOWN_HTML_BLOCK_PROPS[key.toLowerCase()] ?? key; + normalized[canonical] = value; + }); + return normalized; +} + +/** + * Converts elements that survived rehypeRaw (because they were nested + * inside another JSX block like , so the mdast-level transformer never saw + * them) into the canonical hast element the renderer expects. + */ +const rehypeHtmlBlocksInJsx = (): Transformer => tree => { + visit(tree, 'element', (node: Element) => { + // rehypeRaw routes HTMLBlock through parse5, which lowercases tag names. + if (node.tagName.toLowerCase() !== 'htmlblock') return; + + const encoded = findEncodedPayload(node.children ?? 
[]); + if (encoded === null) return; + + const html = formatHtmlForMdxish(encoded); + + node.tagName = 'html-block'; + node.properties = normalizeHtmlBlockProperties(node.properties, html); + node.children = []; + }); +}; + +export default rehypeHtmlBlocksInJsx; From 59b67fcdad3acf9ce3213460cfbf5364a2424069 Mon Sep 17 00:00:00 2001 From: eagletrhost Date: Mon, 25 May 2026 13:12:13 +1000 Subject: [PATCH 02/15] fix: github regex --- processor/transform/mdxish/rehype-html-blocks-in-jsx.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/processor/transform/mdxish/rehype-html-blocks-in-jsx.ts b/processor/transform/mdxish/rehype-html-blocks-in-jsx.ts index 2a2e5a3a3..9151e838c 100644 --- a/processor/transform/mdxish/rehype-html-blocks-in-jsx.ts +++ b/processor/transform/mdxish/rehype-html-blocks-in-jsx.ts @@ -12,7 +12,7 @@ const endEscaped = HTML_BLOCK_CONTENT_END.replace(/[.*+?^${}()|[\]\\]/g, '\\$&') // Marker emitted as an HTML comment by preprocessJSXExpressions. rehypeRaw parses // the comment body (sans ), so we match the inner form here. const COMMENT_MARKER_RE = new RegExp( - `^${startEscaped.replace(/^$/, '')}$`, + `^${startEscaped.replace(/^/g; + +/** + * Rewrites protected HTMLBlock HTML comments into MDX comments so remarkMdx can + * parse a `
` that contains an ``. Length-preserving. + */ +export const neutralizeHtmlBlockComments = (value: string): string => + value.replace(HTML_COMMENT_RE, (_match, marker: string) => `{/*${marker} */}`); + +/** + * Reads the protected payload out of an `` element's MDX comment child + * (an `mdxFlowExpression`/`mdxTextExpression` holding `/*RDMX_HTMLBLOCK:โ€ฆ*\/`). + */ +const extractPayload = (element: MdxJsxFlowElement | MdxJsxTextElement): string | null => { + let payload: string | null = null; + visit(element, expr => expr.type === 'mdxFlowExpression' || expr.type === 'mdxTextExpression', expr => { + const match = (expr as { value?: string }).value?.match(PAYLOAD_RE); + if (match) payload = match[1]; + }); + return payload; +}; + +/** + * Builds the `html-block` hProperties from the `` element's JSX + * attributes (e.g. `safeMode`, `runScripts`), mirroring the rehype fallback. + */ +const collectAttributes = (element: MdxJsxFlowElement | MdxJsxTextElement): Record => { + const props: Record = {}; + element.attributes.forEach(attr => { + if (attr.type === 'mdxJsxAttribute' && typeof attr.value === 'string' && attr.name !== 'html') { + props[attr.name] = attr.value; + } + }); + return props; +}; + +/** + * Converts `` JSX elements inside a re-parsed table into `html-block` + * MDAST nodes, decoding the protected base64 payload. 
+ */ +export const convertHtmlBlockElements = (tree: Node) => { + visit( + tree, + node => node.type === 'mdxJsxFlowElement' || node.type === 'mdxJsxTextElement', + (node, index, parent: Parent | undefined) => { + const element = node as MdxJsxFlowElement | MdxJsxTextElement; + if (element.name !== 'HTMLBlock' || !parent || index === undefined) return; + + const payload = extractPayload(element); + if (payload === null) return; + + const html = formatHtmlForMdxish(base64Decode(payload)); + const htmlBlock: HTMLBlock = { + position: element.position, + children: [{ type: 'text', value: html }], + type: NodeTypes.htmlBlock, + data: { + hName: 'html-block', + hProperties: { + html, + ...collectAttributes(element), + }, + }, + }; + + parent.children[index] = htmlBlock; + }, + ); +}; diff --git a/processor/transform/mdxish/tables/mdxish-tables.ts b/processor/transform/mdxish/tables/mdxish-tables.ts index 3dab225e5..ce3f5728d 100644 --- a/processor/transform/mdxish/tables/mdxish-tables.ts +++ b/processor/transform/mdxish/tables/mdxish-tables.ts @@ -10,6 +10,7 @@ import remarkParse from 'remark-parse'; import { unified } from 'unified'; import { EXIT, visit } from 'unist-util-visit'; +import { NodeTypes } from '../../../../enums'; import { gemojiFromMarkdown } from '../../../../lib/mdast-util/gemoji'; import { legacyVariableFromMarkdown } from '../../../../lib/mdast-util/legacy-variable'; import { gemoji } from '../../../../lib/micromark/gemoji'; @@ -20,6 +21,7 @@ import codeTabsTransformer from '../../code-tabs'; import { extractText } from '../../extract-text'; import normalizeEmphasisAST from '../normalize-malformed-md-syntax'; +import { convertHtmlBlockElements, neutralizeHtmlBlockComments } from './html-blocks-in-table'; import { normalizeTagSpacing } from './normalize-tag-spacing'; import { remapPositionsToOriginal } from './remap-positions'; import { repairUnclosedTags } from './repair-unclosed-tags'; @@ -147,6 +149,13 @@ const processTableNode = ( let 
tableHasFlowContent = false; + // An `html-block` (a converted ``) is block-level content that a + // markdown table cell can't represent, so keep the table as a JSX `
`. + visit(node as Node, NodeTypes.htmlBlock, () => { + tableHasFlowContent = true; + return EXIT; + }); + // Re-parse text-only cells through markdown and detect flow content visit(node as Node, isTableCell, (cell: MdxJsxTableCell) => { if (!isTextOnly(cell.children as unknown[])) return; @@ -312,36 +321,48 @@ const mdxishTables = (): Transform => tree => { if (typeof index !== 'number' || !parent || !('children' in parent)) return; if (!node.value.startsWith('` payload is an HTML comment, which is invalid MDX + // and makes the re-parse below throw. Rewrite it to an MDX comment (a + // length-preserving swap, so offsets stay aligned) and parse against that + // copy, leaving the original node intact for the failure fallback. + const neutralizedValue = neutralizeHtmlBlockComments(node.value); + const parseNode = neutralizedValue === node.value ? node : { ...node, value: neutralizedValue }; + // Main logic to transform table node to its parts // Because the processor uses remarkMdx, it is stricter in what it accepts // and only accepts valid MDX syntax. in the table node. // To get around that, we have some fallback logics after trying to repair the table content - let parsed = parseTableNode(tableNodeProcessor, node); + let parsed = parseTableNode(tableNodeProcessor, parseNode); if (!parsed) { // First common error is unclosed HTML tags - const repaired = repairUnclosedTags(node.value); - if (repaired.value !== node.value) { + const repaired = repairUnclosedTags(parseNode.value); + if (repaired.value !== parseNode.value) { parsed = parseTableNode( tableNodeProcessor, - { ...node, value: repaired.value }, - { inserts: repaired.inserts, originalSource: node.value }, + { ...parseNode, value: repaired.value }, + { inserts: repaired.inserts, originalSource: parseNode.value }, ); } if (!parsed) { // Second common error is having a line with text and an opening tag // E.g. text
\n
text - const normalized = normalizeTagSpacing(node.value); - if (normalized.value !== node.value) { + const normalized = normalizeTagSpacing(parseNode.value); + if (normalized.value !== parseNode.value) { parsed = parseTableNode( tableNodeProcessor, - { ...node, value: normalized.value }, - { inserts: normalized.inserts, originalSource: node.value }, + { ...parseNode, value: normalized.value }, + { inserts: normalized.inserts, originalSource: parseNode.value }, ); } } } if (parsed) { + // Re-parsed `` elements (now MDX comments) become `html-block` + // MDAST nodes here; the mdast-level `mdxishHtmlBlocks` never sees them + // because they are nested inside the JSX `
`. + convertHtmlBlockElements(parsed as Node); + // If the table is parsed successfully, we can now process it further // to build on the markdown / JSX table visit(parsed as Node, isMDXElement, (tableNode: MdxJsxFlowElement | MdxJsxTextElement) => { From 3aee631a186d83237e4647c27242fcf6dec324f1 Mon Sep 17 00:00:00 2001 From: eagletrhost Date: Mon, 25 May 2026 19:36:44 +1000 Subject: [PATCH 04/15] chore: bump markdown --- package.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/package.json b/package.json index 27bbc9c97..27b5869ee 100644 --- a/package.json +++ b/package.json @@ -177,7 +177,7 @@ }, { "path": "dist/main.node.js", - "maxSize": "947KB" + "maxSize": "950KB" } ] }, From 4472bb118ee551899e7d0c4ebba51f1aedb61161 Mon Sep 17 00:00:00 2001 From: eagletrhost Date: Mon, 25 May 2026 22:47:22 +1000 Subject: [PATCH 05/15] test: more html blocks edge cases --- __tests__/lib/mdxish/html-blocks.test.ts | 238 +++++++++++++++++++++-- 1 file changed, 226 insertions(+), 12 deletions(-) diff --git a/__tests__/lib/mdxish/html-blocks.test.ts b/__tests__/lib/mdxish/html-blocks.test.ts index 49e14e302..15b2054eb 100644 --- a/__tests__/lib/mdxish/html-blocks.test.ts +++ b/__tests__/lib/mdxish/html-blocks.test.ts @@ -15,6 +15,17 @@ function ensureJsxTableIsParsed(md: string) { expect(tableNodes).toHaveLength(1); } +/** Decoded `html` props of every in the rendered tree, in document order. */ +function htmlBlockPayloads(tree: ReturnType) { + return findAllElementsByTagName(tree, 'html-block').map(node => node.properties?.html); +} + +/** Asserts no raw survived and no protected marker leaked into the tree. 
*/ +function expectFullyConverted(tree: ReturnType) { + expect(findElementByTagName(tree, 'HTMLBlock')).toBeNull(); + expect(JSON.stringify(tree)).not.toContain('RDMX_HTMLBLOCK'); +} + describe(' in mdxish', () => { describe('standalone', () => { it('renders as with the decoded html prop', () => { @@ -28,18 +39,130 @@ describe(' in mdxish', () => { children: [], }); }); + + it('preserves multiline HTML content verbatim', () => { + const tree = mdxish('{`
\n multi\n
`}
'); + + expect(htmlBlockPayloads(tree)).toStrictEqual(['
\n multi\n
']); + expectFullyConverted(tree); + }); + + it('preserves raw `}
'); + + expect(htmlBlockPayloads(tree)).toStrictEqual(['']); + // The script must not become a real `}
'; - * protectHTMLBlockContent(input) - * // Returns: '' - * ``` - */ -function protectHTMLBlockContent(content: string): string { - return content.replace( - /(]*>)\{\s*`((?:[^`\\]|\\.)*)`\s*\}(<\/HTMLBlock>)/g, - (_match, openTag: string, templateContent: string, closeTag: string) => { - const encoded = base64Encode(templateContent); - return `${openTag}${HTML_BLOCK_CONTENT_START}${encoded}${HTML_BLOCK_CONTENT_END}${closeTag}`; - }, - ); -} - /** * Removes JSX-style comments (e.g., { /* comment *\/ }) from content. * @@ -208,10 +166,9 @@ function escapeProblematicBraces(content: string): string { * @returns Preprocessed content ready for markdown parsing */ export function preprocessJSXExpressions(content: string): string { - let processed = protectHTMLBlockContent(content); - const { protectedCode, protectedContent } = protectCodeBlocks(processed); + const { protectedCode, protectedContent } = protectCodeBlocks(content); - processed = escapeProblematicBraces(protectedContent); + let processed = escapeProblematicBraces(protectedContent); processed = restoreCodeBlocks(processed, protectedCode); return processed; diff --git a/processor/transform/mdxish/rehype-html-blocks-in-jsx.ts b/processor/transform/mdxish/rehype-html-blocks-in-jsx.ts deleted file mode 100644 index 9151e838c..000000000 --- a/processor/transform/mdxish/rehype-html-blocks-in-jsx.ts +++ /dev/null @@ -1,74 +0,0 @@ -import type { Element, ElementContent, Properties, Root } from 'hast'; -import type { Transformer } from 'unified'; - -import { visit } from 'unist-util-visit'; - -import { formatHtmlForMdxish } from '../../utils'; - -import { base64Decode, HTML_BLOCK_CONTENT_END, HTML_BLOCK_CONTENT_START } from './preprocess-jsx-expressions'; - -const startEscaped = HTML_BLOCK_CONTENT_START.replace(/[.*+?^${}()|[\]\\]/g, '\\$&'); -const endEscaped = HTML_BLOCK_CONTENT_END.replace(/[.*+?^${}()|[\]\\]/g, '\\$&'); -// Marker emitted as an HTML comment by preprocessJSXExpressions. 
rehypeRaw parses -// the comment body (sans ), so we match the inner form here. -const COMMENT_MARKER_RE = new RegExp( - `^${startEscaped.replace(/^/g; - -/** - * Rewrites protected HTMLBlock HTML comments into MDX comments so remarkMdx can - * parse a `
` that contains an ``. Length-preserving. - */ -export const neutralizeHtmlBlockComments = (value: string): string => - value.replace(HTML_COMMENT_RE, (_match, marker: string) => `{/*${marker} */}`); - -/** - * Reads the protected payload out of an `` element's MDX comment child - * (an `mdxFlowExpression`/`mdxTextExpression` holding `/*RDMX_HTMLBLOCK:โ€ฆ*\/`). - */ -const extractPayload = (element: MdxJsxFlowElement | MdxJsxTextElement): string | null => { - let payload: string | null = null; - visit(element, expr => expr.type === 'mdxFlowExpression' || expr.type === 'mdxTextExpression', expr => { - const match = (expr as { value?: string }).value?.match(PAYLOAD_RE); - if (match) payload = match[1]; - }); - return payload; -}; - -/** - * Builds the `html-block` hProperties from the `` element's JSX - * attributes (e.g. `safeMode`, `runScripts`), mirroring the rehype fallback. - */ -const collectAttributes = (element: MdxJsxFlowElement | MdxJsxTextElement): Record => { - const props: Record = {}; - element.attributes.forEach(attr => { - if (attr.type === 'mdxJsxAttribute' && typeof attr.value === 'string' && attr.name !== 'html') { - props[attr.name] = attr.value; - } - }); - return props; -}; - -/** - * Converts `` JSX elements inside a re-parsed table into `html-block` - * MDAST nodes, decoding the protected base64 payload. 
- */ -export const convertHtmlBlockElements = (tree: Node) => { - visit( - tree, - node => node.type === 'mdxJsxFlowElement' || node.type === 'mdxJsxTextElement', - (node, index, parent: Parent | undefined) => { - const element = node as MdxJsxFlowElement | MdxJsxTextElement; - if (element.name !== 'HTMLBlock' || !parent || index === undefined) return; - - const payload = extractPayload(element); - if (payload === null) return; - - const html = formatHtmlForMdxish(base64Decode(payload)); - const htmlBlock: HTMLBlock = { - position: element.position, - children: [{ type: 'text', value: html }], - type: NodeTypes.htmlBlock, - data: { - hName: 'html-block', - hProperties: { - html, - ...collectAttributes(element), - }, - }, - }; - - parent.children[index] = htmlBlock; - }, - ); -}; From a394d057734d2ccf18997a48ee0c3cece22127a6 Mon Sep 17 00:00:00 2001 From: eagletrhost Date: Wed, 27 May 2026 15:51:15 +1000 Subject: [PATCH 08/15] chore: simplify function & comments --- .../transform/mdxish/html-block-from-jsx.ts | 52 +++++++------------ 1 file changed, 19 insertions(+), 33 deletions(-) diff --git a/processor/transform/mdxish/html-block-from-jsx.ts b/processor/transform/mdxish/html-block-from-jsx.ts index c34e8e995..8b05e7a1c 100644 --- a/processor/transform/mdxish/html-block-from-jsx.ts +++ b/processor/transform/mdxish/html-block-from-jsx.ts @@ -12,8 +12,7 @@ type HtmlBlockJsx = MdxJsxFlowElement | MdxJsxTextElement; // `{`โ€ฆ`}` embedded inside a raw HTML block (e.g. a // single-line `
โ€ฆ
`). CommonMark slurps the whole div as one `html` -// node, so the tokenizer never sees the HTMLBlock โ€” we recover it here before -// rehypeRaw hands the blob to parse5 (which would mangle the template literal). +// node, so the tokenizer never sees the HTMLBlock โ€” we recover it here const RAW_HTML_BLOCK_RE = /]*)>\s*\{\s*`((?:[^`\\]|\\.)*)`\s*\}\s*<\/HTMLBlock>/g; const stringAttr = (attrs: string, name: string): string | undefined => { @@ -31,33 +30,28 @@ const toRunScripts = (raw: string | undefined): boolean | string | undefined => * Splits a raw `html` node that embeds an `` into * `[html before, html-block, html after, โ€ฆ]`. Returns null when there is no * HTMLBlock to extract, so the caller can leave the node untouched. + * + * `String.split` on a regex with capture groups interleaves the captures into + * the result, so segments arrive as `[text, attrs, body, text, attrs, body, โ€ฆ]`. */ const splitRawHtmlBlocks = (node: Html): RootContent[] | null => { - const { value } = node; - RAW_HTML_BLOCK_RE.lastIndex = 0; - if (!RAW_HTML_BLOCK_RE.exec(value)) return null; + const segments = node.value.split(RAW_HTML_BLOCK_RE); + if (segments.length === 1) return null; // no present const parts: RootContent[] = []; - let lastIndex = 0; - RAW_HTML_BLOCK_RE.lastIndex = 0; - let match: RegExpExecArray | null; - while ((match = RAW_HTML_BLOCK_RE.exec(value)) !== null) { - const [full, attrs, body] = match; - if (match.index > lastIndex) { - parts.push({ type: 'html', value: value.slice(lastIndex, match.index) }); + for (let i = 0; i < segments.length; i += 3) { + const [text, attrs, body] = segments.slice(i, i + 3); + if (text) parts.push({ type: 'html', value: text }); + if (body !== undefined) { + parts.push( + createHTMLBlockNode( + formatHtmlForMdxish(body), + node.position, + toRunScripts(stringAttr(attrs, 'runScripts')), + stringAttr(attrs, 'safeMode'), + ), + ); } - parts.push( - createHTMLBlockNode( - 
formatHtmlForMdxish(extractTemplateLiteral(`\`${body}\``)), - node.position, - toRunScripts(stringAttr(attrs, 'runScripts')), - stringAttr(attrs, 'safeMode'), - ), - ); - lastIndex = match.index + full.length; - } - if (lastIndex < value.length) { - parts.push({ type: 'html', value: value.slice(lastIndex) }); } return parts; }; @@ -78,12 +72,6 @@ const attrValue = (element: HtmlBlockJsx, name: string): string | undefined => { * Converts an `` captured by the mdxComponent tokenizer as a JSX * element into the canonical `html-block` MDAST node, reading the body straight * out of its template-literal expression child. - * - * Runs *before* `mdxishTables` so a table cell containing an `` is - * seen as block-level content and kept as a JSX `
`. This replaces the - * base64-comment marker machinery: the #1455 tokenizer already hands the body - * over as a parsed `mdxFlowExpression`/`mdxTextExpression`, so there is nothing - * to protect or decode. */ const htmlBlockFromJsx = (): Transform => tree => { visit( @@ -99,9 +87,7 @@ const htmlBlockFromJsx = (): Transform => tree => { const html = formatHtmlForMdxish(extractTemplateLiteral(exprChild?.value)); const safeMode = attrValue(element, 'safeMode'); - const runScriptsRaw = attrValue(element, 'runScripts'); - const runScripts = - runScriptsRaw === 'true' ? true : runScriptsRaw === 'false' ? false : runScriptsRaw; + const runScripts = toRunScripts(attrValue(element, 'runScripts')); parent.children[index] = createHTMLBlockNode(html, element.position, runScripts, safeMode); }, From eed8fb3e3f764613fc7930d9a8332dc5f2134c5e Mon Sep 17 00:00:00 2001 From: eagletrhost Date: Wed, 27 May 2026 16:16:48 +1000 Subject: [PATCH 09/15] refactor: unify the htmlblock transformers --- lib/mdxish.ts | 6 +- .../transform/mdxish/html-block-from-jsx.ts | 104 ---- .../transform/mdxish/mdxish-html-blocks.ts | 476 +++++------------- 3 files changed, 136 insertions(+), 450 deletions(-) delete mode 100644 processor/transform/mdxish/html-block-from-jsx.ts diff --git a/lib/mdxish.ts b/lib/mdxish.ts index be1f61e47..1219dfbd8 100644 --- a/lib/mdxish.ts +++ b/lib/mdxish.ts @@ -36,7 +36,6 @@ import { processSnakeCaseComponent } from '../processor/transform/mdxish/compone import evaluateExports from '../processor/transform/mdxish/evaluate-exports'; import evaluateExpressions from '../processor/transform/mdxish/evaluate-expressions'; import generateSlugForHeadings from '../processor/transform/mdxish/heading-slugs'; -import htmlBlockFromJsx from '../processor/transform/mdxish/html-block-from-jsx'; import magicBlockTransformer from '../processor/transform/mdxish/magic-blocks/magic-block-transformer'; import mdxishHtmlBlocks from '../processor/transform/mdxish/mdxish-html-blocks'; import 
mdxishJsxToMdast from '../processor/transform/mdxish/mdxish-jsx-to-mdast'; @@ -202,10 +201,7 @@ export function mdxishAstProcessor(mdContent: string, opts: MdxishOpts = {}) { .use(mdxishInlineMdxHtmlBlocks, { safeMode }) .use(restoreSnakeCaseComponentNames, { mapping: snakeCaseMapping }) .use(mdxishTables) - // After tables: the table cell re-parse (remarkMdx) turns multiline HTMLBlock - // bodies into clean JSX nodes, so convert them here once tables are settled. - .use(htmlBlockFromJsx) // Convert tokenized JSX โ†’ html-block - .use(mdxishHtmlBlocks) + .use(mdxishHtmlBlocks) // Convert every shape โ†’ html-block // The next few transformers must appear after mdxishMdxComponentBlocks // so nodes produced by the inline re-parse of component bodies // (e.g. code/image/embed inside ) get visited too diff --git a/processor/transform/mdxish/html-block-from-jsx.ts b/processor/transform/mdxish/html-block-from-jsx.ts deleted file mode 100644 index 8b05e7a1c..000000000 --- a/processor/transform/mdxish/html-block-from-jsx.ts +++ /dev/null @@ -1,104 +0,0 @@ -import type { Html, Parent, RootContent } from 'mdast'; -import type { Transform } from 'mdast-util-from-markdown'; -import type { MdxJsxFlowElement, MdxJsxTextElement } from 'mdast-util-mdx'; - -import { visit } from 'unist-util-visit'; - -import { formatHtmlForMdxish } from '../../utils'; - -import { createHTMLBlockNode, extractTemplateLiteral } from './mdxish-html-blocks'; - -type HtmlBlockJsx = MdxJsxFlowElement | MdxJsxTextElement; - -// `{`โ€ฆ`}` embedded inside a raw HTML block (e.g. a -// single-line `
โ€ฆ
`). CommonMark slurps the whole div as one `html` -// node, so the tokenizer never sees the HTMLBlock โ€” we recover it here -const RAW_HTML_BLOCK_RE = /]*)>\s*\{\s*`((?:[^`\\]|\\.)*)`\s*\}\s*<\/HTMLBlock>/g; - -const stringAttr = (attrs: string, name: string): string | undefined => { - const quoted = attrs.match(new RegExp(`\\b${name}\\s*=\\s*"([^"]*)"`)); - if (quoted) return quoted[1]; - const expr = attrs.match(new RegExp(`\\b${name}\\s*=\\s*\\{(true|false)\\}`)); - if (expr) return expr[1]; - return new RegExp(`\\b${name}\\b`).test(attrs) ? 'true' : undefined; -}; - -const toRunScripts = (raw: string | undefined): boolean | string | undefined => - raw === 'true' ? true : raw === 'false' ? false : raw; - -/** - * Splits a raw `html` node that embeds an `` into - * `[html before, html-block, html after, โ€ฆ]`. Returns null when there is no - * HTMLBlock to extract, so the caller can leave the node untouched. - * - * `String.split` on a regex with capture groups interleaves the captures into - * the result, so segments arrive as `[text, attrs, body, text, attrs, body, โ€ฆ]`. - */ -const splitRawHtmlBlocks = (node: Html): RootContent[] | null => { - const segments = node.value.split(RAW_HTML_BLOCK_RE); - if (segments.length === 1) return null; // no present - - const parts: RootContent[] = []; - for (let i = 0; i < segments.length; i += 3) { - const [text, attrs, body] = segments.slice(i, i + 3); - if (text) parts.push({ type: 'html', value: text }); - if (body !== undefined) { - parts.push( - createHTMLBlockNode( - formatHtmlForMdxish(body), - node.position, - toRunScripts(stringAttr(attrs, 'runScripts')), - stringAttr(attrs, 'safeMode'), - ), - ); - } - } - return parts; -}; - -/** - * Reads a JSX attribute value as a string. Handles `name="x"`, `name={expr}` - * (returns the raw expression source) and bare boolean attributes (`runScripts`). 
- */ -const attrValue = (element: HtmlBlockJsx, name: string): string | undefined => { - const attr = element.attributes.find(a => a.type === 'mdxJsxAttribute' && a.name === name); - if (!attr || attr.type !== 'mdxJsxAttribute') return undefined; - if (typeof attr.value === 'string') return attr.value; - if (attr.value && typeof attr.value === 'object' && 'value' in attr.value) return attr.value.value; - return 'true'; // bare boolean attribute, e.g. -}; - -/** - * Converts an `` captured by the mdxComponent tokenizer as a JSX - * element into the canonical `html-block` MDAST node, reading the body straight - * out of its template-literal expression child. - */ -const htmlBlockFromJsx = (): Transform => tree => { - visit( - tree, - node => node.type === 'mdxJsxFlowElement' || node.type === 'mdxJsxTextElement', - (node, index, parent: Parent | undefined) => { - const element = node as HtmlBlockJsx; - if (element.name !== 'HTMLBlock' || !parent || index === undefined) return; - - const exprChild = element.children.find( - child => child.type === 'mdxFlowExpression' || child.type === 'mdxTextExpression', - ) as { value?: string } | undefined; - const html = formatHtmlForMdxish(extractTemplateLiteral(exprChild?.value)); - - const safeMode = attrValue(element, 'safeMode'); - const runScripts = toRunScripts(attrValue(element, 'runScripts')); - - parent.children[index] = createHTMLBlockNode(html, element.position, runScripts, safeMode); - }, - ); - - // Recover HTMLBlocks embedded inside raw HTML blocks (e.g. inline `
`). - visit(tree, 'html', (node: Html, index, parent: Parent | undefined) => { - if (!parent || index === undefined) return; - const replacement = splitRawHtmlBlocks(node); - if (replacement) parent.children.splice(index, 1, ...(replacement as typeof parent.children)); - }); -}; - -export default htmlBlockFromJsx; diff --git a/processor/transform/mdxish/mdxish-html-blocks.ts b/processor/transform/mdxish/mdxish-html-blocks.ts index 101581726..8607a88c4 100644 --- a/processor/transform/mdxish/mdxish-html-blocks.ts +++ b/processor/transform/mdxish/mdxish-html-blocks.ts @@ -1,380 +1,174 @@ import type { HTMLBlock } from '../../../types'; -import type { Paragraph, Parent } from 'mdast'; +import type { Html, Paragraph, Parent, RootContent } from 'mdast'; import type { Transform } from 'mdast-util-from-markdown'; +import type { MdxJsxFlowElement, MdxJsxTextElement } from 'mdast-util-mdx'; import { visit } from 'unist-util-visit'; import { NodeTypes } from '../../../enums'; import { formatHtmlForMdxish } from '../../utils'; -/** - * Reads the cooked string out of a tokenized brace expression that wraps a - * single template literal (e.g. `` `

n

` `` → `

n

`). The #1455 - * mdxComponent tokenizer hands HTMLBlock bodies through as - * mdxTextExpression/mdxFlowExpression nodes, so we no longer need the - * base64-comment marker to recover them. - */ -export function extractTemplateLiteral(value: string | undefined): string { - if (!value) return ''; - const trimmed = value.trim(); - const match = trimmed.match(/^`([\s\S]*)`$/); - return match ? match[1] : trimmed; -} +type HtmlBlockJsx = MdxJsxFlowElement | MdxJsxTextElement; -/** - * Collects text content from a node and its children recursively - */ -function collectTextContent(node: { children?: unknown[]; lang?: string; type?: string; value?: string }): string { - const parts: string[] = []; - - if (node.type === 'text' && node.value) { - parts.push(node.value); - } else if ((node.type === 'mdxTextExpression' || node.type === 'mdxFlowExpression') && node.value) { - parts.push(extractTemplateLiteral(node.value)); - } else if (node.type === 'html' && node.value) { - parts.push(node.value); - } else if (node.type === 'inlineCode' && node.value) { - parts.push(node.value); - } else if (node.type === 'code' && node.value) { - // Reconstruct code fence syntax (markdown parser consumes opening ```) - const lang = node.lang || ''; - const fence = `\`\`\`${lang ? `${lang}\n` : ''}`; - parts.push(fence); - parts.push(node.value); - // Add newline before closing fence if missing - const closingFence = node.value.endsWith('\n') ? '```' : '\n```'; - parts.push(closingFence); - } else if (node.children && Array.isArray(node.children)) { - node.children.forEach(child => { - if (typeof child === 'object' && child !== null) { - parts.push(collectTextContent(child as { children?: unknown[]; lang?: string; type?: string; value?: string })); - } - }); - } - - return parts.join(''); -} - -/** - * Extracts boolean attribute from HTML tag. Handles JSX (safeMode={true}) and string (safeMode="true") syntax. - * Returns "true"/"false" string to survive rehypeRaw serialization. 
- */ -function extractBooleanAttr(attrs: string, name: string): string | undefined { - // Try JSX syntax: name={true|false} - const jsxMatch = attrs.match(new RegExp(`${name}=\\{(true|false)\\}`)); - if (jsxMatch) { - return jsxMatch[1]; - } - // Try string syntax: name="true"|true - const stringMatch = attrs.match(new RegExp(`${name}="?(true|false)"?`)); - if (stringMatch) { - return stringMatch[1]; - } - return undefined; -} - -/** - * Extracts runScripts attribute from HTML tag. Returns boolean for "true"/"false", string for other values, or undefined if not found. - */ -function extractRunScriptsAttr(attrs: string): boolean | string | undefined { - const runScriptsMatch = attrs.match(/runScripts="?([^">\s]+)"?/); - if (!runScriptsMatch) { - return undefined; - } - const value = runScriptsMatch[1]; - if (value === 'true') { - return true; - } - if (value === 'false') { - return false; - } - return value; -} +// `{`โ€ฆ`}` embedded inside a raw HTML block (e.g. a +// single-line `
…
`). CommonMark slurps the whole div as one `html` +// node, so the tokenizer never sees the HTMLBlock โ€” we recover it here. +const RAW_HTML_BLOCK_RE = /]*)>\s*\{\s*`((?:[^`\\]|\\.)*)`\s*\}\s*<\/HTMLBlock>/g; +// Opening `` as its own `html` node โ€” produced inside a paragraph +// when an HTMLBlock appears inline alongside text. +const HTML_BLOCK_OPEN_RE = /^]*)>$/; /** - * Creates an HTMLBlock node from HTML string and optional attributes + * Builds the canonical `html-block` MDAST node the renderer expects. */ -export function createHTMLBlockNode( - htmlString: string, +const createHtmlBlockNode = ( + html: string, position: HTMLBlock['position'], runScripts?: boolean | string, safeMode?: string, -): HTMLBlock { - return { - position, - children: [{ type: 'text', value: htmlString }], - type: NodeTypes.htmlBlock, - data: { - hName: 'html-block', - hProperties: { - html: htmlString, - ...(runScripts !== undefined && { runScripts }), - ...(safeMode !== undefined && { safeMode }), - }, +): HTMLBlock => ({ + position, + children: [{ type: 'text', value: html }], + type: NodeTypes.htmlBlock, + data: { + hName: 'html-block', + hProperties: { + html, + ...(runScripts !== undefined && { runScripts }), + ...(safeMode !== undefined && { safeMode }), }, - }; -} + }, +}); /** - * Checks for opening tag only (for split detection) + * Reads the cooked string out of a brace expression wrapping a single template + * literal (`` `

n

` `` → `

n

`). */ -function hasOpeningTagOnly(node: { children?: unknown[]; type?: string; value?: string }): { - attrs: string; - found: boolean; -} { - let hasOpening = false; - let hasClosed = false; - let attrs = ''; +const extractTemplateLiteral = (value: string | undefined): string => { + if (!value) return ''; + const trimmed = value.trim(); + const match = trimmed.match(/^`([\s\S]*)`$/); + return match ? match[1] : trimmed; +}; - const check = (n: { children?: unknown[]; type?: string; value?: string }) => { - if (n.type === 'html' && n.value) { - if (n.value === '') { - hasOpening = true; - } else { - const match = n.value.match(/^]*)?>$/); - if (match) { - hasOpening = true; - attrs = match[1] || ''; - } - } - if (n.value === '' || n.value.includes('
')) { - hasClosed = true; - } - } - if (n.children && Array.isArray(n.children)) { - n.children.forEach(child => { - check(child as { children?: unknown[]; type?: string; value?: string }); - }); - } - }; +const toRunScripts = (raw: string | undefined): boolean | string | undefined => + raw === 'true' ? true : raw === 'false' ? false : raw; - check(node); - // Return true only if opening without closing (split case) - return { attrs, found: hasOpening && !hasClosed }; -} +/** Reads an attribute from a raw `` attribute string. */ +const rawAttr = (attrs: string, name: string): string | undefined => { + const quoted = attrs.match(new RegExp(`\\b${name}\\s*=\\s*"([^"]*)"`)); + if (quoted) return quoted[1]; + const expr = attrs.match(new RegExp(`\\b${name}\\s*=\\s*\\{(true|false)\\}`)); + if (expr) return expr[1]; + return new RegExp(`\\b${name}\\b`).test(attrs) ? 'true' : undefined; +}; + +/** Reads an attribute from a parsed `` JSX element. */ +const jsxAttr = (element: HtmlBlockJsx, name: string): string | undefined => { + const attr = element.attributes.find(a => a.type === 'mdxJsxAttribute' && a.name === name); + if (!attr || attr.type !== 'mdxJsxAttribute') return undefined; + if (typeof attr.value === 'string') return attr.value; + if (attr.value && typeof attr.value === 'object' && 'value' in attr.value) return attr.value.value; + return 'true'; // bare boolean attribute, e.g. +}; + +/** Builds an `html-block` from a raw attribute string and (unparsed) body. */ +const htmlBlockFromRaw = (attrs: string, html: string, position: HTMLBlock['position']): HTMLBlock => + createHtmlBlockNode(formatHtmlForMdxish(html), position, toRunScripts(rawAttr(attrs, 'runScripts')), rawAttr(attrs, 'safeMode')); /** - * Checks if a node contains an HTMLBlock closing tag + * Splits a raw `html` node that embeds one or more ``s into + * `[html before, html-block, html after, โ€ฆ]`. Returns null when there is none. 
+ * + * `String.split` on a regex with capture groups interleaves the captures into + * the result, so segments arrive as `[text, attrs, body, text, attrs, body, โ€ฆ]`. */ -function hasClosingTag(node: { children?: unknown[]; type?: string; value?: string }): boolean { - if (node.type === 'html' && node.value) { - if (node.value === '' || node.value.includes('')) return true; - } - if (node.children && Array.isArray(node.children)) { - return node.children.some(child => hasClosingTag(child as { children?: unknown[]; type?: string; value?: string })); +const splitRawHtmlBlocks = (node: Html): RootContent[] | null => { + const segments = node.value.split(RAW_HTML_BLOCK_RE); + if (segments.length === 1) return null; // no present + + const parts: RootContent[] = []; + for (let i = 0; i < segments.length; i += 3) { + const [text, attrs, body] = segments.slice(i, i + 3); + if (text) parts.push({ type: 'html', value: text }); + if (body !== undefined) parts.push(htmlBlockFromRaw(attrs, body, node.position)); } - return false; -} + return parts; +}; /** - * Transforms HTMLBlock MDX JSX to html-block nodes. Handles {`...`} syntax. + * Converts every `` shape that survives parsing into the canonical + * `html-block` MDAST node, reading the body from the tokenizer's template-literal + * expression. Three shapes occur: + * + * 1. JSX element (`mdxJsxFlowElement`/`mdxJsxTextElement`) โ€” multiline/block + * context and table cells (after their remarkMdx re-parse). + * 2. Raw `html` blob (`splitRawHtmlBlocks`) โ€” single-line top-level, or nested + * in raw HTML like an inline `
`. + * 3. Inline-in-paragraph — split into `html` + expression + `html` siblings. + * + * Runs *after* `mdxishTables` so table cells are re-parsed first; + * `mdxishTables` recognizes the still-JSX `` element when deciding to + * keep a table as a JSX `
`. This replaces the old base64-comment marker + * machinery โ€” the #1455 tokenizer hands the body over already parsed. */ const mdxishHtmlBlocks = (): Transform => tree => { - // Handle HTMLBlock split across root children (caused by newlines) - visit(tree, 'root', (root: Parent) => { - const children = root.children; - let i = 0; - - while (i < children.length) { - const child = children[i] as { children?: unknown[]; type?: string; value?: string }; - const { attrs, found: hasOpening } = hasOpeningTagOnly(child); - - if (hasOpening) { - // Find closing tag in subsequent siblings - let closingIdx = -1; - for (let j = i + 1; j < children.length; j += 1) { - if (hasClosingTag(children[j] as { children?: unknown[]; type?: string; value?: string })) { - closingIdx = j; - break; - } - } - - if (closingIdx !== -1) { - // Collect inner content between tags - const contentParts: string[] = []; - for (let j = i; j <= closingIdx; j += 1) { - const node = children[j] as { children?: unknown[]; type?: string; value?: string }; - contentParts.push(collectTextContent(node)); - } - - // Remove the opening/closing tags and template literal syntax from content - let content = contentParts.join(''); - content = content.replace(/^]*>\s*\{?\s*`?/, '').replace(/`?\s*\}?\s*<\/HTMLBlock>$/, ''); - - const htmlString = formatHtmlForMdxish(content); - const runScripts = extractRunScriptsAttr(attrs); - const safeMode = extractBooleanAttr(attrs, 'safeMode'); - - // Replace range with single HTMLBlock node - const mdNode = createHTMLBlockNode( - htmlString, - (children[i] as { position?: unknown }).position as HTMLBlock['position'], - runScripts, - safeMode, - ); - root.children.splice(i, closingIdx - i + 1, mdNode); - } - } - i += 1; - } - }); + // Shape 1: tokenized JSX element. 
+ visit( + tree, + node => node.type === 'mdxJsxFlowElement' || node.type === 'mdxJsxTextElement', + (node, index, parent: Parent | undefined) => { + const element = node as HtmlBlockJsx; + if (element.name !== 'HTMLBlock' || !parent || index === undefined) return; + + const exprChild = element.children.find( + child => child.type === 'mdxFlowExpression' || child.type === 'mdxTextExpression', + ) as { value?: string } | undefined; + + parent.children[index] = createHtmlBlockNode( + formatHtmlForMdxish(extractTemplateLiteral(exprChild?.value)), + element.position, + toRunScripts(jsxAttr(element, 'runScripts')), + jsxAttr(element, 'safeMode'), + ); + }, + ); - // Handle HTMLBlock parsed as HTML elements (when template literal contains block-level HTML tags) - visit(tree, 'html', (node, index, parent: Parent | undefined) => { + // Shape 2: raw HTML blob. + visit(tree, 'html', (node: Html, index, parent: Parent | undefined) => { if (!parent || index === undefined) return; - - const value = (node as { value?: string }).value; - if (!value) return; - - // Case 1: Full HTMLBlock in single node - const fullMatch = value.match(/^]*)?>([\s\S]*)<\/HTMLBlock>$/); - if (fullMatch) { - const attrs = fullMatch[1] || ''; - let content = fullMatch[2] || ''; - - // Remove template literal syntax if present: {`...`} - content = content.replace(/^\s*\{\s*`/, '').replace(/`\s*\}\s*$/, ''); - - const htmlString = formatHtmlForMdxish(content); - const runScripts = extractRunScriptsAttr(attrs); - const safeMode = extractBooleanAttr(attrs, 'safeMode'); - - parent.children[index] = createHTMLBlockNode(htmlString, node.position, runScripts, safeMode); - return; - } - - // Case 2: Opening tag only (split by blank lines) - if (value === '' || value.match(/^]*>$/)) { - const siblings = parent.children; - let closingIdx = -1; - - // Find closing tag in siblings - for (let i = index + 1; i < siblings.length; i += 1) { - const sibling = siblings[i]; - if (sibling.type === 'html') { - const sibVal 
= (sibling as { value?: string }).value; - if (sibVal === '' || sibVal?.includes('')) { - closingIdx = i; - break; - } - } - } - - if (closingIdx === -1) return; - - // Collect content between tags, skipping template literal delimiters - const contentParts: string[] = []; - for (let i = index + 1; i < closingIdx; i += 1) { - const sibling = siblings[i]; - // Skip template literal delimiters - if (sibling.type === 'text') { - const textVal = (sibling as { value?: string }).value; - if (textVal === '{' || textVal === '}' || textVal === '{`' || textVal === '`}') { - // eslint-disable-next-line no-continue - continue; - } - } - contentParts.push(collectTextContent(sibling as { children?: unknown[]; type?: string; value?: string })); - } - - const htmlString = formatHtmlForMdxish(contentParts.join('')); - const runScripts = extractRunScriptsAttr(value); - const safeMode = extractBooleanAttr(value, 'safeMode'); - - // Replace opening tag with HTMLBlock node, remove consumed siblings - parent.children[index] = createHTMLBlockNode(htmlString, node.position, runScripts, safeMode); - parent.children.splice(index + 1, closingIdx - index); - } + const replacement = splitRawHtmlBlocks(node); + if (replacement) parent.children.splice(index, 1, ...(replacement as typeof parent.children)); }); - // Handle HTMLBlock inside paragraphs (parsed as inline elements) - visit(tree, 'paragraph', (node: Paragraph, index, parent: Parent | undefined) => { - if (!parent || index === undefined) return; - - const children = node.children || []; - - let htmlBlockStartIdx = -1; - let htmlBlockEndIdx = -1; - let templateLiteralStartIdx = -1; - let templateLiteralEndIdx = -1; - + // Shape 3: inline within a paragraph โ€” `` open/close arrive as + // separate `html` siblings with the template-literal expression between them. 
+ visit(tree, 'paragraph', (paragraph: Paragraph) => { + // An html-block is block content, so it isn't a valid PhrasingContent child; + // widen to RootContent (which HTMLBlock belongs to) for the in-place splice. + const children = paragraph.children as RootContent[]; for (let i = 0; i < children.length; i += 1) { - const child = children[i]; - - if (child.type === 'html' && typeof (child as { value?: string }).value === 'string') { - const value = (child as { value: string }).value; - if (value === '' || value.match(/^]*>$/)) { - htmlBlockStartIdx = i; - } else if (value === '') { - htmlBlockEndIdx = i; - } - } - - // Find opening brace after HTMLBlock start - if (htmlBlockStartIdx !== -1 && templateLiteralStartIdx === -1 && child.type === 'text') { - const value = (child as { value?: string }).value; - if (value === '{') { - templateLiteralStartIdx = i; - } - } - - // Find closing brace before HTMLBlock end - if (htmlBlockStartIdx !== -1 && htmlBlockEndIdx === -1 && child.type === 'text') { - const value = (child as { value?: string }).value; - if (value === '}') { - templateLiteralEndIdx = i; - } - } - } - - if ( - htmlBlockStartIdx !== -1 && - htmlBlockEndIdx !== -1 && - templateLiteralStartIdx !== -1 && - templateLiteralEndIdx !== -1 && - templateLiteralStartIdx < templateLiteralEndIdx - ) { - const openingTag = children[htmlBlockStartIdx] as { value?: string }; - - // Collect content between braces (handles code blocks) - const templateContent: string[] = []; - for (let i = templateLiteralStartIdx + 1; i < templateLiteralEndIdx; i += 1) { - const child = children[i]; - templateContent.push( - collectTextContent(child as { children?: unknown[]; lang?: string; type?: string; value?: string }), - ); - } - - const htmlString = formatHtmlForMdxish(templateContent.join('')); - - const runScripts = openingTag.value ? extractRunScriptsAttr(openingTag.value) : undefined; - const safeMode = openingTag.value ? 
extractBooleanAttr(openingTag.value, 'safeMode') : undefined; - - const mdNode = createHTMLBlockNode(htmlString, node.position, runScripts, safeMode); - - parent.children[index] = mdNode; + const open = children[i]; + const openMatch = open.type === 'html' ? open.value.match(HTML_BLOCK_OPEN_RE) : null; + if (!openMatch) continue; // eslint-disable-line no-continue + + const closeIdx = children.findIndex( + (child, j) => j > i && child.type === 'html' && child.value === '', + ); + if (closeIdx === -1) continue; // eslint-disable-line no-continue + + const body = children + .slice(i + 1, closeIdx) + .map(child => + child.type === 'mdxTextExpression' || child.type === 'mdxFlowExpression' + ? extractTemplateLiteral(child.value) + : '', + ) + .join(''); + + children.splice(i, closeIdx - i + 1, htmlBlockFromRaw(openMatch[1], body, open.position)); } }); - - // Ensure html-block nodes have HTML in children as text node - visit(tree, 'html-block', (node: HTMLBlock) => { - const html = node.data?.hProperties?.html; - if ( - html && - (!node.children || - node.children.length === 0 || - (node.children.length === 1 && node.children[0].type === 'text' && node.children[0].value !== html)) - ) { - node.children = [ - { - type: 'text', - value: html, - }, - ]; - } - }); - - return tree; }; export default mdxishHtmlBlocks; From 3988f96f4828d911cfdaf90581f82ca3e88b062c Mon Sep 17 00:00:00 2001 From: eagletrhost Date: Thu, 28 May 2026 18:24:54 +1000 Subject: [PATCH 10/15] test: integrate tokenizer pr tests here, split with transformation --- __tests__/compilers/html-block.test.ts | 204 +--------------- __tests__/lib/mdxish/html-blocks.test.ts | 227 ++++++++++++++---- .../transformers/mdxish-html-blocks.test.ts | 121 ++++++++++ 3 files changed, 299 insertions(+), 253 deletions(-) create mode 100644 __tests__/transformers/mdxish-html-blocks.test.ts diff --git a/__tests__/compilers/html-block.test.ts b/__tests__/compilers/html-block.test.ts index 94880a0b2..7b62977fb 100644 --- 
a/__tests__/compilers/html-block.test.ts +++ b/__tests__/compilers/html-block.test.ts @@ -1,15 +1,4 @@ -import type { Element } from 'hast'; - -import { mdast, mdx, mdxish } from '../../index'; - -function findHTMLBlock(element: Element): Element | undefined { - if (element.tagName === 'HTMLBlock' || element.tagName === 'html-block') { - return element; - } - return element.children - .filter((child): child is Element => child.type === 'element') - .reduce((found, child) => found || findHTMLBlock(child), undefined); -} +import { mdast, mdx } from '../../index'; describe('html-block compiler', () => { it('compiles html blocks within containers', () => { @@ -51,194 +40,3 @@ const foo = () => { expect(mdx(mdast(markdown)).trim()).toBe(expected.trim()); }); }); - -describe('mdxish html-block compiler', () => { - it('compiles html blocks within containers', () => { - const markdown = ` -> ๐Ÿšง It compiles! -> -> {\` -> Hello, World! -> \`} -`; - - const hast = mdxish(markdown.trim()); - const callout = hast.children[0] as Element; - - expect(callout.type).toBe('element'); - expect(callout.tagName).toBe('Callout'); - - // Find HTMLBlock within the callout - const htmlBlock = findHTMLBlock(callout); - expect(htmlBlock).toBeDefined(); - expect(htmlBlock?.tagName).toBe('html-block'); - }); - - it('compiles html blocks preserving newlines', () => { - const markdown = ` -{\` -

-const foo = () => {
-  const bar = {
-    baz: 'blammo'
-  }
-
-  return bar
-}
-
-\`}
-`; - - const hast = mdxish(markdown.trim()); - const paragraph = hast.children[0] as Element; - - expect(paragraph.type).toBe('element'); - const htmlBlock = findHTMLBlock(paragraph); - expect(htmlBlock).toBeDefined(); - expect(htmlBlock?.tagName).toBe('html-block'); - }); - - it('adds newlines for readability', () => { - const markdown = '{`

Hello, World!

`}
'; - - const hast = mdxish(markdown); - const paragraph = hast.children[0] as Element; - - expect(paragraph.type).toBe('element'); - const htmlBlock = findHTMLBlock(paragraph); - expect(htmlBlock).toBeDefined(); - expect(htmlBlock?.tagName).toBe('html-block'); - }); - - it('unescapes backticks in HTML content', () => { - const markdown = '{`\\`example\\``}'; - - const hast = mdxish(markdown); - const paragraph = hast.children[0] as Element; - - expect(paragraph.type).toBe('element'); - const htmlBlock = findHTMLBlock(paragraph); - expect(htmlBlock).toBeDefined(); - expect(htmlBlock?.tagName).toBe('html-block'); - - // Verify that escaped backticks \` are unescaped to ` in the HTML - const htmlProp = htmlBlock?.properties?.html as string; - expect(htmlProp).toBeDefined(); - expect(htmlProp).toContain('`example`'); - expect(htmlProp).not.toContain('\\`'); - }); - - it('passes safeMode property correctly', () => { - // Test with both JSX expression and string syntax - const markdown = '{`

Content

`}
'; - - const hast = mdxish(markdown); - const paragraph = hast.children[0] as Element; - - expect(paragraph.type).toBe('element'); - const htmlBlock = findHTMLBlock(paragraph); - expect(htmlBlock).toBeDefined(); - - const allProps = htmlBlock?.properties; - expect(allProps).toBeDefined(); - - const safeMode = allProps?.safeMode; - expect(safeMode).toBe('true'); - - // Verify that html property is still present (for safeMode to render as escaped text) - const htmlProp = allProps?.html as string; - expect(htmlProp).toBeDefined(); - expect(htmlProp).toContain(''); - expect(htmlProp).toContain('

Content

'); - }); - - it('should handle template literal with variables', () => { - // eslint-disable-next-line quotes - const markdown = `{\`const x = \${variable}\`}`; - - const hast = mdxish(markdown); - const paragraph = hast.children[0] as Element; - - expect(paragraph.type).toBe('element'); - const htmlBlock = findHTMLBlock(paragraph); - expect(htmlBlock).toBeDefined(); - // eslint-disable-next-line no-template-curly-in-string - expect(htmlBlock?.properties?.html).toBe('const x = ${variable}'); - }); - - it('should handle nested template literals', () => { - // Use a regular string to avoid nested template literal syntax error - // The content should be:
```javascript\nconst x = 1;\n```
- const markdown = '{`
\\`\\`\\`javascript\\nconst x = 1;\\n\\`\\`\\`
`}
'; - - const hast = mdxish(markdown); - const paragraph = hast.children[0] as Element; - - expect(paragraph.type).toBe('element'); - const htmlBlock = findHTMLBlock(paragraph); - expect(htmlBlock).toBeDefined(); - - // Verify that the HTML content is preserved correctly with newlines - const htmlProp = htmlBlock?.properties?.html as string; - expect(htmlProp).toBeDefined(); - - // The expected content should have triple backticks - expect(htmlProp).toBe('
```javascript\nconst x = 1;\n```
'); - }); - - it('expands \\n only inside
/, not in plain text after tags', () => {
-    const markdown = [
-      '{`',
-      '
qerq3er \\n qerreqqe
', - 'qerq3er \\n qerreqqe', - 'hello \\n world', - '`}
', - ].join('\n'); - - const hast = mdxish(markdown); - const paragraph = hast.children[0] as Element; - - expect(paragraph.type).toBe('element'); - const htmlBlock = findHTMLBlock(paragraph); - expect(htmlBlock).toBeDefined(); - - const htmlProp = htmlBlock?.properties?.html as string; - expect(htmlProp).toBeDefined(); - - // Literal `\n` expands to real newlines only inside
 / .
-    expect(htmlProp).toBe(
-      '
qerq3er \n qerreqqe
\nqerq3er \n qerreqqe\n\nhello \\n world', - ); - // Must not turn the plain-text `hello \n world` into a line break between words. - expect(htmlProp).toContain('hello \\n world'); - expect(htmlProp).not.toMatch(/hello \n world/); // space + LF + space (wrong) - }); - - it('preserves \\n escape sequences inside ', - '', - '`}', - ].join('\n'); - - const hast = mdxish(markdown); - const paragraph = hast.children[0] as Element; - - expect(paragraph.type).toBe('element'); - const htmlBlock = findHTMLBlock(paragraph); - expect(htmlBlock).toBeDefined(); - - const htmlProp = htmlBlock?.properties?.html as string; - expect(htmlProp).toBeDefined(); - - // The `\n` inside the JS string literal must survive as the two-byte escape - // sequence so eval() sees a well-formed JS string. A real LF here would break it. - expect(htmlProp).toContain('var x = "hello\\nworld";'); - expect(htmlProp).not.toContain('var x = "hello\nworld";'); - }); -}); diff --git a/__tests__/lib/mdxish/html-blocks.test.ts b/__tests__/lib/mdxish/html-blocks.test.ts index fe0707a7d..d5d517e79 100644 --- a/__tests__/lib/mdxish/html-blocks.test.ts +++ b/__tests__/lib/mdxish/html-blocks.test.ts @@ -26,7 +26,7 @@ function expectFullyConverted(tree: ReturnType) { expect(JSON.stringify(tree)).not.toContain('RDMX_HTMLBLOCK'); } -describe(' in mdxish', () => { +describe(' parsing', () => { describe('standalone', () => { it('renders as with the decoded html prop', () => { const tree = mdxish('{`
Hello
`}
'); @@ -40,6 +40,84 @@ describe(' in mdxish', () => { }); }); + it('renders between surrounding paragraphs', () => { + const tree = mdxish('text before\n\n{`
x
`}
\n\ntext after'); + + expect(htmlBlockPayloads(tree)).toStrictEqual(['
x
']); + const json = JSON.stringify(tree); + expect(json).toContain('text before'); + expect(json).toContain('text after'); + }); + + it('renders after a markdown heading', () => { + const tree = mdxish('# Heading\n\n{`

after heading

`}
'); + + expect(htmlBlockPayloads(tree)).toStrictEqual(['

after heading

']); + expect(findElementByTagName(tree, 'h1')).not.toBeNull(); + }); + + it('renders two consecutive top-level HTMLBlocks', () => { + const tree = mdxish('{`
one
`}
\n\n{`
two
`}
'); + + expect(htmlBlockPayloads(tree)).toStrictEqual(['
one
', '
two
']); + }); + + it('renders inline within a paragraph alongside text', () => { + const tree = mdxish('Inline {`x`} text'); + + expect(htmlBlockPayloads(tree)).toStrictEqual(['x']); + const json = JSON.stringify(tree); + expect(json).toContain('Inline'); + expect(json).toContain('text'); + }); + + it('compiles html blocks preserving newlines', () => { + const markdown = `{\` +

+const foo = () => {
+  const bar = {
+    baz: 'blammo'
+  }
+
+  return bar
+}
+
+\`}
`; + + const tree = mdxish(markdown); + const htmlBlock = findElementByTagName(tree, 'html-block'); + expect(htmlBlock).toMatchObject({ tagName: 'html-block' }); + + const htmlProp = htmlBlock?.properties?.html as string; + expect(htmlProp).toContain('
');
+      expect(htmlProp).toContain('const foo = () => {');
+      expect(htmlProp).toContain("baz: 'blammo'");
+      expect(htmlProp).toContain('
'); + }); + + it('handles standalone multiline HTMLBlock with surrounding paragraphs', () => { + const markdown = `Hello + +{\` +

Hello, World!

+\`}
+ +there`; + const tree = mdxish(markdown); + const htmlBlock = findElementByTagName(tree, 'html-block'); + expect(htmlBlock).toMatchObject({ tagName: 'html-block' }); + expect(htmlBlock?.properties?.html).toContain('Hello, World!

'); + }); + + it('handles nested HTMLBlock tags in content', () => { + const tree = mdxish('{`{Hello}`}'); + const htmlBlock = findElementByTagName(tree, 'html-block'); + expect(htmlBlock).toMatchObject({ tagName: 'html-block' }); + expect(htmlBlock?.properties?.html).toContain('{Hello}'); + }); + }); + + describe('content formatting', () => { it('preserves multiline HTML content verbatim', () => { const tree = mdxish('{`
\n multi\n
`}
'); @@ -67,39 +145,64 @@ describe(' in mdxish', () => { expect(htmlBlockPayloads(tree)).toStrictEqual(['
{notTemplate}
']); }); - it('renders between surrounding paragraphs', () => { - const tree = mdxish('text before\n\n{`
x
`}
\n\ntext after'); - - expect(htmlBlockPayloads(tree)).toStrictEqual(['
x
']); - const json = JSON.stringify(tree); - expect(json).toContain('text before'); - expect(json).toContain('text after'); + it('adds newlines for readability', () => { + const hast = mdxish('{`

Hello, World!

`}
'); + const htmlBlock = findElementByTagName(hast, 'html-block'); + expect(htmlBlock).toMatchObject({ + tagName: 'html-block', + properties: { html: '

Hello, World!

' }, + }); }); - it('renders after a markdown heading', () => { - const tree = mdxish('# Heading\n\n{`

after heading

`}
'); - - expect(htmlBlockPayloads(tree)).toStrictEqual(['

after heading

']); - expect(findElementByTagName(tree, 'h1')).not.toBeNull(); + it('unescapes backticks in HTML content', () => { + const hast = mdxish('{`\\`example\\``}'); + const htmlBlock = findElementByTagName(hast, 'html-block'); + expect(htmlBlock).toMatchObject({ + tagName: 'html-block', + properties: { html: '`example`' }, + }); }); - it('renders two consecutive top-level HTMLBlocks', () => { - const tree = mdxish('{`
one
`}
\n\n{`
two
`}
'); + it('passes safeMode property correctly', () => { + const hast = mdxish('{`

Content

`}
'); + const htmlBlock = findElementByTagName(hast, 'html-block'); + expect(htmlBlock).toMatchObject({ + tagName: 'html-block', + properties: { safeMode: 'true', html: '

Content

' }, + }); + }); - expect(htmlBlockPayloads(tree)).toStrictEqual(['
one
', '
two
']); + it('handles template literal with variables', () => { + // eslint-disable-next-line quotes + const hast = mdxish(`{\`const x = \${variable}\`}`); + const htmlBlock = findElementByTagName(hast, 'html-block'); + expect(htmlBlock).toMatchObject({ + tagName: 'html-block', + // eslint-disable-next-line no-template-curly-in-string + properties: { html: 'const x = ${variable}' }, + }); }); - it('renders inline within a paragraph alongside text', () => { - const tree = mdxish('Inline {`x`} text'); + it('handles nested template literals', () => { + const hast = mdxish('{`
\\`\\`\\`javascript\\nconst x = 1;\\n\\`\\`\\`
`}
'); + const htmlBlock = findElementByTagName(hast, 'html-block'); + expect(htmlBlock).toMatchObject({ + tagName: 'html-block', + properties: { html: '
```javascript\nconst x = 1;\n```
' }, + }); + }); - expect(htmlBlockPayloads(tree)).toStrictEqual(['x']); - const json = JSON.stringify(tree); - expect(json).toContain('Inline'); - expect(json).toContain('text'); + it('handles trailing whitespace after closing tag', () => { + const hast = mdxish('{`
hello
`}
'); + const htmlBlock = findElementByTagName(hast, 'html-block'); + expect(htmlBlock).toMatchObject({ + tagName: 'html-block', + properties: { html: '
hello
' }, + }); }); }); - describe('inside generic HTML tags', () => { + describe('inside generic HTML tags & markdown', () => { it('renders inside a
with the decoded html prop', () => { const tree = mdxish('
{`

nested

`}
'); @@ -132,16 +235,60 @@ describe(' in mdxish', () => { expect(htmlBlockPayloads(tree)).toStrictEqual(['

n

']); expect(findElementByTagName(tree, 'li')).not.toBeNull(); }); + + it('renders inside callout blockquotes', () => { + const md = `> ๐Ÿšง It compiles! +> +> {\` +> Hello, World! +> \`}`; + + const tree = mdxish(md); + const callout = tree.children[0] as Element; + + expect(callout.tagName).toBe('Callout'); + + const htmlBlock = findElementByTagName(tree, 'html-block'); + expect(htmlBlock).toMatchObject({ + tagName: 'html-block', + properties: { html: ' Hello, World!' }, + }); + }); + + it('does not render inside code blocks', () => { + const md = '```{`

n

`}
```'; + + const tree = mdxish(md); + const htmlBlock = findElementByTagName(tree, 'html-block'); + expect(htmlBlock).toBeNull(); + }); }); describe('inside ReadMe components', () => { - it('renders inside a ', () => { - const tree = mdxish( - '\n\n{`
\n

n

\n\n

m

\n
`}
\n\n
', - ); + describe('callouts', () => { + it('handles HTMLBlock in an empty callout (no title text)', () => { + const markdown = `> ๐Ÿ“˜ +> +> {\`

body only

\`}
`; + + const hast = mdxish(markdown); + expect((hast.children[0] as Element).tagName).toBe('Callout'); + + const htmlBlock = findElementByTagName(hast, 'html-block'); + expect(htmlBlock).toMatchObject({ + tagName: 'html-block', + properties: { html: '

body only

' }, + }); + }); - expect(htmlBlockPayloads(tree)).toStrictEqual(['
\n

n

\n\n

m

\n
']); - expectFullyConverted(tree); + it('renders inside a ', () => { + const tree = mdxish( + '\n\n{`
\n

n

\n\n

m

\n
`}
\n\n
', + ); + + expect(htmlBlockPayloads(tree)).toStrictEqual(['
\n

n

\n\n

m

\n
']); + expectFullyConverted(tree); + }); }); it('renders inside an ', () => { @@ -166,7 +313,7 @@ describe(' in mdxish', () => { }); }); - describe('inside
cells', () => { + describe('inside
', () => { it('renders inside a
cell as with the decoded html prop', () => { const md = `
@@ -333,24 +480,4 @@ describe(' in mdxish', () => { expect(htmlBlock.children).toStrictEqual([]); }); }); - - describe('attribute preservation across containers', () => { - it('preserves safeMode and runScripts on a standalone block', () => { - const tree = mdxish('{`

n

`}
'); - - expect(findElementByTagName(tree, 'html-block')).toMatchObject({ - properties: { html: '

n

', safeMode: 'true', runScripts: false }, - }); - }); - - it('preserves safeMode and runScripts inside a ', () => { - const tree = mdxish( - '\n\n{`

n

`}
\n\n
', - ); - - expect(findElementByTagName(tree, 'html-block')).toMatchObject({ - properties: { html: '

n

', safeMode: 'true', runScripts: false }, - }); - }); - }); }); diff --git a/__tests__/transformers/mdxish-html-blocks.test.ts b/__tests__/transformers/mdxish-html-blocks.test.ts new file mode 100644 index 000000000..edea87f79 --- /dev/null +++ b/__tests__/transformers/mdxish-html-blocks.test.ts @@ -0,0 +1,121 @@ +import { mdxish } from '../../lib'; +import { findElementByTagName } from '../helpers'; + +describe('mdxish html blocks transformer', () => { + describe('attribute extraction', () => { + it('extracts safeMode from JSX syntax', () => { + const tree = mdxish('{`

content

`}
'); + const htmlBlock = findElementByTagName(tree, 'html-block'); + expect(htmlBlock).toMatchObject({ + properties: { html: '

content

', safeMode: 'true' }, + }); + }); + + it('extracts safeMode from string syntax', () => { + const tree = mdxish('{`

content

`}
'); + const htmlBlock = findElementByTagName(tree, 'html-block'); + expect(htmlBlock).toMatchObject({ + properties: { html: '

content

', safeMode: 'false' }, + }); + }); + + it('extracts runScripts boolean true', () => { + const tree = mdxish('{`

content

`}
'); + const htmlBlock = findElementByTagName(tree, 'html-block'); + expect(htmlBlock).toMatchObject({ + properties: { html: '

content

', runScripts: true }, + }); + }); + + it('extracts runScripts boolean false', () => { + const tree = mdxish('{`

content

`}
'); + const htmlBlock = findElementByTagName(tree, 'html-block'); + expect(htmlBlock).toMatchObject({ + properties: { html: '

content

', runScripts: false }, + }); + }); + + it('extracts runScripts string value', () => { + const tree = mdxish('{`

content

`}
'); + const htmlBlock = findElementByTagName(tree, 'html-block'); + expect(htmlBlock).toMatchObject({ + properties: { html: '

content

', runScripts: 'afterRender' }, + }); + }); + + it('extracts multiple attributes', () => { + const tree = mdxish( + '{`

content

`}
', + ); + const htmlBlock = findElementByTagName(tree, 'html-block'); + expect(htmlBlock).toMatchObject({ + properties: { html: '

content

', safeMode: 'true', runScripts: true }, + }); + }); + + it('omits runScripts and safeMode when absent', () => { + const tree = mdxish('{`

content

`}
'); + const htmlBlock = findElementByTagName(tree, 'html-block'); + expect(htmlBlock).toMatchObject({ + properties: { html: '

content

' }, + }); + }); + }); + + describe('content extraction', () => { + it('strips template literal delimiters', () => { + const tree = mdxish('{`
hello
`}
'); + const htmlBlock = findElementByTagName(tree, 'html-block'); + expect(htmlBlock).toMatchObject({ + properties: { html: '
hello
' }, + }); + }); + + it('handles content without template literal syntax', () => { + const tree = mdxish('{`plain`}'); + const htmlBlock = findElementByTagName(tree, 'html-block'); + expect(htmlBlock).toMatchObject({ + properties: { html: 'plain' }, + }); + }); + + it('unescapes backticks in HTML content', () => { + const tree = mdxish('{`\\`example\\``}'); + const htmlBlock = findElementByTagName(tree, 'html-block'); + expect(htmlBlock).toMatchObject({ + properties: { html: '`example`' }, + }); + }); + + it('preserves multiline content', () => { + const markdown = `{\` +
    +
  • one
  • +
  • two
  • +
+\`}
`; + const tree = mdxish(markdown); + const htmlBlock = findElementByTagName(tree, 'html-block'); + expect(htmlBlock).toMatchObject({ + properties: { html: '
    \n
  • one
  • \n
  • two
  • \n
' }, + }); + }); + }); + + describe('node structure', () => { + it('produces correct node type and hName', () => { + const tree = mdxish('{`

test

`}
'); + const htmlBlock = findElementByTagName(tree, 'html-block'); + expect(htmlBlock).toMatchObject({ + type: 'element', + tagName: 'html-block', + }); + }); + + it('does not transform non-HTMLBlock html nodes', () => { + const tree = mdxish('
just html
'); + const htmlBlock = findElementByTagName(tree, 'html-block'); + expect(htmlBlock).toBeNull(); + }); + }); +}) \ No newline at end of file From 21fcc67afa1c3939ac108eecbf18b74cfe723785 Mon Sep 17 00:00:00 2001 From: eagletrhost Date: Thu, 28 May 2026 18:41:49 +1000 Subject: [PATCH 11/15] chore: cleanup test --- __tests__/lib/mdxish/html-blocks.test.ts | 125 +++++++---------------- 1 file changed, 36 insertions(+), 89 deletions(-) diff --git a/__tests__/lib/mdxish/html-blocks.test.ts b/__tests__/lib/mdxish/html-blocks.test.ts index d5d517e79..d9a39b348 100644 --- a/__tests__/lib/mdxish/html-blocks.test.ts +++ b/__tests__/lib/mdxish/html-blocks.test.ts @@ -4,7 +4,7 @@ import type { MdxJsxFlowElement } from 'mdast-util-mdx'; import { mdxish } from '../../../lib'; import { collectNodes, findAllElementsByTagName, findElementByTagName, parseMdxishWithSource } from '../../helpers'; -function ensureJsxTableIsParsed(md: string) { +function expectJsxTableIsParsed(md: string) { const { tree: mdastTree } = parseMdxishWithSource(md); // A table containing an carries block-level content, so it is kept // as a JSX
(mdxJsxFlowElement) rather than collapsed to a markdown table. @@ -31,13 +31,7 @@ describe(' parsing', () => { it('renders as with the decoded html prop', () => { const tree = mdxish('{`
Hello
`}
'); - const htmlBlock = findElementByTagName(tree, 'html-block'); - expect(htmlBlock).toMatchObject({ - type: 'element', - tagName: 'html-block', - properties: { html: '
Hello
' }, - children: [], - }); + expect(htmlBlockPayloads(tree)).toStrictEqual(['
Hello
']); }); it('renders between surrounding paragraphs', () => { @@ -85,14 +79,15 @@ const foo = () => { \`}
`; const tree = mdxish(markdown); - const htmlBlock = findElementByTagName(tree, 'html-block'); - expect(htmlBlock).toMatchObject({ tagName: 'html-block' }); + expect(htmlBlockPayloads(tree)).toStrictEqual([`

+const foo = () => {
+  const bar = {
+    baz: 'blammo'
+  }
 
-      const htmlProp = htmlBlock?.properties?.html as string;
-      expect(htmlProp).toContain('
');
-      expect(htmlProp).toContain('const foo = () => {');
-      expect(htmlProp).toContain("baz: 'blammo'");
-      expect(htmlProp).toContain('
'); + return bar +} +
`]); }); it('handles standalone multiline HTMLBlock with surrounding paragraphs', () => { @@ -104,16 +99,12 @@ const foo = () => { there`; const tree = mdxish(markdown); - const htmlBlock = findElementByTagName(tree, 'html-block'); - expect(htmlBlock).toMatchObject({ tagName: 'html-block' }); - expect(htmlBlock?.properties?.html).toContain('Hello, World!

'); + expect(htmlBlockPayloads(tree)).toStrictEqual(['

Hello, World!

']); }); it('handles nested HTMLBlock tags in content', () => { const tree = mdxish('{`{Hello}`}'); - const htmlBlock = findElementByTagName(tree, 'html-block'); - expect(htmlBlock).toMatchObject({ tagName: 'html-block' }); - expect(htmlBlock?.properties?.html).toContain('{Hello}'); + expect(htmlBlockPayloads(tree)).toStrictEqual(['{Hello}']); }); }); @@ -146,59 +137,34 @@ there`; }); it('adds newlines for readability', () => { - const hast = mdxish('{`

Hello, World!

`}
'); - const htmlBlock = findElementByTagName(hast, 'html-block'); - expect(htmlBlock).toMatchObject({ - tagName: 'html-block', - properties: { html: '

Hello, World!

' }, - }); + const tree = mdxish('{`

Hello, World!

`}
'); + expect(htmlBlockPayloads(tree)).toStrictEqual(['

Hello, World!

']); }); it('unescapes backticks in HTML content', () => { - const hast = mdxish('{`\\`example\\``}'); - const htmlBlock = findElementByTagName(hast, 'html-block'); - expect(htmlBlock).toMatchObject({ - tagName: 'html-block', - properties: { html: '`example`' }, - }); + const tree = mdxish('{`\\`example\\``}'); + expect(htmlBlockPayloads(tree)).toStrictEqual(['`example`']); }); it('passes safeMode property correctly', () => { - const hast = mdxish('{`

Content

`}
'); - const htmlBlock = findElementByTagName(hast, 'html-block'); - expect(htmlBlock).toMatchObject({ - tagName: 'html-block', - properties: { safeMode: 'true', html: '

Content

' }, - }); + const tree = mdxish('{`

Content

`}
'); + expect(htmlBlockPayloads(tree)).toStrictEqual(['

Content

']); }); it('handles template literal with variables', () => { // eslint-disable-next-line quotes - const hast = mdxish(`{\`const x = \${variable}\`}`); - const htmlBlock = findElementByTagName(hast, 'html-block'); - expect(htmlBlock).toMatchObject({ - tagName: 'html-block', - // eslint-disable-next-line no-template-curly-in-string - properties: { html: 'const x = ${variable}' }, - }); + const tree = mdxish(`{\`const x = \${variable}\`}`); + expect(htmlBlockPayloads(tree)).toStrictEqual(['const x = ${variable}']); }); it('handles nested template literals', () => { - const hast = mdxish('{`
\\`\\`\\`javascript\\nconst x = 1;\\n\\`\\`\\`
`}
'); - const htmlBlock = findElementByTagName(hast, 'html-block'); - expect(htmlBlock).toMatchObject({ - tagName: 'html-block', - properties: { html: '
```javascript\nconst x = 1;\n```
' }, - }); + const tree = mdxish('{`
\\`\\`\\`javascript\\nconst x = 1;\\n\\`\\`\\`
`}
'); + expect(htmlBlockPayloads(tree)).toStrictEqual(['
```javascript\nconst x = 1;\n```
']); }); it('handles trailing whitespace after closing tag', () => { - const hast = mdxish('{`
hello
`}
'); - const htmlBlock = findElementByTagName(hast, 'html-block'); - expect(htmlBlock).toMatchObject({ - tagName: 'html-block', - properties: { html: '
hello
' }, - }); + const tree = mdxish('{`
hello
`}
'); + expect(htmlBlockPayloads(tree)).toStrictEqual(['
hello
']); }); }); @@ -206,12 +172,7 @@ there`; it('renders inside a
with the decoded html prop', () => { const tree = mdxish('
{`

nested

`}
'); - const htmlBlock = findElementByTagName(tree, 'html-block'); - expect(htmlBlock).toMatchObject({ - type: 'element', - tagName: 'html-block', - properties: { html: '

nested

' }, - }); + expect(htmlBlockPayloads(tree)).toStrictEqual(['

nested

']); expect(findElementByTagName(tree, 'div')).not.toBeNull(); }); @@ -248,11 +209,7 @@ there`; expect(callout.tagName).toBe('Callout'); - const htmlBlock = findElementByTagName(tree, 'html-block'); - expect(htmlBlock).toMatchObject({ - tagName: 'html-block', - properties: { html: ' Hello, World!' }, - }); + expect(htmlBlockPayloads(tree)).toStrictEqual([' Hello, World!']); }); it('does not render inside code blocks', () => { @@ -271,14 +228,10 @@ there`; > > {\`

body only

\`}
`; - const hast = mdxish(markdown); - expect((hast.children[0] as Element).tagName).toBe('Callout'); + const tree = mdxish(markdown); + expect((tree.children[0] as Element).tagName).toBe('Callout'); - const htmlBlock = findElementByTagName(hast, 'html-block'); - expect(htmlBlock).toMatchObject({ - tagName: 'html-block', - properties: { html: '

body only

' }, - }); + expect(htmlBlockPayloads(tree)).toStrictEqual(['

body only

']); }); it('renders inside a ', () => { @@ -331,7 +284,7 @@ there`;
`; - ensureJsxTableIsParsed(md); + expectJsxTableIsParsed(md); const tree = mdxish(md); @@ -340,13 +293,7 @@ there`; // Newlines (including the blank line) inside the content must survive the // table re-parse and not fragment the HTMLBlock. - const htmlBlock = findElementByTagName(tree, 'html-block'); - expect(htmlBlock).toMatchObject({ - type: 'element', - tagName: 'html-block', - properties: { html: '

\n

Hello

\n\n

World

\n
' }, - children: [], - }); + expect(htmlBlockPayloads(tree)).toStrictEqual(['
\n

Hello

\n\n

World

\n
']); }); it('still renders markdown in a sibling text cell', () => { @@ -366,7 +313,7 @@ there`; `; - ensureJsxTableIsParsed(md); + expectJsxTableIsParsed(md); const tree = mdxish(md); // The sibling cell's markdown must still be processed into a . @@ -403,7 +350,7 @@ there`; `; - ensureJsxTableIsParsed(md); + expectJsxTableIsParsed(md); const tree = mdxish(md); expect(htmlBlockPayloads(tree)).toStrictEqual(['
{notTemplate}
']); }); @@ -417,7 +364,7 @@ there`; `; - ensureJsxTableIsParsed(md); + expectJsxTableIsParsed(md); const tree = mdxish(md); const htmlBlock = findElementByTagName(tree, 'html-block'); @@ -446,7 +393,7 @@ there`; `; - ensureJsxTableIsParsed(md); + expectJsxTableIsParsed(md); const tree = mdxish(md); @@ -469,7 +416,7 @@ there`; `; - ensureJsxTableIsParsed(md); + expectJsxTableIsParsed(md); const tree = mdxish(md); const serialized = JSON.stringify(tree); From f0dcdf9c5376648f37c1e87b7dc575bf31309320 Mon Sep 17 00:00:00 2001 From: eagletrhost Date: Thu, 28 May 2026 18:43:27 +1000 Subject: [PATCH 12/15] chore: comments --- lib/constants.ts | 7 ++++--- processor/transform/mdxish/tables/mdxish-tables.ts | 2 +- processor/utils.ts | 10 ++-------- 3 files changed, 7 insertions(+), 12 deletions(-) diff --git a/lib/constants.ts b/lib/constants.ts index e87d95020..b85042c60 100644 --- a/lib/constants.ts +++ b/lib/constants.ts @@ -50,9 +50,10 @@ export const GENERIC_MDX_COMPONENT_EXCLUDED_TAGS = new Set([ /** * Tags the micromark `mdxComponent` tokenizer must not claim. Unlike the remark * transforms, the tokenizer *does* claim `` so its brace-aware body - * states capture multiline template literals (e.g. `{`
…\n…
`}`); the - * raw single-line form is left to `htmlBlockFromJsx` to recover instead. Only - * `` (dedicated jsxTable tokenizer) and the inline tags stay excluded. + * states capture multiline template literals (e.g. `{`
…\n…
`}`); any + * raw-HTML shape that slips past the tokenizer is recovered later by the + * `mdxishHtmlBlocks` remark transform. Only `
` (dedicated jsxTable + * tokenizer) and the inline tags stay excluded. */ export const TOKENIZER_MDX_COMPONENT_EXCLUDED_TAGS = new Set([ 'Table', diff --git a/processor/transform/mdxish/tables/mdxish-tables.ts b/processor/transform/mdxish/tables/mdxish-tables.ts index 4c57c5879..d908892cf 100644 --- a/processor/transform/mdxish/tables/mdxish-tables.ts +++ b/processor/transform/mdxish/tables/mdxish-tables.ts @@ -149,7 +149,7 @@ const processTableNode = ( let tableHasFlowContent = false; // An `` (still a JSX element here; converted to `html-block` by - // `htmlBlockFromJsx` after this transformer) is block-level content that a + // `mdxishHtmlBlocks` after this transformer) is block-level content that a // markdown table cell can't represent, so keep the table as a JSX `
`. visit( node as Node, diff --git a/processor/utils.ts b/processor/utils.ts index afc646390..dadd7dce6 100644 --- a/processor/utils.ts +++ b/processor/utils.ts @@ -161,18 +161,12 @@ export const isMDXEsm = (node: Node): node is MdxjsEsm => { * Takes an HTML string and formats it for display in the editor. Removes leading/trailing newlines * and unindents the HTML. * - * @param {string} html - HTML content from template literal + * @param {string} html - cooked HTML payload (callers strip any template-literal backticks first) * @returns {string} processed HTML */ export function formatHtmlForMdxish(html: string): string { - // Remove leading/trailing backticks if present, since they're used to keep the HTML - // from being parsed prematurely - let processed = html; - if (processed.startsWith('`') && processed.endsWith('`')) { - processed = processed.slice(1, -1); - } // Removes the leading/trailing newlines - let cleaned = processed.replace(/^\s*\n|\n\s*$/g, ''); + let cleaned = html.replace(/^\s*\n|\n\s*$/g, ''); // Convert literal \n sequences to actual newlines only inside
 and .
   // Because 
 needs to respect the newline visual and

From 2ec04fde32c5e83551acb1df0a2ca099037f8875 Mon Sep 17 00:00:00 2001
From: eagletrhost 
Date: Thu, 28 May 2026 18:55:38 +1000
Subject: [PATCH 13/15] fix: better fallback

---
 __tests__/lib/mdxish/html-blocks.test.ts      |  1 +
 .../transform/mdxish/mdxish-html-blocks.ts    | 21 ++++++++++++-------
 2 files changed, 14 insertions(+), 8 deletions(-)

diff --git a/__tests__/lib/mdxish/html-blocks.test.ts b/__tests__/lib/mdxish/html-blocks.test.ts
index d9a39b348..160b4e469 100644
--- a/__tests__/lib/mdxish/html-blocks.test.ts
+++ b/__tests__/lib/mdxish/html-blocks.test.ts
@@ -154,6 +154,7 @@ there`;
     it('handles template literal with variables', () => {
       // eslint-disable-next-line quotes
       const tree = mdxish(`{\`const x = \${variable}\`}`);
+      // eslint-disable-next-line no-template-curly-in-string
       expect(htmlBlockPayloads(tree)).toStrictEqual(['const x = ${variable}']);
     });
 
diff --git a/processor/transform/mdxish/mdxish-html-blocks.ts b/processor/transform/mdxish/mdxish-html-blocks.ts
index 8607a88c4..abb212e90 100644
--- a/processor/transform/mdxish/mdxish-html-blocks.ts
+++ b/processor/transform/mdxish/mdxish-html-blocks.ts
@@ -46,9 +46,10 @@ const createHtmlBlockNode = (
  */
 const extractTemplateLiteral = (value: string | undefined): string => {
   if (!value) return '';
-  const trimmed = value.trim();
-  const match = trimmed.match(/^`([\s\S]*)`$/);
-  return match ? match[1] : trimmed;
+  const match = value.trim().match(/^`([\s\S]*)`$/);
+  // Non-template-literal bodies (e.g. `{someVar}`) are malformed mdxish input;
+  // returning '' beats shipping JS identifier source as an HTML payload.
+  return match ? match[1] : '';
 };
 
 const toRunScripts = (raw: string | undefined): boolean | string | undefined =>
@@ -159,11 +160,15 @@ const mdxishHtmlBlocks = (): Transform => tree => {
 
       const body = children
         .slice(i + 1, closeIdx)
-        .map(child =>
-          child.type === 'mdxTextExpression' || child.type === 'mdxFlowExpression'
-            ? extractTemplateLiteral(child.value)
-            : '',
-        )
+        .map(child => {
+          if (child.type === 'mdxTextExpression' || child.type === 'mdxFlowExpression') {
+            return extractTemplateLiteral(child.value);
+          }
+          // Preserve raw text from any other phrasing sibling (e.g. stray
+          // whitespace or content the tokenizer didn't claim) so it isn't
+          // silently dropped from the html payload.
+          return 'value' in child && typeof child.value === 'string' ? child.value : '';
+        })
         .join('');
 
       children.splice(i, closeIdx - i + 1, htmlBlockFromRaw(openMatch[1], body, open.position));

From 425c7d5e0f20a6b49d5e55fbec3065c740ec5beb Mon Sep 17 00:00:00 2001
From: eagletrhost 
Date: Mon, 1 Jun 2026 18:07:49 +1000
Subject: [PATCH 14/15] chore: comment

---
 lib/constants.ts | 8 ++------
 1 file changed, 2 insertions(+), 6 deletions(-)

diff --git a/lib/constants.ts b/lib/constants.ts
index e5a57927e..36978809f 100644
--- a/lib/constants.ts
+++ b/lib/constants.ts
@@ -77,12 +77,8 @@ export const GENERIC_MDX_COMPONENT_EXCLUDED_TAGS = new Set([
 ]);
 
 /**
- * Tags the micromark `mdxComponent` tokenizer must not claim. Unlike the remark
- * transforms, the tokenizer *does* claim `` so its brace-aware body
- * states capture multiline template literals (e.g. `{`
…\n…
`}`); any - * raw-HTML shape that slips past the tokenizer is recovered later by the - * `mdxishHtmlBlocks` remark transform. Only `
` (dedicated jsxTable - * tokenizer) and the inline tags stay excluded. + * Tags the micromark `mdxComponent` tokenizer must not claim, which + * are inline components and those that have their own dedicated tokenizer */ export const TOKENIZER_MDX_COMPONENT_EXCLUDED_TAGS = new Set([ 'Table', From acc0fde165607fdd090413de0162ab7e7005aeb7 Mon Sep 17 00:00:00 2001 From: Dimas Putra Anugerah <63914983+eaglethrost@users.noreply.github.com> Date: Thu, 4 Jun 2026 17:02:34 +1000 Subject: [PATCH 15/15] fix(mdxish) deindent HTMLBlock content relative to opening tag (#1501) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit | 🎫 Resolve ISSUE_ID | | :-----------------: | ## 🎯 What does this PR do? While fixing HTMLBlocks not rendering inside Tables in mdxish, I noticed that once it worked, an editor round trip would unexpectedly indent the block content lines in the editor, even though the rendering is fine & not affected. See this demo: https://github.com/user-attachments/assets/5b3fa862-b493-417b-b48f-fac82650133b The root issue is that the HTMLBlock transformer in the engine captures the content verbatim from the source, so each content line's leading whitespace is exactly the characters that sit between the backticks, measured from column 1. Since the Table content is indented during serialisation, that leading whitespace is captured too. The fix I went for is in the block content extraction code: we pass in the `<HTMLBlock>` opening tag position & deindent each line relative to that, instead of the starting column. I think it makes sense to use the tag as the anchor column, though there are a few ways we could decide that. I also think the fix belongs on the engine side because I don't think it should capture the content verbatim anyway (I briefly considered putting the fix in the editor). Note that this happens in MDX as well. I haven't investigated yet, but it's likely an engine issue there as well and not the editor.
## 🧪 QA tips The fix deindents each `<HTMLBlock>` content line **relative to the opening tag's column**, not the start of the line. To verify, paste each example into the mdxish editor, confirm it renders correctly, then do an editor round-trip (e.g. view as Markdown and reopen) — the content lines should **not** gain extra leading indentation. - [ ] **Indented `<HTMLBlock>` (nested under a list item)** ````md 1. Here is some custom HTML: {`

Hello

World

`}
```` The extracted content should be deindented relative to the `` tag, so the `
` sits at column 0 and the `

`s keep their relative 2-space indent: ```html

Hello

World

``` Before the fix, every round-trip would keep the list's 3-space indentation on each line (and compound it on repeated trips). - [ ] **`` inside a `
` cell** ````md
NameMarkup
Custom {`

Hello

World

`}
```` The table should stay a JSX `` and the cell should render the raw HTML. The extracted content should preserve the author's relative indentation without the table-cell serialization indentation leaking into the lines: ```html

Hello

World

``` ## ๐Ÿ“ธ Screenshot or Loom Demo of block inside Table where the indents are retained: https://github.com/user-attachments/assets/68178bd0-0d44-4ebc-8dbb-86be1b2fad8a --- __tests__/lib/mdxish/html-blocks.test.ts | 36 ++++++++++--------- .../transformers/mdxish-html-blocks.test.ts | 14 ++++++++ .../transform/mdxish/mdxish-html-blocks.ts | 27 +++++++++++--- processor/utils.ts | 18 +++++++++- 4 files changed, 73 insertions(+), 22 deletions(-) diff --git a/__tests__/lib/mdxish/html-blocks.test.ts b/__tests__/lib/mdxish/html-blocks.test.ts index 160b4e469..8f7c384ed 100644 --- a/__tests__/lib/mdxish/html-blocks.test.ts +++ b/__tests__/lib/mdxish/html-blocks.test.ts @@ -277,10 +277,11 @@ there`; +

World

+ +\`}
Custom {\`
-

Hello

+

Hello

-

World

-
\`}
`; @@ -294,7 +295,7 @@ there`; // Newlines (including the blank line) inside the content must survive the // table re-parse and not fragment the HTMLBlock. - expect(htmlBlockPayloads(tree)).toStrictEqual(['
\n

Hello

\n\n

World

\n
']); + expect(htmlBlockPayloads(tree)).toStrictEqual(['
\n

Hello

\n\n

World

\n
']); }); it('still renders markdown in a sibling text cell', () => { @@ -306,10 +307,11 @@ there`; **bold** here {\`
    -
  • one
  • +
  • one
  • -
  • two
  • -
\`}
+
  • two
  • + +\`}
    `; @@ -321,7 +323,7 @@ there`; const strongs = findAllElementsByTagName(tree, 'strong'); expect(strongs.length).toBeGreaterThan(0); expect(JSON.stringify(strongs[0])).toContain('bold'); - expect(htmlBlockPayloads(tree)).toStrictEqual(['
      \n
    • one
    • \n\n
    • two
    • \n
    ']); + expect(htmlBlockPayloads(tree)).toStrictEqual(['
      \n
    • one
    • \n\n
    • two
    • \n
    ']); }); it('renders inside a lowercase cell', () => { @@ -329,16 +331,17 @@ there`; +

    b

    + +\`}
    {\`
    -

    a

    +

    a

    -

    b

    -
    \`}
    `; const tree = mdxish(md); - expect(htmlBlockPayloads(tree)).toStrictEqual(['
    \n

    a

    \n\n

    b

    \n
    ']); + expect(htmlBlockPayloads(tree)).toStrictEqual(['
    \n

    a

    \n\n

    b

    \n
    ']); expectFullyConverted(tree); }); @@ -385,10 +388,11 @@ there`; {\`
    - one +one - uno -
    \`}
    +uno + +\`} {\`two\`} @@ -401,7 +405,7 @@ there`; const htmlBlocks = findAllElementsByTagName(tree, 'html-block'); expect(htmlBlocks).toHaveLength(2); expect(htmlBlocks[0].properties).toMatchObject({ - html: '
    \n one\n\n uno\n
    ', + html: '
    \none\n\nuno\n
    ', }); expect(htmlBlocks[1].properties).toMatchObject({ html: 'two' }); }); diff --git a/__tests__/transformers/mdxish-html-blocks.test.ts b/__tests__/transformers/mdxish-html-blocks.test.ts index edea87f79..c87f83f05 100644 --- a/__tests__/transformers/mdxish-html-blocks.test.ts +++ b/__tests__/transformers/mdxish-html-blocks.test.ts @@ -100,6 +100,20 @@ describe('mdxish html blocks transformer', () => { properties: { html: '
      \n
    • one
    • \n
    • two
    • \n
    ' }, }); }); + + it('starts indent relative to the HTMLBlock opening tag', () => { + const markdown = ` + {\`first +second + third + fourth + \`}`; + const tree = mdxish(markdown); + const htmlBlock = findElementByTagName(tree, 'html-block'); + expect(htmlBlock).toMatchObject({ + properties: { html: 'first\nsecond\n third\nfourth' }, + }); + }); }); describe('node structure', () => { diff --git a/processor/transform/mdxish/mdxish-html-blocks.ts b/processor/transform/mdxish/mdxish-html-blocks.ts index abb212e90..c57490619 100644 --- a/processor/transform/mdxish/mdxish-html-blocks.ts +++ b/processor/transform/mdxish/mdxish-html-blocks.ts @@ -74,8 +74,18 @@ const jsxAttr = (element: HtmlBlockJsx, name: string): string | undefined => { }; /** Builds an `html-block` from a raw attribute string and (unparsed) body. */ -const htmlBlockFromRaw = (attrs: string, html: string, position: HTMLBlock['position']): HTMLBlock => - createHtmlBlockNode(formatHtmlForMdxish(html), position, toRunScripts(rawAttr(attrs, 'runScripts')), rawAttr(attrs, 'safeMode')); +const htmlBlockFromRaw = ( + attrs: string, + html: string, + position: HTMLBlock['position'], + openingTagIndent = 0, +): HTMLBlock => + createHtmlBlockNode( + formatHtmlForMdxish(html, openingTagIndent), + position, + toRunScripts(rawAttr(attrs, 'runScripts')), + rawAttr(attrs, 'safeMode'), + ); /** * Splits a raw `html` node that embeds one or more ``s into @@ -92,7 +102,12 @@ const splitRawHtmlBlocks = (node: Html): RootContent[] | null => { for (let i = 0; i < segments.length; i += 3) { const [text, attrs, body] = segments.slice(i, i + 3); if (text) parts.push({ type: 'html', value: text }); - if (body !== undefined) parts.push(htmlBlockFromRaw(attrs, body, node.position)); + if (body !== undefined) { + // The opening tag's column equals the length of the line it starts on + // (the text run since the previous newline preceding the match). 
+ const openingTagIndent = text.slice(text.lastIndexOf('\n') + 1).length; + parts.push(htmlBlockFromRaw(attrs, body, node.position, openingTagIndent)); + } } return parts; }; @@ -126,8 +141,9 @@ const mdxishHtmlBlocks = (): Transform => tree => { child => child.type === 'mdxFlowExpression' || child.type === 'mdxTextExpression', ) as { value?: string } | undefined; + const openingTagIndent = (element.position?.start.column ?? 1) - 1; parent.children[index] = createHtmlBlockNode( - formatHtmlForMdxish(extractTemplateLiteral(exprChild?.value)), + formatHtmlForMdxish(extractTemplateLiteral(exprChild?.value), openingTagIndent), element.position, toRunScripts(jsxAttr(element, 'runScripts')), jsxAttr(element, 'safeMode'), @@ -171,7 +187,8 @@ const mdxishHtmlBlocks = (): Transform => tree => { }) .join(''); - children.splice(i, closeIdx - i + 1, htmlBlockFromRaw(openMatch[1], body, open.position)); + const openingTagIndent = (open.position?.start.column ?? 1) - 1; + children.splice(i, closeIdx - i + 1, htmlBlockFromRaw(openMatch[1], body, open.position, openingTagIndent)); } }); }; diff --git a/processor/utils.ts b/processor/utils.ts index dadd7dce6..60309e021 100644 --- a/processor/utils.ts +++ b/processor/utils.ts @@ -162,12 +162,28 @@ export const isMDXEsm = (node: Node): node is MdxjsEsm => { * and unindents the HTML. 
* * @param {string} html - cooked HTML payload (callers strip any template-literal backticks first) + * @param {number} [openingTagIndent=0] - column the `` opening tag sits at, used to + * dedent each content line so its indentation reads relative to the tag, not the line start * @returns {string} processed HTML */ -export function formatHtmlForMdxish(html: string): string { +export function formatHtmlForMdxish(html: string, openingTagIndent = 0): string { // Removes the leading/trailing newlines let cleaned = html.replace(/^\s*\n|\n\s*$/g, ''); + // Strip / deindent the lines in the HTML string so that the indents are relative + // to the opening HTMLBlock tag, not the literal line start + // Keep any deeper indent + if (openingTagIndent > 0) { + cleaned = cleaned + .split('\n') + .map(line => { + let i = 0; + while (i < openingTagIndent && (line[i] === ' ' || line[i] === '\t')) i += 1; + return line.slice(i); + }) + .join('\n'); + } + // Convert literal \n sequences to actual newlines only inside
     and .
       // Because 
     needs to respect the newline visual and
       // escape characters should be processed in the  tag.