From fd092651030bb5bda44441873ec86e2312924e07 Mon Sep 17 00:00:00 2001 From: Rainer Friederich Date: Thu, 25 Jun 2026 15:53:00 +0200 Subject: [PATCH 1/2] fix(llmo): encode dataFolder path boundary Helix-safe (LLMO-5859) generateDataFolder joined the host and URL path segments with `--`, which Helix/AEM reserves for its `ref--repo--owner` host convention and rejects with HTTP 400 on bulk-preview and Admin API /status -- breaking the Brand Presence live-website publish for any path-bearing site (the SharePoint sheets and S3 objects still land; only the publish path is rejected). Replace the `--` join with a self-escaped `zs` boundary marker (`z` -> `zz`, `/` -> `zs`): Helix-safe, unambiguous, and reversible. It keeps the `/` boundary distinguishable from a sanitized `.`/`-`, so subpath sites on the same host stay distinct (e.g. `/us/kings` != `/us-kings`). Sanitization is still lossy within a single segment, unchanged from before. Forward-only: the derivation runs only at onboarding, existing dataFolders are not re-derived, and prod has no `--` folders remaining (LLMO-5770 already remediated those). Co-Authored-By: Claude Opus 4.8 (1M context) --- src/controllers/llmo/llmo-onboarding.js | 38 +++++++--- test/controllers/llmo/llmo-onboarding.test.js | 75 +++++++++++++++---- 2 files changed, 88 insertions(+), 25 deletions(-) diff --git a/src/controllers/llmo/llmo-onboarding.js b/src/controllers/llmo/llmo-onboarding.js index 80fd903e8..331caa52d 100644 --- a/src/controllers/llmo/llmo-onboarding.js +++ b/src/controllers/llmo/llmo-onboarding.js @@ -322,19 +322,28 @@ export async function ensureInitialCustomerConfigV2({ * percent-decoded and individually sanitized: runs of non-alphanumeric characters * are replaced with a single `-`, leading/trailing `-` are trimmed, and the * result is lowercased. Segments that reduce to empty after sanitization are - * dropped. Sanitized parts are joined with `--` as the path-segment delimiter. + * dropped. * - * The `--` delimiter cannot appear inside a sanitized segment (any run of - * non-alphanumeric characters collapses to a single `-`), so URLs that differ - * in path structure produce distinct folder names. Path segments differing only - * in punctuation (e.g. `us-kings` vs `us_kings`) produce the same sanitized - * segment and therefore the same folder; this is an inherent limitation of - * lossy sanitization. + * Helix/AEM reserves the double-dash for its `ref--repo--owner` host convention + * and rejects any resource path containing `--` with HTTP 400 (LLMO-5859), so the + * host/segment boundary cannot be a dash. Instead each sanitized part has its + * marker letter self-escaped (`z` -> `zz`) and the parts are joined with `zs`, + * a Helix-safe token marking the `/` path boundary. Because a literal `z` is + * always doubled, a lone `z` can only introduce a structural token, so `zs` can + * never appear by accident inside a part: the encoding is unambiguous and + * reversible (`zz` -> `z`, `zs` -> `/`). + * + * This keeps the path boundary distinguishable from a `.` (which sanitizes to + * `-`): `nba.com/com` -> `nba-comzscom` stays distinct from `nba.com.com` -> + * `nba-com-com`. Sanitization is still lossy *within* a single segment, so + * segments differing only in punctuation (e.g. `us-kings` vs `us_kings`) collapse + * to the same folder; that is an inherent limitation of lossy per-segment + * sanitization, unchanged from the prior scheme. * * Examples: * https://nba.com -> nba-com - * https://nba.com/kings -> nba-com--kings - * https://nba.com/us/kings -> nba-com--us--kings + * https://nba.com/kings -> nba-comzskings + * https://nba.com/us/kings -> nba-comzsuszskings * * @param {string} baseURL - The site's base URL (must be a fully-qualified URL). * @param {string} env - The environment ('prod' for production, anything else is @@ -347,17 +356,22 @@ export function generateDataFolder(baseURL, env = 'dev') { throw new TypeError('Invalid baseURL: hostname is required'); } const sanitize = (s) => s.replace(/[^a-zA-Z0-9]+/g, '-').replace(/^-+|-+$/g, '').toLowerCase(); - const host = sanitize(url.hostname); + // Self-escape the boundary marker letter so a lone `z` can only ever introduce + // the `zs` path-boundary token (see join below). Run on already-sanitized, + // lowercased parts. + const escapeMarker = (s) => s.replace(/z/g, 'zz'); + const host = escapeMarker(sanitize(url.hostname)); const segments = url.pathname.split('/').filter(Boolean) .map((seg) => { let decoded = seg; try { decoded = decodeURIComponent(seg); } catch { /* keep raw on percent-encoded sequences that are not valid UTF-8 */ } - return sanitize(decoded); + return escapeMarker(sanitize(decoded)); }) .filter(Boolean); - const dataFolderName = segments.length > 0 ? `${host}--${segments.join('--')}` : host; + // Join host + path segments with `zs`, the Helix-safe marker for the `/` boundary. + const dataFolderName = [host, ...segments].join('zs'); return env === 'prod' ? dataFolderName : `dev/${dataFolderName}`; } diff --git a/test/controllers/llmo/llmo-onboarding.test.js b/test/controllers/llmo/llmo-onboarding.test.js index f7ff17aa7..f26f14a70 100644 --- a/test/controllers/llmo/llmo-onboarding.test.js +++ b/test/controllers/llmo/llmo-onboarding.test.js @@ -377,9 +377,9 @@ describe('LLMO Onboarding Functions', () => { }); it('should generate unique folder names for subpath sites on the same domain', () => { - expect(generateDataFolder('https://nba.com/kings', 'prod')).to.equal('nba-com--kings'); - expect(generateDataFolder('https://nba.com/lakers', 'prod')).to.equal('nba-com--lakers'); - expect(generateDataFolder('https://nba.com/kings', 'dev')).to.equal('dev/nba-com--kings'); + expect(generateDataFolder('https://nba.com/kings', 'prod')).to.equal('nba-comzskings'); + expect(generateDataFolder('https://nba.com/lakers', 'prod')).to.equal('nba-comzslakers'); + expect(generateDataFolder('https://nba.com/kings', 'dev')).to.equal('dev/nba-comzskings'); }); it('should produce the same folder name for root domain with or without trailing slash', () => { @@ -393,24 +393,73 @@ describe('LLMO Onboarding Functions', () => { }); it('should generate correct folder name for nested subpaths', () => { - expect(generateDataFolder('https://nba.com/us/kings', 'prod')).to.equal('nba-com--us--kings'); - expect(generateDataFolder('https://nba.com/us/kings', 'dev')).to.equal('dev/nba-com--us--kings'); + expect(generateDataFolder('https://nba.com/us/kings', 'prod')).to.equal('nba-comzsuszskings'); + expect(generateDataFolder('https://nba.com/us/kings', 'dev')).to.equal('dev/nba-comzsuszskings'); }); - it('should generate distinct folder names for paths that differ only by separator type', () => { + it('should keep the path boundary distinguishable from a sanitized dash (LLMO-5859)', () => { + // The `zs` boundary marker is distinct from the `-` that punctuation + // sanitizes to, so a real path split is not confused with an in-segment dash: + // `/us/kings` (two segments) differs from `/us-kings` (one segment). + expect(generateDataFolder('https://nba.com/us/kings', 'prod')).to.equal('nba-comzsuszskings'); + expect(generateDataFolder('https://nba.com/us-kings', 'prod')).to.equal('nba-comzsus-kings'); expect(generateDataFolder('https://nba.com/us/kings', 'prod')) .to.not.equal(generateDataFolder('https://nba.com/us-kings', 'prod')); + }); + + it('should still collapse separator variants within a single segment (lossy sanitize)', () => { + // Sanitization remains lossy inside one segment, so punctuation-only variants + // of the same segment collapse -- an inherent limit unchanged from before. + const expected = 'nba-comzsus-kings'; + expect(generateDataFolder('https://nba.com/us-kings', 'prod')).to.equal(expected); + expect(generateDataFolder('https://nba.com/us..kings', 'prod')).to.equal(expected); + expect(generateDataFolder('https://nba.com/us--kings', 'prod')).to.equal(expected); + }); + + it('should still separate same-host sites with genuinely different paths', () => { + // The real LLMO-4186 case: different path content -> different folders. + expect(generateDataFolder('https://nba.com/kings', 'prod')) + .to.not.equal(generateDataFolder('https://nba.com/lakers', 'prod')); expect(generateDataFolder('https://nba.com/us/kings', 'prod')) - .to.not.equal(generateDataFolder('https://nba.com/us..kings', 'prod')); - expect(generateDataFolder('https://nba.com/us/kings', 'prod')) - .to.not.equal(generateDataFolder('https://nba.com/us--kings', 'prod')); + .to.not.equal(generateDataFolder('https://nba.com/eu/kings', 'prod')); }); - it('should not collide hostname with consecutive non-alnum chars and a subpath', () => { + it('should distinguish a hostname with consecutive non-alnum chars from a host/subpath split', () => { + // `nba--com` (host; `--` collapses to `-`) -> `nba-com`, whereas `nba` + `/com` + // -> `nbazscom`. The `zs` boundary keeps these distinct (single-dash could not). + expect(generateDataFolder('https://nba--com/', 'prod')).to.equal('nba-com'); + expect(generateDataFolder('https://nba/com', 'prod')).to.equal('nbazscom'); expect(generateDataFolder('https://nba--com/', 'prod')) .to.not.equal(generateDataFolder('https://nba/com', 'prod')); }); + it('should self-escape the marker letter `z` in hosts and segments (LLMO-5859)', () => { + // A literal `z` is doubled so it can never be read as a boundary token. + expect(generateDataFolder('https://amazon.com', 'prod')).to.equal('amazzon-com'); + expect(generateDataFolder('https://nba.com/zone', 'prod')).to.equal('nba-comzszzone'); + // A segment that literally spells the boundary token stays unambiguous. + expect(generateDataFolder('https://nba.com/zs', 'prod')).to.equal('nba-comzszzs'); + expect(generateDataFolder('https://nba.com/zs', 'prod')) + .to.not.equal(generateDataFolder('https://nba.com/z/s', 'prod')); + }); + + it('should never emit the Helix-reserved `--` in a folder name (LLMO-5859)', () => { + // Helix 400s any resource path containing `--`, so the derivation must never + // produce it -- including for hosts or paths that themselves contain `--`. + const urls = [ + 'https://nba.com', + 'https://nba.com/kings', + 'https://nba.com/us/kings', + 'https://nba--com/double/dash/host', + 'https://nba.com/us--kings', + 'https://nba.com/a/b/c/d', + ]; + urls.forEach((url) => { + expect(generateDataFolder(url, 'prod')).to.not.include('--'); + expect(generateDataFolder(url, 'dev')).to.not.include('--'); + }); + }); + it('should handle malformed percent-encoded path segments without throwing', () => { expect(() => generateDataFolder('https://a.com/%FF', 'prod')).to.not.throw(); }); @@ -419,7 +468,7 @@ describe('LLMO Onboarding Functions', () => { expect(generateDataFolder('https://nba.com/k%C3%B6nig', 'prod')) .to.equal(generateDataFolder('https://nba.com/könig', 'prod')); expect(generateDataFolder('https://nba.com/k%C3%B6nig', 'prod')) - .to.match(/^nba-com--/); + .to.match(/^nba-comzs/); }); it('should case-fold path segments so /Kings and /kings resolve to the same folder', () => { @@ -434,9 +483,9 @@ describe('LLMO Onboarding Functions', () => { it('should ignore query strings and URL fragments', () => { expect(generateDataFolder('https://nba.com/kings?utm=foo', 'prod')) - .to.equal('nba-com--kings'); + .to.equal('nba-comzskings'); expect(generateDataFolder('https://nba.com/kings#section', 'prod')) - .to.equal('nba-com--kings'); + .to.equal('nba-comzskings'); }); it('should drop degenerate path segments that sanitize to empty', () => { From 5894b6169f019f5bea75a2b384ad9a692a935a7e Mon Sep 17 00:00:00 2001 From: Rainer Friederich Date: Thu, 25 Jun 2026 16:13:20 +0200 Subject: [PATCH 2/2] fix(llmo): address MysticatBot review on PR 2688 - docstring: note decoding needs a left-to-right scanner, not split('zs') - rename escapeMarker -> escapeZ for clarity - test: add assertion context to the never-emit-`--` loop - test: add host-starting-with-`z` edge case (zsecurity.com) - test: assert the concrete return value for the malformed `%FF` segment - offboarding fallback: defensive comment on the older-scheme mismatch risk Co-Authored-By: Claude Opus 4.8 (1M context) --- src/controllers/llmo/llmo-onboarding.js | 19 ++++++++++++++----- test/controllers/llmo/llmo-onboarding.test.js | 9 +++++++-- 2 files changed, 21 insertions(+), 7 deletions(-) diff --git a/src/controllers/llmo/llmo-onboarding.js b/src/controllers/llmo/llmo-onboarding.js index 331caa52d..922728449 100644 --- a/src/controllers/llmo/llmo-onboarding.js +++ b/src/controllers/llmo/llmo-onboarding.js @@ -331,7 +331,10 @@ export async function ensureInitialCustomerConfigV2({ * a Helix-safe token marking the `/` path boundary. Because a literal `z` is * always doubled, a lone `z` can only introduce a structural token, so `zs` can * never appear by accident inside a part: the encoding is unambiguous and - * reversible (`zz` -> `z`, `zs` -> `/`). + * reversible by a left-to-right scanner that, on each `z`, consumes the next + * character to decide escape-vs-boundary (`zz` -> `z`, `zs` -> `/`). A naive + * `String.split('zs')` is NOT a correct decoder: a part whose content contains + * `zs` is stored as `zzs`, which a blind split would wrongly break apart. * * This keeps the path boundary distinguishable from a `.` (which sanitizes to * `-`): `nba.com/com` -> `nba-comzscom` stays distinct from `nba.com.com` -> @@ -359,15 +362,15 @@ export function generateDataFolder(baseURL, env = 'dev') { // Self-escape the boundary marker letter so a lone `z` can only ever introduce // the `zs` path-boundary token (see join below). Run on already-sanitized, // lowercased parts. - const escapeMarker = (s) => s.replace(/z/g, 'zz'); - const host = escapeMarker(sanitize(url.hostname)); + const escapeZ = (s) => s.replace(/z/g, 'zz'); + const host = escapeZ(sanitize(url.hostname)); const segments = url.pathname.split('/').filter(Boolean) .map((seg) => { let decoded = seg; try { decoded = decodeURIComponent(seg); } catch { /* keep raw on percent-encoded sequences that are not valid UTF-8 */ } - return escapeMarker(sanitize(decoded)); + return escapeZ(sanitize(decoded)); }) .filter(Boolean); // Join host + path segments with `zs`, the Helix-safe marker for the `/` boundary. @@ -1705,7 +1708,13 @@ export async function performLlmoOffboarding(site, config, context) { const baseURL = site.getBaseURL(); const llmoConfig = config.getLlmoConfig(); - // Check if site has LLMO config with data folder, if not calculate it + // Check if site has LLMO config with data folder, if not calculate it. + // NOTE: re-deriving here assumes the folder was created with the CURRENT + // generateDataFolder scheme. A site onboarded under an older scheme whose + // stored dataFolder is missing could derive a name that does not match its + // actual SharePoint folder (so deleteSharePointFolder below would miss it). + // Onboarded sites always persist dataFolder, so this fallback should not fire + // in practice (prod scan confirmed zero affected, LLMO-5859). let dataFolder = llmoConfig?.dataFolder; if (!dataFolder) { log.debug(`Data folder not found in LLMO config, calculating from base URL: ${baseURL}`); diff --git a/test/controllers/llmo/llmo-onboarding.test.js b/test/controllers/llmo/llmo-onboarding.test.js index f26f14a70..91672d982 100644 --- a/test/controllers/llmo/llmo-onboarding.test.js +++ b/test/controllers/llmo/llmo-onboarding.test.js @@ -441,6 +441,8 @@ describe('LLMO Onboarding Functions', () => { expect(generateDataFolder('https://nba.com/zs', 'prod')).to.equal('nba-comzszzs'); expect(generateDataFolder('https://nba.com/zs', 'prod')) .to.not.equal(generateDataFolder('https://nba.com/z/s', 'prod')); + // Host starting with the marker letter, immediately adjacent to the boundary. + expect(generateDataFolder('https://zsecurity.com/page', 'prod')).to.equal('zzsecurity-comzspage'); }); it('should never emit the Helix-reserved `--` in a folder name (LLMO-5859)', () => { @@ -455,13 +457,16 @@ describe('LLMO Onboarding Functions', () => { 'https://nba.com/a/b/c/d', ]; urls.forEach((url) => { - expect(generateDataFolder(url, 'prod')).to.not.include('--'); - expect(generateDataFolder(url, 'dev')).to.not.include('--'); + expect(generateDataFolder(url, 'prod'), `prod: ${url}`).to.not.include('--'); + expect(generateDataFolder(url, 'dev'), `dev: ${url}`).to.not.include('--'); }); }); it('should handle malformed percent-encoded path segments without throwing', () => { expect(() => generateDataFolder('https://a.com/%FF', 'prod')).to.not.throw(); + // `%FF` is not valid UTF-8, so decode is skipped and the raw segment is + // sanitized to `ff`; assert the concrete result to catch regressions. + expect(generateDataFolder('https://a.com/%FF', 'prod')).to.equal('a-comzsff'); }); it('should normalize percent-encoded path segments', () => {