diff --git a/src/controllers/llmo/llmo-onboarding.js b/src/controllers/llmo/llmo-onboarding.js index 80fd903e82..922728449b 100644 --- a/src/controllers/llmo/llmo-onboarding.js +++ b/src/controllers/llmo/llmo-onboarding.js @@ -322,19 +322,31 @@ export async function ensureInitialCustomerConfigV2({ * percent-decoded and individually sanitized: runs of non-alphanumeric characters * are replaced with a single `-`, leading/trailing `-` are trimmed, and the * result is lowercased. Segments that reduce to empty after sanitization are - * dropped. Sanitized parts are joined with `--` as the path-segment delimiter. + * dropped. * - * The `--` delimiter cannot appear inside a sanitized segment (any run of - * non-alphanumeric characters collapses to a single `-`), so URLs that differ - * in path structure produce distinct folder names. Path segments differing only - * in punctuation (e.g. `us-kings` vs `us_kings`) produce the same sanitized - * segment and therefore the same folder; this is an inherent limitation of - * lossy sanitization. + * Helix/AEM reserves the double-dash for its `ref--repo--owner` host convention + * and rejects any resource path containing `--` with HTTP 400 (LLMO-5859), so the + * host/segment boundary cannot be a dash. Instead each sanitized part has its + * marker letter self-escaped (`z` -> `zz`) and the parts are joined with `zs`, + * a Helix-safe token marking the `/` path boundary. Because a literal `z` is + * always doubled, a lone `z` can only introduce a structural token, so `zs` can + * never appear by accident inside a part: the encoding is unambiguous and + * reversible by a left-to-right scanner that, on each `z`, consumes the next + * character to decide escape-vs-boundary (`zz` -> `z`, `zs` -> `/`). A naive + * `String.split('zs')` is NOT a correct decoder: a part whose content contains + * `zs` is stored as `zzs`, which a blind split would wrongly break apart. + * + * This keeps the path boundary distinguishable from a `.` (which sanitizes to + * `-`): `nba.com/com` -> `nba-comzscom` stays distinct from `nba.com.com` -> + * `nba-com-com`. Sanitization is still lossy *within* a single segment, so + * segments differing only in punctuation (e.g. `us-kings` vs `us_kings`) collapse + * to the same folder; that is an inherent limitation of lossy per-segment + * sanitization, unchanged from the prior scheme. * * Examples: * https://nba.com -> nba-com - * https://nba.com/kings -> nba-com--kings - * https://nba.com/us/kings -> nba-com--us--kings + * https://nba.com/kings -> nba-comzskings + * https://nba.com/us/kings -> nba-comzsuszskings * * @param {string} baseURL - The site's base URL (must be a fully-qualified URL). * @param {string} env - The environment ('prod' for production, anything else is @@ -347,17 +359,22 @@ export function generateDataFolder(baseURL, env = 'dev') { throw new TypeError('Invalid baseURL: hostname is required'); } const sanitize = (s) => s.replace(/[^a-zA-Z0-9]+/g, '-').replace(/^-+|-+$/g, '').toLowerCase(); - const host = sanitize(url.hostname); + // Self-escape the boundary marker letter so a lone `z` can only ever introduce + // the `zs` path-boundary token (see join below). Run on already-sanitized, + // lowercased parts. + const escapeZ = (s) => s.replace(/z/g, 'zz'); + const host = escapeZ(sanitize(url.hostname)); const segments = url.pathname.split('/').filter(Boolean) .map((seg) => { let decoded = seg; try { decoded = decodeURIComponent(seg); } catch { /* keep raw on percent-encoded sequences that are not valid UTF-8 */ } - return sanitize(decoded); + return escapeZ(sanitize(decoded)); }) .filter(Boolean); - const dataFolderName = segments.length > 0 ? `${host}--${segments.join('--')}` : host; + // Join host + path segments with `zs`, the Helix-safe marker for the `/` boundary. + const dataFolderName = [host, ...segments].join('zs'); return env === 'prod' ? dataFolderName : `dev/${dataFolderName}`; } @@ -1691,7 +1708,13 @@ export async function performLlmoOffboarding(site, config, context) { const baseURL = site.getBaseURL(); const llmoConfig = config.getLlmoConfig(); - // Check if site has LLMO config with data folder, if not calculate it + // Check if site has LLMO config with data folder, if not calculate it. + // NOTE: re-deriving here assumes the folder was created with the CURRENT + // generateDataFolder scheme. A site onboarded under an older scheme whose + // stored dataFolder is missing could derive a name that does not match its + // actual SharePoint folder (so deleteSharePointFolder below would miss it). + // Onboarded sites always persist dataFolder, so this fallback should not fire + // in practice (prod scan confirmed zero affected, LLMO-5859). let dataFolder = llmoConfig?.dataFolder; if (!dataFolder) { log.debug(`Data folder not found in LLMO config, calculating from base URL: ${baseURL}`); diff --git a/test/controllers/llmo/llmo-onboarding.test.js b/test/controllers/llmo/llmo-onboarding.test.js index f7ff17aa77..91672d9826 100644 --- a/test/controllers/llmo/llmo-onboarding.test.js +++ b/test/controllers/llmo/llmo-onboarding.test.js @@ -377,9 +377,9 @@ describe('LLMO Onboarding Functions', () => { }); it('should generate unique folder names for subpath sites on the same domain', () => { - expect(generateDataFolder('https://nba.com/kings', 'prod')).to.equal('nba-com--kings'); - expect(generateDataFolder('https://nba.com/lakers', 'prod')).to.equal('nba-com--lakers'); - expect(generateDataFolder('https://nba.com/kings', 'dev')).to.equal('dev/nba-com--kings'); + expect(generateDataFolder('https://nba.com/kings', 'prod')).to.equal('nba-comzskings'); + expect(generateDataFolder('https://nba.com/lakers', 'prod')).to.equal('nba-comzslakers'); + expect(generateDataFolder('https://nba.com/kings', 'dev')).to.equal('dev/nba-comzskings'); }); it('should produce the same folder name for root domain with or without trailing slash', () => { @@ -393,33 +393,87 @@ describe('LLMO Onboarding Functions', () => { }); it('should generate correct folder name for nested subpaths', () => { - expect(generateDataFolder('https://nba.com/us/kings', 'prod')).to.equal('nba-com--us--kings'); - expect(generateDataFolder('https://nba.com/us/kings', 'dev')).to.equal('dev/nba-com--us--kings'); + expect(generateDataFolder('https://nba.com/us/kings', 'prod')).to.equal('nba-comzsuszskings'); + expect(generateDataFolder('https://nba.com/us/kings', 'dev')).to.equal('dev/nba-comzsuszskings'); }); - it('should generate distinct folder names for paths that differ only by separator type', () => { + it('should keep the path boundary distinguishable from a sanitized dash (LLMO-5859)', () => { + // The `zs` boundary marker is distinct from the `-` that punctuation + // sanitizes to, so a real path split is not confused with an in-segment dash: + // `/us/kings` (two segments) differs from `/us-kings` (one segment). + expect(generateDataFolder('https://nba.com/us/kings', 'prod')).to.equal('nba-comzsuszskings'); + expect(generateDataFolder('https://nba.com/us-kings', 'prod')).to.equal('nba-comzsus-kings'); expect(generateDataFolder('https://nba.com/us/kings', 'prod')) .to.not.equal(generateDataFolder('https://nba.com/us-kings', 'prod')); + }); + + it('should still collapse separator variants within a single segment (lossy sanitize)', () => { + // Sanitization remains lossy inside one segment, so punctuation-only variants + // of the same segment collapse -- an inherent limit unchanged from before. + const expected = 'nba-comzsus-kings'; + expect(generateDataFolder('https://nba.com/us-kings', 'prod')).to.equal(expected); + expect(generateDataFolder('https://nba.com/us..kings', 'prod')).to.equal(expected); + expect(generateDataFolder('https://nba.com/us--kings', 'prod')).to.equal(expected); + }); + + it('should still separate same-host sites with genuinely different paths', () => { + // The real LLMO-4186 case: different path content -> different folders. + expect(generateDataFolder('https://nba.com/kings', 'prod')) + .to.not.equal(generateDataFolder('https://nba.com/lakers', 'prod')); expect(generateDataFolder('https://nba.com/us/kings', 'prod')) - .to.not.equal(generateDataFolder('https://nba.com/us..kings', 'prod')); - expect(generateDataFolder('https://nba.com/us/kings', 'prod')) - .to.not.equal(generateDataFolder('https://nba.com/us--kings', 'prod')); + .to.not.equal(generateDataFolder('https://nba.com/eu/kings', 'prod')); }); - it('should not collide hostname with consecutive non-alnum chars and a subpath', () => { + it('should distinguish a hostname with consecutive non-alnum chars from a host/subpath split', () => { + // `nba--com` (host; `--` collapses to `-`) -> `nba-com`, whereas `nba` + `/com` + // -> `nbazscom`. The `zs` boundary keeps these distinct (single-dash could not). + expect(generateDataFolder('https://nba--com/', 'prod')).to.equal('nba-com'); + expect(generateDataFolder('https://nba/com', 'prod')).to.equal('nbazscom'); expect(generateDataFolder('https://nba--com/', 'prod')) .to.not.equal(generateDataFolder('https://nba/com', 'prod')); }); + it('should self-escape the marker letter `z` in hosts and segments (LLMO-5859)', () => { + // A literal `z` is doubled so it can never be read as a boundary token. + expect(generateDataFolder('https://amazon.com', 'prod')).to.equal('amazzon-com'); + expect(generateDataFolder('https://nba.com/zone', 'prod')).to.equal('nba-comzszzone'); + // A segment that literally spells the boundary token stays unambiguous. + expect(generateDataFolder('https://nba.com/zs', 'prod')).to.equal('nba-comzszzs'); + expect(generateDataFolder('https://nba.com/zs', 'prod')) + .to.not.equal(generateDataFolder('https://nba.com/z/s', 'prod')); + // Host starting with the marker letter, immediately adjacent to the boundary. + expect(generateDataFolder('https://zsecurity.com/page', 'prod')).to.equal('zzsecurity-comzspage'); + }); + + it('should never emit the Helix-reserved `--` in a folder name (LLMO-5859)', () => { + // Helix 400s any resource path containing `--`, so the derivation must never + // produce it -- including for hosts or paths that themselves contain `--`. + const urls = [ + 'https://nba.com', + 'https://nba.com/kings', + 'https://nba.com/us/kings', + 'https://nba--com/double/dash/host', + 'https://nba.com/us--kings', + 'https://nba.com/a/b/c/d', + ]; + urls.forEach((url) => { + expect(generateDataFolder(url, 'prod'), `prod: ${url}`).to.not.include('--'); + expect(generateDataFolder(url, 'dev'), `dev: ${url}`).to.not.include('--'); + }); + }); + it('should handle malformed percent-encoded path segments without throwing', () => { expect(() => generateDataFolder('https://a.com/%FF', 'prod')).to.not.throw(); + // `%FF` is not valid UTF-8, so decode is skipped and the raw segment is + // sanitized to `ff`; assert the concrete result to catch regressions. + expect(generateDataFolder('https://a.com/%FF', 'prod')).to.equal('a-comzsff'); }); it('should normalize percent-encoded path segments', () => { expect(generateDataFolder('https://nba.com/k%C3%B6nig', 'prod')) .to.equal(generateDataFolder('https://nba.com/könig', 'prod')); expect(generateDataFolder('https://nba.com/k%C3%B6nig', 'prod')) - .to.match(/^nba-com--/); + .to.match(/^nba-comzs/); }); it('should case-fold path segments so /Kings and /kings resolve to the same folder', () => { @@ -434,9 +488,9 @@ describe('LLMO Onboarding Functions', () => { it('should ignore query strings and URL fragments', () => { expect(generateDataFolder('https://nba.com/kings?utm=foo', 'prod')) - .to.equal('nba-com--kings'); + .to.equal('nba-comzskings'); expect(generateDataFolder('https://nba.com/kings#section', 'prod')) - .to.equal('nba-com--kings'); + .to.equal('nba-comzskings'); }); it('should drop degenerate path segments that sanitize to empty', () => {