Skip to content
Open
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
38 changes: 26 additions & 12 deletions src/controllers/llmo/llmo-onboarding.js
Original file line number Diff line number Diff line change
Expand Up @@ -322,19 +322,28 @@ export async function ensureInitialCustomerConfigV2({
* percent-decoded and individually sanitized: runs of non-alphanumeric characters
* are replaced with a single `-`, leading/trailing `-` are trimmed, and the
* result is lowercased. Segments that reduce to empty after sanitization are
* dropped. Sanitized parts are joined with `--` as the path-segment delimiter.
* dropped.
*
* The `--` delimiter cannot appear inside a sanitized segment (any run of
* non-alphanumeric characters collapses to a single `-`), so URLs that differ
* in path structure produce distinct folder names. Path segments differing only
* in punctuation (e.g. `us-kings` vs `us_kings`) produce the same sanitized
* segment and therefore the same folder; this is an inherent limitation of
* lossy sanitization.
* Helix/AEM reserves the double-dash for its `ref--repo--owner` host convention
* and rejects any resource path containing `--` with HTTP 400 (LLMO-5859), so the
* host/segment boundary cannot be a dash. Instead each sanitized part has its
* marker letter self-escaped (`z` -> `zz`) and the parts are joined with `zs`,
* a Helix-safe token marking the `/` path boundary. Because a literal `z` is
* always doubled, a lone `z` can only introduce a structural token, so `zs` can
* never appear by accident inside a part: the encoding is unambiguous and
* reversible (`zz` -> `z`, `zs` -> `/`).
*
* This keeps the path boundary distinguishable from a `.` (which sanitizes to
* `-`): `nba.com/com` -> `nba-comzscom` stays distinct from `nba.com.com` ->
* `nba-com-com`. Sanitization is still lossy *within* a single segment, so
* segments differing only in punctuation (e.g. `us-kings` vs `us_kings`) collapse
* to the same folder; that is an inherent limitation of lossy per-segment
* sanitization, unchanged from the prior scheme.
*
* Examples:
* https://nba.com -> nba-com
* https://nba.com/kings -> nba-com--kings
* https://nba.com/us/kings -> nba-com--us--kings
* https://nba.com/kings -> nba-comzskings
* https://nba.com/us/kings -> nba-comzsuszskings
*
* @param {string} baseURL - The site's base URL (must be a fully-qualified URL).
* @param {string} env - The environment ('prod' for production, anything else is
Expand All @@ -347,17 +356,22 @@ export function generateDataFolder(baseURL, env = 'dev') {
throw new TypeError('Invalid baseURL: hostname is required');
}
const sanitize = (s) => s.replace(/[^a-zA-Z0-9]+/g, '-').replace(/^-+|-+$/g, '').toLowerCase();
const host = sanitize(url.hostname);
// Self-escape the boundary marker letter so a lone `z` can only ever introduce
// the `zs` path-boundary token (see join below). Run on already-sanitized,
// lowercased parts.
const escapeMarker = (s) => s.replace(/z/g, 'zz');
const host = escapeMarker(sanitize(url.hostname));
const segments = url.pathname.split('/').filter(Boolean)
.map((seg) => {
let decoded = seg;
try {
decoded = decodeURIComponent(seg);
} catch { /* keep raw on percent-encoded sequences that are not valid UTF-8 */ }
return sanitize(decoded);
return escapeMarker(sanitize(decoded));
})
.filter(Boolean);
const dataFolderName = segments.length > 0 ? `${host}--${segments.join('--')}` : host;
// Join host + path segments with `zs`, the Helix-safe marker for the `/` boundary.
const dataFolderName = [host, ...segments].join('zs');
return env === 'prod' ? dataFolderName : `dev/${dataFolderName}`;
}

Expand Down
75 changes: 62 additions & 13 deletions test/controllers/llmo/llmo-onboarding.test.js
Original file line number Diff line number Diff line change
Expand Up @@ -377,9 +377,9 @@ describe('LLMO Onboarding Functions', () => {
});

it('should generate unique folder names for subpath sites on the same domain', () => {
expect(generateDataFolder('https://nba.com/kings', 'prod')).to.equal('nba-com--kings');
expect(generateDataFolder('https://nba.com/lakers', 'prod')).to.equal('nba-com--lakers');
expect(generateDataFolder('https://nba.com/kings', 'dev')).to.equal('dev/nba-com--kings');
expect(generateDataFolder('https://nba.com/kings', 'prod')).to.equal('nba-comzskings');
expect(generateDataFolder('https://nba.com/lakers', 'prod')).to.equal('nba-comzslakers');
expect(generateDataFolder('https://nba.com/kings', 'dev')).to.equal('dev/nba-comzskings');
});

it('should produce the same folder name for root domain with or without trailing slash', () => {
Expand All @@ -393,24 +393,73 @@ describe('LLMO Onboarding Functions', () => {
});

it('should generate correct folder name for nested subpaths', () => {
expect(generateDataFolder('https://nba.com/us/kings', 'prod')).to.equal('nba-com--us--kings');
expect(generateDataFolder('https://nba.com/us/kings', 'dev')).to.equal('dev/nba-com--us--kings');
expect(generateDataFolder('https://nba.com/us/kings', 'prod')).to.equal('nba-comzsuszskings');
expect(generateDataFolder('https://nba.com/us/kings', 'dev')).to.equal('dev/nba-comzsuszskings');
});

it('should generate distinct folder names for paths that differ only by separator type', () => {
it('should keep the path boundary distinguishable from a sanitized dash (LLMO-5859)', () => {
// The `zs` boundary marker is distinct from the `-` that punctuation
// sanitizes to, so a real path split is not confused with an in-segment dash:
// `/us/kings` (two segments) differs from `/us-kings` (one segment).
expect(generateDataFolder('https://nba.com/us/kings', 'prod')).to.equal('nba-comzsuszskings');
expect(generateDataFolder('https://nba.com/us-kings', 'prod')).to.equal('nba-comzsus-kings');
expect(generateDataFolder('https://nba.com/us/kings', 'prod'))
.to.not.equal(generateDataFolder('https://nba.com/us-kings', 'prod'));
});

it('should still collapse separator variants within a single segment (lossy sanitize)', () => {
// Sanitization remains lossy inside one segment, so punctuation-only variants
// of the same segment collapse -- an inherent limit unchanged from before.
const expected = 'nba-comzsus-kings';
expect(generateDataFolder('https://nba.com/us-kings', 'prod')).to.equal(expected);
expect(generateDataFolder('https://nba.com/us..kings', 'prod')).to.equal(expected);
expect(generateDataFolder('https://nba.com/us--kings', 'prod')).to.equal(expected);
});

it('should still separate same-host sites with genuinely different paths', () => {
// The real LLMO-4186 case: different path content -> different folders.
expect(generateDataFolder('https://nba.com/kings', 'prod'))
.to.not.equal(generateDataFolder('https://nba.com/lakers', 'prod'));
expect(generateDataFolder('https://nba.com/us/kings', 'prod'))
.to.not.equal(generateDataFolder('https://nba.com/us..kings', 'prod'));
expect(generateDataFolder('https://nba.com/us/kings', 'prod'))
.to.not.equal(generateDataFolder('https://nba.com/us--kings', 'prod'));
.to.not.equal(generateDataFolder('https://nba.com/eu/kings', 'prod'));
});

it('should not collide hostname with consecutive non-alnum chars and a subpath', () => {
it('should distinguish a hostname with consecutive non-alnum chars from a host/subpath split', () => {
// `nba--com` (host; `--` collapses to `-`) -> `nba-com`, whereas `nba` + `/com`
// -> `nbazscom`. The `zs` boundary keeps these distinct (single-dash could not).
expect(generateDataFolder('https://nba--com/', 'prod')).to.equal('nba-com');
expect(generateDataFolder('https://nba/com', 'prod')).to.equal('nbazscom');
expect(generateDataFolder('https://nba--com/', 'prod'))
.to.not.equal(generateDataFolder('https://nba/com', 'prod'));
});

it('should self-escape the marker letter `z` in hosts and segments (LLMO-5859)', () => {
// A literal `z` is doubled so it can never be read as a boundary token.
expect(generateDataFolder('https://amazon.com', 'prod')).to.equal('amazzon-com');
expect(generateDataFolder('https://nba.com/zone', 'prod')).to.equal('nba-comzszzone');
// A segment that literally spells the boundary token stays unambiguous.
expect(generateDataFolder('https://nba.com/zs', 'prod')).to.equal('nba-comzszzs');
expect(generateDataFolder('https://nba.com/zs', 'prod'))
.to.not.equal(generateDataFolder('https://nba.com/z/s', 'prod'));
});

it('should never emit the Helix-reserved `--` in a folder name (LLMO-5859)', () => {
// Helix 400s any resource path containing `--`, so the derivation must never
// produce it -- including for hosts or paths that themselves contain `--`.
const urls = [
'https://nba.com',
'https://nba.com/kings',
'https://nba.com/us/kings',
'https://nba--com/double/dash/host',
'https://nba.com/us--kings',
'https://nba.com/a/b/c/d',
];
urls.forEach((url) => {
expect(generateDataFolder(url, 'prod')).to.not.include('--');
expect(generateDataFolder(url, 'dev')).to.not.include('--');
});
});

it('should handle malformed percent-encoded path segments without throwing', () => {
expect(() => generateDataFolder('https://a.com/%FF', 'prod')).to.not.throw();
});
Expand All @@ -419,7 +468,7 @@ describe('LLMO Onboarding Functions', () => {
expect(generateDataFolder('https://nba.com/k%C3%B6nig', 'prod'))
.to.equal(generateDataFolder('https://nba.com/könig', 'prod'));
expect(generateDataFolder('https://nba.com/k%C3%B6nig', 'prod'))
.to.match(/^nba-com--/);
.to.match(/^nba-comzs/);
});

it('should case-fold path segments so /Kings and /kings resolve to the same folder', () => {
Expand All @@ -434,9 +483,9 @@ describe('LLMO Onboarding Functions', () => {

it('should ignore query strings and URL fragments', () => {
expect(generateDataFolder('https://nba.com/kings?utm=foo', 'prod'))
.to.equal('nba-com--kings');
.to.equal('nba-comzskings');
expect(generateDataFolder('https://nba.com/kings#section', 'prod'))
.to.equal('nba-com--kings');
.to.equal('nba-comzskings');
});

it('should drop degenerate path segments that sanitize to empty', () => {
Expand Down
Loading