From 51681b13fe8175cc00232b0defe60b5473c10b1c Mon Sep 17 00:00:00 2001 From: david catalan Date: Tue, 16 Jun 2026 17:38:47 +0200 Subject: [PATCH 1/7] feat(snowflake): add asset-collect.mjs for mechanical asset normalization --- .../snowflake/scripts/asset-collect.mjs | 403 ++++++++++++++++++ 1 file changed, 403 insertions(+) create mode 100644 plugins/aem/edge-delivery-services/skills/snowflake/scripts/asset-collect.mjs diff --git a/plugins/aem/edge-delivery-services/skills/snowflake/scripts/asset-collect.mjs b/plugins/aem/edge-delivery-services/skills/snowflake/scripts/asset-collect.mjs new file mode 100644 index 00000000..a76d50ce --- /dev/null +++ b/plugins/aem/edge-delivery-services/skills/snowflake/scripts/asset-collect.mjs @@ -0,0 +1,403 @@ +#!/usr/bin/env node +/* eslint-disable no-await-in-loop, no-console, no-restricted-syntax, no-continue -- + CLI batch tool: sequential async downloads are intentional (avoid hammering + the source server); console is the output channel; for-of loops are clearer + for ordered side effects; continue is clearer than nested ternaries for + skip paths. */ +/** + * asset-collect.mjs + * + * Mechanical asset normalization for snowflake Phase 1 (Capture). + * Scans index.html for in-scope asset references (raster images, videos, + * fonts), classifies each by reachability, downloads local/unreachable + * assets into normalized subdirectories, rewrites references in index.html, + * and emits asset-manifest.json. + * + * In-scope types: + * images : .png .jpg .jpeg .webp .avif .gif + * videos : .mp4 .webm + * fonts : .otf .woff .woff2 .ttf .eot + * + * Asset strategies (per-asset, not per-run): + * absolute — stable public URL; leave as-is + * vendor — local/unreachable font; download to fonts/ + * da-media — local/unreachable image or video; download to images/ or videos/ + * + * Usage: + * node /scripts/asset-collect.mjs \ + * --input \ + * --base-url \ + * [--dry-run] + * + * Flags: + * --input Directory containing index.html (required) + * --base-url Original page URL for resolving relative refs (required) + * --dry-run Classify without downloading or rewriting; print manifest to stdout + * + * Exit codes: + * 0 Success (collected, no-op, or dry-run completed cleanly) + * 1 Input error (missing flags, missing index.html) + * 2 Fetch failure (hard stop, not recoverable) + * 3 Filesystem error + */ + +import { readFile, writeFile, mkdir, copyFile, access, stat } from 'node:fs/promises'; +import { join, basename, extname, dirname } from 'node:path'; +import { fileURLToPath } from 'node:url'; + +// --------------------------------------------------------------------------- +// Constants +// --------------------------------------------------------------------------- + +const IMAGE_EXTS = new Set(['.png', '.jpg', '.jpeg', '.webp', '.avif', '.gif']); +const VIDEO_EXTS = new Set(['.mp4', '.webm']); +const FONT_EXTS = new Set(['.otf', '.woff', '.woff2', '.ttf', '.eot']); + +const STABLE_CDN_HOSTS = new Set([ + 'fonts.googleapis.com', + 'fonts.gstatic.com', + 'cdn.jsdelivr.net', + 'cdnjs.cloudflare.com', + 'unpkg.com', +]); + +const PRIVATE_IP_RE = /^(localhost|127\.|10\.|192\.168\.|172\.(1[6-9]|2\d|3[01])\.)/; + +const log = (msg) => console.log(`[asset-collect] ${msg}`); +const warn = (msg) => console.warn(`[asset-collect] WARN: ${msg}`); +const die = (msg, code = 1) => { console.error(`[asset-collect] ERROR: ${msg}`); process.exit(code); }; + +// --------------------------------------------------------------------------- +// CLI args +// --------------------------------------------------------------------------- + +/** @returns {{ inputDir: string, baseUrl: string, dryRun: boolean }} */ +function parseArgs() { + const args = process.argv.slice(2); + let inputDir = ''; + let baseUrl = ''; + let dryRun = false; + for (let i = 0; i < args.length; i++) { + if (args[i] === '--input') inputDir = args[++i] ?? ''; + else if (args[i] === '--base-url') baseUrl = args[++i] ?? ''; + else if (args[i] === '--dry-run') dryRun = true; + } + if (!inputDir) die('--input is required'); + if (!baseUrl) die('--base-url is required'); + return { inputDir, baseUrl, dryRun }; +} + +// --------------------------------------------------------------------------- +// Type helpers +// --------------------------------------------------------------------------- + +/** + * @param {string} url + * @returns {'image'|'video'|'font'|null} + */ +function assetType(url) { + const ext = extname(new URL(url, 'http://x').pathname).toLowerCase(); + if (IMAGE_EXTS.has(ext)) return 'image'; + if (VIDEO_EXTS.has(ext)) return 'video'; + if (FONT_EXTS.has(ext)) return 'font'; + return null; +} + +/** + * @param {string} resolvedUrl + * @returns {'local'|'stable-cdn'|'reachable'} + */ +function reachability(resolvedUrl) { + let host; + try { host = new URL(resolvedUrl).hostname; } catch { return 'local'; } + if (PRIVATE_IP_RE.test(host)) return 'local'; + if (STABLE_CDN_HOSTS.has(host)) return 'stable-cdn'; + return 'reachable'; +} + +/** + * @param {'local'|'stable-cdn'|'reachable'} reach + * @param {'image'|'video'|'font'} type + * @returns {'absolute'|'vendor'|'da-media'} + */ +function strategy(reach, type) { + if (reach !== 'local') return 'absolute'; + if (type === 'font') return 'vendor'; + return 'da-media'; +} + +/** + * @param {'image'|'video'|'font'} type + * @returns {string} + */ +function typeDir(type) { + if (type === 'font') return 'fonts'; + if (type === 'video') return 'videos'; + return 'images'; +} + +// --------------------------------------------------------------------------- +// Filename normalization +// --------------------------------------------------------------------------- + +const HASH_RE = /^[0-9a-f]{8,}$/i; + +/** + * Build a normalized filename for a downloaded asset. + * For hash-named fonts, derive from @font-face context. + * + * @param {string} originalUrl - the raw URL as found in HTML + * @param {'image'|'video'|'font'} type + * @param {Map} fontContext - basename → font metadata + * @param {Set} usedNames - already-claimed normalized names (collision guard) + * @returns {string} e.g. "adobe-clean-spectrum-vf.woff2" + */ +function normalizeFilename(originalUrl, type, fontContext, usedNames) { + const urlPath = (() => { + try { return new URL(originalUrl).pathname; } catch { return originalUrl; } + })(); + const base = basename(urlPath); + const ext = extname(base).toLowerCase(); + const stem = base.slice(0, base.length - ext.length); + const cleanStem = stem.toLowerCase().replace(/\s+/g, '-').replace(/[^a-z0-9-]/g, ''); + + let candidate; + if (type === 'font' && HASH_RE.test(cleanStem)) { + const ctx = fontContext.get(base); + if (ctx) { + const familySlug = ctx.family.toLowerCase().replace(/["']/g, '').trim() + .replace(/\s+/g, '-').replace(/[^a-z0-9-]/g, ''); + const italicSuffix = ctx.style === 'italic' ? '-italic' : ''; + candidate = `${familySlug}${italicSuffix}${ext}`; + } + } + if (!candidate) candidate = `${cleanStem || 'asset'}${ext}`; + + // Collision guard + if (!usedNames.has(candidate)) { usedNames.add(candidate); return candidate; } + for (let n = 2; n < 1000; n++) { + const alt = `${candidate.slice(0, candidate.length - ext.length)}-${n}${ext}`; + if (!usedNames.has(alt)) { usedNames.add(alt); return alt; } + } + die(`Cannot resolve filename collision for ${candidate}`, 3); +} + +// --------------------------------------------------------------------------- +// HTML/CSS scanning +// --------------------------------------------------------------------------- + +/** + * Extract @font-face context: basename → { family, style }. + * Used for hash-named font renaming. + * + * @param {string} html + * @returns {Map} + */ +function extractFontContext(html) { + const map = new Map(); + const faceRE = /@font-face\s*\{([^}]+)\}/gi; + let m; + while ((m = faceRE.exec(html)) !== null) { + const block = m[1]; + const familyM = block.match(/font-family\s*:\s*(['"]?)([^;'"]+)\1/i); + const styleM = block.match(/font-style\s*:\s*(\w+)/i); + const srcM = block.match(/url\(['"]?([^'")\s]+)['"]?\)/g); + if (!familyM || !srcM) continue; + const family = familyM[2].trim(); + const style = styleM ? styleM[1].toLowerCase() : 'normal'; + for (const srcEntry of srcM) { + const urlM = srcEntry.match(/url\(['"]?([^'")\s]+)['"]?\)/); + if (urlM) map.set(basename(urlM[1]), { family, style }); + } + } + return map; +} + +/** + * Extract all in-scope asset URLs from HTML text. + * Returns raw URL strings as they appear in the HTML (not resolved). + * + * @param {string} html + * @returns {string[]} + */ +function scanHtml(html) { + const found = new Set(); + + const addIfInScope = (rawUrl) => { + if (!rawUrl || rawUrl.startsWith('data:')) return; + const ext = extname(new URL(rawUrl, 'http://x').pathname).toLowerCase(); + if (IMAGE_EXTS.has(ext) || VIDEO_EXTS.has(ext) || FONT_EXTS.has(ext)) found.add(rawUrl); + }; + + // img src + for (const m of html.matchAll(/]+src=['"]([^'"]+)['"]/gi)) addIfInScope(m[1]); + // img srcset / picture source srcset — "url [descriptor], url [descriptor]" + for (const m of html.matchAll(/srcset=['"]([^'"]+)['"]/gi)) { + for (const part of m[1].split(',')) { + const url = part.trim().split(/\s+/)[0]; + if (url) addIfInScope(url); + } + } + // video source src, video poster + for (const m of html.matchAll(/<(?:source|video)[^>]+src=['"]([^'"]+)['"]/gi)) addIfInScope(m[1]); + for (const m of html.matchAll(/poster=['"]([^'"]+)['"]/gi)) addIfInScope(m[1]); + // inline style: background-image / content url() + for (const m of html.matchAll(/style=['"][^'"]*url\((['"]?)([^'")\s]+)\1\)[^'"]*['"]/gi)) { + addIfInScope(m[2]); + } + // CSS url() inside + `, + 'font.woff2': TINY_WOFF2, + }); + serverRoot = dir; + try { + const r = await run(dir, `http://127.0.0.1:${serverPort}/index.html`); + assert.equal(r.code, 0, r.stderr); + const m = readManifest(dir); + assert.equal(m.stats.total, 1); + assert.equal(m.assets[0].type, 'font'); + assert.equal(m.assets[0].strategy, 'vendor'); + assert.ok(existsSync(join(dir, 'fonts/font.woff2'))); + assert.match(readFileSync(join(dir, 'index.html'), 'utf8'), /fonts\/font\.woff2/); + } finally { + rmSync(dir, { recursive: true, force: true }); + } +}); + +test('hash-named font gets semantic name from @font-face context — italic variant', async () => { + const dir = makeInput({ + 'index.html': ` + + `, + 'abcdef1234567890.woff2': TINY_WOFF2, + }); + serverRoot = dir; + try { + const r = await run(dir, `http://127.0.0.1:${serverPort}/index.html`); + assert.equal(r.code, 0, r.stderr); + const m = readManifest(dir); + assert.ok(existsSync(join(dir, 'fonts/adobe-clean-italic.woff2')), 'expected adobe-clean-italic.woff2'); + assert.equal(m.assets[0].normalizedPath, 'fonts/adobe-clean-italic.woff2'); + } finally { + rmSync(dir, { recursive: true, force: true }); + } +}); + +test('stable CDN assets are left as absolute — no download', async () => { + const dir = makeInput({ + 'index.html': ` + + + + `, + }); + serverRoot = dir; + try { + const r = await run(dir, `http://127.0.0.1:${serverPort}/index.html`); + assert.equal(r.code, 0, r.stderr); + const m = readManifest(dir); + // img.png is SVG via cdn.jsdelivr... but it's a .png so it's in scope + // Both should be absolute (stable-cdn) + for (const a of m.assets) { + assert.equal(a.strategy, 'absolute', `Expected absolute for ${a.originalUrl}`); + assert.equal(a.normalizedPath, null); + } + assert.ok(!existsSync(join(dir, 'fonts')), 'no fonts dir should be created'); + } finally { + rmSync(dir, { recursive: true, force: true }); + } +}); + +test('cross-origin font warning generated for reachable non-CDN font', async () => { + const dir = makeInput({ + 'index.html': ` + + `, + }); + serverRoot = dir; + try { + const r = await run(dir, `http://127.0.0.1:${serverPort}/index.html`); + assert.equal(r.code, 0, r.stderr); + const m = readManifest(dir); + assert.ok( + m.warnings.some((w) => w.includes('cross-origin')), + `Expected cross-origin warning, got: ${JSON.stringify(m.warnings)}`, + ); + } finally { + rmSync(dir, { recursive: true, force: true }); + } +}); + +test('filename collision — second asset gets -2 suffix', async () => { + const dir = makeInput({ + 'index.html': ` + + + `, + 'dir-a/hero.jpg': TINY_PNG, + 'dir-b/hero.jpg': TINY_PNG, + }); + serverRoot = dir; + try { + const r = await run(dir, `http://127.0.0.1:${serverPort}/index.html`); + assert.equal(r.code, 0, r.stderr); + assert.ok(existsSync(join(dir, 'images/hero.jpg'))); + assert.ok(existsSync(join(dir, 'images/hero-2.jpg'))); + } finally { + rmSync(dir, { recursive: true, force: true }); + } +}); + +test('idempotent re-run — second call is a no-op', async () => { + const dir = makeInput({ + 'index.html': ``, + 'photo.gif': TINY_PNG, + }); + serverRoot = dir; + try { + const r1 = await run(dir, `http://127.0.0.1:${serverPort}/index.html`); + assert.equal(r1.code, 0, r1.stderr); + const htmlAfter1 = readFileSync(join(dir, 'index.html'), 'utf8'); + const manifestAfter1 = readFileSync(join(dir, 'asset-manifest.json'), 'utf8'); + + const r2 = await run(dir, `http://127.0.0.1:${serverPort}/index.html`); + assert.equal(r2.code, 0, r2.stderr); + assert.match(r2.stdout, /already collected/); + assert.equal(readFileSync(join(dir, 'index.html'), 'utf8'), htmlAfter1, 'index.html changed on second run'); + assert.equal(readFileSync(join(dir, 'asset-manifest.json'), 'utf8'), manifestAfter1, 'manifest changed on second run'); + } finally { + rmSync(dir, { recursive: true, force: true }); + } +}); + +test('--dry-run — no files written, manifest on stdout', async () => { + const dir = makeInput({ + 'index.html': ``, + }); + serverRoot = dir; + try { + const r = await run(dir, `http://127.0.0.1:${serverPort}/index.html`, ['--dry-run']); + assert.equal(r.code, 0, r.stderr); + // stdout contains log line + JSON — find the JSON block + const jsonStart = r.stdout.indexOf('{'); + const manifest = JSON.parse(r.stdout.slice(jsonStart)); + assert.ok(manifest.assets, 'no assets in dry-run output'); + assert.ok(!existsSync(join(dir, 'asset-manifest.json')), 'manifest written in dry-run'); + assert.ok(!existsSync(join(dir, 'images')), 'images dir created in dry-run'); + assert.doesNotMatch( + readFileSync(join(dir, 'index.html'), 'utf8'), + /images\//, + 'index.html rewritten in dry-run', + ); + } finally { + rmSync(dir, { recursive: true, force: true }); + } +}); + +test('SVGs are ignored (out of scope)', async () => { + const dir = makeInput({ + 'index.html': ` + + + `, + 'icon.svg': '', + 'photo.png': TINY_PNG, + }); + serverRoot = dir; + try { + const r = await run(dir, `http://127.0.0.1:${serverPort}/index.html`); + assert.equal(r.code, 0, r.stderr); + const m = readManifest(dir); + assert.equal(m.stats.total, 1, 'SVG should not be counted'); + assert.equal(m.assets[0].normalizedPath, 'images/photo.png'); + } finally { + rmSync(dir, { recursive: true, force: true }); + } +}); + +test('all three CSS url() quote forms handled', async () => { + const dir = makeInput({ + 'index.html': ` + + `, + 'unquoted.gif': TINY_PNG, + 'single.gif': TINY_PNG, + 'double.gif': TINY_PNG, + }); + serverRoot = dir; + try { + const r = await run(dir, `http://127.0.0.1:${serverPort}/index.html`); + assert.equal(r.code, 0, r.stderr); + const m = readManifest(dir); + assert.equal(m.stats.total, 3, `Expected 3 assets, got ${m.stats.total}`); + assert.ok(existsSync(join(dir, 'images/unquoted.gif'))); + assert.ok(existsSync(join(dir, 'images/single.gif'))); + assert.ok(existsSync(join(dir, 'images/double.gif'))); + } finally { + rmSync(dir, { recursive: true, force: true }); + } +}); From 048bbd18d5bb2f5022bb2b274ff6e48c5ee6ccee Mon Sep 17 00:00:00 2001 From: david catalan Date: Tue, 16 Jun 2026 17:58:26 +0200 Subject: [PATCH 3/7] docs(snowflake): update phases and methodology for per-asset strategy via asset-collect.mjs - Phase 1: replace grep-based discovery with asset-collect.mjs call - Phase 2: read asset-manifest.json instead of manual classification; update decisions.json shape (assetManifest replaces assetStrategy/assetBase/vendorAssetsTo) - Phase 3: simplify 3.7 (paths pre-normalized); update B.3 deploy commands - methodology.md: rewrite asset strategy from per-run to per-asset model --- .../skills/snowflake/knowledge/methodology.md | 78 ++++++++--------- .../skills/snowflake/phases/1-capture.md | 60 ++++++------- .../skills/snowflake/phases/2-analyze.md | 39 +++++---- .../skills/snowflake/phases/3-generate.md | 87 +++++++++++-------- 4 files changed, 137 insertions(+), 127 deletions(-) diff --git a/plugins/aem/edge-delivery-services/skills/snowflake/knowledge/methodology.md b/plugins/aem/edge-delivery-services/skills/snowflake/knowledge/methodology.md index 85b7f258..62a42212 100644 --- a/plugins/aem/edge-delivery-services/skills/snowflake/knowledge/methodology.md +++ b/plugins/aem/edge-delivery-services/skills/snowflake/knowledge/methodology.md @@ -86,46 +86,44 @@ output/ └── da/.html ← DA-source body fragment ``` -**Rewrite relative asset paths.** When a source uses relative paths -like `assets/photos/foo.jpg`, `url(./images/bar.png)`, -``, they resolve against our serving -host (`localhost:3000/drafts/...` or -`----.aem.page//...`) — where they 404. -They need rewriting to one of three target forms, depending on -**asset strategy** (recorded in `decisions.json["assetStrategy"]`): - -1. **`absolute`** (default for publicly hosted sources): - `https:///path/to/assets/...`. Source host serves the - binaries directly; EDS preview sideloads any `` URL into - Media Bus on first preview (see da-content `media.md` §2). -2. **`vendor`** (local-only source, accepted repo size impact): - Copy the asset tree into `./assets/` in the repo. Template / - fragment / page-CSS refs become root-relative `/assets/...`. DA - cell refs use absolute branch URLs - (`https://----.aem.page/assets/...`). -3. **`da-media`** (cleanest long-term): - Upload binaries to DA `/media//` via the bundled - `/scripts/da-media-upload.mjs` script. Template / - fragment / page-CSS / DA cell refs all use - `https://content.da.live///media//`. The - uploader emits a `media-mapping.json` of local-path → - content.da.live URL that Generate consumes for the rewrites. - -| Aspect | absolute | vendor | da-media | -|---------------------------|-------------|--------------|--------------| -| Repo size | unchanged | +N MB | unchanged | -| Branch-independent assets | N/A | No | **Yes** | -| Local-only source | No | Yes | Yes | -| Initial-run effort | none | curl+sed | uploader | -| Tooling required | none | none | bundled `.mjs` | -| Reusable across runs | N/A | per-run | **Yes** | -| DA-cell image URL form | source host | branch URL | content.da.live | -| Delivered image URL | `./media_` (sideload) | `./media_` | `./media_` | - -For fonts specifically, even under `da-media`, place font files in Code -Bus `/fonts/.woff2` (or `.otf`) per da-content `media.md` -§13.2 decision tree. Fonts upload would be a DA media-bus mismatch -(SVG/PNG/JPG/MP4 are Media Bus; fonts are Content Bus). +**Asset handling is per-asset, decided mechanically.** The +`asset-collect.mjs` script (run in Phase 1) scans `index.html` for +in-scope asset references — raster images, videos, and fonts — and +assigns a per-asset strategy based on reachability and type: + +| Asset reachability | Asset type | Strategy | Destination | +|---|---|---|---| +| Stable public URL (any CDN or publicly reachable host) | any | `absolute` — leave URL as-is | no download | +| Local / unreachable | font | `vendor` — git repo (Code Bus) | `input/fonts/` → `/fonts/` | +| Local / unreachable | image, video | `da-media` — DA Media Bus | `input/images/` or `input/videos/` → upload in Phase 5 | + +**Never vendor images or videos into the git repo.** Binary content +assets belong in DA Media Bus, not Code Bus. Only code assets (fonts, +SVGs under 40KB, scripts, stylesheets) go in the git repo. + +The script downloads assets, normalizes filenames, and rewrites +references in `index.html` to relative paths (`fonts/foo.woff2`, +`images/hero.jpg`). Phase 3 transforms these to root-relative +(`/fonts/foo.woff2`) for template/fragment/CSS, and to absolute branch +URLs for DA cells: + +- **Template / fragment / CSS**: root-relative paths (`/fonts/...`, + `/assets/images/...`). Browser resolves against code-bus host. +- **DA cell image refs**: absolute branch URLs + (`https://----.aem.page/assets/images/...`). + Media Bus resolves against `content.da.live`, not code-bus — root- + relative paths produce `about:error` there. + +The `da-media-upload.mjs` script (Phase 5) uploads `da-media` assets +to DA and emits final `content.da.live` URLs; DA cells should be +updated to those final URLs after upload. + +| Aspect | absolute | vendor | da-media | +|---|---|---|---| +| Asset types | any | fonts only | images, videos | +| Repo size | unchanged | +font files (small) | unchanged | +| DA-cell URL form | source host (leave as-is) | branch URL | content.da.live (after upload) | +| Delivered image URL | `./media_` (sideload) | `./media_` | `./media_` | This applies to template HTML, fragment HTML, DA cell values referencing images, and any CSS `url()` references. diff --git a/plugins/aem/edge-delivery-services/skills/snowflake/phases/1-capture.md b/plugins/aem/edge-delivery-services/skills/snowflake/phases/1-capture.md index b5d73cca..2e8b71e1 100644 --- a/plugins/aem/edge-delivery-services/skills/snowflake/phases/1-capture.md +++ b/plugins/aem/edge-delivery-services/skills/snowflake/phases/1-capture.md @@ -101,45 +101,33 @@ curl -fsS "$SOURCE_URL" -o "${PROJ}/input/index.html" Validate: file size > 0, response was HTML (look for `` — external CSS -- `