adobe · habansal · Jun 25, 2026 · Jun 26, 2026 · Jun 26, 2026 · Jun 26, 2026
diff --git a/.gitignore b/.gitignore
@@ -1,6 +1,6 @@
 coverage
 .nyc_output/
-node_modules/
+node_modules
 junit
 dist
 tmp

diff --git a/src/controllers/bot-blocker.js b/src/controllers/bot-blocker.js
@@ -11,12 +11,13 @@
  */
 
 import {
-  isNonEmptyObject, isValidUUID, detectBotBlocker,
+  isNonEmptyObject, isValidUUID,
 } from '@adobe/spacecat-shared-utils';
 import {
   badRequest, internalServerError, notFound, ok, forbidden,
 } from '@adobe/spacecat-shared-http-utils';
 import AccessControlUtil from '../support/access-control-util.js';
+import { detectBotBlockerMultiClient } from '../support/bot-blocker-multi-client.js';
 
 /**
  * Creates a bot blocker controller instance
@@ -83,7 +84,10 @@ function BotBlockerController(ctx, log) {
 
       log.debug(`Checking bot blocker for site ${siteId} with baseURL: ${baseURL}, customHeaders: ${Object.keys(customHeaders).join(',') || 'none'}`);
 
-      const result = await detectBotBlocker({ baseUrl: baseURL, headers: customHeaders });
+      const result = await detectBotBlockerMultiClient(
+        { baseUrl: baseURL, headers: customHeaders },
+        { log },
+      );
 
       log.debug(`Bot blocker check completed for site ${siteId}: crawlable=${result.crawlable}, type=${result.type}, confidence=${result.confidence}`);
 

diff --git a/src/controllers/plg/plg-onboarding/onboarding-flow.js b/src/controllers/plg/plg-onboarding/onboarding-flow.js
@@ -13,6 +13,7 @@
 import { Site as SiteModel } from '@adobe/spacecat-shared-data-access';
 import { hasText } from '@adobe/spacecat-shared-utils';
 import { cleanupPlgSiteSuggestionsAndFixes } from '../plg-onboarding-cleanup.js';
+import { detectBotBlockerMultiClient } from '../../../support/bot-blocker-multi-client.js';
 import { updateRumConfig } from '../../../support/rum-config-service.js';
 import { hasActiveSuggestions } from './displacement.js';
 import {
@@ -593,8 +594,15 @@ export async function performAsoPlgOnboarding({
       }
     }
 
-    // Step 4: Bot blocker check
-    const botBlockerResult = await detectBotBlocker({ baseUrl: baseURL });
+    // Step 4: Bot blocker check. Probe with multiple HTTP clients (@adobe/fetch +
+    // undici) because Cloudflare blocks on client fingerprint — a single-client probe
+    // can report "crawlable" while the clients our audits actually use are blocked
+    // (SITES-47217). `detectBotBlocker` is injected via context for testability and is
+    // forwarded as the @adobe/fetch probe.
+    const botBlockerResult = await detectBotBlockerMultiClient(
+      { baseUrl: baseURL },
+      { log, detectBotBlockerFn: detectBotBlocker },
+    );
     if (!botBlockerResult.crawlable) {
       if (site) {
         await site.save();

diff --git a/src/support/bot-blocker-multi-client.js b/src/support/bot-blocker-multi-client.js
@@ -0,0 +1,153 @@
+/*
+ * Copyright 2026 Adobe. All rights reserved.
+ * This file is licensed to you under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License. You may obtain a copy
+ * of the License at http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software distributed under
+ * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR REPRESENTATIONS
+ * OF ANY KIND, either express or implied. See the License for the specific language
+ * governing permissions and limitations under the License.
+ */
+
+import {
+  detectBotBlocker, analyzeBotProtection, SPACECAT_USER_AGENT,
+} from '@adobe/spacecat-shared-utils';
+
+const PROBE_TIMEOUT_MS = 10000;
+// Bound the body read the same way the shared detectBotBlocker does: skip the body
+// when Content-Length is large, and race the read against a short timeout so a slow
+// (or unbounded chunked) response can never hang or balloon memory.
+const BODY_READ_MAX_BYTES = 65536; // 64 KB — challenge markers appear in the first KB
+const BODY_READ_TIMEOUT_MS = 3000;
+
+/**
+ * Probes a URL with Node's native fetch (undici) and classifies the response.
+ *
+ * undici is the HTTP client used by CWV liveness, preflight, site-detection, and the
+ * import-worker. Cloudflare Bot Management fingerprints the client (TLS/HTTP, JA3/JA4),
+ * so a site can allow the @adobe/fetch client while blocking undici (and headless
+ * Chrome). We send the same User-Agent the @adobe/fetch probe uses so the ONLY
+ * difference between the two probes is the client itself.
+ *
+ * A request we cannot complete (timeout/network) is reported as inconclusive
+ * (crawlable, low confidence) rather than blocked — we only assert a block when the
+ * response actually classifies as one.
+ *
+ * @param {string} baseUrl - URL to probe.
+ * @param {Object} headers - Optional extra headers (e.g. site scraper headers).
+ * @param {Object} log - Logger.
+ * @param {Function} fetchFn - fetch-compatible function used to issue the request.
+ * @returns {Promise<Object>} analyzeBotProtection result { crawlable, type, confidence }.
+ */
+async function probeWithUndici(baseUrl, headers, log, fetchFn) {
+  // One controller spans the request and the body read, so the transfer can be
+  // cancelled on exit instead of lingering until the overall timeout fires.
+  const controller = new AbortController();
+  const probeTimer = setTimeout(() => controller.abort(), PROBE_TIMEOUT_MS);
+  let bodyTimer;
+  try {
+    const response = await fetchFn(baseUrl, {
+      method: 'GET',
+      redirect: 'manual',
+      headers: { 'User-Agent': SPACECAT_USER_AGENT, ...headers },
+      signal: controller.signal,
+    });
+    const headersObj = Object.fromEntries(response.headers);
+    let html = '';
+    const contentLength = parseInt(headersObj['content-length'] || '0', 10);
+    if (contentLength <= BODY_READ_MAX_BYTES) {
+      try {
+        html = await Promise.race([
+          response.text(),
+          new Promise((_, reject) => {
+            bodyTimer = setTimeout(() => reject(new Error('body-read-timeout')), BODY_READ_TIMEOUT_MS);
+          }),
+        ]);
+      } catch {
+        // Unreadable or slow body: classify on status and headers alone.
+        html = '';
+      }
+    }
+    return analyzeBotProtection({ status: response.status, headers: headersObj, html });
+  } catch (err) {
+    log?.debug?.(`[bot-blocker] undici probe inconclusive for ${baseUrl}: ${err.message}`);
+    return { crawlable: true, type: 'unknown', confidence: 0.3 };
+  } finally {
+    clearTimeout(probeTimer);
+    clearTimeout(bodyTimer);
+    // Release a body that was skipped (large Content-Length) or abandoned after the
+    // read timeout; aborting an already-completed exchange is a no-op.
+    controller.abort();
+  }
+}
+
+/**
+ * Multi-client bot-blocker detection.
+ *
+ * Probes the site with BOTH the @adobe/fetch client (via the shared
+ * {@link detectBotBlocker}) and Node's native fetch (undici), because Cloudflare Bot
+ * Management blocks on the HTTP client fingerprint: a site can allow @adobe/fetch while
+ * blocking undici — the client CWV/preflight/imports use — and headless Chrome (the
+ * scraper). A single-client probe therefore yields false "crawlable: Yes" verdicts
+ * (SITES-47217 / datacom.com).
+ *
+ * The return value keeps the shared {@link detectBotBlocker} shape (so existing
+ * consumers — the onboarding waitlist reason, the controller response — keep working),
+ * but `crawlable` is the AGGREGATE across clients (false if ANY representative client
+ * is blocked) and a `perClient` breakdown is added. The top-level `type`/`confidence`
+ * describe the blocking client so downstream messaging is accurate.
+ *
+ * NOTE: headless Chrome is intentionally NOT probed here — api-service has no browser.
+ * The scraper-backed headless confirmation is tracked as a follow-up; until then a
+ * "crawlable: true" verdict means "the lightweight HTTP clients were allowed", not
+ * "headless scraping will succeed".
+ *
+ * @param {Object} opts
+ * @param {string} opts.baseUrl - URL to check.
+ * @param {Object} [opts.headers] - Optional extra headers forwarded to both probes.
+ * @param {Object} [deps] - Optional collaborators (logging and test injection).
+ * @param {Object} [deps.log=console] - Logger.
+ * @param {Function} [deps.detectBotBlockerFn=detectBotBlocker] - The @adobe/fetch probe.
+ * @param {Function} [deps.fetchFn=fetch] - fetch implementation for the undici probe.
+ * @returns {Promise<Object>} detectBotBlocker-shaped result + `perClient`.
+ */
+export async function detectBotBlockerMultiClient(
+  { baseUrl, headers = {} } = {},
+  { log = console, detectBotBlockerFn = detectBotBlocker, fetchFn = fetch } = {},
+) {
+  const [adobe, undici] = await Promise.all([
+    // Match probeWithUndici's behaviour: a probe failure (timeout/DNS/network) is
+    // inconclusive, not a block — so neither probe can reject the whole call.
+    Promise.resolve()
+      .then(() => detectBotBlockerFn({ baseUrl, headers }))
+      // An injected probe that resolves with nothing is a failed probe, not a crash.
+      .then((result) => result ?? Promise.reject(new Error('probe returned no result')))
+      .catch((err) => {
+        log?.debug?.(`[bot-blocker] @adobe/fetch probe inconclusive for ${baseUrl}: ${err.message}`);
+        return { crawlable: true, type: 'unknown', confidence: 0.3 };
+      }),
+    probeWithUndici(baseUrl, headers, log, fetchFn),
+  ]);
+
+  const perClient = {
+    'adobe-fetch': { crawlable: adobe.crawlable, type: adobe.type, confidence: adobe.confidence },
+    undici: { crawlable: undici.crawlable, type: undici.type, confidence: undici.confidence },
+  };
+
+  // Surface the blocking client's classification at the top level. Prefer the
+  // @adobe/fetch block (it carries allowlist IPs/UA from the shared probe); fall back
+  // to the undici block when @adobe/fetch was allowed but undici was not.
+  const blocker = adobe.crawlable && !undici.crawlable ? undici : adobe;
+
+  return {
+    ...adobe,
+    crawlable: Boolean(adobe.crawlable && undici.crawlable), // false if ANY client is blocked
+    type: blocker.type,
+    confidence: blocker.confidence,
+    // Always reflect the blocking client's reason (overriding any reason the
+    // ...adobe spread carried), so the reason never describes the wrong client.
+    reason: blocker.reason || undefined,
+    perClient,
+  };
+}
diff --git a/src/support/slack/commands/detect-bot-blocker.js b/src/support/slack/commands/detect-bot-blocker.js
@@ -10,10 +10,11 @@
  * governing permissions and limitations under the License.
  */
 
-import { isValidUrl, detectBotBlocker } from '@adobe/spacecat-shared-utils';
+import { isValidUrl } from '@adobe/spacecat-shared-utils';
 
 import BaseCommand from './base.js';
 import { extractURLFromSlackInput, postErrorMessage } from '../../../utils/slack/base.js';
+import { detectBotBlockerMultiClient } from '../../bot-blocker-multi-client.js';
 
 const COMMAND_ID = 'detect-bot-blocker';
 const PHRASES = ['detect bot-blocker', 'detect bot blocker', 'check bot blocker'];
@@ -103,10 +104,23 @@ function DetectBotBlockerCommand(context) {
     await say(`:mag: Checking bot blocker for \`${baseURL}\`...`);
 
     try {
-      const result = await detectBotBlocker({ baseUrl: baseURL });
+      const result = await detectBotBlockerMultiClient({ baseUrl: baseURL }, { log });
       const formattedResult = formatResult(result);
 
-      await say(`:robot_face: *Bot Blocker Detection Results for* \`${baseURL}\`\n\n${formattedResult}`);
+      // Per-client breakdown: a site can allow one HTTP client while blocking another
+      // (Cloudflare fingerprints the client), so the aggregate alone hides which audits
+      // will fail. Headless Chrome is not probed here (no browser in this service).
+      const perClientLines = Object.entries(result.perClient || {})
+        .map(([client, r]) => {
+          const emoji = r.crawlable ? ':white_check_mark:' : ':no_entry:';
+          return `   • \`${client}\`: ${emoji} ${r.crawlable ? 'allowed' : `blocked (${r.type})`}`;
+        })
+        .join('\n');
+      const perClientBlock = perClientLines
+        ? `\n:gear: *Per client:*\n${perClientLines}`
+        : '';
+
+      await say(`:robot_face: *Bot Blocker Detection Results for* \`${baseURL}\`\n\n${formattedResult}${perClientBlock}`);
     } catch (error) {
       log.error(`detect-bot-blocker command: failed for URL ${baseURL}`, error);
       await postErrorMessage(say, error);

diff --git a/test/controllers/bot-blocker.test.js b/test/controllers/bot-blocker.test.js
@@ -75,7 +75,9 @@ describe('Bot Blocker Controller', () => {
       '@adobe/spacecat-shared-utils': {
         isNonEmptyObject: (obj) => obj !== null && typeof obj === 'object' && Object.keys(obj).length > 0,
         isValidUUID: isValidUUIDStub,
-        detectBotBlocker: detectBotBlockerStub,
+      },
+      '../../src/support/bot-blocker-multi-client.js': {
+        detectBotBlockerMultiClient: detectBotBlockerStub,
       },
       '../../src/support/access-control-util.js': {
         default: {

diff --git a/test/controllers/plg/plg-onboarding/onboarding-flow-core.test.js b/test/controllers/plg/plg-onboarding/onboarding-flow-core.test.js
@@ -430,7 +430,7 @@ describe('PlgOnboardingController (onboarding-flow-core)', function describePlgO
       expect(loadProfileConfigStub).to.have.been.calledWith('aso_plg');
       expect(createOrFindOrganizationStub).to.have.been.calledWith(TEST_IMS_ORG_ID, context);
       expect(mockDataAccess.Site.findByBaseURL).to.have.been.calledWith(TEST_BASE_URL);
-      expect(detectBotBlockerStub).to.have.been.calledWith({ baseUrl: TEST_BASE_URL });
+      expect(detectBotBlockerStub).to.have.been.calledWith({ baseUrl: TEST_BASE_URL, headers: {} });
       expect(findDeliveryTypeStub).to.have.been.calledWith(TEST_BASE_URL);
       expect(mockDataAccess.Site.create).to.have.been.called;
       expect(enableImportsStub).to.have.been.called;

diff --git a/test/controllers/plg/plg-onboarding/plg-esmock-factory.js b/test/controllers/plg/plg-onboarding/plg-esmock-factory.js
@@ -61,6 +61,15 @@ export async function createPlgEsmock(stubs, {
         isValidIMSOrgId: (val) => typeof val === 'string' && val.endsWith('@AdobeOrg'),
         resolveCanonicalUrl: resolveCanonicalUrlStub,
       },
+      // Keep onboarding tests hermetic: delegate the multi-client probe to the
+      // injected @adobe/fetch stub (detectBotBlockerFn) instead of making a real
+      // undici network call. The multi-client aggregation is unit-tested separately.
+      '../../../../src/support/bot-blocker-multi-client.js': {
+        detectBotBlockerMultiClient: async ({ baseUrl, headers = {} }, opts = {}) => {
+          const fn = opts.detectBotBlockerFn || detectBotBlockerStub;
+          return fn({ baseUrl, headers });
+        },
+      },
       '@adobe/spacecat-shared-http-utils': {
         badRequest: (msg) => ({ status: 400, value: msg }),
         created: (data) => ({ status: 201, value: data }),