Skip to content
Open
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .gitignore
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
coverage
.nyc_output/
node_modules/
node_modules
junit
dist
tmp
Expand Down
8 changes: 6 additions & 2 deletions src/controllers/bot-blocker.js
Original file line number Diff line number Diff line change
Expand Up @@ -11,12 +11,13 @@
*/

import {
isNonEmptyObject, isValidUUID, detectBotBlocker,
isNonEmptyObject, isValidUUID,
} from '@adobe/spacecat-shared-utils';
import {
badRequest, internalServerError, notFound, ok, forbidden,
} from '@adobe/spacecat-shared-http-utils';
import AccessControlUtil from '../support/access-control-util.js';
import { detectBotBlockerMultiClient } from '../support/bot-blocker-multi-client.js';

/**
* Creates a bot blocker controller instance
Expand Down Expand Up @@ -83,7 +84,10 @@ function BotBlockerController(ctx, log) {

log.debug(`Checking bot blocker for site ${siteId} with baseURL: ${baseURL}, customHeaders: ${Object.keys(customHeaders).join(',') || 'none'}`);

const result = await detectBotBlocker({ baseUrl: baseURL, headers: customHeaders });
const result = await detectBotBlockerMultiClient(
{ baseUrl: baseURL, headers: customHeaders },
{ log },
);

log.debug(`Bot blocker check completed for site ${siteId}: crawlable=${result.crawlable}, type=${result.type}, confidence=${result.confidence}`);

Expand Down
12 changes: 10 additions & 2 deletions src/controllers/plg/plg-onboarding/onboarding-flow.js
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
import { Site as SiteModel } from '@adobe/spacecat-shared-data-access';
import { hasText } from '@adobe/spacecat-shared-utils';
import { cleanupPlgSiteSuggestionsAndFixes } from '../plg-onboarding-cleanup.js';
import { detectBotBlockerMultiClient } from '../../../support/bot-blocker-multi-client.js';
import { updateRumConfig } from '../../../support/rum-config-service.js';
import { hasActiveSuggestions } from './displacement.js';
import {
Expand Down Expand Up @@ -593,8 +594,15 @@ export async function performAsoPlgOnboarding({
}
}

// Step 4: Bot blocker check
const botBlockerResult = await detectBotBlocker({ baseUrl: baseURL });
// Step 4: Bot blocker check. Probe with multiple HTTP clients (@adobe/fetch +
// undici) because Cloudflare blocks on client fingerprint — a single-client probe
// can report "crawlable" while the clients our audits actually use are blocked
// (SITES-47217). `detectBotBlocker` is injected via context for testability and is
// forwarded as the @adobe/fetch probe.
const botBlockerResult = await detectBotBlockerMultiClient(
{ baseUrl: baseURL },
{ log, detectBotBlockerFn: detectBotBlocker },
);
if (!botBlockerResult.crawlable) {
if (site) {
await site.save();
Expand Down
141 changes: 141 additions & 0 deletions src/support/bot-blocker-multi-client.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,141 @@
/*
* Copyright 2026 Adobe. All rights reserved.
* This file is licensed to you under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License. You may obtain a copy
* of the License at http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software distributed under
* the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR REPRESENTATIONS
* OF ANY KIND, either express or implied. See the License for the specific language
* governing permissions and limitations under the License.
*/

import {
detectBotBlocker, analyzeBotProtection, SPACECAT_USER_AGENT,
} from '@adobe/spacecat-shared-utils';

const PROBE_TIMEOUT_MS = 10000;
// Bound the body read the same way the shared detectBotBlocker does: skip the body
// when Content-Length is large, and race the read against a short timeout so a slow
// (or unbounded chunked) response can never hang or balloon memory.
const BODY_READ_MAX_BYTES = 65536; // 64 KB — challenge markers appear in the first KB
const BODY_READ_TIMEOUT_MS = 3000;

/**
* Probes a URL with Node's native fetch (undici) and classifies the response.
*
* undici is the HTTP client used by CWV liveness, preflight, site-detection, and the
* import-worker. Cloudflare Bot Management fingerprints the client (TLS/HTTP, JA3/JA4),
* so a site can allow the @adobe/fetch client while blocking undici (and headless
* Chrome). We send the same User-Agent the @adobe/fetch probe uses so the ONLY
* difference between the two probes is the client itself.
*
* A request we cannot complete (timeout/network) is reported as inconclusive
* (crawlable, low confidence) rather than blocked — we only assert a block when the
* response actually classifies as one.
*
* @param {string} baseUrl - URL to probe.
* @param {Object} headers - Optional extra headers (e.g. site scraper headers).
* @param {Object} log - Logger.
* @returns {Promise<Object>} analyzeBotProtection result { crawlable, type, confidence }.
*/
async function probeWithUndici(baseUrl, headers, log, fetchFn) {
try {
const response = await fetchFn(baseUrl, {
method: 'GET',
redirect: 'manual',
headers: { 'User-Agent': SPACECAT_USER_AGENT, ...headers },
signal: AbortSignal.timeout(PROBE_TIMEOUT_MS),
});
const headersObj = Object.fromEntries(response.headers);
let html = '';
const contentLength = parseInt(headersObj['content-length'] || '0', 10);
if (contentLength <= BODY_READ_MAX_BYTES) {
try {
let timer;
html = await Promise.race([
response.text().finally(() => clearTimeout(timer)),
new Promise((_, reject) => {
timer = setTimeout(() => reject(new Error('body-read-timeout')), BODY_READ_TIMEOUT_MS);
}),
]);
} catch {
html = '';
}
}
return analyzeBotProtection({ status: response.status, headers: headersObj, html });
} catch (err) {
log?.debug?.(`[bot-blocker] undici probe inconclusive for ${baseUrl}: ${err.message}`);
return { crawlable: true, type: 'unknown', confidence: 0.3 };
}
}

/**
* Multi-client bot-blocker detection.
*
* Probes the site with BOTH the @adobe/fetch client (via the shared
* {@link detectBotBlocker}) and Node's native fetch (undici), because Cloudflare Bot
* Management blocks on the HTTP client fingerprint: a site can allow @adobe/fetch while
* blocking undici — the client CWV/preflight/imports use — and headless Chrome (the
* scraper). A single-client probe therefore yields false "crawlable: Yes" verdicts
* (SITES-47217 / datacom.com).
*
* The return value keeps the shared {@link detectBotBlocker} shape (so existing
* consumers — the onboarding waitlist reason, the controller response — keep working),
* but `crawlable` is the AGGREGATE across clients (false if ANY representative client
* is blocked) and a `perClient` breakdown is added. The top-level `type`/`confidence`
* describe the blocking client so downstream messaging is accurate.
*
* NOTE: headless Chrome is intentionally NOT probed here — api-service has no browser.
* The scraper-backed headless confirmation is tracked as a follow-up; until then a
* "crawlable: true" verdict means "the lightweight HTTP clients were allowed", not
* "headless scraping will succeed".
*
* @param {Object} opts
* @param {string} opts.baseUrl - URL to check.
* @param {Object} [opts.headers] - Optional extra headers forwarded to both probes.
* @param {Object} [log=console] - Logger.
* @returns {Promise<Object>} detectBotBlocker-shaped result + `perClient`.
*/
export async function detectBotBlockerMultiClient(
{ baseUrl, headers = {} } = {},
{ log = console, detectBotBlockerFn = detectBotBlocker, fetchFn = fetch } = {},
) {
const [adobe, undici] = await Promise.all([
// Match probeWithUndici's behaviour: a probe failure (timeout/DNS/network) is
// inconclusive, not a block — so neither probe can reject the whole call.
Promise.resolve()
.then(() => detectBotBlockerFn({ baseUrl, headers }))
.catch((err) => {
log?.debug?.(`[bot-blocker] @adobe/fetch probe inconclusive for ${baseUrl}: ${err.message}`);
return { crawlable: true, type: 'unknown', confidence: 0.3 };
}),
probeWithUndici(baseUrl, headers, log, fetchFn),
]);

const perClient = {
'adobe-fetch': { crawlable: adobe.crawlable, type: adobe.type, confidence: adobe.confidence },
undici: { crawlable: undici.crawlable, type: undici.type, confidence: undici.confidence },
};

const crawlable = adobe.crawlable && undici.crawlable;

// Surface the blocking client's classification at the top level. Prefer the
// @adobe/fetch block (it carries allowlist IPs/UA from the shared probe); fall back
// to the undici block when @adobe/fetch was allowed but undici was not.
let blocker = adobe;

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

issue (blocking): The blocker-preference logic (let blocker = adobe; if adobe.crawlable and not undici.crawlable then blocker = undici) has a third branch: both blocked. When both are blocked, blocker stays as adobe (intentional per the comment about allowlist IPs). This branch has no test - the next developer who touches this conditional could break the preference without any test failing.

Fix: Add a test where both stubs return crawlable: false with different types and assert that result.type matches the adobe probe type.

if (adobe.crawlable && !undici.crawlable) {
blocker = undici;
}

return {
...adobe,

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

issue (blocking): The ...adobe spread places all of adobe fields (including reason) on the return object. The conditional spread on line 115 only adds blocker.reason when truthy - but if blocker is undici and undici has no reason, the adobe reason from the initial spread survives unclobbered. This means the output carries a reason from the wrong client when only undici is blocked.

Scenario: adobe returns { crawlable: true, reason: 'informational', ... }, undici returns { crawlable: false, type: 'cloudflare', reason: undefined }. Output: { crawlable: false, type: 'cloudflare', reason: 'informational' } - the reason describes the non-blocking client.

Fix: Add reason: blocker.reason || undefined as an explicit field (replacing the conditional spread), or delete the ...adobe spread and explicitly list only the fields you intend to forward.

crawlable,
type: blocker.type,
confidence: blocker.confidence,
// Always reflect the blocking client's reason (overriding any reason the
// ...adobe spread carried), so the reason never describes the wrong client.
reason: blocker.reason || undefined,
perClient,
};
}
20 changes: 17 additions & 3 deletions src/support/slack/commands/detect-bot-blocker.js
Original file line number Diff line number Diff line change
Expand Up @@ -10,10 +10,11 @@
* governing permissions and limitations under the License.
*/

import { isValidUrl, detectBotBlocker } from '@adobe/spacecat-shared-utils';
import { isValidUrl } from '@adobe/spacecat-shared-utils';

import BaseCommand from './base.js';
import { extractURLFromSlackInput, postErrorMessage } from '../../../utils/slack/base.js';
import { detectBotBlockerMultiClient } from '../../bot-blocker-multi-client.js';

const COMMAND_ID = 'detect-bot-blocker';
const PHRASES = ['detect bot-blocker', 'detect bot blocker', 'check bot blocker'];
Expand Down Expand Up @@ -103,10 +104,23 @@ function DetectBotBlockerCommand(context) {
await say(`:mag: Checking bot blocker for \`${baseURL}\`...`);

try {
const result = await detectBotBlocker({ baseUrl: baseURL });
const result = await detectBotBlockerMultiClient({ baseUrl: baseURL }, { log });
const formattedResult = formatResult(result);

await say(`:robot_face: *Bot Blocker Detection Results for* \`${baseURL}\`\n\n${formattedResult}`);
// Per-client breakdown: a site can allow one HTTP client while blocking another
// (Cloudflare fingerprints the client), so the aggregate alone hides which audits
// will fail. headless Chrome is not probed here (no browser in this service).
const perClientLines = Object.entries(result.perClient || {})
.map(([client, r]) => {
const emoji = r.crawlable ? ':white_check_mark:' : ':no_entry:';
return ` • \`${client}\`: ${emoji} ${r.crawlable ? 'allowed' : `blocked (${r.type})`}`;
})
.join('\n');
const perClientBlock = perClientLines
? `\n:gear: *Per client:*\n${perClientLines}`
: '';

await say(`:robot_face: *Bot Blocker Detection Results for* \`${baseURL}\`\n\n${formattedResult}${perClientBlock}`);
} catch (error) {
log.error(`detect-bot-blocker command: failed for URL ${baseURL}`, error);
await postErrorMessage(say, error);
Expand Down
4 changes: 3 additions & 1 deletion test/controllers/bot-blocker.test.js
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,9 @@ describe('Bot Blocker Controller', () => {
'@adobe/spacecat-shared-utils': {
isNonEmptyObject: (obj) => obj !== null && typeof obj === 'object' && Object.keys(obj).length > 0,
isValidUUID: isValidUUIDStub,
detectBotBlocker: detectBotBlockerStub,
},
'../../src/support/bot-blocker-multi-client.js': {
detectBotBlockerMultiClient: detectBotBlockerStub,
},
'../../src/support/access-control-util.js': {
default: {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -430,7 +430,7 @@ describe('PlgOnboardingController (onboarding-flow-core)', function describePlgO
expect(loadProfileConfigStub).to.have.been.calledWith('aso_plg');
expect(createOrFindOrganizationStub).to.have.been.calledWith(TEST_IMS_ORG_ID, context);
expect(mockDataAccess.Site.findByBaseURL).to.have.been.calledWith(TEST_BASE_URL);
expect(detectBotBlockerStub).to.have.been.calledWith({ baseUrl: TEST_BASE_URL });
expect(detectBotBlockerStub).to.have.been.calledWith({ baseUrl: TEST_BASE_URL, headers: {} });
expect(findDeliveryTypeStub).to.have.been.calledWith(TEST_BASE_URL);
expect(mockDataAccess.Site.create).to.have.been.called;
expect(enableImportsStub).to.have.been.called;
Expand Down
9 changes: 9 additions & 0 deletions test/controllers/plg/plg-onboarding/plg-esmock-factory.js
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,15 @@ export async function createPlgEsmock(stubs, {
isValidIMSOrgId: (val) => typeof val === 'string' && val.endsWith('@AdobeOrg'),
resolveCanonicalUrl: resolveCanonicalUrlStub,
},
// Keep onboarding tests hermetic: delegate the multi-client probe to the
// injected @adobe/fetch stub (detectBotBlockerFn) instead of making a real
// undici network call. The multi-client aggregation is unit-tested separately.
'../../../../src/support/bot-blocker-multi-client.js': {
detectBotBlockerMultiClient: async ({ baseUrl, headers }, opts = {}) => {
const fn = opts.detectBotBlockerFn || detectBotBlockerStub;
return fn({ baseUrl, headers });
},
},
'@adobe/spacecat-shared-http-utils': {
badRequest: (msg) => ({ status: 400, value: msg }),
created: (data) => ({ status: 201, value: data }),
Expand Down
Loading
Loading