Skip to content
Open
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions node_modules
8 changes: 6 additions & 2 deletions src/controllers/bot-blocker.js
Original file line number Diff line number Diff line change
Expand Up @@ -11,12 +11,13 @@
*/

import {
isNonEmptyObject, isValidUUID, detectBotBlocker,
isNonEmptyObject, isValidUUID,
} from '@adobe/spacecat-shared-utils';
import {
badRequest, internalServerError, notFound, ok, forbidden,
} from '@adobe/spacecat-shared-http-utils';
import AccessControlUtil from '../support/access-control-util.js';
import { detectBotBlockerMultiClient } from '../support/bot-blocker-multi-client.js';

/**
* Creates a bot blocker controller instance
Expand Down Expand Up @@ -83,7 +84,10 @@ function BotBlockerController(ctx, log) {

log.debug(`Checking bot blocker for site ${siteId} with baseURL: ${baseURL}, customHeaders: ${Object.keys(customHeaders).join(',') || 'none'}`);

const result = await detectBotBlocker({ baseUrl: baseURL, headers: customHeaders });
const result = await detectBotBlockerMultiClient(
{ baseUrl: baseURL, headers: customHeaders },
{ log },
);

log.debug(`Bot blocker check completed for site ${siteId}: crawlable=${result.crawlable}, type=${result.type}, confidence=${result.confidence}`);

Expand Down
12 changes: 10 additions & 2 deletions src/controllers/plg/plg-onboarding/onboarding-flow.js
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
import { Site as SiteModel } from '@adobe/spacecat-shared-data-access';
import { hasText } from '@adobe/spacecat-shared-utils';
import { cleanupPlgSiteSuggestionsAndFixes } from '../plg-onboarding-cleanup.js';
import { detectBotBlockerMultiClient } from '../../../support/bot-blocker-multi-client.js';
import { updateRumConfig } from '../../../support/rum-config-service.js';
import { hasActiveSuggestions } from './displacement.js';
import {
Expand Down Expand Up @@ -593,8 +594,15 @@ export async function performAsoPlgOnboarding({
}
}

// Step 4: Bot blocker check
const botBlockerResult = await detectBotBlocker({ baseUrl: baseURL });
// Step 4: Bot blocker check. Probe with multiple HTTP clients (@adobe/fetch +
// undici) because Cloudflare blocks on client fingerprint — a single-client probe
// can report "crawlable" while the clients our audits actually use are blocked
// (SITES-47217). `detectBotBlocker` is injected via context for testability and is
// forwarded as the @adobe/fetch probe.
const botBlockerResult = await detectBotBlockerMultiClient(
{ baseUrl: baseURL },
{ log, detectBotBlockerFn: detectBotBlocker },
);
if (!botBlockerResult.crawlable) {
if (site) {
await site.save();
Expand Down
118 changes: 118 additions & 0 deletions src/support/bot-blocker-multi-client.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,118 @@
/*
* Copyright 2026 Adobe. All rights reserved.
* This file is licensed to you under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License. You may obtain a copy
* of the License at http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software distributed under
* the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR REPRESENTATIONS
* OF ANY KIND, either express or implied. See the License for the specific language
* governing permissions and limitations under the License.
*/

import {
detectBotBlocker, analyzeBotProtection, SPACECAT_USER_AGENT,
} from '@adobe/spacecat-shared-utils';

const PROBE_TIMEOUT_MS = 10000;

/**
* Probes a URL with Node's native fetch (undici) and classifies the response.
*
* undici is the HTTP client used by CWV liveness, preflight, site-detection, and the
* import-worker. Cloudflare Bot Management fingerprints the client (TLS/HTTP, JA3/JA4),
* so a site can allow the @adobe/fetch client while blocking undici (and headless
* Chrome). We send the same User-Agent the @adobe/fetch probe uses so the ONLY
* difference between the two probes is the client itself.
*
* A request we cannot complete (timeout/network) is reported as inconclusive
* (crawlable, low confidence) rather than blocked — we only assert a block when the
* response actually classifies as one.
*
* @param {string} baseUrl - URL to probe.
* @param {Object} headers - Optional extra headers (e.g. site scraper headers).
* @param {Object} log - Logger.
* @returns {Promise<Object>} analyzeBotProtection result { crawlable, type, confidence }.
*/
async function probeWithUndici(baseUrl, headers, log, fetchFn) {
try {
const response = await fetchFn(baseUrl, {
method: 'GET',
redirect: 'manual',
headers: { 'User-Agent': SPACECAT_USER_AGENT, ...headers },
signal: AbortSignal.timeout(PROBE_TIMEOUT_MS),
});
const headersObj = Object.fromEntries(response.headers);
let html = '';
try {
html = await response.text();
} catch {
html = '';

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

issue (blocking): response.text() reads the entire response body into memory with no size limit. The AbortSignal.timeout(PROBE_TIMEOUT_MS) covers the connection/headers phase but does not reliably abort body streaming in all Node versions (Node 18 does not propagate the signal to the body reader). A malicious or misconfigured target returning a chunked multi-GB response will spike Lambda memory.

The shared-utils detectBotBlocker caps body reads to 64 KB. This probe should match that contract.

Fix: Add a size guard before consuming the body, e.g. check Content-Length and use a Promise.race with a body-read timeout (3s), or use a streaming reader with a byte cap. The outer try/catch prevents caller-visible failures, but the memory spike is the concern.

}
return analyzeBotProtection({ status: response.status, headers: headersObj, html });
} catch (err) {
log?.debug?.(`[bot-blocker] undici probe inconclusive for ${baseUrl}: ${err.message}`);
return { crawlable: true, type: 'unknown', confidence: 0.3 };
}
}

/**
* Multi-client bot-blocker detection.
*
* Probes the site with BOTH the @adobe/fetch client (via the shared
* {@link detectBotBlocker}) and Node's native fetch (undici), because Cloudflare Bot
* Management blocks on the HTTP client fingerprint: a site can allow @adobe/fetch while
* blocking undici — the client CWV/preflight/imports use — and headless Chrome (the
* scraper). A single-client probe therefore yields false "crawlable: Yes" verdicts
* (SITES-47217 / datacom.com).
*
* The return value keeps the shared {@link detectBotBlocker} shape (so existing
* consumers — the onboarding waitlist reason, the controller response — keep working),
* but `crawlable` is the AGGREGATE across clients (false if ANY representative client
* is blocked) and a `perClient` breakdown is added. The top-level `type`/`confidence`
* describe the blocking client so downstream messaging is accurate.
*
* NOTE: headless Chrome is intentionally NOT probed here — api-service has no browser.
* The scraper-backed headless confirmation is tracked as a follow-up; until then a
* "crawlable: true" verdict means "the lightweight HTTP clients were allowed", not
* "headless scraping will succeed".
*
* @param {Object} opts
* @param {string} opts.baseUrl - URL to check.
* @param {Object} [opts.headers] - Optional extra headers forwarded to both probes.
* @param {Object} [log=console] - Logger.
* @returns {Promise<Object>} detectBotBlocker-shaped result + `perClient`.
*/
export async function detectBotBlockerMultiClient(
{ baseUrl, headers = {} } = {},
{ log = console, detectBotBlockerFn = detectBotBlocker, fetchFn = fetch } = {},
) {
const [adobe, undici] = await Promise.all([
detectBotBlockerFn({ baseUrl, headers }),

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

issue (blocking): detectBotBlockerFn has no error wrapper. If it throws (network timeout, DNS failure), Promise.all rejects and the entire function throws - while probeWithUndici gracefully returns an inconclusive result on the same class of failures. The stated design intent is 'inconclusive on failure', but only one probe gets that treatment.

Fix: Wrap the call with a .catch() that returns the same inconclusive shape { crawlable: true, type: 'unknown', confidence: 0.3 } as the undici probe uses. Or switch to Promise.allSettled and handle rejected entries.

probeWithUndici(baseUrl, headers, log, fetchFn),
]);

const perClient = {
'adobe-fetch': { crawlable: adobe.crawlable, type: adobe.type, confidence: adobe.confidence },
undici: { crawlable: undici.crawlable, type: undici.type, confidence: undici.confidence },
};

const crawlable = adobe.crawlable && undici.crawlable;

// Surface the blocking client's classification at the top level. Prefer the
// @adobe/fetch block (it carries allowlist IPs/UA from the shared probe); fall back
// to the undici block when @adobe/fetch was allowed but undici was not.
let blocker = adobe;

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

issue (blocking): The blocker-preference logic (let blocker = adobe; if adobe.crawlable and not undici.crawlable then blocker = undici) has a third branch: both blocked. When both are blocked, blocker stays as adobe (intentional per the comment about allowlist IPs). This branch has no test - the next developer who touches this conditional could break the preference without any test failing.

Fix: Add a test where both stubs return crawlable: false with different types and assert that result.type matches the adobe probe type.

if (adobe.crawlable && !undici.crawlable) {
blocker = undici;
}

return {
...adobe,

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

issue (blocking): The ...adobe spread places all of adobe fields (including reason) on the return object. The conditional spread on line 115 only adds blocker.reason when truthy - but if blocker is undici and undici has no reason, the adobe reason from the initial spread survives unclobbered. This means the output carries a reason from the wrong client when only undici is blocked.

Scenario: adobe returns { crawlable: true, reason: 'informational', ... }, undici returns { crawlable: false, type: 'cloudflare', reason: undefined }. Output: { crawlable: false, type: 'cloudflare', reason: 'informational' } - the reason describes the non-blocking client.

Fix: Add reason: blocker.reason || undefined as an explicit field (replacing the conditional spread), or delete the ...adobe spread and explicitly list only the fields you intend to forward.

crawlable,
type: blocker.type,
confidence: blocker.confidence,
...(blocker.reason ? { reason: blocker.reason } : {}),
perClient,
};
}
20 changes: 17 additions & 3 deletions src/support/slack/commands/detect-bot-blocker.js
Original file line number Diff line number Diff line change
Expand Up @@ -10,10 +10,11 @@
* governing permissions and limitations under the License.
*/

import { isValidUrl, detectBotBlocker } from '@adobe/spacecat-shared-utils';
import { isValidUrl } from '@adobe/spacecat-shared-utils';

import BaseCommand from './base.js';
import { extractURLFromSlackInput, postErrorMessage } from '../../../utils/slack/base.js';
import { detectBotBlockerMultiClient } from '../../bot-blocker-multi-client.js';

const COMMAND_ID = 'detect-bot-blocker';
const PHRASES = ['detect bot-blocker', 'detect bot blocker', 'check bot blocker'];
Expand Down Expand Up @@ -103,10 +104,23 @@ function DetectBotBlockerCommand(context) {
await say(`:mag: Checking bot blocker for \`${baseURL}\`...`);

try {
const result = await detectBotBlocker({ baseUrl: baseURL });
const result = await detectBotBlockerMultiClient({ baseUrl: baseURL }, { log });
const formattedResult = formatResult(result);

await say(`:robot_face: *Bot Blocker Detection Results for* \`${baseURL}\`\n\n${formattedResult}`);
// Per-client breakdown: a site can allow one HTTP client while blocking another
// (Cloudflare fingerprints the client), so the aggregate alone hides which audits
// will fail. headless Chrome is not probed here (no browser in this service).
const perClientLines = Object.entries(result.perClient || {})
.map(([client, r]) => {
const emoji = r.crawlable ? ':white_check_mark:' : ':no_entry:';
return ` • \`${client}\`: ${emoji} ${r.crawlable ? 'allowed' : `blocked (${r.type})`}`;
})
.join('\n');
const perClientBlock = perClientLines
? `\n:gear: *Per client:*\n${perClientLines}`
: '';

await say(`:robot_face: *Bot Blocker Detection Results for* \`${baseURL}\`\n\n${formattedResult}${perClientBlock}`);
} catch (error) {
log.error(`detect-bot-blocker command: failed for URL ${baseURL}`, error);
await postErrorMessage(say, error);
Expand Down
4 changes: 3 additions & 1 deletion test/controllers/bot-blocker.test.js
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,9 @@ describe('Bot Blocker Controller', () => {
'@adobe/spacecat-shared-utils': {
isNonEmptyObject: (obj) => obj !== null && typeof obj === 'object' && Object.keys(obj).length > 0,
isValidUUID: isValidUUIDStub,
detectBotBlocker: detectBotBlockerStub,
},
'../../src/support/bot-blocker-multi-client.js': {
detectBotBlockerMultiClient: detectBotBlockerStub,
},
'../../src/support/access-control-util.js': {
default: {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -430,7 +430,7 @@ describe('PlgOnboardingController (onboarding-flow-core)', function describePlgO
expect(loadProfileConfigStub).to.have.been.calledWith('aso_plg');
expect(createOrFindOrganizationStub).to.have.been.calledWith(TEST_IMS_ORG_ID, context);
expect(mockDataAccess.Site.findByBaseURL).to.have.been.calledWith(TEST_BASE_URL);
expect(detectBotBlockerStub).to.have.been.calledWith({ baseUrl: TEST_BASE_URL });
expect(detectBotBlockerStub).to.have.been.calledWith({ baseUrl: TEST_BASE_URL, headers: {} });
expect(findDeliveryTypeStub).to.have.been.calledWith(TEST_BASE_URL);
expect(mockDataAccess.Site.create).to.have.been.called;
expect(enableImportsStub).to.have.been.called;
Expand Down
9 changes: 9 additions & 0 deletions test/controllers/plg/plg-onboarding/plg-esmock-factory.js
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,15 @@ export async function createPlgEsmock(stubs, {
isValidIMSOrgId: (val) => typeof val === 'string' && val.endsWith('@AdobeOrg'),
resolveCanonicalUrl: resolveCanonicalUrlStub,
},
// Keep onboarding tests hermetic: delegate the multi-client probe to the
// injected @adobe/fetch stub (detectBotBlockerFn) instead of making a real
// undici network call. The multi-client aggregation is unit-tested separately.
'../../../../src/support/bot-blocker-multi-client.js': {
detectBotBlockerMultiClient: async ({ baseUrl, headers }, opts = {}) => {
const fn = opts.detectBotBlockerFn || detectBotBlockerStub;
return fn({ baseUrl, headers });
},
},
'@adobe/spacecat-shared-http-utils': {
badRequest: (msg) => ({ status: 400, value: msg }),
created: (data) => ({ status: 201, value: data }),
Expand Down
135 changes: 135 additions & 0 deletions test/support/bot-blocker-multi-client.test.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,135 @@
/*
* Copyright 2026 Adobe. All rights reserved.
* This file is licensed to you under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License. You may obtain a copy
* of the License at http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software distributed under
* the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR REPRESENTATIONS
* OF ANY KIND, either express or implied. See the License for the specific language
* governing permissions and limitations under the License.
*/

/* eslint-env mocha */

import { expect } from 'chai';
import sinon from 'sinon';
import esmock from 'esmock';
import { detectBotBlockerMultiClient } from '../../src/support/bot-blocker-multi-client.js';

const log = {
debug: () => {}, info: () => {}, warn: () => {}, error: () => {},
};

// Fake fetch Response whose headers iterate like undici's Headers.
const resp = (status, headerPairs = [], html = '') => ({
status,
headers: new Map(headerPairs),
text: async () => html,
});

describe('detectBotBlockerMultiClient', () => {
it('aggregates crawlable=true when BOTH clients are allowed', async () => {
const detectBotBlockerFn = sinon.stub().resolves({ crawlable: true, type: 'none', confidence: 1 });
const fetchFn = sinon.stub().resolves(resp(200));

const result = await detectBotBlockerMultiClient(
{ baseUrl: 'https://ok.com' },
{ log, detectBotBlockerFn, fetchFn },
);

expect(result.crawlable).to.be.true;
expect(result.perClient['adobe-fetch'].crawlable).to.be.true;
expect(result.perClient.undici.crawlable).to.be.true;
expect(result.type).to.equal('none');
});

it('aggregates crawlable=FALSE when undici is blocked but @adobe/fetch is allowed', async () => {
const detectBotBlockerFn = sinon.stub().resolves({
crawlable: true, type: 'cloudflare-allowed', confidence: 1, userAgent: 'Spacecat/1.0', ipsToAllowlist: ['1.2.3.4'],
});
const fetchFn = sinon.stub().resolves(resp(403, [['server', 'cloudflare'], ['cf-ray', 'abc']]));

const result = await detectBotBlockerMultiClient(
{ baseUrl: 'https://datacom.com' },
{ log, detectBotBlockerFn, fetchFn },
);

expect(result.crawlable).to.be.false;
expect(result.type).to.equal('cloudflare'); // blocking client's type surfaced
expect(result.perClient['adobe-fetch'].crawlable).to.be.true;
expect(result.perClient.undici.crawlable).to.be.false;
// adobe fields preserved for downstream messaging
expect(result.userAgent).to.equal('Spacecat/1.0');
expect(result.ipsToAllowlist).to.deep.equal(['1.2.3.4']);
});

it('uses the @adobe/fetch block (with reason) when it is the blocker', async () => {
const detectBotBlockerFn = sinon.stub().resolves({
crawlable: false, type: 'akamai', confidence: 0.99, reason: 'challenge page',
});
const fetchFn = sinon.stub().resolves(resp(200));

const result = await detectBotBlockerMultiClient(
{ baseUrl: 'https://blocked.com' },
{ log, detectBotBlockerFn, fetchFn },
);

expect(result.crawlable).to.be.false;
expect(result.type).to.equal('akamai');
expect(result.reason).to.equal('challenge page');
expect(result.perClient['adobe-fetch'].crawlable).to.be.false;
});

it('treats an undici fetch failure as inconclusive (not a block)', async () => {
const detectBotBlockerFn = sinon.stub().resolves({ crawlable: true, type: 'none', confidence: 1 });
const fetchFn = sinon.stub().rejects(new Error('ECONNREFUSED'));

const result = await detectBotBlockerMultiClient(
{ baseUrl: 'https://ok.com' },
{ log, detectBotBlockerFn, fetchFn },
);

expect(result.crawlable).to.be.true;
expect(result.perClient.undici.type).to.equal('unknown');
expect(result.perClient.undici.confidence).to.equal(0.3);
});

it('handles a body read failure gracefully (html defaults to empty)', async () => {
const detectBotBlockerFn = sinon.stub().resolves({ crawlable: true, type: 'none', confidence: 1 });
const fetchFn = sinon.stub().resolves({
status: 200,
headers: new Map(),
text: async () => { throw new Error('stream error'); },
});

const result = await detectBotBlockerMultiClient(
{ baseUrl: 'https://ok.com' },
{ log, detectBotBlockerFn, fetchFn },
);

expect(result.crawlable).to.be.true;
expect(result.perClient.undici.crawlable).to.be.true;
});

it('falls back to default deps when options are omitted', async () => {
// Cover the default params (log/detectBotBlockerFn/fetchFn) hermetically by
// esmocking the shared probe and stubbing global fetch.
const fetchStub = sinon.stub(globalThis, 'fetch').resolves(resp(200));
try {
const mocked = await esmock('../../src/support/bot-blocker-multi-client.js', {
'@adobe/spacecat-shared-utils': {
detectBotBlocker: sinon.stub().resolves({ crawlable: true, type: 'none', confidence: 1 }),
// keep the real analyzer + UA so classification is genuine
analyzeBotProtection: (await import('@adobe/spacecat-shared-utils')).analyzeBotProtection,
SPACECAT_USER_AGENT: 'Spacecat/1.0',
},
});
const result = await mocked.detectBotBlockerMultiClient({ baseUrl: 'https://ok.com' });
expect(result.crawlable).to.be.true;
expect(result.perClient).to.have.keys(['adobe-fetch', 'undici']);
} finally {
fetchStub.restore();
}
});
});
Loading
Loading