From fbfe356cd8b66bff7b3a8e419fc6a8b69923ec13 Mon Sep 17 00:00:00 2001 From: vilenarios Date: Tue, 26 May 2026 23:14:23 +0000 Subject: [PATCH 1/2] feat(data): serve contiguous data by content digest MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add GET|HEAD /ar-io/digest/:digest to serve contiguous data addressed by its SHA-256 content digest — the same base64url value emitted as the X-AR-IO-Digest response header and used as the on-disk cache key. The gateway already stores contiguous data content-addressed by this hash; this exposes it directly. Responses are inherently self-verifying (the bytes provably hash to the requested digest) so X-AR-IO-Verified is always true and Cache-Control is immutable. Local-cache only: there is no on-demand fetch by content hash, since Arweave/peers address by id, so an unknown digest is 404. For header parity with /raw, a representative id that resolves to the digest is looked up (cheap, via the existing contiguous_data_hash index) and run through the same setDataHeaders path, so the full id-scoped header set (X-AR-IO-Data-Id, tags, owner, signature, root offsets) is present and signed by the HTTPSIG middleware. The served digest is pinned onto the attributes so the digest/ETag/Content-Digest headers always describe the bytes streamed. 
- DB: selectDataAttributesByHash SQL + getDataAttributesByHash through the StandaloneSqlite worker/circuit-breaker/queue/handler chain (no migration — the contiguous_data_hash index already exists) - Data source: ReadThroughDataCache.getDataByHash + ByHashDataSource interface - Attributes: getDataAttributesByHash on DataAttributesSource + composite - Route: DIGEST_DATA_PATH_REGEX, createDigestDataHandler on arIoRouter; handleRangeRequest generalized with an optional region fetcher - Tests: handler (GET/HEAD/range/404/451/400/no-id), getDataByHash, DB method - Docs: openapi path + glossary "Content Digest (ar-io-digest)" Co-Authored-By: Claude Opus 4.7 (1M context) --- docs/glossary.md | 12 + docs/openapi.yaml | 109 ++++++ src/constants.ts | 5 + src/data/composite-data-attributes-source.ts | 13 + src/data/read-through-data-cache.test.ts | 60 ++++ src/data/read-through-data-cache.ts | 55 +++ src/database/sql/data/content-attributes.sql | 17 + src/database/standalone-sqlite.test.ts | 36 ++ src/database/standalone-sqlite.ts | 49 +++ src/metrics.ts | 1 + src/routes/ar-io.ts | 8 +- src/routes/data/handlers.test.ts | 187 ++++++++++ src/routes/data/handlers.ts | 340 +++++++++++++++++-- src/routes/data/index.ts | 18 +- src/types.d.ts | 40 +++ 15 files changed, 929 insertions(+), 21 deletions(-) diff --git a/docs/glossary.md b/docs/glossary.md index 966f80f5b..fcd0c3ec2 100644 --- a/docs/glossary.md +++ b/docs/glossary.md @@ -223,6 +223,18 @@ implementations including filesystem and S3. **Contiguous Data Store** - Storage backend for complete transaction data. Manages both data files and verification metadata. + **Content Digest (ar-io-digest)** - The SHA-256 +hash of a piece of contiguous data, base64url-encoded. It is emitted on data +responses as the `X-AR-IO-Digest` header and is the key under which the +[contiguous data store](#contiguous-data-store) addresses bytes on disk +(`data/<prefix>/<subprefix>/<digest>`). 
Because the same value identifies content +across the cache, the index, and the response header, it doubles as a stable +content address. The `GET /ar-io/digest/{digest}` endpoint serves bytes +directly by this value; such responses are inherently self-verifying (the +bytes provably hash to the requested digest) and immutable, but local-cache +only — there is no on-demand fetch by content hash, since Arweave addresses +data by [item ID](#item-id), not by content hash. + ## Data Verification **Data Verification** - The process of cryptographically verifying data diff --git a/docs/openapi.yaml b/docs/openapi.yaml index 597e1cc2c..be31fc57a 100644 --- a/docs/openapi.yaml +++ b/docs/openapi.yaml @@ -1745,6 +1745,115 @@ paths: '416': description: Range not satisfiable for HEAD request + '/ar-io/digest/{digest}': + get: + tags: [Data] + summary: Get data by its content digest (SHA-256) + description: | + Retrieve contiguous data addressed by its SHA-256 content digest — the + same base64url value emitted in the `X-AR-IO-Digest` response header on + every data response (and used as the gateway's on-disk cache key). + + This is a content-addressed endpoint: the bytes returned provably hash + to the requested digest, so the response is always self-verifying + (`X-AR-IO-Verified: true`) and immutable (`Cache-Control: …, immutable`). + + Local-cache only: the gateway can only serve a digest it has already + materialized (via prior retrieval or bundle unbundling). There is no + on-demand fetch by content hash, because Arweave and peers address data + by transaction/data-item id, not by content hash — so a digest the node + has never stored returns 404. + + For header parity with `/raw/{txId}`, a representative id that resolves + to this digest is used to populate the full id-scoped header set + (`X-AR-IO-Data-Id`, tags, owner, signature, root offsets), which are + then covered by the HTTPSIG signature when signing is enabled. 
+ parameters: + - name: digest + in: path + required: true + schema: + $ref: '#/components/schemas/Base64Url43' + description: base64url-encoded SHA-256 content digest (X-AR-IO-Digest) + - name: Range + in: header + required: false + schema: + $ref: '#/components/schemas/ByteRange' + description: Byte range(s) to retrieve + responses: + '200': + description: | + Successful response. Emits the same header set as `/raw/{txId}` + (see that endpoint for the full list), with `X-AR-IO-Verified` + always `true` and an immutable `Cache-Control`. + headers: + Content-Type: + schema: + type: string + example: application/octet-stream + Content-Length: + schema: + type: string + example: 1024 + Cache-Control: + schema: + type: string + example: public, max-age=2592000, immutable + X-AR-IO-Digest: + schema: + type: string + example: '4ROTs2lTPAKbr8Y41WrjHu-2q-7S-m-yTuO7fAUzZI4' + ETag: + schema: + type: string + example: '4ROTs2lTPAKbr8Y41WrjHu-2q-7S-m-yTuO7fAUzZI4' + Content-Digest: + schema: + type: string + description: RFC 9530 compliant digest header with SHA-256 + example: 'sha-256=:4ROTs2lTPAKbr8Y41WrjHu+2q+7S+m+yTuO7fAUzZI4=:' + X-AR-IO-Verified: + schema: + $ref: '#/components/schemas/VerificationStatus' + example: true + X-AR-IO-Data-Id: + schema: + type: string + description: A representative id that resolves to this digest + '206': + description: Partial content for range requests + '400': + description: Malformed digest (not a canonical 43-char base64url SHA-256) + '404': + description: No content for this digest in the local cache + '416': + description: Range not satisfiable + '451': + description: Content blocked by this node's content policy + head: + tags: [Data] + summary: Get headers for data by its content digest + description: | + Existence check / header retrieval for content addressed by its SHA-256 + digest. Returns the same headers as the GET response with no body. 
+ parameters: + - name: digest + in: path + required: true + schema: + $ref: '#/components/schemas/Base64Url43' + description: base64url-encoded SHA-256 content digest (X-AR-IO-Digest) + responses: + '200': + description: Successful response (headers only) + '400': + description: Malformed digest + '404': + description: No content for this digest in the local cache + '451': + description: Content blocked by this node's content policy + # Network and Node Status '/info': get: diff --git a/src/constants.ts b/src/constants.ts index fec5ff41d..dd4e6f4f8 100644 --- a/src/constants.ts +++ b/src/constants.ts @@ -88,6 +88,11 @@ export const verificationPriorities = { export const DATA_PATH_REGEX = /^\/?([a-zA-Z0-9-_]{43})\/?$|^\/?([a-zA-Z0-9-_]{43})\/(.*)$/i; export const RAW_DATA_PATH_REGEX = /^\/raw\/([a-zA-Z0-9-_]{43})\/?$/i; +// Content-addressed data: base64url SHA-256 digest (43 chars), the value +// emitted as X-AR-IO-Digest. Distinct prefix from /raw/:txid because a +// digest is indistinguishable from a 43-char txid by shape alone. +export const DIGEST_DATA_PATH_REGEX = + /^\/ar-io\/digest\/([a-zA-Z0-9-_]{43})\/?$/i; export const FARCASTER_FRAME_DATA_PATH_REGEX = /^\/local\/farcaster\/frame\/([a-zA-Z0-9-_]{43})\/?$/i; diff --git a/src/data/composite-data-attributes-source.ts b/src/data/composite-data-attributes-source.ts index 91067f5cf..48d38064d 100644 --- a/src/data/composite-data-attributes-source.ts +++ b/src/data/composite-data-attributes-source.ts @@ -10,6 +10,7 @@ import winston from 'winston'; import { ContiguousDataAttributes, ContiguousDataAttributesStore, + DataAttributesByHash, DataAttributesSource, } from '../types.js'; @@ -75,6 +76,18 @@ export class CompositeDataAttributesSource } } + /** + * Reverse lookup by content hash. 
Delegated straight to the source — this + * is a cheap indexed point lookup used only by the content-addressed + * endpoint, so it is not worth the cache-coherency cost of caching here + * (the id-keyed cache above would not help a hash-keyed query anyway). + */ + async getDataAttributesByHash( + hash: string, + ): Promise { + return this.source.getDataAttributesByHash(hash); + } + private async fetchAndCache( id: string, ): Promise { diff --git a/src/data/read-through-data-cache.test.ts b/src/data/read-through-data-cache.test.ts index e7c14e69e..b25989358 100644 --- a/src/data/read-through-data-cache.test.ts +++ b/src/data/read-through-data-cache.test.ts @@ -110,6 +110,22 @@ describe('ReadThroughDataCache', function () { return undefined; }, + getDataAttributesByHash: async (hash: string) => { + if (hash === 'knownHash') { + return { + hash: 'knownHash', + size: 100, + contentType: 'text/plain', + id: 'knownId', + }; + } + // Indexed in contiguous_data but the blob is missing from the store. + if (hash === 'indexedButNoBlob') { + return { hash: 'indexedButNoBlob', size: 50 }; + } + return undefined; + }, + // eslint-disable-next-line no-empty-pattern saveDataContentAttributes: async ({}: { id: string; @@ -169,6 +185,50 @@ describe('ReadThroughDataCache', function () { mock.restoreAll(); }); + describe('getDataByHash', () => { + it('streams indexed content addressed by hash, marked self-verifying', async () => { + const result = await readThroughDataCache.getDataByHash('knownHash'); + + assert.equal(result.hash, 'knownHash'); + assert.equal(result.size, 100); + assert.equal(result.totalSize, 100); + assert.equal(result.sourceContentType, 'text/plain'); + // Content-addressed reads are self-verifying and always local-cache. 
+ assert.equal(result.verified, true); + assert.equal(result.trusted, true); + assert.equal(result.cached, true); + + const chunks: Buffer[] = []; + for await (const chunk of result.stream) { + chunks.push(Buffer.from(chunk)); + } + assert.equal(Buffer.concat(chunks).toString(), 'simulated data'); + }); + + it('honors a byte region', async () => { + const result = await readThroughDataCache.getDataByHash('knownHash', { + offset: 0, + size: 4, + }); + assert.equal(result.size, 4); + assert.equal(result.totalSize, 100); + }); + + it('rejects when the hash is not indexed', async () => { + await assert.rejects( + readThroughDataCache.getDataByHash('unknownHash'), + /No content indexed/, + ); + }); + + it('rejects when indexed but the blob is missing from the store', async () => { + await assert.rejects( + readThroughDataCache.getDataByHash('indexedButNoBlob'), + /No cached data/, + ); + }); + }); + describe('getCachedData', () => { it('should return data from cache when available', async () => { let calledWithArgument: string; diff --git a/src/data/read-through-data-cache.ts b/src/data/read-through-data-cache.ts index 8315c2657..f16dcb49d 100644 --- a/src/data/read-through-data-cache.ts +++ b/src/data/read-through-data-cache.ts @@ -435,6 +435,61 @@ export class ReadThroughDataCache implements ContiguousDataSource { return undefined; } + /** + * Serve contiguous data addressed directly by its content hash (the + * value emitted as X-AR-IO-Digest and used as the on-disk cache key). + * + * Unlike {@link getData}, there is no id, no manifest/ArNS resolution, and + * no upstream fall-through: Arweave and peers address by transaction id, + * not by content hash, so a hash we have never materialized cannot be + * fetched on demand. The endpoint therefore serves only content already + * present in the local content store. 
Because the store is keyed by the + * SHA-256 of the bytes, a successful read is self-verifying — the bytes + * provably hash to the requested digest — so the result is reported as + * verified, trusted, and cached. + * + * @throws if no content is indexed for the hash, or the indexed blob is + * missing from the store (evicted/pruned between index and read). + */ + async getDataByHash( + hash: string, + region?: { + offset: number; + size: number; + }, + ): Promise { + const attributes = + await this.contiguousDataIndex.getDataAttributesByHash(hash); + if (attributes === undefined) { + throw new Error(`No content indexed for hash: ${hash}`); + } + + const cacheStream = await this.dataStore.get(hash, region); + if (cacheStream === undefined) { + throw new Error(`No cached data found for hash: ${hash}`); + } + + const requestType = region !== undefined ? 'range' : 'full'; + metrics.getDataStreamSuccessesTotal.inc({ + class: this.constructor.name, + source: 'cache', + request_type: requestType, + }); + + const totalSize = attributes.size; + return { + hash, + stream: cacheStream, + size: region?.size ?? totalSize, + totalSize, + sourceContentType: attributes.contentType, + // Content-addressed: the bytes provably hash to the requested digest. + verified: true, + trusted: true, + cached: true, + }; + } + async getData({ id, requestAttributes, diff --git a/src/database/sql/data/content-attributes.sql b/src/database/sql/data/content-attributes.sql index dafd1b94e..adc9d8d74 100644 --- a/src/database/sql/data/content-attributes.sql +++ b/src/database/sql/data/content-attributes.sql @@ -155,6 +155,23 @@ FROM ( ) LIMIT 1 +-- selectDataAttributesByHash +-- Reverse lookup: resolve content metadata directly from the content hash +-- (the value emitted as X-AR-IO-Digest and used as the on-disk cache key). 
+-- contiguous_data is keyed by hash (primary-key point lookup); the LEFT JOIN +-- additionally surfaces one representative id that resolves to this hash +-- (via the contiguous_data_hash index) so the content-addressed endpoint can +-- emit id-scoped response headers. Many ids may share a hash; any one will do. +SELECT + cd.hash, + cd.data_size, + cd.original_source_content_type, + cdi.id AS id +FROM contiguous_data cd +LEFT JOIN contiguous_data_ids cdi ON cdi.contiguous_data_hash = cd.hash +WHERE cd.hash = :hash +LIMIT 1 + -- selectDataParent SELECT cdip.parent_id, diff --git a/src/database/standalone-sqlite.test.ts b/src/database/standalone-sqlite.test.ts index c1d641a7e..eb056c8d7 100644 --- a/src/database/standalone-sqlite.test.ts +++ b/src/database/standalone-sqlite.test.ts @@ -1790,6 +1790,42 @@ describe('StandaloneSqliteDatabase', () => { }); }); + describe('getDataAttributesByHash', () => { + // Canonical (round-trip-stable) 43-char base64url values. + const HASH = crypto + .createHash('sha256') + .update('by-hash-content') + .digest('base64url'); + const ID = crypto + .createHash('sha256') + .update('by-hash-representative-id') + .digest('base64url'); + + it('resolves size and a representative id from the content hash', async () => { + await db.saveDataContentAttributes({ + id: ID, + hash: HASH, + dataSize: 4321, + verified: true, + }); + + const attrs = await db.getDataAttributesByHash(HASH); + assert.notEqual(attrs, undefined); + assert.equal(attrs!.hash, HASH); + assert.equal(attrs!.size, 4321); + assert.equal(attrs!.id, ID); + }); + + it('returns undefined for a hash with no content indexed', async () => { + const unknown = crypto + .createHash('sha256') + .update('never-stored') + .digest('base64url'); + const attrs = await db.getDataAttributesByHash(unknown); + assert.equal(attrs, undefined); + }); + }); + describe('upsertNewDataItem clobber resistance (PE-9073)', () => { // Regression: after the unbundle path back-fills parent_id / // 
root_transaction_id / data_offset on a previously-optimistic data item, diff --git a/src/database/standalone-sqlite.ts b/src/database/standalone-sqlite.ts index d55fb5e05..ed03c9c67 100644 --- a/src/database/standalone-sqlite.ts +++ b/src/database/standalone-sqlite.ts @@ -49,6 +49,7 @@ import { ContiguousDataIndex, DataAttributesSource, ContiguousDataParent, + DataAttributesByHash, DataItemAttributes, GqlQueryable, GqlTransaction, @@ -1223,6 +1224,23 @@ export class StandaloneSqliteDatabaseWorker { }; } + getDataAttributesByHash(hash: string) { + const row = this.stmts.data.selectDataAttributesByHash.get({ + hash: fromB64Url(hash), + }); + + if (row === undefined) { + return undefined; + } + + return { + hash: row.hash ? toB64Url(row.hash) : hash, + size: row.data_size, + contentType: row.original_source_content_type ?? undefined, + id: row.id ? toB64Url(row.id) : undefined, + }; + } + getDataItemAttributes(id: string) { const row = this.stmts.bundles.selectDataItemAttributes.get({ id: fromB64Url(id), @@ -2988,6 +3006,11 @@ export class StandaloneSqliteDatabase Awaited> >; + private getDataAttributesByHashCircuitBreaker: CircuitBreaker< + Parameters, + Awaited> + >; + private getDataItemAttributesCircuitBreaker: CircuitBreaker< Parameters, Awaited> @@ -3051,6 +3074,16 @@ export class StandaloneSqliteDatabase }, ); + this.getDataAttributesByHashCircuitBreaker = new CircuitBreaker( + (hash: string) => { + return this.queueRead('data', `getDataAttributesByHash`, [hash]); + }, + { + name: 'getDataAttributesByHash', + ...dataIndexCircuitBreakerOptions, + }, + ); + this.getDataItemAttributesCircuitBreaker = new CircuitBreaker( (id: string) => { return this.queueRead('bundles', `getDataItemAttributes`, [id]); @@ -3075,12 +3108,14 @@ export class StandaloneSqliteDatabase metrics.circuitBreakerMetrics.add([ this.getDataParentCircuitBreaker, this.getDataAttributesCircuitBreaker, + this.getDataAttributesByHashCircuitBreaker, this.getDataItemAttributesCircuitBreaker, 
this.getTransactionAttributesCircuitBreaker, ]); Object.entries({ 'get-data-parent': this.getDataParentCircuitBreaker, 'get-data-attributes': this.getDataAttributesCircuitBreaker, + 'get-data-attributes-by-hash': this.getDataAttributesByHashCircuitBreaker, 'get-data-item-attributes': this.getDataItemAttributesCircuitBreaker, 'get-transaction-attributes': this.getTransactionAttributesCircuitBreaker, } satisfies Partial>).forEach( @@ -3467,6 +3502,16 @@ export class StandaloneSqliteDatabase } } + async getDataAttributesByHash( + hash: string, + ): Promise { + try { + return await this.getDataAttributesByHashCircuitBreaker.fire(hash); + } catch (_) { + return undefined; + } + } + async getDataItemAttributes( id: string, ): Promise { @@ -3944,6 +3989,10 @@ if (!isMainThread) { const dataAttributes = worker.getDataAttributes(args[0]); parentPort?.postMessage(dataAttributes); break; + case 'getDataAttributesByHash': + const dataAttributesByHash = worker.getDataAttributesByHash(args[0]); + parentPort?.postMessage(dataAttributesByHash); + break; case 'getDataItemAttributes': const dataItemAttributes = worker.getDataItemAttributes(args[0]); parentPort?.postMessage(dataItemAttributes); diff --git a/src/metrics.ts b/src/metrics.ts index a5e8d0d28..c20674d65 100644 --- a/src/metrics.ts +++ b/src/metrics.ts @@ -900,6 +900,7 @@ const breakerSourceNames = [ 'ar-io-peer-manager', 'composite-sqlite-gql', 'get-data-attributes', + 'get-data-attributes-by-hash', 'get-data-item-attributes', 'get-data-parent', 'get-transaction-attributes', diff --git a/src/routes/ar-io.ts b/src/routes/ar-io.ts index 4861dee0f..29db5f3b9 100644 --- a/src/routes/ar-io.ts +++ b/src/routes/ar-io.ts @@ -15,9 +15,10 @@ import { db, signatureStore, ownerStore } from '../system.js'; import log from '../log.js'; import { ParquetExporter } from '../workers/parquet-exporter.js'; import { NormalizedDataItem, PartialJsonTransaction } from '../types.js'; -import { DATA_PATH_REGEX } from '../constants.js'; +import { 
DATA_PATH_REGEX, DIGEST_DATA_PATH_REGEX } from '../constants.js'; import { isEmptyString } from '../lib/string.js'; import { buildArIoInfo } from './ar-io-info-builder.js'; +import { digestDataHandler } from './data/index.js'; export const arIoRouter = Router(); export let parquetExporter: ParquetExporter | null = null; @@ -228,6 +229,11 @@ export const arIoInfoHandler = (_req: Request, res: Response) => { }; arIoRouter.get('/ar-io/info', arIoInfoHandler); +// Content-addressed data: serve bytes by their SHA-256 digest (the value +// emitted as X-AR-IO-Digest). GET registration also answers HEAD. Local +// content store only — see createDigestDataHandler. +arIoRouter.get(DIGEST_DATA_PATH_REGEX, digestDataHandler); + // peer list arIoRouter.get('/ar-io/peers', async (_req, res) => { try { diff --git a/src/routes/data/handlers.test.ts b/src/routes/data/handlers.test.ts index dddf98d63..78cc8cf8f 100644 --- a/src/routes/data/handlers.test.ts +++ b/src/routes/data/handlers.test.ts @@ -6,6 +6,7 @@ */ import { strict as assert } from 'node:assert'; import { afterEach, beforeEach, describe, it, mock } from 'node:test'; +import crypto from 'node:crypto'; import express from 'express'; import { Readable } from 'node:stream'; import { default as request } from 'supertest'; @@ -26,10 +27,12 @@ import { } from '../../types.js'; import { createDataHandler, + createDigestDataHandler, getRequestAttributes, matchContentTypePattern, shouldUsePrivateCacheControl, } from './handlers.js'; +import { ByHashDataSource } from '../../types.js'; import { MemoryRateLimiter } from '../../limiter/memory-rate-limiter.js'; import type { PaymentProcessor, @@ -64,6 +67,7 @@ describe('Data routes', () => { }; dataAttributesSource = { getDataAttributes: () => Promise.resolve(undefined), + getDataAttributesByHash: () => Promise.resolve(undefined), }; dataSource = { getData: (params?: any) => { @@ -3364,4 +3368,187 @@ st }); }); }); + + describe('createDigestDataHandler', () => { + const CONTENT = 
Buffer.from('digest-addressed test content'); + const DIGEST = crypto + .createHash('sha256') + .update(CONTENT) + .digest('base64url'); + // A representative 43-char base64url id that resolves to the digest. + const REPRESENTATIVE_ID = crypto + .createHash('sha256') + .update('representative-id') + .digest('base64url'); + + let app: express.Express; + let dataAttributesSource: DataAttributesSource; + let dataSource: ByHashDataSource; + let dataBlockListValidator: DataBlockListValidator; + + // The handler reads req.params[0]; a regex route populates it. + const route = /^\/ar-io\/digest\/(.+)$/; + + const build = () => + createDigestDataHandler({ + log, + dataSource, + dataAttributesSource, + dataBlockListValidator, + }); + + beforeEach(() => { + app = express(); + dataAttributesSource = { + getDataAttributes: () => + Promise.resolve({ + hash: 'stale-hash-should-be-overridden', + size: CONTENT.length, + offset: 0, + contentType: 'text/plain', + stable: true, + verified: false, + } as any), + getDataAttributesByHash: (hash: string) => + Promise.resolve( + hash === DIGEST + ? { + hash: DIGEST, + size: CONTENT.length, + contentType: 'text/plain', + id: REPRESENTATIVE_ID, + } + : undefined, + ), + }; + dataSource = { + getDataByHash: (hash: string, region?: any) => { + if (hash !== DIGEST) { + return Promise.reject(new Error('No content for hash')); + } + let body = CONTENT; + if (region) { + body = CONTENT.subarray(region.offset, region.offset + region.size); + } + return Promise.resolve({ + hash: DIGEST, + stream: Readable.from(body), + size: region ? 
region.size : CONTENT.length, + totalSize: CONTENT.length, + sourceContentType: 'text/plain', + verified: true, + trusted: true, + cached: true, + }); + }, + }; + dataBlockListValidator = { + isIdBlocked: () => Promise.resolve(false), + isHashBlocked: () => Promise.resolve(false), + }; + }); + + afterEach(() => { + mock.restoreAll(); + }); + + it('serves bytes with full signed headers (digest, data-id, verified)', async () => { + app.get(route, build()); + + return request(app) + .get(`${'/ar-io/digest/'}${DIGEST}`) + .expect(200) + .then((res: any) => { + assert.equal(res.text, CONTENT.toString()); + assert.equal(res.headers[headerNames.digest.toLowerCase()], DIGEST); + assert.equal(res.headers['etag'], `"${DIGEST}"`); + assert.equal(res.headers[headerNames.verified.toLowerCase()], 'true'); + assert.equal(res.headers[headerNames.trusted.toLowerCase()], 'true'); + // Data id (a TRIGGER_HEADER) drives HTTPSIG signing. + assert.equal( + res.headers[headerNames.dataId.toLowerCase()], + REPRESENTATIVE_ID, + ); + // Content-addressed → immutable. 
+ assert.ok(res.headers['cache-control'].includes('immutable')); + assert.equal( + res.headers['content-type'], + 'text/plain; charset=utf-8', + ); + assert.equal(res.headers['content-length'], String(CONTENT.length)); + }); + }); + + it('responds to HEAD with headers and no body', async () => { + app.get(route, build()); + + return request(app) + .head(`/ar-io/digest/${DIGEST}`) + .expect(200) + .then((res: any) => { + assert.deepEqual(res.body, {}); + assert.equal(res.headers[headerNames.digest.toLowerCase()], DIGEST); + assert.equal(res.headers['etag'], `"${DIGEST}"`); + }); + }); + + it('returns 206 and partial bytes for a range request', async () => { + app.get(route, build()); + + return request(app) + .get(`/ar-io/digest/${DIGEST}`) + .set('Range', 'bytes=0-3') + .expect(206) + .then((res: any) => { + assert.equal(res.text, CONTENT.subarray(0, 4).toString()); + assert.equal( + res.headers['content-range'], + `bytes 0-3/${CONTENT.length}`, + ); + }); + }); + + it('returns 404 for a digest with no content in the store', async () => { + const unknown = crypto + .createHash('sha256') + .update('not-stored') + .digest('base64url'); + app.get(route, build()); + + return request(app).get(`/ar-io/digest/${unknown}`).expect(404); + }); + + it('returns 451 when the content hash is blocked', async () => { + dataBlockListValidator.isHashBlocked = () => Promise.resolve(true); + app.get(route, build()); + + return request(app).get(`/ar-io/digest/${DIGEST}`).expect(451); + }); + + it('returns 400 for a malformed (non-43-char) digest', async () => { + app.get(route, build()); + + return request(app).get('/ar-io/digest/too-short').expect(400); + }); + + it('still serves (200) when no representative id is indexed', async () => { + dataAttributesSource.getDataAttributesByHash = () => + Promise.resolve({ + hash: DIGEST, + size: CONTENT.length, + contentType: 'text/plain', + // no id + }); + app.get(route, build()); + + return request(app) + .get(`/ar-io/digest/${DIGEST}`) + 
.expect(200) + .then((res: any) => { + assert.equal(res.text, CONTENT.toString()); + assert.equal(res.headers[headerNames.digest.toLowerCase()], DIGEST); + assert.equal(res.headers[headerNames.verified.toLowerCase()], 'true'); + }); + }); + }); }); diff --git a/src/routes/data/handlers.ts b/src/routes/data/handlers.ts index aa6d4013a..d0182eafa 100644 --- a/src/routes/data/handlers.ts +++ b/src/routes/data/handlers.ts @@ -28,6 +28,7 @@ import { isValidTxId } from '../../lib/validation.js'; import { TxMetadataResolver } from '../../data/tx-metadata-resolver.js'; import { DataBlockListValidator, + ByHashDataSource, ContiguousData, ContiguousDataAttributes, ContiguousDataSource, @@ -822,7 +823,7 @@ export const getRequestAttributes = ( interface HandleRangeRequestArgs { log: Logger; - dataSource: ContiguousDataSource; + dataSource?: ContiguousDataSource; rangeHeader: string; res: Response; req: Request; @@ -831,6 +832,15 @@ interface HandleRangeRequestArgs { dataAttributes: ContiguousDataAttributes | undefined; requestAttributes: RequestAttributes; parentSpan?: Span; + /** + * Optional override for fetching a byte region. When provided it is used + * instead of `dataSource.getData`, letting content-addressed callers + * (the /ar-io/digest endpoint) reuse this range machinery without an id. + */ + getRegionData?: (region: { + offset: number; + size: number; + }) => Promise; } const handleRangeRequest = async ({ @@ -844,6 +854,7 @@ const handleRangeRequest = async ({ dataAttributes, requestAttributes, parentSpan, + getRegionData, }: HandleRangeRequestArgs) => { const { startChildSpan } = await import('../../tracing.js'); const span = startChildSpan( @@ -858,6 +869,19 @@ const handleRangeRequest = async ({ parentSpan, ); + // Fetch a single byte region, by content hash when getRegionData is + // supplied, otherwise by id through the standard data source. + const fetchRegion = (region: { offset: number; size: number }) => + getRegionData !== undefined + ? 
getRegionData(region) + : dataSource!.getData({ + id, + requestAttributes, + region, + parentSpan: span, + signal: req.signal, + }); + try { const ranges = rangeParser(data.size, rangeHeader); @@ -909,15 +933,9 @@ const handleRangeRequest = async ({ return; } - const rangeData = await dataSource.getData({ - id, - requestAttributes, - region: { - offset: start, - size: end - start + 1, - }, - parentSpan: span, - signal: req.signal, + const rangeData = await fetchRegion({ + offset: start, + size: end - start + 1, }); pipeStreamToResponse(rangeData.stream, res, log, id); @@ -971,15 +989,9 @@ const handleRangeRequest = async ({ const start = range.start; const end = range.end; - const rangeData = await dataSource.getData({ - id, - requestAttributes, - region: { - offset: start, - size: end - start + 1, - }, - parentSpan: span, - signal: req.signal, + const rangeData = await fetchRegion({ + offset: start, + size: end - start + 1, }); rangeStreams.push({ range, stream: rangeData.stream }); @@ -1379,6 +1391,296 @@ export const createRawDataHandler = ({ }); }; +/** + * Serve contiguous data addressed by its content hash (the value emitted as + * `X-AR-IO-Digest`) at `GET|HEAD /ar-io/digest/:digest`. + * + * Local-cache only — there is no on-demand fetch by content hash (Arweave and + * peers address by id), so an unknown digest is a 404. Bytes stream from the + * hash-keyed content store and are therefore self-verifying. + * + * The representation is immutable — the URL *is* the hash of the bytes — + * and self-verifying, so we cache hard: `Cache-Control` is pinned to + * `immutable` before {@link setDataHeaders} runs (which only sets it when + * absent). The gateway stands behind the digest as both a cache validator + * (ETag) and a signed integrity header (Content-Digest), + * unconditionally. 
+ * + * For header parity with `/raw`, a representative id that resolves to this + * digest is looked up and run through the same {@link setDataHeaders} path, + * so the response carries the full id-scoped header set (X-AR-IO-Data-Id, + * tags, owner, signature, root offsets, …) which the HTTPSIG middleware then + * signs. The served digest is pinned onto the attributes so the digest/ETag/ + * Content-Digest headers always describe the bytes actually streamed, even if + * the representative id's index entry has since changed. + */ +export const createDigestDataHandler = ({ + log, + dataSource, + dataAttributesSource, + dataBlockListValidator, + rateLimiter, + paymentProcessor, + dataItemMetaResolver, +}: { + log: Logger; + dataSource: ByHashDataSource; + dataAttributesSource: DataAttributesSource; + dataBlockListValidator: DataBlockListValidator; + rateLimiter?: RateLimiter; + paymentProcessor?: PaymentProcessor; + dataItemMetaResolver?: TxMetadataResolver; +}) => { + return asyncHandler(async (req: Request, res: Response) => { + const requestAttributes = getRequestAttributes(req, res); + const digest = req.params[0]; + + const span = tracer.startSpan('DigestDataHandler.handle', { + attributes: { + 'http.method': req.method, + 'http.target': req.originalUrl, + 'data.request.digest': digest, + 'client.ip': requestAttributes?.clientIp ?? 'unknown', + }, + }); + + return context.with(trace.setSpan(context.active(), span), async () => { + try { + // Validate the digest is a canonical 43-char base64url SHA-256. The + // route regex already enforces shape; this also rejects non-canonical + // encodings (round-trip mismatch). 
+ if ( + digest == null || + !digest.match(/^[a-zA-Z0-9-_]{43}$/) || + Buffer.from(digest, 'base64url').toString('base64url') !== digest + ) { + span.setAttribute('http.status_code', 400); + span.setAttribute('data.error', 'invalid_digest'); + log.warn('Invalid digest', { digest }); + res.status(400).send(`Invalid digest: ${digest}`); + return; + } + + // Return 451 if the content hash is blocked by this node's policy. + try { + if (await dataBlockListValidator.isHashBlocked(digest)) { + span.setAttribute('http.status_code', 451); + span.setAttribute('data.error', 'hash_blocked'); + sendBlocked(res, digest); + return; + } + } catch (error: any) { + span.recordException(error); + log.error('Error checking blocklist:', { + digest, + message: error.message, + stack: error.stack, + }); + } + + // Resolve a representative id for this digest (cheap indexed lookup) + // so the response can carry the full id-scoped, signed header set. + const byHash = + await dataAttributesSource.getDataAttributesByHash(digest); + const resolvedId = byHash?.id; + if (resolvedId !== undefined) { + span.setAttribute('data.representative_id', resolvedId); + } + + // Fire item header (tags/owner/signature) resolution early, in + // parallel with the byte fetch, exactly as the raw handler does. + const tagsPromise = + resolvedId !== undefined + ? fireItemHeaderResolution(resolvedId, dataItemMetaResolver) + : Promise.resolve(undefined); + + let data: ContiguousData; + try { + data = await dataSource.getDataByHash(digest); + } catch (error: any) { + if (error.name === 'AbortError' && req.signal?.aborted) { + throw error; + } + // Not indexed, or the blob is gone from the store. Either way the + // gateway can't serve it and can't fetch it by hash on demand. 
+ span.setAttribute('http.status_code', 404); + span.setAttribute('data.error', 'not_found_by_hash'); + log.debug('No content available for digest', { + digest, + message: error.message, + }); + sendNotFound(res); + return; + } + + try { + // Build the same attributes shape /raw uses. Prefer the + // representative id's full attributes (stable flag, root tx id, + // offsets); fall back to a minimal synthesized set when no id is + // indexed for the hash. Either way pin hash to the served digest + // and verified=true (content-addressed bytes are self-verifying). + let dataAttributes: ContiguousDataAttributes | undefined; + if (resolvedId !== undefined) { + const attrs = + await dataAttributesSource.getDataAttributes(resolvedId); + if (attrs !== undefined) { + // Clone — the attributes source caches this object. + dataAttributes = { ...attrs, hash: digest, verified: true }; + } + } + dataAttributes ??= { + hash: digest, + size: data.totalSize ?? data.size, + offset: 0, + contentType: data.sourceContentType, + isManifest: data.sourceContentType === MANIFEST_CONTENT_TYPE, + stable: false, + verified: true, + } as ContiguousDataAttributes; + + // Header id for X-AR-IO-Data-Id: the representative id when known. + const headerId = resolvedId ?? digest; + + // === PAYMENT AND RATE LIMIT CHECK === + const allowed = await handleDataRateLimitingAndPayment({ + req, + res, + id: headerId, + data, + dataAttributes, + requestAttributes, + rateLimiter, + paymentProcessor, + parentSpan: span, + log, + }); + if (!allowed) { + return; + } + + // Content-addressed responses are immutable: the URL is the hash of + // the bytes. Pin Cache-Control before setDataHeaders (which only + // sets it when absent) so it is always marked immutable. + const contentType = + dataAttributes.contentType ?? + data.sourceContentType ?? + DEFAULT_CONTENT_TYPE; + const usePrivate = shouldUsePrivateCacheControl( + contentType, + data.size, + ); + res.header( + 'Cache-Control', + `${usePrivate ? 
'private' : 'public'}, max-age=${ + config.CACHE_STABLE_MAX_AGE + }, immutable`, + ); + + const itemHeaders = await awaitItemHeaders( + tagsPromise, + data.upstreamTags, + headerId, + resolvedId !== undefined ? dataItemMetaResolver : undefined, + ); + + const rangeHeader = req.headers.range; + if (rangeHeader !== undefined) { + span.addEvent('Handling range request'); + span.setAttribute('data.request.range_request', true); + // Range requests create new streams so the original is no longer + // needed. + data.stream.destroy(); + setDataHeaders({ + req, + res, + dataAttributes, + data, + id: headerId, + itemHeaders, + }); + await handleRangeRequest({ + log, + rangeHeader, + res, + req, + data, + id: headerId, + dataAttributes, + requestAttributes, + parentSpan: span, + getRegionData: (region) => + dataSource.getDataByHash(digest, region), + }); + span.setAttribute('http.status_code', res.statusCode); + return; + } + + setDataHeaders({ + req, + res, + dataAttributes, + data, + id: headerId, + itemHeaders, + }); + if (data.size > 0) { + res.header('Content-Length', data.size.toString()); + } + + // Handle If-None-Match for both HEAD and GET requests. 
+ if (handleIfNoneMatch(req, res)) { + span.setAttribute('http.status_code', 304); + res.end(); + data.stream.destroy(); + return; + } + + if (req.method === REQUEST_METHOD_HEAD) { + span.setAttribute('http.status_code', res.statusCode || 200); + res.end(); + data.stream.destroy(); + return; + } + + span.setAttribute('http.status_code', res.statusCode || 200); + span.addEvent('Streaming data to client'); + await sendBodyWithOptionalDigest({ + req, + res, + data, + log, + dataId: headerId, + }); + } catch (error: any) { + if (error.name === 'AbortError' && req.signal?.aborted) { + span.setAttribute('http.status_code', 499); + data.stream.destroy(); + if (!res.headersSent) { + res.status(499).end(); + } + return; + } + data.stream.destroy(); + throw error; + } + } catch (error: any) { + span.recordException(error); + span.setAttribute('http.status_code', 500); + log.error('Unexpected error in digest data handler:', { + digest, + message: error.message, + stack: error.stack, + }); + if (!res.headersSent) { + res.status(500).send('Internal server error'); + } + } finally { + span.end(); + } + }); + }); +}; + const sendManifestResponse = async ({ log, req, diff --git a/src/routes/data/index.ts b/src/routes/data/index.ts index 0efac09d9..4fccab4fc 100644 --- a/src/routes/data/index.ts +++ b/src/routes/data/index.ts @@ -13,7 +13,11 @@ import { RAW_DATA_PATH_REGEX, FARCASTER_FRAME_DATA_PATH_REGEX, } from '../../constants.js'; -import { createDataHandler, createRawDataHandler } from './handlers.js'; +import { + createDataHandler, + createDigestDataHandler, + createRawDataHandler, +} from './handlers.js'; // Used by ArNS Router export const dataHandler = createDataHandler({ @@ -28,6 +32,18 @@ export const dataHandler = createDataHandler({ dataItemMetaResolver: system.dataItemTagHeaderResolver, }); +// Content-addressed data handler, mounted by the AR.IO router at +// /ar-io/digest/:digest (see src/routes/ar-io.ts). 
+export const digestDataHandler = createDigestDataHandler({ + log, + dataSource: system.onDemandContiguousDataSource, + dataAttributesSource: system.dataAttributesStore, + dataBlockListValidator: system.dataBlockListValidator, + rateLimiter: system.rateLimiter, + paymentProcessor: system.paymentProcessor, + dataItemMetaResolver: system.dataItemTagHeaderResolver, +}); + export const dataRouter = Router(); dataRouter.get(DATA_PATH_REGEX, dataHandler); diff --git a/src/types.d.ts b/src/types.d.ts index af3edf6b6..68a806b52 100644 --- a/src/types.d.ts +++ b/src/types.d.ts @@ -978,6 +978,29 @@ export interface TransactionAttributes { owner: string | null; } +/** + * Minimal content metadata resolved directly from a content hash (the + * value emitted as X-AR-IO-Digest and used as the on-disk cache key). + * Unlike {@link ContiguousDataAttributes}, this carries no id-scoped fields + * (offsets, bundle hierarchy, verification) because the lookup is keyed by + * content, not by a transaction or data item id. + */ +export interface DataAttributesByHash { + /** SHA-256 hash of the contiguous data (base64url encoded). */ + hash: string; + /** Total size of the data payload in bytes. */ + size: number; + /** Source-declared content type, if recorded. */ + contentType?: string; + /** + * A representative transaction/data item id that resolves to this hash, + * if any is indexed. Many ids can share one hash (identical content); this + * is an arbitrary one, used to populate id-scoped response headers + * (X-AR-IO-Data-Id, tags, signature) on the content-addressed endpoint. 
+ */ + id?: string; +} + export interface ContiguousDataParent { parentId: string; parentHash?: string; @@ -987,6 +1010,9 @@ export interface ContiguousDataParent { export interface DataAttributesSource { getDataAttributes(id: string): Promise<ContiguousDataAttributes | undefined>; + getDataAttributesByHash( + hash: string, + ): Promise<DataAttributesByHash | undefined>; } export interface ContiguousDataAttributesStore extends DataAttributesSource { @@ -998,6 +1024,9 @@ export interface ContiguousDataAttributesStore extends DataAttributesSource { export interface ContiguousDataIndex { getDataAttributes(id: string): Promise; + getDataAttributesByHash( + hash: string, + ): Promise<DataAttributesByHash | undefined>; getDataItemAttributes(id: string): Promise; getTransactionAttributes( id: string, @@ -1111,6 +1140,17 @@ export interface ContiguousDataSource { }): Promise; } +/** + * Serves contiguous data addressed by its content hash (X-AR-IO-Digest / + * on-disk cache key) rather than by transaction or data item id. Implemented + * by the local read-through cache; see {@link ContiguousDataSource} for the + * id-addressed path. Rejects when the hash is not present in the local + * content store (there is no on-demand fetch by content hash). + */ +export interface ByHashDataSource { + getDataByHash(hash: string, region?: Region): Promise<ContiguousData>; +} + /** * Discriminator describing how a manifest path was resolved. * From 4bf33ddec31a0a2e488ebdb2e1ed1b06b13d1491 Mon Sep 17 00:00:00 2001 From: vilenarios Date: Wed, 27 May 2026 01:49:47 +0000 Subject: [PATCH 2/2] refine(data): deterministic representative id + single by-hash lookup - selectDataAttributesByHash: ORDER BY verified DESC, trusted DESC, id ASC so when several ids share a hash (byte-identical content, different signed envelopes) the representative is deterministic and prefers the strongest provenance, rather than depending on index iteration order. - Collapse the per-request double lookup: getDataByHash now returns the representative id (ByHashData), so the handler no longer issues its own getDataAttributesByHash call.
Removed the now-unused method from DataAttributesSource / CompositeDataAttributesSource (kept on ContiguousDataIndex, which getDataByHash uses). Co-Authored-By: Claude Opus 4.7 (1M context) --- src/data/composite-data-attributes-source.ts | 13 ------- src/data/read-through-data-cache.test.ts | 2 ++ src/data/read-through-data-cache.ts | 6 +++- src/database/sql/data/content-attributes.sql | 9 ++++- src/database/standalone-sqlite.test.ts | 34 ++++++++++++++++++ src/routes/data/handlers.test.ts | 25 ++++++-------- src/routes/data/handlers.ts | 36 +++++++++++--------- src/types.d.ts | 15 +++++--- 8 files changed, 89 insertions(+), 51 deletions(-) diff --git a/src/data/composite-data-attributes-source.ts b/src/data/composite-data-attributes-source.ts index 48d38064d..91067f5cf 100644 --- a/src/data/composite-data-attributes-source.ts +++ b/src/data/composite-data-attributes-source.ts @@ -10,7 +10,6 @@ import winston from 'winston'; import { ContiguousDataAttributes, ContiguousDataAttributesStore, - DataAttributesByHash, DataAttributesSource, } from '../types.js'; @@ -76,18 +75,6 @@ export class CompositeDataAttributesSource } } - /** - * Reverse lookup by content hash. Delegated straight to the source — this - * is a cheap indexed point lookup used only by the content-addressed - * endpoint, so it is not worth the cache-coherency cost of caching here - * (the id-keyed cache above would not help a hash-keyed query anyway). 
- */ - async getDataAttributesByHash( - hash: string, - ): Promise<DataAttributesByHash | undefined> { - return this.source.getDataAttributesByHash(hash); - } - private async fetchAndCache( id: string, ): Promise { diff --git a/src/data/read-through-data-cache.test.ts b/src/data/read-through-data-cache.test.ts index b25989358..44a76091c 100644 --- a/src/data/read-through-data-cache.test.ts +++ b/src/data/read-through-data-cache.test.ts @@ -197,6 +197,8 @@ describe('ReadThroughDataCache', function () { assert.equal(result.verified, true); assert.equal(result.trusted, true); assert.equal(result.cached, true); + // The single internal lookup also surfaces the representative id. + assert.equal(result.representativeId, 'knownId'); const chunks: Buffer[] = []; for await (const chunk of result.stream) { diff --git a/src/data/read-through-data-cache.ts b/src/data/read-through-data-cache.ts index f16dcb49d..2ee4ed674 100644 --- a/src/data/read-through-data-cache.ts +++ b/src/data/read-through-data-cache.ts @@ -20,6 +20,7 @@ import * as metrics from '../metrics.js'; import { KvJsonStore } from '../store/kv-attributes-store.js'; import { startChildSpan } from '../tracing.js'; import { + ByHashData, ContiguousData, ContiguousDataAttributesStore, ContiguousDataIndex, @@ -457,7 +458,7 @@ export class ReadThroughDataCache implements ContiguousDataSource { offset: number; size: number; }, - ): Promise<ContiguousData> { + ): Promise<ByHashData> { const attributes = await this.contiguousDataIndex.getDataAttributesByHash(hash); if (attributes === undefined) { @@ -487,6 +488,9 @@ export class ReadThroughDataCache implements ContiguousDataSource { verified: true, trusted: true, cached: true, + // A representative id resolving to this hash (the same single lookup + // above), so callers need not re-query to emit id-scoped headers.
+ representativeId: attributes.id, }; } diff --git a/src/database/sql/data/content-attributes.sql b/src/database/sql/data/content-attributes.sql index adc9d8d74..d8800ce63 100644 --- a/src/database/sql/data/content-attributes.sql +++ b/src/database/sql/data/content-attributes.sql @@ -161,7 +161,11 @@ LIMIT 1 -- contiguous_data is keyed by hash (primary-key point lookup); the LEFT JOIN -- additionally surfaces one representative id that resolves to this hash -- (via the contiguous_data_hash index) so the content-addressed endpoint can --- emit id-scoped response headers. Many ids may share a hash; any one will do. +-- emit id-scoped response headers. Many ids may share a hash (byte-identical +-- content under different signed envelopes); the ORDER BY makes the choice +-- deterministic and prefers the strongest provenance — a verified id over a +-- trusted one over an arbitrary one — so the representative is stable across +-- requests rather than dependent on index iteration order. SELECT cd.hash, cd.data_size, @@ -170,6 +174,9 @@ SELECT FROM contiguous_data cd LEFT JOIN contiguous_data_ids cdi ON cdi.contiguous_data_hash = cd.hash WHERE cd.hash = :hash +ORDER BY cdi.verified DESC NULLS LAST, + cdi.trusted DESC NULLS LAST, + cdi.id ASC LIMIT 1 -- selectDataParent diff --git a/src/database/standalone-sqlite.test.ts b/src/database/standalone-sqlite.test.ts index eb056c8d7..06da18638 100644 --- a/src/database/standalone-sqlite.test.ts +++ b/src/database/standalone-sqlite.test.ts @@ -1824,6 +1824,40 @@ describe('StandaloneSqliteDatabase', () => { const attrs = await db.getDataAttributesByHash(unknown); assert.equal(attrs, undefined); }); + + it('deterministically prefers a verified id when several share a hash', async () => { + // Two distinct ids with byte-identical content → same hash. The + // representative must deterministically be the verified one, not + // whichever the index happens to yield first. 
+ const sharedHash = crypto + .createHash('sha256') + .update('shared-by-two-ids') + .digest('base64url'); + const unverifiedId = crypto + .createHash('sha256') + .update('dup-unverified') + .digest('base64url'); + const verifiedId = crypto + .createHash('sha256') + .update('dup-verified') + .digest('base64url'); + + await db.saveDataContentAttributes({ + id: unverifiedId, + hash: sharedHash, + dataSize: 99, + verified: false, + }); + await db.saveDataContentAttributes({ + id: verifiedId, + hash: sharedHash, + dataSize: 99, + verified: true, + }); + + const attrs = await db.getDataAttributesByHash(sharedHash); + assert.equal(attrs!.id, verifiedId); + }); }); describe('upsertNewDataItem clobber resistance (PE-9073)', () => { diff --git a/src/routes/data/handlers.test.ts b/src/routes/data/handlers.test.ts index 78cc8cf8f..7f6fdc56e 100644 --- a/src/routes/data/handlers.test.ts +++ b/src/routes/data/handlers.test.ts @@ -67,7 +67,6 @@ describe('Data routes', () => { }; dataAttributesSource = { getDataAttributes: () => Promise.resolve(undefined), - getDataAttributesByHash: () => Promise.resolve(undefined), }; dataSource = { getData: (params?: any) => { @@ -3409,17 +3408,6 @@ st stable: true, verified: false, } as any), - getDataAttributesByHash: (hash: string) => - Promise.resolve( - hash === DIGEST - ? { - hash: DIGEST, - size: CONTENT.length, - contentType: 'text/plain', - id: REPRESENTATIVE_ID, - } - : undefined, - ), }; dataSource = { getDataByHash: (hash: string, region?: any) => { @@ -3439,6 +3427,7 @@ st verified: true, trusted: true, cached: true, + representativeId: REPRESENTATIVE_ID, }); }, }; @@ -3532,12 +3521,18 @@ st }); it('still serves (200) when no representative id is indexed', async () => { - dataAttributesSource.getDataAttributesByHash = () => + // getDataByHash resolves the bytes but yields no representative id. 
+ dataSource.getDataByHash = () => Promise.resolve({ hash: DIGEST, + stream: Readable.from(CONTENT), size: CONTENT.length, - contentType: 'text/plain', - // no id + totalSize: CONTENT.length, + sourceContentType: 'text/plain', + verified: true, + trusted: true, + cached: true, + // no representativeId }); app.get(route, build()); diff --git a/src/routes/data/handlers.ts b/src/routes/data/handlers.ts index d0182eafa..1cafa28a4 100644 --- a/src/routes/data/handlers.ts +++ b/src/routes/data/handlers.ts @@ -28,6 +28,7 @@ import { isValidTxId } from '../../lib/validation.js'; import { TxMetadataResolver } from '../../data/tx-metadata-resolver.js'; import { DataBlockListValidator, + ByHashData, ByHashDataSource, ContiguousData, ContiguousDataAttributes, @@ -1478,23 +1479,12 @@ export const createDigestDataHandler = ({ }); } - // Resolve a representative id for this digest (cheap indexed lookup) - // so the response can carry the full id-scoped, signed header set. - const byHash = - await dataAttributesSource.getDataAttributesByHash(digest); - const resolvedId = byHash?.id; - if (resolvedId !== undefined) { - span.setAttribute('data.representative_id', resolvedId); - } - - // Fire item header (tags/owner/signature) resolution early, in - // parallel with the byte fetch, exactly as the raw handler does. - const tagsPromise = - resolvedId !== undefined - ? fireItemHeaderResolution(resolvedId, dataItemMetaResolver) - : Promise.resolve(undefined); - - let data: ContiguousData; + // Fetch the bytes by content hash. A single indexed lookup inside + // getDataByHash both confirms the digest is materialized and yields a + // representative id that resolves to it (returned as data.representativeId) + // — so the response can carry the full id-scoped, signed header set + // without a second by-hash query here. 
+ let data: ByHashData; try { data = await dataSource.getDataByHash(digest); } catch (error: any) { @@ -1513,6 +1503,18 @@ export const createDigestDataHandler = ({ return; } + const resolvedId = data.representativeId; + if (resolvedId !== undefined) { + span.setAttribute('data.representative_id', resolvedId); + } + + // Resolve item headers (tags/owner/signature) for the representative + // id, mirroring the raw handler. + const tagsPromise = + resolvedId !== undefined + ? fireItemHeaderResolution(resolvedId, dataItemMetaResolver) + : Promise.resolve(undefined); + try { // Build the same attributes shape /raw uses. Prefer the // representative id's full attributes (stable flag, root tx id, diff --git a/src/types.d.ts b/src/types.d.ts index 68a806b52..e682f6acb 100644 --- a/src/types.d.ts +++ b/src/types.d.ts @@ -1010,9 +1010,6 @@ export interface ContiguousDataParent { export interface DataAttributesSource { getDataAttributes(id: string): Promise<ContiguousDataAttributes | undefined>; - getDataAttributesByHash( - hash: string, - ): Promise<DataAttributesByHash | undefined>; } export interface ContiguousDataAttributesStore extends DataAttributesSource { @@ -1148,7 +1145,17 @@ export interface ContiguousDataSource { * content store (there is no on-demand fetch by content hash). */ export interface ByHashDataSource { - getDataByHash(hash: string, region?: Region): Promise<ContiguousData>; + getDataByHash(hash: string, region?: Region): Promise<ByHashData>; +} + +/** + * {@link ContiguousData} plus the representative id that resolved the hash. + * Returned by {@link ByHashDataSource.getDataByHash} so the content-addressed + * endpoint can emit id-scoped headers without a second by-hash lookup. May be + * undefined when a hash is present in the content store with no indexed id. + */ +export interface ByHashData extends ContiguousData { + representativeId?: string; } /**