From 5109f988c0ee6d1a1d22521929daf71abba05f6a Mon Sep 17 00:00:00 2001 From: anilb Date: Tue, 30 Jun 2026 13:19:36 +0200 Subject: [PATCH 01/10] feat: add pypi packages worker Signed-off-by: anilb --- .../migrations/V1781800000__pypi_worker.sql | 9 + scripts/services/pypi-worker.yaml | 67 +++++ services/apps/packages_worker/package.json | 3 + .../apps/packages_worker/src/activities.ts | 5 + .../packages_worker/src/bin/pypi-worker.ts | 8 + .../packages_worker/src/npm/activities.ts | 3 +- .../apps/packages_worker/src/npm/proxies.ts | 31 +- services/apps/packages_worker/src/proxies.ts | 34 +++ .../src/pypi/__tests__/normalize.test.ts | 230 +++++++++++++++ .../packages_worker/src/pypi/activities.ts | 150 ++++++++++ .../packages_worker/src/pypi/fetchProject.ts | 64 ++++ .../packages_worker/src/pypi/normalize.ts | 276 ++++++++++++++++++ .../apps/packages_worker/src/pypi/proxies.ts | 14 + .../apps/packages_worker/src/pypi/schedule.ts | 40 +++ .../apps/packages_worker/src/pypi/types.ts | 45 +++ .../packages_worker/src/pypi/upsertProject.ts | 148 ++++++++++ .../packages_worker/src/pypi/workflows.ts | 43 +++ .../packages_worker/src/workflows/index.ts | 1 + .../data-access-layer/src/packages/index.ts | 1 + .../src/packages/maintainers.ts | 11 +- .../src/packages/packages.ts | 97 ++++++ .../src/packages/pypiPackageState.ts | 57 ++++ .../src/packages/versions.ts | 71 +++++ 23 files changed, 1373 insertions(+), 35 deletions(-) create mode 100644 backend/src/osspckgs/migrations/V1781800000__pypi_worker.sql create mode 100644 scripts/services/pypi-worker.yaml create mode 100644 services/apps/packages_worker/src/bin/pypi-worker.ts create mode 100644 services/apps/packages_worker/src/proxies.ts create mode 100644 services/apps/packages_worker/src/pypi/__tests__/normalize.test.ts create mode 100644 services/apps/packages_worker/src/pypi/activities.ts create mode 100644 services/apps/packages_worker/src/pypi/fetchProject.ts create mode 100644 
services/apps/packages_worker/src/pypi/normalize.ts create mode 100644 services/apps/packages_worker/src/pypi/proxies.ts create mode 100644 services/apps/packages_worker/src/pypi/schedule.ts create mode 100644 services/apps/packages_worker/src/pypi/types.ts create mode 100644 services/apps/packages_worker/src/pypi/upsertProject.ts create mode 100644 services/apps/packages_worker/src/pypi/workflows.ts create mode 100644 services/libs/data-access-layer/src/packages/pypiPackageState.ts diff --git a/backend/src/osspckgs/migrations/V1781800000__pypi_worker.sql b/backend/src/osspckgs/migrations/V1781800000__pypi_worker.sql new file mode 100644 index 0000000000..94d4686c24 --- /dev/null +++ b/backend/src/osspckgs/migrations/V1781800000__pypi_worker.sql @@ -0,0 +1,9 @@ + +CREATE TABLE pypi_package_state ( + purl text PRIMARY KEY, + metadata_first_scanned_at timestamptz NOT NULL DEFAULT now(), + metadata_last_run_at timestamptz, + metadata_run_result jsonb -- { status, attempts, httpStatus?, errorKind?, message? 
} +); + +CREATE INDEX ON pypi_package_state (metadata_last_run_at); diff --git a/scripts/services/pypi-worker.yaml b/scripts/services/pypi-worker.yaml new file mode 100644 index 0000000000..07abfdd92c --- /dev/null +++ b/scripts/services/pypi-worker.yaml @@ -0,0 +1,67 @@ +version: '3.1' + +x-env-args: &env-args + DOCKER_BUILDKIT: 1 + NODE_ENV: docker + SERVICE: pypi-worker + CROWD_TEMPORAL_TASKQUEUE: pypi-worker + CROWD_TEMPORAL_NAMESPACE: ${CROWD_PACKAGES_TEMPORAL_NAMESPACE} + SHELL: /bin/sh + SUPPRESS_NO_CONFIG_WARNING: 'true' + +services: + pypi-worker: + build: + context: ../../ + dockerfile: ./scripts/services/docker/Dockerfile.packages + command: 'pnpm run start:pypi-worker' + working_dir: /usr/crowd/app/services/apps/packages_worker + env_file: + - ../../backend/.env.dist.local + - ../../backend/.env.dist.composed + - ../../backend/.env.override.local + - ../../backend/.env.override.composed + environment: + <<: *env-args + restart: always + networks: + - crowd-bridge + + pypi-worker-dev: + build: + context: ../../ + dockerfile: ./scripts/services/docker/Dockerfile.packages + command: 'pnpm run dev:pypi-worker' + working_dir: /usr/crowd/app/services/apps/packages_worker + # user: '${USER_ID}:${GROUP_ID}' + env_file: + - ../../backend/.env.dist.local + - ../../backend/.env.dist.composed + - ../../backend/.env.override.local + - ../../backend/.env.override.composed + environment: + <<: *env-args + hostname: pypi-worker + networks: + - crowd-bridge + volumes: + - ../../services/libs/audit-logs/src:/usr/crowd/app/services/libs/audit-logs/src + - ../../services/libs/common/src:/usr/crowd/app/services/libs/common/src + - ../../services/libs/common_services/src:/usr/crowd/app/services/libs/common_services/src + - ../../services/libs/data-access-layer/src:/usr/crowd/app/services/libs/data-access-layer/src + - ../../services/libs/database/src:/usr/crowd/app/services/libs/database/src + - 
../../services/libs/integrations/src:/usr/crowd/app/services/libs/integrations/src + - ../../services/libs/logging/src:/usr/crowd/app/services/libs/logging/src + - ../../services/libs/nango/src:/usr/crowd/app/services/libs/nango/src + - ../../services/libs/opensearch/src:/usr/crowd/app/services/libs/opensearch/src + - ../../services/libs/queue/src:/usr/crowd/app/services/libs/queue/src + - ../../services/libs/redis/src:/usr/crowd/app/services/libs/redis/src + - ../../services/libs/snowflake/src:/usr/crowd/app/services/libs/snowflake/src + - ../../services/libs/telemetry/src:/usr/crowd/app/services/libs/telemetry/src + - ../../services/libs/temporal/src:/usr/crowd/app/services/libs/temporal/src + - ../../services/libs/types/src:/usr/crowd/app/services/libs/types/src + - ../../services/apps/packages_worker/src:/usr/crowd/app/services/apps/packages_worker/src + +networks: + crowd-bridge: + external: true diff --git a/services/apps/packages_worker/package.json b/services/apps/packages_worker/package.json index d49e5c2f34..ac3db89634 100644 --- a/services/apps/packages_worker/package.json +++ b/services/apps/packages_worker/package.json @@ -26,6 +26,9 @@ "start:npm-worker": "CROWD_TEMPORAL_TASKQUEUE=npm-worker SERVICE=npm-worker tsx src/bin/npm-worker.ts", "dev:npm-worker": "CROWD_TEMPORAL_TASKQUEUE=npm-worker SERVICE=npm-worker LOG_LEVEL=trace nodemon --watch src --watch ../../libs --ext ts --exec tsx --inspect=0.0.0.0:9236 src/bin/npm-worker.ts", "dev:npm-worker:local": "set -a && . ../../../backend/.env.dist.local && . 
../../../backend/.env.override.local && set +a && CROWD_TEMPORAL_TASKQUEUE=npm-worker SERVICE=npm-worker LOG_LEVEL=trace nodemon --watch src --watch ../../libs --ext ts --exec tsx --inspect=0.0.0.0:9236 src/bin/npm-worker.ts", + "start:pypi-worker": "CROWD_TEMPORAL_TASKQUEUE=pypi-worker SERVICE=pypi-worker tsx src/bin/pypi-worker.ts", + "dev:pypi-worker": "CROWD_TEMPORAL_TASKQUEUE=pypi-worker CROWD_TEMPORAL_NAMESPACE=$CROWD_PACKAGES_TEMPORAL_NAMESPACE SERVICE=pypi-worker LOG_LEVEL=trace nodemon --watch src --watch ../../libs --ext ts --exec tsx --inspect=0.0.0.0:9242 src/bin/pypi-worker.ts", + "dev:pypi-worker:local": "set -a && . ../../../backend/.env.dist.local && . ../../../backend/.env.override.local && set +a && CROWD_TEMPORAL_TASKQUEUE=pypi-worker SERVICE=pypi-worker LOG_LEVEL=trace nodemon --watch src --watch ../../libs --ext ts --exec tsx --inspect=0.0.0.0:9242 src/bin/pypi-worker.ts", "start:osv-worker": "CROWD_TEMPORAL_TASKQUEUE=osv-worker SERVICE=osv-worker tsx src/bin/osv-worker.ts", "dev:osv-worker": "CROWD_TEMPORAL_TASKQUEUE=osv-worker SERVICE=osv-worker LOG_LEVEL=trace nodemon --watch src --watch ../../libs --ext ts --exec tsx --inspect=0.0.0.0:9238 src/bin/osv-worker.ts", "dev:osv-worker:local": "set -a && . ../../../backend/.env.dist.local && . 
../../../backend/.env.override.local && set +a && CROWD_TEMPORAL_TASKQUEUE=osv-worker SERVICE=osv-worker LOG_LEVEL=trace nodemon --watch src --watch ../../libs --ext ts --exec tsx --inspect=0.0.0.0:9238 src/bin/osv-worker.ts", diff --git a/services/apps/packages_worker/src/activities.ts b/services/apps/packages_worker/src/activities.ts index ebb4594484..078b8bac0f 100644 --- a/services/apps/packages_worker/src/activities.ts +++ b/services/apps/packages_worker/src/activities.ts @@ -25,3 +25,8 @@ export { cargoCleanup, } from './cargo/activities' export { enrichGoVersionsBatch, enrichGoStatusBatch } from './go/activities' +export { + getUnscannedPypiBatch, + ingestPypiPackageBatch, + pypiStopAfterFirstPage, +} from './pypi/activities' diff --git a/services/apps/packages_worker/src/bin/pypi-worker.ts b/services/apps/packages_worker/src/bin/pypi-worker.ts new file mode 100644 index 0000000000..bb2fd45655 --- /dev/null +++ b/services/apps/packages_worker/src/bin/pypi-worker.ts @@ -0,0 +1,8 @@ +import { schedulePypiIngest } from '../pypi/schedule' +import { svc } from '../service' + +setImmediate(async () => { + await svc.init() + await schedulePypiIngest() + await svc.start() +}) diff --git a/services/apps/packages_worker/src/npm/activities.ts b/services/apps/packages_worker/src/npm/activities.ts index 1fa7c52645..dabc8b3fdd 100644 --- a/services/apps/packages_worker/src/npm/activities.ts +++ b/services/apps/packages_worker/src/npm/activities.ts @@ -25,6 +25,7 @@ import type { QueryExecutor } from '@crowd/data-access-layer/src/queryExecutor' import { getServiceChildLogger } from '@crowd/logging' import { getPackagesDb } from '../db' +import { proxyUrl } from '../proxies' import { NPM_EARLIEST, computeChunks } from './downloadGaps' import { fetchChangesSince, fetchCurrentSeq } from './fetchChanges' @@ -35,7 +36,7 @@ import { } from './fetchDownloads' import { fetchPackument } from './fetchPackument' import { Last30dWindow, computeMissingLast30dWindows } from 
'./last30dGaps' -import { laneCount, proxyForLane, proxyUrl } from './proxies' +import { laneCount, proxyForLane } from './proxies' import { isFetchError } from './types' import { upsertPackage } from './upsertPackage' diff --git a/services/apps/packages_worker/src/npm/proxies.ts b/services/apps/packages_worker/src/npm/proxies.ts index 72ff2dee34..798af93c4a 100644 --- a/services/apps/packages_worker/src/npm/proxies.ts +++ b/services/apps/packages_worker/src/npm/proxies.ts @@ -1,33 +1,6 @@ -export interface ProxyEndpoint { - host: string - port: string - username: string - password: string -} - -export function parseProxies(): ProxyEndpoint[] { - const raw = process.env.CROWD_PACKAGES_PROXIES - if (!raw) return [] - return raw - .split(',') - .map((entry) => entry.trim()) - .filter((entry) => entry.length > 0) - .map((entry) => { - const [host, port, username, password] = entry.split(':') - return { host, port, username, password } - }) - .filter((p) => p.host && p.port && p.username && p.password) -} - -export function proxyCount(): number { - return parseProxies().length -} - -export function proxyUrl(p: ProxyEndpoint): string { - return `http://${p.username}:${p.password}@${p.host}:${p.port}` -} +import { parseProxies, proxyCount, type ProxyEndpoint } from '../proxies' -// Global kill-switch for the proxy layer. When off (the default), every npm worker +// Global kill-switch for the npm proxy layer. When off (the default), every npm worker // runs a single direct lane (no ProxyAgent) — see laneCount/proxyForLane. export function proxiesEnabled(): boolean { const raw = (process.env.CROWD_PACKAGES_PROXIES_ENABLED ?? 
'').trim().toLowerCase() diff --git a/services/apps/packages_worker/src/proxies.ts b/services/apps/packages_worker/src/proxies.ts new file mode 100644 index 0000000000..cb6cb89641 --- /dev/null +++ b/services/apps/packages_worker/src/proxies.ts @@ -0,0 +1,34 @@ +// Generic proxy primitives shared across the workers +// The proxy list is shared via CROWD_PACKAGES_PROXIES; each sub-worker owns its own enable +// flag and lane logic (see e.g. npm/proxies.ts, pypi/proxies.ts). + +export interface ProxyEndpoint { + host: string + port: string + username: string + password: string +} + +// Parse the shared CROWD_PACKAGES_PROXIES list ("host:port:user:pass,host:port:user:pass"). +// Malformed entries (missing any field) are dropped. +export function parseProxies(): ProxyEndpoint[] { + const raw = process.env.CROWD_PACKAGES_PROXIES + if (!raw) return [] + return raw + .split(',') + .map((entry) => entry.trim()) + .filter((entry) => entry.length > 0) + .map((entry) => { + const [host, port, username, password] = entry.split(':') + return { host, port, username, password } + }) + .filter((p) => p.host && p.port && p.username && p.password) +} + +export function proxyCount(): number { + return parseProxies().length +} + +export function proxyUrl(p: ProxyEndpoint): string { + return `http://${p.username}:${p.password}@${p.host}:${p.port}` +} diff --git a/services/apps/packages_worker/src/pypi/__tests__/normalize.test.ts b/services/apps/packages_worker/src/pypi/__tests__/normalize.test.ts new file mode 100644 index 0000000000..42e496f8a4 --- /dev/null +++ b/services/apps/packages_worker/src/pypi/__tests__/normalize.test.ts @@ -0,0 +1,230 @@ +import { describe, expect, it } from 'vitest' + +import { + classifyProjectUrls, + collectPypiMaintainers, + isPypiPrerelease, + parseKeywords, + pypiNameFromPurl, + resolvePypiLicenses, + stripNullBytesDeep, +} from '../normalize' +import type { PyPiInfo } from '../types' + +function info(partial: Partial): PyPiInfo { + return { name: 
'demo', ...partial } +} + +describe('pypiNameFromPurl', () => { + it('strips the pkg:pypi/ prefix', () => { + expect(pypiNameFromPurl('pkg:pypi/flask')).toBe('flask') + }) + + it('decodes percent-encoded segments', () => { + expect(pypiNameFromPurl('pkg:pypi/zope.interface')).toBe('zope.interface') + expect(pypiNameFromPurl('pkg:pypi/ruamel.yaml%2Bclib')).toBe('ruamel.yaml+clib') + }) +}) + +describe('stripNullBytesDeep', () => { + it('removes NUL bytes from nested strings', () => { + const nul = String.fromCharCode(0) + const v = stripNullBytesDeep({ a: `x${nul}y`, b: [`p${nul}`, 'q'] }) + expect(v.a).toBe('xy') + expect(v.b).toEqual(['p', 'q']) + }) +}) + +describe('resolvePypiLicenses', () => { + it('prefers the SPDX license_expression', () => { + const r = resolvePypiLicenses( + info({ license_expression: 'MIT OR Apache-2.0', license: 'ignored', classifiers: [] }), + ) + expect(r.licenses).toEqual(['MIT', 'Apache-2.0']) + expect(r.licensesRaw).toBe('MIT OR Apache-2.0') + }) + + it('falls back to License :: classifiers', () => { + const r = resolvePypiLicenses( + info({ + classifiers: [ + 'Development Status :: 5 - Production/Stable', + 'License :: OSI Approved :: BSD License', + ], + }), + ) + expect(r.licenses).toEqual(['BSD-3-Clause']) + }) + + it('uses the classifier leaf for unmapped licenses', () => { + const r = resolvePypiLicenses( + info({ classifiers: ['License :: OSI Approved :: Eclipse Public License 1.0 (EPL-1.0)'] }), + ) + expect(r.licenses).toEqual(['Eclipse Public License 1.0 (EPL-1.0)']) + }) + + it('ignores grouping-only classifier nodes', () => { + const r = resolvePypiLicenses(info({ classifiers: ['License :: OSI Approved'] })) + expect(r.licenses).toEqual([]) + }) + + it('uses a short license string when no expression/classifier', () => { + const r = resolvePypiLicenses(info({ license: 'MIT' })) + expect(r.licenses).toEqual(['MIT']) + expect(r.licensesRaw).toBe('MIT') + }) + + it('keeps long license text out of the licenses array', () => { 
+ const text = 'Permission is hereby granted, free of charge, to any person...'.repeat(5) + const r = resolvePypiLicenses(info({ license: text })) + expect(r.licenses).toEqual([]) + expect(r.licensesRaw).toBe(text) + }) +}) + +describe('isPypiPrerelease', () => { + it('returns false for stable releases', () => { + expect(isPypiPrerelease('1.0.0')).toBe(false) + expect(isPypiPrerelease('2023.9.1')).toBe(false) + expect(isPypiPrerelease('3.1.3')).toBe(false) + }) + + it('detects alpha/beta/rc markers', () => { + expect(isPypiPrerelease('1.0a1')).toBe(true) + expect(isPypiPrerelease('1.0b2')).toBe(true) + expect(isPypiPrerelease('1.0rc1')).toBe(true) + expect(isPypiPrerelease('1.0c3')).toBe(true) + expect(isPypiPrerelease('1.0.0alpha')).toBe(true) + expect(isPypiPrerelease('2.0.0beta1')).toBe(true) + }) + + it('detects dev releases', () => { + expect(isPypiPrerelease('1.0.dev0')).toBe(true) + expect(isPypiPrerelease('1.0.0.dev3')).toBe(true) + expect(isPypiPrerelease('1.0a1.dev1')).toBe(true) + }) + + it('does NOT treat post-releases as prereleases', () => { + expect(isPypiPrerelease('1.0.post1')).toBe(false) + expect(isPypiPrerelease('1.0.0.post2')).toBe(false) + }) + + it('ignores local and epoch parts', () => { + expect(isPypiPrerelease('1.0.0+ubuntu1')).toBe(false) + expect(isPypiPrerelease('1!2.0.0')).toBe(false) + expect(isPypiPrerelease('1!2.0rc1')).toBe(true) + }) +}) + +describe('collectPypiMaintainers', () => { + it('parses "Name " from *_email', () => { + const people = collectPypiMaintainers( + info({ maintainer_email: 'Pallets ' }), + ) + expect(people).toEqual([ + { + username: 'Pallets', + displayName: 'Pallets', + email: 'contact@palletsprojects.com', + role: 'maintainer', + }, + ]) + }) + + it('pairs separate name and email fields', () => { + const people = collectPypiMaintainers( + info({ author: 'Kenneth Reitz', author_email: 'me@kennethreitz.org' }), + ) + expect(people).toEqual([ + { + username: 'Kenneth Reitz', + displayName: 'Kenneth Reitz', + 
email: 'me@kennethreitz.org', + role: 'author', + }, + ]) + }) + + it('splits comma-separated people', () => { + const people = collectPypiMaintainers( + info({ author_email: 'A One , B Two ' }), + ) + expect(people.map((p) => p.username)).toEqual(['A One', 'B Two']) + }) + + it('uses email as username when no name is present', () => { + const people = collectPypiMaintainers(info({ author_email: 'solo@x.com' })) + expect(people).toEqual([ + { username: 'solo@x.com', displayName: null, email: 'solo@x.com', role: 'author' }, + ]) + }) + + it('returns nothing when author/maintainer are absent', () => { + expect(collectPypiMaintainers(info({}))).toEqual([]) + }) + + it('lets the author role win on a username collision', () => { + const people = collectPypiMaintainers( + info({ maintainer: 'Same Person', author: 'Same Person' }), + ) + expect(people).toEqual([ + { username: 'Same Person', displayName: 'Same Person', email: null, role: 'author' }, + ]) + }) +}) + +describe('classifyProjectUrls', () => { + it('picks homepage, repo, and funding from project_urls', () => { + const r = classifyProjectUrls( + { + Homepage: 'https://flask.palletsprojects.com/', + Source: 'https://github.com/pallets/flask/', + Donate: 'https://palletsprojects.com/donate', + }, + null, + ) + expect(r.homepage).toBe('https://flask.palletsprojects.com/') + expect(r.declaredRepositoryUrl).toBe('https://github.com/pallets/flask/') + expect(r.fundingLinks).toEqual([{ type: 'other', url: 'https://palletsprojects.com/donate' }]) + }) + + it('prefers info.home_page for homepage', () => { + const r = classifyProjectUrls({ Homepage: 'https://example.org' }, 'https://primary.example') + expect(r.homepage).toBe('https://primary.example') + }) + + it('falls back to a repo-looking homepage when no explicit repo key', () => { + const r = classifyProjectUrls({ Homepage: 'https://github.com/psf/requests' }, null) + expect(r.declaredRepositoryUrl).toBe('https://github.com/psf/requests') + }) + + it('infers 
funding type from the host', () => { + const r = classifyProjectUrls( + { + Funding: 'https://github.com/sponsors/foo', + Sponsor: 'https://opencollective.com/bar', + }, + null, + ) + expect(r.fundingLinks).toEqual([ + { type: 'github', url: 'https://github.com/sponsors/foo' }, + { type: 'opencollective', url: 'https://opencollective.com/bar' }, + ]) + }) + + it('returns nulls/empties when there are no urls', () => { + const r = classifyProjectUrls(null, null) + expect(r).toEqual({ homepage: null, declaredRepositoryUrl: null, fundingLinks: [] }) + }) +}) + +describe('parseKeywords', () => { + it('splits on commas and whitespace and dedupes', () => { + expect(parseKeywords('web, async, http web')).toEqual(['web', 'async', 'http']) + }) + + it('returns [] for empty/missing input', () => { + expect(parseKeywords(null)).toEqual([]) + expect(parseKeywords(' ')).toEqual([]) + }) +}) diff --git a/services/apps/packages_worker/src/pypi/activities.ts b/services/apps/packages_worker/src/pypi/activities.ts new file mode 100644 index 0000000000..9340b1d95d --- /dev/null +++ b/services/apps/packages_worker/src/pypi/activities.ts @@ -0,0 +1,150 @@ +import { ProxyAgent, type Dispatcher } from 'undici' + +import { + getUnscannedPypiPurls, + logAuditFieldChanges, + markPypiPackageScanned, +} from '@crowd/data-access-layer/src/packages' +import type { QueryExecutor } from '@crowd/data-access-layer/src/queryExecutor' +import { getServiceChildLogger } from '@crowd/logging' + +import { getPackagesDb } from '../db' +import { proxyUrl } from '../proxies' + +import { fetchProject } from './fetchProject' +import { pypiNameFromPurl } from './normalize' +import { pypiProxyPool } from './proxies' +import { isFetchError } from './types' +import { upsertProject } from './upsertProject' + +const log = getServiceChildLogger('pypi') + +const WORKER = 'pypi' + +// 4xx (404 or any other client error like a malformed/illegal project name that +// leaked into `packages`). 
429 is excluded — it's transient and rides the slow path. +function isClientError(code: number | undefined, kind: string): boolean { + return kind === 'NOT_FOUND' || (code !== undefined && code >= 400 && code < 500 && code !== 429) +} + +// 4xx/malformed get a few quick in-lane retries with a small linear backoff, then the +// package is given up on and marked scanned. 429/5xx/network throw and ride Temporal's +// exponential activity-retry instead. +const INGEST_4XX_ATTEMPTS = 3 +const INGEST_4XX_BACKOFF_MS = 1000 + +// Per-package throttle. PyPI is Fastly-backed and tolerates pip-scale traffic, so a +// modest sleep keeps the lane polite (optional proxy fan-out is off by default). +function ingestSleepMs(): number { + const n = parseInt(process.env.CROWD_PACKAGES_PYPI_INGEST_SLEEP_MS ?? '300', 10) + return Number.isFinite(n) && n >= 0 ? n : 300 +} + +// Re-enrich a critical package whose metadata is older than this many days. PyPI has +// no _changes-style feed, so freshness is staleness-driven. +function refreshDays(): number { + const n = parseInt(process.env.CROWD_PACKAGES_PYPI_REFRESH_DAYS ?? '14', 10) + return Number.isFinite(n) && n > 0 ? n : 14 +} + +// Scope of the metadata sweep. Defaults to true (is_critical packages only — the +// intended steady state). Set CROWD_PACKAGES_PYPI_RUN_ONLY_FOR_CRITICAL=false to enrich +// every PyPI package (temporary, e.g. while criticality is still being populated). +function runOnlyForCritical(): boolean { + const raw = (process.env.CROWD_PACKAGES_PYPI_RUN_ONLY_FOR_CRITICAL ?? 'true').trim().toLowerCase() + return !(raw === 'false' || raw === '0' || raw === 'no') +} + +// Debug/test switch (CROWD_PACKAGES_PYPI_STOP_AFTER_FIRST_PAGE): when true, the workflow +// processes a single page and returns without continueAsNew +export async function pypiStopAfterFirstPage(): Promise { + const raw = (process.env.CROWD_PACKAGES_PYPI_STOP_AFTER_FIRST_PAGE ?? 
'false') + .trim() + .toLowerCase() + return raw === 'true' || raw === '1' || raw === 'yes' +} + +function sleep(ms: number): Promise { + return new Promise((resolve) => setTimeout(resolve, ms)) +} + +// Fully enrich a single package. `purl` is the source-of-truth identifier from the +// packages row; the PyPI project name (for the HTTP fetch) is derived from it. +async function ingestOne( + qx: QueryExecutor, + purl: string, + dispatcher?: Dispatcher, +): Promise { + const name = pypiNameFromPurl(purl) + + for (let attempt = 1; attempt <= INGEST_4XX_ATTEMPTS; attempt++) { + const result = await fetchProject(name, dispatcher) + + if (!isFetchError(result)) { + const { changedFields } = await upsertProject(qx, result, purl) + await logAuditFieldChanges(qx, WORKER, purl, changedFields) + await markPypiPackageScanned(qx, purl, { status: 'success', attempts: attempt }) + return + } + + if (!isClientError(result.statusCode, result.kind) && result.kind !== 'MALFORMED') { + throw new Error(`Failed to fetch PyPI project ${name}: ${result.message}`) + } + + if (attempt < INGEST_4XX_ATTEMPTS) { + await sleep(attempt * INGEST_4XX_BACKOFF_MS) + continue + } + log.warn( + { purl, statusCode: result.statusCode, kind: result.kind }, + 'pypi project 4xx/malformed after fast retries — marking scanned and skipping', + ) + await markPypiPackageScanned(qx, purl, { + status: 'error', + attempts: INGEST_4XX_ATTEMPTS, + httpStatus: result.statusCode, + errorKind: result.kind, + message: result.message, + }) + } +} + +export async function getUnscannedPypiBatch( + afterPurl: string, + batchSize: number, +): Promise<{ purls: string[]; nextCursor: string }> { + const qx = await getPackagesDb() + const purls = await getUnscannedPypiPurls( + qx, + afterPurl, + batchSize, + refreshDays(), + runOnlyForCritical(), + ) + return { purls, nextCursor: purls.length ? 
purls[purls.length - 1] : afterPurl } +} + +// Enrich a batch of PyPI packages sequentially, throttled to stay polite to the +// registry. 4xx packages are skipped inside ingestOne; a transient (429/5xx/network) +// error throws out of here so Temporal retries the batch with exponential backoff. +export async function ingestPypiPackageBatch(purls: string[]): Promise { + if (purls.length === 0) return + const qx = await getPackagesDb() + + // Optional proxy layer (off by default). With a single lane, rotate across the + // configured proxy pool per package so traffic spreads over all IPs; when disabled the + // pool is empty and `dispatcher` stays undefined (direct egress). One ProxyAgent per + // proxy, reused for the whole batch and closed at the end. + const agents = pypiProxyPool().map((p) => new ProxyAgent(proxyUrl(p))) + try { + let i = 0 + for (const purl of purls) { + await sleep(ingestSleepMs()) + const dispatcher = agents.length ? agents[i++ % agents.length] : undefined + await ingestOne(qx, purl, dispatcher) + } + } finally { + await Promise.all(agents.map((a) => a.close())) + } + log.info({ count: purls.length, proxied: agents.length }, 'Ingested PyPI package batch') +} diff --git a/services/apps/packages_worker/src/pypi/fetchProject.ts b/services/apps/packages_worker/src/pypi/fetchProject.ts new file mode 100644 index 0000000000..97c0aab2e5 --- /dev/null +++ b/services/apps/packages_worker/src/pypi/fetchProject.ts @@ -0,0 +1,64 @@ +import type { Dispatcher } from 'undici' + +import type { FetchError, PyPiProject } from './types' + +const REGISTRY = 'https://pypi.org/pypi' +const USER_AGENT = 'lfx-packages-worker/0.1 (+https://lfx.linuxfoundation.org)' + +// Fetch a project's metadata from the PyPI JSON API. +// Errors are classified by kind, which determines whether they are retried: +// 404 → NOT_FOUND (skip) +// 429 → RATE_LIMIT and 5xx/network → TRANSIENT (Temporal retries) +// malformed body → MALFORMED (skip). 
+// `dispatcher` routes the request through a proxy IP when the +// optional PyPI proxy layer is enabled; undefined means direct egress. +export async function fetchProject( + name: string, + dispatcher?: Dispatcher, +): Promise { + const url = `${REGISTRY}/${encodeURIComponent(name)}/json` + const abort = new AbortController() + const timer = setTimeout(() => abort.abort(), 30_000) + let res: Response + try { + // `dispatcher` is an undici-specific fetch option not present in the DOM RequestInit type. + const init: RequestInit & { dispatcher?: Dispatcher } = { + headers: { + Accept: 'application/json', + 'User-Agent': USER_AGENT, + }, + signal: abort.signal, + } + if (dispatcher) init.dispatcher = dispatcher + res = await fetch(url, init as RequestInit) + } catch (err) { + return { kind: 'TRANSIENT', message: String(err) } + } finally { + clearTimeout(timer) + } + + if (res.status === 404) + return { kind: 'NOT_FOUND', message: `${name} not found`, statusCode: 404 } + if (res.status === 429) return { kind: 'RATE_LIMIT', message: 'rate limited', statusCode: 429 } + if (!res.ok) return { kind: 'TRANSIENT', message: `HTTP ${res.status}`, statusCode: res.status } + + let json: unknown + try { + json = await res.json() + } catch { + return { kind: 'MALFORMED', message: 'invalid JSON' } + } + + if (!isPyPiProject(json)) return { kind: 'MALFORMED', message: 'unexpected shape' } + return json +} + +function isPyPiProject(v: unknown): v is PyPiProject { + if (typeof v !== 'object' || v === null || !('info' in v)) return false + const info = (v as { info: unknown }).info + return ( + typeof info === 'object' && + info !== null && + typeof (info as { name?: unknown }).name === 'string' + ) +} diff --git a/services/apps/packages_worker/src/pypi/normalize.ts b/services/apps/packages_worker/src/pypi/normalize.ts new file mode 100644 index 0000000000..c82709d1f2 --- /dev/null +++ b/services/apps/packages_worker/src/pypi/normalize.ts @@ -0,0 +1,276 @@ +import type { PyPiInfo } 
from './types'
+
+const PURL_PYPI_PREFIX = 'pkg:pypi/'
+
+// Postgres text columns cannot store NUL (U+0000). Built at runtime so there is no
+// NUL literal in the source.
+const NUL_GLOBAL = new RegExp(String.fromCharCode(0), 'g')
+
+// The PyPI project name from a purl. PyPI purls are `pkg:pypi/<name>` (no namespace);
+// the name segment is percent-encoded per the purl spec, so decode it to get the
+// registry name used by the JSON API.
+export function pypiNameFromPurl(purl: string): string {
+  return decodeURIComponent(purl.slice(PURL_PYPI_PREFIX.length))
+}
+
+// Strip NUL bytes in place from every string before persisting — otherwise the
+// inlined value breaks the PostgreSQL wire protocol ("invalid message format").
+export function stripNullBytesDeep<T>(value: T): T {
+  if (typeof value === 'string') {
+    return value.replace(NUL_GLOBAL, '') as T
+  }
+  if (Array.isArray(value)) {
+    for (let i = 0; i < value.length; i++) value[i] = stripNullBytesDeep(value[i])
+    return value
+  }
+  if (value !== null && typeof value === 'object') {
+    const obj = value as Record<string, unknown>
+    for (const k of Object.keys(obj)) obj[k] = stripNullBytesDeep(obj[k])
+    return value
+  }
+  return value
+}
+
+function blankToNull(s: string | null | undefined): string | null {
+  if (s == null) return null
+  const t = s.trim()
+  return t || null
+}
+
+function dedup(arr: string[]): string[] {
+  return [...new Set(arr)]
+}
+
+// Split an SPDX-ish expression ("MIT", "MIT OR Apache-2.0", "(MIT AND BSD-3-Clause)")
+// into individual license tokens.
+function splitSpdx(raw: string): string[] {
+  return raw
+    .split(/\s+(?:OR|AND|WITH)\s+/i)
+    .map((s) => s.replace(/[()]/g, '').trim())
+    .filter(Boolean)
+}
+
+// Common trove "License :: ..." classifiers → SPDX identifiers.
+const TROVE_TO_SPDX: Record = { + 'License :: OSI Approved :: MIT License': 'MIT', + 'License :: OSI Approved :: MIT No Attribution License (MIT-0)': 'MIT-0', + 'License :: OSI Approved :: Apache Software License': 'Apache-2.0', + 'License :: OSI Approved :: BSD License': 'BSD-3-Clause', + 'License :: OSI Approved :: ISC License (ISCL)': 'ISC', + 'License :: OSI Approved :: GNU General Public License v2 (GPLv2)': 'GPL-2.0-only', + 'License :: OSI Approved :: GNU General Public License v2 or later (GPLv2+)': 'GPL-2.0-or-later', + 'License :: OSI Approved :: GNU General Public License v3 (GPLv3)': 'GPL-3.0-only', + 'License :: OSI Approved :: GNU General Public License v3 or later (GPLv3+)': 'GPL-3.0-or-later', + 'License :: OSI Approved :: GNU Lesser General Public License v2 (LGPLv2)': 'LGPL-2.0-only', + 'License :: OSI Approved :: GNU Lesser General Public License v2 or later (LGPLv2+)': + 'LGPL-2.1-or-later', + 'License :: OSI Approved :: GNU Lesser General Public License v3 (LGPLv3)': 'LGPL-3.0-only', + 'License :: OSI Approved :: GNU Lesser General Public License v3 or later (LGPLv3+)': + 'LGPL-3.0-or-later', + 'License :: OSI Approved :: GNU Affero General Public License v3': 'AGPL-3.0-only', + 'License :: OSI Approved :: GNU Affero General Public License v3 or later (AGPLv3+)': + 'AGPL-3.0-or-later', + 'License :: OSI Approved :: Mozilla Public License 2.0 (MPL 2.0)': 'MPL-2.0', + 'License :: OSI Approved :: Python Software Foundation License': 'PSF-2.0', + 'License :: OSI Approved :: The Unlicense (Unlicense)': 'Unlicense', + 'License :: OSI Approved :: zlib/libpng License': 'Zlib', + 'License :: OSI Approved :: Boost Software License 1.0 (BSL-1.0)': 'BSL-1.0', + 'License :: CC0 1.0 Universal (CC0 1.0) Public Domain Dedication': 'CC0-1.0', +} + +// Grouping/ambiguous classifier nodes that carry no usable license identifier. 
+const TROVE_IGNORE = new Set([ + 'License :: OSI Approved', + 'License :: Other/Proprietary License', + 'License :: Public Domain', + 'License :: Freely Distributable', + 'License :: Freeware', + 'License :: DFSG approved', +]) + +function licensesFromClassifiers(classifiers: string[]): string[] { + const out: string[] = [] + for (const raw of classifiers) { + const c = raw.trim() + if (!c.startsWith('License ::')) continue + if (TROVE_IGNORE.has(c)) continue + const mapped = TROVE_TO_SPDX[c] + if (mapped) { + out.push(mapped) + continue + } + // Fallback: classifier leaf (after the last "::"), stripping a trailing "License". + const leaf = c.split('::').pop()?.trim() + if (leaf) out.push(leaf.replace(/\s+License$/i, '').trim() || leaf) + } + return out +} + +function isShortLicenseToken(s: string): boolean { + return !s.includes('\n') && s.length <= 60 +} + +// PyPI license priority: PEP 639 SPDX expression → "License ::" classifiers → a short +// `license` token. A long/multiline `license` value is full license text (not an +// identifier), so it only feeds licenses_raw. Returns SPDX-ish tokens for the +// `licenses` array plus the original string for `licenses_raw`. +export function resolvePypiLicenses(info: PyPiInfo): { + licenses: string[] + licensesRaw: string | null +} { + const expr = blankToNull(info.license_expression) + const licenseStr = blankToNull(info.license) + const licensesRaw = expr ?? licenseStr + + if (expr) return { licenses: dedup(splitSpdx(expr)), licensesRaw } + + const fromClassifiers = licensesFromClassifiers(info.classifiers ?? []) + if (fromClassifiers.length) return { licenses: dedup(fromClassifiers), licensesRaw } + + if (licenseStr && isShortLicenseToken(licenseStr)) { + return { licenses: dedup(splitSpdx(licenseStr)), licensesRaw } + } + return { licenses: [], licensesRaw } +} + +// PEP 440 pre-release / dev-release detection. True when the version carries an +// alpha/beta/rc marker (a, b, c/rc, or spelled out) or a dev segment. 
Post-releases +// (".postN") are NOT prereleases. Local ("+...") and epoch ("N!") parts are ignored. +export function isPypiPrerelease(version: string): boolean { + const v = version.trim().toLowerCase().split('+')[0].replace(/^\d+!/, '') + if (/(?:^|[\d._-])dev[._-]?\d*$/.test(v)) return true + if (/[\d._-](?:alpha|beta|preview|pre|rc|a|b|c)[._-]?\d*$/.test(v)) return true + return false +} + +export interface PypiPerson { + username: string + displayName: string | null + email: string | null + role: 'author' | 'maintainer' +} + +function parseNameEmail(s: string): { name: string | null; email: string | null } { + const trimmed = s.trim() + const m = trimmed.match(/^(.*?)\s*<([^>]+)>$/) + if (m) return { name: blankToNull(m[1]), email: blankToNull(m[2]) } + if (/^[^\s@]+@[^\s@]+$/.test(trimmed)) return { name: null, email: trimmed } + return { name: blankToNull(trimmed), email: null } +} + +function splitList(s: string): string[] { + return s + .split(',') + .map((p) => p.trim()) + .filter(Boolean) +} + +function peopleForRole( + nameField: string | null | undefined, + emailField: string | null | undefined, + role: 'author' | 'maintainer', +): PypiPerson[] { + const emailParts = emailField ? splitList(emailField) : [] + const nameParts = nameField ? splitList(nameField) : [] + const raw: Array<{ name: string | null; email: string | null }> = [] + + if (emailParts.length) { + // Modern packages put "Name " (often several, comma-separated) in *_email. + emailParts.forEach((part, i) => { + const pe = parseNameEmail(part) + raw.push({ name: pe.name ?? nameParts[i] ?? null, email: pe.email }) + }) + } else if (nameParts.length) { + nameParts.forEach((n) => raw.push({ name: blankToNull(n), email: null })) + } + + const out: PypiPerson[] = [] + for (const p of raw) { + const username = p.name ?? 
p.email + if (!username) continue + out.push({ username, displayName: p.name, email: p.email, role }) + } + return out +} + +// Build maintainers from PyPI's free-text author/maintainer fields. PyPI exposes no +// account usernames, so the parsed name (falling back to email) is the synthetic +// username. Author overwrites maintainer on a username collision +export function collectPypiMaintainers(info: PyPiInfo): PypiPerson[] { + const map = new Map() + for (const p of peopleForRole(info.maintainer, info.maintainer_email, 'maintainer')) { + map.set(p.username, p) + } + for (const p of peopleForRole(info.author, info.author_email, 'author')) { + map.set(p.username, p) + } + return [...map.values()] +} + +export interface PypiFundingLink { + type: string + url: string +} + +function inferFundingType(url: string): string { + if (/github\.com/i.test(url)) return 'github' + if (/patreon\.com/i.test(url)) return 'patreon' + if (/opencollective\.com/i.test(url)) return 'opencollective' + if (/ko-fi\.com/i.test(url)) return 'ko-fi' + if (/tidelift\.com/i.test(url)) return 'tidelift' + return 'other' +} + +export function classifyProjectUrls( + projectUrls: Record | null | undefined, + homePage: string | null | undefined, +): { + homepage: string | null + declaredRepositoryUrl: string | null + fundingLinks: PypiFundingLink[] +} { + const entries = Object.entries(projectUrls ?? {}).map( + ([k, v]) => [k.trim(), (v ?? '').trim()] as const, + ) + const findByKey = (re: RegExp): string | null => + entries.find(([k, v]) => re.test(k) && v)?.[1] ?? null + + const homepage = + blankToNull(homePage) ?? + findByKey(/^homepage$/i) ?? + findByKey(/^home[\s-]*page$/i) ?? + findByKey(/^home$/i) + + const REPO_HOST = /github\.com|gitlab\.com|bitbucket\.org/i + let declaredRepositoryUrl = + findByKey(/^source(\s*code)?$/i) ?? + findByKey(/^repository$/i) ?? + findByKey(/^repo$/i) ?? + findByKey(/^code$/i) ?? 
+ entries.find(([k, v]) => /source|repo|code|git/i.test(k) && REPO_HOST.test(v))?.[1] ?? + null + // Many projects only declare a Homepage that is itself the repo. + if (!declaredRepositoryUrl && homepage && REPO_HOST.test(homepage)) { + declaredRepositoryUrl = homepage + } + + const seen = new Set() + const fundingLinks: PypiFundingLink[] = [] + for (const [k, v] of entries) { + if (!v || !/fund|donate|sponsor/i.test(k) || seen.has(v)) continue + seen.add(v) + fundingLinks.push({ type: inferFundingType(v), url: v }) + } + + return { homepage, declaredRepositoryUrl, fundingLinks } +} + +export function parseKeywords(raw: string | null | undefined): string[] { + if (!raw) return [] + return dedup( + raw + .split(/[,\s]+/) + .map((k) => k.trim()) + .filter(Boolean), + ) +} diff --git a/services/apps/packages_worker/src/pypi/proxies.ts b/services/apps/packages_worker/src/pypi/proxies.ts new file mode 100644 index 0000000000..c460fcf27f --- /dev/null +++ b/services/apps/packages_worker/src/pypi/proxies.ts @@ -0,0 +1,14 @@ +import { parseProxies, type ProxyEndpoint } from '../proxies' + +// Off by default: when disabled the single PyPI lane egresses directly (no ProxyAgent). +// The proxy list is shared with workers via CROWD_PACKAGES_PROXIES; +// only the enable flag (CROWD_PACKAGES_PYPI_PROXIES_ENABLED) is PyPI-specific. +export function pypiProxiesEnabled(): boolean { + const raw = (process.env.CROWD_PACKAGES_PYPI_PROXIES_ENABLED ?? '').trim().toLowerCase() + return raw === 'true' || raw === '1' +} + +// The proxy endpoints the PyPI lane may rotate through, or [] when disabled/unconfigured. +export function pypiProxyPool(): ProxyEndpoint[] { + return pypiProxiesEnabled() ?
parseProxies() : [] +} diff --git a/services/apps/packages_worker/src/pypi/schedule.ts b/services/apps/packages_worker/src/pypi/schedule.ts new file mode 100644 index 0000000000..2382bb61bb --- /dev/null +++ b/services/apps/packages_worker/src/pypi/schedule.ts @@ -0,0 +1,40 @@ +import { ScheduleAlreadyRunning, ScheduleOverlapPolicy } from '@temporalio/client' + +import { svc } from '../service' +import { ingestPypiPackages } from '../workflows' + +export async function schedulePypiIngest(): Promise { + const { temporal } = svc + if (!temporal) throw new Error('Temporal client not initialized') + + try { + await temporal.schedule.create({ + scheduleId: 'pypi-registry-ingest', + spec: { + cronExpressions: ['0 5 * * *'], + }, + policies: { + overlap: ScheduleOverlapPolicy.SKIP, + catchupWindow: '1 hour', + }, + action: { + type: 'startWorkflow', + workflowType: ingestPypiPackages, + taskQueue: 'pypi-worker', + workflowRunTimeout: '24 hours', + retry: { + initialInterval: '30 seconds', + backoffCoefficient: 2, + maximumAttempts: 3, + }, + args: [], + }, + }) + } catch (err) { + if (err instanceof ScheduleAlreadyRunning) { + svc.log.info('Schedule pypi-registry-ingest already registered.') + } else { + throw err + } + } +} diff --git a/services/apps/packages_worker/src/pypi/types.ts b/services/apps/packages_worker/src/pypi/types.ts new file mode 100644 index 0000000000..9269d4746e --- /dev/null +++ b/services/apps/packages_worker/src/pypi/types.ts @@ -0,0 +1,45 @@ +export type FetchErrorKind = 'RATE_LIMIT' | 'TRANSIENT' | 'NOT_FOUND' | 'MALFORMED' + +export interface FetchError { + kind: FetchErrorKind + message: string + statusCode?: number +} + +export function isFetchError(v: unknown): v is FetchError { + return typeof v === 'object' && v !== null && 'kind' in v && 'message' in v +} + +// Subset of the PyPI JSON API project response we consume. +// https://pypi.org/pypi//json — see docs.pypi.org/api/json. 
+export interface PyPiReleaseFile { + upload_time_iso_8601?: string + yanked?: boolean +} + +export interface PyPiInfo { + name: string + version?: string + summary?: string | null + author?: string | null + author_email?: string | null + maintainer?: string | null + maintainer_email?: string | null + license?: string | null + // PEP 639 SPDX license expression (newer packages). + license_expression?: string | null + keywords?: string | null + classifiers?: string[] + home_page?: string | null + // Canonical PyPI project URL, e.g. https://pypi.org/project// + package_url?: string | null + project_urls?: Record | null + // True when the latest release is yanked. + yanked?: boolean +} + +export interface PyPiProject { + info: PyPiInfo + // version string -> array of distribution file objects (may be empty). + releases?: Record +} diff --git a/services/apps/packages_worker/src/pypi/upsertProject.ts b/services/apps/packages_worker/src/pypi/upsertProject.ts new file mode 100644 index 0000000000..650f14f284 --- /dev/null +++ b/services/apps/packages_worker/src/pypi/upsertProject.ts @@ -0,0 +1,148 @@ +import { + getOrCreateRepoByUrl, + upsertNpmFundingLinks, + upsertNpmMaintainers, + upsertPackageRepo, + upsertPypiPackage, + upsertPypiVersions, +} from '@crowd/data-access-layer/src/packages' +import type { QueryExecutor } from '@crowd/data-access-layer/src/queryExecutor' + +import { canonicalizeRepoUrl } from '../utils/canonicalizeRepoUrl' + +import { + classifyProjectUrls, + collectPypiMaintainers, + isPypiPrerelease, + parseKeywords, + pypiNameFromPurl, + resolvePypiLicenses, + stripNullBytesDeep, +} from './normalize' +import type { PyPiProject, PyPiReleaseFile } from './types' + +interface PypiVersionRow { + number: string + publishedAt: string | null + isLatest: boolean + isPrerelease: boolean + isYanked: boolean + license: string | null +} + +function fileUploadTimes(files: PyPiReleaseFile[]): string[] { + return files + .map((f) => f.upload_time_iso_8601) + 
.filter((t): t is string => typeof t === 'string' && t.length > 0) +} + +function minStr(arr: string[]): string | null { + return arr.length ? arr.reduce((a, b) => (a < b ? a : b)) : null +} + +function maxStr(arr: string[]): string | null { + return arr.length ? arr.reduce((a, b) => (a > b ? a : b)) : null +} + +export async function upsertProject( + qx: QueryExecutor, + project: PyPiProject, + purl: string, +): Promise<{ purl: string; changedFields: string[] }> { + const info = project.info + stripNullBytesDeep(info) + + const name = info.name + const status = info.yanked ? 'yanked' : 'active' + const pypiName = pypiNameFromPurl(purl) + const registryUrl = + (typeof info.package_url === 'string' && info.package_url) || + `https://pypi.org/project/${pypiName}/` + const description = info.summary?.trim() ? info.summary.trim() : null + + const { homepage, declaredRepositoryUrl, fundingLinks } = classifyProjectUrls( + info.project_urls, + info.home_page, + ) + const repo = declaredRepositoryUrl ? canonicalizeRepoUrl(declaredRepositoryUrl) : null + const { licenses, licensesRaw } = resolvePypiLicenses(info) + const keywords = parseKeywords(info.keywords) + const maintainers = collectPypiMaintainers(info) + + const releases = project.releases ?? {} + const latestVersion = info.version ?? null + const packageLicense = licenses[0] ?? null + + const allUploadTimes: string[] = [] + const versionRows: PypiVersionRow[] = [] + for (const [number, files] of Object.entries(releases)) { + // A version whose files were all deleted has an empty array — no release artifact, + // so skip it (it would carry no publish date and inflate the version count). 
+ if (!Array.isArray(files) || files.length === 0) continue + const times = fileUploadTimes(files) + allUploadTimes.push(...times) + versionRows.push({ + number, + publishedAt: minStr(times), + isLatest: number === latestVersion, + isPrerelease: isPypiPrerelease(number), + isYanked: files.every((f) => f.yanked === true), + license: packageLicense, + }) + } + + const firstReleaseAt = minStr(allUploadTimes) + const latestReleaseAt = maxStr(allUploadTimes) + + const changed = new Set() + + await qx.tx(async (t) => { + const { id: pkgId, changedFields: pkgChanged } = await upsertPypiPackage(t, { + purl, + namespace: null, + name, + status, + registryUrl, + description, + homepage, + declaredRepositoryUrl, + repositoryUrl: repo?.url ?? null, + licenses: licenses.length ? licenses : null, + licensesRaw, + keywords: keywords.length ? keywords : null, + versionsCount: versionRows.length, + latestVersion, + firstReleaseAt, + latestReleaseAt, + }) + pkgChanged.forEach((f) => changed.add(f)) + + if (repo) { + const { id: repoId, changedFields: repoChanged } = await getOrCreateRepoByUrl( + t, + repo.url, + repo.host, + ) + repoChanged.forEach((f) => changed.add(f)) + const linkChanged = await upsertPackageRepo(t, pkgId, repoId, 'declared', 0.8) + linkChanged.forEach((f) => changed.add(f)) + } + + if (versionRows.length > 0) { + const verChanged = await upsertPypiVersions(t, pkgId, versionRows) + verChanged.forEach((f) => changed.add(f)) + } + + if (maintainers.length > 0) { + const mChanged = await upsertNpmMaintainers(t, pkgId, maintainers, 'pypi') + mChanged.forEach((f) => changed.add(f)) + } + + if (fundingLinks.length > 0) { + const fChanged = await upsertNpmFundingLinks(t, pkgId, fundingLinks) + fChanged.forEach((f) => changed.add(f)) + } + }) + + return { purl, changedFields: Array.from(changed) } +} diff --git a/services/apps/packages_worker/src/pypi/workflows.ts b/services/apps/packages_worker/src/pypi/workflows.ts new file mode 100644 index 0000000000..bf252d31ab --- 
/dev/null +++ b/services/apps/packages_worker/src/pypi/workflows.ts @@ -0,0 +1,43 @@ +import { continueAsNew, proxyActivities } from '@temporalio/workflow' + +import type * as activities from './activities' + +const acts = proxyActivities({ + startToCloseTimeout: '15 minutes', + retry: { + initialInterval: '30 seconds', + backoffCoefficient: 2, + maximumAttempts: 5, + }, +}) + +// Packages fetched per batch (one sequential, throttled activity call) and batches +// per workflow run before continueAsNew resets history. ~INGEST_BATCH × ROUNDS_PER_RUN +// packages are drained per run. +const INGEST_BATCH = 50 +const ROUNDS_PER_RUN = 20 + +interface IngestState { + cursor: string +} + +// Drain critical PyPI packages due for a metadata run (never scanned, or stale), +// keyset-paginated on purl. A short batch (< INGEST_BATCH) means the due set after the +// cursor is exhausted, so the run ends; otherwise continueAsNew carries the cursor. +// The daily schedule re-selects newly-due packages on the next run. 
+export async function ingestPypiPackages(state: IngestState = { cursor: '' }): Promise { + let cursor = state.cursor + + const stopAfterFirstPage = await acts.pypiStopAfterFirstPage() + + for (let r = 0; r < ROUNDS_PER_RUN; r++) { + const { purls, nextCursor } = await acts.getUnscannedPypiBatch(cursor, INGEST_BATCH) + if (purls.length === 0) return + await acts.ingestPypiPackageBatch(purls) + cursor = nextCursor + if (stopAfterFirstPage) return + if (purls.length < INGEST_BATCH) return + } + + await continueAsNew({ cursor }) +} diff --git a/services/apps/packages_worker/src/workflows/index.ts b/services/apps/packages_worker/src/workflows/index.ts index 411ba509f8..78939cece5 100644 --- a/services/apps/packages_worker/src/workflows/index.ts +++ b/services/apps/packages_worker/src/workflows/index.ts @@ -20,3 +20,4 @@ export { ingestScorecard } from '../scorecard/workflows' export { rankPackagesWorkflow } from '../criticality/workflow' export { cargoSyncWorkflow } from '../cargo/workflows' export { enrichGoVersions, enrichGoStatus } from '../go/workflows' +export { ingestPypiPackages } from '../pypi/workflows' diff --git a/services/libs/data-access-layer/src/packages/index.ts b/services/libs/data-access-layer/src/packages/index.ts index ed3118bb19..13631858c0 100644 --- a/services/libs/data-access-layer/src/packages/index.ts +++ b/services/libs/data-access-layer/src/packages/index.ts @@ -5,6 +5,7 @@ export * from './fundingLinks' export * from './maintainers' export * from './npmPackageState' export * from './npmWorkerState' +export * from './pypiPackageState' export * from './packages' export * from './repos' export * from './versions' diff --git a/services/libs/data-access-layer/src/packages/maintainers.ts b/services/libs/data-access-layer/src/packages/maintainers.ts index 43a828c08a..a44d489682 100644 --- a/services/libs/data-access-layer/src/packages/maintainers.ts +++ b/services/libs/data-access-layer/src/packages/maintainers.ts @@ -11,6 +11,7 @@ export async 
function upsertNpmMaintainers( qx: QueryExecutor, packageId: string, maintainers: NpmMaintainerInput[], + ecosystem = 'npm', ): Promise { const changed = new Set() @@ -21,11 +22,11 @@ export async function upsertNpmMaintainers( for (const m of ordered) { const row: { changed_fields: string[] } = await qx.selectOne( `WITH old AS ( - SELECT display_name, email FROM maintainers WHERE ecosystem = 'npm' AND username = $(username) + SELECT display_name, email FROM maintainers WHERE ecosystem = $(ecosystem) AND username = $(username) ), ins AS ( INSERT INTO maintainers (ecosystem, username, display_name, email, created_at, updated_at) - VALUES ('npm', $(username), $(displayName), $(email), NOW(), NOW()) + VALUES ($(ecosystem), $(username), $(displayName), $(email), NOW(), NOW()) ON CONFLICT (ecosystem, username) DO UPDATE SET display_name = COALESCE(EXCLUDED.display_name, maintainers.display_name), email = COALESCE(EXCLUDED.email, maintainers.email), @@ -37,7 +38,7 @@ export async function upsertNpmMaintainers( CASE WHEN o.email IS DISTINCT FROM ins.email THEN 'maintainers.email' END ], NULL) AS changed_fields FROM ins LEFT JOIN old o ON true`, - { username: m.username, displayName: m.displayName, email: m.email }, + { ecosystem, username: m.username, displayName: m.displayName, email: m.email }, ) row.changed_fields.forEach((f) => changed.add(f)) } @@ -57,10 +58,10 @@ export async function upsertNpmMaintainers( for (const m of ordered) { const row: { maintainer_id: string } | null = await qx.selectOneOrNone( `INSERT INTO package_maintainers (package_id, maintainer_id, role, created_at, updated_at) - SELECT $(packageId)::bigint, id, $(role), NOW(), NOW() FROM maintainers WHERE ecosystem = 'npm' AND username = $(username) + SELECT $(packageId)::bigint, id, $(role), NOW(), NOW() FROM maintainers WHERE ecosystem = $(ecosystem) AND username = $(username) ON CONFLICT (package_id, maintainer_id) DO UPDATE SET role = EXCLUDED.role, updated_at = EXCLUDED.updated_at RETURNING 
maintainer_id::text AS maintainer_id`, - { packageId, role: m.role, username: m.username }, + { packageId, role: m.role, username: m.username, ecosystem }, ) if (row) afterMap.set(row.maintainer_id, m.role) } diff --git a/services/libs/data-access-layer/src/packages/packages.ts b/services/libs/data-access-layer/src/packages/packages.ts index fc8f258d76..16c8ed84ef 100644 --- a/services/libs/data-access-layer/src/packages/packages.ts +++ b/services/libs/data-access-layer/src/packages/packages.ts @@ -110,6 +110,103 @@ export async function upsertNpmPackage( return { id: row.id, changedFields: row.changed_fields } } +export interface PypiPackageUpsertInput { + purl: string + namespace: string | null + name: string + status: string + registryUrl: string + description: string | null + homepage: string | null + declaredRepositoryUrl: string | null + repositoryUrl: string | null + licenses: string[] | null + licensesRaw: string | null + keywords: string[] | null + versionsCount: number + latestVersion: string | null + firstReleaseAt: string | null + latestReleaseAt: string | null +} + +export async function upsertPypiPackage( + qx: QueryExecutor, + input: PypiPackageUpsertInput, +): Promise<{ id: string; changedFields: string[] }> { + const row: { id: string; changed_fields: string[] } = await qx.selectOne( + `WITH old AS ( + SELECT namespace, name, status, registry_url, description, homepage, + declared_repository_url, repository_url, licenses, licenses_raw, keywords, + versions_count, latest_version, first_release_at, latest_release_at, + ingestion_source + FROM packages WHERE purl = $(purl) + ), + ins AS ( + INSERT INTO packages ( + purl, ecosystem, namespace, name, status, registry_url, + description, homepage, declared_repository_url, repository_url, + licenses, licenses_raw, keywords, + versions_count, latest_version, first_release_at, latest_release_at, + ingestion_source, last_synced_at, created_at + ) VALUES ( + $(purl), 'pypi', $(namespace), $(name), $(status), 
$(registryUrl), + $(description), $(homepage), $(declaredRepositoryUrl), $(repositoryUrl), + $(licenses), $(licensesRaw), $(keywords), + $(versionsCount), $(latestVersion), $(firstReleaseAt), $(latestReleaseAt), + 'pypi-registry', NOW(), NOW() + ) + ON CONFLICT (purl) DO UPDATE SET + namespace = EXCLUDED.namespace, + name = EXCLUDED.name, + status = EXCLUDED.status, + registry_url = EXCLUDED.registry_url, + description = EXCLUDED.description, + homepage = EXCLUDED.homepage, + declared_repository_url = EXCLUDED.declared_repository_url, + repository_url = EXCLUDED.repository_url, + licenses = EXCLUDED.licenses, + licenses_raw = EXCLUDED.licenses_raw, + keywords = EXCLUDED.keywords, + -- A package with no released files reports 0 versions; don't clobber a + -- previously-known count (matches the retained version rows). + versions_count = CASE WHEN EXCLUDED.versions_count = 0 + THEN packages.versions_count + ELSE EXCLUDED.versions_count END, + latest_version = EXCLUDED.latest_version, + first_release_at = EXCLUDED.first_release_at, + latest_release_at = EXCLUDED.latest_release_at, + ingestion_source = EXCLUDED.ingestion_source, + last_synced_at = EXCLUDED.last_synced_at + RETURNING id, namespace, name, status, registry_url, description, homepage, + declared_repository_url, repository_url, licenses, licenses_raw, keywords, + versions_count, latest_version, first_release_at, latest_release_at, + ingestion_source + ) + SELECT ins.id::text AS id, + array_remove(ARRAY[ + CASE WHEN o.namespace IS DISTINCT FROM ins.namespace THEN 'packages.namespace' END, + CASE WHEN o.name IS DISTINCT FROM ins.name THEN 'packages.name' END, + CASE WHEN o.status IS DISTINCT FROM ins.status THEN 'packages.status' END, + CASE WHEN o.registry_url IS DISTINCT FROM ins.registry_url THEN 'packages.registry_url' END, + CASE WHEN o.description IS DISTINCT FROM ins.description THEN 'packages.description' END, + CASE WHEN o.homepage IS DISTINCT FROM ins.homepage THEN 'packages.homepage' END, + CASE 
WHEN o.declared_repository_url IS DISTINCT FROM ins.declared_repository_url THEN 'packages.declared_repository_url' END, + CASE WHEN o.repository_url IS DISTINCT FROM ins.repository_url THEN 'packages.repository_url' END, + CASE WHEN o.licenses IS DISTINCT FROM ins.licenses THEN 'packages.licenses' END, + CASE WHEN o.licenses_raw IS DISTINCT FROM ins.licenses_raw THEN 'packages.licenses_raw' END, + CASE WHEN o.keywords IS DISTINCT FROM ins.keywords THEN 'packages.keywords' END, + CASE WHEN o.versions_count IS DISTINCT FROM ins.versions_count THEN 'packages.versions_count' END, + CASE WHEN o.latest_version IS DISTINCT FROM ins.latest_version THEN 'packages.latest_version' END, + CASE WHEN o.first_release_at IS DISTINCT FROM ins.first_release_at THEN 'packages.first_release_at' END, + CASE WHEN o.latest_release_at IS DISTINCT FROM ins.latest_release_at THEN 'packages.latest_release_at' END, + CASE WHEN o.ingestion_source IS DISTINCT FROM ins.ingestion_source THEN 'packages.ingestion_source' END + ], NULL) AS changed_fields + FROM ins LEFT JOIN old o ON true`, + input, + ) + return { id: row.id, changedFields: row.changed_fields } +} + export async function getTrackedNpmPackages( qx: QueryExecutor, purls: string[], diff --git a/services/libs/data-access-layer/src/packages/pypiPackageState.ts b/services/libs/data-access-layer/src/packages/pypiPackageState.ts new file mode 100644 index 0000000000..b0351ff6c5 --- /dev/null +++ b/services/libs/data-access-layer/src/packages/pypiPackageState.ts @@ -0,0 +1,57 @@ +import { QueryExecutor } from '../queryExecutor' + +// Structured outcome of a metadata ingest run, stored as JSONB in +// pypi_package_state.metadata_run_result. 
+export interface PypiMetadataRunResult { + status: 'success' | 'error' + attempts: number + httpStatus?: number + errorKind?: string + message?: string +} + +export async function markPypiPackageScanned( + qx: QueryExecutor, + purl: string, + result: PypiMetadataRunResult, +): Promise { + await qx.result( + `INSERT INTO pypi_package_state (purl, metadata_run_result, metadata_last_run_at) + VALUES ($(purl), $(result)::jsonb, NOW()) + ON CONFLICT (purl) DO UPDATE SET + metadata_run_result = EXCLUDED.metadata_run_result, + metadata_last_run_at = EXCLUDED.metadata_last_run_at`, + { purl, result: JSON.stringify(result) }, + ) +} + +// PyPI packages in `packages` that still need a metadata run: either never scanned +// (no state row) or stale (last run older than refreshDays). PyPI has no _changes-style +// feed like npm, so freshness is driven by this staleness window instead. When +// onlyCritical is true (the intended steady state) enrichment is scoped to is_critical +// packages like npm/maven; pass false to sweep the whole PyPI set (temporary — e.g. +// while criticality is still being populated). 
+export async function getUnscannedPypiPurls(
+  qx: QueryExecutor,
+  afterPurl: string,
+  batchSize: number,
+  refreshDays: number,
+  onlyCritical: boolean,
+): Promise<string[]> {
+  // Pages by purl: rows strictly after afterPurl, in purl order, at most batchSize of them.
+  const rows: Array<{ purl: string }> = await qx.select(
+    `SELECT p.purl
+     FROM packages p
+     LEFT JOIN pypi_package_state s ON s.purl = p.purl
+     WHERE p.ecosystem = 'pypi'
+       AND (NOT $(onlyCritical) OR p.is_critical = TRUE)
+       AND p.purl > $(afterPurl)
+       AND (
+         s.metadata_last_run_at IS NULL
+         OR s.metadata_last_run_at < NOW() - ($(refreshDays) || ' days')::interval
+       )
+     ORDER BY p.purl
+     LIMIT $(batchSize)`,
+    { afterPurl, batchSize, refreshDays, onlyCritical },
+  )
+  return rows.map((r) => r.purl)
+}
diff --git a/services/libs/data-access-layer/src/packages/versions.ts b/services/libs/data-access-layer/src/packages/versions.ts
index 2a9cd5276f..f90a571f85 100644
--- a/services/libs/data-access-layer/src/packages/versions.ts
+++ b/services/libs/data-access-layer/src/packages/versions.ts
@@ -68,3 +68,74 @@ export async function upsertNpmVersions(
   )
   return row.changed_fields
 }
+
+// One PyPI release as written to `versions` by upsertPypiVersions.
+export interface PypiVersionInput {
+  number: string
+  publishedAt: string | null
+  isLatest: boolean
+  isPrerelease: boolean
+  isYanked: boolean
+  license: string | null
+}
+
+// Upsert the `versions` rows of one package in a single statement and report what changed.
+//   old – rows that already exist for the given version numbers, read before the write
+//   ins – the upsert itself; namespace/name are copied from the package row, so an unknown
+//         packageId inserts nothing
+// The final SELECT compares ins with old (IS DISTINCT FROM, so NULLs compare as values) and
+// returns the 'versions.<column>' names that differ on at least one row; a version missing
+// from `old` is a new row and is reported too. On conflict published_at keeps the stored value
+// when the incoming one is NULL; the other columns, licenses included, are overwritten.
+// Returns [] without querying when `versions` is empty.
+export async function upsertPypiVersions(
+  qx: QueryExecutor,
+  packageId: string,
+  versions: PypiVersionInput[],
+): Promise<string[]> {
+  if (versions.length === 0) return []
+  const row: { changed_fields: string[] } = await qx.selectOne(
+    `WITH old AS (
+       SELECT number, published_at, is_latest, is_yanked, is_prerelease, licenses
+       FROM versions
+       WHERE package_id = $(packageId)::bigint AND number = ANY($(numbers)::text[])
+     ),
+     ins AS (
+       INSERT INTO versions (
+         package_id, ecosystem, namespace, name, number,
+         published_at, is_latest, is_yanked, is_prerelease, licenses, last_synced_at,
+         created_at
+       )
+       SELECT $(packageId)::bigint, 'pypi', p.namespace, p.name, v.num,
+              v.pub::timestamptz, v.latest, v.yanked, v.pre,
+              CASE WHEN v.lic IS NULL THEN NULL::text[] ELSE ARRAY[v.lic] END,
+              NOW(), NOW()
+       FROM unnest(
+         $(numbers)::text[],
+         $(publishedAts)::text[],
+         $(isLatests)::bool[],
+         $(isYankeds)::bool[],
+         $(isPrereleases)::bool[],
+         $(licenses)::text[]
+       ) AS v(num, pub, latest, yanked, pre, lic)
+       CROSS JOIN (SELECT namespace, name FROM packages WHERE id = $(packageId)::bigint) p
+       ON CONFLICT (package_id, number) DO UPDATE SET
+         published_at = COALESCE(EXCLUDED.published_at, versions.published_at),
+         is_latest = EXCLUDED.is_latest,
+         is_yanked = EXCLUDED.is_yanked,
+         is_prerelease = EXCLUDED.is_prerelease,
+         licenses = EXCLUDED.licenses,
+         last_synced_at = EXCLUDED.last_synced_at
+       RETURNING number, published_at, is_latest, is_yanked, is_prerelease, licenses
+     )
+     SELECT array_remove(ARRAY[
+       CASE WHEN bool_or(o.number IS NULL) THEN 'versions.number' END,
+       CASE WHEN bool_or(o.number IS NULL OR o.published_at IS DISTINCT FROM ins.published_at) THEN 'versions.published_at' END,
+       CASE WHEN bool_or(o.is_latest IS DISTINCT FROM ins.is_latest) THEN 'versions.is_latest' END,
+       CASE WHEN bool_or(o.is_yanked IS DISTINCT FROM ins.is_yanked) THEN 'versions.is_yanked' END,
+       CASE WHEN bool_or(o.is_prerelease IS DISTINCT FROM ins.is_prerelease) THEN 'versions.is_prerelease' END,
+       CASE WHEN bool_or(o.licenses IS DISTINCT FROM ins.licenses) THEN 'versions.licenses' END
+     ], NULL) AS changed_fields
+     FROM ins LEFT JOIN old o ON o.number = ins.number`,
+    {
+      packageId,
+      // Parallel arrays, one element per version; unnest() zips them back into rows.
+      numbers: versions.map((v) => v.number),
+      publishedAts: versions.map((v) => v.publishedAt),
+      isLatests: versions.map((v) => v.isLatest),
+      isYankeds: versions.map((v) => v.isYanked),
+      isPrereleases: versions.map((v) => v.isPrerelease),
+      licenses: versions.map((v) => v.license),
+    },
+  )
+  return row.changed_fields
+}
From 8aa037204ae4428e11bf6aeba9afa2e8e8fd60b7 Mon Sep 17 00:00:00 2001
From: anilb
Date: Wed, 1 Jul 2026 17:15:41 +0200
Subject: [PATCH 02/10] feat: pypi downloads ingest

Signed-off-by: anilb
---
 .../src/bin/bq-dataset-ingest.ts | 6
+ .../packages_worker/src/deps-dev/README.md | 2 + .../activities/getCriticalPypiCount.ts | 12 + .../src/deps-dev/activities/index.ts | 1 + .../queries/__tests__/pypiDownloads.test.ts | 179 +++++++++++++++ .../deps-dev/queries/pypiDownloadsDates.ts | 64 ++++++ .../src/deps-dev/queries/pypiDownloadsSql.ts | 105 +++++++++ .../src/deps-dev/schedules/pypiDownloads.ts | 63 ++++++ .../src/deps-dev/workflows/index.ts | 1 + .../deps-dev/workflows/ingestPypiDownloads.ts | 212 ++++++++++++++++++ .../packages_worker/src/npm/activities.ts | 8 +- .../apps/packages_worker/src/npm/normalize.ts | 21 -- .../packages_worker/src/npm/upsertPackage.ts | 3 +- .../src/pypi/__tests__/fetchProject.test.ts | 79 +++++++ .../src/pypi/__tests__/ingest.test.ts | 170 ++++++++++++++ .../src/pypi/__tests__/normalize.test.ts | 78 ++++++- .../packages_worker/src/pypi/activities.ts | 53 ++++- .../packages_worker/src/pypi/fetchProject.ts | 60 ++--- .../packages_worker/src/pypi/normalize.ts | 98 +++++--- .../packages_worker/src/pypi/retryPolicy.ts | 4 + .../packages_worker/src/pypi/upsertProject.ts | 54 +---- .../packages_worker/src/pypi/workflows.ts | 5 +- .../src/scripts/monitorOsspckgs.ts | 7 +- .../__tests__/stripNullBytesDeep.test.ts | 18 ++ .../src/utils/isClientError.ts | 7 + .../src/utils/stripNullBytesDeep.ts | 18 ++ .../packages_worker/src/workflows/index.ts | 2 + .../src/osspckgs/ingestJobs.ts | 2 + .../src/packages/packages.ts | 9 + .../src/packages/versions.ts | 15 ++ 30 files changed, 1201 insertions(+), 155 deletions(-) create mode 100644 services/apps/packages_worker/src/deps-dev/activities/getCriticalPypiCount.ts create mode 100644 services/apps/packages_worker/src/deps-dev/queries/__tests__/pypiDownloads.test.ts create mode 100644 services/apps/packages_worker/src/deps-dev/queries/pypiDownloadsDates.ts create mode 100644 services/apps/packages_worker/src/deps-dev/queries/pypiDownloadsSql.ts create mode 100644 services/apps/packages_worker/src/deps-dev/schedules/pypiDownloads.ts create 
mode 100644 services/apps/packages_worker/src/deps-dev/workflows/ingestPypiDownloads.ts create mode 100644 services/apps/packages_worker/src/pypi/__tests__/fetchProject.test.ts create mode 100644 services/apps/packages_worker/src/pypi/__tests__/ingest.test.ts create mode 100644 services/apps/packages_worker/src/pypi/retryPolicy.ts create mode 100644 services/apps/packages_worker/src/utils/__tests__/stripNullBytesDeep.test.ts create mode 100644 services/apps/packages_worker/src/utils/isClientError.ts create mode 100644 services/apps/packages_worker/src/utils/stripNullBytesDeep.ts diff --git a/services/apps/packages_worker/src/bin/bq-dataset-ingest.ts b/services/apps/packages_worker/src/bin/bq-dataset-ingest.ts index a81c14914a..41982f2223 100644 --- a/services/apps/packages_worker/src/bin/bq-dataset-ingest.ts +++ b/services/apps/packages_worker/src/bin/bq-dataset-ingest.ts @@ -1,4 +1,8 @@ import { scheduleOsspckgsBootstrap } from '../deps-dev/schedules/bootstrap' +import { + schedulePypiDownloads30d, + schedulePypiDownloadsDaily, +} from '../deps-dev/schedules/pypiDownloads' import { scheduleOsspckgsCleanup } from '../schedules/cleanup' import { svc } from '../service' @@ -6,5 +10,7 @@ setImmediate(async () => { await svc.init() await scheduleOsspckgsBootstrap() await scheduleOsspckgsCleanup() + await schedulePypiDownloads30d() + await schedulePypiDownloadsDaily() await svc.start() }) diff --git a/services/apps/packages_worker/src/deps-dev/README.md b/services/apps/packages_worker/src/deps-dev/README.md index 8e283a9e4e..115ad4edc1 100644 --- a/services/apps/packages_worker/src/deps-dev/README.md +++ b/services/apps/packages_worker/src/deps-dev/README.md @@ -58,6 +58,8 @@ The mode-specific key takes precedence over the generic key. 
Value must be a pos | `BQ_DATASET_INGEST_DEPENDENT_COUNTS_MAX_BQ_GB` | 2000 | `dependent_counts` | | | `BQ_DATASET_INGEST_SCORECARD_REPOS_MAX_BQ_GB` | 50 | `scorecard_repos` | | | `BQ_DATASET_INGEST_SCORECARD_CHECKS_MAX_BQ_GB` | 500 | `scorecard_checks` | | +| `BQ_DATASET_INGEST_PYPI_DOWNLOADS_30D_MAX_BQ_GB` | 6000 | `pypi_downloads_30d` | Per 30-day window scan (~4.56 TB measured; set in `ingestPypiDownloads.ts`) | +| `BQ_DATASET_INGEST_PYPI_DOWNLOADS_DAILY_MAX_BQ_GB` | 2000 | `pypi_downloads_daily` | Scales with backfill range; raise for long backfills | The override logic lives in `src/deps-dev/activities/bqExportToGcs.ts`. diff --git a/services/apps/packages_worker/src/deps-dev/activities/getCriticalPypiCount.ts b/services/apps/packages_worker/src/deps-dev/activities/getCriticalPypiCount.ts new file mode 100644 index 0000000000..c3833e000f --- /dev/null +++ b/services/apps/packages_worker/src/deps-dev/activities/getCriticalPypiCount.ts @@ -0,0 +1,12 @@ +import { getCriticalPypiPackageCount } from '@crowd/data-access-layer' + +import { getPackagesDb } from '../../db' + +// Count of critical PyPI packages, so the daily downloads workflow can skip its BigQuery scan +// when there are none (the merge is scoped to is_critical, mirroring how deps.dev scopes to our +// packages in the Postgres merge rather than pushing our package list into BigQuery). 
+export async function getCriticalPypiCount(): Promise<{ count: number }> {
+  // Resolve the packages DB handle, then wrap the scalar count in the activity's result shape.
+  const db = await getPackagesDb()
+  return { count: await getCriticalPypiPackageCount(db) }
+}
diff --git a/services/apps/packages_worker/src/deps-dev/activities/index.ts b/services/apps/packages_worker/src/deps-dev/activities/index.ts
index 11e1ed49c8..6c20114476 100644
--- a/services/apps/packages_worker/src/deps-dev/activities/index.ts
+++ b/services/apps/packages_worker/src/deps-dev/activities/index.ts
@@ -1,4 +1,5 @@
 export * from './bqExportToGcs'
+export * from './getCriticalPypiCount'
 export * from './setJobStep'
 export * from './createVersionsLookup'
 export * from './managePackageDepsConstraints'
diff --git a/services/apps/packages_worker/src/deps-dev/queries/__tests__/pypiDownloads.test.ts b/services/apps/packages_worker/src/deps-dev/queries/__tests__/pypiDownloads.test.ts
new file mode 100644
index 0000000000..045b1337a7
--- /dev/null
+++ b/services/apps/packages_worker/src/deps-dev/queries/__tests__/pypiDownloads.test.ts
@@ -0,0 +1,179 @@
+import { describe, expect, it } from 'vitest'
+
+import {
+  PYPI_EARLIEST,
+  computeLast30dWindows,
+  defaultDailyRange,
+  utcFirstOfCurrentMonth,
+} from '../pypiDownloadsDates'
+import {
+  PYPI_DOWNLOADS_30D_KIND,
+  PYPI_DOWNLOADS_DAILY_KIND,
+  buildPypiDownloads30dMergeSql,
+  buildPypiDownloads30dSql,
+  buildPypiDownloadsDailyMergeSql,
+  buildPypiDownloadsDailySql,
+} from '../pypiDownloadsSql'
+
+// Note: PEP 503 name normalization is done in SQL (BQ `REGEXP_REPLACE(LOWER(file.project), …)`
+// and the PG merge's `REGEXP_REPLACE(LOWER(p.name), …, 'g')`), asserted in the builder tests
+// below — there is no TS-side normalizer (we don't push our package list into BigQuery).
+
+// ── Criterion 2: monthly 30-day windows (npm-identical math) ───────────────────────────────
+describe('computeLast30dWindows', () => {
+  it('PYPI_EARLIEST is the 2018-07-26 Linehaul floor', () => {
+    expect(PYPI_EARLIEST).toBe('2018-07-26')
+  })
+
+  it('with no fromDate returns only the latest bucket ending at upperEndDate', () => {
+    const w = computeLast30dWindows(null, '2026-06-01')
+    expect(w).toEqual([{ start: '2026-05-02', end: '2026-06-01', isLatest: true }])
+  })
+
+  it('enumerates a contiguous monthly series up to and including the latest', () => {
+    // fromDate falls mid-April; the walk starts at the 1st of that month.
+    const w = computeLast30dWindows('2026-04-10', '2026-06-01')
+    expect(w.map((x) => x.end)).toEqual(['2026-04-01', '2026-05-01', '2026-06-01'])
+    // end_date - 30 days for each bucket
+    expect(w.map((x) => x.start)).toEqual(['2026-03-02', '2026-04-01', '2026-05-02'])
+    // exactly one latest, and it is the final bucket
+    expect(w.filter((x) => x.isLatest)).toEqual([
+      { start: '2026-05-02', end: '2026-06-01', isLatest: true },
+    ])
+  })
+
+  it('clamps the lower bound and start_date to PYPI_EARLIEST', () => {
+    const w = computeLast30dWindows('2015-01-01', '2018-09-01')
+    // 2018-07-01 bucket is skipped (its end precedes PYPI_EARLIEST)
+    expect(w.map((x) => x.end)).toEqual(['2018-08-01', '2018-09-01'])
+    // first bucket's start (would be 2018-07-02) is clamped up to the floor
+    expect(w[0]).toEqual({ start: '2018-07-26', end: '2018-08-01', isLatest: false })
+    expect(w[1].isLatest).toBe(true)
+  })
+
+  it('returns an empty array when fromDate is after upperEndDate', () => {
+    expect(computeLast30dWindows('2026-07-01', '2026-06-01')).toEqual([])
+  })
+})
+
+// ── Criterion 3: daily trailing window + first-of-month helper ─────────────────────────────
+describe('defaultDailyRange', () => {
+  it('defaults to a 2-day self-healing window [today-2, today-1]', () => {
+    expect(defaultDailyRange('2026-06-30')).toEqual({
+      startDate: '2026-06-28',
+      endDate: '2026-06-29',
+    })
+  })
+
+  it('crosses month boundaries correctly', () => {
+    // 2026 is not a leap year, so the day before 2026-03-01 is 2026-02-28.
+    expect(defaultDailyRange('2026-03-01')).toEqual({
+      startDate: '2026-02-27',
+      endDate: '2026-02-28',
+    })
+  })
+})
+
+describe('utcFirstOfCurrentMonth', () => {
+  it('returns the 1st of the month containing the reference date', () => {
+    expect(utcFirstOfCurrentMonth('2026-06-30')).toBe('2026-06-01')
+    expect(utcFirstOfCurrentMonth('2026-01-15')).toBe('2026-01-01')
+  })
+})
+
+// ── Criterion 4: 30d BigQuery aggregate query ─────────────────────────────────────────────
+describe('buildPypiDownloads30dSql', () => {
+  // Built once; every case below inspects the same generated SQL text.
+  const sql = buildPypiDownloads30dSql({ startDate: '2026-05-02', endDate: '2026-06-01' })
+
+  it('reads from the public file_downloads table', () => {
+    expect(sql).toContain('bigquery-public-data.pypi.file_downloads')
+  })
+
+  it('filters the date range on the timestamp partition', () => {
+    expect(sql).toContain('DATE(timestamp)')
+    expect(sql).toContain('BETWEEN')
+    expect(sql).toContain('2026-05-02')
+    expect(sql).toContain('2026-06-01')
+  })
+
+  it('excludes bandersnatch mirror traffic while keeping NULL installers', () => {
+    expect(sql).toContain("COALESCE(details.installer.name, '') <> 'bandersnatch'")
+  })
+
+  it('groups by the PEP 503-normalized project and counts downloads', () => {
+    expect(sql).toContain("REGEXP_REPLACE(LOWER(file.project), r'[-_.]+', '-')")
+    expect(sql).toMatch(/COUNT\(\*\)\s+AS\s+downloads/i)
+    expect(sql).toContain('GROUP BY')
+  })
+})
+
+// ── Criterion 5: daily BigQuery aggregate query ───────────────────────────────────────────
+describe('buildPypiDownloadsDailySql', () => {
+  // Built once; every case below inspects the same generated SQL text.
+  const sql = buildPypiDownloadsDailySql({ startDate: '2026-06-01', endDate: '2026-06-03' })
+
+  it('aggregates per project per day', () => {
+    expect(sql).toContain('bigquery-public-data.pypi.file_downloads')
+    expect(sql).toContain('DATE(timestamp) AS day')
+    expect(sql).toContain("COALESCE(details.installer.name, '') <> 'bandersnatch'")
+    expect(sql).toContain("REGEXP_REPLACE(LOWER(file.project), r'[-_.]+', '-')")
+    expect(sql).toMatch(/COUNT\(\*\)\s+AS\s+downloads/i)
+    expect(sql).toContain('GROUP BY project, day')
+  })
+
+  it('never pushes our package list into BigQuery — scoping to is_critical is the merge job', () => {
+    expect(sql).not.toContain('UNNEST')
+    expect(sql).not.toContain('is_critical')
+  })
+})
+
+// ── Criterion 6: merge SQL builders ───────────────────────────────────────────────────────
+describe('buildPypiDownloads30dMergeSql', () => {
+  it('emits only the insert when not mirroring', () => {
+    const stmts = buildPypiDownloads30dMergeSql({
+      startDate: '2026-05-02',
+      endDate: '2026-06-01',
+      mirrorToPackages: false,
+    })
+    expect(stmts).toHaveLength(1)
+    const insert = stmts[0]
+    expect(insert).toContain('INSERT INTO downloads_last_30d')
+    expect(insert).toContain('staging.pypi_downloads_30d_raw')
+    expect(insert).toContain("p.ecosystem = 'pypi'")
+    expect(insert).toContain('2026-05-02')
+    expect(insert).toContain('2026-06-01')
+    expect(insert).toContain('ON CONFLICT (purl, end_date) DO UPDATE')
+    // PG-side normalization must use the 'g' flag to replace every separator run
+    expect(insert).toContain("REGEXP_REPLACE(LOWER(p.name), '[-_.]+', '-', 'g')")
+  })
+
+  it('appends a packages mirror update when mirroring the latest window', () => {
+    const stmts = buildPypiDownloads30dMergeSql({
+      startDate: '2026-05-02',
+      endDate: '2026-06-01',
+      mirrorToPackages: true,
+    })
+    // The insert stays first; the mirror UPDATE is the second statement.
+    expect(stmts).toHaveLength(2)
+    expect(stmts[1]).toContain('UPDATE packages')
+    expect(stmts[1]).toContain('downloads_last_30d')
+    expect(stmts[1]).toContain('IS DISTINCT FROM')
+  })
+})
+
+describe('buildPypiDownloadsDailyMergeSql', () => {
+  it('inserts into downloads_daily scoped to critical pypi packages', () => {
+    const sql = buildPypiDownloadsDailyMergeSql()
+    expect(sql).toContain('INSERT INTO downloads_daily')
+    expect(sql).toContain('package_id')
+    expect(sql).toContain('staging.pypi_downloads_daily_raw')
+    expect(sql).toContain("p.ecosystem = 'pypi'")
+    expect(sql).toContain('p.is_critical')
+    expect(sql).toContain('ON CONFLICT (package_id, date) DO UPDATE')
+    expect(sql).toContain("REGEXP_REPLACE(LOWER(p.name), '[-_.]+', '-', 'g')")
+  })
+})
+
+// ── Criterion 7: job-kind registry ────────────────────────────────────────────────────────
+describe('PyPI downloads job kinds', () => {
+  it('exposes the two new kind identifiers', () => {
+    expect(PYPI_DOWNLOADS_30D_KIND).toBe('pypi_downloads_30d')
+    expect(PYPI_DOWNLOADS_DAILY_KIND).toBe('pypi_downloads_daily')
+  })
+})
diff --git a/services/apps/packages_worker/src/deps-dev/queries/pypiDownloadsDates.ts b/services/apps/packages_worker/src/deps-dev/queries/pypiDownloadsDates.ts
new file mode 100644
index 0000000000..b15bfa01a8
--- /dev/null
+++ b/services/apps/packages_worker/src/deps-dev/queries/pypiDownloadsDates.ts
@@ -0,0 +1,64 @@
+// Date math for the PyPI downloads workflows. Pure functions — no clock access; the
+// caller passes the reference "today" so runs are deterministic and testable.
+
+// PyPI's BigQuery download data is unreliable before this date: Linehaul under-reported
+// downloads prior to 2018-07-26 (see the official PyPI download-analysis guide). Backfills
+// are clamped to this floor so we never ingest known-bad counts.
+export const PYPI_EARLIEST = '2018-07-26'
+
+// One 30-day download window. `start` and `end` are YYYY-MM-DD strings; `isLatest` marks the
+// window that ends on the caller's upper bound.
+export interface Last30dWindow {
+  start: string
+  end: string
+  isLatest: boolean
+}
+
+// Shift a YYYY-MM-DD date by `days` (negative moves backwards) using UTC arithmetic and
+// return the result as YYYY-MM-DD.
+function addDaysUTC(date: string, days: number): string {
+  const d = new Date(date + 'T00:00:00Z')
+  d.setUTCDate(d.getUTCDate() + days)
+  return d.toISOString().slice(0, 10)
+}
+
+// 1st of the calendar month (UTC) containing `today` (YYYY-MM-DD).
+export function utcFirstOfCurrentMonth(today: string): string {
+  // Parse as UTC midnight, then rebuild the date with the day-of-month pinned to 1.
+  const ref = new Date(today + 'T00:00:00Z')
+  const monthStartMs = Date.UTC(ref.getUTCFullYear(), ref.getUTCMonth(), 1)
+  return new Date(monthStartMs).toISOString().slice(0, 10)
+}
+
+// Enumerate the monthly 30-day download windows (same math as npm's
+// computeMissingLast30dWindows). Each window ends on the 1st of a calendar month (UTC) and
+// starts 30 days earlier, clamped up to PYPI_EARLIEST; windows ending before that floor are
+// dropped. The walk begins at the month containing max(fromDate, PYPI_EARLIEST) and stops at
+// upperEndDate; with no fromDate it covers only upperEndDate's month. isLatest is set only on
+// the window whose end equals upperEndDate. An empty range yields [].
+export function computeLast30dWindows(
+  fromDate: string | null,
+  upperEndDate: string,
+): Last30dWindow[] {
+  let lower = upperEndDate
+  if (fromDate) {
+    lower = fromDate > PYPI_EARLIEST ? fromDate : PYPI_EARLIEST
+  }
+  const lowerRef = new Date(lower + 'T00:00:00Z')
+  const firstMonthMs = Date.UTC(lowerRef.getUTCFullYear(), lowerRef.getUTCMonth(), 1)
+  const latestMs = new Date(upperEndDate + 'T00:00:00Z').getTime()
+
+  const windows: Last30dWindow[] = []
+  // When firstMonthMs is already past latestMs the loop body never runs and [] is returned.
+  for (let cursor = firstMonthMs; cursor <= latestMs; ) {
+    const monthStart = new Date(cursor)
+    const end = monthStart.toISOString().slice(0, 10) // 1st of the month
+    // Windows that end before the Linehaul floor are not emitted at all.
+    if (end >= PYPI_EARLIEST) {
+      const rawStart = addDaysUTC(end, -30)
+      const start = rawStart < PYPI_EARLIEST ? PYPI_EARLIEST : rawStart
+      windows.push({ start, end, isLatest: cursor === latestMs })
+    }
+    cursor = Date.UTC(monthStart.getUTCFullYear(), monthStart.getUTCMonth() + 1, 1)
+  }
+  return windows
+}
+
+// The daily trailing window spans 2 days so the most-recent (possibly partial) partition is
+// re-scanned once on the next run and corrected, while keeping the daily BigQuery scan small.
+const DAILY_TRAILING_DAYS = 2
+
+// Default daily trailing window: [today - 2, today - 1] inclusive (self-healing).
+export function defaultDailyRange(today: string): { startDate: string; endDate: string } {
+  // Oldest day first: DAILY_TRAILING_DAYS back for the start, yesterday for the end.
+  const startDate = addDaysUTC(today, -DAILY_TRAILING_DAYS)
+  const endDate = addDaysUTC(today, -1)
+  return { startDate, endDate }
+}
diff --git a/services/apps/packages_worker/src/deps-dev/queries/pypiDownloadsSql.ts b/services/apps/packages_worker/src/deps-dev/queries/pypiDownloadsSql.ts
new file mode 100644
index 0000000000..ed90aa904c
--- /dev/null
+++ b/services/apps/packages_worker/src/deps-dev/queries/pypiDownloadsSql.ts
@@ -0,0 +1,105 @@
+export const PYPI_DOWNLOADS_30D_KIND = 'pypi_downloads_30d'
+export const PYPI_DOWNLOADS_DAILY_KIND = 'pypi_downloads_daily'
+
+// Staging tables the BQ export lands in before the merge into the final download tables.
+export const PYPI_DOWNLOADS_30D_STAGING = 'staging.pypi_downloads_30d_raw'
+export const PYPI_DOWNLOADS_DAILY_STAGING = 'staging.pypi_downloads_daily_raw'
+
+const BQ_TABLE = '`bigquery-public-data.pypi.file_downloads`'
+// PEP 503 canonical project, BigQuery dialect (REGEXP_REPLACE is global by default).
+const BQ_PROJECT_NORM = "REGEXP_REPLACE(LOWER(file.project), r'[-_.]+', '-')"
+// Keep NULL installers; only drop known mirror traffic.
+const MIRROR_EXCLUDE = "COALESCE(details.installer.name, '') <> 'bandersnatch'"
+// PEP 503 canonical name, Postgres dialect — the 'g' flag is required so EVERY separator
+// run collapses (Postgres REGEXP_REPLACE otherwise replaces only the first match).
+const PG_NAME_NORM = "REGEXP_REPLACE(LOWER(p.name), '[-_.]+', '-', 'g')"
+
+// Aggregate query: net downloads per package over [startDate, endDate], one total each.
+export function buildPypiDownloads30dSql({
+  startDate,
+  endDate,
+}: {
+  startDate: string
+  endDate: string
+}): string {
+  // Throws on anything that is not YYYY-MM-DD before the values reach the SQL text.
+  const from = assertIsoDate(startDate, 'startDate')
+  const to = assertIsoDate(endDate, 'endDate')
+  return `
+SELECT
+  ${BQ_PROJECT_NORM} AS project,
+  COUNT(*) AS downloads
+FROM ${BQ_TABLE}
+WHERE DATE(timestamp) BETWEEN DATE('${from}') AND DATE('${to}')
+  AND ${MIRROR_EXCLUDE}
+GROUP BY project
+`
+}
+
+// The builders splice their dates into the SQL text as string literals, and the daily workflow
+// forwards caller-supplied startDate/endDate unchanged, so only a plain YYYY-MM-DD value is
+// accepted. This checks the shape only; whether it is a real calendar date is left to BigQuery.
+const ISO_DATE_RE = /^\d{4}-\d{2}-\d{2}$/
+
+// Returns `value` unchanged when it is a YYYY-MM-DD string; throws an Error naming `label`
+// otherwise.
+function assertIsoDate(value: string, label: string): string {
+  if (typeof value !== 'string' || !ISO_DATE_RE.test(value)) {
+    throw new Error(`${label} must be a YYYY-MM-DD date, got: ${JSON.stringify(value)}`)
+  }
+  return value
+}
+
+// Aggregate query: per-day downloads per package over [startDate, endDate]. Like every other BQ
+// job here (see deps.dev), it exports all projects and lets the Postgres merge scope to our
+// packages (is_critical) — we never push our package list into BigQuery.
+export function buildPypiDownloadsDailySql({
+  startDate,
+  endDate,
+}: {
+  startDate: string
+  endDate: string
+}): string {
+  // Throws on anything that is not YYYY-MM-DD before the values reach the SQL text.
+  const from = assertIsoDate(startDate, 'startDate')
+  const to = assertIsoDate(endDate, 'endDate')
+  return `
+SELECT
+  ${BQ_PROJECT_NORM} AS project,
+  DATE(timestamp) AS day,
+  COUNT(*) AS downloads
+FROM ${BQ_TABLE}
+WHERE DATE(timestamp) BETWEEN DATE('${from}') AND DATE('${to}')
+  AND ${MIRROR_EXCLUDE}
+GROUP BY project, day
+`
+}
+
+// Merge statements for the 30d window: insert into downloads_last_30d, and (only for the
+// latest window) mirror the count onto packages.downloads_last_30d.
+export function buildPypiDownloads30dMergeSql({
+  startDate,
+  endDate,
+  mirrorToPackages,
+}: {
+  startDate: string
+  endDate: string
+  mirrorToPackages: boolean
+}): string[] {
+  // Statement 1 (always): upsert the window's count per purl, keyed on (purl, end_date).
+  const statements: string[] = [
+    `
+INSERT INTO downloads_last_30d (purl, start_date, end_date, count, created_at, updated_at)
+SELECT p.purl, DATE '${startDate}', DATE '${endDate}', s.downloads, NOW(), NOW()
+FROM ${PYPI_DOWNLOADS_30D_STAGING} s
+JOIN packages p ON p.ecosystem = 'pypi'
+  AND ${PG_NAME_NORM} = s.project
+ON CONFLICT (purl, end_date) DO UPDATE SET
+  count = EXCLUDED.count,
+  start_date = EXCLUDED.start_date,
+  updated_at = NOW()
+`,
+  ]
+
+  // Statement 2 (latest window only): copy the count onto packages, touching only rows whose
+  // stored value actually differs.
+  if (mirrorToPackages) {
+    statements.push(`
+UPDATE packages p
+SET downloads_last_30d = s.downloads
+FROM ${PYPI_DOWNLOADS_30D_STAGING} s
+WHERE p.ecosystem = 'pypi'
+  AND ${PG_NAME_NORM} = s.project
+  AND p.downloads_last_30d IS DISTINCT FROM s.downloads
+`)
+  }
+  return statements
+}
+
+// Merge statement for daily downloads into downloads_daily, scoped to critical pypi packages.
+export function buildPypiDownloadsDailyMergeSql(): string {
+  // Takes no arguments: the day comes from each staging row (s.day). Staging projects are matched
+  // to packages on the normalized name, and an existing (package_id, date) row has its count
+  // overwritten.
+  return `
+INSERT INTO downloads_daily (package_id, date, count, created_at, updated_at)
+SELECT p.id, s.day, s.downloads, NOW(), NOW()
+FROM ${PYPI_DOWNLOADS_DAILY_STAGING} s
+JOIN packages p ON p.ecosystem = 'pypi' AND p.is_critical
+  AND ${PG_NAME_NORM} = s.project
+ON CONFLICT (package_id, date) DO UPDATE SET
+  count = EXCLUDED.count,
+  updated_at = NOW()
+`
+}
diff --git a/services/apps/packages_worker/src/deps-dev/schedules/pypiDownloads.ts b/services/apps/packages_worker/src/deps-dev/schedules/pypiDownloads.ts
new file mode 100644
index 0000000000..a9bfe8e254
--- /dev/null
+++ b/services/apps/packages_worker/src/deps-dev/schedules/pypiDownloads.ts
@@ -0,0 +1,63 @@
+import { ScheduleAlreadyRunning, ScheduleOverlapPolicy } from '@temporalio/client'
+
+import { svc } from '../../service'
+import { ingestPypiDownloadsDaily, ingestPypiDownloadsLast30d } from '../workflows'
+
+// Last-30d downloads for all pypi packages. Runs on the 4th of the month (06:00 UTC) — a few
+// days after the window's end (1st of the month) so the BigQuery partitions have settled. No
+// args → the workflow processes only the latest window and mirrors it onto packages.
+export async function schedulePypiDownloads30d(): Promise<void> {
+  const { temporal } = svc
+  if (!temporal) throw new Error('Temporal client not initialized')
+
+  // NOTE(review): catchupWindow is '1 hour' on a once-a-month cron — presumably a run missed by
+  // more than that is not retried until the following month; confirm this is intended.
+  try {
+    await temporal.schedule.create({
+      scheduleId: 'pypi-downloads-30d-monthly',
+      spec: { cronExpressions: ['0 6 4 * *'] },
+      policies: { overlap: ScheduleOverlapPolicy.SKIP, catchupWindow: '1 hour' },
+      action: {
+        type: 'startWorkflow',
+        workflowType: ingestPypiDownloadsLast30d,
+        taskQueue: 'bq-dataset-ingest',
+        workflowExecutionTimeout: '12 hours',
+        retry: { initialInterval: '1 minute', backoffCoefficient: 2, maximumAttempts: 3 },
+        args: [{}],
+      },
+    })
+  } catch (err) {
+    // An existing schedule is only logged, never updated: changing the spec above has no effect
+    // on a schedule that is already registered.
+    if (err instanceof ScheduleAlreadyRunning) {
+      svc.log.info('Schedule pypi-downloads-30d-monthly already registered.')
+    } else {
+      throw err
+    }
+  }
+}
+
+// Daily downloads for the critical pypi subset. Runs daily at 06:00 UTC; no args → the workflow
+// scans the self-healing 2-day trailing window.
+export async function schedulePypiDownloadsDaily(): Promise<void> {
+  const { temporal } = svc
+  if (!temporal) throw new Error('Temporal client not initialized')
+
+  try {
+    await temporal.schedule.create({
+      scheduleId: 'pypi-downloads-daily',
+      spec: { cronExpressions: ['0 6 * * *'] },
+      policies: { overlap: ScheduleOverlapPolicy.SKIP, catchupWindow: '1 hour' },
+      action: {
+        type: 'startWorkflow',
+        workflowType: ingestPypiDownloadsDaily,
+        taskQueue: 'bq-dataset-ingest',
+        workflowExecutionTimeout: '6 hours',
+        retry: { initialInterval: '1 minute', backoffCoefficient: 2, maximumAttempts: 3 },
+        args: [{}],
+      },
+    })
+  } catch (err) {
+    // An existing schedule is only logged, never updated.
+    if (err instanceof ScheduleAlreadyRunning) {
+      svc.log.info('Schedule pypi-downloads-daily already registered.')
+    } else {
+      throw err
+    }
+  }
+}
diff --git a/services/apps/packages_worker/src/deps-dev/workflows/index.ts b/services/apps/packages_worker/src/deps-dev/workflows/index.ts
index 8ef97c9e3f..e6d8d563f7 100644
--- a/services/apps/packages_worker/src/deps-dev/workflows/index.ts
+++
b/services/apps/packages_worker/src/deps-dev/workflows/index.ts
@@ -4,5 +4,6 @@ export * from './ingestAdvisories'
 export * from './ingestDependentCounts'
 export * from './ingestDependencies'
 export * from './ingestPackages'
+export * from './ingestPypiDownloads'
 export * from './ingestRepos'
 export * from './ingestVersions'
diff --git a/services/apps/packages_worker/src/deps-dev/workflows/ingestPypiDownloads.ts b/services/apps/packages_worker/src/deps-dev/workflows/ingestPypiDownloads.ts
new file mode 100644
index 0000000000..4ef1db5dc2
--- /dev/null
+++ b/services/apps/packages_worker/src/deps-dev/workflows/ingestPypiDownloads.ts
@@ -0,0 +1,212 @@
+import { proxyActivities, workflowInfo } from '@temporalio/workflow'
+
+import type * as depsDevActivities from '../activities'
+import {
+  computeLast30dWindows,
+  defaultDailyRange,
+  utcFirstOfCurrentMonth,
+} from '../queries/pypiDownloadsDates'
+import {
+  PYPI_DOWNLOADS_30D_KIND,
+  PYPI_DOWNLOADS_30D_STAGING,
+  PYPI_DOWNLOADS_DAILY_KIND,
+  PYPI_DOWNLOADS_DAILY_STAGING,
+  buildPypiDownloads30dMergeSql,
+  buildPypiDownloads30dSql,
+  buildPypiDownloadsDailyMergeSql,
+  buildPypiDownloadsDailySql,
+} from '../queries/pypiDownloadsSql'
+
+// Activity proxies, one per activity, each with its own timeout and retry budget. The
+// type argument gives every proxy the signature of the matching activity.
+
+// BigQuery export to GCS: up to 3 attempts with exponential backoff starting at 1 minute.
+const { bqExportToGcs } = proxyActivities<typeof depsDevActivities>({
+  startToCloseTimeout: '1 hour',
+  retry: { maximumAttempts: 3, initialInterval: '1 minute', backoffCoefficient: 2 },
+})
+
+const { listParquetFiles } = proxyActivities<typeof depsDevActivities>({
+  startToCloseTimeout: '5 minutes',
+  retry: { maximumAttempts: 3 },
+})
+
+// Staging load: a 2-minute heartbeat timeout on top of the 2-hour run limit.
+const { gcsParquetToStaging } = proxyActivities<typeof depsDevActivities>({
+  startToCloseTimeout: '2 hours',
+  heartbeatTimeout: '2 minutes',
+  retry: { maximumAttempts: 2 },
+})
+
+// Merge: a single attempt, no activity-level retry.
+const { mergeStagingToTable } = proxyActivities<typeof depsDevActivities>({
+  startToCloseTimeout: '1 hour',
+  retry: { maximumAttempts: 1 },
+})
+
+const { getCriticalPypiCount } = proxyActivities<typeof depsDevActivities>({
+  startToCloseTimeout: '1 minute',
+  retry: { maximumAttempts: 3 },
+})
+
+// Per-window 30-day scans are multi-terabyte (measured below); daily scans are far smaller but a wide
+// backfill multiplies by the day count. Defaults guard against runaway scans and are overridable
+// per kind via BQ_DATASET_INGEST_PYPI_DOWNLOADS_{30D,DAILY}_MAX_BQ_GB.
+// A single 30d window scans ~31 day-partitions; measured at ~4.56 TB (≈147 GB/day averaged over
+// a month — weekdays are heavier than the weekend sample). Ceiling sits above that with headroom;
+// raise it if a future month exceeds it. Daily scans its 2-day trailing window (~300 GB).
+const MAX_BYTES_GB_30D = 6000
+const MAX_BYTES_GB_DAILY = 2000
+
+// Target number of staged rows per load/merge chunk (see loadAndMerge).
+const ROWS_PER_CHUNK = 1_000_000
+
+const STAGING_DDL_30D = `CREATE UNLOGGED TABLE IF NOT EXISTS ${PYPI_DOWNLOADS_30D_STAGING} (
+  project text,
+  downloads bigint
+)`
+const PG_COLUMNS_30D = ['project', 'downloads']
+
+const STAGING_DDL_DAILY = `CREATE UNLOGGED TABLE IF NOT EXISTS ${PYPI_DOWNLOADS_DAILY_STAGING} (
+  project text,
+  day date,
+  downloads bigint
+)`
+const PG_COLUMNS_DAILY = ['project', 'day', 'downloads']
+
+// Shared GCS-parquet → staging → merge driver: chunk the export files so a single staging load
+// stays bounded, merging each chunk into the target table(s). Mirrors ingestPackages/ingestDependentCounts.
+// With no export files it still issues one final, empty merge call for the job and returns.
+async function loadAndMerge(params: {
+  jobId: number
+  gcsPrefix: string
+  stagingTable: string
+  stagingDdl: string
+  pgColumns: string[]
+  mergeSql: string | string[]
+  tableNames: string | string[]
+}): Promise<void> {
+  const { fileNames, rowCounts } = await listParquetFiles({ gcsPrefix: params.gcsPrefix })
+  const totalFiles = fileNames.length
+
+  if (totalFiles === 0) {
+    await mergeStagingToTable({ jobId: params.jobId, mergeSql: [], tableNames: [], isFinal: true })
+    return
+  }
+
+  // Size chunks so each holds roughly ROWS_PER_CHUNK rows, assuming rows are spread evenly
+  // across files; when the row counts sum to 0, fall back to at most 2 files per chunk.
+  const totalRows = rowCounts.reduce((a, b) => a + b, 0)
+  const filesPerChunk =
+    totalRows > 0
+      ? Math.max(1, Math.round((ROWS_PER_CHUNK * fileNames.length) / totalRows))
+      : Math.min(fileNames.length, 2)
+  const totalChunks = Math.ceil(fileNames.length / filesPerChunk)
+  // Running totals handed to each activity call so it receives the totals of the earlier chunks.
+  let priorRowsAffected = 0
+  let priorStagingRows = 0
+  const priorTableRowCounts: Record<string, number> = {}
+
+  for (let chunkIndex = 0; chunkIndex < totalChunks; chunkIndex++) {
+    const start = chunkIndex * filesPerChunk
+    const chunk = fileNames.slice(start, start + filesPerChunk)
+    const isFinal = chunkIndex === totalChunks - 1
+
+    const { rowsLoaded } = await gcsParquetToStaging({
+      jobId: params.jobId,
+      stagingTable: params.stagingTable,
+      stagingDdl: params.stagingDdl,
+      pgColumns: params.pgColumns,
+      fileNames: chunk,
+      filesOffset: start,
+      totalFiles,
+      priorStagingRows,
+    })
+    priorStagingRows += rowsLoaded
+
+    const { rowsAffected, tableRowCounts } = await mergeStagingToTable({
+      jobId: params.jobId,
+      mergeSql: params.mergeSql,
+      tableNames: params.tableNames,
+      isFinal,
+      priorRowsAffected,
+      priorTableRowCounts,
+      chunkInfo: { index: chunkIndex, total: totalChunks },
+    })
+
+    priorRowsAffected += rowsAffected
+    if (!isFinal) {
+      for (const [k, v] of Object.entries(tableRowCounts)) {
+        priorTableRowCounts[k] = (priorTableRowCounts[k] ?? 0) + v
+      }
+    }
+  }
+}
+
+// Last-30-day downloads for ALL pypi packages. Scheduled monthly with no fromDate → only the
+// latest window (mirrored onto packages.downloads_last_30d). Pass fromDate to backfill every
+// monthly 30-day bucket from that date up to the current one.
+export async function ingestPypiDownloadsLast30d(opts: { fromDate?: string }): Promise<void> {
+  // "Today" is taken from the workflow's start time, not from a wall-clock read.
+  const start = workflowInfo().startTime
+  const baseRunId = start.toISOString().replace(/[:.]/g, '-')
+  const today = start.toISOString().slice(0, 10)
+  const upperEndDate = utcFirstOfCurrentMonth(today)
+
+  const windows = computeLast30dWindows(opts.fromDate ?? null, upperEndDate)
+
+  // Windows run one after another; each gets its own export, keyed by the window's end date.
+  for (const window of windows) {
+    const exportResult = await bqExportToGcs({
+      jobKind: PYPI_DOWNLOADS_30D_KIND,
+      sql: buildPypiDownloads30dSql({ startDate: window.start, endDate: window.end }),
+      runId: `${baseRunId}-${window.end}`,
+      syncMode: 'full',
+      snapshotAt: window.end,
+      maxBytesGb: MAX_BYTES_GB_30D,
+    })
+
+    await loadAndMerge({
+      jobId: exportResult.jobId,
+      gcsPrefix: exportResult.gcsPrefix,
+      stagingTable: PYPI_DOWNLOADS_30D_STAGING,
+      stagingDdl: STAGING_DDL_30D,
+      pgColumns: PG_COLUMNS_30D,
+      mergeSql: buildPypiDownloads30dMergeSql({
+        startDate: window.start,
+        endDate: window.end,
+        mirrorToPackages: window.isLatest,
+      }),
+      tableNames: window.isLatest ? ['downloads_last_30d', 'packages'] : 'downloads_last_30d',
+    })
+  }
+}
+
+// Daily downloads for the critical pypi subset. Scheduled daily with no range → the last 2-day
+// self-healing window. Pass an explicit startDate/endDate to backfill an arbitrary range.
+export async function ingestPypiDownloadsDaily(opts: {
+  startDate?: string
+  endDate?: string
+}): Promise<void> {
+  const start = workflowInfo().startTime
+  const runId = start.toISOString().replace(/[:.]/g, '-')
+  const today = start.toISOString().slice(0, 10)
+
+  // An explicit range is honoured only when BOTH bounds are given; with just one of them the
+  // pair is ignored and the default trailing window is used.
+  const range =
+    opts.startDate && opts.endDate
+      ? { startDate: opts.startDate, endDate: opts.endDate }
+      : defaultDailyRange(today)
+
+  const { count } = await getCriticalPypiCount()
+  // Nothing to ingest — skip the (full-partition) BQ scan that would merge zero rows.
+  if (count === 0) return
+
+  const exportResult = await bqExportToGcs({
+    jobKind: PYPI_DOWNLOADS_DAILY_KIND,
+    sql: buildPypiDownloadsDailySql({
+      startDate: range.startDate,
+      endDate: range.endDate,
+    }),
+    runId,
+    syncMode: 'full',
+    snapshotAt: range.endDate,
+    maxBytesGb: MAX_BYTES_GB_DAILY,
+  })
+
+  await loadAndMerge({
+    jobId: exportResult.jobId,
+    gcsPrefix: exportResult.gcsPrefix,
+    stagingTable: PYPI_DOWNLOADS_DAILY_STAGING,
+    stagingDdl: STAGING_DDL_DAILY,
+    pgColumns: PG_COLUMNS_DAILY,
+    mergeSql: buildPypiDownloadsDailyMergeSql(),
+    tableNames: 'downloads_daily',
+  })
+}
diff --git a/services/apps/packages_worker/src/npm/activities.ts b/services/apps/packages_worker/src/npm/activities.ts
index dabc8b3fdd..4cdb71566d 100644
--- a/services/apps/packages_worker/src/npm/activities.ts
+++ b/services/apps/packages_worker/src/npm/activities.ts
@@ -26,6 +26,7 @@ import { getServiceChildLogger } from '@crowd/logging'
 
 import { getPackagesDb } from '../db'
 import { proxyUrl } from '../proxies'
+import { isClientError } from '../utils/isClientError'
 
 import { NPM_EARLIEST, computeChunks } from './downloadGaps'
 import { fetchChangesSince, fetchCurrentSeq } from './fetchChanges'
@@ -95,13 +96,6 @@ export async function commitNpmChangesSeq(lastSeq: string): Promise {
   await setNpmChangesLastSeq(qx, lastSeq)
 }
 
-// 4xx (404 or any other client error like 405 from a malformed/illegal npm name —
-// e.g. deps.dev dependency-chain strings that leaked into `packages`). 429 is
-// excluded — it's transient and handled by the slow exponential path.
-function isClientError(code: number | undefined, kind: string): boolean {
-  return kind === 'NOT_FOUND' || (code !== undefined && code >= 400 && code < 500 && code !== 429)
-}
-
 // 4xx errors get a few quick in-lane retries with a small linear backoff (1s, 2s),
 // then the package is given up on and marked scanned.
429/5xx/network errors are NOT // handled here — they throw and ride Temporal's exponential activity-retry instead. diff --git a/services/apps/packages_worker/src/npm/normalize.ts b/services/apps/packages_worker/src/npm/normalize.ts index 8b2f2eec96..4be34c8710 100644 --- a/services/apps/packages_worker/src/npm/normalize.ts +++ b/services/apps/packages_worker/src/npm/normalize.ts @@ -13,27 +13,6 @@ export function parseNpmName(raw: string): { namespace: string | null; name: str return { namespace: null, name: raw } } -// Postgres text columns cannot store NUL (U+0000); npm packuments occasionally -// carry them (e.g. mojibake descriptions). Strip them in place from every string -// in the packument before persisting — otherwise the inlined value breaks the -// PostgreSQL wire protocol ("invalid message format"). -export function stripNullBytesDeep(value: T): T { - if (typeof value === 'string') { - // eslint-disable-next-line no-control-regex - return value.replace(/\u0000/g, '') as T - } - if (Array.isArray(value)) { - for (let i = 0; i < value.length; i++) value[i] = stripNullBytesDeep(value[i]) - return value - } - if (value !== null && typeof value === 'object') { - const obj = value as Record - for (const k of Object.keys(obj)) obj[k] = stripNullBytesDeep(obj[k]) - return value - } - return value -} - export function normalizeLicenses(packument: Packument): string[] { const rawArr = packument.licenses if (rawArr && Array.isArray(rawArr)) { diff --git a/services/apps/packages_worker/src/npm/upsertPackage.ts b/services/apps/packages_worker/src/npm/upsertPackage.ts index 13cfa7a851..cfc922fdcf 100644 --- a/services/apps/packages_worker/src/npm/upsertPackage.ts +++ b/services/apps/packages_worker/src/npm/upsertPackage.ts @@ -8,13 +8,14 @@ import { } from '@crowd/data-access-layer/src/packages' import type { QueryExecutor } from '@crowd/data-access-layer/src/queryExecutor' +import { stripNullBytesDeep } from '../utils/stripNullBytesDeep' + import { 
collectMaintainers, extractRepo, isPrerelease, normalizeLicenses, parseNpmName, - stripNullBytesDeep, versionLicense, } from './normalize' import type { FundingEntry, Packument } from './types' diff --git a/services/apps/packages_worker/src/pypi/__tests__/fetchProject.test.ts b/services/apps/packages_worker/src/pypi/__tests__/fetchProject.test.ts new file mode 100644 index 0000000000..c181e93b23 --- /dev/null +++ b/services/apps/packages_worker/src/pypi/__tests__/fetchProject.test.ts @@ -0,0 +1,79 @@ +import { afterEach, describe, expect, it, vi } from 'vitest' + +import { fetchProject } from '../fetchProject' + +// Minimal Response stand-in for the global fetch mock. +function fakeResponse(status: number, body?: unknown, jsonThrows = false): Response { + return { + status, + ok: status >= 200 && status < 300, + json: async () => { + if (jsonThrows) throw new Error('bad json') + return body + }, + } as unknown as Response +} + +const validProject = { info: { name: 'flask' }, releases: {} } + +afterEach(() => { + vi.unstubAllGlobals() + vi.useRealTimers() +}) + +describe('fetchProject status → FetchError kind mapping', () => { + it('returns the parsed project on 200 with a valid shape', async () => { + vi.stubGlobal('fetch', vi.fn().mockResolvedValue(fakeResponse(200, validProject))) + const result = await fetchProject('flask') + expect(result).toEqual(validProject) + }) + + it('maps 404 → NOT_FOUND (skippable)', async () => { + vi.stubGlobal('fetch', vi.fn().mockResolvedValue(fakeResponse(404))) + expect(await fetchProject('nope')).toMatchObject({ kind: 'NOT_FOUND', statusCode: 404 }) + }) + + it('maps 429 → RATE_LIMIT (transient, distinct from generic 4xx)', async () => { + vi.stubGlobal('fetch', vi.fn().mockResolvedValue(fakeResponse(429))) + expect(await fetchProject('busy')).toMatchObject({ kind: 'RATE_LIMIT', statusCode: 429 }) + }) + + it('maps other non-ok statuses (5xx) → TRANSIENT with the status code', async () => { + vi.stubGlobal('fetch', 
vi.fn().mockResolvedValue(fakeResponse(503))) + expect(await fetchProject('flaky')).toMatchObject({ kind: 'TRANSIENT', statusCode: 503 }) + }) + + it('maps a network/fetch rejection → TRANSIENT (no status code)', async () => { + vi.stubGlobal('fetch', vi.fn().mockRejectedValue(new Error('ECONNRESET'))) + const result = await fetchProject('flask') + expect(result).toMatchObject({ kind: 'TRANSIENT' }) + expect((result as { statusCode?: number }).statusCode).toBeUndefined() + }) + + it('maps an unparseable body → MALFORMED', async () => { + vi.stubGlobal('fetch', vi.fn().mockResolvedValue(fakeResponse(200, undefined, true))) + expect(await fetchProject('flask')).toMatchObject({ kind: 'MALFORMED' }) + }) + + it('maps an unexpected JSON shape → MALFORMED', async () => { + vi.stubGlobal('fetch', vi.fn().mockResolvedValue(fakeResponse(200, { not: 'a project' }))) + expect(await fetchProject('flask')).toMatchObject({ kind: 'MALFORMED' }) + }) + + it('maps a body read aborted by the 30s timeout → TRANSIENT (not MALFORMED)', async () => { + vi.useFakeTimers() + vi.stubGlobal( + 'fetch', + vi.fn().mockResolvedValue({ + status: 200, + ok: true, + // body never settles before the 30s abort fires, then rejects (as an aborted read would) + json: () => + new Promise((_resolve, reject) => setTimeout(() => reject(new Error('aborted')), 40_000)), + } as unknown as Response), + ) + const p = fetchProject('flask') + await vi.advanceTimersByTimeAsync(41_000) // 30s abort fires first, then the json read rejects + expect(await p).toMatchObject({ kind: 'TRANSIENT' }) + }) +}) diff --git a/services/apps/packages_worker/src/pypi/__tests__/ingest.test.ts b/services/apps/packages_worker/src/pypi/__tests__/ingest.test.ts new file mode 100644 index 0000000000..083ac9c7f9 --- /dev/null +++ b/services/apps/packages_worker/src/pypi/__tests__/ingest.test.ts @@ -0,0 +1,170 @@ +import { afterEach, beforeEach, describe, expect, it, vi } from 'vitest' + +import { logAuditFieldChanges, 
markPypiPackageScanned } from '@crowd/data-access-layer/src/packages' +import type { QueryExecutor } from '@crowd/data-access-layer/src/queryExecutor' + +import { isClientError } from '../../utils/isClientError' +import { ingestOne, ingestPurlsWithGiveUp } from '../activities' +import { fetchProject } from '../fetchProject' +import { INGEST_MAX_ATTEMPTS } from '../retryPolicy' +import { upsertProject } from '../upsertProject' + +// Mock the heavy collaborators so we can exercise the pure classification/retry/give-up logic. +vi.mock('../fetchProject', () => ({ fetchProject: vi.fn() })) +vi.mock('../upsertProject', () => ({ upsertProject: vi.fn() })) +vi.mock('@crowd/data-access-layer/src/packages', () => ({ + getUnscannedPypiPurls: vi.fn(), + logAuditFieldChanges: vi.fn(), + markPypiPackageScanned: vi.fn(), +})) + +const mockFetch = vi.mocked(fetchProject) +const mockUpsert = vi.mocked(upsertProject) +const mockMarkScanned = vi.mocked(markPypiPackageScanned) +const mockAudit = vi.mocked(logAuditFieldChanges) + +const qx = {} as QueryExecutor + +beforeEach(() => { + vi.clearAllMocks() +}) + +// ── isClientError: which (statusCode, kind) combinations are "give up immediately" 4xx ────── +describe('isClientError', () => { + it('treats NOT_FOUND and any non-429 4xx as a client error', () => { + expect(isClientError(404, 'NOT_FOUND')).toBe(true) + expect(isClientError(undefined, 'NOT_FOUND')).toBe(true) + expect(isClientError(403, 'TRANSIENT')).toBe(true) // 403 leaked in as TRANSIENT kind + expect(isClientError(400, 'MALFORMED')).toBe(true) + }) + + it('does NOT treat 429 or 5xx/transient as a client error', () => { + expect(isClientError(429, 'RATE_LIMIT')).toBe(false) + expect(isClientError(500, 'TRANSIENT')).toBe(false) + expect(isClientError(undefined, 'TRANSIENT')).toBe(false) + expect(isClientError(200, 'OK')).toBe(false) + }) +}) + +// ── ingestOne: success marks scanned; 4xx gives up after retries; transient throws ────────── +describe('ingestOne', () => { + 
it('on success upserts and marks the package scanned with the attempt count', async () => { + mockFetch.mockResolvedValue({ info: { name: 'flask' } } as never) + mockUpsert.mockResolvedValue({ + purl: 'pkg:pypi/flask', + changedFields: ['description'], + } as never) + + await ingestOne(qx, 'pkg:pypi/flask') + + expect(mockAudit).toHaveBeenCalledWith(qx, 'pypi', 'pkg:pypi/flask', ['description']) + expect(mockMarkScanned).toHaveBeenCalledWith( + qx, + 'pkg:pypi/flask', + expect.objectContaining({ status: 'success', attempts: 1 }), + ) + }) + + it('throws on a transient (5xx) result WITHOUT marking the package scanned', async () => { + mockFetch.mockResolvedValue({ + kind: 'TRANSIENT', + statusCode: 500, + message: 'HTTP 500', + } as never) + + await expect(ingestOne(qx, 'pkg:pypi/flask')).rejects.toThrow() + expect(mockMarkScanned).not.toHaveBeenCalled() + expect(mockFetch).toHaveBeenCalledTimes(1) + }) + + it('gives up on a persistent 4xx after the fast retries and marks it scanned(error)', async () => { + vi.useFakeTimers() + mockFetch.mockResolvedValue({ + kind: 'NOT_FOUND', + statusCode: 404, + message: 'not found', + } as never) + + const p = ingestOne(qx, 'pkg:pypi/ghost') + await vi.runAllTimersAsync() + await p + + expect(mockFetch).toHaveBeenCalledTimes(3) + expect(mockMarkScanned).toHaveBeenCalledWith( + qx, + 'pkg:pypi/ghost', + expect.objectContaining({ + status: 'error', + attempts: 3, + httpStatus: 404, + errorKind: 'NOT_FOUND', + }), + ) + vi.useRealTimers() + }) + + it('gives up on a persistent MALFORMED body after retries and marks it scanned(error)', async () => { + vi.useFakeTimers() + mockFetch.mockResolvedValue({ kind: 'MALFORMED', message: 'unexpected shape' } as never) + + const p = ingestOne(qx, 'pkg:pypi/weird') + await vi.runAllTimersAsync() + await p + + expect(mockFetch).toHaveBeenCalledTimes(3) + expect(mockMarkScanned).toHaveBeenCalledWith( + qx, + 'pkg:pypi/weird', + expect.objectContaining({ status: 'error', attempts: 3, errorKind: 
'MALFORMED' }), + ) + vi.useRealTimers() + }) + + it('throws on a 429/RATE_LIMIT result without marking scanned (rides Temporal retry)', async () => { + mockFetch.mockResolvedValue({ + kind: 'RATE_LIMIT', + statusCode: 429, + message: 'rate limited', + } as never) + + await expect(ingestOne(qx, 'pkg:pypi/flask')).rejects.toThrow() + expect(mockMarkScanned).not.toHaveBeenCalled() + expect(mockFetch).toHaveBeenCalledTimes(1) + }) +}) + +// ── ingestPurlsWithGiveUp: one persistently-failing package must not stall the cursor ─────── +describe('ingestPurlsWithGiveUp', () => { + const purls = ['pkg:pypi/a', 'pkg:pypi/bad', 'pkg:pypi/c'] + const makeIngest = () => + vi.fn((purl: string) => + purl === 'pkg:pypi/bad' ? Promise.reject(new Error('transient')) : Promise.resolve(), + ) + + it('rethrows (and aborts the batch) while Temporal retries remain', async () => { + const ingest = makeIngest() + await expect(ingestPurlsWithGiveUp(qx, purls, 1, ingest)).rejects.toThrow() + // aborted at 'bad' — the package after it is never reached, and nothing is given up + expect(ingest).toHaveBeenCalledTimes(2) + expect(mockMarkScanned).not.toHaveBeenCalled() + }) + + it('on the final attempt gives up on the bad package and continues past it', async () => { + const ingest = makeIngest() + await expect( + ingestPurlsWithGiveUp(qx, purls, INGEST_MAX_ATTEMPTS, ingest), + ).resolves.toBeUndefined() + // every package attempted, including the one after the failure + expect(ingest).toHaveBeenCalledTimes(3) + expect(mockMarkScanned).toHaveBeenCalledTimes(1) + expect(mockMarkScanned).toHaveBeenCalledWith( + qx, + 'pkg:pypi/bad', + expect.objectContaining({ status: 'error' }), + ) + }) +}) + +afterEach(() => { + vi.useRealTimers() +}) diff --git a/services/apps/packages_worker/src/pypi/__tests__/normalize.test.ts b/services/apps/packages_worker/src/pypi/__tests__/normalize.test.ts index 42e496f8a4..18c0783af0 100644 --- a/services/apps/packages_worker/src/pypi/__tests__/normalize.test.ts +++ 
b/services/apps/packages_worker/src/pypi/__tests__/normalize.test.ts @@ -1,15 +1,15 @@ import { describe, expect, it } from 'vitest' import { + buildVersionRows, classifyProjectUrls, collectPypiMaintainers, isPypiPrerelease, parseKeywords, pypiNameFromPurl, resolvePypiLicenses, - stripNullBytesDeep, } from '../normalize' -import type { PyPiInfo } from '../types' +import type { PyPiInfo, PyPiReleaseFile } from '../types' function info(partial: Partial): PyPiInfo { return { name: 'demo', ...partial } @@ -26,12 +26,64 @@ describe('pypiNameFromPurl', () => { }) }) -describe('stripNullBytesDeep', () => { - it('removes NUL bytes from nested strings', () => { - const nul = String.fromCharCode(0) - const v = stripNullBytesDeep({ a: `x${nul}y`, b: [`p${nul}`, 'q'] }) - expect(v.a).toBe('xy') - expect(v.b).toEqual(['p', 'q']) +describe('buildVersionRows', () => { + const file = (partial: Partial): PyPiReleaseFile => ({ ...partial }) + + it('skips releases whose files were all deleted (empty array)', () => { + const { versionRows, firstReleaseAt, latestReleaseAt } = buildVersionRows( + { + '1.0': [file({ upload_time_iso_8601: '2020-01-01T00:00:00Z' })], + '1.1': [], // all files deleted + }, + '1.1', + 'MIT', + ) + expect(versionRows.map((r) => r.number)).toEqual(['1.0']) + expect(firstReleaseAt).toBe('2020-01-01T00:00:00Z') + expect(latestReleaseAt).toBe('2020-01-01T00:00:00Z') + }) + + it('marks a release yanked only when every file is yanked', () => { + const { versionRows } = buildVersionRows( + { + '1.0': [file({ yanked: true }), file({ yanked: false })], + '2.0': [file({ yanked: true }), file({ yanked: true })], + }, + '2.0', + null, + ) + const byNumber = Object.fromEntries(versionRows.map((r) => [r.number, r])) + expect(byNumber['1.0'].isYanked).toBe(false) + expect(byNumber['2.0'].isYanked).toBe(true) + }) + + it('flags isLatest, derives publishedAt from the earliest file, and spans first/latest release', () => { + const { versionRows, firstReleaseAt, latestReleaseAt 
} = buildVersionRows( + { + '1.0': [ + file({ upload_time_iso_8601: '2021-06-01T00:00:00Z' }), + file({ upload_time_iso_8601: '2021-05-01T00:00:00Z' }), + ], + '2.0': [file({ upload_time_iso_8601: '2022-01-01T00:00:00Z' })], + }, + '2.0', + 'Apache-2.0', + ) + const byNumber = Object.fromEntries(versionRows.map((r) => [r.number, r])) + expect(byNumber['1.0'].publishedAt).toBe('2021-05-01T00:00:00Z') // earliest file + expect(byNumber['1.0'].isLatest).toBe(false) + expect(byNumber['2.0'].isLatest).toBe(true) + expect(byNumber['2.0'].license).toBe('Apache-2.0') + expect(firstReleaseAt).toBe('2021-05-01T00:00:00Z') + expect(latestReleaseAt).toBe('2022-01-01T00:00:00Z') + }) + + it('returns empty rows and null dates for no releases', () => { + expect(buildVersionRows({}, null, null)).toEqual({ + versionRows: [], + firstReleaseAt: null, + latestReleaseAt: null, + }) }) }) @@ -159,6 +211,16 @@ describe('collectPypiMaintainers', () => { ]) }) + it('keeps surplus names when there are fewer emails than names', () => { + const people = collectPypiMaintainers( + info({ author: 'Alice Smith, Bob Jones', author_email: 'alice@x.com' }), + ) + expect(people).toEqual([ + { username: 'Alice Smith', displayName: 'Alice Smith', email: 'alice@x.com', role: 'author' }, + { username: 'Bob Jones', displayName: 'Bob Jones', email: null, role: 'author' }, + ]) + }) + it('returns nothing when author/maintainer are absent', () => { expect(collectPypiMaintainers(info({}))).toEqual([]) }) diff --git a/services/apps/packages_worker/src/pypi/activities.ts b/services/apps/packages_worker/src/pypi/activities.ts index 9340b1d95d..58429d9610 100644 --- a/services/apps/packages_worker/src/pypi/activities.ts +++ b/services/apps/packages_worker/src/pypi/activities.ts @@ -1,4 +1,5 @@ -import { ProxyAgent, type Dispatcher } from 'undici' +import { Context } from '@temporalio/activity' +import { type Dispatcher, ProxyAgent } from 'undici' import { getUnscannedPypiPurls, @@ -10,10 +11,12 @@ import { 
getServiceChildLogger } from '@crowd/logging' import { getPackagesDb } from '../db' import { proxyUrl } from '../proxies' +import { isClientError } from '../utils/isClientError' import { fetchProject } from './fetchProject' import { pypiNameFromPurl } from './normalize' import { pypiProxyPool } from './proxies' +import { INGEST_MAX_ATTEMPTS } from './retryPolicy' import { isFetchError } from './types' import { upsertProject } from './upsertProject' @@ -21,12 +24,6 @@ const log = getServiceChildLogger('pypi') const WORKER = 'pypi' -// 4xx (404 or any other client error like a malformed/illegal project name that -// leaked into `packages`). 429 is excluded — it's transient and rides the slow path. -function isClientError(code: number | undefined, kind: string): boolean { - return kind === 'NOT_FOUND' || (code !== undefined && code >= 400 && code < 500 && code !== 429) -} - // 4xx/malformed get a few quick in-lane retries with a small linear backoff, then the // package is given up on and marked scanned. 429/5xx/network throw and ride Temporal's // exponential activity-retry instead. @@ -70,7 +67,7 @@ function sleep(ms: number): Promise { // Fully enrich a single package. `purl` is the source-of-truth identifier from the // packages row; the PyPI project name (for the HTTP fetch) is derived from it. -async function ingestOne( +export async function ingestOne( qx: QueryExecutor, purl: string, dispatcher?: Dispatcher, @@ -109,6 +106,37 @@ async function ingestOne( } } +// Process purls sequentially. On a transient throw, rethrow so Temporal retries the whole +// batch — UNTIL those retries are exhausted (attempt >= INGEST_MAX_ATTEMPTS), after which the +// one offending package is marked scanned-error and the loop continues, so a single +// persistently-failing package can't stall the keyset cursor for everything after it. 
+export async function ingestPurlsWithGiveUp( + qx: QueryExecutor, + purls: string[], + attempt: number, + ingest: (purl: string, index: number) => Promise, +): Promise { + let i = 0 + for (const purl of purls) { + const index = i++ + try { + await ingest(purl, index) + } catch (err) { + // Retry via Temporal while attempts remain; then mark scanned-error and continue. + if (attempt < INGEST_MAX_ATTEMPTS) throw err + log.warn( + { purl, attempt, err: String(err) }, + 'pypi transient failure after max attempts — marking scanned(error) and continuing', + ) + await markPypiPackageScanned(qx, purl, { + status: 'error', + attempts: attempt, + message: String(err), + }) + } + } +} + export async function getUnscannedPypiBatch( afterPurl: string, batchSize: number, @@ -135,14 +163,15 @@ export async function ingestPypiPackageBatch(purls: string[]): Promise { // configured proxy pool per package so traffic spreads over all IPs; when disabled the // pool is empty and `dispatcher` stays undefined (direct egress). One ProxyAgent per // proxy, reused for the whole batch and closed at the end. + const attempt = Context.current().info.attempt + const agents = pypiProxyPool().map((p) => new ProxyAgent(proxyUrl(p))) try { - let i = 0 - for (const purl of purls) { + await ingestPurlsWithGiveUp(qx, purls, attempt, async (purl, i) => { await sleep(ingestSleepMs()) - const dispatcher = agents.length ? agents[i++ % agents.length] : undefined + const dispatcher = agents.length ? 
agents[i % agents.length] : undefined await ingestOne(qx, purl, dispatcher) - } + }) } finally { await Promise.all(agents.map((a) => a.close())) } diff --git a/services/apps/packages_worker/src/pypi/fetchProject.ts b/services/apps/packages_worker/src/pypi/fetchProject.ts index 97c0aab2e5..74fb7e78fa 100644 --- a/services/apps/packages_worker/src/pypi/fetchProject.ts +++ b/services/apps/packages_worker/src/pypi/fetchProject.ts @@ -18,39 +18,45 @@ export async function fetchProject( ): Promise { const url = `${REGISTRY}/${encodeURIComponent(name)}/json` const abort = new AbortController() + // The 30s timer must cover the body read too, not just the headers — clear it only in the + // finally below, once the whole response (including res.json()) is done. const timer = setTimeout(() => abort.abort(), 30_000) - let res: Response try { - // `dispatcher` is an undici-specific fetch option not present in the DOM RequestInit type. - const init: RequestInit & { dispatcher?: Dispatcher } = { - headers: { - Accept: 'application/json', - 'User-Agent': USER_AGENT, - }, - signal: abort.signal, + let res: Response + try { + // `dispatcher` is an undici-specific fetch option not present in the DOM RequestInit type. 
+ const init: RequestInit & { dispatcher?: Dispatcher } = { + headers: { + Accept: 'application/json', + 'User-Agent': USER_AGENT, + }, + signal: abort.signal, + } + if (dispatcher) init.dispatcher = dispatcher + res = await fetch(url, init as RequestInit) + } catch (err) { + return { kind: 'TRANSIENT', message: String(err) } } - if (dispatcher) init.dispatcher = dispatcher - res = await fetch(url, init as RequestInit) - } catch (err) { - return { kind: 'TRANSIENT', message: String(err) } - } finally { - clearTimeout(timer) - } - if (res.status === 404) - return { kind: 'NOT_FOUND', message: `${name} not found`, statusCode: 404 } - if (res.status === 429) return { kind: 'RATE_LIMIT', message: 'rate limited', statusCode: 429 } - if (!res.ok) return { kind: 'TRANSIENT', message: `HTTP ${res.status}`, statusCode: res.status } + if (res.status === 404) + return { kind: 'NOT_FOUND', message: `${name} not found`, statusCode: 404 } + if (res.status === 429) return { kind: 'RATE_LIMIT', message: 'rate limited', statusCode: 429 } + if (!res.ok) return { kind: 'TRANSIENT', message: `HTTP ${res.status}`, statusCode: res.status } - let json: unknown - try { - json = await res.json() - } catch { - return { kind: 'MALFORMED', message: 'invalid JSON' } - } + let json: unknown + try { + json = await res.json() + } catch { + // A body that stalls past the timeout aborts here — that's transient (retry), not malformed. 
+ if (abort.signal.aborted) return { kind: 'TRANSIENT', message: 'body read timed out' } + return { kind: 'MALFORMED', message: 'invalid JSON' } + } - if (!isPyPiProject(json)) return { kind: 'MALFORMED', message: 'unexpected shape' } - return json + if (!isPyPiProject(json)) return { kind: 'MALFORMED', message: 'unexpected shape' } + return json + } finally { + clearTimeout(timer) + } } function isPyPiProject(v: unknown): v is PyPiProject { diff --git a/services/apps/packages_worker/src/pypi/normalize.ts b/services/apps/packages_worker/src/pypi/normalize.ts index c82709d1f2..2ae91e29ba 100644 --- a/services/apps/packages_worker/src/pypi/normalize.ts +++ b/services/apps/packages_worker/src/pypi/normalize.ts @@ -1,11 +1,7 @@ -import type { PyPiInfo } from './types' +import type { PyPiInfo, PyPiReleaseFile } from './types' const PURL_PYPI_PREFIX = 'pkg:pypi/' -// Postgres text columns cannot store NUL (U+0000). Built at runtime so there is no -// NUL literal in the source. -const NUL_GLOBAL = new RegExp(String.fromCharCode(0), 'g') - // The PyPI project name from a purl. PyPI purls are `pkg:pypi/` (no namespace); // the name segment is percent-encoded per the purl spec, so decode it to get the // registry name used by the JSON API. @@ -13,24 +9,6 @@ export function pypiNameFromPurl(purl: string): string { return decodeURIComponent(purl.slice(PURL_PYPI_PREFIX.length)) } -// Strip NUL bytes in place from every string before persisting — otherwise the -// inlined value breaks the PostgreSQL wire protocol ("invalid message format"). 
-export function stripNullBytesDeep(value: T): T { - if (typeof value === 'string') { - return value.replace(NUL_GLOBAL, '') as T - } - if (Array.isArray(value)) { - for (let i = 0; i < value.length; i++) value[i] = stripNullBytesDeep(value[i]) - return value - } - if (value !== null && typeof value === 'object') { - const obj = value as Record - for (const k of Object.keys(obj)) obj[k] = stripNullBytesDeep(obj[k]) - return value - } - return value -} - function blankToNull(s: string | null | undefined): string | null { if (s == null) return null const t = s.trim() @@ -174,14 +152,13 @@ function peopleForRole( const nameParts = nameField ? splitList(nameField) : [] const raw: Array<{ name: string | null; email: string | null }> = [] - if (emailParts.length) { - // Modern packages put "Name " (often several, comma-separated) in *_email. - emailParts.forEach((part, i) => { - const pe = parseNameEmail(part) - raw.push({ name: pe.name ?? nameParts[i] ?? null, email: pe.email }) - }) - } else if (nameParts.length) { - nameParts.forEach((n) => raw.push({ name: blankToNull(n), email: null })) + // Modern packages put "Name " (often several, comma-separated) in *_email. Pair names + // and email entries by index, iterating the LONGER of the two so surplus names aren't dropped + // — older metadata commonly lists several names but only a single email. + const count = Math.max(emailParts.length, nameParts.length) + for (let i = 0; i < count; i++) { + const pe = emailParts[i] ? parseNameEmail(emailParts[i]) : { name: null, email: null } + raw.push({ name: pe.name ?? nameParts[i] ?? 
null, email: pe.email }) } const out: PypiPerson[] = [] @@ -274,3 +251,62 @@ export function parseKeywords(raw: string | null | undefined): string[] { .filter(Boolean), ) } + +export interface PypiVersionRow { + number: string + publishedAt: string | null + isLatest: boolean + isPrerelease: boolean + isYanked: boolean + license: string | null +} + +function fileUploadTimes(files: PyPiReleaseFile[]): string[] { + return files + .map((f) => f.upload_time_iso_8601) + .filter((t): t is string => typeof t === 'string' && t.length > 0) +} + +function minStr(arr: string[]): string | null { + return arr.length ? arr.reduce((a, b) => (a < b ? a : b)) : null +} + +function maxStr(arr: string[]): string | null { + return arr.length ? arr.reduce((a, b) => (a > b ? a : b)) : null +} + +// Derive per-version rows and the package's first/latest release timestamps from PyPI's +// `releases` map. A version whose files were all deleted has an empty array — no release +// artifact — so it is skipped (it carries no publish date and would inflate the version count). +// A release is yanked only when every one of its files is yanked; publish dates come from the +// min/max upload time across a release's files. 
+export function buildVersionRows( + releases: Record, + latestVersion: string | null, + license: string | null, +): { + versionRows: PypiVersionRow[] + firstReleaseAt: string | null + latestReleaseAt: string | null +} { + const allUploadTimes: string[] = [] + const versionRows: PypiVersionRow[] = [] + for (const [number, files] of Object.entries(releases)) { + if (!Array.isArray(files) || files.length === 0) continue + const times = fileUploadTimes(files) + allUploadTimes.push(...times) + versionRows.push({ + number, + publishedAt: minStr(times), + isLatest: number === latestVersion, + isPrerelease: isPypiPrerelease(number), + isYanked: files.every((f) => f.yanked === true), + license, + }) + } + return { + versionRows, + firstReleaseAt: minStr(allUploadTimes), + latestReleaseAt: maxStr(allUploadTimes), + } +} diff --git a/services/apps/packages_worker/src/pypi/retryPolicy.ts b/services/apps/packages_worker/src/pypi/retryPolicy.ts new file mode 100644 index 0000000000..16b0bd15eb --- /dev/null +++ b/services/apps/packages_worker/src/pypi/retryPolicy.ts @@ -0,0 +1,4 @@ +// Per-package metadata-fetch attempts. Shared between the workflow's Temporal retry policy +// and the activity's give-up threshold so the two never drift: a package is only given up on +// (marked scanned-error so the cursor can advance) once Temporal has exhausted these attempts. 
+export const INGEST_MAX_ATTEMPTS = 5 diff --git a/services/apps/packages_worker/src/pypi/upsertProject.ts b/services/apps/packages_worker/src/pypi/upsertProject.ts index 650f14f284..988fbb6e9a 100644 --- a/services/apps/packages_worker/src/pypi/upsertProject.ts +++ b/services/apps/packages_worker/src/pypi/upsertProject.ts @@ -9,40 +9,17 @@ import { import type { QueryExecutor } from '@crowd/data-access-layer/src/queryExecutor' import { canonicalizeRepoUrl } from '../utils/canonicalizeRepoUrl' +import { stripNullBytesDeep } from '../utils/stripNullBytesDeep' import { + buildVersionRows, classifyProjectUrls, collectPypiMaintainers, - isPypiPrerelease, parseKeywords, pypiNameFromPurl, resolvePypiLicenses, - stripNullBytesDeep, } from './normalize' -import type { PyPiProject, PyPiReleaseFile } from './types' - -interface PypiVersionRow { - number: string - publishedAt: string | null - isLatest: boolean - isPrerelease: boolean - isYanked: boolean - license: string | null -} - -function fileUploadTimes(files: PyPiReleaseFile[]): string[] { - return files - .map((f) => f.upload_time_iso_8601) - .filter((t): t is string => typeof t === 'string' && t.length > 0) -} - -function minStr(arr: string[]): string | null { - return arr.length ? arr.reduce((a, b) => (a < b ? a : b)) : null -} - -function maxStr(arr: string[]): string | null { - return arr.length ? arr.reduce((a, b) => (a > b ? a : b)) : null -} +import type { PyPiProject } from './types' export async function upsertProject( qx: QueryExecutor, @@ -73,26 +50,11 @@ export async function upsertProject( const latestVersion = info.version ?? null const packageLicense = licenses[0] ?? null - const allUploadTimes: string[] = [] - const versionRows: PypiVersionRow[] = [] - for (const [number, files] of Object.entries(releases)) { - // A version whose files were all deleted has an empty array — no release artifact, - // so skip it (it would carry no publish date and inflate the version count). 
- if (!Array.isArray(files) || files.length === 0) continue - const times = fileUploadTimes(files) - allUploadTimes.push(...times) - versionRows.push({ - number, - publishedAt: minStr(times), - isLatest: number === latestVersion, - isPrerelease: isPypiPrerelease(number), - isYanked: files.every((f) => f.yanked === true), - license: packageLicense, - }) - } - - const firstReleaseAt = minStr(allUploadTimes) - const latestReleaseAt = maxStr(allUploadTimes) + const { versionRows, firstReleaseAt, latestReleaseAt } = buildVersionRows( + releases, + latestVersion, + packageLicense, + ) const changed = new Set() diff --git a/services/apps/packages_worker/src/pypi/workflows.ts b/services/apps/packages_worker/src/pypi/workflows.ts index bf252d31ab..125cbd39da 100644 --- a/services/apps/packages_worker/src/pypi/workflows.ts +++ b/services/apps/packages_worker/src/pypi/workflows.ts @@ -1,13 +1,16 @@ import { continueAsNew, proxyActivities } from '@temporalio/workflow' import type * as activities from './activities' +import { INGEST_MAX_ATTEMPTS } from './retryPolicy' const acts = proxyActivities({ startToCloseTimeout: '15 minutes', retry: { initialInterval: '30 seconds', backoffCoefficient: 2, - maximumAttempts: 5, + // Kept in lockstep with the activity's give-up threshold (ingestPurlsWithGiveUp): a + // package is only abandoned once these Temporal retries are exhausted. 
+ maximumAttempts: INGEST_MAX_ATTEMPTS, }, }) diff --git a/services/apps/packages_worker/src/scripts/monitorOsspckgs.ts b/services/apps/packages_worker/src/scripts/monitorOsspckgs.ts index e2e325daaf..6d16b03ce1 100644 --- a/services/apps/packages_worker/src/scripts/monitorOsspckgs.ts +++ b/services/apps/packages_worker/src/scripts/monitorOsspckgs.ts @@ -235,6 +235,8 @@ const KIND_TABLES: Record = { advisory_packages: ['advisory_packages', 'advisory_affected_ranges'], dependent_counts: ['packages'], ranking: ['packages'], + pypi_downloads_30d: ['downloads_last_30d', 'packages'], + pypi_downloads_daily: ['downloads_daily'], } const TABLE_ABBREV: Record = { @@ -246,6 +248,8 @@ const TABLE_ABBREV: Record = { advisories: 'adv', advisory_packages: 'ap', advisory_affected_ranges: 'ar', + downloads_last_30d: 'dl30d', + downloads_daily: 'dldaily', } async function fetchTableCounts(): Promise> { @@ -256,7 +260,8 @@ async function fetchTableCounts(): Promise> { WHERE relname IN ( 'packages', 'versions', 'package_dependencies', 'repos', 'package_repos', - 'advisories', 'advisory_packages', 'advisory_affected_ranges' + 'advisories', 'advisory_packages', 'advisory_affected_ranges', + 'downloads_daily', 'downloads_last_30d' ) `) const result: Record = {} diff --git a/services/apps/packages_worker/src/utils/__tests__/stripNullBytesDeep.test.ts b/services/apps/packages_worker/src/utils/__tests__/stripNullBytesDeep.test.ts new file mode 100644 index 0000000000..5cc79de037 --- /dev/null +++ b/services/apps/packages_worker/src/utils/__tests__/stripNullBytesDeep.test.ts @@ -0,0 +1,18 @@ +import { describe, expect, it } from 'vitest' + +import { stripNullBytesDeep } from '../stripNullBytesDeep' + +describe('stripNullBytesDeep', () => { + it('removes NUL bytes from nested strings', () => { + const nul = String.fromCharCode(0) + const v = stripNullBytesDeep({ a: `x${nul}y`, b: [`p${nul}`, 'q'] }) + expect(v.a).toBe('xy') + expect(v.b).toEqual(['p', 'q']) + }) + + it('passes through 
non-string scalars unchanged', () => { + expect(stripNullBytesDeep(42)).toBe(42) + expect(stripNullBytesDeep(null)).toBe(null) + expect(stripNullBytesDeep(true)).toBe(true) + }) +}) diff --git a/services/apps/packages_worker/src/utils/isClientError.ts b/services/apps/packages_worker/src/utils/isClientError.ts new file mode 100644 index 0000000000..b21ec5ab00 --- /dev/null +++ b/services/apps/packages_worker/src/utils/isClientError.ts @@ -0,0 +1,7 @@ +// A registry fetch outcome that should be given up on rather than retried: 404, or any other +// 4xx client error (e.g. a malformed/illegal project name that leaked into `packages`). 429 is +// excluded — it's transient and rides the slow (Temporal-retry) path. Shared by the npm and pypi +// workers, whose fetchers classify HTTP outcomes into these (statusCode, kind) pairs. +export function isClientError(code: number | undefined, kind: string): boolean { + return kind === 'NOT_FOUND' || (code !== undefined && code >= 400 && code < 500 && code !== 429) +} diff --git a/services/apps/packages_worker/src/utils/stripNullBytesDeep.ts b/services/apps/packages_worker/src/utils/stripNullBytesDeep.ts new file mode 100644 index 0000000000..e403e352e1 --- /dev/null +++ b/services/apps/packages_worker/src/utils/stripNullBytesDeep.ts @@ -0,0 +1,18 @@ +// Postgres text columns cannot store NUL (U+0000). 
Recursively strip NUL bytes from every string +const NUL_GLOBAL = new RegExp(String.fromCharCode(0), 'g') + +export function stripNullBytesDeep(value: T): T { + if (typeof value === 'string') { + return value.replace(NUL_GLOBAL, '') as T + } + if (Array.isArray(value)) { + for (let i = 0; i < value.length; i++) value[i] = stripNullBytesDeep(value[i]) + return value + } + if (value !== null && typeof value === 'object') { + const obj = value as Record + for (const k of Object.keys(obj)) obj[k] = stripNullBytesDeep(obj[k]) + return value + } + return value +} diff --git a/services/apps/packages_worker/src/workflows/index.ts b/services/apps/packages_worker/src/workflows/index.ts index 78939cece5..9bf0065c34 100644 --- a/services/apps/packages_worker/src/workflows/index.ts +++ b/services/apps/packages_worker/src/workflows/index.ts @@ -13,6 +13,8 @@ export { ingestDependencies, ingestAdvisories, ingestDependentCounts, + ingestPypiDownloadsLast30d, + ingestPypiDownloadsDaily, } from '../deps-dev/workflows' export { osvSync } from '../osv/workflows' export { ingestMavenPackages } from '../maven/workflows' diff --git a/services/libs/data-access-layer/src/osspckgs/ingestJobs.ts b/services/libs/data-access-layer/src/osspckgs/ingestJobs.ts index 9606035e23..57e9511342 100644 --- a/services/libs/data-access-layer/src/osspckgs/ingestJobs.ts +++ b/services/libs/data-access-layer/src/osspckgs/ingestJobs.ts @@ -12,6 +12,8 @@ export type OsspckgsJobKind = | 'scorecard_repos' | 'scorecard_checks' | 'ranking' + | 'pypi_downloads_30d' + | 'pypi_downloads_daily' export type OsspckgsJobStatus = | 'pending' diff --git a/services/libs/data-access-layer/src/packages/packages.ts b/services/libs/data-access-layer/src/packages/packages.ts index 16c8ed84ef..0389cb2892 100644 --- a/services/libs/data-access-layer/src/packages/packages.ts +++ b/services/libs/data-access-layer/src/packages/packages.ts @@ -225,3 +225,12 @@ export async function getTrackedNpmPackages( firstReleaseAt: 
r.first_release_at, })) } + +// How many critical PyPI packages exist — a cheap guard so the daily downloads workflow can +// skip its BigQuery scan entirely when there are none to ingest (the merge scopes to is_critical). +export async function getCriticalPypiPackageCount(qx: QueryExecutor): Promise { + const row: { count: string } = await qx.selectOne( + `SELECT COUNT(*)::text AS count FROM packages WHERE ecosystem = 'pypi' AND is_critical = TRUE`, + ) + return Number(row.count) +} diff --git a/services/libs/data-access-layer/src/packages/versions.ts b/services/libs/data-access-layer/src/packages/versions.ts index f90a571f85..ab3aefbba0 100644 --- a/services/libs/data-access-layer/src/packages/versions.ts +++ b/services/libs/data-access-layer/src/packages/versions.ts @@ -137,5 +137,20 @@ export async function upsertPypiVersions( licenses: versions.map((v) => v.license), }, ) + + // Clear a stale is_latest on any OTHER version of this package — e.g. a previously-latest + // version whose files were all deleted, so it is not in this batch and would otherwise keep + // is_latest = true alongside the new latest. 
+ const latestNumbers = versions.filter((v) => v.isLatest).map((v) => v.number) + if (latestNumbers.length > 0) { + await qx.result( + `UPDATE versions SET is_latest = false + WHERE package_id = $(packageId)::bigint + AND is_latest = true + AND NOT (number = ANY($(latestNumbers)::text[]))`, + { packageId, latestNumbers }, + ) + } + return row.changed_fields } From a0c9edf0d4f3b4ed632147845fdfec275a86d24a Mon Sep 17 00:00:00 2001 From: anilb Date: Wed, 1 Jul 2026 18:15:41 +0200 Subject: [PATCH 03/10] docs: pypi downloads adr Signed-off-by: anilb --- ...5-pypi-downloads-bigquery-merge-scoping.md | 63 +++++++++++++++++++ docs/adr/README.md | 1 + services/apps/packages_worker/CONTEXT.md | 16 ++++- 3 files changed, 77 insertions(+), 3 deletions(-) create mode 100644 docs/adr/0005-pypi-downloads-bigquery-merge-scoping.md diff --git a/docs/adr/0005-pypi-downloads-bigquery-merge-scoping.md b/docs/adr/0005-pypi-downloads-bigquery-merge-scoping.md new file mode 100644 index 0000000000..6089d04062 --- /dev/null +++ b/docs/adr/0005-pypi-downloads-bigquery-merge-scoping.md @@ -0,0 +1,63 @@ +# ADR-0005: PyPI downloads via BigQuery bulk export, scoped in the Postgres merge + +**Date**: 2026-07-01 +**Status**: accepted +**Deciders**: Anil B + +_Consolidated ADR for the PyPI downloads worker — record further PyPI-worker download decisions here rather than opening new ADRs._ + +## Context + +We need PyPI download counts to match the npm shape: daily counts for the **Critical slice** +(`downloads_daily`) and rolling 30-day **Window** counts for all tracked pypi packages +(`downloads_last_30d`, mirrored to `packages.downloads_last_30d`). Unlike npm, **PyPI exposes no +per-package downloads HTTP API** — the only source is the public BigQuery dataset +`bigquery-public-data.pypi.file_downloads` (raw per-download events, timestamp-partitioned). The +worker already has proven deps.dev BigQuery→GCS→staging→merge plumbing and a job monitor keyed on +`osspckgs_ingest_jobs`. 
Cost is driven by bytes scanned, and a single day of the three columns we +read (`file.project`, `timestamp`, `details.installer.name`) measures ~107 GB (weekend) / ~147 GB +(monthly average), so a 30-day window is ~4.56 TB. + +## Decision + +Ingest PyPI downloads as two new `bq-dataset-ingest` job kinds (`pypi_downloads_30d`, +`pypi_downloads_daily`) that run one BigQuery aggregate over a date range, export **all** projects to +GCS, load to staging, and **scope to the Critical slice in the Postgres merge** (`JOIN packages … +AND is_critical` for daily) — we never push our package list into BigQuery. The 30d workflow does a +**Latest-window refresh** for all pypi (mirroring the latest **Window**); the daily workflow does a +2-day **Trailing re-scan** for the critical subset. Both are idempotent (`ON CONFLICT DO UPDATE`), +fixed-window, and gap-recovered by manual **Backfill** — they are deliberately **not** self-healing. + +## Alternatives Considered + +### Alternative 1: npm-style per-package HTTP fetch with watermark due-selection +- **Pros**: reuses the npm downloads model exactly; source is scoped to what's due; naturally self-healing. +- **Cons**: requires a per-package downloads API. +- **Why not**: PyPI has no such API. The BigQuery public dataset is the only source, which forces a bulk-aggregate model. + +### Alternative 2: Push the critical package list into BigQuery (inline `IN UNNEST([...])`) to shrink the export +- **Pros**: smaller GCS export and staging load, especially for daily backfills. +- **Cons**: inlines our data into the query text. +- **Why not**: the critical set can grow to tens of thousands+; the inline list blows BigQuery's ~1 MB query-text limit (and Temporal's ~2 MB payload limit for the name list). Merge-scoping is unbounded and matches how every deps.dev job scopes to our data in Postgres, not at the source. A cheap `getCriticalPypiCount` guard skips the scan when there are zero critical packages. 
+ +### Alternative 3: Gap-filling self-healing (npm's `computeMissingLast30dWindows` model) +- **Pros**: auto-recovers missed days/months without manual intervention. +- **Cons**: needs per-package due-selection / existing-window diffing, extra state and complexity, and re-scans BigQuery anyway. +- **Why not**: for a bulk-BQ source the simpler fixed-window + idempotent-upsert + manual **Backfill** model is sufficient; deps.dev jobs re-scan on re-run too. The daily 2-day **Trailing re-scan** already corrects a partial most-recent partition. + +## Consequences + +### Positive +- Reuses the deps.dev BQ→GCS→staging→merge plumbing and the `monitor:osspckgs` cost/row dashboard for free. +- Scoping in the merge scales to any critical-set size; our package identifiers never leave Postgres. +- Idempotent upserts make re-runs and overlapping backfills safe (no duplicate rows). + +### Negative +- Re-running a date range re-scans BigQuery and re-bills — there is no "already imported" skip. +- The daily 2-day window re-scans each calendar day ~2×; steady-state cost ≈ $610/yr daily + $311/yr 30d ≈ **~$920/yr** at $6.25/TiB (measured). +- Not self-healing: an outage or missed schedule is recovered only by a manual **Backfill**. +- Daily export carries all ~800k projects even though the merge keeps only the critical subset (larger data movement than a source-filtered approach). + +### Risks +- **BigQuery cost / runaway scans** — mitigated by per-kind `BQ_DATASET_INGEST_PYPI_DOWNLOADS_*_MAX_BQ_GB` ceilings enforced via a pre-run dry-run (aborts before billing); defaults set from measured sizes (30d = 6000 GB, daily = 2000 GB). +- **Traffic growth** — the ~4.56 TB/30d figure grows with PyPI traffic; ceilings may need raising over time. diff --git a/docs/adr/README.md b/docs/adr/README.md index e96d02f8b3..250066a4a4 100644 --- a/docs/adr/README.md +++ b/docs/adr/README.md @@ -11,6 +11,7 @@ Use the `/adr` skill in Claude Code to record new ADRs or query past decisions. 
| [ADR-0001](./0001-oss-packages-design-decisions.md) | OSS packages — design decisions (living) | living | 2026-05-27 | | [ADR-0003](./0003-deps-bq-table-selection.md) | Use DependencyGraphEdgesLatest for deps ingestion; defer DependenciesLatest until NUGET or GO needed | accepted | 2026-05-29 | | [ADR-0004](./0004-go-nuget-transitive-dependent-counts.md) | Compute GO/NUGET transitive dependent counts via exact reverse closure (over HLL approximation) | accepted | 2026-06-23 | +| [ADR-0005](./0005-pypi-downloads-bigquery-merge-scoping.md) | PyPI downloads via BigQuery bulk export, scoped in the Postgres merge | accepted | 2026-07-01 | ## Why ADRs? diff --git a/services/apps/packages_worker/CONTEXT.md b/services/apps/packages_worker/CONTEXT.md index 5ded936c58..1baf79e026 100644 --- a/services/apps/packages_worker/CONTEXT.md +++ b/services/apps/packages_worker/CONTEXT.md @@ -1,6 +1,6 @@ # OSS Packages -Tracks open-source packages across ecosystems (npm, Maven). All packages live in `packages`; criticality scoring ranks them in place and marks the top-N per ecosystem as `is_critical = true`. +Tracks open-source packages across ecosystems (npm, pypi, maven, go, nuget, cargo — the set grows as ecosystems are onboarded). All packages live in `packages`; criticality scoring ranks them in place and marks the top-N per ecosystem as `is_critical = true`. ## Language @@ -25,7 +25,7 @@ Package URL (`pkg:npm/react`, `pkg:maven/org.apache/commons`). The canonical cro _Avoid_: package id (that's the `packages.id` bigserial) **Ecosystem**: -A package registry namespace — `npm`, `maven`. Lowercase. +A package registry namespace — `npm`, `pypi`, `maven`, `go`, `nuget`, `cargo`. Lowercase. Open set — new ecosystems are onboarded over time. 
_Avoid_: system (deps.dev's term), registry **Packument**: @@ -38,9 +38,19 @@ One rolling 30-day span in `downloads_last_30d`, identified by its `end_date` (a _Avoid_: month, period, snapshot **Self-healing**: -A workflow that recomputes the full set of expected rows on every run, diffs against what's in the DB, and fills only the gaps. No assumption of continuity between runs. +A workflow that recomputes the full set of expected rows on every run, diffs against what's in the DB, and fills only the gaps. No assumption of continuity between runs. **npm downloads only** — pypi downloads deliberately do NOT self-heal (see **Trailing re-scan** / **Latest-window refresh**). _Avoid_: backfill (that's the one-time historical fill; self-healing is the ongoing property) +**Trailing re-scan** (pypi daily): +The pypi daily downloads workflow re-scans a fixed 2-day trailing window (`[today−2, today−1]`) every run and upserts. It corrects a partial most-recent partition but does **not** diff against the DB or fill older gaps. Missed days are recovered only by **Backfill**. +_Avoid_: self-healing (that's the npm gap-filling property) + +**Latest-window refresh** (pypi 30d): +The pypi 30d workflow, given no `fromDate`, computes and ingests only the latest **Window** (idempotent upsert, mirrored to `packages.downloads_last_30d`). It does not gap-fill missed months; those are recovered by **Backfill**. + +**Backfill**: +A manual, one-time run over an explicit date range to fill history or recover gaps — pypi daily takes `{startDate, endDate}`, pypi 30d takes `{fromDate}` (enumerates every monthly **Window** from then to the latest). For pypi downloads this is the *only* gap-recovery mechanism; scheduled runs are fixed-window, not self-healing. + ## Relationships - All packages live in `packages`; `rank_packages()` sets `is_critical = true` on the top-N per ecosystem to define the **Critical slice**. 
From c915a755b9df33323c0d0f31f487dca7ca501a1d Mon Sep 17 00:00:00 2001 From: anilb Date: Wed, 1 Jul 2026 19:31:36 +0200 Subject: [PATCH 04/10] refactor: move pypi downloads out of deps-dev Signed-off-by: anilb --- services/apps/packages_worker/src/activities.ts | 1 + .../apps/packages_worker/src/bin/bq-dataset-ingest.ts | 2 +- .../packages_worker/src/deps-dev/activities/index.ts | 1 - .../packages_worker/src/deps-dev/workflows/index.ts | 1 - .../downloads}/__tests__/pypiDownloads.test.ts | 0 .../downloads}/getCriticalPypiCount.ts | 0 .../downloads}/ingestPypiDownloads.ts | 10 ++++++---- .../schedules => pypi/downloads}/pypiDownloads.ts | 3 ++- .../queries => pypi/downloads}/pypiDownloadsDates.ts | 0 .../queries => pypi/downloads}/pypiDownloadsSql.ts | 0 services/apps/packages_worker/src/workflows/index.ts | 6 ++++-- 11 files changed, 14 insertions(+), 10 deletions(-) rename services/apps/packages_worker/src/{deps-dev/queries => pypi/downloads}/__tests__/pypiDownloads.test.ts (100%) rename services/apps/packages_worker/src/{deps-dev/activities => pypi/downloads}/getCriticalPypiCount.ts (100%) rename services/apps/packages_worker/src/{deps-dev/workflows => pypi/downloads}/ingestPypiDownloads.ts (96%) rename services/apps/packages_worker/src/{deps-dev/schedules => pypi/downloads}/pypiDownloads.ts (98%) rename services/apps/packages_worker/src/{deps-dev/queries => pypi/downloads}/pypiDownloadsDates.ts (100%) rename services/apps/packages_worker/src/{deps-dev/queries => pypi/downloads}/pypiDownloadsSql.ts (100%) diff --git a/services/apps/packages_worker/src/activities.ts b/services/apps/packages_worker/src/activities.ts index 651929981f..1c40183c40 100644 --- a/services/apps/packages_worker/src/activities.ts +++ b/services/apps/packages_worker/src/activities.ts @@ -30,4 +30,5 @@ export { ingestPypiPackageBatch, pypiStopAfterFirstPage, } from './pypi/activities' +export { getCriticalPypiCount } from './pypi/downloads/getCriticalPypiCount' export { processNuGetBatch } 
from './nuget/activities' diff --git a/services/apps/packages_worker/src/bin/bq-dataset-ingest.ts b/services/apps/packages_worker/src/bin/bq-dataset-ingest.ts index 119f05fe90..bb0b386a53 100644 --- a/services/apps/packages_worker/src/bin/bq-dataset-ingest.ts +++ b/services/apps/packages_worker/src/bin/bq-dataset-ingest.ts @@ -2,7 +2,7 @@ import { scheduleOsspckgsBootstrap } from '../deps-dev/schedules/bootstrap' import { schedulePypiDownloads30d, schedulePypiDownloadsDaily, -} from '../deps-dev/schedules/pypiDownloads' +} from '../pypi/downloads/pypiDownloads' import { svc } from '../service' setImmediate(async () => { diff --git a/services/apps/packages_worker/src/deps-dev/activities/index.ts b/services/apps/packages_worker/src/deps-dev/activities/index.ts index be331e547e..3de67ac1d2 100644 --- a/services/apps/packages_worker/src/deps-dev/activities/index.ts +++ b/services/apps/packages_worker/src/deps-dev/activities/index.ts @@ -1,5 +1,4 @@ export * from './bqExportToGcs' -export * from './getCriticalPypiCount' export * from './setJobStep' export * from './createVersionsLookup' export * from './managePackageDepsConstraints' diff --git a/services/apps/packages_worker/src/deps-dev/workflows/index.ts b/services/apps/packages_worker/src/deps-dev/workflows/index.ts index 4197f6442b..642653ca2c 100644 --- a/services/apps/packages_worker/src/deps-dev/workflows/index.ts +++ b/services/apps/packages_worker/src/deps-dev/workflows/index.ts @@ -3,6 +3,5 @@ export * from './ingestAdvisories' export * from './ingestDependentCounts' export * from './ingestDependencies' export * from './ingestPackages' -export * from './ingestPypiDownloads' export * from './ingestRepos' export * from './ingestVersions' diff --git a/services/apps/packages_worker/src/deps-dev/queries/__tests__/pypiDownloads.test.ts b/services/apps/packages_worker/src/pypi/downloads/__tests__/pypiDownloads.test.ts similarity index 100% rename from 
services/apps/packages_worker/src/deps-dev/queries/__tests__/pypiDownloads.test.ts rename to services/apps/packages_worker/src/pypi/downloads/__tests__/pypiDownloads.test.ts diff --git a/services/apps/packages_worker/src/deps-dev/activities/getCriticalPypiCount.ts b/services/apps/packages_worker/src/pypi/downloads/getCriticalPypiCount.ts similarity index 100% rename from services/apps/packages_worker/src/deps-dev/activities/getCriticalPypiCount.ts rename to services/apps/packages_worker/src/pypi/downloads/getCriticalPypiCount.ts diff --git a/services/apps/packages_worker/src/deps-dev/workflows/ingestPypiDownloads.ts b/services/apps/packages_worker/src/pypi/downloads/ingestPypiDownloads.ts similarity index 96% rename from services/apps/packages_worker/src/deps-dev/workflows/ingestPypiDownloads.ts rename to services/apps/packages_worker/src/pypi/downloads/ingestPypiDownloads.ts index 4ef1db5dc2..5f83f37e2a 100644 --- a/services/apps/packages_worker/src/deps-dev/workflows/ingestPypiDownloads.ts +++ b/services/apps/packages_worker/src/pypi/downloads/ingestPypiDownloads.ts @@ -1,11 +1,13 @@ import { proxyActivities, workflowInfo } from '@temporalio/workflow' -import type * as depsDevActivities from '../activities' +import type * as depsDevActivities from '../../deps-dev/activities' + +import type * as pypiDownloadsActivities from './getCriticalPypiCount' import { computeLast30dWindows, defaultDailyRange, utcFirstOfCurrentMonth, -} from '../queries/pypiDownloadsDates' +} from './pypiDownloadsDates' import { PYPI_DOWNLOADS_30D_KIND, PYPI_DOWNLOADS_30D_STAGING, @@ -15,7 +17,7 @@ import { buildPypiDownloads30dSql, buildPypiDownloadsDailyMergeSql, buildPypiDownloadsDailySql, -} from '../queries/pypiDownloadsSql' +} from './pypiDownloadsSql' const { bqExportToGcs } = proxyActivities({ startToCloseTimeout: '1 hour', @@ -38,7 +40,7 @@ const { mergeStagingToTable } = proxyActivities({ retry: { maximumAttempts: 1 }, }) -const { getCriticalPypiCount } = proxyActivities({ +const { 
getCriticalPypiCount } = proxyActivities({ startToCloseTimeout: '1 minute', retry: { maximumAttempts: 3 }, }) diff --git a/services/apps/packages_worker/src/deps-dev/schedules/pypiDownloads.ts b/services/apps/packages_worker/src/pypi/downloads/pypiDownloads.ts similarity index 98% rename from services/apps/packages_worker/src/deps-dev/schedules/pypiDownloads.ts rename to services/apps/packages_worker/src/pypi/downloads/pypiDownloads.ts index a9bfe8e254..379f504e65 100644 --- a/services/apps/packages_worker/src/deps-dev/schedules/pypiDownloads.ts +++ b/services/apps/packages_worker/src/pypi/downloads/pypiDownloads.ts @@ -1,7 +1,8 @@ import { ScheduleAlreadyRunning, ScheduleOverlapPolicy } from '@temporalio/client' import { svc } from '../../service' -import { ingestPypiDownloadsDaily, ingestPypiDownloadsLast30d } from '../workflows' + +import { ingestPypiDownloadsDaily, ingestPypiDownloadsLast30d } from './ingestPypiDownloads' // Last-30d downloads for all pypi packages. Runs on the 4th of the month (06:00 UTC) — a few // days after the window's end (1st of the month) so the BigQuery partitions have settled. 
No diff --git a/services/apps/packages_worker/src/deps-dev/queries/pypiDownloadsDates.ts b/services/apps/packages_worker/src/pypi/downloads/pypiDownloadsDates.ts similarity index 100% rename from services/apps/packages_worker/src/deps-dev/queries/pypiDownloadsDates.ts rename to services/apps/packages_worker/src/pypi/downloads/pypiDownloadsDates.ts diff --git a/services/apps/packages_worker/src/deps-dev/queries/pypiDownloadsSql.ts b/services/apps/packages_worker/src/pypi/downloads/pypiDownloadsSql.ts similarity index 100% rename from services/apps/packages_worker/src/deps-dev/queries/pypiDownloadsSql.ts rename to services/apps/packages_worker/src/pypi/downloads/pypiDownloadsSql.ts diff --git a/services/apps/packages_worker/src/workflows/index.ts b/services/apps/packages_worker/src/workflows/index.ts index 02bdc59ef9..e2c956f056 100644 --- a/services/apps/packages_worker/src/workflows/index.ts +++ b/services/apps/packages_worker/src/workflows/index.ts @@ -12,8 +12,6 @@ export { ingestDependencies, ingestAdvisories, ingestDependentCounts, - ingestPypiDownloadsLast30d, - ingestPypiDownloadsDaily, } from '../deps-dev/workflows' export { osvSync } from '../osv/workflows' export { ingestMavenPackages } from '../maven/workflows' @@ -22,4 +20,8 @@ export { rankPackagesWorkflow } from '../criticality/workflow' export { cargoSyncWorkflow } from '../cargo/workflows' export { enrichGoVersions, enrichGoStatus } from '../go/workflows' export { ingestPypiPackages } from '../pypi/workflows' +export { + ingestPypiDownloadsLast30d, + ingestPypiDownloadsDaily, +} from '../pypi/downloads/ingestPypiDownloads' export { ingestNuGetPackages } from '../nuget/workflows' From 5937852436b5b19186d6d954d134142908af81f0 Mon Sep 17 00:00:00 2001 From: anilb Date: Wed, 1 Jul 2026 20:20:09 +0200 Subject: [PATCH 05/10] fix: address pypi worker review Signed-off-by: anilb --- .../apps/packages_worker/src/npm/upsertPackage.ts | 4 ++-- .../src/pypi/downloads/ingestPypiDownloads.ts | 14 ++++++++------ 
.../apps/packages_worker/src/pypi/fetchProject.ts | 2 +- services/apps/packages_worker/src/pypi/proxies.ts | 8 ++++---- .../packages_worker/src/pypi/upsertProject.ts | 6 +++--- .../data-access-layer/src/packages/maintainers.ts | 2 +- .../data-access-layer/src/packages/packages.ts | 8 ++++---- .../data-access-layer/src/packages/versions.ts | 15 ++++++++------- 8 files changed, 31 insertions(+), 28 deletions(-) diff --git a/services/apps/packages_worker/src/npm/upsertPackage.ts b/services/apps/packages_worker/src/npm/upsertPackage.ts index cfc922fdcf..130f128f91 100644 --- a/services/apps/packages_worker/src/npm/upsertPackage.ts +++ b/services/apps/packages_worker/src/npm/upsertPackage.ts @@ -1,9 +1,9 @@ import { getOrCreateRepoByUrl, upsertNpmFundingLinks, - upsertNpmMaintainers, upsertNpmPackage, upsertNpmVersions, + upsertPackageMaintainers, upsertPackageRepo, } from '@crowd/data-access-layer/src/packages' import type { QueryExecutor } from '@crowd/data-access-layer/src/queryExecutor' @@ -106,7 +106,7 @@ export async function upsertPackage( verChanged.forEach((f) => changed.add(f)) if (maintainers.length > 0) { - const mChanged = await upsertNpmMaintainers(t, pkgId, maintainers) + const mChanged = await upsertPackageMaintainers(t, pkgId, maintainers) mChanged.forEach((f) => changed.add(f)) } diff --git a/services/apps/packages_worker/src/pypi/downloads/ingestPypiDownloads.ts b/services/apps/packages_worker/src/pypi/downloads/ingestPypiDownloads.ts index 5f83f37e2a..6b0ba7778b 100644 --- a/services/apps/packages_worker/src/pypi/downloads/ingestPypiDownloads.ts +++ b/services/apps/packages_worker/src/pypi/downloads/ingestPypiDownloads.ts @@ -45,12 +45,9 @@ const { getCriticalPypiCount } = proxyActivities retry: { maximumAttempts: 3 }, }) -// Per-window 30-day scans can reach ~1.5 TiB worst case; daily scans are far smaller but a wide -// backfill multiplies by the day count. 
Defaults guard against runaway scans and are overridable -// per kind via BQ_DATASET_INGEST_PYPI_DOWNLOADS_{30D,DAILY}_MAX_BQ_GB. -// A single 30d window scans ~31 day-partitions; measured at ~4.56 TB (≈147 GB/day averaged over -// a month — weekdays are heavier than the weekend sample). Ceiling sits above that with headroom; -// raise it if a future month exceeds it. Daily scans its 2-day trailing window (~300 GB). +// Per-kind ceilings guard against runaway BQ scans; override via +// BQ_DATASET_INGEST_PYPI_DOWNLOADS_{30D,DAILY}_MAX_BQ_GB. Defaults sized from the measured +// ~4.56 TB/30d window and ~300 GB daily 2-day window, with headroom. const MAX_BYTES_GB_30D = 6000 const MAX_BYTES_GB_DAILY = 2000 @@ -181,6 +178,11 @@ export async function ingestPypiDownloadsDaily(opts: { const runId = start.toISOString().replace(/[:.]/g, '-') const today = start.toISOString().slice(0, 10) + // A backfill must supply BOTH bounds; a single bound is a mistake, not a partial range — + // fail loudly rather than silently scanning the default 2-day window. + if (Boolean(opts.startDate) !== Boolean(opts.endDate)) { + throw new Error('ingestPypiDownloadsDaily: startDate and endDate must be provided together') + } const range = opts.startDate && opts.endDate ? { startDate: opts.startDate, endDate: opts.endDate } diff --git a/services/apps/packages_worker/src/pypi/fetchProject.ts b/services/apps/packages_worker/src/pypi/fetchProject.ts index 74fb7e78fa..b25f1af12f 100644 --- a/services/apps/packages_worker/src/pypi/fetchProject.ts +++ b/services/apps/packages_worker/src/pypi/fetchProject.ts @@ -6,7 +6,7 @@ const REGISTRY = 'https://pypi.org/pypi' const USER_AGENT = 'lfx-packages-worker/0.1 (+https://lfx.linuxfoundation.org)' // Fetch a project's metadata from the PyPI JSON API. 
-// Error's handled with respect to their types (retryable or not) +// Errors are handled per their type (retryable or not): // 404 → NOT_FOUND (skip) // 429 → RATE_LIMIT and 5xx/network → TRANSIENT (Temporal retries) // malformed body → MALFORMED (skip). diff --git a/services/apps/packages_worker/src/pypi/proxies.ts b/services/apps/packages_worker/src/pypi/proxies.ts index c460fcf27f..a8273a73ff 100644 --- a/services/apps/packages_worker/src/pypi/proxies.ts +++ b/services/apps/packages_worker/src/pypi/proxies.ts @@ -1,8 +1,8 @@ -import { parseProxies, type ProxyEndpoint } from '../proxies' +import { type ProxyEndpoint, parseProxies } from '../proxies' -// Off by default: when disabled thesingle PyPI lane egresses directly (no ProxyAgent). -// The proxy list is shared with workers via CROWD_PACKAGES_PROXIES -// only the enable flag (CROWD_PACKAGES_PYPI_PROXIES_ENABLED) is PyPI-specific. +// Off by default: when disabled, the single PyPI lane egresses directly (no ProxyAgent). +// The proxy list is shared with other workers via CROWD_PACKAGES_PROXIES; only the enable +// flag (CROWD_PACKAGES_PYPI_PROXIES_ENABLED) is PyPI-specific. export function pypiProxiesEnabled(): boolean { const raw = (process.env.CROWD_PACKAGES_PYPI_PROXIES_ENABLED ?? 
'').trim().toLowerCase() return raw === 'true' || raw === '1' diff --git a/services/apps/packages_worker/src/pypi/upsertProject.ts b/services/apps/packages_worker/src/pypi/upsertProject.ts index 988fbb6e9a..911c248f7e 100644 --- a/services/apps/packages_worker/src/pypi/upsertProject.ts +++ b/services/apps/packages_worker/src/pypi/upsertProject.ts @@ -1,7 +1,7 @@ import { getOrCreateRepoByUrl, upsertNpmFundingLinks, - upsertNpmMaintainers, + upsertPackageMaintainers, upsertPackageRepo, upsertPypiPackage, upsertPypiVersions, @@ -91,12 +91,12 @@ export async function upsertProject( } if (versionRows.length > 0) { - const verChanged = await upsertPypiVersions(t, pkgId, versionRows) + const verChanged = await upsertPypiVersions(t, pkgId, versionRows, latestVersion) verChanged.forEach((f) => changed.add(f)) } if (maintainers.length > 0) { - const mChanged = await upsertNpmMaintainers(t, pkgId, maintainers, 'pypi') + const mChanged = await upsertPackageMaintainers(t, pkgId, maintainers, 'pypi') mChanged.forEach((f) => changed.add(f)) } diff --git a/services/libs/data-access-layer/src/packages/maintainers.ts b/services/libs/data-access-layer/src/packages/maintainers.ts index a44d489682..6ef59d7397 100644 --- a/services/libs/data-access-layer/src/packages/maintainers.ts +++ b/services/libs/data-access-layer/src/packages/maintainers.ts @@ -7,7 +7,7 @@ export interface NpmMaintainerInput { role: 'author' | 'maintainer' } -export async function upsertNpmMaintainers( +export async function upsertPackageMaintainers( qx: QueryExecutor, packageId: string, maintainers: NpmMaintainerInput[], diff --git a/services/libs/data-access-layer/src/packages/packages.ts b/services/libs/data-access-layer/src/packages/packages.ts index 0389cb2892..3a18d0cd82 100644 --- a/services/libs/data-access-layer/src/packages/packages.ts +++ b/services/libs/data-access-layer/src/packages/packages.ts @@ -72,8 +72,8 @@ export async function upsertNpmPackage( THEN packages.versions_count ELSE 
EXCLUDED.versions_count END, latest_version = EXCLUDED.latest_version, - first_release_at = EXCLUDED.first_release_at, - latest_release_at = EXCLUDED.latest_release_at, + first_release_at = COALESCE(EXCLUDED.first_release_at, packages.first_release_at), + latest_release_at = COALESCE(EXCLUDED.latest_release_at, packages.latest_release_at), ingestion_source = EXCLUDED.ingestion_source, last_synced_at = EXCLUDED.last_synced_at RETURNING id, namespace, name, status, registry_url, description, homepage, @@ -173,8 +173,8 @@ export async function upsertPypiPackage( THEN packages.versions_count ELSE EXCLUDED.versions_count END, latest_version = EXCLUDED.latest_version, - first_release_at = EXCLUDED.first_release_at, - latest_release_at = EXCLUDED.latest_release_at, + first_release_at = COALESCE(EXCLUDED.first_release_at, packages.first_release_at), + latest_release_at = COALESCE(EXCLUDED.latest_release_at, packages.latest_release_at), ingestion_source = EXCLUDED.ingestion_source, last_synced_at = EXCLUDED.last_synced_at RETURNING id, namespace, name, status, registry_url, description, homepage, diff --git a/services/libs/data-access-layer/src/packages/versions.ts b/services/libs/data-access-layer/src/packages/versions.ts index ab3aefbba0..f3fff47bc8 100644 --- a/services/libs/data-access-layer/src/packages/versions.ts +++ b/services/libs/data-access-layer/src/packages/versions.ts @@ -82,6 +82,7 @@ export async function upsertPypiVersions( qx: QueryExecutor, packageId: string, versions: PypiVersionInput[], + latestNumber: string | null, ): Promise { if (versions.length === 0) return [] const row: { changed_fields: string[] } = await qx.selectOne( @@ -138,17 +139,17 @@ export async function upsertPypiVersions( }, ) - // Clear a stale is_latest on any OTHER version of this package — e.g. a previously-latest - // version whose files were all deleted, so it is not in this batch and would otherwise keep - // is_latest = true alongside the new latest. 
- const latestNumbers = versions.filter((v) => v.isLatest).map((v) => v.number) - if (latestNumbers.length > 0) { + // Clear a stale is_latest on every OTHER version of this package. Anchored on the declared + // latest (info.version) — NOT on what's in this batch — so a previously-latest version whose + // files were all deleted (and is therefore omitted from the batch) can't keep is_latest = true + // alongside the new latest. When no latest is known, leave flags untouched rather than wipe all. + if (latestNumber != null) { await qx.result( `UPDATE versions SET is_latest = false WHERE package_id = $(packageId)::bigint AND is_latest = true - AND NOT (number = ANY($(latestNumbers)::text[]))`, - { packageId, latestNumbers }, + AND number <> $(latestNumber)`, + { packageId, latestNumber }, ) } From c4daff942ce56903bd66e8889268f7fd2ee1c378 Mon Sep 17 00:00:00 2001 From: anilb Date: Thu, 2 Jul 2026 10:51:35 +0200 Subject: [PATCH 06/10] fix: address second review round Signed-off-by: anilb --- services/apps/packages_worker/package.json | 4 ++-- .../src/pypi/downloads/__tests__/pypiDownloads.test.ts | 2 +- .../packages_worker/src/pypi/downloads/ingestPypiDownloads.ts | 4 ++-- .../apps/packages_worker/src/pypi/downloads/pypiDownloads.ts | 2 +- .../packages_worker/src/pypi/downloads/pypiDownloadsDates.ts | 2 +- services/apps/packages_worker/src/pypi/upsertProject.ts | 2 +- services/libs/data-access-layer/src/packages/maintainers.ts | 4 ++-- 7 files changed, 10 insertions(+), 10 deletions(-) diff --git a/services/apps/packages_worker/package.json b/services/apps/packages_worker/package.json index 14f1a04bad..f0448b591b 100644 --- a/services/apps/packages_worker/package.json +++ b/services/apps/packages_worker/package.json @@ -29,8 +29,8 @@ "dev:npm-worker": "CROWD_TEMPORAL_TASKQUEUE=npm-worker SERVICE=npm-worker LOG_LEVEL=trace nodemon --watch src --watch ../../libs --ext ts --exec tsx --inspect=0.0.0.0:9236 src/bin/npm-worker.ts", "dev:npm-worker:local": "set -a && . 
../../../backend/.env.dist.local && . ../../../backend/.env.override.local && set +a && CROWD_TEMPORAL_TASKQUEUE=npm-worker SERVICE=npm-worker LOG_LEVEL=trace nodemon --watch src --watch ../../libs --ext ts --exec tsx --inspect=0.0.0.0:9236 src/bin/npm-worker.ts", "start:pypi-worker": "CROWD_TEMPORAL_TASKQUEUE=pypi-worker SERVICE=pypi-worker tsx src/bin/pypi-worker.ts", - "dev:pypi-worker": "CROWD_TEMPORAL_TASKQUEUE=pypi-worker CROWD_TEMPORAL_NAMESPACE=$CROWD_PACKAGES_TEMPORAL_NAMESPACE SERVICE=pypi-worker LOG_LEVEL=trace nodemon --watch src --watch ../../libs --ext ts --exec tsx --inspect=0.0.0.0:9242 src/bin/pypi-worker.ts", - "dev:pypi-worker:local": "set -a && . ../../../backend/.env.dist.local && . ../../../backend/.env.override.local && set +a && CROWD_TEMPORAL_TASKQUEUE=pypi-worker SERVICE=pypi-worker LOG_LEVEL=trace nodemon --watch src --watch ../../libs --ext ts --exec tsx --inspect=0.0.0.0:9242 src/bin/pypi-worker.ts", + "dev:pypi-worker": "CROWD_TEMPORAL_TASKQUEUE=pypi-worker SERVICE=pypi-worker LOG_LEVEL=trace nodemon --watch src --watch ../../libs --ext ts --exec tsx --inspect=0.0.0.0:9243 src/bin/pypi-worker.ts", + "dev:pypi-worker:local": "set -a && . ../../../backend/.env.dist.local && . ../../../backend/.env.override.local && set +a && CROWD_TEMPORAL_TASKQUEUE=pypi-worker SERVICE=pypi-worker LOG_LEVEL=trace nodemon --watch src --watch ../../libs --ext ts --exec tsx --inspect=0.0.0.0:9243 src/bin/pypi-worker.ts", "start:osv-worker": "CROWD_TEMPORAL_TASKQUEUE=osv-worker SERVICE=osv-worker tsx src/bin/osv-worker.ts", "dev:osv-worker": "CROWD_TEMPORAL_TASKQUEUE=osv-worker SERVICE=osv-worker LOG_LEVEL=trace nodemon --watch src --watch ../../libs --ext ts --exec tsx --inspect=0.0.0.0:9238 src/bin/osv-worker.ts", "dev:osv-worker:local": "set -a && . ../../../backend/.env.dist.local && . 
../../../backend/.env.override.local && set +a && CROWD_TEMPORAL_TASKQUEUE=osv-worker SERVICE=osv-worker LOG_LEVEL=trace nodemon --watch src --watch ../../libs --ext ts --exec tsx --inspect=0.0.0.0:9238 src/bin/osv-worker.ts", diff --git a/services/apps/packages_worker/src/pypi/downloads/__tests__/pypiDownloads.test.ts b/services/apps/packages_worker/src/pypi/downloads/__tests__/pypiDownloads.test.ts index 045b1337a7..6631c16990 100644 --- a/services/apps/packages_worker/src/pypi/downloads/__tests__/pypiDownloads.test.ts +++ b/services/apps/packages_worker/src/pypi/downloads/__tests__/pypiDownloads.test.ts @@ -57,7 +57,7 @@ describe('computeLast30dWindows', () => { // ── Criterion 3: daily trailing window + first-of-month helper ───────────────────────────── describe('defaultDailyRange', () => { - it('defaults to a 2-day self-healing window [today-2, today-1]', () => { + it('defaults to a 2-day trailing re-scan window [today-2, today-1]', () => { expect(defaultDailyRange('2026-06-30')).toEqual({ startDate: '2026-06-28', endDate: '2026-06-29', diff --git a/services/apps/packages_worker/src/pypi/downloads/ingestPypiDownloads.ts b/services/apps/packages_worker/src/pypi/downloads/ingestPypiDownloads.ts index 6b0ba7778b..ba59c20285 100644 --- a/services/apps/packages_worker/src/pypi/downloads/ingestPypiDownloads.ts +++ b/services/apps/packages_worker/src/pypi/downloads/ingestPypiDownloads.ts @@ -168,8 +168,8 @@ export async function ingestPypiDownloadsLast30d(opts: { fromDate?: string }): P } } -// Daily downloads for the critical pypi subset. Scheduled daily with no range → the last 2-day -// self-healing window. Pass an explicit startDate/endDate to backfill an arbitrary range. +// Daily downloads for the critical pypi subset. Scheduled daily with no range → the 2-day +// trailing re-scan window. Pass an explicit startDate/endDate to backfill an arbitrary range. 
export async function ingestPypiDownloadsDaily(opts: { startDate?: string endDate?: string diff --git a/services/apps/packages_worker/src/pypi/downloads/pypiDownloads.ts b/services/apps/packages_worker/src/pypi/downloads/pypiDownloads.ts index 379f504e65..04fc77bc8e 100644 --- a/services/apps/packages_worker/src/pypi/downloads/pypiDownloads.ts +++ b/services/apps/packages_worker/src/pypi/downloads/pypiDownloads.ts @@ -35,7 +35,7 @@ export async function schedulePypiDownloads30d(): Promise { } // Daily downloads for the critical pypi subset. Runs daily at 06:00 UTC; no args → the workflow -// scans the self-healing 2-day trailing window. +// scans the 2-day trailing re-scan window. export async function schedulePypiDownloadsDaily(): Promise { const { temporal } = svc if (!temporal) throw new Error('Temporal client not initialized') diff --git a/services/apps/packages_worker/src/pypi/downloads/pypiDownloadsDates.ts b/services/apps/packages_worker/src/pypi/downloads/pypiDownloadsDates.ts index b15bfa01a8..4d3853499d 100644 --- a/services/apps/packages_worker/src/pypi/downloads/pypiDownloadsDates.ts +++ b/services/apps/packages_worker/src/pypi/downloads/pypiDownloadsDates.ts @@ -58,7 +58,7 @@ export function computeLast30dWindows( // re-scanned once on the next run and corrected, while keeping the daily BigQuery scan small. const DAILY_TRAILING_DAYS = 2 -// Default daily trailing window: [today - 2, today - 1] inclusive (self-healing). +// Default daily trailing re-scan window: [today - 2, today - 1] inclusive. 
export function defaultDailyRange(today: string): { startDate: string; endDate: string } { return { startDate: addDaysUTC(today, -DAILY_TRAILING_DAYS), endDate: addDaysUTC(today, -1) } } diff --git a/services/apps/packages_worker/src/pypi/upsertProject.ts b/services/apps/packages_worker/src/pypi/upsertProject.ts index 911c248f7e..38b24fb9f4 100644 --- a/services/apps/packages_worker/src/pypi/upsertProject.ts +++ b/services/apps/packages_worker/src/pypi/upsertProject.ts @@ -26,8 +26,8 @@ export async function upsertProject( project: PyPiProject, purl: string, ): Promise<{ purl: string; changedFields: string[] }> { + stripNullBytesDeep(project) const info = project.info - stripNullBytesDeep(info) const name = info.name const status = info.yanked ? 'yanked' : 'active' diff --git a/services/libs/data-access-layer/src/packages/maintainers.ts b/services/libs/data-access-layer/src/packages/maintainers.ts index 6ef59d7397..b49c8e048d 100644 --- a/services/libs/data-access-layer/src/packages/maintainers.ts +++ b/services/libs/data-access-layer/src/packages/maintainers.ts @@ -1,6 +1,6 @@ import { QueryExecutor } from '../queryExecutor' -export interface NpmMaintainerInput { +export interface PackageMaintainerInput { username: string displayName: string | null email: string | null @@ -10,7 +10,7 @@ export interface NpmMaintainerInput { export async function upsertPackageMaintainers( qx: QueryExecutor, packageId: string, - maintainers: NpmMaintainerInput[], + maintainers: PackageMaintainerInput[], ecosystem = 'npm', ): Promise { const changed = new Set() From 41428440c6c7cab77a4618f74a5fa9982d1b72be Mon Sep 17 00:00:00 2001 From: anilb Date: Thu, 2 Jul 2026 11:01:11 +0200 Subject: [PATCH 07/10] style: fix import order in npm proxies Signed-off-by: anilb --- services/apps/packages_worker/src/npm/proxies.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/apps/packages_worker/src/npm/proxies.ts b/services/apps/packages_worker/src/npm/proxies.ts index 
798af93c4a..80a51ca998 100644 --- a/services/apps/packages_worker/src/npm/proxies.ts +++ b/services/apps/packages_worker/src/npm/proxies.ts @@ -1,4 +1,4 @@ -import { parseProxies, proxyCount, type ProxyEndpoint } from '../proxies' +import { type ProxyEndpoint, parseProxies, proxyCount } from '../proxies' // Global kill-switch for the npm proxy layer. When off (the default), every npm worker // runs a single direct lane (no ProxyAgent) — see laneCount/proxyForLane. From 1732ab30b74b241bf02cc233ee01d837f9f9f95e Mon Sep 17 00:00:00 2001 From: anilb Date: Thu, 2 Jul 2026 11:09:42 +0200 Subject: [PATCH 08/10] fix: use per-run timeout on schedules Signed-off-by: anilb --- .../apps/packages_worker/src/pypi/downloads/pypiDownloads.ts | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/services/apps/packages_worker/src/pypi/downloads/pypiDownloads.ts b/services/apps/packages_worker/src/pypi/downloads/pypiDownloads.ts index 04fc77bc8e..bd91c5d61e 100644 --- a/services/apps/packages_worker/src/pypi/downloads/pypiDownloads.ts +++ b/services/apps/packages_worker/src/pypi/downloads/pypiDownloads.ts @@ -20,7 +20,7 @@ export async function schedulePypiDownloads30d(): Promise { type: 'startWorkflow', workflowType: ingestPypiDownloadsLast30d, taskQueue: 'bq-dataset-ingest', - workflowExecutionTimeout: '12 hours', + workflowRunTimeout: '12 hours', retry: { initialInterval: '1 minute', backoffCoefficient: 2, maximumAttempts: 3 }, args: [{}], }, @@ -49,7 +49,7 @@ export async function schedulePypiDownloadsDaily(): Promise { type: 'startWorkflow', workflowType: ingestPypiDownloadsDaily, taskQueue: 'bq-dataset-ingest', - workflowExecutionTimeout: '6 hours', + workflowRunTimeout: '6 hours', retry: { initialInterval: '1 minute', backoffCoefficient: 2, maximumAttempts: 3 }, args: [{}], }, From 5ed1493b3021b4a54bebc56fd38724633ca798de Mon Sep 17 00:00:00 2001 From: anilb Date: Thu, 2 Jul 2026 11:20:00 +0200 Subject: [PATCH 09/10] style: format markdown docs Signed-off-by: 
anilb --- services/apps/packages_worker/CONTEXT.md | 2 +- services/apps/packages_worker/src/deps-dev/README.md | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/services/apps/packages_worker/CONTEXT.md b/services/apps/packages_worker/CONTEXT.md index 1baf79e026..a0fdc7f897 100644 --- a/services/apps/packages_worker/CONTEXT.md +++ b/services/apps/packages_worker/CONTEXT.md @@ -49,7 +49,7 @@ _Avoid_: self-healing (that's the npm gap-filling property) The pypi 30d workflow, given no `fromDate`, computes and ingests only the latest **Window** (idempotent upsert, mirrored to `packages.downloads_last_30d`). It does not gap-fill missed months; those are recovered by **Backfill**. **Backfill**: -A manual, one-time run over an explicit date range to fill history or recover gaps — pypi daily takes `{startDate, endDate}`, pypi 30d takes `{fromDate}` (enumerates every monthly **Window** from then to the latest). For pypi downloads this is the *only* gap-recovery mechanism; scheduled runs are fixed-window, not self-healing. +A manual, one-time run over an explicit date range to fill history or recover gaps — pypi daily takes `{startDate, endDate}`, pypi 30d takes `{fromDate}` (enumerates every monthly **Window** from then to the latest). For pypi downloads this is the _only_ gap-recovery mechanism; scheduled runs are fixed-window, not self-healing. ## Relationships diff --git a/services/apps/packages_worker/src/deps-dev/README.md b/services/apps/packages_worker/src/deps-dev/README.md index cefc81283f..3b15208a5a 100644 --- a/services/apps/packages_worker/src/deps-dev/README.md +++ b/services/apps/packages_worker/src/deps-dev/README.md @@ -63,8 +63,8 @@ The mode-specific key takes precedence over the generic key. Value must be a pos | `BQ_DATASET_INGEST_DEPENDENT_COUNTS_NUGET_MAX_BQ_GB` | 200 | `dependent_counts_nuget` | NUGET exact reverse transitive closure over `NuGetRequirementsLatest` (script mode). All 3 count columns. 
`maximumBytesBilled` runaway cap above the measured ~32 GB. | | `BQ_DATASET_INGEST_SCORECARD_REPOS_MAX_BQ_GB` | 50 | `scorecard_repos` | | | `BQ_DATASET_INGEST_SCORECARD_CHECKS_MAX_BQ_GB` | 500 | `scorecard_checks` | | -| `BQ_DATASET_INGEST_PYPI_DOWNLOADS_30D_MAX_BQ_GB` | 6000 | `pypi_downloads_30d` | Per 30-day window scan (~4.56 TB measured; set in `ingestPypiDownloads.ts`). | -| `BQ_DATASET_INGEST_PYPI_DOWNLOADS_DAILY_MAX_BQ_GB` | 2000 | `pypi_downloads_daily` | Daily 2-day trailing window (~300 GB); scales with backfill range, raise for long backfills. | +| `BQ_DATASET_INGEST_PYPI_DOWNLOADS_30D_MAX_BQ_GB` | 6000 | `pypi_downloads_30d` | Per 30-day window scan (~4.56 TB measured; set in `ingestPypiDownloads.ts`). | +| `BQ_DATASET_INGEST_PYPI_DOWNLOADS_DAILY_MAX_BQ_GB` | 2000 | `pypi_downloads_daily` | Daily 2-day trailing window (~300 GB); scales with backfill range, raise for long backfills. | The override logic lives in `src/deps-dev/activities/bqExportToGcs.ts`. From fd4dbd7017fd5901452a36c15482ca84687004e9 Mon Sep 17 00:00:00 2001 From: anilb Date: Thu, 2 Jul 2026 11:53:54 +0200 Subject: [PATCH 10/10] build: register pypi-worker in packages builder Signed-off-by: anilb --- scripts/builders/packages.env | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/builders/packages.env b/scripts/builders/packages.env index 191e44ea95..8323a94c66 100644 --- a/scripts/builders/packages.env +++ b/scripts/builders/packages.env @@ -1,4 +1,4 @@ DOCKERFILE="./services/docker/Dockerfile.packages" CONTEXT="../" REPO="sjc.ocir.io/axbydjxa5zuh/packages" -SERVICES="github-repos-enricher bq-dataset-ingest npm-worker maven-worker osv-worker dockerhub-sync cargo-worker go-worker nuget-worker" +SERVICES="github-repos-enricher bq-dataset-ingest npm-worker pypi-worker maven-worker osv-worker dockerhub-sync cargo-worker go-worker nuget-worker"