From f7769a6fc8bdc6de457429c3f748207603a9da2c Mon Sep 17 00:00:00 2001 From: adarsh0728 Date: Thu, 7 May 2026 19:56:54 +0530 Subject: [PATCH 1/2] add log_level support for Go components and document log level controls Signed-off-by: adarsh0728 --- docs/development/debugging.md | 6 + .../configuration/environment-variables.md | 10 +- .../reference/configuration/log-levels.md | 186 ++++++++++++++++++ mkdocs.yml | 1 + pkg/apis/numaflow/v1alpha1/const.go | 1 + pkg/mvtxdaemon/server/service/rater/rater.go | 10 +- pkg/shared/logging/log.go | 14 +- pkg/shared/logging/log_test.go | 89 +++++++++ 8 files changed, 312 insertions(+), 5 deletions(-) create mode 100644 docs/user-guide/reference/configuration/log-levels.md create mode 100644 pkg/shared/logging/log_test.go diff --git a/docs/development/debugging.md b/docs/development/debugging.md index 68a3adaad8..71fc5abb64 100644 --- a/docs/development/debugging.md +++ b/docs/development/debugging.md @@ -1,5 +1,11 @@ # How To Debug +## Controlling Log Level + +For a full reference on log-level controls — including which pods are affected, YAML snippets for every component, and `RUST_LOG` for Rust data-plane pods — see [Log Levels](../user-guide/reference/configuration/log-levels.md). + +## Debug Logs + To enable debug logs in a Vertex Pod, set environment variable `NUMAFLOW_DEBUG` to `true` for the Vertex. For example: ```yaml diff --git a/docs/user-guide/reference/configuration/environment-variables.md b/docs/user-guide/reference/configuration/environment-variables.md index 5da09b97c3..314ee66f62 100644 --- a/docs/user-guide/reference/configuration/environment-variables.md +++ b/docs/user-guide/reference/configuration/environment-variables.md @@ -1,6 +1,14 @@ # Environment Variables -For the `numa` container of vertex pods, environment variable `NUMAFLOW_DEBUG` can be set to `true` for [debugging](../../../development/debugging.md). +## Log level control + +Numaflow exposes three env vars for controlling log verbosity across its pods: + +- `NUMAFLOW_LOG_LEVEL` — sets the log level for all **Go** components (daemon, controller, webhook, UX server, ISB service jobs). Accepts any [zapcore level](https://pkg.go.dev/go.uber.org/zap/zapcore#Level) (`debug`, `info`, `warn`, `error`, etc.). Overrides the level implied by `NUMAFLOW_DEBUG`. Invalid values are silently ignored. +- `RUST_LOG` — sets the log level for all **Rust** data-plane pods (vertex `numa` container, MonoVertex `numa` container, serving pods). Accepts standard [`tracing-subscriber` EnvFilter syntax](https://docs.rs/tracing-subscriber/latest/tracing_subscriber/filter/struct.EnvFilter.html) (e.g. `warn`, `numaflow_core=debug,info`). +- `NUMAFLOW_DEBUG` — development shortcut honored by both runtimes; sets level to `debug` and switches log output from JSON to human-readable text. **Note:** the format change may break log shippers expecting JSON — prefer `NUMAFLOW_LOG_LEVEL` or `RUST_LOG` when only the level needs changing. + +See [Log Levels](log-levels.md) for a full pod inventory, per-component YAML examples, and common recipes. In [`udf`](../../user-defined-functions/map/map.md), [`udsink`](../../sinks/user-defined-sinks.md) and [`transformer`](../../sources/transformer/overview.md) containers, there are some preset environment variables that can be used directly. diff --git a/docs/user-guide/reference/configuration/log-levels.md b/docs/user-guide/reference/configuration/log-levels.md new file mode 100644 index 0000000000..5b2744e015 --- /dev/null +++ b/docs/user-guide/reference/configuration/log-levels.md @@ -0,0 +1,186 @@ +# Log Levels + +Numaflow pods use two separate logging systems depending on whether they run Go or Rust code. This page explains how to control the log level for each. + +## Quick reference + +| Pod / container | Runtime | Log-level env var | Default level | +|---|---|---|---| +| Pipeline daemon | Go | `NUMAFLOW_LOG_LEVEL` | `info` | +| MonoVertex daemon | Go | `NUMAFLOW_LOG_LEVEL` | `info` | +| ISB svc create / delete job | Go | `NUMAFLOW_LOG_LEVEL` | `info` | +| ISB svc validate (init container) | Go | `NUMAFLOW_LOG_LEVEL` | `info` | +| Controller (`numaflow-controller`) | Go | `NUMAFLOW_LOG_LEVEL` | `info` | +| Webhook (`numaflow-webhook`) | Go | `NUMAFLOW_LOG_LEVEL` | `info` | +| UX server (`numaflow-server`) | Go | `NUMAFLOW_LOG_LEVEL` | `info` | +| Pipeline vertex `numa` container | Rust | `RUST_LOG` | `info` | +| MonoVertex `numa` container | Rust | `RUST_LOG` | `info` | +| Serving pod | Rust | `RUST_LOG` | `info` | +| InterStepBufferService (JetStream / Redis) | upstream image | n/a | n/a | + +`NUMAFLOW_DEBUG=true` is honored by **both** runtimes but has different effects (see [below](#numaflow_debug-interaction)). + +--- + +## Go components — `NUMAFLOW_LOG_LEVEL` + +All Go binaries (daemon, controller, webhook, UX server, ISB service jobs) use the shared `NewLogger()` helper, which reads `NUMAFLOW_LOG_LEVEL` at startup. + +**Accepted values:** any level recognized by [go.uber.org/zap/zapcore](https://pkg.go.dev/go.uber.org/zap/zapcore#Level): `debug`, `info`, `warn`, `error`, `dpanic`, `panic`, `fatal`. In practice `debug`, `info`, `warn`, and `error` are the useful operational values. + +**Default:** `info` + +**Precedence:** `NUMAFLOW_LOG_LEVEL` overrides the level implied by `NUMAFLOW_DEBUG`. Invalid values are silently ignored and the default level is used instead. + +### Pipeline daemon pod + +```yaml +apiVersion: numaflow.numaproj.io/v1alpha1 +kind: Pipeline +spec: + templates: + daemon: + containerTemplate: + env: + - name: NUMAFLOW_LOG_LEVEL + value: warn +``` + +### MonoVertex daemon pod + +```yaml +apiVersion: numaflow.numaproj.io/v1alpha1 +kind: MonoVertex +spec: + daemonTemplate: + containerTemplate: + env: + - name: NUMAFLOW_LOG_LEVEL + value: warn +``` + +### Controller, webhook, and UX server + +These are cluster-level components deployed via the install manifests. Set `NUMAFLOW_LOG_LEVEL` directly on the relevant Deployment: + +```yaml +# numaflow-controller Deployment +env: + - name: NUMAFLOW_LOG_LEVEL + value: warn +``` + +### ISB service jobs + +Init and finalizer jobs for pipeline ISB creation/deletion/validation: + +```yaml +apiVersion: numaflow.numaproj.io/v1alpha1 +kind: Pipeline +spec: + templates: + job: + containerTemplate: + env: + - name: NUMAFLOW_LOG_LEVEL + value: warn +``` + +--- + +## Rust components — `RUST_LOG` + +Pipeline vertex pods, MonoVertex pods, and Serving pods run the Numaflow Rust data-plane binary. These use the [`tracing-subscriber`](https://docs.rs/tracing-subscriber) `EnvFilter`, which reads `RUST_LOG` at startup. + +**Accepted values:** standard `EnvFilter` syntax — simple level names (`debug`, `info`, `warn`, `error`) or per-crate directives (`numaflow_core=debug,h2=warn,info`). + +**Default:** `info` + +### Pipeline vertex pod + +```yaml +apiVersion: numaflow.numaproj.io/v1alpha1 +kind: Pipeline +spec: + vertices: + - name: my-vertex + containerTemplate: + env: + - name: RUST_LOG + value: warn +``` + +To enable debug logs for a specific crate only: + +```yaml + - name: RUST_LOG + value: "numaflow_core=debug,info" +``` + +### MonoVertex pod + +```yaml +apiVersion: numaflow.numaproj.io/v1alpha1 +kind: MonoVertex +spec: + containerTemplate: + env: + - name: RUST_LOG + value: warn +``` + +### Serving pod + +```yaml +apiVersion: numaflow.numaproj.io/v1alpha1 +kind: ServingPipeline +spec: + serving: + containerTemplate: + env: + - name: RUST_LOG + value: warn +``` + +--- + +## `NUMAFLOW_DEBUG` interaction + +`NUMAFLOW_DEBUG=true` is a development shortcut. Its effects differ by runtime: + +| Effect | Go | Rust | +|---|---|---| +| Log level | Lowered to `debug` | Lowered to `debug` (plus `h2::codec=info`) | +| Log format | Switches from JSON to console (human-readable) | Switches from JSON to human-readable text | +| Stacktraces | Added at `warn`+ (vs `error`+ by default) | n/a | + +**Important:** switching from JSON to text format on the Rust side (`NUMAFLOW_DEBUG=true`) may break log shippers or aggregators that expect structured JSON. Prefer `RUST_LOG=debug` to lower the level without changing the output format. + +`NUMAFLOW_LOG_LEVEL` overrides the level for Go components regardless of `NUMAFLOW_DEBUG`. There is no equivalent override on the Rust side — use `RUST_LOG` instead. + +--- + +## Common recipes + +**Suppress idle-rater info noise on a MonoVertex daemon:** +```yaml +# MonoVertex.spec.daemonTemplate.containerTemplate.env +- name: NUMAFLOW_LOG_LEVEL + value: warn +``` + +**Enable debug for a single Rust crate without flooding all logs:** +```yaml +# Pipeline.spec.vertices[].containerTemplate.env +- name: RUST_LOG + value: "numaflow_core=debug,info" +``` + +**Enable full debug on a vertex pod (both Go sidecar and Rust data-plane):** +```yaml +# Pipeline.spec.vertices[].containerTemplate.env +- name: NUMAFLOW_DEBUG + value: "true" +# Note: this also switches log output from JSON to text. +# To keep JSON format while lowering the Rust level, use RUST_LOG=debug instead. +``` diff --git a/mkdocs.yml b/mkdocs.yml index 1b4ea37fcf..0815c8c93c 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -125,6 +125,7 @@ nav: - user-guide/reference/configuration/container-resources.md - user-guide/reference/configuration/volumes.md - user-guide/reference/configuration/environment-variables.md + - user-guide/reference/configuration/log-levels.md - user-guide/reference/configuration/labels-and-annotations.md - user-guide/reference/configuration/init-containers.md - user-guide/reference/configuration/sidecar-containers.md diff --git a/pkg/apis/numaflow/v1alpha1/const.go b/pkg/apis/numaflow/v1alpha1/const.go index 46aaec973b..73a9d51ef2 100644 --- a/pkg/apis/numaflow/v1alpha1/const.go +++ b/pkg/apis/numaflow/v1alpha1/const.go @@ -129,6 +129,7 @@ const ( EnvLeaderElectionLeaseRenewPeriod = "NUMAFLOW_LEADER_ELECTION_LEASE_RENEW_PERIOD" EnvUDContainerType = "NUMAFLOW_UD_CONTAINER_TYPE" EnvDebug = "NUMAFLOW_DEBUG" + EnvLogLevel = "NUMAFLOW_LOG_LEVEL" EnvPPROF = "NUMAFLOW_PPROF" EnvHealthCheckDisabled = "NUMAFLOW_HEALTH_CHECK_DISABLED" EnvGRPCMaxMessageSize = "NUMAFLOW_GRPC_MAX_MESSAGE_SIZE" diff --git a/pkg/mvtxdaemon/server/service/rater/rater.go b/pkg/mvtxdaemon/server/service/rater/rater.go index 6269d10719..09bf9e73a5 100644 --- a/pkg/mvtxdaemon/server/service/rater/rater.go +++ b/pkg/mvtxdaemon/server/service/rater/rater.go @@ -270,7 +270,11 @@ func (r *Rater) getPodMetrics(podName string) map[string]*dto.MetricFamily { func (r *Rater) getPodReadCounts(podName string, result map[string]*dto.MetricFamily) *PodMetricsCount { value, ok := result[monoVtxReadMetricName] if !ok || value == nil || len(value.GetMetric()) == 0 { - r.log.Infof("[Pod name %s]: Metric %q is unavailable, the pod might haven't started processing data", podName, monoVtxReadMetricName) + // Logged at debug because this fires on every rater tick when the queue is idle (no messages + // ever read since pod start), since the Rust prometheus client only registers a counter after + // it is first incremented. + // To suppress, set NUMAFLOW_LOG_LEVEL=warn on the daemon container. + r.log.Debugf("[Pod name %s]: Metric %q is unavailable, the pod might haven't started processing data", podName, monoVtxReadMetricName) return nil } @@ -293,7 +297,9 @@ func (r *Rater) getPodPendingCounts(podName string, result map[string]*dto.Metri podPendingCount := &PodMetricsCount{podName, metricsList[0].Gauge.GetValue()} return podPendingCount } else { - r.log.Infof("[Pod name %s]: Metric %q is unavailable, the pod might haven't started processing data", podName, monoVtxPendingRawMetric) + // Same rationale as getPodReadCounts: gauge may not be emitted yet on an idle pod. + // To suppress, set NUMAFLOW_LOG_LEVEL=warn on the daemon container. + r.log.Debugf("[Pod name %s]: Metric %q is unavailable, the pod might haven't started processing data", podName, monoVtxPendingRawMetric) } return nil } diff --git a/pkg/shared/logging/log.go b/pkg/shared/logging/log.go index cc1226d20e..bd1d7fab48 100644 --- a/pkg/shared/logging/log.go +++ b/pkg/shared/logging/log.go @@ -24,7 +24,11 @@ import ( "go.uber.org/zap/zapcore" ) -// NewLogger returns a new zap.SugaredLogger +// NewLogger returns a new zap.SugaredLogger. +// Log level can be overridden at runtime via the NUMAFLOW_LOG_LEVEL env var +// (accepts any zapcore level: debug, info, warn, error, dpanic, panic, fatal). +// NUMAFLOW_DEBUG=true selects the development preset (console encoder, debug level). +// NUMAFLOW_LOG_LEVEL overrides the level chosen by NUMAFLOW_DEBUG; invalid values are silently ignored. func NewLogger() *zap.SugaredLogger { var config zap.Config debugMode, ok := os.LookupEnv("NUMAFLOW_DEBUG") @@ -33,7 +37,13 @@ func NewLogger() *zap.SugaredLogger { } else { config = zap.NewProductionConfig() } - // Config customization goes here if any + // NUMAFLOW_LOG_LEVEL overrides the level set by the preset above. + // Invalid values are silently ignored so a typo in a manifest does not crash the pod. + if lvlStr, ok := os.LookupEnv("NUMAFLOW_LOG_LEVEL"); ok { + if lvl, err := zapcore.ParseLevel(lvlStr); err == nil { + config.Level = zap.NewAtomicLevelAt(lvl) + } + } config.EncoderConfig.EncodeTime = zapcore.RFC3339NanoTimeEncoder config.OutputPaths = []string{"stdout"} logger, err := config.Build() diff --git a/pkg/shared/logging/log_test.go b/pkg/shared/logging/log_test.go new file mode 100644 index 0000000000..71db5fb9ca --- /dev/null +++ b/pkg/shared/logging/log_test.go @@ -0,0 +1,89 @@ +/* +Copyright 2022 The Numaproj Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package logging + +import ( + "testing" + + "go.uber.org/zap/zapcore" +) + +func TestNewLogger_DefaultLevel(t *testing.T) { + t.Setenv("NUMAFLOW_DEBUG", "") + t.Setenv("NUMAFLOW_LOG_LEVEL", "") + logger := NewLogger() + if logger == nil { + t.Fatal("expected non-nil logger") + } +} + +func TestNewLogger_LogLevel(t *testing.T) { + tests := []struct { + envVal string + wantLevel zapcore.Level + }{ + {"debug", zapcore.DebugLevel}, + {"info", zapcore.InfoLevel}, + {"warn", zapcore.WarnLevel}, + {"error", zapcore.ErrorLevel}, + } + for _, tt := range tests { + t.Run(tt.envVal, func(t *testing.T) { + t.Setenv("NUMAFLOW_DEBUG", "") + t.Setenv("NUMAFLOW_LOG_LEVEL", tt.envVal) + logger := NewLogger() + if logger == nil { + t.Fatal("expected non-nil logger") + } + if !logger.Level().Enabled(tt.wantLevel) { + t.Errorf("expected level %v to be enabled", tt.wantLevel) + } + if tt.wantLevel != zapcore.DebugLevel && logger.Level().Enabled(zapcore.DebugLevel) { + t.Errorf("expected debug level to be disabled for %v", tt.envVal) + } + }) + } +} + +func TestNewLogger_InvalidLogLevel_FallsBackToDefault(t *testing.T) { + t.Setenv("NUMAFLOW_DEBUG", "") + t.Setenv("NUMAFLOW_LOG_LEVEL", "garbage") + logger := NewLogger() + if logger == nil { + t.Fatal("expected non-nil logger") + } + // default production config is info; debug should be disabled + if logger.Level().Enabled(zapcore.DebugLevel) { + t.Error("expected debug to be disabled on invalid NUMAFLOW_LOG_LEVEL") + } +} + +func TestNewLogger_LogLevelOverridesDebugPreset(t *testing.T) { + // NUMAFLOW_DEBUG=true sets level to debug; NUMAFLOW_LOG_LEVEL=warn should win + t.Setenv("NUMAFLOW_DEBUG", "true") + t.Setenv("NUMAFLOW_LOG_LEVEL", "warn") + logger := NewLogger() + if logger == nil { + t.Fatal("expected non-nil logger") + } + if logger.Level().Enabled(zapcore.InfoLevel) { + t.Error("expected info to be disabled when NUMAFLOW_LOG_LEVEL=warn overrides NUMAFLOW_DEBUG=true") + } + if !logger.Level().Enabled(zapcore.WarnLevel) { + t.Error("expected warn to be enabled") + } +} From 59f726693258c9de08e33b4bddb8c2fe305dc18f Mon Sep 17 00:00:00 2001 From: adarsh0728 Date: Mon, 11 May 2026 09:19:15 +0530 Subject: [PATCH 2/2] platform wide log level support Signed-off-by: adarsh0728 --- docs/development/debugging.md | 2 +- .../configuration/environment-variables.md | 6 +- .../reference/configuration/log-levels.md | 89 +++++++++--------- pkg/shared/logging/log.go | 36 ++++++-- pkg/shared/logging/log_test.go | 11 +++ rust/numaflow/src/setup_tracing.rs | 91 +++++++++++++++++-- 6 files changed, 174 insertions(+), 61 deletions(-) diff --git a/docs/development/debugging.md b/docs/development/debugging.md index 71fc5abb64..6cc0a04f34 100644 --- a/docs/development/debugging.md +++ b/docs/development/debugging.md @@ -2,7 +2,7 @@ ## Controlling Log Level -For a full reference on log-level controls — including which pods are affected, YAML snippets for every component, and `RUST_LOG` for Rust data-plane pods — see [Log Levels](../user-guide/reference/configuration/log-levels.md). +For a full reference on log-level controls — including which pods are affected, YAML snippets for every component, and advanced `RUST_LOG` filtering for data-plane pods — see [Log Levels](../user-guide/reference/configuration/log-levels.md). ## Debug Logs diff --git a/docs/user-guide/reference/configuration/environment-variables.md b/docs/user-guide/reference/configuration/environment-variables.md index 314ee66f62..8cd4047e68 100644 --- a/docs/user-guide/reference/configuration/environment-variables.md +++ b/docs/user-guide/reference/configuration/environment-variables.md @@ -4,9 +4,9 @@ Numaflow exposes three env vars for controlling log verbosity across its pods: -- `NUMAFLOW_LOG_LEVEL` — sets the log level for all **Go** components (daemon, controller, webhook, UX server, ISB service jobs). Accepts any [zapcore level](https://pkg.go.dev/go.uber.org/zap/zapcore#Level) (`debug`, `info`, `warn`, `error`, etc.). Overrides the level implied by `NUMAFLOW_DEBUG`. Invalid values are silently ignored. -- `RUST_LOG` — sets the log level for all **Rust** data-plane pods (vertex `numa` container, MonoVertex `numa` container, serving pods). Accepts standard [`tracing-subscriber` EnvFilter syntax](https://docs.rs/tracing-subscriber/latest/tracing_subscriber/filter/struct.EnvFilter.html) (e.g. `warn`, `numaflow_core=debug,info`). -- `NUMAFLOW_DEBUG` — development shortcut honored by both runtimes; sets level to `debug` and switches log output from JSON to human-readable text. **Note:** the format change may break log shippers expecting JSON — prefer `NUMAFLOW_LOG_LEVEL` or `RUST_LOG` when only the level needs changing. +- `NUMAFLOW_LOG_LEVEL` — sets the log level (`debug`, `info`, `warn`, `error`) for Numaflow-owned components. Overrides the level implied by `NUMAFLOW_DEBUG`. +- `RUST_LOG` — advanced override for data-plane pods (vertex `numa` container, MonoVertex `numa` container, serving pods). Accepts standard [`tracing-subscriber` EnvFilter syntax](https://docs.rs/tracing-subscriber/latest/tracing_subscriber/filter/struct.EnvFilter.html) (e.g. `warn`, `numaflow_core=debug,info`) and takes precedence over `NUMAFLOW_LOG_LEVEL`. +- `NUMAFLOW_DEBUG` — development shortcut; sets level to `debug` and may switch log output from JSON to human-readable text. **Note:** the format change may break log shippers expecting JSON — prefer `NUMAFLOW_LOG_LEVEL` when only the level needs changing. See [Log Levels](log-levels.md) for a full pod inventory, per-component YAML examples, and common recipes. diff --git a/docs/user-guide/reference/configuration/log-levels.md b/docs/user-guide/reference/configuration/log-levels.md index 5b2744e015..14f7a8eeda 100644 --- a/docs/user-guide/reference/configuration/log-levels.md +++ b/docs/user-guide/reference/configuration/log-levels.md @@ -1,36 +1,36 @@ # Log Levels -Numaflow pods use two separate logging systems depending on whether they run Go or Rust code. This page explains how to control the log level for each. +Numaflow-owned pods use `NUMAFLOW_LOG_LEVEL` as the standard log-level control. Data-plane pods also support `RUST_LOG` for advanced filtering. ## Quick reference -| Pod / container | Runtime | Log-level env var | Default level | -|---|---|---|---| -| Pipeline daemon | Go | `NUMAFLOW_LOG_LEVEL` | `info` | -| MonoVertex daemon | Go | `NUMAFLOW_LOG_LEVEL` | `info` | -| ISB svc create / delete job | Go | `NUMAFLOW_LOG_LEVEL` | `info` | -| ISB svc validate (init container) | Go | `NUMAFLOW_LOG_LEVEL` | `info` | -| Controller (`numaflow-controller`) | Go | `NUMAFLOW_LOG_LEVEL` | `info` | -| Webhook (`numaflow-webhook`) | Go | `NUMAFLOW_LOG_LEVEL` | `info` | -| UX server (`numaflow-server`) | Go | `NUMAFLOW_LOG_LEVEL` | `info` | -| Pipeline vertex `numa` container | Rust | `RUST_LOG` | `info` | -| MonoVertex `numa` container | Rust | `RUST_LOG` | `info` | -| Serving pod | Rust | `RUST_LOG` | `info` | -| InterStepBufferService (JetStream / Redis) | upstream image | n/a | n/a | - -`NUMAFLOW_DEBUG=true` is honored by **both** runtimes but has different effects (see [below](#numaflow_debug-interaction)). +| Pod / container | Standard log-level env var | Default level | +|---|---|---| +| Pipeline daemon | `NUMAFLOW_LOG_LEVEL` | `info` | +| MonoVertex daemon | `NUMAFLOW_LOG_LEVEL` | `info` | +| ISB svc create / delete job | `NUMAFLOW_LOG_LEVEL` | `info` | +| ISB svc validate (init container) | `NUMAFLOW_LOG_LEVEL` | `info` | +| Controller (`numaflow-controller`) | `NUMAFLOW_LOG_LEVEL` | `info` | +| Webhook (`numaflow-webhook`) | `NUMAFLOW_LOG_LEVEL` | `info` | +| UX server (`numaflow-server`) | `NUMAFLOW_LOG_LEVEL` | `info` | +| Pipeline vertex `numa` container | `NUMAFLOW_LOG_LEVEL` | `info` | +| MonoVertex `numa` container | `NUMAFLOW_LOG_LEVEL` | `info` | +| Serving pod | `NUMAFLOW_LOG_LEVEL` | `info` | +| InterStepBufferService (JetStream / Redis) | n/a | n/a | + +`NUMAFLOW_DEBUG=true` is also supported as a development shortcut (see [below](#numaflow_debug-interaction)). --- -## Go components — `NUMAFLOW_LOG_LEVEL` +## Standard log levels — `NUMAFLOW_LOG_LEVEL` -All Go binaries (daemon, controller, webhook, UX server, ISB service jobs) use the shared `NewLogger()` helper, which reads `NUMAFLOW_LOG_LEVEL` at startup. +Numaflow-owned components read `NUMAFLOW_LOG_LEVEL` at startup. -**Accepted values:** any level recognized by [go.uber.org/zap/zapcore](https://pkg.go.dev/go.uber.org/zap/zapcore#Level): `debug`, `info`, `warn`, `error`, `dpanic`, `panic`, `fatal`. In practice `debug`, `info`, `warn`, and `error` are the useful operational values. +**Accepted values:** `debug`, `info`, `warn`, `error`. **Default:** `info` -**Precedence:** `NUMAFLOW_LOG_LEVEL` overrides the level implied by `NUMAFLOW_DEBUG`. Invalid values are silently ignored and the default level is used instead. +**Precedence:** `NUMAFLOW_LOG_LEVEL` overrides the level implied by `NUMAFLOW_DEBUG`. Invalid values fall back to the level selected by `NUMAFLOW_DEBUG` or the default. For data-plane pods, `RUST_LOG` takes precedence over `NUMAFLOW_LOG_LEVEL` when set. ### Pipeline daemon pod @@ -88,11 +88,11 @@ spec: --- -## Rust components — `RUST_LOG` +## Pipeline, MonoVertex, and Serving pods -Pipeline vertex pods, MonoVertex pods, and Serving pods run the Numaflow Rust data-plane binary. These use the [`tracing-subscriber`](https://docs.rs/tracing-subscriber) `EnvFilter`, which reads `RUST_LOG` at startup. +Pipeline vertex pods, MonoVertex pods, and Serving pods use `NUMAFLOW_LOG_LEVEL` for common log-level cases: -**Accepted values:** standard `EnvFilter` syntax — simple level names (`debug`, `info`, `warn`, `error`) or per-crate directives (`numaflow_core=debug,h2=warn,info`). +**Accepted values:** `debug`, `info`, `warn`, `error`. **Default:** `info` @@ -106,17 +106,10 @@ spec: - name: my-vertex containerTemplate: env: - - name: RUST_LOG + - name: NUMAFLOW_LOG_LEVEL value: warn ``` -To enable debug logs for a specific crate only: - -```yaml - - name: RUST_LOG - value: "numaflow_core=debug,info" -``` - ### MonoVertex pod ```yaml @@ -125,7 +118,7 @@ kind: MonoVertex spec: containerTemplate: env: - - name: RUST_LOG + - name: NUMAFLOW_LOG_LEVEL value: warn ``` @@ -138,25 +131,31 @@ spec: serving: containerTemplate: env: - - name: RUST_LOG + - name: NUMAFLOW_LOG_LEVEL value: warn ``` +### Advanced data-plane filtering — `RUST_LOG` + +Data-plane pods also support standard [`tracing-subscriber` EnvFilter syntax](https://docs.rs/tracing-subscriber/latest/tracing_subscriber/filter/struct.EnvFilter.html) via `RUST_LOG`. Use this only when you need fine-grained filtering. When `RUST_LOG` is set, it takes precedence over `NUMAFLOW_LOG_LEVEL`. + +For example, to enable debug logs for a specific target only: + +```yaml +# Pipeline.spec.vertices[].containerTemplate.env +- name: RUST_LOG + value: "numaflow_core=debug,info" +``` + --- ## `NUMAFLOW_DEBUG` interaction -`NUMAFLOW_DEBUG=true` is a development shortcut. Its effects differ by runtime: - -| Effect | Go | Rust | -|---|---|---| -| Log level | Lowered to `debug` | Lowered to `debug` (plus `h2::codec=info`) | -| Log format | Switches from JSON to console (human-readable) | Switches from JSON to human-readable text | -| Stacktraces | Added at `warn`+ (vs `error`+ by default) | n/a | +`NUMAFLOW_DEBUG=true` is a development shortcut. It lowers the default log level to `debug` and may switch log output from structured JSON to human-readable text. -**Important:** switching from JSON to text format on the Rust side (`NUMAFLOW_DEBUG=true`) may break log shippers or aggregators that expect structured JSON. Prefer `RUST_LOG=debug` to lower the level without changing the output format. +**Important:** switching from JSON to text format may break log shippers or aggregators that expect structured JSON. Prefer `NUMAFLOW_LOG_LEVEL=debug` to lower the level without changing the output format. -`NUMAFLOW_LOG_LEVEL` overrides the level for Go components regardless of `NUMAFLOW_DEBUG`. There is no equivalent override on the Rust side — use `RUST_LOG` instead. +`NUMAFLOW_LOG_LEVEL` overrides the level implied by `NUMAFLOW_DEBUG` without changing the format selected by `NUMAFLOW_DEBUG`. For data-plane pods, `RUST_LOG` takes precedence over `NUMAFLOW_LOG_LEVEL` when set. --- @@ -169,18 +168,18 @@ spec: value: warn ``` -**Enable debug for a single Rust crate without flooding all logs:** +**Enable debug for a single data-plane target without flooding all logs:** ```yaml # Pipeline.spec.vertices[].containerTemplate.env - name: RUST_LOG value: "numaflow_core=debug,info" ``` -**Enable full debug on a vertex pod (both Go sidecar and Rust data-plane):** +**Enable full debug on a vertex pod:** ```yaml # Pipeline.spec.vertices[].containerTemplate.env - name: NUMAFLOW_DEBUG value: "true" # Note: this also switches log output from JSON to text. -# To keep JSON format while lowering the Rust level, use RUST_LOG=debug instead. +# To keep JSON format while lowering the level, use NUMAFLOW_LOG_LEVEL=debug instead. ``` diff --git a/pkg/shared/logging/log.go b/pkg/shared/logging/log.go index bd1d7fab48..5c118c32fa 100644 --- a/pkg/shared/logging/log.go +++ b/pkg/shared/logging/log.go @@ -18,30 +18,39 @@ package logging import ( "context" + "fmt" "os" + "strings" zap "go.uber.org/zap" "go.uber.org/zap/zapcore" ) +const ( + envDebug = "NUMAFLOW_DEBUG" + envLogLevel = "NUMAFLOW_LOG_LEVEL" +) + // NewLogger returns a new zap.SugaredLogger. // Log level can be overridden at runtime via the NUMAFLOW_LOG_LEVEL env var -// (accepts any zapcore level: debug, info, warn, error, dpanic, panic, fatal). +// (accepted values: debug, info, warn, error). // NUMAFLOW_DEBUG=true selects the development preset (console encoder, debug level). -// NUMAFLOW_LOG_LEVEL overrides the level chosen by NUMAFLOW_DEBUG; invalid values are silently ignored. +// NUMAFLOW_LOG_LEVEL overrides the level chosen by NUMAFLOW_DEBUG. func NewLogger() *zap.SugaredLogger { var config zap.Config - debugMode, ok := os.LookupEnv("NUMAFLOW_DEBUG") + debugMode, ok := os.LookupEnv(envDebug) if ok && debugMode == "true" { config = zap.NewDevelopmentConfig() } else { config = zap.NewProductionConfig() } // NUMAFLOW_LOG_LEVEL overrides the level set by the preset above. - // Invalid values are silently ignored so a typo in a manifest does not crash the pod. - if lvlStr, ok := os.LookupEnv("NUMAFLOW_LOG_LEVEL"); ok { - if lvl, err := zapcore.ParseLevel(lvlStr); err == nil { + // Invalid values fall back to the preset level so a typo does not crash the pod. + if lvlStr, ok := os.LookupEnv(envLogLevel); ok && strings.TrimSpace(lvlStr) != "" { + if lvl, ok := parseLogLevel(lvlStr); ok { config.Level = zap.NewAtomicLevelAt(lvl) + } else { + _, _ = fmt.Fprintf(os.Stderr, "invalid %s=%q, using default log level\n", envLogLevel, lvlStr) } } config.EncoderConfig.EncodeTime = zapcore.RFC3339NanoTimeEncoder @@ -53,6 +62,21 @@ func NewLogger() *zap.SugaredLogger { return logger.Named("numaflow").Sugar() } +func parseLogLevel(level string) (zapcore.Level, bool) { + switch strings.ToLower(strings.TrimSpace(level)) { + case "debug": + return zapcore.DebugLevel, true + case "info": + return zapcore.InfoLevel, true + case "warn": + return zapcore.WarnLevel, true + case "error": + return zapcore.ErrorLevel, true + default: + return zapcore.InfoLevel, false + } +} + type loggerKey struct{} // WithLogger returns a copy of parent context in which the diff --git a/pkg/shared/logging/log_test.go b/pkg/shared/logging/log_test.go index 71db5fb9ca..ee84cb5f91 100644 --- a/pkg/shared/logging/log_test.go +++ b/pkg/shared/logging/log_test.go @@ -39,6 +39,7 @@ func TestNewLogger_LogLevel(t *testing.T) { {"debug", zapcore.DebugLevel}, {"info", zapcore.InfoLevel}, {"warn", zapcore.WarnLevel}, + {"WARN", zapcore.WarnLevel}, {"error", zapcore.ErrorLevel}, } for _, tt := range tests { @@ -87,3 +88,13 @@ func TestNewLogger_LogLevelOverridesDebugPreset(t *testing.T) { t.Error("expected warn to be enabled") } } + +func TestParseLogLevelRejectsRuntimeSpecificLevels(t *testing.T) { + for _, level := range []string{"dpanic", "panic", "fatal"} { + t.Run(level, func(t *testing.T) { + if _, ok := parseLogLevel(level); ok { + t.Fatalf("expected %q to be rejected", level) + } + }) + } +} diff --git a/rust/numaflow/src/setup_tracing.rs b/rust/numaflow/src/setup_tracing.rs index 7a19d8ad1f..028bee0e6a 100644 --- a/rust/numaflow/src/setup_tracing.rs +++ b/rust/numaflow/src/setup_tracing.rs @@ -7,6 +7,11 @@ use tracing_subscriber::{Layer, filter::EnvFilter, fmt}; use std::backtrace::{Backtrace, BacktraceStatus}; use std::panic::PanicHookInfo; +const DEFAULT_LOG_LEVEL: &str = "info"; +const DEBUG_LOG_LEVEL: &str = "debug,h2::codec=info"; +const NUMAFLOW_LOG_LEVEL_ENV: &str = "NUMAFLOW_LOG_LEVEL"; +const RUST_LOG_ENV: &str = "RUST_LOG"; + /// Panic hook to send panic info to `tracing` instead of stderr. /// Without this, a panic will be logged to stderr as: /// ``` @@ -196,19 +201,55 @@ impl Drop for TracerProviderGuard { } } +fn parse_numaflow_log_level(level: &str) -> Option<&'static str> { + match level.trim().to_ascii_lowercase().as_str() { + "debug" => Some("debug"), + "info" => Some("info"), + "warn" => Some("warn"), + "error" => Some("error"), + _ => None, + } +} + +fn default_log_directive( + debug_mode: bool, + numaflow_log_level: Option<&str>, + rust_log_set: bool, +) -> &'static str { + if rust_log_set { + return DEFAULT_LOG_LEVEL; + } + + if let Some(level) = numaflow_log_level + && !level.trim().is_empty() + { + if let Some(parsed_level) = parse_numaflow_log_level(level) { + return parsed_level; + } + eprintln!( + "[setup_tracing] Invalid {NUMAFLOW_LOG_LEVEL_ENV}='{level}', using default log level" + ); + } + + if debug_mode { + DEBUG_LOG_LEVEL + } else { + DEFAULT_LOG_LEVEL + } +} + /// Initialize the tracing subscriber with optional OTLP export. /// Returns a `TracerProviderGuard` that will flush buffered spans on drop. /// Callers must bind it (e.g., `let _guard = register();`) rather than /// discard it, or the provider will shut down immediately. pub fn register() -> TracerProviderGuard { let debug_mode = std::env::var("NUMAFLOW_DEBUG").is_ok_and(|v| v.to_lowercase() == "true"); - let default_log_level = if debug_mode { - "debug,h2::codec=info" // "h2::codec" is too noisy - } else { - "info" - }; + let rust_log_set = std::env::var(RUST_LOG_ENV).is_ok_and(|v| !v.trim().is_empty()); + let numaflow_log_level = std::env::var(NUMAFLOW_LOG_LEVEL_ENV).ok(); + let default_log_level = + default_log_directive(debug_mode, numaflow_log_level.as_deref(), rust_log_set); - // Build filtering from default directives and allow `RUST_LOG` environment variable to override. + // Build filtering from Numaflow defaults and allow `RUST_LOG` to override with EnvFilter syntax. let filter = EnvFilter::builder() .with_default_directive(default_log_level.parse().unwrap_or(Level::INFO.into())) .from_env_lossy(); @@ -303,6 +344,44 @@ mod tests { )) } + #[test] + fn default_log_directive_uses_info_by_default() { + assert_eq!(default_log_directive(false, None, false), "info"); + } + + #[test] + fn default_log_directive_uses_debug_mode_default() { + assert_eq!( + default_log_directive(true, None, false), + "debug,h2::codec=info" + ); + } + + #[test] + fn default_log_directive_uses_numaflow_log_level() { + assert_eq!(default_log_directive(false, Some("warn"), false), "warn"); + assert_eq!(default_log_directive(false, Some("ERROR"), false), "error"); + } + + #[test] + fn default_log_directive_numaflow_log_level_overrides_debug() { + assert_eq!(default_log_directive(true, Some("warn"), false), "warn"); + } + + #[test] + fn default_log_directive_invalid_numaflow_log_level_falls_back() { + assert_eq!(default_log_directive(false, Some("verbose"), false), "info"); + assert_eq!( + default_log_directive(true, Some("verbose"), false), + "debug,h2::codec=info" + ); + } + + #[test] + fn default_log_directive_rust_log_takes_precedence() { + assert_eq!(default_log_directive(true, Some("warn"), true), "info"); + } + #[test] fn sampler_always_on() { assert!(matches!(