diff --git a/LICENSE-PiD.txt b/LICENSE-PiD.txt
new file mode 100644
index 00000000000..81f434709b0
--- /dev/null
+++ b/LICENSE-PiD.txt
@@ -0,0 +1,68 @@
+PiD (Pixel Diffusion Decoder) — License notice
+
+Upstream project: https://github.com/nv-tlabs/PiD
+Vendored under: invokeai/backend/pid/
+
+================================================================================
+CODE (Apache License 2.0)
+================================================================================
+
+The PiD source code, including the `pid/_src/` subtree and the `pid/_ext/imaginaire/`
+framework subset, is licensed under the Apache License, Version 2.0.
+
+Copyright 2026 NVIDIA CORPORATION & AFFILIATES.
+
+Portions of the framework (pid/_ext/imaginaire/) were originally adapted from
+the cosmos-predict2.5 project (https://github.com/nvidia-cosmos/cosmos-predict2.5/).
+
+Files vendored into invokeai/backend/pid/ retain their original SPDX-License-Identifier
+headers. The Apache 2.0 license text is available at:
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+================================================================================
+MODEL WEIGHTS (NVIDIA Source Code License v1 — non-commercial)
+================================================================================
+
+The pre-trained PiD decoder checkpoints distributed by NVIDIA at
+
+ https://huggingface.co/nvidia/PiD
+
+are released under the NVIDIA Source Code License v1 (NSCLv1), which permits use
+for non-commercial (research or evaluation) purposes only. License text:
+
+ https://huggingface.co/nvidia/PixelDiT-1300M-1024px/blob/main/LICENSE
+
+This restriction applies to the weights only, not to the InvokeAI source code
+or the vendored PiD source code (which remain Apache 2.0). Users are responsible
+for ensuring their use of the PiD weights complies with NSCLv1.
+
+================================================================================
+LOCAL MODIFICATIONS
+================================================================================
+
+The following changes were applied to the upstream PiD subset when vendoring:
+
+* All `pid.*` imports were rewritten to `invokeai.backend.pid.*`.
+* `pid/_src/configs/`, `pid/_src/tokenizers/`, `pid/_src/checkpointer/`,
+ `pid/_src/inference/_demo_*.py`, `from_*.py`, `create_dataset.py`,
+ `rae_generation.py`, and `scale_rae_generation.py` were dropped (not needed
+ for the decoder-only inference subset).
+* `pid/_ext/imaginaire/checkpointer/`, `trainer.py`, `visualize/`, `flags.py`,
+ `config.py`, `types/`, `utils/easy_io/`, `utils/callback.py`,
+ `utils/config_helper.py`, `utils/validator{,_params}.py` and the
+ `lazy_config/omegaconf_patch.py` were dropped.
+* The upstream `utils/log.py` (loguru-based) and `utils/misc.py` were replaced
+ with stdlib-based stubs covering only the API surface used by the decoder.
+* `lazy_config/file_io.py` (iopath PathManager) and `lazy_config/registry.py`
+ (fvcore Registry) were replaced with stdlib-only implementations.
+* `lazy_config/lazy.py` was reduced to a minimal `LazyCall`/`LazyConfig` stub;
+ the upstream yaml/cloudpickle/dill/detectron2 config save/load paths are
+ intentionally not supported.
+* `lazy_config/instantiate.py` was reduced to a stdlib-only implementation;
+ the upstream omegaconf `DictConfig`/`ListConfig` branches were dropped, so
+ no `omegaconf` dependency is required.
+* `_src/utils/model_loader.py` (which depended on Imaginaire's distributed
+ checkpointer + easy_io) and `_src/inference/inference_utils.py` (S3 / video
+ helpers) were removed; their decode-path equivalents are reimplemented in
+ `invokeai/backend/pid/decode.py`.
diff --git a/docs/src/content/docs/features/pid-decode.mdx b/docs/src/content/docs/features/pid-decode.mdx
new file mode 100644
index 00000000000..d630c318e29
--- /dev/null
+++ b/docs/src/content/docs/features/pid-decode.mdx
@@ -0,0 +1,76 @@
+---
+title: PiD Super-Resolution Decode
+lastUpdated: 2026-07-01
+sidebar:
+ order: 5
+---
+
+import { Steps, Aside, Tabs, TabItem } from '@astrojs/starlight/components'
+
+**PiD** (Pixel Diffusion Decoder) is an alternative way to turn a model's latents into an image. Instead of the usual VAE decode, it runs a short pixel-space diffusion that produces a **4× super-resolved** result in a single, few-step pass — so a 512×512 generation comes out as a detailed 2048×2048 image.
+
+Because it decodes in pixel space and is conditioned on your prompt, PiD often recovers finer texture and edge detail than a plain VAE decode followed by an upscaler.
+
+
+
+## Supported models
+
+PiD works with these base models:
+
+| Base model | PiD decoder to install |
+|---|---|
+| FLUX.1 | PiD Decoder FLUX |
+| FLUX.2 Klein (4B / 9B) | PiD Decoder FLUX.2 |
+| Stable Diffusion 3 | PiD Decoder SD3 |
+| SDXL | PiD Decoder SDXL |
+| Z-Image / Z-Image Turbo | **PiD Decoder FLUX** (Z-Image shares FLUX.1's VAE) |
+| Qwen-Image | PiD Decoder Qwen-Image |
+
+
+
+## What you need to install
+
+PiD needs two extra models, both available in **Model Manager → Starter Models**:
+
+
+1. A **PiD Decoder** for your base model (e.g. *PiD Decoder FLUX (2K)*). Some bases offer a *2K* and a *2K-to-4K* preset; SDXL and Qwen-Image ship only the *2K-to-4K* preset.
+2. The **Gemma 2 2B (PiD caption encoder)** — PiD uses it to condition the decode on your prompt. It installs automatically as a dependency of any PiD decoder, and is shared across all of them.
+
+
+Each PiD decoder is roughly 5 GB and the shared Gemma-2 encoder is roughly 5 GB.
+
+## Enabling PiD
+
+Open the **Generation** settings for a supported model and expand the advanced options. You'll find a **PiD** control with three modes:
+
+<Tabs>
+  <TabItem label="Off">
+    Standard VAE decode. No PiD models required.
+  </TabItem>
+  <TabItem label="Fit">
+    Generate at the requested size, decode 4× with PiD, then downscale the result back to the requested size. This is the safe default and works everywhere — the output matches your bounding box exactly, so it composites cleanly on the Canvas.
+  </TabItem>
+  <TabItem label="Native">
+    Treat the requested dimensions as the **4× target**: the image is generated at target ÷ 4 and PiD's full 4× output is used directly (no downscale), preserving all of the added detail. Great when you want a large, highly-detailed result.
+  </TabItem>
+</Tabs>
+
+When PiD mode is not *Off*, pick your **PiD Decoder** and **Gemma-2 Encoder** below the mode selector. The **PiD Steps** control (default 4) sets how many decode steps run — the released checkpoints are trained for 4.
+
+PiD is available in both the **Generate** tab (text-to-image) and on the **Canvas** (image-to-image), in both Fit and Native modes.
+
+## Tips & limitations
+
+- **Turn off "Scale Before Processing"** on the Canvas when using PiD — PiD already decodes at 4×, so pre-scaling would inflate the work and is blocked.
+- **Inpaint / Outpaint** are not supported with PiD yet; use text-to-image or image-to-image.
+- **SDXL Refiner** cannot be combined with PiD — disable one of them.
+- PiD's memory use scales with the *output* resolution. A 2048px output needs only a little more headroom than a normal decode, but Native mode at large target sizes (e.g. a 4096px result) is significantly heavier.
+- Turbo variants (e.g. Z-Image Turbo) work as usual — the low step count / no-CFG only affects generation; PiD's own step count is separate.
+
+
diff --git a/invokeai/app/invocations/flux2_pid_decode.py b/invokeai/app/invocations/flux2_pid_decode.py
new file mode 100644
index 00000000000..3cc325abdf4
--- /dev/null
+++ b/invokeai/app/invocations/flux2_pid_decode.py
@@ -0,0 +1,223 @@
+"""FLUX.2 Klein PiD decode invocation.
+
+Replaces the regular FLUX.2 VAE decode with the PiD pixel-diffusion super-res
+decoder (``PiD_res2k[to4k]_sr4x_official_flux2_distill_4step``). Produces a 4x
+super-resolved image from a FLUX.2 latent in a single 4-step distill pass. The
+4B and 9B FLUX.2 Klein variants share the same 32-channel VAE, so this one node
+covers both.
+
+Latent layout (the important difference from the FLUX.1 node):
+
+* ``flux2_denoise`` stores an *unpacked* ``(B, 32, H/8, W/8)`` latent that is
+ already **BN-denormalized** (``x * bn_std + bn_mean`` is applied before the
+ unpack, see ``flux2_denoise.py``). That is exactly the raw latent the FLUX.2
+ VAE's conv decoder consumes.
+* PiD's FLUX.2 backbone expects the **packed** ``(B, 128, H/16, W/16)``
+ representation (``lq_latent_channels=128``, ``latent_spatial_down_factor=16``
+ in ``backend/pid/decode.py``). We therefore patchify the stored latent
+ (2x2 spatial patches folded into channels: 32*4 = 128) *before* handing it to
+ PiD - mirroring ``pack_flux2`` but keeping a spatial ``(B, C, h, w)`` layout
+ instead of the transformer's ``(B, seq, C)`` sequence layout.
+
+Denormalization: unlike FLUX.1 (single ``scale``/``shift``) and Z-Image
+(checkpoint-specific ``scaling_factor``/``shift_factor``), the FLUX.2 VAE
+(``AutoencoderKLFlux2``) exposes **no** scalar ``scaling_factor``/``shift_factor``
+at all - its only normalization is the per-channel BatchNorm applied/inverted
+*outside* the VAE in ``flux2_denoise``. So the packed latent is already in PiD's
+expected raw space and no further scaling is needed (identity fallbacks below).
+We still accept an optional ``vae`` input and read the constants at runtime (like
+the Z-Image node) so any future FLUX.2 VAE variant that does expose scalar
+constants is honored automatically.
+"""
+
+from contextlib import ExitStack
+
+import torch
+from einops import rearrange
+from PIL import Image
+from transformers import PreTrainedModel, PreTrainedTokenizerBase
+
+from invokeai.app.invocations.baseinvocation import BaseInvocation, Classification, invocation
+from invokeai.app.invocations.fields import (
+ FieldDescriptions,
+ Input,
+ InputField,
+ LatentsField,
+ UIComponent,
+ WithBoard,
+ WithMetadata,
+)
+from invokeai.app.invocations.model import Gemma2EncoderField, PiDDecoderField, VAEField
+from invokeai.app.invocations.primitives import ImageOutput
+from invokeai.app.services.shared.invocation_context import InvocationContext
+from invokeai.backend.model_manager.taxonomy import BaseModelType
+from invokeai.backend.pid._src.networks.pid_net import PidNet
+from invokeai.backend.pid.decode import (
+ PiDDecodeConfig,
+ PiDDecoder,
+ encode_caption_for_pid,
+ estimate_pid_decode_working_memory,
+)
+from invokeai.backend.util.devices import TorchDevice
+
+# FLUX.2 uses per-channel BatchNorm (affine=False) for latent normalization, and
+# that BN is already inverted in flux2_denoise before the latent is stored. The
+# FLUX.2 VAE (AutoencoderKLFlux2) has no scalar scaling_factor/shift_factor, so
+# the identity transform below is the correct default: the stored (packed) latent
+# is already the raw representation PiD was trained on.
+_FLUX2_VAE_SCALING_FACTOR_FALLBACK: float = 1.0
+_FLUX2_VAE_SHIFT_FACTOR_FALLBACK: float = 0.0
+
+
+@invocation(
+ "flux2_pid_decode",
+ title="Latents to Image - FLUX.2 + PiD (4x SR)",
+ tags=["latents", "image", "pid", "flux2", "klein", "upscale"],
+ category="latents",
+ version="1.0.0",
+ classification=Classification.Prototype,
+)
+class Flux2PiDDecodeInvocation(BaseInvocation, WithMetadata, WithBoard):
+ """Decode a FLUX.2 Klein latent with the PiD pixel-diffusion decoder.
+
+ Produces a 4x super-resolved image in a single pass. The stored FLUX.2 latent
+ is patchified from ``(B, 32, H/8, W/8)`` to the ``(B, 128, H/16, W/16)`` layout
+ PiD's FLUX.2 backbone expects, then decoded directly (it is already in raw,
+ BN-denormalized space; see the module docstring).
+ """
+
+ latents: LatentsField = InputField(description=FieldDescriptions.latents, input=Input.Connection)
+ prompt: str = InputField(
+ description="Text prompt the latent was generated from. PiD conditions on it.",
+ ui_component=UIComponent.Textarea,
+ )
+ gemma2_encoder: Gemma2EncoderField = InputField(
+ title="Gemma-2 Encoder",
+ description="Gemma-2 caption encoder. Required by PiD.",
+ input=Input.Connection,
+ )
+ pid_decoder: PiDDecoderField = InputField(
+ title="PiD Decoder",
+ description="PiD FLUX.2 decoder checkpoint.",
+ input=Input.Connection,
+ )
+ vae: VAEField | None = InputField(
+ default=None,
+ title="VAE",
+ description="FLUX.2 VAE, used only to read a scalar scaling_factor / shift_factor if one exists. "
+ "FLUX.2 normalises latents with BatchNorm (already inverted in flux2_denoise), so this is "
+ "normally an identity transform and the input can be left unconnected.",
+ input=Input.Connection,
+ )
+ num_inference_steps: int = InputField(
+ default=4,
+ ge=1,
+ le=8,
+ description="Number of PiD distill steps. The released checkpoints are trained for 4.",
+ )
+ seed: int = InputField(default=0, description="Seed for the PiD decoder's noise.")
+
+ @torch.no_grad()
+ def invoke(self, context: InvocationContext) -> ImageOutput:
+ latents = context.tensors.load(self.latents.latents_name)
+
+ # 1) Patchify the stored FLUX.2 latent into PiD's expected layout.
+ # flux2_denoise stores an unpacked (B, 32, H/8, W/8) latent; PiD's
+ # FLUX.2 backbone wants the packed (B, 128, H/16, W/16) form (32*4=128
+ # channels, spatial halved). This mirrors pack_flux2's 2x2 patchify but
+ # keeps a spatial (B, C, h, w) layout rather than a (B, seq, C) sequence.
+ if latents.shape[-3] != 32:
+ raise ValueError(
+ f"FLUX.2 PiD decode expected a 32-channel latent from flux2_denoise, got shape "
+ f"{tuple(latents.shape)}. The upstream node must output the unpacked FLUX.2 latent."
+ )
+ packed = rearrange(latents, "b c (h ph) (w pw) -> b (c ph pw) h w", ph=2, pw=2)
+ context.logger.info(
+ f"FLUX.2 PiD decode: stored latent shape={tuple(latents.shape)} -> packed for PiD "
+ f"shape={tuple(packed.shape)} (expect [B, 128, H/16, W/16]) dtype={packed.dtype}"
+ )
+
+ # 2) Resolve the scalar scaling/shift (identity for current FLUX.2 VAEs).
+ scaling_factor = _FLUX2_VAE_SCALING_FACTOR_FALLBACK
+ shift_factor = _FLUX2_VAE_SHIFT_FACTOR_FALLBACK
+ if self.vae is not None:
+ vae_info = context.models.load(self.vae.vae)
+ with vae_info.model_on_device() as (_, vae):
+ config = getattr(vae, "config", None)
+ if config is not None and getattr(config, "scaling_factor", None) is not None:  # None counts as absent
+ scaling_factor = float(config.scaling_factor)
+ shift_factor = float(getattr(config, "shift_factor", None) or 0.0)
+ else:
+ scaling_factor = float(getattr(vae, "scale_factor", scaling_factor))
+ shift_factor = float(getattr(vae, "shift_factor", shift_factor))
+ del vae_info
+ TorchDevice.empty_cache()
+
+ # 3) Encode caption with Gemma-2.
+ gemma_text_encoder_info = context.models.load(self.gemma2_encoder.text_encoder)
+ gemma_tokenizer_info = context.models.load(self.gemma2_encoder.tokenizer)
+ with ExitStack() as stack:
+ (_, gemma_encoder) = stack.enter_context(gemma_text_encoder_info.model_on_device())
+ (_, gemma_tokenizer) = stack.enter_context(gemma_tokenizer_info.model_on_device())
+ if not isinstance(gemma_encoder, PreTrainedModel):
+ raise TypeError(f"Expected PreTrainedModel for Gemma encoder, got {type(gemma_encoder).__name__}.")
+ if not isinstance(gemma_tokenizer, PreTrainedTokenizerBase):
+ raise TypeError(
+ f"Expected PreTrainedTokenizerBase for Gemma tokenizer, got {type(gemma_tokenizer).__name__}."
+ )
+
+ device = TorchDevice.choose_torch_device()
+ encode_dtype = TorchDevice.choose_bfloat16_safe_dtype(device)
+ context.util.signal_progress("Encoding caption with Gemma-2")
+ caption_embs, caption_mask = encode_caption_for_pid(
+ [self.prompt],
+ tokenizer=gemma_tokenizer,
+ encoder=gemma_encoder,
+ device=device,
+ dtype=encode_dtype,
+ )
+ caption_embs = caption_embs.detach().to("cpu")
+ caption_mask = caption_mask.detach().to("cpu")
+ del gemma_encoder, gemma_tokenizer
+ # Gemma is only needed for the one-shot caption encode above. Offload it from VRAM (keeping it in the RAM
+ # cache) so its ~5GB is freed before the PiD decoder loads. The cache offloads anything else it needs to
+ # fit the decode on its own, so we deliberately do NOT evict every other model here.
+ context.models.offload_from_vram(self.gemma2_encoder.text_encoder)
+ TorchDevice.empty_cache()
+
+ # 4) Run PiD decode (the loader already returns a live PidNet).
+ pid_info = context.models.load(self.pid_decoder.decoder)
+ # The working-memory estimate scales with the OUTPUT pixel count, so it must see the PACKED latent
+ # (spatial H/16), not the unpacked one - otherwise it over-reserves by 4x.
+ estimated_working_memory = estimate_pid_decode_working_memory(packed, BaseModelType.Flux2)
+ with pid_info.model_on_device(working_mem_bytes=estimated_working_memory) as (_, pid_net):
+ if not isinstance(pid_net, PidNet):
+ raise TypeError(f"Expected PidNet for PiD decoder, got {type(pid_net).__name__}.")
+ device = TorchDevice.choose_torch_device()
+ dtype = next(iter(pid_net.parameters())).dtype
+
+ # The packed latent is already BN-denormalized (raw VAE-input space); the scalar transform below is
+ # identity for current FLUX.2 VAEs and only bites if a VAE ever exposes real scalar constants.
+ denorm_latent = packed.to(device=device, dtype=dtype) / scaling_factor + shift_factor
+ context.logger.info(
+ f"FLUX.2 PiD denorm_latent stats[min={denorm_latent.min().item():.3f} "
+ f"max={denorm_latent.max().item():.3f} mean={denorm_latent.mean().item():.3f}] "
+ f"using scale={scaling_factor:.4f} shift={shift_factor:.4f}"
+ )
+ caption_embs = caption_embs.to(device=device, dtype=dtype)
+
+ context.util.signal_progress("Running PiD decoder")
+ decoder = PiDDecoder(pid_net, backbone=BaseModelType.Flux2)
+ x0 = decoder.decode(
+ latent=denorm_latent,
+ caption_embs=caption_embs,
+ caption_mask=caption_mask,
+ config=PiDDecodeConfig(num_inference_steps=self.num_inference_steps, seed=self.seed),
+ )
+
+ TorchDevice.empty_cache()
+
+ img = rearrange(x0[0].float().clamp(-1, 1), "c h w -> h w c")  # float32 before 8-bit quantisation
+ img_pil = Image.fromarray((127.5 * (img + 1.0)).byte().cpu().numpy())
+ image_dto = context.images.save(image=img_pil)
+ return ImageOutput.build(image_dto)
diff --git a/invokeai/app/invocations/flux_pid_decode.py b/invokeai/app/invocations/flux_pid_decode.py
new file mode 100644
index 00000000000..73d7c286a1d
--- /dev/null
+++ b/invokeai/app/invocations/flux_pid_decode.py
@@ -0,0 +1,146 @@
+"""FLUX PiD decode invocation.
+
+Replaces the regular FLUX VAE decode with the PiD pixel-diffusion super-res
+decoder (``PiD_res2k_sr4x_official_flux_distill_4step``). Produces a 4x
+super-resolved image from a FLUX latent in a single 4-step distill pass.
+"""
+
+from contextlib import ExitStack
+
+import torch
+from einops import rearrange
+from PIL import Image
+from transformers import PreTrainedModel, PreTrainedTokenizerBase
+
+from invokeai.app.invocations.baseinvocation import BaseInvocation, Classification, invocation
+from invokeai.app.invocations.fields import (
+ FieldDescriptions,
+ Input,
+ InputField,
+ LatentsField,
+ UIComponent,
+ WithBoard,
+ WithMetadata,
+)
+from invokeai.app.invocations.model import Gemma2EncoderField, PiDDecoderField
+from invokeai.app.invocations.primitives import ImageOutput
+from invokeai.app.services.shared.invocation_context import InvocationContext
+from invokeai.backend.flux.util import get_flux_ae_params
+from invokeai.backend.model_manager.taxonomy import BaseModelType
+from invokeai.backend.pid._src.networks.pid_net import PidNet
+from invokeai.backend.pid.decode import (
+ PiDDecodeConfig,
+ PiDDecoder,
+ encode_caption_for_pid,
+ estimate_pid_decode_working_memory,
+)
+from invokeai.backend.util.devices import TorchDevice
+
+
+@invocation(
+ "flux_pid_decode",
+ title="Latents to Image - FLUX + PiD (4x SR)",
+ tags=["latents", "image", "pid", "flux", "upscale"],
+ category="latents",
+ version="1.0.0",
+ classification=Classification.Prototype,
+)
+class FluxPiDDecodeInvocation(BaseInvocation, WithMetadata, WithBoard):
+ """Decode a FLUX latent with the PiD pixel-diffusion decoder.
+
+ The FLUX AutoEncoder usually denormalises the stored latent internally
+ before its conv decoder runs (`z / scale + shift`); we apply the same
+ transform manually here so PiD sees the raw latent it was trained on.
+ """
+
+ latents: LatentsField = InputField(description=FieldDescriptions.latents, input=Input.Connection)
+ prompt: str = InputField(
+ description="Text prompt the latent was generated from. PiD conditions on it.",
+ ui_component=UIComponent.Textarea,
+ )
+ gemma2_encoder: Gemma2EncoderField = InputField(
+ title="Gemma-2 Encoder",
+ description="Gemma-2 caption encoder. Required by PiD.",
+ input=Input.Connection,
+ )
+ pid_decoder: PiDDecoderField = InputField(
+ title="PiD Decoder",
+ description="PiD FLUX decoder checkpoint.",
+ input=Input.Connection,
+ )
+ num_inference_steps: int = InputField(
+ default=4,
+ ge=1,
+ le=8,
+ description="Number of PiD distill steps. The released checkpoints are trained for 4.",
+ )
+ seed: int = InputField(default=0, description="Seed for the PiD decoder's noise.")
+
+ @torch.no_grad()
+ def invoke(self, context: InvocationContext) -> ImageOutput:
+ latents = context.tensors.load(self.latents.latents_name)
+
+ # 1) Encode caption with Gemma-2.
+ gemma_text_encoder_info = context.models.load(self.gemma2_encoder.text_encoder)
+ gemma_tokenizer_info = context.models.load(self.gemma2_encoder.tokenizer)
+ with ExitStack() as stack:
+ (_, gemma_encoder) = stack.enter_context(gemma_text_encoder_info.model_on_device())
+ (_, gemma_tokenizer) = stack.enter_context(gemma_tokenizer_info.model_on_device())
+ if not isinstance(gemma_encoder, PreTrainedModel):
+ raise TypeError(f"Expected PreTrainedModel for Gemma encoder, got {type(gemma_encoder).__name__}.")
+ if not isinstance(gemma_tokenizer, PreTrainedTokenizerBase):
+ raise TypeError(
+ f"Expected PreTrainedTokenizerBase for Gemma tokenizer, got {type(gemma_tokenizer).__name__}."
+ )
+
+ device = TorchDevice.choose_torch_device()
+ encode_dtype = TorchDevice.choose_bfloat16_safe_dtype(device)
+ context.util.signal_progress("Encoding caption with Gemma-2")
+ caption_embs, caption_mask = encode_caption_for_pid(
+ [self.prompt],
+ tokenizer=gemma_tokenizer,
+ encoder=gemma_encoder,
+ device=device,
+ dtype=encode_dtype,
+ )
+ caption_embs = caption_embs.detach().to("cpu")
+
+ caption_mask = caption_mask.detach().to("cpu")
+ del gemma_encoder, gemma_tokenizer
+ # Gemma is only needed for the one-shot caption encode above. Offload it from VRAM (keeping it in the RAM
+ # cache) so its ~5GB is freed before the PiD decoder loads. The cache offloads anything else it needs to
+ # fit the decode on its own, so we deliberately do NOT evict every other model here.
+ context.models.offload_from_vram(self.gemma2_encoder.text_encoder)
+ TorchDevice.empty_cache()
+
+ # 2) Run PiD decode (the loader already returns a live PidNet).
+ pid_info = context.models.load(self.pid_decoder.decoder)
+ estimated_working_memory = estimate_pid_decode_working_memory(latents, BaseModelType.Flux)
+ with pid_info.model_on_device(working_mem_bytes=estimated_working_memory) as (_, pid_net):
+ if not isinstance(pid_net, PidNet):
+ raise TypeError(f"Expected PidNet for PiD decoder, got {type(pid_net).__name__}.")
+ device = TorchDevice.choose_torch_device()
+ dtype = next(iter(pid_net.parameters())).dtype
+
+ # FLUX latent is stored in normalised form (matching FluxAutoEncoder
+ # state); denormalise so PiD sees the same representation it
+ # consumed during training.
+ ae = get_flux_ae_params()
+ denorm_latent = latents.to(device=device, dtype=dtype) / ae.scale_factor + ae.shift_factor
+ caption_embs = caption_embs.to(device=device, dtype=dtype)
+
+ context.util.signal_progress("Running PiD decoder")
+ decoder = PiDDecoder(pid_net, backbone=BaseModelType.Flux)
+ x0 = decoder.decode(
+ latent=denorm_latent,
+ caption_embs=caption_embs,
+ caption_mask=caption_mask,
+ config=PiDDecodeConfig(num_inference_steps=self.num_inference_steps, seed=self.seed),
+ )
+
+ TorchDevice.empty_cache()
+
+ img = rearrange(x0[0].float().clamp(-1, 1), "c h w -> h w c")  # float32 before 8-bit quantisation
+ img_pil = Image.fromarray((127.5 * (img + 1.0)).byte().cpu().numpy())
+ image_dto = context.images.save(image=img_pil)
+ return ImageOutput.build(image_dto)
diff --git a/invokeai/app/invocations/gemma2_encoder_loader.py b/invokeai/app/invocations/gemma2_encoder_loader.py
new file mode 100644
index 00000000000..7273fd5619e
--- /dev/null
+++ b/invokeai/app/invocations/gemma2_encoder_loader.py
@@ -0,0 +1,49 @@
+from invokeai.app.invocations.baseinvocation import (
+ BaseInvocation,
+ BaseInvocationOutput,
+ Classification,
+ invocation,
+ invocation_output,
+)
+from invokeai.app.invocations.fields import InputField, OutputField
+from invokeai.app.invocations.model import Gemma2EncoderField, ModelIdentifierField
+from invokeai.app.services.shared.invocation_context import InvocationContext
+from invokeai.backend.model_manager.taxonomy import ModelType, SubModelType
+
+
+@invocation_output("gemma2_encoder_output")
+class Gemma2EncoderOutput(BaseInvocationOutput):  # returned by the "gemma2_encoder_loader" node
+ gemma2_encoder: Gemma2EncoderField = OutputField(
+ description="Gemma-2 text encoder used by PiD decoders",
+ title="Gemma-2 Encoder",
+ )
+
+
+@invocation(
+ "gemma2_encoder_loader",
+ title="Gemma-2 Encoder - PiD",
+ tags=["model", "gemma2", "pid"],
+ category="model",
+ version="1.0.0",
+ classification=Classification.Prototype,
+)
+class Gemma2EncoderLoaderInvocation(BaseInvocation):
+ """Loads a Gemma-2 causal LM directory and exposes its tokenizer + text encoder
+ submodels for use by a PiD decode node."""
+
+ gemma2_model: ModelIdentifierField = InputField(
+ description="Gemma-2 model used to encode captions for PiD decoders.",
+ title="Gemma-2",
+ ui_model_type=ModelType.Gemma2Encoder,
+ )
+
+ def invoke(self, context: InvocationContext) -> Gemma2EncoderOutput:
+ key = self.gemma2_model.key
+ if not context.models.exists(key):  # fail early on a key the model manager does not know
+ raise Exception(f"Unknown Gemma2 model: {key}")
+ # Both identifiers are copies of the selected model that differ only in submodel_type.
+ tokenizer = self.gemma2_model.model_copy(update={"submodel_type": SubModelType.Tokenizer})
+ text_encoder = self.gemma2_model.model_copy(update={"submodel_type": SubModelType.TextEncoder})
+ return Gemma2EncoderOutput(
+ gemma2_encoder=Gemma2EncoderField(tokenizer=tokenizer, text_encoder=text_encoder),
+ )
diff --git a/invokeai/app/invocations/model.py b/invokeai/app/invocations/model.py
index 0c96cdb1d9d..a24e95984b9 100644
--- a/invokeai/app/invocations/model.py
+++ b/invokeai/app/invocations/model.py
@@ -92,6 +92,19 @@ class VAEField(BaseModel):
seamless_axes: List[str] = Field(default_factory=list, description='Axes("x" and "y") to which apply seamless')
+class Gemma2EncoderField(BaseModel):
+ """Field for the Gemma-2 text encoder used by PiD decoders (tokenizer + text encoder submodel identifiers)."""
+
+ tokenizer: ModelIdentifierField = Field(description="Info to load tokenizer submodel")
+ text_encoder: ModelIdentifierField = Field(description="Info to load text_encoder submodel")
+
+
+class PiDDecoderField(BaseModel):
+ """Field for a PiD (Pixel Diffusion Decoder) checkpoint, passed from the PiD decoder loader to a PiD decode node."""
+
+ decoder: ModelIdentifierField = Field(description="Info to load PiD decoder checkpoint")
+
+
class ControlLoRAField(LoRAField):
img: ImageField = Field(description="Image to use in structural conditioning")
diff --git a/invokeai/app/invocations/pid_decoder_loader.py b/invokeai/app/invocations/pid_decoder_loader.py
new file mode 100644
index 00000000000..ff22702d3e2
--- /dev/null
+++ b/invokeai/app/invocations/pid_decoder_loader.py
@@ -0,0 +1,44 @@
+from invokeai.app.invocations.baseinvocation import (
+ BaseInvocation,
+ BaseInvocationOutput,
+ Classification,
+ invocation,
+ invocation_output,
+)
+from invokeai.app.invocations.fields import InputField, OutputField
+from invokeai.app.invocations.model import ModelIdentifierField, PiDDecoderField
+from invokeai.app.services.shared.invocation_context import InvocationContext
+from invokeai.backend.model_manager.taxonomy import ModelType
+
+
+@invocation_output("pid_decoder_output")
+class PiDDecoderOutput(BaseInvocationOutput):  # returned by the "pid_decoder_loader" node
+ pid_decoder: PiDDecoderField = OutputField(
+ description="PiD (Pixel Diffusion Decoder) checkpoint",
+ title="PiD Decoder",
+ )
+
+
+@invocation(
+ "pid_decoder_loader",
+ title="PiD Decoder - FLUX / FLUX.2 / SD3 / SDXL / Qwen-Image",
+ tags=["model", "pid", "decoder"],
+ category="model",
+ version="1.0.0",
+ classification=Classification.Prototype,
+)
+class PiDDecoderLoaderInvocation(BaseInvocation):
+ """Loads a PiD decoder checkpoint, outputting a PiDDecoderField for use
+ by the per-backbone PiD decode nodes."""
+
+ pid_decoder_model: ModelIdentifierField = InputField(
+ description="PiD decoder checkpoint matching the upstream backbone.",
+ title="PiD Decoder",
+ ui_model_type=ModelType.PiDDecoder,
+ )
+
+ def invoke(self, context: InvocationContext) -> PiDDecoderOutput:
+ key = self.pid_decoder_model.key
+ if not context.models.exists(key):  # fail early on a key the model manager does not know
+ raise Exception(f"Unknown PiD decoder: {key}")
+ return PiDDecoderOutput(pid_decoder=PiDDecoderField(decoder=self.pid_decoder_model))
diff --git a/invokeai/app/invocations/pid_upscale.py b/invokeai/app/invocations/pid_upscale.py
new file mode 100644
index 00000000000..1cca5943241
--- /dev/null
+++ b/invokeai/app/invocations/pid_upscale.py
@@ -0,0 +1,176 @@
+"""PiD super-resolution upscale invocation.
+
+Stand-alone 4x super-resolution path that does **not** require a Generator
+latent. Pipeline::
+
+ image
+ -> FLUX VAE encode (denormalised back to raw)
+ -> Gemma-2 caption encode
+ -> PiD decoder (4x SR)
+ -> image (4x linear)
+
+This is the PiD analogue of ESRGAN / SUPIR: a one-shot, end-to-end pixel
+upscaler. The FLUX VAE is also valid for Z-Image inputs (they share the
+same 16-channel encoder). SD3 / FLUX.2 upscale paths would each need their
+own invocation with the matching VAE encode and latent denormalisation;
+they are deferred until we have the matching PiD checkpoints to validate
+against.
+"""
+
+from contextlib import ExitStack
+
+import einops
+import torch
+from PIL import Image
+from transformers import PreTrainedModel, PreTrainedTokenizerBase
+
+from invokeai.app.invocations.baseinvocation import BaseInvocation, Classification, invocation
+from invokeai.app.invocations.fields import (
+ ImageField,
+ Input,
+ InputField,
+ UIComponent,
+ WithBoard,
+ WithMetadata,
+)
+from invokeai.app.invocations.flux_vae_encode import FluxVaeEncodeInvocation
+from invokeai.app.invocations.model import Gemma2EncoderField, PiDDecoderField, VAEField
+from invokeai.app.invocations.primitives import ImageOutput
+from invokeai.app.services.shared.invocation_context import InvocationContext
+from invokeai.backend.flux.util import get_flux_ae_params
+from invokeai.backend.model_manager.taxonomy import BaseModelType
+from invokeai.backend.pid._src.networks.pid_net import PidNet
+from invokeai.backend.pid.decode import (
+ PiDDecodeConfig,
+ PiDDecoder,
+ encode_caption_for_pid,
+ estimate_pid_decode_working_memory,
+)
+from invokeai.backend.stable_diffusion.diffusers_pipeline import image_resized_to_grid_as_tensor
+from invokeai.backend.util.devices import TorchDevice
+
+
+@invocation(
+    "pid_upscale",
+    title="PiD Upscale (4x) - FLUX VAE",
+    tags=["upscale", "image", "pid", "super-resolution", "flux"],
+    category="image",
+    version="1.0.0",
+    classification=Classification.Prototype,
+)
+class PiDUpscaleInvocation(BaseInvocation, WithMetadata, WithBoard):
+    """Upscale any image 4x via FLUX VAE encode + PiD pixel-diffusion decode.
+
+    Works for source images that the FLUX VAE can encode (i.e. natural
+    photos / generated images at any size that lands on the VAE's 8-pixel
+    grid). The caption is used to condition the PiD decoder; leaving it
+    empty produces an unconditional decode and is the cheapest option, but
+    the model was distilled with rich captions and benefits from one.
+    """
+
+    image: ImageField = InputField(description="Image to upscale.")
+    vae: VAEField = InputField(
+        description="FLUX-compatible VAE (FLUX.1, Z-Image, anything sharing the 16-channel encoder).",
+        input=Input.Connection,
+    )
+    gemma2_encoder: Gemma2EncoderField = InputField(
+        title="Gemma-2 Encoder",
+        description="Gemma-2 caption encoder. Required by PiD.",
+        input=Input.Connection,
+    )
+    pid_decoder: PiDDecoderField = InputField(
+        title="PiD Decoder",
+        description="PiD FLUX decoder checkpoint.",
+        input=Input.Connection,
+    )
+    prompt: str = InputField(
+        default="",
+        description="Optional caption describing the image. Empty -> empty-caption decode.",
+        ui_component=UIComponent.Textarea,
+    )
+    num_inference_steps: int = InputField(
+        default=4,
+        ge=1,
+        le=8,
+        description="Number of PiD distill steps. The released checkpoints are trained for 4.",
+    )
+    seed: int = InputField(default=0, description="Seed for the PiD decoder's noise.")
+
+    @torch.no_grad()
+    def invoke(self, context: InvocationContext) -> ImageOutput:
+        # 1) Encode the source image into a FLUX raw latent.
+        pil_image = context.images.get_pil(self.image.image_name).convert("RGB")
+        image_tensor = image_resized_to_grid_as_tensor(pil_image)
+        if image_tensor.dim() == 3:  # no batch axis yet: add one
+            image_tensor = einops.rearrange(image_tensor, "c h w -> 1 c h w")
+
+        vae_info = context.models.load(self.vae.vae)
+        context.util.signal_progress("Running VAE encode")
+        normalised_latent = FluxVaeEncodeInvocation.vae_encode(vae_info=vae_info, image_tensor=image_tensor)
+        # FluxAutoEncoder.encode emits `scale * (raw - shift)`. PiD expects raw,
+        # so undo it. Holds for the Z-Image case as well (same VAE constants).
+        ae = get_flux_ae_params()
+        raw_latent = normalised_latent / ae.scale_factor + ae.shift_factor
+        raw_latent = raw_latent.to("cpu")  # park while we swap to Gemma
+        del normalised_latent
+        TorchDevice.empty_cache()
+
+        # 2) Encode the caption with Gemma-2.
+        gemma_text_encoder_info = context.models.load(self.gemma2_encoder.text_encoder)
+        gemma_tokenizer_info = context.models.load(self.gemma2_encoder.tokenizer)
+        with ExitStack() as stack:
+            (_, gemma_encoder) = stack.enter_context(gemma_text_encoder_info.model_on_device())
+            (_, gemma_tokenizer) = stack.enter_context(gemma_tokenizer_info.model_on_device())
+            if not isinstance(gemma_encoder, PreTrainedModel):
+                raise TypeError(f"Expected PreTrainedModel for Gemma encoder, got {type(gemma_encoder).__name__}.")
+            if not isinstance(gemma_tokenizer, PreTrainedTokenizerBase):
+                raise TypeError(
+                    f"Expected PreTrainedTokenizerBase for Gemma tokenizer, got {type(gemma_tokenizer).__name__}."
+                )
+            device = TorchDevice.choose_torch_device()
+            encode_dtype = TorchDevice.choose_bfloat16_safe_dtype(device)
+            context.util.signal_progress("Encoding caption with Gemma-2")
+            caption_embs, caption_mask = encode_caption_for_pid(
+                [self.prompt],
+                tokenizer=gemma_tokenizer,
+                encoder=gemma_encoder,
+                device=device,
+                dtype=encode_dtype,
+            )
+            caption_embs = caption_embs.detach().to("cpu")
+            # The mask is parked on the CPU too; step 3 moves only caption_embs back and passes the mask as-is.
+            caption_mask = caption_mask.detach().to("cpu")
+        del gemma_encoder, gemma_tokenizer
+        # Gemma is only needed for the one-shot caption encode above. Offload it from VRAM (keeping it in the RAM
+        # cache) so its ~5GB is freed before the PiD decoder loads. The cache offloads anything else it needs to
+        # fit the decode on its own, so we deliberately do NOT evict every other model here.
+        context.models.offload_from_vram(self.gemma2_encoder.text_encoder)
+        TorchDevice.empty_cache()
+
+        # 3) Run PiD decode (the loader already returns a live PidNet).
+        pid_info = context.models.load(self.pid_decoder.decoder)
+        estimated_working_memory = estimate_pid_decode_working_memory(raw_latent, BaseModelType.Flux)
+        with pid_info.model_on_device(working_mem_bytes=estimated_working_memory) as (_, pid_net):
+            if not isinstance(pid_net, PidNet):
+                raise TypeError(f"Expected PidNet for PiD decoder, got {type(pid_net).__name__}.")
+            device = TorchDevice.choose_torch_device()
+            dtype = next(iter(pid_net.parameters())).dtype  # follow the dtype of the decoder's first parameter
+
+            latent_on_device = raw_latent.to(device=device, dtype=dtype)
+            caption_embs = caption_embs.to(device=device, dtype=dtype)
+
+            context.util.signal_progress("Running PiD decoder")
+            decoder = PiDDecoder(pid_net, backbone=BaseModelType.Flux)
+            x0 = decoder.decode(
+                latent=latent_on_device,
+                caption_embs=caption_embs,
+                caption_mask=caption_mask,
+                config=PiDDecodeConfig(num_inference_steps=self.num_inference_steps, seed=self.seed),
+            )
+
+        TorchDevice.empty_cache()
+
+        img = einops.rearrange(x0[0].clamp(-1, 1), "c h w -> h w c")  # first batch item only, CHW -> HWC
+        img_pil = Image.fromarray((127.5 * (img + 1.0)).byte().cpu().numpy())  # [-1, 1] -> [0, 255] bytes
+        image_dto = context.images.save(image=img_pil)
+        return ImageOutput.build(image_dto)
diff --git a/invokeai/app/invocations/qwen_image_pid_decode.py b/invokeai/app/invocations/qwen_image_pid_decode.py
new file mode 100644
index 00000000000..ddf410b5c3c
--- /dev/null
+++ b/invokeai/app/invocations/qwen_image_pid_decode.py
@@ -0,0 +1,212 @@
+"""Qwen-Image PiD decode invocation.
+
+Replaces Qwen-Image's AutoencoderKLQwenImage decode with the PiD pixel-diffusion
+super-res decoder (``PiD_res2kto4k_sr4x_official_qwenimage_distill_4step``).
+Produces a 4x super-resolved image from a Qwen-Image latent in a single 4-step
+distill pass.
+
+Qwen-Image is 16-channel at an 8x spatial down-factor (``_PER_BACKBONE[QwenImage]``
+in ``backend/pid/decode.py``: ``lq_latent_channels=16``, ``latent_spatial_down_factor=8``),
+so no packing is needed. Two Qwen-specific wrinkles, both handled below and both
+verified against the existing ``qwen_image_l2i`` node:
+
+1. **5D latent.** The denoiser stores a 5D ``(B, 16, num_frames, H, W)`` latent
+ (Qwen's VAE is a video-style autoencoder). PiD is a 2D image decoder, so we
+ drop the singleton temporal dim before decoding.
+2. **Per-channel normalization.** Unlike FLUX / Z-Image / SDXL (a scalar
+ ``scaling_factor`` / ``shift``), the Qwen VAE normalizes each of the 16 latent
+ channels by its own ``latents_mean`` / ``latents_std`` vector. Denormalization
+ is therefore ``z_raw = z_norm * latents_std + latents_mean`` per channel -
+ exactly the transform ``qwen_image_l2i`` applies before ``vae.decode``, so PiD
+ (which replaces that decode) sees the same raw latent. We read the vectors from
+ the VAE config when a ``vae`` is wired, with the diffusers defaults as fallback.
+"""
+
+from contextlib import ExitStack
+
+import torch
+from einops import rearrange
+from PIL import Image
+from transformers import PreTrainedModel, PreTrainedTokenizerBase
+
+from invokeai.app.invocations.baseinvocation import BaseInvocation, Classification, invocation
+from invokeai.app.invocations.fields import (
+ FieldDescriptions,
+ Input,
+ InputField,
+ LatentsField,
+ UIComponent,
+ WithBoard,
+ WithMetadata,
+)
+from invokeai.app.invocations.model import Gemma2EncoderField, PiDDecoderField, VAEField
+from invokeai.app.invocations.primitives import ImageOutput
+from invokeai.app.services.shared.invocation_context import InvocationContext
+from invokeai.backend.model_manager.taxonomy import BaseModelType
+from invokeai.backend.pid._src.networks.pid_net import PidNet
+from invokeai.backend.pid.decode import (
+ PiDDecodeConfig,
+ PiDDecoder,
+ encode_caption_for_pid,
+ estimate_pid_decode_working_memory,
+)
+from invokeai.backend.util.devices import TorchDevice
+
+# Per-channel Qwen-Image VAE normalization constants (diffusers AutoencoderKLQwenImage defaults, z_dim=16). Fallback
+# for when no `vae` is wired or its config lacks latents_mean / latents_std; entry i belongs to latent channel i.
+_QWEN_VAE_LATENTS_MEAN_FALLBACK: list[float] = [
+    -0.7571, -0.7089, -0.9113, 0.1075, -0.1745, 0.9653, -0.1517, 1.5508,
+    0.4134, -0.0715, 0.5517, -0.3632, -0.1922, -0.9497, 0.2503, -0.2921,
+]  # fmt: skip
+_QWEN_VAE_LATENTS_STD_FALLBACK: list[float] = [
+    2.8184, 1.4541, 2.3275, 2.6558, 1.2196, 1.7708, 2.6052, 2.0743,
+    3.2687, 2.1526, 2.8652, 1.5579, 1.6382, 1.1253, 2.8251, 1.9160,
+]  # fmt: skip
+
+
+@invocation(
+ "qwen_image_pid_decode",
+ title="Latents to Image - Qwen-Image + PiD (4x SR)",
+ tags=["latents", "image", "pid", "qwen-image", "upscale"],
+ category="latents",
+ version="1.0.0",
+ classification=Classification.Prototype,
+)
+class QwenImagePiDDecodeInvocation(BaseInvocation, WithMetadata, WithBoard):
+ """Decode a Qwen-Image latent with the PiD pixel-diffusion decoder.
+
+ Produces a 4x super-resolved image in a single pass. The 5D Qwen latent is
+ reduced to 2D and per-channel denormalized (``z * std + mean``) before PiD.
+ """
+
+ latents: LatentsField = InputField(description=FieldDescriptions.latents, input=Input.Connection)
+ prompt: str = InputField(
+ description="Text prompt the latent was generated from. PiD conditions on it.",
+ ui_component=UIComponent.Textarea,
+ )
+ gemma2_encoder: Gemma2EncoderField = InputField(
+ title="Gemma-2 Encoder",
+ description="Gemma-2 caption encoder. Required by PiD.",
+ input=Input.Connection,
+ )
+ pid_decoder: PiDDecoderField = InputField(
+ title="PiD Decoder",
+ description="PiD Qwen-Image decoder checkpoint.",
+ input=Input.Connection,
+ )
+ vae: VAEField | None = InputField(
+ default=None,
+ title="VAE",
+ description="Qwen-Image VAE, used to read the per-channel latents_mean / latents_std. "
+ "If omitted, the diffusers default Qwen-Image constants are used.",
+ input=Input.Connection,
+ )
+ num_inference_steps: int = InputField(
+ default=4,
+ ge=1,
+ le=8,
+ description="Number of PiD distill steps. The released checkpoints are trained for 4.",
+ )
+ seed: int = InputField(default=0, description="Seed for the PiD decoder's noise.")
+
+ @torch.no_grad()
+ def invoke(self, context: InvocationContext) -> ImageOutput:
+ latents = context.tensors.load(self.latents.latents_name)
+
+ # 1) Reduce the stored 5D (B, C, num_frames, H, W) latent to 2D (B, C, H, W). Qwen's VAE is a video-style
+ # autoencoder; for a single image num_frames == 1 (mirrors qwen_image_l2i's `img[:, :, 0]`).
+ if latents.ndim == 5:
+ if latents.shape[2] != 1:
+ raise ValueError(
+ f"Qwen-Image PiD decode expected a single temporal frame, got shape {tuple(latents.shape)}."
+ )
+ latents = latents[:, :, 0]
+ if latents.ndim != 4 or latents.shape[-3] != 16:
+ raise ValueError(f"Qwen-Image PiD decode expected a 16-channel latent, got shape {tuple(latents.shape)}.")
+
+ # 2) Resolve the per-channel latents_mean / latents_std used to denormalise the stored latent.
+ latents_mean = list(_QWEN_VAE_LATENTS_MEAN_FALLBACK)
+ latents_std = list(_QWEN_VAE_LATENTS_STD_FALLBACK)
+ if self.vae is not None:
+ vae_info = context.models.load(self.vae.vae)
+ with vae_info.model_on_device() as (_, vae):
+ config = getattr(vae, "config", None)
+ cfg_mean = getattr(config, "latents_mean", None) if config is not None else None
+ cfg_std = getattr(config, "latents_std", None) if config is not None else None
+ if cfg_mean is not None and cfg_std is not None:
+ latents_mean = [float(x) for x in cfg_mean]
+ latents_std = [float(x) for x in cfg_std]
+ del vae_info
+ TorchDevice.empty_cache()
+ if len(latents_mean) != 16 or len(latents_std) != 16:
+ raise ValueError(
+ f"Qwen-Image VAE latents_mean/latents_std must have 16 entries, got {len(latents_mean)}/{len(latents_std)}."
+ )
+ context.logger.info(
+ f"Qwen-Image PiD decode: latent shape={tuple(latents.shape)} (expect [B, 16, H/8, W/8]) "
+ f"dtype={latents.dtype} per-channel denorm (mean/std from {'VAE config' if self.vae else 'fallback'})"
+ )
+
+ # 3) Encode caption with Gemma-2.
+ gemma_text_encoder_info = context.models.load(self.gemma2_encoder.text_encoder)
+ gemma_tokenizer_info = context.models.load(self.gemma2_encoder.tokenizer)
+ with ExitStack() as stack:
+ (_, gemma_encoder) = stack.enter_context(gemma_text_encoder_info.model_on_device())
+ (_, gemma_tokenizer) = stack.enter_context(gemma_tokenizer_info.model_on_device())
+ if not isinstance(gemma_encoder, PreTrainedModel):
+ raise TypeError(f"Expected PreTrainedModel for Gemma encoder, got {type(gemma_encoder).__name__}.")
+ if not isinstance(gemma_tokenizer, PreTrainedTokenizerBase):
+ raise TypeError(
+ f"Expected PreTrainedTokenizerBase for Gemma tokenizer, got {type(gemma_tokenizer).__name__}."
+ )
+
+ device = TorchDevice.choose_torch_device()
+ encode_dtype = TorchDevice.choose_bfloat16_safe_dtype(device)
+ context.util.signal_progress("Encoding caption with Gemma-2")
+ caption_embs, caption_mask = encode_caption_for_pid(
+ [self.prompt],
+ tokenizer=gemma_tokenizer,
+ encoder=gemma_encoder,
+ device=device,
+ dtype=encode_dtype,
+ )
+ caption_embs = caption_embs.detach().to("cpu")
+ caption_mask = caption_mask.detach().to("cpu")
+ del gemma_encoder, gemma_tokenizer
+ # Gemma is only needed for the one-shot caption encode above. Offload it from VRAM (keeping it in the RAM
+ # cache) so its ~5GB is freed before the PiD decoder loads. The cache offloads anything else it needs to
+ # fit the decode on its own, so we deliberately do NOT evict every other model here.
+ context.models.offload_from_vram(self.gemma2_encoder.text_encoder)
+ TorchDevice.empty_cache()
+
+ # 4) Run PiD decode (the loader already returns a live PidNet).
+ pid_info = context.models.load(self.pid_decoder.decoder)
+ estimated_working_memory = estimate_pid_decode_working_memory(latents, BaseModelType.QwenImage)
+ with pid_info.model_on_device(working_mem_bytes=estimated_working_memory) as (_, pid_net):
+ if not isinstance(pid_net, PidNet):
+ raise TypeError(f"Expected PidNet for PiD decoder, got {type(pid_net).__name__}.")
+ device = TorchDevice.choose_torch_device()
+ dtype = next(iter(pid_net.parameters())).dtype
+
+ # Per-channel denormalise: z_raw = z_norm * std + mean (the transform qwen_image_l2i applies before
+ # vae.decode). mean/std are (16,) -> (1, 16, 1, 1) to broadcast over the (B, 16, H, W) latent.
+ mean_t = torch.tensor(latents_mean, device=device, dtype=dtype).view(1, 16, 1, 1)
+ std_t = torch.tensor(latents_std, device=device, dtype=dtype).view(1, 16, 1, 1)
+ denorm_latent = latents.to(device=device, dtype=dtype) * std_t + mean_t
+ caption_embs = caption_embs.to(device=device, dtype=dtype)
+
+ context.util.signal_progress("Running PiD decoder")
+ decoder = PiDDecoder(pid_net, backbone=BaseModelType.QwenImage)
+ x0 = decoder.decode(
+ latent=denorm_latent,
+ caption_embs=caption_embs,
+ caption_mask=caption_mask,
+ config=PiDDecodeConfig(num_inference_steps=self.num_inference_steps, seed=self.seed),
+ )
+
+ TorchDevice.empty_cache()
+
+ img = rearrange(x0[0].clamp(-1, 1), "c h w -> h w c")
+ img_pil = Image.fromarray((127.5 * (img + 1.0)).byte().cpu().numpy())
+ image_dto = context.images.save(image=img_pil)
+ return ImageOutput.build(image_dto)
diff --git a/invokeai/app/invocations/sd3_pid_decode.py b/invokeai/app/invocations/sd3_pid_decode.py
new file mode 100644
index 00000000000..ef65b3d98d9
--- /dev/null
+++ b/invokeai/app/invocations/sd3_pid_decode.py
@@ -0,0 +1,139 @@
+"""SD3 PiD decode invocation.
+
+Replaces SD3's AutoencoderKL decode with the PiD pixel-diffusion super-res
+decoder (``PiD_res2k_sr4x_official_sd3_distill_4step``). Produces a 4x
+super-resolved image from an SD3 latent in a 4-step distill pass.
+"""
+
+from contextlib import ExitStack
+
+import torch
+from einops import rearrange
+from PIL import Image
+from transformers import PreTrainedModel, PreTrainedTokenizerBase
+
+from invokeai.app.invocations.baseinvocation import BaseInvocation, Classification, invocation
+from invokeai.app.invocations.fields import (
+ FieldDescriptions,
+ Input,
+ InputField,
+ LatentsField,
+ UIComponent,
+ WithBoard,
+ WithMetadata,
+)
+from invokeai.app.invocations.model import Gemma2EncoderField, PiDDecoderField
+from invokeai.app.invocations.primitives import ImageOutput
+from invokeai.app.services.shared.invocation_context import InvocationContext
+from invokeai.backend.model_manager.taxonomy import BaseModelType
+from invokeai.backend.pid._src.networks.pid_net import PidNet
+from invokeai.backend.pid.decode import (
+ PiDDecodeConfig,
+ PiDDecoder,
+ encode_caption_for_pid,
+ estimate_pid_decode_working_memory,
+)
+from invokeai.backend.util.devices import TorchDevice
+
+# SD3 medium VAE constants (see diffusers `stabilityai/stable-diffusion-3-medium` VAE config
+# and PiD's pipeline_registry.py confirmation).
+_SD3_VAE_SCALING_FACTOR: float = 1.5305  # the stored latent is divided by this before PiD decode
+_SD3_VAE_SHIFT_FACTOR: float = 0.0609  # ...and this is added after the division
+
+
+@invocation(
+    "sd3_pid_decode",
+    title="Latents to Image - SD3 + PiD (4x SR)",
+    tags=["latents", "image", "pid", "sd3", "upscale"],
+    category="latents",
+    version="1.0.0",
+    classification=Classification.Prototype,
+)
+class SD3PiDDecodeInvocation(BaseInvocation, WithMetadata, WithBoard):
+    """Decode an SD3 latent with the PiD pixel-diffusion decoder."""
+
+    latents: LatentsField = InputField(description=FieldDescriptions.latents, input=Input.Connection)
+    prompt: str = InputField(
+        description="Text prompt the latent was generated from. PiD conditions on it.",
+        ui_component=UIComponent.Textarea,
+    )
+    gemma2_encoder: Gemma2EncoderField = InputField(
+        title="Gemma-2 Encoder",
+        description="Gemma-2 caption encoder. Required by PiD.",
+        input=Input.Connection,
+    )
+    pid_decoder: PiDDecoderField = InputField(
+        title="PiD Decoder",
+        description="PiD SD3 decoder checkpoint.",
+        input=Input.Connection,
+    )
+    num_inference_steps: int = InputField(
+        default=4,
+        ge=1,
+        le=8,
+        description="Number of PiD distill steps. The released checkpoints are trained for 4.",
+    )
+    seed: int = InputField(default=0, description="Seed for the PiD decoder's noise.")
+
+    @torch.no_grad()
+    def invoke(self, context: InvocationContext) -> ImageOutput:
+        latents = context.tensors.load(self.latents.latents_name)
+        # 1) Encode the caption with Gemma-2 (this node has no `vae` input; it uses the module-level SD3 constants).
+        gemma_text_encoder_info = context.models.load(self.gemma2_encoder.text_encoder)
+        gemma_tokenizer_info = context.models.load(self.gemma2_encoder.tokenizer)
+        with ExitStack() as stack:
+            (_, gemma_encoder) = stack.enter_context(gemma_text_encoder_info.model_on_device())
+            (_, gemma_tokenizer) = stack.enter_context(gemma_tokenizer_info.model_on_device())
+            if not isinstance(gemma_encoder, PreTrainedModel):
+                raise TypeError(f"Expected PreTrainedModel for Gemma encoder, got {type(gemma_encoder).__name__}.")
+            if not isinstance(gemma_tokenizer, PreTrainedTokenizerBase):
+                raise TypeError(
+                    f"Expected PreTrainedTokenizerBase for Gemma tokenizer, got {type(gemma_tokenizer).__name__}."
+                )
+
+            device = TorchDevice.choose_torch_device()
+            encode_dtype = TorchDevice.choose_bfloat16_safe_dtype(device)
+            context.util.signal_progress("Encoding caption with Gemma-2")
+            caption_embs, caption_mask = encode_caption_for_pid(
+                [self.prompt],
+                tokenizer=gemma_tokenizer,
+                encoder=gemma_encoder,
+                device=device,
+                dtype=encode_dtype,
+            )
+            caption_embs = caption_embs.detach().to("cpu")
+            # The mask is parked on the CPU too; step 2 moves only caption_embs back and passes the mask as-is.
+            caption_mask = caption_mask.detach().to("cpu")
+        del gemma_encoder, gemma_tokenizer
+        # Gemma is only needed for the one-shot caption encode above. Offload it from VRAM (keeping it in the RAM
+        # cache) so its ~5GB is freed before the PiD decoder loads. The cache offloads anything else it needs to
+        # fit the decode on its own, so we deliberately do NOT evict every other model here.
+        context.models.offload_from_vram(self.gemma2_encoder.text_encoder)
+        TorchDevice.empty_cache()
+        # 2) Run PiD decode on the denormalised latent.
+        pid_info = context.models.load(self.pid_decoder.decoder)
+        estimated_working_memory = estimate_pid_decode_working_memory(latents, BaseModelType.StableDiffusion3)
+        with pid_info.model_on_device(working_mem_bytes=estimated_working_memory) as (_, pid_net):
+            if not isinstance(pid_net, PidNet):
+                raise TypeError(f"Expected PidNet for PiD decoder, got {type(pid_net).__name__}.")
+            device = TorchDevice.choose_torch_device()
+            dtype = next(iter(pid_net.parameters())).dtype  # follow the dtype of the decoder's first parameter
+            # Denormalise with the fixed SD3 constants: z / scaling_factor + shift_factor.
+            denorm_latent = latents.to(device=device, dtype=dtype) / _SD3_VAE_SCALING_FACTOR + _SD3_VAE_SHIFT_FACTOR
+            caption_embs = caption_embs.to(device=device, dtype=dtype)
+
+            context.util.signal_progress("Running PiD decoder")
+            decoder = PiDDecoder(pid_net, backbone=BaseModelType.StableDiffusion3)
+            x0 = decoder.decode(
+                latent=denorm_latent,
+                caption_embs=caption_embs,
+                caption_mask=caption_mask,
+                config=PiDDecodeConfig(num_inference_steps=self.num_inference_steps, seed=self.seed),
+            )
+
+        TorchDevice.empty_cache()
+
+        img = rearrange(x0[0].clamp(-1, 1), "c h w -> h w c")  # first batch item only, CHW -> HWC
+        img_pil = Image.fromarray((127.5 * (img + 1.0)).byte().cpu().numpy())  # [-1, 1] -> [0, 255] bytes
+        image_dto = context.images.save(image=img_pil)
+        return ImageOutput.build(image_dto)
diff --git a/invokeai/app/invocations/sdxl_pid_decode.py b/invokeai/app/invocations/sdxl_pid_decode.py
new file mode 100644
index 00000000000..9c98be8b422
--- /dev/null
+++ b/invokeai/app/invocations/sdxl_pid_decode.py
@@ -0,0 +1,185 @@
+"""SDXL PiD decode invocation.
+
+Replaces SDXL's AutoencoderKL decode with the PiD pixel-diffusion super-res
+decoder (``PiD_res2kto4k_sr4x_official_sdxl_distill_4step``). Produces a 4x
+super-resolved image from an SDXL latent in a single 4-step distill pass.
+
+SDXL latents are 4-channel at an 8x spatial down-factor (``_PER_BACKBONE[SDXL]``
+in ``backend/pid/decode.py``: ``lq_latent_channels=4``, ``latent_spatial_down_factor=8``),
+so - unlike FLUX.2 - no patchify/pack is needed; the stored latent goes straight
+to PiD after denormalization.
+
+Denormalization: SDXL's VAE (``AutoencoderKL``) exposes a scalar
+``scaling_factor`` (0.13025) and no shift, so the stored latent is denormalized
+as ``z / scaling_factor + shift`` (matching the FLUX / Z-Image nodes). We read
+the constants from the VAE config at runtime when a ``vae`` is wired, falling
+back to the documented SDXL constants otherwise.
+"""
+
+from contextlib import ExitStack
+
+import torch
+from einops import rearrange
+from PIL import Image
+from transformers import PreTrainedModel, PreTrainedTokenizerBase
+
+from invokeai.app.invocations.baseinvocation import BaseInvocation, Classification, invocation
+from invokeai.app.invocations.fields import (
+ FieldDescriptions,
+ Input,
+ InputField,
+ LatentsField,
+ UIComponent,
+ WithBoard,
+ WithMetadata,
+)
+from invokeai.app.invocations.model import Gemma2EncoderField, PiDDecoderField, VAEField
+from invokeai.app.invocations.primitives import ImageOutput
+from invokeai.app.services.shared.invocation_context import InvocationContext
+from invokeai.backend.model_manager.taxonomy import BaseModelType
+from invokeai.backend.pid._src.networks.pid_net import PidNet
+from invokeai.backend.pid.decode import (
+ PiDDecodeConfig,
+ PiDDecoder,
+ encode_caption_for_pid,
+ estimate_pid_decode_working_memory,
+)
+from invokeai.backend.util.devices import TorchDevice
+
+# SDXL VAE constants (diffusers `stabilityai/sdxl-vae` config: scaling_factor=0.13025, no shift). Prefer reading
+# scaling_factor / shift_factor from the wired VAE config at runtime; use these only as a fallback.
+_SDXL_VAE_SCALING_FACTOR_FALLBACK: float = 0.13025  # divisor applied to the stored latent
+_SDXL_VAE_SHIFT_FACTOR_FALLBACK: float = 0.0  # added after the division; zero means no shift
+
+
+@invocation(
+    "sdxl_pid_decode",
+    title="Latents to Image - SDXL + PiD (4x SR)",
+    tags=["latents", "image", "pid", "sdxl", "upscale"],
+    category="latents",
+    version="1.0.0",
+    classification=Classification.Prototype,
+)
+class SDXLPiDDecodeInvocation(BaseInvocation, WithMetadata, WithBoard):
+    """Decode an SDXL latent with the PiD pixel-diffusion decoder.
+
+    Produces a 4x super-resolved image in a single pass. The SDXL latent is
+    4-channel at an 8x down-factor, so it is denormalized (``z / scaling_factor``)
+    and handed straight to PiD - no packing needed.
+    """
+
+    latents: LatentsField = InputField(description=FieldDescriptions.latents, input=Input.Connection)
+    prompt: str = InputField(
+        description="Text prompt the latent was generated from. PiD conditions on it.",
+        ui_component=UIComponent.Textarea,
+    )
+    gemma2_encoder: Gemma2EncoderField = InputField(
+        title="Gemma-2 Encoder",
+        description="Gemma-2 caption encoder. Required by PiD.",
+        input=Input.Connection,
+    )
+    pid_decoder: PiDDecoderField = InputField(
+        title="PiD Decoder",
+        description="PiD SDXL decoder checkpoint.",
+        input=Input.Connection,
+    )
+    vae: VAEField | None = InputField(
+        default=None,
+        title="VAE",
+        description="SDXL VAE, used to read scaling_factor / shift_factor. "
+        "If omitted, the SDXL fallback constants (0.13025 / 0.0) are used.",
+        input=Input.Connection,
+    )
+    num_inference_steps: int = InputField(
+        default=4,
+        ge=1,
+        le=8,
+        description="Number of PiD distill steps. The released checkpoints are trained for 4.",
+    )
+    seed: int = InputField(default=0, description="Seed for the PiD decoder's noise.")
+
+    @torch.no_grad()
+    def invoke(self, context: InvocationContext) -> ImageOutput:
+        latents = context.tensors.load(self.latents.latents_name)
+
+        # 1) Resolve the VAE scaling/shift used to denormalise the stored SDXL latent. Prefer the VAE config; fall
+        # back to the documented SDXL constants (0.13025 / 0.0).
+        scaling_factor = _SDXL_VAE_SCALING_FACTOR_FALLBACK
+        shift_factor = _SDXL_VAE_SHIFT_FACTOR_FALLBACK
+        if self.vae is not None:
+            vae_info = context.models.load(self.vae.vae)
+            with vae_info.model_on_device() as (_, vae):
+                config = getattr(vae, "config", None)
+                if config is not None and hasattr(config, "scaling_factor"):
+                    scaling_factor = float(config.scaling_factor)
+                    shift_factor = float(getattr(config, "shift_factor", None) or 0.0)  # missing/None shift -> 0.0
+                else:  # no usable config: try attributes on the VAE object, keeping the fallbacks if absent
+                    scaling_factor = float(getattr(vae, "scale_factor", scaling_factor))
+                    shift_factor = float(getattr(vae, "shift_factor", shift_factor))
+            del vae_info
+            TorchDevice.empty_cache()
+        context.logger.info(
+            f"SDXL PiD decode: latent shape={tuple(latents.shape)} (expect [B, 4, H/8, W/8]) dtype={latents.dtype} "
+            f"using scale={scaling_factor:.5f} shift={shift_factor:.5f}"
+        )
+
+        # 2) Encode caption with Gemma-2.
+        gemma_text_encoder_info = context.models.load(self.gemma2_encoder.text_encoder)
+        gemma_tokenizer_info = context.models.load(self.gemma2_encoder.tokenizer)
+        with ExitStack() as stack:
+            (_, gemma_encoder) = stack.enter_context(gemma_text_encoder_info.model_on_device())
+            (_, gemma_tokenizer) = stack.enter_context(gemma_tokenizer_info.model_on_device())
+            if not isinstance(gemma_encoder, PreTrainedModel):
+                raise TypeError(f"Expected PreTrainedModel for Gemma encoder, got {type(gemma_encoder).__name__}.")
+            if not isinstance(gemma_tokenizer, PreTrainedTokenizerBase):
+                raise TypeError(
+                    f"Expected PreTrainedTokenizerBase for Gemma tokenizer, got {type(gemma_tokenizer).__name__}."
+                )
+
+            device = TorchDevice.choose_torch_device()
+            encode_dtype = TorchDevice.choose_bfloat16_safe_dtype(device)
+            context.util.signal_progress("Encoding caption with Gemma-2")
+            caption_embs, caption_mask = encode_caption_for_pid(
+                [self.prompt],
+                tokenizer=gemma_tokenizer,
+                encoder=gemma_encoder,
+                device=device,
+                dtype=encode_dtype,
+            )
+            caption_embs = caption_embs.detach().to("cpu")
+            caption_mask = caption_mask.detach().to("cpu")  # stays on the CPU; step 3 passes it to decode() as-is
+        del gemma_encoder, gemma_tokenizer
+        # Gemma is only needed for the one-shot caption encode above. Offload it from VRAM (keeping it in the RAM
+        # cache) so its ~5GB is freed before the PiD decoder loads. The cache offloads anything else it needs to
+        # fit the decode on its own, so we deliberately do NOT evict every other model here.
+        context.models.offload_from_vram(self.gemma2_encoder.text_encoder)
+        TorchDevice.empty_cache()
+
+        # 3) Run PiD decode (the loader already returns a live PidNet).
+        pid_info = context.models.load(self.pid_decoder.decoder)
+        estimated_working_memory = estimate_pid_decode_working_memory(latents, BaseModelType.StableDiffusionXL)
+        with pid_info.model_on_device(working_mem_bytes=estimated_working_memory) as (_, pid_net):
+            if not isinstance(pid_net, PidNet):
+                raise TypeError(f"Expected PidNet for PiD decoder, got {type(pid_net).__name__}.")
+            device = TorchDevice.choose_torch_device()
+            dtype = next(iter(pid_net.parameters())).dtype  # follow the dtype of the decoder's first parameter
+
+            # SDXL latents come out of the LDM in the VAE-normalized space; denormalise so PiD sees the raw latent.
+            denorm_latent = latents.to(device=device, dtype=dtype) / scaling_factor + shift_factor
+            caption_embs = caption_embs.to(device=device, dtype=dtype)
+
+            context.util.signal_progress("Running PiD decoder")
+            decoder = PiDDecoder(pid_net, backbone=BaseModelType.StableDiffusionXL)
+            x0 = decoder.decode(
+                latent=denorm_latent,
+                caption_embs=caption_embs,
+                caption_mask=caption_mask,
+                config=PiDDecodeConfig(num_inference_steps=self.num_inference_steps, seed=self.seed),
+            )
+
+        TorchDevice.empty_cache()
+
+        img = rearrange(x0[0].clamp(-1, 1), "c h w -> h w c")  # first batch item only, CHW -> HWC
+        img_pil = Image.fromarray((127.5 * (img + 1.0)).byte().cpu().numpy())  # [-1, 1] -> [0, 255] bytes
+        image_dto = context.images.save(image=img_pil)
+        return ImageOutput.build(image_dto)
diff --git a/invokeai/app/invocations/z_image_pid_decode.py b/invokeai/app/invocations/z_image_pid_decode.py
new file mode 100644
index 00000000000..e52a092c075
--- /dev/null
+++ b/invokeai/app/invocations/z_image_pid_decode.py
@@ -0,0 +1,204 @@
+"""Z-Image PiD decode invocation.
+
+Z-Image shares FLUX.1's 16-channel VAE, so the FLUX-trained PiD decoder
+(``PiD_res2k_sr4x_official_flux_distill_4step``) is the correct choice for
+Z-Image latents. This node replaces the regular Z-Image VAE decode with a
+PiD super-resolution decode (4x SR on top of the VAE's 8x spatial factor,
+i.e. 32x from latent to pixels; e.g. a 64×64 latent → 2048×2048 image).
+"""
+
+from contextlib import ExitStack
+
+import torch
+from einops import rearrange
+from PIL import Image
+from transformers import PreTrainedModel, PreTrainedTokenizerBase
+
+from invokeai.app.invocations.baseinvocation import BaseInvocation, Classification, invocation
+from invokeai.app.invocations.fields import (
+ FieldDescriptions,
+ Input,
+ InputField,
+ LatentsField,
+ UIComponent,
+ WithBoard,
+ WithMetadata,
+)
+from invokeai.app.invocations.model import Gemma2EncoderField, PiDDecoderField, VAEField
+from invokeai.app.invocations.primitives import ImageOutput
+from invokeai.app.services.shared.invocation_context import InvocationContext
+from invokeai.backend.model_manager.taxonomy import BaseModelType
+from invokeai.backend.pid._src.networks.pid_net import PidNet
+from invokeai.backend.pid.decode import (
+ PiDDecodeConfig,
+ PiDDecoder,
+ encode_caption_for_pid,
+ estimate_pid_decode_working_memory,
+)
+from invokeai.backend.util.devices import TorchDevice
+
+# Fallback Z-Image VAE constants. PiD's pipeline_registry.py explicitly notes
+# the exact values depend on the pretrained checkpoint, so prefer reading them
+# from the VAE config at runtime (see `vae` input below) and use these only as
+# a last resort.
+# These are the FLUX.1 values (the `vae` input description refers to them as the
+# "FLUX.1 fallback constants"). They are applied as `latent / scaling + shift`.
+_ZIMAGE_VAE_SCALING_FACTOR_FALLBACK: float = 0.3611
+_ZIMAGE_VAE_SHIFT_FACTOR_FALLBACK: float = 0.1159
+
+
+@invocation(
+ "z_image_pid_decode",
+ title="Latents to Image - Z-Image + PiD (4x SR)",
+ tags=["latents", "image", "pid", "z-image", "upscale"],
+ category="latents",
+ version="1.0.0",
+ classification=Classification.Prototype,
+)
+class ZImagePiDDecodeInvocation(BaseInvocation, WithMetadata, WithBoard):
+ """Decode a Z-Image latent with the PiD pixel-diffusion decoder.
+
+ Produces a 4x super-resolved image in a single pass (the PiD decoder used
+ here is trained on FLUX.1 latents, whose VAE Z-Image shares; ``sr_scale=4``
+ with the FLUX VAE's 8x spatial down-factor gives a 32x linear scale from
+ latent to pixel).
+
+ ``invoke`` runs three stages: resolve the VAE scale / shift constants,
+ encode ``prompt`` with Gemma-2, then run the PiD decoder and save the image.
+ """
+
+ latents: LatentsField = InputField(description=FieldDescriptions.latents, input=Input.Connection)
+ prompt: str = InputField(
+ description="Text prompt the latent was generated from. PiD conditions on it.",
+ ui_component=UIComponent.Textarea,
+ )
+ gemma2_encoder: Gemma2EncoderField = InputField(
+ title="Gemma-2 Encoder",
+ description="Gemma-2 caption encoder. Required by PiD.",
+ input=Input.Connection,
+ )
+ pid_decoder: PiDDecoderField = InputField(
+ title="PiD Decoder",
+ description="PiD FLUX decoder checkpoint.",
+ input=Input.Connection,
+ )
+ vae: VAEField | None = InputField(
+ default=None,
+ title="VAE",
+ description="Z-Image VAE used to read scaling_factor / shift_factor. "
+ "If omitted, the FLUX.1 fallback constants (0.3611 / 0.1159) are used.",
+ input=Input.Connection,
+ )
+ num_inference_steps: int = InputField(
+ default=4,
+ ge=1,
+ le=8,
+ description="Number of PiD distill steps. The released checkpoints are trained for 4.",
+ )
+ seed: int = InputField(default=0, description="Seed for the PiD decoder's noise.")
+
+ @torch.no_grad()
+ def invoke(self, context: InvocationContext) -> ImageOutput:
+ """Decode ``self.latents`` with PiD and save the result as an image.
+
+ Loads the stored latent, denormalises it with the resolved VAE constants
+ (``latent / scaling_factor + shift_factor``), conditions the PiD decoder on
+ the Gemma-2 encoding of ``self.prompt``, and saves the first decoded item.
+
+ Raises:
+ TypeError: if a loaded Gemma encoder, tokenizer or PiD network is not
+ of the expected class.
+ """
+ latents = context.tensors.load(self.latents.latents_name)
+
+ # 1) Resolve the VAE scaling/shift used to denormalise the stored
+ # Z-Image latent. PiD's pipeline_registry says these are
+ # checkpoint-specific for Z-Image, so prefer the VAE config when
+ # available and fall back to the FLUX values otherwise.
+ scaling_factor = _ZIMAGE_VAE_SCALING_FACTOR_FALLBACK
+ shift_factor = _ZIMAGE_VAE_SHIFT_FACTOR_FALLBACK
+ if self.vae is not None:
+ vae_info = context.models.load(self.vae.vae)
+ with vae_info.model_on_device() as (_, vae):
+ config = getattr(vae, "config", None)
+ if config is not None and hasattr(config, "scaling_factor"):
+ scaling_factor = float(config.scaling_factor)
+ # A config that has scaling_factor but a missing / falsy shift_factor is treated as shift 0.0.
+ shift_factor = float(getattr(config, "shift_factor", None) or 0.0)
+ else:
+ # FluxAutoEncoder stores the constants directly on the module.
+ scaling_factor = float(getattr(vae, "scale_factor", scaling_factor))
+ shift_factor = float(getattr(vae, "shift_factor", shift_factor))
+ del vae_info
+ TorchDevice.empty_cache()
+ context.logger.info(
+ f"Z-Image PiD decode: latent shape={tuple(latents.shape)} dtype={latents.dtype} "
+ f"stats[min={latents.min().item():.3f} max={latents.max().item():.3f} "
+ f"mean={latents.mean().item():.3f}] using scale={scaling_factor:.4f} shift={shift_factor:.4f}"
+ )
+
+ # 2) Encode caption with Gemma-2.
+ gemma_text_encoder_info = context.models.load(self.gemma2_encoder.text_encoder)
+ gemma_tokenizer_info = context.models.load(self.gemma2_encoder.tokenizer)
+ with ExitStack() as stack:
+ (_, gemma_encoder) = stack.enter_context(gemma_text_encoder_info.model_on_device())
+ (_, gemma_tokenizer) = stack.enter_context(gemma_tokenizer_info.model_on_device())
+ if not isinstance(gemma_encoder, PreTrainedModel):
+ raise TypeError(f"Expected PreTrainedModel for Gemma encoder, got {type(gemma_encoder).__name__}.")
+ if not isinstance(gemma_tokenizer, PreTrainedTokenizerBase):
+ raise TypeError(
+ f"Expected PreTrainedTokenizerBase for Gemma tokenizer, got {type(gemma_tokenizer).__name__}."
+ )
+
+ device = TorchDevice.choose_torch_device()
+ encode_dtype = TorchDevice.choose_bfloat16_safe_dtype(device)
+
+ context.util.signal_progress("Encoding caption with Gemma-2")
+ caption_embs, caption_mask = encode_caption_for_pid(
+ [self.prompt],
+ tokenizer=gemma_tokenizer,
+ encoder=gemma_encoder,
+ device=device,
+ dtype=encode_dtype,
+ )
+ # Move off-device so Gemma's slot in the cache can be reclaimed.
+ caption_embs = caption_embs.detach().to("cpu")
+
+ caption_mask = caption_mask.detach().to("cpu")
+ # Drop Gemma references so the cache can evict it before we load PiD.
+ del gemma_encoder, gemma_tokenizer
+ # Gemma is only needed for the one-shot caption encode above. Offload it from VRAM (keeping it in the RAM
+ # cache) so its ~5GB is freed before the PiD decoder loads. The cache offloads anything else it needs to
+ # fit the decode on its own, so we deliberately do NOT evict every other model here.
+ context.models.offload_from_vram(self.gemma2_encoder.text_encoder)
+ TorchDevice.empty_cache()
+
+ # 3) Run PiD decode (the loader already returns a live PidNet).
+ pid_info = context.models.load(self.pid_decoder.decoder)
+ estimated_working_memory = estimate_pid_decode_working_memory(latents, BaseModelType.Flux)
+ with pid_info.model_on_device(working_mem_bytes=estimated_working_memory) as (_, pid_net):
+ if not isinstance(pid_net, PidNet):
+ raise TypeError(f"Expected PidNet for PiD decoder, got {type(pid_net).__name__}.")
+ device = TorchDevice.choose_torch_device()
+ # Run the decode in whatever dtype the loaded PiD weights use.
+ dtype = next(iter(pid_net.parameters())).dtype
+
+ # Z-Image latents come out of the diffusers pipeline normalised
+ # by the VAE constants. PiD expects the raw latent.
+ denorm_latent = latents.to(device=device, dtype=dtype) / scaling_factor + shift_factor
+ context.logger.info(
+ f"denorm_latent stats[min={denorm_latent.min().item():.3f} "
+ f"max={denorm_latent.max().item():.3f} mean={denorm_latent.mean().item():.3f} "
+ f"std={denorm_latent.float().std().item():.3f}]; "
+ f"caption_embs shape={tuple(caption_embs.shape)} "
+ f"stats[min={caption_embs.min().item():.3f} max={caption_embs.max().item():.3f} "
+ f"mean={caption_embs.mean().item():.3f} std={caption_embs.float().std().item():.3f}]"
+ )
+ caption_embs = caption_embs.to(device=device, dtype=dtype)
+
+ context.util.signal_progress("Running PiD decoder")
+ decoder = PiDDecoder(pid_net, backbone=BaseModelType.Flux)
+ # NOTE(review): caption_mask is passed as the CPU tensor produced in step 2 (only caption_embs is
+ # moved to `device` above) - presumably PiDDecoder.decode handles its placement; confirm.
+ x0 = decoder.decode(
+ latent=denorm_latent,
+ caption_embs=caption_embs,
+ caption_mask=caption_mask,
+ config=PiDDecodeConfig(num_inference_steps=self.num_inference_steps, seed=self.seed),
+ )
+ context.logger.info(
+ f"PiD output stats: shape={tuple(x0.shape)} dtype={x0.dtype} "
+ f"raw[min={x0.min().item():.3f} max={x0.max().item():.3f} "
+ f"mean={x0.mean().item():.3f} std={x0.float().std().item():.3f}] "
+ f"nan_count={int(torch.isnan(x0).sum().item())} "
+ f"inf_count={int(torch.isinf(x0).sum().item())}"
+ )
+
+ TorchDevice.empty_cache()
+
+ # x0 is [B, 3, H, W] in [-1, 1]; convert the first item to a PIL image.
+ img = rearrange(x0[0].clamp(-1, 1), "c h w -> h w c")
+ # Map [-1, 1] to [0, 255] before the uint8 conversion.
+ img_pil = Image.fromarray((127.5 * (img + 1.0)).byte().cpu().numpy())
+
+ image_dto = context.images.save(image=img_pil)
+ return ImageOutput.build(image_dto)
diff --git a/invokeai/app/services/model_records/model_records_base.py b/invokeai/app/services/model_records/model_records_base.py
index e06f8f2df91..56303dace97 100644
--- a/invokeai/app/services/model_records/model_records_base.py
+++ b/invokeai/app/services/model_records/model_records_base.py
@@ -30,6 +30,7 @@
ModelSourceType,
ModelType,
ModelVariantType,
+ PiDDecoderVariantType,
Qwen3VariantType,
QwenImageVariantType,
SchedulerPredictionType,
@@ -135,6 +136,7 @@ def validate_source_url(cls, v: Any) -> Optional[str]:
| ZImageVariantType
| QwenImageVariantType
| Qwen3VariantType
+ | PiDDecoderVariantType
] = Field(description="The variant of the model.", default=None)
prediction_type: Optional[SchedulerPredictionType] = Field(
description="The prediction type of the model.", default=None
diff --git a/invokeai/app/services/shared/invocation_context.py b/invokeai/app/services/shared/invocation_context.py
index e38766d5ba2..44b657bca80 100644
--- a/invokeai/app/services/shared/invocation_context.py
+++ b/invokeai/app/services/shared/invocation_context.py
@@ -426,6 +426,22 @@ def load_by_attrs(
self._util.signal_progress(message)
return self._services.model_manager.load.load_model(configs[0], submodel_type)
+ def offload_from_vram(self, identifier: Union[str, "ModelIdentifierField"]) -> int:
+ """Move a model (and all of its submodels) from VRAM to RAM, freeing its VRAM but keeping it cached.
+
+ Use this when an invocation is done with a model for the rest of the run - e.g. a one-shot text encoder -
+ so the next, larger load does not have to compete with it for VRAM. The model stays in the RAM cache, so
+ a subsequent load only re-streams it back to VRAM rather than rebuilding it from disk.
+
+ Args:
+ identifier: The model key (``str``) or a ``ModelIdentifierField`` whose ``key`` names the model to offload.
+
+ Returns:
+ The number of VRAM bytes freed, as reported by the RAM cache's ``offload_model_from_vram``.
+ """
+ # Accept either a raw key or a field object; only the key is forwarded to the cache.
+ key = identifier if isinstance(identifier, str) else identifier.key
+ return self._services.model_manager.load.ram_cache.offload_model_from_vram(key)
+
@staticmethod
def _raise_if_external(model: AnyModelConfig) -> None:
if model.base == BaseModelType.External or model.format == ModelFormat.ExternalApi:
diff --git a/invokeai/backend/model_manager/configs/factory.py b/invokeai/backend/model_manager/configs/factory.py
index b176a6ff0b2..f68741ba4c6 100644
--- a/invokeai/backend/model_manager/configs/factory.py
+++ b/invokeai/backend/model_manager/configs/factory.py
@@ -28,6 +28,7 @@
)
from invokeai.backend.model_manager.configs.external_api import ExternalApiModelConfig
from invokeai.backend.model_manager.configs.flux_redux import FLUXRedux_Checkpoint_Config
+from invokeai.backend.model_manager.configs.gemma2_encoder import Gemma2Encoder_Gemma2Encoder_Config
from invokeai.backend.model_manager.configs.identification_utils import NotAMatchError
from invokeai.backend.model_manager.configs.ip_adapter import (
IPAdapter_Checkpoint_FLUX_Config,
@@ -86,6 +87,13 @@
Main_GGUF_ZImage_Config,
MainModelDefaultSettings,
)
+from invokeai.backend.model_manager.configs.pid_decoder import (
+ PiDDecoder_Checkpoint_Flux2_Config,
+ PiDDecoder_Checkpoint_FLUX_Config,
+ PiDDecoder_Checkpoint_QwenImage_Config,
+ PiDDecoder_Checkpoint_SD3_Config,
+ PiDDecoder_Checkpoint_SDXL_Config,
+)
from invokeai.backend.model_manager.configs.qwen3_encoder import (
Qwen3Encoder_Checkpoint_Config,
Qwen3Encoder_GGUF_Config,
@@ -207,6 +215,12 @@
Annotated[VAE_Diffusers_SD1_Config, VAE_Diffusers_SD1_Config.get_tag()],
Annotated[VAE_Diffusers_SDXL_Config, VAE_Diffusers_SDXL_Config.get_tag()],
Annotated[VAE_Diffusers_Flux2_Config, VAE_Diffusers_Flux2_Config.get_tag()],
+ # PiD Decoder - checkpoint format
+ Annotated[PiDDecoder_Checkpoint_FLUX_Config, PiDDecoder_Checkpoint_FLUX_Config.get_tag()],
+ Annotated[PiDDecoder_Checkpoint_Flux2_Config, PiDDecoder_Checkpoint_Flux2_Config.get_tag()],
+ Annotated[PiDDecoder_Checkpoint_SD3_Config, PiDDecoder_Checkpoint_SD3_Config.get_tag()],
+ Annotated[PiDDecoder_Checkpoint_SDXL_Config, PiDDecoder_Checkpoint_SDXL_Config.get_tag()],
+ Annotated[PiDDecoder_Checkpoint_QwenImage_Config, PiDDecoder_Checkpoint_QwenImage_Config.get_tag()],
# ControlNet - checkpoint format
Annotated[ControlNet_Checkpoint_SD1_Config, ControlNet_Checkpoint_SD1_Config.get_tag()],
Annotated[ControlNet_Checkpoint_SD2_Config, ControlNet_Checkpoint_SD2_Config.get_tag()],
@@ -250,6 +264,8 @@
Annotated[Qwen3Encoder_Qwen3Encoder_Config, Qwen3Encoder_Qwen3Encoder_Config.get_tag()],
Annotated[Qwen3Encoder_Checkpoint_Config, Qwen3Encoder_Checkpoint_Config.get_tag()],
Annotated[Qwen3Encoder_GGUF_Config, Qwen3Encoder_GGUF_Config.get_tag()],
+ # Gemma 2 Encoder (used by PiD)
+ Annotated[Gemma2Encoder_Gemma2Encoder_Config, Gemma2Encoder_Gemma2Encoder_Config.get_tag()],
# Qwen VL Encoder (Qwen2.5-VL multimodal encoder for Qwen Image)
Annotated[QwenVLEncoder_Diffusers_Config, QwenVLEncoder_Diffusers_Config.get_tag()],
Annotated[QwenVLEncoder_Checkpoint_Config, QwenVLEncoder_Checkpoint_Config.get_tag()],
diff --git a/invokeai/backend/model_manager/configs/gemma2_encoder.py b/invokeai/backend/model_manager/configs/gemma2_encoder.py
new file mode 100644
index 00000000000..b922f4e060e
--- /dev/null
+++ b/invokeai/backend/model_manager/configs/gemma2_encoder.py
@@ -0,0 +1,70 @@
+"""Model config for the Gemma-2-2b-it text encoder used by PiD.
+
+PiD's pre-trained decoders condition on Gemma-2-2b-it caption embeddings
+(2304-dim). This config recognises a stand-alone diffusers/transformers
+directory containing a Gemma2 causal LM (config.json + safetensors weights +
+tokenizer files).
+
+The reference model PiD uses is `Efficient-Large-Model/gemma-2-2b-it`, an
+ungated mirror of `google/gemma-2-2b-it`. Both produce a
+`Gemma2ForCausalLM` config which is what we match on.
+
+License note: Gemma 2 is distributed under the Gemma Terms of Use (Google).
+This config only describes how to recognise the model on disk; downloading
+and accepting Gemma's license is the user's responsibility.
+"""
+
+from typing import Any, Literal, Self
+
+from pydantic import Field
+
+from invokeai.backend.model_manager.configs.base import Config_Base
+from invokeai.backend.model_manager.configs.identification_utils import (
+ NotAMatchError,
+ raise_for_class_name,
+ raise_for_override_fields,
+ raise_if_not_dir,
+)
+from invokeai.backend.model_manager.model_on_disk import ModelOnDisk
+from invokeai.backend.model_manager.taxonomy import BaseModelType, ModelFormat, ModelType
+
+
+class Gemma2Encoder_Gemma2Encoder_Config(Config_Base):
+ """Standalone Gemma-2 causal LM directory used as a text encoder by PiD.
+
+ Expected directory layout (HuggingFace `from_pretrained`-compatible)::
+
+ <model_dir>/
+ config.json # architectures: ["Gemma2ForCausalLM"]
+ tokenizer.json
+ tokenizer_config.json
+ model-*.safetensors # or model.safetensors / *.bin
+ """
+
+ base: Literal[BaseModelType.Any] = Field(default=BaseModelType.Any)
+ type: Literal[ModelType.Gemma2Encoder] = Field(default=ModelType.Gemma2Encoder)
+ format: Literal[ModelFormat.Gemma2Encoder] = Field(default=ModelFormat.Gemma2Encoder)
+ cpu_only: bool | None = Field(default=None, description="Whether this model should run on CPU only")
+
+ @classmethod
+ def from_model_on_disk(cls, mod: ModelOnDisk, override_fields: dict[str, Any]) -> Self:
+ """Build the config if ``mod`` is a standalone Gemma-2 causal LM directory.
+
+ Raises ``NotAMatchError`` when ``config.json`` is missing, when a root
+ ``model_index.json`` marks the directory as a full pipeline, or when neither
+ ``tokenizer.json`` nor ``tokenizer.model`` is present. The ``raise_*`` helpers
+ perform the directory, override-field and architecture checks (presumably
+ also signalling a mismatch via ``NotAMatchError`` - confirm in identification_utils).
+ """
+ raise_if_not_dir(mod)
+ raise_for_override_fields(cls, override_fields)
+
+ config_path = mod.path / "config.json"
+ if not config_path.exists():
+ raise NotAMatchError(f"missing config.json at {config_path}")
+
+ # Reject full diffusers pipelines (they have model_index.json at root).
+ if (mod.path / "model_index.json").exists():
+ raise NotAMatchError("directory looks like a full diffusers pipeline, not a standalone Gemma2 encoder")
+
+ # Architecture marker is the canonical signal.
+ raise_for_class_name(config_path, {"Gemma2ForCausalLM"})
+
+ # Sanity check that tokenizer files live alongside the model (PiD calls
+ # AutoTokenizer.from_pretrained on the same directory).
+ if not any((mod.path / f).exists() for f in ("tokenizer.json", "tokenizer.model")):
+ raise NotAMatchError("directory does not contain Gemma2 tokenizer files (tokenizer.json/tokenizer.model)")
+
+ return cls(**override_fields)
diff --git a/invokeai/backend/model_manager/configs/pid_decoder.py b/invokeai/backend/model_manager/configs/pid_decoder.py
new file mode 100644
index 00000000000..7a677c0e5ca
--- /dev/null
+++ b/invokeai/backend/model_manager/configs/pid_decoder.py
@@ -0,0 +1,234 @@
+"""Model configs for PiD (Pixel Diffusion Decoder) checkpoints.
+
+PiD decoders are released by NVIDIA at https://huggingface.co/nvidia/PiD and
+ship per supported backbone in two resolution presets (`res2k_sr4x_*` and
+`res2kto4k_sr4x_*`). The configs below cover the FLUX.1, FLUX.2, SD3, SDXL and
+Qwen-Image backbones. See `LICENSE-PiD.txt` at the repo root — code is
+Apache-2.0, weights are NSCLv1 (non-commercial / research).
+"""
+
+import re
+from typing import Any, Literal, Self
+
+from pydantic import Field
+
+from invokeai.backend.model_manager.configs.base import Checkpoint_Config_Base, Config_Base
+from invokeai.backend.model_manager.configs.identification_utils import (
+ NotAMatchError,
+ raise_for_override_fields,
+ raise_if_not_file,
+)
+from invokeai.backend.model_manager.model_on_disk import ModelOnDisk
+from invokeai.backend.model_manager.taxonomy import (
+ BaseModelType,
+ ModelFormat,
+ ModelType,
+ PiDDecoderVariantType,
+)
+
+# Marker substring produced by `PidNet.lq_proj` (see
+# invokeai/backend/pid/_src/networks/pid_net.py). The pretrained PixDiT_T2I
+# weights do not contain `lq_proj`, so its presence in any key is diagnostic
+# of a PiD-style checkpoint. We match by substring (not prefix) because the
+# official `.pth` files keep PidDistillModel's `net.` prefix, so keys look
+# like `net.lq_proj.layers.0.weight`.
+_PID_MARKER_SUBSTRING = "lq_proj"
+
+
+def _looks_like_pid_decoder(state_dict: dict[str | int, Any]) -> bool:
+    """Return True when any string key of ``state_dict`` contains the PiD marker substring."""
+    for key in state_dict:
+        # Non-string keys can never carry the marker.
+        if isinstance(key, str) and _PID_MARKER_SUBSTRING in key:
+            return True
+    return False
+
+
+# The latent input projection (`lq_proj.latent_proj.0`) is a Conv2d whose
+# in-channel count equals the backbone's latent channel count — the released
+# sr4x checkpoints apply no spatial fold here, so the Conv's dim-1 is exactly
+# `lq_latent_channels` (see `_PER_BACKBONE` in invokeai/backend/pid/decode.py):
+# SDXL = 4, FLUX.1 / SD3 / Qwen-Image = 16, FLUX.2 = 128 (the mapping in
+# `_LATENT_CHANNELS_TO_BASES` below). This is the only architectural dimension
+# that varies between backbones and is therefore a filename-independent
+# discriminator between the 4-, 16- and 128-channel families. (FLUX.1 and SD3
+# are architecturally identical and cannot be told apart from the weights
+# alone; Qwen-Image shares their 16-channel latent shape, so the whole
+# 16-channel family needs the name-based tie-break in `_validate_base`.)
+# We match the key by suffix because the official `.pth` keep the `net.` prefix.
+_LATENT_PROJ_KEY_SUFFIX = "lq_proj.latent_proj.0.weight"
+
+# Latent channel count -> backbones that use it. A set with more than one
+# member means the channel count alone is ambiguous.
+_LATENT_CHANNELS_TO_BASES: dict[int, set[BaseModelType]] = {
+ 4: {BaseModelType.StableDiffusionXL},
+ 16: {BaseModelType.Flux, BaseModelType.StableDiffusion3, BaseModelType.QwenImage},
+ 128: {BaseModelType.Flux2},
+}
+
+
+def _latent_channels_from_state_dict(state_dict: dict[str | int, Any]) -> int | None:
+    """Return dim-1 of the `lq_proj` input Conv weight, i.e. the latent channel count.
+
+    Scans the state dict for a string key ending in the latent-projection suffix
+    whose value exposes a 4-element ``shape``. Returns None when no such entry exists.
+    """
+    for key, tensor in state_dict.items():
+        if not isinstance(key, str) or not key.endswith(_LATENT_PROJ_KEY_SUFFIX):
+            continue
+        dims = getattr(tensor, "shape", None)
+        if dims is None or len(dims) != 4:
+            # Not a 4D conv weight; keep looking at the remaining keys.
+            continue
+        return int(dims[1])
+    return None
+
+
+def _name_for_matching(mod: ModelOnDisk) -> str:
+    """Build the string that the backbone / variant name heuristics search.
+
+    NVIDIA distributes PiD checkpoints as
+    ``PiD_res2k_sr4x_official_<backbone>_distill_4step/model_ema_bf16.pth``, so the
+    backbone and variant are encoded in the *directory* name rather than the
+    weights filename. The result is therefore the parent directory name and the
+    file name joined by a single space.
+    """
+    parent_dir_name = mod.path.parent.name
+    file_name = mod.path.name
+    return f"{parent_dir_name} {file_name}"
+
+
+def _backbone_from_filename(name: str) -> BaseModelType | None:
+    """Heuristic backbone match against NVIDIA's checkpoint filename conventions.
+
+    The match is case-insensitive. Separated spellings (``flux_2``, ``flux-2``,
+    ``sd_3``, ``sd-3``) are recognised when they are not glued to a neighbouring
+    letter or digit. Underscores count as separators here: regex ``\\b`` treats
+    ``_`` as a word character, so a ``\\b``-based pattern would miss
+    underscore-delimited names such as ``official_flux_2_distill``.
+
+    Args:
+        name: File and/or directory name to inspect.
+
+    Returns:
+        The inferred backbone, or None if no backbone can be inferred.
+    """
+    n = name.lower()
+    # Order matters: FLUX.2 must be tested before the plain 'flux' substring.
+    if re.search(r"flux2|(?<![a-z0-9])flux[_-]2(?![a-z0-9])", n):
+        return BaseModelType.Flux2
+    if "flux" in n:
+        return BaseModelType.Flux
+    if "sdxl" in n:
+        return BaseModelType.StableDiffusionXL
+    if re.search(r"qwen[_-]?image", n):
+        return BaseModelType.QwenImage
+    if re.search(r"sd3|(?<![a-z0-9])sd[_-]3(?![a-z0-9])", n):
+        return BaseModelType.StableDiffusion3
+    return None
+
+
+def _variant_from_filename(name: str) -> PiDDecoderVariantType:
+    """Pick the resolution preset from NVIDIA's `res2k_sr4x` / `res2kto4k_sr4x` name slice.
+
+    Any of the 2k-to-4k spellings selects ``Res2kTo4k_Sr4x``; everything else,
+    including names with no marker at all, falls back to ``Res2k_Sr4x``.
+    """
+    lowered = name.lower()
+    to_4k_markers = ("res2kto4k", "res2k_to_4k", "res2k_to4k")
+    if any(marker in lowered for marker in to_4k_markers):
+        return PiDDecoderVariantType.Res2kTo4k_Sr4x
+    return PiDDecoderVariantType.Res2k_Sr4x
+
+
+class PiDDecoder_Checkpoint_Config_Base(Checkpoint_Config_Base):
+    """Shared logic for PiD decoder checkpoint configs.
+
+    Concrete subclasses pin `base` to a specific backbone. Backbone matching is
+    driven primarily by the latent channel count read from the weights, with the
+    filename / directory name as a tie-breaker inside the 16-channel family
+    (FLUX.1 / SD3 / Qwen-Image), which the channel count cannot separate.
+    `variant` is carried as data without participating in the discriminator tag
+    (one config class per backbone).
+    """
+
+    type: Literal[ModelType.PiDDecoder] = Field(default=ModelType.PiDDecoder)
+    format: Literal[ModelFormat.Checkpoint] = Field(default=ModelFormat.Checkpoint)
+
+    @classmethod
+    def from_model_on_disk(cls, mod: ModelOnDisk, override_fields: dict[str, Any]) -> Self:
+        """Build the config if ``mod`` is a PiD checkpoint for this class's backbone.
+
+        Args:
+            mod: The single-file checkpoint to inspect.
+            override_fields: Caller-supplied field values. This dict is not modified.
+
+        Raises:
+            NotAMatchError: if the state dict has no ``lq_proj`` keys or the
+                checkpoint does not belong to this class's backbone.
+        """
+        raise_if_not_file(mod)
+        raise_for_override_fields(cls, override_fields)
+
+        state_dict = mod.load_state_dict()
+        if not _looks_like_pid_decoder(state_dict):
+            raise NotAMatchError("state dict does not look like a PiD decoder (no 'lq_proj.*' keys)")
+
+        # Whether the caller explicitly pinned a base (e.g. a starter-model install passes base=sd-3).
+        # In the ambiguous 16-channel case this override is trusted when the filename is silent.
+        had_base_override = override_fields.get("base") is not None
+        cls._validate_base(mod, state_dict, had_base_override=had_base_override)
+
+        # Pop `variant` from a copy so the caller's dict is left untouched; the same
+        # overrides may be offered to several candidate config classes.
+        fields = dict(override_fields)
+        variant = fields.pop("variant", None) or _variant_from_filename(_name_for_matching(mod))
+        return cls(**fields, variant=variant)
+
+    @classmethod
+    def _validate_base(
+        cls, mod: ModelOnDisk, state_dict: dict[str | int, Any], *, had_base_override: bool = False
+    ) -> None:
+        """Confirm this checkpoint belongs to the config's pinned backbone.
+
+        The latent channel count (read from the weights) is authoritative and
+        separates SDXL (4ch) and FLUX.2 (128ch) from the 16ch family. Within the
+        16ch family (FLUX.1 / SD3 / Qwen-Image) we fall back to the filename /
+        directory name, defaulting to FLUX.1 when it is silent.
+
+        ``had_base_override`` is True when the caller explicitly pinned ``base``
+        (e.g. a starter-model install). In the ambiguous 16ch case, a trusted
+        override wins over the FLUX.1 default — necessary because the HF
+        single-file download renames the parent directory, dropping the
+        ``…official_sd3_distill…`` hint that would otherwise identify SD3.
+
+        Raises:
+            NotAMatchError: if the checkpoint does not match this class's backbone.
+        """
+        expected_base = cls.model_fields["base"].default
+        channels = _latent_channels_from_state_dict(state_dict)
+
+        if channels is not None:
+            candidate_bases = _LATENT_CHANNELS_TO_BASES.get(channels)
+            if candidate_bases is None:
+                raise NotAMatchError(
+                    f"PiD checkpoint has {channels} latent channels; no supported backbone uses this "
+                    "(supported: 4 for SDXL, 16 for FLUX.1/SD3/Qwen-Image, 128 for FLUX.2)"
+                )
+            if expected_base not in candidate_bases:
+                raise NotAMatchError(f"latent channels={channels} do not match backbone {expected_base}")
+            if len(candidate_bases) > 1:
+                # Ambiguous 16ch family — disambiguate FLUX.1 / SD3 / Qwen-Image by name.
+                named_base = _backbone_from_filename(_name_for_matching(mod))
+                if named_base in candidate_bases:
+                    if named_base is not expected_base:
+                        raise NotAMatchError(f"name indicates {named_base}, not {expected_base}")
+                elif had_base_override:
+                    # Name is silent, but the caller explicitly pinned this base → trust it.
+                    return
+                elif expected_base is not BaseModelType.Flux:
+                    # Name gives no usable hint and no override → default the family to FLUX.1.
+                    raise NotAMatchError("ambiguous 16-channel PiD checkpoint; defaulting to FLUX.1")
+            # The channel count settled the match; skip the filename-only fallback below.
+            return
+
+        # No diagnostic weight (unexpected) → fall back to filename-only matching.
+        inferred_base = _backbone_from_filename(_name_for_matching(mod))
+        if inferred_base is None:
+            raise NotAMatchError(
+                "cannot determine PiD decoder backbone from weights or filename "
+                "(expected one of: flux, flux2, sdxl, qwen-image, sd3)"
+            )
+        if inferred_base is not expected_base:
+            raise NotAMatchError(f"backbone is {inferred_base}, not {expected_base}")
+
+
+class PiDDecoder_Checkpoint_FLUX_Config(PiDDecoder_Checkpoint_Config_Base, Config_Base):
+ """PiD decoder for the FLUX.1 backbone (16-channel latent).
+
+ FLUX.1 is the default for a 16-channel checkpoint whose file / directory name
+ gives no backbone hint - see ``_validate_base``.
+ """
+
+ base: Literal[BaseModelType.Flux] = Field(default=BaseModelType.Flux)
+ variant: PiDDecoderVariantType = Field(description="Resolution preset of the PiD decoder checkpoint.")
+
+
+class PiDDecoder_Checkpoint_Flux2_Config(PiDDecoder_Checkpoint_Config_Base, Config_Base):
+ """PiD decoder for the FLUX.2 backbone (128-channel latent).
+
+ FLUX.2 is the only backbone mapped to 128 latent channels, so when the latent
+ projection weight is present the name is not consulted - see ``_validate_base``.
+ """
+
+ base: Literal[BaseModelType.Flux2] = Field(default=BaseModelType.Flux2)
+ variant: PiDDecoderVariantType = Field(description="Resolution preset of the PiD decoder checkpoint.")
+
+
+class PiDDecoder_Checkpoint_SD3_Config(PiDDecoder_Checkpoint_Config_Base, Config_Base):
+ """PiD decoder for the Stable Diffusion 3 backbone (16-channel latent).
+
+ Shares the 16-channel latent shape with FLUX.1 and Qwen-Image, so a checkpoint
+ is matched here only when its file / directory name indicates SD3, or when the
+ name is silent and the caller pinned ``base`` explicitly - see ``_validate_base``.
+ """
+
+ base: Literal[BaseModelType.StableDiffusion3] = Field(default=BaseModelType.StableDiffusion3)
+ variant: PiDDecoderVariantType = Field(description="Resolution preset of the PiD decoder checkpoint.")
+
+
+class PiDDecoder_Checkpoint_SDXL_Config(PiDDecoder_Checkpoint_Config_Base, Config_Base):
+ """PiD decoder for the SDXL backbone (4-channel latent).
+
+ SDXL is the only backbone mapped to 4 latent channels, so when the latent
+ projection weight is present the name is not consulted - see ``_validate_base``.
+ """
+
+ base: Literal[BaseModelType.StableDiffusionXL] = Field(default=BaseModelType.StableDiffusionXL)
+ variant: PiDDecoderVariantType = Field(description="Resolution preset of the PiD decoder checkpoint.")
+
+
+class PiDDecoder_Checkpoint_QwenImage_Config(PiDDecoder_Checkpoint_Config_Base, Config_Base):
+ """PiD decoder for the Qwen-Image backbone (16-channel latent).
+
+ Shares the 16-channel latent shape with FLUX.1 and SD3, so it relies on the same
+ filename / directory-name disambiguation (or a trusted explicit ``base`` override)
+ as SD3 - see ``_validate_base``. A 16-channel checkpoint with neither a name hint
+ nor an override is treated as FLUX.1 and will not match this class.
+ """
+
+ base: Literal[BaseModelType.QwenImage] = Field(default=BaseModelType.QwenImage)
+ variant: PiDDecoderVariantType = Field(description="Resolution preset of the PiD decoder checkpoint.")
diff --git a/invokeai/backend/model_manager/configs/text_llm.py b/invokeai/backend/model_manager/configs/text_llm.py
index a0fb3e009f9..edac40ea57a 100644
--- a/invokeai/backend/model_manager/configs/text_llm.py
+++ b/invokeai/backend/model_manager/configs/text_llm.py
@@ -41,6 +41,14 @@ def from_model_on_disk(cls, mod: ModelOnDisk, override_fields: dict[str, Any]) -
if not class_name.endswith("ForCausalLM"):
raise NotAMatchError(f"model architecture '{class_name}' is not a causal language model")
+ # Defer to specialised text-encoder configs for models that have a
+ # dedicated wrapper. Without this both configs match the same
+ # directory and the user ends up with a `text_llm` entry even though
+ # a more specific type exists.
+ _SPECIALISED_CAUSAL_LM_ARCHITECTURES = {"Gemma2ForCausalLM"}
+ if class_name in _SPECIALISED_CAUSAL_LM_ARCHITECTURES:
+ raise NotAMatchError(f"architecture '{class_name}' is handled by a dedicated encoder config, not TextLLM")
+
# Verify tokenizer files exist to avoid runtime failures
tokenizer_files = {"tokenizer.json", "tokenizer.model", "tokenizer_config.json"}
if not any((mod.path / f).exists() for f in tokenizer_files):
diff --git a/invokeai/backend/model_manager/load/model_cache/model_cache.py b/invokeai/backend/model_manager/load/model_cache/model_cache.py
index e3a0928e52b..7808104a047 100644
--- a/invokeai/backend/model_manager/load/model_cache/model_cache.py
+++ b/invokeai/backend/model_manager/load/model_cache/model_cache.py
@@ -929,3 +929,23 @@ def drop_model(self, model_key: str) -> int:
gc.collect()
TorchDevice.empty_cache()
return len(dropped)
+
+ def offload_model_from_vram(self, model_key: str) -> int:
+ """Move a model (and its submodels) from VRAM to RAM without dropping it from the cache.
+
+ Unlike `drop_model`, the cache entry is kept, so the model stays resident in RAM and the next load does
+ not have to rebuild it from disk - only re-stream its weights back to VRAM. This is useful for freeing
+ VRAM after a one-shot use (e.g. a text encoder that has already produced its embeddings) before a much
+ larger model loads. Locked (in-use) entries are skipped.
+
+ Args:
+ model_key: Key of the model to offload. Cache entries whose key equals it, or starts with
+ "<model_key>:", are offloaded.
+
+ Returns the number of VRAM bytes freed.
+ """
+ # NOTE(review): no lock is taken here - confirm whether this needs the same guard as the cache's other
+ # public methods when invoked while another thread is loading models.
+ prefix = f"{model_key}:"
+ bytes_freed = 0
+ # Iterate over a snapshot of the cache items.
+ for key, entry in list(self._cached_models.items()):
+ if (key == model_key or key.startswith(prefix)) and not entry.is_locked:
+ # Ask for the entry's full size to be moved so nothing of it is left in VRAM.
+ bytes_freed += self._move_model_to_ram(entry, entry.cached_model.total_bytes())
+ if bytes_freed > 0:
+ gc.collect()
+ TorchDevice.empty_cache()
+ return bytes_freed
diff --git a/invokeai/backend/model_manager/load/model_loaders/gemma2_encoder.py b/invokeai/backend/model_manager/load/model_loaders/gemma2_encoder.py
new file mode 100644
index 00000000000..b9db92b31a6
--- /dev/null
+++ b/invokeai/backend/model_manager/load/model_loaders/gemma2_encoder.py
@@ -0,0 +1,61 @@
+"""Loader for the Gemma-2 text encoder used by PiD.
+
+PiD only consumes the decoder block of the causal LM (see
+`pid/_src/models/pixeldit_model.py::_load_text_encoder`:
+`AutoModelForCausalLM.from_pretrained(...).get_decoder()`), so this loader
+returns the decoder sub-module for the `TextEncoder` submodel and the
+tokenizer for the `Tokenizer` submodel.
+"""
+
+from pathlib import Path
+from typing import Optional
+
+from transformers import AutoModelForCausalLM, AutoTokenizer
+
+from invokeai.backend.model_manager.configs.factory import AnyModelConfig
+from invokeai.backend.model_manager.configs.gemma2_encoder import Gemma2Encoder_Gemma2Encoder_Config
+from invokeai.backend.model_manager.load.load_default import ModelLoader
+from invokeai.backend.model_manager.load.model_loader_registry import ModelLoaderRegistry
+from invokeai.backend.model_manager.taxonomy import AnyModel, BaseModelType, ModelFormat, ModelType, SubModelType
+from invokeai.backend.util.devices import TorchDevice
+
+
+@ModelLoaderRegistry.register(base=BaseModelType.Any, type=ModelType.Gemma2Encoder, format=ModelFormat.Gemma2Encoder)
+class Gemma2EncoderLoader(ModelLoader):
+ """Loads a Gemma-2 causal LM directory and exposes its decoder + tokenizer."""
+
+ def _load_model(
+ self,
+ config: AnyModelConfig,
+ submodel_type: Optional[SubModelType] = None,
+ ) -> AnyModel:
+ """Load one submodel of the Gemma-2 directory at ``config.path``.
+
+ ``SubModelType.Tokenizer`` returns the tokenizer; ``SubModelType.TextEncoder``
+ returns the causal LM's inner decoder stack, in eval mode with gradients
+ disabled. Both are read with ``local_files_only=True``.
+
+ Raises:
+ ValueError: if ``config`` is not a ``Gemma2Encoder_Gemma2Encoder_Config`` or
+ ``submodel_type`` is neither Tokenizer nor TextEncoder.
+ """
+ if not isinstance(config, Gemma2Encoder_Gemma2Encoder_Config):
+ raise ValueError("Only Gemma2Encoder_Gemma2Encoder_Config models are supported here.")
+
+ model_path = Path(config.path)
+
+ match submodel_type:
+ case SubModelType.Tokenizer:
+ return AutoTokenizer.from_pretrained(model_path, local_files_only=True)
+ case SubModelType.TextEncoder:
+ # The load dtype is chosen for the target torch device (bfloat16 where that is safe).
+ target_device = TorchDevice.choose_torch_device()
+ model_dtype = TorchDevice.choose_bfloat16_safe_dtype(target_device)
+ causal_lm = AutoModelForCausalLM.from_pretrained(
+ model_path,
+ torch_dtype=model_dtype,
+ low_cpu_mem_usage=True,
+ local_files_only=True,
+ )
+ # PiD only ever uses the decoder block — the transformer stack
+ # without the LM head. Upstream calls `.get_decoder()`, but
+ # transformers 4.56 returns None for Gemma2, so we reach for
+ # `.model` (the underlying Gemma2Model) directly and let the
+ # rest of `causal_lm` (lm_head etc.) be garbage-collected.
+ inner = getattr(causal_lm, "get_decoder", lambda: None)() or causal_lm.model
+ inner.eval()
+ inner.requires_grad_(False)
+ return inner
+
+ raise ValueError(
+ f"Unsupported submodel type for Gemma2 encoder: {submodel_type!r}. Expected Tokenizer or TextEncoder."
+ )
diff --git a/invokeai/backend/model_manager/load/model_loaders/pid_decoder.py b/invokeai/backend/model_manager/load/model_loaders/pid_decoder.py
new file mode 100644
index 00000000000..0a91c27ee4f
--- /dev/null
+++ b/invokeai/backend/model_manager/load/model_loaders/pid_decoder.py
@@ -0,0 +1,104 @@
+"""Loader for PiD (Pixel Diffusion Decoder) checkpoints.
+
+Returns a fully-constructed `PidNet` so the model cache can size it
+correctly and apply its standard sequential-offload / partial-load
+policies. We instantiate the architecture (per backbone) here and pour the
+checkpoint's tensors directly into it, then discard the intermediate state
+dict — avoiding the 2x host-RAM peak you would get from holding both a `dict`
+and the live module at the same time.
+"""
+
+from pathlib import Path
+from typing import Optional
+
+import torch
+from safetensors.torch import load_file as safetensors_load_file
+
+from invokeai.backend.model_manager.configs.factory import AnyModelConfig
+from invokeai.backend.model_manager.load.load_default import ModelLoader
+from invokeai.backend.model_manager.load.model_loader_registry import ModelLoaderRegistry
+from invokeai.backend.model_manager.taxonomy import AnyModel, BaseModelType, ModelFormat, ModelType, SubModelType
+from invokeai.backend.pid.decode import load_pid_decoder
+
+# NVIDIA's official PiD `.pth` checkpoints store the student under the `net.`
+# prefix (see `PidDistillModel.state_dict(prefix="net.")` in the vendored
+# upstream). We strip it on load so PidNet.load_state_dict() can consume the
+# dict directly.
+_NET_PREFIX = "net."
+
+
+def _load_raw_checkpoint(path: Path) -> dict[str, torch.Tensor]:
+    """Read a PiD checkpoint file and return its tensor dictionary.
+
+    The loader is chosen from the (case-insensitive) file extension:
+    `.safetensors` goes through safetensors, while `.pth`/`.pt`/`.ckpt`/`.bin`
+    go through `torch.load` on CPU with `weights_only=True`. If the torch
+    payload is a dict wrapping another dict under `"state_dict"`, the inner
+    dict is returned.
+
+    Raises:
+        ValueError: If the extension is none of the above.
+    """
+    extension = path.suffix.lower()
+
+    if extension == ".safetensors":
+        return safetensors_load_file(str(path))
+
+    if extension not in {".pth", ".pt", ".ckpt", ".bin"}:
+        raise ValueError(f"Unrecognised PiD decoder checkpoint extension: {extension!r}")
+
+    # NVIDIA's PiD `.pth` checkpoints are plain tensor dicts (verified against
+    # the released res2k_sr4x_official_flux checkpoint).
+    loaded = torch.load(str(path), map_location="cpu", weights_only=True)
+    if isinstance(loaded, dict):
+        nested = loaded.get("state_dict")
+        if isinstance(nested, dict):
+            return nested  # type: ignore[return-value]
+    return loaded  # type: ignore[return-value]
+
+
+def _strip_net_prefix(state_dict: dict[str, torch.Tensor]) -> dict[str, torch.Tensor]:
+    """Return `state_dict` with the `net.` prefix removed from its keys.
+
+    If no string key starts with the prefix, the input dict itself is returned
+    unchanged. Otherwise a new dict is built, in the original key order, where
+    `net.`-prefixed keys lose the prefix, keys under `net_ema.`,
+    `fake_score.` or `discriminator.` are dropped, and every other key
+    (including non-string keys) is copied through as-is.
+    """
+    has_net_keys = False
+    for key in state_dict:
+        if isinstance(key, str) and key.startswith(_NET_PREFIX):
+            has_net_keys = True
+            break
+    if not has_net_keys:
+        return state_dict
+
+    discarded_prefixes = ("net_ema.", "fake_score.", "discriminator.")
+    stripped: dict[str, torch.Tensor] = {}
+    for key, tensor in state_dict.items():
+        if not isinstance(key, str):
+            stripped[key] = tensor
+        elif key.startswith(_NET_PREFIX):
+            stripped[key[len(_NET_PREFIX) :]] = tensor
+        elif not key.startswith(discarded_prefixes):
+            stripped[key] = tensor
+    return stripped
+
+
+@ModelLoaderRegistry.register(base=BaseModelType.Flux, type=ModelType.PiDDecoder, format=ModelFormat.Checkpoint)
+@ModelLoaderRegistry.register(base=BaseModelType.Flux2, type=ModelType.PiDDecoder, format=ModelFormat.Checkpoint)
+@ModelLoaderRegistry.register(
+    base=BaseModelType.StableDiffusion3, type=ModelType.PiDDecoder, format=ModelFormat.Checkpoint
+)
+@ModelLoaderRegistry.register(
+    base=BaseModelType.StableDiffusionXL, type=ModelType.PiDDecoder, format=ModelFormat.Checkpoint
+)
+@ModelLoaderRegistry.register(base=BaseModelType.QwenImage, type=ModelType.PiDDecoder, format=ModelFormat.Checkpoint)
+class PiDDecoderLoader(ModelLoader):
+    """Turns a PiD checkpoint file into a ready-to-use PidNet for the config's backbone."""
+
+    def _load_model(
+        self,
+        config: AnyModelConfig,
+        submodel_type: Optional[SubModelType] = None,
+    ) -> AnyModel:
+        """Load the checkpoint at ``config.path`` into a PidNet.
+
+        The backbone is taken from ``config.base`` (populated by
+        PiDDecoder_Checkpoint_*_Config when the user added the model).
+
+        No dtype cast is applied after loading. The original author notes
+        this is deliberate: PiD consumes Gemma-2 hidden states with large
+        outliers (per-token max well past 100), and the in-network RMSNorm
+        (`variance = hidden_states.pow(2).mean(-1, keepdim=True)`) loses so
+        much precision in bf16 that outputs become all-NaN. The decode
+        wrapper runs the forward pass under `torch.autocast(bf16)`, so the
+        bulk of the matmuls still execute in bf16 and only the
+        precision-critical reductions stay fp32. This roughly doubles weight
+        VRAM (~5 GB instead of ~2.5 GB) but is the only configuration
+        measured to be numerically stable end-to-end.
+
+        Raises:
+            ValueError: If any submodel is requested; the decoder has none.
+        """
+        if submodel_type is not None:
+            raise ValueError("Unexpected submodel requested for PiD decoder.")
+
+        backbone: BaseModelType = config.base
+
+        checkpoint_path = Path(config.path)
+        state_dict = _strip_net_prefix(_load_raw_checkpoint(checkpoint_path))
+
+        # Build the live PidNet on CPU from the checkpoint, then drop our
+        # reference to the dict so two copies are not held in RAM at once.
+        decoder = load_pid_decoder(state_dict, backbone)
+        del state_dict
+
+        decoder.eval()
+        decoder.requires_grad_(False)
+        return decoder
diff --git a/invokeai/backend/model_manager/starter_models.py b/invokeai/backend/model_manager/starter_models.py
index 9bc58e44269..4ca1a275bb6 100644
--- a/invokeai/backend/model_manager/starter_models.py
+++ b/invokeai/backend/model_manager/starter_models.py
@@ -14,6 +14,7 @@
BaseModelType,
ModelFormat,
ModelType,
+ PiDDecoderVariantType,
QwenImageVariantType,
)
@@ -128,6 +129,116 @@ class StarterModelBundle(BaseModel):
# endregion
+# region PiD (Pixel Diffusion Decoder)
+# PiD's pretrained decoders condition on Gemma-2-2b-it caption embeddings (2304-dim). NVIDIA references the ungated
+# mirror Efficient-Large-Model/gemma-2-2b-it. It is shared across all PiD backbones, so it is a dependency of each
+# decoder below (and offered standalone here so it can be installed once).
+gemma2_2b_encoder = StarterModel(
+ name="Gemma 2 2B (PiD caption encoder)",
+ base=BaseModelType.Any,
+ source="Efficient-Large-Model/gemma-2-2b-it",
+ description="Gemma-2-2b-it text encoder that PiD uses to condition its diffusion decode on a caption. ~5GB",
+ type=ModelType.Gemma2Encoder,
+ format=ModelFormat.Gemma2Encoder,
+)
+
+# NVIDIA PiD decoders (https://huggingface.co/nvidia/PiD). Code is Apache-2.0; weights are NSCLv1 (non-commercial /
+# research). Each is a 4x super-resolution decoder that replaces the regular VAE decode and needs the Gemma-2 encoder.
+# Preset coverage in this catalogue: FLUX.1, FLUX.2 and SD3 each get both the 2K and the 2K-to-4K decoder; SDXL and
+# Qwen-Image get only the 2K-to-4K decoder. Every decoder lists `gemma2_2b_encoder` in `dependencies`.
+# NOTE(review): `source` looks like `<HF repo id>::<path inside the repo>` — confirm against the installer's parser.
+pid_decoder_flux_2k = StarterModel(
+ name="PiD Decoder FLUX (2K)",
+ base=BaseModelType.Flux,
+ source="nvidia/PiD::checkpoints/PiD_res2k_sr4x_official_flux_distill_4step/model_ema_bf16.pth",
+ description="NVIDIA PiD 4x super-resolution decoder for FLUX latents, 2K target preset (e.g. 512 -> 2048). ~5GB",
+ type=ModelType.PiDDecoder,
+ format=ModelFormat.Checkpoint,
+ variant=PiDDecoderVariantType.Res2k_Sr4x,
+ dependencies=[gemma2_2b_encoder],
+)
+pid_decoder_flux_2kto4k = StarterModel(
+ name="PiD Decoder FLUX (2K to 4K)",
+ base=BaseModelType.Flux,
+ source="nvidia/PiD::checkpoints/PiD_res2kto4k_sr4x_official_flux_distill_4step/model_ema_bf16.pth",
+ description="NVIDIA PiD 4x super-resolution decoder for FLUX latents, 2K-to-4K preset for higher-resolution output. ~5GB",
+ type=ModelType.PiDDecoder,
+ format=ModelFormat.Checkpoint,
+ variant=PiDDecoderVariantType.Res2kTo4k_Sr4x,
+ dependencies=[gemma2_2b_encoder],
+)
+# FLUX.2 Klein shares one 32-channel VAE across the 4B and 9B variants, so a single decoder per preset covers both.
+# The 128-channel packed latent is unambiguous (unlike the 16ch FLUX/SD3 case), so no directory-name disambiguation
+# is needed for the config probe.
+pid_decoder_flux2_2k = StarterModel(
+ name="PiD Decoder FLUX.2 (2K)",
+ base=BaseModelType.Flux2,
+ source="nvidia/PiD::checkpoints/PiD_res2k_sr4x_official_flux2_distill_4step/model_ema_bf16.pth",
+ description="NVIDIA PiD 4x super-resolution decoder for FLUX.2 Klein latents, 2K target preset (e.g. 512 -> 2048). ~5GB",
+ type=ModelType.PiDDecoder,
+ format=ModelFormat.Checkpoint,
+ variant=PiDDecoderVariantType.Res2k_Sr4x,
+ dependencies=[gemma2_2b_encoder],
+)
+pid_decoder_flux2_2kto4k = StarterModel(
+ name="PiD Decoder FLUX.2 (2K to 4K)",
+ base=BaseModelType.Flux2,
+ source="nvidia/PiD::checkpoints/PiD_res2kto4k_sr4x_official_flux2_distill_4step/model_ema_bf16.pth",
+ description="NVIDIA PiD 4x super-resolution decoder for FLUX.2 Klein latents, 2K-to-4K preset for higher-resolution output. ~5GB",
+ type=ModelType.PiDDecoder,
+ format=ModelFormat.Checkpoint,
+ variant=PiDDecoderVariantType.Res2kTo4k_Sr4x,
+ dependencies=[gemma2_2b_encoder],
+)
+# SD3 uses a 16-channel latent, architecturally identical to FLUX.1. The config probe disambiguates via the
+# checkpoint's directory name (`…official_sd3_distill…`); if the HF single-file download drops that name, the
+# explicit base=StableDiffusion3 override the installer sends is trusted instead (see pid_decoder.py::_validate_base).
+pid_decoder_sd3_2k = StarterModel(
+ name="PiD Decoder SD3 (2K)",
+ base=BaseModelType.StableDiffusion3,
+ source="nvidia/PiD::checkpoints/PiD_res2k_sr4x_official_sd3_distill_4step/model_ema_bf16.pth",
+ description="NVIDIA PiD 4x super-resolution decoder for SD3 latents, 2K target preset (e.g. 512 -> 2048). ~5GB",
+ type=ModelType.PiDDecoder,
+ format=ModelFormat.Checkpoint,
+ variant=PiDDecoderVariantType.Res2k_Sr4x,
+ dependencies=[gemma2_2b_encoder],
+)
+pid_decoder_sd3_2kto4k = StarterModel(
+ name="PiD Decoder SD3 (2K to 4K)",
+ base=BaseModelType.StableDiffusion3,
+ source="nvidia/PiD::checkpoints/PiD_res2kto4k_sr4x_official_sd3_distill_4step/model_ema_bf16.pth",
+ description="NVIDIA PiD 4x super-resolution decoder for SD3 latents, 2K-to-4K preset for higher-resolution output. ~5GB",
+ type=ModelType.PiDDecoder,
+ format=ModelFormat.Checkpoint,
+ variant=PiDDecoderVariantType.Res2kTo4k_Sr4x,
+ dependencies=[gemma2_2b_encoder],
+)
+# SDXL uses a 4-channel latent, which is unambiguous (no FLUX/SD3-style directory-name disambiguation needed).
+# NVIDIA ships only the 2K-to-4K preset for SDXL (no plain 2K checkpoint).
+pid_decoder_sdxl_2kto4k = StarterModel(
+ name="PiD Decoder SDXL (2K to 4K)",
+ base=BaseModelType.StableDiffusionXL,
+ source="nvidia/PiD::checkpoints/PiD_res2kto4k_sr4x_official_sdxl_distill_4step/model_ema_bf16.pth",
+ description="NVIDIA PiD 4x super-resolution decoder for SDXL latents, 2K-to-4K preset. ~5GB",
+ type=ModelType.PiDDecoder,
+ format=ModelFormat.Checkpoint,
+ variant=PiDDecoderVariantType.Res2kTo4k_Sr4x,
+ dependencies=[gemma2_2b_encoder],
+)
+# Qwen-Image uses a 16-channel latent (ambiguous with FLUX/SD3). The config probe disambiguates via the checkpoint's
+# directory name (`…official_qwenimage_distill…`); if the HF single-file download drops it, the explicit
+# base=QwenImage override the installer sends is trusted instead (see pid_decoder.py::_validate_base). Only the
+# 2K-to-4K preset exists.
+pid_decoder_qwenimage_2kto4k = StarterModel(
+ name="PiD Decoder Qwen-Image (2K to 4K)",
+ base=BaseModelType.QwenImage,
+ source="nvidia/PiD::checkpoints/PiD_res2kto4k_sr4x_official_qwenimage_distill_4step/model_ema_bf16.pth",
+ description="NVIDIA PiD 4x super-resolution decoder for Qwen-Image latents, 2K-to-4K preset. ~5GB",
+ type=ModelType.PiDDecoder,
+ format=ModelFormat.Checkpoint,
+ variant=PiDDecoderVariantType.Res2kTo4k_Sr4x,
+ dependencies=[gemma2_2b_encoder],
+)
+# endregion
+
+
# region: Main
flux_schnell_quantized = StarterModel(
name="FLUX.1 schnell (quantized)",
@@ -1710,6 +1821,15 @@ def _gemini_3_resolution_presets(
anima_base,
anima_qwen3_encoder,
anima_vae,
+ gemma2_2b_encoder,
+ pid_decoder_flux_2k,
+ pid_decoder_flux_2kto4k,
+ pid_decoder_flux2_2k,
+ pid_decoder_flux2_2kto4k,
+ pid_decoder_sd3_2k,
+ pid_decoder_sd3_2kto4k,
+ pid_decoder_sdxl_2kto4k,
+ pid_decoder_qwenimage_2kto4k,
]
sd1_bundle: list[StarterModel] = [
diff --git a/invokeai/backend/model_manager/taxonomy.py b/invokeai/backend/model_manager/taxonomy.py
index a2e4e58bdc4..0bb2eb2bf33 100644
--- a/invokeai/backend/model_manager/taxonomy.py
+++ b/invokeai/backend/model_manager/taxonomy.py
@@ -79,12 +79,14 @@ class ModelType(str, Enum):
T5Encoder = "t5_encoder"
Qwen3Encoder = "qwen3_encoder"
QwenVLEncoder = "qwen_vl_encoder"
+ Gemma2Encoder = "gemma2_encoder"
SpandrelImageToImage = "spandrel_image_to_image"
SigLIP = "siglip"
FluxRedux = "flux_redux"
LlavaOnevision = "llava_onevision"
TextLLM = "text_llm"
ExternalImageGenerator = "external_image_generator"
+ PiDDecoder = "pid_decoder"
Unknown = "unknown"
@@ -178,6 +180,23 @@ class Qwen3VariantType(str, Enum):
"""Qwen3 0.6B text encoder (hidden_size=1024). Used by Anima."""
+class PiDDecoderVariantType(str, Enum):
+ """Resolution presets of the PiD (Pixel Diffusion Decoder) checkpoints distributed by NVIDIA.
+
+ The two presets differ only in target output resolution; the underlying
+ network is the same. NVIDIA's checkpoint directory names encode the
+ preset, e.g. `PiD_res2k_sr4x_official_flux_distill_4step` vs
+ `PiD_res2kto4k_sr4x_official_flux_distill_4step`.
+
+ Not every backbone ships both presets: in the starter-model catalogue
+ FLUX.1, FLUX.2 and SD3 have both, while SDXL and Qwen-Image only have the
+ 2K-to-4K one.
+ """
+
+ Res2k_Sr4x = "res2k_sr4x"
+ """Standard 2K target preset (decodes to ~2K via 4x super-resolution)."""
+
+ Res2kTo4k_Sr4x = "res2kto4k_sr4x"
+ """Upsampling preset (designed for chaining to push ~2K inputs to ~4K)."""
+
+
class ModelFormat(str, Enum):
"""Storage format of model."""
@@ -193,6 +212,7 @@ class ModelFormat(str, Enum):
T5Encoder = "t5_encoder"
Qwen3Encoder = "qwen3_encoder"
QwenVLEncoder = "qwen_vl_encoder"
+ Gemma2Encoder = "gemma2_encoder"
BnbQuantizedLlmInt8b = "bnb_quantized_int8b"
BnbQuantizednf4b = "bnb_quantized_nf4b"
GGUFQuantized = "gguf_quantized"
@@ -249,6 +269,7 @@ class FluxLoRAFormat(str, Enum):
ZImageVariantType,
QwenImageVariantType,
Qwen3VariantType,
+ PiDDecoderVariantType,
]
variant_type_adapter = TypeAdapter[
ModelVariantType
@@ -258,6 +279,7 @@ class FluxLoRAFormat(str, Enum):
| ZImageVariantType
| QwenImageVariantType
| Qwen3VariantType
+ | PiDDecoderVariantType
](
ModelVariantType
| ClipVariantType
@@ -266,4 +288,5 @@ class FluxLoRAFormat(str, Enum):
| ZImageVariantType
| QwenImageVariantType
| Qwen3VariantType
+ | PiDDecoderVariantType
)
diff --git a/invokeai/backend/pid/__init__.py b/invokeai/backend/pid/__init__.py
new file mode 100644
index 00000000000..a247ebb89dd
--- /dev/null
+++ b/invokeai/backend/pid/__init__.py
@@ -0,0 +1,9 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Vendored from https://github.com/nv-tlabs/PiD (Apache-2.0).
+# Original upstream lives at pid/. Files here have been re-rooted to
+# invokeai.backend.pid.* and the configs/, tokenizers/, _demo_*, from_*,
+# checkpointer/, trainer.py, visualize/ subtrees have been excluded.
+#
+# See THIRD_PARTY_LICENSES.md for the full attribution.
diff --git a/invokeai/backend/pid/_ext/__init__.py b/invokeai/backend/pid/_ext/__init__.py
new file mode 100644
index 00000000000..1e792c09005
--- /dev/null
+++ b/invokeai/backend/pid/_ext/__init__.py
@@ -0,0 +1,3 @@
+# Vendored from PiD's _ext/ subtree (https://github.com/nv-tlabs/PiD).
+# Originally copied from cosmos-predict2.5 (https://github.com/nvidia-cosmos/cosmos-predict2.5/).
+# Apache-2.0.
diff --git a/invokeai/backend/pid/_ext/imaginaire/__init__.py b/invokeai/backend/pid/_ext/imaginaire/__init__.py
new file mode 100644
index 00000000000..3159bfe6564
--- /dev/null
+++ b/invokeai/backend/pid/_ext/imaginaire/__init__.py
@@ -0,0 +1,14 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
diff --git a/invokeai/backend/pid/_ext/imaginaire/lazy_config/__init__.py b/invokeai/backend/pid/_ext/imaginaire/lazy_config/__init__.py
new file mode 100644
index 00000000000..fbbe88ede08
--- /dev/null
+++ b/invokeai/backend/pid/_ext/imaginaire/lazy_config/__init__.py
@@ -0,0 +1,14 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Upstream re-exported `LazyDict = omegaconf.DictConfig`; in this vendored
+# subset configs are plain Python mappings, so `LazyDict` aliases the
+# attribute-accessible dict subclass produced by `LazyCall`.
+
+from invokeai.backend.pid._ext.imaginaire.lazy_config.instantiate import instantiate
+from invokeai.backend.pid._ext.imaginaire.lazy_config.lazy import LazyCall, LazyConfig, _LazyCallResult
+
+PLACEHOLDER = None
+LazyDict = _LazyCallResult
+
+__all__ = ["instantiate", "LazyCall", "LazyConfig", "PLACEHOLDER", "LazyDict"]
diff --git a/invokeai/backend/pid/_ext/imaginaire/lazy_config/file_io.py b/invokeai/backend/pid/_ext/imaginaire/lazy_config/file_io.py
new file mode 100644
index 00000000000..0579fe5f56e
--- /dev/null
+++ b/invokeai/backend/pid/_ext/imaginaire/lazy_config/file_io.py
@@ -0,0 +1,58 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Minimal stdlib-based stand-in for the upstream iopath PathManager.
+# Only `open()` on local paths and trivial helpers are supported; the upstream
+# HTTPURLHandler / OneDrivePathHandler paths are not used by the decoder
+# inference subset we vendor.
+
+import io
+import shutil
+from typing import IO, Any
+
+__all__ = ["PathManager", "PathHandler"]
+
+
+class PathHandler:
+    """Placeholder handler type, retained so the upstream API shape still exists."""
+
+    def _open(self, path: str, mode: str = "r", **kwargs: Any) -> IO:
+        """Open the local file `path` in `mode`, forwarding `kwargs` to the builtin opener."""
+        stream = open(path, mode, **kwargs)
+        return stream
+
+
+class _LocalPathManager:
+    """Local-filesystem stand-in for the PathManager methods this vendored subset exposes.
+
+    Every method operates directly on local paths through the standard
+    library; no remote or URL handling is provided.
+    """
+
+    def open(self, path: str, mode: str = "r", **kwargs: Any) -> IO:
+        """Open the local file `path` in `mode`; `kwargs` are forwarded to `io.open`."""
+        return io.open(path, mode, **kwargs)
+
+    def get_local_path(self, path: str, **kwargs: Any) -> str:
+        """Return `path` unchanged (paths are already local); `kwargs` are ignored."""
+        return path
+
+    def exists(self, path: str) -> bool:
+        """Return whether `path` exists."""
+        import os.path
+
+        return os.path.exists(path)
+
+    def isfile(self, path: str) -> bool:
+        """Return whether `path` is an existing regular file."""
+        import os.path
+
+        return os.path.isfile(path)
+
+    def isdir(self, path: str) -> bool:
+        """Return whether `path` is an existing directory."""
+        import os.path
+
+        return os.path.isdir(path)
+
+    def mkdirs(self, path: str) -> None:
+        """Create `path` and any missing parents; an existing directory is not an error."""
+        import os
+
+        os.makedirs(path, exist_ok=True)
+
+    def copy(self, src: str, dst: str, overwrite: bool = False) -> bool:
+        """Copy `src` to `dst` with `shutil.copy`.
+
+        Args:
+            src: Source file path.
+            dst: Destination file path, or an existing directory to copy into.
+            overwrite: Allow replacing a file that already exists at the
+                destination. This flag used to be accepted but ignored, so
+                an existing file was always clobbered.
+
+        Returns:
+            True if the file was copied; False if the destination file
+            already exists and `overwrite` is False.
+        """
+        import logging
+        import os.path
+
+        # `shutil.copy` writes *into* `dst` when it is a directory, so check
+        # the file that would actually be written.
+        target = os.path.join(dst, os.path.basename(src)) if os.path.isdir(dst) else dst
+        if not overwrite and os.path.exists(target):
+            logging.getLogger(__name__).error("Destination file %s already exists.", target)
+            return False
+        shutil.copy(src, dst)
+        return True
+
+    def register_handler(self, handler: "PathHandler") -> None:
+        """Accept and ignore `handler`; kept as a no-op so upstream call sites still work."""
+        pass
+
+
+PathManager = _LocalPathManager()
diff --git a/invokeai/backend/pid/_ext/imaginaire/lazy_config/instantiate.py b/invokeai/backend/pid/_ext/imaginaire/lazy_config/instantiate.py
new file mode 100644
index 00000000000..bdb5b4abb07
--- /dev/null
+++ b/invokeai/backend/pid/_ext/imaginaire/lazy_config/instantiate.py
@@ -0,0 +1,72 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Stdlib-only `instantiate()`. The upstream module also handled
+# omegaconf.DictConfig / ListConfig structured configs and OmegaConf.to_object
+# round-trips. In the vendored decoder-inference subset all configs are
+# constructed as plain Python mappings (see invokeai/backend/pid/decode.py),
+# so the omegaconf paths are not required.
+
+import collections.abc as abc
+import dataclasses
+import logging
+from typing import Any
+
+import attrs
+
+from invokeai.backend.pid._ext.imaginaire.lazy_config.registry import _convert_target_to_string, locate
+
+__all__ = ["dump_dataclass", "instantiate"]
+
+
+def is_dataclass_or_attrs(target: Any) -> bool:
+    """Return True if `target` is recognised by `dataclasses.is_dataclass` or by `attrs.has`."""
+    if dataclasses.is_dataclass(target):
+        return True
+    return attrs.has(target)
+
+
+def dump_dataclass(obj: Any) -> dict:
+    """Convert a dataclass instance into a `_target_`-tagged dict, recursively.
+
+    The result maps `_target_` to the string form of the instance's type and
+    each field name to its value. Field values that are dataclasses are
+    dumped the same way; lists and tuples become lists whose dataclass
+    elements are dumped (other elements are kept as they are).
+
+    Raises:
+        AssertionError: If `obj` is not a dataclass instance.
+    """
+    assert dataclasses.is_dataclass(obj) and not isinstance(obj, type), (
+        "dump_dataclass() requires an instance of a dataclass."
+    )
+
+    def _encode(value: Any) -> Any:
+        # Dataclass values are dumped whole; sequences are walked one level.
+        if dataclasses.is_dataclass(value):
+            return dump_dataclass(value)
+        if isinstance(value, (list, tuple)):
+            return [dump_dataclass(item) if dataclasses.is_dataclass(item) else item for item in value]
+        return value
+
+    dumped: dict = {"_target_": _convert_target_to_string(type(obj))}
+    for field in dataclasses.fields(obj):
+        dumped[field.name] = _encode(getattr(obj, field.name))
+    return dumped
+
+
+def instantiate(cfg: Any, *args: Any, **kwargs: Any) -> Any:
+    """Build the object described by a `_target_` config.
+
+    * A list is mapped element-wise through `instantiate`.
+    * A Mapping containing `_target_` is called: the target (a callable, or a
+      dotted string resolved with `locate`) receives `*args` plus the
+      mapping's remaining entries as keyword arguments, with `**kwargs`
+      taking precedence. Unless `_recursive_` is falsy, the mapping's values
+      are instantiated first.
+    * Anything else is returned unchanged.
+
+    A `TypeError` raised by the target call is logged and re-raised.
+    """
+    if isinstance(cfg, list):
+        return [instantiate(item) for item in cfg]
+
+    if not (isinstance(cfg, abc.Mapping) and "_target_" in cfg):
+        return cfg
+
+    if bool(cfg.get("_recursive_", True)):
+        params = {key: instantiate(value) for key, value in cfg.items()}
+    else:
+        params = dict(cfg)
+    params.pop("_recursive_", None)
+    target = params.pop("_target_")
+
+    if isinstance(target, str):
+        target_name = target
+        target = locate(target_name)
+        assert target is not None, target_name
+    else:
+        target_name = getattr(target, "__qualname__", str(target))
+    assert callable(target), f"_target_ {target_name} does not define a callable object"
+
+    try:
+        return target(*args, **{**params, **kwargs})
+    except TypeError:
+        logging.getLogger(__name__).error("Error when instantiating %s!", target_name)
+        raise
diff --git a/invokeai/backend/pid/_ext/imaginaire/lazy_config/lazy.py b/invokeai/backend/pid/_ext/imaginaire/lazy_config/lazy.py
new file mode 100644
index 00000000000..65069589396
--- /dev/null
+++ b/invokeai/backend/pid/_ext/imaginaire/lazy_config/lazy.py
@@ -0,0 +1,52 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Minimal LazyCall / LazyConfig stub. The upstream module supports file-based
+# config save/load via yaml + cloudpickle + dill + detectron2 helpers; the
+# vendored decoder-inference subset only needs `LazyCall(cls)(**kwargs)` as a
+# convenient producer of `{_target_: "cls.fqn", **kwargs}` dicts that
+# `instantiate()` can resolve.
+
+from typing import Any
+
+from invokeai.backend.pid._ext.imaginaire.lazy_config.registry import _convert_target_to_string
+
+__all__ = ["LazyCall", "LazyConfig"]
+
+
+class _LazyCallResult(dict):
+    """Dict whose items can also be read and written as attributes.
+
+    Attribute reads that normal lookup cannot satisfy fall back to item
+    lookup, and every attribute assignment is stored as an item. That is
+    enough DictConfig-like behaviour for the vendored subset.
+    """
+
+    def __getattr__(self, key: str) -> Any:
+        # Only reached when regular attribute lookup fails.
+        try:
+            value = self[key]
+        except KeyError as missing:
+            raise AttributeError(key) from missing
+        return value
+
+    # `obj.name = value` stores an item rather than an instance attribute.
+    __setattr__ = dict.__setitem__
+
+
+class LazyCall:
+    """`LazyCall(cls)(**kwargs)` -> `{"_target_": <target string>, **kwargs}`.
+
+    The target may be given as an object or directly as a dotted string;
+    objects are converted to their string form when the call is made.
+    """
+
+    def __init__(self, target: Any) -> None:
+        self._target = target
+
+    def __call__(self, **kwargs: Any) -> _LazyCallResult:
+        """Return the config mapping for the wrapped target and `kwargs`."""
+        target = self._target
+        if not isinstance(target, str):
+            target = _convert_target_to_string(target)
+        return _LazyCallResult(_target_=target, **kwargs)
+
+
+class LazyConfig:
+    """Stub for the upstream file-based config helpers.
+
+    Loading and saving configs is not part of the inference subset, so both
+    entry points exist only to fail loudly.
+    """
+
+    @staticmethod
+    def load(*_args: Any, **_kwargs: Any) -> Any:
+        """Always raises `NotImplementedError`; arguments are ignored."""
+        raise NotImplementedError("LazyConfig.load is not supported in the vendored PiD inference subset.")
+
+    @staticmethod
+    def save(*_args: Any, **_kwargs: Any) -> Any:
+        """Always raises `NotImplementedError`; arguments are ignored."""
+        raise NotImplementedError("LazyConfig.save is not supported in the vendored PiD inference subset.")
diff --git a/invokeai/backend/pid/_ext/imaginaire/lazy_config/registry.py b/invokeai/backend/pid/_ext/imaginaire/lazy_config/registry.py
new file mode 100644
index 00000000000..73d8bc973a2
--- /dev/null
+++ b/invokeai/backend/pid/_ext/imaginaire/lazy_config/registry.py
@@ -0,0 +1,117 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import pydoc
+from typing import Any
+
+
+class Registry:
+    """Small name -> object registry, standing in for fvcore.common.registry.Registry.
+
+    Supports registration (direct call or decorator), lookup by name,
+    membership tests, and iteration over `(name, object)` pairs.
+    """
+
+    def __init__(self, name: str) -> None:
+        self._name = name
+        self._obj_map: dict[str, Any] = {}
+
+    def register(self, obj: Any = None, *, name: str | None = None) -> Any:
+        """Register `obj` under `name`, defaulting to `obj.__name__`.
+
+        Called with an object, registers it and returns it. Called without
+        one, returns a decorator that does the same to whatever it wraps.
+
+        Raises:
+            KeyError: If the name is already taken.
+        """
+
+        def _add(target: Any) -> Any:
+            self._do_register(name or target.__name__, target)
+            return target
+
+        return _add if obj is None else _add(obj)
+
+    def _do_register(self, name: str, obj: Any) -> None:
+        """Store `obj` under `name`, refusing duplicates with a `KeyError`."""
+        if name in self._obj_map:
+            raise KeyError(f"{name} already registered in {self._name}")
+        self._obj_map[name] = obj
+
+    def get(self, name: str) -> Any:
+        """Return the object registered as `name`; raise `KeyError` if there is none."""
+        if name in self._obj_map:
+            return self._obj_map[name]
+        raise KeyError(f"{name} not found in {self._name}")
+
+    def __contains__(self, name: str) -> bool:
+        return name in self._obj_map
+
+    def __iter__(self):
+        # Yields (name, object) pairs, not just names.
+        return iter(self._obj_map.items())
+
+
+"""
+``Registry`` and `locate` provide ways to map a string (typically found
+in config files) to callable objects.
+"""
+
+__all__ = ["Registry", "locate"]
+
+
+def _convert_target_to_string(t: Any) -> str:
+    """
+    Inverse of ``locate()``: produce a dotted string that names ``t``.
+
+    Args:
+        t: any object with ``__module__`` and ``__qualname__``
+
+    Returns:
+        The shortest ``<module prefix>.<qualname>`` that ``locate()`` resolves
+        back to ``t``, or the full ``<module>.<qualname>`` if no shorter
+        prefix works.
+    """
+    module_name, qualname = t.__module__, t.__qualname__
+
+    # Prefer a shorter path when a parent package also resolves to this same
+    # object, e.g. ``module.submodule._impl.class`` may become
+    # ``module.submodule.class``. The shorter string is simpler and less
+    # affected by moving the implementation between submodules.
+    packages = module_name.split(".")
+    for depth, _ in enumerate(packages[:-1], start=1):
+        shortened = ".".join(packages[:depth]) + f".{qualname}"
+        try:
+            resolved = locate(shortened)
+        except ImportError:
+            continue
+        if resolved is t:
+            return shortened
+    return f"{module_name}.{qualname}"
+
+
+def locate(name: str) -> Any:
+    """
+    Resolve a dotted string such as "module.submodule.class_name" to the
+    object it names.
+
+    ``pydoc.locate`` is tried first. If it finds nothing, the name is split
+    into a module path and an attribute path at every possible position,
+    longest module path first, and the first split that imports and resolves
+    is used.
+
+    Raises:
+        ImportError: If nothing could be resolved.
+    """
+    found = pydoc.locate(name)
+    if found is not None:
+        return found
+
+    # Fallback: walk the module path manually for cases pydoc.locate misses
+    # (e.g. nested classes, re-exports).
+    import importlib
+
+    segments = name.split(".")
+    split_at = len(segments) - 1
+    while split_at > 0:
+        try:
+            found = importlib.import_module(".".join(segments[:split_at]))
+            for attribute in segments[split_at:]:
+                found = getattr(found, attribute)
+        except (ImportError, AttributeError):
+            found = None
+            split_at -= 1
+            continue
+        break
+
+    if found is None:
+        raise ImportError(f"Cannot dynamically locate object {name}!")
+    return found
diff --git a/invokeai/backend/pid/_ext/imaginaire/model.py b/invokeai/backend/pid/_ext/imaginaire/model.py
new file mode 100644
index 00000000000..f2ab6e02c3a
--- /dev/null
+++ b/invokeai/backend/pid/_ext/imaginaire/model.py
@@ -0,0 +1,129 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import Any
+
+import torch
+
+from invokeai.backend.pid._ext.imaginaire.lazy_config import LazyDict, instantiate
+
+
+class ImaginaireModel(torch.nn.Module):
+    """Base class for Imaginaire models; a thin layer over torch.nn.Module.
+
+    Subclasses are expected to implement the computation graphs:
+        - training_step(): one training step, including the loss computation.
+        - validation_step(): one validation step, including the loss computation.
+        - forward(): the computation graph for model inference.
+    This class supplies a default for:
+        - init_optimizer_scheduler(): builds the optimizer and scheduler.
+    The `on_*` hooks are optional and do nothing by default.
+    """
+
+    def __init__(self) -> None:
+        super().__init__()
+
+    def init_optimizer_scheduler(
+        self, optimizer_config: LazyDict, scheduler_config: LazyDict
+    ) -> tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LRScheduler]:
+        """Build the optimizer and the scheduler from their configs.
+
+        Both configs are modified in place: `optimizer_config.params` is set
+        to this model's parameters, and `scheduler_config.optimizer` is set
+        to the optimizer that was just built.
+
+        Args:
+            optimizer_config (LazyDict): Config instantiated into the optimizer.
+            scheduler_config (LazyDict): Config instantiated into the scheduler.
+
+        Returns:
+            optimizer (torch.optim.Optimizer): The model optimizer.
+            scheduler (torch.optim.lr_scheduler.LRScheduler): The optimization scheduler.
+        """
+        optimizer_config.params = self.parameters()
+        built_optimizer = instantiate(optimizer_config)
+        scheduler_config.optimizer = built_optimizer
+        built_scheduler = instantiate(scheduler_config)
+        return built_optimizer, built_scheduler
+
+    def training_step(
+        self, data_batch: dict[str, torch.Tensor], iteration: int
+    ) -> tuple[dict[str, torch.Tensor], torch.Tensor]:
+        """One training step, including the loss computation. Must be overridden.
+
+        Args:
+            data_batch (dict[str, torch.Tensor]): Data batch (dictionary of tensors).
+            iteration (int): Current iteration number.
+
+        Returns:
+            output_batch (dict[str, torch.Tensor]): Auxiliary model output from the training batch.
+            loss (torch.Tensor): The total loss for backprop (weighted sum of various losses).
+
+        Raises:
+            NotImplementedError: Always, in this base class.
+        """
+        raise NotImplementedError
+
+    @torch.no_grad()
+    def validation_step(
+        self, data_batch: dict[str, torch.Tensor], iteration: int
+    ) -> tuple[dict[str, torch.Tensor], torch.Tensor]:
+        """One validation step, including the loss computation. Must be overridden.
+
+        The `torch.no_grad()` decorator wraps this base method only; an
+        override has to apply its own.
+
+        Args:
+            data_batch (dict[str, torch.Tensor]): Data batch (dictionary of tensors).
+            iteration (int): Current iteration number.
+
+        Returns:
+            output_batch (dict[str, torch.Tensor]): Auxiliary model output from the validation batch.
+            loss (torch.Tensor): The total loss (weighted sum of various losses).
+
+        Raises:
+            NotImplementedError: Always, in this base class.
+        """
+        raise NotImplementedError
+
+    @torch.inference_mode()
+    def forward(self, *args: Any, **kwargs: Any) -> Any:
+        """The computation graph for model inference. Must be overridden.
+
+        The `torch.inference_mode()` decorator wraps this base method only;
+        an override has to apply its own.
+
+        Args:
+            *args: Whatever the subclass decides to accept positionally.
+            **kwargs: Keyword arguments are also possible.
+
+        Returns:
+            The model's output.
+
+        Raises:
+            NotImplementedError: Always, in this base class.
+        """
+        raise NotImplementedError
+
+    def on_train_start(self, memory_format: torch.memory_format = torch.preserve_format) -> None:
+        """Hook for model preparation before training is launched; a no-op here.
+
+        Args:
+            memory_format (torch.memory_format): Memory format of the model.
+        """
+
+    def on_before_zero_grad(
+        self, optimizer: torch.optim.Optimizer, scheduler: torch.optim.lr_scheduler.LRScheduler, iteration: int
+    ) -> None:
+        """Hook before zero_grad() is called; a no-op here.
+
+        Args:
+            optimizer (torch.optim.Optimizer): The model optimizer.
+            scheduler (torch.optim.lr_scheduler.LRScheduler): The optimization scheduler.
+            iteration (int): Current iteration number.
+        """
+
+    def on_after_backward(self, iteration: int = 0) -> None:
+        """Hook after loss.backward() is called; a no-op here.
+
+        Intended for custom operations on the gradients between the backward
+        pass and the optimizer step.
+
+        Args:
+            iteration (int): Current iteration number.
+        """
diff --git a/invokeai/backend/pid/_ext/imaginaire/utils/__init__.py b/invokeai/backend/pid/_ext/imaginaire/utils/__init__.py
new file mode 100644
index 00000000000..3159bfe6564
--- /dev/null
+++ b/invokeai/backend/pid/_ext/imaginaire/utils/__init__.py
@@ -0,0 +1,14 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
diff --git a/invokeai/backend/pid/_ext/imaginaire/utils/count_params.py b/invokeai/backend/pid/_ext/imaginaire/utils/count_params.py
new file mode 100644
index 00000000000..c42805a66df
--- /dev/null
+++ b/invokeai/backend/pid/_ext/imaginaire/utils/count_params.py
@@ -0,0 +1,29 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from torch import nn
+
+
+def disabled_train(self, mode: bool = True):
+ """Overwrite model.train with this function to make sure train/eval mode
+ does not change anymore."""
+ return self
+
+
+def count_params(model: nn.Module, verbose=False) -> int:
+ total_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
+ if verbose:
+ print(f"{model.__class__.__name__} has {total_params * 1.0e-6:.2f} M params.")
+ return total_params
diff --git a/invokeai/backend/pid/_ext/imaginaire/utils/device.py b/invokeai/backend/pid/_ext/imaginaire/utils/device.py
new file mode 100644
index 00000000000..aab75fc59fa
--- /dev/null
+++ b/invokeai/backend/pid/_ext/imaginaire/utils/device.py
@@ -0,0 +1,125 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import gc
+import math
+import os
+
+from invokeai.backend.pid._ext.imaginaire.utils.log import logger as logging
+
+
+def get_gpu_architecture():
+ """
+ Retrieves the GPU architecture of the available GPUs.
+
+ Returns:
+ str: The GPU architecture, which can be "H100", "A100", or "Other".
+ """
+ import pynvml
+
+ try:
+ pynvml.nvmlInit()
+ device_count = pynvml.nvmlDeviceGetCount()
+ for i in range(device_count):
+ handle = pynvml.nvmlDeviceGetHandleByIndex(i)
+ model_name = pynvml.nvmlDeviceGetName(handle)
+ if isinstance(model_name, bytes):
+ model_name = model_name.decode("utf-8")
+ print(f"GPU {i}: Model: {model_name}")
+
+ # Check for specific models like H100 or A100
+ if "H100" in model_name or "H200" in model_name:
+ return "H100"
+ elif "A100" in model_name:
+ return "A100"
+ elif "L40S" in model_name:
+ return "L40S"
+ elif "B200" in model_name:
+ return "B200"
+ except pynvml.NVMLError as error:
+ print(f"Failed to get GPU info: {error}")
+ finally:
+ pynvml.nvmlShutdown()
+
+ # return "Other" incase of non hopper/ampere or error
+ return "Other"
+
+
+class GPUArchitectureNotSupported(Exception):
+ """
+ Custom exception raised when the expected GPU architecture is not supported.
+ """
+
+ pass
+
+
+def print_gpu_mem(str=None):
+ import pynvml
+
+ try:
+ pynvml.nvmlInit()
+ meminfo = pynvml.nvmlDeviceGetMemoryInfo(pynvml.nvmlDeviceGetHandleByIndex(0))
+ logging.info(
+ f"{str}: {meminfo.used / 1024 / 1024}/{meminfo.total / 1024 / 1024}MiB used ({meminfo.free / 1024 / 1024}MiB free)"
+ )
+ except pynvml.NVMLError as error:
+ print(f"Failed to get GPU memory info: {error}")
+
+
+def force_gc():
+ print_gpu_mem()
+ print("gc()")
+ gc.collect()
+ print_gpu_mem()
+ print("empty cuda cache")
+ # print(torch.cuda.memory_summary())
+ print_gpu_mem()
+
+
+def gpu0_has_80gb_or_less():
+ import pynvml
+
+ try:
+ pynvml.nvmlInit()
+ meminfo = pynvml.nvmlDeviceGetMemoryInfo(pynvml.nvmlDeviceGetHandleByIndex(0))
+ return meminfo.total / 1024 / 1024 / 1024 <= 80
+ except pynvml.NVMLError as error:
+ print(f"Failed to get GPU memory info: {error}")
+
+
+class Device:
+ _nvml_affinity_elements = math.ceil(os.cpu_count() / 64) # type: ignore
+
+ def __init__(self, device_idx: int):
+ import pynvml
+
+ super().__init__()
+ self.handle = pynvml.nvmlDeviceGetHandleByIndex(device_idx)
+
+ def get_name(self) -> str:
+ import pynvml
+
+ return pynvml.nvmlDeviceGetName(self.handle)
+
+ def get_cpu_affinity(self) -> list[int]:
+ import pynvml
+
+ affinity_string = ""
+ for j in pynvml.nvmlDeviceGetCpuAffinity(self.handle, Device._nvml_affinity_elements):
+ # assume nvml returns list of 64 bit ints
+ affinity_string = "{:064b}".format(j) + affinity_string
+ affinity_list = [int(x) for x in affinity_string]
+ affinity_list.reverse() # so core 0 is in 0th element of list
+ return [i for i, e in enumerate(affinity_list) if e != 0]
diff --git a/invokeai/backend/pid/_ext/imaginaire/utils/distributed.py b/invokeai/backend/pid/_ext/imaginaire/utils/distributed.py
new file mode 100644
index 00000000000..78d8599abcd
--- /dev/null
+++ b/invokeai/backend/pid/_ext/imaginaire/utils/distributed.py
@@ -0,0 +1,444 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import annotations
+
+import collections
+import collections.abc
+import ctypes
+import functools
+import os
+from contextlib import contextmanager
+from datetime import timedelta
+from typing import TYPE_CHECKING, Any, Callable, Container, Optional
+
+import torch
+import torch.distributed as dist
+from torch.distributed import get_process_group_ranks
+
+from invokeai.backend.pid._ext.imaginaire.utils.device import Device
+
+if dist.is_available():
+ from torch.distributed.distributed_c10d import _get_default_group
+ from torch.distributed.utils import _sync_module_states, _verify_param_shape_across_processes
+
+from invokeai.backend.pid._ext.imaginaire.utils import log
+
+if TYPE_CHECKING:
+ DDPConfig = Any # config module not vendored; type hint kept for parity
+
+try:
+ from megatron.core import parallel_state
+except ImportError:
+ parallel_state = None # type: ignore[assignment]
+
+
+ def init() -> int | None:
+ """Initialize distributed training.
+
+ If the default process group is already initialized, the current CUDA device index is
+ returned and nothing else happens. Otherwise NVML is initialized, CPU affinity for the
+ local GPU is applied on a best-effort basis, NCCL-related environment variables are
+ written, the NCCL process group is created from ``env://`` and a CUDA device limit is
+ raised through ``libcudart.so``. No value is returned explicitly on that path.
+
+ Environment variables read: ``LOCAL_RANK`` (default 0) and
+ ``TORCH_NCCL_HEARTBEAT_TIMEOUT_SEC`` (default 1800 seconds, used as the process-group
+ timeout).
+
+ Returns:
+ int | None: The current CUDA device index when already initialized.
+ """
+ import pynvml
+
+ if dist.is_initialized():
+ return torch.cuda.current_device()
+
+ # Set GPU affinity.
+ pynvml.nvmlInit()
+ local_rank = int(os.getenv("LOCAL_RANK", 0))
+ # Affinity is best-effort: any failure is logged as a warning and initialization continues.
+ try:
+ device = Device(local_rank)
+ os.sched_setaffinity(0, device.get_cpu_affinity())
+ except Exception as e:
+ log.warning(f"Failed to set device affinity: {e}")
+ # Set up NCCL communication.
+ os.environ["TORCH_NCCL_BLOCKING_WAIT"] = "0"
+ os.environ["TORCH_NCCL_ASYNC_ERROR_HANDLING"] = "1"
+ if dist.is_available():
+ torch.cuda.set_device(local_rank)
+ # Get the timeout value from environment variable
+ timeout_seconds = os.getenv("TORCH_NCCL_HEARTBEAT_TIMEOUT_SEC", 1800)
+ # Convert the timeout to an integer (if it isn't already) and then to a timedelta
+ timeout_timedelta = timedelta(seconds=int(timeout_seconds))
+ dist.init_process_group(backend="nccl", init_method="env://", timeout=timeout_timedelta)
+ log.info(
+ f"Initialized distributed training with local rank {local_rank} with timeout {timeout_seconds}",
+ rank0_only=False,
+ )
+ # Increase the L2 fetch granularity for faster speed.
+ # NOTE(review): the library name is hard-coded; loading presumably fails where no "libcudart.so" is resolvable.
+ _libcudart = ctypes.CDLL("libcudart.so")
+ # Set device limit on the current device.
+ # 0x05 is presumably cudaLimitMaxL2FetchGranularity - TODO confirm against the CUDA runtime headers.
+ p_value = ctypes.cast((ctypes.c_int * 1)(), ctypes.POINTER(ctypes.c_int))
+ _libcudart.cudaDeviceSetLimit(ctypes.c_int(0x05), ctypes.c_int(128))
+ # NOTE(review): the value read back into p_value is not used afterwards.
+ _libcudart.cudaDeviceGetLimit(p_value, ctypes.c_int(0x05))
+ log.info(f"Training with {get_world_size()} GPUs.")
+
+
+def get_rank(group: Optional[dist.ProcessGroup] = None) -> int:
+ """Get the rank (GPU device) of the worker.
+
+ Returns:
+ rank (int): The rank of the worker.
+ """
+ rank = 0
+ if dist.is_available() and dist.is_initialized():
+ rank = dist.get_rank(group)
+ return rank
+
+
+def get_world_size(group: Optional[dist.ProcessGroup] = None) -> int:
+ """Get world size. How many GPUs are available in this job.
+
+ Returns:
+ world_size (int): The total number of GPUs available in this job.
+ """
+ world_size = 1
+ if dist.is_available() and dist.is_initialized():
+ world_size = dist.get_world_size(group)
+ return world_size
+
+
+def is_rank0() -> bool:
+ """Check if current process is the master GPU.
+
+ Returns:
+ (bool): True if this function is called from the master GPU, else False.
+ """
+ return get_rank() == 0
+
+
+def is_local_rank0() -> bool:
+ """Check if current process is the local master GPU in the current node.
+
+ Returns:
+ (bool): True if this function is called from the local master GPU, else False.
+ """
+ return torch.cuda.current_device() == 0
+
+
+def rank0_only(func: Callable) -> Callable:
+ """Apply this function only to the master GPU.
+
+ Example usage:
+ @rank0_only
+ def func(x):
+ return x + 3
+
+ Args:
+ func (Callable): a function.
+
+ Returns:
+ (Callable): A function wrapper executing the function only on the master GPU.
+ """
+
+ @functools.wraps(func)
+ def wrapper(*args, **kwargs): # noqa: ANN202
+ if is_rank0():
+ return func(*args, **kwargs)
+ else:
+ return None
+
+ return wrapper
+
+
+def barrier() -> None:
+ """Barrier for all GPUs."""
+ if dist.is_available() and dist.is_initialized():
+ dist.barrier()
+
+
+def rank0_first(func: Callable) -> Callable:
+ """run the function on rank 0 first, then on other ranks."""
+
+ @functools.wraps(func)
+ def wrapper(*args, **kwargs): # noqa: ANN202
+ if is_rank0():
+ result = func(*args, **kwargs)
+ barrier()
+ if not is_rank0():
+ result = func(*args, **kwargs)
+ return result
+
+ return wrapper
+
+
+def parallel_model_wrapper(config_ddp: DDPConfig, model: torch.nn.Module) -> torch.nn.Module | DistributedDataParallel:
+ """Wraps the model to enable data parallalism for training across multiple GPU devices.
+
+ Args:
+ config_ddp (DDPConfig): The data parallel config.
+ model (torch.nn.Module): The PyTorch module.
+
+ Returns:
+ model (torch.nn.Module | DistributedDataParallel): The data parallel model wrapper
+ if distributed environment is available, otherwise return the original model.
+ """
+ if dist.is_available() and dist.is_initialized():
+ local_rank = int(os.getenv("LOCAL_RANK", 0))
+ try:
+ ddp_group = parallel_state.get_data_parallel_group(with_context_parallel=True)
+ except Exception as e:
+ log.info(e)
+ log.info("parallel_state not initialized, treating all GPUs equally for DDP")
+ ddp_group = None
+
+ model = DistributedDataParallel(
+ model,
+ device_ids=[local_rank],
+ output_device=local_rank,
+ find_unused_parameters=config_ddp.find_unused_parameters,
+ static_graph=config_ddp.static_graph,
+ broadcast_buffers=config_ddp.broadcast_buffers,
+ process_group=ddp_group,
+ )
+ return model
+
+
+class DistributedDataParallel(torch.nn.parallel.DistributedDataParallel):
+ """This extends torch.nn.parallel.DistributedDataParallel with .training_step().
+
+ This borrows the concept of `forward-redirection` from Pytorch lightning. It wraps an ImaginaireModel such that
+ model.training_step() would be executed when calling self.training_step(), while preserving the behavior of calling
+ model() for Pytorch modules. Internally, this is a double rerouting mechanism (training_step -> forward ->
+ training_step), allowing us to preserve the function names and signatures.
+ """
+
+ def __init__(self, model: torch.nn.Module, *args, **kwargs):
+ super().__init__(model, *args, **kwargs)
+ self.show_sync_grad_static_graph_warning = True
+
+ def training_step(self, *args, **kwargs) -> Any:
+ # Cache the original model.forward() method.
+ original_forward = self.module.forward
+
+ def wrapped_training_step(*_args, **_kwargs): # noqa: ANN202
+ # Unpatch immediately before calling training_step() because itself may want to call the real forward.
+ self.module.forward = original_forward
+ # The actual .training_step().
+ return self.module.training_step(*_args, **_kwargs)
+
+ # Patch the original_module's forward so we can redirect the arguments back to the real method.
+ self.module.forward = wrapped_training_step
+ # Call self, which implicitly calls self.forward() --> model.forward(), which is now model.training_step().
+ # Without calling self.forward() or model.forward() explciitly, implicit hooks are also executed.
+ return self(*args, **kwargs)
+
+
+ @contextmanager
+ def ddp_sync_grad(model, enabled):
+ r"""
+ Context manager to enable/disable gradient synchronizations across DDP processes for DDP model.
+ Modified from:
+ https://pytorch.org/docs/stable/_modules/torch/nn/parallel/distributed.html#DistributedDataParallel.no_sync
+ Note that this is incompatible with static_graph=True and will be a no-op if static_graph=True.
+
+ Within this context, gradients will be accumulated on module
+ variables, which will later be synchronized in the first
+ forward-backward pass exiting the context.
+
+ Args:
+ model: A ``torch.nn.Module``; only instances of this module's ``DistributedDataParallel`` are modified.
+ enabled: Value assigned to ``model.require_backward_grad_sync`` inside the context.
+
+ .. warning::
+ The forward pass should be included inside the context manager, or
+ else gradients will still be synchronized.
+ """
+ assert isinstance(model, torch.nn.Module)
+ if isinstance(model, DistributedDataParallel):
+ # Remember the current setting so it can be restored on exit.
+ old_require_backward_grad_sync = model.require_backward_grad_sync
+ if model.static_graph and model.require_backward_grad_sync != enabled:
+ # The warning is emitted at most once per wrapper instance.
+ if model.show_sync_grad_static_graph_warning:
+ log.warning("DDP static_graph=True is incompatible with sync_grad(). Performance will be reduced.")
+ model.show_sync_grad_static_graph_warning = False
+ else:
+ model.require_backward_grad_sync = enabled
+ try:
+ yield
+ finally:
+ if isinstance(model, DistributedDataParallel):
+ model.require_backward_grad_sync = old_require_backward_grad_sync
+
+
+ def collate_batches(data_batches: list[dict[str, torch.Tensor]]) -> torch.Tensor | dict[str, torch.Tensor]:
+ """Aggregate the list of data batches from all devices and process the results.
+
+ This is used for gathering validation data batches with pid._ext.imaginaire.utils.dataloader.DistributedEvalSampler.
+ It will return the data/output of the entire validation set in its original index order. The sizes of data_batches
+ in different ranks may differ by 1 (if dataset size is not evenly divisible), in which case a dummy sample will be
+ created before calling dist.all_gather().
+
+ Args:
+ data_batches (list[dict[str, torch.Tensor]]): List of tensors or (hierarchical) dictionary where
+ leaf entries are tensors.
+
+ Returns:
+ data_gather (torch.Tensor | dict[str, torch.Tensor]): tensors or (hierarchical) dictionary where
+ leaf entries are concatenated tensors.
+
+ Raises:
+ TypeError: If the first element of data_batches is neither a tensor nor a mapping.
+ """
+ if isinstance(data_batches[0], torch.Tensor):
+ # Concatenate the local data batches.
+ data_concat = torch.cat(data_batches, dim=0) # type: ignore
+ # Get the largest number of local samples from all ranks to determine whether to dummy-pad on this rank.
+ max_num_local_samples = torch.tensor(len(data_concat), device="cuda")
+ dist.all_reduce(max_num_local_samples, op=dist.ReduceOp.MAX)
+ if len(data_concat) < max_num_local_samples:
+ # A rank may be short by at most one sample; pad it with one uninitialized row.
+ assert len(data_concat) + 1 == max_num_local_samples
+ dummy = torch.empty_like(data_concat[:1])
+ data_concat = torch.cat([data_concat, dummy], dim=0)
+ dummy_count = torch.tensor(1, device="cuda")
+ else:
+ dummy_count = torch.tensor(0, device="cuda")
+ # Get all concatenated batches from all ranks and concatenate again.
+ dist.all_reduce(dummy_count, op=dist.ReduceOp.SUM)
+ data_concat = all_gather_tensor(data_concat.contiguous())
+ # Stacking on dim=1 and flattening interleaves the ranks: rank0[0], rank1[0], ..., rank0[1], ...
+ data_collate = torch.stack(data_concat, dim=1).flatten(start_dim=0, end_dim=1)
+ # Remove the dummy samples.
+ # NOTE(review): this drops the last dummy_count rows; presumably the padded ranks are the trailing ones - confirm.
+ if dummy_count > 0:
+ data_collate = data_collate[:-dummy_count]
+ elif isinstance(data_batches[0], collections.abc.Mapping):
+ # Recurse per key; the keys of the first batch define the output keys.
+ data_collate = {}
+ for key in data_batches[0].keys():
+ data_collate[key] = collate_batches([data[key] for data in data_batches]) # type: ignore
+ else:
+ raise TypeError
+ return data_collate
+
+
+@torch.no_grad()
+def all_gather_tensor(tensor: torch.Tensor) -> list[torch.Tensor]:
+ """Gather the corresponding tensor from all GPU devices to a list.
+
+ Args:
+ tensor (torch.Tensor): Pytorch tensor.
+
+ Returns:
+ tensor_list (list[torch.Tensor]): A list of Pytorch tensors gathered from all GPU devices.
+ """
+ tensor_list = [torch.zeros_like(tensor) for _ in range(get_world_size())]
+ dist.all_gather(tensor_list, tensor)
+ return tensor_list
+
+
+def broadcast(tensor, src, group=None, async_op=False):
+ world_size = get_world_size()
+ if world_size < 2:
+ return tensor
+ dist.broadcast(tensor, src=src, group=group, async_op=async_op)
+
+
+def dist_reduce_tensor(tensor, rank=0, reduce="mean"):
+ r"""Reduce to rank 0"""
+ world_size = get_world_size()
+ if world_size < 2:
+ return tensor
+ with torch.no_grad():
+ dist.reduce(tensor, dst=rank)
+ if get_rank() == rank:
+ if reduce == "mean":
+ tensor /= world_size
+ elif reduce == "sum":
+ pass
+ else:
+ raise NotImplementedError
+ return tensor
+
+
+ def sync_model_states(
+ model: torch.nn.Module,
+ process_group: Optional[dist.ProcessGroup] = None,
+ src: int = 0,
+ params_and_buffers_to_ignore: Optional[Container[str]] = None,
+ broadcast_buffers: bool = True,
+ ):
+ """
+ Modified from the DDP source code.
+ Synchronizes the parameters and buffers of a model across different processes in a distributed setting.
+
+ This function ensures that all processes in the specified process group have the same initial parameters and
+ buffers from the source rank, typically rank 0. It is useful when different processes start with different model
+ states and a synchronization is required to ensure consistency across all ranks.
+
+ The function returns immediately when torch.distributed is unavailable or uninitialized, or when no parameter
+ remains after filtering and deduplication.
+
+ Args:
+ model (nn.Module): The model whose parameters and buffers are to be synchronized.
+ process_group (dist.ProcessGroup, optional): The process group for communication. If None,
+ the default group is used. Defaults to None.
+ src (int, optional): The source rank from which parameters and buffers will be broadcasted.
+ Defaults to 0.
+ params_and_buffers_to_ignore (Optional[Container[str]], optional): A container of parameter and buffer
+ names to exclude from synchronization. Defaults to None, which means all parameters and buffers are
+ included.
+ broadcast_buffers (bool, optional): Whether to broadcast buffers or not. Defaults to True.
+
+ Side Effects:
+ This function modifies the state of the model in-place to synchronize it with the source rank's model state.
+
+ Raises:
+ RuntimeError: If the shapes of parameters across processes do not match, a runtime error will be raised.
+
+ Examples:
+ >>> # avoid downloading duplicated model weights from s3 in each rank to save network bandwidth
+ >>> # useful and saves time when model weights are huge
+ >>> if dist.get_rank() == 0:
+ >>> model.load_state_dict(network_bound_weights_download_fn(s3_weights_path))
+ >>> dist.barrier()
+ >>> sync_model_states(model) # sync rank0 weights to other ranks
+ """
+ if not dist.is_available() or not dist.is_initialized():
+ return
+ if process_group is None:
+ process_group = _get_default_group()
+ # A falsy value (None or an empty container) is replaced by an empty set.
+ if not params_and_buffers_to_ignore:
+ params_and_buffers_to_ignore = set()
+
+ log.info(
+ f"Synchronizing model states from rank {src} to all ranks in process group {get_process_group_ranks(process_group)}."
+ )
+
+ # Build tuple of (module, parameter) for all parameters not listed in params_and_buffers_to_ignore
+ # (the requires_grad filter is commented out below, so frozen parameters are included too).
+ modules_and_parameters = [
+ (module, parameter)
+ for module_name, module in model.named_modules()
+ for parameter in [
+ param
+ # Note that we access module.named_parameters instead of
+ # parameters(module). parameters(module) is only needed in the
+ # single-process multi device case, where it accesses replicated
+ # parameters through _former_parameters.
+ for param_name, param in module.named_parameters(recurse=False)
+ if f"{module_name}.{param_name}" not in params_and_buffers_to_ignore
+ # if param.requires_grad
+ # and f"{module_name}.{param_name}" not in params_and_buffers_to_ignore
+ ]
+ ]
+
+ # Deduplicate any parameters that might be shared across child modules.
+ memo = set()
+ modules_and_parameters = [
+ # "p not in memo" is the deduplication check.
+ # "not memo.add(p)" is always True, and it's only there to cause "add(p)" if needed.
+ (m, p)
+ for m, p in modules_and_parameters
+ if p not in memo and not memo.add(p) # type: ignore[func-returns-value]
+ ]
+
+ # Build list of parameters.
+ parameters = [parameter for _, parameter in modules_and_parameters]
+ if len(parameters) == 0:
+ return
+
+ # The collected parameters are only used for this shape check; the broadcast below receives the whole model.
+ _verify_param_shape_across_processes(process_group, parameters)
+
+ _sync_module_states(
+ module=model,
+ process_group=process_group,
+ broadcast_bucket_size=int(250 * 1024 * 1024),
+ src=src,
+ params_and_buffers_to_ignore=params_and_buffers_to_ignore,
+ broadcast_buffers=broadcast_buffers,
+ )
diff --git a/invokeai/backend/pid/_ext/imaginaire/utils/log.py b/invokeai/backend/pid/_ext/imaginaire/utils/log.py
new file mode 100644
index 00000000000..c29f1265955
--- /dev/null
+++ b/invokeai/backend/pid/_ext/imaginaire/utils/log.py
@@ -0,0 +1,54 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# stdlib-based replacement for the upstream loguru-based logger.
+# Provides a drop-in `logger` plus `info/warning/error/...` module-level
+# functions so vendored call sites do not need to be touched.
+
+import logging
+from typing import Any
+
+logger = logging.getLogger("invokeai.backend.pid")
+
+
+def info(msg: Any, *args: Any, **kwargs: Any) -> None:
+ logger.info(str(msg), *args)
+
+
+def warning(msg: Any, *args: Any, **kwargs: Any) -> None:
+ logger.warning(str(msg), *args)
+
+
+warn = warning
+
+
+def error(msg: Any, *args: Any, **kwargs: Any) -> None:
+ logger.error(str(msg), *args)
+
+
+def debug(msg: Any, *args: Any, **kwargs: Any) -> None:
+ logger.debug(str(msg), *args)
+
+
+def critical(msg: Any, *args: Any, **kwargs: Any) -> None:
+ logger.critical(str(msg), *args)
+
+
+def exception(msg: Any, *args: Any, **kwargs: Any) -> None:
+ logger.exception(str(msg), *args)
+
+
+def trace(msg: Any, *args: Any, **kwargs: Any) -> None:
+ logger.debug(str(msg), *args)
+
+
+def success(msg: Any, *args: Any, **kwargs: Any) -> None:
+ logger.info(str(msg), *args)
+
+
+def init_loguru_stdout() -> None:
+ pass
+
+
+def init_loguru_file(path: str) -> None:
+ pass
diff --git a/invokeai/backend/pid/_ext/imaginaire/utils/misc.py b/invokeai/backend/pid/_ext/imaginaire/utils/misc.py
new file mode 100644
index 00000000000..08170c8b99d
--- /dev/null
+++ b/invokeai/backend/pid/_ext/imaginaire/utils/misc.py
@@ -0,0 +1,48 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Minimal stand-in for the upstream misc module. The full module pulled in
+# wandb / straggler / termcolor / easy_io / DTensor helpers that the decoder
+# inference subset does not use.
+
+from __future__ import annotations
+
+import random
+import time
+from contextlib import contextmanager
+from typing import Iterator
+
+import numpy as np
+import torch
+
+from invokeai.backend.pid._ext.imaginaire.utils.log import logger
+
+
+@contextmanager
+def timer(label: str) -> Iterator[None]:
+ start = time.perf_counter()
+ try:
+ yield
+ finally:
+ logger.info("%s took %.2fs", label, time.perf_counter() - start)
+
+
+def set_random_seed(seed: int, by_rank: bool = False) -> None:
+ if by_rank:
+ try:
+ import torch.distributed as dist
+
+ if dist.is_available() and dist.is_initialized():
+ seed = seed + dist.get_rank()
+ except Exception:
+ pass
+ random.seed(seed)
+ np.random.seed(seed)
+ torch.manual_seed(seed)
+ if torch.cuda.is_available():
+ torch.cuda.manual_seed_all(seed)
+
+
+def requires_grad(model: torch.nn.Module, value: bool = True) -> None:
+ for p in model.parameters():
+ p.requires_grad = value
diff --git a/invokeai/backend/pid/_src/__init__.py b/invokeai/backend/pid/_src/__init__.py
new file mode 100644
index 00000000000..e69de29bb2d
diff --git a/invokeai/backend/pid/_src/inference/__init__.py b/invokeai/backend/pid/_src/inference/__init__.py
new file mode 100644
index 00000000000..e69de29bb2d
diff --git a/invokeai/backend/pid/_src/inference/checkpoint_registry.py b/invokeai/backend/pid/_src/inference/checkpoint_registry.py
new file mode 100644
index 00000000000..00099c4e71b
--- /dev/null
+++ b/invokeai/backend/pid/_src/inference/checkpoint_registry.py
@@ -0,0 +1,122 @@
+# Shared official PID checkpoint registry.
+#
+# Single source of truth for the (experiment_name, checkpoint_path) pair used by
+# every pixel-decoder demo in `pid/_src/inference/`. The registry is keyed by
+# (backbone, ckpt_type):
+#
+# ckpt_type = "2k" Original 2048px-trained decoders, used as
+# 512→2048 (4×) decoder for diffusers-style backbones,
+# or 256→2048 (8×) for Scale-RAE.
+# ckpt_type = "2kto4k" Multi-resolution-trained decoders (data bucketing
+# 2048→3840 + SD3-style dynamic shift). Designed for
+# 1024 LDM → 4K (3840) decoding. Only registered for
+# the diffusers backbones (flux/flux2/sd3/zimage);
+# rae / scale_rae have no 2kto4k variant.
+#
+# Backbone-tag namespace:
+# flux Flux1-dev (16-ch VAE) LDM + from_clean (2k + 2kto4k)
+# flux2 Flux2-dev (128-ch BN VAE) LDM + from_clean (2k + 2kto4k)
+# sd3 SD3 medium (16-ch VAE) LDM + from_clean (2k + 2kto4k)
+# zimage ZImage (Flux1's 16-ch VAE) LDM only — reuses Flux1 model (2k + 2kto4k)
+# zimage_turbo ZImage-Turbo (same 16-ch VAE) LDM only — reuses Flux1 model (2k + 2kto4k)
+# rae DINOv2-B + RAE ViT-XL (768-ch RAE) LDM + from_clean (2k only, sr4x)
+# scale_rae SigLIP-2 So400M + Scale-RAE ViT-XL LDM + from_clean (2k only, sr8x)
+#
+# `pid_scale` is the spatial upscaling factor baked into the PID network
+# (sr4x → 4, sr8x → 8) and is forwarded to the demo's --scale argument.
+
+from dataclasses import dataclass
+
+
+ @dataclass(frozen=True)
+ class PIDCheckpoint:
+ """Immutable record describing one registered PID checkpoint."""
+
+ # Experiment name paired with the checkpoint.
+ experiment: str
+ # Path of the checkpoint file.
+ checkpoint_path: str
+ # Spatial upscaling factor baked into the PID network (sr4x -> 4, sr8x -> 8).
+ pid_scale: int
+
+
+_CKPT_ROOT = "checkpoints"
+
+VALID_CKPT_TYPES = ("2k", "2kto4k")
+
+
+PID_CHECKPOINT_REGISTRY: dict[tuple[str, str], PIDCheckpoint] = {
+ # ---- 2k (the original 2048-trained release) ----
+ ("flux", "2k"): PIDCheckpoint(
+ experiment="PiD_res2k_sr4x_official_flux_distill_4step",
+ checkpoint_path=f"{_CKPT_ROOT}/PiD_res2k_sr4x_official_flux_distill_4step/model_ema_bf16.pth",
+ pid_scale=4,
+ ),
+ ("flux2", "2k"): PIDCheckpoint(
+ experiment="PiD_res2k_sr4x_official_flux2_distill_4step",
+ checkpoint_path=f"{_CKPT_ROOT}/PiD_res2k_sr4x_official_flux2_distill_4step/model_ema_bf16.pth",
+ pid_scale=4,
+ ),
+ ("sd3", "2k"): PIDCheckpoint(
+ experiment="PiD_res2k_sr4x_official_sd3_distill_4step",
+ checkpoint_path=f"{_CKPT_ROOT}/PiD_res2k_sr4x_official_sd3_distill_4step/model_ema_bf16.pth",
+ pid_scale=4,
+ ),
+ ("zimage", "2k"): PIDCheckpoint(
+ experiment="PiD_res2k_sr4x_official_flux_distill_4step",
+ checkpoint_path=f"{_CKPT_ROOT}/PiD_res2k_sr4x_official_flux_distill_4step/model_ema_bf16.pth",
+ pid_scale=4,
+ ),
+ ("rae", "2k"): PIDCheckpoint(
+ experiment="PiD_res2k_sr4x_official_dinov2_distill_4step",
+ checkpoint_path=f"{_CKPT_ROOT}/PiD_res2k_sr4x_official_dinov2_distill_4step/model_ema_bf16.pth",
+ pid_scale=4,
+ ),
+ ("scale_rae", "2k"): PIDCheckpoint(
+ experiment="PiD_res2k_sr8x_official_siglip_distill_4step",
+ checkpoint_path=f"{_CKPT_ROOT}/PiD_res2k_sr8x_official_siglip_distill_4step/model_ema_bf16.pth",
+ pid_scale=8,
+ ),
+ # ---- 2kto4k (multi-res-trained, dynamic_shift-aware) ----
+ ("flux", "2kto4k"): PIDCheckpoint(
+ experiment="PiD_res2kto4k_sr4x_official_flux_distill_4step",
+ checkpoint_path=f"{_CKPT_ROOT}/PiD_res2kto4k_sr4x_official_flux_distill_4step/model_ema_bf16.pth",
+ pid_scale=4,
+ ),
+ ("flux2", "2kto4k"): PIDCheckpoint(
+ experiment="PiD_res2kto4k_sr4x_official_flux2_distill_4step",
+ checkpoint_path=f"{_CKPT_ROOT}/PiD_res2kto4k_sr4x_official_flux2_distill_4step/model_ema_bf16.pth",
+ pid_scale=4,
+ ),
+ ("sd3", "2kto4k"): PIDCheckpoint(
+ experiment="PiD_res2kto4k_sr4x_official_sd3_distill_4step",
+ checkpoint_path=f"{_CKPT_ROOT}/PiD_res2kto4k_sr4x_official_sd3_distill_4step/model_ema_bf16.pth",
+ pid_scale=4,
+ ),
+}
+# ZImage and ZImage-Turbo use Flux1's 16-ch VAE for both ckpt types → alias to
+# the flux entries. Keep explicit aliases (vs. duplicating) so updating "flux"
+# updates these backbones too.
+PID_CHECKPOINT_REGISTRY[("zimage_turbo", "2k")] = PID_CHECKPOINT_REGISTRY[("flux", "2k")]
+PID_CHECKPOINT_REGISTRY[("zimage", "2kto4k")] = PID_CHECKPOINT_REGISTRY[("flux", "2kto4k")]
+PID_CHECKPOINT_REGISTRY[("zimage_turbo", "2kto4k")] = PID_CHECKPOINT_REGISTRY[("flux", "2kto4k")]
+
+
+def get_pid_checkpoint(backbone: str, ckpt_type: str = "2k") -> PIDCheckpoint:
+ """Return the registered official PID checkpoint for `(backbone, ckpt_type)`.
+
+ `ckpt_type` defaults to `"2k"` so existing call sites keep their pre-2kto4k
+ behavior. Raises KeyError with the list of valid keys when the pair is
+ unknown — typical cause is asking for a `2kto4k` variant of a backbone
+ that doesn't ship one (rae / scale_rae).
+ """
+ if ckpt_type not in VALID_CKPT_TYPES:
+ raise KeyError(f"Unknown ckpt_type {ckpt_type!r}. Valid: {VALID_CKPT_TYPES}")
+ try:
+ return PID_CHECKPOINT_REGISTRY[(backbone, ckpt_type)]
+ except KeyError as exc:
+ valid = ", ".join(sorted(f"{b}+{t}" for b, t in PID_CHECKPOINT_REGISTRY))
+ raise KeyError(f"Unknown (backbone, ckpt_type)=({backbone!r}, {ckpt_type!r}). Valid: {valid}") from exc
+
+
+ # Names exported by `from ... import *`.
+ __all__ = [
+ "PIDCheckpoint",
+ "PID_CHECKPOINT_REGISTRY",
+ "VALID_CKPT_TYPES",
+ "get_pid_checkpoint",
+ ]
diff --git a/invokeai/backend/pid/_src/inference/pipeline_registry.py b/invokeai/backend/pid/_src/inference/pipeline_registry.py
new file mode 100644
index 00000000000..4994e2c878a
--- /dev/null
+++ b/invokeai/backend/pid/_src/inference/pipeline_registry.py
@@ -0,0 +1,364 @@
+"""
+Registry of diffusers pipelines for FPD-vs-VAE evaluation on generated images.
+
+Each DiffusionPipelineConfig describes how to load a diffusers pipeline, extract
+latents in (B, C, H, W) format, denormalize them, and decode with the pipeline's VAE.
+
+Supported backbones: flux, sdxl, sd3, flux2, qwenimage, zimage, zimage_turbo.
+
+Latent normalization conventions:
+ - Flux/SDXL/SD3: simple affine scale+shift → raw = latent / scale + shift
+ - Flux2: BatchNorm-based → raw = latent * bn_std + bn_mean
+ (running stats read from pipeline.vae.bn, a BatchNorm2d with affine=False)
+ - QwenImage: per-channel mean/std → raw = latent * std + mean
+ (vectors stored in pipeline.vae.config.latents_mean / latents_std)
+ - ZImage/ZImage-Turbo: affine scale+shift read from pipeline.vae.config at runtime
+ (vae_scale_factor=0 in registry signals runtime lookup)
+
+Diffusers `output_type="latent"` returns the denoised latent in the *normalized*
+space (same convention as tokenizer.encode()). For FPD the latent is used directly
+— no extra denormalization is needed. denormalize_latent() is only needed for VAE
+decode when the pipeline's decode path doesn't handle it internally.
+
+Requires diffusers >= 0.37.0 for Flux2/QwenImage/ZImage support.
+"""
+
+import importlib
+import os
+from dataclasses import dataclass, field
+from typing import Optional
+
+import torch
+
+# ---------------------------------------------------------------------------
+# Config dataclass
+# ---------------------------------------------------------------------------
+
+
+@dataclass
+class DiffusionPipelineConfig:
+ """Static description of one diffusers backbone.
+
+ Instances are stored in PIPELINE_REGISTRY and consumed by load_pipeline(),
+ denormalize_latent(), extract_latent() and decode_with_pipeline_vae().
+ """
+ name: str # registry key, e.g. "flux", "sdxl", "sd3", "flux2", "qwenimage", "zimage"
+ pipeline_class: str # dotted "module.ClassName" path, e.g. "diffusers.FluxPipeline"
+ default_model_id: str # HuggingFace model ID used when the caller passes none
+ latent_channels: int # 16 (Flux/SD3), 4 (SDXL), 32 (Flux2)
+ spatial_compression: int # 8 for every entry currently in PIPELINE_REGISTRY
+ # Affine normalization (Flux1/SDXL/SD3). Both are 0 for BN-based (Flux2) and
+ # per-channel (QwenImage) backbones. A scale of 0 on an affine backbone (ZImage)
+ # makes denormalize_latent() read the values from pipeline.vae.config instead.
+ vae_scale_factor: float # diffusers VAE scaling
+ vae_shift_factor: float # diffusers VAE shift (0 if none)
+ # Whether this backbone uses BatchNorm-based latent normalization (Flux2)
+ uses_bn_normalization: bool = False
+ # Whether this backbone uses per-channel mean/std normalization (QwenImage)
+ uses_perchannel_normalization: bool = False
+ # Whether the VAE is a video-style 3D VAE that produces 5D latents (QwenImage)
+ has_temporal_dim: bool = False
+ default_resolution: tuple[int, int] = (1024, 1024)
+ default_num_inference_steps: int = 28
+ default_guidance_scale: float = 3.5
+ # Extra kwargs forwarded to pipeline.__call__
+ extra_generate_kwargs: dict = field(default_factory=dict)
+
+
+# ---------------------------------------------------------------------------
+# Registry
+# ---------------------------------------------------------------------------
+
+# Maps backbone name -> DiffusionPipelineConfig; looked up through get_config().
+# Entries with vae_scale_factor=0.0 carry no affine constants here:
+# denormalize_latent() resolves their normalization from the loaded pipeline
+# (BatchNorm stats, per-channel mean/std, or the VAE config's scaling factor).
+PIPELINE_REGISTRY: dict[str, DiffusionPipelineConfig] = {
+ "flux": DiffusionPipelineConfig(
+ name="flux",
+ pipeline_class="diffusers.FluxPipeline",
+ default_model_id="black-forest-labs/FLUX.1-dev",
+ latent_channels=16,
+ spatial_compression=8,
+ vae_scale_factor=0.3611,
+ vae_shift_factor=0.1159,
+ default_resolution=(1024, 1024),
+ default_num_inference_steps=28,
+ default_guidance_scale=3.5,
+ extra_generate_kwargs={"max_sequence_length": 512},
+ ),
+ "sdxl": DiffusionPipelineConfig(
+ name="sdxl",
+ pipeline_class="diffusers.StableDiffusionXLPipeline",
+ default_model_id="stabilityai/stable-diffusion-xl-base-1.0",
+ latent_channels=4,
+ spatial_compression=8,
+ vae_scale_factor=0.13025,
+ vae_shift_factor=0.0,
+ default_resolution=(1024, 1024),
+ default_num_inference_steps=30,
+ default_guidance_scale=7.5,
+ ),
+ "sd3": DiffusionPipelineConfig(
+ name="sd3",
+ pipeline_class="diffusers.StableDiffusion3Pipeline",
+ default_model_id="stabilityai/stable-diffusion-3-medium-diffusers",
+ latent_channels=16,
+ spatial_compression=8,
+ vae_scale_factor=1.5305,
+ vae_shift_factor=0.0609,
+ default_resolution=(1024, 1024),
+ default_num_inference_steps=28,
+ default_guidance_scale=4.0,
+ ),
+ "flux2": DiffusionPipelineConfig(
+ name="flux2",
+ pipeline_class="diffusers.Flux2Pipeline",
+ default_model_id="black-forest-labs/FLUX.2-dev",
+ latent_channels=32,
+ spatial_compression=8,
+ # Flux2 uses BatchNorm-based normalization, not affine scale/shift.
+ # Set to 0 — actual denormalization uses the pipeline VAE's BatchNorm running stats.
+ vae_scale_factor=0.0,
+ vae_shift_factor=0.0,
+ uses_bn_normalization=True,
+ default_resolution=(1024, 1024),
+ default_num_inference_steps=50,
+ default_guidance_scale=4.0,
+ extra_generate_kwargs={"max_sequence_length": 512},
+ ),
+ "qwenimage": DiffusionPipelineConfig(
+ name="qwenimage",
+ pipeline_class="diffusers.QwenImagePipeline",
+ default_model_id="Qwen/Qwen-Image",
+ latent_channels=16,
+ spatial_compression=8,
+ # QwenImage uses per-channel mean/std normalization, not affine scale/shift.
+ # Actual denormalization reads pipeline.vae.config.latents_mean / latents_std.
+ vae_scale_factor=0.0,
+ vae_shift_factor=0.0,
+ uses_perchannel_normalization=True,
+ has_temporal_dim=True,
+ default_resolution=(1024, 1024),
+ default_num_inference_steps=50,
+ default_guidance_scale=4.0,
+ extra_generate_kwargs={"max_sequence_length": 512, "true_cfg_scale": 4.0, "negative_prompt": " "},
+ ),
+ "zimage": DiffusionPipelineConfig(
+ name="zimage",
+ pipeline_class="diffusers.ZImagePipeline",
+ default_model_id="Tongyi-MAI/Z-Image",
+ latent_channels=16,
+ spatial_compression=8,
+ # ZImage uses affine normalization but exact values depend on the pretrained
+ # checkpoint. Set to 0 so denormalize_latent() reads from pipeline.vae.config.
+ vae_scale_factor=0.0,
+ vae_shift_factor=0.0,
+ default_resolution=(1024, 1024),
+ default_num_inference_steps=50,
+ default_guidance_scale=5.0,
+ extra_generate_kwargs={"max_sequence_length": 512},
+ ),
+ "zimage_turbo": DiffusionPipelineConfig(
+ name="zimage_turbo",
+ pipeline_class="diffusers.ZImagePipeline",
+ default_model_id="Tongyi-MAI/Z-Image-Turbo",
+ latent_channels=16,
+ spatial_compression=8,
+ # ZImage-Turbo shares ZImage's VAE/latent convention. Runtime values are
+ # read from pipeline.vae.config by denormalize_latent().
+ vae_scale_factor=0.0,
+ vae_shift_factor=0.0,
+ default_resolution=(1024, 1024),
+ # The model card describes Turbo as an 8-NFE distilled model. Diffusers'
+ # example uses num_inference_steps=9, yielding 8 non-zero scheduler jumps
+ # followed by the terminal sigma=0 sample.
+ default_num_inference_steps=9,
+ default_guidance_scale=0.0,
+ extra_generate_kwargs={"max_sequence_length": 512},
+ ),
+}
+
+
+def get_config(name: str) -> DiffusionPipelineConfig:
+ if name not in PIPELINE_REGISTRY:
+ raise ValueError(f"Unknown backbone '{name}'. Available: {list(PIPELINE_REGISTRY.keys())}")
+ return PIPELINE_REGISTRY[name]
+
+
+# ---------------------------------------------------------------------------
+# Pipeline loading
+# ---------------------------------------------------------------------------
+
+
+def load_pipeline(
+ name: str, model_id: Optional[str] = None, dtype=torch.bfloat16, device: str = "cuda", cpu_offload: bool = False
+):
+ """Dynamically import and load a diffusers pipeline.
+
+ Args:
+ cpu_offload: If True, use enable_model_cpu_offload() instead of .to(device).
+ Keeps model weights on CPU and only moves the active component to GPU during
+ forward pass. Essential for large models (Flux2, QwenImage, etc.) that exceed
+ single-GPU VRAM when all components are loaded simultaneously.
+
+ Returns (pipeline, cfg) where pipeline is ready to call and cfg is the
+ DiffusionPipelineConfig for this backbone.
+ """
+ cfg = get_config(name)
+ model_id = model_id or cfg.default_model_id
+
+ # e.g. "diffusers.FluxPipeline" -> module="diffusers", cls="FluxPipeline"
+ module_path, cls_name = cfg.pipeline_class.rsplit(".", 1)
+ mod = importlib.import_module(module_path)
+ PipelineClass = getattr(mod, cls_name)
+
+ token = os.environ.get("HF_TOKEN") or os.environ.get("HUGGING_FACE_HUB_TOKEN")
+ print(f"Loading {cfg.pipeline_class} from {model_id} (dtype={dtype}) ...")
+ pipeline = PipelineClass.from_pretrained(model_id, torch_dtype=dtype, token=token)
+ if cpu_offload:
+ # Only the active component (text encoder / transformer / VAE) lives on GPU at a time.
+ # enable_model_cpu_offload() defaults to gpu_id=0 — must pass the correct device
+ # explicitly for multi-GPU torchrun, otherwise all ranks pile onto GPU 0.
+ gpu_id = torch.cuda.current_device()
+ pipeline.enable_model_cpu_offload(gpu_id=gpu_id)
+ print(f"Pipeline loaded with model CPU offload (gpu_id={gpu_id}).")
+ else:
+ pipeline = pipeline.to(device)
+ print(f"Pipeline loaded on {device}.")
+ return pipeline, cfg
+
+
+# ---------------------------------------------------------------------------
+# Latent handling
+# ---------------------------------------------------------------------------
+
+
+def denormalize_latent(pipeline, latent: torch.Tensor, cfg: DiffusionPipelineConfig) -> torch.Tensor:
+    """Undo the latent normalization so the result can be fed to the pipeline VAE's decode().
+
+    The formula depends on the backbone flags in `cfg`:
+
+    - `uses_bn_normalization` (Flux2): `latent * sqrt(running_var + eps) + running_mean`,
+      with the statistics taken from `pipeline.vae.bn`.
+    - `uses_perchannel_normalization` (QwenImage): `latent * latents_std + latents_mean`,
+      with both vectors taken from `pipeline.vae.config`.
+    - otherwise (affine): `latent / scale + shift`, using `cfg.vae_scale_factor` and
+      `cfg.vae_shift_factor`. A scale of 0.0 means "look it up": `scaling_factor`
+      and `shift_factor` are then read from `pipeline.vae.config`, and a missing or
+      falsy shift counts as 0.0.
+
+    Per-channel statistics are reshaped to (1, C, 1, 1) and moved to the latent's
+    device and dtype before use.
+    """
+    if cfg.uses_bn_normalization:
+        norm_layer = pipeline.vae.bn
+        mean = norm_layer.running_mean.to(latent.device, latent.dtype)
+        var = norm_layer.running_var.to(latent.device, latent.dtype)
+        std = (var + norm_layer.eps).sqrt()
+        return latent * std.view(1, -1, 1, 1) + mean.view(1, -1, 1, 1)
+
+    if cfg.uses_perchannel_normalization:
+        vae_cfg = pipeline.vae.config
+        mean = torch.tensor(vae_cfg.latents_mean).view(1, -1, 1, 1).to(latent.device, latent.dtype)
+        std = torch.tensor(vae_cfg.latents_std).view(1, -1, 1, 1).to(latent.device, latent.dtype)
+        return latent * std + mean
+
+    scale, shift = cfg.vae_scale_factor, cfg.vae_shift_factor
+    if scale == 0.0:
+        # The registry holds no constants for this backbone (e.g. ZImage).
+        vae_cfg = pipeline.vae.config
+        scale = vae_cfg.scaling_factor
+        shift = getattr(vae_cfg, "shift_factor", None) or 0.0
+    return latent / scale + shift
+
+
+def extract_latent(pipeline, raw_output, cfg: DiffusionPipelineConfig, height: int, width: int) -> torch.Tensor:
+    """Bring a pipeline's `output_type="latent"` result into (B, C, H, W) layout.
+
+    `raw_output.images` is unpacked according to `cfg.name`:
+
+    - "flux": `FluxPipeline._unpack_latents`.
+    - "flux2": `Flux2Pipeline._unpack_latents_with_ids`, with position IDs rebuilt
+      from the expected packed grid size.
+    - "qwenimage": `QwenImagePipeline._unpack_latents`, then axis 2 (the temporal
+      axis) is squeezed away.
+    - anything else (SDXL, SD3, ZImage, ...): used as returned.
+
+    Raises:
+        RuntimeError: if the result is not 4-dimensional.
+    """
+    packed = raw_output.images
+    backbone = cfg.name
+
+    if backbone == "flux":
+        from diffusers.pipelines.flux.pipeline_flux import FluxPipeline
+
+        latent = FluxPipeline._unpack_latents(
+            packed, height=height, width=width, vae_scale_factor=pipeline.vae_scale_factor
+        )
+    elif backbone == "flux2":
+        from diffusers.pipelines.flux2.pipeline_flux2 import Flux2Pipeline
+
+        # The packed grid is the image size divided by the VAE factor and by the
+        # 2x2 packing on top of it.
+        packed_stride = pipeline.vae_scale_factor * 2
+        # _prepare_latent_ids only needs a (B, C, H, W)-shaped tensor to read the grid from.
+        shape_probe = torch.zeros(
+            packed.shape[0], 1, height // packed_stride, width // packed_stride, device=packed.device
+        )
+        position_ids = Flux2Pipeline._prepare_latent_ids(shape_probe).to(packed.device)
+        unpacked = Flux2Pipeline._unpack_latents_with_ids(packed, position_ids)
+        # The helper may hand back a sequence of per-sample tensors; stack them into a batch.
+        latent = unpacked if isinstance(unpacked, torch.Tensor) else torch.stack(unpacked, dim=0)
+    elif backbone == "qwenimage":
+        from diffusers.pipelines.qwenimage.pipeline_qwenimage import QwenImagePipeline
+
+        latent = QwenImagePipeline._unpack_latents(
+            packed, height=height, width=width, vae_scale_factor=pipeline.vae_scale_factor
+        ).squeeze(2)
+    else:
+        latent = packed
+
+    if latent.ndim != 4:
+        raise RuntimeError(f"Expected 4-D latent (B, C, H, W) after extraction, got shape {latent.shape}")
+    return latent
+
+
+def decode_with_pipeline_vae(pipeline, latent: torch.Tensor, cfg: DiffusionPipelineConfig) -> torch.Tensor:
+    """Decode a normalized latent to pixels with the pipeline's own VAE.
+
+    Steps: denormalize via denormalize_latent(); for BatchNorm backbones (Flux2)
+    undo patchification with `Flux2Pipeline._unpatchify_latents`; for backbones
+    with a temporal axis insert a singleton axis at position 2; cast to the VAE's
+    dtype; decode under `torch.no_grad()`; keep only the first frame for temporal
+    VAEs; finally map the output from [-1, 1] to [0, 1] and clamp.
+
+    Returns:
+        The decoded image tensor with values in [0, 1].
+    """
+    latent_raw = denormalize_latent(pipeline, latent, cfg)
+
+    if cfg.uses_bn_normalization:
+        from diffusers.pipelines.flux2.pipeline_flux2 import Flux2Pipeline
+
+        latent_raw = Flux2Pipeline._unpatchify_latents(latent_raw)
+
+    is_video_vae = cfg.has_temporal_dim
+    if is_video_vae:
+        latent_raw = latent_raw.unsqueeze(2)
+
+    # The latent may arrive in a different dtype than the VAE weights.
+    vae = pipeline.vae
+    latent_raw = latent_raw.to(vae.dtype)
+
+    with torch.no_grad():
+        pixels = vae.decode(latent_raw, return_dict=False)[0]
+
+    if is_video_vae:
+        pixels = pixels[:, :, 0]
+
+    return (pixels * 0.5 + 0.5).clamp(0, 1)
+
+
+def print_latent_stats(latent: torch.Tensor, label: str = "latent"):
+    """Print one debug line with the shape, mean, std, min and max of `latent`."""
+    with torch.no_grad():
+        shape = list(latent.shape)
+        mean, std = latent.mean().item(), latent.std().item()
+        low, high = latent.min().item(), latent.max().item()
+    print(f" [{label}] shape={shape} mean={mean:.4f} std={std:.4f} min={low:.4f} max={high:.4f}")
diff --git a/invokeai/backend/pid/_src/models/__init__.py b/invokeai/backend/pid/_src/models/__init__.py
new file mode 100644
index 00000000000..e69de29bb2d
diff --git a/invokeai/backend/pid/_src/models/pid_distill_model.py b/invokeai/backend/pid/_src/models/pid_distill_model.py
new file mode 100644
index 00000000000..c061deda36b
--- /dev/null
+++ b/invokeai/backend/pid/_src/models/pid_distill_model.py
@@ -0,0 +1,315 @@
+# PID distillation model — inference subset of the DMD2-distilled student.
+#
+# The training-time teacher / fake_score / discriminator / DMD-loss machinery has been
+# stripped; what remains is the student net (`self.net`) plus the few-step sampler
+# (`_get_t_list`, `_student_sample_loop`, `_velocity_to_x0`) consumed by
+# `generate_samples_from_batch`.
+
+from __future__ import annotations
+
+import logging
+from collections import OrderedDict
+from contextlib import nullcontext
+from typing import Optional
+
+import attrs
+import torch
+
+from invokeai.backend.pid._src.models.pid_model import PidModel, PidModelConfig
+
+logger = logging.getLogger(__name__)
+
+
+@attrs.define(slots=False)
+class PidDistillModelConfig(PidModelConfig):
+ """Inference config for the distilled student."""
+
+ # Few-step student schedule.
+ # Start of the default linspace schedule, and the timestep used by the one-step path.
+ student_timestep: float = 1.0
+ # Number of sampler steps used when the caller does not pass num_steps.
+ student_sample_steps: int = 1
+ # "ode" takes an Euler step on the velocity; any other value re-noises the x0 estimate.
+ student_sample_type: str = "sde"
+ # Optional explicit timestep schedule; its last entry must be 0.
+ student_t_list: Optional[list] = None
+ # NOTE(review): not read anywhere in this module — presumably a training-time setting.
+ student_input_mode: str = "teacher_forcing"
+
+
+class PidDistillModel(PidModel):
+    """Inference-only PID distilled student.
+
+    Adds a few-step sampler around the student network (`self.net`):
+    `_get_t_list` builds the timestep schedule, `_student_sample_loop` walks it,
+    and `generate_samples_from_batch` is the public entry point.
+
+    NOTE(review): tensors created by the sampler are placed on the literal
+    device "cuda"; running on another device would need these call sites changed.
+    """
+
+    def __init__(self, config: PidDistillModelConfig):
+        # Stubs left in place so any parent code that probes for these attributes
+        # gets None instead of AttributeError. They are assigned before the parent
+        # constructor runs so that they already exist while it executes.
+        self.teacher = None
+        self.fake_score = None
+        self.discriminator = None
+        super().__init__(config)
+
+    # ---------------------------------------------------------------------
+    # Net output ↔ (x0, velocity) conversion
+    # ---------------------------------------------------------------------
+
+    def _net_output_to_x0(
+        self,
+        x_t: torch.Tensor,
+        net_output: torch.Tensor,
+        t: torch.Tensor,
+        prediction_type: str,
+    ) -> torch.Tensor:
+        """Convert a network output into an x0 estimate with the dtype of `x_t`.
+
+        For "x0" the output is returned as-is (cast to `x_t.dtype`). For
+        "velocity" the estimate is `x_t - t * net_output`, computed in float64
+        with `t` reshaped to broadcast over every non-batch axis.
+
+        Raises:
+            ValueError: for any other `prediction_type`.
+        """
+        if prediction_type == "x0":
+            return net_output.to(x_t.dtype)
+        if prediction_type == "velocity":
+            original_dtype = x_t.dtype
+            bcast_shape = [x_t.shape[0]] + [1] * (x_t.ndim - 1)
+            t_shaped = t.double().view(*bcast_shape)
+            return (x_t.double() - t_shaped * net_output.double()).to(original_dtype)
+        raise ValueError(f"Invalid prediction_type: {prediction_type}")
+
+    def _net_output_to_velocity(
+        self,
+        x_t: torch.Tensor,
+        net_output: torch.Tensor,
+        t: torch.Tensor,
+        prediction_type: str,
+    ) -> torch.Tensor:
+        """Convert a network output into a velocity.
+
+        For "velocity" the output is returned unchanged. For "x0" the velocity
+        is `(x_t - net_output) / t`, computed in float64; `t` is clamped to at
+        least 5e-2 so the division stays bounded near t=0.
+
+        Raises:
+            ValueError: for any other `prediction_type`.
+        """
+        if prediction_type == "velocity":
+            return net_output
+        if prediction_type == "x0":
+            original_dtype = x_t.dtype
+            bcast_shape = [x_t.shape[0]] + [1] * (x_t.ndim - 1)
+            t_shaped = t.double().view(*bcast_shape).clamp(min=5e-2)
+            return ((x_t.double() - net_output.double()) / t_shaped).to(original_dtype)
+        raise ValueError(f"Invalid prediction_type: {prediction_type}")
+
+    def _velocity_to_x0(self, x_t: torch.Tensor, net_output: torch.Tensor, t: torch.Tensor) -> torch.Tensor:
+        """Shorthand for `_net_output_to_x0` using the configured `prediction_type`."""
+        return self._net_output_to_x0(x_t, net_output, t, self.config.prediction_type)
+
+    # ---------------------------------------------------------------------
+    # Multi-step student sampler
+    # ---------------------------------------------------------------------
+
+    def _get_t_list(self, device, num_steps: Optional[int] = None) -> torch.Tensor:
+        """Build the float32 timestep schedule on `device`; it must end at 0.
+
+        With `config.student_t_list` set, that list is the schedule. If the
+        requested step count differs from `config.student_sample_steps`, it is
+        subsampled at `target_steps + 1` evenly spaced (rounded) indices.
+        Without it, the schedule is a linspace from `config.student_timestep`
+        down to 0 with `target_steps + 1` points.
+
+        Args:
+            device: Device for the returned tensor.
+            num_steps: Optional override of `config.student_sample_steps`. When
+                given, the resulting schedule is also logged.
+        """
+        target_steps = num_steps if num_steps is not None else self.config.student_sample_steps
+
+        if self.config.student_t_list is not None:
+            full_t = torch.tensor(self.config.student_t_list, device=device, dtype=torch.float32)
+            if target_steps != self.config.student_sample_steps:
+                indices = torch.linspace(0, len(full_t) - 1, target_steps + 1).round().long()
+                t_list = full_t[indices]
+            else:
+                t_list = full_t
+        else:
+            t_list = torch.linspace(
+                self.config.student_timestep,
+                0.0,
+                target_steps + 1,
+                device=device,
+                dtype=torch.float32,
+            )
+        assert abs(t_list[-1].item()) < 1e-6, "t_list must end at 0"
+        if num_steps is not None:
+            logger.info(f"[distill inference] num_steps={num_steps}, t_list={t_list.tolist()}")
+        return t_list
+
+    def _student_sample_loop(
+        self,
+        noise: torch.Tensor,
+        t_list: torch.Tensor,
+        caption_embs: torch.Tensor,
+        lq_video_or_image: Optional[torch.Tensor],
+        lq_latent: Optional[torch.Tensor],
+        degrade_sigma_tensor: Optional[torch.Tensor],
+        generator: Optional[torch.Generator] = None,
+    ) -> torch.Tensor:
+        """Run the student over consecutive `(t_cur, t_next)` pairs of `t_list`.
+
+        Each step evaluates the net at `t_cur * fm_trainer.timescale`. While
+        `t_next > 0` the state is advanced either by an Euler step on the velocity
+        (`student_sample_type == "ode"`) or by mixing the x0 estimate with fresh
+        noise: `(1 - t_next) * x0 + t_next * eps`. On the final step
+        (`t_next == 0`) the x0 estimate itself becomes the result.
+
+        Args:
+            noise: Initial state; axis 0 is the batch.
+            t_list: Schedule as produced by `_get_t_list`.
+            caption_embs, lq_video_or_image, lq_latent, degrade_sigma_tensor:
+                Conditioning forwarded to the net unchanged.
+            generator: RNG used for the re-noising draws.
+        """
+        B = noise.shape[0]
+        timescale = self.fm_trainer.timescale
+        autocast_ctx = torch.autocast("cuda", dtype=self.autocast_dtype) if self.autocast_dtype else nullcontext()
+        x = noise
+        net = self.net
+
+        with autocast_ctx:
+            for t_cur, t_next in zip(t_list[:-1], t_list[1:], strict=True):
+                t_cur_batch = t_cur.expand(B)
+                t_cur_scaled = t_cur_batch * timescale
+
+                v_pred = net(
+                    x,
+                    t_cur_scaled,
+                    caption_embs,
+                    lq_video_or_image=lq_video_or_image,
+                    lq_latent=lq_latent,
+                    degrade_sigma=degrade_sigma_tensor,
+                )
+
+                if t_next.item() > 0:
+                    if self.config.student_sample_type == "ode":
+                        v_for_step = self._net_output_to_velocity(x, v_pred, t_cur_batch, self.config.prediction_type)
+                        dt = t_next - t_cur
+                        x = x + dt * v_for_step
+                    else:
+                        x0_pred = self._velocity_to_x0(x, v_pred, t_cur_batch)
+                        eps_infer = torch.randn(
+                            x0_pred.shape,
+                            device=x0_pred.device,
+                            dtype=x0_pred.dtype,
+                            generator=generator,
+                        )
+                        bcast_shape = [B] + [1] * (x.ndim - 1)
+                        t_next_bcast = t_next.reshape(1).expand(bcast_shape)
+                        x = (1.0 - t_next_bcast) * x0_pred + t_next_bcast * eps_infer
+                else:
+                    x = self._velocity_to_x0(x, v_pred, t_cur_batch)
+
+        return x
+
+    # ---------------------------------------------------------------------
+    # Inference entry point
+    # ---------------------------------------------------------------------
+
+    @torch.no_grad()
+    def generate_samples_from_batch(
+        self,
+        data_batch: dict,
+        guidance: Optional[float] = None,
+        cfg_scale: Optional[float] = None,
+        num_steps: Optional[int] = None,
+        seed: int = 0,
+        image_size=None,
+        shift: Optional[float] = None,
+        is_negative_prompt: bool = False,
+        **kwargs,
+    ):
+        """Sample images with the distilled student.
+
+        Args:
+            data_batch: Must hold the captions under `config.input_caption_key`.
+                May hold "LQ_video_or_image", "LQ_latent", "degrade_sigma" and the
+                target tensor under `config.input_data_key`. NOTE: this dict is
+                modified in place — a missing "LQ_latent" is encoded from
+                "LQ_video_or_image" when a VAE is available, and a missing
+                "degrade_sigma" is filled with zeros.
+            guidance, cfg_scale, is_negative_prompt, **kwargs: Accepted for
+                signature compatibility; not read by this method.
+            num_steps: Overrides `config.student_sample_steps`.
+            seed: Seed of the CUDA generator used for the initial noise and for
+                re-noising inside the sampler.
+            image_size: int or (h, w). If None, the spatial size of
+                `data_batch[config.input_data_key]` is used when present,
+                otherwise `config.image_size`.
+            shift: Accepted and, when None, derived from `config.dynamic_shift`;
+                the distilled sampler does not consume the value.
+
+        Returns:
+            The x0 estimate clamped to [-1, 1] with a singleton axis inserted at
+            position 2.
+        """
+        # Encode any missing LQ_latent via the frozen VAE so callers can pass either
+        # LQ_video_or_image or LQ_latent.
+        if "LQ_latent" not in data_batch and "LQ_video_or_image" in data_batch and self.vae_encoder is not None:
+            data_batch["LQ_latent"] = (
+                self.encode_lq_latent(data_batch["LQ_video_or_image"]).contiguous().to(**self.tensor_kwargs)
+            )
+        if "degrade_sigma" not in data_batch and "LQ_latent" in data_batch:
+            B = data_batch["LQ_latent"].shape[0]
+            data_batch["degrade_sigma"] = torch.zeros(B, device=data_batch["LQ_latent"].device, dtype=torch.float32)
+
+        x0_key = self.config.input_data_key
+        if image_size is None and x0_key in data_batch:
+            x0_shape = data_batch[x0_key].shape
+            img_h, img_w = x0_shape[-2], x0_shape[-1]
+        else:
+            image_size = image_size or self.config.image_size
+            if isinstance(image_size, (list, tuple)):
+                img_h, img_w = int(image_size[0]), int(image_size[1])
+            else:
+                img_h = img_w = int(image_size)
+
+        # Determine shift: explicit arg > SD3-style dynamic_shift (if configured) > config default.
+        # The 4-step distilled sampler doesn't consume `shift` directly (it uses
+        # student_t_list), but we keep the precedence ladder symmetric with the
+        # non-distilled inference path in case future call sites read it.
+        if shift is None and self.config.dynamic_shift is not None:
+            import math
+
+            _ds = self.config.dynamic_shift
+            shift = _ds["base_shift"] * math.sqrt(max(img_h, img_w) / _ds["base_image_size_for_shift_calc"])
+
+        captions = data_batch[self.config.input_caption_key]
+        if isinstance(captions, str):
+            captions = [captions]
+        # The batch size is defined by the number of captions.
+        B = len(captions)
+        if self.config.use_fixed_prompt:
+            captions = [self.config.fixed_positive_prompt] * B
+        caption_embs, _ = self._encode_text_raw(captions)
+        caption_embs = caption_embs.to(**self.tensor_kwargs)
+
+        # Only hand the net the conditioning inputs selected by lq_condition_type.
+        lq_video_or_image = None
+        lq_latent = None
+        if self.config.lq_condition_type in ("image", "image_latent"):
+            lq_video_or_image = data_batch.get("LQ_video_or_image")
+            if lq_video_or_image is not None:
+                lq_video_or_image = lq_video_or_image.to(**self.tensor_kwargs)
+        if self.config.lq_condition_type in ("latent", "image_latent"):
+            lq_latent = data_batch.get("LQ_latent")
+            if lq_latent is not None:
+                lq_latent = lq_latent.to(**self.tensor_kwargs)
+
+        # Normalize degrade_sigma (tensor, sequence or scalar) to a float32 tensor of shape (B,).
+        sigma_val = data_batch.get("degrade_sigma", 0.0)
+        if isinstance(sigma_val, torch.Tensor):
+            degrade_sigma_tensor = sigma_val.to(device="cuda", dtype=torch.float32).reshape(-1)
+            if degrade_sigma_tensor.numel() == 1:
+                degrade_sigma_tensor = degrade_sigma_tensor.expand(B).contiguous()
+            assert degrade_sigma_tensor.shape == (B,), (
+                f"data_batch['degrade_sigma'] expected [B={B}], got {tuple(degrade_sigma_tensor.shape)}"
+            )
+        elif isinstance(sigma_val, (list, tuple)):
+            degrade_sigma_tensor = torch.tensor(sigma_val, device="cuda", dtype=torch.float32)
+            assert degrade_sigma_tensor.shape == (B,), (
+                f"data_batch['degrade_sigma'] expected length {B}, got {len(sigma_val)}"
+            )
+        else:
+            degrade_sigma_tensor = torch.full((B,), float(sigma_val), device="cuda", dtype=torch.float32)
+
+        gen = torch.Generator(device="cuda").manual_seed(int(seed))
+        noise = torch.randn(B, 3, img_h, img_w, device="cuda", generator=gen)
+
+        autocast_ctx = torch.autocast("cuda", dtype=self.autocast_dtype) if self.autocast_dtype else nullcontext()
+        net = self.net
+        net.eval()
+
+        effective_steps = num_steps if num_steps is not None else self.config.student_sample_steps
+
+        if effective_steps == 1:
+            # One-step path: a single net evaluation at student_timestep.
+            t_student = torch.full((B,), self.config.student_timestep, device="cuda", dtype=torch.float32)
+            t_student_scaled = t_student * self.fm_trainer.timescale
+            with autocast_ctx:
+                v_student = net(
+                    noise,
+                    t_student_scaled,
+                    caption_embs,
+                    lq_video_or_image=lq_video_or_image,
+                    lq_latent=lq_latent,
+                    degrade_sigma=degrade_sigma_tensor,
+                )
+            x0_student = self._velocity_to_x0(noise, v_student, t_student)
+        else:
+            t_list = self._get_t_list(device=torch.device("cuda"), num_steps=num_steps)
+            x0_student = self._student_sample_loop(
+                noise,
+                t_list,
+                caption_embs,
+                lq_video_or_image,
+                lq_latent,
+                degrade_sigma_tensor,
+                generator=gen,
+            )
+
+        return x0_student.clamp(-1, 1).unsqueeze(2)
+
+    # ---------------------------------------------------------------------
+    # Checkpoint helpers (only the student `net.` prefix matters at inference)
+    # ---------------------------------------------------------------------
+
+    def model_dict(self) -> dict:
+        """Return the sub-modules that make up a checkpoint: only the student net."""
+        return {"net": self.net}
+
+    def state_dict(self, *args, **kwargs):
+        """Return the student net's state dict with every key prefixed by "net.".
+
+        Positional and keyword arguments are accepted but ignored.
+        """
+        return self.net.state_dict(prefix="net.")
+
+    def load_state_dict(self, state_dict, strict=True, assign=False, **kwargs):
+        """Load student weights from a (possibly training-time) checkpoint.
+
+        Keys starting with "net." have the prefix stripped. Keys starting with
+        "net_ema.", "fake_score." or "discriminator." are dropped. Any other key
+        is passed through unchanged. The net itself is always loaded
+        non-strictly, and problems are logged rather than raised: missing
+        "lq_proj" keys are reported as expected, other missing keys are warned
+        about only when `strict` is true, and unexpected keys are always warned
+        about.
+
+        Returns:
+            The incompatible-keys result of `self.net.load_state_dict` (missing
+            keys, unexpected keys), so callers can inspect what was not loaded.
+        """
+        skipped_prefixes = ("net_ema.", "fake_score.", "discriminator.")
+        net_sd = OrderedDict()
+        for key, value in state_dict.items():
+            if key.startswith("net."):
+                net_sd[key[len("net.") :]] = value
+            elif key.startswith(skipped_prefixes):
+                continue
+            else:
+                net_sd[key] = value
+
+        incompatible = self.net.load_state_dict(net_sd, strict=False, assign=assign)
+        missing, unexpected = incompatible
+        if missing:
+            lq_missing = [k for k in missing if "lq_proj" in k]
+            other_missing = [k for k in missing if "lq_proj" not in k]
+            if lq_missing:
+                logger.info(f"Expected missing LQ keys ({len(lq_missing)} keys)")
+            if other_missing and strict:
+                logger.warning(f"Missing keys in net: {other_missing}")
+        if unexpected:
+            logger.warning(f"Unexpected keys in net: {unexpected}")
+        return incompatible
+
+    def on_train_start(self, memory_format=torch.preserve_format) -> None:
+        """Defer to the parent implementation unchanged."""
+        super().on_train_start(memory_format)
diff --git a/invokeai/backend/pid/_src/models/pid_model.py b/invokeai/backend/pid/_src/models/pid_model.py
new file mode 100644
index 00000000000..976c931e29d
--- /dev/null
+++ b/invokeai/backend/pid/_src/models/pid_model.py
@@ -0,0 +1,75 @@
+# PID (PixelDiT SR) model — inference subset.
+#
+# At inference the only thing this class adds on top of PixelDiTModel is the
+# frozen VAE (`vae_encoder`) used by `encode_lq_latent`. The training-time
+# degradation pipeline, LoRA injection, LPIPS loss, and training/validation
+# steps have all been removed.
+
+from __future__ import annotations
+
+import logging
+from typing import Any
+
+import attrs
+import torch
+from torch import Tensor
+
+from invokeai.backend.pid._ext.imaginaire.lazy_config import instantiate as lazy_instantiate
+from invokeai.backend.pid._ext.imaginaire.utils import misc
+from invokeai.backend.pid._src.models.pixeldit_model import PixelDiTModel, PixelDiTModelConfig
+
+logger = logging.getLogger(__name__)
+
+
+@attrs.define(slots=False)
+class PidModelConfig(PixelDiTModelConfig):
+ """Config for PidModel: PixelDiTModelConfig plus the LQ-conditioning settings."""
+
+ # "image" = LQ image only, "latent" = LQ latent only, "image_latent" = both.
+ lq_condition_type: str = "latent"
+
+ # Frozen VAE config for encoding LQ images to latent.
+ # When left as None, PidModel sets vae_encoder to None and logs a warning.
+ tokenizer: Any = None
+
+ # VAE latent channels (must match tokenizer.latent_ch).
+ # The match is only asserted when this value is > 0.
+ state_ch: int = 16
+
+ # Fixed prompt override (training convenience kept here so checkpoints that set
+ # use_fixed_prompt=True still load).
+ use_fixed_prompt: bool = False
+ fixed_positive_prompt: str = ""
+
+
+class PidModel(PixelDiTModel):
+    """PID (PixelDiT SR) inference model (frozen VAE + LQ-conditioned student).
+
+    On top of PixelDiTModel this class owns `vae_encoder`, the frozen VAE used
+    by `encode_lq_latent`. It is None when `config.tokenizer` is not set.
+    """
+
+    def __init__(self, config: PidModelConfig):
+        super().__init__(config)
+
+        if config.tokenizer is not None:
+            with misc.timer("PidModel: load_vae"):
+                # Imported lazily, only when a VAE is actually configured.
+                from invokeai.backend.pid._src.tokenizers.base_vae import BaseVAE
+
+                self.vae_encoder: BaseVAE = lazy_instantiate(config.tokenizer)
+                # A state_ch of 0 or less disables the channel-count sanity check.
+                if config.state_ch > 0:
+                    assert self.vae_encoder.latent_ch == config.state_ch, (
+                        f"latent_ch {self.vae_encoder.latent_ch} != state_ch {config.state_ch}"
+                    )
+        else:
+            self.vae_encoder = None
+            logger.warning("No VAE configured — LQ latent encoding disabled.")
+
+    @torch.no_grad()
+    def encode_lq_latent(self, lq_image: Tensor) -> Tensor:
+        """Encode an LQ image through the frozen VAE.
+
+        Args:
+            lq_image: [B, C, H_lq, W_lq] in [-1, 1]. A 4-D input gets a singleton
+                axis inserted at position 2 before encoding; other ranks are
+                passed to the VAE unchanged.
+
+        Returns:
+            LQ latent [B, z_dim, zH, zW]. If the VAE returns a 5-D latent, only
+            index 0 along axis 2 is kept.
+
+        Raises:
+            RuntimeError: if the model was built without a VAE
+                (`config.tokenizer` was None).
+        """
+        if self.vae_encoder is None:
+            raise RuntimeError(
+                "PidModel.encode_lq_latent() needs a VAE, but the model was built without one "
+                "(config.tokenizer is None)."
+            )
+        if lq_image.ndim == 4:
+            lq_image = lq_image.unsqueeze(2)
+        latent = self.vae_encoder.encode(lq_image)
+        if latent.ndim == 5:
+            latent = latent[:, :, 0, :, :]
+        return latent
diff --git a/invokeai/backend/pid/_src/models/pixeldit_model.py b/invokeai/backend/pid/_src/models/pixeldit_model.py
new file mode 100644
index 00000000000..168cd016be1
--- /dev/null
+++ b/invokeai/backend/pid/_src/models/pixeldit_model.py
@@ -0,0 +1,269 @@
+# PixelDiT T2I model — inference subset.
+#
+# Provides the bare minimum needed by PidDistillModel: net + frozen text
+# encoder + caption embedding helper + a flow-matching `timescale` field.
+# Training-time machinery (EMA, REPA, flow-matching trainer, training/validation
+# steps) has been removed.
+
+from __future__ import annotations
+
+import logging
+from typing import Any
+
+import attrs
+import torch
+import torch.nn as nn
+from torch import Tensor
+
+from invokeai.backend.pid._ext.imaginaire.lazy_config import instantiate as lazy_instantiate
+from invokeai.backend.pid._ext.imaginaire.model import ImaginaireModel
+from invokeai.backend.pid._ext.imaginaire.utils import misc
+from invokeai.backend.pid._src.utils.context_parallel import broadcast as cp_broadcast
+from invokeai.backend.pid._src.utils.context_parallel import robust_broadcast
+
+try:
+ from megatron.core import parallel_state
+except ImportError:
+ parallel_state = None # CP is opt-in; gracefully degrade when megatron is absent
+
+logger = logging.getLogger(__name__)
+
+
+@attrs.define(slots=False)
+class _EMAStubConfig:
+    """Minimal stub kept so that DCP ModelWrapper.state_dict() can read `config.ema.enabled`."""
+
+    # Defaults to False (EMA off).
+    enabled: bool = False
+    # NOTE(review): `rate` and `iteration_shift` are not read by any code visible in this
+    # file; presumably kept only so existing configs still deserialize — confirm.
+    rate: float = 0.1
+    iteration_shift: int = 0
+
+
+@attrs.define(slots=False)
+class PixelDiTModelConfig:
+    """Configuration consumed by ``PixelDiTModel``.
+
+    Fields without a usage comment are not read by the code visible in this file.
+    """
+
+    # Lazy config of the denoising network; instantiated in PixelDiTModel.__init__.
+    net: Any = None
+    # One of "float32", "float16", "bfloat16" (the keys of the dtype map in PixelDiTModel.__init__).
+    precision: str = "bfloat16"
+    ema: _EMAStubConfig = attrs.Factory(_EMAStubConfig)
+
+    input_data_key: str = "image"
+    input_caption_key: str = "caption"
+
+    # Must be a key of _TEXT_ENCODER_DICT.
+    text_encoder_name: str = "gemma-2-2b-it"
+    caption_channels: int = 2304
+    y_norm: bool = True
+    y_norm_scale_factor: float = 0.01
+    # Token budget per caption; also the sequence length returned by _encode_text_raw.
+    model_max_length: int = 300
+    # Lines joined with "\n" and prepended to every caption before tokenization.
+    chi_prompt: list = attrs.Factory(list)
+    # Lazy config of the conditioner; instantiated in PixelDiTModel.__init__.
+    conditioner: Any = None
+
+    # Flow matching: only `fm_timescale` is read at inference (network expects
+    # t * timescale as its scalar timestep input).
+    fm_timescale: float = 1000.0
+    logit_mean: float = 0.0
+    logit_std: float = 1.0
+    prediction_type: str = "velocity"
+
+    shift: float = 4.0
+    cfg_scale: float = 2.75
+    image_size: int = 1024
+    # Encoded once in PixelDiTModel.__init__ and cached as the null caption embedding.
+    negative_prompt: str = "low quality, worst quality, over-saturated, three legs, six fingers, cartoon, anime, cgi, low res, blurry, deformed, distortion, duplicated limbs, plastic skin, jpeg artifacts, watermark"
+    num_sample_steps: int = 50
+
+    # When set, PixelDiTModel.__init__ logs its "base_shift" and
+    # "base_image_size_for_shift_calc" entries, so both keys must be present.
+    dynamic_shift: dict | None = None
+
+
+# Maps the short `text_encoder_name` config value to the model id that
+# `_load_text_encoder` passes to `from_pretrained`.
+# NOTE(review): "gemma-2-2b-it" resolves to the Efficient-Large-Model mirror rather than
+# the google/ repository used by the other gemma entries — confirm this is intended.
+_TEXT_ENCODER_DICT = {
+    "gemma-2b": "google/gemma-2b",
+    "gemma-2b-it": "google/gemma-2b-it",
+    "gemma-2-2b": "google/gemma-2-2b",
+    "gemma-2-2b-it": "Efficient-Large-Model/gemma-2-2b-it",
+    "gemma-2-9b": "google/gemma-2-9b",
+    "gemma-2-9b-it": "google/gemma-2-9b-it",
+    "Qwen2-0.5B-Instruct": "Qwen/Qwen2-0.5B-Instruct",
+    "Qwen2-1.5B-Instruct": "Qwen/Qwen2-1.5B-Instruct",
+}
+
+
+def _load_text_encoder(name: str, device: str = "cuda"):
+    """Load the tokenizer and the text encoder registered under ``name``.
+
+    Args:
+        name: Key of ``_TEXT_ENCODER_DICT``.
+        device: Device the encoder is moved to.
+
+    Returns:
+        ``(tokenizer, text_encoder)``. The tokenizer pads on the right; the encoder is
+        the ``get_decoder()`` module of the causal LM loaded in bfloat16, switched to
+        eval mode with gradients disabled.
+
+    Raises:
+        AssertionError: If ``name`` is not a key of ``_TEXT_ENCODER_DICT``.
+    """
+    import torch.distributed as dist
+    from transformers import AutoModelForCausalLM, AutoTokenizer
+
+    assert name in _TEXT_ENCODER_DICT, f"Unsupported text encoder: {name}"
+    model_id = _TEXT_ENCODER_DICT[name]
+
+    is_distributed = dist.is_initialized()
+    is_rank0 = (not is_distributed) or (dist.get_rank() == 0)
+
+    # Non-zero ranks park here until rank 0 reaches its barrier below, i.e. until rank 0
+    # has finished loading (presumably so the download/cache is populated once — confirm).
+    if is_distributed and not is_rank0:
+        dist.barrier()
+
+    tokenizer = AutoTokenizer.from_pretrained(model_id)
+    tokenizer.padding_side = "right"
+    text_encoder = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16).get_decoder().to(device)
+    text_encoder.eval()
+    text_encoder.requires_grad_(False)
+
+    # Rank 0 releases the waiting ranks only after its own load succeeded.
+    # NOTE(review): if the load above raises on rank 0, this barrier is never reached and
+    # the other ranks stay blocked in theirs.
+    if is_distributed and is_rank0:
+        dist.barrier()
+
+    return tokenizer, text_encoder
+
+
+class _FlowMatchingTimescale(nn.Module):
+    """Tiny stand-in for the deleted `FlowMatchingTrainer` — only `timescale` is read.
+
+    Args:
+        timescale: Value stored unchanged as ``self.timescale`` (a plain attribute,
+            not a parameter or buffer).
+    """
+
+    def __init__(self, timescale: float):
+        super().__init__()
+        self.timescale = timescale
+
+
+class PixelDiTModel(ImaginaireModel):
+    """Inference-only PixelDiT text-to-image model.
+
+    Holds the denoising network (``self.net``), a tokenizer and text encoder attached
+    outside of ``nn.Module`` registration, the encoded negative prompt
+    (``self._null_caption_embs``), an ``fm_trainer`` shim exposing ``timescale`` and the
+    conditioner built from ``config.conditioner``.
+
+    NOTE(review): the device is hard-coded to "cuda" throughout this class.
+    """
+
+    SUPPORTS_CONTEXT_PARALLEL: bool = False
+
+    def __init__(self, config: PixelDiTModelConfig):
+        """Build the network, text encoder, null caption embedding and conditioner.
+
+        Args:
+            config: Model configuration.
+
+        Raises:
+            KeyError: If ``config.precision`` is not "float32", "float16" or "bfloat16",
+                or if ``config.dynamic_shift`` lacks one of the logged keys.
+            AssertionError: If ``config.text_encoder_name`` is unknown to
+                ``_load_text_encoder``.
+        """
+        super().__init__()
+        self.config = config
+
+        if config.dynamic_shift is not None:
+            _ds = config.dynamic_shift
+            logger.info(
+                f"PixelDiT dynamic shift: base_shift={_ds['base_shift']} "
+                f"base_image_size={_ds['base_image_size_for_shift_calc']}"
+            )
+
+        _dtype_map = {"float32": torch.float32, "float16": torch.float16, "bfloat16": torch.bfloat16}
+        requested_dtype = _dtype_map[config.precision]
+        # Both branches keep `self.precision` at float32; a reduced precision only selects
+        # the autocast dtype.
+        if requested_dtype != torch.float32:
+            self.autocast_dtype = requested_dtype
+            self.precision = torch.float32
+        else:
+            self.autocast_dtype = None
+            self.precision = torch.float32
+        self.tensor_kwargs = {"device": "cuda", "dtype": self.precision}
+
+        with misc.timer("PixelDiTModel: build_net"):
+            self.net = lazy_instantiate(config.net)
+            self.net = self.net.to(device="cuda", dtype=torch.float32)
+            # NOTE(review): gradients are enabled on the network although this file is
+            # described as an inference subset — confirm this is still needed.
+            self.net.requires_grad_(True)
+            if hasattr(self.net, "init_weights"):
+                self.net.init_weights()
+            logger.info(f"PixDiT_T2I params: {sum(p.numel() for p in self.net.parameters()):,}")
+
+        # Frozen text encoder. Use object.__setattr__ so DCP / nn.Module don't try to
+        # register it as a child / save it in state_dict.
+        with misc.timer("PixelDiTModel: load_text_encoder"):
+            _tokenizer, _text_encoder = _load_text_encoder(config.text_encoder_name, device="cuda")
+            object.__setattr__(self, "tokenizer", _tokenizer)
+            object.__setattr__(self, "text_encoder", _text_encoder)
+            self._chi_prompt_str = "\n".join(config.chi_prompt) if config.chi_prompt else ""
+            self._num_chi_tokens = len(self.tokenizer.encode(self._chi_prompt_str)) if self._chi_prompt_str else 0
+            # `[0]` keeps the embeddings and discards the mask returned by _encode_text_raw.
+            self._null_caption_embs = self._encode_text_raw([config.negative_prompt if config.negative_prompt else ""])[
+                0
+            ]
+
+        # Tiny flow-matching shim: only `timescale` is consumed by inference.
+        self.fm_trainer = _FlowMatchingTimescale(config.fm_timescale)
+
+        self.conditioner = lazy_instantiate(config.conditioner)
+        logger.info(f"PixelDiT conditioner: {self.conditioner}")
+
+    # ---------------------------------------------------------------------
+    # Text encoding
+    # ---------------------------------------------------------------------
+
+    @torch.no_grad()
+    def _encode_text_raw(self, captions: list[str]) -> tuple[Tensor, Tensor]:
+        """Tokenize and encode captions with the text encoder.
+
+        When a chi prompt is configured it is prepended to every caption and the
+        tokenizer budget becomes ``_num_chi_tokens + model_max_length - 2``.
+
+        Args:
+            captions: Raw caption strings.
+
+        Returns:
+            ``(caption_embs, emb_masks)``, both restricted along dim 1 to the first
+            position plus the last ``model_max_length - 1`` positions, i.e.
+            ``model_max_length`` entries per caption.
+        """
+        if self._chi_prompt_str:
+            prompts_all = [self._chi_prompt_str + cap for cap in captions]
+            max_length_all = self._num_chi_tokens + self.config.model_max_length - 2
+        else:
+            prompts_all = captions
+            max_length_all = self.config.model_max_length
+
+        caption_token = self.tokenizer(
+            prompts_all,
+            max_length=max_length_all,
+            padding="max_length",
+            truncation=True,
+            return_tensors="pt",
+        ).to("cuda")
+
+        caption_embs = self.text_encoder(caption_token.input_ids, caption_token.attention_mask)[0]
+
+        # Index 0 followed by the trailing model_max_length - 1 positions.
+        select_index = [0] + list(range(-self.config.model_max_length + 1, 0))
+        caption_embs = caption_embs[:, select_index]
+        emb_masks = caption_token.attention_mask[:, select_index]
+        return caption_embs, emb_masks
+
+    def _normalize_image(self, img: Tensor) -> Tensor:
+        """Convert an image tensor to float, rescaling by a value-range heuristic.
+
+        * uint8 input, or any value above 1.0: ``img / 127.5 - 1``.
+        * otherwise, no negative values: ``img * 2 - 1``.
+        * otherwise: returned as float without rescaling.
+
+        NOTE(review): a non-negative tensor whose maximum is <= 1 always takes the
+        second rule, even if it was already in [-1, 1].
+        """
+        if img.dtype == torch.uint8:
+            return img.float() / 127.5 - 1.0
+        elif img.max() > 1.0:
+            return img.float() / 127.5 - 1.0
+        else:
+            if img.min() >= 0:
+                return img.float() * 2.0 - 1.0
+            return img.float()
+
+    # ---------------------------------------------------------------------
+    # Context-parallel helpers (no-op when megatron CP isn't initialized).
+    # ---------------------------------------------------------------------
+
+    @staticmethod
+    def get_context_parallel_group():
+        """Return megatron's context-parallel group, or None if megatron is absent or uninitialized."""
+        if parallel_state is not None and parallel_state.is_initialized():
+            return parallel_state.get_context_parallel_group()
+        return None
+
+    def _maybe_enable_cp_on_nets(self, nets: list) -> None:
+        """Enable or disable context parallelism on each non-None net.
+
+        With no group, or a group of size <= 1, nets that report
+        ``is_context_parallel_enabled`` are switched off; otherwise nets that offer
+        ``enable_context_parallel`` are switched on with the group.
+        """
+        cp_group = self.get_context_parallel_group()
+        for net in nets:
+            if net is None:
+                continue
+            if cp_group is None or cp_group.size() <= 1:
+                if hasattr(net, "disable_context_parallel") and getattr(net, "is_context_parallel_enabled", False):
+                    net.disable_context_parallel()
+            else:
+                if hasattr(net, "enable_context_parallel"):
+                    net.enable_context_parallel(cp_group)
+
+    def _broadcast_tensor_for_cp(self, t: Tensor | None) -> Tensor | None:
+        """Broadcast ``t`` from the lowest rank of the CP group; return it unchanged when CP is inactive."""
+        cp_group = self.get_context_parallel_group()
+        if t is None or cp_group is None or cp_group.size() <= 1:
+            return t
+        from torch.distributed import get_process_group_ranks
+
+        src = min(get_process_group_ranks(cp_group))
+        return robust_broadcast(t.contiguous(), src=src, pg=cp_group)
+
+    def _broadcast_object_for_cp(self, obj):
+        """Pass ``obj`` and the CP group (possibly None) to the ``cp_broadcast`` helper."""
+        return cp_broadcast(obj, self.get_context_parallel_group())
+
+    # ---------------------------------------------------------------------
+    # Checkpoint helpers — the distill subclass overrides these for its
+    # net.* / fake_score.* / discriminator.* prefix routing.
+    # ---------------------------------------------------------------------
+
+    def state_dict(self, *args, **kwargs):
+        """Return only the network's weights, keys prefixed with "net.".
+
+        Positional and keyword arguments are accepted but ignored.
+        """
+        return self.net.state_dict(prefix="net.")
+
+    def load_state_dict(self, state_dict, strict=True, assign=False, **kwargs):
+        """Load network weights from either supported checkpoint layout.
+
+        * Only "core.*" keys (no "net.*" keys): the "core." prefix is stripped and the
+          weights are loaded with ``strict=False`` regardless of ``strict``.
+        * Otherwise: "net.*" keys are stripped of the prefix and loaded with the
+          caller's ``strict``; if there are none, nothing is loaded.
+
+        NOTE(review): unlike ``nn.Module.load_state_dict`` this returns None, so
+        missing/unexpected keys are not reported to the caller.
+        """
+        has_core_keys = any(k.startswith("core.") for k in state_dict)
+        has_net_keys = any(k.startswith("net.") for k in state_dict)
+
+        if has_core_keys and not has_net_keys:
+            logger.info("Loading original PixelDiT checkpoint (core.* prefix)")
+            net_sd = {}
+            for k, v in state_dict.items():
+                # NOTE(review): a key equal to "pos_embed" lacks the "core." prefix, so the
+                # check below would drop it anyway — was "core.pos_embed" intended?
+                if k == "pos_embed":
+                    continue
+                if k.startswith("core."):
+                    net_sd[k[len("core.") :]] = v
+            self.net.load_state_dict(net_sd, strict=False, assign=assign)
+        else:
+            # NOTE(review): the "net_ema." exclusion can never trigger, because such keys
+            # do not start with "net.".
+            _net_sd = {
+                k[len("net.") :]: v
+                for k, v in state_dict.items()
+                if k.startswith("net.") and not k.startswith("net_ema.")
+            }
+            if _net_sd:
+                self.net.load_state_dict(_net_sd, strict=strict, assign=assign)
diff --git a/invokeai/backend/pid/_src/modules/__init__.py b/invokeai/backend/pid/_src/modules/__init__.py
new file mode 100644
index 00000000000..7ab23eecabc
--- /dev/null
+++ b/invokeai/backend/pid/_src/modules/__init__.py
@@ -0,0 +1,15 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
diff --git a/invokeai/backend/pid/_src/modules/conditioner.py b/invokeai/backend/pid/_src/modules/conditioner.py
new file mode 100644
index 00000000000..84629c53496
--- /dev/null
+++ b/invokeai/backend/pid/_src/modules/conditioner.py
@@ -0,0 +1,563 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+from __future__ import annotations
+
+from abc import ABC, abstractmethod
+from collections import defaultdict
+from contextlib import nullcontext
+from dataclasses import dataclass, fields
+from typing import Any, Dict, List, Optional, Tuple, TypeVar, Union
+
+import torch
+import torch.nn as nn
+from torch.distributed import ProcessGroup
+
+from invokeai.backend.pid._ext.imaginaire.lazy_config import instantiate
+from invokeai.backend.pid._ext.imaginaire.utils import log
+from invokeai.backend.pid._ext.imaginaire.utils.count_params import count_params, disabled_train
+from invokeai.backend.pid._src.utils.context_parallel import broadcast
+
+
+def batch_mul(x, y):
+    """Multiply two tensors whose leading axes agree, right-padding the lower-rank one.
+
+    Every axis the operands share must have equal size (AssertionError otherwise).
+    The operand with fewer dimensions then receives trailing singleton axes until
+    both ranks match, and the element-wise product is returned.
+    """
+    rank_x, rank_y = x.ndim, y.ndim
+    # zip() stops at the shorter shape, i.e. it walks exactly the shared leading axes.
+    for axis, (size_x, size_y) in enumerate(zip(x.shape, y.shape)):
+        assert size_x == size_y, f"Dimensions not equal at axis {axis}"
+    padding = (1,) * abs(rank_x - rank_y)
+    if rank_x < rank_y:
+        x = x.reshape(x.shape + padding)
+    elif rank_y < rank_x:
+        y = y.reshape(y.shape + padding)
+    return x * y
+
+
+T = TypeVar("T", bound="BaseCondition")
+
+
+def broadcast_condition(condition: BaseCondition, process_group: Optional[ProcessGroup] = None) -> BaseCondition:
+    """Return a copy of ``condition`` whose non-None fields went through ``broadcast``.
+
+    A condition already flagged as broadcasted is returned as is. Otherwise every
+    field (underscore-prefixed ones included) that is not None is passed to
+    ``broadcast`` with ``process_group``, and a new instance of the same class is
+    built with ``_is_broadcasted`` set to True.
+    """
+    if condition.is_broadcasted:
+        return condition
+
+    synced = {}
+    for name, value in condition.to_dict(skip_underscore=False).items():
+        synced[name] = value if value is None else broadcast(value, process_group)
+    # Overwrites the (also broadcast) flag so the result is never re-broadcast.
+    synced["_is_broadcasted"] = True
+    return type(condition)(**synced)
+
+
+@dataclass(frozen=True)
+class BaseCondition(ABC):  # noqa: B024 # upstream marker base class — no abstract methods by design
+    """Immutable base for condition containers.
+
+    Attributes:
+        _is_broadcasted: Internal flag recording that the condition has already
+            been broadcast, so it is not broadcast a second time.
+    """
+
+    _is_broadcasted: bool = False
+
+    def to_dict(self, skip_underscore: bool = True) -> Dict[str, Any]:
+        """Return the dataclass fields as a ``{name: value}`` dictionary.
+
+        Args:
+            skip_underscore: When True (default), fields whose name starts with
+                "_" are left out.
+
+        Returns:
+            Dictionary of field names to their current values, in field order.
+        """
+        result: Dict[str, Any] = {}
+        for field in fields(self):
+            if skip_underscore and field.name.startswith("_"):
+                continue
+            result[field.name] = getattr(self, field.name)
+        return result
+
+    @property
+    def is_broadcasted(self) -> bool:
+        """Whether this condition has already been broadcast."""
+        return self._is_broadcasted
+
+    def broadcast(self, process_group: torch.distributed.ProcessGroup) -> BaseCondition:
+        """Broadcast this condition over ``process_group``.
+
+        Args:
+            process_group: Process group handed to ``broadcast_condition``.
+
+        Returns:
+            ``self`` when already broadcast, otherwise the new instance produced by
+            ``broadcast_condition``.
+        """
+        return self if self.is_broadcasted else broadcast_condition(self, process_group)
+
+
+@dataclass(frozen=True)
+class PixelDiTCondition(BaseCondition):
+    """Condition for PixelDiT T2I models.
+
+    caption: list[str] — raw caption strings (after dropout). The model's internal
+        text encoder (e.g. Gemma-2-2b-it) handles encoding.
+    """
+
+    # Defaults to None when the conditioner produced no "caption" entry.
+    caption: Optional[list] = None
+
+
+@dataclass(frozen=True)
+class PidCondition(BaseCondition):
+    """Condition for PID (PixelDiT SR) models.
+
+    caption: list[str] — raw caption strings (after dropout).
+    lq_video_or_image: [B, 3, H_lq, W_lq] — LQ image at original low resolution.
+    lq_latent: [B, z_dim, zH, zW] — LQ VAE latent.
+
+    Every field is optional and defaults to None.
+    """
+
+    caption: Optional[list] = None
+    lq_video_or_image: Optional[torch.Tensor] = None
+    lq_latent: Optional[torch.Tensor] = None
+
+
+class AbstractEmbModel(nn.Module):
+    """Base class for embedders managed by a conditioner.
+
+    Stores four pieces of bookkeeping (``is_trainable``, ``dropout_rate``,
+    ``input_key``, ``is_return_dict``) behind properties, and provides a default
+    per-sample dropout plus a human-readable summary.
+    """
+
+    def __init__(self):
+        super().__init__()
+
+        self._is_trainable = None
+        self._dropout_rate = None
+        self._input_key = None
+        self._return_dict = False
+
+    # -- is_trainable ------------------------------------------------------
+
+    @property
+    def is_trainable(self) -> bool:
+        return self._is_trainable
+
+    @is_trainable.setter
+    def is_trainable(self, value: bool):
+        self._is_trainable = value
+
+    @is_trainable.deleter
+    def is_trainable(self):
+        del self._is_trainable
+
+    # -- dropout_rate ------------------------------------------------------
+
+    @property
+    def dropout_rate(self) -> Union[float, torch.Tensor]:
+        return self._dropout_rate
+
+    @dropout_rate.setter
+    def dropout_rate(self, value: Union[float, torch.Tensor]):
+        self._dropout_rate = value
+
+    @dropout_rate.deleter
+    def dropout_rate(self):
+        del self._dropout_rate
+
+    # -- input_key ---------------------------------------------------------
+
+    @property
+    def input_key(self) -> str:
+        return self._input_key
+
+    @input_key.setter
+    def input_key(self, value: str):
+        self._input_key = value
+
+    @input_key.deleter
+    def input_key(self):
+        del self._input_key
+
+    # -- is_return_dict ----------------------------------------------------
+
+    @property
+    def is_return_dict(self) -> bool:
+        return self._return_dict
+
+    @is_return_dict.setter
+    def is_return_dict(self, value: bool):
+        self._return_dict = value
+
+    @is_return_dict.deleter
+    def is_return_dict(self):
+        del self._return_dict
+
+    def random_dropout_input(
+        self, in_tensor: torch.Tensor, dropout_rate: Optional[float] = None, key: Optional[str] = None
+    ) -> torch.Tensor:
+        """Zero out whole samples of ``in_tensor`` with probability ``dropout_rate``.
+
+        Args:
+            in_tensor: Tensor whose first axis indexes the samples.
+            dropout_rate: Drop probability; falls back to ``self.dropout_rate`` when None.
+            key: Unused.
+
+        Returns:
+            ``in_tensor`` multiplied by a per-sample Bernoulli keep mask.
+        """
+        del key
+        if dropout_rate is None:
+            dropout_rate = self.dropout_rate
+        keep_prob = (1.0 - dropout_rate) * torch.ones(in_tensor.shape[0])
+        keep_mask = torch.bernoulli(keep_prob).type_as(in_tensor)
+        return batch_mul(keep_mask, in_tensor)
+
+    def details(self) -> str:
+        """Extra text appended to ``summary()``; empty by default."""
+        return ""
+
+    def summary(self) -> str:
+        """Return a multi-line description: class name, input key, parameter count, flags and details."""
+        input_key = self.input_key
+        if input_key is None:
+            # Fall back to an `input_keys` attribute when one exists.
+            input_key = getattr(self, "input_keys", None)
+        pieces = [
+            f"{self.__class__.__name__} ",
+            f"\n\tinput key: {input_key}",
+            f"\n\tParam count: {count_params(self, False)} ",
+            f"\n\tTrainable: {self.is_trainable}",
+            f"\n\tDropout rate: {self.dropout_rate}",
+            f"\n\t{self.details()}",
+        ]
+        return "".join(pieces)
+
+
+class CaptionStringDrop(AbstractEmbModel):
+    """Pass-through embedder for raw caption strings with empty-string dropout.
+
+    The captions are not encoded here: ``forward`` only wraps them under the output
+    key, so that a downstream text encoder can embed them. Dropout replaces a
+    caption with "" instead of zeroing a tensor.
+
+    Args:
+        input_key: key in data_batch containing caption strings (default: "caption")
+        output_key: key in condition output (default: "caption")
+        dropout_rate: probability of replacing caption with "" (for CFG training)
+    """
+
+    def __init__(self, input_key: str = "caption", output_key: str = "caption", dropout_rate: float = 0.0):
+        super().__init__()
+        self._input_key = input_key
+        self._dropout_rate = dropout_rate
+        self._output_key = output_key
+
+    def forward(self, captions):
+        """Return ``{output_key: captions}``, wrapping a lone string into a one-element list."""
+        return {self._output_key: [captions] if isinstance(captions, str) else captions}
+
+    def random_dropout_input(self, in_data, dropout_rate=None, key=None):
+        """Independently replace each caption with "" with probability ``dropout_rate``.
+
+        None is passed through, a single string is treated as a one-element list, and
+        a rate <= 0 returns the (normalized) captions untouched.
+        """
+        del key
+        import random as _random
+
+        if in_data is None:
+            return None
+        # A batch of one may arrive as a bare string rather than a list.
+        captions = [in_data] if isinstance(in_data, str) else in_data
+        if dropout_rate is None:
+            dropout_rate = self.dropout_rate
+        if dropout_rate <= 0:
+            return captions
+        kept = []
+        for cap in captions:
+            kept.append("" if _random.random() < dropout_rate else cap)
+        return kept
+
+    def details(self) -> str:
+        return f"Output key: [{self._output_key}]"
+
+
+class GeneralConditioner(nn.Module, ABC):
+    """Abstract container that runs a set of named embedders over a data batch.
+
+    Each keyword argument is an embedder config; the instantiated embedders are kept
+    in ``self.embedders``. Subclasses implement ``forward`` to wrap the merged
+    embedder outputs into their own condition type.
+
+    Attributes:
+        KEY2DIM (dict): Output key -> axis along which outputs sharing that key are
+            concatenated (keys not listed use the last axis).
+        embedders (nn.ModuleDict): The instantiated embedders, by name.
+    """
+
+    KEY2DIM = {"crossattn_emb": 1}
+
+    def __init__(self, **emb_models: Union[List, Any]):
+        super().__init__()
+        self.embedders = nn.ModuleDict()
+        for index, (name, cfg) in enumerate(emb_models.items()):
+            module = instantiate(cfg)
+            module.is_trainable = getattr(cfg, "is_trainable", True)
+            module.dropout_rate = getattr(cfg, "dropout_rate", 0.0)
+            if not module.is_trainable:
+                # Freeze: train() becomes a no-op, parameters stop tracking gradients.
+                module.train = disabled_train
+                for weight in module.parameters():
+                    weight.requires_grad = False
+                module.eval()
+
+            log.info(f"Initialized embedder #{index}-{name}: \n {module.summary()}")
+            self.embedders[name] = module
+
+    @abstractmethod
+    def forward(
+        self,
+        batch: Dict,
+        override_dropout_rate: Optional[Dict[str, float]] = None,
+    ) -> Any:
+        """Build the subclass-specific condition object from ``batch``."""
+        raise NotImplementedError
+
+    def _forward(
+        self,
+        batch: Dict,
+        override_dropout_rate: Optional[Dict[str, float]] = None,
+    ) -> Dict:
+        """Run every embedder on ``batch`` and concatenate outputs that share a key.
+
+        Parameters:
+            batch (Dict): Input data; each embedder reads the entry (or entries) named
+                by its ``input_key``.
+            override_dropout_rate (Optional[Dict[str, float]]): Per-embedder dropout
+                rates replacing the embedders' own; every name must be a known embedder.
+
+        Returns:
+            Dict: Output key -> tensor concatenated along ``KEY2DIM[key]`` (last axis
+            for unlisted keys), in embedder order.
+        """
+        overrides = {} if override_dropout_rate is None else override_dropout_rate
+
+        # Reject overrides that do not name a registered embedder.
+        for name in overrides.keys():
+            assert name in self.embedders, f"invalid name found {name}"
+
+        collected = defaultdict(list)
+        for name, embedder in self.embedders.items():
+            rate = overrides.get(name, None)
+            grad_ctx = nullcontext if embedder.is_trainable else torch.no_grad
+            with grad_ctx():
+                key = embedder.input_key
+                if isinstance(key, str):
+                    emb_out = embedder(embedder.random_dropout_input(batch[key], rate))
+                elif isinstance(key, list):
+                    # Missing entries are forwarded as None rather than raising.
+                    dropped = [embedder.random_dropout_input(batch.get(k), rate, k) for k in key]
+                    emb_out = embedder(*dropped)
+                else:
+                    raise KeyError(
+                        f"Embedder '{embedder.__class__.__name__}' requires an 'input_key' attribute to be defined as either a string or list of strings"
+                    )
+            for out_key, value in emb_out.items():
+                collected[out_key].append(value)
+
+        merged = {}
+        for out_key, parts in collected.items():
+            merged[out_key] = torch.cat(parts, dim=self.KEY2DIM.get(out_key, -1))
+        return merged
+
+    def get_condition_uncondition(
+        self,
+        data_batch: Dict,
+    ) -> Tuple[Any, Any]:
+        """Produce the (conditioned, unconditioned) pair for ``data_batch``.
+
+        The conditioned pass forces every dropout rate to 0. The unconditioned pass
+        uses rate 1 for embedders whose configured ``dropout_rate`` exceeds 1e-4 and
+        rate 0 for the rest, so embedders configured without dropout are kept.
+
+        Returns:
+            Tuple[Any, Any]: ``(condition, un_condition)`` as returned by ``forward``.
+        """
+        keep_all = dict.fromkeys(self.embedders, 0.0)
+        drop_droppable = {
+            name: 1.0 if embedder.dropout_rate > 1e-4 else 0.0 for name, embedder in self.embedders.items()
+        }
+
+        condition: Any = self(data_batch, override_dropout_rate=keep_all)
+        un_condition: Any = self(data_batch, override_dropout_rate=drop_droppable)
+        return condition, un_condition
+
+
+class PixelDiTConditioner(GeneralConditioner):
+    """Conditioner for PixelDiT T2I models; ``forward`` returns a ``PixelDiTCondition``.
+
+    Embedder outputs are merged as plain dictionary entries instead of being
+    concatenated with ``torch.cat``, because caption outputs are lists of strings
+    rather than tensors.
+
+    Embedders typically include:
+        - caption: CaptionStringDrop (raw string with empty-string dropout for CFG)
+    """
+
+    def _forward(
+        self,
+        batch: Dict,
+        override_dropout_rate: Optional[Dict[str, float]] = None,
+    ) -> Dict:
+        """Run each embedder on ``batch[embedder.input_key]`` and merge the output dicts.
+
+        Later embedders overwrite earlier ones on key collisions; no concatenation
+        takes place.
+        """
+        overrides = {} if override_dropout_rate is None else override_dropout_rate
+        merged = {}
+        for name, embedder in self.embedders.items():
+            grad_ctx = nullcontext if embedder.is_trainable else torch.no_grad
+            with grad_ctx():
+                raw = batch[embedder.input_key]
+                dropped = embedder.random_dropout_input(raw, overrides.get(name, None))
+                merged.update(embedder(dropped))
+        return merged
+
+    def forward(
+        self,
+        batch: Dict,
+        override_dropout_rate: Optional[Dict[str, float]] = None,
+    ) -> PixelDiTCondition:
+        """Wrap the merged embedder outputs in a ``PixelDiTCondition``."""
+        return PixelDiTCondition(**self._forward(batch, override_dropout_rate))
+
+    def get_condition_uncondition(self, data_batch: Dict) -> Tuple[PixelDiTCondition, PixelDiTCondition]:
+        """Returns (condition, uncondition) pair for CFG inference.
+
+        The condition uses dropout rate 0 for every embedder, the uncondition uses
+        rate 1 for every embedder, whatever their configured rates are.
+        """
+
+        def _run(rate: float) -> PixelDiTCondition:
+            return self(data_batch, override_dropout_rate=dict.fromkeys(self.embedders, rate))
+
+        condition = _run(0.0)
+        uncondition = _run(1.0)
+        return condition, uncondition
+
+
+# =============================================================================
+# PID (PixelDiT SR) — condition, embedder, and conditioner
+# =============================================================================
+
+
+class LQTensorDrop(AbstractEmbModel):
+    """Pass-through embedder for LQ tensors (image or latent) with per-sample zero dropout.
+
+    A dropped sample is multiplied by zero, so the tensor keeps its shape. Dropout
+    can be coupled across instances: a primary instance draws the keep mask and
+    stores it on the class, and non-primary instances reuse that stored mask.
+
+    Args:
+        input_key: key in data_batch (e.g. "LQ_video_or_image" or "LQ_latent").
+        output_key: key in condition output (e.g. "lq_video_or_image" or "lq_latent").
+        dropout_rate: probability of zeroing out the tensor (for CFG training).
+        is_primary: if True, this embedder generates the shared dropout mask.
+            If False, it reuses the mask from the primary embedder.
+    """
+
+    # Class-level shared mask for coupled dropout (reset each forward pass)
+    _shared_lq_keep_mask: Optional[torch.Tensor] = None
+
+    def __init__(
+        self,
+        input_key: str = "LQ_video_or_image",
+        output_key: str = "lq_video_or_image",
+        dropout_rate: float = 0.0,
+        is_primary: bool = True,
+    ):
+        super().__init__()
+        self._input_key = input_key
+        self._dropout_rate = dropout_rate
+        self._output_key = output_key
+        self._is_primary = is_primary
+
+    def forward(self, element: torch.Tensor) -> Dict[str, torch.Tensor]:
+        """Return ``element`` unchanged under the configured output key."""
+        return {self._output_key: element}
+
+    def random_dropout_input(
+        self, in_tensor: torch.Tensor, dropout_rate: Optional[float] = None, key: Optional[str] = None
+    ) -> torch.Tensor:
+        """Zero whole samples of ``in_tensor`` according to the (possibly shared) keep mask.
+
+        With a rate <= 0 or a None input the input is returned untouched, and a
+        primary instance additionally clears the shared mask. Otherwise a primary
+        instance samples a fresh per-sample mask and publishes it, while a
+        non-primary instance reuses the published mask, sampling its own only when
+        none is available.
+        """
+        del key
+        if dropout_rate is None:
+            dropout_rate = self.dropout_rate
+        if dropout_rate <= 0 or in_tensor is None:
+            if self._is_primary:
+                LQTensorDrop._shared_lq_keep_mask = None
+            return in_tensor
+
+        batch_size = in_tensor.shape[0]
+
+        def _sample_mask() -> torch.Tensor:
+            return torch.bernoulli((1.0 - dropout_rate) * torch.ones(batch_size, device=in_tensor.device))
+
+        if self._is_primary:
+            keep_mask = _sample_mask()
+            LQTensorDrop._shared_lq_keep_mask = keep_mask
+        else:
+            keep_mask = LQTensorDrop._shared_lq_keep_mask
+            if keep_mask is None:
+                # Primary has not published a mask: fall back to an independent draw.
+                keep_mask = _sample_mask()
+
+        # Shape (B, 1, ..., 1) so the mask broadcasts over all non-batch axes.
+        broadcast_shape = (batch_size,) + (1,) * (in_tensor.dim() - 1)
+        return keep_mask.view(broadcast_shape).type_as(in_tensor) * in_tensor
+
+    def details(self) -> str:
+        return f"Output key: {self._output_key}, primary: {self._is_primary}"
+
+
+class PidConditioner(PixelDiTConditioner):
+    """Conditioner for PID (PixelDiT SR) models; ``forward`` returns a ``PidCondition``.
+
+    Handles caption strings (CaptionStringDrop) + LQ tensors (LQTensorDrop).
+    LQ image and LQ latent share coupled dropout: when one is dropped, both are.
+
+    ``get_condition_uncondition`` is overridden here so that the uncondition
+    respects each embedder's configured ``dropout_rate``: an embedder configured
+    with rate 0 (e.g. the caption) is never dropped, only the others are.
+
+    Embedders typically include:
+        - caption: CaptionStringDrop (raw string dropout)
+        - lq_video_or_image: LQTensorDrop (primary, generates shared mask)
+        - lq_latent: LQTensorDrop (secondary, reuses shared mask)
+    """
+
+    def _forward(
+        self,
+        batch: Dict,
+        override_dropout_rate: Optional[Dict[str, float]] = None,
+    ) -> Dict:
+        """Clear the shared LQ keep mask, then run the embedders exactly as the parent class does."""
+        # A mask left over from a previous pass must not leak into this one.
+        LQTensorDrop._shared_lq_keep_mask = None
+        return super()._forward(batch, override_dropout_rate)
+
+    def forward(
+        self,
+        batch: Dict,
+        override_dropout_rate: Optional[Dict[str, float]] = None,
+    ) -> PidCondition:
+        """Wrap the merged embedder outputs in a ``PidCondition``."""
+        return PidCondition(**self._forward(batch, override_dropout_rate))
+
+    def get_condition_uncondition(self, data_batch: Dict) -> Tuple[PidCondition, PidCondition]:
+        """Returns (condition, uncondition) pair for CFG inference.
+
+        The condition uses rate 0 everywhere. The uncondition uses rate 1 only for
+        embedders whose configured ``dropout_rate`` exceeds 1e-4; the others stay at 0.
+        """
+        keep_all = dict.fromkeys(self.embedders, 0.0)
+        drop_droppable = {
+            name: 1.0 if embedder.dropout_rate > 1e-4 else 0.0 for name, embedder in self.embedders.items()
+        }
+
+        condition = self(data_batch, override_dropout_rate=keep_all)
+        uncondition = self(data_batch, override_dropout_rate=drop_droppable)
+        return condition, uncondition
diff --git a/invokeai/backend/pid/_src/networks/__init__.py b/invokeai/backend/pid/_src/networks/__init__.py
new file mode 100644
index 00000000000..e69de29bb2d
diff --git a/invokeai/backend/pid/_src/networks/lq_projection_2d.py b/invokeai/backend/pid/_src/networks/lq_projection_2d.py
new file mode 100644
index 00000000000..b18b5f86c13
--- /dev/null
+++ b/invokeai/backend/pid/_src/networks/lq_projection_2d.py
@@ -0,0 +1,413 @@
+# 2D LQ projection for pixel-space image super-resolution.
+#
+# Takes LQ image [B, 3, H_lq, W_lq] at original low resolution and/or
+# LQ VAE latent [B, z_dim, zH, zW], projects them to patch-aligned tokens
+# for injection into the PixDiT_T2I transformer.
+#
+# Spatial alignment:
+#   Image branch: PixelUnshuffle to fold spatial dims into channels; the input is
+#                 bilinearly resized first only when it does not match the patch grid.
+#   Latent branch: Nearest interpolate or fold to align to the patch grid.
+#
+# ControlNet-style injection gate (single implementation):
+# "sigma_aware_per_token_per_dim":
+# x + sigmoid(Linear([x, lq]) - exp(log_alpha)*sigma) * lq (per-token per-dim, B,N,D; monotonic in sigma)
+
+import math
+from typing import List, Optional
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+# ---------------------------------------------------------------------------
+# Gate module
+# ---------------------------------------------------------------------------
+
+
+class SigmaAwareGatePerTokenPerDim(nn.Module):
+    """Sigma-conditioned injection gate with one gate value per (token, channel).
+
+    Content branch projects cat([x, lq]) to dim, so the gate is independent per
+    (token, channel). Sigma is one scalar per leading-dim entry of x and broadcasts
+    over the remaining dims: (B, 1, 1) for (B, N, D) inputs, (M, 1) for (M, D) inputs.
+    Init: content_proj.bias=2.0, log_alpha=log(5) →
+    gate ≈ sigmoid(2.0 - 5*sigma): ~0.88 at sigma=0, ~0.5 at sigma=0.4, ~0.05 at sigma=1.
+    Requires sigma to always be provided (asserts at forward time).
+    """
+
+    def __init__(self, dim: int):
+        super().__init__()
+        self.content_proj = nn.Linear(dim * 2, dim)
+        nn.init.trunc_normal_(self.content_proj.weight, std=0.01)
+        nn.init.constant_(self.content_proj.bias, 2.0)
+        self.log_alpha = nn.Parameter(torch.tensor(math.log(5.0)))
+
+    def compute_gate_scalar(
+        self, x: torch.Tensor, lq: torch.Tensor, sigma: Optional[torch.Tensor] = None
+    ) -> torch.Tensor:
+        assert sigma is not None, "SigmaAwareGatePerTokenPerDim requires sigma input"
+        content_logit = self.content_proj(torch.cat([x, lq], dim=-1))  # same shape as x
+        # Match x's rank: a fixed (-1, 1, 1) view would broadcast a flattened (M, D) input to (M, M, D).
+        sigma_offset = -self.log_alpha.exp() * sigma.float().view(-1, *([1] * (x.dim() - 1)))
+        return torch.sigmoid(content_logit + sigma_offset)  # same shape as x
+
+    def forward(self, x: torch.Tensor, lq: torch.Tensor, sigma: Optional[torch.Tensor] = None) -> torch.Tensor:
+        return x + self.compute_gate_scalar(x, lq, sigma) * lq
+
+
+_SUPPORTED_GATE_TYPE = "sigma_aware_per_token_per_dim"
+
+
+def _build_gate(gate_type: str, dim: int, zero_init: bool = True) -> nn.Module:
+    # zero_init is accepted but deliberately unused: redundant with zero-init output_heads.
+    if gate_type == _SUPPORTED_GATE_TYPE:
+        return SigmaAwareGatePerTokenPerDim(dim)
+    raise ValueError(f"Unknown gate_type: {gate_type!r}. Only {_SUPPORTED_GATE_TYPE!r} is supported.")
+
+
+# ---------------------------------------------------------------------------
+# Pre-activation residual block (used by image / latent encoders below).
+# ---------------------------------------------------------------------------
+
+
+class ResBlock(nn.Module):
+    """Pre-activation residual block: two (GroupNorm → SiLU → 3x3 Conv) stages plus an identity skip."""
+
+    def __init__(self, channels: int, num_groups: int = 4):
+        super().__init__()
+        stages = []
+        for _ in range(2):
+            # One pre-activation stage: normalize, activate, then a shape-preserving 3x3 conv.
+            stages.append(nn.GroupNorm(num_groups, channels))
+            stages.append(nn.SiLU())
+            stages.append(nn.Conv2d(channels, channels, kernel_size=3, padding=1))
+        self.block = nn.Sequential(*stages)
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        residual = self.block(x)
+        return x + residual
+
+
+# ---------------------------------------------------------------------------
+# LQ Projection 2D
+# ---------------------------------------------------------------------------
+
+
+class LQProjection2D(nn.Module):
+ """2D LQ projection for image super-resolution in pixel space.
+
+    Spatial alignment strategy (no interpolation when the inputs already match the patch grid):
+
+ Image branch:
+ LQ image is at H_lq = H_hq / sr_scale. Patch grid is pH = H_hq / patch_size.
+ Ratio = H_lq / pH = patch_size / sr_scale.
+ - If ratio >= 1 (LQ res >= patch grid): PixelUnshuffle(ratio) to fold spatial
+ dims into channels. E.g. sr_scale=4, ps=16: ratio=4, unshuffle folds 4x4 pixels
+ into channels: [B, 3, 256, 256] → [B, 3*16, 64, 64] = [B, 48, 64, 64].
+      - If ratio < 1 (LQ res < patch grid): not supported; __init__ asserts patch_size >= sr_scale.
+
+ Latent branch:
+ LQ latent is at zH = H_lq / lsdf. Patch grid is pH = H_hq / patch_size.
+ z_patch_ratio = pH / zH = (sr_scale * lsdf) / patch_size.
+ - If z_patch_ratio <= 1 (latent res >= patch grid): fold z_patch_ratio×z_patch_ratio
+ spatial elements into channels (same as FastPixelDecoder._align_z_to_patch_grid).
+ - If z_patch_ratio > 1 (latent res < patch grid): nearest interpolate to upsample.
+
+ Args:
+ in_channels: LQ image channels (3 for RGB, 0 to disable image branch).
+ latent_channels: LQ latent channels (e.g. 16 for Wan VAE, 0 to disable).
+ hidden_dim: internal feature dimension for conv processing.
+ out_dim: output dimension (must match transformer hidden_size).
+ patch_size: spatial patch size of the transformer (e.g. 16).
+ sr_scale: super-resolution scale factor (LQ is sr_scale times smaller).
+ latent_spatial_down_factor: VAE spatial downscale factor (default 8).
+ num_res_blocks: number of ResBlocks after initial conv projection in each branch.
+ 0 = no ResBlocks (original shallow design).
+ 4 = recommended for stronger feature extraction (~4x deeper).
+ num_outputs: number of output feature sets — one per transformer block
+ for controlnet injection.
+ gate_type: must be "sigma_aware_per_token_per_dim" (sigma-conditioned per-token per-dim gate).
+ interval: inject every N blocks (only relevant when num_outputs > 1).
+ zero_init: if True, zero-init all output projections for safe pretrained start.
+ pit_output: if True, add a dedicated output head for PiT block injection.
+ The PiT head output is appended as the last element of forward() output.
+ """
+
+ def __init__(
+ self,
+ in_channels: int = 3,
+ latent_channels: int = 0,
+ hidden_dim: int = 512,
+ out_dim: int = 1536,
+ patch_size: int = 16,
+ sr_scale: int = 4,
+ latent_spatial_down_factor: int = 8,
+ num_res_blocks: int = 4,
+ num_outputs: int = 1,
+ gate_type: str = _SUPPORTED_GATE_TYPE,
+ interval: int = 1,
+ zero_init: bool = True,
+ pit_output: bool = False,
+ ):
+ super().__init__()
+ assert in_channels > 0 or latent_channels > 0, "At least one of in_channels or latent_channels must be > 0"
+
+ self.in_channels = in_channels
+ self.latent_channels = latent_channels
+ self.hidden_dim = hidden_dim
+ self.out_dim = out_dim
+ self.patch_size = patch_size
+ self.sr_scale = sr_scale
+ self.latent_spatial_down_factor = latent_spatial_down_factor
+ self.num_outputs = num_outputs
+ self.interval = interval
+ self.zero_init = zero_init
+ self.pit_output = pit_output
+
+ # --- Image branch ---
+ # PixelUnshuffle → Conv proj → ResBlocks for deep feature extraction
+ if in_channels > 0:
+ assert patch_size >= sr_scale and patch_size % sr_scale == 0, (
+ f"patch_size ({patch_size}) must be >= sr_scale ({sr_scale}) and divisible"
+ )
+ self.image_unshuffle_factor = patch_size // sr_scale
+ unshuffle_ch = in_channels * self.image_unshuffle_factor**2
+ layers = [
+ nn.Conv2d(unshuffle_ch, hidden_dim, kernel_size=3, stride=1, padding=1),
+ nn.SiLU(),
+ nn.Conv2d(hidden_dim, hidden_dim, kernel_size=3, stride=1, padding=1),
+ ]
+ for _ in range(num_res_blocks):
+ layers.append(ResBlock(hidden_dim))
+ self.image_conv = nn.Sequential(*layers)
+ else:
+ self.image_conv = None
+ self.image_unshuffle_factor = 0
+
+ # --- Latent branch ---
+ # Spatial alignment (fold / upsample) → Conv proj → ResBlocks
+ if latent_channels > 0:
+ z_to_patch_ratio = (sr_scale * latent_spatial_down_factor) / patch_size
+ self.z_to_patch_ratio = z_to_patch_ratio
+
+ if z_to_patch_ratio > 1:
+ # Latent is lower res than patch grid → nearest upsample (no learnable params).
+ # LearnedLatentUpsampler (PixelShuffle) caused DDP numerical issues on multi-node.
+ self.latent_upsampler = None
+ self.latent_upsample_ratio = int(z_to_patch_ratio)
+ latent_proj_in_ch = latent_channels
+ elif z_to_patch_ratio == 1:
+ self.latent_upsampler = None
+ latent_proj_in_ch = latent_channels
+ else:
+ fold_factor = int(1 / z_to_patch_ratio)
+ assert fold_factor * z_to_patch_ratio == 1.0, (
+ f"fold_factor {fold_factor} * z_to_patch_ratio {z_to_patch_ratio} != 1"
+ )
+ self.latent_upsampler = None
+ self.latent_fold_factor = fold_factor
+ latent_proj_in_ch = latent_channels * fold_factor**2
+
+ layers = [
+ nn.Conv2d(latent_proj_in_ch, hidden_dim, kernel_size=3, stride=1, padding=1),
+ nn.SiLU(),
+ nn.Conv2d(hidden_dim, hidden_dim, kernel_size=3, stride=1, padding=1),
+ ]
+ for _ in range(num_res_blocks):
+ layers.append(ResBlock(hidden_dim))
+ self.latent_proj = nn.Sequential(*layers)
+ else:
+ self.latent_proj = None
+ self.z_to_patch_ratio = 0
+ self.latent_upsampler = None
+
+ # --- Merge + shared ResBlocks (if both branches active) ---
+ if in_channels > 0 and latent_channels > 0:
+ layers = [nn.Conv2d(hidden_dim * 2, hidden_dim, kernel_size=1), nn.SiLU()]
+ for _ in range(num_res_blocks):
+ layers.append(ResBlock(hidden_dim))
+ self.merge = nn.Sequential(*layers)
+ else:
+ self.merge = None
+
+ # --- Output heads ---
+ self.output_heads = nn.ModuleList([nn.Linear(hidden_dim, out_dim) for _ in range(num_outputs)])
+
+ # --- Dedicated PiT output head (separate from DiT heads) ---
+ if pit_output:
+ self.pit_head = nn.Linear(hidden_dim, out_dim)
+ else:
+ self.pit_head = None
+
+ # --- Gate modules (one per injection point, for controlnet-style injection) ---
+ # Using a ModuleList instead of a single shared module allows each block to learn
+ # independent gating behaviour (different content_proj weights and log_alpha).
+ self.gate_modules = nn.ModuleList(
+ [_build_gate(gate_type, out_dim, zero_init=zero_init) for _ in range(num_outputs)]
+ )
+
+    def init_weights(self):
+        """Re-initialize every Conv2d and the output projection heads.
+
+        Conv layers use truncated normal (std=0.02) instead of kaiming_normal_
+        to keep intermediate activations small under bfloat16 autocast.
+        With zero-init output heads the forward output is zero regardless of
+        conv init scale, but large conv activations cause grad overflow in
+        bfloat16 backward (output_head.weight.grad ∝ conv_features).
+
+        Output heads and the optional PiT head share one strategy: zero weights
+        when zero_init=True, otherwise truncated normal (std=0.02). Biases are
+        zeroed under both strategies.
+        """
+        # Conv layers: small truncated-normal weights, zero bias.
+        for module in self.modules():
+            if not isinstance(module, nn.Conv2d):
+                continue
+            nn.init.trunc_normal_(module.weight, std=0.02)
+            if module.bias is not None:
+                nn.init.zeros_(module.bias)
+
+        # DiT heads first, then the dedicated PiT head (if any).
+        heads = list(self.output_heads)
+        if self.pit_head is not None:
+            heads.append(self.pit_head)
+
+        for head in heads:
+            if self.zero_init:
+                # Zero weights plus zero bias make the head output exactly zero at start.
+                nn.init.zeros_(head.weight)
+            else:
+                # Small init so LQ signal is present from the start but doesn't
+                # overwhelm the pretrained base model.
+                nn.init.trunc_normal_(head.weight, std=0.02)
+            # Bias is zeroed under both strategies.
+            if head.bias is not None:
+                nn.init.zeros_(head.bias)
+
+    def is_gate_active(self, block_idx: int) -> bool:
+        """Whether gate() should be applied at this block index."""
+        # interval <= 1 injects at every block; a larger interval injects only at
+        # block indices that are multiples of it.
+        return self.interval <= 1 or block_idx % self.interval == 0
+
+    def _get_output_index(self, block_idx: int) -> int:
+        """Translate a transformer block index into the index of its output head / gate."""
+        # With interval N > 1 the index is block_idx // N; otherwise blocks and
+        # output heads correspond one-to-one.
+        return block_idx // self.interval if self.interval > 1 else block_idx
+
+    def gate(
+        self, x: torch.Tensor, lq: torch.Tensor, sigma: Optional[torch.Tensor] = None, out_idx: int = 0
+    ) -> torch.Tensor:
+        """Inject lq features into hidden state x using the gate module selected by out_idx."""
+        return self.gate_modules[out_idx](x, lq, sigma=sigma)
+
+    def _align_image_to_patch_grid(
+        self, lq_video_or_image: torch.Tensor, target_pH: int, target_pW: int
+    ) -> torch.Tensor:
+        """Fold the LQ image onto the patch grid and run the image conv stack.
+
+        [B, C, H_lq, W_lq] → resize if needed → PixelUnshuffle(f) → [B, C*f*f, pH, pW]
+        → image_conv → [B, hidden_dim, pH, pW], with f = image_unshuffle_factor.
+
+        Multi-AR images may have H_lq / W_lq that differ from target_pH * f /
+        target_pW * f. Such inputs are bilinearly resized (not padded or cropped)
+        to that exact size so the unshuffled map lines up with the patch grid.
+        """
+        factor = self.image_unshuffle_factor
+        _, _, in_h, in_w = lq_video_or_image.shape  # unpacking also enforces a 4-D input
+        want_h, want_w = target_pH * factor, target_pW * factor
+
+        image = lq_video_or_image
+        if (in_h, in_w) != (want_h, want_w):
+            # Size mismatch (e.g. multi-AR input): bilinear resize to the exact target size.
+            image = F.interpolate(image, size=(want_h, want_w), mode="bilinear", align_corners=False)
+
+        # [B, C*f*f, target_pH, target_pW] after folding each f x f pixel tile into channels.
+        unshuffled = F.pixel_unshuffle(image, factor)
+        return self.image_conv(unshuffled)  # [B, hidden_dim, target_pH, target_pW]
+
+    def _align_latent_to_patch_grid(self, lq_latent: torch.Tensor, pH: int, pW: int) -> torch.Tensor:
+        """Align LQ latent to patch grid via nearest interpolate or fold, then run latent_proj.
+
+        Inputs whose spatial size is off-grid are first resized with nearest interpolation.
+        Returns [B, hidden_dim, pH, pW].
+        """
+        B, z_dim = lq_latent.shape[:2]
+
+        if self.z_to_patch_ratio > 1:
+            # Upsample: latent is lower res than patch grid → nearest interpolate
+            z_aligned = F.interpolate(lq_latent, size=(pH, pW), mode="nearest")
+        elif self.z_to_patch_ratio == 1:
+            z_aligned = lq_latent
+            if z_aligned.shape[2] != pH or z_aligned.shape[3] != pW:
+                # mode="nearest" must not receive align_corners (F.interpolate raises ValueError).
+                z_aligned = F.interpolate(z_aligned, size=(pH, pW), mode="nearest")
+        else:
+            # Fold: latent is higher res than patch grid
+            f = self.latent_fold_factor
+            # Ensure latent spatial matches expected fold size
+            zH_expected, zW_expected = pH * f, pW * f
+            if lq_latent.shape[2] != zH_expected or lq_latent.shape[3] != zW_expected:
+                lq_latent = F.interpolate(lq_latent, size=(zH_expected, zW_expected), mode="nearest")
+            z_aligned = lq_latent.reshape(B, z_dim, pH, f, pW, f)
+            z_aligned = z_aligned.permute(0, 1, 3, 5, 2, 4)
+            z_aligned = z_aligned.reshape(B, z_dim * f * f, pH, pW)
+
+        return self.latent_proj(z_aligned)  # [B, hidden_dim, pH, pW]
+
+    def forward(
+        self,
+        lq_video_or_image: Optional[torch.Tensor] = None,
+        lq_latent: Optional[torch.Tensor] = None,
+        target_pH: int = 0,
+        target_pW: int = 0,
+    ) -> List[torch.Tensor]:
+        """Project LQ inputs to patch-aligned token features.
+
+        Args:
+            lq_video_or_image: [B, C, H_lq, W_lq] LQ image at original low resolution. Can be None.
+            lq_latent: [B, z_dim, zH, zW] LQ VAE latent. Can be None.
+            target_pH: target patch grid height (H_hq / patch_size).
+            target_pW: target patch grid width (W_hq / patch_size).
+
+        Returns:
+            List of num_outputs (+ 1 if pit_output=True) [B, N, out_dim] tensors, N = target_pH * target_pW.
+
+        Raises:
+            ValueError: if lq_video_or_image and lq_latent are both None.
+        """
+        assert target_pH > 0 and target_pW > 0, "Must provide target_pH and target_pW"
+        features = []
+        # Image branch: PixelUnshuffle → Conv
+        if self.image_conv is not None and lq_video_or_image is not None:
+            features.append(self._align_image_to_patch_grid(lq_video_or_image, target_pH, target_pW))
+
+        # Latent branch: Fold/Upsample → Conv
+        if self.latent_proj is not None and lq_latent is not None:
+            features.append(self._align_latent_to_patch_grid(lq_latent, target_pH, target_pW))
+
+        # Merge or select single branch
+        if len(features) == 2 and self.merge is not None:
+            merged = self.merge(torch.cat(features, dim=1))  # [B, hidden_dim, pH, pW]
+        elif len(features) == 1:
+            merged = features[0]
+        else:
+            # No enabled branch received an input: return zeros shaped after whichever input was given.
+            ref = lq_video_or_image if lq_video_or_image is not None else lq_latent
+            if ref is None:
+                raise ValueError("LQProjection2D.forward requires lq_video_or_image or lq_latent")
+            num_total = self.num_outputs + (1 if self.pit_output else 0)
+            zeros_shape = (ref.shape[0], target_pH * target_pW, self.out_dim)
+            return [torch.zeros(zeros_shape, device=ref.device, dtype=ref.dtype) for _ in range(num_total)]
+
+        tokens = merged.flatten(2).transpose(1, 2)  # [B, hidden_dim, pH, pW] -> [B, N, hidden_dim]
+
+        # Project through output heads
+        outputs = [head(tokens) for head in self.output_heads]
+        # Append dedicated PiT head output as last element
+        if self.pit_head is not None:
+            outputs.append(self.pit_head(tokens))
+
+        return outputs
diff --git a/invokeai/backend/pid/_src/networks/pid_net.py b/invokeai/backend/pid/_src/networks/pid_net.py
new file mode 100644
index 00000000000..290ccd50a3d
--- /dev/null
+++ b/invokeai/backend/pid/_src/networks/pid_net.py
@@ -0,0 +1,469 @@
+# PidNet — Super-resolution variant of PixDiT_T2I.
+#
+# Extends the text-to-image PixDiT model with LQ (low-quality) image/latent
+# conditioning for image super-resolution. The base T2I architecture is unchanged;
+# LQ information is injected via per-block gated injection between transformer
+# blocks ("controlnet" mode — the only mode supported in this inference subset).
+# Gate: sigma_aware_per_token_per_dim (sigma-conditioned LQ injection).
+#
+# All LQ modules are zero-initialized by default (zero_init_lq=True) so the network
+# starts identical to the pretrained T2I model.
+#
+# Loading pretrained T2I checkpoint: use strict=False to ignore missing LQ keys.
+#
+# Reference:
+# - PixDiT_T2I: pid/_src/networks/pixeldit_official.py
+# - LQ projection: pid/_src/networks/lq_projection_2d.py
+
+from typing import Optional
+
+import torch
+
+from invokeai.backend.pid._ext.imaginaire.utils import log
+from invokeai.backend.pid._src.networks.lq_projection_2d import LQProjection2D
+from invokeai.backend.pid._src.networks.pixeldit_official import PixDiT_T2I
+from invokeai.backend.pid._src.utils.context_parallel import cat_outputs_cp_with_grad, split_inputs_cp
+
+
+class PidNet(PixDiT_T2I):
+ """PixDiT T2I with LQ condition injection for super-resolution.
+
+ Inherits all PixDiT_T2I functionality (MMDiT patch blocks, PiT pixel blocks,
+ text conditioning, RoPE, encoder-decoder compression, REPA). Adds LQ projection
+ module and controlnet-style gated injection logic.
+
+ Args (in addition to PixDiT_T2I args):
+ lq_inject_mode: kept as a parameter for config compatibility — only
+ "controlnet" is supported in this inference subset.
+ lq_in_channels: LQ image channels (3 for RGB, 0 to disable image branch).
+ lq_latent_channels: LQ latent channels (e.g. 16 for Wan VAE, 0 to disable).
+ lq_hidden_dim: internal projection hidden dimension.
+ lq_num_res_blocks: number of ResBlocks per branch for deeper feature extraction.
+ lq_gate_type: "sigma_aware_per_token_per_dim" only.
+ lq_interval: inject every N blocks.
+ zero_init_lq: zero-init all LQ projections for safe pretrained start.
+ train_lq_proj_only: freeze base T2I, train only LQ projection modules.
+ sr_scale: super-resolution scale factor (default 4).
+ latent_spatial_down_factor: VAE spatial downscale factor (default 8).
+ """
+
+ def __init__(
+ self,
+ # --- PixDiT_T2I base args ---
+ in_channels=3,
+ num_groups=16,
+ hidden_size=1152,
+ pixel_hidden_size=64,
+ pixel_attn_hidden_size=None,
+ pixel_num_groups=None,
+ patch_depth=26,
+ pixel_depth=2,
+ num_text_blocks=4,
+ patch_size=16,
+ txt_embed_dim=4096,
+ txt_max_length=1024,
+ use_text_rope: bool = True,
+ text_rope_theta: float = 10000.0,
+ rope_mode: str = "ntk_aware",
+ rope_ref_h: int = 1024,
+ rope_ref_w: int = 1024,
+ repa_encoder_index: int = -1,
+ enable_ed: bool = False,
+ ed_compress_ratio: int = 1,
+ ed_depth_per_stage: int = 1,
+ ed_window_size: int = 2,
+ ed_num_heads: Optional[int] = None,
+ ed_hidden_size: Optional[int] = None,
+ ed_use_token_shuffle: bool = True,
+ # --- SR-specific args ---
+ lq_inject_mode: str = "controlnet",
+ lq_in_channels: int = 3,
+ lq_latent_channels: int = 0,
+ lq_hidden_dim: int = 512,
+ lq_num_res_blocks: int = 4,
+ lq_gate_type: str = "sigma_aware_per_token_per_dim",
+ lq_interval: int = 1,
+ zero_init_lq: bool = True,
+ train_lq_proj_only: bool = False,
+ sr_scale: int = 4,
+ latent_spatial_down_factor: int = 8,
+ # --- PiT LQ injection args ---
+ # Inject LQ features into PiT pixel blocks via a dedicated output head
+ # from the same LQ projection CNN backbone. Added to s_cond before PiT loop.
+ pit_lq_inject: bool = False,
+ pit_lq_gate_type: str = "sigma_aware_per_token_per_dim",
+ ):
+ super().__init__(
+ in_channels=in_channels,
+ num_groups=num_groups,
+ hidden_size=hidden_size,
+ pixel_hidden_size=pixel_hidden_size,
+ pixel_attn_hidden_size=pixel_attn_hidden_size,
+ pixel_num_groups=pixel_num_groups,
+ patch_depth=patch_depth,
+ pixel_depth=pixel_depth,
+ num_text_blocks=num_text_blocks,
+ patch_size=patch_size,
+ txt_embed_dim=txt_embed_dim,
+ txt_max_length=txt_max_length,
+ use_text_rope=use_text_rope,
+ text_rope_theta=text_rope_theta,
+ rope_mode=rope_mode,
+ rope_ref_h=rope_ref_h,
+ rope_ref_w=rope_ref_w,
+ repa_encoder_index=repa_encoder_index,
+ enable_ed=enable_ed,
+ ed_compress_ratio=ed_compress_ratio,
+ ed_depth_per_stage=ed_depth_per_stage,
+ ed_window_size=ed_window_size,
+ ed_num_heads=ed_num_heads,
+ ed_hidden_size=ed_hidden_size,
+ ed_use_token_shuffle=ed_use_token_shuffle,
+ )
+
+ assert lq_inject_mode == "controlnet", (
+ f"Only lq_inject_mode='controlnet' is supported in this inference subset, got '{lq_inject_mode}'"
+ )
+ self.lq_inject_mode = lq_inject_mode
+ self.sr_scale = sr_scale
+ self.train_lq_proj_only = train_lq_proj_only
+
+ num_lq_outputs = (patch_depth + lq_interval - 1) // lq_interval
+
+ self.pit_lq_inject = pit_lq_inject
+
+ self.lq_proj = LQProjection2D(
+ in_channels=lq_in_channels,
+ latent_channels=lq_latent_channels,
+ hidden_dim=lq_hidden_dim,
+ out_dim=hidden_size,
+ patch_size=patch_size,
+ sr_scale=sr_scale,
+ latent_spatial_down_factor=latent_spatial_down_factor,
+ num_res_blocks=lq_num_res_blocks,
+ num_outputs=num_lq_outputs,
+ gate_type=lq_gate_type,
+ interval=lq_interval,
+ zero_init=zero_init_lq,
+ pit_output=pit_lq_inject,
+ )
+
+ # PiT LQ gate (applied to s_cond before pixel blocks)
+ if pit_lq_inject:
+ from invokeai.backend.pid._src.networks.lq_projection_2d import _build_gate
+
+ self.pit_lq_gate = _build_gate(pit_lq_gate_type, hidden_size, zero_init=zero_init_lq)
+ else:
+ self.pit_lq_gate = None
+
+ if train_lq_proj_only:
+ for p in self.parameters():
+ p.requires_grad_(False)
+ for p in self.lq_proj.parameters():
+ p.requires_grad_(True)
+ if self.pit_lq_gate is not None and hasattr(self.pit_lq_gate, "parameters"):
+ for p in self.pit_lq_gate.parameters():
+ p.requires_grad_(True)
+
+    def init_weights(self):
+        """Initialize the LQ projection (self.lq_proj) and log completion; nothing else is initialized here."""
+        self.lq_proj.init_weights()
+        log.info("LQ projection init_weights complete")
+
+    def _compute_lq_features(self, lq_video_or_image, lq_latent, lq_mask, Hs, Ws):
+        """Project the LQ inputs onto the Hs x Ws patch grid, then apply lq_mask and the CP split."""
+        projected = self.lq_proj(
+            lq_video_or_image=lq_video_or_image,
+            lq_latent=lq_latent,
+            target_pH=Hs,
+            target_pW=Ws,
+        )
+        if lq_mask is not None:
+            keep = lq_mask.view(-1, 1, 1)
+            projected = [feat * keep for feat in projected]
+        # Under CP the features span the full L; keep only this rank's slice of the token axis.
+        if self._cp_group is not None:
+            projected = [split_inputs_cp(feat, seq_dim=1, cp_group=self._cp_group) for feat in projected]
+        return projected
+
+    def _run_patch_blocks(
+        self,
+        s_main,
+        y_emb,
+        condition,
+        pos,
+        pos_txt,
+        attn_mask_joint,
+        lq_features,
+        degrade_sigma=None,
+        feature_indices=None,
+    ):
+        """Run the patch_blocks stack, gating LQ features into s_main ahead of selected blocks.
+
+        Before block i, lq_features[out_idx] is gated into s_main when lq_features is given and
+        lq_proj.is_gate_active(i) holds; degrade_sigma is handed to the gate as sigma.
+
+        Args:
+            feature_indices: Optional set of block indices whose output features should be
+                collected and returned (for GAN discriminator). None = no collection.
+
+        Returns:
+            (s_main, y_emb, collected_features) where collected_features is a list of
+            [B, L, D] tensors (one per index in feature_indices), or None if not requested.
+        """
+        collect = feature_indices is not None
+        collected_features = [] if collect else None
+
+        for i in range(self.patch_depth):
+            # Controlnet-style injection happens before block i runs.
+            if lq_features is not None and self.lq_proj.is_gate_active(i):
+                out_idx = self.lq_proj._get_output_index(i)
+                if out_idx < len(lq_features):
+                    s_main = self.lq_proj.gate(s_main, lq_features[out_idx], sigma=degrade_sigma, out_idx=out_idx)
+
+            s_main, y_emb = self.patch_blocks[i](
+                s_main, y_emb, condition, pos, pos_txt, attn_mask_joint
+            )
+
+            # Keep a copy of this block's output when the discriminator asked for it.
+            if collect and i in feature_indices:
+                collected_features.append(s_main.clone())
+
+            # Expose the output of block number repa_encoder_index (1-based) as the REPA tokens.
+            repa_idx = self.repa_encoder_index
+            if repa_idx > 0 and repa_idx == i + 1:
+                self.last_repa_tokens = s_main
+
+        return s_main, y_emb, collected_features
+
+    def _unpatchify_features(self, features: list, Hs: int, Ws: int) -> list:
+        """Turn patch-token features [B, L, D] into spatial maps [B, D, Hs, Ws] for the discriminator.
+
+        PixDiT tokens are 1-to-1 with spatial patches (no sub-patch splitting in the
+        token dimension), so a plain reshape to a 2D feature map is sufficient.
+        Compatible with Discriminator_ImageDiT which uses Conv2D heads.
+
+        Under CP, collected features are rank-local [B, L_local, D]; they are gathered
+        along the token axis here so the discriminator (which has no CP plumbing)
+        sees the full feature map.
+
+        Args:
+            features: List of [B, L_local_or_full, D] token tensors.
+            Hs, Ws: Spatial patch grid dimensions (full).
+
+        Returns:
+            List of [B, D, Hs, Ws] tensors.
+        """
+        maps = []
+        for tokens in features:
+            if self._cp_group is not None:
+                tokens = cat_outputs_cp_with_grad(tokens.contiguous(), seq_dim=1, cp_group=self._cp_group)
+            batch, _, channels = tokens.shape
+            maps.append(tokens.view(batch, Hs, Ws, channels).permute(0, 3, 1, 2))  # [B, D, Hs, Ws]
+        return maps
+
+ def forward(
+ self,
+ x,
+ t,
+ y,
+ s=None,
+ mask=None,
+ lq_video_or_image=None,
+ lq_latent=None,
+ lq_mask=None,
+ degrade_sigma=None,
+ # --- Feature extraction for GAN discriminator ---
+ feature_indices=None,
+ return_features_early: bool = False,
+ ):
+ B, _, H, W = x.shape
+ Hs = H // self.patch_size
+ Ws = W // self.patch_size
+ L = Hs * Ws
+
+ # Context-parallel local sequence length. When CP is enabled, every rank
+ # sees the same full inputs (x, y, t, lq_*) — we patchify on full size,
+ # then immediately split tokens along L so the heavy transformer/pixel
+ # blocks operate on L_local = L / cp_size each.
+ cp_group = self._cp_group
+ cp_size = cp_group.size() if cp_group is not None else 1
+ if cp_size > 1:
+ assert L % cp_size == 0, f"L={L} not divisible by cp_size={cp_size}"
+ L_local = L // cp_size
+
+ # Compute LQ features (split along L internally when CP is active).
+ has_lq = lq_video_or_image is not None or lq_latent is not None
+ lq_features = self._compute_lq_features(lq_video_or_image, lq_latent, lq_mask, Hs, Ws) if has_lq else None
+
+ collected_features = None # populated by _run_patch_blocks when feature_indices is set
+
+ # Patch tokens — full unfolding on every rank (cheap; identical across ranks).
+ pos = self.fetch_pos(Hs, Ws, x.device) # full pos; the CP-aware attention slices for q internally
+ x_patches = torch.nn.functional.unfold(x, kernel_size=self.patch_size, stride=self.patch_size).transpose(1, 2)
+
+ t_emb = self.t_embedder(t.view(-1)).view(B, -1, self.hidden_size)
+
+ # Text tokens (replicated across CP ranks; not split).
+ if y.dim() != 3:
+ raise ValueError("Text embedding y must be [B, L, D]")
+ Ltxt = min(y.shape[1], self.txt_max_length)
+ y = y[:, :Ltxt, :]
+ y_emb = self.y_embedder(y).view(B, Ltxt, self.hidden_size)
+ y_emb = y_emb + self.y_pos_embedding[:, :Ltxt, :].to(y_emb.dtype)
+
+ # Condition signal: silu(t_emb), [B, 1, D]
+ condition = torch.nn.functional.silu(t_emb)
+
+ # Mask
+ pad = None
+ pos_txt = self.fetch_pos_text(Ltxt, x.device) if self.use_text_rope else None
+ if mask is not None and isinstance(mask, torch.Tensor):
+ m = mask
+ while m.dim() > 2 and m.size(1) == 1:
+ m = m.squeeze(1)
+ if m.dim() == 3 and m.size(1) == 1:
+ m = m.squeeze(1)
+ if m.dim() == 2:
+ pad = m == 0
+
+ if s is None:
+ s0 = self.s_embedder(x_patches)
+ # Split image patch tokens across the CP group along the sequence axis.
+ # Everything downstream (lq injection, patch_blocks, pixel pathway)
+ # operates on the rank-local slice until the final fold gather.
+ if cp_group is not None:
+ s0 = split_inputs_cp(s0, seq_dim=1, cp_group=cp_group)
+ self.last_repa_tokens = None
+
+ if self.use_ed and self.encoder_ed is not None and self.decoder_ed is not None:
+ # Encoder-decoder path (CP not supported here; PixDiT_T2I.enable_context_parallel asserts)
+ H_tokens, W_tokens = Hs, Ws
+ s_ed = s0 if self.s_ed_proj_in is None else self.s_ed_proj_in(s0)
+ if self.s_ed_in_norm is not None:
+ s_ed = self.s_ed_in_norm(s_ed)
+ c_ed = condition if self.s_ed_cond_proj is None else self.s_ed_cond_proj(condition)
+ bottleneck, skip_tokens, Hb, Wb = self.encoder_ed(s_ed, H_tokens, W_tokens, c_ed)
+ pos_b = self.fetch_pos(Hb, Wb, x.device)
+ s_main = bottleneck if self.s_ed_proj_out is None else self.s_ed_proj_out(bottleneck)
+ if self.s_ed_out_norm is not None:
+ s_main = self.s_ed_out_norm(s_main)
+ s_main = torch.nn.functional.silu(t_emb + s_main)
+
+ attn_mask_joint = None
+ if pad is not None:
+ L_img_curr = s_main.shape[1]
+ pad_img = torch.zeros((B, L_img_curr), dtype=torch.bool, device=x.device)
+ pad_txt = (
+ pad[:, :Ltxt]
+ if pad.size(1) >= Ltxt
+ else torch.nn.functional.pad(pad, (0, Ltxt - pad.size(1)), value=True)
+ )
+ attn_mask_joint = torch.cat([pad_txt, pad_img], dim=1).view(B, 1, 1, Ltxt + L_img_curr)
+
+ s_main, y_emb, collected_features = self._run_patch_blocks(
+ s_main,
+ y_emb,
+ condition,
+ pos_b,
+ pos_txt,
+ attn_mask_joint,
+ lq_features,
+ degrade_sigma=degrade_sigma,
+ feature_indices=feature_indices,
+ )
+
+ s_bottleneck2 = s_main if self.s_ed_proj_in is None else self.s_ed_proj_in(s_main)
+ if self.s_ed_in_norm is not None:
+ s_bottleneck2 = self.s_ed_in_norm(s_bottleneck2)
+ decoded, _, _ = self.decoder_ed(s_bottleneck2, Hb, Wb, skip_tokens, c_ed)
+ s = decoded if self.s_ed_proj_out is None else self.s_ed_proj_out(decoded)
+ if self.s_ed_out_norm is not None:
+ s = self.s_ed_out_norm(s)
+ s = torch.nn.functional.silu(t_emb + s)
+ else:
+ # Standard path (no encoder-decoder).
+ s_main = s0
+ attn_mask_joint = None
+ if pad is not None:
+ # SDPA's K dimension is full image length (CP gathers K/V across
+ # CP ranks inside the joint attention). Use full L for the K-side
+ # mask regardless of CP.
+ pad_img = torch.zeros((B, L), dtype=torch.bool, device=x.device)
+ pad_txt = (
+ pad[:, :Ltxt]
+ if pad.size(1) >= Ltxt
+ else torch.nn.functional.pad(pad, (0, Ltxt - pad.size(1)), value=True)
+ )
+ attn_mask_joint = torch.cat([pad_txt, pad_img], dim=1).view(B, 1, 1, Ltxt + L)
+
+ s_main, y_emb, collected_features = self._run_patch_blocks(
+ s_main,
+ y_emb,
+ condition,
+ pos,
+ pos_txt,
+ attn_mask_joint,
+ lq_features,
+ degrade_sigma=degrade_sigma,
+ feature_indices=feature_indices,
+ )
+
+ s = torch.nn.functional.silu(t_emb + s_main)
+
+ if not (0 < self.repa_encoder_index <= self.patch_depth):
+ self.last_repa_tokens = s
+
+ # Early exit for discriminator feature extraction (skip pixel blocks).
+ # `_unpatchify_features` handles the CP all-gather along L internally.
+ if return_features_early and feature_indices is not None and collected_features:
+ return self._unpatchify_features(collected_features, Hs, Ws)
+
+ # Ensure patch token length matches the rank-local grid (L_local under CP,
+ # L otherwise). This guard exists for ED/token-shuffle paths where the
+ # block stack may emit a different length than the input.
+ batch_size, length, _ = s.shape
+ if length != L_local:
+ if length > L_local:
+ s = s[:, :L_local, :]
+ else:
+ pad_len = L_local - length
+ s = torch.cat([s, s.new_zeros(B, pad_len, s.shape[2])], dim=1)
+
+ # Pixel pathway with optional PiT LQ injection — operates on rank-local
+ # patches under CP. lq_features[-1] was already split along L in
+ # `_compute_lq_features`, so its B*L_local view lines up with s.
+ s_cond = s.reshape(B * L_local, self.hidden_size)
+ if self.pit_lq_inject and lq_features is not None:
+ pit_lq = lq_features[-1].reshape(B * L_local, self.hidden_size)
+ sigma_flat = degrade_sigma.repeat_interleave(L_local) if degrade_sigma is not None else None
+ s_cond = self.pit_lq_gate(s_cond, pit_lq, sigma=sigma_flat)
+
+ # Pixel embedder runs on the full image (cheap; identical across CP
+ # ranks). Reshape and slice to the rank-local subset of patches so that
+ # the per-pixel branch processes exactly L_local patches.
+ x_pixels = self.pixel_embedder(x, img_height=H, img_width=W, patch_size=self.patch_size)
+ if cp_group is not None:
+ P2 = self.patch_size * self.patch_size
+ x_pixels = x_pixels.view(B, L, P2, self.pixel_hidden_size)
+ x_pixels = split_inputs_cp(x_pixels, seq_dim=1, cp_group=cp_group)
+ x_pixels = x_pixels.reshape(B * L_local, P2, self.pixel_hidden_size)
+ for blk in self.pixel_blocks:
+ x_pixels = blk(x_pixels, s_cond, H, W, self.patch_size, mask)
+
+ x_pixels = self.final_layer(x_pixels) # [B*L_local, P², C_out]
+ C_out = self.out_channels
+ P2 = self.patch_size * self.patch_size
+ x_pixels = x_pixels.view(B, L_local, P2, C_out).permute(0, 3, 2, 1).contiguous()
+ x_pixels = x_pixels.view(B, C_out * P2, L_local)
+ # Gather pixel patches across CP ranks along L so `fold` reconstructs
+ # the full image. `cat_outputs_cp_with_grad` keeps gradients on each
+ # rank's local slice.
+ if cp_group is not None:
+ x_pixels = cat_outputs_cp_with_grad(x_pixels.contiguous(), seq_dim=2, cp_group=cp_group)
+ output = torch.nn.functional.fold(x_pixels, (H, W), kernel_size=self.patch_size, stride=self.patch_size)
+
+ # Return (output, features) when feature extraction is enabled (without early exit)
+ if feature_indices is not None and collected_features is not None:
+ return output, self._unpatchify_features(collected_features, Hs, Ws)
+ return output
diff --git a/invokeai/backend/pid/_src/networks/pixeldit_official.py b/invokeai/backend/pid/_src/networks/pixeldit_official.py
new file mode 100644
index 00000000000..6fdda4917db
--- /dev/null
+++ b/invokeai/backend/pid/_src/networks/pixeldit_official.py
@@ -0,0 +1,1438 @@
+# PixelDiT T2I — consolidated network architecture.
+# Verbatim copy from the original PixelDiT repo, merged into a single file.
+# Sources:
+# pixdit_core/modules.py — building blocks (RMSNorm, RoPE, attention, etc.)
+# pixdit_core/pixeldit_c2i.py — PatchTokenEmbedder, PixelTokenEmbedder, PiTBlock
+# pixdit_core/pixeldit_t2i.py — MMDiT joint attention, encoder-decoder, PixDiT_T2I
+#
+# Only import statements were changed (everything is now local). Logic is unchanged.
+
+import math
+from typing import Optional, Tuple
+
+import numpy as np
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from torch.distributed import ProcessGroup
+from torch.nn.functional import scaled_dot_product_attention
+
+from invokeai.backend.pid._src.utils.context_parallel import cat_outputs_cp_with_grad
+
+# =============================================================================
+# From pixdit_core/modules.py
+# =============================================================================
+
+
+def get_2d_sincos_pos_embed(embed_dim, grid_size, cls_token=False, extra_tokens=0):
+    """Build a fixed 2D sine/cosine positional table for a square grid.
+
+    Args:
+        embed_dim: channel width of the table (must be even, checked downstream).
+        grid_size: side length of the square grid.
+        cls_token: when truthy (and ``extra_tokens > 0``) zero rows are prepended.
+        extra_tokens: number of all-zero rows to prepend.
+
+    Returns:
+        ``[grid_size*grid_size, embed_dim]`` array, or
+        ``[extra_tokens + grid_size*grid_size, embed_dim]`` when rows are prepended.
+    """
+    axis = np.arange(grid_size, dtype=np.float32)
+    # meshgrid receives the width axis first, matching the original convention.
+    mesh = np.stack(np.meshgrid(axis, axis), axis=0).reshape([2, 1, grid_size, grid_size])
+    table = get_2d_sincos_pos_embed_from_grid(embed_dim, mesh)
+    if not cls_token or extra_tokens <= 0:
+        return table
+    zero_rows = np.zeros([extra_tokens, embed_dim])
+    return np.concatenate([zero_rows, table], axis=0)
+
+
+def get_2d_sincos_pos_embed_from_grid(embed_dim, grid):
+    """Encode a two-plane coordinate grid into a sine/cosine table.
+
+    Each of ``grid[0]`` and ``grid[1]`` is encoded with ``embed_dim // 2`` channels
+    and the two halves are concatenated along the channel axis, giving ``(M, embed_dim)``
+    where ``M`` is the number of positions in one plane.
+    """
+    assert embed_dim % 2 == 0
+
+    half_dim = embed_dim // 2
+    first_half = get_1d_sincos_pos_embed_from_grid(half_dim, grid[0])
+    second_half = get_1d_sincos_pos_embed_from_grid(half_dim, grid[1])
+    return np.concatenate([first_half, second_half], axis=1)
+
+
+def get_1d_sincos_pos_embed_from_grid(embed_dim, pos):
+    """Sine/cosine encoding of a set of scalar positions.
+
+    Args:
+        embed_dim: output width per position; must be even.
+        pos: array of positions; it is flattened to shape ``(M,)``.
+
+    Returns:
+        ``(M, embed_dim)`` array laid out as ``[sin | cos]`` along the last axis.
+    """
+    assert embed_dim % 2 == 0
+    exponents = np.arange(embed_dim // 2, dtype=np.float64) / (embed_dim / 2.0)
+    inv_freq = 1.0 / 10000**exponents  # (D/2,)
+
+    # Outer product of positions and inverse frequencies -> (M, D/2).
+    angles = np.einsum("m,d->md", pos.reshape(-1), inv_freq)
+    return np.concatenate([np.sin(angles), np.cos(angles)], axis=1)
+
+
+def apply_adaln(x, shift, scale):
+    """Apply adaptive-LayerNorm style modulation: scale by ``1 + scale`` and then add ``shift``."""
+    scaled = x * (1 + scale)
+    return scaled + shift
+
+
+class TimestepConditioner(nn.Module):
+    """Map timesteps to conditioning vectors via sinusoidal features and a two-layer MLP.
+
+    Args:
+        hidden_size: width of the MLP output.
+        frequency_embedding_size: width of the sinusoidal features fed to the MLP.
+    """
+
+    def __init__(self, hidden_size, frequency_embedding_size=256):
+        super().__init__()
+        self.mlp = nn.Sequential(
+            nn.Linear(frequency_embedding_size, hidden_size, bias=True),
+            nn.SiLU(),
+            nn.Linear(hidden_size, hidden_size, bias=True),
+        )
+        self.frequency_embedding_size = frequency_embedding_size
+
+    @staticmethod
+    def timestep_embedding(t, dim, max_period=10):
+        """Sinusoidal embedding of ``t`` with ``dim`` channels on a new trailing axis.
+
+        Args:
+            t: tensor of timesteps with any number of dimensions.
+            dim: number of output channels; ``dim // 2`` cosines followed by
+                ``dim // 2`` sines, plus one zero channel when ``dim`` is odd.
+            max_period: base of the geometric frequency schedule.
+
+        Returns:
+            float32 tensor of shape ``t.shape + (dim,)``.
+        """
+        half = dim // 2
+        freqs = torch.exp(
+            -math.log(max_period) * torch.arange(start=0, end=half, dtype=torch.float32, device=t.device) / half
+        )
+        args = t[..., None].float() * freqs[None, ...]
+        embedding = torch.cat([torch.cos(args), torch.sin(args)], dim=-1)
+        if dim % 2:
+            # Pad along the feature (last) axis. Slicing with `...` keeps this correct
+            # for timesteps with more than one dimension, matching `t[..., None]` above.
+            embedding = torch.cat([embedding, torch.zeros_like(embedding[..., :1])], dim=-1)
+        return embedding
+
+    def forward(self, t):
+        """Embed ``t`` and project it through the MLP, casting to the MLP's parameter dtype first."""
+        t_freq = self.timestep_embedding(t, self.frequency_embedding_size)
+        mlp_dtype = next(self.mlp.parameters()).dtype
+        if t_freq.dtype != mlp_dtype:
+            t_freq = t_freq.to(mlp_dtype)
+        t_emb = self.mlp(t_freq)
+        return t_emb
+
+
+class RMSNorm(nn.Module):
+    """Root-mean-square normalisation over the last axis with a learned per-channel gain."""
+
+    def __init__(self, hidden_size, eps=1e-6):
+        super().__init__()
+        self.weight = nn.Parameter(torch.ones(hidden_size))
+        self.variance_epsilon = eps
+
+    def forward(self, hidden_states):
+        """Normalise in float32, cast back to the input dtype, then apply the gain."""
+        orig_dtype = hidden_states.dtype
+        as_fp32 = hidden_states.to(torch.float32)
+        mean_square = as_fp32.pow(2).mean(-1, keepdim=True)
+        normalised = as_fp32 * torch.rsqrt(mean_square + self.variance_epsilon)
+        return self.weight * normalised.to(orig_dtype)
+
+
+class FeedForward(nn.Module):
+    """Bias-free gated feed-forward layer: ``w2(silu(w1(x)) * w3(x))``.
+
+    The requested ``hidden_dim`` is shrunk to ``int(2 * hidden_dim / 3)`` before the
+    projections are built.
+    """
+
+    def __init__(self, dim: int, hidden_dim: int):
+        super().__init__()
+        inner_dim = int(2 * hidden_dim / 3)
+        self.w1 = nn.Linear(dim, inner_dim, bias=False)
+        self.w3 = nn.Linear(dim, inner_dim, bias=False)
+        self.w2 = nn.Linear(inner_dim, dim, bias=False)
+
+    def forward(self, x):
+        gate = torch.nn.functional.silu(self.w1(x))
+        value = self.w3(x)
+        return self.w2(gate * value)
+
+
+def precompute_freqs_cis_2d(dim: int, height: int, width: int, theta: float = 10000.0, scale=16.0):
+    """Precompute complex 2D RoPE rotations for a ``height x width`` token grid.
+
+    Positions along each axis are spread evenly over ``[0, scale]``. For every token the
+    result interleaves the x-axis and y-axis rotations per frequency.
+
+    Returns:
+        Complex tensor of shape ``[height * width, 2 * (dim // 4)]``.
+    """
+    cols = torch.linspace(0, scale, width)
+    rows = torch.linspace(0, scale, height)
+    row_grid, col_grid = torch.meshgrid(rows, cols, indexing="ij")
+    inv_freq = 1.0 / (theta ** (torch.arange(0, dim, 4)[: (dim // 4)].float() / dim))
+    col_angles = torch.outer(col_grid.reshape(-1), inv_freq).float()
+    row_angles = torch.outer(row_grid.reshape(-1), inv_freq).float()
+    # Unit-magnitude complex numbers e^{i * angle}.
+    col_rot = torch.polar(torch.ones_like(col_angles), col_angles)
+    row_rot = torch.polar(torch.ones_like(row_angles), row_angles)
+    # Pair (x, y) per frequency on a new last axis, then flatten per token.
+    return torch.stack([col_rot, row_rot], dim=-1).reshape(height * width, -1)
+
+
+def precompute_freqs_cis_2d_ntk(
+    dim: int,
+    height: int,
+    width: int,
+    ref_grid_h: int,
+    ref_grid_w: int,
+    theta: float = 10000.0,
+    scale: float = 16.0,
+):
+    """2D RoPE table with a per-axis NTK-aware adjustment of the base ``theta``.
+
+    With ``dim_axis = dim // 2``, each axis uses
+        theta_axis = theta * (current / ref) ** (dim_axis / (dim_axis - 2))
+    when ``dim_axis > 2`` and plain ``theta`` otherwise. When the grid equals the
+    reference grid the factor is 1, so the result matches ``precompute_freqs_cis_2d``.
+
+    Returns:
+        Complex tensor of shape ``[height * width, 2 * (dim // 4)]``.
+    """
+    dim_axis = dim // 2  # each axis gets dim//4 complex pairs -> dim//2 real dims
+    ratio_h = height / ref_grid_h
+    ratio_w = width / ref_grid_w
+    if dim_axis > 2:
+        ntk_power = dim_axis / (dim_axis - 2)
+        factor_h = ratio_h**ntk_power
+        factor_w = ratio_w**ntk_power
+    else:
+        factor_h = 1.0
+        factor_w = 1.0
+    theta_h = theta * factor_h
+    theta_w = theta * factor_w
+
+    cols = torch.linspace(0, scale, width)
+    rows = torch.linspace(0, scale, height)
+    row_grid, col_grid = torch.meshgrid(rows, cols, indexing="ij")
+
+    inv_freq_w = 1.0 / (theta_w ** (torch.arange(0, dim, 4)[: (dim // 4)].float() / dim))
+    inv_freq_h = 1.0 / (theta_h ** (torch.arange(0, dim, 4)[: (dim // 4)].float() / dim))
+
+    col_angles = torch.outer(col_grid.reshape(-1), inv_freq_w).float()
+    row_angles = torch.outer(row_grid.reshape(-1), inv_freq_h).float()
+    col_rot = torch.polar(torch.ones_like(col_angles), col_angles)
+    row_rot = torch.polar(torch.ones_like(row_angles), row_angles)
+    return torch.stack([col_rot, row_rot], dim=-1).reshape(height * width, -1)
+
+
+def apply_rotary_emb(
+    xq: torch.Tensor,
+    xk: torch.Tensor,
+    freqs_cis: torch.Tensor,
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    """Rotate query and key tensors by complex RoPE factors.
+
+    ``freqs_cis`` is broadcast over a leading axis and over axis 2 of the inputs. The
+    last axis of each input is read as consecutive (real, imag) pairs, multiplied by
+    the rotation in float32 and cast back to the input's own dtype.
+    """
+    rotation = freqs_cis[None, :, None, :]
+
+    def _rotate(t: torch.Tensor) -> torch.Tensor:
+        pairs = t.float().reshape(*t.shape[:-1], -1, 2)
+        rotated = torch.view_as_complex(pairs) * rotation
+        return torch.view_as_real(rotated).flatten(3).type_as(t)
+
+    return _rotate(xq), _rotate(xk)
+
+
+class RotaryAttention(nn.Module):
+    """Multi-head self-attention with rotary position embedding and optional context parallelism.
+
+    Queries and keys are normalised per head (when ``qk_norm`` is set) before RoPE is
+    applied. Once a context-parallel (CP) group is installed with
+    ``set_context_parallel_group``, ``forward`` treats ``x`` as this rank's slice of the
+    sequence: keys and values are gathered across the group while queries stay local.
+
+    Args:
+        dim: model width; must be divisible by ``num_heads``.
+        num_heads: number of attention heads.
+        qkv_bias: whether the fused QKV projection has a bias.
+        qk_norm: apply ``norm_layer(head_dim)`` to q and k; otherwise identity.
+        attn_drop: dropout probability stored in ``self.attn_drop``.
+        proj_drop: dropout probability applied after the output projection.
+        norm_layer: factory called with ``head_dim`` to build the q/k norms.
+
+    NOTE(review): ``self.scale`` and ``self.attn_drop`` are created here but are not read
+    in ``forward``; SDPA is called with ``dropout_p=0.0`` and no explicit scale.
+    """
+
+    def __init__(
+        self,
+        dim: int,
+        num_heads: int = 8,
+        qkv_bias: bool = False,
+        qk_norm: bool = True,
+        attn_drop: float = 0.0,
+        proj_drop: float = 0.0,
+        norm_layer: nn.Module = RMSNorm,
+    ) -> None:
+        super().__init__()
+        assert dim % num_heads == 0, "dim should be divisible by num_heads"
+
+        self.dim = dim
+        self.num_heads = num_heads
+        self.head_dim = dim // num_heads
+        self.scale = self.head_dim**-0.5
+
+        self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
+        self.q_norm = norm_layer(self.head_dim) if qk_norm else nn.Identity()
+        self.k_norm = norm_layer(self.head_dim) if qk_norm else nn.Identity()
+        self.attn_drop = nn.Dropout(attn_drop)
+        self.proj = nn.Linear(dim, dim)
+        self.proj_drop = nn.Dropout(proj_drop)
+        # Context-parallel group; when set, `forward` runs split-Q / gather-K,V.
+        self._cp_group: Optional[ProcessGroup] = None
+
+    def set_context_parallel_group(self, cp_group: Optional[ProcessGroup]):
+        """Install (or clear, with ``None``) the process group used for context parallelism."""
+        self._cp_group = cp_group
+
+    def forward(self, x: torch.Tensor, pos, mask) -> torch.Tensor:
+        """Run rotary self-attention.
+
+        Args:
+            x: ``[B, N, C]`` tokens; under CP this is the rank-local slice.
+            pos: complex RoPE factors whose first axis is the full sequence length.
+            mask: forwarded unchanged as ``attn_mask`` to scaled-dot-product attention.
+
+        Returns:
+            ``[B, N, C]`` tensor after the output projection and projection dropout.
+        """
+        # CP convention: caller passes `pos` of full sequence length (N_full).
+        # When `_cp_group` is set, `x` is the rank-local slice [B, N_local, C]
+        # with N_local = N_full / cp_size. We gather k/v to full length, apply
+        # RoPE with the appropriate slice/full pos, and run SDPA producing
+        # local-Q output [B, N_local, C].
+        B, N, C = x.shape
+        # Fused projection split into q, k, v, each [B, N, num_heads, head_dim].
+        qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, C // self.num_heads).permute(2, 0, 1, 3, 4)
+        q, k, v = qkv[0], qkv[1], qkv[2]
+        q = self.q_norm(q)
+        k = self.k_norm(k)
+        if self._cp_group is None:
+            q, k = apply_rotary_emb(q, k, freqs_cis=pos)
+        else:
+            cp_size = self._cp_group.size()
+            cp_rank = self._cp_group.rank()
+            N_full = pos.shape[0]
+            assert N_full % cp_size == 0, f"pos length {N_full} not divisible by cp_size {cp_size}"
+            N_local = N_full // cp_size
+            assert N == N_local, f"local x length {N} != expected {N_local}"
+            # Contiguous chunking: rank r owns positions [r * N_local, (r + 1) * N_local).
+            pos_local = pos.view(cp_size, N_local, -1)[cp_rank]
+            # Apply RoPE to local q with local pos.
+            q, _ = apply_rotary_emb(q, q, freqs_cis=pos_local)
+            # Gather k, v across CP ranks along the sequence dim, then RoPE with full pos.
+            # `all_gather` requires contiguous tensors; the qkv permute leaves k/v as non-contiguous views.
+            k = cat_outputs_cp_with_grad(k.contiguous(), seq_dim=1, cp_group=self._cp_group)
+            v = cat_outputs_cp_with_grad(v.contiguous(), seq_dim=1, cp_group=self._cp_group)
+            _, k = apply_rotary_emb(k, k, freqs_cis=pos)
+        # Move heads in front of the sequence axis: [B, num_heads, S, head_dim].
+        q = q.view(B, -1, self.num_heads, C // self.num_heads).transpose(1, 2)
+        k = k.view(B, -1, self.num_heads, C // self.num_heads).transpose(1, 2).contiguous()
+        v = v.view(B, -1, self.num_heads, C // self.num_heads).transpose(1, 2).contiguous()
+
+        x = scaled_dot_product_attention(q, k, v, attn_mask=mask, dropout_p=0.0)
+
+        # Merge heads back; N is the (local) query length, so the output matches the input shape.
+        x = x.transpose(1, 2).reshape(B, N, C)
+        x = self.proj(x)
+        x = self.proj_drop(x)
+        return x
+
+
+class MLP(nn.Module):
+    """Two-layer GELU perceptron with one shared dropout applied after the activation and after ``fc2``."""
+
+    def __init__(self, dim: int, mlp_ratio: float = 4.0, drop: float = 0.0):
+        super().__init__()
+        inner_dim = int(dim * mlp_ratio)
+        self.fc1 = nn.Linear(dim, inner_dim)
+        self.act = nn.GELU()
+        self.fc2 = nn.Linear(inner_dim, dim)
+        self.drop = nn.Dropout(drop)
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        hidden = self.drop(self.act(self.fc1(x)))
+        return self.drop(self.fc2(hidden))
+
+
+class FinalLayer(nn.Module):
+    """Output head: RMS-normalise the features, then project them to ``out_channels``."""
+
+    def __init__(self, hidden_size, out_channels):
+        super().__init__()
+        self.norm = RMSNorm(hidden_size, eps=1e-6)
+        self.linear = nn.Linear(hidden_size, out_channels, bias=True)
+
+    def forward(self, x):
+        return self.linear(self.norm(x))
+
+
+# =============================================================================
+# From pixdit_core/pixeldit_c2i.py (PatchTokenEmbedder, PixelTokenEmbedder, PiTBlock)
+# =============================================================================
+
+
+class PatchTokenEmbedder(nn.Module):
+    """Linear token embedder with an optional normalisation layer.
+
+    Args:
+        in_chans: size of the last axis of the input.
+        embed_dim: size of the last axis of the output.
+        norm_layer: optional factory called with ``embed_dim``; identity when falsy.
+        bias: whether the linear projection has a bias.
+    """
+
+    def __init__(
+        self,
+        in_chans: int = 3,
+        embed_dim: int = 768,
+        norm_layer=None,
+        bias: bool = True,
+    ):
+        super().__init__()
+        self.in_chans = in_chans
+        self.embed_dim = embed_dim
+        self.proj = nn.Linear(in_chans, embed_dim, bias=bias)
+        if norm_layer:
+            self.norm = norm_layer(embed_dim)
+        else:
+            self.norm = nn.Identity()
+
+    def forward(self, x):
+        return self.norm(self.proj(x))
+
+
+class PixelTokenEmbedder(nn.Module):
+    """Per-pixel linear embedding plus a fixed 2D sine/cosine positional table.
+
+    Positional tables are memoised in ``self._pos_cache`` (a plain dict, not a buffer)
+    keyed by mode and size. A cached table is converted with ``.to(device, dtype)`` on
+    every later fetch; the stored copy keeps the device/dtype of the first request.
+    """
+
+    def __init__(self, in_channels: int, hidden_size_output: int):
+        super().__init__()
+        self.in_channels = int(in_channels)
+        self.hidden_size_output = int(hidden_size_output)
+        self.proj = nn.Linear(self.in_channels, self.hidden_size_output, bias=True)
+        self._pos_cache = {}
+
+    def _fetch_pixel_pos_patch(self, patch_size: int, device, dtype):
+        """Return the ``[patch_size**2, D]`` within-patch positional table."""
+        cache_key = ("patch", patch_size)
+        cached = self._pos_cache.get(cache_key)
+        if cached is not None:
+            return cached.to(device=device, dtype=dtype)
+        table_np = get_2d_sincos_pos_embed(self.hidden_size_output, patch_size)
+        table = torch.from_numpy(table_np).to(device=device, dtype=dtype)  # [P2, D]
+        self._pos_cache[cache_key] = table
+        return table
+
+    def _fetch_pixel_pos_image(self, height: int, width: int, device, dtype):
+        """Return the ``[height*width, D]`` full-image positional table."""
+        cache_key = ("image", height, width)
+        cached = self._pos_cache.get(cache_key)
+        if cached is not None:
+            return cached.to(device=device, dtype=dtype)
+        if height == width:
+            table_np = get_2d_sincos_pos_embed(self.hidden_size_output, height)
+        else:
+            # Build a non-square grid (H x W) and compute 2D sin/cos embedding
+            rows = np.arange(height, dtype=np.float32)
+            cols = np.arange(width, dtype=np.float32)
+            # w first to match existing convention
+            mesh = np.stack(np.meshgrid(cols, rows), axis=0).reshape(2, 1, height, width)
+            table_np = get_2d_sincos_pos_embed_from_grid(self.hidden_size_output, mesh)
+        table = torch.from_numpy(table_np).to(device=device, dtype=dtype)  # [H*W, D]
+        self._pos_cache[cache_key] = table
+        return table
+
+    def forward(self, inputs: torch.Tensor, img_height: int = None, img_width: int = None, patch_size: int = None):
+        """Embed pixels and add positional information.
+
+        Two input layouts are accepted:
+          * 3-D ``[B*L, P2, C]`` (legacy patch mode): the within-patch table is added.
+          * 4-D ``[B, C, H, W]`` (image mode): the full-image table is added at pixel
+            resolution, then the image is cut into patches and returned as ``[B*L, P2, D]``.
+            ``img_height``, ``img_width`` and ``patch_size`` are required in this mode.
+
+        Raises:
+            ValueError: when ``inputs`` is neither 3-D nor 4-D.
+        """
+        rank = inputs.dim()
+        if rank == 3:
+            # Legacy: [B*L, P2, C]
+            _, tokens_per_patch, _ = inputs.shape
+            side = int(tokens_per_patch**0.5)
+            patch_pos = self._fetch_pixel_pos_patch(side, inputs.device, inputs.dtype)  # [P2, D]
+            return self.proj(inputs) + patch_pos.unsqueeze(0)
+
+        if rank != 4:
+            raise ValueError("PixelTokenEmbedder expects inputs of shape [B*L,P2,C] or [B,C,H,W]")
+
+        # Image mode: [B, C, H, W]
+        assert img_height is not None and img_width is not None and patch_size is not None, (
+            "Need H, W, patch_size for image mode"
+        )
+        B, C, H, W = inputs.shape
+        assert H == img_height and W == img_width, "Input spatial size mismatch"
+        assert (H % patch_size == 0) and (W % patch_size == 0), "H and W must be divisible by patch_size"
+        rows_of_patches, cols_of_patches = H // patch_size, W // patch_size
+        width_out = self.hidden_size_output
+        # Channels-last, then one linear projection per pixel: [B, H, W, D].
+        embedded = self.proj(inputs.permute(0, 2, 3, 1).contiguous())
+        # Full-image pixel-space positions, added on the image grid before patchifying.
+        image_pos = self._fetch_pixel_pos_image(H, W, inputs.device, inputs.dtype).view(H, W, width_out)
+        embedded = embedded + image_pos.unsqueeze(0)
+        embedded = embedded.view(B, rows_of_patches, patch_size, cols_of_patches, patch_size, width_out)
+        embedded = embedded.permute(0, 1, 3, 2, 4, 5).contiguous()  # [B, Hs, Ws, ps, ps, D]
+        return embedded.view(B * rows_of_patches * cols_of_patches, patch_size * patch_size, width_out)
+
+
+class PiTBlock(nn.Module):
+    """Pixel-level transformer block conditioned per patch.
+
+    Each patch's ``P2 = patch_size**2`` pixel tokens are flattened and compressed to one
+    attention token, attention runs across patches, and the result is expanded back to
+    pixel tokens. Both the attention branch and the pixel-wise MLP branch are modulated
+    and gated by parameters produced from ``s_cond`` via ``adaLN_modulation``.
+
+    Args:
+        pixel_hidden_size: channel width of each pixel token.
+        patch_hidden_size: width of the conditioning vector ``s_cond``; also the default
+            attention width.
+        patch_size: side length of a patch in pixels.
+        num_heads: attention heads, used when ``attn_num_heads`` is ``None``.
+        mlp_ratio: hidden-width multiplier of the pixel MLP.
+        attn_hidden_size: optional override for the attention width.
+        attn_num_heads: optional override for the number of attention heads.
+        rope_mode: ``"ntk_aware"`` selects the NTK RoPE table; any other value uses the plain one.
+        rope_ref_grid_h: reference grid height for the NTK table.
+        rope_ref_grid_w: reference grid width for the NTK table.
+    """
+
+    def __init__(
+        self,
+        pixel_hidden_size: int,
+        patch_hidden_size: int,
+        patch_size: int,
+        num_heads: int,
+        mlp_ratio: float = 4.0,
+        attn_hidden_size: Optional[int] = None,
+        attn_num_heads: Optional[int] = None,
+        rope_mode: str = "original",
+        rope_ref_grid_h: int = 32,
+        rope_ref_grid_w: int = 32,
+    ):
+        super().__init__()
+        self.pixel_dim = int(pixel_hidden_size)
+        self.context_dim = int(patch_hidden_size)
+        self.patch_size = int(patch_size)
+        self.attn_dim = int(attn_hidden_size) if attn_hidden_size is not None else self.context_dim
+        self.num_heads = int(attn_num_heads) if attn_num_heads is not None else int(num_heads)
+        self.rope_mode = rope_mode
+        self.rope_ref_grid_h = rope_ref_grid_h
+        self.rope_ref_grid_w = rope_ref_grid_w
+        assert self.attn_dim % self.num_heads == 0, "pixel attention hidden size must be divisible by pixel num_heads"
+        p2 = self.patch_size * self.patch_size
+        # One patch (p2 pixel tokens) <-> one attention token.
+        self.compress_to_attn = nn.Linear(p2 * self.pixel_dim, self.attn_dim, bias=True)
+        self.expand_from_attn = nn.Linear(self.attn_dim, p2 * self.pixel_dim, bias=True)
+        self.norm1 = RMSNorm(self.pixel_dim, eps=1e-6)
+        self.attn = RotaryAttention(self.attn_dim, num_heads=self.num_heads, qkv_bias=False)
+        self.norm2 = RMSNorm(self.pixel_dim, eps=1e-6)
+        self.mlp = MLP(self.pixel_dim, mlp_ratio=mlp_ratio, drop=0.0)
+        # Produces six modulation tensors (shift/scale/gate for attention and MLP) per pixel.
+        self.adaLN_modulation = nn.Sequential(nn.Linear(self.context_dim, 6 * self.pixel_dim * p2, bias=True))
+        # RoPE tables memoised by patch-grid size (plain dict, not a registered buffer).
+        self._pos_cache = {}
+        # CP group; when set, the attention runs split-Q / gather-K,V across L.
+        self._cp_group: Optional[ProcessGroup] = None
+
+    def set_context_parallel_group(self, cp_group: Optional[ProcessGroup]):
+        """Store the CP group on this block and forward it to the inner attention."""
+        self._cp_group = cp_group
+        self.attn.set_context_parallel_group(cp_group)
+
+    def _fetch_pos(self, height: int, width: int, device):
+        """Return the RoPE table for a ``height x width`` patch grid, building and caching it on first use."""
+        key = (height, width)
+        if key in self._pos_cache:
+            return self._pos_cache[key].to(device)
+        head_dim = self.attn_dim // self.num_heads
+        if self.rope_mode == "ntk_aware":
+            pos = precompute_freqs_cis_2d_ntk(head_dim, height, width, self.rope_ref_grid_h, self.rope_ref_grid_w).to(
+                device
+            )
+        else:
+            pos = precompute_freqs_cis_2d(head_dim, height, width).to(device)
+        self._pos_cache[key] = pos
+        return pos
+
+    def forward(
+        self, x: torch.Tensor, s_cond: torch.Tensor, image_height: int, image_width: int, patch_size: int, mask=None
+    ) -> torch.Tensor:
+        """Apply patch-level attention and a pixel-wise MLP to the pixel tokens.
+
+        Args:
+            x: ``[B*L_local, P2, pixel_dim]`` pixel tokens grouped by patch.
+            s_cond: per-patch conditioning; its first axis must equal ``B*L_local``.
+            image_height: image height in pixels; must be divisible by ``patch_size``.
+            image_width: image width in pixels; must be divisible by ``patch_size``.
+            patch_size: must equal the ``patch_size`` given at construction.
+            mask: forwarded to the inner attention.
+
+        Returns:
+            Tensor with the same shape as ``x``.
+
+        Raises:
+            ValueError: when the last axis of ``x`` differs from ``pixel_dim``.
+        """
+        # x: [B*L_local, P2, C]; under CP, L_local = (Hs*Ws)/cp_size. Without CP,
+        # L_local == L_full. The reshape uses L_local for the (B, L_local, ...)
+        # axis; the inner attention all-gathers k/v back to full length.
+        BL, P2, C = x.shape
+        if C != self.pixel_dim:
+            raise ValueError(f"PiTBlock expected pixel_dim={self.pixel_dim}, got {C}")
+        assert patch_size == self.patch_size, "PiTBlock expects fixed patch_size"
+        assert P2 == patch_size * patch_size, "Token count per patch must equal patch_size^2"
+        assert (image_height % patch_size == 0) and (image_width % patch_size == 0), (
+            "H and W must be divisible by patch_size"
+        )
+        Hs, Ws = image_height // patch_size, image_width // patch_size
+        L = Hs * Ws
+        cp_size = self._cp_group.size() if self._cp_group is not None else 1
+        assert L % cp_size == 0, f"L={L} not divisible by cp_size={cp_size}"
+        L_local = L // cp_size
+        assert s_cond.shape[0] == BL, "s_cond batch must match x batch"
+        assert BL % L_local == 0, "Total sequences must be a multiple of local patch count"
+        # Batch size is recovered from the flattened leading axis.
+        B = BL // L_local
+        # adaLN per pixel (within patch): params
+        cond_params = self.adaLN_modulation(s_cond)  # [BL, 6*pixel_dim*P2]
+        cond_params = cond_params.view(BL, P2, 6 * self.pixel_dim)
+        shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = torch.chunk(cond_params, 6, dim=-1)
+        x_norm = apply_adaln(self.norm1(x), shift_msa, scale_msa)
+        # Flatten each patch's pixels and compress them to one attention token.
+        x_flat = x_norm.view(BL, P2 * self.pixel_dim)
+        x_comp = self.compress_to_attn(x_flat).view(B, L_local, self.attn_dim)
+        # attention across patch tokens (L) — pos is full-length; the CP-aware
+        # RotaryAttention gathers k/v across CP ranks internally.
+        pos_comp = self._fetch_pos(Hs, Ws, x.device)
+        attn_out = self.attn(x_comp, pos_comp, mask)  # [B, L_local, attn_dim]
+        # Expand each attention token back to its patch's P2 pixel tokens.
+        attn_flat = self.expand_from_attn(attn_out.view(B * L_local, self.attn_dim))
+        attn_exp = attn_flat.view(BL, P2, self.pixel_dim)
+        # residual & MLP locally
+        x = x + gate_msa * attn_exp
+        mlp_out = self.mlp(apply_adaln(self.norm2(x), shift_mlp, scale_mlp))
+        x = x + gate_mlp * mlp_out
+        return x
+
+
+# =============================================================================
+# From pixdit_core/pixeldit_t2i.py
+# =============================================================================
+
+
+class MMDiTJointAttention(nn.Module):
+    """Joint attention over a text stream and an image stream with per-stream projections.
+
+    Each stream has its own QKV projection, head-wise q/k RMSNorm and output projection.
+    The two streams are concatenated as ``[text, image]`` along the sequence axis for a
+    single scaled-dot-product attention call and split again afterwards.
+
+    Args:
+        dim: model width shared by both streams; must be divisible by ``num_heads``.
+        num_heads: number of attention heads.
+        qkv_bias: whether the QKV projections have a bias.
+        attn_drop: dropout probability stored in ``self.attn_drop``.
+        proj_drop: dropout probability applied after each output projection.
+
+    NOTE(review): ``self.attn_drop`` is not used in ``forward``; SDPA is called with
+    ``dropout_p=0.0``.
+    """
+
+    def __init__(
+        self,
+        dim: int,
+        num_heads: int = 8,
+        qkv_bias: bool = False,
+        attn_drop: float = 0.0,
+        proj_drop: float = 0.0,
+    ) -> None:
+        super().__init__()
+        assert dim % num_heads == 0, "dim should be divisible by num_heads"
+        self.dim = dim
+        self.num_heads = num_heads
+        self.head_dim = dim // num_heads
+
+        # Separate QKV projections for image (x) and text (y) streams
+        self.qkv_x = nn.Linear(dim, dim * 3, bias=qkv_bias)
+        self.qkv_y = nn.Linear(dim, dim * 3, bias=qkv_bias)
+
+        # Per-stream QK normalization (head-wise)
+        self.q_norm_x = RMSNorm(self.head_dim)
+        self.k_norm_x = RMSNorm(self.head_dim)
+        self.q_norm_y = RMSNorm(self.head_dim)
+        self.k_norm_y = RMSNorm(self.head_dim)
+
+        # Output projections for each stream
+        self.proj_x = nn.Linear(dim, dim)
+        self.proj_y = nn.Linear(dim, dim)
+        self.attn_drop = nn.Dropout(attn_drop)
+        self.proj_drop_x = nn.Dropout(proj_drop)
+        self.proj_drop_y = nn.Dropout(proj_drop)
+        # CP group for the image stream. Text is replicated across CP ranks.
+        self._cp_group: Optional[ProcessGroup] = None
+
+    def set_context_parallel_group(self, cp_group: Optional[ProcessGroup]):
+        """Install (or clear, with ``None``) the process group used for context parallelism."""
+        self._cp_group = cp_group
+
+    def forward(
+        self,
+        x: torch.Tensor,  # [B, Nx, C] image stream (Nx = Nx_local under CP)
+        y: torch.Tensor,  # [B, Ny, C] text stream (always full / replicated)
+        pos_img: torch.Tensor,  # [Nx_full, head_dim/2] complex RoPE freqs (always full)
+        pos_txt: torch.Tensor = None,  # [Ny, head_dim/2] complex RoPE freqs for text (optional)
+        attn_mask: torch.Tensor = None,
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        """Attend jointly over text and image tokens.
+
+        Under CP the image queries are rank-local while image keys/values are gathered
+        to full length, so the joint query length is ``Ny + Nx_local`` and the joint
+        key length is ``Ny + Nx_full``.
+
+        Args:
+            x: image tokens, see inline shape comment.
+            y: text tokens; must share batch and channel sizes with ``x``.
+            pos_img: RoPE factors for the full image sequence.
+            pos_txt: optional RoPE factors for the text tokens; skipped when ``None``.
+            attn_mask: forwarded unchanged to SDPA.
+                NOTE(review): presumably must broadcast to the joint query/key lengths
+                above — confirm against callers.
+
+        Returns:
+            ``(out_x, out_y)`` with shapes ``[B, Nx, C]`` and ``[B, Ny, C]``.
+        """
+        B, Nx, C = x.shape
+        By, Ny, Cy = y.shape
+        assert B == By and C == Cy, "x and y must share batch and channel dims"
+
+        # QKV for image
+        qkv_x = self.qkv_x(x).reshape(B, Nx, 3, self.num_heads, C // self.num_heads).permute(2, 0, 1, 3, 4)
+        qx, kx, vx = qkv_x[0], qkv_x[1], qkv_x[2]  # [B, Nx, H, Hc]
+        qx = self.q_norm_x(qx)
+        kx = self.k_norm_x(kx)
+
+        # QKV for text
+        qkv_y = self.qkv_y(y).reshape(B, Ny, 3, self.num_heads, C // self.num_heads).permute(2, 0, 1, 3, 4)
+        qy, ky, vy = qkv_y[0], qkv_y[1], qkv_y[2]  # [B, Ny, H, Hc]
+        qy = self.q_norm_y(qy)
+        ky = self.k_norm_y(ky)
+
+        # Image RoPE — under CP, q uses the rank-local slice of pos_img, k (after
+        # all-gather along the sequence dim) uses the full pos_img.
+        if self._cp_group is None:
+            qx, kx = apply_rotary_emb(qx, kx, freqs_cis=pos_img)
+        else:
+            cp_size = self._cp_group.size()
+            cp_rank = self._cp_group.rank()
+            Nx_full = pos_img.shape[0]
+            assert Nx_full % cp_size == 0, f"pos_img length {Nx_full} not divisible by cp_size {cp_size}"
+            Nx_local = Nx_full // cp_size
+            assert Nx == Nx_local, f"local image stream length {Nx} != expected {Nx_local}"
+            # Contiguous chunking: rank r owns positions [r * Nx_local, (r + 1) * Nx_local).
+            pos_img_local = pos_img.view(cp_size, Nx_local, -1)[cp_rank]
+            qx, _ = apply_rotary_emb(qx, qx, freqs_cis=pos_img_local)
+            # `all_gather` requires contiguous tensors; the qkv permute leaves k/v as non-contiguous views.
+            kx = cat_outputs_cp_with_grad(kx.contiguous(), seq_dim=1, cp_group=self._cp_group)
+            vx = cat_outputs_cp_with_grad(vx.contiguous(), seq_dim=1, cp_group=self._cp_group)
+            _, kx = apply_rotary_emb(kx, kx, freqs_cis=pos_img)
+        if pos_txt is not None:
+            qy, ky = apply_rotary_emb(qy, ky, freqs_cis=pos_txt)
+
+        # SDPA expects [B, H, S, Hc]; build joint sequence [text, image].
+        # Under CP: qx is [B, H, Nx_local, Hc]; kx, vx are [B, H, Nx_full, Hc].
+        qx = qx.transpose(1, 2)
+        kx = kx.transpose(1, 2)
+        vx = vx.transpose(1, 2)
+
+        qy = qy.transpose(1, 2)  # [B, H, Ny, Hc]
+        ky = ky.transpose(1, 2)
+        vy = vy.transpose(1, 2)
+
+        q_joint = torch.cat([qy, qx], dim=2)  # [B, H, Ny + Nx_local, Hc]
+        k_joint = torch.cat([ky, kx], dim=2)  # [B, H, Ny + Nx_full, Hc]
+        v_joint = torch.cat([vy, vx], dim=2)
+
+        out_joint = F.scaled_dot_product_attention(q_joint, k_joint, v_joint, dropout_p=0.0, attn_mask=attn_mask)
+        # Split back to [text, image]; image output is local under CP.
+        out_y = out_joint[:, :, :Ny, :]
+        out_x = out_joint[:, :, Ny:, :]
+
+        # Merge heads
+        out_y = out_y.transpose(1, 2).reshape(B, Ny, C)
+        out_x = out_x.transpose(1, 2).reshape(B, Nx, C)
+
+        # Output projections
+        out_x = self.proj_drop_x(self.proj_x(out_x))
+        out_y = self.proj_drop_y(self.proj_y(out_y))
+        return out_x, out_y
+
+
+class MMDiTBlockT2I(nn.Module):
+    """Dual-stream (image ``x`` / text ``y``) transformer block with joint attention.
+
+    Each stream has its own RMSNorms, gated feed-forward and AdaLN modulation head; the
+    two streams interact only inside the shared ``MMDiTJointAttention``. Modulation heads
+    may be injected through the constructor; otherwise a fresh
+    ``Linear(hidden_size, 6 * hidden_size)`` is created for each stream.
+    """
+
+    def __init__(self, hidden_size, groups, mlp_ratio=4.0, adaLN_modulation_img=None, adaLN_modulation_txt=None):
+        super().__init__()
+        self.hidden_size = hidden_size
+        self.groups = groups
+        self.head_dim = hidden_size // groups
+
+        # Pre-attention norms, one per stream.
+        self.norm_x1 = RMSNorm(hidden_size, eps=1e-6)
+        self.norm_y1 = RMSNorm(hidden_size, eps=1e-6)
+
+        self.attn = MMDiTJointAttention(hidden_size, num_heads=groups, qkv_bias=False)
+
+        # Pre-MLP norms, one per stream.
+        self.norm_x2 = RMSNorm(hidden_size, eps=1e-6)
+        self.norm_y2 = RMSNorm(hidden_size, eps=1e-6)
+
+        ff_hidden = int(hidden_size * mlp_ratio)
+        self.mlp_x = FeedForward(hidden_size, ff_hidden)
+        self.mlp_y = FeedForward(hidden_size, ff_hidden)
+
+        # AdaLN heads: use the injected module when given, else build a default one.
+        if adaLN_modulation_img is None:
+            adaLN_modulation_img = nn.Sequential(nn.Linear(hidden_size, 6 * hidden_size, bias=True))
+        self.adaLN_modulation_img = adaLN_modulation_img
+        if adaLN_modulation_txt is None:
+            adaLN_modulation_txt = nn.Sequential(nn.Linear(hidden_size, 6 * hidden_size, bias=True))
+        self.adaLN_modulation_txt = adaLN_modulation_txt
+
+    def set_context_parallel_group(self, cp_group: Optional[ProcessGroup]):
+        """Forward the CP group to the joint attention, the only CP-aware submodule of this block."""
+        self.attn.set_context_parallel_group(cp_group)
+
+    def forward(self, x, y, c, pos_img, pos_txt=None, attn_mask=None):
+        """Run joint attention and the per-stream feed-forwards; returns the updated ``(x, y)``."""
+        # c: [B, 1, C] typically, broadcast across tokens
+        img_shift_a, img_scale_a, img_gate_a, img_shift_m, img_scale_m, img_gate_m = self.adaLN_modulation_img(
+            c
+        ).chunk(6, dim=-1)
+        txt_shift_a, txt_scale_a, txt_gate_a, txt_shift_m, txt_scale_m, txt_gate_m = self.adaLN_modulation_txt(
+            c
+        ).chunk(6, dim=-1)
+
+        # Stage 1: modulated joint attention, gated residual per stream.
+        x_in = apply_adaln(self.norm_x1(x), img_shift_a, img_scale_a)
+        y_in = apply_adaln(self.norm_y1(y), txt_shift_a, txt_scale_a)
+        x_attn, y_attn = self.attn(x_in, y_in, pos_img, pos_txt, attn_mask)
+        x = x + img_gate_a * x_attn
+        y = y + txt_gate_a * y_attn
+
+        # Stage 2: modulated feed-forward, gated residual per stream.
+        x_ff = self.mlp_x(apply_adaln(self.norm_x2(x), img_shift_m, img_scale_m))
+        x = x + img_gate_m * x_ff
+        y_ff = self.mlp_y(apply_adaln(self.norm_y2(y), txt_shift_m, txt_scale_m))
+        y = y + txt_gate_m * y_ff
+        return x, y
+
+
+def _compute_num_stages_from_ratio(compress_ratio: int) -> int:
+    """Return ``log2(compress_ratio)`` for a power-of-two ratio, or 0 when the ratio is <= 1.
+
+    Raises:
+        ValueError: when ``compress_ratio`` is greater than 1 and not a power of two.
+    """
+    if compress_ratio > 1:
+        # A power of two has exactly one bit set, so n & (n - 1) clears it to zero.
+        is_power_of_two = (compress_ratio & (compress_ratio - 1)) == 0
+        if not is_power_of_two:
+            raise ValueError(f"ed_compress_ratio must be power of 2, got {compress_ratio}")
+        return int(math.log2(compress_ratio))
+    return 0
+
+
+class _TransformerBlock(nn.Module):
+    """AdaLN-modulated transformer block with optional token-shuffle compression around attention.
+
+    When ``use_token_compression`` is true and ``token_shuffle_window_size > 1``, each
+    ``s x s`` window of tokens is merged into one token before attention and expanded
+    back afterwards; otherwise attention runs on the tokens as given.
+
+    Args:
+        dim: token width.
+        num_heads: attention heads.
+        mlp_ratio: hidden-width multiplier of the MLP.
+        drop: dropout probability inside the MLP.
+        use_token_compression: enable the token-shuffle path.
+        token_shuffle_window_size: window side ``s``; ignored (treated as 1) when
+            compression is disabled.
+        rope_mode: ``"ntk_aware"`` selects the NTK RoPE table on the compressed grid.
+        rope_ref_grid_h: reference grid height for the NTK table.
+        rope_ref_grid_w: reference grid width for the NTK table.
+    """
+
+    def __init__(
+        self,
+        dim: int,
+        num_heads: int,
+        mlp_ratio: float = 4.0,
+        drop: float = 0.0,
+        use_token_compression: bool = False,
+        token_shuffle_window_size: int = 1,
+        rope_mode: str = "original",
+        rope_ref_grid_h: int = 32,
+        rope_ref_grid_w: int = 32,
+    ):
+        super().__init__()
+        self.dim = dim
+        self.num_heads = num_heads
+        self.rope_mode = rope_mode
+        self.rope_ref_grid_h = rope_ref_grid_h
+        self.rope_ref_grid_w = rope_ref_grid_w
+        self.norm1 = RMSNorm(dim, eps=1e-6)
+        self.attn = RotaryAttention(dim, num_heads=num_heads, qkv_bias=False)
+        self.norm2 = RMSNorm(dim, eps=1e-6)
+        self.mlp = MLP(dim, mlp_ratio=mlp_ratio, drop=drop)
+        self.adaLN_modulation = nn.Sequential(nn.Linear(dim, 6 * dim, bias=True))
+        self.use_token_compression = bool(use_token_compression)
+        ts_ws = int(token_shuffle_window_size) if self.use_token_compression else 1
+
+        if self.use_token_compression and ts_ws > 1:
+            # The helper modules below are defined locally so they can close over
+            # `dim` and `ts_ws` from this constructor call.
+
+            class _AttnTokenShuffleCompression(nn.Module):
+                """Merge each ``s x s`` window of tokens into one token of width ``dim``."""
+
+                def __init__(self):
+                    super().__init__()
+                    s2 = ts_ws * ts_ws
+                    # Round `dim` up to a multiple of s^2 so it splits evenly per window slot.
+                    adapted_hidden = ((dim + s2 - 1) // s2) * s2
+                    needs_adapter_in = adapted_hidden != dim
+                    compressed_dim = adapted_hidden // s2
+                    self.s = ts_ws
+                    self.adapted_hidden = adapted_hidden
+                    self.compressed_dim = compressed_dim
+                    self.adapter_in = (
+                        nn.Sequential(nn.Linear(dim, adapted_hidden, bias=True), nn.GELU())
+                        if needs_adapter_in
+                        else nn.Identity()
+                    )
+                    self.proj_down = nn.Linear(adapted_hidden, compressed_dim, bias=True)
+                    self.proj_to_attn = (
+                        nn.Identity() if adapted_hidden == dim else nn.Linear(adapted_hidden, dim, bias=True)
+                    )
+
+                def forward(self, x: torch.Tensor, height: int, width: int) -> torch.Tensor:
+                    """``[B, height*width, C]`` -> ``[B, (height//s)*(width//s), dim]``."""
+                    B, N, C = x.shape
+                    assert N == height * width, f"Token count {N} != {height}*{width}"
+                    s = self.s
+                    assert height % s == 0 and width % s == 0, (
+                        f"Height {height} and Width {width} must be divisible by token shuffle size {s}"
+                    )
+                    x = x.view(B, height, width, C)
+                    x = self.adapter_in(x)
+                    x = self.proj_down(x)
+                    c_per = self.compressed_dim
+                    # Group the s*s tokens of each window and concatenate their channels.
+                    x = x.view(B, height // s, s, width // s, s, c_per)
+                    x = x.permute(0, 1, 3, 2, 4, 5).contiguous()
+                    x = x.view(B, (height // s) * (width // s), s * s * c_per)
+                    x = self.proj_to_attn(x)
+                    return x
+
+            class _AttnTokenShuffleExpansion(nn.Module):
+                """Inverse of the compression: split each token back into an ``s x s`` window."""
+
+                def __init__(self):
+                    super().__init__()
+                    s2 = ts_ws * ts_ws
+                    adapted_hidden = ((dim + s2 - 1) // s2) * s2
+                    needs_adapter_out = adapted_hidden != dim
+                    compressed_dim = adapted_hidden // s2
+                    self.s = ts_ws
+                    self.adapted_hidden = adapted_hidden
+                    self.compressed_dim = compressed_dim
+                    self.proj_from_attn = (
+                        nn.Identity() if adapted_hidden == dim else nn.Linear(dim, adapted_hidden, bias=True)
+                    )
+                    self.proj_up = nn.Sequential(nn.Linear(compressed_dim, adapted_hidden, bias=True), nn.GELU())
+                    self.adapter_out = (
+                        nn.Sequential(nn.Linear(adapted_hidden, dim, bias=True), nn.GELU())
+                        if needs_adapter_out
+                        else nn.Identity()
+                    )
+
+                def forward(self, x: torch.Tensor, height: int, width: int) -> torch.Tensor:
+                    """``[B, (height//s)*(width//s), C]`` -> ``[B, height*width, dim]``."""
+                    B, Np, C = x.shape
+                    s = self.s
+                    Hs, Ws = height // s, width // s
+                    assert Np == Hs * Ws, f"Token count {Np} != {Hs}*{Ws}"
+                    x = self.proj_from_attn(x)
+                    c_per = self.compressed_dim
+                    # Split channels into s*s per-slot chunks, project each chunk up separately.
+                    x = x.view(B, Hs, Ws, s, s, c_per)
+                    x_flat = x.reshape(B * Hs * Ws * s * s, c_per)
+                    x_expanded = self.proj_up(x_flat)
+                    x_expanded = x_expanded.view(B, Hs, Ws, s, s, self.adapted_hidden)
+                    # Interleave window slots back into the full-resolution grid.
+                    x_expanded = x_expanded.permute(0, 1, 3, 2, 4, 5).contiguous()
+                    x_expanded = x_expanded.view(B, Hs * s, Ws * s, self.adapted_hidden)
+                    x_expanded = self.adapter_out(x_expanded)
+                    x_expanded = x_expanded.view(B, height * width, dim)
+                    return x_expanded
+
+            self._ts_compress = _AttnTokenShuffleCompression()
+            self._ts_expand = _AttnTokenShuffleExpansion()
+        else:
+            self._ts_compress = None
+            self._ts_expand = None
+
+    def forward(
+        self,
+        x: torch.Tensor,
+        c: torch.Tensor,
+        pos: torch.Tensor,
+        mask: Optional[torch.Tensor] = None,
+        height: Optional[int] = None,
+        width: Optional[int] = None,
+    ) -> torch.Tensor:
+        """Apply modulated attention and MLP with gated residuals.
+
+        Args:
+            x: ``[B, N, dim]`` tokens.
+            c: conditioning fed to ``adaLN_modulation``; its output is chunked into six
+                shift/scale/gate tensors along the last axis.
+            pos: RoPE table used on the uncompressed path only.
+            mask: forwarded to the attention.
+            height: token-grid height; needed (with ``width``) to take the compressed path.
+            width: token-grid width.
+
+        Returns:
+            Tensor with the same shape as ``x``.
+
+        NOTE(review): on the compressed path ``pos`` is ignored and the RoPE table for
+        the reduced grid is rebuilt on every call rather than cached.
+        """
+        shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = self.adaLN_modulation(c).chunk(6, dim=-1)
+        use_ts = (
+            self.use_token_compression
+            and self._ts_compress is not None
+            and self._ts_expand is not None
+            and height is not None
+            and width is not None
+        )
+        if use_ts:
+            x_norm = apply_adaln(self.norm1(x), shift_msa, scale_msa)
+            x_comp = self._ts_compress(x_norm, height, width)
+            s = self._ts_compress.s
+            Hs, Ws = height // s, width // s
+            head_dim = self.dim // self.num_heads
+            if self.rope_mode == "ntk_aware":
+                pos_comp = precompute_freqs_cis_2d_ntk(head_dim, Hs, Ws, self.rope_ref_grid_h, self.rope_ref_grid_w).to(
+                    x.device
+                )
+            else:
+                pos_comp = precompute_freqs_cis_2d(head_dim, Hs, Ws).to(x.device)
+            attn_out = self.attn(x_comp, pos_comp, mask)
+            attn_out = self._ts_expand(attn_out, height, width)
+            x = x + gate_msa * attn_out
+        else:
+            attn_out = self.attn(apply_adaln(self.norm1(x), shift_msa, scale_msa), pos, mask)
+            x = x + gate_msa * attn_out
+        x = x + gate_mlp * self.mlp(apply_adaln(self.norm2(x), shift_mlp, scale_mlp))
+        return x
+
+
+class _PatchMerging(nn.Module):
+    """Downsample a token grid by merging each ``window_size x window_size`` window into one token.
+
+    ``hidden_size`` is rounded up to a multiple of ``window_size**2`` (``adapted_hidden``)
+    so every window slot contributes an equal channel share; adapter layers are only
+    created when that rounding changes the width.
+    """
+
+    def __init__(self, hidden_size: int, window_size: int = 2):
+        super().__init__()
+        self.hidden_size = hidden_size
+        self.window_size = int(window_size)
+        window_area = self.window_size * self.window_size
+        self.adapted_hidden = ((hidden_size + window_area - 1) // window_area) * window_area
+        self.needs_adapter = self.adapted_hidden != hidden_size
+        if self.needs_adapter:
+            self.adapter_in = nn.Sequential(nn.Linear(hidden_size, self.adapted_hidden, bias=True), nn.GELU())
+        else:
+            self.adapter_in = nn.Identity()
+        self.compressed_dim = self.adapted_hidden // window_area
+        self.proj_down = nn.Linear(self.adapted_hidden, self.compressed_dim, bias=True)
+        if self.adapted_hidden == hidden_size:
+            self.proj_to_hidden = nn.Identity()
+        else:
+            self.proj_to_hidden = nn.Sequential(nn.Linear(self.adapted_hidden, hidden_size, bias=True), nn.GELU())
+
+    def forward(self, x: torch.Tensor, height: int, width: int):
+        """Merge windows of ``x`` (``[B, height*width, C]``).
+
+        Returns:
+            ``(tokens, height // s, width // s)`` where ``tokens`` has
+            ``(height // s) * (width // s)`` entries per batch element.
+        """
+        batch, num_tokens, channels = x.shape
+        assert num_tokens == height * width, f"Token count {num_tokens} doesn't match H*W={height * width}"
+        win = self.window_size
+        assert height % win == 0 and width % win == 0, f"H and W must be divisible by {win}"
+        out_h, out_w = height // win, width // win
+        per_slot = self.compressed_dim
+
+        grid = self.proj_down(self.adapter_in(x.view(batch, height, width, channels)))
+        # Bring the two in-window axes together so each window's slots are adjacent.
+        grid = grid.view(batch, out_h, win, out_w, win, per_slot).permute(0, 1, 3, 2, 4, 5).contiguous()
+        merged = grid.view(batch, out_h * out_w, win * win * per_slot)
+        return self.proj_to_hidden(merged), out_h, out_w
+
+
+class _PatchExpanding(nn.Module):
+ """Enlarge a token grid by ``window_size`` per axis while keeping ``hidden_size`` channels.
+
+ Each input token is split into ``window_size**2`` channel slices; every slice is projected
+ up to ``adapted_hidden`` channels and placed at its own position inside the window.
+ ``adapted_hidden`` is ``hidden_size`` rounded up to a multiple of ``window_size**2``.
+ """
+
+ def __init__(self, hidden_size: int, window_size: int = 2):
+ super().__init__()
+ self.hidden_size = hidden_size
+ self.window_size = int(window_size)
+ s2 = self.window_size * self.window_size
+ # Round hidden_size up to the next multiple of s2 so it splits evenly across a window.
+ self.adapted_hidden = ((hidden_size + s2 - 1) // s2) * s2
+ self.needs_adapter = self.adapted_hidden != hidden_size
+ # Unlike the other adapters in this class, this projection has no activation after it.
+ self.proj_from_hidden = (
+ nn.Identity()
+ if self.adapted_hidden == hidden_size
+ else nn.Linear(hidden_size, self.adapted_hidden, bias=True)
+ )
+ self.compressed_dim = self.adapted_hidden // s2
+ self.proj_up = nn.Sequential(nn.Linear(self.compressed_dim, self.adapted_hidden, bias=True), nn.GELU())
+ self.adapter_out = (
+ nn.Sequential(nn.Linear(self.adapted_hidden, hidden_size, bias=True), nn.GELU())
+ if self.needs_adapter
+ else nn.Identity()
+ )
+
+ def forward(self, x: torch.Tensor, height: int, width: int):
+ """Expand every token into a ``window_size x window_size`` window of tokens.
+
+ Args:
+ x: Tokens of shape [B, height * width, C] on the coarse grid.
+ height: Height of the coarse (input) grid.
+ width: Width of the coarse (input) grid.
+
+ Returns:
+ Tuple ``(tokens, height * window_size, width * window_size)`` where ``tokens`` has
+ shape [B, (height * window_size) * (width * window_size), hidden_size].
+ """
+ # Np and C are unpacked for readability only; they are not used below.
+ B, Np, C = x.shape
+ Hs, Ws = height, width
+ s = self.window_size
+ x = self.proj_from_hidden(x)
+ c_per = self.adapted_hidden // (s * s)
+ # Interpret each token's channels as an s x s window of c_per-channel sub-tokens.
+ x = x.view(B, Hs, Ws, s, s, c_per)
+ x_flat = x.reshape(B * Hs * Ws * s * s, c_per)
+ x_expanded = self.proj_up(x_flat)
+ x_expanded = x_expanded.view(B, Hs, Ws, s, s, self.adapted_hidden)
+ # Interleave window offsets with coarse indices: (B, Hs, s, Ws, s, C) -> fine grid rows/cols.
+ x_expanded = x_expanded.permute(0, 1, 3, 2, 4, 5).contiguous()
+ x_expanded = x_expanded.view(B, Hs * s, Ws * s, self.adapted_hidden)
+ x_expanded = self.adapter_out(x_expanded)
+ x_expanded = x_expanded.view(B, (Hs * s) * (Ws * s), self.hidden_size)
+ return x_expanded, Hs * s, Ws * s
+
+
+class _EncoderED(nn.Module):
+ """Downsampling half of the encoder/decoder wrapper around the patch-token pathway.
+
+ Holds ``num_stages`` stages. Each stage runs ``depth_per_stage`` ``_TransformerBlock``s
+ at the current grid size and then shrinks the grid with a ``_PatchMerging`` module.
+ """
+
+ def __init__(
+ self,
+ hidden_size: int,
+ num_stages: int,
+ depth_per_stage: int = 1,
+ num_heads: int = 8,
+ window_size: int = 2,
+ mlp_ratio: float = 4.0,
+ drop: float = 0.0,
+ use_attn_token_shuffle: bool = False,
+ rope_mode: str = "original",
+ rope_ref_grid_h: int = 32,
+ rope_ref_grid_w: int = 32,
+ ):
+ super().__init__()
+ self.hidden_size = int(hidden_size)
+ self.num_heads = int(num_heads)
+ self.num_stages = int(num_stages)
+ self.window_size = int(window_size)
+ self.use_attn_token_shuffle = bool(use_attn_token_shuffle)
+ self.rope_mode = rope_mode
+ self.rope_ref_grid_h = rope_ref_grid_h
+ self.rope_ref_grid_w = rope_ref_grid_w
+ # Plain dict keyed by (height, width); it is not a registered buffer.
+ self._pos_cache = {}
+ stages = []
+ for i_stage in range(self.num_stages):
+ # With token shuffle, the attention window halves at each later stage: 2**num_stages, ..., 2.
+ ts_ws = 2 ** (self.num_stages - i_stage) if self.use_attn_token_shuffle else 1
+ blocks = nn.ModuleList(
+ [
+ _TransformerBlock(
+ hidden_size,
+ num_heads,
+ mlp_ratio,
+ drop,
+ use_token_compression=self.use_attn_token_shuffle,
+ token_shuffle_window_size=ts_ws,
+ rope_mode=rope_mode,
+ rope_ref_grid_h=rope_ref_grid_h,
+ rope_ref_grid_w=rope_ref_grid_w,
+ )
+ for _ in range(int(depth_per_stage))
+ ]
+ )
+ compress = _PatchMerging(hidden_size, window_size=self.window_size)
+ stages.append(nn.ModuleDict({"blocks": blocks, "compress": compress}))
+ self.stages = nn.ModuleList(stages)
+
+ def _fetch_pos(self, height: int, width: int, device: torch.device):
+ """Return the position-frequency table for a ``height x width`` grid, computing it once per size.
+
+ The table is stored after being moved to the device of the first request; later
+ hits are moved to the requested device with ``.to(device)``.
+ """
+ key = (height, width)
+ if key in self._pos_cache:
+ return self._pos_cache[key].to(device)
+ head_dim = self.hidden_size // self.num_heads
+ if self.rope_mode == "ntk_aware":
+ pos = precompute_freqs_cis_2d_ntk(head_dim, height, width, self.rope_ref_grid_h, self.rope_ref_grid_w).to(
+ device
+ )
+ else:
+ pos = precompute_freqs_cis_2d(head_dim, height, width).to(device)
+ self._pos_cache[key] = pos
+ return pos
+
+ def forward(self, x: torch.Tensor, height: int, width: int, c: torch.Tensor):
+ """Run all stages and collect one skip tensor per stage.
+
+ Args:
+ x: Tokens on a ``height x width`` grid.
+ height: Initial token-grid height.
+ width: Initial token-grid width.
+ c: Conditioning passed to every transformer block.
+
+ Returns:
+ Tuple ``(x, skip_tokens, H, W)``: the tokens after the last merge, the list of
+ per-stage outputs taken before each merge (in stage order), and the final grid size.
+ """
+ H, W = height, width
+ skip_tokens = []
+ for stage in self.stages:
+ for blk in stage["blocks"]:
+ pos = self._fetch_pos(H, W, x.device)
+ # The grid size is only handed to the block when token shuffle is on.
+ x = blk(x, c, pos, None, H, W) if self.use_attn_token_shuffle else blk(x, c, pos, None)
+ # Save the stage output at its pre-merge resolution for the decoder.
+ skip_tokens.append(x)
+ x, H, W = stage["compress"](x, H, W)
+ return x, skip_tokens, H, W
+
+
+class _DecoderED(nn.Module):
+ """Upsampling half of the encoder/decoder wrapper around the patch-token pathway.
+
+ Holds ``num_stages`` stages. Each stage runs ``depth_per_stage`` ``_TransformerBlock``s
+ at the current grid size, enlarges the grid with a ``_PatchExpanding`` module, and then
+ adds the matching encoder skip tensor when its token count fits.
+ """
+
+ def __init__(
+ self,
+ hidden_size: int,
+ num_stages: int,
+ depth_per_stage: int = 1,
+ num_heads: int = 8,
+ window_size: int = 2,
+ mlp_ratio: float = 4.0,
+ drop: float = 0.0,
+ use_attn_token_shuffle: bool = False,
+ rope_mode: str = "original",
+ rope_ref_grid_h: int = 32,
+ rope_ref_grid_w: int = 32,
+ ):
+ super().__init__()
+ self.hidden_size = int(hidden_size)
+ self.num_heads = int(num_heads)
+ self.num_stages = int(num_stages)
+ self.window_size = int(window_size)
+ self.use_attn_token_shuffle = bool(use_attn_token_shuffle)
+ self.rope_mode = rope_mode
+ self.rope_ref_grid_h = rope_ref_grid_h
+ self.rope_ref_grid_w = rope_ref_grid_w
+ # Plain dict keyed by (height, width); it is not a registered buffer.
+ self._pos_cache = {}
+ stages = []
+ for i_stage in range(self.num_stages):
+ # With token shuffle, the attention window doubles at each later stage: 1, 2, 4, ...
+ ts_ws = 2**i_stage if self.use_attn_token_shuffle else 1
+ blocks = nn.ModuleList(
+ [
+ _TransformerBlock(
+ hidden_size,
+ num_heads,
+ mlp_ratio,
+ drop,
+ use_token_compression=self.use_attn_token_shuffle,
+ token_shuffle_window_size=ts_ws,
+ rope_mode=rope_mode,
+ rope_ref_grid_h=rope_ref_grid_h,
+ rope_ref_grid_w=rope_ref_grid_w,
+ )
+ for _ in range(int(depth_per_stage))
+ ]
+ )
+ expand = _PatchExpanding(hidden_size, window_size=self.window_size)
+ stages.append(nn.ModuleDict({"blocks": blocks, "expand": expand}))
+ self.stages = nn.ModuleList(stages)
+
+ def _fetch_pos(self, height: int, width: int, device: torch.device):
+ """Return the position-frequency table for a ``height x width`` grid, computing it once per size.
+
+ The table is stored after being moved to the device of the first request; later
+ hits are moved to the requested device with ``.to(device)``.
+ """
+ key = (height, width)
+ if key in self._pos_cache:
+ return self._pos_cache[key].to(device)
+ head_dim = self.hidden_size // self.num_heads
+ if self.rope_mode == "ntk_aware":
+ pos = precompute_freqs_cis_2d_ntk(head_dim, height, width, self.rope_ref_grid_h, self.rope_ref_grid_w).to(
+ device
+ )
+ else:
+ pos = precompute_freqs_cis_2d(head_dim, height, width).to(device)
+ self._pos_cache[key] = pos
+ return pos
+
+ def forward(self, x: torch.Tensor, bottleneck_h: int, bottleneck_w: int, skip_tokens, c: torch.Tensor):
+ """Run all stages, expanding the grid and merging encoder skips.
+
+ Args:
+ x: Tokens on the ``bottleneck_h x bottleneck_w`` grid.
+ bottleneck_h: Token-grid height at the bottleneck.
+ bottleneck_w: Token-grid width at the bottleneck.
+ skip_tokens: Sequence of encoder stage outputs, indexed in encoder stage order.
+ c: Conditioning passed to every transformer block.
+
+ Returns:
+ Tuple ``(x, H, W)``: the tokens after the last expansion and the final grid size.
+ """
+ H, W = bottleneck_h, bottleneck_w
+ for i, stage in enumerate(self.stages):
+ for blk in stage["blocks"]:
+ pos = self._fetch_pos(H, W, x.device)
+ # The grid size is only handed to the block when token shuffle is on.
+ x = blk(x, c, pos, None, H, W) if self.use_attn_token_shuffle else blk(x, c, pos, None)
+ x, H, W = stage["expand"](x, H, W)
+ # Decoder stage i is paired with encoder stages in reverse order (last encoder stage first).
+ skip_idx = len(self.stages) - 1 - i
+ if 0 <= skip_idx < len(skip_tokens):
+ skip = skip_tokens[skip_idx]
+ expected_tokens = H * W
+ # A skip whose token count does not match is dropped silently rather than raising.
+ if skip.shape[1] == expected_tokens:
+ x = x + skip
+ return x, H, W
+
+
+# =============================================================================
+# Main T2I network: PixDiT_T2I
+# =============================================================================
+
+
+class PixDiT_T2I(nn.Module):
+ """Text-to-image network with a patch-token pathway that conditions a per-pixel pathway.
+
+ The patch pathway embeds non-overlapping ``patch_size`` patches, runs them together with
+ projected text tokens through ``patch_depth`` ``MMDiTBlockT2I`` blocks, and produces one
+ condition vector per patch. The pixel pathway (``pixel_depth`` ``PiTBlock`` blocks) consumes
+ that condition and is folded back into an image. When ``enable_ed`` is set and the compress
+ ratio yields at least one stage, the patch blocks run between ``_EncoderED`` and ``_DecoderED``.
+ """
+
+ def __init__(
+ self,
+ in_channels=3,
+ num_groups=16,
+ hidden_size=1152,
+ pixel_hidden_size=64,
+ pixel_attn_hidden_size=None,
+ pixel_num_groups=None,
+ patch_depth=26,
+ pixel_depth=2,
+ num_text_blocks=4,
+ patch_size=16,
+ txt_embed_dim=4096,
+ txt_max_length=1024,
+ use_text_rope: bool = True,
+ text_rope_theta: float = 10000.0,
+ # NTK-aware RoPE: set rope_mode="ntk_aware" and provide the reference
+ # pixel resolution used during training. When the actual grid size
+ # differs from ref, the base theta is scaled per-axis.
+ rope_mode: str = "original", # "original" | "ntk_aware"
+ rope_ref_h: int = 1024,
+ rope_ref_w: int = 1024,
+ repa_encoder_index: int = -1,
+ enable_ed: bool = False,
+ ed_compress_ratio: int = 1,
+ ed_depth_per_stage: int = 1,
+ ed_window_size: int = 2,
+ ed_num_heads: Optional[int] = None,
+ ed_hidden_size: Optional[int] = None,
+ ed_use_token_shuffle: bool = True,
+ ):
+ """Build embedders, patch blocks, pixel blocks and the optional encoder/decoder.
+
+ ``num_groups`` is used as the head count of the patch pathway (``head_dim`` is
+ ``hidden_size // num_groups``). ``pixel_attn_hidden_size``, ``pixel_num_groups``,
+ ``ed_num_heads`` and ``ed_hidden_size`` fall back to the patch-pathway values when None.
+
+ Raises:
+ ValueError: If ``pixel_depth <= 0``, or if the encoder/decoder is used and
+ ``ed_hidden_size`` is not divisible by ``ed_num_heads``.
+ """
+ super().__init__()
+ self.in_channels = int(in_channels)
+ self.out_channels = int(in_channels)
+ self.hidden_size = int(hidden_size)
+ self.num_groups = int(num_groups)
+ self.patch_depth = int(patch_depth)
+ self.pixel_depth = int(pixel_depth)
+ self.num_text_blocks = int(num_text_blocks)
+ self.patch_size = int(patch_size)
+ self.pixel_hidden_size = int(pixel_hidden_size)
+ self.txt_embed_dim = int(txt_embed_dim)
+ self.txt_max_length = int(txt_max_length)
+ self.use_text_rope = bool(use_text_rope)
+ self.text_rope_theta = float(text_rope_theta)
+ self.rope_mode = rope_mode
+ # The reference resolution is given in pixels; convert it to patch-grid units.
+ self.rope_ref_grid_h = rope_ref_h // self.patch_size
+ self.rope_ref_grid_w = rope_ref_w // self.patch_size
+ self.repa_encoder_index = int(repa_encoder_index)
+ if self.pixel_depth <= 0:
+ raise ValueError("PixDiT_T2I expects pixel_depth > 0 to retain the pixel pathway")
+
+ # Embedders
+ self.pixel_embedder = PixelTokenEmbedder(in_channels, self.pixel_hidden_size)
+ self.s_embedder = PatchTokenEmbedder(in_channels * patch_size**2, hidden_size, bias=True)
+ self.t_embedder = TimestepConditioner(hidden_size)
+ self.y_embedder = PatchTokenEmbedder(self.txt_embed_dim, hidden_size, bias=True, norm_layer=RMSNorm)
+ # Learned positional embedding for text tokens; sliced to the actual text length in forward.
+ self.y_pos_embedding = nn.Parameter(torch.randn(1, self.txt_max_length, hidden_size))
+
+ # Blocks
+ # Shared AdaLN modulator for conditional blocks (optional)
+ # All three are left as None here, so each patch block is constructed with None modulators.
+ self._shared_cond_adaln = None
+ self._shared_cond_adaln_img = None
+ self._shared_cond_adaln_txt = None
+ self.patch_blocks = nn.ModuleList(
+ [
+ MMDiTBlockT2I(
+ self.hidden_size,
+ self.num_groups,
+ adaLN_modulation_img=self._shared_cond_adaln_img,
+ adaLN_modulation_txt=self._shared_cond_adaln_txt,
+ )
+ for _ in range(self.patch_depth)
+ ]
+ )
+ # Remove AdaLN-based text refinement; PixDiT keeps cross-attn-only text handling
+ self.text_refine_blocks = None
+ self.pixel_attn_hidden_size = (
+ int(pixel_attn_hidden_size) if pixel_attn_hidden_size is not None else self.hidden_size
+ )
+ self.pixel_num_groups = int(pixel_num_groups) if pixel_num_groups is not None else self.num_groups
+ self.pixel_blocks = nn.ModuleList(
+ [
+ PiTBlock(
+ self.pixel_hidden_size,
+ self.hidden_size,
+ patch_size=self.patch_size,
+ num_heads=self.num_groups,
+ mlp_ratio=4.0,
+ attn_hidden_size=self.pixel_attn_hidden_size,
+ attn_num_heads=self.pixel_num_groups,
+ rope_mode=self.rope_mode,
+ rope_ref_grid_h=self.rope_ref_grid_h,
+ rope_ref_grid_w=self.rope_ref_grid_w,
+ )
+ for _ in range(self.pixel_depth)
+ ]
+ )
+
+ self.final_layer = FinalLayer(self.pixel_hidden_size, self.out_channels)
+
+ # Plain-dict caches (not registered buffers): image tables keyed by (height, width), text by length.
+ self.precompute_pos = {}
+ self.precompute_pos_txt = {} # cache for 1D text RoPE
+ # Written by forward(); holds the most recently tapped patch-pathway tokens.
+ self.last_repa_tokens = None
+
+ self.enable_ed = bool(enable_ed)
+ self.ed_compress_ratio = int(ed_compress_ratio)
+ self.ed_depth_per_stage = int(ed_depth_per_stage)
+ self.ed_window_size = int(ed_window_size)
+ self.ed_num_heads = int(ed_num_heads) if ed_num_heads is not None else self.num_groups
+ self.ed_hidden_size = int(ed_hidden_size) if ed_hidden_size is not None else self.hidden_size
+ self.ed_use_token_shuffle = bool(ed_use_token_shuffle)
+ # The encoder/decoder modules stay None unless use_ed turns out True below.
+ self.encoder_ed: Optional[_EncoderED] = None
+ self.decoder_ed: Optional[_DecoderED] = None
+ self.s_ed_proj_in: Optional[nn.Module] = None
+ self.s_ed_proj_out: Optional[nn.Module] = None
+ self.s_ed_cond_proj: Optional[nn.Module] = None
+ self.s_ed_in_norm: Optional[RMSNorm] = None
+ self.s_ed_out_norm: Optional[RMSNorm] = None
+ num_stages = _compute_num_stages_from_ratio(self.ed_compress_ratio) if self.enable_ed else 0
+ # enable_ed alone is not enough: a ratio that maps to zero stages disables the ED path.
+ self.use_ed = self.enable_ed and num_stages > 0
+ if self.use_ed:
+ if self.ed_hidden_size % self.ed_num_heads != 0:
+ raise ValueError(
+ f"ed_hidden_size {self.ed_hidden_size} must be divisible by ed_num_heads {self.ed_num_heads}"
+ )
+ # The three projections are identities when the ED width equals the main width.
+ self.s_ed_proj_in = (
+ nn.Identity()
+ if self.ed_hidden_size == self.hidden_size
+ else nn.Linear(self.hidden_size, self.ed_hidden_size, bias=True)
+ )
+ self.s_ed_proj_out = (
+ nn.Identity()
+ if self.ed_hidden_size == self.hidden_size
+ else nn.Linear(self.ed_hidden_size, self.hidden_size, bias=True)
+ )
+ self.s_ed_cond_proj = (
+ nn.Identity()
+ if self.ed_hidden_size == self.hidden_size
+ else nn.Linear(self.hidden_size, self.ed_hidden_size, bias=True)
+ )
+ self.s_ed_in_norm = RMSNorm(self.ed_hidden_size, eps=1e-6)
+ self.s_ed_out_norm = RMSNorm(self.hidden_size, eps=1e-6)
+ self.encoder_ed = _EncoderED(
+ hidden_size=self.ed_hidden_size,
+ num_stages=num_stages,
+ depth_per_stage=self.ed_depth_per_stage,
+ num_heads=self.ed_num_heads,
+ window_size=self.ed_window_size,
+ use_attn_token_shuffle=self.ed_use_token_shuffle,
+ rope_mode=self.rope_mode,
+ rope_ref_grid_h=self.rope_ref_grid_h,
+ rope_ref_grid_w=self.rope_ref_grid_w,
+ )
+ self.decoder_ed = _DecoderED(
+ hidden_size=self.ed_hidden_size,
+ num_stages=num_stages,
+ depth_per_stage=self.ed_depth_per_stage,
+ num_heads=self.ed_num_heads,
+ window_size=self.ed_window_size,
+ use_attn_token_shuffle=self.ed_use_token_shuffle,
+ rope_mode=self.rope_mode,
+ rope_ref_grid_h=self.rope_ref_grid_h,
+ rope_ref_grid_w=self.rope_ref_grid_w,
+ )
+
+ self.initialize_weights()
+
+ # Context-parallel state — set by `enable_context_parallel`. The base
+ # class does not split tokens itself; subclasses (e.g. PidNet)
+ # are responsible for splitting along L in `forward` and gathering
+ # before the final fold. This attribute is propagated to every patch
+ # block (joint MMDiT attention) and pixel block (RotaryAttention).
+ self._cp_group: Optional[ProcessGroup] = None
+ self._is_context_parallel_enabled: bool = False
+
+ @property
+ def is_context_parallel_enabled(self) -> bool:
+ """Whether ``enable_context_parallel`` has been called without a later disable."""
+ return self._is_context_parallel_enabled
+
+ def enable_context_parallel(self, cp_group: ProcessGroup):
+ """Register ``cp_group`` on every patch and pixel block and mark CP as enabled.
+
+ Raises:
+ NotImplementedError: If the network was built with the encoder/decoder path active.
+ """
+ # CP for the ED (encoder-decoder) path is not implemented; refuse to
+ # enable CP if the network was built with use_ed=True so we don't
+ # silently produce wrong results.
+ if self.use_ed:
+ raise NotImplementedError(
+ "PixDiT_T2I context parallel is not implemented for the encoder-decoder path. "
+ "Build with enable_ed=False to use CP."
+ )
+ for block in self.patch_blocks:
+ block.set_context_parallel_group(cp_group)
+ for block in self.pixel_blocks:
+ block.set_context_parallel_group(cp_group)
+ self._cp_group = cp_group
+ self._is_context_parallel_enabled = True
+
+ def disable_context_parallel(self):
+ """Clear the context-parallel group on every patch and pixel block and mark CP as disabled."""
+ for block in self.patch_blocks:
+ block.set_context_parallel_group(None)
+ for block in self.pixel_blocks:
+ block.set_context_parallel_group(None)
+ self._cp_group = None
+ self._is_context_parallel_enabled = False
+
+ def fetch_pos(self, height, width, device):
+ """Return the 2D position-frequency table for a ``height x width`` patch grid.
+
+ Tables are computed once per grid size and kept in ``self.precompute_pos``; the stored
+ tensor lives on the device of the first request and hits are moved with ``.to(device)``.
+ """
+ if (height, width) in self.precompute_pos:
+ return self.precompute_pos[(height, width)].to(device)
+ head_dim = self.hidden_size // self.num_groups
+ if self.rope_mode == "ntk_aware":
+ pos = precompute_freqs_cis_2d_ntk(head_dim, height, width, self.rope_ref_grid_h, self.rope_ref_grid_w).to(
+ device
+ )
+ else:
+ pos = precompute_freqs_cis_2d(head_dim, height, width).to(device)
+ self.precompute_pos[(height, width)] = pos
+ return pos
+
+ def fetch_pos_text(self, length, device):
+ """Return the 1D rotary table of shape [length, head_dim // 2] for the text stream.
+
+ Tables are computed once per length and kept in ``self.precompute_pos_txt``.
+ """
+ if length in self.precompute_pos_txt:
+ return self.precompute_pos_txt[length].to(device)
+ # Build 1D RoPE freqs for text stream using the same per-head dim as image
+ head_dim = self.hidden_size // self.num_groups
+ # Create frequencies for complex rotation: [length, head_dim//2]
+ freqs = 1.0 / (self.text_rope_theta ** (torch.arange(0, head_dim, 2, device=device).float() / head_dim))
+ positions = torch.arange(0, length, device=device).float().unsqueeze(1) # [length,1]
+ angles = positions * freqs.unsqueeze(0) # [length, head_dim//2]
+ freqs_cis = torch.polar(torch.ones_like(angles), angles) # complex64, since angles are float32
+ self.precompute_pos_txt[length] = freqs_cis
+ return freqs_cis
+
+ def initialize_weights(self):
+ """Apply the custom initialisation for the patch embedder, timestep MLP and final layer."""
+ # Initialize s_embedder like nn.Linear
+ w = self.s_embedder.proj.weight.data
+ # The init runs in place on a 2D view, so it writes through to the original weight.
+ nn.init.xavier_uniform_(w.view([w.shape[0], -1]))
+ nn.init.constant_(self.s_embedder.proj.bias, 0)
+
+ # Initialize timestep embedding MLP
+ nn.init.normal_(self.t_embedder.mlp[0].weight, std=0.02)
+ nn.init.normal_(self.t_embedder.mlp[2].weight, std=0.02)
+
+ # zero init final layer
+ nn.init.zeros_(self.final_layer.linear.weight)
+ nn.init.zeros_(self.final_layer.linear.bias)
+
+ def forward(self, x, t, y, s=None, mask=None):
+ """Predict an image-shaped output from a noisy image, timestep and text embedding.
+
+ Args:
+ x: Image tensor of shape [B, C, H, W]. NOTE(review): H and W are floor-divided by
+ ``patch_size``; presumably callers always pass multiples of it -- confirm.
+ t: Timesteps; flattened with ``view(-1)`` before embedding.
+ y: Text embedding of shape [B, L, D]; truncated to ``txt_max_length`` tokens.
+ s: Optional precomputed per-patch condition. The patch pathway below is only run
+ when this is None.
+ mask: Optional text mask. After singleton dims are squeezed, entries equal to 0 in a
+ 2D mask are flagged as padding. The original ``mask`` is also passed to every pixel block.
+
+ Returns:
+ Tensor folded back to spatial size (H, W) with ``out_channels`` channels.
+
+ Raises:
+ ValueError: If ``y`` is not 3-dimensional.
+ """
+ B, _, H, W = x.shape
+ # Derive grid token count deterministically from spatial size
+ Hs = H // self.patch_size
+ Ws = W // self.patch_size
+ L = Hs * Ws
+
+ # Patch tokens for condition pathway
+ pos = self.fetch_pos(Hs, Ws, x.device)
+ # unfold yields [B, C * patch_size**2, L]; transpose to one row per patch.
+ x_patches = torch.nn.functional.unfold(x, kernel_size=self.patch_size, stride=self.patch_size).transpose(1, 2)
+
+ t_emb = self.t_embedder(t.view(-1)).view(B, -1, self.hidden_size)
+
+ # Text tokens -> project to hidden_size and add learned pos
+ if y.dim() != 3:
+ raise ValueError("Text embedding y must be [B, L, D]")
+ Ltxt = min(y.shape[1], self.txt_max_length)
+ y = y[:, :Ltxt, :]
+ y_emb = self.y_embedder(y).view(B, Ltxt, self.hidden_size)
+ y_emb = y_emb + self.y_pos_embedding[:, :Ltxt, :].to(y_emb.dtype)
+
+ # PixDiT design: no AdaLN modulation applied on text stream
+ condition = torch.nn.functional.silu(t_emb)
+
+ # Condition blocks on patch tokens with MM-DiT joint attention to text tokens
+ pad = None
+ pos_txt = self.fetch_pos_text(Ltxt, x.device) if self.use_text_rope else None
+ if mask is not None and isinstance(mask, torch.Tensor):
+ m = mask
+ # Drop singleton dims at position 1 until the mask is 2D (or dim 1 is non-singleton).
+ while m.dim() > 2 and m.size(1) == 1:
+ m = m.squeeze(1)
+ # NOTE(review): the loop above only exits with dim() > 2 when size(1) != 1, so this
+ # branch looks unreachable -- confirm before removing.
+ if m.dim() == 3 and m.size(1) == 1:
+ m = m.squeeze(1)
+ if m.dim() == 2:
+ # True where the mask is 0.
+ pad = m == 0
+
+ if s is None:
+ s0 = self.s_embedder(x_patches)
+ self.last_repa_tokens = None
+ if self.use_ed and self.encoder_ed is not None and self.decoder_ed is not None:
+ H_tokens, W_tokens = Hs, Ws
+ s_ed = s0 if self.s_ed_proj_in is None else self.s_ed_proj_in(s0)
+ if self.s_ed_in_norm is not None:
+ s_ed = self.s_ed_in_norm(s_ed)
+ c_ed = condition if self.s_ed_cond_proj is None else self.s_ed_cond_proj(condition)
+ # Encoder shrinks the grid to (Hb, Wb) and keeps per-stage skips for the decoder.
+ bottleneck, skip_tokens, Hb, Wb = self.encoder_ed(s_ed, H_tokens, W_tokens, c_ed)
+ # The patch blocks run on the bottleneck grid, so they need that grid's position table.
+ pos_b = self.fetch_pos(Hb, Wb, x.device)
+ s_main = bottleneck if self.s_ed_proj_out is None else self.s_ed_proj_out(bottleneck)
+ if self.s_ed_out_norm is not None:
+ s_main = self.s_ed_out_norm(s_main)
+ s_main = torch.nn.functional.silu(t_emb + s_main)
+
+ attn_mask_joint = None
+ if pad is not None:
+ L_img_curr = s_main.shape[1]
+ # Image tokens are never flagged as padding.
+ pad_img = torch.zeros((B, L_img_curr), dtype=torch.bool, device=x.device)
+ # Fit the text flags to Ltxt: crop if longer, extend with True if shorter.
+ pad_txt = (
+ pad[:, :Ltxt]
+ if pad.size(1) >= Ltxt
+ else torch.nn.functional.pad(pad, (0, Ltxt - pad.size(1)), value=True)
+ )
+ # Joint layout is text first, then image tokens.
+ attn_mask_joint = torch.cat([pad_txt, pad_img], dim=1).view(B, 1, 1, Ltxt + L_img_curr)
+
+ for i in range(self.patch_depth):
+ s_main, y_emb = self.patch_blocks[i](s_main, y_emb, condition, pos_b, pos_txt, attn_mask_joint)
+ # Chained comparison: tap only when the index is positive and equals this block's 1-based position.
+ if 0 < self.repa_encoder_index == (i + 1):
+ self.last_repa_tokens = s_main
+ # The same input projection and norm used before the encoder are reused before the decoder.
+ s_bottleneck2 = s_main if self.s_ed_proj_in is None else self.s_ed_proj_in(s_main)
+ if self.s_ed_in_norm is not None:
+ s_bottleneck2 = self.s_ed_in_norm(s_bottleneck2)
+ decoded, _, _ = self.decoder_ed(s_bottleneck2, Hb, Wb, skip_tokens, c_ed)
+ s = decoded if self.s_ed_proj_out is None else self.s_ed_proj_out(decoded)
+ if self.s_ed_out_norm is not None:
+ s = self.s_ed_out_norm(s)
+ s = torch.nn.functional.silu(t_emb + s)
+ else:
+ s_main = s0
+ attn_mask_joint = None
+ if pad is not None:
+ L_img_curr = s_main.shape[1]
+ # Image tokens are never flagged as padding.
+ pad_img = torch.zeros((B, L_img_curr), dtype=torch.bool, device=x.device)
+ # Fit the text flags to Ltxt: crop if longer, extend with True if shorter.
+ pad_txt = (
+ pad[:, :Ltxt]
+ if pad.size(1) >= Ltxt
+ else torch.nn.functional.pad(pad, (0, Ltxt - pad.size(1)), value=True)
+ )
+ # Joint layout is text first, then image tokens.
+ attn_mask_joint = torch.cat([pad_txt, pad_img], dim=1).view(B, 1, 1, Ltxt + L_img_curr)
+
+ for i in range(self.patch_depth):
+ s_main, y_emb = self.patch_blocks[i](s_main, y_emb, condition, pos, pos_txt, attn_mask_joint)
+ # Chained comparison: tap only when the index is positive and equals this block's 1-based position.
+ if 0 < self.repa_encoder_index == (i + 1):
+ self.last_repa_tokens = s_main
+ s = torch.nn.functional.silu(t_emb + s_main)
+ # If no valid tap index is specified, expose last conditional output
+ if not (0 < self.repa_encoder_index <= self.patch_depth):
+ self.last_repa_tokens = s
+
+ # Ensure the patch token length matches the spatial grid L
+ # NOTE(review): `batch_size` is never read, and `length` is not read again after being reset below.
+ batch_size, length, _ = s.shape
+ if length != L:
+ if length > L:
+ s = s[:, :L, :]
+ else:
+ # Too few tokens: append zero tokens up to L.
+ pad_len = L - length
+ s = torch.cat([s, s.new_zeros(B, pad_len, s.shape[2])], dim=1)
+ length = L
+
+ # Pixel pathway
+ # One condition row per patch, flattened over the batch.
+ s_cond = s.view(B * L, self.hidden_size)
+ x_pixels = self.pixel_embedder(x, img_height=H, img_width=W, patch_size=self.patch_size)
+ for blk in self.pixel_blocks:
+ x_pixels = blk(x_pixels, s_cond, H, W, self.patch_size, mask)
+
+ # Project back to image and fold
+ x_pixels = self.final_layer(x_pixels) # [B*L, P2, C]
+ C_out = self.out_channels
+ P2 = self.patch_size * self.patch_size
+ # Reorder to [B, C, P2, L] so the flattened (C, P2) axis matches the layout fold() expects.
+ x_pixels = x_pixels.view(B, L, P2, C_out).permute(0, 3, 2, 1).contiguous()
+ x_pixels = x_pixels.view(B, C_out * P2, L)
+ x_img = torch.nn.functional.fold(x_pixels, (H, W), kernel_size=self.patch_size, stride=self.patch_size)
+ return x_img
diff --git a/invokeai/backend/pid/_src/utils/__init__.py b/invokeai/backend/pid/_src/utils/__init__.py
new file mode 100644
index 00000000000..e69de29bb2d
diff --git a/invokeai/backend/pid/_src/utils/context_parallel.py b/invokeai/backend/pid/_src/utils/context_parallel.py
new file mode 100644
index 00000000000..a89c4fe4aa4
--- /dev/null
+++ b/invokeai/backend/pid/_src/utils/context_parallel.py
@@ -0,0 +1,194 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+from typing import Optional
+
+import torch
+from torch import Tensor
+from torch.distributed import ProcessGroup, all_gather, broadcast_object_list, get_process_group_ranks, get_world_size
+from torch.distributed.utils import _verify_param_shape_across_processes
+
+from invokeai.backend.pid._ext.imaginaire.utils import distributed
+
+
+def split_inputs_cp(x: Tensor, seq_dim: int, cp_group: ProcessGroup) -> Tensor:
+ """
+ Split input tensor along the sequence dimension for context parallelism.
+
+ This function divides the input tensor into equal parts along the specified
+ sequence dimension, based on the number of ranks in the context parallelism group.
+ It then selects the part corresponding to the current rank.
+
+ Args:
+ x: Input tensor to be split.
+ seq_dim: The dimension along which to split the input (sequence dimension).
+ cp_group: The process group for context parallelism.
+
+ Returns:
+ A slice of the input tensor corresponding to the current rank.
+
+ Raises:
+ AssertionError: If the sequence dimension is not divisible by the number of ranks.
+ """
+ cp_ranks = get_process_group_ranks(cp_group)
+ cp_size = len(cp_ranks)
+
+ assert x.shape[seq_dim] % cp_size == 0, f"{x.shape[seq_dim]} cannot divide cp_size {cp_size}"
+ x = x.view(*x.shape[:seq_dim], cp_size, x.shape[seq_dim] // cp_size, *x.shape[(seq_dim + 1) :])
+ seq_idx = torch.tensor([cp_group.rank()], device=x.device)
+ x = x.index_select(seq_dim, seq_idx)
+ # Note that the new sequence length is the original sequence length / cp_size
+ x = x.view(*x.shape[:seq_dim], -1, *x.shape[(seq_dim + 2) :])
+ return x
+
+
+def cat_outputs_cp(x: Tensor, seq_dim: int, cp_group: ProcessGroup) -> Tensor:
+ """
+ Concatenate outputs from different ranks in the checkpoint parallelism group.
+
+ This function gathers tensors from all ranks in the checkpoint parallelism group
+ and concatenates them along the specified sequence dimension.
+
+ Args:
+ x: Input tensor to be concatenated.
+ seq_dim: The dimension along which to concatenate the tensors (sequence dimension).
+ cp_group: The process group for checkpoint parallelism.
+
+ Returns:
+ A tensor that is the concatenation of tensors from all ranks in the cp_group.
+
+ Raises:
+ RuntimeError: If the gather operation fails.
+ """
+ # Get the world size (number of processes in the group)
+ world_size = get_world_size(cp_group)
+
+ # Create a list to store tensors from all ranks
+ gathered_tensors = [torch.zeros_like(x) for _ in range(world_size)]
+
+ # Gather tensors from all ranks
+ try:
+ all_gather(gathered_tensors, x, group=cp_group)
+ except RuntimeError as e:
+ raise RuntimeError(f"Failed to gather tensors: {e}")
+
+ # Concatenate the gathered tensors along the specified dimension
+ return torch.cat(gathered_tensors, dim=seq_dim)
+
+
+def cat_outputs_cp_with_grad(x: Tensor, seq_dim: int, cp_group: ProcessGroup) -> Tensor:
+ """
+ Concatenate outputs from different ranks in the context parallelism group.
+
+ This function gathers tensors from all ranks in the checkpoint parallelism group
+ and concatenates them along the specified sequence dimension.
+
+ It retains computational graph locally for each rank by replacing the portion of the tensor with original output.
+
+ Args:
+ x: Input tensor to be concatenated.
+ seq_dim: The dimension along which to concatenate the tensors (sequence dimension).
+ cp_group: The process group for checkpoint parallelism.
+
+ Returns:
+ A tensor that is the concatenation of tensors from all ranks in the cp_group.
+
+ Raises:
+ RuntimeError: If the gather operation fails.
+ """
+ # Get the world size (number of processes in the group)
+ cp_size = cp_group.size()
+ assert cp_size > 0, "cp_size should be greater than 0"
+
+ # Create a list to store tensors from all ranks
+ gathered_tensors = [torch.zeros_like(x) for _ in range(cp_size)]
+
+ # Gather tensors from all ranks
+ try:
+ all_gather(gathered_tensors, x, group=cp_group)
+ except RuntimeError as e:
+ raise RuntimeError(f"Failed to gather tensors: {e}")
+
+ rank = cp_group.rank()
+ gathered_tensors[rank] = x
+ # Concatenate the gathered tensors along the specified dimension
+ return torch.cat(gathered_tensors, dim=seq_dim)
+
+
+def robust_broadcast(tensor: torch.Tensor, src: int, pg: ProcessGroup, is_check_shape: bool = False) -> torch.Tensor:
+ """
+ Broadcast ``tensor`` from ``src`` to every rank in ``pg``, sending its shape first.
+
+ The shape is broadcast ahead of the data so that non-source ranks can allocate a
+ receive buffer of the right size. The shape buffer on non-source ranks is sized with
+ ``tensor.dim()``, so every rank must pass a tensor with the same number of dimensions;
+ only the sizes may differ. The receive buffer is created with ``tensor.new_empty``.
+
+ Args:
+ tensor (torch.Tensor): The tensor to broadcast (on src rank) or a placeholder to receive into (on other ranks).
+ src (int): The source rank for the broadcast; compared against ``distributed.get_rank()``.
+ pg (ProcessGroup): The process group to broadcast within.
+ is_check_shape (bool): If True, run ``_verify_param_shape_across_processes`` on the shape
+ buffer before it is broadcast.
+
+ Returns:
+ torch.Tensor: The broadcasted tensor on all ranks.
+ """
+ # First, broadcast the shape of the tensor
+ # NOTE(review): the shape buffer is always created with .cuda(), independent of tensor.device.
+ if distributed.get_rank() == src:
+ shape = torch.tensor(tensor.shape, dtype=torch.long).cuda()
+ else:
+ shape = torch.empty(tensor.dim(), dtype=torch.long).cuda()
+ if is_check_shape:
+ _verify_param_shape_across_processes(pg, [shape])
+ torch.distributed.broadcast(shape, src, group=pg)
+
+ # Resize the tensor on non-src ranks if necessary
+ if distributed.get_rank() != src:
+ tensor = tensor.new_empty(shape.tolist()).type_as(tensor)
+
+ # Now broadcast the tensor data; torch.distributed.broadcast requires contiguous tensors
+ # (e.g. tensors from expand() are non-contiguous views with stride=0)
+ tensor = tensor.contiguous()
+ torch.distributed.broadcast(tensor, src, group=pg)
+
+ return tensor
+
+
+def broadcast(
+ item: torch.Tensor | str | None, process_group: Optional[ProcessGroup] = None
+) -> torch.Tensor | str | None:
+ """
+ Broadcast the item from the minimum rank in the specified group.
+
+ Tensors are sent with ``robust_broadcast``; any other non-None item is sent with
+ ``broadcast_object_list``. If ``process_group`` is None the item is returned unchanged,
+ and a None item is returned without any collective call.
+
+ Args:
+ item: The tensor or object to broadcast, or None.
+ process_group: The group to broadcast within; None disables broadcasting.
+
+ Returns:
+ The item as held by the minimum rank of ``process_group`` (or ``item`` itself when
+ nothing was broadcast).
+ """
+ if process_group is None:
+ return item
+
+ min_rank = min(get_process_group_ranks(process_group))
+ if isinstance(item, torch.Tensor): # assume the device is cuda
+ item = robust_broadcast(item, min_rank, process_group)
+ elif item is not None:
+ # broadcast_object_list fills the list in place on receiving ranks.
+ broadcastable_list = [item]
+ broadcast_object_list(broadcastable_list, min_rank, group=process_group)
+ item = broadcastable_list[0]
+ # NOTE(review): no collective is issued for a None item, so all ranks presumably need to
+ # agree on whether item is None -- confirm with callers.
+ return item
+
+
+def broadcast_split_tensor(
+ tensor: torch.Tensor,
+ seq_dim: int,
+ process_group: Optional[ProcessGroup] = None,
+) -> torch.Tensor:
+ """
+ Broadcast the tensor from the minimum rank in the specified group(s).
+ """
+ if tensor is None:
+ return tensor
+ min_rank = min(get_process_group_ranks(process_group))
+ tensor = robust_broadcast(tensor, min_rank, process_group)
+ return split_inputs_cp(tensor, seq_dim, process_group)
diff --git a/invokeai/backend/pid/decode.py b/invokeai/backend/pid/decode.py
new file mode 100644
index 00000000000..f7946c5fa22
--- /dev/null
+++ b/invokeai/backend/pid/decode.py
@@ -0,0 +1,489 @@
+# SPDX-License-Identifier: Apache-2.0
+"""Decode pipeline for the vendored PiD (Pixel Diffusion Decoder).
+
+This module bridges between InvokeAI's model-manager-loaded PiD checkpoints
+(state dicts produced by `model_loaders/pid_decoder.py`) and the underlying
+`PidNet` super-resolution network. It deliberately reimplements the small
+sampling loop from `PidDistillModel.generate_samples_from_batch` (vendored
+in `_src/models/pid_distill_model.py`) so the wrapper stays free of the
+upstream's CUDA-only, distributed-training-flavoured init paths and can be
+driven entirely by InvokeAI's per-call device / dtype choices.
+
+Hyperparameters were extracted from PiD's `pid_sr4x` base net config and
+the per-backbone experiment overrides (NVIDIA's upstream `pid/_src/configs/`,
+not vendored here — only the values needed at inference). See
+`shared_config.py` and `experiment/{flux,flux2,sd3}.py` in the upstream
+repository for the source of truth.
+"""
+
+from __future__ import annotations
+
+from contextlib import nullcontext
+from dataclasses import dataclass, field
+from typing import Optional
+
+import torch
+from torch import Tensor
+
+from invokeai.backend.model_manager.taxonomy import BaseModelType
+from invokeai.backend.pid._src.networks.pid_net import PidNet
+
+# ---------------------------------------------------------------------------
+# Network hyperparameters per backbone
+# ---------------------------------------------------------------------------
+
+# `pid_sr4x` base config (defaults/model_pid.py upstream) plus the shared
+# `_common_model_overrides` net dict (experiment/shared_config.py upstream).
+# These are the PidNet keyword arguments shared by every backbone:
+# `build_pid_net` merges this dict with the matching `_PER_BACKBONE` entry
+# (per-backbone keys win on collision) and passes the result to `PidNet(**kwargs)`.
+# `sr_scale` is additionally read by `estimate_pid_decode_working_memory`.
+_PID_SR4X_BASE: dict = {
+ # T2I backbone (PixDiT_T2I args)
+ "in_channels": 3,
+ "num_groups": 24,
+ "hidden_size": 1536,
+ "pixel_hidden_size": 16,
+ "pixel_attn_hidden_size": 1152,
+ "pixel_num_groups": 16,
+ "patch_depth": 14,
+ "pixel_depth": 2,
+ "patch_size": 16,
+ "txt_embed_dim": 2304, # Gemma-2-2b-it hidden size
+ "txt_max_length": 300,
+ "use_text_rope": True,
+ "text_rope_theta": 10000.0,
+ "rope_mode": "ntk_aware",
+ "rope_ref_h": 1024,
+ "rope_ref_w": 1024,
+ "repa_encoder_index": -1, # REPA disabled at inference
+ # SR / LQ branch
+ "lq_inject_mode": "controlnet",
+ "lq_in_channels": 0,
+ "lq_hidden_dim": 512,
+ "lq_gate_type": "sigma_aware_per_token_per_dim",
+ "lq_interval": 2, # overridden by shared_config
+ "zero_init_lq": True,
+ "train_lq_proj_only": False,
+ "sr_scale": 4,
+ "pit_lq_inject": False,
+ "pit_lq_gate_type": "sigma_aware_per_token_per_dim",
+}
+
+# Per-backbone net deltas (mirrors upstream experiment/{name}.py).
+# Membership in this dict is what defines a "supported" backbone in this
+# module: `build_pid_net` and `PiDDecoder.__init__` raise ValueError for any
+# other key, and `estimate_pid_decode_working_memory` returns 0 for it.
+# `latent_spatial_down_factor` is also used to size the decoded image
+# (latent side * sr_scale * latent_spatial_down_factor).
+_PER_BACKBONE: dict[BaseModelType, dict] = {
+ BaseModelType.Flux: {
+ "lq_latent_channels": 16,
+ "latent_spatial_down_factor": 8,
+ },
+ BaseModelType.Flux2: {
+ "lq_latent_channels": 128,
+ "latent_spatial_down_factor": 16,
+ },
+ BaseModelType.StableDiffusion3: {
+ "lq_latent_channels": 16,
+ "latent_spatial_down_factor": 8,
+ },
+ BaseModelType.StableDiffusionXL: {
+ "lq_latent_channels": 4,
+ "latent_spatial_down_factor": 8,
+ },
+ BaseModelType.QwenImage: {
+ "lq_latent_channels": 16,
+ "latent_spatial_down_factor": 8,
+ },
+}
+
+# Distilled-student schedule (`student_t_list` from shared_config).
+# `_get_t_list` turns this into the sampler's schedule and asserts that the
+# final entry is 0; consecutive entries form the (t_cur, t_next) pairs that
+# `_student_sample_loop` steps through, so 5 entries give 4 steps.
+_STUDENT_T_LIST: list[float] = [0.999, 0.866, 0.634, 0.342, 0.0]
+
+# Flow-matching timescale that maps the [0,1] schedule to the network's
+# expected timestep range. `_student_sample_loop` multiplies the current t by
+# this value before handing it to the network.
+_FM_TIMESCALE: float = 1000.0
+
+# Caption pre-processing constants from PiD's `shared_config.py`. The model
+# was trained with these strings prepended; using anything else degrades
+# quality. See `_encode_text_raw` in the upstream pixeldit_model.py.
+# Within this module, PID_CHI_PROMPT and PID_MODEL_MAX_LENGTH are the default
+# `chi_prompt` / `model_max_length` arguments of `encode_caption_for_pid`.
+# PID_NEGATIVE_PROMPT is exported via `__all__` but not referenced elsewhere
+# in this module.
+PID_CHI_PROMPT: str = "\n".join(
+ [
+ 'Given a user prompt, generate an "Enhanced prompt" that provides detailed visual descriptions suitable for image generation. Evaluate the level of detail in the user prompt:',
+ "- If the prompt is simple, focus on adding specifics about colors, shapes, sizes, textures, and spatial relationships to create vivid and concrete scenes.",
+ "- If the prompt is already detailed, refine and enhance the existing details slightly without overcomplicating.",
+ "Here are examples of how to transform or refine prompts:",
+ "- User Prompt: A cat sleeping -> Enhanced: A small, fluffy white cat curled up in a round shape, sleeping peacefully on a warm sunny windowsill, surrounded by pots of blooming red flowers.",
+ "- User Prompt: A busy city street -> Enhanced: A bustling city street scene at dusk, featuring glowing street lamps, a diverse crowd of people in colorful clothing, and a double-decker bus passing by towering glass skyscrapers.",
+ "Please generate only the enhanced description for the prompt below and avoid including any additional commentary or evaluations:",
+ "User Prompt: ",
+ ]
+)
+PID_NEGATIVE_PROMPT: str = (
+ "low quality, worst quality, over-saturated, three legs, six fingers, cartoon, anime, "
+ "cgi, low res, blurry, deformed, distortion, duplicated limbs, plastic skin, jpeg artifacts, "
+ "watermark"
+)
+# NOTE(review): same value as `_PID_SR4X_BASE["txt_max_length"]`; presumably the two must stay in sync - confirm.
+PID_MODEL_MAX_LENGTH: int = 300
+
+
+# Working-memory (activation) estimate for the PiD decode, mirroring `estimate_vae_working_memory_*` (see #8414).
+# PiD runs a multi-step pixel-diffusion in float32 at the full super-resolved output resolution, so its peak
+# activation memory scales with the OUTPUT pixel count.
+#
+# This is ONLY the activation headroom reserved for the decode itself - it does NOT do the heavy lifting of
+# evicting the main transformer/encoders (the nodes call context.models.offload_all_from_vram() for that before
+# loading PidNet). It must therefore stay modest: the cache uses max(this_estimate, device_working_mem_gb=3GB),
+# and an over-large value pushes the working set negative and forces PidNet to partial-load onto the CPU (slow).
+# ~4GB at a 2048px output is a small headroom above the 3GB default. Experimentally-tunable; calibrate to peak.
+#
+# The estimate is computed as: out_h * out_w * 4 bytes * this constant
+# (e.g. 2048 * 2048 * 4 * 250 = 4,194,304,000 bytes).
+_PID_DECODE_WORKING_MEMORY_SCALING_CONSTANT = 250
+
+
+def estimate_pid_decode_working_memory(latent: Tensor, backbone: BaseModelType) -> int:
+    """Return the activation-memory estimate, in bytes, for decoding *latent* with PiD.
+
+    Each output side is the latent side multiplied by
+    ``sr_scale * latent_spatial_down_factor``. The estimate is the output pixel
+    count times a 4-byte element size (PidNet runs in float32, see
+    ``model_loaders/pid_decoder.py``) times the module's scaling constant.
+
+    A backbone without an entry in ``_PER_BACKBONE`` yields 0, which lets the
+    caller fall back to the cache's default working-memory reservation.
+    """
+    if backbone not in _PER_BACKBONE:
+        return 0
+
+    down_factor = int(_PER_BACKBONE[backbone]["latent_spatial_down_factor"])
+    upscale = int(_PID_SR4X_BASE["sr_scale"]) * down_factor
+
+    latent_h = int(latent.shape[-2])
+    latent_w = int(latent.shape[-1])
+    output_pixels = (latent_h * upscale) * (latent_w * upscale)
+
+    bytes_per_element = 4  # float32
+    return int(output_pixels * bytes_per_element * _PID_DECODE_WORKING_MEMORY_SCALING_CONSTANT)
+
+
+def build_pid_net(backbone: BaseModelType) -> PidNet:
+    """Construct a PidNet with the architecture that matches *backbone*.
+
+    The constructor arguments are the shared ``_PID_SR4X_BASE`` values overlaid
+    with the backbone's ``_PER_BACKBONE`` entry. No checkpoint is loaded here,
+    and moving the network to a device or dtype is left to the caller.
+
+    Raises:
+        ValueError: if *backbone* has no entry in ``_PER_BACKBONE``.
+    """
+    backbone_overrides = _PER_BACKBONE.get(backbone)
+    if backbone_overrides is None:
+        raise ValueError(
+            f"PiD decoder backbone {backbone!r} is not supported. Expected one of: {list(_PER_BACKBONE.keys())}."
+        )
+    net_kwargs = dict(_PID_SR4X_BASE)
+    net_kwargs.update(backbone_overrides)
+    return PidNet(**net_kwargs)
+
+
+def _preview_keys(keys: list[str], limit: int = 5) -> str:
+    """Render the first *limit* entries of *keys*, adding a "(+ N more)" tail when truncated."""
+    overflow = len(keys) - limit
+    return f"{keys[:limit]}" + (f" (+ {overflow} more)" if overflow > 0 else "")
+
+
+def load_pid_decoder(state_dict: dict[str, Tensor], backbone: BaseModelType) -> PidNet:
+    """Instantiate a PidNet for *backbone* and populate it with *state_dict*.
+
+    The state dict is expected to be the model-manager loader's output, i.e.
+    already stripped of the `net.` prefix used by NVIDIA's distill model
+    serialisation. The caller still owns dtype/device placement of the
+    returned net.
+
+    Raises:
+        ValueError: if *backbone* is not supported (raised by ``build_pid_net``).
+        RuntimeError: if the checkpoint contains any key PidNet does not have,
+            or lacks any key other than the tolerated ``lq_proj`` ones.
+    """
+    net = build_pid_net(backbone)
+    # strict=False only stops torch from raising its own generic error so the
+    # mismatch lists can be inspected here. Unexpected keys are NOT silently
+    # dropped: any of them is rejected below. Missing keys are tolerated only
+    # when they belong to the LQ projection.
+    missing, unexpected = net.load_state_dict(state_dict, strict=False)
+    if unexpected:
+        raise RuntimeError(f"PiD checkpoint has unexpected keys not present in PidNet: {_preview_keys(unexpected)}")
+    # We tolerate missing `lq_proj.*` (e.g. if the user accidentally passed a
+    # vanilla PixDiT_T2I checkpoint), but anything else points to a real
+    # architecture mismatch.
+    non_lq = [k for k in missing if "lq_proj" not in k]
+    if non_lq:
+        raise RuntimeError(f"PiD checkpoint is missing non-LQ keys required by PidNet: {_preview_keys(non_lq)}")
+    return net
+
+
+# ---------------------------------------------------------------------------
+# Sampling
+# ---------------------------------------------------------------------------
+
+
+def _get_t_list(device: torch.device, *, num_steps: Optional[int] = None) -> Tensor:
+    """Distill-student sigma schedule.
+
+    With ``num_steps`` left as ``None`` or equal to the trained step count
+    (4 for the canonical 5-point list) the canonical list is returned as-is.
+    Otherwise ``num_steps + 1`` entries are picked from it at evenly spaced,
+    rounded positions (mirrors `PidDistillModel._get_t_list`). Asking for more
+    steps than the list provides necessarily repeats entries, so some steps
+    will have ``t_cur == t_next``.
+
+    Args:
+        device: device the schedule tensor is created on.
+        num_steps: number of sampler steps; must be >= 1 when given.
+
+    Returns:
+        A 1-D float32 tensor of ``num_steps + 1`` sigmas ending at 0.
+
+    Raises:
+        ValueError: if ``num_steps`` is given and smaller than 1.
+    """
+    # Reject non-positive counts up front: they would otherwise surface as a
+    # misleading "must end at 0" assertion (0) or an IndexError (negative).
+    if num_steps is not None and num_steps < 1:
+        raise ValueError(f"num_steps must be a positive integer, got {num_steps!r}")
+
+    full = torch.tensor(_STUDENT_T_LIST, device=device, dtype=torch.float32)
+    trained_steps = len(_STUDENT_T_LIST) - 1
+    if num_steps is None or num_steps == trained_steps:
+        t = full
+    else:
+        idx = torch.linspace(0, len(full) - 1, num_steps + 1).round().long()
+        t = full[idx]
+    # Invariant of the canonical list: the last step must land on the clean sample.
+    assert abs(t[-1].item()) < 1e-6, "t_list must end at 0"
+    return t
+
+
+def _velocity_to_x0(x_t: Tensor, net_output: Tensor, t: Tensor) -> Tensor:
+    """Convert the network's velocity prediction back to x0 at time *t*.
+
+    Computes ``x_t - t * net_output`` with *t* broadcast over every non-batch
+    dimension, and returns the result in ``x_t``'s dtype.
+
+    Args:
+        x_t: current sample; its first dimension is the batch.
+        net_output: velocity prediction, broadcast-compatible with ``x_t``.
+        t: one time value per batch element (``x_t.shape[0]`` elements).
+    """
+    # Accumulate in float64 where the backend supports it. MPS has no float64
+    # support, so fall back to float32 there instead of raising on `.double()`.
+    acc_dtype = torch.float32 if x_t.device.type == "mps" else torch.float64
+    per_sample_shape = [x_t.shape[0]] + [1] * (x_t.ndim - 1)
+    # reshape (not view): `t` may be an expanded, stride-0 tensor that `.to()`
+    # returns unchanged when the dtype already matches.
+    t_shaped = t.to(acc_dtype).reshape(per_sample_shape)
+    return (x_t.to(acc_dtype) - t_shaped * net_output.to(acc_dtype)).to(x_t.dtype)
+
+
+@torch.no_grad()
+def _student_sample_loop(
+    net: PidNet,
+    *,
+    noise: Tensor,
+    t_list: Tensor,
+    caption_embs: Tensor,
+    caption_mask: Optional[Tensor],
+    lq_latent: Optional[Tensor],
+    degrade_sigma: Tensor,
+    sample_type: str = "sde",
+    autocast_dtype: Optional[torch.dtype] = None,
+    generator: Optional[torch.Generator] = None,
+) -> Tensor:
+    """Few-step distilled sampler.
+
+    Mirrors `PidDistillModel._student_sample_loop`. Walks consecutive
+    ``(t_cur, t_next)`` pairs of *t_list*, starting from *noise*:
+
+    * ``sample_type == "ode"``: intermediate steps take an Euler step
+      ``x + (t_next - t_cur) * v``.
+    * any other value (the default ``"sde"``, used by the released
+      res2k_sr4x checkpoints): intermediate steps predict x0 and re-noise it
+      to ``t_next`` with fresh Gaussian noise drawn from *generator*.
+    * a step whose ``t_next`` is not positive returns the x0 prediction.
+
+    ``autocast_dtype`` mirrors PiD's training-time precision config (bf16):
+    the parameters can stay in float32 but cosines / RoPE tensors created
+    inside the forward must be cast on the fly. Autocast is only enabled when
+    a dtype is given AND *noise* lives on a CUDA device; set to ``None`` to
+    disable.
+
+    ``caption_mask`` is accepted but intentionally not forwarded to the
+    network (see the comment in the loop body).
+    """
+    batch_size = noise.shape[0]
+    # Loop-invariant: shape used to broadcast a per-step scalar over the batch.
+    broadcast_shape = [batch_size] + [1] * (noise.ndim - 1)
+    x = noise
+    autocast_ctx = (
+        torch.autocast(noise.device.type, dtype=autocast_dtype)
+        if autocast_dtype is not None and noise.device.type == "cuda"
+        else nullcontext()
+    )
+    for t_cur, t_next in zip(t_list[:-1], t_list[1:], strict=True):
+        t_cur_batch = t_cur.expand(batch_size)
+        with autocast_ctx:
+            # Do not pass the caption mask through here: upstream PiD's
+            # PidDistillModel sampler omits it too, and PidNet forwards the
+            # same `mask` argument unchanged to its pixel blocks where the
+            # shape (B, T_text) is incompatible with the patch-token K
+            # dimension that block expects. We keep `caption_mask` available
+            # in the signature so a future patch-block-only path can reuse
+            # it without another API change.
+            v_pred = net(
+                x,
+                t_cur_batch * _FM_TIMESCALE,
+                caption_embs,
+                lq_video_or_image=None,
+                lq_latent=lq_latent,
+                degrade_sigma=degrade_sigma,
+            )
+
+        is_intermediate = t_next.item() > 0
+        if is_intermediate and sample_type == "ode":
+            # ODE step (kept for symmetry; unused by the 4-step preset). It
+            # needs neither the x0 prediction nor fresh noise, so skip both.
+            x = x + (t_next - t_cur) * v_pred
+            continue
+
+        x0_pred = _velocity_to_x0(x, v_pred, t_cur_batch)
+        if not is_intermediate:
+            # Final step: the clean prediction is the result.
+            x = x0_pred
+            continue
+
+        # SDE step: re-noise the x0 prediction to the next noise level.
+        eps_infer = torch.randn(
+            x0_pred.shape,
+            device=x0_pred.device,
+            dtype=x0_pred.dtype,
+            generator=generator,
+        )
+        t_next_b = t_next.reshape(1).expand(broadcast_shape)
+        x = (1.0 - t_next_b) * x0_pred + t_next_b * eps_infer
+    return x
+
+
+# ---------------------------------------------------------------------------
+# Public API
+# ---------------------------------------------------------------------------
+
+
+@dataclass(frozen=True)
+class PiDDecodeConfig:
+ """Per-call decode knobs.
+
+ The defaults match NVIDIA's released `res2k_sr4x_*_distill_4step`
+ presets; callers (i.e. the Phase 6.x invocations) may override them.
+
+ Within this module, `PiDDecoder.decode` reads `num_inference_steps`,
+ `sample_type`, `degrade_sigma` and `seed`.
+ """
+
+ # Number of sampler steps; passed to `_get_t_list` to build the schedule.
+ num_inference_steps: int = 4
+ # NOTE(review): not read by `PiDDecoder.decode` in this module, which takes
+ # the SR factor from `net.sr_scale` instead - confirm whether this is still needed.
+ scale: int = 4
+ # "ode" selects the Euler step in `_student_sample_loop`; any other value
+ # takes the SDE (re-noising) path.
+ sample_type: str = "sde"
+ # Caller-supplied per-sample noise levels of the input latent — 0.0 means
+ # "the latent is the clean x0 from the LDM" (the from_ldm path); the
+ # from_clean upscale path passes the LDM scheduler's per-step sigma here.
+ degrade_sigma: float | list[float] | Tensor = 0.0
+ # Seeds the torch.Generator that produces the initial and per-step noise.
+ seed: int = 0
+ # NOTE(review): not read by `PiDDecoder.decode` in this module; the schedule
+ # comes from the module-level `_STUDENT_T_LIST` via `_get_t_list` - confirm intent.
+ student_t_list: list[float] = field(default_factory=lambda: list(_STUDENT_T_LIST))
+
+
+class PiDDecoder:
+    """High-level decoder that hides PidNet construction and sampling.
+
+    Usage::
+
+        net = load_pid_decoder(state_dict, backbone)
+        net = net.to(device=..., dtype=...)
+        decoder = PiDDecoder(net, backbone=BaseModelType.Flux)
+        image = decoder.decode(latent=..., caption_embs=...)
+    """
+
+    def __init__(self, net: PidNet, backbone: BaseModelType) -> None:
+        """Wrap an already-loaded *net* for *backbone*.
+
+        Raises:
+            ValueError: if *backbone* has no entry in ``_PER_BACKBONE``.
+        """
+        if backbone not in _PER_BACKBONE:
+            raise ValueError(f"Unsupported PiD backbone: {backbone!r}")
+        self.net = net
+        self.backbone = backbone
+
+    @property
+    def sr_scale(self) -> int:
+        """Super-resolution factor, as reported by the wrapped network."""
+        return int(self.net.sr_scale)
+
+    @property
+    def latent_spatial_down_factor(self) -> int:
+        """Spatial down-factor of the backbone's latent, from ``_PER_BACKBONE``."""
+        return int(_PER_BACKBONE[self.backbone]["latent_spatial_down_factor"])
+
+    @staticmethod
+    def _resolve_degrade_sigma(
+        sigma: float | list[float] | Tensor, batch_size: int, device: torch.device
+    ) -> Tensor:
+        """Normalise ``degrade_sigma`` to a float32 tensor of shape ``[batch_size]``.
+
+        Scalars are repeated for every sample. A tensor is flattened first; a
+        single-element tensor or single-element list/tuple is broadcast to the
+        batch. Anything else must already have exactly ``batch_size`` entries.
+
+        Raises:
+            ValueError: if the value cannot be brought to shape ``[batch_size]``.
+        """
+        if isinstance(sigma, Tensor):
+            sigma_t = sigma.to(device=device, dtype=torch.float32).reshape(-1)
+        elif isinstance(sigma, (list, tuple)):
+            sigma_t = torch.tensor(sigma, device=device, dtype=torch.float32)
+        else:
+            sigma_t = torch.full((batch_size,), float(sigma), device=device, dtype=torch.float32)
+        # Broadcast a lone value regardless of whether it arrived as a tensor
+        # or as a list, so both spellings behave the same way.
+        if tuple(sigma_t.shape) == (1,):
+            sigma_t = sigma_t.expand(batch_size).contiguous()
+        if sigma_t.shape != (batch_size,):
+            raise ValueError(f"degrade_sigma must broadcast to [B={batch_size}], got shape {tuple(sigma_t.shape)}")
+        return sigma_t
+
+    @torch.no_grad()
+    def decode(
+        self,
+        *,
+        latent: Tensor,
+        caption_embs: Tensor,
+        caption_mask: Optional[Tensor] = None,
+        config: Optional[PiDDecodeConfig] = None,
+    ) -> Tensor:
+        """Decode *latent* + *caption_embs* into a pixel tensor in [-1, 1].
+
+        Args:
+            latent: ``[B, C_lat, H_lat, W_lat]`` LQ latent (the LDM's x0
+                output, scaled per the backbone's VAE convention). Its device
+                decides where the decode runs.
+            caption_embs: ``[B, T, 2304]`` Gemma-2-2b-it caption embeddings
+                (output of `_encode_text_raw` upstream — InvokeAI callers
+                produce this via `Gemma2EncoderLoader`).
+            caption_mask: optional mask for *caption_embs*. It is moved to the
+                decode device and handed to the sampler, which currently does
+                not forward it to the network.
+            config: per-call sampling overrides; defaults to the released
+                `res2k_sr4x_*_distill_4step` preset.
+
+        Returns:
+            ``[B, 3, H_lat * sr_scale * latent_spatial_down_factor,
+            W_lat * sr_scale * latent_spatial_down_factor]`` in [-1, 1].
+
+        Raises:
+            ValueError: if ``config.degrade_sigma`` cannot be broadcast to ``[B]``.
+        """
+        cfg = config if config is not None else PiDDecodeConfig()
+        device = latent.device
+        dtype = next(self.net.parameters()).dtype
+        # On CUDA, always run the forward pass under bf16 autocast: matmuls and
+        # convolutions execute in bf16 (fast + small activations), while
+        # numerically sensitive reductions like RMSNorm stay in the parameter
+        # dtype. PidNet is intentionally loaded in fp32 (see the loader) so
+        # those reductions actually keep their precision.
+        autocast_dtype = torch.bfloat16 if device.type == "cuda" else None
+        batch_size = latent.shape[0]
+
+        # Validate the cheap input first so a bad sigma fails before the
+        # full-resolution noise tensor is allocated.
+        degrade_sigma_t = self._resolve_degrade_sigma(cfg.degrade_sigma, batch_size, device)
+
+        # Spatial size of the noise tensor — the decoder operates in pixel
+        # space at sr_scale * latent_spatial_down_factor times the latent.
+        total_up = self.sr_scale * self.latent_spatial_down_factor
+        img_h = int(latent.shape[-2] * total_up)
+        img_w = int(latent.shape[-1] * total_up)
+
+        # One generator drives both the initial noise and the sampler's
+        # per-step noise, so a given seed reproduces the whole decode.
+        gen = torch.Generator(device=device).manual_seed(int(cfg.seed))
+        noise = torch.randn(batch_size, 3, img_h, img_w, device=device, generator=gen, dtype=dtype)
+
+        caption_embs = caption_embs.to(device=device, dtype=dtype)
+        if caption_mask is not None:
+            caption_mask = caption_mask.to(device=device)
+        lq_latent = latent.to(device=device, dtype=dtype)
+
+        t_list = _get_t_list(device, num_steps=cfg.num_inference_steps)
+
+        self.net.eval()
+        x0 = _student_sample_loop(
+            self.net,
+            noise=noise,
+            t_list=t_list,
+            caption_embs=caption_embs,
+            caption_mask=caption_mask,
+            lq_latent=lq_latent,
+            degrade_sigma=degrade_sigma_t,
+            sample_type=cfg.sample_type,
+            autocast_dtype=autocast_dtype,
+            generator=gen,
+        )
+        return x0.clamp(-1, 1)
+
+
+@torch.no_grad()
+def encode_caption_for_pid(
+    captions: list[str],
+    *,
+    tokenizer: "object",  # AutoTokenizer; typed loose to avoid importing transformers at module load
+    encoder: "object",  # Gemma2Model
+    device: torch.device,
+    dtype: torch.dtype = torch.bfloat16,
+    chi_prompt: str = PID_CHI_PROMPT,
+    model_max_length: int = PID_MODEL_MAX_LENGTH,
+) -> tuple[Tensor, Tensor]:
+    """Encode *captions* the way `PixelDiTModel._encode_text_raw` does upstream.
+
+    Steps:
+
+    1. Prefix every caption with *chi_prompt*.
+    2. Tokenise with right-padding to a fixed length (the chi-prompt token
+       count plus ``model_max_length - 2``, or just ``model_max_length`` when
+       no chi-prompt is used), truncating longer inputs.
+    3. Run *encoder* on the token ids and attention mask and take its first
+       output.
+    4. Keep position 0 plus the last ``model_max_length - 1`` positions, for
+       both the hidden states and the attention mask.
+
+    Returns:
+        ``(caption_embs, caption_mask)``: the selected hidden states cast to
+        *dtype*, and the matching attention mask as a bool tensor.
+
+    Raises:
+        ValueError: if *captions* is empty.
+    """
+    if not captions:
+        raise ValueError("encode_caption_for_pid requires at least one caption.")
+
+    if chi_prompt:
+        chi_token_count = len(tokenizer.encode(chi_prompt))
+        padded_length = chi_token_count + model_max_length - 2
+    else:
+        padded_length = model_max_length
+    full_prompts = [chi_prompt + caption for caption in captions]
+
+    # PiD was trained with right-padding (see PixelDiTModel._load_text_encoder
+    # upstream). Gemma2's tokenizer defaults to "left", which would move the
+    # BOS token off index 0 and put pads into the slice selected below. The
+    # tokenizer is a shared cached object, so the original setting is put back
+    # even if tokenisation fails.
+    previous_side = getattr(tokenizer, "padding_side", "right")
+    try:
+        tokenizer.padding_side = "right"
+        batch = tokenizer(
+            full_prompts,
+            max_length=padded_length,
+            padding="max_length",
+            truncation=True,
+            return_tensors="pt",
+        ).to(device)
+    finally:
+        tokenizer.padding_side = previous_side
+
+    hidden_states = encoder(batch.input_ids, batch.attention_mask)[0]
+    # First token, then the trailing (model_max_length - 1) tokens.
+    keep = [0, *range(1 - model_max_length, 0)]
+    embeddings = hidden_states[:, keep].to(dtype=dtype)
+    # HF tokenizers emit attention_mask as int64, but SDPA
+    # (scaled_dot_product_attention) rejects integer masks: it needs bool or a
+    # matching float dtype. Bool also matches the upstream `pad = mask == 0`
+    # reduction in pid_net.py.
+    mask = batch.attention_mask[:, keep].to(torch.bool)
+    return embeddings, mask
+
+
+# Public surface of this module. Every non-underscore function, class and
+# prompt constant defined above is listed, including the working-memory
+# estimator.
+__all__ = [
+    "PID_CHI_PROMPT",
+    "PID_MODEL_MAX_LENGTH",
+    "PID_NEGATIVE_PROMPT",
+    "PiDDecodeConfig",
+    "PiDDecoder",
+    "build_pid_net",
+    "encode_caption_for_pid",
+    "estimate_pid_decode_working_memory",
+    "load_pid_decoder",
+]
diff --git a/invokeai/frontend/web/openapi.json b/invokeai/frontend/web/openapi.json
index e822e8b260e..5621184e87e 100644
--- a/invokeai/frontend/web/openapi.json
+++ b/invokeai/frontend/web/openapi.json
@@ -876,6 +876,21 @@
{
"$ref": "#/components/schemas/VAE_Diffusers_Flux2_Config"
},
+ {
+ "$ref": "#/components/schemas/PiDDecoder_Checkpoint_FLUX_Config"
+ },
+ {
+ "$ref": "#/components/schemas/PiDDecoder_Checkpoint_Flux2_Config"
+ },
+ {
+ "$ref": "#/components/schemas/PiDDecoder_Checkpoint_SD3_Config"
+ },
+ {
+ "$ref": "#/components/schemas/PiDDecoder_Checkpoint_SDXL_Config"
+ },
+ {
+ "$ref": "#/components/schemas/PiDDecoder_Checkpoint_QwenImage_Config"
+ },
{
"$ref": "#/components/schemas/ControlNet_Checkpoint_SD1_Config"
},
@@ -969,6 +984,9 @@
{
"$ref": "#/components/schemas/Qwen3Encoder_GGUF_Config"
},
+ {
+ "$ref": "#/components/schemas/Gemma2Encoder_Gemma2Encoder_Config"
+ },
{
"$ref": "#/components/schemas/QwenVLEncoder_Diffusers_Config"
},
@@ -1197,6 +1215,21 @@
{
"$ref": "#/components/schemas/VAE_Diffusers_Flux2_Config"
},
+ {
+ "$ref": "#/components/schemas/PiDDecoder_Checkpoint_FLUX_Config"
+ },
+ {
+ "$ref": "#/components/schemas/PiDDecoder_Checkpoint_Flux2_Config"
+ },
+ {
+ "$ref": "#/components/schemas/PiDDecoder_Checkpoint_SD3_Config"
+ },
+ {
+ "$ref": "#/components/schemas/PiDDecoder_Checkpoint_SDXL_Config"
+ },
+ {
+ "$ref": "#/components/schemas/PiDDecoder_Checkpoint_QwenImage_Config"
+ },
{
"$ref": "#/components/schemas/ControlNet_Checkpoint_SD1_Config"
},
@@ -1290,6 +1323,9 @@
{
"$ref": "#/components/schemas/Qwen3Encoder_GGUF_Config"
},
+ {
+ "$ref": "#/components/schemas/Gemma2Encoder_Gemma2Encoder_Config"
+ },
{
"$ref": "#/components/schemas/QwenVLEncoder_Diffusers_Config"
},
@@ -1518,6 +1554,21 @@
{
"$ref": "#/components/schemas/VAE_Diffusers_Flux2_Config"
},
+ {
+ "$ref": "#/components/schemas/PiDDecoder_Checkpoint_FLUX_Config"
+ },
+ {
+ "$ref": "#/components/schemas/PiDDecoder_Checkpoint_Flux2_Config"
+ },
+ {
+ "$ref": "#/components/schemas/PiDDecoder_Checkpoint_SD3_Config"
+ },
+ {
+ "$ref": "#/components/schemas/PiDDecoder_Checkpoint_SDXL_Config"
+ },
+ {
+ "$ref": "#/components/schemas/PiDDecoder_Checkpoint_QwenImage_Config"
+ },
{
"$ref": "#/components/schemas/ControlNet_Checkpoint_SD1_Config"
},
@@ -1611,6 +1662,9 @@
{
"$ref": "#/components/schemas/Qwen3Encoder_GGUF_Config"
},
+ {
+ "$ref": "#/components/schemas/Gemma2Encoder_Gemma2Encoder_Config"
+ },
{
"$ref": "#/components/schemas/QwenVLEncoder_Diffusers_Config"
},
@@ -1889,6 +1943,21 @@
{
"$ref": "#/components/schemas/VAE_Diffusers_Flux2_Config"
},
+ {
+ "$ref": "#/components/schemas/PiDDecoder_Checkpoint_FLUX_Config"
+ },
+ {
+ "$ref": "#/components/schemas/PiDDecoder_Checkpoint_Flux2_Config"
+ },
+ {
+ "$ref": "#/components/schemas/PiDDecoder_Checkpoint_SD3_Config"
+ },
+ {
+ "$ref": "#/components/schemas/PiDDecoder_Checkpoint_SDXL_Config"
+ },
+ {
+ "$ref": "#/components/schemas/PiDDecoder_Checkpoint_QwenImage_Config"
+ },
{
"$ref": "#/components/schemas/ControlNet_Checkpoint_SD1_Config"
},
@@ -1982,6 +2051,9 @@
{
"$ref": "#/components/schemas/Qwen3Encoder_GGUF_Config"
},
+ {
+ "$ref": "#/components/schemas/Gemma2Encoder_Gemma2Encoder_Config"
+ },
{
"$ref": "#/components/schemas/QwenVLEncoder_Diffusers_Config"
},
@@ -2284,6 +2356,21 @@
{
"$ref": "#/components/schemas/VAE_Diffusers_Flux2_Config"
},
+ {
+ "$ref": "#/components/schemas/PiDDecoder_Checkpoint_FLUX_Config"
+ },
+ {
+ "$ref": "#/components/schemas/PiDDecoder_Checkpoint_Flux2_Config"
+ },
+ {
+ "$ref": "#/components/schemas/PiDDecoder_Checkpoint_SD3_Config"
+ },
+ {
+ "$ref": "#/components/schemas/PiDDecoder_Checkpoint_SDXL_Config"
+ },
+ {
+ "$ref": "#/components/schemas/PiDDecoder_Checkpoint_QwenImage_Config"
+ },
{
"$ref": "#/components/schemas/ControlNet_Checkpoint_SD1_Config"
},
@@ -2377,6 +2464,9 @@
{
"$ref": "#/components/schemas/Qwen3Encoder_GGUF_Config"
},
+ {
+ "$ref": "#/components/schemas/Gemma2Encoder_Gemma2Encoder_Config"
+ },
{
"$ref": "#/components/schemas/QwenVLEncoder_Diffusers_Config"
},
@@ -3499,6 +3589,21 @@
{
"$ref": "#/components/schemas/VAE_Diffusers_Flux2_Config"
},
+ {
+ "$ref": "#/components/schemas/PiDDecoder_Checkpoint_FLUX_Config"
+ },
+ {
+ "$ref": "#/components/schemas/PiDDecoder_Checkpoint_Flux2_Config"
+ },
+ {
+ "$ref": "#/components/schemas/PiDDecoder_Checkpoint_SD3_Config"
+ },
+ {
+ "$ref": "#/components/schemas/PiDDecoder_Checkpoint_SDXL_Config"
+ },
+ {
+ "$ref": "#/components/schemas/PiDDecoder_Checkpoint_QwenImage_Config"
+ },
{
"$ref": "#/components/schemas/ControlNet_Checkpoint_SD1_Config"
},
@@ -3592,6 +3697,9 @@
{
"$ref": "#/components/schemas/Qwen3Encoder_GGUF_Config"
},
+ {
+ "$ref": "#/components/schemas/Gemma2Encoder_Gemma2Encoder_Config"
+ },
{
"$ref": "#/components/schemas/QwenVLEncoder_Diffusers_Config"
},
@@ -11542,6 +11650,21 @@
{
"$ref": "#/components/schemas/VAE_Diffusers_Flux2_Config"
},
+ {
+ "$ref": "#/components/schemas/PiDDecoder_Checkpoint_FLUX_Config"
+ },
+ {
+ "$ref": "#/components/schemas/PiDDecoder_Checkpoint_Flux2_Config"
+ },
+ {
+ "$ref": "#/components/schemas/PiDDecoder_Checkpoint_SD3_Config"
+ },
+ {
+ "$ref": "#/components/schemas/PiDDecoder_Checkpoint_SDXL_Config"
+ },
+ {
+ "$ref": "#/components/schemas/PiDDecoder_Checkpoint_QwenImage_Config"
+ },
{
"$ref": "#/components/schemas/ControlNet_Checkpoint_SD1_Config"
},
@@ -11635,6 +11758,9 @@
{
"$ref": "#/components/schemas/Qwen3Encoder_GGUF_Config"
},
+ {
+ "$ref": "#/components/schemas/Gemma2Encoder_Gemma2Encoder_Config"
+ },
{
"$ref": "#/components/schemas/QwenVLEncoder_Diffusers_Config"
},
@@ -25118,11 +25244,11 @@
"$ref": "#/components/schemas/FluxConditioningOutput"
}
},
- "Flux2VaeDecodeInvocation": {
+ "Flux2PiDDecodeInvocation": {
"category": "latents",
"class": "invocation",
"classification": "prototype",
- "description": "Generates an image from latents using FLUX.2 Klein's 32-channel VAE.",
+ "description": "Decode a FLUX.2 Klein latent with the PiD pixel-diffusion decoder.\n\nProduces a 4x super-resolved image in a single pass. The stored FLUX.2 latent\nis patchified from ``(B, 32, H/8, W/8)`` to the ``(B, 128, H/16, W/16)`` layout\nPiD's FLUX.2 backbone expects, then decoded directly (it is already in raw,\nBN-denormalized space; see the module docstring).",
"node_pack": "invokeai",
"properties": {
"board": {
@@ -25196,83 +25322,54 @@
"input": "connection",
"orig_required": true
},
- "vae": {
+ "prompt": {
"anyOf": [
{
- "$ref": "#/components/schemas/VAEField"
+ "type": "string"
},
{
"type": "null"
}
],
"default": null,
- "description": "VAE",
+ "description": "Text prompt the latent was generated from. PiD conditions on it.",
"field_kind": "input",
- "input": "connection",
- "orig_required": true
- },
- "type": {
- "const": "flux2_vae_decode",
- "default": "flux2_vae_decode",
- "field_kind": "node_attribute",
- "title": "type",
- "type": "string"
- }
- },
- "required": ["type", "id"],
- "tags": ["latents", "image", "vae", "l2i", "flux2", "klein"],
- "title": "Latents to Image - FLUX2",
- "type": "object",
- "version": "1.0.0",
- "output": {
- "$ref": "#/components/schemas/ImageOutput"
- }
- },
- "Flux2VaeEncodeInvocation": {
- "category": "latents",
- "class": "invocation",
- "classification": "prototype",
- "description": "Encodes an image into latents using FLUX.2 Klein's 32-channel VAE.",
- "node_pack": "invokeai",
- "properties": {
- "id": {
- "description": "The id of this instance of an invocation. Must be unique among all instances of invocations.",
- "field_kind": "node_attribute",
- "title": "Id",
- "type": "string"
- },
- "is_intermediate": {
- "default": false,
- "description": "Whether or not this is an intermediate invocation.",
- "field_kind": "node_attribute",
- "input": "direct",
+ "input": "any",
"orig_required": true,
- "title": "Is Intermediate",
- "type": "boolean",
- "ui_hidden": false,
- "ui_type": "IsIntermediate"
+ "title": "Prompt",
+ "ui_component": "textarea"
},
- "use_cache": {
- "default": true,
- "description": "Whether or not to use the cache",
- "field_kind": "node_attribute",
- "title": "Use Cache",
- "type": "boolean"
+ "gemma2_encoder": {
+ "anyOf": [
+ {
+ "$ref": "#/components/schemas/Gemma2EncoderField"
+ },
+ {
+ "type": "null"
+ }
+ ],
+ "default": null,
+ "description": "Gemma-2 caption encoder. Required by PiD.",
+ "field_kind": "input",
+ "input": "connection",
+ "orig_required": true,
+ "title": "Gemma-2 Encoder"
},
- "image": {
+ "pid_decoder": {
"anyOf": [
{
- "$ref": "#/components/schemas/ImageField"
+ "$ref": "#/components/schemas/PiDDecoderField"
},
{
"type": "null"
}
],
"default": null,
- "description": "The image to encode.",
+ "description": "PiD FLUX.2 decoder checkpoint.",
"field_kind": "input",
- "input": "any",
- "orig_required": true
+ "input": "connection",
+ "orig_required": true,
+ "title": "PiD Decoder"
},
"vae": {
"anyOf": [
@@ -25284,114 +25381,326 @@
}
],
"default": null,
- "description": "VAE",
+ "description": "FLUX.2 VAE, used only to read a scalar scaling_factor / shift_factor if one exists. FLUX.2 normalises latents with BatchNorm (already inverted in flux2_denoise), so this is normally an identity transform and the input can be left unconnected.",
"field_kind": "input",
"input": "connection",
- "orig_required": true
+ "orig_default": null,
+ "orig_required": false,
+ "title": "VAE"
+ },
+ "num_inference_steps": {
+ "default": 4,
+ "description": "Number of PiD distill steps. The released checkpoints are trained for 4.",
+ "field_kind": "input",
+ "input": "any",
+ "maximum": 8,
+ "minimum": 1,
+ "orig_default": 4,
+ "orig_required": false,
+ "title": "Num Inference Steps",
+ "type": "integer"
+ },
+ "seed": {
+ "default": 0,
+ "description": "Seed for the PiD decoder's noise.",
+ "field_kind": "input",
+ "input": "any",
+ "orig_default": 0,
+ "orig_required": false,
+ "title": "Seed",
+ "type": "integer"
},
"type": {
- "const": "flux2_vae_encode",
- "default": "flux2_vae_encode",
+ "const": "flux2_pid_decode",
+ "default": "flux2_pid_decode",
"field_kind": "node_attribute",
"title": "type",
"type": "string"
}
},
"required": ["type", "id"],
- "tags": ["latents", "image", "vae", "i2l", "flux2", "klein"],
- "title": "Image to Latents - FLUX2",
+ "tags": ["latents", "image", "pid", "flux2", "klein", "upscale"],
+ "title": "Latents to Image - FLUX.2 + PiD (4x SR)",
"type": "object",
"version": "1.0.0",
"output": {
- "$ref": "#/components/schemas/LatentsOutput"
+ "$ref": "#/components/schemas/ImageOutput"
}
},
- "Flux2VariantType": {
- "type": "string",
- "enum": ["klein_4b", "klein_4b_base", "klein_9b", "klein_9b_base"],
- "title": "Flux2VariantType",
- "description": "FLUX.2 model variants."
- },
- "FluxConditioningCollectionOutput": {
- "class": "output",
- "description": "Base class for nodes that output a collection of conditioning tensors",
+ "Flux2VaeDecodeInvocation": {
+ "category": "latents",
+ "class": "invocation",
+ "classification": "prototype",
+ "description": "Generates an image from latents using FLUX.2 Klein's 32-channel VAE.",
+ "node_pack": "invokeai",
"properties": {
- "collection": {
- "description": "The output conditioning tensors",
- "field_kind": "output",
- "items": {
- "$ref": "#/components/schemas/FluxConditioningField"
- },
- "title": "Collection",
- "type": "array",
+ "board": {
+ "anyOf": [
+ {
+ "$ref": "#/components/schemas/BoardField"
+ },
+ {
+ "type": "null"
+ }
+ ],
+ "default": null,
+ "description": "The board to save the image to",
+ "field_kind": "internal",
+ "input": "direct",
+ "orig_required": false,
"ui_hidden": false
},
- "type": {
- "const": "flux_conditioning_collection_output",
- "default": "flux_conditioning_collection_output",
- "field_kind": "node_attribute",
- "title": "type",
- "type": "string"
- }
- },
- "required": ["output_meta", "collection", "type", "type"],
- "title": "FluxConditioningCollectionOutput",
- "type": "object"
- },
- "FluxConditioningField": {
- "description": "A conditioning tensor primitive value",
- "properties": {
- "conditioning_name": {
- "description": "The name of conditioning tensor",
- "title": "Conditioning Name",
- "type": "string"
- },
- "mask": {
+ "metadata": {
"anyOf": [
{
- "$ref": "#/components/schemas/TensorField"
+ "$ref": "#/components/schemas/MetadataField"
},
{
"type": "null"
}
],
"default": null,
- "description": "The mask associated with this conditioning tensor. Excluded regions should be set to False, included regions should be set to True."
- }
- },
- "required": ["conditioning_name"],
- "title": "FluxConditioningField",
- "type": "object"
- },
- "FluxConditioningOutput": {
- "class": "output",
- "description": "Base class for nodes that output a single conditioning tensor",
- "properties": {
- "conditioning": {
- "$ref": "#/components/schemas/FluxConditioningField",
- "description": "Conditioning tensor",
- "field_kind": "output",
+ "description": "Optional metadata to be saved with the image",
+ "field_kind": "internal",
+ "input": "connection",
+ "orig_required": false,
"ui_hidden": false
},
- "type": {
- "const": "flux_conditioning_output",
- "default": "flux_conditioning_output",
- "field_kind": "node_attribute",
- "title": "type",
- "type": "string"
- }
- },
- "required": ["output_meta", "conditioning", "type", "type"],
- "title": "FluxConditioningOutput",
- "type": "object"
- },
- "FluxControlLoRALoaderInvocation": {
- "category": "model",
- "class": "invocation",
- "classification": "stable",
- "description": "LoRA model and Image to use with FLUX transformer generation.",
- "node_pack": "invokeai",
- "properties": {
+ "id": {
+ "description": "The id of this instance of an invocation. Must be unique among all instances of invocations.",
+ "field_kind": "node_attribute",
+ "title": "Id",
+ "type": "string"
+ },
+ "is_intermediate": {
+ "default": false,
+ "description": "Whether or not this is an intermediate invocation.",
+ "field_kind": "node_attribute",
+ "input": "direct",
+ "orig_required": true,
+ "title": "Is Intermediate",
+ "type": "boolean",
+ "ui_hidden": false,
+ "ui_type": "IsIntermediate"
+ },
+ "use_cache": {
+ "default": true,
+ "description": "Whether or not to use the cache",
+ "field_kind": "node_attribute",
+ "title": "Use Cache",
+ "type": "boolean"
+ },
+ "latents": {
+ "anyOf": [
+ {
+ "$ref": "#/components/schemas/LatentsField"
+ },
+ {
+ "type": "null"
+ }
+ ],
+ "default": null,
+ "description": "Latents tensor",
+ "field_kind": "input",
+ "input": "connection",
+ "orig_required": true
+ },
+ "vae": {
+ "anyOf": [
+ {
+ "$ref": "#/components/schemas/VAEField"
+ },
+ {
+ "type": "null"
+ }
+ ],
+ "default": null,
+ "description": "VAE",
+ "field_kind": "input",
+ "input": "connection",
+ "orig_required": true
+ },
+ "type": {
+ "const": "flux2_vae_decode",
+ "default": "flux2_vae_decode",
+ "field_kind": "node_attribute",
+ "title": "type",
+ "type": "string"
+ }
+ },
+ "required": ["type", "id"],
+ "tags": ["latents", "image", "vae", "l2i", "flux2", "klein"],
+ "title": "Latents to Image - FLUX2",
+ "type": "object",
+ "version": "1.0.0",
+ "output": {
+ "$ref": "#/components/schemas/ImageOutput"
+ }
+ },
+ "Flux2VaeEncodeInvocation": {
+ "category": "latents",
+ "class": "invocation",
+ "classification": "prototype",
+ "description": "Encodes an image into latents using FLUX.2 Klein's 32-channel VAE.",
+ "node_pack": "invokeai",
+ "properties": {
+ "id": {
+ "description": "The id of this instance of an invocation. Must be unique among all instances of invocations.",
+ "field_kind": "node_attribute",
+ "title": "Id",
+ "type": "string"
+ },
+ "is_intermediate": {
+ "default": false,
+ "description": "Whether or not this is an intermediate invocation.",
+ "field_kind": "node_attribute",
+ "input": "direct",
+ "orig_required": true,
+ "title": "Is Intermediate",
+ "type": "boolean",
+ "ui_hidden": false,
+ "ui_type": "IsIntermediate"
+ },
+ "use_cache": {
+ "default": true,
+ "description": "Whether or not to use the cache",
+ "field_kind": "node_attribute",
+ "title": "Use Cache",
+ "type": "boolean"
+ },
+ "image": {
+ "anyOf": [
+ {
+ "$ref": "#/components/schemas/ImageField"
+ },
+ {
+ "type": "null"
+ }
+ ],
+ "default": null,
+ "description": "The image to encode.",
+ "field_kind": "input",
+ "input": "any",
+ "orig_required": true
+ },
+ "vae": {
+ "anyOf": [
+ {
+ "$ref": "#/components/schemas/VAEField"
+ },
+ {
+ "type": "null"
+ }
+ ],
+ "default": null,
+ "description": "VAE",
+ "field_kind": "input",
+ "input": "connection",
+ "orig_required": true
+ },
+ "type": {
+ "const": "flux2_vae_encode",
+ "default": "flux2_vae_encode",
+ "field_kind": "node_attribute",
+ "title": "type",
+ "type": "string"
+ }
+ },
+ "required": ["type", "id"],
+ "tags": ["latents", "image", "vae", "i2l", "flux2", "klein"],
+ "title": "Image to Latents - FLUX2",
+ "type": "object",
+ "version": "1.0.0",
+ "output": {
+ "$ref": "#/components/schemas/LatentsOutput"
+ }
+ },
+ "Flux2VariantType": {
+ "type": "string",
+ "enum": ["klein_4b", "klein_4b_base", "klein_9b", "klein_9b_base"],
+ "title": "Flux2VariantType",
+ "description": "FLUX.2 model variants."
+ },
+ "FluxConditioningCollectionOutput": {
+ "class": "output",
+ "description": "Base class for nodes that output a collection of conditioning tensors",
+ "properties": {
+ "collection": {
+ "description": "The output conditioning tensors",
+ "field_kind": "output",
+ "items": {
+ "$ref": "#/components/schemas/FluxConditioningField"
+ },
+ "title": "Collection",
+ "type": "array",
+ "ui_hidden": false
+ },
+ "type": {
+ "const": "flux_conditioning_collection_output",
+ "default": "flux_conditioning_collection_output",
+ "field_kind": "node_attribute",
+ "title": "type",
+ "type": "string"
+ }
+ },
+ "required": ["output_meta", "collection", "type", "type"],
+ "title": "FluxConditioningCollectionOutput",
+ "type": "object"
+ },
+ "FluxConditioningField": {
+ "description": "A conditioning tensor primitive value",
+ "properties": {
+ "conditioning_name": {
+ "description": "The name of conditioning tensor",
+ "title": "Conditioning Name",
+ "type": "string"
+ },
+ "mask": {
+ "anyOf": [
+ {
+ "$ref": "#/components/schemas/TensorField"
+ },
+ {
+ "type": "null"
+ }
+ ],
+ "default": null,
+ "description": "The mask associated with this conditioning tensor. Excluded regions should be set to False, included regions should be set to True."
+ }
+ },
+ "required": ["conditioning_name"],
+ "title": "FluxConditioningField",
+ "type": "object"
+ },
+ "FluxConditioningOutput": {
+ "class": "output",
+ "description": "Base class for nodes that output a single conditioning tensor",
+ "properties": {
+ "conditioning": {
+ "$ref": "#/components/schemas/FluxConditioningField",
+ "description": "Conditioning tensor",
+ "field_kind": "output",
+ "ui_hidden": false
+ },
+ "type": {
+ "const": "flux_conditioning_output",
+ "default": "flux_conditioning_output",
+ "field_kind": "node_attribute",
+ "title": "type",
+ "type": "string"
+ }
+ },
+ "required": ["output_meta", "conditioning", "type", "type"],
+ "title": "FluxConditioningOutput",
+ "type": "object"
+ },
+ "FluxControlLoRALoaderInvocation": {
+ "category": "model",
+ "class": "invocation",
+ "classification": "stable",
+ "description": "LoRA model and Image to use with FLUX transformer generation.",
+ "node_pack": "invokeai",
+ "properties": {
"id": {
"description": "The id of this instance of an invocation. Must be unique among all instances of invocations.",
"field_kind": "node_attribute",
@@ -27576,37 +27885,203 @@
"title": "FluxModelLoaderOutput",
"type": "object"
},
- "FluxReduxConditioningField": {
- "description": "A FLUX Redux conditioning tensor primitive value",
+ "FluxPiDDecodeInvocation": {
+ "category": "latents",
+ "class": "invocation",
+ "classification": "prototype",
+ "description": "Decode a FLUX latent with the PiD pixel-diffusion decoder.\n\nThe FLUX AutoEncoder usually denormalises the stored latent internally\nbefore its conv decoder runs (`z / scale + shift`); we apply the same\ntransform manually here so PiD sees the raw latent it was trained on.",
+ "node_pack": "invokeai",
"properties": {
- "conditioning": {
- "$ref": "#/components/schemas/TensorField",
- "description": "The Redux image conditioning tensor."
+ "board": {
+ "anyOf": [
+ {
+ "$ref": "#/components/schemas/BoardField"
+ },
+ {
+ "type": "null"
+ }
+ ],
+ "default": null,
+ "description": "The board to save the image to",
+ "field_kind": "internal",
+ "input": "direct",
+ "orig_required": false,
+ "ui_hidden": false
},
- "mask": {
+ "metadata": {
"anyOf": [
{
- "$ref": "#/components/schemas/TensorField"
+ "$ref": "#/components/schemas/MetadataField"
},
{
"type": "null"
}
],
"default": null,
- "description": "The mask associated with this conditioning tensor. Excluded regions should be set to False, included regions should be set to True."
- }
- },
- "required": ["conditioning"],
- "title": "FluxReduxConditioningField",
- "type": "object"
- },
- "FluxReduxInvocation": {
- "category": "conditioning",
- "class": "invocation",
- "classification": "beta",
- "description": "Runs a FLUX Redux model to generate a conditioning tensor.",
- "node_pack": "invokeai",
- "properties": {
+ "description": "Optional metadata to be saved with the image",
+ "field_kind": "internal",
+ "input": "connection",
+ "orig_required": false,
+ "ui_hidden": false
+ },
+ "id": {
+ "description": "The id of this instance of an invocation. Must be unique among all instances of invocations.",
+ "field_kind": "node_attribute",
+ "title": "Id",
+ "type": "string"
+ },
+ "is_intermediate": {
+ "default": false,
+ "description": "Whether or not this is an intermediate invocation.",
+ "field_kind": "node_attribute",
+ "input": "direct",
+ "orig_required": true,
+ "title": "Is Intermediate",
+ "type": "boolean",
+ "ui_hidden": false,
+ "ui_type": "IsIntermediate"
+ },
+ "use_cache": {
+ "default": true,
+ "description": "Whether or not to use the cache",
+ "field_kind": "node_attribute",
+ "title": "Use Cache",
+ "type": "boolean"
+ },
+ "latents": {
+ "anyOf": [
+ {
+ "$ref": "#/components/schemas/LatentsField"
+ },
+ {
+ "type": "null"
+ }
+ ],
+ "default": null,
+ "description": "Latents tensor",
+ "field_kind": "input",
+ "input": "connection",
+ "orig_required": true
+ },
+ "prompt": {
+ "anyOf": [
+ {
+ "type": "string"
+ },
+ {
+ "type": "null"
+ }
+ ],
+ "default": null,
+ "description": "Text prompt the latent was generated from. PiD conditions on it.",
+ "field_kind": "input",
+ "input": "any",
+ "orig_required": true,
+ "title": "Prompt",
+ "ui_component": "textarea"
+ },
+ "gemma2_encoder": {
+ "anyOf": [
+ {
+ "$ref": "#/components/schemas/Gemma2EncoderField"
+ },
+ {
+ "type": "null"
+ }
+ ],
+ "default": null,
+ "description": "Gemma-2 caption encoder. Required by PiD.",
+ "field_kind": "input",
+ "input": "connection",
+ "orig_required": true,
+ "title": "Gemma-2 Encoder"
+ },
+ "pid_decoder": {
+ "anyOf": [
+ {
+ "$ref": "#/components/schemas/PiDDecoderField"
+ },
+ {
+ "type": "null"
+ }
+ ],
+ "default": null,
+ "description": "PiD FLUX decoder checkpoint.",
+ "field_kind": "input",
+ "input": "connection",
+ "orig_required": true,
+ "title": "PiD Decoder"
+ },
+ "num_inference_steps": {
+ "default": 4,
+ "description": "Number of PiD distill steps. The released checkpoints are trained for 4.",
+ "field_kind": "input",
+ "input": "any",
+ "maximum": 8,
+ "minimum": 1,
+ "orig_default": 4,
+ "orig_required": false,
+ "title": "Num Inference Steps",
+ "type": "integer"
+ },
+ "seed": {
+ "default": 0,
+ "description": "Seed for the PiD decoder's noise.",
+ "field_kind": "input",
+ "input": "any",
+ "orig_default": 0,
+ "orig_required": false,
+ "title": "Seed",
+ "type": "integer"
+ },
+ "type": {
+ "const": "flux_pid_decode",
+ "default": "flux_pid_decode",
+ "field_kind": "node_attribute",
+ "title": "type",
+ "type": "string"
+ }
+ },
+ "required": ["type", "id"],
+ "tags": ["latents", "image", "pid", "flux", "upscale"],
+ "title": "Latents to Image - FLUX + PiD (4x SR)",
+ "type": "object",
+ "version": "1.0.0",
+ "output": {
+ "$ref": "#/components/schemas/ImageOutput"
+ }
+ },
+ "FluxReduxConditioningField": {
+ "description": "A FLUX Redux conditioning tensor primitive value",
+ "properties": {
+ "conditioning": {
+ "$ref": "#/components/schemas/TensorField",
+ "description": "The Redux image conditioning tensor."
+ },
+ "mask": {
+ "anyOf": [
+ {
+ "$ref": "#/components/schemas/TensorField"
+ },
+ {
+ "type": "null"
+ }
+ ],
+ "default": null,
+ "description": "The mask associated with this conditioning tensor. Excluded regions should be set to False, included regions should be set to True."
+ }
+ },
+ "required": ["conditioning"],
+ "title": "FluxReduxConditioningField",
+ "type": "object"
+ },
+ "FluxReduxInvocation": {
+ "category": "conditioning",
+ "class": "invocation",
+ "classification": "beta",
+ "description": "Runs a FLUX Redux model to generate a conditioning tensor.",
+ "node_pack": "invokeai",
+ "properties": {
"id": {
"description": "The id of this instance of an invocation. Must be unique among all instances of invocations.",
"field_kind": "node_attribute",
@@ -28522,6 +28997,245 @@
"$ref": "#/components/schemas/ImageCollectionOutput"
}
},
+ "Gemma2EncoderField": {
+ "description": "Field for the Gemma-2 text encoder used by PiD decoders.",
+ "properties": {
+ "tokenizer": {
+ "$ref": "#/components/schemas/ModelIdentifierField",
+ "description": "Info to load tokenizer submodel"
+ },
+ "text_encoder": {
+ "$ref": "#/components/schemas/ModelIdentifierField",
+ "description": "Info to load text_encoder submodel"
+ }
+ },
+ "required": ["tokenizer", "text_encoder"],
+ "title": "Gemma2EncoderField",
+ "type": "object"
+ },
+ "Gemma2EncoderLoaderInvocation": {
+ "category": "model",
+ "class": "invocation",
+ "classification": "prototype",
+ "description": "Loads a Gemma-2 causal LM directory and exposes its tokenizer + decoder\nsubmodels for use by a PiD decode node.",
+ "node_pack": "invokeai",
+ "properties": {
+ "id": {
+ "description": "The id of this instance of an invocation. Must be unique among all instances of invocations.",
+ "field_kind": "node_attribute",
+ "title": "Id",
+ "type": "string"
+ },
+ "is_intermediate": {
+ "default": false,
+ "description": "Whether or not this is an intermediate invocation.",
+ "field_kind": "node_attribute",
+ "input": "direct",
+ "orig_required": true,
+ "title": "Is Intermediate",
+ "type": "boolean",
+ "ui_hidden": false,
+ "ui_type": "IsIntermediate"
+ },
+ "use_cache": {
+ "default": true,
+ "description": "Whether or not to use the cache",
+ "field_kind": "node_attribute",
+ "title": "Use Cache",
+ "type": "boolean"
+ },
+ "gemma2_model": {
+ "anyOf": [
+ {
+ "$ref": "#/components/schemas/ModelIdentifierField"
+ },
+ {
+ "type": "null"
+ }
+ ],
+ "default": null,
+ "description": "Gemma-2 model used to encode captions for PiD decoders.",
+ "field_kind": "input",
+ "input": "any",
+ "orig_required": true,
+ "title": "Gemma-2",
+ "ui_model_type": ["gemma2_encoder"]
+ },
+ "type": {
+ "const": "gemma2_encoder_loader",
+ "default": "gemma2_encoder_loader",
+ "field_kind": "node_attribute",
+ "title": "type",
+ "type": "string"
+ }
+ },
+ "required": ["type", "id"],
+ "tags": ["model", "gemma2", "pid"],
+ "title": "Gemma-2 Encoder - PiD",
+ "type": "object",
+ "version": "1.0.0",
+ "output": {
+ "$ref": "#/components/schemas/Gemma2EncoderOutput"
+ }
+ },
+ "Gemma2EncoderOutput": {
+ "class": "output",
+ "properties": {
+ "gemma2_encoder": {
+ "$ref": "#/components/schemas/Gemma2EncoderField",
+ "description": "Gemma-2 text encoder used by PiD decoders",
+ "field_kind": "output",
+ "title": "Gemma-2 Encoder",
+ "ui_hidden": false
+ },
+ "type": {
+ "const": "gemma2_encoder_output",
+ "default": "gemma2_encoder_output",
+ "field_kind": "node_attribute",
+ "title": "type",
+ "type": "string"
+ }
+ },
+ "required": ["output_meta", "gemma2_encoder", "type", "type"],
+ "title": "Gemma2EncoderOutput",
+ "type": "object"
+ },
+ "Gemma2Encoder_Gemma2Encoder_Config": {
+ "properties": {
+ "key": {
+ "type": "string",
+ "title": "Key",
+ "description": "A unique key for this model."
+ },
+ "hash": {
+ "type": "string",
+ "title": "Hash",
+ "description": "The hash of the model file(s)."
+ },
+ "path": {
+ "type": "string",
+ "title": "Path",
+ "description": "Path to the model on the filesystem. Relative paths are relative to the Invoke root directory."
+ },
+ "file_size": {
+ "type": "integer",
+ "title": "File Size",
+ "description": "The size of the model in bytes."
+ },
+ "name": {
+ "type": "string",
+ "title": "Name",
+ "description": "Name of the model."
+ },
+ "description": {
+ "anyOf": [
+ {
+ "type": "string"
+ },
+ {
+ "type": "null"
+ }
+ ],
+ "title": "Description",
+ "description": "Model description"
+ },
+ "source": {
+ "type": "string",
+ "title": "Source",
+ "description": "The original source of the model (path, URL or repo_id)."
+ },
+ "source_type": {
+ "$ref": "#/components/schemas/ModelSourceType",
+ "description": "The type of source"
+ },
+ "source_api_response": {
+ "anyOf": [
+ {
+ "type": "string"
+ },
+ {
+ "type": "null"
+ }
+ ],
+ "title": "Source Api Response",
+ "description": "The original API response from the source, as stringified JSON."
+ },
+ "source_url": {
+ "anyOf": [
+ {
+ "type": "string"
+ },
+ {
+ "type": "null"
+ }
+ ],
+ "title": "Source Url",
+ "description": "Optional URL for the model (e.g. download page or model page)."
+ },
+ "cover_image": {
+ "anyOf": [
+ {
+ "type": "string"
+ },
+ {
+ "type": "null"
+ }
+ ],
+ "title": "Cover Image",
+ "description": "Url for image to preview model"
+ },
+ "base": {
+ "type": "string",
+ "const": "any",
+ "title": "Base",
+ "default": "any"
+ },
+ "type": {
+ "type": "string",
+ "const": "gemma2_encoder",
+ "title": "Type",
+ "default": "gemma2_encoder"
+ },
+ "format": {
+ "type": "string",
+ "const": "gemma2_encoder",
+ "title": "Format",
+ "default": "gemma2_encoder"
+ },
+ "cpu_only": {
+ "anyOf": [
+ {
+ "type": "boolean"
+ },
+ {
+ "type": "null"
+ }
+ ],
+ "title": "Cpu Only",
+ "description": "Whether this model should run on CPU only"
+ }
+ },
+ "type": "object",
+ "required": [
+ "key",
+ "hash",
+ "path",
+ "file_size",
+ "name",
+ "description",
+ "source",
+ "source_type",
+ "source_api_response",
+ "source_url",
+ "cover_image",
+ "base",
+ "type",
+ "format",
+ "cpu_only"
+ ],
+ "title": "Gemma2Encoder_Gemma2Encoder_Config",
+ "description": "Standalone Gemma-2 causal LM directory used as a text encoder by PiD.\n\nExpected directory layout (HuggingFace `from_pretrained`-compatible)::\n\n /\n config.json # architectures: [\"Gemma2ForCausalLM\"]\n tokenizer.json\n tokenizer_config.json\n model-*.safetensors # or model.safetensors / *.bin"
+ },
"GeneratePasswordResponse": {
"properties": {
"password": {
@@ -28896,6 +29610,9 @@
{
"$ref": "#/components/schemas/Flux2KleinTextEncoderInvocation"
},
+ {
+ "$ref": "#/components/schemas/Flux2PiDDecodeInvocation"
+ },
{
"$ref": "#/components/schemas/Flux2VaeDecodeInvocation"
},
@@ -28932,6 +29649,9 @@
{
"$ref": "#/components/schemas/FluxModelLoaderInvocation"
},
+ {
+ "$ref": "#/components/schemas/FluxPiDDecodeInvocation"
+ },
{
"$ref": "#/components/schemas/FluxReduxInvocation"
},
@@ -28950,6 +29670,9 @@
{
"$ref": "#/components/schemas/GeminiImageGenerationInvocation"
},
+ {
+ "$ref": "#/components/schemas/Gemma2EncoderLoaderInvocation"
+ },
{
"$ref": "#/components/schemas/GetMaskBoundingBoxInvocation"
},
@@ -29250,6 +29973,12 @@
{
"$ref": "#/components/schemas/PasteImageIntoBoundingBoxInvocation"
},
+ {
+ "$ref": "#/components/schemas/PiDDecoderLoaderInvocation"
+ },
+ {
+ "$ref": "#/components/schemas/PiDUpscaleInvocation"
+ },
{
"$ref": "#/components/schemas/PiDiNetEdgeDetectionInvocation"
},
@@ -29277,6 +30006,9 @@
{
"$ref": "#/components/schemas/QwenImageModelLoaderInvocation"
},
+ {
+ "$ref": "#/components/schemas/QwenImagePiDDecodeInvocation"
+ },
{
"$ref": "#/components/schemas/QwenImageTextEncoderInvocation"
},
@@ -29313,6 +30045,9 @@
{
"$ref": "#/components/schemas/SD3LatentsToImageInvocation"
},
+ {
+ "$ref": "#/components/schemas/SD3PiDDecodeInvocation"
+ },
{
"$ref": "#/components/schemas/SDXLCompelPromptInvocation"
},
@@ -29325,6 +30060,9 @@
{
"$ref": "#/components/schemas/SDXLModelLoaderInvocation"
},
+ {
+ "$ref": "#/components/schemas/SDXLPiDDecodeInvocation"
+ },
{
"$ref": "#/components/schemas/SDXLRefinerCompelPromptInvocation"
},
@@ -29439,6 +30177,9 @@
{
"$ref": "#/components/schemas/ZImageModelLoaderInvocation"
},
+ {
+ "$ref": "#/components/schemas/ZImagePiDDecodeInvocation"
+ },
{
"$ref": "#/components/schemas/ZImageSeedVarianceEnhancerInvocation"
},
@@ -29604,6 +30345,9 @@
{
"$ref": "#/components/schemas/FluxReduxOutput"
},
+ {
+ "$ref": "#/components/schemas/Gemma2EncoderOutput"
+ },
{
"$ref": "#/components/schemas/GradientMaskOutput"
},
@@ -29697,6 +30441,9 @@
{
"$ref": "#/components/schemas/PairTileImageOutput"
},
+ {
+ "$ref": "#/components/schemas/PiDDecoderOutput"
+ },
{
"$ref": "#/components/schemas/PromptTemplateOutput"
},
@@ -36479,6 +37226,9 @@
{
"$ref": "#/components/schemas/Flux2KleinTextEncoderInvocation"
},
+ {
+ "$ref": "#/components/schemas/Flux2PiDDecodeInvocation"
+ },
{
"$ref": "#/components/schemas/Flux2VaeDecodeInvocation"
},
@@ -36515,6 +37265,9 @@
{
"$ref": "#/components/schemas/FluxModelLoaderInvocation"
},
+ {
+ "$ref": "#/components/schemas/FluxPiDDecodeInvocation"
+ },
{
"$ref": "#/components/schemas/FluxReduxInvocation"
},
@@ -36533,6 +37286,9 @@
{
"$ref": "#/components/schemas/GeminiImageGenerationInvocation"
},
+ {
+ "$ref": "#/components/schemas/Gemma2EncoderLoaderInvocation"
+ },
{
"$ref": "#/components/schemas/GetMaskBoundingBoxInvocation"
},
@@ -36833,6 +37589,12 @@
{
"$ref": "#/components/schemas/PasteImageIntoBoundingBoxInvocation"
},
+ {
+ "$ref": "#/components/schemas/PiDDecoderLoaderInvocation"
+ },
+ {
+ "$ref": "#/components/schemas/PiDUpscaleInvocation"
+ },
{
"$ref": "#/components/schemas/PiDiNetEdgeDetectionInvocation"
},
@@ -36860,6 +37622,9 @@
{
"$ref": "#/components/schemas/QwenImageModelLoaderInvocation"
},
+ {
+ "$ref": "#/components/schemas/QwenImagePiDDecodeInvocation"
+ },
{
"$ref": "#/components/schemas/QwenImageTextEncoderInvocation"
},
@@ -36896,6 +37661,9 @@
{
"$ref": "#/components/schemas/SD3LatentsToImageInvocation"
},
+ {
+ "$ref": "#/components/schemas/SD3PiDDecodeInvocation"
+ },
{
"$ref": "#/components/schemas/SDXLCompelPromptInvocation"
},
@@ -36908,6 +37676,9 @@
{
"$ref": "#/components/schemas/SDXLModelLoaderInvocation"
},
+ {
+ "$ref": "#/components/schemas/SDXLPiDDecodeInvocation"
+ },
{
"$ref": "#/components/schemas/SDXLRefinerCompelPromptInvocation"
},
@@ -37022,6 +37793,9 @@
{
"$ref": "#/components/schemas/ZImageModelLoaderInvocation"
},
+ {
+ "$ref": "#/components/schemas/ZImagePiDDecodeInvocation"
+ },
{
"$ref": "#/components/schemas/ZImageSeedVarianceEnhancerInvocation"
},
@@ -37144,6 +37918,9 @@
{
"$ref": "#/components/schemas/FluxReduxOutput"
},
+ {
+ "$ref": "#/components/schemas/Gemma2EncoderOutput"
+ },
{
"$ref": "#/components/schemas/GradientMaskOutput"
},
@@ -37237,6 +38014,9 @@
{
"$ref": "#/components/schemas/PairTileImageOutput"
},
+ {
+ "$ref": "#/components/schemas/PiDDecoderOutput"
+ },
{
"$ref": "#/components/schemas/PromptTemplateOutput"
},
@@ -37608,6 +38388,9 @@
{
"$ref": "#/components/schemas/Flux2KleinTextEncoderInvocation"
},
+ {
+ "$ref": "#/components/schemas/Flux2PiDDecodeInvocation"
+ },
{
"$ref": "#/components/schemas/Flux2VaeDecodeInvocation"
},
@@ -37644,6 +38427,9 @@
{
"$ref": "#/components/schemas/FluxModelLoaderInvocation"
},
+ {
+ "$ref": "#/components/schemas/FluxPiDDecodeInvocation"
+ },
{
"$ref": "#/components/schemas/FluxReduxInvocation"
},
@@ -37662,6 +38448,9 @@
{
"$ref": "#/components/schemas/GeminiImageGenerationInvocation"
},
+ {
+ "$ref": "#/components/schemas/Gemma2EncoderLoaderInvocation"
+ },
{
"$ref": "#/components/schemas/GetMaskBoundingBoxInvocation"
},
@@ -37962,6 +38751,12 @@
{
"$ref": "#/components/schemas/PasteImageIntoBoundingBoxInvocation"
},
+ {
+ "$ref": "#/components/schemas/PiDDecoderLoaderInvocation"
+ },
+ {
+ "$ref": "#/components/schemas/PiDUpscaleInvocation"
+ },
{
"$ref": "#/components/schemas/PiDiNetEdgeDetectionInvocation"
},
@@ -37989,6 +38784,9 @@
{
"$ref": "#/components/schemas/QwenImageModelLoaderInvocation"
},
+ {
+ "$ref": "#/components/schemas/QwenImagePiDDecodeInvocation"
+ },
{
"$ref": "#/components/schemas/QwenImageTextEncoderInvocation"
},
@@ -38025,6 +38823,9 @@
{
"$ref": "#/components/schemas/SD3LatentsToImageInvocation"
},
+ {
+ "$ref": "#/components/schemas/SD3PiDDecodeInvocation"
+ },
{
"$ref": "#/components/schemas/SDXLCompelPromptInvocation"
},
@@ -38037,6 +38838,9 @@
{
"$ref": "#/components/schemas/SDXLModelLoaderInvocation"
},
+ {
+ "$ref": "#/components/schemas/SDXLPiDDecodeInvocation"
+ },
{
"$ref": "#/components/schemas/SDXLRefinerCompelPromptInvocation"
},
@@ -38151,6 +38955,9 @@
{
"$ref": "#/components/schemas/ZImageModelLoaderInvocation"
},
+ {
+ "$ref": "#/components/schemas/ZImagePiDDecodeInvocation"
+ },
{
"$ref": "#/components/schemas/ZImageSeedVarianceEnhancerInvocation"
},
@@ -38406,6 +39213,9 @@
"flux2_klein_text_encoder": {
"$ref": "#/components/schemas/FluxConditioningOutput"
},
+ "flux2_pid_decode": {
+ "$ref": "#/components/schemas/ImageOutput"
+ },
"flux2_vae_decode": {
"$ref": "#/components/schemas/ImageOutput"
},
@@ -38445,6 +39255,9 @@
"flux_model_loader": {
"$ref": "#/components/schemas/FluxModelLoaderOutput"
},
+ "flux_pid_decode": {
+ "$ref": "#/components/schemas/ImageOutput"
+ },
"flux_redux": {
"$ref": "#/components/schemas/FluxReduxOutput"
},
@@ -38463,6 +39276,9 @@
"gemini_image_generation": {
"$ref": "#/components/schemas/ImageCollectionOutput"
},
+ "gemma2_encoder_loader": {
+ "$ref": "#/components/schemas/Gemma2EncoderOutput"
+ },
"get_image_mask_bounding_box": {
"$ref": "#/components/schemas/BoundingBoxOutput"
},
@@ -38769,6 +39585,12 @@
"pbr_maps": {
"$ref": "#/components/schemas/PBRMapsOutput"
},
+ "pid_decoder_loader": {
+ "$ref": "#/components/schemas/PiDDecoderOutput"
+ },
+ "pid_upscale": {
+ "$ref": "#/components/schemas/ImageOutput"
+ },
"pidi_edge_detection": {
"$ref": "#/components/schemas/ImageOutput"
},
@@ -38796,6 +39618,9 @@
"qwen_image_model_loader": {
"$ref": "#/components/schemas/QwenImageModelLoaderOutput"
},
+ "qwen_image_pid_decode": {
+ "$ref": "#/components/schemas/ImageOutput"
+ },
"qwen_image_text_encoder": {
"$ref": "#/components/schemas/QwenImageConditioningOutput"
},
@@ -38841,6 +39666,9 @@
"sd3_model_loader": {
"$ref": "#/components/schemas/Sd3ModelLoaderOutput"
},
+ "sd3_pid_decode": {
+ "$ref": "#/components/schemas/ImageOutput"
+ },
"sd3_text_encoder": {
"$ref": "#/components/schemas/SD3ConditioningOutput"
},
@@ -38856,6 +39684,9 @@
"sdxl_model_loader": {
"$ref": "#/components/schemas/SDXLModelLoaderOutput"
},
+ "sdxl_pid_decode": {
+ "$ref": "#/components/schemas/ImageOutput"
+ },
"sdxl_refiner_compel_prompt": {
"$ref": "#/components/schemas/ConditioningOutput"
},
@@ -38961,6 +39792,9 @@
"z_image_model_loader": {
"$ref": "#/components/schemas/ZImageModelLoaderOutput"
},
+ "z_image_pid_decode": {
+ "$ref": "#/components/schemas/ImageOutput"
+ },
"z_image_seed_variance_enhancer": {
"$ref": "#/components/schemas/ZImageConditioningOutput"
},
@@ -39037,6 +39871,7 @@
"flux2_klein_lora_loader",
"flux2_klein_model_loader",
"flux2_klein_text_encoder",
+ "flux2_pid_decode",
"flux2_vae_decode",
"flux2_vae_encode",
"flux_control_lora_loader",
@@ -39050,12 +39885,14 @@
"flux_lora_collection_loader",
"flux_lora_loader",
"flux_model_loader",
+ "flux_pid_decode",
"flux_redux",
"flux_text_encoder",
"flux_vae_decode",
"flux_vae_encode",
"freeu",
"gemini_image_generation",
+ "gemma2_encoder_loader",
"get_image_mask_bounding_box",
"grounding_dino",
"hed_edge_detection",
@@ -39158,6 +39995,8 @@
"pair_tile_image",
"paste_image_into_bounding_box",
"pbr_maps",
+ "pid_decoder_loader",
+ "pid_upscale",
"pidi_edge_detection",
"prompt_from_file",
"prompt_template",
@@ -39167,6 +40006,7 @@
"qwen_image_lora_collection_loader",
"qwen_image_lora_loader",
"qwen_image_model_loader",
+ "qwen_image_pid_decode",
"qwen_image_text_encoder",
"rand_float",
"rand_int",
@@ -39182,11 +40022,13 @@
"sd3_i2l",
"sd3_l2i",
"sd3_model_loader",
+ "sd3_pid_decode",
"sd3_text_encoder",
"sdxl_compel_prompt",
"sdxl_lora_collection_loader",
"sdxl_lora_loader",
"sdxl_model_loader",
+ "sdxl_pid_decode",
"sdxl_refiner_compel_prompt",
"sdxl_refiner_model_loader",
"seamless",
@@ -39222,6 +40064,7 @@
"z_image_lora_collection_loader",
"z_image_lora_loader",
"z_image_model_loader",
+ "z_image_pid_decode",
"z_image_seed_variance_enhancer",
"z_image_text_encoder"
]
@@ -39505,6 +40348,9 @@
{
"$ref": "#/components/schemas/Flux2KleinTextEncoderInvocation"
},
+ {
+ "$ref": "#/components/schemas/Flux2PiDDecodeInvocation"
+ },
{
"$ref": "#/components/schemas/Flux2VaeDecodeInvocation"
},
@@ -39541,6 +40387,9 @@
{
"$ref": "#/components/schemas/FluxModelLoaderInvocation"
},
+ {
+ "$ref": "#/components/schemas/FluxPiDDecodeInvocation"
+ },
{
"$ref": "#/components/schemas/FluxReduxInvocation"
},
@@ -39559,6 +40408,9 @@
{
"$ref": "#/components/schemas/GeminiImageGenerationInvocation"
},
+ {
+ "$ref": "#/components/schemas/Gemma2EncoderLoaderInvocation"
+ },
{
"$ref": "#/components/schemas/GetMaskBoundingBoxInvocation"
},
@@ -39859,6 +40711,12 @@
{
"$ref": "#/components/schemas/PasteImageIntoBoundingBoxInvocation"
},
+ {
+ "$ref": "#/components/schemas/PiDDecoderLoaderInvocation"
+ },
+ {
+ "$ref": "#/components/schemas/PiDUpscaleInvocation"
+ },
{
"$ref": "#/components/schemas/PiDiNetEdgeDetectionInvocation"
},
@@ -39886,6 +40744,9 @@
{
"$ref": "#/components/schemas/QwenImageModelLoaderInvocation"
},
+ {
+ "$ref": "#/components/schemas/QwenImagePiDDecodeInvocation"
+ },
{
"$ref": "#/components/schemas/QwenImageTextEncoderInvocation"
},
@@ -39922,6 +40783,9 @@
{
"$ref": "#/components/schemas/SD3LatentsToImageInvocation"
},
+ {
+ "$ref": "#/components/schemas/SD3PiDDecodeInvocation"
+ },
{
"$ref": "#/components/schemas/SDXLCompelPromptInvocation"
},
@@ -39934,6 +40798,9 @@
{
"$ref": "#/components/schemas/SDXLModelLoaderInvocation"
},
+ {
+ "$ref": "#/components/schemas/SDXLPiDDecodeInvocation"
+ },
{
"$ref": "#/components/schemas/SDXLRefinerCompelPromptInvocation"
},
@@ -40048,6 +40915,9 @@
{
"$ref": "#/components/schemas/ZImageModelLoaderInvocation"
},
+ {
+ "$ref": "#/components/schemas/ZImagePiDDecodeInvocation"
+ },
{
"$ref": "#/components/schemas/ZImageSeedVarianceEnhancerInvocation"
},
@@ -40392,6 +41262,9 @@
{
"$ref": "#/components/schemas/Flux2KleinTextEncoderInvocation"
},
+ {
+ "$ref": "#/components/schemas/Flux2PiDDecodeInvocation"
+ },
{
"$ref": "#/components/schemas/Flux2VaeDecodeInvocation"
},
@@ -40428,6 +41301,9 @@
{
"$ref": "#/components/schemas/FluxModelLoaderInvocation"
},
+ {
+ "$ref": "#/components/schemas/FluxPiDDecodeInvocation"
+ },
{
"$ref": "#/components/schemas/FluxReduxInvocation"
},
@@ -40446,6 +41322,9 @@
{
"$ref": "#/components/schemas/GeminiImageGenerationInvocation"
},
+ {
+ "$ref": "#/components/schemas/Gemma2EncoderLoaderInvocation"
+ },
{
"$ref": "#/components/schemas/GetMaskBoundingBoxInvocation"
},
@@ -40746,6 +41625,12 @@
{
"$ref": "#/components/schemas/PasteImageIntoBoundingBoxInvocation"
},
+ {
+ "$ref": "#/components/schemas/PiDDecoderLoaderInvocation"
+ },
+ {
+ "$ref": "#/components/schemas/PiDUpscaleInvocation"
+ },
{
"$ref": "#/components/schemas/PiDiNetEdgeDetectionInvocation"
},
@@ -40773,6 +41658,9 @@
{
"$ref": "#/components/schemas/QwenImageModelLoaderInvocation"
},
+ {
+ "$ref": "#/components/schemas/QwenImagePiDDecodeInvocation"
+ },
{
"$ref": "#/components/schemas/QwenImageTextEncoderInvocation"
},
@@ -40809,6 +41697,9 @@
{
"$ref": "#/components/schemas/SD3LatentsToImageInvocation"
},
+ {
+ "$ref": "#/components/schemas/SD3PiDDecodeInvocation"
+ },
{
"$ref": "#/components/schemas/SDXLCompelPromptInvocation"
},
@@ -40821,6 +41712,9 @@
{
"$ref": "#/components/schemas/SDXLModelLoaderInvocation"
},
+ {
+ "$ref": "#/components/schemas/SDXLPiDDecodeInvocation"
+ },
{
"$ref": "#/components/schemas/SDXLRefinerCompelPromptInvocation"
},
@@ -40935,6 +41829,9 @@
{
"$ref": "#/components/schemas/ZImageModelLoaderInvocation"
},
+ {
+ "$ref": "#/components/schemas/ZImagePiDDecodeInvocation"
+ },
{
"$ref": "#/components/schemas/ZImageSeedVarianceEnhancerInvocation"
},
@@ -54788,6 +55685,7 @@
"t5_encoder",
"qwen3_encoder",
"qwen_vl_encoder",
+ "gemma2_encoder",
"bnb_quantized_int8b",
"bnb_quantized_nf4b",
"gguf_quantized",
@@ -55133,6 +56031,21 @@
{
"$ref": "#/components/schemas/VAE_Diffusers_Flux2_Config"
},
+ {
+ "$ref": "#/components/schemas/PiDDecoder_Checkpoint_FLUX_Config"
+ },
+ {
+ "$ref": "#/components/schemas/PiDDecoder_Checkpoint_Flux2_Config"
+ },
+ {
+ "$ref": "#/components/schemas/PiDDecoder_Checkpoint_SD3_Config"
+ },
+ {
+ "$ref": "#/components/schemas/PiDDecoder_Checkpoint_SDXL_Config"
+ },
+ {
+ "$ref": "#/components/schemas/PiDDecoder_Checkpoint_QwenImage_Config"
+ },
{
"$ref": "#/components/schemas/ControlNet_Checkpoint_SD1_Config"
},
@@ -55226,6 +56139,9 @@
{
"$ref": "#/components/schemas/Qwen3Encoder_GGUF_Config"
},
+ {
+ "$ref": "#/components/schemas/Gemma2Encoder_Gemma2Encoder_Config"
+ },
{
"$ref": "#/components/schemas/QwenVLEncoder_Diffusers_Config"
},
@@ -55705,6 +56621,21 @@
{
"$ref": "#/components/schemas/VAE_Diffusers_Flux2_Config"
},
+ {
+ "$ref": "#/components/schemas/PiDDecoder_Checkpoint_FLUX_Config"
+ },
+ {
+ "$ref": "#/components/schemas/PiDDecoder_Checkpoint_Flux2_Config"
+ },
+ {
+ "$ref": "#/components/schemas/PiDDecoder_Checkpoint_SD3_Config"
+ },
+ {
+ "$ref": "#/components/schemas/PiDDecoder_Checkpoint_SDXL_Config"
+ },
+ {
+ "$ref": "#/components/schemas/PiDDecoder_Checkpoint_QwenImage_Config"
+ },
{
"$ref": "#/components/schemas/ControlNet_Checkpoint_SD1_Config"
},
@@ -55798,6 +56729,9 @@
{
"$ref": "#/components/schemas/Qwen3Encoder_GGUF_Config"
},
+ {
+ "$ref": "#/components/schemas/Gemma2Encoder_Gemma2Encoder_Config"
+ },
{
"$ref": "#/components/schemas/QwenVLEncoder_Diffusers_Config"
},
@@ -56162,6 +57096,21 @@
{
"$ref": "#/components/schemas/VAE_Diffusers_Flux2_Config"
},
+ {
+ "$ref": "#/components/schemas/PiDDecoder_Checkpoint_FLUX_Config"
+ },
+ {
+ "$ref": "#/components/schemas/PiDDecoder_Checkpoint_Flux2_Config"
+ },
+ {
+ "$ref": "#/components/schemas/PiDDecoder_Checkpoint_SD3_Config"
+ },
+ {
+ "$ref": "#/components/schemas/PiDDecoder_Checkpoint_SDXL_Config"
+ },
+ {
+ "$ref": "#/components/schemas/PiDDecoder_Checkpoint_QwenImage_Config"
+ },
{
"$ref": "#/components/schemas/ControlNet_Checkpoint_SD1_Config"
},
@@ -56255,6 +57204,9 @@
{
"$ref": "#/components/schemas/Qwen3Encoder_GGUF_Config"
},
+ {
+ "$ref": "#/components/schemas/Gemma2Encoder_Gemma2Encoder_Config"
+ },
{
"$ref": "#/components/schemas/QwenVLEncoder_Diffusers_Config"
},
@@ -56469,6 +57421,21 @@
{
"$ref": "#/components/schemas/VAE_Diffusers_Flux2_Config"
},
+ {
+ "$ref": "#/components/schemas/PiDDecoder_Checkpoint_FLUX_Config"
+ },
+ {
+ "$ref": "#/components/schemas/PiDDecoder_Checkpoint_Flux2_Config"
+ },
+ {
+ "$ref": "#/components/schemas/PiDDecoder_Checkpoint_SD3_Config"
+ },
+ {
+ "$ref": "#/components/schemas/PiDDecoder_Checkpoint_SDXL_Config"
+ },
+ {
+ "$ref": "#/components/schemas/PiDDecoder_Checkpoint_QwenImage_Config"
+ },
{
"$ref": "#/components/schemas/ControlNet_Checkpoint_SD1_Config"
},
@@ -56562,6 +57529,9 @@
{
"$ref": "#/components/schemas/Qwen3Encoder_GGUF_Config"
},
+ {
+ "$ref": "#/components/schemas/Gemma2Encoder_Gemma2Encoder_Config"
+ },
{
"$ref": "#/components/schemas/QwenVLEncoder_Diffusers_Config"
},
@@ -56962,6 +57932,9 @@
{
"$ref": "#/components/schemas/Qwen3VariantType"
},
+ {
+ "$ref": "#/components/schemas/PiDDecoderVariantType"
+ },
{
"type": "null"
}
@@ -57101,12 +58074,14 @@
"t5_encoder",
"qwen3_encoder",
"qwen_vl_encoder",
+ "gemma2_encoder",
"spandrel_image_to_image",
"siglip",
"flux_redux",
"llava_onevision",
"text_llm",
"external_image_generator",
+ "pid_decoder",
"unknown"
],
"title": "ModelType",
@@ -57225,6 +58200,21 @@
{
"$ref": "#/components/schemas/VAE_Diffusers_Flux2_Config"
},
+ {
+ "$ref": "#/components/schemas/PiDDecoder_Checkpoint_FLUX_Config"
+ },
+ {
+ "$ref": "#/components/schemas/PiDDecoder_Checkpoint_Flux2_Config"
+ },
+ {
+ "$ref": "#/components/schemas/PiDDecoder_Checkpoint_SD3_Config"
+ },
+ {
+ "$ref": "#/components/schemas/PiDDecoder_Checkpoint_SDXL_Config"
+ },
+ {
+ "$ref": "#/components/schemas/PiDDecoder_Checkpoint_QwenImage_Config"
+ },
{
"$ref": "#/components/schemas/ControlNet_Checkpoint_SD1_Config"
},
@@ -57318,6 +58308,9 @@
{
"$ref": "#/components/schemas/Qwen3Encoder_GGUF_Config"
},
+ {
+ "$ref": "#/components/schemas/Gemma2Encoder_Gemma2Encoder_Config"
+ },
{
"$ref": "#/components/schemas/QwenVLEncoder_Diffusers_Config"
},
@@ -58845,174 +59838,1159 @@
"$ref": "#/components/schemas/ImageOutput"
}
},
- "PiDiNetEdgeDetectionInvocation": {
- "category": "controlnet_preprocessors",
- "class": "invocation",
- "classification": "stable",
- "description": "Generates an edge map using PiDiNet.",
- "node_pack": "invokeai",
- "properties": {
- "board": {
- "anyOf": [
- {
- "$ref": "#/components/schemas/BoardField"
- },
- {
- "type": "null"
- }
- ],
- "default": null,
- "description": "The board to save the image to",
- "field_kind": "internal",
- "input": "direct",
- "orig_required": false,
- "ui_hidden": false
- },
- "metadata": {
- "anyOf": [
- {
- "$ref": "#/components/schemas/MetadataField"
- },
- {
- "type": "null"
- }
- ],
- "default": null,
- "description": "Optional metadata to be saved with the image",
- "field_kind": "internal",
- "input": "connection",
- "orig_required": false,
- "ui_hidden": false
- },
- "id": {
- "description": "The id of this instance of an invocation. Must be unique among all instances of invocations.",
- "field_kind": "node_attribute",
- "title": "Id",
- "type": "string"
- },
- "is_intermediate": {
- "default": false,
- "description": "Whether or not this is an intermediate invocation.",
- "field_kind": "node_attribute",
- "input": "direct",
- "orig_required": true,
- "title": "Is Intermediate",
- "type": "boolean",
- "ui_hidden": false,
- "ui_type": "IsIntermediate"
- },
- "use_cache": {
- "default": true,
- "description": "Whether or not to use the cache",
- "field_kind": "node_attribute",
- "title": "Use Cache",
- "type": "boolean"
- },
- "image": {
- "anyOf": [
- {
- "$ref": "#/components/schemas/ImageField"
- },
- {
- "type": "null"
- }
- ],
- "default": null,
- "description": "The image to process",
- "field_kind": "input",
- "input": "any",
- "orig_required": true
- },
- "quantize_edges": {
- "default": false,
- "description": "Whether or not to use safe mode",
- "field_kind": "input",
- "input": "any",
- "orig_default": false,
- "orig_required": false,
- "title": "Quantize Edges",
- "type": "boolean"
- },
- "scribble": {
- "default": false,
- "description": "Whether or not to use scribble mode",
- "field_kind": "input",
- "input": "any",
- "orig_default": false,
- "orig_required": false,
- "title": "Scribble",
- "type": "boolean"
- },
- "type": {
- "const": "pidi_edge_detection",
- "default": "pidi_edge_detection",
- "field_kind": "node_attribute",
- "title": "type",
- "type": "string"
- }
- },
- "required": ["type", "id"],
- "tags": ["controlnet", "edge"],
- "title": "PiDiNet Edge Detection",
- "type": "object",
- "version": "1.0.0",
- "output": {
- "$ref": "#/components/schemas/ImageOutput"
- }
- },
- "PresetData": {
+ "PiDDecoderField": {
+ "description": "Field for a PiD (Pixel Diffusion Decoder) checkpoint.",
"properties": {
- "positive_prompt": {
- "type": "string",
- "title": "Positive Prompt",
- "description": "Positive prompt"
- },
- "negative_prompt": {
- "type": "string",
- "title": "Negative Prompt",
- "description": "Negative prompt"
- }
- },
- "additionalProperties": false,
- "type": "object",
- "required": ["positive_prompt", "negative_prompt"],
- "title": "PresetData"
- },
- "PresetType": {
- "type": "string",
- "enum": ["user", "default"],
- "title": "PresetType"
- },
- "ProgressImage": {
- "description": "The progress image sent intermittently during processing",
- "properties": {
- "width": {
- "description": "The effective width of the image in pixels",
- "minimum": 1,
- "title": "Width",
- "type": "integer"
- },
- "height": {
- "description": "The effective height of the image in pixels",
- "minimum": 1,
- "title": "Height",
- "type": "integer"
- },
- "dataURL": {
- "description": "The image data as a b64 data URL",
- "title": "Dataurl",
- "type": "string"
+ "decoder": {
+ "$ref": "#/components/schemas/ModelIdentifierField",
+ "description": "Info to load PiD decoder checkpoint"
}
},
- "required": ["width", "height", "dataURL"],
- "title": "ProgressImage",
+ "required": ["decoder"],
+ "title": "PiDDecoderField",
"type": "object"
},
- "PromptTemplateInvocation": {
- "category": "prompt",
+ "PiDDecoderLoaderInvocation": {
+ "category": "model",
"class": "invocation",
- "classification": "stable",
- "description": "Applies a Style Preset template to positive and negative prompts.\n\nSelect a Style Preset and provide positive/negative prompts. The node replaces\n{prompt} placeholders in the template with your input prompts.",
+ "classification": "prototype",
+ "description": "Loads a PiD decoder checkpoint, outputting a PiDDecoderField for use\nby the per-backbone PiD decode nodes.",
+ "node_pack": "invokeai",
+ "properties": {
+ "id": {
+ "description": "The id of this instance of an invocation. Must be unique among all instances of invocations.",
+ "field_kind": "node_attribute",
+ "title": "Id",
+ "type": "string"
+ },
+ "is_intermediate": {
+ "default": false,
+ "description": "Whether or not this is an intermediate invocation.",
+ "field_kind": "node_attribute",
+ "input": "direct",
+ "orig_required": true,
+ "title": "Is Intermediate",
+ "type": "boolean",
+ "ui_hidden": false,
+ "ui_type": "IsIntermediate"
+ },
+ "use_cache": {
+ "default": true,
+ "description": "Whether or not to use the cache",
+ "field_kind": "node_attribute",
+ "title": "Use Cache",
+ "type": "boolean"
+ },
+ "pid_decoder_model": {
+ "anyOf": [
+ {
+ "$ref": "#/components/schemas/ModelIdentifierField"
+ },
+ {
+ "type": "null"
+ }
+ ],
+ "default": null,
+ "description": "PiD decoder checkpoint matching the upstream backbone.",
+ "field_kind": "input",
+ "input": "any",
+ "orig_required": true,
+ "title": "PiD Decoder",
+ "ui_model_type": ["pid_decoder"]
+ },
+ "type": {
+ "const": "pid_decoder_loader",
+ "default": "pid_decoder_loader",
+ "field_kind": "node_attribute",
+ "title": "type",
+ "type": "string"
+ }
+ },
+ "required": ["type", "id"],
+ "tags": ["model", "pid", "decoder"],
+ "title": "PiD Decoder - FLUX / FLUX.2 / SD3",
+ "type": "object",
+ "version": "1.0.0",
+ "output": {
+ "$ref": "#/components/schemas/PiDDecoderOutput"
+ }
+ },
+ "PiDDecoderOutput": {
+ "class": "output",
+ "properties": {
+ "pid_decoder": {
+ "$ref": "#/components/schemas/PiDDecoderField",
+ "description": "PiD (Pixel Diffusion Decoder) checkpoint",
+ "field_kind": "output",
+ "title": "PiD Decoder",
+ "ui_hidden": false
+ },
+ "type": {
+ "const": "pid_decoder_output",
+ "default": "pid_decoder_output",
+ "field_kind": "node_attribute",
+ "title": "type",
+ "type": "string"
+ }
+ },
+ "required": ["output_meta", "pid_decoder", "type", "type"],
+ "title": "PiDDecoderOutput",
+ "type": "object"
+ },
+ "PiDDecoderVariantType": {
+ "type": "string",
+ "enum": ["res2k_sr4x", "res2kto4k_sr4x"],
+ "title": "PiDDecoderVariantType",
+ "description": "PiD (Pixel Diffusion Decoder) variants distributed by NVIDIA.\n\nEach backbone (FLUX.1, FLUX.2, SD3) ships in two resolution presets that\ndiffer only in target output resolution; the underlying network is the\nsame. NVIDIA's checkpoint filenames encode this as e.g.\n`PiD_res2k_sr4x_official_flux_distill_4step` vs\n`PiD_res2kto4k_sr4x_official_flux_distill_4step`."
+ },
+ "PiDDecoder_Checkpoint_FLUX_Config": {
+ "properties": {
+ "key": {
+ "type": "string",
+ "title": "Key",
+ "description": "A unique key for this model."
+ },
+ "hash": {
+ "type": "string",
+ "title": "Hash",
+ "description": "The hash of the model file(s)."
+ },
+ "path": {
+ "type": "string",
+ "title": "Path",
+ "description": "Path to the model on the filesystem. Relative paths are relative to the Invoke root directory."
+ },
+ "file_size": {
+ "type": "integer",
+ "title": "File Size",
+ "description": "The size of the model in bytes."
+ },
+ "name": {
+ "type": "string",
+ "title": "Name",
+ "description": "Name of the model."
+ },
+ "description": {
+ "anyOf": [
+ {
+ "type": "string"
+ },
+ {
+ "type": "null"
+ }
+ ],
+ "title": "Description",
+ "description": "Model description"
+ },
+ "source": {
+ "type": "string",
+ "title": "Source",
+ "description": "The original source of the model (path, URL or repo_id)."
+ },
+ "source_type": {
+ "$ref": "#/components/schemas/ModelSourceType",
+ "description": "The type of source"
+ },
+ "source_api_response": {
+ "anyOf": [
+ {
+ "type": "string"
+ },
+ {
+ "type": "null"
+ }
+ ],
+ "title": "Source Api Response",
+ "description": "The original API response from the source, as stringified JSON."
+ },
+ "source_url": {
+ "anyOf": [
+ {
+ "type": "string"
+ },
+ {
+ "type": "null"
+ }
+ ],
+ "title": "Source Url",
+ "description": "Optional URL for the model (e.g. download page or model page)."
+ },
+ "cover_image": {
+ "anyOf": [
+ {
+ "type": "string"
+ },
+ {
+ "type": "null"
+ }
+ ],
+ "title": "Cover Image",
+ "description": "Url for image to preview model"
+ },
+ "config_path": {
+ "anyOf": [
+ {
+ "type": "string"
+ },
+ {
+ "type": "null"
+ }
+ ],
+ "title": "Config Path",
+ "description": "Path to the config for this model, if any."
+ },
+ "type": {
+ "type": "string",
+ "const": "pid_decoder",
+ "title": "Type",
+ "default": "pid_decoder"
+ },
+ "format": {
+ "type": "string",
+ "const": "checkpoint",
+ "title": "Format",
+ "default": "checkpoint"
+ },
+ "base": {
+ "type": "string",
+ "const": "flux",
+ "title": "Base",
+ "default": "flux"
+ },
+ "variant": {
+ "$ref": "#/components/schemas/PiDDecoderVariantType",
+ "description": "Resolution preset of the PiD decoder checkpoint."
+ }
+ },
+ "type": "object",
+ "required": [
+ "key",
+ "hash",
+ "path",
+ "file_size",
+ "name",
+ "description",
+ "source",
+ "source_type",
+ "source_api_response",
+ "source_url",
+ "cover_image",
+ "config_path",
+ "type",
+ "format",
+ "base",
+ "variant"
+ ],
+ "title": "PiDDecoder_Checkpoint_FLUX_Config",
+ "description": "PiD decoder for the FLUX.1 backbone (16-channel latent)."
+ },
+ "PiDDecoder_Checkpoint_Flux2_Config": {
+ "properties": {
+ "key": {
+ "type": "string",
+ "title": "Key",
+ "description": "A unique key for this model."
+ },
+ "hash": {
+ "type": "string",
+ "title": "Hash",
+ "description": "The hash of the model file(s)."
+ },
+ "path": {
+ "type": "string",
+ "title": "Path",
+ "description": "Path to the model on the filesystem. Relative paths are relative to the Invoke root directory."
+ },
+ "file_size": {
+ "type": "integer",
+ "title": "File Size",
+ "description": "The size of the model in bytes."
+ },
+ "name": {
+ "type": "string",
+ "title": "Name",
+ "description": "Name of the model."
+ },
+ "description": {
+ "anyOf": [
+ {
+ "type": "string"
+ },
+ {
+ "type": "null"
+ }
+ ],
+ "title": "Description",
+ "description": "Model description"
+ },
+ "source": {
+ "type": "string",
+ "title": "Source",
+ "description": "The original source of the model (path, URL or repo_id)."
+ },
+ "source_type": {
+ "$ref": "#/components/schemas/ModelSourceType",
+ "description": "The type of source"
+ },
+ "source_api_response": {
+ "anyOf": [
+ {
+ "type": "string"
+ },
+ {
+ "type": "null"
+ }
+ ],
+ "title": "Source Api Response",
+ "description": "The original API response from the source, as stringified JSON."
+ },
+ "source_url": {
+ "anyOf": [
+ {
+ "type": "string"
+ },
+ {
+ "type": "null"
+ }
+ ],
+ "title": "Source Url",
+ "description": "Optional URL for the model (e.g. download page or model page)."
+ },
+ "cover_image": {
+ "anyOf": [
+ {
+ "type": "string"
+ },
+ {
+ "type": "null"
+ }
+ ],
+ "title": "Cover Image",
+ "description": "Url for image to preview model"
+ },
+ "config_path": {
+ "anyOf": [
+ {
+ "type": "string"
+ },
+ {
+ "type": "null"
+ }
+ ],
+ "title": "Config Path",
+ "description": "Path to the config for this model, if any."
+ },
+ "type": {
+ "type": "string",
+ "const": "pid_decoder",
+ "title": "Type",
+ "default": "pid_decoder"
+ },
+ "format": {
+ "type": "string",
+ "const": "checkpoint",
+ "title": "Format",
+ "default": "checkpoint"
+ },
+ "base": {
+ "type": "string",
+ "const": "flux2",
+ "title": "Base",
+ "default": "flux2"
+ },
+ "variant": {
+ "$ref": "#/components/schemas/PiDDecoderVariantType",
+ "description": "Resolution preset of the PiD decoder checkpoint."
+ }
+ },
+ "type": "object",
+ "required": [
+ "key",
+ "hash",
+ "path",
+ "file_size",
+ "name",
+ "description",
+ "source",
+ "source_type",
+ "source_api_response",
+ "source_url",
+ "cover_image",
+ "config_path",
+ "type",
+ "format",
+ "base",
+ "variant"
+ ],
+ "title": "PiDDecoder_Checkpoint_Flux2_Config",
+ "description": "PiD decoder for the FLUX.2 backbone (128-channel latent)."
+ },
+ "PiDDecoder_Checkpoint_QwenImage_Config": {
+ "properties": {
+ "key": {
+ "type": "string",
+ "title": "Key",
+ "description": "A unique key for this model."
+ },
+ "hash": {
+ "type": "string",
+ "title": "Hash",
+ "description": "The hash of the model file(s)."
+ },
+ "path": {
+ "type": "string",
+ "title": "Path",
+ "description": "Path to the model on the filesystem. Relative paths are relative to the Invoke root directory."
+ },
+ "file_size": {
+ "type": "integer",
+ "title": "File Size",
+ "description": "The size of the model in bytes."
+ },
+ "name": {
+ "type": "string",
+ "title": "Name",
+ "description": "Name of the model."
+ },
+ "description": {
+ "anyOf": [
+ {
+ "type": "string"
+ },
+ {
+ "type": "null"
+ }
+ ],
+ "title": "Description",
+ "description": "Model description"
+ },
+ "source": {
+ "type": "string",
+ "title": "Source",
+ "description": "The original source of the model (path, URL or repo_id)."
+ },
+ "source_type": {
+ "$ref": "#/components/schemas/ModelSourceType",
+ "description": "The type of source"
+ },
+ "source_api_response": {
+ "anyOf": [
+ {
+ "type": "string"
+ },
+ {
+ "type": "null"
+ }
+ ],
+ "title": "Source Api Response",
+ "description": "The original API response from the source, as stringified JSON."
+ },
+ "source_url": {
+ "anyOf": [
+ {
+ "type": "string"
+ },
+ {
+ "type": "null"
+ }
+ ],
+ "title": "Source Url",
+ "description": "Optional URL for the model (e.g. download page or model page)."
+ },
+ "cover_image": {
+ "anyOf": [
+ {
+ "type": "string"
+ },
+ {
+ "type": "null"
+ }
+ ],
+ "title": "Cover Image",
+ "description": "Url for image to preview model"
+ },
+ "config_path": {
+ "anyOf": [
+ {
+ "type": "string"
+ },
+ {
+ "type": "null"
+ }
+ ],
+ "title": "Config Path",
+ "description": "Path to the config for this model, if any."
+ },
+ "type": {
+ "type": "string",
+ "const": "pid_decoder",
+ "title": "Type",
+ "default": "pid_decoder"
+ },
+ "format": {
+ "type": "string",
+ "const": "checkpoint",
+ "title": "Format",
+ "default": "checkpoint"
+ },
+ "base": {
+ "type": "string",
+ "const": "qwen-image",
+ "title": "Base",
+ "default": "qwen-image"
+ },
+ "variant": {
+ "$ref": "#/components/schemas/PiDDecoderVariantType",
+ "description": "Resolution preset of the PiD decoder checkpoint."
+ }
+ },
+ "type": "object",
+ "required": [
+ "key",
+ "hash",
+ "path",
+ "file_size",
+ "name",
+ "description",
+ "source",
+ "source_type",
+ "source_api_response",
+ "source_url",
+ "cover_image",
+ "config_path",
+ "type",
+ "format",
+ "base",
+ "variant"
+ ],
+ "title": "PiDDecoder_Checkpoint_QwenImage_Config",
+ "description": "PiD decoder for the Qwen-Image backbone (16-channel latent).\n\nShares the 16-channel latent shape with FLUX.1 and SD3, so it relies on the same\nfilename / directory-name disambiguation (or a trusted explicit ``base`` override)\nas SD3 - see ``_validate_base``."
+ },
+ "PiDDecoder_Checkpoint_SD3_Config": {
+ "properties": {
+ "key": {
+ "type": "string",
+ "title": "Key",
+ "description": "A unique key for this model."
+ },
+ "hash": {
+ "type": "string",
+ "title": "Hash",
+ "description": "The hash of the model file(s)."
+ },
+ "path": {
+ "type": "string",
+ "title": "Path",
+ "description": "Path to the model on the filesystem. Relative paths are relative to the Invoke root directory."
+ },
+ "file_size": {
+ "type": "integer",
+ "title": "File Size",
+ "description": "The size of the model in bytes."
+ },
+ "name": {
+ "type": "string",
+ "title": "Name",
+ "description": "Name of the model."
+ },
+ "description": {
+ "anyOf": [
+ {
+ "type": "string"
+ },
+ {
+ "type": "null"
+ }
+ ],
+ "title": "Description",
+ "description": "Model description"
+ },
+ "source": {
+ "type": "string",
+ "title": "Source",
+ "description": "The original source of the model (path, URL or repo_id)."
+ },
+ "source_type": {
+ "$ref": "#/components/schemas/ModelSourceType",
+ "description": "The type of source"
+ },
+ "source_api_response": {
+ "anyOf": [
+ {
+ "type": "string"
+ },
+ {
+ "type": "null"
+ }
+ ],
+ "title": "Source Api Response",
+ "description": "The original API response from the source, as stringified JSON."
+ },
+ "source_url": {
+ "anyOf": [
+ {
+ "type": "string"
+ },
+ {
+ "type": "null"
+ }
+ ],
+ "title": "Source Url",
+ "description": "Optional URL for the model (e.g. download page or model page)."
+ },
+ "cover_image": {
+ "anyOf": [
+ {
+ "type": "string"
+ },
+ {
+ "type": "null"
+ }
+ ],
+ "title": "Cover Image",
+ "description": "Url for image to preview model"
+ },
+ "config_path": {
+ "anyOf": [
+ {
+ "type": "string"
+ },
+ {
+ "type": "null"
+ }
+ ],
+ "title": "Config Path",
+ "description": "Path to the config for this model, if any."
+ },
+ "type": {
+ "type": "string",
+ "const": "pid_decoder",
+ "title": "Type",
+ "default": "pid_decoder"
+ },
+ "format": {
+ "type": "string",
+ "const": "checkpoint",
+ "title": "Format",
+ "default": "checkpoint"
+ },
+ "base": {
+ "type": "string",
+ "const": "sd-3",
+ "title": "Base",
+ "default": "sd-3"
+ },
+ "variant": {
+ "$ref": "#/components/schemas/PiDDecoderVariantType",
+ "description": "Resolution preset of the PiD decoder checkpoint."
+ }
+ },
+ "type": "object",
+ "required": [
+ "key",
+ "hash",
+ "path",
+ "file_size",
+ "name",
+ "description",
+ "source",
+ "source_type",
+ "source_api_response",
+ "source_url",
+ "cover_image",
+ "config_path",
+ "type",
+ "format",
+ "base",
+ "variant"
+ ],
+ "title": "PiDDecoder_Checkpoint_SD3_Config",
+ "description": "PiD decoder for the Stable Diffusion 3 backbone (16-channel latent)."
+ },
+ "PiDDecoder_Checkpoint_SDXL_Config": {
+ "properties": {
+ "key": {
+ "type": "string",
+ "title": "Key",
+ "description": "A unique key for this model."
+ },
+ "hash": {
+ "type": "string",
+ "title": "Hash",
+ "description": "The hash of the model file(s)."
+ },
+ "path": {
+ "type": "string",
+ "title": "Path",
+ "description": "Path to the model on the filesystem. Relative paths are relative to the Invoke root directory."
+ },
+ "file_size": {
+ "type": "integer",
+ "title": "File Size",
+ "description": "The size of the model in bytes."
+ },
+ "name": {
+ "type": "string",
+ "title": "Name",
+ "description": "Name of the model."
+ },
+ "description": {
+ "anyOf": [
+ {
+ "type": "string"
+ },
+ {
+ "type": "null"
+ }
+ ],
+ "title": "Description",
+ "description": "Model description"
+ },
+ "source": {
+ "type": "string",
+ "title": "Source",
+ "description": "The original source of the model (path, URL or repo_id)."
+ },
+ "source_type": {
+ "$ref": "#/components/schemas/ModelSourceType",
+ "description": "The type of source"
+ },
+ "source_api_response": {
+ "anyOf": [
+ {
+ "type": "string"
+ },
+ {
+ "type": "null"
+ }
+ ],
+ "title": "Source Api Response",
+ "description": "The original API response from the source, as stringified JSON."
+ },
+ "source_url": {
+ "anyOf": [
+ {
+ "type": "string"
+ },
+ {
+ "type": "null"
+ }
+ ],
+ "title": "Source Url",
+ "description": "Optional URL for the model (e.g. download page or model page)."
+ },
+ "cover_image": {
+ "anyOf": [
+ {
+ "type": "string"
+ },
+ {
+ "type": "null"
+ }
+ ],
+ "title": "Cover Image",
+ "description": "Url for image to preview model"
+ },
+ "config_path": {
+ "anyOf": [
+ {
+ "type": "string"
+ },
+ {
+ "type": "null"
+ }
+ ],
+ "title": "Config Path",
+ "description": "Path to the config for this model, if any."
+ },
+ "type": {
+ "type": "string",
+ "const": "pid_decoder",
+ "title": "Type",
+ "default": "pid_decoder"
+ },
+ "format": {
+ "type": "string",
+ "const": "checkpoint",
+ "title": "Format",
+ "default": "checkpoint"
+ },
+ "base": {
+ "type": "string",
+ "const": "sdxl",
+ "title": "Base",
+ "default": "sdxl"
+ },
+ "variant": {
+ "$ref": "#/components/schemas/PiDDecoderVariantType",
+ "description": "Resolution preset of the PiD decoder checkpoint."
+ }
+ },
+ "type": "object",
+ "required": [
+ "key",
+ "hash",
+ "path",
+ "file_size",
+ "name",
+ "description",
+ "source",
+ "source_type",
+ "source_api_response",
+ "source_url",
+ "cover_image",
+ "config_path",
+ "type",
+ "format",
+ "base",
+ "variant"
+ ],
+ "title": "PiDDecoder_Checkpoint_SDXL_Config",
+ "description": "PiD decoder for the SDXL backbone (4-channel latent)."
+ },
+ "PiDUpscaleInvocation": {
+ "category": "image",
+ "class": "invocation",
+ "classification": "prototype",
+ "description": "Upscale any image 4x via FLUX VAE encode + PiD pixel-diffusion decode.\n\nWorks for source images that the FLUX VAE can encode (i.e. natural\nphotos / generated images at any size that lands on the VAE's 8-pixel\ngrid). The caption is used to condition the PiD decoder; leaving it\nempty produces an unconditional decode and is the cheapest option, but\nthe model was distilled with rich captions and benefits from one.",
+ "node_pack": "invokeai",
+ "properties": {
+ "board": {
+ "anyOf": [
+ {
+ "$ref": "#/components/schemas/BoardField"
+ },
+ {
+ "type": "null"
+ }
+ ],
+ "default": null,
+ "description": "The board to save the image to",
+ "field_kind": "internal",
+ "input": "direct",
+ "orig_required": false,
+ "ui_hidden": false
+ },
+ "metadata": {
+ "anyOf": [
+ {
+ "$ref": "#/components/schemas/MetadataField"
+ },
+ {
+ "type": "null"
+ }
+ ],
+ "default": null,
+ "description": "Optional metadata to be saved with the image",
+ "field_kind": "internal",
+ "input": "connection",
+ "orig_required": false,
+ "ui_hidden": false
+ },
+ "id": {
+ "description": "The id of this instance of an invocation. Must be unique among all instances of invocations.",
+ "field_kind": "node_attribute",
+ "title": "Id",
+ "type": "string"
+ },
+ "is_intermediate": {
+ "default": false,
+ "description": "Whether or not this is an intermediate invocation.",
+ "field_kind": "node_attribute",
+ "input": "direct",
+ "orig_required": true,
+ "title": "Is Intermediate",
+ "type": "boolean",
+ "ui_hidden": false,
+ "ui_type": "IsIntermediate"
+ },
+ "use_cache": {
+ "default": true,
+ "description": "Whether or not to use the cache",
+ "field_kind": "node_attribute",
+ "title": "Use Cache",
+ "type": "boolean"
+ },
+ "image": {
+ "anyOf": [
+ {
+ "$ref": "#/components/schemas/ImageField"
+ },
+ {
+ "type": "null"
+ }
+ ],
+ "default": null,
+ "description": "Image to upscale.",
+ "field_kind": "input",
+ "input": "any",
+ "orig_required": true
+ },
+ "vae": {
+ "anyOf": [
+ {
+ "$ref": "#/components/schemas/VAEField"
+ },
+ {
+ "type": "null"
+ }
+ ],
+ "default": null,
+ "description": "FLUX-compatible VAE (FLUX.1, Z-Image, anything sharing the 16-channel encoder).",
+ "field_kind": "input",
+ "input": "connection",
+ "orig_required": true
+ },
+ "gemma2_encoder": {
+ "anyOf": [
+ {
+ "$ref": "#/components/schemas/Gemma2EncoderField"
+ },
+ {
+ "type": "null"
+ }
+ ],
+ "default": null,
+ "description": "Gemma-2 caption encoder. Required by PiD.",
+ "field_kind": "input",
+ "input": "connection",
+ "orig_required": true,
+ "title": "Gemma-2 Encoder"
+ },
+ "pid_decoder": {
+ "anyOf": [
+ {
+ "$ref": "#/components/schemas/PiDDecoderField"
+ },
+ {
+ "type": "null"
+ }
+ ],
+ "default": null,
+ "description": "PiD FLUX decoder checkpoint.",
+ "field_kind": "input",
+ "input": "connection",
+ "orig_required": true,
+ "title": "PiD Decoder"
+ },
+ "prompt": {
+ "default": "",
+ "description": "Optional caption describing the image. Empty -> empty-caption decode.",
+ "field_kind": "input",
+ "input": "any",
+ "orig_default": "",
+ "orig_required": false,
+ "title": "Prompt",
+ "type": "string",
+ "ui_component": "textarea"
+ },
+ "num_inference_steps": {
+ "default": 4,
+ "description": "Number of PiD distill steps. The released checkpoints are trained for 4.",
+ "field_kind": "input",
+ "input": "any",
+ "maximum": 8,
+ "minimum": 1,
+ "orig_default": 4,
+ "orig_required": false,
+ "title": "Num Inference Steps",
+ "type": "integer"
+ },
+ "seed": {
+ "default": 0,
+ "description": "Seed for the PiD decoder's noise.",
+ "field_kind": "input",
+ "input": "any",
+ "orig_default": 0,
+ "orig_required": false,
+ "title": "Seed",
+ "type": "integer"
+ },
+ "type": {
+ "const": "pid_upscale",
+ "default": "pid_upscale",
+ "field_kind": "node_attribute",
+ "title": "type",
+ "type": "string"
+ }
+ },
+ "required": ["type", "id"],
+ "tags": ["upscale", "image", "pid", "super-resolution", "flux"],
+ "title": "PiD Upscale (4x) - FLUX VAE",
+ "type": "object",
+ "version": "1.0.0",
+ "output": {
+ "$ref": "#/components/schemas/ImageOutput"
+ }
+ },
+ "PiDiNetEdgeDetectionInvocation": {
+ "category": "controlnet_preprocessors",
+ "class": "invocation",
+ "classification": "stable",
+ "description": "Generates an edge map using PiDiNet.",
+ "node_pack": "invokeai",
+ "properties": {
+ "board": {
+ "anyOf": [
+ {
+ "$ref": "#/components/schemas/BoardField"
+ },
+ {
+ "type": "null"
+ }
+ ],
+ "default": null,
+ "description": "The board to save the image to",
+ "field_kind": "internal",
+ "input": "direct",
+ "orig_required": false,
+ "ui_hidden": false
+ },
+ "metadata": {
+ "anyOf": [
+ {
+ "$ref": "#/components/schemas/MetadataField"
+ },
+ {
+ "type": "null"
+ }
+ ],
+ "default": null,
+ "description": "Optional metadata to be saved with the image",
+ "field_kind": "internal",
+ "input": "connection",
+ "orig_required": false,
+ "ui_hidden": false
+ },
+ "id": {
+ "description": "The id of this instance of an invocation. Must be unique among all instances of invocations.",
+ "field_kind": "node_attribute",
+ "title": "Id",
+ "type": "string"
+ },
+ "is_intermediate": {
+ "default": false,
+ "description": "Whether or not this is an intermediate invocation.",
+ "field_kind": "node_attribute",
+ "input": "direct",
+ "orig_required": true,
+ "title": "Is Intermediate",
+ "type": "boolean",
+ "ui_hidden": false,
+ "ui_type": "IsIntermediate"
+ },
+ "use_cache": {
+ "default": true,
+ "description": "Whether or not to use the cache",
+ "field_kind": "node_attribute",
+ "title": "Use Cache",
+ "type": "boolean"
+ },
+ "image": {
+ "anyOf": [
+ {
+ "$ref": "#/components/schemas/ImageField"
+ },
+ {
+ "type": "null"
+ }
+ ],
+ "default": null,
+ "description": "The image to process",
+ "field_kind": "input",
+ "input": "any",
+ "orig_required": true
+ },
+ "quantize_edges": {
+ "default": false,
+ "description": "Whether or not to use safe mode",
+ "field_kind": "input",
+ "input": "any",
+ "orig_default": false,
+ "orig_required": false,
+ "title": "Quantize Edges",
+ "type": "boolean"
+ },
+ "scribble": {
+ "default": false,
+ "description": "Whether or not to use scribble mode",
+ "field_kind": "input",
+ "input": "any",
+ "orig_default": false,
+ "orig_required": false,
+ "title": "Scribble",
+ "type": "boolean"
+ },
+ "type": {
+ "const": "pidi_edge_detection",
+ "default": "pidi_edge_detection",
+ "field_kind": "node_attribute",
+ "title": "type",
+ "type": "string"
+ }
+ },
+ "required": ["type", "id"],
+ "tags": ["controlnet", "edge"],
+ "title": "PiDiNet Edge Detection",
+ "type": "object",
+ "version": "1.0.0",
+ "output": {
+ "$ref": "#/components/schemas/ImageOutput"
+ }
+ },
+ "PresetData": {
+ "properties": {
+ "positive_prompt": {
+ "type": "string",
+ "title": "Positive Prompt",
+ "description": "Positive prompt"
+ },
+ "negative_prompt": {
+ "type": "string",
+ "title": "Negative Prompt",
+ "description": "Negative prompt"
+ }
+ },
+ "additionalProperties": false,
+ "type": "object",
+ "required": ["positive_prompt", "negative_prompt"],
+ "title": "PresetData"
+ },
+ "PresetType": {
+ "type": "string",
+ "enum": ["user", "default"],
+ "title": "PresetType"
+ },
+ "ProgressImage": {
+ "description": "The progress image sent intermittently during processing",
+ "properties": {
+ "width": {
+ "description": "The effective width of the image in pixels",
+ "minimum": 1,
+ "title": "Width",
+ "type": "integer"
+ },
+ "height": {
+ "description": "The effective height of the image in pixels",
+ "minimum": 1,
+ "title": "Height",
+ "type": "integer"
+ },
+ "dataURL": {
+ "description": "The image data as a b64 data URL",
+ "title": "Dataurl",
+ "type": "string"
+ }
+ },
+ "required": ["width", "height", "dataURL"],
+ "title": "ProgressImage",
+ "type": "object"
+ },
+ "PromptTemplateInvocation": {
+ "category": "prompt",
+ "class": "invocation",
+ "classification": "stable",
+ "description": "Applies a Style Preset template to positive and negative prompts.\n\nSelect a Style Preset and provide positive/negative prompts. The node replaces\n{prompt} placeholders in the template with your input prompts.",
"node_pack": "invokeai",
"properties": {
"id": {
@@ -60779,127 +62757,310 @@
"title": "Use Cache",
"type": "boolean"
},
- "model": {
- "$ref": "#/components/schemas/ModelIdentifierField",
- "description": "Qwen Image Edit model (Transformer) to load",
+ "model": {
+ "$ref": "#/components/schemas/ModelIdentifierField",
+ "description": "Qwen Image Edit model (Transformer) to load",
+ "field_kind": "input",
+ "input": "direct",
+ "orig_required": true,
+ "title": "Transformer",
+ "ui_model_base": ["qwen-image"],
+ "ui_model_type": ["main"]
+ },
+ "vae_model": {
+ "anyOf": [
+ {
+ "$ref": "#/components/schemas/ModelIdentifierField"
+ },
+ {
+ "type": "null"
+ }
+ ],
+ "default": null,
+ "description": "Standalone Qwen Image VAE model. If not provided, VAE will be loaded from the Component Source (or from the main model if it is Diffusers).",
+ "field_kind": "input",
+ "input": "direct",
+ "orig_default": null,
+ "orig_required": false,
+ "title": "VAE",
+ "ui_model_base": ["qwen-image"],
+ "ui_model_type": ["vae"]
+ },
+ "qwen_vl_encoder_model": {
+ "anyOf": [
+ {
+ "$ref": "#/components/schemas/ModelIdentifierField"
+ },
+ {
+ "type": "null"
+ }
+ ],
+ "default": null,
+ "description": "Standalone Qwen2.5-VL encoder model. If not provided, the encoder will be loaded from the Component Source (or from the main model if it is Diffusers).",
+ "field_kind": "input",
+ "input": "direct",
+ "orig_default": null,
+ "orig_required": false,
+ "title": "Qwen VL Encoder",
+ "ui_model_type": ["qwen_vl_encoder"]
+ },
+ "component_source": {
+ "anyOf": [
+ {
+ "$ref": "#/components/schemas/ModelIdentifierField"
+ },
+ {
+ "type": "null"
+ }
+ ],
+ "default": null,
+ "description": "Diffusers Qwen Image model to extract VAE and/or Qwen VL encoder from. Use this if you don't have separate VAE/encoder models. Ignored for any submodel that is provided separately.",
+ "field_kind": "input",
+ "input": "direct",
+ "orig_default": null,
+ "orig_required": false,
+ "title": "Component Source (Diffusers)",
+ "ui_model_base": ["qwen-image"],
+ "ui_model_format": ["diffusers"],
+ "ui_model_type": ["main"]
+ },
+ "type": {
+ "const": "qwen_image_model_loader",
+ "default": "qwen_image_model_loader",
+ "field_kind": "node_attribute",
+ "title": "type",
+ "type": "string"
+ }
+ },
+ "required": ["model", "type", "id"],
+ "tags": ["model", "qwen_image"],
+ "title": "Main Model - Qwen Image",
+ "type": "object",
+ "version": "1.2.0",
+ "output": {
+ "$ref": "#/components/schemas/QwenImageModelLoaderOutput"
+ }
+ },
+ "QwenImageModelLoaderOutput": {
+ "class": "output",
+ "description": "Qwen Image model loader output.",
+ "properties": {
+ "transformer": {
+ "$ref": "#/components/schemas/TransformerField",
+ "description": "Transformer",
+ "field_kind": "output",
+ "title": "Transformer",
+ "ui_hidden": false
+ },
+ "qwen_vl_encoder": {
+ "$ref": "#/components/schemas/QwenVLEncoderField",
+ "description": "Qwen2.5-VL tokenizer, processor and text/vision encoder",
+ "field_kind": "output",
+ "title": "Qwen VL Encoder",
+ "ui_hidden": false
+ },
+ "vae": {
+ "$ref": "#/components/schemas/VAEField",
+ "description": "VAE",
+ "field_kind": "output",
+ "title": "VAE",
+ "ui_hidden": false
+ },
+ "type": {
+ "const": "qwen_image_model_loader_output",
+ "default": "qwen_image_model_loader_output",
+ "field_kind": "node_attribute",
+ "title": "type",
+ "type": "string"
+ }
+ },
+ "required": ["output_meta", "transformer", "qwen_vl_encoder", "vae", "type", "type"],
+ "title": "QwenImageModelLoaderOutput",
+ "type": "object"
+ },
+ "QwenImagePiDDecodeInvocation": {
+ "category": "latents",
+ "class": "invocation",
+ "classification": "prototype",
+ "description": "Decode a Qwen-Image latent with the PiD pixel-diffusion decoder.\n\nProduces a 4x super-resolved image in a single pass. The 5D Qwen latent is\nreduced to 2D and per-channel denormalized (``z * std + mean``) before PiD.",
+ "node_pack": "invokeai",
+ "properties": {
+ "board": {
+ "anyOf": [
+ {
+ "$ref": "#/components/schemas/BoardField"
+ },
+ {
+ "type": "null"
+ }
+ ],
+ "default": null,
+ "description": "The board to save the image to",
+ "field_kind": "internal",
+ "input": "direct",
+ "orig_required": false,
+ "ui_hidden": false
+ },
+ "metadata": {
+ "anyOf": [
+ {
+ "$ref": "#/components/schemas/MetadataField"
+ },
+ {
+ "type": "null"
+ }
+ ],
+ "default": null,
+ "description": "Optional metadata to be saved with the image",
+ "field_kind": "internal",
+ "input": "connection",
+ "orig_required": false,
+ "ui_hidden": false
+ },
+ "id": {
+ "description": "The id of this instance of an invocation. Must be unique among all instances of invocations.",
+ "field_kind": "node_attribute",
+ "title": "Id",
+ "type": "string"
+ },
+ "is_intermediate": {
+ "default": false,
+ "description": "Whether or not this is an intermediate invocation.",
+ "field_kind": "node_attribute",
+ "input": "direct",
+ "orig_required": true,
+ "title": "Is Intermediate",
+ "type": "boolean",
+ "ui_hidden": false,
+ "ui_type": "IsIntermediate"
+ },
+ "use_cache": {
+ "default": true,
+ "description": "Whether or not to use the cache",
+ "field_kind": "node_attribute",
+ "title": "Use Cache",
+ "type": "boolean"
+ },
+ "latents": {
+ "anyOf": [
+ {
+ "$ref": "#/components/schemas/LatentsField"
+ },
+ {
+ "type": "null"
+ }
+ ],
+ "default": null,
+ "description": "Latents tensor",
"field_kind": "input",
- "input": "direct",
+ "input": "connection",
+ "orig_required": true
+ },
+ "prompt": {
+ "anyOf": [
+ {
+ "type": "string"
+ },
+ {
+ "type": "null"
+ }
+ ],
+ "default": null,
+ "description": "Text prompt the latent was generated from. PiD conditions on it.",
+ "field_kind": "input",
+ "input": "any",
"orig_required": true,
- "title": "Transformer",
- "ui_model_base": ["qwen-image"],
- "ui_model_type": ["main"]
+ "title": "Prompt",
+ "ui_component": "textarea"
},
- "vae_model": {
+ "gemma2_encoder": {
"anyOf": [
{
- "$ref": "#/components/schemas/ModelIdentifierField"
+ "$ref": "#/components/schemas/Gemma2EncoderField"
},
{
"type": "null"
}
],
"default": null,
- "description": "Standalone Qwen Image VAE model. If not provided, VAE will be loaded from the Component Source (or from the main model if it is Diffusers).",
+ "description": "Gemma-2 caption encoder. Required by PiD.",
"field_kind": "input",
- "input": "direct",
- "orig_default": null,
- "orig_required": false,
- "title": "VAE",
- "ui_model_base": ["qwen-image"],
- "ui_model_type": ["vae"]
+ "input": "connection",
+ "orig_required": true,
+ "title": "Gemma-2 Encoder"
},
- "qwen_vl_encoder_model": {
+ "pid_decoder": {
"anyOf": [
{
- "$ref": "#/components/schemas/ModelIdentifierField"
+ "$ref": "#/components/schemas/PiDDecoderField"
},
{
"type": "null"
}
],
"default": null,
- "description": "Standalone Qwen2.5-VL encoder model. If not provided, the encoder will be loaded from the Component Source (or from the main model if it is Diffusers).",
+ "description": "PiD Qwen-Image decoder checkpoint.",
"field_kind": "input",
- "input": "direct",
- "orig_default": null,
- "orig_required": false,
- "title": "Qwen VL Encoder",
- "ui_model_type": ["qwen_vl_encoder"]
+ "input": "connection",
+ "orig_required": true,
+ "title": "PiD Decoder"
},
- "component_source": {
+ "vae": {
"anyOf": [
{
- "$ref": "#/components/schemas/ModelIdentifierField"
+ "$ref": "#/components/schemas/VAEField"
},
{
"type": "null"
}
],
"default": null,
- "description": "Diffusers Qwen Image model to extract VAE and/or Qwen VL encoder from. Use this if you don't have separate VAE/encoder models. Ignored for any submodel that is provided separately.",
+ "description": "Qwen-Image VAE, used to read the per-channel latents_mean / latents_std. If omitted, the diffusers default Qwen-Image constants are used.",
"field_kind": "input",
- "input": "direct",
+ "input": "connection",
"orig_default": null,
"orig_required": false,
- "title": "Component Source (Diffusers)",
- "ui_model_base": ["qwen-image"],
- "ui_model_format": ["diffusers"],
- "ui_model_type": ["main"]
+ "title": "VAE"
+ },
+ "num_inference_steps": {
+ "default": 4,
+ "description": "Number of PiD distill steps. The released checkpoints are trained for 4.",
+ "field_kind": "input",
+ "input": "any",
+ "maximum": 8,
+ "minimum": 1,
+ "orig_default": 4,
+ "orig_required": false,
+ "title": "Num Inference Steps",
+ "type": "integer"
+ },
+ "seed": {
+ "default": 0,
+ "description": "Seed for the PiD decoder's noise.",
+ "field_kind": "input",
+ "input": "any",
+ "orig_default": 0,
+ "orig_required": false,
+ "title": "Seed",
+ "type": "integer"
},
"type": {
- "const": "qwen_image_model_loader",
- "default": "qwen_image_model_loader",
+ "const": "qwen_image_pid_decode",
+ "default": "qwen_image_pid_decode",
"field_kind": "node_attribute",
"title": "type",
"type": "string"
}
},
- "required": ["model", "type", "id"],
- "tags": ["model", "qwen_image"],
- "title": "Main Model - Qwen Image",
+ "required": ["type", "id"],
+ "tags": ["latents", "image", "pid", "qwen-image", "upscale"],
+ "title": "Latents to Image - Qwen-Image + PiD (4x SR)",
"type": "object",
- "version": "1.2.0",
+ "version": "1.0.0",
"output": {
- "$ref": "#/components/schemas/QwenImageModelLoaderOutput"
+ "$ref": "#/components/schemas/ImageOutput"
}
},
- "QwenImageModelLoaderOutput": {
- "class": "output",
- "description": "Qwen Image model loader output.",
- "properties": {
- "transformer": {
- "$ref": "#/components/schemas/TransformerField",
- "description": "Transformer",
- "field_kind": "output",
- "title": "Transformer",
- "ui_hidden": false
- },
- "qwen_vl_encoder": {
- "$ref": "#/components/schemas/QwenVLEncoderField",
- "description": "Qwen2.5-VL tokenizer, processor and text/vision encoder",
- "field_kind": "output",
- "title": "Qwen VL Encoder",
- "ui_hidden": false
- },
- "vae": {
- "$ref": "#/components/schemas/VAEField",
- "description": "VAE",
- "field_kind": "output",
- "title": "VAE",
- "ui_hidden": false
- },
- "type": {
- "const": "qwen_image_model_loader_output",
- "default": "qwen_image_model_loader_output",
- "field_kind": "node_attribute",
- "title": "type",
- "type": "string"
- }
- },
- "required": ["output_meta", "transformer", "qwen_vl_encoder", "vae", "type", "type"],
- "title": "QwenImageModelLoaderOutput",
- "type": "object"
- },
"QwenImageTextEncoderInvocation": {
"category": "conditioning",
"class": "invocation",
@@ -62907,159 +65068,325 @@
"title": "Use Cache",
"type": "boolean"
},
- "image": {
- "anyOf": [
- {
- "$ref": "#/components/schemas/ImageField"
- },
- {
- "type": "null"
- }
- ],
- "default": null,
- "description": "The image to encode",
- "field_kind": "input",
- "input": "any",
- "orig_required": true
- },
- "vae": {
+ "image": {
+ "anyOf": [
+ {
+ "$ref": "#/components/schemas/ImageField"
+ },
+ {
+ "type": "null"
+ }
+ ],
+ "default": null,
+ "description": "The image to encode",
+ "field_kind": "input",
+ "input": "any",
+ "orig_required": true
+ },
+ "vae": {
+ "anyOf": [
+ {
+ "$ref": "#/components/schemas/VAEField"
+ },
+ {
+ "type": "null"
+ }
+ ],
+ "default": null,
+ "description": "VAE",
+ "field_kind": "input",
+ "input": "connection",
+ "orig_required": true
+ },
+ "type": {
+ "const": "sd3_i2l",
+ "default": "sd3_i2l",
+ "field_kind": "node_attribute",
+ "title": "type",
+ "type": "string"
+ }
+ },
+ "required": ["type", "id"],
+ "tags": ["image", "latents", "vae", "i2l", "sd3"],
+ "title": "Image to Latents - SD3",
+ "type": "object",
+ "version": "1.0.1",
+ "output": {
+ "$ref": "#/components/schemas/LatentsOutput"
+ }
+ },
+ "SD3LatentsToImageInvocation": {
+ "category": "latents",
+ "class": "invocation",
+ "classification": "stable",
+ "description": "Generates an image from latents.",
+ "node_pack": "invokeai",
+ "properties": {
+ "board": {
+ "anyOf": [
+ {
+ "$ref": "#/components/schemas/BoardField"
+ },
+ {
+ "type": "null"
+ }
+ ],
+ "default": null,
+ "description": "The board to save the image to",
+ "field_kind": "internal",
+ "input": "direct",
+ "orig_required": false,
+ "ui_hidden": false
+ },
+ "metadata": {
+ "anyOf": [
+ {
+ "$ref": "#/components/schemas/MetadataField"
+ },
+ {
+ "type": "null"
+ }
+ ],
+ "default": null,
+ "description": "Optional metadata to be saved with the image",
+ "field_kind": "internal",
+ "input": "connection",
+ "orig_required": false,
+ "ui_hidden": false
+ },
+ "id": {
+ "description": "The id of this instance of an invocation. Must be unique among all instances of invocations.",
+ "field_kind": "node_attribute",
+ "title": "Id",
+ "type": "string"
+ },
+ "is_intermediate": {
+ "default": false,
+ "description": "Whether or not this is an intermediate invocation.",
+ "field_kind": "node_attribute",
+ "input": "direct",
+ "orig_required": true,
+ "title": "Is Intermediate",
+ "type": "boolean",
+ "ui_hidden": false,
+ "ui_type": "IsIntermediate"
+ },
+ "use_cache": {
+ "default": true,
+ "description": "Whether or not to use the cache",
+ "field_kind": "node_attribute",
+ "title": "Use Cache",
+ "type": "boolean"
+ },
+ "latents": {
+ "anyOf": [
+ {
+ "$ref": "#/components/schemas/LatentsField"
+ },
+ {
+ "type": "null"
+ }
+ ],
+ "default": null,
+ "description": "Latents tensor",
+ "field_kind": "input",
+ "input": "connection",
+ "orig_required": true
+ },
+ "vae": {
+ "anyOf": [
+ {
+ "$ref": "#/components/schemas/VAEField"
+ },
+ {
+ "type": "null"
+ }
+ ],
+ "default": null,
+ "description": "VAE",
+ "field_kind": "input",
+ "input": "connection",
+ "orig_required": true
+ },
+ "type": {
+ "const": "sd3_l2i",
+ "default": "sd3_l2i",
+ "field_kind": "node_attribute",
+ "title": "type",
+ "type": "string"
+ }
+ },
+ "required": ["type", "id"],
+ "tags": ["latents", "image", "vae", "l2i", "sd3"],
+ "title": "Latents to Image - SD3",
+ "type": "object",
+ "version": "1.3.2",
+ "output": {
+ "$ref": "#/components/schemas/ImageOutput"
+ }
+ },
+ "SD3PiDDecodeInvocation": {
+ "category": "latents",
+ "class": "invocation",
+ "classification": "prototype",
+ "description": "Decode an SD3 latent with the PiD pixel-diffusion decoder.",
+ "node_pack": "invokeai",
+ "properties": {
+ "board": {
+ "anyOf": [
+ {
+ "$ref": "#/components/schemas/BoardField"
+ },
+ {
+ "type": "null"
+ }
+ ],
+ "default": null,
+ "description": "The board to save the image to",
+ "field_kind": "internal",
+ "input": "direct",
+ "orig_required": false,
+ "ui_hidden": false
+ },
+ "metadata": {
+ "anyOf": [
+ {
+ "$ref": "#/components/schemas/MetadataField"
+ },
+ {
+ "type": "null"
+ }
+ ],
+ "default": null,
+ "description": "Optional metadata to be saved with the image",
+ "field_kind": "internal",
+ "input": "connection",
+ "orig_required": false,
+ "ui_hidden": false
+ },
+ "id": {
+ "description": "The id of this instance of an invocation. Must be unique among all instances of invocations.",
+ "field_kind": "node_attribute",
+ "title": "Id",
+ "type": "string"
+ },
+ "is_intermediate": {
+ "default": false,
+ "description": "Whether or not this is an intermediate invocation.",
+ "field_kind": "node_attribute",
+ "input": "direct",
+ "orig_required": true,
+ "title": "Is Intermediate",
+ "type": "boolean",
+ "ui_hidden": false,
+ "ui_type": "IsIntermediate"
+ },
+ "use_cache": {
+ "default": true,
+ "description": "Whether or not to use the cache",
+ "field_kind": "node_attribute",
+ "title": "Use Cache",
+ "type": "boolean"
+ },
+ "latents": {
"anyOf": [
{
- "$ref": "#/components/schemas/VAEField"
+ "$ref": "#/components/schemas/LatentsField"
},
{
"type": "null"
}
],
"default": null,
- "description": "VAE",
+ "description": "Latents tensor",
"field_kind": "input",
"input": "connection",
"orig_required": true
},
- "type": {
- "const": "sd3_i2l",
- "default": "sd3_i2l",
- "field_kind": "node_attribute",
- "title": "type",
- "type": "string"
- }
- },
- "required": ["type", "id"],
- "tags": ["image", "latents", "vae", "i2l", "sd3"],
- "title": "Image to Latents - SD3",
- "type": "object",
- "version": "1.0.1",
- "output": {
- "$ref": "#/components/schemas/LatentsOutput"
- }
- },
- "SD3LatentsToImageInvocation": {
- "category": "latents",
- "class": "invocation",
- "classification": "stable",
- "description": "Generates an image from latents.",
- "node_pack": "invokeai",
- "properties": {
- "board": {
+ "prompt": {
"anyOf": [
{
- "$ref": "#/components/schemas/BoardField"
+ "type": "string"
},
{
"type": "null"
}
],
"default": null,
- "description": "The board to save the image to",
- "field_kind": "internal",
- "input": "direct",
- "orig_required": false,
- "ui_hidden": false
+ "description": "Text prompt the latent was generated from. PiD conditions on it.",
+ "field_kind": "input",
+ "input": "any",
+ "orig_required": true,
+ "title": "Prompt",
+ "ui_component": "textarea"
},
- "metadata": {
+ "gemma2_encoder": {
"anyOf": [
{
- "$ref": "#/components/schemas/MetadataField"
+ "$ref": "#/components/schemas/Gemma2EncoderField"
},
{
"type": "null"
}
],
"default": null,
- "description": "Optional metadata to be saved with the image",
- "field_kind": "internal",
+ "description": "Gemma-2 caption encoder. Required by PiD.",
+ "field_kind": "input",
"input": "connection",
- "orig_required": false,
- "ui_hidden": false
- },
- "id": {
- "description": "The id of this instance of an invocation. Must be unique among all instances of invocations.",
- "field_kind": "node_attribute",
- "title": "Id",
- "type": "string"
- },
- "is_intermediate": {
- "default": false,
- "description": "Whether or not this is an intermediate invocation.",
- "field_kind": "node_attribute",
- "input": "direct",
"orig_required": true,
- "title": "Is Intermediate",
- "type": "boolean",
- "ui_hidden": false,
- "ui_type": "IsIntermediate"
- },
- "use_cache": {
- "default": true,
- "description": "Whether or not to use the cache",
- "field_kind": "node_attribute",
- "title": "Use Cache",
- "type": "boolean"
+ "title": "Gemma-2 Encoder"
},
- "latents": {
+ "pid_decoder": {
"anyOf": [
{
- "$ref": "#/components/schemas/LatentsField"
+ "$ref": "#/components/schemas/PiDDecoderField"
},
{
"type": "null"
}
],
"default": null,
- "description": "Latents tensor",
+ "description": "PiD SD3 decoder checkpoint.",
"field_kind": "input",
"input": "connection",
- "orig_required": true
+ "orig_required": true,
+ "title": "PiD Decoder"
},
- "vae": {
- "anyOf": [
- {
- "$ref": "#/components/schemas/VAEField"
- },
- {
- "type": "null"
- }
- ],
- "default": null,
- "description": "VAE",
+ "num_inference_steps": {
+ "default": 4,
+ "description": "Number of PiD distill steps. The released checkpoints are trained for 4.",
"field_kind": "input",
- "input": "connection",
- "orig_required": true
+ "input": "any",
+ "maximum": 8,
+ "minimum": 1,
+ "orig_default": 4,
+ "orig_required": false,
+ "title": "Num Inference Steps",
+ "type": "integer"
+ },
+ "seed": {
+ "default": 0,
+ "description": "Seed for the PiD decoder's noise.",
+ "field_kind": "input",
+ "input": "any",
+ "orig_default": 0,
+ "orig_required": false,
+ "title": "Seed",
+ "type": "integer"
},
"type": {
- "const": "sd3_l2i",
- "default": "sd3_l2i",
+ "const": "sd3_pid_decode",
+ "default": "sd3_pid_decode",
"field_kind": "node_attribute",
"title": "type",
"type": "string"
}
},
"required": ["type", "id"],
- "tags": ["latents", "image", "vae", "l2i", "sd3"],
- "title": "Latents to Image - SD3",
+ "tags": ["latents", "image", "pid", "sd3", "upscale"],
+ "title": "Latents to Image - SD3 + PiD (4x SR)",
"type": "object",
- "version": "1.3.2",
+ "version": "1.0.0",
"output": {
"$ref": "#/components/schemas/ImageOutput"
}
@@ -63583,83 +65910,266 @@
"title": "Use Cache",
"type": "boolean"
},
- "model": {
+ "model": {
+ "anyOf": [
+ {
+ "$ref": "#/components/schemas/ModelIdentifierField"
+ },
+ {
+ "type": "null"
+ }
+ ],
+ "default": null,
+ "description": "SDXL Main model (UNet, VAE, CLIP1, CLIP2) to load",
+ "field_kind": "input",
+ "input": "any",
+ "orig_required": true,
+ "ui_model_base": ["sdxl"],
+ "ui_model_type": ["main"]
+ },
+ "type": {
+ "const": "sdxl_model_loader",
+ "default": "sdxl_model_loader",
+ "field_kind": "node_attribute",
+ "title": "type",
+ "type": "string"
+ }
+ },
+ "required": ["type", "id"],
+ "tags": ["model", "sdxl"],
+ "title": "Main Model - SDXL",
+ "type": "object",
+ "version": "1.0.4",
+ "output": {
+ "$ref": "#/components/schemas/SDXLModelLoaderOutput"
+ }
+ },
+ "SDXLModelLoaderOutput": {
+ "class": "output",
+ "description": "SDXL base model loader output",
+ "properties": {
+ "unet": {
+ "$ref": "#/components/schemas/UNetField",
+ "description": "UNet (scheduler, LoRAs)",
+ "field_kind": "output",
+ "title": "UNet",
+ "ui_hidden": false
+ },
+ "clip": {
+ "$ref": "#/components/schemas/CLIPField",
+ "description": "CLIP (tokenizer, text encoder, LoRAs) and skipped layer count",
+ "field_kind": "output",
+ "title": "CLIP 1",
+ "ui_hidden": false
+ },
+ "clip2": {
+ "$ref": "#/components/schemas/CLIPField",
+ "description": "CLIP (tokenizer, text encoder, LoRAs) and skipped layer count",
+ "field_kind": "output",
+ "title": "CLIP 2",
+ "ui_hidden": false
+ },
+ "vae": {
+ "$ref": "#/components/schemas/VAEField",
+ "description": "VAE",
+ "field_kind": "output",
+ "title": "VAE",
+ "ui_hidden": false
+ },
+ "type": {
+ "const": "sdxl_model_loader_output",
+ "default": "sdxl_model_loader_output",
+ "field_kind": "node_attribute",
+ "title": "type",
+ "type": "string"
+ }
+ },
+ "required": ["output_meta", "unet", "clip", "clip2", "vae", "type", "type"],
+ "title": "SDXLModelLoaderOutput",
+ "type": "object"
+ },
+ "SDXLPiDDecodeInvocation": {
+ "category": "latents",
+ "class": "invocation",
+ "classification": "prototype",
+ "description": "Decode an SDXL latent with the PiD pixel-diffusion decoder.\n\nProduces a 4x super-resolved image in a single pass. The SDXL latent is\n4-channel at an 8x down-factor, so it is denormalized (``z / scaling_factor``)\nand handed straight to PiD - no packing needed.",
+ "node_pack": "invokeai",
+ "properties": {
+ "board": {
+ "anyOf": [
+ {
+ "$ref": "#/components/schemas/BoardField"
+ },
+ {
+ "type": "null"
+ }
+ ],
+ "default": null,
+ "description": "The board to save the image to",
+ "field_kind": "internal",
+ "input": "direct",
+ "orig_required": false,
+ "ui_hidden": false
+ },
+ "metadata": {
+ "anyOf": [
+ {
+ "$ref": "#/components/schemas/MetadataField"
+ },
+ {
+ "type": "null"
+ }
+ ],
+ "default": null,
+ "description": "Optional metadata to be saved with the image",
+ "field_kind": "internal",
+ "input": "connection",
+ "orig_required": false,
+ "ui_hidden": false
+ },
+ "id": {
+ "description": "The id of this instance of an invocation. Must be unique among all instances of invocations.",
+ "field_kind": "node_attribute",
+ "title": "Id",
+ "type": "string"
+ },
+ "is_intermediate": {
+ "default": false,
+ "description": "Whether or not this is an intermediate invocation.",
+ "field_kind": "node_attribute",
+ "input": "direct",
+ "orig_required": true,
+ "title": "Is Intermediate",
+ "type": "boolean",
+ "ui_hidden": false,
+ "ui_type": "IsIntermediate"
+ },
+ "use_cache": {
+ "default": true,
+ "description": "Whether or not to use the cache",
+ "field_kind": "node_attribute",
+ "title": "Use Cache",
+ "type": "boolean"
+ },
+ "latents": {
"anyOf": [
{
- "$ref": "#/components/schemas/ModelIdentifierField"
+ "$ref": "#/components/schemas/LatentsField"
},
{
"type": "null"
}
],
"default": null,
- "description": "SDXL Main model (UNet, VAE, CLIP1, CLIP2) to load",
+ "description": "Latents tensor",
+ "field_kind": "input",
+ "input": "connection",
+ "orig_required": true
+ },
+ "prompt": {
+ "anyOf": [
+ {
+ "type": "string"
+ },
+ {
+ "type": "null"
+ }
+ ],
+ "default": null,
+ "description": "Text prompt the latent was generated from. PiD conditions on it.",
"field_kind": "input",
"input": "any",
"orig_required": true,
- "ui_model_base": ["sdxl"],
- "ui_model_type": ["main"]
- },
- "type": {
- "const": "sdxl_model_loader",
- "default": "sdxl_model_loader",
- "field_kind": "node_attribute",
- "title": "type",
- "type": "string"
- }
- },
- "required": ["type", "id"],
- "tags": ["model", "sdxl"],
- "title": "Main Model - SDXL",
- "type": "object",
- "version": "1.0.4",
- "output": {
- "$ref": "#/components/schemas/SDXLModelLoaderOutput"
- }
- },
- "SDXLModelLoaderOutput": {
- "class": "output",
- "description": "SDXL base model loader output",
- "properties": {
- "unet": {
- "$ref": "#/components/schemas/UNetField",
- "description": "UNet (scheduler, LoRAs)",
- "field_kind": "output",
- "title": "UNet",
- "ui_hidden": false
+ "title": "Prompt",
+ "ui_component": "textarea"
},
- "clip": {
- "$ref": "#/components/schemas/CLIPField",
- "description": "CLIP (tokenizer, text encoder, LoRAs) and skipped layer count",
- "field_kind": "output",
- "title": "CLIP 1",
- "ui_hidden": false
+ "gemma2_encoder": {
+ "anyOf": [
+ {
+ "$ref": "#/components/schemas/Gemma2EncoderField"
+ },
+ {
+ "type": "null"
+ }
+ ],
+ "default": null,
+ "description": "Gemma-2 caption encoder. Required by PiD.",
+ "field_kind": "input",
+ "input": "connection",
+ "orig_required": true,
+ "title": "Gemma-2 Encoder"
},
- "clip2": {
- "$ref": "#/components/schemas/CLIPField",
- "description": "CLIP (tokenizer, text encoder, LoRAs) and skipped layer count",
- "field_kind": "output",
- "title": "CLIP 2",
- "ui_hidden": false
+ "pid_decoder": {
+ "anyOf": [
+ {
+ "$ref": "#/components/schemas/PiDDecoderField"
+ },
+ {
+ "type": "null"
+ }
+ ],
+ "default": null,
+ "description": "PiD SDXL decoder checkpoint.",
+ "field_kind": "input",
+ "input": "connection",
+ "orig_required": true,
+ "title": "PiD Decoder"
},
"vae": {
- "$ref": "#/components/schemas/VAEField",
- "description": "VAE",
- "field_kind": "output",
- "title": "VAE",
- "ui_hidden": false
+ "anyOf": [
+ {
+ "$ref": "#/components/schemas/VAEField"
+ },
+ {
+ "type": "null"
+ }
+ ],
+ "default": null,
+ "description": "SDXL VAE, used to read scaling_factor / shift_factor. If omitted, the SDXL fallback constants (0.13025 / 0.0) are used.",
+ "field_kind": "input",
+ "input": "connection",
+ "orig_default": null,
+ "orig_required": false,
+ "title": "VAE"
+ },
+ "num_inference_steps": {
+ "default": 4,
+ "description": "Number of PiD distill steps. The released checkpoints are trained for 4.",
+ "field_kind": "input",
+ "input": "any",
+ "maximum": 8,
+ "minimum": 1,
+ "orig_default": 4,
+ "orig_required": false,
+ "title": "Num Inference Steps",
+ "type": "integer"
+ },
+ "seed": {
+ "default": 0,
+ "description": "Seed for the PiD decoder's noise.",
+ "field_kind": "input",
+ "input": "any",
+ "orig_default": 0,
+ "orig_required": false,
+ "title": "Seed",
+ "type": "integer"
},
"type": {
- "const": "sdxl_model_loader_output",
- "default": "sdxl_model_loader_output",
+ "const": "sdxl_pid_decode",
+ "default": "sdxl_pid_decode",
"field_kind": "node_attribute",
"title": "type",
"type": "string"
}
},
- "required": ["output_meta", "unet", "clip", "clip2", "vae", "type", "type"],
- "title": "SDXLModelLoaderOutput",
- "type": "object"
+ "required": ["type", "id"],
+ "tags": ["latents", "image", "pid", "sdxl", "upscale"],
+ "title": "Latents to Image - SDXL + PiD (4x SR)",
+ "type": "object",
+ "version": "1.0.0",
+ "output": {
+ "$ref": "#/components/schemas/ImageOutput"
+ }
},
"SDXLRefinerCompelPromptInvocation": {
"category": "prompt",
@@ -66439,6 +68949,9 @@
{
"$ref": "#/components/schemas/Qwen3VariantType"
},
+ {
+ "$ref": "#/components/schemas/PiDDecoderVariantType"
+ },
{
"type": "null"
}
@@ -66599,6 +69112,9 @@
{
"$ref": "#/components/schemas/Qwen3VariantType"
},
+ {
+ "$ref": "#/components/schemas/PiDDecoderVariantType"
+ },
{
"type": "null"
}
@@ -67531,6 +70047,9 @@
{
"$ref": "#/components/schemas/Qwen3VariantType"
},
+ {
+ "$ref": "#/components/schemas/PiDDecoderVariantType"
+ },
{
"type": "null"
}
@@ -74203,6 +76722,189 @@
"title": "ZImageModelLoaderOutput",
"type": "object"
},
+ "ZImagePiDDecodeInvocation": {
+ "category": "latents",
+ "class": "invocation",
+ "classification": "prototype",
+ "description": "Decode a Z-Image latent with the PiD pixel-diffusion decoder.\n\nProduces a 4x super-resolved image in a single pass (Z-Image decoder is\ntrained on FLUX.1 latents; ``sr_scale=4`` with the FLUX VAE's 8x spatial\ndown-factor gives a 32x linear scale from latent to pixel).",
+ "node_pack": "invokeai",
+ "properties": {
+ "board": {
+ "anyOf": [
+ {
+ "$ref": "#/components/schemas/BoardField"
+ },
+ {
+ "type": "null"
+ }
+ ],
+ "default": null,
+ "description": "The board to save the image to",
+ "field_kind": "internal",
+ "input": "direct",
+ "orig_required": false,
+ "ui_hidden": false
+ },
+ "metadata": {
+ "anyOf": [
+ {
+ "$ref": "#/components/schemas/MetadataField"
+ },
+ {
+ "type": "null"
+ }
+ ],
+ "default": null,
+ "description": "Optional metadata to be saved with the image",
+ "field_kind": "internal",
+ "input": "connection",
+ "orig_required": false,
+ "ui_hidden": false
+ },
+ "id": {
+ "description": "The id of this instance of an invocation. Must be unique among all instances of invocations.",
+ "field_kind": "node_attribute",
+ "title": "Id",
+ "type": "string"
+ },
+ "is_intermediate": {
+ "default": false,
+ "description": "Whether or not this is an intermediate invocation.",
+ "field_kind": "node_attribute",
+ "input": "direct",
+ "orig_required": true,
+ "title": "Is Intermediate",
+ "type": "boolean",
+ "ui_hidden": false,
+ "ui_type": "IsIntermediate"
+ },
+ "use_cache": {
+ "default": true,
+ "description": "Whether or not to use the cache",
+ "field_kind": "node_attribute",
+ "title": "Use Cache",
+ "type": "boolean"
+ },
+ "latents": {
+ "anyOf": [
+ {
+ "$ref": "#/components/schemas/LatentsField"
+ },
+ {
+ "type": "null"
+ }
+ ],
+ "default": null,
+ "description": "Latents tensor",
+ "field_kind": "input",
+ "input": "connection",
+ "orig_required": true
+ },
+ "prompt": {
+ "anyOf": [
+ {
+ "type": "string"
+ },
+ {
+ "type": "null"
+ }
+ ],
+ "default": null,
+ "description": "Text prompt the latent was generated from. PiD conditions on it.",
+ "field_kind": "input",
+ "input": "any",
+ "orig_required": true,
+ "title": "Prompt",
+ "ui_component": "textarea"
+ },
+ "gemma2_encoder": {
+ "anyOf": [
+ {
+ "$ref": "#/components/schemas/Gemma2EncoderField"
+ },
+ {
+ "type": "null"
+ }
+ ],
+ "default": null,
+ "description": "Gemma-2 caption encoder. Required by PiD.",
+ "field_kind": "input",
+ "input": "connection",
+ "orig_required": true,
+ "title": "Gemma-2 Encoder"
+ },
+ "pid_decoder": {
+ "anyOf": [
+ {
+ "$ref": "#/components/schemas/PiDDecoderField"
+ },
+ {
+ "type": "null"
+ }
+ ],
+ "default": null,
+ "description": "PiD FLUX decoder checkpoint.",
+ "field_kind": "input",
+ "input": "connection",
+ "orig_required": true,
+ "title": "PiD Decoder"
+ },
+ "vae": {
+ "anyOf": [
+ {
+ "$ref": "#/components/schemas/VAEField"
+ },
+ {
+ "type": "null"
+ }
+ ],
+ "default": null,
+ "description": "Z-Image VAE used to read scaling_factor / shift_factor. If omitted, the FLUX.1 fallback constants (0.3611 / 0.1159) are used.",
+ "field_kind": "input",
+ "input": "connection",
+ "orig_default": null,
+ "orig_required": false,
+ "title": "VAE"
+ },
+ "num_inference_steps": {
+ "default": 4,
+ "description": "Number of PiD distill steps. The released checkpoints are trained for 4.",
+ "field_kind": "input",
+ "input": "any",
+ "maximum": 8,
+ "minimum": 1,
+ "orig_default": 4,
+ "orig_required": false,
+ "title": "Num Inference Steps",
+ "type": "integer"
+ },
+ "seed": {
+ "default": 0,
+ "description": "Seed for the PiD decoder's noise.",
+ "field_kind": "input",
+ "input": "any",
+ "orig_default": 0,
+ "orig_required": false,
+ "title": "Seed",
+ "type": "integer"
+ },
+ "type": {
+ "const": "z_image_pid_decode",
+ "default": "z_image_pid_decode",
+ "field_kind": "node_attribute",
+ "title": "type",
+ "type": "string"
+ }
+ },
+ "required": ["type", "id"],
+ "tags": ["latents", "image", "pid", "z-image", "upscale"],
+ "title": "Latents to Image - Z-Image + PiD (4x SR)",
+ "type": "object",
+ "version": "1.0.0",
+ "output": {
+ "$ref": "#/components/schemas/ImageOutput"
+ }
+ },
"ZImageSeedVarianceEnhancerInvocation": {
"category": "prompt",
"class": "invocation",
diff --git a/invokeai/frontend/web/public/locales/en.json b/invokeai/frontend/web/public/locales/en.json
index fdb9391f976..a4619b8a9e5 100644
--- a/invokeai/frontend/web/public/locales/en.json
+++ b/invokeai/frontend/web/public/locales/en.json
@@ -1354,6 +1354,12 @@
"selectModelToView": "Select a model to view its details",
"typePhraseHere": "Type phrase here",
"t5Encoder": "T5 Encoder",
+ "gemma2Encoder": "Gemma-2 Encoder",
+ "pidDecoder": "PiD Decoder",
+ "pidMode": "PiD Decode",
+ "pidModeOff": "Off",
+ "pidModeFit": "On (Fit to size)",
+ "pidModeNative": "On (Native 4×)",
"qwen3Encoder": "Qwen3 Encoder",
"qwenVLEncoder": "Qwen2.5-VL Encoder",
"animaVae": "VAE",
@@ -1678,6 +1684,10 @@
"noStartingFrameImage": "No starting frame image",
"noT5EncoderModelSelected": "No T5 Encoder model selected for FLUX generation",
"noFLUXVAEModelSelected": "No VAE model selected for FLUX generation",
+ "noPidDecoderModelSelected": "No PiD decoder model selected",
+ "noGemma2EncoderModelSelected": "No Gemma-2 encoder model selected (required by PiD)",
+ "pidScaleBeforeProcessingMustBeOff": "Turn off Scale Before Processing (set it to None) to use PiD decode",
+ "pidIncompatibleWithRefiner": "PiD decode is not compatible with the SDXL Refiner. Disable one of them.",
"noCLIPEmbedModelSelected": "No CLIP Embed model selected for FLUX generation",
"noQwen3EncoderModelSelected": "No Qwen3 Encoder model selected for FLUX2 Klein generation",
"noFlux2KleinVaeModelSelected": "No VAE selected. Non-diffusers FLUX.2 Klein models require a standalone VAE",
@@ -1990,6 +2000,8 @@
"imagenIncompatibleGenerationMode": "Google {{model}} supports Text to Image only. Use other models for Image to Image, Inpainting and Outpainting tasks.",
"chatGPT4oIncompatibleGenerationMode": "ChatGPT 4o supports Text to Image and Image to Image only. Use other models Inpainting and Outpainting tasks.",
"fluxKontextIncompatibleGenerationMode": "FLUX Kontext does not support generation from images placed on the canvas. Re-try using the Reference Image section and disable any Raster Layers.",
+ "pidUnsupportedMode": "PiD decode currently supports Text to Image and Image to Image only. Disable PiD for Inpaint/Outpaint.",
+ "pidScaleBeforeProcessingOff": "Turn off Scale Before Processing (set it to None) to use PiD decode.",
"problemUnpublishingWorkflow": "Problem Unpublishing Workflow",
"problemUnpublishingWorkflowDescription": "There was a problem unpublishing the workflow. Please try again.",
"workflowUnpublished": "Workflow Unpublished",
@@ -2057,6 +2069,15 @@
"0.5: Gentler schedule for resolutions just above native (1024px)."
]
},
+ "pidMode": {
+ "heading": "PiD Decode (Super-Resolution Decoder)",
+ "paragraphs": [
+ "PiD replaces the standard VAE decode with NVIDIA's Pixel Diffusion Decoder, a diffusion-based 4x super-resolution decoder. It requires a PiD decoder model and a Gemma-2 caption encoder.",
+ "Fit: generate at your chosen resolution, PiD decodes it 4x, then downscales back to that size - extra detail at the same output size, and it composites onto the canvas (works for Image to Image too).",
+ "Native (4x): your dimensions are the 4x target. Generation runs at a quarter of them (e.g. 512 -> 2048) and PiD's full 4x output is used directly. The PiD decoders are trained for 2K output (512px sources), with 2K-to-4K variants for 4K.",
+ "Because PiD's diffusion decode reconstructs detail, you can usually lower the generation Steps to save time. 'Scale Before Processing' must be set to None while PiD is enabled."
+ ]
+ },
"seedVarianceEnhancer": {
"heading": "Seed Variance Enhancer",
"paragraphs": [
diff --git a/invokeai/frontend/web/src/common/components/InformationalPopover/constants.ts b/invokeai/frontend/web/src/common/components/InformationalPopover/constants.ts
index e9d855648ad..e97466d51e5 100644
--- a/invokeai/frontend/web/src/common/components/InformationalPopover/constants.ts
+++ b/invokeai/frontend/web/src/common/components/InformationalPopover/constants.ts
@@ -6,6 +6,7 @@ export type Feature =
| 'fluxDypePreset'
| 'fluxDypeScale'
| 'fluxDypeExponent'
+ | 'pidMode'
| 'hrf'
| 'paramNegativeConditioning'
| 'paramPositiveConditioning'
@@ -102,6 +103,10 @@ export const POPOVER_DATA: { [key in Feature]?: PopoverData } = {
fluxDypeExponent: {
placement: 'right',
},
+ pidMode: {
+ placement: 'right',
+ href: 'https://github.com/nv-tlabs/PiD',
+ },
inpainting: {
href: 'https://support.invoke.ai/support/solutions/articles/151000096702-inpainting-outpainting-and-bounding-box',
},
diff --git a/invokeai/frontend/web/src/features/controlLayers/store/paramsSlice.test.ts b/invokeai/frontend/web/src/features/controlLayers/store/paramsSlice.test.ts
index d210d2fd2ac..9f9824f6ab2 100644
--- a/invokeai/frontend/web/src/features/controlLayers/store/paramsSlice.test.ts
+++ b/invokeai/frontend/web/src/features/controlLayers/store/paramsSlice.test.ts
@@ -157,7 +157,8 @@ describe('paramsSliceConfig persisted state migration', () => {
const result = migrate?.(v2State) as ReturnType;
- expect(result._version).toBe(3);
+ // v2 migrates all the way through the current chain (v2 -> v3 adds Qwen fields, v3 -> v4 adds PiD fields).
+ expect(result._version).toBe(4);
expect(result.qwenImageVaeModel).toBeNull();
expect(result.qwenImageQwenVLEncoderModel).toBeNull();
// Existing params should be preserved
diff --git a/invokeai/frontend/web/src/features/controlLayers/store/paramsSlice.ts b/invokeai/frontend/web/src/features/controlLayers/store/paramsSlice.ts
index c4c90cf98e7..de17d4676f8 100644
--- a/invokeai/frontend/web/src/features/controlLayers/store/paramsSlice.ts
+++ b/invokeai/frontend/web/src/features/controlLayers/store/paramsSlice.ts
@@ -11,6 +11,7 @@ import type {
AspectRatioID,
InfillMethod,
ParamsState,
+ PidMode,
PromptHistoryItem,
RgbaColor,
} from 'features/controlLayers/store/types';
@@ -49,7 +50,12 @@ import type {
ParameterVAEModel,
} from 'features/parameters/types/parameterSchemas';
import { getExternalPanelControl, hasExternalPanelControl } from 'features/parameters/util/externalPanelSchema';
-import { getGridSize, getIsSizeOptimal, getOptimalDimension } from 'features/parameters/util/optimalDimension';
+import {
+ getGridSize,
+ getIsSizeOptimal,
+ getOptimalDimension,
+ getPidScale,
+} from 'features/parameters/util/optimalDimension';
import { modelConfigsAdapterSelectors, selectModelConfigsQuery } from 'services/api/endpoints/models';
import type { AnyModelConfigWithExternal } from 'services/api/types';
import { isExternalApiModelConfig, isNonRefinerMainModelConfig } from 'services/api/types';
@@ -260,6 +266,39 @@ const slice = createSlice({
}
state.kleinQwen3EncoderModel = result.data;
},
+ pidModeChanged: (state, action: PayloadAction) => {
+ const prevPidScale = getPidScale(state.pidMode);
+ const nextPidScale = getPidScale(action.payload);
+ state.pidMode = action.payload;
+ // Entering/leaving native mode reinterprets the dimensions (4x target <-> generation resolution), so
+ // re-fit them to the new mode's optimal target on the new grid, preserving aspect ratio.
+ if (prevPidScale !== nextPidScale) {
+ const base = state.model?.base as BaseModelType | undefined;
+ const optimalDimension = getOptimalDimension(base, nextPidScale);
+ const { width, height } = calculateNewSize(
+ state.dimensions.aspectRatio.value,
+ optimalDimension * optimalDimension,
+ base,
+ nextPidScale
+ );
+ state.dimensions.width = width;
+ state.dimensions.height = height;
+ }
+ },
+ pidDecoderModelSelected: (state, action: PayloadAction<{ key: string; name: string; base: string } | null>) => {
+ const result = zParamsState.shape.pidDecoderModel.safeParse(action.payload);
+ if (!result.success) {
+ return;
+ }
+ state.pidDecoderModel = result.data;
+ },
+ gemma2EncoderModelSelected: (state, action: PayloadAction<{ key: string; name: string; base: string } | null>) => {
+ const result = zParamsState.shape.gemma2EncoderModel.safeParse(action.payload);
+ if (!result.success) {
+ return;
+ }
+ state.gemma2EncoderModel = result.data;
+ },
qwenImageComponentSourceSelected: (state, action: PayloadAction) => {
const result = zParamsState.shape.qwenImageComponentSource.safeParse(action.payload);
if (!result.success) {
@@ -392,7 +431,7 @@ const slice = createSlice({
//#region Dimensions
sizeRecalled: (state, action: PayloadAction<{ width: number; height: number }>) => {
const { width, height } = action.payload;
- const gridSize = getGridSize(state.model?.base as BaseModelType | undefined);
+ const gridSize = getGridSize(state.model?.base as BaseModelType | undefined, getPidScale(state.pidMode));
state.dimensions.width = Math.max(roundDownToMultiple(width, gridSize), 64);
state.dimensions.height = Math.max(roundDownToMultiple(height, gridSize), 64);
state.dimensions.aspectRatio.value = state.dimensions.width / state.dimensions.height;
@@ -401,7 +440,7 @@ const slice = createSlice({
},
widthChanged: (state, action: PayloadAction<{ width: number; updateAspectRatio?: boolean; clamp?: boolean }>) => {
const { width, updateAspectRatio, clamp } = action.payload;
- const gridSize = getGridSize(state.model?.base as BaseModelType | undefined);
+ const gridSize = getGridSize(state.model?.base as BaseModelType | undefined, getPidScale(state.pidMode));
state.dimensions.width = clamp ? Math.max(roundDownToMultiple(width, gridSize), 64) : width;
if (state.dimensions.aspectRatio.isLocked) {
@@ -419,7 +458,7 @@ const slice = createSlice({
},
heightChanged: (state, action: PayloadAction<{ height: number; updateAspectRatio?: boolean; clamp?: boolean }>) => {
const { height, updateAspectRatio, clamp } = action.payload;
- const gridSize = getGridSize(state.model?.base as BaseModelType | undefined);
+ const gridSize = getGridSize(state.model?.base as BaseModelType | undefined, getPidScale(state.pidMode));
state.dimensions.height = clamp ? Math.max(roundDownToMultiple(height, gridSize), 64) : height;
if (state.dimensions.aspectRatio.isLocked) {
@@ -457,7 +496,8 @@ const slice = createSlice({
const { width, height } = calculateNewSize(
state.dimensions.aspectRatio.value,
state.dimensions.width * state.dimensions.height,
- state.model?.base as BaseModelType | undefined
+ state.model?.base as BaseModelType | undefined,
+ getPidScale(state.pidMode)
);
state.dimensions.width = width;
state.dimensions.height = height;
@@ -475,7 +515,8 @@ const slice = createSlice({
const { width, height } = calculateNewSize(
state.dimensions.aspectRatio.value,
state.dimensions.width * state.dimensions.height,
- state.model?.base as BaseModelType | undefined
+ state.model?.base as BaseModelType | undefined,
+ getPidScale(state.pidMode)
);
state.dimensions.width = width;
state.dimensions.height = height;
@@ -483,12 +524,14 @@ const slice = createSlice({
}
},
sizeOptimized: (state) => {
- const optimalDimension = getOptimalDimension(state.model?.base as BaseModelType | undefined);
+ const pidScale = getPidScale(state.pidMode);
+ const optimalDimension = getOptimalDimension(state.model?.base as BaseModelType | undefined, pidScale);
if (state.dimensions.aspectRatio.isLocked) {
const { width, height } = calculateNewSize(
state.dimensions.aspectRatio.value,
optimalDimension * optimalDimension,
- state.model?.base as BaseModelType | undefined
+ state.model?.base as BaseModelType | undefined,
+ pidScale
);
state.dimensions.width = width;
state.dimensions.height = height;
@@ -499,19 +542,22 @@ const slice = createSlice({
}
},
syncedToOptimalDimension: (state) => {
- const optimalDimension = getOptimalDimension(state.model?.base as BaseModelType | undefined);
+ const pidScale = getPidScale(state.pidMode);
+ const optimalDimension = getOptimalDimension(state.model?.base as BaseModelType | undefined, pidScale);
if (
!getIsSizeOptimal(
state.dimensions.width,
state.dimensions.height,
- state.model?.base as BaseModelType | undefined
+ state.model?.base as BaseModelType | undefined,
+ pidScale
)
) {
const bboxDims = calculateNewSize(
state.dimensions.aspectRatio.value,
optimalDimension * optimalDimension,
- state.model?.base as BaseModelType | undefined
+ state.model?.base as BaseModelType | undefined,
+ pidScale
);
state.dimensions.width = bboxDims.width;
state.dimensions.height = bboxDims.height;
@@ -616,6 +662,10 @@ const resetState = (state: ParamsState): ParamsState => {
newState.animaQwen3EncoderModel = oldState.animaQwen3EncoderModel;
newState.kleinVaeModel = oldState.kleinVaeModel;
newState.kleinQwen3EncoderModel = oldState.kleinQwen3EncoderModel;
+ newState.pidMode = oldState.pidMode;
+ newState.pidDecoderModel = oldState.pidDecoderModel;
+ newState.gemma2EncoderModel = oldState.gemma2EncoderModel;
+ newState.pidSteps = oldState.pidSteps;
newState.qwenImageComponentSource = oldState.qwenImageComponentSource;
newState.qwenImageVaeModel = oldState.qwenImageVaeModel;
newState.qwenImageQwenVLEncoderModel = oldState.qwenImageQwenVLEncoderModel;
@@ -668,6 +718,9 @@ export const {
zImageQwen3SourceModelSelected,
kleinVaeModelSelected,
kleinQwen3EncoderModelSelected,
+ pidModeChanged,
+ pidDecoderModelSelected,
+ gemma2EncoderModelSelected,
qwenImageComponentSourceSelected,
qwenImageVaeModelSelected,
qwenImageQwenVLEncoderModelSelected,
@@ -744,6 +797,15 @@ export const paramsSliceConfig: SliceConfig = {
state.qwenImageQwenVLEncoderModel = null;
}
+ if (state._version === 3) {
+ // v3 -> v4, add PiD (Pixel Diffusion Decoder) fields
+ state._version = 4;
+ state.pidMode = 'off';
+ state.pidDecoderModel = null;
+ state.gemma2EncoderModel = null;
+ state.pidSteps = 4;
+ }
+
return zParamsState.parse(state);
},
},
@@ -787,6 +849,9 @@ export const selectAnimaQwen3EncoderModel = createParamsSelector((params) => par
export const selectAnimaScheduler = createParamsSelector((params) => params.animaScheduler);
export const selectKleinVaeModel = createParamsSelector((params) => params.kleinVaeModel);
export const selectKleinQwen3EncoderModel = createParamsSelector((params) => params.kleinQwen3EncoderModel);
+export const selectPidMode = createParamsSelector((params) => params.pidMode);
+export const selectPidDecoderModel = createParamsSelector((params) => params.pidDecoderModel);
+export const selectGemma2EncoderModel = createParamsSelector((params) => params.gemma2EncoderModel);
export const selectQwenImageComponentSource = createParamsSelector((params) => params.qwenImageComponentSource);
export const selectQwenImageVaeModel = createParamsSelector((params) => params.qwenImageVaeModel);
export const selectQwenImageQwenVLEncoderModel = createParamsSelector((params) => params.qwenImageQwenVLEncoderModel);
diff --git a/invokeai/frontend/web/src/features/controlLayers/store/selectors.ts b/invokeai/frontend/web/src/features/controlLayers/store/selectors.ts
index db37f32d49e..7aa4748cf74 100644
--- a/invokeai/frontend/web/src/features/controlLayers/store/selectors.ts
+++ b/invokeai/frontend/web/src/features/controlLayers/store/selectors.ts
@@ -14,7 +14,7 @@ import type {
CanvasState,
} from 'features/controlLayers/store/types';
import type { BaseModelType } from 'features/nodes/types/common';
-import { getGridSize, getOptimalDimension } from 'features/parameters/util/optimalDimension';
+import { getGridSize, getOptimalDimension, getPidScale } from 'features/parameters/util/optimalDimension';
import type { Equals } from 'tsafe';
import { assert } from 'tsafe';
@@ -76,7 +76,7 @@ export const selectHasEntities = createSelector(selectEntityCountAll, (count) =>
*/
export const selectOptimalDimension = createSelector(selectParamsSlice, (params): number => {
const modelBase = params.model?.base as BaseModelType | undefined;
- return getOptimalDimension(modelBase ?? null);
+ return getOptimalDimension(modelBase ?? null, getPidScale(params.pidMode));
});
/**
@@ -84,7 +84,7 @@ export const selectOptimalDimension = createSelector(selectParamsSlice, (params)
*/
export const selectGridSize = createSelector(selectParamsSlice, (params): number => {
const modelBase = params.model?.base as BaseModelType | undefined;
- return getGridSize(modelBase ?? null);
+ return getGridSize(modelBase ?? null, getPidScale(params.pidMode));
});
/**
diff --git a/invokeai/frontend/web/src/features/controlLayers/store/types.ts b/invokeai/frontend/web/src/features/controlLayers/store/types.ts
index 53cb70d8f0e..4701c3157b1 100644
--- a/invokeai/frontend/web/src/features/controlLayers/store/types.ts
+++ b/invokeai/frontend/web/src/features/controlLayers/store/types.ts
@@ -780,8 +780,11 @@ const zPositivePromptHistory = z
export const zInfillMethod = z.enum(['patchmatch', 'lama', 'cv2', 'color', 'tile']);
export type InfillMethod = z.infer;
+const zPidMode = z.enum(['off', 'fit', 'native']);
+export type PidMode = z.infer;
+
export const zParamsState = z.object({
- _version: z.literal(3),
+ _version: z.literal(4),
maskBlur: z.number(),
maskBlurMethod: zParameterMaskBlurMethod,
canvasCoherenceMode: zParameterCanvasCoherenceMode,
@@ -844,6 +847,14 @@ export const zParamsState = z.object({
// Flux2 Klein model components - uses Qwen3 instead of CLIP+T5
kleinVaeModel: zParameterVAEModel.nullable(), // Optional: Separate FLUX.2 VAE for Klein
kleinQwen3EncoderModel: zModelIdentifierField.nullable(), // Optional: Separate Qwen3 Encoder for Klein
+ // PiD (Pixel Diffusion Decoder) - optional 4x super-resolution decode replacing the VAE decode.
+ // - 'off': regular VAE decode
+ // - 'fit': PiD decodes 4x internally, then downscales back to the bbox (compositing-safe; works on canvas for img2img)
+ // - 'native': PiD's full 4x output IS the result; the user-facing dimensions are the target, generation runs at target / 4
+ pidMode: zPidMode,
+ pidDecoderModel: zModelIdentifierField.nullable(), // PiD decoder checkpoint (matched to the main model's base)
+ gemma2EncoderModel: zModelIdentifierField.nullable(), // Gemma-2 caption encoder required by PiD
+ pidSteps: z.number(), // PiD distill steps (released checkpoints are trained for 4)
// Qwen Image Edit model components - GGUF transformer needs a Diffusers source for VAE/encoder
qwenImageComponentSource: zParameterModel.nullable(), // Diffusers model providing VAE + text encoder
qwenImageVaeModel: zParameterVAEModel.nullable(), // Optional: Standalone Qwen Image VAE checkpoint
@@ -869,7 +880,7 @@ export const zParamsState = z.object({
});
export type ParamsState = z.infer;
export const getInitialParamsState = (): ParamsState => ({
- _version: 3,
+ _version: 4,
maskBlur: 16,
maskBlurMethod: 'box',
canvasCoherenceMode: 'Gaussian Blur',
@@ -929,6 +940,10 @@ export const getInitialParamsState = (): ParamsState => ({
animaScheduler: 'euler',
kleinVaeModel: null,
kleinQwen3EncoderModel: null,
+ pidMode: 'off',
+ pidDecoderModel: null,
+ gemma2EncoderModel: null,
+ pidSteps: 4,
qwenImageComponentSource: null,
qwenImageVaeModel: null,
qwenImageQwenVLEncoderModel: null,
diff --git a/invokeai/frontend/web/src/features/controlLayers/util/getScaledBoundingBoxDimensions.ts b/invokeai/frontend/web/src/features/controlLayers/util/getScaledBoundingBoxDimensions.ts
index 5f58e77545a..a183e1b5b8f 100644
--- a/invokeai/frontend/web/src/features/controlLayers/util/getScaledBoundingBoxDimensions.ts
+++ b/invokeai/frontend/web/src/features/controlLayers/util/getScaledBoundingBoxDimensions.ts
@@ -57,12 +57,13 @@ export const getScaledBoundingBoxDimensions = (dimensions: Dimensions, base?: Ba
* @param ratio The aspect ratio to calculate the new size for
* @param area The input area
* @param base The base model
+ * @param pidScale The PiD generation scale (see {@link getPidScale}); defaults to 1 (no PiD)
* @returns The width and height that will fit the given aspect ratio, retaining the input area
*/
-export const calculateNewSize = (ratio: number, area: number, base?: BaseModelType): Dimensions => {
+export const calculateNewSize = (ratio: number, area: number, base?: BaseModelType, pidScale = 1): Dimensions => {
const exactWidth = Math.sqrt(area * ratio);
const exactHeight = exactWidth / ratio;
- const gridSize = getGridSize(base);
+ const gridSize = getGridSize(base, pidScale);
return {
width: roundToMultiple(exactWidth, gridSize),
diff --git a/invokeai/frontend/web/src/features/modelManagerV2/models.ts b/invokeai/frontend/web/src/features/modelManagerV2/models.ts
index cf295c9af6a..a95b6348c15 100644
--- a/invokeai/frontend/web/src/features/modelManagerV2/models.ts
+++ b/invokeai/frontend/web/src/features/modelManagerV2/models.ts
@@ -8,10 +8,12 @@ import {
isControlNetModelConfig,
isExternalApiModelConfig,
isFluxReduxModelConfig,
+ isGemma2EncoderModelConfig,
isIPAdapterModelConfig,
isLLaVAModelConfig,
isLoRAModelConfig,
isNonRefinerMainModelConfig,
+ isPiDDecoderModelConfig,
isQwen3EncoderModelConfig,
isQwenVLEncoderModelConfig,
isRefinerMainModelModelConfig,
@@ -85,6 +87,16 @@ const MODEL_CATEGORIES: Record = {
i18nKey: 'modelManager.qwenVLEncoder',
filter: isQwenVLEncoderModelConfig,
},
+ gemma2_encoder: {
+ category: 'gemma2_encoder',
+ i18nKey: 'modelManager.gemma2Encoder',
+ filter: isGemma2EncoderModelConfig,
+ },
+ pid_decoder: {
+ category: 'pid_decoder',
+ i18nKey: 'modelManager.pidDecoder',
+ filter: isPiDDecoderModelConfig,
+ },
control_lora: {
category: 'control_lora',
i18nKey: 'modelManager.controlLora',
@@ -187,11 +199,13 @@ export const MODEL_TYPE_TO_LONG_NAME: Record = {
t5_encoder: 'T5 Encoder',
qwen3_encoder: 'Qwen3 Encoder',
qwen_vl_encoder: 'Qwen2.5-VL Encoder',
+ gemma2_encoder: 'Gemma-2 Encoder',
clip_embed: 'CLIP Embed',
siglip: 'SigLIP',
flux_redux: 'FLUX Redux',
text_llm: 'Text LLM',
external_image_generator: 'External Image Generator',
+ pid_decoder: 'PiD Decoder',
unknown: 'Unknown',
};
@@ -255,6 +269,8 @@ export const MODEL_VARIANT_TO_LONG_NAME: Record = {
qwen3_4b: 'Qwen3 4B',
qwen3_8b: 'Qwen3 8B',
qwen3_06b: 'Qwen3 0.6B',
+ res2k_sr4x: 'PiD 2K (4x SR)',
+ res2kto4k_sr4x: 'PiD 4K (4x SR Upscale)',
};
export const MODEL_FORMAT_TO_LONG_NAME: Record = {
@@ -271,6 +287,7 @@ export const MODEL_FORMAT_TO_LONG_NAME: Record = {
t5_encoder: 'T5 Encoder',
qwen3_encoder: 'Qwen3 Encoder',
qwen_vl_encoder: 'Qwen2.5-VL Encoder',
+ gemma2_encoder: 'Gemma-2 Encoder',
bnb_quantized_int8b: 'BNB Quantized (int8b)',
bnb_quantized_nf4b: 'BNB Quantized (nf4b)',
gguf_quantized: 'GGUF Quantized',
diff --git a/invokeai/frontend/web/src/features/modelManagerV2/subpanels/ModelManagerPanel/ModelFormatBadge.tsx b/invokeai/frontend/web/src/features/modelManagerV2/subpanels/ModelManagerPanel/ModelFormatBadge.tsx
index 71d2efe0e45..79e35926667 100644
--- a/invokeai/frontend/web/src/features/modelManagerV2/subpanels/ModelManagerPanel/ModelFormatBadge.tsx
+++ b/invokeai/frontend/web/src/features/modelManagerV2/subpanels/ModelManagerPanel/ModelFormatBadge.tsx
@@ -16,6 +16,7 @@ const FORMAT_NAME_MAP: Record = {
t5_encoder: 't5_encoder',
qwen3_encoder: 'qwen3_encoder',
qwen_vl_encoder: 'qwen_vl_encoder',
+ gemma2_encoder: 'gemma2_encoder',
bnb_quantized_int8b: 'bnb_quantized_int8b',
bnb_quantized_nf4b: 'quantized',
gguf_quantized: 'gguf',
@@ -37,6 +38,7 @@ const FORMAT_COLOR_MAP: Record = {
t5_encoder: 'base',
qwen3_encoder: 'base',
qwen_vl_encoder: 'base',
+ gemma2_encoder: 'base',
bnb_quantized_int8b: 'base',
bnb_quantized_nf4b: 'base',
gguf_quantized: 'base',
diff --git a/invokeai/frontend/web/src/features/nodes/types/common.ts b/invokeai/frontend/web/src/features/nodes/types/common.ts
index fb2a1ce946a..24e54996642 100644
--- a/invokeai/frontend/web/src/features/nodes/types/common.ts
+++ b/invokeai/frontend/web/src/features/nodes/types/common.ts
@@ -134,10 +134,12 @@ export const zModelType = z.enum([
't5_encoder',
'qwen3_encoder',
'qwen_vl_encoder',
+ 'gemma2_encoder',
'clip_embed',
'siglip',
'flux_redux',
'external_image_generator',
+ 'pid_decoder',
'unknown',
]);
export type ModelType = z.infer;
@@ -164,6 +166,7 @@ export const zFlux2VariantType = z.enum(['klein_4b', 'klein_4b_base', 'klein_9b'
export const zZImageVariantType = z.enum(['turbo', 'zbase']);
const zQwenImageVariantType = z.enum(['generate', 'edit']);
export const zQwen3VariantType = z.enum(['qwen3_4b', 'qwen3_8b', 'qwen3_06b']);
+const zPiDDecoderVariantType = z.enum(['res2k_sr4x', 'res2kto4k_sr4x']);
export const zAnyModelVariant = z.union([
zModelVariantType,
zClipVariantType,
@@ -172,6 +175,7 @@ export const zAnyModelVariant = z.union([
zZImageVariantType,
zQwenImageVariantType,
zQwen3VariantType,
+ zPiDDecoderVariantType,
]);
export type AnyModelVariant = z.infer;
export const zModelFormat = z.enum([
@@ -187,6 +191,7 @@ export const zModelFormat = z.enum([
't5_encoder',
'qwen3_encoder',
'qwen_vl_encoder',
+ 'gemma2_encoder',
'bnb_quantized_int8b',
'bnb_quantized_nf4b',
'gguf_quantized',
diff --git a/invokeai/frontend/web/src/features/nodes/util/graph/generation/addImageToImage.ts b/invokeai/frontend/web/src/features/nodes/util/graph/generation/addImageToImage.ts
index f17ff970f27..1df9d6ec658 100644
--- a/invokeai/frontend/web/src/features/nodes/util/graph/generation/addImageToImage.ts
+++ b/invokeai/frontend/web/src/features/nodes/util/graph/generation/addImageToImage.ts
@@ -9,7 +9,7 @@ import {
} from 'features/nodes/util/graph/graphBuilderUtils';
import type {
DenoiseLatentsNodes,
- LatentToImageNodes,
+ ImageOutputNodes,
MainModelLoaderNodes,
VaeSourceNodes,
} from 'features/nodes/util/graph/types';
@@ -20,7 +20,9 @@ type AddImageToImageArg = {
g: Graph;
state: RootState;
manager: CanvasManager;
- l2i: Invocation;
+ // Only the `.image` output is consumed downstream, so any image-producing node works here (e.g. a PiD decode
+ // chain substituted for the regular VAE decode).
+ l2i: Invocation;
i2l: Invocation<
| 'i2l'
| 'flux_vae_encode'
@@ -45,19 +47,7 @@ export const addImageToImage = async ({
noise,
denoise,
vaeSource,
-}: AddImageToImageArg): Promise<
- Invocation<
- | 'img_resize'
- | 'l2i'
- | 'flux_vae_decode'
- | 'flux2_vae_decode'
- | 'sd3_l2i'
- | 'cogview4_l2i'
- | 'qwen_image_l2i'
- | 'z_image_l2i'
- | 'anima_l2i'
- >
-> => {
+}: AddImageToImageArg): Promise> => {
const { denoising_start, denoising_end } = getDenoisingStartAndEnd(state);
denoise.denoising_start = denoising_start;
denoise.denoising_end = denoising_end;
diff --git a/invokeai/frontend/web/src/features/nodes/util/graph/generation/addPidDecode.ts b/invokeai/frontend/web/src/features/nodes/util/graph/generation/addPidDecode.ts
new file mode 100644
index 00000000000..fb8f65f6bc3
--- /dev/null
+++ b/invokeai/frontend/web/src/features/nodes/util/graph/generation/addPidDecode.ts
@@ -0,0 +1,349 @@
+import type { RootState } from 'app/store/store';
+import { roundDownToMultiple } from 'common/util/roundDownToMultiple';
+import type { CanvasManager } from 'features/controlLayers/konva/CanvasManager';
+import { getPrefixedId } from 'features/controlLayers/konva/util';
+import { selectMainModelConfig, selectParamsSlice } from 'features/controlLayers/store/paramsSlice';
+import type { Graph } from 'features/nodes/util/graph/generation/Graph';
+import {
+ getDenoisingStartAndEnd,
+ getOriginalAndScaledSizesForOtherModes,
+ getOriginalAndScaledSizesForTextToImage,
+} from 'features/nodes/util/graph/graphBuilderUtils';
+import type { ImageToLatentsNodes, MainModelLoaderNodes, VaeSourceNodes } from 'features/nodes/util/graph/types';
+import { getGridSize, PID_SCALE } from 'features/parameters/util/optimalDimension';
+import type { Invocation } from 'services/api/types';
+import { assert } from 'tsafe';
+
+type Size = { width: number; height: number };
+
+/**
+ * The base-specific PiD decode node types. Each replaces its base's VAE decode with the PiD super-res decode.
+ * Only bases whose graph builder actually wires PiD are listed; more are added as their builders gain support.
+ */
+type PidDecodeNodeType =
+ | 'flux_pid_decode'
+ | 'flux2_pid_decode'
+ | 'sd3_pid_decode'
+ | 'sdxl_pid_decode'
+ | 'z_image_pid_decode'
+ | 'qwen_image_pid_decode';
+
+/**
+ * Denoise nodes whose latents PiD can decode. The FLUX-family nodes carry their own width/height; `denoise_latents`
+ * (SD1.5/SD2/SDXL) does not - it is sized via a separate `noise` node, so callers using it must pass `noise`.
+ */
+type PidDenoiseNodeType =
+ | 'flux_denoise'
+ | 'flux2_denoise'
+ | 'sd3_denoise'
+ | 'z_image_denoise'
+ | 'qwen_image_denoise'
+ | 'denoise_latents';
+
+/** PiD decode node types that expose a `vae` input (used to read the VAE's scaling constants at runtime). */
+const PID_DECODE_NODES_WITH_VAE_INPUT = new Set([
+ 'flux2_pid_decode',
+ 'sdxl_pid_decode',
+ 'z_image_pid_decode',
+ 'qwen_image_pid_decode',
+]);
+
+/**
+ * Sets the generation dimensions for a PiD graph. The FLUX-family denoise nodes carry width/height directly;
+ * `denoise_latents` (SD1.5/SD2/SDXL) is sized via its `noise` node instead (mirrors {@link addTextToImage}).
+ */
+const setPidGenDimensions = (
+ denoise: Invocation,
+ noise: Invocation<'noise'> | undefined,
+ width: number,
+ height: number
+): void => {
+ if (denoise.type === 'denoise_latents') {
+ assert(noise, 'PiD with denoise_latents (SD1.5/SD2/SDXL) requires a noise node');
+ noise.width = width;
+ noise.height = height;
+ } else {
+ denoise.width = width;
+ denoise.height = height;
+ }
+};
+
+/** Reads back the generation dimensions set by {@link setPidGenDimensions} (from the noise node for `denoise_latents`). */
+const getPidGenDimensions = (denoise: Invocation, noise: Invocation<'noise'> | undefined): Size => {
+ if (denoise.type === 'denoise_latents') {
+ assert(
+ noise?.width !== undefined && noise.height !== undefined,
+ 'PiD native decode requires the noise dimensions to be set by the caller'
+ );
+ return { width: noise.width, height: noise.height };
+ }
+ assert(
+ denoise.width !== undefined && denoise.height !== undefined,
+ 'PiD native decode requires the denoise dimensions to be set by the caller'
+ );
+ return { width: denoise.width, height: denoise.height };
+};
+
+type BuildPidDecodeChainArg = {
+ g: Graph;
+ state: RootState;
+ /** The denoise node producing the latents PiD will decode. Its dimensions are set by the CALLER. */
+ denoise: Invocation;
+ /** The noise node, required when `denoise` is a `denoise_latents` node (SD1.5/SD2/SDXL) - it carries the size. */
+ noise?: Invocation<'noise'>;
+ /** Which base-specific PiD decode node to build (e.g. `flux_pid_decode`, `flux2_pid_decode`). */
+ decodeNodeType: PidDecodeNodeType;
+ /**
+ * Optional VAE source. If the chosen decode node has a `vae` input (e.g. `flux2_pid_decode`), it is wired so
+ * the node can read the VAE's scaling/shift constants at runtime. Ignored for nodes without a `vae` input.
+ */
+ vaeSource?: Invocation;
+ /** The positive prompt node - PiD conditions its decode on the same caption. */
+ positivePrompt: Invocation<'string'>;
+ /** The seed node - reused for PiD's internal decode noise so results are reproducible. */
+ seed: Invocation<'integer'>;
+ /**
+ * - 'fit': PiD decodes 4x, then the output is downscaled to `fitSize` (compositing-safe; used everywhere).
+ * - 'native': PiD's full 4x output is used directly (txt2img only; `fitSize` is ignored).
+ */
+ mode: 'fit' | 'native';
+ /** The size to downscale the 4x output to in 'fit' mode (the bbox / region the result must fit). */
+ fitSize: Size;
+};
+
+/**
+ * Builds the PiD (Pixel Diffusion Decoder) decode chain: the Gemma-2 + PiD loaders, the base-specific PiD decode
+ * node (`decodeNodeType`) wired to the given denoise latents, and (in 'fit' mode) an `img_resize` that downscales
+ * PiD's 4x output to `fitSize`. Returns the terminal image node, which is a drop-in for the regular VAE decode
+ * (`l2i`) - downstream nodes only consume its `.image` output.
+ *
+ * This does NOT modify the denoise node's dimensions or denoising start/end; the caller owns those (they differ
+ * between txt2img and img2img/inpaint).
+ */
+export const buildPidDecodeChain = ({
+ g,
+ state,
+ denoise,
+ noise,
+ decodeNodeType,
+ vaeSource,
+ positivePrompt,
+ seed,
+ mode,
+ fitSize,
+}: BuildPidDecodeChainArg): Invocation<'img_resize' | PidDecodeNodeType> => {
+ const params = selectParamsSlice(state);
+ const { pidDecoderModel, gemma2EncoderModel, pidSteps } = params;
+ assert(pidDecoderModel, 'No PiD decoder model selected');
+ assert(gemma2EncoderModel, 'No Gemma-2 encoder model selected');
+
+ const gemma2Loader = g.addNode({
+ type: 'gemma2_encoder_loader',
+ id: getPrefixedId('gemma2_encoder_loader'),
+ gemma2_model: gemma2EncoderModel,
+ });
+ const pidLoader = g.addNode({
+ type: 'pid_decoder_loader',
+ id: getPrefixedId('pid_decoder_loader'),
+ pid_decoder_model: pidDecoderModel,
+ });
+ const pidDecode = g.addNode({
+ type: decodeNodeType,
+ id: getPrefixedId(decodeNodeType),
+ num_inference_steps: pidSteps,
+ });
+
+ g.addEdge(denoise, 'latents', pidDecode, 'latents');
+ g.addEdge(positivePrompt, 'value', pidDecode, 'prompt');
+ g.addEdge(gemma2Loader, 'gemma2_encoder', pidDecode, 'gemma2_encoder');
+ g.addEdge(pidLoader, 'pid_decoder', pidDecode, 'pid_decoder');
+ g.addEdge(seed, 'value', pidDecode, 'seed');
+ // Wire the VAE only for decode nodes that have a `vae` input (see PID_DECODE_NODES_WITH_VAE_INPUT).
+ if (vaeSource && PID_DECODE_NODES_WITH_VAE_INPUT.has(decodeNodeType)) {
+ g.addEdge(vaeSource, 'vae', pidDecode as Invocation<'flux2_pid_decode'>, 'vae');
+ }
+
+ const commonMetadata = {
+ pid_decoder: pidDecoderModel,
+ gemma2_encoder: gemma2EncoderModel,
+ pid_steps: pidSteps,
+ };
+
+ if (mode === 'native') {
+ // PiD's 4x output IS the result (the caller generated at target / 4) - no downscale.
+ const genSize = getPidGenDimensions(denoise, noise);
+ g.upsertMetadata({
+ ...commonMetadata,
+ pid_mode: mode,
+ width: genSize.width * PID_SCALE,
+ height: genSize.height * PID_SCALE,
+ });
+ return pidDecode;
+ }
+
+ // Fit mode: downscale PiD's 4x output back to the requested size.
+ const resize = g.addNode({
+ id: getPrefixedId('pid_fit_resize'),
+ type: 'img_resize',
+ ...fitSize,
+ });
+ g.addEdge(pidDecode, 'image', resize, 'image');
+ g.upsertMetadata({ ...commonMetadata, pid_mode: mode, width: fitSize.width, height: fitSize.height });
+
+ return resize;
+};
+
+type AddPidDecodeArg = {
+ g: Graph;
+ state: RootState;
+ mode: 'fit' | 'native';
+ denoise: Invocation;
+ noise?: Invocation<'noise'>;
+ decodeNodeType: PidDecodeNodeType;
+ vaeSource?: Invocation;
+ positivePrompt: Invocation<'string'>;
+ seed: Invocation<'integer'>;
+};
+
+/**
+ * Text-to-image PiD decode: sets up the denoise node (full denoise, generation dimensions) and replaces the VAE
+ * decode with a PiD decode (see {@link buildPidDecodeChain}).
+ *
+ * - 'fit': generate at the requested size, PiD decodes 4x, then downscale back to it.
+ * - 'native': the requested dimensions are the 4x target; generate at target / 4 and use PiD's 4x output directly.
+ *
+ * The caller is responsible for having NOT wired a VAE decode for these latents (or for deleting it).
+ *
+ * @returns The terminal image node, to be used as the canvas output.
+ */
+export const addPidDecode = ({
+ g,
+ state,
+ mode,
+ denoise,
+ noise,
+ decodeNodeType,
+ vaeSource,
+ positivePrompt,
+ seed,
+}: AddPidDecodeArg): Invocation<'img_resize' | PidDecodeNodeType> => {
+ const { originalSize, scaledSize } = getOriginalAndScaledSizesForTextToImage(state);
+ // Round the generation resolution to the main model's native grid (16 for FLUX-family, 8 for SDXL). The bbox is
+ // pre-snapped to grid * PID_SCALE by the UI/readiness, so target / PID_SCALE lands exactly on the grid.
+ const gridSize = getGridSize(selectMainModelConfig(state)?.base);
+
+ denoise.denoising_start = 0;
+ denoise.denoising_end = 1;
+ if (mode === 'native') {
+ // The user-facing dimensions are the 4x target; generate at target / PID_SCALE (kept on the model grid).
+ setPidGenDimensions(
+ denoise,
+ noise,
+ Math.max(roundDownToMultiple(originalSize.width / PID_SCALE, gridSize), gridSize),
+ Math.max(roundDownToMultiple(originalSize.height / PID_SCALE, gridSize), gridSize)
+ );
+ } else {
+ // Generate at the normal resolution; PiD will 4x it and we downscale back to it.
+ setPidGenDimensions(denoise, noise, scaledSize.width, scaledSize.height);
+ }
+
+ return buildPidDecodeChain({
+ g,
+ state,
+ denoise,
+ noise,
+ decodeNodeType,
+ vaeSource,
+ positivePrompt,
+ seed,
+ mode,
+ fitSize: originalSize,
+ });
+};
+
+type AddPidImageToImageNativeArg = {
+ g: Graph;
+ state: RootState;
+ manager: CanvasManager;
+ /** The denoise node. Its dimensions are set here to the 4x target / PID_SCALE. */
+ denoise: Invocation;
+ /** The noise node, required when `denoise` is a `denoise_latents` node (SD1.5/SD2/SDXL) - it carries the size. */
+ noise?: Invocation<'noise'>;
+ /** Which base-specific PiD decode node to build. */
+ decodeNodeType: PidDecodeNodeType;
+ /** The VAE encode node for the init image. */
+ i2l: Invocation;
+ /** The model loader / VAE source providing the VAE for encoding the init image (and, if applicable, the decode). */
+ vaeSource: Invocation;
+ positivePrompt: Invocation<'string'>;
+ seed: Invocation<'integer'>;
+};
+
+/**
+ * Native-4x PiD image-to-image (Canvas only). The user-facing bbox IS the 4x target: generation runs at bbox /
+ * PID_SCALE, the init image is downscaled to that resolution before encoding, and PiD decodes the latents straight
+ * back up to the full bbox size - no post-decode downscale, so all of PiD's detail is preserved. Because the result
+ * is exactly the bbox size it composites cleanly back onto the canvas region.
+ *
+ * Requires the bbox to be a multiple of the PiD-scaled grid (enforced by the UI grid snapping / readiness) so that
+ * bbox / PID_SCALE lands on the main model's grid and PiD's 4x output matches the bbox exactly.
+ *
+ * @returns The terminal PiD decode node (of type `decodeNodeType`), to be used as the canvas output.
+ */
+export const addPidImageToImageNative = async ({
+ g,
+ state,
+ manager,
+ denoise,
+ noise,
+ decodeNodeType,
+ i2l,
+ vaeSource,
+ positivePrompt,
+ seed,
+}: AddPidImageToImageNativeArg): Promise<Invocation<'img_resize' | PidDecodeNodeType>> => {
+ const { denoising_start, denoising_end } = getDenoisingStartAndEnd(state);
+ denoise.denoising_start = denoising_start;
+ denoise.denoising_end = denoising_end;
+
+ const { originalSize, rect } = getOriginalAndScaledSizesForOtherModes(state);
+ const gridSize = getGridSize(selectMainModelConfig(state)?.base);
+
+ // The bbox is the 4x target; generate at target / PID_SCALE (kept on the model grid).
+ const genSize = {
+ width: Math.max(roundDownToMultiple(originalSize.width / PID_SCALE, gridSize), gridSize),
+ height: Math.max(roundDownToMultiple(originalSize.height / PID_SCALE, gridSize), gridSize),
+ };
+ setPidGenDimensions(denoise, noise, genSize.width, genSize.height);
+
+ const adapters = manager.compositor.getVisibleAdaptersOfType('raster_layer');
+ const { image_name } = await manager.compositor.getCompositeImageDTO(adapters, rect, {
+ is_intermediate: true,
+ silent: true,
+ });
+
+ // Downscale the init image to the generation resolution before encoding.
+ const resizeIn = g.addNode({
+ type: 'img_resize',
+ id: getPrefixedId('initial_image_resize_in'),
+ image: { image_name },
+ ...genSize,
+ });
+ g.addEdge(vaeSource, 'vae', i2l, 'vae');
+ g.addEdge(resizeIn, 'image', i2l, 'image');
+ g.addEdge(i2l, 'latents', denoise, 'latents');
+
+ // PiD decodes the genSize latents straight up to 4x = the bbox. fitSize is ignored in native mode.
+ return buildPidDecodeChain({
+ g,
+ state,
+ denoise,
+ noise,
+ decodeNodeType,
+ vaeSource,
+ positivePrompt,
+ seed,
+ mode: 'native',
+ fitSize: originalSize,
+ });
+};
diff --git a/invokeai/frontend/web/src/features/nodes/util/graph/generation/buildFLUXGraph.test.ts b/invokeai/frontend/web/src/features/nodes/util/graph/generation/buildFLUXGraph.test.ts
index 5b9f3d0a468..1704a1a12cb 100644
--- a/invokeai/frontend/web/src/features/nodes/util/graph/generation/buildFLUXGraph.test.ts
+++ b/invokeai/frontend/web/src/features/nodes/util/graph/generation/buildFLUXGraph.test.ts
@@ -109,6 +109,7 @@ const mockParams = {
fluxVAE: null,
t5EncoderModel: null,
clipEmbedModel: null,
+ pidMode: 'off' as const,
};
vi.mock('features/controlLayers/store/paramsSlice', () => ({
diff --git a/invokeai/frontend/web/src/features/nodes/util/graph/generation/buildFLUXGraph.ts b/invokeai/frontend/web/src/features/nodes/util/graph/generation/buildFLUXGraph.ts
index dafcd9310ec..0d38cf249a9 100644
--- a/invokeai/frontend/web/src/features/nodes/util/graph/generation/buildFLUXGraph.ts
+++ b/invokeai/frontend/web/src/features/nodes/util/graph/generation/buildFLUXGraph.ts
@@ -20,11 +20,20 @@ import { addImageToImage } from 'features/nodes/util/graph/generation/addImageTo
import { addInpaint } from 'features/nodes/util/graph/generation/addInpaint';
import { addNSFWChecker } from 'features/nodes/util/graph/generation/addNSFWChecker';
import { addOutpaint } from 'features/nodes/util/graph/generation/addOutpaint';
+import {
+ addPidDecode,
+ addPidImageToImageNative,
+ buildPidDecodeChain,
+} from 'features/nodes/util/graph/generation/addPidDecode';
import { addRegions } from 'features/nodes/util/graph/generation/addRegions';
import { addTextToImage } from 'features/nodes/util/graph/generation/addTextToImage';
import { addWatermarker } from 'features/nodes/util/graph/generation/addWatermarker';
import { Graph } from 'features/nodes/util/graph/generation/Graph';
-import { selectCanvasOutputFields } from 'features/nodes/util/graph/graphBuilderUtils';
+import {
+ getOriginalAndScaledSizesForOtherModes,
+ getOriginalAndScaledSizesForTextToImage,
+ selectCanvasOutputFields,
+} from 'features/nodes/util/graph/graphBuilderUtils';
import type { GraphBuilderArg, GraphBuilderReturn, ImageOutputNodes } from 'features/nodes/util/graph/types';
import { UnsupportedGenerationModeError } from 'features/nodes/util/graph/types';
import { isFlux2KleinQwen3Compatible } from 'features/parameters/util/flux2Klein';
@@ -62,6 +71,7 @@ export const buildFLUXGraph = async (arg: GraphBuilderArg): Promise;
const fluxL2i = l2i as Invocation<'flux_vae_decode'>;
+ if (pidMode !== 'off') {
+ // Inpaint/outpaint are not wired for PiD yet - only txt2img and img2img are supported (Fit and Native).
+ if (generationMode === 'inpaint' || generationMode === 'outpaint') {
+ throw new UnsupportedGenerationModeError(t('toast.pidUnsupportedMode'));
+ }
+ // PiD decodes at 4x the generation resolution. "Scale Before Processing" (Canvas) would silently inflate
+ // the generation size to the model optimal, blowing up the decode - require it off (scaled == original).
+ const { originalSize, scaledSize } = getOriginalAndScaledSizesForTextToImage(state);
+ if (scaledSize.width !== originalSize.width || scaledSize.height !== originalSize.height) {
+ throw new UnsupportedGenerationModeError(t('toast.pidScaleBeforeProcessingOff'));
+ }
+ }
+
// Only add FLUX LoRAs for non-Klein models
addFLUXLoRAs(state, g, fluxDenoise, fluxModelLoader, fluxPosCond);
@@ -430,12 +522,26 @@ export const buildFLUXGraph = async (arg: GraphBuilderArg): Promise = l2i;
+ if (pidMode !== 'off') {
+ // Inpaint/outpaint are not wired for PiD yet - only txt2img and img2img are supported (Fit and Native).
+ if (generationMode === 'inpaint' || generationMode === 'outpaint') {
+ throw new UnsupportedGenerationModeError(t('toast.pidUnsupportedMode'));
+ }
+ // PiD decodes at 4x the generation resolution. "Scale Before Processing" (Canvas) would silently inflate
+ // the generation size to the model optimal, blowing up the decode - require it off (scaled == original).
+ const { originalSize, scaledSize } = getOriginalAndScaledSizesForTextToImage(state);
+ if (scaledSize.width !== originalSize.width || scaledSize.height !== originalSize.height) {
+ throw new UnsupportedGenerationModeError(t('toast.pidScaleBeforeProcessingOff'));
+ }
+ }
+
if (generationMode === 'txt2img') {
- canvasOutput = addTextToImage({
- g,
- state,
- denoise,
- l2i,
- });
+ if (pidMode !== 'off') {
+ // PiD replaces the VAE decode entirely - drop the unused l2i (and its edges). The Qwen-Image VAE (from the
+ // model loader) is wired so the node reads its per-channel latents_mean / latents_std.
+ g.deleteNode(l2i.id);
+ canvasOutput = addPidDecode({
+ g,
+ state,
+ mode: pidMode,
+ denoise,
+ decodeNodeType: 'qwen_image_pid_decode',
+ vaeSource: modelLoader,
+ positivePrompt,
+ seed,
+ });
+ } else {
+ canvasOutput = addTextToImage({
+ g,
+ state,
+ denoise,
+ l2i,
+ });
+ }
g.upsertMetadata({ generation_mode: 'qwen_image_txt2img' });
} else if (generationMode === 'img2img') {
assert(manager !== null);
@@ -255,15 +296,56 @@ export const buildQwenImageGraph = async (arg: GraphBuilderArg): Promise = l2i;
+ if (pidMode !== 'off') {
+ // Inpaint/outpaint are not wired for PiD yet - only txt2img and img2img are supported (Fit and Native).
+ if (generationMode === 'inpaint' || generationMode === 'outpaint') {
+ throw new UnsupportedGenerationModeError(t('toast.pidUnsupportedMode'));
+ }
+ // PiD decodes at 4x the generation resolution. "Scale Before Processing" (Canvas) would silently inflate
+ // the generation size to the model optimal, blowing up the decode - require it off (scaled == original).
+ const { originalSize, scaledSize } = getOriginalAndScaledSizesForTextToImage(state);
+ if (scaledSize.width !== originalSize.width || scaledSize.height !== originalSize.height) {
+ throw new UnsupportedGenerationModeError(t('toast.pidScaleBeforeProcessingOff'));
+ }
+ }
+
if (generationMode === 'txt2img') {
- canvasOutput = addTextToImage({
- g,
- state,
- denoise,
- l2i,
- });
+ if (pidMode !== 'off') {
+ // PiD replaces the VAE decode entirely - drop the unused l2i (and its edges). sd3_pid_decode has no vae
+ // input (fixed SD3 constants), so no vaeSource is passed.
+ g.deleteNode(l2i.id);
+ canvasOutput = addPidDecode({
+ g,
+ state,
+ mode: pidMode,
+ denoise,
+ decodeNodeType: 'sd3_pid_decode',
+ positivePrompt,
+ seed,
+ });
+ } else {
+ canvasOutput = addTextToImage({
+ g,
+ state,
+ denoise,
+ l2i,
+ });
+ }
g.upsertMetadata({ generation_mode: 'sd3_txt2img' });
} else if (generationMode === 'img2img') {
assert(manager !== null);
@@ -121,15 +160,55 @@ export const buildSD3Graph = async (arg: GraphBuilderArg): Promise
model,
negativePrompt: 'raw negative prompt',
positivePrompt: 'raw positive prompt',
+ pidMode: 'off',
refinerModel: null,
scheduler: 'euler',
seed: 123,
diff --git a/invokeai/frontend/web/src/features/nodes/util/graph/generation/buildSDXLGraph.ts b/invokeai/frontend/web/src/features/nodes/util/graph/generation/buildSDXLGraph.ts
index f31c42ee561..0f9cba1c391 100644
--- a/invokeai/frontend/web/src/features/nodes/util/graph/generation/buildSDXLGraph.ts
+++ b/invokeai/frontend/web/src/features/nodes/util/graph/generation/buildSDXLGraph.ts
@@ -9,15 +9,26 @@ import { addInpaint } from 'features/nodes/util/graph/generation/addInpaint';
import { addIPAdapters } from 'features/nodes/util/graph/generation/addIPAdapters';
import { addNSFWChecker } from 'features/nodes/util/graph/generation/addNSFWChecker';
import { addOutpaint } from 'features/nodes/util/graph/generation/addOutpaint';
+import {
+ addPidDecode,
+ addPidImageToImageNative,
+ buildPidDecodeChain,
+} from 'features/nodes/util/graph/generation/addPidDecode';
import { addSDXLLoRAs } from 'features/nodes/util/graph/generation/addSDXLLoRAs';
import { addSDXLRefiner } from 'features/nodes/util/graph/generation/addSDXLRefiner';
import { addSeamless } from 'features/nodes/util/graph/generation/addSeamless';
import { addTextToImage } from 'features/nodes/util/graph/generation/addTextToImage';
import { addWatermarker } from 'features/nodes/util/graph/generation/addWatermarker';
import { Graph } from 'features/nodes/util/graph/generation/Graph';
-import { selectCanvasOutputFields } from 'features/nodes/util/graph/graphBuilderUtils';
+import {
+ getOriginalAndScaledSizesForOtherModes,
+ getOriginalAndScaledSizesForTextToImage,
+ selectCanvasOutputFields,
+} from 'features/nodes/util/graph/graphBuilderUtils';
import type { GraphBuilderArg, GraphBuilderReturn, ImageOutputNodes } from 'features/nodes/util/graph/types';
+import { UnsupportedGenerationModeError } from 'features/nodes/util/graph/types';
import { selectActiveTab } from 'features/ui/store/uiSelectors';
+import { t } from 'i18next';
import type { Invocation } from 'services/api/types';
import type { Equals } from 'tsafe';
import { assert } from 'tsafe';
@@ -49,6 +60,7 @@ export const buildSDXLGraph = async (arg: GraphBuilderArg): Promise = l2i;
if (generationMode === 'txt2img') {
- canvasOutput = addTextToImage({
- g,
- state,
- noise,
- denoise,
- l2i,
- });
+ if (pidMode !== 'off') {
+ // PiD replaces the VAE decode entirely - drop the unused l2i (and its edges). SDXL's VAE source is wired
+ // so sdxl_pid_decode can read scaling_factor / shift_factor from it.
+ g.deleteNode(l2i.id);
+ canvasOutput = addPidDecode({
+ g,
+ state,
+ mode: pidMode,
+ denoise,
+ noise,
+ decodeNodeType: 'sdxl_pid_decode',
+ vaeSource,
+ positivePrompt,
+ seed,
+ });
+ } else {
+ canvasOutput = addTextToImage({
+ g,
+ state,
+ noise,
+ denoise,
+ l2i,
+ });
+ }
g.upsertMetadata({ generation_mode: 'sdxl_txt2img' });
} else if (generationMode === 'img2img') {
assert(manager !== null);
@@ -184,16 +231,60 @@ export const buildSDXLGraph = async (arg: GraphBuilderArg): Promise = l2i;
+ if (pidMode !== 'off') {
+ // Inpaint/outpaint are not wired for PiD yet - only txt2img and img2img are supported (Fit and Native).
+ if (generationMode === 'inpaint' || generationMode === 'outpaint') {
+ throw new UnsupportedGenerationModeError(t('toast.pidUnsupportedMode'));
+ }
+ // PiD decodes at 4x the generation resolution. "Scale Before Processing" (Canvas) would silently inflate
+ // the generation size to the model optimal, blowing up the decode - require it off (scaled == original).
+ const { originalSize, scaledSize } = getOriginalAndScaledSizesForTextToImage(state);
+ if (scaledSize.width !== originalSize.width || scaledSize.height !== originalSize.height) {
+ throw new UnsupportedGenerationModeError(t('toast.pidScaleBeforeProcessingOff'));
+ }
+ }
+
if (generationMode === 'txt2img') {
- canvasOutput = addTextToImage({
- g,
- state,
- denoise,
- l2i,
- });
+ if (pidMode !== 'off') {
+ // PiD replaces the VAE decode entirely - drop the unused l2i (and its edges). Z-Image shares FLUX.1's VAE
+ // and uses the FLUX PiD decoder; the Z-Image VAE (from the model loader) is wired so the node reads its
+ // scaling_factor / shift_factor.
+ g.deleteNode(l2i.id);
+ canvasOutput = addPidDecode({
+ g,
+ state,
+ mode: pidMode,
+ denoise,
+ decodeNodeType: 'z_image_pid_decode',
+ vaeSource: modelLoader,
+ positivePrompt,
+ seed,
+ });
+ } else {
+ canvasOutput = addTextToImage({
+ g,
+ state,
+ denoise,
+ l2i,
+ });
+ }
g.upsertMetadata({ generation_mode: 'z_image_txt2img' });
} else if (generationMode === 'img2img') {
assert(manager !== null);
@@ -246,15 +288,56 @@ export const buildZImageGraph = async (arg: GraphBuilderArg): Promise {
+ const dispatch = useAppDispatch();
+ const { t } = useTranslation();
+ const selectedModel = useAppSelector(selectPidDecoderModel);
+ const mainModelConfig = useAppSelector(selectMainModelConfig);
+ // PiD decoders are pinned to a backbone; only decoders whose base matches the main model's PiD decoder base
+ // are valid (e.g. flux2 decoders for a FLUX.2 main model). getPidDecoderBaseForMainBase returns null when the
+ // base has no PiD support, so the filter rejects everything and the combobox shows no options.
+ const decoderBase = useMemo(() => getPidDecoderBaseForMainBase(mainModelConfig?.base), [mainModelConfig?.base]);
+ const baseFilter = useCallback(
+ (config: AnyModelConfig) => decoderBase !== null && config.base === decoderBase,
+ [decoderBase]
+ );
+ const [modelConfigs, { isLoading }] = usePiDDecoderModels(baseFilter);
+
+ const _onChange = useCallback(
+ (config: AnyModelConfig | null) => {
+ if (config) {
+ dispatch(pidDecoderModelSelected(zModelIdentifierField.parse(config)));
+ }
+ },
+ [dispatch]
+ );
+
+ const { options, value, onChange, noOptionsMessage } = useModelCombobox({
+ modelConfigs,
+ onChange: _onChange,
+ selectedModel,
+ isLoading,
+ });
+
+ return (
+
+ {t('modelManager.pidDecoder')}
+
+
+ );
+});
+ParamPidDecoderModelSelect.displayName = 'ParamPidDecoderModelSelect';
+
+const ParamGemma2EncoderModelSelect = memo(() => {
+ const dispatch = useAppDispatch();
+ const { t } = useTranslation();
+ const selectedModel = useAppSelector(selectGemma2EncoderModel);
+ const [modelConfigs, { isLoading }] = useGemma2EncoderModels();
+
+ const _onChange = useCallback(
+ (config: AnyModelConfig | null) => {
+ if (config) {
+ dispatch(gemma2EncoderModelSelected(zModelIdentifierField.parse(config)));
+ }
+ },
+ [dispatch]
+ );
+
+ const { options, value, onChange, noOptionsMessage } = useModelCombobox({
+ modelConfigs,
+ onChange: _onChange,
+ selectedModel,
+ isLoading,
+ });
+
+ return (
+
+ {t('modelManager.gemma2Encoder')}
+
+
+ );
+});
+ParamGemma2EncoderModelSelect.displayName = 'ParamGemma2EncoderModelSelect';
+
+const PidSettings = () => {
+ const dispatch = useAppDispatch();
+ const { t } = useTranslation();
+ const pidMode = useAppSelector(selectPidMode);
+
+ const options = useMemo(
+ () => [
+ { value: 'off', label: t('modelManager.pidModeOff') },
+ { value: 'fit', label: t('modelManager.pidModeFit') },
+ { value: 'native', label: t('modelManager.pidModeNative') },
+ ],
+ [t]
+ );
+
+ const value = useMemo(() => options.find((o) => o.value === pidMode) ?? null, [options, pidMode]);
+
+ const onChange = useCallback(
+ (v) => {
+ if (v) {
+ dispatch(pidModeChanged(v.value as PidMode));
+ }
+ },
+ [dispatch]
+ );
+
+ return (
+
+
+
+ {t('modelManager.pidMode')}
+
+
+
+ {pidMode !== 'off' && (
+ <>
+ <ParamPidDecoderModelSelect />
+ <ParamGemma2EncoderModelSelect />
+ </>
+ )}
+
+ );
+};
+
+export default memo(PidSettings);
diff --git a/invokeai/frontend/web/src/features/parameters/util/optimalDimension.ts b/invokeai/frontend/web/src/features/parameters/util/optimalDimension.ts
index 2ac59a32e2b..71edada00eb 100644
--- a/invokeai/frontend/web/src/features/parameters/util/optimalDimension.ts
+++ b/invokeai/frontend/web/src/features/parameters/util/optimalDimension.ts
@@ -1,14 +1,34 @@
import type { BaseModelType } from 'features/nodes/types/common';
+/** PiD's fixed super-resolution factor (the released FLUX/SD3 checkpoints are 4x). */
+export const PID_SCALE = 4;
+// PiD res2k decoders are trained 512 -> 2048 (4x). In "native" mode the user-facing dimensions are the
+// 4x target, so the optimal *target* dimension is 512 * 4 = 2048, regardless of the base model's own optimum.
+const PID_NATIVE_OPTIMAL_DIMENSION = 512 * PID_SCALE;
+
+/**
+ * Returns the PiD generation scale that the dimension helpers should account for:
+ * - 4 in "native" mode (the user-facing dimensions are the 4x target; generation runs at target / 4)
+ * - 1 otherwise ('off' / 'fit' - dimensions are the generation resolution)
+ */
+export const getPidScale = (pidMode?: string | null): number => (pidMode === 'native' ? PID_SCALE : 1);
+
/**
* Gets the optimal dimension for a given base model:
* - sd-1, sd-2: 512
* - sdxl, flux, sd-3, cogview4, qwen-image, z-image, anima: 1024
* - default: 1024
+ *
+ * When `pidScale > 1` (PiD native mode) the user-facing dimensions are the 4x target, so the optimal is the
+ * PiD target dimension (2048) instead of the model's own optimum.
* @param base The base model
+ * @param pidScale The PiD generation scale (see {@link getPidScale}); defaults to 1 (no PiD)
* @returns The optimal dimension for the model, defaulting to 1024
*/
-export const getOptimalDimension = (base?: BaseModelType | null): number => {
+export const getOptimalDimension = (base?: BaseModelType | null, pidScale = 1): number => {
+ if (pidScale > 1) {
+ return PID_NATIVE_OPTIMAL_DIMENSION;
+ }
switch (base) {
case 'sd-1':
case 'sd-2':
@@ -66,26 +86,34 @@ export const isInSDXLTrainingDimensions = (width: number, height: number): boole
* - flux, sd-3, qwen-image, z-image: 16
* - cogview4: 32
* - default: 8
+ * When `pidScale > 1` (PiD native mode) the grid is multiplied so the user-facing target snaps to a value
+ * whose `/ pidScale` generation resolution still lands on the model's native grid.
* @param base The base model
+ * @param pidScale The PiD generation scale (see {@link getPidScale}); defaults to 1 (no PiD)
* @returns The grid size for the model, defaulting to 8
*/
-export const getGridSize = (base?: BaseModelType | null): number => {
+export const getGridSize = (base?: BaseModelType | null, pidScale = 1): number => {
+ let gridSize: number;
switch (base) {
case 'cogview4':
- return 32;
+ gridSize = 32;
+ break;
case 'flux':
case 'flux2':
case 'sd-3':
case 'qwen-image':
case 'z-image':
- return 16;
+ gridSize = 16;
+ break;
case 'sd-1':
case 'sd-2':
case 'sdxl':
case 'anima':
default:
- return 8;
+ gridSize = 8;
+ break;
}
+ return gridSize * pidScale;
};
const MIN_AREA_FACTOR = 0.8;
@@ -117,7 +145,7 @@ export const getIsSizeTooLarge = (width: number, height: number, optimalDimensio
* @param optimalDimension The optimal dimension
* @returns Whether the current width and height needs to be resized to the optimal dimension
*/
-export const getIsSizeOptimal = (width: number, height: number, base?: BaseModelType): boolean => {
- const optimalDimension = getOptimalDimension(base);
+export const getIsSizeOptimal = (width: number, height: number, base?: BaseModelType, pidScale = 1): boolean => {
+ const optimalDimension = getOptimalDimension(base, pidScale);
return !getIsSizeTooSmall(width, height, optimalDimension) && !getIsSizeTooLarge(width, height, optimalDimension);
};
diff --git a/invokeai/frontend/web/src/features/parameters/util/pid.ts b/invokeai/frontend/web/src/features/parameters/util/pid.ts
new file mode 100644
index 00000000000..51c6221a451
--- /dev/null
+++ b/invokeai/frontend/web/src/features/parameters/util/pid.ts
@@ -0,0 +1,29 @@
+import type { BaseModelType } from 'features/nodes/types/common';
+
+/**
+ * Maps a main-model base to the PiD decoder base whose checkpoints are valid for it.
+ *
+ * PiD decoders are trained per backbone, so only a base-matching decoder may be used (e.g. a FLUX.2 decoder for a
+ * FLUX.2 main model). Z-Image is the exception: it shares FLUX.1's 16-channel VAE and has no PiD checkpoints of its
+ * own, so it reuses the FLUX decoder. Returns `null` for bases whose graph builder does not (yet) wire a PiD decode.
+ * Additional bases are added here as their graph builders gain PiD support.
+ */
+export const getPidDecoderBaseForMainBase = (base?: BaseModelType | null): BaseModelType | null => {
+ switch (base) {
+ case 'z-image':
+ // Z-Image reuses the FLUX PiD decoder (shared 16-channel VAE) - there is no Z-Image-specific decoder.
+ return 'flux';
+ case 'flux':
+ case 'flux2':
+ case 'sd-3':
+ case 'sdxl':
+ case 'qwen-image':
+ return base;
+ default:
+ return null;
+ }
+};
+
+/** Whether the given main-model base supports PiD decoding (i.e. its graph builder wires a PiD decode). */
+export const getIsPidSupportedBase = (base?: BaseModelType | null): boolean =>
+ getPidDecoderBaseForMainBase(base) !== null;
diff --git a/invokeai/frontend/web/src/features/queue/store/readiness.ts b/invokeai/frontend/web/src/features/queue/store/readiness.ts
index 84bc374158f..5b764899d33 100644
--- a/invokeai/frontend/web/src/features/queue/store/readiness.ts
+++ b/invokeai/frontend/web/src/features/queue/store/readiness.ts
@@ -34,7 +34,7 @@ import { resolveBatchValue } from 'features/nodes/util/node/resolveBatchValue';
import type { UpscaleState } from 'features/parameters/store/upscaleSlice';
import { selectUpscaleSlice } from 'features/parameters/store/upscaleSlice';
import { isFlux2KleinQwen3Compatible } from 'features/parameters/util/flux2Klein';
-import { getGridSize } from 'features/parameters/util/optimalDimension';
+import { getGridSize, getPidScale } from 'features/parameters/util/optimalDimension';
import { selectActiveTab } from 'features/ui/store/uiSelectors';
import type { TabName } from 'features/ui/store/uiTypes';
import i18n from 'i18next';
@@ -287,6 +287,14 @@ export const getReasonsWhyCannotEnqueueGenerateTab = (arg: {
if (!params.fluxVAE) {
reasons.push({ content: i18n.t('parameters.invoke.noFLUXVAEModelSelected') });
}
+ if (params.pidMode !== 'off') {
+ if (!params.pidDecoderModel) {
+ reasons.push({ content: i18n.t('parameters.invoke.noPidDecoderModelSelected') });
+ }
+ if (!params.gemma2EncoderModel) {
+ reasons.push({ content: i18n.t('parameters.invoke.noGemma2EncoderModelSelected') });
+ }
+ }
}
if (model?.base === 'flux2' && model.format !== 'diffusers') {
@@ -301,6 +309,39 @@ export const getReasonsWhyCannotEnqueueGenerateTab = (arg: {
}
}
+ if (model?.base === 'flux2' && params.pidMode !== 'off') {
+ // PiD decode (any FLUX.2 format) needs both a PiD decoder and the Gemma-2 caption encoder.
+ if (!params.pidDecoderModel) {
+ reasons.push({ content: i18n.t('parameters.invoke.noPidDecoderModelSelected') });
+ }
+ if (!params.gemma2EncoderModel) {
+ reasons.push({ content: i18n.t('parameters.invoke.noGemma2EncoderModelSelected') });
+ }
+ }
+
+ if (model?.base === 'sd-3' && params.pidMode !== 'off') {
+ // PiD decode needs both a PiD decoder and the Gemma-2 caption encoder.
+ if (!params.pidDecoderModel) {
+ reasons.push({ content: i18n.t('parameters.invoke.noPidDecoderModelSelected') });
+ }
+ if (!params.gemma2EncoderModel) {
+ reasons.push({ content: i18n.t('parameters.invoke.noGemma2EncoderModelSelected') });
+ }
+ }
+
+ if (model?.base === 'sdxl' && params.pidMode !== 'off') {
+ // PiD decode needs the decoder + Gemma-2 encoder, and is not compatible with the SDXL Refiner.
+ if (!params.pidDecoderModel) {
+ reasons.push({ content: i18n.t('parameters.invoke.noPidDecoderModelSelected') });
+ }
+ if (!params.gemma2EncoderModel) {
+ reasons.push({ content: i18n.t('parameters.invoke.noGemma2EncoderModelSelected') });
+ }
+ if (params.refinerModel) {
+ reasons.push({ content: i18n.t('parameters.invoke.pidIncompatibleWithRefiner') });
+ }
+ }
+
if (model?.base === 'qwen-image' && model.format === 'gguf_quantized') {
// GGUF needs sources for VAE + encoder. Each can come from either a standalone
// model or the Component Source (Diffusers).
@@ -311,6 +352,16 @@ export const getReasonsWhyCannotEnqueueGenerateTab = (arg: {
}
}
+ if (model?.base === 'qwen-image' && params.pidMode !== 'off') {
+ // PiD decode (any Qwen-Image format) needs both a PiD decoder and the Gemma-2 caption encoder.
+ if (!params.pidDecoderModel) {
+ reasons.push({ content: i18n.t('parameters.invoke.noPidDecoderModelSelected') });
+ }
+ if (!params.gemma2EncoderModel) {
+ reasons.push({ content: i18n.t('parameters.invoke.noGemma2EncoderModelSelected') });
+ }
+ }
+
if (model?.base === 'z-image') {
// Check if VAE source is available (either separate VAE or Qwen3 Source)
const hasVaeSource = params.zImageVaeModel !== null || params.zImageQwen3SourceModel !== null;
@@ -322,6 +373,15 @@ export const getReasonsWhyCannotEnqueueGenerateTab = (arg: {
if (!hasQwen3Source) {
reasons.push({ content: i18n.t('parameters.invoke.noZImageQwen3EncoderSourceSelected') });
}
+ // PiD decode (Z-Image reuses the FLUX decoder) needs both a PiD decoder and the Gemma-2 caption encoder.
+ if (params.pidMode !== 'off') {
+ if (!params.pidDecoderModel) {
+ reasons.push({ content: i18n.t('parameters.invoke.noPidDecoderModelSelected') });
+ }
+ if (!params.gemma2EncoderModel) {
+ reasons.push({ content: i18n.t('parameters.invoke.noGemma2EncoderModelSelected') });
+ }
+ }
}
if (model?.base === 'anima') {
@@ -571,7 +631,23 @@ export const getReasonsWhyCannotEnqueueCanvasTab = (arg: {
}
const { bbox } = canvas;
- const gridSize = getGridSize('flux');
+ // In PiD native mode the bbox is the 4x target, so it must snap to a larger grid (16 * 4) for bbox / 4 to land
+ // on the FLUX grid. getPidScale returns 1 for off/fit, leaving the normal 16px grid.
+ const gridSize = getGridSize('flux', getPidScale(params.pidMode));
+
+ if (params.pidMode !== 'off') {
+ if (!params.pidDecoderModel) {
+ reasons.push({ content: i18n.t('parameters.invoke.noPidDecoderModelSelected') });
+ }
+ if (!params.gemma2EncoderModel) {
+ reasons.push({ content: i18n.t('parameters.invoke.noGemma2EncoderModelSelected') });
+ }
+ // PiD decodes at 4x the generation resolution; "Scale Before Processing" would inflate the generation
+ // size and blow up the decode. Require it to be off (None) so generation == bbox.
+ if (bbox.scaleMethod !== 'none') {
+ reasons.push({ content: i18n.t('parameters.invoke.pidScaleBeforeProcessingMustBeOff') });
+ }
+ }
if (bbox.scaleMethod === 'none') {
if (bbox.rect.width % gridSize !== 0) {
@@ -628,7 +704,23 @@ export const getReasonsWhyCannotEnqueueCanvasTab = (arg: {
}
const { bbox } = canvas;
- const gridSize = getGridSize('flux'); // FLUX.2 uses same grid size as FLUX.1
+ // FLUX.2 uses the same 16px grid as FLUX.1. In PiD native mode the bbox is the 4x target, so it must snap to
+ // a larger grid (16 * 4) for bbox / 4 to land on the FLUX grid. getPidScale returns 1 for off/fit.
+ const gridSize = getGridSize('flux2', getPidScale(params.pidMode));
+
+ if (params.pidMode !== 'off') {
+ if (!params.pidDecoderModel) {
+ reasons.push({ content: i18n.t('parameters.invoke.noPidDecoderModelSelected') });
+ }
+ if (!params.gemma2EncoderModel) {
+ reasons.push({ content: i18n.t('parameters.invoke.noGemma2EncoderModelSelected') });
+ }
+ // PiD decodes at 4x the generation resolution; "Scale Before Processing" would inflate the generation
+ // size and blow up the decode. Require it to be off (None) so generation == bbox.
+ if (bbox.scaleMethod !== 'none') {
+ reasons.push({ content: i18n.t('parameters.invoke.pidScaleBeforeProcessingMustBeOff') });
+ }
+ }
if (bbox.scaleMethod === 'none') {
if (bbox.rect.width % gridSize !== 0) {
@@ -671,6 +763,37 @@ export const getReasonsWhyCannotEnqueueCanvasTab = (arg: {
}
}
+ if (model?.base === 'sd-3' && params.pidMode !== 'off') {
+ // PiD decode on the Canvas: needs the decoder + Gemma-2 encoder, and "Scale Before Processing" must be off
+ // (PiD decodes at 4x the generation resolution; scaling would inflate the generation size and blow up the decode).
+ if (!params.pidDecoderModel) {
+ reasons.push({ content: i18n.t('parameters.invoke.noPidDecoderModelSelected') });
+ }
+ if (!params.gemma2EncoderModel) {
+ reasons.push({ content: i18n.t('parameters.invoke.noGemma2EncoderModelSelected') });
+ }
+ if (canvas.bbox.scaleMethod !== 'none') {
+ reasons.push({ content: i18n.t('parameters.invoke.pidScaleBeforeProcessingMustBeOff') });
+ }
+ }
+
+ if (model?.base === 'sdxl' && params.pidMode !== 'off') {
+ // PiD decode on the Canvas: decoder + Gemma-2 encoder required, "Scale Before Processing" off, and not
+ // compatible with the SDXL Refiner.
+ if (!params.pidDecoderModel) {
+ reasons.push({ content: i18n.t('parameters.invoke.noPidDecoderModelSelected') });
+ }
+ if (!params.gemma2EncoderModel) {
+ reasons.push({ content: i18n.t('parameters.invoke.noGemma2EncoderModelSelected') });
+ }
+ if (params.refinerModel) {
+ reasons.push({ content: i18n.t('parameters.invoke.pidIncompatibleWithRefiner') });
+ }
+ if (canvas.bbox.scaleMethod !== 'none') {
+ reasons.push({ content: i18n.t('parameters.invoke.pidScaleBeforeProcessingMustBeOff') });
+ }
+ }
+
if (model?.base === 'cogview4') {
const { bbox } = canvas;
const gridSize = getGridSize('cogview4');
@@ -718,7 +841,21 @@ export const getReasonsWhyCannotEnqueueCanvasTab = (arg: {
if (model?.base === 'qwen-image') {
const { bbox } = canvas;
- const gridSize = getGridSize('qwen-image');
+ // In PiD native mode the bbox is the 4x target, so it must snap to a larger grid (16 * 4) for bbox / 4 to land
+ // on the Qwen grid. getPidScale returns 1 for off/fit, leaving the normal 16px grid.
+ const gridSize = getGridSize('qwen-image', getPidScale(params.pidMode));
+
+ if (params.pidMode !== 'off') {
+ if (!params.pidDecoderModel) {
+ reasons.push({ content: i18n.t('parameters.invoke.noPidDecoderModelSelected') });
+ }
+ if (!params.gemma2EncoderModel) {
+ reasons.push({ content: i18n.t('parameters.invoke.noGemma2EncoderModelSelected') });
+ }
+ if (bbox.scaleMethod !== 'none') {
+ reasons.push({ content: i18n.t('parameters.invoke.pidScaleBeforeProcessingMustBeOff') });
+ }
+ }
if (bbox.scaleMethod === 'none') {
if (bbox.rect.width % gridSize !== 0) {
@@ -782,6 +919,18 @@ export const getReasonsWhyCannotEnqueueCanvasTab = (arg: {
if (!hasQwen3Source) {
reasons.push({ content: i18n.t('parameters.invoke.noZImageQwen3EncoderSourceSelected') });
}
+ // PiD decode on the Canvas: decoder + Gemma-2 encoder required, and "Scale Before Processing" must be off.
+ if (params.pidMode !== 'off') {
+ if (!params.pidDecoderModel) {
+ reasons.push({ content: i18n.t('parameters.invoke.noPidDecoderModelSelected') });
+ }
+ if (!params.gemma2EncoderModel) {
+ reasons.push({ content: i18n.t('parameters.invoke.noGemma2EncoderModelSelected') });
+ }
+ if (canvas.bbox.scaleMethod !== 'none') {
+ reasons.push({ content: i18n.t('parameters.invoke.pidScaleBeforeProcessingMustBeOff') });
+ }
+ }
}
if (model?.base === 'anima') {
diff --git a/invokeai/frontend/web/src/features/settingsAccordions/components/GenerationSettingsAccordion/GenerationSettingsAccordion.tsx b/invokeai/frontend/web/src/features/settingsAccordions/components/GenerationSettingsAccordion/GenerationSettingsAccordion.tsx
index 220008a38b0..61e92080368 100644
--- a/invokeai/frontend/web/src/features/settingsAccordions/components/GenerationSettingsAccordion/GenerationSettingsAccordion.tsx
+++ b/invokeai/frontend/web/src/features/settingsAccordions/components/GenerationSettingsAccordion/GenerationSettingsAccordion.tsx
@@ -19,6 +19,7 @@ import {
} from 'features/controlLayers/store/paramsSlice';
import { LoRAList } from 'features/lora/components/LoRAList';
import LoRASelect from 'features/lora/components/LoRASelect';
+import PidSettings from 'features/parameters/components/Advanced/PidSettings';
import ParamAnimaScheduler from 'features/parameters/components/Core/ParamAnimaScheduler';
import ParamCFGScale from 'features/parameters/components/Core/ParamCFGScale';
import ParamFluxDypeExponent from 'features/parameters/components/Core/ParamFluxDypeExponent';
@@ -32,6 +33,7 @@ import ParamSteps from 'features/parameters/components/Core/ParamSteps';
import ParamZImageScheduler from 'features/parameters/components/Core/ParamZImageScheduler';
import ParamZImageShift from 'features/parameters/components/Core/ParamZImageShift';
import ParamZImageSeedVarianceSettings from 'features/parameters/components/SeedVariance/ParamZImageSeedVarianceSettings';
+import { getIsPidSupportedBase } from 'features/parameters/util/pid';
import { MainModelPicker } from 'features/settingsAccordions/components/GenerationSettingsAccordion/MainModelPicker';
import { useExpanderToggle } from 'features/settingsAccordions/hooks/useExpanderToggle';
import { useStandaloneAccordionToggle } from 'features/settingsAccordions/hooks/useStandaloneAccordionToggle';
@@ -58,6 +60,8 @@ export const GenerationSettingsAccordion = memo(() => {
const fluxDypePreset = useAppSelector(selectFluxDypePreset);
const modelSupportsGuidance = useAppSelector(selectModelSupportsGuidance);
const modelSupportsSteps = useAppSelector(selectModelSupportsSteps);
+ // PiD is available for any base whose graph builder wires a PiD decode; getIsPidSupportedBase decides which bases qualify.
+ const isPidSupported = getIsPidSupportedBase(modelConfig?.base);
const hasExpanderContent = isExternal ? modelSupportsGuidance || modelSupportsSteps : true;
const selectBadges = useMemo(
@@ -120,6 +124,7 @@ export const GenerationSettingsAccordion = memo(() => {
{!isExternal && isFLUX && fluxDypePreset === 'manual' && }
{!isExternal && isFLUX && fluxDypePreset === 'manual' && }
+ {!isExternal && isPidSupported && <PidSettings />}
{!isExternal && isZImage && }
diff --git a/invokeai/frontend/web/src/services/api/hooks/modelsByType.ts b/invokeai/frontend/web/src/services/api/hooks/modelsByType.ts
index ca886789cea..8d5b7556908 100644
--- a/invokeai/frontend/web/src/services/api/hooks/modelsByType.ts
+++ b/invokeai/frontend/web/src/services/api/hooks/modelsByType.ts
@@ -23,10 +23,12 @@ import {
isFluxKontextModelConfig,
isFluxReduxModelConfig,
isFluxVAEModelConfig,
+ isGemma2EncoderModelConfig,
isIPAdapterModelConfig,
isLLaVAModelConfig,
isLoRAModelConfig,
isMainOrExternalModelConfig,
+ isPiDDecoderModelConfig,
isQwen3EncoderModelConfig,
isQwenImageDiffusersMainModelConfig,
isQwenImageVAEModelConfig,
@@ -111,6 +113,8 @@ export const useQwenImageDiffusersModels = () => buildModelsHook(isQwenImageDiff
export const useQwenImageVAEModels = () => buildModelsHook(isQwenImageVAEModelConfig)();
export const useQwenVLEncoderModels = () => buildModelsHook(isQwenVLEncoderModelConfig)();
export const useQwen3EncoderModels = () => buildModelsHook(isQwen3EncoderModelConfig)();
+export const usePiDDecoderModels = buildModelsHook(isPiDDecoderModelConfig);
+export const useGemma2EncoderModels = () => buildModelsHook(isGemma2EncoderModelConfig)();
export const useGlobalReferenceImageModels = buildModelsHook(
(config) => isIPAdapterModelConfig(config) || isFluxReduxModelConfig(config) || isFluxKontextModelConfig(config)
);
diff --git a/invokeai/frontend/web/src/services/api/schema.ts b/invokeai/frontend/web/src/services/api/schema.ts
index 0bcfbb49106..de8d63d29fd 100644
--- a/invokeai/frontend/web/src/services/api/schema.ts
+++ b/invokeai/frontend/web/src/services/api/schema.ts
@@ -3615,7 +3615,7 @@ export type components = {
*/
type: "anima_text_encoder";
};
- AnyModelConfig: components["schemas"]["Main_Diffusers_SD1_Config"] | components["schemas"]["Main_Diffusers_SD2_Config"] | components["schemas"]["Main_Diffusers_SDXL_Config"] | components["schemas"]["Main_Diffusers_SDXLRefiner_Config"] | components["schemas"]["Main_Diffusers_SD3_Config"] | components["schemas"]["Main_Diffusers_FLUX_Config"] | components["schemas"]["Main_Diffusers_Flux2_Config"] | components["schemas"]["Main_Diffusers_CogView4_Config"] | components["schemas"]["Main_Diffusers_QwenImage_Config"] | components["schemas"]["Main_Diffusers_ZImage_Config"] | components["schemas"]["Main_Checkpoint_SD1_Config"] | components["schemas"]["Main_Checkpoint_SD2_Config"] | components["schemas"]["Main_Checkpoint_SDXL_Config"] | components["schemas"]["Main_Checkpoint_SDXLRefiner_Config"] | components["schemas"]["Main_Checkpoint_Flux2_Config"] | components["schemas"]["Main_Checkpoint_FLUX_Config"] | components["schemas"]["Main_Checkpoint_QwenImage_Config"] | components["schemas"]["Main_Checkpoint_ZImage_Config"] | components["schemas"]["Main_Checkpoint_Anima_Config"] | components["schemas"]["Main_BnBNF4_FLUX_Config"] | components["schemas"]["Main_GGUF_Flux2_Config"] | components["schemas"]["Main_GGUF_FLUX_Config"] | components["schemas"]["Main_GGUF_QwenImage_Config"] | components["schemas"]["Main_GGUF_ZImage_Config"] | components["schemas"]["VAE_Checkpoint_SD1_Config"] | components["schemas"]["VAE_Checkpoint_SD2_Config"] | components["schemas"]["VAE_Checkpoint_SDXL_Config"] | components["schemas"]["VAE_Checkpoint_FLUX_Config"] | components["schemas"]["VAE_Checkpoint_Flux2_Config"] | components["schemas"]["VAE_Checkpoint_QwenImage_Config"] | components["schemas"]["VAE_Checkpoint_Anima_Config"] | components["schemas"]["VAE_Diffusers_SD1_Config"] | components["schemas"]["VAE_Diffusers_SDXL_Config"] | components["schemas"]["VAE_Diffusers_Flux2_Config"] | components["schemas"]["ControlNet_Checkpoint_SD1_Config"] | components["schemas"]["ControlNet_Checkpoint_SD2_Config"] | 
components["schemas"]["ControlNet_Checkpoint_SDXL_Config"] | components["schemas"]["ControlNet_Checkpoint_FLUX_Config"] | components["schemas"]["ControlNet_Checkpoint_ZImage_Config"] | components["schemas"]["ControlNet_Diffusers_SD1_Config"] | components["schemas"]["ControlNet_Diffusers_SD2_Config"] | components["schemas"]["ControlNet_Diffusers_SDXL_Config"] | components["schemas"]["ControlNet_Diffusers_FLUX_Config"] | components["schemas"]["LoRA_LyCORIS_SD1_Config"] | components["schemas"]["LoRA_LyCORIS_SD2_Config"] | components["schemas"]["LoRA_LyCORIS_SDXL_Config"] | components["schemas"]["LoRA_LyCORIS_Flux2_Config"] | components["schemas"]["LoRA_LyCORIS_FLUX_Config"] | components["schemas"]["LoRA_LyCORIS_ZImage_Config"] | components["schemas"]["LoRA_LyCORIS_QwenImage_Config"] | components["schemas"]["LoRA_LyCORIS_Anima_Config"] | components["schemas"]["LoRA_OMI_SDXL_Config"] | components["schemas"]["LoRA_OMI_FLUX_Config"] | components["schemas"]["LoRA_Diffusers_SD1_Config"] | components["schemas"]["LoRA_Diffusers_SD2_Config"] | components["schemas"]["LoRA_Diffusers_SDXL_Config"] | components["schemas"]["LoRA_Diffusers_Flux2_Config"] | components["schemas"]["LoRA_Diffusers_FLUX_Config"] | components["schemas"]["LoRA_Diffusers_ZImage_Config"] | components["schemas"]["ControlLoRA_LyCORIS_FLUX_Config"] | components["schemas"]["T5Encoder_T5Encoder_Config"] | components["schemas"]["T5Encoder_BnBLLMint8_Config"] | components["schemas"]["Qwen3Encoder_Qwen3Encoder_Config"] | components["schemas"]["Qwen3Encoder_Checkpoint_Config"] | components["schemas"]["Qwen3Encoder_GGUF_Config"] | components["schemas"]["QwenVLEncoder_Diffusers_Config"] | components["schemas"]["QwenVLEncoder_Checkpoint_Config"] | components["schemas"]["TI_File_SD1_Config"] | components["schemas"]["TI_File_SD2_Config"] | components["schemas"]["TI_File_SDXL_Config"] | components["schemas"]["TI_Folder_SD1_Config"] | components["schemas"]["TI_Folder_SD2_Config"] | 
components["schemas"]["TI_Folder_SDXL_Config"] | components["schemas"]["IPAdapter_InvokeAI_SD1_Config"] | components["schemas"]["IPAdapter_InvokeAI_SD2_Config"] | components["schemas"]["IPAdapter_InvokeAI_SDXL_Config"] | components["schemas"]["IPAdapter_Checkpoint_SD1_Config"] | components["schemas"]["IPAdapter_Checkpoint_SD2_Config"] | components["schemas"]["IPAdapter_Checkpoint_SDXL_Config"] | components["schemas"]["IPAdapter_Checkpoint_FLUX_Config"] | components["schemas"]["T2IAdapter_Diffusers_SD1_Config"] | components["schemas"]["T2IAdapter_Diffusers_SDXL_Config"] | components["schemas"]["Spandrel_Checkpoint_Config"] | components["schemas"]["CLIPEmbed_Diffusers_G_Config"] | components["schemas"]["CLIPEmbed_Diffusers_L_Config"] | components["schemas"]["CLIPVision_Diffusers_Config"] | components["schemas"]["SigLIP_Diffusers_Config"] | components["schemas"]["FLUXRedux_Checkpoint_Config"] | components["schemas"]["LlavaOnevision_Diffusers_Config"] | components["schemas"]["TextLLM_Diffusers_Config"] | components["schemas"]["ExternalApiModelConfig"] | components["schemas"]["Unknown_Config"];
+ AnyModelConfig: components["schemas"]["Main_Diffusers_SD1_Config"] | components["schemas"]["Main_Diffusers_SD2_Config"] | components["schemas"]["Main_Diffusers_SDXL_Config"] | components["schemas"]["Main_Diffusers_SDXLRefiner_Config"] | components["schemas"]["Main_Diffusers_SD3_Config"] | components["schemas"]["Main_Diffusers_FLUX_Config"] | components["schemas"]["Main_Diffusers_Flux2_Config"] | components["schemas"]["Main_Diffusers_CogView4_Config"] | components["schemas"]["Main_Diffusers_QwenImage_Config"] | components["schemas"]["Main_Diffusers_ZImage_Config"] | components["schemas"]["Main_Checkpoint_SD1_Config"] | components["schemas"]["Main_Checkpoint_SD2_Config"] | components["schemas"]["Main_Checkpoint_SDXL_Config"] | components["schemas"]["Main_Checkpoint_SDXLRefiner_Config"] | components["schemas"]["Main_Checkpoint_Flux2_Config"] | components["schemas"]["Main_Checkpoint_FLUX_Config"] | components["schemas"]["Main_Checkpoint_QwenImage_Config"] | components["schemas"]["Main_Checkpoint_ZImage_Config"] | components["schemas"]["Main_Checkpoint_Anima_Config"] | components["schemas"]["Main_BnBNF4_FLUX_Config"] | components["schemas"]["Main_GGUF_Flux2_Config"] | components["schemas"]["Main_GGUF_FLUX_Config"] | components["schemas"]["Main_GGUF_QwenImage_Config"] | components["schemas"]["Main_GGUF_ZImage_Config"] | components["schemas"]["VAE_Checkpoint_SD1_Config"] | components["schemas"]["VAE_Checkpoint_SD2_Config"] | components["schemas"]["VAE_Checkpoint_SDXL_Config"] | components["schemas"]["VAE_Checkpoint_FLUX_Config"] | components["schemas"]["VAE_Checkpoint_Flux2_Config"] | components["schemas"]["VAE_Checkpoint_QwenImage_Config"] | components["schemas"]["VAE_Checkpoint_Anima_Config"] | components["schemas"]["VAE_Diffusers_SD1_Config"] | components["schemas"]["VAE_Diffusers_SDXL_Config"] | components["schemas"]["VAE_Diffusers_Flux2_Config"] | components["schemas"]["PiDDecoder_Checkpoint_FLUX_Config"] | 
components["schemas"]["PiDDecoder_Checkpoint_Flux2_Config"] | components["schemas"]["PiDDecoder_Checkpoint_SD3_Config"] | components["schemas"]["PiDDecoder_Checkpoint_SDXL_Config"] | components["schemas"]["PiDDecoder_Checkpoint_QwenImage_Config"] | components["schemas"]["ControlNet_Checkpoint_SD1_Config"] | components["schemas"]["ControlNet_Checkpoint_SD2_Config"] | components["schemas"]["ControlNet_Checkpoint_SDXL_Config"] | components["schemas"]["ControlNet_Checkpoint_FLUX_Config"] | components["schemas"]["ControlNet_Checkpoint_ZImage_Config"] | components["schemas"]["ControlNet_Diffusers_SD1_Config"] | components["schemas"]["ControlNet_Diffusers_SD2_Config"] | components["schemas"]["ControlNet_Diffusers_SDXL_Config"] | components["schemas"]["ControlNet_Diffusers_FLUX_Config"] | components["schemas"]["LoRA_LyCORIS_SD1_Config"] | components["schemas"]["LoRA_LyCORIS_SD2_Config"] | components["schemas"]["LoRA_LyCORIS_SDXL_Config"] | components["schemas"]["LoRA_LyCORIS_Flux2_Config"] | components["schemas"]["LoRA_LyCORIS_FLUX_Config"] | components["schemas"]["LoRA_LyCORIS_ZImage_Config"] | components["schemas"]["LoRA_LyCORIS_QwenImage_Config"] | components["schemas"]["LoRA_LyCORIS_Anima_Config"] | components["schemas"]["LoRA_OMI_SDXL_Config"] | components["schemas"]["LoRA_OMI_FLUX_Config"] | components["schemas"]["LoRA_Diffusers_SD1_Config"] | components["schemas"]["LoRA_Diffusers_SD2_Config"] | components["schemas"]["LoRA_Diffusers_SDXL_Config"] | components["schemas"]["LoRA_Diffusers_Flux2_Config"] | components["schemas"]["LoRA_Diffusers_FLUX_Config"] | components["schemas"]["LoRA_Diffusers_ZImage_Config"] | components["schemas"]["ControlLoRA_LyCORIS_FLUX_Config"] | components["schemas"]["T5Encoder_T5Encoder_Config"] | components["schemas"]["T5Encoder_BnBLLMint8_Config"] | components["schemas"]["Qwen3Encoder_Qwen3Encoder_Config"] | components["schemas"]["Qwen3Encoder_Checkpoint_Config"] | components["schemas"]["Qwen3Encoder_GGUF_Config"] | 
components["schemas"]["Gemma2Encoder_Gemma2Encoder_Config"] | components["schemas"]["QwenVLEncoder_Diffusers_Config"] | components["schemas"]["QwenVLEncoder_Checkpoint_Config"] | components["schemas"]["TI_File_SD1_Config"] | components["schemas"]["TI_File_SD2_Config"] | components["schemas"]["TI_File_SDXL_Config"] | components["schemas"]["TI_Folder_SD1_Config"] | components["schemas"]["TI_Folder_SD2_Config"] | components["schemas"]["TI_Folder_SDXL_Config"] | components["schemas"]["IPAdapter_InvokeAI_SD1_Config"] | components["schemas"]["IPAdapter_InvokeAI_SD2_Config"] | components["schemas"]["IPAdapter_InvokeAI_SDXL_Config"] | components["schemas"]["IPAdapter_Checkpoint_SD1_Config"] | components["schemas"]["IPAdapter_Checkpoint_SD2_Config"] | components["schemas"]["IPAdapter_Checkpoint_SDXL_Config"] | components["schemas"]["IPAdapter_Checkpoint_FLUX_Config"] | components["schemas"]["T2IAdapter_Diffusers_SD1_Config"] | components["schemas"]["T2IAdapter_Diffusers_SDXL_Config"] | components["schemas"]["Spandrel_Checkpoint_Config"] | components["schemas"]["CLIPEmbed_Diffusers_G_Config"] | components["schemas"]["CLIPEmbed_Diffusers_L_Config"] | components["schemas"]["CLIPVision_Diffusers_Config"] | components["schemas"]["SigLIP_Diffusers_Config"] | components["schemas"]["FLUXRedux_Checkpoint_Config"] | components["schemas"]["LlavaOnevision_Diffusers_Config"] | components["schemas"]["TextLLM_Diffusers_Config"] | components["schemas"]["ExternalApiModelConfig"] | components["schemas"]["Unknown_Config"];
/**
* AppVersion
* @description App Version Response
@@ -10629,6 +10629,91 @@ export type components = {
*/
type: "flux2_klein_text_encoder";
};
+ /**
+ * Latents to Image - FLUX.2 + PiD (4x SR)
+ * @description Decode a FLUX.2 Klein latent with the PiD pixel-diffusion decoder.
+ *
+ * Produces a 4x super-resolved image in a single pass. The stored FLUX.2 latent
+ * is patchified from ``(B, 32, H/8, W/8)`` to the ``(B, 128, H/16, W/16)`` layout
+ * PiD's FLUX.2 backbone expects, then decoded directly (it is already in raw,
+ * BN-denormalized space; see the module docstring).
+ */
+ Flux2PiDDecodeInvocation: {
+ /**
+ * @description The board to save the image to
+ * @default null
+ */
+ board?: components["schemas"]["BoardField"] | null;
+ /**
+ * @description Optional metadata to be saved with the image
+ * @default null
+ */
+ metadata?: components["schemas"]["MetadataField"] | null;
+ /**
+ * Id
+ * @description The id of this instance of an invocation. Must be unique among all instances of invocations.
+ */
+ id: string;
+ /**
+ * Is Intermediate
+ * @description Whether or not this is an intermediate invocation.
+ * @default false
+ */
+ is_intermediate?: boolean;
+ /**
+ * Use Cache
+ * @description Whether or not to use the cache
+ * @default true
+ */
+ use_cache?: boolean;
+ /**
+ * @description Latents tensor
+ * @default null
+ */
+ latents?: components["schemas"]["LatentsField"] | null;
+ /**
+ * Prompt
+ * @description Text prompt the latent was generated from. PiD conditions on it.
+ * @default null
+ */
+ prompt?: string | null;
+ /**
+ * Gemma-2 Encoder
+ * @description Gemma-2 caption encoder. Required by PiD.
+ * @default null
+ */
+ gemma2_encoder?: components["schemas"]["Gemma2EncoderField"] | null;
+ /**
+ * PiD Decoder
+ * @description PiD FLUX.2 decoder checkpoint.
+ * @default null
+ */
+ pid_decoder?: components["schemas"]["PiDDecoderField"] | null;
+ /**
+ * VAE
+ * @description FLUX.2 VAE, used only to read a scalar scaling_factor / shift_factor if one exists. FLUX.2 normalises latents with BatchNorm (already inverted in flux2_denoise), so this is normally an identity transform and the input can be left unconnected.
+ * @default null
+ */
+ vae?: components["schemas"]["VAEField"] | null;
+ /**
+ * Num Inference Steps
+ * @description Number of PiD distill steps. The released checkpoints are trained for 4.
+ * @default 4
+ */
+ num_inference_steps?: number;
+ /**
+ * Seed
+ * @description Seed for the PiD decoder's noise.
+ * @default 0
+ */
+ seed?: number;
+ /**
+ * type
+ * @default flux2_pid_decode
+ * @constant
+ */
+ type: "flux2_pid_decode";
+ };
/**
* Latents to Image - FLUX2
* @description Generates an image from latents using FLUX.2 Klein's 32-channel VAE.
@@ -11770,6 +11855,84 @@ export type components = {
*/
type: "flux_model_loader_output";
};
+ /**
+ * Latents to Image - FLUX + PiD (4x SR)
+ * @description Decode a FLUX latent with the PiD pixel-diffusion decoder.
+ *
+ * The FLUX AutoEncoder usually denormalises the stored latent internally
+ * before its conv decoder runs (`z / scale + shift`); we apply the same
+ * transform manually here so PiD sees the raw latent it was trained on.
+ */
+ FluxPiDDecodeInvocation: {
+ /**
+ * @description The board to save the image to
+ * @default null
+ */
+ board?: components["schemas"]["BoardField"] | null;
+ /**
+ * @description Optional metadata to be saved with the image
+ * @default null
+ */
+ metadata?: components["schemas"]["MetadataField"] | null;
+ /**
+ * Id
+ * @description The id of this instance of an invocation. Must be unique among all instances of invocations.
+ */
+ id: string;
+ /**
+ * Is Intermediate
+ * @description Whether or not this is an intermediate invocation.
+ * @default false
+ */
+ is_intermediate?: boolean;
+ /**
+ * Use Cache
+ * @description Whether or not to use the cache
+ * @default true
+ */
+ use_cache?: boolean;
+ /**
+ * @description Latents tensor
+ * @default null
+ */
+ latents?: components["schemas"]["LatentsField"] | null;
+ /**
+ * Prompt
+ * @description Text prompt the latent was generated from. PiD conditions on it.
+ * @default null
+ */
+ prompt?: string | null;
+ /**
+ * Gemma-2 Encoder
+ * @description Gemma-2 caption encoder. Required by PiD.
+ * @default null
+ */
+ gemma2_encoder?: components["schemas"]["Gemma2EncoderField"] | null;
+ /**
+ * PiD Decoder
+ * @description PiD FLUX decoder checkpoint.
+ * @default null
+ */
+ pid_decoder?: components["schemas"]["PiDDecoderField"] | null;
+ /**
+ * Num Inference Steps
+ * @description Number of PiD distill steps. The released checkpoints are trained for 4.
+ * @default 4
+ */
+ num_inference_steps?: number;
+ /**
+ * Seed
+ * @description Seed for the PiD decoder's noise.
+ * @default 0
+ */
+ seed?: number;
+ /**
+ * type
+ * @default flux_pid_decode
+ * @constant
+ */
+ type: "flux_pid_decode";
+ };
/**
* FluxReduxConditioningField
* @description A FLUX Redux conditioning tensor primitive value
@@ -12235,6 +12398,155 @@ export type components = {
*/
type: "gemini_image_generation";
};
+ /**
+ * Gemma2EncoderField
+ * @description Field for the Gemma-2 text encoder used by PiD decoders.
+ */
+ Gemma2EncoderField: {
+ /** @description Info to load tokenizer submodel */
+ tokenizer: components["schemas"]["ModelIdentifierField"];
+ /** @description Info to load text_encoder submodel */
+ text_encoder: components["schemas"]["ModelIdentifierField"];
+ };
+ /**
+ * Gemma-2 Encoder - PiD
+ * @description Loads a Gemma-2 causal LM directory and exposes its tokenizer + decoder
+ * submodels for use by a PiD decode node.
+ */
+ Gemma2EncoderLoaderInvocation: {
+ /**
+ * Id
+ * @description The id of this instance of an invocation. Must be unique among all instances of invocations.
+ */
+ id: string;
+ /**
+ * Is Intermediate
+ * @description Whether or not this is an intermediate invocation.
+ * @default false
+ */
+ is_intermediate?: boolean;
+ /**
+ * Use Cache
+ * @description Whether or not to use the cache
+ * @default true
+ */
+ use_cache?: boolean;
+ /**
+ * Gemma-2
+ * @description Gemma-2 model used to encode captions for PiD decoders.
+ * @default null
+ */
+ gemma2_model?: components["schemas"]["ModelIdentifierField"] | null;
+ /**
+ * type
+ * @default gemma2_encoder_loader
+ * @constant
+ */
+ type: "gemma2_encoder_loader";
+ };
+ /** Gemma2EncoderOutput */
+ Gemma2EncoderOutput: {
+ /**
+ * Gemma-2 Encoder
+ * @description Gemma-2 text encoder used by PiD decoders
+ */
+ gemma2_encoder: components["schemas"]["Gemma2EncoderField"];
+ /**
+ * type
+ * @default gemma2_encoder_output
+ * @constant
+ */
+ type: "gemma2_encoder_output";
+ };
+ /**
+ * Gemma2Encoder_Gemma2Encoder_Config
+ * @description Standalone Gemma-2 causal LM directory used as a text encoder by PiD.
+ *
+ * Expected directory layout (HuggingFace `from_pretrained`-compatible)::
+ *
+ * /
+ * config.json # architectures: ["Gemma2ForCausalLM"]
+ * tokenizer.json
+ * tokenizer_config.json
+ * model-*.safetensors # or model.safetensors / *.bin
+ */
+ Gemma2Encoder_Gemma2Encoder_Config: {
+ /**
+ * Key
+ * @description A unique key for this model.
+ */
+ key: string;
+ /**
+ * Hash
+ * @description The hash of the model file(s).
+ */
+ hash: string;
+ /**
+ * Path
+ * @description Path to the model on the filesystem. Relative paths are relative to the Invoke root directory.
+ */
+ path: string;
+ /**
+ * File Size
+ * @description The size of the model in bytes.
+ */
+ file_size: number;
+ /**
+ * Name
+ * @description Name of the model.
+ */
+ name: string;
+ /**
+ * Description
+ * @description Model description
+ */
+ description: string | null;
+ /**
+ * Source
+ * @description The original source of the model (path, URL or repo_id).
+ */
+ source: string;
+ /** @description The type of source */
+ source_type: components["schemas"]["ModelSourceType"];
+ /**
+ * Source Api Response
+ * @description The original API response from the source, as stringified JSON.
+ */
+ source_api_response: string | null;
+ /**
+ * Source Url
+ * @description Optional URL for the model (e.g. download page or model page).
+ */
+ source_url: string | null;
+ /**
+ * Cover Image
+ * @description Url for image to preview model
+ */
+ cover_image: string | null;
+ /**
+ * Base
+ * @default any
+ * @constant
+ */
+ base: "any";
+ /**
+ * Type
+ * @default gemma2_encoder
+ * @constant
+ */
+ type: "gemma2_encoder";
+ /**
+ * Format
+ * @default gemma2_encoder
+ * @constant
+ */
+ format: "gemma2_encoder";
+ /**
+ * Cpu Only
+ * @description Whether this model should run on CPU only
+ */
+ cpu_only: boolean | null;
+ };
/**
* GeneratePasswordResponse
* @description Response containing a generated password.
@@ -12334,7 +12646,7 @@ export type components = {
* @description The nodes in this graph
*/
nodes?: {
- [key: string]: components["schemas"]["AddInvocation"] | components["schemas"]["AlibabaCloudImageGenerationInvocation"] | components["schemas"]["AlphaMaskToTensorInvocation"] | components["schemas"]["AnimaDenoiseInvocation"] | components["schemas"]["AnimaImageToLatentsInvocation"] | components["schemas"]["AnimaLatentsToImageInvocation"] | components["schemas"]["AnimaLoRACollectionLoader"] | components["schemas"]["AnimaLoRALoaderInvocation"] | components["schemas"]["AnimaModelLoaderInvocation"] | components["schemas"]["AnimaTextEncoderInvocation"] | components["schemas"]["ApplyMaskTensorToImageInvocation"] | components["schemas"]["ApplyMaskToImageInvocation"] | components["schemas"]["BlankImageInvocation"] | components["schemas"]["BlendLatentsInvocation"] | components["schemas"]["BooleanCollectionInvocation"] | components["schemas"]["BooleanInvocation"] | components["schemas"]["BoundingBoxInvocation"] | components["schemas"]["CLIPSkipInvocation"] | components["schemas"]["CV2InfillInvocation"] | components["schemas"]["CalculateImageTilesEvenSplitInvocation"] | components["schemas"]["CalculateImageTilesInvocation"] | components["schemas"]["CalculateImageTilesMinimumOverlapInvocation"] | components["schemas"]["CannyEdgeDetectionInvocation"] | components["schemas"]["CanvasOutputInvocation"] | components["schemas"]["CanvasPasteBackInvocation"] | components["schemas"]["CanvasV2MaskAndCropInvocation"] | components["schemas"]["CenterPadCropInvocation"] | components["schemas"]["CogView4DenoiseInvocation"] | components["schemas"]["CogView4ImageToLatentsInvocation"] | components["schemas"]["CogView4LatentsToImageInvocation"] | components["schemas"]["CogView4ModelLoaderInvocation"] | components["schemas"]["CogView4TextEncoderInvocation"] | components["schemas"]["CollectInvocation"] | components["schemas"]["ColorCorrectInvocation"] | components["schemas"]["ColorInvocation"] | components["schemas"]["ColorMapInvocation"] | components["schemas"]["CompelInvocation"] | 
components["schemas"]["ConditioningCollectionInvocation"] | components["schemas"]["ConditioningInvocation"] | components["schemas"]["ContentShuffleInvocation"] | components["schemas"]["ControlNetInvocation"] | components["schemas"]["CoreMetadataInvocation"] | components["schemas"]["CreateDenoiseMaskInvocation"] | components["schemas"]["CreateGradientMaskInvocation"] | components["schemas"]["CropImageToBoundingBoxInvocation"] | components["schemas"]["CropLatentsCoreInvocation"] | components["schemas"]["CvInpaintInvocation"] | components["schemas"]["DWOpenposeDetectionInvocation"] | components["schemas"]["DecodeInvisibleWatermarkInvocation"] | components["schemas"]["DenoiseLatentsInvocation"] | components["schemas"]["DenoiseLatentsMetaInvocation"] | components["schemas"]["DepthAnythingDepthEstimationInvocation"] | components["schemas"]["DivideInvocation"] | components["schemas"]["DynamicPromptInvocation"] | components["schemas"]["ESRGANInvocation"] | components["schemas"]["ExpandMaskWithFadeInvocation"] | components["schemas"]["FLUXLoRACollectionLoader"] | components["schemas"]["FaceIdentifierInvocation"] | components["schemas"]["FaceMaskInvocation"] | components["schemas"]["FaceOffInvocation"] | components["schemas"]["FloatBatchInvocation"] | components["schemas"]["FloatCollectionInvocation"] | components["schemas"]["FloatGenerator"] | components["schemas"]["FloatInvocation"] | components["schemas"]["FloatLinearRangeInvocation"] | components["schemas"]["FloatMathInvocation"] | components["schemas"]["FloatToIntegerInvocation"] | components["schemas"]["Flux2DenoiseInvocation"] | components["schemas"]["Flux2KleinLoRACollectionLoader"] | components["schemas"]["Flux2KleinLoRALoaderInvocation"] | components["schemas"]["Flux2KleinModelLoaderInvocation"] | components["schemas"]["Flux2KleinTextEncoderInvocation"] | components["schemas"]["Flux2VaeDecodeInvocation"] | components["schemas"]["Flux2VaeEncodeInvocation"] | components["schemas"]["FluxControlLoRALoaderInvocation"] | 
components["schemas"]["FluxControlNetInvocation"] | components["schemas"]["FluxDenoiseInvocation"] | components["schemas"]["FluxDenoiseLatentsMetaInvocation"] | components["schemas"]["FluxFillInvocation"] | components["schemas"]["FluxIPAdapterInvocation"] | components["schemas"]["FluxKontextConcatenateImagesInvocation"] | components["schemas"]["FluxKontextInvocation"] | components["schemas"]["FluxLoRALoaderInvocation"] | components["schemas"]["FluxModelLoaderInvocation"] | components["schemas"]["FluxReduxInvocation"] | components["schemas"]["FluxTextEncoderInvocation"] | components["schemas"]["FluxVaeDecodeInvocation"] | components["schemas"]["FluxVaeEncodeInvocation"] | components["schemas"]["FreeUInvocation"] | components["schemas"]["GeminiImageGenerationInvocation"] | components["schemas"]["GetMaskBoundingBoxInvocation"] | components["schemas"]["GroundingDinoInvocation"] | components["schemas"]["HEDEdgeDetectionInvocation"] | components["schemas"]["HeuristicResizeInvocation"] | components["schemas"]["IPAdapterInvocation"] | components["schemas"]["IdealSizeInvocation"] | components["schemas"]["IfInvocation"] | components["schemas"]["ImageBatchInvocation"] | components["schemas"]["ImageBlurInvocation"] | components["schemas"]["ImageChannelInvocation"] | components["schemas"]["ImageChannelMultiplyInvocation"] | components["schemas"]["ImageChannelOffsetInvocation"] | components["schemas"]["ImageCollectionInvocation"] | components["schemas"]["ImageConvertInvocation"] | components["schemas"]["ImageCropInvocation"] | components["schemas"]["ImageGenerator"] | components["schemas"]["ImageHueAdjustmentInvocation"] | components["schemas"]["ImageInverseLerpInvocation"] | components["schemas"]["ImageInvocation"] | components["schemas"]["ImageLerpInvocation"] | components["schemas"]["ImageMaskToTensorInvocation"] | components["schemas"]["ImageMultiplyInvocation"] | components["schemas"]["ImageNSFWBlurInvocation"] | components["schemas"]["ImageNoiseInvocation"] | 
components["schemas"]["ImagePanelLayoutInvocation"] | components["schemas"]["ImagePasteInvocation"] | components["schemas"]["ImageResizeInvocation"] | components["schemas"]["ImageScaleInvocation"] | components["schemas"]["ImageToLatentsInvocation"] | components["schemas"]["ImageWatermarkInvocation"] | components["schemas"]["InfillColorInvocation"] | components["schemas"]["InfillPatchMatchInvocation"] | components["schemas"]["InfillTileInvocation"] | components["schemas"]["IntegerBatchInvocation"] | components["schemas"]["IntegerCollectionInvocation"] | components["schemas"]["IntegerGenerator"] | components["schemas"]["IntegerInvocation"] | components["schemas"]["IntegerMathInvocation"] | components["schemas"]["InvertTensorMaskInvocation"] | components["schemas"]["InvokeAdjustImageHuePlusInvocation"] | components["schemas"]["InvokeEquivalentAchromaticLightnessInvocation"] | components["schemas"]["InvokeImageBlendInvocation"] | components["schemas"]["InvokeImageCompositorInvocation"] | components["schemas"]["InvokeImageDilateOrErodeInvocation"] | components["schemas"]["InvokeImageEnhanceInvocation"] | components["schemas"]["InvokeImageValueThresholdsInvocation"] | components["schemas"]["IterateInvocation"] | components["schemas"]["LaMaInfillInvocation"] | components["schemas"]["LatentsCollectionInvocation"] | components["schemas"]["LatentsInvocation"] | components["schemas"]["LatentsToImageInvocation"] | components["schemas"]["LineartAnimeEdgeDetectionInvocation"] | components["schemas"]["LineartEdgeDetectionInvocation"] | components["schemas"]["LlavaOnevisionVllmInvocation"] | components["schemas"]["LoRACollectionLoader"] | components["schemas"]["LoRALoaderInvocation"] | components["schemas"]["LoRASelectorInvocation"] | components["schemas"]["MLSDDetectionInvocation"] | components["schemas"]["MainModelLoaderInvocation"] | components["schemas"]["MaskCombineInvocation"] | components["schemas"]["MaskEdgeInvocation"] | components["schemas"]["MaskFromAlphaInvocation"] | 
components["schemas"]["MaskFromIDInvocation"] | components["schemas"]["MaskTensorToImageInvocation"] | components["schemas"]["MediaPipeFaceDetectionInvocation"] | components["schemas"]["MergeMetadataInvocation"] | components["schemas"]["MergeTilesToImageInvocation"] | components["schemas"]["MetadataFieldExtractorInvocation"] | components["schemas"]["MetadataFromImageInvocation"] | components["schemas"]["MetadataInvocation"] | components["schemas"]["MetadataItemInvocation"] | components["schemas"]["MetadataItemLinkedInvocation"] | components["schemas"]["MetadataToBoolCollectionInvocation"] | components["schemas"]["MetadataToBoolInvocation"] | components["schemas"]["MetadataToControlnetsInvocation"] | components["schemas"]["MetadataToFloatCollectionInvocation"] | components["schemas"]["MetadataToFloatInvocation"] | components["schemas"]["MetadataToIPAdaptersInvocation"] | components["schemas"]["MetadataToIntegerCollectionInvocation"] | components["schemas"]["MetadataToIntegerInvocation"] | components["schemas"]["MetadataToLorasCollectionInvocation"] | components["schemas"]["MetadataToLorasInvocation"] | components["schemas"]["MetadataToModelInvocation"] | components["schemas"]["MetadataToSDXLLorasInvocation"] | components["schemas"]["MetadataToSDXLModelInvocation"] | components["schemas"]["MetadataToSchedulerInvocation"] | components["schemas"]["MetadataToStringCollectionInvocation"] | components["schemas"]["MetadataToStringInvocation"] | components["schemas"]["MetadataToT2IAdaptersInvocation"] | components["schemas"]["MetadataToVAEInvocation"] | components["schemas"]["ModelIdentifierInvocation"] | components["schemas"]["MultiplyInvocation"] | components["schemas"]["NoiseInvocation"] | components["schemas"]["NormalMapInvocation"] | components["schemas"]["OklabUnsharpMaskInvocation"] | components["schemas"]["OklchImageHueAdjustmentInvocation"] | components["schemas"]["OpenAIImageGenerationInvocation"] | components["schemas"]["PBRMapsInvocation"] | 
components["schemas"]["PairTileImageInvocation"] | components["schemas"]["PasteImageIntoBoundingBoxInvocation"] | components["schemas"]["PiDiNetEdgeDetectionInvocation"] | components["schemas"]["PromptTemplateInvocation"] | components["schemas"]["PromptsFromFileInvocation"] | components["schemas"]["QwenImageDenoiseInvocation"] | components["schemas"]["QwenImageImageToLatentsInvocation"] | components["schemas"]["QwenImageLatentsToImageInvocation"] | components["schemas"]["QwenImageLoRACollectionLoader"] | components["schemas"]["QwenImageLoRALoaderInvocation"] | components["schemas"]["QwenImageModelLoaderInvocation"] | components["schemas"]["QwenImageTextEncoderInvocation"] | components["schemas"]["RandomFloatInvocation"] | components["schemas"]["RandomIntInvocation"] | components["schemas"]["RandomRangeInvocation"] | components["schemas"]["RangeInvocation"] | components["schemas"]["RangeOfSizeInvocation"] | components["schemas"]["RectangleMaskInvocation"] | components["schemas"]["ResizeLatentsInvocation"] | components["schemas"]["RoundInvocation"] | components["schemas"]["SD3DenoiseInvocation"] | components["schemas"]["SD3ImageToLatentsInvocation"] | components["schemas"]["SD3LatentsToImageInvocation"] | components["schemas"]["SDXLCompelPromptInvocation"] | components["schemas"]["SDXLLoRACollectionLoader"] | components["schemas"]["SDXLLoRALoaderInvocation"] | components["schemas"]["SDXLModelLoaderInvocation"] | components["schemas"]["SDXLRefinerCompelPromptInvocation"] | components["schemas"]["SDXLRefinerModelLoaderInvocation"] | components["schemas"]["SaveImageInvocation"] | components["schemas"]["SaveImageToFileInvocation"] | components["schemas"]["ScaleLatentsInvocation"] | components["schemas"]["SchedulerInvocation"] | components["schemas"]["Sd3ModelLoaderInvocation"] | components["schemas"]["Sd3TextEncoderInvocation"] | components["schemas"]["SeamlessModeInvocation"] | components["schemas"]["SeedreamImageGenerationInvocation"] | 
components["schemas"]["SegmentAnythingInvocation"] | components["schemas"]["ShowImageInvocation"] | components["schemas"]["SpandrelImageToImageAutoscaleInvocation"] | components["schemas"]["SpandrelImageToImageInvocation"] | components["schemas"]["StringBatchInvocation"] | components["schemas"]["StringCollectionInvocation"] | components["schemas"]["StringGenerator"] | components["schemas"]["StringInvocation"] | components["schemas"]["StringJoinInvocation"] | components["schemas"]["StringJoinThreeInvocation"] | components["schemas"]["StringReplaceInvocation"] | components["schemas"]["StringSplitInvocation"] | components["schemas"]["StringSplitNegInvocation"] | components["schemas"]["SubtractInvocation"] | components["schemas"]["T2IAdapterInvocation"] | components["schemas"]["TextLLMInvocation"] | components["schemas"]["TileToPropertiesInvocation"] | components["schemas"]["TiledMultiDiffusionDenoiseLatents"] | components["schemas"]["UnsharpMaskInvocation"] | components["schemas"]["VAELoaderInvocation"] | components["schemas"]["ZImageControlInvocation"] | components["schemas"]["ZImageDenoiseInvocation"] | components["schemas"]["ZImageDenoiseMetaInvocation"] | components["schemas"]["ZImageImageToLatentsInvocation"] | components["schemas"]["ZImageLatentsToImageInvocation"] | components["schemas"]["ZImageLoRACollectionLoader"] | components["schemas"]["ZImageLoRALoaderInvocation"] | components["schemas"]["ZImageModelLoaderInvocation"] | components["schemas"]["ZImageSeedVarianceEnhancerInvocation"] | components["schemas"]["ZImageTextEncoderInvocation"];
+ [key: string]: components["schemas"]["AddInvocation"] | components["schemas"]["AlibabaCloudImageGenerationInvocation"] | components["schemas"]["AlphaMaskToTensorInvocation"] | components["schemas"]["AnimaDenoiseInvocation"] | components["schemas"]["AnimaImageToLatentsInvocation"] | components["schemas"]["AnimaLatentsToImageInvocation"] | components["schemas"]["AnimaLoRACollectionLoader"] | components["schemas"]["AnimaLoRALoaderInvocation"] | components["schemas"]["AnimaModelLoaderInvocation"] | components["schemas"]["AnimaTextEncoderInvocation"] | components["schemas"]["ApplyMaskTensorToImageInvocation"] | components["schemas"]["ApplyMaskToImageInvocation"] | components["schemas"]["BlankImageInvocation"] | components["schemas"]["BlendLatentsInvocation"] | components["schemas"]["BooleanCollectionInvocation"] | components["schemas"]["BooleanInvocation"] | components["schemas"]["BoundingBoxInvocation"] | components["schemas"]["CLIPSkipInvocation"] | components["schemas"]["CV2InfillInvocation"] | components["schemas"]["CalculateImageTilesEvenSplitInvocation"] | components["schemas"]["CalculateImageTilesInvocation"] | components["schemas"]["CalculateImageTilesMinimumOverlapInvocation"] | components["schemas"]["CannyEdgeDetectionInvocation"] | components["schemas"]["CanvasOutputInvocation"] | components["schemas"]["CanvasPasteBackInvocation"] | components["schemas"]["CanvasV2MaskAndCropInvocation"] | components["schemas"]["CenterPadCropInvocation"] | components["schemas"]["CogView4DenoiseInvocation"] | components["schemas"]["CogView4ImageToLatentsInvocation"] | components["schemas"]["CogView4LatentsToImageInvocation"] | components["schemas"]["CogView4ModelLoaderInvocation"] | components["schemas"]["CogView4TextEncoderInvocation"] | components["schemas"]["CollectInvocation"] | components["schemas"]["ColorCorrectInvocation"] | components["schemas"]["ColorInvocation"] | components["schemas"]["ColorMapInvocation"] | components["schemas"]["CompelInvocation"] | 
components["schemas"]["ConditioningCollectionInvocation"] | components["schemas"]["ConditioningInvocation"] | components["schemas"]["ContentShuffleInvocation"] | components["schemas"]["ControlNetInvocation"] | components["schemas"]["CoreMetadataInvocation"] | components["schemas"]["CreateDenoiseMaskInvocation"] | components["schemas"]["CreateGradientMaskInvocation"] | components["schemas"]["CropImageToBoundingBoxInvocation"] | components["schemas"]["CropLatentsCoreInvocation"] | components["schemas"]["CvInpaintInvocation"] | components["schemas"]["DWOpenposeDetectionInvocation"] | components["schemas"]["DecodeInvisibleWatermarkInvocation"] | components["schemas"]["DenoiseLatentsInvocation"] | components["schemas"]["DenoiseLatentsMetaInvocation"] | components["schemas"]["DepthAnythingDepthEstimationInvocation"] | components["schemas"]["DivideInvocation"] | components["schemas"]["DynamicPromptInvocation"] | components["schemas"]["ESRGANInvocation"] | components["schemas"]["ExpandMaskWithFadeInvocation"] | components["schemas"]["FLUXLoRACollectionLoader"] | components["schemas"]["FaceIdentifierInvocation"] | components["schemas"]["FaceMaskInvocation"] | components["schemas"]["FaceOffInvocation"] | components["schemas"]["FloatBatchInvocation"] | components["schemas"]["FloatCollectionInvocation"] | components["schemas"]["FloatGenerator"] | components["schemas"]["FloatInvocation"] | components["schemas"]["FloatLinearRangeInvocation"] | components["schemas"]["FloatMathInvocation"] | components["schemas"]["FloatToIntegerInvocation"] | components["schemas"]["Flux2DenoiseInvocation"] | components["schemas"]["Flux2KleinLoRACollectionLoader"] | components["schemas"]["Flux2KleinLoRALoaderInvocation"] | components["schemas"]["Flux2KleinModelLoaderInvocation"] | components["schemas"]["Flux2KleinTextEncoderInvocation"] | components["schemas"]["Flux2PiDDecodeInvocation"] | components["schemas"]["Flux2VaeDecodeInvocation"] | components["schemas"]["Flux2VaeEncodeInvocation"] | 
components["schemas"]["FluxControlLoRALoaderInvocation"] | components["schemas"]["FluxControlNetInvocation"] | components["schemas"]["FluxDenoiseInvocation"] | components["schemas"]["FluxDenoiseLatentsMetaInvocation"] | components["schemas"]["FluxFillInvocation"] | components["schemas"]["FluxIPAdapterInvocation"] | components["schemas"]["FluxKontextConcatenateImagesInvocation"] | components["schemas"]["FluxKontextInvocation"] | components["schemas"]["FluxLoRALoaderInvocation"] | components["schemas"]["FluxModelLoaderInvocation"] | components["schemas"]["FluxPiDDecodeInvocation"] | components["schemas"]["FluxReduxInvocation"] | components["schemas"]["FluxTextEncoderInvocation"] | components["schemas"]["FluxVaeDecodeInvocation"] | components["schemas"]["FluxVaeEncodeInvocation"] | components["schemas"]["FreeUInvocation"] | components["schemas"]["GeminiImageGenerationInvocation"] | components["schemas"]["Gemma2EncoderLoaderInvocation"] | components["schemas"]["GetMaskBoundingBoxInvocation"] | components["schemas"]["GroundingDinoInvocation"] | components["schemas"]["HEDEdgeDetectionInvocation"] | components["schemas"]["HeuristicResizeInvocation"] | components["schemas"]["IPAdapterInvocation"] | components["schemas"]["IdealSizeInvocation"] | components["schemas"]["IfInvocation"] | components["schemas"]["ImageBatchInvocation"] | components["schemas"]["ImageBlurInvocation"] | components["schemas"]["ImageChannelInvocation"] | components["schemas"]["ImageChannelMultiplyInvocation"] | components["schemas"]["ImageChannelOffsetInvocation"] | components["schemas"]["ImageCollectionInvocation"] | components["schemas"]["ImageConvertInvocation"] | components["schemas"]["ImageCropInvocation"] | components["schemas"]["ImageGenerator"] | components["schemas"]["ImageHueAdjustmentInvocation"] | components["schemas"]["ImageInverseLerpInvocation"] | components["schemas"]["ImageInvocation"] | components["schemas"]["ImageLerpInvocation"] | 
components["schemas"]["ImageMaskToTensorInvocation"] | components["schemas"]["ImageMultiplyInvocation"] | components["schemas"]["ImageNSFWBlurInvocation"] | components["schemas"]["ImageNoiseInvocation"] | components["schemas"]["ImagePanelLayoutInvocation"] | components["schemas"]["ImagePasteInvocation"] | components["schemas"]["ImageResizeInvocation"] | components["schemas"]["ImageScaleInvocation"] | components["schemas"]["ImageToLatentsInvocation"] | components["schemas"]["ImageWatermarkInvocation"] | components["schemas"]["InfillColorInvocation"] | components["schemas"]["InfillPatchMatchInvocation"] | components["schemas"]["InfillTileInvocation"] | components["schemas"]["IntegerBatchInvocation"] | components["schemas"]["IntegerCollectionInvocation"] | components["schemas"]["IntegerGenerator"] | components["schemas"]["IntegerInvocation"] | components["schemas"]["IntegerMathInvocation"] | components["schemas"]["InvertTensorMaskInvocation"] | components["schemas"]["InvokeAdjustImageHuePlusInvocation"] | components["schemas"]["InvokeEquivalentAchromaticLightnessInvocation"] | components["schemas"]["InvokeImageBlendInvocation"] | components["schemas"]["InvokeImageCompositorInvocation"] | components["schemas"]["InvokeImageDilateOrErodeInvocation"] | components["schemas"]["InvokeImageEnhanceInvocation"] | components["schemas"]["InvokeImageValueThresholdsInvocation"] | components["schemas"]["IterateInvocation"] | components["schemas"]["LaMaInfillInvocation"] | components["schemas"]["LatentsCollectionInvocation"] | components["schemas"]["LatentsInvocation"] | components["schemas"]["LatentsToImageInvocation"] | components["schemas"]["LineartAnimeEdgeDetectionInvocation"] | components["schemas"]["LineartEdgeDetectionInvocation"] | components["schemas"]["LlavaOnevisionVllmInvocation"] | components["schemas"]["LoRACollectionLoader"] | components["schemas"]["LoRALoaderInvocation"] | components["schemas"]["LoRASelectorInvocation"] | 
components["schemas"]["MLSDDetectionInvocation"] | components["schemas"]["MainModelLoaderInvocation"] | components["schemas"]["MaskCombineInvocation"] | components["schemas"]["MaskEdgeInvocation"] | components["schemas"]["MaskFromAlphaInvocation"] | components["schemas"]["MaskFromIDInvocation"] | components["schemas"]["MaskTensorToImageInvocation"] | components["schemas"]["MediaPipeFaceDetectionInvocation"] | components["schemas"]["MergeMetadataInvocation"] | components["schemas"]["MergeTilesToImageInvocation"] | components["schemas"]["MetadataFieldExtractorInvocation"] | components["schemas"]["MetadataFromImageInvocation"] | components["schemas"]["MetadataInvocation"] | components["schemas"]["MetadataItemInvocation"] | components["schemas"]["MetadataItemLinkedInvocation"] | components["schemas"]["MetadataToBoolCollectionInvocation"] | components["schemas"]["MetadataToBoolInvocation"] | components["schemas"]["MetadataToControlnetsInvocation"] | components["schemas"]["MetadataToFloatCollectionInvocation"] | components["schemas"]["MetadataToFloatInvocation"] | components["schemas"]["MetadataToIPAdaptersInvocation"] | components["schemas"]["MetadataToIntegerCollectionInvocation"] | components["schemas"]["MetadataToIntegerInvocation"] | components["schemas"]["MetadataToLorasCollectionInvocation"] | components["schemas"]["MetadataToLorasInvocation"] | components["schemas"]["MetadataToModelInvocation"] | components["schemas"]["MetadataToSDXLLorasInvocation"] | components["schemas"]["MetadataToSDXLModelInvocation"] | components["schemas"]["MetadataToSchedulerInvocation"] | components["schemas"]["MetadataToStringCollectionInvocation"] | components["schemas"]["MetadataToStringInvocation"] | components["schemas"]["MetadataToT2IAdaptersInvocation"] | components["schemas"]["MetadataToVAEInvocation"] | components["schemas"]["ModelIdentifierInvocation"] | components["schemas"]["MultiplyInvocation"] | components["schemas"]["NoiseInvocation"] | 
components["schemas"]["NormalMapInvocation"] | components["schemas"]["OklabUnsharpMaskInvocation"] | components["schemas"]["OklchImageHueAdjustmentInvocation"] | components["schemas"]["OpenAIImageGenerationInvocation"] | components["schemas"]["PBRMapsInvocation"] | components["schemas"]["PairTileImageInvocation"] | components["schemas"]["PasteImageIntoBoundingBoxInvocation"] | components["schemas"]["PiDDecoderLoaderInvocation"] | components["schemas"]["PiDUpscaleInvocation"] | components["schemas"]["PiDiNetEdgeDetectionInvocation"] | components["schemas"]["PromptTemplateInvocation"] | components["schemas"]["PromptsFromFileInvocation"] | components["schemas"]["QwenImageDenoiseInvocation"] | components["schemas"]["QwenImageImageToLatentsInvocation"] | components["schemas"]["QwenImageLatentsToImageInvocation"] | components["schemas"]["QwenImageLoRACollectionLoader"] | components["schemas"]["QwenImageLoRALoaderInvocation"] | components["schemas"]["QwenImageModelLoaderInvocation"] | components["schemas"]["QwenImagePiDDecodeInvocation"] | components["schemas"]["QwenImageTextEncoderInvocation"] | components["schemas"]["RandomFloatInvocation"] | components["schemas"]["RandomIntInvocation"] | components["schemas"]["RandomRangeInvocation"] | components["schemas"]["RangeInvocation"] | components["schemas"]["RangeOfSizeInvocation"] | components["schemas"]["RectangleMaskInvocation"] | components["schemas"]["ResizeLatentsInvocation"] | components["schemas"]["RoundInvocation"] | components["schemas"]["SD3DenoiseInvocation"] | components["schemas"]["SD3ImageToLatentsInvocation"] | components["schemas"]["SD3LatentsToImageInvocation"] | components["schemas"]["SD3PiDDecodeInvocation"] | components["schemas"]["SDXLCompelPromptInvocation"] | components["schemas"]["SDXLLoRACollectionLoader"] | components["schemas"]["SDXLLoRALoaderInvocation"] | components["schemas"]["SDXLModelLoaderInvocation"] | components["schemas"]["SDXLPiDDecodeInvocation"] | 
components["schemas"]["SDXLRefinerCompelPromptInvocation"] | components["schemas"]["SDXLRefinerModelLoaderInvocation"] | components["schemas"]["SaveImageInvocation"] | components["schemas"]["SaveImageToFileInvocation"] | components["schemas"]["ScaleLatentsInvocation"] | components["schemas"]["SchedulerInvocation"] | components["schemas"]["Sd3ModelLoaderInvocation"] | components["schemas"]["Sd3TextEncoderInvocation"] | components["schemas"]["SeamlessModeInvocation"] | components["schemas"]["SeedreamImageGenerationInvocation"] | components["schemas"]["SegmentAnythingInvocation"] | components["schemas"]["ShowImageInvocation"] | components["schemas"]["SpandrelImageToImageAutoscaleInvocation"] | components["schemas"]["SpandrelImageToImageInvocation"] | components["schemas"]["StringBatchInvocation"] | components["schemas"]["StringCollectionInvocation"] | components["schemas"]["StringGenerator"] | components["schemas"]["StringInvocation"] | components["schemas"]["StringJoinInvocation"] | components["schemas"]["StringJoinThreeInvocation"] | components["schemas"]["StringReplaceInvocation"] | components["schemas"]["StringSplitInvocation"] | components["schemas"]["StringSplitNegInvocation"] | components["schemas"]["SubtractInvocation"] | components["schemas"]["T2IAdapterInvocation"] | components["schemas"]["TextLLMInvocation"] | components["schemas"]["TileToPropertiesInvocation"] | components["schemas"]["TiledMultiDiffusionDenoiseLatents"] | components["schemas"]["UnsharpMaskInvocation"] | components["schemas"]["VAELoaderInvocation"] | components["schemas"]["ZImageControlInvocation"] | components["schemas"]["ZImageDenoiseInvocation"] | components["schemas"]["ZImageDenoiseMetaInvocation"] | components["schemas"]["ZImageImageToLatentsInvocation"] | components["schemas"]["ZImageLatentsToImageInvocation"] | components["schemas"]["ZImageLoRACollectionLoader"] | components["schemas"]["ZImageLoRALoaderInvocation"] | components["schemas"]["ZImageModelLoaderInvocation"] | 
components["schemas"]["ZImagePiDDecodeInvocation"] | components["schemas"]["ZImageSeedVarianceEnhancerInvocation"] | components["schemas"]["ZImageTextEncoderInvocation"];
};
/**
* Edges
@@ -12371,7 +12683,7 @@ export type components = {
* @description The results of node executions
*/
results: {
- [key: string]: components["schemas"]["AnimaConditioningOutput"] | components["schemas"]["AnimaLoRALoaderOutput"] | components["schemas"]["AnimaModelLoaderOutput"] | components["schemas"]["BooleanCollectionOutput"] | components["schemas"]["BooleanOutput"] | components["schemas"]["BoundingBoxCollectionOutput"] | components["schemas"]["BoundingBoxOutput"] | components["schemas"]["CLIPOutput"] | components["schemas"]["CLIPSkipInvocationOutput"] | components["schemas"]["CalculateImageTilesOutput"] | components["schemas"]["CogView4ConditioningOutput"] | components["schemas"]["CogView4ModelLoaderOutput"] | components["schemas"]["CollectInvocationOutput"] | components["schemas"]["ColorCollectionOutput"] | components["schemas"]["ColorOutput"] | components["schemas"]["ConditioningCollectionOutput"] | components["schemas"]["ConditioningOutput"] | components["schemas"]["ControlOutput"] | components["schemas"]["DenoiseMaskOutput"] | components["schemas"]["FaceMaskOutput"] | components["schemas"]["FaceOffOutput"] | components["schemas"]["FloatCollectionOutput"] | components["schemas"]["FloatGeneratorOutput"] | components["schemas"]["FloatOutput"] | components["schemas"]["Flux2KleinLoRALoaderOutput"] | components["schemas"]["Flux2KleinModelLoaderOutput"] | components["schemas"]["FluxConditioningCollectionOutput"] | components["schemas"]["FluxConditioningOutput"] | components["schemas"]["FluxControlLoRALoaderOutput"] | components["schemas"]["FluxControlNetOutput"] | components["schemas"]["FluxFillOutput"] | components["schemas"]["FluxKontextOutput"] | components["schemas"]["FluxLoRALoaderOutput"] | components["schemas"]["FluxModelLoaderOutput"] | components["schemas"]["FluxReduxOutput"] | components["schemas"]["GradientMaskOutput"] | components["schemas"]["IPAdapterOutput"] | components["schemas"]["IdealSizeOutput"] | components["schemas"]["IfInvocationOutput"] | components["schemas"]["ImageCollectionOutput"] | components["schemas"]["ImageGeneratorOutput"] | 
components["schemas"]["ImageOutput"] | components["schemas"]["ImagePanelCoordinateOutput"] | components["schemas"]["IntegerCollectionOutput"] | components["schemas"]["IntegerGeneratorOutput"] | components["schemas"]["IntegerOutput"] | components["schemas"]["IterateInvocationOutput"] | components["schemas"]["LatentsCollectionOutput"] | components["schemas"]["LatentsMetaOutput"] | components["schemas"]["LatentsOutput"] | components["schemas"]["LoRALoaderOutput"] | components["schemas"]["LoRASelectorOutput"] | components["schemas"]["MDControlListOutput"] | components["schemas"]["MDIPAdapterListOutput"] | components["schemas"]["MDT2IAdapterListOutput"] | components["schemas"]["MaskOutput"] | components["schemas"]["MetadataItemOutput"] | components["schemas"]["MetadataOutput"] | components["schemas"]["MetadataToLorasCollectionOutput"] | components["schemas"]["MetadataToModelOutput"] | components["schemas"]["MetadataToSDXLModelOutput"] | components["schemas"]["ModelIdentifierOutput"] | components["schemas"]["ModelLoaderOutput"] | components["schemas"]["NoiseOutput"] | components["schemas"]["PBRMapsOutput"] | components["schemas"]["PairTileImageOutput"] | components["schemas"]["PromptTemplateOutput"] | components["schemas"]["QwenImageConditioningOutput"] | components["schemas"]["QwenImageLoRALoaderOutput"] | components["schemas"]["QwenImageModelLoaderOutput"] | components["schemas"]["SD3ConditioningOutput"] | components["schemas"]["SDXLLoRALoaderOutput"] | components["schemas"]["SDXLModelLoaderOutput"] | components["schemas"]["SDXLRefinerModelLoaderOutput"] | components["schemas"]["SchedulerOutput"] | components["schemas"]["Sd3ModelLoaderOutput"] | components["schemas"]["SeamlessModeOutput"] | components["schemas"]["String2Output"] | components["schemas"]["StringCollectionOutput"] | components["schemas"]["StringGeneratorOutput"] | components["schemas"]["StringOutput"] | components["schemas"]["StringPosNegOutput"] | components["schemas"]["T2IAdapterOutput"] | 
components["schemas"]["TileToPropertiesOutput"] | components["schemas"]["UNetOutput"] | components["schemas"]["VAEOutput"] | components["schemas"]["ZImageConditioningOutput"] | components["schemas"]["ZImageControlOutput"] | components["schemas"]["ZImageLoRALoaderOutput"] | components["schemas"]["ZImageModelLoaderOutput"];
+ [key: string]: components["schemas"]["AnimaConditioningOutput"] | components["schemas"]["AnimaLoRALoaderOutput"] | components["schemas"]["AnimaModelLoaderOutput"] | components["schemas"]["BooleanCollectionOutput"] | components["schemas"]["BooleanOutput"] | components["schemas"]["BoundingBoxCollectionOutput"] | components["schemas"]["BoundingBoxOutput"] | components["schemas"]["CLIPOutput"] | components["schemas"]["CLIPSkipInvocationOutput"] | components["schemas"]["CalculateImageTilesOutput"] | components["schemas"]["CogView4ConditioningOutput"] | components["schemas"]["CogView4ModelLoaderOutput"] | components["schemas"]["CollectInvocationOutput"] | components["schemas"]["ColorCollectionOutput"] | components["schemas"]["ColorOutput"] | components["schemas"]["ConditioningCollectionOutput"] | components["schemas"]["ConditioningOutput"] | components["schemas"]["ControlOutput"] | components["schemas"]["DenoiseMaskOutput"] | components["schemas"]["FaceMaskOutput"] | components["schemas"]["FaceOffOutput"] | components["schemas"]["FloatCollectionOutput"] | components["schemas"]["FloatGeneratorOutput"] | components["schemas"]["FloatOutput"] | components["schemas"]["Flux2KleinLoRALoaderOutput"] | components["schemas"]["Flux2KleinModelLoaderOutput"] | components["schemas"]["FluxConditioningCollectionOutput"] | components["schemas"]["FluxConditioningOutput"] | components["schemas"]["FluxControlLoRALoaderOutput"] | components["schemas"]["FluxControlNetOutput"] | components["schemas"]["FluxFillOutput"] | components["schemas"]["FluxKontextOutput"] | components["schemas"]["FluxLoRALoaderOutput"] | components["schemas"]["FluxModelLoaderOutput"] | components["schemas"]["FluxReduxOutput"] | components["schemas"]["Gemma2EncoderOutput"] | components["schemas"]["GradientMaskOutput"] | components["schemas"]["IPAdapterOutput"] | components["schemas"]["IdealSizeOutput"] | components["schemas"]["IfInvocationOutput"] | components["schemas"]["ImageCollectionOutput"] | 
components["schemas"]["ImageGeneratorOutput"] | components["schemas"]["ImageOutput"] | components["schemas"]["ImagePanelCoordinateOutput"] | components["schemas"]["IntegerCollectionOutput"] | components["schemas"]["IntegerGeneratorOutput"] | components["schemas"]["IntegerOutput"] | components["schemas"]["IterateInvocationOutput"] | components["schemas"]["LatentsCollectionOutput"] | components["schemas"]["LatentsMetaOutput"] | components["schemas"]["LatentsOutput"] | components["schemas"]["LoRALoaderOutput"] | components["schemas"]["LoRASelectorOutput"] | components["schemas"]["MDControlListOutput"] | components["schemas"]["MDIPAdapterListOutput"] | components["schemas"]["MDT2IAdapterListOutput"] | components["schemas"]["MaskOutput"] | components["schemas"]["MetadataItemOutput"] | components["schemas"]["MetadataOutput"] | components["schemas"]["MetadataToLorasCollectionOutput"] | components["schemas"]["MetadataToModelOutput"] | components["schemas"]["MetadataToSDXLModelOutput"] | components["schemas"]["ModelIdentifierOutput"] | components["schemas"]["ModelLoaderOutput"] | components["schemas"]["NoiseOutput"] | components["schemas"]["PBRMapsOutput"] | components["schemas"]["PairTileImageOutput"] | components["schemas"]["PiDDecoderOutput"] | components["schemas"]["PromptTemplateOutput"] | components["schemas"]["QwenImageConditioningOutput"] | components["schemas"]["QwenImageLoRALoaderOutput"] | components["schemas"]["QwenImageModelLoaderOutput"] | components["schemas"]["SD3ConditioningOutput"] | components["schemas"]["SDXLLoRALoaderOutput"] | components["schemas"]["SDXLModelLoaderOutput"] | components["schemas"]["SDXLRefinerModelLoaderOutput"] | components["schemas"]["SchedulerOutput"] | components["schemas"]["Sd3ModelLoaderOutput"] | components["schemas"]["SeamlessModeOutput"] | components["schemas"]["String2Output"] | components["schemas"]["StringCollectionOutput"] | components["schemas"]["StringGeneratorOutput"] | components["schemas"]["StringOutput"] | 
components["schemas"]["StringPosNegOutput"] | components["schemas"]["T2IAdapterOutput"] | components["schemas"]["TileToPropertiesOutput"] | components["schemas"]["UNetOutput"] | components["schemas"]["VAEOutput"] | components["schemas"]["ZImageConditioningOutput"] | components["schemas"]["ZImageControlOutput"] | components["schemas"]["ZImageLoRALoaderOutput"] | components["schemas"]["ZImageModelLoaderOutput"];
};
/**
* Errors
@@ -15781,7 +16093,7 @@ export type components = {
* Invocation
* @description The ID of the invocation
*/
- invocation: components["schemas"]["AddInvocation"] | components["schemas"]["AlibabaCloudImageGenerationInvocation"] | components["schemas"]["AlphaMaskToTensorInvocation"] | components["schemas"]["AnimaDenoiseInvocation"] | components["schemas"]["AnimaImageToLatentsInvocation"] | components["schemas"]["AnimaLatentsToImageInvocation"] | components["schemas"]["AnimaLoRACollectionLoader"] | components["schemas"]["AnimaLoRALoaderInvocation"] | components["schemas"]["AnimaModelLoaderInvocation"] | components["schemas"]["AnimaTextEncoderInvocation"] | components["schemas"]["ApplyMaskTensorToImageInvocation"] | components["schemas"]["ApplyMaskToImageInvocation"] | components["schemas"]["BlankImageInvocation"] | components["schemas"]["BlendLatentsInvocation"] | components["schemas"]["BooleanCollectionInvocation"] | components["schemas"]["BooleanInvocation"] | components["schemas"]["BoundingBoxInvocation"] | components["schemas"]["CLIPSkipInvocation"] | components["schemas"]["CV2InfillInvocation"] | components["schemas"]["CalculateImageTilesEvenSplitInvocation"] | components["schemas"]["CalculateImageTilesInvocation"] | components["schemas"]["CalculateImageTilesMinimumOverlapInvocation"] | components["schemas"]["CannyEdgeDetectionInvocation"] | components["schemas"]["CanvasOutputInvocation"] | components["schemas"]["CanvasPasteBackInvocation"] | components["schemas"]["CanvasV2MaskAndCropInvocation"] | components["schemas"]["CenterPadCropInvocation"] | components["schemas"]["CogView4DenoiseInvocation"] | components["schemas"]["CogView4ImageToLatentsInvocation"] | components["schemas"]["CogView4LatentsToImageInvocation"] | components["schemas"]["CogView4ModelLoaderInvocation"] | components["schemas"]["CogView4TextEncoderInvocation"] | components["schemas"]["CollectInvocation"] | components["schemas"]["ColorCorrectInvocation"] | components["schemas"]["ColorInvocation"] | components["schemas"]["ColorMapInvocation"] | components["schemas"]["CompelInvocation"] | 
components["schemas"]["ConditioningCollectionInvocation"] | components["schemas"]["ConditioningInvocation"] | components["schemas"]["ContentShuffleInvocation"] | components["schemas"]["ControlNetInvocation"] | components["schemas"]["CoreMetadataInvocation"] | components["schemas"]["CreateDenoiseMaskInvocation"] | components["schemas"]["CreateGradientMaskInvocation"] | components["schemas"]["CropImageToBoundingBoxInvocation"] | components["schemas"]["CropLatentsCoreInvocation"] | components["schemas"]["CvInpaintInvocation"] | components["schemas"]["DWOpenposeDetectionInvocation"] | components["schemas"]["DecodeInvisibleWatermarkInvocation"] | components["schemas"]["DenoiseLatentsInvocation"] | components["schemas"]["DenoiseLatentsMetaInvocation"] | components["schemas"]["DepthAnythingDepthEstimationInvocation"] | components["schemas"]["DivideInvocation"] | components["schemas"]["DynamicPromptInvocation"] | components["schemas"]["ESRGANInvocation"] | components["schemas"]["ExpandMaskWithFadeInvocation"] | components["schemas"]["FLUXLoRACollectionLoader"] | components["schemas"]["FaceIdentifierInvocation"] | components["schemas"]["FaceMaskInvocation"] | components["schemas"]["FaceOffInvocation"] | components["schemas"]["FloatBatchInvocation"] | components["schemas"]["FloatCollectionInvocation"] | components["schemas"]["FloatGenerator"] | components["schemas"]["FloatInvocation"] | components["schemas"]["FloatLinearRangeInvocation"] | components["schemas"]["FloatMathInvocation"] | components["schemas"]["FloatToIntegerInvocation"] | components["schemas"]["Flux2DenoiseInvocation"] | components["schemas"]["Flux2KleinLoRACollectionLoader"] | components["schemas"]["Flux2KleinLoRALoaderInvocation"] | components["schemas"]["Flux2KleinModelLoaderInvocation"] | components["schemas"]["Flux2KleinTextEncoderInvocation"] | components["schemas"]["Flux2VaeDecodeInvocation"] | components["schemas"]["Flux2VaeEncodeInvocation"] | components["schemas"]["FluxControlLoRALoaderInvocation"] | 
components["schemas"]["FluxControlNetInvocation"] | components["schemas"]["FluxDenoiseInvocation"] | components["schemas"]["FluxDenoiseLatentsMetaInvocation"] | components["schemas"]["FluxFillInvocation"] | components["schemas"]["FluxIPAdapterInvocation"] | components["schemas"]["FluxKontextConcatenateImagesInvocation"] | components["schemas"]["FluxKontextInvocation"] | components["schemas"]["FluxLoRALoaderInvocation"] | components["schemas"]["FluxModelLoaderInvocation"] | components["schemas"]["FluxReduxInvocation"] | components["schemas"]["FluxTextEncoderInvocation"] | components["schemas"]["FluxVaeDecodeInvocation"] | components["schemas"]["FluxVaeEncodeInvocation"] | components["schemas"]["FreeUInvocation"] | components["schemas"]["GeminiImageGenerationInvocation"] | components["schemas"]["GetMaskBoundingBoxInvocation"] | components["schemas"]["GroundingDinoInvocation"] | components["schemas"]["HEDEdgeDetectionInvocation"] | components["schemas"]["HeuristicResizeInvocation"] | components["schemas"]["IPAdapterInvocation"] | components["schemas"]["IdealSizeInvocation"] | components["schemas"]["IfInvocation"] | components["schemas"]["ImageBatchInvocation"] | components["schemas"]["ImageBlurInvocation"] | components["schemas"]["ImageChannelInvocation"] | components["schemas"]["ImageChannelMultiplyInvocation"] | components["schemas"]["ImageChannelOffsetInvocation"] | components["schemas"]["ImageCollectionInvocation"] | components["schemas"]["ImageConvertInvocation"] | components["schemas"]["ImageCropInvocation"] | components["schemas"]["ImageGenerator"] | components["schemas"]["ImageHueAdjustmentInvocation"] | components["schemas"]["ImageInverseLerpInvocation"] | components["schemas"]["ImageInvocation"] | components["schemas"]["ImageLerpInvocation"] | components["schemas"]["ImageMaskToTensorInvocation"] | components["schemas"]["ImageMultiplyInvocation"] | components["schemas"]["ImageNSFWBlurInvocation"] | components["schemas"]["ImageNoiseInvocation"] | 
components["schemas"]["ImagePanelLayoutInvocation"] | components["schemas"]["ImagePasteInvocation"] | components["schemas"]["ImageResizeInvocation"] | components["schemas"]["ImageScaleInvocation"] | components["schemas"]["ImageToLatentsInvocation"] | components["schemas"]["ImageWatermarkInvocation"] | components["schemas"]["InfillColorInvocation"] | components["schemas"]["InfillPatchMatchInvocation"] | components["schemas"]["InfillTileInvocation"] | components["schemas"]["IntegerBatchInvocation"] | components["schemas"]["IntegerCollectionInvocation"] | components["schemas"]["IntegerGenerator"] | components["schemas"]["IntegerInvocation"] | components["schemas"]["IntegerMathInvocation"] | components["schemas"]["InvertTensorMaskInvocation"] | components["schemas"]["InvokeAdjustImageHuePlusInvocation"] | components["schemas"]["InvokeEquivalentAchromaticLightnessInvocation"] | components["schemas"]["InvokeImageBlendInvocation"] | components["schemas"]["InvokeImageCompositorInvocation"] | components["schemas"]["InvokeImageDilateOrErodeInvocation"] | components["schemas"]["InvokeImageEnhanceInvocation"] | components["schemas"]["InvokeImageValueThresholdsInvocation"] | components["schemas"]["IterateInvocation"] | components["schemas"]["LaMaInfillInvocation"] | components["schemas"]["LatentsCollectionInvocation"] | components["schemas"]["LatentsInvocation"] | components["schemas"]["LatentsToImageInvocation"] | components["schemas"]["LineartAnimeEdgeDetectionInvocation"] | components["schemas"]["LineartEdgeDetectionInvocation"] | components["schemas"]["LlavaOnevisionVllmInvocation"] | components["schemas"]["LoRACollectionLoader"] | components["schemas"]["LoRALoaderInvocation"] | components["schemas"]["LoRASelectorInvocation"] | components["schemas"]["MLSDDetectionInvocation"] | components["schemas"]["MainModelLoaderInvocation"] | components["schemas"]["MaskCombineInvocation"] | components["schemas"]["MaskEdgeInvocation"] | components["schemas"]["MaskFromAlphaInvocation"] | 
components["schemas"]["MaskFromIDInvocation"] | components["schemas"]["MaskTensorToImageInvocation"] | components["schemas"]["MediaPipeFaceDetectionInvocation"] | components["schemas"]["MergeMetadataInvocation"] | components["schemas"]["MergeTilesToImageInvocation"] | components["schemas"]["MetadataFieldExtractorInvocation"] | components["schemas"]["MetadataFromImageInvocation"] | components["schemas"]["MetadataInvocation"] | components["schemas"]["MetadataItemInvocation"] | components["schemas"]["MetadataItemLinkedInvocation"] | components["schemas"]["MetadataToBoolCollectionInvocation"] | components["schemas"]["MetadataToBoolInvocation"] | components["schemas"]["MetadataToControlnetsInvocation"] | components["schemas"]["MetadataToFloatCollectionInvocation"] | components["schemas"]["MetadataToFloatInvocation"] | components["schemas"]["MetadataToIPAdaptersInvocation"] | components["schemas"]["MetadataToIntegerCollectionInvocation"] | components["schemas"]["MetadataToIntegerInvocation"] | components["schemas"]["MetadataToLorasCollectionInvocation"] | components["schemas"]["MetadataToLorasInvocation"] | components["schemas"]["MetadataToModelInvocation"] | components["schemas"]["MetadataToSDXLLorasInvocation"] | components["schemas"]["MetadataToSDXLModelInvocation"] | components["schemas"]["MetadataToSchedulerInvocation"] | components["schemas"]["MetadataToStringCollectionInvocation"] | components["schemas"]["MetadataToStringInvocation"] | components["schemas"]["MetadataToT2IAdaptersInvocation"] | components["schemas"]["MetadataToVAEInvocation"] | components["schemas"]["ModelIdentifierInvocation"] | components["schemas"]["MultiplyInvocation"] | components["schemas"]["NoiseInvocation"] | components["schemas"]["NormalMapInvocation"] | components["schemas"]["OklabUnsharpMaskInvocation"] | components["schemas"]["OklchImageHueAdjustmentInvocation"] | components["schemas"]["OpenAIImageGenerationInvocation"] | components["schemas"]["PBRMapsInvocation"] | 
components["schemas"]["PairTileImageInvocation"] | components["schemas"]["PasteImageIntoBoundingBoxInvocation"] | components["schemas"]["PiDiNetEdgeDetectionInvocation"] | components["schemas"]["PromptTemplateInvocation"] | components["schemas"]["PromptsFromFileInvocation"] | components["schemas"]["QwenImageDenoiseInvocation"] | components["schemas"]["QwenImageImageToLatentsInvocation"] | components["schemas"]["QwenImageLatentsToImageInvocation"] | components["schemas"]["QwenImageLoRACollectionLoader"] | components["schemas"]["QwenImageLoRALoaderInvocation"] | components["schemas"]["QwenImageModelLoaderInvocation"] | components["schemas"]["QwenImageTextEncoderInvocation"] | components["schemas"]["RandomFloatInvocation"] | components["schemas"]["RandomIntInvocation"] | components["schemas"]["RandomRangeInvocation"] | components["schemas"]["RangeInvocation"] | components["schemas"]["RangeOfSizeInvocation"] | components["schemas"]["RectangleMaskInvocation"] | components["schemas"]["ResizeLatentsInvocation"] | components["schemas"]["RoundInvocation"] | components["schemas"]["SD3DenoiseInvocation"] | components["schemas"]["SD3ImageToLatentsInvocation"] | components["schemas"]["SD3LatentsToImageInvocation"] | components["schemas"]["SDXLCompelPromptInvocation"] | components["schemas"]["SDXLLoRACollectionLoader"] | components["schemas"]["SDXLLoRALoaderInvocation"] | components["schemas"]["SDXLModelLoaderInvocation"] | components["schemas"]["SDXLRefinerCompelPromptInvocation"] | components["schemas"]["SDXLRefinerModelLoaderInvocation"] | components["schemas"]["SaveImageInvocation"] | components["schemas"]["SaveImageToFileInvocation"] | components["schemas"]["ScaleLatentsInvocation"] | components["schemas"]["SchedulerInvocation"] | components["schemas"]["Sd3ModelLoaderInvocation"] | components["schemas"]["Sd3TextEncoderInvocation"] | components["schemas"]["SeamlessModeInvocation"] | components["schemas"]["SeedreamImageGenerationInvocation"] | 
components["schemas"]["SegmentAnythingInvocation"] | components["schemas"]["ShowImageInvocation"] | components["schemas"]["SpandrelImageToImageAutoscaleInvocation"] | components["schemas"]["SpandrelImageToImageInvocation"] | components["schemas"]["StringBatchInvocation"] | components["schemas"]["StringCollectionInvocation"] | components["schemas"]["StringGenerator"] | components["schemas"]["StringInvocation"] | components["schemas"]["StringJoinInvocation"] | components["schemas"]["StringJoinThreeInvocation"] | components["schemas"]["StringReplaceInvocation"] | components["schemas"]["StringSplitInvocation"] | components["schemas"]["StringSplitNegInvocation"] | components["schemas"]["SubtractInvocation"] | components["schemas"]["T2IAdapterInvocation"] | components["schemas"]["TextLLMInvocation"] | components["schemas"]["TileToPropertiesInvocation"] | components["schemas"]["TiledMultiDiffusionDenoiseLatents"] | components["schemas"]["UnsharpMaskInvocation"] | components["schemas"]["VAELoaderInvocation"] | components["schemas"]["ZImageControlInvocation"] | components["schemas"]["ZImageDenoiseInvocation"] | components["schemas"]["ZImageDenoiseMetaInvocation"] | components["schemas"]["ZImageImageToLatentsInvocation"] | components["schemas"]["ZImageLatentsToImageInvocation"] | components["schemas"]["ZImageLoRACollectionLoader"] | components["schemas"]["ZImageLoRALoaderInvocation"] | components["schemas"]["ZImageModelLoaderInvocation"] | components["schemas"]["ZImageSeedVarianceEnhancerInvocation"] | components["schemas"]["ZImageTextEncoderInvocation"];
+ invocation: components["schemas"]["AddInvocation"] | components["schemas"]["AlibabaCloudImageGenerationInvocation"] | components["schemas"]["AlphaMaskToTensorInvocation"] | components["schemas"]["AnimaDenoiseInvocation"] | components["schemas"]["AnimaImageToLatentsInvocation"] | components["schemas"]["AnimaLatentsToImageInvocation"] | components["schemas"]["AnimaLoRACollectionLoader"] | components["schemas"]["AnimaLoRALoaderInvocation"] | components["schemas"]["AnimaModelLoaderInvocation"] | components["schemas"]["AnimaTextEncoderInvocation"] | components["schemas"]["ApplyMaskTensorToImageInvocation"] | components["schemas"]["ApplyMaskToImageInvocation"] | components["schemas"]["BlankImageInvocation"] | components["schemas"]["BlendLatentsInvocation"] | components["schemas"]["BooleanCollectionInvocation"] | components["schemas"]["BooleanInvocation"] | components["schemas"]["BoundingBoxInvocation"] | components["schemas"]["CLIPSkipInvocation"] | components["schemas"]["CV2InfillInvocation"] | components["schemas"]["CalculateImageTilesEvenSplitInvocation"] | components["schemas"]["CalculateImageTilesInvocation"] | components["schemas"]["CalculateImageTilesMinimumOverlapInvocation"] | components["schemas"]["CannyEdgeDetectionInvocation"] | components["schemas"]["CanvasOutputInvocation"] | components["schemas"]["CanvasPasteBackInvocation"] | components["schemas"]["CanvasV2MaskAndCropInvocation"] | components["schemas"]["CenterPadCropInvocation"] | components["schemas"]["CogView4DenoiseInvocation"] | components["schemas"]["CogView4ImageToLatentsInvocation"] | components["schemas"]["CogView4LatentsToImageInvocation"] | components["schemas"]["CogView4ModelLoaderInvocation"] | components["schemas"]["CogView4TextEncoderInvocation"] | components["schemas"]["CollectInvocation"] | components["schemas"]["ColorCorrectInvocation"] | components["schemas"]["ColorInvocation"] | components["schemas"]["ColorMapInvocation"] | components["schemas"]["CompelInvocation"] | 
components["schemas"]["ConditioningCollectionInvocation"] | components["schemas"]["ConditioningInvocation"] | components["schemas"]["ContentShuffleInvocation"] | components["schemas"]["ControlNetInvocation"] | components["schemas"]["CoreMetadataInvocation"] | components["schemas"]["CreateDenoiseMaskInvocation"] | components["schemas"]["CreateGradientMaskInvocation"] | components["schemas"]["CropImageToBoundingBoxInvocation"] | components["schemas"]["CropLatentsCoreInvocation"] | components["schemas"]["CvInpaintInvocation"] | components["schemas"]["DWOpenposeDetectionInvocation"] | components["schemas"]["DecodeInvisibleWatermarkInvocation"] | components["schemas"]["DenoiseLatentsInvocation"] | components["schemas"]["DenoiseLatentsMetaInvocation"] | components["schemas"]["DepthAnythingDepthEstimationInvocation"] | components["schemas"]["DivideInvocation"] | components["schemas"]["DynamicPromptInvocation"] | components["schemas"]["ESRGANInvocation"] | components["schemas"]["ExpandMaskWithFadeInvocation"] | components["schemas"]["FLUXLoRACollectionLoader"] | components["schemas"]["FaceIdentifierInvocation"] | components["schemas"]["FaceMaskInvocation"] | components["schemas"]["FaceOffInvocation"] | components["schemas"]["FloatBatchInvocation"] | components["schemas"]["FloatCollectionInvocation"] | components["schemas"]["FloatGenerator"] | components["schemas"]["FloatInvocation"] | components["schemas"]["FloatLinearRangeInvocation"] | components["schemas"]["FloatMathInvocation"] | components["schemas"]["FloatToIntegerInvocation"] | components["schemas"]["Flux2DenoiseInvocation"] | components["schemas"]["Flux2KleinLoRACollectionLoader"] | components["schemas"]["Flux2KleinLoRALoaderInvocation"] | components["schemas"]["Flux2KleinModelLoaderInvocation"] | components["schemas"]["Flux2KleinTextEncoderInvocation"] | components["schemas"]["Flux2PiDDecodeInvocation"] | components["schemas"]["Flux2VaeDecodeInvocation"] | components["schemas"]["Flux2VaeEncodeInvocation"] | 
components["schemas"]["FluxControlLoRALoaderInvocation"] | components["schemas"]["FluxControlNetInvocation"] | components["schemas"]["FluxDenoiseInvocation"] | components["schemas"]["FluxDenoiseLatentsMetaInvocation"] | components["schemas"]["FluxFillInvocation"] | components["schemas"]["FluxIPAdapterInvocation"] | components["schemas"]["FluxKontextConcatenateImagesInvocation"] | components["schemas"]["FluxKontextInvocation"] | components["schemas"]["FluxLoRALoaderInvocation"] | components["schemas"]["FluxModelLoaderInvocation"] | components["schemas"]["FluxPiDDecodeInvocation"] | components["schemas"]["FluxReduxInvocation"] | components["schemas"]["FluxTextEncoderInvocation"] | components["schemas"]["FluxVaeDecodeInvocation"] | components["schemas"]["FluxVaeEncodeInvocation"] | components["schemas"]["FreeUInvocation"] | components["schemas"]["GeminiImageGenerationInvocation"] | components["schemas"]["Gemma2EncoderLoaderInvocation"] | components["schemas"]["GetMaskBoundingBoxInvocation"] | components["schemas"]["GroundingDinoInvocation"] | components["schemas"]["HEDEdgeDetectionInvocation"] | components["schemas"]["HeuristicResizeInvocation"] | components["schemas"]["IPAdapterInvocation"] | components["schemas"]["IdealSizeInvocation"] | components["schemas"]["IfInvocation"] | components["schemas"]["ImageBatchInvocation"] | components["schemas"]["ImageBlurInvocation"] | components["schemas"]["ImageChannelInvocation"] | components["schemas"]["ImageChannelMultiplyInvocation"] | components["schemas"]["ImageChannelOffsetInvocation"] | components["schemas"]["ImageCollectionInvocation"] | components["schemas"]["ImageConvertInvocation"] | components["schemas"]["ImageCropInvocation"] | components["schemas"]["ImageGenerator"] | components["schemas"]["ImageHueAdjustmentInvocation"] | components["schemas"]["ImageInverseLerpInvocation"] | components["schemas"]["ImageInvocation"] | components["schemas"]["ImageLerpInvocation"] | 
components["schemas"]["ImageMaskToTensorInvocation"] | components["schemas"]["ImageMultiplyInvocation"] | components["schemas"]["ImageNSFWBlurInvocation"] | components["schemas"]["ImageNoiseInvocation"] | components["schemas"]["ImagePanelLayoutInvocation"] | components["schemas"]["ImagePasteInvocation"] | components["schemas"]["ImageResizeInvocation"] | components["schemas"]["ImageScaleInvocation"] | components["schemas"]["ImageToLatentsInvocation"] | components["schemas"]["ImageWatermarkInvocation"] | components["schemas"]["InfillColorInvocation"] | components["schemas"]["InfillPatchMatchInvocation"] | components["schemas"]["InfillTileInvocation"] | components["schemas"]["IntegerBatchInvocation"] | components["schemas"]["IntegerCollectionInvocation"] | components["schemas"]["IntegerGenerator"] | components["schemas"]["IntegerInvocation"] | components["schemas"]["IntegerMathInvocation"] | components["schemas"]["InvertTensorMaskInvocation"] | components["schemas"]["InvokeAdjustImageHuePlusInvocation"] | components["schemas"]["InvokeEquivalentAchromaticLightnessInvocation"] | components["schemas"]["InvokeImageBlendInvocation"] | components["schemas"]["InvokeImageCompositorInvocation"] | components["schemas"]["InvokeImageDilateOrErodeInvocation"] | components["schemas"]["InvokeImageEnhanceInvocation"] | components["schemas"]["InvokeImageValueThresholdsInvocation"] | components["schemas"]["IterateInvocation"] | components["schemas"]["LaMaInfillInvocation"] | components["schemas"]["LatentsCollectionInvocation"] | components["schemas"]["LatentsInvocation"] | components["schemas"]["LatentsToImageInvocation"] | components["schemas"]["LineartAnimeEdgeDetectionInvocation"] | components["schemas"]["LineartEdgeDetectionInvocation"] | components["schemas"]["LlavaOnevisionVllmInvocation"] | components["schemas"]["LoRACollectionLoader"] | components["schemas"]["LoRALoaderInvocation"] | components["schemas"]["LoRASelectorInvocation"] | 
components["schemas"]["MLSDDetectionInvocation"] | components["schemas"]["MainModelLoaderInvocation"] | components["schemas"]["MaskCombineInvocation"] | components["schemas"]["MaskEdgeInvocation"] | components["schemas"]["MaskFromAlphaInvocation"] | components["schemas"]["MaskFromIDInvocation"] | components["schemas"]["MaskTensorToImageInvocation"] | components["schemas"]["MediaPipeFaceDetectionInvocation"] | components["schemas"]["MergeMetadataInvocation"] | components["schemas"]["MergeTilesToImageInvocation"] | components["schemas"]["MetadataFieldExtractorInvocation"] | components["schemas"]["MetadataFromImageInvocation"] | components["schemas"]["MetadataInvocation"] | components["schemas"]["MetadataItemInvocation"] | components["schemas"]["MetadataItemLinkedInvocation"] | components["schemas"]["MetadataToBoolCollectionInvocation"] | components["schemas"]["MetadataToBoolInvocation"] | components["schemas"]["MetadataToControlnetsInvocation"] | components["schemas"]["MetadataToFloatCollectionInvocation"] | components["schemas"]["MetadataToFloatInvocation"] | components["schemas"]["MetadataToIPAdaptersInvocation"] | components["schemas"]["MetadataToIntegerCollectionInvocation"] | components["schemas"]["MetadataToIntegerInvocation"] | components["schemas"]["MetadataToLorasCollectionInvocation"] | components["schemas"]["MetadataToLorasInvocation"] | components["schemas"]["MetadataToModelInvocation"] | components["schemas"]["MetadataToSDXLLorasInvocation"] | components["schemas"]["MetadataToSDXLModelInvocation"] | components["schemas"]["MetadataToSchedulerInvocation"] | components["schemas"]["MetadataToStringCollectionInvocation"] | components["schemas"]["MetadataToStringInvocation"] | components["schemas"]["MetadataToT2IAdaptersInvocation"] | components["schemas"]["MetadataToVAEInvocation"] | components["schemas"]["ModelIdentifierInvocation"] | components["schemas"]["MultiplyInvocation"] | components["schemas"]["NoiseInvocation"] | 
components["schemas"]["NormalMapInvocation"] | components["schemas"]["OklabUnsharpMaskInvocation"] | components["schemas"]["OklchImageHueAdjustmentInvocation"] | components["schemas"]["OpenAIImageGenerationInvocation"] | components["schemas"]["PBRMapsInvocation"] | components["schemas"]["PairTileImageInvocation"] | components["schemas"]["PasteImageIntoBoundingBoxInvocation"] | components["schemas"]["PiDDecoderLoaderInvocation"] | components["schemas"]["PiDUpscaleInvocation"] | components["schemas"]["PiDiNetEdgeDetectionInvocation"] | components["schemas"]["PromptTemplateInvocation"] | components["schemas"]["PromptsFromFileInvocation"] | components["schemas"]["QwenImageDenoiseInvocation"] | components["schemas"]["QwenImageImageToLatentsInvocation"] | components["schemas"]["QwenImageLatentsToImageInvocation"] | components["schemas"]["QwenImageLoRACollectionLoader"] | components["schemas"]["QwenImageLoRALoaderInvocation"] | components["schemas"]["QwenImageModelLoaderInvocation"] | components["schemas"]["QwenImagePiDDecodeInvocation"] | components["schemas"]["QwenImageTextEncoderInvocation"] | components["schemas"]["RandomFloatInvocation"] | components["schemas"]["RandomIntInvocation"] | components["schemas"]["RandomRangeInvocation"] | components["schemas"]["RangeInvocation"] | components["schemas"]["RangeOfSizeInvocation"] | components["schemas"]["RectangleMaskInvocation"] | components["schemas"]["ResizeLatentsInvocation"] | components["schemas"]["RoundInvocation"] | components["schemas"]["SD3DenoiseInvocation"] | components["schemas"]["SD3ImageToLatentsInvocation"] | components["schemas"]["SD3LatentsToImageInvocation"] | components["schemas"]["SD3PiDDecodeInvocation"] | components["schemas"]["SDXLCompelPromptInvocation"] | components["schemas"]["SDXLLoRACollectionLoader"] | components["schemas"]["SDXLLoRALoaderInvocation"] | components["schemas"]["SDXLModelLoaderInvocation"] | components["schemas"]["SDXLPiDDecodeInvocation"] | 
components["schemas"]["SDXLRefinerCompelPromptInvocation"] | components["schemas"]["SDXLRefinerModelLoaderInvocation"] | components["schemas"]["SaveImageInvocation"] | components["schemas"]["SaveImageToFileInvocation"] | components["schemas"]["ScaleLatentsInvocation"] | components["schemas"]["SchedulerInvocation"] | components["schemas"]["Sd3ModelLoaderInvocation"] | components["schemas"]["Sd3TextEncoderInvocation"] | components["schemas"]["SeamlessModeInvocation"] | components["schemas"]["SeedreamImageGenerationInvocation"] | components["schemas"]["SegmentAnythingInvocation"] | components["schemas"]["ShowImageInvocation"] | components["schemas"]["SpandrelImageToImageAutoscaleInvocation"] | components["schemas"]["SpandrelImageToImageInvocation"] | components["schemas"]["StringBatchInvocation"] | components["schemas"]["StringCollectionInvocation"] | components["schemas"]["StringGenerator"] | components["schemas"]["StringInvocation"] | components["schemas"]["StringJoinInvocation"] | components["schemas"]["StringJoinThreeInvocation"] | components["schemas"]["StringReplaceInvocation"] | components["schemas"]["StringSplitInvocation"] | components["schemas"]["StringSplitNegInvocation"] | components["schemas"]["SubtractInvocation"] | components["schemas"]["T2IAdapterInvocation"] | components["schemas"]["TextLLMInvocation"] | components["schemas"]["TileToPropertiesInvocation"] | components["schemas"]["TiledMultiDiffusionDenoiseLatents"] | components["schemas"]["UnsharpMaskInvocation"] | components["schemas"]["VAELoaderInvocation"] | components["schemas"]["ZImageControlInvocation"] | components["schemas"]["ZImageDenoiseInvocation"] | components["schemas"]["ZImageDenoiseMetaInvocation"] | components["schemas"]["ZImageImageToLatentsInvocation"] | components["schemas"]["ZImageLatentsToImageInvocation"] | components["schemas"]["ZImageLoRACollectionLoader"] | components["schemas"]["ZImageLoRALoaderInvocation"] | components["schemas"]["ZImageModelLoaderInvocation"] | 
components["schemas"]["ZImagePiDDecodeInvocation"] | components["schemas"]["ZImageSeedVarianceEnhancerInvocation"] | components["schemas"]["ZImageTextEncoderInvocation"];
/**
* Invocation Source Id
* @description The ID of the prepared invocation's source node
@@ -15791,7 +16103,7 @@ export type components = {
* Result
* @description The result of the invocation
*/
- result: components["schemas"]["AnimaConditioningOutput"] | components["schemas"]["AnimaLoRALoaderOutput"] | components["schemas"]["AnimaModelLoaderOutput"] | components["schemas"]["BooleanCollectionOutput"] | components["schemas"]["BooleanOutput"] | components["schemas"]["BoundingBoxCollectionOutput"] | components["schemas"]["BoundingBoxOutput"] | components["schemas"]["CLIPOutput"] | components["schemas"]["CLIPSkipInvocationOutput"] | components["schemas"]["CalculateImageTilesOutput"] | components["schemas"]["CogView4ConditioningOutput"] | components["schemas"]["CogView4ModelLoaderOutput"] | components["schemas"]["CollectInvocationOutput"] | components["schemas"]["ColorCollectionOutput"] | components["schemas"]["ColorOutput"] | components["schemas"]["ConditioningCollectionOutput"] | components["schemas"]["ConditioningOutput"] | components["schemas"]["ControlOutput"] | components["schemas"]["DenoiseMaskOutput"] | components["schemas"]["FaceMaskOutput"] | components["schemas"]["FaceOffOutput"] | components["schemas"]["FloatCollectionOutput"] | components["schemas"]["FloatGeneratorOutput"] | components["schemas"]["FloatOutput"] | components["schemas"]["Flux2KleinLoRALoaderOutput"] | components["schemas"]["Flux2KleinModelLoaderOutput"] | components["schemas"]["FluxConditioningCollectionOutput"] | components["schemas"]["FluxConditioningOutput"] | components["schemas"]["FluxControlLoRALoaderOutput"] | components["schemas"]["FluxControlNetOutput"] | components["schemas"]["FluxFillOutput"] | components["schemas"]["FluxKontextOutput"] | components["schemas"]["FluxLoRALoaderOutput"] | components["schemas"]["FluxModelLoaderOutput"] | components["schemas"]["FluxReduxOutput"] | components["schemas"]["GradientMaskOutput"] | components["schemas"]["IPAdapterOutput"] | components["schemas"]["IdealSizeOutput"] | components["schemas"]["IfInvocationOutput"] | components["schemas"]["ImageCollectionOutput"] | components["schemas"]["ImageGeneratorOutput"] | 
components["schemas"]["ImageOutput"] | components["schemas"]["ImagePanelCoordinateOutput"] | components["schemas"]["IntegerCollectionOutput"] | components["schemas"]["IntegerGeneratorOutput"] | components["schemas"]["IntegerOutput"] | components["schemas"]["IterateInvocationOutput"] | components["schemas"]["LatentsCollectionOutput"] | components["schemas"]["LatentsMetaOutput"] | components["schemas"]["LatentsOutput"] | components["schemas"]["LoRALoaderOutput"] | components["schemas"]["LoRASelectorOutput"] | components["schemas"]["MDControlListOutput"] | components["schemas"]["MDIPAdapterListOutput"] | components["schemas"]["MDT2IAdapterListOutput"] | components["schemas"]["MaskOutput"] | components["schemas"]["MetadataItemOutput"] | components["schemas"]["MetadataOutput"] | components["schemas"]["MetadataToLorasCollectionOutput"] | components["schemas"]["MetadataToModelOutput"] | components["schemas"]["MetadataToSDXLModelOutput"] | components["schemas"]["ModelIdentifierOutput"] | components["schemas"]["ModelLoaderOutput"] | components["schemas"]["NoiseOutput"] | components["schemas"]["PBRMapsOutput"] | components["schemas"]["PairTileImageOutput"] | components["schemas"]["PromptTemplateOutput"] | components["schemas"]["QwenImageConditioningOutput"] | components["schemas"]["QwenImageLoRALoaderOutput"] | components["schemas"]["QwenImageModelLoaderOutput"] | components["schemas"]["SD3ConditioningOutput"] | components["schemas"]["SDXLLoRALoaderOutput"] | components["schemas"]["SDXLModelLoaderOutput"] | components["schemas"]["SDXLRefinerModelLoaderOutput"] | components["schemas"]["SchedulerOutput"] | components["schemas"]["Sd3ModelLoaderOutput"] | components["schemas"]["SeamlessModeOutput"] | components["schemas"]["String2Output"] | components["schemas"]["StringCollectionOutput"] | components["schemas"]["StringGeneratorOutput"] | components["schemas"]["StringOutput"] | components["schemas"]["StringPosNegOutput"] | components["schemas"]["T2IAdapterOutput"] | 
components["schemas"]["TileToPropertiesOutput"] | components["schemas"]["UNetOutput"] | components["schemas"]["VAEOutput"] | components["schemas"]["ZImageConditioningOutput"] | components["schemas"]["ZImageControlOutput"] | components["schemas"]["ZImageLoRALoaderOutput"] | components["schemas"]["ZImageModelLoaderOutput"];
+ result: components["schemas"]["AnimaConditioningOutput"] | components["schemas"]["AnimaLoRALoaderOutput"] | components["schemas"]["AnimaModelLoaderOutput"] | components["schemas"]["BooleanCollectionOutput"] | components["schemas"]["BooleanOutput"] | components["schemas"]["BoundingBoxCollectionOutput"] | components["schemas"]["BoundingBoxOutput"] | components["schemas"]["CLIPOutput"] | components["schemas"]["CLIPSkipInvocationOutput"] | components["schemas"]["CalculateImageTilesOutput"] | components["schemas"]["CogView4ConditioningOutput"] | components["schemas"]["CogView4ModelLoaderOutput"] | components["schemas"]["CollectInvocationOutput"] | components["schemas"]["ColorCollectionOutput"] | components["schemas"]["ColorOutput"] | components["schemas"]["ConditioningCollectionOutput"] | components["schemas"]["ConditioningOutput"] | components["schemas"]["ControlOutput"] | components["schemas"]["DenoiseMaskOutput"] | components["schemas"]["FaceMaskOutput"] | components["schemas"]["FaceOffOutput"] | components["schemas"]["FloatCollectionOutput"] | components["schemas"]["FloatGeneratorOutput"] | components["schemas"]["FloatOutput"] | components["schemas"]["Flux2KleinLoRALoaderOutput"] | components["schemas"]["Flux2KleinModelLoaderOutput"] | components["schemas"]["FluxConditioningCollectionOutput"] | components["schemas"]["FluxConditioningOutput"] | components["schemas"]["FluxControlLoRALoaderOutput"] | components["schemas"]["FluxControlNetOutput"] | components["schemas"]["FluxFillOutput"] | components["schemas"]["FluxKontextOutput"] | components["schemas"]["FluxLoRALoaderOutput"] | components["schemas"]["FluxModelLoaderOutput"] | components["schemas"]["FluxReduxOutput"] | components["schemas"]["Gemma2EncoderOutput"] | components["schemas"]["GradientMaskOutput"] | components["schemas"]["IPAdapterOutput"] | components["schemas"]["IdealSizeOutput"] | components["schemas"]["IfInvocationOutput"] | components["schemas"]["ImageCollectionOutput"] | 
components["schemas"]["ImageGeneratorOutput"] | components["schemas"]["ImageOutput"] | components["schemas"]["ImagePanelCoordinateOutput"] | components["schemas"]["IntegerCollectionOutput"] | components["schemas"]["IntegerGeneratorOutput"] | components["schemas"]["IntegerOutput"] | components["schemas"]["IterateInvocationOutput"] | components["schemas"]["LatentsCollectionOutput"] | components["schemas"]["LatentsMetaOutput"] | components["schemas"]["LatentsOutput"] | components["schemas"]["LoRALoaderOutput"] | components["schemas"]["LoRASelectorOutput"] | components["schemas"]["MDControlListOutput"] | components["schemas"]["MDIPAdapterListOutput"] | components["schemas"]["MDT2IAdapterListOutput"] | components["schemas"]["MaskOutput"] | components["schemas"]["MetadataItemOutput"] | components["schemas"]["MetadataOutput"] | components["schemas"]["MetadataToLorasCollectionOutput"] | components["schemas"]["MetadataToModelOutput"] | components["schemas"]["MetadataToSDXLModelOutput"] | components["schemas"]["ModelIdentifierOutput"] | components["schemas"]["ModelLoaderOutput"] | components["schemas"]["NoiseOutput"] | components["schemas"]["PBRMapsOutput"] | components["schemas"]["PairTileImageOutput"] | components["schemas"]["PiDDecoderOutput"] | components["schemas"]["PromptTemplateOutput"] | components["schemas"]["QwenImageConditioningOutput"] | components["schemas"]["QwenImageLoRALoaderOutput"] | components["schemas"]["QwenImageModelLoaderOutput"] | components["schemas"]["SD3ConditioningOutput"] | components["schemas"]["SDXLLoRALoaderOutput"] | components["schemas"]["SDXLModelLoaderOutput"] | components["schemas"]["SDXLRefinerModelLoaderOutput"] | components["schemas"]["SchedulerOutput"] | components["schemas"]["Sd3ModelLoaderOutput"] | components["schemas"]["SeamlessModeOutput"] | components["schemas"]["String2Output"] | components["schemas"]["StringCollectionOutput"] | components["schemas"]["StringGeneratorOutput"] | components["schemas"]["StringOutput"] | 
components["schemas"]["StringPosNegOutput"] | components["schemas"]["T2IAdapterOutput"] | components["schemas"]["TileToPropertiesOutput"] | components["schemas"]["UNetOutput"] | components["schemas"]["VAEOutput"] | components["schemas"]["ZImageConditioningOutput"] | components["schemas"]["ZImageControlOutput"] | components["schemas"]["ZImageLoRALoaderOutput"] | components["schemas"]["ZImageModelLoaderOutput"];
};
/**
* InvocationErrorEvent
@@ -15845,7 +16157,7 @@ export type components = {
* Invocation
* @description The ID of the invocation
*/
- invocation: components["schemas"]["AddInvocation"] | components["schemas"]["AlibabaCloudImageGenerationInvocation"] | components["schemas"]["AlphaMaskToTensorInvocation"] | components["schemas"]["AnimaDenoiseInvocation"] | components["schemas"]["AnimaImageToLatentsInvocation"] | components["schemas"]["AnimaLatentsToImageInvocation"] | components["schemas"]["AnimaLoRACollectionLoader"] | components["schemas"]["AnimaLoRALoaderInvocation"] | components["schemas"]["AnimaModelLoaderInvocation"] | components["schemas"]["AnimaTextEncoderInvocation"] | components["schemas"]["ApplyMaskTensorToImageInvocation"] | components["schemas"]["ApplyMaskToImageInvocation"] | components["schemas"]["BlankImageInvocation"] | components["schemas"]["BlendLatentsInvocation"] | components["schemas"]["BooleanCollectionInvocation"] | components["schemas"]["BooleanInvocation"] | components["schemas"]["BoundingBoxInvocation"] | components["schemas"]["CLIPSkipInvocation"] | components["schemas"]["CV2InfillInvocation"] | components["schemas"]["CalculateImageTilesEvenSplitInvocation"] | components["schemas"]["CalculateImageTilesInvocation"] | components["schemas"]["CalculateImageTilesMinimumOverlapInvocation"] | components["schemas"]["CannyEdgeDetectionInvocation"] | components["schemas"]["CanvasOutputInvocation"] | components["schemas"]["CanvasPasteBackInvocation"] | components["schemas"]["CanvasV2MaskAndCropInvocation"] | components["schemas"]["CenterPadCropInvocation"] | components["schemas"]["CogView4DenoiseInvocation"] | components["schemas"]["CogView4ImageToLatentsInvocation"] | components["schemas"]["CogView4LatentsToImageInvocation"] | components["schemas"]["CogView4ModelLoaderInvocation"] | components["schemas"]["CogView4TextEncoderInvocation"] | components["schemas"]["CollectInvocation"] | components["schemas"]["ColorCorrectInvocation"] | components["schemas"]["ColorInvocation"] | components["schemas"]["ColorMapInvocation"] | components["schemas"]["CompelInvocation"] | 
components["schemas"]["ConditioningCollectionInvocation"] | components["schemas"]["ConditioningInvocation"] | components["schemas"]["ContentShuffleInvocation"] | components["schemas"]["ControlNetInvocation"] | components["schemas"]["CoreMetadataInvocation"] | components["schemas"]["CreateDenoiseMaskInvocation"] | components["schemas"]["CreateGradientMaskInvocation"] | components["schemas"]["CropImageToBoundingBoxInvocation"] | components["schemas"]["CropLatentsCoreInvocation"] | components["schemas"]["CvInpaintInvocation"] | components["schemas"]["DWOpenposeDetectionInvocation"] | components["schemas"]["DecodeInvisibleWatermarkInvocation"] | components["schemas"]["DenoiseLatentsInvocation"] | components["schemas"]["DenoiseLatentsMetaInvocation"] | components["schemas"]["DepthAnythingDepthEstimationInvocation"] | components["schemas"]["DivideInvocation"] | components["schemas"]["DynamicPromptInvocation"] | components["schemas"]["ESRGANInvocation"] | components["schemas"]["ExpandMaskWithFadeInvocation"] | components["schemas"]["FLUXLoRACollectionLoader"] | components["schemas"]["FaceIdentifierInvocation"] | components["schemas"]["FaceMaskInvocation"] | components["schemas"]["FaceOffInvocation"] | components["schemas"]["FloatBatchInvocation"] | components["schemas"]["FloatCollectionInvocation"] | components["schemas"]["FloatGenerator"] | components["schemas"]["FloatInvocation"] | components["schemas"]["FloatLinearRangeInvocation"] | components["schemas"]["FloatMathInvocation"] | components["schemas"]["FloatToIntegerInvocation"] | components["schemas"]["Flux2DenoiseInvocation"] | components["schemas"]["Flux2KleinLoRACollectionLoader"] | components["schemas"]["Flux2KleinLoRALoaderInvocation"] | components["schemas"]["Flux2KleinModelLoaderInvocation"] | components["schemas"]["Flux2KleinTextEncoderInvocation"] | components["schemas"]["Flux2VaeDecodeInvocation"] | components["schemas"]["Flux2VaeEncodeInvocation"] | components["schemas"]["FluxControlLoRALoaderInvocation"] | 
components["schemas"]["FluxControlNetInvocation"] | components["schemas"]["FluxDenoiseInvocation"] | components["schemas"]["FluxDenoiseLatentsMetaInvocation"] | components["schemas"]["FluxFillInvocation"] | components["schemas"]["FluxIPAdapterInvocation"] | components["schemas"]["FluxKontextConcatenateImagesInvocation"] | components["schemas"]["FluxKontextInvocation"] | components["schemas"]["FluxLoRALoaderInvocation"] | components["schemas"]["FluxModelLoaderInvocation"] | components["schemas"]["FluxReduxInvocation"] | components["schemas"]["FluxTextEncoderInvocation"] | components["schemas"]["FluxVaeDecodeInvocation"] | components["schemas"]["FluxVaeEncodeInvocation"] | components["schemas"]["FreeUInvocation"] | components["schemas"]["GeminiImageGenerationInvocation"] | components["schemas"]["GetMaskBoundingBoxInvocation"] | components["schemas"]["GroundingDinoInvocation"] | components["schemas"]["HEDEdgeDetectionInvocation"] | components["schemas"]["HeuristicResizeInvocation"] | components["schemas"]["IPAdapterInvocation"] | components["schemas"]["IdealSizeInvocation"] | components["schemas"]["IfInvocation"] | components["schemas"]["ImageBatchInvocation"] | components["schemas"]["ImageBlurInvocation"] | components["schemas"]["ImageChannelInvocation"] | components["schemas"]["ImageChannelMultiplyInvocation"] | components["schemas"]["ImageChannelOffsetInvocation"] | components["schemas"]["ImageCollectionInvocation"] | components["schemas"]["ImageConvertInvocation"] | components["schemas"]["ImageCropInvocation"] | components["schemas"]["ImageGenerator"] | components["schemas"]["ImageHueAdjustmentInvocation"] | components["schemas"]["ImageInverseLerpInvocation"] | components["schemas"]["ImageInvocation"] | components["schemas"]["ImageLerpInvocation"] | components["schemas"]["ImageMaskToTensorInvocation"] | components["schemas"]["ImageMultiplyInvocation"] | components["schemas"]["ImageNSFWBlurInvocation"] | components["schemas"]["ImageNoiseInvocation"] | 
components["schemas"]["ImagePanelLayoutInvocation"] | components["schemas"]["ImagePasteInvocation"] | components["schemas"]["ImageResizeInvocation"] | components["schemas"]["ImageScaleInvocation"] | components["schemas"]["ImageToLatentsInvocation"] | components["schemas"]["ImageWatermarkInvocation"] | components["schemas"]["InfillColorInvocation"] | components["schemas"]["InfillPatchMatchInvocation"] | components["schemas"]["InfillTileInvocation"] | components["schemas"]["IntegerBatchInvocation"] | components["schemas"]["IntegerCollectionInvocation"] | components["schemas"]["IntegerGenerator"] | components["schemas"]["IntegerInvocation"] | components["schemas"]["IntegerMathInvocation"] | components["schemas"]["InvertTensorMaskInvocation"] | components["schemas"]["InvokeAdjustImageHuePlusInvocation"] | components["schemas"]["InvokeEquivalentAchromaticLightnessInvocation"] | components["schemas"]["InvokeImageBlendInvocation"] | components["schemas"]["InvokeImageCompositorInvocation"] | components["schemas"]["InvokeImageDilateOrErodeInvocation"] | components["schemas"]["InvokeImageEnhanceInvocation"] | components["schemas"]["InvokeImageValueThresholdsInvocation"] | components["schemas"]["IterateInvocation"] | components["schemas"]["LaMaInfillInvocation"] | components["schemas"]["LatentsCollectionInvocation"] | components["schemas"]["LatentsInvocation"] | components["schemas"]["LatentsToImageInvocation"] | components["schemas"]["LineartAnimeEdgeDetectionInvocation"] | components["schemas"]["LineartEdgeDetectionInvocation"] | components["schemas"]["LlavaOnevisionVllmInvocation"] | components["schemas"]["LoRACollectionLoader"] | components["schemas"]["LoRALoaderInvocation"] | components["schemas"]["LoRASelectorInvocation"] | components["schemas"]["MLSDDetectionInvocation"] | components["schemas"]["MainModelLoaderInvocation"] | components["schemas"]["MaskCombineInvocation"] | components["schemas"]["MaskEdgeInvocation"] | components["schemas"]["MaskFromAlphaInvocation"] | 
components["schemas"]["MaskFromIDInvocation"] | components["schemas"]["MaskTensorToImageInvocation"] | components["schemas"]["MediaPipeFaceDetectionInvocation"] | components["schemas"]["MergeMetadataInvocation"] | components["schemas"]["MergeTilesToImageInvocation"] | components["schemas"]["MetadataFieldExtractorInvocation"] | components["schemas"]["MetadataFromImageInvocation"] | components["schemas"]["MetadataInvocation"] | components["schemas"]["MetadataItemInvocation"] | components["schemas"]["MetadataItemLinkedInvocation"] | components["schemas"]["MetadataToBoolCollectionInvocation"] | components["schemas"]["MetadataToBoolInvocation"] | components["schemas"]["MetadataToControlnetsInvocation"] | components["schemas"]["MetadataToFloatCollectionInvocation"] | components["schemas"]["MetadataToFloatInvocation"] | components["schemas"]["MetadataToIPAdaptersInvocation"] | components["schemas"]["MetadataToIntegerCollectionInvocation"] | components["schemas"]["MetadataToIntegerInvocation"] | components["schemas"]["MetadataToLorasCollectionInvocation"] | components["schemas"]["MetadataToLorasInvocation"] | components["schemas"]["MetadataToModelInvocation"] | components["schemas"]["MetadataToSDXLLorasInvocation"] | components["schemas"]["MetadataToSDXLModelInvocation"] | components["schemas"]["MetadataToSchedulerInvocation"] | components["schemas"]["MetadataToStringCollectionInvocation"] | components["schemas"]["MetadataToStringInvocation"] | components["schemas"]["MetadataToT2IAdaptersInvocation"] | components["schemas"]["MetadataToVAEInvocation"] | components["schemas"]["ModelIdentifierInvocation"] | components["schemas"]["MultiplyInvocation"] | components["schemas"]["NoiseInvocation"] | components["schemas"]["NormalMapInvocation"] | components["schemas"]["OklabUnsharpMaskInvocation"] | components["schemas"]["OklchImageHueAdjustmentInvocation"] | components["schemas"]["OpenAIImageGenerationInvocation"] | components["schemas"]["PBRMapsInvocation"] | 
components["schemas"]["PairTileImageInvocation"] | components["schemas"]["PasteImageIntoBoundingBoxInvocation"] | components["schemas"]["PiDiNetEdgeDetectionInvocation"] | components["schemas"]["PromptTemplateInvocation"] | components["schemas"]["PromptsFromFileInvocation"] | components["schemas"]["QwenImageDenoiseInvocation"] | components["schemas"]["QwenImageImageToLatentsInvocation"] | components["schemas"]["QwenImageLatentsToImageInvocation"] | components["schemas"]["QwenImageLoRACollectionLoader"] | components["schemas"]["QwenImageLoRALoaderInvocation"] | components["schemas"]["QwenImageModelLoaderInvocation"] | components["schemas"]["QwenImageTextEncoderInvocation"] | components["schemas"]["RandomFloatInvocation"] | components["schemas"]["RandomIntInvocation"] | components["schemas"]["RandomRangeInvocation"] | components["schemas"]["RangeInvocation"] | components["schemas"]["RangeOfSizeInvocation"] | components["schemas"]["RectangleMaskInvocation"] | components["schemas"]["ResizeLatentsInvocation"] | components["schemas"]["RoundInvocation"] | components["schemas"]["SD3DenoiseInvocation"] | components["schemas"]["SD3ImageToLatentsInvocation"] | components["schemas"]["SD3LatentsToImageInvocation"] | components["schemas"]["SDXLCompelPromptInvocation"] | components["schemas"]["SDXLLoRACollectionLoader"] | components["schemas"]["SDXLLoRALoaderInvocation"] | components["schemas"]["SDXLModelLoaderInvocation"] | components["schemas"]["SDXLRefinerCompelPromptInvocation"] | components["schemas"]["SDXLRefinerModelLoaderInvocation"] | components["schemas"]["SaveImageInvocation"] | components["schemas"]["SaveImageToFileInvocation"] | components["schemas"]["ScaleLatentsInvocation"] | components["schemas"]["SchedulerInvocation"] | components["schemas"]["Sd3ModelLoaderInvocation"] | components["schemas"]["Sd3TextEncoderInvocation"] | components["schemas"]["SeamlessModeInvocation"] | components["schemas"]["SeedreamImageGenerationInvocation"] | 
components["schemas"]["SegmentAnythingInvocation"] | components["schemas"]["ShowImageInvocation"] | components["schemas"]["SpandrelImageToImageAutoscaleInvocation"] | components["schemas"]["SpandrelImageToImageInvocation"] | components["schemas"]["StringBatchInvocation"] | components["schemas"]["StringCollectionInvocation"] | components["schemas"]["StringGenerator"] | components["schemas"]["StringInvocation"] | components["schemas"]["StringJoinInvocation"] | components["schemas"]["StringJoinThreeInvocation"] | components["schemas"]["StringReplaceInvocation"] | components["schemas"]["StringSplitInvocation"] | components["schemas"]["StringSplitNegInvocation"] | components["schemas"]["SubtractInvocation"] | components["schemas"]["T2IAdapterInvocation"] | components["schemas"]["TextLLMInvocation"] | components["schemas"]["TileToPropertiesInvocation"] | components["schemas"]["TiledMultiDiffusionDenoiseLatents"] | components["schemas"]["UnsharpMaskInvocation"] | components["schemas"]["VAELoaderInvocation"] | components["schemas"]["ZImageControlInvocation"] | components["schemas"]["ZImageDenoiseInvocation"] | components["schemas"]["ZImageDenoiseMetaInvocation"] | components["schemas"]["ZImageImageToLatentsInvocation"] | components["schemas"]["ZImageLatentsToImageInvocation"] | components["schemas"]["ZImageLoRACollectionLoader"] | components["schemas"]["ZImageLoRALoaderInvocation"] | components["schemas"]["ZImageModelLoaderInvocation"] | components["schemas"]["ZImageSeedVarianceEnhancerInvocation"] | components["schemas"]["ZImageTextEncoderInvocation"];
+ invocation: components["schemas"]["AddInvocation"] | components["schemas"]["AlibabaCloudImageGenerationInvocation"] | components["schemas"]["AlphaMaskToTensorInvocation"] | components["schemas"]["AnimaDenoiseInvocation"] | components["schemas"]["AnimaImageToLatentsInvocation"] | components["schemas"]["AnimaLatentsToImageInvocation"] | components["schemas"]["AnimaLoRACollectionLoader"] | components["schemas"]["AnimaLoRALoaderInvocation"] | components["schemas"]["AnimaModelLoaderInvocation"] | components["schemas"]["AnimaTextEncoderInvocation"] | components["schemas"]["ApplyMaskTensorToImageInvocation"] | components["schemas"]["ApplyMaskToImageInvocation"] | components["schemas"]["BlankImageInvocation"] | components["schemas"]["BlendLatentsInvocation"] | components["schemas"]["BooleanCollectionInvocation"] | components["schemas"]["BooleanInvocation"] | components["schemas"]["BoundingBoxInvocation"] | components["schemas"]["CLIPSkipInvocation"] | components["schemas"]["CV2InfillInvocation"] | components["schemas"]["CalculateImageTilesEvenSplitInvocation"] | components["schemas"]["CalculateImageTilesInvocation"] | components["schemas"]["CalculateImageTilesMinimumOverlapInvocation"] | components["schemas"]["CannyEdgeDetectionInvocation"] | components["schemas"]["CanvasOutputInvocation"] | components["schemas"]["CanvasPasteBackInvocation"] | components["schemas"]["CanvasV2MaskAndCropInvocation"] | components["schemas"]["CenterPadCropInvocation"] | components["schemas"]["CogView4DenoiseInvocation"] | components["schemas"]["CogView4ImageToLatentsInvocation"] | components["schemas"]["CogView4LatentsToImageInvocation"] | components["schemas"]["CogView4ModelLoaderInvocation"] | components["schemas"]["CogView4TextEncoderInvocation"] | components["schemas"]["CollectInvocation"] | components["schemas"]["ColorCorrectInvocation"] | components["schemas"]["ColorInvocation"] | components["schemas"]["ColorMapInvocation"] | components["schemas"]["CompelInvocation"] | 
components["schemas"]["ConditioningCollectionInvocation"] | components["schemas"]["ConditioningInvocation"] | components["schemas"]["ContentShuffleInvocation"] | components["schemas"]["ControlNetInvocation"] | components["schemas"]["CoreMetadataInvocation"] | components["schemas"]["CreateDenoiseMaskInvocation"] | components["schemas"]["CreateGradientMaskInvocation"] | components["schemas"]["CropImageToBoundingBoxInvocation"] | components["schemas"]["CropLatentsCoreInvocation"] | components["schemas"]["CvInpaintInvocation"] | components["schemas"]["DWOpenposeDetectionInvocation"] | components["schemas"]["DecodeInvisibleWatermarkInvocation"] | components["schemas"]["DenoiseLatentsInvocation"] | components["schemas"]["DenoiseLatentsMetaInvocation"] | components["schemas"]["DepthAnythingDepthEstimationInvocation"] | components["schemas"]["DivideInvocation"] | components["schemas"]["DynamicPromptInvocation"] | components["schemas"]["ESRGANInvocation"] | components["schemas"]["ExpandMaskWithFadeInvocation"] | components["schemas"]["FLUXLoRACollectionLoader"] | components["schemas"]["FaceIdentifierInvocation"] | components["schemas"]["FaceMaskInvocation"] | components["schemas"]["FaceOffInvocation"] | components["schemas"]["FloatBatchInvocation"] | components["schemas"]["FloatCollectionInvocation"] | components["schemas"]["FloatGenerator"] | components["schemas"]["FloatInvocation"] | components["schemas"]["FloatLinearRangeInvocation"] | components["schemas"]["FloatMathInvocation"] | components["schemas"]["FloatToIntegerInvocation"] | components["schemas"]["Flux2DenoiseInvocation"] | components["schemas"]["Flux2KleinLoRACollectionLoader"] | components["schemas"]["Flux2KleinLoRALoaderInvocation"] | components["schemas"]["Flux2KleinModelLoaderInvocation"] | components["schemas"]["Flux2KleinTextEncoderInvocation"] | components["schemas"]["Flux2PiDDecodeInvocation"] | components["schemas"]["Flux2VaeDecodeInvocation"] | components["schemas"]["Flux2VaeEncodeInvocation"] | 
components["schemas"]["FluxControlLoRALoaderInvocation"] | components["schemas"]["FluxControlNetInvocation"] | components["schemas"]["FluxDenoiseInvocation"] | components["schemas"]["FluxDenoiseLatentsMetaInvocation"] | components["schemas"]["FluxFillInvocation"] | components["schemas"]["FluxIPAdapterInvocation"] | components["schemas"]["FluxKontextConcatenateImagesInvocation"] | components["schemas"]["FluxKontextInvocation"] | components["schemas"]["FluxLoRALoaderInvocation"] | components["schemas"]["FluxModelLoaderInvocation"] | components["schemas"]["FluxPiDDecodeInvocation"] | components["schemas"]["FluxReduxInvocation"] | components["schemas"]["FluxTextEncoderInvocation"] | components["schemas"]["FluxVaeDecodeInvocation"] | components["schemas"]["FluxVaeEncodeInvocation"] | components["schemas"]["FreeUInvocation"] | components["schemas"]["GeminiImageGenerationInvocation"] | components["schemas"]["Gemma2EncoderLoaderInvocation"] | components["schemas"]["GetMaskBoundingBoxInvocation"] | components["schemas"]["GroundingDinoInvocation"] | components["schemas"]["HEDEdgeDetectionInvocation"] | components["schemas"]["HeuristicResizeInvocation"] | components["schemas"]["IPAdapterInvocation"] | components["schemas"]["IdealSizeInvocation"] | components["schemas"]["IfInvocation"] | components["schemas"]["ImageBatchInvocation"] | components["schemas"]["ImageBlurInvocation"] | components["schemas"]["ImageChannelInvocation"] | components["schemas"]["ImageChannelMultiplyInvocation"] | components["schemas"]["ImageChannelOffsetInvocation"] | components["schemas"]["ImageCollectionInvocation"] | components["schemas"]["ImageConvertInvocation"] | components["schemas"]["ImageCropInvocation"] | components["schemas"]["ImageGenerator"] | components["schemas"]["ImageHueAdjustmentInvocation"] | components["schemas"]["ImageInverseLerpInvocation"] | components["schemas"]["ImageInvocation"] | components["schemas"]["ImageLerpInvocation"] | 
components["schemas"]["ImageMaskToTensorInvocation"] | components["schemas"]["ImageMultiplyInvocation"] | components["schemas"]["ImageNSFWBlurInvocation"] | components["schemas"]["ImageNoiseInvocation"] | components["schemas"]["ImagePanelLayoutInvocation"] | components["schemas"]["ImagePasteInvocation"] | components["schemas"]["ImageResizeInvocation"] | components["schemas"]["ImageScaleInvocation"] | components["schemas"]["ImageToLatentsInvocation"] | components["schemas"]["ImageWatermarkInvocation"] | components["schemas"]["InfillColorInvocation"] | components["schemas"]["InfillPatchMatchInvocation"] | components["schemas"]["InfillTileInvocation"] | components["schemas"]["IntegerBatchInvocation"] | components["schemas"]["IntegerCollectionInvocation"] | components["schemas"]["IntegerGenerator"] | components["schemas"]["IntegerInvocation"] | components["schemas"]["IntegerMathInvocation"] | components["schemas"]["InvertTensorMaskInvocation"] | components["schemas"]["InvokeAdjustImageHuePlusInvocation"] | components["schemas"]["InvokeEquivalentAchromaticLightnessInvocation"] | components["schemas"]["InvokeImageBlendInvocation"] | components["schemas"]["InvokeImageCompositorInvocation"] | components["schemas"]["InvokeImageDilateOrErodeInvocation"] | components["schemas"]["InvokeImageEnhanceInvocation"] | components["schemas"]["InvokeImageValueThresholdsInvocation"] | components["schemas"]["IterateInvocation"] | components["schemas"]["LaMaInfillInvocation"] | components["schemas"]["LatentsCollectionInvocation"] | components["schemas"]["LatentsInvocation"] | components["schemas"]["LatentsToImageInvocation"] | components["schemas"]["LineartAnimeEdgeDetectionInvocation"] | components["schemas"]["LineartEdgeDetectionInvocation"] | components["schemas"]["LlavaOnevisionVllmInvocation"] | components["schemas"]["LoRACollectionLoader"] | components["schemas"]["LoRALoaderInvocation"] | components["schemas"]["LoRASelectorInvocation"] | 
components["schemas"]["MLSDDetectionInvocation"] | components["schemas"]["MainModelLoaderInvocation"] | components["schemas"]["MaskCombineInvocation"] | components["schemas"]["MaskEdgeInvocation"] | components["schemas"]["MaskFromAlphaInvocation"] | components["schemas"]["MaskFromIDInvocation"] | components["schemas"]["MaskTensorToImageInvocation"] | components["schemas"]["MediaPipeFaceDetectionInvocation"] | components["schemas"]["MergeMetadataInvocation"] | components["schemas"]["MergeTilesToImageInvocation"] | components["schemas"]["MetadataFieldExtractorInvocation"] | components["schemas"]["MetadataFromImageInvocation"] | components["schemas"]["MetadataInvocation"] | components["schemas"]["MetadataItemInvocation"] | components["schemas"]["MetadataItemLinkedInvocation"] | components["schemas"]["MetadataToBoolCollectionInvocation"] | components["schemas"]["MetadataToBoolInvocation"] | components["schemas"]["MetadataToControlnetsInvocation"] | components["schemas"]["MetadataToFloatCollectionInvocation"] | components["schemas"]["MetadataToFloatInvocation"] | components["schemas"]["MetadataToIPAdaptersInvocation"] | components["schemas"]["MetadataToIntegerCollectionInvocation"] | components["schemas"]["MetadataToIntegerInvocation"] | components["schemas"]["MetadataToLorasCollectionInvocation"] | components["schemas"]["MetadataToLorasInvocation"] | components["schemas"]["MetadataToModelInvocation"] | components["schemas"]["MetadataToSDXLLorasInvocation"] | components["schemas"]["MetadataToSDXLModelInvocation"] | components["schemas"]["MetadataToSchedulerInvocation"] | components["schemas"]["MetadataToStringCollectionInvocation"] | components["schemas"]["MetadataToStringInvocation"] | components["schemas"]["MetadataToT2IAdaptersInvocation"] | components["schemas"]["MetadataToVAEInvocation"] | components["schemas"]["ModelIdentifierInvocation"] | components["schemas"]["MultiplyInvocation"] | components["schemas"]["NoiseInvocation"] | 
components["schemas"]["NormalMapInvocation"] | components["schemas"]["OklabUnsharpMaskInvocation"] | components["schemas"]["OklchImageHueAdjustmentInvocation"] | components["schemas"]["OpenAIImageGenerationInvocation"] | components["schemas"]["PBRMapsInvocation"] | components["schemas"]["PairTileImageInvocation"] | components["schemas"]["PasteImageIntoBoundingBoxInvocation"] | components["schemas"]["PiDDecoderLoaderInvocation"] | components["schemas"]["PiDUpscaleInvocation"] | components["schemas"]["PiDiNetEdgeDetectionInvocation"] | components["schemas"]["PromptTemplateInvocation"] | components["schemas"]["PromptsFromFileInvocation"] | components["schemas"]["QwenImageDenoiseInvocation"] | components["schemas"]["QwenImageImageToLatentsInvocation"] | components["schemas"]["QwenImageLatentsToImageInvocation"] | components["schemas"]["QwenImageLoRACollectionLoader"] | components["schemas"]["QwenImageLoRALoaderInvocation"] | components["schemas"]["QwenImageModelLoaderInvocation"] | components["schemas"]["QwenImagePiDDecodeInvocation"] | components["schemas"]["QwenImageTextEncoderInvocation"] | components["schemas"]["RandomFloatInvocation"] | components["schemas"]["RandomIntInvocation"] | components["schemas"]["RandomRangeInvocation"] | components["schemas"]["RangeInvocation"] | components["schemas"]["RangeOfSizeInvocation"] | components["schemas"]["RectangleMaskInvocation"] | components["schemas"]["ResizeLatentsInvocation"] | components["schemas"]["RoundInvocation"] | components["schemas"]["SD3DenoiseInvocation"] | components["schemas"]["SD3ImageToLatentsInvocation"] | components["schemas"]["SD3LatentsToImageInvocation"] | components["schemas"]["SD3PiDDecodeInvocation"] | components["schemas"]["SDXLCompelPromptInvocation"] | components["schemas"]["SDXLLoRACollectionLoader"] | components["schemas"]["SDXLLoRALoaderInvocation"] | components["schemas"]["SDXLModelLoaderInvocation"] | components["schemas"]["SDXLPiDDecodeInvocation"] | 
components["schemas"]["SDXLRefinerCompelPromptInvocation"] | components["schemas"]["SDXLRefinerModelLoaderInvocation"] | components["schemas"]["SaveImageInvocation"] | components["schemas"]["SaveImageToFileInvocation"] | components["schemas"]["ScaleLatentsInvocation"] | components["schemas"]["SchedulerInvocation"] | components["schemas"]["Sd3ModelLoaderInvocation"] | components["schemas"]["Sd3TextEncoderInvocation"] | components["schemas"]["SeamlessModeInvocation"] | components["schemas"]["SeedreamImageGenerationInvocation"] | components["schemas"]["SegmentAnythingInvocation"] | components["schemas"]["ShowImageInvocation"] | components["schemas"]["SpandrelImageToImageAutoscaleInvocation"] | components["schemas"]["SpandrelImageToImageInvocation"] | components["schemas"]["StringBatchInvocation"] | components["schemas"]["StringCollectionInvocation"] | components["schemas"]["StringGenerator"] | components["schemas"]["StringInvocation"] | components["schemas"]["StringJoinInvocation"] | components["schemas"]["StringJoinThreeInvocation"] | components["schemas"]["StringReplaceInvocation"] | components["schemas"]["StringSplitInvocation"] | components["schemas"]["StringSplitNegInvocation"] | components["schemas"]["SubtractInvocation"] | components["schemas"]["T2IAdapterInvocation"] | components["schemas"]["TextLLMInvocation"] | components["schemas"]["TileToPropertiesInvocation"] | components["schemas"]["TiledMultiDiffusionDenoiseLatents"] | components["schemas"]["UnsharpMaskInvocation"] | components["schemas"]["VAELoaderInvocation"] | components["schemas"]["ZImageControlInvocation"] | components["schemas"]["ZImageDenoiseInvocation"] | components["schemas"]["ZImageDenoiseMetaInvocation"] | components["schemas"]["ZImageImageToLatentsInvocation"] | components["schemas"]["ZImageLatentsToImageInvocation"] | components["schemas"]["ZImageLoRACollectionLoader"] | components["schemas"]["ZImageLoRALoaderInvocation"] | components["schemas"]["ZImageModelLoaderInvocation"] | 
components["schemas"]["ZImagePiDDecodeInvocation"] | components["schemas"]["ZImageSeedVarianceEnhancerInvocation"] | components["schemas"]["ZImageTextEncoderInvocation"];
/**
* Invocation Source Id
* @description The ID of the prepared invocation's source node
@@ -15936,6 +16248,7 @@ export type components = {
flux2_klein_lora_loader: components["schemas"]["Flux2KleinLoRALoaderOutput"];
flux2_klein_model_loader: components["schemas"]["Flux2KleinModelLoaderOutput"];
flux2_klein_text_encoder: components["schemas"]["FluxConditioningOutput"];
+ flux2_pid_decode: components["schemas"]["ImageOutput"];
flux2_vae_decode: components["schemas"]["ImageOutput"];
flux2_vae_encode: components["schemas"]["LatentsOutput"];
flux_control_lora_loader: components["schemas"]["FluxControlLoRALoaderOutput"];
@@ -15949,12 +16262,14 @@ export type components = {
flux_lora_collection_loader: components["schemas"]["FluxLoRALoaderOutput"];
flux_lora_loader: components["schemas"]["FluxLoRALoaderOutput"];
flux_model_loader: components["schemas"]["FluxModelLoaderOutput"];
+ flux_pid_decode: components["schemas"]["ImageOutput"];
flux_redux: components["schemas"]["FluxReduxOutput"];
flux_text_encoder: components["schemas"]["FluxConditioningOutput"];
flux_vae_decode: components["schemas"]["ImageOutput"];
flux_vae_encode: components["schemas"]["LatentsOutput"];
freeu: components["schemas"]["UNetOutput"];
gemini_image_generation: components["schemas"]["ImageCollectionOutput"];
+ gemma2_encoder_loader: components["schemas"]["Gemma2EncoderOutput"];
get_image_mask_bounding_box: components["schemas"]["BoundingBoxOutput"];
grounding_dino: components["schemas"]["BoundingBoxCollectionOutput"];
hed_edge_detection: components["schemas"]["ImageOutput"];
@@ -16057,6 +16372,8 @@ export type components = {
pair_tile_image: components["schemas"]["PairTileImageOutput"];
paste_image_into_bounding_box: components["schemas"]["ImageOutput"];
pbr_maps: components["schemas"]["PBRMapsOutput"];
+ pid_decoder_loader: components["schemas"]["PiDDecoderOutput"];
+ pid_upscale: components["schemas"]["ImageOutput"];
pidi_edge_detection: components["schemas"]["ImageOutput"];
prompt_from_file: components["schemas"]["StringCollectionOutput"];
prompt_template: components["schemas"]["PromptTemplateOutput"];
@@ -16066,6 +16383,7 @@ export type components = {
qwen_image_lora_collection_loader: components["schemas"]["QwenImageLoRALoaderOutput"];
qwen_image_lora_loader: components["schemas"]["QwenImageLoRALoaderOutput"];
qwen_image_model_loader: components["schemas"]["QwenImageModelLoaderOutput"];
+ qwen_image_pid_decode: components["schemas"]["ImageOutput"];
qwen_image_text_encoder: components["schemas"]["QwenImageConditioningOutput"];
rand_float: components["schemas"]["FloatOutput"];
rand_int: components["schemas"]["IntegerOutput"];
@@ -16081,11 +16399,13 @@ export type components = {
sd3_i2l: components["schemas"]["LatentsOutput"];
sd3_l2i: components["schemas"]["ImageOutput"];
sd3_model_loader: components["schemas"]["Sd3ModelLoaderOutput"];
+ sd3_pid_decode: components["schemas"]["ImageOutput"];
sd3_text_encoder: components["schemas"]["SD3ConditioningOutput"];
sdxl_compel_prompt: components["schemas"]["ConditioningOutput"];
sdxl_lora_collection_loader: components["schemas"]["SDXLLoRALoaderOutput"];
sdxl_lora_loader: components["schemas"]["SDXLLoRALoaderOutput"];
sdxl_model_loader: components["schemas"]["SDXLModelLoaderOutput"];
+ sdxl_pid_decode: components["schemas"]["ImageOutput"];
sdxl_refiner_compel_prompt: components["schemas"]["ConditioningOutput"];
sdxl_refiner_model_loader: components["schemas"]["SDXLRefinerModelLoaderOutput"];
seamless: components["schemas"]["SeamlessModeOutput"];
@@ -16121,6 +16441,7 @@ export type components = {
z_image_lora_collection_loader: components["schemas"]["ZImageLoRALoaderOutput"];
z_image_lora_loader: components["schemas"]["ZImageLoRALoaderOutput"];
z_image_model_loader: components["schemas"]["ZImageModelLoaderOutput"];
+ z_image_pid_decode: components["schemas"]["ImageOutput"];
z_image_seed_variance_enhancer: components["schemas"]["ZImageConditioningOutput"];
z_image_text_encoder: components["schemas"]["ZImageConditioningOutput"];
};
@@ -16176,7 +16497,7 @@ export type components = {
* Invocation
* @description The ID of the invocation
*/
- invocation: components["schemas"]["AddInvocation"] | components["schemas"]["AlibabaCloudImageGenerationInvocation"] | components["schemas"]["AlphaMaskToTensorInvocation"] | components["schemas"]["AnimaDenoiseInvocation"] | components["schemas"]["AnimaImageToLatentsInvocation"] | components["schemas"]["AnimaLatentsToImageInvocation"] | components["schemas"]["AnimaLoRACollectionLoader"] | components["schemas"]["AnimaLoRALoaderInvocation"] | components["schemas"]["AnimaModelLoaderInvocation"] | components["schemas"]["AnimaTextEncoderInvocation"] | components["schemas"]["ApplyMaskTensorToImageInvocation"] | components["schemas"]["ApplyMaskToImageInvocation"] | components["schemas"]["BlankImageInvocation"] | components["schemas"]["BlendLatentsInvocation"] | components["schemas"]["BooleanCollectionInvocation"] | components["schemas"]["BooleanInvocation"] | components["schemas"]["BoundingBoxInvocation"] | components["schemas"]["CLIPSkipInvocation"] | components["schemas"]["CV2InfillInvocation"] | components["schemas"]["CalculateImageTilesEvenSplitInvocation"] | components["schemas"]["CalculateImageTilesInvocation"] | components["schemas"]["CalculateImageTilesMinimumOverlapInvocation"] | components["schemas"]["CannyEdgeDetectionInvocation"] | components["schemas"]["CanvasOutputInvocation"] | components["schemas"]["CanvasPasteBackInvocation"] | components["schemas"]["CanvasV2MaskAndCropInvocation"] | components["schemas"]["CenterPadCropInvocation"] | components["schemas"]["CogView4DenoiseInvocation"] | components["schemas"]["CogView4ImageToLatentsInvocation"] | components["schemas"]["CogView4LatentsToImageInvocation"] | components["schemas"]["CogView4ModelLoaderInvocation"] | components["schemas"]["CogView4TextEncoderInvocation"] | components["schemas"]["CollectInvocation"] | components["schemas"]["ColorCorrectInvocation"] | components["schemas"]["ColorInvocation"] | components["schemas"]["ColorMapInvocation"] | components["schemas"]["CompelInvocation"] | 
components["schemas"]["ConditioningCollectionInvocation"] | components["schemas"]["ConditioningInvocation"] | components["schemas"]["ContentShuffleInvocation"] | components["schemas"]["ControlNetInvocation"] | components["schemas"]["CoreMetadataInvocation"] | components["schemas"]["CreateDenoiseMaskInvocation"] | components["schemas"]["CreateGradientMaskInvocation"] | components["schemas"]["CropImageToBoundingBoxInvocation"] | components["schemas"]["CropLatentsCoreInvocation"] | components["schemas"]["CvInpaintInvocation"] | components["schemas"]["DWOpenposeDetectionInvocation"] | components["schemas"]["DecodeInvisibleWatermarkInvocation"] | components["schemas"]["DenoiseLatentsInvocation"] | components["schemas"]["DenoiseLatentsMetaInvocation"] | components["schemas"]["DepthAnythingDepthEstimationInvocation"] | components["schemas"]["DivideInvocation"] | components["schemas"]["DynamicPromptInvocation"] | components["schemas"]["ESRGANInvocation"] | components["schemas"]["ExpandMaskWithFadeInvocation"] | components["schemas"]["FLUXLoRACollectionLoader"] | components["schemas"]["FaceIdentifierInvocation"] | components["schemas"]["FaceMaskInvocation"] | components["schemas"]["FaceOffInvocation"] | components["schemas"]["FloatBatchInvocation"] | components["schemas"]["FloatCollectionInvocation"] | components["schemas"]["FloatGenerator"] | components["schemas"]["FloatInvocation"] | components["schemas"]["FloatLinearRangeInvocation"] | components["schemas"]["FloatMathInvocation"] | components["schemas"]["FloatToIntegerInvocation"] | components["schemas"]["Flux2DenoiseInvocation"] | components["schemas"]["Flux2KleinLoRACollectionLoader"] | components["schemas"]["Flux2KleinLoRALoaderInvocation"] | components["schemas"]["Flux2KleinModelLoaderInvocation"] | components["schemas"]["Flux2KleinTextEncoderInvocation"] | components["schemas"]["Flux2VaeDecodeInvocation"] | components["schemas"]["Flux2VaeEncodeInvocation"] | components["schemas"]["FluxControlLoRALoaderInvocation"] | 
components["schemas"]["FluxControlNetInvocation"] | components["schemas"]["FluxDenoiseInvocation"] | components["schemas"]["FluxDenoiseLatentsMetaInvocation"] | components["schemas"]["FluxFillInvocation"] | components["schemas"]["FluxIPAdapterInvocation"] | components["schemas"]["FluxKontextConcatenateImagesInvocation"] | components["schemas"]["FluxKontextInvocation"] | components["schemas"]["FluxLoRALoaderInvocation"] | components["schemas"]["FluxModelLoaderInvocation"] | components["schemas"]["FluxReduxInvocation"] | components["schemas"]["FluxTextEncoderInvocation"] | components["schemas"]["FluxVaeDecodeInvocation"] | components["schemas"]["FluxVaeEncodeInvocation"] | components["schemas"]["FreeUInvocation"] | components["schemas"]["GeminiImageGenerationInvocation"] | components["schemas"]["GetMaskBoundingBoxInvocation"] | components["schemas"]["GroundingDinoInvocation"] | components["schemas"]["HEDEdgeDetectionInvocation"] | components["schemas"]["HeuristicResizeInvocation"] | components["schemas"]["IPAdapterInvocation"] | components["schemas"]["IdealSizeInvocation"] | components["schemas"]["IfInvocation"] | components["schemas"]["ImageBatchInvocation"] | components["schemas"]["ImageBlurInvocation"] | components["schemas"]["ImageChannelInvocation"] | components["schemas"]["ImageChannelMultiplyInvocation"] | components["schemas"]["ImageChannelOffsetInvocation"] | components["schemas"]["ImageCollectionInvocation"] | components["schemas"]["ImageConvertInvocation"] | components["schemas"]["ImageCropInvocation"] | components["schemas"]["ImageGenerator"] | components["schemas"]["ImageHueAdjustmentInvocation"] | components["schemas"]["ImageInverseLerpInvocation"] | components["schemas"]["ImageInvocation"] | components["schemas"]["ImageLerpInvocation"] | components["schemas"]["ImageMaskToTensorInvocation"] | components["schemas"]["ImageMultiplyInvocation"] | components["schemas"]["ImageNSFWBlurInvocation"] | components["schemas"]["ImageNoiseInvocation"] | 
components["schemas"]["ImagePanelLayoutInvocation"] | components["schemas"]["ImagePasteInvocation"] | components["schemas"]["ImageResizeInvocation"] | components["schemas"]["ImageScaleInvocation"] | components["schemas"]["ImageToLatentsInvocation"] | components["schemas"]["ImageWatermarkInvocation"] | components["schemas"]["InfillColorInvocation"] | components["schemas"]["InfillPatchMatchInvocation"] | components["schemas"]["InfillTileInvocation"] | components["schemas"]["IntegerBatchInvocation"] | components["schemas"]["IntegerCollectionInvocation"] | components["schemas"]["IntegerGenerator"] | components["schemas"]["IntegerInvocation"] | components["schemas"]["IntegerMathInvocation"] | components["schemas"]["InvertTensorMaskInvocation"] | components["schemas"]["InvokeAdjustImageHuePlusInvocation"] | components["schemas"]["InvokeEquivalentAchromaticLightnessInvocation"] | components["schemas"]["InvokeImageBlendInvocation"] | components["schemas"]["InvokeImageCompositorInvocation"] | components["schemas"]["InvokeImageDilateOrErodeInvocation"] | components["schemas"]["InvokeImageEnhanceInvocation"] | components["schemas"]["InvokeImageValueThresholdsInvocation"] | components["schemas"]["IterateInvocation"] | components["schemas"]["LaMaInfillInvocation"] | components["schemas"]["LatentsCollectionInvocation"] | components["schemas"]["LatentsInvocation"] | components["schemas"]["LatentsToImageInvocation"] | components["schemas"]["LineartAnimeEdgeDetectionInvocation"] | components["schemas"]["LineartEdgeDetectionInvocation"] | components["schemas"]["LlavaOnevisionVllmInvocation"] | components["schemas"]["LoRACollectionLoader"] | components["schemas"]["LoRALoaderInvocation"] | components["schemas"]["LoRASelectorInvocation"] | components["schemas"]["MLSDDetectionInvocation"] | components["schemas"]["MainModelLoaderInvocation"] | components["schemas"]["MaskCombineInvocation"] | components["schemas"]["MaskEdgeInvocation"] | components["schemas"]["MaskFromAlphaInvocation"] | 
components["schemas"]["MaskFromIDInvocation"] | components["schemas"]["MaskTensorToImageInvocation"] | components["schemas"]["MediaPipeFaceDetectionInvocation"] | components["schemas"]["MergeMetadataInvocation"] | components["schemas"]["MergeTilesToImageInvocation"] | components["schemas"]["MetadataFieldExtractorInvocation"] | components["schemas"]["MetadataFromImageInvocation"] | components["schemas"]["MetadataInvocation"] | components["schemas"]["MetadataItemInvocation"] | components["schemas"]["MetadataItemLinkedInvocation"] | components["schemas"]["MetadataToBoolCollectionInvocation"] | components["schemas"]["MetadataToBoolInvocation"] | components["schemas"]["MetadataToControlnetsInvocation"] | components["schemas"]["MetadataToFloatCollectionInvocation"] | components["schemas"]["MetadataToFloatInvocation"] | components["schemas"]["MetadataToIPAdaptersInvocation"] | components["schemas"]["MetadataToIntegerCollectionInvocation"] | components["schemas"]["MetadataToIntegerInvocation"] | components["schemas"]["MetadataToLorasCollectionInvocation"] | components["schemas"]["MetadataToLorasInvocation"] | components["schemas"]["MetadataToModelInvocation"] | components["schemas"]["MetadataToSDXLLorasInvocation"] | components["schemas"]["MetadataToSDXLModelInvocation"] | components["schemas"]["MetadataToSchedulerInvocation"] | components["schemas"]["MetadataToStringCollectionInvocation"] | components["schemas"]["MetadataToStringInvocation"] | components["schemas"]["MetadataToT2IAdaptersInvocation"] | components["schemas"]["MetadataToVAEInvocation"] | components["schemas"]["ModelIdentifierInvocation"] | components["schemas"]["MultiplyInvocation"] | components["schemas"]["NoiseInvocation"] | components["schemas"]["NormalMapInvocation"] | components["schemas"]["OklabUnsharpMaskInvocation"] | components["schemas"]["OklchImageHueAdjustmentInvocation"] | components["schemas"]["OpenAIImageGenerationInvocation"] | components["schemas"]["PBRMapsInvocation"] | 
components["schemas"]["PairTileImageInvocation"] | components["schemas"]["PasteImageIntoBoundingBoxInvocation"] | components["schemas"]["PiDiNetEdgeDetectionInvocation"] | components["schemas"]["PromptTemplateInvocation"] | components["schemas"]["PromptsFromFileInvocation"] | components["schemas"]["QwenImageDenoiseInvocation"] | components["schemas"]["QwenImageImageToLatentsInvocation"] | components["schemas"]["QwenImageLatentsToImageInvocation"] | components["schemas"]["QwenImageLoRACollectionLoader"] | components["schemas"]["QwenImageLoRALoaderInvocation"] | components["schemas"]["QwenImageModelLoaderInvocation"] | components["schemas"]["QwenImageTextEncoderInvocation"] | components["schemas"]["RandomFloatInvocation"] | components["schemas"]["RandomIntInvocation"] | components["schemas"]["RandomRangeInvocation"] | components["schemas"]["RangeInvocation"] | components["schemas"]["RangeOfSizeInvocation"] | components["schemas"]["RectangleMaskInvocation"] | components["schemas"]["ResizeLatentsInvocation"] | components["schemas"]["RoundInvocation"] | components["schemas"]["SD3DenoiseInvocation"] | components["schemas"]["SD3ImageToLatentsInvocation"] | components["schemas"]["SD3LatentsToImageInvocation"] | components["schemas"]["SDXLCompelPromptInvocation"] | components["schemas"]["SDXLLoRACollectionLoader"] | components["schemas"]["SDXLLoRALoaderInvocation"] | components["schemas"]["SDXLModelLoaderInvocation"] | components["schemas"]["SDXLRefinerCompelPromptInvocation"] | components["schemas"]["SDXLRefinerModelLoaderInvocation"] | components["schemas"]["SaveImageInvocation"] | components["schemas"]["SaveImageToFileInvocation"] | components["schemas"]["ScaleLatentsInvocation"] | components["schemas"]["SchedulerInvocation"] | components["schemas"]["Sd3ModelLoaderInvocation"] | components["schemas"]["Sd3TextEncoderInvocation"] | components["schemas"]["SeamlessModeInvocation"] | components["schemas"]["SeedreamImageGenerationInvocation"] | 
components["schemas"]["SegmentAnythingInvocation"] | components["schemas"]["ShowImageInvocation"] | components["schemas"]["SpandrelImageToImageAutoscaleInvocation"] | components["schemas"]["SpandrelImageToImageInvocation"] | components["schemas"]["StringBatchInvocation"] | components["schemas"]["StringCollectionInvocation"] | components["schemas"]["StringGenerator"] | components["schemas"]["StringInvocation"] | components["schemas"]["StringJoinInvocation"] | components["schemas"]["StringJoinThreeInvocation"] | components["schemas"]["StringReplaceInvocation"] | components["schemas"]["StringSplitInvocation"] | components["schemas"]["StringSplitNegInvocation"] | components["schemas"]["SubtractInvocation"] | components["schemas"]["T2IAdapterInvocation"] | components["schemas"]["TextLLMInvocation"] | components["schemas"]["TileToPropertiesInvocation"] | components["schemas"]["TiledMultiDiffusionDenoiseLatents"] | components["schemas"]["UnsharpMaskInvocation"] | components["schemas"]["VAELoaderInvocation"] | components["schemas"]["ZImageControlInvocation"] | components["schemas"]["ZImageDenoiseInvocation"] | components["schemas"]["ZImageDenoiseMetaInvocation"] | components["schemas"]["ZImageImageToLatentsInvocation"] | components["schemas"]["ZImageLatentsToImageInvocation"] | components["schemas"]["ZImageLoRACollectionLoader"] | components["schemas"]["ZImageLoRALoaderInvocation"] | components["schemas"]["ZImageModelLoaderInvocation"] | components["schemas"]["ZImageSeedVarianceEnhancerInvocation"] | components["schemas"]["ZImageTextEncoderInvocation"];
+ invocation: components["schemas"]["AddInvocation"] | components["schemas"]["AlibabaCloudImageGenerationInvocation"] | components["schemas"]["AlphaMaskToTensorInvocation"] | components["schemas"]["AnimaDenoiseInvocation"] | components["schemas"]["AnimaImageToLatentsInvocation"] | components["schemas"]["AnimaLatentsToImageInvocation"] | components["schemas"]["AnimaLoRACollectionLoader"] | components["schemas"]["AnimaLoRALoaderInvocation"] | components["schemas"]["AnimaModelLoaderInvocation"] | components["schemas"]["AnimaTextEncoderInvocation"] | components["schemas"]["ApplyMaskTensorToImageInvocation"] | components["schemas"]["ApplyMaskToImageInvocation"] | components["schemas"]["BlankImageInvocation"] | components["schemas"]["BlendLatentsInvocation"] | components["schemas"]["BooleanCollectionInvocation"] | components["schemas"]["BooleanInvocation"] | components["schemas"]["BoundingBoxInvocation"] | components["schemas"]["CLIPSkipInvocation"] | components["schemas"]["CV2InfillInvocation"] | components["schemas"]["CalculateImageTilesEvenSplitInvocation"] | components["schemas"]["CalculateImageTilesInvocation"] | components["schemas"]["CalculateImageTilesMinimumOverlapInvocation"] | components["schemas"]["CannyEdgeDetectionInvocation"] | components["schemas"]["CanvasOutputInvocation"] | components["schemas"]["CanvasPasteBackInvocation"] | components["schemas"]["CanvasV2MaskAndCropInvocation"] | components["schemas"]["CenterPadCropInvocation"] | components["schemas"]["CogView4DenoiseInvocation"] | components["schemas"]["CogView4ImageToLatentsInvocation"] | components["schemas"]["CogView4LatentsToImageInvocation"] | components["schemas"]["CogView4ModelLoaderInvocation"] | components["schemas"]["CogView4TextEncoderInvocation"] | components["schemas"]["CollectInvocation"] | components["schemas"]["ColorCorrectInvocation"] | components["schemas"]["ColorInvocation"] | components["schemas"]["ColorMapInvocation"] | components["schemas"]["CompelInvocation"] | 
components["schemas"]["ConditioningCollectionInvocation"] | components["schemas"]["ConditioningInvocation"] | components["schemas"]["ContentShuffleInvocation"] | components["schemas"]["ControlNetInvocation"] | components["schemas"]["CoreMetadataInvocation"] | components["schemas"]["CreateDenoiseMaskInvocation"] | components["schemas"]["CreateGradientMaskInvocation"] | components["schemas"]["CropImageToBoundingBoxInvocation"] | components["schemas"]["CropLatentsCoreInvocation"] | components["schemas"]["CvInpaintInvocation"] | components["schemas"]["DWOpenposeDetectionInvocation"] | components["schemas"]["DecodeInvisibleWatermarkInvocation"] | components["schemas"]["DenoiseLatentsInvocation"] | components["schemas"]["DenoiseLatentsMetaInvocation"] | components["schemas"]["DepthAnythingDepthEstimationInvocation"] | components["schemas"]["DivideInvocation"] | components["schemas"]["DynamicPromptInvocation"] | components["schemas"]["ESRGANInvocation"] | components["schemas"]["ExpandMaskWithFadeInvocation"] | components["schemas"]["FLUXLoRACollectionLoader"] | components["schemas"]["FaceIdentifierInvocation"] | components["schemas"]["FaceMaskInvocation"] | components["schemas"]["FaceOffInvocation"] | components["schemas"]["FloatBatchInvocation"] | components["schemas"]["FloatCollectionInvocation"] | components["schemas"]["FloatGenerator"] | components["schemas"]["FloatInvocation"] | components["schemas"]["FloatLinearRangeInvocation"] | components["schemas"]["FloatMathInvocation"] | components["schemas"]["FloatToIntegerInvocation"] | components["schemas"]["Flux2DenoiseInvocation"] | components["schemas"]["Flux2KleinLoRACollectionLoader"] | components["schemas"]["Flux2KleinLoRALoaderInvocation"] | components["schemas"]["Flux2KleinModelLoaderInvocation"] | components["schemas"]["Flux2KleinTextEncoderInvocation"] | components["schemas"]["Flux2PiDDecodeInvocation"] | components["schemas"]["Flux2VaeDecodeInvocation"] | components["schemas"]["Flux2VaeEncodeInvocation"] | 
components["schemas"]["FluxControlLoRALoaderInvocation"] | components["schemas"]["FluxControlNetInvocation"] | components["schemas"]["FluxDenoiseInvocation"] | components["schemas"]["FluxDenoiseLatentsMetaInvocation"] | components["schemas"]["FluxFillInvocation"] | components["schemas"]["FluxIPAdapterInvocation"] | components["schemas"]["FluxKontextConcatenateImagesInvocation"] | components["schemas"]["FluxKontextInvocation"] | components["schemas"]["FluxLoRALoaderInvocation"] | components["schemas"]["FluxModelLoaderInvocation"] | components["schemas"]["FluxPiDDecodeInvocation"] | components["schemas"]["FluxReduxInvocation"] | components["schemas"]["FluxTextEncoderInvocation"] | components["schemas"]["FluxVaeDecodeInvocation"] | components["schemas"]["FluxVaeEncodeInvocation"] | components["schemas"]["FreeUInvocation"] | components["schemas"]["GeminiImageGenerationInvocation"] | components["schemas"]["Gemma2EncoderLoaderInvocation"] | components["schemas"]["GetMaskBoundingBoxInvocation"] | components["schemas"]["GroundingDinoInvocation"] | components["schemas"]["HEDEdgeDetectionInvocation"] | components["schemas"]["HeuristicResizeInvocation"] | components["schemas"]["IPAdapterInvocation"] | components["schemas"]["IdealSizeInvocation"] | components["schemas"]["IfInvocation"] | components["schemas"]["ImageBatchInvocation"] | components["schemas"]["ImageBlurInvocation"] | components["schemas"]["ImageChannelInvocation"] | components["schemas"]["ImageChannelMultiplyInvocation"] | components["schemas"]["ImageChannelOffsetInvocation"] | components["schemas"]["ImageCollectionInvocation"] | components["schemas"]["ImageConvertInvocation"] | components["schemas"]["ImageCropInvocation"] | components["schemas"]["ImageGenerator"] | components["schemas"]["ImageHueAdjustmentInvocation"] | components["schemas"]["ImageInverseLerpInvocation"] | components["schemas"]["ImageInvocation"] | components["schemas"]["ImageLerpInvocation"] | 
components["schemas"]["ImageMaskToTensorInvocation"] | components["schemas"]["ImageMultiplyInvocation"] | components["schemas"]["ImageNSFWBlurInvocation"] | components["schemas"]["ImageNoiseInvocation"] | components["schemas"]["ImagePanelLayoutInvocation"] | components["schemas"]["ImagePasteInvocation"] | components["schemas"]["ImageResizeInvocation"] | components["schemas"]["ImageScaleInvocation"] | components["schemas"]["ImageToLatentsInvocation"] | components["schemas"]["ImageWatermarkInvocation"] | components["schemas"]["InfillColorInvocation"] | components["schemas"]["InfillPatchMatchInvocation"] | components["schemas"]["InfillTileInvocation"] | components["schemas"]["IntegerBatchInvocation"] | components["schemas"]["IntegerCollectionInvocation"] | components["schemas"]["IntegerGenerator"] | components["schemas"]["IntegerInvocation"] | components["schemas"]["IntegerMathInvocation"] | components["schemas"]["InvertTensorMaskInvocation"] | components["schemas"]["InvokeAdjustImageHuePlusInvocation"] | components["schemas"]["InvokeEquivalentAchromaticLightnessInvocation"] | components["schemas"]["InvokeImageBlendInvocation"] | components["schemas"]["InvokeImageCompositorInvocation"] | components["schemas"]["InvokeImageDilateOrErodeInvocation"] | components["schemas"]["InvokeImageEnhanceInvocation"] | components["schemas"]["InvokeImageValueThresholdsInvocation"] | components["schemas"]["IterateInvocation"] | components["schemas"]["LaMaInfillInvocation"] | components["schemas"]["LatentsCollectionInvocation"] | components["schemas"]["LatentsInvocation"] | components["schemas"]["LatentsToImageInvocation"] | components["schemas"]["LineartAnimeEdgeDetectionInvocation"] | components["schemas"]["LineartEdgeDetectionInvocation"] | components["schemas"]["LlavaOnevisionVllmInvocation"] | components["schemas"]["LoRACollectionLoader"] | components["schemas"]["LoRALoaderInvocation"] | components["schemas"]["LoRASelectorInvocation"] | 
components["schemas"]["MLSDDetectionInvocation"] | components["schemas"]["MainModelLoaderInvocation"] | components["schemas"]["MaskCombineInvocation"] | components["schemas"]["MaskEdgeInvocation"] | components["schemas"]["MaskFromAlphaInvocation"] | components["schemas"]["MaskFromIDInvocation"] | components["schemas"]["MaskTensorToImageInvocation"] | components["schemas"]["MediaPipeFaceDetectionInvocation"] | components["schemas"]["MergeMetadataInvocation"] | components["schemas"]["MergeTilesToImageInvocation"] | components["schemas"]["MetadataFieldExtractorInvocation"] | components["schemas"]["MetadataFromImageInvocation"] | components["schemas"]["MetadataInvocation"] | components["schemas"]["MetadataItemInvocation"] | components["schemas"]["MetadataItemLinkedInvocation"] | components["schemas"]["MetadataToBoolCollectionInvocation"] | components["schemas"]["MetadataToBoolInvocation"] | components["schemas"]["MetadataToControlnetsInvocation"] | components["schemas"]["MetadataToFloatCollectionInvocation"] | components["schemas"]["MetadataToFloatInvocation"] | components["schemas"]["MetadataToIPAdaptersInvocation"] | components["schemas"]["MetadataToIntegerCollectionInvocation"] | components["schemas"]["MetadataToIntegerInvocation"] | components["schemas"]["MetadataToLorasCollectionInvocation"] | components["schemas"]["MetadataToLorasInvocation"] | components["schemas"]["MetadataToModelInvocation"] | components["schemas"]["MetadataToSDXLLorasInvocation"] | components["schemas"]["MetadataToSDXLModelInvocation"] | components["schemas"]["MetadataToSchedulerInvocation"] | components["schemas"]["MetadataToStringCollectionInvocation"] | components["schemas"]["MetadataToStringInvocation"] | components["schemas"]["MetadataToT2IAdaptersInvocation"] | components["schemas"]["MetadataToVAEInvocation"] | components["schemas"]["ModelIdentifierInvocation"] | components["schemas"]["MultiplyInvocation"] | components["schemas"]["NoiseInvocation"] | 
components["schemas"]["NormalMapInvocation"] | components["schemas"]["OklabUnsharpMaskInvocation"] | components["schemas"]["OklchImageHueAdjustmentInvocation"] | components["schemas"]["OpenAIImageGenerationInvocation"] | components["schemas"]["PBRMapsInvocation"] | components["schemas"]["PairTileImageInvocation"] | components["schemas"]["PasteImageIntoBoundingBoxInvocation"] | components["schemas"]["PiDDecoderLoaderInvocation"] | components["schemas"]["PiDUpscaleInvocation"] | components["schemas"]["PiDiNetEdgeDetectionInvocation"] | components["schemas"]["PromptTemplateInvocation"] | components["schemas"]["PromptsFromFileInvocation"] | components["schemas"]["QwenImageDenoiseInvocation"] | components["schemas"]["QwenImageImageToLatentsInvocation"] | components["schemas"]["QwenImageLatentsToImageInvocation"] | components["schemas"]["QwenImageLoRACollectionLoader"] | components["schemas"]["QwenImageLoRALoaderInvocation"] | components["schemas"]["QwenImageModelLoaderInvocation"] | components["schemas"]["QwenImagePiDDecodeInvocation"] | components["schemas"]["QwenImageTextEncoderInvocation"] | components["schemas"]["RandomFloatInvocation"] | components["schemas"]["RandomIntInvocation"] | components["schemas"]["RandomRangeInvocation"] | components["schemas"]["RangeInvocation"] | components["schemas"]["RangeOfSizeInvocation"] | components["schemas"]["RectangleMaskInvocation"] | components["schemas"]["ResizeLatentsInvocation"] | components["schemas"]["RoundInvocation"] | components["schemas"]["SD3DenoiseInvocation"] | components["schemas"]["SD3ImageToLatentsInvocation"] | components["schemas"]["SD3LatentsToImageInvocation"] | components["schemas"]["SD3PiDDecodeInvocation"] | components["schemas"]["SDXLCompelPromptInvocation"] | components["schemas"]["SDXLLoRACollectionLoader"] | components["schemas"]["SDXLLoRALoaderInvocation"] | components["schemas"]["SDXLModelLoaderInvocation"] | components["schemas"]["SDXLPiDDecodeInvocation"] | 
components["schemas"]["SDXLRefinerCompelPromptInvocation"] | components["schemas"]["SDXLRefinerModelLoaderInvocation"] | components["schemas"]["SaveImageInvocation"] | components["schemas"]["SaveImageToFileInvocation"] | components["schemas"]["ScaleLatentsInvocation"] | components["schemas"]["SchedulerInvocation"] | components["schemas"]["Sd3ModelLoaderInvocation"] | components["schemas"]["Sd3TextEncoderInvocation"] | components["schemas"]["SeamlessModeInvocation"] | components["schemas"]["SeedreamImageGenerationInvocation"] | components["schemas"]["SegmentAnythingInvocation"] | components["schemas"]["ShowImageInvocation"] | components["schemas"]["SpandrelImageToImageAutoscaleInvocation"] | components["schemas"]["SpandrelImageToImageInvocation"] | components["schemas"]["StringBatchInvocation"] | components["schemas"]["StringCollectionInvocation"] | components["schemas"]["StringGenerator"] | components["schemas"]["StringInvocation"] | components["schemas"]["StringJoinInvocation"] | components["schemas"]["StringJoinThreeInvocation"] | components["schemas"]["StringReplaceInvocation"] | components["schemas"]["StringSplitInvocation"] | components["schemas"]["StringSplitNegInvocation"] | components["schemas"]["SubtractInvocation"] | components["schemas"]["T2IAdapterInvocation"] | components["schemas"]["TextLLMInvocation"] | components["schemas"]["TileToPropertiesInvocation"] | components["schemas"]["TiledMultiDiffusionDenoiseLatents"] | components["schemas"]["UnsharpMaskInvocation"] | components["schemas"]["VAELoaderInvocation"] | components["schemas"]["ZImageControlInvocation"] | components["schemas"]["ZImageDenoiseInvocation"] | components["schemas"]["ZImageDenoiseMetaInvocation"] | components["schemas"]["ZImageImageToLatentsInvocation"] | components["schemas"]["ZImageLatentsToImageInvocation"] | components["schemas"]["ZImageLoRACollectionLoader"] | components["schemas"]["ZImageLoRALoaderInvocation"] | components["schemas"]["ZImageModelLoaderInvocation"] | 
components["schemas"]["ZImagePiDDecodeInvocation"] | components["schemas"]["ZImageSeedVarianceEnhancerInvocation"] | components["schemas"]["ZImageTextEncoderInvocation"];
/**
* Invocation Source Id
* @description The ID of the prepared invocation's source node
@@ -16251,7 +16572,7 @@ export type components = {
* Invocation
* @description The ID of the invocation
*/
- invocation: components["schemas"]["AddInvocation"] | components["schemas"]["AlibabaCloudImageGenerationInvocation"] | components["schemas"]["AlphaMaskToTensorInvocation"] | components["schemas"]["AnimaDenoiseInvocation"] | components["schemas"]["AnimaImageToLatentsInvocation"] | components["schemas"]["AnimaLatentsToImageInvocation"] | components["schemas"]["AnimaLoRACollectionLoader"] | components["schemas"]["AnimaLoRALoaderInvocation"] | components["schemas"]["AnimaModelLoaderInvocation"] | components["schemas"]["AnimaTextEncoderInvocation"] | components["schemas"]["ApplyMaskTensorToImageInvocation"] | components["schemas"]["ApplyMaskToImageInvocation"] | components["schemas"]["BlankImageInvocation"] | components["schemas"]["BlendLatentsInvocation"] | components["schemas"]["BooleanCollectionInvocation"] | components["schemas"]["BooleanInvocation"] | components["schemas"]["BoundingBoxInvocation"] | components["schemas"]["CLIPSkipInvocation"] | components["schemas"]["CV2InfillInvocation"] | components["schemas"]["CalculateImageTilesEvenSplitInvocation"] | components["schemas"]["CalculateImageTilesInvocation"] | components["schemas"]["CalculateImageTilesMinimumOverlapInvocation"] | components["schemas"]["CannyEdgeDetectionInvocation"] | components["schemas"]["CanvasOutputInvocation"] | components["schemas"]["CanvasPasteBackInvocation"] | components["schemas"]["CanvasV2MaskAndCropInvocation"] | components["schemas"]["CenterPadCropInvocation"] | components["schemas"]["CogView4DenoiseInvocation"] | components["schemas"]["CogView4ImageToLatentsInvocation"] | components["schemas"]["CogView4LatentsToImageInvocation"] | components["schemas"]["CogView4ModelLoaderInvocation"] | components["schemas"]["CogView4TextEncoderInvocation"] | components["schemas"]["CollectInvocation"] | components["schemas"]["ColorCorrectInvocation"] | components["schemas"]["ColorInvocation"] | components["schemas"]["ColorMapInvocation"] | components["schemas"]["CompelInvocation"] | 
components["schemas"]["ConditioningCollectionInvocation"] | components["schemas"]["ConditioningInvocation"] | components["schemas"]["ContentShuffleInvocation"] | components["schemas"]["ControlNetInvocation"] | components["schemas"]["CoreMetadataInvocation"] | components["schemas"]["CreateDenoiseMaskInvocation"] | components["schemas"]["CreateGradientMaskInvocation"] | components["schemas"]["CropImageToBoundingBoxInvocation"] | components["schemas"]["CropLatentsCoreInvocation"] | components["schemas"]["CvInpaintInvocation"] | components["schemas"]["DWOpenposeDetectionInvocation"] | components["schemas"]["DecodeInvisibleWatermarkInvocation"] | components["schemas"]["DenoiseLatentsInvocation"] | components["schemas"]["DenoiseLatentsMetaInvocation"] | components["schemas"]["DepthAnythingDepthEstimationInvocation"] | components["schemas"]["DivideInvocation"] | components["schemas"]["DynamicPromptInvocation"] | components["schemas"]["ESRGANInvocation"] | components["schemas"]["ExpandMaskWithFadeInvocation"] | components["schemas"]["FLUXLoRACollectionLoader"] | components["schemas"]["FaceIdentifierInvocation"] | components["schemas"]["FaceMaskInvocation"] | components["schemas"]["FaceOffInvocation"] | components["schemas"]["FloatBatchInvocation"] | components["schemas"]["FloatCollectionInvocation"] | components["schemas"]["FloatGenerator"] | components["schemas"]["FloatInvocation"] | components["schemas"]["FloatLinearRangeInvocation"] | components["schemas"]["FloatMathInvocation"] | components["schemas"]["FloatToIntegerInvocation"] | components["schemas"]["Flux2DenoiseInvocation"] | components["schemas"]["Flux2KleinLoRACollectionLoader"] | components["schemas"]["Flux2KleinLoRALoaderInvocation"] | components["schemas"]["Flux2KleinModelLoaderInvocation"] | components["schemas"]["Flux2KleinTextEncoderInvocation"] | components["schemas"]["Flux2VaeDecodeInvocation"] | components["schemas"]["Flux2VaeEncodeInvocation"] | components["schemas"]["FluxControlLoRALoaderInvocation"] | 
components["schemas"]["FluxControlNetInvocation"] | components["schemas"]["FluxDenoiseInvocation"] | components["schemas"]["FluxDenoiseLatentsMetaInvocation"] | components["schemas"]["FluxFillInvocation"] | components["schemas"]["FluxIPAdapterInvocation"] | components["schemas"]["FluxKontextConcatenateImagesInvocation"] | components["schemas"]["FluxKontextInvocation"] | components["schemas"]["FluxLoRALoaderInvocation"] | components["schemas"]["FluxModelLoaderInvocation"] | components["schemas"]["FluxReduxInvocation"] | components["schemas"]["FluxTextEncoderInvocation"] | components["schemas"]["FluxVaeDecodeInvocation"] | components["schemas"]["FluxVaeEncodeInvocation"] | components["schemas"]["FreeUInvocation"] | components["schemas"]["GeminiImageGenerationInvocation"] | components["schemas"]["GetMaskBoundingBoxInvocation"] | components["schemas"]["GroundingDinoInvocation"] | components["schemas"]["HEDEdgeDetectionInvocation"] | components["schemas"]["HeuristicResizeInvocation"] | components["schemas"]["IPAdapterInvocation"] | components["schemas"]["IdealSizeInvocation"] | components["schemas"]["IfInvocation"] | components["schemas"]["ImageBatchInvocation"] | components["schemas"]["ImageBlurInvocation"] | components["schemas"]["ImageChannelInvocation"] | components["schemas"]["ImageChannelMultiplyInvocation"] | components["schemas"]["ImageChannelOffsetInvocation"] | components["schemas"]["ImageCollectionInvocation"] | components["schemas"]["ImageConvertInvocation"] | components["schemas"]["ImageCropInvocation"] | components["schemas"]["ImageGenerator"] | components["schemas"]["ImageHueAdjustmentInvocation"] | components["schemas"]["ImageInverseLerpInvocation"] | components["schemas"]["ImageInvocation"] | components["schemas"]["ImageLerpInvocation"] | components["schemas"]["ImageMaskToTensorInvocation"] | components["schemas"]["ImageMultiplyInvocation"] | components["schemas"]["ImageNSFWBlurInvocation"] | components["schemas"]["ImageNoiseInvocation"] | 
components["schemas"]["ImagePanelLayoutInvocation"] | components["schemas"]["ImagePasteInvocation"] | components["schemas"]["ImageResizeInvocation"] | components["schemas"]["ImageScaleInvocation"] | components["schemas"]["ImageToLatentsInvocation"] | components["schemas"]["ImageWatermarkInvocation"] | components["schemas"]["InfillColorInvocation"] | components["schemas"]["InfillPatchMatchInvocation"] | components["schemas"]["InfillTileInvocation"] | components["schemas"]["IntegerBatchInvocation"] | components["schemas"]["IntegerCollectionInvocation"] | components["schemas"]["IntegerGenerator"] | components["schemas"]["IntegerInvocation"] | components["schemas"]["IntegerMathInvocation"] | components["schemas"]["InvertTensorMaskInvocation"] | components["schemas"]["InvokeAdjustImageHuePlusInvocation"] | components["schemas"]["InvokeEquivalentAchromaticLightnessInvocation"] | components["schemas"]["InvokeImageBlendInvocation"] | components["schemas"]["InvokeImageCompositorInvocation"] | components["schemas"]["InvokeImageDilateOrErodeInvocation"] | components["schemas"]["InvokeImageEnhanceInvocation"] | components["schemas"]["InvokeImageValueThresholdsInvocation"] | components["schemas"]["IterateInvocation"] | components["schemas"]["LaMaInfillInvocation"] | components["schemas"]["LatentsCollectionInvocation"] | components["schemas"]["LatentsInvocation"] | components["schemas"]["LatentsToImageInvocation"] | components["schemas"]["LineartAnimeEdgeDetectionInvocation"] | components["schemas"]["LineartEdgeDetectionInvocation"] | components["schemas"]["LlavaOnevisionVllmInvocation"] | components["schemas"]["LoRACollectionLoader"] | components["schemas"]["LoRALoaderInvocation"] | components["schemas"]["LoRASelectorInvocation"] | components["schemas"]["MLSDDetectionInvocation"] | components["schemas"]["MainModelLoaderInvocation"] | components["schemas"]["MaskCombineInvocation"] | components["schemas"]["MaskEdgeInvocation"] | components["schemas"]["MaskFromAlphaInvocation"] | 
components["schemas"]["MaskFromIDInvocation"] | components["schemas"]["MaskTensorToImageInvocation"] | components["schemas"]["MediaPipeFaceDetectionInvocation"] | components["schemas"]["MergeMetadataInvocation"] | components["schemas"]["MergeTilesToImageInvocation"] | components["schemas"]["MetadataFieldExtractorInvocation"] | components["schemas"]["MetadataFromImageInvocation"] | components["schemas"]["MetadataInvocation"] | components["schemas"]["MetadataItemInvocation"] | components["schemas"]["MetadataItemLinkedInvocation"] | components["schemas"]["MetadataToBoolCollectionInvocation"] | components["schemas"]["MetadataToBoolInvocation"] | components["schemas"]["MetadataToControlnetsInvocation"] | components["schemas"]["MetadataToFloatCollectionInvocation"] | components["schemas"]["MetadataToFloatInvocation"] | components["schemas"]["MetadataToIPAdaptersInvocation"] | components["schemas"]["MetadataToIntegerCollectionInvocation"] | components["schemas"]["MetadataToIntegerInvocation"] | components["schemas"]["MetadataToLorasCollectionInvocation"] | components["schemas"]["MetadataToLorasInvocation"] | components["schemas"]["MetadataToModelInvocation"] | components["schemas"]["MetadataToSDXLLorasInvocation"] | components["schemas"]["MetadataToSDXLModelInvocation"] | components["schemas"]["MetadataToSchedulerInvocation"] | components["schemas"]["MetadataToStringCollectionInvocation"] | components["schemas"]["MetadataToStringInvocation"] | components["schemas"]["MetadataToT2IAdaptersInvocation"] | components["schemas"]["MetadataToVAEInvocation"] | components["schemas"]["ModelIdentifierInvocation"] | components["schemas"]["MultiplyInvocation"] | components["schemas"]["NoiseInvocation"] | components["schemas"]["NormalMapInvocation"] | components["schemas"]["OklabUnsharpMaskInvocation"] | components["schemas"]["OklchImageHueAdjustmentInvocation"] | components["schemas"]["OpenAIImageGenerationInvocation"] | components["schemas"]["PBRMapsInvocation"] | 
components["schemas"]["PairTileImageInvocation"] | components["schemas"]["PasteImageIntoBoundingBoxInvocation"] | components["schemas"]["PiDiNetEdgeDetectionInvocation"] | components["schemas"]["PromptTemplateInvocation"] | components["schemas"]["PromptsFromFileInvocation"] | components["schemas"]["QwenImageDenoiseInvocation"] | components["schemas"]["QwenImageImageToLatentsInvocation"] | components["schemas"]["QwenImageLatentsToImageInvocation"] | components["schemas"]["QwenImageLoRACollectionLoader"] | components["schemas"]["QwenImageLoRALoaderInvocation"] | components["schemas"]["QwenImageModelLoaderInvocation"] | components["schemas"]["QwenImageTextEncoderInvocation"] | components["schemas"]["RandomFloatInvocation"] | components["schemas"]["RandomIntInvocation"] | components["schemas"]["RandomRangeInvocation"] | components["schemas"]["RangeInvocation"] | components["schemas"]["RangeOfSizeInvocation"] | components["schemas"]["RectangleMaskInvocation"] | components["schemas"]["ResizeLatentsInvocation"] | components["schemas"]["RoundInvocation"] | components["schemas"]["SD3DenoiseInvocation"] | components["schemas"]["SD3ImageToLatentsInvocation"] | components["schemas"]["SD3LatentsToImageInvocation"] | components["schemas"]["SDXLCompelPromptInvocation"] | components["schemas"]["SDXLLoRACollectionLoader"] | components["schemas"]["SDXLLoRALoaderInvocation"] | components["schemas"]["SDXLModelLoaderInvocation"] | components["schemas"]["SDXLRefinerCompelPromptInvocation"] | components["schemas"]["SDXLRefinerModelLoaderInvocation"] | components["schemas"]["SaveImageInvocation"] | components["schemas"]["SaveImageToFileInvocation"] | components["schemas"]["ScaleLatentsInvocation"] | components["schemas"]["SchedulerInvocation"] | components["schemas"]["Sd3ModelLoaderInvocation"] | components["schemas"]["Sd3TextEncoderInvocation"] | components["schemas"]["SeamlessModeInvocation"] | components["schemas"]["SeedreamImageGenerationInvocation"] | 
components["schemas"]["SegmentAnythingInvocation"] | components["schemas"]["ShowImageInvocation"] | components["schemas"]["SpandrelImageToImageAutoscaleInvocation"] | components["schemas"]["SpandrelImageToImageInvocation"] | components["schemas"]["StringBatchInvocation"] | components["schemas"]["StringCollectionInvocation"] | components["schemas"]["StringGenerator"] | components["schemas"]["StringInvocation"] | components["schemas"]["StringJoinInvocation"] | components["schemas"]["StringJoinThreeInvocation"] | components["schemas"]["StringReplaceInvocation"] | components["schemas"]["StringSplitInvocation"] | components["schemas"]["StringSplitNegInvocation"] | components["schemas"]["SubtractInvocation"] | components["schemas"]["T2IAdapterInvocation"] | components["schemas"]["TextLLMInvocation"] | components["schemas"]["TileToPropertiesInvocation"] | components["schemas"]["TiledMultiDiffusionDenoiseLatents"] | components["schemas"]["UnsharpMaskInvocation"] | components["schemas"]["VAELoaderInvocation"] | components["schemas"]["ZImageControlInvocation"] | components["schemas"]["ZImageDenoiseInvocation"] | components["schemas"]["ZImageDenoiseMetaInvocation"] | components["schemas"]["ZImageImageToLatentsInvocation"] | components["schemas"]["ZImageLatentsToImageInvocation"] | components["schemas"]["ZImageLoRACollectionLoader"] | components["schemas"]["ZImageLoRALoaderInvocation"] | components["schemas"]["ZImageModelLoaderInvocation"] | components["schemas"]["ZImageSeedVarianceEnhancerInvocation"] | components["schemas"]["ZImageTextEncoderInvocation"];
+ invocation: components["schemas"]["AddInvocation"] | components["schemas"]["AlibabaCloudImageGenerationInvocation"] | components["schemas"]["AlphaMaskToTensorInvocation"] | components["schemas"]["AnimaDenoiseInvocation"] | components["schemas"]["AnimaImageToLatentsInvocation"] | components["schemas"]["AnimaLatentsToImageInvocation"] | components["schemas"]["AnimaLoRACollectionLoader"] | components["schemas"]["AnimaLoRALoaderInvocation"] | components["schemas"]["AnimaModelLoaderInvocation"] | components["schemas"]["AnimaTextEncoderInvocation"] | components["schemas"]["ApplyMaskTensorToImageInvocation"] | components["schemas"]["ApplyMaskToImageInvocation"] | components["schemas"]["BlankImageInvocation"] | components["schemas"]["BlendLatentsInvocation"] | components["schemas"]["BooleanCollectionInvocation"] | components["schemas"]["BooleanInvocation"] | components["schemas"]["BoundingBoxInvocation"] | components["schemas"]["CLIPSkipInvocation"] | components["schemas"]["CV2InfillInvocation"] | components["schemas"]["CalculateImageTilesEvenSplitInvocation"] | components["schemas"]["CalculateImageTilesInvocation"] | components["schemas"]["CalculateImageTilesMinimumOverlapInvocation"] | components["schemas"]["CannyEdgeDetectionInvocation"] | components["schemas"]["CanvasOutputInvocation"] | components["schemas"]["CanvasPasteBackInvocation"] | components["schemas"]["CanvasV2MaskAndCropInvocation"] | components["schemas"]["CenterPadCropInvocation"] | components["schemas"]["CogView4DenoiseInvocation"] | components["schemas"]["CogView4ImageToLatentsInvocation"] | components["schemas"]["CogView4LatentsToImageInvocation"] | components["schemas"]["CogView4ModelLoaderInvocation"] | components["schemas"]["CogView4TextEncoderInvocation"] | components["schemas"]["CollectInvocation"] | components["schemas"]["ColorCorrectInvocation"] | components["schemas"]["ColorInvocation"] | components["schemas"]["ColorMapInvocation"] | components["schemas"]["CompelInvocation"] | 
components["schemas"]["ConditioningCollectionInvocation"] | components["schemas"]["ConditioningInvocation"] | components["schemas"]["ContentShuffleInvocation"] | components["schemas"]["ControlNetInvocation"] | components["schemas"]["CoreMetadataInvocation"] | components["schemas"]["CreateDenoiseMaskInvocation"] | components["schemas"]["CreateGradientMaskInvocation"] | components["schemas"]["CropImageToBoundingBoxInvocation"] | components["schemas"]["CropLatentsCoreInvocation"] | components["schemas"]["CvInpaintInvocation"] | components["schemas"]["DWOpenposeDetectionInvocation"] | components["schemas"]["DecodeInvisibleWatermarkInvocation"] | components["schemas"]["DenoiseLatentsInvocation"] | components["schemas"]["DenoiseLatentsMetaInvocation"] | components["schemas"]["DepthAnythingDepthEstimationInvocation"] | components["schemas"]["DivideInvocation"] | components["schemas"]["DynamicPromptInvocation"] | components["schemas"]["ESRGANInvocation"] | components["schemas"]["ExpandMaskWithFadeInvocation"] | components["schemas"]["FLUXLoRACollectionLoader"] | components["schemas"]["FaceIdentifierInvocation"] | components["schemas"]["FaceMaskInvocation"] | components["schemas"]["FaceOffInvocation"] | components["schemas"]["FloatBatchInvocation"] | components["schemas"]["FloatCollectionInvocation"] | components["schemas"]["FloatGenerator"] | components["schemas"]["FloatInvocation"] | components["schemas"]["FloatLinearRangeInvocation"] | components["schemas"]["FloatMathInvocation"] | components["schemas"]["FloatToIntegerInvocation"] | components["schemas"]["Flux2DenoiseInvocation"] | components["schemas"]["Flux2KleinLoRACollectionLoader"] | components["schemas"]["Flux2KleinLoRALoaderInvocation"] | components["schemas"]["Flux2KleinModelLoaderInvocation"] | components["schemas"]["Flux2KleinTextEncoderInvocation"] | components["schemas"]["Flux2PiDDecodeInvocation"] | components["schemas"]["Flux2VaeDecodeInvocation"] | components["schemas"]["Flux2VaeEncodeInvocation"] | 
components["schemas"]["FluxControlLoRALoaderInvocation"] | components["schemas"]["FluxControlNetInvocation"] | components["schemas"]["FluxDenoiseInvocation"] | components["schemas"]["FluxDenoiseLatentsMetaInvocation"] | components["schemas"]["FluxFillInvocation"] | components["schemas"]["FluxIPAdapterInvocation"] | components["schemas"]["FluxKontextConcatenateImagesInvocation"] | components["schemas"]["FluxKontextInvocation"] | components["schemas"]["FluxLoRALoaderInvocation"] | components["schemas"]["FluxModelLoaderInvocation"] | components["schemas"]["FluxPiDDecodeInvocation"] | components["schemas"]["FluxReduxInvocation"] | components["schemas"]["FluxTextEncoderInvocation"] | components["schemas"]["FluxVaeDecodeInvocation"] | components["schemas"]["FluxVaeEncodeInvocation"] | components["schemas"]["FreeUInvocation"] | components["schemas"]["GeminiImageGenerationInvocation"] | components["schemas"]["Gemma2EncoderLoaderInvocation"] | components["schemas"]["GetMaskBoundingBoxInvocation"] | components["schemas"]["GroundingDinoInvocation"] | components["schemas"]["HEDEdgeDetectionInvocation"] | components["schemas"]["HeuristicResizeInvocation"] | components["schemas"]["IPAdapterInvocation"] | components["schemas"]["IdealSizeInvocation"] | components["schemas"]["IfInvocation"] | components["schemas"]["ImageBatchInvocation"] | components["schemas"]["ImageBlurInvocation"] | components["schemas"]["ImageChannelInvocation"] | components["schemas"]["ImageChannelMultiplyInvocation"] | components["schemas"]["ImageChannelOffsetInvocation"] | components["schemas"]["ImageCollectionInvocation"] | components["schemas"]["ImageConvertInvocation"] | components["schemas"]["ImageCropInvocation"] | components["schemas"]["ImageGenerator"] | components["schemas"]["ImageHueAdjustmentInvocation"] | components["schemas"]["ImageInverseLerpInvocation"] | components["schemas"]["ImageInvocation"] | components["schemas"]["ImageLerpInvocation"] | 
components["schemas"]["ImageMaskToTensorInvocation"] | components["schemas"]["ImageMultiplyInvocation"] | components["schemas"]["ImageNSFWBlurInvocation"] | components["schemas"]["ImageNoiseInvocation"] | components["schemas"]["ImagePanelLayoutInvocation"] | components["schemas"]["ImagePasteInvocation"] | components["schemas"]["ImageResizeInvocation"] | components["schemas"]["ImageScaleInvocation"] | components["schemas"]["ImageToLatentsInvocation"] | components["schemas"]["ImageWatermarkInvocation"] | components["schemas"]["InfillColorInvocation"] | components["schemas"]["InfillPatchMatchInvocation"] | components["schemas"]["InfillTileInvocation"] | components["schemas"]["IntegerBatchInvocation"] | components["schemas"]["IntegerCollectionInvocation"] | components["schemas"]["IntegerGenerator"] | components["schemas"]["IntegerInvocation"] | components["schemas"]["IntegerMathInvocation"] | components["schemas"]["InvertTensorMaskInvocation"] | components["schemas"]["InvokeAdjustImageHuePlusInvocation"] | components["schemas"]["InvokeEquivalentAchromaticLightnessInvocation"] | components["schemas"]["InvokeImageBlendInvocation"] | components["schemas"]["InvokeImageCompositorInvocation"] | components["schemas"]["InvokeImageDilateOrErodeInvocation"] | components["schemas"]["InvokeImageEnhanceInvocation"] | components["schemas"]["InvokeImageValueThresholdsInvocation"] | components["schemas"]["IterateInvocation"] | components["schemas"]["LaMaInfillInvocation"] | components["schemas"]["LatentsCollectionInvocation"] | components["schemas"]["LatentsInvocation"] | components["schemas"]["LatentsToImageInvocation"] | components["schemas"]["LineartAnimeEdgeDetectionInvocation"] | components["schemas"]["LineartEdgeDetectionInvocation"] | components["schemas"]["LlavaOnevisionVllmInvocation"] | components["schemas"]["LoRACollectionLoader"] | components["schemas"]["LoRALoaderInvocation"] | components["schemas"]["LoRASelectorInvocation"] | 
components["schemas"]["MLSDDetectionInvocation"] | components["schemas"]["MainModelLoaderInvocation"] | components["schemas"]["MaskCombineInvocation"] | components["schemas"]["MaskEdgeInvocation"] | components["schemas"]["MaskFromAlphaInvocation"] | components["schemas"]["MaskFromIDInvocation"] | components["schemas"]["MaskTensorToImageInvocation"] | components["schemas"]["MediaPipeFaceDetectionInvocation"] | components["schemas"]["MergeMetadataInvocation"] | components["schemas"]["MergeTilesToImageInvocation"] | components["schemas"]["MetadataFieldExtractorInvocation"] | components["schemas"]["MetadataFromImageInvocation"] | components["schemas"]["MetadataInvocation"] | components["schemas"]["MetadataItemInvocation"] | components["schemas"]["MetadataItemLinkedInvocation"] | components["schemas"]["MetadataToBoolCollectionInvocation"] | components["schemas"]["MetadataToBoolInvocation"] | components["schemas"]["MetadataToControlnetsInvocation"] | components["schemas"]["MetadataToFloatCollectionInvocation"] | components["schemas"]["MetadataToFloatInvocation"] | components["schemas"]["MetadataToIPAdaptersInvocation"] | components["schemas"]["MetadataToIntegerCollectionInvocation"] | components["schemas"]["MetadataToIntegerInvocation"] | components["schemas"]["MetadataToLorasCollectionInvocation"] | components["schemas"]["MetadataToLorasInvocation"] | components["schemas"]["MetadataToModelInvocation"] | components["schemas"]["MetadataToSDXLLorasInvocation"] | components["schemas"]["MetadataToSDXLModelInvocation"] | components["schemas"]["MetadataToSchedulerInvocation"] | components["schemas"]["MetadataToStringCollectionInvocation"] | components["schemas"]["MetadataToStringInvocation"] | components["schemas"]["MetadataToT2IAdaptersInvocation"] | components["schemas"]["MetadataToVAEInvocation"] | components["schemas"]["ModelIdentifierInvocation"] | components["schemas"]["MultiplyInvocation"] | components["schemas"]["NoiseInvocation"] | 
components["schemas"]["NormalMapInvocation"] | components["schemas"]["OklabUnsharpMaskInvocation"] | components["schemas"]["OklchImageHueAdjustmentInvocation"] | components["schemas"]["OpenAIImageGenerationInvocation"] | components["schemas"]["PBRMapsInvocation"] | components["schemas"]["PairTileImageInvocation"] | components["schemas"]["PasteImageIntoBoundingBoxInvocation"] | components["schemas"]["PiDDecoderLoaderInvocation"] | components["schemas"]["PiDUpscaleInvocation"] | components["schemas"]["PiDiNetEdgeDetectionInvocation"] | components["schemas"]["PromptTemplateInvocation"] | components["schemas"]["PromptsFromFileInvocation"] | components["schemas"]["QwenImageDenoiseInvocation"] | components["schemas"]["QwenImageImageToLatentsInvocation"] | components["schemas"]["QwenImageLatentsToImageInvocation"] | components["schemas"]["QwenImageLoRACollectionLoader"] | components["schemas"]["QwenImageLoRALoaderInvocation"] | components["schemas"]["QwenImageModelLoaderInvocation"] | components["schemas"]["QwenImagePiDDecodeInvocation"] | components["schemas"]["QwenImageTextEncoderInvocation"] | components["schemas"]["RandomFloatInvocation"] | components["schemas"]["RandomIntInvocation"] | components["schemas"]["RandomRangeInvocation"] | components["schemas"]["RangeInvocation"] | components["schemas"]["RangeOfSizeInvocation"] | components["schemas"]["RectangleMaskInvocation"] | components["schemas"]["ResizeLatentsInvocation"] | components["schemas"]["RoundInvocation"] | components["schemas"]["SD3DenoiseInvocation"] | components["schemas"]["SD3ImageToLatentsInvocation"] | components["schemas"]["SD3LatentsToImageInvocation"] | components["schemas"]["SD3PiDDecodeInvocation"] | components["schemas"]["SDXLCompelPromptInvocation"] | components["schemas"]["SDXLLoRACollectionLoader"] | components["schemas"]["SDXLLoRALoaderInvocation"] | components["schemas"]["SDXLModelLoaderInvocation"] | components["schemas"]["SDXLPiDDecodeInvocation"] | 
components["schemas"]["SDXLRefinerCompelPromptInvocation"] | components["schemas"]["SDXLRefinerModelLoaderInvocation"] | components["schemas"]["SaveImageInvocation"] | components["schemas"]["SaveImageToFileInvocation"] | components["schemas"]["ScaleLatentsInvocation"] | components["schemas"]["SchedulerInvocation"] | components["schemas"]["Sd3ModelLoaderInvocation"] | components["schemas"]["Sd3TextEncoderInvocation"] | components["schemas"]["SeamlessModeInvocation"] | components["schemas"]["SeedreamImageGenerationInvocation"] | components["schemas"]["SegmentAnythingInvocation"] | components["schemas"]["ShowImageInvocation"] | components["schemas"]["SpandrelImageToImageAutoscaleInvocation"] | components["schemas"]["SpandrelImageToImageInvocation"] | components["schemas"]["StringBatchInvocation"] | components["schemas"]["StringCollectionInvocation"] | components["schemas"]["StringGenerator"] | components["schemas"]["StringInvocation"] | components["schemas"]["StringJoinInvocation"] | components["schemas"]["StringJoinThreeInvocation"] | components["schemas"]["StringReplaceInvocation"] | components["schemas"]["StringSplitInvocation"] | components["schemas"]["StringSplitNegInvocation"] | components["schemas"]["SubtractInvocation"] | components["schemas"]["T2IAdapterInvocation"] | components["schemas"]["TextLLMInvocation"] | components["schemas"]["TileToPropertiesInvocation"] | components["schemas"]["TiledMultiDiffusionDenoiseLatents"] | components["schemas"]["UnsharpMaskInvocation"] | components["schemas"]["VAELoaderInvocation"] | components["schemas"]["ZImageControlInvocation"] | components["schemas"]["ZImageDenoiseInvocation"] | components["schemas"]["ZImageDenoiseMetaInvocation"] | components["schemas"]["ZImageImageToLatentsInvocation"] | components["schemas"]["ZImageLatentsToImageInvocation"] | components["schemas"]["ZImageLoRACollectionLoader"] | components["schemas"]["ZImageLoRALoaderInvocation"] | components["schemas"]["ZImageModelLoaderInvocation"] | 
components["schemas"]["ZImagePiDDecodeInvocation"] | components["schemas"]["ZImageSeedVarianceEnhancerInvocation"] | components["schemas"]["ZImageTextEncoderInvocation"];
/**
* Invocation Source Id
* @description The ID of the prepared invocation's source node
@@ -23465,7 +23786,7 @@ export type components = {
* @description Storage format of model.
* @enum {string}
*/
- ModelFormat: "omi" | "diffusers" | "checkpoint" | "lycoris" | "onnx" | "olive" | "embedding_file" | "embedding_folder" | "invokeai" | "t5_encoder" | "qwen3_encoder" | "qwen_vl_encoder" | "bnb_quantized_int8b" | "bnb_quantized_nf4b" | "gguf_quantized" | "external_api" | "unknown";
+ ModelFormat: "omi" | "diffusers" | "checkpoint" | "lycoris" | "onnx" | "olive" | "embedding_file" | "embedding_folder" | "invokeai" | "t5_encoder" | "qwen3_encoder" | "qwen_vl_encoder" | "gemma2_encoder" | "bnb_quantized_int8b" | "bnb_quantized_nf4b" | "gguf_quantized" | "external_api" | "unknown";
/** ModelIdentifierField */
ModelIdentifierField: {
/**
@@ -23602,7 +23923,7 @@ export type components = {
* Config
* @description The installed model's config
*/
- config: components["schemas"]["Main_Diffusers_SD1_Config"] | components["schemas"]["Main_Diffusers_SD2_Config"] | components["schemas"]["Main_Diffusers_SDXL_Config"] | components["schemas"]["Main_Diffusers_SDXLRefiner_Config"] | components["schemas"]["Main_Diffusers_SD3_Config"] | components["schemas"]["Main_Diffusers_FLUX_Config"] | components["schemas"]["Main_Diffusers_Flux2_Config"] | components["schemas"]["Main_Diffusers_CogView4_Config"] | components["schemas"]["Main_Diffusers_QwenImage_Config"] | components["schemas"]["Main_Diffusers_ZImage_Config"] | components["schemas"]["Main_Checkpoint_SD1_Config"] | components["schemas"]["Main_Checkpoint_SD2_Config"] | components["schemas"]["Main_Checkpoint_SDXL_Config"] | components["schemas"]["Main_Checkpoint_SDXLRefiner_Config"] | components["schemas"]["Main_Checkpoint_Flux2_Config"] | components["schemas"]["Main_Checkpoint_FLUX_Config"] | components["schemas"]["Main_Checkpoint_QwenImage_Config"] | components["schemas"]["Main_Checkpoint_ZImage_Config"] | components["schemas"]["Main_Checkpoint_Anima_Config"] | components["schemas"]["Main_BnBNF4_FLUX_Config"] | components["schemas"]["Main_GGUF_Flux2_Config"] | components["schemas"]["Main_GGUF_FLUX_Config"] | components["schemas"]["Main_GGUF_QwenImage_Config"] | components["schemas"]["Main_GGUF_ZImage_Config"] | components["schemas"]["VAE_Checkpoint_SD1_Config"] | components["schemas"]["VAE_Checkpoint_SD2_Config"] | components["schemas"]["VAE_Checkpoint_SDXL_Config"] | components["schemas"]["VAE_Checkpoint_FLUX_Config"] | components["schemas"]["VAE_Checkpoint_Flux2_Config"] | components["schemas"]["VAE_Checkpoint_QwenImage_Config"] | components["schemas"]["VAE_Checkpoint_Anima_Config"] | components["schemas"]["VAE_Diffusers_SD1_Config"] | components["schemas"]["VAE_Diffusers_SDXL_Config"] | components["schemas"]["VAE_Diffusers_Flux2_Config"] | components["schemas"]["ControlNet_Checkpoint_SD1_Config"] | components["schemas"]["ControlNet_Checkpoint_SD2_Config"] | 
components["schemas"]["ControlNet_Checkpoint_SDXL_Config"] | components["schemas"]["ControlNet_Checkpoint_FLUX_Config"] | components["schemas"]["ControlNet_Checkpoint_ZImage_Config"] | components["schemas"]["ControlNet_Diffusers_SD1_Config"] | components["schemas"]["ControlNet_Diffusers_SD2_Config"] | components["schemas"]["ControlNet_Diffusers_SDXL_Config"] | components["schemas"]["ControlNet_Diffusers_FLUX_Config"] | components["schemas"]["LoRA_LyCORIS_SD1_Config"] | components["schemas"]["LoRA_LyCORIS_SD2_Config"] | components["schemas"]["LoRA_LyCORIS_SDXL_Config"] | components["schemas"]["LoRA_LyCORIS_Flux2_Config"] | components["schemas"]["LoRA_LyCORIS_FLUX_Config"] | components["schemas"]["LoRA_LyCORIS_ZImage_Config"] | components["schemas"]["LoRA_LyCORIS_QwenImage_Config"] | components["schemas"]["LoRA_LyCORIS_Anima_Config"] | components["schemas"]["LoRA_OMI_SDXL_Config"] | components["schemas"]["LoRA_OMI_FLUX_Config"] | components["schemas"]["LoRA_Diffusers_SD1_Config"] | components["schemas"]["LoRA_Diffusers_SD2_Config"] | components["schemas"]["LoRA_Diffusers_SDXL_Config"] | components["schemas"]["LoRA_Diffusers_Flux2_Config"] | components["schemas"]["LoRA_Diffusers_FLUX_Config"] | components["schemas"]["LoRA_Diffusers_ZImage_Config"] | components["schemas"]["ControlLoRA_LyCORIS_FLUX_Config"] | components["schemas"]["T5Encoder_T5Encoder_Config"] | components["schemas"]["T5Encoder_BnBLLMint8_Config"] | components["schemas"]["Qwen3Encoder_Qwen3Encoder_Config"] | components["schemas"]["Qwen3Encoder_Checkpoint_Config"] | components["schemas"]["Qwen3Encoder_GGUF_Config"] | components["schemas"]["QwenVLEncoder_Diffusers_Config"] | components["schemas"]["QwenVLEncoder_Checkpoint_Config"] | components["schemas"]["TI_File_SD1_Config"] | components["schemas"]["TI_File_SD2_Config"] | components["schemas"]["TI_File_SDXL_Config"] | components["schemas"]["TI_Folder_SD1_Config"] | components["schemas"]["TI_Folder_SD2_Config"] | 
components["schemas"]["TI_Folder_SDXL_Config"] | components["schemas"]["IPAdapter_InvokeAI_SD1_Config"] | components["schemas"]["IPAdapter_InvokeAI_SD2_Config"] | components["schemas"]["IPAdapter_InvokeAI_SDXL_Config"] | components["schemas"]["IPAdapter_Checkpoint_SD1_Config"] | components["schemas"]["IPAdapter_Checkpoint_SD2_Config"] | components["schemas"]["IPAdapter_Checkpoint_SDXL_Config"] | components["schemas"]["IPAdapter_Checkpoint_FLUX_Config"] | components["schemas"]["T2IAdapter_Diffusers_SD1_Config"] | components["schemas"]["T2IAdapter_Diffusers_SDXL_Config"] | components["schemas"]["Spandrel_Checkpoint_Config"] | components["schemas"]["CLIPEmbed_Diffusers_G_Config"] | components["schemas"]["CLIPEmbed_Diffusers_L_Config"] | components["schemas"]["CLIPVision_Diffusers_Config"] | components["schemas"]["SigLIP_Diffusers_Config"] | components["schemas"]["FLUXRedux_Checkpoint_Config"] | components["schemas"]["LlavaOnevision_Diffusers_Config"] | components["schemas"]["TextLLM_Diffusers_Config"] | components["schemas"]["ExternalApiModelConfig"] | components["schemas"]["Unknown_Config"];
+ config: components["schemas"]["Main_Diffusers_SD1_Config"] | components["schemas"]["Main_Diffusers_SD2_Config"] | components["schemas"]["Main_Diffusers_SDXL_Config"] | components["schemas"]["Main_Diffusers_SDXLRefiner_Config"] | components["schemas"]["Main_Diffusers_SD3_Config"] | components["schemas"]["Main_Diffusers_FLUX_Config"] | components["schemas"]["Main_Diffusers_Flux2_Config"] | components["schemas"]["Main_Diffusers_CogView4_Config"] | components["schemas"]["Main_Diffusers_QwenImage_Config"] | components["schemas"]["Main_Diffusers_ZImage_Config"] | components["schemas"]["Main_Checkpoint_SD1_Config"] | components["schemas"]["Main_Checkpoint_SD2_Config"] | components["schemas"]["Main_Checkpoint_SDXL_Config"] | components["schemas"]["Main_Checkpoint_SDXLRefiner_Config"] | components["schemas"]["Main_Checkpoint_Flux2_Config"] | components["schemas"]["Main_Checkpoint_FLUX_Config"] | components["schemas"]["Main_Checkpoint_QwenImage_Config"] | components["schemas"]["Main_Checkpoint_ZImage_Config"] | components["schemas"]["Main_Checkpoint_Anima_Config"] | components["schemas"]["Main_BnBNF4_FLUX_Config"] | components["schemas"]["Main_GGUF_Flux2_Config"] | components["schemas"]["Main_GGUF_FLUX_Config"] | components["schemas"]["Main_GGUF_QwenImage_Config"] | components["schemas"]["Main_GGUF_ZImage_Config"] | components["schemas"]["VAE_Checkpoint_SD1_Config"] | components["schemas"]["VAE_Checkpoint_SD2_Config"] | components["schemas"]["VAE_Checkpoint_SDXL_Config"] | components["schemas"]["VAE_Checkpoint_FLUX_Config"] | components["schemas"]["VAE_Checkpoint_Flux2_Config"] | components["schemas"]["VAE_Checkpoint_QwenImage_Config"] | components["schemas"]["VAE_Checkpoint_Anima_Config"] | components["schemas"]["VAE_Diffusers_SD1_Config"] | components["schemas"]["VAE_Diffusers_SDXL_Config"] | components["schemas"]["VAE_Diffusers_Flux2_Config"] | components["schemas"]["PiDDecoder_Checkpoint_FLUX_Config"] | components["schemas"]["PiDDecoder_Checkpoint_Flux2_Config"] | 
components["schemas"]["PiDDecoder_Checkpoint_SD3_Config"] | components["schemas"]["PiDDecoder_Checkpoint_SDXL_Config"] | components["schemas"]["PiDDecoder_Checkpoint_QwenImage_Config"] | components["schemas"]["ControlNet_Checkpoint_SD1_Config"] | components["schemas"]["ControlNet_Checkpoint_SD2_Config"] | components["schemas"]["ControlNet_Checkpoint_SDXL_Config"] | components["schemas"]["ControlNet_Checkpoint_FLUX_Config"] | components["schemas"]["ControlNet_Checkpoint_ZImage_Config"] | components["schemas"]["ControlNet_Diffusers_SD1_Config"] | components["schemas"]["ControlNet_Diffusers_SD2_Config"] | components["schemas"]["ControlNet_Diffusers_SDXL_Config"] | components["schemas"]["ControlNet_Diffusers_FLUX_Config"] | components["schemas"]["LoRA_LyCORIS_SD1_Config"] | components["schemas"]["LoRA_LyCORIS_SD2_Config"] | components["schemas"]["LoRA_LyCORIS_SDXL_Config"] | components["schemas"]["LoRA_LyCORIS_Flux2_Config"] | components["schemas"]["LoRA_LyCORIS_FLUX_Config"] | components["schemas"]["LoRA_LyCORIS_ZImage_Config"] | components["schemas"]["LoRA_LyCORIS_QwenImage_Config"] | components["schemas"]["LoRA_LyCORIS_Anima_Config"] | components["schemas"]["LoRA_OMI_SDXL_Config"] | components["schemas"]["LoRA_OMI_FLUX_Config"] | components["schemas"]["LoRA_Diffusers_SD1_Config"] | components["schemas"]["LoRA_Diffusers_SD2_Config"] | components["schemas"]["LoRA_Diffusers_SDXL_Config"] | components["schemas"]["LoRA_Diffusers_Flux2_Config"] | components["schemas"]["LoRA_Diffusers_FLUX_Config"] | components["schemas"]["LoRA_Diffusers_ZImage_Config"] | components["schemas"]["ControlLoRA_LyCORIS_FLUX_Config"] | components["schemas"]["T5Encoder_T5Encoder_Config"] | components["schemas"]["T5Encoder_BnBLLMint8_Config"] | components["schemas"]["Qwen3Encoder_Qwen3Encoder_Config"] | components["schemas"]["Qwen3Encoder_Checkpoint_Config"] | components["schemas"]["Qwen3Encoder_GGUF_Config"] | components["schemas"]["Gemma2Encoder_Gemma2Encoder_Config"] | 
components["schemas"]["QwenVLEncoder_Diffusers_Config"] | components["schemas"]["QwenVLEncoder_Checkpoint_Config"] | components["schemas"]["TI_File_SD1_Config"] | components["schemas"]["TI_File_SD2_Config"] | components["schemas"]["TI_File_SDXL_Config"] | components["schemas"]["TI_Folder_SD1_Config"] | components["schemas"]["TI_Folder_SD2_Config"] | components["schemas"]["TI_Folder_SDXL_Config"] | components["schemas"]["IPAdapter_InvokeAI_SD1_Config"] | components["schemas"]["IPAdapter_InvokeAI_SD2_Config"] | components["schemas"]["IPAdapter_InvokeAI_SDXL_Config"] | components["schemas"]["IPAdapter_Checkpoint_SD1_Config"] | components["schemas"]["IPAdapter_Checkpoint_SD2_Config"] | components["schemas"]["IPAdapter_Checkpoint_SDXL_Config"] | components["schemas"]["IPAdapter_Checkpoint_FLUX_Config"] | components["schemas"]["T2IAdapter_Diffusers_SD1_Config"] | components["schemas"]["T2IAdapter_Diffusers_SDXL_Config"] | components["schemas"]["Spandrel_Checkpoint_Config"] | components["schemas"]["CLIPEmbed_Diffusers_G_Config"] | components["schemas"]["CLIPEmbed_Diffusers_L_Config"] | components["schemas"]["CLIPVision_Diffusers_Config"] | components["schemas"]["SigLIP_Diffusers_Config"] | components["schemas"]["FLUXRedux_Checkpoint_Config"] | components["schemas"]["LlavaOnevision_Diffusers_Config"] | components["schemas"]["TextLLM_Diffusers_Config"] | components["schemas"]["ExternalApiModelConfig"] | components["schemas"]["Unknown_Config"];
};
/**
* ModelInstallDownloadProgressEvent
@@ -23768,7 +24089,7 @@ export type components = {
* Config Out
* @description After successful installation, this will hold the configuration object.
*/
- config_out?: (components["schemas"]["Main_Diffusers_SD1_Config"] | components["schemas"]["Main_Diffusers_SD2_Config"] | components["schemas"]["Main_Diffusers_SDXL_Config"] | components["schemas"]["Main_Diffusers_SDXLRefiner_Config"] | components["schemas"]["Main_Diffusers_SD3_Config"] | components["schemas"]["Main_Diffusers_FLUX_Config"] | components["schemas"]["Main_Diffusers_Flux2_Config"] | components["schemas"]["Main_Diffusers_CogView4_Config"] | components["schemas"]["Main_Diffusers_QwenImage_Config"] | components["schemas"]["Main_Diffusers_ZImage_Config"] | components["schemas"]["Main_Checkpoint_SD1_Config"] | components["schemas"]["Main_Checkpoint_SD2_Config"] | components["schemas"]["Main_Checkpoint_SDXL_Config"] | components["schemas"]["Main_Checkpoint_SDXLRefiner_Config"] | components["schemas"]["Main_Checkpoint_Flux2_Config"] | components["schemas"]["Main_Checkpoint_FLUX_Config"] | components["schemas"]["Main_Checkpoint_QwenImage_Config"] | components["schemas"]["Main_Checkpoint_ZImage_Config"] | components["schemas"]["Main_Checkpoint_Anima_Config"] | components["schemas"]["Main_BnBNF4_FLUX_Config"] | components["schemas"]["Main_GGUF_Flux2_Config"] | components["schemas"]["Main_GGUF_FLUX_Config"] | components["schemas"]["Main_GGUF_QwenImage_Config"] | components["schemas"]["Main_GGUF_ZImage_Config"] | components["schemas"]["VAE_Checkpoint_SD1_Config"] | components["schemas"]["VAE_Checkpoint_SD2_Config"] | components["schemas"]["VAE_Checkpoint_SDXL_Config"] | components["schemas"]["VAE_Checkpoint_FLUX_Config"] | components["schemas"]["VAE_Checkpoint_Flux2_Config"] | components["schemas"]["VAE_Checkpoint_QwenImage_Config"] | components["schemas"]["VAE_Checkpoint_Anima_Config"] | components["schemas"]["VAE_Diffusers_SD1_Config"] | components["schemas"]["VAE_Diffusers_SDXL_Config"] | components["schemas"]["VAE_Diffusers_Flux2_Config"] | components["schemas"]["ControlNet_Checkpoint_SD1_Config"] | components["schemas"]["ControlNet_Checkpoint_SD2_Config"] | 
components["schemas"]["ControlNet_Checkpoint_SDXL_Config"] | components["schemas"]["ControlNet_Checkpoint_FLUX_Config"] | components["schemas"]["ControlNet_Checkpoint_ZImage_Config"] | components["schemas"]["ControlNet_Diffusers_SD1_Config"] | components["schemas"]["ControlNet_Diffusers_SD2_Config"] | components["schemas"]["ControlNet_Diffusers_SDXL_Config"] | components["schemas"]["ControlNet_Diffusers_FLUX_Config"] | components["schemas"]["LoRA_LyCORIS_SD1_Config"] | components["schemas"]["LoRA_LyCORIS_SD2_Config"] | components["schemas"]["LoRA_LyCORIS_SDXL_Config"] | components["schemas"]["LoRA_LyCORIS_Flux2_Config"] | components["schemas"]["LoRA_LyCORIS_FLUX_Config"] | components["schemas"]["LoRA_LyCORIS_ZImage_Config"] | components["schemas"]["LoRA_LyCORIS_QwenImage_Config"] | components["schemas"]["LoRA_LyCORIS_Anima_Config"] | components["schemas"]["LoRA_OMI_SDXL_Config"] | components["schemas"]["LoRA_OMI_FLUX_Config"] | components["schemas"]["LoRA_Diffusers_SD1_Config"] | components["schemas"]["LoRA_Diffusers_SD2_Config"] | components["schemas"]["LoRA_Diffusers_SDXL_Config"] | components["schemas"]["LoRA_Diffusers_Flux2_Config"] | components["schemas"]["LoRA_Diffusers_FLUX_Config"] | components["schemas"]["LoRA_Diffusers_ZImage_Config"] | components["schemas"]["ControlLoRA_LyCORIS_FLUX_Config"] | components["schemas"]["T5Encoder_T5Encoder_Config"] | components["schemas"]["T5Encoder_BnBLLMint8_Config"] | components["schemas"]["Qwen3Encoder_Qwen3Encoder_Config"] | components["schemas"]["Qwen3Encoder_Checkpoint_Config"] | components["schemas"]["Qwen3Encoder_GGUF_Config"] | components["schemas"]["QwenVLEncoder_Diffusers_Config"] | components["schemas"]["QwenVLEncoder_Checkpoint_Config"] | components["schemas"]["TI_File_SD1_Config"] | components["schemas"]["TI_File_SD2_Config"] | components["schemas"]["TI_File_SDXL_Config"] | components["schemas"]["TI_Folder_SD1_Config"] | components["schemas"]["TI_Folder_SD2_Config"] | 
components["schemas"]["TI_Folder_SDXL_Config"] | components["schemas"]["IPAdapter_InvokeAI_SD1_Config"] | components["schemas"]["IPAdapter_InvokeAI_SD2_Config"] | components["schemas"]["IPAdapter_InvokeAI_SDXL_Config"] | components["schemas"]["IPAdapter_Checkpoint_SD1_Config"] | components["schemas"]["IPAdapter_Checkpoint_SD2_Config"] | components["schemas"]["IPAdapter_Checkpoint_SDXL_Config"] | components["schemas"]["IPAdapter_Checkpoint_FLUX_Config"] | components["schemas"]["T2IAdapter_Diffusers_SD1_Config"] | components["schemas"]["T2IAdapter_Diffusers_SDXL_Config"] | components["schemas"]["Spandrel_Checkpoint_Config"] | components["schemas"]["CLIPEmbed_Diffusers_G_Config"] | components["schemas"]["CLIPEmbed_Diffusers_L_Config"] | components["schemas"]["CLIPVision_Diffusers_Config"] | components["schemas"]["SigLIP_Diffusers_Config"] | components["schemas"]["FLUXRedux_Checkpoint_Config"] | components["schemas"]["LlavaOnevision_Diffusers_Config"] | components["schemas"]["TextLLM_Diffusers_Config"] | components["schemas"]["ExternalApiModelConfig"] | components["schemas"]["Unknown_Config"]) | null;
+ config_out?: (components["schemas"]["Main_Diffusers_SD1_Config"] | components["schemas"]["Main_Diffusers_SD2_Config"] | components["schemas"]["Main_Diffusers_SDXL_Config"] | components["schemas"]["Main_Diffusers_SDXLRefiner_Config"] | components["schemas"]["Main_Diffusers_SD3_Config"] | components["schemas"]["Main_Diffusers_FLUX_Config"] | components["schemas"]["Main_Diffusers_Flux2_Config"] | components["schemas"]["Main_Diffusers_CogView4_Config"] | components["schemas"]["Main_Diffusers_QwenImage_Config"] | components["schemas"]["Main_Diffusers_ZImage_Config"] | components["schemas"]["Main_Checkpoint_SD1_Config"] | components["schemas"]["Main_Checkpoint_SD2_Config"] | components["schemas"]["Main_Checkpoint_SDXL_Config"] | components["schemas"]["Main_Checkpoint_SDXLRefiner_Config"] | components["schemas"]["Main_Checkpoint_Flux2_Config"] | components["schemas"]["Main_Checkpoint_FLUX_Config"] | components["schemas"]["Main_Checkpoint_QwenImage_Config"] | components["schemas"]["Main_Checkpoint_ZImage_Config"] | components["schemas"]["Main_Checkpoint_Anima_Config"] | components["schemas"]["Main_BnBNF4_FLUX_Config"] | components["schemas"]["Main_GGUF_Flux2_Config"] | components["schemas"]["Main_GGUF_FLUX_Config"] | components["schemas"]["Main_GGUF_QwenImage_Config"] | components["schemas"]["Main_GGUF_ZImage_Config"] | components["schemas"]["VAE_Checkpoint_SD1_Config"] | components["schemas"]["VAE_Checkpoint_SD2_Config"] | components["schemas"]["VAE_Checkpoint_SDXL_Config"] | components["schemas"]["VAE_Checkpoint_FLUX_Config"] | components["schemas"]["VAE_Checkpoint_Flux2_Config"] | components["schemas"]["VAE_Checkpoint_QwenImage_Config"] | components["schemas"]["VAE_Checkpoint_Anima_Config"] | components["schemas"]["VAE_Diffusers_SD1_Config"] | components["schemas"]["VAE_Diffusers_SDXL_Config"] | components["schemas"]["VAE_Diffusers_Flux2_Config"] | components["schemas"]["PiDDecoder_Checkpoint_FLUX_Config"] | components["schemas"]["PiDDecoder_Checkpoint_Flux2_Config"] 
| components["schemas"]["PiDDecoder_Checkpoint_SD3_Config"] | components["schemas"]["PiDDecoder_Checkpoint_SDXL_Config"] | components["schemas"]["PiDDecoder_Checkpoint_QwenImage_Config"] | components["schemas"]["ControlNet_Checkpoint_SD1_Config"] | components["schemas"]["ControlNet_Checkpoint_SD2_Config"] | components["schemas"]["ControlNet_Checkpoint_SDXL_Config"] | components["schemas"]["ControlNet_Checkpoint_FLUX_Config"] | components["schemas"]["ControlNet_Checkpoint_ZImage_Config"] | components["schemas"]["ControlNet_Diffusers_SD1_Config"] | components["schemas"]["ControlNet_Diffusers_SD2_Config"] | components["schemas"]["ControlNet_Diffusers_SDXL_Config"] | components["schemas"]["ControlNet_Diffusers_FLUX_Config"] | components["schemas"]["LoRA_LyCORIS_SD1_Config"] | components["schemas"]["LoRA_LyCORIS_SD2_Config"] | components["schemas"]["LoRA_LyCORIS_SDXL_Config"] | components["schemas"]["LoRA_LyCORIS_Flux2_Config"] | components["schemas"]["LoRA_LyCORIS_FLUX_Config"] | components["schemas"]["LoRA_LyCORIS_ZImage_Config"] | components["schemas"]["LoRA_LyCORIS_QwenImage_Config"] | components["schemas"]["LoRA_LyCORIS_Anima_Config"] | components["schemas"]["LoRA_OMI_SDXL_Config"] | components["schemas"]["LoRA_OMI_FLUX_Config"] | components["schemas"]["LoRA_Diffusers_SD1_Config"] | components["schemas"]["LoRA_Diffusers_SD2_Config"] | components["schemas"]["LoRA_Diffusers_SDXL_Config"] | components["schemas"]["LoRA_Diffusers_Flux2_Config"] | components["schemas"]["LoRA_Diffusers_FLUX_Config"] | components["schemas"]["LoRA_Diffusers_ZImage_Config"] | components["schemas"]["ControlLoRA_LyCORIS_FLUX_Config"] | components["schemas"]["T5Encoder_T5Encoder_Config"] | components["schemas"]["T5Encoder_BnBLLMint8_Config"] | components["schemas"]["Qwen3Encoder_Qwen3Encoder_Config"] | components["schemas"]["Qwen3Encoder_Checkpoint_Config"] | components["schemas"]["Qwen3Encoder_GGUF_Config"] | components["schemas"]["Gemma2Encoder_Gemma2Encoder_Config"] | 
components["schemas"]["QwenVLEncoder_Diffusers_Config"] | components["schemas"]["QwenVLEncoder_Checkpoint_Config"] | components["schemas"]["TI_File_SD1_Config"] | components["schemas"]["TI_File_SD2_Config"] | components["schemas"]["TI_File_SDXL_Config"] | components["schemas"]["TI_Folder_SD1_Config"] | components["schemas"]["TI_Folder_SD2_Config"] | components["schemas"]["TI_Folder_SDXL_Config"] | components["schemas"]["IPAdapter_InvokeAI_SD1_Config"] | components["schemas"]["IPAdapter_InvokeAI_SD2_Config"] | components["schemas"]["IPAdapter_InvokeAI_SDXL_Config"] | components["schemas"]["IPAdapter_Checkpoint_SD1_Config"] | components["schemas"]["IPAdapter_Checkpoint_SD2_Config"] | components["schemas"]["IPAdapter_Checkpoint_SDXL_Config"] | components["schemas"]["IPAdapter_Checkpoint_FLUX_Config"] | components["schemas"]["T2IAdapter_Diffusers_SD1_Config"] | components["schemas"]["T2IAdapter_Diffusers_SDXL_Config"] | components["schemas"]["Spandrel_Checkpoint_Config"] | components["schemas"]["CLIPEmbed_Diffusers_G_Config"] | components["schemas"]["CLIPEmbed_Diffusers_L_Config"] | components["schemas"]["CLIPVision_Diffusers_Config"] | components["schemas"]["SigLIP_Diffusers_Config"] | components["schemas"]["FLUXRedux_Checkpoint_Config"] | components["schemas"]["LlavaOnevision_Diffusers_Config"] | components["schemas"]["TextLLM_Diffusers_Config"] | components["schemas"]["ExternalApiModelConfig"] | components["schemas"]["Unknown_Config"]) | null;
/**
* Inplace
* @description Leave model in its current location; otherwise install under models directory
@@ -23854,7 +24175,7 @@ export type components = {
* Config
* @description The model's config
*/
- config: components["schemas"]["Main_Diffusers_SD1_Config"] | components["schemas"]["Main_Diffusers_SD2_Config"] | components["schemas"]["Main_Diffusers_SDXL_Config"] | components["schemas"]["Main_Diffusers_SDXLRefiner_Config"] | components["schemas"]["Main_Diffusers_SD3_Config"] | components["schemas"]["Main_Diffusers_FLUX_Config"] | components["schemas"]["Main_Diffusers_Flux2_Config"] | components["schemas"]["Main_Diffusers_CogView4_Config"] | components["schemas"]["Main_Diffusers_QwenImage_Config"] | components["schemas"]["Main_Diffusers_ZImage_Config"] | components["schemas"]["Main_Checkpoint_SD1_Config"] | components["schemas"]["Main_Checkpoint_SD2_Config"] | components["schemas"]["Main_Checkpoint_SDXL_Config"] | components["schemas"]["Main_Checkpoint_SDXLRefiner_Config"] | components["schemas"]["Main_Checkpoint_Flux2_Config"] | components["schemas"]["Main_Checkpoint_FLUX_Config"] | components["schemas"]["Main_Checkpoint_QwenImage_Config"] | components["schemas"]["Main_Checkpoint_ZImage_Config"] | components["schemas"]["Main_Checkpoint_Anima_Config"] | components["schemas"]["Main_BnBNF4_FLUX_Config"] | components["schemas"]["Main_GGUF_Flux2_Config"] | components["schemas"]["Main_GGUF_FLUX_Config"] | components["schemas"]["Main_GGUF_QwenImage_Config"] | components["schemas"]["Main_GGUF_ZImage_Config"] | components["schemas"]["VAE_Checkpoint_SD1_Config"] | components["schemas"]["VAE_Checkpoint_SD2_Config"] | components["schemas"]["VAE_Checkpoint_SDXL_Config"] | components["schemas"]["VAE_Checkpoint_FLUX_Config"] | components["schemas"]["VAE_Checkpoint_Flux2_Config"] | components["schemas"]["VAE_Checkpoint_QwenImage_Config"] | components["schemas"]["VAE_Checkpoint_Anima_Config"] | components["schemas"]["VAE_Diffusers_SD1_Config"] | components["schemas"]["VAE_Diffusers_SDXL_Config"] | components["schemas"]["VAE_Diffusers_Flux2_Config"] | components["schemas"]["ControlNet_Checkpoint_SD1_Config"] | components["schemas"]["ControlNet_Checkpoint_SD2_Config"] | 
components["schemas"]["ControlNet_Checkpoint_SDXL_Config"] | components["schemas"]["ControlNet_Checkpoint_FLUX_Config"] | components["schemas"]["ControlNet_Checkpoint_ZImage_Config"] | components["schemas"]["ControlNet_Diffusers_SD1_Config"] | components["schemas"]["ControlNet_Diffusers_SD2_Config"] | components["schemas"]["ControlNet_Diffusers_SDXL_Config"] | components["schemas"]["ControlNet_Diffusers_FLUX_Config"] | components["schemas"]["LoRA_LyCORIS_SD1_Config"] | components["schemas"]["LoRA_LyCORIS_SD2_Config"] | components["schemas"]["LoRA_LyCORIS_SDXL_Config"] | components["schemas"]["LoRA_LyCORIS_Flux2_Config"] | components["schemas"]["LoRA_LyCORIS_FLUX_Config"] | components["schemas"]["LoRA_LyCORIS_ZImage_Config"] | components["schemas"]["LoRA_LyCORIS_QwenImage_Config"] | components["schemas"]["LoRA_LyCORIS_Anima_Config"] | components["schemas"]["LoRA_OMI_SDXL_Config"] | components["schemas"]["LoRA_OMI_FLUX_Config"] | components["schemas"]["LoRA_Diffusers_SD1_Config"] | components["schemas"]["LoRA_Diffusers_SD2_Config"] | components["schemas"]["LoRA_Diffusers_SDXL_Config"] | components["schemas"]["LoRA_Diffusers_Flux2_Config"] | components["schemas"]["LoRA_Diffusers_FLUX_Config"] | components["schemas"]["LoRA_Diffusers_ZImage_Config"] | components["schemas"]["ControlLoRA_LyCORIS_FLUX_Config"] | components["schemas"]["T5Encoder_T5Encoder_Config"] | components["schemas"]["T5Encoder_BnBLLMint8_Config"] | components["schemas"]["Qwen3Encoder_Qwen3Encoder_Config"] | components["schemas"]["Qwen3Encoder_Checkpoint_Config"] | components["schemas"]["Qwen3Encoder_GGUF_Config"] | components["schemas"]["QwenVLEncoder_Diffusers_Config"] | components["schemas"]["QwenVLEncoder_Checkpoint_Config"] | components["schemas"]["TI_File_SD1_Config"] | components["schemas"]["TI_File_SD2_Config"] | components["schemas"]["TI_File_SDXL_Config"] | components["schemas"]["TI_Folder_SD1_Config"] | components["schemas"]["TI_Folder_SD2_Config"] | 
components["schemas"]["TI_Folder_SDXL_Config"] | components["schemas"]["IPAdapter_InvokeAI_SD1_Config"] | components["schemas"]["IPAdapter_InvokeAI_SD2_Config"] | components["schemas"]["IPAdapter_InvokeAI_SDXL_Config"] | components["schemas"]["IPAdapter_Checkpoint_SD1_Config"] | components["schemas"]["IPAdapter_Checkpoint_SD2_Config"] | components["schemas"]["IPAdapter_Checkpoint_SDXL_Config"] | components["schemas"]["IPAdapter_Checkpoint_FLUX_Config"] | components["schemas"]["T2IAdapter_Diffusers_SD1_Config"] | components["schemas"]["T2IAdapter_Diffusers_SDXL_Config"] | components["schemas"]["Spandrel_Checkpoint_Config"] | components["schemas"]["CLIPEmbed_Diffusers_G_Config"] | components["schemas"]["CLIPEmbed_Diffusers_L_Config"] | components["schemas"]["CLIPVision_Diffusers_Config"] | components["schemas"]["SigLIP_Diffusers_Config"] | components["schemas"]["FLUXRedux_Checkpoint_Config"] | components["schemas"]["LlavaOnevision_Diffusers_Config"] | components["schemas"]["TextLLM_Diffusers_Config"] | components["schemas"]["ExternalApiModelConfig"] | components["schemas"]["Unknown_Config"];
+ config: components["schemas"]["Main_Diffusers_SD1_Config"] | components["schemas"]["Main_Diffusers_SD2_Config"] | components["schemas"]["Main_Diffusers_SDXL_Config"] | components["schemas"]["Main_Diffusers_SDXLRefiner_Config"] | components["schemas"]["Main_Diffusers_SD3_Config"] | components["schemas"]["Main_Diffusers_FLUX_Config"] | components["schemas"]["Main_Diffusers_Flux2_Config"] | components["schemas"]["Main_Diffusers_CogView4_Config"] | components["schemas"]["Main_Diffusers_QwenImage_Config"] | components["schemas"]["Main_Diffusers_ZImage_Config"] | components["schemas"]["Main_Checkpoint_SD1_Config"] | components["schemas"]["Main_Checkpoint_SD2_Config"] | components["schemas"]["Main_Checkpoint_SDXL_Config"] | components["schemas"]["Main_Checkpoint_SDXLRefiner_Config"] | components["schemas"]["Main_Checkpoint_Flux2_Config"] | components["schemas"]["Main_Checkpoint_FLUX_Config"] | components["schemas"]["Main_Checkpoint_QwenImage_Config"] | components["schemas"]["Main_Checkpoint_ZImage_Config"] | components["schemas"]["Main_Checkpoint_Anima_Config"] | components["schemas"]["Main_BnBNF4_FLUX_Config"] | components["schemas"]["Main_GGUF_Flux2_Config"] | components["schemas"]["Main_GGUF_FLUX_Config"] | components["schemas"]["Main_GGUF_QwenImage_Config"] | components["schemas"]["Main_GGUF_ZImage_Config"] | components["schemas"]["VAE_Checkpoint_SD1_Config"] | components["schemas"]["VAE_Checkpoint_SD2_Config"] | components["schemas"]["VAE_Checkpoint_SDXL_Config"] | components["schemas"]["VAE_Checkpoint_FLUX_Config"] | components["schemas"]["VAE_Checkpoint_Flux2_Config"] | components["schemas"]["VAE_Checkpoint_QwenImage_Config"] | components["schemas"]["VAE_Checkpoint_Anima_Config"] | components["schemas"]["VAE_Diffusers_SD1_Config"] | components["schemas"]["VAE_Diffusers_SDXL_Config"] | components["schemas"]["VAE_Diffusers_Flux2_Config"] | components["schemas"]["PiDDecoder_Checkpoint_FLUX_Config"] | components["schemas"]["PiDDecoder_Checkpoint_Flux2_Config"] | 
components["schemas"]["PiDDecoder_Checkpoint_SD3_Config"] | components["schemas"]["PiDDecoder_Checkpoint_SDXL_Config"] | components["schemas"]["PiDDecoder_Checkpoint_QwenImage_Config"] | components["schemas"]["ControlNet_Checkpoint_SD1_Config"] | components["schemas"]["ControlNet_Checkpoint_SD2_Config"] | components["schemas"]["ControlNet_Checkpoint_SDXL_Config"] | components["schemas"]["ControlNet_Checkpoint_FLUX_Config"] | components["schemas"]["ControlNet_Checkpoint_ZImage_Config"] | components["schemas"]["ControlNet_Diffusers_SD1_Config"] | components["schemas"]["ControlNet_Diffusers_SD2_Config"] | components["schemas"]["ControlNet_Diffusers_SDXL_Config"] | components["schemas"]["ControlNet_Diffusers_FLUX_Config"] | components["schemas"]["LoRA_LyCORIS_SD1_Config"] | components["schemas"]["LoRA_LyCORIS_SD2_Config"] | components["schemas"]["LoRA_LyCORIS_SDXL_Config"] | components["schemas"]["LoRA_LyCORIS_Flux2_Config"] | components["schemas"]["LoRA_LyCORIS_FLUX_Config"] | components["schemas"]["LoRA_LyCORIS_ZImage_Config"] | components["schemas"]["LoRA_LyCORIS_QwenImage_Config"] | components["schemas"]["LoRA_LyCORIS_Anima_Config"] | components["schemas"]["LoRA_OMI_SDXL_Config"] | components["schemas"]["LoRA_OMI_FLUX_Config"] | components["schemas"]["LoRA_Diffusers_SD1_Config"] | components["schemas"]["LoRA_Diffusers_SD2_Config"] | components["schemas"]["LoRA_Diffusers_SDXL_Config"] | components["schemas"]["LoRA_Diffusers_Flux2_Config"] | components["schemas"]["LoRA_Diffusers_FLUX_Config"] | components["schemas"]["LoRA_Diffusers_ZImage_Config"] | components["schemas"]["ControlLoRA_LyCORIS_FLUX_Config"] | components["schemas"]["T5Encoder_T5Encoder_Config"] | components["schemas"]["T5Encoder_BnBLLMint8_Config"] | components["schemas"]["Qwen3Encoder_Qwen3Encoder_Config"] | components["schemas"]["Qwen3Encoder_Checkpoint_Config"] | components["schemas"]["Qwen3Encoder_GGUF_Config"] | components["schemas"]["Gemma2Encoder_Gemma2Encoder_Config"] | 
components["schemas"]["QwenVLEncoder_Diffusers_Config"] | components["schemas"]["QwenVLEncoder_Checkpoint_Config"] | components["schemas"]["TI_File_SD1_Config"] | components["schemas"]["TI_File_SD2_Config"] | components["schemas"]["TI_File_SDXL_Config"] | components["schemas"]["TI_Folder_SD1_Config"] | components["schemas"]["TI_Folder_SD2_Config"] | components["schemas"]["TI_Folder_SDXL_Config"] | components["schemas"]["IPAdapter_InvokeAI_SD1_Config"] | components["schemas"]["IPAdapter_InvokeAI_SD2_Config"] | components["schemas"]["IPAdapter_InvokeAI_SDXL_Config"] | components["schemas"]["IPAdapter_Checkpoint_SD1_Config"] | components["schemas"]["IPAdapter_Checkpoint_SD2_Config"] | components["schemas"]["IPAdapter_Checkpoint_SDXL_Config"] | components["schemas"]["IPAdapter_Checkpoint_FLUX_Config"] | components["schemas"]["T2IAdapter_Diffusers_SD1_Config"] | components["schemas"]["T2IAdapter_Diffusers_SDXL_Config"] | components["schemas"]["Spandrel_Checkpoint_Config"] | components["schemas"]["CLIPEmbed_Diffusers_G_Config"] | components["schemas"]["CLIPEmbed_Diffusers_L_Config"] | components["schemas"]["CLIPVision_Diffusers_Config"] | components["schemas"]["SigLIP_Diffusers_Config"] | components["schemas"]["FLUXRedux_Checkpoint_Config"] | components["schemas"]["LlavaOnevision_Diffusers_Config"] | components["schemas"]["TextLLM_Diffusers_Config"] | components["schemas"]["ExternalApiModelConfig"] | components["schemas"]["Unknown_Config"];
/**
* @description The submodel type, if any
* @default null
@@ -23875,7 +24196,7 @@ export type components = {
* Config
* @description The model's config
*/
- config: components["schemas"]["Main_Diffusers_SD1_Config"] | components["schemas"]["Main_Diffusers_SD2_Config"] | components["schemas"]["Main_Diffusers_SDXL_Config"] | components["schemas"]["Main_Diffusers_SDXLRefiner_Config"] | components["schemas"]["Main_Diffusers_SD3_Config"] | components["schemas"]["Main_Diffusers_FLUX_Config"] | components["schemas"]["Main_Diffusers_Flux2_Config"] | components["schemas"]["Main_Diffusers_CogView4_Config"] | components["schemas"]["Main_Diffusers_QwenImage_Config"] | components["schemas"]["Main_Diffusers_ZImage_Config"] | components["schemas"]["Main_Checkpoint_SD1_Config"] | components["schemas"]["Main_Checkpoint_SD2_Config"] | components["schemas"]["Main_Checkpoint_SDXL_Config"] | components["schemas"]["Main_Checkpoint_SDXLRefiner_Config"] | components["schemas"]["Main_Checkpoint_Flux2_Config"] | components["schemas"]["Main_Checkpoint_FLUX_Config"] | components["schemas"]["Main_Checkpoint_QwenImage_Config"] | components["schemas"]["Main_Checkpoint_ZImage_Config"] | components["schemas"]["Main_Checkpoint_Anima_Config"] | components["schemas"]["Main_BnBNF4_FLUX_Config"] | components["schemas"]["Main_GGUF_Flux2_Config"] | components["schemas"]["Main_GGUF_FLUX_Config"] | components["schemas"]["Main_GGUF_QwenImage_Config"] | components["schemas"]["Main_GGUF_ZImage_Config"] | components["schemas"]["VAE_Checkpoint_SD1_Config"] | components["schemas"]["VAE_Checkpoint_SD2_Config"] | components["schemas"]["VAE_Checkpoint_SDXL_Config"] | components["schemas"]["VAE_Checkpoint_FLUX_Config"] | components["schemas"]["VAE_Checkpoint_Flux2_Config"] | components["schemas"]["VAE_Checkpoint_QwenImage_Config"] | components["schemas"]["VAE_Checkpoint_Anima_Config"] | components["schemas"]["VAE_Diffusers_SD1_Config"] | components["schemas"]["VAE_Diffusers_SDXL_Config"] | components["schemas"]["VAE_Diffusers_Flux2_Config"] | components["schemas"]["ControlNet_Checkpoint_SD1_Config"] | components["schemas"]["ControlNet_Checkpoint_SD2_Config"] | 
components["schemas"]["ControlNet_Checkpoint_SDXL_Config"] | components["schemas"]["ControlNet_Checkpoint_FLUX_Config"] | components["schemas"]["ControlNet_Checkpoint_ZImage_Config"] | components["schemas"]["ControlNet_Diffusers_SD1_Config"] | components["schemas"]["ControlNet_Diffusers_SD2_Config"] | components["schemas"]["ControlNet_Diffusers_SDXL_Config"] | components["schemas"]["ControlNet_Diffusers_FLUX_Config"] | components["schemas"]["LoRA_LyCORIS_SD1_Config"] | components["schemas"]["LoRA_LyCORIS_SD2_Config"] | components["schemas"]["LoRA_LyCORIS_SDXL_Config"] | components["schemas"]["LoRA_LyCORIS_Flux2_Config"] | components["schemas"]["LoRA_LyCORIS_FLUX_Config"] | components["schemas"]["LoRA_LyCORIS_ZImage_Config"] | components["schemas"]["LoRA_LyCORIS_QwenImage_Config"] | components["schemas"]["LoRA_LyCORIS_Anima_Config"] | components["schemas"]["LoRA_OMI_SDXL_Config"] | components["schemas"]["LoRA_OMI_FLUX_Config"] | components["schemas"]["LoRA_Diffusers_SD1_Config"] | components["schemas"]["LoRA_Diffusers_SD2_Config"] | components["schemas"]["LoRA_Diffusers_SDXL_Config"] | components["schemas"]["LoRA_Diffusers_Flux2_Config"] | components["schemas"]["LoRA_Diffusers_FLUX_Config"] | components["schemas"]["LoRA_Diffusers_ZImage_Config"] | components["schemas"]["ControlLoRA_LyCORIS_FLUX_Config"] | components["schemas"]["T5Encoder_T5Encoder_Config"] | components["schemas"]["T5Encoder_BnBLLMint8_Config"] | components["schemas"]["Qwen3Encoder_Qwen3Encoder_Config"] | components["schemas"]["Qwen3Encoder_Checkpoint_Config"] | components["schemas"]["Qwen3Encoder_GGUF_Config"] | components["schemas"]["QwenVLEncoder_Diffusers_Config"] | components["schemas"]["QwenVLEncoder_Checkpoint_Config"] | components["schemas"]["TI_File_SD1_Config"] | components["schemas"]["TI_File_SD2_Config"] | components["schemas"]["TI_File_SDXL_Config"] | components["schemas"]["TI_Folder_SD1_Config"] | components["schemas"]["TI_Folder_SD2_Config"] | 
components["schemas"]["TI_Folder_SDXL_Config"] | components["schemas"]["IPAdapter_InvokeAI_SD1_Config"] | components["schemas"]["IPAdapter_InvokeAI_SD2_Config"] | components["schemas"]["IPAdapter_InvokeAI_SDXL_Config"] | components["schemas"]["IPAdapter_Checkpoint_SD1_Config"] | components["schemas"]["IPAdapter_Checkpoint_SD2_Config"] | components["schemas"]["IPAdapter_Checkpoint_SDXL_Config"] | components["schemas"]["IPAdapter_Checkpoint_FLUX_Config"] | components["schemas"]["T2IAdapter_Diffusers_SD1_Config"] | components["schemas"]["T2IAdapter_Diffusers_SDXL_Config"] | components["schemas"]["Spandrel_Checkpoint_Config"] | components["schemas"]["CLIPEmbed_Diffusers_G_Config"] | components["schemas"]["CLIPEmbed_Diffusers_L_Config"] | components["schemas"]["CLIPVision_Diffusers_Config"] | components["schemas"]["SigLIP_Diffusers_Config"] | components["schemas"]["FLUXRedux_Checkpoint_Config"] | components["schemas"]["LlavaOnevision_Diffusers_Config"] | components["schemas"]["TextLLM_Diffusers_Config"] | components["schemas"]["ExternalApiModelConfig"] | components["schemas"]["Unknown_Config"];
+ config: components["schemas"]["Main_Diffusers_SD1_Config"] | components["schemas"]["Main_Diffusers_SD2_Config"] | components["schemas"]["Main_Diffusers_SDXL_Config"] | components["schemas"]["Main_Diffusers_SDXLRefiner_Config"] | components["schemas"]["Main_Diffusers_SD3_Config"] | components["schemas"]["Main_Diffusers_FLUX_Config"] | components["schemas"]["Main_Diffusers_Flux2_Config"] | components["schemas"]["Main_Diffusers_CogView4_Config"] | components["schemas"]["Main_Diffusers_QwenImage_Config"] | components["schemas"]["Main_Diffusers_ZImage_Config"] | components["schemas"]["Main_Checkpoint_SD1_Config"] | components["schemas"]["Main_Checkpoint_SD2_Config"] | components["schemas"]["Main_Checkpoint_SDXL_Config"] | components["schemas"]["Main_Checkpoint_SDXLRefiner_Config"] | components["schemas"]["Main_Checkpoint_Flux2_Config"] | components["schemas"]["Main_Checkpoint_FLUX_Config"] | components["schemas"]["Main_Checkpoint_QwenImage_Config"] | components["schemas"]["Main_Checkpoint_ZImage_Config"] | components["schemas"]["Main_Checkpoint_Anima_Config"] | components["schemas"]["Main_BnBNF4_FLUX_Config"] | components["schemas"]["Main_GGUF_Flux2_Config"] | components["schemas"]["Main_GGUF_FLUX_Config"] | components["schemas"]["Main_GGUF_QwenImage_Config"] | components["schemas"]["Main_GGUF_ZImage_Config"] | components["schemas"]["VAE_Checkpoint_SD1_Config"] | components["schemas"]["VAE_Checkpoint_SD2_Config"] | components["schemas"]["VAE_Checkpoint_SDXL_Config"] | components["schemas"]["VAE_Checkpoint_FLUX_Config"] | components["schemas"]["VAE_Checkpoint_Flux2_Config"] | components["schemas"]["VAE_Checkpoint_QwenImage_Config"] | components["schemas"]["VAE_Checkpoint_Anima_Config"] | components["schemas"]["VAE_Diffusers_SD1_Config"] | components["schemas"]["VAE_Diffusers_SDXL_Config"] | components["schemas"]["VAE_Diffusers_Flux2_Config"] | components["schemas"]["PiDDecoder_Checkpoint_FLUX_Config"] | components["schemas"]["PiDDecoder_Checkpoint_Flux2_Config"] | 
components["schemas"]["PiDDecoder_Checkpoint_SD3_Config"] | components["schemas"]["PiDDecoder_Checkpoint_SDXL_Config"] | components["schemas"]["PiDDecoder_Checkpoint_QwenImage_Config"] | components["schemas"]["ControlNet_Checkpoint_SD1_Config"] | components["schemas"]["ControlNet_Checkpoint_SD2_Config"] | components["schemas"]["ControlNet_Checkpoint_SDXL_Config"] | components["schemas"]["ControlNet_Checkpoint_FLUX_Config"] | components["schemas"]["ControlNet_Checkpoint_ZImage_Config"] | components["schemas"]["ControlNet_Diffusers_SD1_Config"] | components["schemas"]["ControlNet_Diffusers_SD2_Config"] | components["schemas"]["ControlNet_Diffusers_SDXL_Config"] | components["schemas"]["ControlNet_Diffusers_FLUX_Config"] | components["schemas"]["LoRA_LyCORIS_SD1_Config"] | components["schemas"]["LoRA_LyCORIS_SD2_Config"] | components["schemas"]["LoRA_LyCORIS_SDXL_Config"] | components["schemas"]["LoRA_LyCORIS_Flux2_Config"] | components["schemas"]["LoRA_LyCORIS_FLUX_Config"] | components["schemas"]["LoRA_LyCORIS_ZImage_Config"] | components["schemas"]["LoRA_LyCORIS_QwenImage_Config"] | components["schemas"]["LoRA_LyCORIS_Anima_Config"] | components["schemas"]["LoRA_OMI_SDXL_Config"] | components["schemas"]["LoRA_OMI_FLUX_Config"] | components["schemas"]["LoRA_Diffusers_SD1_Config"] | components["schemas"]["LoRA_Diffusers_SD2_Config"] | components["schemas"]["LoRA_Diffusers_SDXL_Config"] | components["schemas"]["LoRA_Diffusers_Flux2_Config"] | components["schemas"]["LoRA_Diffusers_FLUX_Config"] | components["schemas"]["LoRA_Diffusers_ZImage_Config"] | components["schemas"]["ControlLoRA_LyCORIS_FLUX_Config"] | components["schemas"]["T5Encoder_T5Encoder_Config"] | components["schemas"]["T5Encoder_BnBLLMint8_Config"] | components["schemas"]["Qwen3Encoder_Qwen3Encoder_Config"] | components["schemas"]["Qwen3Encoder_Checkpoint_Config"] | components["schemas"]["Qwen3Encoder_GGUF_Config"] | components["schemas"]["Gemma2Encoder_Gemma2Encoder_Config"] | 
components["schemas"]["QwenVLEncoder_Diffusers_Config"] | components["schemas"]["QwenVLEncoder_Checkpoint_Config"] | components["schemas"]["TI_File_SD1_Config"] | components["schemas"]["TI_File_SD2_Config"] | components["schemas"]["TI_File_SDXL_Config"] | components["schemas"]["TI_Folder_SD1_Config"] | components["schemas"]["TI_Folder_SD2_Config"] | components["schemas"]["TI_Folder_SDXL_Config"] | components["schemas"]["IPAdapter_InvokeAI_SD1_Config"] | components["schemas"]["IPAdapter_InvokeAI_SD2_Config"] | components["schemas"]["IPAdapter_InvokeAI_SDXL_Config"] | components["schemas"]["IPAdapter_Checkpoint_SD1_Config"] | components["schemas"]["IPAdapter_Checkpoint_SD2_Config"] | components["schemas"]["IPAdapter_Checkpoint_SDXL_Config"] | components["schemas"]["IPAdapter_Checkpoint_FLUX_Config"] | components["schemas"]["T2IAdapter_Diffusers_SD1_Config"] | components["schemas"]["T2IAdapter_Diffusers_SDXL_Config"] | components["schemas"]["Spandrel_Checkpoint_Config"] | components["schemas"]["CLIPEmbed_Diffusers_G_Config"] | components["schemas"]["CLIPEmbed_Diffusers_L_Config"] | components["schemas"]["CLIPVision_Diffusers_Config"] | components["schemas"]["SigLIP_Diffusers_Config"] | components["schemas"]["FLUXRedux_Checkpoint_Config"] | components["schemas"]["LlavaOnevision_Diffusers_Config"] | components["schemas"]["TextLLM_Diffusers_Config"] | components["schemas"]["ExternalApiModelConfig"] | components["schemas"]["Unknown_Config"];
/**
* @description The submodel type, if any
* @default null
@@ -24001,7 +24322,7 @@ export type components = {
* Variant
* @description The variant of the model.
*/
- variant?: components["schemas"]["ModelVariantType"] | components["schemas"]["ClipVariantType"] | components["schemas"]["FluxVariantType"] | components["schemas"]["Flux2VariantType"] | components["schemas"]["ZImageVariantType"] | components["schemas"]["QwenImageVariantType"] | components["schemas"]["Qwen3VariantType"] | null;
+ variant?: components["schemas"]["ModelVariantType"] | components["schemas"]["ClipVariantType"] | components["schemas"]["FluxVariantType"] | components["schemas"]["Flux2VariantType"] | components["schemas"]["ZImageVariantType"] | components["schemas"]["QwenImageVariantType"] | components["schemas"]["Qwen3VariantType"] | components["schemas"]["PiDDecoderVariantType"] | null;
/** @description The prediction type of the model. */
prediction_type?: components["schemas"]["SchedulerPredictionType"] | null;
/**
@@ -24083,7 +24404,7 @@ export type components = {
* @description Model type.
* @enum {string}
*/
- ModelType: "onnx" | "main" | "vae" | "lora" | "control_lora" | "controlnet" | "embedding" | "ip_adapter" | "clip_vision" | "clip_embed" | "t2i_adapter" | "t5_encoder" | "qwen3_encoder" | "qwen_vl_encoder" | "spandrel_image_to_image" | "siglip" | "flux_redux" | "llava_onevision" | "text_llm" | "external_image_generator" | "unknown";
+ ModelType: "onnx" | "main" | "vae" | "lora" | "control_lora" | "controlnet" | "embedding" | "ip_adapter" | "clip_vision" | "clip_embed" | "t2i_adapter" | "t5_encoder" | "qwen3_encoder" | "qwen_vl_encoder" | "gemma2_encoder" | "spandrel_image_to_image" | "siglip" | "flux_redux" | "llava_onevision" | "text_llm" | "external_image_generator" | "pid_decoder" | "unknown";
/**
* ModelVariantType
* @description Variant type.
@@ -24096,7 +24417,7 @@ export type components = {
*/
ModelsList: {
/** Models */
- models: (components["schemas"]["Main_Diffusers_SD1_Config"] | components["schemas"]["Main_Diffusers_SD2_Config"] | components["schemas"]["Main_Diffusers_SDXL_Config"] | components["schemas"]["Main_Diffusers_SDXLRefiner_Config"] | components["schemas"]["Main_Diffusers_SD3_Config"] | components["schemas"]["Main_Diffusers_FLUX_Config"] | components["schemas"]["Main_Diffusers_Flux2_Config"] | components["schemas"]["Main_Diffusers_CogView4_Config"] | components["schemas"]["Main_Diffusers_QwenImage_Config"] | components["schemas"]["Main_Diffusers_ZImage_Config"] | components["schemas"]["Main_Checkpoint_SD1_Config"] | components["schemas"]["Main_Checkpoint_SD2_Config"] | components["schemas"]["Main_Checkpoint_SDXL_Config"] | components["schemas"]["Main_Checkpoint_SDXLRefiner_Config"] | components["schemas"]["Main_Checkpoint_Flux2_Config"] | components["schemas"]["Main_Checkpoint_FLUX_Config"] | components["schemas"]["Main_Checkpoint_QwenImage_Config"] | components["schemas"]["Main_Checkpoint_ZImage_Config"] | components["schemas"]["Main_Checkpoint_Anima_Config"] | components["schemas"]["Main_BnBNF4_FLUX_Config"] | components["schemas"]["Main_GGUF_Flux2_Config"] | components["schemas"]["Main_GGUF_FLUX_Config"] | components["schemas"]["Main_GGUF_QwenImage_Config"] | components["schemas"]["Main_GGUF_ZImage_Config"] | components["schemas"]["VAE_Checkpoint_SD1_Config"] | components["schemas"]["VAE_Checkpoint_SD2_Config"] | components["schemas"]["VAE_Checkpoint_SDXL_Config"] | components["schemas"]["VAE_Checkpoint_FLUX_Config"] | components["schemas"]["VAE_Checkpoint_Flux2_Config"] | components["schemas"]["VAE_Checkpoint_QwenImage_Config"] | components["schemas"]["VAE_Checkpoint_Anima_Config"] | components["schemas"]["VAE_Diffusers_SD1_Config"] | components["schemas"]["VAE_Diffusers_SDXL_Config"] | components["schemas"]["VAE_Diffusers_Flux2_Config"] | components["schemas"]["ControlNet_Checkpoint_SD1_Config"] | components["schemas"]["ControlNet_Checkpoint_SD2_Config"] | 
components["schemas"]["ControlNet_Checkpoint_SDXL_Config"] | components["schemas"]["ControlNet_Checkpoint_FLUX_Config"] | components["schemas"]["ControlNet_Checkpoint_ZImage_Config"] | components["schemas"]["ControlNet_Diffusers_SD1_Config"] | components["schemas"]["ControlNet_Diffusers_SD2_Config"] | components["schemas"]["ControlNet_Diffusers_SDXL_Config"] | components["schemas"]["ControlNet_Diffusers_FLUX_Config"] | components["schemas"]["LoRA_LyCORIS_SD1_Config"] | components["schemas"]["LoRA_LyCORIS_SD2_Config"] | components["schemas"]["LoRA_LyCORIS_SDXL_Config"] | components["schemas"]["LoRA_LyCORIS_Flux2_Config"] | components["schemas"]["LoRA_LyCORIS_FLUX_Config"] | components["schemas"]["LoRA_LyCORIS_ZImage_Config"] | components["schemas"]["LoRA_LyCORIS_QwenImage_Config"] | components["schemas"]["LoRA_LyCORIS_Anima_Config"] | components["schemas"]["LoRA_OMI_SDXL_Config"] | components["schemas"]["LoRA_OMI_FLUX_Config"] | components["schemas"]["LoRA_Diffusers_SD1_Config"] | components["schemas"]["LoRA_Diffusers_SD2_Config"] | components["schemas"]["LoRA_Diffusers_SDXL_Config"] | components["schemas"]["LoRA_Diffusers_Flux2_Config"] | components["schemas"]["LoRA_Diffusers_FLUX_Config"] | components["schemas"]["LoRA_Diffusers_ZImage_Config"] | components["schemas"]["ControlLoRA_LyCORIS_FLUX_Config"] | components["schemas"]["T5Encoder_T5Encoder_Config"] | components["schemas"]["T5Encoder_BnBLLMint8_Config"] | components["schemas"]["Qwen3Encoder_Qwen3Encoder_Config"] | components["schemas"]["Qwen3Encoder_Checkpoint_Config"] | components["schemas"]["Qwen3Encoder_GGUF_Config"] | components["schemas"]["QwenVLEncoder_Diffusers_Config"] | components["schemas"]["QwenVLEncoder_Checkpoint_Config"] | components["schemas"]["TI_File_SD1_Config"] | components["schemas"]["TI_File_SD2_Config"] | components["schemas"]["TI_File_SDXL_Config"] | components["schemas"]["TI_Folder_SD1_Config"] | components["schemas"]["TI_Folder_SD2_Config"] | 
components["schemas"]["TI_Folder_SDXL_Config"] | components["schemas"]["IPAdapter_InvokeAI_SD1_Config"] | components["schemas"]["IPAdapter_InvokeAI_SD2_Config"] | components["schemas"]["IPAdapter_InvokeAI_SDXL_Config"] | components["schemas"]["IPAdapter_Checkpoint_SD1_Config"] | components["schemas"]["IPAdapter_Checkpoint_SD2_Config"] | components["schemas"]["IPAdapter_Checkpoint_SDXL_Config"] | components["schemas"]["IPAdapter_Checkpoint_FLUX_Config"] | components["schemas"]["T2IAdapter_Diffusers_SD1_Config"] | components["schemas"]["T2IAdapter_Diffusers_SDXL_Config"] | components["schemas"]["Spandrel_Checkpoint_Config"] | components["schemas"]["CLIPEmbed_Diffusers_G_Config"] | components["schemas"]["CLIPEmbed_Diffusers_L_Config"] | components["schemas"]["CLIPVision_Diffusers_Config"] | components["schemas"]["SigLIP_Diffusers_Config"] | components["schemas"]["FLUXRedux_Checkpoint_Config"] | components["schemas"]["LlavaOnevision_Diffusers_Config"] | components["schemas"]["TextLLM_Diffusers_Config"] | components["schemas"]["ExternalApiModelConfig"] | components["schemas"]["Unknown_Config"])[];
+ models: (components["schemas"]["Main_Diffusers_SD1_Config"] | components["schemas"]["Main_Diffusers_SD2_Config"] | components["schemas"]["Main_Diffusers_SDXL_Config"] | components["schemas"]["Main_Diffusers_SDXLRefiner_Config"] | components["schemas"]["Main_Diffusers_SD3_Config"] | components["schemas"]["Main_Diffusers_FLUX_Config"] | components["schemas"]["Main_Diffusers_Flux2_Config"] | components["schemas"]["Main_Diffusers_CogView4_Config"] | components["schemas"]["Main_Diffusers_QwenImage_Config"] | components["schemas"]["Main_Diffusers_ZImage_Config"] | components["schemas"]["Main_Checkpoint_SD1_Config"] | components["schemas"]["Main_Checkpoint_SD2_Config"] | components["schemas"]["Main_Checkpoint_SDXL_Config"] | components["schemas"]["Main_Checkpoint_SDXLRefiner_Config"] | components["schemas"]["Main_Checkpoint_Flux2_Config"] | components["schemas"]["Main_Checkpoint_FLUX_Config"] | components["schemas"]["Main_Checkpoint_QwenImage_Config"] | components["schemas"]["Main_Checkpoint_ZImage_Config"] | components["schemas"]["Main_Checkpoint_Anima_Config"] | components["schemas"]["Main_BnBNF4_FLUX_Config"] | components["schemas"]["Main_GGUF_Flux2_Config"] | components["schemas"]["Main_GGUF_FLUX_Config"] | components["schemas"]["Main_GGUF_QwenImage_Config"] | components["schemas"]["Main_GGUF_ZImage_Config"] | components["schemas"]["VAE_Checkpoint_SD1_Config"] | components["schemas"]["VAE_Checkpoint_SD2_Config"] | components["schemas"]["VAE_Checkpoint_SDXL_Config"] | components["schemas"]["VAE_Checkpoint_FLUX_Config"] | components["schemas"]["VAE_Checkpoint_Flux2_Config"] | components["schemas"]["VAE_Checkpoint_QwenImage_Config"] | components["schemas"]["VAE_Checkpoint_Anima_Config"] | components["schemas"]["VAE_Diffusers_SD1_Config"] | components["schemas"]["VAE_Diffusers_SDXL_Config"] | components["schemas"]["VAE_Diffusers_Flux2_Config"] | components["schemas"]["PiDDecoder_Checkpoint_FLUX_Config"] | components["schemas"]["PiDDecoder_Checkpoint_Flux2_Config"] | 
components["schemas"]["PiDDecoder_Checkpoint_SD3_Config"] | components["schemas"]["PiDDecoder_Checkpoint_SDXL_Config"] | components["schemas"]["PiDDecoder_Checkpoint_QwenImage_Config"] | components["schemas"]["ControlNet_Checkpoint_SD1_Config"] | components["schemas"]["ControlNet_Checkpoint_SD2_Config"] | components["schemas"]["ControlNet_Checkpoint_SDXL_Config"] | components["schemas"]["ControlNet_Checkpoint_FLUX_Config"] | components["schemas"]["ControlNet_Checkpoint_ZImage_Config"] | components["schemas"]["ControlNet_Diffusers_SD1_Config"] | components["schemas"]["ControlNet_Diffusers_SD2_Config"] | components["schemas"]["ControlNet_Diffusers_SDXL_Config"] | components["schemas"]["ControlNet_Diffusers_FLUX_Config"] | components["schemas"]["LoRA_LyCORIS_SD1_Config"] | components["schemas"]["LoRA_LyCORIS_SD2_Config"] | components["schemas"]["LoRA_LyCORIS_SDXL_Config"] | components["schemas"]["LoRA_LyCORIS_Flux2_Config"] | components["schemas"]["LoRA_LyCORIS_FLUX_Config"] | components["schemas"]["LoRA_LyCORIS_ZImage_Config"] | components["schemas"]["LoRA_LyCORIS_QwenImage_Config"] | components["schemas"]["LoRA_LyCORIS_Anima_Config"] | components["schemas"]["LoRA_OMI_SDXL_Config"] | components["schemas"]["LoRA_OMI_FLUX_Config"] | components["schemas"]["LoRA_Diffusers_SD1_Config"] | components["schemas"]["LoRA_Diffusers_SD2_Config"] | components["schemas"]["LoRA_Diffusers_SDXL_Config"] | components["schemas"]["LoRA_Diffusers_Flux2_Config"] | components["schemas"]["LoRA_Diffusers_FLUX_Config"] | components["schemas"]["LoRA_Diffusers_ZImage_Config"] | components["schemas"]["ControlLoRA_LyCORIS_FLUX_Config"] | components["schemas"]["T5Encoder_T5Encoder_Config"] | components["schemas"]["T5Encoder_BnBLLMint8_Config"] | components["schemas"]["Qwen3Encoder_Qwen3Encoder_Config"] | components["schemas"]["Qwen3Encoder_Checkpoint_Config"] | components["schemas"]["Qwen3Encoder_GGUF_Config"] | components["schemas"]["Gemma2Encoder_Gemma2Encoder_Config"] | 
components["schemas"]["QwenVLEncoder_Diffusers_Config"] | components["schemas"]["QwenVLEncoder_Checkpoint_Config"] | components["schemas"]["TI_File_SD1_Config"] | components["schemas"]["TI_File_SD2_Config"] | components["schemas"]["TI_File_SDXL_Config"] | components["schemas"]["TI_Folder_SD1_Config"] | components["schemas"]["TI_Folder_SD2_Config"] | components["schemas"]["TI_Folder_SDXL_Config"] | components["schemas"]["IPAdapter_InvokeAI_SD1_Config"] | components["schemas"]["IPAdapter_InvokeAI_SD2_Config"] | components["schemas"]["IPAdapter_InvokeAI_SDXL_Config"] | components["schemas"]["IPAdapter_Checkpoint_SD1_Config"] | components["schemas"]["IPAdapter_Checkpoint_SD2_Config"] | components["schemas"]["IPAdapter_Checkpoint_SDXL_Config"] | components["schemas"]["IPAdapter_Checkpoint_FLUX_Config"] | components["schemas"]["T2IAdapter_Diffusers_SD1_Config"] | components["schemas"]["T2IAdapter_Diffusers_SDXL_Config"] | components["schemas"]["Spandrel_Checkpoint_Config"] | components["schemas"]["CLIPEmbed_Diffusers_G_Config"] | components["schemas"]["CLIPEmbed_Diffusers_L_Config"] | components["schemas"]["CLIPVision_Diffusers_Config"] | components["schemas"]["SigLIP_Diffusers_Config"] | components["schemas"]["FLUXRedux_Checkpoint_Config"] | components["schemas"]["LlavaOnevision_Diffusers_Config"] | components["schemas"]["TextLLM_Diffusers_Config"] | components["schemas"]["ExternalApiModelConfig"] | components["schemas"]["Unknown_Config"])[];
};
/**
* Multiply Integers
@@ -24864,19 +25185,593 @@ export type components = {
type: "paste_image_into_bounding_box";
};
/**
- * PiDiNet Edge Detection
- * @description Generates an edge map using PiDiNet.
+ * PiDDecoderField
+ * @description Field for a PiD (Pixel Diffusion Decoder) checkpoint.
*/
- PiDiNetEdgeDetectionInvocation: {
+ PiDDecoderField: {
+ /** @description Info to load PiD decoder checkpoint */
+ decoder: components["schemas"]["ModelIdentifierField"];
+ };
+ /**
+ * PiD Decoder - FLUX / FLUX.2 / SD3
+ * @description Loads a PiD decoder checkpoint, outputting a PiDDecoderField for use
+ * by the per-backbone PiD decode nodes.
+ */
+ PiDDecoderLoaderInvocation: {
/**
- * @description The board to save the image to
- * @default null
+ * Id
+ * @description The id of this instance of an invocation. Must be unique among all instances of invocations.
*/
- board?: components["schemas"]["BoardField"] | null;
+ id: string;
/**
- * @description Optional metadata to be saved with the image
- * @default null
- */
+ * Is Intermediate
+ * @description Whether or not this is an intermediate invocation.
+ * @default false
+ */
+ is_intermediate?: boolean;
+ /**
+ * Use Cache
+ * @description Whether or not to use the cache
+ * @default true
+ */
+ use_cache?: boolean;
+ /**
+ * PiD Decoder
+ * @description PiD decoder checkpoint matching the upstream backbone.
+ * @default null
+ */
+ pid_decoder_model?: components["schemas"]["ModelIdentifierField"] | null;
+ /**
+ * type
+ * @default pid_decoder_loader
+ * @constant
+ */
+ type: "pid_decoder_loader";
+ };
+ /** PiDDecoderOutput */
+ PiDDecoderOutput: {
+ /**
+ * PiD Decoder
+ * @description PiD (Pixel Diffusion Decoder) checkpoint
+ */
+ pid_decoder: components["schemas"]["PiDDecoderField"];
+ /**
+ * type
+ * @default pid_decoder_output
+ * @constant
+ */
+ type: "pid_decoder_output";
+ };
+ /**
+ * PiDDecoderVariantType
+ * @description PiD (Pixel Diffusion Decoder) variants distributed by NVIDIA.
+ *
+ * Each backbone (FLUX.1, FLUX.2, SD3) ships in two resolution presets that
+ * differ only in target output resolution; the underlying network is the
+ * same. NVIDIA's checkpoint filenames encode this as e.g.
+ * `PiD_res2k_sr4x_official_flux_distill_4step` vs
+ * `PiD_res2kto4k_sr4x_official_flux_distill_4step`.
+ * @enum {string}
+ */
+ PiDDecoderVariantType: "res2k_sr4x" | "res2kto4k_sr4x";
+ /**
+ * PiDDecoder_Checkpoint_FLUX_Config
+ * @description PiD decoder for the FLUX.1 backbone (16-channel latent).
+ */
+ PiDDecoder_Checkpoint_FLUX_Config: {
+ /**
+ * Key
+ * @description A unique key for this model.
+ */
+ key: string;
+ /**
+ * Hash
+ * @description The hash of the model file(s).
+ */
+ hash: string;
+ /**
+ * Path
+ * @description Path to the model on the filesystem. Relative paths are relative to the Invoke root directory.
+ */
+ path: string;
+ /**
+ * File Size
+ * @description The size of the model in bytes.
+ */
+ file_size: number;
+ /**
+ * Name
+ * @description Name of the model.
+ */
+ name: string;
+ /**
+ * Description
+ * @description Model description
+ */
+ description: string | null;
+ /**
+ * Source
+ * @description The original source of the model (path, URL or repo_id).
+ */
+ source: string;
+ /** @description The type of source */
+ source_type: components["schemas"]["ModelSourceType"];
+ /**
+ * Source Api Response
+ * @description The original API response from the source, as stringified JSON.
+ */
+ source_api_response: string | null;
+ /**
+ * Source Url
+ * @description Optional URL for the model (e.g. download page or model page).
+ */
+ source_url: string | null;
+ /**
+ * Cover Image
+ * @description Url for image to preview model
+ */
+ cover_image: string | null;
+ /**
+ * Config Path
+ * @description Path to the config for this model, if any.
+ */
+ config_path: string | null;
+ /**
+ * Type
+ * @default pid_decoder
+ * @constant
+ */
+ type: "pid_decoder";
+ /**
+ * Format
+ * @default checkpoint
+ * @constant
+ */
+ format: "checkpoint";
+ /**
+ * Base
+ * @default flux
+ * @constant
+ */
+ base: "flux";
+ /** @description Resolution preset of the PiD decoder checkpoint. */
+ variant: components["schemas"]["PiDDecoderVariantType"];
+ };
+ /**
+ * PiDDecoder_Checkpoint_Flux2_Config
+ * @description PiD decoder for the FLUX.2 backbone (128-channel latent).
+ */
+ PiDDecoder_Checkpoint_Flux2_Config: {
+ /**
+ * Key
+ * @description A unique key for this model.
+ */
+ key: string;
+ /**
+ * Hash
+ * @description The hash of the model file(s).
+ */
+ hash: string;
+ /**
+ * Path
+ * @description Path to the model on the filesystem. Relative paths are relative to the Invoke root directory.
+ */
+ path: string;
+ /**
+ * File Size
+ * @description The size of the model in bytes.
+ */
+ file_size: number;
+ /**
+ * Name
+ * @description Name of the model.
+ */
+ name: string;
+ /**
+ * Description
+ * @description Model description
+ */
+ description: string | null;
+ /**
+ * Source
+ * @description The original source of the model (path, URL or repo_id).
+ */
+ source: string;
+ /** @description The type of source */
+ source_type: components["schemas"]["ModelSourceType"];
+ /**
+ * Source Api Response
+ * @description The original API response from the source, as stringified JSON.
+ */
+ source_api_response: string | null;
+ /**
+ * Source Url
+ * @description Optional URL for the model (e.g. download page or model page).
+ */
+ source_url: string | null;
+ /**
+ * Cover Image
+ * @description Url for image to preview model
+ */
+ cover_image: string | null;
+ /**
+ * Config Path
+ * @description Path to the config for this model, if any.
+ */
+ config_path: string | null;
+ /**
+ * Type
+ * @default pid_decoder
+ * @constant
+ */
+ type: "pid_decoder";
+ /**
+ * Format
+ * @default checkpoint
+ * @constant
+ */
+ format: "checkpoint";
+ /**
+ * Base
+ * @default flux2
+ * @constant
+ */
+ base: "flux2";
+ /** @description Resolution preset of the PiD decoder checkpoint. */
+ variant: components["schemas"]["PiDDecoderVariantType"];
+ };
+ /**
+ * PiDDecoder_Checkpoint_QwenImage_Config
+ * @description PiD decoder for the Qwen-Image backbone (16-channel latent).
+ *
+ * Shares the 16-channel latent shape with FLUX.1 and SD3, so it relies on the same
+ * filename / directory-name disambiguation (or a trusted explicit ``base`` override)
+ * as SD3 - see ``_validate_base``.
+ */
+ PiDDecoder_Checkpoint_QwenImage_Config: {
+ /**
+ * Key
+ * @description A unique key for this model.
+ */
+ key: string;
+ /**
+ * Hash
+ * @description The hash of the model file(s).
+ */
+ hash: string;
+ /**
+ * Path
+ * @description Path to the model on the filesystem. Relative paths are relative to the Invoke root directory.
+ */
+ path: string;
+ /**
+ * File Size
+ * @description The size of the model in bytes.
+ */
+ file_size: number;
+ /**
+ * Name
+ * @description Name of the model.
+ */
+ name: string;
+ /**
+ * Description
+ * @description Model description
+ */
+ description: string | null;
+ /**
+ * Source
+ * @description The original source of the model (path, URL or repo_id).
+ */
+ source: string;
+ /** @description The type of source */
+ source_type: components["schemas"]["ModelSourceType"];
+ /**
+ * Source Api Response
+ * @description The original API response from the source, as stringified JSON.
+ */
+ source_api_response: string | null;
+ /**
+ * Source Url
+ * @description Optional URL for the model (e.g. download page or model page).
+ */
+ source_url: string | null;
+ /**
+ * Cover Image
+ * @description Url for image to preview model
+ */
+ cover_image: string | null;
+ /**
+ * Config Path
+ * @description Path to the config for this model, if any.
+ */
+ config_path: string | null;
+ /**
+ * Type
+ * @default pid_decoder
+ * @constant
+ */
+ type: "pid_decoder";
+ /**
+ * Format
+ * @default checkpoint
+ * @constant
+ */
+ format: "checkpoint";
+ /**
+ * Base
+ * @default qwen-image
+ * @constant
+ */
+ base: "qwen-image";
+ /** @description Resolution preset of the PiD decoder checkpoint. */
+ variant: components["schemas"]["PiDDecoderVariantType"];
+ };
+ /**
+ * PiDDecoder_Checkpoint_SD3_Config
+ * @description PiD decoder config for the Stable Diffusion 3 backbone (16-channel latent). Carries the literal tags type "pid_decoder", format "checkpoint" and base "sd-3".
+ */
+ PiDDecoder_Checkpoint_SD3_Config: {
+ /**
+ * Key
+ * @description A unique key for this model.
+ */
+ key: string;
+ /**
+ * Hash
+ * @description The hash of the model file(s).
+ */
+ hash: string;
+ /**
+ * Path
+ * @description Path to the model on the filesystem. Relative paths are relative to the Invoke root directory.
+ */
+ path: string;
+ /**
+ * File Size
+ * @description The size of the model in bytes.
+ */
+ file_size: number;
+ /**
+ * Name
+ * @description Name of the model.
+ */
+ name: string;
+ /**
+ * Description
+ * @description Description of the model; may be null.
+ */
+ description: string | null;
+ /**
+ * Source
+ * @description The original source of the model (path, URL or repo_id).
+ */
+ source: string;
+ /** @description The type of source (a ModelSourceType value). */
+ source_type: components["schemas"]["ModelSourceType"];
+ /**
+ * Source Api Response
+ * @description The original API response from the source, as stringified JSON; may be null.
+ */
+ source_api_response: string | null;
+ /**
+ * Source Url
+ * @description Optional URL for the model (e.g. download page or model page); may be null.
+ */
+ source_url: string | null;
+ /**
+ * Cover Image
+ * @description URL of an image used to preview the model; may be null.
+ */
+ cover_image: string | null;
+ /**
+ * Config Path
+ * @description Path to the config for this model, if any; null otherwise.
+ */
+ config_path: string | null;
+ /**
+ * Type
+ * @default pid_decoder
+ * @constant
+ */
+ type: "pid_decoder";
+ /**
+ * Format
+ * @default checkpoint
+ * @constant
+ */
+ format: "checkpoint";
+ /**
+ * Base
+ * @default sd-3
+ * @constant
+ */
+ base: "sd-3";
+ /** @description Resolution preset of the PiD decoder checkpoint (a PiDDecoderVariantType value). */
+ variant: components["schemas"]["PiDDecoderVariantType"];
+ };
+ /**
+ * PiDDecoder_Checkpoint_SDXL_Config
+ * @description PiD decoder config for the SDXL backbone (4-channel latent). Carries the literal tags type "pid_decoder", format "checkpoint" and base "sdxl".
+ */
+ PiDDecoder_Checkpoint_SDXL_Config: {
+ /**
+ * Key
+ * @description A unique key for this model.
+ */
+ key: string;
+ /**
+ * Hash
+ * @description The hash of the model file(s).
+ */
+ hash: string;
+ /**
+ * Path
+ * @description Path to the model on the filesystem. Relative paths are relative to the Invoke root directory.
+ */
+ path: string;
+ /**
+ * File Size
+ * @description The size of the model in bytes.
+ */
+ file_size: number;
+ /**
+ * Name
+ * @description Name of the model.
+ */
+ name: string;
+ /**
+ * Description
+ * @description Description of the model; may be null.
+ */
+ description: string | null;
+ /**
+ * Source
+ * @description The original source of the model (path, URL or repo_id).
+ */
+ source: string;
+ /** @description The type of source (a ModelSourceType value). */
+ source_type: components["schemas"]["ModelSourceType"];
+ /**
+ * Source Api Response
+ * @description The original API response from the source, as stringified JSON; may be null.
+ */
+ source_api_response: string | null;
+ /**
+ * Source Url
+ * @description Optional URL for the model (e.g. download page or model page); may be null.
+ */
+ source_url: string | null;
+ /**
+ * Cover Image
+ * @description URL of an image used to preview the model; may be null.
+ */
+ cover_image: string | null;
+ /**
+ * Config Path
+ * @description Path to the config for this model, if any; null otherwise.
+ */
+ config_path: string | null;
+ /**
+ * Type
+ * @default pid_decoder
+ * @constant
+ */
+ type: "pid_decoder";
+ /**
+ * Format
+ * @default checkpoint
+ * @constant
+ */
+ format: "checkpoint";
+ /**
+ * Base
+ * @default sdxl
+ * @constant
+ */
+ base: "sdxl";
+ /** @description Resolution preset of the PiD decoder checkpoint (a PiDDecoderVariantType value). */
+ variant: components["schemas"]["PiDDecoderVariantType"];
+ };
+ /**
+ * PiD Upscale (4x) - FLUX VAE
+ * @description Upscale any image 4x via FLUX VAE encode + PiD pixel-diffusion decode.
+ *
+ * Works for source images that the FLUX VAE can encode (i.e. natural
+ * photos / generated images at any size that lands on the VAE's 8-pixel
+ * grid). The caption is used to condition the PiD decoder; leaving it
+ * empty produces an unconditional decode and is the cheapest option, but
+ * the model was distilled with rich captions and benefits from one.
+ */
+ PiDUpscaleInvocation: {
+ /**
+ * @description The board to save the image to.
+ * @default null
+ */
+ board?: components["schemas"]["BoardField"] | null;
+ /**
+ * @description Optional metadata to be saved with the image.
+ * @default null
+ */
+ metadata?: components["schemas"]["MetadataField"] | null;
+ /**
+ * Id
+ * @description The id of this instance of an invocation. Must be unique among all instances of invocations.
+ */
+ id: string;
+ /**
+ * Is Intermediate
+ * @description Whether or not this is an intermediate invocation.
+ * @default false
+ */
+ is_intermediate?: boolean;
+ /**
+ * Use Cache
+ * @description Whether or not to use the cache.
+ * @default true
+ */
+ use_cache?: boolean;
+ /**
+ * @description Image to upscale. Typed optional and nullable here (default null).
+ * @default null
+ */
+ image?: components["schemas"]["ImageField"] | null;
+ /**
+ * @description FLUX-compatible VAE (FLUX.1, Z-Image, anything sharing the 16-channel encoder).
+ * @default null
+ */
+ vae?: components["schemas"]["VAEField"] | null;
+ /**
+ * Gemma-2 Encoder
+ * @description Gemma-2 caption encoder. Required by PiD, although this generated type marks the field optional and nullable (default null).
+ * @default null
+ */
+ gemma2_encoder?: components["schemas"]["Gemma2EncoderField"] | null;
+ /**
+ * PiD Decoder
+ * @description PiD FLUX decoder checkpoint.
+ * @default null
+ */
+ pid_decoder?: components["schemas"]["PiDDecoderField"] | null;
+ /**
+ * Prompt
+ * @description Optional caption describing the image. An empty string yields an empty-caption decode.
+ * @default "" (presumably the empty string, which the generator renders blank; TODO confirm)
+ */
+ prompt?: string;
+ /**
+ * Num Inference Steps
+ * @description Number of PiD distill steps. The released checkpoints are trained for 4.
+ * @default 4
+ */
+ num_inference_steps?: number;
+ /**
+ * Seed
+ * @description Seed for the PiD decoder's noise.
+ * @default 0
+ */
+ seed?: number;
+ /**
+ * type
+ * @default pid_upscale
+ * @constant
+ */
+ type: "pid_upscale";
+ };
+ /**
+ * PiDiNet Edge Detection
+ * @description Generates an edge map using PiDiNet.
+ */
+ PiDiNetEdgeDetectionInvocation: {
+ /**
+ * @description The board to save the image to
+ * @default null
+ */
+ board?: components["schemas"]["BoardField"] | null;
+ /**
+ * @description Optional metadata to be saved with the image
+ * @default null
+ */
metadata?: components["schemas"]["MetadataField"] | null;
/**
* Id
@@ -25973,6 +26868,89 @@ export type components = {
*/
type: "qwen_image_model_loader_output";
};
+ /**
+ * Latents to Image - Qwen-Image + PiD (4x SR)
+ * @description Decode a Qwen-Image latent with the PiD pixel-diffusion decoder.
+ *
+ * Produces a 4x super-resolved image in a single pass. The 5D Qwen latent is
+ * reduced to 2D and per-channel denormalized (``z * std + mean``) before PiD.
+ */
+ QwenImagePiDDecodeInvocation: {
+ /**
+ * @description The board to save the image to.
+ * @default null
+ */
+ board?: components["schemas"]["BoardField"] | null;
+ /**
+ * @description Optional metadata to be saved with the image.
+ * @default null
+ */
+ metadata?: components["schemas"]["MetadataField"] | null;
+ /**
+ * Id
+ * @description The id of this instance of an invocation. Must be unique among all instances of invocations.
+ */
+ id: string;
+ /**
+ * Is Intermediate
+ * @description Whether or not this is an intermediate invocation.
+ * @default false
+ */
+ is_intermediate?: boolean;
+ /**
+ * Use Cache
+ * @description Whether or not to use the cache.
+ * @default true
+ */
+ use_cache?: boolean;
+ /**
+ * @description Latents tensor to decode. Typed optional and nullable here (default null).
+ * @default null
+ */
+ latents?: components["schemas"]["LatentsField"] | null;
+ /**
+ * Prompt
+ * @description Text prompt the latent was generated from. PiD conditions on it. May be null (the default).
+ * @default null
+ */
+ prompt?: string | null;
+ /**
+ * Gemma-2 Encoder
+ * @description Gemma-2 caption encoder. Required by PiD, although this generated type marks the field optional and nullable (default null).
+ * @default null
+ */
+ gemma2_encoder?: components["schemas"]["Gemma2EncoderField"] | null;
+ /**
+ * PiD Decoder
+ * @description PiD Qwen-Image decoder checkpoint.
+ * @default null
+ */
+ pid_decoder?: components["schemas"]["PiDDecoderField"] | null;
+ /**
+ * VAE
+ * @description Qwen-Image VAE, used to read the per-channel latents_mean / latents_std. If omitted, the diffusers default Qwen-Image constants are used.
+ * @default null
+ */
+ vae?: components["schemas"]["VAEField"] | null;
+ /**
+ * Num Inference Steps
+ * @description Number of PiD distill steps. The released checkpoints are trained for 4.
+ * @default 4
+ */
+ num_inference_steps?: number;
+ /**
+ * Seed
+ * @description Seed for the PiD decoder's noise.
+ * @default 0
+ */
+ seed?: number;
+ /**
+ * type
+ * @default qwen_image_pid_decode
+ * @constant
+ */
+ type: "qwen_image_pid_decode";
+ };
/**
* Prompt - Qwen Image
* @description Encodes text and reference images for Qwen Image using Qwen2.5-VL.
@@ -27138,6 +28116,80 @@ export type components = {
*/
type: "sd3_l2i";
};
+ /**
+ * Latents to Image - SD3 + PiD (4x SR)
+ * @description Decode an SD3 latent with the PiD pixel-diffusion decoder.
+ */
+ SD3PiDDecodeInvocation: {
+ /**
+ * @description The board to save the image to.
+ * @default null
+ */
+ board?: components["schemas"]["BoardField"] | null;
+ /**
+ * @description Optional metadata to be saved with the image.
+ * @default null
+ */
+ metadata?: components["schemas"]["MetadataField"] | null;
+ /**
+ * Id
+ * @description The id of this instance of an invocation. Must be unique among all instances of invocations.
+ */
+ id: string;
+ /**
+ * Is Intermediate
+ * @description Whether or not this is an intermediate invocation.
+ * @default false
+ */
+ is_intermediate?: boolean;
+ /**
+ * Use Cache
+ * @description Whether or not to use the cache.
+ * @default true
+ */
+ use_cache?: boolean;
+ /**
+ * @description Latents tensor to decode. Typed optional and nullable here (default null).
+ * @default null
+ */
+ latents?: components["schemas"]["LatentsField"] | null;
+ /**
+ * Prompt
+ * @description Text prompt the latent was generated from. PiD conditions on it. May be null (the default).
+ * @default null
+ */
+ prompt?: string | null;
+ /**
+ * Gemma-2 Encoder
+ * @description Gemma-2 caption encoder. Required by PiD, although this generated type marks the field optional and nullable (default null).
+ * @default null
+ */
+ gemma2_encoder?: components["schemas"]["Gemma2EncoderField"] | null;
+ /**
+ * PiD Decoder
+ * @description PiD SD3 decoder checkpoint.
+ * @default null
+ */
+ pid_decoder?: components["schemas"]["PiDDecoderField"] | null;
+ /**
+ * Num Inference Steps
+ * @description Number of PiD distill steps. The released checkpoints are trained for 4.
+ * @default 4
+ */
+ num_inference_steps?: number;
+ /**
+ * Seed
+ * @description Seed for the PiD decoder's noise.
+ * @default 0
+ */
+ seed?: number;
+ /**
+ * type
+ * @default sd3_pid_decode
+ * @constant
+ */
+ type: "sd3_pid_decode";
+ };
/**
* Prompt - SDXL
* @description Parse prompt using compel package to conditioning.
@@ -27434,6 +28486,90 @@ export type components = {
*/
type: "sdxl_model_loader_output";
};
+ /**
+ * Latents to Image - SDXL + PiD (4x SR)
+ * @description Decode an SDXL latent with the PiD pixel-diffusion decoder.
+ *
+ * Produces a 4x super-resolved image in a single pass. The SDXL latent is
+ * 4-channel at an 8x down-factor, so it is denormalized (``z / scaling_factor``)
+ * and handed straight to PiD - no packing needed.
+ */
+ SDXLPiDDecodeInvocation: {
+ /**
+ * @description The board to save the image to.
+ * @default null
+ */
+ board?: components["schemas"]["BoardField"] | null;
+ /**
+ * @description Optional metadata to be saved with the image.
+ * @default null
+ */
+ metadata?: components["schemas"]["MetadataField"] | null;
+ /**
+ * Id
+ * @description The id of this instance of an invocation. Must be unique among all instances of invocations.
+ */
+ id: string;
+ /**
+ * Is Intermediate
+ * @description Whether or not this is an intermediate invocation.
+ * @default false
+ */
+ is_intermediate?: boolean;
+ /**
+ * Use Cache
+ * @description Whether or not to use the cache.
+ * @default true
+ */
+ use_cache?: boolean;
+ /**
+ * @description Latents tensor to decode. Typed optional and nullable here (default null).
+ * @default null
+ */
+ latents?: components["schemas"]["LatentsField"] | null;
+ /**
+ * Prompt
+ * @description Text prompt the latent was generated from. PiD conditions on it. May be null (the default).
+ * @default null
+ */
+ prompt?: string | null;
+ /**
+ * Gemma-2 Encoder
+ * @description Gemma-2 caption encoder. Required by PiD, although this generated type marks the field optional and nullable (default null).
+ * @default null
+ */
+ gemma2_encoder?: components["schemas"]["Gemma2EncoderField"] | null;
+ /**
+ * PiD Decoder
+ * @description PiD SDXL decoder checkpoint.
+ * @default null
+ */
+ pid_decoder?: components["schemas"]["PiDDecoderField"] | null;
+ /**
+ * VAE
+ * @description SDXL VAE, used to read scaling_factor / shift_factor. If omitted, the SDXL fallback constants (0.13025 / 0.0) are used.
+ * @default null
+ */
+ vae?: components["schemas"]["VAEField"] | null;
+ /**
+ * Num Inference Steps
+ * @description Number of PiD distill steps. The released checkpoints are trained for 4.
+ * @default 4
+ */
+ num_inference_steps?: number;
+ /**
+ * Seed
+ * @description Seed for the PiD decoder's noise.
+ * @default 0
+ */
+ seed?: number;
+ /**
+ * type
+ * @default sdxl_pid_decode
+ * @constant
+ */
+ type: "sdxl_pid_decode";
+ };
/**
* Prompt - SDXL Refiner
* @description Parse prompt using compel package to conditioning.
@@ -28854,7 +29990,7 @@ export type components = {
type: components["schemas"]["ModelType"];
format?: components["schemas"]["ModelFormat"] | null;
/** Variant */
- variant?: components["schemas"]["ModelVariantType"] | components["schemas"]["ClipVariantType"] | components["schemas"]["FluxVariantType"] | components["schemas"]["Flux2VariantType"] | components["schemas"]["ZImageVariantType"] | components["schemas"]["QwenImageVariantType"] | components["schemas"]["Qwen3VariantType"] | null;
+ variant?: components["schemas"]["ModelVariantType"] | components["schemas"]["ClipVariantType"] | components["schemas"]["FluxVariantType"] | components["schemas"]["Flux2VariantType"] | components["schemas"]["ZImageVariantType"] | components["schemas"]["QwenImageVariantType"] | components["schemas"]["Qwen3VariantType"] | components["schemas"]["PiDDecoderVariantType"] | null;
/**
* Is Installed
* @default false
@@ -28899,7 +30035,7 @@ export type components = {
type: components["schemas"]["ModelType"];
format?: components["schemas"]["ModelFormat"] | null;
/** Variant */
- variant?: components["schemas"]["ModelVariantType"] | components["schemas"]["ClipVariantType"] | components["schemas"]["FluxVariantType"] | components["schemas"]["Flux2VariantType"] | components["schemas"]["ZImageVariantType"] | components["schemas"]["QwenImageVariantType"] | components["schemas"]["Qwen3VariantType"] | null;
+ variant?: components["schemas"]["ModelVariantType"] | components["schemas"]["ClipVariantType"] | components["schemas"]["FluxVariantType"] | components["schemas"]["Flux2VariantType"] | components["schemas"]["ZImageVariantType"] | components["schemas"]["QwenImageVariantType"] | components["schemas"]["Qwen3VariantType"] | components["schemas"]["PiDDecoderVariantType"] | null;
/**
* Is Installed
* @default false
@@ -29430,7 +30566,7 @@ export type components = {
path_or_prefix: string;
model_type: components["schemas"]["ModelType"];
/** Variant */
- variant?: components["schemas"]["ModelVariantType"] | components["schemas"]["ClipVariantType"] | components["schemas"]["FluxVariantType"] | components["schemas"]["Flux2VariantType"] | components["schemas"]["ZImageVariantType"] | components["schemas"]["QwenImageVariantType"] | components["schemas"]["Qwen3VariantType"] | null;
+ variant?: components["schemas"]["ModelVariantType"] | components["schemas"]["ClipVariantType"] | components["schemas"]["FluxVariantType"] | components["schemas"]["Flux2VariantType"] | components["schemas"]["ZImageVariantType"] | components["schemas"]["QwenImageVariantType"] | components["schemas"]["Qwen3VariantType"] | components["schemas"]["PiDDecoderVariantType"] | null;
};
/**
* Subtract Integers
@@ -33099,6 +34235,90 @@ export type components = {
*/
type: "z_image_model_loader_output";
};
+ /**
+ * Latents to Image - Z-Image + PiD (4x SR)
+ * @description Decode a Z-Image latent with the PiD pixel-diffusion decoder.
+ *
+ * Produces a 4x super-resolved image in a single pass (the Z-Image decoder is
+ * trained on FLUX.1 latents; ``sr_scale=4`` with the FLUX VAE's 8x spatial
+ * down-factor gives a 32x linear scale from latent to pixel).
+ */
+ ZImagePiDDecodeInvocation: {
+ /**
+ * @description The board to save the image to.
+ * @default null
+ */
+ board?: components["schemas"]["BoardField"] | null;
+ /**
+ * @description Optional metadata to be saved with the image.
+ * @default null
+ */
+ metadata?: components["schemas"]["MetadataField"] | null;
+ /**
+ * Id
+ * @description The id of this instance of an invocation. Must be unique among all instances of invocations.
+ */
+ id: string;
+ /**
+ * Is Intermediate
+ * @description Whether or not this is an intermediate invocation.
+ * @default false
+ */
+ is_intermediate?: boolean;
+ /**
+ * Use Cache
+ * @description Whether or not to use the cache.
+ * @default true
+ */
+ use_cache?: boolean;
+ /**
+ * @description Latents tensor to decode. Typed optional and nullable here (default null).
+ * @default null
+ */
+ latents?: components["schemas"]["LatentsField"] | null;
+ /**
+ * Prompt
+ * @description Text prompt the latent was generated from. PiD conditions on it. May be null (the default).
+ * @default null
+ */
+ prompt?: string | null;
+ /**
+ * Gemma-2 Encoder
+ * @description Gemma-2 caption encoder. Required by PiD, although this generated type marks the field optional and nullable (default null).
+ * @default null
+ */
+ gemma2_encoder?: components["schemas"]["Gemma2EncoderField"] | null;
+ /**
+ * PiD Decoder
+ * @description PiD FLUX decoder checkpoint.
+ * @default null
+ */
+ pid_decoder?: components["schemas"]["PiDDecoderField"] | null;
+ /**
+ * VAE
+ * @description Z-Image VAE, used to read scaling_factor / shift_factor. If omitted, the FLUX.1 fallback constants (0.3611 / 0.1159) are used.
+ * @default null
+ */
+ vae?: components["schemas"]["VAEField"] | null;
+ /**
+ * Num Inference Steps
+ * @description Number of PiD distill steps. The released checkpoints are trained for 4.
+ * @default 4
+ */
+ num_inference_steps?: number;
+ /**
+ * Seed
+ * @description Seed for the PiD decoder's noise.
+ * @default 0
+ */
+ seed?: number;
+ /**
+ * type
+ * @default z_image_pid_decode
+ * @constant
+ */
+ type: "z_image_pid_decode";
+ };
/**
* Seed Variance Enhancer - Z-Image
* @description Adds seed-based noise to Z-Image conditioning to increase variance between seeds.
@@ -33736,7 +34956,7 @@ export interface operations {
[name: string]: unknown;
};
content: {
- "application/json": components["schemas"]["Main_Diffusers_SD1_Config"] | components["schemas"]["Main_Diffusers_SD2_Config"] | components["schemas"]["Main_Diffusers_SDXL_Config"] | components["schemas"]["Main_Diffusers_SDXLRefiner_Config"] | components["schemas"]["Main_Diffusers_SD3_Config"] | components["schemas"]["Main_Diffusers_FLUX_Config"] | components["schemas"]["Main_Diffusers_Flux2_Config"] | components["schemas"]["Main_Diffusers_CogView4_Config"] | components["schemas"]["Main_Diffusers_QwenImage_Config"] | components["schemas"]["Main_Diffusers_ZImage_Config"] | components["schemas"]["Main_Checkpoint_SD1_Config"] | components["schemas"]["Main_Checkpoint_SD2_Config"] | components["schemas"]["Main_Checkpoint_SDXL_Config"] | components["schemas"]["Main_Checkpoint_SDXLRefiner_Config"] | components["schemas"]["Main_Checkpoint_Flux2_Config"] | components["schemas"]["Main_Checkpoint_FLUX_Config"] | components["schemas"]["Main_Checkpoint_QwenImage_Config"] | components["schemas"]["Main_Checkpoint_ZImage_Config"] | components["schemas"]["Main_Checkpoint_Anima_Config"] | components["schemas"]["Main_BnBNF4_FLUX_Config"] | components["schemas"]["Main_GGUF_Flux2_Config"] | components["schemas"]["Main_GGUF_FLUX_Config"] | components["schemas"]["Main_GGUF_QwenImage_Config"] | components["schemas"]["Main_GGUF_ZImage_Config"] | components["schemas"]["VAE_Checkpoint_SD1_Config"] | components["schemas"]["VAE_Checkpoint_SD2_Config"] | components["schemas"]["VAE_Checkpoint_SDXL_Config"] | components["schemas"]["VAE_Checkpoint_FLUX_Config"] | components["schemas"]["VAE_Checkpoint_Flux2_Config"] | components["schemas"]["VAE_Checkpoint_QwenImage_Config"] | components["schemas"]["VAE_Checkpoint_Anima_Config"] | components["schemas"]["VAE_Diffusers_SD1_Config"] | components["schemas"]["VAE_Diffusers_SDXL_Config"] | components["schemas"]["VAE_Diffusers_Flux2_Config"] | components["schemas"]["ControlNet_Checkpoint_SD1_Config"] | 
components["schemas"]["ControlNet_Checkpoint_SD2_Config"] | components["schemas"]["ControlNet_Checkpoint_SDXL_Config"] | components["schemas"]["ControlNet_Checkpoint_FLUX_Config"] | components["schemas"]["ControlNet_Checkpoint_ZImage_Config"] | components["schemas"]["ControlNet_Diffusers_SD1_Config"] | components["schemas"]["ControlNet_Diffusers_SD2_Config"] | components["schemas"]["ControlNet_Diffusers_SDXL_Config"] | components["schemas"]["ControlNet_Diffusers_FLUX_Config"] | components["schemas"]["LoRA_LyCORIS_SD1_Config"] | components["schemas"]["LoRA_LyCORIS_SD2_Config"] | components["schemas"]["LoRA_LyCORIS_SDXL_Config"] | components["schemas"]["LoRA_LyCORIS_Flux2_Config"] | components["schemas"]["LoRA_LyCORIS_FLUX_Config"] | components["schemas"]["LoRA_LyCORIS_ZImage_Config"] | components["schemas"]["LoRA_LyCORIS_QwenImage_Config"] | components["schemas"]["LoRA_LyCORIS_Anima_Config"] | components["schemas"]["LoRA_OMI_SDXL_Config"] | components["schemas"]["LoRA_OMI_FLUX_Config"] | components["schemas"]["LoRA_Diffusers_SD1_Config"] | components["schemas"]["LoRA_Diffusers_SD2_Config"] | components["schemas"]["LoRA_Diffusers_SDXL_Config"] | components["schemas"]["LoRA_Diffusers_Flux2_Config"] | components["schemas"]["LoRA_Diffusers_FLUX_Config"] | components["schemas"]["LoRA_Diffusers_ZImage_Config"] | components["schemas"]["ControlLoRA_LyCORIS_FLUX_Config"] | components["schemas"]["T5Encoder_T5Encoder_Config"] | components["schemas"]["T5Encoder_BnBLLMint8_Config"] | components["schemas"]["Qwen3Encoder_Qwen3Encoder_Config"] | components["schemas"]["Qwen3Encoder_Checkpoint_Config"] | components["schemas"]["Qwen3Encoder_GGUF_Config"] | components["schemas"]["QwenVLEncoder_Diffusers_Config"] | components["schemas"]["QwenVLEncoder_Checkpoint_Config"] | components["schemas"]["TI_File_SD1_Config"] | components["schemas"]["TI_File_SD2_Config"] | components["schemas"]["TI_File_SDXL_Config"] | components["schemas"]["TI_Folder_SD1_Config"] | 
components["schemas"]["TI_Folder_SD2_Config"] | components["schemas"]["TI_Folder_SDXL_Config"] | components["schemas"]["IPAdapter_InvokeAI_SD1_Config"] | components["schemas"]["IPAdapter_InvokeAI_SD2_Config"] | components["schemas"]["IPAdapter_InvokeAI_SDXL_Config"] | components["schemas"]["IPAdapter_Checkpoint_SD1_Config"] | components["schemas"]["IPAdapter_Checkpoint_SD2_Config"] | components["schemas"]["IPAdapter_Checkpoint_SDXL_Config"] | components["schemas"]["IPAdapter_Checkpoint_FLUX_Config"] | components["schemas"]["T2IAdapter_Diffusers_SD1_Config"] | components["schemas"]["T2IAdapter_Diffusers_SDXL_Config"] | components["schemas"]["Spandrel_Checkpoint_Config"] | components["schemas"]["CLIPEmbed_Diffusers_G_Config"] | components["schemas"]["CLIPEmbed_Diffusers_L_Config"] | components["schemas"]["CLIPVision_Diffusers_Config"] | components["schemas"]["SigLIP_Diffusers_Config"] | components["schemas"]["FLUXRedux_Checkpoint_Config"] | components["schemas"]["LlavaOnevision_Diffusers_Config"] | components["schemas"]["TextLLM_Diffusers_Config"] | components["schemas"]["ExternalApiModelConfig"] | components["schemas"]["Unknown_Config"];
+ "application/json": components["schemas"]["Main_Diffusers_SD1_Config"] | components["schemas"]["Main_Diffusers_SD2_Config"] | components["schemas"]["Main_Diffusers_SDXL_Config"] | components["schemas"]["Main_Diffusers_SDXLRefiner_Config"] | components["schemas"]["Main_Diffusers_SD3_Config"] | components["schemas"]["Main_Diffusers_FLUX_Config"] | components["schemas"]["Main_Diffusers_Flux2_Config"] | components["schemas"]["Main_Diffusers_CogView4_Config"] | components["schemas"]["Main_Diffusers_QwenImage_Config"] | components["schemas"]["Main_Diffusers_ZImage_Config"] | components["schemas"]["Main_Checkpoint_SD1_Config"] | components["schemas"]["Main_Checkpoint_SD2_Config"] | components["schemas"]["Main_Checkpoint_SDXL_Config"] | components["schemas"]["Main_Checkpoint_SDXLRefiner_Config"] | components["schemas"]["Main_Checkpoint_Flux2_Config"] | components["schemas"]["Main_Checkpoint_FLUX_Config"] | components["schemas"]["Main_Checkpoint_QwenImage_Config"] | components["schemas"]["Main_Checkpoint_ZImage_Config"] | components["schemas"]["Main_Checkpoint_Anima_Config"] | components["schemas"]["Main_BnBNF4_FLUX_Config"] | components["schemas"]["Main_GGUF_Flux2_Config"] | components["schemas"]["Main_GGUF_FLUX_Config"] | components["schemas"]["Main_GGUF_QwenImage_Config"] | components["schemas"]["Main_GGUF_ZImage_Config"] | components["schemas"]["VAE_Checkpoint_SD1_Config"] | components["schemas"]["VAE_Checkpoint_SD2_Config"] | components["schemas"]["VAE_Checkpoint_SDXL_Config"] | components["schemas"]["VAE_Checkpoint_FLUX_Config"] | components["schemas"]["VAE_Checkpoint_Flux2_Config"] | components["schemas"]["VAE_Checkpoint_QwenImage_Config"] | components["schemas"]["VAE_Checkpoint_Anima_Config"] | components["schemas"]["VAE_Diffusers_SD1_Config"] | components["schemas"]["VAE_Diffusers_SDXL_Config"] | components["schemas"]["VAE_Diffusers_Flux2_Config"] | components["schemas"]["PiDDecoder_Checkpoint_FLUX_Config"] | 
components["schemas"]["PiDDecoder_Checkpoint_Flux2_Config"] | components["schemas"]["PiDDecoder_Checkpoint_SD3_Config"] | components["schemas"]["PiDDecoder_Checkpoint_SDXL_Config"] | components["schemas"]["PiDDecoder_Checkpoint_QwenImage_Config"] | components["schemas"]["ControlNet_Checkpoint_SD1_Config"] | components["schemas"]["ControlNet_Checkpoint_SD2_Config"] | components["schemas"]["ControlNet_Checkpoint_SDXL_Config"] | components["schemas"]["ControlNet_Checkpoint_FLUX_Config"] | components["schemas"]["ControlNet_Checkpoint_ZImage_Config"] | components["schemas"]["ControlNet_Diffusers_SD1_Config"] | components["schemas"]["ControlNet_Diffusers_SD2_Config"] | components["schemas"]["ControlNet_Diffusers_SDXL_Config"] | components["schemas"]["ControlNet_Diffusers_FLUX_Config"] | components["schemas"]["LoRA_LyCORIS_SD1_Config"] | components["schemas"]["LoRA_LyCORIS_SD2_Config"] | components["schemas"]["LoRA_LyCORIS_SDXL_Config"] | components["schemas"]["LoRA_LyCORIS_Flux2_Config"] | components["schemas"]["LoRA_LyCORIS_FLUX_Config"] | components["schemas"]["LoRA_LyCORIS_ZImage_Config"] | components["schemas"]["LoRA_LyCORIS_QwenImage_Config"] | components["schemas"]["LoRA_LyCORIS_Anima_Config"] | components["schemas"]["LoRA_OMI_SDXL_Config"] | components["schemas"]["LoRA_OMI_FLUX_Config"] | components["schemas"]["LoRA_Diffusers_SD1_Config"] | components["schemas"]["LoRA_Diffusers_SD2_Config"] | components["schemas"]["LoRA_Diffusers_SDXL_Config"] | components["schemas"]["LoRA_Diffusers_Flux2_Config"] | components["schemas"]["LoRA_Diffusers_FLUX_Config"] | components["schemas"]["LoRA_Diffusers_ZImage_Config"] | components["schemas"]["ControlLoRA_LyCORIS_FLUX_Config"] | components["schemas"]["T5Encoder_T5Encoder_Config"] | components["schemas"]["T5Encoder_BnBLLMint8_Config"] | components["schemas"]["Qwen3Encoder_Qwen3Encoder_Config"] | components["schemas"]["Qwen3Encoder_Checkpoint_Config"] | components["schemas"]["Qwen3Encoder_GGUF_Config"] | 
components["schemas"]["Gemma2Encoder_Gemma2Encoder_Config"] | components["schemas"]["QwenVLEncoder_Diffusers_Config"] | components["schemas"]["QwenVLEncoder_Checkpoint_Config"] | components["schemas"]["TI_File_SD1_Config"] | components["schemas"]["TI_File_SD2_Config"] | components["schemas"]["TI_File_SDXL_Config"] | components["schemas"]["TI_Folder_SD1_Config"] | components["schemas"]["TI_Folder_SD2_Config"] | components["schemas"]["TI_Folder_SDXL_Config"] | components["schemas"]["IPAdapter_InvokeAI_SD1_Config"] | components["schemas"]["IPAdapter_InvokeAI_SD2_Config"] | components["schemas"]["IPAdapter_InvokeAI_SDXL_Config"] | components["schemas"]["IPAdapter_Checkpoint_SD1_Config"] | components["schemas"]["IPAdapter_Checkpoint_SD2_Config"] | components["schemas"]["IPAdapter_Checkpoint_SDXL_Config"] | components["schemas"]["IPAdapter_Checkpoint_FLUX_Config"] | components["schemas"]["T2IAdapter_Diffusers_SD1_Config"] | components["schemas"]["T2IAdapter_Diffusers_SDXL_Config"] | components["schemas"]["Spandrel_Checkpoint_Config"] | components["schemas"]["CLIPEmbed_Diffusers_G_Config"] | components["schemas"]["CLIPEmbed_Diffusers_L_Config"] | components["schemas"]["CLIPVision_Diffusers_Config"] | components["schemas"]["SigLIP_Diffusers_Config"] | components["schemas"]["FLUXRedux_Checkpoint_Config"] | components["schemas"]["LlavaOnevision_Diffusers_Config"] | components["schemas"]["TextLLM_Diffusers_Config"] | components["schemas"]["ExternalApiModelConfig"] | components["schemas"]["Unknown_Config"];
};
};
/** @description Validation Error */
@@ -33768,7 +34988,7 @@ export interface operations {
[name: string]: unknown;
};
content: {
- "application/json": components["schemas"]["Main_Diffusers_SD1_Config"] | components["schemas"]["Main_Diffusers_SD2_Config"] | components["schemas"]["Main_Diffusers_SDXL_Config"] | components["schemas"]["Main_Diffusers_SDXLRefiner_Config"] | components["schemas"]["Main_Diffusers_SD3_Config"] | components["schemas"]["Main_Diffusers_FLUX_Config"] | components["schemas"]["Main_Diffusers_Flux2_Config"] | components["schemas"]["Main_Diffusers_CogView4_Config"] | components["schemas"]["Main_Diffusers_QwenImage_Config"] | components["schemas"]["Main_Diffusers_ZImage_Config"] | components["schemas"]["Main_Checkpoint_SD1_Config"] | components["schemas"]["Main_Checkpoint_SD2_Config"] | components["schemas"]["Main_Checkpoint_SDXL_Config"] | components["schemas"]["Main_Checkpoint_SDXLRefiner_Config"] | components["schemas"]["Main_Checkpoint_Flux2_Config"] | components["schemas"]["Main_Checkpoint_FLUX_Config"] | components["schemas"]["Main_Checkpoint_QwenImage_Config"] | components["schemas"]["Main_Checkpoint_ZImage_Config"] | components["schemas"]["Main_Checkpoint_Anima_Config"] | components["schemas"]["Main_BnBNF4_FLUX_Config"] | components["schemas"]["Main_GGUF_Flux2_Config"] | components["schemas"]["Main_GGUF_FLUX_Config"] | components["schemas"]["Main_GGUF_QwenImage_Config"] | components["schemas"]["Main_GGUF_ZImage_Config"] | components["schemas"]["VAE_Checkpoint_SD1_Config"] | components["schemas"]["VAE_Checkpoint_SD2_Config"] | components["schemas"]["VAE_Checkpoint_SDXL_Config"] | components["schemas"]["VAE_Checkpoint_FLUX_Config"] | components["schemas"]["VAE_Checkpoint_Flux2_Config"] | components["schemas"]["VAE_Checkpoint_QwenImage_Config"] | components["schemas"]["VAE_Checkpoint_Anima_Config"] | components["schemas"]["VAE_Diffusers_SD1_Config"] | components["schemas"]["VAE_Diffusers_SDXL_Config"] | components["schemas"]["VAE_Diffusers_Flux2_Config"] | components["schemas"]["ControlNet_Checkpoint_SD1_Config"] | 
components["schemas"]["ControlNet_Checkpoint_SD2_Config"] | components["schemas"]["ControlNet_Checkpoint_SDXL_Config"] | components["schemas"]["ControlNet_Checkpoint_FLUX_Config"] | components["schemas"]["ControlNet_Checkpoint_ZImage_Config"] | components["schemas"]["ControlNet_Diffusers_SD1_Config"] | components["schemas"]["ControlNet_Diffusers_SD2_Config"] | components["schemas"]["ControlNet_Diffusers_SDXL_Config"] | components["schemas"]["ControlNet_Diffusers_FLUX_Config"] | components["schemas"]["LoRA_LyCORIS_SD1_Config"] | components["schemas"]["LoRA_LyCORIS_SD2_Config"] | components["schemas"]["LoRA_LyCORIS_SDXL_Config"] | components["schemas"]["LoRA_LyCORIS_Flux2_Config"] | components["schemas"]["LoRA_LyCORIS_FLUX_Config"] | components["schemas"]["LoRA_LyCORIS_ZImage_Config"] | components["schemas"]["LoRA_LyCORIS_QwenImage_Config"] | components["schemas"]["LoRA_LyCORIS_Anima_Config"] | components["schemas"]["LoRA_OMI_SDXL_Config"] | components["schemas"]["LoRA_OMI_FLUX_Config"] | components["schemas"]["LoRA_Diffusers_SD1_Config"] | components["schemas"]["LoRA_Diffusers_SD2_Config"] | components["schemas"]["LoRA_Diffusers_SDXL_Config"] | components["schemas"]["LoRA_Diffusers_Flux2_Config"] | components["schemas"]["LoRA_Diffusers_FLUX_Config"] | components["schemas"]["LoRA_Diffusers_ZImage_Config"] | components["schemas"]["ControlLoRA_LyCORIS_FLUX_Config"] | components["schemas"]["T5Encoder_T5Encoder_Config"] | components["schemas"]["T5Encoder_BnBLLMint8_Config"] | components["schemas"]["Qwen3Encoder_Qwen3Encoder_Config"] | components["schemas"]["Qwen3Encoder_Checkpoint_Config"] | components["schemas"]["Qwen3Encoder_GGUF_Config"] | components["schemas"]["QwenVLEncoder_Diffusers_Config"] | components["schemas"]["QwenVLEncoder_Checkpoint_Config"] | components["schemas"]["TI_File_SD1_Config"] | components["schemas"]["TI_File_SD2_Config"] | components["schemas"]["TI_File_SDXL_Config"] | components["schemas"]["TI_Folder_SD1_Config"] | 
components["schemas"]["TI_Folder_SD2_Config"] | components["schemas"]["TI_Folder_SDXL_Config"] | components["schemas"]["IPAdapter_InvokeAI_SD1_Config"] | components["schemas"]["IPAdapter_InvokeAI_SD2_Config"] | components["schemas"]["IPAdapter_InvokeAI_SDXL_Config"] | components["schemas"]["IPAdapter_Checkpoint_SD1_Config"] | components["schemas"]["IPAdapter_Checkpoint_SD2_Config"] | components["schemas"]["IPAdapter_Checkpoint_SDXL_Config"] | components["schemas"]["IPAdapter_Checkpoint_FLUX_Config"] | components["schemas"]["T2IAdapter_Diffusers_SD1_Config"] | components["schemas"]["T2IAdapter_Diffusers_SDXL_Config"] | components["schemas"]["Spandrel_Checkpoint_Config"] | components["schemas"]["CLIPEmbed_Diffusers_G_Config"] | components["schemas"]["CLIPEmbed_Diffusers_L_Config"] | components["schemas"]["CLIPVision_Diffusers_Config"] | components["schemas"]["SigLIP_Diffusers_Config"] | components["schemas"]["FLUXRedux_Checkpoint_Config"] | components["schemas"]["LlavaOnevision_Diffusers_Config"] | components["schemas"]["TextLLM_Diffusers_Config"] | components["schemas"]["ExternalApiModelConfig"] | components["schemas"]["Unknown_Config"];
+ "application/json": components["schemas"]["Main_Diffusers_SD1_Config"] | components["schemas"]["Main_Diffusers_SD2_Config"] | components["schemas"]["Main_Diffusers_SDXL_Config"] | components["schemas"]["Main_Diffusers_SDXLRefiner_Config"] | components["schemas"]["Main_Diffusers_SD3_Config"] | components["schemas"]["Main_Diffusers_FLUX_Config"] | components["schemas"]["Main_Diffusers_Flux2_Config"] | components["schemas"]["Main_Diffusers_CogView4_Config"] | components["schemas"]["Main_Diffusers_QwenImage_Config"] | components["schemas"]["Main_Diffusers_ZImage_Config"] | components["schemas"]["Main_Checkpoint_SD1_Config"] | components["schemas"]["Main_Checkpoint_SD2_Config"] | components["schemas"]["Main_Checkpoint_SDXL_Config"] | components["schemas"]["Main_Checkpoint_SDXLRefiner_Config"] | components["schemas"]["Main_Checkpoint_Flux2_Config"] | components["schemas"]["Main_Checkpoint_FLUX_Config"] | components["schemas"]["Main_Checkpoint_QwenImage_Config"] | components["schemas"]["Main_Checkpoint_ZImage_Config"] | components["schemas"]["Main_Checkpoint_Anima_Config"] | components["schemas"]["Main_BnBNF4_FLUX_Config"] | components["schemas"]["Main_GGUF_Flux2_Config"] | components["schemas"]["Main_GGUF_FLUX_Config"] | components["schemas"]["Main_GGUF_QwenImage_Config"] | components["schemas"]["Main_GGUF_ZImage_Config"] | components["schemas"]["VAE_Checkpoint_SD1_Config"] | components["schemas"]["VAE_Checkpoint_SD2_Config"] | components["schemas"]["VAE_Checkpoint_SDXL_Config"] | components["schemas"]["VAE_Checkpoint_FLUX_Config"] | components["schemas"]["VAE_Checkpoint_Flux2_Config"] | components["schemas"]["VAE_Checkpoint_QwenImage_Config"] | components["schemas"]["VAE_Checkpoint_Anima_Config"] | components["schemas"]["VAE_Diffusers_SD1_Config"] | components["schemas"]["VAE_Diffusers_SDXL_Config"] | components["schemas"]["VAE_Diffusers_Flux2_Config"] | components["schemas"]["PiDDecoder_Checkpoint_FLUX_Config"] | 
components["schemas"]["PiDDecoder_Checkpoint_Flux2_Config"] | components["schemas"]["PiDDecoder_Checkpoint_SD3_Config"] | components["schemas"]["PiDDecoder_Checkpoint_SDXL_Config"] | components["schemas"]["PiDDecoder_Checkpoint_QwenImage_Config"] | components["schemas"]["ControlNet_Checkpoint_SD1_Config"] | components["schemas"]["ControlNet_Checkpoint_SD2_Config"] | components["schemas"]["ControlNet_Checkpoint_SDXL_Config"] | components["schemas"]["ControlNet_Checkpoint_FLUX_Config"] | components["schemas"]["ControlNet_Checkpoint_ZImage_Config"] | components["schemas"]["ControlNet_Diffusers_SD1_Config"] | components["schemas"]["ControlNet_Diffusers_SD2_Config"] | components["schemas"]["ControlNet_Diffusers_SDXL_Config"] | components["schemas"]["ControlNet_Diffusers_FLUX_Config"] | components["schemas"]["LoRA_LyCORIS_SD1_Config"] | components["schemas"]["LoRA_LyCORIS_SD2_Config"] | components["schemas"]["LoRA_LyCORIS_SDXL_Config"] | components["schemas"]["LoRA_LyCORIS_Flux2_Config"] | components["schemas"]["LoRA_LyCORIS_FLUX_Config"] | components["schemas"]["LoRA_LyCORIS_ZImage_Config"] | components["schemas"]["LoRA_LyCORIS_QwenImage_Config"] | components["schemas"]["LoRA_LyCORIS_Anima_Config"] | components["schemas"]["LoRA_OMI_SDXL_Config"] | components["schemas"]["LoRA_OMI_FLUX_Config"] | components["schemas"]["LoRA_Diffusers_SD1_Config"] | components["schemas"]["LoRA_Diffusers_SD2_Config"] | components["schemas"]["LoRA_Diffusers_SDXL_Config"] | components["schemas"]["LoRA_Diffusers_Flux2_Config"] | components["schemas"]["LoRA_Diffusers_FLUX_Config"] | components["schemas"]["LoRA_Diffusers_ZImage_Config"] | components["schemas"]["ControlLoRA_LyCORIS_FLUX_Config"] | components["schemas"]["T5Encoder_T5Encoder_Config"] | components["schemas"]["T5Encoder_BnBLLMint8_Config"] | components["schemas"]["Qwen3Encoder_Qwen3Encoder_Config"] | components["schemas"]["Qwen3Encoder_Checkpoint_Config"] | components["schemas"]["Qwen3Encoder_GGUF_Config"] | 
components["schemas"]["Gemma2Encoder_Gemma2Encoder_Config"] | components["schemas"]["QwenVLEncoder_Diffusers_Config"] | components["schemas"]["QwenVLEncoder_Checkpoint_Config"] | components["schemas"]["TI_File_SD1_Config"] | components["schemas"]["TI_File_SD2_Config"] | components["schemas"]["TI_File_SDXL_Config"] | components["schemas"]["TI_Folder_SD1_Config"] | components["schemas"]["TI_Folder_SD2_Config"] | components["schemas"]["TI_Folder_SDXL_Config"] | components["schemas"]["IPAdapter_InvokeAI_SD1_Config"] | components["schemas"]["IPAdapter_InvokeAI_SD2_Config"] | components["schemas"]["IPAdapter_InvokeAI_SDXL_Config"] | components["schemas"]["IPAdapter_Checkpoint_SD1_Config"] | components["schemas"]["IPAdapter_Checkpoint_SD2_Config"] | components["schemas"]["IPAdapter_Checkpoint_SDXL_Config"] | components["schemas"]["IPAdapter_Checkpoint_FLUX_Config"] | components["schemas"]["T2IAdapter_Diffusers_SD1_Config"] | components["schemas"]["T2IAdapter_Diffusers_SDXL_Config"] | components["schemas"]["Spandrel_Checkpoint_Config"] | components["schemas"]["CLIPEmbed_Diffusers_G_Config"] | components["schemas"]["CLIPEmbed_Diffusers_L_Config"] | components["schemas"]["CLIPVision_Diffusers_Config"] | components["schemas"]["SigLIP_Diffusers_Config"] | components["schemas"]["FLUXRedux_Checkpoint_Config"] | components["schemas"]["LlavaOnevision_Diffusers_Config"] | components["schemas"]["TextLLM_Diffusers_Config"] | components["schemas"]["ExternalApiModelConfig"] | components["schemas"]["Unknown_Config"];
};
};
/** @description Validation Error */
@@ -33820,7 +35040,7 @@ export interface operations {
* "upcast_attention": false
* }
*/
- "application/json": components["schemas"]["Main_Diffusers_SD1_Config"] | components["schemas"]["Main_Diffusers_SD2_Config"] | components["schemas"]["Main_Diffusers_SDXL_Config"] | components["schemas"]["Main_Diffusers_SDXLRefiner_Config"] | components["schemas"]["Main_Diffusers_SD3_Config"] | components["schemas"]["Main_Diffusers_FLUX_Config"] | components["schemas"]["Main_Diffusers_Flux2_Config"] | components["schemas"]["Main_Diffusers_CogView4_Config"] | components["schemas"]["Main_Diffusers_QwenImage_Config"] | components["schemas"]["Main_Diffusers_ZImage_Config"] | components["schemas"]["Main_Checkpoint_SD1_Config"] | components["schemas"]["Main_Checkpoint_SD2_Config"] | components["schemas"]["Main_Checkpoint_SDXL_Config"] | components["schemas"]["Main_Checkpoint_SDXLRefiner_Config"] | components["schemas"]["Main_Checkpoint_Flux2_Config"] | components["schemas"]["Main_Checkpoint_FLUX_Config"] | components["schemas"]["Main_Checkpoint_QwenImage_Config"] | components["schemas"]["Main_Checkpoint_ZImage_Config"] | components["schemas"]["Main_Checkpoint_Anima_Config"] | components["schemas"]["Main_BnBNF4_FLUX_Config"] | components["schemas"]["Main_GGUF_Flux2_Config"] | components["schemas"]["Main_GGUF_FLUX_Config"] | components["schemas"]["Main_GGUF_QwenImage_Config"] | components["schemas"]["Main_GGUF_ZImage_Config"] | components["schemas"]["VAE_Checkpoint_SD1_Config"] | components["schemas"]["VAE_Checkpoint_SD2_Config"] | components["schemas"]["VAE_Checkpoint_SDXL_Config"] | components["schemas"]["VAE_Checkpoint_FLUX_Config"] | components["schemas"]["VAE_Checkpoint_Flux2_Config"] | components["schemas"]["VAE_Checkpoint_QwenImage_Config"] | components["schemas"]["VAE_Checkpoint_Anima_Config"] | components["schemas"]["VAE_Diffusers_SD1_Config"] | components["schemas"]["VAE_Diffusers_SDXL_Config"] | components["schemas"]["VAE_Diffusers_Flux2_Config"] | components["schemas"]["ControlNet_Checkpoint_SD1_Config"] | 
components["schemas"]["ControlNet_Checkpoint_SD2_Config"] | components["schemas"]["ControlNet_Checkpoint_SDXL_Config"] | components["schemas"]["ControlNet_Checkpoint_FLUX_Config"] | components["schemas"]["ControlNet_Checkpoint_ZImage_Config"] | components["schemas"]["ControlNet_Diffusers_SD1_Config"] | components["schemas"]["ControlNet_Diffusers_SD2_Config"] | components["schemas"]["ControlNet_Diffusers_SDXL_Config"] | components["schemas"]["ControlNet_Diffusers_FLUX_Config"] | components["schemas"]["LoRA_LyCORIS_SD1_Config"] | components["schemas"]["LoRA_LyCORIS_SD2_Config"] | components["schemas"]["LoRA_LyCORIS_SDXL_Config"] | components["schemas"]["LoRA_LyCORIS_Flux2_Config"] | components["schemas"]["LoRA_LyCORIS_FLUX_Config"] | components["schemas"]["LoRA_LyCORIS_ZImage_Config"] | components["schemas"]["LoRA_LyCORIS_QwenImage_Config"] | components["schemas"]["LoRA_LyCORIS_Anima_Config"] | components["schemas"]["LoRA_OMI_SDXL_Config"] | components["schemas"]["LoRA_OMI_FLUX_Config"] | components["schemas"]["LoRA_Diffusers_SD1_Config"] | components["schemas"]["LoRA_Diffusers_SD2_Config"] | components["schemas"]["LoRA_Diffusers_SDXL_Config"] | components["schemas"]["LoRA_Diffusers_Flux2_Config"] | components["schemas"]["LoRA_Diffusers_FLUX_Config"] | components["schemas"]["LoRA_Diffusers_ZImage_Config"] | components["schemas"]["ControlLoRA_LyCORIS_FLUX_Config"] | components["schemas"]["T5Encoder_T5Encoder_Config"] | components["schemas"]["T5Encoder_BnBLLMint8_Config"] | components["schemas"]["Qwen3Encoder_Qwen3Encoder_Config"] | components["schemas"]["Qwen3Encoder_Checkpoint_Config"] | components["schemas"]["Qwen3Encoder_GGUF_Config"] | components["schemas"]["QwenVLEncoder_Diffusers_Config"] | components["schemas"]["QwenVLEncoder_Checkpoint_Config"] | components["schemas"]["TI_File_SD1_Config"] | components["schemas"]["TI_File_SD2_Config"] | components["schemas"]["TI_File_SDXL_Config"] | components["schemas"]["TI_Folder_SD1_Config"] | 
components["schemas"]["TI_Folder_SD2_Config"] | components["schemas"]["TI_Folder_SDXL_Config"] | components["schemas"]["IPAdapter_InvokeAI_SD1_Config"] | components["schemas"]["IPAdapter_InvokeAI_SD2_Config"] | components["schemas"]["IPAdapter_InvokeAI_SDXL_Config"] | components["schemas"]["IPAdapter_Checkpoint_SD1_Config"] | components["schemas"]["IPAdapter_Checkpoint_SD2_Config"] | components["schemas"]["IPAdapter_Checkpoint_SDXL_Config"] | components["schemas"]["IPAdapter_Checkpoint_FLUX_Config"] | components["schemas"]["T2IAdapter_Diffusers_SD1_Config"] | components["schemas"]["T2IAdapter_Diffusers_SDXL_Config"] | components["schemas"]["Spandrel_Checkpoint_Config"] | components["schemas"]["CLIPEmbed_Diffusers_G_Config"] | components["schemas"]["CLIPEmbed_Diffusers_L_Config"] | components["schemas"]["CLIPVision_Diffusers_Config"] | components["schemas"]["SigLIP_Diffusers_Config"] | components["schemas"]["FLUXRedux_Checkpoint_Config"] | components["schemas"]["LlavaOnevision_Diffusers_Config"] | components["schemas"]["TextLLM_Diffusers_Config"] | components["schemas"]["ExternalApiModelConfig"] | components["schemas"]["Unknown_Config"];
+ "application/json": components["schemas"]["Main_Diffusers_SD1_Config"] | components["schemas"]["Main_Diffusers_SD2_Config"] | components["schemas"]["Main_Diffusers_SDXL_Config"] | components["schemas"]["Main_Diffusers_SDXLRefiner_Config"] | components["schemas"]["Main_Diffusers_SD3_Config"] | components["schemas"]["Main_Diffusers_FLUX_Config"] | components["schemas"]["Main_Diffusers_Flux2_Config"] | components["schemas"]["Main_Diffusers_CogView4_Config"] | components["schemas"]["Main_Diffusers_QwenImage_Config"] | components["schemas"]["Main_Diffusers_ZImage_Config"] | components["schemas"]["Main_Checkpoint_SD1_Config"] | components["schemas"]["Main_Checkpoint_SD2_Config"] | components["schemas"]["Main_Checkpoint_SDXL_Config"] | components["schemas"]["Main_Checkpoint_SDXLRefiner_Config"] | components["schemas"]["Main_Checkpoint_Flux2_Config"] | components["schemas"]["Main_Checkpoint_FLUX_Config"] | components["schemas"]["Main_Checkpoint_QwenImage_Config"] | components["schemas"]["Main_Checkpoint_ZImage_Config"] | components["schemas"]["Main_Checkpoint_Anima_Config"] | components["schemas"]["Main_BnBNF4_FLUX_Config"] | components["schemas"]["Main_GGUF_Flux2_Config"] | components["schemas"]["Main_GGUF_FLUX_Config"] | components["schemas"]["Main_GGUF_QwenImage_Config"] | components["schemas"]["Main_GGUF_ZImage_Config"] | components["schemas"]["VAE_Checkpoint_SD1_Config"] | components["schemas"]["VAE_Checkpoint_SD2_Config"] | components["schemas"]["VAE_Checkpoint_SDXL_Config"] | components["schemas"]["VAE_Checkpoint_FLUX_Config"] | components["schemas"]["VAE_Checkpoint_Flux2_Config"] | components["schemas"]["VAE_Checkpoint_QwenImage_Config"] | components["schemas"]["VAE_Checkpoint_Anima_Config"] | components["schemas"]["VAE_Diffusers_SD1_Config"] | components["schemas"]["VAE_Diffusers_SDXL_Config"] | components["schemas"]["VAE_Diffusers_Flux2_Config"] | components["schemas"]["PiDDecoder_Checkpoint_FLUX_Config"] | 
components["schemas"]["PiDDecoder_Checkpoint_Flux2_Config"] | components["schemas"]["PiDDecoder_Checkpoint_SD3_Config"] | components["schemas"]["PiDDecoder_Checkpoint_SDXL_Config"] | components["schemas"]["PiDDecoder_Checkpoint_QwenImage_Config"] | components["schemas"]["ControlNet_Checkpoint_SD1_Config"] | components["schemas"]["ControlNet_Checkpoint_SD2_Config"] | components["schemas"]["ControlNet_Checkpoint_SDXL_Config"] | components["schemas"]["ControlNet_Checkpoint_FLUX_Config"] | components["schemas"]["ControlNet_Checkpoint_ZImage_Config"] | components["schemas"]["ControlNet_Diffusers_SD1_Config"] | components["schemas"]["ControlNet_Diffusers_SD2_Config"] | components["schemas"]["ControlNet_Diffusers_SDXL_Config"] | components["schemas"]["ControlNet_Diffusers_FLUX_Config"] | components["schemas"]["LoRA_LyCORIS_SD1_Config"] | components["schemas"]["LoRA_LyCORIS_SD2_Config"] | components["schemas"]["LoRA_LyCORIS_SDXL_Config"] | components["schemas"]["LoRA_LyCORIS_Flux2_Config"] | components["schemas"]["LoRA_LyCORIS_FLUX_Config"] | components["schemas"]["LoRA_LyCORIS_ZImage_Config"] | components["schemas"]["LoRA_LyCORIS_QwenImage_Config"] | components["schemas"]["LoRA_LyCORIS_Anima_Config"] | components["schemas"]["LoRA_OMI_SDXL_Config"] | components["schemas"]["LoRA_OMI_FLUX_Config"] | components["schemas"]["LoRA_Diffusers_SD1_Config"] | components["schemas"]["LoRA_Diffusers_SD2_Config"] | components["schemas"]["LoRA_Diffusers_SDXL_Config"] | components["schemas"]["LoRA_Diffusers_Flux2_Config"] | components["schemas"]["LoRA_Diffusers_FLUX_Config"] | components["schemas"]["LoRA_Diffusers_ZImage_Config"] | components["schemas"]["ControlLoRA_LyCORIS_FLUX_Config"] | components["schemas"]["T5Encoder_T5Encoder_Config"] | components["schemas"]["T5Encoder_BnBLLMint8_Config"] | components["schemas"]["Qwen3Encoder_Qwen3Encoder_Config"] | components["schemas"]["Qwen3Encoder_Checkpoint_Config"] | components["schemas"]["Qwen3Encoder_GGUF_Config"] | 
components["schemas"]["Gemma2Encoder_Gemma2Encoder_Config"] | components["schemas"]["QwenVLEncoder_Diffusers_Config"] | components["schemas"]["QwenVLEncoder_Checkpoint_Config"] | components["schemas"]["TI_File_SD1_Config"] | components["schemas"]["TI_File_SD2_Config"] | components["schemas"]["TI_File_SDXL_Config"] | components["schemas"]["TI_Folder_SD1_Config"] | components["schemas"]["TI_Folder_SD2_Config"] | components["schemas"]["TI_Folder_SDXL_Config"] | components["schemas"]["IPAdapter_InvokeAI_SD1_Config"] | components["schemas"]["IPAdapter_InvokeAI_SD2_Config"] | components["schemas"]["IPAdapter_InvokeAI_SDXL_Config"] | components["schemas"]["IPAdapter_Checkpoint_SD1_Config"] | components["schemas"]["IPAdapter_Checkpoint_SD2_Config"] | components["schemas"]["IPAdapter_Checkpoint_SDXL_Config"] | components["schemas"]["IPAdapter_Checkpoint_FLUX_Config"] | components["schemas"]["T2IAdapter_Diffusers_SD1_Config"] | components["schemas"]["T2IAdapter_Diffusers_SDXL_Config"] | components["schemas"]["Spandrel_Checkpoint_Config"] | components["schemas"]["CLIPEmbed_Diffusers_G_Config"] | components["schemas"]["CLIPEmbed_Diffusers_L_Config"] | components["schemas"]["CLIPVision_Diffusers_Config"] | components["schemas"]["SigLIP_Diffusers_Config"] | components["schemas"]["FLUXRedux_Checkpoint_Config"] | components["schemas"]["LlavaOnevision_Diffusers_Config"] | components["schemas"]["TextLLM_Diffusers_Config"] | components["schemas"]["ExternalApiModelConfig"] | components["schemas"]["Unknown_Config"];
};
};
/** @description Bad request */
@@ -33927,7 +35147,7 @@ export interface operations {
* "upcast_attention": false
* }
*/
- "application/json": components["schemas"]["Main_Diffusers_SD1_Config"] | components["schemas"]["Main_Diffusers_SD2_Config"] | components["schemas"]["Main_Diffusers_SDXL_Config"] | components["schemas"]["Main_Diffusers_SDXLRefiner_Config"] | components["schemas"]["Main_Diffusers_SD3_Config"] | components["schemas"]["Main_Diffusers_FLUX_Config"] | components["schemas"]["Main_Diffusers_Flux2_Config"] | components["schemas"]["Main_Diffusers_CogView4_Config"] | components["schemas"]["Main_Diffusers_QwenImage_Config"] | components["schemas"]["Main_Diffusers_ZImage_Config"] | components["schemas"]["Main_Checkpoint_SD1_Config"] | components["schemas"]["Main_Checkpoint_SD2_Config"] | components["schemas"]["Main_Checkpoint_SDXL_Config"] | components["schemas"]["Main_Checkpoint_SDXLRefiner_Config"] | components["schemas"]["Main_Checkpoint_Flux2_Config"] | components["schemas"]["Main_Checkpoint_FLUX_Config"] | components["schemas"]["Main_Checkpoint_QwenImage_Config"] | components["schemas"]["Main_Checkpoint_ZImage_Config"] | components["schemas"]["Main_Checkpoint_Anima_Config"] | components["schemas"]["Main_BnBNF4_FLUX_Config"] | components["schemas"]["Main_GGUF_Flux2_Config"] | components["schemas"]["Main_GGUF_FLUX_Config"] | components["schemas"]["Main_GGUF_QwenImage_Config"] | components["schemas"]["Main_GGUF_ZImage_Config"] | components["schemas"]["VAE_Checkpoint_SD1_Config"] | components["schemas"]["VAE_Checkpoint_SD2_Config"] | components["schemas"]["VAE_Checkpoint_SDXL_Config"] | components["schemas"]["VAE_Checkpoint_FLUX_Config"] | components["schemas"]["VAE_Checkpoint_Flux2_Config"] | components["schemas"]["VAE_Checkpoint_QwenImage_Config"] | components["schemas"]["VAE_Checkpoint_Anima_Config"] | components["schemas"]["VAE_Diffusers_SD1_Config"] | components["schemas"]["VAE_Diffusers_SDXL_Config"] | components["schemas"]["VAE_Diffusers_Flux2_Config"] | components["schemas"]["ControlNet_Checkpoint_SD1_Config"] | 
components["schemas"]["ControlNet_Checkpoint_SD2_Config"] | components["schemas"]["ControlNet_Checkpoint_SDXL_Config"] | components["schemas"]["ControlNet_Checkpoint_FLUX_Config"] | components["schemas"]["ControlNet_Checkpoint_ZImage_Config"] | components["schemas"]["ControlNet_Diffusers_SD1_Config"] | components["schemas"]["ControlNet_Diffusers_SD2_Config"] | components["schemas"]["ControlNet_Diffusers_SDXL_Config"] | components["schemas"]["ControlNet_Diffusers_FLUX_Config"] | components["schemas"]["LoRA_LyCORIS_SD1_Config"] | components["schemas"]["LoRA_LyCORIS_SD2_Config"] | components["schemas"]["LoRA_LyCORIS_SDXL_Config"] | components["schemas"]["LoRA_LyCORIS_Flux2_Config"] | components["schemas"]["LoRA_LyCORIS_FLUX_Config"] | components["schemas"]["LoRA_LyCORIS_ZImage_Config"] | components["schemas"]["LoRA_LyCORIS_QwenImage_Config"] | components["schemas"]["LoRA_LyCORIS_Anima_Config"] | components["schemas"]["LoRA_OMI_SDXL_Config"] | components["schemas"]["LoRA_OMI_FLUX_Config"] | components["schemas"]["LoRA_Diffusers_SD1_Config"] | components["schemas"]["LoRA_Diffusers_SD2_Config"] | components["schemas"]["LoRA_Diffusers_SDXL_Config"] | components["schemas"]["LoRA_Diffusers_Flux2_Config"] | components["schemas"]["LoRA_Diffusers_FLUX_Config"] | components["schemas"]["LoRA_Diffusers_ZImage_Config"] | components["schemas"]["ControlLoRA_LyCORIS_FLUX_Config"] | components["schemas"]["T5Encoder_T5Encoder_Config"] | components["schemas"]["T5Encoder_BnBLLMint8_Config"] | components["schemas"]["Qwen3Encoder_Qwen3Encoder_Config"] | components["schemas"]["Qwen3Encoder_Checkpoint_Config"] | components["schemas"]["Qwen3Encoder_GGUF_Config"] | components["schemas"]["QwenVLEncoder_Diffusers_Config"] | components["schemas"]["QwenVLEncoder_Checkpoint_Config"] | components["schemas"]["TI_File_SD1_Config"] | components["schemas"]["TI_File_SD2_Config"] | components["schemas"]["TI_File_SDXL_Config"] | components["schemas"]["TI_Folder_SD1_Config"] | 
components["schemas"]["TI_Folder_SD2_Config"] | components["schemas"]["TI_Folder_SDXL_Config"] | components["schemas"]["IPAdapter_InvokeAI_SD1_Config"] | components["schemas"]["IPAdapter_InvokeAI_SD2_Config"] | components["schemas"]["IPAdapter_InvokeAI_SDXL_Config"] | components["schemas"]["IPAdapter_Checkpoint_SD1_Config"] | components["schemas"]["IPAdapter_Checkpoint_SD2_Config"] | components["schemas"]["IPAdapter_Checkpoint_SDXL_Config"] | components["schemas"]["IPAdapter_Checkpoint_FLUX_Config"] | components["schemas"]["T2IAdapter_Diffusers_SD1_Config"] | components["schemas"]["T2IAdapter_Diffusers_SDXL_Config"] | components["schemas"]["Spandrel_Checkpoint_Config"] | components["schemas"]["CLIPEmbed_Diffusers_G_Config"] | components["schemas"]["CLIPEmbed_Diffusers_L_Config"] | components["schemas"]["CLIPVision_Diffusers_Config"] | components["schemas"]["SigLIP_Diffusers_Config"] | components["schemas"]["FLUXRedux_Checkpoint_Config"] | components["schemas"]["LlavaOnevision_Diffusers_Config"] | components["schemas"]["TextLLM_Diffusers_Config"] | components["schemas"]["ExternalApiModelConfig"] | components["schemas"]["Unknown_Config"];
+ "application/json": components["schemas"]["Main_Diffusers_SD1_Config"] | components["schemas"]["Main_Diffusers_SD2_Config"] | components["schemas"]["Main_Diffusers_SDXL_Config"] | components["schemas"]["Main_Diffusers_SDXLRefiner_Config"] | components["schemas"]["Main_Diffusers_SD3_Config"] | components["schemas"]["Main_Diffusers_FLUX_Config"] | components["schemas"]["Main_Diffusers_Flux2_Config"] | components["schemas"]["Main_Diffusers_CogView4_Config"] | components["schemas"]["Main_Diffusers_QwenImage_Config"] | components["schemas"]["Main_Diffusers_ZImage_Config"] | components["schemas"]["Main_Checkpoint_SD1_Config"] | components["schemas"]["Main_Checkpoint_SD2_Config"] | components["schemas"]["Main_Checkpoint_SDXL_Config"] | components["schemas"]["Main_Checkpoint_SDXLRefiner_Config"] | components["schemas"]["Main_Checkpoint_Flux2_Config"] | components["schemas"]["Main_Checkpoint_FLUX_Config"] | components["schemas"]["Main_Checkpoint_QwenImage_Config"] | components["schemas"]["Main_Checkpoint_ZImage_Config"] | components["schemas"]["Main_Checkpoint_Anima_Config"] | components["schemas"]["Main_BnBNF4_FLUX_Config"] | components["schemas"]["Main_GGUF_Flux2_Config"] | components["schemas"]["Main_GGUF_FLUX_Config"] | components["schemas"]["Main_GGUF_QwenImage_Config"] | components["schemas"]["Main_GGUF_ZImage_Config"] | components["schemas"]["VAE_Checkpoint_SD1_Config"] | components["schemas"]["VAE_Checkpoint_SD2_Config"] | components["schemas"]["VAE_Checkpoint_SDXL_Config"] | components["schemas"]["VAE_Checkpoint_FLUX_Config"] | components["schemas"]["VAE_Checkpoint_Flux2_Config"] | components["schemas"]["VAE_Checkpoint_QwenImage_Config"] | components["schemas"]["VAE_Checkpoint_Anima_Config"] | components["schemas"]["VAE_Diffusers_SD1_Config"] | components["schemas"]["VAE_Diffusers_SDXL_Config"] | components["schemas"]["VAE_Diffusers_Flux2_Config"] | components["schemas"]["PiDDecoder_Checkpoint_FLUX_Config"] | 
components["schemas"]["PiDDecoder_Checkpoint_Flux2_Config"] | components["schemas"]["PiDDecoder_Checkpoint_SD3_Config"] | components["schemas"]["PiDDecoder_Checkpoint_SDXL_Config"] | components["schemas"]["PiDDecoder_Checkpoint_QwenImage_Config"] | components["schemas"]["ControlNet_Checkpoint_SD1_Config"] | components["schemas"]["ControlNet_Checkpoint_SD2_Config"] | components["schemas"]["ControlNet_Checkpoint_SDXL_Config"] | components["schemas"]["ControlNet_Checkpoint_FLUX_Config"] | components["schemas"]["ControlNet_Checkpoint_ZImage_Config"] | components["schemas"]["ControlNet_Diffusers_SD1_Config"] | components["schemas"]["ControlNet_Diffusers_SD2_Config"] | components["schemas"]["ControlNet_Diffusers_SDXL_Config"] | components["schemas"]["ControlNet_Diffusers_FLUX_Config"] | components["schemas"]["LoRA_LyCORIS_SD1_Config"] | components["schemas"]["LoRA_LyCORIS_SD2_Config"] | components["schemas"]["LoRA_LyCORIS_SDXL_Config"] | components["schemas"]["LoRA_LyCORIS_Flux2_Config"] | components["schemas"]["LoRA_LyCORIS_FLUX_Config"] | components["schemas"]["LoRA_LyCORIS_ZImage_Config"] | components["schemas"]["LoRA_LyCORIS_QwenImage_Config"] | components["schemas"]["LoRA_LyCORIS_Anima_Config"] | components["schemas"]["LoRA_OMI_SDXL_Config"] | components["schemas"]["LoRA_OMI_FLUX_Config"] | components["schemas"]["LoRA_Diffusers_SD1_Config"] | components["schemas"]["LoRA_Diffusers_SD2_Config"] | components["schemas"]["LoRA_Diffusers_SDXL_Config"] | components["schemas"]["LoRA_Diffusers_Flux2_Config"] | components["schemas"]["LoRA_Diffusers_FLUX_Config"] | components["schemas"]["LoRA_Diffusers_ZImage_Config"] | components["schemas"]["ControlLoRA_LyCORIS_FLUX_Config"] | components["schemas"]["T5Encoder_T5Encoder_Config"] | components["schemas"]["T5Encoder_BnBLLMint8_Config"] | components["schemas"]["Qwen3Encoder_Qwen3Encoder_Config"] | components["schemas"]["Qwen3Encoder_Checkpoint_Config"] | components["schemas"]["Qwen3Encoder_GGUF_Config"] | 
components["schemas"]["Gemma2Encoder_Gemma2Encoder_Config"] | components["schemas"]["QwenVLEncoder_Diffusers_Config"] | components["schemas"]["QwenVLEncoder_Checkpoint_Config"] | components["schemas"]["TI_File_SD1_Config"] | components["schemas"]["TI_File_SD2_Config"] | components["schemas"]["TI_File_SDXL_Config"] | components["schemas"]["TI_Folder_SD1_Config"] | components["schemas"]["TI_Folder_SD2_Config"] | components["schemas"]["TI_Folder_SDXL_Config"] | components["schemas"]["IPAdapter_InvokeAI_SD1_Config"] | components["schemas"]["IPAdapter_InvokeAI_SD2_Config"] | components["schemas"]["IPAdapter_InvokeAI_SDXL_Config"] | components["schemas"]["IPAdapter_Checkpoint_SD1_Config"] | components["schemas"]["IPAdapter_Checkpoint_SD2_Config"] | components["schemas"]["IPAdapter_Checkpoint_SDXL_Config"] | components["schemas"]["IPAdapter_Checkpoint_FLUX_Config"] | components["schemas"]["T2IAdapter_Diffusers_SD1_Config"] | components["schemas"]["T2IAdapter_Diffusers_SDXL_Config"] | components["schemas"]["Spandrel_Checkpoint_Config"] | components["schemas"]["CLIPEmbed_Diffusers_G_Config"] | components["schemas"]["CLIPEmbed_Diffusers_L_Config"] | components["schemas"]["CLIPVision_Diffusers_Config"] | components["schemas"]["SigLIP_Diffusers_Config"] | components["schemas"]["FLUXRedux_Checkpoint_Config"] | components["schemas"]["LlavaOnevision_Diffusers_Config"] | components["schemas"]["TextLLM_Diffusers_Config"] | components["schemas"]["ExternalApiModelConfig"] | components["schemas"]["Unknown_Config"];
};
};
/** @description Bad request */
@@ -34000,7 +35220,7 @@ export interface operations {
* "upcast_attention": false
* }
*/
- "application/json": components["schemas"]["Main_Diffusers_SD1_Config"] | components["schemas"]["Main_Diffusers_SD2_Config"] | components["schemas"]["Main_Diffusers_SDXL_Config"] | components["schemas"]["Main_Diffusers_SDXLRefiner_Config"] | components["schemas"]["Main_Diffusers_SD3_Config"] | components["schemas"]["Main_Diffusers_FLUX_Config"] | components["schemas"]["Main_Diffusers_Flux2_Config"] | components["schemas"]["Main_Diffusers_CogView4_Config"] | components["schemas"]["Main_Diffusers_QwenImage_Config"] | components["schemas"]["Main_Diffusers_ZImage_Config"] | components["schemas"]["Main_Checkpoint_SD1_Config"] | components["schemas"]["Main_Checkpoint_SD2_Config"] | components["schemas"]["Main_Checkpoint_SDXL_Config"] | components["schemas"]["Main_Checkpoint_SDXLRefiner_Config"] | components["schemas"]["Main_Checkpoint_Flux2_Config"] | components["schemas"]["Main_Checkpoint_FLUX_Config"] | components["schemas"]["Main_Checkpoint_QwenImage_Config"] | components["schemas"]["Main_Checkpoint_ZImage_Config"] | components["schemas"]["Main_Checkpoint_Anima_Config"] | components["schemas"]["Main_BnBNF4_FLUX_Config"] | components["schemas"]["Main_GGUF_Flux2_Config"] | components["schemas"]["Main_GGUF_FLUX_Config"] | components["schemas"]["Main_GGUF_QwenImage_Config"] | components["schemas"]["Main_GGUF_ZImage_Config"] | components["schemas"]["VAE_Checkpoint_SD1_Config"] | components["schemas"]["VAE_Checkpoint_SD2_Config"] | components["schemas"]["VAE_Checkpoint_SDXL_Config"] | components["schemas"]["VAE_Checkpoint_FLUX_Config"] | components["schemas"]["VAE_Checkpoint_Flux2_Config"] | components["schemas"]["VAE_Checkpoint_QwenImage_Config"] | components["schemas"]["VAE_Checkpoint_Anima_Config"] | components["schemas"]["VAE_Diffusers_SD1_Config"] | components["schemas"]["VAE_Diffusers_SDXL_Config"] | components["schemas"]["VAE_Diffusers_Flux2_Config"] | components["schemas"]["ControlNet_Checkpoint_SD1_Config"] | 
components["schemas"]["ControlNet_Checkpoint_SD2_Config"] | components["schemas"]["ControlNet_Checkpoint_SDXL_Config"] | components["schemas"]["ControlNet_Checkpoint_FLUX_Config"] | components["schemas"]["ControlNet_Checkpoint_ZImage_Config"] | components["schemas"]["ControlNet_Diffusers_SD1_Config"] | components["schemas"]["ControlNet_Diffusers_SD2_Config"] | components["schemas"]["ControlNet_Diffusers_SDXL_Config"] | components["schemas"]["ControlNet_Diffusers_FLUX_Config"] | components["schemas"]["LoRA_LyCORIS_SD1_Config"] | components["schemas"]["LoRA_LyCORIS_SD2_Config"] | components["schemas"]["LoRA_LyCORIS_SDXL_Config"] | components["schemas"]["LoRA_LyCORIS_Flux2_Config"] | components["schemas"]["LoRA_LyCORIS_FLUX_Config"] | components["schemas"]["LoRA_LyCORIS_ZImage_Config"] | components["schemas"]["LoRA_LyCORIS_QwenImage_Config"] | components["schemas"]["LoRA_LyCORIS_Anima_Config"] | components["schemas"]["LoRA_OMI_SDXL_Config"] | components["schemas"]["LoRA_OMI_FLUX_Config"] | components["schemas"]["LoRA_Diffusers_SD1_Config"] | components["schemas"]["LoRA_Diffusers_SD2_Config"] | components["schemas"]["LoRA_Diffusers_SDXL_Config"] | components["schemas"]["LoRA_Diffusers_Flux2_Config"] | components["schemas"]["LoRA_Diffusers_FLUX_Config"] | components["schemas"]["LoRA_Diffusers_ZImage_Config"] | components["schemas"]["ControlLoRA_LyCORIS_FLUX_Config"] | components["schemas"]["T5Encoder_T5Encoder_Config"] | components["schemas"]["T5Encoder_BnBLLMint8_Config"] | components["schemas"]["Qwen3Encoder_Qwen3Encoder_Config"] | components["schemas"]["Qwen3Encoder_Checkpoint_Config"] | components["schemas"]["Qwen3Encoder_GGUF_Config"] | components["schemas"]["QwenVLEncoder_Diffusers_Config"] | components["schemas"]["QwenVLEncoder_Checkpoint_Config"] | components["schemas"]["TI_File_SD1_Config"] | components["schemas"]["TI_File_SD2_Config"] | components["schemas"]["TI_File_SDXL_Config"] | components["schemas"]["TI_Folder_SD1_Config"] | 
components["schemas"]["TI_Folder_SD2_Config"] | components["schemas"]["TI_Folder_SDXL_Config"] | components["schemas"]["IPAdapter_InvokeAI_SD1_Config"] | components["schemas"]["IPAdapter_InvokeAI_SD2_Config"] | components["schemas"]["IPAdapter_InvokeAI_SDXL_Config"] | components["schemas"]["IPAdapter_Checkpoint_SD1_Config"] | components["schemas"]["IPAdapter_Checkpoint_SD2_Config"] | components["schemas"]["IPAdapter_Checkpoint_SDXL_Config"] | components["schemas"]["IPAdapter_Checkpoint_FLUX_Config"] | components["schemas"]["T2IAdapter_Diffusers_SD1_Config"] | components["schemas"]["T2IAdapter_Diffusers_SDXL_Config"] | components["schemas"]["Spandrel_Checkpoint_Config"] | components["schemas"]["CLIPEmbed_Diffusers_G_Config"] | components["schemas"]["CLIPEmbed_Diffusers_L_Config"] | components["schemas"]["CLIPVision_Diffusers_Config"] | components["schemas"]["SigLIP_Diffusers_Config"] | components["schemas"]["FLUXRedux_Checkpoint_Config"] | components["schemas"]["LlavaOnevision_Diffusers_Config"] | components["schemas"]["TextLLM_Diffusers_Config"] | components["schemas"]["ExternalApiModelConfig"] | components["schemas"]["Unknown_Config"];
+ "application/json": components["schemas"]["Main_Diffusers_SD1_Config"] | components["schemas"]["Main_Diffusers_SD2_Config"] | components["schemas"]["Main_Diffusers_SDXL_Config"] | components["schemas"]["Main_Diffusers_SDXLRefiner_Config"] | components["schemas"]["Main_Diffusers_SD3_Config"] | components["schemas"]["Main_Diffusers_FLUX_Config"] | components["schemas"]["Main_Diffusers_Flux2_Config"] | components["schemas"]["Main_Diffusers_CogView4_Config"] | components["schemas"]["Main_Diffusers_QwenImage_Config"] | components["schemas"]["Main_Diffusers_ZImage_Config"] | components["schemas"]["Main_Checkpoint_SD1_Config"] | components["schemas"]["Main_Checkpoint_SD2_Config"] | components["schemas"]["Main_Checkpoint_SDXL_Config"] | components["schemas"]["Main_Checkpoint_SDXLRefiner_Config"] | components["schemas"]["Main_Checkpoint_Flux2_Config"] | components["schemas"]["Main_Checkpoint_FLUX_Config"] | components["schemas"]["Main_Checkpoint_QwenImage_Config"] | components["schemas"]["Main_Checkpoint_ZImage_Config"] | components["schemas"]["Main_Checkpoint_Anima_Config"] | components["schemas"]["Main_BnBNF4_FLUX_Config"] | components["schemas"]["Main_GGUF_Flux2_Config"] | components["schemas"]["Main_GGUF_FLUX_Config"] | components["schemas"]["Main_GGUF_QwenImage_Config"] | components["schemas"]["Main_GGUF_ZImage_Config"] | components["schemas"]["VAE_Checkpoint_SD1_Config"] | components["schemas"]["VAE_Checkpoint_SD2_Config"] | components["schemas"]["VAE_Checkpoint_SDXL_Config"] | components["schemas"]["VAE_Checkpoint_FLUX_Config"] | components["schemas"]["VAE_Checkpoint_Flux2_Config"] | components["schemas"]["VAE_Checkpoint_QwenImage_Config"] | components["schemas"]["VAE_Checkpoint_Anima_Config"] | components["schemas"]["VAE_Diffusers_SD1_Config"] | components["schemas"]["VAE_Diffusers_SDXL_Config"] | components["schemas"]["VAE_Diffusers_Flux2_Config"] | components["schemas"]["PiDDecoder_Checkpoint_FLUX_Config"] | 
components["schemas"]["PiDDecoder_Checkpoint_Flux2_Config"] | components["schemas"]["PiDDecoder_Checkpoint_SD3_Config"] | components["schemas"]["PiDDecoder_Checkpoint_SDXL_Config"] | components["schemas"]["PiDDecoder_Checkpoint_QwenImage_Config"] | components["schemas"]["ControlNet_Checkpoint_SD1_Config"] | components["schemas"]["ControlNet_Checkpoint_SD2_Config"] | components["schemas"]["ControlNet_Checkpoint_SDXL_Config"] | components["schemas"]["ControlNet_Checkpoint_FLUX_Config"] | components["schemas"]["ControlNet_Checkpoint_ZImage_Config"] | components["schemas"]["ControlNet_Diffusers_SD1_Config"] | components["schemas"]["ControlNet_Diffusers_SD2_Config"] | components["schemas"]["ControlNet_Diffusers_SDXL_Config"] | components["schemas"]["ControlNet_Diffusers_FLUX_Config"] | components["schemas"]["LoRA_LyCORIS_SD1_Config"] | components["schemas"]["LoRA_LyCORIS_SD2_Config"] | components["schemas"]["LoRA_LyCORIS_SDXL_Config"] | components["schemas"]["LoRA_LyCORIS_Flux2_Config"] | components["schemas"]["LoRA_LyCORIS_FLUX_Config"] | components["schemas"]["LoRA_LyCORIS_ZImage_Config"] | components["schemas"]["LoRA_LyCORIS_QwenImage_Config"] | components["schemas"]["LoRA_LyCORIS_Anima_Config"] | components["schemas"]["LoRA_OMI_SDXL_Config"] | components["schemas"]["LoRA_OMI_FLUX_Config"] | components["schemas"]["LoRA_Diffusers_SD1_Config"] | components["schemas"]["LoRA_Diffusers_SD2_Config"] | components["schemas"]["LoRA_Diffusers_SDXL_Config"] | components["schemas"]["LoRA_Diffusers_Flux2_Config"] | components["schemas"]["LoRA_Diffusers_FLUX_Config"] | components["schemas"]["LoRA_Diffusers_ZImage_Config"] | components["schemas"]["ControlLoRA_LyCORIS_FLUX_Config"] | components["schemas"]["T5Encoder_T5Encoder_Config"] | components["schemas"]["T5Encoder_BnBLLMint8_Config"] | components["schemas"]["Qwen3Encoder_Qwen3Encoder_Config"] | components["schemas"]["Qwen3Encoder_Checkpoint_Config"] | components["schemas"]["Qwen3Encoder_GGUF_Config"] | 
components["schemas"]["Gemma2Encoder_Gemma2Encoder_Config"] | components["schemas"]["QwenVLEncoder_Diffusers_Config"] | components["schemas"]["QwenVLEncoder_Checkpoint_Config"] | components["schemas"]["TI_File_SD1_Config"] | components["schemas"]["TI_File_SD2_Config"] | components["schemas"]["TI_File_SDXL_Config"] | components["schemas"]["TI_Folder_SD1_Config"] | components["schemas"]["TI_Folder_SD2_Config"] | components["schemas"]["TI_Folder_SDXL_Config"] | components["schemas"]["IPAdapter_InvokeAI_SD1_Config"] | components["schemas"]["IPAdapter_InvokeAI_SD2_Config"] | components["schemas"]["IPAdapter_InvokeAI_SDXL_Config"] | components["schemas"]["IPAdapter_Checkpoint_SD1_Config"] | components["schemas"]["IPAdapter_Checkpoint_SD2_Config"] | components["schemas"]["IPAdapter_Checkpoint_SDXL_Config"] | components["schemas"]["IPAdapter_Checkpoint_FLUX_Config"] | components["schemas"]["T2IAdapter_Diffusers_SD1_Config"] | components["schemas"]["T2IAdapter_Diffusers_SDXL_Config"] | components["schemas"]["Spandrel_Checkpoint_Config"] | components["schemas"]["CLIPEmbed_Diffusers_G_Config"] | components["schemas"]["CLIPEmbed_Diffusers_L_Config"] | components["schemas"]["CLIPVision_Diffusers_Config"] | components["schemas"]["SigLIP_Diffusers_Config"] | components["schemas"]["FLUXRedux_Checkpoint_Config"] | components["schemas"]["LlavaOnevision_Diffusers_Config"] | components["schemas"]["TextLLM_Diffusers_Config"] | components["schemas"]["ExternalApiModelConfig"] | components["schemas"]["Unknown_Config"];
};
};
/** @description Bad request */
@@ -34735,7 +35955,7 @@ export interface operations {
* "upcast_attention": false
* }
*/
- "application/json": components["schemas"]["Main_Diffusers_SD1_Config"] | components["schemas"]["Main_Diffusers_SD2_Config"] | components["schemas"]["Main_Diffusers_SDXL_Config"] | components["schemas"]["Main_Diffusers_SDXLRefiner_Config"] | components["schemas"]["Main_Diffusers_SD3_Config"] | components["schemas"]["Main_Diffusers_FLUX_Config"] | components["schemas"]["Main_Diffusers_Flux2_Config"] | components["schemas"]["Main_Diffusers_CogView4_Config"] | components["schemas"]["Main_Diffusers_QwenImage_Config"] | components["schemas"]["Main_Diffusers_ZImage_Config"] | components["schemas"]["Main_Checkpoint_SD1_Config"] | components["schemas"]["Main_Checkpoint_SD2_Config"] | components["schemas"]["Main_Checkpoint_SDXL_Config"] | components["schemas"]["Main_Checkpoint_SDXLRefiner_Config"] | components["schemas"]["Main_Checkpoint_Flux2_Config"] | components["schemas"]["Main_Checkpoint_FLUX_Config"] | components["schemas"]["Main_Checkpoint_QwenImage_Config"] | components["schemas"]["Main_Checkpoint_ZImage_Config"] | components["schemas"]["Main_Checkpoint_Anima_Config"] | components["schemas"]["Main_BnBNF4_FLUX_Config"] | components["schemas"]["Main_GGUF_Flux2_Config"] | components["schemas"]["Main_GGUF_FLUX_Config"] | components["schemas"]["Main_GGUF_QwenImage_Config"] | components["schemas"]["Main_GGUF_ZImage_Config"] | components["schemas"]["VAE_Checkpoint_SD1_Config"] | components["schemas"]["VAE_Checkpoint_SD2_Config"] | components["schemas"]["VAE_Checkpoint_SDXL_Config"] | components["schemas"]["VAE_Checkpoint_FLUX_Config"] | components["schemas"]["VAE_Checkpoint_Flux2_Config"] | components["schemas"]["VAE_Checkpoint_QwenImage_Config"] | components["schemas"]["VAE_Checkpoint_Anima_Config"] | components["schemas"]["VAE_Diffusers_SD1_Config"] | components["schemas"]["VAE_Diffusers_SDXL_Config"] | components["schemas"]["VAE_Diffusers_Flux2_Config"] | components["schemas"]["ControlNet_Checkpoint_SD1_Config"] | 
components["schemas"]["ControlNet_Checkpoint_SD2_Config"] | components["schemas"]["ControlNet_Checkpoint_SDXL_Config"] | components["schemas"]["ControlNet_Checkpoint_FLUX_Config"] | components["schemas"]["ControlNet_Checkpoint_ZImage_Config"] | components["schemas"]["ControlNet_Diffusers_SD1_Config"] | components["schemas"]["ControlNet_Diffusers_SD2_Config"] | components["schemas"]["ControlNet_Diffusers_SDXL_Config"] | components["schemas"]["ControlNet_Diffusers_FLUX_Config"] | components["schemas"]["LoRA_LyCORIS_SD1_Config"] | components["schemas"]["LoRA_LyCORIS_SD2_Config"] | components["schemas"]["LoRA_LyCORIS_SDXL_Config"] | components["schemas"]["LoRA_LyCORIS_Flux2_Config"] | components["schemas"]["LoRA_LyCORIS_FLUX_Config"] | components["schemas"]["LoRA_LyCORIS_ZImage_Config"] | components["schemas"]["LoRA_LyCORIS_QwenImage_Config"] | components["schemas"]["LoRA_LyCORIS_Anima_Config"] | components["schemas"]["LoRA_OMI_SDXL_Config"] | components["schemas"]["LoRA_OMI_FLUX_Config"] | components["schemas"]["LoRA_Diffusers_SD1_Config"] | components["schemas"]["LoRA_Diffusers_SD2_Config"] | components["schemas"]["LoRA_Diffusers_SDXL_Config"] | components["schemas"]["LoRA_Diffusers_Flux2_Config"] | components["schemas"]["LoRA_Diffusers_FLUX_Config"] | components["schemas"]["LoRA_Diffusers_ZImage_Config"] | components["schemas"]["ControlLoRA_LyCORIS_FLUX_Config"] | components["schemas"]["T5Encoder_T5Encoder_Config"] | components["schemas"]["T5Encoder_BnBLLMint8_Config"] | components["schemas"]["Qwen3Encoder_Qwen3Encoder_Config"] | components["schemas"]["Qwen3Encoder_Checkpoint_Config"] | components["schemas"]["Qwen3Encoder_GGUF_Config"] | components["schemas"]["QwenVLEncoder_Diffusers_Config"] | components["schemas"]["QwenVLEncoder_Checkpoint_Config"] | components["schemas"]["TI_File_SD1_Config"] | components["schemas"]["TI_File_SD2_Config"] | components["schemas"]["TI_File_SDXL_Config"] | components["schemas"]["TI_Folder_SD1_Config"] | 
components["schemas"]["TI_Folder_SD2_Config"] | components["schemas"]["TI_Folder_SDXL_Config"] | components["schemas"]["IPAdapter_InvokeAI_SD1_Config"] | components["schemas"]["IPAdapter_InvokeAI_SD2_Config"] | components["schemas"]["IPAdapter_InvokeAI_SDXL_Config"] | components["schemas"]["IPAdapter_Checkpoint_SD1_Config"] | components["schemas"]["IPAdapter_Checkpoint_SD2_Config"] | components["schemas"]["IPAdapter_Checkpoint_SDXL_Config"] | components["schemas"]["IPAdapter_Checkpoint_FLUX_Config"] | components["schemas"]["T2IAdapter_Diffusers_SD1_Config"] | components["schemas"]["T2IAdapter_Diffusers_SDXL_Config"] | components["schemas"]["Spandrel_Checkpoint_Config"] | components["schemas"]["CLIPEmbed_Diffusers_G_Config"] | components["schemas"]["CLIPEmbed_Diffusers_L_Config"] | components["schemas"]["CLIPVision_Diffusers_Config"] | components["schemas"]["SigLIP_Diffusers_Config"] | components["schemas"]["FLUXRedux_Checkpoint_Config"] | components["schemas"]["LlavaOnevision_Diffusers_Config"] | components["schemas"]["TextLLM_Diffusers_Config"] | components["schemas"]["ExternalApiModelConfig"] | components["schemas"]["Unknown_Config"];
+ "application/json": components["schemas"]["Main_Diffusers_SD1_Config"] | components["schemas"]["Main_Diffusers_SD2_Config"] | components["schemas"]["Main_Diffusers_SDXL_Config"] | components["schemas"]["Main_Diffusers_SDXLRefiner_Config"] | components["schemas"]["Main_Diffusers_SD3_Config"] | components["schemas"]["Main_Diffusers_FLUX_Config"] | components["schemas"]["Main_Diffusers_Flux2_Config"] | components["schemas"]["Main_Diffusers_CogView4_Config"] | components["schemas"]["Main_Diffusers_QwenImage_Config"] | components["schemas"]["Main_Diffusers_ZImage_Config"] | components["schemas"]["Main_Checkpoint_SD1_Config"] | components["schemas"]["Main_Checkpoint_SD2_Config"] | components["schemas"]["Main_Checkpoint_SDXL_Config"] | components["schemas"]["Main_Checkpoint_SDXLRefiner_Config"] | components["schemas"]["Main_Checkpoint_Flux2_Config"] | components["schemas"]["Main_Checkpoint_FLUX_Config"] | components["schemas"]["Main_Checkpoint_QwenImage_Config"] | components["schemas"]["Main_Checkpoint_ZImage_Config"] | components["schemas"]["Main_Checkpoint_Anima_Config"] | components["schemas"]["Main_BnBNF4_FLUX_Config"] | components["schemas"]["Main_GGUF_Flux2_Config"] | components["schemas"]["Main_GGUF_FLUX_Config"] | components["schemas"]["Main_GGUF_QwenImage_Config"] | components["schemas"]["Main_GGUF_ZImage_Config"] | components["schemas"]["VAE_Checkpoint_SD1_Config"] | components["schemas"]["VAE_Checkpoint_SD2_Config"] | components["schemas"]["VAE_Checkpoint_SDXL_Config"] | components["schemas"]["VAE_Checkpoint_FLUX_Config"] | components["schemas"]["VAE_Checkpoint_Flux2_Config"] | components["schemas"]["VAE_Checkpoint_QwenImage_Config"] | components["schemas"]["VAE_Checkpoint_Anima_Config"] | components["schemas"]["VAE_Diffusers_SD1_Config"] | components["schemas"]["VAE_Diffusers_SDXL_Config"] | components["schemas"]["VAE_Diffusers_Flux2_Config"] | components["schemas"]["PiDDecoder_Checkpoint_FLUX_Config"] | 
components["schemas"]["PiDDecoder_Checkpoint_Flux2_Config"] | components["schemas"]["PiDDecoder_Checkpoint_SD3_Config"] | components["schemas"]["PiDDecoder_Checkpoint_SDXL_Config"] | components["schemas"]["PiDDecoder_Checkpoint_QwenImage_Config"] | components["schemas"]["ControlNet_Checkpoint_SD1_Config"] | components["schemas"]["ControlNet_Checkpoint_SD2_Config"] | components["schemas"]["ControlNet_Checkpoint_SDXL_Config"] | components["schemas"]["ControlNet_Checkpoint_FLUX_Config"] | components["schemas"]["ControlNet_Checkpoint_ZImage_Config"] | components["schemas"]["ControlNet_Diffusers_SD1_Config"] | components["schemas"]["ControlNet_Diffusers_SD2_Config"] | components["schemas"]["ControlNet_Diffusers_SDXL_Config"] | components["schemas"]["ControlNet_Diffusers_FLUX_Config"] | components["schemas"]["LoRA_LyCORIS_SD1_Config"] | components["schemas"]["LoRA_LyCORIS_SD2_Config"] | components["schemas"]["LoRA_LyCORIS_SDXL_Config"] | components["schemas"]["LoRA_LyCORIS_Flux2_Config"] | components["schemas"]["LoRA_LyCORIS_FLUX_Config"] | components["schemas"]["LoRA_LyCORIS_ZImage_Config"] | components["schemas"]["LoRA_LyCORIS_QwenImage_Config"] | components["schemas"]["LoRA_LyCORIS_Anima_Config"] | components["schemas"]["LoRA_OMI_SDXL_Config"] | components["schemas"]["LoRA_OMI_FLUX_Config"] | components["schemas"]["LoRA_Diffusers_SD1_Config"] | components["schemas"]["LoRA_Diffusers_SD2_Config"] | components["schemas"]["LoRA_Diffusers_SDXL_Config"] | components["schemas"]["LoRA_Diffusers_Flux2_Config"] | components["schemas"]["LoRA_Diffusers_FLUX_Config"] | components["schemas"]["LoRA_Diffusers_ZImage_Config"] | components["schemas"]["ControlLoRA_LyCORIS_FLUX_Config"] | components["schemas"]["T5Encoder_T5Encoder_Config"] | components["schemas"]["T5Encoder_BnBLLMint8_Config"] | components["schemas"]["Qwen3Encoder_Qwen3Encoder_Config"] | components["schemas"]["Qwen3Encoder_Checkpoint_Config"] | components["schemas"]["Qwen3Encoder_GGUF_Config"] | 
components["schemas"]["Gemma2Encoder_Gemma2Encoder_Config"] | components["schemas"]["QwenVLEncoder_Diffusers_Config"] | components["schemas"]["QwenVLEncoder_Checkpoint_Config"] | components["schemas"]["TI_File_SD1_Config"] | components["schemas"]["TI_File_SD2_Config"] | components["schemas"]["TI_File_SDXL_Config"] | components["schemas"]["TI_Folder_SD1_Config"] | components["schemas"]["TI_Folder_SD2_Config"] | components["schemas"]["TI_Folder_SDXL_Config"] | components["schemas"]["IPAdapter_InvokeAI_SD1_Config"] | components["schemas"]["IPAdapter_InvokeAI_SD2_Config"] | components["schemas"]["IPAdapter_InvokeAI_SDXL_Config"] | components["schemas"]["IPAdapter_Checkpoint_SD1_Config"] | components["schemas"]["IPAdapter_Checkpoint_SD2_Config"] | components["schemas"]["IPAdapter_Checkpoint_SDXL_Config"] | components["schemas"]["IPAdapter_Checkpoint_FLUX_Config"] | components["schemas"]["T2IAdapter_Diffusers_SD1_Config"] | components["schemas"]["T2IAdapter_Diffusers_SDXL_Config"] | components["schemas"]["Spandrel_Checkpoint_Config"] | components["schemas"]["CLIPEmbed_Diffusers_G_Config"] | components["schemas"]["CLIPEmbed_Diffusers_L_Config"] | components["schemas"]["CLIPVision_Diffusers_Config"] | components["schemas"]["SigLIP_Diffusers_Config"] | components["schemas"]["FLUXRedux_Checkpoint_Config"] | components["schemas"]["LlavaOnevision_Diffusers_Config"] | components["schemas"]["TextLLM_Diffusers_Config"] | components["schemas"]["ExternalApiModelConfig"] | components["schemas"]["Unknown_Config"];
};
};
/** @description Bad request */
diff --git a/invokeai/frontend/web/src/services/api/types.ts b/invokeai/frontend/web/src/services/api/types.ts
index 27c6fcbf3c3..c233e1da123 100644
--- a/invokeai/frontend/web/src/services/api/types.ts
+++ b/invokeai/frontend/web/src/services/api/types.ts
@@ -117,6 +117,8 @@ export type T5EncoderBnbQuantizedLlmInt8bModelConfig = Extract<
>;
export type Qwen3EncoderModelConfig = Extract<AnyModelConfig, { type: 'qwen3_encoder' }>;
export type QwenVLEncoderModelConfig = Extract<AnyModelConfig, { type: 'qwen_vl_encoder' }>;
+type Gemma2EncoderModelConfig = Extract<AnyModelConfig, { type: 'gemma2_encoder' }>;
+type PiDDecoderModelConfig = Extract<AnyModelConfig, { type: 'pid_decoder' }>;
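export type SpandrelImageToImageModelConfig = Extract<AnyModelConfig, { type: 'spandrel_image_to_image' }>;
export type CheckpointModelConfig = Extract<AnyModelConfig, { format: 'checkpoint' }>;
export type CLIPVisionModelConfig = Extract<AnyModelConfig, { type: 'clip_vision' }>;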
@@ -379,6 +381,14 @@ export const isQwenVLEncoderModelConfig = (config: AnyModelConfig): config is Qw
return config.type === 'qwen_vl_encoder';
};
+export const isGemma2EncoderModelConfig = (config: AnyModelConfig): config is Gemma2EncoderModelConfig => {
+ return config.type === 'gemma2_encoder';
+};
+
+export const isPiDDecoderModelConfig = (config: AnyModelConfig): config is PiDDecoderModelConfig => {
+ return config.type === 'pid_decoder';
+};
+
export const isCLIPEmbedModelConfigOrSubmodel = (
config: AnyModelConfig,
excludeSubmodels?: boolean