diff --git a/LICENSE-PiD.txt b/LICENSE-PiD.txt new file mode 100644 index 00000000000..81f434709b0 --- /dev/null +++ b/LICENSE-PiD.txt @@ -0,0 +1,68 @@ +PiD (Pixel Diffusion Decoder) — License notice + +Upstream project: https://github.com/nv-tlabs/PiD +Vendored under: invokeai/backend/pid/ + +================================================================================ +CODE (Apache License 2.0) +================================================================================ + +The PiD source code, including the `pid/_src/` subtree and the `pid/_ext/imaginaire/` +framework subset, is licensed under the Apache License, Version 2.0. + +Copyright 2026 NVIDIA CORPORATION & AFFILIATES. + +Portions of the framework (pid/_ext/imaginaire/) were originally adapted from +the cosmos-predict2.5 project (https://github.com/nvidia-cosmos/cosmos-predict2.5/). + +Files vendored into invokeai/backend/pid/ retain their original SPDX-License-Identifier +headers. The Apache 2.0 license text is available at: + + http://www.apache.org/licenses/LICENSE-2.0 + +================================================================================ +MODEL WEIGHTS (NVIDIA Source Code License v1 — non-commercial) +================================================================================ + +The pre-trained PiD decoder checkpoints distributed by NVIDIA at + + https://huggingface.co/nvidia/PiD + +are released under the NSCLv1 license. Per NSCLv1, the weights may only be used +for non-commercial (research or evaluation) purposes: + + https://huggingface.co/nvidia/PixelDiT-1300M-1024px/blob/main/LICENSE + +This restriction applies to the weights only, not to the InvokeAI source code +or the vendored PiD source code (which remain Apache 2.0). Users are responsible +for ensuring their use of the PiD weights complies with NSCLv1. 
+ +================================================================================ +LOCAL MODIFICATIONS +================================================================================ + +The following changes were applied to the upstream PiD subset when vendoring: + +* All `pid.*` imports were rewritten to `invokeai.backend.pid.*`. +* `pid/_src/configs/`, `pid/_src/tokenizers/`, `pid/_src/checkpointer/`, + `pid/_src/inference/_demo_*.py`, `from_*.py`, `create_dataset.py`, + `rae_generation.py`, and `scale_rae_generation.py` were dropped (not needed + for the decoder-only inference subset). +* `pid/_ext/imaginaire/checkpointer/`, `trainer.py`, `visualize/`, `flags.py`, + `config.py`, `types/`, `utils/easy_io/`, `utils/callback.py`, + `utils/config_helper.py`, `utils/validator{,_params}.py` and the + `lazy_config/omegaconf_patch.py` were dropped. +* The upstream `utils/log.py` (loguru-based) and `utils/misc.py` were replaced + with stdlib-based stubs covering only the API surface used by the decoder. +* `lazy_config/file_io.py` (iopath PathManager) and `lazy_config/registry.py` + (fvcore Registry) were replaced with stdlib-only implementations. +* `lazy_config/lazy.py` was reduced to a minimal `LazyCall`/`LazyConfig` stub; + the upstream yaml/cloudpickle/dill/detectron2 config save/load paths are + intentionally not supported. +* `lazy_config/instantiate.py` was reduced to a stdlib-only implementation; + the upstream omegaconf `DictConfig`/`ListConfig` branches were dropped, so + no `omegaconf` dependency is required. +* `_src/utils/model_loader.py` (which depended on Imaginaire's distributed + checkpointer + easy_io) and `_src/inference/inference_utils.py` (S3 / video + helpers) were removed; their decode-path equivalents are reimplemented in + `invokeai/backend/pid/decode.py`. 
diff --git a/docs/src/content/docs/features/pid-decode.mdx b/docs/src/content/docs/features/pid-decode.mdx new file mode 100644 index 00000000000..d630c318e29 --- /dev/null +++ b/docs/src/content/docs/features/pid-decode.mdx @@ -0,0 +1,76 @@ +--- +title: PiD Super-Resolution Decode +lastUpdated: 2026-07-01 +sidebar: + order: 5 +--- + +import { Steps, Aside, Tabs, TabItem } from '@astrojs/starlight/components' + +**PiD** (Pixel Diffusion Decoder) is an alternative way to turn a model's latents into an image. Instead of the usual VAE decode, it runs a short pixel-space diffusion that produces a **4× super-resolved** result in a single, few-step pass — so a 512×512 generation comes out as a detailed 2048×2048 image. + +Because it decodes in pixel space and is conditioned on your prompt, PiD often recovers finer texture and edge detail than a plain VAE decode followed by an upscaler. + + + +## Supported models + +PiD works with these base models: + +| Base model | PiD decoder to install | +|---|---| +| FLUX.1 | PiD Decoder FLUX | +| FLUX.2 Klein (4B / 9B) | PiD Decoder FLUX.2 | +| Stable Diffusion 3 | PiD Decoder SD3 | +| SDXL | PiD Decoder SDXL | +| Z-Image / Z-Image Turbo | **PiD Decoder FLUX** (Z-Image shares FLUX.1's VAE) | +| Qwen-Image | PiD Decoder Qwen-Image | + + + +## What you need to install + +PiD needs two extra models, both available in **Model Manager → Starter Models**: + +<Steps> +1. A **PiD Decoder** for your base model (e.g. *PiD Decoder FLUX (2K)*). Some bases offer a *2K* and a *2K-to-4K* preset; SDXL and Qwen-Image ship only the *2K-to-4K* preset. +2. The **Gemma 2 2B (PiD caption encoder)** — PiD uses it to condition the decode on your prompt. It installs automatically as a dependency of any PiD decoder, and is shared across all of them. +</Steps> + +Each PiD decoder is roughly 5 GB and the shared Gemma-2 encoder is roughly 5 GB. + +## Enabling PiD + +Open the **Generation** settings for a supported model and expand the advanced options. 
You'll find a **PiD** control with three modes: + +<Tabs> + <TabItem label="Off"> + Standard VAE decode. No PiD models required. + </TabItem> + <TabItem label="Fit"> + Generate at the requested size, decode 4× with PiD, then downscale the result back to the requested size. This is the safe default and works everywhere — the output matches your bounding box exactly, so it composites cleanly on the Canvas. + </TabItem> + <TabItem label="Native"> + Treat the requested dimensions as the **4× target**: the image is generated at target ÷ 4 and PiD's full 4× output is used directly (no downscale), preserving all of the added detail. Great when you want a large, highly-detailed result. + </TabItem> +</Tabs> + +When PiD mode is not *Off*, pick your **PiD Decoder** and **Gemma-2 Encoder** below the mode selector. The **PiD Steps** control (default 4) sets how many decode steps run — the released checkpoints are trained for 4. + +PiD is available in both the **Generate** tab (text-to-image) and on the **Canvas** (image-to-image), in both Fit and Native modes. + +## Tips & limitations + +- **Turn off "Scale Before Processing"** on the Canvas when using PiD — PiD already decodes at 4×, so pre-scaling would inflate the work and is blocked. +- **Inpaint / Outpaint** are not supported with PiD yet; use text-to-image or image-to-image. +- **SDXL Refiner** cannot be combined with PiD — disable one of them. +- PiD's memory use scales with the *output* resolution. A 2048px output needs only a little more headroom than a normal decode, but Native mode at large target sizes (e.g. a 4096px result) is significantly heavier. +- Turbo variants (e.g. Z-Image Turbo) work as usual — the low step count / no-CFG only affects generation; PiD's own step count is separate. + + diff --git a/invokeai/app/invocations/flux2_pid_decode.py b/invokeai/app/invocations/flux2_pid_decode.py new file mode 100644 index 00000000000..3cc325abdf4 --- /dev/null +++ b/invokeai/app/invocations/flux2_pid_decode.py @@ -0,0 +1,223 @@ +"""FLUX.2 Klein PiD decode invocation. 
+ +Replaces the regular FLUX.2 VAE decode with the PiD pixel-diffusion super-res +decoder (``PiD_res2k[to4k]_sr4x_official_flux2_distill_4step``). Produces a 4x +super-resolved image from a FLUX.2 latent in a single 4-step distill pass. The +4B and 9B FLUX.2 Klein variants share the same 32-channel VAE, so this one node +covers both. + +Latent layout (the important difference from the FLUX.1 node): + +* ``flux2_denoise`` stores an *unpacked* ``(B, 32, H/8, W/8)`` latent that is + already **BN-denormalized** (``x * bn_std + bn_mean`` is applied before the + unpack, see ``flux2_denoise.py``). That is exactly the raw latent the FLUX.2 + VAE's conv decoder consumes. +* PiD's FLUX.2 backbone expects the **packed** ``(B, 128, H/16, W/16)`` + representation (``lq_latent_channels=128``, ``latent_spatial_down_factor=16`` + in ``backend/pid/decode.py``). We therefore patchify the stored latent + (2x2 spatial patches folded into channels: 32*4 = 128) *before* handing it to + PiD - mirroring ``pack_flux2`` but keeping a spatial ``(B, C, h, w)`` layout + instead of the transformer's ``(B, seq, C)`` sequence layout. + +Denormalization: unlike FLUX.1 (single ``scale``/``shift``) and Z-Image +(checkpoint-specific ``scaling_factor``/``shift_factor``), the FLUX.2 VAE +(``AutoencoderKLFlux2``) exposes **no** scalar ``scaling_factor``/``shift_factor`` +at all - its only normalization is the per-channel BatchNorm applied/inverted +*outside* the VAE in ``flux2_denoise``. So the packed latent is already in PiD's +expected raw space and no further scaling is needed (identity fallbacks below). +We still accept an optional ``vae`` input and read the constants at runtime (like +the Z-Image node) so any future FLUX.2 VAE variant that does expose scalar +constants is honored automatically. 
+""" + +from contextlib import ExitStack + +import torch +from einops import rearrange +from PIL import Image +from transformers import PreTrainedModel, PreTrainedTokenizerBase + +from invokeai.app.invocations.baseinvocation import BaseInvocation, Classification, invocation +from invokeai.app.invocations.fields import ( + FieldDescriptions, + Input, + InputField, + LatentsField, + UIComponent, + WithBoard, + WithMetadata, +) +from invokeai.app.invocations.model import Gemma2EncoderField, PiDDecoderField, VAEField +from invokeai.app.invocations.primitives import ImageOutput +from invokeai.app.services.shared.invocation_context import InvocationContext +from invokeai.backend.model_manager.taxonomy import BaseModelType +from invokeai.backend.pid._src.networks.pid_net import PidNet +from invokeai.backend.pid.decode import ( + PiDDecodeConfig, + PiDDecoder, + encode_caption_for_pid, + estimate_pid_decode_working_memory, +) +from invokeai.backend.util.devices import TorchDevice + +# FLUX.2 uses per-channel BatchNorm (affine=False) for latent normalization, and +# that BN is already inverted in flux2_denoise before the latent is stored. The +# FLUX.2 VAE (AutoencoderKLFlux2) has no scalar scaling_factor/shift_factor, so +# the identity transform below is the correct default: the stored (packed) latent +# is already the raw representation PiD was trained on. +_FLUX2_VAE_SCALING_FACTOR_FALLBACK: float = 1.0 +_FLUX2_VAE_SHIFT_FACTOR_FALLBACK: float = 0.0 + + +@invocation( + "flux2_pid_decode", + title="Latents to Image - FLUX.2 + PiD (4x SR)", + tags=["latents", "image", "pid", "flux2", "klein", "upscale"], + category="latents", + version="1.0.0", + classification=Classification.Prototype, +) +class Flux2PiDDecodeInvocation(BaseInvocation, WithMetadata, WithBoard): + """Decode a FLUX.2 Klein latent with the PiD pixel-diffusion decoder. + + Produces a 4x super-resolved image in a single pass. 
The stored FLUX.2 latent + is patchified from ``(B, 32, H/8, W/8)`` to the ``(B, 128, H/16, W/16)`` layout + PiD's FLUX.2 backbone expects, then decoded directly (it is already in raw, + BN-denormalized space; see the module docstring). + """ + + latents: LatentsField = InputField(description=FieldDescriptions.latents, input=Input.Connection) + prompt: str = InputField( + description="Text prompt the latent was generated from. PiD conditions on it.", + ui_component=UIComponent.Textarea, + ) + gemma2_encoder: Gemma2EncoderField = InputField( + title="Gemma-2 Encoder", + description="Gemma-2 caption encoder. Required by PiD.", + input=Input.Connection, + ) + pid_decoder: PiDDecoderField = InputField( + title="PiD Decoder", + description="PiD FLUX.2 decoder checkpoint.", + input=Input.Connection, + ) + vae: VAEField | None = InputField( + default=None, + title="VAE", + description="FLUX.2 VAE, used only to read a scalar scaling_factor / shift_factor if one exists. " + "FLUX.2 normalises latents with BatchNorm (already inverted in flux2_denoise), so this is " + "normally an identity transform and the input can be left unconnected.", + input=Input.Connection, + ) + num_inference_steps: int = InputField( + default=4, + ge=1, + le=8, + description="Number of PiD distill steps. The released checkpoints are trained for 4.", + ) + seed: int = InputField(default=0, description="Seed for the PiD decoder's noise.") + + @torch.no_grad() + def invoke(self, context: InvocationContext) -> ImageOutput: + latents = context.tensors.load(self.latents.latents_name) + + # 1) Patchify the stored FLUX.2 latent into PiD's expected layout. + # flux2_denoise stores an unpacked (B, 32, H/8, W/8) latent; PiD's + # FLUX.2 backbone wants the packed (B, 128, H/16, W/16) form (32*4=128 + # channels, spatial halved). This mirrors pack_flux2's 2x2 patchify but + # keeps a spatial (B, C, h, w) layout rather than a (B, seq, C) sequence. 
+ if latents.shape[-3] != 32: + raise ValueError( + f"FLUX.2 PiD decode expected a 32-channel latent from flux2_denoise, got shape " + f"{tuple(latents.shape)}. The upstream node must output the unpacked FLUX.2 latent." + ) + packed = rearrange(latents, "b c (h ph) (w pw) -> b (c ph pw) h w", ph=2, pw=2) + context.logger.info( + f"FLUX.2 PiD decode: stored latent shape={tuple(latents.shape)} -> packed for PiD " + f"shape={tuple(packed.shape)} (expect [B, 128, H/16, W/16]) dtype={packed.dtype}" + ) + + # 2) Resolve the scalar scaling/shift (identity for current FLUX.2 VAEs). + scaling_factor = _FLUX2_VAE_SCALING_FACTOR_FALLBACK + shift_factor = _FLUX2_VAE_SHIFT_FACTOR_FALLBACK + if self.vae is not None: + vae_info = context.models.load(self.vae.vae) + with vae_info.model_on_device() as (_, vae): + config = getattr(vae, "config", None) + if config is not None and hasattr(config, "scaling_factor"): + scaling_factor = float(config.scaling_factor) + shift_factor = float(getattr(config, "shift_factor", None) or 0.0) + else: + scaling_factor = float(getattr(vae, "scale_factor", scaling_factor)) + shift_factor = float(getattr(vae, "shift_factor", shift_factor)) + del vae_info + TorchDevice.empty_cache() + + # 3) Encode caption with Gemma-2. + gemma_text_encoder_info = context.models.load(self.gemma2_encoder.text_encoder) + gemma_tokenizer_info = context.models.load(self.gemma2_encoder.tokenizer) + with ExitStack() as stack: + (_, gemma_encoder) = stack.enter_context(gemma_text_encoder_info.model_on_device()) + (_, gemma_tokenizer) = stack.enter_context(gemma_tokenizer_info.model_on_device()) + if not isinstance(gemma_encoder, PreTrainedModel): + raise TypeError(f"Expected PreTrainedModel for Gemma encoder, got {type(gemma_encoder).__name__}.") + if not isinstance(gemma_tokenizer, PreTrainedTokenizerBase): + raise TypeError( + f"Expected PreTrainedTokenizerBase for Gemma tokenizer, got {type(gemma_tokenizer).__name__}." 
+ ) + + device = TorchDevice.choose_torch_device() + encode_dtype = TorchDevice.choose_bfloat16_safe_dtype(device) + context.util.signal_progress("Encoding caption with Gemma-2") + caption_embs, caption_mask = encode_caption_for_pid( + [self.prompt], + tokenizer=gemma_tokenizer, + encoder=gemma_encoder, + device=device, + dtype=encode_dtype, + ) + caption_embs = caption_embs.detach().to("cpu") + caption_mask = caption_mask.detach().to("cpu") + del gemma_encoder, gemma_tokenizer + # Gemma is only needed for the one-shot caption encode above. Offload it from VRAM (keeping it in the RAM + # cache) so its ~5GB is freed before the PiD decoder loads. The cache offloads anything else it needs to + # fit the decode on its own, so we deliberately do NOT evict every other model here. + context.models.offload_from_vram(self.gemma2_encoder.text_encoder) + TorchDevice.empty_cache() + + # 4) Run PiD decode (the loader already returns a live PidNet). + pid_info = context.models.load(self.pid_decoder.decoder) + # The working-memory estimate scales with the OUTPUT pixel count, so it must see the PACKED latent + # (spatial H/16), not the unpacked one - otherwise it over-reserves by 4x. + estimated_working_memory = estimate_pid_decode_working_memory(packed, BaseModelType.Flux2) + with pid_info.model_on_device(working_mem_bytes=estimated_working_memory) as (_, pid_net): + if not isinstance(pid_net, PidNet): + raise TypeError(f"Expected PidNet for PiD decoder, got {type(pid_net).__name__}.") + device = TorchDevice.choose_torch_device() + dtype = next(iter(pid_net.parameters())).dtype + + # The packed latent is already BN-denormalized (raw VAE-input space); the scalar transform below is + # identity for current FLUX.2 VAEs and only bites if a VAE ever exposes real scalar constants. 
+ denorm_latent = packed.to(device=device, dtype=dtype) / scaling_factor + shift_factor + context.logger.info( + f"FLUX.2 PiD denorm_latent stats[min={denorm_latent.min().item():.3f} " + f"max={denorm_latent.max().item():.3f} mean={denorm_latent.mean().item():.3f}] " + f"using scale={scaling_factor:.4f} shift={shift_factor:.4f}" + ) + caption_embs = caption_embs.to(device=device, dtype=dtype) + + context.util.signal_progress("Running PiD decoder") + decoder = PiDDecoder(pid_net, backbone=BaseModelType.Flux2) + x0 = decoder.decode( + latent=denorm_latent, + caption_embs=caption_embs, + caption_mask=caption_mask, + config=PiDDecodeConfig(num_inference_steps=self.num_inference_steps, seed=self.seed), + ) + + TorchDevice.empty_cache() + + img = rearrange(x0[0].clamp(-1, 1), "c h w -> h w c") + img_pil = Image.fromarray((127.5 * (img + 1.0)).byte().cpu().numpy()) + image_dto = context.images.save(image=img_pil) + return ImageOutput.build(image_dto) diff --git a/invokeai/app/invocations/flux_pid_decode.py b/invokeai/app/invocations/flux_pid_decode.py new file mode 100644 index 00000000000..73d7c286a1d --- /dev/null +++ b/invokeai/app/invocations/flux_pid_decode.py @@ -0,0 +1,146 @@ +"""FLUX PiD decode invocation. + +Replaces the regular FLUX VAE decode with the PiD pixel-diffusion super-res +decoder (``PiD_res2k_sr4x_official_flux_distill_4step``). Produces a 4x +super-resolved image from a FLUX latent in a single 4-step distill pass. 
+""" + +from contextlib import ExitStack + +import torch +from einops import rearrange +from PIL import Image +from transformers import PreTrainedModel, PreTrainedTokenizerBase + +from invokeai.app.invocations.baseinvocation import BaseInvocation, Classification, invocation +from invokeai.app.invocations.fields import ( + FieldDescriptions, + Input, + InputField, + LatentsField, + UIComponent, + WithBoard, + WithMetadata, +) +from invokeai.app.invocations.model import Gemma2EncoderField, PiDDecoderField +from invokeai.app.invocations.primitives import ImageOutput +from invokeai.app.services.shared.invocation_context import InvocationContext +from invokeai.backend.flux.util import get_flux_ae_params +from invokeai.backend.model_manager.taxonomy import BaseModelType +from invokeai.backend.pid._src.networks.pid_net import PidNet +from invokeai.backend.pid.decode import ( + PiDDecodeConfig, + PiDDecoder, + encode_caption_for_pid, + estimate_pid_decode_working_memory, +) +from invokeai.backend.util.devices import TorchDevice + + +@invocation( + "flux_pid_decode", + title="Latents to Image - FLUX + PiD (4x SR)", + tags=["latents", "image", "pid", "flux", "upscale"], + category="latents", + version="1.0.0", + classification=Classification.Prototype, +) +class FluxPiDDecodeInvocation(BaseInvocation, WithMetadata, WithBoard): + """Decode a FLUX latent with the PiD pixel-diffusion decoder. + + The FLUX AutoEncoder usually denormalises the stored latent internally + before its conv decoder runs (`z / scale + shift`); we apply the same + transform manually here so PiD sees the raw latent it was trained on. + """ + + latents: LatentsField = InputField(description=FieldDescriptions.latents, input=Input.Connection) + prompt: str = InputField( + description="Text prompt the latent was generated from. PiD conditions on it.", + ui_component=UIComponent.Textarea, + ) + gemma2_encoder: Gemma2EncoderField = InputField( + title="Gemma-2 Encoder", + description="Gemma-2 caption encoder. 
Required by PiD.", + input=Input.Connection, + ) + pid_decoder: PiDDecoderField = InputField( + title="PiD Decoder", + description="PiD FLUX decoder checkpoint.", + input=Input.Connection, + ) + num_inference_steps: int = InputField( + default=4, + ge=1, + le=8, + description="Number of PiD distill steps. The released checkpoints are trained for 4.", + ) + seed: int = InputField(default=0, description="Seed for the PiD decoder's noise.") + + @torch.no_grad() + def invoke(self, context: InvocationContext) -> ImageOutput: + latents = context.tensors.load(self.latents.latents_name) + + # 1) Encode caption with Gemma-2. + gemma_text_encoder_info = context.models.load(self.gemma2_encoder.text_encoder) + gemma_tokenizer_info = context.models.load(self.gemma2_encoder.tokenizer) + with ExitStack() as stack: + (_, gemma_encoder) = stack.enter_context(gemma_text_encoder_info.model_on_device()) + (_, gemma_tokenizer) = stack.enter_context(gemma_tokenizer_info.model_on_device()) + if not isinstance(gemma_encoder, PreTrainedModel): + raise TypeError(f"Expected PreTrainedModel for Gemma encoder, got {type(gemma_encoder).__name__}.") + if not isinstance(gemma_tokenizer, PreTrainedTokenizerBase): + raise TypeError( + f"Expected PreTrainedTokenizerBase for Gemma tokenizer, got {type(gemma_tokenizer).__name__}." + ) + + device = TorchDevice.choose_torch_device() + encode_dtype = TorchDevice.choose_bfloat16_safe_dtype(device) + context.util.signal_progress("Encoding caption with Gemma-2") + caption_embs, caption_mask = encode_caption_for_pid( + [self.prompt], + tokenizer=gemma_tokenizer, + encoder=gemma_encoder, + device=device, + dtype=encode_dtype, + ) + caption_embs = caption_embs.detach().to("cpu") + + caption_mask = caption_mask.detach().to("cpu") + del gemma_encoder, gemma_tokenizer + # Gemma is only needed for the one-shot caption encode above. Offload it from VRAM (keeping it in the RAM + # cache) so its ~5GB is freed before the PiD decoder loads. 
The cache offloads anything else it needs to + # fit the decode on its own, so we deliberately do NOT evict every other model here. + context.models.offload_from_vram(self.gemma2_encoder.text_encoder) + TorchDevice.empty_cache() + + # 2) Run PiD decode (the loader already returns a live PidNet). + pid_info = context.models.load(self.pid_decoder.decoder) + estimated_working_memory = estimate_pid_decode_working_memory(latents, BaseModelType.Flux) + with pid_info.model_on_device(working_mem_bytes=estimated_working_memory) as (_, pid_net): + if not isinstance(pid_net, PidNet): + raise TypeError(f"Expected PidNet for PiD decoder, got {type(pid_net).__name__}.") + device = TorchDevice.choose_torch_device() + dtype = next(iter(pid_net.parameters())).dtype + + # FLUX latent is stored in normalised form (matching FluxAutoEncoder + # state); denormalise so PiD sees the same representation it + # consumed during training. + ae = get_flux_ae_params() + denorm_latent = latents.to(device=device, dtype=dtype) / ae.scale_factor + ae.shift_factor + caption_embs = caption_embs.to(device=device, dtype=dtype) + + context.util.signal_progress("Running PiD decoder") + decoder = PiDDecoder(pid_net, backbone=BaseModelType.Flux) + x0 = decoder.decode( + latent=denorm_latent, + caption_embs=caption_embs, + caption_mask=caption_mask, + config=PiDDecodeConfig(num_inference_steps=self.num_inference_steps, seed=self.seed), + ) + + TorchDevice.empty_cache() + + img = rearrange(x0[0].clamp(-1, 1), "c h w -> h w c") + img_pil = Image.fromarray((127.5 * (img + 1.0)).byte().cpu().numpy()) + image_dto = context.images.save(image=img_pil) + return ImageOutput.build(image_dto) diff --git a/invokeai/app/invocations/gemma2_encoder_loader.py b/invokeai/app/invocations/gemma2_encoder_loader.py new file mode 100644 index 00000000000..7273fd5619e --- /dev/null +++ b/invokeai/app/invocations/gemma2_encoder_loader.py @@ -0,0 +1,49 @@ +from invokeai.app.invocations.baseinvocation import ( + BaseInvocation, + 
BaseInvocationOutput, + Classification, + invocation, + invocation_output, +) +from invokeai.app.invocations.fields import InputField, OutputField +from invokeai.app.invocations.model import Gemma2EncoderField, ModelIdentifierField +from invokeai.app.services.shared.invocation_context import InvocationContext +from invokeai.backend.model_manager.taxonomy import ModelType, SubModelType + + +@invocation_output("gemma2_encoder_output") +class Gemma2EncoderOutput(BaseInvocationOutput): + gemma2_encoder: Gemma2EncoderField = OutputField( + description="Gemma-2 text encoder used by PiD decoders", + title="Gemma-2 Encoder", + ) + + +@invocation( + "gemma2_encoder_loader", + title="Gemma-2 Encoder - PiD", + tags=["model", "gemma2", "pid"], + category="model", + version="1.0.0", + classification=Classification.Prototype, +) +class Gemma2EncoderLoaderInvocation(BaseInvocation): + """Loads a Gemma-2 causal LM directory and exposes its tokenizer + text encoder + submodels for use by a PiD decode node.""" + + gemma2_model: ModelIdentifierField = InputField( + description="Gemma-2 model used to encode captions for PiD decoders.", + title="Gemma-2", + ui_model_type=ModelType.Gemma2Encoder, + ) + + def invoke(self, context: InvocationContext) -> Gemma2EncoderOutput: + key = self.gemma2_model.key + if not context.models.exists(key): + raise Exception(f"Unknown Gemma2 model: {key}") + + tokenizer = self.gemma2_model.model_copy(update={"submodel_type": SubModelType.Tokenizer}) + text_encoder = self.gemma2_model.model_copy(update={"submodel_type": SubModelType.TextEncoder}) + return Gemma2EncoderOutput( + gemma2_encoder=Gemma2EncoderField(tokenizer=tokenizer, text_encoder=text_encoder), + ) diff --git a/invokeai/app/invocations/model.py b/invokeai/app/invocations/model.py index 0c96cdb1d9d..a24e95984b9 100644 --- a/invokeai/app/invocations/model.py +++ b/invokeai/app/invocations/model.py @@ -92,6 +92,19 @@ class VAEField(BaseModel): seamless_axes: List[str] = Field(default_factory=list, 
description='Axes("x" and "y") to which apply seamless') +class Gemma2EncoderField(BaseModel): + """Field for the Gemma-2 text encoder used by PiD decoders.""" + + tokenizer: ModelIdentifierField = Field(description="Info to load tokenizer submodel") + text_encoder: ModelIdentifierField = Field(description="Info to load text_encoder submodel") + + +class PiDDecoderField(BaseModel): + """Field for a PiD (Pixel Diffusion Decoder) checkpoint.""" + + decoder: ModelIdentifierField = Field(description="Info to load PiD decoder checkpoint") + + class ControlLoRAField(LoRAField): img: ImageField = Field(description="Image to use in structural conditioning") diff --git a/invokeai/app/invocations/pid_decoder_loader.py b/invokeai/app/invocations/pid_decoder_loader.py new file mode 100644 index 00000000000..ff22702d3e2 --- /dev/null +++ b/invokeai/app/invocations/pid_decoder_loader.py @@ -0,0 +1,44 @@ +from invokeai.app.invocations.baseinvocation import ( + BaseInvocation, + BaseInvocationOutput, + Classification, + invocation, + invocation_output, +) +from invokeai.app.invocations.fields import InputField, OutputField +from invokeai.app.invocations.model import ModelIdentifierField, PiDDecoderField +from invokeai.app.services.shared.invocation_context import InvocationContext +from invokeai.backend.model_manager.taxonomy import ModelType + + +@invocation_output("pid_decoder_output") +class PiDDecoderOutput(BaseInvocationOutput): + pid_decoder: PiDDecoderField = OutputField( + description="PiD (Pixel Diffusion Decoder) checkpoint", + title="PiD Decoder", + ) + + +@invocation( + "pid_decoder_loader", + title="PiD Decoder - FLUX / FLUX.2 / SD3", + tags=["model", "pid", "decoder"], + category="model", + version="1.0.0", + classification=Classification.Prototype, +) +class PiDDecoderLoaderInvocation(BaseInvocation): + """Loads a PiD decoder checkpoint, outputting a PiDDecoderField for use + by the per-backbone PiD decode nodes.""" + + pid_decoder_model: ModelIdentifierField = 
InputField( + description="PiD decoder checkpoint matching the upstream backbone.", + title="PiD Decoder", + ui_model_type=ModelType.PiDDecoder, + ) + + def invoke(self, context: InvocationContext) -> PiDDecoderOutput: + key = self.pid_decoder_model.key + if not context.models.exists(key): + raise Exception(f"Unknown PiD decoder: {key}") + return PiDDecoderOutput(pid_decoder=PiDDecoderField(decoder=self.pid_decoder_model)) diff --git a/invokeai/app/invocations/pid_upscale.py b/invokeai/app/invocations/pid_upscale.py new file mode 100644 index 00000000000..1cca5943241 --- /dev/null +++ b/invokeai/app/invocations/pid_upscale.py @@ -0,0 +1,176 @@ +"""PiD super-resolution upscale invocation. + +Stand-alone 4x super-resolution path that does **not** require a Generator +latent. Pipeline:: + + image + -> FLUX VAE encode (denormalised back to raw) + -> Gemma-2 caption encode + -> PiD decoder (4x SR) + -> image (4x linear) + +This is the PiD analogue of ESRGAN / SUPIR: a one-shot, end-to-end pixel +upscaler. The FLUX VAE is also valid for Z-Image inputs (they share the +same 16-channel encoder). SD3 / FLUX.2 upscale paths would each need their +own invocation with the matching VAE encode and latent denormalisation; +they are deferred until we have the matching PiD checkpoints to validate +against. 
+""" + +from contextlib import ExitStack + +import einops +import torch +from PIL import Image +from transformers import PreTrainedModel, PreTrainedTokenizerBase + +from invokeai.app.invocations.baseinvocation import BaseInvocation, Classification, invocation +from invokeai.app.invocations.fields import ( + ImageField, + Input, + InputField, + UIComponent, + WithBoard, + WithMetadata, +) +from invokeai.app.invocations.flux_vae_encode import FluxVaeEncodeInvocation +from invokeai.app.invocations.model import Gemma2EncoderField, PiDDecoderField, VAEField +from invokeai.app.invocations.primitives import ImageOutput +from invokeai.app.services.shared.invocation_context import InvocationContext +from invokeai.backend.flux.util import get_flux_ae_params +from invokeai.backend.model_manager.taxonomy import BaseModelType +from invokeai.backend.pid._src.networks.pid_net import PidNet +from invokeai.backend.pid.decode import ( + PiDDecodeConfig, + PiDDecoder, + encode_caption_for_pid, + estimate_pid_decode_working_memory, +) +from invokeai.backend.stable_diffusion.diffusers_pipeline import image_resized_to_grid_as_tensor +from invokeai.backend.util.devices import TorchDevice + + +@invocation( + "pid_upscale", + title="PiD Upscale (4x) - FLUX VAE", + tags=["upscale", "image", "pid", "super-resolution", "flux"], + category="image", + version="1.0.0", + classification=Classification.Prototype, +) +class PiDUpscaleInvocation(BaseInvocation, WithMetadata, WithBoard): + """Upscale any image 4x via FLUX VAE encode + PiD pixel-diffusion decode. + + Works for source images that the FLUX VAE can encode (i.e. natural + photos / generated images at any size that lands on the VAE's 8-pixel + grid). The caption is used to condition the PiD decoder; leaving it + empty produces an unconditional decode and is the cheapest option, but + the model was distilled with rich captions and benefits from one. 
+ """ + + image: ImageField = InputField(description="Image to upscale.") + vae: VAEField = InputField( + description="FLUX-compatible VAE (FLUX.1, Z-Image, anything sharing the 16-channel encoder).", + input=Input.Connection, + ) + gemma2_encoder: Gemma2EncoderField = InputField( + title="Gemma-2 Encoder", + description="Gemma-2 caption encoder. Required by PiD.", + input=Input.Connection, + ) + pid_decoder: PiDDecoderField = InputField( + title="PiD Decoder", + description="PiD FLUX decoder checkpoint.", + input=Input.Connection, + ) + prompt: str = InputField( + default="", + description="Optional caption describing the image. Empty -> empty-caption decode.", + ui_component=UIComponent.Textarea, + ) + num_inference_steps: int = InputField( + default=4, + ge=1, + le=8, + description="Number of PiD distill steps. The released checkpoints are trained for 4.", + ) + seed: int = InputField(default=0, description="Seed for the PiD decoder's noise.") + + @torch.no_grad() + def invoke(self, context: InvocationContext) -> ImageOutput: + # 1) Encode the source image into a FLUX raw latent. + pil_image = context.images.get_pil(self.image.image_name).convert("RGB") + image_tensor = image_resized_to_grid_as_tensor(pil_image) + if image_tensor.dim() == 3: + image_tensor = einops.rearrange(image_tensor, "c h w -> 1 c h w") + + vae_info = context.models.load(self.vae.vae) + context.util.signal_progress("Running VAE encode") + normalised_latent = FluxVaeEncodeInvocation.vae_encode(vae_info=vae_info, image_tensor=image_tensor) + # FluxAutoEncoder.encode emits `scale * (raw - shift)`. PiD expects raw, + # so undo it. Holds for the Z-Image case as well (same VAE constants). + ae = get_flux_ae_params() + raw_latent = normalised_latent / ae.scale_factor + ae.shift_factor + raw_latent = raw_latent.to("cpu") # park while we swap to Gemma + del normalised_latent + TorchDevice.empty_cache() + + # 2) Encode the caption with Gemma-2. 
+ gemma_text_encoder_info = context.models.load(self.gemma2_encoder.text_encoder) + gemma_tokenizer_info = context.models.load(self.gemma2_encoder.tokenizer) + with ExitStack() as stack: + (_, gemma_encoder) = stack.enter_context(gemma_text_encoder_info.model_on_device()) + (_, gemma_tokenizer) = stack.enter_context(gemma_tokenizer_info.model_on_device()) + if not isinstance(gemma_encoder, PreTrainedModel): + raise TypeError(f"Expected PreTrainedModel for Gemma encoder, got {type(gemma_encoder).__name__}.") + if not isinstance(gemma_tokenizer, PreTrainedTokenizerBase): + raise TypeError( + f"Expected PreTrainedTokenizerBase for Gemma tokenizer, got {type(gemma_tokenizer).__name__}." + ) + device = TorchDevice.choose_torch_device() + encode_dtype = TorchDevice.choose_bfloat16_safe_dtype(device) + context.util.signal_progress("Encoding caption with Gemma-2") + caption_embs, caption_mask = encode_caption_for_pid( + [self.prompt], + tokenizer=gemma_tokenizer, + encoder=gemma_encoder, + device=device, + dtype=encode_dtype, + ) + caption_embs = caption_embs.detach().to("cpu") + + caption_mask = caption_mask.detach().to("cpu") + del gemma_encoder, gemma_tokenizer + # Gemma is only needed for the one-shot caption encode above. Offload it from VRAM (keeping it in the RAM + # cache) so its ~5GB is freed before the PiD decoder loads. The cache offloads anything else it needs to + # fit the decode on its own, so we deliberately do NOT evict every other model here. + context.models.offload_from_vram(self.gemma2_encoder.text_encoder) + TorchDevice.empty_cache() + + # 3) Run PiD decode (the loader already returns a live PidNet). 
+ pid_info = context.models.load(self.pid_decoder.decoder) + estimated_working_memory = estimate_pid_decode_working_memory(raw_latent, BaseModelType.Flux) + with pid_info.model_on_device(working_mem_bytes=estimated_working_memory) as (_, pid_net): + if not isinstance(pid_net, PidNet): + raise TypeError(f"Expected PidNet for PiD decoder, got {type(pid_net).__name__}.") + device = TorchDevice.choose_torch_device() + dtype = next(iter(pid_net.parameters())).dtype + + latent_on_device = raw_latent.to(device=device, dtype=dtype) + caption_embs = caption_embs.to(device=device, dtype=dtype) + + context.util.signal_progress("Running PiD decoder") + decoder = PiDDecoder(pid_net, backbone=BaseModelType.Flux) + x0 = decoder.decode( + latent=latent_on_device, + caption_embs=caption_embs, + caption_mask=caption_mask, + config=PiDDecodeConfig(num_inference_steps=self.num_inference_steps, seed=self.seed), + ) + + TorchDevice.empty_cache() + + img = einops.rearrange(x0[0].clamp(-1, 1), "c h w -> h w c") + img_pil = Image.fromarray((127.5 * (img + 1.0)).byte().cpu().numpy()) + image_dto = context.images.save(image=img_pil) + return ImageOutput.build(image_dto) diff --git a/invokeai/app/invocations/qwen_image_pid_decode.py b/invokeai/app/invocations/qwen_image_pid_decode.py new file mode 100644 index 00000000000..ddf410b5c3c --- /dev/null +++ b/invokeai/app/invocations/qwen_image_pid_decode.py @@ -0,0 +1,212 @@ +"""Qwen-Image PiD decode invocation. + +Replaces Qwen-Image's AutoencoderKLQwenImage decode with the PiD pixel-diffusion +super-res decoder (``PiD_res2kto4k_sr4x_official_qwenimage_distill_4step``). +Produces a 4x super-resolved image from a Qwen-Image latent in a single 4-step +distill pass. + +Qwen-Image is 16-channel at an 8x spatial down-factor (``_PER_BACKBONE[QwenImage]`` +in ``backend/pid/decode.py``: ``lq_latent_channels=16``, ``latent_spatial_down_factor=8``), +so no packing is needed. 
Two Qwen-specific wrinkles, both handled below and both +verified against the existing ``qwen_image_l2i`` node: + +1. **5D latent.** The denoiser stores a 5D ``(B, 16, num_frames, H, W)`` latent + (Qwen's VAE is a video-style autoencoder). PiD is a 2D image decoder, so we + drop the singleton temporal dim before decoding. +2. **Per-channel normalization.** Unlike FLUX / Z-Image / SDXL (a scalar + ``scaling_factor`` / ``shift``), the Qwen VAE normalizes each of the 16 latent + channels by its own ``latents_mean`` / ``latents_std`` vector. Denormalization + is therefore ``z_raw = z_norm * latents_std + latents_mean`` per channel - + exactly the transform ``qwen_image_l2i`` applies before ``vae.decode``, so PiD + (which replaces that decode) sees the same raw latent. We read the vectors from + the VAE config when a ``vae`` is wired, with the diffusers defaults as fallback. +""" + +from contextlib import ExitStack + +import torch +from einops import rearrange +from PIL import Image +from transformers import PreTrainedModel, PreTrainedTokenizerBase + +from invokeai.app.invocations.baseinvocation import BaseInvocation, Classification, invocation +from invokeai.app.invocations.fields import ( + FieldDescriptions, + Input, + InputField, + LatentsField, + UIComponent, + WithBoard, + WithMetadata, +) +from invokeai.app.invocations.model import Gemma2EncoderField, PiDDecoderField, VAEField +from invokeai.app.invocations.primitives import ImageOutput +from invokeai.app.services.shared.invocation_context import InvocationContext +from invokeai.backend.model_manager.taxonomy import BaseModelType +from invokeai.backend.pid._src.networks.pid_net import PidNet +from invokeai.backend.pid.decode import ( + PiDDecodeConfig, + PiDDecoder, + encode_caption_for_pid, + estimate_pid_decode_working_memory, +) +from invokeai.backend.util.devices import TorchDevice + +# Per-channel Qwen-Image VAE normalization constants (diffusers AutoencoderKLQwenImage defaults, z_dim=16). 
Used +# only as a fallback when no `vae` is wired; prefer the wired VAE config's latents_mean / latents_std at runtime. +_QWEN_VAE_LATENTS_MEAN_FALLBACK: list[float] = [ + -0.7571, -0.7089, -0.9113, 0.1075, -0.1745, 0.9653, -0.1517, 1.5508, + 0.4134, -0.0715, 0.5517, -0.3632, -0.1922, -0.9497, 0.2503, -0.2921, +] # fmt: skip +_QWEN_VAE_LATENTS_STD_FALLBACK: list[float] = [ + 2.8184, 1.4541, 2.3275, 2.6558, 1.2196, 1.7708, 2.6052, 2.0743, + 3.2687, 2.1526, 2.8652, 1.5579, 1.6382, 1.1253, 2.8251, 1.9160, +] # fmt: skip + + +@invocation( + "qwen_image_pid_decode", + title="Latents to Image - Qwen-Image + PiD (4x SR)", + tags=["latents", "image", "pid", "qwen-image", "upscale"], + category="latents", + version="1.0.0", + classification=Classification.Prototype, +) +class QwenImagePiDDecodeInvocation(BaseInvocation, WithMetadata, WithBoard): + """Decode a Qwen-Image latent with the PiD pixel-diffusion decoder. + + Produces a 4x super-resolved image in a single pass. The 5D Qwen latent is + reduced to 2D and per-channel denormalized (``z * std + mean``) before PiD. + """ + + latents: LatentsField = InputField(description=FieldDescriptions.latents, input=Input.Connection) + prompt: str = InputField( + description="Text prompt the latent was generated from. PiD conditions on it.", + ui_component=UIComponent.Textarea, + ) + gemma2_encoder: Gemma2EncoderField = InputField( + title="Gemma-2 Encoder", + description="Gemma-2 caption encoder. Required by PiD.", + input=Input.Connection, + ) + pid_decoder: PiDDecoderField = InputField( + title="PiD Decoder", + description="PiD Qwen-Image decoder checkpoint.", + input=Input.Connection, + ) + vae: VAEField | None = InputField( + default=None, + title="VAE", + description="Qwen-Image VAE, used to read the per-channel latents_mean / latents_std. 
" + "If omitted, the diffusers default Qwen-Image constants are used.", + input=Input.Connection, + ) + num_inference_steps: int = InputField( + default=4, + ge=1, + le=8, + description="Number of PiD distill steps. The released checkpoints are trained for 4.", + ) + seed: int = InputField(default=0, description="Seed for the PiD decoder's noise.") + + @torch.no_grad() + def invoke(self, context: InvocationContext) -> ImageOutput: + latents = context.tensors.load(self.latents.latents_name) + + # 1) Reduce the stored 5D (B, C, num_frames, H, W) latent to 2D (B, C, H, W). Qwen's VAE is a video-style + # autoencoder; for a single image num_frames == 1 (mirrors qwen_image_l2i's `img[:, :, 0]`). + if latents.ndim == 5: + if latents.shape[2] != 1: + raise ValueError( + f"Qwen-Image PiD decode expected a single temporal frame, got shape {tuple(latents.shape)}." + ) + latents = latents[:, :, 0] + if latents.ndim != 4 or latents.shape[-3] != 16: + raise ValueError(f"Qwen-Image PiD decode expected a 16-channel latent, got shape {tuple(latents.shape)}.") + + # 2) Resolve the per-channel latents_mean / latents_std used to denormalise the stored latent. + latents_mean = list(_QWEN_VAE_LATENTS_MEAN_FALLBACK) + latents_std = list(_QWEN_VAE_LATENTS_STD_FALLBACK) + if self.vae is not None: + vae_info = context.models.load(self.vae.vae) + with vae_info.model_on_device() as (_, vae): + config = getattr(vae, "config", None) + cfg_mean = getattr(config, "latents_mean", None) if config is not None else None + cfg_std = getattr(config, "latents_std", None) if config is not None else None + if cfg_mean is not None and cfg_std is not None: + latents_mean = [float(x) for x in cfg_mean] + latents_std = [float(x) for x in cfg_std] + del vae_info + TorchDevice.empty_cache() + if len(latents_mean) != 16 or len(latents_std) != 16: + raise ValueError( + f"Qwen-Image VAE latents_mean/latents_std must have 16 entries, got {len(latents_mean)}/{len(latents_std)}." 
+ ) + context.logger.info( + f"Qwen-Image PiD decode: latent shape={tuple(latents.shape)} (expect [B, 16, H/8, W/8]) " + f"dtype={latents.dtype} per-channel denorm (mean/std from {'VAE config' if self.vae else 'fallback'})" + ) + + # 3) Encode caption with Gemma-2. + gemma_text_encoder_info = context.models.load(self.gemma2_encoder.text_encoder) + gemma_tokenizer_info = context.models.load(self.gemma2_encoder.tokenizer) + with ExitStack() as stack: + (_, gemma_encoder) = stack.enter_context(gemma_text_encoder_info.model_on_device()) + (_, gemma_tokenizer) = stack.enter_context(gemma_tokenizer_info.model_on_device()) + if not isinstance(gemma_encoder, PreTrainedModel): + raise TypeError(f"Expected PreTrainedModel for Gemma encoder, got {type(gemma_encoder).__name__}.") + if not isinstance(gemma_tokenizer, PreTrainedTokenizerBase): + raise TypeError( + f"Expected PreTrainedTokenizerBase for Gemma tokenizer, got {type(gemma_tokenizer).__name__}." + ) + + device = TorchDevice.choose_torch_device() + encode_dtype = TorchDevice.choose_bfloat16_safe_dtype(device) + context.util.signal_progress("Encoding caption with Gemma-2") + caption_embs, caption_mask = encode_caption_for_pid( + [self.prompt], + tokenizer=gemma_tokenizer, + encoder=gemma_encoder, + device=device, + dtype=encode_dtype, + ) + caption_embs = caption_embs.detach().to("cpu") + caption_mask = caption_mask.detach().to("cpu") + del gemma_encoder, gemma_tokenizer + # Gemma is only needed for the one-shot caption encode above. Offload it from VRAM (keeping it in the RAM + # cache) so its ~5GB is freed before the PiD decoder loads. The cache offloads anything else it needs to + # fit the decode on its own, so we deliberately do NOT evict every other model here. + context.models.offload_from_vram(self.gemma2_encoder.text_encoder) + TorchDevice.empty_cache() + + # 4) Run PiD decode (the loader already returns a live PidNet). 
+ pid_info = context.models.load(self.pid_decoder.decoder) + estimated_working_memory = estimate_pid_decode_working_memory(latents, BaseModelType.QwenImage) + with pid_info.model_on_device(working_mem_bytes=estimated_working_memory) as (_, pid_net): + if not isinstance(pid_net, PidNet): + raise TypeError(f"Expected PidNet for PiD decoder, got {type(pid_net).__name__}.") + device = TorchDevice.choose_torch_device() + dtype = next(iter(pid_net.parameters())).dtype + + # Per-channel denormalise: z_raw = z_norm * std + mean (the transform qwen_image_l2i applies before + # vae.decode). mean/std are (16,) -> (1, 16, 1, 1) to broadcast over the (B, 16, H, W) latent. + mean_t = torch.tensor(latents_mean, device=device, dtype=dtype).view(1, 16, 1, 1) + std_t = torch.tensor(latents_std, device=device, dtype=dtype).view(1, 16, 1, 1) + denorm_latent = latents.to(device=device, dtype=dtype) * std_t + mean_t + caption_embs = caption_embs.to(device=device, dtype=dtype) + + context.util.signal_progress("Running PiD decoder") + decoder = PiDDecoder(pid_net, backbone=BaseModelType.QwenImage) + x0 = decoder.decode( + latent=denorm_latent, + caption_embs=caption_embs, + caption_mask=caption_mask, + config=PiDDecodeConfig(num_inference_steps=self.num_inference_steps, seed=self.seed), + ) + + TorchDevice.empty_cache() + + img = rearrange(x0[0].clamp(-1, 1), "c h w -> h w c") + img_pil = Image.fromarray((127.5 * (img + 1.0)).byte().cpu().numpy()) + image_dto = context.images.save(image=img_pil) + return ImageOutput.build(image_dto) diff --git a/invokeai/app/invocations/sd3_pid_decode.py b/invokeai/app/invocations/sd3_pid_decode.py new file mode 100644 index 00000000000..ef65b3d98d9 --- /dev/null +++ b/invokeai/app/invocations/sd3_pid_decode.py @@ -0,0 +1,139 @@ +"""SD3 PiD decode invocation. + +Replaces SD3's AutoencoderKL decode with the PiD pixel-diffusion super-res +decoder (``PiD_res2k_sr4x_official_sd3_distill_4step``). 
Produces a 4x +super-resolved image from an SD3 latent in a 4-step distill pass. +""" + +from contextlib import ExitStack + +import torch +from einops import rearrange +from PIL import Image +from transformers import PreTrainedModel, PreTrainedTokenizerBase + +from invokeai.app.invocations.baseinvocation import BaseInvocation, Classification, invocation +from invokeai.app.invocations.fields import ( + FieldDescriptions, + Input, + InputField, + LatentsField, + UIComponent, + WithBoard, + WithMetadata, +) +from invokeai.app.invocations.model import Gemma2EncoderField, PiDDecoderField +from invokeai.app.invocations.primitives import ImageOutput +from invokeai.app.services.shared.invocation_context import InvocationContext +from invokeai.backend.model_manager.taxonomy import BaseModelType +from invokeai.backend.pid._src.networks.pid_net import PidNet +from invokeai.backend.pid.decode import ( + PiDDecodeConfig, + PiDDecoder, + encode_caption_for_pid, + estimate_pid_decode_working_memory, +) +from invokeai.backend.util.devices import TorchDevice + +# SD3 medium VAE constants (see diffusers `stabilityai/stable-diffusion-3-medium` VAE config +# and PiD's pipeline_registry.py confirmation). +_SD3_VAE_SCALING_FACTOR: float = 1.5305 +_SD3_VAE_SHIFT_FACTOR: float = 0.0609 + + +@invocation( + "sd3_pid_decode", + title="Latents to Image - SD3 + PiD (4x SR)", + tags=["latents", "image", "pid", "sd3", "upscale"], + category="latents", + version="1.0.0", + classification=Classification.Prototype, +) +class SD3PiDDecodeInvocation(BaseInvocation, WithMetadata, WithBoard): + """Decode an SD3 latent with the PiD pixel-diffusion decoder.""" + + latents: LatentsField = InputField(description=FieldDescriptions.latents, input=Input.Connection) + prompt: str = InputField( + description="Text prompt the latent was generated from. 
PiD conditions on it.", + ui_component=UIComponent.Textarea, + ) + gemma2_encoder: Gemma2EncoderField = InputField( + title="Gemma-2 Encoder", + description="Gemma-2 caption encoder. Required by PiD.", + input=Input.Connection, + ) + pid_decoder: PiDDecoderField = InputField( + title="PiD Decoder", + description="PiD SD3 decoder checkpoint.", + input=Input.Connection, + ) + num_inference_steps: int = InputField( + default=4, + ge=1, + le=8, + description="Number of PiD distill steps. The released checkpoints are trained for 4.", + ) + seed: int = InputField(default=0, description="Seed for the PiD decoder's noise.") + + @torch.no_grad() + def invoke(self, context: InvocationContext) -> ImageOutput: + latents = context.tensors.load(self.latents.latents_name) + + gemma_text_encoder_info = context.models.load(self.gemma2_encoder.text_encoder) + gemma_tokenizer_info = context.models.load(self.gemma2_encoder.tokenizer) + with ExitStack() as stack: + (_, gemma_encoder) = stack.enter_context(gemma_text_encoder_info.model_on_device()) + (_, gemma_tokenizer) = stack.enter_context(gemma_tokenizer_info.model_on_device()) + if not isinstance(gemma_encoder, PreTrainedModel): + raise TypeError(f"Expected PreTrainedModel for Gemma encoder, got {type(gemma_encoder).__name__}.") + if not isinstance(gemma_tokenizer, PreTrainedTokenizerBase): + raise TypeError( + f"Expected PreTrainedTokenizerBase for Gemma tokenizer, got {type(gemma_tokenizer).__name__}." 
+ ) + + device = TorchDevice.choose_torch_device() + encode_dtype = TorchDevice.choose_bfloat16_safe_dtype(device) + context.util.signal_progress("Encoding caption with Gemma-2") + caption_embs, caption_mask = encode_caption_for_pid( + [self.prompt], + tokenizer=gemma_tokenizer, + encoder=gemma_encoder, + device=device, + dtype=encode_dtype, + ) + caption_embs = caption_embs.detach().to("cpu") + + caption_mask = caption_mask.detach().to("cpu") + del gemma_encoder, gemma_tokenizer + # Gemma is only needed for the one-shot caption encode above. Offload it from VRAM (keeping it in the RAM + # cache) so its ~5GB is freed before the PiD decoder loads. The cache offloads anything else it needs to + # fit the decode on its own, so we deliberately do NOT evict every other model here. + context.models.offload_from_vram(self.gemma2_encoder.text_encoder) + TorchDevice.empty_cache() + + pid_info = context.models.load(self.pid_decoder.decoder) + estimated_working_memory = estimate_pid_decode_working_memory(latents, BaseModelType.StableDiffusion3) + with pid_info.model_on_device(working_mem_bytes=estimated_working_memory) as (_, pid_net): + if not isinstance(pid_net, PidNet): + raise TypeError(f"Expected PidNet for PiD decoder, got {type(pid_net).__name__}.") + device = TorchDevice.choose_torch_device() + dtype = next(iter(pid_net.parameters())).dtype + + denorm_latent = latents.to(device=device, dtype=dtype) / _SD3_VAE_SCALING_FACTOR + _SD3_VAE_SHIFT_FACTOR + caption_embs = caption_embs.to(device=device, dtype=dtype) + + context.util.signal_progress("Running PiD decoder") + decoder = PiDDecoder(pid_net, backbone=BaseModelType.StableDiffusion3) + x0 = decoder.decode( + latent=denorm_latent, + caption_embs=caption_embs, + caption_mask=caption_mask, + config=PiDDecodeConfig(num_inference_steps=self.num_inference_steps, seed=self.seed), + ) + + TorchDevice.empty_cache() + + img = rearrange(x0[0].clamp(-1, 1), "c h w -> h w c") + img_pil = Image.fromarray((127.5 * (img + 
1.0)).byte().cpu().numpy()) + image_dto = context.images.save(image=img_pil) + return ImageOutput.build(image_dto) diff --git a/invokeai/app/invocations/sdxl_pid_decode.py b/invokeai/app/invocations/sdxl_pid_decode.py new file mode 100644 index 00000000000..9c98be8b422 --- /dev/null +++ b/invokeai/app/invocations/sdxl_pid_decode.py @@ -0,0 +1,185 @@ +"""SDXL PiD decode invocation. + +Replaces SDXL's AutoencoderKL decode with the PiD pixel-diffusion super-res +decoder (``PiD_res2kto4k_sr4x_official_sdxl_distill_4step``). Produces a 4x +super-resolved image from an SDXL latent in a single 4-step distill pass. + +SDXL latents are 4-channel at an 8x spatial down-factor (``_PER_BACKBONE[SDXL]`` +in ``backend/pid/decode.py``: ``lq_latent_channels=4``, ``latent_spatial_down_factor=8``), +so - unlike FLUX.2 - no patchify/pack is needed; the stored latent goes straight +to PiD after denormalization. + +Denormalization: SDXL's VAE (``AutoencoderKL``) exposes a scalar +``scaling_factor`` (0.13025) and no shift, so the stored latent is denormalized +as ``z / scaling_factor + shift`` (matching the FLUX / Z-Image nodes). We read +the constants from the VAE config at runtime when a ``vae`` is wired, falling +back to the documented SDXL constants otherwise. 
+""" + +from contextlib import ExitStack + +import torch +from einops import rearrange +from PIL import Image +from transformers import PreTrainedModel, PreTrainedTokenizerBase + +from invokeai.app.invocations.baseinvocation import BaseInvocation, Classification, invocation +from invokeai.app.invocations.fields import ( + FieldDescriptions, + Input, + InputField, + LatentsField, + UIComponent, + WithBoard, + WithMetadata, +) +from invokeai.app.invocations.model import Gemma2EncoderField, PiDDecoderField, VAEField +from invokeai.app.invocations.primitives import ImageOutput +from invokeai.app.services.shared.invocation_context import InvocationContext +from invokeai.backend.model_manager.taxonomy import BaseModelType +from invokeai.backend.pid._src.networks.pid_net import PidNet +from invokeai.backend.pid.decode import ( + PiDDecodeConfig, + PiDDecoder, + encode_caption_for_pid, + estimate_pid_decode_working_memory, +) +from invokeai.backend.util.devices import TorchDevice + +# SDXL VAE constants (diffusers `stabilityai/sdxl-vae` config: scaling_factor=0.13025, no shift). Prefer reading +# scaling_factor / shift_factor from the wired VAE config at runtime; use these only as a fallback. +_SDXL_VAE_SCALING_FACTOR_FALLBACK: float = 0.13025 +_SDXL_VAE_SHIFT_FACTOR_FALLBACK: float = 0.0 + + +@invocation( + "sdxl_pid_decode", + title="Latents to Image - SDXL + PiD (4x SR)", + tags=["latents", "image", "pid", "sdxl", "upscale"], + category="latents", + version="1.0.0", + classification=Classification.Prototype, +) +class SDXLPiDDecodeInvocation(BaseInvocation, WithMetadata, WithBoard): + """Decode an SDXL latent with the PiD pixel-diffusion decoder. + + Produces a 4x super-resolved image in a single pass. The SDXL latent is + 4-channel at an 8x down-factor, so it is denormalized (``z / scaling_factor``) + and handed straight to PiD - no packing needed. 
+ """ + + latents: LatentsField = InputField(description=FieldDescriptions.latents, input=Input.Connection) + prompt: str = InputField( + description="Text prompt the latent was generated from. PiD conditions on it.", + ui_component=UIComponent.Textarea, + ) + gemma2_encoder: Gemma2EncoderField = InputField( + title="Gemma-2 Encoder", + description="Gemma-2 caption encoder. Required by PiD.", + input=Input.Connection, + ) + pid_decoder: PiDDecoderField = InputField( + title="PiD Decoder", + description="PiD SDXL decoder checkpoint.", + input=Input.Connection, + ) + vae: VAEField | None = InputField( + default=None, + title="VAE", + description="SDXL VAE, used to read scaling_factor / shift_factor. " + "If omitted, the SDXL fallback constants (0.13025 / 0.0) are used.", + input=Input.Connection, + ) + num_inference_steps: int = InputField( + default=4, + ge=1, + le=8, + description="Number of PiD distill steps. The released checkpoints are trained for 4.", + ) + seed: int = InputField(default=0, description="Seed for the PiD decoder's noise.") + + @torch.no_grad() + def invoke(self, context: InvocationContext) -> ImageOutput: + latents = context.tensors.load(self.latents.latents_name) + + # 1) Resolve the VAE scaling/shift used to denormalise the stored SDXL latent. Prefer the VAE config; fall + # back to the documented SDXL constants (0.13025 / 0.0). 
+ scaling_factor = _SDXL_VAE_SCALING_FACTOR_FALLBACK + shift_factor = _SDXL_VAE_SHIFT_FACTOR_FALLBACK + if self.vae is not None: + vae_info = context.models.load(self.vae.vae) + with vae_info.model_on_device() as (_, vae): + config = getattr(vae, "config", None) + if config is not None and hasattr(config, "scaling_factor"): + scaling_factor = float(config.scaling_factor) + shift_factor = float(getattr(config, "shift_factor", None) or 0.0) + else: + scaling_factor = float(getattr(vae, "scale_factor", scaling_factor)) + shift_factor = float(getattr(vae, "shift_factor", shift_factor)) + del vae_info + TorchDevice.empty_cache() + context.logger.info( + f"SDXL PiD decode: latent shape={tuple(latents.shape)} (expect [B, 4, H/8, W/8]) dtype={latents.dtype} " + f"using scale={scaling_factor:.5f} shift={shift_factor:.5f}" + ) + + # 2) Encode caption with Gemma-2. + gemma_text_encoder_info = context.models.load(self.gemma2_encoder.text_encoder) + gemma_tokenizer_info = context.models.load(self.gemma2_encoder.tokenizer) + with ExitStack() as stack: + (_, gemma_encoder) = stack.enter_context(gemma_text_encoder_info.model_on_device()) + (_, gemma_tokenizer) = stack.enter_context(gemma_tokenizer_info.model_on_device()) + if not isinstance(gemma_encoder, PreTrainedModel): + raise TypeError(f"Expected PreTrainedModel for Gemma encoder, got {type(gemma_encoder).__name__}.") + if not isinstance(gemma_tokenizer, PreTrainedTokenizerBase): + raise TypeError( + f"Expected PreTrainedTokenizerBase for Gemma tokenizer, got {type(gemma_tokenizer).__name__}." 
+ ) + + device = TorchDevice.choose_torch_device() + encode_dtype = TorchDevice.choose_bfloat16_safe_dtype(device) + context.util.signal_progress("Encoding caption with Gemma-2") + caption_embs, caption_mask = encode_caption_for_pid( + [self.prompt], + tokenizer=gemma_tokenizer, + encoder=gemma_encoder, + device=device, + dtype=encode_dtype, + ) + caption_embs = caption_embs.detach().to("cpu") + caption_mask = caption_mask.detach().to("cpu") + del gemma_encoder, gemma_tokenizer + # Gemma is only needed for the one-shot caption encode above. Offload it from VRAM (keeping it in the RAM + # cache) so its ~5GB is freed before the PiD decoder loads. The cache offloads anything else it needs to + # fit the decode on its own, so we deliberately do NOT evict every other model here. + context.models.offload_from_vram(self.gemma2_encoder.text_encoder) + TorchDevice.empty_cache() + + # 3) Run PiD decode (the loader already returns a live PidNet). + pid_info = context.models.load(self.pid_decoder.decoder) + estimated_working_memory = estimate_pid_decode_working_memory(latents, BaseModelType.StableDiffusionXL) + with pid_info.model_on_device(working_mem_bytes=estimated_working_memory) as (_, pid_net): + if not isinstance(pid_net, PidNet): + raise TypeError(f"Expected PidNet for PiD decoder, got {type(pid_net).__name__}.") + device = TorchDevice.choose_torch_device() + dtype = next(iter(pid_net.parameters())).dtype + + # SDXL latents come out of the LDM in the VAE-normalized space; denormalise so PiD sees the raw latent. 
+ denorm_latent = latents.to(device=device, dtype=dtype) / scaling_factor + shift_factor + caption_embs = caption_embs.to(device=device, dtype=dtype) + + context.util.signal_progress("Running PiD decoder") + decoder = PiDDecoder(pid_net, backbone=BaseModelType.StableDiffusionXL) + x0 = decoder.decode( + latent=denorm_latent, + caption_embs=caption_embs, + caption_mask=caption_mask, + config=PiDDecodeConfig(num_inference_steps=self.num_inference_steps, seed=self.seed), + ) + + TorchDevice.empty_cache() + + img = rearrange(x0[0].clamp(-1, 1), "c h w -> h w c") + img_pil = Image.fromarray((127.5 * (img + 1.0)).byte().cpu().numpy()) + image_dto = context.images.save(image=img_pil) + return ImageOutput.build(image_dto) diff --git a/invokeai/app/invocations/z_image_pid_decode.py b/invokeai/app/invocations/z_image_pid_decode.py new file mode 100644 index 00000000000..e52a092c075 --- /dev/null +++ b/invokeai/app/invocations/z_image_pid_decode.py @@ -0,0 +1,204 @@ +"""Z-Image PiD decode invocation. + +Z-Image shares FLUX.1's 16-channel VAE, so the FLUX-trained PiD decoder +(``PiD_res2k_sr4x_official_flux_distill_4step``) is the correct choice for +Z-Image latents. This node replaces the regular Z-Image VAE decode with a +PiD super-resolution decode (4x scale, ~64×64 latent → 2048×2048 image +by default). 
+""" + +from contextlib import ExitStack + +import torch +from einops import rearrange +from PIL import Image +from transformers import PreTrainedModel, PreTrainedTokenizerBase + +from invokeai.app.invocations.baseinvocation import BaseInvocation, Classification, invocation +from invokeai.app.invocations.fields import ( + FieldDescriptions, + Input, + InputField, + LatentsField, + UIComponent, + WithBoard, + WithMetadata, +) +from invokeai.app.invocations.model import Gemma2EncoderField, PiDDecoderField, VAEField +from invokeai.app.invocations.primitives import ImageOutput +from invokeai.app.services.shared.invocation_context import InvocationContext +from invokeai.backend.model_manager.taxonomy import BaseModelType +from invokeai.backend.pid._src.networks.pid_net import PidNet +from invokeai.backend.pid.decode import ( + PiDDecodeConfig, + PiDDecoder, + encode_caption_for_pid, + estimate_pid_decode_working_memory, +) +from invokeai.backend.util.devices import TorchDevice + +# Fallback Z-Image VAE constants. PiD's pipeline_registry.py explicitly notes +# the exact values depend on the pretrained checkpoint, so prefer reading them +# from the VAE config at runtime (see `vae` input below) and use these only as +# a last resort. +_ZIMAGE_VAE_SCALING_FACTOR_FALLBACK: float = 0.3611 +_ZIMAGE_VAE_SHIFT_FACTOR_FALLBACK: float = 0.1159 + + +@invocation( + "z_image_pid_decode", + title="Latents to Image - Z-Image + PiD (4x SR)", + tags=["latents", "image", "pid", "z-image", "upscale"], + category="latents", + version="1.0.0", + classification=Classification.Prototype, +) +class ZImagePiDDecodeInvocation(BaseInvocation, WithMetadata, WithBoard): + """Decode a Z-Image latent with the PiD pixel-diffusion decoder. + + Produces a 4x super-resolved image in a single pass (Z-Image decoder is + trained on FLUX.1 latents; ``sr_scale=4`` with the FLUX VAE's 8x spatial + down-factor gives a 32x linear scale from latent to pixel). 
+ """ + + latents: LatentsField = InputField(description=FieldDescriptions.latents, input=Input.Connection) + prompt: str = InputField( + description="Text prompt the latent was generated from. PiD conditions on it.", + ui_component=UIComponent.Textarea, + ) + gemma2_encoder: Gemma2EncoderField = InputField( + title="Gemma-2 Encoder", + description="Gemma-2 caption encoder. Required by PiD.", + input=Input.Connection, + ) + pid_decoder: PiDDecoderField = InputField( + title="PiD Decoder", + description="PiD FLUX decoder checkpoint.", + input=Input.Connection, + ) + vae: VAEField | None = InputField( + default=None, + title="VAE", + description="Z-Image VAE used to read scaling_factor / shift_factor. " + "If omitted, the FLUX.1 fallback constants (0.3611 / 0.1159) are used.", + input=Input.Connection, + ) + num_inference_steps: int = InputField( + default=4, + ge=1, + le=8, + description="Number of PiD distill steps. The released checkpoints are trained for 4.", + ) + seed: int = InputField(default=0, description="Seed for the PiD decoder's noise.") + + @torch.no_grad() + def invoke(self, context: InvocationContext) -> ImageOutput: + latents = context.tensors.load(self.latents.latents_name) + + # 1) Resolve the VAE scaling/shift used to denormalise the stored + # Z-Image latent. PiD's pipeline_registry says these are + # checkpoint-specific for Z-Image, so prefer the VAE config when + # available and fall back to the FLUX values otherwise. + scaling_factor = _ZIMAGE_VAE_SCALING_FACTOR_FALLBACK + shift_factor = _ZIMAGE_VAE_SHIFT_FACTOR_FALLBACK + if self.vae is not None: + vae_info = context.models.load(self.vae.vae) + with vae_info.model_on_device() as (_, vae): + config = getattr(vae, "config", None) + if config is not None and hasattr(config, "scaling_factor"): + scaling_factor = float(config.scaling_factor) + shift_factor = float(getattr(config, "shift_factor", None) or 0.0) + else: + # FluxAutoEncoder stores the constants directly on the module. 
+ scaling_factor = float(getattr(vae, "scale_factor", scaling_factor)) + shift_factor = float(getattr(vae, "shift_factor", shift_factor)) + del vae_info + TorchDevice.empty_cache() + context.logger.info( + f"Z-Image PiD decode: latent shape={tuple(latents.shape)} dtype={latents.dtype} " + f"stats[min={latents.min().item():.3f} max={latents.max().item():.3f} " + f"mean={latents.mean().item():.3f}] using scale={scaling_factor:.4f} shift={shift_factor:.4f}" + ) + + # 2) Encode caption with Gemma-2. + gemma_text_encoder_info = context.models.load(self.gemma2_encoder.text_encoder) + gemma_tokenizer_info = context.models.load(self.gemma2_encoder.tokenizer) + with ExitStack() as stack: + (_, gemma_encoder) = stack.enter_context(gemma_text_encoder_info.model_on_device()) + (_, gemma_tokenizer) = stack.enter_context(gemma_tokenizer_info.model_on_device()) + if not isinstance(gemma_encoder, PreTrainedModel): + raise TypeError(f"Expected PreTrainedModel for Gemma encoder, got {type(gemma_encoder).__name__}.") + if not isinstance(gemma_tokenizer, PreTrainedTokenizerBase): + raise TypeError( + f"Expected PreTrainedTokenizerBase for Gemma tokenizer, got {type(gemma_tokenizer).__name__}." + ) + + device = TorchDevice.choose_torch_device() + encode_dtype = TorchDevice.choose_bfloat16_safe_dtype(device) + + context.util.signal_progress("Encoding caption with Gemma-2") + caption_embs, caption_mask = encode_caption_for_pid( + [self.prompt], + tokenizer=gemma_tokenizer, + encoder=gemma_encoder, + device=device, + dtype=encode_dtype, + ) + # Move off-device so Gemma's slot in the cache can be reclaimed. + caption_embs = caption_embs.detach().to("cpu") + + caption_mask = caption_mask.detach().to("cpu") + # Drop Gemma references so the cache can evict it before we load PiD. + del gemma_encoder, gemma_tokenizer + # Gemma is only needed for the one-shot caption encode above. Offload it from VRAM (keeping it in the RAM + # cache) so its ~5GB is freed before the PiD decoder loads. 
The cache offloads anything else it needs to + # fit the decode on its own, so we deliberately do NOT evict every other model here. + context.models.offload_from_vram(self.gemma2_encoder.text_encoder) + TorchDevice.empty_cache() + + # 2) Run PiD decode (the loader already returns a live PidNet). + pid_info = context.models.load(self.pid_decoder.decoder) + estimated_working_memory = estimate_pid_decode_working_memory(latents, BaseModelType.Flux) + with pid_info.model_on_device(working_mem_bytes=estimated_working_memory) as (_, pid_net): + if not isinstance(pid_net, PidNet): + raise TypeError(f"Expected PidNet for PiD decoder, got {type(pid_net).__name__}.") + device = TorchDevice.choose_torch_device() + dtype = next(iter(pid_net.parameters())).dtype + + # Z-Image latents come out of the diffusers pipeline normalised + # by the VAE constants. PiD expects the raw latent. + denorm_latent = latents.to(device=device, dtype=dtype) / scaling_factor + shift_factor + context.logger.info( + f"denorm_latent stats[min={denorm_latent.min().item():.3f} " + f"max={denorm_latent.max().item():.3f} mean={denorm_latent.mean().item():.3f} " + f"std={denorm_latent.float().std().item():.3f}]; " + f"caption_embs shape={tuple(caption_embs.shape)} " + f"stats[min={caption_embs.min().item():.3f} max={caption_embs.max().item():.3f} " + f"mean={caption_embs.mean().item():.3f} std={caption_embs.float().std().item():.3f}]" + ) + caption_embs = caption_embs.to(device=device, dtype=dtype) + + context.util.signal_progress("Running PiD decoder") + decoder = PiDDecoder(pid_net, backbone=BaseModelType.Flux) + x0 = decoder.decode( + latent=denorm_latent, + caption_embs=caption_embs, + caption_mask=caption_mask, + config=PiDDecodeConfig(num_inference_steps=self.num_inference_steps, seed=self.seed), + ) + context.logger.info( + f"PiD output stats: shape={tuple(x0.shape)} dtype={x0.dtype} " + f"raw[min={x0.min().item():.3f} max={x0.max().item():.3f} " + f"mean={x0.mean().item():.3f} 
std={x0.float().std().item():.3f}] " + f"nan_count={int(torch.isnan(x0).sum().item())} " + f"inf_count={int(torch.isinf(x0).sum().item())}" + ) + + TorchDevice.empty_cache() + + # x0 is [B, 3, H, W] in [-1, 1]; convert the first item to a PIL image. + img = rearrange(x0[0].clamp(-1, 1), "c h w -> h w c") + img_pil = Image.fromarray((127.5 * (img + 1.0)).byte().cpu().numpy()) + + image_dto = context.images.save(image=img_pil) + return ImageOutput.build(image_dto) diff --git a/invokeai/app/services/model_records/model_records_base.py b/invokeai/app/services/model_records/model_records_base.py index e06f8f2df91..56303dace97 100644 --- a/invokeai/app/services/model_records/model_records_base.py +++ b/invokeai/app/services/model_records/model_records_base.py @@ -30,6 +30,7 @@ ModelSourceType, ModelType, ModelVariantType, + PiDDecoderVariantType, Qwen3VariantType, QwenImageVariantType, SchedulerPredictionType, @@ -135,6 +136,7 @@ def validate_source_url(cls, v: Any) -> Optional[str]: | ZImageVariantType | QwenImageVariantType | Qwen3VariantType + | PiDDecoderVariantType ] = Field(description="The variant of the model.", default=None) prediction_type: Optional[SchedulerPredictionType] = Field( description="The prediction type of the model.", default=None diff --git a/invokeai/app/services/shared/invocation_context.py b/invokeai/app/services/shared/invocation_context.py index e38766d5ba2..44b657bca80 100644 --- a/invokeai/app/services/shared/invocation_context.py +++ b/invokeai/app/services/shared/invocation_context.py @@ -426,6 +426,22 @@ def load_by_attrs( self._util.signal_progress(message) return self._services.model_manager.load.load_model(configs[0], submodel_type) + def offload_from_vram(self, identifier: Union[str, "ModelIdentifierField"]) -> int: + """Move a model (and all of its submodels) from VRAM to RAM, freeing its VRAM but keeping it cached. + + Use this when an invocation is done with a model for the rest of the run - e.g. 
a one-shot text encoder - + so the next, larger load does not have to compete with it for VRAM. The model stays in the RAM cache, so + a subsequent load only re-streams it back to VRAM rather than rebuilding it from disk. + + Args: + identifier: The key or ModelField representing the model to offload. + + Returns: + The number of VRAM bytes freed. + """ + key = identifier if isinstance(identifier, str) else identifier.key + return self._services.model_manager.load.ram_cache.offload_model_from_vram(key) + @staticmethod def _raise_if_external(model: AnyModelConfig) -> None: if model.base == BaseModelType.External or model.format == ModelFormat.ExternalApi: diff --git a/invokeai/backend/model_manager/configs/factory.py b/invokeai/backend/model_manager/configs/factory.py index b176a6ff0b2..f68741ba4c6 100644 --- a/invokeai/backend/model_manager/configs/factory.py +++ b/invokeai/backend/model_manager/configs/factory.py @@ -28,6 +28,7 @@ ) from invokeai.backend.model_manager.configs.external_api import ExternalApiModelConfig from invokeai.backend.model_manager.configs.flux_redux import FLUXRedux_Checkpoint_Config +from invokeai.backend.model_manager.configs.gemma2_encoder import Gemma2Encoder_Gemma2Encoder_Config from invokeai.backend.model_manager.configs.identification_utils import NotAMatchError from invokeai.backend.model_manager.configs.ip_adapter import ( IPAdapter_Checkpoint_FLUX_Config, @@ -86,6 +87,13 @@ Main_GGUF_ZImage_Config, MainModelDefaultSettings, ) +from invokeai.backend.model_manager.configs.pid_decoder import ( + PiDDecoder_Checkpoint_Flux2_Config, + PiDDecoder_Checkpoint_FLUX_Config, + PiDDecoder_Checkpoint_QwenImage_Config, + PiDDecoder_Checkpoint_SD3_Config, + PiDDecoder_Checkpoint_SDXL_Config, +) from invokeai.backend.model_manager.configs.qwen3_encoder import ( Qwen3Encoder_Checkpoint_Config, Qwen3Encoder_GGUF_Config, @@ -207,6 +215,12 @@ Annotated[VAE_Diffusers_SD1_Config, VAE_Diffusers_SD1_Config.get_tag()], Annotated[VAE_Diffusers_SDXL_Config, 
VAE_Diffusers_SDXL_Config.get_tag()], Annotated[VAE_Diffusers_Flux2_Config, VAE_Diffusers_Flux2_Config.get_tag()], + # PiD Decoder - checkpoint format + Annotated[PiDDecoder_Checkpoint_FLUX_Config, PiDDecoder_Checkpoint_FLUX_Config.get_tag()], + Annotated[PiDDecoder_Checkpoint_Flux2_Config, PiDDecoder_Checkpoint_Flux2_Config.get_tag()], + Annotated[PiDDecoder_Checkpoint_SD3_Config, PiDDecoder_Checkpoint_SD3_Config.get_tag()], + Annotated[PiDDecoder_Checkpoint_SDXL_Config, PiDDecoder_Checkpoint_SDXL_Config.get_tag()], + Annotated[PiDDecoder_Checkpoint_QwenImage_Config, PiDDecoder_Checkpoint_QwenImage_Config.get_tag()], # ControlNet - checkpoint format Annotated[ControlNet_Checkpoint_SD1_Config, ControlNet_Checkpoint_SD1_Config.get_tag()], Annotated[ControlNet_Checkpoint_SD2_Config, ControlNet_Checkpoint_SD2_Config.get_tag()], @@ -250,6 +264,8 @@ Annotated[Qwen3Encoder_Qwen3Encoder_Config, Qwen3Encoder_Qwen3Encoder_Config.get_tag()], Annotated[Qwen3Encoder_Checkpoint_Config, Qwen3Encoder_Checkpoint_Config.get_tag()], Annotated[Qwen3Encoder_GGUF_Config, Qwen3Encoder_GGUF_Config.get_tag()], + # Gemma 2 Encoder (used by PiD) + Annotated[Gemma2Encoder_Gemma2Encoder_Config, Gemma2Encoder_Gemma2Encoder_Config.get_tag()], # Qwen VL Encoder (Qwen2.5-VL multimodal encoder for Qwen Image) Annotated[QwenVLEncoder_Diffusers_Config, QwenVLEncoder_Diffusers_Config.get_tag()], Annotated[QwenVLEncoder_Checkpoint_Config, QwenVLEncoder_Checkpoint_Config.get_tag()], diff --git a/invokeai/backend/model_manager/configs/gemma2_encoder.py b/invokeai/backend/model_manager/configs/gemma2_encoder.py new file mode 100644 index 00000000000..b922f4e060e --- /dev/null +++ b/invokeai/backend/model_manager/configs/gemma2_encoder.py @@ -0,0 +1,70 @@ +"""Model config for the Gemma-2-2b-it text encoder used by PiD. + +PiD's pre-trained decoders condition on Gemma-2-2b-it caption embeddings +(2304-dim). 
This config recognises a stand-alone diffusers/transformers +directory containing a Gemma2 causal LM (config.json + safetensors weights + +tokenizer files). + +The reference model PiD uses is `Efficient-Large-Model/gemma-2-2b-it`, an +ungated mirror of `google/gemma-2-2b-it`. Both produce a +`Gemma2ForCausalLM` config which is what we match on. + +License note: Gemma 2 is distributed under the Gemma Terms of Use (Google). +This config only describes how to recognise the model on disk; downloading +and accepting Gemma's license is the user's responsibility. +""" + +from typing import Any, Literal, Self + +from pydantic import Field + +from invokeai.backend.model_manager.configs.base import Config_Base +from invokeai.backend.model_manager.configs.identification_utils import ( + NotAMatchError, + raise_for_class_name, + raise_for_override_fields, + raise_if_not_dir, +) +from invokeai.backend.model_manager.model_on_disk import ModelOnDisk +from invokeai.backend.model_manager.taxonomy import BaseModelType, ModelFormat, ModelType + + +class Gemma2Encoder_Gemma2Encoder_Config(Config_Base): + """Standalone Gemma-2 causal LM directory used as a text encoder by PiD. 
+ + Expected directory layout (HuggingFace `from_pretrained`-compatible):: + + / + config.json # architectures: ["Gemma2ForCausalLM"] + tokenizer.json + tokenizer_config.json + model-*.safetensors # or model.safetensors / *.bin + """ + + base: Literal[BaseModelType.Any] = Field(default=BaseModelType.Any) + type: Literal[ModelType.Gemma2Encoder] = Field(default=ModelType.Gemma2Encoder) + format: Literal[ModelFormat.Gemma2Encoder] = Field(default=ModelFormat.Gemma2Encoder) + cpu_only: bool | None = Field(default=None, description="Whether this model should run on CPU only") + + @classmethod + def from_model_on_disk(cls, mod: ModelOnDisk, override_fields: dict[str, Any]) -> Self: + raise_if_not_dir(mod) + raise_for_override_fields(cls, override_fields) + + config_path = mod.path / "config.json" + if not config_path.exists(): + raise NotAMatchError(f"missing config.json at {config_path}") + + # Reject full diffusers pipelines (they have model_index.json at root). + if (mod.path / "model_index.json").exists(): + raise NotAMatchError("directory looks like a full diffusers pipeline, not a standalone Gemma2 encoder") + + # Architecture marker is the canonical signal. + raise_for_class_name(config_path, {"Gemma2ForCausalLM"}) + + # Sanity check that tokenizer files live alongside the model (PiD calls + # AutoTokenizer.from_pretrained on the same directory). + if not any((mod.path / f).exists() for f in ("tokenizer.json", "tokenizer.model")): + raise NotAMatchError("directory does not contain Gemma2 tokenizer files (tokenizer.json/tokenizer.model)") + + return cls(**override_fields) diff --git a/invokeai/backend/model_manager/configs/pid_decoder.py b/invokeai/backend/model_manager/configs/pid_decoder.py new file mode 100644 index 00000000000..7a677c0e5ca --- /dev/null +++ b/invokeai/backend/model_manager/configs/pid_decoder.py @@ -0,0 +1,234 @@ +"""Model configs for PiD (Pixel Diffusion Decoder) checkpoints. 
+ +PiD decoders are released by NVIDIA at https://huggingface.co/nvidia/PiD and +ship per supported backbone (FLUX.1, FLUX.2, SD3) in two resolution presets +(`res2k_sr4x_*` and `res2kto4k_sr4x_*`). See `LICENSE-PiD.txt` at the repo +root — code is Apache-2.0, weights are NSCLv1 (non-commercial / research). +""" + +import re +from typing import Any, Literal, Self + +from pydantic import Field + +from invokeai.backend.model_manager.configs.base import Checkpoint_Config_Base, Config_Base +from invokeai.backend.model_manager.configs.identification_utils import ( + NotAMatchError, + raise_for_override_fields, + raise_if_not_file, +) +from invokeai.backend.model_manager.model_on_disk import ModelOnDisk +from invokeai.backend.model_manager.taxonomy import ( + BaseModelType, + ModelFormat, + ModelType, + PiDDecoderVariantType, +) + +# Marker substring produced by `PidNet.lq_proj` (see +# invokeai/backend/pid/_src/networks/pid_net.py). The pretrained PixDiT_T2I +# weights do not contain `lq_proj`, so its presence in any key is diagnostic +# of a PiD-style checkpoint. We match by substring (not prefix) because the +# official `.pth` files keep PidDistillModel's `net.` prefix, so keys look +# like `net.lq_proj.layers.0.weight`. +_PID_MARKER_SUBSTRING = "lq_proj" + + +def _looks_like_pid_decoder(state_dict: dict[str | int, Any]) -> bool: + return any(isinstance(k, str) and _PID_MARKER_SUBSTRING in k for k in state_dict) + + +# The latent input projection (`lq_proj.latent_proj.0`) is a Conv2d whose +# in-channel count equals the backbone's latent channel count — the released +# sr4x checkpoints apply no spatial fold here, so the Conv's dim-1 is exactly +# `lq_latent_channels` (see `_PER_BACKBONE` in invokeai/backend/pid/decode.py): +# FLUX.1 / SD3 = 16, FLUX.2 = 128. This is the only architectural dimension +# that varies between backbones and is therefore a filename-independent +# discriminator between FLUX.2 and the 16-channel family. 
(FLUX.1 and SD3 are +# architecturally identical and cannot be told apart from the weights alone.) +# We match the key by suffix because the official `.pth` keep the `net.` prefix. +_LATENT_PROJ_KEY_SUFFIX = "lq_proj.latent_proj.0.weight" + +_LATENT_CHANNELS_TO_BASES: dict[int, set[BaseModelType]] = { + 4: {BaseModelType.StableDiffusionXL}, + 16: {BaseModelType.Flux, BaseModelType.StableDiffusion3, BaseModelType.QwenImage}, + 128: {BaseModelType.Flux2}, +} + + +def _latent_channels_from_state_dict(state_dict: dict[str | int, Any]) -> int | None: + """Read the backbone's latent channel count from the `lq_proj` input Conv. + + Returns None if the diagnostic weight is absent or not a 4D conv tensor. + """ + for k, v in state_dict.items(): + if isinstance(k, str) and k.endswith(_LATENT_PROJ_KEY_SUFFIX): + shape = getattr(v, "shape", None) + if shape is not None and len(shape) == 4: + return int(shape[1]) + return None + + +def _name_for_matching(mod: ModelOnDisk) -> str: + """Searchable name for backbone/variant heuristics. + + NVIDIA distributes PiD checkpoints as + ``PiD_res2k_sr4x_official__distill_4step/model_ema_bf16.pth`` — the + backbone + variant live in the *directory* name, not the weights filename. + We therefore match against both the filename and its parent directory. + """ + return f"{mod.path.parent.name} {mod.path.name}" + + +def _backbone_from_filename(name: str) -> BaseModelType | None: + """Heuristic backbone match against NVIDIA's checkpoint filename conventions. + + Returns None if no backbone can be inferred. + """ + n = name.lower() + # Order matters: 'flux2' must match before 'flux'. 
+ if re.search(r"\bflux[_-]?2\b|flux2", n): + return BaseModelType.Flux2 + if "flux" in n: + return BaseModelType.Flux + if re.search(r"\bsdxl\b|sdxl", n): + return BaseModelType.StableDiffusionXL + if re.search(r"qwen[_-]?image|qwenimage", n): + return BaseModelType.QwenImage + if re.search(r"\bsd[_-]?3\b|sd3", n): + return BaseModelType.StableDiffusion3 + return None + + +def _variant_from_filename(name: str) -> PiDDecoderVariantType: + """Map NVIDIA's `res2k_sr4x` / `res2kto4k_sr4x` filename slice to a variant. + + Defaults to ``Res2k_Sr4x`` when no clear marker is present. + """ + n = name.lower() + if "res2kto4k" in n or "res2k_to_4k" in n or "res2k_to4k" in n: + return PiDDecoderVariantType.Res2kTo4k_Sr4x + return PiDDecoderVariantType.Res2k_Sr4x + + +class PiDDecoder_Checkpoint_Config_Base(Checkpoint_Config_Base): + """Shared logic for PiD decoder checkpoint configs. + + Concrete subclasses pin `base` to a specific backbone. Backbone matching is + driven primarily by the latent channel count read from the weights, with the + filename / directory name as a tie-breaker for the architecturally identical + FLUX.1 / SD3 pair. `variant` is carried as data without participating in the + discriminator tag (one config class per backbone). + """ + + type: Literal[ModelType.PiDDecoder] = Field(default=ModelType.PiDDecoder) + format: Literal[ModelFormat.Checkpoint] = Field(default=ModelFormat.Checkpoint) + + @classmethod + def from_model_on_disk(cls, mod: ModelOnDisk, override_fields: dict[str, Any]) -> Self: + raise_if_not_file(mod) + raise_for_override_fields(cls, override_fields) + + state_dict = mod.load_state_dict() + if not _looks_like_pid_decoder(state_dict): + raise NotAMatchError("state dict does not look like a PiD decoder (no 'lq_proj.*' keys)") + + # Whether the caller explicitly pinned a base (e.g. a starter-model install passes base=sd-3). + # In the ambiguous 16-channel FLUX.1/SD3 case this override is trusted when the filename is silent. 
+ had_base_override = override_fields.get("base") is not None + cls._validate_base(mod, state_dict, had_base_override=had_base_override) + + variant = override_fields.pop("variant", None) or _variant_from_filename(_name_for_matching(mod)) + return cls(**override_fields, variant=variant) + + @classmethod + def _validate_base( + cls, mod: ModelOnDisk, state_dict: dict[str | int, Any], *, had_base_override: bool = False + ) -> None: + """Confirm this checkpoint belongs to the config's pinned backbone. + + The latent channel count (read from the weights) is authoritative and + separates FLUX.2 (128ch) from the 16ch family. FLUX.1 and SD3 share an + identical architecture, so within the 16ch family we fall back to the + filename / directory name, defaulting to FLUX.1 when it is silent. + + ``had_base_override`` is True when the caller explicitly pinned ``base`` + (e.g. a starter-model install). In the ambiguous 16ch case, a trusted + override wins over the FLUX.1 default — necessary because the HF + single-file download renames the parent directory, dropping the + ``…official_sd3_distill…`` hint that would otherwise identify SD3. + """ + expected_base = cls.model_fields["base"].default + channels = _latent_channels_from_state_dict(state_dict) + + if channels is not None: + candidate_bases = _LATENT_CHANNELS_TO_BASES.get(channels) + if candidate_bases is None: + raise NotAMatchError( + f"PiD checkpoint has {channels} latent channels; no supported backbone uses this " + "(supported: 16 for FLUX.1/SD3, 128 for FLUX.2)" + ) + if expected_base not in candidate_bases: + raise NotAMatchError(f"latent channels={channels} do not match backbone {expected_base}") + if len(candidate_bases) > 1: + # Ambiguous 16ch family — disambiguate FLUX.1 vs SD3 by name. 
+ named_base = _backbone_from_filename(_name_for_matching(mod)) + if named_base in candidate_bases: + if named_base is not expected_base: + raise NotAMatchError(f"name indicates {named_base}, not {expected_base}") + elif had_base_override: + # Name is silent, but the caller explicitly pinned this base → trust it. + return + elif expected_base is not BaseModelType.Flux: + # Name gives no usable hint and no override → default the family to FLUX.1. + raise NotAMatchError("ambiguous 16-channel PiD checkpoint; defaulting to FLUX.1") + return + + # No diagnostic weight (unexpected) → fall back to filename-only matching. + inferred_base = _backbone_from_filename(_name_for_matching(mod)) + if inferred_base is None: + raise NotAMatchError( + "cannot determine PiD decoder backbone from weights or filename (expected one of: flux, flux2, sd3)" + ) + if inferred_base is not expected_base: + raise NotAMatchError(f"backbone is {inferred_base}, not {expected_base}") + + +class PiDDecoder_Checkpoint_FLUX_Config(PiDDecoder_Checkpoint_Config_Base, Config_Base): + """PiD decoder for the FLUX.1 backbone (16-channel latent).""" + + base: Literal[BaseModelType.Flux] = Field(default=BaseModelType.Flux) + variant: PiDDecoderVariantType = Field(description="Resolution preset of the PiD decoder checkpoint.") + + +class PiDDecoder_Checkpoint_Flux2_Config(PiDDecoder_Checkpoint_Config_Base, Config_Base): + """PiD decoder for the FLUX.2 backbone (128-channel latent).""" + + base: Literal[BaseModelType.Flux2] = Field(default=BaseModelType.Flux2) + variant: PiDDecoderVariantType = Field(description="Resolution preset of the PiD decoder checkpoint.") + + +class PiDDecoder_Checkpoint_SD3_Config(PiDDecoder_Checkpoint_Config_Base, Config_Base): + """PiD decoder for the Stable Diffusion 3 backbone (16-channel latent).""" + + base: Literal[BaseModelType.StableDiffusion3] = Field(default=BaseModelType.StableDiffusion3) + variant: PiDDecoderVariantType = Field(description="Resolution preset of the PiD 
decoder checkpoint.") + + +class PiDDecoder_Checkpoint_SDXL_Config(PiDDecoder_Checkpoint_Config_Base, Config_Base): + """PiD decoder for the SDXL backbone (4-channel latent).""" + + base: Literal[BaseModelType.StableDiffusionXL] = Field(default=BaseModelType.StableDiffusionXL) + variant: PiDDecoderVariantType = Field(description="Resolution preset of the PiD decoder checkpoint.") + + +class PiDDecoder_Checkpoint_QwenImage_Config(PiDDecoder_Checkpoint_Config_Base, Config_Base): + """PiD decoder for the Qwen-Image backbone (16-channel latent). + + Shares the 16-channel latent shape with FLUX.1 and SD3, so it relies on the same + filename / directory-name disambiguation (or a trusted explicit ``base`` override) + as SD3 - see ``_validate_base``. + """ + + base: Literal[BaseModelType.QwenImage] = Field(default=BaseModelType.QwenImage) + variant: PiDDecoderVariantType = Field(description="Resolution preset of the PiD decoder checkpoint.") diff --git a/invokeai/backend/model_manager/configs/text_llm.py b/invokeai/backend/model_manager/configs/text_llm.py index a0fb3e009f9..edac40ea57a 100644 --- a/invokeai/backend/model_manager/configs/text_llm.py +++ b/invokeai/backend/model_manager/configs/text_llm.py @@ -41,6 +41,14 @@ def from_model_on_disk(cls, mod: ModelOnDisk, override_fields: dict[str, Any]) - if not class_name.endswith("ForCausalLM"): raise NotAMatchError(f"model architecture '{class_name}' is not a causal language model") + # Defer to specialised text-encoder configs for models that have a + # dedicated wrapper. Without this both configs match the same + # directory and the user ends up with a `text_llm` entry even though + # a more specific type exists. 
+ _SPECIALISED_CAUSAL_LM_ARCHITECTURES = {"Gemma2ForCausalLM"} + if class_name in _SPECIALISED_CAUSAL_LM_ARCHITECTURES: + raise NotAMatchError(f"architecture '{class_name}' is handled by a dedicated encoder config, not TextLLM") + # Verify tokenizer files exist to avoid runtime failures tokenizer_files = {"tokenizer.json", "tokenizer.model", "tokenizer_config.json"} if not any((mod.path / f).exists() for f in tokenizer_files): diff --git a/invokeai/backend/model_manager/load/model_cache/model_cache.py b/invokeai/backend/model_manager/load/model_cache/model_cache.py index e3a0928e52b..7808104a047 100644 --- a/invokeai/backend/model_manager/load/model_cache/model_cache.py +++ b/invokeai/backend/model_manager/load/model_cache/model_cache.py @@ -929,3 +929,23 @@ def drop_model(self, model_key: str) -> int: gc.collect() TorchDevice.empty_cache() return len(dropped) + + def offload_model_from_vram(self, model_key: str) -> int: + """Move a model (and its submodels) from VRAM to RAM without dropping it from the cache. + + Unlike `drop_model`, the cache entry is kept, so the model stays resident in RAM and the next load does + not have to rebuild it from disk - only re-stream its weights back to VRAM. This is useful for freeing + VRAM after a one-shot use (e.g. a text encoder that has already produced its embeddings) before a much + larger model loads. Locked (in-use) entries are skipped. + + Returns the number of VRAM bytes freed. 
+ """ + prefix = f"{model_key}:" + bytes_freed = 0 + for key, entry in list(self._cached_models.items()): + if (key == model_key or key.startswith(prefix)) and not entry.is_locked: + bytes_freed += self._move_model_to_ram(entry, entry.cached_model.total_bytes()) + if bytes_freed > 0: + gc.collect() + TorchDevice.empty_cache() + return bytes_freed diff --git a/invokeai/backend/model_manager/load/model_loaders/gemma2_encoder.py b/invokeai/backend/model_manager/load/model_loaders/gemma2_encoder.py new file mode 100644 index 00000000000..b9db92b31a6 --- /dev/null +++ b/invokeai/backend/model_manager/load/model_loaders/gemma2_encoder.py @@ -0,0 +1,61 @@ +"""Loader for the Gemma-2 text encoder used by PiD. + +PiD only consumes the decoder block of the causal LM (see +`pid/_src/models/pixeldit_model.py::_load_text_encoder`: +`AutoModelForCausalLM.from_pretrained(...).get_decoder()`), so this loader +returns the decoder sub-module for the `TextEncoder` submodel and the +tokenizer for the `Tokenizer` submodel. 
+""" + +from pathlib import Path +from typing import Optional + +from transformers import AutoModelForCausalLM, AutoTokenizer + +from invokeai.backend.model_manager.configs.factory import AnyModelConfig +from invokeai.backend.model_manager.configs.gemma2_encoder import Gemma2Encoder_Gemma2Encoder_Config +from invokeai.backend.model_manager.load.load_default import ModelLoader +from invokeai.backend.model_manager.load.model_loader_registry import ModelLoaderRegistry +from invokeai.backend.model_manager.taxonomy import AnyModel, BaseModelType, ModelFormat, ModelType, SubModelType +from invokeai.backend.util.devices import TorchDevice + + +@ModelLoaderRegistry.register(base=BaseModelType.Any, type=ModelType.Gemma2Encoder, format=ModelFormat.Gemma2Encoder) +class Gemma2EncoderLoader(ModelLoader): + """Loads a Gemma-2 causal LM directory and exposes its decoder + tokenizer.""" + + def _load_model( + self, + config: AnyModelConfig, + submodel_type: Optional[SubModelType] = None, + ) -> AnyModel: + if not isinstance(config, Gemma2Encoder_Gemma2Encoder_Config): + raise ValueError("Only Gemma2Encoder_Gemma2Encoder_Config models are supported here.") + + model_path = Path(config.path) + + match submodel_type: + case SubModelType.Tokenizer: + return AutoTokenizer.from_pretrained(model_path, local_files_only=True) + case SubModelType.TextEncoder: + target_device = TorchDevice.choose_torch_device() + model_dtype = TorchDevice.choose_bfloat16_safe_dtype(target_device) + causal_lm = AutoModelForCausalLM.from_pretrained( + model_path, + torch_dtype=model_dtype, + low_cpu_mem_usage=True, + local_files_only=True, + ) + # PiD only ever uses the decoder block — the transformer stack + # without the LM head. Upstream calls `.get_decoder()`, but + # transformers 4.56 returns None for Gemma2, so we reach for + # `.model` (the underlying Gemma2Model) directly and let the + # rest of `causal_lm` (lm_head etc.) be garbage-collected. 
+ inner = getattr(causal_lm, "get_decoder", lambda: None)() or causal_lm.model + inner.eval() + inner.requires_grad_(False) + return inner + + raise ValueError( + f"Unsupported submodel type for Gemma2 encoder: {submodel_type!r}. Expected Tokenizer or TextEncoder." + ) diff --git a/invokeai/backend/model_manager/load/model_loaders/pid_decoder.py b/invokeai/backend/model_manager/load/model_loaders/pid_decoder.py new file mode 100644 index 00000000000..0a91c27ee4f --- /dev/null +++ b/invokeai/backend/model_manager/load/model_loaders/pid_decoder.py @@ -0,0 +1,104 @@ +"""Loader for PiD (Pixel Diffusion Decoder) checkpoints. + +Returns a fully-constructed `PidNet` so the model cache can size it +correctly and apply its standard sequential-offload / partial-load +policies. We instantiate the architecture (per backbone) here and pour the +checkpoint's tensors directly into it, then discard the intermediate state +dict — avoiding the 2x VRAM peak you would get from holding both a `dict` +and the live module at the same time. +""" + +from pathlib import Path +from typing import Optional + +import torch +from safetensors.torch import load_file as safetensors_load_file + +from invokeai.backend.model_manager.configs.factory import AnyModelConfig +from invokeai.backend.model_manager.load.load_default import ModelLoader +from invokeai.backend.model_manager.load.model_loader_registry import ModelLoaderRegistry +from invokeai.backend.model_manager.taxonomy import AnyModel, BaseModelType, ModelFormat, ModelType, SubModelType +from invokeai.backend.pid.decode import load_pid_decoder + +# NVIDIA's official PiD `.pth` checkpoints store the student under the `net.` +# prefix (see `PidDistillModel.state_dict(prefix="net.")` in the vendored +# upstream). We strip it on load so PidNet.load_state_dict() can consume the +# dict directly. +_NET_PREFIX = "net." 
+ + +def _load_raw_checkpoint(path: Path) -> dict[str, torch.Tensor]: + suffix = path.suffix.lower() + if suffix == ".safetensors": + return safetensors_load_file(str(path)) + if suffix in {".pth", ".pt", ".ckpt", ".bin"}: + # NVIDIA's PiD `.pth` checkpoints are plain tensor dicts (verified + # against the released res2k_sr4x_official_flux checkpoint). + sd = torch.load(str(path), map_location="cpu", weights_only=True) + if isinstance(sd, dict) and "state_dict" in sd and isinstance(sd["state_dict"], dict): + sd = sd["state_dict"] + return sd # type: ignore[return-value] + raise ValueError(f"Unrecognised PiD decoder checkpoint extension: {suffix!r}") + + +def _strip_net_prefix(state_dict: dict[str, torch.Tensor]) -> dict[str, torch.Tensor]: + if not any(k.startswith(_NET_PREFIX) for k in state_dict if isinstance(k, str)): + return state_dict + out: dict[str, torch.Tensor] = {} + for k, v in state_dict.items(): + if isinstance(k, str) and k.startswith(_NET_PREFIX): + out[k[len(_NET_PREFIX) :]] = v + elif isinstance(k, str) and ( + k.startswith("net_ema.") or k.startswith("fake_score.") or k.startswith("discriminator.") + ): + continue + else: + out[k] = v + return out + + +@ModelLoaderRegistry.register(base=BaseModelType.Flux, type=ModelType.PiDDecoder, format=ModelFormat.Checkpoint) +@ModelLoaderRegistry.register(base=BaseModelType.Flux2, type=ModelType.PiDDecoder, format=ModelFormat.Checkpoint) +@ModelLoaderRegistry.register( + base=BaseModelType.StableDiffusion3, type=ModelType.PiDDecoder, format=ModelFormat.Checkpoint +) +@ModelLoaderRegistry.register( + base=BaseModelType.StableDiffusionXL, type=ModelType.PiDDecoder, format=ModelFormat.Checkpoint +) +@ModelLoaderRegistry.register(base=BaseModelType.QwenImage, type=ModelType.PiDDecoder, format=ModelFormat.Checkpoint) +class PiDDecoderLoader(ModelLoader): + """Loads a PiD checkpoint into a fully-constructed PidNet of the matching backbone.""" + + def _load_model( + self, + config: AnyModelConfig, + submodel_type: 
Optional[SubModelType] = None, + ) -> AnyModel: + if submodel_type is not None: + raise ValueError("Unexpected submodel requested for PiD decoder.") + + # Backbone is encoded in the config's `base` field — populated by + # PiDDecoder_Checkpoint_*_Config when the user added the model. + backbone: BaseModelType = config.base + + raw_sd = _strip_net_prefix(_load_raw_checkpoint(Path(config.path))) + + # Build the live PidNet on CPU and pour the checkpoint in — then drop + # the dict so we don't hold two copies in RAM at once. + pid_net = load_pid_decoder(raw_sd, backbone) + del raw_sd + + # We deliberately keep PidNet's parameters in float32 here. PiD + # consumes Gemma-2 hidden states that contain large outliers + # (per-token max well past 100) and the in-network RMSNorm + # (`variance = hidden_states.pow(2).mean(-1, keepdim=True)`) loses + # precision badly in bf16, producing all-NaN outputs. The decode + # wrapper runs the forward pass under `torch.autocast(bf16)` so the + # bulk of the matmuls still execute in bf16 — only the precision- + # critical reductions stay fp32. This roughly doubles VRAM for the + # weights (~5 GB instead of ~2.5 GB) but is the only configuration + # we have measured to be numerically stable end-to-end. + + pid_net.eval() + pid_net.requires_grad_(False) + return pid_net diff --git a/invokeai/backend/model_manager/starter_models.py b/invokeai/backend/model_manager/starter_models.py index 9bc58e44269..4ca1a275bb6 100644 --- a/invokeai/backend/model_manager/starter_models.py +++ b/invokeai/backend/model_manager/starter_models.py @@ -14,6 +14,7 @@ BaseModelType, ModelFormat, ModelType, + PiDDecoderVariantType, QwenImageVariantType, ) @@ -128,6 +129,116 @@ class StarterModelBundle(BaseModel): # endregion +# region PiD (Pixel Diffusion Decoder) +# PiD's pretrained decoders condition on Gemma-2-2b-it caption embeddings (2304-dim). NVIDIA references the ungated +# mirror Efficient-Large-Model/gemma-2-2b-it. 
It is shared across all PiD backbones, so it is a dependency of each +# decoder below (and offered standalone here so it can be installed once). +gemma2_2b_encoder = StarterModel( + name="Gemma 2 2B (PiD caption encoder)", + base=BaseModelType.Any, + source="Efficient-Large-Model/gemma-2-2b-it", + description="Gemma-2-2b-it text encoder that PiD uses to condition its diffusion decode on a caption. ~5GB", + type=ModelType.Gemma2Encoder, + format=ModelFormat.Gemma2Encoder, +) + +# NVIDIA PiD decoders (https://huggingface.co/nvidia/PiD). Code is Apache-2.0; weights are NSCLv1 (non-commercial / +# research). Each is a 4x super-resolution decoder that replaces the regular VAE decode and needs the Gemma-2 encoder. +pid_decoder_flux_2k = StarterModel( + name="PiD Decoder FLUX (2K)", + base=BaseModelType.Flux, + source="nvidia/PiD::checkpoints/PiD_res2k_sr4x_official_flux_distill_4step/model_ema_bf16.pth", + description="NVIDIA PiD 4x super-resolution decoder for FLUX latents, 2K target preset (e.g. 512 -> 2048). ~5GB", + type=ModelType.PiDDecoder, + format=ModelFormat.Checkpoint, + variant=PiDDecoderVariantType.Res2k_Sr4x, + dependencies=[gemma2_2b_encoder], +) +pid_decoder_flux_2kto4k = StarterModel( + name="PiD Decoder FLUX (2K to 4K)", + base=BaseModelType.Flux, + source="nvidia/PiD::checkpoints/PiD_res2kto4k_sr4x_official_flux_distill_4step/model_ema_bf16.pth", + description="NVIDIA PiD 4x super-resolution decoder for FLUX latents, 2K-to-4K preset for higher-resolution output. ~5GB", + type=ModelType.PiDDecoder, + format=ModelFormat.Checkpoint, + variant=PiDDecoderVariantType.Res2kTo4k_Sr4x, + dependencies=[gemma2_2b_encoder], +) +# FLUX.2 Klein shares one 32-channel VAE across the 4B and 9B variants, so a single decoder per preset covers both. +# The 128-channel packed latent is unambiguous (unlike the 16ch FLUX/SD3 case), so no directory-name disambiguation +# is needed for the config probe. 
+pid_decoder_flux2_2k = StarterModel( + name="PiD Decoder FLUX.2 (2K)", + base=BaseModelType.Flux2, + source="nvidia/PiD::checkpoints/PiD_res2k_sr4x_official_flux2_distill_4step/model_ema_bf16.pth", + description="NVIDIA PiD 4x super-resolution decoder for FLUX.2 Klein latents, 2K target preset (e.g. 512 -> 2048). ~5GB", + type=ModelType.PiDDecoder, + format=ModelFormat.Checkpoint, + variant=PiDDecoderVariantType.Res2k_Sr4x, + dependencies=[gemma2_2b_encoder], +) +pid_decoder_flux2_2kto4k = StarterModel( + name="PiD Decoder FLUX.2 (2K to 4K)", + base=BaseModelType.Flux2, + source="nvidia/PiD::checkpoints/PiD_res2kto4k_sr4x_official_flux2_distill_4step/model_ema_bf16.pth", + description="NVIDIA PiD 4x super-resolution decoder for FLUX.2 Klein latents, 2K-to-4K preset for higher-resolution output. ~5GB", + type=ModelType.PiDDecoder, + format=ModelFormat.Checkpoint, + variant=PiDDecoderVariantType.Res2kTo4k_Sr4x, + dependencies=[gemma2_2b_encoder], +) +# SD3 uses a 16-channel latent, architecturally identical to FLUX.1. The config probe disambiguates via the +# checkpoint's directory name (`…official_sd3_distill…`); if the HF single-file download drops that name, the +# explicit base=StableDiffusion3 override the installer sends is trusted instead (see pid_decoder.py::_validate_base). +pid_decoder_sd3_2k = StarterModel( + name="PiD Decoder SD3 (2K)", + base=BaseModelType.StableDiffusion3, + source="nvidia/PiD::checkpoints/PiD_res2k_sr4x_official_sd3_distill_4step/model_ema_bf16.pth", + description="NVIDIA PiD 4x super-resolution decoder for SD3 latents, 2K target preset (e.g. 512 -> 2048). 
~5GB", + type=ModelType.PiDDecoder, + format=ModelFormat.Checkpoint, + variant=PiDDecoderVariantType.Res2k_Sr4x, + dependencies=[gemma2_2b_encoder], +) +pid_decoder_sd3_2kto4k = StarterModel( + name="PiD Decoder SD3 (2K to 4K)", + base=BaseModelType.StableDiffusion3, + source="nvidia/PiD::checkpoints/PiD_res2kto4k_sr4x_official_sd3_distill_4step/model_ema_bf16.pth", + description="NVIDIA PiD 4x super-resolution decoder for SD3 latents, 2K-to-4K preset for higher-resolution output. ~5GB", + type=ModelType.PiDDecoder, + format=ModelFormat.Checkpoint, + variant=PiDDecoderVariantType.Res2kTo4k_Sr4x, + dependencies=[gemma2_2b_encoder], +) +# SDXL uses a 4-channel latent, which is unambiguous (no FLUX/SD3-style directory-name disambiguation needed). +# NVIDIA ships only the 2K-to-4K preset for SDXL (no plain 2K checkpoint). +pid_decoder_sdxl_2kto4k = StarterModel( + name="PiD Decoder SDXL (2K to 4K)", + base=BaseModelType.StableDiffusionXL, + source="nvidia/PiD::checkpoints/PiD_res2kto4k_sr4x_official_sdxl_distill_4step/model_ema_bf16.pth", + description="NVIDIA PiD 4x super-resolution decoder for SDXL latents, 2K-to-4K preset. ~5GB", + type=ModelType.PiDDecoder, + format=ModelFormat.Checkpoint, + variant=PiDDecoderVariantType.Res2kTo4k_Sr4x, + dependencies=[gemma2_2b_encoder], +) +# Qwen-Image uses a 16-channel latent (ambiguous with FLUX/SD3). The config probe disambiguates via the checkpoint's +# directory name (`…official_qwenimage_distill…`); if the HF single-file download drops it, the explicit +# base=QwenImage override the installer sends is trusted instead (see pid_decoder.py::_validate_base). Only the +# 2K-to-4K preset exists. +pid_decoder_qwenimage_2kto4k = StarterModel( + name="PiD Decoder Qwen-Image (2K to 4K)", + base=BaseModelType.QwenImage, + source="nvidia/PiD::checkpoints/PiD_res2kto4k_sr4x_official_qwenimage_distill_4step/model_ema_bf16.pth", + description="NVIDIA PiD 4x super-resolution decoder for Qwen-Image latents, 2K-to-4K preset. 
~5GB", + type=ModelType.PiDDecoder, + format=ModelFormat.Checkpoint, + variant=PiDDecoderVariantType.Res2kTo4k_Sr4x, + dependencies=[gemma2_2b_encoder], +) +# endregion + + # region: Main flux_schnell_quantized = StarterModel( name="FLUX.1 schnell (quantized)", @@ -1710,6 +1821,15 @@ def _gemini_3_resolution_presets( anima_base, anima_qwen3_encoder, anima_vae, + gemma2_2b_encoder, + pid_decoder_flux_2k, + pid_decoder_flux_2kto4k, + pid_decoder_flux2_2k, + pid_decoder_flux2_2kto4k, + pid_decoder_sd3_2k, + pid_decoder_sd3_2kto4k, + pid_decoder_sdxl_2kto4k, + pid_decoder_qwenimage_2kto4k, ] sd1_bundle: list[StarterModel] = [ diff --git a/invokeai/backend/model_manager/taxonomy.py b/invokeai/backend/model_manager/taxonomy.py index a2e4e58bdc4..0bb2eb2bf33 100644 --- a/invokeai/backend/model_manager/taxonomy.py +++ b/invokeai/backend/model_manager/taxonomy.py @@ -79,12 +79,14 @@ class ModelType(str, Enum): T5Encoder = "t5_encoder" Qwen3Encoder = "qwen3_encoder" QwenVLEncoder = "qwen_vl_encoder" + Gemma2Encoder = "gemma2_encoder" SpandrelImageToImage = "spandrel_image_to_image" SigLIP = "siglip" FluxRedux = "flux_redux" LlavaOnevision = "llava_onevision" TextLLM = "text_llm" ExternalImageGenerator = "external_image_generator" + PiDDecoder = "pid_decoder" Unknown = "unknown" @@ -178,6 +180,23 @@ class Qwen3VariantType(str, Enum): """Qwen3 0.6B text encoder (hidden_size=1024). Used by Anima.""" +class PiDDecoderVariantType(str, Enum): + """PiD (Pixel Diffusion Decoder) variants distributed by NVIDIA. + + Each backbone (FLUX.1, FLUX.2, SD3) ships in two resolution presets that + differ only in target output resolution; the underlying network is the + same. NVIDIA's checkpoint filenames encode this as e.g. + `PiD_res2k_sr4x_official_flux_distill_4step` vs + `PiD_res2kto4k_sr4x_official_flux_distill_4step`. 
+ """ + + Res2k_Sr4x = "res2k_sr4x" + """Standard 2K target preset (decodes to ~2K via 4x super-resolution).""" + + Res2kTo4k_Sr4x = "res2kto4k_sr4x" + """Upsampling preset (designed for chaining to push ~2K inputs to ~4K).""" + + class ModelFormat(str, Enum): """Storage format of model.""" @@ -193,6 +212,7 @@ class ModelFormat(str, Enum): T5Encoder = "t5_encoder" Qwen3Encoder = "qwen3_encoder" QwenVLEncoder = "qwen_vl_encoder" + Gemma2Encoder = "gemma2_encoder" BnbQuantizedLlmInt8b = "bnb_quantized_int8b" BnbQuantizednf4b = "bnb_quantized_nf4b" GGUFQuantized = "gguf_quantized" @@ -249,6 +269,7 @@ class FluxLoRAFormat(str, Enum): ZImageVariantType, QwenImageVariantType, Qwen3VariantType, + PiDDecoderVariantType, ] variant_type_adapter = TypeAdapter[ ModelVariantType @@ -258,6 +279,7 @@ class FluxLoRAFormat(str, Enum): | ZImageVariantType | QwenImageVariantType | Qwen3VariantType + | PiDDecoderVariantType ]( ModelVariantType | ClipVariantType @@ -266,4 +288,5 @@ class FluxLoRAFormat(str, Enum): | ZImageVariantType | QwenImageVariantType | Qwen3VariantType + | PiDDecoderVariantType ) diff --git a/invokeai/backend/pid/__init__.py b/invokeai/backend/pid/__init__.py new file mode 100644 index 00000000000..a247ebb89dd --- /dev/null +++ b/invokeai/backend/pid/__init__.py @@ -0,0 +1,9 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Vendored from https://github.com/nv-tlabs/PiD (Apache-2.0). +# Original upstream lives at pid/. Files here have been re-rooted to +# invokeai.backend.pid.* and the configs/, tokenizers/, _demo_*, from_*, +# checkpointer/, trainer.py, visualize/ subtrees have been excluded. +# +# See THIRD_PARTY_LICENSES.md for the full attribution. 
diff --git a/invokeai/backend/pid/_ext/__init__.py b/invokeai/backend/pid/_ext/__init__.py new file mode 100644 index 00000000000..1e792c09005 --- /dev/null +++ b/invokeai/backend/pid/_ext/__init__.py @@ -0,0 +1,3 @@ +# Vendored from PiD's _ext/ subtree (https://github.com/nv-tlabs/PiD). +# Originally copied from cosmos-predict2.5 (https://github.com/nvidia-cosmos/cosmos-predict2.5/). +# Apache-2.0. diff --git a/invokeai/backend/pid/_ext/imaginaire/__init__.py b/invokeai/backend/pid/_ext/imaginaire/__init__.py new file mode 100644 index 00000000000..3159bfe6564 --- /dev/null +++ b/invokeai/backend/pid/_ext/imaginaire/__init__.py @@ -0,0 +1,14 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/invokeai/backend/pid/_ext/imaginaire/lazy_config/__init__.py b/invokeai/backend/pid/_ext/imaginaire/lazy_config/__init__.py new file mode 100644 index 00000000000..fbbe88ede08 --- /dev/null +++ b/invokeai/backend/pid/_ext/imaginaire/lazy_config/__init__.py @@ -0,0 +1,14 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Upstream re-exported `LazyDict = omegaconf.DictConfig`; in this vendored +# subset configs are plain Python mappings, so `LazyDict` aliases the +# attribute-accessible dict subclass produced by `LazyCall`. 
+ +from invokeai.backend.pid._ext.imaginaire.lazy_config.instantiate import instantiate +from invokeai.backend.pid._ext.imaginaire.lazy_config.lazy import LazyCall, LazyConfig, _LazyCallResult + +PLACEHOLDER = None +LazyDict = _LazyCallResult + +__all__ = ["instantiate", "LazyCall", "LazyConfig", "PLACEHOLDER", "LazyDict"] diff --git a/invokeai/backend/pid/_ext/imaginaire/lazy_config/file_io.py b/invokeai/backend/pid/_ext/imaginaire/lazy_config/file_io.py new file mode 100644 index 00000000000..0579fe5f56e --- /dev/null +++ b/invokeai/backend/pid/_ext/imaginaire/lazy_config/file_io.py @@ -0,0 +1,58 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Minimal stdlib-based stand-in for the upstream iopath PathManager. +# Only `open()` on local paths and trivial helpers are supported; the upstream +# HTTPURLHandler / OneDrivePathHandler paths are not used by the decoder +# inference subset we vendor. 
import io
import logging
import os
import shutil
from typing import IO, Any

__all__ = ["PathManager", "PathHandler"]


class PathHandler:
    """Base no-op handler (kept for API parity)."""

    def _open(self, path: str, mode: str = "r", **kwargs: Any) -> IO:
        """Open ``path`` with ``io.open``, forwarding ``mode`` and any extra keyword arguments."""
        return io.open(path, mode, **kwargs)


class _LocalPathManager:
    """Local-filesystem stand-in exposing the handful of PathManager methods defined below."""

    def open(self, path: str, mode: str = "r", **kwargs: Any) -> IO:
        """Open a local file; ``kwargs`` are passed straight through to ``io.open``."""
        return io.open(path, mode, **kwargs)

    def get_local_path(self, path: str, **kwargs: Any) -> str:
        """Return ``path`` unchanged; extra keyword arguments are accepted and ignored."""
        return path

    def exists(self, path: str) -> bool:
        """Return True when ``path`` exists on the local filesystem."""
        return os.path.exists(path)

    def isfile(self, path: str) -> bool:
        """Return True when ``path`` is an existing regular file."""
        return os.path.isfile(path)

    def isdir(self, path: str) -> bool:
        """Return True when ``path`` is an existing directory."""
        return os.path.isdir(path)

    def mkdirs(self, path: str) -> None:
        """Create ``path`` and any missing parents; an already-existing directory is not an error."""
        os.makedirs(path, exist_ok=True)

    def copy(self, src: str, dst: str, overwrite: bool = False) -> bool:
        """Copy ``src`` to ``dst`` and report whether the copy happened.

        Args:
            src: Path of the file to copy.
            dst: Destination file path, or an existing directory to copy into.
            overwrite: When False (the default) an existing destination file is
                left untouched and False is returned.

        Returns:
            True if the file was copied, False if the copy was refused because
            the destination already exists and ``overwrite`` is False.
        """
        # shutil.copy() places the file inside dst when dst is a directory, so
        # the path that would actually be clobbered is dst/<basename of src>.
        target = os.path.join(dst, os.path.basename(src)) if os.path.isdir(dst) else dst
        if os.path.exists(target) and not overwrite:
            logging.getLogger(__name__).error("Destination file %s already exists.", target)
            return False
        shutil.copy(src, dst)
        return True

    def register_handler(self, handler: PathHandler) -> None:
        """Accept a handler for API parity; nothing is registered."""
        pass


PathManager = _LocalPathManager()
# diff --git a/invokeai/backend/pid/_ext/imaginaire/lazy_config/instantiate.py b/invokeai/backend/pid/_ext/imaginaire/lazy_config/instantiate.py new file mode 100644 index 00000000000..bdb5b4abb07 --- /dev/null +++ b/invokeai/backend/pid/_ext/imaginaire/lazy_config/instantiate.py @@ -0,0 +1,72 @@
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Stdlib-only `instantiate()`. The upstream module also handled
# omegaconf.DictConfig / ListConfig structured configs and OmegaConf.to_object
# round-trips. In the vendored decoder-inference subset all configs are
# constructed as plain Python mappings (see invokeai/backend/pid/decode.py),
# so the omegaconf paths are not required.
import collections.abc as abc
import dataclasses
import logging
from typing import Any

import attrs

from invokeai.backend.pid._ext.imaginaire.lazy_config.registry import _convert_target_to_string, locate

__all__ = ["dump_dataclass", "instantiate"]


def is_dataclass_or_attrs(target: Any) -> bool:
    """Return True when ``target`` is a dataclass (type or instance) or an attrs class."""
    return dataclasses.is_dataclass(target) or attrs.has(target)


def _is_dataclass_instance(value: Any) -> bool:
    """Return True only for dataclass *instances*; ``is_dataclass`` alone also accepts the class itself."""
    return dataclasses.is_dataclass(value) and not isinstance(value, type)


def dump_dataclass(obj: Any) -> dict:
    """Recursively dump a dataclass instance into a dict that can be re-instantiated.

    Args:
        obj: A dataclass instance (passing the dataclass type itself is rejected).

    Returns:
        A dict with a ``_target_`` entry naming ``type(obj)`` plus one entry per
        field. Nested dataclass instances are dumped recursively; list and tuple
        fields are emitted as lists with their dataclass-instance elements dumped.
    """
    assert _is_dataclass_instance(obj), "dump_dataclass() requires an instance of a dataclass."
    ret: dict = {"_target_": _convert_target_to_string(type(obj))}
    for f in dataclasses.fields(obj):
        v = getattr(obj, f.name)
        # Only instances are recursed into: a field holding a dataclass *type*
        # would otherwise trip the assertion above, so it is stored as-is.
        if _is_dataclass_instance(v):
            v = dump_dataclass(v)
        if isinstance(v, (list, tuple)):
            v = [dump_dataclass(x) if _is_dataclass_instance(x) else x for x in v]
        ret[f.name] = v
    return ret


def instantiate(cfg: Any, *args: Any, **kwargs: Any) -> Any:
    """Recursively instantiate objects defined by `_target_` + arguments.

    Accepts any Mapping with a `_target_` key (e.g. plain dict or the
    `_LazyCallResult` produced by `LazyCall`). Lists are walked recursively.

    Args:
        cfg: A list, a mapping containing ``_target_``, or any other value.
        *args: Extra positional arguments for the top-level target only.
        **kwargs: Extra keyword arguments for the top-level target only; they
            take precedence over same-named entries in ``cfg``.

    Returns:
        The constructed object for a ``_target_`` mapping, a list of
        instantiated items for a list, and ``cfg`` unchanged otherwise.

    Raises:
        TypeError: Re-raised (after logging the target name) when the target
            rejects the supplied arguments.
    """
    if isinstance(cfg, list):
        return [instantiate(x) for x in cfg]

    if isinstance(cfg, abc.Mapping) and "_target_" in cfg:
        # `_recursive_: False` keeps nested configs as raw mappings.
        is_recursive = bool(cfg.get("_recursive_", True))
        if is_recursive:
            resolved = {k: instantiate(v) for k, v in cfg.items()}
        else:
            resolved = dict(cfg)
        # Control keys must not be forwarded to the target as keyword arguments.
        resolved.pop("_recursive_", None)
        cls = resolved.pop("_target_")
        if isinstance(cls, str):
            cls_name = cls
            cls = locate(cls_name)
            assert cls is not None, cls_name
        else:
            cls_name = getattr(cls, "__qualname__", str(cls))
        assert callable(cls), f"_target_ {cls_name} does not define a callable object"
        try:
            return cls(*args, **{**resolved, **kwargs})
        except TypeError:
            logging.getLogger(__name__).error("Error when instantiating %s!", cls_name)
            raise

    return cfg
# diff --git a/invokeai/backend/pid/_ext/imaginaire/lazy_config/lazy.py b/invokeai/backend/pid/_ext/imaginaire/lazy_config/lazy.py new file mode 100644 index 00000000000..65069589396 --- /dev/null +++ b/invokeai/backend/pid/_ext/imaginaire/lazy_config/lazy.py @@ -0,0 +1,52 @@
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Minimal LazyCall / LazyConfig stub. The upstream module supports file-based
# config save/load via yaml + cloudpickle + dill + detectron2 helpers; the
# vendored decoder-inference subset only needs `LazyCall(cls)(**kwargs)` as a
# convenient producer of `{_target_: "cls.fqn", **kwargs}` dicts that
# `instantiate()` can resolve.

from typing import Any

from invokeai.backend.pid._ext.imaginaire.lazy_config.registry import _convert_target_to_string

__all__ = ["LazyCall", "LazyConfig"]


class _LazyCallResult(dict):
    """A plain dict tagged for `instantiate()`. Behaves like a DictConfig
    enough for our subset (attribute access falls back to item access)."""

    def __getattr__(self, key: str) -> Any:
        # Only reached when normal attribute lookup fails, so dict methods keep working.
        try:
            return self[key]
        except KeyError as e:
            raise AttributeError(key) from e

    def __setattr__(self, key: str, value: Any) -> None:
        # Attribute assignment stores an item, so `cfg.x = v` and `cfg["x"] = v` are equivalent.
        self[key] = value


class LazyCall:
    """`LazyCall(cls)(**kwargs)` -> a `_LazyCallResult` holding `_target_` (the dotted
    path of `cls`, or the string as given) plus `kwargs`."""

    def __init__(self, target: Any) -> None:
        self._target = target

    def __call__(self, **kwargs: Any) -> _LazyCallResult:
        # String targets are taken verbatim; anything else is converted to its dotted path.
        target_str = _convert_target_to_string(self._target) if not isinstance(self._target, str) else self._target
        return _LazyCallResult(_target_=target_str, **kwargs)


class LazyConfig:
    """File-IO helpers from the upstream module are not used in the inference
    subset and are intentionally omitted."""

    @staticmethod
    def load(*args: Any, **kwargs: Any) -> Any:
        raise NotImplementedError("LazyConfig.load is not supported in the vendored PiD inference subset.")

    @staticmethod
    def save(*args: Any, **kwargs: Any) -> Any:
        raise NotImplementedError("LazyConfig.save is not supported in the vendored PiD inference subset.")
# diff --git a/invokeai/backend/pid/_ext/imaginaire/lazy_config/registry.py b/invokeai/backend/pid/_ext/imaginaire/lazy_config/registry.py new file mode 100644 index 00000000000..73d8bc973a2 --- /dev/null +++ b/invokeai/backend/pid/_ext/imaginaire/lazy_config/registry.py @@ -0,0 +1,117 @@
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pydoc +from typing import Any + + +class Registry: + """Minimal stand-in for fvcore.common.registry.Registry. + + Only the subset used by the vendored PiD decode path is implemented: + name-keyed object registry with ``register``/``get``. + """ + + def __init__(self, name: str) -> None: + self._name = name + self._obj_map: dict[str, Any] = {} + + def register(self, obj: Any = None, *, name: str | None = None) -> Any: + if obj is None: + + def deco(x: Any) -> Any: + self._do_register(name or x.__name__, x) + return x + + return deco + self._do_register(name or obj.__name__, obj) + return obj + + def _do_register(self, name: str, obj: Any) -> None: + if name in self._obj_map: + raise KeyError(f"{name} already registered in {self._name}") + self._obj_map[name] = obj + + def get(self, name: str) -> Any: + if name not in self._obj_map: + raise KeyError(f"{name} not found in {self._name}") + return self._obj_map[name] + + def __contains__(self, name: str) -> bool: + return name in self._obj_map + + def __iter__(self): + return iter(self._obj_map.items()) + + +""" +``Registry`` and `locate` provide ways to map a string (typically found +in config files) to callable objects. +""" + +__all__ = ["Registry", "locate"] + + +def _convert_target_to_string(t: Any) -> str: + """ + Inverse of ``locate()``. + + Args: + t: any object with ``__module__`` and ``__qualname__`` + """ + module, qualname = t.__module__, t.__qualname__ + + # Compress the path to this object, e.g. 
``module.submodule._impl.class`` + # may become ``module.submodule.class``, if the later also resolves to the same + # object. This simplifies the string, and also is less affected by moving the + # class implementation. + module_parts = module.split(".") + for k in range(1, len(module_parts)): + prefix = ".".join(module_parts[:k]) + candidate = f"{prefix}.{qualname}" + try: + if locate(candidate) is t: + return candidate + except ImportError: + pass + return f"{module}.{qualname}" + + +def locate(name: str) -> Any: + """ + Locate and return an object ``x`` using an input string ``{x.__module__}.{x.__qualname__}``, + such as "module.submodule.class_name". + + Raise Exception if it cannot be found. + """ + obj = pydoc.locate(name) + if obj is None: + # Fallback: walk the module path manually for cases pydoc.locate misses + # (e.g. nested classes, re-exports). + import importlib + + parts = name.split(".") + for k in range(len(parts) - 1, 0, -1): + mod_path, attr_path = ".".join(parts[:k]), parts[k:] + try: + obj = importlib.import_module(mod_path) + for a in attr_path: + obj = getattr(obj, a) + break + except (ImportError, AttributeError): + obj = None + if obj is None: + raise ImportError(f"Cannot dynamically locate object {name}!") + return obj diff --git a/invokeai/backend/pid/_ext/imaginaire/model.py b/invokeai/backend/pid/_ext/imaginaire/model.py new file mode 100644 index 00000000000..f2ab6e02c3a --- /dev/null +++ b/invokeai/backend/pid/_ext/imaginaire/model.py @@ -0,0 +1,129 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from typing import Any

import torch

from invokeai.backend.pid._ext.imaginaire.lazy_config import LazyDict, instantiate


class ImaginaireModel(torch.nn.Module):
    """Base class for Imaginaire models, built on ``torch.nn.Module``.

    Subclasses supply the computation by overriding the methods that raise
    ``NotImplementedError`` here:
        - training_step(): one training step, including the loss computation.
        - validation_step(): one validation step, including the loss computation.
        - forward(): the inference computation graph.
    A default is provided for:
        - init_optimizer_scheduler(): builds the optimizer and scheduler from configs.
    The ``on_*`` hooks are no-ops unless overridden.
    """

    def __init__(self) -> None:
        super().__init__()

    def init_optimizer_scheduler(
        self, optimizer_config: LazyDict, scheduler_config: LazyDict
    ) -> tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LRScheduler]:
        """Build the optimizer and scheduler for this model.

        Both configs are modified in place: ``optimizer_config.params`` is set to
        this model's parameters and ``scheduler_config.optimizer`` to the freshly
        built optimizer before each config is instantiated.

        Args:
            optimizer_config (LazyDict): Config instantiated into the optimizer.
            scheduler_config (LazyDict): Config instantiated into the scheduler.

        Returns:
            optimizer (torch.optim.Optimizer): The model optimizer.
            scheduler (torch.optim.lr_scheduler.LRScheduler): The optimization scheduler.
        """
        # The optimizer must exist before the scheduler config can reference it.
        optimizer_config.params = self.parameters()
        built_optimizer = instantiate(optimizer_config)
        scheduler_config.optimizer = built_optimizer
        built_scheduler = instantiate(scheduler_config)
        return built_optimizer, built_scheduler

    def training_step(
        self, data_batch: dict[str, torch.Tensor], iteration: int
    ) -> tuple[dict[str, torch.Tensor], torch.Tensor]:
        """Run one training step, including the loss computation.

        Args:
            data_batch (dict[str, torch.Tensor]): Data batch (dictionary of tensors).
            iteration (int): Current iteration number.

        Returns:
            output_batch (dict[str, torch.Tensor]): Auxiliary model output from the training batch.
            loss (torch.Tensor): The total loss for backprop (weighted sum of various losses).

        Raises:
            NotImplementedError: Always, in this base class.
        """
        raise NotImplementedError

    @torch.no_grad()
    def validation_step(
        self, data_batch: dict[str, torch.Tensor], iteration: int
    ) -> tuple[dict[str, torch.Tensor], torch.Tensor]:
        """Run one validation step, including the loss computation.

        Args:
            data_batch (dict[str, torch.Tensor]): Data batch (dictionary of tensors).
            iteration (int): Current iteration number.

        Returns:
            output_batch (dict[str, torch.Tensor]): Auxiliary model output from the validation batch.
            loss (torch.Tensor): The total loss (weighted sum of various losses).

        Raises:
            NotImplementedError: Always, in this base class.
        """
        raise NotImplementedError

    @torch.inference_mode()
    def forward(self, *args: Any, **kwargs: Any) -> Any:
        """Run the inference computation graph.

        Args:
            *args: Positional inputs defined by the subclass.
            **kwargs: Keyword inputs defined by the subclass.

        Returns:
            The model's output, as defined by the subclass.

        Raises:
            NotImplementedError: Always, in this base class.
        """
        raise NotImplementedError

    def on_train_start(self, memory_format: torch.memory_format = torch.preserve_format) -> None:
        """Hook for model preparation before training is launched (no-op here).

        Args:
            memory_format (torch.memory_format): Memory format of the model.
        """

    def on_before_zero_grad(
        self, optimizer: torch.optim.Optimizer, scheduler: torch.optim.lr_scheduler.LRScheduler, iteration: int
    ) -> None:
        """Hook invoked before zero_grad() is called (no-op here).

        Args:
            optimizer (torch.optim.Optimizer): The model optimizer.
            scheduler (torch.optim.lr_scheduler.LRScheduler): The optimization scheduler.
            iteration (int): Current iteration number.
        """

    def on_after_backward(self, iteration: int = 0) -> None:
        """Hook invoked after loss.backward() is called (no-op here).

        Gives subclasses a place to inspect or modify gradients before the
        optimizer step.

        Args:
            iteration (int): Current iteration number.
        """
# diff --git a/invokeai/backend/pid/_ext/imaginaire/utils/__init__.py b/invokeai/backend/pid/_ext/imaginaire/utils/__init__.py new file mode 100644 index 00000000000..3159bfe6564 --- /dev/null +++ b/invokeai/backend/pid/_ext/imaginaire/utils/__init__.py @@ -0,0 +1,14 @@
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# diff --git a/invokeai/backend/pid/_ext/imaginaire/utils/count_params.py b/invokeai/backend/pid/_ext/imaginaire/utils/count_params.py new file mode 100644 index 00000000000..c42805a66df --- /dev/null +++ b/invokeai/backend/pid/_ext/imaginaire/utils/count_params.py @@ -0,0 +1,29 @@
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from torch import nn


def disabled_train(self, mode: bool = True):
    """Drop-in replacement for ``model.train`` that ignores ``mode`` and
    returns ``self``, so the train/eval state can no longer be toggled."""
    return self


def count_params(model: nn.Module, verbose=False) -> int:
    """Return the number of elements across ``model``'s trainable parameters.

    Parameters with ``requires_grad`` set to False are not counted. With
    ``verbose`` the total is also printed in millions, prefixed by the
    model's class name.
    """
    trainable = 0
    for param in model.parameters():
        if param.requires_grad:
            trainable += param.numel()
    if verbose:
        print(f"{model.__class__.__name__} has {trainable * 1.0e-6:.2f} M params.")
    return trainable
# diff --git a/invokeai/backend/pid/_ext/imaginaire/utils/device.py b/invokeai/backend/pid/_ext/imaginaire/utils/device.py new file mode 100644 index 00000000000..aab75fc59fa --- /dev/null +++ b/invokeai/backend/pid/_ext/imaginaire/utils/device.py @@ -0,0 +1,125 @@
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import gc
import math
import os

from invokeai.backend.pid._ext.imaginaire.utils.log import logger as logging


def get_gpu_architecture():
    """
    Retrieves the GPU architecture of the available GPUs.

    Devices are scanned in index order and the first recognised model wins.

    Returns:
        str: "H100" (also reported for H200), "A100", "L40S", "B200", or
        "Other" when no device matches or NVML reports an error.
    """
    import pynvml

    # Shutdown must only follow a successful init; calling nvmlShutdown() after a
    # failed nvmlInit() would raise from the `finally` and mask the "Other" fallback.
    initialized = False
    try:
        pynvml.nvmlInit()
        initialized = True
        device_count = pynvml.nvmlDeviceGetCount()
        for i in range(device_count):
            handle = pynvml.nvmlDeviceGetHandleByIndex(i)
            model_name = pynvml.nvmlDeviceGetName(handle)
            # The name may come back as bytes; normalise before substring checks.
            if isinstance(model_name, bytes):
                model_name = model_name.decode("utf-8")
            print(f"GPU {i}: Model: {model_name}")

            # Check for specific models like H100 or A100
            if "H100" in model_name or "H200" in model_name:
                return "H100"
            elif "A100" in model_name:
                return "A100"
            elif "L40S" in model_name:
                return "L40S"
            elif "B200" in model_name:
                return "B200"
    except pynvml.NVMLError as error:
        print(f"Failed to get GPU info: {error}")
    finally:
        if initialized:
            pynvml.nvmlShutdown()

    # return "Other" in case of an unrecognised model or an error
    return "Other"


class GPUArchitectureNotSupported(Exception):
    """
    Custom exception raised when the expected GPU architecture is not supported.
    """

    pass


def print_gpu_mem(str=None):
    """Log used/total/free memory of GPU 0 in MiB, prefixed with ``str``.

    NVML errors are printed rather than raised.
    """
    import pynvml

    try:
        pynvml.nvmlInit()
        meminfo = pynvml.nvmlDeviceGetMemoryInfo(pynvml.nvmlDeviceGetHandleByIndex(0))
        logging.info(
            f"{str}: {meminfo.used / 1024 / 1024}/{meminfo.total / 1024 / 1024}MiB used ({meminfo.free / 1024 / 1024}MiB free)"
        )
    except pynvml.NVMLError as error:
        print(f"Failed to get GPU memory info: {error}")


def force_gc():
    """Run a Python garbage collection, reporting GPU 0 memory before and after."""
    print_gpu_mem()
    print("gc()")
    gc.collect()
    print_gpu_mem()
    # NOTE(review): the message below announces a CUDA cache flush, but no
    # cache-emptying call is made in this function — confirm whether that is intended.
    print("empty cuda cache")
    # print(torch.cuda.memory_summary())
    print_gpu_mem()


def gpu0_has_80gb_or_less():
    """Return True when GPU 0 reports at most 80 GiB of total memory.

    On an NVML error the failure is printed and the function falls through,
    returning None.
    """
    import pynvml

    try:
        pynvml.nvmlInit()
        meminfo = pynvml.nvmlDeviceGetMemoryInfo(pynvml.nvmlDeviceGetHandleByIndex(0))
        return meminfo.total / 1024 / 1024 / 1024 <= 80
    except pynvml.NVMLError as error:
        print(f"Failed to get GPU memory info: {error}")


class Device:
    """Thin wrapper around an NVML device handle.

    The constructor does not initialise NVML itself.
    """

    # Number of 64-bit words requested from NVML for the CPU affinity mask.
    # os.cpu_count() may return None; fall back to one CPU so that importing
    # this module cannot fail on such hosts.
    _nvml_affinity_elements = math.ceil((os.cpu_count() or 1) / 64)

    def __init__(self, device_idx: int):
        import pynvml

        super().__init__()
        self.handle = pynvml.nvmlDeviceGetHandleByIndex(device_idx)

    def get_name(self) -> str:
        """Return the device's model name as text."""
        import pynvml

        name = pynvml.nvmlDeviceGetName(self.handle)
        # Decode bytes the same way get_gpu_architecture() does, to honour `-> str`.
        return name.decode("utf-8") if isinstance(name, bytes) else name

    def get_cpu_affinity(self) -> list[int]:
        """Return the indices of the CPU cores set in the device's NVML affinity mask."""
        import pynvml

        affinity_string = ""
        for j in pynvml.nvmlDeviceGetCpuAffinity(self.handle, Device._nvml_affinity_elements):
            # assume nvml returns list of 64 bit ints
            affinity_string = "{:064b}".format(j) + affinity_string
        affinity_list = [int(x) for x in affinity_string]
        affinity_list.reverse()  # so core 0 is in 0th element of list
        return [i for i, e in enumerate(affinity_list) if e != 0]
# diff --git a/invokeai/backend/pid/_ext/imaginaire/utils/distributed.py b/invokeai/backend/pid/_ext/imaginaire/utils/distributed.py new file mode 100644 index 00000000000..78d8599abcd --- /dev/null +++ b/invokeai/backend/pid/_ext/imaginaire/utils/distributed.py @@ -0,0 +1,444 @@
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION &
def init() -> int | None:
    """Initialize distributed training.

    If the default process group is already initialized this returns the current CUDA
    device index and does nothing else. Otherwise it pins the process to the CPU cores
    NVML reports for the local GPU, configures NCCL through environment variables,
    creates the NCCL process group from ``env://`` and adjusts a CUDA device limit.

    Returns:
        The current CUDA device index when already initialized, otherwise ``None``.
    """
    import pynvml

    if dist.is_initialized():
        return torch.cuda.current_device()

    # Set GPU affinity.
    pynvml.nvmlInit()
    local_rank = int(os.getenv("LOCAL_RANK", 0))
    try:
        device = Device(local_rank)
        os.sched_setaffinity(0, device.get_cpu_affinity())
    except Exception as e:
        # Affinity is best-effort: a failure is logged and setup continues.
        log.warning(f"Failed to set device affinity: {e}")
    # Set up NCCL communication.
    os.environ["TORCH_NCCL_BLOCKING_WAIT"] = "0"
    os.environ["TORCH_NCCL_ASYNC_ERROR_HANDLING"] = "1"
    if dist.is_available():
        torch.cuda.set_device(local_rank)
        # Get the timeout value from environment variable
        timeout_seconds = os.getenv("TORCH_NCCL_HEARTBEAT_TIMEOUT_SEC", 1800)
        # Convert the timeout to an integer (if it isn't already) and then to a timedelta
        timeout_timedelta = timedelta(seconds=int(timeout_seconds))
        dist.init_process_group(backend="nccl", init_method="env://", timeout=timeout_timedelta)
        log.info(
            f"Initialized distributed training with local rank {local_rank} with timeout {timeout_seconds}",
            rank0_only=False,
        )
    # Increase the L2 fetch granularity for faster speed.
    # NOTE(review): "libcudart.so" is loaded by bare name, so this presumably only
    # works where that exact soname is on the loader path (Linux) - confirm.
    _libcudart = ctypes.CDLL("libcudart.so")
    # Set device limit on the current device.
    # NOTE(review): limit id 0x05 is presumably the L2 fetch granularity limit named
    # in the comment above - confirm against the CUDA runtime headers.
    p_value = ctypes.cast((ctypes.c_int * 1)(), ctypes.POINTER(ctypes.c_int))
    _libcudart.cudaDeviceSetLimit(ctypes.c_int(0x05), ctypes.c_int(128))
    # The value read back into p_value is not used afterwards.
    _libcudart.cudaDeviceGetLimit(p_value, ctypes.c_int(0x05))
    log.info(f"Training with {get_world_size()} GPUs.")


def get_rank(group: Optional[dist.ProcessGroup] = None) -> int:
    """Get the rank (GPU device) of the worker.

    Args:
        group (Optional[dist.ProcessGroup]): Process group to query; ``None`` means
            the default group.

    Returns:
        rank (int): The rank of the worker, or 0 when torch.distributed is
            unavailable or not initialized.
    """
    rank = 0
    if dist.is_available() and dist.is_initialized():
        rank = dist.get_rank(group)
    return rank


def get_world_size(group: Optional[dist.ProcessGroup] = None) -> int:
    """Get world size. How many GPUs are available in this job.

    Args:
        group (Optional[dist.ProcessGroup]): Process group to query; ``None`` means
            the default group.

    Returns:
        world_size (int): The total number of GPUs available in this job, or 1 when
            torch.distributed is unavailable or not initialized.
    """
    world_size = 1
    if dist.is_available() and dist.is_initialized():
        world_size = dist.get_world_size(group)
    return world_size
def is_rank0() -> bool:
    """Tell whether the calling process is the global master (rank 0).

    Returns:
        (bool): True when ``get_rank()`` reports 0, else False.
    """
    return get_rank() == 0


def is_local_rank0() -> bool:
    """Tell whether the calling process drives CUDA device 0.

    Returns:
        (bool): True when the current CUDA device index is 0, else False.
    """
    current = torch.cuda.current_device()
    return current == 0


def rank0_only(func: Callable) -> Callable:
    """Decorator that runs ``func`` on the master rank only.

    Example usage:
        @rank0_only
        def func(x):
            return x + 3

    Args:
        func (Callable): the function to guard.

    Returns:
        (Callable): A wrapper that calls ``func`` on rank 0 and returns ``None``
            on every other rank.
    """

    @functools.wraps(func)
    def wrapper(*args, **kwargs):  # noqa: ANN202
        if not is_rank0():
            return None
        return func(*args, **kwargs)

    return wrapper


def barrier() -> None:
    """Synchronize all ranks; a no-op when no process group is initialized."""
    if not dist.is_available():
        return
    if not dist.is_initialized():
        return
    dist.barrier()


def rank0_first(func: Callable) -> Callable:
    """Decorator that runs ``func`` on rank 0 first and on the other ranks afterwards.

    Every rank meets at a barrier between the two phases, so non-master ranks only
    start once rank 0 has finished its call.
    """

    @functools.wraps(func)
    def wrapper(*args, **kwargs):  # noqa: ANN202
        # Phase 1: only the master executes the function.
        if is_rank0():
            result = func(*args, **kwargs)
        # Everyone waits here until the master is done.
        barrier()
        # Phase 2: the remaining ranks execute it.
        if not is_rank0():
            result = func(*args, **kwargs)
        return result

    return wrapper
def parallel_model_wrapper(config_ddp: DDPConfig, model: torch.nn.Module) -> torch.nn.Module | DistributedDataParallel:
    """Wraps the model to enable data parallelism for training across multiple GPU devices.

    Args:
        config_ddp (DDPConfig): The data parallel config. Its ``find_unused_parameters``,
            ``static_graph`` and ``broadcast_buffers`` attributes are forwarded to DDP.
        model (torch.nn.Module): The PyTorch module.

    Returns:
        model (torch.nn.Module | DistributedDataParallel): The data parallel model wrapper
            if distributed environment is available, otherwise return the original model.
    """
    if dist.is_available() and dist.is_initialized():
        local_rank = int(os.getenv("LOCAL_RANK", 0))
        try:
            ddp_group = parallel_state.get_data_parallel_group(with_context_parallel=True)
        except Exception as e:
            # Any failure to obtain the group is logged and DDP then uses the default
            # process group (process_group=None).
            log.info(e)
            log.info("parallel_state not initialized, treating all GPUs equally for DDP")
            ddp_group = None

        model = DistributedDataParallel(
            model,
            device_ids=[local_rank],
            output_device=local_rank,
            find_unused_parameters=config_ddp.find_unused_parameters,
            static_graph=config_ddp.static_graph,
            broadcast_buffers=config_ddp.broadcast_buffers,
            process_group=ddp_group,
        )
    return model


class DistributedDataParallel(torch.nn.parallel.DistributedDataParallel):
    """This extends torch.nn.parallel.DistributedDataParallel with .training_step().

    This borrows the concept of `forward-redirection` from Pytorch lightning. It wraps an ImaginaireModel such that
    model.training_step() would be executed when calling self.training_step(), while preserving the behavior of calling
    model() for Pytorch modules. Internally, this is a double rerouting mechanism (training_step -> forward ->
    training_step), allowing us to preserve the function names and signatures.
    """

    def __init__(self, model: torch.nn.Module, *args, **kwargs):
        super().__init__(model, *args, **kwargs)
        # Makes ddp_sync_grad() emit its static-graph warning only once per wrapper.
        self.show_sync_grad_static_graph_warning = True

    def training_step(self, *args, **kwargs) -> Any:
        """Run ``self.module.training_step(*args, **kwargs)`` through DDP's ``__call__``."""
        # Cache the original model.forward() method.
        original_forward = self.module.forward

        def wrapped_training_step(*_args, **_kwargs):  # noqa: ANN202
            # Unpatch immediately before calling training_step() because itself may want to call the real forward.
            self.module.forward = original_forward
            # The actual .training_step().
            return self.module.training_step(*_args, **_kwargs)

        # Patch the original_module's forward so we can redirect the arguments back to the real method.
        self.module.forward = wrapped_training_step
        # Call self, which implicitly calls self.forward() --> model.forward(), which is now model.training_step().
        # Without calling self.forward() or model.forward() explicitly, implicit hooks are also executed.
        return self(*args, **kwargs)


@contextmanager
def ddp_sync_grad(model, enabled):
    r"""
    Context manager to enable/disable gradient synchronizations across DDP processes for DDP model.
    Modified from:
    https://pytorch.org/docs/stable/_modules/torch/nn/parallel/distributed.html#DistributedDataParallel.no_sync
    Note that this is incompatible with static_graph=True and will be a no-op if static_graph=True.

    Within this context, gradients will be accumulated on module
    variables, which will later be synchronized in the first
    forward-backward pass exiting the context.

    For a model that is not this module's ``DistributedDataParallel`` wrapper the
    context manager does nothing.

    .. warning::
        The forward pass should be included inside the context manager, or
        else gradients will still be synchronized.
    """
    assert isinstance(model, torch.nn.Module)
    if isinstance(model, DistributedDataParallel):
        # Remember the current flag so it can be restored on exit.
        old_require_backward_grad_sync = model.require_backward_grad_sync
        if model.static_graph and model.require_backward_grad_sync != enabled:
            # With static_graph the flag is left untouched; warn once per model.
            if model.show_sync_grad_static_graph_warning:
                log.warning("DDP static_graph=True is incompatible with sync_grad(). Performance will be reduced.")
                model.show_sync_grad_static_graph_warning = False
        else:
            model.require_backward_grad_sync = enabled
    try:
        yield
    finally:
        if isinstance(model, DistributedDataParallel):
            model.require_backward_grad_sync = old_require_backward_grad_sync
def collate_batches(data_batches: list[dict[str, torch.Tensor]]) -> torch.Tensor | dict[str, torch.Tensor]:
    """Aggregate the list of data batches from all devices and process the results.

    This is used for gathering validation data batches with pid._ext.imaginaire.utils.dataloader.DistributedEvalSampler.
    It will return the data/output of the entire validation set in its original index order. The sizes of data_batches
    in different ranks may differ by 1 (if dataset size is not evenly divisible), in which case a dummy sample will be
    created before calling dist.all_gather().

    Args:
        data_batches (list[dict[str, torch.Tensor]]): List of tensors or (hierarchical) dictionary where
            leaf entries are tensors.

    Returns:
        data_gather (torch.Tensor | dict[str, torch.Tensor]): tensors or (hierarchical) dictionary where
            leaf entries are concatenated tensors.

    Raises:
        TypeError: If the entries are neither tensors nor mappings.
    """
    if isinstance(data_batches[0], torch.Tensor):
        # Concatenate the local data batches.
        data_concat = torch.cat(data_batches, dim=0)  # type: ignore
        # Get the largest number of local samples from all ranks to determine whether to dummy-pad on this rank.
        max_num_local_samples = torch.tensor(len(data_concat), device="cuda")
        dist.all_reduce(max_num_local_samples, op=dist.ReduceOp.MAX)
        if len(data_concat) < max_num_local_samples:
            assert len(data_concat) + 1 == max_num_local_samples
            dummy = torch.empty_like(data_concat[:1])
            data_concat = torch.cat([data_concat, dummy], dim=0)
            dummy_count = torch.tensor(1, device="cuda")
        else:
            dummy_count = torch.tensor(0, device="cuda")
        # Get all concatenated batches from all ranks and concatenate again.
        dist.all_reduce(dummy_count, op=dist.ReduceOp.SUM)
        data_concat = all_gather_tensor(data_concat.contiguous())
        # Stacking on dim=1 and flattening interleaves the per-rank samples.
        data_collate = torch.stack(data_concat, dim=1).flatten(start_dim=0, end_dim=1)
        # Remove the dummy samples.
        if dummy_count > 0:
            data_collate = data_collate[:-dummy_count]
    elif isinstance(data_batches[0], collections.abc.Mapping):
        # Recurse per key so nested dictionaries are collated leaf by leaf.
        data_collate = {}
        for key in data_batches[0].keys():
            data_collate[key] = collate_batches([data[key] for data in data_batches])  # type: ignore
    else:
        raise TypeError(
            f"collate_batches expects tensors or mappings of tensors, got {type(data_batches[0]).__name__}"
        )
    return data_collate


@torch.no_grad()
def all_gather_tensor(tensor: torch.Tensor) -> list[torch.Tensor]:
    """Gather the corresponding tensor from all GPU devices to a list.

    Args:
        tensor (torch.Tensor): Pytorch tensor.

    Returns:
        tensor_list (list[torch.Tensor]): A list of Pytorch tensors gathered from all GPU devices.
    """
    tensor_list = [torch.zeros_like(tensor) for _ in range(get_world_size())]
    dist.all_gather(tensor_list, tensor)
    return tensor_list


def broadcast(tensor, src, group=None, async_op=False):
    """Broadcast ``tensor`` in place from rank ``src``; a no-op on a single process.

    Args:
        tensor: Tensor to send (on ``src``) or to be overwritten (on other ranks).
        src: Source rank.
        group: Process group to use; ``None`` means the default group.
        async_op: Whether to launch the collective asynchronously.

    Returns:
        ``tensor`` for the single-process and synchronous cases. With ``async_op=True``
        on multiple processes, the handle returned by ``dist.broadcast`` so the caller
        can wait on it. Previously the multi-process path returned ``None`` and
        dropped the handle.
    """
    world_size = get_world_size()
    if world_size < 2:
        return tensor
    work = dist.broadcast(tensor, src=src, group=group, async_op=async_op)
    return work if async_op else tensor


def dist_reduce_tensor(tensor, rank=0, reduce="mean"):
    r"""Reduce ``tensor`` in place onto ``rank`` (rank 0 by default).

    Args:
        tensor: Tensor to reduce; modified in place.
        rank: Destination rank.
        reduce: ``"mean"`` divides the reduced tensor by the world size on the
            destination rank; ``"sum"`` leaves it as reduced.

    Returns:
        The same ``tensor`` object.

    Raises:
        NotImplementedError: On the destination rank, for any other ``reduce`` value.
    """
    world_size = get_world_size()
    if world_size < 2:
        return tensor
    with torch.no_grad():
        dist.reduce(tensor, dst=rank)
        if get_rank() == rank:
            if reduce == "mean":
                tensor /= world_size
            elif reduce == "sum":
                pass
            else:
                raise NotImplementedError
    return tensor


def sync_model_states(
    model: torch.nn.Module,
    process_group: Optional[dist.ProcessGroup] = None,
    src: int = 0,
    params_and_buffers_to_ignore: Optional[Container[str]] = None,
    broadcast_buffers: bool = True,
):
    """
    Modify based on DDP source code
    Synchronizes the parameters and buffers of a model across different processes in a distributed setting.

    This function ensures that all processes in the specified process group have the same initial parameters and
    buffers from the source rank, typically rank 0. It is useful when different processes start with different model
    states and a synchronization is required to ensure consistency across all ranks.

    Args:
        model (nn.Module): The model whose parameters and buffers are to be synchronized.
        process_group (dist.ProcessGroup, optional): The process group for communication. If None,
            the default group is used. Defaults to None.
        src (int, optional): The source rank from which parameters and buffers will be broadcasted.
            Defaults to 0.
        params_and_buffers_to_ignore (Optional[Container[str]], optional): A container of parameter and buffer
            names to exclude from synchronization. Defaults to None, which means all parameters and buffers are
            included.
        broadcast_buffers (bool, optional): Whether to broadcast buffers or not. Defaults to True.

    Side Effects:
        This function modifies the state of the model in-place to synchronize it with the source rank's model state.
        It returns without doing anything when torch.distributed is unavailable or not initialized, or when the
        model has no parameters left after filtering.

    Raises:
        RuntimeError: If the shapes of parameters across processes do not match, a runtime error will be raised.

    Examples:
        >>> # download the model weights from s3 on rank 0 only instead of on every rank;
        >>> # this saves network bandwidth and time when the weights are huge
        >>> if dist.get_rank() == 0:
        >>>     model.load_state_dict(network_bound_weights_download_fn(s3_weights_path))
        >>> dist.barrier()
        >>> sync_model_states(model) # sync rank0 weights to other ranks
    """
    if not dist.is_available() or not dist.is_initialized():
        return
    if process_group is None:
        process_group = _get_default_group()
    if not params_and_buffers_to_ignore:
        params_and_buffers_to_ignore = set()

    log.info(
        f"Synchronizing model states from rank {src} to all ranks in process group {get_process_group_ranks(process_group)}."
    )

    # Build tuple of (module, parameter) for all parameters that are not ignored.
    modules_and_parameters = [
        (module, parameter)
        for module_name, module in model.named_modules()
        for parameter in [
            param
            # Note that we access module.named_parameters instead of
            # parameters(module). parameters(module) is only needed in the
            # single-process multi device case, where it accesses replicated
            # parameters through _former_parameters.
            for param_name, param in module.named_parameters(recurse=False)
            if f"{module_name}.{param_name}" not in params_and_buffers_to_ignore
            # if param.requires_grad
            # and f"{module_name}.{param_name}" not in params_and_buffers_to_ignore
        ]
    ]

    # Deduplicate any parameters that might be shared across child modules.
    memo = set()
    modules_and_parameters = [
        # "p not in memo" is the deduplication check.
        # "not memo.add(p)" is always True, and it's only there to cause "add(p)" if needed.
        (m, p)
        for m, p in modules_and_parameters
        if p not in memo and not memo.add(p)  # type: ignore[func-returns-value]
    ]

    # Build list of parameters.
    parameters = [parameter for _, parameter in modules_and_parameters]
    if len(parameters) == 0:
        return

    _verify_param_shape_across_processes(process_group, parameters)

    _sync_module_states(
        module=model,
        process_group=process_group,
        broadcast_bucket_size=int(250 * 1024 * 1024),
        src=src,
        params_and_buffers_to_ignore=params_and_buffers_to_ignore,
        broadcast_buffers=broadcast_buffers,
    )
# Shared stdlib logger that every shim function below forwards to.
logger = logging.getLogger("invokeai.backend.pid")


def info(msg: Any, *args: Any, **kwargs: Any) -> None:
    """Emit ``msg`` at INFO level; keyword arguments are accepted and ignored."""
    logger.log(logging.INFO, str(msg), *args)


def warning(msg: Any, *args: Any, **kwargs: Any) -> None:
    """Emit ``msg`` at WARNING level; keyword arguments are accepted and ignored."""
    logger.log(logging.WARNING, str(msg), *args)


# Alias kept so call sites using the short name keep working.
warn = warning


def error(msg: Any, *args: Any, **kwargs: Any) -> None:
    """Emit ``msg`` at ERROR level; keyword arguments are accepted and ignored."""
    logger.log(logging.ERROR, str(msg), *args)


def debug(msg: Any, *args: Any, **kwargs: Any) -> None:
    """Emit ``msg`` at DEBUG level; keyword arguments are accepted and ignored."""
    logger.log(logging.DEBUG, str(msg), *args)


def critical(msg: Any, *args: Any, **kwargs: Any) -> None:
    """Emit ``msg`` at CRITICAL level; keyword arguments are accepted and ignored."""
    logger.log(logging.CRITICAL, str(msg), *args)


def exception(msg: Any, *args: Any, **kwargs: Any) -> None:
    """Emit ``msg`` at ERROR level together with the active exception's traceback."""
    logger.error(str(msg), *args, exc_info=True)


def trace(msg: Any, *args: Any, **kwargs: Any) -> None:
    """Emit ``msg`` at DEBUG level (this shim has no separate TRACE level)."""
    logger.log(logging.DEBUG, str(msg), *args)


def success(msg: Any, *args: Any, **kwargs: Any) -> None:
    """Emit ``msg`` at INFO level (this shim has no separate SUCCESS level)."""
    logger.log(logging.INFO, str(msg), *args)


def init_loguru_stdout() -> None:
    """No-op kept for API compatibility; this shim configures no stdout sink."""
    return None


def init_loguru_file(path: str) -> None:
    """No-op kept for API compatibility; ``path`` is ignored."""
    return None
@contextmanager
def timer(label: str) -> Iterator[None]:
    """Context manager that logs how long its body took, even if the body raises."""
    began = time.perf_counter()
    try:
        yield
    finally:
        elapsed = time.perf_counter() - began
        logger.info("%s took %.2fs", label, elapsed)


def set_random_seed(seed: int, by_rank: bool = False) -> None:
    """Seed Python's ``random``, NumPy and PyTorch (CPU and, if present, all CUDA devices).

    Args:
        seed: Base seed.
        by_rank: When True and a torch.distributed process group is initialized, the
            process rank is added to ``seed``. Any failure while querying the rank is
            ignored and the base seed is used.
    """
    if by_rank:
        try:
            import torch.distributed as dist

            distributed_ready = dist.is_available() and dist.is_initialized()
            if distributed_ready:
                seed = seed + dist.get_rank()
        except Exception:
            pass
    # Same seeding order as before: stdlib, NumPy, then torch.
    for seeder in (random.seed, np.random.seed, torch.manual_seed):
        seeder(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)


def requires_grad(model: torch.nn.Module, value: bool = True) -> None:
    """Set ``requires_grad`` to ``value`` on every parameter of ``model``."""
    for param in model.parameters():
        param.requires_grad = value
@dataclass(frozen=True)
class PIDCheckpoint:
    """One official PID decoder checkpoint."""

    experiment: str  # experiment name; also the checkpoint's directory name
    checkpoint_path: str  # path of the weights file, relative to the working directory
    pid_scale: int  # spatial upscaling factor of the decoder (sr4x -> 4, sr8x -> 8)


_CKPT_ROOT = "checkpoints"

VALID_CKPT_TYPES = ("2k", "2kto4k")


def _official(experiment: str, pid_scale: int = 4) -> PIDCheckpoint:
    """Build the entry for an official release stored under ``_CKPT_ROOT/<experiment>/``."""
    return PIDCheckpoint(
        experiment=experiment,
        checkpoint_path=f"{_CKPT_ROOT}/{experiment}/model_ema_bf16.pth",
        pid_scale=pid_scale,
    )


# The flux entries are shared by the ZImage backbones, so build them once and alias
# them below instead of repeating the literal values.
_FLUX_2K = _official("PiD_res2k_sr4x_official_flux_distill_4step")
_FLUX_2KTO4K = _official("PiD_res2kto4k_sr4x_official_flux_distill_4step")

PID_CHECKPOINT_REGISTRY: dict[tuple[str, str], PIDCheckpoint] = {
    # ---- 2k (the original 2048-trained release) ----
    ("flux", "2k"): _FLUX_2K,
    ("flux2", "2k"): _official("PiD_res2k_sr4x_official_flux2_distill_4step"),
    ("sd3", "2k"): _official("PiD_res2k_sr4x_official_sd3_distill_4step"),
    # Alias of the flux entry (previously a hand-copied duplicate that could drift).
    ("zimage", "2k"): _FLUX_2K,
    ("rae", "2k"): _official("PiD_res2k_sr4x_official_dinov2_distill_4step"),
    ("scale_rae", "2k"): _official("PiD_res2k_sr8x_official_siglip_distill_4step", pid_scale=8),
    # ---- 2kto4k (multi-res-trained, dynamic_shift-aware) ----
    ("flux", "2kto4k"): _FLUX_2KTO4K,
    ("flux2", "2kto4k"): _official("PiD_res2kto4k_sr4x_official_flux2_distill_4step"),
    ("sd3", "2kto4k"): _official("PiD_res2kto4k_sr4x_official_sd3_distill_4step"),
}
# ZImage and ZImage-Turbo use Flux1's 16-ch VAE for both ckpt types → alias to
# the flux entries. Keep explicit aliases (vs. duplicating) so updating "flux"
# updates these backbones too.
PID_CHECKPOINT_REGISTRY[("zimage_turbo", "2k")] = PID_CHECKPOINT_REGISTRY[("flux", "2k")]
PID_CHECKPOINT_REGISTRY[("zimage", "2kto4k")] = PID_CHECKPOINT_REGISTRY[("flux", "2kto4k")]
PID_CHECKPOINT_REGISTRY[("zimage_turbo", "2kto4k")] = PID_CHECKPOINT_REGISTRY[("flux", "2kto4k")]


def get_pid_checkpoint(backbone: str, ckpt_type: str = "2k") -> PIDCheckpoint:
    """Return the registered official PID checkpoint for `(backbone, ckpt_type)`.

    `ckpt_type` defaults to `"2k"` so existing call sites keep their pre-2kto4k
    behavior.

    Args:
        backbone: Backbone tag, e.g. `"flux"`, `"sd3"`, `"zimage_turbo"`.
        ckpt_type: One of `VALID_CKPT_TYPES`.

    Raises:
        KeyError: If `ckpt_type` is not valid, or the pair is not registered. The
            message lists the valid keys. A typical cause is asking for a `2kto4k`
            variant of a backbone that doesn't ship one (rae / scale_rae).
    """
    if ckpt_type not in VALID_CKPT_TYPES:
        raise KeyError(f"Unknown ckpt_type {ckpt_type!r}. Valid: {VALID_CKPT_TYPES}")
    try:
        return PID_CHECKPOINT_REGISTRY[(backbone, ckpt_type)]
    except KeyError as exc:
        valid = ", ".join(sorted(f"{b}+{t}" for b, t in PID_CHECKPOINT_REGISTRY))
        raise KeyError(f"Unknown (backbone, ckpt_type)=({backbone!r}, {ckpt_type!r}). Valid: {valid}") from exc


__all__ = [
    "PIDCheckpoint",
    "PID_CHECKPOINT_REGISTRY",
    "VALID_CKPT_TYPES",
    "get_pid_checkpoint",
]
@dataclass
class DiffusionPipelineConfig:
    """Static description of one diffusers backbone: how to load it and how its latents are normalized."""

    name: str  # registry key, e.g. "flux", "sdxl", "sd3", "flux2"
    pipeline_class: str  # dotted path, e.g. "diffusers.FluxPipeline"
    default_model_id: str  # HuggingFace model ID
    latent_channels: int  # 16 (Flux/SD3), 4 (SDXL), 32 (Flux2)
    spatial_compression: int  # 8
    # Affine normalization (Flux1/SDXL/SD3). Both are 0 for backbones that do not
    # use a fixed affine transform, or whose values are looked up at runtime.
    vae_scale_factor: float  # diffusers VAE scaling
    vae_shift_factor: float  # diffusers VAE shift (0 if none)
    # True when latents are normalized with BatchNorm running stats (Flux2).
    uses_bn_normalization: bool = False
    # True when latents are normalized with per-channel mean/std (QwenImage).
    uses_perchannel_normalization: bool = False
    # True when the VAE is a video-style 3D VAE producing 5D latents (QwenImage).
    has_temporal_dim: bool = False
    default_resolution: tuple[int, int] = (1024, 1024)
    default_num_inference_steps: int = 28
    default_guidance_scale: float = 3.5
    # Extra kwargs forwarded to pipeline.__call__
    extra_generate_kwargs: dict = field(default_factory=dict)


# ---------------------------------------------------------------------------
# Registry
# ---------------------------------------------------------------------------

# Declared as a flat tuple; the registry below is keyed by each config's own name.
_BACKBONE_CONFIGS: tuple[DiffusionPipelineConfig, ...] = (
    DiffusionPipelineConfig(
        name="flux",
        pipeline_class="diffusers.FluxPipeline",
        default_model_id="black-forest-labs/FLUX.1-dev",
        latent_channels=16,
        spatial_compression=8,
        vae_scale_factor=0.3611,
        vae_shift_factor=0.1159,
        default_resolution=(1024, 1024),
        default_num_inference_steps=28,
        default_guidance_scale=3.5,
        extra_generate_kwargs={"max_sequence_length": 512},
    ),
    DiffusionPipelineConfig(
        name="sdxl",
        pipeline_class="diffusers.StableDiffusionXLPipeline",
        default_model_id="stabilityai/stable-diffusion-xl-base-1.0",
        latent_channels=4,
        spatial_compression=8,
        vae_scale_factor=0.13025,
        vae_shift_factor=0.0,
        default_resolution=(1024, 1024),
        default_num_inference_steps=30,
        default_guidance_scale=7.5,
    ),
    DiffusionPipelineConfig(
        name="sd3",
        pipeline_class="diffusers.StableDiffusion3Pipeline",
        default_model_id="stabilityai/stable-diffusion-3-medium-diffusers",
        latent_channels=16,
        spatial_compression=8,
        vae_scale_factor=1.5305,
        vae_shift_factor=0.0609,
        default_resolution=(1024, 1024),
        default_num_inference_steps=28,
        default_guidance_scale=4.0,
    ),
    DiffusionPipelineConfig(
        name="flux2",
        pipeline_class="diffusers.Flux2Pipeline",
        default_model_id="black-forest-labs/FLUX.2-dev",
        latent_channels=32,
        spatial_compression=8,
        # Flux2 normalizes with BatchNorm running stats rather than an affine
        # scale/shift, so both factors are 0 and the BN flag is set instead.
        vae_scale_factor=0.0,
        vae_shift_factor=0.0,
        uses_bn_normalization=True,
        default_resolution=(1024, 1024),
        default_num_inference_steps=50,
        default_guidance_scale=4.0,
        extra_generate_kwargs={"max_sequence_length": 512},
    ),
    DiffusionPipelineConfig(
        name="qwenimage",
        pipeline_class="diffusers.QwenImagePipeline",
        default_model_id="Qwen/Qwen-Image",
        latent_channels=16,
        spatial_compression=8,
        # QwenImage normalizes with per-channel mean/std read from
        # pipeline.vae.config.latents_mean / latents_std, not affine scale/shift.
        vae_scale_factor=0.0,
        vae_shift_factor=0.0,
        uses_perchannel_normalization=True,
        has_temporal_dim=True,
        default_resolution=(1024, 1024),
        default_num_inference_steps=50,
        default_guidance_scale=4.0,
        extra_generate_kwargs={"max_sequence_length": 512, "true_cfg_scale": 4.0, "negative_prompt": " "},
    ),
    DiffusionPipelineConfig(
        name="zimage",
        pipeline_class="diffusers.ZImagePipeline",
        default_model_id="Tongyi-MAI/Z-Image",
        latent_channels=16,
        spatial_compression=8,
        # ZImage is affine, but the exact values depend on the pretrained checkpoint.
        # 0 makes denormalize_latent() read them from pipeline.vae.config.
        vae_scale_factor=0.0,
        vae_shift_factor=0.0,
        default_resolution=(1024, 1024),
        default_num_inference_steps=50,
        default_guidance_scale=5.0,
        extra_generate_kwargs={"max_sequence_length": 512},
    ),
    DiffusionPipelineConfig(
        name="zimage_turbo",
        pipeline_class="diffusers.ZImagePipeline",
        default_model_id="Tongyi-MAI/Z-Image-Turbo",
        latent_channels=16,
        spatial_compression=8,
        # Same VAE/latent convention as ZImage; values are read from
        # pipeline.vae.config by denormalize_latent() at runtime.
        vae_scale_factor=0.0,
        vae_shift_factor=0.0,
        default_resolution=(1024, 1024),
        # The model card describes Turbo as an 8-NFE distilled model. Diffusers'
        # example uses num_inference_steps=9, yielding 8 non-zero scheduler jumps
        # followed by the terminal sigma=0 sample.
        default_num_inference_steps=9,
        default_guidance_scale=0.0,
        extra_generate_kwargs={"max_sequence_length": 512},
    ),
)

PIPELINE_REGISTRY: dict[str, DiffusionPipelineConfig] = {entry.name: entry for entry in _BACKBONE_CONFIGS}


def get_config(name: str) -> DiffusionPipelineConfig:
    """Look up the config registered for backbone ``name``.

    Raises:
        ValueError: If ``name`` is not a registered backbone; the message lists the
            available names.
    """
    found = PIPELINE_REGISTRY.get(name)
    if found is None:
        raise ValueError(f"Unknown backbone '{name}'. Available: {list(PIPELINE_REGISTRY.keys())}")
    return found
def denormalize_latent(pipeline, latent: torch.Tensor, cfg: DiffusionPipelineConfig) -> torch.Tensor:
    """Reverse the latent normalization applied during VAE encode.

    Three conventions are supported, selected by flags on ``cfg``:

    * BatchNorm (``cfg.uses_bn_normalization``): ``raw = latent * bn_std + bn_mean``
      where ``bn_std = sqrt(running_var + eps)`` and the statistics are read from
      ``pipeline.vae.bn``.
    * Per-channel (``cfg.uses_perchannel_normalization``):
      ``raw = latent * latents_std + latents_mean`` with both lists read from
      ``pipeline.vae.config``.
    * Affine (default): ``raw = latent / scale + shift``. When
      ``cfg.vae_scale_factor`` is 0 the values are read at runtime from
      ``pipeline.vae.config.scaling_factor`` / ``shift_factor``.

    Only needed when manually feeding a latent to the pipeline's ``vae.decode()``,
    which expects the raw (un-normalized) latent space.

    Args:
        pipeline: Object exposing ``.vae`` (and ``.vae.bn`` / ``.vae.config`` as
            required by the selected convention).
        latent: Normalized latent with channels on dim 1. Any rank >= 2 is
            accepted; statistics are broadcast over the channel axis.
        cfg: Pipeline configuration carrying the normalization flags and factors.

    Returns:
        The de-normalized latent, same shape as ``latent``.

    Raises:
        ValueError: If the affine scale resolves to zero or is missing, which
            would otherwise produce inf values or a TypeError.
    """
    # Broadcast shape for per-channel statistics: (1, C, 1, ..., 1) matching latent's rank.
    stat_shape = (1, -1) + (1,) * (latent.ndim - 2)

    if cfg.uses_bn_normalization:
        bn = pipeline.vae.bn
        bn_mean = bn.running_mean.to(latent.device, latent.dtype)
        bn_var = bn.running_var.to(latent.device, latent.dtype)
        bn_std = (bn_var + bn.eps).sqrt()
        return latent * bn_std.view(stat_shape) + bn_mean.view(stat_shape)

    if cfg.uses_perchannel_normalization:
        vae_config = pipeline.vae.config
        latents_mean = torch.tensor(vae_config.latents_mean).view(stat_shape).to(latent.device, latent.dtype)
        latents_std = torch.tensor(vae_config.latents_std).view(stat_shape).to(latent.device, latent.dtype)
        return latent * latents_std + latents_mean

    # Affine scale/shift.
    scale = cfg.vae_scale_factor
    shift = cfg.vae_shift_factor
    if scale == 0.0:
        # Fallback: read from the pipeline's VAE config at runtime (e.g., ZImage).
        scale = pipeline.vae.config.scaling_factor
        shift = getattr(pipeline.vae.config, "shift_factor", None) or 0.0
    if not scale:
        raise ValueError(
            "Cannot denormalize latent: VAE scaling factor is zero or missing "
            "(neither cfg.vae_scale_factor nor pipeline.vae.config.scaling_factor is usable)."
        )
    return latent / scale + shift
+ """ + latent = raw_output.images # could be packed for Flux/Flux2 + + if cfg.name == "flux": + # Flux1: packed (B, seq_len, C) → (B, C, H, W) + from diffusers.pipelines.flux.pipeline_flux import FluxPipeline + + latent = FluxPipeline._unpack_latents( + latent, + height=height, + width=width, + vae_scale_factor=pipeline.vae_scale_factor, + ) + elif cfg.name == "flux2": + # Flux2: packed (B, seq_len, C) → (B, C, H, W) using position IDs. + # diffusers 0.37+ API: _unpack_latents_with_ids(x, x_ids) where x_ids are + # (B, H*W, 4) position coordinates generated by _prepare_latent_ids. + from diffusers.pipelines.flux2.pipeline_flux2 import Flux2Pipeline + + # Compute expected spatial dims in latent space (after VAE + 2x2 packing) + vae_sf = pipeline.vae_scale_factor # typically 8 + latent_h = height // (vae_sf * 2) + latent_w = width // (vae_sf * 2) + # _prepare_latent_ids takes a (B, C, H, W) tensor and reads .shape + dummy = torch.zeros(latent.shape[0], 1, latent_h, latent_w, device=latent.device) + latent_ids = Flux2Pipeline._prepare_latent_ids(dummy).to(latent.device) + result = Flux2Pipeline._unpack_latents_with_ids(latent, latent_ids) + # _unpack_latents_with_ids returns a list/stacked tensor (B, C, H, W) + latent = result if isinstance(result, torch.Tensor) else torch.stack(result, dim=0) + elif cfg.name == "qwenimage": + # QwenImage: packed (B, seq_len, C) → (B, C, 1, H, W) with temporal dim + from diffusers.pipelines.qwenimage.pipeline_qwenimage import QwenImagePipeline + + latent = QwenImagePipeline._unpack_latents( + latent, + height=height, + width=width, + vae_scale_factor=pipeline.vae_scale_factor, + ) + # Squeeze temporal dim: (B, C, 1, H, W) → (B, C, H, W) + latent = latent.squeeze(2) + + # ZImage: already (B, C, H, W), no unpacking needed. 
+ + if latent.ndim != 4: + raise RuntimeError(f"Expected 4-D latent (B, C, H, W) after extraction, got shape {latent.shape}") + return latent + + +def decode_with_pipeline_vae(pipeline, latent: torch.Tensor, cfg: DiffusionPipelineConfig) -> torch.Tensor: + """Standard VAE decode using the pipeline's own VAE. + + Takes the *normalized* latent (as returned by output_type="latent"), + denormalizes it, and decodes to pixel space. + + Returns: (B, 3, H, W) float tensor in [0, 1]. + """ + raw_latent = denormalize_latent(pipeline, latent, cfg) + + if cfg.uses_bn_normalization: + # Flux2 VAE: unpatch before decoding. + # raw_latent is (B, C_packed, pH, pW) — C_packed = latent_channels * patch_h * patch_w. + # Must undo patchification to get (B, latent_channels, H/8, W/8) before vae.decode(). + from diffusers.pipelines.flux2.pipeline_flux2 import Flux2Pipeline + + raw_latent = Flux2Pipeline._unpatchify_latents(raw_latent) + + if cfg.has_temporal_dim: + # Video-style 3D VAE (e.g., QwenImage): expects (B, C, T, H, W) + raw_latent = raw_latent.unsqueeze(2) + + # Match VAE dtype — schedulers often output float32 while VAE weights are bfloat16. 
def print_latent_stats(latent: torch.Tensor, label: str = "latent"):
    """Print shape, mean, std, min and max of a tensor for latent debugging.

    Output for non-empty floating-point tensors is unchanged. Two inputs that
    previously raised are now handled:

    * Empty tensors print the shape followed by ``(empty)``, because ``min`` and
      ``max`` are undefined for them.
    * Integer and bool tensors are cast to float before the statistics are
      computed, because ``mean`` and ``std`` require a floating dtype.

    Args:
        latent: Tensor to summarize.
        label: Tag printed in square brackets in front of the statistics.
    """
    with torch.no_grad():
        header = f" [{label}] shape={list(latent.shape)} "
        if latent.numel() == 0:
            print(header + "(empty)")
            return
        values = latent if latent.is_floating_point() else latent.float()
        print(
            header
            + f"mean={values.mean().item():.4f} std={values.std().item():.4f} "
            + f"min={values.min().item():.4f} max={values.max().item():.4f}"
        )
class PidDistillModel(PidModel):
    """Inference-only PID distilled student.

    Wraps the student network (``self.net``) with a one-step or few-step sampler
    and the checkpoint helpers needed to load the student's ``net.*`` weights.
    """

    def __init__(self, config: PidDistillModelConfig):
        # Stubs left in place so any parent code that probes for these attributes
        # gets None instead of AttributeError.
        self.teacher = None
        self.fake_score = None
        self.discriminator = None
        super().__init__(config)

    # ---------------------------------------------------------------------
    # Net output <-> (x0, velocity) conversion
    # ---------------------------------------------------------------------

    def _net_output_to_x0(
        self,
        x_t: torch.Tensor,
        net_output: torch.Tensor,
        t: torch.Tensor,
        prediction_type: str,
    ) -> torch.Tensor:
        """Convert a network output into an x0 estimate.

        Args:
            x_t: Current sample; its dtype is the dtype of the result.
            net_output: Raw network output, interpreted per ``prediction_type``.
            t: Per-sample timestep with ``x_t.shape[0]`` elements.
            prediction_type: ``"x0"`` (output is returned, cast to ``x_t.dtype``)
                or ``"velocity"`` (``x0 = x_t - t * v``, evaluated in float64).

        Raises:
            ValueError: For any other ``prediction_type``.
        """
        if prediction_type == "x0":
            return net_output.to(x_t.dtype)
        if prediction_type == "velocity":
            original_dtype = x_t.dtype
            # Reshape t to (B, 1, ..., 1) so it broadcasts over the non-batch dims.
            s = [x_t.shape[0]] + [1] * (x_t.ndim - 1)
            t_shaped = t.double().view(*s)
            return (x_t.double() - t_shaped * net_output.double()).to(original_dtype)
        raise ValueError(f"Invalid prediction_type: {prediction_type}")

    def _net_output_to_velocity(
        self,
        x_t: torch.Tensor,
        net_output: torch.Tensor,
        t: torch.Tensor,
        prediction_type: str,
    ) -> torch.Tensor:
        """Convert a network output into a velocity estimate.

        For ``"velocity"`` the output is returned untouched. For ``"x0"`` the
        velocity is ``(x_t - x0) / t`` evaluated in float64, with ``t`` clamped
        to at least 5e-2 so the division stays bounded near ``t = 0``.

        Raises:
            ValueError: For any other ``prediction_type``.
        """
        if prediction_type == "velocity":
            return net_output
        if prediction_type == "x0":
            original_dtype = x_t.dtype
            s = [x_t.shape[0]] + [1] * (x_t.ndim - 1)
            t_shaped = t.double().view(*s).clamp(min=5e-2)
            return ((x_t.double() - net_output.double()) / t_shaped).to(original_dtype)
        raise ValueError(f"Invalid prediction_type: {prediction_type}")

    def _velocity_to_x0(self, x_t: torch.Tensor, net_output: torch.Tensor, t: torch.Tensor) -> torch.Tensor:
        """Shorthand for ``_net_output_to_x0`` using ``self.config.prediction_type``."""
        return self._net_output_to_x0(x_t, net_output, t, self.config.prediction_type)

    # ---------------------------------------------------------------------
    # Multi-step student sampler
    # ---------------------------------------------------------------------

    def _get_t_list(self, device, num_steps: Optional[int] = None) -> torch.Tensor:
        """Build the timestep schedule for the sampler.

        Args:
            device: Device on which the schedule tensor is created.
            num_steps: Number of sampler steps; defaults to
                ``config.student_sample_steps``.

        Returns:
            A float32 1-D tensor of ``steps + 1`` timesteps. When
            ``config.student_t_list`` is set it is used directly, or sub-sampled
            at evenly spaced (rounded) indices if ``num_steps`` differs from the
            configured step count. Otherwise the schedule is a linear ramp from
            ``config.student_timestep`` down to 0.

        Raises:
            AssertionError: If the resulting schedule does not end at 0.
        """
        target_steps = num_steps if num_steps is not None else self.config.student_sample_steps

        if self.config.student_t_list is not None:
            full_t = torch.tensor(self.config.student_t_list, device=device, dtype=torch.float32)
            if target_steps != self.config.student_sample_steps:
                indices = torch.linspace(0, len(full_t) - 1, target_steps + 1).round().long()
                t_list = full_t[indices]
            else:
                t_list = full_t
        else:
            t_list = torch.linspace(
                self.config.student_timestep,
                0.0,
                target_steps + 1,
                device=device,
                dtype=torch.float32,
            )
        assert abs(t_list[-1].item()) < 1e-6, "t_list must end at 0"
        if num_steps is not None:
            logger.info(f"[distill inference] num_steps={num_steps}, t_list={t_list.tolist()}")
        return t_list

    def _student_sample_loop(
        self,
        noise: torch.Tensor,
        t_list: torch.Tensor,
        caption_embs: torch.Tensor,
        lq_video_or_image: Optional[torch.Tensor],
        lq_latent: Optional[torch.Tensor],
        degrade_sigma_tensor: Optional[torch.Tensor],
        generator: Optional[torch.Generator] = None,
    ) -> torch.Tensor:
        """Run the few-step sampler over consecutive pairs of ``t_list``.

        At each step the network is evaluated at ``t_cur * timescale``. While
        ``t_next > 0`` the sample is advanced either by an Euler step along the
        predicted velocity (``student_sample_type == "ode"``) or by re-noising
        the x0 prediction to level ``t_next`` with fresh Gaussian noise (any
        other sample type). On the final step (``t_next == 0``) the x0
        prediction is returned directly.

        Args:
            noise: Initial sample; its batch size drives the timestep expansion.
            t_list: Schedule as produced by ``_get_t_list``.
            caption_embs: Text conditioning passed to the network.
            lq_video_or_image: Optional LQ image conditioning.
            lq_latent: Optional LQ latent conditioning.
            degrade_sigma_tensor: Optional per-sample degradation sigma.
            generator: RNG used for the re-noising draws.

        Returns:
            The final x0 estimate.
        """
        B = noise.shape[0]
        timescale = self.fm_trainer.timescale
        autocast_ctx = torch.autocast("cuda", dtype=self.autocast_dtype) if self.autocast_dtype else nullcontext()
        x = noise
        net = self.net

        with autocast_ctx:
            for t_cur, t_next in zip(t_list[:-1], t_list[1:], strict=True):
                t_cur_batch = t_cur.expand(B)
                t_cur_scaled = t_cur_batch * timescale

                v_pred = net(
                    x,
                    t_cur_scaled,
                    caption_embs,
                    lq_video_or_image=lq_video_or_image,
                    lq_latent=lq_latent,
                    degrade_sigma=degrade_sigma_tensor,
                )

                if t_next.item() > 0:
                    if self.config.student_sample_type == "ode":
                        v_for_step = self._net_output_to_velocity(x, v_pred, t_cur_batch, self.config.prediction_type)
                        dt = t_next - t_cur
                        x = x + dt * v_for_step
                    else:
                        x0_pred = self._velocity_to_x0(x, v_pred, t_cur_batch)
                        eps_infer = torch.randn(
                            x0_pred.shape,
                            device=x0_pred.device,
                            dtype=x0_pred.dtype,
                            generator=generator,
                        )
                        s = [B] + [1] * (x.ndim - 1)
                        t_next_bcast = t_next.reshape(1).expand(s)
                        x = (1.0 - t_next_bcast) * x0_pred + t_next_bcast * eps_infer
                else:
                    x = self._velocity_to_x0(x, v_pred, t_cur_batch)

        return x

    # ---------------------------------------------------------------------
    # Inference entry point
    # ---------------------------------------------------------------------

    @torch.no_grad()
    def generate_samples_from_batch(
        self,
        data_batch: dict,
        guidance: Optional[float] = None,
        cfg_scale: Optional[float] = None,
        num_steps: Optional[int] = None,
        seed: int = 0,
        image_size=None,
        shift: Optional[float] = None,
        is_negative_prompt: bool = False,
        **kwargs,
    ):
        """Generate samples with the distilled student.

        Note that ``data_batch`` is mutated: ``"LQ_latent"`` and
        ``"degrade_sigma"`` entries are added when they can be derived and are
        missing.

        Args:
            data_batch: Must contain captions under ``config.input_caption_key``.
                May contain ``"LQ_video_or_image"``, ``"LQ_latent"``,
                ``"degrade_sigma"`` (scalar, list/tuple of length B, or tensor)
                and a reference image under ``config.input_data_key`` whose
                last two dims give the output size.
            guidance: Accepted for interface compatibility; not read here.
            cfg_scale: Accepted for interface compatibility; not read here.
            num_steps: Sampler steps; defaults to ``config.student_sample_steps``.
            seed: Seed for the CUDA generator driving all noise draws.
            image_size: Output size as an int or ``(h, w)``. Used when given, or
                when ``data_batch`` has no reference image; falls back to
                ``config.image_size``.
            shift: Optional explicit shift. Only resolved here, never consumed
                by the sampler.
            is_negative_prompt: Accepted for interface compatibility; not read here.

        Returns:
            Samples clamped to [-1, 1] with a singleton dim inserted at index 2,
            i.e. shape ``(B, 3, 1, H, W)``.
        """
        # Encode any missing LQ_latent via the frozen VAE so callers can pass either
        # LQ_video_or_image or LQ_latent.
        if "LQ_latent" not in data_batch and "LQ_video_or_image" in data_batch and self.vae_encoder is not None:
            data_batch["LQ_latent"] = (
                self.encode_lq_latent(data_batch["LQ_video_or_image"]).contiguous().to(**self.tensor_kwargs)
            )
        if "degrade_sigma" not in data_batch and "LQ_latent" in data_batch:
            B = data_batch["LQ_latent"].shape[0]
            data_batch["degrade_sigma"] = torch.zeros(B, device=data_batch["LQ_latent"].device, dtype=torch.float32)

        x0_key = self.config.input_data_key
        if image_size is None and x0_key in data_batch:
            x0_shape = data_batch[x0_key].shape
            img_h, img_w = x0_shape[-2], x0_shape[-1]
        else:
            image_size = image_size or self.config.image_size
            if isinstance(image_size, (list, tuple)):
                img_h, img_w = int(image_size[0]), int(image_size[1])
            else:
                img_h = img_w = int(image_size)

        # Determine shift: explicit arg > SD3-style dynamic_shift (if configured) > config default.
        # The 4-step distilled sampler doesn't consume `shift` directly (it uses
        # student_t_list), but we keep the precedence ladder symmetric with the
        # non-distilled inference path in case future call sites read it.
        if shift is None and self.config.dynamic_shift is not None:
            import math

            _ds = self.config.dynamic_shift
            shift = _ds["base_shift"] * math.sqrt(max(img_h, img_w) / _ds["base_image_size_for_shift_calc"])

        captions = data_batch[self.config.input_caption_key]
        if isinstance(captions, str):
            captions = [captions]
        B = len(captions)
        if self.config.use_fixed_prompt:
            captions = [self.config.fixed_positive_prompt] * B
        caption_embs, _ = self._encode_text_raw(captions)
        caption_embs = caption_embs.to(**self.tensor_kwargs)

        lq_video_or_image = None
        lq_latent = None
        if self.config.lq_condition_type in ("image", "image_latent"):
            lq_video_or_image = data_batch.get("LQ_video_or_image")
            if lq_video_or_image is not None:
                lq_video_or_image = lq_video_or_image.to(**self.tensor_kwargs)
        if self.config.lq_condition_type in ("latent", "image_latent"):
            lq_latent = data_batch.get("LQ_latent")
            if lq_latent is not None:
                lq_latent = lq_latent.to(**self.tensor_kwargs)

        # Normalize degrade_sigma (tensor / sequence / scalar) to a float32 tensor of shape (B,).
        sigma_val = data_batch.get("degrade_sigma", 0.0)
        if isinstance(sigma_val, torch.Tensor):
            degrade_sigma_tensor = sigma_val.to(device="cuda", dtype=torch.float32).reshape(-1)
            if degrade_sigma_tensor.numel() == 1:
                degrade_sigma_tensor = degrade_sigma_tensor.expand(B).contiguous()
            assert degrade_sigma_tensor.shape == (B,), (
                f"data_batch['degrade_sigma'] expected [B={B}], got {tuple(degrade_sigma_tensor.shape)}"
            )
        elif isinstance(sigma_val, (list, tuple)):
            degrade_sigma_tensor = torch.tensor(sigma_val, device="cuda", dtype=torch.float32)
            assert degrade_sigma_tensor.shape == (B,), (
                f"data_batch['degrade_sigma'] expected length {B}, got {len(sigma_val)}"
            )
        else:
            degrade_sigma_tensor = torch.full((B,), float(sigma_val), device="cuda", dtype=torch.float32)

        gen = torch.Generator(device="cuda").manual_seed(int(seed))
        noise = torch.randn(B, 3, img_h, img_w, device="cuda", generator=gen)

        autocast_ctx = torch.autocast("cuda", dtype=self.autocast_dtype) if self.autocast_dtype else nullcontext()
        net = self.net
        net.eval()

        effective_steps = num_steps if num_steps is not None else self.config.student_sample_steps

        if effective_steps == 1:
            # Single-step path: one network call at student_timestep, converted straight to x0.
            t_student = torch.full((B,), self.config.student_timestep, device="cuda", dtype=torch.float32)
            t_student_scaled = t_student * self.fm_trainer.timescale
            with autocast_ctx:
                v_student = net(
                    noise,
                    t_student_scaled,
                    caption_embs,
                    lq_video_or_image=lq_video_or_image,
                    lq_latent=lq_latent,
                    degrade_sigma=degrade_sigma_tensor,
                )
            x0_student = self._velocity_to_x0(noise, v_student, t_student)
        else:
            t_list = self._get_t_list(device=torch.device("cuda"), num_steps=num_steps)
            x0_student = self._student_sample_loop(
                noise,
                t_list,
                caption_embs,
                lq_video_or_image,
                lq_latent,
                degrade_sigma_tensor,
                generator=gen,
            )

        return x0_student.clamp(-1, 1).unsqueeze(2)

    # ---------------------------------------------------------------------
    # Checkpoint helpers (only the student `net.` prefix matters at inference)
    # ---------------------------------------------------------------------

    def model_dict(self) -> dict:
        """Return the sub-modules that make up the checkpoint (student net only)."""
        return {"net": self.net}

    def state_dict(self, *args, **kwargs):
        """Return the student net's state dict with every key prefixed by ``net.``.

        Positional and keyword arguments are accepted but ignored.
        """
        return self.net.state_dict(prefix="net.")

    def load_state_dict(self, state_dict, strict=True, assign=False, **kwargs):
        """Load student weights from a (possibly training-time) checkpoint.

        Keys prefixed ``net.`` are stripped of the prefix; keys prefixed
        ``net_ema.``, ``fake_score.`` or ``discriminator.`` are dropped; all
        other keys are passed through unchanged. The net is always loaded
        non-strictly; ``strict`` only controls whether missing keys other than
        ``lq_proj`` ones are reported with a warning.

        Returns:
            The result of ``self.net.load_state_dict`` (missing and unexpected
            keys), so callers can inspect what was not loaded. Previously this
            method returned None.
        """
        _net_sd = OrderedDict()
        for k, v in state_dict.items():
            if k.startswith("net.") and not k.startswith("net_ema."):
                _net_sd[k[len("net.") :]] = v
            elif k.startswith("net_ema.") or k.startswith("fake_score.") or k.startswith("discriminator."):
                continue
            else:
                _net_sd[k] = v

        result = self.net.load_state_dict(_net_sd, strict=False, assign=assign)
        missing, unexpected = result
        if missing:
            lq_missing = [k for k in missing if "lq_proj" in k]
            other_missing = [k for k in missing if "lq_proj" not in k]
            if lq_missing:
                logger.info(f"Expected missing LQ keys ({len(lq_missing)} keys)")
            if other_missing and strict:
                logger.warning(f"Missing keys in net: {other_missing}")
        if unexpected:
            logger.warning(f"Unexpected keys in net: {unexpected}")
        return result

    def on_train_start(self, memory_format=torch.preserve_format) -> None:
        """Forward to the parent hook unchanged."""
        super().on_train_start(memory_format)
class PidModel(PixelDiTModel):
    """PID (PixelDiT SR) inference model (frozen VAE + LQ-conditioned student).

    On top of ``PixelDiTModel`` this class only adds ``vae_encoder``, the
    optional frozen VAE used by ``encode_lq_latent``.
    """

    def __init__(self, config: PidModelConfig):
        """Build the base model, then instantiate the VAE if one is configured.

        When ``config.tokenizer`` is None, ``vae_encoder`` is set to None and a
        warning is logged. When it is set and ``config.state_ch > 0``, the VAE's
        ``latent_ch`` must equal ``config.state_ch``.
        """
        super().__init__(config)

        if config.tokenizer is not None:
            with misc.timer("PidModel: load_vae"):
                # The VAE is built purely from the lazy config. No import of the
                # `_src.tokenizers` package is needed here (it was used only for a
                # type annotation), so this path no longer depends on that package
                # being vendored.
                self.vae_encoder = lazy_instantiate(config.tokenizer)
            if config.state_ch > 0:
                assert self.vae_encoder.latent_ch == config.state_ch, (
                    f"latent_ch {self.vae_encoder.latent_ch} != state_ch {config.state_ch}"
                )
        else:
            self.vae_encoder = None
            logger.warning("No VAE configured — LQ latent encoding disabled.")

    @torch.no_grad()
    def encode_lq_latent(self, lq_image: Tensor) -> Tensor:
        """Encode an LQ image through the frozen VAE.

        A 4-D input gets a singleton dim inserted at index 2 before encoding,
        and a 5-D encoder output has index 0 of dim 2 selected, so 4-D in
        yields 4-D out.

        Args:
            lq_image: [B, C, H_lq, W_lq] in [-1, 1].

        Returns:
            LQ latent [B, z_dim, zH, zW].

        Raises:
            RuntimeError: If the model was built without a VAE
                (``config.tokenizer`` was None).
        """
        if self.vae_encoder is None:
            raise RuntimeError("encode_lq_latent() requires a VAE, but config.tokenizer was None.")
        if lq_image.ndim == 4:
            lq_image = lq_image.unsqueeze(2)
        latent = self.vae_encoder.encode(lq_image)
        if latent.ndim == 5:
            latent = latent[:, :, 0, :, :]
        return latent
+ +from __future__ import annotations + +import logging +from typing import Any + +import attrs +import torch +import torch.nn as nn +from torch import Tensor + +from invokeai.backend.pid._ext.imaginaire.lazy_config import instantiate as lazy_instantiate +from invokeai.backend.pid._ext.imaginaire.model import ImaginaireModel +from invokeai.backend.pid._ext.imaginaire.utils import misc +from invokeai.backend.pid._src.utils.context_parallel import broadcast as cp_broadcast +from invokeai.backend.pid._src.utils.context_parallel import robust_broadcast + +try: + from megatron.core import parallel_state +except ImportError: + parallel_state = None # CP is opt-in; gracefully degrade when megatron is absent + +logger = logging.getLogger(__name__) + + +@attrs.define(slots=False) +class _EMAStubConfig: + """Minimal stub kept so that DCP ModelWrapper.state_dict() can read `config.ema.enabled`.""" + + enabled: bool = False + rate: float = 0.1 + iteration_shift: int = 0 + + +@attrs.define(slots=False) +class PixelDiTModelConfig: + net: Any = None + precision: str = "bfloat16" + ema: _EMAStubConfig = attrs.Factory(_EMAStubConfig) + + input_data_key: str = "image" + input_caption_key: str = "caption" + + text_encoder_name: str = "gemma-2-2b-it" + caption_channels: int = 2304 + y_norm: bool = True + y_norm_scale_factor: float = 0.01 + model_max_length: int = 300 + chi_prompt: list = attrs.Factory(list) + conditioner: Any = None + + # Flow matching: only `fm_timescale` is read at inference (network expects + # t * timescale as its scalar timestep input). 
def _load_text_encoder(name: str, device: str = "cuda"):
    """Load the frozen tokenizer and text encoder registered under ``name``.

    In a distributed run, rank 0 loads first while the other ranks wait at a
    barrier; they load after rank 0 has finished.

    Args:
        name: Key of ``_TEXT_ENCODER_DICT``.
        device: Device the text encoder is moved to.

    Returns:
        ``(tokenizer, text_encoder)``. The tokenizer pads on the right. The
        encoder is the decoder stack of the causal LM, loaded in bfloat16, put
        in eval mode with gradients disabled.

    Raises:
        AssertionError: If ``name`` is not a supported text encoder.
    """
    import torch.distributed as dist
    from transformers import AutoModelForCausalLM, AutoTokenizer

    assert name in _TEXT_ENCODER_DICT, f"Unsupported text encoder: {name}"
    model_id = _TEXT_ENCODER_DICT[name]

    # is_available() guards builds of torch that ship without distributed support.
    is_distributed = dist.is_available() and dist.is_initialized()
    is_rank0 = (not is_distributed) or (dist.get_rank() == 0)

    if is_distributed and not is_rank0:
        dist.barrier()

    try:
        tokenizer = AutoTokenizer.from_pretrained(model_id)
        tokenizer.padding_side = "right"
        text_encoder = (
            AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16).get_decoder().to(device)
        )
        text_encoder.eval()
        text_encoder.requires_grad_(False)
    finally:
        # Rank 0 must always reach the barrier, even when loading fails,
        # otherwise the waiting ranks would block forever.
        if is_distributed and is_rank0:
            dist.barrier()

    return tokenizer, text_encoder
PixelDiTModelConfig): + super().__init__() + self.config = config + + if config.dynamic_shift is not None: + _ds = config.dynamic_shift + logger.info( + f"PixelDiT dynamic shift: base_shift={_ds['base_shift']} " + f"base_image_size={_ds['base_image_size_for_shift_calc']}" + ) + + _dtype_map = {"float32": torch.float32, "float16": torch.float16, "bfloat16": torch.bfloat16} + requested_dtype = _dtype_map[config.precision] + if requested_dtype != torch.float32: + self.autocast_dtype = requested_dtype + self.precision = torch.float32 + else: + self.autocast_dtype = None + self.precision = torch.float32 + self.tensor_kwargs = {"device": "cuda", "dtype": self.precision} + + with misc.timer("PixelDiTModel: build_net"): + self.net = lazy_instantiate(config.net) + self.net = self.net.to(device="cuda", dtype=torch.float32) + self.net.requires_grad_(True) + if hasattr(self.net, "init_weights"): + self.net.init_weights() + logger.info(f"PixDiT_T2I params: {sum(p.numel() for p in self.net.parameters()):,}") + + # Frozen text encoder. Use object.__setattr__ so DCP / nn.Module don't try to + # register it as a child / save it in state_dict. + with misc.timer("PixelDiTModel: load_text_encoder"): + _tokenizer, _text_encoder = _load_text_encoder(config.text_encoder_name, device="cuda") + object.__setattr__(self, "tokenizer", _tokenizer) + object.__setattr__(self, "text_encoder", _text_encoder) + self._chi_prompt_str = "\n".join(config.chi_prompt) if config.chi_prompt else "" + self._num_chi_tokens = len(self.tokenizer.encode(self._chi_prompt_str)) if self._chi_prompt_str else 0 + self._null_caption_embs = self._encode_text_raw([config.negative_prompt if config.negative_prompt else ""])[ + 0 + ] + + # Tiny flow-matching shim: only `timescale` is consumed by inference. 
+ self.fm_trainer = _FlowMatchingTimescale(config.fm_timescale) + + self.conditioner = lazy_instantiate(config.conditioner) + logger.info(f"PixelDiT conditioner: {self.conditioner}") + + # --------------------------------------------------------------------- + # Text encoding + # --------------------------------------------------------------------- + + @torch.no_grad() + def _encode_text_raw(self, captions: list[str]) -> tuple[Tensor, Tensor]: + if self._chi_prompt_str: + prompts_all = [self._chi_prompt_str + cap for cap in captions] + max_length_all = self._num_chi_tokens + self.config.model_max_length - 2 + else: + prompts_all = captions + max_length_all = self.config.model_max_length + + caption_token = self.tokenizer( + prompts_all, + max_length=max_length_all, + padding="max_length", + truncation=True, + return_tensors="pt", + ).to("cuda") + + caption_embs = self.text_encoder(caption_token.input_ids, caption_token.attention_mask)[0] + + select_index = [0] + list(range(-self.config.model_max_length + 1, 0)) + caption_embs = caption_embs[:, select_index] + emb_masks = caption_token.attention_mask[:, select_index] + return caption_embs, emb_masks + + def _normalize_image(self, img: Tensor) -> Tensor: + if img.dtype == torch.uint8: + return img.float() / 127.5 - 1.0 + elif img.max() > 1.0: + return img.float() / 127.5 - 1.0 + else: + if img.min() >= 0: + return img.float() * 2.0 - 1.0 + return img.float() + + # --------------------------------------------------------------------- + # Context-parallel helpers (no-op when megatron CP isn't initialized). 
+ # --------------------------------------------------------------------- + + @staticmethod + def get_context_parallel_group(): + if parallel_state is not None and parallel_state.is_initialized(): + return parallel_state.get_context_parallel_group() + return None + + def _maybe_enable_cp_on_nets(self, nets: list) -> None: + cp_group = self.get_context_parallel_group() + for net in nets: + if net is None: + continue + if cp_group is None or cp_group.size() <= 1: + if hasattr(net, "disable_context_parallel") and getattr(net, "is_context_parallel_enabled", False): + net.disable_context_parallel() + else: + if hasattr(net, "enable_context_parallel"): + net.enable_context_parallel(cp_group) + + def _broadcast_tensor_for_cp(self, t: Tensor | None) -> Tensor | None: + cp_group = self.get_context_parallel_group() + if t is None or cp_group is None or cp_group.size() <= 1: + return t + from torch.distributed import get_process_group_ranks + + src = min(get_process_group_ranks(cp_group)) + return robust_broadcast(t.contiguous(), src=src, pg=cp_group) + + def _broadcast_object_for_cp(self, obj): + return cp_broadcast(obj, self.get_context_parallel_group()) + + # --------------------------------------------------------------------- + # Checkpoint helpers — the distill subclass overrides these for its + # net.* / fake_score.* / discriminator.* prefix routing. 
+ # --------------------------------------------------------------------- + + def state_dict(self, *args, **kwargs): + return self.net.state_dict(prefix="net.") + + def load_state_dict(self, state_dict, strict=True, assign=False, **kwargs): + has_core_keys = any(k.startswith("core.") for k in state_dict) + has_net_keys = any(k.startswith("net.") for k in state_dict) + + if has_core_keys and not has_net_keys: + logger.info("Loading original PixelDiT checkpoint (core.* prefix)") + net_sd = {} + for k, v in state_dict.items(): + if k == "pos_embed": + continue + if k.startswith("core."): + net_sd[k[len("core.") :]] = v + self.net.load_state_dict(net_sd, strict=False, assign=assign) + else: + _net_sd = { + k[len("net.") :]: v + for k, v in state_dict.items() + if k.startswith("net.") and not k.startswith("net_ema.") + } + if _net_sd: + self.net.load_state_dict(_net_sd, strict=strict, assign=assign) diff --git a/invokeai/backend/pid/_src/modules/__init__.py b/invokeai/backend/pid/_src/modules/__init__.py new file mode 100644 index 00000000000..7ab23eecabc --- /dev/null +++ b/invokeai/backend/pid/_src/modules/__init__.py @@ -0,0 +1,15 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
def batch_mul(x, y):
    """Multiply ``x`` by ``y`` after right-padding the lower-rank operand with size-1 axes.

    The leading axes shared by both operands must have equal sizes; this is
    checked with an assertion.
    """
    x_rank, y_rank = x.ndim, y.ndim
    for axis in range(min(x_rank, y_rank)):
        assert x.shape[axis] == y.shape[axis], f"Dimensions not equal at axis {axis}"
    padding = (1,) * abs(x_rank - y_rank)
    if x_rank < y_rank:
        x = x.reshape(x.shape + padding)
    elif y_rank < x_rank:
        y = y.reshape(y.shape + padding)
    return x * y


T = TypeVar("T", bound="BaseCondition")


def broadcast_condition(condition: BaseCondition, process_group: Optional[ProcessGroup] = None) -> BaseCondition:
    """Return a copy of ``condition`` whose non-``None`` fields were passed through ``broadcast``.

    A condition that is already flagged as broadcasted is returned unchanged.
    The copy has the same concrete type and ``_is_broadcasted`` set to ``True``.
    """
    if condition.is_broadcasted:
        return condition

    synced = {
        name: (value if value is None else broadcast(value, process_group))
        for name, value in condition.to_dict(skip_underscore=False).items()
    }
    synced["_is_broadcasted"] = True
    return type(condition)(**synced)


@dataclass(frozen=True)
class BaseCondition(ABC):  # noqa: B024  # upstream marker base class — no abstract methods by design
    """Immutable base class for condition containers.

    Attributes:
        _is_broadcasted: internal flag recording that the condition has already
            been broadcast across a process group.
    """

    _is_broadcasted: bool = False

    def to_dict(self, skip_underscore: bool = True) -> Dict[str, Any]:
        """Return the dataclass fields as a ``{name: value}`` dictionary.

        Args:
            skip_underscore: when true, fields whose name starts with ``_`` are omitted.
        """
        result: Dict[str, Any] = {}
        for spec in fields(self):
            if skip_underscore and spec.name.startswith("_"):
                continue
            result[spec.name] = getattr(self, spec.name)
        return result

    @property
    def is_broadcasted(self) -> bool:
        """Whether this condition has already been broadcast."""
        return self._is_broadcasted

    def broadcast(self, process_group: torch.distributed.ProcessGroup) -> BaseCondition:
        """Broadcast this condition across ``process_group``.

        Args:
            process_group: the process group handed to ``broadcast_condition``.

        Returns:
            ``self`` when already broadcast, otherwise the new instance built by
            ``broadcast_condition``.
        """
        return self if self.is_broadcasted else broadcast_condition(self, process_group)
class AbstractEmbModel(nn.Module):
    """Base class for conditioning embedders.

    Stores four settings (trainability, dropout rate, input key and the
    return-dict flag) behind properties, and provides per-sample dropout of a
    tensor input.
    """

    def __init__(self):
        super().__init__()

        self._is_trainable = None
        self._dropout_rate = None
        self._input_key = None
        self._return_dict = False

    # --- is_trainable ---------------------------------------------------

    @property
    def is_trainable(self) -> bool:
        return self._is_trainable

    @is_trainable.setter
    def is_trainable(self, value: bool):
        self._is_trainable = value

    @is_trainable.deleter
    def is_trainable(self):
        del self._is_trainable

    # --- dropout_rate ---------------------------------------------------

    @property
    def dropout_rate(self) -> Union[float, torch.Tensor]:
        return self._dropout_rate

    @dropout_rate.setter
    def dropout_rate(self, value: Union[float, torch.Tensor]):
        self._dropout_rate = value

    @dropout_rate.deleter
    def dropout_rate(self):
        del self._dropout_rate

    # --- input_key ------------------------------------------------------

    @property
    def input_key(self) -> str:
        return self._input_key

    @input_key.setter
    def input_key(self, value: str):
        self._input_key = value

    @input_key.deleter
    def input_key(self):
        del self._input_key

    # --- is_return_dict -------------------------------------------------

    @property
    def is_return_dict(self) -> bool:
        return self._return_dict

    @is_return_dict.setter
    def is_return_dict(self, value: bool):
        self._return_dict = value

    @is_return_dict.deleter
    def is_return_dict(self):
        del self._return_dict

    def random_dropout_input(
        self, in_tensor: torch.Tensor, dropout_rate: Optional[float] = None, key: Optional[str] = None
    ) -> torch.Tensor:
        """Zero out whole samples of ``in_tensor`` with probability ``dropout_rate``.

        Args:
            in_tensor: tensor whose first axis indexes samples.
            dropout_rate: per-sample drop probability; falls back to
                ``self.dropout_rate`` when ``None``.
            key: unused.
        """
        del key
        rate = self.dropout_rate if dropout_rate is None else dropout_rate
        keep_probability = (1.0 - rate) * torch.ones(in_tensor.shape[0])
        keep_mask = torch.bernoulli(keep_probability).type_as(in_tensor)
        return batch_mul(keep_mask, in_tensor)

    def details(self) -> str:
        """Extra text appended to ``summary()``; empty in the base class."""
        return ""

    def summary(self) -> str:
        """Return a multi-line description: class name, input key, parameter count, flags."""
        class_name = self.__class__.__name__
        input_key = self.input_key if self.input_key is not None else getattr(self, "input_keys", None)
        param_count = count_params(self, False)
        trainable = self.is_trainable
        rate = self.dropout_rate
        extra = self.details()
        return (
            f"{class_name} \n\tinput key: {input_key}"
            f"\n\tParam count: {param_count} \n\tTrainable: {trainable}"
            f"\n\tDropout rate: {rate}"
            f"\n\t{extra}"
        )
class GeneralConditioner(nn.Module, ABC):
    """Abstract container that runs a set of named embedders over a data batch.

    Each keyword argument of the constructor is an embedder config; the
    instantiated embedders are kept in ``self.embedders``. Subclasses implement
    ``forward`` to wrap the raw output dictionary in their condition type.

    Attributes:
        KEY2DIM (dict): output key -> axis along which outputs sharing that key
            are concatenated (keys not listed use the last axis).
        embedders (nn.ModuleDict): the instantiated embedders, keyed by name.
    """

    KEY2DIM = {"crossattn_emb": 1}

    def __init__(self, **emb_models: Union[List, Any]):
        super().__init__()
        self.embedders = nn.ModuleDict()
        for index, (emb_name, emb_config) in enumerate(emb_models.items()):
            embedder = instantiate(emb_config)
            embedder.is_trainable = getattr(emb_config, "is_trainable", True)
            embedder.dropout_rate = getattr(emb_config, "dropout_rate", 0.0)
            if not embedder.is_trainable:
                # Frozen embedder: pin train() to the no-op, stop gradients, switch to eval.
                embedder.train = disabled_train
                for weight in embedder.parameters():
                    weight.requires_grad = False
                embedder.eval()

            log.info(f"Initialized embedder #{index}-{emb_name}: \n {embedder.summary()}")
            self.embedders[emb_name] = embedder

    @abstractmethod
    def forward(
        self,
        batch: Dict,
        override_dropout_rate: Optional[Dict[str, float]] = None,
    ) -> Any:
        """Implemented by subclasses to return their condition datatype."""
        raise NotImplementedError

    def _forward(
        self,
        batch: Dict,
        override_dropout_rate: Optional[Dict[str, float]] = None,
    ) -> Dict:
        """Run every embedder on ``batch`` and concatenate outputs that share a key.

        Parameters:
            batch (Dict): the input data batch.
            override_dropout_rate (Optional[Dict[str, float]]): per-embedder-name
                dropout rates that replace the embedders' own rates. Every name
                must exist in ``self.embedders`` (checked with an assertion).

        Returns:
            Dict: output key -> tensor, concatenated along ``KEY2DIM[key]``
            (last axis when the key is not listed).

        Raises:
            KeyError: if an embedder's ``input_key`` is neither a string nor a list.

        Note:
            Concatenation follows embedder order; give each embedder a unique
            output key if the network is sensitive to that order.
        """
        overrides = {} if override_dropout_rate is None else override_dropout_rate
        for name in overrides.keys():
            assert name in self.embedders, f"invalid name found {name}"

        collected = defaultdict(list)
        for emb_name, embedder in self.embedders.items():
            # Frozen embedders are evaluated without gradient tracking.
            grad_context = nullcontext if embedder.is_trainable else torch.no_grad
            with grad_context():
                if isinstance(embedder.input_key, str):
                    dropped = embedder.random_dropout_input(
                        batch[embedder.input_key], overrides.get(emb_name, None)
                    )
                    emb_out = embedder(dropped)
                elif isinstance(embedder.input_key, list):
                    dropped_inputs = [
                        embedder.random_dropout_input(batch.get(k), overrides.get(emb_name, None), k)
                        for k in embedder.input_key
                    ]
                    emb_out = embedder(*dropped_inputs)
                else:
                    raise KeyError(
                        f"Embedder '{embedder.__class__.__name__}' requires an 'input_key' attribute to be defined as either a string or list of strings"
                    )
                for out_key, out_value in emb_out.items():
                    collected[out_key].append(out_value)

        return {
            out_key: torch.cat(parts, dim=self.KEY2DIM.get(out_key, -1)) for out_key, parts in collected.items()
        }

    def get_condition_uncondition(
        self,
        data_batch: Dict,
    ) -> Tuple[Any, Any]:
        """Build the (conditioned, unconditioned) pair from one data batch.

        The conditioned pass overrides every embedder's dropout rate with 0. The
        unconditioned pass uses 1 for embedders whose configured ``dropout_rate``
        exceeds 1e-4 and 0 for the rest, so embedders configured without
        dropout are kept in both outputs.

        Parameters:
            data_batch (Dict): the input data batch passed to ``self(...)``.

        Returns:
            Tuple[Any, Any]: ``(condition, un_condition)``.
        """
        keep_all = dict.fromkeys(self.embedders, 0.0)
        drop_configured = {
            emb_name: (1.0 if embedder.dropout_rate > 1e-4 else 0.0)
            for emb_name, embedder in self.embedders.items()
        }

        condition: Any = self(data_batch, override_dropout_rate=keep_all)
        un_condition: Any = self(data_batch, override_dropout_rate=drop_configured)
        return condition, un_condition
# =============================================================================
# PID (PixelDiT SR) — condition, embedder, and conditioner
# =============================================================================


class LQTensorDrop(AbstractEmbModel):
    """Embedder for LQ tensors (image or latent) with per-sample zero dropout.

    On dropout, a sample is multiplied by zero, so the output keeps the input's
    shape. Dropout can be coupled across embedders: the primary embedder draws
    the keep-mask and stores it on the class (``_shared_lq_keep_mask``); a
    non-primary embedder reuses that mask so both inputs are dropped together.

    Args:
        input_key: key in data_batch (e.g. "LQ_video_or_image" or "LQ_latent").
        output_key: key in condition output (e.g. "lq_video_or_image" or "lq_latent").
        dropout_rate: probability of zeroing out the tensor (for CFG training).
        is_primary: if True, this embedder generates the shared dropout mask.
            If False, it reuses the mask from the primary embedder.
    """

    # Class-level shared mask for coupled dropout. It is shared by every
    # instance; PidConditioner._forward resets it at the start of each pass.
    _shared_lq_keep_mask: Optional[torch.Tensor] = None

    def __init__(
        self,
        input_key: str = "LQ_video_or_image",
        output_key: str = "lq_video_or_image",
        dropout_rate: float = 0.0,
        is_primary: bool = True,
    ):
        super().__init__()
        self._input_key = input_key
        self._dropout_rate = dropout_rate
        self._output_key = output_key
        self._is_primary = is_primary

    def forward(self, element: torch.Tensor) -> Dict[str, torch.Tensor]:
        """Wrap ``element`` in a dictionary under this embedder's output key."""
        return {self._output_key: element}

    def random_dropout_input(
        self, in_tensor: torch.Tensor, dropout_rate: Optional[float] = None, key: Optional[str] = None
    ) -> torch.Tensor:
        """Zero out whole samples of ``in_tensor`` using the (possibly shared) keep-mask.

        Args:
            in_tensor: tensor whose first axis indexes samples; may be ``None``.
            dropout_rate: per-sample drop probability; falls back to
                ``self.dropout_rate`` when ``None``.
            key: unused.

        Returns:
            ``in_tensor`` unchanged when the rate is not positive or the input is
            ``None``; otherwise ``in_tensor`` multiplied by the keep-mask.
        """
        del key
        rate = self.dropout_rate if dropout_rate is None else dropout_rate
        if rate <= 0 or in_tensor is None:
            if self._is_primary:
                # No dropout this pass: make sure the secondary cannot pick up an old mask.
                LQTensorDrop._shared_lq_keep_mask = None
            return in_tensor

        batch_size = in_tensor.shape[0]

        def _draw_keep_mask() -> torch.Tensor:
            return torch.bernoulli((1.0 - rate) * torch.ones(batch_size, device=in_tensor.device))

        if self._is_primary:
            keep_mask = _draw_keep_mask()
            LQTensorDrop._shared_lq_keep_mask = keep_mask
        else:
            keep_mask = LQTensorDrop._shared_lq_keep_mask
            if keep_mask is None:
                # Fallback: the primary has not produced a mask in this pass.
                keep_mask = _draw_keep_mask()

        # Reshape (B,) -> (B, 1, ..., 1) so the mask broadcasts over each sample.
        broadcast_shape = (batch_size,) + (1,) * (in_tensor.dim() - 1)
        return keep_mask.view(broadcast_shape).type_as(in_tensor) * in_tensor

    def details(self) -> str:
        return f"Output key: {self._output_key}, primary: {self._is_primary}"


class PidConditioner(PixelDiTConditioner):
    """Conditioner for PID (PixelDiT SR) models. Returns PidCondition.

    Handles caption strings (CaptionStringDrop) + LQ tensors (LQTensorDrop).
    LQ image and LQ latent share coupled dropout: when one is dropped, both are.

    ``get_condition_uncondition`` is overridden here (the direct parent drops
    every embedder in the unconditioned pass) so that it respects each
    embedder's configured ``dropout_rate``: if caption dropout_rate=0, the
    caption is never dropped in uncondition (only LQ gets dropped for CFG).

    Embedders typically include:
        - caption: CaptionStringDrop (raw string dropout)
        - lq_video_or_image: LQTensorDrop (primary, generates shared mask)
        - lq_latent: LQTensorDrop (secondary, reuses shared mask)
    """

    def _forward(
        self,
        batch: Dict,
        override_dropout_rate: Optional[Dict[str, float]] = None,
    ) -> Dict:
        """Process embedders. Handles both string (caption) and tensor (LQ) outputs.

        The embedder loop itself is the parent's; this override only clears the
        shared LQ keep-mask first so each pass starts without a stale mask.
        """
        LQTensorDrop._shared_lq_keep_mask = None
        return super()._forward(batch, override_dropout_rate)

    def forward(
        self,
        batch: Dict,
        override_dropout_rate: Optional[Dict[str, float]] = None,
    ) -> PidCondition:
        """Run the embedders and wrap their outputs in a ``PidCondition``."""
        return PidCondition(**self._forward(batch, override_dropout_rate))

    def get_condition_uncondition(self, data_batch: Dict) -> Tuple[PidCondition, PidCondition]:
        """Returns (condition, uncondition) pair for CFG inference.

        Respects per-embedder dropout_rate: embedders with dropout_rate=0 in config
        are NOT dropped in uncondition (e.g. caption with dropout_rate=0 stays).
        """
        cond_dropout_rates = dict.fromkeys(self.embedders, 0.0)
        uncond_dropout_rates = {
            emb_name: (1.0 if embedder.dropout_rate > 1e-4 else 0.0)
            for emb_name, embedder in self.embedders.items()
        }

        condition = self(data_batch, override_dropout_rate=cond_dropout_rates)
        uncondition = self(data_batch, override_dropout_rate=uncond_dropout_rates)
        return condition, uncondition
+ + Content branch projects to dim instead of 1, so the gate is independent per + (token, channel) instead of shared across channels. Sigma branch stays scalar + per sample and broadcasts (B, 1, 1) → (B, N, D). + + Init: content_proj.bias=2.0, log_alpha=log(5) → + gate ≈ sigmoid(2.0 - 5*sigma): ~0.88 at sigma=0, ~0.5 at sigma=0.4, ~0.05 at sigma=1. + Requires sigma to always be provided (asserts at forward time). + """ + + def __init__(self, dim: int): + super().__init__() + self.content_proj = nn.Linear(dim * 2, dim) + nn.init.trunc_normal_(self.content_proj.weight, std=0.01) + nn.init.constant_(self.content_proj.bias, 2.0) + self.log_alpha = nn.Parameter(torch.tensor(math.log(5.0))) + + def compute_gate_scalar( + self, x: torch.Tensor, lq: torch.Tensor, sigma: Optional[torch.Tensor] = None + ) -> torch.Tensor: + assert sigma is not None, "SigmaAwareGatePerTokenPerDim requires sigma input" + content_logit = self.content_proj(torch.cat([x, lq], dim=-1)) # (B, N, D) + sigma_offset = -self.log_alpha.exp() * sigma.float().view(-1, 1, 1) # (B, 1, 1) + return torch.sigmoid(content_logit + sigma_offset) # (B, N, D) + + def forward(self, x: torch.Tensor, lq: torch.Tensor, sigma: Optional[torch.Tensor] = None) -> torch.Tensor: + return x + self.compute_gate_scalar(x, lq, sigma) * lq + + +_SUPPORTED_GATE_TYPE = "sigma_aware_per_token_per_dim" + + +def _build_gate(gate_type: str, dim: int, zero_init: bool = True) -> nn.Module: + # zero_init is intentionally not forwarded: redundant with zero-init output_heads. + if gate_type != _SUPPORTED_GATE_TYPE: + raise ValueError(f"Unknown gate_type: {gate_type!r}. Only {_SUPPORTED_GATE_TYPE!r} is supported.") + return SigmaAwareGatePerTokenPerDim(dim) + + +# --------------------------------------------------------------------------- +# Pre-activation residual block (used by image / latent encoders below). 
# ---------------------------------------------------------------------------


class ResBlock(nn.Module):
    """Pre-activation residual block: two (GroupNorm → SiLU → 3x3 Conv) stages plus a skip connection."""

    def __init__(self, channels: int, num_groups: int = 4):
        super().__init__()
        stages = []
        for _ in range(2):
            stages.extend(
                [
                    nn.GroupNorm(num_groups, channels),
                    nn.SiLU(),
                    nn.Conv2d(channels, channels, kernel_size=3, padding=1),
                ]
            )
        self.block = nn.Sequential(*stages)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        residual = self.block(x)
        return x + residual
16 for Wan VAE, 0 to disable). + hidden_dim: internal feature dimension for conv processing. + out_dim: output dimension (must match transformer hidden_size). + patch_size: spatial patch size of the transformer (e.g. 16). + sr_scale: super-resolution scale factor (LQ is sr_scale times smaller). + latent_spatial_down_factor: VAE spatial downscale factor (default 8). + num_res_blocks: number of ResBlocks after initial conv projection in each branch. + 0 = no ResBlocks (original shallow design). + 4 = recommended for stronger feature extraction (~4x deeper). + num_outputs: number of output feature sets — one per transformer block + for controlnet injection. + gate_type: must be "sigma_aware_per_token_per_dim" (sigma-conditioned per-token per-dim gate). + interval: inject every N blocks (only relevant when num_outputs > 1). + zero_init: if True, zero-init all output projections for safe pretrained start. + pit_output: if True, add a dedicated output head for PiT block injection. + The PiT head output is appended as the last element of forward() output. 
+ """ + + def __init__( + self, + in_channels: int = 3, + latent_channels: int = 0, + hidden_dim: int = 512, + out_dim: int = 1536, + patch_size: int = 16, + sr_scale: int = 4, + latent_spatial_down_factor: int = 8, + num_res_blocks: int = 4, + num_outputs: int = 1, + gate_type: str = _SUPPORTED_GATE_TYPE, + interval: int = 1, + zero_init: bool = True, + pit_output: bool = False, + ): + super().__init__() + assert in_channels > 0 or latent_channels > 0, "At least one of in_channels or latent_channels must be > 0" + + self.in_channels = in_channels + self.latent_channels = latent_channels + self.hidden_dim = hidden_dim + self.out_dim = out_dim + self.patch_size = patch_size + self.sr_scale = sr_scale + self.latent_spatial_down_factor = latent_spatial_down_factor + self.num_outputs = num_outputs + self.interval = interval + self.zero_init = zero_init + self.pit_output = pit_output + + # --- Image branch --- + # PixelUnshuffle → Conv proj → ResBlocks for deep feature extraction + if in_channels > 0: + assert patch_size >= sr_scale and patch_size % sr_scale == 0, ( + f"patch_size ({patch_size}) must be >= sr_scale ({sr_scale}) and divisible" + ) + self.image_unshuffle_factor = patch_size // sr_scale + unshuffle_ch = in_channels * self.image_unshuffle_factor**2 + layers = [ + nn.Conv2d(unshuffle_ch, hidden_dim, kernel_size=3, stride=1, padding=1), + nn.SiLU(), + nn.Conv2d(hidden_dim, hidden_dim, kernel_size=3, stride=1, padding=1), + ] + for _ in range(num_res_blocks): + layers.append(ResBlock(hidden_dim)) + self.image_conv = nn.Sequential(*layers) + else: + self.image_conv = None + self.image_unshuffle_factor = 0 + + # --- Latent branch --- + # Spatial alignment (fold / upsample) → Conv proj → ResBlocks + if latent_channels > 0: + z_to_patch_ratio = (sr_scale * latent_spatial_down_factor) / patch_size + self.z_to_patch_ratio = z_to_patch_ratio + + if z_to_patch_ratio > 1: + # Latent is lower res than patch grid → nearest upsample (no learnable params). 
+ # LearnedLatentUpsampler (PixelShuffle) caused DDP numerical issues on multi-node. + self.latent_upsampler = None + self.latent_upsample_ratio = int(z_to_patch_ratio) + latent_proj_in_ch = latent_channels + elif z_to_patch_ratio == 1: + self.latent_upsampler = None + latent_proj_in_ch = latent_channels + else: + fold_factor = int(1 / z_to_patch_ratio) + assert fold_factor * z_to_patch_ratio == 1.0, ( + f"fold_factor {fold_factor} * z_to_patch_ratio {z_to_patch_ratio} != 1" + ) + self.latent_upsampler = None + self.latent_fold_factor = fold_factor + latent_proj_in_ch = latent_channels * fold_factor**2 + + layers = [ + nn.Conv2d(latent_proj_in_ch, hidden_dim, kernel_size=3, stride=1, padding=1), + nn.SiLU(), + nn.Conv2d(hidden_dim, hidden_dim, kernel_size=3, stride=1, padding=1), + ] + for _ in range(num_res_blocks): + layers.append(ResBlock(hidden_dim)) + self.latent_proj = nn.Sequential(*layers) + else: + self.latent_proj = None + self.z_to_patch_ratio = 0 + self.latent_upsampler = None + + # --- Merge + shared ResBlocks (if both branches active) --- + if in_channels > 0 and latent_channels > 0: + layers = [nn.Conv2d(hidden_dim * 2, hidden_dim, kernel_size=1), nn.SiLU()] + for _ in range(num_res_blocks): + layers.append(ResBlock(hidden_dim)) + self.merge = nn.Sequential(*layers) + else: + self.merge = None + + # --- Output heads --- + self.output_heads = nn.ModuleList([nn.Linear(hidden_dim, out_dim) for _ in range(num_outputs)]) + + # --- Dedicated PiT output head (separate from DiT heads) --- + if pit_output: + self.pit_head = nn.Linear(hidden_dim, out_dim) + else: + self.pit_head = None + + # --- Gate modules (one per injection point, for controlnet-style injection) --- + # Using a ModuleList instead of a single shared module allows each block to learn + # independent gating behaviour (different content_proj weights and log_alpha). 
+ self.gate_modules = nn.ModuleList( + [_build_gate(gate_type, out_dim, zero_init=zero_init) for _ in range(num_outputs)] + ) + + def init_weights(self): + """Initialize weights. Zero-init output heads when zero_init=True. + + Conv layers use truncated normal (std=0.02) instead of kaiming_normal_ + to keep intermediate activations small under bfloat16 autocast. + With zero-init output heads the forward output is zero regardless of + conv init scale, but large conv activations cause grad overflow in + bfloat16 backward (output_head.weight.grad ∝ conv_features). + """ + for module in self.modules(): + if isinstance(module, nn.Conv2d): + nn.init.trunc_normal_(module.weight, std=0.02) + if module.bias is not None: + nn.init.zeros_(module.bias) + + for head in self.output_heads: + if self.zero_init: + nn.init.zeros_(head.weight) + if head.bias is not None: + nn.init.zeros_(head.bias) + else: + # Small init so LQ signal is present from the start but doesn't + # overwhelm the pretrained base model. 
+ nn.init.trunc_normal_(head.weight, std=0.02) + if head.bias is not None: + nn.init.zeros_(head.bias) + + # PiT head follows same init strategy + if self.pit_head is not None: + if self.zero_init: + nn.init.zeros_(self.pit_head.weight) + if self.pit_head.bias is not None: + nn.init.zeros_(self.pit_head.bias) + else: + nn.init.trunc_normal_(self.pit_head.weight, std=0.02) + if self.pit_head.bias is not None: + nn.init.zeros_(self.pit_head.bias) + + def is_gate_active(self, block_idx: int) -> bool: + """Whether gate() should be called for this block index.""" + if self.interval > 1: + return block_idx % self.interval == 0 + return True + + def _get_output_index(self, block_idx: int) -> int: + """Map block_idx to output head index, respecting interval.""" + if self.interval > 1: + return block_idx // self.interval + return block_idx + + def gate( + self, x: torch.Tensor, lq: torch.Tensor, sigma: Optional[torch.Tensor] = None, out_idx: int = 0 + ) -> torch.Tensor: + """Apply gating: inject lq features into transformer hidden state x.""" + return self.gate_modules[out_idx](x, lq, sigma=sigma) + + def _align_image_to_patch_grid( + self, lq_video_or_image: torch.Tensor, target_pH: int, target_pW: int + ) -> torch.Tensor: + """Align LQ image to patch grid via PixelUnshuffle. + + [B, C, H_lq, W_lq] → pad if needed → PixelUnshuffle(factor) → [B, C*f*f, pH, pW] + Then conv to [B, hidden_dim, pH, pW]. + + Multi-AR images may have H_lq not divisible by unshuffle_factor. We pad to + target_pH * f, target_pW * f to ensure exact alignment with the patch grid. 
+ """ + f = self.image_unshuffle_factor + B, C, H_lq, W_lq = lq_video_or_image.shape + target_H_lq = target_pH * f + target_W_lq = target_pW * f + + # Pad or crop to exact target size if needed (multi-AR may not align perfectly) + if H_lq != target_H_lq or W_lq != target_W_lq: + lq_video_or_image = F.interpolate( + lq_video_or_image, size=(target_H_lq, target_W_lq), mode="bilinear", align_corners=False + ) + + x = F.pixel_unshuffle(lq_video_or_image, f) # [B, C*f*f, target_pH, target_pW] + return self.image_conv(x) # [B, hidden_dim, target_pH, target_pW] + + def _align_latent_to_patch_grid(self, lq_latent: torch.Tensor, pH: int, pW: int) -> torch.Tensor: + """Align LQ latent to patch grid via nearest interpolate or fold. + + Returns [B, hidden_dim, pH, pW]. + """ + B, z_dim = lq_latent.shape[:2] + + if self.z_to_patch_ratio > 1: + # Upsample: latent is lower res than patch grid → nearest interpolate + z_aligned = F.interpolate(lq_latent, size=(pH, pW), mode="nearest") + elif self.z_to_patch_ratio == 1: + z_aligned = lq_latent + if z_aligned.shape[2] != pH or z_aligned.shape[3] != pW: + z_aligned = F.interpolate(z_aligned, size=(pH, pW), mode="nearest", align_corners=False) + else: + # Fold: latent is higher res than patch grid + f = self.latent_fold_factor + # Ensure latent spatial matches expected fold size + zH_expected, zW_expected = pH * f, pW * f + if lq_latent.shape[2] != zH_expected or lq_latent.shape[3] != zW_expected: + lq_latent = F.interpolate( + lq_latent, size=(zH_expected, zW_expected), mode="nearest", align_corners=False + ) + z_aligned = lq_latent.reshape(B, z_dim, pH, f, pW, f) + z_aligned = z_aligned.permute(0, 1, 3, 5, 2, 4) + z_aligned = z_aligned.reshape(B, z_dim * f * f, pH, pW) + + return self.latent_proj(z_aligned) # [B, hidden_dim, pH, pW] + + def forward( + self, + lq_video_or_image: Optional[torch.Tensor] = None, + lq_latent: Optional[torch.Tensor] = None, + target_pH: int = 0, + target_pW: int = 0, + ) -> List[torch.Tensor]: + """Project 
LQ inputs to patch-aligned token features. + + Args: + lq_video_or_image: [B, C, H_lq, W_lq] LQ image at original low resolution. Can be None. + lq_latent: [B, z_dim, zH, zW] LQ VAE latent. Can be None. + target_pH: target patch grid height (H_hq / patch_size). + target_pW: target patch grid width (W_hq / patch_size). + + Returns: + List of [B, N, out_dim] tensors where N = target_pH * target_pW. + Length = num_outputs (+ 1 if pit_output=True). + """ + assert target_pH > 0 and target_pW > 0, "Must provide target_pH and target_pW" + features = [] + + # Image branch: PixelUnshuffle → Conv + if self.image_conv is not None and lq_video_or_image is not None: + features.append(self._align_image_to_patch_grid(lq_video_or_image, target_pH, target_pW)) + + # Latent branch: Fold/Upsample → Conv + if self.latent_proj is not None and lq_latent is not None: + features.append(self._align_latent_to_patch_grid(lq_latent, target_pH, target_pW)) + + # Merge or select single branch + if len(features) == 2 and self.merge is not None: + merged = self.merge(torch.cat(features, dim=1)) # [B, hidden_dim, pH, pW] + elif len(features) == 1: + merged = features[0] + else: + # Both inputs are None — return zero features + ref = lq_video_or_image if lq_video_or_image is not None else lq_latent + B, device, dtype = ref.shape[0], ref.device, ref.dtype + N = target_pH * target_pW + num_total = self.num_outputs + (1 if self.pit_output else 0) + return [torch.zeros(B, N, self.out_dim, device=device, dtype=dtype) for _ in range(num_total)] + + # Flatten to tokens: [B, hidden_dim, pH, pW] -> [B, N, hidden_dim] + tokens = merged.flatten(2).transpose(1, 2) + + # Project through output heads + outputs = [head(tokens) for head in self.output_heads] + + # Append dedicated PiT head output as last element + if self.pit_head is not None: + outputs.append(self.pit_head(tokens)) + + return outputs diff --git a/invokeai/backend/pid/_src/networks/pid_net.py b/invokeai/backend/pid/_src/networks/pid_net.py new 
class PidNet(PixDiT_T2I):
    """PixDiT T2I with LQ condition injection for super-resolution.

    Inherits all PixDiT_T2I functionality (MMDiT patch blocks, PiT pixel blocks,
    text conditioning, RoPE, encoder-decoder compression, REPA). Adds LQ projection
    module and controlnet-style gated injection logic.

    Args (in addition to PixDiT_T2I args):
        lq_inject_mode: kept as a parameter for config compatibility — only
            "controlnet" is supported in this inference subset.
        lq_in_channels: LQ image channels (3 for RGB, 0 to disable image branch).
        lq_latent_channels: LQ latent channels (e.g. 16 for Wan VAE, 0 to disable).
        lq_hidden_dim: internal projection hidden dimension.
        lq_num_res_blocks: number of ResBlocks per branch for deeper feature extraction.
        lq_gate_type: "sigma_aware_per_token_per_dim" only.
        lq_interval: inject every N blocks.
        zero_init_lq: zero-init all LQ projections for safe pretrained start.
        train_lq_proj_only: freeze base T2I, train only LQ projection modules.
        sr_scale: super-resolution scale factor (default 4).
        latent_spatial_down_factor: VAE spatial downscale factor (default 8).
    """

    def __init__(
        self,
        # --- PixDiT_T2I base args ---
        in_channels=3,
        num_groups=16,
        hidden_size=1152,
        pixel_hidden_size=64,
        pixel_attn_hidden_size=None,
        pixel_num_groups=None,
        patch_depth=26,
        pixel_depth=2,
        num_text_blocks=4,
        patch_size=16,
        txt_embed_dim=4096,
        txt_max_length=1024,
        use_text_rope: bool = True,
        text_rope_theta: float = 10000.0,
        rope_mode: str = "ntk_aware",
        rope_ref_h: int = 1024,
        rope_ref_w: int = 1024,
        repa_encoder_index: int = -1,
        enable_ed: bool = False,
        ed_compress_ratio: int = 1,
        ed_depth_per_stage: int = 1,
        ed_window_size: int = 2,
        ed_num_heads: Optional[int] = None,
        ed_hidden_size: Optional[int] = None,
        ed_use_token_shuffle: bool = True,
        # --- SR-specific args ---
        lq_inject_mode: str = "controlnet",
        lq_in_channels: int = 3,
        lq_latent_channels: int = 0,
        lq_hidden_dim: int = 512,
        lq_num_res_blocks: int = 4,
        lq_gate_type: str = "sigma_aware_per_token_per_dim",
        lq_interval: int = 1,
        zero_init_lq: bool = True,
        train_lq_proj_only: bool = False,
        sr_scale: int = 4,
        latent_spatial_down_factor: int = 8,
        # --- PiT LQ injection args ---
        # Inject LQ features into PiT pixel blocks via a dedicated output head
        # from the same LQ projection CNN backbone. Added to s_cond before PiT loop.
        pit_lq_inject: bool = False,
        pit_lq_gate_type: str = "sigma_aware_per_token_per_dim",
    ):
        """Build the base PixDiT_T2I, then attach the LQ projection and optional PiT gate.

        All base arguments are forwarded unchanged to ``PixDiT_T2I.__init__``.
        Raises AssertionError if ``lq_inject_mode`` is not "controlnet".
        """
        super().__init__(
            in_channels=in_channels,
            num_groups=num_groups,
            hidden_size=hidden_size,
            pixel_hidden_size=pixel_hidden_size,
            pixel_attn_hidden_size=pixel_attn_hidden_size,
            pixel_num_groups=pixel_num_groups,
            patch_depth=patch_depth,
            pixel_depth=pixel_depth,
            num_text_blocks=num_text_blocks,
            patch_size=patch_size,
            txt_embed_dim=txt_embed_dim,
            txt_max_length=txt_max_length,
            use_text_rope=use_text_rope,
            text_rope_theta=text_rope_theta,
            rope_mode=rope_mode,
            rope_ref_h=rope_ref_h,
            rope_ref_w=rope_ref_w,
            repa_encoder_index=repa_encoder_index,
            enable_ed=enable_ed,
            ed_compress_ratio=ed_compress_ratio,
            ed_depth_per_stage=ed_depth_per_stage,
            ed_window_size=ed_window_size,
            ed_num_heads=ed_num_heads,
            ed_hidden_size=ed_hidden_size,
            ed_use_token_shuffle=ed_use_token_shuffle,
        )

        assert lq_inject_mode == "controlnet", (
            f"Only lq_inject_mode='controlnet' is supported in this inference subset, got '{lq_inject_mode}'"
        )
        self.lq_inject_mode = lq_inject_mode
        self.sr_scale = sr_scale
        self.train_lq_proj_only = train_lq_proj_only

        # One LQ output head per injected block: ceil(patch_depth / lq_interval).
        num_lq_outputs = (patch_depth + lq_interval - 1) // lq_interval

        self.pit_lq_inject = pit_lq_inject

        self.lq_proj = LQProjection2D(
            in_channels=lq_in_channels,
            latent_channels=lq_latent_channels,
            hidden_dim=lq_hidden_dim,
            out_dim=hidden_size,
            patch_size=patch_size,
            sr_scale=sr_scale,
            latent_spatial_down_factor=latent_spatial_down_factor,
            num_res_blocks=lq_num_res_blocks,
            num_outputs=num_lq_outputs,
            gate_type=lq_gate_type,
            interval=lq_interval,
            zero_init=zero_init_lq,
            pit_output=pit_lq_inject,
        )

        # PiT LQ gate (applied to s_cond before pixel blocks)
        if pit_lq_inject:
            from invokeai.backend.pid._src.networks.lq_projection_2d import _build_gate

            self.pit_lq_gate = _build_gate(pit_lq_gate_type, hidden_size, zero_init=zero_init_lq)
        else:
            self.pit_lq_gate = None

        # Freeze everything, then re-enable gradients only for the LQ modules.
        if train_lq_proj_only:
            for p in self.parameters():
                p.requires_grad_(False)
            for p in self.lq_proj.parameters():
                p.requires_grad_(True)
            if self.pit_lq_gate is not None and hasattr(self.pit_lq_gate, "parameters"):
                for p in self.pit_lq_gate.parameters():
                    p.requires_grad_(True)

    def init_weights(self):
        """Initialize LQ projection."""
        # Only `lq_proj` is (re)initialized here; `pit_lq_gate` is not touched by this method.
        self.lq_proj.init_weights()
        log.info("LQ projection init_weights complete")

    def _compute_lq_features(self, lq_video_or_image, lq_latent, lq_mask, Hs, Ws):
        """Run the LQ projection and return its list of per-head token features.

        Args:
            lq_video_or_image: LQ image tensor or None (forwarded to ``lq_proj``).
            lq_latent: LQ latent tensor or None (forwarded to ``lq_proj``).
            lq_mask: optional tensor multiplied into every feature after being
                viewed as [-1, 1, 1] (presumably one scalar per batch sample — confirm).
            Hs, Ws: patch-grid height / width passed as the projection target size.

        Returns:
            List of feature tensors; each is split along dim 1 across the
            context-parallel group when ``self._cp_group`` is set.
        """
        lq_features = self.lq_proj(
            lq_video_or_image=lq_video_or_image,
            lq_latent=lq_latent,
            target_pH=Hs,
            target_pW=Ws,
        )
        if lq_mask is not None:
            lq_features = [f * lq_mask.view(-1, 1, 1) for f in lq_features]
        # Under CP, lq_features are produced at full L (LQ inputs are replicated
        # across CP ranks). Split each along the token axis so they line up with
        # the rank-local image stream the patch blocks consume.
        if self._cp_group is not None:
            lq_features = [split_inputs_cp(f, seq_dim=1, cp_group=self._cp_group) for f in lq_features]
        return lq_features

    def _run_patch_blocks(
        self,
        s_main,
        y_emb,
        condition,
        pos,
        pos_txt,
        attn_mask_joint,
        lq_features,
        degrade_sigma=None,
        feature_indices=None,
    ):
        """Run patch_blocks loop with controlnet-style LQ injection.

        Args:
            feature_indices: Optional set of block indices whose output features should be
                collected and returned (for GAN discriminator). None = no collection.

        Returns:
            (s_main, y_emb, collected_features) where collected_features is a list of
            [B, L, D] tensors (one per index in feature_indices), or None if not requested.
        """
        has_lq = lq_features is not None

        collected_features = [] if feature_indices is not None else None

        for i in range(self.patch_depth):
            # Gate LQ features into the image stream *before* block i runs.
            if has_lq and self.lq_proj.is_gate_active(i):
                out_idx = self.lq_proj._get_output_index(i)
                if out_idx < len(lq_features):
                    s_main = self.lq_proj.gate(s_main, lq_features[out_idx], sigma=degrade_sigma, out_idx=out_idx)

            s_main, y_emb = self.patch_blocks[i](
                s_main,
                y_emb,
                condition,
                pos,
                pos_txt,
                attn_mask_joint,
            )

            # Collect intermediate features for GAN discriminator
            if feature_indices is not None and i in feature_indices:
                collected_features.append(s_main.clone())

            # Chained comparison: true only when repa_encoder_index is positive AND equals i + 1.
            if 0 < self.repa_encoder_index == (i + 1):
                self.last_repa_tokens = s_main

        return s_main, y_emb, collected_features

    def _unpatchify_features(self, features: list, Hs: int, Ws: int) -> list:
        """Reshape patch token features [B, L, D] → [B, D, Hs, Ws] for discriminator.

        PixDiT tokens are 1-to-1 with spatial patches (no sub-patch splitting in the
        token dimension), so we just reshape to a 2D spatial feature map.
        Compatible with Discriminator_ImageDiT which uses Conv2D heads.

        Under CP, collected features are rank-local [B, L_local, D]. We gather
        them along the token axis here so the discriminator (which has no CP
        plumbing) sees the full feature map.

        Args:
            features: List of [B, L_local_or_full, D] token tensors.
            Hs, Ws: Spatial patch grid dimensions (full).

        Returns:
            List of [B, D, Hs, Ws] tensors.
        """
        result = []
        for feat in features:
            if self._cp_group is not None:
                feat = cat_outputs_cp_with_grad(feat.contiguous(), seq_dim=1, cp_group=self._cp_group)
            B, _L, D = feat.shape
            result.append(feat.view(B, Hs, Ws, D).permute(0, 3, 1, 2))  # [B, D, Hs, Ws]
        return result

    def forward(
        self,
        x,
        t,
        y,
        s=None,
        mask=None,
        lq_video_or_image=None,
        lq_latent=None,
        lq_mask=None,
        degrade_sigma=None,
        # --- Feature extraction for GAN discriminator ---
        feature_indices=None,
        return_features_early: bool = False,
    ):
        """Denoise ``x`` conditioned on text and optional LQ inputs.

        Args:
            x: image tensor unpacked as [B, _, H, W]; H and W are floor-divided by
                ``patch_size`` to get the patch grid.
            t: timestep tensor, flattened before being embedded.
            y: text embedding, must be 3-D ([B, L, D]); truncated to ``txt_max_length`` tokens.
            s: optional precomputed patch-level conditioning tokens. When given, the
                patch-block stack (and LQ gating inside it) is skipped entirely.
            mask: optional tensor; singleton dim-1 axes are squeezed and, if the result
                is 2-D, entries equal to 0 are marked as padded text positions.
                It is also forwarded unchanged to every pixel block.
            lq_video_or_image, lq_latent: optional LQ conditioning inputs; LQ features
                are computed when at least one of them is not None.
            lq_mask: optional multiplier applied to the LQ features.
            degrade_sigma: optional tensor forwarded to the LQ gates as ``sigma``
                (presumably one value per batch sample — confirm against caller).
            feature_indices: block indices whose outputs are collected.
            return_features_early: if True (and features were collected), return the
                unpatchified features immediately and skip the pixel pathway.

        Returns:
            * the folded output image of spatial size (H, W), or
            * a list of [B, D, Hs, Ws] features (early exit), or
            * ``(output, features)`` when ``feature_indices`` is set and features
              were collected without early exit.

        Raises:
            ValueError: if ``y`` is not 3-D.
        """
        B, _, H, W = x.shape
        Hs = H // self.patch_size
        Ws = W // self.patch_size
        L = Hs * Ws

        # Context-parallel local sequence length. When CP is enabled, every rank
        # sees the same full inputs (x, y, t, lq_*) — we patchify on full size,
        # then immediately split tokens along L so the heavy transformer/pixel
        # blocks operate on L_local = L / cp_size each.
        cp_group = self._cp_group
        cp_size = cp_group.size() if cp_group is not None else 1
        if cp_size > 1:
            assert L % cp_size == 0, f"L={L} not divisible by cp_size={cp_size}"
        L_local = L // cp_size

        # Compute LQ features (split along L internally when CP is active).
        has_lq = lq_video_or_image is not None or lq_latent is not None
        lq_features = self._compute_lq_features(lq_video_or_image, lq_latent, lq_mask, Hs, Ws) if has_lq else None

        collected_features = None  # populated by _run_patch_blocks when feature_indices is set

        # Patch tokens — full unfolding on every rank (cheap; identical across ranks).
        pos = self.fetch_pos(Hs, Ws, x.device)  # full pos; the CP-aware attention slices for q internally
        x_patches = torch.nn.functional.unfold(x, kernel_size=self.patch_size, stride=self.patch_size).transpose(1, 2)

        t_emb = self.t_embedder(t.view(-1)).view(B, -1, self.hidden_size)

        # Text tokens (replicated across CP ranks; not split).
        if y.dim() != 3:
            raise ValueError("Text embedding y must be [B, L, D]")
        Ltxt = min(y.shape[1], self.txt_max_length)
        y = y[:, :Ltxt, :]
        y_emb = self.y_embedder(y).view(B, Ltxt, self.hidden_size)
        y_emb = y_emb + self.y_pos_embedding[:, :Ltxt, :].to(y_emb.dtype)

        # Condition signal: silu(t_emb), [B, 1, D]
        condition = torch.nn.functional.silu(t_emb)

        # Mask
        pad = None
        pos_txt = self.fetch_pos_text(Ltxt, x.device) if self.use_text_rope else None
        if mask is not None and isinstance(mask, torch.Tensor):
            m = mask
            while m.dim() > 2 and m.size(1) == 1:
                m = m.squeeze(1)
            # NOTE(review): after the loop above this condition can no longer be true
            # (the loop only exits once dim <= 2 or size(1) != 1), so this branch is a no-op.
            if m.dim() == 3 and m.size(1) == 1:
                m = m.squeeze(1)
            if m.dim() == 2:
                pad = m == 0

        if s is None:
            s0 = self.s_embedder(x_patches)
            # Split image patch tokens across the CP group along the sequence axis.
            # Everything downstream (lq injection, patch_blocks, pixel pathway)
            # operates on the rank-local slice until the final fold gather.
            if cp_group is not None:
                s0 = split_inputs_cp(s0, seq_dim=1, cp_group=cp_group)
            self.last_repa_tokens = None

            if self.use_ed and self.encoder_ed is not None and self.decoder_ed is not None:
                # Encoder-decoder path (CP not supported here; PixDiT_T2I.enable_context_parallel asserts)
                H_tokens, W_tokens = Hs, Ws
                s_ed = s0 if self.s_ed_proj_in is None else self.s_ed_proj_in(s0)
                if self.s_ed_in_norm is not None:
                    s_ed = self.s_ed_in_norm(s_ed)
                c_ed = condition if self.s_ed_cond_proj is None else self.s_ed_cond_proj(condition)
                bottleneck, skip_tokens, Hb, Wb = self.encoder_ed(s_ed, H_tokens, W_tokens, c_ed)
                pos_b = self.fetch_pos(Hb, Wb, x.device)
                s_main = bottleneck if self.s_ed_proj_out is None else self.s_ed_proj_out(bottleneck)
                if self.s_ed_out_norm is not None:
                    s_main = self.s_ed_out_norm(s_main)
                s_main = torch.nn.functional.silu(t_emb + s_main)

                attn_mask_joint = None
                if pad is not None:
                    # Image tokens are never padded; text padding is right-extended with True if short.
                    L_img_curr = s_main.shape[1]
                    pad_img = torch.zeros((B, L_img_curr), dtype=torch.bool, device=x.device)
                    pad_txt = (
                        pad[:, :Ltxt]
                        if pad.size(1) >= Ltxt
                        else torch.nn.functional.pad(pad, (0, Ltxt - pad.size(1)), value=True)
                    )
                    attn_mask_joint = torch.cat([pad_txt, pad_img], dim=1).view(B, 1, 1, Ltxt + L_img_curr)

                s_main, y_emb, collected_features = self._run_patch_blocks(
                    s_main,
                    y_emb,
                    condition,
                    pos_b,
                    pos_txt,
                    attn_mask_joint,
                    lq_features,
                    degrade_sigma=degrade_sigma,
                    feature_indices=feature_indices,
                )

                s_bottleneck2 = s_main if self.s_ed_proj_in is None else self.s_ed_proj_in(s_main)
                if self.s_ed_in_norm is not None:
                    s_bottleneck2 = self.s_ed_in_norm(s_bottleneck2)
                decoded, _, _ = self.decoder_ed(s_bottleneck2, Hb, Wb, skip_tokens, c_ed)
                s = decoded if self.s_ed_proj_out is None else self.s_ed_proj_out(decoded)
                if self.s_ed_out_norm is not None:
                    s = self.s_ed_out_norm(s)
                s = torch.nn.functional.silu(t_emb + s)
            else:
                # Standard path (no encoder-decoder).
                s_main = s0
                attn_mask_joint = None
                if pad is not None:
                    # SDPA's K dimension is full image length (CP gathers K/V across
                    # CP ranks inside the joint attention). Use full L for the K-side
                    # mask regardless of CP.
                    pad_img = torch.zeros((B, L), dtype=torch.bool, device=x.device)
                    pad_txt = (
                        pad[:, :Ltxt]
                        if pad.size(1) >= Ltxt
                        else torch.nn.functional.pad(pad, (0, Ltxt - pad.size(1)), value=True)
                    )
                    attn_mask_joint = torch.cat([pad_txt, pad_img], dim=1).view(B, 1, 1, Ltxt + L)

                s_main, y_emb, collected_features = self._run_patch_blocks(
                    s_main,
                    y_emb,
                    condition,
                    pos,
                    pos_txt,
                    attn_mask_joint,
                    lq_features,
                    degrade_sigma=degrade_sigma,
                    feature_indices=feature_indices,
                )

                s = torch.nn.functional.silu(t_emb + s_main)

            # When no valid in-stack REPA index is configured, expose the final patch tokens instead.
            if not (0 < self.repa_encoder_index <= self.patch_depth):
                self.last_repa_tokens = s

        # Early exit for discriminator feature extraction (skip pixel blocks).
        # `_unpatchify_features` handles the CP all-gather along L internally.
        if return_features_early and feature_indices is not None and collected_features:
            return self._unpatchify_features(collected_features, Hs, Ws)

        # Ensure patch token length matches the rank-local grid (L_local under CP,
        # L otherwise). This guard exists for ED/token-shuffle paths where the
        # block stack may emit a different length than the input.
        # NOTE(review): `batch_size` is unpacked but never used below (B is used instead).
        batch_size, length, _ = s.shape
        if length != L_local:
            if length > L_local:
                s = s[:, :L_local, :]
            else:
                pad_len = L_local - length
                s = torch.cat([s, s.new_zeros(B, pad_len, s.shape[2])], dim=1)

        # Pixel pathway with optional PiT LQ injection — operates on rank-local
        # patches under CP. lq_features[-1] was already split along L in
        # `_compute_lq_features`, so its B*L_local view lines up with s.
        s_cond = s.reshape(B * L_local, self.hidden_size)
        if self.pit_lq_inject and lq_features is not None:
            pit_lq = lq_features[-1].reshape(B * L_local, self.hidden_size)
            # Repeat each sigma entry L_local times so it aligns with the flattened B*L_local rows.
            sigma_flat = degrade_sigma.repeat_interleave(L_local) if degrade_sigma is not None else None
            s_cond = self.pit_lq_gate(s_cond, pit_lq, sigma=sigma_flat)

        # Pixel embedder runs on the full image (cheap; identical across CP
        # ranks). Reshape and slice to the rank-local subset of patches so that
        # the per-pixel branch processes exactly L_local patches.
        x_pixels = self.pixel_embedder(x, img_height=H, img_width=W, patch_size=self.patch_size)
        if cp_group is not None:
            P2 = self.patch_size * self.patch_size
            x_pixels = x_pixels.view(B, L, P2, self.pixel_hidden_size)
            x_pixels = split_inputs_cp(x_pixels, seq_dim=1, cp_group=cp_group)
            x_pixels = x_pixels.reshape(B * L_local, P2, self.pixel_hidden_size)
        for blk in self.pixel_blocks:
            x_pixels = blk(x_pixels, s_cond, H, W, self.patch_size, mask)

        x_pixels = self.final_layer(x_pixels)  # [B*L_local, P², C_out]
        C_out = self.out_channels
        P2 = self.patch_size * self.patch_size
        # Rearrange to the [B, C_out * P², L] column layout that `fold` expects.
        x_pixels = x_pixels.view(B, L_local, P2, C_out).permute(0, 3, 2, 1).contiguous()
        x_pixels = x_pixels.view(B, C_out * P2, L_local)
        # Gather pixel patches across CP ranks along L so `fold` reconstructs
        # the full image. `cat_outputs_cp_with_grad` keeps gradients on each
        # rank's local slice.
        if cp_group is not None:
            x_pixels = cat_outputs_cp_with_grad(x_pixels.contiguous(), seq_dim=2, cp_group=cp_group)
        output = torch.nn.functional.fold(x_pixels, (H, W), kernel_size=self.patch_size, stride=self.patch_size)

        # Return (output, features) when feature extraction is enabled (without early exit)
        if feature_indices is not None and collected_features is not None:
            return output, self._unpatchify_features(collected_features, Hs, Ws)
        return output
+# pixdit_core/pixeldit_c2i.py — PatchTokenEmbedder, PixelTokenEmbedder, PiTBlock +# pixdit_core/pixeldit_t2i.py — MMDiT joint attention, encoder-decoder, PixDiT_T2I +# +# Only import statements were changed (everything is now local). Logic is unchanged. + +import math +from typing import Optional, Tuple + +import numpy as np +import torch +import torch.nn as nn +import torch.nn.functional as F +from torch.distributed import ProcessGroup +from torch.nn.functional import scaled_dot_product_attention + +from invokeai.backend.pid._src.utils.context_parallel import cat_outputs_cp_with_grad + +# ============================================================================= +# From pixdit_core/modules.py +# ============================================================================= + + +def get_2d_sincos_pos_embed(embed_dim, grid_size, cls_token=False, extra_tokens=0): + """ + grid_size: int of the grid height and width + return: + pos_embed: [grid_size*grid_size, embed_dim] or [1+grid_size*grid_size, embed_dim] (w/ or w/o cls_token) + """ + grid_h = np.arange(grid_size, dtype=np.float32) + grid_w = np.arange(grid_size, dtype=np.float32) + grid = np.meshgrid(grid_w, grid_h) # here w goes first + grid = np.stack(grid, axis=0) + + grid = grid.reshape([2, 1, grid_size, grid_size]) + pos_embed = get_2d_sincos_pos_embed_from_grid(embed_dim, grid) + if cls_token and extra_tokens > 0: + pos_embed = np.concatenate([np.zeros([extra_tokens, embed_dim]), pos_embed], axis=0) + return pos_embed + + +def get_2d_sincos_pos_embed_from_grid(embed_dim, grid): + assert embed_dim % 2 == 0 + + emb_h = get_1d_sincos_pos_embed_from_grid(embed_dim // 2, grid[0]) # (H*W, D/2) + emb_w = get_1d_sincos_pos_embed_from_grid(embed_dim // 2, grid[1]) # (H*W, D/2) + + emb = np.concatenate([emb_h, emb_w], axis=1) # (H*W, D) + return emb + + +def get_1d_sincos_pos_embed_from_grid(embed_dim, pos): + """ + embed_dim: output dimension for each position + pos: a list of positions to be encoded: size (M,) + 
out: (M, D) + """ + assert embed_dim % 2 == 0 + omega = np.arange(embed_dim // 2, dtype=np.float64) + omega /= embed_dim / 2.0 + omega = 1.0 / 10000**omega # (D/2,) + + pos = pos.reshape(-1) # (M,) + out = np.einsum("m,d->md", pos, omega) # (M, D/2), outer product + + emb_sin = np.sin(out) # (M, D/2) + emb_cos = np.cos(out) # (M, D/2) + + emb = np.concatenate([emb_sin, emb_cos], axis=1) # (M, D) + return emb + + +def apply_adaln(x, shift, scale): + return x * (1 + scale) + shift + + +class TimestepConditioner(nn.Module): + def __init__(self, hidden_size, frequency_embedding_size=256): + super().__init__() + self.mlp = nn.Sequential( + nn.Linear(frequency_embedding_size, hidden_size, bias=True), + nn.SiLU(), + nn.Linear(hidden_size, hidden_size, bias=True), + ) + self.frequency_embedding_size = frequency_embedding_size + + @staticmethod + def timestep_embedding(t, dim, max_period=10): + half = dim // 2 + freqs = torch.exp( + -math.log(max_period) * torch.arange(start=0, end=half, dtype=torch.float32, device=t.device) / half + ) + args = t[..., None].float() * freqs[None, ...] 
+ embedding = torch.cat([torch.cos(args), torch.sin(args)], dim=-1) + if dim % 2: + embedding = torch.cat([embedding, torch.zeros_like(embedding[:, :1])], dim=-1) + return embedding + + def forward(self, t): + t_freq = self.timestep_embedding(t, self.frequency_embedding_size) + mlp_dtype = next(self.mlp.parameters()).dtype + if t_freq.dtype != mlp_dtype: + t_freq = t_freq.to(mlp_dtype) + t_emb = self.mlp(t_freq) + return t_emb + + +class RMSNorm(nn.Module): + def __init__(self, hidden_size, eps=1e-6): + super().__init__() + self.weight = nn.Parameter(torch.ones(hidden_size)) + self.variance_epsilon = eps + + def forward(self, hidden_states): + input_dtype = hidden_states.dtype + hidden_states = hidden_states.to(torch.float32) + variance = hidden_states.pow(2).mean(-1, keepdim=True) + hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon) + return self.weight * hidden_states.to(input_dtype) + + +class FeedForward(nn.Module): + def __init__(self, dim: int, hidden_dim: int): + super().__init__() + hidden_dim = int(2 * hidden_dim / 3) + self.w1 = nn.Linear(dim, hidden_dim, bias=False) + self.w3 = nn.Linear(dim, hidden_dim, bias=False) + self.w2 = nn.Linear(hidden_dim, dim, bias=False) + + def forward(self, x): + x = self.w2(torch.nn.functional.silu(self.w1(x)) * self.w3(x)) + return x + + +def precompute_freqs_cis_2d(dim: int, height: int, width: int, theta: float = 10000.0, scale=16.0): + x_pos = torch.linspace(0, scale, width) + y_pos = torch.linspace(0, scale, height) + y_pos, x_pos = torch.meshgrid(y_pos, x_pos, indexing="ij") + y_pos = y_pos.reshape(-1) + x_pos = x_pos.reshape(-1) + freqs = 1.0 / (theta ** (torch.arange(0, dim, 4)[: (dim // 4)].float() / dim)) + x_freqs = torch.outer(x_pos, freqs).float() + y_freqs = torch.outer(y_pos, freqs).float() + x_cis = torch.polar(torch.ones_like(x_freqs), x_freqs) + y_cis = torch.polar(torch.ones_like(y_freqs), y_freqs) + freqs_cis = torch.cat([x_cis.unsqueeze(dim=-1), y_cis.unsqueeze(dim=-1)], 
dim=-1) + freqs_cis = freqs_cis.reshape(height * width, -1) + return freqs_cis + + +def precompute_freqs_cis_2d_ntk( + dim: int, + height: int, + width: int, + ref_grid_h: int, + ref_grid_w: int, + theta: float = 10000.0, + scale: float = 16.0, +): + """NTK-aware 2D RoPE. Identical to precompute_freqs_cis_2d when + height == ref_grid_h and width == ref_grid_w. For other resolutions + the base theta is scaled per-axis following the NTK-aware formula: + ntk_factor = (current / ref) ** (dim_axis / (dim_axis - 2)) + theta_axis = theta * ntk_factor + where dim_axis = dim // 2 (half the head dim per spatial axis). + """ + dim_axis = dim // 2 # each axis gets dim//4 complex pairs → dim//2 real dims + h_scale = height / ref_grid_h + w_scale = width / ref_grid_w + h_ntk = h_scale ** (dim_axis / (dim_axis - 2)) if dim_axis > 2 else 1.0 + w_ntk = w_scale ** (dim_axis / (dim_axis - 2)) if dim_axis > 2 else 1.0 + h_theta = theta * h_ntk + w_theta = theta * w_ntk + + x_pos = torch.linspace(0, scale, width) + y_pos = torch.linspace(0, scale, height) + y_pos, x_pos = torch.meshgrid(y_pos, x_pos, indexing="ij") + y_pos = y_pos.reshape(-1) + x_pos = x_pos.reshape(-1) + + freqs_w = 1.0 / (w_theta ** (torch.arange(0, dim, 4)[: (dim // 4)].float() / dim)) + freqs_h = 1.0 / (h_theta ** (torch.arange(0, dim, 4)[: (dim // 4)].float() / dim)) + + x_freqs = torch.outer(x_pos, freqs_w).float() + y_freqs = torch.outer(y_pos, freqs_h).float() + x_cis = torch.polar(torch.ones_like(x_freqs), x_freqs) + y_cis = torch.polar(torch.ones_like(y_freqs), y_freqs) + freqs_cis = torch.cat([x_cis.unsqueeze(dim=-1), y_cis.unsqueeze(dim=-1)], dim=-1) + freqs_cis = freqs_cis.reshape(height * width, -1) + return freqs_cis + + +def apply_rotary_emb( + xq: torch.Tensor, + xk: torch.Tensor, + freqs_cis: torch.Tensor, +) -> Tuple[torch.Tensor, torch.Tensor]: + freqs_cis = freqs_cis[None, :, None, :] + xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2)) + xk_ = 
torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2)) + xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3) + xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3) + return xq_out.type_as(xq), xk_out.type_as(xk) + + +class RotaryAttention(nn.Module): + def __init__( + self, + dim: int, + num_heads: int = 8, + qkv_bias: bool = False, + qk_norm: bool = True, + attn_drop: float = 0.0, + proj_drop: float = 0.0, + norm_layer: nn.Module = RMSNorm, + ) -> None: + super().__init__() + assert dim % num_heads == 0, "dim should be divisible by num_heads" + + self.dim = dim + self.num_heads = num_heads + self.head_dim = dim // num_heads + self.scale = self.head_dim**-0.5 + + self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias) + self.q_norm = norm_layer(self.head_dim) if qk_norm else nn.Identity() + self.k_norm = norm_layer(self.head_dim) if qk_norm else nn.Identity() + self.attn_drop = nn.Dropout(attn_drop) + self.proj = nn.Linear(dim, dim) + self.proj_drop = nn.Dropout(proj_drop) + # Context-parallel group; when set, `forward` runs split-Q / gather-K,V. + self._cp_group: Optional[ProcessGroup] = None + + def set_context_parallel_group(self, cp_group: Optional[ProcessGroup]): + self._cp_group = cp_group + + def forward(self, x: torch.Tensor, pos, mask) -> torch.Tensor: + # CP convention: caller passes `pos` of full sequence length (N_full). + # When `_cp_group` is set, `x` is the rank-local slice [B, N_local, C] + # with N_local = N_full / cp_size. We gather k/v to full length, apply + # RoPE with the appropriate slice/full pos, and run SDPA producing + # local-Q output [B, N_local, C]. 
+ B, N, C = x.shape + qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, C // self.num_heads).permute(2, 0, 1, 3, 4) + q, k, v = qkv[0], qkv[1], qkv[2] + q = self.q_norm(q) + k = self.k_norm(k) + if self._cp_group is None: + q, k = apply_rotary_emb(q, k, freqs_cis=pos) + else: + cp_size = self._cp_group.size() + cp_rank = self._cp_group.rank() + N_full = pos.shape[0] + assert N_full % cp_size == 0, f"pos length {N_full} not divisible by cp_size {cp_size}" + N_local = N_full // cp_size + assert N == N_local, f"local x length {N} != expected {N_local}" + pos_local = pos.view(cp_size, N_local, -1)[cp_rank] + # Apply RoPE to local q with local pos. + q, _ = apply_rotary_emb(q, q, freqs_cis=pos_local) + # Gather k, v across CP ranks along the sequence dim, then RoPE with full pos. + # `all_gather` requires contiguous tensors; the qkv permute leaves k/v as non-contiguous views. + k = cat_outputs_cp_with_grad(k.contiguous(), seq_dim=1, cp_group=self._cp_group) + v = cat_outputs_cp_with_grad(v.contiguous(), seq_dim=1, cp_group=self._cp_group) + _, k = apply_rotary_emb(k, k, freqs_cis=pos) + q = q.view(B, -1, self.num_heads, C // self.num_heads).transpose(1, 2) + k = k.view(B, -1, self.num_heads, C // self.num_heads).transpose(1, 2).contiguous() + v = v.view(B, -1, self.num_heads, C // self.num_heads).transpose(1, 2).contiguous() + + x = scaled_dot_product_attention(q, k, v, attn_mask=mask, dropout_p=0.0) + + x = x.transpose(1, 2).reshape(B, N, C) + x = self.proj(x) + x = self.proj_drop(x) + return x + + +class MLP(nn.Module): + def __init__(self, dim: int, mlp_ratio: float = 4.0, drop: float = 0.0): + super().__init__() + hidden_dim = int(dim * mlp_ratio) + self.fc1 = nn.Linear(dim, hidden_dim) + self.act = nn.GELU() + self.fc2 = nn.Linear(hidden_dim, dim) + self.drop = nn.Dropout(drop) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + x = self.fc1(x) + x = self.act(x) + x = self.drop(x) + x = self.fc2(x) + x = self.drop(x) + return x + + +class 
class PatchTokenEmbedder(nn.Module):
    """Linear token embedding followed by an optional normalization layer.

    Args:
        in_chans: Size of the last dimension of the input tokens.
        embed_dim: Size of the embedding produced for each token.
        norm_layer: Optional callable; when truthy it is invoked as
            ``norm_layer(embed_dim)`` to build the normalization module,
            otherwise ``nn.Identity`` is used.
        bias: Whether the linear projection has a bias term.
    """

    def __init__(
        self,
        in_chans: int = 3,
        embed_dim: int = 768,
        norm_layer=None,
        bias: bool = True,
    ):
        super().__init__()
        self.in_chans = in_chans
        self.embed_dim = embed_dim
        self.proj = nn.Linear(in_chans, embed_dim, bias=bias)
        # A falsy `norm_layer` (e.g. None) disables normalization.
        if norm_layer:
            self.norm = norm_layer(embed_dim)
        else:
            self.norm = nn.Identity()

    def forward(self, x):
        """Project ``x`` along its last dimension, then normalize."""
        return self.norm(self.proj(x))
torch.from_numpy(pos_np).to(device=device, dtype=dtype) # [H*W, D] + self._pos_cache[key] = pos + return pos + else: + key = ("image", height, width) + if key in self._pos_cache: + pe = self._pos_cache[key] + return pe.to(device=device, dtype=dtype) + # Build a non-square grid (H x W) and compute 2D sin/cos embedding + grid_h = np.arange(height, dtype=np.float32) + grid_w = np.arange(width, dtype=np.float32) + grid = np.meshgrid(grid_w, grid_h) # w first to match existing convention + grid = np.stack(grid, axis=0).reshape(2, 1, height, width) + pos_np = get_2d_sincos_pos_embed_from_grid(self.hidden_size_output, grid) + pos = torch.from_numpy(pos_np).to(device=device, dtype=dtype) # [H*W, D] + self._pos_cache[key] = pos + return pos + + def forward(self, inputs: torch.Tensor, img_height: int = None, img_width: int = None, patch_size: int = None): + # Two modes: + # 1) Legacy patch mode: inputs [B*L, P2, C] -> add 2D sincos within patch (P2 = patch_size^2) + # 2) Image mode: inputs [B, C, H, W] -> patchify inside and add full-image (H*W) pixel-space sincos sampled per patch + if inputs.dim() == 3: + # Legacy: [B*L, P2, C] + batch_tokens, p2, _ = inputs.shape + patch_sz = int(p2**0.5) + pos = self._fetch_pixel_pos_patch(patch_sz, inputs.device, inputs.dtype) # [P2, D] + x = self.proj(inputs) + x = x + pos.unsqueeze(0) + return x + elif inputs.dim() == 4: + # Image mode: [B, C, H, W] + assert img_height is not None and img_width is not None and patch_size is not None, ( + "Need H, W, patch_size for image mode" + ) + B, C, H, W = inputs.shape + assert H == img_height and W == img_width, "Input spatial size mismatch" + assert (H % patch_size == 0) and (W % patch_size == 0), "H and W must be divisible by patch_size" + Hs, Ws = H // patch_size, W // patch_size + P2 = patch_size * patch_size + # linear proj per pixel + x = inputs.permute(0, 2, 3, 1).contiguous() # [B, H, W, C] + x = self.proj(x) # [B, H, W, D] + # full-image pixel-space pos + pos_full = 
class PiTBlock(nn.Module):
    """Pixel-level transformer block conditioned on per-patch context tokens.

    For every patch, the ``patch_size**2`` pixel tokens are normalized and
    modulated, flattened and linearly compressed into one ``attn_dim`` vector,
    attended across patches with ``RotaryAttention``, expanded back to pixel
    tokens and added as a gated residual. A gated per-pixel MLP residual follows.
    Shift/scale/gate values for both branches come from ``adaLN_modulation``
    applied to the patch-level conditioning ``s_cond``.
    """

    def __init__(
        self,
        pixel_hidden_size: int,
        patch_hidden_size: int,
        patch_size: int,
        num_heads: int,
        mlp_ratio: float = 4.0,
        attn_hidden_size: Optional[int] = None,
        attn_num_heads: Optional[int] = None,
        rope_mode: str = "original",
        rope_ref_grid_h: int = 32,
        rope_ref_grid_w: int = 32,
    ):
        super().__init__()
        self.pixel_dim = int(pixel_hidden_size)
        self.context_dim = int(patch_hidden_size)
        self.patch_size = int(patch_size)
        # Attention width and head count fall back to the patch-level values
        # when no explicit override is given.
        if attn_hidden_size is None:
            self.attn_dim = self.context_dim
        else:
            self.attn_dim = int(attn_hidden_size)
        if attn_num_heads is None:
            self.num_heads = int(num_heads)
        else:
            self.num_heads = int(attn_num_heads)
        self.rope_mode = rope_mode
        self.rope_ref_grid_h = rope_ref_grid_h
        self.rope_ref_grid_w = rope_ref_grid_w
        assert self.attn_dim % self.num_heads == 0, "pixel attention hidden size must be divisible by pixel num_heads"

        pixels_per_patch = self.patch_size * self.patch_size
        flat_patch_dim = pixels_per_patch * self.pixel_dim
        # Submodules are created in a fixed order so parameter initialization and
        # state_dict layout stay unchanged.
        self.compress_to_attn = nn.Linear(flat_patch_dim, self.attn_dim, bias=True)
        self.expand_from_attn = nn.Linear(self.attn_dim, flat_patch_dim, bias=True)
        self.norm1 = RMSNorm(self.pixel_dim, eps=1e-6)
        self.attn = RotaryAttention(self.attn_dim, num_heads=self.num_heads, qkv_bias=False)
        self.norm2 = RMSNorm(self.pixel_dim, eps=1e-6)
        self.mlp = MLP(self.pixel_dim, mlp_ratio=mlp_ratio, drop=0.0)
        self.adaLN_modulation = nn.Sequential(
            nn.Linear(self.context_dim, 6 * self.pixel_dim * pixels_per_patch, bias=True)
        )
        # RoPE frequency cache keyed by (grid_height, grid_width).
        self._pos_cache = {}
        # CP group; when set, the attention runs split-Q / gather-K,V across L.
        self._cp_group: Optional[ProcessGroup] = None

    def set_context_parallel_group(self, cp_group: Optional[ProcessGroup]):
        """Store the context-parallel group and forward it to the attention module."""
        self._cp_group = cp_group
        self.attn.set_context_parallel_group(cp_group)

    def _fetch_pos(self, height: int, width: int, device):
        """Return the RoPE frequencies for a ``height x width`` patch grid, cached per grid size."""
        cache_key = (height, width)
        cached = self._pos_cache.get(cache_key)
        if cached is not None:
            return cached.to(device)
        head_dim = self.attn_dim // self.num_heads
        if self.rope_mode == "ntk_aware":
            freqs = precompute_freqs_cis_2d_ntk(head_dim, height, width, self.rope_ref_grid_h, self.rope_ref_grid_w)
        else:
            freqs = precompute_freqs_cis_2d(head_dim, height, width)
        freqs = freqs.to(device)
        self._pos_cache[cache_key] = freqs
        return freqs

    def forward(
        self, x: torch.Tensor, s_cond: torch.Tensor, image_height: int, image_width: int, patch_size: int, mask=None
    ) -> torch.Tensor:
        """Run the block on pixel tokens.

        Args:
            x: ``[B*L_local, P2, C]`` pixel tokens, where ``P2 = patch_size**2`` and
                ``C == pixel_dim``. Under context parallelism
                ``L_local = (Hs*Ws)/cp_size``; otherwise ``L_local`` is the full
                patch count.
            s_cond: Per-patch conditioning whose first dimension equals ``B*L_local``.
            image_height, image_width: Pixel size of the image; both must be
                divisible by ``patch_size``.
            patch_size: Must equal the ``patch_size`` the block was built with.
            mask: Passed through to the attention module.

        Returns:
            Tensor with the same shape as ``x``.

        Raises:
            ValueError: If the channel dimension of ``x`` is not ``pixel_dim``.
        """
        BL, P2, C = x.shape
        if C != self.pixel_dim:
            raise ValueError(f"PiTBlock expected pixel_dim={self.pixel_dim}, got {C}")
        assert patch_size == self.patch_size, "PiTBlock expects fixed patch_size"
        assert P2 == patch_size * patch_size, "Token count per patch must equal patch_size^2"
        assert (image_height % patch_size == 0) and (image_width % patch_size == 0), (
            "H and W must be divisible by patch_size"
        )
        grid_h = image_height // patch_size
        grid_w = image_width // patch_size
        L = grid_h * grid_w
        cp_size = 1 if self._cp_group is None else self._cp_group.size()
        assert L % cp_size == 0, f"L={L} not divisible by cp_size={cp_size}"
        patches_local = L // cp_size
        assert s_cond.shape[0] == BL, "s_cond batch must match x batch"
        assert BL % patches_local == 0, "Total sequences must be a multiple of local patch count"
        batch = BL // patches_local

        # Per-pixel adaLN parameters: one (shift, scale, gate) triple for each branch.
        modulation = self.adaLN_modulation(s_cond).view(BL, P2, 6 * self.pixel_dim)
        shift_attn, scale_attn, gate_attn, shift_mlp, scale_mlp, gate_mlp = modulation.chunk(6, dim=-1)

        # Attention across patch tokens. `pos` is full-length; the CP-aware
        # RotaryAttention gathers k/v across CP ranks internally.
        attn_in = apply_adaln(self.norm1(x), shift_attn, scale_attn).view(BL, P2 * self.pixel_dim)
        attn_in = self.compress_to_attn(attn_in).view(batch, patches_local, self.attn_dim)
        rope = self._fetch_pos(grid_h, grid_w, x.device)
        attn_out = self.attn(attn_in, rope, mask)  # [B, L_local, attn_dim]
        attn_out = self.expand_from_attn(attn_out.view(batch * patches_local, self.attn_dim))
        x = x + gate_attn * attn_out.view(BL, P2, self.pixel_dim)

        # Per-pixel MLP branch, applied locally.
        x = x + gate_mlp * self.mlp(apply_adaln(self.norm2(x), shift_mlp, scale_mlp))
        return x
+ self._cp_group: Optional[ProcessGroup] = None + + def set_context_parallel_group(self, cp_group: Optional[ProcessGroup]): + self._cp_group = cp_group + + def forward( + self, + x: torch.Tensor, # [B, Nx, C] image stream (Nx = Nx_local under CP) + y: torch.Tensor, # [B, Ny, C] text stream (always full / replicated) + pos_img: torch.Tensor, # [Nx_full, head_dim/2] complex RoPE freqs (always full) + pos_txt: torch.Tensor = None, # [Ny, head_dim/2] complex RoPE freqs for text (optional) + attn_mask: torch.Tensor = None, + ) -> Tuple[torch.Tensor, torch.Tensor]: + B, Nx, C = x.shape + By, Ny, Cy = y.shape + assert B == By and C == Cy, "x and y must share batch and channel dims" + + # QKV for image + qkv_x = self.qkv_x(x).reshape(B, Nx, 3, self.num_heads, C // self.num_heads).permute(2, 0, 1, 3, 4) + qx, kx, vx = qkv_x[0], qkv_x[1], qkv_x[2] # [B, Nx, H, Hc] + qx = self.q_norm_x(qx) + kx = self.k_norm_x(kx) + + # QKV for text + qkv_y = self.qkv_y(y).reshape(B, Ny, 3, self.num_heads, C // self.num_heads).permute(2, 0, 1, 3, 4) + qy, ky, vy = qkv_y[0], qkv_y[1], qkv_y[2] # [B, Ny, H, Hc] + qy = self.q_norm_y(qy) + ky = self.k_norm_y(ky) + + # Image RoPE — under CP, q uses the rank-local slice of pos_img, k (after + # all-gather along the sequence dim) uses the full pos_img. + if self._cp_group is None: + qx, kx = apply_rotary_emb(qx, kx, freqs_cis=pos_img) + else: + cp_size = self._cp_group.size() + cp_rank = self._cp_group.rank() + Nx_full = pos_img.shape[0] + assert Nx_full % cp_size == 0, f"pos_img length {Nx_full} not divisible by cp_size {cp_size}" + Nx_local = Nx_full // cp_size + assert Nx == Nx_local, f"local image stream length {Nx} != expected {Nx_local}" + pos_img_local = pos_img.view(cp_size, Nx_local, -1)[cp_rank] + qx, _ = apply_rotary_emb(qx, qx, freqs_cis=pos_img_local) + # `all_gather` requires contiguous tensors; the qkv permute leaves k/v as non-contiguous views. 
+ kx = cat_outputs_cp_with_grad(kx.contiguous(), seq_dim=1, cp_group=self._cp_group) + vx = cat_outputs_cp_with_grad(vx.contiguous(), seq_dim=1, cp_group=self._cp_group) + _, kx = apply_rotary_emb(kx, kx, freqs_cis=pos_img) + if pos_txt is not None: + qy, ky = apply_rotary_emb(qy, ky, freqs_cis=pos_txt) + + # SDPA expects [B, H, S, Hc]; build joint sequence [text, image]. + # Under CP: qx is [B, H, Nx_local, Hc]; kx, vx are [B, H, Nx_full, Hc]. + qx = qx.transpose(1, 2) + kx = kx.transpose(1, 2) + vx = vx.transpose(1, 2) + + qy = qy.transpose(1, 2) # [B, H, Ny, Hc] + ky = ky.transpose(1, 2) + vy = vy.transpose(1, 2) + + q_joint = torch.cat([qy, qx], dim=2) # [B, H, Ny + Nx_local, Hc] + k_joint = torch.cat([ky, kx], dim=2) # [B, H, Ny + Nx_full, Hc] + v_joint = torch.cat([vy, vx], dim=2) + + out_joint = F.scaled_dot_product_attention(q_joint, k_joint, v_joint, dropout_p=0.0, attn_mask=attn_mask) + # Split back to [text, image]; image output is local under CP. + out_y = out_joint[:, :, :Ny, :] + out_x = out_joint[:, :, Ny:, :] + + # Merge heads + out_y = out_y.transpose(1, 2).reshape(B, Ny, C) + out_x = out_x.transpose(1, 2).reshape(B, Nx, C) + + # Output projections + out_x = self.proj_drop_x(self.proj_x(out_x)) + out_y = self.proj_drop_y(self.proj_y(out_y)) + return out_x, out_y + + +class MMDiTBlockT2I(nn.Module): + def __init__(self, hidden_size, groups, mlp_ratio=4.0, adaLN_modulation_img=None, adaLN_modulation_txt=None): + super().__init__() + self.hidden_size = hidden_size + self.groups = groups + self.head_dim = hidden_size // groups + + # Per-stream norms + self.norm_x1 = RMSNorm(hidden_size, eps=1e-6) + self.norm_y1 = RMSNorm(hidden_size, eps=1e-6) + + self.attn = MMDiTJointAttention(hidden_size, num_heads=groups, qkv_bias=False) + + self.norm_x2 = RMSNorm(hidden_size, eps=1e-6) + self.norm_y2 = RMSNorm(hidden_size, eps=1e-6) + + mlp_hidden_dim = int(hidden_size * mlp_ratio) + self.mlp_x = FeedForward(hidden_size, mlp_hidden_dim) + self.mlp_y = 
FeedForward(hidden_size, mlp_hidden_dim) + + # Per-stream AdaLN modulation + self.adaLN_modulation_img = ( + adaLN_modulation_img + if adaLN_modulation_img is not None + else nn.Sequential(nn.Linear(hidden_size, 6 * hidden_size, bias=True)) + ) + self.adaLN_modulation_txt = ( + adaLN_modulation_txt + if adaLN_modulation_txt is not None + else nn.Sequential(nn.Linear(hidden_size, 6 * hidden_size, bias=True)) + ) + + def set_context_parallel_group(self, cp_group: Optional[ProcessGroup]): + # The block itself has no CP-affecting state; only the joint attention does. + self.attn.set_context_parallel_group(cp_group) + + def forward(self, x, y, c, pos_img, pos_txt=None, attn_mask=None): + # c: [B, 1, C] typically, broadcast across tokens + shift_msa_x, scale_msa_x, gate_msa_x, shift_mlp_x, scale_mlp_x, gate_mlp_x = self.adaLN_modulation_img(c).chunk( + 6, dim=-1 + ) + shift_msa_y, scale_msa_y, gate_msa_y, shift_mlp_y, scale_mlp_y, gate_mlp_y = self.adaLN_modulation_txt(c).chunk( + 6, dim=-1 + ) + + # 1) Joint attention with dual-stream + x_norm = apply_adaln(self.norm_x1(x), shift_msa_x, scale_msa_x) + y_norm = apply_adaln(self.norm_y1(y), shift_msa_y, scale_msa_y) + attn_x, attn_y = self.attn(x_norm, y_norm, pos_img, pos_txt, attn_mask) + x = x + gate_msa_x * attn_x + y = y + gate_msa_y * attn_y + + # 2) Per-stream MLP with AdaLN + x = x + gate_mlp_x * self.mlp_x(apply_adaln(self.norm_x2(x), shift_mlp_x, scale_mlp_x)) + y = y + gate_mlp_y * self.mlp_y(apply_adaln(self.norm_y2(y), shift_mlp_y, scale_mlp_y)) + return x, y + + +def _compute_num_stages_from_ratio(compress_ratio: int) -> int: + if compress_ratio <= 1: + return 0 + if compress_ratio & (compress_ratio - 1) != 0: + raise ValueError(f"ed_compress_ratio must be power of 2, got {compress_ratio}") + return int(math.log2(compress_ratio)) + + +class _TransformerBlock(nn.Module): + def __init__( + self, + dim: int, + num_heads: int, + mlp_ratio: float = 4.0, + drop: float = 0.0, + use_token_compression: bool = False, 
+ token_shuffle_window_size: int = 1, + rope_mode: str = "original", + rope_ref_grid_h: int = 32, + rope_ref_grid_w: int = 32, + ): + super().__init__() + self.dim = dim + self.num_heads = num_heads + self.rope_mode = rope_mode + self.rope_ref_grid_h = rope_ref_grid_h + self.rope_ref_grid_w = rope_ref_grid_w + self.norm1 = RMSNorm(dim, eps=1e-6) + self.attn = RotaryAttention(dim, num_heads=num_heads, qkv_bias=False) + self.norm2 = RMSNorm(dim, eps=1e-6) + self.mlp = MLP(dim, mlp_ratio=mlp_ratio, drop=drop) + self.adaLN_modulation = nn.Sequential(nn.Linear(dim, 6 * dim, bias=True)) + self.use_token_compression = bool(use_token_compression) + ts_ws = int(token_shuffle_window_size) if self.use_token_compression else 1 + + if self.use_token_compression and ts_ws > 1: + + class _AttnTokenShuffleCompression(nn.Module): + def __init__(self): + super().__init__() + s2 = ts_ws * ts_ws + adapted_hidden = ((dim + s2 - 1) // s2) * s2 + needs_adapter_in = adapted_hidden != dim + compressed_dim = adapted_hidden // s2 + self.s = ts_ws + self.adapted_hidden = adapted_hidden + self.compressed_dim = compressed_dim + self.adapter_in = ( + nn.Sequential(nn.Linear(dim, adapted_hidden, bias=True), nn.GELU()) + if needs_adapter_in + else nn.Identity() + ) + self.proj_down = nn.Linear(adapted_hidden, compressed_dim, bias=True) + self.proj_to_attn = ( + nn.Identity() if adapted_hidden == dim else nn.Linear(adapted_hidden, dim, bias=True) + ) + + def forward(self, x: torch.Tensor, height: int, width: int) -> torch.Tensor: + B, N, C = x.shape + assert N == height * width, f"Token count {N} != {height}*{width}" + s = self.s + assert height % s == 0 and width % s == 0, ( + f"Height {height} and Width {width} must be divisible by token shuffle size {s}" + ) + x = x.view(B, height, width, C) + x = self.adapter_in(x) + x = self.proj_down(x) + c_per = self.compressed_dim + x = x.view(B, height // s, s, width // s, s, c_per) + x = x.permute(0, 1, 3, 2, 4, 5).contiguous() + x = x.view(B, (height // 
s) * (width // s), s * s * c_per) + x = self.proj_to_attn(x) + return x + + class _AttnTokenShuffleExpansion(nn.Module): + def __init__(self): + super().__init__() + s2 = ts_ws * ts_ws + adapted_hidden = ((dim + s2 - 1) // s2) * s2 + needs_adapter_out = adapted_hidden != dim + compressed_dim = adapted_hidden // s2 + self.s = ts_ws + self.adapted_hidden = adapted_hidden + self.compressed_dim = compressed_dim + self.proj_from_attn = ( + nn.Identity() if adapted_hidden == dim else nn.Linear(dim, adapted_hidden, bias=True) + ) + self.proj_up = nn.Sequential(nn.Linear(compressed_dim, adapted_hidden, bias=True), nn.GELU()) + self.adapter_out = ( + nn.Sequential(nn.Linear(adapted_hidden, dim, bias=True), nn.GELU()) + if needs_adapter_out + else nn.Identity() + ) + + def forward(self, x: torch.Tensor, height: int, width: int) -> torch.Tensor: + B, Np, C = x.shape + s = self.s + Hs, Ws = height // s, width // s + assert Np == Hs * Ws, f"Token count {Np} != {Hs}*{Ws}" + x = self.proj_from_attn(x) + c_per = self.compressed_dim + x = x.view(B, Hs, Ws, s, s, c_per) + x_flat = x.reshape(B * Hs * Ws * s * s, c_per) + x_expanded = self.proj_up(x_flat) + x_expanded = x_expanded.view(B, Hs, Ws, s, s, self.adapted_hidden) + x_expanded = x_expanded.permute(0, 1, 3, 2, 4, 5).contiguous() + x_expanded = x_expanded.view(B, Hs * s, Ws * s, self.adapted_hidden) + x_expanded = self.adapter_out(x_expanded) + x_expanded = x_expanded.view(B, height * width, dim) + return x_expanded + + self._ts_compress = _AttnTokenShuffleCompression() + self._ts_expand = _AttnTokenShuffleExpansion() + else: + self._ts_compress = None + self._ts_expand = None + + def forward( + self, + x: torch.Tensor, + c: torch.Tensor, + pos: torch.Tensor, + mask: Optional[torch.Tensor] = None, + height: Optional[int] = None, + width: Optional[int] = None, + ) -> torch.Tensor: + shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = self.adaLN_modulation(c).chunk(6, dim=-1) + use_ts = ( + self.use_token_compression 
+ and self._ts_compress is not None + and self._ts_expand is not None + and height is not None + and width is not None + ) + if use_ts: + x_norm = apply_adaln(self.norm1(x), shift_msa, scale_msa) + x_comp = self._ts_compress(x_norm, height, width) + s = self._ts_compress.s + Hs, Ws = height // s, width // s + head_dim = self.dim // self.num_heads + if self.rope_mode == "ntk_aware": + pos_comp = precompute_freqs_cis_2d_ntk(head_dim, Hs, Ws, self.rope_ref_grid_h, self.rope_ref_grid_w).to( + x.device + ) + else: + pos_comp = precompute_freqs_cis_2d(head_dim, Hs, Ws).to(x.device) + attn_out = self.attn(x_comp, pos_comp, mask) + attn_out = self._ts_expand(attn_out, height, width) + x = x + gate_msa * attn_out + else: + attn_out = self.attn(apply_adaln(self.norm1(x), shift_msa, scale_msa), pos, mask) + x = x + gate_msa * attn_out + x = x + gate_mlp * self.mlp(apply_adaln(self.norm2(x), shift_mlp, scale_mlp)) + return x + + +class _PatchMerging(nn.Module): + def __init__(self, hidden_size: int, window_size: int = 2): + super().__init__() + self.hidden_size = hidden_size + self.window_size = int(window_size) + s2 = self.window_size * self.window_size + self.adapted_hidden = ((hidden_size + s2 - 1) // s2) * s2 + self.needs_adapter = self.adapted_hidden != hidden_size + self.adapter_in = ( + nn.Sequential(nn.Linear(hidden_size, self.adapted_hidden, bias=True), nn.GELU()) + if self.needs_adapter + else nn.Identity() + ) + self.compressed_dim = self.adapted_hidden // s2 + self.proj_down = nn.Linear(self.adapted_hidden, self.compressed_dim, bias=True) + self.proj_to_hidden = ( + nn.Identity() + if self.adapted_hidden == hidden_size + else nn.Sequential(nn.Linear(self.adapted_hidden, hidden_size, bias=True), nn.GELU()) + ) + + def forward(self, x: torch.Tensor, height: int, width: int): + B, N, C = x.shape + assert N == height * width, f"Token count {N} doesn't match H*W={height * width}" + s = self.window_size + assert height % s == 0 and width % s == 0, f"H and W must be 
divisible by {s}" + x = x.view(B, height, width, C) + x = self.adapter_in(x) + x = self.proj_down(x) + c_per = self.compressed_dim + x = x.view(B, height // s, s, width // s, s, c_per) + x = x.permute(0, 1, 3, 2, 4, 5).contiguous() + x = x.view(B, (height // s) * (width // s), s * s * c_per) + x = self.proj_to_hidden(x) + return x, height // s, width // s + + +class _PatchExpanding(nn.Module): + def __init__(self, hidden_size: int, window_size: int = 2): + super().__init__() + self.hidden_size = hidden_size + self.window_size = int(window_size) + s2 = self.window_size * self.window_size + self.adapted_hidden = ((hidden_size + s2 - 1) // s2) * s2 + self.needs_adapter = self.adapted_hidden != hidden_size + self.proj_from_hidden = ( + nn.Identity() + if self.adapted_hidden == hidden_size + else nn.Linear(hidden_size, self.adapted_hidden, bias=True) + ) + self.compressed_dim = self.adapted_hidden // s2 + self.proj_up = nn.Sequential(nn.Linear(self.compressed_dim, self.adapted_hidden, bias=True), nn.GELU()) + self.adapter_out = ( + nn.Sequential(nn.Linear(self.adapted_hidden, hidden_size, bias=True), nn.GELU()) + if self.needs_adapter + else nn.Identity() + ) + + def forward(self, x: torch.Tensor, height: int, width: int): + B, Np, C = x.shape + Hs, Ws = height, width + s = self.window_size + x = self.proj_from_hidden(x) + c_per = self.adapted_hidden // (s * s) + x = x.view(B, Hs, Ws, s, s, c_per) + x_flat = x.reshape(B * Hs * Ws * s * s, c_per) + x_expanded = self.proj_up(x_flat) + x_expanded = x_expanded.view(B, Hs, Ws, s, s, self.adapted_hidden) + x_expanded = x_expanded.permute(0, 1, 3, 2, 4, 5).contiguous() + x_expanded = x_expanded.view(B, Hs * s, Ws * s, self.adapted_hidden) + x_expanded = self.adapter_out(x_expanded) + x_expanded = x_expanded.view(B, (Hs * s) * (Ws * s), self.hidden_size) + return x_expanded, Hs * s, Ws * s + + +class _EncoderED(nn.Module): + def __init__( + self, + hidden_size: int, + num_stages: int, + depth_per_stage: int = 1, + num_heads: 
int = 8, + window_size: int = 2, + mlp_ratio: float = 4.0, + drop: float = 0.0, + use_attn_token_shuffle: bool = False, + rope_mode: str = "original", + rope_ref_grid_h: int = 32, + rope_ref_grid_w: int = 32, + ): + super().__init__() + self.hidden_size = int(hidden_size) + self.num_heads = int(num_heads) + self.num_stages = int(num_stages) + self.window_size = int(window_size) + self.use_attn_token_shuffle = bool(use_attn_token_shuffle) + self.rope_mode = rope_mode + self.rope_ref_grid_h = rope_ref_grid_h + self.rope_ref_grid_w = rope_ref_grid_w + self._pos_cache = {} + stages = [] + for i_stage in range(self.num_stages): + ts_ws = 2 ** (self.num_stages - i_stage) if self.use_attn_token_shuffle else 1 + blocks = nn.ModuleList( + [ + _TransformerBlock( + hidden_size, + num_heads, + mlp_ratio, + drop, + use_token_compression=self.use_attn_token_shuffle, + token_shuffle_window_size=ts_ws, + rope_mode=rope_mode, + rope_ref_grid_h=rope_ref_grid_h, + rope_ref_grid_w=rope_ref_grid_w, + ) + for _ in range(int(depth_per_stage)) + ] + ) + compress = _PatchMerging(hidden_size, window_size=self.window_size) + stages.append(nn.ModuleDict({"blocks": blocks, "compress": compress})) + self.stages = nn.ModuleList(stages) + + def _fetch_pos(self, height: int, width: int, device: torch.device): + key = (height, width) + if key in self._pos_cache: + return self._pos_cache[key].to(device) + head_dim = self.hidden_size // self.num_heads + if self.rope_mode == "ntk_aware": + pos = precompute_freqs_cis_2d_ntk(head_dim, height, width, self.rope_ref_grid_h, self.rope_ref_grid_w).to( + device + ) + else: + pos = precompute_freqs_cis_2d(head_dim, height, width).to(device) + self._pos_cache[key] = pos + return pos + + def forward(self, x: torch.Tensor, height: int, width: int, c: torch.Tensor): + H, W = height, width + skip_tokens = [] + for stage in self.stages: + for blk in stage["blocks"]: + pos = self._fetch_pos(H, W, x.device) + x = blk(x, c, pos, None, H, W) if 
class _DecoderED(nn.Module):
    """Decoder half of the encoder-decoder path.

    Each stage runs ``depth_per_stage`` transformer blocks at the current grid
    size and then enlarges the grid with ``_PatchExpanding``. After each
    expansion a skip tensor is added when its token count matches the new grid.
    """

    def __init__(
        self,
        hidden_size: int,
        num_stages: int,
        depth_per_stage: int = 1,
        num_heads: int = 8,
        window_size: int = 2,
        mlp_ratio: float = 4.0,
        drop: float = 0.0,
        use_attn_token_shuffle: bool = False,
        rope_mode: str = "original",
        rope_ref_grid_h: int = 32,
        rope_ref_grid_w: int = 32,
    ):
        super().__init__()
        self.hidden_size = int(hidden_size)
        self.num_heads = int(num_heads)
        self.num_stages = int(num_stages)
        self.window_size = int(window_size)
        self.use_attn_token_shuffle = bool(use_attn_token_shuffle)
        self.rope_mode = rope_mode
        self.rope_ref_grid_h = rope_ref_grid_h
        self.rope_ref_grid_w = rope_ref_grid_w
        # RoPE frequency cache keyed by (grid_height, grid_width).
        self._pos_cache = {}

        built_stages = []
        for i_stage in range(self.num_stages):
            # With token shuffle, the attention window doubles at every decoder stage.
            shuffle_window = 2**i_stage if self.use_attn_token_shuffle else 1
            block_kwargs = {
                "use_token_compression": self.use_attn_token_shuffle,
                "token_shuffle_window_size": shuffle_window,
                "rope_mode": rope_mode,
                "rope_ref_grid_h": rope_ref_grid_h,
                "rope_ref_grid_w": rope_ref_grid_w,
            }
            stage_blocks = nn.ModuleList(
                [
                    _TransformerBlock(hidden_size, num_heads, mlp_ratio, drop, **block_kwargs)
                    for _ in range(int(depth_per_stage))
                ]
            )
            stage_expand = _PatchExpanding(hidden_size, window_size=self.window_size)
            built_stages.append(nn.ModuleDict({"blocks": stage_blocks, "expand": stage_expand}))
        self.stages = nn.ModuleList(built_stages)

    def _fetch_pos(self, height: int, width: int, device: torch.device):
        """Return the RoPE frequencies for a ``height x width`` grid, cached per grid size."""
        cache_key = (height, width)
        cached = self._pos_cache.get(cache_key)
        if cached is not None:
            return cached.to(device)
        head_dim = self.hidden_size // self.num_heads
        if self.rope_mode == "ntk_aware":
            freqs = precompute_freqs_cis_2d_ntk(head_dim, height, width, self.rope_ref_grid_h, self.rope_ref_grid_w)
        else:
            freqs = precompute_freqs_cis_2d(head_dim, height, width)
        freqs = freqs.to(device)
        self._pos_cache[cache_key] = freqs
        return freqs

    def forward(self, x: torch.Tensor, bottleneck_h: int, bottleneck_w: int, skip_tokens, c: torch.Tensor):
        """Decode tokens from the bottleneck grid back up through every stage.

        Args:
            x: Token tensor at the bottleneck grid.
            bottleneck_h, bottleneck_w: Grid size of ``x``.
            skip_tokens: Indexable sequence of skip tensors, consumed in reverse
                stage order (presumably the list returned by
                ``_EncoderED.forward`` - confirm against the caller).
            c: Conditioning passed to every transformer block.

        Returns:
            Tuple ``(tokens, height, width)`` at the final grid size.
        """
        grid_h, grid_w = bottleneck_h, bottleneck_w
        last_stage = len(self.stages) - 1
        for stage_idx, stage in enumerate(self.stages):
            for block in stage["blocks"]:
                freqs = self._fetch_pos(grid_h, grid_w, x.device)
                if self.use_attn_token_shuffle:
                    x = block(x, c, freqs, None, grid_h, grid_w)
                else:
                    x = block(x, c, freqs, None)
            x, grid_h, grid_w = stage["expand"](x, grid_h, grid_w)

            # Decoder stage i pairs with skip index (num_stages - 1 - i).
            skip_idx = last_stage - stage_idx
            if not (0 <= skip_idx < len(skip_tokens)):
                continue
            skip = skip_tokens[skip_idx]
            # A skip whose token count does not match the new grid is silently ignored.
            if skip.shape[1] == grid_h * grid_w:
                x = x + skip
        return x, grid_h, grid_w
+ rope_mode: str = "original", # "original" | "ntk_aware" + rope_ref_h: int = 1024, + rope_ref_w: int = 1024, + repa_encoder_index: int = -1, + enable_ed: bool = False, + ed_compress_ratio: int = 1, + ed_depth_per_stage: int = 1, + ed_window_size: int = 2, + ed_num_heads: Optional[int] = None, + ed_hidden_size: Optional[int] = None, + ed_use_token_shuffle: bool = True, + ): + super().__init__() + self.in_channels = int(in_channels) + self.out_channels = int(in_channels) + self.hidden_size = int(hidden_size) + self.num_groups = int(num_groups) + self.patch_depth = int(patch_depth) + self.pixel_depth = int(pixel_depth) + self.num_text_blocks = int(num_text_blocks) + self.patch_size = int(patch_size) + self.pixel_hidden_size = int(pixel_hidden_size) + self.txt_embed_dim = int(txt_embed_dim) + self.txt_max_length = int(txt_max_length) + self.use_text_rope = bool(use_text_rope) + self.text_rope_theta = float(text_rope_theta) + self.rope_mode = rope_mode + self.rope_ref_grid_h = rope_ref_h // self.patch_size + self.rope_ref_grid_w = rope_ref_w // self.patch_size + self.repa_encoder_index = int(repa_encoder_index) + if self.pixel_depth <= 0: + raise ValueError("PixDiT_T2I expects pixel_depth > 0 to retain the pixel pathway") + + # Embedders + self.pixel_embedder = PixelTokenEmbedder(in_channels, self.pixel_hidden_size) + self.s_embedder = PatchTokenEmbedder(in_channels * patch_size**2, hidden_size, bias=True) + self.t_embedder = TimestepConditioner(hidden_size) + self.y_embedder = PatchTokenEmbedder(self.txt_embed_dim, hidden_size, bias=True, norm_layer=RMSNorm) + self.y_pos_embedding = nn.Parameter(torch.randn(1, self.txt_max_length, hidden_size)) + + # Blocks + # Shared AdaLN modulator for conditional blocks (optional) + self._shared_cond_adaln = None + self._shared_cond_adaln_img = None + self._shared_cond_adaln_txt = None + self.patch_blocks = nn.ModuleList( + [ + MMDiTBlockT2I( + self.hidden_size, + self.num_groups, + adaLN_modulation_img=self._shared_cond_adaln_img, 
+ adaLN_modulation_txt=self._shared_cond_adaln_txt, + ) + for _ in range(self.patch_depth) + ] + ) + # Remove AdaLN-based text refinement; PixDiT keeps cross-attn-only text handling + self.text_refine_blocks = None + self.pixel_attn_hidden_size = ( + int(pixel_attn_hidden_size) if pixel_attn_hidden_size is not None else self.hidden_size + ) + self.pixel_num_groups = int(pixel_num_groups) if pixel_num_groups is not None else self.num_groups + self.pixel_blocks = nn.ModuleList( + [ + PiTBlock( + self.pixel_hidden_size, + self.hidden_size, + patch_size=self.patch_size, + num_heads=self.num_groups, + mlp_ratio=4.0, + attn_hidden_size=self.pixel_attn_hidden_size, + attn_num_heads=self.pixel_num_groups, + rope_mode=self.rope_mode, + rope_ref_grid_h=self.rope_ref_grid_h, + rope_ref_grid_w=self.rope_ref_grid_w, + ) + for _ in range(self.pixel_depth) + ] + ) + + self.final_layer = FinalLayer(self.pixel_hidden_size, self.out_channels) + + self.precompute_pos = {} + self.precompute_pos_txt = {} # cache for 1D text RoPE + self.last_repa_tokens = None + + self.enable_ed = bool(enable_ed) + self.ed_compress_ratio = int(ed_compress_ratio) + self.ed_depth_per_stage = int(ed_depth_per_stage) + self.ed_window_size = int(ed_window_size) + self.ed_num_heads = int(ed_num_heads) if ed_num_heads is not None else self.num_groups + self.ed_hidden_size = int(ed_hidden_size) if ed_hidden_size is not None else self.hidden_size + self.ed_use_token_shuffle = bool(ed_use_token_shuffle) + self.encoder_ed: Optional[_EncoderED] = None + self.decoder_ed: Optional[_DecoderED] = None + self.s_ed_proj_in: Optional[nn.Module] = None + self.s_ed_proj_out: Optional[nn.Module] = None + self.s_ed_cond_proj: Optional[nn.Module] = None + self.s_ed_in_norm: Optional[RMSNorm] = None + self.s_ed_out_norm: Optional[RMSNorm] = None + num_stages = _compute_num_stages_from_ratio(self.ed_compress_ratio) if self.enable_ed else 0 + self.use_ed = self.enable_ed and num_stages > 0 + if self.use_ed: + if 
self.ed_hidden_size % self.ed_num_heads != 0: + raise ValueError( + f"ed_hidden_size {self.ed_hidden_size} must be divisible by ed_num_heads {self.ed_num_heads}" + ) + self.s_ed_proj_in = ( + nn.Identity() + if self.ed_hidden_size == self.hidden_size + else nn.Linear(self.hidden_size, self.ed_hidden_size, bias=True) + ) + self.s_ed_proj_out = ( + nn.Identity() + if self.ed_hidden_size == self.hidden_size + else nn.Linear(self.ed_hidden_size, self.hidden_size, bias=True) + ) + self.s_ed_cond_proj = ( + nn.Identity() + if self.ed_hidden_size == self.hidden_size + else nn.Linear(self.hidden_size, self.ed_hidden_size, bias=True) + ) + self.s_ed_in_norm = RMSNorm(self.ed_hidden_size, eps=1e-6) + self.s_ed_out_norm = RMSNorm(self.hidden_size, eps=1e-6) + self.encoder_ed = _EncoderED( + hidden_size=self.ed_hidden_size, + num_stages=num_stages, + depth_per_stage=self.ed_depth_per_stage, + num_heads=self.ed_num_heads, + window_size=self.ed_window_size, + use_attn_token_shuffle=self.ed_use_token_shuffle, + rope_mode=self.rope_mode, + rope_ref_grid_h=self.rope_ref_grid_h, + rope_ref_grid_w=self.rope_ref_grid_w, + ) + self.decoder_ed = _DecoderED( + hidden_size=self.ed_hidden_size, + num_stages=num_stages, + depth_per_stage=self.ed_depth_per_stage, + num_heads=self.ed_num_heads, + window_size=self.ed_window_size, + use_attn_token_shuffle=self.ed_use_token_shuffle, + rope_mode=self.rope_mode, + rope_ref_grid_h=self.rope_ref_grid_h, + rope_ref_grid_w=self.rope_ref_grid_w, + ) + + self.initialize_weights() + + # Context-parallel state — set by `enable_context_parallel`. The base + # class does not split tokens itself; subclasses (e.g. PidNet) + # are responsible for splitting along L in `forward` and gathering + # before the final fold. This attribute is propagated to every patch + # block (joint MMDiT attention) and pixel block (RotaryAttention). 
def fetch_pos(self, height, width, device):
    """Return the 2D rotary-position table for a ``height`` x ``width`` token grid.

    Tables are memoised in ``self.precompute_pos`` keyed by ``(height, width)``.
    A cache hit is moved to ``device`` on the way out (the cached entry itself is
    left untouched); a miss is computed, moved to ``device``, stored and returned.

    Args:
        height: Number of token rows in the grid.
        width: Number of token columns in the grid.
        device: Device the returned table must live on.

    Returns:
        The table produced by ``precompute_freqs_cis_2d_ntk`` when
        ``self.rope_mode == "ntk_aware"``, otherwise by ``precompute_freqs_cis_2d``.
    """
    grid = (height, width)
    cached = self.precompute_pos.get(grid)
    if cached is not None:
        return cached.to(device)

    # Per-head channel count: the rotary table is built per attention head.
    per_head_dim = self.hidden_size // self.num_groups
    if self.rope_mode != "ntk_aware":
        table = precompute_freqs_cis_2d(per_head_dim, height, width)
    else:
        table = precompute_freqs_cis_2d_ntk(
            per_head_dim, height, width, self.rope_ref_grid_h, self.rope_ref_grid_w
        )
    table = table.to(device)
    self.precompute_pos[grid] = table
    return table
def initialize_weights(self):
    """Apply the custom initialisation to the embedders and the output head.

    * ``s_embedder.proj``: Xavier-uniform on the weight viewed as ``[out, -1]``
      (i.e. treated like an ``nn.Linear`` weight), bias set to zero.
    * ``t_embedder.mlp[0]`` and ``t_embedder.mlp[2]``: weights drawn from a
      normal distribution with ``std=0.02``.
    * ``final_layer.linear``: weight and bias zeroed.

    The random draws happen in exactly this order (patch embedder, then the two
    timestep-MLP layers), so results are reproducible under a fixed seed.
    """
    # Patch-token embedder, initialised like a plain linear layer.
    patch_proj = self.s_embedder.proj
    patch_weight = patch_proj.weight.data
    nn.init.xavier_uniform_(patch_weight.view([patch_weight.shape[0], -1]))
    nn.init.constant_(patch_proj.bias, 0)

    # Timestep-embedding MLP: entries 0 and 2 of the MLP container.
    for layer_index in (0, 2):
        nn.init.normal_(self.t_embedder.mlp[layer_index].weight, std=0.02)

    # Zero-initialise the final projection.
    head = self.final_layer.linear
    for tensor in (head.weight, head.bias):
        nn.init.zeros_(tensor)
while m.dim() > 2 and m.size(1) == 1: + m = m.squeeze(1) + if m.dim() == 3 and m.size(1) == 1: + m = m.squeeze(1) + if m.dim() == 2: + pad = m == 0 + + if s is None: + s0 = self.s_embedder(x_patches) + self.last_repa_tokens = None + if self.use_ed and self.encoder_ed is not None and self.decoder_ed is not None: + H_tokens, W_tokens = Hs, Ws + s_ed = s0 if self.s_ed_proj_in is None else self.s_ed_proj_in(s0) + if self.s_ed_in_norm is not None: + s_ed = self.s_ed_in_norm(s_ed) + c_ed = condition if self.s_ed_cond_proj is None else self.s_ed_cond_proj(condition) + bottleneck, skip_tokens, Hb, Wb = self.encoder_ed(s_ed, H_tokens, W_tokens, c_ed) + pos_b = self.fetch_pos(Hb, Wb, x.device) + s_main = bottleneck if self.s_ed_proj_out is None else self.s_ed_proj_out(bottleneck) + if self.s_ed_out_norm is not None: + s_main = self.s_ed_out_norm(s_main) + s_main = torch.nn.functional.silu(t_emb + s_main) + + attn_mask_joint = None + if pad is not None: + L_img_curr = s_main.shape[1] + pad_img = torch.zeros((B, L_img_curr), dtype=torch.bool, device=x.device) + pad_txt = ( + pad[:, :Ltxt] + if pad.size(1) >= Ltxt + else torch.nn.functional.pad(pad, (0, Ltxt - pad.size(1)), value=True) + ) + attn_mask_joint = torch.cat([pad_txt, pad_img], dim=1).view(B, 1, 1, Ltxt + L_img_curr) + + for i in range(self.patch_depth): + s_main, y_emb = self.patch_blocks[i](s_main, y_emb, condition, pos_b, pos_txt, attn_mask_joint) + if 0 < self.repa_encoder_index == (i + 1): + self.last_repa_tokens = s_main + s_bottleneck2 = s_main if self.s_ed_proj_in is None else self.s_ed_proj_in(s_main) + if self.s_ed_in_norm is not None: + s_bottleneck2 = self.s_ed_in_norm(s_bottleneck2) + decoded, _, _ = self.decoder_ed(s_bottleneck2, Hb, Wb, skip_tokens, c_ed) + s = decoded if self.s_ed_proj_out is None else self.s_ed_proj_out(decoded) + if self.s_ed_out_norm is not None: + s = self.s_ed_out_norm(s) + s = torch.nn.functional.silu(t_emb + s) + else: + s_main = s0 + attn_mask_joint = None + if pad is not 
None: + L_img_curr = s_main.shape[1] + pad_img = torch.zeros((B, L_img_curr), dtype=torch.bool, device=x.device) + pad_txt = ( + pad[:, :Ltxt] + if pad.size(1) >= Ltxt + else torch.nn.functional.pad(pad, (0, Ltxt - pad.size(1)), value=True) + ) + attn_mask_joint = torch.cat([pad_txt, pad_img], dim=1).view(B, 1, 1, Ltxt + L_img_curr) + + for i in range(self.patch_depth): + s_main, y_emb = self.patch_blocks[i](s_main, y_emb, condition, pos, pos_txt, attn_mask_joint) + if 0 < self.repa_encoder_index == (i + 1): + self.last_repa_tokens = s_main + s = torch.nn.functional.silu(t_emb + s_main) + # If no valid tap index is specified, expose last conditional output + if not (0 < self.repa_encoder_index <= self.patch_depth): + self.last_repa_tokens = s + + # Ensure the patch token length matches the spatial grid L + batch_size, length, _ = s.shape + if length != L: + if length > L: + s = s[:, :L, :] + else: + pad_len = L - length + s = torch.cat([s, s.new_zeros(B, pad_len, s.shape[2])], dim=1) + length = L + + # Pixel pathway + s_cond = s.view(B * L, self.hidden_size) + x_pixels = self.pixel_embedder(x, img_height=H, img_width=W, patch_size=self.patch_size) + for blk in self.pixel_blocks: + x_pixels = blk(x_pixels, s_cond, H, W, self.patch_size, mask) + + # Project back to image and fold + x_pixels = self.final_layer(x_pixels) # [B*L, P2, C] + C_out = self.out_channels + P2 = self.patch_size * self.patch_size + x_pixels = x_pixels.view(B, L, P2, C_out).permute(0, 3, 2, 1).contiguous() + x_pixels = x_pixels.view(B, C_out * P2, L) + x_img = torch.nn.functional.fold(x_pixels, (H, W), kernel_size=self.patch_size, stride=self.patch_size) + return x_img diff --git a/invokeai/backend/pid/_src/utils/__init__.py b/invokeai/backend/pid/_src/utils/__init__.py new file mode 100644 index 00000000000..e69de29bb2d diff --git a/invokeai/backend/pid/_src/utils/context_parallel.py b/invokeai/backend/pid/_src/utils/context_parallel.py new file mode 100644 index 00000000000..a89c4fe4aa4 --- 
def split_inputs_cp(x: Tensor, seq_dim: int, cp_group: ProcessGroup) -> Tensor:
    """
    Split input tensor along the sequence dimension for context parallelism.

    This function divides the input tensor into equal parts along the specified
    sequence dimension, based on the number of ranks in the context parallelism group.
    It then selects the part corresponding to the current rank.

    Args:
        x: Input tensor to be split.
        seq_dim: The dimension along which to split the input (sequence dimension).
            Negative values index from the end, as with regular tensor dims.
        cp_group: The process group for context parallelism.

    Returns:
        A slice of the input tensor corresponding to the current rank.

    Raises:
        IndexError: If ``seq_dim`` is out of range for ``x``.
        AssertionError: If the sequence dimension is not divisible by the number of ranks.
    """
    cp_size = len(get_process_group_ranks(cp_group))

    # Normalise negative dims first: the shape slicing below (`shape[:seq_dim]`,
    # `shape[seq_dim + 1:]`) is only correct for a non-negative index.
    ndim = x.dim()
    if not -ndim <= seq_dim < ndim:
        raise IndexError(f"seq_dim {seq_dim} is out of range for a tensor with {ndim} dimensions")
    seq_dim = seq_dim % ndim

    seq_len = x.shape[seq_dim]
    assert seq_len % cp_size == 0, f"{seq_len} cannot divide cp_size {cp_size}"

    # Expose the rank axis: [..., seq, ...] -> [..., cp_size, seq // cp_size, ...].
    x = x.view(*x.shape[:seq_dim], cp_size, seq_len // cp_size, *x.shape[(seq_dim + 1) :])
    # Keep only this rank's chunk (the rank axis now has size 1).
    seq_idx = torch.tensor([cp_group.rank()], device=x.device)
    x = x.index_select(seq_dim, seq_idx)
    # Note that the new sequence length is the original sequence length / cp_size
    x = x.view(*x.shape[:seq_dim], -1, *x.shape[(seq_dim + 2) :])
    return x
+ + This function gathers tensors from all ranks in the checkpoint parallelism group + and concatenates them along the specified sequence dimension. + + It retains computational graph locally for each rank by replacing the portion of the tensor with original output. + + Args: + x: Input tensor to be concatenated. + seq_dim: The dimension along which to concatenate the tensors (sequence dimension). + cp_group: The process group for checkpoint parallelism. + + Returns: + A tensor that is the concatenation of tensors from all ranks in the cp_group. + + Raises: + RuntimeError: If the gather operation fails. + """ + # Get the world size (number of processes in the group) + cp_size = cp_group.size() + assert cp_size > 0, "cp_size should be greater than 0" + + # Create a list to store tensors from all ranks + gathered_tensors = [torch.zeros_like(x) for _ in range(cp_size)] + + # Gather tensors from all ranks + try: + all_gather(gathered_tensors, x, group=cp_group) + except RuntimeError as e: + raise RuntimeError(f"Failed to gather tensors: {e}") + + rank = cp_group.rank() + gathered_tensors[rank] = x + # Concatenate the gathered tensors along the specified dimension + return torch.cat(gathered_tensors, dim=seq_dim) + + +def robust_broadcast(tensor: torch.Tensor, src: int, pg: ProcessGroup, is_check_shape: bool = False) -> torch.Tensor: + """ + Perform a robust broadcast operation that works regardless of tensor shapes on different ranks. + + Args: + tensor (torch.Tensor): The tensor to broadcast (on src rank) or receive (on other ranks). + src (int): The source rank for the broadcast. Defaults to 0. + + Returns: + torch.Tensor: The broadcasted tensor on all ranks. 
def broadcast_split_tensor(
    tensor: torch.Tensor,
    seq_dim: int,
    process_group: Optional[ProcessGroup] = None,
) -> torch.Tensor:
    """
    Broadcast the tensor from the minimum rank in the group, then keep this rank's slice.

    The tensor is first broadcast with ``robust_broadcast`` from the lowest rank in
    ``process_group`` and then split along ``seq_dim`` with ``split_inputs_cp``.

    Args:
        tensor: Tensor to broadcast and split. ``None`` is passed through unchanged.
        seq_dim: Dimension along which the broadcast tensor is split.
        process_group: Group to broadcast / split over. When ``None`` there is no
            group to operate on and ``tensor`` is returned unchanged, matching how
            ``broadcast`` treats a missing group.

    Returns:
        This rank's slice of the broadcast tensor, or the input itself when either
        ``tensor`` or ``process_group`` is ``None``.
    """
    # Without a group the split below cannot determine a rank, so treat the call as a no-op.
    if tensor is None or process_group is None:
        return tensor
    min_rank = min(get_process_group_ranks(process_group))
    tensor = robust_broadcast(tensor, min_rank, process_group)
    return split_inputs_cp(tensor, seq_dim, process_group)
+""" + +from __future__ import annotations + +from contextlib import nullcontext +from dataclasses import dataclass, field +from typing import Optional + +import torch +from torch import Tensor + +from invokeai.backend.model_manager.taxonomy import BaseModelType +from invokeai.backend.pid._src.networks.pid_net import PidNet + +# --------------------------------------------------------------------------- +# Network hyperparameters per backbone +# --------------------------------------------------------------------------- + +# `pid_sr4x` base config (defaults/model_pid.py upstream) plus the shared +# `_common_model_overrides` net dict (experiment/shared_config.py upstream). +_PID_SR4X_BASE: dict = { + # T2I backbone (PixDiT_T2I args) + "in_channels": 3, + "num_groups": 24, + "hidden_size": 1536, + "pixel_hidden_size": 16, + "pixel_attn_hidden_size": 1152, + "pixel_num_groups": 16, + "patch_depth": 14, + "pixel_depth": 2, + "patch_size": 16, + "txt_embed_dim": 2304, # Gemma-2-2b-it hidden size + "txt_max_length": 300, + "use_text_rope": True, + "text_rope_theta": 10000.0, + "rope_mode": "ntk_aware", + "rope_ref_h": 1024, + "rope_ref_w": 1024, + "repa_encoder_index": -1, # REPA disabled at inference + # SR / LQ branch + "lq_inject_mode": "controlnet", + "lq_in_channels": 0, + "lq_hidden_dim": 512, + "lq_gate_type": "sigma_aware_per_token_per_dim", + "lq_interval": 2, # overridden by shared_config + "zero_init_lq": True, + "train_lq_proj_only": False, + "sr_scale": 4, + "pit_lq_inject": False, + "pit_lq_gate_type": "sigma_aware_per_token_per_dim", +} + +# Per-backbone net deltas (mirrors upstream experiment/{name}.py). 
+_PER_BACKBONE: dict[BaseModelType, dict] = { + BaseModelType.Flux: { + "lq_latent_channels": 16, + "latent_spatial_down_factor": 8, + }, + BaseModelType.Flux2: { + "lq_latent_channels": 128, + "latent_spatial_down_factor": 16, + }, + BaseModelType.StableDiffusion3: { + "lq_latent_channels": 16, + "latent_spatial_down_factor": 8, + }, + BaseModelType.StableDiffusionXL: { + "lq_latent_channels": 4, + "latent_spatial_down_factor": 8, + }, + BaseModelType.QwenImage: { + "lq_latent_channels": 16, + "latent_spatial_down_factor": 8, + }, +} + +# Distilled-student schedule (`student_t_list` from shared_config). +_STUDENT_T_LIST: list[float] = [0.999, 0.866, 0.634, 0.342, 0.0] + +# Flow-matching timescale that maps the [0,1] schedule to the network's +# expected timestep range. +_FM_TIMESCALE: float = 1000.0 + +# Caption pre-processing constants from PiD's `shared_config.py`. The model +# was trained with these strings prepended; using anything else degrades +# quality. See `_encode_text_raw` in the upstream pixeldit_model.py. +PID_CHI_PROMPT: str = "\n".join( + [ + 'Given a user prompt, generate an "Enhanced prompt" that provides detailed visual descriptions suitable for image generation. 
Evaluate the level of detail in the user prompt:', + "- If the prompt is simple, focus on adding specifics about colors, shapes, sizes, textures, and spatial relationships to create vivid and concrete scenes.", + "- If the prompt is already detailed, refine and enhance the existing details slightly without overcomplicating.", + "Here are examples of how to transform or refine prompts:", + "- User Prompt: A cat sleeping -> Enhanced: A small, fluffy white cat curled up in a round shape, sleeping peacefully on a warm sunny windowsill, surrounded by pots of blooming red flowers.", + "- User Prompt: A busy city street -> Enhanced: A bustling city street scene at dusk, featuring glowing street lamps, a diverse crowd of people in colorful clothing, and a double-decker bus passing by towering glass skyscrapers.", + "Please generate only the enhanced description for the prompt below and avoid including any additional commentary or evaluations:", + "User Prompt: ", + ] +) +PID_NEGATIVE_PROMPT: str = ( + "low quality, worst quality, over-saturated, three legs, six fingers, cartoon, anime, " + "cgi, low res, blurry, deformed, distortion, duplicated limbs, plastic skin, jpeg artifacts, " + "watermark" +) +PID_MODEL_MAX_LENGTH: int = 300 + + +# Working-memory (activation) estimate for the PiD decode, mirroring `estimate_vae_working_memory_*` (see #8414). +# PiD runs a multi-step pixel-diffusion in float32 at the full super-resolved output resolution, so its peak +# activation memory scales with the OUTPUT pixel count. +# +# This is ONLY the activation headroom reserved for the decode itself - it does NOT do the heavy lifting of +# evicting the main transformer/encoders (the nodes call context.models.offload_all_from_vram() for that before +# loading PidNet). It must therefore stay modest: the cache uses max(this_estimate, device_working_mem_gb=3GB), +# and an over-large value pushes the working set negative and forces PidNet to partial-load onto the CPU (slow). 
def estimate_pid_decode_working_memory(latent: Tensor, backbone: BaseModelType) -> int:
    """Return an activation-memory estimate, in bytes, for PiD-decoding *latent*.

    Each output side is the latent side multiplied by ``sr_scale`` (from
    ``_PID_SR4X_BASE``) and the backbone's ``latent_spatial_down_factor``. The
    estimate is ``out_h * out_w * 4 bytes * scaling constant``, where 4 bytes is
    the float32 element size and the constant is
    ``_PID_DECODE_WORKING_MEMORY_SCALING_CONSTANT``.

    A backbone that has no entry in ``_PER_BACKBONE`` yields ``0`` so callers can
    fall back to the cache's default working-memory reservation.
    """
    if backbone not in _PER_BACKBONE:
        return 0

    # Combined latent -> output-pixel upscale factor per side.
    upscale = int(_PID_SR4X_BASE["sr_scale"]) * int(_PER_BACKBONE[backbone]["latent_spatial_down_factor"])
    output_height = int(latent.shape[-2]) * upscale
    output_width = int(latent.shape[-1]) * upscale

    bytes_per_element = 4  # float32
    output_pixels = output_height * output_width
    return int(output_pixels * bytes_per_element * _PID_DECODE_WORKING_MEMORY_SCALING_CONSTANT)
The caller still owns dtype/device placement of the + returned net. + """ + net = build_pid_net(backbone) + # strict=False keeps parity with the upstream loader: missing LQ-projection + # keys are tolerated when reloading PixDiT_T2I weights into PidNet, and + # extra keys (e.g. legacy EMA artefacts) are dropped. + missing, unexpected = net.load_state_dict(state_dict, strict=False) + if unexpected: + raise RuntimeError( + f"PiD checkpoint has unexpected keys not present in PidNet: {unexpected[:5]}" + + (f" (+ {len(unexpected) - 5} more)" if len(unexpected) > 5 else "") + ) + if missing: + # We tolerate missing `lq_proj.*` (e.g. if the user accidentally + # passed a vanilla PixDiT_T2I checkpoint), but anything else points + # to a real architecture mismatch. + non_lq = [k for k in missing if "lq_proj" not in k] + if non_lq: + raise RuntimeError( + f"PiD checkpoint is missing non-LQ keys required by PidNet: {non_lq[:5]}" + + (f" (+ {len(non_lq) - 5} more)" if len(non_lq) > 5 else "") + ) + return net + + +# --------------------------------------------------------------------------- +# Sampling +# --------------------------------------------------------------------------- + + +def _get_t_list(device: torch.device, *, num_steps: Optional[int] = None) -> Tensor: + """Distill-student sigma schedule. + + When *num_steps* differs from the trained 4 steps, linearly sub-sample + the canonical 5-point list (mirrors `PidDistillModel._get_t_list`). 
+ """ + full = torch.tensor(_STUDENT_T_LIST, device=device, dtype=torch.float32) + if num_steps is None or num_steps == 4: + t = full + else: + idx = torch.linspace(0, len(full) - 1, num_steps + 1).round().long() + t = full[idx] + assert abs(t[-1].item()) < 1e-6, "t_list must end at 0" + return t + + +def _velocity_to_x0(x_t: Tensor, net_output: Tensor, t: Tensor) -> Tensor: + """Convert the network's velocity prediction back to x0 at time *t*.""" + s = [x_t.shape[0]] + [1] * (x_t.ndim - 1) + t_shaped = t.double().view(*s) + return (x_t.double() - t_shaped * net_output.double()).to(x_t.dtype) + + +@torch.no_grad() +def _student_sample_loop( + net: PidNet, + *, + noise: Tensor, + t_list: Tensor, + caption_embs: Tensor, + caption_mask: Optional[Tensor], + lq_latent: Optional[Tensor], + degrade_sigma: Tensor, + sample_type: str = "sde", + autocast_dtype: Optional[torch.dtype] = None, + generator: Optional[torch.Generator] = None, +) -> Tensor: + """Few-step distilled sampler. + + Mirrors `PidDistillModel._student_sample_loop` — the only mode supported + here is "sde" (the default for the released res2k_sr4x checkpoints). + + ``autocast_dtype`` mirrors PiD's training-time precision config (bf16): + the parameters can stay in float32 but cosines / RoPE tensors created + inside the forward must be cast on the fly. Set to ``None`` to disable. + """ + batch_size = noise.shape[0] + x = noise + autocast_ctx = ( + torch.autocast(noise.device.type, dtype=autocast_dtype) + if autocast_dtype is not None and noise.device.type == "cuda" + else nullcontext() + ) + for t_cur, t_next in zip(t_list[:-1], t_list[1:], strict=True): + t_cur_batch = t_cur.expand(batch_size) + with autocast_ctx: + # Do not pass the caption mask through here: upstream PiD's + # PidDistillModel sampler omits it too, and PidNet forwards the + # same `mask` argument unchanged to its pixel blocks where the + # shape (B, T_text) is incompatible with the patch-token K + # dimension that block expects. 
class PiDDecoder:
    """High-level decoder that hides PidNet construction and sampling.

    Usage::

        net = load_pid_decoder(state_dict, backbone)
        net = net.to(device=..., dtype=...)
        decoder = PiDDecoder(net, backbone=BaseModelType.Flux)
        image = decoder.decode(latent=..., caption_embs=...)
    """

    def __init__(self, net: PidNet, backbone: BaseModelType) -> None:
        """Wrap *net* for decoding latents of *backbone*.

        Raises:
            ValueError: If *backbone* has no entry in ``_PER_BACKBONE``.
        """
        if backbone not in _PER_BACKBONE:
            raise ValueError(f"Unsupported PiD backbone: {backbone!r}")
        self.net = net
        self.backbone = backbone

    @property
    def sr_scale(self) -> int:
        """Super-resolution factor, read from the wrapped network."""
        return int(self.net.sr_scale)

    @property
    def latent_spatial_down_factor(self) -> int:
        """Latent-to-pixel downsampling factor registered for this backbone."""
        return int(_PER_BACKBONE[self.backbone]["latent_spatial_down_factor"])

    @staticmethod
    def _batch_degrade_sigma(sigma, batch_size: int, device: torch.device) -> Tensor:
        """Normalise ``degrade_sigma`` to a float32 tensor of shape ``[batch_size]``."""
        if isinstance(sigma, Tensor):
            sigma_t = sigma.to(device=device, dtype=torch.float32).reshape(-1)
        elif isinstance(sigma, (list, tuple)):
            sigma_t = torch.tensor(sigma, device=device, dtype=torch.float32)
        else:
            return torch.full((batch_size,), float(sigma), device=device, dtype=torch.float32)

        # A single value applies to every sample, whether it arrived as a tensor
        # or as a one-element list/tuple.
        if sigma_t.numel() == 1:
            sigma_t = sigma_t.reshape(-1).expand(batch_size).contiguous()
        if sigma_t.shape != (batch_size,):
            raise ValueError(
                f"degrade_sigma must broadcast to [B={batch_size}], got shape {tuple(sigma_t.shape)}"
            )
        return sigma_t

    @torch.no_grad()
    def decode(
        self,
        *,
        latent: Tensor,
        caption_embs: Tensor,
        caption_mask: Optional[Tensor] = None,
        config: Optional[PiDDecodeConfig] = None,
    ) -> Tensor:
        """Decode *latent* + *caption_embs* into a pixel tensor in [-1, 1].

        Args:
            latent: ``[B, C_lat, H_lat, W_lat]`` LQ latent (the LDM's x0
                output, scaled per the backbone's VAE convention).
            caption_embs: ``[B, T, 2304]`` Gemma-2-2b-it caption embeddings
                (output of `_encode_text_raw` upstream — InvokeAI callers
                produce this via `Gemma2EncoderLoader`).
            caption_mask: optional caption attention mask; it is moved to the
                latent's device and handed to `_student_sample_loop`.
            config: per-call sampling overrides; defaults to the released
                `res2k_sr4x_*_distill_4step` preset. Only ``num_inference_steps``,
                ``sample_type``, ``degrade_sigma`` and ``seed`` are read here;
                ``scale`` and ``student_t_list`` are not consulted by this method.

        Returns:
            ``[B, 3, H_lat * sr_scale * latent_spatial_down_factor,
            W_lat * sr_scale * latent_spatial_down_factor]`` in [-1, 1].

        Raises:
            ValueError: If ``config.degrade_sigma`` cannot be broadcast to ``[B]``.
        """
        cfg = config if config is not None else PiDDecodeConfig()
        device = latent.device
        dtype = next(self.net.parameters()).dtype
        # On CUDA, always run the forward pass under bf16 autocast: matmuls and
        # convolutions execute in bf16 (fast + small activations), while
        # numerically sensitive reductions like RMSNorm stay in the parameter
        # dtype. PidNet is intentionally loaded in fp32 (see the loader) so
        # those reductions actually keep their precision.
        autocast_dtype = torch.bfloat16 if device.type == "cuda" else None
        batch_size = latent.shape[0]

        # Spatial size of the noise tensor — the decoder operates in pixel
        # space at sr_scale * latent_spatial_down_factor times the latent.
        total_up = self.sr_scale * self.latent_spatial_down_factor
        img_h = int(latent.shape[-2] * total_up)
        img_w = int(latent.shape[-1] * total_up)

        # One seeded generator drives both the initial noise and the sampler's
        # per-step noise, so a given seed reproduces the whole decode.
        gen = torch.Generator(device=device).manual_seed(int(cfg.seed))
        noise = torch.randn(batch_size, 3, img_h, img_w, device=device, generator=gen, dtype=dtype)

        degrade_sigma_t = self._batch_degrade_sigma(cfg.degrade_sigma, batch_size, device)

        caption_embs = caption_embs.to(device=device, dtype=dtype)
        if caption_mask is not None:
            caption_mask = caption_mask.to(device=device)
        lq_latent = latent.to(device=device, dtype=dtype)

        t_list = _get_t_list(device, num_steps=cfg.num_inference_steps)

        self.net.eval()
        x0 = _student_sample_loop(
            self.net,
            noise=noise,
            t_list=t_list,
            caption_embs=caption_embs,
            caption_mask=caption_mask,
            lq_latent=lq_latent,
            degrade_sigma=degrade_sigma_t,
            sample_type=cfg.sample_type,
            autocast_dtype=autocast_dtype,
            generator=gen,
        )
        return x0.clamp(-1, 1)
dtype: torch.dtype = torch.bfloat16, + chi_prompt: str = PID_CHI_PROMPT, + model_max_length: int = PID_MODEL_MAX_LENGTH, +) -> tuple[Tensor, Tensor]: + """Mirror of `PixelDiTModel._encode_text_raw`. + + Prepends the chi-prompt, tokenises with right-padding, runs Gemma's + `model` (the transformer stack without the LM head), and selects + ``[CLS] + last (model_max_length - 1)`` tokens to yield a fixed + ``[B, model_max_length, 2304]`` embedding plus the matching attention + mask. The mask is critical: PidNet's joint attention zeros padded text + tokens out via this mask. Without it the decoder treats all ~300 slots + (including the padding) as valid caption tokens and produces a + washed-out average image. + """ + if not captions: + raise ValueError("encode_caption_for_pid requires at least one caption.") + n_chi_tokens = len(tokenizer.encode(chi_prompt)) if chi_prompt else 0 + prompts = [chi_prompt + c for c in captions] + max_len = (n_chi_tokens + model_max_length - 2) if chi_prompt else model_max_length + # PiD was trained with right-padding (see PixelDiTModel._load_text_encoder + # upstream). Gemma2's tokenizer defaults to "left" which would push the + # BOS token away from index 0 and shove pads into the slice the decoder + # consumes — yielding a garbled caption embedding. We toggle the value + # for the duration of this call and restore it afterwards so we don't + # poison the shared cached tokenizer. 
+ old_padding_side = getattr(tokenizer, "padding_side", "right") + try: + tokenizer.padding_side = "right" + toks = tokenizer( + prompts, + max_length=max_len, + padding="max_length", + truncation=True, + return_tensors="pt", + ).to(device) + finally: + tokenizer.padding_side = old_padding_side + hidden = encoder(toks.input_ids, toks.attention_mask)[0] + select_idx = [0] + list(range(-(model_max_length - 1), 0)) + caption_embs = hidden[:, select_idx].to(dtype=dtype) + # Cast to bool: HF tokenizers emit attention_mask as int64, but PidNet's + # SDPA call (scaled_dot_product_attention) refuses any int dtype — it + # requires bool or matching float. Bool also matches the upstream + # `pad = mask == 0` reduction in pid_net.py. + caption_mask = toks.attention_mask[:, select_idx].to(torch.bool) + return caption_embs, caption_mask + + +__all__ = [ + "PID_CHI_PROMPT", + "PID_MODEL_MAX_LENGTH", + "PID_NEGATIVE_PROMPT", + "PiDDecodeConfig", + "PiDDecoder", + "build_pid_net", + "encode_caption_for_pid", + "load_pid_decoder", +] diff --git a/invokeai/frontend/web/openapi.json b/invokeai/frontend/web/openapi.json index e822e8b260e..5621184e87e 100644 --- a/invokeai/frontend/web/openapi.json +++ b/invokeai/frontend/web/openapi.json @@ -876,6 +876,21 @@ { "$ref": "#/components/schemas/VAE_Diffusers_Flux2_Config" }, + { + "$ref": "#/components/schemas/PiDDecoder_Checkpoint_FLUX_Config" + }, + { + "$ref": "#/components/schemas/PiDDecoder_Checkpoint_Flux2_Config" + }, + { + "$ref": "#/components/schemas/PiDDecoder_Checkpoint_SD3_Config" + }, + { + "$ref": "#/components/schemas/PiDDecoder_Checkpoint_SDXL_Config" + }, + { + "$ref": "#/components/schemas/PiDDecoder_Checkpoint_QwenImage_Config" + }, { "$ref": "#/components/schemas/ControlNet_Checkpoint_SD1_Config" }, @@ -969,6 +984,9 @@ { "$ref": "#/components/schemas/Qwen3Encoder_GGUF_Config" }, + { + "$ref": "#/components/schemas/Gemma2Encoder_Gemma2Encoder_Config" + }, { "$ref": "#/components/schemas/QwenVLEncoder_Diffusers_Config" 
}, @@ -1197,6 +1215,21 @@ { "$ref": "#/components/schemas/VAE_Diffusers_Flux2_Config" }, + { + "$ref": "#/components/schemas/PiDDecoder_Checkpoint_FLUX_Config" + }, + { + "$ref": "#/components/schemas/PiDDecoder_Checkpoint_Flux2_Config" + }, + { + "$ref": "#/components/schemas/PiDDecoder_Checkpoint_SD3_Config" + }, + { + "$ref": "#/components/schemas/PiDDecoder_Checkpoint_SDXL_Config" + }, + { + "$ref": "#/components/schemas/PiDDecoder_Checkpoint_QwenImage_Config" + }, { "$ref": "#/components/schemas/ControlNet_Checkpoint_SD1_Config" }, @@ -1290,6 +1323,9 @@ { "$ref": "#/components/schemas/Qwen3Encoder_GGUF_Config" }, + { + "$ref": "#/components/schemas/Gemma2Encoder_Gemma2Encoder_Config" + }, { "$ref": "#/components/schemas/QwenVLEncoder_Diffusers_Config" }, @@ -1518,6 +1554,21 @@ { "$ref": "#/components/schemas/VAE_Diffusers_Flux2_Config" }, + { + "$ref": "#/components/schemas/PiDDecoder_Checkpoint_FLUX_Config" + }, + { + "$ref": "#/components/schemas/PiDDecoder_Checkpoint_Flux2_Config" + }, + { + "$ref": "#/components/schemas/PiDDecoder_Checkpoint_SD3_Config" + }, + { + "$ref": "#/components/schemas/PiDDecoder_Checkpoint_SDXL_Config" + }, + { + "$ref": "#/components/schemas/PiDDecoder_Checkpoint_QwenImage_Config" + }, { "$ref": "#/components/schemas/ControlNet_Checkpoint_SD1_Config" }, @@ -1611,6 +1662,9 @@ { "$ref": "#/components/schemas/Qwen3Encoder_GGUF_Config" }, + { + "$ref": "#/components/schemas/Gemma2Encoder_Gemma2Encoder_Config" + }, { "$ref": "#/components/schemas/QwenVLEncoder_Diffusers_Config" }, @@ -1889,6 +1943,21 @@ { "$ref": "#/components/schemas/VAE_Diffusers_Flux2_Config" }, + { + "$ref": "#/components/schemas/PiDDecoder_Checkpoint_FLUX_Config" + }, + { + "$ref": "#/components/schemas/PiDDecoder_Checkpoint_Flux2_Config" + }, + { + "$ref": "#/components/schemas/PiDDecoder_Checkpoint_SD3_Config" + }, + { + "$ref": "#/components/schemas/PiDDecoder_Checkpoint_SDXL_Config" + }, + { + "$ref": 
"#/components/schemas/PiDDecoder_Checkpoint_QwenImage_Config" + }, { "$ref": "#/components/schemas/ControlNet_Checkpoint_SD1_Config" }, @@ -1982,6 +2051,9 @@ { "$ref": "#/components/schemas/Qwen3Encoder_GGUF_Config" }, + { + "$ref": "#/components/schemas/Gemma2Encoder_Gemma2Encoder_Config" + }, { "$ref": "#/components/schemas/QwenVLEncoder_Diffusers_Config" }, @@ -2284,6 +2356,21 @@ { "$ref": "#/components/schemas/VAE_Diffusers_Flux2_Config" }, + { + "$ref": "#/components/schemas/PiDDecoder_Checkpoint_FLUX_Config" + }, + { + "$ref": "#/components/schemas/PiDDecoder_Checkpoint_Flux2_Config" + }, + { + "$ref": "#/components/schemas/PiDDecoder_Checkpoint_SD3_Config" + }, + { + "$ref": "#/components/schemas/PiDDecoder_Checkpoint_SDXL_Config" + }, + { + "$ref": "#/components/schemas/PiDDecoder_Checkpoint_QwenImage_Config" + }, { "$ref": "#/components/schemas/ControlNet_Checkpoint_SD1_Config" }, @@ -2377,6 +2464,9 @@ { "$ref": "#/components/schemas/Qwen3Encoder_GGUF_Config" }, + { + "$ref": "#/components/schemas/Gemma2Encoder_Gemma2Encoder_Config" + }, { "$ref": "#/components/schemas/QwenVLEncoder_Diffusers_Config" }, @@ -3499,6 +3589,21 @@ { "$ref": "#/components/schemas/VAE_Diffusers_Flux2_Config" }, + { + "$ref": "#/components/schemas/PiDDecoder_Checkpoint_FLUX_Config" + }, + { + "$ref": "#/components/schemas/PiDDecoder_Checkpoint_Flux2_Config" + }, + { + "$ref": "#/components/schemas/PiDDecoder_Checkpoint_SD3_Config" + }, + { + "$ref": "#/components/schemas/PiDDecoder_Checkpoint_SDXL_Config" + }, + { + "$ref": "#/components/schemas/PiDDecoder_Checkpoint_QwenImage_Config" + }, { "$ref": "#/components/schemas/ControlNet_Checkpoint_SD1_Config" }, @@ -3592,6 +3697,9 @@ { "$ref": "#/components/schemas/Qwen3Encoder_GGUF_Config" }, + { + "$ref": "#/components/schemas/Gemma2Encoder_Gemma2Encoder_Config" + }, { "$ref": "#/components/schemas/QwenVLEncoder_Diffusers_Config" }, @@ -11542,6 +11650,21 @@ { "$ref": "#/components/schemas/VAE_Diffusers_Flux2_Config" }, + { + "$ref": 
"#/components/schemas/PiDDecoder_Checkpoint_FLUX_Config" + }, + { + "$ref": "#/components/schemas/PiDDecoder_Checkpoint_Flux2_Config" + }, + { + "$ref": "#/components/schemas/PiDDecoder_Checkpoint_SD3_Config" + }, + { + "$ref": "#/components/schemas/PiDDecoder_Checkpoint_SDXL_Config" + }, + { + "$ref": "#/components/schemas/PiDDecoder_Checkpoint_QwenImage_Config" + }, { "$ref": "#/components/schemas/ControlNet_Checkpoint_SD1_Config" }, @@ -11635,6 +11758,9 @@ { "$ref": "#/components/schemas/Qwen3Encoder_GGUF_Config" }, + { + "$ref": "#/components/schemas/Gemma2Encoder_Gemma2Encoder_Config" + }, { "$ref": "#/components/schemas/QwenVLEncoder_Diffusers_Config" }, @@ -25118,11 +25244,11 @@ "$ref": "#/components/schemas/FluxConditioningOutput" } }, - "Flux2VaeDecodeInvocation": { + "Flux2PiDDecodeInvocation": { "category": "latents", "class": "invocation", "classification": "prototype", - "description": "Generates an image from latents using FLUX.2 Klein's 32-channel VAE.", + "description": "Decode a FLUX.2 Klein latent with the PiD pixel-diffusion decoder.\n\nProduces a 4x super-resolved image in a single pass. The stored FLUX.2 latent\nis patchified from ``(B, 32, H/8, W/8)`` to the ``(B, 128, H/16, W/16)`` layout\nPiD's FLUX.2 backbone expects, then decoded directly (it is already in raw,\nBN-denormalized space; see the module docstring).", "node_pack": "invokeai", "properties": { "board": { @@ -25196,83 +25322,54 @@ "input": "connection", "orig_required": true }, - "vae": { + "prompt": { "anyOf": [ { - "$ref": "#/components/schemas/VAEField" + "type": "string" }, { "type": "null" } ], "default": null, - "description": "VAE", + "description": "Text prompt the latent was generated from. 
PiD conditions on it.", "field_kind": "input", - "input": "connection", - "orig_required": true - }, - "type": { - "const": "flux2_vae_decode", - "default": "flux2_vae_decode", - "field_kind": "node_attribute", - "title": "type", - "type": "string" - } - }, - "required": ["type", "id"], - "tags": ["latents", "image", "vae", "l2i", "flux2", "klein"], - "title": "Latents to Image - FLUX2", - "type": "object", - "version": "1.0.0", - "output": { - "$ref": "#/components/schemas/ImageOutput" - } - }, - "Flux2VaeEncodeInvocation": { - "category": "latents", - "class": "invocation", - "classification": "prototype", - "description": "Encodes an image into latents using FLUX.2 Klein's 32-channel VAE.", - "node_pack": "invokeai", - "properties": { - "id": { - "description": "The id of this instance of an invocation. Must be unique among all instances of invocations.", - "field_kind": "node_attribute", - "title": "Id", - "type": "string" - }, - "is_intermediate": { - "default": false, - "description": "Whether or not this is an intermediate invocation.", - "field_kind": "node_attribute", - "input": "direct", + "input": "any", "orig_required": true, - "title": "Is Intermediate", - "type": "boolean", - "ui_hidden": false, - "ui_type": "IsIntermediate" + "title": "Prompt", + "ui_component": "textarea" }, - "use_cache": { - "default": true, - "description": "Whether or not to use the cache", - "field_kind": "node_attribute", - "title": "Use Cache", - "type": "boolean" + "gemma2_encoder": { + "anyOf": [ + { + "$ref": "#/components/schemas/Gemma2EncoderField" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Gemma-2 caption encoder. 
Required by PiD.", + "field_kind": "input", + "input": "connection", + "orig_required": true, + "title": "Gemma-2 Encoder" }, - "image": { + "pid_decoder": { "anyOf": [ { - "$ref": "#/components/schemas/ImageField" + "$ref": "#/components/schemas/PiDDecoderField" }, { "type": "null" } ], "default": null, - "description": "The image to encode.", + "description": "PiD FLUX.2 decoder checkpoint.", "field_kind": "input", - "input": "any", - "orig_required": true + "input": "connection", + "orig_required": true, + "title": "PiD Decoder" }, "vae": { "anyOf": [ @@ -25284,114 +25381,326 @@ } ], "default": null, - "description": "VAE", + "description": "FLUX.2 VAE, used only to read a scalar scaling_factor / shift_factor if one exists. FLUX.2 normalises latents with BatchNorm (already inverted in flux2_denoise), so this is normally an identity transform and the input can be left unconnected.", "field_kind": "input", "input": "connection", - "orig_required": true + "orig_default": null, + "orig_required": false, + "title": "VAE" + }, + "num_inference_steps": { + "default": 4, + "description": "Number of PiD distill steps. 
The released checkpoints are trained for 4.", + "field_kind": "input", + "input": "any", + "maximum": 8, + "minimum": 1, + "orig_default": 4, + "orig_required": false, + "title": "Num Inference Steps", + "type": "integer" + }, + "seed": { + "default": 0, + "description": "Seed for the PiD decoder's noise.", + "field_kind": "input", + "input": "any", + "orig_default": 0, + "orig_required": false, + "title": "Seed", + "type": "integer" }, "type": { - "const": "flux2_vae_encode", - "default": "flux2_vae_encode", + "const": "flux2_pid_decode", + "default": "flux2_pid_decode", "field_kind": "node_attribute", "title": "type", "type": "string" } }, "required": ["type", "id"], - "tags": ["latents", "image", "vae", "i2l", "flux2", "klein"], - "title": "Image to Latents - FLUX2", + "tags": ["latents", "image", "pid", "flux2", "klein", "upscale"], + "title": "Latents to Image - FLUX.2 + PiD (4x SR)", "type": "object", "version": "1.0.0", "output": { - "$ref": "#/components/schemas/LatentsOutput" + "$ref": "#/components/schemas/ImageOutput" } }, - "Flux2VariantType": { - "type": "string", - "enum": ["klein_4b", "klein_4b_base", "klein_9b", "klein_9b_base"], - "title": "Flux2VariantType", - "description": "FLUX.2 model variants." 
- }, - "FluxConditioningCollectionOutput": { - "class": "output", - "description": "Base class for nodes that output a collection of conditioning tensors", + "Flux2VaeDecodeInvocation": { + "category": "latents", + "class": "invocation", + "classification": "prototype", + "description": "Generates an image from latents using FLUX.2 Klein's 32-channel VAE.", + "node_pack": "invokeai", "properties": { - "collection": { - "description": "The output conditioning tensors", - "field_kind": "output", - "items": { - "$ref": "#/components/schemas/FluxConditioningField" - }, - "title": "Collection", - "type": "array", + "board": { + "anyOf": [ + { + "$ref": "#/components/schemas/BoardField" + }, + { + "type": "null" + } + ], + "default": null, + "description": "The board to save the image to", + "field_kind": "internal", + "input": "direct", + "orig_required": false, "ui_hidden": false }, - "type": { - "const": "flux_conditioning_collection_output", - "default": "flux_conditioning_collection_output", - "field_kind": "node_attribute", - "title": "type", - "type": "string" - } - }, - "required": ["output_meta", "collection", "type", "type"], - "title": "FluxConditioningCollectionOutput", - "type": "object" - }, - "FluxConditioningField": { - "description": "A conditioning tensor primitive value", - "properties": { - "conditioning_name": { - "description": "The name of conditioning tensor", - "title": "Conditioning Name", - "type": "string" - }, - "mask": { + "metadata": { "anyOf": [ { - "$ref": "#/components/schemas/TensorField" + "$ref": "#/components/schemas/MetadataField" }, { "type": "null" } ], "default": null, - "description": "The mask associated with this conditioning tensor. Excluded regions should be set to False, included regions should be set to True." 
- } - }, - "required": ["conditioning_name"], - "title": "FluxConditioningField", - "type": "object" - }, - "FluxConditioningOutput": { - "class": "output", - "description": "Base class for nodes that output a single conditioning tensor", - "properties": { - "conditioning": { - "$ref": "#/components/schemas/FluxConditioningField", - "description": "Conditioning tensor", - "field_kind": "output", + "description": "Optional metadata to be saved with the image", + "field_kind": "internal", + "input": "connection", + "orig_required": false, "ui_hidden": false }, - "type": { - "const": "flux_conditioning_output", - "default": "flux_conditioning_output", - "field_kind": "node_attribute", - "title": "type", - "type": "string" - } - }, - "required": ["output_meta", "conditioning", "type", "type"], - "title": "FluxConditioningOutput", - "type": "object" - }, - "FluxControlLoRALoaderInvocation": { - "category": "model", - "class": "invocation", - "classification": "stable", - "description": "LoRA model and Image to use with FLUX transformer generation.", - "node_pack": "invokeai", - "properties": { + "id": { + "description": "The id of this instance of an invocation. 
Must be unique among all instances of invocations.", + "field_kind": "node_attribute", + "title": "Id", + "type": "string" + }, + "is_intermediate": { + "default": false, + "description": "Whether or not this is an intermediate invocation.", + "field_kind": "node_attribute", + "input": "direct", + "orig_required": true, + "title": "Is Intermediate", + "type": "boolean", + "ui_hidden": false, + "ui_type": "IsIntermediate" + }, + "use_cache": { + "default": true, + "description": "Whether or not to use the cache", + "field_kind": "node_attribute", + "title": "Use Cache", + "type": "boolean" + }, + "latents": { + "anyOf": [ + { + "$ref": "#/components/schemas/LatentsField" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Latents tensor", + "field_kind": "input", + "input": "connection", + "orig_required": true + }, + "vae": { + "anyOf": [ + { + "$ref": "#/components/schemas/VAEField" + }, + { + "type": "null" + } + ], + "default": null, + "description": "VAE", + "field_kind": "input", + "input": "connection", + "orig_required": true + }, + "type": { + "const": "flux2_vae_decode", + "default": "flux2_vae_decode", + "field_kind": "node_attribute", + "title": "type", + "type": "string" + } + }, + "required": ["type", "id"], + "tags": ["latents", "image", "vae", "l2i", "flux2", "klein"], + "title": "Latents to Image - FLUX2", + "type": "object", + "version": "1.0.0", + "output": { + "$ref": "#/components/schemas/ImageOutput" + } + }, + "Flux2VaeEncodeInvocation": { + "category": "latents", + "class": "invocation", + "classification": "prototype", + "description": "Encodes an image into latents using FLUX.2 Klein's 32-channel VAE.", + "node_pack": "invokeai", + "properties": { + "id": { + "description": "The id of this instance of an invocation. 
Must be unique among all instances of invocations.", + "field_kind": "node_attribute", + "title": "Id", + "type": "string" + }, + "is_intermediate": { + "default": false, + "description": "Whether or not this is an intermediate invocation.", + "field_kind": "node_attribute", + "input": "direct", + "orig_required": true, + "title": "Is Intermediate", + "type": "boolean", + "ui_hidden": false, + "ui_type": "IsIntermediate" + }, + "use_cache": { + "default": true, + "description": "Whether or not to use the cache", + "field_kind": "node_attribute", + "title": "Use Cache", + "type": "boolean" + }, + "image": { + "anyOf": [ + { + "$ref": "#/components/schemas/ImageField" + }, + { + "type": "null" + } + ], + "default": null, + "description": "The image to encode.", + "field_kind": "input", + "input": "any", + "orig_required": true + }, + "vae": { + "anyOf": [ + { + "$ref": "#/components/schemas/VAEField" + }, + { + "type": "null" + } + ], + "default": null, + "description": "VAE", + "field_kind": "input", + "input": "connection", + "orig_required": true + }, + "type": { + "const": "flux2_vae_encode", + "default": "flux2_vae_encode", + "field_kind": "node_attribute", + "title": "type", + "type": "string" + } + }, + "required": ["type", "id"], + "tags": ["latents", "image", "vae", "i2l", "flux2", "klein"], + "title": "Image to Latents - FLUX2", + "type": "object", + "version": "1.0.0", + "output": { + "$ref": "#/components/schemas/LatentsOutput" + } + }, + "Flux2VariantType": { + "type": "string", + "enum": ["klein_4b", "klein_4b_base", "klein_9b", "klein_9b_base"], + "title": "Flux2VariantType", + "description": "FLUX.2 model variants." 
+ }, + "FluxConditioningCollectionOutput": { + "class": "output", + "description": "Base class for nodes that output a collection of conditioning tensors", + "properties": { + "collection": { + "description": "The output conditioning tensors", + "field_kind": "output", + "items": { + "$ref": "#/components/schemas/FluxConditioningField" + }, + "title": "Collection", + "type": "array", + "ui_hidden": false + }, + "type": { + "const": "flux_conditioning_collection_output", + "default": "flux_conditioning_collection_output", + "field_kind": "node_attribute", + "title": "type", + "type": "string" + } + }, + "required": ["output_meta", "collection", "type", "type"], + "title": "FluxConditioningCollectionOutput", + "type": "object" + }, + "FluxConditioningField": { + "description": "A conditioning tensor primitive value", + "properties": { + "conditioning_name": { + "description": "The name of conditioning tensor", + "title": "Conditioning Name", + "type": "string" + }, + "mask": { + "anyOf": [ + { + "$ref": "#/components/schemas/TensorField" + }, + { + "type": "null" + } + ], + "default": null, + "description": "The mask associated with this conditioning tensor. Excluded regions should be set to False, included regions should be set to True." 
+ } + }, + "required": ["conditioning_name"], + "title": "FluxConditioningField", + "type": "object" + }, + "FluxConditioningOutput": { + "class": "output", + "description": "Base class for nodes that output a single conditioning tensor", + "properties": { + "conditioning": { + "$ref": "#/components/schemas/FluxConditioningField", + "description": "Conditioning tensor", + "field_kind": "output", + "ui_hidden": false + }, + "type": { + "const": "flux_conditioning_output", + "default": "flux_conditioning_output", + "field_kind": "node_attribute", + "title": "type", + "type": "string" + } + }, + "required": ["output_meta", "conditioning", "type", "type"], + "title": "FluxConditioningOutput", + "type": "object" + }, + "FluxControlLoRALoaderInvocation": { + "category": "model", + "class": "invocation", + "classification": "stable", + "description": "LoRA model and Image to use with FLUX transformer generation.", + "node_pack": "invokeai", + "properties": { "id": { "description": "The id of this instance of an invocation. Must be unique among all instances of invocations.", "field_kind": "node_attribute", @@ -27576,37 +27885,203 @@ "title": "FluxModelLoaderOutput", "type": "object" }, - "FluxReduxConditioningField": { - "description": "A FLUX Redux conditioning tensor primitive value", + "FluxPiDDecodeInvocation": { + "category": "latents", + "class": "invocation", + "classification": "prototype", + "description": "Decode a FLUX latent with the PiD pixel-diffusion decoder.\n\nThe FLUX AutoEncoder usually denormalises the stored latent internally\nbefore its conv decoder runs (`z / scale + shift`); we apply the same\ntransform manually here so PiD sees the raw latent it was trained on.", + "node_pack": "invokeai", "properties": { - "conditioning": { - "$ref": "#/components/schemas/TensorField", - "description": "The Redux image conditioning tensor." 
+ "board": { + "anyOf": [ + { + "$ref": "#/components/schemas/BoardField" + }, + { + "type": "null" + } + ], + "default": null, + "description": "The board to save the image to", + "field_kind": "internal", + "input": "direct", + "orig_required": false, + "ui_hidden": false }, - "mask": { + "metadata": { "anyOf": [ { - "$ref": "#/components/schemas/TensorField" + "$ref": "#/components/schemas/MetadataField" }, { "type": "null" } ], "default": null, - "description": "The mask associated with this conditioning tensor. Excluded regions should be set to False, included regions should be set to True." - } - }, - "required": ["conditioning"], - "title": "FluxReduxConditioningField", - "type": "object" - }, - "FluxReduxInvocation": { - "category": "conditioning", - "class": "invocation", - "classification": "beta", - "description": "Runs a FLUX Redux model to generate a conditioning tensor.", - "node_pack": "invokeai", - "properties": { + "description": "Optional metadata to be saved with the image", + "field_kind": "internal", + "input": "connection", + "orig_required": false, + "ui_hidden": false + }, + "id": { + "description": "The id of this instance of an invocation. 
Must be unique among all instances of invocations.", + "field_kind": "node_attribute", + "title": "Id", + "type": "string" + }, + "is_intermediate": { + "default": false, + "description": "Whether or not this is an intermediate invocation.", + "field_kind": "node_attribute", + "input": "direct", + "orig_required": true, + "title": "Is Intermediate", + "type": "boolean", + "ui_hidden": false, + "ui_type": "IsIntermediate" + }, + "use_cache": { + "default": true, + "description": "Whether or not to use the cache", + "field_kind": "node_attribute", + "title": "Use Cache", + "type": "boolean" + }, + "latents": { + "anyOf": [ + { + "$ref": "#/components/schemas/LatentsField" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Latents tensor", + "field_kind": "input", + "input": "connection", + "orig_required": true + }, + "prompt": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Text prompt the latent was generated from. PiD conditions on it.", + "field_kind": "input", + "input": "any", + "orig_required": true, + "title": "Prompt", + "ui_component": "textarea" + }, + "gemma2_encoder": { + "anyOf": [ + { + "$ref": "#/components/schemas/Gemma2EncoderField" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Gemma-2 caption encoder. Required by PiD.", + "field_kind": "input", + "input": "connection", + "orig_required": true, + "title": "Gemma-2 Encoder" + }, + "pid_decoder": { + "anyOf": [ + { + "$ref": "#/components/schemas/PiDDecoderField" + }, + { + "type": "null" + } + ], + "default": null, + "description": "PiD FLUX decoder checkpoint.", + "field_kind": "input", + "input": "connection", + "orig_required": true, + "title": "PiD Decoder" + }, + "num_inference_steps": { + "default": 4, + "description": "Number of PiD distill steps. 
The released checkpoints are trained for 4.", + "field_kind": "input", + "input": "any", + "maximum": 8, + "minimum": 1, + "orig_default": 4, + "orig_required": false, + "title": "Num Inference Steps", + "type": "integer" + }, + "seed": { + "default": 0, + "description": "Seed for the PiD decoder's noise.", + "field_kind": "input", + "input": "any", + "orig_default": 0, + "orig_required": false, + "title": "Seed", + "type": "integer" + }, + "type": { + "const": "flux_pid_decode", + "default": "flux_pid_decode", + "field_kind": "node_attribute", + "title": "type", + "type": "string" + } + }, + "required": ["type", "id"], + "tags": ["latents", "image", "pid", "flux", "upscale"], + "title": "Latents to Image - FLUX + PiD (4x SR)", + "type": "object", + "version": "1.0.0", + "output": { + "$ref": "#/components/schemas/ImageOutput" + } + }, + "FluxReduxConditioningField": { + "description": "A FLUX Redux conditioning tensor primitive value", + "properties": { + "conditioning": { + "$ref": "#/components/schemas/TensorField", + "description": "The Redux image conditioning tensor." + }, + "mask": { + "anyOf": [ + { + "$ref": "#/components/schemas/TensorField" + }, + { + "type": "null" + } + ], + "default": null, + "description": "The mask associated with this conditioning tensor. Excluded regions should be set to False, included regions should be set to True." + } + }, + "required": ["conditioning"], + "title": "FluxReduxConditioningField", + "type": "object" + }, + "FluxReduxInvocation": { + "category": "conditioning", + "class": "invocation", + "classification": "beta", + "description": "Runs a FLUX Redux model to generate a conditioning tensor.", + "node_pack": "invokeai", + "properties": { "id": { "description": "The id of this instance of an invocation. 
Must be unique among all instances of invocations.", "field_kind": "node_attribute", @@ -28522,6 +28997,245 @@ "$ref": "#/components/schemas/ImageCollectionOutput" } }, + "Gemma2EncoderField": { + "description": "Field for the Gemma-2 text encoder used by PiD decoders.", + "properties": { + "tokenizer": { + "$ref": "#/components/schemas/ModelIdentifierField", + "description": "Info to load tokenizer submodel" + }, + "text_encoder": { + "$ref": "#/components/schemas/ModelIdentifierField", + "description": "Info to load text_encoder submodel" + } + }, + "required": ["tokenizer", "text_encoder"], + "title": "Gemma2EncoderField", + "type": "object" + }, + "Gemma2EncoderLoaderInvocation": { + "category": "model", + "class": "invocation", + "classification": "prototype", + "description": "Loads a Gemma-2 causal LM directory and exposes its tokenizer + decoder\nsubmodels for use by a PiD decode node.", + "node_pack": "invokeai", + "properties": { + "id": { + "description": "The id of this instance of an invocation. 
Must be unique among all instances of invocations.", + "field_kind": "node_attribute", + "title": "Id", + "type": "string" + }, + "is_intermediate": { + "default": false, + "description": "Whether or not this is an intermediate invocation.", + "field_kind": "node_attribute", + "input": "direct", + "orig_required": true, + "title": "Is Intermediate", + "type": "boolean", + "ui_hidden": false, + "ui_type": "IsIntermediate" + }, + "use_cache": { + "default": true, + "description": "Whether or not to use the cache", + "field_kind": "node_attribute", + "title": "Use Cache", + "type": "boolean" + }, + "gemma2_model": { + "anyOf": [ + { + "$ref": "#/components/schemas/ModelIdentifierField" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Gemma-2 model used to encode captions for PiD decoders.", + "field_kind": "input", + "input": "any", + "orig_required": true, + "title": "Gemma-2", + "ui_model_type": ["gemma2_encoder"] + }, + "type": { + "const": "gemma2_encoder_loader", + "default": "gemma2_encoder_loader", + "field_kind": "node_attribute", + "title": "type", + "type": "string" + } + }, + "required": ["type", "id"], + "tags": ["model", "gemma2", "pid"], + "title": "Gemma-2 Encoder - PiD", + "type": "object", + "version": "1.0.0", + "output": { + "$ref": "#/components/schemas/Gemma2EncoderOutput" + } + }, + "Gemma2EncoderOutput": { + "class": "output", + "properties": { + "gemma2_encoder": { + "$ref": "#/components/schemas/Gemma2EncoderField", + "description": "Gemma-2 text encoder used by PiD decoders", + "field_kind": "output", + "title": "Gemma-2 Encoder", + "ui_hidden": false + }, + "type": { + "const": "gemma2_encoder_output", + "default": "gemma2_encoder_output", + "field_kind": "node_attribute", + "title": "type", + "type": "string" + } + }, + "required": ["output_meta", "gemma2_encoder", "type", "type"], + "title": "Gemma2EncoderOutput", + "type": "object" + }, + "Gemma2Encoder_Gemma2Encoder_Config": { + "properties": { + "key": { + "type": 
"string", + "title": "Key", + "description": "A unique key for this model." + }, + "hash": { + "type": "string", + "title": "Hash", + "description": "The hash of the model file(s)." + }, + "path": { + "type": "string", + "title": "Path", + "description": "Path to the model on the filesystem. Relative paths are relative to the Invoke root directory." + }, + "file_size": { + "type": "integer", + "title": "File Size", + "description": "The size of the model in bytes." + }, + "name": { + "type": "string", + "title": "Name", + "description": "Name of the model." + }, + "description": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "title": "Description", + "description": "Model description" + }, + "source": { + "type": "string", + "title": "Source", + "description": "The original source of the model (path, URL or repo_id)." + }, + "source_type": { + "$ref": "#/components/schemas/ModelSourceType", + "description": "The type of source" + }, + "source_api_response": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "title": "Source Api Response", + "description": "The original API response from the source, as stringified JSON." + }, + "source_url": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "title": "Source Url", + "description": "Optional URL for the model (e.g. download page or model page)." 
+ }, + "cover_image": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "title": "Cover Image", + "description": "Url for image to preview model" + }, + "base": { + "type": "string", + "const": "any", + "title": "Base", + "default": "any" + }, + "type": { + "type": "string", + "const": "gemma2_encoder", + "title": "Type", + "default": "gemma2_encoder" + }, + "format": { + "type": "string", + "const": "gemma2_encoder", + "title": "Format", + "default": "gemma2_encoder" + }, + "cpu_only": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "null" + } + ], + "title": "Cpu Only", + "description": "Whether this model should run on CPU only" + } + }, + "type": "object", + "required": [ + "key", + "hash", + "path", + "file_size", + "name", + "description", + "source", + "source_type", + "source_api_response", + "source_url", + "cover_image", + "base", + "type", + "format", + "cpu_only" + ], + "title": "Gemma2Encoder_Gemma2Encoder_Config", + "description": "Standalone Gemma-2 causal LM directory used as a text encoder by PiD.\n\nExpected directory layout (HuggingFace `from_pretrained`-compatible)::\n\n /\n config.json # architectures: [\"Gemma2ForCausalLM\"]\n tokenizer.json\n tokenizer_config.json\n model-*.safetensors # or model.safetensors / *.bin" + }, "GeneratePasswordResponse": { "properties": { "password": { @@ -28896,6 +29610,9 @@ { "$ref": "#/components/schemas/Flux2KleinTextEncoderInvocation" }, + { + "$ref": "#/components/schemas/Flux2PiDDecodeInvocation" + }, { "$ref": "#/components/schemas/Flux2VaeDecodeInvocation" }, @@ -28932,6 +29649,9 @@ { "$ref": "#/components/schemas/FluxModelLoaderInvocation" }, + { + "$ref": "#/components/schemas/FluxPiDDecodeInvocation" + }, { "$ref": "#/components/schemas/FluxReduxInvocation" }, @@ -28950,6 +29670,9 @@ { "$ref": "#/components/schemas/GeminiImageGenerationInvocation" }, + { + "$ref": "#/components/schemas/Gemma2EncoderLoaderInvocation" + }, { "$ref": 
"#/components/schemas/GetMaskBoundingBoxInvocation" }, @@ -29250,6 +29973,12 @@ { "$ref": "#/components/schemas/PasteImageIntoBoundingBoxInvocation" }, + { + "$ref": "#/components/schemas/PiDDecoderLoaderInvocation" + }, + { + "$ref": "#/components/schemas/PiDUpscaleInvocation" + }, { "$ref": "#/components/schemas/PiDiNetEdgeDetectionInvocation" }, @@ -29277,6 +30006,9 @@ { "$ref": "#/components/schemas/QwenImageModelLoaderInvocation" }, + { + "$ref": "#/components/schemas/QwenImagePiDDecodeInvocation" + }, { "$ref": "#/components/schemas/QwenImageTextEncoderInvocation" }, @@ -29313,6 +30045,9 @@ { "$ref": "#/components/schemas/SD3LatentsToImageInvocation" }, + { + "$ref": "#/components/schemas/SD3PiDDecodeInvocation" + }, { "$ref": "#/components/schemas/SDXLCompelPromptInvocation" }, @@ -29325,6 +30060,9 @@ { "$ref": "#/components/schemas/SDXLModelLoaderInvocation" }, + { + "$ref": "#/components/schemas/SDXLPiDDecodeInvocation" + }, { "$ref": "#/components/schemas/SDXLRefinerCompelPromptInvocation" }, @@ -29439,6 +30177,9 @@ { "$ref": "#/components/schemas/ZImageModelLoaderInvocation" }, + { + "$ref": "#/components/schemas/ZImagePiDDecodeInvocation" + }, { "$ref": "#/components/schemas/ZImageSeedVarianceEnhancerInvocation" }, @@ -29604,6 +30345,9 @@ { "$ref": "#/components/schemas/FluxReduxOutput" }, + { + "$ref": "#/components/schemas/Gemma2EncoderOutput" + }, { "$ref": "#/components/schemas/GradientMaskOutput" }, @@ -29697,6 +30441,9 @@ { "$ref": "#/components/schemas/PairTileImageOutput" }, + { + "$ref": "#/components/schemas/PiDDecoderOutput" + }, { "$ref": "#/components/schemas/PromptTemplateOutput" }, @@ -36479,6 +37226,9 @@ { "$ref": "#/components/schemas/Flux2KleinTextEncoderInvocation" }, + { + "$ref": "#/components/schemas/Flux2PiDDecodeInvocation" + }, { "$ref": "#/components/schemas/Flux2VaeDecodeInvocation" }, @@ -36515,6 +37265,9 @@ { "$ref": "#/components/schemas/FluxModelLoaderInvocation" }, + { + "$ref": 
"#/components/schemas/FluxPiDDecodeInvocation" + }, { "$ref": "#/components/schemas/FluxReduxInvocation" }, @@ -36533,6 +37286,9 @@ { "$ref": "#/components/schemas/GeminiImageGenerationInvocation" }, + { + "$ref": "#/components/schemas/Gemma2EncoderLoaderInvocation" + }, { "$ref": "#/components/schemas/GetMaskBoundingBoxInvocation" }, @@ -36833,6 +37589,12 @@ { "$ref": "#/components/schemas/PasteImageIntoBoundingBoxInvocation" }, + { + "$ref": "#/components/schemas/PiDDecoderLoaderInvocation" + }, + { + "$ref": "#/components/schemas/PiDUpscaleInvocation" + }, { "$ref": "#/components/schemas/PiDiNetEdgeDetectionInvocation" }, @@ -36860,6 +37622,9 @@ { "$ref": "#/components/schemas/QwenImageModelLoaderInvocation" }, + { + "$ref": "#/components/schemas/QwenImagePiDDecodeInvocation" + }, { "$ref": "#/components/schemas/QwenImageTextEncoderInvocation" }, @@ -36896,6 +37661,9 @@ { "$ref": "#/components/schemas/SD3LatentsToImageInvocation" }, + { + "$ref": "#/components/schemas/SD3PiDDecodeInvocation" + }, { "$ref": "#/components/schemas/SDXLCompelPromptInvocation" }, @@ -36908,6 +37676,9 @@ { "$ref": "#/components/schemas/SDXLModelLoaderInvocation" }, + { + "$ref": "#/components/schemas/SDXLPiDDecodeInvocation" + }, { "$ref": "#/components/schemas/SDXLRefinerCompelPromptInvocation" }, @@ -37022,6 +37793,9 @@ { "$ref": "#/components/schemas/ZImageModelLoaderInvocation" }, + { + "$ref": "#/components/schemas/ZImagePiDDecodeInvocation" + }, { "$ref": "#/components/schemas/ZImageSeedVarianceEnhancerInvocation" }, @@ -37144,6 +37918,9 @@ { "$ref": "#/components/schemas/FluxReduxOutput" }, + { + "$ref": "#/components/schemas/Gemma2EncoderOutput" + }, { "$ref": "#/components/schemas/GradientMaskOutput" }, @@ -37237,6 +38014,9 @@ { "$ref": "#/components/schemas/PairTileImageOutput" }, + { + "$ref": "#/components/schemas/PiDDecoderOutput" + }, { "$ref": "#/components/schemas/PromptTemplateOutput" }, @@ -37608,6 +38388,9 @@ { "$ref": 
"#/components/schemas/Flux2KleinTextEncoderInvocation" }, + { + "$ref": "#/components/schemas/Flux2PiDDecodeInvocation" + }, { "$ref": "#/components/schemas/Flux2VaeDecodeInvocation" }, @@ -37644,6 +38427,9 @@ { "$ref": "#/components/schemas/FluxModelLoaderInvocation" }, + { + "$ref": "#/components/schemas/FluxPiDDecodeInvocation" + }, { "$ref": "#/components/schemas/FluxReduxInvocation" }, @@ -37662,6 +38448,9 @@ { "$ref": "#/components/schemas/GeminiImageGenerationInvocation" }, + { + "$ref": "#/components/schemas/Gemma2EncoderLoaderInvocation" + }, { "$ref": "#/components/schemas/GetMaskBoundingBoxInvocation" }, @@ -37962,6 +38751,12 @@ { "$ref": "#/components/schemas/PasteImageIntoBoundingBoxInvocation" }, + { + "$ref": "#/components/schemas/PiDDecoderLoaderInvocation" + }, + { + "$ref": "#/components/schemas/PiDUpscaleInvocation" + }, { "$ref": "#/components/schemas/PiDiNetEdgeDetectionInvocation" }, @@ -37989,6 +38784,9 @@ { "$ref": "#/components/schemas/QwenImageModelLoaderInvocation" }, + { + "$ref": "#/components/schemas/QwenImagePiDDecodeInvocation" + }, { "$ref": "#/components/schemas/QwenImageTextEncoderInvocation" }, @@ -38025,6 +38823,9 @@ { "$ref": "#/components/schemas/SD3LatentsToImageInvocation" }, + { + "$ref": "#/components/schemas/SD3PiDDecodeInvocation" + }, { "$ref": "#/components/schemas/SDXLCompelPromptInvocation" }, @@ -38037,6 +38838,9 @@ { "$ref": "#/components/schemas/SDXLModelLoaderInvocation" }, + { + "$ref": "#/components/schemas/SDXLPiDDecodeInvocation" + }, { "$ref": "#/components/schemas/SDXLRefinerCompelPromptInvocation" }, @@ -38151,6 +38955,9 @@ { "$ref": "#/components/schemas/ZImageModelLoaderInvocation" }, + { + "$ref": "#/components/schemas/ZImagePiDDecodeInvocation" + }, { "$ref": "#/components/schemas/ZImageSeedVarianceEnhancerInvocation" }, @@ -38406,6 +39213,9 @@ "flux2_klein_text_encoder": { "$ref": "#/components/schemas/FluxConditioningOutput" }, + "flux2_pid_decode": { + "$ref": "#/components/schemas/ImageOutput" + }, 
"flux2_vae_decode": { "$ref": "#/components/schemas/ImageOutput" }, @@ -38445,6 +39255,9 @@ "flux_model_loader": { "$ref": "#/components/schemas/FluxModelLoaderOutput" }, + "flux_pid_decode": { + "$ref": "#/components/schemas/ImageOutput" + }, "flux_redux": { "$ref": "#/components/schemas/FluxReduxOutput" }, @@ -38463,6 +39276,9 @@ "gemini_image_generation": { "$ref": "#/components/schemas/ImageCollectionOutput" }, + "gemma2_encoder_loader": { + "$ref": "#/components/schemas/Gemma2EncoderOutput" + }, "get_image_mask_bounding_box": { "$ref": "#/components/schemas/BoundingBoxOutput" }, @@ -38769,6 +39585,12 @@ "pbr_maps": { "$ref": "#/components/schemas/PBRMapsOutput" }, + "pid_decoder_loader": { + "$ref": "#/components/schemas/PiDDecoderOutput" + }, + "pid_upscale": { + "$ref": "#/components/schemas/ImageOutput" + }, "pidi_edge_detection": { "$ref": "#/components/schemas/ImageOutput" }, @@ -38796,6 +39618,9 @@ "qwen_image_model_loader": { "$ref": "#/components/schemas/QwenImageModelLoaderOutput" }, + "qwen_image_pid_decode": { + "$ref": "#/components/schemas/ImageOutput" + }, "qwen_image_text_encoder": { "$ref": "#/components/schemas/QwenImageConditioningOutput" }, @@ -38841,6 +39666,9 @@ "sd3_model_loader": { "$ref": "#/components/schemas/Sd3ModelLoaderOutput" }, + "sd3_pid_decode": { + "$ref": "#/components/schemas/ImageOutput" + }, "sd3_text_encoder": { "$ref": "#/components/schemas/SD3ConditioningOutput" }, @@ -38856,6 +39684,9 @@ "sdxl_model_loader": { "$ref": "#/components/schemas/SDXLModelLoaderOutput" }, + "sdxl_pid_decode": { + "$ref": "#/components/schemas/ImageOutput" + }, "sdxl_refiner_compel_prompt": { "$ref": "#/components/schemas/ConditioningOutput" }, @@ -38961,6 +39792,9 @@ "z_image_model_loader": { "$ref": "#/components/schemas/ZImageModelLoaderOutput" }, + "z_image_pid_decode": { + "$ref": "#/components/schemas/ImageOutput" + }, "z_image_seed_variance_enhancer": { "$ref": "#/components/schemas/ZImageConditioningOutput" }, @@ -39037,6 +39871,7 @@ 
"flux2_klein_lora_loader", "flux2_klein_model_loader", "flux2_klein_text_encoder", + "flux2_pid_decode", "flux2_vae_decode", "flux2_vae_encode", "flux_control_lora_loader", @@ -39050,12 +39885,14 @@ "flux_lora_collection_loader", "flux_lora_loader", "flux_model_loader", + "flux_pid_decode", "flux_redux", "flux_text_encoder", "flux_vae_decode", "flux_vae_encode", "freeu", "gemini_image_generation", + "gemma2_encoder_loader", "get_image_mask_bounding_box", "grounding_dino", "hed_edge_detection", @@ -39158,6 +39995,8 @@ "pair_tile_image", "paste_image_into_bounding_box", "pbr_maps", + "pid_decoder_loader", + "pid_upscale", "pidi_edge_detection", "prompt_from_file", "prompt_template", @@ -39167,6 +40006,7 @@ "qwen_image_lora_collection_loader", "qwen_image_lora_loader", "qwen_image_model_loader", + "qwen_image_pid_decode", "qwen_image_text_encoder", "rand_float", "rand_int", @@ -39182,11 +40022,13 @@ "sd3_i2l", "sd3_l2i", "sd3_model_loader", + "sd3_pid_decode", "sd3_text_encoder", "sdxl_compel_prompt", "sdxl_lora_collection_loader", "sdxl_lora_loader", "sdxl_model_loader", + "sdxl_pid_decode", "sdxl_refiner_compel_prompt", "sdxl_refiner_model_loader", "seamless", @@ -39222,6 +40064,7 @@ "z_image_lora_collection_loader", "z_image_lora_loader", "z_image_model_loader", + "z_image_pid_decode", "z_image_seed_variance_enhancer", "z_image_text_encoder" ] @@ -39505,6 +40348,9 @@ { "$ref": "#/components/schemas/Flux2KleinTextEncoderInvocation" }, + { + "$ref": "#/components/schemas/Flux2PiDDecodeInvocation" + }, { "$ref": "#/components/schemas/Flux2VaeDecodeInvocation" }, @@ -39541,6 +40387,9 @@ { "$ref": "#/components/schemas/FluxModelLoaderInvocation" }, + { + "$ref": "#/components/schemas/FluxPiDDecodeInvocation" + }, { "$ref": "#/components/schemas/FluxReduxInvocation" }, @@ -39559,6 +40408,9 @@ { "$ref": "#/components/schemas/GeminiImageGenerationInvocation" }, + { + "$ref": "#/components/schemas/Gemma2EncoderLoaderInvocation" + }, { "$ref": 
"#/components/schemas/GetMaskBoundingBoxInvocation" }, @@ -39859,6 +40711,12 @@ { "$ref": "#/components/schemas/PasteImageIntoBoundingBoxInvocation" }, + { + "$ref": "#/components/schemas/PiDDecoderLoaderInvocation" + }, + { + "$ref": "#/components/schemas/PiDUpscaleInvocation" + }, { "$ref": "#/components/schemas/PiDiNetEdgeDetectionInvocation" }, @@ -39886,6 +40744,9 @@ { "$ref": "#/components/schemas/QwenImageModelLoaderInvocation" }, + { + "$ref": "#/components/schemas/QwenImagePiDDecodeInvocation" + }, { "$ref": "#/components/schemas/QwenImageTextEncoderInvocation" }, @@ -39922,6 +40783,9 @@ { "$ref": "#/components/schemas/SD3LatentsToImageInvocation" }, + { + "$ref": "#/components/schemas/SD3PiDDecodeInvocation" + }, { "$ref": "#/components/schemas/SDXLCompelPromptInvocation" }, @@ -39934,6 +40798,9 @@ { "$ref": "#/components/schemas/SDXLModelLoaderInvocation" }, + { + "$ref": "#/components/schemas/SDXLPiDDecodeInvocation" + }, { "$ref": "#/components/schemas/SDXLRefinerCompelPromptInvocation" }, @@ -40048,6 +40915,9 @@ { "$ref": "#/components/schemas/ZImageModelLoaderInvocation" }, + { + "$ref": "#/components/schemas/ZImagePiDDecodeInvocation" + }, { "$ref": "#/components/schemas/ZImageSeedVarianceEnhancerInvocation" }, @@ -40392,6 +41262,9 @@ { "$ref": "#/components/schemas/Flux2KleinTextEncoderInvocation" }, + { + "$ref": "#/components/schemas/Flux2PiDDecodeInvocation" + }, { "$ref": "#/components/schemas/Flux2VaeDecodeInvocation" }, @@ -40428,6 +41301,9 @@ { "$ref": "#/components/schemas/FluxModelLoaderInvocation" }, + { + "$ref": "#/components/schemas/FluxPiDDecodeInvocation" + }, { "$ref": "#/components/schemas/FluxReduxInvocation" }, @@ -40446,6 +41322,9 @@ { "$ref": "#/components/schemas/GeminiImageGenerationInvocation" }, + { + "$ref": "#/components/schemas/Gemma2EncoderLoaderInvocation" + }, { "$ref": "#/components/schemas/GetMaskBoundingBoxInvocation" }, @@ -40746,6 +41625,12 @@ { "$ref": "#/components/schemas/PasteImageIntoBoundingBoxInvocation" 
}, + { + "$ref": "#/components/schemas/PiDDecoderLoaderInvocation" + }, + { + "$ref": "#/components/schemas/PiDUpscaleInvocation" + }, { "$ref": "#/components/schemas/PiDiNetEdgeDetectionInvocation" }, @@ -40773,6 +41658,9 @@ { "$ref": "#/components/schemas/QwenImageModelLoaderInvocation" }, + { + "$ref": "#/components/schemas/QwenImagePiDDecodeInvocation" + }, { "$ref": "#/components/schemas/QwenImageTextEncoderInvocation" }, @@ -40809,6 +41697,9 @@ { "$ref": "#/components/schemas/SD3LatentsToImageInvocation" }, + { + "$ref": "#/components/schemas/SD3PiDDecodeInvocation" + }, { "$ref": "#/components/schemas/SDXLCompelPromptInvocation" }, @@ -40821,6 +41712,9 @@ { "$ref": "#/components/schemas/SDXLModelLoaderInvocation" }, + { + "$ref": "#/components/schemas/SDXLPiDDecodeInvocation" + }, { "$ref": "#/components/schemas/SDXLRefinerCompelPromptInvocation" }, @@ -40935,6 +41829,9 @@ { "$ref": "#/components/schemas/ZImageModelLoaderInvocation" }, + { + "$ref": "#/components/schemas/ZImagePiDDecodeInvocation" + }, { "$ref": "#/components/schemas/ZImageSeedVarianceEnhancerInvocation" }, @@ -54788,6 +55685,7 @@ "t5_encoder", "qwen3_encoder", "qwen_vl_encoder", + "gemma2_encoder", "bnb_quantized_int8b", "bnb_quantized_nf4b", "gguf_quantized", @@ -55133,6 +56031,21 @@ { "$ref": "#/components/schemas/VAE_Diffusers_Flux2_Config" }, + { + "$ref": "#/components/schemas/PiDDecoder_Checkpoint_FLUX_Config" + }, + { + "$ref": "#/components/schemas/PiDDecoder_Checkpoint_Flux2_Config" + }, + { + "$ref": "#/components/schemas/PiDDecoder_Checkpoint_SD3_Config" + }, + { + "$ref": "#/components/schemas/PiDDecoder_Checkpoint_SDXL_Config" + }, + { + "$ref": "#/components/schemas/PiDDecoder_Checkpoint_QwenImage_Config" + }, { "$ref": "#/components/schemas/ControlNet_Checkpoint_SD1_Config" }, @@ -55226,6 +56139,9 @@ { "$ref": "#/components/schemas/Qwen3Encoder_GGUF_Config" }, + { + "$ref": "#/components/schemas/Gemma2Encoder_Gemma2Encoder_Config" + }, { "$ref": 
"#/components/schemas/QwenVLEncoder_Diffusers_Config" }, @@ -55705,6 +56621,21 @@ { "$ref": "#/components/schemas/VAE_Diffusers_Flux2_Config" }, + { + "$ref": "#/components/schemas/PiDDecoder_Checkpoint_FLUX_Config" + }, + { + "$ref": "#/components/schemas/PiDDecoder_Checkpoint_Flux2_Config" + }, + { + "$ref": "#/components/schemas/PiDDecoder_Checkpoint_SD3_Config" + }, + { + "$ref": "#/components/schemas/PiDDecoder_Checkpoint_SDXL_Config" + }, + { + "$ref": "#/components/schemas/PiDDecoder_Checkpoint_QwenImage_Config" + }, { "$ref": "#/components/schemas/ControlNet_Checkpoint_SD1_Config" }, @@ -55798,6 +56729,9 @@ { "$ref": "#/components/schemas/Qwen3Encoder_GGUF_Config" }, + { + "$ref": "#/components/schemas/Gemma2Encoder_Gemma2Encoder_Config" + }, { "$ref": "#/components/schemas/QwenVLEncoder_Diffusers_Config" }, @@ -56162,6 +57096,21 @@ { "$ref": "#/components/schemas/VAE_Diffusers_Flux2_Config" }, + { + "$ref": "#/components/schemas/PiDDecoder_Checkpoint_FLUX_Config" + }, + { + "$ref": "#/components/schemas/PiDDecoder_Checkpoint_Flux2_Config" + }, + { + "$ref": "#/components/schemas/PiDDecoder_Checkpoint_SD3_Config" + }, + { + "$ref": "#/components/schemas/PiDDecoder_Checkpoint_SDXL_Config" + }, + { + "$ref": "#/components/schemas/PiDDecoder_Checkpoint_QwenImage_Config" + }, { "$ref": "#/components/schemas/ControlNet_Checkpoint_SD1_Config" }, @@ -56255,6 +57204,9 @@ { "$ref": "#/components/schemas/Qwen3Encoder_GGUF_Config" }, + { + "$ref": "#/components/schemas/Gemma2Encoder_Gemma2Encoder_Config" + }, { "$ref": "#/components/schemas/QwenVLEncoder_Diffusers_Config" }, @@ -56469,6 +57421,21 @@ { "$ref": "#/components/schemas/VAE_Diffusers_Flux2_Config" }, + { + "$ref": "#/components/schemas/PiDDecoder_Checkpoint_FLUX_Config" + }, + { + "$ref": "#/components/schemas/PiDDecoder_Checkpoint_Flux2_Config" + }, + { + "$ref": "#/components/schemas/PiDDecoder_Checkpoint_SD3_Config" + }, + { + "$ref": "#/components/schemas/PiDDecoder_Checkpoint_SDXL_Config" + }, + { + 
"$ref": "#/components/schemas/PiDDecoder_Checkpoint_QwenImage_Config" + }, { "$ref": "#/components/schemas/ControlNet_Checkpoint_SD1_Config" }, @@ -56562,6 +57529,9 @@ { "$ref": "#/components/schemas/Qwen3Encoder_GGUF_Config" }, + { + "$ref": "#/components/schemas/Gemma2Encoder_Gemma2Encoder_Config" + }, { "$ref": "#/components/schemas/QwenVLEncoder_Diffusers_Config" }, @@ -56962,6 +57932,9 @@ { "$ref": "#/components/schemas/Qwen3VariantType" }, + { + "$ref": "#/components/schemas/PiDDecoderVariantType" + }, { "type": "null" } @@ -57101,12 +58074,14 @@ "t5_encoder", "qwen3_encoder", "qwen_vl_encoder", + "gemma2_encoder", "spandrel_image_to_image", "siglip", "flux_redux", "llava_onevision", "text_llm", "external_image_generator", + "pid_decoder", "unknown" ], "title": "ModelType", @@ -57225,6 +58200,21 @@ { "$ref": "#/components/schemas/VAE_Diffusers_Flux2_Config" }, + { + "$ref": "#/components/schemas/PiDDecoder_Checkpoint_FLUX_Config" + }, + { + "$ref": "#/components/schemas/PiDDecoder_Checkpoint_Flux2_Config" + }, + { + "$ref": "#/components/schemas/PiDDecoder_Checkpoint_SD3_Config" + }, + { + "$ref": "#/components/schemas/PiDDecoder_Checkpoint_SDXL_Config" + }, + { + "$ref": "#/components/schemas/PiDDecoder_Checkpoint_QwenImage_Config" + }, { "$ref": "#/components/schemas/ControlNet_Checkpoint_SD1_Config" }, @@ -57318,6 +58308,9 @@ { "$ref": "#/components/schemas/Qwen3Encoder_GGUF_Config" }, + { + "$ref": "#/components/schemas/Gemma2Encoder_Gemma2Encoder_Config" + }, { "$ref": "#/components/schemas/QwenVLEncoder_Diffusers_Config" }, @@ -58845,174 +59838,1159 @@ "$ref": "#/components/schemas/ImageOutput" } }, - "PiDiNetEdgeDetectionInvocation": { - "category": "controlnet_preprocessors", - "class": "invocation", - "classification": "stable", - "description": "Generates an edge map using PiDiNet.", - "node_pack": "invokeai", - "properties": { - "board": { - "anyOf": [ - { - "$ref": "#/components/schemas/BoardField" - }, - { - "type": "null" - } - ], - "default": 
null, - "description": "The board to save the image to", - "field_kind": "internal", - "input": "direct", - "orig_required": false, - "ui_hidden": false - }, - "metadata": { - "anyOf": [ - { - "$ref": "#/components/schemas/MetadataField" - }, - { - "type": "null" - } - ], - "default": null, - "description": "Optional metadata to be saved with the image", - "field_kind": "internal", - "input": "connection", - "orig_required": false, - "ui_hidden": false - }, - "id": { - "description": "The id of this instance of an invocation. Must be unique among all instances of invocations.", - "field_kind": "node_attribute", - "title": "Id", - "type": "string" - }, - "is_intermediate": { - "default": false, - "description": "Whether or not this is an intermediate invocation.", - "field_kind": "node_attribute", - "input": "direct", - "orig_required": true, - "title": "Is Intermediate", - "type": "boolean", - "ui_hidden": false, - "ui_type": "IsIntermediate" - }, - "use_cache": { - "default": true, - "description": "Whether or not to use the cache", - "field_kind": "node_attribute", - "title": "Use Cache", - "type": "boolean" - }, - "image": { - "anyOf": [ - { - "$ref": "#/components/schemas/ImageField" - }, - { - "type": "null" - } - ], - "default": null, - "description": "The image to process", - "field_kind": "input", - "input": "any", - "orig_required": true - }, - "quantize_edges": { - "default": false, - "description": "Whether or not to use safe mode", - "field_kind": "input", - "input": "any", - "orig_default": false, - "orig_required": false, - "title": "Quantize Edges", - "type": "boolean" - }, - "scribble": { - "default": false, - "description": "Whether or not to use scribble mode", - "field_kind": "input", - "input": "any", - "orig_default": false, - "orig_required": false, - "title": "Scribble", - "type": "boolean" - }, - "type": { - "const": "pidi_edge_detection", - "default": "pidi_edge_detection", - "field_kind": "node_attribute", - "title": "type", - "type": 
"string" - } - }, - "required": ["type", "id"], - "tags": ["controlnet", "edge"], - "title": "PiDiNet Edge Detection", - "type": "object", - "version": "1.0.0", - "output": { - "$ref": "#/components/schemas/ImageOutput" - } - }, - "PresetData": { + "PiDDecoderField": { + "description": "Field for a PiD (Pixel Diffusion Decoder) checkpoint.", "properties": { - "positive_prompt": { - "type": "string", - "title": "Positive Prompt", - "description": "Positive prompt" - }, - "negative_prompt": { - "type": "string", - "title": "Negative Prompt", - "description": "Negative prompt" - } - }, - "additionalProperties": false, - "type": "object", - "required": ["positive_prompt", "negative_prompt"], - "title": "PresetData" - }, - "PresetType": { - "type": "string", - "enum": ["user", "default"], - "title": "PresetType" - }, - "ProgressImage": { - "description": "The progress image sent intermittently during processing", - "properties": { - "width": { - "description": "The effective width of the image in pixels", - "minimum": 1, - "title": "Width", - "type": "integer" - }, - "height": { - "description": "The effective height of the image in pixels", - "minimum": 1, - "title": "Height", - "type": "integer" - }, - "dataURL": { - "description": "The image data as a b64 data URL", - "title": "Dataurl", - "type": "string" + "decoder": { + "$ref": "#/components/schemas/ModelIdentifierField", + "description": "Info to load PiD decoder checkpoint" } }, - "required": ["width", "height", "dataURL"], - "title": "ProgressImage", + "required": ["decoder"], + "title": "PiDDecoderField", "type": "object" }, - "PromptTemplateInvocation": { - "category": "prompt", + "PiDDecoderLoaderInvocation": { + "category": "model", "class": "invocation", - "classification": "stable", - "description": "Applies a Style Preset template to positive and negative prompts.\n\nSelect a Style Preset and provide positive/negative prompts. 
The node replaces\n{prompt} placeholders in the template with your input prompts.", + "classification": "prototype", + "description": "Loads a PiD decoder checkpoint, outputting a PiDDecoderField for use\nby the per-backbone PiD decode nodes.", + "node_pack": "invokeai", + "properties": { + "id": { + "description": "The id of this instance of an invocation. Must be unique among all instances of invocations.", + "field_kind": "node_attribute", + "title": "Id", + "type": "string" + }, + "is_intermediate": { + "default": false, + "description": "Whether or not this is an intermediate invocation.", + "field_kind": "node_attribute", + "input": "direct", + "orig_required": true, + "title": "Is Intermediate", + "type": "boolean", + "ui_hidden": false, + "ui_type": "IsIntermediate" + }, + "use_cache": { + "default": true, + "description": "Whether or not to use the cache", + "field_kind": "node_attribute", + "title": "Use Cache", + "type": "boolean" + }, + "pid_decoder_model": { + "anyOf": [ + { + "$ref": "#/components/schemas/ModelIdentifierField" + }, + { + "type": "null" + } + ], + "default": null, + "description": "PiD decoder checkpoint matching the upstream backbone.", + "field_kind": "input", + "input": "any", + "orig_required": true, + "title": "PiD Decoder", + "ui_model_type": ["pid_decoder"] + }, + "type": { + "const": "pid_decoder_loader", + "default": "pid_decoder_loader", + "field_kind": "node_attribute", + "title": "type", + "type": "string" + } + }, + "required": ["type", "id"], + "tags": ["model", "pid", "decoder"], + "title": "PiD Decoder - FLUX / FLUX.2 / SD3", + "type": "object", + "version": "1.0.0", + "output": { + "$ref": "#/components/schemas/PiDDecoderOutput" + } + }, + "PiDDecoderOutput": { + "class": "output", + "properties": { + "pid_decoder": { + "$ref": "#/components/schemas/PiDDecoderField", + "description": "PiD (Pixel Diffusion Decoder) checkpoint", + "field_kind": "output", + "title": "PiD Decoder", + "ui_hidden": false + }, + "type": { + 
"const": "pid_decoder_output", + "default": "pid_decoder_output", + "field_kind": "node_attribute", + "title": "type", + "type": "string" + } + }, + "required": ["output_meta", "pid_decoder", "type", "type"], + "title": "PiDDecoderOutput", + "type": "object" + }, + "PiDDecoderVariantType": { + "type": "string", + "enum": ["res2k_sr4x", "res2kto4k_sr4x"], + "title": "PiDDecoderVariantType", + "description": "PiD (Pixel Diffusion Decoder) variants distributed by NVIDIA.\n\nEach backbone (FLUX.1, FLUX.2, SD3) ships in two resolution presets that\ndiffer only in target output resolution; the underlying network is the\nsame. NVIDIA's checkpoint filenames encode this as e.g.\n`PiD_res2k_sr4x_official_flux_distill_4step` vs\n`PiD_res2kto4k_sr4x_official_flux_distill_4step`." + }, + "PiDDecoder_Checkpoint_FLUX_Config": { + "properties": { + "key": { + "type": "string", + "title": "Key", + "description": "A unique key for this model." + }, + "hash": { + "type": "string", + "title": "Hash", + "description": "The hash of the model file(s)." + }, + "path": { + "type": "string", + "title": "Path", + "description": "Path to the model on the filesystem. Relative paths are relative to the Invoke root directory." + }, + "file_size": { + "type": "integer", + "title": "File Size", + "description": "The size of the model in bytes." + }, + "name": { + "type": "string", + "title": "Name", + "description": "Name of the model." + }, + "description": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "title": "Description", + "description": "Model description" + }, + "source": { + "type": "string", + "title": "Source", + "description": "The original source of the model (path, URL or repo_id)." 
+ }, + "source_type": { + "$ref": "#/components/schemas/ModelSourceType", + "description": "The type of source" + }, + "source_api_response": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "title": "Source Api Response", + "description": "The original API response from the source, as stringified JSON." + }, + "source_url": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "title": "Source Url", + "description": "Optional URL for the model (e.g. download page or model page)." + }, + "cover_image": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "title": "Cover Image", + "description": "Url for image to preview model" + }, + "config_path": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "title": "Config Path", + "description": "Path to the config for this model, if any." + }, + "type": { + "type": "string", + "const": "pid_decoder", + "title": "Type", + "default": "pid_decoder" + }, + "format": { + "type": "string", + "const": "checkpoint", + "title": "Format", + "default": "checkpoint" + }, + "base": { + "type": "string", + "const": "flux", + "title": "Base", + "default": "flux" + }, + "variant": { + "$ref": "#/components/schemas/PiDDecoderVariantType", + "description": "Resolution preset of the PiD decoder checkpoint." + } + }, + "type": "object", + "required": [ + "key", + "hash", + "path", + "file_size", + "name", + "description", + "source", + "source_type", + "source_api_response", + "source_url", + "cover_image", + "config_path", + "type", + "format", + "base", + "variant" + ], + "title": "PiDDecoder_Checkpoint_FLUX_Config", + "description": "PiD decoder for the FLUX.1 backbone (16-channel latent)." + }, + "PiDDecoder_Checkpoint_Flux2_Config": { + "properties": { + "key": { + "type": "string", + "title": "Key", + "description": "A unique key for this model." 
+ }, + "hash": { + "type": "string", + "title": "Hash", + "description": "The hash of the model file(s)." + }, + "path": { + "type": "string", + "title": "Path", + "description": "Path to the model on the filesystem. Relative paths are relative to the Invoke root directory." + }, + "file_size": { + "type": "integer", + "title": "File Size", + "description": "The size of the model in bytes." + }, + "name": { + "type": "string", + "title": "Name", + "description": "Name of the model." + }, + "description": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "title": "Description", + "description": "Model description" + }, + "source": { + "type": "string", + "title": "Source", + "description": "The original source of the model (path, URL or repo_id)." + }, + "source_type": { + "$ref": "#/components/schemas/ModelSourceType", + "description": "The type of source" + }, + "source_api_response": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "title": "Source Api Response", + "description": "The original API response from the source, as stringified JSON." + }, + "source_url": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "title": "Source Url", + "description": "Optional URL for the model (e.g. download page or model page)." + }, + "cover_image": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "title": "Cover Image", + "description": "Url for image to preview model" + }, + "config_path": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "title": "Config Path", + "description": "Path to the config for this model, if any." 
+ }, + "type": { + "type": "string", + "const": "pid_decoder", + "title": "Type", + "default": "pid_decoder" + }, + "format": { + "type": "string", + "const": "checkpoint", + "title": "Format", + "default": "checkpoint" + }, + "base": { + "type": "string", + "const": "flux2", + "title": "Base", + "default": "flux2" + }, + "variant": { + "$ref": "#/components/schemas/PiDDecoderVariantType", + "description": "Resolution preset of the PiD decoder checkpoint." + } + }, + "type": "object", + "required": [ + "key", + "hash", + "path", + "file_size", + "name", + "description", + "source", + "source_type", + "source_api_response", + "source_url", + "cover_image", + "config_path", + "type", + "format", + "base", + "variant" + ], + "title": "PiDDecoder_Checkpoint_Flux2_Config", + "description": "PiD decoder for the FLUX.2 backbone (128-channel latent)." + }, + "PiDDecoder_Checkpoint_QwenImage_Config": { + "properties": { + "key": { + "type": "string", + "title": "Key", + "description": "A unique key for this model." + }, + "hash": { + "type": "string", + "title": "Hash", + "description": "The hash of the model file(s)." + }, + "path": { + "type": "string", + "title": "Path", + "description": "Path to the model on the filesystem. Relative paths are relative to the Invoke root directory." + }, + "file_size": { + "type": "integer", + "title": "File Size", + "description": "The size of the model in bytes." + }, + "name": { + "type": "string", + "title": "Name", + "description": "Name of the model." + }, + "description": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "title": "Description", + "description": "Model description" + }, + "source": { + "type": "string", + "title": "Source", + "description": "The original source of the model (path, URL or repo_id)." 
+ }, + "source_type": { + "$ref": "#/components/schemas/ModelSourceType", + "description": "The type of source" + }, + "source_api_response": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "title": "Source Api Response", + "description": "The original API response from the source, as stringified JSON." + }, + "source_url": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "title": "Source Url", + "description": "Optional URL for the model (e.g. download page or model page)." + }, + "cover_image": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "title": "Cover Image", + "description": "Url for image to preview model" + }, + "config_path": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "title": "Config Path", + "description": "Path to the config for this model, if any." + }, + "type": { + "type": "string", + "const": "pid_decoder", + "title": "Type", + "default": "pid_decoder" + }, + "format": { + "type": "string", + "const": "checkpoint", + "title": "Format", + "default": "checkpoint" + }, + "base": { + "type": "string", + "const": "qwen-image", + "title": "Base", + "default": "qwen-image" + }, + "variant": { + "$ref": "#/components/schemas/PiDDecoderVariantType", + "description": "Resolution preset of the PiD decoder checkpoint." + } + }, + "type": "object", + "required": [ + "key", + "hash", + "path", + "file_size", + "name", + "description", + "source", + "source_type", + "source_api_response", + "source_url", + "cover_image", + "config_path", + "type", + "format", + "base", + "variant" + ], + "title": "PiDDecoder_Checkpoint_QwenImage_Config", + "description": "PiD decoder for the Qwen-Image backbone (16-channel latent).\n\nShares the 16-channel latent shape with FLUX.1 and SD3, so it relies on the same\nfilename / directory-name disambiguation (or a trusted explicit ``base`` override)\nas SD3 - see ``_validate_base``." 
+ }, + "PiDDecoder_Checkpoint_SD3_Config": { + "properties": { + "key": { + "type": "string", + "title": "Key", + "description": "A unique key for this model." + }, + "hash": { + "type": "string", + "title": "Hash", + "description": "The hash of the model file(s)." + }, + "path": { + "type": "string", + "title": "Path", + "description": "Path to the model on the filesystem. Relative paths are relative to the Invoke root directory." + }, + "file_size": { + "type": "integer", + "title": "File Size", + "description": "The size of the model in bytes." + }, + "name": { + "type": "string", + "title": "Name", + "description": "Name of the model." + }, + "description": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "title": "Description", + "description": "Model description" + }, + "source": { + "type": "string", + "title": "Source", + "description": "The original source of the model (path, URL or repo_id)." + }, + "source_type": { + "$ref": "#/components/schemas/ModelSourceType", + "description": "The type of source" + }, + "source_api_response": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "title": "Source Api Response", + "description": "The original API response from the source, as stringified JSON." + }, + "source_url": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "title": "Source Url", + "description": "Optional URL for the model (e.g. download page or model page)." + }, + "cover_image": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "title": "Cover Image", + "description": "Url for image to preview model" + }, + "config_path": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "title": "Config Path", + "description": "Path to the config for this model, if any." 
+ }, + "type": { + "type": "string", + "const": "pid_decoder", + "title": "Type", + "default": "pid_decoder" + }, + "format": { + "type": "string", + "const": "checkpoint", + "title": "Format", + "default": "checkpoint" + }, + "base": { + "type": "string", + "const": "sd-3", + "title": "Base", + "default": "sd-3" + }, + "variant": { + "$ref": "#/components/schemas/PiDDecoderVariantType", + "description": "Resolution preset of the PiD decoder checkpoint." + } + }, + "type": "object", + "required": [ + "key", + "hash", + "path", + "file_size", + "name", + "description", + "source", + "source_type", + "source_api_response", + "source_url", + "cover_image", + "config_path", + "type", + "format", + "base", + "variant" + ], + "title": "PiDDecoder_Checkpoint_SD3_Config", + "description": "PiD decoder for the Stable Diffusion 3 backbone (16-channel latent)." + }, + "PiDDecoder_Checkpoint_SDXL_Config": { + "properties": { + "key": { + "type": "string", + "title": "Key", + "description": "A unique key for this model." + }, + "hash": { + "type": "string", + "title": "Hash", + "description": "The hash of the model file(s)." + }, + "path": { + "type": "string", + "title": "Path", + "description": "Path to the model on the filesystem. Relative paths are relative to the Invoke root directory." + }, + "file_size": { + "type": "integer", + "title": "File Size", + "description": "The size of the model in bytes." + }, + "name": { + "type": "string", + "title": "Name", + "description": "Name of the model." + }, + "description": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "title": "Description", + "description": "Model description" + }, + "source": { + "type": "string", + "title": "Source", + "description": "The original source of the model (path, URL or repo_id)." 
+ }, + "source_type": { + "$ref": "#/components/schemas/ModelSourceType", + "description": "The type of source" + }, + "source_api_response": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "title": "Source Api Response", + "description": "The original API response from the source, as stringified JSON." + }, + "source_url": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "title": "Source Url", + "description": "Optional URL for the model (e.g. download page or model page)." + }, + "cover_image": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "title": "Cover Image", + "description": "Url for image to preview model" + }, + "config_path": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "title": "Config Path", + "description": "Path to the config for this model, if any." + }, + "type": { + "type": "string", + "const": "pid_decoder", + "title": "Type", + "default": "pid_decoder" + }, + "format": { + "type": "string", + "const": "checkpoint", + "title": "Format", + "default": "checkpoint" + }, + "base": { + "type": "string", + "const": "sdxl", + "title": "Base", + "default": "sdxl" + }, + "variant": { + "$ref": "#/components/schemas/PiDDecoderVariantType", + "description": "Resolution preset of the PiD decoder checkpoint." + } + }, + "type": "object", + "required": [ + "key", + "hash", + "path", + "file_size", + "name", + "description", + "source", + "source_type", + "source_api_response", + "source_url", + "cover_image", + "config_path", + "type", + "format", + "base", + "variant" + ], + "title": "PiDDecoder_Checkpoint_SDXL_Config", + "description": "PiD decoder for the SDXL backbone (4-channel latent)." 
+ }, + "PiDUpscaleInvocation": { + "category": "image", + "class": "invocation", + "classification": "prototype", + "description": "Upscale any image 4x via FLUX VAE encode + PiD pixel-diffusion decode.\n\nWorks for source images that the FLUX VAE can encode (i.e. natural\nphotos / generated images at any size that lands on the VAE's 8-pixel\ngrid). The caption is used to condition the PiD decoder; leaving it\nempty produces an unconditional decode and is the cheapest option, but\nthe model was distilled with rich captions and benefits from one.", + "node_pack": "invokeai", + "properties": { + "board": { + "anyOf": [ + { + "$ref": "#/components/schemas/BoardField" + }, + { + "type": "null" + } + ], + "default": null, + "description": "The board to save the image to", + "field_kind": "internal", + "input": "direct", + "orig_required": false, + "ui_hidden": false + }, + "metadata": { + "anyOf": [ + { + "$ref": "#/components/schemas/MetadataField" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Optional metadata to be saved with the image", + "field_kind": "internal", + "input": "connection", + "orig_required": false, + "ui_hidden": false + }, + "id": { + "description": "The id of this instance of an invocation. 
Must be unique among all instances of invocations.", + "field_kind": "node_attribute", + "title": "Id", + "type": "string" + }, + "is_intermediate": { + "default": false, + "description": "Whether or not this is an intermediate invocation.", + "field_kind": "node_attribute", + "input": "direct", + "orig_required": true, + "title": "Is Intermediate", + "type": "boolean", + "ui_hidden": false, + "ui_type": "IsIntermediate" + }, + "use_cache": { + "default": true, + "description": "Whether or not to use the cache", + "field_kind": "node_attribute", + "title": "Use Cache", + "type": "boolean" + }, + "image": { + "anyOf": [ + { + "$ref": "#/components/schemas/ImageField" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Image to upscale.", + "field_kind": "input", + "input": "any", + "orig_required": true + }, + "vae": { + "anyOf": [ + { + "$ref": "#/components/schemas/VAEField" + }, + { + "type": "null" + } + ], + "default": null, + "description": "FLUX-compatible VAE (FLUX.1, Z-Image, anything sharing the 16-channel encoder).", + "field_kind": "input", + "input": "connection", + "orig_required": true + }, + "gemma2_encoder": { + "anyOf": [ + { + "$ref": "#/components/schemas/Gemma2EncoderField" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Gemma-2 caption encoder. Required by PiD.", + "field_kind": "input", + "input": "connection", + "orig_required": true, + "title": "Gemma-2 Encoder" + }, + "pid_decoder": { + "anyOf": [ + { + "$ref": "#/components/schemas/PiDDecoderField" + }, + { + "type": "null" + } + ], + "default": null, + "description": "PiD FLUX decoder checkpoint.", + "field_kind": "input", + "input": "connection", + "orig_required": true, + "title": "PiD Decoder" + }, + "prompt": { + "default": "", + "description": "Optional caption describing the image. 
Empty -> empty-caption decode.", + "field_kind": "input", + "input": "any", + "orig_default": "", + "orig_required": false, + "title": "Prompt", + "type": "string", + "ui_component": "textarea" + }, + "num_inference_steps": { + "default": 4, + "description": "Number of PiD distill steps. The released checkpoints are trained for 4.", + "field_kind": "input", + "input": "any", + "maximum": 8, + "minimum": 1, + "orig_default": 4, + "orig_required": false, + "title": "Num Inference Steps", + "type": "integer" + }, + "seed": { + "default": 0, + "description": "Seed for the PiD decoder's noise.", + "field_kind": "input", + "input": "any", + "orig_default": 0, + "orig_required": false, + "title": "Seed", + "type": "integer" + }, + "type": { + "const": "pid_upscale", + "default": "pid_upscale", + "field_kind": "node_attribute", + "title": "type", + "type": "string" + } + }, + "required": ["type", "id"], + "tags": ["upscale", "image", "pid", "super-resolution", "flux"], + "title": "PiD Upscale (4x) - FLUX VAE", + "type": "object", + "version": "1.0.0", + "output": { + "$ref": "#/components/schemas/ImageOutput" + } + }, + "PiDiNetEdgeDetectionInvocation": { + "category": "controlnet_preprocessors", + "class": "invocation", + "classification": "stable", + "description": "Generates an edge map using PiDiNet.", + "node_pack": "invokeai", + "properties": { + "board": { + "anyOf": [ + { + "$ref": "#/components/schemas/BoardField" + }, + { + "type": "null" + } + ], + "default": null, + "description": "The board to save the image to", + "field_kind": "internal", + "input": "direct", + "orig_required": false, + "ui_hidden": false + }, + "metadata": { + "anyOf": [ + { + "$ref": "#/components/schemas/MetadataField" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Optional metadata to be saved with the image", + "field_kind": "internal", + "input": "connection", + "orig_required": false, + "ui_hidden": false + }, + "id": { + "description": "The id of this 
instance of an invocation. Must be unique among all instances of invocations.", + "field_kind": "node_attribute", + "title": "Id", + "type": "string" + }, + "is_intermediate": { + "default": false, + "description": "Whether or not this is an intermediate invocation.", + "field_kind": "node_attribute", + "input": "direct", + "orig_required": true, + "title": "Is Intermediate", + "type": "boolean", + "ui_hidden": false, + "ui_type": "IsIntermediate" + }, + "use_cache": { + "default": true, + "description": "Whether or not to use the cache", + "field_kind": "node_attribute", + "title": "Use Cache", + "type": "boolean" + }, + "image": { + "anyOf": [ + { + "$ref": "#/components/schemas/ImageField" + }, + { + "type": "null" + } + ], + "default": null, + "description": "The image to process", + "field_kind": "input", + "input": "any", + "orig_required": true + }, + "quantize_edges": { + "default": false, + "description": "Whether or not to use safe mode", + "field_kind": "input", + "input": "any", + "orig_default": false, + "orig_required": false, + "title": "Quantize Edges", + "type": "boolean" + }, + "scribble": { + "default": false, + "description": "Whether or not to use scribble mode", + "field_kind": "input", + "input": "any", + "orig_default": false, + "orig_required": false, + "title": "Scribble", + "type": "boolean" + }, + "type": { + "const": "pidi_edge_detection", + "default": "pidi_edge_detection", + "field_kind": "node_attribute", + "title": "type", + "type": "string" + } + }, + "required": ["type", "id"], + "tags": ["controlnet", "edge"], + "title": "PiDiNet Edge Detection", + "type": "object", + "version": "1.0.0", + "output": { + "$ref": "#/components/schemas/ImageOutput" + } + }, + "PresetData": { + "properties": { + "positive_prompt": { + "type": "string", + "title": "Positive Prompt", + "description": "Positive prompt" + }, + "negative_prompt": { + "type": "string", + "title": "Negative Prompt", + "description": "Negative prompt" + } + }, + 
"additionalProperties": false, + "type": "object", + "required": ["positive_prompt", "negative_prompt"], + "title": "PresetData" + }, + "PresetType": { + "type": "string", + "enum": ["user", "default"], + "title": "PresetType" + }, + "ProgressImage": { + "description": "The progress image sent intermittently during processing", + "properties": { + "width": { + "description": "The effective width of the image in pixels", + "minimum": 1, + "title": "Width", + "type": "integer" + }, + "height": { + "description": "The effective height of the image in pixels", + "minimum": 1, + "title": "Height", + "type": "integer" + }, + "dataURL": { + "description": "The image data as a b64 data URL", + "title": "Dataurl", + "type": "string" + } + }, + "required": ["width", "height", "dataURL"], + "title": "ProgressImage", + "type": "object" + }, + "PromptTemplateInvocation": { + "category": "prompt", + "class": "invocation", + "classification": "stable", + "description": "Applies a Style Preset template to positive and negative prompts.\n\nSelect a Style Preset and provide positive/negative prompts. The node replaces\n{prompt} placeholders in the template with your input prompts.", "node_pack": "invokeai", "properties": { "id": { @@ -60779,127 +62757,310 @@ "title": "Use Cache", "type": "boolean" }, - "model": { - "$ref": "#/components/schemas/ModelIdentifierField", - "description": "Qwen Image Edit model (Transformer) to load", + "model": { + "$ref": "#/components/schemas/ModelIdentifierField", + "description": "Qwen Image Edit model (Transformer) to load", + "field_kind": "input", + "input": "direct", + "orig_required": true, + "title": "Transformer", + "ui_model_base": ["qwen-image"], + "ui_model_type": ["main"] + }, + "vae_model": { + "anyOf": [ + { + "$ref": "#/components/schemas/ModelIdentifierField" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Standalone Qwen Image VAE model. 
If not provided, VAE will be loaded from the Component Source (or from the main model if it is Diffusers).", + "field_kind": "input", + "input": "direct", + "orig_default": null, + "orig_required": false, + "title": "VAE", + "ui_model_base": ["qwen-image"], + "ui_model_type": ["vae"] + }, + "qwen_vl_encoder_model": { + "anyOf": [ + { + "$ref": "#/components/schemas/ModelIdentifierField" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Standalone Qwen2.5-VL encoder model. If not provided, the encoder will be loaded from the Component Source (or from the main model if it is Diffusers).", + "field_kind": "input", + "input": "direct", + "orig_default": null, + "orig_required": false, + "title": "Qwen VL Encoder", + "ui_model_type": ["qwen_vl_encoder"] + }, + "component_source": { + "anyOf": [ + { + "$ref": "#/components/schemas/ModelIdentifierField" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Diffusers Qwen Image model to extract VAE and/or Qwen VL encoder from. Use this if you don't have separate VAE/encoder models. 
Ignored for any submodel that is provided separately.", + "field_kind": "input", + "input": "direct", + "orig_default": null, + "orig_required": false, + "title": "Component Source (Diffusers)", + "ui_model_base": ["qwen-image"], + "ui_model_format": ["diffusers"], + "ui_model_type": ["main"] + }, + "type": { + "const": "qwen_image_model_loader", + "default": "qwen_image_model_loader", + "field_kind": "node_attribute", + "title": "type", + "type": "string" + } + }, + "required": ["model", "type", "id"], + "tags": ["model", "qwen_image"], + "title": "Main Model - Qwen Image", + "type": "object", + "version": "1.2.0", + "output": { + "$ref": "#/components/schemas/QwenImageModelLoaderOutput" + } + }, + "QwenImageModelLoaderOutput": { + "class": "output", + "description": "Qwen Image model loader output.", + "properties": { + "transformer": { + "$ref": "#/components/schemas/TransformerField", + "description": "Transformer", + "field_kind": "output", + "title": "Transformer", + "ui_hidden": false + }, + "qwen_vl_encoder": { + "$ref": "#/components/schemas/QwenVLEncoderField", + "description": "Qwen2.5-VL tokenizer, processor and text/vision encoder", + "field_kind": "output", + "title": "Qwen VL Encoder", + "ui_hidden": false + }, + "vae": { + "$ref": "#/components/schemas/VAEField", + "description": "VAE", + "field_kind": "output", + "title": "VAE", + "ui_hidden": false + }, + "type": { + "const": "qwen_image_model_loader_output", + "default": "qwen_image_model_loader_output", + "field_kind": "node_attribute", + "title": "type", + "type": "string" + } + }, + "required": ["output_meta", "transformer", "qwen_vl_encoder", "vae", "type", "type"], + "title": "QwenImageModelLoaderOutput", + "type": "object" + }, + "QwenImagePiDDecodeInvocation": { + "category": "latents", + "class": "invocation", + "classification": "prototype", + "description": "Decode a Qwen-Image latent with the PiD pixel-diffusion decoder.\n\nProduces a 4x super-resolved image in a single pass. 
The 5D Qwen latent is\nreduced to 2D and per-channel denormalized (``z * std + mean``) before PiD.", + "node_pack": "invokeai", + "properties": { + "board": { + "anyOf": [ + { + "$ref": "#/components/schemas/BoardField" + }, + { + "type": "null" + } + ], + "default": null, + "description": "The board to save the image to", + "field_kind": "internal", + "input": "direct", + "orig_required": false, + "ui_hidden": false + }, + "metadata": { + "anyOf": [ + { + "$ref": "#/components/schemas/MetadataField" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Optional metadata to be saved with the image", + "field_kind": "internal", + "input": "connection", + "orig_required": false, + "ui_hidden": false + }, + "id": { + "description": "The id of this instance of an invocation. Must be unique among all instances of invocations.", + "field_kind": "node_attribute", + "title": "Id", + "type": "string" + }, + "is_intermediate": { + "default": false, + "description": "Whether or not this is an intermediate invocation.", + "field_kind": "node_attribute", + "input": "direct", + "orig_required": true, + "title": "Is Intermediate", + "type": "boolean", + "ui_hidden": false, + "ui_type": "IsIntermediate" + }, + "use_cache": { + "default": true, + "description": "Whether or not to use the cache", + "field_kind": "node_attribute", + "title": "Use Cache", + "type": "boolean" + }, + "latents": { + "anyOf": [ + { + "$ref": "#/components/schemas/LatentsField" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Latents tensor", "field_kind": "input", - "input": "direct", + "input": "connection", + "orig_required": true + }, + "prompt": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Text prompt the latent was generated from. 
PiD conditions on it.", + "field_kind": "input", + "input": "any", "orig_required": true, - "title": "Transformer", - "ui_model_base": ["qwen-image"], - "ui_model_type": ["main"] + "title": "Prompt", + "ui_component": "textarea" }, - "vae_model": { + "gemma2_encoder": { "anyOf": [ { - "$ref": "#/components/schemas/ModelIdentifierField" + "$ref": "#/components/schemas/Gemma2EncoderField" }, { "type": "null" } ], "default": null, - "description": "Standalone Qwen Image VAE model. If not provided, VAE will be loaded from the Component Source (or from the main model if it is Diffusers).", + "description": "Gemma-2 caption encoder. Required by PiD.", "field_kind": "input", - "input": "direct", - "orig_default": null, - "orig_required": false, - "title": "VAE", - "ui_model_base": ["qwen-image"], - "ui_model_type": ["vae"] + "input": "connection", + "orig_required": true, + "title": "Gemma-2 Encoder" }, - "qwen_vl_encoder_model": { + "pid_decoder": { "anyOf": [ { - "$ref": "#/components/schemas/ModelIdentifierField" + "$ref": "#/components/schemas/PiDDecoderField" }, { "type": "null" } ], "default": null, - "description": "Standalone Qwen2.5-VL encoder model. If not provided, the encoder will be loaded from the Component Source (or from the main model if it is Diffusers).", + "description": "PiD Qwen-Image decoder checkpoint.", "field_kind": "input", - "input": "direct", - "orig_default": null, - "orig_required": false, - "title": "Qwen VL Encoder", - "ui_model_type": ["qwen_vl_encoder"] + "input": "connection", + "orig_required": true, + "title": "PiD Decoder" }, - "component_source": { + "vae": { "anyOf": [ { - "$ref": "#/components/schemas/ModelIdentifierField" + "$ref": "#/components/schemas/VAEField" }, { "type": "null" } ], "default": null, - "description": "Diffusers Qwen Image model to extract VAE and/or Qwen VL encoder from. Use this if you don't have separate VAE/encoder models. 
Ignored for any submodel that is provided separately.", + "description": "Qwen-Image VAE, used to read the per-channel latents_mean / latents_std. If omitted, the diffusers default Qwen-Image constants are used.", "field_kind": "input", - "input": "direct", + "input": "connection", "orig_default": null, "orig_required": false, - "title": "Component Source (Diffusers)", - "ui_model_base": ["qwen-image"], - "ui_model_format": ["diffusers"], - "ui_model_type": ["main"] + "title": "VAE" + }, + "num_inference_steps": { + "default": 4, + "description": "Number of PiD distill steps. The released checkpoints are trained for 4.", + "field_kind": "input", + "input": "any", + "maximum": 8, + "minimum": 1, + "orig_default": 4, + "orig_required": false, + "title": "Num Inference Steps", + "type": "integer" + }, + "seed": { + "default": 0, + "description": "Seed for the PiD decoder's noise.", + "field_kind": "input", + "input": "any", + "orig_default": 0, + "orig_required": false, + "title": "Seed", + "type": "integer" }, "type": { - "const": "qwen_image_model_loader", - "default": "qwen_image_model_loader", + "const": "qwen_image_pid_decode", + "default": "qwen_image_pid_decode", "field_kind": "node_attribute", "title": "type", "type": "string" } }, - "required": ["model", "type", "id"], - "tags": ["model", "qwen_image"], - "title": "Main Model - Qwen Image", + "required": ["type", "id"], + "tags": ["latents", "image", "pid", "qwen-image", "upscale"], + "title": "Latents to Image - Qwen-Image + PiD (4x SR)", "type": "object", - "version": "1.2.0", + "version": "1.0.0", "output": { - "$ref": "#/components/schemas/QwenImageModelLoaderOutput" + "$ref": "#/components/schemas/ImageOutput" } }, - "QwenImageModelLoaderOutput": { - "class": "output", - "description": "Qwen Image model loader output.", - "properties": { - "transformer": { - "$ref": "#/components/schemas/TransformerField", - "description": "Transformer", - "field_kind": "output", - "title": "Transformer", - "ui_hidden": 
false - }, - "qwen_vl_encoder": { - "$ref": "#/components/schemas/QwenVLEncoderField", - "description": "Qwen2.5-VL tokenizer, processor and text/vision encoder", - "field_kind": "output", - "title": "Qwen VL Encoder", - "ui_hidden": false - }, - "vae": { - "$ref": "#/components/schemas/VAEField", - "description": "VAE", - "field_kind": "output", - "title": "VAE", - "ui_hidden": false - }, - "type": { - "const": "qwen_image_model_loader_output", - "default": "qwen_image_model_loader_output", - "field_kind": "node_attribute", - "title": "type", - "type": "string" - } - }, - "required": ["output_meta", "transformer", "qwen_vl_encoder", "vae", "type", "type"], - "title": "QwenImageModelLoaderOutput", - "type": "object" - }, "QwenImageTextEncoderInvocation": { "category": "conditioning", "class": "invocation", @@ -62907,159 +65068,325 @@ "title": "Use Cache", "type": "boolean" }, - "image": { - "anyOf": [ - { - "$ref": "#/components/schemas/ImageField" - }, - { - "type": "null" - } - ], - "default": null, - "description": "The image to encode", - "field_kind": "input", - "input": "any", - "orig_required": true - }, - "vae": { + "image": { + "anyOf": [ + { + "$ref": "#/components/schemas/ImageField" + }, + { + "type": "null" + } + ], + "default": null, + "description": "The image to encode", + "field_kind": "input", + "input": "any", + "orig_required": true + }, + "vae": { + "anyOf": [ + { + "$ref": "#/components/schemas/VAEField" + }, + { + "type": "null" + } + ], + "default": null, + "description": "VAE", + "field_kind": "input", + "input": "connection", + "orig_required": true + }, + "type": { + "const": "sd3_i2l", + "default": "sd3_i2l", + "field_kind": "node_attribute", + "title": "type", + "type": "string" + } + }, + "required": ["type", "id"], + "tags": ["image", "latents", "vae", "i2l", "sd3"], + "title": "Image to Latents - SD3", + "type": "object", + "version": "1.0.1", + "output": { + "$ref": "#/components/schemas/LatentsOutput" + } + }, + 
"SD3LatentsToImageInvocation": { + "category": "latents", + "class": "invocation", + "classification": "stable", + "description": "Generates an image from latents.", + "node_pack": "invokeai", + "properties": { + "board": { + "anyOf": [ + { + "$ref": "#/components/schemas/BoardField" + }, + { + "type": "null" + } + ], + "default": null, + "description": "The board to save the image to", + "field_kind": "internal", + "input": "direct", + "orig_required": false, + "ui_hidden": false + }, + "metadata": { + "anyOf": [ + { + "$ref": "#/components/schemas/MetadataField" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Optional metadata to be saved with the image", + "field_kind": "internal", + "input": "connection", + "orig_required": false, + "ui_hidden": false + }, + "id": { + "description": "The id of this instance of an invocation. Must be unique among all instances of invocations.", + "field_kind": "node_attribute", + "title": "Id", + "type": "string" + }, + "is_intermediate": { + "default": false, + "description": "Whether or not this is an intermediate invocation.", + "field_kind": "node_attribute", + "input": "direct", + "orig_required": true, + "title": "Is Intermediate", + "type": "boolean", + "ui_hidden": false, + "ui_type": "IsIntermediate" + }, + "use_cache": { + "default": true, + "description": "Whether or not to use the cache", + "field_kind": "node_attribute", + "title": "Use Cache", + "type": "boolean" + }, + "latents": { + "anyOf": [ + { + "$ref": "#/components/schemas/LatentsField" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Latents tensor", + "field_kind": "input", + "input": "connection", + "orig_required": true + }, + "vae": { + "anyOf": [ + { + "$ref": "#/components/schemas/VAEField" + }, + { + "type": "null" + } + ], + "default": null, + "description": "VAE", + "field_kind": "input", + "input": "connection", + "orig_required": true + }, + "type": { + "const": "sd3_l2i", + "default": "sd3_l2i", + 
"field_kind": "node_attribute", + "title": "type", + "type": "string" + } + }, + "required": ["type", "id"], + "tags": ["latents", "image", "vae", "l2i", "sd3"], + "title": "Latents to Image - SD3", + "type": "object", + "version": "1.3.2", + "output": { + "$ref": "#/components/schemas/ImageOutput" + } + }, + "SD3PiDDecodeInvocation": { + "category": "latents", + "class": "invocation", + "classification": "prototype", + "description": "Decode an SD3 latent with the PiD pixel-diffusion decoder.", + "node_pack": "invokeai", + "properties": { + "board": { + "anyOf": [ + { + "$ref": "#/components/schemas/BoardField" + }, + { + "type": "null" + } + ], + "default": null, + "description": "The board to save the image to", + "field_kind": "internal", + "input": "direct", + "orig_required": false, + "ui_hidden": false + }, + "metadata": { + "anyOf": [ + { + "$ref": "#/components/schemas/MetadataField" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Optional metadata to be saved with the image", + "field_kind": "internal", + "input": "connection", + "orig_required": false, + "ui_hidden": false + }, + "id": { + "description": "The id of this instance of an invocation. 
Must be unique among all instances of invocations.", + "field_kind": "node_attribute", + "title": "Id", + "type": "string" + }, + "is_intermediate": { + "default": false, + "description": "Whether or not this is an intermediate invocation.", + "field_kind": "node_attribute", + "input": "direct", + "orig_required": true, + "title": "Is Intermediate", + "type": "boolean", + "ui_hidden": false, + "ui_type": "IsIntermediate" + }, + "use_cache": { + "default": true, + "description": "Whether or not to use the cache", + "field_kind": "node_attribute", + "title": "Use Cache", + "type": "boolean" + }, + "latents": { "anyOf": [ { - "$ref": "#/components/schemas/VAEField" + "$ref": "#/components/schemas/LatentsField" }, { "type": "null" } ], "default": null, - "description": "VAE", + "description": "Latents tensor", "field_kind": "input", "input": "connection", "orig_required": true }, - "type": { - "const": "sd3_i2l", - "default": "sd3_i2l", - "field_kind": "node_attribute", - "title": "type", - "type": "string" - } - }, - "required": ["type", "id"], - "tags": ["image", "latents", "vae", "i2l", "sd3"], - "title": "Image to Latents - SD3", - "type": "object", - "version": "1.0.1", - "output": { - "$ref": "#/components/schemas/LatentsOutput" - } - }, - "SD3LatentsToImageInvocation": { - "category": "latents", - "class": "invocation", - "classification": "stable", - "description": "Generates an image from latents.", - "node_pack": "invokeai", - "properties": { - "board": { + "prompt": { "anyOf": [ { - "$ref": "#/components/schemas/BoardField" + "type": "string" }, { "type": "null" } ], "default": null, - "description": "The board to save the image to", - "field_kind": "internal", - "input": "direct", - "orig_required": false, - "ui_hidden": false + "description": "Text prompt the latent was generated from. 
PiD conditions on it.", + "field_kind": "input", + "input": "any", + "orig_required": true, + "title": "Prompt", + "ui_component": "textarea" }, - "metadata": { + "gemma2_encoder": { "anyOf": [ { - "$ref": "#/components/schemas/MetadataField" + "$ref": "#/components/schemas/Gemma2EncoderField" }, { "type": "null" } ], "default": null, - "description": "Optional metadata to be saved with the image", - "field_kind": "internal", + "description": "Gemma-2 caption encoder. Required by PiD.", + "field_kind": "input", "input": "connection", - "orig_required": false, - "ui_hidden": false - }, - "id": { - "description": "The id of this instance of an invocation. Must be unique among all instances of invocations.", - "field_kind": "node_attribute", - "title": "Id", - "type": "string" - }, - "is_intermediate": { - "default": false, - "description": "Whether or not this is an intermediate invocation.", - "field_kind": "node_attribute", - "input": "direct", "orig_required": true, - "title": "Is Intermediate", - "type": "boolean", - "ui_hidden": false, - "ui_type": "IsIntermediate" - }, - "use_cache": { - "default": true, - "description": "Whether or not to use the cache", - "field_kind": "node_attribute", - "title": "Use Cache", - "type": "boolean" + "title": "Gemma-2 Encoder" }, - "latents": { + "pid_decoder": { "anyOf": [ { - "$ref": "#/components/schemas/LatentsField" + "$ref": "#/components/schemas/PiDDecoderField" }, { "type": "null" } ], "default": null, - "description": "Latents tensor", + "description": "PiD SD3 decoder checkpoint.", "field_kind": "input", "input": "connection", - "orig_required": true + "orig_required": true, + "title": "PiD Decoder" }, - "vae": { - "anyOf": [ - { - "$ref": "#/components/schemas/VAEField" - }, - { - "type": "null" - } - ], - "default": null, - "description": "VAE", + "num_inference_steps": { + "default": 4, + "description": "Number of PiD distill steps. 
The released checkpoints are trained for 4.", "field_kind": "input", - "input": "connection", - "orig_required": true + "input": "any", + "maximum": 8, + "minimum": 1, + "orig_default": 4, + "orig_required": false, + "title": "Num Inference Steps", + "type": "integer" + }, + "seed": { + "default": 0, + "description": "Seed for the PiD decoder's noise.", + "field_kind": "input", + "input": "any", + "orig_default": 0, + "orig_required": false, + "title": "Seed", + "type": "integer" }, "type": { - "const": "sd3_l2i", - "default": "sd3_l2i", + "const": "sd3_pid_decode", + "default": "sd3_pid_decode", "field_kind": "node_attribute", "title": "type", "type": "string" } }, "required": ["type", "id"], - "tags": ["latents", "image", "vae", "l2i", "sd3"], - "title": "Latents to Image - SD3", + "tags": ["latents", "image", "pid", "sd3", "upscale"], + "title": "Latents to Image - SD3 + PiD (4x SR)", "type": "object", - "version": "1.3.2", + "version": "1.0.0", "output": { "$ref": "#/components/schemas/ImageOutput" } @@ -63583,83 +65910,266 @@ "title": "Use Cache", "type": "boolean" }, - "model": { + "model": { + "anyOf": [ + { + "$ref": "#/components/schemas/ModelIdentifierField" + }, + { + "type": "null" + } + ], + "default": null, + "description": "SDXL Main model (UNet, VAE, CLIP1, CLIP2) to load", + "field_kind": "input", + "input": "any", + "orig_required": true, + "ui_model_base": ["sdxl"], + "ui_model_type": ["main"] + }, + "type": { + "const": "sdxl_model_loader", + "default": "sdxl_model_loader", + "field_kind": "node_attribute", + "title": "type", + "type": "string" + } + }, + "required": ["type", "id"], + "tags": ["model", "sdxl"], + "title": "Main Model - SDXL", + "type": "object", + "version": "1.0.4", + "output": { + "$ref": "#/components/schemas/SDXLModelLoaderOutput" + } + }, + "SDXLModelLoaderOutput": { + "class": "output", + "description": "SDXL base model loader output", + "properties": { + "unet": { + "$ref": "#/components/schemas/UNetField", + 
"description": "UNet (scheduler, LoRAs)", + "field_kind": "output", + "title": "UNet", + "ui_hidden": false + }, + "clip": { + "$ref": "#/components/schemas/CLIPField", + "description": "CLIP (tokenizer, text encoder, LoRAs) and skipped layer count", + "field_kind": "output", + "title": "CLIP 1", + "ui_hidden": false + }, + "clip2": { + "$ref": "#/components/schemas/CLIPField", + "description": "CLIP (tokenizer, text encoder, LoRAs) and skipped layer count", + "field_kind": "output", + "title": "CLIP 2", + "ui_hidden": false + }, + "vae": { + "$ref": "#/components/schemas/VAEField", + "description": "VAE", + "field_kind": "output", + "title": "VAE", + "ui_hidden": false + }, + "type": { + "const": "sdxl_model_loader_output", + "default": "sdxl_model_loader_output", + "field_kind": "node_attribute", + "title": "type", + "type": "string" + } + }, + "required": ["output_meta", "unet", "clip", "clip2", "vae", "type", "type"], + "title": "SDXLModelLoaderOutput", + "type": "object" + }, + "SDXLPiDDecodeInvocation": { + "category": "latents", + "class": "invocation", + "classification": "prototype", + "description": "Decode an SDXL latent with the PiD pixel-diffusion decoder.\n\nProduces a 4x super-resolved image in a single pass. 
The SDXL latent is\n4-channel at an 8x down-factor, so it is denormalized (``z / scaling_factor``)\nand handed straight to PiD - no packing needed.", + "node_pack": "invokeai", + "properties": { + "board": { + "anyOf": [ + { + "$ref": "#/components/schemas/BoardField" + }, + { + "type": "null" + } + ], + "default": null, + "description": "The board to save the image to", + "field_kind": "internal", + "input": "direct", + "orig_required": false, + "ui_hidden": false + }, + "metadata": { + "anyOf": [ + { + "$ref": "#/components/schemas/MetadataField" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Optional metadata to be saved with the image", + "field_kind": "internal", + "input": "connection", + "orig_required": false, + "ui_hidden": false + }, + "id": { + "description": "The id of this instance of an invocation. Must be unique among all instances of invocations.", + "field_kind": "node_attribute", + "title": "Id", + "type": "string" + }, + "is_intermediate": { + "default": false, + "description": "Whether or not this is an intermediate invocation.", + "field_kind": "node_attribute", + "input": "direct", + "orig_required": true, + "title": "Is Intermediate", + "type": "boolean", + "ui_hidden": false, + "ui_type": "IsIntermediate" + }, + "use_cache": { + "default": true, + "description": "Whether or not to use the cache", + "field_kind": "node_attribute", + "title": "Use Cache", + "type": "boolean" + }, + "latents": { "anyOf": [ { - "$ref": "#/components/schemas/ModelIdentifierField" + "$ref": "#/components/schemas/LatentsField" }, { "type": "null" } ], "default": null, - "description": "SDXL Main model (UNet, VAE, CLIP1, CLIP2) to load", + "description": "Latents tensor", + "field_kind": "input", + "input": "connection", + "orig_required": true + }, + "prompt": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Text prompt the latent was generated from. 
PiD conditions on it.", "field_kind": "input", "input": "any", "orig_required": true, - "ui_model_base": ["sdxl"], - "ui_model_type": ["main"] - }, - "type": { - "const": "sdxl_model_loader", - "default": "sdxl_model_loader", - "field_kind": "node_attribute", - "title": "type", - "type": "string" - } - }, - "required": ["type", "id"], - "tags": ["model", "sdxl"], - "title": "Main Model - SDXL", - "type": "object", - "version": "1.0.4", - "output": { - "$ref": "#/components/schemas/SDXLModelLoaderOutput" - } - }, - "SDXLModelLoaderOutput": { - "class": "output", - "description": "SDXL base model loader output", - "properties": { - "unet": { - "$ref": "#/components/schemas/UNetField", - "description": "UNet (scheduler, LoRAs)", - "field_kind": "output", - "title": "UNet", - "ui_hidden": false + "title": "Prompt", + "ui_component": "textarea" }, - "clip": { - "$ref": "#/components/schemas/CLIPField", - "description": "CLIP (tokenizer, text encoder, LoRAs) and skipped layer count", - "field_kind": "output", - "title": "CLIP 1", - "ui_hidden": false + "gemma2_encoder": { + "anyOf": [ + { + "$ref": "#/components/schemas/Gemma2EncoderField" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Gemma-2 caption encoder. 
Required by PiD.", + "field_kind": "input", + "input": "connection", + "orig_required": true, + "title": "Gemma-2 Encoder" }, - "clip2": { - "$ref": "#/components/schemas/CLIPField", - "description": "CLIP (tokenizer, text encoder, LoRAs) and skipped layer count", - "field_kind": "output", - "title": "CLIP 2", - "ui_hidden": false + "pid_decoder": { + "anyOf": [ + { + "$ref": "#/components/schemas/PiDDecoderField" + }, + { + "type": "null" + } + ], + "default": null, + "description": "PiD SDXL decoder checkpoint.", + "field_kind": "input", + "input": "connection", + "orig_required": true, + "title": "PiD Decoder" }, "vae": { - "$ref": "#/components/schemas/VAEField", - "description": "VAE", - "field_kind": "output", - "title": "VAE", - "ui_hidden": false + "anyOf": [ + { + "$ref": "#/components/schemas/VAEField" + }, + { + "type": "null" + } + ], + "default": null, + "description": "SDXL VAE, used to read scaling_factor / shift_factor. If omitted, the SDXL fallback constants (0.13025 / 0.0) are used.", + "field_kind": "input", + "input": "connection", + "orig_default": null, + "orig_required": false, + "title": "VAE" + }, + "num_inference_steps": { + "default": 4, + "description": "Number of PiD distill steps. 
The released checkpoints are trained for 4.", + "field_kind": "input", + "input": "any", + "maximum": 8, + "minimum": 1, + "orig_default": 4, + "orig_required": false, + "title": "Num Inference Steps", + "type": "integer" + }, + "seed": { + "default": 0, + "description": "Seed for the PiD decoder's noise.", + "field_kind": "input", + "input": "any", + "orig_default": 0, + "orig_required": false, + "title": "Seed", + "type": "integer" }, "type": { - "const": "sdxl_model_loader_output", - "default": "sdxl_model_loader_output", + "const": "sdxl_pid_decode", + "default": "sdxl_pid_decode", "field_kind": "node_attribute", "title": "type", "type": "string" } }, - "required": ["output_meta", "unet", "clip", "clip2", "vae", "type", "type"], - "title": "SDXLModelLoaderOutput", - "type": "object" + "required": ["type", "id"], + "tags": ["latents", "image", "pid", "sdxl", "upscale"], + "title": "Latents to Image - SDXL + PiD (4x SR)", + "type": "object", + "version": "1.0.0", + "output": { + "$ref": "#/components/schemas/ImageOutput" + } }, "SDXLRefinerCompelPromptInvocation": { "category": "prompt", @@ -66439,6 +68949,9 @@ { "$ref": "#/components/schemas/Qwen3VariantType" }, + { + "$ref": "#/components/schemas/PiDDecoderVariantType" + }, { "type": "null" } @@ -66599,6 +69112,9 @@ { "$ref": "#/components/schemas/Qwen3VariantType" }, + { + "$ref": "#/components/schemas/PiDDecoderVariantType" + }, { "type": "null" } @@ -67531,6 +70047,9 @@ { "$ref": "#/components/schemas/Qwen3VariantType" }, + { + "$ref": "#/components/schemas/PiDDecoderVariantType" + }, { "type": "null" } @@ -74203,6 +76722,189 @@ "title": "ZImageModelLoaderOutput", "type": "object" }, + "ZImagePiDDecodeInvocation": { + "category": "latents", + "class": "invocation", + "classification": "prototype", + "description": "Decode a Z-Image latent with the PiD pixel-diffusion decoder.\n\nProduces a 4x super-resolved image in a single pass (Z-Image decoder is\ntrained on FLUX.1 latents; ``sr_scale=4`` with the FLUX 
VAE's 8x spatial\ndown-factor gives a 32x linear scale from latent to pixel).", + "node_pack": "invokeai", + "properties": { + "board": { + "anyOf": [ + { + "$ref": "#/components/schemas/BoardField" + }, + { + "type": "null" + } + ], + "default": null, + "description": "The board to save the image to", + "field_kind": "internal", + "input": "direct", + "orig_required": false, + "ui_hidden": false + }, + "metadata": { + "anyOf": [ + { + "$ref": "#/components/schemas/MetadataField" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Optional metadata to be saved with the image", + "field_kind": "internal", + "input": "connection", + "orig_required": false, + "ui_hidden": false + }, + "id": { + "description": "The id of this instance of an invocation. Must be unique among all instances of invocations.", + "field_kind": "node_attribute", + "title": "Id", + "type": "string" + }, + "is_intermediate": { + "default": false, + "description": "Whether or not this is an intermediate invocation.", + "field_kind": "node_attribute", + "input": "direct", + "orig_required": true, + "title": "Is Intermediate", + "type": "boolean", + "ui_hidden": false, + "ui_type": "IsIntermediate" + }, + "use_cache": { + "default": true, + "description": "Whether or not to use the cache", + "field_kind": "node_attribute", + "title": "Use Cache", + "type": "boolean" + }, + "latents": { + "anyOf": [ + { + "$ref": "#/components/schemas/LatentsField" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Latents tensor", + "field_kind": "input", + "input": "connection", + "orig_required": true + }, + "prompt": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Text prompt the latent was generated from. 
PiD conditions on it.", + "field_kind": "input", + "input": "any", + "orig_required": true, + "title": "Prompt", + "ui_component": "textarea" + }, + "gemma2_encoder": { + "anyOf": [ + { + "$ref": "#/components/schemas/Gemma2EncoderField" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Gemma-2 caption encoder. Required by PiD.", + "field_kind": "input", + "input": "connection", + "orig_required": true, + "title": "Gemma-2 Encoder" + }, + "pid_decoder": { + "anyOf": [ + { + "$ref": "#/components/schemas/PiDDecoderField" + }, + { + "type": "null" + } + ], + "default": null, + "description": "PiD FLUX decoder checkpoint.", + "field_kind": "input", + "input": "connection", + "orig_required": true, + "title": "PiD Decoder" + }, + "vae": { + "anyOf": [ + { + "$ref": "#/components/schemas/VAEField" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Z-Image VAE used to read scaling_factor / shift_factor. If omitted, the FLUX.1 fallback constants (0.3611 / 0.1159) are used.", + "field_kind": "input", + "input": "connection", + "orig_default": null, + "orig_required": false, + "title": "VAE" + }, + "num_inference_steps": { + "default": 4, + "description": "Number of PiD distill steps. 
The released checkpoints are trained for 4.", + "field_kind": "input", + "input": "any", + "maximum": 8, + "minimum": 1, + "orig_default": 4, + "orig_required": false, + "title": "Num Inference Steps", + "type": "integer" + }, + "seed": { + "default": 0, + "description": "Seed for the PiD decoder's noise.", + "field_kind": "input", + "input": "any", + "orig_default": 0, + "orig_required": false, + "title": "Seed", + "type": "integer" + }, + "type": { + "const": "z_image_pid_decode", + "default": "z_image_pid_decode", + "field_kind": "node_attribute", + "title": "type", + "type": "string" + } + }, + "required": ["type", "id"], + "tags": ["latents", "image", "pid", "z-image", "upscale"], + "title": "Latents to Image - Z-Image + PiD (4x SR)", + "type": "object", + "version": "1.0.0", + "output": { + "$ref": "#/components/schemas/ImageOutput" + } + }, "ZImageSeedVarianceEnhancerInvocation": { "category": "prompt", "class": "invocation", diff --git a/invokeai/frontend/web/public/locales/en.json b/invokeai/frontend/web/public/locales/en.json index fdb9391f976..a4619b8a9e5 100644 --- a/invokeai/frontend/web/public/locales/en.json +++ b/invokeai/frontend/web/public/locales/en.json @@ -1354,6 +1354,12 @@ "selectModelToView": "Select a model to view its details", "typePhraseHere": "Type phrase here", "t5Encoder": "T5 Encoder", + "gemma2Encoder": "Gemma-2 Encoder", + "pidDecoder": "PiD Decoder", + "pidMode": "PiD Decode", + "pidModeOff": "Off", + "pidModeFit": "On (Fit to size)", + "pidModeNative": "On (Native 4×)", "qwen3Encoder": "Qwen3 Encoder", "qwenVLEncoder": "Qwen2.5-VL Encoder", "animaVae": "VAE", @@ -1678,6 +1684,10 @@ "noStartingFrameImage": "No starting frame image", "noT5EncoderModelSelected": "No T5 Encoder model selected for FLUX generation", "noFLUXVAEModelSelected": "No VAE model selected for FLUX generation", + "noPidDecoderModelSelected": "No PiD decoder model selected", + "noGemma2EncoderModelSelected": "No Gemma-2 encoder model selected (required by PiD)", 
+ "pidScaleBeforeProcessingMustBeOff": "Turn off Scale Before Processing (set it to None) to use PiD decode", + "pidIncompatibleWithRefiner": "PiD decode is not compatible with the SDXL Refiner. Disable one of them.", "noCLIPEmbedModelSelected": "No CLIP Embed model selected for FLUX generation", "noQwen3EncoderModelSelected": "No Qwen3 Encoder model selected for FLUX2 Klein generation", "noFlux2KleinVaeModelSelected": "No VAE selected. Non-diffusers FLUX.2 Klein models require a standalone VAE", @@ -1990,6 +2000,8 @@ "imagenIncompatibleGenerationMode": "Google {{model}} supports Text to Image only. Use other models for Image to Image, Inpainting and Outpainting tasks.", "chatGPT4oIncompatibleGenerationMode": "ChatGPT 4o supports Text to Image and Image to Image only. Use other models Inpainting and Outpainting tasks.", "fluxKontextIncompatibleGenerationMode": "FLUX Kontext does not support generation from images placed on the canvas. Re-try using the Reference Image section and disable any Raster Layers.", + "pidUnsupportedMode": "PiD decode currently supports Text to Image and Image to Image only. Disable PiD for Inpaint/Outpaint.", + "pidScaleBeforeProcessingOff": "Turn off Scale Before Processing (set it to None) to use PiD decode.", "problemUnpublishingWorkflow": "Problem Unpublishing Workflow", "problemUnpublishingWorkflowDescription": "There was a problem unpublishing the workflow. Please try again.", "workflowUnpublished": "Workflow Unpublished", @@ -2057,6 +2069,15 @@ "0.5: Gentler schedule for resolutions just above native (1024px)." ] }, + "pidMode": { + "heading": "PiD Decode (Super-Resolution Decoder)", + "paragraphs": [ + "PiD replaces the standard VAE decode with NVIDIA's Pixel Diffusion Decoder, a diffusion-based 4x super-resolution decoder. 
It requires a PiD decoder model and a Gemma-2 caption encoder.", + "Fit: generate at your chosen resolution, PiD decodes it 4x, then downscales back to that size - extra detail at the same output size, and it composites onto the canvas (works for Image to Image too).", + "Native (4x): your dimensions are the 4x target. Generation runs at a quarter of them (e.g. 512 -> 2048) and PiD's full 4x output is used directly. The PiD decoders are trained for 2K output (512px sources), with 2K-to-4K variants for 4K.", + "Because PiD's diffusion decode reconstructs detail, you can usually lower the generation Steps to save time. 'Scale Before Processing' must be set to None while PiD is enabled." + ] + }, "seedVarianceEnhancer": { "heading": "Seed Variance Enhancer", "paragraphs": [ diff --git a/invokeai/frontend/web/src/common/components/InformationalPopover/constants.ts b/invokeai/frontend/web/src/common/components/InformationalPopover/constants.ts index e9d855648ad..e97466d51e5 100644 --- a/invokeai/frontend/web/src/common/components/InformationalPopover/constants.ts +++ b/invokeai/frontend/web/src/common/components/InformationalPopover/constants.ts @@ -6,6 +6,7 @@ export type Feature = | 'fluxDypePreset' | 'fluxDypeScale' | 'fluxDypeExponent' + | 'pidMode' | 'hrf' | 'paramNegativeConditioning' | 'paramPositiveConditioning' @@ -102,6 +103,10 @@ export const POPOVER_DATA: { [key in Feature]?: PopoverData } = { fluxDypeExponent: { placement: 'right', }, + pidMode: { + placement: 'right', + href: 'https://github.com/nv-tlabs/PiD', + }, inpainting: { href: 'https://support.invoke.ai/support/solutions/articles/151000096702-inpainting-outpainting-and-bounding-box', }, diff --git a/invokeai/frontend/web/src/features/controlLayers/store/paramsSlice.test.ts b/invokeai/frontend/web/src/features/controlLayers/store/paramsSlice.test.ts index d210d2fd2ac..9f9824f6ab2 100644 --- a/invokeai/frontend/web/src/features/controlLayers/store/paramsSlice.test.ts +++ 
b/invokeai/frontend/web/src/features/controlLayers/store/paramsSlice.test.ts @@ -157,7 +157,8 @@ describe('paramsSliceConfig persisted state migration', () => { const result = migrate?.(v2State) as ReturnType; - expect(result._version).toBe(3); + // v2 migrates all the way through the current chain (v2 -> v3 adds Qwen fields, v3 -> v4 adds PiD fields). + expect(result._version).toBe(4); expect(result.qwenImageVaeModel).toBeNull(); expect(result.qwenImageQwenVLEncoderModel).toBeNull(); // Existing params should be preserved diff --git a/invokeai/frontend/web/src/features/controlLayers/store/paramsSlice.ts b/invokeai/frontend/web/src/features/controlLayers/store/paramsSlice.ts index c4c90cf98e7..de17d4676f8 100644 --- a/invokeai/frontend/web/src/features/controlLayers/store/paramsSlice.ts +++ b/invokeai/frontend/web/src/features/controlLayers/store/paramsSlice.ts @@ -11,6 +11,7 @@ import type { AspectRatioID, InfillMethod, ParamsState, + PidMode, PromptHistoryItem, RgbaColor, } from 'features/controlLayers/store/types'; @@ -49,7 +50,12 @@ import type { ParameterVAEModel, } from 'features/parameters/types/parameterSchemas'; import { getExternalPanelControl, hasExternalPanelControl } from 'features/parameters/util/externalPanelSchema'; -import { getGridSize, getIsSizeOptimal, getOptimalDimension } from 'features/parameters/util/optimalDimension'; +import { + getGridSize, + getIsSizeOptimal, + getOptimalDimension, + getPidScale, +} from 'features/parameters/util/optimalDimension'; import { modelConfigsAdapterSelectors, selectModelConfigsQuery } from 'services/api/endpoints/models'; import type { AnyModelConfigWithExternal } from 'services/api/types'; import { isExternalApiModelConfig, isNonRefinerMainModelConfig } from 'services/api/types'; @@ -260,6 +266,39 @@ const slice = createSlice({ } state.kleinQwen3EncoderModel = result.data; }, + pidModeChanged: (state, action: PayloadAction) => { + const prevPidScale = getPidScale(state.pidMode); + const nextPidScale = 
getPidScale(action.payload); + state.pidMode = action.payload; + // Entering/leaving native mode reinterprets the dimensions (4x target <-> generation resolution), so + // re-fit them to the new mode's optimal target on the new grid, preserving aspect ratio. + if (prevPidScale !== nextPidScale) { + const base = state.model?.base as BaseModelType | undefined; + const optimalDimension = getOptimalDimension(base, nextPidScale); + const { width, height } = calculateNewSize( + state.dimensions.aspectRatio.value, + optimalDimension * optimalDimension, + base, + nextPidScale + ); + state.dimensions.width = width; + state.dimensions.height = height; + } + }, + pidDecoderModelSelected: (state, action: PayloadAction<{ key: string; name: string; base: string } | null>) => { + const result = zParamsState.shape.pidDecoderModel.safeParse(action.payload); + if (!result.success) { + return; + } + state.pidDecoderModel = result.data; + }, + gemma2EncoderModelSelected: (state, action: PayloadAction<{ key: string; name: string; base: string } | null>) => { + const result = zParamsState.shape.gemma2EncoderModel.safeParse(action.payload); + if (!result.success) { + return; + } + state.gemma2EncoderModel = result.data; + }, qwenImageComponentSourceSelected: (state, action: PayloadAction) => { const result = zParamsState.shape.qwenImageComponentSource.safeParse(action.payload); if (!result.success) { @@ -392,7 +431,7 @@ const slice = createSlice({ //#region Dimensions sizeRecalled: (state, action: PayloadAction<{ width: number; height: number }>) => { const { width, height } = action.payload; - const gridSize = getGridSize(state.model?.base as BaseModelType | undefined); + const gridSize = getGridSize(state.model?.base as BaseModelType | undefined, getPidScale(state.pidMode)); state.dimensions.width = Math.max(roundDownToMultiple(width, gridSize), 64); state.dimensions.height = Math.max(roundDownToMultiple(height, gridSize), 64); state.dimensions.aspectRatio.value = 
state.dimensions.width / state.dimensions.height; @@ -401,7 +440,7 @@ const slice = createSlice({ }, widthChanged: (state, action: PayloadAction<{ width: number; updateAspectRatio?: boolean; clamp?: boolean }>) => { const { width, updateAspectRatio, clamp } = action.payload; - const gridSize = getGridSize(state.model?.base as BaseModelType | undefined); + const gridSize = getGridSize(state.model?.base as BaseModelType | undefined, getPidScale(state.pidMode)); state.dimensions.width = clamp ? Math.max(roundDownToMultiple(width, gridSize), 64) : width; if (state.dimensions.aspectRatio.isLocked) { @@ -419,7 +458,7 @@ const slice = createSlice({ }, heightChanged: (state, action: PayloadAction<{ height: number; updateAspectRatio?: boolean; clamp?: boolean }>) => { const { height, updateAspectRatio, clamp } = action.payload; - const gridSize = getGridSize(state.model?.base as BaseModelType | undefined); + const gridSize = getGridSize(state.model?.base as BaseModelType | undefined, getPidScale(state.pidMode)); state.dimensions.height = clamp ? 
Math.max(roundDownToMultiple(height, gridSize), 64) : height; if (state.dimensions.aspectRatio.isLocked) { @@ -457,7 +496,8 @@ const slice = createSlice({ const { width, height } = calculateNewSize( state.dimensions.aspectRatio.value, state.dimensions.width * state.dimensions.height, - state.model?.base as BaseModelType | undefined + state.model?.base as BaseModelType | undefined, + getPidScale(state.pidMode) ); state.dimensions.width = width; state.dimensions.height = height; @@ -475,7 +515,8 @@ const slice = createSlice({ const { width, height } = calculateNewSize( state.dimensions.aspectRatio.value, state.dimensions.width * state.dimensions.height, - state.model?.base as BaseModelType | undefined + state.model?.base as BaseModelType | undefined, + getPidScale(state.pidMode) ); state.dimensions.width = width; state.dimensions.height = height; @@ -483,12 +524,14 @@ const slice = createSlice({ } }, sizeOptimized: (state) => { - const optimalDimension = getOptimalDimension(state.model?.base as BaseModelType | undefined); + const pidScale = getPidScale(state.pidMode); + const optimalDimension = getOptimalDimension(state.model?.base as BaseModelType | undefined, pidScale); if (state.dimensions.aspectRatio.isLocked) { const { width, height } = calculateNewSize( state.dimensions.aspectRatio.value, optimalDimension * optimalDimension, - state.model?.base as BaseModelType | undefined + state.model?.base as BaseModelType | undefined, + pidScale ); state.dimensions.width = width; state.dimensions.height = height; @@ -499,19 +542,22 @@ const slice = createSlice({ } }, syncedToOptimalDimension: (state) => { - const optimalDimension = getOptimalDimension(state.model?.base as BaseModelType | undefined); + const pidScale = getPidScale(state.pidMode); + const optimalDimension = getOptimalDimension(state.model?.base as BaseModelType | undefined, pidScale); if ( !getIsSizeOptimal( state.dimensions.width, state.dimensions.height, - state.model?.base as BaseModelType | undefined + 
state.model?.base as BaseModelType | undefined, + pidScale ) ) { const bboxDims = calculateNewSize( state.dimensions.aspectRatio.value, optimalDimension * optimalDimension, - state.model?.base as BaseModelType | undefined + state.model?.base as BaseModelType | undefined, + pidScale ); state.dimensions.width = bboxDims.width; state.dimensions.height = bboxDims.height; @@ -616,6 +662,10 @@ const resetState = (state: ParamsState): ParamsState => { newState.animaQwen3EncoderModel = oldState.animaQwen3EncoderModel; newState.kleinVaeModel = oldState.kleinVaeModel; newState.kleinQwen3EncoderModel = oldState.kleinQwen3EncoderModel; + newState.pidMode = oldState.pidMode; + newState.pidDecoderModel = oldState.pidDecoderModel; + newState.gemma2EncoderModel = oldState.gemma2EncoderModel; + newState.pidSteps = oldState.pidSteps; newState.qwenImageComponentSource = oldState.qwenImageComponentSource; newState.qwenImageVaeModel = oldState.qwenImageVaeModel; newState.qwenImageQwenVLEncoderModel = oldState.qwenImageQwenVLEncoderModel; @@ -668,6 +718,9 @@ export const { zImageQwen3SourceModelSelected, kleinVaeModelSelected, kleinQwen3EncoderModelSelected, + pidModeChanged, + pidDecoderModelSelected, + gemma2EncoderModelSelected, qwenImageComponentSourceSelected, qwenImageVaeModelSelected, qwenImageQwenVLEncoderModelSelected, @@ -744,6 +797,15 @@ export const paramsSliceConfig: SliceConfig = { state.qwenImageQwenVLEncoderModel = null; } + if (state._version === 3) { + // v3 -> v4, add PiD (Pixel Diffusion Decoder) fields + state._version = 4; + state.pidMode = 'off'; + state.pidDecoderModel = null; + state.gemma2EncoderModel = null; + state.pidSteps = 4; + } + return zParamsState.parse(state); }, }, @@ -787,6 +849,9 @@ export const selectAnimaQwen3EncoderModel = createParamsSelector((params) => par export const selectAnimaScheduler = createParamsSelector((params) => params.animaScheduler); export const selectKleinVaeModel = createParamsSelector((params) => params.kleinVaeModel); 
export const selectKleinQwen3EncoderModel = createParamsSelector((params) => params.kleinQwen3EncoderModel); +export const selectPidMode = createParamsSelector((params) => params.pidMode); +export const selectPidDecoderModel = createParamsSelector((params) => params.pidDecoderModel); +export const selectGemma2EncoderModel = createParamsSelector((params) => params.gemma2EncoderModel); export const selectQwenImageComponentSource = createParamsSelector((params) => params.qwenImageComponentSource); export const selectQwenImageVaeModel = createParamsSelector((params) => params.qwenImageVaeModel); export const selectQwenImageQwenVLEncoderModel = createParamsSelector((params) => params.qwenImageQwenVLEncoderModel); diff --git a/invokeai/frontend/web/src/features/controlLayers/store/selectors.ts b/invokeai/frontend/web/src/features/controlLayers/store/selectors.ts index db37f32d49e..7aa4748cf74 100644 --- a/invokeai/frontend/web/src/features/controlLayers/store/selectors.ts +++ b/invokeai/frontend/web/src/features/controlLayers/store/selectors.ts @@ -14,7 +14,7 @@ import type { CanvasState, } from 'features/controlLayers/store/types'; import type { BaseModelType } from 'features/nodes/types/common'; -import { getGridSize, getOptimalDimension } from 'features/parameters/util/optimalDimension'; +import { getGridSize, getOptimalDimension, getPidScale } from 'features/parameters/util/optimalDimension'; import type { Equals } from 'tsafe'; import { assert } from 'tsafe'; @@ -76,7 +76,7 @@ export const selectHasEntities = createSelector(selectEntityCountAll, (count) => */ export const selectOptimalDimension = createSelector(selectParamsSlice, (params): number => { const modelBase = params.model?.base as BaseModelType | undefined; - return getOptimalDimension(modelBase ?? null); + return getOptimalDimension(modelBase ?? 
null, getPidScale(params.pidMode)); }); /** @@ -84,7 +84,7 @@ export const selectOptimalDimension = createSelector(selectParamsSlice, (params) */ export const selectGridSize = createSelector(selectParamsSlice, (params): number => { const modelBase = params.model?.base as BaseModelType | undefined; - return getGridSize(modelBase ?? null); + return getGridSize(modelBase ?? null, getPidScale(params.pidMode)); }); /** diff --git a/invokeai/frontend/web/src/features/controlLayers/store/types.ts b/invokeai/frontend/web/src/features/controlLayers/store/types.ts index 53cb70d8f0e..4701c3157b1 100644 --- a/invokeai/frontend/web/src/features/controlLayers/store/types.ts +++ b/invokeai/frontend/web/src/features/controlLayers/store/types.ts @@ -780,8 +780,11 @@ const zPositivePromptHistory = z export const zInfillMethod = z.enum(['patchmatch', 'lama', 'cv2', 'color', 'tile']); export type InfillMethod = z.infer; +const zPidMode = z.enum(['off', 'fit', 'native']); +export type PidMode = z.infer; + export const zParamsState = z.object({ - _version: z.literal(3), + _version: z.literal(4), maskBlur: z.number(), maskBlurMethod: zParameterMaskBlurMethod, canvasCoherenceMode: zParameterCanvasCoherenceMode, @@ -844,6 +847,14 @@ export const zParamsState = z.object({ // Flux2 Klein model components - uses Qwen3 instead of CLIP+T5 kleinVaeModel: zParameterVAEModel.nullable(), // Optional: Separate FLUX.2 VAE for Klein kleinQwen3EncoderModel: zModelIdentifierField.nullable(), // Optional: Separate Qwen3 Encoder for Klein + // PiD (Pixel Diffusion Decoder) - optional 4x super-resolution decode replacing the VAE decode. 
+ // - 'off': regular VAE decode + // - 'fit': PiD decodes 4x internally, then downscales back to the bbox (compositing-safe; works in canvas/inpaint) + // - 'native': PiD's full 4x output IS the result; the user-facing dimensions are the target, generation runs at target / 4 + pidMode: zPidMode, + pidDecoderModel: zModelIdentifierField.nullable(), // PiD decoder checkpoint (matched to the main model's base) + gemma2EncoderModel: zModelIdentifierField.nullable(), // Gemma-2 caption encoder required by PiD + pidSteps: z.number(), // PiD distill steps (released checkpoints are trained for 4) // Qwen Image Edit model components - GGUF transformer needs a Diffusers source for VAE/encoder qwenImageComponentSource: zParameterModel.nullable(), // Diffusers model providing VAE + text encoder qwenImageVaeModel: zParameterVAEModel.nullable(), // Optional: Standalone Qwen Image VAE checkpoint @@ -869,7 +880,7 @@ export const zParamsState = z.object({ }); export type ParamsState = z.infer; export const getInitialParamsState = (): ParamsState => ({ - _version: 3, + _version: 4, maskBlur: 16, maskBlurMethod: 'box', canvasCoherenceMode: 'Gaussian Blur', @@ -929,6 +940,10 @@ export const getInitialParamsState = (): ParamsState => ({ animaScheduler: 'euler', kleinVaeModel: null, kleinQwen3EncoderModel: null, + pidMode: 'off', + pidDecoderModel: null, + gemma2EncoderModel: null, + pidSteps: 4, qwenImageComponentSource: null, qwenImageVaeModel: null, qwenImageQwenVLEncoderModel: null, diff --git a/invokeai/frontend/web/src/features/controlLayers/util/getScaledBoundingBoxDimensions.ts b/invokeai/frontend/web/src/features/controlLayers/util/getScaledBoundingBoxDimensions.ts index 5f58e77545a..a183e1b5b8f 100644 --- a/invokeai/frontend/web/src/features/controlLayers/util/getScaledBoundingBoxDimensions.ts +++ b/invokeai/frontend/web/src/features/controlLayers/util/getScaledBoundingBoxDimensions.ts @@ -57,12 +57,13 @@ export const getScaledBoundingBoxDimensions = (dimensions: Dimensions, 
base?: Ba * @param ratio The aspect ratio to calculate the new size for * @param area The input area * @param base The base model + * @param pidScale The PiD generation scale (see {@link getPidScale}); defaults to 1 (no PiD) * @returns The width and height that will fit the given aspect ratio, retaining the input area */ -export const calculateNewSize = (ratio: number, area: number, base?: BaseModelType): Dimensions => { +export const calculateNewSize = (ratio: number, area: number, base?: BaseModelType, pidScale = 1): Dimensions => { const exactWidth = Math.sqrt(area * ratio); const exactHeight = exactWidth / ratio; - const gridSize = getGridSize(base); + const gridSize = getGridSize(base, pidScale); return { width: roundToMultiple(exactWidth, gridSize), diff --git a/invokeai/frontend/web/src/features/modelManagerV2/models.ts b/invokeai/frontend/web/src/features/modelManagerV2/models.ts index cf295c9af6a..a95b6348c15 100644 --- a/invokeai/frontend/web/src/features/modelManagerV2/models.ts +++ b/invokeai/frontend/web/src/features/modelManagerV2/models.ts @@ -8,10 +8,12 @@ import { isControlNetModelConfig, isExternalApiModelConfig, isFluxReduxModelConfig, + isGemma2EncoderModelConfig, isIPAdapterModelConfig, isLLaVAModelConfig, isLoRAModelConfig, isNonRefinerMainModelConfig, + isPiDDecoderModelConfig, isQwen3EncoderModelConfig, isQwenVLEncoderModelConfig, isRefinerMainModelModelConfig, @@ -85,6 +87,16 @@ const MODEL_CATEGORIES: Record = { i18nKey: 'modelManager.qwenVLEncoder', filter: isQwenVLEncoderModelConfig, }, + gemma2_encoder: { + category: 'gemma2_encoder', + i18nKey: 'modelManager.gemma2Encoder', + filter: isGemma2EncoderModelConfig, + }, + pid_decoder: { + category: 'pid_decoder', + i18nKey: 'modelManager.pidDecoder', + filter: isPiDDecoderModelConfig, + }, control_lora: { category: 'control_lora', i18nKey: 'modelManager.controlLora', @@ -187,11 +199,13 @@ export const MODEL_TYPE_TO_LONG_NAME: Record = { t5_encoder: 'T5 Encoder', qwen3_encoder: 'Qwen3 
Encoder', qwen_vl_encoder: 'Qwen2.5-VL Encoder', + gemma2_encoder: 'Gemma-2 Encoder', clip_embed: 'CLIP Embed', siglip: 'SigLIP', flux_redux: 'FLUX Redux', text_llm: 'Text LLM', external_image_generator: 'External Image Generator', + pid_decoder: 'PiD Decoder', unknown: 'Unknown', }; @@ -255,6 +269,8 @@ export const MODEL_VARIANT_TO_LONG_NAME: Record = { qwen3_4b: 'Qwen3 4B', qwen3_8b: 'Qwen3 8B', qwen3_06b: 'Qwen3 0.6B', + res2k_sr4x: 'PiD 2K (4x SR)', + res2kto4k_sr4x: 'PiD 4K (4x SR Upscale)', }; export const MODEL_FORMAT_TO_LONG_NAME: Record = { @@ -271,6 +287,7 @@ export const MODEL_FORMAT_TO_LONG_NAME: Record = { t5_encoder: 'T5 Encoder', qwen3_encoder: 'Qwen3 Encoder', qwen_vl_encoder: 'Qwen2.5-VL Encoder', + gemma2_encoder: 'Gemma-2 Encoder', bnb_quantized_int8b: 'BNB Quantized (int8b)', bnb_quantized_nf4b: 'BNB Quantized (nf4b)', gguf_quantized: 'GGUF Quantized', diff --git a/invokeai/frontend/web/src/features/modelManagerV2/subpanels/ModelManagerPanel/ModelFormatBadge.tsx b/invokeai/frontend/web/src/features/modelManagerV2/subpanels/ModelManagerPanel/ModelFormatBadge.tsx index 71d2efe0e45..79e35926667 100644 --- a/invokeai/frontend/web/src/features/modelManagerV2/subpanels/ModelManagerPanel/ModelFormatBadge.tsx +++ b/invokeai/frontend/web/src/features/modelManagerV2/subpanels/ModelManagerPanel/ModelFormatBadge.tsx @@ -16,6 +16,7 @@ const FORMAT_NAME_MAP: Record = { t5_encoder: 't5_encoder', qwen3_encoder: 'qwen3_encoder', qwen_vl_encoder: 'qwen_vl_encoder', + gemma2_encoder: 'gemma2_encoder', bnb_quantized_int8b: 'bnb_quantized_int8b', bnb_quantized_nf4b: 'quantized', gguf_quantized: 'gguf', @@ -37,6 +38,7 @@ const FORMAT_COLOR_MAP: Record = { t5_encoder: 'base', qwen3_encoder: 'base', qwen_vl_encoder: 'base', + gemma2_encoder: 'base', bnb_quantized_int8b: 'base', bnb_quantized_nf4b: 'base', gguf_quantized: 'base', diff --git a/invokeai/frontend/web/src/features/nodes/types/common.ts b/invokeai/frontend/web/src/features/nodes/types/common.ts index 
fb2a1ce946a..24e54996642 100644 --- a/invokeai/frontend/web/src/features/nodes/types/common.ts +++ b/invokeai/frontend/web/src/features/nodes/types/common.ts @@ -134,10 +134,12 @@ export const zModelType = z.enum([ 't5_encoder', 'qwen3_encoder', 'qwen_vl_encoder', + 'gemma2_encoder', 'clip_embed', 'siglip', 'flux_redux', 'external_image_generator', + 'pid_decoder', 'unknown', ]); export type ModelType = z.infer; @@ -164,6 +166,7 @@ export const zFlux2VariantType = z.enum(['klein_4b', 'klein_4b_base', 'klein_9b' export const zZImageVariantType = z.enum(['turbo', 'zbase']); const zQwenImageVariantType = z.enum(['generate', 'edit']); export const zQwen3VariantType = z.enum(['qwen3_4b', 'qwen3_8b', 'qwen3_06b']); +const zPiDDecoderVariantType = z.enum(['res2k_sr4x', 'res2kto4k_sr4x']); export const zAnyModelVariant = z.union([ zModelVariantType, zClipVariantType, @@ -172,6 +175,7 @@ export const zAnyModelVariant = z.union([ zZImageVariantType, zQwenImageVariantType, zQwen3VariantType, + zPiDDecoderVariantType, ]); export type AnyModelVariant = z.infer; export const zModelFormat = z.enum([ @@ -187,6 +191,7 @@ export const zModelFormat = z.enum([ 't5_encoder', 'qwen3_encoder', 'qwen_vl_encoder', + 'gemma2_encoder', 'bnb_quantized_int8b', 'bnb_quantized_nf4b', 'gguf_quantized', diff --git a/invokeai/frontend/web/src/features/nodes/util/graph/generation/addImageToImage.ts b/invokeai/frontend/web/src/features/nodes/util/graph/generation/addImageToImage.ts index f17ff970f27..1df9d6ec658 100644 --- a/invokeai/frontend/web/src/features/nodes/util/graph/generation/addImageToImage.ts +++ b/invokeai/frontend/web/src/features/nodes/util/graph/generation/addImageToImage.ts @@ -9,7 +9,7 @@ import { } from 'features/nodes/util/graph/graphBuilderUtils'; import type { DenoiseLatentsNodes, - LatentToImageNodes, + ImageOutputNodes, MainModelLoaderNodes, VaeSourceNodes, } from 'features/nodes/util/graph/types'; @@ -20,7 +20,9 @@ type AddImageToImageArg = { g: Graph; state: RootState; 
manager: CanvasManager; - l2i: Invocation; + // Only the `.image` output is consumed downstream, so any image-producing node works here (e.g. a PiD decode + // chain substituted for the regular VAE decode). + l2i: Invocation; i2l: Invocation< | 'i2l' | 'flux_vae_encode' @@ -45,19 +47,7 @@ export const addImageToImage = async ({ noise, denoise, vaeSource, -}: AddImageToImageArg): Promise< - Invocation< - | 'img_resize' - | 'l2i' - | 'flux_vae_decode' - | 'flux2_vae_decode' - | 'sd3_l2i' - | 'cogview4_l2i' - | 'qwen_image_l2i' - | 'z_image_l2i' - | 'anima_l2i' - > -> => { +}: AddImageToImageArg): Promise> => { const { denoising_start, denoising_end } = getDenoisingStartAndEnd(state); denoise.denoising_start = denoising_start; denoise.denoising_end = denoising_end; diff --git a/invokeai/frontend/web/src/features/nodes/util/graph/generation/addPidDecode.ts b/invokeai/frontend/web/src/features/nodes/util/graph/generation/addPidDecode.ts new file mode 100644 index 00000000000..fb8f65f6bc3 --- /dev/null +++ b/invokeai/frontend/web/src/features/nodes/util/graph/generation/addPidDecode.ts @@ -0,0 +1,349 @@ +import type { RootState } from 'app/store/store'; +import { roundDownToMultiple } from 'common/util/roundDownToMultiple'; +import type { CanvasManager } from 'features/controlLayers/konva/CanvasManager'; +import { getPrefixedId } from 'features/controlLayers/konva/util'; +import { selectMainModelConfig, selectParamsSlice } from 'features/controlLayers/store/paramsSlice'; +import type { Graph } from 'features/nodes/util/graph/generation/Graph'; +import { + getDenoisingStartAndEnd, + getOriginalAndScaledSizesForOtherModes, + getOriginalAndScaledSizesForTextToImage, +} from 'features/nodes/util/graph/graphBuilderUtils'; +import type { ImageToLatentsNodes, MainModelLoaderNodes, VaeSourceNodes } from 'features/nodes/util/graph/types'; +import { getGridSize, PID_SCALE } from 'features/parameters/util/optimalDimension'; +import type { Invocation } from 'services/api/types'; 
+import { assert } from 'tsafe'; + +type Size = { width: number; height: number }; + +/** + * The base-specific PiD decode node types. Each replaces its base's VAE decode with the PiD super-res decode. + * Only bases whose graph builder actually wires PiD are listed; more are added as their builders gain support. + */ +type PidDecodeNodeType = + | 'flux_pid_decode' + | 'flux2_pid_decode' + | 'sd3_pid_decode' + | 'sdxl_pid_decode' + | 'z_image_pid_decode' + | 'qwen_image_pid_decode'; + +/** + * Denoise nodes whose latents PiD can decode. The FLUX-family nodes carry their own width/height; `denoise_latents` + * (SD1.5/SD2/SDXL) does not - it is sized via a separate `noise` node, so callers using it must pass `noise`. + */ +type PidDenoiseNodeType = + | 'flux_denoise' + | 'flux2_denoise' + | 'sd3_denoise' + | 'z_image_denoise' + | 'qwen_image_denoise' + | 'denoise_latents'; + +/** PiD decode node types that expose a `vae` input (used to read the VAE's scaling constants at runtime). */ +const PID_DECODE_NODES_WITH_VAE_INPUT = new Set([ + 'flux2_pid_decode', + 'sdxl_pid_decode', + 'z_image_pid_decode', + 'qwen_image_pid_decode', +]); + +/** + * Sets the generation dimensions for a PiD graph. The FLUX-family denoise nodes carry width/height directly; + * `denoise_latents` (SD1.5/SD2/SDXL) is sized via its `noise` node instead (mirrors {@link addTextToImage}). + */ +const setPidGenDimensions = ( + denoise: Invocation, + noise: Invocation<'noise'> | undefined, + width: number, + height: number +): void => { + if (denoise.type === 'denoise_latents') { + assert(noise, 'PiD with denoise_latents (SD1.5/SD2/SDXL) requires a noise node'); + noise.width = width; + noise.height = height; + } else { + denoise.width = width; + denoise.height = height; + } +}; + +/** Reads back the generation dimensions set by {@link setPidGenDimensions} (from the noise node for `denoise_latents`). 
*/ +const getPidGenDimensions = (denoise: Invocation, noise: Invocation<'noise'> | undefined): Size => { + if (denoise.type === 'denoise_latents') { + assert( + noise?.width !== undefined && noise.height !== undefined, + 'PiD native decode requires the noise dimensions to be set by the caller' + ); + return { width: noise.width, height: noise.height }; + } + assert( + denoise.width !== undefined && denoise.height !== undefined, + 'PiD native decode requires the denoise dimensions to be set by the caller' + ); + return { width: denoise.width, height: denoise.height }; +}; + +type BuildPidDecodeChainArg = { + g: Graph; + state: RootState; + /** The denoise node producing the latents PiD will decode. Its dimensions are set by the CALLER. */ + denoise: Invocation; + /** The noise node, required when `denoise` is a `denoise_latents` node (SD1.5/SD2/SDXL) - it carries the size. */ + noise?: Invocation<'noise'>; + /** Which base-specific PiD decode node to build (e.g. `flux_pid_decode`, `flux2_pid_decode`). */ + decodeNodeType: PidDecodeNodeType; + /** + * Optional VAE source. If the chosen decode node has a `vae` input (e.g. `flux2_pid_decode`), it is wired so + * the node can read the VAE's scaling/shift constants at runtime. Ignored for nodes without a `vae` input. + */ + vaeSource?: Invocation; + /** The positive prompt node - PiD conditions its decode on the same caption. */ + positivePrompt: Invocation<'string'>; + /** The seed node - reused for PiD's internal decode noise so results are reproducible. */ + seed: Invocation<'integer'>; + /** + * - 'fit': PiD decodes 4x, then the output is downscaled to `fitSize` (compositing-safe; used everywhere). + * - 'native': PiD's full 4x output is used directly (txt2img only; `fitSize` is ignored). + */ + mode: 'fit' | 'native'; + /** The size to downscale the 4x output to in 'fit' mode (the bbox / region the result must fit). 
*/ + fitSize: Size; +}; + +/** + * Builds the PiD (Pixel Diffusion Decoder) decode chain: the Gemma-2 + PiD loaders, the `decodeNodeType` node + * wired to the given denoise latents, and (in 'fit' mode) an `img_resize` that downscales PiD's 4x output to + * `fitSize`. Returns the terminal image node, which is a drop-in for the regular VAE decode (`l2i`) - downstream + * nodes only consume its `.image` output. + * + * This does NOT modify the denoise node's dimensions or denoising start/end; the caller owns those (they differ + * between txt2img and img2img/inpaint). + */ +export const buildPidDecodeChain = ({ + g, + state, + denoise, + noise, + decodeNodeType, + vaeSource, + positivePrompt, + seed, + mode, + fitSize, +}: BuildPidDecodeChainArg): Invocation<'img_resize' | PidDecodeNodeType> => { + const params = selectParamsSlice(state); + const { pidDecoderModel, gemma2EncoderModel, pidSteps } = params; + assert(pidDecoderModel, 'No PiD decoder model selected'); + assert(gemma2EncoderModel, 'No Gemma-2 encoder model selected'); + + const gemma2Loader = g.addNode({ + type: 'gemma2_encoder_loader', + id: getPrefixedId('gemma2_encoder_loader'), + gemma2_model: gemma2EncoderModel, + }); + const pidLoader = g.addNode({ + type: 'pid_decoder_loader', + id: getPrefixedId('pid_decoder_loader'), + pid_decoder_model: pidDecoderModel, + }); + const pidDecode = g.addNode({ + type: decodeNodeType, + id: getPrefixedId(decodeNodeType), + num_inference_steps: pidSteps, + }); + + g.addEdge(denoise, 'latents', pidDecode, 'latents'); + g.addEdge(positivePrompt, 'value', pidDecode, 'prompt'); + g.addEdge(gemma2Loader, 'gemma2_encoder', pidDecode, 'gemma2_encoder'); + g.addEdge(pidLoader, 'pid_decoder', pidDecode, 'pid_decoder'); + g.addEdge(seed, 'value', pidDecode, 'seed'); + // Wire the VAE only for decode nodes that read scaling constants from it (see PID_DECODE_NODES_WITH_VAE_INPUT).
+ if (vaeSource && PID_DECODE_NODES_WITH_VAE_INPUT.has(decodeNodeType)) { + g.addEdge(vaeSource, 'vae', pidDecode as Invocation<'flux2_pid_decode'>, 'vae'); + } + + const commonMetadata = { + pid_decoder: pidDecoderModel, + gemma2_encoder: gemma2EncoderModel, + pid_steps: pidSteps, + }; + + if (mode === 'native') { + // PiD's 4x output IS the result (the caller generated at target / 4) - no downscale. + const genSize = getPidGenDimensions(denoise, noise); + g.upsertMetadata({ + ...commonMetadata, + pid_mode: mode, + width: genSize.width * PID_SCALE, + height: genSize.height * PID_SCALE, + }); + return pidDecode; + } + + // Fit mode: downscale PiD's 4x output back to the requested size. + const resize = g.addNode({ + id: getPrefixedId('pid_fit_resize'), + type: 'img_resize', + ...fitSize, + }); + g.addEdge(pidDecode, 'image', resize, 'image'); + g.upsertMetadata({ ...commonMetadata, pid_mode: mode, width: fitSize.width, height: fitSize.height }); + + return resize; +}; + +type AddPidDecodeArg = { + g: Graph; + state: RootState; + mode: 'fit' | 'native'; + denoise: Invocation; + noise?: Invocation<'noise'>; + decodeNodeType: PidDecodeNodeType; + vaeSource?: Invocation; + positivePrompt: Invocation<'string'>; + seed: Invocation<'integer'>; +}; + +/** + * Text-to-image PiD decode: sets up the denoise node (full denoise, generation dimensions) and replaces the VAE + * decode with a PiD decode (see {@link buildPidDecodeChain}). + * + * - 'fit': generate at the requested size, PiD decodes 4x, then downscale back to it. + * - 'native': the requested dimensions are the 4x target; generate at target / 4 and use PiD's 4x output directly. + * + * The caller is responsible for having NOT wired a VAE decode for these latents (or for deleting it). + * + * @returns The terminal image node, to be used as the canvas output. 
+ */ +export const addPidDecode = ({ + g, + state, + mode, + denoise, + noise, + decodeNodeType, + vaeSource, + positivePrompt, + seed, +}: AddPidDecodeArg): Invocation<'img_resize' | PidDecodeNodeType> => { + const { originalSize, scaledSize } = getOriginalAndScaledSizesForTextToImage(state); + // Round the generation resolution to the main model's native grid (16 for FLUX-family, 8 for SDXL). The bbox is + // pre-snapped to grid * PID_SCALE by the UI/readiness, so target / PID_SCALE lands exactly on the grid. + const gridSize = getGridSize(selectMainModelConfig(state)?.base); + + denoise.denoising_start = 0; + denoise.denoising_end = 1; + if (mode === 'native') { + // The user-facing dimensions are the 4x target; generate at target / PID_SCALE (kept on the model grid). + setPidGenDimensions( + denoise, + noise, + Math.max(roundDownToMultiple(originalSize.width / PID_SCALE, gridSize), gridSize), + Math.max(roundDownToMultiple(originalSize.height / PID_SCALE, gridSize), gridSize) + ); + } else { + // Generate at the normal resolution; PiD will 4x it and we downscale back to it. + setPidGenDimensions(denoise, noise, scaledSize.width, scaledSize.height); + } + + return buildPidDecodeChain({ + g, + state, + denoise, + noise, + decodeNodeType, + vaeSource, + positivePrompt, + seed, + mode, + fitSize: originalSize, + }); +}; + +type AddPidImageToImageNativeArg = { + g: Graph; + state: RootState; + manager: CanvasManager; + /** The denoise node. Its dimensions are set here to the 4x target / PID_SCALE. */ + denoise: Invocation; + /** The noise node, required when `denoise` is a `denoise_latents` node (SD1.5/SD2/SDXL) - it carries the size. */ + noise?: Invocation<'noise'>; + /** Which base-specific PiD decode node to build. */ + decodeNodeType: PidDecodeNodeType; + /** The VAE encode node for the init image. */ + i2l: Invocation; + /** The model loader / VAE source providing the VAE for encoding the init image (and, if applicable, the decode). 
*/ + vaeSource: Invocation; + positivePrompt: Invocation<'string'>; + seed: Invocation<'integer'>; +}; + +/** + * Native-4x PiD image-to-image (Canvas only). The user-facing bbox IS the 4x target: generation runs at bbox / + * PID_SCALE, the init image is downscaled to that resolution before encoding, and PiD decodes the latents straight + * back up to the full bbox size - no post-decode downscale, so all of PiD's detail is preserved. Because the result + * is exactly the bbox size it composites cleanly back onto the canvas region. + * + * Requires the bbox to be a multiple of the PiD-scaled grid (enforced by the UI grid snapping / readiness) so that + * bbox / PID_SCALE lands on the main model's grid and PiD's 4x output matches the bbox exactly. + * + * @returns The terminal `decodeNodeType` PiD decode node, to be used as the canvas output. + */ +export const addPidImageToImageNative = async ({ + g, + state, + manager, + denoise, + noise, + decodeNodeType, + i2l, + vaeSource, + positivePrompt, + seed, +}: AddPidImageToImageNativeArg): Promise> => { + const { denoising_start, denoising_end } = getDenoisingStartAndEnd(state); + denoise.denoising_start = denoising_start; + denoise.denoising_end = denoising_end; + + const { originalSize, rect } = getOriginalAndScaledSizesForOtherModes(state); + const gridSize = getGridSize(selectMainModelConfig(state)?.base); + + // The bbox is the 4x target; generate at target / PID_SCALE (kept on the model grid).
+ const genSize = { + width: Math.max(roundDownToMultiple(originalSize.width / PID_SCALE, gridSize), gridSize), + height: Math.max(roundDownToMultiple(originalSize.height / PID_SCALE, gridSize), gridSize), + }; + setPidGenDimensions(denoise, noise, genSize.width, genSize.height); + + const adapters = manager.compositor.getVisibleAdaptersOfType('raster_layer'); + const { image_name } = await manager.compositor.getCompositeImageDTO(adapters, rect, { + is_intermediate: true, + silent: true, + }); + + // Downscale the init image to the generation resolution before encoding. + const resizeIn = g.addNode({ + type: 'img_resize', + id: getPrefixedId('initial_image_resize_in'), + image: { image_name }, + ...genSize, + }); + g.addEdge(vaeSource, 'vae', i2l, 'vae'); + g.addEdge(resizeIn, 'image', i2l, 'image'); + g.addEdge(i2l, 'latents', denoise, 'latents'); + + // PiD decodes the genSize latents straight up to 4x = the bbox. fitSize is ignored in native mode. + return buildPidDecodeChain({ + g, + state, + denoise, + noise, + decodeNodeType, + vaeSource, + positivePrompt, + seed, + mode: 'native', + fitSize: originalSize, + }); +}; diff --git a/invokeai/frontend/web/src/features/nodes/util/graph/generation/buildFLUXGraph.test.ts b/invokeai/frontend/web/src/features/nodes/util/graph/generation/buildFLUXGraph.test.ts index 5b9f3d0a468..1704a1a12cb 100644 --- a/invokeai/frontend/web/src/features/nodes/util/graph/generation/buildFLUXGraph.test.ts +++ b/invokeai/frontend/web/src/features/nodes/util/graph/generation/buildFLUXGraph.test.ts @@ -109,6 +109,7 @@ const mockParams = { fluxVAE: null, t5EncoderModel: null, clipEmbedModel: null, + pidMode: 'off' as const, }; vi.mock('features/controlLayers/store/paramsSlice', () => ({ diff --git a/invokeai/frontend/web/src/features/nodes/util/graph/generation/buildFLUXGraph.ts b/invokeai/frontend/web/src/features/nodes/util/graph/generation/buildFLUXGraph.ts index dafcd9310ec..0d38cf249a9 100644 --- 
a/invokeai/frontend/web/src/features/nodes/util/graph/generation/buildFLUXGraph.ts +++ b/invokeai/frontend/web/src/features/nodes/util/graph/generation/buildFLUXGraph.ts @@ -20,11 +20,20 @@ import { addImageToImage } from 'features/nodes/util/graph/generation/addImageTo import { addInpaint } from 'features/nodes/util/graph/generation/addInpaint'; import { addNSFWChecker } from 'features/nodes/util/graph/generation/addNSFWChecker'; import { addOutpaint } from 'features/nodes/util/graph/generation/addOutpaint'; +import { + addPidDecode, + addPidImageToImageNative, + buildPidDecodeChain, +} from 'features/nodes/util/graph/generation/addPidDecode'; import { addRegions } from 'features/nodes/util/graph/generation/addRegions'; import { addTextToImage } from 'features/nodes/util/graph/generation/addTextToImage'; import { addWatermarker } from 'features/nodes/util/graph/generation/addWatermarker'; import { Graph } from 'features/nodes/util/graph/generation/Graph'; -import { selectCanvasOutputFields } from 'features/nodes/util/graph/graphBuilderUtils'; +import { + getOriginalAndScaledSizesForOtherModes, + getOriginalAndScaledSizesForTextToImage, + selectCanvasOutputFields, +} from 'features/nodes/util/graph/graphBuilderUtils'; import type { GraphBuilderArg, GraphBuilderReturn, ImageOutputNodes } from 'features/nodes/util/graph/types'; import { UnsupportedGenerationModeError } from 'features/nodes/util/graph/types'; import { isFlux2KleinQwen3Compatible } from 'features/parameters/util/flux2Klein'; @@ -62,6 +71,7 @@ export const buildFLUXGraph = async (arg: GraphBuilderArg): Promise; const fluxL2i = l2i as Invocation<'flux_vae_decode'>; + if (pidMode !== 'off') { + // Inpaint/outpaint are not wired for PiD yet - only txt2img and img2img are supported (Fit and Native). + if (generationMode === 'inpaint' || generationMode === 'outpaint') { + throw new UnsupportedGenerationModeError(t('toast.pidUnsupportedMode')); + } + // PiD decodes at 4x the generation resolution. 
"Scale Before Processing" (Canvas) would silently inflate + // the generation size to the model optimal, blowing up the decode - require it off (scaled == original). + const { originalSize, scaledSize } = getOriginalAndScaledSizesForTextToImage(state); + if (scaledSize.width !== originalSize.width || scaledSize.height !== originalSize.height) { + throw new UnsupportedGenerationModeError(t('toast.pidScaleBeforeProcessingOff')); + } + } + // Only add FLUX LoRAs for non-Klein models addFLUXLoRAs(state, g, fluxDenoise, fluxModelLoader, fluxPosCond); @@ -430,12 +522,26 @@ export const buildFLUXGraph = async (arg: GraphBuilderArg): Promise = l2i; + if (pidMode !== 'off') { + // Inpaint/outpaint are not wired for PiD yet - only txt2img and img2img are supported (Fit and Native). + if (generationMode === 'inpaint' || generationMode === 'outpaint') { + throw new UnsupportedGenerationModeError(t('toast.pidUnsupportedMode')); + } + // PiD decodes at 4x the generation resolution. "Scale Before Processing" (Canvas) would silently inflate + // the generation size to the model optimal, blowing up the decode - require it off (scaled == original). + const { originalSize, scaledSize } = getOriginalAndScaledSizesForTextToImage(state); + if (scaledSize.width !== originalSize.width || scaledSize.height !== originalSize.height) { + throw new UnsupportedGenerationModeError(t('toast.pidScaleBeforeProcessingOff')); + } + } + if (generationMode === 'txt2img') { - canvasOutput = addTextToImage({ - g, - state, - denoise, - l2i, - }); + if (pidMode !== 'off') { + // PiD replaces the VAE decode entirely - drop the unused l2i (and its edges). The Qwen-Image VAE (from the + // model loader) is wired so the node reads its per-channel latents_mean / latents_std. 
+ g.deleteNode(l2i.id); + canvasOutput = addPidDecode({ + g, + state, + mode: pidMode, + denoise, + decodeNodeType: 'qwen_image_pid_decode', + vaeSource: modelLoader, + positivePrompt, + seed, + }); + } else { + canvasOutput = addTextToImage({ + g, + state, + denoise, + l2i, + }); + } g.upsertMetadata({ generation_mode: 'qwen_image_txt2img' }); } else if (generationMode === 'img2img') { assert(manager !== null); @@ -255,15 +296,56 @@ export const buildQwenImageGraph = async (arg: GraphBuilderArg): Promise = l2i; + if (pidMode !== 'off') { + // Inpaint/outpaint are not wired for PiD yet - only txt2img and img2img are supported (Fit and Native). + if (generationMode === 'inpaint' || generationMode === 'outpaint') { + throw new UnsupportedGenerationModeError(t('toast.pidUnsupportedMode')); + } + // PiD decodes at 4x the generation resolution. "Scale Before Processing" (Canvas) would silently inflate + // the generation size to the model optimal, blowing up the decode - require it off (scaled == original). + const { originalSize, scaledSize } = getOriginalAndScaledSizesForTextToImage(state); + if (scaledSize.width !== originalSize.width || scaledSize.height !== originalSize.height) { + throw new UnsupportedGenerationModeError(t('toast.pidScaleBeforeProcessingOff')); + } + } + if (generationMode === 'txt2img') { - canvasOutput = addTextToImage({ - g, - state, - denoise, - l2i, - }); + if (pidMode !== 'off') { + // PiD replaces the VAE decode entirely - drop the unused l2i (and its edges). sd3_pid_decode has no vae + // input (fixed SD3 constants), so no vaeSource is passed. 
+ g.deleteNode(l2i.id); + canvasOutput = addPidDecode({ + g, + state, + mode: pidMode, + denoise, + decodeNodeType: 'sd3_pid_decode', + positivePrompt, + seed, + }); + } else { + canvasOutput = addTextToImage({ + g, + state, + denoise, + l2i, + }); + } g.upsertMetadata({ generation_mode: 'sd3_txt2img' }); } else if (generationMode === 'img2img') { assert(manager !== null); @@ -121,15 +160,55 @@ export const buildSD3Graph = async (arg: GraphBuilderArg): Promise model, negativePrompt: 'raw negative prompt', positivePrompt: 'raw positive prompt', + pidMode: 'off', refinerModel: null, scheduler: 'euler', seed: 123, diff --git a/invokeai/frontend/web/src/features/nodes/util/graph/generation/buildSDXLGraph.ts b/invokeai/frontend/web/src/features/nodes/util/graph/generation/buildSDXLGraph.ts index f31c42ee561..0f9cba1c391 100644 --- a/invokeai/frontend/web/src/features/nodes/util/graph/generation/buildSDXLGraph.ts +++ b/invokeai/frontend/web/src/features/nodes/util/graph/generation/buildSDXLGraph.ts @@ -9,15 +9,26 @@ import { addInpaint } from 'features/nodes/util/graph/generation/addInpaint'; import { addIPAdapters } from 'features/nodes/util/graph/generation/addIPAdapters'; import { addNSFWChecker } from 'features/nodes/util/graph/generation/addNSFWChecker'; import { addOutpaint } from 'features/nodes/util/graph/generation/addOutpaint'; +import { + addPidDecode, + addPidImageToImageNative, + buildPidDecodeChain, +} from 'features/nodes/util/graph/generation/addPidDecode'; import { addSDXLLoRAs } from 'features/nodes/util/graph/generation/addSDXLLoRAs'; import { addSDXLRefiner } from 'features/nodes/util/graph/generation/addSDXLRefiner'; import { addSeamless } from 'features/nodes/util/graph/generation/addSeamless'; import { addTextToImage } from 'features/nodes/util/graph/generation/addTextToImage'; import { addWatermarker } from 'features/nodes/util/graph/generation/addWatermarker'; import { Graph } from 'features/nodes/util/graph/generation/Graph'; -import { 
selectCanvasOutputFields } from 'features/nodes/util/graph/graphBuilderUtils'; +import { + getOriginalAndScaledSizesForOtherModes, + getOriginalAndScaledSizesForTextToImage, + selectCanvasOutputFields, +} from 'features/nodes/util/graph/graphBuilderUtils'; import type { GraphBuilderArg, GraphBuilderReturn, ImageOutputNodes } from 'features/nodes/util/graph/types'; +import { UnsupportedGenerationModeError } from 'features/nodes/util/graph/types'; import { selectActiveTab } from 'features/ui/store/uiSelectors'; +import { t } from 'i18next'; import type { Invocation } from 'services/api/types'; import type { Equals } from 'tsafe'; import { assert } from 'tsafe'; @@ -49,6 +60,7 @@ export const buildSDXLGraph = async (arg: GraphBuilderArg): Promise = l2i; if (generationMode === 'txt2img') { - canvasOutput = addTextToImage({ - g, - state, - noise, - denoise, - l2i, - }); + if (pidMode !== 'off') { + // PiD replaces the VAE decode entirely - drop the unused l2i (and its edges). SDXL's VAE source is wired + // so sdxl_pid_decode can read scaling_factor / shift_factor from it. + g.deleteNode(l2i.id); + canvasOutput = addPidDecode({ + g, + state, + mode: pidMode, + denoise, + noise, + decodeNodeType: 'sdxl_pid_decode', + vaeSource, + positivePrompt, + seed, + }); + } else { + canvasOutput = addTextToImage({ + g, + state, + noise, + denoise, + l2i, + }); + } g.upsertMetadata({ generation_mode: 'sdxl_txt2img' }); } else if (generationMode === 'img2img') { assert(manager !== null); @@ -184,16 +231,60 @@ export const buildSDXLGraph = async (arg: GraphBuilderArg): Promise = l2i; + if (pidMode !== 'off') { + // Inpaint/outpaint are not wired for PiD yet - only txt2img and img2img are supported (Fit and Native). + if (generationMode === 'inpaint' || generationMode === 'outpaint') { + throw new UnsupportedGenerationModeError(t('toast.pidUnsupportedMode')); + } + // PiD decodes at 4x the generation resolution. 
"Scale Before Processing" (Canvas) would silently inflate + // the generation size to the model optimal, blowing up the decode - require it off (scaled == original). + const { originalSize, scaledSize } = getOriginalAndScaledSizesForTextToImage(state); + if (scaledSize.width !== originalSize.width || scaledSize.height !== originalSize.height) { + throw new UnsupportedGenerationModeError(t('toast.pidScaleBeforeProcessingOff')); + } + } + if (generationMode === 'txt2img') { - canvasOutput = addTextToImage({ - g, - state, - denoise, - l2i, - }); + if (pidMode !== 'off') { + // PiD replaces the VAE decode entirely - drop the unused l2i (and its edges). Z-Image shares FLUX.1's VAE + // and uses the FLUX PiD decoder; the Z-Image VAE (from the model loader) is wired so the node reads its + // scaling_factor / shift_factor. + g.deleteNode(l2i.id); + canvasOutput = addPidDecode({ + g, + state, + mode: pidMode, + denoise, + decodeNodeType: 'z_image_pid_decode', + vaeSource: modelLoader, + positivePrompt, + seed, + }); + } else { + canvasOutput = addTextToImage({ + g, + state, + denoise, + l2i, + }); + } g.upsertMetadata({ generation_mode: 'z_image_txt2img' }); } else if (generationMode === 'img2img') { assert(manager !== null); @@ -246,15 +288,56 @@ export const buildZImageGraph = async (arg: GraphBuilderArg): Promise { + const dispatch = useAppDispatch(); + const { t } = useTranslation(); + const selectedModel = useAppSelector(selectPidDecoderModel); + const mainModelConfig = useAppSelector(selectMainModelConfig); + // PiD decoders are pinned to a backbone; only decoders whose base matches the main model's PiD decoder base + // are valid (e.g. flux2 decoders for a FLUX.2 main model). getPidDecoderBaseForMainBase returns null when the + // base has no PiD support, so the filter rejects everything and the combobox shows no options. 
+ const decoderBase = useMemo(() => getPidDecoderBaseForMainBase(mainModelConfig?.base), [mainModelConfig?.base]); + const baseFilter = useCallback( + (config: AnyModelConfig) => decoderBase !== null && config.base === decoderBase, + [decoderBase] + ); + const [modelConfigs, { isLoading }] = usePiDDecoderModels(baseFilter); + + const _onChange = useCallback( + (config: AnyModelConfig | null) => { + if (config) { + dispatch(pidDecoderModelSelected(zModelIdentifierField.parse(config))); + } + }, + [dispatch] + ); + + const { options, value, onChange, noOptionsMessage } = useModelCombobox({ + modelConfigs, + onChange: _onChange, + selectedModel, + isLoading, + }); + + return ( + + {t('modelManager.pidDecoder')} + + + ); +}); +ParamPidDecoderModelSelect.displayName = 'ParamPidDecoderModelSelect'; + +const ParamGemma2EncoderModelSelect = memo(() => { + const dispatch = useAppDispatch(); + const { t } = useTranslation(); + const selectedModel = useAppSelector(selectGemma2EncoderModel); + const [modelConfigs, { isLoading }] = useGemma2EncoderModels(); + + const _onChange = useCallback( + (config: AnyModelConfig | null) => { + if (config) { + dispatch(gemma2EncoderModelSelected(zModelIdentifierField.parse(config))); + } + }, + [dispatch] + ); + + const { options, value, onChange, noOptionsMessage } = useModelCombobox({ + modelConfigs, + onChange: _onChange, + selectedModel, + isLoading, + }); + + return ( + + {t('modelManager.gemma2Encoder')} + + + ); +}); +ParamGemma2EncoderModelSelect.displayName = 'ParamGemma2EncoderModelSelect'; + +const PidSettings = () => { + const dispatch = useAppDispatch(); + const { t } = useTranslation(); + const pidMode = useAppSelector(selectPidMode); + + const options = useMemo( + () => [ + { value: 'off', label: t('modelManager.pidModeOff') }, + { value: 'fit', label: t('modelManager.pidModeFit') }, + { value: 'native', label: t('modelManager.pidModeNative') }, + ], + [t] + ); + + const value = useMemo(() => options.find((o) => o.value === 
pidMode) ?? null, [options, pidMode]); + + const onChange = useCallback( + (v) => { + if (v) { + dispatch(pidModeChanged(v.value as PidMode)); + } + }, + [dispatch] + ); + + return ( + + + + {t('modelManager.pidMode')} + + + + {pidMode !== 'off' && ( + <> + + + + )} + + ); +}; + +export default memo(PidSettings); diff --git a/invokeai/frontend/web/src/features/parameters/util/optimalDimension.ts b/invokeai/frontend/web/src/features/parameters/util/optimalDimension.ts index 2ac59a32e2b..71edada00eb 100644 --- a/invokeai/frontend/web/src/features/parameters/util/optimalDimension.ts +++ b/invokeai/frontend/web/src/features/parameters/util/optimalDimension.ts @@ -1,14 +1,34 @@ import type { BaseModelType } from 'features/nodes/types/common'; +/** PiD's fixed super-resolution factor (the released FLUX/SD3 checkpoints are 4x). */ +export const PID_SCALE = 4; +// PiD res2k decoders are trained 512 -> 2048 (4x). In "native" mode the user-facing dimensions are the +// 4x target, so the optimal *target* dimension is 512 * 4 = 2048, regardless of the base model's own optimum. +const PID_NATIVE_OPTIMAL_DIMENSION = 512 * PID_SCALE; + +/** + * Returns the PiD generation scale that the dimension helpers should account for: + * - 4 in "native" mode (the user-facing dimensions are the 4x target; generation runs at target / 4) + * - 1 otherwise ('off' / 'fit' - dimensions are the generation resolution) + */ +export const getPidScale = (pidMode?: string | null): number => (pidMode === 'native' ? PID_SCALE : 1); + /** * Gets the optimal dimension for a given base model: * - sd-1, sd-2: 512 * - sdxl, flux, sd-3, cogview4, qwen-image, z-image, anima: 1024 * - default: 1024 + * + * When `pidScale > 1` (PiD native mode) the user-facing dimensions are the 4x target, so the optimal is the + * PiD target dimension (2048) instead of the model's own optimum. 
* @param base The base model + * @param pidScale The PiD generation scale (see {@link getPidScale}); defaults to 1 (no PiD) * @returns The optimal dimension for the model, defaulting to 1024 */ -export const getOptimalDimension = (base?: BaseModelType | null): number => { +export const getOptimalDimension = (base?: BaseModelType | null, pidScale = 1): number => { + if (pidScale > 1) { + return PID_NATIVE_OPTIMAL_DIMENSION; + } switch (base) { case 'sd-1': case 'sd-2': @@ -66,26 +86,34 @@ export const isInSDXLTrainingDimensions = (width: number, height: number): boole * - flux, sd-3, qwen-image, z-image: 16 * - cogview4: 32 * - default: 8 + * When `pidScale > 1` (PiD native mode) the grid is multiplied so the user-facing target snaps to a value + * whose `/ pidScale` generation resolution still lands on the model's native grid. * @param base The base model + * @param pidScale The PiD generation scale (see {@link getPidScale}); defaults to 1 (no PiD) * @returns The grid size for the model, defaulting to 8 */ -export const getGridSize = (base?: BaseModelType | null): number => { +export const getGridSize = (base?: BaseModelType | null, pidScale = 1): number => { + let gridSize: number; switch (base) { case 'cogview4': - return 32; + gridSize = 32; + break; case 'flux': case 'flux2': case 'sd-3': case 'qwen-image': case 'z-image': - return 16; + gridSize = 16; + break; case 'sd-1': case 'sd-2': case 'sdxl': case 'anima': default: - return 8; + gridSize = 8; + break; } + return gridSize * pidScale; }; const MIN_AREA_FACTOR = 0.8; @@ -117,7 +145,7 @@ export const getIsSizeTooLarge = (width: number, height: number, optimalDimensio * @param optimalDimension The optimal dimension * @returns Whether the current width and height needs to be resized to the optimal dimension */ -export const getIsSizeOptimal = (width: number, height: number, base?: BaseModelType): boolean => { - const optimalDimension = getOptimalDimension(base); +export const getIsSizeOptimal = (width: number, 
height: number, base?: BaseModelType, pidScale = 1): boolean => { + const optimalDimension = getOptimalDimension(base, pidScale); return !getIsSizeTooSmall(width, height, optimalDimension) && !getIsSizeTooLarge(width, height, optimalDimension); }; diff --git a/invokeai/frontend/web/src/features/parameters/util/pid.ts b/invokeai/frontend/web/src/features/parameters/util/pid.ts new file mode 100644 index 00000000000..51c6221a451 --- /dev/null +++ b/invokeai/frontend/web/src/features/parameters/util/pid.ts @@ -0,0 +1,29 @@ +import type { BaseModelType } from 'features/nodes/types/common'; + +/** + * Maps a main-model base to the PiD decoder base whose checkpoints are valid for it. + * + * PiD decoders are trained per backbone, so only a base-matching decoder may be used (e.g. a FLUX.2 decoder for a + * FLUX.2 main model). Z-Image is the exception: it shares FLUX.1's 16-channel VAE and has no PiD checkpoints of its + * own, so it reuses the FLUX decoder. Returns `null` for bases whose graph builder does not (yet) wire a PiD decode. + * Additional bases are added here as their graph builders gain PiD support. + */ +export const getPidDecoderBaseForMainBase = (base?: BaseModelType | null): BaseModelType | null => { + switch (base) { + case 'z-image': + // Z-Image reuses the FLUX PiD decoder (shared 16-channel VAE) - there is no Z-Image-specific decoder. + return 'flux'; + case 'flux': + case 'flux2': + case 'sd-3': + case 'sdxl': + case 'qwen-image': + return base; + default: + return null; + } +}; + +/** Whether the given main-model base supports PiD decoding (i.e. its graph builder wires a PiD decode). 
*/ +export const getIsPidSupportedBase = (base?: BaseModelType | null): boolean => + getPidDecoderBaseForMainBase(base) !== null; diff --git a/invokeai/frontend/web/src/features/queue/store/readiness.ts b/invokeai/frontend/web/src/features/queue/store/readiness.ts index 84bc374158f..5b764899d33 100644 --- a/invokeai/frontend/web/src/features/queue/store/readiness.ts +++ b/invokeai/frontend/web/src/features/queue/store/readiness.ts @@ -34,7 +34,7 @@ import { resolveBatchValue } from 'features/nodes/util/node/resolveBatchValue'; import type { UpscaleState } from 'features/parameters/store/upscaleSlice'; import { selectUpscaleSlice } from 'features/parameters/store/upscaleSlice'; import { isFlux2KleinQwen3Compatible } from 'features/parameters/util/flux2Klein'; -import { getGridSize } from 'features/parameters/util/optimalDimension'; +import { getGridSize, getPidScale } from 'features/parameters/util/optimalDimension'; import { selectActiveTab } from 'features/ui/store/uiSelectors'; import type { TabName } from 'features/ui/store/uiTypes'; import i18n from 'i18next'; @@ -287,6 +287,14 @@ export const getReasonsWhyCannotEnqueueGenerateTab = (arg: { if (!params.fluxVAE) { reasons.push({ content: i18n.t('parameters.invoke.noFLUXVAEModelSelected') }); } + if (params.pidMode !== 'off') { + if (!params.pidDecoderModel) { + reasons.push({ content: i18n.t('parameters.invoke.noPidDecoderModelSelected') }); + } + if (!params.gemma2EncoderModel) { + reasons.push({ content: i18n.t('parameters.invoke.noGemma2EncoderModelSelected') }); + } + } } if (model?.base === 'flux2' && model.format !== 'diffusers') { @@ -301,6 +309,39 @@ export const getReasonsWhyCannotEnqueueGenerateTab = (arg: { } } + if (model?.base === 'flux2' && params.pidMode !== 'off') { + // PiD decode (any FLUX.2 format) needs both a PiD decoder and the Gemma-2 caption encoder. 
+ if (!params.pidDecoderModel) { + reasons.push({ content: i18n.t('parameters.invoke.noPidDecoderModelSelected') }); + } + if (!params.gemma2EncoderModel) { + reasons.push({ content: i18n.t('parameters.invoke.noGemma2EncoderModelSelected') }); + } + } + + if (model?.base === 'sd-3' && params.pidMode !== 'off') { + // PiD decode needs both a PiD decoder and the Gemma-2 caption encoder. + if (!params.pidDecoderModel) { + reasons.push({ content: i18n.t('parameters.invoke.noPidDecoderModelSelected') }); + } + if (!params.gemma2EncoderModel) { + reasons.push({ content: i18n.t('parameters.invoke.noGemma2EncoderModelSelected') }); + } + } + + if (model?.base === 'sdxl' && params.pidMode !== 'off') { + // PiD decode needs the decoder + Gemma-2 encoder, and is not compatible with the SDXL Refiner. + if (!params.pidDecoderModel) { + reasons.push({ content: i18n.t('parameters.invoke.noPidDecoderModelSelected') }); + } + if (!params.gemma2EncoderModel) { + reasons.push({ content: i18n.t('parameters.invoke.noGemma2EncoderModelSelected') }); + } + if (params.refinerModel) { + reasons.push({ content: i18n.t('parameters.invoke.pidIncompatibleWithRefiner') }); + } + } + if (model?.base === 'qwen-image' && model.format === 'gguf_quantized') { // GGUF needs sources for VAE + encoder. Each can come from either a standalone // model or the Component Source (Diffusers). @@ -311,6 +352,16 @@ export const getReasonsWhyCannotEnqueueGenerateTab = (arg: { } } + if (model?.base === 'qwen-image' && params.pidMode !== 'off') { + // PiD decode (any Qwen-Image format) needs both a PiD decoder and the Gemma-2 caption encoder. 
+ if (!params.pidDecoderModel) { + reasons.push({ content: i18n.t('parameters.invoke.noPidDecoderModelSelected') }); + } + if (!params.gemma2EncoderModel) { + reasons.push({ content: i18n.t('parameters.invoke.noGemma2EncoderModelSelected') }); + } + } + if (model?.base === 'z-image') { // Check if VAE source is available (either separate VAE or Qwen3 Source) const hasVaeSource = params.zImageVaeModel !== null || params.zImageQwen3SourceModel !== null; @@ -322,6 +373,15 @@ export const getReasonsWhyCannotEnqueueGenerateTab = (arg: { if (!hasQwen3Source) { reasons.push({ content: i18n.t('parameters.invoke.noZImageQwen3EncoderSourceSelected') }); } + // PiD decode (Z-Image reuses the FLUX decoder) needs both a PiD decoder and the Gemma-2 caption encoder. + if (params.pidMode !== 'off') { + if (!params.pidDecoderModel) { + reasons.push({ content: i18n.t('parameters.invoke.noPidDecoderModelSelected') }); + } + if (!params.gemma2EncoderModel) { + reasons.push({ content: i18n.t('parameters.invoke.noGemma2EncoderModelSelected') }); + } + } } if (model?.base === 'anima') { @@ -571,7 +631,23 @@ export const getReasonsWhyCannotEnqueueCanvasTab = (arg: { } const { bbox } = canvas; - const gridSize = getGridSize('flux'); + // In PiD native mode the bbox is the 4x target, so it must snap to a larger grid (16 * 4) for bbox / 4 to land + // on the FLUX grid. getPidScale returns 1 for off/fit, leaving the normal 16px grid. + const gridSize = getGridSize('flux', getPidScale(params.pidMode)); + + if (params.pidMode !== 'off') { + if (!params.pidDecoderModel) { + reasons.push({ content: i18n.t('parameters.invoke.noPidDecoderModelSelected') }); + } + if (!params.gemma2EncoderModel) { + reasons.push({ content: i18n.t('parameters.invoke.noGemma2EncoderModelSelected') }); + } + // PiD decodes at 4x the generation resolution; "Scale Before Processing" would inflate the generation + // size and blow up the decode. Require it to be off (None) so generation == bbox. 
+ if (bbox.scaleMethod !== 'none') { + reasons.push({ content: i18n.t('parameters.invoke.pidScaleBeforeProcessingMustBeOff') }); + } + } if (bbox.scaleMethod === 'none') { if (bbox.rect.width % gridSize !== 0) { @@ -628,7 +704,23 @@ export const getReasonsWhyCannotEnqueueCanvasTab = (arg: { } const { bbox } = canvas; - const gridSize = getGridSize('flux'); // FLUX.2 uses same grid size as FLUX.1 + // FLUX.2 uses the same 16px grid as FLUX.1. In PiD native mode the bbox is the 4x target, so it must snap to + // a larger grid (16 * 4) for bbox / 4 to land on the FLUX grid. getPidScale returns 1 for off/fit. + const gridSize = getGridSize('flux2', getPidScale(params.pidMode)); + + if (params.pidMode !== 'off') { + if (!params.pidDecoderModel) { + reasons.push({ content: i18n.t('parameters.invoke.noPidDecoderModelSelected') }); + } + if (!params.gemma2EncoderModel) { + reasons.push({ content: i18n.t('parameters.invoke.noGemma2EncoderModelSelected') }); + } + // PiD decodes at 4x the generation resolution; "Scale Before Processing" would inflate the generation + // size and blow up the decode. Require it to be off (None) so generation == bbox. + if (bbox.scaleMethod !== 'none') { + reasons.push({ content: i18n.t('parameters.invoke.pidScaleBeforeProcessingMustBeOff') }); + } + } if (bbox.scaleMethod === 'none') { if (bbox.rect.width % gridSize !== 0) { @@ -671,6 +763,37 @@ export const getReasonsWhyCannotEnqueueCanvasTab = (arg: { } } + if (model?.base === 'sd-3' && params.pidMode !== 'off') { + // PiD decode on the Canvas: needs the decoder + Gemma-2 encoder, and "Scale Before Processing" must be off + // (PiD decodes at 4x the generation resolution; scaling would inflate the generation size and blow up the decode). 
+ if (!params.pidDecoderModel) { + reasons.push({ content: i18n.t('parameters.invoke.noPidDecoderModelSelected') }); + } + if (!params.gemma2EncoderModel) { + reasons.push({ content: i18n.t('parameters.invoke.noGemma2EncoderModelSelected') }); + } + if (canvas.bbox.scaleMethod !== 'none') { + reasons.push({ content: i18n.t('parameters.invoke.pidScaleBeforeProcessingMustBeOff') }); + } + } + + if (model?.base === 'sdxl' && params.pidMode !== 'off') { + // PiD decode on the Canvas: decoder + Gemma-2 encoder required, "Scale Before Processing" off, and not + // compatible with the SDXL Refiner. + if (!params.pidDecoderModel) { + reasons.push({ content: i18n.t('parameters.invoke.noPidDecoderModelSelected') }); + } + if (!params.gemma2EncoderModel) { + reasons.push({ content: i18n.t('parameters.invoke.noGemma2EncoderModelSelected') }); + } + if (params.refinerModel) { + reasons.push({ content: i18n.t('parameters.invoke.pidIncompatibleWithRefiner') }); + } + if (canvas.bbox.scaleMethod !== 'none') { + reasons.push({ content: i18n.t('parameters.invoke.pidScaleBeforeProcessingMustBeOff') }); + } + } + if (model?.base === 'cogview4') { const { bbox } = canvas; const gridSize = getGridSize('cogview4'); @@ -718,7 +841,21 @@ export const getReasonsWhyCannotEnqueueCanvasTab = (arg: { if (model?.base === 'qwen-image') { const { bbox } = canvas; - const gridSize = getGridSize('qwen-image'); + // In PiD native mode the bbox is the 4x target, so it must snap to a larger grid (16 * 4) for bbox / 4 to land + // on the Qwen grid. getPidScale returns 1 for off/fit, leaving the normal 16px grid. 
+ const gridSize = getGridSize('qwen-image', getPidScale(params.pidMode)); + + if (params.pidMode !== 'off') { + if (!params.pidDecoderModel) { + reasons.push({ content: i18n.t('parameters.invoke.noPidDecoderModelSelected') }); + } + if (!params.gemma2EncoderModel) { + reasons.push({ content: i18n.t('parameters.invoke.noGemma2EncoderModelSelected') }); + } + if (bbox.scaleMethod !== 'none') { + reasons.push({ content: i18n.t('parameters.invoke.pidScaleBeforeProcessingMustBeOff') }); + } + } if (bbox.scaleMethod === 'none') { if (bbox.rect.width % gridSize !== 0) { @@ -782,6 +919,18 @@ export const getReasonsWhyCannotEnqueueCanvasTab = (arg: { if (!hasQwen3Source) { reasons.push({ content: i18n.t('parameters.invoke.noZImageQwen3EncoderSourceSelected') }); } + // PiD decode on the Canvas: decoder + Gemma-2 encoder required, and "Scale Before Processing" must be off. + if (params.pidMode !== 'off') { + if (!params.pidDecoderModel) { + reasons.push({ content: i18n.t('parameters.invoke.noPidDecoderModelSelected') }); + } + if (!params.gemma2EncoderModel) { + reasons.push({ content: i18n.t('parameters.invoke.noGemma2EncoderModelSelected') }); + } + if (canvas.bbox.scaleMethod !== 'none') { + reasons.push({ content: i18n.t('parameters.invoke.pidScaleBeforeProcessingMustBeOff') }); + } + } } if (model?.base === 'anima') { diff --git a/invokeai/frontend/web/src/features/settingsAccordions/components/GenerationSettingsAccordion/GenerationSettingsAccordion.tsx b/invokeai/frontend/web/src/features/settingsAccordions/components/GenerationSettingsAccordion/GenerationSettingsAccordion.tsx index 220008a38b0..61e92080368 100644 --- a/invokeai/frontend/web/src/features/settingsAccordions/components/GenerationSettingsAccordion/GenerationSettingsAccordion.tsx +++ b/invokeai/frontend/web/src/features/settingsAccordions/components/GenerationSettingsAccordion/GenerationSettingsAccordion.tsx @@ -19,6 +19,7 @@ import { } from 'features/controlLayers/store/paramsSlice'; import { LoRAList } 
from 'features/lora/components/LoRAList'; import LoRASelect from 'features/lora/components/LoRASelect'; +import PidSettings from 'features/parameters/components/Advanced/PidSettings'; import ParamAnimaScheduler from 'features/parameters/components/Core/ParamAnimaScheduler'; import ParamCFGScale from 'features/parameters/components/Core/ParamCFGScale'; import ParamFluxDypeExponent from 'features/parameters/components/Core/ParamFluxDypeExponent'; @@ -32,6 +33,7 @@ import ParamSteps from 'features/parameters/components/Core/ParamSteps'; import ParamZImageScheduler from 'features/parameters/components/Core/ParamZImageScheduler'; import ParamZImageShift from 'features/parameters/components/Core/ParamZImageShift'; import ParamZImageSeedVarianceSettings from 'features/parameters/components/SeedVariance/ParamZImageSeedVarianceSettings'; +import { getIsPidSupportedBase } from 'features/parameters/util/pid'; import { MainModelPicker } from 'features/settingsAccordions/components/GenerationSettingsAccordion/MainModelPicker'; import { useExpanderToggle } from 'features/settingsAccordions/hooks/useExpanderToggle'; import { useStandaloneAccordionToggle } from 'features/settingsAccordions/hooks/useStandaloneAccordionToggle'; @@ -58,6 +60,8 @@ export const GenerationSettingsAccordion = memo(() => { const fluxDypePreset = useAppSelector(selectFluxDypePreset); const modelSupportsGuidance = useAppSelector(selectModelSupportsGuidance); const modelSupportsSteps = useAppSelector(selectModelSupportsSteps); + // PiD is available for any base that getPidDecoderBaseForMainBase maps to a decoder (FLUX, FLUX.2, SD3, SDXL, Qwen-Image and Z-Image). + const isPidSupported = getIsPidSupportedBase(modelConfig?.base); const hasExpanderContent = isExternal ? 
modelSupportsGuidance || modelSupportsSteps : true; const selectBadges = useMemo( @@ -120,6 +124,7 @@ export const GenerationSettingsAccordion = memo(() => { {!isExternal && isFLUX && fluxDypePreset === 'manual' && } {!isExternal && isFLUX && fluxDypePreset === 'manual' && } + {!isExternal && isPidSupported && } {!isExternal && isZImage && } diff --git a/invokeai/frontend/web/src/services/api/hooks/modelsByType.ts b/invokeai/frontend/web/src/services/api/hooks/modelsByType.ts index ca886789cea..8d5b7556908 100644 --- a/invokeai/frontend/web/src/services/api/hooks/modelsByType.ts +++ b/invokeai/frontend/web/src/services/api/hooks/modelsByType.ts @@ -23,10 +23,12 @@ import { isFluxKontextModelConfig, isFluxReduxModelConfig, isFluxVAEModelConfig, + isGemma2EncoderModelConfig, isIPAdapterModelConfig, isLLaVAModelConfig, isLoRAModelConfig, isMainOrExternalModelConfig, + isPiDDecoderModelConfig, isQwen3EncoderModelConfig, isQwenImageDiffusersMainModelConfig, isQwenImageVAEModelConfig, @@ -111,6 +113,8 @@ export const useQwenImageDiffusersModels = () => buildModelsHook(isQwenImageDiff export const useQwenImageVAEModels = () => buildModelsHook(isQwenImageVAEModelConfig)(); export const useQwenVLEncoderModels = () => buildModelsHook(isQwenVLEncoderModelConfig)(); export const useQwen3EncoderModels = () => buildModelsHook(isQwen3EncoderModelConfig)(); +export const usePiDDecoderModels = buildModelsHook(isPiDDecoderModelConfig); +export const useGemma2EncoderModels = () => buildModelsHook(isGemma2EncoderModelConfig)(); export const useGlobalReferenceImageModels = buildModelsHook( (config) => isIPAdapterModelConfig(config) || isFluxReduxModelConfig(config) || isFluxKontextModelConfig(config) ); diff --git a/invokeai/frontend/web/src/services/api/schema.ts b/invokeai/frontend/web/src/services/api/schema.ts index 0bcfbb49106..de8d63d29fd 100644 --- a/invokeai/frontend/web/src/services/api/schema.ts +++ b/invokeai/frontend/web/src/services/api/schema.ts @@ -3615,7 +3615,7 @@ 
export type components = { */ type: "anima_text_encoder"; }; - AnyModelConfig: components["schemas"]["Main_Diffusers_SD1_Config"] | components["schemas"]["Main_Diffusers_SD2_Config"] | components["schemas"]["Main_Diffusers_SDXL_Config"] | components["schemas"]["Main_Diffusers_SDXLRefiner_Config"] | components["schemas"]["Main_Diffusers_SD3_Config"] | components["schemas"]["Main_Diffusers_FLUX_Config"] | components["schemas"]["Main_Diffusers_Flux2_Config"] | components["schemas"]["Main_Diffusers_CogView4_Config"] | components["schemas"]["Main_Diffusers_QwenImage_Config"] | components["schemas"]["Main_Diffusers_ZImage_Config"] | components["schemas"]["Main_Checkpoint_SD1_Config"] | components["schemas"]["Main_Checkpoint_SD2_Config"] | components["schemas"]["Main_Checkpoint_SDXL_Config"] | components["schemas"]["Main_Checkpoint_SDXLRefiner_Config"] | components["schemas"]["Main_Checkpoint_Flux2_Config"] | components["schemas"]["Main_Checkpoint_FLUX_Config"] | components["schemas"]["Main_Checkpoint_QwenImage_Config"] | components["schemas"]["Main_Checkpoint_ZImage_Config"] | components["schemas"]["Main_Checkpoint_Anima_Config"] | components["schemas"]["Main_BnBNF4_FLUX_Config"] | components["schemas"]["Main_GGUF_Flux2_Config"] | components["schemas"]["Main_GGUF_FLUX_Config"] | components["schemas"]["Main_GGUF_QwenImage_Config"] | components["schemas"]["Main_GGUF_ZImage_Config"] | components["schemas"]["VAE_Checkpoint_SD1_Config"] | components["schemas"]["VAE_Checkpoint_SD2_Config"] | components["schemas"]["VAE_Checkpoint_SDXL_Config"] | components["schemas"]["VAE_Checkpoint_FLUX_Config"] | components["schemas"]["VAE_Checkpoint_Flux2_Config"] | components["schemas"]["VAE_Checkpoint_QwenImage_Config"] | components["schemas"]["VAE_Checkpoint_Anima_Config"] | components["schemas"]["VAE_Diffusers_SD1_Config"] | components["schemas"]["VAE_Diffusers_SDXL_Config"] | components["schemas"]["VAE_Diffusers_Flux2_Config"] | components["schemas"]["ControlNet_Checkpoint_SD1_Config"] 
| components["schemas"]["ControlNet_Checkpoint_SD2_Config"] | components["schemas"]["ControlNet_Checkpoint_SDXL_Config"] | components["schemas"]["ControlNet_Checkpoint_FLUX_Config"] | components["schemas"]["ControlNet_Checkpoint_ZImage_Config"] | components["schemas"]["ControlNet_Diffusers_SD1_Config"] | components["schemas"]["ControlNet_Diffusers_SD2_Config"] | components["schemas"]["ControlNet_Diffusers_SDXL_Config"] | components["schemas"]["ControlNet_Diffusers_FLUX_Config"] | components["schemas"]["LoRA_LyCORIS_SD1_Config"] | components["schemas"]["LoRA_LyCORIS_SD2_Config"] | components["schemas"]["LoRA_LyCORIS_SDXL_Config"] | components["schemas"]["LoRA_LyCORIS_Flux2_Config"] | components["schemas"]["LoRA_LyCORIS_FLUX_Config"] | components["schemas"]["LoRA_LyCORIS_ZImage_Config"] | components["schemas"]["LoRA_LyCORIS_QwenImage_Config"] | components["schemas"]["LoRA_LyCORIS_Anima_Config"] | components["schemas"]["LoRA_OMI_SDXL_Config"] | components["schemas"]["LoRA_OMI_FLUX_Config"] | components["schemas"]["LoRA_Diffusers_SD1_Config"] | components["schemas"]["LoRA_Diffusers_SD2_Config"] | components["schemas"]["LoRA_Diffusers_SDXL_Config"] | components["schemas"]["LoRA_Diffusers_Flux2_Config"] | components["schemas"]["LoRA_Diffusers_FLUX_Config"] | components["schemas"]["LoRA_Diffusers_ZImage_Config"] | components["schemas"]["ControlLoRA_LyCORIS_FLUX_Config"] | components["schemas"]["T5Encoder_T5Encoder_Config"] | components["schemas"]["T5Encoder_BnBLLMint8_Config"] | components["schemas"]["Qwen3Encoder_Qwen3Encoder_Config"] | components["schemas"]["Qwen3Encoder_Checkpoint_Config"] | components["schemas"]["Qwen3Encoder_GGUF_Config"] | components["schemas"]["QwenVLEncoder_Diffusers_Config"] | components["schemas"]["QwenVLEncoder_Checkpoint_Config"] | components["schemas"]["TI_File_SD1_Config"] | components["schemas"]["TI_File_SD2_Config"] | components["schemas"]["TI_File_SDXL_Config"] | components["schemas"]["TI_Folder_SD1_Config"] | 
components["schemas"]["TI_Folder_SD2_Config"] | components["schemas"]["TI_Folder_SDXL_Config"] | components["schemas"]["IPAdapter_InvokeAI_SD1_Config"] | components["schemas"]["IPAdapter_InvokeAI_SD2_Config"] | components["schemas"]["IPAdapter_InvokeAI_SDXL_Config"] | components["schemas"]["IPAdapter_Checkpoint_SD1_Config"] | components["schemas"]["IPAdapter_Checkpoint_SD2_Config"] | components["schemas"]["IPAdapter_Checkpoint_SDXL_Config"] | components["schemas"]["IPAdapter_Checkpoint_FLUX_Config"] | components["schemas"]["T2IAdapter_Diffusers_SD1_Config"] | components["schemas"]["T2IAdapter_Diffusers_SDXL_Config"] | components["schemas"]["Spandrel_Checkpoint_Config"] | components["schemas"]["CLIPEmbed_Diffusers_G_Config"] | components["schemas"]["CLIPEmbed_Diffusers_L_Config"] | components["schemas"]["CLIPVision_Diffusers_Config"] | components["schemas"]["SigLIP_Diffusers_Config"] | components["schemas"]["FLUXRedux_Checkpoint_Config"] | components["schemas"]["LlavaOnevision_Diffusers_Config"] | components["schemas"]["TextLLM_Diffusers_Config"] | components["schemas"]["ExternalApiModelConfig"] | components["schemas"]["Unknown_Config"]; + AnyModelConfig: components["schemas"]["Main_Diffusers_SD1_Config"] | components["schemas"]["Main_Diffusers_SD2_Config"] | components["schemas"]["Main_Diffusers_SDXL_Config"] | components["schemas"]["Main_Diffusers_SDXLRefiner_Config"] | components["schemas"]["Main_Diffusers_SD3_Config"] | components["schemas"]["Main_Diffusers_FLUX_Config"] | components["schemas"]["Main_Diffusers_Flux2_Config"] | components["schemas"]["Main_Diffusers_CogView4_Config"] | components["schemas"]["Main_Diffusers_QwenImage_Config"] | components["schemas"]["Main_Diffusers_ZImage_Config"] | components["schemas"]["Main_Checkpoint_SD1_Config"] | components["schemas"]["Main_Checkpoint_SD2_Config"] | components["schemas"]["Main_Checkpoint_SDXL_Config"] | components["schemas"]["Main_Checkpoint_SDXLRefiner_Config"] | 
components["schemas"]["Main_Checkpoint_Flux2_Config"] | components["schemas"]["Main_Checkpoint_FLUX_Config"] | components["schemas"]["Main_Checkpoint_QwenImage_Config"] | components["schemas"]["Main_Checkpoint_ZImage_Config"] | components["schemas"]["Main_Checkpoint_Anima_Config"] | components["schemas"]["Main_BnBNF4_FLUX_Config"] | components["schemas"]["Main_GGUF_Flux2_Config"] | components["schemas"]["Main_GGUF_FLUX_Config"] | components["schemas"]["Main_GGUF_QwenImage_Config"] | components["schemas"]["Main_GGUF_ZImage_Config"] | components["schemas"]["VAE_Checkpoint_SD1_Config"] | components["schemas"]["VAE_Checkpoint_SD2_Config"] | components["schemas"]["VAE_Checkpoint_SDXL_Config"] | components["schemas"]["VAE_Checkpoint_FLUX_Config"] | components["schemas"]["VAE_Checkpoint_Flux2_Config"] | components["schemas"]["VAE_Checkpoint_QwenImage_Config"] | components["schemas"]["VAE_Checkpoint_Anima_Config"] | components["schemas"]["VAE_Diffusers_SD1_Config"] | components["schemas"]["VAE_Diffusers_SDXL_Config"] | components["schemas"]["VAE_Diffusers_Flux2_Config"] | components["schemas"]["PiDDecoder_Checkpoint_FLUX_Config"] | components["schemas"]["PiDDecoder_Checkpoint_Flux2_Config"] | components["schemas"]["PiDDecoder_Checkpoint_SD3_Config"] | components["schemas"]["PiDDecoder_Checkpoint_SDXL_Config"] | components["schemas"]["PiDDecoder_Checkpoint_QwenImage_Config"] | components["schemas"]["ControlNet_Checkpoint_SD1_Config"] | components["schemas"]["ControlNet_Checkpoint_SD2_Config"] | components["schemas"]["ControlNet_Checkpoint_SDXL_Config"] | components["schemas"]["ControlNet_Checkpoint_FLUX_Config"] | components["schemas"]["ControlNet_Checkpoint_ZImage_Config"] | components["schemas"]["ControlNet_Diffusers_SD1_Config"] | components["schemas"]["ControlNet_Diffusers_SD2_Config"] | components["schemas"]["ControlNet_Diffusers_SDXL_Config"] | components["schemas"]["ControlNet_Diffusers_FLUX_Config"] | components["schemas"]["LoRA_LyCORIS_SD1_Config"] | 
components["schemas"]["LoRA_LyCORIS_SD2_Config"] | components["schemas"]["LoRA_LyCORIS_SDXL_Config"] | components["schemas"]["LoRA_LyCORIS_Flux2_Config"] | components["schemas"]["LoRA_LyCORIS_FLUX_Config"] | components["schemas"]["LoRA_LyCORIS_ZImage_Config"] | components["schemas"]["LoRA_LyCORIS_QwenImage_Config"] | components["schemas"]["LoRA_LyCORIS_Anima_Config"] | components["schemas"]["LoRA_OMI_SDXL_Config"] | components["schemas"]["LoRA_OMI_FLUX_Config"] | components["schemas"]["LoRA_Diffusers_SD1_Config"] | components["schemas"]["LoRA_Diffusers_SD2_Config"] | components["schemas"]["LoRA_Diffusers_SDXL_Config"] | components["schemas"]["LoRA_Diffusers_Flux2_Config"] | components["schemas"]["LoRA_Diffusers_FLUX_Config"] | components["schemas"]["LoRA_Diffusers_ZImage_Config"] | components["schemas"]["ControlLoRA_LyCORIS_FLUX_Config"] | components["schemas"]["T5Encoder_T5Encoder_Config"] | components["schemas"]["T5Encoder_BnBLLMint8_Config"] | components["schemas"]["Qwen3Encoder_Qwen3Encoder_Config"] | components["schemas"]["Qwen3Encoder_Checkpoint_Config"] | components["schemas"]["Qwen3Encoder_GGUF_Config"] | components["schemas"]["Gemma2Encoder_Gemma2Encoder_Config"] | components["schemas"]["QwenVLEncoder_Diffusers_Config"] | components["schemas"]["QwenVLEncoder_Checkpoint_Config"] | components["schemas"]["TI_File_SD1_Config"] | components["schemas"]["TI_File_SD2_Config"] | components["schemas"]["TI_File_SDXL_Config"] | components["schemas"]["TI_Folder_SD1_Config"] | components["schemas"]["TI_Folder_SD2_Config"] | components["schemas"]["TI_Folder_SDXL_Config"] | components["schemas"]["IPAdapter_InvokeAI_SD1_Config"] | components["schemas"]["IPAdapter_InvokeAI_SD2_Config"] | components["schemas"]["IPAdapter_InvokeAI_SDXL_Config"] | components["schemas"]["IPAdapter_Checkpoint_SD1_Config"] | components["schemas"]["IPAdapter_Checkpoint_SD2_Config"] | components["schemas"]["IPAdapter_Checkpoint_SDXL_Config"] | 
components["schemas"]["IPAdapter_Checkpoint_FLUX_Config"] | components["schemas"]["T2IAdapter_Diffusers_SD1_Config"] | components["schemas"]["T2IAdapter_Diffusers_SDXL_Config"] | components["schemas"]["Spandrel_Checkpoint_Config"] | components["schemas"]["CLIPEmbed_Diffusers_G_Config"] | components["schemas"]["CLIPEmbed_Diffusers_L_Config"] | components["schemas"]["CLIPVision_Diffusers_Config"] | components["schemas"]["SigLIP_Diffusers_Config"] | components["schemas"]["FLUXRedux_Checkpoint_Config"] | components["schemas"]["LlavaOnevision_Diffusers_Config"] | components["schemas"]["TextLLM_Diffusers_Config"] | components["schemas"]["ExternalApiModelConfig"] | components["schemas"]["Unknown_Config"]; /** * AppVersion * @description App Version Response @@ -10629,6 +10629,91 @@ export type components = { */ type: "flux2_klein_text_encoder"; }; + /** + * Latents to Image - FLUX.2 + PiD (4x SR) + * @description Decode a FLUX.2 Klein latent with the PiD pixel-diffusion decoder. + * + * Produces a 4x super-resolved image in a single pass. The stored FLUX.2 latent + * is patchified from ``(B, 32, H/8, W/8)`` to the ``(B, 128, H/16, W/16)`` layout + * PiD's FLUX.2 backbone expects, then decoded directly (it is already in raw, + * BN-denormalized space; see the module docstring). + */ + Flux2PiDDecodeInvocation: { + /** + * @description The board to save the image to + * @default null + */ + board?: components["schemas"]["BoardField"] | null; + /** + * @description Optional metadata to be saved with the image + * @default null + */ + metadata?: components["schemas"]["MetadataField"] | null; + /** + * Id + * @description The id of this instance of an invocation. Must be unique among all instances of invocations. + */ + id: string; + /** + * Is Intermediate + * @description Whether or not this is an intermediate invocation. 
+ * @default false + */ + is_intermediate?: boolean; + /** + * Use Cache + * @description Whether or not to use the cache + * @default true + */ + use_cache?: boolean; + /** + * @description Latents tensor + * @default null + */ + latents?: components["schemas"]["LatentsField"] | null; + /** + * Prompt + * @description Text prompt the latent was generated from. PiD conditions on it. + * @default null + */ + prompt?: string | null; + /** + * Gemma-2 Encoder + * @description Gemma-2 caption encoder. Required by PiD. + * @default null + */ + gemma2_encoder?: components["schemas"]["Gemma2EncoderField"] | null; + /** + * PiD Decoder + * @description PiD FLUX.2 decoder checkpoint. + * @default null + */ + pid_decoder?: components["schemas"]["PiDDecoderField"] | null; + /** + * VAE + * @description FLUX.2 VAE, used only to read a scalar scaling_factor / shift_factor if one exists. FLUX.2 normalises latents with BatchNorm (already inverted in flux2_denoise), so this is normally an identity transform and the input can be left unconnected. + * @default null + */ + vae?: components["schemas"]["VAEField"] | null; + /** + * Num Inference Steps + * @description Number of PiD distill steps. The released checkpoints are trained for 4. + * @default 4 + */ + num_inference_steps?: number; + /** + * Seed + * @description Seed for the PiD decoder's noise. + * @default 0 + */ + seed?: number; + /** + * type + * @default flux2_pid_decode + * @constant + */ + type: "flux2_pid_decode"; + }; /** * Latents to Image - FLUX2 * @description Generates an image from latents using FLUX.2 Klein's 32-channel VAE. @@ -11770,6 +11855,84 @@ export type components = { */ type: "flux_model_loader_output"; }; + /** + * Latents to Image - FLUX + PiD (4x SR) + * @description Decode a FLUX latent with the PiD pixel-diffusion decoder. 
+ * + * The FLUX AutoEncoder usually denormalises the stored latent internally + * before its conv decoder runs (`z / scale + shift`); we apply the same + * transform manually here so PiD sees the raw latent it was trained on. + */ + FluxPiDDecodeInvocation: { + /** + * @description The board to save the image to + * @default null + */ + board?: components["schemas"]["BoardField"] | null; + /** + * @description Optional metadata to be saved with the image + * @default null + */ + metadata?: components["schemas"]["MetadataField"] | null; + /** + * Id + * @description The id of this instance of an invocation. Must be unique among all instances of invocations. + */ + id: string; + /** + * Is Intermediate + * @description Whether or not this is an intermediate invocation. + * @default false + */ + is_intermediate?: boolean; + /** + * Use Cache + * @description Whether or not to use the cache + * @default true + */ + use_cache?: boolean; + /** + * @description Latents tensor + * @default null + */ + latents?: components["schemas"]["LatentsField"] | null; + /** + * Prompt + * @description Text prompt the latent was generated from. PiD conditions on it. + * @default null + */ + prompt?: string | null; + /** + * Gemma-2 Encoder + * @description Gemma-2 caption encoder. Required by PiD. + * @default null + */ + gemma2_encoder?: components["schemas"]["Gemma2EncoderField"] | null; + /** + * PiD Decoder + * @description PiD FLUX decoder checkpoint. + * @default null + */ + pid_decoder?: components["schemas"]["PiDDecoderField"] | null; + /** + * Num Inference Steps + * @description Number of PiD distill steps. The released checkpoints are trained for 4. + * @default 4 + */ + num_inference_steps?: number; + /** + * Seed + * @description Seed for the PiD decoder's noise. 
+ * @default 0 + */ + seed?: number; + /** + * type + * @default flux_pid_decode + * @constant + */ + type: "flux_pid_decode"; + }; /** * FluxReduxConditioningField * @description A FLUX Redux conditioning tensor primitive value @@ -12235,6 +12398,155 @@ export type components = { */ type: "gemini_image_generation"; }; + /** + * Gemma2EncoderField + * @description Field for the Gemma-2 text encoder used by PiD decoders. + */ + Gemma2EncoderField: { + /** @description Info to load tokenizer submodel */ + tokenizer: components["schemas"]["ModelIdentifierField"]; + /** @description Info to load text_encoder submodel */ + text_encoder: components["schemas"]["ModelIdentifierField"]; + }; + /** + * Gemma-2 Encoder - PiD + * @description Loads a Gemma-2 causal LM directory and exposes its tokenizer + decoder + * submodels for use by a PiD decode node. + */ + Gemma2EncoderLoaderInvocation: { + /** + * Id + * @description The id of this instance of an invocation. Must be unique among all instances of invocations. + */ + id: string; + /** + * Is Intermediate + * @description Whether or not this is an intermediate invocation. + * @default false + */ + is_intermediate?: boolean; + /** + * Use Cache + * @description Whether or not to use the cache + * @default true + */ + use_cache?: boolean; + /** + * Gemma-2 + * @description Gemma-2 model used to encode captions for PiD decoders. 
+ * @default null + */ + gemma2_model?: components["schemas"]["ModelIdentifierField"] | null; + /** + * type + * @default gemma2_encoder_loader + * @constant + */ + type: "gemma2_encoder_loader"; + }; + /** Gemma2EncoderOutput */ + Gemma2EncoderOutput: { + /** + * Gemma-2 Encoder + * @description Gemma-2 text encoder used by PiD decoders + */ + gemma2_encoder: components["schemas"]["Gemma2EncoderField"]; + /** + * type + * @default gemma2_encoder_output + * @constant + */ + type: "gemma2_encoder_output"; + }; + /** + * Gemma2Encoder_Gemma2Encoder_Config + * @description Standalone Gemma-2 causal LM directory used as a text encoder by PiD. + * + * Expected directory layout (HuggingFace `from_pretrained`-compatible):: + * + * / + * config.json # architectures: ["Gemma2ForCausalLM"] + * tokenizer.json + * tokenizer_config.json + * model-*.safetensors # or model.safetensors / *.bin + */ + Gemma2Encoder_Gemma2Encoder_Config: { + /** + * Key + * @description A unique key for this model. + */ + key: string; + /** + * Hash + * @description The hash of the model file(s). + */ + hash: string; + /** + * Path + * @description Path to the model on the filesystem. Relative paths are relative to the Invoke root directory. + */ + path: string; + /** + * File Size + * @description The size of the model in bytes. + */ + file_size: number; + /** + * Name + * @description Name of the model. + */ + name: string; + /** + * Description + * @description Model description + */ + description: string | null; + /** + * Source + * @description The original source of the model (path, URL or repo_id). + */ + source: string; + /** @description The type of source */ + source_type: components["schemas"]["ModelSourceType"]; + /** + * Source Api Response + * @description The original API response from the source, as stringified JSON. + */ + source_api_response: string | null; + /** + * Source Url + * @description Optional URL for the model (e.g. download page or model page). 
+ */ + source_url: string | null; + /** + * Cover Image + * @description Url for image to preview model + */ + cover_image: string | null; + /** + * Base + * @default any + * @constant + */ + base: "any"; + /** + * Type + * @default gemma2_encoder + * @constant + */ + type: "gemma2_encoder"; + /** + * Format + * @default gemma2_encoder + * @constant + */ + format: "gemma2_encoder"; + /** + * Cpu Only + * @description Whether this model should run on CPU only + */ + cpu_only: boolean | null; + }; /** * GeneratePasswordResponse * @description Response containing a generated password. @@ -12334,7 +12646,7 @@ export type components = { * @description The nodes in this graph */ nodes?: { - [key: string]: components["schemas"]["AddInvocation"] | components["schemas"]["AlibabaCloudImageGenerationInvocation"] | components["schemas"]["AlphaMaskToTensorInvocation"] | components["schemas"]["AnimaDenoiseInvocation"] | components["schemas"]["AnimaImageToLatentsInvocation"] | components["schemas"]["AnimaLatentsToImageInvocation"] | components["schemas"]["AnimaLoRACollectionLoader"] | components["schemas"]["AnimaLoRALoaderInvocation"] | components["schemas"]["AnimaModelLoaderInvocation"] | components["schemas"]["AnimaTextEncoderInvocation"] | components["schemas"]["ApplyMaskTensorToImageInvocation"] | components["schemas"]["ApplyMaskToImageInvocation"] | components["schemas"]["BlankImageInvocation"] | components["schemas"]["BlendLatentsInvocation"] | components["schemas"]["BooleanCollectionInvocation"] | components["schemas"]["BooleanInvocation"] | components["schemas"]["BoundingBoxInvocation"] | components["schemas"]["CLIPSkipInvocation"] | components["schemas"]["CV2InfillInvocation"] | components["schemas"]["CalculateImageTilesEvenSplitInvocation"] | components["schemas"]["CalculateImageTilesInvocation"] | components["schemas"]["CalculateImageTilesMinimumOverlapInvocation"] | components["schemas"]["CannyEdgeDetectionInvocation"] | components["schemas"]["CanvasOutputInvocation"] 
| components["schemas"]["CanvasPasteBackInvocation"] | components["schemas"]["CanvasV2MaskAndCropInvocation"] | components["schemas"]["CenterPadCropInvocation"] | components["schemas"]["CogView4DenoiseInvocation"] | components["schemas"]["CogView4ImageToLatentsInvocation"] | components["schemas"]["CogView4LatentsToImageInvocation"] | components["schemas"]["CogView4ModelLoaderInvocation"] | components["schemas"]["CogView4TextEncoderInvocation"] | components["schemas"]["CollectInvocation"] | components["schemas"]["ColorCorrectInvocation"] | components["schemas"]["ColorInvocation"] | components["schemas"]["ColorMapInvocation"] | components["schemas"]["CompelInvocation"] | components["schemas"]["ConditioningCollectionInvocation"] | components["schemas"]["ConditioningInvocation"] | components["schemas"]["ContentShuffleInvocation"] | components["schemas"]["ControlNetInvocation"] | components["schemas"]["CoreMetadataInvocation"] | components["schemas"]["CreateDenoiseMaskInvocation"] | components["schemas"]["CreateGradientMaskInvocation"] | components["schemas"]["CropImageToBoundingBoxInvocation"] | components["schemas"]["CropLatentsCoreInvocation"] | components["schemas"]["CvInpaintInvocation"] | components["schemas"]["DWOpenposeDetectionInvocation"] | components["schemas"]["DecodeInvisibleWatermarkInvocation"] | components["schemas"]["DenoiseLatentsInvocation"] | components["schemas"]["DenoiseLatentsMetaInvocation"] | components["schemas"]["DepthAnythingDepthEstimationInvocation"] | components["schemas"]["DivideInvocation"] | components["schemas"]["DynamicPromptInvocation"] | components["schemas"]["ESRGANInvocation"] | components["schemas"]["ExpandMaskWithFadeInvocation"] | components["schemas"]["FLUXLoRACollectionLoader"] | components["schemas"]["FaceIdentifierInvocation"] | components["schemas"]["FaceMaskInvocation"] | components["schemas"]["FaceOffInvocation"] | components["schemas"]["FloatBatchInvocation"] | components["schemas"]["FloatCollectionInvocation"] | 
components["schemas"]["FloatGenerator"] | components["schemas"]["FloatInvocation"] | components["schemas"]["FloatLinearRangeInvocation"] | components["schemas"]["FloatMathInvocation"] | components["schemas"]["FloatToIntegerInvocation"] | components["schemas"]["Flux2DenoiseInvocation"] | components["schemas"]["Flux2KleinLoRACollectionLoader"] | components["schemas"]["Flux2KleinLoRALoaderInvocation"] | components["schemas"]["Flux2KleinModelLoaderInvocation"] | components["schemas"]["Flux2KleinTextEncoderInvocation"] | components["schemas"]["Flux2VaeDecodeInvocation"] | components["schemas"]["Flux2VaeEncodeInvocation"] | components["schemas"]["FluxControlLoRALoaderInvocation"] | components["schemas"]["FluxControlNetInvocation"] | components["schemas"]["FluxDenoiseInvocation"] | components["schemas"]["FluxDenoiseLatentsMetaInvocation"] | components["schemas"]["FluxFillInvocation"] | components["schemas"]["FluxIPAdapterInvocation"] | components["schemas"]["FluxKontextConcatenateImagesInvocation"] | components["schemas"]["FluxKontextInvocation"] | components["schemas"]["FluxLoRALoaderInvocation"] | components["schemas"]["FluxModelLoaderInvocation"] | components["schemas"]["FluxReduxInvocation"] | components["schemas"]["FluxTextEncoderInvocation"] | components["schemas"]["FluxVaeDecodeInvocation"] | components["schemas"]["FluxVaeEncodeInvocation"] | components["schemas"]["FreeUInvocation"] | components["schemas"]["GeminiImageGenerationInvocation"] | components["schemas"]["GetMaskBoundingBoxInvocation"] | components["schemas"]["GroundingDinoInvocation"] | components["schemas"]["HEDEdgeDetectionInvocation"] | components["schemas"]["HeuristicResizeInvocation"] | components["schemas"]["IPAdapterInvocation"] | components["schemas"]["IdealSizeInvocation"] | components["schemas"]["IfInvocation"] | components["schemas"]["ImageBatchInvocation"] | components["schemas"]["ImageBlurInvocation"] | components["schemas"]["ImageChannelInvocation"] | 
components["schemas"]["ImageChannelMultiplyInvocation"] | components["schemas"]["ImageChannelOffsetInvocation"] | components["schemas"]["ImageCollectionInvocation"] | components["schemas"]["ImageConvertInvocation"] | components["schemas"]["ImageCropInvocation"] | components["schemas"]["ImageGenerator"] | components["schemas"]["ImageHueAdjustmentInvocation"] | components["schemas"]["ImageInverseLerpInvocation"] | components["schemas"]["ImageInvocation"] | components["schemas"]["ImageLerpInvocation"] | components["schemas"]["ImageMaskToTensorInvocation"] | components["schemas"]["ImageMultiplyInvocation"] | components["schemas"]["ImageNSFWBlurInvocation"] | components["schemas"]["ImageNoiseInvocation"] | components["schemas"]["ImagePanelLayoutInvocation"] | components["schemas"]["ImagePasteInvocation"] | components["schemas"]["ImageResizeInvocation"] | components["schemas"]["ImageScaleInvocation"] | components["schemas"]["ImageToLatentsInvocation"] | components["schemas"]["ImageWatermarkInvocation"] | components["schemas"]["InfillColorInvocation"] | components["schemas"]["InfillPatchMatchInvocation"] | components["schemas"]["InfillTileInvocation"] | components["schemas"]["IntegerBatchInvocation"] | components["schemas"]["IntegerCollectionInvocation"] | components["schemas"]["IntegerGenerator"] | components["schemas"]["IntegerInvocation"] | components["schemas"]["IntegerMathInvocation"] | components["schemas"]["InvertTensorMaskInvocation"] | components["schemas"]["InvokeAdjustImageHuePlusInvocation"] | components["schemas"]["InvokeEquivalentAchromaticLightnessInvocation"] | components["schemas"]["InvokeImageBlendInvocation"] | components["schemas"]["InvokeImageCompositorInvocation"] | components["schemas"]["InvokeImageDilateOrErodeInvocation"] | components["schemas"]["InvokeImageEnhanceInvocation"] | components["schemas"]["InvokeImageValueThresholdsInvocation"] | components["schemas"]["IterateInvocation"] | components["schemas"]["LaMaInfillInvocation"] | 
components["schemas"]["LatentsCollectionInvocation"] | components["schemas"]["LatentsInvocation"] | components["schemas"]["LatentsToImageInvocation"] | components["schemas"]["LineartAnimeEdgeDetectionInvocation"] | components["schemas"]["LineartEdgeDetectionInvocation"] | components["schemas"]["LlavaOnevisionVllmInvocation"] | components["schemas"]["LoRACollectionLoader"] | components["schemas"]["LoRALoaderInvocation"] | components["schemas"]["LoRASelectorInvocation"] | components["schemas"]["MLSDDetectionInvocation"] | components["schemas"]["MainModelLoaderInvocation"] | components["schemas"]["MaskCombineInvocation"] | components["schemas"]["MaskEdgeInvocation"] | components["schemas"]["MaskFromAlphaInvocation"] | components["schemas"]["MaskFromIDInvocation"] | components["schemas"]["MaskTensorToImageInvocation"] | components["schemas"]["MediaPipeFaceDetectionInvocation"] | components["schemas"]["MergeMetadataInvocation"] | components["schemas"]["MergeTilesToImageInvocation"] | components["schemas"]["MetadataFieldExtractorInvocation"] | components["schemas"]["MetadataFromImageInvocation"] | components["schemas"]["MetadataInvocation"] | components["schemas"]["MetadataItemInvocation"] | components["schemas"]["MetadataItemLinkedInvocation"] | components["schemas"]["MetadataToBoolCollectionInvocation"] | components["schemas"]["MetadataToBoolInvocation"] | components["schemas"]["MetadataToControlnetsInvocation"] | components["schemas"]["MetadataToFloatCollectionInvocation"] | components["schemas"]["MetadataToFloatInvocation"] | components["schemas"]["MetadataToIPAdaptersInvocation"] | components["schemas"]["MetadataToIntegerCollectionInvocation"] | components["schemas"]["MetadataToIntegerInvocation"] | components["schemas"]["MetadataToLorasCollectionInvocation"] | components["schemas"]["MetadataToLorasInvocation"] | components["schemas"]["MetadataToModelInvocation"] | components["schemas"]["MetadataToSDXLLorasInvocation"] | 
components["schemas"]["MetadataToSDXLModelInvocation"] | components["schemas"]["MetadataToSchedulerInvocation"] | components["schemas"]["MetadataToStringCollectionInvocation"] | components["schemas"]["MetadataToStringInvocation"] | components["schemas"]["MetadataToT2IAdaptersInvocation"] | components["schemas"]["MetadataToVAEInvocation"] | components["schemas"]["ModelIdentifierInvocation"] | components["schemas"]["MultiplyInvocation"] | components["schemas"]["NoiseInvocation"] | components["schemas"]["NormalMapInvocation"] | components["schemas"]["OklabUnsharpMaskInvocation"] | components["schemas"]["OklchImageHueAdjustmentInvocation"] | components["schemas"]["OpenAIImageGenerationInvocation"] | components["schemas"]["PBRMapsInvocation"] | components["schemas"]["PairTileImageInvocation"] | components["schemas"]["PasteImageIntoBoundingBoxInvocation"] | components["schemas"]["PiDiNetEdgeDetectionInvocation"] | components["schemas"]["PromptTemplateInvocation"] | components["schemas"]["PromptsFromFileInvocation"] | components["schemas"]["QwenImageDenoiseInvocation"] | components["schemas"]["QwenImageImageToLatentsInvocation"] | components["schemas"]["QwenImageLatentsToImageInvocation"] | components["schemas"]["QwenImageLoRACollectionLoader"] | components["schemas"]["QwenImageLoRALoaderInvocation"] | components["schemas"]["QwenImageModelLoaderInvocation"] | components["schemas"]["QwenImageTextEncoderInvocation"] | components["schemas"]["RandomFloatInvocation"] | components["schemas"]["RandomIntInvocation"] | components["schemas"]["RandomRangeInvocation"] | components["schemas"]["RangeInvocation"] | components["schemas"]["RangeOfSizeInvocation"] | components["schemas"]["RectangleMaskInvocation"] | components["schemas"]["ResizeLatentsInvocation"] | components["schemas"]["RoundInvocation"] | components["schemas"]["SD3DenoiseInvocation"] | components["schemas"]["SD3ImageToLatentsInvocation"] | components["schemas"]["SD3LatentsToImageInvocation"] | 
components["schemas"]["SDXLCompelPromptInvocation"] | components["schemas"]["SDXLLoRACollectionLoader"] | components["schemas"]["SDXLLoRALoaderInvocation"] | components["schemas"]["SDXLModelLoaderInvocation"] | components["schemas"]["SDXLRefinerCompelPromptInvocation"] | components["schemas"]["SDXLRefinerModelLoaderInvocation"] | components["schemas"]["SaveImageInvocation"] | components["schemas"]["SaveImageToFileInvocation"] | components["schemas"]["ScaleLatentsInvocation"] | components["schemas"]["SchedulerInvocation"] | components["schemas"]["Sd3ModelLoaderInvocation"] | components["schemas"]["Sd3TextEncoderInvocation"] | components["schemas"]["SeamlessModeInvocation"] | components["schemas"]["SeedreamImageGenerationInvocation"] | components["schemas"]["SegmentAnythingInvocation"] | components["schemas"]["ShowImageInvocation"] | components["schemas"]["SpandrelImageToImageAutoscaleInvocation"] | components["schemas"]["SpandrelImageToImageInvocation"] | components["schemas"]["StringBatchInvocation"] | components["schemas"]["StringCollectionInvocation"] | components["schemas"]["StringGenerator"] | components["schemas"]["StringInvocation"] | components["schemas"]["StringJoinInvocation"] | components["schemas"]["StringJoinThreeInvocation"] | components["schemas"]["StringReplaceInvocation"] | components["schemas"]["StringSplitInvocation"] | components["schemas"]["StringSplitNegInvocation"] | components["schemas"]["SubtractInvocation"] | components["schemas"]["T2IAdapterInvocation"] | components["schemas"]["TextLLMInvocation"] | components["schemas"]["TileToPropertiesInvocation"] | components["schemas"]["TiledMultiDiffusionDenoiseLatents"] | components["schemas"]["UnsharpMaskInvocation"] | components["schemas"]["VAELoaderInvocation"] | components["schemas"]["ZImageControlInvocation"] | components["schemas"]["ZImageDenoiseInvocation"] | components["schemas"]["ZImageDenoiseMetaInvocation"] | components["schemas"]["ZImageImageToLatentsInvocation"] | 
components["schemas"]["ZImageLatentsToImageInvocation"] | components["schemas"]["ZImageLoRACollectionLoader"] | components["schemas"]["ZImageLoRALoaderInvocation"] | components["schemas"]["ZImageModelLoaderInvocation"] | components["schemas"]["ZImageSeedVarianceEnhancerInvocation"] | components["schemas"]["ZImageTextEncoderInvocation"]; + [key: string]: components["schemas"]["AddInvocation"] | components["schemas"]["AlibabaCloudImageGenerationInvocation"] | components["schemas"]["AlphaMaskToTensorInvocation"] | components["schemas"]["AnimaDenoiseInvocation"] | components["schemas"]["AnimaImageToLatentsInvocation"] | components["schemas"]["AnimaLatentsToImageInvocation"] | components["schemas"]["AnimaLoRACollectionLoader"] | components["schemas"]["AnimaLoRALoaderInvocation"] | components["schemas"]["AnimaModelLoaderInvocation"] | components["schemas"]["AnimaTextEncoderInvocation"] | components["schemas"]["ApplyMaskTensorToImageInvocation"] | components["schemas"]["ApplyMaskToImageInvocation"] | components["schemas"]["BlankImageInvocation"] | components["schemas"]["BlendLatentsInvocation"] | components["schemas"]["BooleanCollectionInvocation"] | components["schemas"]["BooleanInvocation"] | components["schemas"]["BoundingBoxInvocation"] | components["schemas"]["CLIPSkipInvocation"] | components["schemas"]["CV2InfillInvocation"] | components["schemas"]["CalculateImageTilesEvenSplitInvocation"] | components["schemas"]["CalculateImageTilesInvocation"] | components["schemas"]["CalculateImageTilesMinimumOverlapInvocation"] | components["schemas"]["CannyEdgeDetectionInvocation"] | components["schemas"]["CanvasOutputInvocation"] | components["schemas"]["CanvasPasteBackInvocation"] | components["schemas"]["CanvasV2MaskAndCropInvocation"] | components["schemas"]["CenterPadCropInvocation"] | components["schemas"]["CogView4DenoiseInvocation"] | components["schemas"]["CogView4ImageToLatentsInvocation"] | components["schemas"]["CogView4LatentsToImageInvocation"] | 
components["schemas"]["CogView4ModelLoaderInvocation"] | components["schemas"]["CogView4TextEncoderInvocation"] | components["schemas"]["CollectInvocation"] | components["schemas"]["ColorCorrectInvocation"] | components["schemas"]["ColorInvocation"] | components["schemas"]["ColorMapInvocation"] | components["schemas"]["CompelInvocation"] | components["schemas"]["ConditioningCollectionInvocation"] | components["schemas"]["ConditioningInvocation"] | components["schemas"]["ContentShuffleInvocation"] | components["schemas"]["ControlNetInvocation"] | components["schemas"]["CoreMetadataInvocation"] | components["schemas"]["CreateDenoiseMaskInvocation"] | components["schemas"]["CreateGradientMaskInvocation"] | components["schemas"]["CropImageToBoundingBoxInvocation"] | components["schemas"]["CropLatentsCoreInvocation"] | components["schemas"]["CvInpaintInvocation"] | components["schemas"]["DWOpenposeDetectionInvocation"] | components["schemas"]["DecodeInvisibleWatermarkInvocation"] | components["schemas"]["DenoiseLatentsInvocation"] | components["schemas"]["DenoiseLatentsMetaInvocation"] | components["schemas"]["DepthAnythingDepthEstimationInvocation"] | components["schemas"]["DivideInvocation"] | components["schemas"]["DynamicPromptInvocation"] | components["schemas"]["ESRGANInvocation"] | components["schemas"]["ExpandMaskWithFadeInvocation"] | components["schemas"]["FLUXLoRACollectionLoader"] | components["schemas"]["FaceIdentifierInvocation"] | components["schemas"]["FaceMaskInvocation"] | components["schemas"]["FaceOffInvocation"] | components["schemas"]["FloatBatchInvocation"] | components["schemas"]["FloatCollectionInvocation"] | components["schemas"]["FloatGenerator"] | components["schemas"]["FloatInvocation"] | components["schemas"]["FloatLinearRangeInvocation"] | components["schemas"]["FloatMathInvocation"] | components["schemas"]["FloatToIntegerInvocation"] | components["schemas"]["Flux2DenoiseInvocation"] | 
components["schemas"]["Flux2KleinLoRACollectionLoader"] | components["schemas"]["Flux2KleinLoRALoaderInvocation"] | components["schemas"]["Flux2KleinModelLoaderInvocation"] | components["schemas"]["Flux2KleinTextEncoderInvocation"] | components["schemas"]["Flux2PiDDecodeInvocation"] | components["schemas"]["Flux2VaeDecodeInvocation"] | components["schemas"]["Flux2VaeEncodeInvocation"] | components["schemas"]["FluxControlLoRALoaderInvocation"] | components["schemas"]["FluxControlNetInvocation"] | components["schemas"]["FluxDenoiseInvocation"] | components["schemas"]["FluxDenoiseLatentsMetaInvocation"] | components["schemas"]["FluxFillInvocation"] | components["schemas"]["FluxIPAdapterInvocation"] | components["schemas"]["FluxKontextConcatenateImagesInvocation"] | components["schemas"]["FluxKontextInvocation"] | components["schemas"]["FluxLoRALoaderInvocation"] | components["schemas"]["FluxModelLoaderInvocation"] | components["schemas"]["FluxPiDDecodeInvocation"] | components["schemas"]["FluxReduxInvocation"] | components["schemas"]["FluxTextEncoderInvocation"] | components["schemas"]["FluxVaeDecodeInvocation"] | components["schemas"]["FluxVaeEncodeInvocation"] | components["schemas"]["FreeUInvocation"] | components["schemas"]["GeminiImageGenerationInvocation"] | components["schemas"]["Gemma2EncoderLoaderInvocation"] | components["schemas"]["GetMaskBoundingBoxInvocation"] | components["schemas"]["GroundingDinoInvocation"] | components["schemas"]["HEDEdgeDetectionInvocation"] | components["schemas"]["HeuristicResizeInvocation"] | components["schemas"]["IPAdapterInvocation"] | components["schemas"]["IdealSizeInvocation"] | components["schemas"]["IfInvocation"] | components["schemas"]["ImageBatchInvocation"] | components["schemas"]["ImageBlurInvocation"] | components["schemas"]["ImageChannelInvocation"] | components["schemas"]["ImageChannelMultiplyInvocation"] | components["schemas"]["ImageChannelOffsetInvocation"] | components["schemas"]["ImageCollectionInvocation"] | 
components["schemas"]["ImageConvertInvocation"] | components["schemas"]["ImageCropInvocation"] | components["schemas"]["ImageGenerator"] | components["schemas"]["ImageHueAdjustmentInvocation"] | components["schemas"]["ImageInverseLerpInvocation"] | components["schemas"]["ImageInvocation"] | components["schemas"]["ImageLerpInvocation"] | components["schemas"]["ImageMaskToTensorInvocation"] | components["schemas"]["ImageMultiplyInvocation"] | components["schemas"]["ImageNSFWBlurInvocation"] | components["schemas"]["ImageNoiseInvocation"] | components["schemas"]["ImagePanelLayoutInvocation"] | components["schemas"]["ImagePasteInvocation"] | components["schemas"]["ImageResizeInvocation"] | components["schemas"]["ImageScaleInvocation"] | components["schemas"]["ImageToLatentsInvocation"] | components["schemas"]["ImageWatermarkInvocation"] | components["schemas"]["InfillColorInvocation"] | components["schemas"]["InfillPatchMatchInvocation"] | components["schemas"]["InfillTileInvocation"] | components["schemas"]["IntegerBatchInvocation"] | components["schemas"]["IntegerCollectionInvocation"] | components["schemas"]["IntegerGenerator"] | components["schemas"]["IntegerInvocation"] | components["schemas"]["IntegerMathInvocation"] | components["schemas"]["InvertTensorMaskInvocation"] | components["schemas"]["InvokeAdjustImageHuePlusInvocation"] | components["schemas"]["InvokeEquivalentAchromaticLightnessInvocation"] | components["schemas"]["InvokeImageBlendInvocation"] | components["schemas"]["InvokeImageCompositorInvocation"] | components["schemas"]["InvokeImageDilateOrErodeInvocation"] | components["schemas"]["InvokeImageEnhanceInvocation"] | components["schemas"]["InvokeImageValueThresholdsInvocation"] | components["schemas"]["IterateInvocation"] | components["schemas"]["LaMaInfillInvocation"] | components["schemas"]["LatentsCollectionInvocation"] | components["schemas"]["LatentsInvocation"] | components["schemas"]["LatentsToImageInvocation"] | 
components["schemas"]["LineartAnimeEdgeDetectionInvocation"] | components["schemas"]["LineartEdgeDetectionInvocation"] | components["schemas"]["LlavaOnevisionVllmInvocation"] | components["schemas"]["LoRACollectionLoader"] | components["schemas"]["LoRALoaderInvocation"] | components["schemas"]["LoRASelectorInvocation"] | components["schemas"]["MLSDDetectionInvocation"] | components["schemas"]["MainModelLoaderInvocation"] | components["schemas"]["MaskCombineInvocation"] | components["schemas"]["MaskEdgeInvocation"] | components["schemas"]["MaskFromAlphaInvocation"] | components["schemas"]["MaskFromIDInvocation"] | components["schemas"]["MaskTensorToImageInvocation"] | components["schemas"]["MediaPipeFaceDetectionInvocation"] | components["schemas"]["MergeMetadataInvocation"] | components["schemas"]["MergeTilesToImageInvocation"] | components["schemas"]["MetadataFieldExtractorInvocation"] | components["schemas"]["MetadataFromImageInvocation"] | components["schemas"]["MetadataInvocation"] | components["schemas"]["MetadataItemInvocation"] | components["schemas"]["MetadataItemLinkedInvocation"] | components["schemas"]["MetadataToBoolCollectionInvocation"] | components["schemas"]["MetadataToBoolInvocation"] | components["schemas"]["MetadataToControlnetsInvocation"] | components["schemas"]["MetadataToFloatCollectionInvocation"] | components["schemas"]["MetadataToFloatInvocation"] | components["schemas"]["MetadataToIPAdaptersInvocation"] | components["schemas"]["MetadataToIntegerCollectionInvocation"] | components["schemas"]["MetadataToIntegerInvocation"] | components["schemas"]["MetadataToLorasCollectionInvocation"] | components["schemas"]["MetadataToLorasInvocation"] | components["schemas"]["MetadataToModelInvocation"] | components["schemas"]["MetadataToSDXLLorasInvocation"] | components["schemas"]["MetadataToSDXLModelInvocation"] | components["schemas"]["MetadataToSchedulerInvocation"] | components["schemas"]["MetadataToStringCollectionInvocation"] | 
components["schemas"]["MetadataToStringInvocation"] | components["schemas"]["MetadataToT2IAdaptersInvocation"] | components["schemas"]["MetadataToVAEInvocation"] | components["schemas"]["ModelIdentifierInvocation"] | components["schemas"]["MultiplyInvocation"] | components["schemas"]["NoiseInvocation"] | components["schemas"]["NormalMapInvocation"] | components["schemas"]["OklabUnsharpMaskInvocation"] | components["schemas"]["OklchImageHueAdjustmentInvocation"] | components["schemas"]["OpenAIImageGenerationInvocation"] | components["schemas"]["PBRMapsInvocation"] | components["schemas"]["PairTileImageInvocation"] | components["schemas"]["PasteImageIntoBoundingBoxInvocation"] | components["schemas"]["PiDDecoderLoaderInvocation"] | components["schemas"]["PiDUpscaleInvocation"] | components["schemas"]["PiDiNetEdgeDetectionInvocation"] | components["schemas"]["PromptTemplateInvocation"] | components["schemas"]["PromptsFromFileInvocation"] | components["schemas"]["QwenImageDenoiseInvocation"] | components["schemas"]["QwenImageImageToLatentsInvocation"] | components["schemas"]["QwenImageLatentsToImageInvocation"] | components["schemas"]["QwenImageLoRACollectionLoader"] | components["schemas"]["QwenImageLoRALoaderInvocation"] | components["schemas"]["QwenImageModelLoaderInvocation"] | components["schemas"]["QwenImagePiDDecodeInvocation"] | components["schemas"]["QwenImageTextEncoderInvocation"] | components["schemas"]["RandomFloatInvocation"] | components["schemas"]["RandomIntInvocation"] | components["schemas"]["RandomRangeInvocation"] | components["schemas"]["RangeInvocation"] | components["schemas"]["RangeOfSizeInvocation"] | components["schemas"]["RectangleMaskInvocation"] | components["schemas"]["ResizeLatentsInvocation"] | components["schemas"]["RoundInvocation"] | components["schemas"]["SD3DenoiseInvocation"] | components["schemas"]["SD3ImageToLatentsInvocation"] | components["schemas"]["SD3LatentsToImageInvocation"] | 
components["schemas"]["SD3PiDDecodeInvocation"] | components["schemas"]["SDXLCompelPromptInvocation"] | components["schemas"]["SDXLLoRACollectionLoader"] | components["schemas"]["SDXLLoRALoaderInvocation"] | components["schemas"]["SDXLModelLoaderInvocation"] | components["schemas"]["SDXLPiDDecodeInvocation"] | components["schemas"]["SDXLRefinerCompelPromptInvocation"] | components["schemas"]["SDXLRefinerModelLoaderInvocation"] | components["schemas"]["SaveImageInvocation"] | components["schemas"]["SaveImageToFileInvocation"] | components["schemas"]["ScaleLatentsInvocation"] | components["schemas"]["SchedulerInvocation"] | components["schemas"]["Sd3ModelLoaderInvocation"] | components["schemas"]["Sd3TextEncoderInvocation"] | components["schemas"]["SeamlessModeInvocation"] | components["schemas"]["SeedreamImageGenerationInvocation"] | components["schemas"]["SegmentAnythingInvocation"] | components["schemas"]["ShowImageInvocation"] | components["schemas"]["SpandrelImageToImageAutoscaleInvocation"] | components["schemas"]["SpandrelImageToImageInvocation"] | components["schemas"]["StringBatchInvocation"] | components["schemas"]["StringCollectionInvocation"] | components["schemas"]["StringGenerator"] | components["schemas"]["StringInvocation"] | components["schemas"]["StringJoinInvocation"] | components["schemas"]["StringJoinThreeInvocation"] | components["schemas"]["StringReplaceInvocation"] | components["schemas"]["StringSplitInvocation"] | components["schemas"]["StringSplitNegInvocation"] | components["schemas"]["SubtractInvocation"] | components["schemas"]["T2IAdapterInvocation"] | components["schemas"]["TextLLMInvocation"] | components["schemas"]["TileToPropertiesInvocation"] | components["schemas"]["TiledMultiDiffusionDenoiseLatents"] | components["schemas"]["UnsharpMaskInvocation"] | components["schemas"]["VAELoaderInvocation"] | components["schemas"]["ZImageControlInvocation"] | components["schemas"]["ZImageDenoiseInvocation"] | 
components["schemas"]["ZImageDenoiseMetaInvocation"] | components["schemas"]["ZImageImageToLatentsInvocation"] | components["schemas"]["ZImageLatentsToImageInvocation"] | components["schemas"]["ZImageLoRACollectionLoader"] | components["schemas"]["ZImageLoRALoaderInvocation"] | components["schemas"]["ZImageModelLoaderInvocation"] | components["schemas"]["ZImagePiDDecodeInvocation"] | components["schemas"]["ZImageSeedVarianceEnhancerInvocation"] | components["schemas"]["ZImageTextEncoderInvocation"]; }; /** * Edges @@ -12371,7 +12683,7 @@ export type components = { * @description The results of node executions */ results: { - [key: string]: components["schemas"]["AnimaConditioningOutput"] | components["schemas"]["AnimaLoRALoaderOutput"] | components["schemas"]["AnimaModelLoaderOutput"] | components["schemas"]["BooleanCollectionOutput"] | components["schemas"]["BooleanOutput"] | components["schemas"]["BoundingBoxCollectionOutput"] | components["schemas"]["BoundingBoxOutput"] | components["schemas"]["CLIPOutput"] | components["schemas"]["CLIPSkipInvocationOutput"] | components["schemas"]["CalculateImageTilesOutput"] | components["schemas"]["CogView4ConditioningOutput"] | components["schemas"]["CogView4ModelLoaderOutput"] | components["schemas"]["CollectInvocationOutput"] | components["schemas"]["ColorCollectionOutput"] | components["schemas"]["ColorOutput"] | components["schemas"]["ConditioningCollectionOutput"] | components["schemas"]["ConditioningOutput"] | components["schemas"]["ControlOutput"] | components["schemas"]["DenoiseMaskOutput"] | components["schemas"]["FaceMaskOutput"] | components["schemas"]["FaceOffOutput"] | components["schemas"]["FloatCollectionOutput"] | components["schemas"]["FloatGeneratorOutput"] | components["schemas"]["FloatOutput"] | components["schemas"]["Flux2KleinLoRALoaderOutput"] | components["schemas"]["Flux2KleinModelLoaderOutput"] | components["schemas"]["FluxConditioningCollectionOutput"] | 
components["schemas"]["FluxConditioningOutput"] | components["schemas"]["FluxControlLoRALoaderOutput"] | components["schemas"]["FluxControlNetOutput"] | components["schemas"]["FluxFillOutput"] | components["schemas"]["FluxKontextOutput"] | components["schemas"]["FluxLoRALoaderOutput"] | components["schemas"]["FluxModelLoaderOutput"] | components["schemas"]["FluxReduxOutput"] | components["schemas"]["GradientMaskOutput"] | components["schemas"]["IPAdapterOutput"] | components["schemas"]["IdealSizeOutput"] | components["schemas"]["IfInvocationOutput"] | components["schemas"]["ImageCollectionOutput"] | components["schemas"]["ImageGeneratorOutput"] | components["schemas"]["ImageOutput"] | components["schemas"]["ImagePanelCoordinateOutput"] | components["schemas"]["IntegerCollectionOutput"] | components["schemas"]["IntegerGeneratorOutput"] | components["schemas"]["IntegerOutput"] | components["schemas"]["IterateInvocationOutput"] | components["schemas"]["LatentsCollectionOutput"] | components["schemas"]["LatentsMetaOutput"] | components["schemas"]["LatentsOutput"] | components["schemas"]["LoRALoaderOutput"] | components["schemas"]["LoRASelectorOutput"] | components["schemas"]["MDControlListOutput"] | components["schemas"]["MDIPAdapterListOutput"] | components["schemas"]["MDT2IAdapterListOutput"] | components["schemas"]["MaskOutput"] | components["schemas"]["MetadataItemOutput"] | components["schemas"]["MetadataOutput"] | components["schemas"]["MetadataToLorasCollectionOutput"] | components["schemas"]["MetadataToModelOutput"] | components["schemas"]["MetadataToSDXLModelOutput"] | components["schemas"]["ModelIdentifierOutput"] | components["schemas"]["ModelLoaderOutput"] | components["schemas"]["NoiseOutput"] | components["schemas"]["PBRMapsOutput"] | components["schemas"]["PairTileImageOutput"] | components["schemas"]["PromptTemplateOutput"] | components["schemas"]["QwenImageConditioningOutput"] | components["schemas"]["QwenImageLoRALoaderOutput"] | 
components["schemas"]["QwenImageModelLoaderOutput"] | components["schemas"]["SD3ConditioningOutput"] | components["schemas"]["SDXLLoRALoaderOutput"] | components["schemas"]["SDXLModelLoaderOutput"] | components["schemas"]["SDXLRefinerModelLoaderOutput"] | components["schemas"]["SchedulerOutput"] | components["schemas"]["Sd3ModelLoaderOutput"] | components["schemas"]["SeamlessModeOutput"] | components["schemas"]["String2Output"] | components["schemas"]["StringCollectionOutput"] | components["schemas"]["StringGeneratorOutput"] | components["schemas"]["StringOutput"] | components["schemas"]["StringPosNegOutput"] | components["schemas"]["T2IAdapterOutput"] | components["schemas"]["TileToPropertiesOutput"] | components["schemas"]["UNetOutput"] | components["schemas"]["VAEOutput"] | components["schemas"]["ZImageConditioningOutput"] | components["schemas"]["ZImageControlOutput"] | components["schemas"]["ZImageLoRALoaderOutput"] | components["schemas"]["ZImageModelLoaderOutput"]; + [key: string]: components["schemas"]["AnimaConditioningOutput"] | components["schemas"]["AnimaLoRALoaderOutput"] | components["schemas"]["AnimaModelLoaderOutput"] | components["schemas"]["BooleanCollectionOutput"] | components["schemas"]["BooleanOutput"] | components["schemas"]["BoundingBoxCollectionOutput"] | components["schemas"]["BoundingBoxOutput"] | components["schemas"]["CLIPOutput"] | components["schemas"]["CLIPSkipInvocationOutput"] | components["schemas"]["CalculateImageTilesOutput"] | components["schemas"]["CogView4ConditioningOutput"] | components["schemas"]["CogView4ModelLoaderOutput"] | components["schemas"]["CollectInvocationOutput"] | components["schemas"]["ColorCollectionOutput"] | components["schemas"]["ColorOutput"] | components["schemas"]["ConditioningCollectionOutput"] | components["schemas"]["ConditioningOutput"] | components["schemas"]["ControlOutput"] | components["schemas"]["DenoiseMaskOutput"] | components["schemas"]["FaceMaskOutput"] | 
components["schemas"]["FaceOffOutput"] | components["schemas"]["FloatCollectionOutput"] | components["schemas"]["FloatGeneratorOutput"] | components["schemas"]["FloatOutput"] | components["schemas"]["Flux2KleinLoRALoaderOutput"] | components["schemas"]["Flux2KleinModelLoaderOutput"] | components["schemas"]["FluxConditioningCollectionOutput"] | components["schemas"]["FluxConditioningOutput"] | components["schemas"]["FluxControlLoRALoaderOutput"] | components["schemas"]["FluxControlNetOutput"] | components["schemas"]["FluxFillOutput"] | components["schemas"]["FluxKontextOutput"] | components["schemas"]["FluxLoRALoaderOutput"] | components["schemas"]["FluxModelLoaderOutput"] | components["schemas"]["FluxReduxOutput"] | components["schemas"]["Gemma2EncoderOutput"] | components["schemas"]["GradientMaskOutput"] | components["schemas"]["IPAdapterOutput"] | components["schemas"]["IdealSizeOutput"] | components["schemas"]["IfInvocationOutput"] | components["schemas"]["ImageCollectionOutput"] | components["schemas"]["ImageGeneratorOutput"] | components["schemas"]["ImageOutput"] | components["schemas"]["ImagePanelCoordinateOutput"] | components["schemas"]["IntegerCollectionOutput"] | components["schemas"]["IntegerGeneratorOutput"] | components["schemas"]["IntegerOutput"] | components["schemas"]["IterateInvocationOutput"] | components["schemas"]["LatentsCollectionOutput"] | components["schemas"]["LatentsMetaOutput"] | components["schemas"]["LatentsOutput"] | components["schemas"]["LoRALoaderOutput"] | components["schemas"]["LoRASelectorOutput"] | components["schemas"]["MDControlListOutput"] | components["schemas"]["MDIPAdapterListOutput"] | components["schemas"]["MDT2IAdapterListOutput"] | components["schemas"]["MaskOutput"] | components["schemas"]["MetadataItemOutput"] | components["schemas"]["MetadataOutput"] | components["schemas"]["MetadataToLorasCollectionOutput"] | components["schemas"]["MetadataToModelOutput"] | components["schemas"]["MetadataToSDXLModelOutput"] | 
components["schemas"]["ModelIdentifierOutput"] | components["schemas"]["ModelLoaderOutput"] | components["schemas"]["NoiseOutput"] | components["schemas"]["PBRMapsOutput"] | components["schemas"]["PairTileImageOutput"] | components["schemas"]["PiDDecoderOutput"] | components["schemas"]["PromptTemplateOutput"] | components["schemas"]["QwenImageConditioningOutput"] | components["schemas"]["QwenImageLoRALoaderOutput"] | components["schemas"]["QwenImageModelLoaderOutput"] | components["schemas"]["SD3ConditioningOutput"] | components["schemas"]["SDXLLoRALoaderOutput"] | components["schemas"]["SDXLModelLoaderOutput"] | components["schemas"]["SDXLRefinerModelLoaderOutput"] | components["schemas"]["SchedulerOutput"] | components["schemas"]["Sd3ModelLoaderOutput"] | components["schemas"]["SeamlessModeOutput"] | components["schemas"]["String2Output"] | components["schemas"]["StringCollectionOutput"] | components["schemas"]["StringGeneratorOutput"] | components["schemas"]["StringOutput"] | components["schemas"]["StringPosNegOutput"] | components["schemas"]["T2IAdapterOutput"] | components["schemas"]["TileToPropertiesOutput"] | components["schemas"]["UNetOutput"] | components["schemas"]["VAEOutput"] | components["schemas"]["ZImageConditioningOutput"] | components["schemas"]["ZImageControlOutput"] | components["schemas"]["ZImageLoRALoaderOutput"] | components["schemas"]["ZImageModelLoaderOutput"]; }; /** * Errors @@ -15781,7 +16093,7 @@ export type components = { * Invocation * @description The ID of the invocation */ - invocation: components["schemas"]["AddInvocation"] | components["schemas"]["AlibabaCloudImageGenerationInvocation"] | components["schemas"]["AlphaMaskToTensorInvocation"] | components["schemas"]["AnimaDenoiseInvocation"] | components["schemas"]["AnimaImageToLatentsInvocation"] | components["schemas"]["AnimaLatentsToImageInvocation"] | components["schemas"]["AnimaLoRACollectionLoader"] | components["schemas"]["AnimaLoRALoaderInvocation"] | 
components["schemas"]["AnimaModelLoaderInvocation"] | components["schemas"]["AnimaTextEncoderInvocation"] | components["schemas"]["ApplyMaskTensorToImageInvocation"] | components["schemas"]["ApplyMaskToImageInvocation"] | components["schemas"]["BlankImageInvocation"] | components["schemas"]["BlendLatentsInvocation"] | components["schemas"]["BooleanCollectionInvocation"] | components["schemas"]["BooleanInvocation"] | components["schemas"]["BoundingBoxInvocation"] | components["schemas"]["CLIPSkipInvocation"] | components["schemas"]["CV2InfillInvocation"] | components["schemas"]["CalculateImageTilesEvenSplitInvocation"] | components["schemas"]["CalculateImageTilesInvocation"] | components["schemas"]["CalculateImageTilesMinimumOverlapInvocation"] | components["schemas"]["CannyEdgeDetectionInvocation"] | components["schemas"]["CanvasOutputInvocation"] | components["schemas"]["CanvasPasteBackInvocation"] | components["schemas"]["CanvasV2MaskAndCropInvocation"] | components["schemas"]["CenterPadCropInvocation"] | components["schemas"]["CogView4DenoiseInvocation"] | components["schemas"]["CogView4ImageToLatentsInvocation"] | components["schemas"]["CogView4LatentsToImageInvocation"] | components["schemas"]["CogView4ModelLoaderInvocation"] | components["schemas"]["CogView4TextEncoderInvocation"] | components["schemas"]["CollectInvocation"] | components["schemas"]["ColorCorrectInvocation"] | components["schemas"]["ColorInvocation"] | components["schemas"]["ColorMapInvocation"] | components["schemas"]["CompelInvocation"] | components["schemas"]["ConditioningCollectionInvocation"] | components["schemas"]["ConditioningInvocation"] | components["schemas"]["ContentShuffleInvocation"] | components["schemas"]["ControlNetInvocation"] | components["schemas"]["CoreMetadataInvocation"] | components["schemas"]["CreateDenoiseMaskInvocation"] | components["schemas"]["CreateGradientMaskInvocation"] | components["schemas"]["CropImageToBoundingBoxInvocation"] | 
components["schemas"]["CropLatentsCoreInvocation"] | components["schemas"]["CvInpaintInvocation"] | components["schemas"]["DWOpenposeDetectionInvocation"] | components["schemas"]["DecodeInvisibleWatermarkInvocation"] | components["schemas"]["DenoiseLatentsInvocation"] | components["schemas"]["DenoiseLatentsMetaInvocation"] | components["schemas"]["DepthAnythingDepthEstimationInvocation"] | components["schemas"]["DivideInvocation"] | components["schemas"]["DynamicPromptInvocation"] | components["schemas"]["ESRGANInvocation"] | components["schemas"]["ExpandMaskWithFadeInvocation"] | components["schemas"]["FLUXLoRACollectionLoader"] | components["schemas"]["FaceIdentifierInvocation"] | components["schemas"]["FaceMaskInvocation"] | components["schemas"]["FaceOffInvocation"] | components["schemas"]["FloatBatchInvocation"] | components["schemas"]["FloatCollectionInvocation"] | components["schemas"]["FloatGenerator"] | components["schemas"]["FloatInvocation"] | components["schemas"]["FloatLinearRangeInvocation"] | components["schemas"]["FloatMathInvocation"] | components["schemas"]["FloatToIntegerInvocation"] | components["schemas"]["Flux2DenoiseInvocation"] | components["schemas"]["Flux2KleinLoRACollectionLoader"] | components["schemas"]["Flux2KleinLoRALoaderInvocation"] | components["schemas"]["Flux2KleinModelLoaderInvocation"] | components["schemas"]["Flux2KleinTextEncoderInvocation"] | components["schemas"]["Flux2VaeDecodeInvocation"] | components["schemas"]["Flux2VaeEncodeInvocation"] | components["schemas"]["FluxControlLoRALoaderInvocation"] | components["schemas"]["FluxControlNetInvocation"] | components["schemas"]["FluxDenoiseInvocation"] | components["schemas"]["FluxDenoiseLatentsMetaInvocation"] | components["schemas"]["FluxFillInvocation"] | components["schemas"]["FluxIPAdapterInvocation"] | components["schemas"]["FluxKontextConcatenateImagesInvocation"] | components["schemas"]["FluxKontextInvocation"] | components["schemas"]["FluxLoRALoaderInvocation"] | 
components["schemas"]["FluxModelLoaderInvocation"] | components["schemas"]["FluxReduxInvocation"] | components["schemas"]["FluxTextEncoderInvocation"] | components["schemas"]["FluxVaeDecodeInvocation"] | components["schemas"]["FluxVaeEncodeInvocation"] | components["schemas"]["FreeUInvocation"] | components["schemas"]["GeminiImageGenerationInvocation"] | components["schemas"]["GetMaskBoundingBoxInvocation"] | components["schemas"]["GroundingDinoInvocation"] | components["schemas"]["HEDEdgeDetectionInvocation"] | components["schemas"]["HeuristicResizeInvocation"] | components["schemas"]["IPAdapterInvocation"] | components["schemas"]["IdealSizeInvocation"] | components["schemas"]["IfInvocation"] | components["schemas"]["ImageBatchInvocation"] | components["schemas"]["ImageBlurInvocation"] | components["schemas"]["ImageChannelInvocation"] | components["schemas"]["ImageChannelMultiplyInvocation"] | components["schemas"]["ImageChannelOffsetInvocation"] | components["schemas"]["ImageCollectionInvocation"] | components["schemas"]["ImageConvertInvocation"] | components["schemas"]["ImageCropInvocation"] | components["schemas"]["ImageGenerator"] | components["schemas"]["ImageHueAdjustmentInvocation"] | components["schemas"]["ImageInverseLerpInvocation"] | components["schemas"]["ImageInvocation"] | components["schemas"]["ImageLerpInvocation"] | components["schemas"]["ImageMaskToTensorInvocation"] | components["schemas"]["ImageMultiplyInvocation"] | components["schemas"]["ImageNSFWBlurInvocation"] | components["schemas"]["ImageNoiseInvocation"] | components["schemas"]["ImagePanelLayoutInvocation"] | components["schemas"]["ImagePasteInvocation"] | components["schemas"]["ImageResizeInvocation"] | components["schemas"]["ImageScaleInvocation"] | components["schemas"]["ImageToLatentsInvocation"] | components["schemas"]["ImageWatermarkInvocation"] | components["schemas"]["InfillColorInvocation"] | components["schemas"]["InfillPatchMatchInvocation"] | 
components["schemas"]["InfillTileInvocation"] | components["schemas"]["IntegerBatchInvocation"] | components["schemas"]["IntegerCollectionInvocation"] | components["schemas"]["IntegerGenerator"] | components["schemas"]["IntegerInvocation"] | components["schemas"]["IntegerMathInvocation"] | components["schemas"]["InvertTensorMaskInvocation"] | components["schemas"]["InvokeAdjustImageHuePlusInvocation"] | components["schemas"]["InvokeEquivalentAchromaticLightnessInvocation"] | components["schemas"]["InvokeImageBlendInvocation"] | components["schemas"]["InvokeImageCompositorInvocation"] | components["schemas"]["InvokeImageDilateOrErodeInvocation"] | components["schemas"]["InvokeImageEnhanceInvocation"] | components["schemas"]["InvokeImageValueThresholdsInvocation"] | components["schemas"]["IterateInvocation"] | components["schemas"]["LaMaInfillInvocation"] | components["schemas"]["LatentsCollectionInvocation"] | components["schemas"]["LatentsInvocation"] | components["schemas"]["LatentsToImageInvocation"] | components["schemas"]["LineartAnimeEdgeDetectionInvocation"] | components["schemas"]["LineartEdgeDetectionInvocation"] | components["schemas"]["LlavaOnevisionVllmInvocation"] | components["schemas"]["LoRACollectionLoader"] | components["schemas"]["LoRALoaderInvocation"] | components["schemas"]["LoRASelectorInvocation"] | components["schemas"]["MLSDDetectionInvocation"] | components["schemas"]["MainModelLoaderInvocation"] | components["schemas"]["MaskCombineInvocation"] | components["schemas"]["MaskEdgeInvocation"] | components["schemas"]["MaskFromAlphaInvocation"] | components["schemas"]["MaskFromIDInvocation"] | components["schemas"]["MaskTensorToImageInvocation"] | components["schemas"]["MediaPipeFaceDetectionInvocation"] | components["schemas"]["MergeMetadataInvocation"] | components["schemas"]["MergeTilesToImageInvocation"] | components["schemas"]["MetadataFieldExtractorInvocation"] | components["schemas"]["MetadataFromImageInvocation"] | 
components["schemas"]["MetadataInvocation"] | components["schemas"]["MetadataItemInvocation"] | components["schemas"]["MetadataItemLinkedInvocation"] | components["schemas"]["MetadataToBoolCollectionInvocation"] | components["schemas"]["MetadataToBoolInvocation"] | components["schemas"]["MetadataToControlnetsInvocation"] | components["schemas"]["MetadataToFloatCollectionInvocation"] | components["schemas"]["MetadataToFloatInvocation"] | components["schemas"]["MetadataToIPAdaptersInvocation"] | components["schemas"]["MetadataToIntegerCollectionInvocation"] | components["schemas"]["MetadataToIntegerInvocation"] | components["schemas"]["MetadataToLorasCollectionInvocation"] | components["schemas"]["MetadataToLorasInvocation"] | components["schemas"]["MetadataToModelInvocation"] | components["schemas"]["MetadataToSDXLLorasInvocation"] | components["schemas"]["MetadataToSDXLModelInvocation"] | components["schemas"]["MetadataToSchedulerInvocation"] | components["schemas"]["MetadataToStringCollectionInvocation"] | components["schemas"]["MetadataToStringInvocation"] | components["schemas"]["MetadataToT2IAdaptersInvocation"] | components["schemas"]["MetadataToVAEInvocation"] | components["schemas"]["ModelIdentifierInvocation"] | components["schemas"]["MultiplyInvocation"] | components["schemas"]["NoiseInvocation"] | components["schemas"]["NormalMapInvocation"] | components["schemas"]["OklabUnsharpMaskInvocation"] | components["schemas"]["OklchImageHueAdjustmentInvocation"] | components["schemas"]["OpenAIImageGenerationInvocation"] | components["schemas"]["PBRMapsInvocation"] | components["schemas"]["PairTileImageInvocation"] | components["schemas"]["PasteImageIntoBoundingBoxInvocation"] | components["schemas"]["PiDiNetEdgeDetectionInvocation"] | components["schemas"]["PromptTemplateInvocation"] | components["schemas"]["PromptsFromFileInvocation"] | components["schemas"]["QwenImageDenoiseInvocation"] | components["schemas"]["QwenImageImageToLatentsInvocation"] | 
components["schemas"]["QwenImageLatentsToImageInvocation"] | components["schemas"]["QwenImageLoRACollectionLoader"] | components["schemas"]["QwenImageLoRALoaderInvocation"] | components["schemas"]["QwenImageModelLoaderInvocation"] | components["schemas"]["QwenImageTextEncoderInvocation"] | components["schemas"]["RandomFloatInvocation"] | components["schemas"]["RandomIntInvocation"] | components["schemas"]["RandomRangeInvocation"] | components["schemas"]["RangeInvocation"] | components["schemas"]["RangeOfSizeInvocation"] | components["schemas"]["RectangleMaskInvocation"] | components["schemas"]["ResizeLatentsInvocation"] | components["schemas"]["RoundInvocation"] | components["schemas"]["SD3DenoiseInvocation"] | components["schemas"]["SD3ImageToLatentsInvocation"] | components["schemas"]["SD3LatentsToImageInvocation"] | components["schemas"]["SDXLCompelPromptInvocation"] | components["schemas"]["SDXLLoRACollectionLoader"] | components["schemas"]["SDXLLoRALoaderInvocation"] | components["schemas"]["SDXLModelLoaderInvocation"] | components["schemas"]["SDXLRefinerCompelPromptInvocation"] | components["schemas"]["SDXLRefinerModelLoaderInvocation"] | components["schemas"]["SaveImageInvocation"] | components["schemas"]["SaveImageToFileInvocation"] | components["schemas"]["ScaleLatentsInvocation"] | components["schemas"]["SchedulerInvocation"] | components["schemas"]["Sd3ModelLoaderInvocation"] | components["schemas"]["Sd3TextEncoderInvocation"] | components["schemas"]["SeamlessModeInvocation"] | components["schemas"]["SeedreamImageGenerationInvocation"] | components["schemas"]["SegmentAnythingInvocation"] | components["schemas"]["ShowImageInvocation"] | components["schemas"]["SpandrelImageToImageAutoscaleInvocation"] | components["schemas"]["SpandrelImageToImageInvocation"] | components["schemas"]["StringBatchInvocation"] | components["schemas"]["StringCollectionInvocation"] | components["schemas"]["StringGenerator"] | components["schemas"]["StringInvocation"] | 
components["schemas"]["StringJoinInvocation"] | components["schemas"]["StringJoinThreeInvocation"] | components["schemas"]["StringReplaceInvocation"] | components["schemas"]["StringSplitInvocation"] | components["schemas"]["StringSplitNegInvocation"] | components["schemas"]["SubtractInvocation"] | components["schemas"]["T2IAdapterInvocation"] | components["schemas"]["TextLLMInvocation"] | components["schemas"]["TileToPropertiesInvocation"] | components["schemas"]["TiledMultiDiffusionDenoiseLatents"] | components["schemas"]["UnsharpMaskInvocation"] | components["schemas"]["VAELoaderInvocation"] | components["schemas"]["ZImageControlInvocation"] | components["schemas"]["ZImageDenoiseInvocation"] | components["schemas"]["ZImageDenoiseMetaInvocation"] | components["schemas"]["ZImageImageToLatentsInvocation"] | components["schemas"]["ZImageLatentsToImageInvocation"] | components["schemas"]["ZImageLoRACollectionLoader"] | components["schemas"]["ZImageLoRALoaderInvocation"] | components["schemas"]["ZImageModelLoaderInvocation"] | components["schemas"]["ZImageSeedVarianceEnhancerInvocation"] | components["schemas"]["ZImageTextEncoderInvocation"]; + invocation: components["schemas"]["AddInvocation"] | components["schemas"]["AlibabaCloudImageGenerationInvocation"] | components["schemas"]["AlphaMaskToTensorInvocation"] | components["schemas"]["AnimaDenoiseInvocation"] | components["schemas"]["AnimaImageToLatentsInvocation"] | components["schemas"]["AnimaLatentsToImageInvocation"] | components["schemas"]["AnimaLoRACollectionLoader"] | components["schemas"]["AnimaLoRALoaderInvocation"] | components["schemas"]["AnimaModelLoaderInvocation"] | components["schemas"]["AnimaTextEncoderInvocation"] | components["schemas"]["ApplyMaskTensorToImageInvocation"] | components["schemas"]["ApplyMaskToImageInvocation"] | components["schemas"]["BlankImageInvocation"] | components["schemas"]["BlendLatentsInvocation"] | components["schemas"]["BooleanCollectionInvocation"] | 
components["schemas"]["BooleanInvocation"] | components["schemas"]["BoundingBoxInvocation"] | components["schemas"]["CLIPSkipInvocation"] | components["schemas"]["CV2InfillInvocation"] | components["schemas"]["CalculateImageTilesEvenSplitInvocation"] | components["schemas"]["CalculateImageTilesInvocation"] | components["schemas"]["CalculateImageTilesMinimumOverlapInvocation"] | components["schemas"]["CannyEdgeDetectionInvocation"] | components["schemas"]["CanvasOutputInvocation"] | components["schemas"]["CanvasPasteBackInvocation"] | components["schemas"]["CanvasV2MaskAndCropInvocation"] | components["schemas"]["CenterPadCropInvocation"] | components["schemas"]["CogView4DenoiseInvocation"] | components["schemas"]["CogView4ImageToLatentsInvocation"] | components["schemas"]["CogView4LatentsToImageInvocation"] | components["schemas"]["CogView4ModelLoaderInvocation"] | components["schemas"]["CogView4TextEncoderInvocation"] | components["schemas"]["CollectInvocation"] | components["schemas"]["ColorCorrectInvocation"] | components["schemas"]["ColorInvocation"] | components["schemas"]["ColorMapInvocation"] | components["schemas"]["CompelInvocation"] | components["schemas"]["ConditioningCollectionInvocation"] | components["schemas"]["ConditioningInvocation"] | components["schemas"]["ContentShuffleInvocation"] | components["schemas"]["ControlNetInvocation"] | components["schemas"]["CoreMetadataInvocation"] | components["schemas"]["CreateDenoiseMaskInvocation"] | components["schemas"]["CreateGradientMaskInvocation"] | components["schemas"]["CropImageToBoundingBoxInvocation"] | components["schemas"]["CropLatentsCoreInvocation"] | components["schemas"]["CvInpaintInvocation"] | components["schemas"]["DWOpenposeDetectionInvocation"] | components["schemas"]["DecodeInvisibleWatermarkInvocation"] | components["schemas"]["DenoiseLatentsInvocation"] | components["schemas"]["DenoiseLatentsMetaInvocation"] | components["schemas"]["DepthAnythingDepthEstimationInvocation"] | 
components["schemas"]["DivideInvocation"] | components["schemas"]["DynamicPromptInvocation"] | components["schemas"]["ESRGANInvocation"] | components["schemas"]["ExpandMaskWithFadeInvocation"] | components["schemas"]["FLUXLoRACollectionLoader"] | components["schemas"]["FaceIdentifierInvocation"] | components["schemas"]["FaceMaskInvocation"] | components["schemas"]["FaceOffInvocation"] | components["schemas"]["FloatBatchInvocation"] | components["schemas"]["FloatCollectionInvocation"] | components["schemas"]["FloatGenerator"] | components["schemas"]["FloatInvocation"] | components["schemas"]["FloatLinearRangeInvocation"] | components["schemas"]["FloatMathInvocation"] | components["schemas"]["FloatToIntegerInvocation"] | components["schemas"]["Flux2DenoiseInvocation"] | components["schemas"]["Flux2KleinLoRACollectionLoader"] | components["schemas"]["Flux2KleinLoRALoaderInvocation"] | components["schemas"]["Flux2KleinModelLoaderInvocation"] | components["schemas"]["Flux2KleinTextEncoderInvocation"] | components["schemas"]["Flux2PiDDecodeInvocation"] | components["schemas"]["Flux2VaeDecodeInvocation"] | components["schemas"]["Flux2VaeEncodeInvocation"] | components["schemas"]["FluxControlLoRALoaderInvocation"] | components["schemas"]["FluxControlNetInvocation"] | components["schemas"]["FluxDenoiseInvocation"] | components["schemas"]["FluxDenoiseLatentsMetaInvocation"] | components["schemas"]["FluxFillInvocation"] | components["schemas"]["FluxIPAdapterInvocation"] | components["schemas"]["FluxKontextConcatenateImagesInvocation"] | components["schemas"]["FluxKontextInvocation"] | components["schemas"]["FluxLoRALoaderInvocation"] | components["schemas"]["FluxModelLoaderInvocation"] | components["schemas"]["FluxPiDDecodeInvocation"] | components["schemas"]["FluxReduxInvocation"] | components["schemas"]["FluxTextEncoderInvocation"] | components["schemas"]["FluxVaeDecodeInvocation"] | components["schemas"]["FluxVaeEncodeInvocation"] | components["schemas"]["FreeUInvocation"] 
| components["schemas"]["GeminiImageGenerationInvocation"] | components["schemas"]["Gemma2EncoderLoaderInvocation"] | components["schemas"]["GetMaskBoundingBoxInvocation"] | components["schemas"]["GroundingDinoInvocation"] | components["schemas"]["HEDEdgeDetectionInvocation"] | components["schemas"]["HeuristicResizeInvocation"] | components["schemas"]["IPAdapterInvocation"] | components["schemas"]["IdealSizeInvocation"] | components["schemas"]["IfInvocation"] | components["schemas"]["ImageBatchInvocation"] | components["schemas"]["ImageBlurInvocation"] | components["schemas"]["ImageChannelInvocation"] | components["schemas"]["ImageChannelMultiplyInvocation"] | components["schemas"]["ImageChannelOffsetInvocation"] | components["schemas"]["ImageCollectionInvocation"] | components["schemas"]["ImageConvertInvocation"] | components["schemas"]["ImageCropInvocation"] | components["schemas"]["ImageGenerator"] | components["schemas"]["ImageHueAdjustmentInvocation"] | components["schemas"]["ImageInverseLerpInvocation"] | components["schemas"]["ImageInvocation"] | components["schemas"]["ImageLerpInvocation"] | components["schemas"]["ImageMaskToTensorInvocation"] | components["schemas"]["ImageMultiplyInvocation"] | components["schemas"]["ImageNSFWBlurInvocation"] | components["schemas"]["ImageNoiseInvocation"] | components["schemas"]["ImagePanelLayoutInvocation"] | components["schemas"]["ImagePasteInvocation"] | components["schemas"]["ImageResizeInvocation"] | components["schemas"]["ImageScaleInvocation"] | components["schemas"]["ImageToLatentsInvocation"] | components["schemas"]["ImageWatermarkInvocation"] | components["schemas"]["InfillColorInvocation"] | components["schemas"]["InfillPatchMatchInvocation"] | components["schemas"]["InfillTileInvocation"] | components["schemas"]["IntegerBatchInvocation"] | components["schemas"]["IntegerCollectionInvocation"] | components["schemas"]["IntegerGenerator"] | components["schemas"]["IntegerInvocation"] | 
components["schemas"]["IntegerMathInvocation"] | components["schemas"]["InvertTensorMaskInvocation"] | components["schemas"]["InvokeAdjustImageHuePlusInvocation"] | components["schemas"]["InvokeEquivalentAchromaticLightnessInvocation"] | components["schemas"]["InvokeImageBlendInvocation"] | components["schemas"]["InvokeImageCompositorInvocation"] | components["schemas"]["InvokeImageDilateOrErodeInvocation"] | components["schemas"]["InvokeImageEnhanceInvocation"] | components["schemas"]["InvokeImageValueThresholdsInvocation"] | components["schemas"]["IterateInvocation"] | components["schemas"]["LaMaInfillInvocation"] | components["schemas"]["LatentsCollectionInvocation"] | components["schemas"]["LatentsInvocation"] | components["schemas"]["LatentsToImageInvocation"] | components["schemas"]["LineartAnimeEdgeDetectionInvocation"] | components["schemas"]["LineartEdgeDetectionInvocation"] | components["schemas"]["LlavaOnevisionVllmInvocation"] | components["schemas"]["LoRACollectionLoader"] | components["schemas"]["LoRALoaderInvocation"] | components["schemas"]["LoRASelectorInvocation"] | components["schemas"]["MLSDDetectionInvocation"] | components["schemas"]["MainModelLoaderInvocation"] | components["schemas"]["MaskCombineInvocation"] | components["schemas"]["MaskEdgeInvocation"] | components["schemas"]["MaskFromAlphaInvocation"] | components["schemas"]["MaskFromIDInvocation"] | components["schemas"]["MaskTensorToImageInvocation"] | components["schemas"]["MediaPipeFaceDetectionInvocation"] | components["schemas"]["MergeMetadataInvocation"] | components["schemas"]["MergeTilesToImageInvocation"] | components["schemas"]["MetadataFieldExtractorInvocation"] | components["schemas"]["MetadataFromImageInvocation"] | components["schemas"]["MetadataInvocation"] | components["schemas"]["MetadataItemInvocation"] | components["schemas"]["MetadataItemLinkedInvocation"] | components["schemas"]["MetadataToBoolCollectionInvocation"] | components["schemas"]["MetadataToBoolInvocation"] 
| components["schemas"]["MetadataToControlnetsInvocation"] | components["schemas"]["MetadataToFloatCollectionInvocation"] | components["schemas"]["MetadataToFloatInvocation"] | components["schemas"]["MetadataToIPAdaptersInvocation"] | components["schemas"]["MetadataToIntegerCollectionInvocation"] | components["schemas"]["MetadataToIntegerInvocation"] | components["schemas"]["MetadataToLorasCollectionInvocation"] | components["schemas"]["MetadataToLorasInvocation"] | components["schemas"]["MetadataToModelInvocation"] | components["schemas"]["MetadataToSDXLLorasInvocation"] | components["schemas"]["MetadataToSDXLModelInvocation"] | components["schemas"]["MetadataToSchedulerInvocation"] | components["schemas"]["MetadataToStringCollectionInvocation"] | components["schemas"]["MetadataToStringInvocation"] | components["schemas"]["MetadataToT2IAdaptersInvocation"] | components["schemas"]["MetadataToVAEInvocation"] | components["schemas"]["ModelIdentifierInvocation"] | components["schemas"]["MultiplyInvocation"] | components["schemas"]["NoiseInvocation"] | components["schemas"]["NormalMapInvocation"] | components["schemas"]["OklabUnsharpMaskInvocation"] | components["schemas"]["OklchImageHueAdjustmentInvocation"] | components["schemas"]["OpenAIImageGenerationInvocation"] | components["schemas"]["PBRMapsInvocation"] | components["schemas"]["PairTileImageInvocation"] | components["schemas"]["PasteImageIntoBoundingBoxInvocation"] | components["schemas"]["PiDDecoderLoaderInvocation"] | components["schemas"]["PiDUpscaleInvocation"] | components["schemas"]["PiDiNetEdgeDetectionInvocation"] | components["schemas"]["PromptTemplateInvocation"] | components["schemas"]["PromptsFromFileInvocation"] | components["schemas"]["QwenImageDenoiseInvocation"] | components["schemas"]["QwenImageImageToLatentsInvocation"] | components["schemas"]["QwenImageLatentsToImageInvocation"] | components["schemas"]["QwenImageLoRACollectionLoader"] | components["schemas"]["QwenImageLoRALoaderInvocation"] | 
components["schemas"]["QwenImageModelLoaderInvocation"] | components["schemas"]["QwenImagePiDDecodeInvocation"] | components["schemas"]["QwenImageTextEncoderInvocation"] | components["schemas"]["RandomFloatInvocation"] | components["schemas"]["RandomIntInvocation"] | components["schemas"]["RandomRangeInvocation"] | components["schemas"]["RangeInvocation"] | components["schemas"]["RangeOfSizeInvocation"] | components["schemas"]["RectangleMaskInvocation"] | components["schemas"]["ResizeLatentsInvocation"] | components["schemas"]["RoundInvocation"] | components["schemas"]["SD3DenoiseInvocation"] | components["schemas"]["SD3ImageToLatentsInvocation"] | components["schemas"]["SD3LatentsToImageInvocation"] | components["schemas"]["SD3PiDDecodeInvocation"] | components["schemas"]["SDXLCompelPromptInvocation"] | components["schemas"]["SDXLLoRACollectionLoader"] | components["schemas"]["SDXLLoRALoaderInvocation"] | components["schemas"]["SDXLModelLoaderInvocation"] | components["schemas"]["SDXLPiDDecodeInvocation"] | components["schemas"]["SDXLRefinerCompelPromptInvocation"] | components["schemas"]["SDXLRefinerModelLoaderInvocation"] | components["schemas"]["SaveImageInvocation"] | components["schemas"]["SaveImageToFileInvocation"] | components["schemas"]["ScaleLatentsInvocation"] | components["schemas"]["SchedulerInvocation"] | components["schemas"]["Sd3ModelLoaderInvocation"] | components["schemas"]["Sd3TextEncoderInvocation"] | components["schemas"]["SeamlessModeInvocation"] | components["schemas"]["SeedreamImageGenerationInvocation"] | components["schemas"]["SegmentAnythingInvocation"] | components["schemas"]["ShowImageInvocation"] | components["schemas"]["SpandrelImageToImageAutoscaleInvocation"] | components["schemas"]["SpandrelImageToImageInvocation"] | components["schemas"]["StringBatchInvocation"] | components["schemas"]["StringCollectionInvocation"] | components["schemas"]["StringGenerator"] | components["schemas"]["StringInvocation"] | 
components["schemas"]["StringJoinInvocation"] | components["schemas"]["StringJoinThreeInvocation"] | components["schemas"]["StringReplaceInvocation"] | components["schemas"]["StringSplitInvocation"] | components["schemas"]["StringSplitNegInvocation"] | components["schemas"]["SubtractInvocation"] | components["schemas"]["T2IAdapterInvocation"] | components["schemas"]["TextLLMInvocation"] | components["schemas"]["TileToPropertiesInvocation"] | components["schemas"]["TiledMultiDiffusionDenoiseLatents"] | components["schemas"]["UnsharpMaskInvocation"] | components["schemas"]["VAELoaderInvocation"] | components["schemas"]["ZImageControlInvocation"] | components["schemas"]["ZImageDenoiseInvocation"] | components["schemas"]["ZImageDenoiseMetaInvocation"] | components["schemas"]["ZImageImageToLatentsInvocation"] | components["schemas"]["ZImageLatentsToImageInvocation"] | components["schemas"]["ZImageLoRACollectionLoader"] | components["schemas"]["ZImageLoRALoaderInvocation"] | components["schemas"]["ZImageModelLoaderInvocation"] | components["schemas"]["ZImagePiDDecodeInvocation"] | components["schemas"]["ZImageSeedVarianceEnhancerInvocation"] | components["schemas"]["ZImageTextEncoderInvocation"]; /** * Invocation Source Id * @description The ID of the prepared invocation's source node @@ -15791,7 +16103,7 @@ export type components = { * Result * @description The result of the invocation */ - result: components["schemas"]["AnimaConditioningOutput"] | components["schemas"]["AnimaLoRALoaderOutput"] | components["schemas"]["AnimaModelLoaderOutput"] | components["schemas"]["BooleanCollectionOutput"] | components["schemas"]["BooleanOutput"] | components["schemas"]["BoundingBoxCollectionOutput"] | components["schemas"]["BoundingBoxOutput"] | components["schemas"]["CLIPOutput"] | components["schemas"]["CLIPSkipInvocationOutput"] | components["schemas"]["CalculateImageTilesOutput"] | components["schemas"]["CogView4ConditioningOutput"] | 
components["schemas"]["CogView4ModelLoaderOutput"] | components["schemas"]["CollectInvocationOutput"] | components["schemas"]["ColorCollectionOutput"] | components["schemas"]["ColorOutput"] | components["schemas"]["ConditioningCollectionOutput"] | components["schemas"]["ConditioningOutput"] | components["schemas"]["ControlOutput"] | components["schemas"]["DenoiseMaskOutput"] | components["schemas"]["FaceMaskOutput"] | components["schemas"]["FaceOffOutput"] | components["schemas"]["FloatCollectionOutput"] | components["schemas"]["FloatGeneratorOutput"] | components["schemas"]["FloatOutput"] | components["schemas"]["Flux2KleinLoRALoaderOutput"] | components["schemas"]["Flux2KleinModelLoaderOutput"] | components["schemas"]["FluxConditioningCollectionOutput"] | components["schemas"]["FluxConditioningOutput"] | components["schemas"]["FluxControlLoRALoaderOutput"] | components["schemas"]["FluxControlNetOutput"] | components["schemas"]["FluxFillOutput"] | components["schemas"]["FluxKontextOutput"] | components["schemas"]["FluxLoRALoaderOutput"] | components["schemas"]["FluxModelLoaderOutput"] | components["schemas"]["FluxReduxOutput"] | components["schemas"]["GradientMaskOutput"] | components["schemas"]["IPAdapterOutput"] | components["schemas"]["IdealSizeOutput"] | components["schemas"]["IfInvocationOutput"] | components["schemas"]["ImageCollectionOutput"] | components["schemas"]["ImageGeneratorOutput"] | components["schemas"]["ImageOutput"] | components["schemas"]["ImagePanelCoordinateOutput"] | components["schemas"]["IntegerCollectionOutput"] | components["schemas"]["IntegerGeneratorOutput"] | components["schemas"]["IntegerOutput"] | components["schemas"]["IterateInvocationOutput"] | components["schemas"]["LatentsCollectionOutput"] | components["schemas"]["LatentsMetaOutput"] | components["schemas"]["LatentsOutput"] | components["schemas"]["LoRALoaderOutput"] | components["schemas"]["LoRASelectorOutput"] | components["schemas"]["MDControlListOutput"] | 
components["schemas"]["MDIPAdapterListOutput"] | components["schemas"]["MDT2IAdapterListOutput"] | components["schemas"]["MaskOutput"] | components["schemas"]["MetadataItemOutput"] | components["schemas"]["MetadataOutput"] | components["schemas"]["MetadataToLorasCollectionOutput"] | components["schemas"]["MetadataToModelOutput"] | components["schemas"]["MetadataToSDXLModelOutput"] | components["schemas"]["ModelIdentifierOutput"] | components["schemas"]["ModelLoaderOutput"] | components["schemas"]["NoiseOutput"] | components["schemas"]["PBRMapsOutput"] | components["schemas"]["PairTileImageOutput"] | components["schemas"]["PromptTemplateOutput"] | components["schemas"]["QwenImageConditioningOutput"] | components["schemas"]["QwenImageLoRALoaderOutput"] | components["schemas"]["QwenImageModelLoaderOutput"] | components["schemas"]["SD3ConditioningOutput"] | components["schemas"]["SDXLLoRALoaderOutput"] | components["schemas"]["SDXLModelLoaderOutput"] | components["schemas"]["SDXLRefinerModelLoaderOutput"] | components["schemas"]["SchedulerOutput"] | components["schemas"]["Sd3ModelLoaderOutput"] | components["schemas"]["SeamlessModeOutput"] | components["schemas"]["String2Output"] | components["schemas"]["StringCollectionOutput"] | components["schemas"]["StringGeneratorOutput"] | components["schemas"]["StringOutput"] | components["schemas"]["StringPosNegOutput"] | components["schemas"]["T2IAdapterOutput"] | components["schemas"]["TileToPropertiesOutput"] | components["schemas"]["UNetOutput"] | components["schemas"]["VAEOutput"] | components["schemas"]["ZImageConditioningOutput"] | components["schemas"]["ZImageControlOutput"] | components["schemas"]["ZImageLoRALoaderOutput"] | components["schemas"]["ZImageModelLoaderOutput"]; + result: components["schemas"]["AnimaConditioningOutput"] | components["schemas"]["AnimaLoRALoaderOutput"] | components["schemas"]["AnimaModelLoaderOutput"] | components["schemas"]["BooleanCollectionOutput"] | components["schemas"]["BooleanOutput"] 
| components["schemas"]["BoundingBoxCollectionOutput"] | components["schemas"]["BoundingBoxOutput"] | components["schemas"]["CLIPOutput"] | components["schemas"]["CLIPSkipInvocationOutput"] | components["schemas"]["CalculateImageTilesOutput"] | components["schemas"]["CogView4ConditioningOutput"] | components["schemas"]["CogView4ModelLoaderOutput"] | components["schemas"]["CollectInvocationOutput"] | components["schemas"]["ColorCollectionOutput"] | components["schemas"]["ColorOutput"] | components["schemas"]["ConditioningCollectionOutput"] | components["schemas"]["ConditioningOutput"] | components["schemas"]["ControlOutput"] | components["schemas"]["DenoiseMaskOutput"] | components["schemas"]["FaceMaskOutput"] | components["schemas"]["FaceOffOutput"] | components["schemas"]["FloatCollectionOutput"] | components["schemas"]["FloatGeneratorOutput"] | components["schemas"]["FloatOutput"] | components["schemas"]["Flux2KleinLoRALoaderOutput"] | components["schemas"]["Flux2KleinModelLoaderOutput"] | components["schemas"]["FluxConditioningCollectionOutput"] | components["schemas"]["FluxConditioningOutput"] | components["schemas"]["FluxControlLoRALoaderOutput"] | components["schemas"]["FluxControlNetOutput"] | components["schemas"]["FluxFillOutput"] | components["schemas"]["FluxKontextOutput"] | components["schemas"]["FluxLoRALoaderOutput"] | components["schemas"]["FluxModelLoaderOutput"] | components["schemas"]["FluxReduxOutput"] | components["schemas"]["Gemma2EncoderOutput"] | components["schemas"]["GradientMaskOutput"] | components["schemas"]["IPAdapterOutput"] | components["schemas"]["IdealSizeOutput"] | components["schemas"]["IfInvocationOutput"] | components["schemas"]["ImageCollectionOutput"] | components["schemas"]["ImageGeneratorOutput"] | components["schemas"]["ImageOutput"] | components["schemas"]["ImagePanelCoordinateOutput"] | components["schemas"]["IntegerCollectionOutput"] | components["schemas"]["IntegerGeneratorOutput"] | 
components["schemas"]["IntegerOutput"] | components["schemas"]["IterateInvocationOutput"] | components["schemas"]["LatentsCollectionOutput"] | components["schemas"]["LatentsMetaOutput"] | components["schemas"]["LatentsOutput"] | components["schemas"]["LoRALoaderOutput"] | components["schemas"]["LoRASelectorOutput"] | components["schemas"]["MDControlListOutput"] | components["schemas"]["MDIPAdapterListOutput"] | components["schemas"]["MDT2IAdapterListOutput"] | components["schemas"]["MaskOutput"] | components["schemas"]["MetadataItemOutput"] | components["schemas"]["MetadataOutput"] | components["schemas"]["MetadataToLorasCollectionOutput"] | components["schemas"]["MetadataToModelOutput"] | components["schemas"]["MetadataToSDXLModelOutput"] | components["schemas"]["ModelIdentifierOutput"] | components["schemas"]["ModelLoaderOutput"] | components["schemas"]["NoiseOutput"] | components["schemas"]["PBRMapsOutput"] | components["schemas"]["PairTileImageOutput"] | components["schemas"]["PiDDecoderOutput"] | components["schemas"]["PromptTemplateOutput"] | components["schemas"]["QwenImageConditioningOutput"] | components["schemas"]["QwenImageLoRALoaderOutput"] | components["schemas"]["QwenImageModelLoaderOutput"] | components["schemas"]["SD3ConditioningOutput"] | components["schemas"]["SDXLLoRALoaderOutput"] | components["schemas"]["SDXLModelLoaderOutput"] | components["schemas"]["SDXLRefinerModelLoaderOutput"] | components["schemas"]["SchedulerOutput"] | components["schemas"]["Sd3ModelLoaderOutput"] | components["schemas"]["SeamlessModeOutput"] | components["schemas"]["String2Output"] | components["schemas"]["StringCollectionOutput"] | components["schemas"]["StringGeneratorOutput"] | components["schemas"]["StringOutput"] | components["schemas"]["StringPosNegOutput"] | components["schemas"]["T2IAdapterOutput"] | components["schemas"]["TileToPropertiesOutput"] | components["schemas"]["UNetOutput"] | components["schemas"]["VAEOutput"] | 
components["schemas"]["ZImageConditioningOutput"] | components["schemas"]["ZImageControlOutput"] | components["schemas"]["ZImageLoRALoaderOutput"] | components["schemas"]["ZImageModelLoaderOutput"]; }; /** * InvocationErrorEvent @@ -15845,7 +16157,7 @@ export type components = { * Invocation * @description The ID of the invocation */ - invocation: components["schemas"]["AddInvocation"] | components["schemas"]["AlibabaCloudImageGenerationInvocation"] | components["schemas"]["AlphaMaskToTensorInvocation"] | components["schemas"]["AnimaDenoiseInvocation"] | components["schemas"]["AnimaImageToLatentsInvocation"] | components["schemas"]["AnimaLatentsToImageInvocation"] | components["schemas"]["AnimaLoRACollectionLoader"] | components["schemas"]["AnimaLoRALoaderInvocation"] | components["schemas"]["AnimaModelLoaderInvocation"] | components["schemas"]["AnimaTextEncoderInvocation"] | components["schemas"]["ApplyMaskTensorToImageInvocation"] | components["schemas"]["ApplyMaskToImageInvocation"] | components["schemas"]["BlankImageInvocation"] | components["schemas"]["BlendLatentsInvocation"] | components["schemas"]["BooleanCollectionInvocation"] | components["schemas"]["BooleanInvocation"] | components["schemas"]["BoundingBoxInvocation"] | components["schemas"]["CLIPSkipInvocation"] | components["schemas"]["CV2InfillInvocation"] | components["schemas"]["CalculateImageTilesEvenSplitInvocation"] | components["schemas"]["CalculateImageTilesInvocation"] | components["schemas"]["CalculateImageTilesMinimumOverlapInvocation"] | components["schemas"]["CannyEdgeDetectionInvocation"] | components["schemas"]["CanvasOutputInvocation"] | components["schemas"]["CanvasPasteBackInvocation"] | components["schemas"]["CanvasV2MaskAndCropInvocation"] | components["schemas"]["CenterPadCropInvocation"] | components["schemas"]["CogView4DenoiseInvocation"] | components["schemas"]["CogView4ImageToLatentsInvocation"] | components["schemas"]["CogView4LatentsToImageInvocation"] | 
components["schemas"]["CogView4ModelLoaderInvocation"] | components["schemas"]["CogView4TextEncoderInvocation"] | components["schemas"]["CollectInvocation"] | components["schemas"]["ColorCorrectInvocation"] | components["schemas"]["ColorInvocation"] | components["schemas"]["ColorMapInvocation"] | components["schemas"]["CompelInvocation"] | components["schemas"]["ConditioningCollectionInvocation"] | components["schemas"]["ConditioningInvocation"] | components["schemas"]["ContentShuffleInvocation"] | components["schemas"]["ControlNetInvocation"] | components["schemas"]["CoreMetadataInvocation"] | components["schemas"]["CreateDenoiseMaskInvocation"] | components["schemas"]["CreateGradientMaskInvocation"] | components["schemas"]["CropImageToBoundingBoxInvocation"] | components["schemas"]["CropLatentsCoreInvocation"] | components["schemas"]["CvInpaintInvocation"] | components["schemas"]["DWOpenposeDetectionInvocation"] | components["schemas"]["DecodeInvisibleWatermarkInvocation"] | components["schemas"]["DenoiseLatentsInvocation"] | components["schemas"]["DenoiseLatentsMetaInvocation"] | components["schemas"]["DepthAnythingDepthEstimationInvocation"] | components["schemas"]["DivideInvocation"] | components["schemas"]["DynamicPromptInvocation"] | components["schemas"]["ESRGANInvocation"] | components["schemas"]["ExpandMaskWithFadeInvocation"] | components["schemas"]["FLUXLoRACollectionLoader"] | components["schemas"]["FaceIdentifierInvocation"] | components["schemas"]["FaceMaskInvocation"] | components["schemas"]["FaceOffInvocation"] | components["schemas"]["FloatBatchInvocation"] | components["schemas"]["FloatCollectionInvocation"] | components["schemas"]["FloatGenerator"] | components["schemas"]["FloatInvocation"] | components["schemas"]["FloatLinearRangeInvocation"] | components["schemas"]["FloatMathInvocation"] | components["schemas"]["FloatToIntegerInvocation"] | components["schemas"]["Flux2DenoiseInvocation"] | 
components["schemas"]["Flux2KleinLoRACollectionLoader"] | components["schemas"]["Flux2KleinLoRALoaderInvocation"] | components["schemas"]["Flux2KleinModelLoaderInvocation"] | components["schemas"]["Flux2KleinTextEncoderInvocation"] | components["schemas"]["Flux2VaeDecodeInvocation"] | components["schemas"]["Flux2VaeEncodeInvocation"] | components["schemas"]["FluxControlLoRALoaderInvocation"] | components["schemas"]["FluxControlNetInvocation"] | components["schemas"]["FluxDenoiseInvocation"] | components["schemas"]["FluxDenoiseLatentsMetaInvocation"] | components["schemas"]["FluxFillInvocation"] | components["schemas"]["FluxIPAdapterInvocation"] | components["schemas"]["FluxKontextConcatenateImagesInvocation"] | components["schemas"]["FluxKontextInvocation"] | components["schemas"]["FluxLoRALoaderInvocation"] | components["schemas"]["FluxModelLoaderInvocation"] | components["schemas"]["FluxReduxInvocation"] | components["schemas"]["FluxTextEncoderInvocation"] | components["schemas"]["FluxVaeDecodeInvocation"] | components["schemas"]["FluxVaeEncodeInvocation"] | components["schemas"]["FreeUInvocation"] | components["schemas"]["GeminiImageGenerationInvocation"] | components["schemas"]["GetMaskBoundingBoxInvocation"] | components["schemas"]["GroundingDinoInvocation"] | components["schemas"]["HEDEdgeDetectionInvocation"] | components["schemas"]["HeuristicResizeInvocation"] | components["schemas"]["IPAdapterInvocation"] | components["schemas"]["IdealSizeInvocation"] | components["schemas"]["IfInvocation"] | components["schemas"]["ImageBatchInvocation"] | components["schemas"]["ImageBlurInvocation"] | components["schemas"]["ImageChannelInvocation"] | components["schemas"]["ImageChannelMultiplyInvocation"] | components["schemas"]["ImageChannelOffsetInvocation"] | components["schemas"]["ImageCollectionInvocation"] | components["schemas"]["ImageConvertInvocation"] | components["schemas"]["ImageCropInvocation"] | components["schemas"]["ImageGenerator"] | 
components["schemas"]["ImageHueAdjustmentInvocation"] | components["schemas"]["ImageInverseLerpInvocation"] | components["schemas"]["ImageInvocation"] | components["schemas"]["ImageLerpInvocation"] | components["schemas"]["ImageMaskToTensorInvocation"] | components["schemas"]["ImageMultiplyInvocation"] | components["schemas"]["ImageNSFWBlurInvocation"] | components["schemas"]["ImageNoiseInvocation"] | components["schemas"]["ImagePanelLayoutInvocation"] | components["schemas"]["ImagePasteInvocation"] | components["schemas"]["ImageResizeInvocation"] | components["schemas"]["ImageScaleInvocation"] | components["schemas"]["ImageToLatentsInvocation"] | components["schemas"]["ImageWatermarkInvocation"] | components["schemas"]["InfillColorInvocation"] | components["schemas"]["InfillPatchMatchInvocation"] | components["schemas"]["InfillTileInvocation"] | components["schemas"]["IntegerBatchInvocation"] | components["schemas"]["IntegerCollectionInvocation"] | components["schemas"]["IntegerGenerator"] | components["schemas"]["IntegerInvocation"] | components["schemas"]["IntegerMathInvocation"] | components["schemas"]["InvertTensorMaskInvocation"] | components["schemas"]["InvokeAdjustImageHuePlusInvocation"] | components["schemas"]["InvokeEquivalentAchromaticLightnessInvocation"] | components["schemas"]["InvokeImageBlendInvocation"] | components["schemas"]["InvokeImageCompositorInvocation"] | components["schemas"]["InvokeImageDilateOrErodeInvocation"] | components["schemas"]["InvokeImageEnhanceInvocation"] | components["schemas"]["InvokeImageValueThresholdsInvocation"] | components["schemas"]["IterateInvocation"] | components["schemas"]["LaMaInfillInvocation"] | components["schemas"]["LatentsCollectionInvocation"] | components["schemas"]["LatentsInvocation"] | components["schemas"]["LatentsToImageInvocation"] | components["schemas"]["LineartAnimeEdgeDetectionInvocation"] | components["schemas"]["LineartEdgeDetectionInvocation"] | 
components["schemas"]["LlavaOnevisionVllmInvocation"] | components["schemas"]["LoRACollectionLoader"] | components["schemas"]["LoRALoaderInvocation"] | components["schemas"]["LoRASelectorInvocation"] | components["schemas"]["MLSDDetectionInvocation"] | components["schemas"]["MainModelLoaderInvocation"] | components["schemas"]["MaskCombineInvocation"] | components["schemas"]["MaskEdgeInvocation"] | components["schemas"]["MaskFromAlphaInvocation"] | components["schemas"]["MaskFromIDInvocation"] | components["schemas"]["MaskTensorToImageInvocation"] | components["schemas"]["MediaPipeFaceDetectionInvocation"] | components["schemas"]["MergeMetadataInvocation"] | components["schemas"]["MergeTilesToImageInvocation"] | components["schemas"]["MetadataFieldExtractorInvocation"] | components["schemas"]["MetadataFromImageInvocation"] | components["schemas"]["MetadataInvocation"] | components["schemas"]["MetadataItemInvocation"] | components["schemas"]["MetadataItemLinkedInvocation"] | components["schemas"]["MetadataToBoolCollectionInvocation"] | components["schemas"]["MetadataToBoolInvocation"] | components["schemas"]["MetadataToControlnetsInvocation"] | components["schemas"]["MetadataToFloatCollectionInvocation"] | components["schemas"]["MetadataToFloatInvocation"] | components["schemas"]["MetadataToIPAdaptersInvocation"] | components["schemas"]["MetadataToIntegerCollectionInvocation"] | components["schemas"]["MetadataToIntegerInvocation"] | components["schemas"]["MetadataToLorasCollectionInvocation"] | components["schemas"]["MetadataToLorasInvocation"] | components["schemas"]["MetadataToModelInvocation"] | components["schemas"]["MetadataToSDXLLorasInvocation"] | components["schemas"]["MetadataToSDXLModelInvocation"] | components["schemas"]["MetadataToSchedulerInvocation"] | components["schemas"]["MetadataToStringCollectionInvocation"] | components["schemas"]["MetadataToStringInvocation"] | components["schemas"]["MetadataToT2IAdaptersInvocation"] | 
components["schemas"]["MetadataToVAEInvocation"] | components["schemas"]["ModelIdentifierInvocation"] | components["schemas"]["MultiplyInvocation"] | components["schemas"]["NoiseInvocation"] | components["schemas"]["NormalMapInvocation"] | components["schemas"]["OklabUnsharpMaskInvocation"] | components["schemas"]["OklchImageHueAdjustmentInvocation"] | components["schemas"]["OpenAIImageGenerationInvocation"] | components["schemas"]["PBRMapsInvocation"] | components["schemas"]["PairTileImageInvocation"] | components["schemas"]["PasteImageIntoBoundingBoxInvocation"] | components["schemas"]["PiDiNetEdgeDetectionInvocation"] | components["schemas"]["PromptTemplateInvocation"] | components["schemas"]["PromptsFromFileInvocation"] | components["schemas"]["QwenImageDenoiseInvocation"] | components["schemas"]["QwenImageImageToLatentsInvocation"] | components["schemas"]["QwenImageLatentsToImageInvocation"] | components["schemas"]["QwenImageLoRACollectionLoader"] | components["schemas"]["QwenImageLoRALoaderInvocation"] | components["schemas"]["QwenImageModelLoaderInvocation"] | components["schemas"]["QwenImageTextEncoderInvocation"] | components["schemas"]["RandomFloatInvocation"] | components["schemas"]["RandomIntInvocation"] | components["schemas"]["RandomRangeInvocation"] | components["schemas"]["RangeInvocation"] | components["schemas"]["RangeOfSizeInvocation"] | components["schemas"]["RectangleMaskInvocation"] | components["schemas"]["ResizeLatentsInvocation"] | components["schemas"]["RoundInvocation"] | components["schemas"]["SD3DenoiseInvocation"] | components["schemas"]["SD3ImageToLatentsInvocation"] | components["schemas"]["SD3LatentsToImageInvocation"] | components["schemas"]["SDXLCompelPromptInvocation"] | components["schemas"]["SDXLLoRACollectionLoader"] | components["schemas"]["SDXLLoRALoaderInvocation"] | components["schemas"]["SDXLModelLoaderInvocation"] | components["schemas"]["SDXLRefinerCompelPromptInvocation"] | 
components["schemas"]["SDXLRefinerModelLoaderInvocation"] | components["schemas"]["SaveImageInvocation"] | components["schemas"]["SaveImageToFileInvocation"] | components["schemas"]["ScaleLatentsInvocation"] | components["schemas"]["SchedulerInvocation"] | components["schemas"]["Sd3ModelLoaderInvocation"] | components["schemas"]["Sd3TextEncoderInvocation"] | components["schemas"]["SeamlessModeInvocation"] | components["schemas"]["SeedreamImageGenerationInvocation"] | components["schemas"]["SegmentAnythingInvocation"] | components["schemas"]["ShowImageInvocation"] | components["schemas"]["SpandrelImageToImageAutoscaleInvocation"] | components["schemas"]["SpandrelImageToImageInvocation"] | components["schemas"]["StringBatchInvocation"] | components["schemas"]["StringCollectionInvocation"] | components["schemas"]["StringGenerator"] | components["schemas"]["StringInvocation"] | components["schemas"]["StringJoinInvocation"] | components["schemas"]["StringJoinThreeInvocation"] | components["schemas"]["StringReplaceInvocation"] | components["schemas"]["StringSplitInvocation"] | components["schemas"]["StringSplitNegInvocation"] | components["schemas"]["SubtractInvocation"] | components["schemas"]["T2IAdapterInvocation"] | components["schemas"]["TextLLMInvocation"] | components["schemas"]["TileToPropertiesInvocation"] | components["schemas"]["TiledMultiDiffusionDenoiseLatents"] | components["schemas"]["UnsharpMaskInvocation"] | components["schemas"]["VAELoaderInvocation"] | components["schemas"]["ZImageControlInvocation"] | components["schemas"]["ZImageDenoiseInvocation"] | components["schemas"]["ZImageDenoiseMetaInvocation"] | components["schemas"]["ZImageImageToLatentsInvocation"] | components["schemas"]["ZImageLatentsToImageInvocation"] | components["schemas"]["ZImageLoRACollectionLoader"] | components["schemas"]["ZImageLoRALoaderInvocation"] | components["schemas"]["ZImageModelLoaderInvocation"] | components["schemas"]["ZImageSeedVarianceEnhancerInvocation"] | 
components["schemas"]["ZImageTextEncoderInvocation"]; + invocation: components["schemas"]["AddInvocation"] | components["schemas"]["AlibabaCloudImageGenerationInvocation"] | components["schemas"]["AlphaMaskToTensorInvocation"] | components["schemas"]["AnimaDenoiseInvocation"] | components["schemas"]["AnimaImageToLatentsInvocation"] | components["schemas"]["AnimaLatentsToImageInvocation"] | components["schemas"]["AnimaLoRACollectionLoader"] | components["schemas"]["AnimaLoRALoaderInvocation"] | components["schemas"]["AnimaModelLoaderInvocation"] | components["schemas"]["AnimaTextEncoderInvocation"] | components["schemas"]["ApplyMaskTensorToImageInvocation"] | components["schemas"]["ApplyMaskToImageInvocation"] | components["schemas"]["BlankImageInvocation"] | components["schemas"]["BlendLatentsInvocation"] | components["schemas"]["BooleanCollectionInvocation"] | components["schemas"]["BooleanInvocation"] | components["schemas"]["BoundingBoxInvocation"] | components["schemas"]["CLIPSkipInvocation"] | components["schemas"]["CV2InfillInvocation"] | components["schemas"]["CalculateImageTilesEvenSplitInvocation"] | components["schemas"]["CalculateImageTilesInvocation"] | components["schemas"]["CalculateImageTilesMinimumOverlapInvocation"] | components["schemas"]["CannyEdgeDetectionInvocation"] | components["schemas"]["CanvasOutputInvocation"] | components["schemas"]["CanvasPasteBackInvocation"] | components["schemas"]["CanvasV2MaskAndCropInvocation"] | components["schemas"]["CenterPadCropInvocation"] | components["schemas"]["CogView4DenoiseInvocation"] | components["schemas"]["CogView4ImageToLatentsInvocation"] | components["schemas"]["CogView4LatentsToImageInvocation"] | components["schemas"]["CogView4ModelLoaderInvocation"] | components["schemas"]["CogView4TextEncoderInvocation"] | components["schemas"]["CollectInvocation"] | components["schemas"]["ColorCorrectInvocation"] | components["schemas"]["ColorInvocation"] | components["schemas"]["ColorMapInvocation"] | 
components["schemas"]["CompelInvocation"] | components["schemas"]["ConditioningCollectionInvocation"] | components["schemas"]["ConditioningInvocation"] | components["schemas"]["ContentShuffleInvocation"] | components["schemas"]["ControlNetInvocation"] | components["schemas"]["CoreMetadataInvocation"] | components["schemas"]["CreateDenoiseMaskInvocation"] | components["schemas"]["CreateGradientMaskInvocation"] | components["schemas"]["CropImageToBoundingBoxInvocation"] | components["schemas"]["CropLatentsCoreInvocation"] | components["schemas"]["CvInpaintInvocation"] | components["schemas"]["DWOpenposeDetectionInvocation"] | components["schemas"]["DecodeInvisibleWatermarkInvocation"] | components["schemas"]["DenoiseLatentsInvocation"] | components["schemas"]["DenoiseLatentsMetaInvocation"] | components["schemas"]["DepthAnythingDepthEstimationInvocation"] | components["schemas"]["DivideInvocation"] | components["schemas"]["DynamicPromptInvocation"] | components["schemas"]["ESRGANInvocation"] | components["schemas"]["ExpandMaskWithFadeInvocation"] | components["schemas"]["FLUXLoRACollectionLoader"] | components["schemas"]["FaceIdentifierInvocation"] | components["schemas"]["FaceMaskInvocation"] | components["schemas"]["FaceOffInvocation"] | components["schemas"]["FloatBatchInvocation"] | components["schemas"]["FloatCollectionInvocation"] | components["schemas"]["FloatGenerator"] | components["schemas"]["FloatInvocation"] | components["schemas"]["FloatLinearRangeInvocation"] | components["schemas"]["FloatMathInvocation"] | components["schemas"]["FloatToIntegerInvocation"] | components["schemas"]["Flux2DenoiseInvocation"] | components["schemas"]["Flux2KleinLoRACollectionLoader"] | components["schemas"]["Flux2KleinLoRALoaderInvocation"] | components["schemas"]["Flux2KleinModelLoaderInvocation"] | components["schemas"]["Flux2KleinTextEncoderInvocation"] | components["schemas"]["Flux2PiDDecodeInvocation"] | components["schemas"]["Flux2VaeDecodeInvocation"] | 
components["schemas"]["Flux2VaeEncodeInvocation"] | components["schemas"]["FluxControlLoRALoaderInvocation"] | components["schemas"]["FluxControlNetInvocation"] | components["schemas"]["FluxDenoiseInvocation"] | components["schemas"]["FluxDenoiseLatentsMetaInvocation"] | components["schemas"]["FluxFillInvocation"] | components["schemas"]["FluxIPAdapterInvocation"] | components["schemas"]["FluxKontextConcatenateImagesInvocation"] | components["schemas"]["FluxKontextInvocation"] | components["schemas"]["FluxLoRALoaderInvocation"] | components["schemas"]["FluxModelLoaderInvocation"] | components["schemas"]["FluxPiDDecodeInvocation"] | components["schemas"]["FluxReduxInvocation"] | components["schemas"]["FluxTextEncoderInvocation"] | components["schemas"]["FluxVaeDecodeInvocation"] | components["schemas"]["FluxVaeEncodeInvocation"] | components["schemas"]["FreeUInvocation"] | components["schemas"]["GeminiImageGenerationInvocation"] | components["schemas"]["Gemma2EncoderLoaderInvocation"] | components["schemas"]["GetMaskBoundingBoxInvocation"] | components["schemas"]["GroundingDinoInvocation"] | components["schemas"]["HEDEdgeDetectionInvocation"] | components["schemas"]["HeuristicResizeInvocation"] | components["schemas"]["IPAdapterInvocation"] | components["schemas"]["IdealSizeInvocation"] | components["schemas"]["IfInvocation"] | components["schemas"]["ImageBatchInvocation"] | components["schemas"]["ImageBlurInvocation"] | components["schemas"]["ImageChannelInvocation"] | components["schemas"]["ImageChannelMultiplyInvocation"] | components["schemas"]["ImageChannelOffsetInvocation"] | components["schemas"]["ImageCollectionInvocation"] | components["schemas"]["ImageConvertInvocation"] | components["schemas"]["ImageCropInvocation"] | components["schemas"]["ImageGenerator"] | components["schemas"]["ImageHueAdjustmentInvocation"] | components["schemas"]["ImageInverseLerpInvocation"] | components["schemas"]["ImageInvocation"] | components["schemas"]["ImageLerpInvocation"] | 
components["schemas"]["ImageMaskToTensorInvocation"] | components["schemas"]["ImageMultiplyInvocation"] | components["schemas"]["ImageNSFWBlurInvocation"] | components["schemas"]["ImageNoiseInvocation"] | components["schemas"]["ImagePanelLayoutInvocation"] | components["schemas"]["ImagePasteInvocation"] | components["schemas"]["ImageResizeInvocation"] | components["schemas"]["ImageScaleInvocation"] | components["schemas"]["ImageToLatentsInvocation"] | components["schemas"]["ImageWatermarkInvocation"] | components["schemas"]["InfillColorInvocation"] | components["schemas"]["InfillPatchMatchInvocation"] | components["schemas"]["InfillTileInvocation"] | components["schemas"]["IntegerBatchInvocation"] | components["schemas"]["IntegerCollectionInvocation"] | components["schemas"]["IntegerGenerator"] | components["schemas"]["IntegerInvocation"] | components["schemas"]["IntegerMathInvocation"] | components["schemas"]["InvertTensorMaskInvocation"] | components["schemas"]["InvokeAdjustImageHuePlusInvocation"] | components["schemas"]["InvokeEquivalentAchromaticLightnessInvocation"] | components["schemas"]["InvokeImageBlendInvocation"] | components["schemas"]["InvokeImageCompositorInvocation"] | components["schemas"]["InvokeImageDilateOrErodeInvocation"] | components["schemas"]["InvokeImageEnhanceInvocation"] | components["schemas"]["InvokeImageValueThresholdsInvocation"] | components["schemas"]["IterateInvocation"] | components["schemas"]["LaMaInfillInvocation"] | components["schemas"]["LatentsCollectionInvocation"] | components["schemas"]["LatentsInvocation"] | components["schemas"]["LatentsToImageInvocation"] | components["schemas"]["LineartAnimeEdgeDetectionInvocation"] | components["schemas"]["LineartEdgeDetectionInvocation"] | components["schemas"]["LlavaOnevisionVllmInvocation"] | components["schemas"]["LoRACollectionLoader"] | components["schemas"]["LoRALoaderInvocation"] | components["schemas"]["LoRASelectorInvocation"] | 
components["schemas"]["MLSDDetectionInvocation"] | components["schemas"]["MainModelLoaderInvocation"] | components["schemas"]["MaskCombineInvocation"] | components["schemas"]["MaskEdgeInvocation"] | components["schemas"]["MaskFromAlphaInvocation"] | components["schemas"]["MaskFromIDInvocation"] | components["schemas"]["MaskTensorToImageInvocation"] | components["schemas"]["MediaPipeFaceDetectionInvocation"] | components["schemas"]["MergeMetadataInvocation"] | components["schemas"]["MergeTilesToImageInvocation"] | components["schemas"]["MetadataFieldExtractorInvocation"] | components["schemas"]["MetadataFromImageInvocation"] | components["schemas"]["MetadataInvocation"] | components["schemas"]["MetadataItemInvocation"] | components["schemas"]["MetadataItemLinkedInvocation"] | components["schemas"]["MetadataToBoolCollectionInvocation"] | components["schemas"]["MetadataToBoolInvocation"] | components["schemas"]["MetadataToControlnetsInvocation"] | components["schemas"]["MetadataToFloatCollectionInvocation"] | components["schemas"]["MetadataToFloatInvocation"] | components["schemas"]["MetadataToIPAdaptersInvocation"] | components["schemas"]["MetadataToIntegerCollectionInvocation"] | components["schemas"]["MetadataToIntegerInvocation"] | components["schemas"]["MetadataToLorasCollectionInvocation"] | components["schemas"]["MetadataToLorasInvocation"] | components["schemas"]["MetadataToModelInvocation"] | components["schemas"]["MetadataToSDXLLorasInvocation"] | components["schemas"]["MetadataToSDXLModelInvocation"] | components["schemas"]["MetadataToSchedulerInvocation"] | components["schemas"]["MetadataToStringCollectionInvocation"] | components["schemas"]["MetadataToStringInvocation"] | components["schemas"]["MetadataToT2IAdaptersInvocation"] | components["schemas"]["MetadataToVAEInvocation"] | components["schemas"]["ModelIdentifierInvocation"] | components["schemas"]["MultiplyInvocation"] | components["schemas"]["NoiseInvocation"] | 
components["schemas"]["NormalMapInvocation"] | components["schemas"]["OklabUnsharpMaskInvocation"] | components["schemas"]["OklchImageHueAdjustmentInvocation"] | components["schemas"]["OpenAIImageGenerationInvocation"] | components["schemas"]["PBRMapsInvocation"] | components["schemas"]["PairTileImageInvocation"] | components["schemas"]["PasteImageIntoBoundingBoxInvocation"] | components["schemas"]["PiDDecoderLoaderInvocation"] | components["schemas"]["PiDUpscaleInvocation"] | components["schemas"]["PiDiNetEdgeDetectionInvocation"] | components["schemas"]["PromptTemplateInvocation"] | components["schemas"]["PromptsFromFileInvocation"] | components["schemas"]["QwenImageDenoiseInvocation"] | components["schemas"]["QwenImageImageToLatentsInvocation"] | components["schemas"]["QwenImageLatentsToImageInvocation"] | components["schemas"]["QwenImageLoRACollectionLoader"] | components["schemas"]["QwenImageLoRALoaderInvocation"] | components["schemas"]["QwenImageModelLoaderInvocation"] | components["schemas"]["QwenImagePiDDecodeInvocation"] | components["schemas"]["QwenImageTextEncoderInvocation"] | components["schemas"]["RandomFloatInvocation"] | components["schemas"]["RandomIntInvocation"] | components["schemas"]["RandomRangeInvocation"] | components["schemas"]["RangeInvocation"] | components["schemas"]["RangeOfSizeInvocation"] | components["schemas"]["RectangleMaskInvocation"] | components["schemas"]["ResizeLatentsInvocation"] | components["schemas"]["RoundInvocation"] | components["schemas"]["SD3DenoiseInvocation"] | components["schemas"]["SD3ImageToLatentsInvocation"] | components["schemas"]["SD3LatentsToImageInvocation"] | components["schemas"]["SD3PiDDecodeInvocation"] | components["schemas"]["SDXLCompelPromptInvocation"] | components["schemas"]["SDXLLoRACollectionLoader"] | components["schemas"]["SDXLLoRALoaderInvocation"] | components["schemas"]["SDXLModelLoaderInvocation"] | components["schemas"]["SDXLPiDDecodeInvocation"] | 
components["schemas"]["SDXLRefinerCompelPromptInvocation"] | components["schemas"]["SDXLRefinerModelLoaderInvocation"] | components["schemas"]["SaveImageInvocation"] | components["schemas"]["SaveImageToFileInvocation"] | components["schemas"]["ScaleLatentsInvocation"] | components["schemas"]["SchedulerInvocation"] | components["schemas"]["Sd3ModelLoaderInvocation"] | components["schemas"]["Sd3TextEncoderInvocation"] | components["schemas"]["SeamlessModeInvocation"] | components["schemas"]["SeedreamImageGenerationInvocation"] | components["schemas"]["SegmentAnythingInvocation"] | components["schemas"]["ShowImageInvocation"] | components["schemas"]["SpandrelImageToImageAutoscaleInvocation"] | components["schemas"]["SpandrelImageToImageInvocation"] | components["schemas"]["StringBatchInvocation"] | components["schemas"]["StringCollectionInvocation"] | components["schemas"]["StringGenerator"] | components["schemas"]["StringInvocation"] | components["schemas"]["StringJoinInvocation"] | components["schemas"]["StringJoinThreeInvocation"] | components["schemas"]["StringReplaceInvocation"] | components["schemas"]["StringSplitInvocation"] | components["schemas"]["StringSplitNegInvocation"] | components["schemas"]["SubtractInvocation"] | components["schemas"]["T2IAdapterInvocation"] | components["schemas"]["TextLLMInvocation"] | components["schemas"]["TileToPropertiesInvocation"] | components["schemas"]["TiledMultiDiffusionDenoiseLatents"] | components["schemas"]["UnsharpMaskInvocation"] | components["schemas"]["VAELoaderInvocation"] | components["schemas"]["ZImageControlInvocation"] | components["schemas"]["ZImageDenoiseInvocation"] | components["schemas"]["ZImageDenoiseMetaInvocation"] | components["schemas"]["ZImageImageToLatentsInvocation"] | components["schemas"]["ZImageLatentsToImageInvocation"] | components["schemas"]["ZImageLoRACollectionLoader"] | components["schemas"]["ZImageLoRALoaderInvocation"] | components["schemas"]["ZImageModelLoaderInvocation"] | 
components["schemas"]["ZImagePiDDecodeInvocation"] | components["schemas"]["ZImageSeedVarianceEnhancerInvocation"] | components["schemas"]["ZImageTextEncoderInvocation"]; /** * Invocation Source Id * @description The ID of the prepared invocation's source node @@ -15936,6 +16248,7 @@ export type components = { flux2_klein_lora_loader: components["schemas"]["Flux2KleinLoRALoaderOutput"]; flux2_klein_model_loader: components["schemas"]["Flux2KleinModelLoaderOutput"]; flux2_klein_text_encoder: components["schemas"]["FluxConditioningOutput"]; + flux2_pid_decode: components["schemas"]["ImageOutput"]; flux2_vae_decode: components["schemas"]["ImageOutput"]; flux2_vae_encode: components["schemas"]["LatentsOutput"]; flux_control_lora_loader: components["schemas"]["FluxControlLoRALoaderOutput"]; @@ -15949,12 +16262,14 @@ export type components = { flux_lora_collection_loader: components["schemas"]["FluxLoRALoaderOutput"]; flux_lora_loader: components["schemas"]["FluxLoRALoaderOutput"]; flux_model_loader: components["schemas"]["FluxModelLoaderOutput"]; + flux_pid_decode: components["schemas"]["ImageOutput"]; flux_redux: components["schemas"]["FluxReduxOutput"]; flux_text_encoder: components["schemas"]["FluxConditioningOutput"]; flux_vae_decode: components["schemas"]["ImageOutput"]; flux_vae_encode: components["schemas"]["LatentsOutput"]; freeu: components["schemas"]["UNetOutput"]; gemini_image_generation: components["schemas"]["ImageCollectionOutput"]; + gemma2_encoder_loader: components["schemas"]["Gemma2EncoderOutput"]; get_image_mask_bounding_box: components["schemas"]["BoundingBoxOutput"]; grounding_dino: components["schemas"]["BoundingBoxCollectionOutput"]; hed_edge_detection: components["schemas"]["ImageOutput"]; @@ -16057,6 +16372,8 @@ export type components = { pair_tile_image: components["schemas"]["PairTileImageOutput"]; paste_image_into_bounding_box: components["schemas"]["ImageOutput"]; pbr_maps: components["schemas"]["PBRMapsOutput"]; + pid_decoder_loader: 
components["schemas"]["PiDDecoderOutput"]; + pid_upscale: components["schemas"]["ImageOutput"]; pidi_edge_detection: components["schemas"]["ImageOutput"]; prompt_from_file: components["schemas"]["StringCollectionOutput"]; prompt_template: components["schemas"]["PromptTemplateOutput"]; @@ -16066,6 +16383,7 @@ export type components = { qwen_image_lora_collection_loader: components["schemas"]["QwenImageLoRALoaderOutput"]; qwen_image_lora_loader: components["schemas"]["QwenImageLoRALoaderOutput"]; qwen_image_model_loader: components["schemas"]["QwenImageModelLoaderOutput"]; + qwen_image_pid_decode: components["schemas"]["ImageOutput"]; qwen_image_text_encoder: components["schemas"]["QwenImageConditioningOutput"]; rand_float: components["schemas"]["FloatOutput"]; rand_int: components["schemas"]["IntegerOutput"]; @@ -16081,11 +16399,13 @@ export type components = { sd3_i2l: components["schemas"]["LatentsOutput"]; sd3_l2i: components["schemas"]["ImageOutput"]; sd3_model_loader: components["schemas"]["Sd3ModelLoaderOutput"]; + sd3_pid_decode: components["schemas"]["ImageOutput"]; sd3_text_encoder: components["schemas"]["SD3ConditioningOutput"]; sdxl_compel_prompt: components["schemas"]["ConditioningOutput"]; sdxl_lora_collection_loader: components["schemas"]["SDXLLoRALoaderOutput"]; sdxl_lora_loader: components["schemas"]["SDXLLoRALoaderOutput"]; sdxl_model_loader: components["schemas"]["SDXLModelLoaderOutput"]; + sdxl_pid_decode: components["schemas"]["ImageOutput"]; sdxl_refiner_compel_prompt: components["schemas"]["ConditioningOutput"]; sdxl_refiner_model_loader: components["schemas"]["SDXLRefinerModelLoaderOutput"]; seamless: components["schemas"]["SeamlessModeOutput"]; @@ -16121,6 +16441,7 @@ export type components = { z_image_lora_collection_loader: components["schemas"]["ZImageLoRALoaderOutput"]; z_image_lora_loader: components["schemas"]["ZImageLoRALoaderOutput"]; z_image_model_loader: components["schemas"]["ZImageModelLoaderOutput"]; + z_image_pid_decode: 
components["schemas"]["ImageOutput"]; z_image_seed_variance_enhancer: components["schemas"]["ZImageConditioningOutput"]; z_image_text_encoder: components["schemas"]["ZImageConditioningOutput"]; }; @@ -16176,7 +16497,7 @@ export type components = { * Invocation * @description The ID of the invocation */ - invocation: components["schemas"]["AddInvocation"] | components["schemas"]["AlibabaCloudImageGenerationInvocation"] | components["schemas"]["AlphaMaskToTensorInvocation"] | components["schemas"]["AnimaDenoiseInvocation"] | components["schemas"]["AnimaImageToLatentsInvocation"] | components["schemas"]["AnimaLatentsToImageInvocation"] | components["schemas"]["AnimaLoRACollectionLoader"] | components["schemas"]["AnimaLoRALoaderInvocation"] | components["schemas"]["AnimaModelLoaderInvocation"] | components["schemas"]["AnimaTextEncoderInvocation"] | components["schemas"]["ApplyMaskTensorToImageInvocation"] | components["schemas"]["ApplyMaskToImageInvocation"] | components["schemas"]["BlankImageInvocation"] | components["schemas"]["BlendLatentsInvocation"] | components["schemas"]["BooleanCollectionInvocation"] | components["schemas"]["BooleanInvocation"] | components["schemas"]["BoundingBoxInvocation"] | components["schemas"]["CLIPSkipInvocation"] | components["schemas"]["CV2InfillInvocation"] | components["schemas"]["CalculateImageTilesEvenSplitInvocation"] | components["schemas"]["CalculateImageTilesInvocation"] | components["schemas"]["CalculateImageTilesMinimumOverlapInvocation"] | components["schemas"]["CannyEdgeDetectionInvocation"] | components["schemas"]["CanvasOutputInvocation"] | components["schemas"]["CanvasPasteBackInvocation"] | components["schemas"]["CanvasV2MaskAndCropInvocation"] | components["schemas"]["CenterPadCropInvocation"] | components["schemas"]["CogView4DenoiseInvocation"] | components["schemas"]["CogView4ImageToLatentsInvocation"] | components["schemas"]["CogView4LatentsToImageInvocation"] | components["schemas"]["CogView4ModelLoaderInvocation"] 
| components["schemas"]["CogView4TextEncoderInvocation"] | components["schemas"]["CollectInvocation"] | components["schemas"]["ColorCorrectInvocation"] | components["schemas"]["ColorInvocation"] | components["schemas"]["ColorMapInvocation"] | components["schemas"]["CompelInvocation"] | components["schemas"]["ConditioningCollectionInvocation"] | components["schemas"]["ConditioningInvocation"] | components["schemas"]["ContentShuffleInvocation"] | components["schemas"]["ControlNetInvocation"] | components["schemas"]["CoreMetadataInvocation"] | components["schemas"]["CreateDenoiseMaskInvocation"] | components["schemas"]["CreateGradientMaskInvocation"] | components["schemas"]["CropImageToBoundingBoxInvocation"] | components["schemas"]["CropLatentsCoreInvocation"] | components["schemas"]["CvInpaintInvocation"] | components["schemas"]["DWOpenposeDetectionInvocation"] | components["schemas"]["DecodeInvisibleWatermarkInvocation"] | components["schemas"]["DenoiseLatentsInvocation"] | components["schemas"]["DenoiseLatentsMetaInvocation"] | components["schemas"]["DepthAnythingDepthEstimationInvocation"] | components["schemas"]["DivideInvocation"] | components["schemas"]["DynamicPromptInvocation"] | components["schemas"]["ESRGANInvocation"] | components["schemas"]["ExpandMaskWithFadeInvocation"] | components["schemas"]["FLUXLoRACollectionLoader"] | components["schemas"]["FaceIdentifierInvocation"] | components["schemas"]["FaceMaskInvocation"] | components["schemas"]["FaceOffInvocation"] | components["schemas"]["FloatBatchInvocation"] | components["schemas"]["FloatCollectionInvocation"] | components["schemas"]["FloatGenerator"] | components["schemas"]["FloatInvocation"] | components["schemas"]["FloatLinearRangeInvocation"] | components["schemas"]["FloatMathInvocation"] | components["schemas"]["FloatToIntegerInvocation"] | components["schemas"]["Flux2DenoiseInvocation"] | components["schemas"]["Flux2KleinLoRACollectionLoader"] | 
components["schemas"]["Flux2KleinLoRALoaderInvocation"] | components["schemas"]["Flux2KleinModelLoaderInvocation"] | components["schemas"]["Flux2KleinTextEncoderInvocation"] | components["schemas"]["Flux2VaeDecodeInvocation"] | components["schemas"]["Flux2VaeEncodeInvocation"] | components["schemas"]["FluxControlLoRALoaderInvocation"] | components["schemas"]["FluxControlNetInvocation"] | components["schemas"]["FluxDenoiseInvocation"] | components["schemas"]["FluxDenoiseLatentsMetaInvocation"] | components["schemas"]["FluxFillInvocation"] | components["schemas"]["FluxIPAdapterInvocation"] | components["schemas"]["FluxKontextConcatenateImagesInvocation"] | components["schemas"]["FluxKontextInvocation"] | components["schemas"]["FluxLoRALoaderInvocation"] | components["schemas"]["FluxModelLoaderInvocation"] | components["schemas"]["FluxReduxInvocation"] | components["schemas"]["FluxTextEncoderInvocation"] | components["schemas"]["FluxVaeDecodeInvocation"] | components["schemas"]["FluxVaeEncodeInvocation"] | components["schemas"]["FreeUInvocation"] | components["schemas"]["GeminiImageGenerationInvocation"] | components["schemas"]["GetMaskBoundingBoxInvocation"] | components["schemas"]["GroundingDinoInvocation"] | components["schemas"]["HEDEdgeDetectionInvocation"] | components["schemas"]["HeuristicResizeInvocation"] | components["schemas"]["IPAdapterInvocation"] | components["schemas"]["IdealSizeInvocation"] | components["schemas"]["IfInvocation"] | components["schemas"]["ImageBatchInvocation"] | components["schemas"]["ImageBlurInvocation"] | components["schemas"]["ImageChannelInvocation"] | components["schemas"]["ImageChannelMultiplyInvocation"] | components["schemas"]["ImageChannelOffsetInvocation"] | components["schemas"]["ImageCollectionInvocation"] | components["schemas"]["ImageConvertInvocation"] | components["schemas"]["ImageCropInvocation"] | components["schemas"]["ImageGenerator"] | components["schemas"]["ImageHueAdjustmentInvocation"] | 
components["schemas"]["ImageInverseLerpInvocation"] | components["schemas"]["ImageInvocation"] | components["schemas"]["ImageLerpInvocation"] | components["schemas"]["ImageMaskToTensorInvocation"] | components["schemas"]["ImageMultiplyInvocation"] | components["schemas"]["ImageNSFWBlurInvocation"] | components["schemas"]["ImageNoiseInvocation"] | components["schemas"]["ImagePanelLayoutInvocation"] | components["schemas"]["ImagePasteInvocation"] | components["schemas"]["ImageResizeInvocation"] | components["schemas"]["ImageScaleInvocation"] | components["schemas"]["ImageToLatentsInvocation"] | components["schemas"]["ImageWatermarkInvocation"] | components["schemas"]["InfillColorInvocation"] | components["schemas"]["InfillPatchMatchInvocation"] | components["schemas"]["InfillTileInvocation"] | components["schemas"]["IntegerBatchInvocation"] | components["schemas"]["IntegerCollectionInvocation"] | components["schemas"]["IntegerGenerator"] | components["schemas"]["IntegerInvocation"] | components["schemas"]["IntegerMathInvocation"] | components["schemas"]["InvertTensorMaskInvocation"] | components["schemas"]["InvokeAdjustImageHuePlusInvocation"] | components["schemas"]["InvokeEquivalentAchromaticLightnessInvocation"] | components["schemas"]["InvokeImageBlendInvocation"] | components["schemas"]["InvokeImageCompositorInvocation"] | components["schemas"]["InvokeImageDilateOrErodeInvocation"] | components["schemas"]["InvokeImageEnhanceInvocation"] | components["schemas"]["InvokeImageValueThresholdsInvocation"] | components["schemas"]["IterateInvocation"] | components["schemas"]["LaMaInfillInvocation"] | components["schemas"]["LatentsCollectionInvocation"] | components["schemas"]["LatentsInvocation"] | components["schemas"]["LatentsToImageInvocation"] | components["schemas"]["LineartAnimeEdgeDetectionInvocation"] | components["schemas"]["LineartEdgeDetectionInvocation"] | components["schemas"]["LlavaOnevisionVllmInvocation"] | components["schemas"]["LoRACollectionLoader"] | 
components["schemas"]["LoRALoaderInvocation"] | components["schemas"]["LoRASelectorInvocation"] | components["schemas"]["MLSDDetectionInvocation"] | components["schemas"]["MainModelLoaderInvocation"] | components["schemas"]["MaskCombineInvocation"] | components["schemas"]["MaskEdgeInvocation"] | components["schemas"]["MaskFromAlphaInvocation"] | components["schemas"]["MaskFromIDInvocation"] | components["schemas"]["MaskTensorToImageInvocation"] | components["schemas"]["MediaPipeFaceDetectionInvocation"] | components["schemas"]["MergeMetadataInvocation"] | components["schemas"]["MergeTilesToImageInvocation"] | components["schemas"]["MetadataFieldExtractorInvocation"] | components["schemas"]["MetadataFromImageInvocation"] | components["schemas"]["MetadataInvocation"] | components["schemas"]["MetadataItemInvocation"] | components["schemas"]["MetadataItemLinkedInvocation"] | components["schemas"]["MetadataToBoolCollectionInvocation"] | components["schemas"]["MetadataToBoolInvocation"] | components["schemas"]["MetadataToControlnetsInvocation"] | components["schemas"]["MetadataToFloatCollectionInvocation"] | components["schemas"]["MetadataToFloatInvocation"] | components["schemas"]["MetadataToIPAdaptersInvocation"] | components["schemas"]["MetadataToIntegerCollectionInvocation"] | components["schemas"]["MetadataToIntegerInvocation"] | components["schemas"]["MetadataToLorasCollectionInvocation"] | components["schemas"]["MetadataToLorasInvocation"] | components["schemas"]["MetadataToModelInvocation"] | components["schemas"]["MetadataToSDXLLorasInvocation"] | components["schemas"]["MetadataToSDXLModelInvocation"] | components["schemas"]["MetadataToSchedulerInvocation"] | components["schemas"]["MetadataToStringCollectionInvocation"] | components["schemas"]["MetadataToStringInvocation"] | components["schemas"]["MetadataToT2IAdaptersInvocation"] | components["schemas"]["MetadataToVAEInvocation"] | components["schemas"]["ModelIdentifierInvocation"] | 
components["schemas"]["MultiplyInvocation"] | components["schemas"]["NoiseInvocation"] | components["schemas"]["NormalMapInvocation"] | components["schemas"]["OklabUnsharpMaskInvocation"] | components["schemas"]["OklchImageHueAdjustmentInvocation"] | components["schemas"]["OpenAIImageGenerationInvocation"] | components["schemas"]["PBRMapsInvocation"] | components["schemas"]["PairTileImageInvocation"] | components["schemas"]["PasteImageIntoBoundingBoxInvocation"] | components["schemas"]["PiDiNetEdgeDetectionInvocation"] | components["schemas"]["PromptTemplateInvocation"] | components["schemas"]["PromptsFromFileInvocation"] | components["schemas"]["QwenImageDenoiseInvocation"] | components["schemas"]["QwenImageImageToLatentsInvocation"] | components["schemas"]["QwenImageLatentsToImageInvocation"] | components["schemas"]["QwenImageLoRACollectionLoader"] | components["schemas"]["QwenImageLoRALoaderInvocation"] | components["schemas"]["QwenImageModelLoaderInvocation"] | components["schemas"]["QwenImageTextEncoderInvocation"] | components["schemas"]["RandomFloatInvocation"] | components["schemas"]["RandomIntInvocation"] | components["schemas"]["RandomRangeInvocation"] | components["schemas"]["RangeInvocation"] | components["schemas"]["RangeOfSizeInvocation"] | components["schemas"]["RectangleMaskInvocation"] | components["schemas"]["ResizeLatentsInvocation"] | components["schemas"]["RoundInvocation"] | components["schemas"]["SD3DenoiseInvocation"] | components["schemas"]["SD3ImageToLatentsInvocation"] | components["schemas"]["SD3LatentsToImageInvocation"] | components["schemas"]["SDXLCompelPromptInvocation"] | components["schemas"]["SDXLLoRACollectionLoader"] | components["schemas"]["SDXLLoRALoaderInvocation"] | components["schemas"]["SDXLModelLoaderInvocation"] | components["schemas"]["SDXLRefinerCompelPromptInvocation"] | components["schemas"]["SDXLRefinerModelLoaderInvocation"] | components["schemas"]["SaveImageInvocation"] | 
components["schemas"]["SaveImageToFileInvocation"] | components["schemas"]["ScaleLatentsInvocation"] | components["schemas"]["SchedulerInvocation"] | components["schemas"]["Sd3ModelLoaderInvocation"] | components["schemas"]["Sd3TextEncoderInvocation"] | components["schemas"]["SeamlessModeInvocation"] | components["schemas"]["SeedreamImageGenerationInvocation"] | components["schemas"]["SegmentAnythingInvocation"] | components["schemas"]["ShowImageInvocation"] | components["schemas"]["SpandrelImageToImageAutoscaleInvocation"] | components["schemas"]["SpandrelImageToImageInvocation"] | components["schemas"]["StringBatchInvocation"] | components["schemas"]["StringCollectionInvocation"] | components["schemas"]["StringGenerator"] | components["schemas"]["StringInvocation"] | components["schemas"]["StringJoinInvocation"] | components["schemas"]["StringJoinThreeInvocation"] | components["schemas"]["StringReplaceInvocation"] | components["schemas"]["StringSplitInvocation"] | components["schemas"]["StringSplitNegInvocation"] | components["schemas"]["SubtractInvocation"] | components["schemas"]["T2IAdapterInvocation"] | components["schemas"]["TextLLMInvocation"] | components["schemas"]["TileToPropertiesInvocation"] | components["schemas"]["TiledMultiDiffusionDenoiseLatents"] | components["schemas"]["UnsharpMaskInvocation"] | components["schemas"]["VAELoaderInvocation"] | components["schemas"]["ZImageControlInvocation"] | components["schemas"]["ZImageDenoiseInvocation"] | components["schemas"]["ZImageDenoiseMetaInvocation"] | components["schemas"]["ZImageImageToLatentsInvocation"] | components["schemas"]["ZImageLatentsToImageInvocation"] | components["schemas"]["ZImageLoRACollectionLoader"] | components["schemas"]["ZImageLoRALoaderInvocation"] | components["schemas"]["ZImageModelLoaderInvocation"] | components["schemas"]["ZImageSeedVarianceEnhancerInvocation"] | components["schemas"]["ZImageTextEncoderInvocation"]; + invocation: components["schemas"]["AddInvocation"] | 
components["schemas"]["AlibabaCloudImageGenerationInvocation"] | components["schemas"]["AlphaMaskToTensorInvocation"] | components["schemas"]["AnimaDenoiseInvocation"] | components["schemas"]["AnimaImageToLatentsInvocation"] | components["schemas"]["AnimaLatentsToImageInvocation"] | components["schemas"]["AnimaLoRACollectionLoader"] | components["schemas"]["AnimaLoRALoaderInvocation"] | components["schemas"]["AnimaModelLoaderInvocation"] | components["schemas"]["AnimaTextEncoderInvocation"] | components["schemas"]["ApplyMaskTensorToImageInvocation"] | components["schemas"]["ApplyMaskToImageInvocation"] | components["schemas"]["BlankImageInvocation"] | components["schemas"]["BlendLatentsInvocation"] | components["schemas"]["BooleanCollectionInvocation"] | components["schemas"]["BooleanInvocation"] | components["schemas"]["BoundingBoxInvocation"] | components["schemas"]["CLIPSkipInvocation"] | components["schemas"]["CV2InfillInvocation"] | components["schemas"]["CalculateImageTilesEvenSplitInvocation"] | components["schemas"]["CalculateImageTilesInvocation"] | components["schemas"]["CalculateImageTilesMinimumOverlapInvocation"] | components["schemas"]["CannyEdgeDetectionInvocation"] | components["schemas"]["CanvasOutputInvocation"] | components["schemas"]["CanvasPasteBackInvocation"] | components["schemas"]["CanvasV2MaskAndCropInvocation"] | components["schemas"]["CenterPadCropInvocation"] | components["schemas"]["CogView4DenoiseInvocation"] | components["schemas"]["CogView4ImageToLatentsInvocation"] | components["schemas"]["CogView4LatentsToImageInvocation"] | components["schemas"]["CogView4ModelLoaderInvocation"] | components["schemas"]["CogView4TextEncoderInvocation"] | components["schemas"]["CollectInvocation"] | components["schemas"]["ColorCorrectInvocation"] | components["schemas"]["ColorInvocation"] | components["schemas"]["ColorMapInvocation"] | components["schemas"]["CompelInvocation"] | components["schemas"]["ConditioningCollectionInvocation"] | 
components["schemas"]["ConditioningInvocation"] | components["schemas"]["ContentShuffleInvocation"] | components["schemas"]["ControlNetInvocation"] | components["schemas"]["CoreMetadataInvocation"] | components["schemas"]["CreateDenoiseMaskInvocation"] | components["schemas"]["CreateGradientMaskInvocation"] | components["schemas"]["CropImageToBoundingBoxInvocation"] | components["schemas"]["CropLatentsCoreInvocation"] | components["schemas"]["CvInpaintInvocation"] | components["schemas"]["DWOpenposeDetectionInvocation"] | components["schemas"]["DecodeInvisibleWatermarkInvocation"] | components["schemas"]["DenoiseLatentsInvocation"] | components["schemas"]["DenoiseLatentsMetaInvocation"] | components["schemas"]["DepthAnythingDepthEstimationInvocation"] | components["schemas"]["DivideInvocation"] | components["schemas"]["DynamicPromptInvocation"] | components["schemas"]["ESRGANInvocation"] | components["schemas"]["ExpandMaskWithFadeInvocation"] | components["schemas"]["FLUXLoRACollectionLoader"] | components["schemas"]["FaceIdentifierInvocation"] | components["schemas"]["FaceMaskInvocation"] | components["schemas"]["FaceOffInvocation"] | components["schemas"]["FloatBatchInvocation"] | components["schemas"]["FloatCollectionInvocation"] | components["schemas"]["FloatGenerator"] | components["schemas"]["FloatInvocation"] | components["schemas"]["FloatLinearRangeInvocation"] | components["schemas"]["FloatMathInvocation"] | components["schemas"]["FloatToIntegerInvocation"] | components["schemas"]["Flux2DenoiseInvocation"] | components["schemas"]["Flux2KleinLoRACollectionLoader"] | components["schemas"]["Flux2KleinLoRALoaderInvocation"] | components["schemas"]["Flux2KleinModelLoaderInvocation"] | components["schemas"]["Flux2KleinTextEncoderInvocation"] | components["schemas"]["Flux2PiDDecodeInvocation"] | components["schemas"]["Flux2VaeDecodeInvocation"] | components["schemas"]["Flux2VaeEncodeInvocation"] | components["schemas"]["FluxControlLoRALoaderInvocation"] | 
components["schemas"]["FluxControlNetInvocation"] | components["schemas"]["FluxDenoiseInvocation"] | components["schemas"]["FluxDenoiseLatentsMetaInvocation"] | components["schemas"]["FluxFillInvocation"] | components["schemas"]["FluxIPAdapterInvocation"] | components["schemas"]["FluxKontextConcatenateImagesInvocation"] | components["schemas"]["FluxKontextInvocation"] | components["schemas"]["FluxLoRALoaderInvocation"] | components["schemas"]["FluxModelLoaderInvocation"] | components["schemas"]["FluxPiDDecodeInvocation"] | components["schemas"]["FluxReduxInvocation"] | components["schemas"]["FluxTextEncoderInvocation"] | components["schemas"]["FluxVaeDecodeInvocation"] | components["schemas"]["FluxVaeEncodeInvocation"] | components["schemas"]["FreeUInvocation"] | components["schemas"]["GeminiImageGenerationInvocation"] | components["schemas"]["Gemma2EncoderLoaderInvocation"] | components["schemas"]["GetMaskBoundingBoxInvocation"] | components["schemas"]["GroundingDinoInvocation"] | components["schemas"]["HEDEdgeDetectionInvocation"] | components["schemas"]["HeuristicResizeInvocation"] | components["schemas"]["IPAdapterInvocation"] | components["schemas"]["IdealSizeInvocation"] | components["schemas"]["IfInvocation"] | components["schemas"]["ImageBatchInvocation"] | components["schemas"]["ImageBlurInvocation"] | components["schemas"]["ImageChannelInvocation"] | components["schemas"]["ImageChannelMultiplyInvocation"] | components["schemas"]["ImageChannelOffsetInvocation"] | components["schemas"]["ImageCollectionInvocation"] | components["schemas"]["ImageConvertInvocation"] | components["schemas"]["ImageCropInvocation"] | components["schemas"]["ImageGenerator"] | components["schemas"]["ImageHueAdjustmentInvocation"] | components["schemas"]["ImageInverseLerpInvocation"] | components["schemas"]["ImageInvocation"] | components["schemas"]["ImageLerpInvocation"] | components["schemas"]["ImageMaskToTensorInvocation"] | components["schemas"]["ImageMultiplyInvocation"] | 
components["schemas"]["ImageNSFWBlurInvocation"] | components["schemas"]["ImageNoiseInvocation"] | components["schemas"]["ImagePanelLayoutInvocation"] | components["schemas"]["ImagePasteInvocation"] | components["schemas"]["ImageResizeInvocation"] | components["schemas"]["ImageScaleInvocation"] | components["schemas"]["ImageToLatentsInvocation"] | components["schemas"]["ImageWatermarkInvocation"] | components["schemas"]["InfillColorInvocation"] | components["schemas"]["InfillPatchMatchInvocation"] | components["schemas"]["InfillTileInvocation"] | components["schemas"]["IntegerBatchInvocation"] | components["schemas"]["IntegerCollectionInvocation"] | components["schemas"]["IntegerGenerator"] | components["schemas"]["IntegerInvocation"] | components["schemas"]["IntegerMathInvocation"] | components["schemas"]["InvertTensorMaskInvocation"] | components["schemas"]["InvokeAdjustImageHuePlusInvocation"] | components["schemas"]["InvokeEquivalentAchromaticLightnessInvocation"] | components["schemas"]["InvokeImageBlendInvocation"] | components["schemas"]["InvokeImageCompositorInvocation"] | components["schemas"]["InvokeImageDilateOrErodeInvocation"] | components["schemas"]["InvokeImageEnhanceInvocation"] | components["schemas"]["InvokeImageValueThresholdsInvocation"] | components["schemas"]["IterateInvocation"] | components["schemas"]["LaMaInfillInvocation"] | components["schemas"]["LatentsCollectionInvocation"] | components["schemas"]["LatentsInvocation"] | components["schemas"]["LatentsToImageInvocation"] | components["schemas"]["LineartAnimeEdgeDetectionInvocation"] | components["schemas"]["LineartEdgeDetectionInvocation"] | components["schemas"]["LlavaOnevisionVllmInvocation"] | components["schemas"]["LoRACollectionLoader"] | components["schemas"]["LoRALoaderInvocation"] | components["schemas"]["LoRASelectorInvocation"] | components["schemas"]["MLSDDetectionInvocation"] | components["schemas"]["MainModelLoaderInvocation"] | components["schemas"]["MaskCombineInvocation"] 
| components["schemas"]["MaskEdgeInvocation"] | components["schemas"]["MaskFromAlphaInvocation"] | components["schemas"]["MaskFromIDInvocation"] | components["schemas"]["MaskTensorToImageInvocation"] | components["schemas"]["MediaPipeFaceDetectionInvocation"] | components["schemas"]["MergeMetadataInvocation"] | components["schemas"]["MergeTilesToImageInvocation"] | components["schemas"]["MetadataFieldExtractorInvocation"] | components["schemas"]["MetadataFromImageInvocation"] | components["schemas"]["MetadataInvocation"] | components["schemas"]["MetadataItemInvocation"] | components["schemas"]["MetadataItemLinkedInvocation"] | components["schemas"]["MetadataToBoolCollectionInvocation"] | components["schemas"]["MetadataToBoolInvocation"] | components["schemas"]["MetadataToControlnetsInvocation"] | components["schemas"]["MetadataToFloatCollectionInvocation"] | components["schemas"]["MetadataToFloatInvocation"] | components["schemas"]["MetadataToIPAdaptersInvocation"] | components["schemas"]["MetadataToIntegerCollectionInvocation"] | components["schemas"]["MetadataToIntegerInvocation"] | components["schemas"]["MetadataToLorasCollectionInvocation"] | components["schemas"]["MetadataToLorasInvocation"] | components["schemas"]["MetadataToModelInvocation"] | components["schemas"]["MetadataToSDXLLorasInvocation"] | components["schemas"]["MetadataToSDXLModelInvocation"] | components["schemas"]["MetadataToSchedulerInvocation"] | components["schemas"]["MetadataToStringCollectionInvocation"] | components["schemas"]["MetadataToStringInvocation"] | components["schemas"]["MetadataToT2IAdaptersInvocation"] | components["schemas"]["MetadataToVAEInvocation"] | components["schemas"]["ModelIdentifierInvocation"] | components["schemas"]["MultiplyInvocation"] | components["schemas"]["NoiseInvocation"] | components["schemas"]["NormalMapInvocation"] | components["schemas"]["OklabUnsharpMaskInvocation"] | components["schemas"]["OklchImageHueAdjustmentInvocation"] | 
components["schemas"]["OpenAIImageGenerationInvocation"] | components["schemas"]["PBRMapsInvocation"] | components["schemas"]["PairTileImageInvocation"] | components["schemas"]["PasteImageIntoBoundingBoxInvocation"] | components["schemas"]["PiDDecoderLoaderInvocation"] | components["schemas"]["PiDUpscaleInvocation"] | components["schemas"]["PiDiNetEdgeDetectionInvocation"] | components["schemas"]["PromptTemplateInvocation"] | components["schemas"]["PromptsFromFileInvocation"] | components["schemas"]["QwenImageDenoiseInvocation"] | components["schemas"]["QwenImageImageToLatentsInvocation"] | components["schemas"]["QwenImageLatentsToImageInvocation"] | components["schemas"]["QwenImageLoRACollectionLoader"] | components["schemas"]["QwenImageLoRALoaderInvocation"] | components["schemas"]["QwenImageModelLoaderInvocation"] | components["schemas"]["QwenImagePiDDecodeInvocation"] | components["schemas"]["QwenImageTextEncoderInvocation"] | components["schemas"]["RandomFloatInvocation"] | components["schemas"]["RandomIntInvocation"] | components["schemas"]["RandomRangeInvocation"] | components["schemas"]["RangeInvocation"] | components["schemas"]["RangeOfSizeInvocation"] | components["schemas"]["RectangleMaskInvocation"] | components["schemas"]["ResizeLatentsInvocation"] | components["schemas"]["RoundInvocation"] | components["schemas"]["SD3DenoiseInvocation"] | components["schemas"]["SD3ImageToLatentsInvocation"] | components["schemas"]["SD3LatentsToImageInvocation"] | components["schemas"]["SD3PiDDecodeInvocation"] | components["schemas"]["SDXLCompelPromptInvocation"] | components["schemas"]["SDXLLoRACollectionLoader"] | components["schemas"]["SDXLLoRALoaderInvocation"] | components["schemas"]["SDXLModelLoaderInvocation"] | components["schemas"]["SDXLPiDDecodeInvocation"] | components["schemas"]["SDXLRefinerCompelPromptInvocation"] | components["schemas"]["SDXLRefinerModelLoaderInvocation"] | components["schemas"]["SaveImageInvocation"] | 
components["schemas"]["SaveImageToFileInvocation"] | components["schemas"]["ScaleLatentsInvocation"] | components["schemas"]["SchedulerInvocation"] | components["schemas"]["Sd3ModelLoaderInvocation"] | components["schemas"]["Sd3TextEncoderInvocation"] | components["schemas"]["SeamlessModeInvocation"] | components["schemas"]["SeedreamImageGenerationInvocation"] | components["schemas"]["SegmentAnythingInvocation"] | components["schemas"]["ShowImageInvocation"] | components["schemas"]["SpandrelImageToImageAutoscaleInvocation"] | components["schemas"]["SpandrelImageToImageInvocation"] | components["schemas"]["StringBatchInvocation"] | components["schemas"]["StringCollectionInvocation"] | components["schemas"]["StringGenerator"] | components["schemas"]["StringInvocation"] | components["schemas"]["StringJoinInvocation"] | components["schemas"]["StringJoinThreeInvocation"] | components["schemas"]["StringReplaceInvocation"] | components["schemas"]["StringSplitInvocation"] | components["schemas"]["StringSplitNegInvocation"] | components["schemas"]["SubtractInvocation"] | components["schemas"]["T2IAdapterInvocation"] | components["schemas"]["TextLLMInvocation"] | components["schemas"]["TileToPropertiesInvocation"] | components["schemas"]["TiledMultiDiffusionDenoiseLatents"] | components["schemas"]["UnsharpMaskInvocation"] | components["schemas"]["VAELoaderInvocation"] | components["schemas"]["ZImageControlInvocation"] | components["schemas"]["ZImageDenoiseInvocation"] | components["schemas"]["ZImageDenoiseMetaInvocation"] | components["schemas"]["ZImageImageToLatentsInvocation"] | components["schemas"]["ZImageLatentsToImageInvocation"] | components["schemas"]["ZImageLoRACollectionLoader"] | components["schemas"]["ZImageLoRALoaderInvocation"] | components["schemas"]["ZImageModelLoaderInvocation"] | components["schemas"]["ZImagePiDDecodeInvocation"] | components["schemas"]["ZImageSeedVarianceEnhancerInvocation"] | components["schemas"]["ZImageTextEncoderInvocation"]; /** * 
Invocation Source Id * @description The ID of the prepared invocation's source node @@ -16251,7 +16572,7 @@ export type components = { * Invocation * @description The ID of the invocation */ - invocation: components["schemas"]["AddInvocation"] | components["schemas"]["AlibabaCloudImageGenerationInvocation"] | components["schemas"]["AlphaMaskToTensorInvocation"] | components["schemas"]["AnimaDenoiseInvocation"] | components["schemas"]["AnimaImageToLatentsInvocation"] | components["schemas"]["AnimaLatentsToImageInvocation"] | components["schemas"]["AnimaLoRACollectionLoader"] | components["schemas"]["AnimaLoRALoaderInvocation"] | components["schemas"]["AnimaModelLoaderInvocation"] | components["schemas"]["AnimaTextEncoderInvocation"] | components["schemas"]["ApplyMaskTensorToImageInvocation"] | components["schemas"]["ApplyMaskToImageInvocation"] | components["schemas"]["BlankImageInvocation"] | components["schemas"]["BlendLatentsInvocation"] | components["schemas"]["BooleanCollectionInvocation"] | components["schemas"]["BooleanInvocation"] | components["schemas"]["BoundingBoxInvocation"] | components["schemas"]["CLIPSkipInvocation"] | components["schemas"]["CV2InfillInvocation"] | components["schemas"]["CalculateImageTilesEvenSplitInvocation"] | components["schemas"]["CalculateImageTilesInvocation"] | components["schemas"]["CalculateImageTilesMinimumOverlapInvocation"] | components["schemas"]["CannyEdgeDetectionInvocation"] | components["schemas"]["CanvasOutputInvocation"] | components["schemas"]["CanvasPasteBackInvocation"] | components["schemas"]["CanvasV2MaskAndCropInvocation"] | components["schemas"]["CenterPadCropInvocation"] | components["schemas"]["CogView4DenoiseInvocation"] | components["schemas"]["CogView4ImageToLatentsInvocation"] | components["schemas"]["CogView4LatentsToImageInvocation"] | components["schemas"]["CogView4ModelLoaderInvocation"] | components["schemas"]["CogView4TextEncoderInvocation"] | components["schemas"]["CollectInvocation"] | 
components["schemas"]["ColorCorrectInvocation"] | components["schemas"]["ColorInvocation"] | components["schemas"]["ColorMapInvocation"] | components["schemas"]["CompelInvocation"] | components["schemas"]["ConditioningCollectionInvocation"] | components["schemas"]["ConditioningInvocation"] | components["schemas"]["ContentShuffleInvocation"] | components["schemas"]["ControlNetInvocation"] | components["schemas"]["CoreMetadataInvocation"] | components["schemas"]["CreateDenoiseMaskInvocation"] | components["schemas"]["CreateGradientMaskInvocation"] | components["schemas"]["CropImageToBoundingBoxInvocation"] | components["schemas"]["CropLatentsCoreInvocation"] | components["schemas"]["CvInpaintInvocation"] | components["schemas"]["DWOpenposeDetectionInvocation"] | components["schemas"]["DecodeInvisibleWatermarkInvocation"] | components["schemas"]["DenoiseLatentsInvocation"] | components["schemas"]["DenoiseLatentsMetaInvocation"] | components["schemas"]["DepthAnythingDepthEstimationInvocation"] | components["schemas"]["DivideInvocation"] | components["schemas"]["DynamicPromptInvocation"] | components["schemas"]["ESRGANInvocation"] | components["schemas"]["ExpandMaskWithFadeInvocation"] | components["schemas"]["FLUXLoRACollectionLoader"] | components["schemas"]["FaceIdentifierInvocation"] | components["schemas"]["FaceMaskInvocation"] | components["schemas"]["FaceOffInvocation"] | components["schemas"]["FloatBatchInvocation"] | components["schemas"]["FloatCollectionInvocation"] | components["schemas"]["FloatGenerator"] | components["schemas"]["FloatInvocation"] | components["schemas"]["FloatLinearRangeInvocation"] | components["schemas"]["FloatMathInvocation"] | components["schemas"]["FloatToIntegerInvocation"] | components["schemas"]["Flux2DenoiseInvocation"] | components["schemas"]["Flux2KleinLoRACollectionLoader"] | components["schemas"]["Flux2KleinLoRALoaderInvocation"] | components["schemas"]["Flux2KleinModelLoaderInvocation"] | 
components["schemas"]["Flux2KleinTextEncoderInvocation"] | components["schemas"]["Flux2VaeDecodeInvocation"] | components["schemas"]["Flux2VaeEncodeInvocation"] | components["schemas"]["FluxControlLoRALoaderInvocation"] | components["schemas"]["FluxControlNetInvocation"] | components["schemas"]["FluxDenoiseInvocation"] | components["schemas"]["FluxDenoiseLatentsMetaInvocation"] | components["schemas"]["FluxFillInvocation"] | components["schemas"]["FluxIPAdapterInvocation"] | components["schemas"]["FluxKontextConcatenateImagesInvocation"] | components["schemas"]["FluxKontextInvocation"] | components["schemas"]["FluxLoRALoaderInvocation"] | components["schemas"]["FluxModelLoaderInvocation"] | components["schemas"]["FluxReduxInvocation"] | components["schemas"]["FluxTextEncoderInvocation"] | components["schemas"]["FluxVaeDecodeInvocation"] | components["schemas"]["FluxVaeEncodeInvocation"] | components["schemas"]["FreeUInvocation"] | components["schemas"]["GeminiImageGenerationInvocation"] | components["schemas"]["GetMaskBoundingBoxInvocation"] | components["schemas"]["GroundingDinoInvocation"] | components["schemas"]["HEDEdgeDetectionInvocation"] | components["schemas"]["HeuristicResizeInvocation"] | components["schemas"]["IPAdapterInvocation"] | components["schemas"]["IdealSizeInvocation"] | components["schemas"]["IfInvocation"] | components["schemas"]["ImageBatchInvocation"] | components["schemas"]["ImageBlurInvocation"] | components["schemas"]["ImageChannelInvocation"] | components["schemas"]["ImageChannelMultiplyInvocation"] | components["schemas"]["ImageChannelOffsetInvocation"] | components["schemas"]["ImageCollectionInvocation"] | components["schemas"]["ImageConvertInvocation"] | components["schemas"]["ImageCropInvocation"] | components["schemas"]["ImageGenerator"] | components["schemas"]["ImageHueAdjustmentInvocation"] | components["schemas"]["ImageInverseLerpInvocation"] | components["schemas"]["ImageInvocation"] | 
components["schemas"]["ImageLerpInvocation"] | components["schemas"]["ImageMaskToTensorInvocation"] | components["schemas"]["ImageMultiplyInvocation"] | components["schemas"]["ImageNSFWBlurInvocation"] | components["schemas"]["ImageNoiseInvocation"] | components["schemas"]["ImagePanelLayoutInvocation"] | components["schemas"]["ImagePasteInvocation"] | components["schemas"]["ImageResizeInvocation"] | components["schemas"]["ImageScaleInvocation"] | components["schemas"]["ImageToLatentsInvocation"] | components["schemas"]["ImageWatermarkInvocation"] | components["schemas"]["InfillColorInvocation"] | components["schemas"]["InfillPatchMatchInvocation"] | components["schemas"]["InfillTileInvocation"] | components["schemas"]["IntegerBatchInvocation"] | components["schemas"]["IntegerCollectionInvocation"] | components["schemas"]["IntegerGenerator"] | components["schemas"]["IntegerInvocation"] | components["schemas"]["IntegerMathInvocation"] | components["schemas"]["InvertTensorMaskInvocation"] | components["schemas"]["InvokeAdjustImageHuePlusInvocation"] | components["schemas"]["InvokeEquivalentAchromaticLightnessInvocation"] | components["schemas"]["InvokeImageBlendInvocation"] | components["schemas"]["InvokeImageCompositorInvocation"] | components["schemas"]["InvokeImageDilateOrErodeInvocation"] | components["schemas"]["InvokeImageEnhanceInvocation"] | components["schemas"]["InvokeImageValueThresholdsInvocation"] | components["schemas"]["IterateInvocation"] | components["schemas"]["LaMaInfillInvocation"] | components["schemas"]["LatentsCollectionInvocation"] | components["schemas"]["LatentsInvocation"] | components["schemas"]["LatentsToImageInvocation"] | components["schemas"]["LineartAnimeEdgeDetectionInvocation"] | components["schemas"]["LineartEdgeDetectionInvocation"] | components["schemas"]["LlavaOnevisionVllmInvocation"] | components["schemas"]["LoRACollectionLoader"] | components["schemas"]["LoRALoaderInvocation"] | components["schemas"]["LoRASelectorInvocation"] 
| components["schemas"]["MLSDDetectionInvocation"] | components["schemas"]["MainModelLoaderInvocation"] | components["schemas"]["MaskCombineInvocation"] | components["schemas"]["MaskEdgeInvocation"] | components["schemas"]["MaskFromAlphaInvocation"] | components["schemas"]["MaskFromIDInvocation"] | components["schemas"]["MaskTensorToImageInvocation"] | components["schemas"]["MediaPipeFaceDetectionInvocation"] | components["schemas"]["MergeMetadataInvocation"] | components["schemas"]["MergeTilesToImageInvocation"] | components["schemas"]["MetadataFieldExtractorInvocation"] | components["schemas"]["MetadataFromImageInvocation"] | components["schemas"]["MetadataInvocation"] | components["schemas"]["MetadataItemInvocation"] | components["schemas"]["MetadataItemLinkedInvocation"] | components["schemas"]["MetadataToBoolCollectionInvocation"] | components["schemas"]["MetadataToBoolInvocation"] | components["schemas"]["MetadataToControlnetsInvocation"] | components["schemas"]["MetadataToFloatCollectionInvocation"] | components["schemas"]["MetadataToFloatInvocation"] | components["schemas"]["MetadataToIPAdaptersInvocation"] | components["schemas"]["MetadataToIntegerCollectionInvocation"] | components["schemas"]["MetadataToIntegerInvocation"] | components["schemas"]["MetadataToLorasCollectionInvocation"] | components["schemas"]["MetadataToLorasInvocation"] | components["schemas"]["MetadataToModelInvocation"] | components["schemas"]["MetadataToSDXLLorasInvocation"] | components["schemas"]["MetadataToSDXLModelInvocation"] | components["schemas"]["MetadataToSchedulerInvocation"] | components["schemas"]["MetadataToStringCollectionInvocation"] | components["schemas"]["MetadataToStringInvocation"] | components["schemas"]["MetadataToT2IAdaptersInvocation"] | components["schemas"]["MetadataToVAEInvocation"] | components["schemas"]["ModelIdentifierInvocation"] | components["schemas"]["MultiplyInvocation"] | components["schemas"]["NoiseInvocation"] | 
components["schemas"]["NormalMapInvocation"] | components["schemas"]["OklabUnsharpMaskInvocation"] | components["schemas"]["OklchImageHueAdjustmentInvocation"] | components["schemas"]["OpenAIImageGenerationInvocation"] | components["schemas"]["PBRMapsInvocation"] | components["schemas"]["PairTileImageInvocation"] | components["schemas"]["PasteImageIntoBoundingBoxInvocation"] | components["schemas"]["PiDiNetEdgeDetectionInvocation"] | components["schemas"]["PromptTemplateInvocation"] | components["schemas"]["PromptsFromFileInvocation"] | components["schemas"]["QwenImageDenoiseInvocation"] | components["schemas"]["QwenImageImageToLatentsInvocation"] | components["schemas"]["QwenImageLatentsToImageInvocation"] | components["schemas"]["QwenImageLoRACollectionLoader"] | components["schemas"]["QwenImageLoRALoaderInvocation"] | components["schemas"]["QwenImageModelLoaderInvocation"] | components["schemas"]["QwenImageTextEncoderInvocation"] | components["schemas"]["RandomFloatInvocation"] | components["schemas"]["RandomIntInvocation"] | components["schemas"]["RandomRangeInvocation"] | components["schemas"]["RangeInvocation"] | components["schemas"]["RangeOfSizeInvocation"] | components["schemas"]["RectangleMaskInvocation"] | components["schemas"]["ResizeLatentsInvocation"] | components["schemas"]["RoundInvocation"] | components["schemas"]["SD3DenoiseInvocation"] | components["schemas"]["SD3ImageToLatentsInvocation"] | components["schemas"]["SD3LatentsToImageInvocation"] | components["schemas"]["SDXLCompelPromptInvocation"] | components["schemas"]["SDXLLoRACollectionLoader"] | components["schemas"]["SDXLLoRALoaderInvocation"] | components["schemas"]["SDXLModelLoaderInvocation"] | components["schemas"]["SDXLRefinerCompelPromptInvocation"] | components["schemas"]["SDXLRefinerModelLoaderInvocation"] | components["schemas"]["SaveImageInvocation"] | components["schemas"]["SaveImageToFileInvocation"] | components["schemas"]["ScaleLatentsInvocation"] | 
components["schemas"]["SchedulerInvocation"] | components["schemas"]["Sd3ModelLoaderInvocation"] | components["schemas"]["Sd3TextEncoderInvocation"] | components["schemas"]["SeamlessModeInvocation"] | components["schemas"]["SeedreamImageGenerationInvocation"] | components["schemas"]["SegmentAnythingInvocation"] | components["schemas"]["ShowImageInvocation"] | components["schemas"]["SpandrelImageToImageAutoscaleInvocation"] | components["schemas"]["SpandrelImageToImageInvocation"] | components["schemas"]["StringBatchInvocation"] | components["schemas"]["StringCollectionInvocation"] | components["schemas"]["StringGenerator"] | components["schemas"]["StringInvocation"] | components["schemas"]["StringJoinInvocation"] | components["schemas"]["StringJoinThreeInvocation"] | components["schemas"]["StringReplaceInvocation"] | components["schemas"]["StringSplitInvocation"] | components["schemas"]["StringSplitNegInvocation"] | components["schemas"]["SubtractInvocation"] | components["schemas"]["T2IAdapterInvocation"] | components["schemas"]["TextLLMInvocation"] | components["schemas"]["TileToPropertiesInvocation"] | components["schemas"]["TiledMultiDiffusionDenoiseLatents"] | components["schemas"]["UnsharpMaskInvocation"] | components["schemas"]["VAELoaderInvocation"] | components["schemas"]["ZImageControlInvocation"] | components["schemas"]["ZImageDenoiseInvocation"] | components["schemas"]["ZImageDenoiseMetaInvocation"] | components["schemas"]["ZImageImageToLatentsInvocation"] | components["schemas"]["ZImageLatentsToImageInvocation"] | components["schemas"]["ZImageLoRACollectionLoader"] | components["schemas"]["ZImageLoRALoaderInvocation"] | components["schemas"]["ZImageModelLoaderInvocation"] | components["schemas"]["ZImageSeedVarianceEnhancerInvocation"] | components["schemas"]["ZImageTextEncoderInvocation"]; + invocation: components["schemas"]["AddInvocation"] | components["schemas"]["AlibabaCloudImageGenerationInvocation"] | 
components["schemas"]["AlphaMaskToTensorInvocation"] | components["schemas"]["AnimaDenoiseInvocation"] | components["schemas"]["AnimaImageToLatentsInvocation"] | components["schemas"]["AnimaLatentsToImageInvocation"] | components["schemas"]["AnimaLoRACollectionLoader"] | components["schemas"]["AnimaLoRALoaderInvocation"] | components["schemas"]["AnimaModelLoaderInvocation"] | components["schemas"]["AnimaTextEncoderInvocation"] | components["schemas"]["ApplyMaskTensorToImageInvocation"] | components["schemas"]["ApplyMaskToImageInvocation"] | components["schemas"]["BlankImageInvocation"] | components["schemas"]["BlendLatentsInvocation"] | components["schemas"]["BooleanCollectionInvocation"] | components["schemas"]["BooleanInvocation"] | components["schemas"]["BoundingBoxInvocation"] | components["schemas"]["CLIPSkipInvocation"] | components["schemas"]["CV2InfillInvocation"] | components["schemas"]["CalculateImageTilesEvenSplitInvocation"] | components["schemas"]["CalculateImageTilesInvocation"] | components["schemas"]["CalculateImageTilesMinimumOverlapInvocation"] | components["schemas"]["CannyEdgeDetectionInvocation"] | components["schemas"]["CanvasOutputInvocation"] | components["schemas"]["CanvasPasteBackInvocation"] | components["schemas"]["CanvasV2MaskAndCropInvocation"] | components["schemas"]["CenterPadCropInvocation"] | components["schemas"]["CogView4DenoiseInvocation"] | components["schemas"]["CogView4ImageToLatentsInvocation"] | components["schemas"]["CogView4LatentsToImageInvocation"] | components["schemas"]["CogView4ModelLoaderInvocation"] | components["schemas"]["CogView4TextEncoderInvocation"] | components["schemas"]["CollectInvocation"] | components["schemas"]["ColorCorrectInvocation"] | components["schemas"]["ColorInvocation"] | components["schemas"]["ColorMapInvocation"] | components["schemas"]["CompelInvocation"] | components["schemas"]["ConditioningCollectionInvocation"] | components["schemas"]["ConditioningInvocation"] | 
components["schemas"]["ContentShuffleInvocation"] | components["schemas"]["ControlNetInvocation"] | components["schemas"]["CoreMetadataInvocation"] | components["schemas"]["CreateDenoiseMaskInvocation"] | components["schemas"]["CreateGradientMaskInvocation"] | components["schemas"]["CropImageToBoundingBoxInvocation"] | components["schemas"]["CropLatentsCoreInvocation"] | components["schemas"]["CvInpaintInvocation"] | components["schemas"]["DWOpenposeDetectionInvocation"] | components["schemas"]["DecodeInvisibleWatermarkInvocation"] | components["schemas"]["DenoiseLatentsInvocation"] | components["schemas"]["DenoiseLatentsMetaInvocation"] | components["schemas"]["DepthAnythingDepthEstimationInvocation"] | components["schemas"]["DivideInvocation"] | components["schemas"]["DynamicPromptInvocation"] | components["schemas"]["ESRGANInvocation"] | components["schemas"]["ExpandMaskWithFadeInvocation"] | components["schemas"]["FLUXLoRACollectionLoader"] | components["schemas"]["FaceIdentifierInvocation"] | components["schemas"]["FaceMaskInvocation"] | components["schemas"]["FaceOffInvocation"] | components["schemas"]["FloatBatchInvocation"] | components["schemas"]["FloatCollectionInvocation"] | components["schemas"]["FloatGenerator"] | components["schemas"]["FloatInvocation"] | components["schemas"]["FloatLinearRangeInvocation"] | components["schemas"]["FloatMathInvocation"] | components["schemas"]["FloatToIntegerInvocation"] | components["schemas"]["Flux2DenoiseInvocation"] | components["schemas"]["Flux2KleinLoRACollectionLoader"] | components["schemas"]["Flux2KleinLoRALoaderInvocation"] | components["schemas"]["Flux2KleinModelLoaderInvocation"] | components["schemas"]["Flux2KleinTextEncoderInvocation"] | components["schemas"]["Flux2PiDDecodeInvocation"] | components["schemas"]["Flux2VaeDecodeInvocation"] | components["schemas"]["Flux2VaeEncodeInvocation"] | components["schemas"]["FluxControlLoRALoaderInvocation"] | components["schemas"]["FluxControlNetInvocation"] | 
components["schemas"]["FluxDenoiseInvocation"] | components["schemas"]["FluxDenoiseLatentsMetaInvocation"] | components["schemas"]["FluxFillInvocation"] | components["schemas"]["FluxIPAdapterInvocation"] | components["schemas"]["FluxKontextConcatenateImagesInvocation"] | components["schemas"]["FluxKontextInvocation"] | components["schemas"]["FluxLoRALoaderInvocation"] | components["schemas"]["FluxModelLoaderInvocation"] | components["schemas"]["FluxPiDDecodeInvocation"] | components["schemas"]["FluxReduxInvocation"] | components["schemas"]["FluxTextEncoderInvocation"] | components["schemas"]["FluxVaeDecodeInvocation"] | components["schemas"]["FluxVaeEncodeInvocation"] | components["schemas"]["FreeUInvocation"] | components["schemas"]["GeminiImageGenerationInvocation"] | components["schemas"]["Gemma2EncoderLoaderInvocation"] | components["schemas"]["GetMaskBoundingBoxInvocation"] | components["schemas"]["GroundingDinoInvocation"] | components["schemas"]["HEDEdgeDetectionInvocation"] | components["schemas"]["HeuristicResizeInvocation"] | components["schemas"]["IPAdapterInvocation"] | components["schemas"]["IdealSizeInvocation"] | components["schemas"]["IfInvocation"] | components["schemas"]["ImageBatchInvocation"] | components["schemas"]["ImageBlurInvocation"] | components["schemas"]["ImageChannelInvocation"] | components["schemas"]["ImageChannelMultiplyInvocation"] | components["schemas"]["ImageChannelOffsetInvocation"] | components["schemas"]["ImageCollectionInvocation"] | components["schemas"]["ImageConvertInvocation"] | components["schemas"]["ImageCropInvocation"] | components["schemas"]["ImageGenerator"] | components["schemas"]["ImageHueAdjustmentInvocation"] | components["schemas"]["ImageInverseLerpInvocation"] | components["schemas"]["ImageInvocation"] | components["schemas"]["ImageLerpInvocation"] | components["schemas"]["ImageMaskToTensorInvocation"] | components["schemas"]["ImageMultiplyInvocation"] | components["schemas"]["ImageNSFWBlurInvocation"] | 
components["schemas"]["ImageNoiseInvocation"] | components["schemas"]["ImagePanelLayoutInvocation"] | components["schemas"]["ImagePasteInvocation"] | components["schemas"]["ImageResizeInvocation"] | components["schemas"]["ImageScaleInvocation"] | components["schemas"]["ImageToLatentsInvocation"] | components["schemas"]["ImageWatermarkInvocation"] | components["schemas"]["InfillColorInvocation"] | components["schemas"]["InfillPatchMatchInvocation"] | components["schemas"]["InfillTileInvocation"] | components["schemas"]["IntegerBatchInvocation"] | components["schemas"]["IntegerCollectionInvocation"] | components["schemas"]["IntegerGenerator"] | components["schemas"]["IntegerInvocation"] | components["schemas"]["IntegerMathInvocation"] | components["schemas"]["InvertTensorMaskInvocation"] | components["schemas"]["InvokeAdjustImageHuePlusInvocation"] | components["schemas"]["InvokeEquivalentAchromaticLightnessInvocation"] | components["schemas"]["InvokeImageBlendInvocation"] | components["schemas"]["InvokeImageCompositorInvocation"] | components["schemas"]["InvokeImageDilateOrErodeInvocation"] | components["schemas"]["InvokeImageEnhanceInvocation"] | components["schemas"]["InvokeImageValueThresholdsInvocation"] | components["schemas"]["IterateInvocation"] | components["schemas"]["LaMaInfillInvocation"] | components["schemas"]["LatentsCollectionInvocation"] | components["schemas"]["LatentsInvocation"] | components["schemas"]["LatentsToImageInvocation"] | components["schemas"]["LineartAnimeEdgeDetectionInvocation"] | components["schemas"]["LineartEdgeDetectionInvocation"] | components["schemas"]["LlavaOnevisionVllmInvocation"] | components["schemas"]["LoRACollectionLoader"] | components["schemas"]["LoRALoaderInvocation"] | components["schemas"]["LoRASelectorInvocation"] | components["schemas"]["MLSDDetectionInvocation"] | components["schemas"]["MainModelLoaderInvocation"] | components["schemas"]["MaskCombineInvocation"] | components["schemas"]["MaskEdgeInvocation"] | 
components["schemas"]["MaskFromAlphaInvocation"] | components["schemas"]["MaskFromIDInvocation"] | components["schemas"]["MaskTensorToImageInvocation"] | components["schemas"]["MediaPipeFaceDetectionInvocation"] | components["schemas"]["MergeMetadataInvocation"] | components["schemas"]["MergeTilesToImageInvocation"] | components["schemas"]["MetadataFieldExtractorInvocation"] | components["schemas"]["MetadataFromImageInvocation"] | components["schemas"]["MetadataInvocation"] | components["schemas"]["MetadataItemInvocation"] | components["schemas"]["MetadataItemLinkedInvocation"] | components["schemas"]["MetadataToBoolCollectionInvocation"] | components["schemas"]["MetadataToBoolInvocation"] | components["schemas"]["MetadataToControlnetsInvocation"] | components["schemas"]["MetadataToFloatCollectionInvocation"] | components["schemas"]["MetadataToFloatInvocation"] | components["schemas"]["MetadataToIPAdaptersInvocation"] | components["schemas"]["MetadataToIntegerCollectionInvocation"] | components["schemas"]["MetadataToIntegerInvocation"] | components["schemas"]["MetadataToLorasCollectionInvocation"] | components["schemas"]["MetadataToLorasInvocation"] | components["schemas"]["MetadataToModelInvocation"] | components["schemas"]["MetadataToSDXLLorasInvocation"] | components["schemas"]["MetadataToSDXLModelInvocation"] | components["schemas"]["MetadataToSchedulerInvocation"] | components["schemas"]["MetadataToStringCollectionInvocation"] | components["schemas"]["MetadataToStringInvocation"] | components["schemas"]["MetadataToT2IAdaptersInvocation"] | components["schemas"]["MetadataToVAEInvocation"] | components["schemas"]["ModelIdentifierInvocation"] | components["schemas"]["MultiplyInvocation"] | components["schemas"]["NoiseInvocation"] | components["schemas"]["NormalMapInvocation"] | components["schemas"]["OklabUnsharpMaskInvocation"] | components["schemas"]["OklchImageHueAdjustmentInvocation"] | components["schemas"]["OpenAIImageGenerationInvocation"] | 
components["schemas"]["PBRMapsInvocation"] | components["schemas"]["PairTileImageInvocation"] | components["schemas"]["PasteImageIntoBoundingBoxInvocation"] | components["schemas"]["PiDDecoderLoaderInvocation"] | components["schemas"]["PiDUpscaleInvocation"] | components["schemas"]["PiDiNetEdgeDetectionInvocation"] | components["schemas"]["PromptTemplateInvocation"] | components["schemas"]["PromptsFromFileInvocation"] | components["schemas"]["QwenImageDenoiseInvocation"] | components["schemas"]["QwenImageImageToLatentsInvocation"] | components["schemas"]["QwenImageLatentsToImageInvocation"] | components["schemas"]["QwenImageLoRACollectionLoader"] | components["schemas"]["QwenImageLoRALoaderInvocation"] | components["schemas"]["QwenImageModelLoaderInvocation"] | components["schemas"]["QwenImagePiDDecodeInvocation"] | components["schemas"]["QwenImageTextEncoderInvocation"] | components["schemas"]["RandomFloatInvocation"] | components["schemas"]["RandomIntInvocation"] | components["schemas"]["RandomRangeInvocation"] | components["schemas"]["RangeInvocation"] | components["schemas"]["RangeOfSizeInvocation"] | components["schemas"]["RectangleMaskInvocation"] | components["schemas"]["ResizeLatentsInvocation"] | components["schemas"]["RoundInvocation"] | components["schemas"]["SD3DenoiseInvocation"] | components["schemas"]["SD3ImageToLatentsInvocation"] | components["schemas"]["SD3LatentsToImageInvocation"] | components["schemas"]["SD3PiDDecodeInvocation"] | components["schemas"]["SDXLCompelPromptInvocation"] | components["schemas"]["SDXLLoRACollectionLoader"] | components["schemas"]["SDXLLoRALoaderInvocation"] | components["schemas"]["SDXLModelLoaderInvocation"] | components["schemas"]["SDXLPiDDecodeInvocation"] | components["schemas"]["SDXLRefinerCompelPromptInvocation"] | components["schemas"]["SDXLRefinerModelLoaderInvocation"] | components["schemas"]["SaveImageInvocation"] | components["schemas"]["SaveImageToFileInvocation"] | 
components["schemas"]["ScaleLatentsInvocation"] | components["schemas"]["SchedulerInvocation"] | components["schemas"]["Sd3ModelLoaderInvocation"] | components["schemas"]["Sd3TextEncoderInvocation"] | components["schemas"]["SeamlessModeInvocation"] | components["schemas"]["SeedreamImageGenerationInvocation"] | components["schemas"]["SegmentAnythingInvocation"] | components["schemas"]["ShowImageInvocation"] | components["schemas"]["SpandrelImageToImageAutoscaleInvocation"] | components["schemas"]["SpandrelImageToImageInvocation"] | components["schemas"]["StringBatchInvocation"] | components["schemas"]["StringCollectionInvocation"] | components["schemas"]["StringGenerator"] | components["schemas"]["StringInvocation"] | components["schemas"]["StringJoinInvocation"] | components["schemas"]["StringJoinThreeInvocation"] | components["schemas"]["StringReplaceInvocation"] | components["schemas"]["StringSplitInvocation"] | components["schemas"]["StringSplitNegInvocation"] | components["schemas"]["SubtractInvocation"] | components["schemas"]["T2IAdapterInvocation"] | components["schemas"]["TextLLMInvocation"] | components["schemas"]["TileToPropertiesInvocation"] | components["schemas"]["TiledMultiDiffusionDenoiseLatents"] | components["schemas"]["UnsharpMaskInvocation"] | components["schemas"]["VAELoaderInvocation"] | components["schemas"]["ZImageControlInvocation"] | components["schemas"]["ZImageDenoiseInvocation"] | components["schemas"]["ZImageDenoiseMetaInvocation"] | components["schemas"]["ZImageImageToLatentsInvocation"] | components["schemas"]["ZImageLatentsToImageInvocation"] | components["schemas"]["ZImageLoRACollectionLoader"] | components["schemas"]["ZImageLoRALoaderInvocation"] | components["schemas"]["ZImageModelLoaderInvocation"] | components["schemas"]["ZImagePiDDecodeInvocation"] | components["schemas"]["ZImageSeedVarianceEnhancerInvocation"] | components["schemas"]["ZImageTextEncoderInvocation"]; /** * Invocation Source Id * @description The ID of the 
prepared invocation's source node @@ -23465,7 +23786,7 @@ export type components = { * @description Storage format of model. * @enum {string} */ - ModelFormat: "omi" | "diffusers" | "checkpoint" | "lycoris" | "onnx" | "olive" | "embedding_file" | "embedding_folder" | "invokeai" | "t5_encoder" | "qwen3_encoder" | "qwen_vl_encoder" | "bnb_quantized_int8b" | "bnb_quantized_nf4b" | "gguf_quantized" | "external_api" | "unknown"; + ModelFormat: "omi" | "diffusers" | "checkpoint" | "lycoris" | "onnx" | "olive" | "embedding_file" | "embedding_folder" | "invokeai" | "t5_encoder" | "qwen3_encoder" | "qwen_vl_encoder" | "gemma2_encoder" | "bnb_quantized_int8b" | "bnb_quantized_nf4b" | "gguf_quantized" | "external_api" | "unknown"; /** ModelIdentifierField */ ModelIdentifierField: { /** @@ -23602,7 +23923,7 @@ export type components = { * Config * @description The installed model's config */ - config: components["schemas"]["Main_Diffusers_SD1_Config"] | components["schemas"]["Main_Diffusers_SD2_Config"] | components["schemas"]["Main_Diffusers_SDXL_Config"] | components["schemas"]["Main_Diffusers_SDXLRefiner_Config"] | components["schemas"]["Main_Diffusers_SD3_Config"] | components["schemas"]["Main_Diffusers_FLUX_Config"] | components["schemas"]["Main_Diffusers_Flux2_Config"] | components["schemas"]["Main_Diffusers_CogView4_Config"] | components["schemas"]["Main_Diffusers_QwenImage_Config"] | components["schemas"]["Main_Diffusers_ZImage_Config"] | components["schemas"]["Main_Checkpoint_SD1_Config"] | components["schemas"]["Main_Checkpoint_SD2_Config"] | components["schemas"]["Main_Checkpoint_SDXL_Config"] | components["schemas"]["Main_Checkpoint_SDXLRefiner_Config"] | components["schemas"]["Main_Checkpoint_Flux2_Config"] | components["schemas"]["Main_Checkpoint_FLUX_Config"] | components["schemas"]["Main_Checkpoint_QwenImage_Config"] | components["schemas"]["Main_Checkpoint_ZImage_Config"] | components["schemas"]["Main_Checkpoint_Anima_Config"] | 
components["schemas"]["Main_BnBNF4_FLUX_Config"] | components["schemas"]["Main_GGUF_Flux2_Config"] | components["schemas"]["Main_GGUF_FLUX_Config"] | components["schemas"]["Main_GGUF_QwenImage_Config"] | components["schemas"]["Main_GGUF_ZImage_Config"] | components["schemas"]["VAE_Checkpoint_SD1_Config"] | components["schemas"]["VAE_Checkpoint_SD2_Config"] | components["schemas"]["VAE_Checkpoint_SDXL_Config"] | components["schemas"]["VAE_Checkpoint_FLUX_Config"] | components["schemas"]["VAE_Checkpoint_Flux2_Config"] | components["schemas"]["VAE_Checkpoint_QwenImage_Config"] | components["schemas"]["VAE_Checkpoint_Anima_Config"] | components["schemas"]["VAE_Diffusers_SD1_Config"] | components["schemas"]["VAE_Diffusers_SDXL_Config"] | components["schemas"]["VAE_Diffusers_Flux2_Config"] | components["schemas"]["ControlNet_Checkpoint_SD1_Config"] | components["schemas"]["ControlNet_Checkpoint_SD2_Config"] | components["schemas"]["ControlNet_Checkpoint_SDXL_Config"] | components["schemas"]["ControlNet_Checkpoint_FLUX_Config"] | components["schemas"]["ControlNet_Checkpoint_ZImage_Config"] | components["schemas"]["ControlNet_Diffusers_SD1_Config"] | components["schemas"]["ControlNet_Diffusers_SD2_Config"] | components["schemas"]["ControlNet_Diffusers_SDXL_Config"] | components["schemas"]["ControlNet_Diffusers_FLUX_Config"] | components["schemas"]["LoRA_LyCORIS_SD1_Config"] | components["schemas"]["LoRA_LyCORIS_SD2_Config"] | components["schemas"]["LoRA_LyCORIS_SDXL_Config"] | components["schemas"]["LoRA_LyCORIS_Flux2_Config"] | components["schemas"]["LoRA_LyCORIS_FLUX_Config"] | components["schemas"]["LoRA_LyCORIS_ZImage_Config"] | components["schemas"]["LoRA_LyCORIS_QwenImage_Config"] | components["schemas"]["LoRA_LyCORIS_Anima_Config"] | components["schemas"]["LoRA_OMI_SDXL_Config"] | components["schemas"]["LoRA_OMI_FLUX_Config"] | components["schemas"]["LoRA_Diffusers_SD1_Config"] | components["schemas"]["LoRA_Diffusers_SD2_Config"] | 
components["schemas"]["LoRA_Diffusers_SDXL_Config"] | components["schemas"]["LoRA_Diffusers_Flux2_Config"] | components["schemas"]["LoRA_Diffusers_FLUX_Config"] | components["schemas"]["LoRA_Diffusers_ZImage_Config"] | components["schemas"]["ControlLoRA_LyCORIS_FLUX_Config"] | components["schemas"]["T5Encoder_T5Encoder_Config"] | components["schemas"]["T5Encoder_BnBLLMint8_Config"] | components["schemas"]["Qwen3Encoder_Qwen3Encoder_Config"] | components["schemas"]["Qwen3Encoder_Checkpoint_Config"] | components["schemas"]["Qwen3Encoder_GGUF_Config"] | components["schemas"]["QwenVLEncoder_Diffusers_Config"] | components["schemas"]["QwenVLEncoder_Checkpoint_Config"] | components["schemas"]["TI_File_SD1_Config"] | components["schemas"]["TI_File_SD2_Config"] | components["schemas"]["TI_File_SDXL_Config"] | components["schemas"]["TI_Folder_SD1_Config"] | components["schemas"]["TI_Folder_SD2_Config"] | components["schemas"]["TI_Folder_SDXL_Config"] | components["schemas"]["IPAdapter_InvokeAI_SD1_Config"] | components["schemas"]["IPAdapter_InvokeAI_SD2_Config"] | components["schemas"]["IPAdapter_InvokeAI_SDXL_Config"] | components["schemas"]["IPAdapter_Checkpoint_SD1_Config"] | components["schemas"]["IPAdapter_Checkpoint_SD2_Config"] | components["schemas"]["IPAdapter_Checkpoint_SDXL_Config"] | components["schemas"]["IPAdapter_Checkpoint_FLUX_Config"] | components["schemas"]["T2IAdapter_Diffusers_SD1_Config"] | components["schemas"]["T2IAdapter_Diffusers_SDXL_Config"] | components["schemas"]["Spandrel_Checkpoint_Config"] | components["schemas"]["CLIPEmbed_Diffusers_G_Config"] | components["schemas"]["CLIPEmbed_Diffusers_L_Config"] | components["schemas"]["CLIPVision_Diffusers_Config"] | components["schemas"]["SigLIP_Diffusers_Config"] | components["schemas"]["FLUXRedux_Checkpoint_Config"] | components["schemas"]["LlavaOnevision_Diffusers_Config"] | components["schemas"]["TextLLM_Diffusers_Config"] | components["schemas"]["ExternalApiModelConfig"] | 
components["schemas"]["Unknown_Config"]; + config: components["schemas"]["Main_Diffusers_SD1_Config"] | components["schemas"]["Main_Diffusers_SD2_Config"] | components["schemas"]["Main_Diffusers_SDXL_Config"] | components["schemas"]["Main_Diffusers_SDXLRefiner_Config"] | components["schemas"]["Main_Diffusers_SD3_Config"] | components["schemas"]["Main_Diffusers_FLUX_Config"] | components["schemas"]["Main_Diffusers_Flux2_Config"] | components["schemas"]["Main_Diffusers_CogView4_Config"] | components["schemas"]["Main_Diffusers_QwenImage_Config"] | components["schemas"]["Main_Diffusers_ZImage_Config"] | components["schemas"]["Main_Checkpoint_SD1_Config"] | components["schemas"]["Main_Checkpoint_SD2_Config"] | components["schemas"]["Main_Checkpoint_SDXL_Config"] | components["schemas"]["Main_Checkpoint_SDXLRefiner_Config"] | components["schemas"]["Main_Checkpoint_Flux2_Config"] | components["schemas"]["Main_Checkpoint_FLUX_Config"] | components["schemas"]["Main_Checkpoint_QwenImage_Config"] | components["schemas"]["Main_Checkpoint_ZImage_Config"] | components["schemas"]["Main_Checkpoint_Anima_Config"] | components["schemas"]["Main_BnBNF4_FLUX_Config"] | components["schemas"]["Main_GGUF_Flux2_Config"] | components["schemas"]["Main_GGUF_FLUX_Config"] | components["schemas"]["Main_GGUF_QwenImage_Config"] | components["schemas"]["Main_GGUF_ZImage_Config"] | components["schemas"]["VAE_Checkpoint_SD1_Config"] | components["schemas"]["VAE_Checkpoint_SD2_Config"] | components["schemas"]["VAE_Checkpoint_SDXL_Config"] | components["schemas"]["VAE_Checkpoint_FLUX_Config"] | components["schemas"]["VAE_Checkpoint_Flux2_Config"] | components["schemas"]["VAE_Checkpoint_QwenImage_Config"] | components["schemas"]["VAE_Checkpoint_Anima_Config"] | components["schemas"]["VAE_Diffusers_SD1_Config"] | components["schemas"]["VAE_Diffusers_SDXL_Config"] | components["schemas"]["VAE_Diffusers_Flux2_Config"] | components["schemas"]["PiDDecoder_Checkpoint_FLUX_Config"] | 
components["schemas"]["PiDDecoder_Checkpoint_Flux2_Config"] | components["schemas"]["PiDDecoder_Checkpoint_SD3_Config"] | components["schemas"]["PiDDecoder_Checkpoint_SDXL_Config"] | components["schemas"]["PiDDecoder_Checkpoint_QwenImage_Config"] | components["schemas"]["ControlNet_Checkpoint_SD1_Config"] | components["schemas"]["ControlNet_Checkpoint_SD2_Config"] | components["schemas"]["ControlNet_Checkpoint_SDXL_Config"] | components["schemas"]["ControlNet_Checkpoint_FLUX_Config"] | components["schemas"]["ControlNet_Checkpoint_ZImage_Config"] | components["schemas"]["ControlNet_Diffusers_SD1_Config"] | components["schemas"]["ControlNet_Diffusers_SD2_Config"] | components["schemas"]["ControlNet_Diffusers_SDXL_Config"] | components["schemas"]["ControlNet_Diffusers_FLUX_Config"] | components["schemas"]["LoRA_LyCORIS_SD1_Config"] | components["schemas"]["LoRA_LyCORIS_SD2_Config"] | components["schemas"]["LoRA_LyCORIS_SDXL_Config"] | components["schemas"]["LoRA_LyCORIS_Flux2_Config"] | components["schemas"]["LoRA_LyCORIS_FLUX_Config"] | components["schemas"]["LoRA_LyCORIS_ZImage_Config"] | components["schemas"]["LoRA_LyCORIS_QwenImage_Config"] | components["schemas"]["LoRA_LyCORIS_Anima_Config"] | components["schemas"]["LoRA_OMI_SDXL_Config"] | components["schemas"]["LoRA_OMI_FLUX_Config"] | components["schemas"]["LoRA_Diffusers_SD1_Config"] | components["schemas"]["LoRA_Diffusers_SD2_Config"] | components["schemas"]["LoRA_Diffusers_SDXL_Config"] | components["schemas"]["LoRA_Diffusers_Flux2_Config"] | components["schemas"]["LoRA_Diffusers_FLUX_Config"] | components["schemas"]["LoRA_Diffusers_ZImage_Config"] | components["schemas"]["ControlLoRA_LyCORIS_FLUX_Config"] | components["schemas"]["T5Encoder_T5Encoder_Config"] | components["schemas"]["T5Encoder_BnBLLMint8_Config"] | components["schemas"]["Qwen3Encoder_Qwen3Encoder_Config"] | components["schemas"]["Qwen3Encoder_Checkpoint_Config"] | components["schemas"]["Qwen3Encoder_GGUF_Config"] | 
components["schemas"]["Gemma2Encoder_Gemma2Encoder_Config"] | components["schemas"]["QwenVLEncoder_Diffusers_Config"] | components["schemas"]["QwenVLEncoder_Checkpoint_Config"] | components["schemas"]["TI_File_SD1_Config"] | components["schemas"]["TI_File_SD2_Config"] | components["schemas"]["TI_File_SDXL_Config"] | components["schemas"]["TI_Folder_SD1_Config"] | components["schemas"]["TI_Folder_SD2_Config"] | components["schemas"]["TI_Folder_SDXL_Config"] | components["schemas"]["IPAdapter_InvokeAI_SD1_Config"] | components["schemas"]["IPAdapter_InvokeAI_SD2_Config"] | components["schemas"]["IPAdapter_InvokeAI_SDXL_Config"] | components["schemas"]["IPAdapter_Checkpoint_SD1_Config"] | components["schemas"]["IPAdapter_Checkpoint_SD2_Config"] | components["schemas"]["IPAdapter_Checkpoint_SDXL_Config"] | components["schemas"]["IPAdapter_Checkpoint_FLUX_Config"] | components["schemas"]["T2IAdapter_Diffusers_SD1_Config"] | components["schemas"]["T2IAdapter_Diffusers_SDXL_Config"] | components["schemas"]["Spandrel_Checkpoint_Config"] | components["schemas"]["CLIPEmbed_Diffusers_G_Config"] | components["schemas"]["CLIPEmbed_Diffusers_L_Config"] | components["schemas"]["CLIPVision_Diffusers_Config"] | components["schemas"]["SigLIP_Diffusers_Config"] | components["schemas"]["FLUXRedux_Checkpoint_Config"] | components["schemas"]["LlavaOnevision_Diffusers_Config"] | components["schemas"]["TextLLM_Diffusers_Config"] | components["schemas"]["ExternalApiModelConfig"] | components["schemas"]["Unknown_Config"]; }; /** * ModelInstallDownloadProgressEvent @@ -23768,7 +24089,7 @@ export type components = { * Config Out * @description After successful installation, this will hold the configuration object. 
*/ - config_out?: (components["schemas"]["Main_Diffusers_SD1_Config"] | components["schemas"]["Main_Diffusers_SD2_Config"] | components["schemas"]["Main_Diffusers_SDXL_Config"] | components["schemas"]["Main_Diffusers_SDXLRefiner_Config"] | components["schemas"]["Main_Diffusers_SD3_Config"] | components["schemas"]["Main_Diffusers_FLUX_Config"] | components["schemas"]["Main_Diffusers_Flux2_Config"] | components["schemas"]["Main_Diffusers_CogView4_Config"] | components["schemas"]["Main_Diffusers_QwenImage_Config"] | components["schemas"]["Main_Diffusers_ZImage_Config"] | components["schemas"]["Main_Checkpoint_SD1_Config"] | components["schemas"]["Main_Checkpoint_SD2_Config"] | components["schemas"]["Main_Checkpoint_SDXL_Config"] | components["schemas"]["Main_Checkpoint_SDXLRefiner_Config"] | components["schemas"]["Main_Checkpoint_Flux2_Config"] | components["schemas"]["Main_Checkpoint_FLUX_Config"] | components["schemas"]["Main_Checkpoint_QwenImage_Config"] | components["schemas"]["Main_Checkpoint_ZImage_Config"] | components["schemas"]["Main_Checkpoint_Anima_Config"] | components["schemas"]["Main_BnBNF4_FLUX_Config"] | components["schemas"]["Main_GGUF_Flux2_Config"] | components["schemas"]["Main_GGUF_FLUX_Config"] | components["schemas"]["Main_GGUF_QwenImage_Config"] | components["schemas"]["Main_GGUF_ZImage_Config"] | components["schemas"]["VAE_Checkpoint_SD1_Config"] | components["schemas"]["VAE_Checkpoint_SD2_Config"] | components["schemas"]["VAE_Checkpoint_SDXL_Config"] | components["schemas"]["VAE_Checkpoint_FLUX_Config"] | components["schemas"]["VAE_Checkpoint_Flux2_Config"] | components["schemas"]["VAE_Checkpoint_QwenImage_Config"] | components["schemas"]["VAE_Checkpoint_Anima_Config"] | components["schemas"]["VAE_Diffusers_SD1_Config"] | components["schemas"]["VAE_Diffusers_SDXL_Config"] | components["schemas"]["VAE_Diffusers_Flux2_Config"] | components["schemas"]["ControlNet_Checkpoint_SD1_Config"] | components["schemas"]["ControlNet_Checkpoint_SD2_Config"] 
| components["schemas"]["ControlNet_Checkpoint_SDXL_Config"] | components["schemas"]["ControlNet_Checkpoint_FLUX_Config"] | components["schemas"]["ControlNet_Checkpoint_ZImage_Config"] | components["schemas"]["ControlNet_Diffusers_SD1_Config"] | components["schemas"]["ControlNet_Diffusers_SD2_Config"] | components["schemas"]["ControlNet_Diffusers_SDXL_Config"] | components["schemas"]["ControlNet_Diffusers_FLUX_Config"] | components["schemas"]["LoRA_LyCORIS_SD1_Config"] | components["schemas"]["LoRA_LyCORIS_SD2_Config"] | components["schemas"]["LoRA_LyCORIS_SDXL_Config"] | components["schemas"]["LoRA_LyCORIS_Flux2_Config"] | components["schemas"]["LoRA_LyCORIS_FLUX_Config"] | components["schemas"]["LoRA_LyCORIS_ZImage_Config"] | components["schemas"]["LoRA_LyCORIS_QwenImage_Config"] | components["schemas"]["LoRA_LyCORIS_Anima_Config"] | components["schemas"]["LoRA_OMI_SDXL_Config"] | components["schemas"]["LoRA_OMI_FLUX_Config"] | components["schemas"]["LoRA_Diffusers_SD1_Config"] | components["schemas"]["LoRA_Diffusers_SD2_Config"] | components["schemas"]["LoRA_Diffusers_SDXL_Config"] | components["schemas"]["LoRA_Diffusers_Flux2_Config"] | components["schemas"]["LoRA_Diffusers_FLUX_Config"] | components["schemas"]["LoRA_Diffusers_ZImage_Config"] | components["schemas"]["ControlLoRA_LyCORIS_FLUX_Config"] | components["schemas"]["T5Encoder_T5Encoder_Config"] | components["schemas"]["T5Encoder_BnBLLMint8_Config"] | components["schemas"]["Qwen3Encoder_Qwen3Encoder_Config"] | components["schemas"]["Qwen3Encoder_Checkpoint_Config"] | components["schemas"]["Qwen3Encoder_GGUF_Config"] | components["schemas"]["QwenVLEncoder_Diffusers_Config"] | components["schemas"]["QwenVLEncoder_Checkpoint_Config"] | components["schemas"]["TI_File_SD1_Config"] | components["schemas"]["TI_File_SD2_Config"] | components["schemas"]["TI_File_SDXL_Config"] | components["schemas"]["TI_Folder_SD1_Config"] | components["schemas"]["TI_Folder_SD2_Config"] | 
components["schemas"]["TI_Folder_SDXL_Config"] | components["schemas"]["IPAdapter_InvokeAI_SD1_Config"] | components["schemas"]["IPAdapter_InvokeAI_SD2_Config"] | components["schemas"]["IPAdapter_InvokeAI_SDXL_Config"] | components["schemas"]["IPAdapter_Checkpoint_SD1_Config"] | components["schemas"]["IPAdapter_Checkpoint_SD2_Config"] | components["schemas"]["IPAdapter_Checkpoint_SDXL_Config"] | components["schemas"]["IPAdapter_Checkpoint_FLUX_Config"] | components["schemas"]["T2IAdapter_Diffusers_SD1_Config"] | components["schemas"]["T2IAdapter_Diffusers_SDXL_Config"] | components["schemas"]["Spandrel_Checkpoint_Config"] | components["schemas"]["CLIPEmbed_Diffusers_G_Config"] | components["schemas"]["CLIPEmbed_Diffusers_L_Config"] | components["schemas"]["CLIPVision_Diffusers_Config"] | components["schemas"]["SigLIP_Diffusers_Config"] | components["schemas"]["FLUXRedux_Checkpoint_Config"] | components["schemas"]["LlavaOnevision_Diffusers_Config"] | components["schemas"]["TextLLM_Diffusers_Config"] | components["schemas"]["ExternalApiModelConfig"] | components["schemas"]["Unknown_Config"]) | null; + config_out?: (components["schemas"]["Main_Diffusers_SD1_Config"] | components["schemas"]["Main_Diffusers_SD2_Config"] | components["schemas"]["Main_Diffusers_SDXL_Config"] | components["schemas"]["Main_Diffusers_SDXLRefiner_Config"] | components["schemas"]["Main_Diffusers_SD3_Config"] | components["schemas"]["Main_Diffusers_FLUX_Config"] | components["schemas"]["Main_Diffusers_Flux2_Config"] | components["schemas"]["Main_Diffusers_CogView4_Config"] | components["schemas"]["Main_Diffusers_QwenImage_Config"] | components["schemas"]["Main_Diffusers_ZImage_Config"] | components["schemas"]["Main_Checkpoint_SD1_Config"] | components["schemas"]["Main_Checkpoint_SD2_Config"] | components["schemas"]["Main_Checkpoint_SDXL_Config"] | components["schemas"]["Main_Checkpoint_SDXLRefiner_Config"] | components["schemas"]["Main_Checkpoint_Flux2_Config"] | 
components["schemas"]["Main_Checkpoint_FLUX_Config"] | components["schemas"]["Main_Checkpoint_QwenImage_Config"] | components["schemas"]["Main_Checkpoint_ZImage_Config"] | components["schemas"]["Main_Checkpoint_Anima_Config"] | components["schemas"]["Main_BnBNF4_FLUX_Config"] | components["schemas"]["Main_GGUF_Flux2_Config"] | components["schemas"]["Main_GGUF_FLUX_Config"] | components["schemas"]["Main_GGUF_QwenImage_Config"] | components["schemas"]["Main_GGUF_ZImage_Config"] | components["schemas"]["VAE_Checkpoint_SD1_Config"] | components["schemas"]["VAE_Checkpoint_SD2_Config"] | components["schemas"]["VAE_Checkpoint_SDXL_Config"] | components["schemas"]["VAE_Checkpoint_FLUX_Config"] | components["schemas"]["VAE_Checkpoint_Flux2_Config"] | components["schemas"]["VAE_Checkpoint_QwenImage_Config"] | components["schemas"]["VAE_Checkpoint_Anima_Config"] | components["schemas"]["VAE_Diffusers_SD1_Config"] | components["schemas"]["VAE_Diffusers_SDXL_Config"] | components["schemas"]["VAE_Diffusers_Flux2_Config"] | components["schemas"]["PiDDecoder_Checkpoint_FLUX_Config"] | components["schemas"]["PiDDecoder_Checkpoint_Flux2_Config"] | components["schemas"]["PiDDecoder_Checkpoint_SD3_Config"] | components["schemas"]["PiDDecoder_Checkpoint_SDXL_Config"] | components["schemas"]["PiDDecoder_Checkpoint_QwenImage_Config"] | components["schemas"]["ControlNet_Checkpoint_SD1_Config"] | components["schemas"]["ControlNet_Checkpoint_SD2_Config"] | components["schemas"]["ControlNet_Checkpoint_SDXL_Config"] | components["schemas"]["ControlNet_Checkpoint_FLUX_Config"] | components["schemas"]["ControlNet_Checkpoint_ZImage_Config"] | components["schemas"]["ControlNet_Diffusers_SD1_Config"] | components["schemas"]["ControlNet_Diffusers_SD2_Config"] | components["schemas"]["ControlNet_Diffusers_SDXL_Config"] | components["schemas"]["ControlNet_Diffusers_FLUX_Config"] | components["schemas"]["LoRA_LyCORIS_SD1_Config"] | components["schemas"]["LoRA_LyCORIS_SD2_Config"] | 
components["schemas"]["LoRA_LyCORIS_SDXL_Config"] | components["schemas"]["LoRA_LyCORIS_Flux2_Config"] | components["schemas"]["LoRA_LyCORIS_FLUX_Config"] | components["schemas"]["LoRA_LyCORIS_ZImage_Config"] | components["schemas"]["LoRA_LyCORIS_QwenImage_Config"] | components["schemas"]["LoRA_LyCORIS_Anima_Config"] | components["schemas"]["LoRA_OMI_SDXL_Config"] | components["schemas"]["LoRA_OMI_FLUX_Config"] | components["schemas"]["LoRA_Diffusers_SD1_Config"] | components["schemas"]["LoRA_Diffusers_SD2_Config"] | components["schemas"]["LoRA_Diffusers_SDXL_Config"] | components["schemas"]["LoRA_Diffusers_Flux2_Config"] | components["schemas"]["LoRA_Diffusers_FLUX_Config"] | components["schemas"]["LoRA_Diffusers_ZImage_Config"] | components["schemas"]["ControlLoRA_LyCORIS_FLUX_Config"] | components["schemas"]["T5Encoder_T5Encoder_Config"] | components["schemas"]["T5Encoder_BnBLLMint8_Config"] | components["schemas"]["Qwen3Encoder_Qwen3Encoder_Config"] | components["schemas"]["Qwen3Encoder_Checkpoint_Config"] | components["schemas"]["Qwen3Encoder_GGUF_Config"] | components["schemas"]["Gemma2Encoder_Gemma2Encoder_Config"] | components["schemas"]["QwenVLEncoder_Diffusers_Config"] | components["schemas"]["QwenVLEncoder_Checkpoint_Config"] | components["schemas"]["TI_File_SD1_Config"] | components["schemas"]["TI_File_SD2_Config"] | components["schemas"]["TI_File_SDXL_Config"] | components["schemas"]["TI_Folder_SD1_Config"] | components["schemas"]["TI_Folder_SD2_Config"] | components["schemas"]["TI_Folder_SDXL_Config"] | components["schemas"]["IPAdapter_InvokeAI_SD1_Config"] | components["schemas"]["IPAdapter_InvokeAI_SD2_Config"] | components["schemas"]["IPAdapter_InvokeAI_SDXL_Config"] | components["schemas"]["IPAdapter_Checkpoint_SD1_Config"] | components["schemas"]["IPAdapter_Checkpoint_SD2_Config"] | components["schemas"]["IPAdapter_Checkpoint_SDXL_Config"] | components["schemas"]["IPAdapter_Checkpoint_FLUX_Config"] | 
components["schemas"]["T2IAdapter_Diffusers_SD1_Config"] | components["schemas"]["T2IAdapter_Diffusers_SDXL_Config"] | components["schemas"]["Spandrel_Checkpoint_Config"] | components["schemas"]["CLIPEmbed_Diffusers_G_Config"] | components["schemas"]["CLIPEmbed_Diffusers_L_Config"] | components["schemas"]["CLIPVision_Diffusers_Config"] | components["schemas"]["SigLIP_Diffusers_Config"] | components["schemas"]["FLUXRedux_Checkpoint_Config"] | components["schemas"]["LlavaOnevision_Diffusers_Config"] | components["schemas"]["TextLLM_Diffusers_Config"] | components["schemas"]["ExternalApiModelConfig"] | components["schemas"]["Unknown_Config"]) | null; /** * Inplace * @description Leave model in its current location; otherwise install under models directory @@ -23854,7 +24175,7 @@ export type components = { * Config * @description The model's config */ - config: components["schemas"]["Main_Diffusers_SD1_Config"] | components["schemas"]["Main_Diffusers_SD2_Config"] | components["schemas"]["Main_Diffusers_SDXL_Config"] | components["schemas"]["Main_Diffusers_SDXLRefiner_Config"] | components["schemas"]["Main_Diffusers_SD3_Config"] | components["schemas"]["Main_Diffusers_FLUX_Config"] | components["schemas"]["Main_Diffusers_Flux2_Config"] | components["schemas"]["Main_Diffusers_CogView4_Config"] | components["schemas"]["Main_Diffusers_QwenImage_Config"] | components["schemas"]["Main_Diffusers_ZImage_Config"] | components["schemas"]["Main_Checkpoint_SD1_Config"] | components["schemas"]["Main_Checkpoint_SD2_Config"] | components["schemas"]["Main_Checkpoint_SDXL_Config"] | components["schemas"]["Main_Checkpoint_SDXLRefiner_Config"] | components["schemas"]["Main_Checkpoint_Flux2_Config"] | components["schemas"]["Main_Checkpoint_FLUX_Config"] | components["schemas"]["Main_Checkpoint_QwenImage_Config"] | components["schemas"]["Main_Checkpoint_ZImage_Config"] | components["schemas"]["Main_Checkpoint_Anima_Config"] | components["schemas"]["Main_BnBNF4_FLUX_Config"] | 
components["schemas"]["Main_GGUF_Flux2_Config"] | components["schemas"]["Main_GGUF_FLUX_Config"] | components["schemas"]["Main_GGUF_QwenImage_Config"] | components["schemas"]["Main_GGUF_ZImage_Config"] | components["schemas"]["VAE_Checkpoint_SD1_Config"] | components["schemas"]["VAE_Checkpoint_SD2_Config"] | components["schemas"]["VAE_Checkpoint_SDXL_Config"] | components["schemas"]["VAE_Checkpoint_FLUX_Config"] | components["schemas"]["VAE_Checkpoint_Flux2_Config"] | components["schemas"]["VAE_Checkpoint_QwenImage_Config"] | components["schemas"]["VAE_Checkpoint_Anima_Config"] | components["schemas"]["VAE_Diffusers_SD1_Config"] | components["schemas"]["VAE_Diffusers_SDXL_Config"] | components["schemas"]["VAE_Diffusers_Flux2_Config"] | components["schemas"]["ControlNet_Checkpoint_SD1_Config"] | components["schemas"]["ControlNet_Checkpoint_SD2_Config"] | components["schemas"]["ControlNet_Checkpoint_SDXL_Config"] | components["schemas"]["ControlNet_Checkpoint_FLUX_Config"] | components["schemas"]["ControlNet_Checkpoint_ZImage_Config"] | components["schemas"]["ControlNet_Diffusers_SD1_Config"] | components["schemas"]["ControlNet_Diffusers_SD2_Config"] | components["schemas"]["ControlNet_Diffusers_SDXL_Config"] | components["schemas"]["ControlNet_Diffusers_FLUX_Config"] | components["schemas"]["LoRA_LyCORIS_SD1_Config"] | components["schemas"]["LoRA_LyCORIS_SD2_Config"] | components["schemas"]["LoRA_LyCORIS_SDXL_Config"] | components["schemas"]["LoRA_LyCORIS_Flux2_Config"] | components["schemas"]["LoRA_LyCORIS_FLUX_Config"] | components["schemas"]["LoRA_LyCORIS_ZImage_Config"] | components["schemas"]["LoRA_LyCORIS_QwenImage_Config"] | components["schemas"]["LoRA_LyCORIS_Anima_Config"] | components["schemas"]["LoRA_OMI_SDXL_Config"] | components["schemas"]["LoRA_OMI_FLUX_Config"] | components["schemas"]["LoRA_Diffusers_SD1_Config"] | components["schemas"]["LoRA_Diffusers_SD2_Config"] | components["schemas"]["LoRA_Diffusers_SDXL_Config"] | 
components["schemas"]["LoRA_Diffusers_Flux2_Config"] | components["schemas"]["LoRA_Diffusers_FLUX_Config"] | components["schemas"]["LoRA_Diffusers_ZImage_Config"] | components["schemas"]["ControlLoRA_LyCORIS_FLUX_Config"] | components["schemas"]["T5Encoder_T5Encoder_Config"] | components["schemas"]["T5Encoder_BnBLLMint8_Config"] | components["schemas"]["Qwen3Encoder_Qwen3Encoder_Config"] | components["schemas"]["Qwen3Encoder_Checkpoint_Config"] | components["schemas"]["Qwen3Encoder_GGUF_Config"] | components["schemas"]["QwenVLEncoder_Diffusers_Config"] | components["schemas"]["QwenVLEncoder_Checkpoint_Config"] | components["schemas"]["TI_File_SD1_Config"] | components["schemas"]["TI_File_SD2_Config"] | components["schemas"]["TI_File_SDXL_Config"] | components["schemas"]["TI_Folder_SD1_Config"] | components["schemas"]["TI_Folder_SD2_Config"] | components["schemas"]["TI_Folder_SDXL_Config"] | components["schemas"]["IPAdapter_InvokeAI_SD1_Config"] | components["schemas"]["IPAdapter_InvokeAI_SD2_Config"] | components["schemas"]["IPAdapter_InvokeAI_SDXL_Config"] | components["schemas"]["IPAdapter_Checkpoint_SD1_Config"] | components["schemas"]["IPAdapter_Checkpoint_SD2_Config"] | components["schemas"]["IPAdapter_Checkpoint_SDXL_Config"] | components["schemas"]["IPAdapter_Checkpoint_FLUX_Config"] | components["schemas"]["T2IAdapter_Diffusers_SD1_Config"] | components["schemas"]["T2IAdapter_Diffusers_SDXL_Config"] | components["schemas"]["Spandrel_Checkpoint_Config"] | components["schemas"]["CLIPEmbed_Diffusers_G_Config"] | components["schemas"]["CLIPEmbed_Diffusers_L_Config"] | components["schemas"]["CLIPVision_Diffusers_Config"] | components["schemas"]["SigLIP_Diffusers_Config"] | components["schemas"]["FLUXRedux_Checkpoint_Config"] | components["schemas"]["LlavaOnevision_Diffusers_Config"] | components["schemas"]["TextLLM_Diffusers_Config"] | components["schemas"]["ExternalApiModelConfig"] | components["schemas"]["Unknown_Config"]; + config: 
components["schemas"]["Main_Diffusers_SD1_Config"] | components["schemas"]["Main_Diffusers_SD2_Config"] | components["schemas"]["Main_Diffusers_SDXL_Config"] | components["schemas"]["Main_Diffusers_SDXLRefiner_Config"] | components["schemas"]["Main_Diffusers_SD3_Config"] | components["schemas"]["Main_Diffusers_FLUX_Config"] | components["schemas"]["Main_Diffusers_Flux2_Config"] | components["schemas"]["Main_Diffusers_CogView4_Config"] | components["schemas"]["Main_Diffusers_QwenImage_Config"] | components["schemas"]["Main_Diffusers_ZImage_Config"] | components["schemas"]["Main_Checkpoint_SD1_Config"] | components["schemas"]["Main_Checkpoint_SD2_Config"] | components["schemas"]["Main_Checkpoint_SDXL_Config"] | components["schemas"]["Main_Checkpoint_SDXLRefiner_Config"] | components["schemas"]["Main_Checkpoint_Flux2_Config"] | components["schemas"]["Main_Checkpoint_FLUX_Config"] | components["schemas"]["Main_Checkpoint_QwenImage_Config"] | components["schemas"]["Main_Checkpoint_ZImage_Config"] | components["schemas"]["Main_Checkpoint_Anima_Config"] | components["schemas"]["Main_BnBNF4_FLUX_Config"] | components["schemas"]["Main_GGUF_Flux2_Config"] | components["schemas"]["Main_GGUF_FLUX_Config"] | components["schemas"]["Main_GGUF_QwenImage_Config"] | components["schemas"]["Main_GGUF_ZImage_Config"] | components["schemas"]["VAE_Checkpoint_SD1_Config"] | components["schemas"]["VAE_Checkpoint_SD2_Config"] | components["schemas"]["VAE_Checkpoint_SDXL_Config"] | components["schemas"]["VAE_Checkpoint_FLUX_Config"] | components["schemas"]["VAE_Checkpoint_Flux2_Config"] | components["schemas"]["VAE_Checkpoint_QwenImage_Config"] | components["schemas"]["VAE_Checkpoint_Anima_Config"] | components["schemas"]["VAE_Diffusers_SD1_Config"] | components["schemas"]["VAE_Diffusers_SDXL_Config"] | components["schemas"]["VAE_Diffusers_Flux2_Config"] | components["schemas"]["PiDDecoder_Checkpoint_FLUX_Config"] | components["schemas"]["PiDDecoder_Checkpoint_Flux2_Config"] | 
components["schemas"]["PiDDecoder_Checkpoint_SD3_Config"] | components["schemas"]["PiDDecoder_Checkpoint_SDXL_Config"] | components["schemas"]["PiDDecoder_Checkpoint_QwenImage_Config"] | components["schemas"]["ControlNet_Checkpoint_SD1_Config"] | components["schemas"]["ControlNet_Checkpoint_SD2_Config"] | components["schemas"]["ControlNet_Checkpoint_SDXL_Config"] | components["schemas"]["ControlNet_Checkpoint_FLUX_Config"] | components["schemas"]["ControlNet_Checkpoint_ZImage_Config"] | components["schemas"]["ControlNet_Diffusers_SD1_Config"] | components["schemas"]["ControlNet_Diffusers_SD2_Config"] | components["schemas"]["ControlNet_Diffusers_SDXL_Config"] | components["schemas"]["ControlNet_Diffusers_FLUX_Config"] | components["schemas"]["LoRA_LyCORIS_SD1_Config"] | components["schemas"]["LoRA_LyCORIS_SD2_Config"] | components["schemas"]["LoRA_LyCORIS_SDXL_Config"] | components["schemas"]["LoRA_LyCORIS_Flux2_Config"] | components["schemas"]["LoRA_LyCORIS_FLUX_Config"] | components["schemas"]["LoRA_LyCORIS_ZImage_Config"] | components["schemas"]["LoRA_LyCORIS_QwenImage_Config"] | components["schemas"]["LoRA_LyCORIS_Anima_Config"] | components["schemas"]["LoRA_OMI_SDXL_Config"] | components["schemas"]["LoRA_OMI_FLUX_Config"] | components["schemas"]["LoRA_Diffusers_SD1_Config"] | components["schemas"]["LoRA_Diffusers_SD2_Config"] | components["schemas"]["LoRA_Diffusers_SDXL_Config"] | components["schemas"]["LoRA_Diffusers_Flux2_Config"] | components["schemas"]["LoRA_Diffusers_FLUX_Config"] | components["schemas"]["LoRA_Diffusers_ZImage_Config"] | components["schemas"]["ControlLoRA_LyCORIS_FLUX_Config"] | components["schemas"]["T5Encoder_T5Encoder_Config"] | components["schemas"]["T5Encoder_BnBLLMint8_Config"] | components["schemas"]["Qwen3Encoder_Qwen3Encoder_Config"] | components["schemas"]["Qwen3Encoder_Checkpoint_Config"] | components["schemas"]["Qwen3Encoder_GGUF_Config"] | components["schemas"]["Gemma2Encoder_Gemma2Encoder_Config"] | 
components["schemas"]["QwenVLEncoder_Diffusers_Config"] | components["schemas"]["QwenVLEncoder_Checkpoint_Config"] | components["schemas"]["TI_File_SD1_Config"] | components["schemas"]["TI_File_SD2_Config"] | components["schemas"]["TI_File_SDXL_Config"] | components["schemas"]["TI_Folder_SD1_Config"] | components["schemas"]["TI_Folder_SD2_Config"] | components["schemas"]["TI_Folder_SDXL_Config"] | components["schemas"]["IPAdapter_InvokeAI_SD1_Config"] | components["schemas"]["IPAdapter_InvokeAI_SD2_Config"] | components["schemas"]["IPAdapter_InvokeAI_SDXL_Config"] | components["schemas"]["IPAdapter_Checkpoint_SD1_Config"] | components["schemas"]["IPAdapter_Checkpoint_SD2_Config"] | components["schemas"]["IPAdapter_Checkpoint_SDXL_Config"] | components["schemas"]["IPAdapter_Checkpoint_FLUX_Config"] | components["schemas"]["T2IAdapter_Diffusers_SD1_Config"] | components["schemas"]["T2IAdapter_Diffusers_SDXL_Config"] | components["schemas"]["Spandrel_Checkpoint_Config"] | components["schemas"]["CLIPEmbed_Diffusers_G_Config"] | components["schemas"]["CLIPEmbed_Diffusers_L_Config"] | components["schemas"]["CLIPVision_Diffusers_Config"] | components["schemas"]["SigLIP_Diffusers_Config"] | components["schemas"]["FLUXRedux_Checkpoint_Config"] | components["schemas"]["LlavaOnevision_Diffusers_Config"] | components["schemas"]["TextLLM_Diffusers_Config"] | components["schemas"]["ExternalApiModelConfig"] | components["schemas"]["Unknown_Config"]; /** * @description The submodel type, if any * @default null @@ -23875,7 +24196,7 @@ export type components = { * Config * @description The model's config */ - config: components["schemas"]["Main_Diffusers_SD1_Config"] | components["schemas"]["Main_Diffusers_SD2_Config"] | components["schemas"]["Main_Diffusers_SDXL_Config"] | components["schemas"]["Main_Diffusers_SDXLRefiner_Config"] | components["schemas"]["Main_Diffusers_SD3_Config"] | components["schemas"]["Main_Diffusers_FLUX_Config"] | 
components["schemas"]["Main_Diffusers_Flux2_Config"] | components["schemas"]["Main_Diffusers_CogView4_Config"] | components["schemas"]["Main_Diffusers_QwenImage_Config"] | components["schemas"]["Main_Diffusers_ZImage_Config"] | components["schemas"]["Main_Checkpoint_SD1_Config"] | components["schemas"]["Main_Checkpoint_SD2_Config"] | components["schemas"]["Main_Checkpoint_SDXL_Config"] | components["schemas"]["Main_Checkpoint_SDXLRefiner_Config"] | components["schemas"]["Main_Checkpoint_Flux2_Config"] | components["schemas"]["Main_Checkpoint_FLUX_Config"] | components["schemas"]["Main_Checkpoint_QwenImage_Config"] | components["schemas"]["Main_Checkpoint_ZImage_Config"] | components["schemas"]["Main_Checkpoint_Anima_Config"] | components["schemas"]["Main_BnBNF4_FLUX_Config"] | components["schemas"]["Main_GGUF_Flux2_Config"] | components["schemas"]["Main_GGUF_FLUX_Config"] | components["schemas"]["Main_GGUF_QwenImage_Config"] | components["schemas"]["Main_GGUF_ZImage_Config"] | components["schemas"]["VAE_Checkpoint_SD1_Config"] | components["schemas"]["VAE_Checkpoint_SD2_Config"] | components["schemas"]["VAE_Checkpoint_SDXL_Config"] | components["schemas"]["VAE_Checkpoint_FLUX_Config"] | components["schemas"]["VAE_Checkpoint_Flux2_Config"] | components["schemas"]["VAE_Checkpoint_QwenImage_Config"] | components["schemas"]["VAE_Checkpoint_Anima_Config"] | components["schemas"]["VAE_Diffusers_SD1_Config"] | components["schemas"]["VAE_Diffusers_SDXL_Config"] | components["schemas"]["VAE_Diffusers_Flux2_Config"] | components["schemas"]["ControlNet_Checkpoint_SD1_Config"] | components["schemas"]["ControlNet_Checkpoint_SD2_Config"] | components["schemas"]["ControlNet_Checkpoint_SDXL_Config"] | components["schemas"]["ControlNet_Checkpoint_FLUX_Config"] | components["schemas"]["ControlNet_Checkpoint_ZImage_Config"] | components["schemas"]["ControlNet_Diffusers_SD1_Config"] | components["schemas"]["ControlNet_Diffusers_SD2_Config"] | 
components["schemas"]["ControlNet_Diffusers_SDXL_Config"] | components["schemas"]["ControlNet_Diffusers_FLUX_Config"] | components["schemas"]["LoRA_LyCORIS_SD1_Config"] | components["schemas"]["LoRA_LyCORIS_SD2_Config"] | components["schemas"]["LoRA_LyCORIS_SDXL_Config"] | components["schemas"]["LoRA_LyCORIS_Flux2_Config"] | components["schemas"]["LoRA_LyCORIS_FLUX_Config"] | components["schemas"]["LoRA_LyCORIS_ZImage_Config"] | components["schemas"]["LoRA_LyCORIS_QwenImage_Config"] | components["schemas"]["LoRA_LyCORIS_Anima_Config"] | components["schemas"]["LoRA_OMI_SDXL_Config"] | components["schemas"]["LoRA_OMI_FLUX_Config"] | components["schemas"]["LoRA_Diffusers_SD1_Config"] | components["schemas"]["LoRA_Diffusers_SD2_Config"] | components["schemas"]["LoRA_Diffusers_SDXL_Config"] | components["schemas"]["LoRA_Diffusers_Flux2_Config"] | components["schemas"]["LoRA_Diffusers_FLUX_Config"] | components["schemas"]["LoRA_Diffusers_ZImage_Config"] | components["schemas"]["ControlLoRA_LyCORIS_FLUX_Config"] | components["schemas"]["T5Encoder_T5Encoder_Config"] | components["schemas"]["T5Encoder_BnBLLMint8_Config"] | components["schemas"]["Qwen3Encoder_Qwen3Encoder_Config"] | components["schemas"]["Qwen3Encoder_Checkpoint_Config"] | components["schemas"]["Qwen3Encoder_GGUF_Config"] | components["schemas"]["QwenVLEncoder_Diffusers_Config"] | components["schemas"]["QwenVLEncoder_Checkpoint_Config"] | components["schemas"]["TI_File_SD1_Config"] | components["schemas"]["TI_File_SD2_Config"] | components["schemas"]["TI_File_SDXL_Config"] | components["schemas"]["TI_Folder_SD1_Config"] | components["schemas"]["TI_Folder_SD2_Config"] | components["schemas"]["TI_Folder_SDXL_Config"] | components["schemas"]["IPAdapter_InvokeAI_SD1_Config"] | components["schemas"]["IPAdapter_InvokeAI_SD2_Config"] | components["schemas"]["IPAdapter_InvokeAI_SDXL_Config"] | components["schemas"]["IPAdapter_Checkpoint_SD1_Config"] | components["schemas"]["IPAdapter_Checkpoint_SD2_Config"] | 
components["schemas"]["IPAdapter_Checkpoint_SDXL_Config"] | components["schemas"]["IPAdapter_Checkpoint_FLUX_Config"] | components["schemas"]["T2IAdapter_Diffusers_SD1_Config"] | components["schemas"]["T2IAdapter_Diffusers_SDXL_Config"] | components["schemas"]["Spandrel_Checkpoint_Config"] | components["schemas"]["CLIPEmbed_Diffusers_G_Config"] | components["schemas"]["CLIPEmbed_Diffusers_L_Config"] | components["schemas"]["CLIPVision_Diffusers_Config"] | components["schemas"]["SigLIP_Diffusers_Config"] | components["schemas"]["FLUXRedux_Checkpoint_Config"] | components["schemas"]["LlavaOnevision_Diffusers_Config"] | components["schemas"]["TextLLM_Diffusers_Config"] | components["schemas"]["ExternalApiModelConfig"] | components["schemas"]["Unknown_Config"]; + config: components["schemas"]["Main_Diffusers_SD1_Config"] | components["schemas"]["Main_Diffusers_SD2_Config"] | components["schemas"]["Main_Diffusers_SDXL_Config"] | components["schemas"]["Main_Diffusers_SDXLRefiner_Config"] | components["schemas"]["Main_Diffusers_SD3_Config"] | components["schemas"]["Main_Diffusers_FLUX_Config"] | components["schemas"]["Main_Diffusers_Flux2_Config"] | components["schemas"]["Main_Diffusers_CogView4_Config"] | components["schemas"]["Main_Diffusers_QwenImage_Config"] | components["schemas"]["Main_Diffusers_ZImage_Config"] | components["schemas"]["Main_Checkpoint_SD1_Config"] | components["schemas"]["Main_Checkpoint_SD2_Config"] | components["schemas"]["Main_Checkpoint_SDXL_Config"] | components["schemas"]["Main_Checkpoint_SDXLRefiner_Config"] | components["schemas"]["Main_Checkpoint_Flux2_Config"] | components["schemas"]["Main_Checkpoint_FLUX_Config"] | components["schemas"]["Main_Checkpoint_QwenImage_Config"] | components["schemas"]["Main_Checkpoint_ZImage_Config"] | components["schemas"]["Main_Checkpoint_Anima_Config"] | components["schemas"]["Main_BnBNF4_FLUX_Config"] | components["schemas"]["Main_GGUF_Flux2_Config"] | components["schemas"]["Main_GGUF_FLUX_Config"] | 
components["schemas"]["Main_GGUF_QwenImage_Config"] | components["schemas"]["Main_GGUF_ZImage_Config"] | components["schemas"]["VAE_Checkpoint_SD1_Config"] | components["schemas"]["VAE_Checkpoint_SD2_Config"] | components["schemas"]["VAE_Checkpoint_SDXL_Config"] | components["schemas"]["VAE_Checkpoint_FLUX_Config"] | components["schemas"]["VAE_Checkpoint_Flux2_Config"] | components["schemas"]["VAE_Checkpoint_QwenImage_Config"] | components["schemas"]["VAE_Checkpoint_Anima_Config"] | components["schemas"]["VAE_Diffusers_SD1_Config"] | components["schemas"]["VAE_Diffusers_SDXL_Config"] | components["schemas"]["VAE_Diffusers_Flux2_Config"] | components["schemas"]["PiDDecoder_Checkpoint_FLUX_Config"] | components["schemas"]["PiDDecoder_Checkpoint_Flux2_Config"] | components["schemas"]["PiDDecoder_Checkpoint_SD3_Config"] | components["schemas"]["PiDDecoder_Checkpoint_SDXL_Config"] | components["schemas"]["PiDDecoder_Checkpoint_QwenImage_Config"] | components["schemas"]["ControlNet_Checkpoint_SD1_Config"] | components["schemas"]["ControlNet_Checkpoint_SD2_Config"] | components["schemas"]["ControlNet_Checkpoint_SDXL_Config"] | components["schemas"]["ControlNet_Checkpoint_FLUX_Config"] | components["schemas"]["ControlNet_Checkpoint_ZImage_Config"] | components["schemas"]["ControlNet_Diffusers_SD1_Config"] | components["schemas"]["ControlNet_Diffusers_SD2_Config"] | components["schemas"]["ControlNet_Diffusers_SDXL_Config"] | components["schemas"]["ControlNet_Diffusers_FLUX_Config"] | components["schemas"]["LoRA_LyCORIS_SD1_Config"] | components["schemas"]["LoRA_LyCORIS_SD2_Config"] | components["schemas"]["LoRA_LyCORIS_SDXL_Config"] | components["schemas"]["LoRA_LyCORIS_Flux2_Config"] | components["schemas"]["LoRA_LyCORIS_FLUX_Config"] | components["schemas"]["LoRA_LyCORIS_ZImage_Config"] | components["schemas"]["LoRA_LyCORIS_QwenImage_Config"] | components["schemas"]["LoRA_LyCORIS_Anima_Config"] | components["schemas"]["LoRA_OMI_SDXL_Config"] | 
components["schemas"]["LoRA_OMI_FLUX_Config"] | components["schemas"]["LoRA_Diffusers_SD1_Config"] | components["schemas"]["LoRA_Diffusers_SD2_Config"] | components["schemas"]["LoRA_Diffusers_SDXL_Config"] | components["schemas"]["LoRA_Diffusers_Flux2_Config"] | components["schemas"]["LoRA_Diffusers_FLUX_Config"] | components["schemas"]["LoRA_Diffusers_ZImage_Config"] | components["schemas"]["ControlLoRA_LyCORIS_FLUX_Config"] | components["schemas"]["T5Encoder_T5Encoder_Config"] | components["schemas"]["T5Encoder_BnBLLMint8_Config"] | components["schemas"]["Qwen3Encoder_Qwen3Encoder_Config"] | components["schemas"]["Qwen3Encoder_Checkpoint_Config"] | components["schemas"]["Qwen3Encoder_GGUF_Config"] | components["schemas"]["Gemma2Encoder_Gemma2Encoder_Config"] | components["schemas"]["QwenVLEncoder_Diffusers_Config"] | components["schemas"]["QwenVLEncoder_Checkpoint_Config"] | components["schemas"]["TI_File_SD1_Config"] | components["schemas"]["TI_File_SD2_Config"] | components["schemas"]["TI_File_SDXL_Config"] | components["schemas"]["TI_Folder_SD1_Config"] | components["schemas"]["TI_Folder_SD2_Config"] | components["schemas"]["TI_Folder_SDXL_Config"] | components["schemas"]["IPAdapter_InvokeAI_SD1_Config"] | components["schemas"]["IPAdapter_InvokeAI_SD2_Config"] | components["schemas"]["IPAdapter_InvokeAI_SDXL_Config"] | components["schemas"]["IPAdapter_Checkpoint_SD1_Config"] | components["schemas"]["IPAdapter_Checkpoint_SD2_Config"] | components["schemas"]["IPAdapter_Checkpoint_SDXL_Config"] | components["schemas"]["IPAdapter_Checkpoint_FLUX_Config"] | components["schemas"]["T2IAdapter_Diffusers_SD1_Config"] | components["schemas"]["T2IAdapter_Diffusers_SDXL_Config"] | components["schemas"]["Spandrel_Checkpoint_Config"] | components["schemas"]["CLIPEmbed_Diffusers_G_Config"] | components["schemas"]["CLIPEmbed_Diffusers_L_Config"] | components["schemas"]["CLIPVision_Diffusers_Config"] | components["schemas"]["SigLIP_Diffusers_Config"] | 
components["schemas"]["FLUXRedux_Checkpoint_Config"] | components["schemas"]["LlavaOnevision_Diffusers_Config"] | components["schemas"]["TextLLM_Diffusers_Config"] | components["schemas"]["ExternalApiModelConfig"] | components["schemas"]["Unknown_Config"]; /** * @description The submodel type, if any * @default null @@ -24001,7 +24322,7 @@ export type components = { * Variant * @description The variant of the model. */ - variant?: components["schemas"]["ModelVariantType"] | components["schemas"]["ClipVariantType"] | components["schemas"]["FluxVariantType"] | components["schemas"]["Flux2VariantType"] | components["schemas"]["ZImageVariantType"] | components["schemas"]["QwenImageVariantType"] | components["schemas"]["Qwen3VariantType"] | null; + variant?: components["schemas"]["ModelVariantType"] | components["schemas"]["ClipVariantType"] | components["schemas"]["FluxVariantType"] | components["schemas"]["Flux2VariantType"] | components["schemas"]["ZImageVariantType"] | components["schemas"]["QwenImageVariantType"] | components["schemas"]["Qwen3VariantType"] | components["schemas"]["PiDDecoderVariantType"] | null; /** @description The prediction type of the model. */ prediction_type?: components["schemas"]["SchedulerPredictionType"] | null; /** @@ -24083,7 +24404,7 @@ export type components = { * @description Model type. 
* @enum {string} */ - ModelType: "onnx" | "main" | "vae" | "lora" | "control_lora" | "controlnet" | "embedding" | "ip_adapter" | "clip_vision" | "clip_embed" | "t2i_adapter" | "t5_encoder" | "qwen3_encoder" | "qwen_vl_encoder" | "spandrel_image_to_image" | "siglip" | "flux_redux" | "llava_onevision" | "text_llm" | "external_image_generator" | "unknown"; + ModelType: "onnx" | "main" | "vae" | "lora" | "control_lora" | "controlnet" | "embedding" | "ip_adapter" | "clip_vision" | "clip_embed" | "t2i_adapter" | "t5_encoder" | "qwen3_encoder" | "qwen_vl_encoder" | "gemma2_encoder" | "spandrel_image_to_image" | "siglip" | "flux_redux" | "llava_onevision" | "text_llm" | "external_image_generator" | "pid_decoder" | "unknown"; /** * ModelVariantType * @description Variant type. @@ -24096,7 +24417,7 @@ export type components = { */ ModelsList: { /** Models */ - models: (components["schemas"]["Main_Diffusers_SD1_Config"] | components["schemas"]["Main_Diffusers_SD2_Config"] | components["schemas"]["Main_Diffusers_SDXL_Config"] | components["schemas"]["Main_Diffusers_SDXLRefiner_Config"] | components["schemas"]["Main_Diffusers_SD3_Config"] | components["schemas"]["Main_Diffusers_FLUX_Config"] | components["schemas"]["Main_Diffusers_Flux2_Config"] | components["schemas"]["Main_Diffusers_CogView4_Config"] | components["schemas"]["Main_Diffusers_QwenImage_Config"] | components["schemas"]["Main_Diffusers_ZImage_Config"] | components["schemas"]["Main_Checkpoint_SD1_Config"] | components["schemas"]["Main_Checkpoint_SD2_Config"] | components["schemas"]["Main_Checkpoint_SDXL_Config"] | components["schemas"]["Main_Checkpoint_SDXLRefiner_Config"] | components["schemas"]["Main_Checkpoint_Flux2_Config"] | components["schemas"]["Main_Checkpoint_FLUX_Config"] | components["schemas"]["Main_Checkpoint_QwenImage_Config"] | components["schemas"]["Main_Checkpoint_ZImage_Config"] | components["schemas"]["Main_Checkpoint_Anima_Config"] | components["schemas"]["Main_BnBNF4_FLUX_Config"] | 
components["schemas"]["Main_GGUF_Flux2_Config"] | components["schemas"]["Main_GGUF_FLUX_Config"] | components["schemas"]["Main_GGUF_QwenImage_Config"] | components["schemas"]["Main_GGUF_ZImage_Config"] | components["schemas"]["VAE_Checkpoint_SD1_Config"] | components["schemas"]["VAE_Checkpoint_SD2_Config"] | components["schemas"]["VAE_Checkpoint_SDXL_Config"] | components["schemas"]["VAE_Checkpoint_FLUX_Config"] | components["schemas"]["VAE_Checkpoint_Flux2_Config"] | components["schemas"]["VAE_Checkpoint_QwenImage_Config"] | components["schemas"]["VAE_Checkpoint_Anima_Config"] | components["schemas"]["VAE_Diffusers_SD1_Config"] | components["schemas"]["VAE_Diffusers_SDXL_Config"] | components["schemas"]["VAE_Diffusers_Flux2_Config"] | components["schemas"]["ControlNet_Checkpoint_SD1_Config"] | components["schemas"]["ControlNet_Checkpoint_SD2_Config"] | components["schemas"]["ControlNet_Checkpoint_SDXL_Config"] | components["schemas"]["ControlNet_Checkpoint_FLUX_Config"] | components["schemas"]["ControlNet_Checkpoint_ZImage_Config"] | components["schemas"]["ControlNet_Diffusers_SD1_Config"] | components["schemas"]["ControlNet_Diffusers_SD2_Config"] | components["schemas"]["ControlNet_Diffusers_SDXL_Config"] | components["schemas"]["ControlNet_Diffusers_FLUX_Config"] | components["schemas"]["LoRA_LyCORIS_SD1_Config"] | components["schemas"]["LoRA_LyCORIS_SD2_Config"] | components["schemas"]["LoRA_LyCORIS_SDXL_Config"] | components["schemas"]["LoRA_LyCORIS_Flux2_Config"] | components["schemas"]["LoRA_LyCORIS_FLUX_Config"] | components["schemas"]["LoRA_LyCORIS_ZImage_Config"] | components["schemas"]["LoRA_LyCORIS_QwenImage_Config"] | components["schemas"]["LoRA_LyCORIS_Anima_Config"] | components["schemas"]["LoRA_OMI_SDXL_Config"] | components["schemas"]["LoRA_OMI_FLUX_Config"] | components["schemas"]["LoRA_Diffusers_SD1_Config"] | components["schemas"]["LoRA_Diffusers_SD2_Config"] | components["schemas"]["LoRA_Diffusers_SDXL_Config"] | 
components["schemas"]["LoRA_Diffusers_Flux2_Config"] | components["schemas"]["LoRA_Diffusers_FLUX_Config"] | components["schemas"]["LoRA_Diffusers_ZImage_Config"] | components["schemas"]["ControlLoRA_LyCORIS_FLUX_Config"] | components["schemas"]["T5Encoder_T5Encoder_Config"] | components["schemas"]["T5Encoder_BnBLLMint8_Config"] | components["schemas"]["Qwen3Encoder_Qwen3Encoder_Config"] | components["schemas"]["Qwen3Encoder_Checkpoint_Config"] | components["schemas"]["Qwen3Encoder_GGUF_Config"] | components["schemas"]["QwenVLEncoder_Diffusers_Config"] | components["schemas"]["QwenVLEncoder_Checkpoint_Config"] | components["schemas"]["TI_File_SD1_Config"] | components["schemas"]["TI_File_SD2_Config"] | components["schemas"]["TI_File_SDXL_Config"] | components["schemas"]["TI_Folder_SD1_Config"] | components["schemas"]["TI_Folder_SD2_Config"] | components["schemas"]["TI_Folder_SDXL_Config"] | components["schemas"]["IPAdapter_InvokeAI_SD1_Config"] | components["schemas"]["IPAdapter_InvokeAI_SD2_Config"] | components["schemas"]["IPAdapter_InvokeAI_SDXL_Config"] | components["schemas"]["IPAdapter_Checkpoint_SD1_Config"] | components["schemas"]["IPAdapter_Checkpoint_SD2_Config"] | components["schemas"]["IPAdapter_Checkpoint_SDXL_Config"] | components["schemas"]["IPAdapter_Checkpoint_FLUX_Config"] | components["schemas"]["T2IAdapter_Diffusers_SD1_Config"] | components["schemas"]["T2IAdapter_Diffusers_SDXL_Config"] | components["schemas"]["Spandrel_Checkpoint_Config"] | components["schemas"]["CLIPEmbed_Diffusers_G_Config"] | components["schemas"]["CLIPEmbed_Diffusers_L_Config"] | components["schemas"]["CLIPVision_Diffusers_Config"] | components["schemas"]["SigLIP_Diffusers_Config"] | components["schemas"]["FLUXRedux_Checkpoint_Config"] | components["schemas"]["LlavaOnevision_Diffusers_Config"] | components["schemas"]["TextLLM_Diffusers_Config"] | components["schemas"]["ExternalApiModelConfig"] | components["schemas"]["Unknown_Config"])[]; + models: 
(components["schemas"]["Main_Diffusers_SD1_Config"] | components["schemas"]["Main_Diffusers_SD2_Config"] | components["schemas"]["Main_Diffusers_SDXL_Config"] | components["schemas"]["Main_Diffusers_SDXLRefiner_Config"] | components["schemas"]["Main_Diffusers_SD3_Config"] | components["schemas"]["Main_Diffusers_FLUX_Config"] | components["schemas"]["Main_Diffusers_Flux2_Config"] | components["schemas"]["Main_Diffusers_CogView4_Config"] | components["schemas"]["Main_Diffusers_QwenImage_Config"] | components["schemas"]["Main_Diffusers_ZImage_Config"] | components["schemas"]["Main_Checkpoint_SD1_Config"] | components["schemas"]["Main_Checkpoint_SD2_Config"] | components["schemas"]["Main_Checkpoint_SDXL_Config"] | components["schemas"]["Main_Checkpoint_SDXLRefiner_Config"] | components["schemas"]["Main_Checkpoint_Flux2_Config"] | components["schemas"]["Main_Checkpoint_FLUX_Config"] | components["schemas"]["Main_Checkpoint_QwenImage_Config"] | components["schemas"]["Main_Checkpoint_ZImage_Config"] | components["schemas"]["Main_Checkpoint_Anima_Config"] | components["schemas"]["Main_BnBNF4_FLUX_Config"] | components["schemas"]["Main_GGUF_Flux2_Config"] | components["schemas"]["Main_GGUF_FLUX_Config"] | components["schemas"]["Main_GGUF_QwenImage_Config"] | components["schemas"]["Main_GGUF_ZImage_Config"] | components["schemas"]["VAE_Checkpoint_SD1_Config"] | components["schemas"]["VAE_Checkpoint_SD2_Config"] | components["schemas"]["VAE_Checkpoint_SDXL_Config"] | components["schemas"]["VAE_Checkpoint_FLUX_Config"] | components["schemas"]["VAE_Checkpoint_Flux2_Config"] | components["schemas"]["VAE_Checkpoint_QwenImage_Config"] | components["schemas"]["VAE_Checkpoint_Anima_Config"] | components["schemas"]["VAE_Diffusers_SD1_Config"] | components["schemas"]["VAE_Diffusers_SDXL_Config"] | components["schemas"]["VAE_Diffusers_Flux2_Config"] | components["schemas"]["PiDDecoder_Checkpoint_FLUX_Config"] | components["schemas"]["PiDDecoder_Checkpoint_Flux2_Config"] | 
components["schemas"]["PiDDecoder_Checkpoint_SD3_Config"] | components["schemas"]["PiDDecoder_Checkpoint_SDXL_Config"] | components["schemas"]["PiDDecoder_Checkpoint_QwenImage_Config"] | components["schemas"]["ControlNet_Checkpoint_SD1_Config"] | components["schemas"]["ControlNet_Checkpoint_SD2_Config"] | components["schemas"]["ControlNet_Checkpoint_SDXL_Config"] | components["schemas"]["ControlNet_Checkpoint_FLUX_Config"] | components["schemas"]["ControlNet_Checkpoint_ZImage_Config"] | components["schemas"]["ControlNet_Diffusers_SD1_Config"] | components["schemas"]["ControlNet_Diffusers_SD2_Config"] | components["schemas"]["ControlNet_Diffusers_SDXL_Config"] | components["schemas"]["ControlNet_Diffusers_FLUX_Config"] | components["schemas"]["LoRA_LyCORIS_SD1_Config"] | components["schemas"]["LoRA_LyCORIS_SD2_Config"] | components["schemas"]["LoRA_LyCORIS_SDXL_Config"] | components["schemas"]["LoRA_LyCORIS_Flux2_Config"] | components["schemas"]["LoRA_LyCORIS_FLUX_Config"] | components["schemas"]["LoRA_LyCORIS_ZImage_Config"] | components["schemas"]["LoRA_LyCORIS_QwenImage_Config"] | components["schemas"]["LoRA_LyCORIS_Anima_Config"] | components["schemas"]["LoRA_OMI_SDXL_Config"] | components["schemas"]["LoRA_OMI_FLUX_Config"] | components["schemas"]["LoRA_Diffusers_SD1_Config"] | components["schemas"]["LoRA_Diffusers_SD2_Config"] | components["schemas"]["LoRA_Diffusers_SDXL_Config"] | components["schemas"]["LoRA_Diffusers_Flux2_Config"] | components["schemas"]["LoRA_Diffusers_FLUX_Config"] | components["schemas"]["LoRA_Diffusers_ZImage_Config"] | components["schemas"]["ControlLoRA_LyCORIS_FLUX_Config"] | components["schemas"]["T5Encoder_T5Encoder_Config"] | components["schemas"]["T5Encoder_BnBLLMint8_Config"] | components["schemas"]["Qwen3Encoder_Qwen3Encoder_Config"] | components["schemas"]["Qwen3Encoder_Checkpoint_Config"] | components["schemas"]["Qwen3Encoder_GGUF_Config"] | components["schemas"]["Gemma2Encoder_Gemma2Encoder_Config"] | 
components["schemas"]["QwenVLEncoder_Diffusers_Config"] | components["schemas"]["QwenVLEncoder_Checkpoint_Config"] | components["schemas"]["TI_File_SD1_Config"] | components["schemas"]["TI_File_SD2_Config"] | components["schemas"]["TI_File_SDXL_Config"] | components["schemas"]["TI_Folder_SD1_Config"] | components["schemas"]["TI_Folder_SD2_Config"] | components["schemas"]["TI_Folder_SDXL_Config"] | components["schemas"]["IPAdapter_InvokeAI_SD1_Config"] | components["schemas"]["IPAdapter_InvokeAI_SD2_Config"] | components["schemas"]["IPAdapter_InvokeAI_SDXL_Config"] | components["schemas"]["IPAdapter_Checkpoint_SD1_Config"] | components["schemas"]["IPAdapter_Checkpoint_SD2_Config"] | components["schemas"]["IPAdapter_Checkpoint_SDXL_Config"] | components["schemas"]["IPAdapter_Checkpoint_FLUX_Config"] | components["schemas"]["T2IAdapter_Diffusers_SD1_Config"] | components["schemas"]["T2IAdapter_Diffusers_SDXL_Config"] | components["schemas"]["Spandrel_Checkpoint_Config"] | components["schemas"]["CLIPEmbed_Diffusers_G_Config"] | components["schemas"]["CLIPEmbed_Diffusers_L_Config"] | components["schemas"]["CLIPVision_Diffusers_Config"] | components["schemas"]["SigLIP_Diffusers_Config"] | components["schemas"]["FLUXRedux_Checkpoint_Config"] | components["schemas"]["LlavaOnevision_Diffusers_Config"] | components["schemas"]["TextLLM_Diffusers_Config"] | components["schemas"]["ExternalApiModelConfig"] | components["schemas"]["Unknown_Config"])[]; }; /** * Multiply Integers @@ -24864,19 +25185,593 @@ export type components = { type: "paste_image_into_bounding_box"; }; /** - * PiDiNet Edge Detection - * @description Generates an edge map using PiDiNet. + * PiDDecoderField + * @description Field for a PiD (Pixel Diffusion Decoder) checkpoint. 
*/ - PiDiNetEdgeDetectionInvocation: { + PiDDecoderField: { + /** @description Info to load PiD decoder checkpoint */ + decoder: components["schemas"]["ModelIdentifierField"]; + }; + /** + * PiD Decoder - FLUX / FLUX.2 / SD3 + * @description Loads a PiD decoder checkpoint, outputting a PiDDecoderField for use + * by the per-backbone PiD decode nodes. + */ + PiDDecoderLoaderInvocation: { /** - * @description The board to save the image to - * @default null + * Id + * @description The id of this instance of an invocation. Must be unique among all instances of invocations. */ - board?: components["schemas"]["BoardField"] | null; + id: string; /** - * @description Optional metadata to be saved with the image - * @default null - */ + * Is Intermediate + * @description Whether or not this is an intermediate invocation. + * @default false + */ + is_intermediate?: boolean; + /** + * Use Cache + * @description Whether or not to use the cache + * @default true + */ + use_cache?: boolean; + /** + * PiD Decoder + * @description PiD decoder checkpoint matching the upstream backbone. + * @default null + */ + pid_decoder_model?: components["schemas"]["ModelIdentifierField"] | null; + /** + * type + * @default pid_decoder_loader + * @constant + */ + type: "pid_decoder_loader"; + }; + /** PiDDecoderOutput */ + PiDDecoderOutput: { + /** + * PiD Decoder + * @description PiD (Pixel Diffusion Decoder) checkpoint + */ + pid_decoder: components["schemas"]["PiDDecoderField"]; + /** + * type + * @default pid_decoder_output + * @constant + */ + type: "pid_decoder_output"; + }; + /** + * PiDDecoderVariantType + * @description PiD (Pixel Diffusion Decoder) variants distributed by NVIDIA. + * + * Each backbone (FLUX.1, FLUX.2, SD3) ships in two resolution presets that + * differ only in target output resolution; the underlying network is the + * same. NVIDIA's checkpoint filenames encode this as e.g. 
+ * `PiD_res2k_sr4x_official_flux_distill_4step` vs + * `PiD_res2kto4k_sr4x_official_flux_distill_4step`. + * @enum {string} + */ + PiDDecoderVariantType: "res2k_sr4x" | "res2kto4k_sr4x"; + /** + * PiDDecoder_Checkpoint_FLUX_Config + * @description PiD decoder for the FLUX.1 backbone (16-channel latent). + */ + PiDDecoder_Checkpoint_FLUX_Config: { + /** + * Key + * @description A unique key for this model. + */ + key: string; + /** + * Hash + * @description The hash of the model file(s). + */ + hash: string; + /** + * Path + * @description Path to the model on the filesystem. Relative paths are relative to the Invoke root directory. + */ + path: string; + /** + * File Size + * @description The size of the model in bytes. + */ + file_size: number; + /** + * Name + * @description Name of the model. + */ + name: string; + /** + * Description + * @description Model description + */ + description: string | null; + /** + * Source + * @description The original source of the model (path, URL or repo_id). + */ + source: string; + /** @description The type of source */ + source_type: components["schemas"]["ModelSourceType"]; + /** + * Source Api Response + * @description The original API response from the source, as stringified JSON. + */ + source_api_response: string | null; + /** + * Source Url + * @description Optional URL for the model (e.g. download page or model page). + */ + source_url: string | null; + /** + * Cover Image + * @description Url for image to preview model + */ + cover_image: string | null; + /** + * Config Path + * @description Path to the config for this model, if any. + */ + config_path: string | null; + /** + * Type + * @default pid_decoder + * @constant + */ + type: "pid_decoder"; + /** + * Format + * @default checkpoint + * @constant + */ + format: "checkpoint"; + /** + * Base + * @default flux + * @constant + */ + base: "flux"; + /** @description Resolution preset of the PiD decoder checkpoint. 
*/ + variant: components["schemas"]["PiDDecoderVariantType"]; + }; + /** + * PiDDecoder_Checkpoint_Flux2_Config + * @description PiD decoder for the FLUX.2 backbone (128-channel latent). + */ + PiDDecoder_Checkpoint_Flux2_Config: { + /** + * Key + * @description A unique key for this model. + */ + key: string; + /** + * Hash + * @description The hash of the model file(s). + */ + hash: string; + /** + * Path + * @description Path to the model on the filesystem. Relative paths are relative to the Invoke root directory. + */ + path: string; + /** + * File Size + * @description The size of the model in bytes. + */ + file_size: number; + /** + * Name + * @description Name of the model. + */ + name: string; + /** + * Description + * @description Model description + */ + description: string | null; + /** + * Source + * @description The original source of the model (path, URL or repo_id). + */ + source: string; + /** @description The type of source */ + source_type: components["schemas"]["ModelSourceType"]; + /** + * Source Api Response + * @description The original API response from the source, as stringified JSON. + */ + source_api_response: string | null; + /** + * Source Url + * @description Optional URL for the model (e.g. download page or model page). + */ + source_url: string | null; + /** + * Cover Image + * @description Url for image to preview model + */ + cover_image: string | null; + /** + * Config Path + * @description Path to the config for this model, if any. + */ + config_path: string | null; + /** + * Type + * @default pid_decoder + * @constant + */ + type: "pid_decoder"; + /** + * Format + * @default checkpoint + * @constant + */ + format: "checkpoint"; + /** + * Base + * @default flux2 + * @constant + */ + base: "flux2"; + /** @description Resolution preset of the PiD decoder checkpoint. 
*/ + variant: components["schemas"]["PiDDecoderVariantType"]; + }; + /** + * PiDDecoder_Checkpoint_QwenImage_Config + * @description PiD decoder for the Qwen-Image backbone (16-channel latent). + * + * Shares the 16-channel latent shape with FLUX.1 and SD3, so it relies on the same + * filename / directory-name disambiguation (or a trusted explicit ``base`` override) + * as SD3 - see ``_validate_base``. + */ + PiDDecoder_Checkpoint_QwenImage_Config: { + /** + * Key + * @description A unique key for this model. + */ + key: string; + /** + * Hash + * @description The hash of the model file(s). + */ + hash: string; + /** + * Path + * @description Path to the model on the filesystem. Relative paths are relative to the Invoke root directory. + */ + path: string; + /** + * File Size + * @description The size of the model in bytes. + */ + file_size: number; + /** + * Name + * @description Name of the model. + */ + name: string; + /** + * Description + * @description Model description + */ + description: string | null; + /** + * Source + * @description The original source of the model (path, URL or repo_id). + */ + source: string; + /** @description The type of source */ + source_type: components["schemas"]["ModelSourceType"]; + /** + * Source Api Response + * @description The original API response from the source, as stringified JSON. + */ + source_api_response: string | null; + /** + * Source Url + * @description Optional URL for the model (e.g. download page or model page). + */ + source_url: string | null; + /** + * Cover Image + * @description Url for image to preview model + */ + cover_image: string | null; + /** + * Config Path + * @description Path to the config for this model, if any. 
+ */ + config_path: string | null; + /** + * Type + * @default pid_decoder + * @constant + */ + type: "pid_decoder"; + /** + * Format + * @default checkpoint + * @constant + */ + format: "checkpoint"; + /** + * Base + * @default qwen-image + * @constant + */ + base: "qwen-image"; + /** @description Resolution preset of the PiD decoder checkpoint. */ + variant: components["schemas"]["PiDDecoderVariantType"]; + }; + /** + * PiDDecoder_Checkpoint_SD3_Config + * @description PiD decoder for the Stable Diffusion 3 backbone (16-channel latent). + */ + PiDDecoder_Checkpoint_SD3_Config: { + /** + * Key + * @description A unique key for this model. + */ + key: string; + /** + * Hash + * @description The hash of the model file(s). + */ + hash: string; + /** + * Path + * @description Path to the model on the filesystem. Relative paths are relative to the Invoke root directory. + */ + path: string; + /** + * File Size + * @description The size of the model in bytes. + */ + file_size: number; + /** + * Name + * @description Name of the model. + */ + name: string; + /** + * Description + * @description Model description + */ + description: string | null; + /** + * Source + * @description The original source of the model (path, URL or repo_id). + */ + source: string; + /** @description The type of source */ + source_type: components["schemas"]["ModelSourceType"]; + /** + * Source Api Response + * @description The original API response from the source, as stringified JSON. + */ + source_api_response: string | null; + /** + * Source Url + * @description Optional URL for the model (e.g. download page or model page). + */ + source_url: string | null; + /** + * Cover Image + * @description Url for image to preview model + */ + cover_image: string | null; + /** + * Config Path + * @description Path to the config for this model, if any. 
+ */ + config_path: string | null; + /** + * Type + * @default pid_decoder + * @constant + */ + type: "pid_decoder"; + /** + * Format + * @default checkpoint + * @constant + */ + format: "checkpoint"; + /** + * Base + * @default sd-3 + * @constant + */ + base: "sd-3"; + /** @description Resolution preset of the PiD decoder checkpoint. */ + variant: components["schemas"]["PiDDecoderVariantType"]; + }; + /** + * PiDDecoder_Checkpoint_SDXL_Config + * @description PiD decoder for the SDXL backbone (4-channel latent). + */ + PiDDecoder_Checkpoint_SDXL_Config: { + /** + * Key + * @description A unique key for this model. + */ + key: string; + /** + * Hash + * @description The hash of the model file(s). + */ + hash: string; + /** + * Path + * @description Path to the model on the filesystem. Relative paths are relative to the Invoke root directory. + */ + path: string; + /** + * File Size + * @description The size of the model in bytes. + */ + file_size: number; + /** + * Name + * @description Name of the model. + */ + name: string; + /** + * Description + * @description Model description + */ + description: string | null; + /** + * Source + * @description The original source of the model (path, URL or repo_id). + */ + source: string; + /** @description The type of source */ + source_type: components["schemas"]["ModelSourceType"]; + /** + * Source Api Response + * @description The original API response from the source, as stringified JSON. + */ + source_api_response: string | null; + /** + * Source Url + * @description Optional URL for the model (e.g. download page or model page). + */ + source_url: string | null; + /** + * Cover Image + * @description Url for image to preview model + */ + cover_image: string | null; + /** + * Config Path + * @description Path to the config for this model, if any. 
+ */ + config_path: string | null; + /** + * Type + * @default pid_decoder + * @constant + */ + type: "pid_decoder"; + /** + * Format + * @default checkpoint + * @constant + */ + format: "checkpoint"; + /** + * Base + * @default sdxl + * @constant + */ + base: "sdxl"; + /** @description Resolution preset of the PiD decoder checkpoint. */ + variant: components["schemas"]["PiDDecoderVariantType"]; + }; + /** + * PiD Upscale (4x) - FLUX VAE + * @description Upscale any image 4x via FLUX VAE encode + PiD pixel-diffusion decode. + * + * Works for source images that the FLUX VAE can encode (i.e. natural + * photos / generated images at any size that lands on the VAE's 8-pixel + * grid). The caption is used to condition the PiD decoder; leaving it + * empty produces an unconditional decode and is the cheapest option, but + * the model was distilled with rich captions and benefits from one. + */ + PiDUpscaleInvocation: { + /** + * @description The board to save the image to + * @default null + */ + board?: components["schemas"]["BoardField"] | null; + /** + * @description Optional metadata to be saved with the image + * @default null + */ + metadata?: components["schemas"]["MetadataField"] | null; + /** + * Id + * @description The id of this instance of an invocation. Must be unique among all instances of invocations. + */ + id: string; + /** + * Is Intermediate + * @description Whether or not this is an intermediate invocation. + * @default false + */ + is_intermediate?: boolean; + /** + * Use Cache + * @description Whether or not to use the cache + * @default true + */ + use_cache?: boolean; + /** + * @description Image to upscale. + * @default null + */ + image?: components["schemas"]["ImageField"] | null; + /** + * @description FLUX-compatible VAE (FLUX.1, Z-Image, anything sharing the 16-channel encoder). + * @default null + */ + vae?: components["schemas"]["VAEField"] | null; + /** + * Gemma-2 Encoder + * @description Gemma-2 caption encoder. Required by PiD. 
+ * @default null + */ + gemma2_encoder?: components["schemas"]["Gemma2EncoderField"] | null; + /** + * PiD Decoder + * @description PiD FLUX decoder checkpoint. + * @default null + */ + pid_decoder?: components["schemas"]["PiDDecoderField"] | null; + /** + * Prompt + * @description Optional caption describing the image. Empty -> empty-caption decode. + * @default + */ + prompt?: string; + /** + * Num Inference Steps + * @description Number of PiD distill steps. The released checkpoints are trained for 4. + * @default 4 + */ + num_inference_steps?: number; + /** + * Seed + * @description Seed for the PiD decoder's noise. + * @default 0 + */ + seed?: number; + /** + * type + * @default pid_upscale + * @constant + */ + type: "pid_upscale"; + }; + /** + * PiDiNet Edge Detection + * @description Generates an edge map using PiDiNet. + */ + PiDiNetEdgeDetectionInvocation: { + /** + * @description The board to save the image to + * @default null + */ + board?: components["schemas"]["BoardField"] | null; + /** + * @description Optional metadata to be saved with the image + * @default null + */ metadata?: components["schemas"]["MetadataField"] | null; /** * Id @@ -25973,6 +26868,89 @@ export type components = { */ type: "qwen_image_model_loader_output"; }; + /** + * Latents to Image - Qwen-Image + PiD (4x SR) + * @description Decode a Qwen-Image latent with the PiD pixel-diffusion decoder. + * + * Produces a 4x super-resolved image in a single pass. The 5D Qwen latent is + * reduced to 2D and per-channel denormalized (``z * std + mean``) before PiD. + */ + QwenImagePiDDecodeInvocation: { + /** + * @description The board to save the image to + * @default null + */ + board?: components["schemas"]["BoardField"] | null; + /** + * @description Optional metadata to be saved with the image + * @default null + */ + metadata?: components["schemas"]["MetadataField"] | null; + /** + * Id + * @description The id of this instance of an invocation. 
Must be unique among all instances of invocations. + */ + id: string; + /** + * Is Intermediate + * @description Whether or not this is an intermediate invocation. + * @default false + */ + is_intermediate?: boolean; + /** + * Use Cache + * @description Whether or not to use the cache + * @default true + */ + use_cache?: boolean; + /** + * @description Latents tensor + * @default null + */ + latents?: components["schemas"]["LatentsField"] | null; + /** + * Prompt + * @description Text prompt the latent was generated from. PiD conditions on it. + * @default null + */ + prompt?: string | null; + /** + * Gemma-2 Encoder + * @description Gemma-2 caption encoder. Required by PiD. + * @default null + */ + gemma2_encoder?: components["schemas"]["Gemma2EncoderField"] | null; + /** + * PiD Decoder + * @description PiD Qwen-Image decoder checkpoint. + * @default null + */ + pid_decoder?: components["schemas"]["PiDDecoderField"] | null; + /** + * VAE + * @description Qwen-Image VAE, used to read the per-channel latents_mean / latents_std. If omitted, the diffusers default Qwen-Image constants are used. + * @default null + */ + vae?: components["schemas"]["VAEField"] | null; + /** + * Num Inference Steps + * @description Number of PiD distill steps. The released checkpoints are trained for 4. + * @default 4 + */ + num_inference_steps?: number; + /** + * Seed + * @description Seed for the PiD decoder's noise. + * @default 0 + */ + seed?: number; + /** + * type + * @default qwen_image_pid_decode + * @constant + */ + type: "qwen_image_pid_decode"; + }; /** * Prompt - Qwen Image * @description Encodes text and reference images for Qwen Image using Qwen2.5-VL. @@ -27138,6 +28116,80 @@ export type components = { */ type: "sd3_l2i"; }; + /** + * Latents to Image - SD3 + PiD (4x SR) + * @description Decode an SD3 latent with the PiD pixel-diffusion decoder. 
+ */ + SD3PiDDecodeInvocation: { + /** + * @description The board to save the image to + * @default null + */ + board?: components["schemas"]["BoardField"] | null; + /** + * @description Optional metadata to be saved with the image + * @default null + */ + metadata?: components["schemas"]["MetadataField"] | null; + /** + * Id + * @description The id of this instance of an invocation. Must be unique among all instances of invocations. + */ + id: string; + /** + * Is Intermediate + * @description Whether or not this is an intermediate invocation. + * @default false + */ + is_intermediate?: boolean; + /** + * Use Cache + * @description Whether or not to use the cache + * @default true + */ + use_cache?: boolean; + /** + * @description Latents tensor + * @default null + */ + latents?: components["schemas"]["LatentsField"] | null; + /** + * Prompt + * @description Text prompt the latent was generated from. PiD conditions on it. + * @default null + */ + prompt?: string | null; + /** + * Gemma-2 Encoder + * @description Gemma-2 caption encoder. Required by PiD. + * @default null + */ + gemma2_encoder?: components["schemas"]["Gemma2EncoderField"] | null; + /** + * PiD Decoder + * @description PiD SD3 decoder checkpoint. + * @default null + */ + pid_decoder?: components["schemas"]["PiDDecoderField"] | null; + /** + * Num Inference Steps + * @description Number of PiD distill steps. The released checkpoints are trained for 4. + * @default 4 + */ + num_inference_steps?: number; + /** + * Seed + * @description Seed for the PiD decoder's noise. + * @default 0 + */ + seed?: number; + /** + * type + * @default sd3_pid_decode + * @constant + */ + type: "sd3_pid_decode"; + }; /** * Prompt - SDXL * @description Parse prompt using compel package to conditioning. @@ -27434,6 +28486,90 @@ export type components = { */ type: "sdxl_model_loader_output"; }; + /** + * Latents to Image - SDXL + PiD (4x SR) + * @description Decode an SDXL latent with the PiD pixel-diffusion decoder. 
+ * + * Produces a 4x super-resolved image in a single pass. The SDXL latent is + * 4-channel at an 8x down-factor, so it is denormalized (``z / scaling_factor``) + * and handed straight to PiD - no packing needed. + */ + SDXLPiDDecodeInvocation: { + /** + * @description The board to save the image to + * @default null + */ + board?: components["schemas"]["BoardField"] | null; + /** + * @description Optional metadata to be saved with the image + * @default null + */ + metadata?: components["schemas"]["MetadataField"] | null; + /** + * Id + * @description The id of this instance of an invocation. Must be unique among all instances of invocations. + */ + id: string; + /** + * Is Intermediate + * @description Whether or not this is an intermediate invocation. + * @default false + */ + is_intermediate?: boolean; + /** + * Use Cache + * @description Whether or not to use the cache + * @default true + */ + use_cache?: boolean; + /** + * @description Latents tensor + * @default null + */ + latents?: components["schemas"]["LatentsField"] | null; + /** + * Prompt + * @description Text prompt the latent was generated from. PiD conditions on it. + * @default null + */ + prompt?: string | null; + /** + * Gemma-2 Encoder + * @description Gemma-2 caption encoder. Required by PiD. + * @default null + */ + gemma2_encoder?: components["schemas"]["Gemma2EncoderField"] | null; + /** + * PiD Decoder + * @description PiD SDXL decoder checkpoint. + * @default null + */ + pid_decoder?: components["schemas"]["PiDDecoderField"] | null; + /** + * VAE + * @description SDXL VAE, used to read scaling_factor / shift_factor. If omitted, the SDXL fallback constants (0.13025 / 0.0) are used. + * @default null + */ + vae?: components["schemas"]["VAEField"] | null; + /** + * Num Inference Steps + * @description Number of PiD distill steps. The released checkpoints are trained for 4. + * @default 4 + */ + num_inference_steps?: number; + /** + * Seed + * @description Seed for the PiD decoder's noise. 
+ * @default 0 + */ + seed?: number; + /** + * type + * @default sdxl_pid_decode + * @constant + */ + type: "sdxl_pid_decode"; + }; /** * Prompt - SDXL Refiner * @description Parse prompt using compel package to conditioning. @@ -28854,7 +29990,7 @@ export type components = { type: components["schemas"]["ModelType"]; format?: components["schemas"]["ModelFormat"] | null; /** Variant */ - variant?: components["schemas"]["ModelVariantType"] | components["schemas"]["ClipVariantType"] | components["schemas"]["FluxVariantType"] | components["schemas"]["Flux2VariantType"] | components["schemas"]["ZImageVariantType"] | components["schemas"]["QwenImageVariantType"] | components["schemas"]["Qwen3VariantType"] | null; + variant?: components["schemas"]["ModelVariantType"] | components["schemas"]["ClipVariantType"] | components["schemas"]["FluxVariantType"] | components["schemas"]["Flux2VariantType"] | components["schemas"]["ZImageVariantType"] | components["schemas"]["QwenImageVariantType"] | components["schemas"]["Qwen3VariantType"] | components["schemas"]["PiDDecoderVariantType"] | null; /** * Is Installed * @default false @@ -28899,7 +30035,7 @@ export type components = { type: components["schemas"]["ModelType"]; format?: components["schemas"]["ModelFormat"] | null; /** Variant */ - variant?: components["schemas"]["ModelVariantType"] | components["schemas"]["ClipVariantType"] | components["schemas"]["FluxVariantType"] | components["schemas"]["Flux2VariantType"] | components["schemas"]["ZImageVariantType"] | components["schemas"]["QwenImageVariantType"] | components["schemas"]["Qwen3VariantType"] | null; + variant?: components["schemas"]["ModelVariantType"] | components["schemas"]["ClipVariantType"] | components["schemas"]["FluxVariantType"] | components["schemas"]["Flux2VariantType"] | components["schemas"]["ZImageVariantType"] | components["schemas"]["QwenImageVariantType"] | components["schemas"]["Qwen3VariantType"] | components["schemas"]["PiDDecoderVariantType"] | null; 
/** * Is Installed * @default false @@ -29430,7 +30566,7 @@ export type components = { path_or_prefix: string; model_type: components["schemas"]["ModelType"]; /** Variant */ - variant?: components["schemas"]["ModelVariantType"] | components["schemas"]["ClipVariantType"] | components["schemas"]["FluxVariantType"] | components["schemas"]["Flux2VariantType"] | components["schemas"]["ZImageVariantType"] | components["schemas"]["QwenImageVariantType"] | components["schemas"]["Qwen3VariantType"] | null; + variant?: components["schemas"]["ModelVariantType"] | components["schemas"]["ClipVariantType"] | components["schemas"]["FluxVariantType"] | components["schemas"]["Flux2VariantType"] | components["schemas"]["ZImageVariantType"] | components["schemas"]["QwenImageVariantType"] | components["schemas"]["Qwen3VariantType"] | components["schemas"]["PiDDecoderVariantType"] | null; }; /** * Subtract Integers @@ -33099,6 +34235,90 @@ export type components = { */ type: "z_image_model_loader_output"; }; + /** + * Latents to Image - Z-Image + PiD (4x SR) + * @description Decode a Z-Image latent with the PiD pixel-diffusion decoder. + * + * Produces a 4x super-resolved image in a single pass (Z-Image decoder is + * trained on FLUX.1 latents; ``sr_scale=4`` with the FLUX VAE's 8x spatial + * down-factor gives a 32x linear scale from latent to pixel). + */ + ZImagePiDDecodeInvocation: { + /** + * @description The board to save the image to + * @default null + */ + board?: components["schemas"]["BoardField"] | null; + /** + * @description Optional metadata to be saved with the image + * @default null + */ + metadata?: components["schemas"]["MetadataField"] | null; + /** + * Id + * @description The id of this instance of an invocation. Must be unique among all instances of invocations. + */ + id: string; + /** + * Is Intermediate + * @description Whether or not this is an intermediate invocation. 
+ * @default false + */ + is_intermediate?: boolean; + /** + * Use Cache + * @description Whether or not to use the cache + * @default true + */ + use_cache?: boolean; + /** + * @description Latents tensor + * @default null + */ + latents?: components["schemas"]["LatentsField"] | null; + /** + * Prompt + * @description Text prompt the latent was generated from. PiD conditions on it. + * @default null + */ + prompt?: string | null; + /** + * Gemma-2 Encoder + * @description Gemma-2 caption encoder. Required by PiD. + * @default null + */ + gemma2_encoder?: components["schemas"]["Gemma2EncoderField"] | null; + /** + * PiD Decoder + * @description PiD FLUX decoder checkpoint. + * @default null + */ + pid_decoder?: components["schemas"]["PiDDecoderField"] | null; + /** + * VAE + * @description Z-Image VAE used to read scaling_factor / shift_factor. If omitted, the FLUX.1 fallback constants (0.3611 / 0.1159) are used. + * @default null + */ + vae?: components["schemas"]["VAEField"] | null; + /** + * Num Inference Steps + * @description Number of PiD distill steps. The released checkpoints are trained for 4. + * @default 4 + */ + num_inference_steps?: number; + /** + * Seed + * @description Seed for the PiD decoder's noise. + * @default 0 + */ + seed?: number; + /** + * type + * @default z_image_pid_decode + * @constant + */ + type: "z_image_pid_decode"; + }; /** * Seed Variance Enhancer - Z-Image * @description Adds seed-based noise to Z-Image conditioning to increase variance between seeds. 
@@ -33736,7 +34956,7 @@ export interface operations { [name: string]: unknown; }; content: { - "application/json": components["schemas"]["Main_Diffusers_SD1_Config"] | components["schemas"]["Main_Diffusers_SD2_Config"] | components["schemas"]["Main_Diffusers_SDXL_Config"] | components["schemas"]["Main_Diffusers_SDXLRefiner_Config"] | components["schemas"]["Main_Diffusers_SD3_Config"] | components["schemas"]["Main_Diffusers_FLUX_Config"] | components["schemas"]["Main_Diffusers_Flux2_Config"] | components["schemas"]["Main_Diffusers_CogView4_Config"] | components["schemas"]["Main_Diffusers_QwenImage_Config"] | components["schemas"]["Main_Diffusers_ZImage_Config"] | components["schemas"]["Main_Checkpoint_SD1_Config"] | components["schemas"]["Main_Checkpoint_SD2_Config"] | components["schemas"]["Main_Checkpoint_SDXL_Config"] | components["schemas"]["Main_Checkpoint_SDXLRefiner_Config"] | components["schemas"]["Main_Checkpoint_Flux2_Config"] | components["schemas"]["Main_Checkpoint_FLUX_Config"] | components["schemas"]["Main_Checkpoint_QwenImage_Config"] | components["schemas"]["Main_Checkpoint_ZImage_Config"] | components["schemas"]["Main_Checkpoint_Anima_Config"] | components["schemas"]["Main_BnBNF4_FLUX_Config"] | components["schemas"]["Main_GGUF_Flux2_Config"] | components["schemas"]["Main_GGUF_FLUX_Config"] | components["schemas"]["Main_GGUF_QwenImage_Config"] | components["schemas"]["Main_GGUF_ZImage_Config"] | components["schemas"]["VAE_Checkpoint_SD1_Config"] | components["schemas"]["VAE_Checkpoint_SD2_Config"] | components["schemas"]["VAE_Checkpoint_SDXL_Config"] | components["schemas"]["VAE_Checkpoint_FLUX_Config"] | components["schemas"]["VAE_Checkpoint_Flux2_Config"] | components["schemas"]["VAE_Checkpoint_QwenImage_Config"] | components["schemas"]["VAE_Checkpoint_Anima_Config"] | components["schemas"]["VAE_Diffusers_SD1_Config"] | components["schemas"]["VAE_Diffusers_SDXL_Config"] | components["schemas"]["VAE_Diffusers_Flux2_Config"] | 
components["schemas"]["ControlNet_Checkpoint_SD1_Config"] | components["schemas"]["ControlNet_Checkpoint_SD2_Config"] | components["schemas"]["ControlNet_Checkpoint_SDXL_Config"] | components["schemas"]["ControlNet_Checkpoint_FLUX_Config"] | components["schemas"]["ControlNet_Checkpoint_ZImage_Config"] | components["schemas"]["ControlNet_Diffusers_SD1_Config"] | components["schemas"]["ControlNet_Diffusers_SD2_Config"] | components["schemas"]["ControlNet_Diffusers_SDXL_Config"] | components["schemas"]["ControlNet_Diffusers_FLUX_Config"] | components["schemas"]["LoRA_LyCORIS_SD1_Config"] | components["schemas"]["LoRA_LyCORIS_SD2_Config"] | components["schemas"]["LoRA_LyCORIS_SDXL_Config"] | components["schemas"]["LoRA_LyCORIS_Flux2_Config"] | components["schemas"]["LoRA_LyCORIS_FLUX_Config"] | components["schemas"]["LoRA_LyCORIS_ZImage_Config"] | components["schemas"]["LoRA_LyCORIS_QwenImage_Config"] | components["schemas"]["LoRA_LyCORIS_Anima_Config"] | components["schemas"]["LoRA_OMI_SDXL_Config"] | components["schemas"]["LoRA_OMI_FLUX_Config"] | components["schemas"]["LoRA_Diffusers_SD1_Config"] | components["schemas"]["LoRA_Diffusers_SD2_Config"] | components["schemas"]["LoRA_Diffusers_SDXL_Config"] | components["schemas"]["LoRA_Diffusers_Flux2_Config"] | components["schemas"]["LoRA_Diffusers_FLUX_Config"] | components["schemas"]["LoRA_Diffusers_ZImage_Config"] | components["schemas"]["ControlLoRA_LyCORIS_FLUX_Config"] | components["schemas"]["T5Encoder_T5Encoder_Config"] | components["schemas"]["T5Encoder_BnBLLMint8_Config"] | components["schemas"]["Qwen3Encoder_Qwen3Encoder_Config"] | components["schemas"]["Qwen3Encoder_Checkpoint_Config"] | components["schemas"]["Qwen3Encoder_GGUF_Config"] | components["schemas"]["QwenVLEncoder_Diffusers_Config"] | components["schemas"]["QwenVLEncoder_Checkpoint_Config"] | components["schemas"]["TI_File_SD1_Config"] | components["schemas"]["TI_File_SD2_Config"] | components["schemas"]["TI_File_SDXL_Config"] | 
components["schemas"]["TI_Folder_SD1_Config"] | components["schemas"]["TI_Folder_SD2_Config"] | components["schemas"]["TI_Folder_SDXL_Config"] | components["schemas"]["IPAdapter_InvokeAI_SD1_Config"] | components["schemas"]["IPAdapter_InvokeAI_SD2_Config"] | components["schemas"]["IPAdapter_InvokeAI_SDXL_Config"] | components["schemas"]["IPAdapter_Checkpoint_SD1_Config"] | components["schemas"]["IPAdapter_Checkpoint_SD2_Config"] | components["schemas"]["IPAdapter_Checkpoint_SDXL_Config"] | components["schemas"]["IPAdapter_Checkpoint_FLUX_Config"] | components["schemas"]["T2IAdapter_Diffusers_SD1_Config"] | components["schemas"]["T2IAdapter_Diffusers_SDXL_Config"] | components["schemas"]["Spandrel_Checkpoint_Config"] | components["schemas"]["CLIPEmbed_Diffusers_G_Config"] | components["schemas"]["CLIPEmbed_Diffusers_L_Config"] | components["schemas"]["CLIPVision_Diffusers_Config"] | components["schemas"]["SigLIP_Diffusers_Config"] | components["schemas"]["FLUXRedux_Checkpoint_Config"] | components["schemas"]["LlavaOnevision_Diffusers_Config"] | components["schemas"]["TextLLM_Diffusers_Config"] | components["schemas"]["ExternalApiModelConfig"] | components["schemas"]["Unknown_Config"]; + "application/json": components["schemas"]["Main_Diffusers_SD1_Config"] | components["schemas"]["Main_Diffusers_SD2_Config"] | components["schemas"]["Main_Diffusers_SDXL_Config"] | components["schemas"]["Main_Diffusers_SDXLRefiner_Config"] | components["schemas"]["Main_Diffusers_SD3_Config"] | components["schemas"]["Main_Diffusers_FLUX_Config"] | components["schemas"]["Main_Diffusers_Flux2_Config"] | components["schemas"]["Main_Diffusers_CogView4_Config"] | components["schemas"]["Main_Diffusers_QwenImage_Config"] | components["schemas"]["Main_Diffusers_ZImage_Config"] | components["schemas"]["Main_Checkpoint_SD1_Config"] | components["schemas"]["Main_Checkpoint_SD2_Config"] | components["schemas"]["Main_Checkpoint_SDXL_Config"] | 
components["schemas"]["Main_Checkpoint_SDXLRefiner_Config"] | components["schemas"]["Main_Checkpoint_Flux2_Config"] | components["schemas"]["Main_Checkpoint_FLUX_Config"] | components["schemas"]["Main_Checkpoint_QwenImage_Config"] | components["schemas"]["Main_Checkpoint_ZImage_Config"] | components["schemas"]["Main_Checkpoint_Anima_Config"] | components["schemas"]["Main_BnBNF4_FLUX_Config"] | components["schemas"]["Main_GGUF_Flux2_Config"] | components["schemas"]["Main_GGUF_FLUX_Config"] | components["schemas"]["Main_GGUF_QwenImage_Config"] | components["schemas"]["Main_GGUF_ZImage_Config"] | components["schemas"]["VAE_Checkpoint_SD1_Config"] | components["schemas"]["VAE_Checkpoint_SD2_Config"] | components["schemas"]["VAE_Checkpoint_SDXL_Config"] | components["schemas"]["VAE_Checkpoint_FLUX_Config"] | components["schemas"]["VAE_Checkpoint_Flux2_Config"] | components["schemas"]["VAE_Checkpoint_QwenImage_Config"] | components["schemas"]["VAE_Checkpoint_Anima_Config"] | components["schemas"]["VAE_Diffusers_SD1_Config"] | components["schemas"]["VAE_Diffusers_SDXL_Config"] | components["schemas"]["VAE_Diffusers_Flux2_Config"] | components["schemas"]["PiDDecoder_Checkpoint_FLUX_Config"] | components["schemas"]["PiDDecoder_Checkpoint_Flux2_Config"] | components["schemas"]["PiDDecoder_Checkpoint_SD3_Config"] | components["schemas"]["PiDDecoder_Checkpoint_SDXL_Config"] | components["schemas"]["PiDDecoder_Checkpoint_QwenImage_Config"] | components["schemas"]["ControlNet_Checkpoint_SD1_Config"] | components["schemas"]["ControlNet_Checkpoint_SD2_Config"] | components["schemas"]["ControlNet_Checkpoint_SDXL_Config"] | components["schemas"]["ControlNet_Checkpoint_FLUX_Config"] | components["schemas"]["ControlNet_Checkpoint_ZImage_Config"] | components["schemas"]["ControlNet_Diffusers_SD1_Config"] | components["schemas"]["ControlNet_Diffusers_SD2_Config"] | components["schemas"]["ControlNet_Diffusers_SDXL_Config"] | components["schemas"]["ControlNet_Diffusers_FLUX_Config"] | 
components["schemas"]["LoRA_LyCORIS_SD1_Config"] | components["schemas"]["LoRA_LyCORIS_SD2_Config"] | components["schemas"]["LoRA_LyCORIS_SDXL_Config"] | components["schemas"]["LoRA_LyCORIS_Flux2_Config"] | components["schemas"]["LoRA_LyCORIS_FLUX_Config"] | components["schemas"]["LoRA_LyCORIS_ZImage_Config"] | components["schemas"]["LoRA_LyCORIS_QwenImage_Config"] | components["schemas"]["LoRA_LyCORIS_Anima_Config"] | components["schemas"]["LoRA_OMI_SDXL_Config"] | components["schemas"]["LoRA_OMI_FLUX_Config"] | components["schemas"]["LoRA_Diffusers_SD1_Config"] | components["schemas"]["LoRA_Diffusers_SD2_Config"] | components["schemas"]["LoRA_Diffusers_SDXL_Config"] | components["schemas"]["LoRA_Diffusers_Flux2_Config"] | components["schemas"]["LoRA_Diffusers_FLUX_Config"] | components["schemas"]["LoRA_Diffusers_ZImage_Config"] | components["schemas"]["ControlLoRA_LyCORIS_FLUX_Config"] | components["schemas"]["T5Encoder_T5Encoder_Config"] | components["schemas"]["T5Encoder_BnBLLMint8_Config"] | components["schemas"]["Qwen3Encoder_Qwen3Encoder_Config"] | components["schemas"]["Qwen3Encoder_Checkpoint_Config"] | components["schemas"]["Qwen3Encoder_GGUF_Config"] | components["schemas"]["Gemma2Encoder_Gemma2Encoder_Config"] | components["schemas"]["QwenVLEncoder_Diffusers_Config"] | components["schemas"]["QwenVLEncoder_Checkpoint_Config"] | components["schemas"]["TI_File_SD1_Config"] | components["schemas"]["TI_File_SD2_Config"] | components["schemas"]["TI_File_SDXL_Config"] | components["schemas"]["TI_Folder_SD1_Config"] | components["schemas"]["TI_Folder_SD2_Config"] | components["schemas"]["TI_Folder_SDXL_Config"] | components["schemas"]["IPAdapter_InvokeAI_SD1_Config"] | components["schemas"]["IPAdapter_InvokeAI_SD2_Config"] | components["schemas"]["IPAdapter_InvokeAI_SDXL_Config"] | components["schemas"]["IPAdapter_Checkpoint_SD1_Config"] | components["schemas"]["IPAdapter_Checkpoint_SD2_Config"] | components["schemas"]["IPAdapter_Checkpoint_SDXL_Config"] | 
components["schemas"]["IPAdapter_Checkpoint_FLUX_Config"] | components["schemas"]["T2IAdapter_Diffusers_SD1_Config"] | components["schemas"]["T2IAdapter_Diffusers_SDXL_Config"] | components["schemas"]["Spandrel_Checkpoint_Config"] | components["schemas"]["CLIPEmbed_Diffusers_G_Config"] | components["schemas"]["CLIPEmbed_Diffusers_L_Config"] | components["schemas"]["CLIPVision_Diffusers_Config"] | components["schemas"]["SigLIP_Diffusers_Config"] | components["schemas"]["FLUXRedux_Checkpoint_Config"] | components["schemas"]["LlavaOnevision_Diffusers_Config"] | components["schemas"]["TextLLM_Diffusers_Config"] | components["schemas"]["ExternalApiModelConfig"] | components["schemas"]["Unknown_Config"]; }; }; /** @description Validation Error */ @@ -33768,7 +34988,7 @@ export interface operations { [name: string]: unknown; }; content: { - "application/json": components["schemas"]["Main_Diffusers_SD1_Config"] | components["schemas"]["Main_Diffusers_SD2_Config"] | components["schemas"]["Main_Diffusers_SDXL_Config"] | components["schemas"]["Main_Diffusers_SDXLRefiner_Config"] | components["schemas"]["Main_Diffusers_SD3_Config"] | components["schemas"]["Main_Diffusers_FLUX_Config"] | components["schemas"]["Main_Diffusers_Flux2_Config"] | components["schemas"]["Main_Diffusers_CogView4_Config"] | components["schemas"]["Main_Diffusers_QwenImage_Config"] | components["schemas"]["Main_Diffusers_ZImage_Config"] | components["schemas"]["Main_Checkpoint_SD1_Config"] | components["schemas"]["Main_Checkpoint_SD2_Config"] | components["schemas"]["Main_Checkpoint_SDXL_Config"] | components["schemas"]["Main_Checkpoint_SDXLRefiner_Config"] | components["schemas"]["Main_Checkpoint_Flux2_Config"] | components["schemas"]["Main_Checkpoint_FLUX_Config"] | components["schemas"]["Main_Checkpoint_QwenImage_Config"] | components["schemas"]["Main_Checkpoint_ZImage_Config"] | components["schemas"]["Main_Checkpoint_Anima_Config"] | components["schemas"]["Main_BnBNF4_FLUX_Config"] | 
components["schemas"]["Main_GGUF_Flux2_Config"] | components["schemas"]["Main_GGUF_FLUX_Config"] | components["schemas"]["Main_GGUF_QwenImage_Config"] | components["schemas"]["Main_GGUF_ZImage_Config"] | components["schemas"]["VAE_Checkpoint_SD1_Config"] | components["schemas"]["VAE_Checkpoint_SD2_Config"] | components["schemas"]["VAE_Checkpoint_SDXL_Config"] | components["schemas"]["VAE_Checkpoint_FLUX_Config"] | components["schemas"]["VAE_Checkpoint_Flux2_Config"] | components["schemas"]["VAE_Checkpoint_QwenImage_Config"] | components["schemas"]["VAE_Checkpoint_Anima_Config"] | components["schemas"]["VAE_Diffusers_SD1_Config"] | components["schemas"]["VAE_Diffusers_SDXL_Config"] | components["schemas"]["VAE_Diffusers_Flux2_Config"] | components["schemas"]["ControlNet_Checkpoint_SD1_Config"] | components["schemas"]["ControlNet_Checkpoint_SD2_Config"] | components["schemas"]["ControlNet_Checkpoint_SDXL_Config"] | components["schemas"]["ControlNet_Checkpoint_FLUX_Config"] | components["schemas"]["ControlNet_Checkpoint_ZImage_Config"] | components["schemas"]["ControlNet_Diffusers_SD1_Config"] | components["schemas"]["ControlNet_Diffusers_SD2_Config"] | components["schemas"]["ControlNet_Diffusers_SDXL_Config"] | components["schemas"]["ControlNet_Diffusers_FLUX_Config"] | components["schemas"]["LoRA_LyCORIS_SD1_Config"] | components["schemas"]["LoRA_LyCORIS_SD2_Config"] | components["schemas"]["LoRA_LyCORIS_SDXL_Config"] | components["schemas"]["LoRA_LyCORIS_Flux2_Config"] | components["schemas"]["LoRA_LyCORIS_FLUX_Config"] | components["schemas"]["LoRA_LyCORIS_ZImage_Config"] | components["schemas"]["LoRA_LyCORIS_QwenImage_Config"] | components["schemas"]["LoRA_LyCORIS_Anima_Config"] | components["schemas"]["LoRA_OMI_SDXL_Config"] | components["schemas"]["LoRA_OMI_FLUX_Config"] | components["schemas"]["LoRA_Diffusers_SD1_Config"] | components["schemas"]["LoRA_Diffusers_SD2_Config"] | components["schemas"]["LoRA_Diffusers_SDXL_Config"] | 
components["schemas"]["LoRA_Diffusers_Flux2_Config"] | components["schemas"]["LoRA_Diffusers_FLUX_Config"] | components["schemas"]["LoRA_Diffusers_ZImage_Config"] | components["schemas"]["ControlLoRA_LyCORIS_FLUX_Config"] | components["schemas"]["T5Encoder_T5Encoder_Config"] | components["schemas"]["T5Encoder_BnBLLMint8_Config"] | components["schemas"]["Qwen3Encoder_Qwen3Encoder_Config"] | components["schemas"]["Qwen3Encoder_Checkpoint_Config"] | components["schemas"]["Qwen3Encoder_GGUF_Config"] | components["schemas"]["QwenVLEncoder_Diffusers_Config"] | components["schemas"]["QwenVLEncoder_Checkpoint_Config"] | components["schemas"]["TI_File_SD1_Config"] | components["schemas"]["TI_File_SD2_Config"] | components["schemas"]["TI_File_SDXL_Config"] | components["schemas"]["TI_Folder_SD1_Config"] | components["schemas"]["TI_Folder_SD2_Config"] | components["schemas"]["TI_Folder_SDXL_Config"] | components["schemas"]["IPAdapter_InvokeAI_SD1_Config"] | components["schemas"]["IPAdapter_InvokeAI_SD2_Config"] | components["schemas"]["IPAdapter_InvokeAI_SDXL_Config"] | components["schemas"]["IPAdapter_Checkpoint_SD1_Config"] | components["schemas"]["IPAdapter_Checkpoint_SD2_Config"] | components["schemas"]["IPAdapter_Checkpoint_SDXL_Config"] | components["schemas"]["IPAdapter_Checkpoint_FLUX_Config"] | components["schemas"]["T2IAdapter_Diffusers_SD1_Config"] | components["schemas"]["T2IAdapter_Diffusers_SDXL_Config"] | components["schemas"]["Spandrel_Checkpoint_Config"] | components["schemas"]["CLIPEmbed_Diffusers_G_Config"] | components["schemas"]["CLIPEmbed_Diffusers_L_Config"] | components["schemas"]["CLIPVision_Diffusers_Config"] | components["schemas"]["SigLIP_Diffusers_Config"] | components["schemas"]["FLUXRedux_Checkpoint_Config"] | components["schemas"]["LlavaOnevision_Diffusers_Config"] | components["schemas"]["TextLLM_Diffusers_Config"] | components["schemas"]["ExternalApiModelConfig"] | components["schemas"]["Unknown_Config"]; + "application/json": 
components["schemas"]["Main_Diffusers_SD1_Config"] | components["schemas"]["Main_Diffusers_SD2_Config"] | components["schemas"]["Main_Diffusers_SDXL_Config"] | components["schemas"]["Main_Diffusers_SDXLRefiner_Config"] | components["schemas"]["Main_Diffusers_SD3_Config"] | components["schemas"]["Main_Diffusers_FLUX_Config"] | components["schemas"]["Main_Diffusers_Flux2_Config"] | components["schemas"]["Main_Diffusers_CogView4_Config"] | components["schemas"]["Main_Diffusers_QwenImage_Config"] | components["schemas"]["Main_Diffusers_ZImage_Config"] | components["schemas"]["Main_Checkpoint_SD1_Config"] | components["schemas"]["Main_Checkpoint_SD2_Config"] | components["schemas"]["Main_Checkpoint_SDXL_Config"] | components["schemas"]["Main_Checkpoint_SDXLRefiner_Config"] | components["schemas"]["Main_Checkpoint_Flux2_Config"] | components["schemas"]["Main_Checkpoint_FLUX_Config"] | components["schemas"]["Main_Checkpoint_QwenImage_Config"] | components["schemas"]["Main_Checkpoint_ZImage_Config"] | components["schemas"]["Main_Checkpoint_Anima_Config"] | components["schemas"]["Main_BnBNF4_FLUX_Config"] | components["schemas"]["Main_GGUF_Flux2_Config"] | components["schemas"]["Main_GGUF_FLUX_Config"] | components["schemas"]["Main_GGUF_QwenImage_Config"] | components["schemas"]["Main_GGUF_ZImage_Config"] | components["schemas"]["VAE_Checkpoint_SD1_Config"] | components["schemas"]["VAE_Checkpoint_SD2_Config"] | components["schemas"]["VAE_Checkpoint_SDXL_Config"] | components["schemas"]["VAE_Checkpoint_FLUX_Config"] | components["schemas"]["VAE_Checkpoint_Flux2_Config"] | components["schemas"]["VAE_Checkpoint_QwenImage_Config"] | components["schemas"]["VAE_Checkpoint_Anima_Config"] | components["schemas"]["VAE_Diffusers_SD1_Config"] | components["schemas"]["VAE_Diffusers_SDXL_Config"] | components["schemas"]["VAE_Diffusers_Flux2_Config"] | components["schemas"]["PiDDecoder_Checkpoint_FLUX_Config"] | components["schemas"]["PiDDecoder_Checkpoint_Flux2_Config"] | 
components["schemas"]["PiDDecoder_Checkpoint_SD3_Config"] | components["schemas"]["PiDDecoder_Checkpoint_SDXL_Config"] | components["schemas"]["PiDDecoder_Checkpoint_QwenImage_Config"] | components["schemas"]["ControlNet_Checkpoint_SD1_Config"] | components["schemas"]["ControlNet_Checkpoint_SD2_Config"] | components["schemas"]["ControlNet_Checkpoint_SDXL_Config"] | components["schemas"]["ControlNet_Checkpoint_FLUX_Config"] | components["schemas"]["ControlNet_Checkpoint_ZImage_Config"] | components["schemas"]["ControlNet_Diffusers_SD1_Config"] | components["schemas"]["ControlNet_Diffusers_SD2_Config"] | components["schemas"]["ControlNet_Diffusers_SDXL_Config"] | components["schemas"]["ControlNet_Diffusers_FLUX_Config"] | components["schemas"]["LoRA_LyCORIS_SD1_Config"] | components["schemas"]["LoRA_LyCORIS_SD2_Config"] | components["schemas"]["LoRA_LyCORIS_SDXL_Config"] | components["schemas"]["LoRA_LyCORIS_Flux2_Config"] | components["schemas"]["LoRA_LyCORIS_FLUX_Config"] | components["schemas"]["LoRA_LyCORIS_ZImage_Config"] | components["schemas"]["LoRA_LyCORIS_QwenImage_Config"] | components["schemas"]["LoRA_LyCORIS_Anima_Config"] | components["schemas"]["LoRA_OMI_SDXL_Config"] | components["schemas"]["LoRA_OMI_FLUX_Config"] | components["schemas"]["LoRA_Diffusers_SD1_Config"] | components["schemas"]["LoRA_Diffusers_SD2_Config"] | components["schemas"]["LoRA_Diffusers_SDXL_Config"] | components["schemas"]["LoRA_Diffusers_Flux2_Config"] | components["schemas"]["LoRA_Diffusers_FLUX_Config"] | components["schemas"]["LoRA_Diffusers_ZImage_Config"] | components["schemas"]["ControlLoRA_LyCORIS_FLUX_Config"] | components["schemas"]["T5Encoder_T5Encoder_Config"] | components["schemas"]["T5Encoder_BnBLLMint8_Config"] | components["schemas"]["Qwen3Encoder_Qwen3Encoder_Config"] | components["schemas"]["Qwen3Encoder_Checkpoint_Config"] | components["schemas"]["Qwen3Encoder_GGUF_Config"] | components["schemas"]["Gemma2Encoder_Gemma2Encoder_Config"] | 
components["schemas"]["QwenVLEncoder_Diffusers_Config"] | components["schemas"]["QwenVLEncoder_Checkpoint_Config"] | components["schemas"]["TI_File_SD1_Config"] | components["schemas"]["TI_File_SD2_Config"] | components["schemas"]["TI_File_SDXL_Config"] | components["schemas"]["TI_Folder_SD1_Config"] | components["schemas"]["TI_Folder_SD2_Config"] | components["schemas"]["TI_Folder_SDXL_Config"] | components["schemas"]["IPAdapter_InvokeAI_SD1_Config"] | components["schemas"]["IPAdapter_InvokeAI_SD2_Config"] | components["schemas"]["IPAdapter_InvokeAI_SDXL_Config"] | components["schemas"]["IPAdapter_Checkpoint_SD1_Config"] | components["schemas"]["IPAdapter_Checkpoint_SD2_Config"] | components["schemas"]["IPAdapter_Checkpoint_SDXL_Config"] | components["schemas"]["IPAdapter_Checkpoint_FLUX_Config"] | components["schemas"]["T2IAdapter_Diffusers_SD1_Config"] | components["schemas"]["T2IAdapter_Diffusers_SDXL_Config"] | components["schemas"]["Spandrel_Checkpoint_Config"] | components["schemas"]["CLIPEmbed_Diffusers_G_Config"] | components["schemas"]["CLIPEmbed_Diffusers_L_Config"] | components["schemas"]["CLIPVision_Diffusers_Config"] | components["schemas"]["SigLIP_Diffusers_Config"] | components["schemas"]["FLUXRedux_Checkpoint_Config"] | components["schemas"]["LlavaOnevision_Diffusers_Config"] | components["schemas"]["TextLLM_Diffusers_Config"] | components["schemas"]["ExternalApiModelConfig"] | components["schemas"]["Unknown_Config"]; }; }; /** @description Validation Error */ @@ -33820,7 +35040,7 @@ export interface operations { * "upcast_attention": false * } */ - "application/json": components["schemas"]["Main_Diffusers_SD1_Config"] | components["schemas"]["Main_Diffusers_SD2_Config"] | components["schemas"]["Main_Diffusers_SDXL_Config"] | components["schemas"]["Main_Diffusers_SDXLRefiner_Config"] | components["schemas"]["Main_Diffusers_SD3_Config"] | components["schemas"]["Main_Diffusers_FLUX_Config"] | components["schemas"]["Main_Diffusers_Flux2_Config"] | 
components["schemas"]["Main_Diffusers_CogView4_Config"] | components["schemas"]["Main_Diffusers_QwenImage_Config"] | components["schemas"]["Main_Diffusers_ZImage_Config"] | components["schemas"]["Main_Checkpoint_SD1_Config"] | components["schemas"]["Main_Checkpoint_SD2_Config"] | components["schemas"]["Main_Checkpoint_SDXL_Config"] | components["schemas"]["Main_Checkpoint_SDXLRefiner_Config"] | components["schemas"]["Main_Checkpoint_Flux2_Config"] | components["schemas"]["Main_Checkpoint_FLUX_Config"] | components["schemas"]["Main_Checkpoint_QwenImage_Config"] | components["schemas"]["Main_Checkpoint_ZImage_Config"] | components["schemas"]["Main_Checkpoint_Anima_Config"] | components["schemas"]["Main_BnBNF4_FLUX_Config"] | components["schemas"]["Main_GGUF_Flux2_Config"] | components["schemas"]["Main_GGUF_FLUX_Config"] | components["schemas"]["Main_GGUF_QwenImage_Config"] | components["schemas"]["Main_GGUF_ZImage_Config"] | components["schemas"]["VAE_Checkpoint_SD1_Config"] | components["schemas"]["VAE_Checkpoint_SD2_Config"] | components["schemas"]["VAE_Checkpoint_SDXL_Config"] | components["schemas"]["VAE_Checkpoint_FLUX_Config"] | components["schemas"]["VAE_Checkpoint_Flux2_Config"] | components["schemas"]["VAE_Checkpoint_QwenImage_Config"] | components["schemas"]["VAE_Checkpoint_Anima_Config"] | components["schemas"]["VAE_Diffusers_SD1_Config"] | components["schemas"]["VAE_Diffusers_SDXL_Config"] | components["schemas"]["VAE_Diffusers_Flux2_Config"] | components["schemas"]["ControlNet_Checkpoint_SD1_Config"] | components["schemas"]["ControlNet_Checkpoint_SD2_Config"] | components["schemas"]["ControlNet_Checkpoint_SDXL_Config"] | components["schemas"]["ControlNet_Checkpoint_FLUX_Config"] | components["schemas"]["ControlNet_Checkpoint_ZImage_Config"] | components["schemas"]["ControlNet_Diffusers_SD1_Config"] | components["schemas"]["ControlNet_Diffusers_SD2_Config"] | components["schemas"]["ControlNet_Diffusers_SDXL_Config"] | 
components["schemas"]["ControlNet_Diffusers_FLUX_Config"] | components["schemas"]["LoRA_LyCORIS_SD1_Config"] | components["schemas"]["LoRA_LyCORIS_SD2_Config"] | components["schemas"]["LoRA_LyCORIS_SDXL_Config"] | components["schemas"]["LoRA_LyCORIS_Flux2_Config"] | components["schemas"]["LoRA_LyCORIS_FLUX_Config"] | components["schemas"]["LoRA_LyCORIS_ZImage_Config"] | components["schemas"]["LoRA_LyCORIS_QwenImage_Config"] | components["schemas"]["LoRA_LyCORIS_Anima_Config"] | components["schemas"]["LoRA_OMI_SDXL_Config"] | components["schemas"]["LoRA_OMI_FLUX_Config"] | components["schemas"]["LoRA_Diffusers_SD1_Config"] | components["schemas"]["LoRA_Diffusers_SD2_Config"] | components["schemas"]["LoRA_Diffusers_SDXL_Config"] | components["schemas"]["LoRA_Diffusers_Flux2_Config"] | components["schemas"]["LoRA_Diffusers_FLUX_Config"] | components["schemas"]["LoRA_Diffusers_ZImage_Config"] | components["schemas"]["ControlLoRA_LyCORIS_FLUX_Config"] | components["schemas"]["T5Encoder_T5Encoder_Config"] | components["schemas"]["T5Encoder_BnBLLMint8_Config"] | components["schemas"]["Qwen3Encoder_Qwen3Encoder_Config"] | components["schemas"]["Qwen3Encoder_Checkpoint_Config"] | components["schemas"]["Qwen3Encoder_GGUF_Config"] | components["schemas"]["QwenVLEncoder_Diffusers_Config"] | components["schemas"]["QwenVLEncoder_Checkpoint_Config"] | components["schemas"]["TI_File_SD1_Config"] | components["schemas"]["TI_File_SD2_Config"] | components["schemas"]["TI_File_SDXL_Config"] | components["schemas"]["TI_Folder_SD1_Config"] | components["schemas"]["TI_Folder_SD2_Config"] | components["schemas"]["TI_Folder_SDXL_Config"] | components["schemas"]["IPAdapter_InvokeAI_SD1_Config"] | components["schemas"]["IPAdapter_InvokeAI_SD2_Config"] | components["schemas"]["IPAdapter_InvokeAI_SDXL_Config"] | components["schemas"]["IPAdapter_Checkpoint_SD1_Config"] | components["schemas"]["IPAdapter_Checkpoint_SD2_Config"] | components["schemas"]["IPAdapter_Checkpoint_SDXL_Config"] | 
components["schemas"]["IPAdapter_Checkpoint_FLUX_Config"] | components["schemas"]["T2IAdapter_Diffusers_SD1_Config"] | components["schemas"]["T2IAdapter_Diffusers_SDXL_Config"] | components["schemas"]["Spandrel_Checkpoint_Config"] | components["schemas"]["CLIPEmbed_Diffusers_G_Config"] | components["schemas"]["CLIPEmbed_Diffusers_L_Config"] | components["schemas"]["CLIPVision_Diffusers_Config"] | components["schemas"]["SigLIP_Diffusers_Config"] | components["schemas"]["FLUXRedux_Checkpoint_Config"] | components["schemas"]["LlavaOnevision_Diffusers_Config"] | components["schemas"]["TextLLM_Diffusers_Config"] | components["schemas"]["ExternalApiModelConfig"] | components["schemas"]["Unknown_Config"]; + "application/json": components["schemas"]["Main_Diffusers_SD1_Config"] | components["schemas"]["Main_Diffusers_SD2_Config"] | components["schemas"]["Main_Diffusers_SDXL_Config"] | components["schemas"]["Main_Diffusers_SDXLRefiner_Config"] | components["schemas"]["Main_Diffusers_SD3_Config"] | components["schemas"]["Main_Diffusers_FLUX_Config"] | components["schemas"]["Main_Diffusers_Flux2_Config"] | components["schemas"]["Main_Diffusers_CogView4_Config"] | components["schemas"]["Main_Diffusers_QwenImage_Config"] | components["schemas"]["Main_Diffusers_ZImage_Config"] | components["schemas"]["Main_Checkpoint_SD1_Config"] | components["schemas"]["Main_Checkpoint_SD2_Config"] | components["schemas"]["Main_Checkpoint_SDXL_Config"] | components["schemas"]["Main_Checkpoint_SDXLRefiner_Config"] | components["schemas"]["Main_Checkpoint_Flux2_Config"] | components["schemas"]["Main_Checkpoint_FLUX_Config"] | components["schemas"]["Main_Checkpoint_QwenImage_Config"] | components["schemas"]["Main_Checkpoint_ZImage_Config"] | components["schemas"]["Main_Checkpoint_Anima_Config"] | components["schemas"]["Main_BnBNF4_FLUX_Config"] | components["schemas"]["Main_GGUF_Flux2_Config"] | components["schemas"]["Main_GGUF_FLUX_Config"] | components["schemas"]["Main_GGUF_QwenImage_Config"] | 
components["schemas"]["Main_GGUF_ZImage_Config"] | components["schemas"]["VAE_Checkpoint_SD1_Config"] | components["schemas"]["VAE_Checkpoint_SD2_Config"] | components["schemas"]["VAE_Checkpoint_SDXL_Config"] | components["schemas"]["VAE_Checkpoint_FLUX_Config"] | components["schemas"]["VAE_Checkpoint_Flux2_Config"] | components["schemas"]["VAE_Checkpoint_QwenImage_Config"] | components["schemas"]["VAE_Checkpoint_Anima_Config"] | components["schemas"]["VAE_Diffusers_SD1_Config"] | components["schemas"]["VAE_Diffusers_SDXL_Config"] | components["schemas"]["VAE_Diffusers_Flux2_Config"] | components["schemas"]["PiDDecoder_Checkpoint_FLUX_Config"] | components["schemas"]["PiDDecoder_Checkpoint_Flux2_Config"] | components["schemas"]["PiDDecoder_Checkpoint_SD3_Config"] | components["schemas"]["PiDDecoder_Checkpoint_SDXL_Config"] | components["schemas"]["PiDDecoder_Checkpoint_QwenImage_Config"] | components["schemas"]["ControlNet_Checkpoint_SD1_Config"] | components["schemas"]["ControlNet_Checkpoint_SD2_Config"] | components["schemas"]["ControlNet_Checkpoint_SDXL_Config"] | components["schemas"]["ControlNet_Checkpoint_FLUX_Config"] | components["schemas"]["ControlNet_Checkpoint_ZImage_Config"] | components["schemas"]["ControlNet_Diffusers_SD1_Config"] | components["schemas"]["ControlNet_Diffusers_SD2_Config"] | components["schemas"]["ControlNet_Diffusers_SDXL_Config"] | components["schemas"]["ControlNet_Diffusers_FLUX_Config"] | components["schemas"]["LoRA_LyCORIS_SD1_Config"] | components["schemas"]["LoRA_LyCORIS_SD2_Config"] | components["schemas"]["LoRA_LyCORIS_SDXL_Config"] | components["schemas"]["LoRA_LyCORIS_Flux2_Config"] | components["schemas"]["LoRA_LyCORIS_FLUX_Config"] | components["schemas"]["LoRA_LyCORIS_ZImage_Config"] | components["schemas"]["LoRA_LyCORIS_QwenImage_Config"] | components["schemas"]["LoRA_LyCORIS_Anima_Config"] | components["schemas"]["LoRA_OMI_SDXL_Config"] | components["schemas"]["LoRA_OMI_FLUX_Config"] | 
components["schemas"]["LoRA_Diffusers_SD1_Config"] | components["schemas"]["LoRA_Diffusers_SD2_Config"] | components["schemas"]["LoRA_Diffusers_SDXL_Config"] | components["schemas"]["LoRA_Diffusers_Flux2_Config"] | components["schemas"]["LoRA_Diffusers_FLUX_Config"] | components["schemas"]["LoRA_Diffusers_ZImage_Config"] | components["schemas"]["ControlLoRA_LyCORIS_FLUX_Config"] | components["schemas"]["T5Encoder_T5Encoder_Config"] | components["schemas"]["T5Encoder_BnBLLMint8_Config"] | components["schemas"]["Qwen3Encoder_Qwen3Encoder_Config"] | components["schemas"]["Qwen3Encoder_Checkpoint_Config"] | components["schemas"]["Qwen3Encoder_GGUF_Config"] | components["schemas"]["Gemma2Encoder_Gemma2Encoder_Config"] | components["schemas"]["QwenVLEncoder_Diffusers_Config"] | components["schemas"]["QwenVLEncoder_Checkpoint_Config"] | components["schemas"]["TI_File_SD1_Config"] | components["schemas"]["TI_File_SD2_Config"] | components["schemas"]["TI_File_SDXL_Config"] | components["schemas"]["TI_Folder_SD1_Config"] | components["schemas"]["TI_Folder_SD2_Config"] | components["schemas"]["TI_Folder_SDXL_Config"] | components["schemas"]["IPAdapter_InvokeAI_SD1_Config"] | components["schemas"]["IPAdapter_InvokeAI_SD2_Config"] | components["schemas"]["IPAdapter_InvokeAI_SDXL_Config"] | components["schemas"]["IPAdapter_Checkpoint_SD1_Config"] | components["schemas"]["IPAdapter_Checkpoint_SD2_Config"] | components["schemas"]["IPAdapter_Checkpoint_SDXL_Config"] | components["schemas"]["IPAdapter_Checkpoint_FLUX_Config"] | components["schemas"]["T2IAdapter_Diffusers_SD1_Config"] | components["schemas"]["T2IAdapter_Diffusers_SDXL_Config"] | components["schemas"]["Spandrel_Checkpoint_Config"] | components["schemas"]["CLIPEmbed_Diffusers_G_Config"] | components["schemas"]["CLIPEmbed_Diffusers_L_Config"] | components["schemas"]["CLIPVision_Diffusers_Config"] | components["schemas"]["SigLIP_Diffusers_Config"] | components["schemas"]["FLUXRedux_Checkpoint_Config"] | 
components["schemas"]["LlavaOnevision_Diffusers_Config"] | components["schemas"]["TextLLM_Diffusers_Config"] | components["schemas"]["ExternalApiModelConfig"] | components["schemas"]["Unknown_Config"]; }; }; /** @description Bad request */ @@ -33927,7 +35147,7 @@ export interface operations { * "upcast_attention": false * } */ - "application/json": components["schemas"]["Main_Diffusers_SD1_Config"] | components["schemas"]["Main_Diffusers_SD2_Config"] | components["schemas"]["Main_Diffusers_SDXL_Config"] | components["schemas"]["Main_Diffusers_SDXLRefiner_Config"] | components["schemas"]["Main_Diffusers_SD3_Config"] | components["schemas"]["Main_Diffusers_FLUX_Config"] | components["schemas"]["Main_Diffusers_Flux2_Config"] | components["schemas"]["Main_Diffusers_CogView4_Config"] | components["schemas"]["Main_Diffusers_QwenImage_Config"] | components["schemas"]["Main_Diffusers_ZImage_Config"] | components["schemas"]["Main_Checkpoint_SD1_Config"] | components["schemas"]["Main_Checkpoint_SD2_Config"] | components["schemas"]["Main_Checkpoint_SDXL_Config"] | components["schemas"]["Main_Checkpoint_SDXLRefiner_Config"] | components["schemas"]["Main_Checkpoint_Flux2_Config"] | components["schemas"]["Main_Checkpoint_FLUX_Config"] | components["schemas"]["Main_Checkpoint_QwenImage_Config"] | components["schemas"]["Main_Checkpoint_ZImage_Config"] | components["schemas"]["Main_Checkpoint_Anima_Config"] | components["schemas"]["Main_BnBNF4_FLUX_Config"] | components["schemas"]["Main_GGUF_Flux2_Config"] | components["schemas"]["Main_GGUF_FLUX_Config"] | components["schemas"]["Main_GGUF_QwenImage_Config"] | components["schemas"]["Main_GGUF_ZImage_Config"] | components["schemas"]["VAE_Checkpoint_SD1_Config"] | components["schemas"]["VAE_Checkpoint_SD2_Config"] | components["schemas"]["VAE_Checkpoint_SDXL_Config"] | components["schemas"]["VAE_Checkpoint_FLUX_Config"] | components["schemas"]["VAE_Checkpoint_Flux2_Config"] | components["schemas"]["VAE_Checkpoint_QwenImage_Config"] | 
components["schemas"]["VAE_Checkpoint_Anima_Config"] | components["schemas"]["VAE_Diffusers_SD1_Config"] | components["schemas"]["VAE_Diffusers_SDXL_Config"] | components["schemas"]["VAE_Diffusers_Flux2_Config"] | components["schemas"]["ControlNet_Checkpoint_SD1_Config"] | components["schemas"]["ControlNet_Checkpoint_SD2_Config"] | components["schemas"]["ControlNet_Checkpoint_SDXL_Config"] | components["schemas"]["ControlNet_Checkpoint_FLUX_Config"] | components["schemas"]["ControlNet_Checkpoint_ZImage_Config"] | components["schemas"]["ControlNet_Diffusers_SD1_Config"] | components["schemas"]["ControlNet_Diffusers_SD2_Config"] | components["schemas"]["ControlNet_Diffusers_SDXL_Config"] | components["schemas"]["ControlNet_Diffusers_FLUX_Config"] | components["schemas"]["LoRA_LyCORIS_SD1_Config"] | components["schemas"]["LoRA_LyCORIS_SD2_Config"] | components["schemas"]["LoRA_LyCORIS_SDXL_Config"] | components["schemas"]["LoRA_LyCORIS_Flux2_Config"] | components["schemas"]["LoRA_LyCORIS_FLUX_Config"] | components["schemas"]["LoRA_LyCORIS_ZImage_Config"] | components["schemas"]["LoRA_LyCORIS_QwenImage_Config"] | components["schemas"]["LoRA_LyCORIS_Anima_Config"] | components["schemas"]["LoRA_OMI_SDXL_Config"] | components["schemas"]["LoRA_OMI_FLUX_Config"] | components["schemas"]["LoRA_Diffusers_SD1_Config"] | components["schemas"]["LoRA_Diffusers_SD2_Config"] | components["schemas"]["LoRA_Diffusers_SDXL_Config"] | components["schemas"]["LoRA_Diffusers_Flux2_Config"] | components["schemas"]["LoRA_Diffusers_FLUX_Config"] | components["schemas"]["LoRA_Diffusers_ZImage_Config"] | components["schemas"]["ControlLoRA_LyCORIS_FLUX_Config"] | components["schemas"]["T5Encoder_T5Encoder_Config"] | components["schemas"]["T5Encoder_BnBLLMint8_Config"] | components["schemas"]["Qwen3Encoder_Qwen3Encoder_Config"] | components["schemas"]["Qwen3Encoder_Checkpoint_Config"] | components["schemas"]["Qwen3Encoder_GGUF_Config"] | components["schemas"]["QwenVLEncoder_Diffusers_Config"] | 
components["schemas"]["QwenVLEncoder_Checkpoint_Config"] | components["schemas"]["TI_File_SD1_Config"] | components["schemas"]["TI_File_SD2_Config"] | components["schemas"]["TI_File_SDXL_Config"] | components["schemas"]["TI_Folder_SD1_Config"] | components["schemas"]["TI_Folder_SD2_Config"] | components["schemas"]["TI_Folder_SDXL_Config"] | components["schemas"]["IPAdapter_InvokeAI_SD1_Config"] | components["schemas"]["IPAdapter_InvokeAI_SD2_Config"] | components["schemas"]["IPAdapter_InvokeAI_SDXL_Config"] | components["schemas"]["IPAdapter_Checkpoint_SD1_Config"] | components["schemas"]["IPAdapter_Checkpoint_SD2_Config"] | components["schemas"]["IPAdapter_Checkpoint_SDXL_Config"] | components["schemas"]["IPAdapter_Checkpoint_FLUX_Config"] | components["schemas"]["T2IAdapter_Diffusers_SD1_Config"] | components["schemas"]["T2IAdapter_Diffusers_SDXL_Config"] | components["schemas"]["Spandrel_Checkpoint_Config"] | components["schemas"]["CLIPEmbed_Diffusers_G_Config"] | components["schemas"]["CLIPEmbed_Diffusers_L_Config"] | components["schemas"]["CLIPVision_Diffusers_Config"] | components["schemas"]["SigLIP_Diffusers_Config"] | components["schemas"]["FLUXRedux_Checkpoint_Config"] | components["schemas"]["LlavaOnevision_Diffusers_Config"] | components["schemas"]["TextLLM_Diffusers_Config"] | components["schemas"]["ExternalApiModelConfig"] | components["schemas"]["Unknown_Config"]; + "application/json": components["schemas"]["Main_Diffusers_SD1_Config"] | components["schemas"]["Main_Diffusers_SD2_Config"] | components["schemas"]["Main_Diffusers_SDXL_Config"] | components["schemas"]["Main_Diffusers_SDXLRefiner_Config"] | components["schemas"]["Main_Diffusers_SD3_Config"] | components["schemas"]["Main_Diffusers_FLUX_Config"] | components["schemas"]["Main_Diffusers_Flux2_Config"] | components["schemas"]["Main_Diffusers_CogView4_Config"] | components["schemas"]["Main_Diffusers_QwenImage_Config"] | components["schemas"]["Main_Diffusers_ZImage_Config"] | 
components["schemas"]["Main_Checkpoint_SD1_Config"] | components["schemas"]["Main_Checkpoint_SD2_Config"] | components["schemas"]["Main_Checkpoint_SDXL_Config"] | components["schemas"]["Main_Checkpoint_SDXLRefiner_Config"] | components["schemas"]["Main_Checkpoint_Flux2_Config"] | components["schemas"]["Main_Checkpoint_FLUX_Config"] | components["schemas"]["Main_Checkpoint_QwenImage_Config"] | components["schemas"]["Main_Checkpoint_ZImage_Config"] | components["schemas"]["Main_Checkpoint_Anima_Config"] | components["schemas"]["Main_BnBNF4_FLUX_Config"] | components["schemas"]["Main_GGUF_Flux2_Config"] | components["schemas"]["Main_GGUF_FLUX_Config"] | components["schemas"]["Main_GGUF_QwenImage_Config"] | components["schemas"]["Main_GGUF_ZImage_Config"] | components["schemas"]["VAE_Checkpoint_SD1_Config"] | components["schemas"]["VAE_Checkpoint_SD2_Config"] | components["schemas"]["VAE_Checkpoint_SDXL_Config"] | components["schemas"]["VAE_Checkpoint_FLUX_Config"] | components["schemas"]["VAE_Checkpoint_Flux2_Config"] | components["schemas"]["VAE_Checkpoint_QwenImage_Config"] | components["schemas"]["VAE_Checkpoint_Anima_Config"] | components["schemas"]["VAE_Diffusers_SD1_Config"] | components["schemas"]["VAE_Diffusers_SDXL_Config"] | components["schemas"]["VAE_Diffusers_Flux2_Config"] | components["schemas"]["PiDDecoder_Checkpoint_FLUX_Config"] | components["schemas"]["PiDDecoder_Checkpoint_Flux2_Config"] | components["schemas"]["PiDDecoder_Checkpoint_SD3_Config"] | components["schemas"]["PiDDecoder_Checkpoint_SDXL_Config"] | components["schemas"]["PiDDecoder_Checkpoint_QwenImage_Config"] | components["schemas"]["ControlNet_Checkpoint_SD1_Config"] | components["schemas"]["ControlNet_Checkpoint_SD2_Config"] | components["schemas"]["ControlNet_Checkpoint_SDXL_Config"] | components["schemas"]["ControlNet_Checkpoint_FLUX_Config"] | components["schemas"]["ControlNet_Checkpoint_ZImage_Config"] | components["schemas"]["ControlNet_Diffusers_SD1_Config"] | 
components["schemas"]["ControlNet_Diffusers_SD2_Config"] | components["schemas"]["ControlNet_Diffusers_SDXL_Config"] | components["schemas"]["ControlNet_Diffusers_FLUX_Config"] | components["schemas"]["LoRA_LyCORIS_SD1_Config"] | components["schemas"]["LoRA_LyCORIS_SD2_Config"] | components["schemas"]["LoRA_LyCORIS_SDXL_Config"] | components["schemas"]["LoRA_LyCORIS_Flux2_Config"] | components["schemas"]["LoRA_LyCORIS_FLUX_Config"] | components["schemas"]["LoRA_LyCORIS_ZImage_Config"] | components["schemas"]["LoRA_LyCORIS_QwenImage_Config"] | components["schemas"]["LoRA_LyCORIS_Anima_Config"] | components["schemas"]["LoRA_OMI_SDXL_Config"] | components["schemas"]["LoRA_OMI_FLUX_Config"] | components["schemas"]["LoRA_Diffusers_SD1_Config"] | components["schemas"]["LoRA_Diffusers_SD2_Config"] | components["schemas"]["LoRA_Diffusers_SDXL_Config"] | components["schemas"]["LoRA_Diffusers_Flux2_Config"] | components["schemas"]["LoRA_Diffusers_FLUX_Config"] | components["schemas"]["LoRA_Diffusers_ZImage_Config"] | components["schemas"]["ControlLoRA_LyCORIS_FLUX_Config"] | components["schemas"]["T5Encoder_T5Encoder_Config"] | components["schemas"]["T5Encoder_BnBLLMint8_Config"] | components["schemas"]["Qwen3Encoder_Qwen3Encoder_Config"] | components["schemas"]["Qwen3Encoder_Checkpoint_Config"] | components["schemas"]["Qwen3Encoder_GGUF_Config"] | components["schemas"]["Gemma2Encoder_Gemma2Encoder_Config"] | components["schemas"]["QwenVLEncoder_Diffusers_Config"] | components["schemas"]["QwenVLEncoder_Checkpoint_Config"] | components["schemas"]["TI_File_SD1_Config"] | components["schemas"]["TI_File_SD2_Config"] | components["schemas"]["TI_File_SDXL_Config"] | components["schemas"]["TI_Folder_SD1_Config"] | components["schemas"]["TI_Folder_SD2_Config"] | components["schemas"]["TI_Folder_SDXL_Config"] | components["schemas"]["IPAdapter_InvokeAI_SD1_Config"] | components["schemas"]["IPAdapter_InvokeAI_SD2_Config"] | components["schemas"]["IPAdapter_InvokeAI_SDXL_Config"] | 
components["schemas"]["IPAdapter_Checkpoint_SD1_Config"] | components["schemas"]["IPAdapter_Checkpoint_SD2_Config"] | components["schemas"]["IPAdapter_Checkpoint_SDXL_Config"] | components["schemas"]["IPAdapter_Checkpoint_FLUX_Config"] | components["schemas"]["T2IAdapter_Diffusers_SD1_Config"] | components["schemas"]["T2IAdapter_Diffusers_SDXL_Config"] | components["schemas"]["Spandrel_Checkpoint_Config"] | components["schemas"]["CLIPEmbed_Diffusers_G_Config"] | components["schemas"]["CLIPEmbed_Diffusers_L_Config"] | components["schemas"]["CLIPVision_Diffusers_Config"] | components["schemas"]["SigLIP_Diffusers_Config"] | components["schemas"]["FLUXRedux_Checkpoint_Config"] | components["schemas"]["LlavaOnevision_Diffusers_Config"] | components["schemas"]["TextLLM_Diffusers_Config"] | components["schemas"]["ExternalApiModelConfig"] | components["schemas"]["Unknown_Config"]; }; }; /** @description Bad request */ @@ -34000,7 +35220,7 @@ export interface operations { * "upcast_attention": false * } */ - "application/json": components["schemas"]["Main_Diffusers_SD1_Config"] | components["schemas"]["Main_Diffusers_SD2_Config"] | components["schemas"]["Main_Diffusers_SDXL_Config"] | components["schemas"]["Main_Diffusers_SDXLRefiner_Config"] | components["schemas"]["Main_Diffusers_SD3_Config"] | components["schemas"]["Main_Diffusers_FLUX_Config"] | components["schemas"]["Main_Diffusers_Flux2_Config"] | components["schemas"]["Main_Diffusers_CogView4_Config"] | components["schemas"]["Main_Diffusers_QwenImage_Config"] | components["schemas"]["Main_Diffusers_ZImage_Config"] | components["schemas"]["Main_Checkpoint_SD1_Config"] | components["schemas"]["Main_Checkpoint_SD2_Config"] | components["schemas"]["Main_Checkpoint_SDXL_Config"] | components["schemas"]["Main_Checkpoint_SDXLRefiner_Config"] | components["schemas"]["Main_Checkpoint_Flux2_Config"] | components["schemas"]["Main_Checkpoint_FLUX_Config"] | components["schemas"]["Main_Checkpoint_QwenImage_Config"] | 
components["schemas"]["Main_Checkpoint_ZImage_Config"] | components["schemas"]["Main_Checkpoint_Anima_Config"] | components["schemas"]["Main_BnBNF4_FLUX_Config"] | components["schemas"]["Main_GGUF_Flux2_Config"] | components["schemas"]["Main_GGUF_FLUX_Config"] | components["schemas"]["Main_GGUF_QwenImage_Config"] | components["schemas"]["Main_GGUF_ZImage_Config"] | components["schemas"]["VAE_Checkpoint_SD1_Config"] | components["schemas"]["VAE_Checkpoint_SD2_Config"] | components["schemas"]["VAE_Checkpoint_SDXL_Config"] | components["schemas"]["VAE_Checkpoint_FLUX_Config"] | components["schemas"]["VAE_Checkpoint_Flux2_Config"] | components["schemas"]["VAE_Checkpoint_QwenImage_Config"] | components["schemas"]["VAE_Checkpoint_Anima_Config"] | components["schemas"]["VAE_Diffusers_SD1_Config"] | components["schemas"]["VAE_Diffusers_SDXL_Config"] | components["schemas"]["VAE_Diffusers_Flux2_Config"] | components["schemas"]["ControlNet_Checkpoint_SD1_Config"] | components["schemas"]["ControlNet_Checkpoint_SD2_Config"] | components["schemas"]["ControlNet_Checkpoint_SDXL_Config"] | components["schemas"]["ControlNet_Checkpoint_FLUX_Config"] | components["schemas"]["ControlNet_Checkpoint_ZImage_Config"] | components["schemas"]["ControlNet_Diffusers_SD1_Config"] | components["schemas"]["ControlNet_Diffusers_SD2_Config"] | components["schemas"]["ControlNet_Diffusers_SDXL_Config"] | components["schemas"]["ControlNet_Diffusers_FLUX_Config"] | components["schemas"]["LoRA_LyCORIS_SD1_Config"] | components["schemas"]["LoRA_LyCORIS_SD2_Config"] | components["schemas"]["LoRA_LyCORIS_SDXL_Config"] | components["schemas"]["LoRA_LyCORIS_Flux2_Config"] | components["schemas"]["LoRA_LyCORIS_FLUX_Config"] | components["schemas"]["LoRA_LyCORIS_ZImage_Config"] | components["schemas"]["LoRA_LyCORIS_QwenImage_Config"] | components["schemas"]["LoRA_LyCORIS_Anima_Config"] | components["schemas"]["LoRA_OMI_SDXL_Config"] | components["schemas"]["LoRA_OMI_FLUX_Config"] | 
components["schemas"]["LoRA_Diffusers_SD1_Config"] | components["schemas"]["LoRA_Diffusers_SD2_Config"] | components["schemas"]["LoRA_Diffusers_SDXL_Config"] | components["schemas"]["LoRA_Diffusers_Flux2_Config"] | components["schemas"]["LoRA_Diffusers_FLUX_Config"] | components["schemas"]["LoRA_Diffusers_ZImage_Config"] | components["schemas"]["ControlLoRA_LyCORIS_FLUX_Config"] | components["schemas"]["T5Encoder_T5Encoder_Config"] | components["schemas"]["T5Encoder_BnBLLMint8_Config"] | components["schemas"]["Qwen3Encoder_Qwen3Encoder_Config"] | components["schemas"]["Qwen3Encoder_Checkpoint_Config"] | components["schemas"]["Qwen3Encoder_GGUF_Config"] | components["schemas"]["QwenVLEncoder_Diffusers_Config"] | components["schemas"]["QwenVLEncoder_Checkpoint_Config"] | components["schemas"]["TI_File_SD1_Config"] | components["schemas"]["TI_File_SD2_Config"] | components["schemas"]["TI_File_SDXL_Config"] | components["schemas"]["TI_Folder_SD1_Config"] | components["schemas"]["TI_Folder_SD2_Config"] | components["schemas"]["TI_Folder_SDXL_Config"] | components["schemas"]["IPAdapter_InvokeAI_SD1_Config"] | components["schemas"]["IPAdapter_InvokeAI_SD2_Config"] | components["schemas"]["IPAdapter_InvokeAI_SDXL_Config"] | components["schemas"]["IPAdapter_Checkpoint_SD1_Config"] | components["schemas"]["IPAdapter_Checkpoint_SD2_Config"] | components["schemas"]["IPAdapter_Checkpoint_SDXL_Config"] | components["schemas"]["IPAdapter_Checkpoint_FLUX_Config"] | components["schemas"]["T2IAdapter_Diffusers_SD1_Config"] | components["schemas"]["T2IAdapter_Diffusers_SDXL_Config"] | components["schemas"]["Spandrel_Checkpoint_Config"] | components["schemas"]["CLIPEmbed_Diffusers_G_Config"] | components["schemas"]["CLIPEmbed_Diffusers_L_Config"] | components["schemas"]["CLIPVision_Diffusers_Config"] | components["schemas"]["SigLIP_Diffusers_Config"] | components["schemas"]["FLUXRedux_Checkpoint_Config"] | components["schemas"]["LlavaOnevision_Diffusers_Config"] | 
components["schemas"]["TextLLM_Diffusers_Config"] | components["schemas"]["ExternalApiModelConfig"] | components["schemas"]["Unknown_Config"]; + "application/json": components["schemas"]["Main_Diffusers_SD1_Config"] | components["schemas"]["Main_Diffusers_SD2_Config"] | components["schemas"]["Main_Diffusers_SDXL_Config"] | components["schemas"]["Main_Diffusers_SDXLRefiner_Config"] | components["schemas"]["Main_Diffusers_SD3_Config"] | components["schemas"]["Main_Diffusers_FLUX_Config"] | components["schemas"]["Main_Diffusers_Flux2_Config"] | components["schemas"]["Main_Diffusers_CogView4_Config"] | components["schemas"]["Main_Diffusers_QwenImage_Config"] | components["schemas"]["Main_Diffusers_ZImage_Config"] | components["schemas"]["Main_Checkpoint_SD1_Config"] | components["schemas"]["Main_Checkpoint_SD2_Config"] | components["schemas"]["Main_Checkpoint_SDXL_Config"] | components["schemas"]["Main_Checkpoint_SDXLRefiner_Config"] | components["schemas"]["Main_Checkpoint_Flux2_Config"] | components["schemas"]["Main_Checkpoint_FLUX_Config"] | components["schemas"]["Main_Checkpoint_QwenImage_Config"] | components["schemas"]["Main_Checkpoint_ZImage_Config"] | components["schemas"]["Main_Checkpoint_Anima_Config"] | components["schemas"]["Main_BnBNF4_FLUX_Config"] | components["schemas"]["Main_GGUF_Flux2_Config"] | components["schemas"]["Main_GGUF_FLUX_Config"] | components["schemas"]["Main_GGUF_QwenImage_Config"] | components["schemas"]["Main_GGUF_ZImage_Config"] | components["schemas"]["VAE_Checkpoint_SD1_Config"] | components["schemas"]["VAE_Checkpoint_SD2_Config"] | components["schemas"]["VAE_Checkpoint_SDXL_Config"] | components["schemas"]["VAE_Checkpoint_FLUX_Config"] | components["schemas"]["VAE_Checkpoint_Flux2_Config"] | components["schemas"]["VAE_Checkpoint_QwenImage_Config"] | components["schemas"]["VAE_Checkpoint_Anima_Config"] | components["schemas"]["VAE_Diffusers_SD1_Config"] | components["schemas"]["VAE_Diffusers_SDXL_Config"] | 
components["schemas"]["VAE_Diffusers_Flux2_Config"] | components["schemas"]["PiDDecoder_Checkpoint_FLUX_Config"] | components["schemas"]["PiDDecoder_Checkpoint_Flux2_Config"] | components["schemas"]["PiDDecoder_Checkpoint_SD3_Config"] | components["schemas"]["PiDDecoder_Checkpoint_SDXL_Config"] | components["schemas"]["PiDDecoder_Checkpoint_QwenImage_Config"] | components["schemas"]["ControlNet_Checkpoint_SD1_Config"] | components["schemas"]["ControlNet_Checkpoint_SD2_Config"] | components["schemas"]["ControlNet_Checkpoint_SDXL_Config"] | components["schemas"]["ControlNet_Checkpoint_FLUX_Config"] | components["schemas"]["ControlNet_Checkpoint_ZImage_Config"] | components["schemas"]["ControlNet_Diffusers_SD1_Config"] | components["schemas"]["ControlNet_Diffusers_SD2_Config"] | components["schemas"]["ControlNet_Diffusers_SDXL_Config"] | components["schemas"]["ControlNet_Diffusers_FLUX_Config"] | components["schemas"]["LoRA_LyCORIS_SD1_Config"] | components["schemas"]["LoRA_LyCORIS_SD2_Config"] | components["schemas"]["LoRA_LyCORIS_SDXL_Config"] | components["schemas"]["LoRA_LyCORIS_Flux2_Config"] | components["schemas"]["LoRA_LyCORIS_FLUX_Config"] | components["schemas"]["LoRA_LyCORIS_ZImage_Config"] | components["schemas"]["LoRA_LyCORIS_QwenImage_Config"] | components["schemas"]["LoRA_LyCORIS_Anima_Config"] | components["schemas"]["LoRA_OMI_SDXL_Config"] | components["schemas"]["LoRA_OMI_FLUX_Config"] | components["schemas"]["LoRA_Diffusers_SD1_Config"] | components["schemas"]["LoRA_Diffusers_SD2_Config"] | components["schemas"]["LoRA_Diffusers_SDXL_Config"] | components["schemas"]["LoRA_Diffusers_Flux2_Config"] | components["schemas"]["LoRA_Diffusers_FLUX_Config"] | components["schemas"]["LoRA_Diffusers_ZImage_Config"] | components["schemas"]["ControlLoRA_LyCORIS_FLUX_Config"] | components["schemas"]["T5Encoder_T5Encoder_Config"] | components["schemas"]["T5Encoder_BnBLLMint8_Config"] | components["schemas"]["Qwen3Encoder_Qwen3Encoder_Config"] | 
components["schemas"]["Qwen3Encoder_Checkpoint_Config"] | components["schemas"]["Qwen3Encoder_GGUF_Config"] | components["schemas"]["Gemma2Encoder_Gemma2Encoder_Config"] | components["schemas"]["QwenVLEncoder_Diffusers_Config"] | components["schemas"]["QwenVLEncoder_Checkpoint_Config"] | components["schemas"]["TI_File_SD1_Config"] | components["schemas"]["TI_File_SD2_Config"] | components["schemas"]["TI_File_SDXL_Config"] | components["schemas"]["TI_Folder_SD1_Config"] | components["schemas"]["TI_Folder_SD2_Config"] | components["schemas"]["TI_Folder_SDXL_Config"] | components["schemas"]["IPAdapter_InvokeAI_SD1_Config"] | components["schemas"]["IPAdapter_InvokeAI_SD2_Config"] | components["schemas"]["IPAdapter_InvokeAI_SDXL_Config"] | components["schemas"]["IPAdapter_Checkpoint_SD1_Config"] | components["schemas"]["IPAdapter_Checkpoint_SD2_Config"] | components["schemas"]["IPAdapter_Checkpoint_SDXL_Config"] | components["schemas"]["IPAdapter_Checkpoint_FLUX_Config"] | components["schemas"]["T2IAdapter_Diffusers_SD1_Config"] | components["schemas"]["T2IAdapter_Diffusers_SDXL_Config"] | components["schemas"]["Spandrel_Checkpoint_Config"] | components["schemas"]["CLIPEmbed_Diffusers_G_Config"] | components["schemas"]["CLIPEmbed_Diffusers_L_Config"] | components["schemas"]["CLIPVision_Diffusers_Config"] | components["schemas"]["SigLIP_Diffusers_Config"] | components["schemas"]["FLUXRedux_Checkpoint_Config"] | components["schemas"]["LlavaOnevision_Diffusers_Config"] | components["schemas"]["TextLLM_Diffusers_Config"] | components["schemas"]["ExternalApiModelConfig"] | components["schemas"]["Unknown_Config"]; }; }; /** @description Bad request */ @@ -34735,7 +35955,7 @@ export interface operations { * "upcast_attention": false * } */ - "application/json": components["schemas"]["Main_Diffusers_SD1_Config"] | components["schemas"]["Main_Diffusers_SD2_Config"] | components["schemas"]["Main_Diffusers_SDXL_Config"] | components["schemas"]["Main_Diffusers_SDXLRefiner_Config"] 
| components["schemas"]["Main_Diffusers_SD3_Config"] | components["schemas"]["Main_Diffusers_FLUX_Config"] | components["schemas"]["Main_Diffusers_Flux2_Config"] | components["schemas"]["Main_Diffusers_CogView4_Config"] | components["schemas"]["Main_Diffusers_QwenImage_Config"] | components["schemas"]["Main_Diffusers_ZImage_Config"] | components["schemas"]["Main_Checkpoint_SD1_Config"] | components["schemas"]["Main_Checkpoint_SD2_Config"] | components["schemas"]["Main_Checkpoint_SDXL_Config"] | components["schemas"]["Main_Checkpoint_SDXLRefiner_Config"] | components["schemas"]["Main_Checkpoint_Flux2_Config"] | components["schemas"]["Main_Checkpoint_FLUX_Config"] | components["schemas"]["Main_Checkpoint_QwenImage_Config"] | components["schemas"]["Main_Checkpoint_ZImage_Config"] | components["schemas"]["Main_Checkpoint_Anima_Config"] | components["schemas"]["Main_BnBNF4_FLUX_Config"] | components["schemas"]["Main_GGUF_Flux2_Config"] | components["schemas"]["Main_GGUF_FLUX_Config"] | components["schemas"]["Main_GGUF_QwenImage_Config"] | components["schemas"]["Main_GGUF_ZImage_Config"] | components["schemas"]["VAE_Checkpoint_SD1_Config"] | components["schemas"]["VAE_Checkpoint_SD2_Config"] | components["schemas"]["VAE_Checkpoint_SDXL_Config"] | components["schemas"]["VAE_Checkpoint_FLUX_Config"] | components["schemas"]["VAE_Checkpoint_Flux2_Config"] | components["schemas"]["VAE_Checkpoint_QwenImage_Config"] | components["schemas"]["VAE_Checkpoint_Anima_Config"] | components["schemas"]["VAE_Diffusers_SD1_Config"] | components["schemas"]["VAE_Diffusers_SDXL_Config"] | components["schemas"]["VAE_Diffusers_Flux2_Config"] | components["schemas"]["ControlNet_Checkpoint_SD1_Config"] | components["schemas"]["ControlNet_Checkpoint_SD2_Config"] | components["schemas"]["ControlNet_Checkpoint_SDXL_Config"] | components["schemas"]["ControlNet_Checkpoint_FLUX_Config"] | components["schemas"]["ControlNet_Checkpoint_ZImage_Config"] | 
components["schemas"]["ControlNet_Diffusers_SD1_Config"] | components["schemas"]["ControlNet_Diffusers_SD2_Config"] | components["schemas"]["ControlNet_Diffusers_SDXL_Config"] | components["schemas"]["ControlNet_Diffusers_FLUX_Config"] | components["schemas"]["LoRA_LyCORIS_SD1_Config"] | components["schemas"]["LoRA_LyCORIS_SD2_Config"] | components["schemas"]["LoRA_LyCORIS_SDXL_Config"] | components["schemas"]["LoRA_LyCORIS_Flux2_Config"] | components["schemas"]["LoRA_LyCORIS_FLUX_Config"] | components["schemas"]["LoRA_LyCORIS_ZImage_Config"] | components["schemas"]["LoRA_LyCORIS_QwenImage_Config"] | components["schemas"]["LoRA_LyCORIS_Anima_Config"] | components["schemas"]["LoRA_OMI_SDXL_Config"] | components["schemas"]["LoRA_OMI_FLUX_Config"] | components["schemas"]["LoRA_Diffusers_SD1_Config"] | components["schemas"]["LoRA_Diffusers_SD2_Config"] | components["schemas"]["LoRA_Diffusers_SDXL_Config"] | components["schemas"]["LoRA_Diffusers_Flux2_Config"] | components["schemas"]["LoRA_Diffusers_FLUX_Config"] | components["schemas"]["LoRA_Diffusers_ZImage_Config"] | components["schemas"]["ControlLoRA_LyCORIS_FLUX_Config"] | components["schemas"]["T5Encoder_T5Encoder_Config"] | components["schemas"]["T5Encoder_BnBLLMint8_Config"] | components["schemas"]["Qwen3Encoder_Qwen3Encoder_Config"] | components["schemas"]["Qwen3Encoder_Checkpoint_Config"] | components["schemas"]["Qwen3Encoder_GGUF_Config"] | components["schemas"]["QwenVLEncoder_Diffusers_Config"] | components["schemas"]["QwenVLEncoder_Checkpoint_Config"] | components["schemas"]["TI_File_SD1_Config"] | components["schemas"]["TI_File_SD2_Config"] | components["schemas"]["TI_File_SDXL_Config"] | components["schemas"]["TI_Folder_SD1_Config"] | components["schemas"]["TI_Folder_SD2_Config"] | components["schemas"]["TI_Folder_SDXL_Config"] | components["schemas"]["IPAdapter_InvokeAI_SD1_Config"] | components["schemas"]["IPAdapter_InvokeAI_SD2_Config"] | components["schemas"]["IPAdapter_InvokeAI_SDXL_Config"] | 
components["schemas"]["IPAdapter_Checkpoint_SD1_Config"] | components["schemas"]["IPAdapter_Checkpoint_SD2_Config"] | components["schemas"]["IPAdapter_Checkpoint_SDXL_Config"] | components["schemas"]["IPAdapter_Checkpoint_FLUX_Config"] | components["schemas"]["T2IAdapter_Diffusers_SD1_Config"] | components["schemas"]["T2IAdapter_Diffusers_SDXL_Config"] | components["schemas"]["Spandrel_Checkpoint_Config"] | components["schemas"]["CLIPEmbed_Diffusers_G_Config"] | components["schemas"]["CLIPEmbed_Diffusers_L_Config"] | components["schemas"]["CLIPVision_Diffusers_Config"] | components["schemas"]["SigLIP_Diffusers_Config"] | components["schemas"]["FLUXRedux_Checkpoint_Config"] | components["schemas"]["LlavaOnevision_Diffusers_Config"] | components["schemas"]["TextLLM_Diffusers_Config"] | components["schemas"]["ExternalApiModelConfig"] | components["schemas"]["Unknown_Config"]; + "application/json": components["schemas"]["Main_Diffusers_SD1_Config"] | components["schemas"]["Main_Diffusers_SD2_Config"] | components["schemas"]["Main_Diffusers_SDXL_Config"] | components["schemas"]["Main_Diffusers_SDXLRefiner_Config"] | components["schemas"]["Main_Diffusers_SD3_Config"] | components["schemas"]["Main_Diffusers_FLUX_Config"] | components["schemas"]["Main_Diffusers_Flux2_Config"] | components["schemas"]["Main_Diffusers_CogView4_Config"] | components["schemas"]["Main_Diffusers_QwenImage_Config"] | components["schemas"]["Main_Diffusers_ZImage_Config"] | components["schemas"]["Main_Checkpoint_SD1_Config"] | components["schemas"]["Main_Checkpoint_SD2_Config"] | components["schemas"]["Main_Checkpoint_SDXL_Config"] | components["schemas"]["Main_Checkpoint_SDXLRefiner_Config"] | components["schemas"]["Main_Checkpoint_Flux2_Config"] | components["schemas"]["Main_Checkpoint_FLUX_Config"] | components["schemas"]["Main_Checkpoint_QwenImage_Config"] | components["schemas"]["Main_Checkpoint_ZImage_Config"] | components["schemas"]["Main_Checkpoint_Anima_Config"] | 
components["schemas"]["Main_BnBNF4_FLUX_Config"] | components["schemas"]["Main_GGUF_Flux2_Config"] | components["schemas"]["Main_GGUF_FLUX_Config"] | components["schemas"]["Main_GGUF_QwenImage_Config"] | components["schemas"]["Main_GGUF_ZImage_Config"] | components["schemas"]["VAE_Checkpoint_SD1_Config"] | components["schemas"]["VAE_Checkpoint_SD2_Config"] | components["schemas"]["VAE_Checkpoint_SDXL_Config"] | components["schemas"]["VAE_Checkpoint_FLUX_Config"] | components["schemas"]["VAE_Checkpoint_Flux2_Config"] | components["schemas"]["VAE_Checkpoint_QwenImage_Config"] | components["schemas"]["VAE_Checkpoint_Anima_Config"] | components["schemas"]["VAE_Diffusers_SD1_Config"] | components["schemas"]["VAE_Diffusers_SDXL_Config"] | components["schemas"]["VAE_Diffusers_Flux2_Config"] | components["schemas"]["PiDDecoder_Checkpoint_FLUX_Config"] | components["schemas"]["PiDDecoder_Checkpoint_Flux2_Config"] | components["schemas"]["PiDDecoder_Checkpoint_SD3_Config"] | components["schemas"]["PiDDecoder_Checkpoint_SDXL_Config"] | components["schemas"]["PiDDecoder_Checkpoint_QwenImage_Config"] | components["schemas"]["ControlNet_Checkpoint_SD1_Config"] | components["schemas"]["ControlNet_Checkpoint_SD2_Config"] | components["schemas"]["ControlNet_Checkpoint_SDXL_Config"] | components["schemas"]["ControlNet_Checkpoint_FLUX_Config"] | components["schemas"]["ControlNet_Checkpoint_ZImage_Config"] | components["schemas"]["ControlNet_Diffusers_SD1_Config"] | components["schemas"]["ControlNet_Diffusers_SD2_Config"] | components["schemas"]["ControlNet_Diffusers_SDXL_Config"] | components["schemas"]["ControlNet_Diffusers_FLUX_Config"] | components["schemas"]["LoRA_LyCORIS_SD1_Config"] | components["schemas"]["LoRA_LyCORIS_SD2_Config"] | components["schemas"]["LoRA_LyCORIS_SDXL_Config"] | components["schemas"]["LoRA_LyCORIS_Flux2_Config"] | components["schemas"]["LoRA_LyCORIS_FLUX_Config"] | components["schemas"]["LoRA_LyCORIS_ZImage_Config"] | 
components["schemas"]["LoRA_LyCORIS_QwenImage_Config"] | components["schemas"]["LoRA_LyCORIS_Anima_Config"] | components["schemas"]["LoRA_OMI_SDXL_Config"] | components["schemas"]["LoRA_OMI_FLUX_Config"] | components["schemas"]["LoRA_Diffusers_SD1_Config"] | components["schemas"]["LoRA_Diffusers_SD2_Config"] | components["schemas"]["LoRA_Diffusers_SDXL_Config"] | components["schemas"]["LoRA_Diffusers_Flux2_Config"] | components["schemas"]["LoRA_Diffusers_FLUX_Config"] | components["schemas"]["LoRA_Diffusers_ZImage_Config"] | components["schemas"]["ControlLoRA_LyCORIS_FLUX_Config"] | components["schemas"]["T5Encoder_T5Encoder_Config"] | components["schemas"]["T5Encoder_BnBLLMint8_Config"] | components["schemas"]["Qwen3Encoder_Qwen3Encoder_Config"] | components["schemas"]["Qwen3Encoder_Checkpoint_Config"] | components["schemas"]["Qwen3Encoder_GGUF_Config"] | components["schemas"]["Gemma2Encoder_Gemma2Encoder_Config"] | components["schemas"]["QwenVLEncoder_Diffusers_Config"] | components["schemas"]["QwenVLEncoder_Checkpoint_Config"] | components["schemas"]["TI_File_SD1_Config"] | components["schemas"]["TI_File_SD2_Config"] | components["schemas"]["TI_File_SDXL_Config"] | components["schemas"]["TI_Folder_SD1_Config"] | components["schemas"]["TI_Folder_SD2_Config"] | components["schemas"]["TI_Folder_SDXL_Config"] | components["schemas"]["IPAdapter_InvokeAI_SD1_Config"] | components["schemas"]["IPAdapter_InvokeAI_SD2_Config"] | components["schemas"]["IPAdapter_InvokeAI_SDXL_Config"] | components["schemas"]["IPAdapter_Checkpoint_SD1_Config"] | components["schemas"]["IPAdapter_Checkpoint_SD2_Config"] | components["schemas"]["IPAdapter_Checkpoint_SDXL_Config"] | components["schemas"]["IPAdapter_Checkpoint_FLUX_Config"] | components["schemas"]["T2IAdapter_Diffusers_SD1_Config"] | components["schemas"]["T2IAdapter_Diffusers_SDXL_Config"] | components["schemas"]["Spandrel_Checkpoint_Config"] | components["schemas"]["CLIPEmbed_Diffusers_G_Config"] | 
components["schemas"]["CLIPEmbed_Diffusers_L_Config"] | components["schemas"]["CLIPVision_Diffusers_Config"] | components["schemas"]["SigLIP_Diffusers_Config"] | components["schemas"]["FLUXRedux_Checkpoint_Config"] | components["schemas"]["LlavaOnevision_Diffusers_Config"] | components["schemas"]["TextLLM_Diffusers_Config"] | components["schemas"]["ExternalApiModelConfig"] | components["schemas"]["Unknown_Config"]; }; }; /** @description Bad request */ diff --git a/invokeai/frontend/web/src/services/api/types.ts b/invokeai/frontend/web/src/services/api/types.ts index 27c6fcbf3c3..c233e1da123 100644 --- a/invokeai/frontend/web/src/services/api/types.ts +++ b/invokeai/frontend/web/src/services/api/types.ts @@ -117,6 +117,8 @@ export type T5EncoderBnbQuantizedLlmInt8bModelConfig = Extract< >; export type Qwen3EncoderModelConfig = Extract; export type QwenVLEncoderModelConfig = Extract; +type Gemma2EncoderModelConfig = Extract; +type PiDDecoderModelConfig = Extract; export type SpandrelImageToImageModelConfig = Extract; export type CheckpointModelConfig = Extract; export type CLIPVisionModelConfig = Extract; @@ -379,6 +381,14 @@ export const isQwenVLEncoderModelConfig = (config: AnyModelConfig): config is Qw return config.type === 'qwen_vl_encoder'; }; +export const isGemma2EncoderModelConfig = (config: AnyModelConfig): config is Gemma2EncoderModelConfig => { + return config.type === 'gemma2_encoder'; +}; + +export const isPiDDecoderModelConfig = (config: AnyModelConfig): config is PiDDecoderModelConfig => { + return config.type === 'pid_decoder'; +}; + export const isCLIPEmbedModelConfigOrSubmodel = ( config: AnyModelConfig, excludeSubmodels?: boolean