invoke-ai · Pfannkuchensack · Jul 1, 2026 · Jul 1, 2026 · Jul 1, 2026 · Jul 1, 2026
@@ -31,14 +31,14 @@
 from invokeai.app.invocations.model import Qwen3EncoderField
 from invokeai.app.invocations.primitives import AnimaConditioningOutput
 from invokeai.app.services.shared.invocation_context import InvocationContext
-from invokeai.backend.anima.t5_tokenizer import load_bundled_t5_tokenizer
 from invokeai.backend.patches.layer_patcher import LayerPatcher
 from invokeai.backend.patches.lora_conversions.anima_lora_constants import ANIMA_LORA_QWEN3_PREFIX
 from invokeai.backend.patches.model_patch_raw import ModelPatchRaw
 from invokeai.backend.stable_diffusion.diffusion.conditioning_data import (
     AnimaConditioningInfo,
     ConditioningFieldData,
 )
+from invokeai.backend.t5.t5_tokenizer import load_bundled_t5_tokenizer
 from invokeai.backend.util.devices import TorchDevice
 from invokeai.backend.util.logging import InvokeAILogger
 

@@ -101,7 +101,11 @@
     T2IAdapter_Diffusers_SD1_Config,
     T2IAdapter_Diffusers_SDXL_Config,
 )
-from invokeai.backend.model_manager.configs.t5_encoder import T5Encoder_BnBLLMint8_Config, T5Encoder_T5Encoder_Config
+from invokeai.backend.model_manager.configs.t5_encoder import (
+    T5Encoder_BnBLLMint8_Config,
+    T5Encoder_GGUF_Config,
+    T5Encoder_T5Encoder_Config,
+)
 from invokeai.backend.model_manager.configs.text_llm import TextLLM_Diffusers_Config
 from invokeai.backend.model_manager.configs.textual_inversion import (
     TI_File_SD1_Config,
@@ -246,6 +250,7 @@
         # T5 Encoder - all formats
         Annotated[T5Encoder_T5Encoder_Config, T5Encoder_T5Encoder_Config.get_tag()],
         Annotated[T5Encoder_BnBLLMint8_Config, T5Encoder_BnBLLMint8_Config.get_tag()],
+        Annotated[T5Encoder_GGUF_Config, T5Encoder_GGUF_Config.get_tag()],
         # Qwen3 Encoder
         Annotated[Qwen3Encoder_Qwen3Encoder_Config, Qwen3Encoder_Qwen3Encoder_Config.get_tag()],
         Annotated[Qwen3Encoder_Checkpoint_Config, Qwen3Encoder_Checkpoint_Config.get_tag()],

@@ -46,6 +46,20 @@ def _has_ggml_tensors(state_dict: dict[str | int, Any]) -> bool:
     return any(isinstance(v, GGMLTensor) for v in state_dict.values())
 
 
+def _has_t5_encoder_keys(state_dict: dict[str | int, Any]) -> bool:
+    """Check if state dict looks like a llama.cpp T5 encoder.
+
+    T5 encoder GGUFs (e.g. city96/t5-v1_1-xxl-encoder-gguf) carry ``token_embd.weight`` and so satisfy the
+    Qwen3 GGUF key heuristic, but they use the ``enc.`` prefix (``enc.blk.*``, ``enc.output_norm.weight``),
+    which a Qwen3 encoder never has. The test mirrors ``T5Encoder_GGUF_Config`` (prefix match on blocks,
+    suffix match on the final norm), so every file that config accepts is rejected by the Qwen3 configs.
+    """
+    return any(
+        isinstance(key, str) and (key.startswith("enc.blk.") or key.endswith("enc.output_norm.weight"))
+        for key in state_dict
+    )
+
+
 def _has_qwen_vl_visual_tower(state_dict: dict[str | int, Any]) -> bool:
     """Check if state dict bundles a Qwen2.5-VL / Qwen2-VL vision tower.
 
@@ -156,6 +170,10 @@ def _validate_looks_like_qwen3_model(cls, mod: ModelOnDisk) -> None:
         state_dict = mod.load_state_dict()
         if not _has_qwen3_keys(state_dict):
             raise NotAMatchError("state dict does not look like a Qwen3 model")
+        # Reject T5 encoders: they share the token_embd.weight key with Qwen3 GGUFs but use the ``enc.``
+        # block prefix, and must be classified as T5Encoder (Qwen3 encoders never have ``enc.blk.*`` keys).
+        if _has_t5_encoder_keys(state_dict):
+            raise NotAMatchError("state dict looks like a T5 encoder (has 'enc.blk.*' keys), not a Qwen3 encoder")
         # Reject Qwen2.5-VL / Qwen2-VL encoders: they carry a visual tower and must be
         # classified as QwenVLEncoder (text-only Qwen3 encoders never have one).
         if _has_qwen_vl_visual_tower(state_dict):
@@ -297,6 +315,10 @@ def _validate_looks_like_qwen3_model(cls, mod: ModelOnDisk) -> None:
         state_dict = mod.load_state_dict()
         if not _has_qwen3_keys(state_dict):
             raise NotAMatchError("state dict does not look like a Qwen3 model")
+        # Reject T5 encoders: they share the token_embd.weight key with Qwen3 GGUFs but use the ``enc.``
+        # block prefix, and must be classified as T5Encoder (Qwen3 encoders never have ``enc.blk.*`` keys).
+        if _has_t5_encoder_keys(state_dict):
+            raise NotAMatchError("state dict looks like a T5 encoder (has 'enc.blk.*' keys), not a Qwen3 encoder")
         # Reject Qwen2.5-VL / Qwen2-VL encoders: they carry a visual tower and must be
         # classified as QwenVLEncoder (text-only Qwen3 encoders never have one).
         if _has_qwen_vl_visual_tower(state_dict):

@@ -2,16 +2,19 @@
 
 from pydantic import Field
 
-from invokeai.backend.model_manager.configs.base import Config_Base
+from invokeai.backend.model_manager.configs.base import Checkpoint_Config_Base, Config_Base
 from invokeai.backend.model_manager.configs.identification_utils import (
     NotAMatchError,
     raise_for_class_name,
     raise_for_override_fields,
     raise_if_not_dir,
+    raise_if_not_file,
     state_dict_has_any_keys_ending_with,
+    state_dict_has_any_keys_starting_with,
 )
 from invokeai.backend.model_manager.model_on_disk import ModelOnDisk
 from invokeai.backend.model_manager.taxonomy import BaseModelType, ModelFormat, ModelType
+from invokeai.backend.quantization.gguf.ggml_tensor import GGMLTensor
 
 
 class T5Encoder_T5Encoder_Config(Config_Base):
@@ -80,3 +83,43 @@ def raise_if_state_dict_doesnt_look_like_bnb_quantized(cls, mod: ModelOnDisk) ->
         has_scb_key_suffix = state_dict_has_any_keys_ending_with(mod.load_state_dict(), "SCB")
         if not has_scb_key_suffix:
             raise NotAMatchError("state dict does not look like bnb quantized llm_int8")
+
+
+class T5Encoder_GGUF_Config(Checkpoint_Config_Base, Config_Base):
+    """Configuration for GGUF-quantized T5 text encoder models in a single .gguf file.
+
+    These are conversions like city96/t5-v1_1-xxl-encoder-gguf, which use llama.cpp's T5 encoder
+    tensor naming (``enc.blk.N.*``, ``token_embd.weight``, ``enc.output_norm.weight``)."""
+
+    base: Literal[BaseModelType.Any] = Field(default=BaseModelType.Any)
+    type: Literal[ModelType.T5Encoder] = Field(default=ModelType.T5Encoder)
+    format: Literal[ModelFormat.GGUFQuantized] = Field(default=ModelFormat.GGUFQuantized)
+    cpu_only: bool | None = Field(default=None, description="Whether this model should run on CPU only")
+
+    @classmethod
+    def from_model_on_disk(cls, mod: ModelOnDisk, override_fields: dict[str, Any]) -> Self:
+        """Build the config from ``override_fields`` after running the identification checks in order.
+
+        The two checks defined on this class raise NotAMatchError when ``mod`` does not match."""
+        raise_if_not_file(mod)
+        raise_for_override_fields(cls, override_fields)
+        # Key names are checked before tensor types, so a file failing both reports the T5 mismatch.
+        cls.raise_if_doesnt_look_like_t5_encoder(mod)
+        cls.raise_if_doesnt_look_like_gguf_quantized(mod)
+        return cls(**override_fields)
+
+    @classmethod
+    def raise_if_doesnt_look_like_t5_encoder(cls, mod: ModelOnDisk) -> None:
+        """Raise NotAMatchError unless the state dict carries llama.cpp T5 encoder key names."""
+        # The ``enc.`` prefix on blocks and final norm separates T5 encoders from decoder-only GGUF
+        # models (e.g. Qwen3, which uses bare ``blk.*``).
+        tensors = mod.load_state_dict()
+        has_encoder_blocks = state_dict_has_any_keys_starting_with(tensors, "enc.blk.")
+        if not has_encoder_blocks and not state_dict_has_any_keys_ending_with(tensors, "enc.output_norm.weight"):
+            raise NotAMatchError("state dict does not look like a T5 encoder (no 'enc.blk.*' keys)")
+
+    @classmethod
+    def raise_if_doesnt_look_like_gguf_quantized(cls, mod: ModelOnDisk) -> None:
+        """Raise NotAMatchError unless at least one state dict value is a GGMLTensor."""
+        if not any(isinstance(tensor, GGMLTensor) for tensor in mod.load_state_dict().values()):
+            raise NotAMatchError("state dict does not look like GGUF quantized")