diff --git a/SECOND_GPU_TEXT_ENCODER_FEATURE.md b/SECOND_GPU_TEXT_ENCODER_FEATURE.md
new file mode 100644
index 00000000000..c97465d9d1d
--- /dev/null
+++ b/SECOND_GPU_TEXT_ENCODER_FEATURE.md
@@ -0,0 +1,246 @@
+# Second GPU Text Encoder Feature
+
+## Goal
+
+Add an optional runtime setting that keeps the main generation model on the app's configured CUDA GPU while running text encoders on the other CUDA GPU.
+
+Example behavior:
+
+- App device `cuda:0` -> text encoders use `cuda:1`
+- App device `cuda:1` -> text encoders use `cuda:0`
+- App device `cuda` -> normalized by InvokeAI, then the opposite CUDA device is selected
+- Fewer than 2 CUDA GPUs -> setting is unavailable/disabled
+
+This is intentionally a small model-loader/cache feature, not a full multi-GPU scheduler.
+
+## User-Facing Behavior
+
+- Advanced settings shows a toggle: `Use Second GPU for Text Encoder`
+- Toggle is disabled unless the backend reports at least two CUDA GPUs and the app is using CUDA
+- No CPU option is added
+- Setting is persisted through `/api/v1/app/runtime_config`
+- Manual `Clear Model Cache` still clears everything
+
+## First Upstream Concern: Dynamic Toggle Behavior
+
+The toggle should actively change the loaded model state instead of only changing future model loads.
+
+Expected behavior:
+
+- Toggle on:
+ - Persist `use_second_gpu_for_text_encoder=true`
+ - Drop any cached text encoder entries that were loaded on the main device
+ - Trigger/prewarm loading of the currently selected text encoder onto the second CUDA device, if the active workflow/model selection exposes a current encoder
+ - Keep the denoise model resident on the main CUDA device when possible
+
+- Toggle off:
+ - Persist `use_second_gpu_for_text_encoder=false`
+ - Drop protected second-GPU text encoder cache entries
+ - Let the next generation reload the encoder through the normal stock device path
+ - Ideally unload the second-GPU encoder immediately so the GPU memory visibly frees
+
+Implementation options:
+
+- Minimum safe version:
+ - On runtime config update, backend drops cached text encoder cache records affected by the setting change
+ - UI shows the changed setting immediately
+ - Next generation reloads the encoder onto the correct device
+
+- Better interactive version:
+ - Backend exposes a small endpoint to refresh/prewarm current text encoder models after the setting changes
+ - Frontend calls it after the toggle saves
+ - If no current encoder can be inferred, fallback to cache-drop-only behavior
+
+This should be addressed before proposing upstream, because users will expect the toggle to have immediate, visible effect.
+
+## Backend Config/API Changes
+
+### `invokeai/app/services/config/config_default.py`
+
+Add:
+
+```py
+use_second_gpu_for_text_encoder: bool = Field(
+ default=False,
+ description="When at least two CUDA GPUs are available, run text encoder models on the CUDA device that is not the main execution device.",
+)
+```
+
+This belongs near the existing `device` setting.
+
+### `invokeai/app/api/routers/app_info.py`
+
+Expose CUDA device count through `get_app_deps()`:
+
+```py
+deps["CUDA Devices"] = str(cuda_device_count)
+deps[f"CUDA Device {device_index}"] = torch.cuda.get_device_name(device_index)
+```
+
+Allow runtime updates:
+
+```py
+use_second_gpu_for_text_encoder: bool | None = Field(default=None, ...)
+```
+
+## Model Routing Changes
+
+### `invokeai/backend/model_manager/load/load_default.py`
+
+Add text encoder detection for:
+
+- `SubModelType.TextEncoder`
+- `SubModelType.TextEncoder2`
+- `SubModelType.TextEncoder3`
+- `ModelType.CLIPEmbed`
+- `ModelType.T5Encoder`
+- `ModelType.Qwen3Encoder`
+- `ModelType.QwenVLEncoder`
+- `ModelType.TextLLM`
+
+When the setting is enabled and at least two CUDA GPUs exist, return the lowest-indexed CUDA device whose index differs from the app's main execution device (with exactly two GPUs this is simply the other GPU).
+
+The current local version also protects CUDA-resident models from automatic eviction while this mode is active:
+
+```py
+prevent_auto_evict = (
+ self._app_config.use_second_gpu_for_text_encoder and effective_execution_device.type == "cuda"
+)
+```
+
+## Cache Changes Needed
+
+### `invokeai/backend/model_manager/load/model_cache/cache_record.py`
+
+Add:
+
+```py
+prevent_auto_evict: bool = False
+```
+
+Protected entries are skipped by automatic cleanup, but explicit model reloads and manual cache clearing may still remove them.
+
+### `invokeai/backend/model_manager/load/model_cache/model_cache.py`
+
+Add `prevent_auto_evict` to `ModelCache.put()`.
+
+Automatic VRAM offload skips protected entries.
+
+Automatic RAM cache eviction skips protected entries.
+
+Manual cache clear can bypass protection:
+
+```py
+make_room(bytes_needed, preserve_auto_evict_protected=False)
+```
+
+Make VRAM accounting device-aware so loading on `cuda:0` does not try to free models resident on `cuda:1`.
+
+Optional split-GPU cache sizing:
+
+- When split-GPU mode is enabled, calculate the RAM cache cap using total VRAM across CUDA devices
+- Use a larger RAM fraction so paired denoise + encoder models can remain cached
+
+### `invokeai/app/services/model_manager/model_manager_default.py`
+
+Pass the config flag into `ModelCache`:
+
+```py
+use_multi_cuda_ram_cache=app_config.use_second_gpu_for_text_encoder
+```
+
+### `invokeai/app/api/routers/model_manager.py`
+
+Manual empty-cache endpoint should bypass protection:
+
+```py
+ram_cache.make_room(1000 * 2**30, preserve_auto_evict_protected=False)
+```
+
+## Frontend Changes
+
+### New Component
+
+`invokeai/frontend/web/src/features/parameters/components/Advanced/ParamUseSecondGpuForTextEncoder.tsx`
+
+Responsibilities:
+
+- Read `CUDA Devices` from `useGetAppDepsQuery()`
+- Read current runtime config from `useGetRuntimeConfigQuery()`
+- Save changes with `useUpdateRuntimeConfigMutation()`
+- Disable switch if:
+ - runtime config is not loaded
+ - user cannot edit runtime config
+ - CUDA device count is below 2
+ - app device is not CUDA
+
+### Advanced Accordion
+
+`invokeai/frontend/web/src/features/settingsAccordions/components/AdvancedSettingsAccordion/AdvancedSettingsAccordion.tsx`
+
+Render the new toggle once globally in Advanced settings. Do not tie it to specific model picker visibility.
+
+### API Schema
+
+`invokeai/frontend/web/src/services/api/schema.ts`
+
+Add `use_second_gpu_for_text_encoder` to:
+
+- `InvokeAIAppConfig`
+- `UpdateAppGenerationSettingsRequest`
+
+If preparing for upstream, prefer regenerating OpenAPI/types if the official workflow supports it.
+
+## Local-Only Change Not For Upstream
+
+`update.bat` was changed locally so this repo can build with pnpm 10 even when PATH has pnpm 11.
+
+Do not include that in an upstream PR unless upstream wants Windows helper changes.
+
+## Testing Checklist
+
+Test with app device set to `cuda:0`:
+
+- Toggle off: behavior matches stock InvokeAI
+- Toggle on: text encoder VRAM usage spikes on GPU 1 during encoding, and the encoder remains resident on GPU 1 afterwards
+- Denoise remains on GPU 0
+- Running a second generation should reuse the resident encoder
+
+Test with app device set to `cuda:1`:
+
+- Toggle on: text encoder moves to GPU 0
+- Denoise remains on GPU 1
+- Loading encoder should not evict denoise
+- Loading denoise should not evict encoder
+
+Test encoder families:
+
+- FLUX T5
+- SD3 T5
+- CLIP embed paths
+- Qwen3 / Z-Image
+- Qwen VL / Qwen Image
+- Any text LLM path that uses `ModelType.TextLLM`
+
+Test cache controls:
+
+- Manual Clear Model Cache should remove both models
+- After switching the toggle, the new device assignment should take effect on the next cache reload / model load
+- Fewer than two CUDA devices should disable the UI control
+
+## Upstream PR Shape
+
+For upstream, create a clean branch from `upstream/main` and bring over only:
+
+- config/API setting
+- model loader routing
+- cache/device accounting/protection
+- Advanced UI toggle
+- generated schema/type updates
+
+Leave out:
+
+- Batch+ custom page changes
+- local `update.bat`
+- runtime/install wrapper changes
+- unrelated object serializer cleanup
diff --git a/invokeai/app/api/routers/app_info.py b/invokeai/app/api/routers/app_info.py
index 832e58f5e24..bc26dce1199 100644
--- a/invokeai/app/api/routers/app_info.py
+++ b/invokeai/app/api/routers/app_info.py
@@ -1,16 +1,21 @@
+import csv
+import io
import locale
+import subprocess
from enum import Enum
from importlib.metadata import distributions
from pathlib import Path as FilePath
from threading import Lock
from typing import Any
+import psutil
import torch
import yaml
from fastapi import Body, HTTPException, Path
from fastapi.routing import APIRouter
from pydantic import BaseModel, Field, model_validator
+from invokeai.app.invocations.model import ModelIdentifierField
from invokeai.app.api.auth_dependencies import AdminUserOrDefault
from invokeai.app.api.dependencies import ApiDependencies
from invokeai.app.services.config.config_default import (
@@ -25,8 +30,12 @@
from invokeai.app.services.external_generation.external_generation_common import ExternalProviderStatus
from invokeai.app.services.invocation_cache.invocation_cache_common import InvocationCacheStatus
from invokeai.app.services.model_records.model_records_base import UnknownModelException
+from invokeai.app.util.t5_model_identifier import preprocess_t5_encoder_model_identifier
from invokeai.backend.image_util.infill_methods.patchmatch import PatchMatch
+from invokeai.backend.model_manager.load.model_cache.model_cache import get_model_cache_key
from invokeai.backend.model_manager.taxonomy import BaseModelType, ModelType
+from invokeai.backend.model_manager.taxonomy import SubModelType
+from invokeai.backend.util.devices import TorchDevice
from invokeai.backend.util.logging import logging
from invokeai.version import __version__
@@ -63,6 +72,16 @@ async def get_app_deps() -> dict[str, str]:
cuda = "N/A"
deps["CUDA"] = cuda
+ try:
+ cuda_device_count = torch.cuda.device_count() if torch.cuda.is_available() else 0
+ except Exception:
+ cuda_device_count = 0
+ deps["CUDA Devices"] = str(cuda_device_count)
+ for device_index in range(cuda_device_count):
+ try:
+ deps[f"CUDA Device {device_index}"] = torch.cuda.get_device_name(device_index)
+ except Exception:
+ deps[f"CUDA Device {device_index}"] = "Unknown CUDA device"
sorted_deps = dict(sorted(deps.items(), key=lambda item: item[0].lower()))
@@ -131,6 +150,15 @@ class UpdateAppGenerationSettingsRequest(BaseModel):
ge=0,
description="Keep the last N completed, failed, and canceled queue items on startup. Set to 0 to prune all terminal items.",
)
+ model_cache_keep_alive_min: float | None = Field(
+ default=None,
+ ge=0,
+ description="How long to keep unlocked models in cache after last use, in minutes. 0 keeps models indefinitely.",
+ )
+ use_second_gpu_for_text_encoder: bool | None = Field(
+ default=None,
+ description="Run text encoder models on the CUDA device that is not the main execution device when at least two CUDA GPUs are available.",
+ )
@model_validator(mode="after")
def validate_explicit_nulls(self) -> "UpdateAppGenerationSettingsRequest":
@@ -139,6 +167,266 @@ def validate_explicit_nulls(self) -> "UpdateAppGenerationSettingsRequest":
return self
+class SyncTextEncoderCacheRequest(BaseModel):
+ """Request to actively sync selected text encoder cache entries with the second-GPU toggle state."""
+
+ # Required field with no default. NOTE(review): the /text_encoder_cache_status route reuses this
+ # model as its request body but only reads `text_encoder_models`, so its callers must still send `enabled`.
+ enabled: bool = Field(description="Whether second-GPU text encoder mode is enabled.")
+ # Defaults to an empty list, in which case there are no per-model cache entries to drop, prewarm, or report.
+ text_encoder_models: list[ModelIdentifierField] = Field(
+ default_factory=list,
+ description="Selected text encoder models to unload or prewarm.",
+ )
+
+
+class SyncTextEncoderCacheResponse(BaseModel):
+ """Text encoder cache sync result."""
+
+ # Sum of the counts returned by the cache's `drop_cache_key` / `drop_cuda_entries_except` calls.
+ dropped: int = Field(description="Number of cache entries immediately dropped.")
+ # Incremented once per selected model after it has been loaded and placed on its device.
+ loaded: int = Field(description="Number of selected encoder entries loaded onto their target device.")
+ # String forward reference: `TextEncoderCacheStatusResponse` is declared further down in this module.
+ status: "TextEncoderCacheStatusResponse" = Field(description="Text encoder cache status after sync.")
+
+
+class TextEncoderCacheModelStatus(BaseModel):
+ """Status for one selected text encoder cache entry."""
+
+ # Populated by `_get_text_encoder_cache_status` from the (normalized) model identifier and,
+ # when one exists, the cache's snapshot for `cache_key`.
+ key: str = Field(description="Model key.")
+ name: str = Field(description="Model name.")
+ cache_key: str = Field(description="Resolved cache key.")
+ # True only when a snapshot exists and reports more than zero bytes currently in VRAM.
+ loaded: bool = Field(description="Whether the cache entry exists and has weights on its execution device.")
+ # None when the cache has no snapshot for this key.
+ device: str | None = Field(default=None, description="Execution device for the cache entry.")
+ # Both sizes are bytes divided by 1024**3, rounded to 2 decimals; 0 when there is no snapshot.
+ vram_gb: float = Field(description="Estimated model VRAM resident size in GB.")
+ total_gb: float = Field(description="Estimated model size in GB.")
+
+
+class CudaDeviceStatus(BaseModel):
+ """CUDA device memory status."""
+
+ # Built by `_get_cuda_device_statuses`; all *_gb values are bytes / 1024**3 rounded to 2 decimals.
+ index: int = Field(description="CUDA device index.")
+ name: str = Field(description="CUDA device name.")
+ # Derived as total minus free bytes from `torch.cuda.mem_get_info`.
+ used_gb: float = Field(description="Total device memory used in GB, including non-InvokeAI processes.")
+ # Taken from the model cache's per-device usage map; 0 when the device has no entry in that map.
+ invoke_cache_gb: float = Field(description="InvokeAI model cache memory used on this device in GB.")
+ total_gb: float = Field(description="Total device memory in GB.")
+
+
+class TextEncoderCacheStatusResponse(BaseModel):
+ """Selected text encoder cache and CUDA memory status."""
+
+ # One entry per requested model, in request order.
+ models: list[TextEncoderCacheModelStatus] = Field(description="Selected text encoder cache statuses.")
+ # Empty when `torch.cuda.is_available()` is False.
+ cuda_devices: list[CudaDeviceStatus] = Field(description="CUDA memory status.")
+
+
+class SystemGpuStatus(BaseModel):
+ """Basic GPU status."""
+
+ # Built by `_get_system_gpu_statuses`; numeric values are rounded to 1 decimal there.
+ index: int = Field(description="GPU device index.")
+ name: str = Field(description="GPU device name.")
+ # None when nvidia-smi produced no row for this device index (memory then comes from torch instead).
+ utilization_percent: float | None = Field(default=None, description="GPU utilization percent.")
+ loaded_gb: float = Field(description="GPU memory used in GB.")
+ total_gb: float = Field(description="Total GPU memory in GB.")
+
+
+class SystemStatusResponse(BaseModel):
+ """Basic system status."""
+
+ # Built by `_get_system_status`; percentages and memory sizes are rounded to 1 decimal there.
+ cpu_percent: float = Field(description="CPU utilization percent.")
+ # None when no CPU frequency information is available from psutil.
+ cpu_frequency_ghz: float | None = Field(default=None, description="Current CPU frequency in GHz.")
+ # Computed as total minus available memory, divided by 1024**3.
+ memory_used_gb: float = Field(description="System memory used in GB.")
+ memory_total_gb: float = Field(description="Total system memory in GB.")
+ memory_percent: float = Field(description="System memory utilization percent.")
+ # Empty when CUDA is not available.
+ gpus: list[SystemGpuStatus] = Field(description="GPU statuses.")
+
+
+# Standalone encoder model types for which `_normalize_text_encoder_identifier` fills in
+# `SubModelType.TextEncoder` when the identifier carries no submodel type. T5 identifiers are
+# handed to `preprocess_t5_encoder_model_identifier` before this set is consulted.
+# NOTE(review): the loader's `_STANDALONE_TEXT_ENCODER_MODEL_TYPES` also contains `ModelType.TextLLM`,
+# which is not listed here — confirm the omission is intentional.
+_PREWARM_STANDALONE_TEXT_ENCODER_TYPES = {
+ ModelType.CLIPEmbed,
+ ModelType.Qwen3Encoder,
+ ModelType.QwenVLEncoder,
+ ModelType.T5Encoder,
+}
+
+
+def _normalize_text_encoder_identifier(model: ModelIdentifierField) -> ModelIdentifierField:
+    """Return the identifier in the form used to build its model cache key.
+
+    T5 encoders are delegated to `preprocess_t5_encoder_model_identifier`. Main models and the
+    standalone encoder types get `SubModelType.TextEncoder` when no submodel type is set. Every
+    other identifier is returned unchanged.
+    """
+    if model.type == ModelType.T5Encoder:
+        return preprocess_t5_encoder_model_identifier(model)
+
+    # An explicit submodel type is always respected.
+    if model.submodel_type is not None:
+        return model
+
+    defaults_to_text_encoder = model.type == ModelType.Main or model.type in _PREWARM_STANDALONE_TEXT_ENCODER_TYPES
+    if not defaults_to_text_encoder:
+        return model
+    return model.model_copy(update={"submodel_type": SubModelType.TextEncoder})
+
+
+def _get_cuda_device_statuses() -> list[CudaDeviceStatus]:
+    """Report memory usage for every visible CUDA device.
+
+    Returns:
+        One `CudaDeviceStatus` per CUDA device index, or an empty list when CUDA is unavailable.
+        Sizes are bytes divided by 1024**3 and rounded to 2 decimals.
+    """
+    if not torch.cuda.is_available():
+        return []
+
+    bytes_per_gb = 1024**3
+    model_cache = ApiDependencies.invoker.services.model_manager.load.ram_cache
+    cache_bytes_by_device = model_cache.get_cuda_cache_usage_bytes()
+
+    device_statuses: list[CudaDeviceStatus] = []
+    for idx in range(torch.cuda.device_count()):
+        cuda_device = torch.device("cuda", idx)
+        free, total = torch.cuda.mem_get_info(cuda_device)
+        device_statuses.append(
+            CudaDeviceStatus(
+                index=idx,
+                name=torch.cuda.get_device_name(cuda_device),
+                # Device-wide usage (total minus free).
+                used_gb=round((total - free) / bytes_per_gb, 2),
+                # Devices missing from the cache's usage map count as 0 bytes.
+                invoke_cache_gb=round(cache_bytes_by_device.get(idx, 0) / bytes_per_gb, 2),
+                total_gb=round(total / bytes_per_gb, 2),
+            )
+        )
+    return device_statuses
+
+
+def _get_nvidia_smi_statuses() -> dict[int, tuple[float | None, float, float]]:
+    """Query `nvidia-smi` for per-GPU utilization and memory figures.
+
+    Returns:
+        Mapping of nvidia-smi GPU index -> (utilization percent or None, memory used MB, memory total MB).
+        The mapping is empty when nvidia-smi cannot be executed, times out, or exits non-zero.
+        Rows whose index or memory fields cannot be parsed are skipped.
+    """
+    try:
+        result = subprocess.run(
+            [
+                "nvidia-smi",
+                "--query-gpu=index,utilization.gpu,memory.used,memory.total",
+                "--format=csv,noheader,nounits",
+            ],
+            check=False,
+            capture_output=True,
+            text=True,
+            timeout=1,
+        )
+    except (OSError, subprocess.TimeoutExpired):
+        # OSError covers a missing binary (FileNotFoundError) as well as one that cannot be
+        # executed (e.g. PermissionError); this status probe is best-effort either way.
+        return {}
+
+    if result.returncode != 0:
+        return {}
+
+    statuses: dict[int, tuple[float | None, float, float]] = {}
+    for line in result.stdout.splitlines():
+        parts = [part.strip() for part in line.split(",")]
+        if len(parts) != 4:
+            continue
+        try:
+            index = int(parts[0])
+            memory_used_mb = float(parts[2])
+            memory_total_mb = float(parts[3])
+        except ValueError:
+            continue
+        try:
+            utilization_percent: float | None = float(parts[1])
+        except ValueError:
+            # A non-numeric utilization field (nvidia-smi can print placeholders such as "[N/A]")
+            # should not discard the row's valid memory readings; report utilization as unknown.
+            utilization_percent = None
+        statuses[index] = (utilization_percent, memory_used_mb, memory_total_mb)
+    return statuses
+
+
+def _get_system_gpu_statuses() -> list[SystemGpuStatus]:
+    """Collect utilization and memory figures for every visible CUDA device.
+
+    nvidia-smi readings are preferred; a device with no nvidia-smi row falls back to
+    `torch.cuda.mem_get_info` and reports utilization as None. Values are rounded to 1 decimal.
+    """
+    if not torch.cuda.is_available():
+        return []
+
+    # NOTE(review): rows are matched by index, which assumes nvidia-smi and torch number the
+    # GPUs identically — confirm this holds when CUDA_VISIBLE_DEVICES is set.
+    smi_rows = _get_nvidia_smi_statuses()
+    gpu_statuses: list[SystemGpuStatus] = []
+    for idx in range(torch.cuda.device_count()):
+        cuda_device = torch.device("cuda", idx)
+        smi_row = smi_rows.get(idx)
+        if smi_row is None:
+            utilization = None
+            free, total = torch.cuda.mem_get_info(cuda_device)
+            used_gb = (total - free) / (1024**3)
+            capacity_gb = total / (1024**3)
+        else:
+            utilization, used_mb, capacity_mb = smi_row
+            used_gb = used_mb / 1024
+            capacity_gb = capacity_mb / 1024
+
+        device_name = torch.cuda.get_device_name(cuda_device)
+        rounded_utilization = round(utilization, 1) if utilization is not None else None
+        gpu_statuses.append(
+            SystemGpuStatus(
+                index=idx,
+                name=device_name,
+                utilization_percent=rounded_utilization,
+                loaded_gb=round(used_gb, 1),
+                total_gb=round(capacity_gb, 1),
+            )
+        )
+    return gpu_statuses
+
+
+def _get_windows_task_manager_cpu_status() -> tuple[float, float | None] | None:
+    """Read CPU utilization and effective clock speed from Windows performance counters via `typeperf`.
+
+    Returns:
+        `(cpu_percent, cpu_frequency_ghz)` taken from the first parsable sample row, where
+        `cpu_frequency_ghz` is None if no usable base frequency is available. Returns None when not
+        running on Windows, or when typeperf cannot be run, times out, exits non-zero, or prints
+        no parsable sample.
+    """
+    if not psutil.WINDOWS:
+        # typeperf is a Windows tool; avoid spawning a process that is bound to fail elsewhere.
+        return None
+
+    try:
+        result = subprocess.run(
+            [
+                "typeperf",
+                r"\Processor Information(_Total)\% Processor Utility",
+                r"\Processor Information(_Total)\% Processor Performance",
+                "-sc",
+                "1",
+            ],
+            check=False,
+            capture_output=True,
+            text=True,
+            timeout=3,
+        )
+    except (OSError, subprocess.TimeoutExpired):
+        # Best-effort probe: any failure to launch typeperf falls back to the caller's psutil path.
+        return None
+
+    if result.returncode != 0:
+        return None
+
+    for row in csv.reader(io.StringIO(result.stdout)):
+        # Skip short/blank rows and the "(PDH-CSV ..." header row.
+        if len(row) < 3 or row[0].startswith("(PDH-CSV"):
+            continue
+        try:
+            cpu_percent = float(row[1])
+            processor_performance = float(row[2])
+        except ValueError:
+            continue
+
+        cpu_frequency = psutil.cpu_freq()
+        base_frequency_mhz = None if cpu_frequency is None else (cpu_frequency.max or cpu_frequency.current)
+        # A missing or zero base frequency means the clock speed is unknown; report None, not 0 GHz.
+        cpu_frequency_ghz = (
+            round((base_frequency_mhz * processor_performance / 100) / 1000, 2) if base_frequency_mhz else None
+        )
+        return round(cpu_percent, 1), cpu_frequency_ghz
+
+    return None
+
+
+def _get_system_status() -> SystemStatusResponse:
+    """Assemble CPU, RAM and GPU figures for the system status endpoint.
+
+    CPU figures come from `_get_windows_task_manager_cpu_status` when it yields a result; otherwise
+    from psutil, using a 0.2 second utilization sample.
+    """
+    bytes_per_gb = 1024**3
+    vm = psutil.virtual_memory()
+
+    windows_status = _get_windows_task_manager_cpu_status()
+    if windows_status is not None:
+        cpu_percent, cpu_frequency_ghz = windows_status
+    else:
+        freq = psutil.cpu_freq()
+        cpu_percent = round(psutil.cpu_percent(interval=0.2), 1)
+        cpu_frequency_ghz = round(freq.current / 1000, 2) if freq is not None else None
+
+    return SystemStatusResponse(
+        cpu_percent=cpu_percent,
+        cpu_frequency_ghz=cpu_frequency_ghz,
+        # "Used" is everything that is not available.
+        memory_used_gb=round((vm.total - vm.available) / bytes_per_gb, 1),
+        memory_total_gb=round(vm.total / bytes_per_gb, 1),
+        memory_percent=round(vm.percent, 1),
+        gpus=_get_system_gpu_statuses(),
+    )
+
+
+def _get_text_encoder_cache_status(models: list[ModelIdentifierField]) -> TextEncoderCacheStatusResponse:
+    """Describe the cache state of each given text encoder, plus per-device CUDA memory usage.
+
+    Each identifier is normalized first so that the cache key looked up here matches the one used
+    when syncing. Models with no cache snapshot are reported as not loaded, with zero sizes and
+    no device.
+    """
+    model_cache = ApiDependencies.invoker.services.model_manager.load.ram_cache
+    bytes_per_gb = 1024**3
+
+    identifiers = [_normalize_text_encoder_identifier(model) for model in models]
+    model_statuses: list[TextEncoderCacheModelStatus] = []
+    for identifier in identifiers:
+        cache_key = get_model_cache_key(identifier.key, identifier.submodel_type)
+        snapshot = model_cache.get_cache_entry_snapshot(cache_key)
+        if snapshot is None:
+            vram_bytes, size_bytes, device = 0, 0, None
+        else:
+            vram_bytes = snapshot.current_vram_bytes
+            size_bytes = snapshot.total_bytes
+            device = snapshot.compute_device
+
+        model_statuses.append(
+            TextEncoderCacheModelStatus(
+                key=identifier.key,
+                name=identifier.name,
+                cache_key=cache_key,
+                loaded=snapshot is not None and vram_bytes > 0,
+                device=device,
+                vram_gb=round(vram_bytes / bytes_per_gb, 2),
+                total_gb=round(size_bytes / bytes_per_gb, 2),
+            )
+        )
+    return TextEncoderCacheStatusResponse(models=model_statuses, cuda_devices=_get_cuda_device_statuses())
+
+
@app_router.get(
"/runtime_config", operation_id="get_runtime_config", status_code=200, response_model=InvokeAIAppConfigWithSetFields
)
@@ -161,6 +449,14 @@ async def update_runtime_config(
config = get_config()
update_dict = changes.model_dump(exclude_unset=True)
config.update_config(update_dict)
+ if "model_cache_keep_alive_min" in update_dict:
+ ApiDependencies.invoker.services.model_manager.load.ram_cache.set_keep_alive_minutes(
+ config.model_cache_keep_alive_min
+ )
+ if update_dict.get("use_second_gpu_for_text_encoder") is False:
+ ApiDependencies.invoker.services.model_manager.load.ram_cache.drop_cuda_entries_except(
+ keep_execution_device=TorchDevice.choose_torch_device()
+ )
if config.config_file_path.exists():
persisted_config = load_and_migrate_config(config.config_file_path)
@@ -172,6 +468,68 @@ async def update_runtime_config(
return InvokeAIAppConfigWithSetFields(set_fields=config.model_fields_set, config=config)
+@app_router.post(
+    "/sync_text_encoder_cache",
+    operation_id="sync_text_encoder_cache",
+    status_code=200,
+    response_model=SyncTextEncoderCacheResponse,
+)
+async def sync_text_encoder_cache(
+    _: AdminUserOrDefault,
+    request: SyncTextEncoderCacheRequest = Body(description="Selected text encoder cache sync request"),
+) -> SyncTextEncoderCacheResponse:
+    """Drop the selected text encoders from the model cache and, when enabled, reload them.
+
+    When `request.enabled` is False, the selected entries are dropped along with CUDA entries that
+    are not on the app's execution device, and nothing is reloaded. When True, each selected
+    encoder is dropped and then loaded again so it lands on the device the loader picks for it.
+
+    Raises:
+        HTTPException: 404 if `request.enabled` is True and a selected model key is unknown. The
+            keys are resolved before the cache is modified, so the cache is left untouched.
+    """
+    model_manager = ApiDependencies.invoker.services.model_manager
+    ram_cache = model_manager.load.ram_cache
+    normalized_models = [_normalize_text_encoder_identifier(model) for model in request.text_encoder_models]
+
+    # Resolve every config up front so an unknown key fails the request before any entry is
+    # dropped or any other encoder has been prewarmed.
+    configs = []
+    if request.enabled:
+        for model in normalized_models:
+            try:
+                configs.append(model_manager.store.get_model(model.key))
+            except UnknownModelException as err:
+                raise HTTPException(status_code=404, detail=f"Unknown model: {model.key}") from err
+
+    dropped = 0
+    loaded = 0
+    for model in normalized_models:
+        dropped += ram_cache.drop_cache_key(get_model_cache_key(model.key, model.submodel_type))
+
+    if not request.enabled:
+        dropped += ram_cache.drop_cuda_entries_except(keep_execution_device=TorchDevice.choose_torch_device())
+        return SyncTextEncoderCacheResponse(
+            dropped=dropped, loaded=loaded, status=_get_text_encoder_cache_status(request.text_encoder_models)
+        )
+
+    # NOTE(review): `load_model` is a synchronous call inside an async handler, so the event loop
+    # is blocked while each encoder loads.
+    for model, config in zip(normalized_models, configs):
+        try:
+            loaded_model = model_manager.load.load_model(config, model.submodel_type)
+            # The model is not used here; entering and leaving the context is the prewarm step
+            # (presumably this places the weights on the execution device — confirm).
+            with loaded_model.model_on_device():
+                pass
+            loaded += 1
+        except UnknownModelException as err:
+            raise HTTPException(status_code=404, detail=f"Unknown model: {model.key}") from err
+
+    return SyncTextEncoderCacheResponse(
+        dropped=dropped, loaded=loaded, status=_get_text_encoder_cache_status(request.text_encoder_models)
+    )
+
+
+@app_router.post(
+ "/text_encoder_cache_status",
+ operation_id="get_text_encoder_cache_status",
+ status_code=200,
+ response_model=TextEncoderCacheStatusResponse,
+)
+async def get_text_encoder_cache_status(
+ _: AdminUserOrDefault,
+ request: SyncTextEncoderCacheRequest = Body(description="Selected text encoder cache status request"),
+) -> TextEncoderCacheStatusResponse:
+ """Report cache status for the selected text encoders without modifying the cache.
+
+ Only `request.text_encoder_models` is read; `request.enabled` is ignored by this route.
+ """
+ return _get_text_encoder_cache_status(request.text_encoder_models)
+
+
+@app_router.get(
+    "/system_status",
+    operation_id="get_system_status",
+    status_code=200,
+    response_model=SystemStatusResponse,
+)
+async def get_system_status(_: AdminUserOrDefault) -> SystemStatusResponse:
+    """Return CPU, RAM and GPU usage figures for the host."""
+    # Function-scope import so the file's top-level import block does not need to change.
+    import asyncio
+
+    # `_get_system_status` blocks: it runs subprocesses with multi-second timeouts and samples
+    # CPU usage over a 0.2 s interval. Run it in a worker thread so other requests keep being
+    # served while the figures are collected.
+    return await asyncio.to_thread(_get_system_status)
+
+
@app_router.get(
"/external_providers/status",
operation_id="get_external_provider_statuses",
diff --git a/invokeai/app/api/routers/model_manager.py b/invokeai/app/api/routers/model_manager.py
index bdd2e406444..71cbbff2f87 100644
--- a/invokeai/app/api/routers/model_manager.py
+++ b/invokeai/app/api/routers/model_manager.py
@@ -1306,7 +1306,9 @@ async def empty_model_cache(current_admin: AdminUserOrDefault) -> None:
"""Drop all models from the model cache to free RAM/VRAM. 'Locked' models that are in active use will not be dropped."""
# Request 1000GB of room in order to force the cache to drop all models.
ApiDependencies.invoker.services.logger.info("Emptying model cache.")
- ApiDependencies.invoker.services.model_manager.load.ram_cache.make_room(1000 * 2**30)
+ ApiDependencies.invoker.services.model_manager.load.ram_cache.make_room(
+ 1000 * 2**30, preserve_auto_evict_protected=False
+ )
class HFTokenStatus(str, Enum):
diff --git a/invokeai/app/services/config/config_default.py b/invokeai/app/services/config/config_default.py
index e6cc7c2798c..35d946a68e8 100644
--- a/invokeai/app/services/config/config_default.py
+++ b/invokeai/app/services/config/config_default.py
@@ -107,6 +107,7 @@ class InvokeAIAppConfig(BaseSettings):
lazy_offload: DEPRECATED: This setting is no longer used. Lazy-offloading is enabled by default. This config setting will be removed once the new model cache behavior is stable.
pytorch_cuda_alloc_conf: Configure the Torch CUDA memory allocator. This will impact peak reserved VRAM usage and performance. Setting to "backend:cudaMallocAsync" works well on many systems. The optimal configuration is highly dependent on the system configuration (device type, VRAM, CUDA driver version, etc.), so must be tuned experimentally.
device: Preferred execution device. `auto` will choose the device depending on the hardware platform and the installed torch capabilities.
Valid values: `auto`, `cpu`, `cuda`, `mps`, `cuda:N` (where N is a device number)
+ use_second_gpu_for_text_encoder: When at least two CUDA GPUs are available, run text encoder models on the CUDA device that is not the main execution device.
precision: Floating point precision. `float16` will consume half the memory of `float32` but produce slightly lower-quality images. The `auto` setting will guess the proper precision based on your video card and operating system.
Valid values: `auto`, `float16`, `bfloat16`, `float32`
sequential_guidance: Whether to calculate guidance in serial instead of in parallel, lowering memory requirements.
attention_type: Attention type.
Valid values: `auto`, `normal`, `xformers`, `sliced`, `torch-sdp`
@@ -205,6 +206,7 @@ class InvokeAIAppConfig(BaseSettings):
# DEVICE
device: str = Field(default="auto", description="Preferred execution device. `auto` will choose the device depending on the hardware platform and the installed torch capabilities.
Valid values: `auto`, `cpu`, `cuda`, `mps`, `cuda:N` (where N is a device number)", pattern=r"^(auto|cpu|mps|cuda(:\d+)?)$")
+ use_second_gpu_for_text_encoder: bool = Field(default=False, description="When at least two CUDA GPUs are available, run text encoder models on the CUDA device that is not the main execution device.")
precision: PRECISION = Field(default="auto", description="Floating point precision. `float16` will consume half the memory of `float32` but produce slightly lower-quality images. The `auto` setting will guess the proper precision based on your video card and operating system.")
# GENERATION
diff --git a/invokeai/app/services/model_manager/model_manager_default.py b/invokeai/app/services/model_manager/model_manager_default.py
index 6141a635f4d..ebed92ab454 100644
--- a/invokeai/app/services/model_manager/model_manager_default.py
+++ b/invokeai/app/services/model_manager/model_manager_default.py
@@ -96,6 +96,7 @@ def build_model_manager(
log_memory_usage=app_config.log_memory_usage,
logger=logger,
keep_alive_minutes=app_config.model_cache_keep_alive_min,
+ use_multi_cuda_ram_cache=app_config.use_second_gpu_for_text_encoder,
)
loader = ModelLoadService(
app_config=app_config,
diff --git a/invokeai/backend/model_manager/load/load_default.py b/invokeai/backend/model_manager/load/load_default.py
index 040b55cb6ec..585a1aad82e 100644
--- a/invokeai/backend/model_manager/load/load_default.py
+++ b/invokeai/backend/model_manager/load/load_default.py
@@ -18,6 +18,7 @@
from invokeai.backend.model_manager.load.optimizations import skip_torch_weight_init
from invokeai.backend.model_manager.taxonomy import (
AnyModel,
+ ModelType,
SubModelType,
)
from invokeai.backend.util.devices import TorchDevice
@@ -51,6 +52,20 @@
r"^proj_out$",
)
+# Submodel slots that hold a text encoder. Used both for the existing `cpu_only` check and for
+# deciding whether a submodel should be routed to the second CUDA device.
+_TEXT_ENCODER_SUBMODEL_TYPES = {
+ SubModelType.TextEncoder,
+ SubModelType.TextEncoder2,
+ SubModelType.TextEncoder3,
+}
+
+# Model types that are text encoders in their own right (no submodel type needed); a config whose
+# `type` is in this set is routed to the second CUDA device when the setting is enabled.
+_STANDALONE_TEXT_ENCODER_MODEL_TYPES = {
+ ModelType.CLIPEmbed,
+ ModelType.Qwen3Encoder,
+ ModelType.QwenVLEncoder,
+ ModelType.T5Encoder,
+ ModelType.TextLLM,
+}
+
# TO DO: The loader is not thread safe!
class ModelLoader(ModelLoaderBase):
@@ -103,17 +118,19 @@ def _get_execution_device(
) -> Optional[torch.device]:
"""Determine the execution device for a model based on its configuration.
- CPU-only execution is only applied to text encoder submodels to save VRAM while keeping
- the denoiser on GPU for performance. Conditioning tensors are moved to GPU after encoding.
-
Returns:
- torch.device("cpu") if the model should run on CPU only, None otherwise (use cache default).
+ A specific execution device if the model should run somewhere other than the cache default.
"""
+ if self._should_use_second_gpu_for_text_encoder(config, submodel_type):
+ second_cuda_device = self._get_second_cuda_device()
+ if second_cuda_device is not None:
+ return second_cuda_device
+
# Check if this is a text encoder submodel of a main model with cpu_only setting
if hasattr(config, "default_settings") and config.default_settings is not None:
if hasattr(config.default_settings, "cpu_only") and config.default_settings.cpu_only is True:
# Only apply CPU execution to text encoder submodels
- if submodel_type in [SubModelType.TextEncoder, SubModelType.TextEncoder2, SubModelType.TextEncoder3]:
+ if submodel_type in _TEXT_ENCODER_SUBMODEL_TYPES:
return torch.device("cpu")
# Check if this is a standalone text encoder config with cpu_only field (T5Encoder, Qwen3Encoder, etc.)
@@ -122,6 +139,28 @@ def _get_execution_device(
return None
+ def _should_use_second_gpu_for_text_encoder(
+     self, config: AnyModelConfig, submodel_type: Optional[SubModelType] = None
+ ) -> bool:
+     """Decide whether this load is a text encoder that should be routed to the second CUDA device.
+
+     Returns:
+         False whenever the `use_second_gpu_for_text_encoder` setting is off. Otherwise True when
+         `submodel_type` is a text encoder slot or the config's `type` is a standalone encoder type.
+     """
+     feature_enabled = self._app_config.use_second_gpu_for_text_encoder
+     if not feature_enabled:
+         return False
+
+     is_encoder_submodel = submodel_type in _TEXT_ENCODER_SUBMODEL_TYPES
+     # Configs without a `type` attribute are treated as non-encoders.
+     return is_encoder_submodel or getattr(config, "type", None) in _STANDALONE_TEXT_ENCODER_MODEL_TYPES
+
+ def _get_second_cuda_device(self) -> Optional[torch.device]:
+     """Pick the CUDA device to use for text encoders.
+
+     Returns:
+         The lowest-indexed CUDA device whose index differs from the main device, or None when the
+         main device is not CUDA, CUDA is unavailable, or fewer than two CUDA devices exist.
+     """
+     main_device = self._torch_device
+     if main_device.type != "cuda":
+         return None
+     if not torch.cuda.is_available():
+         return None
+     if torch.cuda.device_count() < 2:
+         return None
+
+     # A bare "cuda" device has no index; use torch's current device in that case.
+     main_index = main_device.index if main_device.index is not None else torch.cuda.current_device()
+     other_index = next((idx for idx in range(torch.cuda.device_count()) if idx != main_index), None)
+     return None if other_index is None else torch.device("cuda", other_index)
+
def _load_and_cache(self, config: AnyModelConfig, submodel_type: Optional[SubModelType] = None) -> CacheRecord:
stats_name = ":".join([config.base, config.type, config.name, (submodel_type or "")])
try:
@@ -133,13 +172,17 @@ def _load_and_cache(self, config: AnyModelConfig, submodel_type: Optional[SubMod
self._ram_cache.make_room(self.get_size_fs(config, Path(config.path), submodel_type))
loaded_model = self._load_model(config, submodel_type)
- # Determine execution device from model config, considering submodel type
execution_device = self._get_execution_device(config, submodel_type)
+ effective_execution_device = execution_device or self._torch_device
+ prevent_auto_evict = (
+ self._app_config.use_second_gpu_for_text_encoder and effective_execution_device.type == "cuda"
+ )
self._ram_cache.put(
get_model_cache_key(config.key, submodel_type),
model=loaded_model,
execution_device=execution_device,
+ prevent_auto_evict=prevent_auto_evict,
)
return self._ram_cache.get(key=get_model_cache_key(config.key, submodel_type), stats_name=stats_name)
diff --git a/invokeai/backend/model_manager/load/model_cache/cache_record.py b/invokeai/backend/model_manager/load/model_cache/cache_record.py
index 5b4880a177c..142a4d85618 100644
--- a/invokeai/backend/model_manager/load/model_cache/cache_record.py
+++ b/invokeai/backend/model_manager/load/model_cache/cache_record.py
@@ -16,6 +16,9 @@ class CacheRecord:
key: str
# Model in memory.
cached_model: CachedModelWithPartialLoad | CachedModelOnlyFullLoad
+ # Protected entries are skipped by automatic RAM/VRAM eviction. Explicit cache clears
+ # and model reloads may still drop them.
+ prevent_auto_evict: bool = False
_locks: int = 0
# Set by ModelCache.drop_model() when the entry was locked at invalidation time.
# ModelCache.unlock() evicts the entry as soon as the last lock releases so a setting
diff --git a/invokeai/backend/model_manager/load/model_cache/model_cache.py b/invokeai/backend/model_manager/load/model_cache/model_cache.py
index e3a0928e52b..6328fa3b81c 100644
--- a/invokeai/backend/model_manager/load/model_cache/model_cache.py
+++ b/invokeai/backend/model_manager/load/model_cache/model_cache.py
@@ -75,6 +75,7 @@ class CacheEntrySnapshot:
cache_key: str
total_bytes: int
current_vram_bytes: int
+ compute_device: str
class CacheMissCallback(Protocol):
@@ -148,6 +149,7 @@ def __init__(
log_memory_usage: bool = False,
logger: Optional[Logger] = None,
keep_alive_minutes: float = 0,
+ use_multi_cuda_ram_cache: bool = False,
):
"""Initialize the model RAM cache.
@@ -168,6 +170,7 @@ def __init__(
behaviour.
:param logger: InvokeAILogger to use (otherwise creates one)
:param keep_alive_minutes: How long to keep models in cache after last use (in minutes). 0 means keep indefinitely.
+ :param use_multi_cuda_ram_cache: Increase the RAM cache budget for dual-CUDA workflows.
"""
self._enable_partial_loading = enable_partial_loading
self._keep_ram_copy_of_weights = keep_ram_copy_of_weights
@@ -177,6 +180,7 @@ def __init__(
self._max_ram_cache_size_gb = max_ram_cache_size_gb
self._max_vram_cache_size_gb = max_vram_cache_size_gb
+ self._use_multi_cuda_ram_cache = use_multi_cuda_ram_cache
self._logger = PrefixedLoggerAdapter(
logger or InvokeAILogger.get_logger(self.__class__.__name__), "MODEL CACHE"
@@ -314,9 +318,25 @@ def shutdown(self) -> None:
self._timeout_timer.cancel()
self._timeout_timer = None
+    @synchronized
+    def set_keep_alive_minutes(self, keep_alive_minutes: float) -> None:
+        """Store a new keep-alive timeout and discard any timer armed under the old one.
+
+        Any pending timeout timer is cancelled and cleared. When the new timeout is positive, activity is recorded
+        via `_record_activity()` (defined outside this view; presumably this re-arms the timer - confirm).
+        """
+        self._keep_alive_minutes = keep_alive_minutes
+
+        pending_timer = self._timeout_timer
+        if pending_timer is not None:
+            pending_timer.cancel()
+            self._timeout_timer = None
+
+        if keep_alive_minutes > 0:
+            self._record_activity()
+
@synchronized
@record_activity
- def put(self, key: str, model: AnyModel, execution_device: Optional[torch.device] = None) -> None:
+ def put(
+ self,
+ key: str,
+ model: AnyModel,
+ execution_device: Optional[torch.device] = None,
+ prevent_auto_evict: bool = False,
+ ) -> None:
"""Add a model to the cache.
Args:
@@ -324,6 +344,7 @@ def put(self, key: str, model: AnyModel, execution_device: Optional[torch.device
model: The model to cache
execution_device: Optional device to use for this specific model. If None, uses the cache's default
execution_device. Use torch.device("cpu") to force a model to run on CPU.
+ prevent_auto_evict: Whether to keep this model resident during automatic RAM/VRAM pressure cleanup.
"""
if key in self._cached_models:
self._logger.debug(
@@ -357,11 +378,12 @@ def put(self, key: str, model: AnyModel, execution_device: Optional[torch.device
model, effective_execution_device, size, keep_ram_copy=self._keep_ram_copy_of_weights
)
- cache_record = CacheRecord(key=key, cached_model=wrapped_model)
+ cache_record = CacheRecord(key=key, cached_model=wrapped_model, prevent_auto_evict=prevent_auto_evict)
self._cached_models[key] = cache_record
self._cache_stack.append(key)
self._logger.debug(
- f"Added model {key} (Type: {model.__class__.__name__}, Wrap mode: {wrapped_model.__class__.__name__}, Model size: {size / MB:.2f}MB)"
+ f"Added model {key} (Type: {model.__class__.__name__}, Wrap mode: {wrapped_model.__class__.__name__}, "
+ f"Model size: {size / MB:.2f}MB, Prevent auto-evict: {prevent_auto_evict})"
)
@synchronized
@@ -374,6 +396,7 @@ def _get_cache_snapshot(self) -> dict[str, CacheEntrySnapshot]:
cache_key=cache_key,
total_bytes=total_bytes,
current_vram_bytes=current_vram_bytes,
+ compute_device=str(cache_entry.cached_model.compute_device),
)
return overview
@@ -507,7 +530,8 @@ def _load_locked_model(self, cache_entry: CacheRecord, working_mem_bytes: Option
model_total_bytes = cache_entry.cached_model.total_bytes()
model_vram_needed = model_total_bytes - model_cur_vram_bytes
- vram_available = self._get_vram_available(working_mem_bytes)
+ model_compute_device = cache_entry.cached_model.compute_device
+ vram_available = self._get_vram_available(working_mem_bytes, model_compute_device)
self._logger.debug(
f"Before unloading: {self._get_vram_state_str(model_cur_vram_bytes, model_total_bytes, vram_available)}"
)
@@ -516,11 +540,11 @@ def _load_locked_model(self, cache_entry: CacheRecord, working_mem_bytes: Option
# 1. If the model can fit entirely in VRAM, then make enough room for it to be loaded fully.
# 2. If the model can't fit fully into VRAM, then unload all other models and load as much of the model as
# possible.
- vram_bytes_freed = self._offload_unlocked_models(model_vram_needed, working_mem_bytes)
+ vram_bytes_freed = self._offload_unlocked_models(model_vram_needed, working_mem_bytes, model_compute_device)
self._logger.debug(f"Unloaded models (if necessary): vram_bytes_freed={(vram_bytes_freed / MB):.2f}MB")
# Check the updated vram_available after offloading.
- vram_available = self._get_vram_available(working_mem_bytes)
+ vram_available = self._get_vram_available(working_mem_bytes, model_compute_device)
self._logger.debug(
f"After unloading: {self._get_vram_state_str(model_cur_vram_bytes, model_total_bytes, vram_available)}"
)
@@ -529,7 +553,7 @@ def _load_locked_model(self, cache_entry: CacheRecord, working_mem_bytes: Option
# There is insufficient VRAM available. As a last resort, try to unload the model being locked from VRAM,
# as it may still be loaded from a previous use.
vram_bytes_freed_from_own_model = self._move_model_to_ram(cache_entry, -vram_available)
- vram_available = self._get_vram_available(working_mem_bytes)
+ vram_available = self._get_vram_available(working_mem_bytes, model_compute_device)
self._logger.debug(
f"Unloaded {vram_bytes_freed_from_own_model / MB:.2f}MB from the model being locked ({cache_entry.key})."
)
@@ -542,7 +566,7 @@ def _load_locked_model(self, cache_entry: CacheRecord, working_mem_bytes: Option
model_bytes_loaded = self._move_model_to_vram(cache_entry, vram_available + MB)
model_cur_vram_bytes = cache_entry.cached_model.cur_vram_bytes()
- vram_available = self._get_vram_available(working_mem_bytes)
+ vram_available = self._get_vram_available(working_mem_bytes, model_compute_device)
loaded_percent = model_cur_vram_bytes / model_total_bytes if model_total_bytes > 0 else 0
# Use the model's actual compute_device for logging, not the cache's default
model_device = cache_entry.cached_model.compute_device
@@ -590,46 +614,52 @@ def _move_model_to_ram(self, cache_entry: CacheRecord, vram_bytes_to_free: int)
self._delete_cache_entry(cache_entry)
raise
- def _get_vram_available(self, working_mem_bytes: Optional[int]) -> int:
+ def _get_vram_available(
+ self, working_mem_bytes: Optional[int], execution_device: Optional[torch.device] = None
+ ) -> int:
"""Calculate the amount of additional VRAM available for the cache to use (takes into account the working
memory).
"""
+ execution_device = execution_device or self._execution_device
+
# If self._max_vram_cache_size_gb is set, then it overrides the default logic.
if self._max_vram_cache_size_gb is not None:
vram_total_available_to_cache = int(self._max_vram_cache_size_gb * GB)
- return vram_total_available_to_cache - self._get_vram_in_use()
+ return vram_total_available_to_cache - self._get_vram_in_use(execution_device)
working_mem_bytes_default = int(self._execution_device_working_mem_gb * GB)
working_mem_bytes = max(working_mem_bytes or working_mem_bytes_default, working_mem_bytes_default)
- if self._execution_device.type == "cuda":
+ if execution_device.type == "cuda":
# TODO(ryand): It is debatable whether we should use memory_reserved() or memory_allocated() here.
# memory_reserved() includes memory reserved by the torch CUDA memory allocator that may or may not be
# re-used for future allocations. For now, we use memory_allocated() to be conservative.
- # vram_reserved = torch.cuda.memory_reserved(self._execution_device)
- vram_allocated = torch.cuda.memory_allocated(self._execution_device)
- vram_free, _vram_total = torch.cuda.mem_get_info(self._execution_device)
+ # vram_reserved = torch.cuda.memory_reserved(execution_device)
+ vram_allocated = torch.cuda.memory_allocated(execution_device)
+ vram_free, _vram_total = torch.cuda.mem_get_info(execution_device)
vram_available_to_process = vram_free + vram_allocated
- elif self._execution_device.type == "mps":
+ elif execution_device.type == "mps":
vram_reserved = torch.mps.driver_allocated_memory()
# TODO(ryand): Is it accurate that MPS shares memory with the CPU?
vram_free = psutil.virtual_memory().available
vram_available_to_process = vram_free + vram_reserved
else:
- raise ValueError(f"Unsupported execution device: {self._execution_device.type}")
+ raise ValueError(f"Unsupported execution device: {execution_device.type}")
vram_total_available_to_cache = vram_available_to_process - working_mem_bytes
- vram_cur_available_to_cache = vram_total_available_to_cache - self._get_vram_in_use()
+ vram_cur_available_to_cache = vram_total_available_to_cache - self._get_vram_in_use(execution_device)
return vram_cur_available_to_cache
- def _get_vram_in_use(self) -> int:
+ def _get_vram_in_use(self, execution_device: Optional[torch.device] = None) -> int:
"""Get the amount of VRAM currently in use by the cache."""
- if self._execution_device.type == "cuda":
- return torch.cuda.memory_allocated()
- elif self._execution_device.type == "mps":
+ execution_device = execution_device or self._execution_device
+
+ if execution_device.type == "cuda":
+ return torch.cuda.memory_allocated(execution_device)
+ elif execution_device.type == "mps":
return torch.mps.current_allocated_memory()
else:
- raise ValueError(f"Unsupported execution device type: {self._execution_device.type}")
+ raise ValueError(f"Unsupported execution device type: {execution_device.type}")
# Alternative definition of VRAM in use:
# return sum(ce.cached_model.cur_vram_bytes() for ce in self._cached_models.values())
@@ -663,7 +693,13 @@ def _calc_ram_available_to_model_cache(self) -> int:
# Lookup the total VRAM size for the CUDA execution device.
total_cuda_vram_bytes: int | None = None
if self._execution_device.type == "cuda":
- _, total_cuda_vram_bytes = torch.cuda.mem_get_info(self._execution_device)
+ if self._use_multi_cuda_ram_cache and torch.cuda.device_count() > 1:
+ total_cuda_vram_bytes = 0
+ for device_index in range(torch.cuda.device_count()):
+ _, device_total_vram_bytes = torch.cuda.mem_get_info(torch.device("cuda", device_index))
+ total_cuda_vram_bytes += device_total_vram_bytes
+ else:
+ _, total_cuda_vram_bytes = torch.cuda.mem_get_info(self._execution_device)
# Apply heuristic 1.
# ------------------
@@ -671,7 +707,8 @@ def _calc_ram_available_to_model_cache(self) -> int:
total_system_ram_bytes = psutil.virtual_memory().total
# Assumed baseline RAM used by InvokeAI for non-model stuff.
baseline_ram_used_by_invokeai = 2 * GB
- ram_available_to_model_cache = int(total_system_ram_bytes * 0.5 - baseline_ram_used_by_invokeai)
+ ram_cache_fraction = 0.75 if self._use_multi_cuda_ram_cache else 0.5
+ ram_available_to_model_cache = int(total_system_ram_bytes * ram_cache_fraction - baseline_ram_used_by_invokeai)
# Apply heuristic 2.
# ------------------
@@ -680,7 +717,10 @@ def _calc_ram_available_to_model_cache(self) -> int:
if self._max_vram_cache_size_gb is not None:
max_ram_cache_size_bytes = int(self._max_vram_cache_size_gb * GB)
else:
- max_ram_cache_size_bytes = total_cuda_vram_bytes - int(self._execution_device_working_mem_gb * GB)
+ cuda_device_count = torch.cuda.device_count() if self._use_multi_cuda_ram_cache else 1
+ max_ram_cache_size_bytes = total_cuda_vram_bytes - int(
+ self._execution_device_working_mem_gb * cuda_device_count * GB
+ )
if ram_available_to_model_cache > max_ram_cache_size_bytes:
heuristics_applied.append(2)
ram_available_to_model_cache = max_ram_cache_size_bytes
@@ -719,7 +759,12 @@ def _get_vram_state_str(self, model_cur_vram_bytes: int, model_total_bytes: int,
+ f"vram_available={(vram_available / MB):.0f} MB, "
)
- def _offload_unlocked_models(self, vram_bytes_required: int, working_mem_bytes: Optional[int] = None) -> int:
+ def _offload_unlocked_models(
+ self,
+ vram_bytes_required: int,
+ working_mem_bytes: Optional[int] = None,
+ execution_device: Optional[torch.device] = None,
+ ) -> int:
"""Offload models from the execution_device until vram_bytes_required bytes are available, or all models are
offloaded. Of course, locked models are not offloaded.
@@ -729,15 +774,20 @@ def _offload_unlocked_models(self, vram_bytes_required: int, working_mem_bytes:
self._logger.debug(
f"Offloading unlocked models with goal of making room for {vram_bytes_required / MB:.2f}MB of VRAM."
)
+ execution_device = execution_device or self._execution_device
vram_bytes_freed = 0
# TODO(ryand): Give more thought to the offloading policy used here.
cache_entries_increasing_size = sorted(self._cached_models.values(), key=lambda x: x.cached_model.total_bytes())
for cache_entry in cache_entries_increasing_size:
# We do not fully trust the count of bytes freed, so we check again on each iteration.
- vram_available = self._get_vram_available(working_mem_bytes)
+ vram_available = self._get_vram_available(working_mem_bytes, execution_device)
vram_bytes_to_free = vram_bytes_required - vram_available
if vram_bytes_to_free <= 0:
break
+ if cache_entry.cached_model.compute_device != execution_device:
+ continue
+ if cache_entry.prevent_auto_evict:
+ continue
if cache_entry.is_locked:
# TODO(ryand): In the future, we may want to partially unload locked models, but this requires careful
# handling of model patches (e.g. LoRA).
@@ -820,16 +870,16 @@ def _log_cache_state(self, title: str = "Model cache state:", include_entry_deta
self._logger.debug(log)
@synchronized
- def make_room(self, bytes_needed: int) -> None:
+ def make_room(self, bytes_needed: int, preserve_auto_evict_protected: bool = True) -> None:
"""Make enough room in the cache to accommodate a new model of indicated size.
Note: This function deletes all of the cache's internal references to a model in order to free it. If there are
external references to the model, there's nothing that the cache can do about it, and those models will not be
garbage-collected.
"""
- self._make_room_internal(bytes_needed)
+ self._make_room_internal(bytes_needed, preserve_auto_evict_protected=preserve_auto_evict_protected)
- def _make_room_internal(self, bytes_needed: int) -> None:
+ def _make_room_internal(self, bytes_needed: int, preserve_auto_evict_protected: bool = True) -> None:
"""Internal implementation of make_room(). Assumes the lock is already held."""
self._logger.debug(f"Making room for {bytes_needed / MB:.2f}MB of RAM.")
self._log_cache_state(title="Before dropping models:")
@@ -844,7 +894,9 @@ def _make_room_internal(self, bytes_needed: int) -> None:
model_key = self._cache_stack[pos]
cache_entry = self._cached_models[model_key]
- if not cache_entry.is_locked:
+ if preserve_auto_evict_protected and cache_entry.prevent_auto_evict:
+ pos += 1
+ elif not cache_entry.is_locked:
ram_bytes_freed += cache_entry.cached_model.total_bytes()
self._logger.debug(
f"Dropping {model_key} from RAM cache to free {(cache_entry.cached_model.total_bytes() / MB):.2f}MB."
@@ -884,9 +936,164 @@ def _make_room_internal(self, bytes_needed: int) -> None:
def _delete_cache_entry(self, cache_entry: CacheRecord) -> None:
"""Delete cache_entry from the cache if it exists. No exception is thrown if it doesn't exist."""
+ vram_bytes = cache_entry.cached_model.cur_vram_bytes()
+ if vram_bytes > 0:
+ try:
+ unloaded_bytes = cache_entry.cached_model.full_unload_from_vram()
+ self._logger.debug(
+ f"Unloaded {unloaded_bytes / MB:.2f}MB from {cache_entry.cached_model.compute_device} "
+ f"before deleting cache entry {cache_entry.key}."
+ )
+ except Exception:
+ self._logger.exception(f"Failed to unload cache entry {cache_entry.key} before deleting it.")
self._cache_stack = [key for key in self._cache_stack if key != cache_entry.key]
self._cached_models.pop(cache_entry.key, None)
+    @synchronized
+    def drop_cache_key(self, cache_key: str) -> int:
+        """Remove the cache entry stored under exactly `cache_key`.
+
+        An unlocked entry is deleted right away, the cache-cleared callbacks are notified, and 1 is returned.
+        A locked entry is only marked stale and stripped of its auto-evict protection; 0 is returned in that case,
+        and also when no entry exists for the key.
+        """
+        record = self._cached_models.get(cache_key)
+        if record is None:
+            return 0
+
+        if record.is_locked:
+            # The entry is in use: flag it instead of deleting it now.
+            record.is_stale = True
+            record.prevent_auto_evict = False
+            return 0
+
+        # Measure the entry before deleting it so the freed size can be reported.
+        bytes_freed = record.cached_model.total_bytes()
+        self._delete_cache_entry(record)
+
+        if self.stats:
+            self.stats.cleared = 1
+
+        cache_snapshot = self._get_cache_snapshot()
+        for notify in self._on_cache_models_cleared_callbacks:
+            notify(models_cleared=1, bytes_requested=0, bytes_freed=bytes_freed, cache_snapshot=cache_snapshot)
+
+        gc.collect()
+        TorchDevice.empty_cache()
+        self._logger.info(f"Dropped cache entry {cache_key} to free {bytes_freed / MB:.2f}MB.")
+        return 1
+
+    @synchronized
+    def drop_auto_evict_protected(self, exclude_execution_device: Optional[torch.device] = None) -> int:
+        """Drop protected cache entries, optionally keeping entries on a specific execution device.
+
+        This is used when split-GPU mode is disabled. It frees the second GPU immediately while letting the main CUDA
+        model stay resident if it is already loaded.
+
+        Devices are compared after resolving an un-indexed `cuda` device to the current CUDA device, so `cuda` and
+        `cuda:N` are treated as the same device when N is the current device.
+
+        Args:
+            exclude_execution_device: Entries whose compute device equals this device are kept; they only lose their
+                protection flag. If None, no protected entry is kept on account of its device.
+
+        Returns:
+            The number of entries immediately dropped. Locked entries are not counted: they are marked stale and
+            lose their protection instead of being deleted here.
+        """
+
+        def resolve_index(device: torch.device) -> torch.device:
+            # torch.device("cuda") and torch.device("cuda:0") compare unequal, so pin un-indexed CUDA devices to
+            # the current device before comparing (same rule get_cuda_cache_usage_bytes applies).
+            if device.type == "cuda" and device.index is None and torch.cuda.is_available():
+                return torch.device("cuda", torch.cuda.current_device())
+            return device
+
+        dropped: list[CacheRecord] = []
+        bytes_freed = 0
+        kept_device = (
+            resolve_index(torch.device(exclude_execution_device)) if exclude_execution_device is not None else None
+        )
+
+        for entry in list(self._cached_models.values()):
+            if not entry.prevent_auto_evict:
+                continue
+            if kept_device is not None and resolve_index(entry.cached_model.compute_device) == kept_device:
+                # Keep the entry but stop shielding it from automatic eviction.
+                entry.prevent_auto_evict = False
+                continue
+            if entry.is_locked:
+                # In use right now: flag it instead of deleting it.
+                entry.is_stale = True
+                entry.prevent_auto_evict = False
+                continue
+
+            bytes_freed += entry.cached_model.total_bytes()
+            self._delete_cache_entry(entry)
+            dropped.append(entry)
+
+        if dropped:
+            if self.stats:
+                self.stats.cleared = len(dropped)
+            snapshot = self._get_cache_snapshot()
+            for cb in self._on_cache_models_cleared_callbacks:
+                cb(
+                    models_cleared=len(dropped),
+                    bytes_requested=0,
+                    bytes_freed=bytes_freed,
+                    cache_snapshot=snapshot,
+                )
+            gc.collect()
+            TorchDevice.empty_cache()
+            self._logger.info(
+                f"Dropped {len(dropped)} split-GPU protected model(s) to free {bytes_freed / MB:.2f}MB."
+            )
+
+        return len(dropped)
+
+    @synchronized
+    def drop_cuda_entries_except(self, keep_execution_device: torch.device) -> int:
+        """Drop every cached model on CUDA devices other than the requested one.
+
+        Devices are compared after resolving an un-indexed `cuda` device to the current CUDA device, so `cuda` and
+        `cuda:N` are treated as the same device when N is the current device.
+
+        Args:
+            keep_execution_device: Entries whose compute device equals this device are kept; their auto-evict
+                protection is cleared. Entries on non-CUDA devices are left untouched.
+
+        Returns:
+            The number of entries immediately dropped. Locked entries are not counted: they are marked stale and
+            lose their protection instead of being deleted here.
+        """
+
+        def resolve_index(device: torch.device) -> torch.device:
+            # torch.device("cuda") and torch.device("cuda:0") compare unequal, so pin un-indexed CUDA devices to
+            # the current device before comparing (same rule get_cuda_cache_usage_bytes applies).
+            if device.type == "cuda" and device.index is None and torch.cuda.is_available():
+                return torch.device("cuda", torch.cuda.current_device())
+            return device
+
+        keep_execution_device = resolve_index(torch.device(keep_execution_device))
+        dropped: list[CacheRecord] = []
+        bytes_freed = 0
+
+        for entry in list(self._cached_models.values()):
+            compute_device = resolve_index(entry.cached_model.compute_device)
+            if compute_device == keep_execution_device:
+                # Kept on the requested device, but no longer shielded from automatic eviction.
+                entry.prevent_auto_evict = False
+                continue
+            if compute_device.type != "cuda":
+                continue
+            if entry.is_locked:
+                # In use right now: flag it instead of deleting it.
+                entry.is_stale = True
+                entry.prevent_auto_evict = False
+                continue
+
+            bytes_freed += entry.cached_model.total_bytes()
+            self._delete_cache_entry(entry)
+            dropped.append(entry)
+
+        if dropped:
+            if self.stats:
+                self.stats.cleared = len(dropped)
+            snapshot = self._get_cache_snapshot()
+            for cb in self._on_cache_models_cleared_callbacks:
+                cb(
+                    models_cleared=len(dropped),
+                    bytes_requested=0,
+                    bytes_freed=bytes_freed,
+                    cache_snapshot=snapshot,
+                )
+            gc.collect()
+            TorchDevice.empty_cache()
+            self._logger.info(f"Dropped {len(dropped)} non-main CUDA model(s) to free {bytes_freed / MB:.2f}MB.")
+
+        return len(dropped)
+
+    @synchronized
+    def get_cache_entry_snapshot(self, cache_key: str) -> CacheEntrySnapshot | None:
+        """Describe one cache entry: its total size, current VRAM bytes and compute device.
+
+        Returns None when nothing is cached under `cache_key`.
+        """
+        record = self._cached_models.get(cache_key)
+        if record is not None:
+            model = record.cached_model
+            return CacheEntrySnapshot(
+                cache_key=cache_key,
+                total_bytes=model.total_bytes(),
+                current_vram_bytes=model.cur_vram_bytes(),
+                compute_device=str(model.compute_device),
+            )
+        return None
+
+    @synchronized
+    def get_cuda_cache_usage_bytes(self) -> dict[int, int]:
+        """Sum the current VRAM bytes of cached models per CUDA device index.
+
+        Entries whose compute device is not CUDA are ignored. An un-indexed CUDA device is counted under the
+        current CUDA device's index.
+        """
+        bytes_by_index: dict[int, int] = {}
+        for record in self._cached_models.values():
+            device = record.cached_model.compute_device
+            if device.type == "cuda":
+                index = torch.cuda.current_device() if device.index is None else device.index
+                bytes_by_index[index] = bytes_by_index.get(index, 0) + record.cached_model.cur_vram_bytes()
+        return bytes_by_index
+
@synchronized
def drop_model(self, model_key: str) -> int:
"""Drop all cache entries belonging to a model so the next load rebuilds them.
diff --git a/invokeai/frontend/web/openapi.json b/invokeai/frontend/web/openapi.json
index 7033408b197..ca36b038e49 100644
--- a/invokeai/frontend/web/openapi.json
+++ b/invokeai/frontend/web/openapi.json
@@ -6511,6 +6511,120 @@
]
}
},
+ "/api/v1/app/sync_text_encoder_cache": {
+ "post": {
+ "tags": ["app"],
+ "summary": "Sync Text Encoder Cache",
+ "operationId": "sync_text_encoder_cache",
+ "requestBody": {
+ "content": {
+ "application/json": {
+ "schema": {
+ "$ref": "#/components/schemas/SyncTextEncoderCacheRequest",
+ "description": "Selected text encoder cache sync request"
+ }
+ }
+ },
+ "required": true
+ },
+ "responses": {
+ "200": {
+ "description": "Successful Response",
+ "content": {
+ "application/json": {
+ "schema": {
+ "$ref": "#/components/schemas/SyncTextEncoderCacheResponse"
+ }
+ }
+ }
+ },
+ "422": {
+ "description": "Validation Error",
+ "content": {
+ "application/json": {
+ "schema": {
+ "$ref": "#/components/schemas/HTTPValidationError"
+ }
+ }
+ }
+ }
+ },
+ "security": [
+ {
+ "HTTPBearer": []
+ }
+ ]
+ }
+ },
+ "/api/v1/app/text_encoder_cache_status": {
+ "post": {
+ "tags": ["app"],
+ "summary": "Get Text Encoder Cache Status",
+ "operationId": "get_text_encoder_cache_status",
+ "requestBody": {
+ "content": {
+ "application/json": {
+ "schema": {
+ "$ref": "#/components/schemas/SyncTextEncoderCacheRequest",
+ "description": "Selected text encoder cache status request"
+ }
+ }
+ },
+ "required": true
+ },
+ "responses": {
+ "200": {
+ "description": "Successful Response",
+ "content": {
+ "application/json": {
+ "schema": {
+ "$ref": "#/components/schemas/TextEncoderCacheStatusResponse"
+ }
+ }
+ }
+ },
+ "422": {
+ "description": "Validation Error",
+ "content": {
+ "application/json": {
+ "schema": {
+ "$ref": "#/components/schemas/HTTPValidationError"
+ }
+ }
+ }
+ }
+ },
+ "security": [
+ {
+ "HTTPBearer": []
+ }
+ ]
+ }
+ },
+ "/api/v1/app/system_status": {
+ "get": {
+ "tags": ["app"],
+ "summary": "Get System Status",
+ "operationId": "get_system_status",
+ "responses": {
+ "200": {
+ "description": "Successful Response",
+ "content": {
+ "application/json": {
+ "schema": {
+ "$ref": "#/components/schemas/SystemStatusResponse"
+ }
+ }
+ }
+ }
+ },
+ "security": [
+ {
+ "HTTPBearer": []
+ }
+ ]
+ }
+ },
"/api/v1/app/external_providers/status": {
"get": {
"tags": ["app"],
@@ -20006,6 +20120,39 @@
"$ref": "#/components/schemas/LatentsOutput"
}
},
+ "CudaDeviceStatus": {
+ "properties": {
+ "index": {
+ "type": "integer",
+ "title": "Index",
+ "description": "CUDA device index."
+ },
+ "name": {
+ "type": "string",
+ "title": "Name",
+ "description": "CUDA device name."
+ },
+ "used_gb": {
+ "type": "number",
+ "title": "Used Gb",
+ "description": "Total device memory used in GB, including non-InvokeAI processes."
+ },
+ "invoke_cache_gb": {
+ "type": "number",
+ "title": "Invoke Cache Gb",
+ "description": "InvokeAI model cache memory used on this device in GB."
+ },
+ "total_gb": {
+ "type": "number",
+ "title": "Total Gb",
+ "description": "Total device memory in GB."
+ }
+ },
+ "type": "object",
+ "required": ["index", "name", "used_gb", "invoke_cache_gb", "total_gb"],
+ "title": "CudaDeviceStatus",
+ "description": "CUDA device memory status."
+ },
"CvInpaintInvocation": {
"category": "inpaint",
"class": "invocation",
@@ -41153,6 +41300,12 @@
"description": "Preferred execution device. `auto` will choose the device depending on the hardware platform and the installed torch capabilities.
Valid values: `auto`, `cpu`, `cuda`, `mps`, `cuda:N` (where N is a device number)",
"default": "auto"
},
+ "use_second_gpu_for_text_encoder": {
+ "type": "boolean",
+ "title": "Use Second Gpu For Text Encoder",
+ "description": "When at least two CUDA GPUs are available, run text encoder models on the CUDA device that is not the main execution device.",
+ "default": false
+ },
"precision": {
"type": "string",
"enum": ["auto", "float16", "bfloat16", "float32"],
@@ -41423,7 +41576,7 @@
"additionalProperties": false,
"type": "object",
"title": "InvokeAIAppConfig",
- "description": "Invoke's global app configuration.\n\nTypically, you won't need to interact with this class directly. Instead, use the `get_config` function from `invokeai.app.services.config` to get a singleton config object.\n\nAttributes:\n host: IP address to bind to. Use `0.0.0.0` to serve to your local network.\n port: Port to bind to.\n allow_origins: Allowed CORS origins.\n allow_credentials: Allow CORS credentials.\n allow_methods: Methods allowed for CORS.\n allow_headers: Headers allowed for CORS.\n ssl_certfile: SSL certificate file for HTTPS. See https://www.uvicorn.dev/settings/#https.\n ssl_keyfile: SSL key file for HTTPS. See https://www.uvicorn.dev/settings/#https.\n log_tokenization: Enable logging of parsed prompt tokens.\n patchmatch: Enable patchmatch inpaint code.\n models_dir: Path to the models directory.\n convert_cache_dir: Path to the converted models cache directory (DEPRECATED, but do not delete because it is needed for migration from previous versions).\n download_cache_dir: Path to the directory that contains dynamically downloaded models.\n legacy_conf_dir: Path to directory of legacy checkpoint config files.\n db_dir: Path to InvokeAI databases directory.\n outputs_dir: Path to directory for outputs.\n image_subfolder_strategy: Strategy for organizing images into subfolders. 'flat' stores all images in a single folder. 'date' organizes by YYYY/MM/DD. 'type' organizes by image category. 'hash' uses first 2 characters of UUID for filesystem performance.
Valid values: `flat`, `date`, `type`, `hash`\n custom_nodes_dir: Path to directory for custom nodes.\n style_presets_dir: Path to directory for style presets.\n workflow_thumbnails_dir: Path to directory for workflow thumbnails.\n log_handlers: Log handler. Valid options are \"console\", \"file=\", \"syslog=path|address:host:port\", \"http=\".\n log_format: Log format. Use \"plain\" for text-only, \"color\" for colorized output, \"legacy\" for 2.3-style logging and \"syslog\" for syslog-style.
Valid values: `plain`, `color`, `syslog`, `legacy`\n log_level: Emit logging messages at this level or higher.
Valid values: `debug`, `info`, `warning`, `error`, `critical`\n log_sql: Log SQL queries. `log_level` must be `debug` for this to do anything. Extremely verbose.\n log_level_network: Log level for network-related messages. 'info' and 'debug' are very verbose.
Valid values: `debug`, `info`, `warning`, `error`, `critical`\n use_memory_db: Use in-memory database. Useful for development.\n dev_reload: Automatically reload when Python sources are changed. Does not reload node definitions.\n profile_graphs: Enable graph profiling using `cProfile`.\n profile_prefix: An optional prefix for profile output files.\n profiles_dir: Path to profiles output directory.\n max_cache_ram_gb: The maximum amount of CPU RAM to use for model caching in GB. If unset, the limit will be configured based on the available RAM. In most cases, it is recommended to leave this unset.\n max_cache_vram_gb: The amount of VRAM to use for model caching in GB. If unset, the limit will be configured based on the available VRAM and the device_working_mem_gb. In most cases, it is recommended to leave this unset.\n log_memory_usage: If True, a memory snapshot will be captured before and after every model cache operation, and the result will be logged (at debug level). There is a time cost to capturing the memory snapshots, so it is recommended to only enable this feature if you are actively inspecting the model cache's behaviour.\n model_cache_keep_alive_min: How long to keep models in cache after last use, in minutes. A value of 0 (the default) means models are kept in cache indefinitely. If no model generations occur within the timeout period, the model cache is cleared using the same logic as the 'Clear Model Cache' button.\n device_working_mem_gb: The amount of working memory to keep available on the compute device (in GB). Has no effect if running on CPU. If you are experiencing OOM errors, try increasing this value.\n enable_partial_loading: Enable partial loading of models. This enables models to run with reduced VRAM requirements (at the cost of slower speed) by streaming the model from RAM to VRAM as its used. 
In some edge cases, partial loading can cause models to run more slowly if they were previously being fully loaded into VRAM.\n keep_ram_copy_of_weights: Whether to keep a full RAM copy of a model's weights when the model is loaded in VRAM. Keeping a RAM copy increases average RAM usage, but speeds up model switching and LoRA patching (assuming there is sufficient RAM). Set this to False if RAM pressure is consistently high.\n ram: DEPRECATED: This setting is no longer used. It has been replaced by `max_cache_ram_gb`, but most users will not need to use this config since automatic cache size limits should work well in most cases. This config setting will be removed once the new model cache behavior is stable.\n vram: DEPRECATED: This setting is no longer used. It has been replaced by `max_cache_vram_gb`, but most users will not need to use this config since automatic cache size limits should work well in most cases. This config setting will be removed once the new model cache behavior is stable.\n lazy_offload: DEPRECATED: This setting is no longer used. Lazy-offloading is enabled by default. This config setting will be removed once the new model cache behavior is stable.\n pytorch_cuda_alloc_conf: Configure the Torch CUDA memory allocator. This will impact peak reserved VRAM usage and performance. Setting to \"backend:cudaMallocAsync\" works well on many systems. The optimal configuration is highly dependent on the system configuration (device type, VRAM, CUDA driver version, etc.), so must be tuned experimentally.\n device: Preferred execution device. `auto` will choose the device depending on the hardware platform and the installed torch capabilities.
Valid values: `auto`, `cpu`, `cuda`, `mps`, `cuda:N` (where N is a device number)\n precision: Floating point precision. `float16` will consume half the memory of `float32` but produce slightly lower-quality images. The `auto` setting will guess the proper precision based on your video card and operating system.
Valid values: `auto`, `float16`, `bfloat16`, `float32`\n sequential_guidance: Whether to calculate guidance in serial instead of in parallel, lowering memory requirements.\n attention_type: Attention type.
Valid values: `auto`, `normal`, `xformers`, `sliced`, `torch-sdp`\n attention_slice_size: Slice size, valid when attention_type==\"sliced\".
Valid values: `auto`, `balanced`, `max`, `1`, `2`, `3`, `4`, `5`, `6`, `7`, `8`\n force_tiled_decode: Whether to enable tiled VAE decode (reduces memory consumption with some performance penalty).\n pil_compress_level: The compress_level setting of PIL.Image.save(), used for PNG encoding. All settings are lossless. 0 = no compression, 1 = fastest with slightly larger filesize, 9 = slowest with smallest filesize. 1 is typically the best setting.\n max_queue_size: Maximum number of items in the session queue.\n clear_queue_on_startup: Empties session queue on startup. If true, disables `max_queue_history`.\n max_queue_history: Keep the last N completed, failed, and canceled queue items. Older items are deleted on startup. Set to 0 to prune all terminal items. Ignored if `clear_queue_on_startup` is true.\n allow_nodes: List of nodes to allow. Omit to allow all.\n deny_nodes: List of nodes to deny. Omit to deny none.\n node_cache_size: How many cached nodes to keep in memory.\n hashing_algorithm: Model hashing algorthim for model installs. 'blake3_multi' is best for SSDs. 'blake3_single' is best for spinning disk HDDs. 'random' disables hashing, instead assigning a UUID to models. Useful when using a memory db to reduce model installation time, or if you don't care about storing stable hashes for models. Alternatively, any other hashlib algorithm is accepted, though these are not nearly as performant as blake3.
Valid values: `blake3_multi`, `blake3_single`, `random`, `md5`, `sha1`, `sha224`, `sha256`, `sha384`, `sha512`, `blake2b`, `blake2s`, `sha3_224`, `sha3_256`, `sha3_384`, `sha3_512`, `shake_128`, `shake_256`\n remote_api_tokens: List of regular expression and token pairs used when downloading models from URLs. The download URL is tested against the regex, and if it matches, the token is provided in as a Bearer token.\n scan_models_on_startup: Scan the models directory on startup, registering orphaned models. This is typically only used in conjunction with `use_memory_db` for testing purposes.\n unsafe_disable_picklescan: UNSAFE. Disable the picklescan security check during model installation. Recommended only for development and testing purposes. This will allow arbitrary code execution during model installation, so should never be used in production.\n allow_unknown_models: Allow installation of models that we are unable to identify. If enabled, models will be marked as `unknown` in the database, and will not have any metadata associated with them. If disabled, unknown models will be rejected during installation.\n multiuser: Enable multiuser support. When disabled, the application runs in single-user mode using a default system account with administrator privileges. When enabled, requires user authentication and authorization.\n strict_password_checking: Enforce strict password requirements. When True, passwords must contain uppercase, lowercase, and numbers. 
When False (default), any password is accepted but its strength (weak/moderate/strong) is reported to the user.\n external_alibabacloud_api_key: API key for Alibaba Cloud DashScope image generation.\n external_alibabacloud_base_url: Base URL override for Alibaba Cloud DashScope image generation.\n external_gemini_api_key: API key for Gemini image generation.\n external_openai_api_key: API key for OpenAI image generation.\n external_gemini_base_url: Base URL override for Gemini image generation.\n external_openai_base_url: Base URL override for OpenAI image generation.\n external_seedream_api_key: API key for Seedream image generation.\n external_seedream_base_url: Base URL override for Seedream image generation."
+ "description": "Invoke's global app configuration.\n\nTypically, you won't need to interact with this class directly. Instead, use the `get_config` function from `invokeai.app.services.config` to get a singleton config object.\n\nAttributes:\n host: IP address to bind to. Use `0.0.0.0` to serve to your local network.\n port: Port to bind to.\n allow_origins: Allowed CORS origins.\n allow_credentials: Allow CORS credentials.\n allow_methods: Methods allowed for CORS.\n allow_headers: Headers allowed for CORS.\n ssl_certfile: SSL certificate file for HTTPS. See https://www.uvicorn.dev/settings/#https.\n ssl_keyfile: SSL key file for HTTPS. See https://www.uvicorn.dev/settings/#https.\n log_tokenization: Enable logging of parsed prompt tokens.\n patchmatch: Enable patchmatch inpaint code.\n models_dir: Path to the models directory.\n convert_cache_dir: Path to the converted models cache directory (DEPRECATED, but do not delete because it is needed for migration from previous versions).\n download_cache_dir: Path to the directory that contains dynamically downloaded models.\n legacy_conf_dir: Path to directory of legacy checkpoint config files.\n db_dir: Path to InvokeAI databases directory.\n outputs_dir: Path to directory for outputs.\n image_subfolder_strategy: Strategy for organizing images into subfolders. 'flat' stores all images in a single folder. 'date' organizes by YYYY/MM/DD. 'type' organizes by image category. 'hash' uses first 2 characters of UUID for filesystem performance.
Valid values: `flat`, `date`, `type`, `hash`\n custom_nodes_dir: Path to directory for custom nodes.\n style_presets_dir: Path to directory for style presets.\n workflow_thumbnails_dir: Path to directory for workflow thumbnails.\n log_handlers: Log handler. Valid options are \"console\", \"file=\", \"syslog=path|address:host:port\", \"http=\".\n log_format: Log format. Use \"plain\" for text-only, \"color\" for colorized output, \"legacy\" for 2.3-style logging and \"syslog\" for syslog-style.
Valid values: `plain`, `color`, `syslog`, `legacy`\n log_level: Emit logging messages at this level or higher.
Valid values: `debug`, `info`, `warning`, `error`, `critical`\n log_sql: Log SQL queries. `log_level` must be `debug` for this to do anything. Extremely verbose.\n log_level_network: Log level for network-related messages. 'info' and 'debug' are very verbose.
Valid values: `debug`, `info`, `warning`, `error`, `critical`\n use_memory_db: Use in-memory database. Useful for development.\n dev_reload: Automatically reload when Python sources are changed. Does not reload node definitions.\n profile_graphs: Enable graph profiling using `cProfile`.\n profile_prefix: An optional prefix for profile output files.\n profiles_dir: Path to profiles output directory.\n max_cache_ram_gb: The maximum amount of CPU RAM to use for model caching in GB. If unset, the limit will be configured based on the available RAM. In most cases, it is recommended to leave this unset.\n max_cache_vram_gb: The amount of VRAM to use for model caching in GB. If unset, the limit will be configured based on the available VRAM and the device_working_mem_gb. In most cases, it is recommended to leave this unset.\n log_memory_usage: If True, a memory snapshot will be captured before and after every model cache operation, and the result will be logged (at debug level). There is a time cost to capturing the memory snapshots, so it is recommended to only enable this feature if you are actively inspecting the model cache's behaviour.\n model_cache_keep_alive_min: How long to keep models in cache after last use, in minutes. A value of 0 (the default) means models are kept in cache indefinitely. If no model generations occur within the timeout period, the model cache is cleared using the same logic as the 'Clear Model Cache' button.\n device_working_mem_gb: The amount of working memory to keep available on the compute device (in GB). Has no effect if running on CPU. If you are experiencing OOM errors, try increasing this value.\n enable_partial_loading: Enable partial loading of models. This enables models to run with reduced VRAM requirements (at the cost of slower speed) by streaming the model from RAM to VRAM as its used. 
In some edge cases, partial loading can cause models to run more slowly if they were previously being fully loaded into VRAM.\n keep_ram_copy_of_weights: Whether to keep a full RAM copy of a model's weights when the model is loaded in VRAM. Keeping a RAM copy increases average RAM usage, but speeds up model switching and LoRA patching (assuming there is sufficient RAM). Set this to False if RAM pressure is consistently high.\n ram: DEPRECATED: This setting is no longer used. It has been replaced by `max_cache_ram_gb`, but most users will not need to use this config since automatic cache size limits should work well in most cases. This config setting will be removed once the new model cache behavior is stable.\n vram: DEPRECATED: This setting is no longer used. It has been replaced by `max_cache_vram_gb`, but most users will not need to use this config since automatic cache size limits should work well in most cases. This config setting will be removed once the new model cache behavior is stable.\n lazy_offload: DEPRECATED: This setting is no longer used. Lazy-offloading is enabled by default. This config setting will be removed once the new model cache behavior is stable.\n pytorch_cuda_alloc_conf: Configure the Torch CUDA memory allocator. This will impact peak reserved VRAM usage and performance. Setting to \"backend:cudaMallocAsync\" works well on many systems. The optimal configuration is highly dependent on the system configuration (device type, VRAM, CUDA driver version, etc.), so must be tuned experimentally.\n device: Preferred execution device. `auto` will choose the device depending on the hardware platform and the installed torch capabilities.
Valid values: `auto`, `cpu`, `cuda`, `mps`, `cuda:N` (where N is a device number)\n use_second_gpu_for_text_encoder: When at least two CUDA GPUs are available, run text encoder models on the CUDA device that is not the main execution device.\n precision: Floating point precision. `float16` will consume half the memory of `float32` but produce slightly lower-quality images. The `auto` setting will guess the proper precision based on your video card and operating system.
Valid values: `auto`, `float16`, `bfloat16`, `float32`\n sequential_guidance: Whether to calculate guidance in serial instead of in parallel, lowering memory requirements.\n attention_type: Attention type.
Valid values: `auto`, `normal`, `xformers`, `sliced`, `torch-sdp`\n attention_slice_size: Slice size, valid when attention_type==\"sliced\".
Valid values: `auto`, `balanced`, `max`, `1`, `2`, `3`, `4`, `5`, `6`, `7`, `8`\n force_tiled_decode: Whether to enable tiled VAE decode (reduces memory consumption with some performance penalty).\n pil_compress_level: The compress_level setting of PIL.Image.save(), used for PNG encoding. All settings are lossless. 0 = no compression, 1 = fastest with slightly larger filesize, 9 = slowest with smallest filesize. 1 is typically the best setting.\n max_queue_size: Maximum number of items in the session queue.\n clear_queue_on_startup: Empties session queue on startup. If true, disables `max_queue_history`.\n max_queue_history: Keep the last N completed, failed, and canceled queue items. Older items are deleted on startup. Set to 0 to prune all terminal items. Ignored if `clear_queue_on_startup` is true.\n allow_nodes: List of nodes to allow. Omit to allow all.\n deny_nodes: List of nodes to deny. Omit to deny none.\n node_cache_size: How many cached nodes to keep in memory.\n hashing_algorithm: Model hashing algorthim for model installs. 'blake3_multi' is best for SSDs. 'blake3_single' is best for spinning disk HDDs. 'random' disables hashing, instead assigning a UUID to models. Useful when using a memory db to reduce model installation time, or if you don't care about storing stable hashes for models. Alternatively, any other hashlib algorithm is accepted, though these are not nearly as performant as blake3.
Valid values: `blake3_multi`, `blake3_single`, `random`, `md5`, `sha1`, `sha224`, `sha256`, `sha384`, `sha512`, `blake2b`, `blake2s`, `sha3_224`, `sha3_256`, `sha3_384`, `sha3_512`, `shake_128`, `shake_256`\n remote_api_tokens: List of regular expression and token pairs used when downloading models from URLs. The download URL is tested against the regex, and if it matches, the token is provided in as a Bearer token.\n scan_models_on_startup: Scan the models directory on startup, registering orphaned models. This is typically only used in conjunction with `use_memory_db` for testing purposes.\n unsafe_disable_picklescan: UNSAFE. Disable the picklescan security check during model installation. Recommended only for development and testing purposes. This will allow arbitrary code execution during model installation, so should never be used in production.\n allow_unknown_models: Allow installation of models that we are unable to identify. If enabled, models will be marked as `unknown` in the database, and will not have any metadata associated with them. If disabled, unknown models will be rejected during installation.\n multiuser: Enable multiuser support. When disabled, the application runs in single-user mode using a default system account with administrator privileges. When enabled, requires user authentication and authorization.\n strict_password_checking: Enforce strict password requirements. When True, passwords must contain uppercase, lowercase, and numbers. 
When False (default), any password is accepted but its strength (weak/moderate/strong) is reported to the user.\n external_alibabacloud_api_key: API key for Alibaba Cloud DashScope image generation.\n external_alibabacloud_base_url: Base URL override for Alibaba Cloud DashScope image generation.\n external_gemini_api_key: API key for Gemini image generation.\n external_openai_api_key: API key for OpenAI image generation.\n external_gemini_base_url: Base URL override for Gemini image generation.\n external_openai_base_url: Base URL override for OpenAI image generation.\n external_seedream_api_key: API key for Seedream image generation.\n external_seedream_base_url: Base URL override for Seedream image generation."
},
"InvokeAIAppConfigWithSetFields": {
"properties": {
@@ -54611,19 +54764,19 @@
"ModelIdentifierField": {
"properties": {
"key": {
- "description": "The model's unique key",
+ "type": "string",
"title": "Key",
- "type": "string"
+ "description": "The model's unique key"
},
"hash": {
- "description": "The model's BLAKE3 hash",
+ "type": "string",
"title": "Hash",
- "type": "string"
+ "description": "The model's BLAKE3 hash"
},
"name": {
- "description": "The model's name",
+ "type": "string",
"title": "Name",
- "type": "string"
+ "description": "The model's name"
},
"base": {
"$ref": "#/components/schemas/BaseModelType",
@@ -54642,13 +54795,12 @@
"type": "null"
}
],
- "default": null,
"description": "The submodel to load, if this is a main model"
}
},
+ "type": "object",
"required": ["key", "hash", "name", "base", "type"],
- "title": "ModelIdentifierField",
- "type": "object"
+ "title": "ModelIdentifierField"
},
"ModelIdentifierInvocation": {
"category": "model",
@@ -67421,6 +67573,137 @@
"$ref": "#/components/schemas/IntegerOutput"
}
},
+ "SyncTextEncoderCacheRequest": {
+ "properties": {
+ "enabled": {
+ "type": "boolean",
+ "title": "Enabled",
+ "description": "Whether second-GPU text encoder mode is enabled."
+ },
+ "text_encoder_models": {
+ "items": {
+ "$ref": "#/components/schemas/ModelIdentifierField"
+ },
+ "type": "array",
+ "title": "Text Encoder Models",
+ "description": "Selected text encoder models to unload or prewarm."
+ }
+ },
+ "type": "object",
+ "required": ["enabled"],
+ "title": "SyncTextEncoderCacheRequest",
+ "description": "Request to actively sync selected text encoder cache entries with the second-GPU toggle state."
+ },
+ "SyncTextEncoderCacheResponse": {
+ "properties": {
+ "dropped": {
+ "type": "integer",
+ "title": "Dropped",
+ "description": "Number of cache entries immediately dropped."
+ },
+ "loaded": {
+ "type": "integer",
+ "title": "Loaded",
+ "description": "Number of selected encoder entries loaded onto their target device."
+ },
+ "status": {
+ "$ref": "#/components/schemas/TextEncoderCacheStatusResponse",
+ "description": "Text encoder cache status after sync."
+ }
+ },
+ "type": "object",
+ "required": ["dropped", "loaded", "status"],
+ "title": "SyncTextEncoderCacheResponse",
+ "description": "Text encoder cache sync result."
+ },
+ "SystemGpuStatus": {
+ "properties": {
+ "index": {
+ "type": "integer",
+ "title": "Index",
+ "description": "GPU device index."
+ },
+ "name": {
+ "type": "string",
+ "title": "Name",
+ "description": "GPU device name."
+ },
+ "utilization_percent": {
+ "anyOf": [
+ {
+ "type": "number"
+ },
+ {
+ "type": "null"
+ }
+ ],
+ "title": "Utilization Percent",
+ "description": "GPU utilization percent."
+ },
+ "loaded_gb": {
+ "type": "number",
+ "title": "Loaded Gb",
+ "description": "GPU memory used in GB."
+ },
+ "total_gb": {
+ "type": "number",
+ "title": "Total Gb",
+ "description": "Total GPU memory in GB."
+ }
+ },
+ "type": "object",
+ "required": ["index", "name", "loaded_gb", "total_gb"],
+ "title": "SystemGpuStatus",
+ "description": "Basic GPU status."
+ },
+ "SystemStatusResponse": {
+ "properties": {
+ "cpu_percent": {
+ "type": "number",
+ "title": "Cpu Percent",
+ "description": "CPU utilization percent."
+ },
+ "cpu_frequency_ghz": {
+ "anyOf": [
+ {
+ "type": "number"
+ },
+ {
+ "type": "null"
+ }
+ ],
+ "title": "Cpu Frequency Ghz",
+ "description": "Current CPU frequency in GHz."
+ },
+ "memory_used_gb": {
+ "type": "number",
+ "title": "Memory Used Gb",
+ "description": "System memory used in GB."
+ },
+ "memory_total_gb": {
+ "type": "number",
+ "title": "Memory Total Gb",
+ "description": "Total system memory in GB."
+ },
+ "memory_percent": {
+ "type": "number",
+ "title": "Memory Percent",
+ "description": "System memory utilization percent."
+ },
+ "gpus": {
+ "items": {
+ "$ref": "#/components/schemas/SystemGpuStatus"
+ },
+ "type": "array",
+ "title": "Gpus",
+ "description": "GPU statuses."
+ }
+ },
+ "type": "object",
+ "required": ["cpu_percent", "memory_used_gb", "memory_total_gb", "memory_percent", "gpus"],
+ "title": "SystemStatusResponse",
+ "description": "Basic system status."
+ },
"T2IAdapterField": {
"properties": {
"image": {
@@ -69040,6 +69323,80 @@
"title": "TensorField",
"type": "object"
},
+ "TextEncoderCacheModelStatus": {
+ "properties": {
+ "key": {
+ "type": "string",
+ "title": "Key",
+ "description": "Model key."
+ },
+ "name": {
+ "type": "string",
+ "title": "Name",
+ "description": "Model name."
+ },
+ "cache_key": {
+ "type": "string",
+ "title": "Cache Key",
+ "description": "Resolved cache key."
+ },
+ "loaded": {
+ "type": "boolean",
+ "title": "Loaded",
+ "description": "Whether the cache entry exists and has weights on its execution device."
+ },
+ "device": {
+ "anyOf": [
+ {
+ "type": "string"
+ },
+ {
+ "type": "null"
+ }
+ ],
+ "title": "Device",
+ "description": "Execution device for the cache entry."
+ },
+ "vram_gb": {
+ "type": "number",
+ "title": "Vram Gb",
+ "description": "Estimated model VRAM resident size in GB."
+ },
+ "total_gb": {
+ "type": "number",
+ "title": "Total Gb",
+ "description": "Estimated model size in GB."
+ }
+ },
+ "type": "object",
+ "required": ["key", "name", "cache_key", "loaded", "vram_gb", "total_gb"],
+ "title": "TextEncoderCacheModelStatus",
+ "description": "Status for one selected text encoder cache entry."
+ },
+ "TextEncoderCacheStatusResponse": {
+ "properties": {
+ "models": {
+ "items": {
+ "$ref": "#/components/schemas/TextEncoderCacheModelStatus"
+ },
+ "type": "array",
+ "title": "Models",
+ "description": "Selected text encoder cache statuses."
+ },
+ "cuda_devices": {
+ "items": {
+ "$ref": "#/components/schemas/CudaDeviceStatus"
+ },
+ "type": "array",
+ "title": "Cuda Devices",
+ "description": "CUDA memory status."
+ }
+ },
+ "type": "object",
+ "required": ["models", "cuda_devices"],
+ "title": "TextEncoderCacheStatusResponse",
+ "description": "Selected text encoder cache and CUDA memory status."
+ },
"TextLLMInvocation": {
"category": "llm",
"class": "invocation",
@@ -70355,6 +70712,31 @@
],
"title": "Max Queue History",
"description": "Keep the last N completed, failed, and canceled queue items on startup. Set to 0 to prune all terminal items."
+ },
+ "model_cache_keep_alive_min": {
+ "anyOf": [
+ {
+ "type": "number",
+ "minimum": 0.0
+ },
+ {
+ "type": "null"
+ }
+ ],
+ "title": "Model Cache Keep Alive Min",
+ "description": "How long to keep unlocked models in cache after last use, in minutes. 0 keeps models indefinitely."
+ },
+ "use_second_gpu_for_text_encoder": {
+ "anyOf": [
+ {
+ "type": "boolean"
+ },
+ {
+ "type": "null"
+ }
+ ],
+ "title": "Use Second Gpu For Text Encoder",
+ "description": "Run text encoder models on the CUDA device that is not the main execution device when at least two CUDA GPUs are available."
}
},
"type": "object",
diff --git a/invokeai/frontend/web/public/locales/en.json b/invokeai/frontend/web/public/locales/en.json
index cb0226c2c44..9851378a1b4 100644
--- a/invokeai/frontend/web/public/locales/en.json
+++ b/invokeai/frontend/web/public/locales/en.json
@@ -1846,6 +1846,14 @@
"imageSubfolderStrategyUnknown": "Unknown ({{strategy}})",
"maxQueueHistory": "Max Queue History",
"maxQueueHistorySaveFailed": "Failed to save Max Queue History",
+ "modelCacheSleepTimer": "Model Sleep Timer",
+ "modelCacheSleepTimerOff": "Off",
+ "modelCacheSleepTimer1Min": "1 minute",
+ "modelCacheSleepTimer5Min": "5 minutes",
+ "modelCacheSleepTimer10Min": "10 minutes",
+ "modelCacheSleepTimer30Min": "30 minutes",
+ "modelCacheSleepTimerCustom": "{{minutes}} minutes",
+ "modelCacheSleepTimerSaveFailed": "Failed to save Model Sleep Timer",
"models": "Models",
"preferAttentionStyleNumeric": "Prefer Numeric Attention Style",
"prompt": "Prompt",
diff --git a/invokeai/frontend/web/src/features/parameters/components/Advanced/ParamUseSecondGpuForTextEncoder.tsx b/invokeai/frontend/web/src/features/parameters/components/Advanced/ParamUseSecondGpuForTextEncoder.tsx
new file mode 100644
index 00000000000..60e16311d01
--- /dev/null
+++ b/invokeai/frontend/web/src/features/parameters/components/Advanced/ParamUseSecondGpuForTextEncoder.tsx
@@ -0,0 +1,220 @@
+import { Flex, FormControl, FormLabel, Switch, Text } from '@invoke-ai/ui-library';
+import { useAppSelector } from 'app/store/storeHooks';
+import { selectCurrentUser } from 'features/auth/store/authSlice';
+import {
+ selectAnimaQwen3EncoderModel,
+ selectCLIPEmbedModel,
+ selectCLIPGEmbedModel,
+ selectCLIPLEmbedModel,
+ selectIsAnima,
+ selectIsFLUX,
+ selectIsFlux2,
+ selectIsQwenImage,
+ selectIsSD3,
+ selectIsZImage,
+ selectKleinQwen3EncoderModel,
+ selectQwenImageComponentSource,
+ selectQwenImageQwenVLEncoderModel,
+ selectT5EncoderModel,
+ selectZImageQwen3EncoderModel,
+ selectZImageQwen3SourceModel,
+} from 'features/controlLayers/store/paramsSlice';
+import type { ModelIdentifierField } from 'features/nodes/types/common';
+import { toast } from 'features/toast/toast';
+import type { ChangeEvent } from 'react';
+import { memo, useCallback, useEffect, useMemo, useRef, useState } from 'react';
+import {
+ useGetAppDepsQuery,
+ useGetRuntimeConfigQuery,
+ useGetTextEncoderCacheStatusMutation,
+ useSyncTextEncoderCacheMutation,
+ useUpdateRuntimeConfigMutation,
+} from 'services/api/endpoints/appInfo';
+
+/**
+ * Reads the `CUDA Devices` entry from the app dependency map and returns it as a number.
+ *
+ * @param appDeps - Dependency map returned by the app-deps query; may be undefined while loading.
+ * @returns The parsed device count, or 0 when the entry is missing or does not parse to a finite number.
+ */
+const getCudaDeviceCount = (appDeps: Record | undefined): number => {
+  const rawCount = appDeps?.['CUDA Devices'] ?? 0;
+  const parsedCount = Number(rawCount);
+  if (!Number.isFinite(parsedCount)) {
+    // Unparseable values (NaN) and infinities are treated as "no CUDA devices".
+    return 0;
+  }
+  return parsedCount;
+};
+
+/**
+ * Decides whether the configured execution device is a CUDA device.
+ *
+ * @param device - Configured device string (e.g. `auto`, `cpu`, `cuda`, `cuda:1`); may be undefined.
+ * @param cudaDeviceCount - Number of CUDA devices reported by the backend.
+ * @returns For an explicit device, true only for `cuda` or `cuda:N`. For a missing, empty or `auto`
+ * device, true when at least one CUDA device is reported.
+ */
+const isCudaDevice = (device: string | undefined, cudaDeviceCount: number): boolean => {
+  if (device && device !== 'auto') {
+    // An explicit device was configured: only the CUDA spellings qualify.
+    return device === 'cuda' || device.startsWith('cuda:');
+  }
+  // No explicit device: fall back to whether any CUDA device is reported.
+  return cudaDeviceCount > 0;
+};
+
+/**
+ * Shape of the text encoder cache status that this component keeps in local state.
+ *
+ * This is a narrower, hand-written view of the status payload returned by the
+ * `text_encoder_cache_status` / `sync_text_encoder_cache` endpoints: identifying fields such as
+ * `key`, `name` and `cache_key` are omitted here.
+ */
+type TextEncoderCacheStatus = {
+ // One entry per selected text encoder model.
+ models: {
+ // Whether the cache entry is reported as loaded.
+ loaded: boolean;
+ // Execution device reported for the cache entry, if any.
+ device: string | null;
+ // Reported VRAM-resident size in GB.
+ vram_gb: number;
+ // Reported model size in GB.
+ total_gb: number;
+ }[];
+ // One entry per CUDA device reported by the backend.
+ cuda_devices: {
+ index: number;
+ // Total device memory in use, in GB.
+ used_gb: number;
+ // Memory used by the InvokeAI model cache on this device, in GB.
+ invoke_cache_gb: number;
+ // Total device memory in GB.
+ total_gb: number;
+ }[];
+};
+
+const ParamUseSecondGpuForTextEncoder = () => {
+ const currentUser = useAppSelector(selectCurrentUser);
+ const isFLUX = useAppSelector(selectIsFLUX);
+ const isFlux2 = useAppSelector(selectIsFlux2);
+ const isSD3 = useAppSelector(selectIsSD3);
+ const isZImage = useAppSelector(selectIsZImage);
+ const isQwenImage = useAppSelector(selectIsQwenImage);
+ const isAnima = useAppSelector(selectIsAnima);
+ const t5EncoderModel = useAppSelector(selectT5EncoderModel);
+ const clipEmbedModel = useAppSelector(selectCLIPEmbedModel);
+ const clipLEmbedModel = useAppSelector(selectCLIPLEmbedModel);
+ const clipGEmbedModel = useAppSelector(selectCLIPGEmbedModel);
+ const zImageQwen3EncoderModel = useAppSelector(selectZImageQwen3EncoderModel);
+ const zImageQwen3SourceModel = useAppSelector(selectZImageQwen3SourceModel);
+ const qwenImageQwenVLEncoderModel = useAppSelector(selectQwenImageQwenVLEncoderModel);
+ const qwenImageComponentSource = useAppSelector(selectQwenImageComponentSource);
+ const animaQwen3EncoderModel = useAppSelector(selectAnimaQwen3EncoderModel);
+ const kleinQwen3EncoderModel = useAppSelector(selectKleinQwen3EncoderModel);
+ const { data: appDeps } = useGetAppDepsQuery();
+ const { data: runtimeConfig } = useGetRuntimeConfigQuery();
+ const [updateRuntimeConfig, { isLoading }] = useUpdateRuntimeConfigMutation();
+ const [syncTextEncoderCache, { isLoading: isSyncing }] = useSyncTextEncoderCacheMutation();
+ const [getTextEncoderCacheStatus, { isLoading: isLoadingStatus }] = useGetTextEncoderCacheStatusMutation();
+ const [cacheStatus, setCacheStatus] = useState(null);
+ const autoSyncKeyRef = useRef(null);
+
+ const cudaDeviceCount = useMemo(() => getCudaDeviceCount(appDeps), [appDeps]);
+ const isAvailable = runtimeConfig
+ ? cudaDeviceCount >= 2 && isCudaDevice(runtimeConfig.config.device, cudaDeviceCount)
+ : false;
+ const isChecked = Boolean(runtimeConfig?.config.use_second_gpu_for_text_encoder);
+ const canEditRuntimeConfig = runtimeConfig ? !runtimeConfig.config.multiuser || currentUser?.is_admin : false;
+ const selectedTextEncoderModels = useMemo(() => {
+ const models: (ModelIdentifierField | null | undefined)[] = [];
+
+ if (isFLUX && !isFlux2) {
+ models.push(t5EncoderModel, clipEmbedModel);
+ } else if (isSD3) {
+ models.push(t5EncoderModel, clipLEmbedModel, clipGEmbedModel);
+ } else if (isZImage) {
+ models.push(zImageQwen3EncoderModel ?? zImageQwen3SourceModel);
+ } else if (isQwenImage) {
+ models.push(qwenImageQwenVLEncoderModel ?? qwenImageComponentSource);
+ } else if (isAnima) {
+ models.push(animaQwen3EncoderModel);
+ } else if (isFlux2) {
+ models.push(kleinQwen3EncoderModel);
+ }
+
+ return models.filter((model): model is ModelIdentifierField => Boolean(model));
+ }, [
+ animaQwen3EncoderModel,
+ clipEmbedModel,
+ clipGEmbedModel,
+ clipLEmbedModel,
+ isAnima,
+ isFLUX,
+ isFlux2,
+ isQwenImage,
+ isSD3,
+ isZImage,
+ kleinQwen3EncoderModel,
+ qwenImageComponentSource,
+ qwenImageQwenVLEncoderModel,
+ t5EncoderModel,
+ zImageQwen3EncoderModel,
+ zImageQwen3SourceModel,
+ ]);
+ const loadedCount = cacheStatus?.models.filter((model) => model.loaded).length ?? 0;
+ const selectedCount = selectedTextEncoderModels.length;
+ const selectedTextEncoderModelKey = selectedTextEncoderModels.map((model) => model.key).join('|');
+ const encoderCacheStatusLabel =
+ isSyncing || isLoadingStatus
+ ? 'Syncing'
+ : isChecked && selectedCount > 0 && loadedCount === selectedCount
+ ? 'Loaded'
+ : 'Unloaded';
+
+ // Keeps the displayed cache status in sync with the runtime config and the current encoder
+ // selection, and pre-loads the selected encoders once per selection when the setting is enabled.
+ // NOTE(review): this effect returns no cleanup function, so a response that belongs to an older
+ // selection can presumably still call setCacheStatus after the dependencies change - confirm
+ // whether a staleness guard is needed.
+ useEffect(() => {
+ // Nothing to query until the runtime config is loaded and the feature is available.
+ if (!runtimeConfig || !isAvailable) {
+ return;
+ }
+ getTextEncoderCacheStatus({
+ enabled: Boolean(runtimeConfig.config.use_second_gpu_for_text_encoder),
+ text_encoder_models: selectedTextEncoderModels,
+ })
+ .unwrap()
+ .then((status) => {
+ setCacheStatus(status);
+
+ // Auto-load only when the setting is on, at least one encoder is selected, and the backend
+ // reports at least one of them as not loaded.
+ const shouldAutoLoad =
+ runtimeConfig.config.use_second_gpu_for_text_encoder &&
+ selectedTextEncoderModels.length > 0 &&
+ status.models.some((model) => !model.loaded);
+
+ // The ref remembers the selection key of the last auto-sync so that the same selection is
+ // not synced again on every re-run of this effect.
+ if (!shouldAutoLoad || autoSyncKeyRef.current === selectedTextEncoderModelKey) {
+ return;
+ }
+
+ autoSyncKeyRef.current = selectedTextEncoderModelKey;
+ syncTextEncoderCache({
+ enabled: true,
+ text_encoder_models: selectedTextEncoderModels,
+ })
+ .unwrap()
+ .then((result) => setCacheStatus(result.status))
+ .catch(() => {
+ // Forget the key so that a later run of the effect may retry the sync.
+ autoSyncKeyRef.current = null;
+ });
+ })
+ .catch(() => {
+ // The status request failed (or the handler above threw): drop the displayed status.
+ setCacheStatus(null);
+ });
+ }, [
+ getTextEncoderCacheStatus,
+ isAvailable,
+ runtimeConfig,
+ selectedTextEncoderModelKey,
+ selectedTextEncoderModels,
+ syncTextEncoderCache,
+ ]);
+
+ // Toggle handler: persists the new setting, then asks the backend to sync the text encoder cache
+ // for the current selection and shows the status returned by that sync. A failure in either
+ // request surfaces the same error toast.
+ const onChange = useCallback(
+    async (event: ChangeEvent) => {
+      const shouldEnable = event.target.checked;
+      try {
+        // Save the setting first; the cache sync only runs once the save has succeeded.
+        await updateRuntimeConfig({ use_second_gpu_for_text_encoder: shouldEnable }).unwrap();
+        const syncResult = await syncTextEncoderCache({
+          enabled: shouldEnable,
+          text_encoder_models: selectedTextEncoderModels,
+        }).unwrap();
+        setCacheStatus(syncResult.status);
+      } catch {
+        toast({
+          id: 'USE_SECOND_GPU_FOR_TEXT_ENCODER_SAVE_FAILED',
+          title: 'Could not update second GPU encoder setting',
+          status: 'error',
+        });
+      }
+    },
+    [selectedTextEncoderModels, syncTextEncoderCache, updateRuntimeConfig]
+  );
+
+ return (
+
+
+
+ Use Second GPU for Text Encoder
+
+ {encoderCacheStatusLabel}
+
+
+
+
+
+ );
+};
+
+export default memo(ParamUseSecondGpuForTextEncoder);
diff --git a/invokeai/frontend/web/src/features/parameters/components/Advanced/SystemHardwareStatus.tsx b/invokeai/frontend/web/src/features/parameters/components/Advanced/SystemHardwareStatus.tsx
new file mode 100644
index 00000000000..0b853084eb9
--- /dev/null
+++ b/invokeai/frontend/web/src/features/parameters/components/Advanced/SystemHardwareStatus.tsx
@@ -0,0 +1,33 @@
+import { Flex, Text } from '@invoke-ai/ui-library';
+import { memo } from 'react';
+import { useGetSystemStatusQuery } from 'services/api/endpoints/appInfo';
+
+/**
+ * Formats a utilization value as a whole-number percentage string.
+ *
+ * The generated API schema declares `utilization_percent` as optional and nullable, so this helper
+ * also accepts `undefined`; a missing or non-finite value renders as the `--%` placeholder instead
+ * of `NaN%`.
+ *
+ * @param value - Utilization in percent, or null/undefined when the backend reports none.
+ * @returns The rounded percentage followed by `%`, or `--%` when no usable value is available.
+ */
+const formatPercent = (value: number | null | undefined): string =>
+  value == null || !Number.isFinite(value) ? '--%' : `${Math.round(value)}%`;
+
+const SystemHardwareStatus = () => {
+ const { data } = useGetSystemStatusQuery(undefined, { pollingInterval: 3000 });
+
+ if (!data) {
+ return null;
+ }
+
+ return (
+
+
+ CPU: {Math.round(data.cpu_percent)}% {data.cpu_frequency_ghz?.toFixed(2) ?? '--'}GHz
+
+
+ MEMORY: {data.memory_used_gb.toFixed(1)}/{data.memory_total_gb.toFixed(1)}GB ({Math.round(data.memory_percent)}
+ %)
+
+ {data.gpus.map((gpu) => (
+
+ GPU{gpu.index}: {formatPercent(gpu.utilization_percent)} utilization - loaded {gpu.loaded_gb.toFixed(1)}
+ GB, total {gpu.total_gb.toFixed(1)}GB
+
+ ))}
+
+ );
+};
+
+export default memo(SystemHardwareStatus);
diff --git a/invokeai/frontend/web/src/features/settingsAccordions/components/AdvancedSettingsAccordion/AdvancedSettingsAccordion.tsx b/invokeai/frontend/web/src/features/settingsAccordions/components/AdvancedSettingsAccordion/AdvancedSettingsAccordion.tsx
index bfb69b945c8..7bc822db18d 100644
--- a/invokeai/frontend/web/src/features/settingsAccordions/components/AdvancedSettingsAccordion/AdvancedSettingsAccordion.tsx
+++ b/invokeai/frontend/web/src/features/settingsAccordions/components/AdvancedSettingsAccordion/AdvancedSettingsAccordion.tsx
@@ -24,7 +24,9 @@ import ParamFlux2KleinModelSelect from 'features/parameters/components/Advanced/
import ParamQwenImageComponentSourceSelect from 'features/parameters/components/Advanced/ParamQwenImageComponentSourceSelect';
import ParamQwenImageQuantization from 'features/parameters/components/Advanced/ParamQwenImageQuantization';
import ParamT5EncoderModelSelect from 'features/parameters/components/Advanced/ParamT5EncoderModelSelect';
+import ParamUseSecondGpuForTextEncoder from 'features/parameters/components/Advanced/ParamUseSecondGpuForTextEncoder';
import ParamZImageQwen3VaeModelSelect from 'features/parameters/components/Advanced/ParamZImageQwen3VaeModelSelect';
+import SystemHardwareStatus from 'features/parameters/components/Advanced/SystemHardwareStatus';
import ParamSeamlessXAxis from 'features/parameters/components/Seamless/ParamSeamlessXAxis';
import ParamSeamlessYAxis from 'features/parameters/components/Seamless/ParamSeamlessYAxis';
import ParamColorCompensation from 'features/parameters/components/VAEModel/ParamColorCompensation';
@@ -136,11 +138,13 @@ export const AdvancedSettingsAccordion = memo(() => {
+
)}
{isFlux2 && (
+
)}
{isSD3 && (
@@ -148,24 +152,29 @@ export const AdvancedSettingsAccordion = memo(() => {
+
)}
{isZImage && (
+
)}
{isQwenImage && (
+
)}
{isAnima && (
+
)}
+
);
diff --git a/invokeai/frontend/web/src/features/system/components/SettingsModal/SettingsModal.tsx b/invokeai/frontend/web/src/features/system/components/SettingsModal/SettingsModal.tsx
index 8e331645a9b..a284f28acde 100644
--- a/invokeai/frontend/web/src/features/system/components/SettingsModal/SettingsModal.tsx
+++ b/invokeai/frontend/web/src/features/system/components/SettingsModal/SettingsModal.tsx
@@ -29,6 +29,7 @@ import { SettingsDeveloperLogIsEnabled } from 'features/system/components/Settin
import { SettingsDeveloperLogLevel } from 'features/system/components/SettingsModal/SettingsDeveloperLogLevel';
import { SettingsDeveloperLogNamespaces } from 'features/system/components/SettingsModal/SettingsDeveloperLogNamespaces';
import { SettingsImageSubfolderStrategySelect } from 'features/system/components/SettingsModal/SettingsImageSubfolderStrategySelect';
+import { SettingsModelCacheSleepTimerSelect } from 'features/system/components/SettingsModal/SettingsModelCacheSleepTimerSelect';
import { useClearIntermediates } from 'features/system/components/SettingsModal/useClearIntermediates';
import { StickyScrollable } from 'features/system/components/StickyScrollable';
import {
@@ -327,6 +328,7 @@ const SettingsModal = (props: { children: ReactElement<{ onClick?: () => void }>
+
diff --git a/invokeai/frontend/web/src/features/system/components/SettingsModal/SettingsModelCacheSleepTimerSelect.tsx b/invokeai/frontend/web/src/features/system/components/SettingsModal/SettingsModelCacheSleepTimerSelect.tsx
new file mode 100644
index 00000000000..3f0d0f0590d
--- /dev/null
+++ b/invokeai/frontend/web/src/features/system/components/SettingsModal/SettingsModelCacheSleepTimerSelect.tsx
@@ -0,0 +1,94 @@
+import type { ComboboxOnChange } from '@invoke-ai/ui-library';
+import { Combobox, FormControl, FormLabel } from '@invoke-ai/ui-library';
+import { useAppSelector } from 'app/store/storeHooks';
+import { selectCurrentUser } from 'features/auth/store/authSlice';
+import { toast } from 'features/toast/toast';
+import { memo, useCallback, useMemo } from 'react';
+import { useTranslation } from 'react-i18next';
+import { useGetRuntimeConfigQuery, useUpdateRuntimeConfigMutation } from 'services/api/endpoints/appInfo';
+
+/**
+ * A selectable sleep-timer entry. `label` holds an i18n key (it is passed to `t()` before display)
+ * and `value` holds the number of minutes as a string.
+ */
+type ModelCacheSleepTimerOption = {
+ label: string;
+ value: string;
+};
+
+/**
+ * Preset sleep-timer choices offered in the settings dropdown. The selected value is converted to
+ * a number and saved as `model_cache_keep_alive_min`; the `0` preset is labelled "Off".
+ */
+const modelCacheSleepTimerOptions = [
+ { label: 'settings.modelCacheSleepTimerOff', value: '0' },
+ { label: 'settings.modelCacheSleepTimer1Min', value: '1' },
+ { label: 'settings.modelCacheSleepTimer5Min', value: '5' },
+ { label: 'settings.modelCacheSleepTimer10Min', value: '10' },
+ { label: 'settings.modelCacheSleepTimer30Min', value: '30' },
+] satisfies ModelCacheSleepTimerOption[];
+
+/**
+ * Resolves the dropdown option that represents the given keep-alive duration.
+ *
+ * @param minutes - Keep-alive duration in minutes.
+ * @returns The matching preset option, or a "custom" option carrying the stringified minutes when
+ * no preset has that value.
+ */
+const getModelCacheSleepTimerOption = (minutes: number): ModelCacheSleepTimerOption => {
+  const value = String(minutes);
+  for (const presetOption of modelCacheSleepTimerOptions) {
+    if (presetOption.value === value) {
+      return presetOption;
+    }
+  }
+  // No preset matches: describe the value with the parameterised "custom" label key.
+  return { label: 'settings.modelCacheSleepTimerCustom', value };
+};
+
+export const SettingsModelCacheSleepTimerSelect = memo(() => {
+ const { t } = useTranslation();
+ const currentUser = useAppSelector(selectCurrentUser);
+ const { data: runtimeConfig } = useGetRuntimeConfigQuery();
+ const [updateRuntimeConfig, { isLoading }] = useUpdateRuntimeConfigMutation();
+ const modelCacheSleepTimer = runtimeConfig?.config.model_cache_keep_alive_min ?? 0;
+ const canEditRuntimeConfig = runtimeConfig ? !runtimeConfig.config.multiuser || currentUser?.is_admin : false;
+
+ const options = useMemo(
+ () =>
+ modelCacheSleepTimerOptions.map((option) => ({
+ ...option,
+ label: t(option.label),
+ })),
+ [t]
+ );
+
+ const value = useMemo(() => {
+ const option = getModelCacheSleepTimerOption(modelCacheSleepTimer);
+ return {
+ ...option,
+ label:
+ option.label === 'settings.modelCacheSleepTimerCustom'
+ ? t(option.label, { minutes: modelCacheSleepTimer })
+ : t(option.label),
+ };
+ }, [modelCacheSleepTimer, t]);
+
+ const onChange = useCallback(
+ async (selection) => {
+ const minutes = Number(selection?.value);
+ if (!Number.isFinite(minutes) || minutes < 0 || minutes === modelCacheSleepTimer) {
+ return;
+ }
+
+ try {
+ await updateRuntimeConfig({ model_cache_keep_alive_min: minutes }).unwrap();
+ } catch {
+ toast({
+ id: 'SETTINGS_MODEL_CACHE_SLEEP_TIMER_SAVE_FAILED',
+ title: t('settings.modelCacheSleepTimerSaveFailed'),
+ status: 'error',
+ });
+ }
+ },
+ [modelCacheSleepTimer, t, updateRuntimeConfig]
+ );
+
+ return (
+
+ {t('settings.modelCacheSleepTimer')}
+
+
+ );
+});
+
+SettingsModelCacheSleepTimerSelect.displayName = 'SettingsModelCacheSleepTimerSelect';
diff --git a/invokeai/frontend/web/src/services/api/endpoints/appInfo.ts b/invokeai/frontend/web/src/services/api/endpoints/appInfo.ts
index 653f458dde8..5fc3b9af258 100644
--- a/invokeai/frontend/web/src/services/api/endpoints/appInfo.ts
+++ b/invokeai/frontend/web/src/services/api/endpoints/appInfo.ts
@@ -1,3 +1,4 @@
+import type { ModelIdentifierField } from 'features/nodes/types/common';
import type { OpenAPIV3_1 } from 'openapi-types';
import type { stringify } from 'querystring';
import type { paths } from 'services/api/schema';
@@ -77,6 +78,27 @@ export const appInfoApi = api.injectEndpoints({
},
invalidatesTags: ['AppConfig'],
}),
+ // POSTs the toggle state and the selected text encoder models to `sync_text_encoder_cache`.
+ // NOTE(review): this invalidates the `AppConfig` tag, so queries providing that tag are
+ // presumably refetched after every sync - confirm that this is intended.
+ syncTextEncoderCache: build.mutation({
+ query: (body) => ({
+ url: buildAppInfoUrl('sync_text_encoder_cache'),
+ method: 'POST',
+ body,
+ }),
+ invalidatesTags: ['AppConfig'],
+ }),
+ // POSTs the same request body to `text_encoder_cache_status`. Declared as a mutation although
+ // no tags are invalidated; callers trigger it imperatively.
+ getTextEncoderCacheStatus: build.mutation({
+ query: (body) => ({
+ url: buildAppInfoUrl('text_encoder_cache_status'),
+ method: 'POST',
+ body,
+ }),
+ }),
+ // GETs `system_status` (CPU, memory and per-GPU figures).
+ getSystemStatus: build.query({
+ query: () => ({
+ url: buildAppInfoUrl('system_status'),
+ method: 'GET',
+ }),
+ }),
getExternalProviderStatuses: build.query({
query: () => ({
url: buildAppInfoUrl('external_providers/status'),
@@ -154,6 +176,9 @@ export const {
useSetExternalProviderConfigMutation,
useResetExternalProviderConfigMutation,
useUpdateRuntimeConfigMutation,
+ useSyncTextEncoderCacheMutation,
+ useGetTextEncoderCacheStatusMutation,
+ useGetSystemStatusQuery,
useClearInvocationCacheMutation,
useDisableInvocationCacheMutation,
useEnableInvocationCacheMutation,
@@ -165,3 +190,48 @@ export const {
type SetExternalProviderConfigArg = ExternalProviderConfigUpdate & {
provider_id: string;
};
+
+/**
+ * Request body shared by the `sync_text_encoder_cache` and `text_encoder_cache_status` endpoints.
+ * Hand-written counterpart of the generated `SyncTextEncoderCacheRequest` schema, which declares
+ * `text_encoder_models` as optional; it is required here.
+ */
+type SyncTextEncoderCacheArg = {
+ enabled: boolean;
+ text_encoder_models: ModelIdentifierField[];
+};
+
+/**
+ * Result of a cache sync: counts of dropped and loaded entries plus the cache status after the sync.
+ * Hand-written counterpart of the generated `SyncTextEncoderCacheResponse` schema.
+ */
+type SyncTextEncoderCacheResponse = {
+ dropped: number;
+ loaded: number;
+ status: TextEncoderCacheStatusResponse;
+};
+
+/**
+ * Cache status of the selected text encoders plus CUDA memory figures.
+ * Hand-written counterpart of the generated `TextEncoderCacheStatusResponse` schema; the generated
+ * schema marks `models[].device` as optional, whereas it is required (nullable) here.
+ */
+type TextEncoderCacheStatusResponse = {
+ models: {
+ key: string;
+ name: string;
+ cache_key: string;
+ loaded: boolean;
+ device: string | null;
+ vram_gb: number;
+ total_gb: number;
+ }[];
+ cuda_devices: {
+ index: number;
+ name: string;
+ used_gb: number;
+ invoke_cache_gb: number;
+ total_gb: number;
+ }[];
+};
+
+/**
+ * CPU, memory and per-GPU figures returned by `system_status`.
+ * Hand-written counterpart of the generated `SystemStatusResponse` schema; the generated schema
+ * marks `cpu_frequency_ghz` and `gpus[].utilization_percent` as optional, whereas they are
+ * required (nullable) here.
+ */
+type SystemStatusResponse = {
+ cpu_percent: number;
+ cpu_frequency_ghz: number | null;
+ memory_used_gb: number;
+ memory_total_gb: number;
+ memory_percent: number;
+ gpus: {
+ index: number;
+ name: string;
+ utilization_percent: number | null;
+ loaded_gb: number;
+ total_gb: number;
+ }[];
+};
diff --git a/invokeai/frontend/web/src/services/api/schema.ts b/invokeai/frontend/web/src/services/api/schema.ts
index 7864579706a..72320236614 100644
--- a/invokeai/frontend/web/src/services/api/schema.ts
+++ b/invokeai/frontend/web/src/services/api/schema.ts
@@ -1670,6 +1670,57 @@ export type paths = {
patch: operations["update_runtime_config"];
trace?: never;
};
+ "/api/v1/app/sync_text_encoder_cache": {
+ parameters: {
+ query?: never;
+ header?: never;
+ path?: never;
+ cookie?: never;
+ };
+ get?: never;
+ put?: never;
+ /** Sync Text Encoder Cache */
+ post: operations["sync_text_encoder_cache"];
+ delete?: never;
+ options?: never;
+ head?: never;
+ patch?: never;
+ trace?: never;
+ };
+ "/api/v1/app/text_encoder_cache_status": {
+ parameters: {
+ query?: never;
+ header?: never;
+ path?: never;
+ cookie?: never;
+ };
+ get?: never;
+ put?: never;
+ /** Get Text Encoder Cache Status */
+ post: operations["get_text_encoder_cache_status"];
+ delete?: never;
+ options?: never;
+ head?: never;
+ patch?: never;
+ trace?: never;
+ };
+ "/api/v1/app/system_status": {
+ parameters: {
+ query?: never;
+ header?: never;
+ path?: never;
+ cookie?: never;
+ };
+ /** Get System Status */
+ get: operations["get_system_status"];
+ put?: never;
+ post?: never;
+ delete?: never;
+ options?: never;
+ head?: never;
+ patch?: never;
+ trace?: never;
+ };
"/api/v1/app/external_providers/status": {
parameters: {
query?: never;
@@ -8016,6 +8067,37 @@ export type components = {
*/
type: "crop_latents";
};
+ /**
+ * CudaDeviceStatus
+ * @description CUDA device memory status.
+ */
+ CudaDeviceStatus: {
+ /**
+ * Index
+ * @description CUDA device index.
+ */
+ index: number;
+ /**
+ * Name
+ * @description CUDA device name.
+ */
+ name: string;
+ /**
+ * Used Gb
+ * @description Total device memory used in GB, including non-InvokeAI processes.
+ */
+ used_gb: number;
+ /**
+ * Invoke Cache Gb
+ * @description InvokeAI model cache memory used on this device in GB.
+ */
+ invoke_cache_gb: number;
+ /**
+ * Total Gb
+ * @description Total device memory in GB.
+ */
+ total_gb: number;
+ };
/**
* OpenCV Inpaint
* @description Simple inpaint using opencv.
@@ -16204,6 +16286,7 @@ export type components = {
* lazy_offload: DEPRECATED: This setting is no longer used. Lazy-offloading is enabled by default. This config setting will be removed once the new model cache behavior is stable.
* pytorch_cuda_alloc_conf: Configure the Torch CUDA memory allocator. This will impact peak reserved VRAM usage and performance. Setting to "backend:cudaMallocAsync" works well on many systems. The optimal configuration is highly dependent on the system configuration (device type, VRAM, CUDA driver version, etc.), so must be tuned experimentally.
* device: Preferred execution device. `auto` will choose the device depending on the hardware platform and the installed torch capabilities.
Valid values: `auto`, `cpu`, `cuda`, `mps`, `cuda:N` (where N is a device number)
+ * use_second_gpu_for_text_encoder: When at least two CUDA GPUs are available, run text encoder models on the CUDA device that is not the main execution device.
* precision: Floating point precision. `float16` will consume half the memory of `float32` but produce slightly lower-quality images. The `auto` setting will guess the proper precision based on your video card and operating system.
Valid values: `auto`, `float16`, `bfloat16`, `float32`
* sequential_guidance: Whether to calculate guidance in serial instead of in parallel, lowering memory requirements.
* attention_type: Attention type.
Valid values: `auto`, `normal`, `xformers`, `sliced`, `torch-sdp`
@@ -16508,6 +16591,12 @@ export type components = {
* @default auto
*/
device?: string;
+ /**
+ * Use Second Gpu For Text Encoder
+ * @description When at least two CUDA GPUs are available, run text encoder models on the CUDA device that is not the main execution device.
+ * @default false
+ */
+ use_second_gpu_for_text_encoder?: boolean;
/**
* Precision
* @description Floating point precision. `float16` will consume half the memory of `float32` but produce slightly lower-quality images. The `auto` setting will guess the proper precision based on your video card and operating system.
@@ -23374,10 +23463,7 @@ export type components = {
base: components["schemas"]["BaseModelType"];
/** @description The model's type */
type: components["schemas"]["ModelType"];
- /**
- * @description The submodel to load, if this is a main model
- * @default null
- */
+ /** @description The submodel to load, if this is a main model */
submodel_type?: components["schemas"]["SubModelType"] | null;
};
/**
@@ -29360,6 +29446,107 @@ export type components = {
*/
type: "sub";
};
+ /**
+ * SyncTextEncoderCacheRequest
+ * @description Request to actively sync selected text encoder cache entries with the second-GPU toggle state.
+ */
+ SyncTextEncoderCacheRequest: {
+ /**
+ * Enabled
+ * @description Whether second-GPU text encoder mode is enabled.
+ */
+ enabled: boolean;
+ /**
+ * Text Encoder Models
+ * @description Selected text encoder models to unload or prewarm.
+ */
+ text_encoder_models?: components["schemas"]["ModelIdentifierField"][];
+ };
+ /**
+ * SyncTextEncoderCacheResponse
+ * @description Text encoder cache sync result.
+ */
+ SyncTextEncoderCacheResponse: {
+ /**
+ * Dropped
+ * @description Number of cache entries immediately dropped.
+ */
+ dropped: number;
+ /**
+ * Loaded
+ * @description Number of selected encoder entries loaded onto their target device.
+ */
+ loaded: number;
+ /** @description Text encoder cache status after sync. */
+ status: components["schemas"]["TextEncoderCacheStatusResponse"];
+ };
+ /**
+ * SystemGpuStatus
+ * @description Basic GPU status.
+ */
+ SystemGpuStatus: {
+ /**
+ * Index
+ * @description GPU device index.
+ */
+ index: number;
+ /**
+ * Name
+ * @description GPU device name.
+ */
+ name: string;
+ /**
+ * Utilization Percent
+ * @description GPU utilization percent.
+ */
+ utilization_percent?: number | null;
+ /**
+ * Loaded Gb
+ * @description GPU memory used in GB.
+ */
+ loaded_gb: number;
+ /**
+ * Total Gb
+ * @description Total GPU memory in GB.
+ */
+ total_gb: number;
+ };
+ /**
+ * SystemStatusResponse
+ * @description Basic system status.
+ */
+ SystemStatusResponse: {
+ /**
+ * Cpu Percent
+ * @description CPU utilization percent.
+ */
+ cpu_percent: number;
+ /**
+ * Cpu Frequency Ghz
+ * @description Current CPU frequency in GHz.
+ */
+ cpu_frequency_ghz?: number | null;
+ /**
+ * Memory Used Gb
+ * @description System memory used in GB.
+ */
+ memory_used_gb: number;
+ /**
+ * Memory Total Gb
+ * @description Total system memory in GB.
+ */
+ memory_total_gb: number;
+ /**
+ * Memory Percent
+ * @description System memory utilization percent.
+ */
+ memory_percent: number;
+ /**
+ * Gpus
+ * @description GPU statuses.
+ */
+ gpus: components["schemas"]["SystemGpuStatus"][];
+ };
/** T2IAdapterField */
T2IAdapterField: {
/** @description The T2I-Adapter image prompt. */
@@ -30295,6 +30482,63 @@ export type components = {
*/
tensor_name: string;
};
+ /**
+ * TextEncoderCacheModelStatus
+ * @description Status for one selected text encoder cache entry.
+ */
+ TextEncoderCacheModelStatus: {
+ /**
+ * Key
+ * @description Model key.
+ */
+ key: string;
+ /**
+ * Name
+ * @description Model name.
+ */
+ name: string;
+ /**
+ * Cache Key
+ * @description Resolved cache key.
+ */
+ cache_key: string;
+ /**
+ * Loaded
+ * @description Whether the cache entry exists and has weights on its execution device.
+ */
+ loaded: boolean;
+ /**
+ * Device
+ * @description Execution device for the cache entry.
+ */
+ device?: string | null;
+ /**
+ * Vram Gb
+ * @description Estimated model VRAM resident size in GB.
+ */
+ vram_gb: number;
+ /**
+ * Total Gb
+ * @description Estimated model size in GB.
+ */
+ total_gb: number;
+ };
+ /**
+ * TextEncoderCacheStatusResponse
+ * @description Selected text encoder cache and CUDA memory status.
+ */
+ TextEncoderCacheStatusResponse: {
+ /**
+ * Models
+ * @description Selected text encoder cache statuses.
+ */
+ models: components["schemas"]["TextEncoderCacheModelStatus"][];
+ /**
+ * Cuda Devices
+ * @description CUDA memory status.
+ */
+ cuda_devices: components["schemas"]["CudaDeviceStatus"][];
+ };
/**
* Text LLM
* @description Run a text language model to generate or expand text (e.g. for prompt expansion).
@@ -30995,6 +31239,16 @@ export type components = {
* @description Keep the last N completed, failed, and canceled queue items on startup. Set to 0 to prune all terminal items.
*/
max_queue_history?: number | null;
+ /**
+ * Model Cache Keep Alive Min
+ * @description How long to keep unlocked models in cache after last use, in minutes. 0 keeps models indefinitely.
+ */
+ model_cache_keep_alive_min?: number | null;
+ /**
+ * Use Second Gpu For Text Encoder
+ * @description Run text encoder models on the CUDA device that is not the main execution device when at least two CUDA GPUs are available.
+ */
+ use_second_gpu_for_text_encoder?: boolean | null;
};
/**
* UserDTO
@@ -36510,6 +36764,92 @@ export interface operations {
};
};
};
+ sync_text_encoder_cache: {
+ parameters: {
+ query?: never;
+ header?: never;
+ path?: never;
+ cookie?: never;
+ };
+ requestBody: {
+ content: {
+ "application/json": components["schemas"]["SyncTextEncoderCacheRequest"];
+ };
+ };
+ responses: {
+ /** @description Successful Response */
+ 200: {
+ headers: {
+ [name: string]: unknown;
+ };
+ content: {
+ "application/json": components["schemas"]["SyncTextEncoderCacheResponse"];
+ };
+ };
+ /** @description Validation Error */
+ 422: {
+ headers: {
+ [name: string]: unknown;
+ };
+ content: {
+ "application/json": components["schemas"]["HTTPValidationError"];
+ };
+ };
+ };
+ };
+ get_text_encoder_cache_status: {
+ parameters: {
+ query?: never;
+ header?: never;
+ path?: never;
+ cookie?: never;
+ };
+ requestBody: {
+ content: {
+ "application/json": components["schemas"]["SyncTextEncoderCacheRequest"];
+ };
+ };
+ responses: {
+ /** @description Successful Response */
+ 200: {
+ headers: {
+ [name: string]: unknown;
+ };
+ content: {
+ "application/json": components["schemas"]["TextEncoderCacheStatusResponse"];
+ };
+ };
+ /** @description Validation Error */
+ 422: {
+ headers: {
+ [name: string]: unknown;
+ };
+ content: {
+ "application/json": components["schemas"]["HTTPValidationError"];
+ };
+ };
+ };
+ };
+ get_system_status: {
+ parameters: {
+ query?: never;
+ header?: never;
+ path?: never;
+ cookie?: never;
+ };
+ requestBody?: never;
+ responses: {
+ /** @description Successful Response */
+ 200: {
+ headers: {
+ [name: string]: unknown;
+ };
+ content: {
+ "application/json": components["schemas"]["SystemStatusResponse"];
+ };
+ };
+ };
+ };
get_external_provider_statuses: {
parameters: {
query?: never;