"""Memory strategy selection. CUDA supports cpu_offload; DirectML/CPU do not. Apply per-pipeline based on backend + VRAM tier. All paths reduce peak VRAM without breaking on non-CUDA devices. """ from __future__ import annotations from .device import get_device, hardware_info def apply_memory_strategy(pipe) -> None: """Apply VRAM-saving knobs that match the active backend.""" info = hardware_info() backend = info["backend"] vram = info["vram_gb"] # Always-safe: VAE tiling/slicing work on any device. Cuts peak VRAM during decode. # Newer diffusers (>=0.32) prefers calling on the VAE directly. vae = getattr(pipe, "vae", None) if vae is not None: for fn in ("enable_slicing", "enable_tiling"): if hasattr(vae, fn): try: getattr(vae, fn)() except Exception: pass if hasattr(pipe, "enable_attention_slicing"): try: pipe.enable_attention_slicing() except Exception: pass if backend == "cuda": # Offload only if VRAM tight. cpu_offload is CUDA-only via accelerate hooks. if vram < 10: try: pipe.enable_sequential_cpu_offload() return except Exception: pass try: pipe.enable_model_cpu_offload() return except Exception: pass pipe.to(get_device()) return if backend == "directml": # DirectML lacks accelerate hook support. Move whole pipe to device. # Slicing already enabled above keeps peak in check. try: pipe.to(get_device()) except Exception: # Some pipes have components that won't move cleanly; fall back to CPU. pipe.to("cpu") return # CPU pipe.to("cpu")