From 05ac5f5c958099ce6dbda917d4b46728b16f6690 Mon Sep 17 00:00:00 2001 From: bghira Date: Thu, 4 Jun 2026 17:07:48 -0600 Subject: [PATCH 1/2] anima: officially support release architecture --- simpletuner/helpers/models/anima/model.py | 11 +- .../helpers/models/anima/transformer.py | 97 ++++++++++++---- tests/test_anima_model.py | 104 ++++++++++++++++++ 3 files changed, 191 insertions(+), 21 deletions(-) diff --git a/simpletuner/helpers/models/anima/model.py b/simpletuner/helpers/models/anima/model.py index 7a682a52f..420efc904 100644 --- a/simpletuner/helpers/models/anima/model.py +++ b/simpletuner/helpers/models/anima/model.py @@ -68,6 +68,8 @@ class Anima(ImageModelFoundation): DEFAULT_MODEL_FLAVOUR = "preview-3" HUGGINGFACE_PATHS = { + "release": "circlestone-labs/Anima-Base-v1.0-Diffusers", + "base-v1.0": "circlestone-labs/Anima-Base-v1.0-Diffusers", "preview-3": "CalamitousFelicitousness/Anima-Preview-3-sdnext-diffusers", "preview-2": "CalamitousFelicitousness/Anima-Preview-2-sdnext-diffusers", "preview": "CalamitousFelicitousness/Anima-sdnext-diffusers", @@ -180,8 +182,15 @@ def _expand_crepa_self_flow_patch_values( def _latent_sequence_length(self, latent_tensor: torch.Tensor) -> int: p_t, p_h, p_w = self._crepa_self_flow_patch_size() + _, _, frames, height, width = latent_tensor.shape + if frames % p_t != 0 or height % p_h != 0 or width % p_w != 0: + raise ValueError( + "Anima latent shape must be divisible by transformer patch size " + f"{(p_t, p_h, p_w)}, got latent shape {tuple(latent_tensor.shape)}. " + "Rebuild the VAE/aspect bucket cache with pixel resolutions divisible by 16." + ) return max( - (latent_tensor.shape[2] // p_t) * (latent_tensor.shape[3] // p_h) * (latent_tensor.shape[4] // p_w), + (frames // p_t) * (height // p_h) * (width // p_w), 1, ) diff --git a/simpletuner/helpers/models/anima/transformer.py b/simpletuner/helpers/models/anima/transformer.py index bdc36c8b4..bb72b26a2 100644 --- a/simpletuner/helpers/models/anima/transformer.py +++ b/simpletuner/helpers/models/anima/transformer.py @@ -18,6 +18,7 @@ from diffusers.models.normalization import RMSNorm as DiffusersRMSNorm from diffusers.utils import USE_PEFT_BACKEND, set_weights_and_activate_adapters from huggingface_hub import hf_hub_download +from huggingface_hub.errors import EntryNotFoundError, LocalEntryNotFoundError from safetensors.torch import load_file from torch import nn @@ -26,6 +27,16 @@ DEFAULT_ANIMA_TRANSFORMER_FILENAME = "anima-preview.safetensors" DIFFUSERS_LLM_ADAPTER_FILENAME = "llm_adapter/diffusion_pytorch_model.safetensors" DIFFUSERS_LLM_ADAPTER_CONFIG_FILENAME = "llm_adapter/config.json" +DIFFUSERS_TEXT_CONDITIONER_FILENAME = "text_conditioner/diffusion_pytorch_model.safetensors" +DIFFUSERS_TEXT_CONDITIONER_CONFIG_FILENAME = "text_conditioner/config.json" +DIFFUSERS_ADAPTER_WEIGHT_FILENAMES = ( + DIFFUSERS_LLM_ADAPTER_FILENAME, + DIFFUSERS_TEXT_CONDITIONER_FILENAME, +) +DIFFUSERS_ADAPTER_CONFIG_FILENAMES = ( + DIFFUSERS_LLM_ADAPTER_CONFIG_FILENAME, + DIFFUSERS_TEXT_CONDITIONER_CONFIG_FILENAME, +) def _rotate_half(x: torch.Tensor) -> torch.Tensor: @@ -496,9 +507,11 @@ def _diffusers_repo_root(pretrained_model_name_or_path: str, subfolder: Optional return pretrained_model_name_or_path @staticmethod - def _resolve_diffusers_llm_adapter_path( + def _resolve_diffusers_adapter_file( pretrained_model_name_or_path: str, *, + filenames: tuple[str, ...], + component_name: str, subfolder: Optional[str] = None, revision: Optional[str] = None, cache_dir: Optional[str] = None, @@ -508,16 +521,55 @@ def _resolve_diffusers_llm_adapter_path( ) -> str: if os.path.isdir(pretrained_model_name_or_path): repo_root = AnimaTransformerModel._diffusers_repo_root(pretrained_model_name_or_path, subfolder=subfolder) - return os.path.join(repo_root, DIFFUSERS_LLM_ADAPTER_FILENAME) + for filename in filenames: + candidate = os.path.join(repo_root, filename) + if os.path.isfile(candidate): + return candidate + raise FileNotFoundError( + f"Anima Diffusers directory {repo_root!r} is missing {component_name}; " + f"expected one of: {', '.join(filenames)}" + ) normalized_token = None if token is False else token - return hf_hub_download( + last_error: Exception | None = None + for filename in filenames: + try: + return hf_hub_download( + pretrained_model_name_or_path, + filename=filename, + revision=revision, + cache_dir=cache_dir, + force_download=force_download, + local_files_only=local_files_only, + token=normalized_token, + ) + except (EntryNotFoundError, LocalEntryNotFoundError) as exc: + last_error = exc + raise FileNotFoundError( + f"Anima Diffusers repository {pretrained_model_name_or_path!r} is missing {component_name}; " + f"expected one of: {', '.join(filenames)}" + ) from last_error + + @staticmethod + def _resolve_diffusers_llm_adapter_path( + pretrained_model_name_or_path: str, + *, + subfolder: Optional[str] = None, + revision: Optional[str] = None, + cache_dir: Optional[str] = None, + force_download: bool = False, + local_files_only: bool = False, + token: str | bool | None = None, + ) -> str: + return AnimaTransformerModel._resolve_diffusers_adapter_file( pretrained_model_name_or_path, - filename=DIFFUSERS_LLM_ADAPTER_FILENAME, + filenames=DIFFUSERS_ADAPTER_WEIGHT_FILENAMES, + component_name="text adapter weights", + subfolder=subfolder, revision=revision, cache_dir=cache_dir, force_download=force_download, local_files_only=local_files_only, - token=normalized_token, + token=token, ) @staticmethod @@ -531,20 +583,25 @@ def _resolve_diffusers_llm_adapter_config_path( local_files_only: bool = False, token: str | bool | None = None, ) -> str: - if os.path.isdir(pretrained_model_name_or_path): - repo_root = AnimaTransformerModel._diffusers_repo_root(pretrained_model_name_or_path, subfolder=subfolder) - return os.path.join(repo_root, DIFFUSERS_LLM_ADAPTER_CONFIG_FILENAME) - normalized_token = None if token is False else token - return hf_hub_download( + return AnimaTransformerModel._resolve_diffusers_adapter_file( pretrained_model_name_or_path, - filename=DIFFUSERS_LLM_ADAPTER_CONFIG_FILENAME, + filenames=DIFFUSERS_ADAPTER_CONFIG_FILENAMES, + component_name="text adapter config", + subfolder=subfolder, revision=revision, cache_dir=cache_dir, force_download=force_download, local_files_only=local_files_only, - token=normalized_token, + token=token, ) + @staticmethod + def _adapter_config_int(adapter_config: dict[str, Any], *keys: str) -> int: + for key in keys: + if key in adapter_config: + return int(adapter_config[key]) + raise KeyError(f"Anima text adapter config is missing one of: {', '.join(keys)}") + @classmethod def _load_diffusers_llm_adapter_config( cls, @@ -563,12 +620,12 @@ def _load_diffusers_llm_adapter_config( with open(config_path, encoding="utf-8") as handle: adapter_config = json.load(handle) - model_dim = int(adapter_config["model_dim"]) + model_dim = cls._adapter_config_int(adapter_config, "model_dim") if ( - int(adapter_config.get("source_dim", model_dim)) != model_dim - or int(adapter_config.get("target_dim", model_dim)) != model_dim + cls._adapter_config_int(adapter_config, "source_dim") != model_dim + or cls._adapter_config_int(adapter_config, "target_dim") != model_dim ): - raise ValueError("Anima llm_adapter source_dim, target_dim, and model_dim must match.") + raise ValueError("Anima text adapter source_dim, target_dim, and model_dim must match.") return adapter_config @classmethod @@ -592,10 +649,10 @@ def _from_diffusers_components( max_size=tuple(config.max_size), patch_size=tuple(config.patch_size), rope_scale=tuple(config.rope_scale), - adapter_vocab_size=int(adapter_config["vocab_size"]), - adapter_dim=int(adapter_config["model_dim"]), - adapter_layers=int(adapter_config["num_layers"]), - adapter_heads=int(adapter_config["num_heads"]), + adapter_vocab_size=cls._adapter_config_int(adapter_config, "vocab_size", "target_vocab_size"), + adapter_dim=cls._adapter_config_int(adapter_config, "model_dim"), + adapter_layers=cls._adapter_config_int(adapter_config, "num_layers"), + adapter_heads=cls._adapter_config_int(adapter_config, "num_heads", "num_attention_heads"), ) _patch_diffusers_rmsnorm_to_anima(core) transformer.core = core diff --git a/tests/test_anima_model.py b/tests/test_anima_model.py index 2f0a5d844..2def8d89d 100644 --- a/tests/test_anima_model.py +++ b/tests/test_anima_model.py @@ -20,6 +20,14 @@ def test_model_import(self): def test_model_flavours_use_converted_diffusers_repos(self): from simpletuner.helpers.models.anima.model import Anima + self.assertEqual( + Anima.HUGGINGFACE_PATHS["release"], + "circlestone-labs/Anima-Base-v1.0-Diffusers", + ) + self.assertEqual( + Anima.HUGGINGFACE_PATHS["base-v1.0"], + "circlestone-labs/Anima-Base-v1.0-Diffusers", + ) self.assertEqual( Anima.HUGGINGFACE_PATHS["preview-3"], "CalamitousFelicitousness/Anima-Preview-3-sdnext-diffusers", @@ -58,6 +66,24 @@ def test_diffusers_layout_switches_component_sources(self): self.assertEqual(model.MODEL_SUBFOLDER, "transformer") mock_load_model.assert_called_once_with(move_to_device=False) + def test_release_diffusers_layout_switches_component_sources(self): + from simpletuner.helpers.models.anima.model import Anima + + model = Anima.__new__(Anima) + model.config = SimpleNamespace( + pretrained_model_name_or_path="circlestone-labs/Anima-Base-v1.0-Diffusers", + model_flavour="release", + ) + + self.assertTrue(model._uses_diffusers_repo_layout()) + self.assertEqual( + model._prompt_tokenizer_sources(), + ( + "circlestone-labs/Anima-Base-v1.0-Diffusers::tokenizer", + "circlestone-labs/Anima-Base-v1.0-Diffusers::t5_tokenizer", + ), + ) + def test_diffusers_layout_loads_text_encoder_and_vae_from_standard_subfolders(self): from simpletuner.helpers.models.anima.model import Anima @@ -332,6 +358,62 @@ def test_diffusers_transformer_loads_sibling_llm_adapter(self): for name, parameter in loaded.llm_adapter.state_dict().items(): torch.testing.assert_close(parameter, source.llm_adapter.state_dict()[name]) + def test_diffusers_transformer_loads_release_text_conditioner(self): + from tempfile import TemporaryDirectory + + from safetensors.torch import save_file + + from simpletuner.helpers.models.anima.transformer import AnimaTransformerModel + + source = AnimaTransformerModel( + in_channels=2, + out_channels=2, + num_attention_heads=2, + attention_head_dim=4, + num_layers=1, + mlp_ratio=2.0, + text_embed_dim=8, + adaln_lora_dim=8, + max_size=(2, 4, 4), + patch_size=(1, 2, 2), + adapter_dim=8, + adapter_layers=1, + adapter_heads=2, + ) + + with TemporaryDirectory() as tmpdir: + repo_path = Path(tmpdir) + transformer_path = repo_path / "transformer" + adapter_dir = repo_path / "text_conditioner" + adapter_dir.mkdir() + source.core.save_pretrained(str(transformer_path), safe_serialization=True) + with open(adapter_dir / "config.json", "w", encoding="utf-8") as handle: + json.dump( + { + "source_dim": 8, + "target_dim": 8, + "model_dim": 8, + "num_layers": 1, + "num_attention_heads": 2, + "target_vocab_size": 32128, + }, + handle, + ) + adapter_path = adapter_dir / "diffusion_pytorch_model.safetensors" + save_file(source.llm_adapter.state_dict(), str(adapter_path)) + + loaded = AnimaTransformerModel.from_pretrained( + str(repo_path), + subfolder="transformer", + local_files_only=True, + token=False, + ) + + for name, parameter in loaded.core.state_dict().items(): + torch.testing.assert_close(parameter, source.core.state_dict()[name]) + for name, parameter in loaded.llm_adapter.state_dict().items(): + torch.testing.assert_close(parameter, source.llm_adapter.state_dict()[name]) + def test_pipeline_import(self): from simpletuner.helpers.models.anima.pipeline import AnimaPipeline @@ -539,6 +621,28 @@ def test_model_predict_preserves_frame_axis_to_match_flow_target(self): self.assertEqual(result["model_prediction"].shape, target.shape) self.assertEqual((result["model_prediction"] - target).shape, target.shape) + def test_model_predict_rejects_latents_not_divisible_by_patch_size(self): + from simpletuner.helpers.models.anima.model import Anima + + model = Anima.__new__(Anima) + model.accelerator = SimpleNamespace(device=torch.device("cpu")) + model.config = SimpleNamespace(weight_dtype=torch.float32) + model.model = MagicMock(config=SimpleNamespace(patch_size=(1, 2, 2))) + model.unwrap_model = lambda model=None, wrapped=None: model if model is not None else wrapped + + prepared_batch = { + "noisy_latents": torch.randn(1, 16, 1, 147, 110), + "timesteps": torch.tensor([500.0], dtype=torch.float32), + "encoder_hidden_states": torch.randn(1, 3, 8), + "t5xxl_ids": None, + "t5xxl_weights": None, + } + + with self.assertRaisesRegex(ValueError, "pixel resolutions divisible by 16"): + model.model_predict(prepared_batch) + + model.model.assert_not_called() + def test_expand_sigmas_matches_anima_latent_rank(self): from simpletuner.helpers.models.anima.model import Anima From 12acb334660f14b9cb7d88f85bee8c4641be8ee9 Mon Sep 17 00:00:00 2001 From: bghira Date: Thu, 4 Jun 2026 17:41:58 -0600 Subject: [PATCH 2/2] anima: improve brittle code --- simpletuner/helpers/models/anima/model.py | 8 ++- .../helpers/models/anima/transformer.py | 20 +++++-- tests/test_anima_model.py | 58 ++++++++++++++++++- 3 files changed, 78 insertions(+), 8 deletions(-) diff --git a/simpletuner/helpers/models/anima/model.py b/simpletuner/helpers/models/anima/model.py index 420efc904..3cbf8544d 100644 --- a/simpletuner/helpers/models/anima/model.py +++ b/simpletuner/helpers/models/anima/model.py @@ -20,6 +20,7 @@ load_text_encoder_single_file, load_vae_single_file, resolve_text_encoder_dtype, + resolve_vae_scale_factor, ) from .options import AnimaLoaderOptions from .pipeline import AnimaPipeline @@ -184,10 +185,15 @@ def _latent_sequence_length(self, latent_tensor: torch.Tensor) -> int: p_t, p_h, p_w = self._crepa_self_flow_patch_size() _, _, frames, height, width = latent_tensor.shape if frames % p_t != 0 or height % p_h != 0 or width % p_w != 0: + vae = getattr(self, "vae", None) + vae_scale_factor = resolve_vae_scale_factor(vae=vae) if vae is not None else 8 + pixel_multiples = (p_h * vae_scale_factor, p_w * vae_scale_factor) raise ValueError( "Anima latent shape must be divisible by transformer patch size " f"{(p_t, p_h, p_w)}, got latent shape {tuple(latent_tensor.shape)}. " - "Rebuild the VAE/aspect bucket cache with pixel resolutions divisible by 16." + f"Latent frames/height/width must be multiples of {(p_t, p_h, p_w)}. " + "Rebuild the VAE/aspect bucket cache with " + f"frame counts divisible by {p_t} and source pixel height/width multiples of {pixel_multiples}." ) return max( (frames // p_t) * (height // p_h) * (width // p_w), diff --git a/simpletuner/helpers/models/anima/transformer.py b/simpletuner/helpers/models/anima/transformer.py index bb72b26a2..b97a7011b 100644 --- a/simpletuner/helpers/models/anima/transformer.py +++ b/simpletuner/helpers/models/anima/transformer.py @@ -18,7 +18,18 @@ from diffusers.models.normalization import RMSNorm as DiffusersRMSNorm from diffusers.utils import USE_PEFT_BACKEND, set_weights_and_activate_adapters from huggingface_hub import hf_hub_download -from huggingface_hub.errors import EntryNotFoundError, LocalEntryNotFoundError + +try: + from huggingface_hub.utils import EntryNotFoundError +except ImportError: + from huggingface_hub.errors import EntryNotFoundError +try: + from huggingface_hub.utils import LocalEntryNotFoundError +except ImportError: + try: + from huggingface_hub.errors import LocalEntryNotFoundError + except ImportError: + LocalEntryNotFoundError = EntryNotFoundError from safetensors.torch import load_file from torch import nn @@ -621,10 +632,9 @@ def _load_diffusers_llm_adapter_config( adapter_config = json.load(handle) model_dim = cls._adapter_config_int(adapter_config, "model_dim") - if ( - cls._adapter_config_int(adapter_config, "source_dim") != model_dim - or cls._adapter_config_int(adapter_config, "target_dim") != model_dim - ): + source_dim = int(adapter_config.get("source_dim", model_dim)) + target_dim = int(adapter_config.get("target_dim", model_dim)) + if source_dim != model_dim or target_dim != model_dim: raise ValueError("Anima text adapter source_dim, target_dim, and model_dim must match.") return adapter_config diff --git a/tests/test_anima_model.py b/tests/test_anima_model.py index 2def8d89d..f310ace0c 100644 --- a/tests/test_anima_model.py +++ b/tests/test_anima_model.py @@ -414,6 +414,55 @@ def test_diffusers_transformer_loads_release_text_conditioner(self): for name, parameter in loaded.llm_adapter.state_dict().items(): torch.testing.assert_close(parameter, source.llm_adapter.state_dict()[name]) + def test_diffusers_adapter_config_defaults_source_and_target_dim(self): + from tempfile import TemporaryDirectory + + from simpletuner.helpers.models.anima.transformer import AnimaTransformerModel + + with TemporaryDirectory() as tmpdir: + config_path = Path(tmpdir) / "config.json" + with open(config_path, "w", encoding="utf-8") as handle: + json.dump( + { + "model_dim": 8, + "num_layers": 1, + "num_attention_heads": 2, + "target_vocab_size": 32128, + }, + handle, + ) + + with patch.object( + AnimaTransformerModel, + "_resolve_diffusers_llm_adapter_config_path", + return_value=str(config_path), + ): + adapter_config = AnimaTransformerModel._load_diffusers_llm_adapter_config("repo") + + with open(config_path, "w", encoding="utf-8") as handle: + json.dump( + { + "source_dim": 16, + "target_dim": 8, + "model_dim": 8, + "num_layers": 1, + "num_attention_heads": 2, + "target_vocab_size": 32128, + }, + handle, + ) + with ( + patch.object( + AnimaTransformerModel, + "_resolve_diffusers_llm_adapter_config_path", + return_value=str(config_path), + ), + self.assertRaisesRegex(ValueError, "source_dim, target_dim, and model_dim must match"), + ): + AnimaTransformerModel._load_diffusers_llm_adapter_config("repo") + + self.assertEqual(adapter_config["model_dim"], 8) + def test_pipeline_import(self): from simpletuner.helpers.models.anima.pipeline import AnimaPipeline @@ -627,7 +676,8 @@ def test_model_predict_rejects_latents_not_divisible_by_patch_size(self): model = Anima.__new__(Anima) model.accelerator = SimpleNamespace(device=torch.device("cpu")) model.config = SimpleNamespace(weight_dtype=torch.float32) - model.model = MagicMock(config=SimpleNamespace(patch_size=(1, 2, 2))) + patch_size = (1, 3, 4) + model.model = MagicMock(config=SimpleNamespace(patch_size=patch_size)) model.unwrap_model = lambda model=None, wrapped=None: model if model is not None else wrapped prepared_batch = { @@ -638,9 +688,13 @@ def test_model_predict_rejects_latents_not_divisible_by_patch_size(self): "t5xxl_weights": None, } - with self.assertRaisesRegex(ValueError, "pixel resolutions divisible by 16"): + with self.assertRaises(ValueError) as cm: model.model_predict(prepared_batch) + message = str(cm.exception) + self.assertIn("divisible by transformer patch size", message) + self.assertIn(f"Latent frames/height/width must be multiples of {patch_size}", message) + self.assertIn(f"source pixel height/width multiples of {(patch_size[1] * 8, patch_size[2] * 8)}", message) model.model.assert_not_called() def test_expand_sigmas_matches_anima_latent_rank(self):