Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 16 additions & 1 deletion simpletuner/helpers/models/anima/model.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
load_text_encoder_single_file,
load_vae_single_file,
resolve_text_encoder_dtype,
resolve_vae_scale_factor,
)
from .options import AnimaLoaderOptions
from .pipeline import AnimaPipeline
Expand Down Expand Up @@ -68,6 +69,8 @@ class Anima(ImageModelFoundation):

DEFAULT_MODEL_FLAVOUR = "preview-3"
HUGGINGFACE_PATHS = {
"release": "circlestone-labs/Anima-Base-v1.0-Diffusers",
"base-v1.0": "circlestone-labs/Anima-Base-v1.0-Diffusers",
"preview-3": "CalamitousFelicitousness/Anima-Preview-3-sdnext-diffusers",
"preview-2": "CalamitousFelicitousness/Anima-Preview-2-sdnext-diffusers",
"preview": "CalamitousFelicitousness/Anima-sdnext-diffusers",
Expand Down Expand Up @@ -180,8 +183,20 @@ def _expand_crepa_self_flow_patch_values(

def _latent_sequence_length(self, latent_tensor: torch.Tensor) -> int:
p_t, p_h, p_w = self._crepa_self_flow_patch_size()
_, _, frames, height, width = latent_tensor.shape
if frames % p_t != 0 or height % p_h != 0 or width % p_w != 0:
vae = getattr(self, "vae", None)
vae_scale_factor = resolve_vae_scale_factor(vae=vae) if vae is not None else 8
pixel_multiples = (p_h * vae_scale_factor, p_w * vae_scale_factor)
raise ValueError(
"Anima latent shape must be divisible by transformer patch size "
f"{(p_t, p_h, p_w)}, got latent shape {tuple(latent_tensor.shape)}. "
f"Latent frames/height/width must be multiples of {(p_t, p_h, p_w)}. "
"Rebuild the VAE/aspect bucket cache with "
f"frame counts divisible by {p_t} and source pixel height/width multiples of {pixel_multiples}."
)
Comment thread
bghira marked this conversation as resolved.
return max(
(latent_tensor.shape[2] // p_t) * (latent_tensor.shape[3] // p_h) * (latent_tensor.shape[4] // p_w),
(frames // p_t) * (height // p_h) * (width // p_w),
1,
)

Expand Down
111 changes: 89 additions & 22 deletions simpletuner/helpers/models/anima/transformer.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,18 @@
from diffusers.models.normalization import RMSNorm as DiffusersRMSNorm
from diffusers.utils import USE_PEFT_BACKEND, set_weights_and_activate_adapters
from huggingface_hub import hf_hub_download

try:
from huggingface_hub.utils import EntryNotFoundError
except ImportError:
from huggingface_hub.errors import EntryNotFoundError
try:
from huggingface_hub.utils import LocalEntryNotFoundError
except ImportError:
try:
from huggingface_hub.errors import LocalEntryNotFoundError
except ImportError:
LocalEntryNotFoundError = EntryNotFoundError
from safetensors.torch import load_file
from torch import nn

Expand All @@ -26,6 +38,16 @@
DEFAULT_ANIMA_TRANSFORMER_FILENAME = "anima-preview.safetensors"
# Repo-relative paths of the text adapter weights and config when the adapter
# lives in an "llm_adapter" directory.
DIFFUSERS_LLM_ADAPTER_FILENAME = "llm_adapter/diffusion_pytorch_model.safetensors"
DIFFUSERS_LLM_ADAPTER_CONFIG_FILENAME = "llm_adapter/config.json"
# The same two files when the adapter lives in a "text_conditioner" directory.
DIFFUSERS_TEXT_CONDITIONER_FILENAME = "text_conditioner/diffusion_pytorch_model.safetensors"
DIFFUSERS_TEXT_CONDITIONER_CONFIG_FILENAME = "text_conditioner/config.json"
# Candidate weight paths; the adapter-file resolver iterates them in tuple
# order and returns the first one it finds.
DIFFUSERS_ADAPTER_WEIGHT_FILENAMES = (
DIFFUSERS_LLM_ADAPTER_FILENAME,
DIFFUSERS_TEXT_CONDITIONER_FILENAME,
)
# Candidate config paths, listed in the same directory order as the weights.
DIFFUSERS_ADAPTER_CONFIG_FILENAMES = (
DIFFUSERS_LLM_ADAPTER_CONFIG_FILENAME,
DIFFUSERS_TEXT_CONDITIONER_CONFIG_FILENAME,
)


def _rotate_half(x: torch.Tensor) -> torch.Tensor:
Expand Down Expand Up @@ -496,9 +518,11 @@ def _diffusers_repo_root(pretrained_model_name_or_path: str, subfolder: Optional
return pretrained_model_name_or_path

@staticmethod
def _resolve_diffusers_llm_adapter_path(
def _resolve_diffusers_adapter_file(
pretrained_model_name_or_path: str,
*,
filenames: tuple[str, ...],
component_name: str,
subfolder: Optional[str] = None,
revision: Optional[str] = None,
cache_dir: Optional[str] = None,
Expand All @@ -508,16 +532,55 @@ def _resolve_diffusers_llm_adapter_path(
) -> str:
if os.path.isdir(pretrained_model_name_or_path):
repo_root = AnimaTransformerModel._diffusers_repo_root(pretrained_model_name_or_path, subfolder=subfolder)
return os.path.join(repo_root, DIFFUSERS_LLM_ADAPTER_FILENAME)
for filename in filenames:
candidate = os.path.join(repo_root, filename)
if os.path.isfile(candidate):
return candidate
raise FileNotFoundError(
f"Anima Diffusers directory {repo_root!r} is missing {component_name}; "
f"expected one of: {', '.join(filenames)}"
)
normalized_token = None if token is False else token
return hf_hub_download(
last_error: Exception | None = None
for filename in filenames:
try:
return hf_hub_download(
pretrained_model_name_or_path,
filename=filename,
revision=revision,
cache_dir=cache_dir,
force_download=force_download,
local_files_only=local_files_only,
token=normalized_token,
)
except (EntryNotFoundError, LocalEntryNotFoundError) as exc:
last_error = exc
raise FileNotFoundError(
f"Anima Diffusers repository {pretrained_model_name_or_path!r} is missing {component_name}; "
f"expected one of: {', '.join(filenames)}"
) from last_error

@staticmethod
def _resolve_diffusers_llm_adapter_path(
    pretrained_model_name_or_path: str,
    *,
    subfolder: Optional[str] = None,
    revision: Optional[str] = None,
    cache_dir: Optional[str] = None,
    force_download: bool = False,
    local_files_only: bool = False,
    token: str | bool | None = None,
) -> str:
    """Resolve the path of the text adapter weights file.

    Delegates to ``_resolve_diffusers_adapter_file`` with every entry of
    ``DIFFUSERS_ADAPTER_WEIGHT_FILENAMES`` as a candidate.

    Args:
        pretrained_model_name_or_path: Value forwarded unchanged to the
            resolver as its first positional argument.
        subfolder: Forwarded unchanged to the resolver.
        revision: Forwarded unchanged to the resolver.
        cache_dir: Forwarded unchanged to the resolver.
        force_download: Forwarded unchanged to the resolver.
        local_files_only: Forwarded unchanged to the resolver.
        token: Forwarded unchanged; the resolver performs any normalisation.

    Returns:
        The path string returned by the resolver.
    """
    return AnimaTransformerModel._resolve_diffusers_adapter_file(
        pretrained_model_name_or_path,
        filenames=DIFFUSERS_ADAPTER_WEIGHT_FILENAMES,
        component_name="text adapter weights",
        subfolder=subfolder,
        revision=revision,
        cache_dir=cache_dir,
        force_download=force_download,
        local_files_only=local_files_only,
        token=token,
    )

@staticmethod
Expand All @@ -531,20 +594,25 @@ def _resolve_diffusers_llm_adapter_config_path(
local_files_only: bool = False,
token: str | bool | None = None,
) -> str:
if os.path.isdir(pretrained_model_name_or_path):
repo_root = AnimaTransformerModel._diffusers_repo_root(pretrained_model_name_or_path, subfolder=subfolder)
return os.path.join(repo_root, DIFFUSERS_LLM_ADAPTER_CONFIG_FILENAME)
normalized_token = None if token is False else token
return hf_hub_download(
return AnimaTransformerModel._resolve_diffusers_adapter_file(
pretrained_model_name_or_path,
filename=DIFFUSERS_LLM_ADAPTER_CONFIG_FILENAME,
filenames=DIFFUSERS_ADAPTER_CONFIG_FILENAMES,
component_name="text adapter config",
subfolder=subfolder,
revision=revision,
cache_dir=cache_dir,
force_download=force_download,
local_files_only=local_files_only,
token=normalized_token,
token=token,
)

@staticmethod
def _adapter_config_int(adapter_config: dict[str, Any], *keys: str) -> int:
for key in keys:
if key in adapter_config:
return int(adapter_config[key])
raise KeyError(f"Anima text adapter config is missing one of: {', '.join(keys)}")

@classmethod
def _load_diffusers_llm_adapter_config(
cls,
Expand All @@ -563,12 +631,11 @@ def _load_diffusers_llm_adapter_config(
with open(config_path, encoding="utf-8") as handle:
adapter_config = json.load(handle)

model_dim = int(adapter_config["model_dim"])
if (
int(adapter_config.get("source_dim", model_dim)) != model_dim
or int(adapter_config.get("target_dim", model_dim)) != model_dim
):
raise ValueError("Anima llm_adapter source_dim, target_dim, and model_dim must match.")
model_dim = cls._adapter_config_int(adapter_config, "model_dim")
source_dim = int(adapter_config.get("source_dim", model_dim))
target_dim = int(adapter_config.get("target_dim", model_dim))
if source_dim != model_dim or target_dim != model_dim:
raise ValueError("Anima text adapter source_dim, target_dim, and model_dim must match.")
Comment thread
bghira marked this conversation as resolved.
return adapter_config

@classmethod
Expand All @@ -592,10 +659,10 @@ def _from_diffusers_components(
max_size=tuple(config.max_size),
patch_size=tuple(config.patch_size),
rope_scale=tuple(config.rope_scale),
adapter_vocab_size=int(adapter_config["vocab_size"]),
adapter_dim=int(adapter_config["model_dim"]),
adapter_layers=int(adapter_config["num_layers"]),
adapter_heads=int(adapter_config["num_heads"]),
adapter_vocab_size=cls._adapter_config_int(adapter_config, "vocab_size", "target_vocab_size"),
adapter_dim=cls._adapter_config_int(adapter_config, "model_dim"),
adapter_layers=cls._adapter_config_int(adapter_config, "num_layers"),
adapter_heads=cls._adapter_config_int(adapter_config, "num_heads", "num_attention_heads"),
)
_patch_diffusers_rmsnorm_to_anima(core)
transformer.core = core
Expand Down
158 changes: 158 additions & 0 deletions tests/test_anima_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,14 @@ def test_model_import(self):
def test_model_flavours_use_converted_diffusers_repos(self):
from simpletuner.helpers.models.anima.model import Anima

self.assertEqual(
Anima.HUGGINGFACE_PATHS["release"],
"circlestone-labs/Anima-Base-v1.0-Diffusers",
)
self.assertEqual(
Anima.HUGGINGFACE_PATHS["base-v1.0"],
"circlestone-labs/Anima-Base-v1.0-Diffusers",
)
self.assertEqual(
Anima.HUGGINGFACE_PATHS["preview-3"],
"CalamitousFelicitousness/Anima-Preview-3-sdnext-diffusers",
Expand Down Expand Up @@ -58,6 +66,24 @@ def test_diffusers_layout_switches_component_sources(self):
self.assertEqual(model.MODEL_SUBFOLDER, "transformer")
mock_load_model.assert_called_once_with(move_to_device=False)

def test_release_diffusers_layout_switches_component_sources(self):
    """The "release" flavour uses the Diffusers repo layout for both prompt tokenizers."""
    from simpletuner.helpers.models.anima.model import Anima

    release_config = SimpleNamespace(
        pretrained_model_name_or_path="circlestone-labs/Anima-Base-v1.0-Diffusers",
        model_flavour="release",
    )
    # Bypass __init__ so no weights are loaded; only the config is consulted here.
    anima = Anima.__new__(Anima)
    anima.config = release_config

    self.assertTrue(anima._uses_diffusers_repo_layout())

    expected_sources = (
        "circlestone-labs/Anima-Base-v1.0-Diffusers::tokenizer",
        "circlestone-labs/Anima-Base-v1.0-Diffusers::t5_tokenizer",
    )
    self.assertEqual(anima._prompt_tokenizer_sources(), expected_sources)

def test_diffusers_layout_loads_text_encoder_and_vae_from_standard_subfolders(self):
from simpletuner.helpers.models.anima.model import Anima

Expand Down Expand Up @@ -332,6 +358,111 @@ def test_diffusers_transformer_loads_sibling_llm_adapter(self):
for name, parameter in loaded.llm_adapter.state_dict().items():
torch.testing.assert_close(parameter, source.llm_adapter.state_dict()[name])

def test_diffusers_transformer_loads_release_text_conditioner(self):
    """A repo that stores the adapter under ``text_conditioner/`` loads with identical weights."""
    from tempfile import TemporaryDirectory

    from safetensors.torch import save_file

    from simpletuner.helpers.models.anima.transformer import AnimaTransformerModel

    reference = AnimaTransformerModel(
        in_channels=2,
        out_channels=2,
        num_attention_heads=2,
        attention_head_dim=4,
        num_layers=1,
        mlp_ratio=2.0,
        text_embed_dim=8,
        adaln_lora_dim=8,
        max_size=(2, 4, 4),
        patch_size=(1, 2, 2),
        adapter_dim=8,
        adapter_layers=1,
        adapter_heads=2,
    )
    # This config uses the alias keys ("num_attention_heads", "target_vocab_size").
    conditioner_config = {
        "source_dim": 8,
        "target_dim": 8,
        "model_dim": 8,
        "num_layers": 1,
        "num_attention_heads": 2,
        "target_vocab_size": 32128,
    }

    with TemporaryDirectory() as workdir:
        repo_root = Path(workdir)
        conditioner_dir = repo_root / "text_conditioner"
        conditioner_dir.mkdir()
        reference.core.save_pretrained(str(repo_root / "transformer"), safe_serialization=True)
        (conditioner_dir / "config.json").write_text(json.dumps(conditioner_config), encoding="utf-8")
        save_file(
            reference.llm_adapter.state_dict(),
            str(conditioner_dir / "diffusion_pytorch_model.safetensors"),
        )

        restored = AnimaTransformerModel.from_pretrained(
            str(repo_root),
            subfolder="transformer",
            local_files_only=True,
            token=False,
        )

        # Both the transformer core and the adapter must round-trip unchanged.
        for module_name in ("core", "llm_adapter"):
            expected_state = getattr(reference, module_name).state_dict()
            for name, parameter in getattr(restored, module_name).state_dict().items():
                torch.testing.assert_close(parameter, expected_state[name])

def test_diffusers_adapter_config_defaults_source_and_target_dim(self):
    """Omitted source/target dims are accepted; a dim that differs from model_dim is rejected."""
    from tempfile import TemporaryDirectory

    from simpletuner.helpers.models.anima.transformer import AnimaTransformerModel

    minimal_config = {
        "model_dim": 8,
        "num_layers": 1,
        "num_attention_heads": 2,
        "target_vocab_size": 32128,
    }
    # Same config, but source_dim disagrees with model_dim.
    mismatched_config = {"source_dim": 16, "target_dim": 8, **minimal_config}

    def load_from(config_file):
        # Point the path resolver at the temp file so nothing is fetched from a repo.
        with patch.object(
            AnimaTransformerModel,
            "_resolve_diffusers_llm_adapter_config_path",
            return_value=str(config_file),
        ):
            return AnimaTransformerModel._load_diffusers_llm_adapter_config("repo")

    with TemporaryDirectory() as workdir:
        config_file = Path(workdir) / "config.json"

        config_file.write_text(json.dumps(minimal_config), encoding="utf-8")
        accepted_config = load_from(config_file)

        config_file.write_text(json.dumps(mismatched_config), encoding="utf-8")
        with self.assertRaisesRegex(ValueError, "source_dim, target_dim, and model_dim must match"):
            load_from(config_file)

    self.assertEqual(accepted_config["model_dim"], 8)

def test_pipeline_import(self):
from simpletuner.helpers.models.anima.pipeline import AnimaPipeline

Expand Down Expand Up @@ -539,6 +670,33 @@ def test_model_predict_preserves_frame_axis_to_match_flow_target(self):
self.assertEqual(result["model_prediction"].shape, target.shape)
self.assertEqual((result["model_prediction"] - target).shape, target.shape)

def test_model_predict_rejects_latents_not_divisible_by_patch_size(self):
    """model_predict raises a descriptive ValueError before the transformer is called."""
    from simpletuner.helpers.models.anima.model import Anima

    patch_size = (1, 3, 4)
    anima = Anima.__new__(Anima)
    anima.accelerator = SimpleNamespace(device=torch.device("cpu"))
    anima.config = SimpleNamespace(weight_dtype=torch.float32)
    anima.model = MagicMock(config=SimpleNamespace(patch_size=patch_size))
    anima.unwrap_model = lambda model=None, wrapped=None: model if model is not None else wrapped

    # Height 147 is a multiple of 3, but width 110 is not a multiple of 4.
    batch = {
        "noisy_latents": torch.randn(1, 16, 1, 147, 110),
        "timesteps": torch.tensor([500.0], dtype=torch.float32),
        "encoder_hidden_states": torch.randn(1, 3, 8),
        "t5xxl_ids": None,
        "t5xxl_weights": None,
    }

    with self.assertRaises(ValueError) as raised:
        anima.model_predict(batch)

    error_text = str(raised.exception)
    expected_fragments = (
        "divisible by transformer patch size",
        f"Latent frames/height/width must be multiples of {patch_size}",
        f"source pixel height/width multiples of {(patch_size[1] * 8, patch_size[2] * 8)}",
    )
    for fragment in expected_fragments:
        self.assertIn(fragment, error_text)
    anima.model.assert_not_called()

def test_expand_sigmas_matches_anima_latent_rank(self):
from simpletuner.helpers.models.anima.model import Anima

Expand Down
Loading