From 88ccfaa0183a837d63f88f410629822a9d0b4648 Mon Sep 17 00:00:00 2001 From: Bane-Elvin Date: Thu, 21 May 2026 16:19:26 +0000 Subject: [PATCH 1/9] Add FRoD tuner --- docs/source/_toctree.yml | 2 + docs/source/package_reference/frod.md | 38 ++++ src/peft/__init__.py | 4 + src/peft/tuners/__init__.py | 3 + src/peft/tuners/frod/__init__.py | 10 + src/peft/tuners/frod/config.py | 146 +++++++++++++ src/peft/tuners/frod/layer.py | 303 ++++++++++++++++++++++++++ src/peft/tuners/frod/model.py | 276 +++++++++++++++++++++++ src/peft/utils/__init__.py | 2 + src/peft/utils/constants.py | 2 + src/peft/utils/other.py | 2 + src/peft/utils/peft_types.py | 2 + src/peft/utils/save_and_load.py | 41 ++++ tests/test_custom_models.py | 16 +- 14 files changed, 845 insertions(+), 2 deletions(-) create mode 100644 docs/source/package_reference/frod.md create mode 100644 src/peft/tuners/frod/__init__.py create mode 100644 src/peft/tuners/frod/config.py create mode 100644 src/peft/tuners/frod/layer.py create mode 100644 src/peft/tuners/frod/model.py diff --git a/docs/source/_toctree.yml b/docs/source/_toctree.yml index a9db3fa19a..de928e9d6e 100644 --- a/docs/source/_toctree.yml +++ b/docs/source/_toctree.yml @@ -124,6 +124,8 @@ title: PVeRA - local: package_reference/fourierft title: FourierFT + - local: package_reference/frod + title: FRoD - local: package_reference/gralora title: GraLoRA - local: package_reference/vblora diff --git a/docs/source/package_reference/frod.md b/docs/source/package_reference/frod.md new file mode 100644 index 0000000000..7fceb6fa41 --- /dev/null +++ b/docs/source/package_reference/frod.md @@ -0,0 +1,38 @@ + + +# FRoD: Full-Rank Efficient Fine-Tuning with Rotational Degrees + +FRoD is a parameter-efficient fine-tuning method that combines a shared full-rank basis with sparse learnable +rotational degrees. The adapter update is expressed through fixed projection tensors and trainable coefficients, which +allows FRoD to apply full-rank updates while keeping the number of trained parameters small. + +When saving the adapter parameters, it is possible to avoid storing the projection tensors by setting +`save_projection=False` on the `FRODConfig`. In that case, the projections are restored from the base model weights and +the fixed random seed from `projection_prng_key`. This reduces checkpoint size, but the default is +`save_projection=True` to make checkpoint loading independent of regeneration details. + +FRoD currently has the following constraint: + +- Only `nn.Linear` and `transformers.pytorch_utils.Conv1D` layers are supported. + +## FRODConfig + +[[autodoc]] tuners.frod.config.FRODConfig + +## FRODModel + +[[autodoc]] tuners.frod.model.FRODModel diff --git a/src/peft/__init__.py b/src/peft/__init__.py index ec12d52583..df185d9d76 100644 --- a/src/peft/__init__.py +++ b/src/peft/__init__.py @@ -70,6 +70,8 @@ EvaConfig, FourierFTConfig, FourierFTModel, + FRODConfig, + FRODModel, GraloraConfig, GraloraModel, HiraConfig, @@ -198,6 +200,8 @@ "DeloraConfig", "DeloraModel", "EvaConfig", + "FRODConfig", + "FRODModel", "FourierFTConfig", "FourierFTModel", "GraloraConfig", diff --git a/src/peft/tuners/__init__.py b/src/peft/tuners/__init__.py index 4900a71aa8..9ff82ea725 100644 --- a/src/peft/tuners/__init__.py +++ b/src/peft/tuners/__init__.py @@ -22,6 +22,7 @@ from .cpt import CPTConfig, CPTEmbedding from .delora import DeloraConfig, DeloraModel from .fourierft import FourierFTConfig, FourierFTModel +from .frod import FRODConfig, FRODModel from .gralora import GraloraConfig, GraloraModel from .hira import HiraConfig, HiraModel from .hra import HRAConfig, HRAModel @@ -93,6 +94,8 @@ "DeloraConfig", "DeloraModel", "EvaConfig", + "FRODConfig", + "FRODModel", "FourierFTConfig", "FourierFTModel", "GraloraConfig", diff --git a/src/peft/tuners/frod/__init__.py b/src/peft/tuners/frod/__init__.py new file mode 100644 index 0000000000..3be7152c07 --- /dev/null +++ b/src/peft/tuners/frod/__init__.py @@ -0,0 +1,10 @@ +from peft.utils import register_peft_method + +from .config import FRODConfig +from .layer import FRODLayer, Linear +from .model import FRODModel + + +__all__ = ["FRODConfig", "FRODLayer", "FRODModel", "Linear"] + +register_peft_method(name="frod", config_cls=FRODConfig, model_cls=FRODModel, prefix="frod_") diff --git a/src/peft/tuners/frod/config.py b/src/peft/tuners/frod/config.py new file mode 100644 index 0000000000..354c1279b4 --- /dev/null +++ b/src/peft/tuners/frod/config.py @@ -0,0 +1,146 @@ +# Copyright 2023-present the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from __future__ import annotations + +from dataclasses import dataclass, field +from typing import Optional, Union + +from peft.config import PeftConfig +from peft.utils import PeftType + + +@dataclass +class FRODConfig(PeftConfig): + """ + This is the configuration class to store the configuration of a [`FRODModel`]. + + Paper: https://doi.org/10.1609/aaai.v40i31.39813. + + Args: + target_modules (`Union[List[str], str]`): + The names of the modules to apply FRoD to. Only linear layers are supported. + projection_prng_key (`int`): + Random seed used when initializing the sparse FRoD COO pattern. + save_projection (`bool`): + Whether to save the FRoD projection tensors in the state dict. This increases checkpoint size but makes + adapter reloading independent of local cache regeneration. Defaults to `True`. + frod_dropout (`float`): + The dropout probability for FRoD layers. + fan_in_fan_out (`bool`): + Set this to True if the layer to replace stores weight like (fan_in, fan_out). For example, gpt-2 uses + `Conv1D` which stores weights like (fan_in, fan_out) and hence this should be set to `True`. + bias (`str`): + Bias type for FRoD. Can be 'none', 'all' or 'frod_only'. If 'all' or 'frod_only', the corresponding biases + will be updated during training. Be aware that this means that, even when disabling the adapters, the model + will not produce the same output as the base model would have without adaptation. + modules_to_save (`List[str]`): + List of modules apart from FRoD layers to be set as trainable and saved in the final checkpoint. + init_weights (`bool`): + Whether to initialize the weights of the FRoD layers with their default initialization. Don't change this + setting, except if you know exactly what you're doing. + layers_to_transform (`Union[List[int],int]`): + The layer indexes to transform, if this argument is specified, it will apply the FRoD transformations on + the layer indexes that are specified in this list. If a single integer is passed, it will apply the FRoD + transformations on the layer at this index. + layers_pattern (`Optional[Union[List[str], str]]`): + The layer pattern name, used only if `layers_to_transform` is different from `None`. This should target the + `nn.ModuleList` of the model, which is often called `'layers'` or `'h'`. + """ + + target_modules: Optional[Union[list[str], str]] = field( + default=None, + metadata={ + "help": ( + "List of module names or regex expression of the module names to replace with FRoD." + "For example, ['q', 'v'] or '.*decoder.*(SelfAttention|EncDecAttention).*(q|v)$'. " + "Only linear layers are supported." + ) + }, + ) + projection_prng_key: int = field( + default=0, + metadata={"help": "Random seed used when initializing the FRoD sparse COO structure."}, + ) + save_projection: bool = field( + default=True, + metadata={ + "help": ( + "Whether to save the FRoD projection tensors in the state dict. This increases checkpoint size but " + "guarantees that we can reload the adapter on all system configurations." + ) + }, + ) + frod_dropout: float = field(default=0.0, metadata={"help": "Dropout in the FRoD adapter layers"}) + fan_in_fan_out: bool = field( + default=False, + metadata={"help": "Set this to True if the layer to replace stores weight like (fan_in, fan_out)"}, + ) + bias: str = field(default="none", metadata={"help": "Bias type for FRoD. Can be 'none', 'all' or 'frod_only'"}) + modules_to_save: Optional[list[str]] = field( + default=None, + metadata={ + "help": ( + "List of modules apart from FRoD layers to be set as trainable and saved in the final checkpoint. For" + " example, in Sequence Classification or Token Classification tasks, the final layer" + " `classifier/score` are randomly initialized and as such need to be trainable and saved." + ) + }, + ) + init_weights: bool = field( + default=True, + metadata={ + "help": ( + "Whether to initialize the weights of the FRoD layers with their default initialization. Don't change " + "this setting, except if you know exactly what you're doing." + ), + }, + ) + layers_to_transform: Optional[Union[list[int], int]] = field( + default=None, + metadata={ + "help": ( + "The layer indexes to transform, is this argument is specified, PEFT will transform only the layers" + " indexes that are specified inside this list. If a single integer is passed, PEFT will transform only" + " the layer at this index." + ) + }, + ) + layers_pattern: Optional[Union[list[str], str]] = field( + default=None, + metadata={ + "help": ( + "The layer pattern name, used only if `layers_to_transform` is different to None and if the layer " + "pattern is not in the common layers pattern. This should target the `nn.ModuleList` of the " + "model, which is often called `'layers'` or `'h'`." + ) + }, + ) + sparse_rate: float = field(default=0.01, metadata={"help": "Sparse rate"}) + regularization_alpha: float = field( + default=1e-3, + metadata={ + "help": ("Regularization parameter used when building the shared FRoD basis."), + }, + ) + + def __post_init__(self): + self.peft_type = PeftType.FROD + self.target_modules = ( + set(self.target_modules) if isinstance(self.target_modules, list) else self.target_modules + ) + # check for layers_to_transform and layers_pattern + if self.layers_pattern and not self.layers_to_transform: + raise ValueError("When `layers_pattern` is specified, `layers_to_transform` must also be specified. ") + if self.sparse_rate < 0 or self.sparse_rate > 1: + raise ValueError(f"`sparse_rate` should be between 0 and 1, got {self.sparse_rate}.") diff --git a/src/peft/tuners/frod/layer.py b/src/peft/tuners/frod/layer.py new file mode 100644 index 0000000000..edaf2ea2ba --- /dev/null +++ b/src/peft/tuners/frod/layer.py @@ -0,0 +1,303 @@ +# Copyright 2023-present the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import warnings +from typing import Optional + +import numpy as np +import torch +import torch.nn.functional as F +from numpy.linalg import inv +from torch import nn +from transformers.pytorch_utils import Conv1D + +from peft.tuners.tuners_utils import BaseTunerLayer, check_adapters_to_merge +from peft.utils.other import transpose + +from .._buffer_dict import BufferDict + + +class FRODLayer(BaseTunerLayer): + adapter_layer_names = ("frod_lambda_s_values", "frod_lambda_l") + other_param_names = ("frod_V", "frod_U", "frod_s_indices", "frod_s_size") + + def __init__(self, base_layer: nn.Module, **kwargs): + self.base_layer = base_layer + self.r = {} + self.frod_dropout = nn.ModuleDict({}) + + # Sparse S is parameterized by its COO values only. + self.frod_lambda_s_values = nn.ParameterDict({}) + self.frod_lambda_l = nn.ParameterDict({}) + + self.frod_s_indices: Optional[BufferDict] = None + self.frod_s_size: Optional[BufferDict] = None + self.frod_V: Optional[BufferDict] = None + self.frod_U: BufferDict = BufferDict({}, persistent=False) + + self._disable_adapters = False + self.merged_adapters = [] + + base_layer = self.get_base_layer() + if isinstance(base_layer, nn.Linear): + in_features, out_features = base_layer.in_features, base_layer.out_features + elif isinstance(base_layer, Conv1D): + in_features, out_features = ( + base_layer.weight.ds_shape if hasattr(base_layer.weight, "ds_shape") else base_layer.weight.shape + ) + + self.in_features = in_features + self.out_features = out_features + self.kwargs = kwargs + + @property + def merged(self) -> bool: + return bool(self.merged_adapters) + + def update_layer( + self, + adapter_name, + frod_V: BufferDict, + frod_s_indices: BufferDict, + frod_s_size: BufferDict, + frod_dropout, + init_weights, + ): + weight = self.get_base_layer().weight + device = weight.device + dtype = weight.dtype + + param_dtype = dtype + if device.type == "cuda" and dtype == torch.float32: + param_dtype = torch.float16 + + self.r[adapter_name] = self.out_features + if frod_dropout > 0.0: + frod_dropout_layer = nn.Dropout(p=frod_dropout) + else: + frod_dropout_layer = nn.Identity() + + self.frod_dropout.update(nn.ModuleDict({adapter_name: frod_dropout_layer})) + + if adapter_name not in frod_V: + if not frod_V: + raise ValueError("The FRoD projection buffers are empty. This should not happen.") + frod_V[adapter_name] = next(iter(frod_V.values())) + frod_s_indices[adapter_name] = next(iter(frod_s_indices.values())) + frod_s_size[adapter_name] = next(iter(frod_s_size.values())) + + nnz = frod_s_indices[adapter_name].shape[1] + self.frod_lambda_s_values[adapter_name] = nn.Parameter(torch.zeros(nnz, device=device, dtype=param_dtype)) + + self.__dict__["frod_V"] = frod_V + self.__dict__["frod_s_indices"] = frod_s_indices + self.__dict__["frod_s_size"] = frod_s_size + + # Keep cached projections on CPU and move them lazily in forward. + self.frod_V[adapter_name] = self.frod_V[adapter_name].to(dtype=param_dtype, device="cpu") + self.frod_s_indices[adapter_name] = self.frod_s_indices[adapter_name].to(device="cpu", dtype=torch.long) + self.frod_s_size[adapter_name] = self.frod_s_size[adapter_name].to(device="cpu", dtype=torch.long) + + U, L = self._calculate_frod_u_and_lambda(self.frod_V[adapter_name], weight) + U = U.to(param_dtype) + L = L.to(device=device, dtype=param_dtype) + self.frod_lambda_l[adapter_name] = nn.Parameter(L, requires_grad=True) + if init_weights: + self.reset_frod_parameters(adapter_name) + + self.frod_U[adapter_name] = U.cpu() + self._move_adapter_to_device_of_base_layer(adapter_name) + self.set_adapter(self.active_adapters) + + def _calculate_frod_u_and_lambda(self, V, W): + w = W.detach().to(torch.float32).cpu().numpy() + v = V.detach().to(torch.float32).cpu().numpy() + try: + v_inv_T = inv(v).T + except np.linalg.LinAlgError: + v_inv_T = np.linalg.pinv(v, rcond=1e-6).T + Bi = w @ v_inv_T + lambda_l = np.linalg.norm(Bi, axis=0) + u = np.divide(Bi, lambda_l, out=np.zeros_like(Bi), where=lambda_l > 1e-8) + U = torch.from_numpy(u).float() + L = torch.from_numpy(lambda_l).float() + return U, L + + def reset_frod_parameters(self, adapter_name): + if adapter_name in self.frod_lambda_s_values: + with torch.no_grad(): + nn.init.zeros_(self.frod_lambda_s_values[adapter_name]) + if adapter_name in self.frod_lambda_l: + with torch.no_grad(): + nn.init.zeros_(self.frod_lambda_l[adapter_name]) + + +class Linear(nn.Linear, FRODLayer): + def __init__( + self, + base_layer, + frod_V: BufferDict, + frod_s_indices: BufferDict, + frod_s_size: BufferDict, + adapter_name: str, + frod_dropout: float = 0.0, + fan_in_fan_out: bool = False, + is_target_conv_1d_layer: bool = False, + init_weights: bool = True, + **kwargs, + ) -> None: + super(nn.Linear, self).__init__() + FRODLayer.__init__(self, base_layer, **kwargs) + self.fan_in_fan_out = fan_in_fan_out + + self._active_adapter = adapter_name + self.update_layer(adapter_name, frod_V, frod_s_indices, frod_s_size, frod_dropout, init_weights) + self.is_target_conv_1d_layer = is_target_conv_1d_layer + + def merge(self, safe_merge: bool = False, adapter_names: Optional[list[str]] = None) -> None: + adapter_names = check_adapters_to_merge(self, adapter_names) + if not adapter_names: + return + + for active_adapter in adapter_names: + if active_adapter in self.frod_lambda_l.keys(): + base_layer = self.get_base_layer() + if safe_merge: + orig_weights = base_layer.weight.data.clone() + orig_weights += self.get_delta_weight(active_adapter) + if not torch.isfinite(orig_weights).all(): + raise ValueError( + f"NaNs detected in the merged weights. The adapter {active_adapter} seems to be broken" + ) + base_layer.weight.data = orig_weights + else: + base_layer.weight.data += self.get_delta_weight(active_adapter) + self.merged_adapters.append(active_adapter) + + def unmerge(self) -> None: + if not self.merged: + warnings.warn("Already unmerged. Nothing to do.") + return + + while len(self.merged_adapters) > 0: + active_adapter = self.merged_adapters.pop() + if active_adapter in self.frod_lambda_l.keys(): + self.get_base_layer().weight.data -= self.get_delta_weight(active_adapter) + + def get_delta_weight(self, adapter) -> torch.Tensor: + weight = self.get_base_layer().weight + device = weight.device + dtype = weight.dtype + U = self.frod_U[adapter].to(device=device, dtype=dtype) + V = self.frod_V[adapter].to(device=device, dtype=dtype) + indices = self.frod_s_indices[adapter].to(device=U.device, dtype=torch.long) + size_tensor = self.frod_s_size[adapter] + if isinstance(size_tensor, torch.Tensor): + size = tuple(int(dim) for dim in size_tensor.tolist()) + else: + size = tuple(int(dim) for dim in size_tensor) + values = self.frod_lambda_s_values[adapter].to(U.device, U.dtype).clone() + lambda_l = self.frod_lambda_l[adapter].to(device=U.device, dtype=U.dtype) + + S_sparse = torch.sparse_coo_tensor(indices, values, size).coalesce() + S = S_sparse.to_dense() + L = torch.diag_embed(lambda_l) + + return transpose(U @ (S + L).T @ V.T, self.fan_in_fan_out) + + def forward(self, x: torch.Tensor, *args, **kwargs) -> torch.Tensor: + previous_dtype = x.dtype + + if self.disable_adapters: + if self.merged: + self.unmerge() + result = self.base_layer(x, *args, **kwargs) + elif self.merged: + result = self.base_layer(x, *args, **kwargs) + else: + result = self.base_layer(x, *args, **kwargs) + for active_adapter in self.active_adapters: + if active_adapter not in self.frod_lambda_s_values: + continue + + target_dtype = x.dtype + V = self.frod_V[active_adapter].to(device=x.device, dtype=target_dtype) + U = self.frod_U[active_adapter].to(device=x.device, dtype=target_dtype) + indices = self.frod_s_indices[active_adapter].to(device=x.device, dtype=torch.long) + size_tensor = self.frod_s_size[active_adapter] + if isinstance(size_tensor, torch.Tensor): + size = tuple(int(dim) for dim in size_tensor.tolist()) + else: + size = tuple(int(dim) for dim in size_tensor) + values = self.frod_lambda_s_values[active_adapter].to(device=x.device, dtype=target_dtype) + lambda_l = self.frod_lambda_l[active_adapter].to(device=x.device, dtype=target_dtype) + + x = x.to(target_dtype) + h = self.frod_dropout[active_adapter](x) + + batch_shape = h.shape[:-1] + h_flat = h.reshape(-1, h.shape[-1]) + z_flat = torch.matmul(h_flat, V) + + matmul_dtype = z_flat.dtype + if z_flat.is_cuda and matmul_dtype in (torch.float16, torch.bfloat16): + matmul_dtype = torch.float32 + + values = values.to(device=z_flat.device, dtype=matmul_dtype) + z_flat_mm = z_flat.to(matmul_dtype) + S_sparse = torch.sparse_coo_tensor(indices, values, size).coalesce() + if S_sparse.dtype != matmul_dtype: + S_sparse = S_sparse.to(dtype=matmul_dtype) + z_S_flat = torch.sparse.mm(S_sparse.t(), z_flat_mm.t()).t() + + lambda_l = lambda_l.to(device=z_flat.device, dtype=matmul_dtype) + z_L_flat = z_flat_mm * lambda_l + + U_mm = U.to(device=z_flat.device, dtype=matmul_dtype) + out_add_flat = F.linear(z_S_flat + z_L_flat, U_mm) + out_add_flat = out_add_flat.to(target_dtype) + out_add = out_add_flat.reshape(*batch_shape, out_add_flat.shape[-1]) + + result = result + out_add + + result = result.to(previous_dtype) + return result + + def __repr__(self) -> str: + rep = super().__repr__() + return "frod." + rep + + def _move_adapter_to_device_of_base_layer(self, adapter_name: str, device: Optional[torch.device] = None) -> None: + dtype = None + weight = None + if device is None: + for weight_name in ("weight", "qweight"): + weight = getattr(self.get_base_layer(), weight_name, None) + if weight is not None: + device = weight.device + dtype = weight.dtype + break + else: + return + + for adapter_layer_name in self.adapter_layer_names: + adapter_layer = getattr(self, adapter_layer_name, None) + if not isinstance(adapter_layer, nn.ParameterDict): + continue + if adapter_name not in adapter_layer: + continue + param = adapter_layer[adapter_name] + if param.is_meta: + continue + adapter_layer[adapter_name] = param.to(device, dtype=dtype) diff --git a/src/peft/tuners/frod/model.py b/src/peft/tuners/frod/model.py new file mode 100644 index 0000000000..6b148ce51e --- /dev/null +++ b/src/peft/tuners/frod/model.py @@ -0,0 +1,276 @@ +# Copyright 2026-present the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import annotations + +import warnings +from collections import defaultdict + +import numpy as np +import torch +from numpy.linalg import qr +from torch import nn +from transformers.pytorch_utils import Conv1D + +from peft.tuners.tuners_utils import BaseTuner, BaseTunerLayer +from peft.utils import TRANSFORMERS_MODELS_TO_FROD_TARGET_MODULES_MAPPING + +from .._buffer_dict import BufferDict +from ..tuners_utils import _maybe_include_all_linear_layers +from .config import FRODConfig +from .layer import FRODLayer, Linear + + +def _category_from_key(key: str) -> str: + parts = key.split(".") + if len(parts) == 1: + return parts[0] + if parts[-2].isdigit(): + return parts[-1] + category = f"{parts[-2]}_{parts[-1]}" + if (category == "output_dense") and (len(parts) >= 3) and (parts[-3] == "attention"): + return "attention_output" + return category + + +def _layer_index_from_key(key: str, fallback: int) -> int: + parts = key.split(".") + if "layers" in parts: + try: + return int(parts[parts.index("layers") + 1]) + except (ValueError, IndexError): + pass + for part in parts: + if part.isdigit(): + return int(part) + return fallback + + +def _projection_from_weights(matrices: list[np.ndarray], regularization_alpha: float) -> np.ndarray: + stacked = np.vstack(matrices) + if stacked.shape[0] < stacked.shape[1]: + _, _, vh = np.linalg.svd(stacked, full_matrices=True) + return vh.T + + q_matrix, r_matrix = qr(stacked) + q_slices = [] + start = 0 + for matrix in matrices: + rows = matrix.shape[0] + q_slices.append(q_matrix[start : start + rows, :]) + start += rows + + dim = r_matrix.shape[1] + t_pi = np.zeros((dim, dim), dtype=r_matrix.dtype) + for q_slice in q_slices: + q_term = q_slice.T @ q_slice + regularization_alpha * np.eye(dim, dtype=r_matrix.dtype) + t_pi += np.linalg.inv(q_term) + t_pi /= len(q_slices) + + _, eigenvectors = np.linalg.eigh(t_pi) + return r_matrix.T @ eigenvectors + + +class FRODModel(BaseTuner): + prefix: str = "frod_" + tuner_layer_cls = FRODLayer + target_module_mapping = TRANSFORMERS_MODELS_TO_FROD_TARGET_MODULES_MAPPING + + def _init_frod_projections(self, config: FRODConfig, adapter_name: str) -> None: + weights = defaultdict(dict) + model_config = self.get_model_config(self.model) + peft_config = self._prepare_adapter_config(config, model_config) + peft_config = _maybe_include_all_linear_layers(peft_config, self.model) + + fallback_index = 0 + for key, module in self.model.named_modules(): + if not self._check_target_module_exists(peft_config, key): + continue + + if isinstance(module, nn.Linear): + weight = module.weight + elif isinstance(module, Conv1D): + weight = module.weight.T + else: + continue + + category = _category_from_key(key) + layer_idx = _layer_index_from_key(key, fallback_index) + fallback_index += 1 + weights[layer_idx][category] = weight + + if not weights: + raise ValueError( + "No layer types compatible with FRoD were found. Please check `peft_config.target_modules`." + ) + + if not hasattr(self, "frod_V"): + self.frod_V = nn.ModuleDict() + self.frod_s_indices = nn.ModuleDict() + self.frod_s_size = nn.ModuleDict() + + generator = torch.Generator(device="cpu").manual_seed(config.projection_prng_key) + categories = {category for layer_dict in weights.values() for category in layer_dict} + for category in sorted(categories): + matrices = [ + layer_dict[category].detach().to(torch.float32).cpu().numpy() + for _, layer_dict in sorted(weights.items()) + if category in layer_dict + ] + if not matrices: + continue + + v_matrix = _projection_from_weights(matrices, config.regularization_alpha) + example_weight = next(layer_dict[category] for layer_dict in weights.values() if category in layer_dict) + v_tensor = torch.from_numpy(v_matrix).to(dtype=example_weight.dtype, device="cpu") + + if category not in self.frod_V: + self.frod_V[category] = BufferDict({}, persistent=config.save_projection) + self.frod_V[category][adapter_name] = v_tensor + + in_dim = v_tensor.shape[0] + rows, cols = torch.meshgrid(torch.arange(in_dim), torch.arange(in_dim), indexing="ij") + mask_indices = torch.stack([rows.flatten(), cols.flatten()], dim=1) + non_diag_indices = mask_indices[mask_indices[:, 0] != mask_indices[:, 1]] + nnz = min(int(in_dim * in_dim * config.sparse_rate), non_diag_indices.shape[0]) + if nnz: + perm = torch.randperm(non_diag_indices.shape[0], generator=generator)[:nnz] + indices = non_diag_indices[perm].t().contiguous() + else: + indices = torch.empty(2, 0, dtype=torch.long) + size = torch.tensor([in_dim, in_dim], dtype=torch.long) + + if category not in self.frod_s_indices: + self.frod_s_indices[category] = BufferDict({}, persistent=config.save_projection) + self.frod_s_indices[category][adapter_name] = indices.to(torch.long) + if category not in self.frod_s_size: + self.frod_s_size[category] = BufferDict({}, persistent=config.save_projection) + self.frod_s_size[category][adapter_name] = size + + def _pre_injection_hook(self, model: nn.Module, config: FRODConfig, adapter_name: str) -> None: + self._init_frod_projections(config, adapter_name) + + def _check_new_adapter_config(self, config: FRODConfig) -> None: + super()._check_new_adapter_config(config) + + for existing_config in self.peft_config.values(): + if existing_config is config: + continue + if existing_config.projection_prng_key != config.projection_prng_key: + raise ValueError( + f"FRoD projection initialization key must be the same for all adapters. Got " + f"{config.projection_prng_key=} but previous config had " + f"{existing_config.projection_prng_key}." + ) + + save_projection_values = sorted({config.save_projection for config in self.peft_config.values()}) + if len(save_projection_values) > 1: + raise ValueError( + "FRoD projection weights must be saved for all adapters or none, but got multiple different values: " + f"{save_projection_values}" + ) + + def _create_and_replace( + self, + frod_config, + adapter_name, + target, + target_name, + parent, + current_key, + **optional_kwargs, + ): + if current_key is None: + raise ValueError("Current Key shouldn't be `None`") + + category = _category_from_key(current_key) + if category not in self.frod_V: + self._init_frod_projections(frod_config, adapter_name) + bias = hasattr(target, "bias") and target.bias is not None + kwargs = { + "frod_dropout": frod_config.frod_dropout, + "fan_in_fan_out": frod_config.fan_in_fan_out, + "init_weights": frod_config.init_weights, + "bias": bias, + } + + if isinstance(target, Linear): + target.update_layer( + adapter_name, + self.frod_V[category], + self.frod_s_indices[category], + self.frod_s_size[category], + frod_config.frod_dropout, + frod_config.init_weights, + ) + else: + new_module = self._create_new_module( + frod_config, + self.frod_V[category], + self.frod_s_indices[category], + self.frod_s_size[category], + adapter_name, + target, + **kwargs, + ) + if adapter_name not in self.active_adapters: + new_module.requires_grad_(False) + self._replace_module(parent, target_name, new_module, target) + + @staticmethod + def _create_new_module( + frod_config, + frod_V, + frod_s_indices, + frod_s_size, + adapter_name, + target, + **kwargs, + ): + bias = kwargs.pop("bias", False) + + if isinstance(target, BaseTunerLayer): + target_base_layer = target.get_base_layer() + else: + target_base_layer = target + + if isinstance(target_base_layer, torch.nn.Linear): + if kwargs["fan_in_fan_out"]: + warnings.warn( + "fan_in_fan_out is set to True but the target module is `torch.nn.Linear`. " + "Setting fan_in_fan_out to False." + ) + kwargs["fan_in_fan_out"] = frod_config.fan_in_fan_out = False + elif isinstance(target_base_layer, Conv1D): + kwargs["is_target_conv_1d_layer"] = True + if not kwargs["fan_in_fan_out"]: + warnings.warn( + "fan_in_fan_out is set to False but the target module is `Conv1D`. Setting fan_in_fan_out to True." + ) + kwargs["fan_in_fan_out"] = frod_config.fan_in_fan_out = True + else: + raise TypeError( + f"Target module {target} is not supported. Currently, only the following modules are supported: " + "`torch.nn.Linear`, `transformers.pytorch_utils.Conv1D`." + ) + + return Linear( + target, + frod_V, + frod_s_indices, + frod_s_size, + adapter_name, + bias=bias, + **kwargs, + ) diff --git a/src/peft/utils/__init__.py b/src/peft/utils/__init__.py index 4106c78060..8e56825f58 100644 --- a/src/peft/utils/__init__.py +++ b/src/peft/utils/__init__.py @@ -26,6 +26,7 @@ TRANSFORMERS_MODELS_TO_C3A_TARGET_MODULES_MAPPING, TRANSFORMERS_MODELS_TO_DELORA_TARGET_MODULES_MAPPING, TRANSFORMERS_MODELS_TO_FOURIERFT_TARGET_MODULES_MAPPING, + TRANSFORMERS_MODELS_TO_FROD_TARGET_MODULES_MAPPING, TRANSFORMERS_MODELS_TO_GRALORA_TARGET_MODULES_MAPPING, TRANSFORMERS_MODELS_TO_HRA_TARGET_MODULES_MAPPING, TRANSFORMERS_MODELS_TO_IA3_FEEDFORWARD_MODULES_MAPPING, @@ -89,6 +90,7 @@ "TRANSFORMERS_MODELS_TO_C3A_TARGET_MODULES_MAPPING", "TRANSFORMERS_MODELS_TO_DELORA_TARGET_MODULES_MAPPING", "TRANSFORMERS_MODELS_TO_FOURIERFT_TARGET_MODULES_MAPPING", + "TRANSFORMERS_MODELS_TO_FROD_TARGET_MODULES_MAPPING", "TRANSFORMERS_MODELS_TO_GRALORA_TARGET_MODULES_MAPPING", "TRANSFORMERS_MODELS_TO_HRA_TARGET_MODULES_MAPPING", "TRANSFORMERS_MODELS_TO_IA3_FEEDFORWARD_MODULES_MAPPING", diff --git a/src/peft/utils/constants.py b/src/peft/utils/constants.py index 94aa475f0d..9cf874225f 100644 --- a/src/peft/utils/constants.py +++ b/src/peft/utils/constants.py @@ -135,6 +135,8 @@ def starcoder_model_postprocess_past_key_value(past_key_values): TRANSFORMERS_MODELS_TO_VERA_TARGET_MODULES_MAPPING = TRANSFORMERS_MODELS_TO_LORA_TARGET_MODULES_MAPPING.copy() TRANSFORMERS_MODELS_TO_VERA_TARGET_MODULES_MAPPING["phi"] = ["q_proj", "v_proj"] +TRANSFORMERS_MODELS_TO_FROD_TARGET_MODULES_MAPPING = TRANSFORMERS_MODELS_TO_VERA_TARGET_MODULES_MAPPING.copy() + TRANSFORMERS_MODELS_TO_PVERA_TARGET_MODULES_MAPPING = TRANSFORMERS_MODELS_TO_VERA_TARGET_MODULES_MAPPING.copy() TRANSFORMERS_MODELS_TO_PVERA_TARGET_MODULES_MAPPING["dinov2"] = ["query", "value"] diff --git a/src/peft/utils/other.py b/src/peft/utils/other.py index 4cc720c4a5..93fddebd5f 100644 --- a/src/peft/utils/other.py +++ b/src/peft/utils/other.py @@ -49,6 +49,7 @@ TRANSFORMERS_MODELS_TO_C3A_TARGET_MODULES_MAPPING, TRANSFORMERS_MODELS_TO_DELORA_TARGET_MODULES_MAPPING, TRANSFORMERS_MODELS_TO_FOURIERFT_TARGET_MODULES_MAPPING, + TRANSFORMERS_MODELS_TO_FROD_TARGET_MODULES_MAPPING, TRANSFORMERS_MODELS_TO_GRALORA_TARGET_MODULES_MAPPING, TRANSFORMERS_MODELS_TO_HRA_TARGET_MODULES_MAPPING, TRANSFORMERS_MODELS_TO_IA3_FEEDFORWARD_MODULES_MAPPING, @@ -96,6 +97,7 @@ "TRANSFORMERS_MODELS_TO_C3A_TARGET_MODULES_MAPPING", "TRANSFORMERS_MODELS_TO_DELORA_TARGET_MODULES_MAPPING", "TRANSFORMERS_MODELS_TO_FOURIERFT_TARGET_MODULES_MAPPING", + "TRANSFORMERS_MODELS_TO_FROD_TARGET_MODULES_MAPPING", "TRANSFORMERS_MODELS_TO_GRALORA_TARGET_MODULES_MAPPING", "TRANSFORMERS_MODELS_TO_HRA_TARGET_MODULES_MAPPING", "TRANSFORMERS_MODELS_TO_IA3_FEEDFORWARD_MODULES_MAPPING", diff --git a/src/peft/utils/peft_types.py b/src/peft/utils/peft_types.py index 80fc1db8f5..9cf9b2e149 100644 --- a/src/peft/utils/peft_types.py +++ b/src/peft/utils/peft_types.py @@ -38,6 +38,7 @@ class PeftType(str, enum.Enum): - POLY - LN_TUNING - VERA + - FROD - FOURIERFT - HRA - BONE @@ -71,6 +72,7 @@ class PeftType(str, enum.Enum): POLY = "POLY" LN_TUNING = "LN_TUNING" VERA = "VERA" + FROD = "FROD" PVERA = "PVERA" FOURIERFT = "FOURIERFT" XLORA = "XLORA" diff --git a/src/peft/utils/save_and_load.py b/src/peft/utils/save_and_load.py index 57660c4283..65cd7bf502 100644 --- a/src/peft/utils/save_and_load.py +++ b/src/peft/utils/save_and_load.py @@ -273,6 +273,23 @@ def renamed_dora_weights(k): ) to_return["base_model.pvera_A." + adapter_name] = state_dict["base_model.pvera_A." + adapter_name] to_return["base_model.pvera_B." + adapter_name] = state_dict["base_model.pvera_B." + adapter_name] + elif config.peft_type == PeftType.FROD: + frod_prefix = PEFT_TYPE_TO_PREFIX_MAPPING[config.peft_type] + projection_prefixes = ("base_model.frod_V.", "base_model.frod_s_indices.", "base_model.frod_s_size.") + layer_projection_parts = (".frod_V.", ".frod_s_indices.", ".frod_s_size.", ".frod_U.") + to_return = { + k: state_dict[k] + for k in state_dict + if (frod_prefix in k) and (adapter_name in k) and not any(part in k for part in layer_projection_parts) + } + if config.save_projection: + to_return.update( + { + k: state_dict[k] + for k in state_dict + if k.startswith(projection_prefixes) and k.endswith(f".{adapter_name}") + } + ) elif config.peft_type == PeftType.XLORA: to_return = {k: state_dict[k] for k in state_dict if "internal_xlora_classifier" in k} elif config.peft_type == PeftType.VBLORA: @@ -715,6 +732,13 @@ def set_peft_model_state_dict( new_key = k.replace(".tinylora_v.", f".tinylora_v.{adapter_name}.") tinylora_v_state_dict[new_key] = state_dict.pop(k) + frod_projection_state_dict = {} + if config.peft_type == PeftType.FROD: + frod_projection_prefixes = ("base_model.frod_V.", "base_model.frod_s_indices.", "base_model.frod_s_size.") + frod_projection_keys = [k for k in state_dict if k.startswith(frod_projection_prefixes)] + for k in frod_projection_keys: + frod_projection_state_dict[f"{k}.{adapter_name}"] = state_dict.pop(k) + peft_model_state_dict = _insert_adapter_name_into_state_dict( state_dict, adapter_name=adapter_name, parameter_prefix=parameter_prefix ) @@ -722,6 +746,8 @@ def set_peft_model_state_dict( # Add back the tinylora_v keys (now in the correct format) if config.peft_type == PeftType.TINYLORA: peft_model_state_dict.update(tinylora_v_state_dict) + elif config.peft_type == PeftType.FROD: + peft_model_state_dict.update(frod_projection_state_dict) if config.peft_type == PeftType.ADALORA: rank_pattern = config.rank_pattern @@ -796,6 +822,21 @@ def set_peft_model_state_dict( " PRNG initialisation to restore these projections using `config.projection_prng_key`, which may" " not be accurate on all system configurations." ) + elif config.peft_type == PeftType.FROD: + has_projection = any( + k.startswith(("base_model.frod_V.", "base_model.frod_s_indices.", "base_model.frod_s_size.")) + for k in peft_model_state_dict + ) + if config.save_projection and not has_projection: + raise ValueError( + "Specified to load FRoD projection tensors from state dictionary however they were not present!" + ) + elif not config.save_projection and has_projection: + warnings.warn( + "Specified to not load FRoD projection tensors from state dictionary however they are present. " + "Consider using them to ensure checkpoint loading is correct by setting " + "`peft_config.save_projection = True`." + ) elif config.peft_type == PeftType.LORA: # Here we take care of a refactor of DoRA which changed lora_magnitude_vector from a ParameterDict to a # ModuleDict with a DoraLayer instance. The old parameter is now the "weight" attribute of that layer. diff --git a/tests/test_custom_models.py b/tests/test_custom_models.py index 7a809a6b4c..92811b85e0 100644 --- a/tests/test_custom_models.py +++ b/tests/test_custom_models.py @@ -37,6 +37,7 @@ C3AConfig, DeloraConfig, FourierFTConfig, + FRODConfig, GraloraConfig, HiraConfig, HRAConfig, @@ -870,6 +871,14 @@ RandLoraConfig, {"target_modules": ["lin0"], "modules_to_save": ["lin1"], "randlora_alpha": 1}, ), + ######## + # FRoD # + ######## + ("Vanilla MLP 1 FRoD", "MLP", FRODConfig, {"target_modules": "lin0"}), + ("Vanilla MLP 2 FRoD", "MLP", FRODConfig, {"target_modules": ["lin0"]}), + ("Vanilla MLP 3 FRoD", "MLP", FRODConfig, {"target_modules": ["lin1"]}), + ("Vanilla MLP 4 FRoD", "MLP", FRODConfig, {"target_modules": ["lin0", "lin1"]}), + ("Vanilla MLP 5 FRoD", "MLP", FRODConfig, {"target_modules": ["lin0"], "modules_to_save": ["lin1"]}), ####### # C3A # ####### @@ -3455,7 +3464,8 @@ def test_multiple_adapters_automatic_modules_to_save(self): assert "other" in model.base_model.classifier.modules_to_save @pytest.mark.parametrize( - "config_cls", [IA3Config, BeftConfig, LoHaConfig, LoKrConfig, LoraConfig, HRAConfig, ShiraConfig, MissConfig] + "config_cls", + [IA3Config, BeftConfig, FRODConfig, LoHaConfig, LoKrConfig, LoraConfig, HRAConfig, ShiraConfig, MissConfig], ) def test_multiple_adapters_mixed_modules_to_save(self, config_cls): # See issue 1574 @@ -3487,7 +3497,8 @@ def test_multiple_adapters_mixed_modules_to_save(self, config_cls): model(**inputs) @pytest.mark.parametrize( - "config_cls", [IA3Config, BeftConfig, LoHaConfig, LoKrConfig, LoraConfig, HRAConfig, ShiraConfig] + "config_cls", + [IA3Config, BeftConfig, FRODConfig, LoHaConfig, LoKrConfig, LoraConfig, HRAConfig, ShiraConfig], ) def test_multiple_adapters_mixed_modules_to_save_order_switched(self, config_cls): # See issue 1574 @@ -3830,6 +3841,7 @@ def test_load_resized_embedding_ignore_mismatched_sizes(self): AdaLoraConfig(target_modules=["lin0"], init_lora_weights=False, total_step=1), IA3Config(target_modules=["lin0"], feedforward_modules=["lin0"], init_ia3_weights=False), BeftConfig(target_modules=["lin0"], init_weights=False), + FRODConfig(target_modules=["lin0"], init_weights=False), OFTConfig(target_modules=["lin0"], init_weights=False, r=2, oft_block_size=0), BOFTConfig(target_modules=["lin0"], init_weights=False, boft_block_size=2), HRAConfig(target_modules=["lin0"], init_weights=False), From 2d3c730ad8e3477212c7102aa80d8e43aa2646be Mon Sep 17 00:00:00 2001 From: Bane-Elvin Date: Thu, 21 May 2026 16:42:22 +0000 Subject: [PATCH 2/9] Add FRoD-specific tests --- tests/test_frod.py | 238 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 238 insertions(+) create mode 100644 tests/test_frod.py diff --git a/tests/test_frod.py b/tests/test_frod.py new file mode 100644 index 0000000000..c21b3040f6 --- /dev/null +++ b/tests/test_frod.py @@ -0,0 +1,238 @@ +# Copyright 2026-present the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# This test file is for tests specific to FRoD, since FRoD has shared projection buffers. + +import os + +import pytest +import torch +from accelerate.utils.imports import is_bf16_available +from safetensors import safe_open +from torch import nn + +from peft import FRODConfig, PeftModel, get_peft_model + + +class MLP(nn.Module): + def __init__(self, bias=True): + super().__init__() + self.relu = nn.ReLU() + self.lin0 = nn.Linear(10, 20, bias=bias) + self.lin1 = nn.Linear(20, 20, bias=bias) # lin1 and lin2 have same shape + self.lin2 = nn.Linear(20, 20, bias=bias) + self.lin3 = nn.Linear(20, 2, bias=bias) + self.sm = nn.LogSoftmax(dim=-1) + + def forward(self, X): + X = self.lin0(X) + X = self.relu(X) + X = self.lin1(X) + X = self.relu(X) + X = self.lin2(X) + X = self.relu(X) + X = self.lin3(X) + X = self.sm(X) + return X + + +class TestFROD: + @pytest.fixture + def mlp(self): + torch.manual_seed(0) + model = MLP() + return model + + @pytest.fixture + def mlp_same_prng(self, mlp): + torch.manual_seed(0) + + config = FRODConfig(target_modules=["lin1", "lin2"], init_weights=False) + peft_model = get_peft_model(mlp, config) + config2 = FRODConfig(target_modules=["lin1", "lin2"], init_weights=False) + peft_model.add_adapter("other", config2) + return peft_model + + @staticmethod + def _make_second_adapter_different(peft_model): + with torch.no_grad(): + for module in peft_model.base_model.model.modules(): + if hasattr(module, "frod_lambda_l") and "second" in module.frod_lambda_l: + module.frod_lambda_l["second"].add_(0.1) + + def test_multiple_adapters_same_prng_projection_buffers(self, mlp_same_prng): + # Multiple adapters with the same PRNG key share fixed projection buffers within each FRoD layer. + assert ( + mlp_same_prng.base_model.model.lin1.frod_V["default"].data_ptr() + == mlp_same_prng.base_model.model.lin1.frod_V["other"].data_ptr() + ) + assert ( + mlp_same_prng.base_model.model.lin1.frod_s_indices["default"].data_ptr() + == mlp_same_prng.base_model.model.lin1.frod_s_indices["other"].data_ptr() + ) + assert ( + mlp_same_prng.base_model.model.lin2.frod_V["default"].data_ptr() + == mlp_same_prng.base_model.model.lin2.frod_V["other"].data_ptr() + ) + assert ( + mlp_same_prng.base_model.model.lin2.frod_s_indices["default"].data_ptr() + == mlp_same_prng.base_model.model.lin2.frod_s_indices["other"].data_ptr() + ) + + def test_multiple_adapters_different_prng_raises(self): + model = MLP() + config = FRODConfig(target_modules=["lin1", "lin2"], init_weights=False) + peft_model = get_peft_model(model, config) + config2 = FRODConfig(target_modules=["lin1", "lin2"], init_weights=False, projection_prng_key=123) + + msg = ( + r"FRoD projection initialization key must be the same for all adapters. Got " + r"config.projection_prng_key=123 but previous config had 0" + ) + with pytest.raises(ValueError, match=msg): + peft_model.add_adapter("other", config2) + + def test_multiple_adapters_save_load_save_projection_false(self, mlp, tmp_path): + # Check saving and loading works with multiple adapters without saved projection tensors. + torch.manual_seed(1) + config = FRODConfig(target_modules=["lin1", "lin2"], init_weights=False, save_projection=False) + peft_model = get_peft_model(mlp, config, adapter_name="first") + config2 = FRODConfig(target_modules=["lin1", "lin2"], init_weights=False, save_projection=False) + peft_model.add_adapter("second", config2) + self._make_second_adapter_different(peft_model) + peft_model.eval() + + input = torch.randn(5, 10) + peft_model.set_adapter("first") + output_first = peft_model(input) + peft_model.set_adapter("second") + output_second = peft_model(input) + + assert not torch.allclose(output_first, output_second, atol=1e-3, rtol=1e-3) + + save_path = tmp_path / "frod" + peft_model.save_pretrained(save_path) + assert os.path.exists(save_path / "first" / "adapter_config.json") + assert os.path.exists(save_path / "second" / "adapter_config.json") + + torch.manual_seed(0) + mlp = MLP() + peft_model = PeftModel.from_pretrained(mlp, save_path / "first", adapter_name="first") + peft_model.load_adapter(save_path / "second", "second") + peft_model.eval() + + peft_model.set_adapter("first") + output_first_loaded = peft_model(input) + peft_model.set_adapter("second") + output_second_loaded = peft_model(input) + + assert torch.allclose(output_first, output_first_loaded, atol=1e-3, rtol=1e-3) + assert torch.allclose(output_second, output_second_loaded, atol=1e-3, rtol=1e-3) + + def test_save_projection_false_contains_no_frod_projection_tensors(self, mlp, tmp_path): + config = FRODConfig(target_modules=["lin1", "lin2"], init_weights=False, save_projection=False) + peft_model = get_peft_model(mlp, config) + + save_path = tmp_path / "frod" + peft_model.save_pretrained(save_path) + + state_dict = {} + with safe_open(save_path / "adapter_model.safetensors", framework="pt", device="cpu") as f: + for key in f.keys(): + state_dict[key] = f.get_tensor(key) + + assert not any("frod_V" in key for key in state_dict) + assert not any("frod_s_indices" in key for key in state_dict) + assert not any("frod_s_size" in key for key in state_dict) + assert not any("frod_U" in key for key in state_dict) + + def test_save_projection_true_contains_top_level_projection_tensors_only(self, mlp, tmp_path): + config = FRODConfig(target_modules=["lin1", "lin2"], init_weights=False) + peft_model = get_peft_model(mlp, config) + + save_path = tmp_path / "frod" + peft_model.save_pretrained(save_path) + + keys = [] + with safe_open(save_path / "adapter_model.safetensors", framework="pt", device="cpu") as f: + keys = list(f.keys()) + + assert "base_model.frod_V.lin1" in keys + assert "base_model.frod_s_indices.lin1" in keys + assert "base_model.frod_s_size.lin1" in keys + assert "base_model.frod_V.lin2" in keys + assert not any(".model.lin1.frod_V" in key for key in keys) + assert not any("frod_U" in key for key in keys) + + def test_frod_projection_buffers_share_memory_with_layers(self, mlp_same_prng): + frod_V_lin1 = mlp_same_prng.base_model.frod_V["lin1"]["default"] + frod_s_indices_lin1 = mlp_same_prng.base_model.frod_s_indices["lin1"]["default"] + + assert frod_V_lin1.data_ptr() == mlp_same_prng.base_model.model.lin1.frod_V["default"].data_ptr() + assert frod_V_lin1.data_ptr() == mlp_same_prng.base_model.model.lin1.frod_V["other"].data_ptr() + assert ( + frod_s_indices_lin1.data_ptr() == mlp_same_prng.base_model.model.lin1.frod_s_indices["default"].data_ptr() + ) + assert frod_s_indices_lin1.data_ptr() == mlp_same_prng.base_model.model.lin1.frod_s_indices["other"].data_ptr() + + # Different target categories have distinct projection buffers. + assert frod_V_lin1.data_ptr() != mlp_same_prng.base_model.frod_V["lin2"]["default"].data_ptr() + + def test_frod_lambda_dont_share_memory(self, mlp_same_prng): + assert ( + mlp_same_prng.base_model.model.lin1.frod_lambda_s_values["default"].data_ptr() + != mlp_same_prng.base_model.model.lin1.frod_lambda_s_values["other"].data_ptr() + ) + assert ( + mlp_same_prng.base_model.model.lin1.frod_lambda_s_values["default"].data_ptr() + != mlp_same_prng.base_model.model.lin2.frod_lambda_s_values["default"].data_ptr() + ) + assert ( + mlp_same_prng.base_model.model.lin1.frod_lambda_l["default"].data_ptr() + != mlp_same_prng.base_model.model.lin1.frod_lambda_l["other"].data_ptr() + ) + assert ( + mlp_same_prng.base_model.model.lin1.frod_lambda_l["default"].data_ptr() + != mlp_same_prng.base_model.model.lin2.frod_lambda_l["default"].data_ptr() + ) + + def test_frod_different_shapes(self, mlp): + config = FRODConfig(target_modules=["lin0", "lin3"], init_weights=False) + mlp_different_shapes = get_peft_model(mlp, config) + + assert mlp.lin0.base_layer.weight.shape != mlp.lin3.base_layer.weight.shape + assert mlp_different_shapes.base_model.frod_V["lin0"]["default"].shape == ( + mlp.lin0.in_features, + mlp.lin0.in_features, + ) + assert mlp_different_shapes.base_model.frod_V["lin3"]["default"].shape == ( + mlp.lin3.in_features, + mlp.lin3.in_features, + ) + + input = torch.randn(5, 10) + mlp_different_shapes(input) + + @pytest.mark.parametrize("dtype", [torch.float32, torch.float16, torch.bfloat16]) + def test_frod_dtypes(self, dtype): + if dtype == torch.bfloat16: + if not is_bf16_available(): + pytest.skip("bfloat16 not supported on this system, skipping the test") + + model = MLP().to(dtype) + config = FRODConfig(target_modules=["lin1", "lin2"], init_weights=False) + peft_model = get_peft_model(model, config) + inputs = torch.randn(5, 10).to(dtype) + output = peft_model(inputs) + assert output.dtype == dtype From b252c65dc5ac2b48a036564661a63b7e20b9fc72 Mon Sep 17 00:00:00 2001 From: Bane-Elvin Date: Wed, 27 May 2026 04:27:29 +0000 Subject: [PATCH 3/9] Finalize FROD integration --- docs/source/package_reference/frod.md | 28 ++++ examples/frod_finetuning/README.md | 27 ++++ .../frod_image_classification.py | 131 ++++++++++++++++++ .../frod_text_classification.py | 93 +++++++++++++ examples/frod_finetuning/requirements.txt | 6 + .../llama-3.2-3B-default/adapter_config.json | 20 +++ .../adapter_config.json | 20 +++ .../training_params.json | 5 + src/peft/tuners/frod/config.py | 22 ++- src/peft/tuners/frod/layer.py | 18 ++- src/peft/tuners/frod/model.py | 3 + src/peft/utils/constants.py | 1 + tests/test_config.py | 2 + tests/test_decoder_models.py | 9 ++ tests/test_encoder_decoder_models.py | 9 ++ tests/test_feature_extraction_models.py | 9 ++ tests/test_seq_classifier.py | 9 ++ 17 files changed, 403 insertions(+), 9 deletions(-) create mode 100644 examples/frod_finetuning/README.md create mode 100644 examples/frod_finetuning/frod_image_classification.py create mode 100644 examples/frod_finetuning/frod_text_classification.py create mode 100644 examples/frod_finetuning/requirements.txt create mode 100644 method_comparison/MetaMathQA/experiments/frod/llama-3.2-3B-default/adapter_config.json create mode 100644 method_comparison/MetaMathQA/experiments/frod/llama-3.2-3B-sparse0.02-lr_0.001/adapter_config.json create mode 100644 method_comparison/MetaMathQA/experiments/frod/llama-3.2-3B-sparse0.02-lr_0.001/training_params.json diff --git a/docs/source/package_reference/frod.md b/docs/source/package_reference/frod.md index 7fceb6fa41..b2c146fe0f 100644 --- a/docs/source/package_reference/frod.md +++ b/docs/source/package_reference/frod.md @@ -20,15 +20,43 @@ FRoD is a parameter-efficient fine-tuning method that combines a shared full-ran rotational degrees. The adapter update is expressed through fixed projection tensors and trainable coefficients, which allows FRoD to apply full-rank updates while keeping the number of trained parameters small. +Paper: [Full-Rank Efficient Fine-Tuning with Rotational Degrees](https://doi.org/10.1609/aaai.v40i31.39813). + When saving the adapter parameters, it is possible to avoid storing the projection tensors by setting `save_projection=False` on the `FRODConfig`. In that case, the projections are restored from the base model weights and the fixed random seed from `projection_prng_key`. This reduces checkpoint size, but the default is `save_projection=True` to make checkpoint loading independent of regeneration details. +Compared to LoRA, FRoD can express a full-rank update in each adapted linear layer while training only the diagonal +coefficients and a sparse set of off-diagonal rotation coefficients. This can be useful when a low-rank update is too +restrictive. The trade-off is that FRoD computes fixed projection tensors from the base weights during adapter +injection, which makes setup more expensive and the implementation less broadly supported than LoRA. + FRoD currently has the following constraint: - Only `nn.Linear` and `transformers.pytorch_utils.Conv1D` layers are supported. +## Quickstart + +```python +from transformers import AutoModelForSequenceClassification + +from peft import FRODConfig, TaskType, get_peft_model + +model = AutoModelForSequenceClassification.from_pretrained("google-bert/bert-base-uncased", num_labels=2) + +peft_config = FRODConfig( + task_type=TaskType.SEQ_CLS, + target_modules=["query", "value"], + modules_to_save=["classifier"], + sparse_rate=0.02, + frod_dropout=0.0, +) + +model = get_peft_model(model, peft_config) +model.print_trainable_parameters() +``` + ## FRODConfig [[autodoc]] tuners.frod.config.FRODConfig diff --git a/examples/frod_finetuning/README.md b/examples/frod_finetuning/README.md new file mode 100644 index 0000000000..f818ed6b51 --- /dev/null +++ b/examples/frod_finetuning/README.md @@ -0,0 +1,27 @@ +# FRoD fine-tuning examples + +These examples show minimal FRoD fine-tuning with the Transformers `Trainer`. + +Install the example dependencies and run either script directly: + +```bash +pip install -r examples/frod_finetuning/requirements.txt +python examples/frod_finetuning/frod_text_classification.py +python examples/frod_finetuning/frod_image_classification.py +``` + +The text example fine-tunes `google-bert/bert-base-uncased` on `nyu-mll/glue` with the `sst2` configuration. The image +example fine-tunes `google/vit-base-patch16-224` on the train and test parquet splits from `tanganke/stanford_cars`. + +Both scripts use separate optimizer learning rates for FRoD diagonal coefficients, FRoD sparse coefficients, and the +classification head. FRoD dropout is set to `0.0` because the sparse rotational parameterization is the main +regularizer in these examples. + +To use local mirrors of the image model or dataset, override the image example paths with environment variables: + +```bash +FROD_IMAGE_MODEL_NAME=/path/to/local/vit-model \ +FROD_STANFORD_CARS_DATA_DIR=/path/to/local/stanford_cars \ +FROD_IMAGE_OUTPUT_DIR=vit-local-frod-stanford-cars \ +python examples/frod_finetuning/frod_image_classification.py +``` diff --git a/examples/frod_finetuning/frod_image_classification.py b/examples/frod_finetuning/frod_image_classification.py new file mode 100644 index 0000000000..3f61f52f32 --- /dev/null +++ b/examples/frod_finetuning/frod_image_classification.py @@ -0,0 +1,131 @@ +# Copyright 2026-present the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); + +import os + +import numpy as np +import torch +from datasets import load_dataset +from transformers import AutoImageProcessor, AutoModelForImageClassification, Trainer, TrainingArguments + +from peft import FRODConfig, get_peft_model + + +MODEL_NAME = os.environ.get("FROD_IMAGE_MODEL_NAME", "google/vit-base-patch16-224") +OUTPUT_DIR = os.environ.get("FROD_IMAGE_OUTPUT_DIR", "vit-base-patch16-224-frod-stanford-cars") +DATA_DIR = os.environ.get("FROD_STANFORD_CARS_DATA_DIR") +FROD_LAMBDA_L_LR = 5e-4 +FROD_LAMBDA_S_LR = 5e-5 +CLASSIFIER_LR = 1e-4 + +def main(): + if DATA_DIR: + data_files = { + "train": [ + os.path.join(DATA_DIR, "data", "train-00000-of-00002.parquet"), + os.path.join(DATA_DIR, "data", "train-00001-of-00002.parquet"), + ], + "test": [ + os.path.join(DATA_DIR, "data", "test-00000-of-00002.parquet"), + os.path.join(DATA_DIR, "data", "test-00001-of-00002.parquet"), + ], + } + else: + data_files = { + "train": [ + "hf://datasets/tanganke/stanford_cars/data/train-00000-of-00002.parquet", + "hf://datasets/tanganke/stanford_cars/data/train-00001-of-00002.parquet", + ], + "test": [ + "hf://datasets/tanganke/stanford_cars/data/test-00000-of-00002.parquet", + "hf://datasets/tanganke/stanford_cars/data/test-00001-of-00002.parquet", + ], + } + + dataset = load_dataset("parquet", data_files=data_files) + train_split = dataset["train"] + eval_split = dataset["test"] + image_processor = AutoImageProcessor.from_pretrained(MODEL_NAME) + label_feature = train_split.features["label"] + label_names = ( + label_feature.names if hasattr(label_feature, "names") else [str(i) for i in sorted(set(train_split["label"]))] + ) + id2label = dict(enumerate(label_names)) + label2id = {name: idx for idx, name in id2label.items()} + + model = AutoModelForImageClassification.from_pretrained( + MODEL_NAME, + num_labels=len(label_names), + id2label=id2label, + label2id=label2id, + ignore_mismatched_sizes=True, + ) + peft_config = FRODConfig( + target_modules=["query", "value"], + modules_to_save=["classifier"], + frod_dropout=0.0, + sparse_rate=0.02, + ) + model = get_peft_model(model, peft_config) + model.print_trainable_parameters() + + def transform(batch): + images = [image.convert("RGB") for image in batch["image"]] + inputs = image_processor(images, return_tensors="pt") + inputs["labels"] = batch["label"] + return inputs + + train_dataset = train_split.with_transform(transform) + eval_dataset = eval_split.with_transform(transform) + + def collate_fn(examples): + pixel_values = torch.stack([example["pixel_values"] for example in examples]) + labels = torch.tensor([example["labels"] for example in examples]) + return {"pixel_values": pixel_values, "labels": labels} + + def compute_metrics(eval_pred): + predictions = np.argmax(eval_pred.predictions, axis=-1) + return {"accuracy": (predictions == eval_pred.label_ids).mean().item()} + + optimizer = torch.optim.AdamW( + [ + {"params": [p for n, p in model.named_parameters() if "frod_lambda_l" in n], "lr": FROD_LAMBDA_L_LR}, + { + "params": [p for n, p in model.named_parameters() if "frod_lambda_s_values" in n], + "lr": FROD_LAMBDA_S_LR, + }, + {"params": [p for n, p in model.named_parameters() if "classifier" in n], "lr": CLASSIFIER_LR}, + ] + ) + + training_args = TrainingArguments( + output_dir=OUTPUT_DIR, + learning_rate=FROD_LAMBDA_L_LR, + per_device_train_batch_size=32, + per_device_eval_batch_size=64, + num_train_epochs=1, + eval_strategy="epoch", + save_strategy="epoch", + load_best_model_at_end=True, + metric_for_best_model="accuracy", + remove_unused_columns=False, + report_to="none", + ) + + trainer = Trainer( + model=model, + args=training_args, + train_dataset=train_dataset, + eval_dataset=eval_dataset, + data_collator=collate_fn, + compute_metrics=compute_metrics, + optimizers=(optimizer, None), + ) + trainer.train() + trainer.evaluate() + model.save_pretrained(OUTPUT_DIR) + + +if __name__ == "__main__": + main() diff --git a/examples/frod_finetuning/frod_text_classification.py b/examples/frod_finetuning/frod_text_classification.py new file mode 100644 index 0000000000..105beda9b9 --- /dev/null +++ b/examples/frod_finetuning/frod_text_classification.py @@ -0,0 +1,93 @@ +# Copyright 2026-present the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); + +import numpy as np +import torch +from datasets import load_dataset +from transformers import ( + AutoModelForSequenceClassification, + AutoTokenizer, + DataCollatorWithPadding, + Trainer, + TrainingArguments, +) + +from peft import FRODConfig, TaskType, get_peft_model + + +MODEL_NAME = "google-bert/bert-base-uncased" +DATASET_NAME = "nyu-mll/glue" +TASK_NAME = "sst2" +OUTPUT_DIR = "bert-base-uncased-frod-sst2" +FROD_LAMBDA_L_LR = 2e-2 +FROD_LAMBDA_S_LR = 2e-3 +CLASSIFIER_LR = 1e-2 + + +def main(): + dataset = load_dataset(DATASET_NAME, TASK_NAME) + tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME) + + def preprocess(batch): + return tokenizer(batch["sentence"], truncation=True) + + tokenized = dataset.map(preprocess, batched=True) + tokenized = tokenized.rename_column("label", "labels") + + model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=2) + peft_config = FRODConfig( + task_type=TaskType.SEQ_CLS, + target_modules=["query", "value"], + modules_to_save=["classifier"], + frod_dropout=0.0, + sparse_rate=0.02, + ) + model = get_peft_model(model, peft_config) + model.print_trainable_parameters() + + def compute_metrics(eval_pred): + predictions = np.argmax(eval_pred.predictions, axis=-1) + return {"accuracy": (predictions == eval_pred.label_ids).mean().item()} + + optimizer = torch.optim.AdamW( + [ + {"params": [p for n, p in model.named_parameters() if "frod_lambda_l" in n], "lr": FROD_LAMBDA_L_LR}, + { + "params": [p for n, p in model.named_parameters() if "frod_lambda_s_values" in n], + "lr": FROD_LAMBDA_S_LR, + }, + {"params": [p for n, p in model.named_parameters() if "classifier" in n], "lr": CLASSIFIER_LR}, + ] + ) + + training_args = TrainingArguments( + output_dir=OUTPUT_DIR, + learning_rate=FROD_LAMBDA_L_LR, + per_device_train_batch_size=32, + per_device_eval_batch_size=64, + num_train_epochs=1, + eval_strategy="epoch", + save_strategy="epoch", + load_best_model_at_end=True, + metric_for_best_model="accuracy", + report_to="none", + ) + + trainer = Trainer( + model=model, + args=training_args, + train_dataset=tokenized["train"], + eval_dataset=tokenized["validation"], + tokenizer=tokenizer, + data_collator=DataCollatorWithPadding(tokenizer=tokenizer), + compute_metrics=compute_metrics, + optimizers=(optimizer, None), + ) + trainer.train() + trainer.evaluate() + model.save_pretrained(OUTPUT_DIR) + + +if __name__ == "__main__": + main() diff --git a/examples/frod_finetuning/requirements.txt b/examples/frod_finetuning/requirements.txt new file mode 100644 index 0000000000..8bcaa74619 --- /dev/null +++ b/examples/frod_finetuning/requirements.txt @@ -0,0 +1,6 @@ +peft +transformers +accelerate>=1.0.0 +datasets +numpy +Pillow diff --git a/method_comparison/MetaMathQA/experiments/frod/llama-3.2-3B-default/adapter_config.json b/method_comparison/MetaMathQA/experiments/frod/llama-3.2-3B-default/adapter_config.json new file mode 100644 index 0000000000..d52af61d39 --- /dev/null +++ b/method_comparison/MetaMathQA/experiments/frod/llama-3.2-3B-default/adapter_config.json @@ -0,0 +1,20 @@ +{ + "auto_mapping": null, + "base_model_name_or_path": null, + "bias": "none", + "fan_in_fan_out": false, + "frod_dropout": 0.0, + "inference_mode": false, + "init_weights": true, + "layers_pattern": null, + "layers_to_transform": null, + "modules_to_save": null, + "peft_type": "FROD", + "projection_prng_key": 0, + "regularization_alpha": 0.001, + "revision": null, + "save_projection": true, + "sparse_rate": 0.01, + "target_modules": null, + "task_type": "CAUSAL_LM" +} diff --git a/method_comparison/MetaMathQA/experiments/frod/llama-3.2-3B-sparse0.02-lr_0.001/adapter_config.json b/method_comparison/MetaMathQA/experiments/frod/llama-3.2-3B-sparse0.02-lr_0.001/adapter_config.json new file mode 100644 index 0000000000..8abdd8540b --- /dev/null +++ b/method_comparison/MetaMathQA/experiments/frod/llama-3.2-3B-sparse0.02-lr_0.001/adapter_config.json @@ -0,0 +1,20 @@ +{ + "auto_mapping": null, + "base_model_name_or_path": null, + "bias": "none", + "fan_in_fan_out": false, + "frod_dropout": 0.0, + "inference_mode": false, + "init_weights": true, + "layers_pattern": null, + "layers_to_transform": null, + "modules_to_save": null, + "peft_type": "FROD", + "projection_prng_key": 0, + "regularization_alpha": 0.001, + "revision": null, + "save_projection": true, + "sparse_rate": 0.02, + "target_modules": null, + "task_type": "CAUSAL_LM" +} diff --git a/method_comparison/MetaMathQA/experiments/frod/llama-3.2-3B-sparse0.02-lr_0.001/training_params.json b/method_comparison/MetaMathQA/experiments/frod/llama-3.2-3B-sparse0.02-lr_0.001/training_params.json new file mode 100644 index 0000000000..52d87e3ef6 --- /dev/null +++ b/method_comparison/MetaMathQA/experiments/frod/llama-3.2-3B-sparse0.02-lr_0.001/training_params.json @@ -0,0 +1,5 @@ +{ + "optimizer_kwargs": { + "lr": 1e-3 + } +} diff --git a/src/peft/tuners/frod/config.py b/src/peft/tuners/frod/config.py index 354c1279b4..2e5b724768 100644 --- a/src/peft/tuners/frod/config.py +++ b/src/peft/tuners/frod/config.py @@ -56,6 +56,12 @@ class FRODConfig(PeftConfig): layers_pattern (`Optional[Union[List[str], str]]`): The layer pattern name, used only if `layers_to_transform` is different from `None`. This should target the `nn.ModuleList` of the model, which is often called `'layers'` or `'h'`. + sparse_rate (`float`): + Fraction of off-diagonal entries in the sparse trainable rotation matrix. Higher values increase capacity + and trainable parameters; lower values are cheaper. Defaults to `0.01`. + regularization_alpha (`float`): + Small positive value used while building the shared basis from base weights. It stabilizes the matrix + inverse when layers in the same category have correlated weights. Defaults to `1e-3`. """ target_modules: Optional[Union[list[str], str]] = field( @@ -126,15 +132,27 @@ class FRODConfig(PeftConfig): ) }, ) - sparse_rate: float = field(default=0.01, metadata={"help": "Sparse rate"}) + sparse_rate: float = field( + default=0.01, + metadata={ + "help": ( + "Fraction of off-diagonal entries in the sparse trainable rotation matrix. Higher values increase " + "capacity and trainable parameters; lower values are cheaper." + ) + }, + ) regularization_alpha: float = field( default=1e-3, metadata={ - "help": ("Regularization parameter used when building the shared FRoD basis."), + "help": ( + "Small positive value used while building the shared basis from base weights. It stabilizes matrix " + "inverses for correlated layers." + ), }, ) def __post_init__(self): + super().__post_init__() self.peft_type = PeftType.FROD self.target_modules = ( set(self.target_modules) if isinstance(self.target_modules, list) else self.target_modules diff --git a/src/peft/tuners/frod/layer.py b/src/peft/tuners/frod/layer.py index edaf2ea2ba..6feb0b0729 100644 --- a/src/peft/tuners/frod/layer.py +++ b/src/peft/tuners/frod/layer.py @@ -29,7 +29,7 @@ class FRODLayer(BaseTunerLayer): - adapter_layer_names = ("frod_lambda_s_values", "frod_lambda_l") + adapter_layer_names = ("frod_lambda_l", "frod_lambda_s_values") other_param_names = ("frod_V", "frod_U", "frod_s_indices", "frod_s_size") def __init__(self, base_layer: nn.Module, **kwargs): @@ -38,8 +38,8 @@ def __init__(self, base_layer: nn.Module, **kwargs): self.frod_dropout = nn.ModuleDict({}) # Sparse S is parameterized by its COO values only. - self.frod_lambda_s_values = nn.ParameterDict({}) self.frod_lambda_l = nn.ParameterDict({}) + self.frod_lambda_s_values = nn.ParameterDict({}) self.frod_s_indices: Optional[BufferDict] = None self.frod_s_size: Optional[BufferDict] = None @@ -74,13 +74,12 @@ def update_layer( frod_dropout, init_weights, ): - weight = self.get_base_layer().weight - device = weight.device - dtype = weight.dtype + base_layer = self.get_base_layer() + weight = base_layer.weight.T if isinstance(base_layer, Conv1D) else base_layer.weight + device = base_layer.weight.device + dtype = base_layer.weight.dtype param_dtype = dtype - if device.type == "cuda" and dtype == torch.float32: - param_dtype = torch.float16 self.r[adapter_name] = self.out_features if frod_dropout > 0.0: @@ -115,6 +114,11 @@ def update_layer( self.frod_lambda_l[adapter_name] = nn.Parameter(L, requires_grad=True) if init_weights: self.reset_frod_parameters(adapter_name) + else: + # PEFT convention: init_weights=False should produce a non-identity adapter for merge tests. + with torch.no_grad(): + nn.init.normal_(self.frod_lambda_s_values[adapter_name], std=0.02) + self.frod_lambda_l[adapter_name].add_(torch.randn_like(self.frod_lambda_l[adapter_name]) * 0.02) self.frod_U[adapter_name] = U.cpu() self._move_adapter_to_device_of_base_layer(adapter_name) diff --git a/src/peft/tuners/frod/model.py b/src/peft/tuners/frod/model.py index 6b148ce51e..2968e65416 100644 --- a/src/peft/tuners/frod/model.py +++ b/src/peft/tuners/frod/model.py @@ -73,6 +73,7 @@ def _projection_from_weights(matrices: list[np.ndarray], regularization_alpha: f dim = r_matrix.shape[1] t_pi = np.zeros((dim, dim), dtype=r_matrix.dtype) + # Layers of the same projection category can be highly correlated; this ridge term keeps the inverse stable. for q_slice in q_slices: q_term = q_slice.T @ q_slice + regularization_alpha * np.eye(dim, dtype=r_matrix.dtype) t_pi += np.linalg.inv(q_term) @@ -144,6 +145,8 @@ def _init_frod_projections(self, config: FRODConfig, adapter_name: str) -> None: mask_indices = torch.stack([rows.flatten(), cols.flatten()], dim=1) non_diag_indices = mask_indices[mask_indices[:, 0] != mask_indices[:, 1]] nnz = min(int(in_dim * in_dim * config.sparse_rate), non_diag_indices.shape[0]) + if (config.sparse_rate > 0) and (non_diag_indices.shape[0] > 0): + nnz = max(1, nnz) if nnz: perm = torch.randperm(non_diag_indices.shape[0], generator=generator)[:nnz] indices = non_diag_indices[perm].t().contiguous() diff --git a/src/peft/utils/constants.py b/src/peft/utils/constants.py index 9cf874225f..3ef9c0f80a 100644 --- a/src/peft/utils/constants.py +++ b/src/peft/utils/constants.py @@ -136,6 +136,7 @@ def starcoder_model_postprocess_past_key_value(past_key_values): TRANSFORMERS_MODELS_TO_VERA_TARGET_MODULES_MAPPING["phi"] = ["q_proj", "v_proj"] TRANSFORMERS_MODELS_TO_FROD_TARGET_MODULES_MAPPING = TRANSFORMERS_MODELS_TO_VERA_TARGET_MODULES_MAPPING.copy() +TRANSFORMERS_MODELS_TO_FROD_TARGET_MODULES_MAPPING["vit"] = ["query", "value"] TRANSFORMERS_MODELS_TO_PVERA_TARGET_MODULES_MAPPING = TRANSFORMERS_MODELS_TO_VERA_TARGET_MODULES_MAPPING.copy() TRANSFORMERS_MODELS_TO_PVERA_TARGET_MODULES_MAPPING["dinov2"] = ["query", "value"] diff --git a/tests/test_config.py b/tests/test_config.py index 02cd0004ad..c9ee5894c0 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -30,6 +30,7 @@ CartridgeConfig, CPTConfig, FourierFTConfig, + FRODConfig, GraloraConfig, HiraConfig, HRAConfig, @@ -78,6 +79,7 @@ class TestingCommitHashError(Exception): (BOFTConfig, {}), (C3AConfig, {}), (FourierFTConfig, {}), + (FRODConfig, {}), (GraloraConfig, {}), (HiraConfig, {}), (HRAConfig, {}), diff --git a/tests/test_decoder_models.py b/tests/test_decoder_models.py index f43763a554..b6de666976 100644 --- a/tests/test_decoder_models.py +++ b/tests/test_decoder_models.py @@ -38,6 +38,7 @@ CPTConfig, DeloraConfig, FourierFTConfig, + FRODConfig, GraloraConfig, HiraConfig, HRAConfig, @@ -145,6 +146,14 @@ "target_modules": None, }, ), + ( + FRODConfig, + { + "task_type": "CAUSAL_LM", + "target_modules": None, + "sparse_rate": 0.01, + }, + ), ( GraloraConfig, { diff --git a/tests/test_encoder_decoder_models.py b/tests/test_encoder_decoder_models.py index 6411d30b93..bca8a7d3bf 100644 --- a/tests/test_encoder_decoder_models.py +++ b/tests/test_encoder_decoder_models.py @@ -25,6 +25,7 @@ C3AConfig, DeloraConfig, FourierFTConfig, + FRODConfig, GraloraConfig, HiraConfig, HRAConfig, @@ -108,6 +109,14 @@ "task_type": "SEQ_2_SEQ_LM", }, ), + ( + FRODConfig, + { + "target_modules": None, + "task_type": "SEQ_2_SEQ_LM", + "sparse_rate": 0.01, + }, + ), ( GraloraConfig, { diff --git a/tests/test_feature_extraction_models.py b/tests/test_feature_extraction_models.py index 90493f275c..bece31bf15 100644 --- a/tests/test_feature_extraction_models.py +++ b/tests/test_feature_extraction_models.py @@ -23,6 +23,7 @@ C3AConfig, DeloraConfig, FourierFTConfig, + FRODConfig, GraloraConfig, HiraConfig, HRAConfig, @@ -105,6 +106,14 @@ "target_modules": None, }, ), + ( + FRODConfig, + { + "task_type": "FEATURE_EXTRACTION", + "target_modules": None, + "sparse_rate": 0.01, + }, + ), ( GraloraConfig, { diff --git a/tests/test_seq_classifier.py b/tests/test_seq_classifier.py index 613ca1b8c3..b08f0a9945 100644 --- a/tests/test_seq_classifier.py +++ b/tests/test_seq_classifier.py @@ -23,6 +23,7 @@ C3AConfig, DeloraConfig, FourierFTConfig, + FRODConfig, GraloraConfig, HiraConfig, HRAConfig, @@ -106,6 +107,14 @@ "target_modules": None, }, ), + ( + FRODConfig, + { + "task_type": "SEQ_CLS", + "target_modules": None, + "sparse_rate": 0.01, + }, + ), ( GraloraConfig, { From 0e0d816832535af125e1006e1f305e3dd6c7a2f4 Mon Sep 17 00:00:00 2001 From: Bane-Elvin Date: Thu, 28 May 2026 09:47:47 +0000 Subject: [PATCH 4/9] Address FROD PR review feedback --- docs/source/package_reference/frod.md | 14 +-- .../frod_image_classification.py | 4 +- .../frod_text_classification.py | 4 +- src/peft/__init__.py | 8 +- src/peft/tuners/__init__.py | 6 +- src/peft/tuners/frod/__init__.py | 24 +++- src/peft/tuners/frod/config.py | 6 +- src/peft/tuners/frod/layer.py | 117 +++++++----------- src/peft/tuners/frod/model.py | 62 +++++----- src/peft/utils/save_and_load.py | 5 +- tests/test_config.py | 4 +- tests/test_custom_models.py | 25 ++-- tests/test_decoder_models.py | 4 +- tests/test_encoder_decoder_models.py | 4 +- tests/test_feature_extraction_models.py | 4 +- tests/test_frod.py | 60 ++------- tests/test_initialization.py | 66 ++++++++++ tests/test_seq_classifier.py | 4 +- 18 files changed, 222 insertions(+), 199 deletions(-) diff --git a/docs/source/package_reference/frod.md b/docs/source/package_reference/frod.md index b2c146fe0f..5369494d09 100644 --- a/docs/source/package_reference/frod.md +++ b/docs/source/package_reference/frod.md @@ -23,7 +23,7 @@ allows FRoD to apply full-rank updates while keeping the number of trained param Paper: [Full-Rank Efficient Fine-Tuning with Rotational Degrees](https://doi.org/10.1609/aaai.v40i31.39813). When saving the adapter parameters, it is possible to avoid storing the projection tensors by setting -`save_projection=False` on the `FRODConfig`. In that case, the projections are restored from the base model weights and +`save_projection=False` on the `FrodConfig`. In that case, the projections are restored from the base model weights and the fixed random seed from `projection_prng_key`. This reduces checkpoint size, but the default is `save_projection=True` to make checkpoint loading independent of regeneration details. @@ -41,11 +41,11 @@ FRoD currently has the following constraint: ```python from transformers import AutoModelForSequenceClassification -from peft import FRODConfig, TaskType, get_peft_model +from peft import FrodConfig, TaskType, get_peft_model model = AutoModelForSequenceClassification.from_pretrained("google-bert/bert-base-uncased", num_labels=2) -peft_config = FRODConfig( +peft_config = FrodConfig( task_type=TaskType.SEQ_CLS, target_modules=["query", "value"], modules_to_save=["classifier"], @@ -57,10 +57,10 @@ model = get_peft_model(model, peft_config) model.print_trainable_parameters() ``` -## FRODConfig +## FrodConfig -[[autodoc]] tuners.frod.config.FRODConfig +[[autodoc]] tuners.frod.config.FrodConfig -## FRODModel +## FrodModel -[[autodoc]] tuners.frod.model.FRODModel +[[autodoc]] tuners.frod.model.FrodModel diff --git a/examples/frod_finetuning/frod_image_classification.py b/examples/frod_finetuning/frod_image_classification.py index 3f61f52f32..24e6c345c5 100644 --- a/examples/frod_finetuning/frod_image_classification.py +++ b/examples/frod_finetuning/frod_image_classification.py @@ -9,7 +9,7 @@ from datasets import load_dataset from transformers import AutoImageProcessor, AutoModelForImageClassification, Trainer, TrainingArguments -from peft import FRODConfig, get_peft_model +from peft import FrodConfig, get_peft_model MODEL_NAME = os.environ.get("FROD_IMAGE_MODEL_NAME", "google/vit-base-patch16-224") @@ -61,7 +61,7 @@ def main(): label2id=label2id, ignore_mismatched_sizes=True, ) - peft_config = FRODConfig( + peft_config = FrodConfig( target_modules=["query", "value"], modules_to_save=["classifier"], frod_dropout=0.0, diff --git a/examples/frod_finetuning/frod_text_classification.py b/examples/frod_finetuning/frod_text_classification.py index 105beda9b9..afa801a236 100644 --- a/examples/frod_finetuning/frod_text_classification.py +++ b/examples/frod_finetuning/frod_text_classification.py @@ -13,7 +13,7 @@ TrainingArguments, ) -from peft import FRODConfig, TaskType, get_peft_model +from peft import FrodConfig, TaskType, get_peft_model MODEL_NAME = "google-bert/bert-base-uncased" @@ -36,7 +36,7 @@ def preprocess(batch): tokenized = tokenized.rename_column("label", "labels") model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=2) - peft_config = FRODConfig( + peft_config = FrodConfig( task_type=TaskType.SEQ_CLS, target_modules=["query", "value"], modules_to_save=["classifier"], diff --git a/src/peft/__init__.py b/src/peft/__init__.py index df185d9d76..078e322289 100644 --- a/src/peft/__init__.py +++ b/src/peft/__init__.py @@ -70,8 +70,8 @@ EvaConfig, FourierFTConfig, FourierFTModel, - FRODConfig, - FRODModel, + FrodConfig, + FrodModel, GraloraConfig, GraloraModel, HiraConfig, @@ -200,10 +200,10 @@ "DeloraConfig", "DeloraModel", "EvaConfig", - "FRODConfig", - "FRODModel", "FourierFTConfig", "FourierFTModel", + "FrodConfig", + "FrodModel", "GraloraConfig", "GraloraModel", "HRAConfig", diff --git a/src/peft/tuners/__init__.py b/src/peft/tuners/__init__.py index 9ff82ea725..22908b57a9 100644 --- a/src/peft/tuners/__init__.py +++ b/src/peft/tuners/__init__.py @@ -22,7 +22,7 @@ from .cpt import CPTConfig, CPTEmbedding from .delora import DeloraConfig, DeloraModel from .fourierft import FourierFTConfig, FourierFTModel -from .frod import FRODConfig, FRODModel +from .frod import FrodConfig, FrodModel from .gralora import GraloraConfig, GraloraModel from .hira import HiraConfig, HiraModel from .hra import HRAConfig, HRAModel @@ -94,10 +94,10 @@ "DeloraConfig", "DeloraModel", "EvaConfig", - "FRODConfig", - "FRODModel", "FourierFTConfig", "FourierFTModel", + "FrodConfig", + "FrodModel", "GraloraConfig", "GraloraModel", "HRAConfig", diff --git a/src/peft/tuners/frod/__init__.py b/src/peft/tuners/frod/__init__.py index 3be7152c07..b26001a796 100644 --- a/src/peft/tuners/frod/__init__.py +++ b/src/peft/tuners/frod/__init__.py @@ -1,10 +1,24 @@ +# Copyright 2026-present the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + from peft.utils import register_peft_method -from .config import FRODConfig -from .layer import FRODLayer, Linear -from .model import FRODModel +from .config import FrodConfig +from .layer import FrodLayer, Linear +from .model import FrodModel -__all__ = ["FRODConfig", "FRODLayer", "FRODModel", "Linear"] +__all__ = ["FrodConfig", "FrodLayer", "FrodModel", "Linear"] -register_peft_method(name="frod", config_cls=FRODConfig, model_cls=FRODModel, prefix="frod_") +register_peft_method(name="frod", config_cls=FrodConfig, model_cls=FrodModel, prefix="frod_") diff --git a/src/peft/tuners/frod/config.py b/src/peft/tuners/frod/config.py index 2e5b724768..f581108a89 100644 --- a/src/peft/tuners/frod/config.py +++ b/src/peft/tuners/frod/config.py @@ -1,4 +1,4 @@ -# Copyright 2023-present the HuggingFace Inc. team. +# Copyright 2026-present the HuggingFace Inc. team. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -21,9 +21,9 @@ @dataclass -class FRODConfig(PeftConfig): +class FrodConfig(PeftConfig): """ - This is the configuration class to store the configuration of a [`FRODModel`]. + This is the configuration class to store the configuration of a [`FrodModel`]. Paper: https://doi.org/10.1609/aaai.v40i31.39813. diff --git a/src/peft/tuners/frod/layer.py b/src/peft/tuners/frod/layer.py index 6feb0b0729..1289e7c9c8 100644 --- a/src/peft/tuners/frod/layer.py +++ b/src/peft/tuners/frod/layer.py @@ -1,4 +1,4 @@ -# Copyright 2023-present the HuggingFace Inc. team. +# Copyright 2026-present the HuggingFace Inc. team. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -15,20 +15,18 @@ import warnings from typing import Optional -import numpy as np import torch import torch.nn.functional as F -from numpy.linalg import inv from torch import nn -from transformers.pytorch_utils import Conv1D -from peft.tuners.tuners_utils import BaseTunerLayer, check_adapters_to_merge +from peft.tuners.tuners_utils import BaseTunerLayer, _get_in_out_features, check_adapters_to_merge from peft.utils.other import transpose from .._buffer_dict import BufferDict +from .config import FrodConfig -class FRODLayer(BaseTunerLayer): +class FrodLayer(BaseTunerLayer): adapter_layer_names = ("frod_lambda_l", "frod_lambda_s_values") other_param_names = ("frod_V", "frod_U", "frod_s_indices", "frod_s_size") @@ -41,76 +39,64 @@ def __init__(self, base_layer: nn.Module, **kwargs): self.frod_lambda_l = nn.ParameterDict({}) self.frod_lambda_s_values = nn.ParameterDict({}) - self.frod_s_indices: Optional[BufferDict] = None - self.frod_s_size: Optional[BufferDict] = None - self.frod_V: Optional[BufferDict] = None + self.frod_s_indices: BufferDict = BufferDict({}, persistent=False) + self.frod_s_size: BufferDict = BufferDict({}, persistent=False) + self.frod_V: BufferDict = BufferDict({}, persistent=False) self.frod_U: BufferDict = BufferDict({}, persistent=False) self._disable_adapters = False self.merged_adapters = [] - base_layer = self.get_base_layer() - if isinstance(base_layer, nn.Linear): - in_features, out_features = base_layer.in_features, base_layer.out_features - elif isinstance(base_layer, Conv1D): - in_features, out_features = ( - base_layer.weight.ds_shape if hasattr(base_layer.weight, "ds_shape") else base_layer.weight.shape - ) - - self.in_features = in_features - self.out_features = out_features + self.in_features, self.out_features = _get_in_out_features(self.get_base_layer()) self.kwargs = kwargs - @property - def merged(self) -> bool: - return bool(self.merged_adapters) - def update_layer( self, adapter_name, frod_V: BufferDict, frod_s_indices: BufferDict, frod_s_size: BufferDict, - frod_dropout, - init_weights, + config: FrodConfig, ): + frod_dropout = config.frod_dropout + init_weights = config.init_weights base_layer = self.get_base_layer() - weight = base_layer.weight.T if isinstance(base_layer, Conv1D) else base_layer.weight + weight = transpose(base_layer.weight, self.fan_in_fan_out) device = base_layer.weight.device dtype = base_layer.weight.dtype - param_dtype = dtype - self.r[adapter_name] = self.out_features if frod_dropout > 0.0: frod_dropout_layer = nn.Dropout(p=frod_dropout) else: frod_dropout_layer = nn.Identity() - self.frod_dropout.update(nn.ModuleDict({adapter_name: frod_dropout_layer})) + self.frod_dropout[adapter_name] = frod_dropout_layer + if frod_V is None or frod_s_indices is None or frod_s_size is None: + raise ValueError("The FRoD projection buffers are missing. This should not happen.") if adapter_name not in frod_V: - if not frod_V: - raise ValueError("The FRoD projection buffers are empty. This should not happen.") - frod_V[adapter_name] = next(iter(frod_V.values())) - frod_s_indices[adapter_name] = next(iter(frod_s_indices.values())) - frod_s_size[adapter_name] = next(iter(frod_s_size.values())) + # FRoD projection buffers are shared across adapters for the same module category. + reference_adapter = next(iter(frod_V)) + frod_V[adapter_name] = frod_V[reference_adapter] + frod_s_indices[adapter_name] = frod_s_indices[reference_adapter] + frod_s_size[adapter_name] = frod_s_size[reference_adapter] nnz = frod_s_indices[adapter_name].shape[1] - self.frod_lambda_s_values[adapter_name] = nn.Parameter(torch.zeros(nnz, device=device, dtype=param_dtype)) + self.frod_lambda_s_values[adapter_name] = nn.Parameter(torch.zeros(nnz, device=device, dtype=dtype)) - self.__dict__["frod_V"] = frod_V - self.__dict__["frod_s_indices"] = frod_s_indices - self.__dict__["frod_s_size"] = frod_s_size + self.frod_V[adapter_name] = frod_V[adapter_name] + self.frod_s_indices[adapter_name] = frod_s_indices[adapter_name] + self.frod_s_size[adapter_name] = frod_s_size[adapter_name] # Keep cached projections on CPU and move them lazily in forward. - self.frod_V[adapter_name] = self.frod_V[adapter_name].to(dtype=param_dtype, device="cpu") + self.frod_V[adapter_name] = self.frod_V[adapter_name].to(dtype=dtype, device="cpu") self.frod_s_indices[adapter_name] = self.frod_s_indices[adapter_name].to(device="cpu", dtype=torch.long) self.frod_s_size[adapter_name] = self.frod_s_size[adapter_name].to(device="cpu", dtype=torch.long) U, L = self._calculate_frod_u_and_lambda(self.frod_V[adapter_name], weight) - U = U.to(param_dtype) - L = L.to(device=device, dtype=param_dtype) + U = U.to(dtype) + L = L.to(device=device, dtype=dtype) self.frod_lambda_l[adapter_name] = nn.Parameter(L, requires_grad=True) if init_weights: self.reset_frod_parameters(adapter_name) @@ -125,18 +111,18 @@ def update_layer( self.set_adapter(self.active_adapters) def _calculate_frod_u_and_lambda(self, V, W): - w = W.detach().to(torch.float32).cpu().numpy() - v = V.detach().to(torch.float32).cpu().numpy() + w = W.detach().to(torch.float32).cpu() + v = V.detach().to(torch.float32).cpu() try: - v_inv_T = inv(v).T - except np.linalg.LinAlgError: - v_inv_T = np.linalg.pinv(v, rcond=1e-6).T - Bi = w @ v_inv_T - lambda_l = np.linalg.norm(Bi, axis=0) - u = np.divide(Bi, lambda_l, out=np.zeros_like(Bi), where=lambda_l > 1e-8) - U = torch.from_numpy(u).float() - L = torch.from_numpy(lambda_l).float() - return U, L + v_inv_T = torch.linalg.inv(v).T + except RuntimeError: + v_inv_T = torch.linalg.pinv(v, rtol=1e-6).T + bi = w @ v_inv_T + lambda_l = torch.linalg.norm(bi, dim=0) + u = torch.zeros_like(bi) + nonzero = lambda_l > 1e-8 + u[:, nonzero] = bi[:, nonzero] / lambda_l[nonzero] + return u.float(), lambda_l.float() def reset_frod_parameters(self, adapter_name): if adapter_name in self.frod_lambda_s_values: @@ -147,7 +133,7 @@ def reset_frod_parameters(self, adapter_name): nn.init.zeros_(self.frod_lambda_l[adapter_name]) -class Linear(nn.Linear, FRODLayer): +class Linear(nn.Linear, FrodLayer): def __init__( self, base_layer, @@ -155,18 +141,16 @@ def __init__( frod_s_indices: BufferDict, frod_s_size: BufferDict, adapter_name: str, - frod_dropout: float = 0.0, - fan_in_fan_out: bool = False, + config: FrodConfig, is_target_conv_1d_layer: bool = False, - init_weights: bool = True, **kwargs, ) -> None: super(nn.Linear, self).__init__() - FRODLayer.__init__(self, base_layer, **kwargs) - self.fan_in_fan_out = fan_in_fan_out + FrodLayer.__init__(self, base_layer, **kwargs) + self.fan_in_fan_out = config.fan_in_fan_out self._active_adapter = adapter_name - self.update_layer(adapter_name, frod_V, frod_s_indices, frod_s_size, frod_dropout, init_weights) + self.update_layer(adapter_name, frod_V, frod_s_indices, frod_s_size, config=config) self.is_target_conv_1d_layer = is_target_conv_1d_layer def merge(self, safe_merge: bool = False, adapter_names: Optional[list[str]] = None) -> None: @@ -206,11 +190,7 @@ def get_delta_weight(self, adapter) -> torch.Tensor: U = self.frod_U[adapter].to(device=device, dtype=dtype) V = self.frod_V[adapter].to(device=device, dtype=dtype) indices = self.frod_s_indices[adapter].to(device=U.device, dtype=torch.long) - size_tensor = self.frod_s_size[adapter] - if isinstance(size_tensor, torch.Tensor): - size = tuple(int(dim) for dim in size_tensor.tolist()) - else: - size = tuple(int(dim) for dim in size_tensor) + size = tuple(int(dim) for dim in self.frod_s_size[adapter].tolist()) values = self.frod_lambda_s_values[adapter].to(U.device, U.dtype).clone() lambda_l = self.frod_lambda_l[adapter].to(device=U.device, dtype=U.dtype) @@ -239,11 +219,7 @@ def forward(self, x: torch.Tensor, *args, **kwargs) -> torch.Tensor: V = self.frod_V[active_adapter].to(device=x.device, dtype=target_dtype) U = self.frod_U[active_adapter].to(device=x.device, dtype=target_dtype) indices = self.frod_s_indices[active_adapter].to(device=x.device, dtype=torch.long) - size_tensor = self.frod_s_size[active_adapter] - if isinstance(size_tensor, torch.Tensor): - size = tuple(int(dim) for dim in size_tensor.tolist()) - else: - size = tuple(int(dim) for dim in size_tensor) + size = tuple(int(dim) for dim in self.frod_s_size[active_adapter].tolist()) values = self.frod_lambda_s_values[active_adapter].to(device=x.device, dtype=target_dtype) lambda_l = self.frod_lambda_l[active_adapter].to(device=x.device, dtype=target_dtype) @@ -254,6 +230,8 @@ def forward(self, x: torch.Tensor, *args, **kwargs) -> torch.Tensor: h_flat = h.reshape(-1, h.shape[-1]) z_flat = torch.matmul(h_flat, V) + # This block computes the sparse FRoD update z @ S with torch.sparse.mm. + # CUDA sparse fp16/bf16 kernels are less reliable, so use fp32 here and cast the update back below. matmul_dtype = z_flat.dtype if z_flat.is_cuda and matmul_dtype in (torch.float16, torch.bfloat16): matmul_dtype = torch.float32 @@ -279,6 +257,7 @@ def forward(self, x: torch.Tensor, *args, **kwargs) -> torch.Tensor: return result def __repr__(self) -> str: + # Match PEFT tuner convention so printed models show FRoD-wrapped layers as `frod.*`. rep = super().__repr__() return "frod." + rep diff --git a/src/peft/tuners/frod/model.py b/src/peft/tuners/frod/model.py index 2968e65416..5d3dd5c624 100644 --- a/src/peft/tuners/frod/model.py +++ b/src/peft/tuners/frod/model.py @@ -17,9 +17,7 @@ import warnings from collections import defaultdict -import numpy as np import torch -from numpy.linalg import qr from torch import nn from transformers.pytorch_utils import Conv1D @@ -28,8 +26,8 @@ from .._buffer_dict import BufferDict from ..tuners_utils import _maybe_include_all_linear_layers -from .config import FRODConfig -from .layer import FRODLayer, Linear +from .config import FrodConfig +from .layer import FrodLayer, Linear def _category_from_key(key: str) -> str: @@ -57,13 +55,13 @@ def _layer_index_from_key(key: str, fallback: int) -> int: return fallback -def _projection_from_weights(matrices: list[np.ndarray], regularization_alpha: float) -> np.ndarray: - stacked = np.vstack(matrices) +def _projection_from_weights(matrices: list[torch.Tensor], regularization_alpha: float) -> torch.Tensor: + stacked = torch.cat(matrices, dim=0) if stacked.shape[0] < stacked.shape[1]: - _, _, vh = np.linalg.svd(stacked, full_matrices=True) + _, _, vh = torch.linalg.svd(stacked, full_matrices=True) return vh.T - q_matrix, r_matrix = qr(stacked) + q_matrix, r_matrix = torch.linalg.qr(stacked) q_slices = [] start = 0 for matrix in matrices: @@ -72,23 +70,23 @@ def _projection_from_weights(matrices: list[np.ndarray], regularization_alpha: f start += rows dim = r_matrix.shape[1] - t_pi = np.zeros((dim, dim), dtype=r_matrix.dtype) + t_pi = torch.zeros((dim, dim), dtype=r_matrix.dtype) # Layers of the same projection category can be highly correlated; this ridge term keeps the inverse stable. for q_slice in q_slices: - q_term = q_slice.T @ q_slice + regularization_alpha * np.eye(dim, dtype=r_matrix.dtype) - t_pi += np.linalg.inv(q_term) + q_term = q_slice.T @ q_slice + regularization_alpha * torch.eye(dim, dtype=r_matrix.dtype) + t_pi += torch.linalg.inv(q_term) t_pi /= len(q_slices) - _, eigenvectors = np.linalg.eigh(t_pi) + _, eigenvectors = torch.linalg.eigh(t_pi) return r_matrix.T @ eigenvectors -class FRODModel(BaseTuner): +class FrodModel(BaseTuner): prefix: str = "frod_" - tuner_layer_cls = FRODLayer + tuner_layer_cls = FrodLayer target_module_mapping = TRANSFORMERS_MODELS_TO_FROD_TARGET_MODULES_MAPPING - def _init_frod_projections(self, config: FRODConfig, adapter_name: str) -> None: + def _init_frod_projections(self, config: FrodConfig, adapter_name: str) -> None: weights = defaultdict(dict) model_config = self.get_model_config(self.model) peft_config = self._prepare_adapter_config(config, model_config) @@ -116,6 +114,8 @@ def _init_frod_projections(self, config: FRODConfig, adapter_name: str) -> None: "No layer types compatible with FRoD were found. Please check `peft_config.target_modules`." ) + # BaseTuner.__init__() enters the pre-injection flow before a FrodModel subclass + # could assign ModuleDicts after super().__init__(), so create these containers lazily here. if not hasattr(self, "frod_V"): self.frod_V = nn.ModuleDict() self.frod_s_indices = nn.ModuleDict() @@ -125,7 +125,7 @@ def _init_frod_projections(self, config: FRODConfig, adapter_name: str) -> None: categories = {category for layer_dict in weights.values() for category in layer_dict} for category in sorted(categories): matrices = [ - layer_dict[category].detach().to(torch.float32).cpu().numpy() + layer_dict[category].detach().to(torch.float32).cpu() for _, layer_dict in sorted(weights.items()) if category in layer_dict ] @@ -134,7 +134,7 @@ def _init_frod_projections(self, config: FRODConfig, adapter_name: str) -> None: v_matrix = _projection_from_weights(matrices, config.regularization_alpha) example_weight = next(layer_dict[category] for layer_dict in weights.values() if category in layer_dict) - v_tensor = torch.from_numpy(v_matrix).to(dtype=example_weight.dtype, device="cpu") + v_tensor = v_matrix.to(dtype=example_weight.dtype, device="cpu") if category not in self.frod_V: self.frod_V[category] = BufferDict({}, persistent=config.save_projection) @@ -161,10 +161,10 @@ def _init_frod_projections(self, config: FRODConfig, adapter_name: str) -> None: self.frod_s_size[category] = BufferDict({}, persistent=config.save_projection) self.frod_s_size[category][adapter_name] = size - def _pre_injection_hook(self, model: nn.Module, config: FRODConfig, adapter_name: str) -> None: + def _pre_injection_hook(self, model: nn.Module, config: FrodConfig, adapter_name: str) -> None: self._init_frod_projections(config, adapter_name) - def _check_new_adapter_config(self, config: FRODConfig) -> None: + def _check_new_adapter_config(self, config: FrodConfig) -> None: super()._check_new_adapter_config(config) for existing_config in self.peft_config.values(): @@ -186,7 +186,7 @@ def _check_new_adapter_config(self, config: FRODConfig) -> None: def _create_and_replace( self, - frod_config, + frod_config: FrodConfig, adapter_name, target, target_name, @@ -201,12 +201,6 @@ def _create_and_replace( if category not in self.frod_V: self._init_frod_projections(frod_config, adapter_name) bias = hasattr(target, "bias") and target.bias is not None - kwargs = { - "frod_dropout": frod_config.frod_dropout, - "fan_in_fan_out": frod_config.fan_in_fan_out, - "init_weights": frod_config.init_weights, - "bias": bias, - } if isinstance(target, Linear): target.update_layer( @@ -214,8 +208,7 @@ def _create_and_replace( self.frod_V[category], self.frod_s_indices[category], self.frod_s_size[category], - frod_config.frod_dropout, - frod_config.init_weights, + config=frod_config, ) else: new_module = self._create_new_module( @@ -225,7 +218,7 @@ def _create_and_replace( self.frod_s_size[category], adapter_name, target, - **kwargs, + bias=bias, ) if adapter_name not in self.active_adapters: new_module.requires_grad_(False) @@ -233,7 +226,7 @@ def _create_and_replace( @staticmethod def _create_new_module( - frod_config, + frod_config: FrodConfig, frod_V, frod_s_indices, frod_s_size, @@ -249,19 +242,19 @@ def _create_new_module( target_base_layer = target if isinstance(target_base_layer, torch.nn.Linear): - if kwargs["fan_in_fan_out"]: + if frod_config.fan_in_fan_out: warnings.warn( "fan_in_fan_out is set to True but the target module is `torch.nn.Linear`. " "Setting fan_in_fan_out to False." ) - kwargs["fan_in_fan_out"] = frod_config.fan_in_fan_out = False + frod_config.fan_in_fan_out = False elif isinstance(target_base_layer, Conv1D): kwargs["is_target_conv_1d_layer"] = True - if not kwargs["fan_in_fan_out"]: + if not frod_config.fan_in_fan_out: warnings.warn( "fan_in_fan_out is set to False but the target module is `Conv1D`. Setting fan_in_fan_out to True." ) - kwargs["fan_in_fan_out"] = frod_config.fan_in_fan_out = True + frod_config.fan_in_fan_out = True else: raise TypeError( f"Target module {target} is not supported. Currently, only the following modules are supported: " @@ -274,6 +267,7 @@ def _create_new_module( frod_s_indices, frod_s_size, adapter_name, + config=frod_config, bias=bias, **kwargs, ) diff --git a/src/peft/utils/save_and_load.py b/src/peft/utils/save_and_load.py index 65cd7bf502..a2148f28a9 100644 --- a/src/peft/utils/save_and_load.py +++ b/src/peft/utils/save_and_load.py @@ -829,7 +829,10 @@ def set_peft_model_state_dict( ) if config.save_projection and not has_projection: raise ValueError( - "Specified to load FRoD projection tensors from state dictionary however they were not present!" + "Specified to load FRoD projection tensors from state dictionary however they were not present. " + "If this checkpoint was saved with `save_projection=False`, set `peft_config.save_projection` " + "to `False` before loading so the projections are regenerated from the base model weights. " + "Otherwise, re-save the adapter with `save_projection=True` to include these tensors." ) elif not config.save_projection and has_projection: warnings.warn( diff --git a/tests/test_config.py b/tests/test_config.py index c9ee5894c0..b7a101a744 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -30,7 +30,7 @@ CartridgeConfig, CPTConfig, FourierFTConfig, - FRODConfig, + FrodConfig, GraloraConfig, HiraConfig, HRAConfig, @@ -79,7 +79,7 @@ class TestingCommitHashError(Exception): (BOFTConfig, {}), (C3AConfig, {}), (FourierFTConfig, {}), - (FRODConfig, {}), + (FrodConfig, {}), (GraloraConfig, {}), (HiraConfig, {}), (HRAConfig, {}), diff --git a/tests/test_custom_models.py b/tests/test_custom_models.py index 92811b85e0..358ea319b7 100644 --- a/tests/test_custom_models.py +++ b/tests/test_custom_models.py @@ -37,7 +37,7 @@ C3AConfig, DeloraConfig, FourierFTConfig, - FRODConfig, + FrodConfig, GraloraConfig, HiraConfig, HRAConfig, @@ -874,11 +874,11 @@ ######## # FRoD # ######## - ("Vanilla MLP 1 FRoD", "MLP", FRODConfig, {"target_modules": "lin0"}), - ("Vanilla MLP 2 FRoD", "MLP", FRODConfig, {"target_modules": ["lin0"]}), - ("Vanilla MLP 3 FRoD", "MLP", FRODConfig, {"target_modules": ["lin1"]}), - ("Vanilla MLP 4 FRoD", "MLP", FRODConfig, {"target_modules": ["lin0", "lin1"]}), - ("Vanilla MLP 5 FRoD", "MLP", FRODConfig, {"target_modules": ["lin0"], "modules_to_save": ["lin1"]}), + ("Vanilla MLP 1 FRoD", "MLP", FrodConfig, {"target_modules": "lin0"}), + ("Vanilla MLP 2 FRoD", "MLP", FrodConfig, {"target_modules": ["lin0"]}), + ("Vanilla MLP 3 FRoD", "MLP", FrodConfig, {"target_modules": ["lin1"]}), + ("Vanilla MLP 4 FRoD", "MLP", FrodConfig, {"target_modules": ["lin0", "lin1"]}), + ("Vanilla MLP 5 FRoD", "MLP", FrodConfig, {"target_modules": ["lin0"], "modules_to_save": ["lin1"]}), ####### # C3A # ####### @@ -1399,6 +1399,13 @@ {"target_modules": ["lin0"], "init_weights": False}, {"target_modules": ["lin0"], "init_weights": False}, ), + ( + "FRoD Same", + "frod", + FrodConfig, + {"target_modules": ["lin0"], "init_weights": False}, + {"target_modules": ["lin0"], "init_weights": False}, + ), # Note: PVeRA presents the same problem mentioned above for VeRA. ( "PVeRA Same", @@ -3465,7 +3472,7 @@ def test_multiple_adapters_automatic_modules_to_save(self): @pytest.mark.parametrize( "config_cls", - [IA3Config, BeftConfig, FRODConfig, LoHaConfig, LoKrConfig, LoraConfig, HRAConfig, ShiraConfig, MissConfig], + [IA3Config, BeftConfig, FrodConfig, LoHaConfig, LoKrConfig, LoraConfig, HRAConfig, ShiraConfig, MissConfig], ) def test_multiple_adapters_mixed_modules_to_save(self, config_cls): # See issue 1574 @@ -3498,7 +3505,7 @@ def test_multiple_adapters_mixed_modules_to_save(self, config_cls): @pytest.mark.parametrize( "config_cls", - [IA3Config, BeftConfig, FRODConfig, LoHaConfig, LoKrConfig, LoraConfig, HRAConfig, ShiraConfig], + [IA3Config, BeftConfig, FrodConfig, LoHaConfig, LoKrConfig, LoraConfig, HRAConfig, ShiraConfig], ) def test_multiple_adapters_mixed_modules_to_save_order_switched(self, config_cls): # See issue 1574 @@ -3841,7 +3848,7 @@ def test_load_resized_embedding_ignore_mismatched_sizes(self): AdaLoraConfig(target_modules=["lin0"], init_lora_weights=False, total_step=1), IA3Config(target_modules=["lin0"], feedforward_modules=["lin0"], init_ia3_weights=False), BeftConfig(target_modules=["lin0"], init_weights=False), - FRODConfig(target_modules=["lin0"], init_weights=False), + FrodConfig(target_modules=["lin0"], init_weights=False), OFTConfig(target_modules=["lin0"], init_weights=False, r=2, oft_block_size=0), BOFTConfig(target_modules=["lin0"], init_weights=False, boft_block_size=2), HRAConfig(target_modules=["lin0"], init_weights=False), diff --git a/tests/test_decoder_models.py b/tests/test_decoder_models.py index b6de666976..c1e4050e31 100644 --- a/tests/test_decoder_models.py +++ b/tests/test_decoder_models.py @@ -38,7 +38,7 @@ CPTConfig, DeloraConfig, FourierFTConfig, - FRODConfig, + FrodConfig, GraloraConfig, HiraConfig, HRAConfig, @@ -147,7 +147,7 @@ }, ), ( - FRODConfig, + FrodConfig, { "task_type": "CAUSAL_LM", "target_modules": None, diff --git a/tests/test_encoder_decoder_models.py b/tests/test_encoder_decoder_models.py index bca8a7d3bf..603abbc495 100644 --- a/tests/test_encoder_decoder_models.py +++ b/tests/test_encoder_decoder_models.py @@ -25,7 +25,7 @@ C3AConfig, DeloraConfig, FourierFTConfig, - FRODConfig, + FrodConfig, GraloraConfig, HiraConfig, HRAConfig, @@ -110,7 +110,7 @@ }, ), ( - FRODConfig, + FrodConfig, { "target_modules": None, "task_type": "SEQ_2_SEQ_LM", diff --git a/tests/test_feature_extraction_models.py b/tests/test_feature_extraction_models.py index bece31bf15..fe9450ca2f 100644 --- a/tests/test_feature_extraction_models.py +++ b/tests/test_feature_extraction_models.py @@ -23,7 +23,7 @@ C3AConfig, DeloraConfig, FourierFTConfig, - FRODConfig, + FrodConfig, GraloraConfig, HiraConfig, HRAConfig, @@ -107,7 +107,7 @@ }, ), ( - FRODConfig, + FrodConfig, { "task_type": "FEATURE_EXTRACTION", "target_modules": None, diff --git a/tests/test_frod.py b/tests/test_frod.py index c21b3040f6..19bd494c39 100644 --- a/tests/test_frod.py +++ b/tests/test_frod.py @@ -22,7 +22,7 @@ from safetensors import safe_open from torch import nn -from peft import FRODConfig, PeftModel, get_peft_model +from peft import FrodConfig, PeftModel, get_peft_model class MLP(nn.Module): @@ -47,7 +47,7 @@ def forward(self, X): return X -class TestFROD: +class TestFrod: @pytest.fixture def mlp(self): torch.manual_seed(0) @@ -58,59 +58,19 @@ def mlp(self): def mlp_same_prng(self, mlp): torch.manual_seed(0) - config = FRODConfig(target_modules=["lin1", "lin2"], init_weights=False) + config = FrodConfig(target_modules=["lin1", "lin2"], init_weights=False) peft_model = get_peft_model(mlp, config) - config2 = FRODConfig(target_modules=["lin1", "lin2"], init_weights=False) + config2 = FrodConfig(target_modules=["lin1", "lin2"], init_weights=False) peft_model.add_adapter("other", config2) return peft_model - @staticmethod - def _make_second_adapter_different(peft_model): - with torch.no_grad(): - for module in peft_model.base_model.model.modules(): - if hasattr(module, "frod_lambda_l") and "second" in module.frod_lambda_l: - module.frod_lambda_l["second"].add_(0.1) - - def test_multiple_adapters_same_prng_projection_buffers(self, mlp_same_prng): - # Multiple adapters with the same PRNG key share fixed projection buffers within each FRoD layer. - assert ( - mlp_same_prng.base_model.model.lin1.frod_V["default"].data_ptr() - == mlp_same_prng.base_model.model.lin1.frod_V["other"].data_ptr() - ) - assert ( - mlp_same_prng.base_model.model.lin1.frod_s_indices["default"].data_ptr() - == mlp_same_prng.base_model.model.lin1.frod_s_indices["other"].data_ptr() - ) - assert ( - mlp_same_prng.base_model.model.lin2.frod_V["default"].data_ptr() - == mlp_same_prng.base_model.model.lin2.frod_V["other"].data_ptr() - ) - assert ( - mlp_same_prng.base_model.model.lin2.frod_s_indices["default"].data_ptr() - == mlp_same_prng.base_model.model.lin2.frod_s_indices["other"].data_ptr() - ) - - def test_multiple_adapters_different_prng_raises(self): - model = MLP() - config = FRODConfig(target_modules=["lin1", "lin2"], init_weights=False) - peft_model = get_peft_model(model, config) - config2 = FRODConfig(target_modules=["lin1", "lin2"], init_weights=False, projection_prng_key=123) - - msg = ( - r"FRoD projection initialization key must be the same for all adapters. Got " - r"config.projection_prng_key=123 but previous config had 0" - ) - with pytest.raises(ValueError, match=msg): - peft_model.add_adapter("other", config2) - def test_multiple_adapters_save_load_save_projection_false(self, mlp, tmp_path): # Check saving and loading works with multiple adapters without saved projection tensors. torch.manual_seed(1) - config = FRODConfig(target_modules=["lin1", "lin2"], init_weights=False, save_projection=False) + config = FrodConfig(target_modules=["lin1", "lin2"], init_weights=False, save_projection=False) peft_model = get_peft_model(mlp, config, adapter_name="first") - config2 = FRODConfig(target_modules=["lin1", "lin2"], init_weights=False, save_projection=False) + config2 = FrodConfig(target_modules=["lin1", "lin2"], init_weights=False, save_projection=False) peft_model.add_adapter("second", config2) - self._make_second_adapter_different(peft_model) peft_model.eval() input = torch.randn(5, 10) @@ -141,7 +101,7 @@ def test_multiple_adapters_save_load_save_projection_false(self, mlp, tmp_path): assert torch.allclose(output_second, output_second_loaded, atol=1e-3, rtol=1e-3) def test_save_projection_false_contains_no_frod_projection_tensors(self, mlp, tmp_path): - config = FRODConfig(target_modules=["lin1", "lin2"], init_weights=False, save_projection=False) + config = FrodConfig(target_modules=["lin1", "lin2"], init_weights=False, save_projection=False) peft_model = get_peft_model(mlp, config) save_path = tmp_path / "frod" @@ -158,7 +118,7 @@ def test_save_projection_false_contains_no_frod_projection_tensors(self, mlp, tm assert not any("frod_U" in key for key in state_dict) def test_save_projection_true_contains_top_level_projection_tensors_only(self, mlp, tmp_path): - config = FRODConfig(target_modules=["lin1", "lin2"], init_weights=False) + config = FrodConfig(target_modules=["lin1", "lin2"], init_weights=False) peft_model = get_peft_model(mlp, config) save_path = tmp_path / "frod" @@ -208,7 +168,7 @@ def test_frod_lambda_dont_share_memory(self, mlp_same_prng): ) def test_frod_different_shapes(self, mlp): - config = FRODConfig(target_modules=["lin0", "lin3"], init_weights=False) + config = FrodConfig(target_modules=["lin0", "lin3"], init_weights=False) mlp_different_shapes = get_peft_model(mlp, config) assert mlp.lin0.base_layer.weight.shape != mlp.lin3.base_layer.weight.shape @@ -231,7 +191,7 @@ def test_frod_dtypes(self, dtype): pytest.skip("bfloat16 not supported on this system, skipping the test") model = MLP().to(dtype) - config = FRODConfig(target_modules=["lin1", "lin2"], init_weights=False) + config = FrodConfig(target_modules=["lin1", "lin2"], init_weights=False) peft_model = get_peft_model(model, config) inputs = torch.randn(5, 10).to(dtype) output = peft_model(inputs) diff --git a/tests/test_initialization.py b/tests/test_initialization.py index 27d9ba16e2..6970d35904 100644 --- a/tests/test_initialization.py +++ b/tests/test_initialization.py @@ -35,6 +35,7 @@ C3AConfig, DeloraConfig, EvaConfig, + FrodConfig, GraloraConfig, IA3Config, LilyConfig, @@ -1835,6 +1836,71 @@ def test_vera_add_second_adapter_with_higher_rank(self): model.add_adapter("other", config1) +class TestFrodInitialization: + torch_device = infer_device() + + def get_model(self): + class MLP(nn.Module): + def __init__(self, bias=True): + super().__init__() + self.lin0 = nn.Linear(10, 20, bias=bias) + self.lin1 = nn.Linear(20, 20, bias=bias) + self.lin2 = nn.Linear(20, 2, bias=bias) + + def forward(self, X): + X = self.lin0(X) + X = self.lin1(X) + X = self.lin2(X) + return X + + return MLP().to(self.torch_device) + + def test_frod_multiple_adapters_same_prng_share_projection_buffers(self): + torch.manual_seed(0) + config0 = FrodConfig(target_modules=["lin1", "lin2"], init_weights=False) + model = get_peft_model(self.get_model().cpu(), config0) + + config1 = FrodConfig(target_modules=["lin1", "lin2"], init_weights=False) + model.add_adapter("other", config1) + + assert model.base_model.model.lin1.frod_V["default"].data_ptr() == ( + model.base_model.model.lin1.frod_V["other"].data_ptr() + ) + assert model.base_model.model.lin1.frod_s_indices["default"].data_ptr() == ( + model.base_model.model.lin1.frod_s_indices["other"].data_ptr() + ) + assert model.base_model.model.lin2.frod_V["default"].data_ptr() == ( + model.base_model.model.lin2.frod_V["other"].data_ptr() + ) + assert model.base_model.model.lin2.frod_s_indices["default"].data_ptr() == ( + model.base_model.model.lin2.frod_s_indices["other"].data_ptr() + ) + + def test_frod_mixing_save_projection_raises(self): + config0 = FrodConfig(target_modules=["lin0"], init_weights=False, save_projection=True) + model = get_peft_model(self.get_model(), config0) + + config1 = FrodConfig(target_modules=["lin0"], init_weights=False, save_projection=False) + msg = re.escape( + "FRoD projection weights must be saved for all adapters or none, but got multiple different values: " + "[False, True]" + ) + with pytest.raises(ValueError, match=msg): + model.add_adapter("other", config1) + + def test_frod_add_second_adapter_with_different_prng_key_raises(self): + config0 = FrodConfig(target_modules=["lin0"], init_weights=False) + model = get_peft_model(self.get_model(), config0) + + config1 = FrodConfig(target_modules=["lin0"], init_weights=False, projection_prng_key=123) + msg = re.escape( + "FRoD projection initialization key must be the same for all adapters. Got " + "config.projection_prng_key=123 but previous config had 0." + ) + with pytest.raises(ValueError, match=msg): + model.add_adapter("other", config1) + + class TestVeloraInitialization: @pytest.mark.parametrize( "config_kwargs, msg", diff --git a/tests/test_seq_classifier.py b/tests/test_seq_classifier.py index b08f0a9945..c7de1272d3 100644 --- a/tests/test_seq_classifier.py +++ b/tests/test_seq_classifier.py @@ -23,7 +23,7 @@ C3AConfig, DeloraConfig, FourierFTConfig, - FRODConfig, + FrodConfig, GraloraConfig, HiraConfig, HRAConfig, @@ -108,7 +108,7 @@ }, ), ( - FRODConfig, + FrodConfig, { "task_type": "SEQ_CLS", "target_modules": None, From 17353444ac7cb54b848b0a7923656d01126e3d61 Mon Sep 17 00:00:00 2001 From: Bane-Elvin Date: Fri, 29 May 2026 04:48:04 +0000 Subject: [PATCH 5/9] Fix FRoD sparse forward semantics --- src/peft/tuners/frod/layer.py | 55 ++++++++++++++++++++++------------- tests/test_frod.py | 20 +++++++++++++ 2 files changed, 55 insertions(+), 20 deletions(-) diff --git a/src/peft/tuners/frod/layer.py b/src/peft/tuners/frod/layer.py index 1289e7c9c8..1680c07c21 100644 --- a/src/peft/tuners/frod/layer.py +++ b/src/peft/tuners/frod/layer.py @@ -46,6 +46,7 @@ def __init__(self, base_layer: nn.Module, **kwargs): self._disable_adapters = False self.merged_adapters = [] + self._frod_merged_delta = {} self.in_features, self.out_features = _get_in_out_features(self.get_base_layer()) self.kwargs = kwargs @@ -103,24 +104,23 @@ def update_layer( else: # PEFT convention: init_weights=False should produce a non-identity adapter for merge tests. with torch.no_grad(): - nn.init.normal_(self.frod_lambda_s_values[adapter_name], std=0.02) - self.frod_lambda_l[adapter_name].add_(torch.randn_like(self.frod_lambda_l[adapter_name]) * 0.02) + nn.init.normal_(self.frod_lambda_s_values[adapter_name], std=0.05) + self.frod_lambda_l[adapter_name].add_(torch.randn_like(self.frod_lambda_l[adapter_name]) * 0.05) self.frod_U[adapter_name] = U.cpu() self._move_adapter_to_device_of_base_layer(adapter_name) self.set_adapter(self.active_adapters) def _calculate_frod_u_and_lambda(self, V, W): - w = W.detach().to(torch.float32).cpu() - v = V.detach().to(torch.float32).cpu() + w = W.detach().to(torch.float64).cpu() + v = V.detach().to(torch.float64).cpu() try: - v_inv_T = torch.linalg.inv(v).T + bi = torch.linalg.solve(v, w.T).T except RuntimeError: - v_inv_T = torch.linalg.pinv(v, rtol=1e-6).T - bi = w @ v_inv_T + bi = w @ torch.linalg.pinv(v, rtol=1e-6).T lambda_l = torch.linalg.norm(bi, dim=0) u = torch.zeros_like(bi) - nonzero = lambda_l > 1e-8 + nonzero = lambda_l > 1e-12 u[:, nonzero] = bi[:, nonzero] / lambda_l[nonzero] return u.float(), lambda_l.float() @@ -128,9 +128,6 @@ def reset_frod_parameters(self, adapter_name): if adapter_name in self.frod_lambda_s_values: with torch.no_grad(): nn.init.zeros_(self.frod_lambda_s_values[adapter_name]) - if adapter_name in self.frod_lambda_l: - with torch.no_grad(): - nn.init.zeros_(self.frod_lambda_l[adapter_name]) class Linear(nn.Linear, FrodLayer): @@ -158,19 +155,22 @@ def merge(self, safe_merge: bool = False, adapter_names: Optional[list[str]] = N if not adapter_names: return + base_layer = self.get_base_layer() + base_weight = base_layer.weight.data.clone() for active_adapter in adapter_names: if active_adapter in self.frod_lambda_l.keys(): - base_layer = self.get_base_layer() + delta_weight = self._get_delta_weight(active_adapter, base_weight=base_weight) if safe_merge: orig_weights = base_layer.weight.data.clone() - orig_weights += self.get_delta_weight(active_adapter) + orig_weights += delta_weight if not torch.isfinite(orig_weights).all(): raise ValueError( f"NaNs detected in the merged weights. The adapter {active_adapter} seems to be broken" ) base_layer.weight.data = orig_weights else: - base_layer.weight.data += self.get_delta_weight(active_adapter) + base_layer.weight.data += delta_weight + self._frod_merged_delta[active_adapter] = delta_weight self.merged_adapters.append(active_adapter) def unmerge(self) -> None: @@ -181,12 +181,19 @@ def unmerge(self) -> None: while len(self.merged_adapters) > 0: active_adapter = self.merged_adapters.pop() if active_adapter in self.frod_lambda_l.keys(): - self.get_base_layer().weight.data -= self.get_delta_weight(active_adapter) + delta_weight = self._frod_merged_delta.pop(active_adapter, None) + if delta_weight is None: + delta_weight = self.get_delta_weight(active_adapter) + self.get_base_layer().weight.data -= delta_weight def get_delta_weight(self, adapter) -> torch.Tensor: - weight = self.get_base_layer().weight + return self._get_delta_weight(adapter) + + def _get_delta_weight(self, adapter, base_weight: Optional[torch.Tensor] = None) -> torch.Tensor: + weight = self.get_base_layer().weight if base_weight is None else base_weight device = weight.device dtype = weight.dtype + base_weight = transpose(weight, self.fan_in_fan_out) U = self.frod_U[adapter].to(device=device, dtype=dtype) V = self.frod_V[adapter].to(device=device, dtype=dtype) indices = self.frod_s_indices[adapter].to(device=U.device, dtype=torch.long) @@ -197,8 +204,11 @@ def get_delta_weight(self, adapter) -> torch.Tensor: S_sparse = torch.sparse_coo_tensor(indices, values, size).coalesce() S = S_sparse.to_dense() L = torch.diag_embed(lambda_l) + frod_weight = U @ (S + L) @ V.T - return transpose(U @ (S + L).T @ V.T, self.fan_in_fan_out) + # FRoD parameterizes the adapted weight itself. Return only the difference so PEFT merge/unmerge and + # disable-adapter behavior preserve the base model while the active adapter still replaces the base weight. + return transpose(frod_weight - base_weight, self.fan_in_fan_out) def forward(self, x: torch.Tensor, *args, **kwargs) -> torch.Tensor: previous_dtype = x.dtype @@ -230,7 +240,8 @@ def forward(self, x: torch.Tensor, *args, **kwargs) -> torch.Tensor: h_flat = h.reshape(-1, h.shape[-1]) z_flat = torch.matmul(h_flat, V) - # This block computes the sparse FRoD update z @ S with torch.sparse.mm. + # This block computes the sparse FRoD update z @ S.T with torch.sparse.mm, matching + # F.linear(h, U @ (S + diag(lambda_l)) @ V.T). # CUDA sparse fp16/bf16 kernels are less reliable, so use fp32 here and cast the update back below. matmul_dtype = z_flat.dtype if z_flat.is_cuda and matmul_dtype in (torch.float16, torch.bfloat16): @@ -241,7 +252,7 @@ def forward(self, x: torch.Tensor, *args, **kwargs) -> torch.Tensor: S_sparse = torch.sparse_coo_tensor(indices, values, size).coalesce() if S_sparse.dtype != matmul_dtype: S_sparse = S_sparse.to(dtype=matmul_dtype) - z_S_flat = torch.sparse.mm(S_sparse.t(), z_flat_mm.t()).t() + z_S_flat = torch.sparse.mm(S_sparse, z_flat_mm.t()).t() lambda_l = lambda_l.to(device=z_flat.device, dtype=matmul_dtype) z_L_flat = z_flat_mm * lambda_l @@ -250,8 +261,12 @@ def forward(self, x: torch.Tensor, *args, **kwargs) -> torch.Tensor: out_add_flat = F.linear(z_S_flat + z_L_flat, U_mm) out_add_flat = out_add_flat.to(target_dtype) out_add = out_add_flat.reshape(*batch_shape, out_add_flat.shape[-1]) + base_weight = transpose(self.get_base_layer().weight, self.fan_in_fan_out).to( + device=x.device, dtype=target_dtype + ) + base_out = F.linear(x, base_weight) - result = result + out_add + result = result - base_out + out_add result = result.to(previous_dtype) return result diff --git a/tests/test_frod.py b/tests/test_frod.py index 19bd494c39..88ca464fe5 100644 --- a/tests/test_frod.py +++ b/tests/test_frod.py @@ -135,6 +135,26 @@ def test_save_projection_true_contains_top_level_projection_tensors_only(self, m assert not any(".model.lin1.frod_V" in key for key in keys) assert not any("frod_U" in key for key in keys) + def test_frod_default_initialization_reconstructs_base_weight(self, mlp): + torch.manual_seed(0) + mlp.eval() + inputs = torch.randn(5, 10) + expected = mlp(inputs) + + config = FrodConfig(target_modules=["lin1", "lin2"]) + peft_model = get_peft_model(mlp, config) + peft_model.eval() + + actual = peft_model(inputs) + assert torch.allclose(actual, expected, atol=1e-4, rtol=1e-4) + + for module in (peft_model.base_model.model.lin1, peft_model.base_model.model.lin2): + delta_weight = module.get_delta_weight("default") + + assert module.frod_lambda_l["default"].norm() > 0 + assert torch.count_nonzero(module.frod_lambda_s_values["default"]) == 0 + assert torch.allclose(delta_weight, torch.zeros_like(delta_weight), atol=1e-4) + def test_frod_projection_buffers_share_memory_with_layers(self, mlp_same_prng): frod_V_lin1 = mlp_same_prng.base_model.frod_V["lin1"]["default"] frod_s_indices_lin1 = mlp_same_prng.base_model.frod_s_indices["lin1"]["default"] From 80c1870e1872d2d9872a40606c2970b6519429af Mon Sep 17 00:00:00 2001 From: Bane-Elvin Date: Fri, 29 May 2026 04:48:13 +0000 Subject: [PATCH 6/9] Align FRoD image example with CLIP setup --- examples/frod_finetuning/README.md | 6 ++-- .../frod_image_classification.py | 28 ++++++++++++------- 2 files changed, 21 insertions(+), 13 deletions(-) diff --git a/examples/frod_finetuning/README.md b/examples/frod_finetuning/README.md index f818ed6b51..08c123713a 100644 --- a/examples/frod_finetuning/README.md +++ b/examples/frod_finetuning/README.md @@ -11,7 +11,7 @@ python examples/frod_finetuning/frod_image_classification.py ``` The text example fine-tunes `google-bert/bert-base-uncased` on `nyu-mll/glue` with the `sst2` configuration. The image -example fine-tunes `google/vit-base-patch16-224` on the train and test parquet splits from `tanganke/stanford_cars`. +example fine-tunes `openai/clip-vit-base-patch32` on the train and test parquet splits from `tanganke/stanford_cars`. Both scripts use separate optimizer learning rates for FRoD diagonal coefficients, FRoD sparse coefficients, and the classification head. FRoD dropout is set to `0.0` because the sparse rotational parameterization is the main @@ -20,8 +20,8 @@ regularizer in these examples. To use local mirrors of the image model or dataset, override the image example paths with environment variables: ```bash -FROD_IMAGE_MODEL_NAME=/path/to/local/vit-model \ +FROD_IMAGE_MODEL_NAME=/path/to/local/clip-vit-model \ FROD_STANFORD_CARS_DATA_DIR=/path/to/local/stanford_cars \ -FROD_IMAGE_OUTPUT_DIR=vit-local-frod-stanford-cars \ +FROD_IMAGE_OUTPUT_DIR=clip-vit-local-frod-stanford-cars \ python examples/frod_finetuning/frod_image_classification.py ``` diff --git a/examples/frod_finetuning/frod_image_classification.py b/examples/frod_finetuning/frod_image_classification.py index 24e6c345c5..0d3320a07f 100644 --- a/examples/frod_finetuning/frod_image_classification.py +++ b/examples/frod_finetuning/frod_image_classification.py @@ -12,12 +12,18 @@ from peft import FrodConfig, get_peft_model -MODEL_NAME = os.environ.get("FROD_IMAGE_MODEL_NAME", "google/vit-base-patch16-224") -OUTPUT_DIR = os.environ.get("FROD_IMAGE_OUTPUT_DIR", "vit-base-patch16-224-frod-stanford-cars") +MODEL_NAME = os.environ.get("FROD_IMAGE_MODEL_NAME", "openai/clip-vit-base-patch32") +OUTPUT_DIR = os.environ.get("FROD_IMAGE_OUTPUT_DIR", "clip-vit-base-patch32-frod-stanford-cars") DATA_DIR = os.environ.get("FROD_STANFORD_CARS_DATA_DIR") -FROD_LAMBDA_L_LR = 5e-4 -FROD_LAMBDA_S_LR = 5e-5 -CLASSIFIER_LR = 1e-4 +NUM_TRAIN_EPOCHS = int(os.environ.get("FROD_IMAGE_NUM_TRAIN_EPOCHS", "3")) +TRAIN_BATCH_SIZE = int(os.environ.get("FROD_IMAGE_TRAIN_BATCH_SIZE", "64")) +EVAL_BATCH_SIZE = int(os.environ.get("FROD_IMAGE_EVAL_BATCH_SIZE", "64")) +SPARSE_RATE = float(os.environ.get("FROD_IMAGE_SPARSE_RATE", "0.01")) +FROD_LAMBDA_L_LR = float(os.environ.get("FROD_IMAGE_LAMBDA_L_LR", "5e-4")) +FROD_LAMBDA_S_LR = float(os.environ.get("FROD_IMAGE_LAMBDA_S_LR", "5e-5")) +CLASSIFIER_LR = float(os.environ.get("FROD_IMAGE_CLASSIFIER_LR", "1e-4")) +CLIP_TARGET_MODULES = ["q_proj", "k_proj", "v_proj", "out_proj", "fc1", "fc2"] + def main(): if DATA_DIR: @@ -62,10 +68,11 @@ def main(): ignore_mismatched_sizes=True, ) peft_config = FrodConfig( - target_modules=["query", "value"], + target_modules=CLIP_TARGET_MODULES, modules_to_save=["classifier"], frod_dropout=0.0, - sparse_rate=0.02, + sparse_rate=SPARSE_RATE, + projection_prng_key=3, ) model = get_peft_model(model, peft_config) model.print_trainable_parameters() @@ -102,13 +109,14 @@ def compute_metrics(eval_pred): training_args = TrainingArguments( output_dir=OUTPUT_DIR, learning_rate=FROD_LAMBDA_L_LR, - per_device_train_batch_size=32, - per_device_eval_batch_size=64, - num_train_epochs=1, + per_device_train_batch_size=TRAIN_BATCH_SIZE, + per_device_eval_batch_size=EVAL_BATCH_SIZE, + num_train_epochs=NUM_TRAIN_EPOCHS, eval_strategy="epoch", save_strategy="epoch", load_best_model_at_end=True, metric_for_best_model="accuracy", + lr_scheduler_type="constant", remove_unused_columns=False, report_to="none", ) From e08e49f9969eb7a89f63514d4fc81940bdafc6e5 Mon Sep 17 00:00:00 2001 From: Bane-Elvin Date: Sat, 30 May 2026 11:23:56 +0000 Subject: [PATCH 7/9] Address FRoD layer review comments --- src/peft/tuners/frod/layer.py | 91 +++++++++++++++++++---------------- 1 file changed, 50 insertions(+), 41 deletions(-) diff --git a/src/peft/tuners/frod/layer.py b/src/peft/tuners/frod/layer.py index 1680c07c21..69534a5398 100644 --- a/src/peft/tuners/frod/layer.py +++ b/src/peft/tuners/frod/layer.py @@ -156,22 +156,24 @@ def merge(self, safe_merge: bool = False, adapter_names: Optional[list[str]] = N return base_layer = self.get_base_layer() - base_weight = base_layer.weight.data.clone() + adapter_deltas = [] for active_adapter in adapter_names: if active_adapter in self.frod_lambda_l.keys(): - delta_weight = self._get_delta_weight(active_adapter, base_weight=base_weight) - if safe_merge: - orig_weights = base_layer.weight.data.clone() - orig_weights += delta_weight - if not torch.isfinite(orig_weights).all(): - raise ValueError( - f"NaNs detected in the merged weights. The adapter {active_adapter} seems to be broken" - ) - base_layer.weight.data = orig_weights - else: - base_layer.weight.data += delta_weight - self._frod_merged_delta[active_adapter] = delta_weight - self.merged_adapters.append(active_adapter) + adapter_deltas.append((active_adapter, self.get_delta_weight(active_adapter))) + + for active_adapter, delta_weight in adapter_deltas: + if safe_merge: + orig_weights = base_layer.weight.data.clone() + orig_weights += delta_weight + if not torch.isfinite(orig_weights).all(): + raise ValueError( + f"NaNs detected in the merged weights. The adapter {active_adapter} seems to be broken" + ) + base_layer.weight.data = orig_weights + else: + base_layer.weight.data += delta_weight + self._frod_merged_delta[active_adapter] = delta_weight + self.merged_adapters.append(active_adapter) def unmerge(self) -> None: if not self.merged: @@ -187,10 +189,7 @@ def unmerge(self) -> None: self.get_base_layer().weight.data -= delta_weight def get_delta_weight(self, adapter) -> torch.Tensor: - return self._get_delta_weight(adapter) - - def _get_delta_weight(self, adapter, base_weight: Optional[torch.Tensor] = None) -> torch.Tensor: - weight = self.get_base_layer().weight if base_weight is None else base_weight + weight = self.get_base_layer().weight device = weight.device dtype = weight.dtype base_weight = transpose(weight, self.fan_in_fan_out) @@ -221,11 +220,15 @@ def forward(self, x: torch.Tensor, *args, **kwargs) -> torch.Tensor: result = self.base_layer(x, *args, **kwargs) else: result = self.base_layer(x, *args, **kwargs) + target_dtype = x.dtype + base_weight = transpose(self.get_base_layer().weight, self.fan_in_fan_out).to( + device=x.device, dtype=target_dtype + ) + base_out = None for active_adapter in self.active_adapters: if active_adapter not in self.frod_lambda_s_values: continue - target_dtype = x.dtype V = self.frod_V[active_adapter].to(device=x.device, dtype=target_dtype) U = self.frod_U[active_adapter].to(device=x.device, dtype=target_dtype) indices = self.frod_s_indices[active_adapter].to(device=x.device, dtype=torch.long) @@ -233,8 +236,8 @@ def forward(self, x: torch.Tensor, *args, **kwargs) -> torch.Tensor: values = self.frod_lambda_s_values[active_adapter].to(device=x.device, dtype=target_dtype) lambda_l = self.frod_lambda_l[active_adapter].to(device=x.device, dtype=target_dtype) - x = x.to(target_dtype) - h = self.frod_dropout[active_adapter](x) + dropout = self.frod_dropout[active_adapter] + h = dropout(x) batch_shape = h.shape[:-1] h_flat = h.reshape(-1, h.shape[-1]) @@ -242,9 +245,9 @@ def forward(self, x: torch.Tensor, *args, **kwargs) -> torch.Tensor: # This block computes the sparse FRoD update z @ S.T with torch.sparse.mm, matching # F.linear(h, U @ (S + diag(lambda_l)) @ V.T). - # CUDA sparse fp16/bf16 kernels are less reliable, so use fp32 here and cast the update back below. + # Sparse fp16/bf16 kernels are less reliable, so use fp32 here and cast the update back below. matmul_dtype = z_flat.dtype - if z_flat.is_cuda and matmul_dtype in (torch.float16, torch.bfloat16): + if matmul_dtype in (torch.float16, torch.bfloat16): matmul_dtype = torch.float32 values = values.to(device=z_flat.device, dtype=matmul_dtype) @@ -261,33 +264,36 @@ def forward(self, x: torch.Tensor, *args, **kwargs) -> torch.Tensor: out_add_flat = F.linear(z_S_flat + z_L_flat, U_mm) out_add_flat = out_add_flat.to(target_dtype) out_add = out_add_flat.reshape(*batch_shape, out_add_flat.shape[-1]) - base_weight = transpose(self.get_base_layer().weight, self.fan_in_fan_out).to( - device=x.device, dtype=target_dtype - ) - base_out = F.linear(x, base_weight) + # FRoD reconstructs the adapted weight directly, so subtract the base-weight contribution and only + # accumulate the adapter delta. + if isinstance(dropout, nn.Identity): + if base_out is None: + base_out = F.linear(x, base_weight) + adapter_base_out = base_out + else: + adapter_base_out = F.linear(h, base_weight) - result = result - base_out + out_add + result = result + out_add - adapter_base_out result = result.to(previous_dtype) return result def __repr__(self) -> str: - # Match PEFT tuner convention so printed models show FRoD-wrapped layers as `frod.*`. rep = super().__repr__() return "frod." + rep def _move_adapter_to_device_of_base_layer(self, adapter_name: str, device: Optional[torch.device] = None) -> None: - dtype = None - weight = None - if device is None: - for weight_name in ("weight", "qweight"): - weight = getattr(self.get_base_layer(), weight_name, None) - if weight is not None: - device = weight.device - dtype = weight.dtype - break - else: - return + """Move trainable FRoD parameters while keeping shared projection buffers on CPU.""" + base_layer = self.get_base_layer() + base_device, base_dtype = self._get_base_layer_device_and_dtype(base_layer) + + target_device = device if device is not None else base_device + if target_device is None: + return + + target_dtype = None + if base_dtype is not None and (base_dtype.is_floating_point or base_dtype.is_complex): + target_dtype = base_dtype for adapter_layer_name in self.adapter_layer_names: adapter_layer = getattr(self, adapter_layer_name, None) @@ -298,4 +304,7 @@ def _move_adapter_to_device_of_base_layer(self, adapter_name: str, device: Optio param = adapter_layer[adapter_name] if param.is_meta: continue - adapter_layer[adapter_name] = param.to(device, dtype=dtype) + if target_dtype is not None: + adapter_layer[adapter_name] = param.to(target_device, dtype=target_dtype) + else: + adapter_layer[adapter_name] = param.to(target_device) From 3bfdc2394e41e9e9821e1d0660900567f580e56c Mon Sep 17 00:00:00 2001 From: Bane-Elvin Date: Tue, 2 Jun 2026 13:30:27 +0000 Subject: [PATCH 8/9] Document FRoD module key parsing --- src/peft/tuners/frod/model.py | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/src/peft/tuners/frod/model.py b/src/peft/tuners/frod/model.py index 5d3dd5c624..f545b38e1a 100644 --- a/src/peft/tuners/frod/model.py +++ b/src/peft/tuners/frod/model.py @@ -31,6 +31,15 @@ def _category_from_key(key: str) -> str: + """Infer the projection-sharing category from a dotted module key. + + FRoD shares projection buffers across modules that play the same role in different transformer blocks. This helper + assumes keys follow the dotted paths returned by `named_modules()` and derives the role from the final path + components. For example, `encoder.layer.0.attention.self.query` maps to `self_query`, while + `vision_model.encoder.layers.0.self_attn.q_proj` maps to `self_attn_q_proj`. The BERT-style attention output key + `encoder.layer.0.attention.output.dense` is normalized to `attention_output` so it does not collide with MLP + `output.dense` modules. + """ parts = key.split(".") if len(parts) == 1: return parts[0] @@ -43,6 +52,13 @@ def _category_from_key(key: str) -> str: def _layer_index_from_key(key: str, fallback: int) -> int: + """Infer the transformer block index from a dotted module key. + + Many decoder and vision models use paths like `model.layers.3.self_attn.q_proj`, so the first preference is the + integer immediately after a `layers` path component. Encoder models often use paths like + `encoder.layer.11.attention.self.query`; for those, the first numeric path component is used. If no numeric layer + id is present, e.g. for `classifier.dense`, `fallback` keeps the projection initialization order deterministic. + """ parts = key.split(".") if "layers" in parts: try: From 3209febae5d98eebe361aef2581dbecc70bd9f68 Mon Sep 17 00:00:00 2001 From: Bane-Elvin Date: Tue, 2 Jun 2026 13:34:48 +0000 Subject: [PATCH 9/9] Test FRoD category inference --- tests/test_frod.py | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/tests/test_frod.py b/tests/test_frod.py index 88ca464fe5..67621faf3b 100644 --- a/tests/test_frod.py +++ b/tests/test_frod.py @@ -21,6 +21,7 @@ from accelerate.utils.imports import is_bf16_available from safetensors import safe_open from torch import nn +from transformers import LlamaConfig, LlamaForCausalLM from peft import FrodConfig, PeftModel, get_peft_model @@ -169,6 +170,24 @@ def test_frod_projection_buffers_share_memory_with_layers(self, mlp_same_prng): # Different target categories have distinct projection buffers. assert frod_V_lin1.data_ptr() != mlp_same_prng.base_model.frod_V["lin2"]["default"].data_ptr() + def test_frod_categories_with_common_llama_targets(self): + model = LlamaForCausalLM( + LlamaConfig( + hidden_size=16, + intermediate_size=32, + num_attention_heads=4, + num_hidden_layers=2, + vocab_size=32, + ) + ) + config = FrodConfig(target_modules=["q_proj", "v_proj"]) + + peft_model = get_peft_model(model, config) + + assert sorted(peft_model.base_model.frod_V.keys()) == ["self_attn_q_proj", "self_attn_v_proj"] + assert "default" in peft_model.base_model.frod_V["self_attn_q_proj"] + assert "default" in peft_model.base_model.frod_V["self_attn_v_proj"] + def test_frod_lambda_dont_share_memory(self, mlp_same_prng): assert ( mlp_same_prng.base_model.model.lin1.frod_lambda_s_values["default"].data_ptr()