diff --git a/docs/source/_toctree.yml b/docs/source/_toctree.yml index a9db3fa19a..de928e9d6e 100644 --- a/docs/source/_toctree.yml +++ b/docs/source/_toctree.yml @@ -124,6 +124,8 @@ title: PVeRA - local: package_reference/fourierft title: FourierFT + - local: package_reference/frod + title: FRoD - local: package_reference/gralora title: GraLoRA - local: package_reference/vblora diff --git a/docs/source/package_reference/frod.md b/docs/source/package_reference/frod.md new file mode 100644 index 0000000000..5369494d09 --- /dev/null +++ b/docs/source/package_reference/frod.md @@ -0,0 +1,66 @@ + + +# FRoD: Full-Rank Efficient Fine-Tuning with Rotational Degrees + +FRoD is a parameter-efficient fine-tuning method that combines a shared full-rank basis with sparse learnable +rotational degrees. The adapter update is expressed through fixed projection tensors and trainable coefficients, which +allows FRoD to apply full-rank updates while keeping the number of trained parameters small. + +Paper: [Full-Rank Efficient Fine-Tuning with Rotational Degrees](https://doi.org/10.1609/aaai.v40i31.39813). + +When saving the adapter parameters, it is possible to avoid storing the projection tensors by setting +`save_projection=False` on the `FrodConfig`. In that case, the projections are restored from the base model weights and +the fixed random seed from `projection_prng_key`. This reduces checkpoint size, but the default is +`save_projection=True` to make checkpoint loading independent of regeneration details. + +Compared to LoRA, FRoD can express a full-rank update in each adapted linear layer while training only the diagonal +coefficients and a sparse set of off-diagonal rotation coefficients. This can be useful when a low-rank update is too +restrictive. The trade-off is that FRoD computes fixed projection tensors from the base weights during adapter +injection, which makes setup more expensive and the implementation less broadly supported than LoRA. 
+ +FRoD currently has the following constraint: + +- Only `nn.Linear` and `transformers.pytorch_utils.Conv1D` layers are supported. + +## Quickstart + +```python +from transformers import AutoModelForSequenceClassification + +from peft import FrodConfig, TaskType, get_peft_model + +model = AutoModelForSequenceClassification.from_pretrained("google-bert/bert-base-uncased", num_labels=2) + +peft_config = FrodConfig( + task_type=TaskType.SEQ_CLS, + target_modules=["query", "value"], + modules_to_save=["classifier"], + sparse_rate=0.02, + frod_dropout=0.0, +) + +model = get_peft_model(model, peft_config) +model.print_trainable_parameters() +``` + +## FrodConfig + +[[autodoc]] tuners.frod.config.FrodConfig + +## FrodModel + +[[autodoc]] tuners.frod.model.FrodModel diff --git a/examples/frod_finetuning/README.md b/examples/frod_finetuning/README.md new file mode 100644 index 0000000000..08c123713a --- /dev/null +++ b/examples/frod_finetuning/README.md @@ -0,0 +1,27 @@ +# FRoD fine-tuning examples + +These examples show minimal FRoD fine-tuning with the Transformers `Trainer`. + +Install the example dependencies and run either script directly: + +```bash +pip install -r examples/frod_finetuning/requirements.txt +python examples/frod_finetuning/frod_text_classification.py +python examples/frod_finetuning/frod_image_classification.py +``` + +The text example fine-tunes `google-bert/bert-base-uncased` on `nyu-mll/glue` with the `sst2` configuration. The image +example fine-tunes `openai/clip-vit-base-patch32` on the train and test parquet splits from `tanganke/stanford_cars`. + +Both scripts use separate optimizer learning rates for FRoD diagonal coefficients, FRoD sparse coefficients, and the +classification head. FRoD dropout is set to `0.0` because the sparse rotational parameterization is the main +regularizer in these examples. 
+ +To use local mirrors of the image model or dataset, override the image example paths with environment variables: + +```bash +FROD_IMAGE_MODEL_NAME=/path/to/local/clip-vit-model \ +FROD_STANFORD_CARS_DATA_DIR=/path/to/local/stanford_cars \ +FROD_IMAGE_OUTPUT_DIR=clip-vit-local-frod-stanford-cars \ +python examples/frod_finetuning/frod_image_classification.py +``` diff --git a/examples/frod_finetuning/frod_image_classification.py b/examples/frod_finetuning/frod_image_classification.py new file mode 100644 index 0000000000..0d3320a07f --- /dev/null +++ b/examples/frod_finetuning/frod_image_classification.py @@ -0,0 +1,139 @@ +# Copyright 2026-present the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); + +import os + +import numpy as np +import torch +from datasets import load_dataset +from transformers import AutoImageProcessor, AutoModelForImageClassification, Trainer, TrainingArguments + +from peft import FrodConfig, get_peft_model + + +MODEL_NAME = os.environ.get("FROD_IMAGE_MODEL_NAME", "openai/clip-vit-base-patch32") +OUTPUT_DIR = os.environ.get("FROD_IMAGE_OUTPUT_DIR", "clip-vit-base-patch32-frod-stanford-cars") +DATA_DIR = os.environ.get("FROD_STANFORD_CARS_DATA_DIR") +NUM_TRAIN_EPOCHS = int(os.environ.get("FROD_IMAGE_NUM_TRAIN_EPOCHS", "3")) +TRAIN_BATCH_SIZE = int(os.environ.get("FROD_IMAGE_TRAIN_BATCH_SIZE", "64")) +EVAL_BATCH_SIZE = int(os.environ.get("FROD_IMAGE_EVAL_BATCH_SIZE", "64")) +SPARSE_RATE = float(os.environ.get("FROD_IMAGE_SPARSE_RATE", "0.01")) +FROD_LAMBDA_L_LR = float(os.environ.get("FROD_IMAGE_LAMBDA_L_LR", "5e-4")) +FROD_LAMBDA_S_LR = float(os.environ.get("FROD_IMAGE_LAMBDA_S_LR", "5e-5")) +CLASSIFIER_LR = float(os.environ.get("FROD_IMAGE_CLASSIFIER_LR", "1e-4")) +CLIP_TARGET_MODULES = ["q_proj", "k_proj", "v_proj", "out_proj", "fc1", "fc2"] + + +def main(): + if DATA_DIR: + data_files = { + "train": [ + os.path.join(DATA_DIR, "data", "train-00000-of-00002.parquet"), + os.path.join(DATA_DIR, 
"data", "train-00001-of-00002.parquet"), + ], + "test": [ + os.path.join(DATA_DIR, "data", "test-00000-of-00002.parquet"), + os.path.join(DATA_DIR, "data", "test-00001-of-00002.parquet"), + ], + } + else: + data_files = { + "train": [ + "hf://datasets/tanganke/stanford_cars/data/train-00000-of-00002.parquet", + "hf://datasets/tanganke/stanford_cars/data/train-00001-of-00002.parquet", + ], + "test": [ + "hf://datasets/tanganke/stanford_cars/data/test-00000-of-00002.parquet", + "hf://datasets/tanganke/stanford_cars/data/test-00001-of-00002.parquet", + ], + } + + dataset = load_dataset("parquet", data_files=data_files) + train_split = dataset["train"] + eval_split = dataset["test"] + image_processor = AutoImageProcessor.from_pretrained(MODEL_NAME) + label_feature = train_split.features["label"] + label_names = ( + label_feature.names if hasattr(label_feature, "names") else [str(i) for i in sorted(set(train_split["label"]))] + ) + id2label = dict(enumerate(label_names)) + label2id = {name: idx for idx, name in id2label.items()} + + model = AutoModelForImageClassification.from_pretrained( + MODEL_NAME, + num_labels=len(label_names), + id2label=id2label, + label2id=label2id, + ignore_mismatched_sizes=True, + ) + peft_config = FrodConfig( + target_modules=CLIP_TARGET_MODULES, + modules_to_save=["classifier"], + frod_dropout=0.0, + sparse_rate=SPARSE_RATE, + projection_prng_key=3, + ) + model = get_peft_model(model, peft_config) + model.print_trainable_parameters() + + def transform(batch): + images = [image.convert("RGB") for image in batch["image"]] + inputs = image_processor(images, return_tensors="pt") + inputs["labels"] = batch["label"] + return inputs + + train_dataset = train_split.with_transform(transform) + eval_dataset = eval_split.with_transform(transform) + + def collate_fn(examples): + pixel_values = torch.stack([example["pixel_values"] for example in examples]) + labels = torch.tensor([example["labels"] for example in examples]) + return {"pixel_values": 
pixel_values, "labels": labels} + + def compute_metrics(eval_pred): + predictions = np.argmax(eval_pred.predictions, axis=-1) + return {"accuracy": (predictions == eval_pred.label_ids).mean().item()} + + optimizer = torch.optim.AdamW( + [ + {"params": [p for n, p in model.named_parameters() if "frod_lambda_l" in n], "lr": FROD_LAMBDA_L_LR}, + { + "params": [p for n, p in model.named_parameters() if "frod_lambda_s_values" in n], + "lr": FROD_LAMBDA_S_LR, + }, + {"params": [p for n, p in model.named_parameters() if "classifier" in n], "lr": CLASSIFIER_LR}, + ] + ) + + training_args = TrainingArguments( + output_dir=OUTPUT_DIR, + learning_rate=FROD_LAMBDA_L_LR, + per_device_train_batch_size=TRAIN_BATCH_SIZE, + per_device_eval_batch_size=EVAL_BATCH_SIZE, + num_train_epochs=NUM_TRAIN_EPOCHS, + eval_strategy="epoch", + save_strategy="epoch", + load_best_model_at_end=True, + metric_for_best_model="accuracy", + lr_scheduler_type="constant", + remove_unused_columns=False, + report_to="none", + ) + + trainer = Trainer( + model=model, + args=training_args, + train_dataset=train_dataset, + eval_dataset=eval_dataset, + data_collator=collate_fn, + compute_metrics=compute_metrics, + optimizers=(optimizer, None), + ) + trainer.train() + trainer.evaluate() + model.save_pretrained(OUTPUT_DIR) + + +if __name__ == "__main__": + main() diff --git a/examples/frod_finetuning/frod_text_classification.py b/examples/frod_finetuning/frod_text_classification.py new file mode 100644 index 0000000000..afa801a236 --- /dev/null +++ b/examples/frod_finetuning/frod_text_classification.py @@ -0,0 +1,93 @@ +# Copyright 2026-present the HuggingFace Inc. team. 
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+
+import numpy as np
+import torch
+from datasets import load_dataset
+from transformers import (
+    AutoModelForSequenceClassification,
+    AutoTokenizer,
+    DataCollatorWithPadding,
+    Trainer,
+    TrainingArguments,
+)
+
+from peft import FrodConfig, TaskType, get_peft_model
+
+
+MODEL_NAME = "google-bert/bert-base-uncased"
+DATASET_NAME = "nyu-mll/glue"
+TASK_NAME = "sst2"
+OUTPUT_DIR = "bert-base-uncased-frod-sst2"
+FROD_LAMBDA_L_LR = 2e-2
+FROD_LAMBDA_S_LR = 2e-3
+CLASSIFIER_LR = 1e-2
+
+
+def main():
+    dataset = load_dataset(DATASET_NAME, TASK_NAME)
+    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
+
+    def preprocess(batch):
+        return tokenizer(batch["sentence"], truncation=True)
+
+    tokenized = dataset.map(preprocess, batched=True)
+    tokenized = tokenized.rename_column("label", "labels")
+
+    model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=2)
+    peft_config = FrodConfig(
+        task_type=TaskType.SEQ_CLS,
+        target_modules=["query", "value"],
+        modules_to_save=["classifier"],
+        frod_dropout=0.0,
+        sparse_rate=0.02,
+    )
+    model = get_peft_model(model, peft_config)
+    model.print_trainable_parameters()
+
+    def compute_metrics(eval_pred):
+        predictions = np.argmax(eval_pred.predictions, axis=-1)
+        return {"accuracy": (predictions == eval_pred.label_ids).mean().item()}
+
+    optimizer = torch.optim.AdamW(
+        [
+            {"params": [p for n, p in model.named_parameters() if "frod_lambda_l" in n], "lr": FROD_LAMBDA_L_LR},
+            {
+                "params": [p for n, p in model.named_parameters() if "frod_lambda_s_values" in n],
+                "lr": FROD_LAMBDA_S_LR,
+            },
+            {"params": [p for n, p in model.named_parameters() if "classifier" in n], "lr": CLASSIFIER_LR},
+        ]
+    )
+
+    training_args = TrainingArguments(
+        output_dir=OUTPUT_DIR,
+        learning_rate=FROD_LAMBDA_L_LR,
+        per_device_train_batch_size=32,
+        per_device_eval_batch_size=64,
+        num_train_epochs=1,
+        eval_strategy="epoch",
+        save_strategy="epoch",
+        load_best_model_at_end=True,
+        metric_for_best_model="accuracy",
+        report_to="none",
+    )
+
+    trainer = Trainer(
+        model=model,
+        args=training_args,
+        train_dataset=tokenized["train"],
+        eval_dataset=tokenized["validation"],
+        processing_class=tokenizer,
+        data_collator=DataCollatorWithPadding(tokenizer=tokenizer),
+        compute_metrics=compute_metrics,
+        optimizers=(optimizer, None),
+    )
+    trainer.train()
+    trainer.evaluate()
+    model.save_pretrained(OUTPUT_DIR)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/examples/frod_finetuning/requirements.txt b/examples/frod_finetuning/requirements.txt
new file mode 100644
index 0000000000..8bcaa74619
--- /dev/null
+++ b/examples/frod_finetuning/requirements.txt
@@ -0,0 +1,6 @@
+peft
+transformers>=4.46.0
+accelerate>=1.0.0
+datasets
+numpy
+Pillow
diff --git a/method_comparison/MetaMathQA/experiments/frod/llama-3.2-3B-default/adapter_config.json b/method_comparison/MetaMathQA/experiments/frod/llama-3.2-3B-default/adapter_config.json
new file mode 100644
index 0000000000..d52af61d39
--- /dev/null
+++ b/method_comparison/MetaMathQA/experiments/frod/llama-3.2-3B-default/adapter_config.json
@@ -0,0 +1,20 @@
+{
+  "auto_mapping": null,
+  "base_model_name_or_path": null,
+  "bias": "none",
+  "fan_in_fan_out": false,
+  "frod_dropout": 0.0,
+  "inference_mode": false,
+  "init_weights": true,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "modules_to_save": null,
+  "peft_type": "FROD",
+  "projection_prng_key": 0,
+  "regularization_alpha": 0.001,
+  "revision": null,
+  "save_projection": true,
+  "sparse_rate": 0.01,
+  "target_modules": null,
+  "task_type": "CAUSAL_LM"
+}
diff --git a/method_comparison/MetaMathQA/experiments/frod/llama-3.2-3B-sparse0.02-lr_0.001/adapter_config.json b/method_comparison/MetaMathQA/experiments/frod/llama-3.2-3B-sparse0.02-lr_0.001/adapter_config.json
new file mode 100644
index 0000000000..8abdd8540b
--- /dev/null
+++ 
b/method_comparison/MetaMathQA/experiments/frod/llama-3.2-3B-sparse0.02-lr_0.001/adapter_config.json @@ -0,0 +1,20 @@ +{ + "auto_mapping": null, + "base_model_name_or_path": null, + "bias": "none", + "fan_in_fan_out": false, + "frod_dropout": 0.0, + "inference_mode": false, + "init_weights": true, + "layers_pattern": null, + "layers_to_transform": null, + "modules_to_save": null, + "peft_type": "FROD", + "projection_prng_key": 0, + "regularization_alpha": 0.001, + "revision": null, + "save_projection": true, + "sparse_rate": 0.02, + "target_modules": null, + "task_type": "CAUSAL_LM" +} diff --git a/method_comparison/MetaMathQA/experiments/frod/llama-3.2-3B-sparse0.02-lr_0.001/training_params.json b/method_comparison/MetaMathQA/experiments/frod/llama-3.2-3B-sparse0.02-lr_0.001/training_params.json new file mode 100644 index 0000000000..52d87e3ef6 --- /dev/null +++ b/method_comparison/MetaMathQA/experiments/frod/llama-3.2-3B-sparse0.02-lr_0.001/training_params.json @@ -0,0 +1,5 @@ +{ + "optimizer_kwargs": { + "lr": 1e-3 + } +} diff --git a/src/peft/__init__.py b/src/peft/__init__.py index ec12d52583..078e322289 100644 --- a/src/peft/__init__.py +++ b/src/peft/__init__.py @@ -70,6 +70,8 @@ EvaConfig, FourierFTConfig, FourierFTModel, + FrodConfig, + FrodModel, GraloraConfig, GraloraModel, HiraConfig, @@ -200,6 +202,8 @@ "EvaConfig", "FourierFTConfig", "FourierFTModel", + "FrodConfig", + "FrodModel", "GraloraConfig", "GraloraModel", "HRAConfig", diff --git a/src/peft/tuners/__init__.py b/src/peft/tuners/__init__.py index 4900a71aa8..22908b57a9 100644 --- a/src/peft/tuners/__init__.py +++ b/src/peft/tuners/__init__.py @@ -22,6 +22,7 @@ from .cpt import CPTConfig, CPTEmbedding from .delora import DeloraConfig, DeloraModel from .fourierft import FourierFTConfig, FourierFTModel +from .frod import FrodConfig, FrodModel from .gralora import GraloraConfig, GraloraModel from .hira import HiraConfig, HiraModel from .hra import HRAConfig, HRAModel @@ -95,6 +96,8 @@ "EvaConfig", 
"FourierFTConfig", "FourierFTModel", + "FrodConfig", + "FrodModel", "GraloraConfig", "GraloraModel", "HRAConfig", diff --git a/src/peft/tuners/frod/__init__.py b/src/peft/tuners/frod/__init__.py new file mode 100644 index 0000000000..b26001a796 --- /dev/null +++ b/src/peft/tuners/frod/__init__.py @@ -0,0 +1,24 @@ +# Copyright 2026-present the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from peft.utils import register_peft_method + +from .config import FrodConfig +from .layer import FrodLayer, Linear +from .model import FrodModel + + +__all__ = ["FrodConfig", "FrodLayer", "FrodModel", "Linear"] + +register_peft_method(name="frod", config_cls=FrodConfig, model_cls=FrodModel, prefix="frod_") diff --git a/src/peft/tuners/frod/config.py b/src/peft/tuners/frod/config.py new file mode 100644 index 0000000000..f581108a89 --- /dev/null +++ b/src/peft/tuners/frod/config.py @@ -0,0 +1,164 @@ +# Copyright 2026-present the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from __future__ import annotations
+
+from dataclasses import dataclass, field
+from typing import Optional, Union
+
+from peft.config import PeftConfig
+from peft.utils import PeftType
+
+
+@dataclass
+class FrodConfig(PeftConfig):
+    """
+    This is the configuration class to store the configuration of a [`FrodModel`].
+
+    Paper: https://doi.org/10.1609/aaai.v40i31.39813.
+
+    Args:
+        target_modules (`Union[List[str], str]`):
+            The names of the modules to apply FRoD to. Only linear layers are supported.
+        projection_prng_key (`int`):
+            Random seed used when initializing the sparse FRoD COO pattern.
+        save_projection (`bool`):
+            Whether to save the FRoD projection tensors in the state dict. This increases checkpoint size but makes
+            adapter reloading independent of local cache regeneration. Defaults to `True`.
+        frod_dropout (`float`):
+            The dropout probability for FRoD layers.
+        fan_in_fan_out (`bool`):
+            Set this to True if the layer to replace stores weight like (fan_in, fan_out). For example, gpt-2 uses
+            `Conv1D` which stores weights like (fan_in, fan_out) and hence this should be set to `True`.
+        bias (`str`):
+            Bias type for FRoD. Can be 'none', 'all' or 'frod_only'. If 'all' or 'frod_only', the corresponding biases
+            will be updated during training. Be aware that this means that, even when disabling the adapters, the model
+            will not produce the same output as the base model would have without adaptation.
+        modules_to_save (`List[str]`):
+            List of modules apart from FRoD layers to be set as trainable and saved in the final checkpoint.
+        init_weights (`bool`):
+            Whether to initialize the weights of the FRoD layers with their default initialization. Don't change this
+            setting, except if you know exactly what you're doing.
+        layers_to_transform (`Union[List[int],int]`):
+            The layer indexes to transform, if this argument is specified, it will apply the FRoD transformations on
+            the layer indexes that are specified in this list. If a single integer is passed, it will apply the FRoD
+            transformations on the layer at this index.
+        layers_pattern (`Optional[Union[List[str], str]]`):
+            The layer pattern name, used only if `layers_to_transform` is different from `None`. This should target the
+            `nn.ModuleList` of the model, which is often called `'layers'` or `'h'`.
+        sparse_rate (`float`):
+            Fraction of off-diagonal entries in the sparse trainable rotation matrix. Higher values increase capacity
+            and trainable parameters; lower values are cheaper. Defaults to `0.01`.
+        regularization_alpha (`float`):
+            Small positive value used while building the shared basis from base weights. It stabilizes the matrix
+            inverse when layers in the same category have correlated weights. Defaults to `1e-3`.
+    """
+
+    target_modules: Optional[Union[list[str], str]] = field(
+        default=None,
+        metadata={
+            "help": (
+                "List of module names or regex expression of the module names to replace with FRoD. "
+                "For example, ['q', 'v'] or '.*decoder.*(SelfAttention|EncDecAttention).*(q|v)$'. "
+                "Only linear layers are supported."
+            )
+        },
+    )
+    projection_prng_key: int = field(
+        default=0,
+        metadata={"help": "Random seed used when initializing the FRoD sparse COO structure."},
+    )
+    save_projection: bool = field(
+        default=True,
+        metadata={
+            "help": (
+                "Whether to save the FRoD projection tensors in the state dict. This increases checkpoint size but "
+                "guarantees that we can reload the adapter on all system configurations."
+            )
+        },
+    )
+    frod_dropout: float = field(default=0.0, metadata={"help": "Dropout in the FRoD adapter layers"})
+    fan_in_fan_out: bool = field(
+        default=False,
+        metadata={"help": "Set this to True if the layer to replace stores weight like (fan_in, fan_out)"},
+    )
+    bias: str = field(default="none", metadata={"help": "Bias type for FRoD. Can be 'none', 'all' or 'frod_only'"})
+    modules_to_save: Optional[list[str]] = field(
+        default=None,
+        metadata={
+            "help": (
+                "List of modules apart from FRoD layers to be set as trainable and saved in the final checkpoint. For"
+                " example, in Sequence Classification or Token Classification tasks, the final layer"
+                " `classifier/score` are randomly initialized and as such need to be trainable and saved."
+            )
+        },
+    )
+    init_weights: bool = field(
+        default=True,
+        metadata={
+            "help": (
+                "Whether to initialize the weights of the FRoD layers with their default initialization. Don't change "
+                "this setting, except if you know exactly what you're doing."
+            ),
+        },
+    )
+    layers_to_transform: Optional[Union[list[int], int]] = field(
+        default=None,
+        metadata={
+            "help": (
+                "The layer indexes to transform, if this argument is specified, PEFT will transform only the layers"
+                " indexes that are specified inside this list. If a single integer is passed, PEFT will transform only"
+                " the layer at this index."
+            )
+        },
+    )
+    layers_pattern: Optional[Union[list[str], str]] = field(
+        default=None,
+        metadata={
+            "help": (
+                "The layer pattern name, used only if `layers_to_transform` is different to None and if the layer "
+                "pattern is not in the common layers pattern. This should target the `nn.ModuleList` of the "
+                "model, which is often called `'layers'` or `'h'`."
+            )
+        },
+    )
+    sparse_rate: float = field(
+        default=0.01,
+        metadata={
+            "help": (
+                "Fraction of off-diagonal entries in the sparse trainable rotation matrix. Higher values increase "
+                "capacity and trainable parameters; lower values are cheaper."
+            )
+        },
+    )
+    regularization_alpha: float = field(
+        default=1e-3,
+        metadata={
+            "help": (
+                "Small positive value used while building the shared basis from base weights. It stabilizes matrix "
+                "inverses for correlated layers."
+            ),
+        },
+    )
+
+    def __post_init__(self):
+        super().__post_init__()
+        self.peft_type = PeftType.FROD
+        self.target_modules = (
+            set(self.target_modules) if isinstance(self.target_modules, list) else self.target_modules
+        )
+        # `layers_pattern` requires `layers_to_transform`; 0 is a valid index, so compare against None/[] explicitly
+        if self.layers_pattern and self.layers_to_transform in (None, []):
+            raise ValueError("When `layers_pattern` is specified, `layers_to_transform` must also be specified. ")
+        if self.sparse_rate < 0 or self.sparse_rate > 1:
+            raise ValueError(f"`sparse_rate` should be between 0 and 1, got {self.sparse_rate}.")
diff --git a/src/peft/tuners/frod/layer.py b/src/peft/tuners/frod/layer.py
new file mode 100644
index 0000000000..69534a5398
--- /dev/null
+++ b/src/peft/tuners/frod/layer.py
@@ -0,0 +1,310 @@
+# Copyright 2026-present the HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+ +import warnings +from typing import Optional + +import torch +import torch.nn.functional as F +from torch import nn + +from peft.tuners.tuners_utils import BaseTunerLayer, _get_in_out_features, check_adapters_to_merge +from peft.utils.other import transpose + +from .._buffer_dict import BufferDict +from .config import FrodConfig + + +class FrodLayer(BaseTunerLayer): + adapter_layer_names = ("frod_lambda_l", "frod_lambda_s_values") + other_param_names = ("frod_V", "frod_U", "frod_s_indices", "frod_s_size") + + def __init__(self, base_layer: nn.Module, **kwargs): + self.base_layer = base_layer + self.r = {} + self.frod_dropout = nn.ModuleDict({}) + + # Sparse S is parameterized by its COO values only. + self.frod_lambda_l = nn.ParameterDict({}) + self.frod_lambda_s_values = nn.ParameterDict({}) + + self.frod_s_indices: BufferDict = BufferDict({}, persistent=False) + self.frod_s_size: BufferDict = BufferDict({}, persistent=False) + self.frod_V: BufferDict = BufferDict({}, persistent=False) + self.frod_U: BufferDict = BufferDict({}, persistent=False) + + self._disable_adapters = False + self.merged_adapters = [] + self._frod_merged_delta = {} + + self.in_features, self.out_features = _get_in_out_features(self.get_base_layer()) + self.kwargs = kwargs + + def update_layer( + self, + adapter_name, + frod_V: BufferDict, + frod_s_indices: BufferDict, + frod_s_size: BufferDict, + config: FrodConfig, + ): + frod_dropout = config.frod_dropout + init_weights = config.init_weights + base_layer = self.get_base_layer() + weight = transpose(base_layer.weight, self.fan_in_fan_out) + device = base_layer.weight.device + dtype = base_layer.weight.dtype + + self.r[adapter_name] = self.out_features + if frod_dropout > 0.0: + frod_dropout_layer = nn.Dropout(p=frod_dropout) + else: + frod_dropout_layer = nn.Identity() + + self.frod_dropout[adapter_name] = frod_dropout_layer + + if frod_V is None or frod_s_indices is None or frod_s_size is None: + raise ValueError("The FRoD projection 
buffers are missing. This should not happen.") + if adapter_name not in frod_V: + # FRoD projection buffers are shared across adapters for the same module category. + reference_adapter = next(iter(frod_V)) + frod_V[adapter_name] = frod_V[reference_adapter] + frod_s_indices[adapter_name] = frod_s_indices[reference_adapter] + frod_s_size[adapter_name] = frod_s_size[reference_adapter] + + nnz = frod_s_indices[adapter_name].shape[1] + self.frod_lambda_s_values[adapter_name] = nn.Parameter(torch.zeros(nnz, device=device, dtype=dtype)) + + self.frod_V[adapter_name] = frod_V[adapter_name] + self.frod_s_indices[adapter_name] = frod_s_indices[adapter_name] + self.frod_s_size[adapter_name] = frod_s_size[adapter_name] + + # Keep cached projections on CPU and move them lazily in forward. + self.frod_V[adapter_name] = self.frod_V[adapter_name].to(dtype=dtype, device="cpu") + self.frod_s_indices[adapter_name] = self.frod_s_indices[adapter_name].to(device="cpu", dtype=torch.long) + self.frod_s_size[adapter_name] = self.frod_s_size[adapter_name].to(device="cpu", dtype=torch.long) + + U, L = self._calculate_frod_u_and_lambda(self.frod_V[adapter_name], weight) + U = U.to(dtype) + L = L.to(device=device, dtype=dtype) + self.frod_lambda_l[adapter_name] = nn.Parameter(L, requires_grad=True) + if init_weights: + self.reset_frod_parameters(adapter_name) + else: + # PEFT convention: init_weights=False should produce a non-identity adapter for merge tests. 
+ with torch.no_grad(): + nn.init.normal_(self.frod_lambda_s_values[adapter_name], std=0.05) + self.frod_lambda_l[adapter_name].add_(torch.randn_like(self.frod_lambda_l[adapter_name]) * 0.05) + + self.frod_U[adapter_name] = U.cpu() + self._move_adapter_to_device_of_base_layer(adapter_name) + self.set_adapter(self.active_adapters) + + def _calculate_frod_u_and_lambda(self, V, W): + w = W.detach().to(torch.float64).cpu() + v = V.detach().to(torch.float64).cpu() + try: + bi = torch.linalg.solve(v, w.T).T + except RuntimeError: + bi = w @ torch.linalg.pinv(v, rtol=1e-6).T + lambda_l = torch.linalg.norm(bi, dim=0) + u = torch.zeros_like(bi) + nonzero = lambda_l > 1e-12 + u[:, nonzero] = bi[:, nonzero] / lambda_l[nonzero] + return u.float(), lambda_l.float() + + def reset_frod_parameters(self, adapter_name): + if adapter_name in self.frod_lambda_s_values: + with torch.no_grad(): + nn.init.zeros_(self.frod_lambda_s_values[adapter_name]) + + +class Linear(nn.Linear, FrodLayer): + def __init__( + self, + base_layer, + frod_V: BufferDict, + frod_s_indices: BufferDict, + frod_s_size: BufferDict, + adapter_name: str, + config: FrodConfig, + is_target_conv_1d_layer: bool = False, + **kwargs, + ) -> None: + super(nn.Linear, self).__init__() + FrodLayer.__init__(self, base_layer, **kwargs) + self.fan_in_fan_out = config.fan_in_fan_out + + self._active_adapter = adapter_name + self.update_layer(adapter_name, frod_V, frod_s_indices, frod_s_size, config=config) + self.is_target_conv_1d_layer = is_target_conv_1d_layer + + def merge(self, safe_merge: bool = False, adapter_names: Optional[list[str]] = None) -> None: + adapter_names = check_adapters_to_merge(self, adapter_names) + if not adapter_names: + return + + base_layer = self.get_base_layer() + adapter_deltas = [] + for active_adapter in adapter_names: + if active_adapter in self.frod_lambda_l.keys(): + adapter_deltas.append((active_adapter, self.get_delta_weight(active_adapter))) + + for active_adapter, delta_weight in 
adapter_deltas: + if safe_merge: + orig_weights = base_layer.weight.data.clone() + orig_weights += delta_weight + if not torch.isfinite(orig_weights).all(): + raise ValueError( + f"NaNs detected in the merged weights. The adapter {active_adapter} seems to be broken" + ) + base_layer.weight.data = orig_weights + else: + base_layer.weight.data += delta_weight + self._frod_merged_delta[active_adapter] = delta_weight + self.merged_adapters.append(active_adapter) + + def unmerge(self) -> None: + if not self.merged: + warnings.warn("Already unmerged. Nothing to do.") + return + + while len(self.merged_adapters) > 0: + active_adapter = self.merged_adapters.pop() + if active_adapter in self.frod_lambda_l.keys(): + delta_weight = self._frod_merged_delta.pop(active_adapter, None) + if delta_weight is None: + delta_weight = self.get_delta_weight(active_adapter) + self.get_base_layer().weight.data -= delta_weight + + def get_delta_weight(self, adapter) -> torch.Tensor: + weight = self.get_base_layer().weight + device = weight.device + dtype = weight.dtype + base_weight = transpose(weight, self.fan_in_fan_out) + U = self.frod_U[adapter].to(device=device, dtype=dtype) + V = self.frod_V[adapter].to(device=device, dtype=dtype) + indices = self.frod_s_indices[adapter].to(device=U.device, dtype=torch.long) + size = tuple(int(dim) for dim in self.frod_s_size[adapter].tolist()) + values = self.frod_lambda_s_values[adapter].to(U.device, U.dtype).clone() + lambda_l = self.frod_lambda_l[adapter].to(device=U.device, dtype=U.dtype) + + S_sparse = torch.sparse_coo_tensor(indices, values, size).coalesce() + S = S_sparse.to_dense() + L = torch.diag_embed(lambda_l) + frod_weight = U @ (S + L) @ V.T + + # FRoD parameterizes the adapted weight itself. Return only the difference so PEFT merge/unmerge and + # disable-adapter behavior preserve the base model while the active adapter still replaces the base weight. 
def forward(self, x: torch.Tensor, *args, **kwargs) -> torch.Tensor:
    """Run the base layer and, when adapters are active and unmerged, add the FRoD adapter delta.

    Three cases are handled:

    - adapters disabled: any merged adapter is unmerged first, then only the base layer is applied;
    - adapters merged: only the base layer is applied;
    - otherwise: for every active adapter that has an entry in `frod_lambda_s_values`, the adapted output is
      computed from the projection tensors and coefficients, and the difference to the plain base-weight output is
      added to the base layer result.

    Args:
        x: Input tensor; its last dimension is the feature dimension that is multiplied with `frod_V`.
        *args: Extra positional arguments forwarded to the base layer.
        **kwargs: Extra keyword arguments forwarded to the base layer.

    Returns:
        The layer output, cast back to the dtype `x` had on entry.
    """
    previous_dtype = x.dtype

    if self.disable_adapters:
        if self.merged:
            self.unmerge()
        result = self.base_layer(x, *args, **kwargs)
    elif self.merged:
        result = self.base_layer(x, *args, **kwargs)
    else:
        result = self.base_layer(x, *args, **kwargs)
        target_dtype = x.dtype
        # Bias-free copy of the base weight, used below to cancel the base contribution of each adapter.
        base_weight = transpose(self.get_base_layer().weight, self.fan_in_fan_out).to(
            device=x.device, dtype=target_dtype
        )
        # Cache of F.linear(x, base_weight); shared by all adapters whose dropout is nn.Identity.
        base_out = None
        for active_adapter in self.active_adapters:
            if active_adapter not in self.frod_lambda_s_values:
                continue

            # Bring every tensor of this adapter to the device and dtype of the input.
            V = self.frod_V[active_adapter].to(device=x.device, dtype=target_dtype)
            U = self.frod_U[active_adapter].to(device=x.device, dtype=target_dtype)
            indices = self.frod_s_indices[active_adapter].to(device=x.device, dtype=torch.long)
            size = tuple(int(dim) for dim in self.frod_s_size[active_adapter].tolist())
            values = self.frod_lambda_s_values[active_adapter].to(device=x.device, dtype=target_dtype)
            lambda_l = self.frod_lambda_l[active_adapter].to(device=x.device, dtype=target_dtype)

            dropout = self.frod_dropout[active_adapter]
            h = dropout(x)

            # Flatten all leading dimensions so the products below are plain 2D matmuls.
            batch_shape = h.shape[:-1]
            h_flat = h.reshape(-1, h.shape[-1])
            z_flat = torch.matmul(h_flat, V)

            # This block computes the sparse FRoD update z @ S.T with torch.sparse.mm, matching
            # F.linear(h, U @ (S + diag(lambda_l)) @ V.T).
            # Sparse fp16/bf16 kernels are less reliable, so use fp32 here and cast the update back below.
            matmul_dtype = z_flat.dtype
            if matmul_dtype in (torch.float16, torch.bfloat16):
                matmul_dtype = torch.float32

            values = values.to(device=z_flat.device, dtype=matmul_dtype)
            z_flat_mm = z_flat.to(matmul_dtype)
            S_sparse = torch.sparse_coo_tensor(indices, values, size).coalesce()
            if S_sparse.dtype != matmul_dtype:
                S_sparse = S_sparse.to(dtype=matmul_dtype)
            # (S @ z.T).T == z @ S.T; written this way so the sparse operand comes first in torch.sparse.mm.
            z_S_flat = torch.sparse.mm(S_sparse, z_flat_mm.t()).t()

            # The diagonal part is an elementwise scaling of the projected features.
            lambda_l = lambda_l.to(device=z_flat.device, dtype=matmul_dtype)
            z_L_flat = z_flat_mm * lambda_l

            U_mm = U.to(device=z_flat.device, dtype=matmul_dtype)
            out_add_flat = F.linear(z_S_flat + z_L_flat, U_mm)
            out_add_flat = out_add_flat.to(target_dtype)
            # Restore the leading dimensions of the input.
            out_add = out_add_flat.reshape(*batch_shape, out_add_flat.shape[-1])
            # FRoD reconstructs the adapted weight directly, so subtract the base-weight contribution and only
            # accumulate the adapter delta.
            if isinstance(dropout, nn.Identity):
                # Without dropout h is x, so the base contribution is the same for every such adapter.
                if base_out is None:
                    base_out = F.linear(x, base_weight)
                adapter_base_out = base_out
            else:
                # With a dropout module the base contribution is computed from this adapter's own h.
                adapter_base_out = F.linear(h, base_weight)

            result = result + out_add - adapter_base_out

    result = result.to(previous_dtype)
    return result
+ rep + + def _move_adapter_to_device_of_base_layer(self, adapter_name: str, device: Optional[torch.device] = None) -> None: + """Move trainable FRoD parameters while keeping shared projection buffers on CPU.""" + base_layer = self.get_base_layer() + base_device, base_dtype = self._get_base_layer_device_and_dtype(base_layer) + + target_device = device if device is not None else base_device + if target_device is None: + return + + target_dtype = None + if base_dtype is not None and (base_dtype.is_floating_point or base_dtype.is_complex): + target_dtype = base_dtype + + for adapter_layer_name in self.adapter_layer_names: + adapter_layer = getattr(self, adapter_layer_name, None) + if not isinstance(adapter_layer, nn.ParameterDict): + continue + if adapter_name not in adapter_layer: + continue + param = adapter_layer[adapter_name] + if param.is_meta: + continue + if target_dtype is not None: + adapter_layer[adapter_name] = param.to(target_device, dtype=target_dtype) + else: + adapter_layer[adapter_name] = param.to(target_device) diff --git a/src/peft/tuners/frod/model.py b/src/peft/tuners/frod/model.py new file mode 100644 index 0000000000..f545b38e1a --- /dev/null +++ b/src/peft/tuners/frod/model.py @@ -0,0 +1,289 @@ +# Copyright 2026-present the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import annotations + +import warnings +from collections import defaultdict + +import torch +from torch import nn +from transformers.pytorch_utils import Conv1D + +from peft.tuners.tuners_utils import BaseTuner, BaseTunerLayer +from peft.utils import TRANSFORMERS_MODELS_TO_FROD_TARGET_MODULES_MAPPING + +from .._buffer_dict import BufferDict +from ..tuners_utils import _maybe_include_all_linear_layers +from .config import FrodConfig +from .layer import FrodLayer, Linear + + +def _category_from_key(key: str) -> str: + """Infer the projection-sharing category from a dotted module key. + + FRoD shares projection buffers across modules that play the same role in different transformer blocks. This helper + assumes keys follow the dotted paths returned by `named_modules()` and derives the role from the final path + components. For example, `encoder.layer.0.attention.self.query` maps to `self_query`, while + `vision_model.encoder.layers.0.self_attn.q_proj` maps to `self_attn_q_proj`. The BERT-style attention output key + `encoder.layer.0.attention.output.dense` is normalized to `attention_output` so it does not collide with MLP + `output.dense` modules. + """ + parts = key.split(".") + if len(parts) == 1: + return parts[0] + if parts[-2].isdigit(): + return parts[-1] + category = f"{parts[-2]}_{parts[-1]}" + if (category == "output_dense") and (len(parts) >= 3) and (parts[-3] == "attention"): + return "attention_output" + return category + + +def _layer_index_from_key(key: str, fallback: int) -> int: + """Infer the transformer block index from a dotted module key. + + Many decoder and vision models use paths like `model.layers.3.self_attn.q_proj`, so the first preference is the + integer immediately after a `layers` path component. Encoder models often use paths like + `encoder.layer.11.attention.self.query`; for those, the first numeric path component is used. If no numeric layer + id is present, e.g. 
for `classifier.dense`, `fallback` keeps the projection initialization order deterministic. + """ + parts = key.split(".") + if "layers" in parts: + try: + return int(parts[parts.index("layers") + 1]) + except (ValueError, IndexError): + pass + for part in parts: + if part.isdigit(): + return int(part) + return fallback + + +def _projection_from_weights(matrices: list[torch.Tensor], regularization_alpha: float) -> torch.Tensor: + stacked = torch.cat(matrices, dim=0) + if stacked.shape[0] < stacked.shape[1]: + _, _, vh = torch.linalg.svd(stacked, full_matrices=True) + return vh.T + + q_matrix, r_matrix = torch.linalg.qr(stacked) + q_slices = [] + start = 0 + for matrix in matrices: + rows = matrix.shape[0] + q_slices.append(q_matrix[start : start + rows, :]) + start += rows + + dim = r_matrix.shape[1] + t_pi = torch.zeros((dim, dim), dtype=r_matrix.dtype) + # Layers of the same projection category can be highly correlated; this ridge term keeps the inverse stable. + for q_slice in q_slices: + q_term = q_slice.T @ q_slice + regularization_alpha * torch.eye(dim, dtype=r_matrix.dtype) + t_pi += torch.linalg.inv(q_term) + t_pi /= len(q_slices) + + _, eigenvectors = torch.linalg.eigh(t_pi) + return r_matrix.T @ eigenvectors + + +class FrodModel(BaseTuner): + prefix: str = "frod_" + tuner_layer_cls = FrodLayer + target_module_mapping = TRANSFORMERS_MODELS_TO_FROD_TARGET_MODULES_MAPPING + + def _init_frod_projections(self, config: FrodConfig, adapter_name: str) -> None: + weights = defaultdict(dict) + model_config = self.get_model_config(self.model) + peft_config = self._prepare_adapter_config(config, model_config) + peft_config = _maybe_include_all_linear_layers(peft_config, self.model) + + fallback_index = 0 + for key, module in self.model.named_modules(): + if not self._check_target_module_exists(peft_config, key): + continue + + if isinstance(module, nn.Linear): + weight = module.weight + elif isinstance(module, Conv1D): + weight = module.weight.T + else: + continue + 
+ category = _category_from_key(key) + layer_idx = _layer_index_from_key(key, fallback_index) + fallback_index += 1 + weights[layer_idx][category] = weight + + if not weights: + raise ValueError( + "No layer types compatible with FRoD were found. Please check `peft_config.target_modules`." + ) + + # BaseTuner.__init__() enters the pre-injection flow before a FrodModel subclass + # could assign ModuleDicts after super().__init__(), so create these containers lazily here. + if not hasattr(self, "frod_V"): + self.frod_V = nn.ModuleDict() + self.frod_s_indices = nn.ModuleDict() + self.frod_s_size = nn.ModuleDict() + + generator = torch.Generator(device="cpu").manual_seed(config.projection_prng_key) + categories = {category for layer_dict in weights.values() for category in layer_dict} + for category in sorted(categories): + matrices = [ + layer_dict[category].detach().to(torch.float32).cpu() + for _, layer_dict in sorted(weights.items()) + if category in layer_dict + ] + if not matrices: + continue + + v_matrix = _projection_from_weights(matrices, config.regularization_alpha) + example_weight = next(layer_dict[category] for layer_dict in weights.values() if category in layer_dict) + v_tensor = v_matrix.to(dtype=example_weight.dtype, device="cpu") + + if category not in self.frod_V: + self.frod_V[category] = BufferDict({}, persistent=config.save_projection) + self.frod_V[category][adapter_name] = v_tensor + + in_dim = v_tensor.shape[0] + rows, cols = torch.meshgrid(torch.arange(in_dim), torch.arange(in_dim), indexing="ij") + mask_indices = torch.stack([rows.flatten(), cols.flatten()], dim=1) + non_diag_indices = mask_indices[mask_indices[:, 0] != mask_indices[:, 1]] + nnz = min(int(in_dim * in_dim * config.sparse_rate), non_diag_indices.shape[0]) + if (config.sparse_rate > 0) and (non_diag_indices.shape[0] > 0): + nnz = max(1, nnz) + if nnz: + perm = torch.randperm(non_diag_indices.shape[0], generator=generator)[:nnz] + indices = non_diag_indices[perm].t().contiguous() 
+ else: + indices = torch.empty(2, 0, dtype=torch.long) + size = torch.tensor([in_dim, in_dim], dtype=torch.long) + + if category not in self.frod_s_indices: + self.frod_s_indices[category] = BufferDict({}, persistent=config.save_projection) + self.frod_s_indices[category][adapter_name] = indices.to(torch.long) + if category not in self.frod_s_size: + self.frod_s_size[category] = BufferDict({}, persistent=config.save_projection) + self.frod_s_size[category][adapter_name] = size + + def _pre_injection_hook(self, model: nn.Module, config: FrodConfig, adapter_name: str) -> None: + self._init_frod_projections(config, adapter_name) + + def _check_new_adapter_config(self, config: FrodConfig) -> None: + super()._check_new_adapter_config(config) + + for existing_config in self.peft_config.values(): + if existing_config is config: + continue + if existing_config.projection_prng_key != config.projection_prng_key: + raise ValueError( + f"FRoD projection initialization key must be the same for all adapters. Got " + f"{config.projection_prng_key=} but previous config had " + f"{existing_config.projection_prng_key}." 
def _create_and_replace(
    self,
    frod_config: FrodConfig,
    adapter_name,
    target,
    target_name,
    parent,
    current_key,
    **optional_kwargs,
):
    """Attach the adapter `adapter_name` to `target`, wrapping it in a FRoD layer if necessary.

    The projection buffers are looked up by the category derived from `current_key`; if that category has no
    buffers yet, they are initialized first. A target that already is a FRoD `Linear` layer is updated in place,
    any other target is replaced in `parent` by a newly created module. A new module whose adapter is not among the
    active adapters has its gradients disabled.

    Raises:
        ValueError: If `current_key` is `None`.
    """
    if current_key is None:
        raise ValueError("Current Key shouldn't be `None`")

    category = _category_from_key(current_key)
    if category not in self.frod_V:
        self._init_frod_projections(frod_config, adapter_name)
    has_bias = getattr(target, "bias", None) is not None

    # Buffers shared by every layer of this category.
    shared_buffers = (
        self.frod_V[category],
        self.frod_s_indices[category],
        self.frod_s_size[category],
    )

    if isinstance(target, Linear):
        # Already a FRoD layer: just register the additional adapter on it.
        target.update_layer(adapter_name, *shared_buffers, config=frod_config)
        return

    replacement = self._create_new_module(frod_config, *shared_buffers, adapter_name, target, bias=has_bias)
    if adapter_name not in self.active_adapters:
        replacement.requires_grad_(False)
    self._replace_module(parent, target_name, replacement, target)
+ ) + frod_config.fan_in_fan_out = False + elif isinstance(target_base_layer, Conv1D): + kwargs["is_target_conv_1d_layer"] = True + if not frod_config.fan_in_fan_out: + warnings.warn( + "fan_in_fan_out is set to False but the target module is `Conv1D`. Setting fan_in_fan_out to True." + ) + frod_config.fan_in_fan_out = True + else: + raise TypeError( + f"Target module {target} is not supported. Currently, only the following modules are supported: " + "`torch.nn.Linear`, `transformers.pytorch_utils.Conv1D`." + ) + + return Linear( + target, + frod_V, + frod_s_indices, + frod_s_size, + adapter_name, + config=frod_config, + bias=bias, + **kwargs, + ) diff --git a/src/peft/utils/__init__.py b/src/peft/utils/__init__.py index 4106c78060..8e56825f58 100644 --- a/src/peft/utils/__init__.py +++ b/src/peft/utils/__init__.py @@ -26,6 +26,7 @@ TRANSFORMERS_MODELS_TO_C3A_TARGET_MODULES_MAPPING, TRANSFORMERS_MODELS_TO_DELORA_TARGET_MODULES_MAPPING, TRANSFORMERS_MODELS_TO_FOURIERFT_TARGET_MODULES_MAPPING, + TRANSFORMERS_MODELS_TO_FROD_TARGET_MODULES_MAPPING, TRANSFORMERS_MODELS_TO_GRALORA_TARGET_MODULES_MAPPING, TRANSFORMERS_MODELS_TO_HRA_TARGET_MODULES_MAPPING, TRANSFORMERS_MODELS_TO_IA3_FEEDFORWARD_MODULES_MAPPING, @@ -89,6 +90,7 @@ "TRANSFORMERS_MODELS_TO_C3A_TARGET_MODULES_MAPPING", "TRANSFORMERS_MODELS_TO_DELORA_TARGET_MODULES_MAPPING", "TRANSFORMERS_MODELS_TO_FOURIERFT_TARGET_MODULES_MAPPING", + "TRANSFORMERS_MODELS_TO_FROD_TARGET_MODULES_MAPPING", "TRANSFORMERS_MODELS_TO_GRALORA_TARGET_MODULES_MAPPING", "TRANSFORMERS_MODELS_TO_HRA_TARGET_MODULES_MAPPING", "TRANSFORMERS_MODELS_TO_IA3_FEEDFORWARD_MODULES_MAPPING", diff --git a/src/peft/utils/constants.py b/src/peft/utils/constants.py index 94aa475f0d..3ef9c0f80a 100644 --- a/src/peft/utils/constants.py +++ b/src/peft/utils/constants.py @@ -135,6 +135,9 @@ def starcoder_model_postprocess_past_key_value(past_key_values): TRANSFORMERS_MODELS_TO_VERA_TARGET_MODULES_MAPPING = 
TRANSFORMERS_MODELS_TO_LORA_TARGET_MODULES_MAPPING.copy() TRANSFORMERS_MODELS_TO_VERA_TARGET_MODULES_MAPPING["phi"] = ["q_proj", "v_proj"] +TRANSFORMERS_MODELS_TO_FROD_TARGET_MODULES_MAPPING = TRANSFORMERS_MODELS_TO_VERA_TARGET_MODULES_MAPPING.copy() +TRANSFORMERS_MODELS_TO_FROD_TARGET_MODULES_MAPPING["vit"] = ["query", "value"] + TRANSFORMERS_MODELS_TO_PVERA_TARGET_MODULES_MAPPING = TRANSFORMERS_MODELS_TO_VERA_TARGET_MODULES_MAPPING.copy() TRANSFORMERS_MODELS_TO_PVERA_TARGET_MODULES_MAPPING["dinov2"] = ["query", "value"] diff --git a/src/peft/utils/other.py b/src/peft/utils/other.py index 4cc720c4a5..93fddebd5f 100644 --- a/src/peft/utils/other.py +++ b/src/peft/utils/other.py @@ -49,6 +49,7 @@ TRANSFORMERS_MODELS_TO_C3A_TARGET_MODULES_MAPPING, TRANSFORMERS_MODELS_TO_DELORA_TARGET_MODULES_MAPPING, TRANSFORMERS_MODELS_TO_FOURIERFT_TARGET_MODULES_MAPPING, + TRANSFORMERS_MODELS_TO_FROD_TARGET_MODULES_MAPPING, TRANSFORMERS_MODELS_TO_GRALORA_TARGET_MODULES_MAPPING, TRANSFORMERS_MODELS_TO_HRA_TARGET_MODULES_MAPPING, TRANSFORMERS_MODELS_TO_IA3_FEEDFORWARD_MODULES_MAPPING, @@ -96,6 +97,7 @@ "TRANSFORMERS_MODELS_TO_C3A_TARGET_MODULES_MAPPING", "TRANSFORMERS_MODELS_TO_DELORA_TARGET_MODULES_MAPPING", "TRANSFORMERS_MODELS_TO_FOURIERFT_TARGET_MODULES_MAPPING", + "TRANSFORMERS_MODELS_TO_FROD_TARGET_MODULES_MAPPING", "TRANSFORMERS_MODELS_TO_GRALORA_TARGET_MODULES_MAPPING", "TRANSFORMERS_MODELS_TO_HRA_TARGET_MODULES_MAPPING", "TRANSFORMERS_MODELS_TO_IA3_FEEDFORWARD_MODULES_MAPPING", diff --git a/src/peft/utils/peft_types.py b/src/peft/utils/peft_types.py index 80fc1db8f5..9cf9b2e149 100644 --- a/src/peft/utils/peft_types.py +++ b/src/peft/utils/peft_types.py @@ -38,6 +38,7 @@ class PeftType(str, enum.Enum): - POLY - LN_TUNING - VERA + - FROD - FOURIERFT - HRA - BONE @@ -71,6 +72,7 @@ class PeftType(str, enum.Enum): POLY = "POLY" LN_TUNING = "LN_TUNING" VERA = "VERA" + FROD = "FROD" PVERA = "PVERA" FOURIERFT = "FOURIERFT" XLORA = "XLORA" diff --git 
a/src/peft/utils/save_and_load.py b/src/peft/utils/save_and_load.py index 57660c4283..a2148f28a9 100644 --- a/src/peft/utils/save_and_load.py +++ b/src/peft/utils/save_and_load.py @@ -273,6 +273,23 @@ def renamed_dora_weights(k): ) to_return["base_model.pvera_A." + adapter_name] = state_dict["base_model.pvera_A." + adapter_name] to_return["base_model.pvera_B." + adapter_name] = state_dict["base_model.pvera_B." + adapter_name] + elif config.peft_type == PeftType.FROD: + frod_prefix = PEFT_TYPE_TO_PREFIX_MAPPING[config.peft_type] + projection_prefixes = ("base_model.frod_V.", "base_model.frod_s_indices.", "base_model.frod_s_size.") + layer_projection_parts = (".frod_V.", ".frod_s_indices.", ".frod_s_size.", ".frod_U.") + to_return = { + k: state_dict[k] + for k in state_dict + if (frod_prefix in k) and (adapter_name in k) and not any(part in k for part in layer_projection_parts) + } + if config.save_projection: + to_return.update( + { + k: state_dict[k] + for k in state_dict + if k.startswith(projection_prefixes) and k.endswith(f".{adapter_name}") + } + ) elif config.peft_type == PeftType.XLORA: to_return = {k: state_dict[k] for k in state_dict if "internal_xlora_classifier" in k} elif config.peft_type == PeftType.VBLORA: @@ -715,6 +732,13 @@ def set_peft_model_state_dict( new_key = k.replace(".tinylora_v.", f".tinylora_v.{adapter_name}.") tinylora_v_state_dict[new_key] = state_dict.pop(k) + frod_projection_state_dict = {} + if config.peft_type == PeftType.FROD: + frod_projection_prefixes = ("base_model.frod_V.", "base_model.frod_s_indices.", "base_model.frod_s_size.") + frod_projection_keys = [k for k in state_dict if k.startswith(frod_projection_prefixes)] + for k in frod_projection_keys: + frod_projection_state_dict[f"{k}.{adapter_name}"] = state_dict.pop(k) + peft_model_state_dict = _insert_adapter_name_into_state_dict( state_dict, adapter_name=adapter_name, parameter_prefix=parameter_prefix ) @@ -722,6 +746,8 @@ def set_peft_model_state_dict( # Add back the 
tinylora_v keys (now in the correct format) if config.peft_type == PeftType.TINYLORA: peft_model_state_dict.update(tinylora_v_state_dict) + elif config.peft_type == PeftType.FROD: + peft_model_state_dict.update(frod_projection_state_dict) if config.peft_type == PeftType.ADALORA: rank_pattern = config.rank_pattern @@ -796,6 +822,24 @@ def set_peft_model_state_dict( " PRNG initialisation to restore these projections using `config.projection_prng_key`, which may" " not be accurate on all system configurations." ) + elif config.peft_type == PeftType.FROD: + has_projection = any( + k.startswith(("base_model.frod_V.", "base_model.frod_s_indices.", "base_model.frod_s_size.")) + for k in peft_model_state_dict + ) + if config.save_projection and not has_projection: + raise ValueError( + "Specified to load FRoD projection tensors from state dictionary however they were not present. " + "If this checkpoint was saved with `save_projection=False`, set `peft_config.save_projection` " + "to `False` before loading so the projections are regenerated from the base model weights. " + "Otherwise, re-save the adapter with `save_projection=True` to include these tensors." + ) + elif not config.save_projection and has_projection: + warnings.warn( + "Specified to not load FRoD projection tensors from state dictionary however they are present. " + "Consider using them to ensure checkpoint loading is correct by setting " + "`peft_config.save_projection = True`." + ) elif config.peft_type == PeftType.LORA: # Here we take care of a refactor of DoRA which changed lora_magnitude_vector from a ParameterDict to a # ModuleDict with a DoraLayer instance. The old parameter is now the "weight" attribute of that layer. 
diff --git a/tests/test_config.py b/tests/test_config.py index 02cd0004ad..b7a101a744 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -30,6 +30,7 @@ CartridgeConfig, CPTConfig, FourierFTConfig, + FrodConfig, GraloraConfig, HiraConfig, HRAConfig, @@ -78,6 +79,7 @@ class TestingCommitHashError(Exception): (BOFTConfig, {}), (C3AConfig, {}), (FourierFTConfig, {}), + (FrodConfig, {}), (GraloraConfig, {}), (HiraConfig, {}), (HRAConfig, {}), diff --git a/tests/test_custom_models.py b/tests/test_custom_models.py index 7a809a6b4c..358ea319b7 100644 --- a/tests/test_custom_models.py +++ b/tests/test_custom_models.py @@ -37,6 +37,7 @@ C3AConfig, DeloraConfig, FourierFTConfig, + FrodConfig, GraloraConfig, HiraConfig, HRAConfig, @@ -870,6 +871,14 @@ RandLoraConfig, {"target_modules": ["lin0"], "modules_to_save": ["lin1"], "randlora_alpha": 1}, ), + ######## + # FRoD # + ######## + ("Vanilla MLP 1 FRoD", "MLP", FrodConfig, {"target_modules": "lin0"}), + ("Vanilla MLP 2 FRoD", "MLP", FrodConfig, {"target_modules": ["lin0"]}), + ("Vanilla MLP 3 FRoD", "MLP", FrodConfig, {"target_modules": ["lin1"]}), + ("Vanilla MLP 4 FRoD", "MLP", FrodConfig, {"target_modules": ["lin0", "lin1"]}), + ("Vanilla MLP 5 FRoD", "MLP", FrodConfig, {"target_modules": ["lin0"], "modules_to_save": ["lin1"]}), ####### # C3A # ####### @@ -1390,6 +1399,13 @@ {"target_modules": ["lin0"], "init_weights": False}, {"target_modules": ["lin0"], "init_weights": False}, ), + ( + "FRoD Same", + "frod", + FrodConfig, + {"target_modules": ["lin0"], "init_weights": False}, + {"target_modules": ["lin0"], "init_weights": False}, + ), # Note: PVeRA presents the same problem mentioned above for VeRA. 
( "PVeRA Same", @@ -3455,7 +3471,8 @@ def test_multiple_adapters_automatic_modules_to_save(self): assert "other" in model.base_model.classifier.modules_to_save @pytest.mark.parametrize( - "config_cls", [IA3Config, BeftConfig, LoHaConfig, LoKrConfig, LoraConfig, HRAConfig, ShiraConfig, MissConfig] + "config_cls", + [IA3Config, BeftConfig, FrodConfig, LoHaConfig, LoKrConfig, LoraConfig, HRAConfig, ShiraConfig, MissConfig], ) def test_multiple_adapters_mixed_modules_to_save(self, config_cls): # See issue 1574 @@ -3487,7 +3504,8 @@ def test_multiple_adapters_mixed_modules_to_save(self, config_cls): model(**inputs) @pytest.mark.parametrize( - "config_cls", [IA3Config, BeftConfig, LoHaConfig, LoKrConfig, LoraConfig, HRAConfig, ShiraConfig] + "config_cls", + [IA3Config, BeftConfig, FrodConfig, LoHaConfig, LoKrConfig, LoraConfig, HRAConfig, ShiraConfig], ) def test_multiple_adapters_mixed_modules_to_save_order_switched(self, config_cls): # See issue 1574 @@ -3830,6 +3848,7 @@ def test_load_resized_embedding_ignore_mismatched_sizes(self): AdaLoraConfig(target_modules=["lin0"], init_lora_weights=False, total_step=1), IA3Config(target_modules=["lin0"], feedforward_modules=["lin0"], init_ia3_weights=False), BeftConfig(target_modules=["lin0"], init_weights=False), + FrodConfig(target_modules=["lin0"], init_weights=False), OFTConfig(target_modules=["lin0"], init_weights=False, r=2, oft_block_size=0), BOFTConfig(target_modules=["lin0"], init_weights=False, boft_block_size=2), HRAConfig(target_modules=["lin0"], init_weights=False), diff --git a/tests/test_decoder_models.py b/tests/test_decoder_models.py index f43763a554..c1e4050e31 100644 --- a/tests/test_decoder_models.py +++ b/tests/test_decoder_models.py @@ -38,6 +38,7 @@ CPTConfig, DeloraConfig, FourierFTConfig, + FrodConfig, GraloraConfig, HiraConfig, HRAConfig, @@ -145,6 +146,14 @@ "target_modules": None, }, ), + ( + FrodConfig, + { + "task_type": "CAUSAL_LM", + "target_modules": None, + "sparse_rate": 0.01, + }, + ), ( 
GraloraConfig, { diff --git a/tests/test_encoder_decoder_models.py b/tests/test_encoder_decoder_models.py index 6411d30b93..603abbc495 100644 --- a/tests/test_encoder_decoder_models.py +++ b/tests/test_encoder_decoder_models.py @@ -25,6 +25,7 @@ C3AConfig, DeloraConfig, FourierFTConfig, + FrodConfig, GraloraConfig, HiraConfig, HRAConfig, @@ -108,6 +109,14 @@ "task_type": "SEQ_2_SEQ_LM", }, ), + ( + FrodConfig, + { + "target_modules": None, + "task_type": "SEQ_2_SEQ_LM", + "sparse_rate": 0.01, + }, + ), ( GraloraConfig, { diff --git a/tests/test_feature_extraction_models.py b/tests/test_feature_extraction_models.py index 90493f275c..fe9450ca2f 100644 --- a/tests/test_feature_extraction_models.py +++ b/tests/test_feature_extraction_models.py @@ -23,6 +23,7 @@ C3AConfig, DeloraConfig, FourierFTConfig, + FrodConfig, GraloraConfig, HiraConfig, HRAConfig, @@ -105,6 +106,14 @@ "target_modules": None, }, ), + ( + FrodConfig, + { + "task_type": "FEATURE_EXTRACTION", + "target_modules": None, + "sparse_rate": 0.01, + }, + ), ( GraloraConfig, { diff --git a/tests/test_frod.py b/tests/test_frod.py new file mode 100644 index 0000000000..67621faf3b --- /dev/null +++ b/tests/test_frod.py @@ -0,0 +1,237 @@ +# Copyright 2026-present the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# This test file is for tests specific to FRoD, since FRoD has shared projection buffers. 
class MLP(nn.Module):
    """Small four-layer perceptron used as the base model in the FRoD tests.

    `lin1` and `lin2` have the same shape, while `lin0` and `lin3` differ from them and from each other. The output
    is a log-softmax over two classes.
    """

    def __init__(self, bias=True):
        super().__init__()
        self.relu = nn.ReLU()
        # (attribute name, in_features, out_features); lin1 and lin2 have same shape
        layer_specs = (
            ("lin0", 10, 20),
            ("lin1", 20, 20),
            ("lin2", 20, 20),
            ("lin3", 20, 2),
        )
        for attr_name, fan_in, fan_out in layer_specs:
            setattr(self, attr_name, nn.Linear(fan_in, fan_out, bias=bias))
        self.sm = nn.LogSoftmax(dim=-1)

    def forward(self, X):
        hidden = X
        # The first three linear layers are each followed by the shared ReLU.
        for layer in (self.lin0, self.lin1, self.lin2):
            hidden = self.relu(layer(hidden))
        return self.sm(self.lin3(hidden))
+ torch.manual_seed(1) + config = FrodConfig(target_modules=["lin1", "lin2"], init_weights=False, save_projection=False) + peft_model = get_peft_model(mlp, config, adapter_name="first") + config2 = FrodConfig(target_modules=["lin1", "lin2"], init_weights=False, save_projection=False) + peft_model.add_adapter("second", config2) + peft_model.eval() + + input = torch.randn(5, 10) + peft_model.set_adapter("first") + output_first = peft_model(input) + peft_model.set_adapter("second") + output_second = peft_model(input) + + assert not torch.allclose(output_first, output_second, atol=1e-3, rtol=1e-3) + + save_path = tmp_path / "frod" + peft_model.save_pretrained(save_path) + assert os.path.exists(save_path / "first" / "adapter_config.json") + assert os.path.exists(save_path / "second" / "adapter_config.json") + + torch.manual_seed(0) + mlp = MLP() + peft_model = PeftModel.from_pretrained(mlp, save_path / "first", adapter_name="first") + peft_model.load_adapter(save_path / "second", "second") + peft_model.eval() + + peft_model.set_adapter("first") + output_first_loaded = peft_model(input) + peft_model.set_adapter("second") + output_second_loaded = peft_model(input) + + assert torch.allclose(output_first, output_first_loaded, atol=1e-3, rtol=1e-3) + assert torch.allclose(output_second, output_second_loaded, atol=1e-3, rtol=1e-3) + + def test_save_projection_false_contains_no_frod_projection_tensors(self, mlp, tmp_path): + config = FrodConfig(target_modules=["lin1", "lin2"], init_weights=False, save_projection=False) + peft_model = get_peft_model(mlp, config) + + save_path = tmp_path / "frod" + peft_model.save_pretrained(save_path) + + state_dict = {} + with safe_open(save_path / "adapter_model.safetensors", framework="pt", device="cpu") as f: + for key in f.keys(): + state_dict[key] = f.get_tensor(key) + + assert not any("frod_V" in key for key in state_dict) + assert not any("frod_s_indices" in key for key in state_dict) + assert not any("frod_s_size" in key for key in 
def test_save_projection_true_contains_top_level_projection_tensors_only(self, mlp, tmp_path):
    # With the default save_projection setting, the checkpoint holds the projection tensors stored on the tuner
    # model, but neither per-layer copies of them nor any frod_U tensor.
    peft_model = get_peft_model(mlp, FrodConfig(target_modules=["lin1", "lin2"], init_weights=False))

    checkpoint_dir = tmp_path / "frod"
    peft_model.save_pretrained(checkpoint_dir)

    with safe_open(checkpoint_dir / "adapter_model.safetensors", framework="pt", device="cpu") as f:
        saved_keys = list(f.keys())

    expected_keys = (
        "base_model.frod_V.lin1",
        "base_model.frod_s_indices.lin1",
        "base_model.frod_s_size.lin1",
        "base_model.frod_V.lin2",
    )
    for expected_key in expected_keys:
        assert expected_key in saved_keys

    for forbidden_fragment in (".model.lin1.frod_V", "frod_U"):
        assert not any(forbidden_fragment in saved_key for saved_key in saved_keys)
def test_frod_lambda_dont_share_memory(self, mlp_same_prng):
    # The trainable coefficients must be distinct tensors, both between two adapters on the same layer and between
    # two layers for the same adapter.
    wrapped = mlp_same_prng.base_model.model
    for coefficient_name in ("frod_lambda_s_values", "frod_lambda_l"):
        lin1_coefficients = getattr(wrapped.lin1, coefficient_name)
        lin2_coefficients = getattr(wrapped.lin2, coefficient_name)

        assert lin1_coefficients["default"].data_ptr() != lin1_coefficients["other"].data_ptr()
        assert lin1_coefficients["default"].data_ptr() != lin2_coefficients["default"].data_ptr()
mlp_different_shapes.base_model.frod_V["lin0"]["default"].shape == ( + mlp.lin0.in_features, + mlp.lin0.in_features, + ) + assert mlp_different_shapes.base_model.frod_V["lin3"]["default"].shape == ( + mlp.lin3.in_features, + mlp.lin3.in_features, + ) + + input = torch.randn(5, 10) + mlp_different_shapes(input) + + @pytest.mark.parametrize("dtype", [torch.float32, torch.float16, torch.bfloat16]) + def test_frod_dtypes(self, dtype): + if dtype == torch.bfloat16: + if not is_bf16_available(): + pytest.skip("bfloat16 not supported on this system, skipping the test") + + model = MLP().to(dtype) + config = FrodConfig(target_modules=["lin1", "lin2"], init_weights=False) + peft_model = get_peft_model(model, config) + inputs = torch.randn(5, 10).to(dtype) + output = peft_model(inputs) + assert output.dtype == dtype diff --git a/tests/test_initialization.py b/tests/test_initialization.py index 27d9ba16e2..6970d35904 100644 --- a/tests/test_initialization.py +++ b/tests/test_initialization.py @@ -35,6 +35,7 @@ C3AConfig, DeloraConfig, EvaConfig, + FrodConfig, GraloraConfig, IA3Config, LilyConfig, @@ -1835,6 +1836,71 @@ def test_vera_add_second_adapter_with_higher_rank(self): model.add_adapter("other", config1) +class TestFrodInitialization: + torch_device = infer_device() + + def get_model(self): + class MLP(nn.Module): + def __init__(self, bias=True): + super().__init__() + self.lin0 = nn.Linear(10, 20, bias=bias) + self.lin1 = nn.Linear(20, 20, bias=bias) + self.lin2 = nn.Linear(20, 2, bias=bias) + + def forward(self, X): + X = self.lin0(X) + X = self.lin1(X) + X = self.lin2(X) + return X + + return MLP().to(self.torch_device) + + def test_frod_multiple_adapters_same_prng_share_projection_buffers(self): + torch.manual_seed(0) + config0 = FrodConfig(target_modules=["lin1", "lin2"], init_weights=False) + model = get_peft_model(self.get_model().cpu(), config0) + + config1 = FrodConfig(target_modules=["lin1", "lin2"], init_weights=False) + model.add_adapter("other", config1) + 
+ assert model.base_model.model.lin1.frod_V["default"].data_ptr() == ( + model.base_model.model.lin1.frod_V["other"].data_ptr() + ) + assert model.base_model.model.lin1.frod_s_indices["default"].data_ptr() == ( + model.base_model.model.lin1.frod_s_indices["other"].data_ptr() + ) + assert model.base_model.model.lin2.frod_V["default"].data_ptr() == ( + model.base_model.model.lin2.frod_V["other"].data_ptr() + ) + assert model.base_model.model.lin2.frod_s_indices["default"].data_ptr() == ( + model.base_model.model.lin2.frod_s_indices["other"].data_ptr() + ) + + def test_frod_mixing_save_projection_raises(self): + config0 = FrodConfig(target_modules=["lin0"], init_weights=False, save_projection=True) + model = get_peft_model(self.get_model(), config0) + + config1 = FrodConfig(target_modules=["lin0"], init_weights=False, save_projection=False) + msg = re.escape( + "FRoD projection weights must be saved for all adapters or none, but got multiple different values: " + "[False, True]" + ) + with pytest.raises(ValueError, match=msg): + model.add_adapter("other", config1) + + def test_frod_add_second_adapter_with_different_prng_key_raises(self): + config0 = FrodConfig(target_modules=["lin0"], init_weights=False) + model = get_peft_model(self.get_model(), config0) + + config1 = FrodConfig(target_modules=["lin0"], init_weights=False, projection_prng_key=123) + msg = re.escape( + "FRoD projection initialization key must be the same for all adapters. Got " + "config.projection_prng_key=123 but previous config had 0." 
+ ) + with pytest.raises(ValueError, match=msg): + model.add_adapter("other", config1) + + class TestVeloraInitialization: @pytest.mark.parametrize( "config_kwargs, msg", diff --git a/tests/test_seq_classifier.py b/tests/test_seq_classifier.py index 613ca1b8c3..c7de1272d3 100644 --- a/tests/test_seq_classifier.py +++ b/tests/test_seq_classifier.py @@ -23,6 +23,7 @@ C3AConfig, DeloraConfig, FourierFTConfig, + FrodConfig, GraloraConfig, HiraConfig, HRAConfig, @@ -106,6 +107,14 @@ "target_modules": None, }, ), + ( + FrodConfig, + { + "task_type": "SEQ_CLS", + "target_modules": None, + "sparse_rate": 0.01, + }, + ), ( GraloraConfig, {