From 26bdb488961346fe8286b3585c2d9bd31554eace Mon Sep 17 00:00:00 2001 From: Sten111 Date: Mon, 27 Apr 2026 13:27:08 +0200 Subject: [PATCH 1/4] FEAT Add MiCA (Minor Component Adaptation) as a LoRA variant Adds Minor Component Adaptation (https://arxiv.org/abs/2604.01694) as a new init scheme for LoraConfig, triggered by `init_lora_weights="mica"`. Resolves #3142. MiCA initializes `B = U[:, -r:]` (the r left singular vectors of the base weight associated with the smallest singular values) and `A = 0`. During training only `A` is updated; `B` is frozen. Because `A == 0` at init, the adapter contribution `B @ A` is zero and the forward output is preserved exactly, with no need to mutate the base weight. Implementation: * `LoraConfig.init_lora_weights` accepts `"mica"`. * `LoraLayer.mica_init` performs the SVD-based init for Linear targets and validates `r <= min(in_features, out_features)`. The init is skipped when the adapter parameters are on the meta device (low_cpu_mem_usage path). * `MiCALinearVariant` is a `LoraVariant` that resolves for the MiCA init scheme. Forward and merge semantics are vanilla LoRA; the only override of substance is the new `update_requires_grad` hook. * `LoraVariant.update_requires_grad(module, adapter_name)` is a new entry point on the variant base class. Default is a no-op so existing variants are unaffected. `LoraModel._mark_only_adapters_as_trainable` invokes it for every adapter after the base trainability marking, which is where MiCA freezes `lora_B`. MiCA is currently restricted to `nn.Linear`. Passing `init_lora_weights="mica"` on a non-Linear target raises `ValueError: Unknown initialization` via the existing `reset_lora_parameters` fallback. Tests: * `tests/test_initialization.py` adds 6 MiCA-specific tests covering init correctness, that B is the minor (not major) subspace, B-freeze, train step behavior, save/load round-trip, and the unsupported-layer error. * `tests/test_custom_models.py` adds two parametrized MiCA entries to `TEST_CASES` for broader coverage (save/load, merge/unmerge, autocast). * `tests/testing_common.py` and `tests/test_custom_models.py` relax two assertions that previously required *every* `lora_*` parameter to be trainable / receive gradients, to accommodate variants like MiCA that intentionally freeze a subset. Docs and example: * `docs/source/developer_guides/lora.md` adds a MiCA section. * `examples/mica_finetuning/` provides a runnable example and README. * `method_comparison/MetaMathQA/experiments/lora/llama-3.2-3B-rank32-mica/` registers a benchmark config. Co-Authored-By: Claude Opus 4.7 (1M context) --- docs/source/developer_guides/lora.md | 13 +++ examples/mica_finetuning/README.md | 80 +++++++++++++++ examples/mica_finetuning/mica_finetuning.py | 80 +++++++++++++++ .../adapter_config.json | 30 ++++++ src/peft/tuners/lora/config.py | 24 ++++- src/peft/tuners/lora/layer.py | 52 ++++++++++ src/peft/tuners/lora/model.py | 9 ++ src/peft/tuners/lora/variants.py | 47 +++++++++ tests/test_custom_models.py | 21 +++- tests/test_initialization.py | 98 +++++++++++++++++++ tests/testing_common.py | 6 +- 11 files changed, 453 insertions(+), 7 deletions(-) create mode 100644 examples/mica_finetuning/README.md create mode 100644 examples/mica_finetuning/mica_finetuning.py create mode 100644 method_comparison/MetaMathQA/experiments/lora/llama-3.2-3B-rank32-mica/adapter_config.json diff --git a/docs/source/developer_guides/lora.md b/docs/source/developer_guides/lora.md index 56c1c3ad30..000c9fb888 100644 --- a/docs/source/developer_guides/lora.md +++ b/docs/source/developer_guides/lora.md @@ -54,6 +54,19 @@ lora_config = LoraConfig(init_lora_weights="pissa_niter_[number of iters]", ...) ``` For detailed instruction on using PiSSA, please follow [these instructions](https://github.com/huggingface/peft/tree/main/examples/pissa_finetuning). +### MiCA + +[MiCA](https://arxiv.org/abs/2604.01694) (Minor Component Adaptation) is a complement to PiSSA: instead of initializing from the *principal* singular components, MiCA uses the *minor* ones. Concretely, with `W = U Σ V^T`, MiCA sets `B = U[:, -r:]` (the `r` left singular vectors associated with the smallest singular values) and `A = 0`. During training, only `A` is updated; `B` is frozen. The intuition is that the minor singular directions are largely unused by the pretrained task and therefore offer a more "plastic" subspace for injecting new knowledge while preserving pretrained capabilities. + +Because `A == 0` at init, the adapter contribution `B · A == 0` and the model output is preserved exactly at step 0 — no residual subtraction on the base weight is needed (unlike PiSSA). Since only `A` is trainable, the trainable parameter count for matching `r` is roughly half that of LoRA. + +```python +from peft import LoraConfig +config = LoraConfig(init_lora_weights="mica", r=16, target_modules=["q_proj", "v_proj"], ...) +``` + +MiCA currently supports `nn.Linear` target modules only and requires `r <= min(in_features, out_features)` for every targeted layer. For detailed usage, see [these instructions](https://github.com/huggingface/peft/tree/main/examples/mica_finetuning). + ### CorDA [CorDA](https://huggingface.co/papers/2406.05223) builds task-aware LoRA adapters from weight decomposition oriented by the context of downstream task to learn (instruction-previewed mode, IPM) or world knowledge to maintain (knowledge-preserved mode, KPM). diff --git a/examples/mica_finetuning/README.md b/examples/mica_finetuning/README.md new file mode 100644 index 0000000000..3a870a6198 --- /dev/null +++ b/examples/mica_finetuning/README.md @@ -0,0 +1,80 @@ +# MiCA: Minor Component Adaptation + +## Introduction ([Paper](https://arxiv.org/abs/2604.01694)) + +Minor Component Adaptation (MiCA) is a parameter-efficient fine-tuning method closely related to LoRA. Like LoRA, MiCA inserts a low-rank update `ΔW = (α/r) · B · A` into a pretrained weight `W ∈ R^{out×in}`. Unlike LoRA, MiCA initializes the matrices from the singular value decomposition of `W` and trains only one of them: + +- Compute the SVD `W = U Σ V^T`. +- Initialize `B = U[:, -r:]` — the `r` left singular vectors associated with the **smallest** singular values. +- Initialize `A = 0`. +- During training, optimize only `A`; `W` and `B` remain frozen. + +The motivation is that the *minor* singular directions of a pretrained weight encode subspaces that are largely unused by the original task. Restricting adaptation to these directions provides a more "plastic" subspace for knowledge injection, with less risk of overwriting capabilities encoded in the dominant subspace. Empirically MiCA improves knowledge acquisition while reducing the trainable parameter footprint compared with LoRA at the same rank (because only `A` is trained, the parameter count is roughly halved for matching `r`). + +Because `A == 0` at initialization, the adapter contribution `B · A == 0` and the model's forward output is preserved exactly at step 0 — no residual subtraction is needed on the base weight. + +## Quick Start + +```python +import torch +from peft import LoraConfig, get_peft_model +from transformers import AutoTokenizer, AutoModelForCausalLM +from trl import SFTConfig, SFTTrainer +from datasets import load_dataset + +model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-hf", dtype=torch.bfloat16, device_map="auto") +tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf") +tokenizer.pad_token_id = tokenizer.eos_token_id + +lora_config = LoraConfig( + init_lora_weights="mica", + r=16, + lora_alpha=16, + target_modules=["q_proj", "v_proj"], + task_type="CAUSAL_LM", +) +peft_model = get_peft_model(model, lora_config) +peft_model.print_trainable_parameters() + +dataset = load_dataset("imdb", split="train[:1%]") +training_args = SFTConfig(dataset_text_field="text", max_length=128) +trainer = SFTTrainer( + model=peft_model, + args=training_args, + train_dataset=dataset, + processing_class=tokenizer, +) +trainer.train() +peft_model.save_pretrained("mica-llama-2-7b") +``` + +To reload the trained adapter: + +```python +import torch +from peft import PeftModel +from transformers import AutoModelForCausalLM + +model = AutoModelForCausalLM.from_pretrained( + "meta-llama/Llama-2-7b-hf", dtype=torch.bfloat16, device_map="auto" +) +peft_model = PeftModel.from_pretrained(model, "mica-llama-2-7b") +``` + +## Notes and limitations + +- MiCA currently supports `nn.Linear` target modules only. +- The chosen rank must satisfy `r <= min(in_features, out_features)` for every targeted layer; otherwise initialization raises `ValueError`. +- MiCA performs a full SVD per target weight at initialization. For 7B-scale models this is a one-time cost of seconds; for substantially larger weight matrices (e.g. 70B-scale) the cost grows. +- Combining MiCA with `use_dora=True` or other LoRA variants is not supported in this initial integration. + +## Citation + +``` +@article{rudiger2026mica, + title={MiCA Learns More Knowledge Than LoRA and Full Fine-Tuning}, + author={R{\"u}diger, Sten and Raschka, Sebastian}, + journal={arXiv preprint arXiv:2604.01694}, + year={2026} +} +``` diff --git a/examples/mica_finetuning/mica_finetuning.py b/examples/mica_finetuning/mica_finetuning.py new file mode 100644 index 0000000000..39e11d3917 --- /dev/null +++ b/examples/mica_finetuning/mica_finetuning.py @@ -0,0 +1,80 @@ +# Copyright 2023-present the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Minimal MiCA fine-tuning example. + +Mirrors `examples/pissa_finetuning/pissa_finetuning.py` in spirit but with the MiCA-specific knobs only. MiCA +initializes `B` from the bottom-r left singular vectors of the base weight and freezes it during training; only +`A` is updated. Because `A == 0` at init, the adapter is a no-op on initialization and no residual subtraction +on the base weight is needed. +""" + +from dataclasses import dataclass, field +from typing import Optional + +import torch +from datasets import load_dataset +from transformers import AutoModelForCausalLM, AutoTokenizer, HfArgumentParser +from trl import SFTConfig, SFTTrainer + +from peft import LoraConfig, get_peft_model + + +@dataclass +class ScriptArguments(SFTConfig): + base_model_name_or_path: Optional[str] = field(default=None, metadata={"help": "Name or path of the base model."}) + lora_r: int = field(default=16) + lora_alpha: int = field(default=16) + lora_dropout: float = field(default=0.0) + target_modules: Optional[str] = field( + default="q_proj,v_proj", + metadata={"help": "Comma-separated module names to adapt with MiCA."}, + ) + data_path: str = field(default="imdb", metadata={"help": "HF dataset path."}) + dataset_split: str = field(default="train[:1%]") + dataset_text_field: str = field(default="text") + + +def train(): + parser = HfArgumentParser(ScriptArguments) + args = parser.parse_args_into_dataclasses()[0] + + model = AutoModelForCausalLM.from_pretrained(args.base_model_name_or_path, dtype=torch.bfloat16, device_map="auto") + tokenizer = AutoTokenizer.from_pretrained(args.base_model_name_or_path) + if tokenizer.pad_token_id is None: + tokenizer.pad_token_id = tokenizer.eos_token_id + + lora_config = LoraConfig( + init_lora_weights="mica", + r=args.lora_r, + lora_alpha=args.lora_alpha, + lora_dropout=args.lora_dropout, + target_modules=[m.strip() for m in args.target_modules.split(",")], + task_type="CAUSAL_LM", + ) + peft_model = get_peft_model(model, lora_config) + peft_model.print_trainable_parameters() + + dataset = load_dataset(args.data_path, split=args.dataset_split) + trainer = SFTTrainer( + model=peft_model, + args=args, + train_dataset=dataset, + processing_class=tokenizer, + ) + trainer.train() + peft_model.save_pretrained(args.output_dir) + + +if __name__ == "__main__": + train() diff --git a/method_comparison/MetaMathQA/experiments/lora/llama-3.2-3B-rank32-mica/adapter_config.json b/method_comparison/MetaMathQA/experiments/lora/llama-3.2-3B-rank32-mica/adapter_config.json new file mode 100644 index 0000000000..e62097f635 --- /dev/null +++ b/method_comparison/MetaMathQA/experiments/lora/llama-3.2-3B-rank32-mica/adapter_config.json @@ -0,0 +1,30 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": null, + "bias": "none", + "corda_config": null, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": false, + "init_lora_weights": "mica", + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 64, + "lora_bias": false, + "lora_dropout": 0.0, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 32, + "rank_pattern": {}, + "revision": null, + "target_modules": null, + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} diff --git a/src/peft/tuners/lora/config.py b/src/peft/tuners/lora/config.py index 73cb43343c..100ad7c752 100644 --- a/src/peft/tuners/lora/config.py +++ b/src/peft/tuners/lora/config.py @@ -357,7 +357,7 @@ class LoraConfig(PeftConfig): use the original default value of `lora_alpha/r`. modules_to_save (`List[str]`): List of modules apart from adapter layers to be set as trainable and saved in the final checkpoint. - init_lora_weights (`bool` | `Literal["gaussian", "eva", "olora", "pissa", "pissa_niter_[number of iters]", "corda", "loftq", "orthogonal"]`): + init_lora_weights (`bool` | `Literal["gaussian", "eva", "olora", "pissa", "pissa_niter_[number of iters]", "corda", "loftq", "orthogonal", "mica"]`): How to initialize the weights of the adapter layers. Passing True (default) results in the default initialization from the reference implementation from Microsoft, with the LoRA B weight being set to 0. This means that without further training, the LoRA adapter will be a no-op. Setting the initialization to @@ -379,7 +379,10 @@ class LoraConfig(PeftConfig): converges even more rapidly than PiSSA in Instruction-Previewed Mode, and preserves world knowledge better than LoRA in Knowledge-Preserved Mode. Passing `"orthogonal"` results in LoRA A and B being intialized orthogonally; in this, it resembles `"olora"`, but the base weights are left untouched (requires `r` to be - even, only supported for linear layers for now). + even, only supported for linear layers for now). Passing `"mica"` results in the initialization of Minor Component Adaptation (MiCA), which initializes B from + the r left singular vectors of the base weight associated with the smallest singular values, sets A to + zero, and freezes B during training; only A is updated. Currently only supported for linear layers. layers_to_transform (`Union[List[int], int]`): The layer indices to transform. If a list of ints is passed, it will apply the adapter to the layer indices that are specified in this list. If a single integer is passed, it will apply the transformations on the @@ -512,7 +515,17 @@ class LoraConfig(PeftConfig): ) init_lora_weights: ( bool - | Literal["gaussian", "eva", "olora", "pissa", "pissa_niter_[number of iters]", "corda", "loftq", "orthogonal"] + | Literal[ + "gaussian", + "eva", + "olora", + "pissa", + "pissa_niter_[number of iters]", + "corda", + "loftq", + "orthogonal", + "mica", + ] ) = field( default=True, metadata={ @@ -532,7 +545,10 @@ class LoraConfig(PeftConfig): "nonnegative integer. " "Passing `'corda'` results in CorDA initialization. " "Pass `'loftq'` to use LoftQ initialization. " - "Pass `'orthogonal'` for orthogonal initialization of LoRA A and B." + "Pass `'orthogonal'` for orthogonal initialization of LoRA A and B. " + "Pass `'mica'` to use MiCA initialization, where B is set to the r left singular vectors of the " + "base weight associated with the smallest singular values, A is set to zero, and B is frozen during " + "training (only A is updated)." ), }, ) diff --git a/src/peft/tuners/lora/layer.py b/src/peft/tuners/lora/layer.py index eba67e76eb..a85462e01b 100644 --- a/src/peft/tuners/lora/layer.py +++ b/src/peft/tuners/lora/layer.py @@ -96,6 +96,17 @@ def forward( """ raise NotImplementedError + @staticmethod + def update_requires_grad(module: LoraLayer, adapter_name: str) -> None: + """ + Adjust `requires_grad` of the adapter parameters after the tuner's default trainability marking. + + This hook is invoked from the LoRA model's `_mark_only_adapters_as_trainable` once for every adapter that uses + this variant. The default implementation is a no-op so existing variants keep their current behavior; variants + like MiCA override this to freeze a subset of the adapter parameters (e.g. `lora_B`). + """ + return None + class LoraLayer(BaseTunerLayer): # All names of layers that may contain (trainable) adapter weights @@ -228,6 +239,9 @@ def update_layer( elif isinstance(init_lora_weights, str) and init_lora_weights.lower() == "olora": with gather_params_ctx(self.get_base_layer().weight): self.olora_init(adapter_name) + elif isinstance(init_lora_weights, str) and init_lora_weights.lower() == "mica": + with gather_params_ctx(self.get_base_layer().weight): + self.mica_init(adapter_name) elif init_lora_weights == "loftq": with gather_params_ctx(self.get_base_layer().weight): self.loftq_init(adapter_name, config) @@ -392,6 +406,39 @@ def pissa_init(self, adapter_name, init_lora_weights): weight = transpose(weight.to(dtype), self.fan_in_fan_out) self.get_base_layer().weight.data = weight + def mica_init(self, adapter_name): + """Minor Component Adaptation (MiCA) initialization (https://arxiv.org/abs/2604.01694). + + Initializes `lora_B` from the `r` left singular vectors of the base weight associated with the smallest + singular values, and sets `lora_A` to zero. The `lora_B` matrix is frozen during training (see + `MiCALinearVariant.update_requires_grad`); only `lora_A` is updated. Because `lora_A == 0` at init, the adapter + contribution `B @ A == 0` and the base weight does not need to be modified to preserve the forward output. + """ + # When the adapter is being created under `init_empty_weights` (e.g. low_cpu_mem_usage=True), its parameters + # live on the meta device and will be filled in from a checkpoint after creation. Skip the SVD in that case. + if self.lora_B[adapter_name].weight.device.type == "meta": + return + weight = self.get_base_layer().weight + dtype = weight.dtype + if dtype not in [torch.float32, torch.float16, torch.bfloat16]: + raise TypeError("Please initialize MiCA under float32, float16, or bfloat16.") + weight = transpose(weight.to(torch.float32), self.fan_in_fan_out) + # weight has shape (out_features, in_features) once transposed for fan_in_fan_out, matching nn.Linear.weight. + # SVD: weight = V @ diag(S) @ Uh, with V: (out, k), Uh: (k, in), S sorted descending. + # MiCA selects the LAST r left singular vectors (smallest singular values) for B and zeroes A. + r = self.r[adapter_name] + max_r = min(weight.shape) + if r > max_r: + raise ValueError( + f"MiCA requires `r` <= min(in_features, out_features) but got r={r} for a layer with " + f"weight shape {tuple(weight.shape)} (max usable r is {max_r})." + ) + V, _, _ = torch.linalg.svd(weight.data, full_matrices=False) + lora_B = V[:, -r:].contiguous() + lora_A = torch.zeros(r, weight.shape[1], device=weight.device) + self.lora_B[adapter_name].weight.data = lora_B.to(dtype) + self.lora_A[adapter_name].weight.data = lora_A.to(dtype) + def corda_init(self, adapter_name, init_lora_weights): linear = self.get_base_layer() weight = linear.weight @@ -803,6 +850,11 @@ def resolve_lora_variant(self, config: LoraConfig, **kwargs) -> Optional[LoraVar return BdLoraLinearVariant() + if isinstance(config.init_lora_weights, str) and config.init_lora_weights.lower() == "mica": + from .variants import MiCALinearVariant + + return MiCALinearVariant() + use_alora = config.alora_invocation_tokens is not None if not config.use_dora and not use_alora: return None diff --git a/src/peft/tuners/lora/model.py b/src/peft/tuners/lora/model.py index 57c03b4fa5..4e069c82f8 100644 --- a/src/peft/tuners/lora/model.py +++ b/src/peft/tuners/lora/model.py @@ -333,6 +333,15 @@ def _create_and_replace( tp_size=self.model._tp_size, ) + def _mark_only_adapters_as_trainable(self, model: nn.Module) -> None: + super()._mark_only_adapters_as_trainable(model) + # Give each adapter's LoRA variant a chance to refine `requires_grad` after the default marking. Variants + # like MiCA override `update_requires_grad` to freeze a subset of the adapter parameters (e.g. lora_B). + for module in model.modules(): + if isinstance(module, LoraLayer): + for adapter_name, variant in module.lora_variant.items(): + variant.update_requires_grad(module, adapter_name) + def _replace_module(self, parent, child_name, new_module, child): # override in LoraModel to handle quantized weights properly diff --git a/src/peft/tuners/lora/variants.py b/src/peft/tuners/lora/variants.py index 8d990b7f26..55d8325796 100644 --- a/src/peft/tuners/lora/variants.py +++ b/src/peft/tuners/lora/variants.py @@ -921,3 +921,50 @@ def merge_unsafe(module: Linear, active_adapter: str, orig_weight: torch.Tensor) @staticmethod def unmerge(module: Linear, active_adapter: str, orig_weight: torch.Tensor) -> torch.Tensor: return orig_weight - BdLoraLinearVariant._get_bdlora_delta_weight(module, active_adapter) + + +class MiCALinearVariant(LoraVariant): + """Variant for Minor Component Adaptation (MiCA), https://arxiv.org/abs/2604.01694. + + The actual SVD-based initialization is performed in `LoraLayer.mica_init` (called from `update_layer`); this + variant only adds the freezing of `lora_B` after the tuner's default trainability marking. Forward and merge + semantics are identical to vanilla LoRA, since `delta_W = scaling * B @ A` and only `A` is updated. + """ + + @staticmethod + def init(module: Linear, adapter_name: str, **kwargs: Any) -> None: + # MiCA's adapter weights are populated in LoraLayer.mica_init before this hook runs; nothing to do here. + return None + + @staticmethod + def update_requires_grad(module: Linear, adapter_name: str) -> None: + if adapter_name in module.lora_B: + module.lora_B[adapter_name].weight.requires_grad = False + if module.lora_B[adapter_name].bias is not None: + module.lora_B[adapter_name].bias.requires_grad = False + + @staticmethod + def merge_safe(module: Linear, active_adapter: str, orig_weight: torch.Tensor) -> torch.Tensor: + return orig_weight + module.get_delta_weight(active_adapter).to(orig_weight.dtype) + + @staticmethod + def merge_unsafe(module: Linear, active_adapter: str, orig_weight: torch.Tensor) -> None: + orig_weight.data += module.get_delta_weight(active_adapter).to(orig_weight.dtype) + + @staticmethod + def unmerge(module: Linear, active_adapter: str, orig_weight: torch.Tensor) -> torch.Tensor: + return orig_weight - module.get_delta_weight(active_adapter).to(orig_weight.dtype) + + @staticmethod + def forward( + module: Linear, + active_adapter: str, + x: torch.Tensor, + result: torch.Tensor, + **kwargs, + ) -> torch.Tensor: + lora_A = module.lora_A[active_adapter] + lora_B = module.lora_B[active_adapter] + dropout = module.lora_dropout[active_adapter] + scaling = module.scaling[active_adapter] + return result + lora_B(lora_A(dropout(x))) * scaling diff --git a/tests/test_custom_models.py b/tests/test_custom_models.py index 0aea212d4e..f31b74076c 100644 --- a/tests/test_custom_models.py +++ b/tests/test_custom_models.py @@ -99,6 +99,19 @@ ), ("Vanilla MLP 7 LoRA with DoRA", "MLP", LoraConfig, {"target_modules": ["lin0"], "use_dora": True}), ("Vanilla MLP 8 LoRA with DoRA", "MLP", LoraConfig, {"target_modules": ["lin0", "lin1"], "use_dora": True}), + ( + "Vanilla MLP 1 LoRA with MiCA", + "MLP", + LoraConfig, + {"target_modules": ["lin0"], "init_lora_weights": "mica", "r": 4}, + ), + ( + "Vanilla MLP 2 LoRA with MiCA", + "MLP", + LoraConfig, + # lin1 is nn.Linear(20, 2), so the largest feasible MiCA rank for both targets is 2. + {"target_modules": ["lin0", "lin1"], "init_lora_weights": "mica", "r": 2}, + ), ( "Vanilla MLP 9 LoRA with DoRA", "MLP", @@ -2378,8 +2391,12 @@ def test_only_params_are_updated(self, test_name, model_id, config_cls, config_k for name, param_before in params_before.items(): param_after = params_after[name] if (model.prefix in name) or ("modules_to_save" in name) or ("token_adapter.trainable_tokens" in name): - # target_modules, modules_to_save and modules of `NewTokensWrapper` _are_ updated - assert not torch.allclose(param_before, param_after, atol=tol, rtol=tol) + # target_modules, modules_to_save and modules of `NewTokensWrapper` _are_ updated, except for adapter + # parameters that the variant intentionally freezes (e.g. MiCA freezes lora_B). + if not param_after.requires_grad: + assert torch.equal(param_before, param_after) + else: + assert not torch.allclose(param_before, param_after, atol=tol, rtol=tol) else: assert torch.allclose(param_before, param_after, atol=tol, rtol=tol) diff --git a/tests/test_initialization.py b/tests/test_initialization.py index 3be97bfab8..cdb9ad2990 100644 --- a/tests/test_initialization.py +++ b/tests/test_initialization.py @@ -387,6 +387,104 @@ def test_lora_pissa_linear_init_default(self, data): peft_model = get_peft_model(deepcopy(model), config) assert torch.allclose(output, peft_model(data)[0], atol=1e-06) + def test_lora_mica_linear_init_default(self, data): + # MiCA initializes A=0 and B = bottom-r left singular vectors of W. Because A=0, the adapter contribution + # B @ A is zero at init, so the forward output must equal the base model's output exactly. + model = self.get_model() + output = model(data)[0] + + config = LoraConfig(init_lora_weights="mica", target_modules=["linear"], r=8) + peft_model = get_peft_model(deepcopy(model), config) + + weight_A = peft_model.base_model.linear.lora_A["default"].weight + weight_B = peft_model.base_model.linear.lora_B["default"].weight + + # A must be zero + assert torch.all(weight_A == 0) + # B columns must be orthonormal (since they are left singular vectors) + eye = torch.eye(weight_B.shape[1], device=weight_B.device, dtype=weight_B.dtype) + assert torch.allclose(weight_B.t() @ weight_B, eye, atol=1e-4) + # Output at init equals the base output + assert torch.allclose(output, peft_model(data)[0], atol=1e-06) + + def test_lora_mica_uses_minor_components(self): + # Verify B equals the *minor* (smallest singular value) left singular vectors, not the major ones. + torch.manual_seed(0) + model = self.get_model() + r = 8 + + config = LoraConfig(init_lora_weights="mica", target_modules=["linear"], r=r) + peft_model = get_peft_model(deepcopy(model), config) + weight_B = peft_model.base_model.linear.lora_B["default"].weight.detach().cpu() + + # Reference SVD of the original weight + W = model.linear.weight.detach().cpu().to(torch.float32) + U, S, _ = torch.linalg.svd(W, full_matrices=False) + minor_U = U[:, -r:] + major_U = U[:, :r] + + # B should span the same subspace as `minor_U` (column spans match up to sign/orthogonal mixing within + # equal-singular-value groups). Equality of projectors is the right invariant. + proj_B = weight_B @ weight_B.t() + proj_minor = minor_U @ minor_U.t() + proj_major = major_U @ major_U.t() + assert torch.allclose(proj_B, proj_minor, atol=1e-4) + assert not torch.allclose(proj_B, proj_major, atol=1e-2) + + def test_lora_mica_freezes_B(self): + model = self.get_model() + config = LoraConfig(init_lora_weights="mica", target_modules=["linear"], r=8) + peft_model = get_peft_model(deepcopy(model), config) + + assert peft_model.base_model.linear.lora_A["default"].weight.requires_grad + assert not peft_model.base_model.linear.lora_B["default"].weight.requires_grad + + def test_lora_mica_train_step_updates_only_A(self, data): + # After one optimizer step, lora_A must change but lora_B must stay exactly equal. + torch.manual_seed(0) + model = self.get_model() + config = LoraConfig(init_lora_weights="mica", target_modules=["linear"], r=8) + peft_model = get_peft_model(deepcopy(model), config) + + A_before = peft_model.base_model.linear.lora_A["default"].weight.detach().clone() + B_before = peft_model.base_model.linear.lora_B["default"].weight.detach().clone() + + opt = torch.optim.SGD([p for p in peft_model.parameters() if p.requires_grad], lr=1.0) + out = peft_model(data)[0] + loss = out.sum() + loss.backward() + opt.step() + + A_after = peft_model.base_model.linear.lora_A["default"].weight.detach() + B_after = peft_model.base_model.linear.lora_B["default"].weight.detach() + assert not torch.equal(A_before, A_after) + assert torch.equal(B_before, B_after) + + def test_lora_mica_save_and_load(self, data, tmp_path): + # Save then reload onto the same base weights and verify outputs match exactly. + torch.manual_seed(0) + model = self.get_model() + + config = LoraConfig(init_lora_weights="mica", target_modules=["linear"], r=8) + peft_model = get_peft_model(deepcopy(model), config) + # mutate lora_A so the adapter is non-trivial + peft_model.base_model.linear.lora_A["default"].weight.data.normal_() + output_mica = peft_model(data)[0] + + peft_model.save_pretrained(tmp_path / "mica-model") + model_loaded = PeftModel.from_pretrained(deepcopy(model), tmp_path / "mica-model") + output_loaded = model_loaded(data)[0] + + assert torch.allclose(output_mica, output_loaded, atol=1e-6) + + def test_lora_mica_unsupported_layer_raises(self): + # MiCA is currently only implemented for nn.Linear. Trying to apply it to a non-Linear layer (e.g. conv2d) + # should raise rather than silently fall through to a default init. + model = self.get_model() + config = LoraConfig(init_lora_weights="mica", target_modules=["conv2d"], r=4) + with pytest.raises(ValueError, match="Unknown initialization"): + get_peft_model(deepcopy(model), config) + def test_lora_olora_linear_init_default(self, data): model = self.get_model() output = model(data)[0] diff --git a/tests/testing_common.py b/tests/testing_common.py index fc8161829c..5ac161964e 100644 --- a/tests/testing_common.py +++ b/tests/testing_common.py @@ -1033,7 +1033,11 @@ def _test_training(self, model_id, config_cls, config_kwargs): for n, param in model.named_parameters(): if (model.prefix in n) or ("modules_to_save" in n) or ("token_adapter.trainable_tokens" in n): - assert param.grad is not None + # variants like MiCA intentionally freeze a subset of adapter params, which won't have a grad + if param.requires_grad: + assert param.grad is not None + else: + assert param.grad is None else: assert param.grad is None From 50ecb0d45ffb2710fe46c16274b06619b9b35c9e Mon Sep 17 00:00:00 2001 From: Sten111 Date: Thu, 21 May 2026 16:52:12 +0200 Subject: [PATCH 2/4] changes for ruff --- src/peft/tuners/lora/layer.py | 2 +- tests/test_initialization.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/peft/tuners/lora/layer.py b/src/peft/tuners/lora/layer.py index d7311435a3..3be15ed09d 100644 --- a/src/peft/tuners/lora/layer.py +++ b/src/peft/tuners/lora/layer.py @@ -108,7 +108,7 @@ def update_requires_grad(module: LoraLayer, adapter_name: str) -> None: this variant. The default implementation is a no-op so existing variants keep their current behavior; variants like MiCA override this to freeze a subset of the adapter parameters (e.g. `lora_B`). """ - return None + return class LoraLayer(BaseTunerLayer): diff --git a/tests/test_initialization.py b/tests/test_initialization.py index d779d4ec2a..a175fcddee 100644 --- a/tests/test_initialization.py +++ b/tests/test_initialization.py @@ -421,7 +421,7 @@ def test_lora_mica_uses_minor_components(self): # Reference SVD of the original weight W = model.linear.weight.detach().cpu().to(torch.float32) - U, S, _ = torch.linalg.svd(W, full_matrices=False) + U, _S, _ = torch.linalg.svd(W, full_matrices=False) minor_U = U[:, -r:] major_U = U[:, :r] From 23622a307ebbd8ef3e6702d431ed93fac828d720 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sten=20R=C3=BCdiger?= <127544698+sr-networks@users.noreply.github.com> Date: Thu, 4 Jun 2026 18:13:08 +0200 Subject: [PATCH 3/4] Update src/peft/tuners/lora/layer.py Co-authored-by: Benjamin Bossan --- src/peft/tuners/lora/layer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/peft/tuners/lora/layer.py b/src/peft/tuners/lora/layer.py index 3be15ed09d..6cf02df522 100644 --- a/src/peft/tuners/lora/layer.py +++ b/src/peft/tuners/lora/layer.py @@ -436,7 +436,7 @@ def mica_init(self, adapter_name): f"MiCA requires `r` <= min(in_features, out_features) but got r={r} for a layer with " f"weight shape {tuple(weight.shape)} (max usable r is {max_r})." ) - V, _, _ = torch.linalg.svd(weight.data, full_matrices=False) + U, _, _ = torch.linalg.svd(weight.data, full_matrices=False) lora_B = V[:, -r:].contiguous() lora_A = torch.zeros(r, weight.shape[1], device=weight.device) self.lora_B[adapter_name].weight.data = lora_B.to(dtype) From cd6bcae9f12260b6fde07360f5ace761e3385859 Mon Sep 17 00:00:00 2001 From: Sten111 Date: Thu, 4 Jun 2026 18:24:50 +0200 Subject: [PATCH 4/4] Address MiCA trainability and embedding support --- docs/source/developer_guides/lora.md | 2 +- examples/mica_finetuning/README.md | 4 +- src/peft/tuners/lora/config.py | 2 +- src/peft/tuners/lora/layer.py | 62 ++++++++++++---- src/peft/tuners/lora/model.py | 9 --- src/peft/tuners/lora/variants.py | 65 ++++++++++++++--- src/peft/tuners/tuners_utils.py | 32 ++++++++ tests/test_custom_models.py | 7 -- tests/test_initialization.py | 105 +++++++++++++++++---------- 9 files changed, 205 insertions(+), 83 deletions(-) diff --git a/docs/source/developer_guides/lora.md b/docs/source/developer_guides/lora.md index f7687d274a..0fb251ed38 100644 --- a/docs/source/developer_guides/lora.md +++ b/docs/source/developer_guides/lora.md @@ -65,7 +65,7 @@ from peft import LoraConfig config = LoraConfig(init_lora_weights="mica", r=16, target_modules=["q_proj", "v_proj"], ...) ``` -MiCA currently supports `nn.Linear` target modules only and requires `r <= min(in_features, out_features)` for every targeted layer. For detailed usage, see [these instructions](https://github.com/huggingface/peft/tree/main/examples/mica_finetuning). +MiCA currently supports `nn.Linear` and `nn.Embedding` target modules. The chosen rank must satisfy `r <= min(in_features, out_features)` for linear layers and `r <= min(num_embeddings, embedding_dim)` for embedding layers. For detailed usage, see [these instructions](https://github.com/huggingface/peft/tree/main/examples/mica_finetuning). ### CorDA diff --git a/examples/mica_finetuning/README.md b/examples/mica_finetuning/README.md index 3a870a6198..205c9b2799 100644 --- a/examples/mica_finetuning/README.md +++ b/examples/mica_finetuning/README.md @@ -63,8 +63,8 @@ peft_model = PeftModel.from_pretrained(model, "mica-llama-2-7b") ## Notes and limitations -- MiCA currently supports `nn.Linear` target modules only. -- The chosen rank must satisfy `r <= min(in_features, out_features)` for every targeted layer; otherwise initialization raises `ValueError`. +- MiCA currently supports `nn.Linear` and `nn.Embedding` target modules. +- The chosen rank must satisfy `r <= min(in_features, out_features)` for linear layers and `r <= min(num_embeddings, embedding_dim)` for embedding layers; otherwise initialization raises `ValueError`. - MiCA performs a full SVD per target weight at initialization. For 7B-scale models this is a one-time cost of seconds; for substantially larger weight matrices (e.g. 70B-scale) the cost grows. - Combining MiCA with `use_dora=True` or other LoRA variants is not supported in this initial integration. diff --git a/src/peft/tuners/lora/config.py b/src/peft/tuners/lora/config.py index e614ebe98a..d057a904b8 100644 --- a/src/peft/tuners/lora/config.py +++ b/src/peft/tuners/lora/config.py @@ -433,7 +433,7 @@ class LoraConfig(PeftConfig): even, only supported for linear layers for now). Passing `"mica"` results in the initialization of Minor Component Adaptation (MiCA), which initializes B from the r left singular vectors of the base weight associated with the smallest singular values, sets A to - zero, and freezes B during training; only A is updated. Currently only supported for linear layers. + zero, and freezes B during training; only A is updated. Currently supported for linear and embedding layers. layers_to_transform (`Union[List[int], int]`): The layer indices to transform. If a list of ints is passed, it will apply the adapter to the layer indices that are specified in this list. If a single integer is passed, it will apply the transformations on the diff --git a/src/peft/tuners/lora/layer.py b/src/peft/tuners/lora/layer.py index 6cf02df522..9a2e6f6c02 100644 --- a/src/peft/tuners/lora/layer.py +++ b/src/peft/tuners/lora/layer.py @@ -99,17 +99,6 @@ def forward( """ raise NotImplementedError - @staticmethod - def update_requires_grad(module: LoraLayer, adapter_name: str) -> None: - """ - Adjust `requires_grad` of the adapter parameters after the tuner's default trainability marking. - - This hook is invoked from the LoRA model's `_mark_only_adapters_as_trainable` once for every adapter that uses - this variant. The default implementation is a no-op so existing variants keep their current behavior; variants - like MiCA override this to freeze a subset of the adapter parameters (e.g. `lora_B`). - """ - return - class LoraLayer(BaseTunerLayer): # All names of layers that may contain (trainable) adapter weights @@ -147,6 +136,10 @@ def __init__(self, base_layer: nn.Module, ephemeral_gpu_offload: bool = False, * self.in_features = in_features self.out_features = out_features + def delete_adapter(self, adapter_name: str) -> None: + super().delete_adapter(adapter_name) + self.lora_variant.pop(adapter_name, None) + def _get_in_out_features(self, module: nn.Module) -> tuple[int, int] | tuple[None, None]: return _get_in_out_features(module) @@ -414,20 +407,22 @@ def mica_init(self, adapter_name): Initializes `lora_B` from the `r` left singular vectors of the base weight associated with the smallest singular values, and sets `lora_A` to zero. The `lora_B` matrix is frozen during training (see - `MiCALinearVariant.update_requires_grad`); only `lora_A` is updated. Because `lora_A == 0` at init, the adapter + `MiCALinearVariant.init`); only `lora_A` is updated. Because `lora_A == 0` at init, the adapter contribution `B @ A == 0` and the base weight does not need to be modified to preserve the forward output. """ # When the adapter is being created under `init_empty_weights` (e.g. low_cpu_mem_usage=True), its parameters # live on the meta device and will be filled in from a checkpoint after creation. Skip the SVD in that case. if self.lora_B[adapter_name].weight.device.type == "meta": return + weight = self.get_base_layer().weight dtype = weight.dtype if dtype not in [torch.float32, torch.float16, torch.bfloat16]: raise TypeError("Please initialize MiCA under float32, float16, or bfloat16.") + weight = transpose(weight.to(torch.float32), self.fan_in_fan_out) # weight has shape (out_features, in_features) once transposed for fan_in_fan_out, matching nn.Linear.weight. - # SVD: weight = V @ diag(S) @ Uh, with V: (out, k), Uh: (k, in), S sorted descending. + # SVD: weight = U @ diag(S) @ Vh, with U: (out, k), Vh: (k, in), S sorted descending. # MiCA selects the LAST r left singular vectors (smallest singular values) for B and zeroes A. r = self.r[adapter_name] max_r = min(weight.shape) @@ -437,7 +432,7 @@ def mica_init(self, adapter_name): f"weight shape {tuple(weight.shape)} (max usable r is {max_r})." ) U, _, _ = torch.linalg.svd(weight.data, full_matrices=False) - lora_B = V[:, -r:].contiguous() + lora_B = U[:, -r:].contiguous() lora_A = torch.zeros(r, weight.shape[1], device=weight.device) self.lora_B[adapter_name].weight.data = lora_B.to(dtype) self.lora_A[adapter_name].weight.data = lora_A.to(dtype) @@ -1116,6 +1111,10 @@ def __init__( def resolve_lora_variant(self, *, config: LoraConfig, **kwargs) -> Optional[LoraVariant]: if config.velora_config is not None: raise ValueError("VeLoRA does not support adapting embedding layers.") + if isinstance(config.init_lora_weights, str) and config.init_lora_weights.lower() == "mica": + from .variants import MiCAEmbeddingVariant + + return MiCAEmbeddingVariant() if not config.use_dora: return None @@ -1168,7 +1167,10 @@ def update_layer( self.use_dora[adapter_name] = config.use_dora - if init_lora_weights == "loftq": + if isinstance(init_lora_weights, str) and init_lora_weights.lower() == "mica": + with gather_params_ctx(self.get_base_layer().weight): + self.mica_init(adapter_name) + elif init_lora_weights == "loftq": self.loftq_init(adapter_name) elif init_lora_weights == "lora_ga": # Embedding layers don't support LoRA-GA, fall back to standard initialization @@ -1197,6 +1199,36 @@ def output_fn(outputs): self.input_fns[adapter_name] = input_fn self.output_fns[adapter_name] = output_fn + def mica_init(self, adapter_name): + """Minor Component Adaptation (MiCA) initialization for embedding layers. + + The effective embedding projection has shape `(embedding_dim, num_embeddings)`, so MiCA initializes + `lora_embedding_B` from the minor left singular vectors of `base_layer.weight.T` and sets + `lora_embedding_A` to zero. + """ + if self.lora_embedding_B[adapter_name].device.type == "meta": + return + + weight = self.get_base_layer().weight + dtype = weight.dtype + if dtype not in [torch.float32, torch.float16, torch.bfloat16]: + raise TypeError("Please initialize MiCA under float32, float16, or bfloat16.") + + weight = weight.to(torch.float32).T + r = self.r[adapter_name] + max_r = min(weight.shape) + if r > max_r: + raise ValueError( + f"MiCA requires `r` <= min(num_embeddings, embedding_dim) but got r={r} for an embedding layer with " + f"weight shape {tuple(self.get_base_layer().weight.shape)} (max usable r is {max_r})." + ) + + U, _, _ = torch.linalg.svd(weight.data, full_matrices=False) + lora_embedding_B = U[:, -r:].contiguous() + lora_embedding_A = torch.zeros(r, weight.shape[1], device=weight.device) + self.lora_embedding_B[adapter_name].data = lora_embedding_B.to(dtype) + self.lora_embedding_A[adapter_name].data = lora_embedding_A.to(dtype) + def merge(self, safe_merge: bool = False, adapter_names: Optional[list[str]] = None) -> None: """ Merge the active adapter weights into the base weights diff --git a/src/peft/tuners/lora/model.py b/src/peft/tuners/lora/model.py index 8513b3ca08..8be2790268 100644 --- a/src/peft/tuners/lora/model.py +++ b/src/peft/tuners/lora/model.py @@ -333,15 +333,6 @@ def _create_and_replace( tp_size=self.model._tp_size, ) - def _mark_only_adapters_as_trainable(self, model: nn.Module) -> None: - super()._mark_only_adapters_as_trainable(model) - # Give each adapter's LoRA variant a chance to refine `requires_grad` after the default marking. Variants - # like MiCA override `update_requires_grad` to freeze a subset of the adapter parameters (e.g. lora_B). - for module in model.modules(): - if isinstance(module, LoraLayer): - for adapter_name, variant in module.lora_variant.items(): - variant.update_requires_grad(module, adapter_name) - def _replace_module(self, parent, child_name, new_module, child): # override in LoraModel to handle quantized weights properly diff --git a/src/peft/tuners/lora/variants.py b/src/peft/tuners/lora/variants.py index 055eb19cda..18763c5a98 100644 --- a/src/peft/tuners/lora/variants.py +++ b/src/peft/tuners/lora/variants.py @@ -1218,25 +1218,25 @@ def forward( return result +def _register_frozen_peft_weight(module: Linear | Embedding, adapter_name: str, weight_name: str) -> None: + frozen_peft_weight_names = module.frozen_peft_weight_names.copy() + frozen_names = frozen_peft_weight_names.get(adapter_name, ()) + frozen_peft_weight_names[adapter_name] = tuple(dict.fromkeys((*frozen_names, weight_name))) + module.frozen_peft_weight_names = frozen_peft_weight_names + module._freeze_declared_peft_weights(adapter_name) + + class MiCALinearVariant(LoraVariant): """Variant for Minor Component Adaptation (MiCA), https://arxiv.org/abs/2604.01694. The actual SVD-based initialization is performed in `LoraLayer.mica_init` (called from `update_layer`); this - variant only adds the freezing of `lora_B` after the tuner's default trainability marking. Forward and merge - semantics are identical to vanilla LoRA, since `delta_W = scaling * B @ A` and only `A` is updated. + variant declares `lora_B` as frozen. Forward and merge semantics are identical to vanilla LoRA, since + `delta_W = scaling * B @ A` and only `A` is updated. """ @staticmethod def init(module: Linear, adapter_name: str, **kwargs: Any) -> None: - # MiCA's adapter weights are populated in LoraLayer.mica_init before this hook runs; nothing to do here. - return None - - @staticmethod - def update_requires_grad(module: Linear, adapter_name: str) -> None: - if adapter_name in module.lora_B: - module.lora_B[adapter_name].weight.requires_grad = False - if module.lora_B[adapter_name].bias is not None: - module.lora_B[adapter_name].bias.requires_grad = False + _register_frozen_peft_weight(module, adapter_name, "lora_B") @staticmethod def merge_safe(module: Linear, active_adapter: str, orig_weight: torch.Tensor) -> torch.Tensor: @@ -1263,3 +1263,46 @@ def forward( dropout = module.lora_dropout[active_adapter] scaling = module.scaling[active_adapter] return result + lora_B(lora_A(dropout(x))) * scaling + + +class MiCAEmbeddingVariant(LoraVariant): + """Embedding variant for Minor Component Adaptation (MiCA), https://arxiv.org/abs/2604.01694.""" + + @staticmethod + def init(module: Embedding, adapter_name: str, **kwargs: Any) -> None: + _register_frozen_peft_weight(module, adapter_name, "lora_embedding_B") + + @staticmethod + def merge_safe(module: Embedding, active_adapter: str, orig_weight: torch.Tensor) -> torch.Tensor: + return orig_weight + module.get_delta_weight(active_adapter).to(orig_weight.dtype) + + @staticmethod + def merge_unsafe(module: Embedding, active_adapter: str, orig_weight: torch.Tensor) -> None: + orig_weight.data += module.get_delta_weight(active_adapter).to(orig_weight.dtype) + + @staticmethod + def unmerge(module: Embedding, active_adapter: str, orig_weight: torch.Tensor) -> torch.Tensor: + return orig_weight - module.get_delta_weight(active_adapter).to(orig_weight.dtype) + + @staticmethod + def forward( + module: Embedding, + active_adapter: str, + x: torch.Tensor, + result: torch.Tensor, + **kwargs, + ) -> torch.Tensor: + embedding_A = module.lora_embedding_A[active_adapter].T + embedding_B = module.lora_embedding_B[active_adapter].T + scaling = module.scaling[active_adapter] + input_fn = module.input_fns.get(active_adapter, None) + output_fn = module.output_fns.get(active_adapter, None) + + after_A = module._embed(x, embedding_A, input_fn=input_fn, output_fn=output_fn) + adapter_output = (after_A @ embedding_B) * scaling + + embed_scale = module._get_embed_scale() + if embed_scale is not None: + adapter_output = adapter_output * embed_scale.to(adapter_output.dtype) + + return result + adapter_output diff --git a/src/peft/tuners/tuners_utils.py b/src/peft/tuners/tuners_utils.py index 7227e869dd..84ac173118 100644 --- a/src/peft/tuners/tuners_utils.py +++ b/src/peft/tuners/tuners_utils.py @@ -495,6 +495,10 @@ def _mark_only_adapters_as_trainable(self, model: nn.Module) -> None: else: raise NotImplementedError(f"Requested bias: {bias}, is not implemented.") + for module in model.modules(): + if isinstance(module, self.tuner_layer_cls): + module._freeze_declared_peft_weights() + def _enable_adapter_layers(self, enabled: bool = True) -> None: for module in self.model.modules(): if isinstance(module, (BaseTunerLayer, AuxiliaryTrainingWrapper)): @@ -1385,6 +1389,8 @@ class BaseTunerLayer(ABC): adapter_layer_names: tuple[str, ...] = () # All names of other parameters that may contain adapter-related parameters other_param_names: tuple[str, ...] = () + # Mapping from adapter name to adapter layer names that should always stay frozen + frozen_peft_weight_names: dict[str, tuple[str, ...]] = {} # indicates whether all adapters should be disabled _disable_adapters: bool = False @@ -1521,6 +1527,23 @@ def enable_adapters(self, enabled: bool) -> None: _set_layer_requires_grad(layer, False) self._disable_adapters = True + def _freeze_declared_peft_weights(self, adapter_names: str | Sequence[str] | None = None) -> None: + if adapter_names is None: + adapter_names = self.frozen_peft_weight_names.keys() + elif isinstance(adapter_names, str): + adapter_names = [adapter_names] + + for adapter_name in adapter_names: + for layer_name in self.frozen_peft_weight_names.get(adapter_name, ()): + if layer_name not in self.adapter_layer_names: + continue + + module_dict = getattr(self, layer_name) + if adapter_name not in module_dict: + continue + + _set_layer_requires_grad(module_dict[adapter_name], False) + def set_adapter(self, adapter_names: str | list[str], inference_mode: bool = False) -> None: """Set the active adapter(s). @@ -1543,6 +1566,7 @@ def set_adapter(self, adapter_names: str | list[str], inference_mode: bool = Fal should_require_grad = (key in adapter_names) and (not inference_mode) _set_layer_requires_grad(layer, should_require_grad) + self._freeze_declared_peft_weights(adapter_names) self._active_adapter = adapter_names def _all_available_adapter_names(self) -> list[str]: @@ -1573,6 +1597,11 @@ def delete_adapter(self, adapter_name: str) -> None: if adapter_name in getattr(self, attr): del getattr(self, attr)[adapter_name] + if adapter_name in self.frozen_peft_weight_names: + frozen_peft_weight_names = self.frozen_peft_weight_names.copy() + del frozen_peft_weight_names[adapter_name] + self.frozen_peft_weight_names = frozen_peft_weight_names + if adapter_name in self.active_adapters: # choose a new active adapter active_adapters = self.active_adapters[:] @@ -1614,6 +1643,9 @@ def set_requires_grad(self, adapter_names: str | Sequence[str], requires_grad: b if key in adapter_names_set: _set_layer_requires_grad(layer, requires_grad) + if requires_grad: + self._freeze_declared_peft_weights(adapter_names_set) + def _get_base_layer_device_and_dtype(self, base_layer): """ Helper function to determine the device and dtype of the base layer. If not possible to determine, return None. diff --git a/tests/test_custom_models.py b/tests/test_custom_models.py index c089933277..4cb8f1fc89 100644 --- a/tests/test_custom_models.py +++ b/tests/test_custom_models.py @@ -107,13 +107,6 @@ LoraConfig, {"target_modules": ["lin0"], "init_lora_weights": "mica", "r": 4}, ), - ( - "Vanilla MLP 2 LoRA with MiCA", - "MLP", - LoraConfig, - # lin1 is nn.Linear(20, 2), so the largest feasible MiCA rank for both targets is 2. - {"target_modules": ["lin0", "lin1"], "init_lora_weights": "mica", "r": 2}, - ), ( "Vanilla MLP 9 LoRA with DoRA", "MLP", diff --git a/tests/test_initialization.py b/tests/test_initialization.py index a175fcddee..379e976f58 100644 --- a/tests/test_initialization.py +++ b/tests/test_initialization.py @@ -409,6 +409,32 @@ def test_lora_mica_linear_init_default(self, data): # Output at init equals the base output assert torch.allclose(output, peft_model(data)[0], atol=1e-06) + def test_lora_mica_embedding_init_default(self): + class EmbeddingModel(nn.Module): + def __init__(self): + super().__init__() + self.embed = nn.Embedding(7, 5) + + def forward(self, x): + return self.embed(x) + + model = EmbeddingModel() + data = torch.arange(7).unsqueeze(0) + output = model(data) + + config = LoraConfig(init_lora_weights="mica", target_modules=["embed"], r=3) + peft_model = get_peft_model(deepcopy(model), config) + + weight_A = peft_model.base_model.embed.lora_embedding_A["default"] + weight_B = peft_model.base_model.embed.lora_embedding_B["default"] + + assert torch.all(weight_A == 0) + eye = torch.eye(weight_B.shape[1], device=weight_B.device, dtype=weight_B.dtype) + assert torch.allclose(weight_B.t() @ weight_B, eye, atol=1e-4) + assert weight_A.requires_grad + assert not weight_B.requires_grad + assert torch.allclose(output, peft_model(data), atol=1e-06) + def test_lora_mica_uses_minor_components(self): # Verify B equals the *minor* (smallest singular value) left singular vectors, not the major ones. torch.manual_seed(0) @@ -441,51 +467,56 @@ def test_lora_mica_freezes_B(self): assert peft_model.base_model.linear.lora_A["default"].weight.requires_grad assert not peft_model.base_model.linear.lora_B["default"].weight.requires_grad - def test_lora_mica_train_step_updates_only_A(self, data): - # After one optimizer step, lora_A must change but lora_B must stay exactly equal. - torch.manual_seed(0) - model = self.get_model() - config = LoraConfig(init_lora_weights="mica", target_modules=["linear"], r=8) - peft_model = get_peft_model(deepcopy(model), config) + def test_lora_mica_freezes_B_when_switching_adapters(self): + class SimpleMlp(nn.Module): + def __init__(self): + super().__init__() + self.fc1 = nn.Linear(10, 10) + self.fc2 = nn.Linear(10, 10) - A_before = peft_model.base_model.linear.lora_A["default"].weight.detach().clone() - B_before = peft_model.base_model.linear.lora_B["default"].weight.detach().clone() + def forward(self, x): + x = torch.relu(self.fc1(x)) + return self.fc2(x) - opt = torch.optim.SGD([p for p in peft_model.parameters() if p.requires_grad], lr=1.0) - out = peft_model(data)[0] - loss = out.sum() - loss.backward() - opt.step() + def trainable_parameters(model): + return [name for name, param in model.named_parameters() if param.requires_grad] - A_after = peft_model.base_model.linear.lora_A["default"].weight.detach() - B_after = peft_model.base_model.linear.lora_B["default"].weight.detach() - assert not torch.equal(A_before, A_after) - assert torch.equal(B_before, B_after) + config0 = LoraConfig(target_modules=["fc1"], init_lora_weights="mica", r=4) + model = get_peft_model(SimpleMlp(), config0) + assert trainable_parameters(model) == ["base_model.model.fc1.lora_A.default.weight"] - def test_lora_mica_save_and_load(self, data, tmp_path): - # Save then reload onto the same base weights and verify outputs match exactly. - torch.manual_seed(0) - model = self.get_model() + config1 = LoraConfig(target_modules=["fc1", "fc2"], init_lora_weights="mica", r=4) + model.add_adapter("other", config1) + model.set_adapter("other") + assert trainable_parameters(model) == [ + "base_model.model.fc1.lora_A.other.weight", + "base_model.model.fc2.lora_A.other.weight", + ] - config = LoraConfig(init_lora_weights="mica", target_modules=["linear"], r=8) - peft_model = get_peft_model(deepcopy(model), config) - # mutate lora_A so the adapter is non-trivial - peft_model.base_model.linear.lora_A["default"].weight.data.normal_() - output_mica = peft_model(data)[0] + model.set_adapter("default") + assert trainable_parameters(model) == ["base_model.model.fc1.lora_A.default.weight"] - peft_model.save_pretrained(tmp_path / "mica-model") - model_loaded = PeftModel.from_pretrained(deepcopy(model), tmp_path / "mica-model") - output_loaded = model_loaded(data)[0] + model.delete_adapter("other") + config2 = LoraConfig(target_modules=["fc1"], r=4) + model.add_adapter("other", config2) + model.set_adapter("other") + assert trainable_parameters(model) == [ + "base_model.model.fc1.lora_A.other.weight", + "base_model.model.fc1.lora_B.other.weight", + ] - assert torch.allclose(output_mica, output_loaded, atol=1e-6) + def test_lora_mica_rank_too_large_raises(self): + class SimpleModel(nn.Module): + def __init__(self): + super().__init__() + self.linear = nn.Linear(2, 3) - def test_lora_mica_unsupported_layer_raises(self): - # MiCA is currently only implemented for nn.Linear. Trying to apply it to a non-Linear layer (e.g. conv2d) - # should raise rather than silently fall through to a default init. - model = self.get_model() - config = LoraConfig(init_lora_weights="mica", target_modules=["conv2d"], r=4) - with pytest.raises(ValueError, match="Unknown initialization"): - get_peft_model(deepcopy(model), config) + def forward(self, x): + return self.linear(x) + + config = LoraConfig(init_lora_weights="mica", target_modules=["linear"], r=3) + with pytest.raises(ValueError, match="MiCA requires `r` <= min"): + get_peft_model(SimpleModel(), config) def test_lora_olora_linear_init_default(self, data): model = self.get_model()