From cce64a5fa5ddbdf7d490b0f87981b02a4046e2f4 Mon Sep 17 00:00:00 2001 From: nemo Date: Fri, 29 May 2026 14:36:12 +0200 Subject: [PATCH 01/33] Starting with prompt tuning methods --- docs/source/_toctree.yml | 28 +- .../{tutorial => guides}/peft_integrations.md | 0 .../{tutorial => guides}/peft_model_config.md | 0 docs/source/index.md | 9 +- docs/source/methods/overview.md | 94 ++++++ docs/source/package_reference/lora.md | 11 + docs/source/package_reference/p_tuning.md | 43 ++- .../source/package_reference/prefix_tuning.md | 59 +++- .../source/package_reference/prompt_tuning.md | 42 ++- docs/source/quicktour.md | 40 +++ .../task_guides/prompt_based_methods.md | 302 ------------------ 11 files changed, 301 insertions(+), 327 deletions(-) rename docs/source/{tutorial => guides}/peft_integrations.md (100%) rename docs/source/{tutorial => guides}/peft_model_config.md (100%) create mode 100644 docs/source/methods/overview.md delete mode 100644 docs/source/task_guides/prompt_based_methods.md diff --git a/docs/source/_toctree.yml b/docs/source/_toctree.yml index a9db3fa19a..3cb51eb70c 100644 --- a/docs/source/_toctree.yml +++ b/docs/source/_toctree.yml @@ -7,13 +7,6 @@ - local: install title: Installation -- title: Tutorial - sections: - - local: tutorial/peft_model_config - title: Configurations and models - - local: tutorial/peft_integrations - title: Integrations - - title: PEFT method guides sections: - local: task_guides/prompt_based_methods @@ -23,8 +16,18 @@ - local: task_guides/ia3 title: IA3 -- title: Developer guides +- title: Guides sections: + - local: guides/peft_model_config + title: Configurations and models + - local: guides/peft_integrations + title: Integrations + - sections: + - local: accelerate/deepspeed + title: DeepSpeed + - local: accelerate/fsdp + title: Fully Sharded Data Parallel + title: Distributed Training - local: developer_guides/model_merging title: Model merging - local: developer_guides/quantization @@ -36,7 +39,7 @@ - local: 
developer_guides/low_level_api title: Adapter injection - local: developer_guides/mixed_models - title: Mixed adapter types + title: Mixing PEFT methods - local: developer_guides/torch_compile title: torch.compile - local: developer_guides/contributing @@ -46,12 +49,7 @@ - local: developer_guides/checkpoint title: PEFT checkpoint format -- title: 🤗 Accelerate integrations - sections: - - local: accelerate/deepspeed - title: DeepSpeed - - local: accelerate/fsdp - title: Fully Sharded Data Parallel + - title: Conceptual guides sections: diff --git a/docs/source/tutorial/peft_integrations.md b/docs/source/guides/peft_integrations.md similarity index 100% rename from docs/source/tutorial/peft_integrations.md rename to docs/source/guides/peft_integrations.md diff --git a/docs/source/tutorial/peft_model_config.md b/docs/source/guides/peft_model_config.md similarity index 100% rename from docs/source/tutorial/peft_model_config.md rename to docs/source/guides/peft_model_config.md diff --git a/docs/source/index.md b/docs/source/index.md index d38544311f..db205a1e99 100644 --- a/docs/source/index.md +++ b/docs/source/index.md @@ -16,10 +16,17 @@ rendered properly in your Markdown viewer. # PEFT -🤗 PEFT (Parameter-Efficient Fine-Tuning) is a library for efficiently adapting large pretrained models to various downstream applications without fine-tuning all of a model's parameters because it is prohibitively costly. PEFT methods only fine-tune a small number of (extra) model parameters - significantly decreasing computational and storage costs - while yielding performance comparable to a fully fine-tuned model. This makes it more accessible to train and store large language models (LLMs) on consumer hardware. +🤗 PEFT (Parameter-Efficient Fine-Tuning) is a library for efficiently adapting large pretrained models to various downstream applications without fine-tuning all of a model's parameters because it is prohibitively costly. 
PEFT methods only fine-tune a small number of (extra) model parameters - significantly decreasing computational and storage costs - while yielding performance comparable to a fully fine-tuned model. This makes it more accessible to train and store large language models (LLMs) and other big models on consumer hardware. PEFT is integrated with the Transformers, Diffusers, and Accelerate libraries to provide a faster and easier way to load, train, and use large models for inference. +
+
+ There are numerous methods to "adapt" existing models, often extensively integrating into the model. PEFT can be thought of as a framework for arbitrary methods of model adaptation (modifying weights, wrapping layers, manipulating KV-caches, ...) while also serving as a reference implementation for many fine-tuning methods.
+
+
+
+ + + + + +# Prompt-based methods + +A prompt can describe a task or provide an example of a task you want the model to learn. Instead of manually creating these prompts, soft prompting methods add learnable parameters to the input embeddings that can be optimized for a specific task while keeping the pretrained model's parameters frozen. This makes it both faster and easier to finetune large language models (LLMs) for new downstream tasks. + +The PEFT library supports several types of prompting methods (p-tuning, prefix tuning, prompt tuning) and you can learn more about how these methods work conceptually in the [Soft prompts](../conceptual_guides/prompting) guide. If you're interested in applying these methods to other tasks and use cases, take a look at our [notebook collection](https://huggingface.co/spaces/PEFT/soft-prompting)! + +This guide will show you how to train a causal language model - with a soft prompting method - to *generate a classification* for whether a tweet is a complaint or not. + +> [!TIP] +> Some familiarity with the general process of training a causal language model would be really helpful and allow you to focus on the soft prompting methods. If you're new, we recommend taking a look at the [Causal language modeling](https://huggingface.co/docs/transformers/tasks/language_modeling) guide first from the Transformers documentation. When you're ready, come back and see how easy it is to drop PEFT in to your training! + + + +### PEFT configuration and model + +For any PEFT method, you'll need to create a configuration which contains all the parameters that specify how the PEFT method should be applied. Once the configuration is setup, pass it to the [`~peft.get_peft_model`] function along with the base model to create a trainable [`PeftModel`]. + +> [!TIP] +> Call the [`~PeftModel.print_trainable_parameters`] method to compare the number of trainable parameters of [`PeftModel`] versus the number of parameters in the base model! 
+ + + + +[P-tuning](../conceptual_guides/prompting#p-tuning) adds a trainable embedding tensor where the prompt tokens can be added anywhere in the input sequence. Create a [`PromptEncoderConfig`] with the task type, the number of virtual tokens to add and learn, and the hidden size of the encoder for learning the prompt parameters. + +```py +from peft import PromptEncoderConfig, get_peft_model + +peft_config = PromptEncoderConfig(task_type="CAUSAL_LM", num_virtual_tokens=20, encoder_hidden_size=128) +model = get_peft_model(model, peft_config) +model.print_trainable_parameters() +"trainable params: 300,288 || all params: 559,514,880 || trainable%: 0.05366935013417338" +``` + + + + +[Prefix tuning](../conceptual_guides/prompting#prefix-tuning) adds task-specific parameters in all of the model layers, which are optimized by a separate feed-forward network. Create a [`PrefixTuningConfig`] with the task type and number of virtual tokens to add and learn. + +```py +from peft import PrefixTuningConfig, get_peft_model + +peft_config = PrefixTuningConfig(task_type="CAUSAL_LM", num_virtual_tokens=20) +model = get_peft_model(model, peft_config) +model.print_trainable_parameters() +"trainable params: 983,040 || all params: 560,197,632 || trainable%: 0.1754809274167014" +``` + + + + +[Prompt tuning](../conceptual_guides/prompting#prompt-tuning) formulates all tasks as a *generation* task and it adds a task-specific prompt to the input which is updated independently. The `prompt_tuning_init_text` parameter specifies how to finetune the model (in this case, it is classifying whether tweets are complaints or not). For the best results, the `prompt_tuning_init_text` should have the same number of tokens that should be predicted. To do this, you can set `num_virtual_tokens` to the number of tokens of the `prompt_tuning_init_text`. 
+ +Create a [`PromptTuningConfig`] with the task type, the initial prompt tuning text to train the model with, the number of virtual tokens to add and learn, and a tokenizer. + +```py +from peft import PromptTuningConfig, PromptTuningInit, get_peft_model + +prompt_tuning_init_text = "Classify if the tweet is a complaint or no complaint.\n" +peft_config = PromptTuningConfig( + task_type="CAUSAL_LM", + prompt_tuning_init=PromptTuningInit.TEXT, + num_virtual_tokens=len(tokenizer(prompt_tuning_init_text)["input_ids"]), + prompt_tuning_init_text=prompt_tuning_init_text, + tokenizer_name_or_path="bigscience/bloomz-560m", +) +model = get_peft_model(model, peft_config) +model.print_trainable_parameters() +"trainable params: 8,192 || all params: 559,222,784 || trainable%: 0.0014648902430985358" +``` + + + + diff --git a/docs/source/package_reference/lora.md b/docs/source/package_reference/lora.md index a7b8688178..52c9f87de8 100644 --- a/docs/source/package_reference/lora.md +++ b/docs/source/package_reference/lora.md @@ -94,3 +94,14 @@ The abstract from the paper is: ## Intruder Dimension Reduction [[autodoc]] tuners.lora.intruders.reduce_intruder_dimension + + +## Benchmark overview + + + diff --git a/docs/source/package_reference/p_tuning.md b/docs/source/package_reference/p_tuning.md index a35f7244c3..1b38708cee 100644 --- a/docs/source/package_reference/p_tuning.md +++ b/docs/source/package_reference/p_tuning.md @@ -16,16 +16,55 @@ rendered properly in your Markdown viewer. # P-tuning -[P-tuning](https://hf.co/papers/2103.10385) adds trainable prompt embeddings to the input that is optimized by a prompt encoder to find a better prompt, eliminating the need to manually design prompts. The prompt tokens can be added anywhere in the input sequence, and p-tuning also introduces anchor tokens for improving performance. +
+ +
+Prompt tokens can be inserted anywhere in the input sequence, and they are optimized by a prompt encoder
(image source). + +[P-tuning](https://hf.co/papers/2103.10385) is designed for natural language understanding (NLU) tasks and all language models. The abstract from the paper is: *While GPTs with traditional fine-tuning fail to achieve strong results on natural language understanding (NLU), we show that GPTs can be better than or comparable to similar-sized BERTs on NLU tasks with a novel method P-tuning -- which employs trainable continuous prompt embeddings. On the knowledge probing (LAMA) benchmark, the best GPT recovers 64\% (P@1) of world knowledge without any additional text provided during test time, which substantially improves the previous best by 20+ percentage points. On the SuperGlue benchmark, GPTs achieve comparable and sometimes better performance to similar-sized BERTs in supervised learning. Importantly, we find that P-tuning also improves BERTs' performance in both few-shot and supervised settings while largely reducing the need for prompt engineering. Consequently, P-tuning outperforms the state-of-the-art approaches on the few-shot SuperGlue benchmark.*. +The method adds trainable prompt embeddings to the input that are optimized by a prompt encoder to find a better prompt, eliminating the need to manually design prompts. The prompt tokens can be added anywhere in the input sequence, and p-tuning also introduces anchor tokens for improving performance. A prompt encoder (a bidirectional long short-term memory network or LSTM) is used to optimize the prompt parameters.
Unlike prefix tuning: + +- the prompt tokens can be inserted anywhere in the input sequence, and they aren't restricted to only the beginning +- the prompt tokens are only added to the input instead of adding them to every layer of the model +- introducing *anchor* tokens can improve performance because they indicate characteristics of a component in the input sequence + +The paper's results suggest that P-tuning is more efficient than manually crafting prompts, and it enables GPT-like models to compete with BERT-like models on NLU tasks. + +## Usage + +Create a [`PromptEncoderConfig`] with the task type, the number of virtual tokens to add and learn, and the hidden size of the encoder for learning the prompt parameters. + +```py +from peft import PromptEncoderConfig, get_peft_model + +peft_config = PromptEncoderConfig(task_type="CAUSAL_LM", num_virtual_tokens=20, encoder_hidden_size=128) +model = get_peft_model(model, peft_config) +model.print_trainable_parameters() +"trainable params: 300,288 || all params: 559,514,880 || trainable%: 0.05366935013417338" +``` + +## Benchmark overview + + + + +# API + ## PromptEncoderConfig [[autodoc]] tuners.p_tuning.config.PromptEncoderConfig ## PromptEncoder -[[autodoc]] tuners.p_tuning.model.PromptEncoder \ No newline at end of file +[[autodoc]] tuners.p_tuning.model.PromptEncoder + diff --git a/docs/source/package_reference/prefix_tuning.md b/docs/source/package_reference/prefix_tuning.md index 9d722da219..76031f9da8 100644 --- a/docs/source/package_reference/prefix_tuning.md +++ b/docs/source/package_reference/prefix_tuning.md @@ -16,14 +16,40 @@ rendered properly in your Markdown viewer. # Prefix tuning +
+ +
+Optimize the prefix parameters for each task (image source). + [Prefix tuning](https://hf.co/papers/2101.00190) prefixes a series of task-specific vectors to the input sequence that can be learned while keeping the pretrained model frozen. The prefix parameters are inserted in all of the model layers. +The abstract from the paper is: + +*Fine-tuning is the de facto way to leverage large pretrained language models to perform downstream tasks. However, it modifies all the language model parameters and therefore necessitates storing a full copy for each task. In this paper, we propose prefix-tuning, a lightweight alternative to fine-tuning for natural language generation tasks, which keeps language model parameters frozen, but optimizes a small continuous task-specific vector (called the prefix). Prefix-tuning draws inspiration from prompting, allowing subsequent tokens to attend to this prefix as if it were "virtual tokens". We apply prefix-tuning to GPT-2 for table-to-text generation and to BART for summarization. We find that by learning only 0.1\% of the parameters, prefix-tuning obtains comparable performance in the full data setting, outperforms fine-tuning in low-data settings, and extrapolates better to examples with topics unseen during training*. + **Note** For encoder-decoder models (seq2seq), the prefix is only applied to the decoder, which does not correspond to the paper specification (see e.g. Figure 2). Prefix tuning can still be fine-tuned on these model architectures but the performance could be sub-par; consider using other PEFT methods for encoder-decoder models. -## Possible Initialization +Prefix tuning is very similar to [prompt tuning](../package_reference/prompt_tuning). The main difference is that the prefix parameters are inserted in **all** of the model layers, whereas prompt tuning only adds the prompt parameters to the model input embeddings. 
The prefix parameters are also optimized by a separate feed-forward network (FFN) instead of training directly on the soft prompts because training the soft prompts directly causes instability and hurts performance. The FFN is discarded after updating the soft prompts. + +As a result, the authors found that prefix tuning demonstrates comparable performance to fully finetuning a model, despite having 1000x fewer parameters, and it performs even better in low-data settings. + +## Basic Usage + +Create a [`PrefixTuningConfig`] with the task type and number of virtual tokens to add and learn. -By default, prefix tuning is randomly initialized. There's also the option to initialize the embeddings (or the -projection thereof) to be close to a no-op (initialized to zero, it will still shift the probability mass a bit). +```py +from peft import PrefixTuningConfig, get_peft_model + +peft_config = PrefixTuningConfig(task_type="CAUSAL_LM", num_virtual_tokens=20) +model = get_peft_model(model, peft_config) +model.print_trainable_parameters() +"trainable params: 983,040 || all params: 560,197,632 || trainable%: 0.1754809274167014" +``` + +## Possible Initializations + +By default, prefix tuning uses randomly initialized virtual tokens. There's also the option to initialize the vectors +to be close to a no-op (initialized to zero, it will still shift the probability mass a bit). This means that the KV-cache injected prefixes have less impact from the beginning and reduces the variance in training performance.
@@ -42,12 +68,13 @@ tok = AutoTokenizer.from_pretrained("gpt2") peft_cfg = PrefixTuningConfig(task_type="CAUSAL_LM", num_virtual_tokens=20, prefix_projection=False) model = get_peft_model(base, peft_cfg) initialize_kv_prefix_from_text( model, tok, text="...a long context with at least num_virtual_tokens tokens...", use_chat_template=False, ) + ``` Make sure the text is long enough to produce at least `num_virtual_tokens` tokens, otherwise initialization will fail. @@ -61,9 +88,18 @@ As a guideline: * if it is not possible to use an initialization text or you want to quickly check if prefix tuning is viable at all, use a zero init without projection -The abstract from the paper is: -*Fine-tuning is the de facto way to leverage large pretrained language models to perform downstream tasks. However, it modifies all the language model parameters and therefore necessitates storing a full copy for each task. In this paper, we propose prefix-tuning, a lightweight alternative to fine-tuning for natural language generation tasks, which keeps language model parameters frozen, but optimizes a small continuous task-specific vector (called the prefix). Prefix-tuning draws inspiration from prompting, allowing subsequent tokens to attend to this prefix as if it were "virtual tokens". We apply prefix-tuning to GPT-2 for table-to-text generation and to BART for summarization.
We find that by learning only 0.1\% of the parameters, prefix-tuning obtains comparable performance in the full data setting, outperforms fine-tuning in low-data settings, and extrapolates better to examples with topics unseen during training*. +## Benchmark overview + + + + +# API ## PrefixTuningConfig diff --git a/docs/source/package_reference/prompt_tuning.md b/docs/source/package_reference/prompt_tuning.md index 61dbb6a2e9..83419ed4e4 100644 --- a/docs/source/package_reference/prompt_tuning.md +++ b/docs/source/package_reference/prompt_tuning.md @@ -16,16 +16,54 @@ rendered properly in your Markdown viewer. # Prompt tuning -[Prompt tuning](https://hf.co/papers/2104.08691) adds task-specific prompts to the input, and these prompt parameters are updated independently of the pretrained model parameters which are frozen. +[Prompt tuning](https://hf.co/papers/2104.08691) adds a task-specific, virtual prompt to the input that consists of trainable vectors in the embedding space. The virtual token parameters are updated independently of the pretrained model parameters which are frozen. The abstract from the paper is: *In this work, we explore "prompt tuning", a simple yet effective mechanism for learning "soft prompts" to condition frozen language models to perform specific downstream tasks. Unlike the discrete text prompts used by GPT-3, soft prompts are learned through backpropagation and can be tuned to incorporate signal from any number of labeled examples. Our end-to-end learned approach outperforms GPT-3's "few-shot" learning by a large margin. More remarkably, through ablations on model size using T5, we show that prompt tuning becomes more competitive with scale: as models exceed billions of parameters, our method "closes the gap" and matches the strong performance of model tuning (where all model weights are tuned). 
This finding is especially relevant in that large models are costly to share and serve, and the ability to reuse one frozen model for multiple downstream tasks can ease this burden. Our method can be seen as a simplification of the recently proposed "prefix tuning" of Li and Liang (2021), and we provide a comparison to this and other similar approaches. Finally, we show that conditioning a frozen model with soft prompts confers benefits in robustness to domain transfer, as compared to full model tuning*. +In contrast to [prefix tuning](../package_reference/prefix_tuning), only the +input of the first layer receives the virtual tokens. + +## Usage + +There are two decisions to take: how many virtual tokens are added to the +input of the model (`num_virtual_tokens`) - this will define how many +trainable parameters there will be - and how these tokens are initialized. + +Create a [`PromptTuningConfig`] with the task type, the initial prompt tuning text to train the model with, the number of virtual tokens to add and learn, and a tokenizer. 
+ +```py +from peft import PromptTuningConfig, PromptTuningInit, get_peft_model + +prompt_tuning_init_text = "Classify if the tweet is a complaint or no complaint.\n" +peft_config = PromptTuningConfig( + task_type="CAUSAL_LM", + prompt_tuning_init=PromptTuningInit.TEXT, + num_virtual_tokens=len(tokenizer(prompt_tuning_init_text)["input_ids"]), + prompt_tuning_init_text=prompt_tuning_init_text, + tokenizer_name_or_path="bigscience/bloomz-560m", +) +model = get_peft_model(model, peft_config) +model.print_trainable_parameters() +"trainable params: 8,192 || all params: 559,222,784 || trainable%: 0.0014648902430985358" +``` + +## Benchmark overview + + + +# API + ## PromptTuningConfig [[autodoc]] tuners.prompt_tuning.config.PromptTuningConfig ## PromptEmbedding -[[autodoc]] tuners.prompt_tuning.model.PromptEmbedding \ No newline at end of file +[[autodoc]] tuners.prompt_tuning.model.PromptEmbedding diff --git a/docs/source/quicktour.md b/docs/source/quicktour.md index 1f0a0a27be..64747f302c 100644 --- a/docs/source/quicktour.md +++ b/docs/source/quicktour.md @@ -20,6 +20,46 @@ PEFT offers parameter-efficient methods for finetuning large pretrained models. This quicktour will show you PEFT's main features and how you can train or run inference on large models that would typically be inaccessible on consumer devices. + +### PEFT configuration and model + +For any PEFT method, you'll need to create a configuration which contains all the parameters that specify how the PEFT method should be applied. Once the configuration is setup, pass it to the [`~peft.get_peft_model`] function along with the base model to create a trainable [`PeftModel`]. + +> [!TIP] +> Call the [`~PeftModel.print_trainable_parameters`] method to compare the number of trainable parameters of [`PeftModel`] versus the number of parameters in the base model! 
+ + + + + + + + + +[Prompt tuning](../conceptual_guides/prompting#prompt-tuning) formulates all tasks as a *generation* task and it adds a task-specific prompt to the input which is updated independently. The `prompt_tuning_init_text` parameter specifies how to finetune the model (in this case, it is classifying whether tweets are complaints or not). For the best results, the `prompt_tuning_init_text` should have the same number of tokens that should be predicted. To do this, you can set `num_virtual_tokens` to the number of tokens of the `prompt_tuning_init_text`. + +Create a [`PromptTuningConfig`] with the task type, the initial prompt tuning text to train the model with, the number of virtual tokens to add and learn, and a tokenizer. + +```py +from peft import PromptTuningConfig, PromptTuningInit, get_peft_model + +prompt_tuning_init_text = "Classify if the tweet is a complaint or no complaint.\n" +peft_config = PromptTuningConfig( + task_type="CAUSAL_LM", + prompt_tuning_init=PromptTuningInit.TEXT, + num_virtual_tokens=len(tokenizer(prompt_tuning_init_text)["input_ids"]), + prompt_tuning_init_text=prompt_tuning_init_text, + tokenizer_name_or_path="bigscience/bloomz-560m", +) +model = get_peft_model(model, peft_config) +model.print_trainable_parameters() +"trainable params: 8,192 || all params: 559,222,784 || trainable%: 0.0014648902430985358" +``` + + + + + ## Train Each PEFT method is defined by a [`PeftConfig`] class that stores all the important parameters for building a [`PeftModel`]. For example, to train with LoRA, load and create a [`LoraConfig`] class and specify the following parameters: diff --git a/docs/source/task_guides/prompt_based_methods.md b/docs/source/task_guides/prompt_based_methods.md deleted file mode 100644 index cc6262aebf..0000000000 --- a/docs/source/task_guides/prompt_based_methods.md +++ /dev/null @@ -1,302 +0,0 @@ - - -# Prompt-based methods - -A prompt can describe a task or provide an example of a task you want the model to learn. 
Instead of manually creating these prompts, soft prompting methods add learnable parameters to the input embeddings that can be optimized for a specific task while keeping the pretrained model's parameters frozen. This makes it both faster and easier to finetune large language models (LLMs) for new downstream tasks. - -The PEFT library supports several types of prompting methods (p-tuning, prefix tuning, prompt tuning) and you can learn more about how these methods work conceptually in the [Soft prompts](../conceptual_guides/prompting) guide. If you're interested in applying these methods to other tasks and use cases, take a look at our [notebook collection](https://huggingface.co/spaces/PEFT/soft-prompting)! - -This guide will show you how to train a causal language model - with a soft prompting method - to *generate a classification* for whether a tweet is a complaint or not. - -> [!TIP] -> Some familiarity with the general process of training a causal language model would be really helpful and allow you to focus on the soft prompting methods. If you're new, we recommend taking a look at the [Causal language modeling](https://huggingface.co/docs/transformers/tasks/language_modeling) guide first from the Transformers documentation. When you're ready, come back and see how easy it is to drop PEFT in to your training! - -Before you begin, make sure you have all the necessary libraries installed. - -```bash -pip install -q peft transformers datasets -``` - -## Dataset - -For this guide, you'll use the `twitter_complaints` subset of the [RAFT](https://huggingface.co/datasets/ought/raft) dataset. The `twitter_complaints` subset contains tweets labeled as `complaint` and `no complaint` and you can check out the [dataset viewer](https://huggingface.co/datasets/ought/raft/viewer/twitter_complaints) for a better idea of what the data looks like. 
- -Use the [`~datasets.load_dataset`] function to load the dataset and create a new `text_label` column so it is easier to understand what the `Label` values, `1` and `2` mean. - -```py -from datasets import load_dataset - -ds = load_dataset( - "parquet", - data_files={ - "train": "hf://datasets/ought/raft@refs/convert/parquet/twitter_complaints/train/0000.parquet", - "test": "hf://datasets/ought/raft@refs/convert/parquet/twitter_complaints/test/0000.parquet" - } -) - -classes = [k.replace("_", " ") for k in ds["train"].features["Label"].names] -ds = ds.map( - lambda x: {"text_label": [classes[label] for label in x["Label"]]}, - batched=True, - num_proc=1, -) -ds["train"][0] -{"Tweet text": "@HMRCcustomers No this is my first job", "ID": 0, "Label": 2, "text_label": "no complaint"} -``` - -Load a tokenizer, define the padding token to use, and determine the maximum length of the tokenized label. - -```py -from transformers import AutoTokenizer - -tokenizer = AutoTokenizer.from_pretrained("bigscience/bloomz-560m") -if tokenizer.pad_token_id is None: - tokenizer.pad_token_id = tokenizer.eos_token_id -target_max_length = max([len(tokenizer(class_label)["input_ids"]) for class_label in classes]) -print(target_max_length) -``` - -Create a preprocessing function that tokenizes the tweet text and labels, pad the inputs and labels in each batch, create an attention mask, and truncate sequences to the `max_length`. Then convert the `input_ids`, `attention_mask`, and `labels` to PyTorch tensors. 
- -```py -import torch - -max_length = 64 - -def preprocess_function(examples, text_column="Tweet text", label_column="text_label"): - batch_size = len(examples[text_column]) - inputs = [f"{text_column} : {x} Label : " for x in examples[text_column]] - targets = [str(x) for x in examples[label_column]] - model_inputs = tokenizer(inputs) - labels = tokenizer(targets) - classes = [k.replace("_", " ") for k in ds["train"].features["Label"].names] - for i in range(batch_size): - sample_input_ids = model_inputs["input_ids"][i] - label_input_ids = labels["input_ids"][i] - model_inputs["input_ids"][i] = [tokenizer.pad_token_id] * ( - max_length - len(sample_input_ids) - ) + sample_input_ids - model_inputs["attention_mask"][i] = [0] * (max_length - len(sample_input_ids)) + model_inputs[ - "attention_mask" - ][i] - labels["input_ids"][i] = [-100] * (max_length - len(label_input_ids)) + label_input_ids - model_inputs["input_ids"][i] = torch.tensor(model_inputs["input_ids"][i][:max_length]) - model_inputs["attention_mask"][i] = torch.tensor(model_inputs["attention_mask"][i][:max_length]) - labels["input_ids"][i] = torch.tensor(labels["input_ids"][i][:max_length]) - model_inputs["labels"] = labels["input_ids"] - return model_inputs -``` - -Apply the preprocessing function to the entire dataset with the [`~datasets.Dataset.map`] function, and remove the unprocessed columns because the model won't need them. - -```py -processed_ds = ds.map( - preprocess_function, - batched=True, - num_proc=1, - remove_columns=ds["train"].column_names, - load_from_cache_file=False, - desc="Running tokenizer on dataset", -) -``` - -Finally, create a training and evaluation [`DataLoader`](https://pytorch.org/docs/stable/data.html#torch.utils.data.DataLoader). You can set `pin_memory=True` to speed up the data transfer to the GPU during training if the samples in your dataset are on a CPU. 
- -```py -from torch.utils.data import DataLoader -from transformers import default_data_collator - -train_ds = processed_ds["train"] -eval_ds = processed_ds["test"] - -batch_size = 16 - -train_dataloader = DataLoader(train_ds, shuffle=True, collate_fn=default_data_collator, batch_size=batch_size, pin_memory=True) -eval_dataloader = DataLoader(eval_ds, collate_fn=default_data_collator, batch_size=batch_size, pin_memory=True) -``` - -## Model - -Now let's load a pretrained model to use as the base model for the soft prompt method. This guide uses the [bigscience/bloomz-560m](https://huggingface.co/bigscience/bloomz-560m) model, but you can use any causal language model you want. - -```py -from transformers import AutoModelForCausalLM - -model = AutoModelForCausalLM.from_pretrained("bigscience/bloomz-560m") -``` - -### PEFT configuration and model - -For any PEFT method, you'll need to create a configuration which contains all the parameters that specify how the PEFT method should be applied. Once the configuration is setup, pass it to the [`~peft.get_peft_model`] function along with the base model to create a trainable [`PeftModel`]. - -> [!TIP] -> Call the [`~PeftModel.print_trainable_parameters`] method to compare the number of trainable parameters of [`PeftModel`] versus the number of parameters in the base model! - - - - -[P-tuning](../conceptual_guides/prompting#p-tuning) adds a trainable embedding tensor where the prompt tokens can be added anywhere in the input sequence. Create a [`PromptEncoderConfig`] with the task type, the number of virtual tokens to add and learn, and the hidden size of the encoder for learning the prompt parameters. 
- -```py -from peft import PromptEncoderConfig, get_peft_model - -peft_config = PromptEncoderConfig(task_type="CAUSAL_LM", num_virtual_tokens=20, encoder_hidden_size=128) -model = get_peft_model(model, peft_config) -model.print_trainable_parameters() -"trainable params: 300,288 || all params: 559,514,880 || trainable%: 0.05366935013417338" -``` - - - - -[Prefix tuning](../conceptual_guides/prompting#prefix-tuning) adds task-specific parameters in all of the model layers, which are optimized by a separate feed-forward network. Create a [`PrefixTuningConfig`] with the task type and number of virtual tokens to add and learn. - -```py -from peft import PrefixTuningConfig, get_peft_model - -peft_config = PrefixTuningConfig(task_type="CAUSAL_LM", num_virtual_tokens=20) -model = get_peft_model(model, peft_config) -model.print_trainable_parameters() -"trainable params: 983,040 || all params: 560,197,632 || trainable%: 0.1754809274167014" -``` - - - - -[Prompt tuning](../conceptual_guides/prompting#prompt-tuning) formulates all tasks as a *generation* task and it adds a task-specific prompt to the input which is updated independently. The `prompt_tuning_init_text` parameter specifies how to finetune the model (in this case, it is classifying whether tweets are complaints or not). For the best results, the `prompt_tuning_init_text` should have the same number of tokens that should be predicted. To do this, you can set `num_virtual_tokens` to the number of tokens of the `prompt_tuning_init_text`. - -Create a [`PromptTuningConfig`] with the task type, the initial prompt tuning text to train the model with, the number of virtual tokens to add and learn, and a tokenizer. 
- -```py -from peft import PromptTuningConfig, PromptTuningInit, get_peft_model - -prompt_tuning_init_text = "Classify if the tweet is a complaint or no complaint.\n" -peft_config = PromptTuningConfig( - task_type="CAUSAL_LM", - prompt_tuning_init=PromptTuningInit.TEXT, - num_virtual_tokens=len(tokenizer(prompt_tuning_init_text)["input_ids"]), - prompt_tuning_init_text=prompt_tuning_init_text, - tokenizer_name_or_path="bigscience/bloomz-560m", -) -model = get_peft_model(model, peft_config) -model.print_trainable_parameters() -"trainable params: 8,192 || all params: 559,222,784 || trainable%: 0.0014648902430985358" -``` - - - - -### Training - -Set up an optimizer and learning rate scheduler. - -```py -from transformers import get_linear_schedule_with_warmup - -lr = 3e-2 -num_epochs = 50 - -optimizer = torch.optim.AdamW(model.parameters(), lr=lr) -lr_scheduler = get_linear_schedule_with_warmup( - optimizer=optimizer, - num_warmup_steps=0, - num_training_steps=(len(train_dataloader) * num_epochs), -) -``` - -Move the model to the GPU and create a training loop that reports the loss and perplexity for each epoch. 
- -```py -from tqdm import tqdm - -device = "cuda" -model = model.to(device) - -for epoch in range(num_epochs): - model.train() - total_loss = 0 - for step, batch in enumerate(tqdm(train_dataloader)): - batch = {k: v.to(device) for k, v in batch.items()} - outputs = model(**batch) - loss = outputs.loss - total_loss += loss.detach().float() - loss.backward() - optimizer.step() - lr_scheduler.step() - optimizer.zero_grad() - - model.eval() - eval_loss = 0 - - for step, batch in enumerate(tqdm(eval_dataloader)): - batch = {k: v.to(device) for k, v in batch.items()} - with torch.no_grad(): - outputs = model(**batch) - loss = outputs.loss - eval_loss += loss.detach().float() - - eval_epoch_loss = eval_loss / len(eval_dataloader) - eval_ppl = torch.exp(eval_epoch_loss) - train_epoch_loss = total_loss / len(train_dataloader) - train_ppl = torch.exp(train_epoch_loss) - print(f"{epoch=}: {train_ppl=} {train_epoch_loss=} {eval_ppl=} {eval_epoch_loss=}") -``` - -## Share your model - -Once training is complete, you can upload your model to the Hub with the [`~transformers.PreTrainedModel.push_to_hub`] method. You'll need to login to your Hugging Face account first and enter your token when prompted. - -```py -from huggingface_hub import notebook_login - -account = -peft_model_id = f"{account}/bloomz-560-m-peft-method" -model.push_to_hub(peft_model_id) -``` - -If you check the model file size in the repository, you’ll see that it is a lot smaller than a full sized model! - -
- -
For example, the adapter weights for a opt-350m model stored on the Hub are only ~6MB compared to the full model size which can be ~700MB.
-
- -## Inference - -Let's load the model for inference and test it out on a tweet! - -```py -from peft import AutoPeftModelForCausalLM - -model = AutoPeftModelForCausalLM.from_pretrained("peft_model_id").to("cuda") -tokenizer = AutoTokenizer.from_pretrained("bigscience/bloomz-560m") - -i = 15 -inputs = tokenizer(f'{text_column} : {ds["test"][i]["Tweet text"]} Label : ', return_tensors="pt") -print(ds["test"][i]["Tweet text"]) -"@NYTsupport i have complained a dozen times & yet my papers are still thrown FAR from my door. Why is this so hard to resolve?" -``` - -Call the [`~transformers.GenerationMixin.generate`] method to generate the predicted classification label. - -```py -with torch.no_grad(): - inputs = {k: v.to(device) for k, v in inputs.items()} - outputs = model.generate(input_ids=inputs["input_ids"], max_new_tokens=10) - print(tokenizer.batch_decode(outputs.detach().cpu().numpy(), skip_special_tokens=True)) -"['Tweet text : @NYTsupport i have complained a dozen times & yet my papers are still thrown FAR from my door. Why is this so hard to resolve? 
Label : complaint']" -``` From 485fcdbd04e0d3150c668e076aa62049856664ff Mon Sep 17 00:00:00 2001 From: nemo Date: Fri, 29 May 2026 15:06:34 +0200 Subject: [PATCH 02/33] Moved prompt tuning methods to respective sections --- docs/source/_toctree.yml | 9 +- docs/source/conceptual_guides/prompting.md | 93 ------------------- docs/source/methods/overview.md | 13 ++- docs/source/package_reference/cpt.md | 20 +++- .../multitask_prompt_tuning.md | 26 +++++- .../source/package_reference/prompt_tuning.md | 5 + docs/source/quicktour.md | 29 ------ 7 files changed, 61 insertions(+), 134 deletions(-) delete mode 100644 docs/source/conceptual_guides/prompting.md diff --git a/docs/source/_toctree.yml b/docs/source/_toctree.yml index 3cb51eb70c..aaec52308b 100644 --- a/docs/source/_toctree.yml +++ b/docs/source/_toctree.yml @@ -9,8 +9,6 @@ - title: PEFT method guides sections: - - local: task_guides/prompt_based_methods - title: Prompt-based methods - local: task_guides/lora_based_methods title: LoRA methods - local: task_guides/ia3 @@ -49,14 +47,15 @@ - local: developer_guides/checkpoint title: PEFT checkpoint format - +- title: Methods + sections: + - local: methods/overview + title: Overview - title: Conceptual guides sections: - local: conceptual_guides/adapter title: Adapters - - local: conceptual_guides/prompting - title: Soft prompts - local: conceptual_guides/ia3 title: IA3 - local: conceptual_guides/oft diff --git a/docs/source/conceptual_guides/prompting.md b/docs/source/conceptual_guides/prompting.md deleted file mode 100644 index 733ffbf461..0000000000 --- a/docs/source/conceptual_guides/prompting.md +++ /dev/null @@ -1,93 +0,0 @@ - - -# Soft prompts - -Training large pretrained language models is very time-consuming and compute-intensive. As they continue to grow in size, there is increasing interest in more efficient training methods such as *prompting*. 
Prompting primes a frozen pretrained model for a specific downstream task by including a text prompt that describes the task or even demonstrates an example of the task. With prompting, you can avoid fully training a separate model for each downstream task, and use the same frozen pretrained model instead. This is a lot easier because you can use the same model for several different tasks, and it is significantly more efficient to train and store a smaller set of prompt parameters than to train all the model's parameters. - -There are two categories of prompting methods: - -- hard prompts are manually handcrafted text prompts with discrete input tokens; the downside is that it requires a lot of effort to create a good prompt -- soft prompts are learnable tensors concatenated with the input embeddings that can be optimized to a dataset; the downside is that they aren't human readable because you aren't matching these "virtual tokens" to the embeddings of a real word - -This conceptual guide provides a brief overview of the soft prompt methods included in 🤗 PEFT: prompt tuning, prefix tuning, P-tuning, and multitask prompt tuning. - -## Prompt tuning - -
- -
-Only train and store a significantly smaller set of task-specific prompt parameters (image source). - -[Prompt tuning](https://hf.co/papers/2104.08691) was developed for text classification tasks on T5 models, and all downstream tasks are cast as a text generation task. For example, sequence classification usually assigns a single class label to a sequence of text. By casting it as a text generation task, the tokens that make up the class label are *generated*. Prompts are added to the input as a series of tokens. Typically, the model parameters are fixed which means the prompt tokens are also fixed by the model parameters. - -The key idea behind prompt tuning is that prompt tokens have their own parameters that are updated independently. This means you can keep the pretrained model's parameters frozen, and only update the gradients of the prompt token embeddings. The results are comparable to the traditional method of training the entire model, and prompt tuning performance scales as model size increases. - -Take a look at [Prompt tuning for causal language modeling](../task_guides/clm-prompt-tuning) for a step-by-step guide on how to train a model with prompt tuning. - -## Prefix tuning - -
- -
-Optimize the prefix parameters for each task (image source). - -[Prefix tuning](https://hf.co/papers/2101.00190) was designed for natural language generation (NLG) tasks on GPT models. It is very similar to prompt tuning; prefix tuning also prepends a sequence of task-specific vectors to the input that can be trained and updated while keeping the rest of the pretrained model's parameters frozen. - -The main difference is that the prefix parameters are inserted in **all** of the model layers, whereas prompt tuning only adds the prompt parameters to the model input embeddings. The prefix parameters are also optimized by a separate feed-forward network (FFN) instead of training directly on the soft prompts because it causes instability and hurts performance. The FFN is discarded after updating the soft prompts. - -As a result, the authors found that prefix tuning demonstrates comparable performance to fully finetuning a model, despite having 1000x fewer parameters, and it performs even better in low-data settings. - -Take a look at [Prefix tuning for conditional generation](../task_guides/seq2seq-prefix-tuning) for a step-by-step guide on how to train a model with prefix tuning. - -## P-tuning - -
- -
-Prompt tokens can be inserted anywhere in the input sequence, and they are optimized by a prompt encoder (image source). - -[P-tuning](https://hf.co/papers/2103.10385) is designed for natural language understanding (NLU) tasks and all language models. -It is another variation of a soft prompt method; P-tuning also adds a trainable embedding tensor that can be optimized to find better prompts, and it uses a prompt encoder (a bidirectional long-short term memory network or LSTM) to optimize the prompt parameters. Unlike prefix tuning though: - -- the prompt tokens can be inserted anywhere in the input sequence, and it isn't restricted to only the beginning -- the prompt tokens are only added to the input instead of adding them to every layer of the model -- introducing *anchor* tokens can improve performance because they indicate characteristics of a component in the input sequence - -The results suggest that P-tuning is more efficient than manually crafting prompts, and it enables GPT-like models to compete with BERT-like models on NLU tasks. - -Take a look at [P-tuning for sequence classification](../task_guides/ptuning-seq-classification) for a step-by-step guide on how to train a model with P-tuning. - -## Multitask prompt tuning - -
- -
-Multitask prompt tuning enables parameter-efficient transfer learning. - -[Multitask prompt tuning (MPT)](https://hf.co/papers/2303.02861) learns a single prompt from data for multiple task types that can be shared for different target tasks. Other existing approaches learn a separate soft prompt for each task that need to be retrieved or aggregated for adaptation to target tasks. MPT consists of two stages: - -1. source training - for each task, its soft prompt is decomposed into task-specific vectors. The task-specific vectors are multiplied together to form another matrix W, and the Hadamard product is used between W and a shared prompt matrix P to generate a task-specific prompt matrix. The task-specific prompts are distilled into a single prompt matrix that is shared across all tasks. This prompt is trained with multitask training. -2. target adaptation - to adapt the single prompt for a target task, a target prompt is initialized and expressed as the Hadamard product of the shared prompt matrix and the task-specific low-rank prompt matrix. - -
- -
-Prompt decomposition. - - -## Context-Aware Prompt Tuning (CPT) - -
- -
-CPT optimizing only specific token embeddings while keeping the rest of the model frozen (image source). - -[Context-Aware Prompt Tuning (CPT)](https://huggingface.co/papers/2410.17222) is designed to enhance few-shot classification by refining only context embeddings. -This approach combines ideas from In-Context Learning (ICL), Prompt Tuning (PT), and adversarial optimization, focusing on making model adaptation both parameter-efficient and effective. -In CPT, only specific context token embeddings are optimized, while the rest of the model remains frozen. -To prevent overfitting and maintain stability, CPT uses controlled perturbations to limit the allowed changes to context embeddings within a defined range. -Additionally, to address the phenomenon of recency bias—where examples near the end of the context tend to be prioritized over earlier ones—CPT applies a decay loss factor. - -Take a look at [Example](https://github.com/huggingface/peft/blob/main/examples/cpt_finetuning/README.md) for a step-by-step guide on how to train a model with CPT. diff --git a/docs/source/methods/overview.md b/docs/source/methods/overview.md index be78def665..c88fd30f30 100644 --- a/docs/source/methods/overview.md +++ b/docs/source/methods/overview.md @@ -20,17 +20,24 @@ rendered properly in your Markdown viewer. # Prompt-based methods -A prompt can describe a task or provide an example of a task you want the model to learn. Instead of manually creating these prompts, soft prompting methods add learnable parameters to the input embeddings that can be optimized for a specific task while keeping the pretrained model's parameters frozen. This makes it both faster and easier to finetune large language models (LLMs) for new downstream tasks. + Prompting primes a frozen pretrained model for a specific downstream task by including a text prompt that describes the task or even demonstrates an example of the task. 
With prompting, you can avoid fully training a separate model for each downstream task, and use the same frozen pretrained model instead. This is a lot easier because you can use the same model for several different tasks, and it is significantly more efficient to train and store a smaller set of prompt parameters than to train all the model's parameters. -The PEFT library supports several types of prompting methods (p-tuning, prefix tuning, prompt tuning) and you can learn more about how these methods work conceptually in the [Soft prompts](../conceptual_guides/prompting) guide. If you're interested in applying these methods to other tasks and use cases, take a look at our [notebook collection](https://huggingface.co/spaces/PEFT/soft-prompting)! +There are two categories of prompting methods: -This guide will show you how to train a causal language model - with a soft prompting method - to *generate a classification* for whether a tweet is a complaint or not. +- hard prompts are manually handcrafted text prompts with discrete input tokens; the downside is that it requires a lot of effort to create a good prompt +- soft prompts are learnable tensors concatenated with the input embeddings that can be optimized to a dataset; the downside is that they aren't human readable because you aren't matching these "virtual tokens" to the embeddings of a real word + +The PEFT library supports several types of prompting methods (p-tuning, prefix tuning, prompt tuning, ...); explore the table of contents for a full listing of soft prompt methods. +If you're interested in applying these methods to other tasks and use cases, take a look at our [notebook collection](https://huggingface.co/spaces/PEFT/soft-prompting)! > [!TIP] > Some familiarity with the general process of training a causal language model would be really helpful and allow you to focus on the soft prompting methods.
If you're new, we recommend taking a look at the [Causal language modeling](https://huggingface.co/docs/transformers/tasks/language_modeling) guide first from the Transformers documentation. When you're ready, come back and see how easy it is to drop PEFT in to your training! + + + ### PEFT configuration and model For any PEFT method, you'll need to create a configuration which contains all the parameters that specify how the PEFT method should be applied. Once the configuration is setup, pass it to the [`~peft.get_peft_model`] function along with the base model to create a trainable [`PeftModel`]. diff --git a/docs/source/package_reference/cpt.md b/docs/source/package_reference/cpt.md index 9e67fd7c37..822542c556 100644 --- a/docs/source/package_reference/cpt.md +++ b/docs/source/package_reference/cpt.md @@ -12,17 +12,31 @@ rendered properly in your Markdown viewer. # Context-aware Prompt Tuning: Advancing In-Context Learning with Adversarial Methods -[CPT](https://huggingface.co/papers/2410.17222) combines In-Context Learning (ICL), Prompt Tuning (PT), and adversarial optimization to improve few-shot learning by refining context embeddings. CPT updates the context tokens by optimizing both the context and the training examples, encapsulating them into a novel loss design that minimizes overfitting, enables more effective optimization, and drives significant improvements in classification tasks. -[//]: # ([CPT](https://huggingface.co/papers/2410.17222) for the paper) +
+ +
+CPT optimizing only specific token embeddings while keeping the rest of the model frozen (image source). + +[Context-Aware Prompt Tuning (CPT)](https://huggingface.co/papers/2410.17222) is designed to enhance few-shot classification by refining only context embeddings. +This approach combines ideas from In-Context Learning (ICL), [Prompt Tuning](../package_reference/prompt_tuning) (PT), and adversarial optimization, focusing on making model adaptation both parameter-efficient and effective. +In CPT, only specific context token embeddings are optimized, while the rest of the model remains frozen. +To prevent overfitting and maintain stability, CPT uses controlled perturbations to limit the allowed changes to context embeddings within a defined range. +Additionally, to address the phenomenon of recency bias—where examples near the end of the context tend to be prioritized over earlier ones—CPT applies a decay loss factor. The abstract from the paper is: > Large Language Models (LLMs) can perform few-shot learning using either optimization-based approaches or In-Context Learning (ICL). Optimization-based methods often suffer from overfitting, as they require updating a large number of parameters with limited data. In contrast, ICL avoids overfitting but typically underperforms compared to optimization-based methods and is highly sensitive to the selection, order, and format of demonstration examples. To overcome these challenges, we introduce Context-aware Prompt Tuning (CPT), a method inspired by ICL, Prompt Tuning (PT), and adversarial attacks. CPT builds on the ICL strategy of concatenating examples before the input, extending it by incorporating PT-like learning to refine the context embedding through iterative optimization, extracting deeper insights from the training examples. Our approach carefully modifies specific context tokens, considering the unique structure of the examples within the context. 
In addition to updating the context with PT-like optimization, CPT draws inspiration from adversarial attacks, adjusting the input based on the labels present in the context while preserving the inherent value of the user-provided data. To ensure robustness and stability during optimization, we employ a projected gradient descent algorithm, constraining token embeddings to remain close to their original values and safeguarding the quality of the context. Our method has demonstrated superior accuracy across multiple classification tasks using various LLM models, outperforming existing baselines and effectively addressing the overfitting challenge in few-shot learning. - Take a look at [Example](https://github.com/huggingface/peft/blob/main/examples/cpt_finetuning/README.md) for a step-by-step guide on how to train a model with CPT. +## Benchmark overview + +There is no benchmark for CPT yet. Feel free to contribute an experiment +configuration but make sure to first create an issue +[here](https://github.com/huggingface/peft/issues). + +# API ## CPTConfig diff --git a/docs/source/package_reference/multitask_prompt_tuning.md b/docs/source/package_reference/multitask_prompt_tuning.md index 119739a3dc..24488f6428 100644 --- a/docs/source/package_reference/multitask_prompt_tuning.md +++ b/docs/source/package_reference/multitask_prompt_tuning.md @@ -22,10 +22,34 @@ The abstract from the paper is: *Prompt tuning, in which a base pretrained model is adapted to each task via conditioning on learned prompt vectors, has emerged as a promising approach for efficiently adapting large language models to multiple downstream tasks. However, existing methods typically learn soft prompt vectors from scratch, and it has not been clear how to exploit the rich cross-task knowledge with prompt vectors in a multitask learning setting.
We propose multitask prompt tuning (MPT), which first learns a single transferable prompt by distilling knowledge from multiple task-specific source prompts. We then learn multiplicative low rank updates to this shared prompt to efficiently adapt it to each downstream target task. Extensive experiments on 23 NLP datasets demonstrate that our proposed approach outperforms the state-of-the-art methods, including the full finetuning baseline in some cases, despite only tuning 0.035% as many task-specific parameters*. +
+ +
+Multitask prompt tuning enables parameter-efficient transfer learning. + +MPT consists of two stages: + +1. source training - for each task, its soft prompt is decomposed into task-specific vectors. The task-specific vectors are multiplied together to form another matrix W, and the Hadamard product is used between W and a shared prompt matrix P to generate a task-specific prompt matrix. The task-specific prompts are distilled into a single prompt matrix that is shared across all tasks. This prompt is trained with multitask training. +2. target adaptation - to adapt the single prompt for a target task, a target prompt is initialized and expressed as the Hadamard product of the shared prompt matrix and the task-specific low-rank prompt matrix. + +
+ +
+Prompt decomposition. + +## Benchmark overview + +There is no benchmark for MPT yet. Feel free to contribute an experiment +configuration but make sure to first create an issue +[here](https://github.com/huggingface/peft/issues). + + +# API + ## MultitaskPromptTuningConfig [[autodoc]] tuners.multitask_prompt_tuning.config.MultitaskPromptTuningConfig ## MultitaskPromptEmbedding -[[autodoc]] tuners.multitask_prompt_tuning.model.MultitaskPromptEmbedding \ No newline at end of file +[[autodoc]] tuners.multitask_prompt_tuning.model.MultitaskPromptEmbedding diff --git a/docs/source/package_reference/prompt_tuning.md b/docs/source/package_reference/prompt_tuning.md index 83419ed4e4..4f9f30b8d1 100644 --- a/docs/source/package_reference/prompt_tuning.md +++ b/docs/source/package_reference/prompt_tuning.md @@ -16,6 +16,11 @@ rendered properly in your Markdown viewer. # Prompt tuning +
+ +
+Only train and store a significantly smaller set of task-specific prompt parameters (image source). + [Prompt tuning](https://hf.co/papers/2104.08691) adds a task-specific, virtual prompt to the input that consists of trainable vectors in the embedding space. The virtual token parameters are updated independently of the pretrained model parameters which are frozen. The abstract from the paper is: diff --git a/docs/source/quicktour.md b/docs/source/quicktour.md index 64747f302c..36c4b0b22e 100644 --- a/docs/source/quicktour.md +++ b/docs/source/quicktour.md @@ -28,36 +28,7 @@ For any PEFT method, you'll need to create a configuration which contains all th > [!TIP] > Call the [`~PeftModel.print_trainable_parameters`] method to compare the number of trainable parameters of [`PeftModel`] versus the number of parameters in the base model! - - - - - - - -[Prompt tuning](../conceptual_guides/prompting#prompt-tuning) formulates all tasks as a *generation* task and it adds a task-specific prompt to the input which is updated independently. The `prompt_tuning_init_text` parameter specifies how to finetune the model (in this case, it is classifying whether tweets are complaints or not). For the best results, the `prompt_tuning_init_text` should have the same number of tokens that should be predicted. To do this, you can set `num_virtual_tokens` to the number of tokens of the `prompt_tuning_init_text`. - -Create a [`PromptTuningConfig`] with the task type, the initial prompt tuning text to train the model with, the number of virtual tokens to add and learn, and a tokenizer. 
- -```py -from peft import PromptTuningConfig, PromptTuningInit, get_peft_model - -prompt_tuning_init_text = "Classify if the tweet is a complaint or no complaint.\n" -peft_config = PromptTuningConfig( - task_type="CAUSAL_LM", - prompt_tuning_init=PromptTuningInit.TEXT, - num_virtual_tokens=len(tokenizer(prompt_tuning_init_text)["input_ids"]), - prompt_tuning_init_text=prompt_tuning_init_text, - tokenizer_name_or_path="bigscience/bloomz-560m", -) -model = get_peft_model(model, peft_config) -model.print_trainable_parameters() -"trainable params: 8,192 || all params: 559,222,784 || trainable%: 0.0014648902430985358" -``` - - - ## Train From 02a13f96627c5bdc16f2a9a774378e3f56fd7ed0 Mon Sep 17 00:00:00 2001 From: nemo Date: Fri, 29 May 2026 15:50:05 +0200 Subject: [PATCH 03/33] Move IA3 --- docs/source/_toctree.yml | 4 - docs/source/conceptual_guides/ia3.md | 68 -------- docs/source/package_reference/ia3.md | 45 ++++- docs/source/package_reference/lora.md | 21 ++- docs/source/task_guides/ia3.md | 235 -------------------------- 5 files changed, 56 insertions(+), 317 deletions(-) delete mode 100644 docs/source/conceptual_guides/ia3.md delete mode 100644 docs/source/task_guides/ia3.md diff --git a/docs/source/_toctree.yml b/docs/source/_toctree.yml index aaec52308b..f338878665 100644 --- a/docs/source/_toctree.yml +++ b/docs/source/_toctree.yml @@ -11,8 +11,6 @@ sections: - local: task_guides/lora_based_methods title: LoRA methods - - local: task_guides/ia3 - title: IA3 - title: Guides sections: @@ -56,8 +54,6 @@ sections: - local: conceptual_guides/adapter title: Adapters - - local: conceptual_guides/ia3 - title: IA3 - local: conceptual_guides/oft title: OFT/BOFT diff --git a/docs/source/conceptual_guides/ia3.md b/docs/source/conceptual_guides/ia3.md deleted file mode 100644 index 92daaac105..0000000000 --- a/docs/source/conceptual_guides/ia3.md +++ /dev/null @@ -1,68 +0,0 @@ - - -# IA3 - -This conceptual guide gives a brief overview of 
[IA3](https://huggingface.co/papers/2205.05638), a parameter-efficient fine tuning technique that is -intended to improve over [LoRA](./lora). - -To make fine-tuning more efficient, IA3 (Infused Adapter by Inhibiting and Amplifying Inner Activations) -rescales inner activations with learned vectors. These learned vectors are injected in the attention and feedforward modules -in a typical transformer-based architecture. These learned vectors are the only trainable parameters during fine-tuning, and thus the original -weights remain frozen. Dealing with learned vectors (as opposed to learned low-rank updates to a weight matrix like LoRA) -keeps the number of trainable parameters much smaller. - -Being similar to LoRA, IA3 carries many of the same advantages: - -* IA3 makes fine-tuning more efficient by drastically reducing the number of trainable parameters. (For T0, an IA3 model only has about 0.01% trainable parameters, while even LoRA has > 0.1%) -* The original pre-trained weights are kept frozen, which means you can have multiple lightweight and portable IA3 models for various downstream tasks built on top of them. -* Performance of models fine-tuned using IA3 is comparable to the performance of fully fine-tuned models. -* IA3 does not add any inference latency because adapter weights can be merged with the base model. - -In principle, IA3 can be applied to any subset of weight matrices in a neural network to reduce the number of trainable -parameters. Following the authors' implementation, IA3 weights are added to the key, value and feedforward layers -of a Transformer model. To be specific, for transformer models, IA3 weights are added to the outputs of key and value layers, and to the input of the second feedforward layer -in each transformer block. - -Given the target layers for injecting IA3 parameters, the number of trainable parameters -can be determined based on the size of the weight matrices. 
- - -## Common IA3 parameters in PEFT - -As with other methods supported by PEFT, to fine-tune a model using IA3, you need to: - -1. Instantiate a base model. -2. Create a configuration (`IA3Config`) where you define IA3-specific parameters. -3. Wrap the base model with `get_peft_model()` to get a trainable `PeftModel`. -4. Train the `PeftModel` as you normally would train the base model. - -`IA3Config` allows you to control how IA3 is applied to the base model through the following parameters: - -- `target_modules`: The modules (for example, attention blocks) to apply the IA3 vectors. -- `feedforward_modules`: The list of modules to be treated as feedforward layers in `target_modules`. While learned vectors are multiplied with -the output activation for attention blocks, the vectors are multiplied with the input for classic feedforward layers. Note that `feedforward_modules` must be a subset of `target_modules`. -- `modules_to_save`: List of modules apart from IA3 layers to be set as trainable and saved in the final checkpoint. These typically include model's custom head that is randomly initialized for the fine-tuning task. - -## Example Usage - -For the task of sequence classification, one can initialize the IA3 config for a Llama model as follows: - -```py -peft_config = IA3Config( - task_type=TaskType.SEQ_CLS, target_modules=["k_proj", "v_proj", "down_proj"], feedforward_modules=["down_proj"] -) -``` \ No newline at end of file diff --git a/docs/source/package_reference/ia3.md b/docs/source/package_reference/ia3.md index 3885fd9c60..330bd0d58e 100644 --- a/docs/source/package_reference/ia3.md +++ b/docs/source/package_reference/ia3.md @@ -22,10 +22,53 @@ The abstract from the paper is: *Few-shot in-context learning (ICL) enables pre-trained language models to perform a previously-unseen task without any gradient-based training by feeding a small number of training examples as part of the input. 
ICL incurs substantial computational, memory, and storage costs because it involves processing all of the training examples every time a prediction is made. Parameter-efficient fine-tuning (PEFT) (e.g. adapter modules, prompt tuning, sparse update methods, etc.) offers an alternative paradigm where a small set of parameters are trained to enable a model to perform the new task. In this paper, we rigorously compare few-shot ICL and PEFT and demonstrate that the latter offers better accuracy as well as dramatically lower computational costs. Along the way, we introduce a new PEFT method called (IA)^3 that scales activations by learned vectors, attaining stronger performance while only introducing a relatively tiny amount of new parameters. We also propose a simple recipe based on the T0 model called T-Few that can be applied to new tasks without task-specific tuning or modifications. We validate the effectiveness of T-Few on completely unseen tasks by applying it to the RAFT benchmark, attaining super-human performance for the first time and outperforming the state-of-the-art by 6% absolute. All of the code used in our experiments is publicly available*. +To make fine-tuning more efficient, IA3 (Infused Adapter by Inhibiting and Amplifying Inner Activations) +rescales inner activations with learned vectors. These learned vectors are injected in the attention and feedforward modules +in a typical transformer-based architecture. These learned vectors are the only trainable parameters during fine-tuning, and thus the original +weights remain frozen. Dealing with learned vectors (as opposed to learned low-rank updates to a weight matrix like LoRA) +keeps the number of trainable parameters much smaller. + +Being similar to [LoRA](./lora), IA3 carries many of the same advantages: + +* IA3 makes fine-tuning more efficient by drastically reducing the number of trainable parameters. 
(For T0, an IA3 model only has about 0.01% trainable parameters, while even LoRA has > 0.1%) +* The original pre-trained weights are kept frozen, which means you can have multiple lightweight and portable IA3 models for various downstream tasks built on top of them. +* Performance of models fine-tuned using IA3 is comparable to the performance of fully fine-tuned models. +* IA3 does not add any inference latency because adapter weights can be merged with the base model. + +In principle, IA3 can be applied to any subset of weight matrices in a neural network to reduce the number of trainable +parameters. Following the authors' implementation, IA3 weights are added to the key, value and feedforward layers +of a Transformer model. To be specific, for transformer models, IA3 weights are added to the outputs of key and value layers, and to the input of the second feedforward layer +in each transformer block. + +Given the target layers for injecting IA3 parameters, the number of trainable parameters +can be determined based on the size of the weight matrices. + +## Usage + +For the task of sequence classification, one can initialize the IA3 config for a Llama model as follows: + +```py +peft_config = IA3Config( + task_type=TaskType.SEQ_CLS, target_modules=["k_proj", "v_proj", "down_proj"], feedforward_modules=["down_proj"] +) +``` + +## Benchmark overview + + + + +# API + ## IA3Config [[autodoc]] tuners.ia3.config.IA3Config ## IA3Model -[[autodoc]] tuners.ia3.model.IA3Model \ No newline at end of file +[[autodoc]] tuners.ia3.model.IA3Model diff --git a/docs/source/package_reference/lora.md b/docs/source/package_reference/lora.md index 52c9f87de8..132336ad05 100644 --- a/docs/source/package_reference/lora.md +++ b/docs/source/package_reference/lora.md @@ -22,6 +22,18 @@ The abstract from the paper is: *We propose a neural language modeling system based on low-rank adaptation (LoRA) for speech recognition output rescoring. 
Although pretrained language models (LMs) like BERT have shown superior performance in second-pass rescoring, the high computational cost of scaling up the pretraining stage and adapting the pretrained models to specific domains limit their practical use in rescoring. Here we present a method based on low-rank decomposition to train a rescoring BERT model and adapt it to new domains using only a fraction (0.08%) of the pretrained parameters. These inserted matrices are optimized through a discriminative training objective along with a correlation-based regularization loss. The proposed low-rank adaptation Rescore-BERT (LoRB) architecture is evaluated on LibriSpeech and internal datasets with decreased training times by factors between 5.4 and 3.6.*. +## Benchmark overview + + + + +# API + ## LoraConfig [[autodoc]] tuners.lora.config.LoraConfig @@ -96,12 +108,3 @@ The abstract from the paper is: [[autodoc]] tuners.lora.intruders.reduce_intruder_dimension -## Benchmark overview - - - diff --git a/docs/source/task_guides/ia3.md b/docs/source/task_guides/ia3.md deleted file mode 100644 index c23145f897..0000000000 --- a/docs/source/task_guides/ia3.md +++ /dev/null @@ -1,235 +0,0 @@ - - -# IA3 - -[IA3](../conceptual_guides/ia3) multiplies the model's activations (the keys and values in the self-attention and encoder-decoder attention blocks, and the intermediate activation of the position-wise feedforward network) by three learned vectors. This PEFT method introduces an even smaller number of trainable parameters than LoRA which introduces weight matrices instead of vectors. The original model's parameters are kept frozen and only these vectors are updated. As a result, it is faster, cheaper and more efficient to finetune for a new downstream task. - -This guide will show you how to train a sequence-to-sequence model with IA3 to *generate a sentiment* given some financial news. 
- -> [!TIP] -> Some familiarity with the general process of training a sequence-to-sequence would be really helpful and allow you to focus on how to apply IA3. If you’re new, we recommend taking a look at the [Translation](https://huggingface.co/docs/transformers/tasks/translation) and [Summarization](https://huggingface.co/docs/transformers/tasks/summarization) guides first from the Transformers documentation. When you’re ready, come back and see how easy it is to drop PEFT in to your training! - -## Dataset - -You'll use the [zeroshot/twitter-financial-news-sentiment](https://huggingface.co/datasets/zeroshot/twitter-financial-news-sentiment) dataset. This dataset contains financial tweets labeled with sentiment (bearish, bullish, or neutral). Take a look at the [dataset viewer](https://huggingface.co/datasets/zeroshot/twitter-financial-news-sentiment/viewer) for a better idea of the data and sentences you'll be working with. - -Load the dataset with the [`~datasets.load_dataset`] function. This dataset only contains a train split, so use the [`~datasets.train_test_split`] function to create a train and validation split. Create a new `text_label` column so it is easier to understand what the `label` values `0`, `1`, and `2` mean. - -```py -from datasets import load_dataset - -ds = load_dataset("zeroshot/twitter-financial-news-sentiment") -ds = ds["train"].train_test_split(test_size=0.1) -ds["validation"] = ds["test"] -del ds["test"] - -classes = ds["train"].features["label"].names -ds = ds.map( - lambda x: {"text_label": [classes[label] for label in x["label"]]}, - batched=True, - num_proc=1, -) - -ds["train"][0] -{'text': 'Morrisons reports first sales rise in four years', - 'label': 1, - 'text_label': 'bullish'} -``` - -Load a tokenizer and create a preprocessing function that: - -1. tokenizes the inputs, pads and truncates the sequence to the `max_length` -2. 
apply the same tokenizer to the labels but with a shorter `max_length` that corresponds to the label -3. mask the padding tokens - -```py -from transformers import AutoTokenizer - -text_column = "text" -label_column = "text_label" -max_length = 128 - -tokenizer = AutoTokenizer.from_pretrained("bigscience/mt0-large") - -def preprocess_function(examples): - inputs = examples[text_column] - targets = examples[label_column] - model_inputs = tokenizer(inputs, max_length=max_length, padding="max_length", truncation=True, return_tensors="pt") - labels = tokenizer(targets, max_length=3, padding="max_length", truncation=True, return_tensors="pt") - labels = labels["input_ids"] - labels[labels == tokenizer.pad_token_id] = -100 - model_inputs["labels"] = labels - return model_inputs -``` - -Use the [`~datasets.Dataset.map`] function to apply the preprocessing function to the entire dataset. - -```py -processed_ds = ds.map( - preprocess_function, - batched=True, - num_proc=1, - remove_columns=ds["train"].column_names, - load_from_cache_file=False, - desc="Running tokenizer on dataset", -) -``` - -Create a training and evaluation [`DataLoader`](https://pytorch.org/docs/stable/data.html#torch.utils.data.DataLoader), and set `pin_memory=True` to speed up data transfer to the accelerator during training if your dataset samples are on a CPU. - -```py -from torch.utils.data import DataLoader -from transformers import default_data_collator - -train_ds = processed_ds["train"] -eval_ds = processed_ds["validation"] - -batch_size = 8 - -train_dataloader = DataLoader( - train_ds, shuffle=True, collate_fn=default_data_collator, batch_size=batch_size, pin_memory=True -) -eval_dataloader = DataLoader(eval_ds, collate_fn=default_data_collator, batch_size=batch_size, pin_memory=True) -``` - -## Model - -Now you can load a pretrained model to use as the base model for IA3. 
This guide uses the [bigscience/mt0-large](https://huggingface.co/bigscience/mt0-large) model, but you can use any sequence-to-sequence model you like. - -```py -from transformers import AutoModelForSeq2SeqLM - -model = AutoModelForSeq2SeqLM.from_pretrained("bigscience/mt0-large") -``` - -### PEFT configuration and model - -All PEFT methods need a configuration that contains and specifies all the parameters for how the PEFT method should be applied. Create an [`IA3Config`] with the task type and set the inference mode to `False`. You can find additional parameters for this configuration in the [API reference](../package_reference/ia3#ia3config). - -> [!TIP] -> Call the [`~PeftModel.print_trainable_parameters`] method to compare the number of trainable parameters of [`PeftModel`] versus the number of parameters in the base model! - -Once the configuration is setup, pass it to the [`get_peft_model`] function along with the base model to create a trainable [`PeftModel`]. - -```py -from peft import IA3Config, get_peft_model - -peft_config = IA3Config(task_type="SEQ_2_SEQ_LM") -model = get_peft_model(model, peft_config) -model.print_trainable_parameters() -"trainable params: 282,624 || all params: 1,229,863,936 || trainable%: 0.022980103060766553" -``` - -### Training - -Set up an optimizer and learning rate scheduler. - -```py -import torch -from transformers import get_linear_schedule_with_warmup - -lr = 8e-3 -num_epochs = 3 - -optimizer = torch.optim.AdamW(model.parameters(), lr=lr) -lr_scheduler = get_linear_schedule_with_warmup( - optimizer=optimizer, - num_warmup_steps=0, - num_training_steps=(len(train_dataloader) * num_epochs), -) -``` - -Move the model to the accelerator and create a training loop that reports the loss and perplexity for each epoch. 
- -```py -from tqdm import tqdm - -device = torch.accelerator.current_accelerator().type if hasattr(torch, "accelerator") else "cuda" -model = model.to(device) - -for epoch in range(num_epochs): - model.train() - total_loss = 0 - for step, batch in enumerate(tqdm(train_dataloader)): - batch = {k: v.to(device) for k, v in batch.items()} - outputs = model(**batch) - loss = outputs.loss - total_loss += loss.detach().float() - loss.backward() - optimizer.step() - lr_scheduler.step() - optimizer.zero_grad() - - model.eval() - eval_loss = 0 - eval_preds = [] - for step, batch in enumerate(tqdm(eval_dataloader)): - batch = {k: v.to(device) for k, v in batch.items()} - with torch.no_grad(): - outputs = model(**batch) - loss = outputs.loss - eval_loss += loss.detach().float() - eval_preds.extend( - tokenizer.batch_decode(torch.argmax(outputs.logits, -1).detach().cpu().numpy(), skip_special_tokens=True) - ) - - eval_epoch_loss = eval_loss / len(eval_dataloader) - eval_ppl = torch.exp(eval_epoch_loss) - train_epoch_loss = total_loss / len(train_dataloader) - train_ppl = torch.exp(train_epoch_loss) - print(f"{epoch=}: {train_ppl=} {train_epoch_loss=} {eval_ppl=} {eval_epoch_loss=}") -``` - -## Share your model - -After training is complete, you can upload your model to the Hub with the [`~transformers.PreTrainedModel.push_to_hub`] method. You'll need to login to your Hugging Face account first and enter your token when prompted. - -```py -from huggingface_hub import notebook_login - -account = -peft_model_id = f"{account}/mt0-large-ia3" -model.push_to_hub(peft_model_id) -``` - -## Inference - -To load the model for inference, use the [`~AutoPeftModelForSeq2SeqLM.from_pretrained`] method. Let's also load a sentence of financial news from the dataset to generate a sentiment for. 
- -```py -from peft import AutoPeftModelForSeq2SeqLM - -device = torch.accelerator.current_accelerator().type if hasattr(torch, "accelerator") else "cuda" - -model = AutoPeftModelForSeq2SeqLM.from_pretrained("/mt0-large-ia3").to(device) -tokenizer = AutoTokenizer.from_pretrained("bigscience/mt0-large") - -i = 15 -inputs = tokenizer(ds["validation"][text_column][i], return_tensors="pt") -print(ds["validation"][text_column][i]) -"The robust growth was the result of the inclusion of clothing chain Lindex in the Group in December 2007 ." -``` - -Call the [`~transformers.GenerationMixin.generate`] method to generate the predicted sentiment label. - -```py -with torch.no_grad(): - inputs = {k: v.to(device) for k, v in inputs.items()} - outputs = model.generate(input_ids=inputs["input_ids"], max_new_tokens=10) - print(tokenizer.batch_decode(outputs.detach().cpu().numpy(), skip_special_tokens=True)) -['positive'] -``` From 0dc79b0e048b386b603efb3758730190cb851101 Mon Sep 17 00:00:00 2001 From: nemo Date: Tue, 2 Jun 2026 01:10:50 +0200 Subject: [PATCH 04/33] Moving LoRA --- docs/source/_toctree.yml | 87 +++ docs/source/conceptual_guides/adapter.md | 24 +- docs/source/methods/overview.md | 96 +-- docs/source/package_reference/adalora.md | 15 +- docs/source/package_reference/ia3.md | 5 + docs/source/package_reference/lora.md | 827 +++++++++++++++++++++++ 6 files changed, 968 insertions(+), 86 deletions(-) diff --git a/docs/source/_toctree.yml b/docs/source/_toctree.yml index f338878665..3ed9a19355 100644 --- a/docs/source/_toctree.yml +++ b/docs/source/_toctree.yml @@ -49,6 +49,93 @@ sections: - local: methods/overview title: Overview + - sections: + - local: package_reference/layernorm_tuning + title: LayerNorm Tuning + title: Layer Tuning + - sections: + - local: package_reference/p_tuning + title: P-Tuning + title: Soft Prompting + - sections: + - sections: + - local: package_reference/lora + title: Overview + - local: package_reference/lora_initializations + title: 
Initializations + - sections: + - local: package_reference/lora_variant_velora + title: VeLoRA + - local: package_reference/lora_variant_dora + title: DoRA + title: Variants + title: LoRA + - local: package_reference/adalora + title: AdaLoRA + - local: package_reference/adamss + title: AdaMSS + - local: package_reference/ia3 + title: IA3 + - local: package_reference/llama_adapter + title: Llama-Adapter + - local: package_reference/loha + title: LoHa + - local: package_reference/lokr + title: LoKr + - local: package_reference/osf + title: OSF + - local: package_reference/xlora + title: X-LoRA + - local: package_reference/adapter_utils + title: LyCORIS + - local: package_reference/oft + title: OFT + - local: package_reference/boft + title: BOFT + - local: package_reference/psoft + title: PSOFT + - local: package_reference/poly + title: Polytropon + - local: package_reference/vera + title: VeRA + - local: package_reference/pvera + title: PVeRA + - local: package_reference/fourierft + title: FourierFT + - local: package_reference/gralora + title: GraLoRA + - local: package_reference/vblora + title: VB-LoRA + - local: package_reference/hira + title: HiRA + - local: package_reference/hra + title: HRA + - local: package_reference/cpt + title: CPT + - local: package_reference/randlora + title: RandLora + - local: package_reference/shira + title: SHiRA + - local: package_reference/c3a + title: C3A + - local: package_reference/miss + title: MiSS + - local: package_reference/road + title: RoAd + - local: package_reference/waveft + title: WaveFT + - local: package_reference/delora + title: DeLoRA + - local: package_reference/tinylora + title: TinyLoRA + - local: package_reference/lily + title: Lily + - local: package_reference/peanut + title: PEANuT + - local: package_reference/beft + title: BEFT + title: Adapters + - title: Conceptual guides sections: diff --git a/docs/source/conceptual_guides/adapter.md b/docs/source/conceptual_guides/adapter.md index 825df1abac..98352bf6ee 
100644 --- a/docs/source/conceptual_guides/adapter.md +++ b/docs/source/conceptual_guides/adapter.md @@ -22,25 +22,7 @@ This guide will give you a brief overview of the adapter methods supported by PE ## Low-Rank Adaptation (LoRA) -> [!TIP] -> LoRA is one of the most popular PEFT methods and a good starting point if you're just getting started with PEFT. It was originally developed for large language models but it is a tremendously popular training method for diffusion models because of its efficiency and effectiveness. -As mentioned briefly earlier, [LoRA](https://hf.co/papers/2106.09685) is a technique that accelerates finetuning large models while consuming less memory. - -LoRA represents the weight updates ∆W with two smaller matrices (called *update matrices*) through low-rank decomposition. These new matrices can be trained to adapt to the new data while keeping the overall number of parameters low. The original weight matrix remains frozen and doesn't receive any further updates. To produce the final results, the original and extra adapted weights are combined. You could also merge the adapter weights with the base model to eliminate inference latency. - -
- -
- -This approach has a number of advantages: - -* LoRA makes finetuning more efficient by drastically reducing the number of trainable parameters. -* The original pretrained weights are kept frozen, which means you can have multiple lightweight and portable LoRA models for various downstream tasks built on top of them. -* LoRA is orthogonal to other parameter-efficient methods and can be combined with many of them. -* Performance of models finetuned using LoRA is comparable to the performance of fully finetuned models. - -In principle, LoRA can be applied to any subset of weight matrices in a neural network to reduce the number of trainable parameters. However, for simplicity and further parameter efficiency, LoRA is typically only applied to the attention blocks in Transformer models. The resulting number of trainable parameters in a LoRA model depends on the size of the update matrices, which is determined mainly by the rank `r` and the shape of the original weight matrix.
@@ -109,16 +91,16 @@ To avoid adding noise to the tokens, the adapter uses zero-initialized attention ## Householder Reflection Adaptation (HRA) -[HRA](https://huggingface.co/papers/2405.17484) provides a new perspective connecting LoRA to OFT, which means it can harness the advantages of both strategies, reduce parameters and computation costs while penalizing the loss of pre-training knowledge. +[HRA](https://huggingface.co/papers/2405.17484) provides a new perspective connecting LoRA to OFT, which means it can harness the advantages of both strategies, reduce parameters and computation costs while penalizing the loss of pre-training knowledge.
Bridging The Gap between Low-rank and Orthogonal Adaptation via Householder Reflection Adaptation -HRA constructs a chain of `r` trainable Householder reflections (HRs). Because the Householder reflection matrix is an orthogonal matrix and the product of orthogonal matrices is also an orthogonal matrix, HRA satisfies the theoretical guarantee of Orthogonal Finetuning (OFT). Meanwhile, HRA can also be viewed as a low-rank fine-tuning adapter by rewriting formula. +HRA constructs a chain of `r` trainable Householder reflections (HRs). Because the Householder reflection matrix is an orthogonal matrix and the product of orthogonal matrices is also an orthogonal matrix, HRA satisfies the theoretical guarantee of Orthogonal Finetuning (OFT). Meanwhile, HRA can also be viewed as a low-rank fine-tuning adapter by rewriting formula. -The higher `r`, the more trainable parameters, resulting in a larger model capacity and better performance. Besides, due to the chain structure, the orthogonality of HR planes impacts the capacity and regularity of HRA. To achieve a trade-off between the model capacity and regularity, an orthogonality regularizer of the HR planes is added to the loss function. The weight \\(\lambda\\) can control the strength of the regularizer. +The higher `r`, the more trainable parameters, resulting in a larger model capacity and better performance. Besides, due to the chain structure, the orthogonality of HR planes impacts the capacity and regularity of HRA. To achieve a trade-off between the model capacity and regularity, an orthogonality regularizer of the HR planes is added to the loss function. The weight \\(\lambda\\) can control the strength of the regularizer. 
## Bone diff --git a/docs/source/methods/overview.md b/docs/source/methods/overview.md index c88fd30f30..9d0a340990 100644 --- a/docs/source/methods/overview.md +++ b/docs/source/methods/overview.md @@ -1,4 +1,4 @@ - +# Parameter efficient fine-tuning methods +Training a model parameter efficiently means to train as few parameters as possible to achieve comparable performance to training all parameters, i.e. full fine-tuning. There is, of course, no free lunch: by using fewer, and therefore less expressive, parameters, it is not guaranteed that you will get the same performance! You may need to use a specific PEFT method to get optimal results for the model/task combination you want to train. But you will need less memory and possibly less compute during training and may gain features such as fast hot-swapping between trained expert models and less forgetting of previous knowledge compared to full fine-tuning. +Giving general advice for training large models is hard but for generative +models, especially language models, you can follow these steps: -# Prompt-based methods - - Prompting primes a frozen pretrained model for a specific downstream task by including a text prompt that describes the task or even demonstrates an example of the task. With prompting, you can avoid fully training a separate model for each downstream task, and use the same frozen pretrained model instead. This is a lot easier because you can use the same model for several different tasks, and it is significantly more efficient to train and store a smaller set of prompt parameters than to train all the model's parameters.
- -There are two categories of prompting methods: - -- hard prompts are manually handcrafted text prompts with discrete input tokens; the downside is that it requires a lot of effort to create a good prompt -- soft prompts are learnable tensors concatenated with the input embeddings that can be optimized to a dataset; the downside is that they aren't human readable because you aren't matching these "virtual tokens" to the embeddings of a real word - -The PEFT library supports several types of prompting methods (p-tuning, prefix tuning, prompt tuning, ...), explore the table of contents for a full listing of soft prompt methods. -If you're interested in applying these methods to other tasks and use cases, take a look at our [notebook collection](https://huggingface.co/spaces/PEFT/soft-prompting)! - -> [!TIP] -> Some familiarity with the general process of training a causal language model would be really helpful and allow you to focus on the soft prompting methods. If you're new, we recommend taking a look at the [Causal language modeling](https://huggingface.co/docs/transformers/tasks/language_modeling) guide first from the Transformers documentation. When you're ready, come back and see how easy it is to drop PEFT in to your training! - +1. Use prompting (few-shot examples in the prompt) to see if the model is + already capable of the task. If the model solves your problem, great! You can + now use [Prompt-based methods](#prompt-based-methods) to learn the prompt and + save precious tokens. +2. If prompt-based methods are not sufficient you can use [layer tuning](#layer-tuning) + and [adapter methods](#adapter-methods). These methods are generally + more expressive than prompt-based methods and get closer to full-finetuning. +3. Make sure to measure retention of already learnt knowledge since each + fine-tuning step is potentially unlearning past knowledge.
+The [PEFT method comparison suite](https://huggingface.co/spaces/peft-internal-testing/PEFT-method-comparison) aims to give a rough overview of (most) implemented methods on selected benchmarks and models. +## Adapter methods +Adapter methods can be seen as ways of adding relatively small, trainable matrices to existing models for fine-tuning. The goal is to introduce few trainable parameters to steer the big model in the direction of the task that needs fine-tuning to save on resources, such as memory or compute. -### PEFT configuration and model - -For any PEFT method, you'll need to create a configuration which contains all the parameters that specify how the PEFT method should be applied. Once the configuration is setup, pass it to the [`~peft.get_peft_model`] function along with the base model to create a trainable [`PeftModel`]. +A popular way to realize adapters is to insert smaller trainable matrices that are a low-rank decomposition of the adapted weight's layout to save on memory. There are several different ways to express the weight matrix as a low-rank decomposition, but [Low-Rank Adaptation (LoRA)](../package_reference/lora) is the most common method. The PEFT library supports several other variations of this formulation - some are direct variants of LoRA and are documented under LoRA, some are different enough to count as their own methods, such as [Low-Rank Hadamard Product (LoHa)](../package_reference/loha), [Low-Rank Kronecker Product (LoKr)](../package_reference/lokr), and [Adaptive Low-Rank Adaptation (AdaLoRA)](../conceptual_guides/adapter#adaptive-low-rank-adaptation-adalora). If you're interested in applying these methods to other tasks and use cases like semantic segmentation, token classification, take a look at our [notebook collection](https://huggingface.co/collections/PEFT/notebooks-6573b28b33e5a4bf5b157fc1)!
> [!TIP] -> Call the [`~PeftModel.print_trainable_parameters`] method to compare the number of trainable parameters of [`PeftModel`] versus the number of parameters in the base model! - - - - -[P-tuning](../conceptual_guides/prompting#p-tuning) adds a trainable embedding tensor where the prompt tokens can be added anywhere in the input sequence. Create a [`PromptEncoderConfig`] with the task type, the number of virtual tokens to add and learn, and the hidden size of the encoder for learning the prompt parameters. +> LoRA is one of the most popular PEFT methods and a good starting point if you're just getting started with PEFT. It was originally developed for large language models but it is a tremendously popular training method for diffusion models because of its efficiency and effectiveness. -```py -from peft import PromptEncoderConfig, get_peft_model +Low-rank adapters are only one possible adapter formulation; PEFT implements many other types of adapters as well. For example, Orthogonal Fine-Tuning methods ([OFT](../package_reference/oft), [BOFT](../package_reference/boft), ...) use orthogonal decompositions of the adapter weights to achieve small size. Methods like [MiSS](../package_reference/miss) shard matrices and share these shards to save on memory. [IA3](../package_reference/ia3) just introduces three trainable vectors to steer the original model. -peft_config = PromptEncoderConfig(task_type="CAUSAL_LM", num_virtual_tokens=20, encoder_hidden_size=128) -model = get_peft_model(model, peft_config) -model.print_trainable_parameters() -"trainable params: 300,288 || all params: 559,514,880 || trainable%: 0.05366935013417338" -``` - - +## Prompt-based methods -[Prefix tuning](../conceptual_guides/prompting#prefix-tuning) adds task-specific parameters in all of the model layers, which are optimized by a separate feed-forward network. Create a [`PrefixTuningConfig`] with the task type and number of virtual tokens to add and learn.
+Prompting primes a frozen pretrained model for a specific downstream task by including a text prompt that describes the task or even demonstrates an example of the task. With prompting, you can avoid fully training a separate model for each downstream task, and use the same frozen pretrained model instead. This is a lot easier because you can use the same model for several different tasks, and it is significantly more efficient to train and store a smaller set of prompt parameters than to train all the model's parameters. -```py -from peft import PrefixTuningConfig, get_peft_model - -peft_config = PrefixTuningConfig(task_type="CAUSAL_LM", num_virtual_tokens=20) -model = get_peft_model(model, peft_config) -model.print_trainable_parameters() -"trainable params: 983,040 || all params: 560,197,632 || trainable%: 0.1754809274167014" -``` - - - +There are two categories of prompting methods: -[Prompt tuning](../conceptual_guides/prompting#prompt-tuning) formulates all tasks as a *generation* task and it adds a task-specific prompt to the input which is updated independently. The `prompt_tuning_init_text` parameter specifies how to finetune the model (in this case, it is classifying whether tweets are complaints or not). For the best results, the `prompt_tuning_init_text` should have the same number of tokens that should be predicted. To do this, you can set `num_virtual_tokens` to the number of tokens of the `prompt_tuning_init_text`. 
+- hard prompts are manually handcrafted text prompts with discrete input tokens; the downside is that it requires a lot of effort to create a good prompt +- soft prompts are learnable tensors concatenated with the input embeddings that can be optimized to a dataset; the downside is that they aren't human readable because you aren't matching these "virtual tokens" to the embeddings of a real word -Create a [`PromptTuningConfig`] with the task type, the initial prompt tuning text to train the model with, the number of virtual tokens to add and learn, and a tokenizer. +The PEFT library supports several types of prompting methods (p-tuning, prefix tuning, prompt tuning, ...), explore the table of contents for a full listing of soft prompt methods. +If you're interested in applying these methods to other tasks and use cases, take a look at our [notebook collection](https://huggingface.co/spaces/PEFT/soft-prompting)! -```py -from peft import PromptTuningConfig, PromptTuningInit, get_peft_model +> [!TIP] +> Some familiarity with the general process of training a causal language model would be really helpful and allow you to focus on the soft prompting methods. If you're new, we recommend taking a look at the [Causal language modeling](https://huggingface.co/docs/transformers/tasks/language_modeling) guide first from the Transformers documentation. When you're ready, come back and see how easy it is to drop PEFT in to your training! 
-prompt_tuning_init_text = "Classify if the tweet is a complaint or no complaint.\n" -peft_config = PromptTuningConfig( - task_type="CAUSAL_LM", - prompt_tuning_init=PromptTuningInit.TEXT, - num_virtual_tokens=len(tokenizer(prompt_tuning_init_text)["input_ids"]), - prompt_tuning_init_text=prompt_tuning_init_text, - tokenizer_name_or_path="bigscience/bloomz-560m", -) -model = get_peft_model(model, peft_config) -model.print_trainable_parameters() -"trainable params: 8,192 || all params: 559,222,784 || trainable%: 0.0014648902430985358" -``` +## Layer Tuning - - +Layer Tuning categorizes methods that target specific layers of a model such as [LayerNorm Tuning](../package_reference/layernorm_tuning) +or targeting specific tokens in the embedding matrix via [TrainableTokens](../package_reference/trainable_tokens). diff --git a/docs/source/package_reference/adalora.md b/docs/source/package_reference/adalora.md index 9cc51d0e09..cacdb4bf1e 100644 --- a/docs/source/package_reference/adalora.md +++ b/docs/source/package_reference/adalora.md @@ -22,10 +22,23 @@ The abstract from the paper is: *Fine-tuning large pre-trained language models on downstream tasks has become an important paradigm in NLP. However, common practice fine-tunes all of the parameters in a pre-trained model, which becomes prohibitive when a large number of downstream tasks are present. Therefore, many fine-tuning methods are proposed to learn incremental updates of pre-trained weights in a parameter efficient way, e.g., low-rank increments. These methods often evenly distribute the budget of incremental updates across all pre-trained weight matrices, and overlook the varying importance of different weight parameters. As a consequence, the fine-tuning performance is suboptimal. To bridge this gap, we propose AdaLoRA, which adaptively allocates the parameter budget among weight matrices according to their importance score. 
In particular, AdaLoRA parameterizes the incremental updates in the form of singular value decomposition. Such a novel approach allows us to effectively prune the singular values of unimportant updates, which is essentially to reduce their parameter budget but circumvent intensive exact SVD computations. We conduct extensive experiments with several pre-trained models on natural language processing, question answering, and natural language generation to validate the effectiveness of AdaLoRA. Results demonstrate that AdaLoRA manifests notable improvement over baselines, especially in the low budget settings. Our code is publicly available at https://github.com/QingruZhang/AdaLoRA*. + +## Benchmark overview + + + + +# API + ## AdaLoraConfig [[autodoc]] tuners.adalora.config.AdaLoraConfig ## AdaLoraModel -[[autodoc]] tuners.adalora.model.AdaLoraModel \ No newline at end of file +[[autodoc]] tuners.adalora.model.AdaLoraModel diff --git a/docs/source/package_reference/ia3.md b/docs/source/package_reference/ia3.md index 330bd0d58e..1b70292d5d 100644 --- a/docs/source/package_reference/ia3.md +++ b/docs/source/package_reference/ia3.md @@ -16,6 +16,11 @@ rendered properly in your Markdown viewer. # IA3 +
+ +
+IA3 introduces three vectors, lv, lk and lff to scale value, key and feed-forward activations (image source). + Infused Adapter by Inhibiting and Amplifying Inner Activations, or [IA3](https://hf.co/papers/2205.05638), is a method that adds three learned vectors to rescale the keys and values of the self-attention and encoder-decoder attention layers, and the intermediate activation of the position-wise feed-forward network. The abstract from the paper is: diff --git a/docs/source/package_reference/lora.md b/docs/source/package_reference/lora.md index 132336ad05..64d698540e 100644 --- a/docs/source/package_reference/lora.md +++ b/docs/source/package_reference/lora.md @@ -16,12 +16,57 @@ rendered properly in your Markdown viewer. # LoRA +> [!TIP] +> LoRA is one of the most popular PEFT methods and a good starting point if you're just getting started with PEFT. It was originally developed for large language models but it is a tremendously popular training method for diffusion models because of its efficiency and effectiveness. + +As mentioned briefly earlier, [LoRA](https://hf.co/papers/2106.09685) is a technique that accelerates finetuning large models while consuming less memory. + +LoRA represents the weight updates ∆W with two smaller matrices (called *update matrices*) through low-rank decomposition. These new matrices can be trained to adapt to the new data while keeping the overall number of parameters low. The original weight matrix remains frozen and doesn't receive any further updates. To produce the final results, the original and extra adapted weights are combined. You could also merge the adapter weights with the base model to eliminate inference latency. + +
+ +
+ +This approach has a number of advantages: + +* LoRA makes finetuning more efficient by drastically reducing the number of trainable parameters. +* The original pretrained weights are kept frozen, which means you can have multiple lightweight and portable LoRA models for various downstream tasks built on top of them. +* LoRA is orthogonal to other parameter-efficient methods and can be combined with many of them. +* Performance of models finetuned using LoRA is comparable to the performance of fully finetuned models. + +In principle, LoRA can be applied to any subset of weight matrices in a neural network to reduce the number of trainable parameters. However, for simplicity and further parameter efficiency, LoRA is typically only applied to the attention blocks in Transformer models. The resulting number of trainable parameters in a LoRA model depends on the size of the update matrices, which is determined mainly by the rank `r` and the shape of the original weight matrix. + Low-Rank Adaptation ([LoRA](https://huggingface.co/papers/2309.15223)) is a PEFT method that decomposes a large matrix into two smaller low-rank matrices in the attention layers. This drastically reduces the number of parameters that need to be fine-tuned. The abstract from the paper is: *We propose a neural language modeling system based on low-rank adaptation (LoRA) for speech recognition output rescoring. Although pretrained language models (LMs) like BERT have shown superior performance in second-pass rescoring, the high computational cost of scaling up the pretraining stage and adapting the pretrained models to specific domains limit their practical use in rescoring. Here we present a method based on low-rank decomposition to train a rescoring BERT model and adapt it to new domains using only a fraction (0.08%) of the pretrained parameters. These inserted matrices are optimized through a discriminative training objective along with a correlation-based regularization loss. 
The proposed low-rank adaptation Rescore-BERT (LoRB) architecture is evaluated on LibriSpeech and internal datasets with decreased training times by factors between 5.4 and 3.6.*. +You can initialize the low-rank matrices with different use-cases in mind - task awareness (CoRDA, EVA), faster convergence (PiSSA), mitigating quantizations (LoftQ) - just to name a few use-cases. Read about the different initializations [here](../package_reference/lora_initializations). The default initialization is for LoRA to be a no-op, to gradually learn new behavior without interfering much with the existing model. + +Since LoRA is a very popular method there are several incremental variations of LoRA that we call variants - they are often on-par or better-performing than LoRA and can be thought of as their own PEFT methods but based on LoRA. [Explore LoRA variants](../package_reference/lora_variants). + +## Usage + +LoRA decomposes the weight update matrix into *two* smaller matrices. The size of these low-rank matrices is determined by its *rank* or `r`. A higher rank means the model has more parameters to train, but it also means the model has more learning capacity. You'll also want to specify the `target_modules` which determine where the smaller matrices are inserted. In the following example, you'll target the *query* and *value* matrices of the attention blocks. Other important parameters to set are `lora_alpha` (scaling factor), `bias` (whether `none`, `all` or only the LoRA bias parameters should be trained), and `modules_to_save` (the modules apart from the LoRA layers to be trained and saved). All of these parameters - and more - are found in the [`LoraConfig`]. 
+ +```py +from peft import LoraConfig, get_peft_model + +config = LoraConfig( + r=16, + lora_alpha=16, + target_modules=["query", "value"], + lora_dropout=0.1, + bias="none", + modules_to_save=["classifier"], +) +model = get_peft_model(model, config) +model.print_trainable_parameters() +"trainable params: 667,493 || all params: 86,543,818 || trainable%: 0.7712775047664294" +``` + + ## Benchmark overview +## Training + +This section shows how to handle more complex training scenarios instead of only applying a low-rank adapter +to the model and feed data. + +### QLoRA-style training + +The default LoRA settings in PEFT add trainable weights to the query and value layers of each attention block. But [QLoRA](https://hf.co/papers/2305.14314), which adds trainable weights to all the linear layers of a transformer model, can provide performance equal to a fully finetuned model. To apply LoRA to all the linear layers, like in QLoRA, set `target_modules="all-linear"` (easier than specifying individual modules by name which can vary depending on the architecture). + +```py +config = LoraConfig(target_modules="all-linear", ...) +``` + +For more information about how to apply quantization to PEFT adapters, refer to the [quantization guide](quantization). + +### Memory efficient Layer Replication with LoRA + +An approach used to improve the performance of models is to expand a model by duplicating layers in the model to build a larger model from a pretrained model of a given size. For example increasing a 7B model to a 10B model as described in the [SOLAR](https://huggingface.co/papers/2312.15166) paper. PEFT LoRA supports this kind of expansion in a memory efficient manner that supports further fine-tuning using LoRA adapters attached to the layers post replication of the layers. The replicated layers do not take additional memory as they share the underlying weights so the only additional memory required is the memory for the adapter weights. 
To use this feature you would create a config with the `layer_replication` argument. + +```py +config = LoraConfig(layer_replication=[[0,4], [2,5]], ...) +``` + +Assuming the original model had 5 layers `[0, 1, 2 ,3, 4]`, this would create a model with 7 layers arranged as `[0, 1, 2, 3, 2, 3, 4]`. This follows the [mergekit](https://github.com/arcee-ai/mergekit) pass through merge convention where sequences of layers specified as start inclusive and end exclusive tuples are stacked to build the final model. Each layer in the final model gets its own distinct set of LoRA adapters. + +[Fewshot-Metamath-OrcaVicuna-Mistral-10B](https://huggingface.co/abacusai/Fewshot-Metamath-OrcaVicuna-Mistral-10B) is an example of a model trained using this method on Mistral-7B expanded to 10B. The +[adapter_config.json](https://huggingface.co/abacusai/Fewshot-Metamath-OrcaVicuna-Mistral-10B/blob/main/adapter_config.json) shows a sample LoRA adapter config applying this method for fine-tuning. + +### Fine grained control over ranks and alpha (scaling) + +By default, all layers targeted with LoRA will have the same rank `r` and the same `lora_alpha` (which determines the LoRA scaling), depending on what was specified in the [`LoraConfig`]. In some cases, however, you may want to indicate different values for different layers. This is possible by passing the `rank_pattern` and `alpha_pattern` arguments to [`LoraConfig`]. These arguments should be dictionaries with the key being the layer name and the value being the rank/alpha value. The keys can be [regular expressions](https://docs.python.org/3/library/re.html) (regex). All LoRA layers that are not explicitly mentioned in `rank_pattern` and `alpha_pattern` will take the default `r` and `lora_alpha` values. + +To give an example, let's assume that we have a model with the following structure: + +```python +>>> print(model) +Outer( + (foo): Linear(...) + (module): Middle( + (foo): Linear(...) + (foobar): Linear(...) 
+ (module): Inner( + (foo): Linear(...) + (barfoo): Linear(...) + ) + ) +) +``` + +- `rank_pattern={"foo": 42}` will match all 3 `foo` layers. Neither `foobar` nor `barfoo` are matched. +- `rank_pattern={"^foo": 42}` will only match the `foo` layer of the model, but neither `module.foo` nor `module.module.foo`. This is because the `^` means "start of string" when using regular expressions, and only `foo` starts with `"foo"`, the other layer names have prefixes. +- `rank_pattern={"^module.foo": 42}` matches only `module.foo`, but not `module.module.foo`, for the same reason. +- `rank_pattern={"module.foo": 42}` matches both `module.foo` and `module.module.foo`, but not `foo`. +- `rank_pattern={"^foo": 42, "^module.module.foo": 55}` matches `foo` and `module.module.foo`, respectively, but not `module.foo`. +- There is no need to indicate `$` to mark the end of the match, as this is added automatically by PEFT. + +The same logic applies to `alpha_pattern`. If you're in doubt, don't try to get fancy with regular expressions -- just pass the full name for each module with a different rank/alpha, preceded by the `^` prefix, and you should be good. + +### Targeting `nn.Parameter` directly + +Generally, you should use `target_modules` to target the module (e.g. `nn.Linear`). However, in some circumstances, this is not possible. E.g., in many mixture-of-experts (MoE) layers in HF Transformers, instead of using `nn.Linear`, an `nn.Parameter` is used. PEFT normally overwrites the `forward` method for LoRA, but for `nn.Parameter`, there is none. Therefore, to apply LoRA to that parameter, it needs to be targeted with `target_parameters`. As an example, for [Llama4](https://huggingface.co/collections/meta-llama/llama-4-67f0c30d9fe03840bc9d0164), you can pass: `target_parameters=['feed_forward.experts.gate_up_proj', 'feed_forward.experts.down_proj']`. + +Note that when targeting expert parameters, PEFT can add a substantial runtime overhead.
The reason is that PEFT always materializes the LoRA contribution for _each expert_ even if only a small number of experts is required. During training, this is less relevant since, over the course of the sequence, typically a large fraction of experts is activated at least once. However, during inference, normally a KV cache is used and we thus need to only compute the last token, which means that only a small number of experts is activated. Therefore, using LoRA on MoE layers can result in a substantial slowdown at inference time. The recommendation is thus to merge the weights (`model.merge_adapter()` or `model = model.merge_and_unload()`). This removes the PEFT overhead. + +A more detailed investigation of this issue can be found in this [pull request on MoE optimization](https://github.com/huggingface/peft/pull/3139). + +#### Caveats + +- At the moment, this argument allows targeting 2-dim or 3-dim `nn.Parameter`s. It is assumed that in the case of a 3-dim parameter, the 0th dimension is the expert dimension. +- It is currently not possible to add multiple LoRA adapters (via `model.add_adapter` or `model.load_adapter`) that use `target_parameters` at the same time. + +#### MoE expert parameters and vLLM + +Some MoE models in Transformers store expert weights as `nn.Parameter` tensors (often 3D), not `nn.Linear` modules.
+To apply LoRA to those experts, use `target_parameters` and set a per-layer rank with `rank_pattern`: + +```python +num_experts = getattr(model.config, "num_local_experts", None) or model.config.num_experts +effective_r = max(1, r // num_experts) +config = LoraConfig( + r=r, + lora_alpha=32, + target_modules=["q_proj", "v_proj"], + target_parameters=[ + # Mixtral / Qwen3-MoE / GPT-OSS + "mlp.experts.gate_up_proj", + "mlp.experts.down_proj", + # Llama4 + # "feed_forward.experts.gate_up_proj", + # "feed_forward.experts.down_proj", + ], + rank_pattern={ + "experts.gate_up_proj": effective_r, + "experts.down_proj": effective_r, + }, +) +``` + +This keeps the total LoRA parameter budget similar to dense layers (see +[LoRA Without Regret](https://thinkingmachines.ai/blog/lora/) by Schulman et al.). +Non-expert modules use the default rank `r`. + +Accelerated inference with the fine-tuned model is possible with, for example, [vLLM](https://vllm.ai/) which supports fused MoE expert layers since v0.11.2. + +### Efficiently train tokens alongside LoRA + +PEFT LoRA adapters support adding new tokens with the `trainable_token_indices` parameter. This allows tuning of other tokens alongside fine-tuning specific layers. Only the specified tokens are trained and all other tokens are untouched. It saves memory and doesn't throw away learned context from existing token embeddings unlike training the whole embedding matrix. Under the hood this method uses the layer of [`TrainableTokensModel`]. + +```py +# for layer 'embed_tokens' +config = LoraConfig(trainable_token_indices=[idx_1, idx_2, ...], ...) + +# specific embedding layer +config = LoraConfig(trainable_token_indices={'emb_tokens': [idx_1, idx_2, ...]}, ...) +``` + +In the snippet below we show how to add new tokens to the model and how to train them alongside the other layers in the model.
+ +```py +from transformers import AutoTokenizer, AutoModelForCausalLM +from peft import get_peft_model, LoraConfig + +base_model = AutoModelForCausalLM.from_pretrained("mistralai/Mistral-7B-v0.1") +tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-v0.1") + +# we define our new tokens and add them to the tokenizer as special tokens +special_tokens = ['<|start_think|>', '<|stop_think|>'] +tokenizer.add_special_tokens({'additional_special_tokens': special_tokens}) + +# make room for new tokens in the embedding matrix if it isn't big enough already +base_model.resize_token_embeddings(max(len(tokenizer), base_model.model.embed_tokens.num_embeddings)) + +# typical LoRA config with `trainable_token_indices` targeting embedding layer `embed_tokens` +# and specifically our new tokens we just added +lora_config = LoraConfig( + target_modules='all-linear', + trainable_token_indices={'embed_tokens': tokenizer.convert_tokens_to_ids(special_tokens)}, +) +peft_model = get_peft_model(base_model, lora_config) + +# proceed to train the model like normal +[...] +``` + +The token weights are saved as a part of the adapter state dict alongside the LoRA weights. Full fine-tuning and saving the embedding matrix would have stored a much bigger file. + +To give a bit of an indication how much VRAM can be saved, a rudimentary comparison of the above example was made between training the embedding matrix fully (`modules_to_save=["embed_tokens"]`), using a LoRA for the embedding matrix (`target_modules=[..., "embed_tokens"]`, rank 32) and trainable tokens (`trainable_token_indices=[...]`, 6 tokens): + +| | Trainable Tokens | LoRA | Full Fine-tuning | +| --------: | :--------------: | :--------: | :--------------: | +| VRAM | 15,562 MB | 15,581MB | ~16,500MB | +| Influence | 6 tokens | all tokens | all tokens | + +### Weight tying + +Many causal LMs use **weight tying**, where two or more weights share the same underlying parameters. 
In the most common case, the input embedding weights (`embed_tokens`) and output projection weights (`lm_head`) share the same tensor. This is because it reduces parameters and usually preserves model quality. + +It's not always obvious how PEFT deals with these tied weights when they are targeted for fine-tuning. For LoRA, the `ensure_weight_tying` on the [`LoraConfig`] controls whether PEFT should explicitly keep adapter-side updates tied for those layers. In practice, this can affect `modules_to_save`, `target_modules`, and `trainable_token_indices`. Note that this logic partially relies on convention when it comes to naming the layers (`"embed_tokens"`, `"lm_head"`) and proper working cannot be guaranteed if those conventions are not used. + +The tables below summarize expected behavior. + +#### `modules_to_save` + +| Base model weights tied | `ensure_weight_tying` | `LoraConfig` shape | Behavior | +|-------------------------|-----------------------|-----------------------------------------------------|--------------------------------------------------------------| +| No | `False` | `modules_to_save=["embed_tokens"]` or `["lm_head"]` | Add `ModulesToSaveWrapper` on selected layer only | +| No | `True` | `modules_to_save=["embed_tokens"]` or `["lm_head"]` | Warn, then add `ModulesToSaveWrapper` on selected layer only | +| Yes | `False` | `modules_to_save=["embed_tokens"]` or `["lm_head"]` | Treat as separate | +| Yes | `True` | `modules_to_save=["embed_tokens"]` or `["lm_head"]` | Wrap tied layers and keep wrappers tied | +| No | `False` | `modules_to_save=["embed_tokens", "lm_head"]` | Treat as separate | +| No | `True` | `modules_to_save=["embed_tokens", "lm_head"]` | Warn, then treat as separate | +| Yes | `False` | `modules_to_save=["embed_tokens", "lm_head"]` | Warn, then treat as separate | +| Yes | `True` | `modules_to_save=["embed_tokens", "lm_head"]` | Keep `ModulesToSaveWrapper`s tied | + +#### `target_modules` + +| Base model weights tied | 
`ensure_weight_tying` | `LoraConfig` shape | Behavior | +|-------------------------|-----------------------|----------------------------------------------------|--------------------------------------------| +| No | `False` | `target_modules=["embed_tokens"]` or `["lm_head"]` | Add LoRA on selected layer only | +| No | `True` | `target_modules=["embed_tokens"]` or `["lm_head"]` | Warn, then add LoRA on selected layer only | +| Yes | `False` | `target_modules=["embed_tokens"]` or `["lm_head"]` | Treat as separate | +| Yes | `True` | `target_modules=["embed_tokens"]` or `["lm_head"]` | Keep LoRA adapters tied | +| No | `False` | `target_modules=["embed_tokens", "lm_head"]` | Treat as separate | +| No | `True` | `target_modules=["embed_tokens", "lm_head"]` | Warn, then treat as separate | +| Yes | `False` | `target_modules=["embed_tokens", "lm_head"]` | Warn, then treat as separate | +| Yes | `True` | `target_modules=["embed_tokens", "lm_head"]` | Keep LoRA adapters tied | + +#### `trainable_token_indices` + +For trainable tokens, we have the additional complication that even if the LM head and embeddings are tied, as a user I may want to fine-tune *different* tokens on them. In the example table below, we thus differentiate between fine-tuning the same and fine-tuning different tokens. 
+ +| Base model weights tied | `ensure_weight_tying` | `LoraConfig` shape | Behavior | +|-------------------------|-----------------------|-----------------------------------------------------------------------|------------------------------------------------| +| No | `False` | `trainable_token_indices=[1, 2, 3]` | Trainable tokens on embeddings only | +| No | `True` | `trainable_token_indices=[1, 2, 3]` | Warn, then trainable tokens on embeddings only | +| Yes | `False` | `trainable_token_indices=[1, 2, 3]` | Tied trainable tokens | +| Yes | `True` | `trainable_token_indices=[1, 2, 3]` | Tied trainable tokens | +| No | `False` | `trainable_token_indices={"lm_head": [1, 2], "embed_tokens": [1, 2]}` | Treat as separate | +| No | `True` | `trainable_token_indices={"lm_head": [1, 2], "embed_tokens": [1, 2]}` | Warn, then treat as separate | +| Yes | `False` | `trainable_token_indices={"lm_head": [1, 2], "embed_tokens": [1, 2]}` | Tied trainable tokens | +| Yes | `True` | `trainable_token_indices={"lm_head": [1, 2], "embed_tokens": [1, 2]}` | Tied trainable tokens | +| No | `False` | `trainable_token_indices={"lm_head": [1, 2], "embed_tokens": [3, 4]}` | Treat as separate | +| No | `True` | `trainable_token_indices={"lm_head": [1, 2], "embed_tokens": [3, 4]}` | Warn, then treat as separate | +| Yes | `False` | `trainable_token_indices={"lm_head": [1, 2], "embed_tokens": [3, 4]}` | Treat as separate | +| Yes | `True` | `trainable_token_indices={"lm_head": [1, 2], "embed_tokens": [3, 4]}` | Error | + +For users, this means: + +- In general, if you want to fine-tune weights that are tied and want to keep them tied, pass `ensure_weight_tying=True`. +- If your base model's weights are untied, `ensure_weight_tying=True` cannot force tying and only warns. +- For `trainable_token_indices`, tied layers must use the same token indices when `ensure_weight_tying=True`. + +## Optimizers + +LoRA training can optionally include special purpose optimizers. 
Currently PEFT supports LoRA-FA and LoRA+. + +### LoRA-FA Optimizer + +LoRA training can be more effective and efficient using LoRA-FA, as described in [LoRA-FA](https://huggingface.co/papers/2308.03303). LoRA-FA reduces activation memory consumption by fixing the matrix A and only tuning the matrix B. During training, the gradient of B is optimized to approximate the full parameter fine-tuning gradient. Moreover, the memory consumption of LoRA-FA is not sensitive to the rank (since it erases the activation of $A$), therefore it can improve performance by enlarging the LoRA rank without increasing memory consumption. + +```py +from peft import LoraConfig, get_peft_model +from peft.optimizers import create_lorafa_optimizer +from transformers import AutoModelForCausalLM, Trainer, get_cosine_schedule_with_warmup + +base_model = AutoModelForCausalLM.from_pretrained("meta-llama/Meta-Llama-3-8B-Instruct") + +config = LoraConfig(...) +model = get_peft_model(base_model, config) + +optimizer = create_lorafa_optimizer( + model=model, + r=128, + lora_alpha=32, + lr=7e-5, +) + +scheduler = get_cosine_schedule_with_warmup( + optimizer, + num_warmup_steps=100, + num_training_steps=1000, +) + +trainer = Trainer( + ..., + optimizers=(optimizer, scheduler), +) +``` + +### LoRA+ optimized LoRA + +LoRA training can be optimized using [LoRA+](https://huggingface.co/papers/2402.12354), which uses different learning rates for the adapter matrices A and B, shown to increase finetuning speed by up to 2x and performance by 1-2%. + +```py +from peft import LoraConfig, get_peft_model +from peft.optimizers import create_loraplus_optimizer +from transformers import Trainer +import bitsandbytes as bnb + +base_model = ... +config = LoraConfig(...) +model = get_peft_model(base_model, config) + +optimizer = create_loraplus_optimizer( + model=model, + optimizer_cls=bnb.optim.Adam8bit, + lr=5e-5, + loraplus_lr_ratio=16, +) +scheduler = None + +...
+trainer = Trainer( + ..., + optimizers=(optimizer, scheduler), +) +``` + + +## Post-Training + +This section shows potential post-processing methods for trained adapters. + + +### Merge LoRA weights into the base model + +While LoRA is significantly smaller and faster to train, you may encounter latency issues during inference due to separately loading the base model and the LoRA adapter. To eliminate latency, use the [`~LoraModel.merge_and_unload`] function to merge the adapter weights with the base model. This allows you to use the newly merged model as a standalone model. The [`~LoraModel.merge_and_unload`] function doesn't keep the adapter weights in memory. + +Below is a diagram that explains the intuition of LoRA adapter merging: + +
+ +
+ +We show in the snippets below how to run that using PEFT. + +```py +from transformers import AutoModelForCausalLM +from peft import PeftModel + +base_model = AutoModelForCausalLM.from_pretrained("mistralai/Mistral-7B-v0.1") +peft_model_id = "alignment-handbook/zephyr-7b-sft-lora" +model = PeftModel.from_pretrained(base_model, peft_model_id) +model = model.merge_and_unload() +``` + +It is important to assign the returned model to a variable and use it, [`~LoraModel.merge_and_unload`] is not an in-place operation. If you need to keep a copy of the weights so you can unmerge the adapter later or delete and load different ones, you should use the [`~LoraModel.merge_adapter`] function instead. Now you have the option to use [`~LoraModel.unmerge_adapter`] to return the base model. + +```py +from transformers import AutoModelForCausalLM +from peft import PeftModel + +base_model = AutoModelForCausalLM.from_pretrained("mistralai/Mistral-7B-v0.1") +peft_model_id = "alignment-handbook/zephyr-7b-sft-lora" +model = PeftModel.from_pretrained(base_model, peft_model_id) +model.merge_adapter() + +# unmerge the LoRA layers from the base model +model.unmerge_adapter() +``` + +The [`~LoraModel.add_weighted_adapter`] function is useful for merging multiple LoRAs into a new adapter based on a user provided weighting scheme in the `weights` parameter. Below is an end-to-end example. 
+ +First load the base model: + +```python +from transformers import AutoModelForCausalLM +from peft import PeftModel +import torch + +base_model = AutoModelForCausalLM.from_pretrained( + "mistralai/Mistral-7B-v0.1", dtype=torch.float16, device_map="auto" +) +``` + +Then we load the first adapter: + +```python +peft_model_id = "alignment-handbook/zephyr-7b-sft-lora" +model = PeftModel.from_pretrained(base_model, peft_model_id, adapter_name="sft") +``` + +Then load a different adapter and merge it with the first one: + +```python +weighted_adapter_name = "sft-dpo" +model.load_adapter("alignment-handbook/zephyr-7b-dpo-lora", adapter_name="dpo") +model.add_weighted_adapter( + adapters=["sft", "dpo"], + weights=[0.7, 0.3], + adapter_name=weighted_adapter_name, + combination_type="linear" +) +model.set_adapter(weighted_adapter_name) +``` + +> [!TIP] +> There are several supported methods for `combination_type`. Refer to the [documentation](../package_reference/lora#peft.LoraModel.add_weighted_adapter) for more details. Note that "svd" as the `combination_type` is not supported when using `torch.float16` or `torch.bfloat16` as the datatype. + +Now, perform inference: + +```python +device = torch.accelerator.current_accelerator().type if hasattr(torch, "accelerator") else "cuda" + +tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-v0.1") + +prompt = "Hey, are you conscious? Can you talk to me?" 
+inputs = tokenizer(prompt, return_tensors="pt") +inputs = {k: v.to(device) for k, v in inputs.items()} + +with torch.no_grad(): + generate_ids = model.generate(**inputs, max_length=30) +outputs = tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0] +print(outputs) +``` + +### Recovering base model performance via intruder dimension reduction + +The paper [LoRA vs Full Fine-tuning: An Illusion of Equivalence](https://huggingface.co/papers/2410.21228) argues +that LoRA training introduces extra dimensions into the weights that have very little in common with the already +learnt weights and lead to forgetting of already learned information. PEFT implements the suggested mitigation +in [`peft.tuners.lora.intruders.reduce_intruder_dimension`]. + +The mitigation will take a PEFT model with a loaded LoRA and create a new, modified adapter that is loaded alongside +the existing adapter and now the active adapter. + +Example usage: + +```python +from peft.tuners.lora.intruders import reduce_intruder_dimension + +peft_model = AutoPeftModelForCausalLM.from_pretrained('hubnemo/llama-3.2b-metamathqa-lora64') + +reduce_intruder_dimension( + peft_model, + mitigation_lambda=0.75, +) + +peft_model.generate(...) +``` + +There are a few hyper-parameters that can be used for tuning the effectiveness of the mitigation but, as evidenced +in Figure 8 of the paper, it will always be a trade-off between task accuracy learned by the adapter and forgetting +of the base model's knowledge. The mitigation will remove information from the adapter to reduce the impact on +forgetting previous knowledge but this also means that some information about the task learned by the adapter is +lost as well. + +While the defaults are set to deliver a good trade-off between the two factors it is not guaranteed that the defaults +will hold for your adapter, your model and your data, therefore it is wise to have a benchmark ready to measure +the effect. 
+ +## Load adapters + +Adapters can be loaded onto a pretrained model with [`~PeftModel.load_adapter`], which is useful for trying out different adapters whose weights aren't merged. Set the active adapter weights with the [`~LoraModel.set_adapter`] function. + +```py +from transformers import AutoModelForCausalLM +from peft import PeftModel + +base_model = AutoModelForCausalLM.from_pretrained("mistralai/Mistral-7B-v0.1") +peft_model_id = "alignment-handbook/zephyr-7b-sft-lora" +model = PeftModel.from_pretrained(base_model, peft_model_id) + +# load different adapter +model.load_adapter("alignment-handbook/zephyr-7b-dpo-lora", adapter_name="dpo") + +# set adapter as active +model.set_adapter("dpo") +``` + +To return the base model, you could use [`~LoraModel.unload`] to unload all of the LoRA modules or [`~LoraModel.delete_adapter`] to delete the adapter entirely. [`~LoraModel.unload`] is not an in-place operation, remember to assign the returned model to a variable and use it. + +```py +# unload adapter +model = model.unload() + +# delete adapter +model.delete_adapter("dpo") +``` + +## Tensor Parallelism + +LoRA supports [Tensor Parallelism (TP)](https://huggingface.co/docs/transformers/main/en/perf_train_gpu_many#tensor-parallelism) as provided by Transformers. When a base model is loaded with a `tp_plan`, PEFT automatically detects the TP configuration of each target module and adds the appropriate hooks to the LoRA adapter weights so that they participate correctly in the tensor-parallel computation. + +> [!WARNING] +> Tensor Parallelism support for LoRA requires `transformers >= 5.4.0`. 
+ +Usage is identical to the standard LoRA workflow — simply load the base model with a `tp_plan` before wrapping it with PEFT: + +```py +from transformers import AutoModelForCausalLM +from peft import get_peft_model, LoraConfig + +model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-3.1-8B", tp_plan="auto") +lora_config = LoraConfig(r=16, target_modules=["q_proj", "v_proj"]) +model = get_peft_model(model, lora_config) +``` + +Saving and loading work as usual via `save_pretrained` / `from_pretrained`. PEFT gathers the sharded adapter weights back to full tensors before saving, so checkpoints are portable and independent of the number of devices used during training. + + +## Inference + +This section showcases what you can do during inference time with LoRA, such as uncoupling the adapter. + +### Activated LoRA (aLoRA) + +Activated LoRA (aLoRA) is a low rank adapter architecture for causal LMs that reuses the existing base model KV cache for more efficient inference. This approach is best suited for inference pipelines which rely on the base model for most tasks/generations, but use aLoRA adapter(s) to perform specialized task(s) within the chain. For example, checking or correcting generated outputs of the base model. In these settings, inference times can be sped up by an order of magnitude or more. For more information on aLoRA and many example use cases, see the aLoRA [paper](https://huggingface.co/papers/2504.12397). + +This technique scans for the last occurrence of an invocation sequence (`alora_invocation_tokens`) in each input (this can be as short as 1 token). It activates the adapter weights on tokens starting with the beginning of the invocation sequence. Any inputs after the invocation sequence are also adapted, and all generated tokens will use the adapted weights. Weights on prior tokens are left un-adapted, making the cache for those tokens interchangeable with base model cache due to the causal attention mask in causal LMs. 
Usage is very similar to standard LoRA. The key difference is that the invocation sequence must be specified when the adapter is created: + +```py +from peft import LoraConfig + +config = LoraConfig(alora_invocation_tokens=alora_invocation_tokens, task_type="CAUSAL_LM", ...) +``` + +`alora_invocation_tokens` is a list of integer token ids. Given a desired invocation string, this can be obtained as: +```py +invocation_string = "placeholder" +alora_invocation_tokens = tokenizer.encode(invocation_string, add_special_tokens=False) +``` +The tokenizer is the base model's tokenizer. Use `add_special_tokens=False` to avoid adding `SOS`/`EOS` tokens in our search string (which will most likely cause the search to fail). + +**Notes** +* aLoRA is only supported for `task_type=CAUSAL_LM` tasks due to its focus on cache reuse. +* Since the weights are adapted on fewer tokens, often (not always) aLoRA requires higher rank (`r`) than LoRA. `r=32` can be a good starting point. +* aLoRA weights cannot be merged into the base model by definition, since the adapter weights are selectively applied to a subset of tokens. Attempts to merge will throw errors. +* Beam search is not yet supported. +* It is generally not recommended to add new tokens to the tokenizer that are not present in the base model. This can complicate the target use case of both the base model and adapter model operating on overlapping context. You can work around this by adding [trainable tokens](../package_reference/trainable_tokens) to the base model prior to training the adapter. + +#### Choice of invocation sequence and SFT design + +You must add the `alora_invocation_tokens` sequence because it is not added automatically. We recommend activating the adapter weights early (at the start of any adapter-specific prompting), but after any long inputs, to maximize model performance without compromising cache reuse. As with any model, +formatting should be consistent between train and test. 
+ +Consider the following example, where the base model has a chat template, +and the goal is to train the adapter to generate a desired output. + +* Option 1: If there is no task-specific prompt, i.e. the input is a chat history with the `assistant` prompt, then the chat template's `assistant` prompt (e.g. `<|start_of_role|>assistant<|end_of_role|>`) is a natural choice for the invocation string. See the model's chat template to find the prompt for the model. +* Option 2: If there is a task-specific prompt for the adapter that describes the task the adapter is learning, and that prompt is put as a `user` turn immediately prior to the generation, then the chat template's `user` prompt (e.g. `<|start_of_role|>user<|end_of_role|>`) is a natural choice for the invocation string. + +After deciding on an invocation string, get the model tokenizer and obtain `alora_invocation_tokens` as +```py +alora_invocation_tokens = tokenizer.encode(invocation_string, add_special_tokens=False) +``` + +An example inference setup is at [alora finetuning](https://github.com/huggingface/peft/blob/main/examples/alora_finetuning/alora_finetuning.py). + +> [!NOTE] +> If using custom strings for the invocation string, make sure that the start and end of the string are special tokens to avoid issues with tokenization at the boundaries. + +To see why, imagine that 'a', 'b', 'c', and 'ab' are tokens in your tokenizer (numbers 1, 2, 3, 4 respectively). Suppose that `alora_invocation_tokens = [2, 3]`. Now imagine your input string is "abc". Because "ab" is a token, this will get tokenized as `[4, 3]`. So `alora_invocation_tokens` will not be found, even though the input contains the string "bc". If the start and end of the invocation string are special tokens, however, this failure case will never happen since special tokens are never tokenized into the same token with other characters. 
+ +#### Using (and reusing) cache for generation +The main purpose of aLoRA is to make KV cache interchangeable between the base model and aLoRA adapter models **prior to the invocation sequence** since base and adapted KV values are not compatible. Specifically, keys and values stored during one model generation can be used in subsequent generations to avoid expensive prefill operations for context tokens. When sharing cache between the base model and aLoRA adapters, there are 2 main patterns: +1. The base model has generated something, and an aLoRA adapter is then called to do a follow-up generation. For example, the base model answers a question, and an aLoRA trained to detect hallucinations checks the base model response. +2. An aLoRA adapter has generated something, and the base model or a different aLoRA adapter is called to do a follow-up generation where there is partial context overlap with the original aLoRA. For example, the user provides a query, and an aLoRA rewrites the query to be more self-contained and improve retrieval in a RAG system. Then, documents are retrieved and loaded into context, aLoRA checks if these documents are relevant to the question, and then the base model generates an answer. + + +To demonstrate the above behaviors when using caching, we're using [DynamicCache](https://huggingface.co/docs/transformers/en/kv_cache) from `transformers`. Take care to ensure that adapted cache values are not mixed with base cache values. In particular, an extra step is required for sharing the cache when there is partial context overlap (pattern 2). + +**Pattern 1: Base model followed by aLoRA** Here, the entire input and generation from the base model is input into the aLoRA adapter, along with the invocation sequence: +``` +from transformers import DynamicCache +... 
+cache = DynamicCache() +inputs_base = tokenizer(prompt_base, return_tensors="pt") +# Generate from base model and save cache +with model_alora.disable_adapter(): + output = model_alora.generate(inputs_base["input_ids"].to(device),attention_mask=inputs_base["attention_mask"].to(device),past_key_values = cache,return_dict_in_generate=True) +output_text_base = tokenizer.decode(output.sequences[0]) +cache = output.past_key_values + +# Generate with aLoRA adapter from cache +prompt_alora = output_text_base + INVOCATION_STRING +inputs_alora = tokenizer(prompt_alora, return_tensors="pt").to(device) +output = model_alora.generate(**inputs_alora, past_key_values=cache) +output_text_alora = tokenizer.decode(output[0]) + +# Note: cache is now tainted with adapter values and cannot be used in base model from here on! +``` + +**Pattern 2: aLoRA generation followed by base model (or another aLoRA) with partial context overlap** Here, we prefill the shared context using the base model, and then generate. + +``` +from transformers import DynamicCache +import copy +... 
+cache = DynamicCache() +inputs_shared = tokenizer(prompt_shared, return_tensors="pt").to(device) + +# Prefill from base model and save cache +with model_alora.disable_adapter(): + with torch.no_grad(): + model_alora(**inputs_shared, past_key_values=cache) +cache_copy = copy.deepcopy(cache) + +# Generate from aLoRA using prefilled cache +prompt_alora = prompt_shared + INVOCATION_STRING +inputs_alora = tokenizer(prompt_alora, return_tensors="pt").to(device) +output = model_alora.generate(**inputs_alora, past_key_values=cache) +output_text_alora = tokenizer.decode(output[0]) + +# Generate from base model using saved cache not tainted by aLoRA KV values +prompt_base = prompt_shared +inputs_base = tokenizer(prompt_base, return_tensors="pt").to(device) +with model_alora.disable_adapter(): + output = model_alora.generate(**inputs_base, past_key_values=cache_copy) +output_text_base = tokenizer.decode(output[0]) +``` + + +### Inference with different LoRA adapters in the same batch + +Normally, each inference batch has to use the same adapter(s) in PEFT. This can sometimes be annoying, because we may have batches that contain samples intended to be used with different LoRA adapters. For example, we could have a base model that works well in English and two more LoRA adapters, one for French and one for German. Usually, we would have to split our batches such that each batch only contains samples of one of the languages, we cannot combine different languages in the same batch. + +Thankfully, it is possible to mix different LoRA adapters in the same batch using the `adapter_name` argument. Below, we show an example of how this works in practice. First, let's load the base model, English, and the two adapters, French and German, like this: + +```python +from transformers import AutoTokenizer, AutoModelForCausalLM +from peft import PeftModel + +model_id = ... 
+tokenizer = AutoTokenizer.from_pretrained(model_id) + +model = AutoModelForCausalLM.from_pretrained(model_id) +# load the LoRA adapter for French +peft_model = PeftModel.from_pretrained(model, "path/to/adapter_fr", adapter_name="adapter_fr") +# next, load the LoRA adapter for German +peft_model.load_adapter("path/to/adapter_de", adapter_name="adapter_de") +``` + +Now, we want to generate text on a batch that contains all three languages: The first three samples are in English, the next three are in French, and the last three are in German. We can use the `adapter_names` argument to specify which adapter to use for each sample. Since our base model is used for English, we use the special string `"__base__"` for these samples. For the next three samples, we indicate the adapter name of the French LoRA fine-tune, in this case `"adapter_fr"`. For the last three samples, we indicate the adapter name of the German LoRA fine-tune, in this case `"adapter_de"`. This way, we can use the base model and the two adapters in a single batch. + +```python +inputs = tokenizer( + [ + "Hello, my dog is cute", + "Hello, my cat is awesome", + "Hello, my fish is great", + "Salut, mon chien est mignon", + "Salut, mon chat est génial", + "Salut, mon poisson est super", + "Hallo, mein Hund ist süß", + "Hallo, meine Katze ist toll", + "Hallo, mein Fisch ist großartig", + ], + return_tensors="pt", + padding=True, +) + +adapter_names = [ + "__base__", "__base__", "__base__", + "adapter_fr", "adapter_fr", "adapter_fr", + "adapter_de", "adapter_de", "adapter_de", +] +output = peft_model.generate(**inputs, adapter_names=adapter_names, max_new_tokens=20) +``` + +Note that the order does not matter here, i.e. the samples in the batch don't need to be grouped by adapter as in the example above. We just need to ensure that the `adapter_names` argument is aligned correctly with the samples. 
+ +Additionally, the same approach also works with the `modules_to_save` feature, which allows for saving and reusing specific neural network layers, such as custom heads for classification tasks, across different LoRA adapters. + +#### Caveats + +Using this feature has some drawbacks, namely: + +- It only works for inference, not for training. +- Disabling adapters using the `with model.disable_adapter()` context takes precedence over `adapter_names`. +- You cannot pass `adapter_names` when some adapter weights were merged with base weight using the `merge_adapter` method. Please unmerge all adapters first by calling `model.unmerge_adapter()`. +- For obvious reasons, this cannot be used after calling `merge_and_unload()`, since all the LoRA adapters will be merged into the base weights in this case. +- This feature does not currently work with DoRA, so set `use_dora=False` in your `LoraConfig` if you want to use it. +- The `modules_to_save` feature is currently only supported for the layers of types `Linear`, `Embedding`, `Conv2d` and `Conv1d`. +- There is an expected overhead for inference with `adapter_names`, especially if the amount of different adapters in the batch is high. This is because the batch size is effectively reduced to the number of samples per adapter. If runtime performance is your top priority, try the following: + - Increase the batch size. + - Try to avoid having a large number of different adapters in the same batch, prefer homogeneous batches. This can be achieved by buffering samples with the same adapter and only perform inference with a small handful of different adapters. + - Take a look at alternative implementations such as [LoRAX](https://github.com/predibase/lorax), [punica](https://github.com/punica-ai/punica), or [S-LoRA](https://github.com/S-LoRA/S-LoRA), which are specialized to work with a large number of different adapters. 
+ + +### Composing and Reusing LoRA Adapters +#### Arrow +[Arrow](https://huggingface.co/papers/2405.11157) is a modular routing algorithm designed to combine multiple pre-trained task-specific LoRA adapters to solve a given task. Rather than merging all adapters naively, Arrow introduces a **gradient-free, token-wise mixture-of-experts (MoE) routing mechanism**. At inference time, it first computes a _prototype_ for each LoRA by extracting the top right singular vector from its SVD decomposition. Each token representation is then compared to these prototypes via cosine similarity to obtain routing coefficients. Tokens are assigned to the top-k most relevant LoRA adapters, with the coefficients normalized through softmax, and their outputs linearly combined. This allows effective reuse of existing LoRA modules for new tasks and leads to stronger zero-shot generalization. + +In PEFT, Arrow is enabled through [`ArrowConfig`] and `create_arrow_model`. You can also configure parameters such as `top_k` (the number of LoRA adapters combined per token), `router_temperature` (the softmax temperature applied to the routing coefficients), and `rng_seed` (for reproducibility). + +```py +from peft import create_arrow_model, ArrowConfig +from transformers import AutoModelForCausalLM + +# Loading the model +base_model = AutoModelForCausalLM.from_pretrained("microsoft/Phi-3-mini-4k-instruct") + +# Creating the Arrow config +arrow_config = ArrowConfig( + top_k=3, + router_temperature=1.0, + rng_seed=42, +) + +# The LoRA adapters below were trained on a clustered FLAN dataset. +# Task clustering was performed using the Model-Based Clustering (MBC) method, +# as described in the Arrow paper. +# While one could train a separate LoRA for each task and let Arrow route tokens among them, +# training LoRAs on clusters of tasks instead provides an indirect optimization for +# transfer across the multi-task dataset. 
+task_specific_adapter_paths = [ + f"TahaBa/phi3-mini-clustered-flan/ts_expert_{i}" for i in range(10) + ] + +# Creating the Arrow model +model = create_arrow_model( + base_model=base_model, + task_specific_adapter_paths=task_specific_adapter_paths, + arrow_config=arrow_config, + ) + +# Now the forward path could be called on this model, like a normal PeftModel. +``` + +Furthermore, you can add or remove adapters after calling ```create_arrow_model```—for example, to fine-tune a new adapter or discard an unnecessary one. Once the adapters are in place, you can activate the ```"arrow_router"``` for inference to use Arrow. Note that if you add a new LoRA adapter after ```create_arrow_model``` and want to fine-tune it, you must explicitly set the new adapter as active, since ```"arrow_router"``` is activated by default in ```create_arrow_model```. + +```py +from trl import SFTTrainer, SFTConfig + +# Adding a new adapter and activating it +model.add_adapter(adapter_name='new_adapter') +model.set_adapter('new_adapter') + +# Now the model could be trained along the `new_adapter`. +trainer = SFTTrainer( + model=model, + args=SFTConfig(...), + ... + ) + +# Once the training is done, you can activate `arrow_router` and use it in inference +model.set_adapter('arrow_router') # Model is ready to be used at inference time now +``` + +#### GenKnowSub +[GenKnowSub](https://aclanthology.org/2025.acl-short.54/) augments Arrow by purifying task-specific LoRA adapters before routing. The key idea is to subtract general knowledge encoded in LoRA space—based on the [forgetting-via-negation principle](https://huggingface.co/papers/2212.04089)—so that task adapters become more isolated and focused on task-relevant signals. Concretely, GenKnowSub estimates a low-dimensional “general” subspace from a set of general (non task-specific) LoRA adapters and removes this component from each task adapter’s LoRA update prior to Arrow’s token-wise routing. 
This typically improves compositionality and reduces interference when combining many task adapters. + +In PEFT, enable GenKnowSub by setting ```use_gks=True``` in ArrowConfig, and providing ```general_adapter_paths``` in ```create_arrow_model```: + +```py +from peft import create_arrow_model, ArrowConfig +from transformers import AutoModelForCausalLM + +# Loading the model +base_model = AutoModelForCausalLM.from_pretrained("microsoft/Phi-3-mini-4k-instruct") + +# Creating the Arrow config +arrow_config = ArrowConfig( + top_k=3, + router_temperature=1.0, + use_gks=True, + rng_seed=42, +) + +# Paths to the task-specific adapters, trained on the clustered FLAN dataset (as explained before). +task_specific_adapter_paths = [ + f"TahaBa/phi3-mini-clustered-flan/ts_expert_{i}" for i in range(10) + ] +# These general adapters were trained on English, German, and French Wikipedia datasets +# with a causal language modelling objective; each example is a pair (507-token sentence, 5-token completion), and the loss is computed on the completion +general_adapter_paths = [ + "TahaBa/phi3-mini-general-adapters/cluster0_batch16_prop1.0_langen/checkpoint-17", + "TahaBa/phi3-mini-general-adapters/cluster0_batch16_prop1.0_langfr/checkpoint-35", + "TahaBa/phi3-mini-general-adapters/cluster0_batch16_prop1.0_langger/checkpoint-17" + ] + +# Creating the Arrow model +model = create_arrow_model( + base_model=base_model, + task_specific_adapter_paths=task_specific_adapter_paths, + general_adapter_paths=general_adapter_paths, + arrow_config=arrow_config, + ) + +# Now the forward path could be called on this model, like a normal PeftModel. +``` +To encode general knowledge, GenKnowSub subtracts the average of the provided general adapters from each task-specific adapter once, before routing begins. Furthermore, the ability to add or remove adapters after calling ```create_arrow_model``` (as described in the Arrow section) is still supported in this case. 
+ +> [!TIP] +> **Things to keep in mind when using Arrow + GenKnowSub:** +> +> - All LoRA adapters (task-specific and general) must share the same ```rank``` and ```target_modules```. +> +> - Any inconsistency in these settings will raise an error in ```create_arrow_model```. +> +> - Having different scaling factors (```lora_alpha```) across task adapters is supported — Arrow handles them automatically. +> +> - Merging the ```"arrow_router"``` is not supported, due to its dynamic routing behavior. +> +> - In create_arrow_model, task adapters are loaded as ```task_i``` and general adapters as ```gks_j``` (where ```i``` and ```j``` are indices). The function ensures consistency of ```target_modules```, ```rank```, and whether adapters are applied to ```Linear``` or ```Linear4bit``` layers. It then adds the ```"arrow_router"``` module and activates it. Any customization of this process requires overriding ```create_arrow_model```. +> +> - This implementation is compatible with 4-bit quantization (via bitsandbytes): +> +> ```py +> from transformers import AutoModelForCausalLM, BitsAndBytesConfig +> import torch +> +> # Quantisation config +> bnb_config = BitsAndBytesConfig( +> load_in_4bit=True, +> bnb_4bit_quant_type="nf4", +> bnb_4bit_compute_dtype=torch.bfloat16, +> bnb_4bit_use_double_quant=False, +> ) +> +> # Loading the model +> base_model = AutoModelForCausalLM.from_pretrained( +> "microsoft/Phi-3-mini-4k-instruct", +> dtype=torch.bfloat16, +> device_map="auto", +> quantization_config=bnb_config, +> ) +> +> # Now call create_arrow_model() as we explained before. 
+> ``` + # API ## LoraConfig From a552f8ea32df1c2313c09bc5f41a1a649bbae1ec Mon Sep 17 00:00:00 2001 From: nemo Date: Tue, 2 Jun 2026 17:29:30 +0200 Subject: [PATCH 05/33] Moving from conceptual guides --- docs/source/_toctree.yml | 26 +- docs/source/conceptual_guides/adapter.md | 30 - docs/source/developer_guides/lora.md | 1074 ----------------- docs/source/package_reference/adalora.md | 37 +- docs/source/package_reference/boft.md | 17 + docs/source/package_reference/c3a.md | 12 + docs/source/package_reference/loha.md | 19 +- docs/source/package_reference/lokr.md | 36 +- docs/source/package_reference/lora.md | 188 ++- .../package_reference/lora_variant_dora.md | 80 ++ .../package_reference/lora_variant_velora.md | 49 + docs/source/package_reference/oft.md | 20 +- docs/source/package_reference/poly.md | 4 + 13 files changed, 446 insertions(+), 1146 deletions(-) delete mode 100644 docs/source/developer_guides/lora.md create mode 100644 docs/source/package_reference/lora_variant_dora.md create mode 100644 docs/source/package_reference/lora_variant_velora.md diff --git a/docs/source/_toctree.yml b/docs/source/_toctree.yml index 3ed9a19355..e171b85aa4 100644 --- a/docs/source/_toctree.yml +++ b/docs/source/_toctree.yml @@ -28,8 +28,6 @@ title: Model merging - local: developer_guides/quantization title: Quantization - - local: developer_guides/lora - title: LoRA - local: developer_guides/custom_models title: Custom models - local: developer_guides/low_level_api @@ -52,17 +50,29 @@ - sections: - local: package_reference/layernorm_tuning title: LayerNorm Tuning + - local: package_reference/trainable_tokens + title: Trainable Tokens title: Layer Tuning - sections: - local: package_reference/p_tuning title: P-Tuning + - local: package_reference/cpt + title: CPT + - local: package_reference/prefix_tuning + title: Prefix tuning + - local: package_reference/cartridges + title: Cartridges + - local: package_reference/prompt_tuning + title: Prompt tuning title: Soft Prompting - 
sections: + - local: package_reference/adalora + title: AdaLoRA + - local: package_reference/adamss + title: AdaMSS - sections: - local: package_reference/lora - title: Overview - - local: package_reference/lora_initializations - title: Initializations + title: LoRA - sections: - local: package_reference/lora_variant_velora title: VeLoRA @@ -70,10 +80,6 @@ title: DoRA title: Variants title: LoRA - - local: package_reference/adalora - title: AdaLoRA - - local: package_reference/adamss - title: AdaMSS - local: package_reference/ia3 title: IA3 - local: package_reference/llama_adapter @@ -110,8 +116,6 @@ title: HiRA - local: package_reference/hra title: HRA - - local: package_reference/cpt - title: CPT - local: package_reference/randlora title: RandLora - local: package_reference/shira diff --git a/docs/source/conceptual_guides/adapter.md b/docs/source/conceptual_guides/adapter.md index 98352bf6ee..efe43ea073 100644 --- a/docs/source/conceptual_guides/adapter.md +++ b/docs/source/conceptual_guides/adapter.md @@ -41,40 +41,10 @@ For each step, X-LoRA requires the base model to be run twice: first, to get hid Ultimately, X-LoRA allows the model to reflect upon its knowledge because of the dual forward pass scheme, and dynamically reconfigure the architecture. -## Low-Rank Hadamard Product (LoHa) -Low-rank decomposition can impact performance because the weight updates are limited to the low-rank space, which can constrain a model's expressiveness. However, you don't necessarily want to use a larger rank because it increases the number of trainable parameters. To address this, [LoHa](https://huggingface.co/papers/2108.06098) (a method originally developed for computer vision) was applied to diffusion models where the ability to generate diverse images is an important consideration. LoHa should also work with general model types, but the embedding layers aren't currently implemented in PEFT. 
-LoHa uses the [Hadamard product](https://en.wikipedia.org/wiki/Hadamard_product_(matrices)) (element-wise product) instead of the matrix product. ∆W is represented by four smaller matrices instead of two - like in LoRA - and each pair of these low-rank matrices are combined with the Hadamard product. As a result, ∆W can have the same number of trainable parameters but a higher rank and expressivity. -## Low-Rank Kronecker Product (LoKr) -[LoKr](https://hf.co/papers/2309.14859) is very similar to LoRA and LoHa, and it is also mainly applied to diffusion models, though you could also use it with other model types. LoKr replaces the matrix product with the [Kronecker product](https://en.wikipedia.org/wiki/Kronecker_product) instead. The Kronecker product decomposition creates a block matrix which preserves the rank of the original weight matrix. Another benefit of the Kronecker product is that it can be vectorized by stacking the matrix columns. This can speed up the process because you're avoiding fully reconstructing ∆W. - -## Orthogonal Finetuning (OFT) - -
- -
-Controlling Text-to-Image Diffusion by Orthogonal Finetuning - -[OFT](https://hf.co/papers/2306.07280) is a method that primarily focuses on preserving a pretrained model's generative performance in the finetuned model. It tries to maintain the same cosine similarity (hyperspherical energy) between all pairwise neurons in a layer because this better captures the semantic information among neurons. This means OFT is more capable at preserving the subject and it is better for controllable generation (similar to [ControlNet](https://huggingface.co/docs/diffusers/using-diffusers/controlnet)). - -OFT preserves the hyperspherical energy by learning an orthogonal transformation for neurons to keep the cosine similarity between them unchanged. In practice, this means taking the matrix product of an orthogonal matrix with the pretrained weight matrix. However, to be parameter-efficient, the orthogonal matrix is represented as a block-diagonal matrix with rank `r` blocks. Whereas LoRA reduces the number of trainable parameters with low-rank structures, OFT reduces the number of trainable parameters with a sparse block-diagonal matrix structure. - -## Orthogonal Butterfly (BOFT) - -[BOFT](https://hf.co/papers/2311.06243) is an improved orthogonal finetuning method that focuses on preserving a pretrained model's generative capabilities while being significantly more parameter-efficient than standard OFT. Like OFT, BOFT maintains the same cosine similarity (hyperspherical energy) between all pairwise neurons in a layer by applying an orthogonal transformation to the pretrained weight matrix, ensuring the semantic relationships among neurons are preserved. - -Instead of using a block-diagonal orthogonal matrix, BOFT factorizes the orthogonal transformation into a product of **sparse butterfly matrices** (originally introduced in the [Cooley–Tukey FFT](https://en.wikipedia.org/wiki/Cooley%E2%80%93Tukey_FFT_algorithm)). 
Unlike OFT's block-diagonal rotations, which only mix inputs within each block, the butterfly structure guarantees that every input can influence every output, producing a **dense connectivity** with just `O(d log d)` parameters. This factorization preserves expressivity while drastically reducing the parameter count compared to OFT (at the expense of computation time). - -In practice, BOFT multiplies each pretrained weight matrix by a sequence of butterfly-structured orthogonal factors, enabling efficient and expressive neuron rotations. This makes BOFT well-suited for controllable generation and tasks where maintaining the pretrained model's subject representation is critical, while also scaling to larger models with lower memory and compute overhead. - -## Adaptive Low-Rank Adaptation (AdaLoRA) - -[AdaLoRA](https://hf.co/papers/2303.10512) manages the parameter budget introduced from LoRA by allocating more parameters - in other words, a higher rank `r` - for important weight matrices that are better adapted for a task and pruning less important ones. The rank is controlled by a method similar to singular value decomposition (SVD). The ∆W is parameterized with two orthogonal matrices and a diagonal matrix which contains singular values. This parametrization method avoids iteratively applying SVD which is computationally expensive. Based on this method, the rank of ∆W is adjusted according to an importance score. ∆W is divided into triplets and each triplet is scored according to its contribution to model performance. Triplets with low importance scores are pruned and triplets with high importance scores are kept for finetuning. - -Training with AdaLoRA has three phases: the init phase, the budgeting phase and the final phase. In the initial phase, no budgeting is applied, therefore the ranks are not touched. 
During the budgeting phase the process described above is applied and the rank is redistributed according to a budget, aiming to give more important adapters more rank and less important layers less. When reaching the final phase, budgeting has ended, the ranks are redistributed but we may continue training for a while with the redistributed ranks to further improve performance. ## Llama-Adapter diff --git a/docs/source/developer_guides/lora.md b/docs/source/developer_guides/lora.md deleted file mode 100644 index be424086f2..0000000000 --- a/docs/source/developer_guides/lora.md +++ /dev/null @@ -1,1074 +0,0 @@ - - -# LoRA - -LoRA is low-rank decomposition method to reduce the number of trainable parameters which speeds up finetuning large models and uses less memory. In PEFT, using LoRA is as easy as setting up a [`LoraConfig`] and wrapping it with [`get_peft_model`] to create a trainable [`PeftModel`]. - -This guide explores in more detail other options and features for using LoRA. - -## Initialization - -The initialization of LoRA weights is controlled by the parameter `init_lora_weights` in [`LoraConfig`]. By default, PEFT initializes LoRA weights with Kaiming-uniform for weight A and zeros for weight B resulting in an identity transform (same as the reference [implementation](https://github.com/microsoft/LoRA)). - -It is also possible to pass `init_lora_weights="gaussian"`. As the name suggests, this initializes weight A with a Gaussian distribution and zeros for weight B (this is how [Diffusers](https://huggingface.co/docs/diffusers/index) initializes LoRA weights). - -```py -from peft import LoraConfig - -config = LoraConfig(init_lora_weights="gaussian", ...) -``` - -There is also an option to set `init_lora_weights=False` which is useful for debugging and testing. This should be the only time you use this option. When choosing this option, the LoRA weights are initialized such that they do *not* result in an identity transform. 
- -```py -from peft import LoraConfig - -config = LoraConfig(init_lora_weights=False, ...) -``` - -### PiSSA -[PiSSA](https://huggingface.co/papers/2404.02948) initializes the LoRA adapter using the principal singular values and singular vectors. This straightforward modification allows PiSSA to converge more rapidly than LoRA and ultimately attain superior performance. Moreover, PiSSA reduces the quantization error compared to QLoRA, leading to further enhancements. - -Configure the initialization method to "pissa", which may take several minutes to execute SVD on the pre-trained model: -```python -from peft import LoraConfig -config = LoraConfig(init_lora_weights="pissa", ...) -``` -Alternatively, execute fast SVD, which takes only a few seconds. The number of iterations determines the trade-off between the error and computation time: -```python -lora_config = LoraConfig(init_lora_weights="pissa_niter_[number of iters]", ...) -``` -For detailed instruction on using PiSSA, please follow [these instructions](https://github.com/huggingface/peft/tree/main/examples/pissa_finetuning). - -### CorDA - -[CorDA](https://huggingface.co/papers/2406.05223) builds task-aware LoRA adapters from weight decomposition oriented by the context of downstream task to learn (instruction-previewed mode, IPM) or world knowledge to maintain (knowledge-preserved mode, KPM). -The KPM not only achieves better performance than LoRA on fine-tuning tasks, but also mitigates the catastrophic forgetting of pre-trained world knowledge. -When preserving pre-trained knowledge is not a concern, -the IPM is favored because it can further accelerate convergence and enhance the fine-tuning performance. - -You need to configure the initialization method to "corda", and specify the mode of IPM or KPM and the dataset to collect covariance matrices. - -```py -@torch.no_grad() -def run_model(): - # Assume `model` and `dataset` is in context... 
- model.eval() - for batch in dataset: - model(**batch) - - -corda_config = CordaConfig( - corda_method="kpm", -) -lora_config = LoraConfig( - init_lora_weights="corda", - corda_config=corda_config, -) -preprocess_corda(model, lora_config, run_model=run_model) -peft_model = get_peft_model(model, lora_config) -``` - -For detailed instruction on using CorDA, please follow [these instructions](https://github.com/huggingface/peft/tree/main/examples/corda_finetuning). - -### OLoRA -[OLoRA](https://huggingface.co/papers/2406.01775) utilizes QR decomposition to initialize the LoRA adapters. OLoRA translates the base weights of the model by a factor of their QR decompositions, i.e., it mutates the weights before performing any training on them. This approach significantly improves stability, accelerates convergence speed, and ultimately achieves superior performance. - -You just need to pass a single additional option to use OLoRA: -```python -from peft import LoraConfig -config = LoraConfig(init_lora_weights="olora", ...) -``` -For more advanced usage, please refer to our [documentation](https://github.com/huggingface/peft/tree/main/examples/olora_finetuning). - -### EVA -[EVA](https://huggingface.co/papers/2410.07170) performs SVD on the input activations of each layer and uses the right-singular vectors to initialize LoRA weights. It is therefore a data-driven initialization scheme. Furthermore EVA adaptively allocates ranks across layers based on their "explained variance ratio" - a metric derived from the SVD analysis. - -You can use EVA by setting `init_lora_weights="eva"` and defining [`EvaConfig`] in [`LoraConfig`]: -```python -from peft import LoraConfig, EvaConfig -peft_config = LoraConfig( - init_lora_weights = "eva", - eva_config = EvaConfig(rho = 2.0), - ... -) -``` -The parameter `rho` (≥ 1.0) determines how much redistribution is allowed. When `rho=1.0` and `r=16`, LoRA adapters are limited to exactly 16 ranks, preventing any redistribution from occurring. 
A recommended value for EVA with redistribution is 2.0, meaning the maximum rank allowed for a layer is 2r. - -It is recommended to perform EVA initialization on an accelerator(e.g. CUDA GPU, Intel XPU) as it is much faster. To optimize the amount of available memory for EVA, you can use the `low_cpu_mem_usage` flag in [`get_peft_model`]: -```python -peft_model = get_peft_model(model, peft_config, low_cpu_mem_usage=True) -``` -Then, call [`initialize_lora_eva_weights`] to initialize the EVA weights (in most cases the dataloader used for eva initialization can be the same as the one used for finetuning): -```python -initialize_lora_eva_weights(peft_model, dataloader) -``` -EVA works out of the box with bitsandbytes. Simply initialize the model with `quantization_config` and call [`initialize_lora_eva_weights`] as usual. - -> [!TIP] -> For further instructions on using EVA, please refer to our [documentation](https://github.com/huggingface/peft/tree/main/examples/eva_finetuning). - -### LoftQ - -#### Standard approach - -When quantizing the base model for QLoRA training, consider using the [LoftQ initialization](https://huggingface.co/papers/2310.08659), which has been shown to improve performance when training quantized models. The idea is that the LoRA weights are initialized such that the quantization error is minimized. To use LoftQ, follow [these instructions](https://github.com/huggingface/peft/tree/main/examples/loftq_finetuning). - -> [!TIP] -> Learn more about how PEFT works with quantization and how to use LoftQ in the [Quantization](quantization) guide. - -### Rank-stabilized LoRA - -Another way to initialize [`LoraConfig`] is with the [rank-stabilized LoRA (rsLoRA)](https://huggingface.co/papers/2312.03732) method. The LoRA architecture scales each adapter during every forward pass by a fixed scalar which is set at initialization and depends on the rank `r`. 
The scalar is given by `lora_alpha/r` in the original implementation, but rsLoRA uses `lora_alpha/math.sqrt(r)` which stabilizes the adapters and increases the performance potential from using a higher `r`. - -```py -from peft import LoraConfig - -config = LoraConfig(use_rslora=True, ...) -``` - -### LoRA-GA - -[LoRA-GA](../package_reference/lora#lora-ga) (Low-Rank Adaptation with Gradient Approximation) initializes the adapter -weights by performing SVD on estimated gradients, so that the weights are aligning closer to full-finetuning for faster -convergence. - -This method requires an initialization function to estimate the gradients -before beginning the actual training: - -```python -from peft.tuners.lora import preprocess_loraga - -def train_step(): - """Run forward and backward passes for gradient estimation.""" - dataloader_iter = iter(grad_dataloader) - for _ in range(N): - batch = next(dataloader_iter) - batch = {k: v.to(device) for k, v in batch.items()} - outputs = model(**batch) - loss = outputs.loss - loss.backward() - -preprocess_loraga(model, lora_config, train_step) -``` - -### KappaTuneSelector - -KappaTune implements the condition-number-based target selection strategy from the [KappaTune paper](https://arxiv.org/abs/2506.16289). It scans every `nn.Linear` module and, for models where MoE expert weights are stored as fused 3D `nn.Parameter` tensors (e.g. Llama-4, Qwen3-MoE), also those parameters, computes the matrix condition number κ = σ_max / σ_min for each, and selects the most isotropic layers (lowest κ). These isotropic layers serve as ideal candidates for fine-tuning, since their high-entropy nature allows them to absorb new information more readily, leaving the specialized, anisotropic layers intact to mitigate catastrophic forgetting during continual learning. 
- -Use `find_kappa_target_modules` as a one-liner to get the optimal `target_modules` for `LoraConfig`: - -```python -from peft import LoraConfig, get_peft_model -from peft.helpers import find_kappa_target_modules - -model = AutoModelForCausalLM.from_pretrained("mistralai/Mixtral-8x7B-Instruct-v0.1") - -targets = find_kappa_target_modules(model, top_p=0.2) -config = LoraConfig( - target_modules=targets["target_modules"], - target_parameters=targets["target_parameters"] if stable_modules_dic["target_parameters"] else None, - r=64, - lora_alpha=32, - task_type="CAUSAL_LM", -) -peft_model = get_peft_model(model, config) -``` - -See a complete example [here](https://github.com/huggingface/peft/blob/main/examples/KappaTune/experiments_kappatune_peft.py). - -## Variants - -PEFT implements LoRA variants that improve upon the original LoRA. - -### Weight-Decomposed Low-Rank Adaptation (DoRA) - -This technique decomposes the updates of the weights into two parts, magnitude and direction. Direction is handled by normal LoRA, whereas the magnitude is handled by a separate learnable parameter. This can improve the performance of LoRA, especially at low ranks. For more information on DoRA, see https://huggingface.co/papers/2402.09353. - -```py -from peft import LoraConfig - -config = LoraConfig(use_dora=True, ...) -``` - -If parts of the model or the DoRA adapter are offloaded to CPU you can get a significant speedup at the cost of some temporary (ephemeral) VRAM overhead by using `ephemeral_gpu_offload=True` in `config.runtime_config`. - -```py -from peft import LoraConfig, LoraRuntimeConfig - -config = LoraConfig(use_dora=True, runtime_config=LoraRuntimeConfig(ephemeral_gpu_offload=True), ...) -``` - -A `PeftModel` with a DoRA adapter can also be loaded with `ephemeral_gpu_offload=True` flag using the `from_pretrained` method as well as the `load_adapter` method. 
- -```py -from peft import PeftModel - -model = PeftModel.from_pretrained(base_model, peft_model_id, ephemeral_gpu_offload=True) -``` - -#### Optimization - -DoRA is optimized (computes faster and takes less memory) for models in the evaluation mode, or when dropout is set to 0. We reuse the -base result at those times to get the speedup. -Running [dora finetuning](https://github.com/huggingface/peft/blob/main/examples/dora_finetuning/dora_finetuning.py) -with `CUDA_VISIBLE_DEVICES=0 ZE_AFFINITY_MASK=0 time python examples/dora_finetuning/dora_finetuning.py --quantize --lora_dropout 0 --batch_size 16 --eval_step 2 --use_dora` on a 4090 with gradient accumulation set to 2 and max step to 20 resulted with the following observations: - -| | Without Optimization | With Optimization | -| :--: | :--: | :--: | -| train runtime (sec) | 359.7298 | **279.2676** | -| train samples per second | 1.779 | **2.292** | -| train steps per second | 0.056 | **0.072** | - -Moreover, it is possible to further increase runtime performance of DoRA by using the [`DoraCaching`] helper context. This requires the model to be in `eval` mode: - -```py -from peft.helpers import DoraCaching - -model.eval() -with DoraCaching(): - output = model(inputs) -``` - -For [`meta-llama/Llama-3.1-8B`](https://huggingface.co/meta-llama/Llama-3.1-8B), the [DoRA caching benchmark script](https://github.com/huggingface/peft/blob/main/examples/dora_finetuning/dora-caching.py) shows that, compared to LoRA: - -- DoRA without caching requires 139% more time -- DoRA without caching requires 4% more memory -- DoRA with caching requires 17% more time -- DoRA with caching requires 41% more memory - -Caching can thus make inference with DoRA significantly faster but it also requires signficantly more memory. Ideally, if the use case allows it, just merge the DoRA adapter to avoid both memory and runtime overhead. - -#### Caveats - -- DoRA only supports embedding, linear, and Conv2d layers at the moment. 
-- DoRA introduces a bigger overhead than pure LoRA, so it is recommended to merge weights for inference, see [`LoraModel.merge_and_unload`]. -- DoRA should work with weights quantized with bitsandbytes ("QDoRA"). However, issues have been reported when using QDoRA with DeepSpeed Zero2. - -### VeLoRA - -[VeLoRA](https://huggingface.co/papers/2405.17991) is a LoRA variant that reduces training memory by compressing the activations saved for the LoRA in the forward pass and then reconstructing them in the backwards pass to implement the update rules. In PEFT, VeLoRA is configured as a LoRA variant through the `velora_config` argument on [`LoraConfig`]. - -```py -from peft import LoraConfig, VeloraConfig - -config = LoraConfig( - target_modules=["q_proj", "v_proj"], - velora_config=VeloraConfig( - num_groups=64, - scale=0.2, - init_type="batch_average", - ), -) -``` - -VeLoRA is applied to every LoRA layer selected by `target_modules`. `num_groups` controls how the input activation depth is split before compression. If the activation depth is not evenly divisible by `num_groups`, VeLoRA pads the grouped representation internally and removes the padding after reconstruction. `scale` rescales the reconstructed activations during the backward pass, and `init_type` chooses how the projection is initialized. - -Use `batch_average_once` to initialize the projection from the first training batch, `batch_average` to update it from every training forward pass, or `random` to initialize it immediately from a random normalized vector. - -Below are some results with the [MetaMathQA benchmark](https://github.com/huggingface/peft/tree/main/method_comparison/MetaMathQA). - -| Variant | Training Loss | Max Memory (GiB) | Tokens/sec | -|---|---:|---:|---:| -| LoRA | 0.5427 | 27.69 | 2366.2 | -| LoRA + GC | 0.5426 | 13.17 | 1671.8 | -| LoRA+VeLoRA | 0.5427 | 19.94 | 2057.6 | - -#### Caveats - -- VeLoRA is currently supported on standard LoRA linear layers only. 
- -## Training - -This section shows how to handle more complex training scenarios instead of only applying a low-rank adapter -to the model and feed data. - -### QLoRA-style training - -The default LoRA settings in PEFT add trainable weights to the query and value layers of each attention block. But [QLoRA](https://hf.co/papers/2305.14314), which adds trainable weights to all the linear layers of a transformer model, can provide performance equal to a fully finetuned model. To apply LoRA to all the linear layers, like in QLoRA, set `target_modules="all-linear"` (easier than specifying individual modules by name which can vary depending on the architecture). - -```py -config = LoraConfig(target_modules="all-linear", ...) -``` - -For more information about how to apply quantization to PEFT adapters, refer to the [quantization guide](quantization). - -### Memory efficient Layer Replication with LoRA - -An approach used to improve the performance of models is to expand a model by duplicating layers in the model to build a larger model from a pretrained model of a given size. For example increasing a 7B model to a 10B model as described in the [SOLAR](https://huggingface.co/papers/2312.15166) paper. PEFT LoRA supports this kind of expansion in a memory efficient manner that supports further fine-tuning using LoRA adapters attached to the layers post replication of the layers. The replicated layers do not take additional memory as they share the underlying weights so the only additional memory required is the memory for the adapter weights. To use this feature you would create a config with the `layer_replication` argument. - -```py -config = LoraConfig(layer_replication=[[0,4], [2,5]], ...) -``` - -Assuming the original model had 5 layers `[0, 1, 2 ,3, 4]`, this would create a model with 7 layers arranged as `[0, 1, 2, 3, 2, 3, 4]`. 
This follows the [mergekit](https://github.com/arcee-ai/mergekit) pass through merge convention where sequences of layers specified as start inclusive and end exclusive tuples are stacked to build the final model. Each layer in the final model gets its own distinct set of LoRA adapters. - -[Fewshot-Metamath-OrcaVicuna-Mistral-10B](https://huggingface.co/abacusai/Fewshot-Metamath-OrcaVicuna-Mistral-10B) is an example of a model trained using this method on Mistral-7B expanded to 10B. The -[adapter_config.json](https://huggingface.co/abacusai/Fewshot-Metamath-OrcaVicuna-Mistral-10B/blob/main/adapter_config.json) shows a sample LoRA adapter config applying this method for fine-tuning. - -### Fine grained control over ranks and alpha (scaling) - -By default, all layers targeted with LoRA will have the same rank `r` and the same `lora_alpha` (which determines the LoRA scaling), depending on what was specified in the [`LoraConfig`]. In some cases, however, you may want to indicate different values for different layers. This is possible by passing the `rank_pattern` and `alpha_pattern` arguments to [`LoraConfig`]. These arguments should be dictionaries with the key being the layer name and the value being the rank/alpha value. The keys can be [regular expressions](https://docs.python.org/3/library/re.html) (regex). All LoRA layers that are not explicitly mentioned in `rank_pattern` and `alpha_pattern` will take the default `r` and `lora_alpha` values. - -To give an example, let's assume that we have a model with the following structure: - -```python ->>> print(model) -Outer( - (foo): Linear(...) - (module): Middle( - (foo): Linear(...) - (foobar): Linear(...) - (module): Inner( - (foo): Linear(...) - (barfoo): Linear(...) - ) - ) -) -``` - -- `rank_pattern={"foo": 42}` will match all 3 `foo` layers. Neither `foobar` nor `barfoo` are matched. -- `rank_pattern={"^foo": 42}` will only match the `foo` layer of the model, but neither `module.foo` nor `module.module.foo`. 
This is because the `^` means "start of string" when using regular expressions, and only `foo` starts with `"foo"`, the other layer names have prefixes. -- `rank_pattern={"^module.foo": 42}` matches only `module.foo`, but not `module.module.foo`, for the same reason. -- `rank_pattern={"module.foo": 42}` matches both `module.foo` and `module.module.foo`, but not `foo`. -- `rank_pattern={"^foo": 42, "^module.module.foo": 55}` matches `foo` and `module.module.foo`, respectively, but not `module.foo`. -- There is no need to indicate `$` to mark the end of the match, as this is added automatically by PEFT. - -The same logic applies to `alpha_pattern`. If you're in doubt, don't try to get fancy with regular expressions -- just pass the full name for each module with a different rank/alpha, preceded by the `^` prefix, and you should be good. - -### Targeting `nn.Parameter` directly - -Generally, you should use `target_modules` to target the module (e.g. `nn.Linear`). However, in some circumstances, this is not possible. E.g., in many mixture of expert (MoE) layers in HF Transformers, instead of using `nn.Linear`, an `nn.Parameter` is used. PEFT normally overwrites the `forward` method for LoRA, but for `nn.Parameter`, there is none. Therefore, to apply LoRA to that parameter, it needs to be targeted with `target_parameters`. As an example, for [Llama4](https://huggingface.co/collections/meta-llama/llama-4-67f0c30d9fe03840bc9d0164), you can pass: `target_parameters=['feed_forward.experts.gate_up_proj', 'feed_forward.experts.down_proj]`. - -Note that when targeting expert parameters, PEFT can add a substantial runtime overhead. The reason is that PEFT always materializes the LoRA contribution for _each expert_ even if only a small amount of experts is required. During training, this is less relevant since, over the course of the sequence, typically a large fraction of experts is activated at least once. 
However, during inference, normally a KV cache is used and we thus need to only compute the last token, which means that only a small amount of experts is activated. Therefore, using LoRA on MoE layers can result in a substantial slowdown at inference time. The recommendation is thus to merge the weights (`model.merge_adapter()` or `model = model.merge_and_unload()`). This removes the PEFT overhead. - -A more detailed investigation of this issue can be found on this [pull request on MoE optimization](https://github.com/huggingface/peft/pull/3139). - -#### Caveats - -- At the moment, this argument allows to target 2-dim or 3-dim `nn.Parameter`s. It is assumed that in the case of a 3-dim parameter, the 0th dimension is the expert dimension. -- It is currently not possible to add multiple LoRA adapters (via `model.add_adapter` or `model.load_adapter`) that use `target_parameters` at the same time. - -#### MoE expert parameters and vLLM - -Some MoE models in Transformers store expert weights as `nn.Parameter` tensors (often 3D), not `nn.Linear` modules. -To apply LoRA to those experts, use `target_parameters` and set a per-layer rank with `rank_pattern`: - -```python -num_experts = getattr(model.config, "num_local_experts", None) or model.config.num_experts -effective_r = max(1, r // num_experts) -config = LoraConfig( - r=r, - lora_alpha=32, - target_modules=["q_proj", "v_proj"], - target_parameters=[ - # Mixtral / Qwen3-MoE / GPT-OSS - "mlp.experts.gate_up_proj", - "mlp.experts.down_proj", - # Llama4 - # "feed_forward.experts.gate_up_proj", - # "feed_forward.experts.down_proj", - ], - rank_pattern={ - "experts.gate_up_proj": effective_r, - "experts.down_proj": effective_r, - }, -) -``` - -This keeps the total LoRA parameter budget similar to dense layers (see -[LoRA Without Regret](https://thinkingmachines.ai/blog/lora/) by Schulman et. al.). -Non-expert modules use the default rank `r`. 
- -Accelerated inference with the fine-tuned model is possible with, for example, [vLLM](https://vllm.ai/) which supports fused MoE expert layers since v0.11.2. - -### Efficiently train tokens alongside LoRA - -PEFT LoRA adapters support adding new tokens with the `trainable_token_indices` parameter. This allows tuning of other tokens alongside fine-tuning specific layers. Only the specified tokens are trained and all other tokens are untouched. It saves memory and doesn't throw away learned context from existing token embeddings unlike training the whole embedding matrix. Under the hood this method uses the layer of [`TrainableTokensModel`]. - -```py -# for layer 'embed_tokens' -config = LoraConfig(trainable_token_indices=[idx_1, idx_2, ...], ...) - -# specific embedding layer -config = LoraConfig(trainable_token_indices={'emb_tokens': [idx_1, idx_2, ...]}, ...) -``` - -In the snippet below we show how to add new tokens to the model and how to train it alongside the other layers in the model. 
- -```py -from transformers import AutoTokenizer, AutoModelForCausalLM -from peft import get_peft_model, LoraConfig - -base_model = AutoModelForCausalLM.from_pretrained("mistralai/Mistral-7B-v0.1") -tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-v0.1") - -# we define our new tokens and add them to the tokenizer as special tokens -special_tokens = ['<|start_think|>', '<|stop_think|>'] -tokenizer.add_special_tokens({'additional_special_tokens': special_tokens}) - -# make room for new tokens in the embedding matrix if it isn't big enough already -base_model.resize_token_embeddings(max(len(tokenizer), base_model.model.embed_tokens.num_embeddings)) - -# typical LoRA config with `trainable_token_indices` targeting embedding layer `embed_tokens` -# and specifically our new tokens we just added -lora_config = LoraConfig( - target_modules='all-linear', - trainable_token_indices={'embed_tokens': tokenizer.convert_tokens_to_ids(special_tokens)}, -) -peft_model = get_peft_model(base_model, lora_config) - -# proceed to train the model like normal -[...] -``` - -The token weights are saved as a part of the adapter state dict alongside the LoRA weights. Full fine-tuning and saving the embedding matrix would have stored a much bigger file. - -To give a bit of an indication how much VRAM can be saved, a rudimentary comparison of the above example was made between training the embedding matrix fully (`modules_to_save=["embed_tokens"]`), using a LoRA for the embedding matrix (`target_modules=[..., "embed_tokens"]`, rank 32) and trainable tokens (`trainable_token_indices=[...]`, 6 tokens): - -| | Trainable Tokens | LoRA | Full Fine-tuning | -| --------: | :--------------: | :--------: | :--------------: | -| VRAM | 15,562 MB | 15,581MB | ~16,500MB | -| Influence | 6 tokens | all tokens | all tokens | - -### Weight tying - -Many causal LMs use **weight tying**, where two or more weights share the same underlying parameters. 
In the most common case, the input embedding weights (`embed_tokens`) and output projection weights (`lm_head`) share the same tensor. This is because it reduces parameters and usually preserves model quality. - -It's not always obvious how PEFT deals with these tied weights when they are targeted for fine-tuning. For LoRA, the `ensure_weight_tying` on the [`LoraConfig`] controls whether PEFT should explicitly keep adapter-side updates tied for those layers. In practice, this can affect `modules_to_save`, `target_modules`, and `trainable_token_indices`. Note that this logic partially relies on convention when it comes to naming the layers (`"embed_tokens"`, `"lm_head"`) and proper working cannot be guaranteed if those conventions are not used. - -The tables below summarize expected behavior. - -#### `modules_to_save` - -| Base model weights tied | `ensure_weight_tying` | `LoraConfig` shape | Behavior | -|-------------------------|-----------------------|-----------------------------------------------------|--------------------------------------------------------------| -| No | `False` | `modules_to_save=["embed_tokens"]` or `["lm_head"]` | Add `ModulesToSaveWrapper` on selected layer only | -| No | `True` | `modules_to_save=["embed_tokens"]` or `["lm_head"]` | Warn, then add `ModulesToSaveWrapper` on selected layer only | -| Yes | `False` | `modules_to_save=["embed_tokens"]` or `["lm_head"]` | Treat as separate | -| Yes | `True` | `modules_to_save=["embed_tokens"]` or `["lm_head"]` | Wrap tied layers and keep wrappers tied | -| No | `False` | `modules_to_save=["embed_tokens", "lm_head"]` | Treat as separate | -| No | `True` | `modules_to_save=["embed_tokens", "lm_head"]` | Warn, then treat as separate | -| Yes | `False` | `modules_to_save=["embed_tokens", "lm_head"]` | Warn, then treat as separate | -| Yes | `True` | `modules_to_save=["embed_tokens", "lm_head"]` | Keep `ModulesToSaveWrapper`s tied | - -#### `target_modules` - -| Base model weights tied | 
`ensure_weight_tying` | `LoraConfig` shape | Behavior | -|-------------------------|-----------------------|----------------------------------------------------|--------------------------------------------| -| No | `False` | `target_modules=["embed_tokens"]` or `["lm_head"]` | Add LoRA on selected layer only | -| No | `True` | `target_modules=["embed_tokens"]` or `["lm_head"]` | Warn, then add LoRA on selected layer only | -| Yes | `False` | `target_modules=["embed_tokens"]` or `["lm_head"]` | Treat as separate | -| Yes | `True` | `target_modules=["embed_tokens"]` or `["lm_head"]` | Keep LoRA adapters tied | -| No | `False` | `target_modules=["embed_tokens", "lm_head"]` | Treat as separate | -| No | `True` | `target_modules=["embed_tokens", "lm_head"]` | Warn, then treat as separate | -| Yes | `False` | `target_modules=["embed_tokens", "lm_head"]` | Warn, then treat as separate | -| Yes | `True` | `target_modules=["embed_tokens", "lm_head"]` | Keep LoRA adapters tied | - -#### `trainable_token_indices` - -For trainable tokens, we have the additional complication that even if the LM head and embeddings are tied, as a user I may want to fine-tune *different* tokens on them. In the example table below, we thus differentiate between fine-tuning the same and fine-tuning different tokens. 
- -| Base model weights tied | `ensure_weight_tying` | `LoraConfig` shape | Behavior | -|-------------------------|-----------------------|-----------------------------------------------------------------------|------------------------------------------------| -| No | `False` | `trainable_token_indices=[1, 2, 3]` | Trainable tokens on embeddings only | -| No | `True` | `trainable_token_indices=[1, 2, 3]` | Warn, then trainable tokens on embeddings only | -| Yes | `False` | `trainable_token_indices=[1, 2, 3]` | Tied trainable tokens | -| Yes | `True` | `trainable_token_indices=[1, 2, 3]` | Tied trainable tokens | -| No | `False` | `trainable_token_indices={"lm_head": [1, 2], "embed_tokens": [1, 2]}` | Treat as separate | -| No | `True` | `trainable_token_indices={"lm_head": [1, 2], "embed_tokens": [1, 2]}` | Warn, then treat as separate | -| Yes | `False` | `trainable_token_indices={"lm_head": [1, 2], "embed_tokens": [1, 2]}` | Tied trainable tokens | -| Yes | `True` | `trainable_token_indices={"lm_head": [1, 2], "embed_tokens": [1, 2]}` | Tied trainable tokens | -| No | `False` | `trainable_token_indices={"lm_head": [1, 2], "embed_tokens": [3, 4]}` | Treat as separate | -| No | `True` | `trainable_token_indices={"lm_head": [1, 2], "embed_tokens": [3, 4]}` | Warn, then treat as separate | -| Yes | `False` | `trainable_token_indices={"lm_head": [1, 2], "embed_tokens": [3, 4]}` | Treat as separate | -| Yes | `True` | `trainable_token_indices={"lm_head": [1, 2], "embed_tokens": [3, 4]}` | Error | - -For users, this means: - -- In general, if you want to fine-tune weights that are tied and want to keep them tied, pass `ensure_weight_tying=True`. -- If your base model's weights are untied, `ensure_weight_tying=True` cannot force tying and only warns. -- For `trainable_token_indices`, tied layers must use the same token indices when `ensure_weight_tying=True`. - -## Optimizers - -LoRA training can optionally include special purpose optimizers. 
Currently PEFT supports LoRA-FA and LoRA+. - -### LoRA-FA Optimizer - -LoRA training can be more effective and efficient using LoRA-FA, as described in [LoRA-FA](https://huggingface.co/papers/2308.03303). LoRA-FA reduces activation memory consumption by fixing the matrix A and only tuning the matrix B. During training, the gradient of B is optimized to approximate the full parameter fine-tuning gradient. Moreover, the memory consumption of LoRA-FA is not sensitive to the rank (since it erases the activation of $A$), therefore it can improve performance by enlarging lora rank without increasing memory consumption. - -```py -from peft import LoraConfig, get_peft_model -from peft.optimizers import create_lorafa_optimizer -from transformers import Trainer, get_cosine_schedule_with_warmup - -base_model = AutoModelForCausalLM.from_pretrained("meta-llama/Meta-Llama-3-8B-Instruct") - -config = LoraConfig(...) -model = get_peft_model(base_model, config) - -optimizer = create_lorafa_optimizer( - model=model, - r=128, - lora_alpha=32, - lr=7e-5, -) - -scheduler = get_cosine_schedule_with_warmup( - optimizer, - num_warmup_steps=100, - num_training_steps=1000, -) - -trainer = Trainer( - ..., - optimizers=(optimizer, scheduler), -) -``` - -### LoRA+ optimized LoRA - -LoRA training can be optimized using [LoRA+](https://huggingface.co/papers/2402.12354), which uses different learning rates for the adapter matrices A and B, shown to increase finetuning speed by up to 2x and performance by 1-2%. - -```py -from peft import LoraConfig, get_peft_model -from peft.optimizers import create_loraplus_optimizer -from transformers import Trainer -import bitsandbytes as bnb - -base_model = ... -config = LoraConfig(...) -model = get_peft_model(base_model, config) - -optimizer = create_loraplus_optimizer( - model=model, - optimizer_cls=bnb.optim.Adam8bit, - lr=5e-5, - loraplus_lr_ratio=16, -) -scheduler = None - -... 
-trainer = Trainer( - ..., - optimizers=(optimizer, scheduler), -) -``` - - -## Post-Training - -This section shows potential post-processing methods for trained adapters. - - -### Merge LoRA weights into the base model - -While LoRA is significantly smaller and faster to train, you may encounter latency issues during inference due to separately loading the base model and the LoRA adapter. To eliminate latency, use the [`~LoraModel.merge_and_unload`] function to merge the adapter weights with the base model. This allows you to use the newly merged model as a standalone model. The [`~LoraModel.merge_and_unload`] function doesn't keep the adapter weights in memory. - -Below is a diagram that explains the intuition of LoRA adapter merging: - -
- -
- -We show in the snippets below how to run that using PEFT. - -```py -from transformers import AutoModelForCausalLM -from peft import PeftModel - -base_model = AutoModelForCausalLM.from_pretrained("mistralai/Mistral-7B-v0.1") -peft_model_id = "alignment-handbook/zephyr-7b-sft-lora" -model = PeftModel.from_pretrained(base_model, peft_model_id) -model = model.merge_and_unload() -``` - -It is important to assign the returned model to a variable and use it, [`~LoraModel.merge_and_unload`] is not an in-place operation. If you need to keep a copy of the weights so you can unmerge the adapter later or delete and load different ones, you should use the [`~LoraModel.merge_adapter`] function instead. Now you have the option to use [`~LoraModel.unmerge_adapter`] to return the base model. - -```py -from transformers import AutoModelForCausalLM -from peft import PeftModel - -base_model = AutoModelForCausalLM.from_pretrained("mistralai/Mistral-7B-v0.1") -peft_model_id = "alignment-handbook/zephyr-7b-sft-lora" -model = PeftModel.from_pretrained(base_model, peft_model_id) -model.merge_adapter() - -# unmerge the LoRA layers from the base model -model.unmerge_adapter() -``` - -The [`~LoraModel.add_weighted_adapter`] function is useful for merging multiple LoRAs into a new adapter based on a user provided weighting scheme in the `weights` parameter. Below is an end-to-end example. 
- -First load the base model: - -```python -from transformers import AutoModelForCausalLM -from peft import PeftModel -import torch - -base_model = AutoModelForCausalLM.from_pretrained( - "mistralai/Mistral-7B-v0.1", dtype=torch.float16, device_map="auto" -) -``` - -Then we load the first adapter: - -```python -peft_model_id = "alignment-handbook/zephyr-7b-sft-lora" -model = PeftModel.from_pretrained(base_model, peft_model_id, adapter_name="sft") -``` - -Then load a different adapter and merge it with the first one: - -```python -weighted_adapter_name = "sft-dpo" -model.load_adapter("alignment-handbook/zephyr-7b-dpo-lora", adapter_name="dpo") -model.add_weighted_adapter( - adapters=["sft", "dpo"], - weights=[0.7, 0.3], - adapter_name=weighted_adapter_name, - combination_type="linear" -) -model.set_adapter(weighted_adapter_name) -``` - -> [!TIP] -> There are several supported methods for `combination_type`. Refer to the [documentation](../package_reference/lora#peft.LoraModel.add_weighted_adapter) for more details. Note that "svd" as the `combination_type` is not supported when using `torch.float16` or `torch.bfloat16` as the datatype. - -Now, perform inference: - -```python -device = torch.accelerator.current_accelerator().type if hasattr(torch, "accelerator") else "cuda" - -tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-v0.1") - -prompt = "Hey, are you conscious? Can you talk to me?" 
-inputs = tokenizer(prompt, return_tensors="pt") -inputs = {k: v.to(device) for k, v in inputs.items()} - -with torch.no_grad(): - generate_ids = model.generate(**inputs, max_length=30) -outputs = tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0] -print(outputs) -``` - -### Recovering base model performance via intruder dimension reduction - -The paper [LoRA vs Full Fine-tuning: An Illusion of Equivalence](https://huggingface.co/papers/2410.21228) argues -that LoRA training introduces extra dimensions into the weights that have very little in common with the already -learnt weights and lead to forgetting of already learned information. PEFT implements the suggested mitigation -in [`peft.tuners.lora.intruders.reduce_intruder_dimension`]. - -The mitigation will take a PEFT model with a loaded LoRA and create a new, modified adapter that is loaded alongside -the existing adapter and now the active adapter. - -Example usage: - -```python -from peft.tuners.lora.intruders import reduce_intruder_dimension - -peft_model = AutoPeftModelForCausalLM.from_pretrained('hubnemo/llama-3.2b-metamathqa-lora64') - -reduce_intruder_dimension( - peft_model, - mitigation_lambda=0.75, -) - -peft_model.generate(...) -``` - -There are a few hyper-parameters that can be used for tuning the effectiveness of the mitigation but, as evidenced -in Figure 8 of the paper, it will always be a trade-off between task accuracy learned by the adapter and forgetting -of the base model's knowledge. The mitigation will remove information from the adapter to reduce the impact on -forgetting previous knowledge but this also means that some information about the task learned by the adapter is -lost as well. - -While the defaults are set to deliver a good trade-off between the two factors it is not guaranteed that the defaults -will hold for your adapter, your model and your data, therefore it is wise to have a benchmark ready to measure -the effect. 
- -## Load adapters - -Adapters can be loaded onto a pretrained model with [`~PeftModel.load_adapter`], which is useful for trying out different adapters whose weights aren't merged. Set the active adapter weights with the [`~LoraModel.set_adapter`] function. - -```py -from transformers import AutoModelForCausalLM -from peft import PeftModel - -base_model = AutoModelForCausalLM.from_pretrained("mistralai/Mistral-7B-v0.1") -peft_model_id = "alignment-handbook/zephyr-7b-sft-lora" -model = PeftModel.from_pretrained(base_model, peft_model_id) - -# load different adapter -model.load_adapter("alignment-handbook/zephyr-7b-dpo-lora", adapter_name="dpo") - -# set adapter as active -model.set_adapter("dpo") -``` - -To return the base model, you could use [`~LoraModel.unload`] to unload all of the LoRA modules or [`~LoraModel.delete_adapter`] to delete the adapter entirely. [`~LoraModel.unload`] is not an in-place operation, remember to assign the returned model to a variable and use it. - -```py -# unload adapter -model = model.unload() - -# delete adapter -model.delete_adapter("dpo") -``` - -## Tensor Parallelism - -LoRA supports [Tensor Parallelism (TP)](https://huggingface.co/docs/transformers/main/en/perf_train_gpu_many#tensor-parallelism) as provided by Transformers. When a base model is loaded with a `tp_plan`, PEFT automatically detects the TP configuration of each target module and adds the appropriate hooks to the LoRA adapter weights so that they participate correctly in the tensor-parallel computation. - -> [!WARNING] -> Tensor Parallelism support for LoRA requires `transformers >= 5.4.0`. 
- -Usage is identical to the standard LoRA workflow — simply load the base model with a `tp_plan` before wrapping it with PEFT: - -```py -from transformers import AutoModelForCausalLM -from peft import get_peft_model, LoraConfig - -model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-3.1-8B", tp_plan="auto") -lora_config = LoraConfig(r=16, target_modules=["q_proj", "v_proj"]) -model = get_peft_model(model, lora_config) -``` - -Saving and loading work as usual via `save_pretrained` / `from_pretrained`. PEFT gathers the sharded adapter weights back to full tensors before saving, so checkpoints are portable and independent of the number of devices used during training. - -## Inference - -This section showcases what you can do during inference time with LoRA, such as uncoupling the adapter. - -### Activated LoRA (aLoRA) - -Activated LoRA (aLoRA) is a low rank adapter architecture for causal LMs that reuses the existing base model KV cache for more efficient inference. This approach is best suited for inference pipelines which rely on the base model for most tasks/generations, but use aLoRA adapter(s) to perform specialized task(s) within the chain. For example, checking or correcting generated outputs of the base model. In these settings, inference times can be sped up by an order of magnitude or more. For more information on aLoRA and many example use cases, see the aLoRA [paper](https://huggingface.co/papers/2504.12397). - -This technique scans for the last occurrence of an invocation sequence (`alora_invocation_tokens`) in each input (this can be as short as 1 token). It activates the adapter weights on tokens starting with the beginning of the invocation sequence. Any inputs after the invocation sequence are also adapted, and all generated tokens will use the adapted weights. Weights on prior tokens are left un-adapted, making the cache for those tokens interchangeable with base model cache due to the causal attention mask in causal LMs. 
Usage is very similar to standard LoRA. The key difference is that the invocation sequence must be specified when the adapter is created: - -```py -from peft import LoraConfig - -config = LoraConfig(alora_invocation_tokens=alora_invocation_tokens, task_type="CAUSAL_LM", ...) -``` - -alora_invocation_tokens` is a list of integer token ids. Given a desired invocation string, this can be obtained as: -```py -invocation_string = "placeholder" -alora_invocation_tokens = tokenizer.encode(invocation_string, add_special_tokens=False). -``` -The tokenizer is the base model's tokenizer. Use `add_special_tokens=False` to avoid adding `SOS`/`EOS` tokens in our search string (which will most likely cause the search to fail). - -**Notes** -* aLoRA is only supported for `task_type=CAUSAL_LM` tasks due to its focus on cache reuse. -* Since the weights are adapted on fewer tokens, often (not always) aLoRA requires higher rank (`r`) than LoRA. `r=32` can be a good starting point. -* aLoRA weights cannot be merged into the base model by definition, since the adapter weights are selectively applied to a subset of tokens. Attempts to merge will throw errors. -* Beam search is not yet supported. -* It is generally not recommended to add new tokens to the tokenizer that are not present in the base model. This can complicate the target use case of both the base model and adapter model operating on overlapping context. You can workaround this by adding [trainable tokens](../package_reference/trainable_tokens) to the base model prior to training the adapter. - -#### Choice of invocation sequence and SFT design - -You must add the `alora_invocation_tokens` sequence because it is not added automatically. We recommend activating the adapter weights early (at the start of any adapter-specific prompting), but after any long inputs, to maximize model performance without compromising cache reuse. As with any model, -formatting should be consistent between train and test. 
- -Consider the following example, where the base model has a chat template, -and the goal is to train the adapter to generate a desired output. - -* Option 1: If there is no task-specific prompt, i.e. the input is a chat history with the `assistant` prompt, then the chat template's `assistant` prompt (e.g. `<|start_of_role|>assistant<|end_of_role|>`) is a natural choice for the invocation string. See the model's chat template to find the prompt for the model. -* Option 2: If there is a task-specific prompt for the adapter that describes the task the adapter is learning, and that prompt is put as a `user` turn immediately prior to the generation, then the chat template's `user` prompt (e.g. `<|start_of_role|>user<|end_of_role|>`) is a natural choice for the invocation string. - -After deciding on an invocation string, get the model tokenizer and obtain `alora_invocation_tokens` as -```py -alora_invocation_tokens = tokenizer.encode(invocation_string, add_special_tokens=False). -``` - -An example inference setup is at [alora finetuning](https://github.com/huggingface/peft/blob/main/examples/alora_finetuning/alora_finetuning.py). - -> [!NOTE] -> If using custom strings for the invocation string, make sure that the start and end of the string are special tokens to avoid issues with tokenization at the boundaries. - -To see why, imagine that 'a', 'b', 'c', and 'ab' are tokens in your tokenizer (numbers 1, 2, 3, 4 respectively). Suppose that your alora_invocation_tokens = [2, 3]. Now imagine your input string is "abc". Because "ab" is a token, this will get tokenized as [4,3]. So the alora_invocation_tokens will fail to be found, despite the string "bc" being in it. If the start and end of the invocation string are special tokens, however, this failure case will never happen since special tokens are never tokenized into the same token with other characters. 
- -#### Using (and reusing) cache for generation -The main purpose of aLoRA is to make KV cache interchangeable between the base model and aLoRA adapter models **prior to the invocation sequence** since base and adapted KV values are not compatible. Specifically, keys and values stored during one model generation can be used in subsequent generations to avoid expensive prefill operations for context tokens. When sharing cache between the base model and aLoRA adapters, there are 2 main patterns: -1. The base model has generated something, and an aLoRA adapter is then called to do a follow-up generation. For example, the base model answers a question, and an aLoRA trained to detect hallucinations checks the base model response. -2. An aLoRA adapter has generated something, and the base model or a different aLoRA adapter is called to do a follow-up generation where there is partial context overlap with the original aLoRA. For example, the user provides a query, and an aLoRA rewrites the query to be more self-contained and improve retrieval in a RAG system. Then, documents are retrieved and loaded into context, aLoRA checks if these documents are relevant to the question, and then the base model generates an answer. - - -To demonstrate the above behaviors when using caching, we're using [DynamicCache](https://huggingface.co/docs/transformers/en/kv_cache) from `transformers`. Take care to ensure that adapted cache values are not mixed with base cache values. In particular, an extra step is required for sharing the cache when there is partial context overlap (pattern 2). - -**Pattern 1: Base model followed by aLoRA** Here, the entire input and generation from the base model is input into the aLoRA adapter, along with the invocation sequence: -``` -from transformers import DynamicCache -... 
-cache = DynamicCache() -inputs_base = tokenizer(prompt_base, return_tensors="pt") -# Generate from base model and save cache -with model_alora.disable_adapter(): - output = model_alora.generate(inputs_base["input_ids"].to(device),attention_mask=inputs_base["attention_mask"].to(device),past_key_values = cache,return_dict_in_generate=True) -output_text_base = tokenizer.decode(output.sequences[0]) -cache = output.past_key_values - -# Generate with aLoRA adapter from cache -prompt_alora = output_text + INVOCATION_STRING -inputs_alora = tokenizer(prompt_alora, return_tensors="pt").to(device) -output = model_alora.generate(**inputs_alora, past_key_values=cache) -output_text_alora = tokenizer.decode(output[0]) - -# Note: cache is now tainted with adapter values and cannot be used in base model from here on! -``` - -**Pattern 2: aLoRA generation followed by base model (or another aLoRA) with partial context overlap** Here, we prefill the shared context using the base model, and then generate. - -``` -from transformers import DynamicCache -import copy -... 
-cache = DynamicCache() -inputs_shared = tokenizer(prompt_shared, return_tensors="pt").to(device) - -# Prefill from base model and save cache -with model_alora.disable_adapter(): - with torch.no_grad(): - model_alora(**inputs_shared, past_key_values=cache) -cache_copy = copy.deepcopy(cache) - -# Generate from aLoRA using prefilled cache -prompt_alora = prompt_shared + INVOCATION_STRING -inputs_alora = tokenizer(prompt_alora, return_tensors="pt").to(device) -output = model_alora.generate(**inputs_alora, past_key_values=cache) -output_text_alora = tokenizer.decode(output[0]) - -# Generate from base model using saved cache not tainted by aLoRA KV values -prompt_base = prompt_shared -inputs_base = tokenizer(prompt_base, return_tensors="pt").to(device) -with model_alora.disable_adapter(): - output = model_alora.generate(**inputs_base, past_key_values=cache_copy) -output_text_base = tokenizer.decode(output[0]) -``` - - -### Inference with different LoRA adapters in the same batch - -Normally, each inference batch has to use the same adapter(s) in PEFT. This can sometimes be annoying, because we may have batches that contain samples intended to be used with different LoRA adapters. For example, we could have a base model that works well in English and two more LoRA adapters, one for French and one for German. Usually, we would have to split our batches such that each batch only contains samples of one of the languages, we cannot combine different languages in the same batch. - -Thankfully, it is possible to mix different LoRA adapters in the same batch using the `adapter_name` argument. Below, we show an example of how this works in practice. First, let's load the base model, English, and the two adapters, French and German, like this: - -```python -from transformers import AutoTokenizer, AutoModelForCausalLM -from peft import PeftModel - -model_id = ... 
-tokenizer = AutoTokenizer.from_pretrained(model_id) - -model = AutoModelForCausalLM.from_pretrained(model_id) -# load the LoRA adapter for French -peft_model = PeftModel.from_pretrained(model, , adapter_name="adapter_fr") -# next, load the LoRA adapter for German -peft_model.load_adapter(, adapter_name="adapter_de") -``` - -Now, we want to generate text on a sample that contains all three languages: The first three samples are in English, the next three are in French, and the last three are in German. We can use the `adapter_names` argument to specify which adapter to use for each sample. Since our base model is used for English, we use the special string `"__base__"` for these samples. For the next three samples, we indicate the adapter name of the French LoRA fine-tune, in this case `"adapter_fr"`. For the last three samples, we indicate the adapter name of the German LoRA fine-tune, in this case `"adapter_de"`. This way, we can use the base model and the two adapters in a single batch. - -```python -inputs = tokenizer( - [ - "Hello, my dog is cute", - "Hello, my cat is awesome", - "Hello, my fish is great", - "Salut, mon chien est mignon", - "Salut, mon chat est génial", - "Salut, mon poisson est super", - "Hallo, mein Hund ist süß", - "Hallo, meine Katze ist toll", - "Hallo, mein Fisch ist großartig", - ], - return_tensors="pt", - padding=True, -) - -adapter_names = [ - "__base__", "__base__", "__base__", - "adapter_fr", "adapter_fr", "adapter_fr", - "adapter_de", "adapter_de", "adapter_de", -] -output = peft_model.generate(**inputs, adapter_names=adapter_names, max_new_tokens=20) -``` - -Note that the order does not matter here, i.e. the samples in the batch don't need to be grouped by adapter as in the example above. We just need to ensure that the `adapter_names` argument is aligned correctly with the samples. 
- -Additionally, the same approach also works with the `modules_to_save` feature, which allows for saving and reusing specific neural network layers, such as custom heads for classification tasks, across different LoRA adapters. - -#### Caveats - -Using this feature has some drawbacks, namely: - -- It only works for inference, not for training. -- Disabling adapters using the `with model.disable_adapter()` context takes precedence over `adapter_names`. -- You cannot pass `adapter_names` when some adapter weights were merged with base weight using the `merge_adapter` method. Please unmerge all adapters first by calling `model.unmerge_adapter()`. -- For obvious reasons, this cannot be used after calling `merge_and_unload()`, since all the LoRA adapters will be merged into the base weights in this case. -- This feature does not currently work with DoRA, so set `use_dora=False` in your `LoraConfig` if you want to use it. -- The `modules_to_save` feature is currently only supported for the layers of types `Linear`, `Embedding`, `Conv2d` and `Conv1d`. -- There is an expected overhead for inference with `adapter_names`, especially if the amount of different adapters in the batch is high. This is because the batch size is effectively reduced to the number of samples per adapter. If runtime performance is your top priority, try the following: - - Increase the batch size. - - Try to avoid having a large number of different adapters in the same batch, prefer homogeneous batches. This can be achieved by buffering samples with the same adapter and only perform inference with a small handful of different adapters. - - Take a look at alternative implementations such as [LoRAX](https://github.com/predibase/lorax), [punica](https://github.com/punica-ai/punica), or [S-LoRA](https://github.com/S-LoRA/S-LoRA), which are specialized to work with a large number of different adapters. 
- - -### Composing and Reusing LoRA Adapters -#### Arrow -[Arrow](https://huggingface.co/papers/2405.11157) is a modular routing algorithm designed to combine multiple pre-trained task-specific LoRA adapters to solve a given task. Rather than merging all adapters naively, Arrow introduces a **gradient-free, token-wise mixture-of-experts (MoE) routing mechanism**. At inference time, it first computes a _prototype_ for each LoRA by extracting the top right singular vector from its SVD decomposition. Each token representation is then compared to these prototypes via cosine similarity to obtain routing coefficients. Tokens are assigned to the top-k most relevant LoRA adapters, with the coefficients normalized through softmax, and their outputs linearly combined. This allows effective reuse of existing LoRA modules for new tasks and leads to stronger zero-shot generalization. - -In PEFT, Arrow is enabled through [`ArrowConfig]` and `create_arrow_model`. You can also configure parameters such as `top_k` (the number of LoRA adapters combined per token), `router_temperature` (the softmax temperature applied to the routing coefficients), and `rng_seed` (for reproducibility). - -```py -from peft import create_arrow_model, ArrowConfig -from transformers import AutoModelForCausalLM - -# Loading the model -base_model = AutoModelForCausalLM.from_pretrained("microsoft/Phi-3-mini-4k-instruct") - -# Creating the Arrow config -arrow_config = ArrowConfig( - top_k=3, - router_temperature=1.0, - rng_seed=42, -) - -# The LoRA adapters below were trained on a clustered FLAN dataset. -# Task clustering was performed using the Model-Based Clustering (MBC) method, -# as described in the Arrow paper. -# While one could train a separate LoRA for each task and let Arrow route tokens among them, -# training LoRAs on clusters of tasks instead provides an indirect optimization for -# transfer across the multi-task dataset. 
-task_specific_adapter_paths = [ - f"TahaBa/phi3-mini-clustered-flan/ts_expert_{i}" for i in range(10) - ] - -# Creating the Arrow model -model = create_arrow_model( - base_model=base_model, - task_specific_adapter_paths=task_specific_adapter_paths, - arrow_config=arrow_config, - ) - -# Now the forward path could be called on this model, like a normal PeftModel. -``` - -Furthermore, you can add or remove adapters after calling ```create_arrow_model```—for example, to fine-tune a new adapter or discard an unnecessary one. Once the adapters are in place, you can activate the ```"arrow_router"``` for inference to use Arrow. Note that if you add a new LoRA adapter after ```create_arrow_model``` and want to fine-tune it, you must explicitly set the new adapter as active, since ```"arrow_router"``` is activated by default in ```create_arrow_model```. - -```py -from trl import SFTTrainer, SFTConfig - -# Adding a new adapter and activating it -model.add_adapter(adapter_name='new_adapter') -model.set_adapter('new_adapter') - -# Now the model could be trained along the `new_adapter`. -trainer = SFTTrainer( - model=model, - args=SFTConfig(...), - ... - ) - -# Once the training is done, you can activate `arrow_router` and use it in inference -model.set_adapter('arrow_router') # Model is ready to be used at inference time now -``` - -#### GenKnowSub -[GenKnowSub](https://aclanthology.org/2025.acl-short.54/) augments Arrow by purifying task-specific LoRA adapters before routing. The key idea is to subtract general knowledge encoded in LoRA space—based on the [forgetting-via-negation principle](https://huggingface.co/papers/2212.04089)—so that task adapters become more isolated and focused on task-relevant signals. Concretely, GenKnowSub estimates a low-dimensional “general” subspace from a set of general (non task-specific) LoRA adapters and removes this component from each task adapter’s LoRA update prior to Arrow’s token-wise routing. 
This typically improves compositionality and reduces interference when combining many task adapters. - -In PEFT, enable GenKnowSub by setting ```use_gks=True``` in ArrowConfig, and providing ```general_adapter_paths``` in ```create_arrow_model```: - -```py -from peft import create_arrow_model, ArrowConfig -from transformers import AutoModelForCausalLM - -# Loading the model -base_model = AutoModelForCausalLM.from_pretrained("microsoft/Phi-3-mini-4k-instruct") - -# Creating the Arrow config -arrow_config = ArrowConfig( - top_k=3, - router_temperature=1.0, - use_gks=True, - rng_seed=42, -) - -# Path to task-specific, trained on flan clustered dataset (as we explained before.) -task_specific_adapter_paths = [ - f"TahaBa/phi3-mini-clustered-flan/ts_expert_{i}" for i in range(10) - ] -# These general adapters are trained on English, German, and French Wikipedia dataset, -# with causal language modelling objective, each pair like: (507 token tsentence, 5 token completion), and the loss computed on the completion -general_adapter_paths = [ - "TahaBa/phi3-mini-general-adapters/cluster0_batch16_prop1.0_langen/checkpoint-17", - "TahaBa/phi3-mini-general-adapters/cluster0_batch16_prop1.0_langfr/checkpoint-35", - "TahaBa/phi3-mini-general-adapters/cluster0_batch16_prop1.0_langger/checkpoint-17" - ] - -# Creating the Arrow model -model = create_arrow_model( - base_model=base_model, - task_specific_adapter_paths=task_specific_adapter_paths, - general_adapter_paths=general_adapter_paths, - arrow_config=arrow_config, - ) - -# Now the forward path could be called on this model, like a normal PeftModel. -``` -To encode general knowledge, GenKnowSub subtracts the average of the provided general adapters from each task-specific adapter once, before routing begins. Furthermore, the ability to add or remove adapters after calling ```create_arrow_model``` (as described in the Arrow section) is still supported in this case. 
- -> [!TIP] -> **Things to keep in mind when using Arrow + GenKnowSub:** -> -> - All LoRA adapters (task-specific and general) must share the same ```rank``` and ```target_modules```. -> -> - Any inconsistency in these settings will raise an error in ```create_arrow_model```. -> -> - Having different scaling factors (```lora_alpha```) across task adapters is supported — Arrow handles them automatically. -> -> - Merging the ```"arrow_router"``` is not supported, due to its dynamic routing behavior. -> -> - In create_arrow_model, task adapters are loaded as ```task_i``` and general adapters as ```gks_j``` (where ```i``` and ```j``` are indices). The function ensures consistency of ```target_modules```, ```rank```, and whether adapters are applied to ```Linear``` or ```Linear4bit``` layers. It then adds the ```"arrow_router"``` module and activates it. Any customization of this process requires overriding ```create_arrow_model```. -> -> - This implementation is compatible with 4-bit quantization (via bitsandbytes): -> -> ```py -> from transformers import AutoModelForCausalLM, BitsAndBytesConfig -> import torch -> -> # Quantisation config -> bnb_config = BitsAndBytesConfig( -> load_in_4bit=True, -> bnb_4bit_quant_type="nf4", -> bnb_4bit_compute_dtype=torch.bfloat16, -> bnb_4bit_use_double_quant=False, -> ) -> -> # Loading the model -> base_model = AutoModelForCausalLM.from_pretrained( -> "microsoft/Phi-3-mini-4k-instruct", -> dtype=torch.bfloat16, -> device_map="auto", -> quantization_config=bnb_config, -> ) -> -> # Now call create_arrow_model() as we explained before. -> ``` diff --git a/docs/source/package_reference/adalora.md b/docs/source/package_reference/adalora.md index cacdb4bf1e..21580ecea9 100644 --- a/docs/source/package_reference/adalora.md +++ b/docs/source/package_reference/adalora.md @@ -16,12 +16,24 @@ rendered properly in your Markdown viewer. 
# AdaLoRA -[AdaLoRA](https://hf.co/papers/2303.10512) is a method for optimizing the number of trainable parameters to assign to weight matrices and layers, unlike LoRA, which distributes parameters evenly across all modules. More parameters are budgeted for important weight matrices and layers while less important ones receive fewer parameters. +[AdaLoRA](https://hf.co/papers/2303.10512) (Adaptive LoRA) is a method for optimizing the number of trainable parameters to assign to weight matrices and layers, unlike LoRA, which distributes parameters evenly across all modules. More parameters are budgeted for important weight matrices and layers while less important ones receive fewer parameters. You can control the average desired *rank* or `r` of the matrices, and which modules to apply AdaLoRA to with `target_modules`. Other important parameters to set are `lora_alpha` (scaling factor), and `modules_to_save` (the modules apart from the AdaLoRA layers to be trained and saved). All of these parameters - and more - are found in the [`AdaLoraConfig`]. The abstract from the paper is: *Fine-tuning large pre-trained language models on downstream tasks has become an important paradigm in NLP. However, common practice fine-tunes all of the parameters in a pre-trained model, which becomes prohibitive when a large number of downstream tasks are present. Therefore, many fine-tuning methods are proposed to learn incremental updates of pre-trained weights in a parameter efficient way, e.g., low-rank increments. These methods often evenly distribute the budget of incremental updates across all pre-trained weight matrices, and overlook the varying importance of different weight parameters. As a consequence, the fine-tuning performance is suboptimal. To bridge this gap, we propose AdaLoRA, which adaptively allocates the parameter budget among weight matrices according to their importance score. 
In particular, AdaLoRA parameterizes the incremental updates in the form of singular value decomposition. Such a novel approach allows us to effectively prune the singular values of unimportant updates, which is essentially to reduce their parameter budget but circumvent intensive exact SVD computations. We conduct extensive experiments with several pre-trained models on natural language processing, question answering, and natural language generation to validate the effectiveness of AdaLoRA. Results demonstrate that AdaLoRA manifests notable improvement over baselines, especially in the low budget settings. Our code is publicly available at https://github.com/QingruZhang/AdaLoRA*. +> [!WARNING] +> AdaLoRA has an [`~AdaLoraModel.update_and_allocate`] method that should be called at each training step to update the parameter budget and mask, otherwise the adaptation step is not performed. This requires writing a custom training loop or subclassing the [`~transformers.Trainer`] to incorporate this method. As an example, take a look at this [custom training loop](https://github.com/huggingface/peft/blob/912ad41e96e03652cabf47522cd876076f7a0c4f/examples/conditional_generation/peft_adalora_seq2seq.py#L120). + +AdaLoRA manages the parameter budget introduced from LoRA by allocating more parameters - in other words, a higher rank `r` - for important weight matrices that are better adapted for a task and pruning less important ones. The rank is controlled by a method similar to singular value decomposition (SVD). The $\Delta W$ is parameterized with two orthogonal matrices and a diagonal matrix which contains singular values. This parametrization method avoids iteratively applying SVD which is computationally expensive. Based on this method, the rank of $\Delta W$ is adjusted according to an importance score. $\Delta W$ is divided into triplets and each triplet is scored according to its contribution to model performance. 
Triplets with low importance scores are pruned and triplets with high importance scores are kept for finetuning. + +Training with AdaLoRA has three phases: the init phase, the budgeting phase and the final phase. In the initial phase, no budgeting is applied, therefore the ranks are not touched. During the budgeting phase the process described above is applied and the rank is redistributed according to a budget, aiming to give more important adapters more rank and less important layers less. When reaching the final phase, budgeting has ended, the ranks are redistributed but we may continue training for a while with the redistributed ranks to further improve performance. + +> [!NOTE] +> **Contributions welcome**: This section needs clarification. +> +> It is unclear how importance is measured. The explanations are also a bit redundant and could benefit from consolidation. +> See [here](../developer_guides/contributing#documentation-improvements) on how to contribute. ## Benchmark overview @@ -32,6 +44,29 @@ The abstract from the paper is: height="1000" > +## Usage + + +```py +from peft import AdaLoraConfig, get_peft_model + +config = AdaLoraConfig( + r=8, + init_r=12, + tinit=200, + tfinal=1000, + deltaT=10, + target_modules=["query", "value"], + modules_to_save=["classifier"], +) +model = get_peft_model(model, config) +model.print_trainable_parameters() +"trainable params: 520,325 || all params: 87,614,722 || trainable%: 0.5938785036606062" + +[... training code ...] + +model.update_and_allocate(step_idx) +``` # API diff --git a/docs/source/package_reference/boft.md b/docs/source/package_reference/boft.md index 48231fa9fd..b65903a19e 100644 --- a/docs/source/package_reference/boft.md +++ b/docs/source/package_reference/boft.md @@ -22,6 +22,23 @@ The abstract from the paper is: *Large foundation models are becoming ubiquitous, but training them from scratch is prohibitively expensive. 
Thus, efficiently adapting these powerful models to downstream tasks is increasingly important. In this paper, we study a principled finetuning paradigm -- Orthogonal Finetuning (OFT) -- for downstream task adaptation. Despite demonstrating good generalizability, OFT still uses a fairly large number of trainable parameters due to the high dimensionality of orthogonal matrices. To address this, we start by examining OFT from an information transmission perspective, and then identify a few key desiderata that enable better parameter-efficiency. Inspired by how the Cooley-Tukey fast Fourier transform algorithm enables efficient information transmission, we propose an efficient orthogonal parameterization using butterfly structures. We apply this parameterization to OFT, creating a novel parameter-efficient finetuning method, called Orthogonal Butterfly (BOFT). By subsuming OFT as a special case, BOFT introduces a generalized orthogonal finetuning framework. Finally, we conduct an extensive empirical study of adapting large vision transformers, large language models, and text-to-image diffusion models to various downstream tasks in vision and language*. +BOFT focuses on preserving a pretrained model's generative capabilities while being significantly more parameter-efficient than standard [OFT](./oft). Like OFT, BOFT maintains the same cosine similarity (hyperspherical energy) between all pairwise neurons in a layer by applying an orthogonal transformation to the pretrained weight matrix, ensuring the semantic relationships among neurons are preserved. + +Instead of using a block-diagonal orthogonal matrix, BOFT factorizes the orthogonal transformation into a product of **sparse butterfly matrices** (originally introduced in the [Cooley–Tukey FFT](https://en.wikipedia.org/wiki/Cooley%E2%80%93Tukey_FFT_algorithm)). 
Unlike OFT's block-diagonal rotations, which only mix inputs within each block, the butterfly structure guarantees that every input can influence every output, producing a **dense connectivity** with just `O(d log d)` parameters. This factorization preserves expressivity while drastically reducing the parameter count compared to OFT (at the expense of computation time). + +In practice, BOFT multiplies each pretrained weight matrix by a sequence of butterfly-structured orthogonal factors, enabling efficient and expressive neuron rotations. This makes BOFT well-suited for controllable generation and tasks where maintaining the pretrained model's subject representation is critical, while also scaling to larger models with lower memory and compute overhead. + +## Benchmark overview + + + +# API + ## BOFTConfig [[autodoc]] tuners.boft.config.BOFTConfig diff --git a/docs/source/package_reference/c3a.md b/docs/source/package_reference/c3a.md index 05ac7c1193..04baf55f3e 100644 --- a/docs/source/package_reference/c3a.md +++ b/docs/source/package_reference/c3a.md @@ -34,6 +34,18 @@ The abstract from the paper is: > Low-Rank Adaptation (LoRA) has gained popularity for fine-tuning large foundation models, leveraging low-rank matrices $\mathbf{A}$ and $\mathbf{B}$ to represent weight changes (i.e., $\Delta \mathbf{W} = \mathbf{B} \mathbf{A}$). This method reduces trainable parameters and mitigates heavy memory consumption associated with full delta matrices by sequentially multiplying $\mathbf{A}$ and $\mathbf{B}$ with the activation. Despite its success, the intrinsic low-rank characteristic may limit its performance. Although several variants have been proposed to address this issue, they often overlook the crucial computational and memory efficiency brought by LoRA. In this paper, we propose Circular Convolution Adaptation (C3A), which not only achieves high-rank adaptation with enhanced performance but also excels in both computational power and memory utilization. 
Extensive experiments demonstrate that C3A consistently outperforms LoRA and its variants across various fine-tuning tasks. +## Benchmark overview + + + + +# API + ## C3AConfig [[autodoc]] tuners.c3a.config.C3AConfig diff --git a/docs/source/package_reference/loha.md b/docs/source/package_reference/loha.md index 8632612127..0a5fb52be1 100644 --- a/docs/source/package_reference/loha.md +++ b/docs/source/package_reference/loha.md @@ -18,12 +18,16 @@ rendered properly in your Markdown viewer. Low-Rank Hadamard Product ([LoHa](https://huggingface.co/papers/2108.06098)), is similar to LoRA except it approximates the large weight matrix with more low-rank matrices and combines them with the Hadamard product. This method is even more parameter-efficient than LoRA and achieves comparable performance. LoHa was originally proposed for federated learning (FedPara) but works well as a general-purpose PEFT method, and is especially popular for fine-tuning image generation models such as Stable Diffusion. -> **Note:** LoHa is part of the [LyCORIS](./adapter_utils) family of adapters. Its close relative [LoKr](./lokr) uses the Kronecker product instead of the Hadamard product. For more background on how LoHa works conceptually, see the [Adapters guide](../conceptual_guides/adapter#low-rank-hadamard-product-loha). +> **Note:** LoHa is part of the [LyCORIS](./adapter_utils) family of adapters. Its close relative [LoKr](./lokr) uses the Kronecker product instead of the Hadamard product. The abstract from the paper is: *In this work, we propose a communication-efficient parameterization, FedPara, for federated learning (FL) to overcome the burdens on frequent model uploads and downloads. Our method re-parameterizes weight parameters of layers using low-rank weights followed by the Hadamard product. Compared to the conventional low-rank parameterization, our FedPara method is not restricted to low-rank constraints, and thereby it has a far larger capacity. 
This property enables to achieve comparable performance while requiring 3 to 10 times lower communication costs than the model with the original layers, which is not achievable by the traditional low-rank methods. The efficiency of our method can be further improved by combining with other efficient FL optimizers. In addition, we extend our method to a personalized FL application, pFedPara, which separates parameters into global and local ones. We show that pFedPara outperforms competing personalized FL methods with more than three times fewer parameters.* +Low-rank decomposition can impact performance because the weight updates are limited to the low-rank space, which can constrain a model's expressiveness. However, you don't necessarily want to use a larger rank because it increases the number of trainable parameters. To address this, LoHa was applied to diffusion models where the ability to generate diverse images is an important consideration. LoHa should also work with general model types, but support for embedding layers isn't currently implemented in PEFT. + +LoHa uses the [Hadamard product](https://en.wikipedia.org/wiki/Hadamard_product_(matrices)) (element-wise product) instead of the matrix product. $\Delta W$ is represented by four smaller matrices instead of two - like in LoRA - and each pair of these low-rank matrices are combined with the Hadamard product. As a result, $\Delta W$ can have the same number of trainable parameters but a higher rank and expressivity. + ## When to use LoHa LoHa is a good choice when: @@ -34,7 +38,7 @@ LoHa is a good choice when: LoHa supports linear and Conv2d layers. For tasks that additionally require embedding layer adaptation, consider [LoRA](./lora) instead. 
-## Quick start +## Usage ```python from diffusers import StableDiffusionPipeline @@ -61,6 +65,17 @@ pipeline.unet = get_peft_model(pipeline.unet, config_unet) pipeline.unet.print_trainable_parameters() ``` +## Benchmark overview + + + +# API + ## LoHaConfig [[autodoc]] tuners.loha.config.LoHaConfig diff --git a/docs/source/package_reference/lokr.md b/docs/source/package_reference/lokr.md index 5be43f8546..679d35b716 100644 --- a/docs/source/package_reference/lokr.md +++ b/docs/source/package_reference/lokr.md @@ -16,7 +16,39 @@ rendered properly in your Markdown viewer. # LoKr -Low-Rank Kronecker Product ([LoKr](https://hf.co/papers/2309.14859)), is a LoRA-variant method that approximates the large weight matrix with two low-rank matrices and combines them with the Kronecker product. LoKr also provides an optional third low-rank matrix to provide better control during fine-tuning. +Low-Rank Kronecker Product ([LoKr](https://hf.co/papers/2309.14859)), is a LoRA-variant method that approximates the large weight matrix with two low-rank matrices and combines them with the [Kronecker product](https://en.wikipedia.org/wiki/Kronecker_product). LoKr also provides an optional third low-rank matrix to provide better control during fine-tuning. By expressing the weight update matrix as a decomposition of a Kronecker product, creating a block matrix, LoKr is able to preserve the rank of the original weight matrix. The size of the smaller matrices is determined by the *rank* or `r`. Another benefit of the Kronecker product is that it can be vectorized by stacking the matrix columns. This can speed up the process because you're avoiding fully reconstructing ∆W. + +The abstract from the paper is: + +*Text-to-image generative models have garnered immense attention for their ability to produce high-fidelity images from text prompts. Among these, Stable Diffusion distinguishes itself as a leading open-source model in this fast-growing field. 
However, the intricacies of fine-tuning these models pose multiple challenges from new methodology integration to systematic evaluation. Addressing these issues, this paper introduces LyCORIS [Lora beYond Conventional methods, Other Rank adaptation Implementations for Stable diffusion](https://github.com/KohakuBlueleaf/LyCORIS), an open-source library that offers a wide selection of fine-tuning methodologies for Stable Diffusion. Furthermore, we present a thorough framework for the systematic assessment of varied fine-tuning techniques. This framework employs a diverse suite of metrics and delves into multiple facets of fine-tuning, including hyperparameter adjustments and the evaluation with different prompt types across various concept categories. Through this comprehensive approach, our work provides essential insights into the nuanced effects of fine-tuning parameters, bridging the gap between state-of-the-art research and practical application.* + +## Usage + +```py +from peft import LoKrConfig, get_peft_model + +config = LoKrConfig( + r=16, + alpha=16, + target_modules=["query", "value"], + module_dropout=0.1, + modules_to_save=["classifier"], +) +model = get_peft_model(model, config) +model.print_trainable_parameters() +"trainable params: 116,069 || all params: 87,172,042 || trainable%: 0.13314934162033282" +``` + +## Benchmark overview + + + +# API ## LoKrConfig @@ -24,4 +56,4 @@ Low-Rank Kronecker Product ([LoKr](https://hf.co/papers/2309.14859)), is a LoRA- ## LoKrModel -[[autodoc]] tuners.lokr.model.LoKrModel \ No newline at end of file +[[autodoc]] tuners.lokr.model.LoKrModel diff --git a/docs/source/package_reference/lora.md b/docs/source/package_reference/lora.md index 64d698540e..85a868a7d6 100644 --- a/docs/source/package_reference/lora.md +++ b/docs/source/package_reference/lora.md @@ -77,6 +77,167 @@ model.print_trainable_parameters() > +## Initialization + +The initialization of LoRA weights is controlled by the parameter `init_lora_weights` in 
[`LoraConfig`]. By default, PEFT initializes LoRA weights with Kaiming-uniform for weight A and zeros for weight B resulting in an identity transform (same as the reference [implementation](https://github.com/microsoft/LoRA)). + +It is also possible to pass `init_lora_weights="gaussian"`. As the name suggests, this initializes weight A with a Gaussian distribution and zeros for weight B (this is how [Diffusers](https://huggingface.co/docs/diffusers/index) initializes LoRA weights). + +```py +from peft import LoraConfig + +config = LoraConfig(init_lora_weights="gaussian", ...) +``` + +There is also an option to set `init_lora_weights=False` which is useful for debugging and testing. This should be the only time you use this option. When choosing this option, the LoRA weights are initialized such that they do *not* result in an identity transform. + +```py +from peft import LoraConfig + +config = LoraConfig(init_lora_weights=False, ...) +``` + + + +[PiSSA](https://huggingface.co/papers/2404.02948) initializes the LoRA adapter using the principal singular values and singular vectors. This straightforward modification allows PiSSA to converge more rapidly than LoRA and ultimately attain superior performance. Moreover, PiSSA reduces the quantization error compared to QLoRA, leading to further enhancements. + +Configure the initialization method to "pissa", which may take several minutes to execute SVD on the pre-trained model: +```python +from peft import LoraConfig +config = LoraConfig(init_lora_weights="pissa", ...) +``` +Alternatively, execute fast SVD, which takes only a few seconds. The number of iterations determines the trade-off between the error and computation time: +```python +lora_config = LoraConfig(init_lora_weights="pissa_niter_[number of iters]", ...) +``` +For detailed instruction on using PiSSA, please follow [these instructions](https://github.com/huggingface/peft/tree/main/examples/pissa_finetuning). 
+ + + +[CorDA](https://huggingface.co/papers/2406.05223) builds task-aware LoRA adapters from weight decomposition oriented by the context of the downstream task to learn (instruction-previewed mode, IPM) or world knowledge to maintain (knowledge-preserved mode, KPM). +The KPM not only achieves better performance than LoRA on fine-tuning tasks, but also mitigates the catastrophic forgetting of pre-trained world knowledge. +When preserving pre-trained knowledge is not a concern, +the IPM is favored because it can further accelerate convergence and enhance the fine-tuning performance. + +You need to configure the initialization method to "corda", and specify the mode of IPM or KPM and the dataset to collect covariance matrices. + +```py +@torch.no_grad() +def run_model(): + # Assume `model` and `dataset` are in context... + model.eval() + for batch in dataset: + model(**batch) + + +corda_config = CordaConfig( + corda_method="kpm", +) +lora_config = LoraConfig( + init_lora_weights="corda", + corda_config=corda_config, +) +preprocess_corda(model, lora_config, run_model=run_model) +peft_model = get_peft_model(model, lora_config) +``` + +For detailed instructions on using CorDA, please follow [these instructions](https://github.com/huggingface/peft/tree/main/examples/corda_finetuning). 
+ + + +[EVA](https://huggingface.co/papers/2410.07170) performs SVD on the input activations of each layer and uses the right-singular vectors to initialize LoRA weights. It is therefore a data-driven initialization scheme. Furthermore, EVA adaptively allocates ranks across layers based on their "explained variance ratio" - a metric derived from the SVD analysis. + +You can use EVA by setting `init_lora_weights="eva"` and defining [`EvaConfig`] in [`LoraConfig`]: +```python +from peft import LoraConfig, EvaConfig +peft_config = LoraConfig( + init_lora_weights = "eva", + eva_config = EvaConfig(rho = 2.0), + ... +) +``` +The parameter `rho` (≥ 1.0) determines how much redistribution is allowed. When `rho=1.0` and `r=16`, LoRA adapters are limited to exactly 16 ranks, preventing any redistribution from occurring. A recommended value for EVA with redistribution is 2.0, meaning the maximum rank allowed for a layer is 2r. + +It is recommended to perform EVA initialization on an accelerator (e.g., CUDA GPU, Intel XPU) as it is much faster. To optimize the amount of available memory for EVA, you can use the `low_cpu_mem_usage` flag in [`get_peft_model`]: +```python +peft_model = get_peft_model(model, peft_config, low_cpu_mem_usage=True) +``` +Then, call [`initialize_lora_eva_weights`] to initialize the EVA weights (in most cases the dataloader used for EVA initialization can be the same as the one used for finetuning): +```python +initialize_lora_eva_weights(peft_model, dataloader) +``` +EVA works out of the box with bitsandbytes. Simply initialize the model with `quantization_config` and call [`initialize_lora_eva_weights`] as usual. + +> [!TIP] +> For further instructions on using EVA, please refer to our [documentation](https://github.com/huggingface/peft/tree/main/examples/eva_finetuning). 
+ + + +When quantizing the base model for QLoRA training, consider using the [LoftQ initialization](https://huggingface.co/papers/2310.08659), which has been shown to improve performance when training quantized models. The idea is that the LoRA weights are initialized such that the quantization error is minimized. To use LoftQ, follow [these instructions](https://github.com/huggingface/peft/tree/main/examples/loftq_finetuning). + +> [!TIP] +> Learn more about how PEFT works with quantization and how to use LoftQ in the [Quantization](../developer_guides/quantization) guide. + + + +Another way to initialize [`LoraConfig`] is with the [rank-stabilized LoRA (rsLoRA)](https://huggingface.co/papers/2312.03732) method. The LoRA architecture scales each adapter during every forward pass by a fixed scalar which is set at initialization and depends on the rank `r`. The scalar is given by `lora_alpha/r` in the original implementation, but rsLoRA uses `lora_alpha/math.sqrt(r)` which stabilizes the adapters and increases the performance potential from using a higher `r`. + +```py +from peft import LoraConfig + +config = LoraConfig(use_rslora=True, ...) +``` + + + +[LoRA-GA](https://hf.co/papers/2407.05000) (Low-Rank Adaptation with Gradient Approximation) initializes the adapter +weights by performing SVD on estimated gradients, so that the weights are aligning closer to full-finetuning for faster +convergence. 
+ +This method requires an initialization function to estimate the gradients +before beginning the actual training: + +```python +from peft.tuners.lora import preprocess_loraga + +def train_step(): + """Run forward and backward passes for gradient estimation.""" + dataloader_iter = iter(grad_dataloader) + for _ in range(N): + batch = next(dataloader_iter) + batch = {k: v.to(device) for k, v in batch.items()} + outputs = model(**batch) + loss = outputs.loss + loss.backward() + +preprocess_loraga(model, lora_config, train_step) +``` + +#### Usage Tips + +- **Gradient Estimation**: LoRA-GA requires a gradient estimation phase before model initialization. Use `preprocess_loraga()` with a `train_step` callback to compute gradients over a small number of training batches (typically 64-128 batches). + +- **Initialization Strategies**: LoRA-GA supports four direction strategies (`direction`): `"ArBr"`, `"A2rBr"`, `"ArB2r"` (default), and `"random"`, and four scaling strategies (`scale`): `"stable"` (default), `"weight_svd"`, `"gd_scale"`, and `"unit"`. The default combination provides the best balance of convergence speed and stability. + +- **Base Weight Modification**: Unlike standard LoRA, LoRA-GA modifies the base model weights during initialization by subtracting a scaled version of the low-rank approximation. This enables better alignment with full fine-tuning gradients. Since base weights are modified, use `save_pretrained()` with the `save_embedding_layers` argument or `save_mutated_as_lora` pattern to properly save the adapter. + +- **Computational Overhead**: The gradient estimation adds a small overhead during initialization (typically 1-2 minutes for 64 batches), but this is quickly amortized by faster convergence during training. + +- **Compatibility**: LoRA-GA requires full-precision weights and does not support quantized models. Can be combined with other LoRA variants like DoRA. 
+ + + ## Training This section shows how to handle more complex training scenarios instead of only applying a low-rank adapter @@ -719,7 +880,7 @@ Using this feature has some drawbacks, namely: ### Composing and Reusing LoRA Adapters #### Arrow -[Arrow](https://huggingface.co/papers/2405.11157) is a modular routing algorithm designed to combine multiple pre-trained task-specific LoRA adapters to solve a given task. Rather than merging all adapters naively, Arrow introduces a **gradient-free, token-wise mixture-of-experts (MoE) routing mechanism**. At inference time, it first computes a _prototype_ for each LoRA by extracting the top right singular vector from its SVD decomposition. Each token representation is then compared to these prototypes via cosine similarity to obtain routing coefficients. Tokens are assigned to the top-k most relevant LoRA adapters, with the coefficients normalized through softmax, and their outputs linearly combined. This allows effective reuse of existing LoRA modules for new tasks and leads to stronger zero-shot generalization. +[Arrow](https://huggingface.co/papers/2405.11157) is a modular routing algorithm designed to combine multiple pre-trained task-specific LoRA adapters to solve a given task, similar to [Polytropon](polytropon) but without the need for fine-tuning. Rather than merging all adapters naively, Arrow introduces a **gradient-free, token-wise mixture-of-experts (MoE) routing mechanism**. At inference time, it first computes a _prototype_ for each LoRA by extracting the top right singular vector from its SVD decomposition. Each token representation is then compared to these prototypes via cosine similarity to obtain routing coefficients. Tokens are assigned to the top-k most relevant LoRA adapters, with the coefficients normalized through softmax, and their outputs linearly combined. This allows effective reuse of existing LoRA modules for new tasks and leads to stronger zero-shot generalization. 
In PEFT, Arrow is enabled through [`ArrowConfig]` and `create_arrow_model`. You can also configure parameters such as `top_k` (the number of LoRA adapters combined per token), `router_temperature` (the softmax temperature applied to the routing coefficients), and `rng_seed` (for reproducibility). @@ -896,30 +1057,7 @@ To encode general knowledge, GenKnowSub subtracts the average of the provided ge [[autodoc]] tuners.lora.eva.get_eva_state_dict -## Variants - -### LoRA-GA - -[LoRA-GA](https://hf.co/papers/2407.05000) (Low-Rank Adaptation with Gradient Approximation) improves upon standard LoRA by using gradient information during initialization to achieve faster convergence. Instead of random initialization, LoRA-GA performs SVD on estimated gradients to initialize adapter weights in a direction that aligns with full fine-tuning, resulting in 2-4x faster convergence with the same final performance. - -The abstract from the paper is: - -*Low-rank adaptation (LoRA) is a popular technique for parameter-efficient fine-tuning of large language models. However, LoRA's random initialization of adapter weights leads to slow convergence during the initial training phase. In this paper, we propose LoRA-GA (Low-Rank Adaptation with Gradient Approximation), a novel initialization method that leverages gradient information to initialize LoRA adapters. Specifically, we estimate gradients on a small set of training samples and perform singular value decomposition (SVD) to extract principal components. These components are used to initialize the adapter matrices, aligning the initial update direction with that of full fine-tuning. Our experiments across various tasks and model scales demonstrate that LoRA-GA achieves 2-4x faster convergence compared to standard LoRA while maintaining the same final performance. 
The method is orthogonal to existing LoRA variants and can be easily integrated with techniques like DoRA and LoRA+.* - -#### Usage Tips - -- **Gradient Estimation**: LoRA-GA requires a gradient estimation phase before model initialization. Use `preprocess_loraga()` with a `train_step` callback to compute gradients over a small number of training batches (typically 64-128 batches). - - -- **Initialization Strategies**: LoRA-GA supports four direction strategies (`direction`): `"ArBr"`, `"A2rBr"`, `"ArB2r"` (default), and `"random"`, and four scaling strategies (`scale`): `"stable"` (default), `"weight_svd"`, `"gd_scale"`, and `"unit"`. The default combination provides the best balance of convergence speed and stability. - -- **Base Weight Modification**: Unlike standard LoRA, LoRA-GA modifies the base model weights during initialization by subtracting a scaled version of the low-rank approximation. This enables better alignment with full fine-tuning gradients. Since base weights are modified, use `save_pretrained()` with the `save_embedding_layers` argument or `save_mutated_as_lora` pattern to properly save the adapter. - -- **Computational Overhead**: The gradient estimation adds a small overhead during initialization (typically 1-2 minutes for 64 batches), but this is quickly amortized by faster convergence during training. - -- **Compatibility**: LoRA-GA requires full-precision weights and does not support quantized models. Can be combined with other LoRA variants like DoRA. - -#### LoraGAConfig +### LoraGAConfig [[autodoc]] tuners.lora.config.LoraGAConfig diff --git a/docs/source/package_reference/lora_variant_dora.md b/docs/source/package_reference/lora_variant_dora.md new file mode 100644 index 0000000000..281b2c0a49 --- /dev/null +++ b/docs/source/package_reference/lora_variant_dora.md @@ -0,0 +1,80 @@ + + +# Weight-Decomposed Low-Rank Adaptation (DoRA) + +This technique decomposes the updates of the weights into two parts, magnitude and direction. 
Direction is handled by normal LoRA, whereas the magnitude is handled by a separate learnable parameter. This can improve the performance of LoRA, especially at low ranks. For more information on DoRA, see https://huggingface.co/papers/2402.09353. + +```py +from peft import LoraConfig + +config = LoraConfig(use_dora=True, ...) +``` + +If parts of the model or the DoRA adapter are offloaded to CPU, you can get a significant speedup at the cost of some temporary (ephemeral) VRAM overhead by using `ephemeral_gpu_offload=True` in `config.runtime_config`. + +```py +from peft import LoraConfig, LoraRuntimeConfig + +config = LoraConfig(use_dora=True, runtime_config=LoraRuntimeConfig(ephemeral_gpu_offload=True), ...) +``` + +A `PeftModel` with a DoRA adapter can also be loaded with the `ephemeral_gpu_offload=True` flag using the `from_pretrained` method as well as the `load_adapter` method. + +```py +from peft import PeftModel + +model = PeftModel.from_pretrained(base_model, peft_model_id, ephemeral_gpu_offload=True) +``` + +## Optimization + +DoRA is optimized (computes faster and takes less memory) for models in the evaluation mode, or when dropout is set to 0. We reuse the +base result at those times to get the speedup. +Running [dora finetuning](https://github.com/huggingface/peft/blob/main/examples/dora_finetuning/dora_finetuning.py) +with `CUDA_VISIBLE_DEVICES=0 ZE_AFFINITY_MASK=0 time python examples/dora_finetuning/dora_finetuning.py --quantize --lora_dropout 0 --batch_size 16 --eval_step 2 --use_dora` on a 4090 with gradient accumulation set to 2 and max step to 20 resulted in the following observations: + +| | Without Optimization | With Optimization | +| :--: | :--: | :--: | +| train runtime (sec) | 359.7298 | **279.2676** | +| train samples per second | 1.779 | **2.292** | +| train steps per second | 0.056 | **0.072** | + +Moreover, it is possible to further increase runtime performance of DoRA by using the [`DoraCaching`] helper context. 
This requires the model to be in `eval` mode: + +```py +from peft.helpers import DoraCaching + +model.eval() +with DoraCaching(): + output = model(inputs) +``` + +For [`meta-llama/Llama-3.1-8B`](https://huggingface.co/meta-llama/Llama-3.1-8B), the [DoRA caching benchmark script](https://github.com/huggingface/peft/blob/main/examples/dora_finetuning/dora-caching.py) shows that, compared to LoRA: + +- DoRA without caching requires 139% more time +- DoRA without caching requires 4% more memory +- DoRA with caching requires 17% more time +- DoRA with caching requires 41% more memory + +Caching can thus make inference with DoRA significantly faster but it also requires significantly more memory. Ideally, if the use case allows it, just merge the DoRA adapter to avoid both memory and runtime overhead. + +## Caveats + +- DoRA only supports embedding, linear, and Conv2d layers at the moment. +- DoRA introduces a bigger overhead than pure LoRA, so it is recommended to merge weights for inference, see [`LoraModel.merge_and_unload`]. +- DoRA should work with weights quantized with bitsandbytes ("QDoRA"). However, issues have been reported when using QDoRA with DeepSpeed Zero2. + diff --git a/docs/source/package_reference/lora_variant_velora.md b/docs/source/package_reference/lora_variant_velora.md new file mode 100644 index 0000000000..8658108cd4 --- /dev/null +++ b/docs/source/package_reference/lora_variant_velora.md @@ -0,0 +1,49 @@ + + +### VeLoRA + +[VeLoRA](https://huggingface.co/papers/2405.17991) is a LoRA variant that reduces training memory by compressing the activations saved for the LoRA in the forward pass and then reconstructing them in the backward pass to implement the update rules. In PEFT, VeLoRA is configured as a LoRA variant through the `velora_config` argument on [`LoraConfig`]. 
+ +```py +from peft import LoraConfig, VeloraConfig + +config = LoraConfig( + target_modules=["q_proj", "v_proj"], + velora_config=VeloraConfig( + num_groups=64, + scale=0.2, + init_type="batch_average", + ), +) +``` + +VeLoRA is applied to every LoRA layer selected by `target_modules`. `num_groups` controls how the input activation depth is split before compression. If the activation depth is not evenly divisible by `num_groups`, VeLoRA pads the grouped representation internally and removes the padding after reconstruction. `scale` rescales the reconstructed activations during the backward pass, and `init_type` chooses how the projection is initialized. + +Use `batch_average_once` to initialize the projection from the first training batch, `batch_average` to update it from every training forward pass, or `random` to initialize it immediately from a random normalized vector. + +Below are some results with the [MetaMathQA benchmark](https://github.com/huggingface/peft/tree/main/method_comparison/MetaMathQA). + +| Variant | Training Loss | Max Memory (GiB) | Tokens/sec | +|---|---:|---:|---:| +| LoRA | 0.5427 | 27.69 | 2366.2 | +| LoRA + GC | 0.5426 | 13.17 | 1671.8 | +| LoRA+VeLoRA | 0.5427 | 19.94 | 2057.6 | + +#### Caveats + +- VeLoRA is currently supported on standard LoRA linear layers only. + diff --git a/docs/source/package_reference/oft.md b/docs/source/package_reference/oft.md index 63909b202b..7cbebcbd13 100644 --- a/docs/source/package_reference/oft.md +++ b/docs/source/package_reference/oft.md @@ -16,12 +16,30 @@ rendered properly in your Markdown viewer. # OFT -[Orthogonal Finetuning (OFT)](https://hf.co/papers/2306.07280) is a method developed for adapting text-to-image diffusion models. It works by reparameterizing the pretrained weight matrices with its orthogonal matrix to preserve information in the pretrained model. To reduce the number of parameters, OFT introduces a block-diagonal structure in the orthogonal matrix. +
+ +
+Controlling Text-to-Image Diffusion by Orthogonal Finetuning + +[Orthogonal Finetuning (OFT)](https://hf.co/papers/2306.07280) is a method developed for adapting text-to-image diffusion models. It works by reparameterizing the pretrained weight matrices with its orthogonal matrix to preserve information in the pretrained model. To reduce the number of parameters, OFT introduces a block-diagonal structure in the orthogonal matrix. The method primarily focuses on preserving a pretrained model's generative performance in the finetuned model. It tries to maintain the same cosine similarity (hyperspherical energy) between all pairwise neurons in a layer because this better captures the semantic information among neurons. This means OFT is more capable at preserving the subject and it is better for controllable generation (similar to [ControlNet](https://huggingface.co/docs/diffusers/using-diffusers/controlnet)). The abstract from the paper is: *Large text-to-image diffusion models have impressive capabilities in generating photorealistic images from text prompts. How to effectively guide or control these powerful models to perform different downstream tasks becomes an important open problem. To tackle this challenge, we introduce a principled finetuning method -- Orthogonal Finetuning (OFT), for adapting text-to-image diffusion models to downstream tasks. Unlike existing methods, OFT can provably preserve hyperspherical energy which characterizes the pairwise neuron relationship on the unit hypersphere. We find that this property is crucial for preserving the semantic generation ability of text-to-image diffusion models. To improve finetuning stability, we further propose Constrained Orthogonal Finetuning (COFT) which imposes an additional radius constraint to the hypersphere. 
Specifically, we consider two important finetuning text-to-image tasks: subject-driven generation where the goal is to generate subject-specific images given a few images of a subject and a text prompt, and controllable generation where the goal is to enable the model to take in additional control signals. We empirically show that our OFT framework outperforms existing methods in generation quality and convergence speed*. +OFT preserves the hyperspherical energy by learning an orthogonal transformation for neurons to keep the cosine similarity between them unchanged. In practice, this means taking the matrix product of an orthogonal matrix with the pretrained weight matrix. However, to be parameter-efficient, the orthogonal matrix is represented as a block-diagonal matrix with rank `r` blocks. Whereas LoRA reduces the number of trainable parameters with low-rank structures, OFT reduces the number of trainable parameters with a sparse block-diagonal matrix structure. + +## Benchmark overview + + + +# API + ## OFTConfig [[autodoc]] tuners.oft.config.OFTConfig diff --git a/docs/source/package_reference/poly.md b/docs/source/package_reference/poly.md index a4cf28ce56..3dd2a20c9b 100644 --- a/docs/source/package_reference/poly.md +++ b/docs/source/package_reference/poly.md @@ -35,6 +35,10 @@ The abstract from the paper is: +In case you want to try out routing without training first, you can check out the [Arrow LoRA variant](./lora#Arrow). 
+ +# API + ## PolyConfig [[autodoc]] tuners.poly.config.PolyConfig From 2cb339fe5b32aef86fe6fe5933386a5a37aadead Mon Sep 17 00:00:00 2001 From: nemo Date: Tue, 2 Jun 2026 21:34:52 +0200 Subject: [PATCH 06/33] More moving adapter stuff --- docs/source/_toctree.yml | 4 +- docs/source/conceptual_guides/adapter.md | 11 ---- .../source/package_reference/llama_adapter.md | 18 +++++- docs/source/package_reference/loha.md | 5 ++ docs/source/package_reference/lokr.md | 5 ++ docs/source/package_reference/lora.md | 22 +++---- docs/source/task_guides/lora_based_methods.md | 64 ------------------- 7 files changed, 39 insertions(+), 90 deletions(-) diff --git a/docs/source/_toctree.yml b/docs/source/_toctree.yml index e171b85aa4..8ba0e54a72 100644 --- a/docs/source/_toctree.yml +++ b/docs/source/_toctree.yml @@ -64,6 +64,8 @@ title: Cartridges - local: package_reference/prompt_tuning title: Prompt tuning + - local: package_reference/llama_adapter + title: Llama-Adapter title: Soft Prompting - sections: - local: package_reference/adalora @@ -82,8 +84,6 @@ title: LoRA - local: package_reference/ia3 title: IA3 - - local: package_reference/llama_adapter - title: Llama-Adapter - local: package_reference/loha title: LoHa - local: package_reference/lokr diff --git a/docs/source/conceptual_guides/adapter.md b/docs/source/conceptual_guides/adapter.md index efe43ea073..337e95b7e5 100644 --- a/docs/source/conceptual_guides/adapter.md +++ b/docs/source/conceptual_guides/adapter.md @@ -20,15 +20,8 @@ Adapter-based methods add extra trainable parameters after the attention and ful This guide will give you a brief overview of the adapter methods supported by PEFT (if you're interested in learning more details about a specific method, take a look at the linked paper). -## Low-Rank Adaptation (LoRA) - -
- -
-Navigating Text-To-Image Customization: From LyCORIS Fine-Tuning to Model Evaluation - ## Mixture of LoRA Experts (X-LoRA) [X-LoRA](https://huggingface.co/papers/2402.07148) is a mixture of experts method for LoRA which works by using dense or sparse gating to dynamically activate LoRA experts. The LoRA experts as well as the base model are frozen during training, resulting in a low parameter count as only the gating layers must be trained. In particular, the gating layers output scalings which (depending on config) are granular on the layer and token level. Additionally, during inference, X-LoRA dynamically activates LoRA adapters to recall knowledge and effectively mix them: @@ -52,10 +45,6 @@ Ultimately, X-LoRA allows the model to reflect upon its knowledge because of the A set of learnable adaption prompts are prefixed to the input instruction tokens. These are inserted into the upper layers of the model because it is better to learn with the higher-level semantics of the pretrained model. The instruction-output tokens prefixed to the input guide the adaption prompt to generate a contextual response. -
- -
-LLaMA-Adapter: Efficient Fine-tuning of Language Models with Zero-init Attention To avoid adding noise to the tokens, the adapter uses zero-initialized attention. On top of this, the adapter adds a learnable gating factor (initialized with zeros) to progressively add information to the model during training. This prevents overwhelming the model's pretrained knowledge with the newly learned instructions. diff --git a/docs/source/package_reference/llama_adapter.md b/docs/source/package_reference/llama_adapter.md index 52e6c537b2..7ad1f8406a 100644 --- a/docs/source/package_reference/llama_adapter.md +++ b/docs/source/package_reference/llama_adapter.md @@ -16,16 +16,32 @@ rendered properly in your Markdown viewer. # Llama-Adapter +
+ +
+LLaMA-Adapter: Efficient Fine-tuning of Language Models with Zero-init Attention + [Llama-Adapter](https://hf.co/papers/2303.16199) is a PEFT method specifically designed for turning Llama into an instruction-following model. The Llama model is frozen and only a set of adaptation prompts prefixed to the input instruction tokens are learned. Since randomly initialized modules inserted into the model can cause the model to lose some of its existing knowledge, Llama-Adapter uses zero-initialized attention with zero gating to progressively add the instructional prompts to the model. The abstract from the paper is: *We present LLaMA-Adapter, a lightweight adaption method to efficiently fine-tune LLaMA into an instruction-following model. Using 52K self-instruct demonstrations, LLaMA-Adapter only introduces 1.2M learnable parameters upon the frozen LLaMA 7B model, and costs less than one hour for fine-tuning on 8 A100 GPUs. Specifically, we adopt a set of learnable adaption prompts, and prepend them to the input text tokens at higher transformer layers. Then, a zero-init attention mechanism with zero gating is proposed, which adaptively injects the new instructional cues into LLaMA, while effectively preserves its pre-trained knowledge. With efficient training, LLaMA-Adapter generates high-quality responses, comparable to Alpaca with fully fine-tuned 7B parameters. Furthermore, our approach can be simply extended to multi-modal input, e.g., images, for image-conditioned LLaMA, which achieves superior reasoning capacity on ScienceQA. We release our code at https://github.com/ZrrSkywalker/LLaMA-Adapter*. 
+## Benchmark overview + + + +# API + ## AdaptionPromptConfig [[autodoc]] tuners.adaption_prompt.config.AdaptionPromptConfig ## AdaptionPromptModel -[[autodoc]] tuners.adaption_prompt.model.AdaptionPromptModel \ No newline at end of file +[[autodoc]] tuners.adaption_prompt.model.AdaptionPromptModel diff --git a/docs/source/package_reference/loha.md b/docs/source/package_reference/loha.md index 0a5fb52be1..b389f3c0a0 100644 --- a/docs/source/package_reference/loha.md +++ b/docs/source/package_reference/loha.md @@ -16,6 +16,11 @@ rendered properly in your Markdown viewer. # LoHa +
+ +
+Navigating Text-To-Image Customization: From LyCORIS Fine-Tuning to Model Evaluation + Low-Rank Hadamard Product ([LoHa](https://huggingface.co/papers/2108.06098)), is similar to LoRA except it approximates the large weight matrix with more low-rank matrices and combines them with the Hadamard product. This method is even more parameter-efficient than LoRA and achieves comparable performance. LoHa was originally proposed for federated learning (FedPara) but works well as a general-purpose PEFT method, and is especially popular for fine-tuning image generation models such as Stable Diffusion. > **Note:** LoHa is part of the [LyCORIS](./adapter_utils) family of adapters. Its close relative [LoKr](./lokr) uses the Kronecker product instead of the Hadamard product. diff --git a/docs/source/package_reference/lokr.md b/docs/source/package_reference/lokr.md index 679d35b716..c4b9192012 100644 --- a/docs/source/package_reference/lokr.md +++ b/docs/source/package_reference/lokr.md @@ -16,6 +16,11 @@ rendered properly in your Markdown viewer. # LoKr +
+ +
+Navigating Text-To-Image Customization: From LyCORIS Fine-Tuning to Model Evaluation + Low-Rank Kronecker Product ([LoKr](https://hf.co/papers/2309.14859)), is a LoRA-variant method that approximates the large weight matrix with two low-rank matrices and combines them with the [Kronecker product](https://en.wikipedia.org/wiki/Kronecker_product). LoKr also provides an optional third low-rank matrix to provide better control during fine-tuning. By expresseing the weight update matrix as a decomposition of a Kronecker product, creating a block matrix, LoKr is able to preserve the rank of the original weight matrix. The size of the smaller matrices are determined by its *rank* or `r`. Another benefit of the Kronecker product is that it can be vectorized by stacking the matrix columns. This can speed up the process because you're avoiding fully reconstructing ∆W. The abstract from the paper is: diff --git a/docs/source/package_reference/lora.md b/docs/source/package_reference/lora.md index 85a868a7d6..ea722f696e 100644 --- a/docs/source/package_reference/lora.md +++ b/docs/source/package_reference/lora.md @@ -1,4 +1,4 @@ - - -# Adapters - -Adapter-based methods add extra trainable parameters after the attention and fully-connected layers of a frozen pretrained model to reduce memory-usage and speed up training. The method varies depending on the adapter, it could simply be an extra added layer or it could be expressing the weight updates ∆W as a low-rank decomposition of the weight matrix. Either way, the adapters are typically small but demonstrate comparable performance to a fully finetuned model and enable training larger models with fewer resources. - -This guide will give you a brief overview of the adapter methods supported by PEFT (if you're interested in learning more details about a specific method, take a look at the linked paper). 
- - - -## Mixture of LoRA Experts (X-LoRA) - -[X-LoRA](https://huggingface.co/papers/2402.07148) is a mixture of experts method for LoRA which works by using dense or sparse gating to dynamically activate LoRA experts. The LoRA experts as well as the base model are frozen during training, resulting in a low parameter count as only the gating layers must be trained. In particular, the gating layers output scalings which (depending on config) are granular on the layer and token level. Additionally, during inference, X-LoRA dynamically activates LoRA adapters to recall knowledge and effectively mix them: - -The below graphic demonstrates how the scalings change for different prompts for each token. This highlights the activation of different adapters as the generation progresses and the sequence creates new context. - -![Token-by-token scalings](https://github.com/EricLBuehler/xlora/raw/master/res/token_by_token_scalings.gif) - -For each step, X-LoRA requires the base model to be run twice: first, to get hidden states without any LoRA adapters, and secondly, the hidden states are used to calculate scalings which are applied to the LoRA adapters and the model is run a second time. The output of the second run is the result of the model step. - -Ultimately, X-LoRA allows the model to reflect upon its knowledge because of the dual forward pass scheme, and dynamically reconfigure the architecture. - - - - - - -## Llama-Adapter - -[Llama-Adapter](https://hf.co/papers/2303.16199) is a method for adapting Llama into an instruction-following model. To help adapt the model for instruction-following, the adapter is trained with a 52K instruction-output dataset. - -A set of learnable adaption prompts are prefixed to the input instruction tokens. These are inserted into the upper layers of the model because it is better to learn with the higher-level semantics of the pretrained model. 
The instruction-output tokens prefixed to the input guide the adaption prompt to generate a contextual response. - - -To avoid adding noise to the tokens, the adapter uses zero-initialized attention. On top of this, the adapter adds a learnable gating factor (initialized with zeros) to progressively add information to the model during training. This prevents overwhelming the model's pretrained knowledge with the newly learned instructions. - -## Householder Reflection Adaptation (HRA) - -[HRA](https://huggingface.co/papers/2405.17484) provides a new perspective connecting LoRA to OFT, which means it can harness the advantages of both strategies, reduce parameters and computation costs while penalizing the loss of pre-training knowledge. - -
- -
-Bridging The Gap between Low-rank and Orthogonal Adaptation via Householder Reflection Adaptation - -HRA constructs a chain of `r` trainable Householder reflections (HRs). Because the Householder reflection matrix is an orthogonal matrix and the product of orthogonal matrices is also an orthogonal matrix, HRA satisfies the theoretical guarantee of Orthogonal Finetuning (OFT). Meanwhile, HRA can also be viewed as a low-rank fine-tuning adapter by rewriting formula. - -The higher `r`, the more trainable parameters, resulting in a larger model capacity and better performance. Besides, due to the chain structure, the orthogonality of HR planes impacts the capacity and regularity of HRA. To achieve a trade-off between the model capacity and regularity, an orthogonality regularizer of the HR planes is added to the loss function. The weight \\(\lambda\\) can control the strength of the regularizer. - -## Bone - -Bone was deprecated and removed in PEFT v0.19.0 in favor of [MiSS](https://huggingface.co/papers/2409.15371) (new version of paper: "MiSS: Balancing LoRA Performance and Efficiency with Simple Shard Sharing"). If you already have a Bone checkpoint, you can use `/scripts/convert-bone-to-miss.py` to convert it into a MiSS checkpoint and proceed with training using MiSS. - -## MiSS -[MiSS](https://github.com/Joluck/MiSS) Matrix Shard Sharing is a novel Parameter-Efficient Fine-Tuning (PEFT) method designed to address the trade-off between adaptability and efficiency in Large Language Models. The core approach of MiSS involves a simple shard-sharing mechanism. It achieves low-rank adaptation by decomposing a weight matrix into multiple fragments and then utilizing a shared, trainable "common fragment." The final low-rank update matrix is constructed by replicating these shared, partitioned shards. 
(MiSS is a novel PEFT method that adopts a low-rank structure, requires only a single trainable matrix, and introduces a new update mechanism distinct from LoRA, achieving an excellent balance between performance and efficiency.) - -MiSS: Revisiting the Trade-off in LoRA with an Efficient Shard-Sharing Structure - -Intuitively, the shape of a single trainable matrix in MiSS is consistent with `lora_B`, so the `r` parameter in MiSS is less than the `r` in LoRA by (`in_feature * r`). - -Note: Bat's r (b) is special and requires that weight W satisfies the conditions `in_features % r == 0` and `out_features % r == 0`. Additionally, when `in_features == out_features` and MiSS-r equals LoRA-r, MiSS's number of trainable parameters is only half that of LoRA. - -Although the nonlinear updates of Bat bring some performance improvements, they also increase computational overhead. Its main purpose is to provide researchers with a direction for improvement. Therefore, we recommend fine-tuning the comprehensive MiSS model instead. diff --git a/docs/source/methods/overview.md b/docs/source/methods/overview.md index 9d0a340990..54e8fd564f 100644 --- a/docs/source/methods/overview.md +++ b/docs/source/methods/overview.md @@ -46,7 +46,6 @@ A popular way to realize adapters is to insert smaller trainable matrices that a Low-rank adapters are only one possible adapter formualation, PEFT implements many other types of adapters as well. For example, Orthogonal Fine-Tuning methods ([OFT](../package_reference/oft), [BOFT](../package_reference/boft), ...) use orthogonal decompositions of the adapter weights to achieve small size. Methods like [MiSS](../package_reference/miss) shard matrices and share these shards to save on memory. [IA3](../package_reference/ia3) just introduces three trainable vectors to steer the original model. 
- ## Prompt-based methods Prompting primes a frozen pretrained model for a specific downstream task by including a text prompt that describes the task or even demonstrates an example of the task. With prompting, you can avoid fully training a separate model for each downstream task, and use the same frozen pretrained model instead. This is a lot easier because you can use the same model for several different tasks, and it is significantly more efficient to train and store a smaller set of prompt parameters than to train all the model's parameters. diff --git a/docs/source/package_reference/hra.md b/docs/source/package_reference/hra.md index fa499069b4..59a35362bd 100644 --- a/docs/source/package_reference/hra.md +++ b/docs/source/package_reference/hra.md @@ -16,13 +16,21 @@ rendered properly in your Markdown viewer. # Bridging The Gap between Low-rank and Orthogonal Adaptation via Householder Reflection Adaptation (HRA) -[HRA](https://huggingface.co/papers/2405.17484) is a simple but effective adapter-based fine-tuning method by leveraging Householder reflections. This method harnesses the advantages of both strategies, reducing parameters and computation costs while penalizing the loss of pre-training knowledge. It consistently achieves better performance with fewer trainable parameters and outperforms state-of-the-art adapters across different models, including large language models (LLMs) and conditional image generators. +
+ +
+Bridging The Gap between Low-rank and Orthogonal Adaptation via Householder Reflection Adaptation +[HRA](https://huggingface.co/papers/2405.17484) provides a new perspective connecting LoRA to OFT, which means it can harness the advantages of both strategies, by leveraging [Householder reflections](https://en.wikipedia.org/wiki/Householder_transformation) to reduce parameters and computation costs while penalizing the loss of pre-training knowledge. It consistently achieves better performance with fewer trainable parameters and outperforms state-of-the-art adapters across different models, including large language models (LLMs) and conditional image generators. + +HRA constructs a chain of `r` trainable Householder reflections (HRs). Because the Householder reflection matrix is an orthogonal matrix and the product of orthogonal matrices is also an orthogonal matrix, HRA satisfies the theoretical guarantee of Orthogonal Finetuning (OFT). Meanwhile, HRA can also be viewed as a low-rank fine-tuning adapter. The higher `r`, the more trainable parameters, resulting in a larger model capacity and better performance. Besides, due to the chain structure, the orthogonality of HR planes impacts the capacity and regularity of HRA. To achieve a trade-off between the model capacity and regularity, an orthogonality regularizer of the HR planes is added to the loss function. The weight \\(\lambda\\) can control the strength of the regularizer. The abstract from the paper is: > While following different technical routes, both low-rank and orthogonal adaptation techniques can efficiently adapt large-scale pre-training models in specific tasks or domains based on a small piece of trainable parameters. In this study, we bridge the gap between these two techniques, proposing a simple but effective adaptation method based on Householder reflections. 
Given a pre-trained model, our method fine-tunes its layers by multiplying each frozen weight matrix with an orthogonal matrix constructed by a chain of learnable Householder reflections (HRs). This HR-based orthogonal fine-tuning is equivalent to an adaptive low-rank adaptation. Moreover, we show that the orthogonality of the reflection planes corresponding to the HRs impacts the model capacity and regularity. The analysis motivates us to regularize the orthogonality of the HRs, leading to different implementations of the proposed Householder reflection adaptation (HRA) method. Compared with state-of-the-art methods, HRA achieves superior performance with fewer learnable parameters when adapting large language models and conditional image generators. The code is available at [peft](https://github.com/huggingface/peft/tree/main/src/peft/tuners/hra) and [HRA](https://github.com/DaShenZi721/HRA). +# API + ## HRAConfig [[autodoc]] tuners.hra.config.HRAConfig diff --git a/docs/source/package_reference/miss.md b/docs/source/package_reference/miss.md index 8226a4acd2..97589f095e 100644 --- a/docs/source/package_reference/miss.md +++ b/docs/source/package_reference/miss.md @@ -16,12 +16,36 @@ rendered properly in your Markdown viewer. # MiSS -MiSS: Balancing LoRA Performance and Efficiency with Simple Shard Sharing([MiSS](https://huggingface.co/papers/2409.15371)) is a novel PEFT method that adopts a low-rank structure, requires only a single trainable matrix, and introduces a new update mechanism distinct from LoRA, achieving an excellent balance between performance and efficiency. +MiSS: Balancing LoRA Performance and Efficiency with Simple Shard Sharing ([MiSS](https://huggingface.co/papers/2409.15371)) is a novel PEFT method that adopts a low-rank structure, requires only a single trainable matrix, and introduces a new update mechanism distinct from LoRA, achieving an excellent balance between performance and efficiency. 
+ +The core approach of MiSS involves a simple shard-sharing mechanism. It achieves low-rank adaptation by decomposing a weight matrix into multiple fragments and then utilizing a shared, trainable "common fragment." The final low-rank update matrix is constructed by replicating these shared, partitioned shards. + +Intuitively, the shape of a single trainable matrix in MiSS is consistent with `lora_B`, so the number of trainable parameters in MiSS is less than that of LoRA with the same `r` by (`in_features * r`). + +Note: Bat's r (b) is special and requires that weight W satisfies the conditions `in_features % r == 0` and `out_features % r == 0`. Additionally, when `in_features == out_features` and MiSS-r equals LoRA-r, MiSS's number of trainable parameters is only half that of LoRA. + +Although the nonlinear updates of Bat bring some performance improvements, they also increase computational overhead. Its main purpose is to provide researchers with a direction for improvement. Therefore, we recommend fine-tuning the comprehensive MiSS model instead. The abstract from the paper is: *Parameter-Efficient Fine-Tuning (PEFT) methods, particularly Low-Rank Adaptation (LoRA), effectively reduce the number of trainable parameters in Large Language Models (LLMs). However, as model scales continue to grow, the demand for computational resources remains a significant challenge. Existing LoRA variants often struggle to strike an optimal balance between adaptability (model performance and convergence speed) and efficiency (computational overhead, memory usage, and initialization time). This paper introduces MiSS(Matrix Shard Sharing ), a novel PEFT approach that addresses this trade-off through a simple shard-sharing mechanism. MiSS leverages the insight that a low-rank adaptation can be achieved by decomposing the weight matrix into multiple fragment matrices and utilizing a shared, trainable common fragment. 
This method constructs the low-rank update matrix through the replication of these shared, partitioned shards. We also propose a hardware-efficient and broadly applicable implementation for MiSS. Extensive experiments conducted on a range of tasks, alongside a systematic analysis of computational performance, demonstrate MiSS's superiority. The results show that MiSS significantly outperforms standard LoRA and its prominent variants in both model performance metrics and computational efficiency, including initialization speed and training throughput. By effectively balancing expressive power and resource utilization, MiSS offers a compelling solution for efficiently adapting large-scale models*. +> [!NOTE] +> **Contributions welcome**: This section needs clarification. +> +> This section is too steep to understand, it needs a visualization and a better introduction into the key aspects of MiSS to be understandable. +> See [here](../developer_guides/contributing#documentation-improvements) on how to contribute. + +## Benchmark overview + + + +# API ## MissConfig @@ -29,4 +53,4 @@ The abstract from the paper is: ## MissModel -[[autodoc]] tuners.miss.model.MissModel \ No newline at end of file +[[autodoc]] tuners.miss.model.MissModel diff --git a/docs/source/package_reference/xlora.md b/docs/source/package_reference/xlora.md index f4710ab6fa..0ebfb744d0 100644 --- a/docs/source/package_reference/xlora.md +++ b/docs/source/package_reference/xlora.md @@ -24,6 +24,10 @@ The below graphic demonstrates how the scalings change for different prompts for ![Token-by-token scalings](https://github.com/EricLBuehler/xlora/raw/master/res/token_by_token_scalings.gif) +For each step, X-LoRA requires the base model to be run twice: first, to get hidden states without any LoRA adapters, and secondly, the hidden states are used to calculate scalings which are applied to the LoRA adapters and the model is run a second time. The output of the second run is the result of the model step. 
+ +Ultimately, X-LoRA allows the model to reflect upon its knowledge because of the dual forward pass scheme, and dynamically reconfigure the architecture. + The abstract from the paper is: *We report a mixture of expert strategy to create fine-tuned large language models using a deep layer-wise token-level approach based on low-rank adaptation (LoRA). Starting with a set of pre-trained LoRA adapters, our gating strategy uses the hidden states to dynamically mix adapted layers, allowing the resulting X-LoRA model to draw upon different capabilities and create never-before-used deep layer-wise combinations to solve tasks. The design is inspired by the biological principles of universality and diversity, where neural network building blocks are reused in different hierarchical manifestations. Hence, the X-LoRA model can be easily implemented for any existing large language model (LLM) without a need for modifications of the underlying structure. We develop a tailored X-LoRA model that offers scientific capabilities including forward/inverse analysis tasks and enhanced reasoning capability, focused on biomaterial analysis, protein mechanics and design. The impact of this work include access to readily expandable and adaptable models with strong domain knowledge and the capability to integrate across areas of knowledge. Featuring experts in biology, mathematics, reasoning, bio-inspired materials, mechanics and materials, chemistry, protein biophysics, mechanics and quantum-mechanics based molecular properties, we conduct a series of physics-focused case studies. We examine knowledge recall, protein mechanics forward/inverse tasks, protein design, adversarial agentic modeling including ontological knowledge graph construction, as well as molecular design. 
The model is capable not only of making quantitative predictions of nanomechanical properties of proteins or quantum mechanical molecular properties, but also reasons over the results and correctly predicts likely mechanisms that explain distinct molecular behaviors.*. @@ -47,6 +51,8 @@ Please cite X-LoRA as: } ``` +# API + ## XLoraConfig [[autodoc]] tuners.xlora.config.XLoraConfig diff --git a/docs/source/task_guides/lora_based_methods.md b/docs/source/task_guides/lora_based_methods.md deleted file mode 100644 index 6901568dcc..0000000000 --- a/docs/source/task_guides/lora_based_methods.md +++ /dev/null @@ -1,280 +0,0 @@ - - -# LoRA methods - - - -This guide will show you how to quickly train an image classification model - with a low-rank decomposition method - to identify the class of food shown in an image. - -> [!TIP] -> Some familiarity with the general process of training an image classification model would be really helpful and allow you to focus on the low-rank decomposition methods. If you're new, we recommend taking a look at the [Image classification](https://huggingface.co/docs/transformers/tasks/image_classification) guide first from the Transformers documentation. When you're ready, come back and see how easy it is to drop PEFT in to your training! - -Before you begin, make sure you have all the necessary libraries installed. - -```bash -pip install -q peft transformers datasets -``` - -## Dataset - -In this guide, you'll use the [Food-101](https://huggingface.co/datasets/food101) dataset which contains images of 101 food classes (take a look at the [dataset viewer](https://huggingface.co/datasets/food101/viewer/default/train) to get a better idea of what the dataset looks like). - -Load the dataset with the [`~datasets.load_dataset`] function. 
- -```py -from datasets import load_dataset - -ds = load_dataset("food101") -``` - -Each food class is labeled with an integer, so to make it easier to understand what these integers represent, you'll create a `label2id` and `id2label` dictionary to map the integer to its class label. - -```py -labels = ds["train"].features["label"].names -label2id, id2label = dict(), dict() -for i, label in enumerate(labels): - label2id[label] = i - id2label[i] = label - -id2label[2] -"baklava" -``` - -Load an image processor to properly resize and normalize the pixel values of the training and evaluation images. - -```py -from transformers import AutoImageProcessor - -image_processor = AutoImageProcessor.from_pretrained("google/vit-base-patch16-224-in21k") -``` - -You can also use the image processor to prepare some transformation functions for data augmentation and pixel scaling. - -```py -from torchvision.transforms import ( - CenterCrop, - Compose, - Normalize, - RandomHorizontalFlip, - RandomResizedCrop, - Resize, - ToTensor, -) - -normalize = Normalize(mean=image_processor.image_mean, std=image_processor.image_std) -train_transforms = Compose( - [ - RandomResizedCrop(image_processor.size["height"]), - RandomHorizontalFlip(), - ToTensor(), - normalize, - ] -) - -val_transforms = Compose( - [ - Resize(image_processor.size["height"]), - CenterCrop(image_processor.size["height"]), - ToTensor(), - normalize, - ] -) - -def preprocess_train(example_batch): - example_batch["pixel_values"] = [train_transforms(image.convert("RGB")) for image in example_batch["image"]] - return example_batch - -def preprocess_val(example_batch): - example_batch["pixel_values"] = [val_transforms(image.convert("RGB")) for image in example_batch["image"]] - return example_batch -``` - -Define the training and validation datasets, and use the [`~datasets.Dataset.set_transform`] function to apply the transformations on-the-fly. 
- -```py -train_ds = ds["train"] -val_ds = ds["validation"] - -train_ds.set_transform(preprocess_train) -val_ds.set_transform(preprocess_val) -``` - -Finally, you'll need a data collator to create a batch of training and evaluation data and convert the labels to `torch.tensor` objects. - -```py -import torch - -def collate_fn(examples): - pixel_values = torch.stack([example["pixel_values"] for example in examples]) - labels = torch.tensor([example["label"] for example in examples]) - return {"pixel_values": pixel_values, "labels": labels} -``` - -## Model - -Now let's load a pretrained model to use as the base model. This guide uses the [google/vit-base-patch16-224-in21k](https://huggingface.co/google/vit-base-patch16-224-in21k) model, but you can use any image classification model you want. Pass the `label2id` and `id2label` dictionaries to the model so it knows how to map the integer labels to their class labels, and you can optionally pass the `ignore_mismatched_sizes=True` parameter if you're finetuning a checkpoint that has already been finetuned. - -```py -from transformers import AutoModelForImageClassification, TrainingArguments, Trainer - -model = AutoModelForImageClassification.from_pretrained( - "google/vit-base-patch16-224-in21k", - label2id=label2id, - id2label=id2label, - ignore_mismatched_sizes=True, -) -``` - -### PEFT configuration and model - -Every PEFT method requires a configuration that holds all the parameters specifying how the PEFT method should be applied. Once the configuration is setup, pass it to the [`~peft.get_peft_model`] function along with the base model to create a trainable [`PeftModel`]. - -> [!TIP] -> Call the [`~PeftModel.print_trainable_parameters`] method to compare the number of parameters of [`PeftModel`] versus the number of parameters in the base model! 
- - - - - - - - - -[LoHa](../conceptual_guides/adapter#low-rank-hadamard-product-loha) decomposes the weight update matrix into *four* smaller matrices and each pair of smaller matrices is combined with the Hadamard product. This allows the weight update matrix to keep the same number of trainable parameters when compared to LoRA, but with a higher rank (`r^2` for LoHA when compared to `2*r` for LoRA). The size of the smaller matrices is determined by its *rank* or `r`. You'll also want to specify the `target_modules` which determines where the smaller matrices are inserted. For this guide, you'll target the *query* and *value* matrices of the attention blocks. Other important parameters to set are `alpha` (scaling factor), and `modules_to_save` (the modules apart from the LoHa layers to be trained and saved). All of these parameters - and more - are found in the [`LoHaConfig`]. - -```py -from peft import LoHaConfig, get_peft_model - -config = LoHaConfig( - r=16, - alpha=16, - target_modules=["query", "value"], - module_dropout=0.1, - modules_to_save=["classifier"], -) -model = get_peft_model(model, config) -model.print_trainable_parameters() -"trainable params: 1,257,317 || all params: 87,133,642 || trainable%: 1.4429753779831676" -``` - - - - - -### Training - -For training, let's use the [`~transformers.Trainer`] class from Transformers. The [`Trainer`] contains a PyTorch training loop, and when you're ready, call [`~transformers.Trainer.train`] to start training. To customize the training run, configure the training hyperparameters in the [`~transformers.TrainingArguments`] class. With LoRA-like methods, you can afford to use a higher batch size and learning rate. 
- -```py -from transformers import TrainingArguments, Trainer - -account = "stevhliu" -peft_model_id = f"{account}/google/vit-base-patch16-224-in21k-lora" -batch_size = 128 - -args = TrainingArguments( - peft_model_id, - remove_unused_columns=False, - eval_strategy="epoch", - save_strategy="epoch", - learning_rate=5e-3, - per_device_train_batch_size=batch_size, - gradient_accumulation_steps=4, - per_device_eval_batch_size=batch_size, - fp16=True, - num_train_epochs=5, - logging_steps=10, - load_best_model_at_end=True, - label_names=["labels"], -) -``` - -Begin training with [`~transformers.Trainer.train`]. - -```py -trainer = Trainer( - model, - args, - train_dataset=train_ds, - eval_dataset=val_ds, - processing_class=image_processor, - data_collator=collate_fn, -) -trainer.train() -``` - -## Share your model - -Once training is complete, you can upload your model to the Hub with the [`~transformers.PreTrainedModel.push_to_hub`] method. You’ll need to login to your Hugging Face account first and enter your token when prompted. - -```py -from huggingface_hub import notebook_login - -notebook_login() -``` - -Call [`~transformers.PreTrainedModel.push_to_hub`] to save your model to your repositoy. - -```py -model.push_to_hub(peft_model_id) -``` - -## Inference - -Let's load the model from the Hub and test it out on a food image. - -```py -from peft import PeftConfig, PeftModel -from transformers import AutoImageProcessor -from PIL import Image -import requests - -config = PeftConfig.from_pretrained("stevhliu/vit-base-patch16-224-in21k-lora") -model = AutoModelForImageClassification.from_pretrained( - config.base_model_name_or_path, - label2id=label2id, - id2label=id2label, - ignore_mismatched_sizes=True, -) -model = PeftModel.from_pretrained(model, "stevhliu/vit-base-patch16-224-in21k-lora") - -url = "https://huggingface.co/datasets/sayakpaul/sample-datasets/resolve/main/beignets.jpeg" -image = Image.open(requests.get(url, stream=True).raw) -image -``` - -
- -
- -Convert the image to RGB and return the underlying PyTorch tensors. - -```py -encoding = image_processor(image.convert("RGB"), return_tensors="pt") -``` - -Now run the model and return the predicted class! - -```py -with torch.no_grad(): - outputs = model(**encoding) - logits = outputs.logits - -predicted_class_idx = logits.argmax(-1).item() -print("Predicted class:", model.config.id2label[predicted_class_idx]) -"Predicted class: beignets" -``` From 610c1da9f4f07ff4e1fe68821656d5014002ca03 Mon Sep 17 00:00:00 2001 From: nemo Date: Wed, 3 Jun 2026 22:33:32 +0200 Subject: [PATCH 08/33] Some work on the quicktour + fixes --- docs/source/package_reference/cpt.md | 2 +- .../multitask_prompt_tuning.md | 2 +- docs/source/quicktour.md | 111 +++++++++++++----- 3 files changed, 83 insertions(+), 32 deletions(-) diff --git a/docs/source/package_reference/cpt.md b/docs/source/package_reference/cpt.md index 822542c556..b60915e6e4 100644 --- a/docs/source/package_reference/cpt.md +++ b/docs/source/package_reference/cpt.md @@ -32,7 +32,7 @@ Take a look at [Example](https://github.com/huggingface/peft/blob/main/examples/ ## Benchmark overview -There is no benchmark for MPT yet. Feel free to contribute an experiment +There is no benchmark for this method yet. Feel free to contribute an experiment configuration but make sure to first create an issue [here](https://github.com/huggingface/peft/issues). diff --git a/docs/source/package_reference/multitask_prompt_tuning.md b/docs/source/package_reference/multitask_prompt_tuning.md index 24488f6428..ad5efa9103 100644 --- a/docs/source/package_reference/multitask_prompt_tuning.md +++ b/docs/source/package_reference/multitask_prompt_tuning.md @@ -39,7 +39,7 @@ MPT consists of two stages: ## Benchmark overview -There is no benchmark for MPT yet. Feel free to contribute an experiment +There is no benchmark for this method yet. 
Feel free to contribute an experiment configuration but make sure to first create an issue [here](https://github.com/huggingface/peft/issues). diff --git a/docs/source/quicktour.md b/docs/source/quicktour.md index 36c4b0b22e..ca624f551a 100644 --- a/docs/source/quicktour.md +++ b/docs/source/quicktour.md @@ -18,59 +18,87 @@ rendered properly in your Markdown viewer. PEFT offers parameter-efficient methods for finetuning large pretrained models. The traditional paradigm is to finetune all of a model's parameters for each downstream task, but this is becoming exceedingly costly and impractical because of the enormous number of parameters in models today. Instead, it is more efficient to train a smaller number of prompt parameters or use a reparametrization method like low-rank adaptation (LoRA) to reduce the number of trainable parameters. -This quicktour will show you PEFT's main features and how you can train or run inference on large models that would typically be inaccessible on consumer devices. - - -### PEFT configuration and model - -For any PEFT method, you'll need to create a configuration which contains all the parameters that specify how the PEFT method should be applied. Once the configuration is setup, pass it to the [`~peft.get_peft_model`] function along with the base model to create a trainable [`PeftModel`]. +
+
+ PEFT can be thought of as a framework for adding trainable parameters to arbitrary places in existing models ("base models"). Specific PEFT methods arrange the trainable parameters in certain ways or modify the training process to achieve fine-tuning performance comparable to training all parameters of the base model. +
+
+
-> [!TIP] -> Call the [`~PeftModel.print_trainable_parameters`] method to compare the number of trainable parameters of [`PeftModel`] versus the number of parameters in the base model! +This quicktour will show you PEFT's main features and how you can train or run inference on large models that would typically be inaccessible on consumer devices. +## PEFT configuration and model +For any PEFT method, you'll need to create a configuration which contains all the parameters that specify how the PEFT method should be applied, most importantly which layers of the existing model to target with trainable parameters. Once the configuration is set up, pass it to the [`~peft.get_peft_model`] function along with the base model to create a trainable [`PeftModel`]. -## Train +Let's use [LoRA](../package_reference/lora) as an example but only discuss common parameters - you might want to use one of the [many other PEFT methods](../methods/overview). +The configuration usually entails this: -Each PEFT method is defined by a [`PeftConfig`] class that stores all the important parameters for building a [`PeftModel`]. 
For example, to train with LoRA, load and create a [`LoraConfig`] class and specify the following parameters: +- `target_modules`: which modules of the base model to adapt +- `task_type` (default: `None`): the nature of the trained task; if provided may help to automatically save relevant (but untargeted) layers alongside the adapter weights or warn you about incompatibilities +- `inference_mode` (default: `False`): whether you're using the model for inference or not -- `task_type`: the task to train for (sequence-to-sequence language modeling in this case) -- `inference_mode`: whether you're using the model for inference or not -- `r`: the dimension of the low-rank matrices -- `lora_alpha`: the scaling factor for the low-rank matrices -- `lora_dropout`: the dropout probability of the LoRA layers +Depending on the PEFT method you choose you will add specific parameters that, for example, determine the size of the update matrices. +Here's an example of a config you may encounter in the wild: ```python from peft import LoraConfig, TaskType -peft_config = LoraConfig(task_type=TaskType.SEQ_2_SEQ_LM, inference_mode=False, r=8, lora_alpha=32, lora_dropout=0.1) +peft_config = LoraConfig(target_modules=["q_proj"], task_type=TaskType.CAUSAL_LM, inference_mode=False, r=8, lora_alpha=32, lora_dropout=0.1) ``` > [!TIP] > See the [`LoraConfig`] reference for more details about other parameters you can adjust, such as the modules to target or the bias type. -Once the [`LoraConfig`] is setup, create a [`PeftModel`] with the [`get_peft_model`] function. It takes a base model - which you can load from the Transformers library - and the [`LoraConfig`] containing the parameters for how to configure a model for training with LoRA. +Once the [`LoraConfig`] is set up, create a [`PeftModel`] with the [`get_peft_model`] function. 
It takes a base model - which you can (but don't have to) load from the Transformers library - and the [`LoraConfig`] containing the parameters for how to configure a model for training with LoRA. Load the base model you want to finetune. ```python -from transformers import AutoModelForSeq2SeqLM +from transformers import AutoModelForCausalLM -model = AutoModelForSeq2SeqLM.from_pretrained("bigscience/mt0-large") +model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-3.2-1B") ``` -Wrap the base model and `peft_config` with the [`get_peft_model`] function to create a [`PeftModel`]. To get a sense of the number of trainable parameters in your model, use the [`print_trainable_parameters`] method. +Now wrap the base model and `peft_config` with the [`get_peft_model`] function to create a [`PeftModel`]. + +
+
+

+ Wrapping means that PEFT replaces the targeted layers (here: all q_proj layers) with the adapter-specific layer for the target layer's type. + Since we're dealing with linear layers, it will be, in this case, a lora.Linear layer. +

+

+ Note that we've only specified q_proj but in actuality we are targeting all model.layers[:].self_attn.q_proj layers. This is + because PEFT searches for matching suffixes by default. Pass a string with a regular expression if you want to target more complex layer patterns. +

+
+
+
+ +
+
+
+

+ The base model's layer is wrapped, retained and kept frozen, while new, trainable weights are added and combined with it. + How these new weights are structured and combined with the weights of the base model is a good portion of what sets + the different PEFT methods apart. +

+
+
+ +To get a sense of the number of trainable parameters in your model, use the [`print_trainable_parameters`] method. ```python from peft import get_peft_model -model = get_peft_model(model, peft_config) -model.print_trainable_parameters() -"output: trainable params: 2359296 || all params: 1231940608 || trainable%: 0.19151053100118282" +peft_model = get_peft_model(model, peft_config) +peft_model.print_trainable_parameters() +"output: trainable params: 524,288 || all params: 1,236,338,688 || trainable%: 0.0424" ``` -Out of [bigscience/mt0-large's](https://huggingface.co/bigscience/mt0-large) 1.2B parameters, you're only training 0.19% of them! +Out of [meta-llama/Llama-3.2-1B's](https://huggingface.co/meta-llama/Llama-3.2-1B) 1B parameters, you're only training 0.04% of them! That is it 🎉! Now you can train the model with the Transformers [`~transformers.Trainer`], Accelerate, or any custom PyTorch training loop. @@ -78,7 +106,7 @@ For example, to train with the [`~transformers.Trainer`] class, setup a [`~trans ```py training_args = TrainingArguments( - output_dir="your-name/bigscience/mt0-large-lora", + output_dir="your-name/meta-llama/my-llama3.2-adapter", learning_rate=1e-3, per_device_train_batch_size=32, per_device_eval_batch_size=32, @@ -94,11 +122,10 @@ Pass the model, training arguments, dataset, tokenizer, and any other necessary ```py trainer = Trainer( - model=model, + model=peft_model, args=training_args, train_dataset=tokenized_datasets["train"], eval_dataset=tokenized_datasets["test"], - processing_class=tokenizer, data_collator=data_collator, compute_metrics=compute_metrics, ) @@ -108,10 +135,10 @@ trainer.train() ### Save model -After your model is finished training, you can save your model to a directory using the [`~transformers.PreTrainedModel.save_pretrained`] function. +After your model is finished training, you can save your model to a directory using the [`~PeftModel.save_pretrained`] function. 
```py -model.save_pretrained("output_dir") +peft_model.save_pretrained("output_dir") ``` You can also save your model to the Hub (make sure you're logged in to your Hugging Face account first) with the [`~transformers.PreTrainedModel.push_to_hub`] function. @@ -120,7 +147,7 @@ You can also save your model to the Hub (make sure you're logged in to your Hugg from huggingface_hub import notebook_login notebook_login() -model.push_to_hub("your-name/bigscience/mt0-large-lora") +peft_model.push_to_hub("your-name/my-llama3.2-adapter") ``` Both methods only save the extra PEFT weights that were trained, meaning it is super efficient to store, transfer, and load. For example, this [facebook/opt-350m](https://huggingface.co/ybelkada/opt-350m-lora) model trained with LoRA only contains two files: `adapter_config.json` and `adapter_model.safetensors`. The `adapter_model.safetensors` file is just 6.3MB! @@ -163,6 +190,30 @@ from peft import AutoPeftModel model = AutoPeftModel.from_pretrained("smangrul/openai-whisper-large-v2-LORA-colab") ``` +## Multiple adapters + +PEFT supports installing multiple adapters (of the same kind, in this document this would be LoRA) on top of a base model. When you call `get_peft_model` there is only one adapter named `"default"` but you can add as many additional adapters as you like by calling `peft_model.add_adapter(adapter_name=..., peft_config=...)`. + +
+
+

+ This works because the wrapped layer actually has a unique set of trainable weights for each adapter name. Not every adapter is active and trainable by default. + You have to explicitly enable adapters by name before they are active. This allows you to quickly swap between adapters where task-specific knowledge is needed + or serve different use-cases on top of one model. +

+
+
+
+ +Just remember to call `peft_model.set_adapter(...)` with the adapter's name to enable the adapter. + +Quick example: + +```py +peft_model.add_adapter(adapter_name='new_adapter', peft_config=peft_config) +peft_model.set_adapter('new_adapter') +``` + ## Next steps Now that you've seen how to train a model with one of the PEFT methods, we encourage you to try out some of the other methods like prompt tuning. The steps are very similar to the ones shown in the quicktour: From 6bf3be2e91d8500d346368698de0ff693978190d Mon Sep 17 00:00:00 2001 From: nemo Date: Wed, 3 Jun 2026 22:49:16 +0200 Subject: [PATCH 09/33] Benchmark results for some adapters --- docs/source/_toctree.yml | 2 -- docs/source/package_reference/adamss.md | 11 ++++++ docs/source/package_reference/cartridges.md | 2 ++ docs/source/package_reference/fourierft.md | 11 ++++++ docs/source/package_reference/gralora.md | 18 ++++++++++ docs/source/package_reference/hira.md | 38 ++++++++++++++------- docs/source/package_reference/osf.md | 15 ++++++-- docs/source/package_reference/psoft.md | 14 ++++++-- docs/source/package_reference/pvera.md | 13 ++++++- docs/source/package_reference/randlora.md | 15 ++++++-- docs/source/package_reference/shira.md | 11 ++++++ docs/source/package_reference/vblora.md | 11 ++++++ docs/source/package_reference/vera.md | 11 ++++++ 13 files changed, 151 insertions(+), 21 deletions(-) diff --git a/docs/source/_toctree.yml b/docs/source/_toctree.yml index b81521dd67..8b2c99cfd3 100644 --- a/docs/source/_toctree.yml +++ b/docs/source/_toctree.yml @@ -87,8 +87,6 @@ title: OSF - local: package_reference/xlora title: X-LoRA - - local: package_reference/adapter_utils - title: LyCORIS - local: package_reference/oft title: OFT - local: package_reference/boft diff --git a/docs/source/package_reference/adamss.md b/docs/source/package_reference/adamss.md index 2ef4e550bc..f56f730ae7 100644 --- a/docs/source/package_reference/adamss.md +++ b/docs/source/package_reference/adamss.md @@ -29,6 +29,17 @@ AdaMSS currently has the following 
constraints: If these constraints don't work for your use case, consider other methods instead. +## Benchmark overview + + + +# API + ## AdamssConfig [[autodoc]] tuners.adamss.config.AdamssConfig diff --git a/docs/source/package_reference/cartridges.md b/docs/source/package_reference/cartridges.md index ce8a4fea14..ac605a8ea9 100644 --- a/docs/source/package_reference/cartridges.md +++ b/docs/source/package_reference/cartridges.md @@ -82,6 +82,8 @@ same underlying checkpoint. To concatenate independently trained cartridges into a single adapter, use `compose_cartridge_adapters(...)`. +# API + ## CartridgeConfig [[autodoc]] tuners.cartridge.config.CartridgeConfig diff --git a/docs/source/package_reference/fourierft.md b/docs/source/package_reference/fourierft.md index 1d298a9042..ad6b447205 100644 --- a/docs/source/package_reference/fourierft.md +++ b/docs/source/package_reference/fourierft.md @@ -29,6 +29,17 @@ The abstract from the paper is: > Low-rank adaptation (LoRA) has recently gained much interest in fine-tuning foundation models. It effectively reduces the number of trainable parameters by incorporating low-rank matrices A and B to represent the weight change, i.e., Delta W=BA. Despite LoRA's progress, it faces storage challenges when handling extensive customization adaptations or larger base models. In this work, we aim to further compress trainable parameters by enjoying the powerful expressiveness of the Fourier transform. Specifically, we introduce FourierFT, which treats Delta W as a matrix in the spatial domain and learns only a small fraction of its spectral coefficients. With the trained spectral coefficients, we implement the inverse discrete Fourier transform to recover Delta W. Empirically, our FourierFT method shows comparable or better performance with fewer parameters than LoRA on various tasks, including natural language understanding, natural language generation, instruction tuning, and image classification. 
For example, when performing instruction tuning on the LLaMA2-7B model, FourierFT surpasses LoRA with only 0.064M trainable parameters, compared to LoRA's 33.5M. +## Benchmark overview + + + +# API + ## FourierFTConfig [[autodoc]] tuners.fourierft.config.FourierFTConfig diff --git a/docs/source/package_reference/gralora.md b/docs/source/package_reference/gralora.md index 3d499756c1..c5dbfb6419 100644 --- a/docs/source/package_reference/gralora.md +++ b/docs/source/package_reference/gralora.md @@ -30,3 +30,21 @@ outperforms LoRA and other baselines, achieving up to +8.5% absolute gain in Pass@1 on HumanEval+. These improvements hold across model sizes and rank settings, making GraLoRA a scalable and robust solution for PEFT.* +## Benchmark overview + + + +# API + +## GraloraConfig + +[[autodoc]] tuners.gralora.config.GraloraConfig + +## GraloraModel + +[[autodoc]] tuners.gralora.model.GraloraModel diff --git a/docs/source/package_reference/hira.md b/docs/source/package_reference/hira.md index 15cdd45251..44fb514269 100644 --- a/docs/source/package_reference/hira.md +++ b/docs/source/package_reference/hira.md @@ -38,6 +38,31 @@ peft_model.print_trainable_parameters() # trainable params: 4,718,592 || all params: 129,957,888 || trainable%: 3.6309 ``` +## Benchmark overview + + + +## Citation: + +If you found HiRA is useful, please cite HiRA as: +``` +@inproceedings{ +huang2025hira, +title={Hi{RA}: Parameter-Efficient Hadamard High-Rank Adaptation for Large Language Models}, +author={Qiushi Huang and Tom Ko and Zhan Zhuang and Lilian Tang and Yu Zhang}, +booktitle={The Thirteenth International Conference on Learning Representations}, +year={2025}, +url={https://openreview.net/forum?id=TwJrTz9cRS} +} +``` + +# API + ## HiraConfig [[autodoc]] tuners.hira.config.HiraConfig @@ -61,15 +86,4 @@ peft_model.print_trainable_parameters() [[autodoc]] tuners.hira.layer.Conv1d [[autodoc]] tuners.hira.layer.Conv2d [[autodoc]] tuners.hira.layer.ConvNd -## Citation: -If you found HiRA 
is useful, please cite HiRA as: -``` -@inproceedings{ -huang2025hira, -title={Hi{RA}: Parameter-Efficient Hadamard High-Rank Adaptation for Large Language Models}, -author={Qiushi Huang and Tom Ko and Zhan Zhuang and Lilian Tang and Yu Zhang}, -booktitle={The Thirteenth International Conference on Learning Representations}, -year={2025}, -url={https://openreview.net/forum?id=TwJrTz9cRS} -} -``` + diff --git a/docs/source/package_reference/osf.md b/docs/source/package_reference/osf.md index 266138589b..caefbb2332 100644 --- a/docs/source/package_reference/osf.md +++ b/docs/source/package_reference/osf.md @@ -105,7 +105,7 @@ config = OSFConfig( "gate_proj": 4 # Lower rank for gate projection } ) - + # Fractional preserved rank is supported (interpreted per-target as fraction * min_dim) config = OSFConfig(effective_rank=0.8) # preserve 80% of min_dim; train remaining 20% config = OSFConfig(rank_pattern={"q_proj": 0.5}) # preserve 50% on q_proj, others use global/default @@ -144,7 +144,7 @@ train_task(model, task_3_data) When training on a known sequence of n tasks, one effective strategy is to progressively allocate model capacity to balance learning new tasks while preserving previous knowledge: - **Task 1**: Use full capacity (train everything) -- **Task 2**: Freeze 1/n of model capacity, train remaining (n-1)/n capacity +- **Task 2**: Freeze 1/n of model capacity, train remaining (n-1)/n capacity - **Task 3**: Freeze 2/n of model capacity, train remaining (n-2)/n capacity - **Task n**: Freeze (n-1)/n of model capacity, use 1/n capacity for final task @@ -222,6 +222,17 @@ optimizer = torch.optim.AdamW([ ], lr=1e-4) ``` +## Benchmark overview + + + +# API + ## OSFConfig [[autodoc]] tuners.osf.config.OSFConfig diff --git a/docs/source/package_reference/psoft.md b/docs/source/package_reference/psoft.md index 4eea99a1cd..9ecc1da2e6 100644 --- a/docs/source/package_reference/psoft.md +++ b/docs/source/package_reference/psoft.md @@ -67,7 +67,7 @@ model = 
AutoModelForCausalLM.from_pretrained(model_id) # Configure PSOFT config = PsoftConfig( - r=32, # the dimension of trainable matrix R, + r=32, # the dimension of trainable matrix R, psoft_alpha=32, # scaling factor (typically set to r in PSOFT), target_modules=["q_proj", "v_proj"], # target attention projection layers ab_svd_init="psoft_init", # principal subspace initialization @@ -119,6 +119,16 @@ config = PsoftConfig(psoft_orth=True,psoft_mag_a=True,psoft_mag_b=True) 4. **SVD Initialization**: The `lowrank` option is more memory- and compute-efficient than `full`, making it more suitable for large models. 5. **Cayley–Neumann Approximation**: When the rank is large, enabling the Cayley–Neumann approximation can significantly improve computational efficiency, while the benefit is less pronounced for small ranks. In practice, a small number of Neumann series terms (typically `5`) usually provides a good balance between accuracy and efficiency. +## Benchmark overview + + + +# API ## PsoftConfig @@ -126,4 +136,4 @@ config = PsoftConfig(psoft_orth=True,psoft_mag_a=True,psoft_mag_b=True) ## PsoftModel -[[autodoc]] tuners.psoft.model.PsoftModel \ No newline at end of file +[[autodoc]] tuners.psoft.model.PsoftModel diff --git a/docs/source/package_reference/pvera.md b/docs/source/package_reference/pvera.md index 6ea72c7d25..0f527e0d04 100644 --- a/docs/source/package_reference/pvera.md +++ b/docs/source/package_reference/pvera.md @@ -31,10 +31,21 @@ The abstract from the paper is: > Large foundation models have emerged in the last years and are pushing performance boundaries for a variety of tasks. Training or even finetuning such models demands vast datasets and computational resources, which are often scarce and costly. Adaptation methods provide a computationally efficient solution to address these limitations by allowing such models to be finetuned on small amounts of data and computing power. 
This is achieved by appending new trainable modules to frozen backbones with only a fraction of the trainable parameters and fitting only these modules on novel tasks. Recently, the VeRA adapter was shown to excel in parameter-efficient adaptations by utilizing a pair of frozen random low-rank matrices shared across all layers. In this paper, we propose PVeRA, a probabilistic version of the VeRA adapter, which modifies the low-rank matrices of VeRA in a probabilistic manner. This modification naturally allows handling inherent ambiguities in the input and allows for different sampling configurations during training and testing. A comprehensive evaluation was performed on the VTAB-1k benchmark and seven adapters, with PVeRA outperforming VeRA and other adapters. +## Benchmark overview + + + +# API + ## PveraConfig [[autodoc]] tuners.pvera.config.PveraConfig ## PveraModel -[[autodoc]] tuners.pvera.model.PveraModel \ No newline at end of file +[[autodoc]] tuners.pvera.model.PveraModel diff --git a/docs/source/package_reference/randlora.md b/docs/source/package_reference/randlora.md index 930c400685..568887cbbe 100644 --- a/docs/source/package_reference/randlora.md +++ b/docs/source/package_reference/randlora.md @@ -14,14 +14,14 @@ rendered properly in your Markdown viewer. --> -# RandLora: Full-rank parameter-efficient fine-tuning of large models +# RandLora: Full-rank parameter-efficient fine-tuning of large models [RandLora](https://huggingface.co/papers/2502.00987) is a parameter-efficient fine-tuning technique that is similar to [LoRA](https://huggingface.co/papers/2106.09685) and [VeRA](https://huggingface.co/papers/2310.11454) but performs full rank updates to improve performance. RandLora can be particularly useful when adapting large model to hard tasks that require complex updates while preserving the parameter efficiency of LoRA. The full rank update of RandLora is achieved by linearly scaling random bases. 
The random bases are a collection of multiple low rank matrices such that the summation of their ranks if greater or equal to the full rank of the parameter matrices. The trainable parameters of RandLora are two diagonal matrices (vectors) that get multiplied with the right hand low rank random bases, in a similar way to VeRA's update. To maintain low memory usage, RandLora uses a custom function that prevents storing unnecessary bases in memory for backpropagation. RandLora presents the noteworthy difference that contrary to other LoRA-like PEFT algorithm, increasing RandLora's random base ranks increases the amount of trainable parameters. Because number of bases x bases rank is constant in RandLora, reducing the rank will increase the number of random bases, hence the number of base-specific trainable diagonal bases. Because reducing the rank of RandLora's random bases will increase their number, RandLora can become slower to train than LoRA for very small ranks where typically, ranks below 4 with result in a large training time increase. This does not affect inference though as the RandLora adapters can be merged into the pretrained weight matrices. -RandLora additionally supports training with sparse, ternary random bases (only containing -1, 0 and 1). These bases are as described in [Bingham et al.](https://cs-people.bu.edu/evimaria/cs565/kdd-rp.pdf) and [Ping et al.](https://hastie.su.domains/Papers/Ping/KDD06_rp.pdf) and could theoretically be used to reduce compute needs by performing aggregations instead of matrix multiplications to create the weight update. This is not currently supported. Although it does not currently reduce compute, using sparse random bases in RandLora can reduce overfitting in some cases. For users intersted in using sparse ternary bases, the `sparse` option is recommended over the `very_sparse` one that can reduce perfromance. +RandLora additionally supports training with sparse, ternary random bases (only containing -1, 0 and 1). 
These bases are as described in [Bingham et al.](https://cs-people.bu.edu/evimaria/cs565/kdd-rp.pdf) and [Ping et al.](https://hastie.su.domains/Papers/Ping/KDD06_rp.pdf) and could theoretically be used to reduce compute needs by performing aggregations instead of matrix multiplications to create the weight update. This is not currently supported. Although it does not currently reduce compute, using sparse random bases in RandLora can reduce overfitting in some cases. For users interested in using sparse ternary bases, the `sparse` option is recommended over the `very_sparse` one that can reduce performance. Similarly to VeRA, when saving the RandLora's parameters, it's possible to eschew storing the low rank matrices by setting `save_projection=False` on the `VeraConfig`. In that case, these matrices will be restored based on the fixed random seed from the `projection_prng_key` argument. This cuts down on the size of the checkpoint, but we cannot guarantee reproducibility on all devices and for all future versions of PyTorch. If you want to ensure reproducibility, set `save_projection=True` (which is the default). @@ -36,6 +36,17 @@ The abstract from the paper is: > Low-Rank Adaptation (LoRA) and its variants have shown impressive results in reducing the number of trainable parameters and memory requirements of large transformer networks while maintaining fine-tuning performance. The low-rank nature of the weight update inherently limits the representation power of fine-tuned models, however, thus potentially compromising performance on complex tasks. This raises a critical question: when a performance gap between LoRA and standard fine-tuning is observed, is it due to the reduced number of trainable parameters or the rank deficiency? This paper aims to answer this question by introducing RandLora, a parameter-efficient method that performs full-rank updates using a learned linear combinations of low-rank, non-trainable random matrices.
Our method limits the number of trainable parameters by restricting optimization to diagonal scaling matrices applied to the fixed random matrices. This allows us to effectively overcome the low-rank limitations while maintaining parameter and memory efficiency during training. Through extensive experimentation across vision, language, and vision-language benchmarks, we systematically evaluate the limitations of LoRA and existing random basis methods. Our findings reveal that full-rank updates are beneficial across vision and language tasks individually, and even more so for vision-language tasks, where RandLora significantly reduces---and sometimes eliminates---the performance gap between standard fine-tuning and LoRA, demonstrating its efficacy. +## Benchmark overview + + + +# API + ## RandLoraConfig [[autodoc]] tuners.randlora.config.RandLoraConfig diff --git a/docs/source/package_reference/shira.md b/docs/source/package_reference/shira.md index cbd869ddb4..64b5499549 100644 --- a/docs/source/package_reference/shira.md +++ b/docs/source/package_reference/shira.md @@ -26,6 +26,17 @@ The abstract from the paper is: > Low Rank Adaptation (LoRA) has gained massive attention in the recent generative AI research. One of the main advantages of LoRA is its ability to be fused with pretrained models, adding no overhead during inference. However, from a mobile deployment standpoint, we can either avoid inference overhead in the fused mode but lose the ability to switch adapters rapidly, or suffer significant (up to 30% higher) inference latency while enabling rapid switching in the unfused mode. LoRA also exhibits concept-loss when multiple adapters are used concurrently. In this paper, we propose Sparse High Rank Adapters (SHiRA), a new paradigm which incurs no inference overhead, enables rapid switching, and significantly reduces concept-loss. Specifically, SHiRA can be trained by directly tuning only 1-2% of the base model weights while leaving others unchanged. 
This results in a highly sparse adapter which can be switched directly in the fused mode. We further provide theoretical and empirical insights on how high sparsity in SHiRA can aid multi-adapter fusion by reducing concept loss. Our extensive experiments on LVMs and LLMs demonstrate that finetuning only a small fraction of the parameters in the base model significantly outperforms LoRA while enabling both rapid switching and multi-adapter fusion. Finally, we provide a latency- and memory-efficient SHiRA implementation based on Parameter-Efficient Finetuning (PEFT) Library which trains at nearly the same speed as LoRA while consuming up to 16% lower peak GPU memory, thus making SHiRA easy to adopt for practical use cases. To demonstrate rapid switching benefits during inference, we show that loading SHiRA on a base model can be 5x-16x faster than LoRA fusion on a CPU. +## Benchmark overview + + + +# API + ## ShiraConfig [[autodoc]] tuners.shira.config.ShiraConfig diff --git a/docs/source/package_reference/vblora.md b/docs/source/package_reference/vblora.md index 02aaf10b87..3784661f4c 100644 --- a/docs/source/package_reference/vblora.md +++ b/docs/source/package_reference/vblora.md @@ -30,6 +30,17 @@ The abstract from the paper is: - VB-LoRA has two sets of training parameters: vector bank parameters and logit parameters. In practice, we found that logit parameters require a higher learning rate, while vector bank parameters require a lower learning rate. When using the AdamW optimizer, typical learning rates are 0.01 for logits and 0.001 for vector bank parameters. 
+## Benchmark overview + + + +# API + ## VBLoRAConfig [[autodoc]] tuners.vblora.config.VBLoRAConfig diff --git a/docs/source/package_reference/vera.md b/docs/source/package_reference/vera.md index f9ed281275..da8db7fcbc 100644 --- a/docs/source/package_reference/vera.md +++ b/docs/source/package_reference/vera.md @@ -30,6 +30,17 @@ The abstract from the paper is: > Low-rank adapation (LoRA) is a popular method that reduces the number of trainable parameters when finetuning large language models, but still faces acute storage challenges when scaling to even larger models or deploying numerous per-user or per-task adapted models. In this work, we present Vector-based Random Matrix Adaptation (VeRA), which significantly reduces the number of trainable parameters compared to LoRA, yet maintains the same performance. It achieves this by using a single pair of low-rank matrices shared across all layers and learning small scaling vectors instead. We demonstrate its effectiveness on the GLUE and E2E benchmarks, image classification tasks, and show its application in instruction-tuning of 7B and 13B language models. +## Benchmark overview + + + +# API + ## VeRAConfig [[autodoc]] tuners.vera.config.VeraConfig From 0087b91c0b31b167fb66b61cf8f90d5bf6cb7f7f Mon Sep 17 00:00:00 2001 From: nemo Date: Wed, 3 Jun 2026 22:53:09 +0200 Subject: [PATCH 10/33] Integrade config guide instead --- docs/source/quicktour.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/quicktour.md b/docs/source/quicktour.md index ca624f551a..af9e70de26 100644 --- a/docs/source/quicktour.md +++ b/docs/source/quicktour.md @@ -49,7 +49,7 @@ peft_config = LoraConfig(target_modules=["q_proj"], task_type=TaskType.CAUSAL_LM ``` > [!TIP] -> See the [`LoraConfig`] reference for more details about other parameters you can adjust, such as the modules to target or the bias type. 
+> See the [configuration guide](guides/peft_model_config) for more details on how the PEFT configuration works under the hood. Once the [`LoraConfig`] is set up, create a [`PeftModel`] with the [`get_peft_model`] function. It takes a base model - which you can (but don't have to) load from the Transformers library - and the [`LoraConfig`] containing the parameters for how to configure a model for training with LoRA. From 5805c57164e11bea78687c3ceb9711f30077992e Mon Sep 17 00:00:00 2001 From: nemo Date: Thu, 4 Jun 2026 12:14:33 +0200 Subject: [PATCH 11/33] Moved *OFT to subsections + docstring update --- docs/source/conceptual_guides/oft.md | 165 -------------------------- docs/source/package_reference/boft.md | 46 ++++++- docs/source/package_reference/oft.md | 53 ++++++++- src/peft/tuners/boft/config.py | 16 ++- src/peft/tuners/oft/config.py | 17 ++- 5 files changed, 124 insertions(+), 173 deletions(-) delete mode 100644 docs/source/conceptual_guides/oft.md diff --git a/docs/source/conceptual_guides/oft.md b/docs/source/conceptual_guides/oft.md deleted file mode 100644 index f7cd21054b..0000000000 --- a/docs/source/conceptual_guides/oft.md +++ /dev/null @@ -1,165 +0,0 @@ - - -# Orthogonal Finetuning (OFT and BOFT) - -This conceptual guide gives a brief overview of [OFT](https://huggingface.co/papers/2306.07280), [OFTv2](https://huggingface.co/papers/2506.19847) and [BOFT](https://huggingface.co/papers/2311.06243), a parameter-efficient fine-tuning technique that utilizes orthogonal matrix to multiplicatively transform the pretrained weight matrices. - -To achieve efficient fine-tuning, OFT represents the weight updates with an orthogonal transformation. The orthogonal transformation is parameterized by an orthogonal matrix multiplied to the pretrained weight matrix. These new matrices can be trained to adapt to the new data while keeping the overall number of changes low. The original weight matrix remains frozen and doesn't receive any further adjustments. 
To produce the final results, both the original and the adapted weights are multiplied togethor. - -Orthogonal Butterfly (BOFT) generalizes OFT with Butterfly factorization and further improves its parameter efficiency and finetuning flexibility. In short, OFT can be viewed as a special case of BOFT. Different from LoRA that uses additive low-rank weight updates, BOFT uses multiplicative orthogonal weight updates. The comparison is shown below. - -
- -
- - -BOFT has some advantages compared to LoRA: - -* BOFT proposes a simple yet generic way to finetune pretrained models to downstream tasks, yielding a better preservation of pretraining knowledge and a better parameter efficiency. -* Through the orthogonality, BOFT introduces a structural constraint, i.e., keeping the [hyperspherical energy](https://huggingface.co/papers/1805.09298) unchanged during finetuning. This can effectively reduce the forgetting of pretraining knowledge. -* BOFT uses the butterfly factorization to efficiently parameterize the orthogonal matrix, which yields a compact yet expressive learning space (i.e., hypothesis class). -* The sparse matrix decomposition in BOFT brings in additional inductive biases that are beneficial to generalization. - -In principle, BOFT can be applied to any subset of weight matrices in a neural network to reduce the number of trainable parameters. Given the target layers for injecting BOFT parameters, the number of trainable parameters can be determined based on the size of the weight matrices. - -## Merge OFT/BOFT weights into the base model - -Similar to LoRA, the weights learned by OFT/BOFT can be integrated into the pretrained weight matrices using the merge_and_unload() function. This function merges the adapter weights with the base model which allows you to effectively use the newly merged model as a standalone model. - -
- -
- -This works because during training, the orthogonal weight matrix (R in the diagram above) and the pretrained weight matrices are separate. But once training is complete, these weights can actually be merged (multiplied) into a new weight matrix that is equivalent. - -## Utils for OFT / BOFT - -### Common OFT / BOFT parameters in PEFT - -As with other methods supported by PEFT, to fine-tune a model using OFT or BOFT, you need to: - -1. Instantiate a base model. -2. Create a configuration (`OFTConfig` or `BOFTConfig`) where you define OFT/BOFT-specific parameters. -3. Wrap the base model with `get_peft_model()` to get a trainable `PeftModel`. -4. Train the `PeftModel` as you normally would train the base model. - - -### OFT-specific parameters - -`OFTConfig` allows you to control how OFT is applied to the base model through the following parameters: - -- `r`: OFT rank, number of OFT blocks per injected layer. **Bigger** `r` results in more sparse update matrices with **fewer** trainable paramters. **Note**: You can only specify either `r` or `oft_block_size`, but not both simultaneously, because `r` × `oft_block_size` = layer dimension. For simplicity, we let the user speficy either `r` or `oft_block_size` and infer the other one. Default set to `r = 0`, the user is advised to set the `oft_block_size` instead for better clarity. -- `oft_block_size`: OFT block size across different layers. **Bigger** `oft_block_size` results in more dense update matrices with **more** trainable parameters. **Note**: Please choose `oft_block_size` to be divisible by layer's input dimension (`in_features`), e.g., 4, 8, 16. You can only specify either `r` or `oft_block_size`, but not both simultaneously, because `r` × `oft_block_size` = layer dimension. For simplicity, we let the user speficy either `r` or `oft_block_size` and infer the other one. Default set to `oft_block_size = 32`. 
-- `use_cayley_neumann`: Specifies whether to use the Cayley-Neumann parameterization (efficient but approximate) or the vanilla Cayley parameterization (exact but computationally expensive because of matrix inverse). We recommend to set it to `True` for better efficiency, but performance may be slightly worse because of the approximation error. Please test both settings (`True` and `False`) depending on your needs. Default is `False`. -- `module_dropout`: The multiplicative dropout probability, by setting OFT blocks to identity during training, similar to the dropout layer in LoRA. -- `bias`: specify if the `bias` parameters should be trained. Can be `"none"`, `"all"` or `"oft_only"`. -- `target_modules`: The modules (for example, attention blocks) to inject the OFT matrices. -- `modules_to_save`: List of modules apart from OFT matrices to be set as trainable and saved in the final checkpoint. These typically include model's custom head that is randomly initialized for the fine-tuning task. - -### BOFT-specific parameters - -`BOFTConfig` allows you to control how BOFT is applied to the base model through the following parameters: - -- `boft_block_size`: the BOFT matrix block size across different layers, expressed in `int`. **Bigger** `boft_block_size` results in more dense update matrices with **more** trainable parameters. **Note**, please choose `boft_block_size` to be divisible by most layer's input dimension (`in_features`), e.g., 4, 8, 16. Also, please only -specify either `boft_block_size` or `boft_block_num`, but not both simultaneously or leaving both to 0, because `boft_block_size` x `boft_block_num` must equal the layer's input dimension. -- `boft_block_num`: the number of BOFT matrix blocks across different layers, expressed in `int`. **Bigger** `boft_block_num` result in sparser update matrices with **fewer** trainable parameters. **Note**, please choose `boft_block_num` to be divisible by most layer's input dimension (`in_features`), e.g., 4, 8, 16. 
Also, please only -specify either `boft_block_size` or `boft_block_num`, but not both simultaneously or leaving both to 0, because `boft_block_size` x `boft_block_num` must equal the layer's input dimension. -- `boft_n_butterfly_factor`: the number of butterfly factors. **Note**, for `boft_n_butterfly_factor=1`, BOFT is the same as vanilla OFT, for `boft_n_butterfly_factor=2`, the effective block size of OFT becomes twice as big and the number of blocks become half. -- `bias`: specify if the `bias` parameters should be trained. Can be `"none"`, `"all"` or `"boft_only"`. -- `boft_dropout`: specify the probability of multiplicative dropout. -- `target_modules`: The modules (for example, attention blocks) to inject the OFT/BOFT matrices. -- `modules_to_save`: List of modules apart from OFT/BOFT matrices to be set as trainable and saved in the final checkpoint. These typically include model's custom head that is randomly initialized for the fine-tuning task. - - - -## OFT Example Usage - -For using OFT for quantized finetuning with [TRL](https://github.com/huggingface/trl) for `SFT`, `PPO`, or `DPO` fine-tuning, follow the following outline: - -```py -from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig -from trl import SFTTrainer -from peft import OFTConfig - -if use_quantization: - bnb_config = BitsAndBytesConfig( - load_in_4bit=True, - bnb_4bit_quant_type="nf4", - bnb_4bit_compute_dtype=torch.bfloat16, - bnb_4bit_use_double_quant=True, - bnb_4bit_quant_storage=torch.bfloat16, - ) - -model = AutoModelForCausalLM.from_pretrained( - "model_name", - quantization_config=bnb_config -) -tokenizer = AutoTokenizer.from_pretrained("model_name") - -# Configure OFT -peft_config = OFTConfig( - oft_block_size=32, - use_cayley_neumann=True, - target_modules="all-linear", - bias="none", - task_type="CAUSAL_LM" -) - -trainer = SFTTrainer( - model=model, - train_dataset=ds['train'], - peft_config=peft_config, - processing_class=tokenizer, - 
args=training_arguments, - data_collator=collator, -) - -trainer.train() -``` - - -## BOFT Example Usage - -For an example of the BOFT method application to various downstream tasks, please refer to the following guides: - -Take a look at the following step-by-step guides on how to finetune a model with BOFT: -- [Dreambooth finetuning with BOFT](https://github.com/huggingface/peft/blob/main/examples/boft_dreambooth/boft_dreambooth.md) -- [Controllable generation finetuning with BOFT (ControlNet)](https://github.com/huggingface/peft/blob/main/examples/boft_controlnet/boft_controlnet.md) - -For the task of image classification, one can initialize the BOFT config for a DinoV2 model as follows: - -```py -import transformers -from transformers import AutoModelForSeq2SeqLM, BOFTConfig -from peft import BOFTConfig, get_peft_model - -config = BOFTConfig( - boft_block_size=4, - boft_n_butterfly_factor=2, - target_modules=["query", "value", "key", "output.dense", "mlp.fc1", "mlp.fc2"], - boft_dropout=0.1, - bias="boft_only", - modules_to_save=["classifier"], -) - -model = transformers.Dinov2ForImageClassification.from_pretrained( - "facebook/dinov2-large", - num_labels=100, -) - -boft_model = get_peft_model(model, config) -``` diff --git a/docs/source/package_reference/boft.md b/docs/source/package_reference/boft.md index b65903a19e..b5a54de0a6 100644 --- a/docs/source/package_reference/boft.md +++ b/docs/source/package_reference/boft.md @@ -22,12 +22,14 @@ The abstract from the paper is: *Large foundation models are becoming ubiquitous, but training them from scratch is prohibitively expensive. Thus, efficiently adapting these powerful models to downstream tasks is increasingly important. In this paper, we study a principled finetuning paradigm -- Orthogonal Finetuning (OFT) -- for downstream task adaptation. Despite demonstrating good generalizability, OFT still uses a fairly large number of trainable parameters due to the high dimensionality of orthogonal matrices. 
To address this, we start by examining OFT from an information transmission perspective, and then identify a few key desiderata that enable better parameter-efficiency. Inspired by how the Cooley-Tukey fast Fourier transform algorithm enables efficient information transmission, we propose an efficient orthogonal parameterization using butterfly structures. We apply this parameterization to OFT, creating a novel parameter-efficient finetuning method, called Orthogonal Butterfly (BOFT). By subsuming OFT as a special case, BOFT introduces a generalized orthogonal finetuning framework. Finally, we conduct an extensive empirical study of adapting large vision transformers, large language models, and text-to-image diffusion models to various downstream tasks in vision and language*. -BOFT focuses on preserving a pretrained model's generative capabilities while being significantly more parameter-efficient than standard [OFT](./oft). Like OFT, BOFT maintains the same cosine similarity (hyperspherical energy) between all pairwise neurons in a layer by applying an orthogonal transformation to the pretrained weight matrix, ensuring the semantic relationships among neurons are preserved. +BOFT focuses on preserving a pretrained model's generative capabilities while being significantly more parameter-efficient than standard [OFT](./oft). Like OFT, BOFT maintains the same cosine similarity ([hyperspherical energy](https://huggingface.co/papers/1805.09298)) between all pairwise neurons in a layer by applying an orthogonal transformation to the pretrained weight matrix, ensuring the semantic relationships among neurons are preserved. Instead of using a block-diagonal orthogonal matrix, BOFT factorizes the orthogonal transformation into a product of **sparse butterfly matrices** (originally introduced in the [Cooley–Tukey FFT](https://en.wikipedia.org/wiki/Cooley%E2%80%93Tukey_FFT_algorithm)). 
Unlike OFT's block-diagonal rotations, which only mix inputs within each block, the butterfly structure guarantees that every input can influence every output, producing a **dense connectivity** with just `O(d log d)` parameters. This factorization preserves expressivity while drastically reducing the parameter count compared to OFT (at the expense of computation time). In practice, BOFT multiplies each pretrained weight matrix by a sequence of butterfly-structured orthogonal factors, enabling efficient and expressive neuron rotations. This makes BOFT well-suited for controllable generation and tasks where maintaining the pretrained model's subject representation is critical, while also scaling to larger models with lower memory and compute overhead. +BOFT can be applied to any subset of weight matrices in a neural network to reduce the number of trainable parameters. Given the target layers for injecting BOFT parameters, the number of trainable parameters can be determined based on the size of the weight matrices. + ## Benchmark overview +## Merge BOFT weights into the base model + +Similar to LoRA, the weights learned by BOFT can be integrated into the pretrained weight matrices using the [`~BOFTModel.merge_and_unload`] function. This function merges the adapter weights with the base model which allows you to effectively use the newly merged model as a standalone model. + +
+ +
+ +This works because during training, the orthogonal weight matrix (R in the diagram above) and the pretrained weight matrices are separate. But once training is complete, these weights can actually be merged (multiplied) into a new weight matrix that is equivalent. + +## BOFT Example Usage + +For an example of the BOFT method application to various downstream tasks, please refer to the following guides: + +Take a look at the following step-by-step guides on how to finetune a model with BOFT: +- [Dreambooth finetuning with BOFT](https://github.com/huggingface/peft/blob/main/examples/boft_dreambooth/boft_dreambooth.md) +- [Controllable generation finetuning with BOFT (ControlNet)](https://github.com/huggingface/peft/blob/main/examples/boft_controlnet/boft_controlnet.md) + +For the task of image classification, one can initialize the BOFT config for a DinoV2 model as follows: + +```py +import transformers +from transformers import AutoModelForSeq2SeqLM +from peft import BOFTConfig, get_peft_model + +config = BOFTConfig( + boft_block_size=4, + boft_n_butterfly_factor=2, + target_modules=["query", "value", "key", "output.dense", "mlp.fc1", "mlp.fc2"], + boft_dropout=0.1, + bias="boft_only", + modules_to_save=["classifier"], +) + +model = transformers.Dinov2ForImageClassification.from_pretrained( + "facebook/dinov2-large", + num_labels=100, +) + +boft_model = get_peft_model(model, config) +``` + # API ## BOFTConfig diff --git a/docs/source/package_reference/oft.md b/docs/source/package_reference/oft.md index 7cbebcbd13..f3715ff819 100644 --- a/docs/source/package_reference/oft.md +++ b/docs/source/package_reference/oft.md @@ -21,13 +21,13 @@ rendered properly in your Markdown viewer.
Controlling Text-to-Image Diffusion by Orthogonal Finetuning -[Orthogonal Finetuning (OFT)](https://hf.co/papers/2306.07280) is a method developed for adapting text-to-image diffusion models. It works by reparameterizing the pretrained weight matrices with its orthogonal matrix to preserve information in the pretrained model. To reduce the number of parameters, OFT introduces a block-diagonal structure in the orthogonal matrix. The method primarily focuses on preserving a pretrained model's generative performance in the finetuned model. It tries to maintain the same cosine similarity (hyperspherical energy) between all pairwise neurons in a layer because this better captures the semantic information among neurons. This means OFT is more capable at preserving the subject and it is better for controllable generation (similar to [ControlNet](https://huggingface.co/docs/diffusers/using-diffusers/controlnet)). +[Orthogonal Finetuning (OFT)](https://hf.co/papers/2306.07280), along with its follow-up [OFTv2](https://huggingface.co/papers/2506.19847), is a method developed for adapting text-to-image diffusion models. It works by reparameterizing the pretrained weight matrices with its orthogonal matrix to preserve information in the pretrained model. To reduce the number of parameters, OFT introduces a block-diagonal structure in the orthogonal matrix. The method primarily focuses on preserving a pretrained model's generative performance in the finetuned model. It tries to maintain the same cosine similarity ([hyperspherical energy](https://huggingface.co/papers/1805.09298)) between all pairwise neurons in a layer because this better captures the semantic information among neurons. This means OFT is more capable at preserving the subject and it is better for controllable generation (similar to [ControlNet](https://huggingface.co/docs/diffusers/using-diffusers/controlnet)).
The abstract from the paper is: *Large text-to-image diffusion models have impressive capabilities in generating photorealistic images from text prompts. How to effectively guide or control these powerful models to perform different downstream tasks becomes an important open problem. To tackle this challenge, we introduce a principled finetuning method -- Orthogonal Finetuning (OFT), for adapting text-to-image diffusion models to downstream tasks. Unlike existing methods, OFT can provably preserve hyperspherical energy which characterizes the pairwise neuron relationship on the unit hypersphere. We find that this property is crucial for preserving the semantic generation ability of text-to-image diffusion models. To improve finetuning stability, we further propose Constrained Orthogonal Finetuning (COFT) which imposes an additional radius constraint to the hypersphere. Specifically, we consider two important finetuning text-to-image tasks: subject-driven generation where the goal is to generate subject-specific images given a few images of a subject and a text prompt, and controllable generation where the goal is to enable the model to take in additional control signals. We empirically show that our OFT framework outperforms existing methods in generation quality and convergence speed*. -OFT preserves the hyperspherical energy by learning an orthogonal transformation for neurons to keep the cosine similarity between them unchanged. In practice, this means taking the matrix product of an orthogonal matrix with the pretrained weight matrix. However, to be parameter-efficient, the orthogonal matrix is represented as a block-diagonal matrix with rank `r` blocks. Whereas LoRA reduces the number of trainable parameters with low-rank structures, OFT reduces the number of trainable parameters with a sparse block-diagonal matrix structure. 
+OFT preserves the hyperspherical energy by learning an orthogonal transformation for neurons to keep the cosine similarity between them unchanged, potentially leading to less forgetting of previously learned knowledge. In practice, this means taking the matrix product of an orthogonal matrix with the pretrained weight matrix. However, to be parameter-efficient, the orthogonal matrix is represented as a block-diagonal matrix with rank `r` blocks. Whereas LoRA reduces the number of trainable parameters with low-rank structures, OFT reduces the number of trainable parameters with a sparse block-diagonal matrix structure. ## Benchmark overview @@ -38,6 +38,55 @@ OFT preserves the hyperspherical energy by learning an orthogonal transformation height="1000" > +## Merge OFT weights into the base model + +Similar to LoRA, the weights learned by OFT can be integrated into the pretrained weight matrices using the [`~OFTModel.merge_and_unload`] function. This function merges the adapter weights with the base model which allows you to effectively use the newly merged model as a standalone model.
+ +## OFT Example Usage + +For using OFT for quantized finetuning with [TRL](https://github.com/huggingface/trl) for `SFT`, `PPO`, or `DPO` fine-tuning, follow the following outline: + +```py +from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig +from trl import SFTTrainer +from peft import OFTConfig + +if use_quantization: + bnb_config = BitsAndBytesConfig( + load_in_4bit=True, + bnb_4bit_quant_type="nf4", + bnb_4bit_compute_dtype=torch.bfloat16, + bnb_4bit_use_double_quant=True, + bnb_4bit_quant_storage=torch.bfloat16, + ) + +model = AutoModelForCausalLM.from_pretrained( + "model_name", + quantization_config=bnb_config +) +tokenizer = AutoTokenizer.from_pretrained("model_name") + +# Configure OFT +peft_config = OFTConfig( + oft_block_size=32, + use_cayley_neumann=True, + target_modules="all-linear", + bias="none", + task_type="CAUSAL_LM" +) + +trainer = SFTTrainer( + model=model, + train_dataset=ds['train'], + peft_config=peft_config, + processing_class=tokenizer, + args=training_arguments, + data_collator=collator, +) + +trainer.train() +``` + # API ## OFTConfig diff --git a/src/peft/tuners/boft/config.py b/src/peft/tuners/boft/config.py index 1715cc5bc6..9fdf4b7eb4 100644 --- a/src/peft/tuners/boft/config.py +++ b/src/peft/tuners/boft/config.py @@ -30,9 +30,19 @@ class BOFTConfig(PeftConfig): This is the configuration class to store the configuration of a [`BOFTModel`]. Args: - boft_block_size (`int`): BOFT block size across different layers. - boft_block_num (`int`): Number of BOFT blocks per injected layer. - boft_n_butterfly_factor (`int`): Number of butterfly factors across different layers. + boft_block_size (`int`): BOFT matrix block size across different layers, expressed in `int`. Bigger + block sizes result in more dense update matrices with more trainable parameters. Choose `boft_block_size` + to be a divisor of most layers' input dimension (`in_features`), e.g., 4, 8, 16.
Also, please only specify + either `boft_block_size` or `boft_block_num`, but not both simultaneously or leaving both to 0, because + `boft_block_size` x `boft_block_num` must equal the layer's input dimension. + boft_block_num (`int`): Number of BOFT blocks per injected layer. Bigger `boft_block_num` results in sparser + update matrices with **fewer** trainable parameters. **Note**, please choose `boft_block_num` to be + a divisor of most layers' input dimension (`in_features`), e.g., 4, 8, 16. Only specify either + `boft_block_size` or `boft_block_num`, but not both simultaneously or leaving both to 0, because + `boft_block_size` x `boft_block_num` must equal the layer's input dimension. + boft_n_butterfly_factor (`int`): Number of butterfly factors across different layers. For + `boft_n_butterfly_factor=1`, BOFT is the same as vanilla OFT, for `boft_n_butterfly_factor=2`, the + effective block size of OFT becomes twice as big and the number of blocks becomes half. target_modules (`Union[List[str],str]`): The names of the modules to apply the adapter to. exclude_modules (`Optional[Union[List[str], str]]`): The names of the modules to not apply the adapter. When passing a string, a regex match will be performed. diff --git a/src/peft/tuners/oft/config.py b/src/peft/tuners/oft/config.py index 9c62e1bece..fd4990afa8 100644 --- a/src/peft/tuners/oft/config.py +++ b/src/peft/tuners/oft/config.py @@ -30,8 +30,21 @@ class OFTConfig(PeftConfig): This is the configuration class to store the configuration of a [`OFTModel`]. Args: - r (`int`): OFT rank, number of OFT blocks per injected layer. - oft_block_size (`int`): OFT block size across different layers. + r (`int`): OFT rank, number of OFT blocks per injected layer. Bigger `r` results in more sparse update matrices + with fewer trainable parameters. You can only specify either `r` or `oft_block_size`, but not both + simultaneously, because `r` × `oft_block_size` = layer dimension.
For simplicity, we let you specify either + `r` or `oft_block_size` and infer the other one. Default set to `r = 0`, the user is advised to set the + `oft_block_size` instead for better clarity. + oft_block_size (`int`): OFT block size across different layers. Bigger `oft_block_size` results in more dense + update matrices with more trainable parameters. Choose `oft_block_size` to be a divisor of layer's input + dimension (`in_features`), e.g., 4, 8, 16. You can only specify either `r` or `oft_block_size`, but not + both simultaneously, because `r` × `oft_block_size` = layer dimension. For simplicity, we let you specify + either `r` or `oft_block_size` and infer the other one. Default set to `oft_block_size = 32`. + use_cayley_neumann (bool): Specifies whether to use the Cayley-Neumann parameterization (efficient but + approximate) or the vanilla Cayley parameterization (exact but computationally expensive because of + matrix inverse). We recommend to set it to `True` for better efficiency, but performance may be slightly + worse because of the approximation error. Please test both settings (`True` and `False`) depending on + your needs. Default is `False`. module_dropout (`float`): The multiplicative dropout probability, by setting OFT blocks to identity during training, similar to the dropout layer in LoRA.
From 65f2f2b010f6739406ee331b364c29972553bec8 Mon Sep 17 00:00:00 2001 From: nemo Date: Thu, 4 Jun 2026 13:00:35 +0200 Subject: [PATCH 12/33] add training efficiency guide --- docs/source/_toctree.yml | 7 +-- .../memory_efficient_training.md | 46 +++++++++++++++++++ docs/source/methods/overview.md | 6 +-- docs/source/package_reference/boft.md | 2 +- 4 files changed, 52 insertions(+), 9 deletions(-) create mode 100644 docs/source/developer_guides/memory_efficient_training.md diff --git a/docs/source/_toctree.yml b/docs/source/_toctree.yml index 8b2c99cfd3..6ede76556e 100644 --- a/docs/source/_toctree.yml +++ b/docs/source/_toctree.yml @@ -19,6 +19,8 @@ - local: accelerate/fsdp title: Fully Sharded Data Parallel title: Distributed Training + - local: developer_guides/memory_efficient_training + title: Memory Efficient Training - local: developer_guides/model_merging title: Model merging - local: developer_guides/quantization @@ -134,11 +136,6 @@ title: Adapters -- title: Conceptual guides - sections: - - local: conceptual_guides/oft - title: OFT/BOFT - - sections: - sections: - local: package_reference/auto_class diff --git a/docs/source/developer_guides/memory_efficient_training.md b/docs/source/developer_guides/memory_efficient_training.md new file mode 100644 index 0000000000..f15c82353f --- /dev/null +++ b/docs/source/developer_guides/memory_efficient_training.md @@ -0,0 +1,46 @@ + + +# Memory Efficient Training + +🤗 PEFT provides you with methods for parameter efficient fine-tuning but that doesn't mean that your training process is memory efficient. This guide is a collection of tips that you can use to improve memory efficiency of your training process. This guide is mostly an overview page that will link you to the respective other guides and offer some tips for specific situations. + +## Choosing the right method + +Not every PEFT method is built equally and some formulations are easier to build in a memory efficient manner. 
If you are on a memory budget it makes sense to check out the [PEFT method comparison suite](https://huggingface.co/spaces/peft-internal-testing/PEFT-method-comparison) and filter for **maximum** accelerator memory usage. Average accelerator memory usage can be fairly equal across methods but not every method scales equally with activations and sequence length, and some are more prone to memory spikes than others. + +## Chunked NLL loss + +Using [`NLLLoss`](https://docs.pytorch.org/docs/stable/generated/torch.nn.NLLLoss.html) is very common when training language models (or classification tasks, for that matter) but it is usually computed in one go, meaning you will allocate a matrix of size `batch × sequence × vocabulary`. With particularly long sequences or large vocabularies this can get expensive fast. + +When using [TRL](https://github.com/huggingface/trl) you can either use the [Liger kernel integration](https://huggingface.co/docs/trl/liger_kernel_integration) or use [Chunked NLLLoss](https://huggingface.co/docs/trl/v1.5.1/en/reducing_memory_usage#chunked-cross-entropy-for-reducing-peak-memory-usage). The latter will split the sequence into chunks of size 256 to keep the maximum memory consumption constant. + +![NLL vs.
Chunked NLL comparison](https://private-user-images.githubusercontent.com/45557362/585095978-246efe41-a013-4597-8b0b-b38b83c71bb5.png?jwt=eyJ0eXAiOiJKV1QiLCJhbGciOiJIUzI1NiJ9.eyJpc3MiOiJnaXRodWIuY29tIiwiYXVkIjoicmF3LmdpdGh1YnVzZXJjb250ZW50LmNvbSIsImtleSI6ImtleTUiLCJleHAiOjE3ODA1NjkzMDUsIm5iZiI6MTc4MDU2OTAwNSwicGF0aCI6Ii80NTU1NzM2Mi81ODUwOTU5NzgtMjQ2ZWZlNDEtYTAxMy00NTk3LThiMGItYjM4YjgzYzcxYmI1LnBuZz9YLUFtei1BbGdvcml0aG09QVdTNC1ITUFDLVNIQTI1NiZYLUFtei1DcmVkZW50aWFsPUFLSUFWQ09EWUxTQTUzUFFLNFpBJTJGMjAyNjA2MDQlMkZ1cy1lYXN0LTElMkZzMyUyRmF3czRfcmVxdWVzdCZYLUFtei1EYXRlPTIwMjYwNjA0VDEwMzAwNVomWC1BbXotRXhwaXJlcz0zMDAmWC1BbXotU2lnbmF0dXJlPTRkMGQyZDMwMzkyMTk5MTE2MTBjM2U2OWM3OTUwZjZiMWYzOTJiOWY1OWYwM2RhZjFiYTVmZmE1OGE2ZmVhYmUmWC1BbXotU2lnbmVkSGVhZGVycz1ob3N0JnJlc3BvbnNlLWNvbnRlbnQtdHlwZT1pbWFnZSUyRnBuZyJ9.nfeb11_ztlkQbvDwRGOl7vgbvskx_TgdyKeVE4CoYPg) + +In case the default chunk size is not optimal for your setting, look in the [original TRL PR](https://github.com/huggingface/trl/pull/5575) for more information on how to tune the chunk size. + +## Quantization + +Quantization is one of the best ways to reduce memory consumption *of the base model* and will, depending on the employed quantization, also reduce activation memory. Since the PEFT methods will only take up a small portion of the total number of parameters, PEFT defaults to using a higher precision than the base model. This can also have the effect that adapters can mitigate some of the quality loss incurred by quantization methods. Read the [PEFT quantization guide](quantization). + +## Compilation + +The models we train are composed of operations like matrix multiplications, sums and assignments where each operation produces a new result and, subsequently, needs to take up memory. If those intermediate results are not needed we can fuse these operations and save memory. This is just one of many optimizations that `torch.compile` can do for you, so check out the [PEFT torch.compile guide](torch_compile).
+ +## Gradient Checkpointing + +You can trade computation for memory by storing only a subset of the intermediate activations during the forward pass and recomputing the rest on the fly during the backward pass. Check out the [gradient checkpointing](https://huggingface.co/docs/transformers/grad_checkpointing) documentation of Transformers to learn more. + diff --git a/docs/source/methods/overview.md b/docs/source/methods/overview.md index 54e8fd564f..d733842928 100644 --- a/docs/source/methods/overview.md +++ b/docs/source/methods/overview.md @@ -24,10 +24,10 @@ models, especially language models, you can follow these steps: 1. use prompting (few-shot examples in the prompt) to see if the model is already capable of the task. If the model solves your problem, great! You can - now use [Prompt-based methods](#Prompt-based methods) to learn the prompt and + now use [Prompt-based methods](#prompt-based-methods) to learn the prompt and save precious tokens. -2. If prompt-based methods are not sufficient you can use [layer tuning](#Layer tuning) - and [adapter methods](#Adapter methods). These methods are generally +2. If prompt-based methods are not sufficient you can use [layer tuning](#layer-tuning) + and [adapter methods](#adapter-methods). These methods are generally more expressive than prompt-based methods and get closer to full-finetuning. 3. Make sure to measure retention of already learnt knowledge since each fine-tuning step is potentially unlearning past knowledege. diff --git a/docs/source/package_reference/boft.md b/docs/source/package_reference/boft.md index b5a54de0a6..0e7be442f2 100644 --- a/docs/source/package_reference/boft.md +++ b/docs/source/package_reference/boft.md @@ -1,4 +1,4 @@ -