diff --git a/docs/source/_redirects.yml b/docs/source/_redirects.yml new file mode 100644 index 0000000000..7408cd064f --- /dev/null +++ b/docs/source/_redirects.yml @@ -0,0 +1,3 @@ +conceptual_guides/adapter: package_reference/lora +conceptual_guides/ia3: package_reference/ia3 +developer_guides/lora: package_reference/lora diff --git a/docs/source/_toctree.yml b/docs/source/_toctree.yml index a9db3fa19a..05c9c9d248 100644 --- a/docs/source/_toctree.yml +++ b/docs/source/_toctree.yml @@ -7,36 +7,30 @@ - local: install title: Installation -- title: Tutorial +- title: Guides sections: - - local: tutorial/peft_model_config + - local: guides/peft_model_config title: Configurations and models - - local: tutorial/peft_integrations + - local: guides/peft_integrations title: Integrations - -- title: PEFT method guides - sections: - - local: task_guides/prompt_based_methods - title: Prompt-based methods - - local: task_guides/lora_based_methods - title: LoRA methods - - local: task_guides/ia3 - title: IA3 - -- title: Developer guides - sections: + - sections: + - local: accelerate/deepspeed + title: DeepSpeed + - local: accelerate/fsdp + title: Fully Sharded Data Parallel + title: Distributed Training + - local: developer_guides/memory_efficient_training + title: Memory Efficient Training - local: developer_guides/model_merging title: Model merging - local: developer_guides/quantization title: Quantization - - local: developer_guides/lora - title: LoRA - local: developer_guides/custom_models title: Custom models - local: developer_guides/low_level_api title: Adapter injection - local: developer_guides/mixed_models - title: Mixed adapter types + title: Mixing PEFT methods - local: developer_guides/torch_compile title: torch.compile - local: developer_guides/contributing @@ -46,23 +40,102 @@ - local: developer_guides/checkpoint title: PEFT checkpoint format -- title: 🤗 Accelerate integrations - sections: - - local: accelerate/deepspeed - title: DeepSpeed - - local: 
accelerate/fsdp - title: Fully Sharded Data Parallel - -- title: Conceptual guides +- title: Methods sections: - - local: conceptual_guides/adapter - title: Adapters - - local: conceptual_guides/prompting - title: Soft prompts - - local: conceptual_guides/ia3 - title: IA3 - - local: conceptual_guides/oft - title: OFT/BOFT + - local: methods/overview + title: Overview + - sections: + - local: package_reference/layernorm_tuning + title: LayerNorm Tuning + - local: package_reference/trainable_tokens + title: Trainable Tokens + title: Layer Tuning + - sections: + - local: package_reference/cartridges + title: Cartridges + - local: package_reference/cpt + title: CPT + - local: package_reference/llama_adapter + title: Llama-Adapter + - local: package_reference/p_tuning + title: P-Tuning + - local: package_reference/prefix_tuning + title: Prefix tuning + - local: package_reference/prompt_tuning + title: Prompt tuning + title: Soft Prompting + - sections: + - local: package_reference/adalora + title: AdaLoRA + - local: package_reference/adamss + title: AdaMSS + - local: package_reference/beft + title: BEFT + - local: package_reference/boft + title: BOFT + - local: package_reference/c3a + title: C3A + - local: package_reference/delora + title: DeLoRA + - local: package_reference/fourierft + title: FourierFT + - local: package_reference/gralora + title: GraLoRA + - local: package_reference/hira + title: HiRA + - local: package_reference/hra + title: HRA + - local: package_reference/ia3 + title: IA3 + - local: package_reference/lily + title: Lily + - local: package_reference/loha + title: LoHa + - local: package_reference/lokr + title: LoKr + - sections: + - local: package_reference/lora + title: LoRA + - sections: + - local: package_reference/lora_variant_dora + title: DoRA + - local: package_reference/lora_variant_monteclora + title: MonteCLoRA + - local: package_reference/lora_variant_velora + title: VeLoRA + title: Variants + title: LoRA + - local: package_reference/miss 
+ title: MiSS + - local: package_reference/oft + title: OFT + - local: package_reference/osf + title: OSF + - local: package_reference/peanut + title: PEANuT + - local: package_reference/poly + title: Polytropon + - local: package_reference/psoft + title: PSOFT + - local: package_reference/pvera + title: PVeRA + - local: package_reference/randlora + title: RandLora + - local: package_reference/road + title: RoAd + - local: package_reference/shira + title: SHiRA + - local: package_reference/tinylora + title: TinyLoRA + - local: package_reference/vblora + title: VB-LoRA + - local: package_reference/vera + title: VeRA + - local: package_reference/waveft + title: WaveFT + - local: package_reference/xlora + title: X-LoRA + title: Adapters - sections: - sections: diff --git a/docs/source/conceptual_guides/adapter.md b/docs/source/conceptual_guides/adapter.md deleted file mode 100644 index 825df1abac..0000000000 --- a/docs/source/conceptual_guides/adapter.md +++ /dev/null @@ -1,136 +0,0 @@ - - -# Adapters - -Adapter-based methods add extra trainable parameters after the attention and fully-connected layers of a frozen pretrained model to reduce memory-usage and speed up training. The method varies depending on the adapter, it could simply be an extra added layer or it could be expressing the weight updates ∆W as a low-rank decomposition of the weight matrix. Either way, the adapters are typically small but demonstrate comparable performance to a fully finetuned model and enable training larger models with fewer resources. - -This guide will give you a brief overview of the adapter methods supported by PEFT (if you're interested in learning more details about a specific method, take a look at the linked paper). - -## Low-Rank Adaptation (LoRA) - -> [!TIP] -> LoRA is one of the most popular PEFT methods and a good starting point if you're just getting started with PEFT. 
It was originally developed for large language models but it is a tremendously popular training method for diffusion models because of its efficiency and effectiveness. - -As mentioned briefly earlier, [LoRA](https://hf.co/papers/2106.09685) is a technique that accelerates finetuning large models while consuming less memory. - -LoRA represents the weight updates ∆W with two smaller matrices (called *update matrices*) through low-rank decomposition. These new matrices can be trained to adapt to the new data while keeping the overall number of parameters low. The original weight matrix remains frozen and doesn't receive any further updates. To produce the final results, the original and extra adapted weights are combined. You could also merge the adapter weights with the base model to eliminate inference latency. - -
- -
- -This approach has a number of advantages: - -* LoRA makes finetuning more efficient by drastically reducing the number of trainable parameters. -* The original pretrained weights are kept frozen, which means you can have multiple lightweight and portable LoRA models for various downstream tasks built on top of them. -* LoRA is orthogonal to other parameter-efficient methods and can be combined with many of them. -* Performance of models finetuned using LoRA is comparable to the performance of fully finetuned models. - -In principle, LoRA can be applied to any subset of weight matrices in a neural network to reduce the number of trainable parameters. However, for simplicity and further parameter efficiency, LoRA is typically only applied to the attention blocks in Transformer models. The resulting number of trainable parameters in a LoRA model depends on the size of the update matrices, which is determined mainly by the rank `r` and the shape of the original weight matrix. - -
- -
-Navigating Text-To-Image Customization: From LyCORIS Fine-Tuning to Model Evaluation - -## Mixture of LoRA Experts (X-LoRA) - -[X-LoRA](https://huggingface.co/papers/2402.07148) is a mixture of experts method for LoRA which works by using dense or sparse gating to dynamically activate LoRA experts. The LoRA experts as well as the base model are frozen during training, resulting in a low parameter count as only the gating layers must be trained. In particular, the gating layers output scalings which (depending on config) are granular on the layer and token level. Additionally, during inference, X-LoRA dynamically activates LoRA adapters to recall knowledge and effectively mix them: - -The below graphic demonstrates how the scalings change for different prompts for each token. This highlights the activation of different adapters as the generation progresses and the sequence creates new context. - -![Token-by-token scalings](https://github.com/EricLBuehler/xlora/raw/master/res/token_by_token_scalings.gif) - -For each step, X-LoRA requires the base model to be run twice: first, to get hidden states without any LoRA adapters, and secondly, the hidden states are used to calculate scalings which are applied to the LoRA adapters and the model is run a second time. The output of the second run is the result of the model step. - -Ultimately, X-LoRA allows the model to reflect upon its knowledge because of the dual forward pass scheme, and dynamically reconfigure the architecture. - -## Low-Rank Hadamard Product (LoHa) - -Low-rank decomposition can impact performance because the weight updates are limited to the low-rank space, which can constrain a model's expressiveness. However, you don't necessarily want to use a larger rank because it increases the number of trainable parameters. 
To address this, [LoHa](https://huggingface.co/papers/2108.06098) (a method originally developed for computer vision) was applied to diffusion models where the ability to generate diverse images is an important consideration. LoHa should also work with general model types, but the embedding layers aren't currently implemented in PEFT. - -LoHa uses the [Hadamard product](https://en.wikipedia.org/wiki/Hadamard_product_(matrices)) (element-wise product) instead of the matrix product. ∆W is represented by four smaller matrices instead of two - like in LoRA - and each pair of these low-rank matrices are combined with the Hadamard product. As a result, ∆W can have the same number of trainable parameters but a higher rank and expressivity. - -## Low-Rank Kronecker Product (LoKr) - -[LoKr](https://hf.co/papers/2309.14859) is very similar to LoRA and LoHa, and it is also mainly applied to diffusion models, though you could also use it with other model types. LoKr replaces the matrix product with the [Kronecker product](https://en.wikipedia.org/wiki/Kronecker_product) instead. The Kronecker product decomposition creates a block matrix which preserves the rank of the original weight matrix. Another benefit of the Kronecker product is that it can be vectorized by stacking the matrix columns. This can speed up the process because you're avoiding fully reconstructing ∆W. - -## Orthogonal Finetuning (OFT) - -
- -
-Controlling Text-to-Image Diffusion by Orthogonal Finetuning - -[OFT](https://hf.co/papers/2306.07280) is a method that primarily focuses on preserving a pretrained model's generative performance in the finetuned model. It tries to maintain the same cosine similarity (hyperspherical energy) between all pairwise neurons in a layer because this better captures the semantic information among neurons. This means OFT is more capable at preserving the subject and it is better for controllable generation (similar to [ControlNet](https://huggingface.co/docs/diffusers/using-diffusers/controlnet)). - -OFT preserves the hyperspherical energy by learning an orthogonal transformation for neurons to keep the cosine similarity between them unchanged. In practice, this means taking the matrix product of an orthogonal matrix with the pretrained weight matrix. However, to be parameter-efficient, the orthogonal matrix is represented as a block-diagonal matrix with rank `r` blocks. Whereas LoRA reduces the number of trainable parameters with low-rank structures, OFT reduces the number of trainable parameters with a sparse block-diagonal matrix structure. - -## Orthogonal Butterfly (BOFT) - -[BOFT](https://hf.co/papers/2311.06243) is an improved orthogonal finetuning method that focuses on preserving a pretrained model's generative capabilities while being significantly more parameter-efficient than standard OFT. Like OFT, BOFT maintains the same cosine similarity (hyperspherical energy) between all pairwise neurons in a layer by applying an orthogonal transformation to the pretrained weight matrix, ensuring the semantic relationships among neurons are preserved. - -Instead of using a block-diagonal orthogonal matrix, BOFT factorizes the orthogonal transformation into a product of **sparse butterfly matrices** (originally introduced in the [Cooley–Tukey FFT](https://en.wikipedia.org/wiki/Cooley%E2%80%93Tukey_FFT_algorithm)). 
Unlike OFT's block-diagonal rotations, which only mix inputs within each block, the butterfly structure guarantees that every input can influence every output, producing a **dense connectivity** with just `O(d log d)` parameters. This factorization preserves expressivity while drastically reducing the parameter count compared to OFT (at the expense of computation time). - -In practice, BOFT multiplies each pretrained weight matrix by a sequence of butterfly-structured orthogonal factors, enabling efficient and expressive neuron rotations. This makes BOFT well-suited for controllable generation and tasks where maintaining the pretrained model's subject representation is critical, while also scaling to larger models with lower memory and compute overhead. - -## Adaptive Low-Rank Adaptation (AdaLoRA) - -[AdaLoRA](https://hf.co/papers/2303.10512) manages the parameter budget introduced from LoRA by allocating more parameters - in other words, a higher rank `r` - for important weight matrices that are better adapted for a task and pruning less important ones. The rank is controlled by a method similar to singular value decomposition (SVD). The ∆W is parameterized with two orthogonal matrices and a diagonal matrix which contains singular values. This parametrization method avoids iteratively applying SVD which is computationally expensive. Based on this method, the rank of ∆W is adjusted according to an importance score. ∆W is divided into triplets and each triplet is scored according to its contribution to model performance. Triplets with low importance scores are pruned and triplets with high importance scores are kept for finetuning. - -Training with AdaLoRA has three phases: the init phase, the budgeting phase and the final phase. In the initial phase, no budgeting is applied, therefore the ranks are not touched. 
During the budgeting phase the process described above is applied and the rank is redistributed according to a budget, aiming to give more important adapters more rank and less important layers less. When reaching the final phase, budgeting has ended, the ranks are redistributed but we may continue training for a while with the redistributed ranks to further improve performance. - -## Llama-Adapter - -[Llama-Adapter](https://hf.co/papers/2303.16199) is a method for adapting Llama into an instruction-following model. To help adapt the model for instruction-following, the adapter is trained with a 52K instruction-output dataset. - -A set of learnable adaption prompts are prefixed to the input instruction tokens. These are inserted into the upper layers of the model because it is better to learn with the higher-level semantics of the pretrained model. The instruction-output tokens prefixed to the input guide the adaption prompt to generate a contextual response. - -
- -
-LLaMA-Adapter: Efficient Fine-tuning of Language Models with Zero-init Attention - -To avoid adding noise to the tokens, the adapter uses zero-initialized attention. On top of this, the adapter adds a learnable gating factor (initialized with zeros) to progressively add information to the model during training. This prevents overwhelming the model's pretrained knowledge with the newly learned instructions. - -## Householder Reflection Adaptation (HRA) - -[HRA](https://huggingface.co/papers/2405.17484) provides a new perspective connecting LoRA to OFT, which means it can harness the advantages of both strategies, reduce parameters and computation costs while penalizing the loss of pre-training knowledge. - -
- -
-Bridging The Gap between Low-rank and Orthogonal Adaptation via Householder Reflection Adaptation - -HRA constructs a chain of `r` trainable Householder reflections (HRs). Because the Householder reflection matrix is an orthogonal matrix and the product of orthogonal matrices is also an orthogonal matrix, HRA satisfies the theoretical guarantee of Orthogonal Finetuning (OFT). Meanwhile, HRA can also be viewed as a low-rank fine-tuning adapter by rewriting formula. - -The higher `r`, the more trainable parameters, resulting in a larger model capacity and better performance. Besides, due to the chain structure, the orthogonality of HR planes impacts the capacity and regularity of HRA. To achieve a trade-off between the model capacity and regularity, an orthogonality regularizer of the HR planes is added to the loss function. The weight \\(\lambda\\) can control the strength of the regularizer. - -## Bone - -Bone was deprecated and removed in PEFT v0.19.0 in favor of [MiSS](https://huggingface.co/papers/2409.15371) (new version of paper: "MiSS: Balancing LoRA Performance and Efficiency with Simple Shard Sharing"). If you already have a Bone checkpoint, you can use `/scripts/convert-bone-to-miss.py` to convert it into a MiSS checkpoint and proceed with training using MiSS. - -## MiSS -[MiSS](https://github.com/Joluck/MiSS) Matrix Shard Sharing is a novel Parameter-Efficient Fine-Tuning (PEFT) method designed to address the trade-off between adaptability and efficiency in Large Language Models. The core approach of MiSS involves a simple shard-sharing mechanism. It achieves low-rank adaptation by decomposing a weight matrix into multiple fragments and then utilizing a shared, trainable "common fragment." The final low-rank update matrix is constructed by replicating these shared, partitioned shards. 
(MiSS is a novel PEFT method that adopts a low-rank structure, requires only a single trainable matrix, and introduces a new update mechanism distinct from LoRA, achieving an excellent balance between performance and efficiency.) - -MiSS: Revisiting the Trade-off in LoRA with an Efficient Shard-Sharing Structure - -Intuitively, the shape of a single trainable matrix in MiSS is consistent with `lora_B`, so the `r` parameter in MiSS is less than the `r` in LoRA by (`in_feature * r`). - -Note: Bat's r (b) is special and requires that weight W satisfies the conditions `in_features % r == 0` and `out_features % r == 0`. Additionally, when `in_features == out_features` and MiSS-r equals LoRA-r, MiSS's number of trainable parameters is only half that of LoRA. - -Although the nonlinear updates of Bat bring some performance improvements, they also increase computational overhead. Its main purpose is to provide researchers with a direction for improvement. Therefore, we recommend fine-tuning the comprehensive MiSS model instead. diff --git a/docs/source/conceptual_guides/ia3.md b/docs/source/conceptual_guides/ia3.md deleted file mode 100644 index 92daaac105..0000000000 --- a/docs/source/conceptual_guides/ia3.md +++ /dev/null @@ -1,68 +0,0 @@ - - -# IA3 - -This conceptual guide gives a brief overview of [IA3](https://huggingface.co/papers/2205.05638), a parameter-efficient fine tuning technique that is -intended to improve over [LoRA](./lora). - -To make fine-tuning more efficient, IA3 (Infused Adapter by Inhibiting and Amplifying Inner Activations) -rescales inner activations with learned vectors. These learned vectors are injected in the attention and feedforward modules -in a typical transformer-based architecture. These learned vectors are the only trainable parameters during fine-tuning, and thus the original -weights remain frozen. 
Dealing with learned vectors (as opposed to learned low-rank updates to a weight matrix like LoRA) -keeps the number of trainable parameters much smaller. - -Being similar to LoRA, IA3 carries many of the same advantages: - -* IA3 makes fine-tuning more efficient by drastically reducing the number of trainable parameters. (For T0, an IA3 model only has about 0.01% trainable parameters, while even LoRA has > 0.1%) -* The original pre-trained weights are kept frozen, which means you can have multiple lightweight and portable IA3 models for various downstream tasks built on top of them. -* Performance of models fine-tuned using IA3 is comparable to the performance of fully fine-tuned models. -* IA3 does not add any inference latency because adapter weights can be merged with the base model. - -In principle, IA3 can be applied to any subset of weight matrices in a neural network to reduce the number of trainable -parameters. Following the authors' implementation, IA3 weights are added to the key, value and feedforward layers -of a Transformer model. To be specific, for transformer models, IA3 weights are added to the outputs of key and value layers, and to the input of the second feedforward layer -in each transformer block. - -Given the target layers for injecting IA3 parameters, the number of trainable parameters -can be determined based on the size of the weight matrices. - - -## Common IA3 parameters in PEFT - -As with other methods supported by PEFT, to fine-tune a model using IA3, you need to: - -1. Instantiate a base model. -2. Create a configuration (`IA3Config`) where you define IA3-specific parameters. -3. Wrap the base model with `get_peft_model()` to get a trainable `PeftModel`. -4. Train the `PeftModel` as you normally would train the base model. - -`IA3Config` allows you to control how IA3 is applied to the base model through the following parameters: - -- `target_modules`: The modules (for example, attention blocks) to apply the IA3 vectors. 
-- `feedforward_modules`: The list of modules to be treated as feedforward layers in `target_modules`. While learned vectors are multiplied with -the output activation for attention blocks, the vectors are multiplied with the input for classic feedforward layers. Note that `feedforward_modules` must be a subset of `target_modules`. -- `modules_to_save`: List of modules apart from IA3 layers to be set as trainable and saved in the final checkpoint. These typically include model's custom head that is randomly initialized for the fine-tuning task. - -## Example Usage - -For the task of sequence classification, one can initialize the IA3 config for a Llama model as follows: - -```py -peft_config = IA3Config( - task_type=TaskType.SEQ_CLS, target_modules=["k_proj", "v_proj", "down_proj"], feedforward_modules=["down_proj"] -) -``` \ No newline at end of file diff --git a/docs/source/conceptual_guides/oft.md b/docs/source/conceptual_guides/oft.md deleted file mode 100644 index f7cd21054b..0000000000 --- a/docs/source/conceptual_guides/oft.md +++ /dev/null @@ -1,165 +0,0 @@ - - -# Orthogonal Finetuning (OFT and BOFT) - -This conceptual guide gives a brief overview of [OFT](https://huggingface.co/papers/2306.07280), [OFTv2](https://huggingface.co/papers/2506.19847) and [BOFT](https://huggingface.co/papers/2311.06243), a parameter-efficient fine-tuning technique that utilizes an orthogonal matrix to multiplicatively transform the pretrained weight matrices. - -To achieve efficient fine-tuning, OFT represents the weight updates with an orthogonal transformation. The orthogonal transformation is parameterized by an orthogonal matrix multiplied to the pretrained weight matrix. These new matrices can be trained to adapt to the new data while keeping the overall number of changes low. The original weight matrix remains frozen and doesn't receive any further adjustments. To produce the final results, both the original and the adapted weights are multiplied together. 
- -Orthogonal Butterfly (BOFT) generalizes OFT with Butterfly factorization and further improves its parameter efficiency and finetuning flexibility. In short, OFT can be viewed as a special case of BOFT. Different from LoRA that uses additive low-rank weight updates, BOFT uses multiplicative orthogonal weight updates. The comparison is shown below. - -
- -
- - -BOFT has some advantages compared to LoRA: - -* BOFT proposes a simple yet generic way to finetune pretrained models to downstream tasks, yielding a better preservation of pretraining knowledge and a better parameter efficiency. -* Through the orthogonality, BOFT introduces a structural constraint, i.e., keeping the [hyperspherical energy](https://huggingface.co/papers/1805.09298) unchanged during finetuning. This can effectively reduce the forgetting of pretraining knowledge. -* BOFT uses the butterfly factorization to efficiently parameterize the orthogonal matrix, which yields a compact yet expressive learning space (i.e., hypothesis class). -* The sparse matrix decomposition in BOFT brings in additional inductive biases that are beneficial to generalization. - -In principle, BOFT can be applied to any subset of weight matrices in a neural network to reduce the number of trainable parameters. Given the target layers for injecting BOFT parameters, the number of trainable parameters can be determined based on the size of the weight matrices. - -## Merge OFT/BOFT weights into the base model - -Similar to LoRA, the weights learned by OFT/BOFT can be integrated into the pretrained weight matrices using the merge_and_unload() function. This function merges the adapter weights with the base model which allows you to effectively use the newly merged model as a standalone model. - -
- -
- -This works because during training, the orthogonal weight matrix (R in the diagram above) and the pretrained weight matrices are separate. But once training is complete, these weights can actually be merged (multiplied) into a new weight matrix that is equivalent. - -## Utils for OFT / BOFT - -### Common OFT / BOFT parameters in PEFT - -As with other methods supported by PEFT, to fine-tune a model using OFT or BOFT, you need to: - -1. Instantiate a base model. -2. Create a configuration (`OFTConfig` or `BOFTConfig`) where you define OFT/BOFT-specific parameters. -3. Wrap the base model with `get_peft_model()` to get a trainable `PeftModel`. -4. Train the `PeftModel` as you normally would train the base model. - - -### OFT-specific parameters - -`OFTConfig` allows you to control how OFT is applied to the base model through the following parameters: - -- `r`: OFT rank, number of OFT blocks per injected layer. **Bigger** `r` results in more sparse update matrices with **fewer** trainable parameters. **Note**: You can only specify either `r` or `oft_block_size`, but not both simultaneously, because `r` × `oft_block_size` = layer dimension. For simplicity, we let the user specify either `r` or `oft_block_size` and infer the other one. Default set to `r = 0`, the user is advised to set the `oft_block_size` instead for better clarity. -- `oft_block_size`: OFT block size across different layers. **Bigger** `oft_block_size` results in more dense update matrices with **more** trainable parameters. **Note**: Please choose `oft_block_size` to be a divisor of the layer's input dimension (`in_features`), e.g., 4, 8, 16. You can only specify either `r` or `oft_block_size`, but not both simultaneously, because `r` × `oft_block_size` = layer dimension. For simplicity, we let the user specify either `r` or `oft_block_size` and infer the other one. Default set to `oft_block_size = 32`. 
-- `use_cayley_neumann`: Specifies whether to use the Cayley-Neumann parameterization (efficient but approximate) or the vanilla Cayley parameterization (exact but computationally expensive because of matrix inverse). We recommend setting it to `True` for better efficiency, but performance may be slightly worse because of the approximation error. Please test both settings (`True` and `False`) depending on your needs. Default is `False`. -- `module_dropout`: The multiplicative dropout probability, by setting OFT blocks to identity during training, similar to the dropout layer in LoRA. -- `bias`: specify if the `bias` parameters should be trained. Can be `"none"`, `"all"` or `"oft_only"`. -- `target_modules`: The modules (for example, attention blocks) to inject the OFT matrices. -- `modules_to_save`: List of modules apart from OFT matrices to be set as trainable and saved in the final checkpoint. These typically include model's custom head that is randomly initialized for the fine-tuning task. - -### BOFT-specific parameters - -`BOFTConfig` allows you to control how BOFT is applied to the base model through the following parameters: - -- `boft_block_size`: the BOFT matrix block size across different layers, expressed in `int`. **Bigger** `boft_block_size` results in more dense update matrices with **more** trainable parameters. **Note**, please choose `boft_block_size` to be a divisor of most layers' input dimension (`in_features`), e.g., 4, 8, 16. Also, please only -specify either `boft_block_size` or `boft_block_num`, but not both simultaneously or leaving both to 0, because `boft_block_size` x `boft_block_num` must equal the layer's input dimension. -- `boft_block_num`: the number of BOFT matrix blocks across different layers, expressed in `int`. **Bigger** `boft_block_num` results in sparser update matrices with **fewer** trainable parameters. **Note**, please choose `boft_block_num` to be a divisor of most layers' input dimension (`in_features`), e.g., 4, 8, 16. 
Also, please only -specify either `boft_block_size` or `boft_block_num`, but not both simultaneously or leaving both to 0, because `boft_block_size` x `boft_block_num` must equal the layer's input dimension. -- `boft_n_butterfly_factor`: the number of butterfly factors. **Note**, for `boft_n_butterfly_factor=1`, BOFT is the same as vanilla OFT, for `boft_n_butterfly_factor=2`, the effective block size of OFT becomes twice as big and the number of blocks become half. -- `bias`: specify if the `bias` parameters should be trained. Can be `"none"`, `"all"` or `"boft_only"`. -- `boft_dropout`: specify the probability of multiplicative dropout. -- `target_modules`: The modules (for example, attention blocks) to inject the OFT/BOFT matrices. -- `modules_to_save`: List of modules apart from OFT/BOFT matrices to be set as trainable and saved in the final checkpoint. These typically include model's custom head that is randomly initialized for the fine-tuning task. - - - -## OFT Example Usage - -For using OFT for quantized finetuning with [TRL](https://github.com/huggingface/trl) for `SFT`, `PPO`, or `DPO` fine-tuning, follow the following outline: - -```py -from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig -from trl import SFTTrainer -from peft import OFTConfig - -if use_quantization: - bnb_config = BitsAndBytesConfig( - load_in_4bit=True, - bnb_4bit_quant_type="nf4", - bnb_4bit_compute_dtype=torch.bfloat16, - bnb_4bit_use_double_quant=True, - bnb_4bit_quant_storage=torch.bfloat16, - ) - -model = AutoModelForCausalLM.from_pretrained( - "model_name", - quantization_config=bnb_config -) -tokenizer = AutoTokenizer.from_pretrained("model_name") - -# Configure OFT -peft_config = OFTConfig( - oft_block_size=32, - use_cayley_neumann=True, - target_modules="all-linear", - bias="none", - task_type="CAUSAL_LM" -) - -trainer = SFTTrainer( - model=model, - train_dataset=ds['train'], - peft_config=peft_config, - processing_class=tokenizer, - 
args=training_arguments, - data_collator=collator, -) - -trainer.train() -``` - - -## BOFT Example Usage - -For an example of the BOFT method application to various downstream tasks, please refer to the following guides: - -Take a look at the following step-by-step guides on how to finetune a model with BOFT: -- [Dreambooth finetuning with BOFT](https://github.com/huggingface/peft/blob/main/examples/boft_dreambooth/boft_dreambooth.md) -- [Controllable generation finetuning with BOFT (ControlNet)](https://github.com/huggingface/peft/blob/main/examples/boft_controlnet/boft_controlnet.md) - -For the task of image classification, one can initialize the BOFT config for a DinoV2 model as follows: - -```py -import transformers -from transformers import AutoModelForSeq2SeqLM, BOFTConfig -from peft import BOFTConfig, get_peft_model - -config = BOFTConfig( - boft_block_size=4, - boft_n_butterfly_factor=2, - target_modules=["query", "value", "key", "output.dense", "mlp.fc1", "mlp.fc2"], - boft_dropout=0.1, - bias="boft_only", - modules_to_save=["classifier"], -) - -model = transformers.Dinov2ForImageClassification.from_pretrained( - "facebook/dinov2-large", - num_labels=100, -) - -boft_model = get_peft_model(model, config) -``` diff --git a/docs/source/conceptual_guides/prompting.md b/docs/source/conceptual_guides/prompting.md deleted file mode 100644 index 733ffbf461..0000000000 --- a/docs/source/conceptual_guides/prompting.md +++ /dev/null @@ -1,93 +0,0 @@ - - -# Soft prompts - -Training large pretrained language models is very time-consuming and compute-intensive. As they continue to grow in size, there is increasing interest in more efficient training methods such as *prompting*. Prompting primes a frozen pretrained model for a specific downstream task by including a text prompt that describes the task or even demonstrates an example of the task. 
With prompting, you can avoid fully training a separate model for each downstream task, and use the same frozen pretrained model instead. This is a lot easier because you can use the same model for several different tasks, and it is significantly more efficient to train and store a smaller set of prompt parameters than to train all the model's parameters. - -There are two categories of prompting methods: - -- hard prompts are manually handcrafted text prompts with discrete input tokens; the downside is that it requires a lot of effort to create a good prompt -- soft prompts are learnable tensors concatenated with the input embeddings that can be optimized to a dataset; the downside is that they aren't human readable because you aren't matching these "virtual tokens" to the embeddings of a real word - -This conceptual guide provides a brief overview of the soft prompt methods included in 🤗 PEFT: prompt tuning, prefix tuning, P-tuning, and multitask prompt tuning. - -## Prompt tuning - -
- -
-Only train and store a significantly smaller set of task-specific prompt parameters (image source). - -[Prompt tuning](https://hf.co/papers/2104.08691) was developed for text classification tasks on T5 models, and all downstream tasks are cast as a text generation task. For example, sequence classification usually assigns a single class label to a sequence of text. By casting it as a text generation task, the tokens that make up the class label are *generated*. Prompts are added to the input as a series of tokens. Typically, the model parameters are fixed which means the prompt tokens are also fixed by the model parameters. - -The key idea behind prompt tuning is that prompt tokens have their own parameters that are updated independently. This means you can keep the pretrained model's parameters frozen, and only update the gradients of the prompt token embeddings. The results are comparable to the traditional method of training the entire model, and prompt tuning performance scales as model size increases. - -Take a look at [Prompt tuning for causal language modeling](../task_guides/clm-prompt-tuning) for a step-by-step guide on how to train a model with prompt tuning. - -## Prefix tuning - -
- -
-Optimize the prefix parameters for each task (image source). - -[Prefix tuning](https://hf.co/papers/2101.00190) was designed for natural language generation (NLG) tasks on GPT models. It is very similar to prompt tuning; prefix tuning also prepends a sequence of task-specific vectors to the input that can be trained and updated while keeping the rest of the pretrained model's parameters frozen. - -The main difference is that the prefix parameters are inserted in **all** of the model layers, whereas prompt tuning only adds the prompt parameters to the model input embeddings. The prefix parameters are also optimized by a separate feed-forward network (FFN) instead of training directly on the soft prompts because it causes instability and hurts performance. The FFN is discarded after updating the soft prompts. - -As a result, the authors found that prefix tuning demonstrates comparable performance to fully finetuning a model, despite having 1000x fewer parameters, and it performs even better in low-data settings. - -Take a look at [Prefix tuning for conditional generation](../task_guides/seq2seq-prefix-tuning) for a step-by-step guide on how to train a model with prefix tuning. - -## P-tuning - -
- -
-Prompt tokens can be inserted anywhere in the input sequence, and they are optimized by a prompt encoder (image source). - -[P-tuning](https://hf.co/papers/2103.10385) is designed for natural language understanding (NLU) tasks and all language models. -It is another variation of a soft prompt method; P-tuning also adds a trainable embedding tensor that can be optimized to find better prompts, and it uses a prompt encoder (a bidirectional long-short term memory network or LSTM) to optimize the prompt parameters. Unlike prefix tuning though: - -- the prompt tokens can be inserted anywhere in the input sequence, and it isn't restricted to only the beginning -- the prompt tokens are only added to the input instead of adding them to every layer of the model -- introducing *anchor* tokens can improve performance because they indicate characteristics of a component in the input sequence - -The results suggest that P-tuning is more efficient than manually crafting prompts, and it enables GPT-like models to compete with BERT-like models on NLU tasks. - -Take a look at [P-tuning for sequence classification](../task_guides/ptuning-seq-classification) for a step-by-step guide on how to train a model with P-tuning. - -## Multitask prompt tuning - -
- -
-Multitask prompt tuning enables parameter-efficient transfer learning. - -[Multitask prompt tuning (MPT)](https://hf.co/papers/2303.02861) learns a single prompt from data for multiple task types that can be shared for different target tasks. Other existing approaches learn a separate soft prompt for each task that need to be retrieved or aggregated for adaptation to target tasks. MPT consists of two stages: - -1. source training - for each task, its soft prompt is decomposed into task-specific vectors. The task-specific vectors are multiplied together to form another matrix W, and the Hadamard product is used between W and a shared prompt matrix P to generate a task-specific prompt matrix. The task-specific prompts are distilled into a single prompt matrix that is shared across all tasks. This prompt is trained with multitask training. -2. target adaptation - to adapt the single prompt for a target task, a target prompt is initialized and expressed as the Hadamard product of the shared prompt matrix and the task-specific low-rank prompt matrix. - -
- -
-Prompt decomposition. - - -## Context-Aware Prompt Tuning (CPT) - -
- -
-CPT optimizing only specific token embeddings while keeping the rest of the model frozen (image source). - -[Context-Aware Prompt Tuning (CPT)](https://huggingface.co/papers/2410.17222) is designed to enhance few-shot classification by refining only context embeddings. -This approach combines ideas from In-Context Learning (ICL), Prompt Tuning (PT), and adversarial optimization, focusing on making model adaptation both parameter-efficient and effective. -In CPT, only specific context token embeddings are optimized, while the rest of the model remains frozen. -To prevent overfitting and maintain stability, CPT uses controlled perturbations to limit the allowed changes to context embeddings within a defined range. -Additionally, to address the phenomenon of recency bias—where examples near the end of the context tend to be prioritized over earlier ones—CPT applies a decay loss factor. - -Take a look at [Example](https://github.com/huggingface/peft/blob/main/examples/cpt_finetuning/README.md) for a step-by-step guide on how to train a model with CPT. diff --git a/docs/source/developer_guides/custom_models.md b/docs/source/developer_guides/custom_models.md index b31f3be917..d6ad6e93de 100644 --- a/docs/source/developer_guides/custom_models.md +++ b/docs/source/developer_guides/custom_models.md @@ -18,7 +18,7 @@ rendered properly in your Markdown viewer. Some fine-tuning techniques, such as prompt tuning, are specific to language models. That means in 🤗 PEFT, it is assumed a 🤗 Transformers model is being used. However, other fine-tuning techniques - like -[LoRA](../conceptual_guides/lora) - are not restricted to specific model types. +[LoRA](../package_reference/lora) - are not restricted to specific model types. In this guide, we will see how LoRA can be applied to a multilayer perceptron, a computer vision model from the [timm](https://huggingface.co/docs/timm/index) library, or a new 🤗 Transformers architecture. 
diff --git a/docs/source/developer_guides/lora.md b/docs/source/developer_guides/lora.md deleted file mode 100644 index 58a4132506..0000000000 --- a/docs/source/developer_guides/lora.md +++ /dev/null @@ -1,1126 +0,0 @@ - - -# LoRA - -LoRA is a low-rank decomposition method to reduce the number of trainable parameters, which speeds up finetuning large models and uses less memory. In PEFT, using LoRA is as easy as setting up a [`LoraConfig`] and wrapping it with [`get_peft_model`] to create a trainable [`PeftModel`]. - -This guide explores in more detail other options and features for using LoRA. - -## Initialization - -The initialization of LoRA weights is controlled by the parameter `init_lora_weights` in [`LoraConfig`]. By default, PEFT initializes LoRA weights with Kaiming-uniform for weight A and zeros for weight B resulting in an identity transform (same as the reference [implementation](https://github.com/microsoft/LoRA)). - -It is also possible to pass `init_lora_weights="gaussian"`. As the name suggests, this initializes weight A with a Gaussian distribution and zeros for weight B (this is how [Diffusers](https://huggingface.co/docs/diffusers/index) initializes LoRA weights). - -```py -from peft import LoraConfig - -config = LoraConfig(init_lora_weights="gaussian", ...) -``` - -There is also an option to set `init_lora_weights=False` which is useful for debugging and testing. This should be the only time you use this option. When choosing this option, the LoRA weights are initialized such that they do *not* result in an identity transform. - -```py -from peft import LoraConfig - -config = LoraConfig(init_lora_weights=False, ...) -``` - -### PiSSA -[PiSSA](https://huggingface.co/papers/2404.02948) initializes the LoRA adapter using the principal singular values and singular vectors. This straightforward modification allows PiSSA to converge more rapidly than LoRA and ultimately attain superior performance.
Moreover, PiSSA reduces the quantization error compared to QLoRA, leading to further enhancements. - -Configure the initialization method to "pissa", which may take several minutes to execute SVD on the pre-trained model: -```python -from peft import LoraConfig -config = LoraConfig(init_lora_weights="pissa", ...) -``` -Alternatively, execute fast SVD, which takes only a few seconds. The number of iterations determines the trade-off between the error and computation time: -```python -lora_config = LoraConfig(init_lora_weights="pissa_niter_[number of iters]", ...) -``` -For detailed instruction on using PiSSA, please follow [these instructions](https://github.com/huggingface/peft/tree/main/examples/pissa_finetuning). - -### CorDA - -[CorDA](https://huggingface.co/papers/2406.05223) builds task-aware LoRA adapters from weight decomposition oriented by the context of downstream task to learn (instruction-previewed mode, IPM) or world knowledge to maintain (knowledge-preserved mode, KPM). -The KPM not only achieves better performance than LoRA on fine-tuning tasks, but also mitigates the catastrophic forgetting of pre-trained world knowledge. -When preserving pre-trained knowledge is not a concern, -the IPM is favored because it can further accelerate convergence and enhance the fine-tuning performance. - -You need to configure the initialization method to "corda", and specify the mode of IPM or KPM and the dataset to collect covariance matrices. - -```py -@torch.no_grad() -def run_model(): - # Assume `model` and `dataset` is in context... 
- model.eval() - for batch in dataset: - model(**batch) - - -corda_config = CordaConfig( - corda_method="kpm", -) -lora_config = LoraConfig( - init_lora_weights="corda", - corda_config=corda_config, -) -preprocess_corda(model, lora_config, run_model=run_model) -peft_model = get_peft_model(model, lora_config) -``` - -For detailed instruction on using CorDA, please follow [these instructions](https://github.com/huggingface/peft/tree/main/examples/corda_finetuning). - -### OLoRA -[OLoRA](https://huggingface.co/papers/2406.01775) utilizes QR decomposition to initialize the LoRA adapters. OLoRA translates the base weights of the model by a factor of their QR decompositions, i.e., it mutates the weights before performing any training on them. This approach significantly improves stability, accelerates convergence speed, and ultimately achieves superior performance. - -You just need to pass a single additional option to use OLoRA: -```python -from peft import LoraConfig -config = LoraConfig(init_lora_weights="olora", ...) -``` -For more advanced usage, please refer to our [documentation](https://github.com/huggingface/peft/tree/main/examples/olora_finetuning). - -### EVA -[EVA](https://huggingface.co/papers/2410.07170) performs SVD on the input activations of each layer and uses the right-singular vectors to initialize LoRA weights. It is therefore a data-driven initialization scheme. Furthermore EVA adaptively allocates ranks across layers based on their "explained variance ratio" - a metric derived from the SVD analysis. - -You can use EVA by setting `init_lora_weights="eva"` and defining [`EvaConfig`] in [`LoraConfig`]: -```python -from peft import LoraConfig, EvaConfig -peft_config = LoraConfig( - init_lora_weights = "eva", - eva_config = EvaConfig(rho = 2.0), - ... -) -``` -The parameter `rho` (≥ 1.0) determines how much redistribution is allowed. When `rho=1.0` and `r=16`, LoRA adapters are limited to exactly 16 ranks, preventing any redistribution from occurring. 
A recommended value for EVA with redistribution is 2.0, meaning the maximum rank allowed for a layer is 2r. - -It is recommended to perform EVA initialization on an accelerator(e.g. CUDA GPU, Intel XPU) as it is much faster. To optimize the amount of available memory for EVA, you can use the `low_cpu_mem_usage` flag in [`get_peft_model`]: -```python -peft_model = get_peft_model(model, peft_config, low_cpu_mem_usage=True) -``` -Then, call [`initialize_lora_eva_weights`] to initialize the EVA weights (in most cases the dataloader used for eva initialization can be the same as the one used for finetuning): -```python -initialize_lora_eva_weights(peft_model, dataloader) -``` -EVA works out of the box with bitsandbytes. Simply initialize the model with `quantization_config` and call [`initialize_lora_eva_weights`] as usual. - -> [!TIP] -> For further instructions on using EVA, please refer to our [documentation](https://github.com/huggingface/peft/tree/main/examples/eva_finetuning). - -### LoftQ - -#### Standard approach - -When quantizing the base model for QLoRA training, consider using the [LoftQ initialization](https://huggingface.co/papers/2310.08659), which has been shown to improve performance when training quantized models. The idea is that the LoRA weights are initialized such that the quantization error is minimized. To use LoftQ, follow [these instructions](https://github.com/huggingface/peft/tree/main/examples/loftq_finetuning). - -> [!TIP] -> Learn more about how PEFT works with quantization and how to use LoftQ in the [Quantization](quantization) guide. - -### Rank-stabilized LoRA - -Another way to initialize [`LoraConfig`] is with the [rank-stabilized LoRA (rsLoRA)](https://huggingface.co/papers/2312.03732) method. The LoRA architecture scales each adapter during every forward pass by a fixed scalar which is set at initialization and depends on the rank `r`. 
The scalar is given by `lora_alpha/r` in the original implementation, but rsLoRA uses `lora_alpha/math.sqrt(r)` which stabilizes the adapters and increases the performance potential from using a higher `r`. - -```py -from peft import LoraConfig - -config = LoraConfig(use_rslora=True, ...) -``` - -### LoRA-GA - -[LoRA-GA](../package_reference/lora#lora-ga) (Low-Rank Adaptation with Gradient Approximation) initializes the adapter -weights by performing SVD on estimated gradients, so that the weights are aligning closer to full-finetuning for faster -convergence. - -This method requires an initialization function to estimate the gradients -before beginning the actual training: - -```python -from peft.tuners.lora import preprocess_loraga - -def train_step(): - """Run forward and backward passes for gradient estimation.""" - dataloader_iter = iter(grad_dataloader) - for _ in range(N): - batch = next(dataloader_iter) - batch = {k: v.to(device) for k, v in batch.items()} - outputs = model(**batch) - loss = outputs.loss - loss.backward() - -preprocess_loraga(model, lora_config, train_step) -``` - -### KappaTuneSelector - -KappaTune implements the condition-number-based target selection strategy from the [KappaTune paper](https://arxiv.org/abs/2506.16289). It scans every `nn.Linear` module and, for models where MoE expert weights are stored as fused 3D `nn.Parameter` tensors (e.g. Llama-4, Qwen3-MoE), also those parameters, computes the matrix condition number κ = σ_max / σ_min for each, and selects the most isotropic layers (lowest κ). These isotropic layers serve as ideal candidates for fine-tuning, since their high-entropy nature allows them to absorb new information more readily, leaving the specialized, anisotropic layers intact to mitigate catastrophic forgetting during continual learning. 
- -Use `find_kappa_target_modules` as a one-liner to get the optimal `target_modules` for `LoraConfig`: - -```python -from peft import LoraConfig, get_peft_model -from peft.helpers import find_kappa_target_modules -from transformers import AutoModelForCausalLM - -model = AutoModelForCausalLM.from_pretrained("mistralai/Mixtral-8x7B-Instruct-v0.1") - -targets = find_kappa_target_modules(model, top_p=0.2) -config = LoraConfig( - target_modules=targets["target_modules"], - target_parameters=targets["target_parameters"] if targets["target_parameters"] else None, - r=64, - lora_alpha=32, - task_type="CAUSAL_LM", -) -peft_model = get_peft_model(model, config) -``` - -See a complete example [here](https://github.com/huggingface/peft/blob/main/examples/KappaTune/experiments_kappatune_peft.py). - -## Variants - -PEFT implements LoRA variants that improve upon the original LoRA. - -### Weight-Decomposed Low-Rank Adaptation (DoRA) - -This technique decomposes the updates of the weights into two parts, magnitude and direction. Direction is handled by normal LoRA, whereas the magnitude is handled by a separate learnable parameter. This can improve the performance of LoRA, especially at low ranks. For more information on DoRA, see https://huggingface.co/papers/2402.09353. - -```py -from peft import LoraConfig - -config = LoraConfig(use_dora=True, ...) -``` - -If parts of the model or the DoRA adapter are offloaded to CPU you can get a significant speedup at the cost of some temporary (ephemeral) VRAM overhead by using `ephemeral_gpu_offload=True` in `config.runtime_config`. - -```py -from peft import LoraConfig, LoraRuntimeConfig - -config = LoraConfig(use_dora=True, runtime_config=LoraRuntimeConfig(ephemeral_gpu_offload=True), ...) -``` - -A `PeftModel` with a DoRA adapter can also be loaded with `ephemeral_gpu_offload=True` flag using the `from_pretrained` method as well as the `load_adapter` method.
- -```py -from peft import PeftModel - -model = PeftModel.from_pretrained(base_model, peft_model_id, ephemeral_gpu_offload=True) -``` - -#### Optimization - -DoRA is optimized (computes faster and takes less memory) for models in the evaluation mode, or when dropout is set to 0. We reuse the -base result at those times to get the speedup. -Running [dora finetuning](https://github.com/huggingface/peft/blob/main/examples/dora_finetuning/dora_finetuning.py) -with `CUDA_VISIBLE_DEVICES=0 ZE_AFFINITY_MASK=0 time python examples/dora_finetuning/dora_finetuning.py --quantize --lora_dropout 0 --batch_size 16 --eval_step 2 --use_dora` on a 4090 with gradient accumulation set to 2 and max step to 20 resulted in the following observations: - -| | Without Optimization | With Optimization | -| :--: | :--: | :--: | -| train runtime (sec) | 359.7298 | **279.2676** | -| train samples per second | 1.779 | **2.292** | -| train steps per second | 0.056 | **0.072** | - -Moreover, it is possible to further increase runtime performance of DoRA by using the [`DoraCaching`] helper context. This requires the model to be in `eval` mode: - -```py -from peft.helpers import DoraCaching - -model.eval() -with DoraCaching(): - output = model(inputs) -``` - -For [`meta-llama/Llama-3.1-8B`](https://huggingface.co/meta-llama/Llama-3.1-8B), the [DoRA caching benchmark script](https://github.com/huggingface/peft/blob/main/examples/dora_finetuning/dora-caching.py) shows that, compared to LoRA: - -- DoRA without caching requires 139% more time -- DoRA without caching requires 4% more memory -- DoRA with caching requires 17% more time -- DoRA with caching requires 41% more memory - -Caching can thus make inference with DoRA significantly faster but it also requires significantly more memory. Ideally, if the use case allows it, just merge the DoRA adapter to avoid both memory and runtime overhead. - -#### Caveats - -- DoRA only supports embedding, linear, and Conv2d layers at the moment.
-- DoRA introduces a bigger overhead than pure LoRA, so it is recommended to merge weights for inference, see [`LoraModel.merge_and_unload`]. -- DoRA should work with weights quantized with bitsandbytes ("QDoRA"). However, issues have been reported when using QDoRA with DeepSpeed Zero2. - -### MonteCLoRA (Monte Carlo Low-Rank Adaptation) - -MonteCLoRA wraps a standard LoRA adapter with a small variational module that draws Monte Carlo samples of stochastic perturbations on top of the LoRA `A` matrix during training. Concretely, it learns variational parameters (a Wishart-based covariance, a per-sample multivariate-normal noise term, and a Dirichlet weighting over the samples) and adds the resulting averaged perturbation to `lora_A` at every forward pass. A KL-divergence + entropy term is added to the training loss to keep these variational parameters anchored to a sensible prior. At inference time the sampler is disabled and MonteCLoRA behaves exactly like a regular LoRA adapter, so there is **no extra inference cost or extra parameters to merge**. For the full method see https://huggingface.co/papers/2411.04358. - -You may want to consider MonteCLoRA when: - -- You are fine-tuning on a small or noisy dataset and want stronger regularization than vanilla LoRA. The Monte Carlo averaging and the KL term together act as a Bayesian-style regularizer. -- You want better uncertainty calibration / robustness from your adapter without paying extra cost at inference time (the variational machinery is training-only). -- Vanilla LoRA is overfitting and lowering `r` or increasing `lora_dropout` is not enough. - -You probably do *not* need MonteCLoRA when you have a large, clean dataset and vanilla LoRA already trains stably — in that regime the extra variational parameters mostly add training overhead without much benefit. 
- -To enable MonteCLoRA, pass a `MontecloraConfig` to `LoraConfig`: - -```py -from peft import LoraConfig, MontecloraConfig - -monteclora_config = MontecloraConfig( - num_samples=8, # number of Monte Carlo samples per forward pass - sample_scaler=1e-4, # magnitude of the variational perturbation - kl_loss_weight=1e-5, # weight of the KL term added to the training loss -) -config = LoraConfig( - r=16, - lora_alpha=32, - target_modules=["q_proj", "v_proj"], - monteclora_config=monteclora_config, -) -``` - -During training you must add the variational regularization loss to the task loss. The simplest way is to call [`LoraModel._get_monteclora_loss`] on the underlying `LoraModel`: - -```py -task_loss = ... # standard loss returned by your model -monteclora_loss = model._get_monteclora_loss() # 0.0 if MonteCLoRA is not used -total_loss = task_loss + monteclora_loss -total_loss.backward() -``` - -If you train with the HF `Trainer`, you can simply mix in [`peft.helpers.MontecloraTrainerMixin`] which does this for you in `compute_loss`: - -```py -from transformers import Trainer -from peft.helpers import MontecloraTrainerMixin - - -class MontecloraTrainer(MontecloraTrainerMixin, Trainer): - pass -``` - -A complete working example is available at [`examples/monteclora_finetuning`](https://github.com/huggingface/peft/tree/main/examples/monteclora_finetuning). - -### VeLoRA - -[VeLoRA](https://huggingface.co/papers/2405.17991) is a LoRA variant that reduces training memory by compressing the activations saved for the LoRA in the forward pass and then reconstructing them in the backwards pass to implement the update rules. In PEFT, VeLoRA is configured as a LoRA variant through the `velora_config` argument on [`LoraConfig`]. 
- -```py -from peft import LoraConfig, VeloraConfig - -config = LoraConfig( - target_modules=["q_proj", "v_proj"], - velora_config=VeloraConfig( - num_groups=64, - scale=0.2, - init_type="batch_average", - ), -) -``` - -VeLoRA is applied to every LoRA layer selected by `target_modules`. `num_groups` controls how the input activation depth is split before compression. If the activation depth is not evenly divisible by `num_groups`, VeLoRA pads the grouped representation internally and removes the padding after reconstruction. `scale` rescales the reconstructed activations during the backward pass, and `init_type` chooses how the projection is initialized. - -Use `batch_average_once` to initialize the projection from the first training batch, `batch_average` to update it from every training forward pass, or `random` to initialize it immediately from a random normalized vector. - -Below are some results with the [MetaMathQA benchmark](https://github.com/huggingface/peft/tree/main/method_comparison/MetaMathQA). - -| Variant | Training Loss | Max Memory (GiB) | Tokens/sec | -|---|---:|---:|---:| -| LoRA | 0.5427 | 27.69 | 2366.2 | -| LoRA + GC | 0.5426 | 13.17 | 1671.8 | -| LoRA+VeLoRA | 0.5427 | 19.94 | 2057.6 | - -#### Caveats - -- VeLoRA is currently supported on standard LoRA linear layers only. - -## Training - -This section shows how to handle more complex training scenarios instead of only applying a low-rank adapter -to the model and feed data. - -### QLoRA-style training - -The default LoRA settings in PEFT add trainable weights to the query and value layers of each attention block. But [QLoRA](https://hf.co/papers/2305.14314), which adds trainable weights to all the linear layers of a transformer model, can provide performance equal to a fully finetuned model. To apply LoRA to all the linear layers, like in QLoRA, set `target_modules="all-linear"` (easier than specifying individual modules by name which can vary depending on the architecture). 
- -```py -config = LoraConfig(target_modules="all-linear", ...) -``` - -For more information about how to apply quantization to PEFT adapters, refer to the [quantization guide](quantization). - -### Memory efficient Layer Replication with LoRA - -An approach used to improve the performance of models is to expand a model by duplicating layers in the model to build a larger model from a pretrained model of a given size. For example increasing a 7B model to a 10B model as described in the [SOLAR](https://huggingface.co/papers/2312.15166) paper. PEFT LoRA supports this kind of expansion in a memory efficient manner that supports further fine-tuning using LoRA adapters attached to the layers post replication of the layers. The replicated layers do not take additional memory as they share the underlying weights so the only additional memory required is the memory for the adapter weights. To use this feature you would create a config with the `layer_replication` argument. - -```py -config = LoraConfig(layer_replication=[[0,4], [2,5]], ...) -``` - -Assuming the original model had 5 layers `[0, 1, 2 ,3, 4]`, this would create a model with 7 layers arranged as `[0, 1, 2, 3, 2, 3, 4]`. This follows the [mergekit](https://github.com/arcee-ai/mergekit) pass through merge convention where sequences of layers specified as start inclusive and end exclusive tuples are stacked to build the final model. Each layer in the final model gets its own distinct set of LoRA adapters. - -[Fewshot-Metamath-OrcaVicuna-Mistral-10B](https://huggingface.co/abacusai/Fewshot-Metamath-OrcaVicuna-Mistral-10B) is an example of a model trained using this method on Mistral-7B expanded to 10B. The -[adapter_config.json](https://huggingface.co/abacusai/Fewshot-Metamath-OrcaVicuna-Mistral-10B/blob/main/adapter_config.json) shows a sample LoRA adapter config applying this method for fine-tuning. 
- -### Fine grained control over ranks and alpha (scaling) - -By default, all layers targeted with LoRA will have the same rank `r` and the same `lora_alpha` (which determines the LoRA scaling), depending on what was specified in the [`LoraConfig`]. In some cases, however, you may want to indicate different values for different layers. This is possible by passing the `rank_pattern` and `alpha_pattern` arguments to [`LoraConfig`]. These arguments should be dictionaries with the key being the layer name and the value being the rank/alpha value. The keys can be [regular expressions](https://docs.python.org/3/library/re.html) (regex). All LoRA layers that are not explicitly mentioned in `rank_pattern` and `alpha_pattern` will take the default `r` and `lora_alpha` values. - -To give an example, let's assume that we have a model with the following structure: - -```python ->>> print(model) -Outer( - (foo): Linear(...) - (module): Middle( - (foo): Linear(...) - (foobar): Linear(...) - (module): Inner( - (foo): Linear(...) - (barfoo): Linear(...) - ) - ) -) -``` - -- `rank_pattern={"foo": 42}` will match all 3 `foo` layers. Neither `foobar` nor `barfoo` are matched. -- `rank_pattern={"^foo": 42}` will only match the `foo` layer of the model, but neither `module.foo` nor `module.module.foo`. This is because the `^` means "start of string" when using regular expressions, and only `foo` starts with `"foo"`, the other layer names have prefixes. -- `rank_pattern={"^module.foo": 42}` matches only `module.foo`, but not `module.module.foo`, for the same reason. -- `rank_pattern={"module.foo": 42}` matches both `module.foo` and `module.module.foo`, but not `foo`. -- `rank_pattern={"^foo": 42, "^module.module.foo": 55}` matches `foo` and `module.module.foo`, respectively, but not `module.foo`. -- There is no need to indicate `$` to mark the end of the match, as this is added automatically by PEFT. - -The same logic applies to `alpha_pattern`. 
If you're in doubt, don't try to get fancy with regular expressions -- just pass the full name for each module with a different rank/alpha, preceded by the `^` prefix, and you should be good. - -### Targeting `nn.Parameter` directly - -Generally, you should use `target_modules` to target the module (e.g. `nn.Linear`). However, in some circumstances, this is not possible. E.g., in many mixture of experts (MoE) layers in HF Transformers, instead of using `nn.Linear`, an `nn.Parameter` is used. PEFT normally overwrites the `forward` method for LoRA, but for `nn.Parameter`, there is none. Therefore, to apply LoRA to that parameter, it needs to be targeted with `target_parameters`. As an example, for [Llama4](https://huggingface.co/collections/meta-llama/llama-4-67f0c30d9fe03840bc9d0164), you can pass: `target_parameters=['feed_forward.experts.gate_up_proj', 'feed_forward.experts.down_proj']`. - -Note that when targeting expert parameters, PEFT can add a substantial runtime overhead. The reason is that PEFT always materializes the LoRA contribution for _each expert_ even if only a small number of experts is required. During training, this is less relevant since, over the course of the sequence, typically a large fraction of experts is activated at least once. However, during inference, normally a KV cache is used and we thus need to only compute the last token, which means that only a small number of experts is activated. Therefore, using LoRA on MoE layers can result in a substantial slowdown at inference time. The recommendation is thus to merge the weights (`model.merge_adapter()` or `model = model.merge_and_unload()`). This removes the PEFT overhead. - -A more detailed investigation of this issue can be found on this [pull request on MoE optimization](https://github.com/huggingface/peft/pull/3139). - -#### Caveats - -- At the moment, this argument allows targeting 2-dim or 3-dim `nn.Parameter`s.
It is assumed that in the case of a 3-dim parameter, the 0th dimension is the expert dimension. -- It is currently not possible to add multiple LoRA adapters (via `model.add_adapter` or `model.load_adapter`) that use `target_parameters` at the same time. - -#### MoE expert parameters and vLLM - -Some MoE models in Transformers store expert weights as `nn.Parameter` tensors (often 3D), not `nn.Linear` modules. -To apply LoRA to those experts, use `target_parameters` and set a per-layer rank with `rank_pattern`: - -```python -num_experts = getattr(model.config, "num_local_experts", None) or model.config.num_experts -effective_r = max(1, r // num_experts) -config = LoraConfig( - r=r, - lora_alpha=32, - target_modules=["q_proj", "v_proj"], - target_parameters=[ - # Mixtral / Qwen3-MoE / GPT-OSS - "mlp.experts.gate_up_proj", - "mlp.experts.down_proj", - # Llama4 - # "feed_forward.experts.gate_up_proj", - # "feed_forward.experts.down_proj", - ], - rank_pattern={ - "experts.gate_up_proj": effective_r, - "experts.down_proj": effective_r, - }, -) -``` - -This keeps the total LoRA parameter budget similar to dense layers (see -[LoRA Without Regret](https://thinkingmachines.ai/blog/lora/) by Schulman et al.). -Non-expert modules use the default rank `r`. - -Accelerated inference with the fine-tuned model is possible with, for example, [vLLM](https://vllm.ai/) which supports fused MoE expert layers since v0.11.2. - -### Efficiently train tokens alongside LoRA - -PEFT LoRA adapters support adding new tokens with the `trainable_token_indices` parameter. This allows tuning of other tokens alongside fine-tuning specific layers. Only the specified tokens are trained and all other tokens are untouched. It saves memory and doesn't throw away learned context from existing token embeddings unlike training the whole embedding matrix. Under the hood this method uses the layer of [`TrainableTokensModel`].
- -```py -# for layer 'embed_tokens' -config = LoraConfig(trainable_token_indices=[idx_1, idx_2, ...], ...) - -# specific embedding layer -config = LoraConfig(trainable_token_indices={'emb_tokens': [idx_1, idx_2, ...]}, ...) -``` - -In the snippet below we show how to add new tokens to the model and how to train it alongside the other layers in the model. - -```py -from transformers import AutoTokenizer, AutoModelForCausalLM -from peft import get_peft_model, LoraConfig - -base_model = AutoModelForCausalLM.from_pretrained("mistralai/Mistral-7B-v0.1") -tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-v0.1") - -# we define our new tokens and add them to the tokenizer as special tokens -special_tokens = ['<|start_think|>', '<|stop_think|>'] -tokenizer.add_special_tokens({'additional_special_tokens': special_tokens}) - -# make room for new tokens in the embedding matrix if it isn't big enough already -base_model.resize_token_embeddings(max(len(tokenizer), base_model.model.embed_tokens.num_embeddings)) - -# typical LoRA config with `trainable_token_indices` targeting embedding layer `embed_tokens` -# and specifically our new tokens we just added -lora_config = LoraConfig( - target_modules='all-linear', - trainable_token_indices={'embed_tokens': tokenizer.convert_tokens_to_ids(special_tokens)}, -) -peft_model = get_peft_model(base_model, lora_config) - -# proceed to train the model like normal -[...] -``` - -The token weights are saved as a part of the adapter state dict alongside the LoRA weights. Full fine-tuning and saving the embedding matrix would have stored a much bigger file. 
- -To give a bit of an indication how much VRAM can be saved, a rudimentary comparison of the above example was made between training the embedding matrix fully (`modules_to_save=["embed_tokens"]`), using a LoRA for the embedding matrix (`target_modules=[..., "embed_tokens"]`, rank 32) and trainable tokens (`trainable_token_indices=[...]`, 6 tokens): - -| | Trainable Tokens | LoRA | Full Fine-tuning | -| --------: | :--------------: | :--------: | :--------------: | -| VRAM | 15,562 MB | 15,581MB | ~16,500MB | -| Influence | 6 tokens | all tokens | all tokens | - -### Weight tying - -Many causal LMs use **weight tying**, where two or more weights share the same underlying parameters. In the most common case, the input embedding weights (`embed_tokens`) and output projection weights (`lm_head`) share the same tensor. This is because it reduces parameters and usually preserves model quality. - -It's not always obvious how PEFT deals with these tied weights when they are targeted for fine-tuning. For LoRA, the `ensure_weight_tying` on the [`LoraConfig`] controls whether PEFT should explicitly keep adapter-side updates tied for those layers. In practice, this can affect `modules_to_save`, `target_modules`, and `trainable_token_indices`. Note that this logic partially relies on convention when it comes to naming the layers (`"embed_tokens"`, `"lm_head"`) and proper working cannot be guaranteed if those conventions are not used. - -The tables below summarize expected behavior. 
- -#### `modules_to_save` - -| Base model weights tied | `ensure_weight_tying` | `LoraConfig` shape | Behavior | -|-------------------------|-----------------------|-----------------------------------------------------|--------------------------------------------------------------| -| No | `False` | `modules_to_save=["embed_tokens"]` or `["lm_head"]` | Add `ModulesToSaveWrapper` on selected layer only | -| No | `True` | `modules_to_save=["embed_tokens"]` or `["lm_head"]` | Warn, then add `ModulesToSaveWrapper` on selected layer only | -| Yes | `False` | `modules_to_save=["embed_tokens"]` or `["lm_head"]` | Treat as separate | -| Yes | `True` | `modules_to_save=["embed_tokens"]` or `["lm_head"]` | Wrap tied layers and keep wrappers tied | -| No | `False` | `modules_to_save=["embed_tokens", "lm_head"]` | Treat as separate | -| No | `True` | `modules_to_save=["embed_tokens", "lm_head"]` | Warn, then treat as separate | -| Yes | `False` | `modules_to_save=["embed_tokens", "lm_head"]` | Warn, then treat as separate | -| Yes | `True` | `modules_to_save=["embed_tokens", "lm_head"]` | Keep `ModulesToSaveWrapper`s tied | - -#### `target_modules` - -| Base model weights tied | `ensure_weight_tying` | `LoraConfig` shape | Behavior | -|-------------------------|-----------------------|----------------------------------------------------|--------------------------------------------| -| No | `False` | `target_modules=["embed_tokens"]` or `["lm_head"]` | Add LoRA on selected layer only | -| No | `True` | `target_modules=["embed_tokens"]` or `["lm_head"]` | Warn, then add LoRA on selected layer only | -| Yes | `False` | `target_modules=["embed_tokens"]` or `["lm_head"]` | Treat as separate | -| Yes | `True` | `target_modules=["embed_tokens"]` or `["lm_head"]` | Keep LoRA adapters tied | -| No | `False` | `target_modules=["embed_tokens", "lm_head"]` | Treat as separate | -| No | `True` | `target_modules=["embed_tokens", "lm_head"]` | Warn, then treat as separate | -| Yes | `False` 
| `target_modules=["embed_tokens", "lm_head"]` | Warn, then treat as separate | -| Yes | `True` | `target_modules=["embed_tokens", "lm_head"]` | Keep LoRA adapters tied | - -#### `trainable_token_indices` - -For trainable tokens, we have the additional complication that even if the LM head and embeddings are tied, as a user I may want to fine-tune *different* tokens on them. In the example table below, we thus differentiate between fine-tuning the same and fine-tuning different tokens. - -| Base model weights tied | `ensure_weight_tying` | `LoraConfig` shape | Behavior | -|-------------------------|-----------------------|-----------------------------------------------------------------------|------------------------------------------------| -| No | `False` | `trainable_token_indices=[1, 2, 3]` | Trainable tokens on embeddings only | -| No | `True` | `trainable_token_indices=[1, 2, 3]` | Warn, then trainable tokens on embeddings only | -| Yes | `False` | `trainable_token_indices=[1, 2, 3]` | Tied trainable tokens | -| Yes | `True` | `trainable_token_indices=[1, 2, 3]` | Tied trainable tokens | -| No | `False` | `trainable_token_indices={"lm_head": [1, 2], "embed_tokens": [1, 2]}` | Treat as separate | -| No | `True` | `trainable_token_indices={"lm_head": [1, 2], "embed_tokens": [1, 2]}` | Warn, then treat as separate | -| Yes | `False` | `trainable_token_indices={"lm_head": [1, 2], "embed_tokens": [1, 2]}` | Tied trainable tokens | -| Yes | `True` | `trainable_token_indices={"lm_head": [1, 2], "embed_tokens": [1, 2]}` | Tied trainable tokens | -| No | `False` | `trainable_token_indices={"lm_head": [1, 2], "embed_tokens": [3, 4]}` | Treat as separate | -| No | `True` | `trainable_token_indices={"lm_head": [1, 2], "embed_tokens": [3, 4]}` | Warn, then treat as separate | -| Yes | `False` | `trainable_token_indices={"lm_head": [1, 2], "embed_tokens": [3, 4]}` | Treat as separate | -| Yes | `True` | `trainable_token_indices={"lm_head": [1, 2], "embed_tokens": [3, 4]}` 
| Error | - -For users, this means: - -- In general, if you want to fine-tune weights that are tied and want to keep them tied, pass `ensure_weight_tying=True`. -- If your base model's weights are untied, `ensure_weight_tying=True` cannot force tying and only warns. -- For `trainable_token_indices`, tied layers must use the same token indices when `ensure_weight_tying=True`. - -## Optimizers - -LoRA training can optionally include special purpose optimizers. Currently PEFT supports LoRA-FA and LoRA+. - -### LoRA-FA Optimizer - -LoRA training can be more effective and efficient using LoRA-FA, as described in [LoRA-FA](https://huggingface.co/papers/2308.03303). LoRA-FA reduces activation memory consumption by fixing the matrix A and only tuning the matrix B. During training, the gradient of B is optimized to approximate the full parameter fine-tuning gradient. Moreover, the memory consumption of LoRA-FA is not sensitive to the rank (since it erases the activation of $A$), therefore it can improve performance by enlarging lora rank without increasing memory consumption. - -```py -from peft import LoraConfig, get_peft_model -from peft.optimizers import create_lorafa_optimizer -from transformers import Trainer, get_cosine_schedule_with_warmup - -base_model = AutoModelForCausalLM.from_pretrained("meta-llama/Meta-Llama-3-8B-Instruct") - -config = LoraConfig(...) -model = get_peft_model(base_model, config) - -optimizer = create_lorafa_optimizer( - model=model, - r=128, - lora_alpha=32, - lr=7e-5, -) - -scheduler = get_cosine_schedule_with_warmup( - optimizer, - num_warmup_steps=100, - num_training_steps=1000, -) - -trainer = Trainer( - ..., - optimizers=(optimizer, scheduler), -) -``` - -### LoRA+ optimized LoRA - -LoRA training can be optimized using [LoRA+](https://huggingface.co/papers/2402.12354), which uses different learning rates for the adapter matrices A and B, shown to increase finetuning speed by up to 2x and performance by 1-2%. 
- -```py -from peft import LoraConfig, get_peft_model -from peft.optimizers import create_loraplus_optimizer -from transformers import Trainer -import bitsandbytes as bnb - -base_model = ... -config = LoraConfig(...) -model = get_peft_model(base_model, config) - -optimizer = create_loraplus_optimizer( - model=model, - optimizer_cls=bnb.optim.Adam8bit, - lr=5e-5, - loraplus_lr_ratio=16, -) -scheduler = None - -... -trainer = Trainer( - ..., - optimizers=(optimizer, scheduler), -) -``` - - -## Post-Training - -This section shows potential post-processing methods for trained adapters. - - -### Merge LoRA weights into the base model - -While LoRA is significantly smaller and faster to train, you may encounter latency issues during inference due to separately loading the base model and the LoRA adapter. To eliminate latency, use the [`~LoraModel.merge_and_unload`] function to merge the adapter weights with the base model. This allows you to use the newly merged model as a standalone model. The [`~LoraModel.merge_and_unload`] function doesn't keep the adapter weights in memory. - -Below is a diagram that explains the intuition of LoRA adapter merging: - -
- -
- -We show in the snippets below how to run that using PEFT. - -```py -from transformers import AutoModelForCausalLM -from peft import PeftModel - -base_model = AutoModelForCausalLM.from_pretrained("mistralai/Mistral-7B-v0.1") -peft_model_id = "alignment-handbook/zephyr-7b-sft-lora" -model = PeftModel.from_pretrained(base_model, peft_model_id) -model = model.merge_and_unload() -``` - -It is important to assign the returned model to a variable and use it, [`~LoraModel.merge_and_unload`] is not an in-place operation. If you need to keep a copy of the weights so you can unmerge the adapter later or delete and load different ones, you should use the [`~LoraModel.merge_adapter`] function instead. Now you have the option to use [`~LoraModel.unmerge_adapter`] to return the base model. - -```py -from transformers import AutoModelForCausalLM -from peft import PeftModel - -base_model = AutoModelForCausalLM.from_pretrained("mistralai/Mistral-7B-v0.1") -peft_model_id = "alignment-handbook/zephyr-7b-sft-lora" -model = PeftModel.from_pretrained(base_model, peft_model_id) -model.merge_adapter() - -# unmerge the LoRA layers from the base model -model.unmerge_adapter() -``` - -The [`~LoraModel.add_weighted_adapter`] function is useful for merging multiple LoRAs into a new adapter based on a user provided weighting scheme in the `weights` parameter. Below is an end-to-end example. 
- -First load the base model: - -```python -from transformers import AutoModelForCausalLM -from peft import PeftModel -import torch - -base_model = AutoModelForCausalLM.from_pretrained( - "mistralai/Mistral-7B-v0.1", dtype=torch.float16, device_map="auto" -) -``` - -Then we load the first adapter: - -```python -peft_model_id = "alignment-handbook/zephyr-7b-sft-lora" -model = PeftModel.from_pretrained(base_model, peft_model_id, adapter_name="sft") -``` - -Then load a different adapter and merge it with the first one: - -```python -weighted_adapter_name = "sft-dpo" -model.load_adapter("alignment-handbook/zephyr-7b-dpo-lora", adapter_name="dpo") -model.add_weighted_adapter( - adapters=["sft", "dpo"], - weights=[0.7, 0.3], - adapter_name=weighted_adapter_name, - combination_type="linear" -) -model.set_adapter(weighted_adapter_name) -``` - -> [!TIP] -> There are several supported methods for `combination_type`. Refer to the [documentation](../package_reference/lora#peft.LoraModel.add_weighted_adapter) for more details. Note that "svd" as the `combination_type` is not supported when using `torch.float16` or `torch.bfloat16` as the datatype. - -Now, perform inference: - -```python -device = torch.accelerator.current_accelerator().type if hasattr(torch, "accelerator") else "cuda" - -tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-v0.1") - -prompt = "Hey, are you conscious? Can you talk to me?" 
-inputs = tokenizer(prompt, return_tensors="pt") -inputs = {k: v.to(device) for k, v in inputs.items()} - -with torch.no_grad(): - generate_ids = model.generate(**inputs, max_length=30) -outputs = tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0] -print(outputs) -``` - -### Recovering base model performance via intruder dimension reduction - -The paper [LoRA vs Full Fine-tuning: An Illusion of Equivalence](https://huggingface.co/papers/2410.21228) argues -that LoRA training introduces extra dimensions into the weights that have very little in common with the already -learnt weights and lead to forgetting of already learned information. PEFT implements the suggested mitigation -in [`peft.tuners.lora.intruders.reduce_intruder_dimension`]. - -The mitigation will take a PEFT model with a loaded LoRA and create a new, modified adapter that is loaded alongside -the existing adapter and now the active adapter. - -Example usage: - -```python -from peft.tuners.lora.intruders import reduce_intruder_dimension - -peft_model = AutoPeftModelForCausalLM.from_pretrained('hubnemo/llama-3.2b-metamathqa-lora64') - -reduce_intruder_dimension( - peft_model, - mitigation_lambda=0.75, -) - -peft_model.generate(...) -``` - -There are a few hyper-parameters that can be used for tuning the effectiveness of the mitigation but, as evidenced -in Figure 8 of the paper, it will always be a trade-off between task accuracy learned by the adapter and forgetting -of the base model's knowledge. The mitigation will remove information from the adapter to reduce the impact on -forgetting previous knowledge but this also means that some information about the task learned by the adapter is -lost as well. - -While the defaults are set to deliver a good trade-off between the two factors it is not guaranteed that the defaults -will hold for your adapter, your model and your data, therefore it is wise to have a benchmark ready to measure -the effect. 
- -## Load adapters - -Adapters can be loaded onto a pretrained model with [`~PeftModel.load_adapter`], which is useful for trying out different adapters whose weights aren't merged. Set the active adapter weights with the [`~LoraModel.set_adapter`] function. - -```py -from transformers import AutoModelForCausalLM -from peft import PeftModel - -base_model = AutoModelForCausalLM.from_pretrained("mistralai/Mistral-7B-v0.1") -peft_model_id = "alignment-handbook/zephyr-7b-sft-lora" -model = PeftModel.from_pretrained(base_model, peft_model_id) - -# load different adapter -model.load_adapter("alignment-handbook/zephyr-7b-dpo-lora", adapter_name="dpo") - -# set adapter as active -model.set_adapter("dpo") -``` - -To return the base model, you could use [`~LoraModel.unload`] to unload all of the LoRA modules or [`~LoraModel.delete_adapter`] to delete the adapter entirely. [`~LoraModel.unload`] is not an in-place operation, remember to assign the returned model to a variable and use it. - -```py -# unload adapter -model = model.unload() - -# delete adapter -model.delete_adapter("dpo") -``` - -## Tensor Parallelism - -LoRA supports [Tensor Parallelism (TP)](https://huggingface.co/docs/transformers/main/en/perf_train_gpu_many#tensor-parallelism) as provided by Transformers. When a base model is loaded with a `tp_plan`, PEFT automatically detects the TP configuration of each target module and adds the appropriate hooks to the LoRA adapter weights so that they participate correctly in the tensor-parallel computation. - -> [!WARNING] -> Tensor Parallelism support for LoRA requires `transformers >= 5.4.0`. 
- -Usage is identical to the standard LoRA workflow — simply load the base model with a `tp_plan` before wrapping it with PEFT: - -```py -from transformers import AutoModelForCausalLM -from peft import get_peft_model, LoraConfig - -model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-3.1-8B", tp_plan="auto") -lora_config = LoraConfig(r=16, target_modules=["q_proj", "v_proj"]) -model = get_peft_model(model, lora_config) -``` - -Saving and loading work as usual via `save_pretrained` / `from_pretrained`. PEFT gathers the sharded adapter weights back to full tensors before saving, so checkpoints are portable and independent of the number of devices used during training. - -## Inference - -This section showcases what you can do during inference time with LoRA, such as uncoupling the adapter. - -### Activated LoRA (aLoRA) - -Activated LoRA (aLoRA) is a low rank adapter architecture for causal LMs that reuses the existing base model KV cache for more efficient inference. This approach is best suited for inference pipelines which rely on the base model for most tasks/generations, but use aLoRA adapter(s) to perform specialized task(s) within the chain. For example, checking or correcting generated outputs of the base model. In these settings, inference times can be sped up by an order of magnitude or more. For more information on aLoRA and many example use cases, see the aLoRA [paper](https://huggingface.co/papers/2504.12397). - -This technique scans for the last occurrence of an invocation sequence (`alora_invocation_tokens`) in each input (this can be as short as 1 token). It activates the adapter weights on tokens starting with the beginning of the invocation sequence. Any inputs after the invocation sequence are also adapted, and all generated tokens will use the adapted weights. Weights on prior tokens are left un-adapted, making the cache for those tokens interchangeable with base model cache due to the causal attention mask in causal LMs. 
Usage is very similar to standard LoRA. The key difference is that the invocation sequence must be specified when the adapter is created: - -```py -from peft import LoraConfig - -config = LoraConfig(alora_invocation_tokens=alora_invocation_tokens, task_type="CAUSAL_LM", ...) -``` - -`alora_invocation_tokens` is a list of integer token ids. Given a desired invocation string, this can be obtained as: -```py -invocation_string = "placeholder" -alora_invocation_tokens = tokenizer.encode(invocation_string, add_special_tokens=False) -``` -The tokenizer is the base model's tokenizer. Use `add_special_tokens=False` to avoid adding `SOS`/`EOS` tokens in our search string (which will most likely cause the search to fail). - -**Notes** -* aLoRA is only supported for `task_type=CAUSAL_LM` tasks due to its focus on cache reuse. -* Since the weights are adapted on fewer tokens, often (not always) aLoRA requires higher rank (`r`) than LoRA. `r=32` can be a good starting point. -* aLoRA weights cannot be merged into the base model by definition, since the adapter weights are selectively applied to a subset of tokens. Attempts to merge will throw errors. -* Beam search is not yet supported. -* It is generally not recommended to add new tokens to the tokenizer that are not present in the base model. This can complicate the target use case of both the base model and adapter model operating on overlapping context. You can work around this by adding [trainable tokens](../package_reference/trainable_tokens) to the base model prior to training the adapter. - -#### Choice of invocation sequence and SFT design - -You must add the `alora_invocation_tokens` sequence because it is not added automatically. We recommend activating the adapter weights early (at the start of any adapter-specific prompting), but after any long inputs, to maximize model performance without compromising cache reuse. As with any model, -formatting should be consistent between train and test.
- -Consider the following example, where the base model has a chat template, -and the goal is to train the adapter to generate a desired output. - -* Option 1: If there is no task-specific prompt, i.e. the input is a chat history with the `assistant` prompt, then the chat template's `assistant` prompt (e.g. `<|start_of_role|>assistant<|end_of_role|>`) is a natural choice for the invocation string. See the model's chat template to find the prompt for the model. -* Option 2: If there is a task-specific prompt for the adapter that describes the task the adapter is learning, and that prompt is put as a `user` turn immediately prior to the generation, then the chat template's `user` prompt (e.g. `<|start_of_role|>user<|end_of_role|>`) is a natural choice for the invocation string. - -After deciding on an invocation string, get the model tokenizer and obtain `alora_invocation_tokens` as -```py -alora_invocation_tokens = tokenizer.encode(invocation_string, add_special_tokens=False) -``` - -An example inference setup is at [alora finetuning](https://github.com/huggingface/peft/blob/main/examples/alora_finetuning/alora_finetuning.py). - -> [!NOTE] -> If using custom strings for the invocation string, make sure that the start and end of the string are special tokens to avoid issues with tokenization at the boundaries. - -To see why, imagine that 'a', 'b', 'c', and 'ab' are tokens in your tokenizer (numbers 1, 2, 3, 4 respectively). Suppose that your `alora_invocation_tokens = [2, 3]`. Now imagine your input string is "abc". Because "ab" is a token, this will get tokenized as [4,3]. So the `alora_invocation_tokens` will fail to be found, despite the string "bc" being in it. If the start and end of the invocation string are special tokens, however, this failure case will never happen since special tokens are never tokenized into the same token with other characters.
- -#### Using (and reusing) cache for generation -The main purpose of aLoRA is to make KV cache interchangeable between the base model and aLoRA adapter models **prior to the invocation sequence** since base and adapted KV values are not compatible. Specifically, keys and values stored during one model generation can be used in subsequent generations to avoid expensive prefill operations for context tokens. When sharing cache between the base model and aLoRA adapters, there are 2 main patterns: -1. The base model has generated something, and an aLoRA adapter is then called to do a follow-up generation. For example, the base model answers a question, and an aLoRA trained to detect hallucinations checks the base model response. -2. An aLoRA adapter has generated something, and the base model or a different aLoRA adapter is called to do a follow-up generation where there is partial context overlap with the original aLoRA. For example, the user provides a query, and an aLoRA rewrites the query to be more self-contained and improve retrieval in a RAG system. Then, documents are retrieved and loaded into context, aLoRA checks if these documents are relevant to the question, and then the base model generates an answer. - - -To demonstrate the above behaviors when using caching, we're using [DynamicCache](https://huggingface.co/docs/transformers/en/kv_cache) from `transformers`. Take care to ensure that adapted cache values are not mixed with base cache values. In particular, an extra step is required for sharing the cache when there is partial context overlap (pattern 2). - -**Pattern 1: Base model followed by aLoRA** Here, the entire input and generation from the base model is input into the aLoRA adapter, along with the invocation sequence: -``` -from transformers import DynamicCache -... 
-cache = DynamicCache() -inputs_base = tokenizer(prompt_base, return_tensors="pt") -# Generate from base model and save cache -with model_alora.disable_adapter(): - output = model_alora.generate(inputs_base["input_ids"].to(device),attention_mask=inputs_base["attention_mask"].to(device),past_key_values = cache,return_dict_in_generate=True) -output_text_base = tokenizer.decode(output.sequences[0]) -cache = output.past_key_values - -# Generate with aLoRA adapter from cache -prompt_alora = output_text_base + INVOCATION_STRING -inputs_alora = tokenizer(prompt_alora, return_tensors="pt").to(device) -output = model_alora.generate(**inputs_alora, past_key_values=cache) -output_text_alora = tokenizer.decode(output[0]) - -# Note: cache is now tainted with adapter values and cannot be used in base model from here on! -``` - -**Pattern 2: aLoRA generation followed by base model (or another aLoRA) with partial context overlap** Here, we prefill the shared context using the base model, and then generate. - -``` -from transformers import DynamicCache -import copy -...
-cache = DynamicCache() -inputs_shared = tokenizer(prompt_shared, return_tensors="pt").to(device) - -# Prefill from base model and save cache -with model_alora.disable_adapter(): - with torch.no_grad(): - model_alora(**inputs_shared, past_key_values=cache) -cache_copy = copy.deepcopy(cache) - -# Generate from aLoRA using prefilled cache -prompt_alora = prompt_shared + INVOCATION_STRING -inputs_alora = tokenizer(prompt_alora, return_tensors="pt").to(device) -output = model_alora.generate(**inputs_alora, past_key_values=cache) -output_text_alora = tokenizer.decode(output[0]) - -# Generate from base model using saved cache not tainted by aLoRA KV values -prompt_base = prompt_shared -inputs_base = tokenizer(prompt_base, return_tensors="pt").to(device) -with model_alora.disable_adapter(): - output = model_alora.generate(**inputs_base, past_key_values=cache_copy) -output_text_base = tokenizer.decode(output[0]) -``` - - -### Inference with different LoRA adapters in the same batch - -Normally, each inference batch has to use the same adapter(s) in PEFT. This can sometimes be annoying, because we may have batches that contain samples intended to be used with different LoRA adapters. For example, we could have a base model that works well in English and two more LoRA adapters, one for French and one for German. Usually, we would have to split our batches such that each batch only contains samples of one of the languages; we cannot combine different languages in the same batch. - -Thankfully, it is possible to mix different LoRA adapters in the same batch using the `adapter_names` argument. Below, we show an example of how this works in practice. First, let's load the base model, English, and the two adapters, French and German, like this: - -```python -from transformers import AutoTokenizer, AutoModelForCausalLM -from peft import PeftModel - -model_id = ...
-tokenizer = AutoTokenizer.from_pretrained(model_id) - -model = AutoModelForCausalLM.from_pretrained(model_id) -# load the LoRA adapter for French -peft_model = PeftModel.from_pretrained(model, <path>, adapter_name="adapter_fr") -# next, load the LoRA adapter for German -peft_model.load_adapter(<path>, adapter_name="adapter_de") -``` - -Now, we want to generate text on a sample that contains all three languages: The first three samples are in English, the next three are in French, and the last three are in German. We can use the `adapter_names` argument to specify which adapter to use for each sample. Since our base model is used for English, we use the special string `"__base__"` for these samples. For the next three samples, we indicate the adapter name of the French LoRA fine-tune, in this case `"adapter_fr"`. For the last three samples, we indicate the adapter name of the German LoRA fine-tune, in this case `"adapter_de"`. This way, we can use the base model and the two adapters in a single batch. - -```python -inputs = tokenizer( - [ - "Hello, my dog is cute", - "Hello, my cat is awesome", - "Hello, my fish is great", - "Salut, mon chien est mignon", - "Salut, mon chat est génial", - "Salut, mon poisson est super", - "Hallo, mein Hund ist süß", - "Hallo, meine Katze ist toll", - "Hallo, mein Fisch ist großartig", - ], - return_tensors="pt", - padding=True, -) - -adapter_names = [ - "__base__", "__base__", "__base__", - "adapter_fr", "adapter_fr", "adapter_fr", - "adapter_de", "adapter_de", "adapter_de", -] -output = peft_model.generate(**inputs, adapter_names=adapter_names, max_new_tokens=20) -``` - -Note that the order does not matter here, i.e. the samples in the batch don't need to be grouped by adapter as in the example above. We just need to ensure that the `adapter_names` argument is aligned correctly with the samples.
- -Additionally, the same approach also works with the `modules_to_save` feature, which allows for saving and reusing specific neural network layers, such as custom heads for classification tasks, across different LoRA adapters. - -#### Caveats - -Using this feature has some drawbacks, namely: - -- It only works for inference, not for training. -- Disabling adapters using the `with model.disable_adapter()` context takes precedence over `adapter_names`. -- You cannot pass `adapter_names` when some adapter weights were merged with base weight using the `merge_adapter` method. Please unmerge all adapters first by calling `model.unmerge_adapter()`. -- For obvious reasons, this cannot be used after calling `merge_and_unload()`, since all the LoRA adapters will be merged into the base weights in this case. -- This feature does not currently work with DoRA, so set `use_dora=False` in your `LoraConfig` if you want to use it. -- The `modules_to_save` feature is currently only supported for the layers of types `Linear`, `Embedding`, `Conv2d` and `Conv1d`. -- There is an expected overhead for inference with `adapter_names`, especially if the amount of different adapters in the batch is high. This is because the batch size is effectively reduced to the number of samples per adapter. If runtime performance is your top priority, try the following: - - Increase the batch size. - - Try to avoid having a large number of different adapters in the same batch, prefer homogeneous batches. This can be achieved by buffering samples with the same adapter and only perform inference with a small handful of different adapters. - - Take a look at alternative implementations such as [LoRAX](https://github.com/predibase/lorax), [punica](https://github.com/punica-ai/punica), or [S-LoRA](https://github.com/S-LoRA/S-LoRA), which are specialized to work with a large number of different adapters. 
- - -### Composing and Reusing LoRA Adapters -#### Arrow -[Arrow](https://huggingface.co/papers/2405.11157) is a modular routing algorithm designed to combine multiple pre-trained task-specific LoRA adapters to solve a given task. Rather than merging all adapters naively, Arrow introduces a **gradient-free, token-wise mixture-of-experts (MoE) routing mechanism**. At inference time, it first computes a _prototype_ for each LoRA by extracting the top right singular vector from its SVD decomposition. Each token representation is then compared to these prototypes via cosine similarity to obtain routing coefficients. Tokens are assigned to the top-k most relevant LoRA adapters, with the coefficients normalized through softmax, and their outputs linearly combined. This allows effective reuse of existing LoRA modules for new tasks and leads to stronger zero-shot generalization. - -In PEFT, Arrow is enabled through [`ArrowConfig]` and `create_arrow_model`. You can also configure parameters such as `top_k` (the number of LoRA adapters combined per token), `router_temperature` (the softmax temperature applied to the routing coefficients), and `rng_seed` (for reproducibility). - -```py -from peft import create_arrow_model, ArrowConfig -from transformers import AutoModelForCausalLM - -# Loading the model -base_model = AutoModelForCausalLM.from_pretrained("microsoft/Phi-3-mini-4k-instruct") - -# Creating the Arrow config -arrow_config = ArrowConfig( - top_k=3, - router_temperature=1.0, - rng_seed=42, -) - -# The LoRA adapters below were trained on a clustered FLAN dataset. -# Task clustering was performed using the Model-Based Clustering (MBC) method, -# as described in the Arrow paper. -# While one could train a separate LoRA for each task and let Arrow route tokens among them, -# training LoRAs on clusters of tasks instead provides an indirect optimization for -# transfer across the multi-task dataset. 
-task_specific_adapter_paths = [ - f"TahaBa/phi3-mini-clustered-flan/ts_expert_{i}" for i in range(10) - ] - -# Creating the Arrow model -model = create_arrow_model( - base_model=base_model, - task_specific_adapter_paths=task_specific_adapter_paths, - arrow_config=arrow_config, - ) - -# Now the forward path could be called on this model, like a normal PeftModel. -``` - -Furthermore, you can add or remove adapters after calling ```create_arrow_model```—for example, to fine-tune a new adapter or discard an unnecessary one. Once the adapters are in place, you can activate the ```"arrow_router"``` for inference to use Arrow. Note that if you add a new LoRA adapter after ```create_arrow_model``` and want to fine-tune it, you must explicitly set the new adapter as active, since ```"arrow_router"``` is activated by default in ```create_arrow_model```. - -```py -from trl import SFTTrainer, SFTConfig - -# Adding a new adapter and activating it -model.add_adapter(adapter_name='new_adapter') -model.set_adapter('new_adapter') - -# Now the model could be trained along the `new_adapter`. -trainer = SFTTrainer( - model=model, - args=SFTConfig(...), - ... - ) - -# Once the training is done, you can activate `arrow_router` and use it in inference -model.set_adapter('arrow_router') # Model is ready to be used at inference time now -``` - -#### GenKnowSub -[GenKnowSub](https://aclanthology.org/2025.acl-short.54/) augments Arrow by purifying task-specific LoRA adapters before routing. The key idea is to subtract general knowledge encoded in LoRA space—based on the [forgetting-via-negation principle](https://huggingface.co/papers/2212.04089)—so that task adapters become more isolated and focused on task-relevant signals. Concretely, GenKnowSub estimates a low-dimensional “general” subspace from a set of general (non task-specific) LoRA adapters and removes this component from each task adapter’s LoRA update prior to Arrow’s token-wise routing. 
This typically improves compositionality and reduces interference when combining many task adapters. - -In PEFT, enable GenKnowSub by setting ```use_gks=True``` in ArrowConfig, and providing ```general_adapter_paths``` in ```create_arrow_model```: - -```py -from peft import create_arrow_model, ArrowConfig -from transformers import AutoModelForCausalLM - -# Loading the model -base_model = AutoModelForCausalLM.from_pretrained("microsoft/Phi-3-mini-4k-instruct") - -# Creating the Arrow config -arrow_config = ArrowConfig( - top_k=3, - router_temperature=1.0, - use_gks=True, - rng_seed=42, -) - -# Path to task-specific, trained on flan clustered dataset (as we explained before.) -task_specific_adapter_paths = [ - f"TahaBa/phi3-mini-clustered-flan/ts_expert_{i}" for i in range(10) - ] -# These general adapters are trained on English, German, and French Wikipedia dataset, -# with causal language modelling objective, each pair like: (507 token tsentence, 5 token completion), and the loss computed on the completion -general_adapter_paths = [ - "TahaBa/phi3-mini-general-adapters/cluster0_batch16_prop1.0_langen/checkpoint-17", - "TahaBa/phi3-mini-general-adapters/cluster0_batch16_prop1.0_langfr/checkpoint-35", - "TahaBa/phi3-mini-general-adapters/cluster0_batch16_prop1.0_langger/checkpoint-17" - ] - -# Creating the Arrow model -model = create_arrow_model( - base_model=base_model, - task_specific_adapter_paths=task_specific_adapter_paths, - general_adapter_paths=general_adapter_paths, - arrow_config=arrow_config, - ) - -# Now the forward path could be called on this model, like a normal PeftModel. -``` -To encode general knowledge, GenKnowSub subtracts the average of the provided general adapters from each task-specific adapter once, before routing begins. Furthermore, the ability to add or remove adapters after calling ```create_arrow_model``` (as described in the Arrow section) is still supported in this case. 
- -> [!TIP] -> **Things to keep in mind when using Arrow + GenKnowSub:** -> -> - All LoRA adapters (task-specific and general) must share the same ```rank``` and ```target_modules```. -> -> - Any inconsistency in these settings will raise an error in ```create_arrow_model```. -> -> - Having different scaling factors (```lora_alpha```) across task adapters is supported — Arrow handles them automatically. -> -> - Merging the ```"arrow_router"``` is not supported, due to its dynamic routing behavior. -> -> - In create_arrow_model, task adapters are loaded as ```task_i``` and general adapters as ```gks_j``` (where ```i``` and ```j``` are indices). The function ensures consistency of ```target_modules```, ```rank```, and whether adapters are applied to ```Linear``` or ```Linear4bit``` layers. It then adds the ```"arrow_router"``` module and activates it. Any customization of this process requires overriding ```create_arrow_model```. -> -> - This implementation is compatible with 4-bit quantization (via bitsandbytes): -> -> ```py -> from transformers import AutoModelForCausalLM, BitsAndBytesConfig -> import torch -> -> # Quantisation config -> bnb_config = BitsAndBytesConfig( -> load_in_4bit=True, -> bnb_4bit_quant_type="nf4", -> bnb_4bit_compute_dtype=torch.bfloat16, -> bnb_4bit_use_double_quant=False, -> ) -> -> # Loading the model -> base_model = AutoModelForCausalLM.from_pretrained( -> "microsoft/Phi-3-mini-4k-instruct", -> dtype=torch.bfloat16, -> device_map="auto", -> quantization_config=bnb_config, -> ) -> -> # Now call create_arrow_model() as we explained before. 
-> ``` diff --git a/docs/source/developer_guides/memory_efficient_training.md b/docs/source/developer_guides/memory_efficient_training.md new file mode 100644 index 0000000000..5ba8cea088 --- /dev/null +++ b/docs/source/developer_guides/memory_efficient_training.md @@ -0,0 +1,48 @@ + + +# Memory Efficient Training + +🤗 PEFT provides you with methods for parameter efficient fine-tuning but that doesn't mean that your training process is memory efficient. This guide is a collection of tips that you can use to improve memory efficiency of your training process. This guide is mostly an overview page that will link you to the respective other guides and offer some tips for specific situations. + +## Choosing the right method + +Not every PEFT method is built equally and some formulations are easier to build in a memory efficient manner. If you are on a memory budget it makes sense to check out the [PEFT method comparison suite](https://huggingface.co/spaces/peft-internal-testing/PEFT-method-comparison) and filter for **maximum** accelerator memory usage. Average accelerator memory usage can be fairly equal across methods, but not every method scales equally with activations and sequence length, and some methods are more prone to memory spikes than others. + +Especially when targeting large layers like language modeling heads or embedding layers to fine-tune specific tokens it might make sense to look into [using trainable tokens](troubleshooting#using-trainable-tokens). + +## Chunked NLL loss + +Using [`NLLLoss`](https://docs.pytorch.org/docs/stable/generated/torch.nn.NLLLoss.html) is very common when training language models (or classification tasks, for that matter) but it is usually computed in one go, meaning you will allocate a matrix of size `batch × sequence × vocabulary`. With particularly long sequences or large vocabularies this can get expensive fast.
+ +When using [TRL](https://huggingface.co/docs/trl) you can either use the [Liger kernel integration](https://huggingface.co/docs/trl/liger_kernel_integration) or use [Chunked NLLLoss](https://huggingface.co/docs/trl/v1.5.1/en/reducing_memory_usage#chunked-cross-entropy-for-reducing-peak-memory-usage). The latter will split the sequence into chunks of size 256 to keep the maximum memory consumption constant. + +![NLL vs. Chunked NLL comparison](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/peft/chunked_nll.png) + +In case the default chunk size is not optimal for your setting, look in the [original TRL PR](https://github.com/huggingface/trl/pull/5575) for more information on how to tune the chunk size. + +## Quantization + +Quantization is one of the best ways to reduce memory consumption *of the base model* and will, depending on the employed quantization, also reduce activation memory. Since the PEFT methods will only take up a small portion of the total number of parameters, PEFT defaults to using a higher precision than the base model. This can also have the effect that adapters can mitigate some of the quality loss incurred by quantization methods. Read the [PEFT quantization guide](quantization). + +## Compilation + +The models we train are composed of operations like matrix multiplications, sums and assignments where each operation produces a new result and, subsequently, needs to take up memory. If those intermediate results are not needed we can fuse these operations and save memory. This is just one of many optimizations that `torch.compile` can do for you, so check out the [PEFT torch.compile guide](torch_compile). + +## Gradient Checkpointing + +You can trade computation for memory by only saving a subset of the intermediate activations between layers during the forward pass and recomputing the rest on the fly during the backward pass. Check out the [gradient checkpointing](https://huggingface.co/docs/transformers/grad_checkpointing) documentation of Transformers to learn more.
+ diff --git a/docs/source/developer_guides/troubleshooting.md b/docs/source/developer_guides/troubleshooting.md index 88d44aa9d5..d56dea44d6 100644 --- a/docs/source/developer_guides/troubleshooting.md +++ b/docs/source/developer_guides/troubleshooting.md @@ -119,7 +119,7 @@ peft_model = PeftModel.from_pretrained(base_model, peft_model_id) ### Randomly initialized layers -For some tasks, it is important to correctly configure `modules_to_save` in the config to account for randomly initialized layers. +For some tasks, it is important to correctly configure `modules_to_save` in the config to account for randomly initialized layers. As an example, this is necessary if you use LoRA to fine-tune a language model for sequence classification because 🤗 Transformers adds a randomly initialized classification head on top of the model. If you do not add this layer to `modules_to_save`, the classification head won't be saved. The next time you load the model, you'll get a _different_ randomly initialized classification head, resulting in completely different results. @@ -147,7 +147,7 @@ For many language fine-tuning tasks, extending the model's vocabulary is necessa #### Using trainable tokens -Let's start with trainable tokens, in this case its [LoRA integration](../developer_guides/lora#efficiently-train-tokens-alongside-lora). If you're interested in only training the new embeddings and nothing else, refer to the [standalone documentation](../package_reference/trainable_tokens). +Let's start with trainable tokens, in this case its [LoRA integration](../package_reference/lora#efficiently-train-tokens-alongside-lora). If you're interested in only training the new embeddings and nothing else, refer to the [standalone documentation](../package_reference/trainable_tokens). To enable selective token training of the embedding layer, you'll need to supply the token ids of your newly added tokens via the `trainable_token_indices` parameter. 
Optionally you can specify which layer to target if there is more than one embedding layer. For a Mistral model this could look like this: @@ -227,7 +227,7 @@ As always, it is best practice to ensure the model works correctly for inference ### Check layer and model status -Sometimes a PEFT model can end up in a bad state, especially when handling multiple adapters. There can be some confusion around what adapters exist, which one is active, which one is merged, etc. To help investigate this issue, call the [`~peft.PeftModel.get_layer_status`] and the [`~peft.PeftModel.get_model_status`] methods. +Sometimes a PEFT model can end up in a bad state, especially when handling multiple adapters. There can be some confusion around what adapters exist, which one is active, which one is merged, etc. To help investigate this issue, call the [`~peft.PeftModel.get_layer_status`] and the [`~peft.PeftModel.get_model_status`] methods. The [`~peft.PeftModel.get_layer_status`] method gives you a detailed overview of each targeted layer's active, merged, and available adapters. diff --git a/docs/source/tutorial/peft_integrations.md b/docs/source/guides/peft_integrations.md similarity index 100% rename from docs/source/tutorial/peft_integrations.md rename to docs/source/guides/peft_integrations.md diff --git a/docs/source/tutorial/peft_model_config.md b/docs/source/guides/peft_model_config.md similarity index 92% rename from docs/source/tutorial/peft_model_config.md rename to docs/source/guides/peft_model_config.md index 83aa7705da..4f6df826ff 100644 --- a/docs/source/tutorial/peft_model_config.md +++ b/docs/source/guides/peft_model_config.md @@ -27,7 +27,7 @@ The PEFT library is designed to help you quickly train large models on free or l A configuration stores important parameters that specify how a particular PEFT method should be applied. 
-For example, take a look at the following [`LoraConfig`](https://huggingface.co/ybelkada/opt-350m-lora/blob/main/adapter_config.json) for applying LoRA and [`PromptEncoderConfig`](https://huggingface.co/smangrul/roberta-large-peft-p-tuning/blob/main/adapter_config.json) for applying p-tuning (these configuration files are already JSON-serialized). Whenever you load a PEFT adapter, it is a good idea to check whether it has an associated adapter_config.json file which is required. +For example, take a look at the following `LoraConfig` for applying LoRA and `PromptEncoderConfig` for applying p-tuning (these configuration files are already JSON-serialized). Whenever you load a PEFT adapter, it is a good idea to check whether it has an associated `adapter_config.json` file which is required. @@ -170,7 +170,7 @@ from peft import AutoPeftModelForCausalLM lora_model = AutoPeftModelForCausalLM.from_pretrained("ybelkada/opt-350m-lora") ``` -Take a look at the [AutoPeftModel](package_reference/auto_class) API reference to learn more about the [`AutoPeftModel`] classes. +Take a look at the [AutoPeftModel](../package_reference/auto_class) API reference to learn more about the [`AutoPeftModel`] classes. ## Next steps diff --git a/docs/source/index.md b/docs/source/index.md index d38544311f..0db7b2381d 100644 --- a/docs/source/index.md +++ b/docs/source/index.md @@ -16,34 +16,27 @@ rendered properly in your Markdown viewer. # PEFT -🤗 PEFT (Parameter-Efficient Fine-Tuning) is a library for efficiently adapting large pretrained models to various downstream applications without fine-tuning all of a model's parameters because it is prohibitively costly. PEFT methods only fine-tune a small number of (extra) model parameters - significantly decreasing computational and storage costs - while yielding performance comparable to a fully fine-tuned model. This makes it more accessible to train and store large language models (LLMs) on consumer hardware. 
+🤗 PEFT (Parameter-Efficient Fine-Tuning) is a library for efficiently adapting large pretrained models to various downstream applications without fine-tuning all of a model's parameters because it is prohibitively costly. PEFT methods only fine-tune a small number of (extra) model parameters - significantly decreasing computational and storage costs - while yielding performance comparable to a fully fine-tuned model. This makes it more accessible to train and store large language models (LLMs) and other big models on consumer hardware. PEFT is integrated with the Transformers, Diffusers, and Accelerate libraries to provide a faster and easier way to load, train, and use large models for inference. +
+
+ There are numerous methods to "adapt" existing models, often extensively integrating into the model. PEFT can be thought of as a framework for arbitrary methods of model adaption (modifying weights, wrapping layers, manipulating KV-caches, ...) while also serving as a reference implementation for many fine-tuning methods. +
+
+
+
- - diff --git a/docs/source/methods/overview.md b/docs/source/methods/overview.md new file mode 100644 index 0000000000..a63f9571ea --- /dev/null +++ b/docs/source/methods/overview.md @@ -0,0 +1,68 @@ + + + +# Parameter efficient fine-tuning methods + +Training a model parameter efficiently means to train as few parameters as possible to achieve comparable performance to training all parameters, i.e. full fine-tuning. There is, of course, no free lunch: by using fewer, and therefore less expressive, parameters, it is not guaranteed that you will get the same performance! You may need to use a specific PEFT method to get optimal results for the model/task combination you want to train. But you will need less memory and possibly less compute during training and may gain features such as fast hot-swapping between trained expert models and less forgetting of previous knowledge compared to full fine-tuning. + +Giving general advice for training large models is hard but for generative +models, especially language models, you can follow these steps: + +1. Use prompting (few-shot examples in the prompt) to see if the model is + already capable of the task. If the model solves your problem, great! You can + now use [Prompt-based methods](#prompt-based-methods) to learn the prompt and + save precious tokens. +2. If prompt-based methods are not sufficient you can use [layer tuning](#layer-tuning) + and [adapter methods](#adapter-methods). These methods are generally + more expressive than prompt-based methods and get closer to full fine-tuning. +3. Make sure to measure retention of already learnt knowledge since each + fine-tuning step is potentially unlearning past knowledge. + +The [PEFT method comparison suite](https://huggingface.co/spaces/peft-internal-testing/PEFT-method-comparison) aims to give a rough overview of (most) implemented methods on selected benchmarks and models.
+ + +## Adapter methods + +Adapter methods can be seen as ways of adding relatively small, trainable matrices to existing models for fine-tuning. The goal is to introduce few trainable parameters to steer the big model in the direction of the task that needs fine-tuning to save on resources, such as memory or compute. + +A popular way to realize adapters is to insert smaller trainable matrices that are a low-rank decomposition of the adapted weight's layout to save on memory. There are several different ways to express the weight matrix as a low-rank decomposition, but [Low-Rank Adaptation (LoRA)](../package_reference/lora) is the most common method. The PEFT library supports several other variations of this formulation - some are direct variants of LoRA and are documented under LoRA, some are different enough to count as their own methods, such as [Low-Rank Hadamard Product (LoHa)](../package_reference/loha), [Low-Rank Kronecker Product (LoKr)](../package_reference/lokr), and [Adaptive Low-Rank Adaptation (AdaLoRA)](../package_reference/adalora). If you're interested in applying these methods to other tasks and use cases like semantic segmentation or token classification, take a look at our [notebook collection](https://huggingface.co/collections/PEFT/notebooks-6573b28b33e5a4bf5b157fc1)! + +> [!TIP] +> LoRA is one of the most popular PEFT methods and a good starting point if you're just getting started with PEFT. It was originally developed for large language models but it is also a tremendously popular training method for diffusion models because of its efficiency and effectiveness. + +Low-rank adapters are only one possible adapter formulation; PEFT implements many other types of adapters as well. For example, Orthogonal Fine-Tuning methods ([OFT](../package_reference/oft), [BOFT](../package_reference/boft), ...) use orthogonal decompositions of the adapter weights to achieve small size.
Methods like [MiSS](../package_reference/miss) shard matrices and share these shards to save on memory. [IA3](../package_reference/ia3) just introduces three trainable vectors to steer the original model. + +## Prompt-based methods + +Prompting primes a frozen pretrained model for a specific downstream task by including a text prompt that describes the task or even demonstrates an example of the task. With prompting, you can avoid fully training a separate model for each downstream task, and use the same frozen pretrained model instead. This is a lot easier because you can use the same model for several different tasks, and it is significantly more efficient to train and store a smaller set of prompt parameters than to train all the model's parameters. + +There are two categories of prompting methods: + +- hard prompts are manually handcrafted text prompts with discrete input tokens; the downside is that it requires a lot of effort to create a good prompt +- soft prompts are learnable tensors concatenated with the input embeddings that can be optimized to a dataset; the downside is that they aren't human readable because you aren't matching these "virtual tokens" to the embeddings of a real word + +The PEFT library supports several types of prompting methods (p-tuning, prefix tuning, prompt tuning, ...), explore the table of contents for a full listing of soft prompt methods. +If you're interested in applying these methods to other tasks and use cases, take a look at our [notebook collection](https://huggingface.co/spaces/PEFT/soft-prompting)! + +> [!TIP] +> Some familiarity with the general process of training a causal language model would be really helpful and allow you to focus on the soft prompting methods. If you're new, we recommend taking a look at the [Causal language modeling](https://huggingface.co/docs/transformers/tasks/language_modeling) guide first from the Transformers documentation. 
When you're ready, come back and see how easy it is to drop PEFT in to your training! + +## Layer Tuning + +Layer Tuning categorizes methods that target specific layers of a model such as [LayerNorm Tuning](../package_reference/layernorm_tuning) +or targeting specific tokens in the embedding matrix via [TrainableTokens](../package_reference/trainable_tokens). + diff --git a/docs/source/package_reference/adalora.md b/docs/source/package_reference/adalora.md index 9cc51d0e09..f96142d9f2 100644 --- a/docs/source/package_reference/adalora.md +++ b/docs/source/package_reference/adalora.md @@ -16,16 +16,64 @@ rendered properly in your Markdown viewer. # AdaLoRA -[AdaLoRA](https://hf.co/papers/2303.10512) is a method for optimizing the number of trainable parameters to assign to weight matrices and layers, unlike LoRA, which distributes parameters evenly across all modules. More parameters are budgeted for important weight matrices and layers while less important ones receive fewer parameters. +[AdaLoRA](https://hf.co/papers/2303.10512) (Adaptive LoRA) is a method for optimizing the number of trainable parameters to assign to weight matrices and layers, unlike LoRA, which distributes parameters evenly across all modules. More parameters are budgeted for important weight matrices and layers while less important ones receive fewer parameters. You can control the average desired *rank* or `r` of the matrices, and which modules to apply AdaLoRA to with `target_modules`. Other important parameters to set are `lora_alpha` (scaling factor), and `modules_to_save` (the modules apart from the AdaLoRA layers to be trained and saved). All of these parameters - and more - are found in the [`AdaLoraConfig`]. The abstract from the paper is: *Fine-tuning large pre-trained language models on downstream tasks has become an important paradigm in NLP. 
However, common practice fine-tunes all of the parameters in a pre-trained model, which becomes prohibitive when a large number of downstream tasks are present. Therefore, many fine-tuning methods are proposed to learn incremental updates of pre-trained weights in a parameter efficient way, e.g., low-rank increments. These methods often evenly distribute the budget of incremental updates across all pre-trained weight matrices, and overlook the varying importance of different weight parameters. As a consequence, the fine-tuning performance is suboptimal. To bridge this gap, we propose AdaLoRA, which adaptively allocates the parameter budget among weight matrices according to their importance score. In particular, AdaLoRA parameterizes the incremental updates in the form of singular value decomposition. Such a novel approach allows us to effectively prune the singular values of unimportant updates, which is essentially to reduce their parameter budget but circumvent intensive exact SVD computations. We conduct extensive experiments with several pre-trained models on natural language processing, question answering, and natural language generation to validate the effectiveness of AdaLoRA. Results demonstrate that AdaLoRA manifests notable improvement over baselines, especially in the low budget settings. Our code is publicly available at https://github.com/QingruZhang/AdaLoRA*. +> [!WARNING] +> AdaLoRA has an [`~AdaLoraModel.update_and_allocate`] method that should be called at each training step to update the parameter budget and mask, otherwise the adaptation step is not performed. This requires writing a custom training loop or subclassing the [`~transformers.Trainer`] to incorporate this method. As an example, take a look at this [custom training loop](https://github.com/huggingface/peft/blob/912ad41e96e03652cabf47522cd876076f7a0c4f/examples/conditional_generation/peft_adalora_seq2seq.py#L120). 
+ +AdaLoRA manages the parameter budget introduced from LoRA by allocating more parameters - in other words, a higher rank `r` - for important weight matrices that are better adapted for a task and pruning less important ones. The rank is controlled by a method similar to singular value decomposition (SVD). The $\Delta W$ is parameterized with two orthogonal matrices and a diagonal matrix which contains singular values. This parametrization method avoids iteratively applying SVD which is computationally expensive. Based on this method, the rank of $\Delta W$ is adjusted according to an importance score. $\Delta W$ is divided into triplets and each triplet is scored according to its contribution to model performance. Triplets with low importance scores are pruned and triplets with high importance scores are kept for finetuning. + +Training with AdaLoRA has three phases: the init phase, the budgeting phase and the final phase. In the initial phase, no budgeting is applied, therefore the ranks are not touched. During the budgeting phase the process described above is applied and the rank is redistributed according to a budget, aiming to give more important adapters more rank and less important layers less. When reaching the final phase, budgeting has ended, the ranks are redistributed but we may continue training for a while with the redistributed ranks to further improve performance. + +> [!NOTE] +> **Contributions welcome**: This section needs clarification. +> +> It is unclear how importance is measured. The explanations are also a bit redundant and could benefit from consolidation. +> See [here](../developer_guides/contributing#documentation-improvements) on how to contribute. 
+ +## Benchmark overview + + + +## Usage + + +```py +from peft import AdaLoraConfig, get_peft_model + +config = AdaLoraConfig( + r=8, + init_r=12, + tinit=200, + tfinal=1000, + deltaT=10, + target_modules=["query", "value"], + modules_to_save=["classifier"], +) +model = get_peft_model(model, config) +model.print_trainable_parameters() +"trainable params: 520,325 || all params: 87,614,722 || trainable%: 0.5938785036606062" + +[... training code ...] + +model.update_and_allocate(step_idx) +``` + +# API + ## AdaLoraConfig [[autodoc]] tuners.adalora.config.AdaLoraConfig ## AdaLoraModel -[[autodoc]] tuners.adalora.model.AdaLoraModel \ No newline at end of file +[[autodoc]] tuners.adalora.model.AdaLoraModel diff --git a/docs/source/package_reference/adamss.md b/docs/source/package_reference/adamss.md index 2ef4e550bc..fe3279b29e 100644 --- a/docs/source/package_reference/adamss.md +++ b/docs/source/package_reference/adamss.md @@ -22,13 +22,23 @@ The abstract from the paper is: > We propose AdaMSS, an adaptive multi-subspace approach for parameter-efficient fine-tuning of large models. Unlike traditional parameterefficient fine-tuning methods that operate within a large single subspace of the network weights, AdaMSS leverages subspace segmentation to obtain multiple smaller subspaces and adaptively reduces the number of trainable parameters during training, ultimately updating only those associated with a small subset of subspaces most relevant to the target downstream task. By using the lowest-rank representation, AdaMSS achieves more compact expressiveness and finer tuning of the model parameters. Theoretical analyses demonstrate that AdaMSS has better generalization guarantee than LoRA, PiSSA, and other single-subspace low-rankbased methods. 
Extensive experiments across image classification, natural language understanding, and natural language generation tasks show that AdaMSS achieves comparable performance to full fine-tuning and outperforms other parameterefficient fine-tuning methods in most cases, all while requiring fewer trainable parameters. Notably, on the ViT-Large model, AdaMSS achieves 4.7% higher average accuracy than LoRA across seven tasks, using just 15.4% of the trainable parameters. On RoBERTa-Large, AdaMSS outperforms PiSSA by 7% in average accuracy across six tasks while reducing the number of trainable parameters by approximately 94.4%. These results demonstrate the effectiveness of AdaMSS in parameter-efficient fine-tuning. The code for AdaMSS is available at https: //github.com/jzheng20/AdaMSS. - AdaMSS currently has the following constraints: - Only `nn.Linear` layers are supported. - Requires scikit-learn for the KMeans clustering step. If these constraints don't work for your use case, consider other methods instead. +## Benchmark overview + + + +# API + ## AdamssConfig [[autodoc]] tuners.adamss.config.AdamssConfig diff --git a/docs/source/package_reference/beft.md b/docs/source/package_reference/beft.md index f3b29468d1..f7c5de4a39 100644 --- a/docs/source/package_reference/beft.md +++ b/docs/source/package_reference/beft.md @@ -21,7 +21,7 @@ rendered properly in your Markdown viewer. BEFT currently has the following tradeoffs: Pros: -- BEFT requires far fewer parameters than LoRA, while maintaining competitive or superior performance across tasks in low-data regimes. +- BEFT requires far fewer parameters than LoRA, while maintaining competitive or superior performance across tasks in low-data regimes. Cons: - In high-data regimes, BEFT may show limited effectiveness compared to LoRA and full-parameters fine-tuning. 
@@ -32,10 +32,21 @@ The abstract from the paper is: *Fine-tuning the bias terms of large language models (LLMs) has the potential to achieve unprecedented parameter efficiency while maintaining competitive performance, particularly in low-data regimes. However, the link between fine-tuning different bias terms (i.e., **b**q, **b**k, and **b**v in the query, key, or value projections) and downstream performance remains largely unclear to date. In this paper, we investigate the link between fine-tuning **b**q, **b**k, and **b**v with the performance of the downstream task. Our key finding is that directly fine-tuning **b**v generally leads to higher downstream performance in low-data regimes, in comparison to **b**q and **b**k. We extensively evaluate this unique property across a wide range of LLMs spanning encoder-only and decoder-only architectures up to 6.7B parameters (including bias-free LLMs). Our results provide strong evidence for the effectiveness of directly fine-tuning **b**v across various downstream tasks*. +## Benchmark overview + + + +# API + ## BeftConfig [[autodoc]] tuners.beft.config.BeftConfig ## BeftModel -[[autodoc]] tuners.beft.model.BeftModel \ No newline at end of file +[[autodoc]] tuners.beft.model.BeftModel diff --git a/docs/source/package_reference/boft.md b/docs/source/package_reference/boft.md index 48231fa9fd..5a31d0adf4 100644 --- a/docs/source/package_reference/boft.md +++ b/docs/source/package_reference/boft.md @@ -1,4 +1,4 @@ - + +# Weight-Decomposed Low-Rank Adaptation (DoRA) + +This technique decomposes the updates of the weights into two parts, magnitude and direction. Direction is handled by normal LoRA, whereas the magnitude is handled by a separate learnable parameter. This can improve the performance of LoRA, especially at low ranks. For more information on DoRA, see https://huggingface.co/papers/2402.09353. + +```py +from peft import LoraConfig + +config = LoraConfig(use_dora=True, ...) 
+``` + +If parts of the model or the DoRA adapter are offloaded to CPU, you can get a significant speedup at the cost of some temporary (ephemeral) VRAM overhead by using `ephemeral_gpu_offload=True` in `config.runtime_config`. + +```py +from peft import LoraConfig, LoraRuntimeConfig + +config = LoraConfig(use_dora=True, runtime_config=LoraRuntimeConfig(ephemeral_gpu_offload=True), ...) +``` + +A `PeftModel` with a DoRA adapter can also be loaded with the `ephemeral_gpu_offload=True` flag using the `from_pretrained` method as well as the `load_adapter` method. + +```py +from peft import PeftModel + +model = PeftModel.from_pretrained(base_model, peft_model_id, ephemeral_gpu_offload=True) +``` + +## Optimization + +DoRA is optimized (computes faster and takes less memory) for models in evaluation mode, or when dropout is set to 0. We reuse the +base result at those times to get the speedup. +Running [dora finetuning](https://github.com/huggingface/peft/blob/main/examples/dora_finetuning/dora_finetuning.py) +with `CUDA_VISIBLE_DEVICES=0 ZE_AFFINITY_MASK=0 time python examples/dora_finetuning/dora_finetuning.py --quantize --lora_dropout 0 --batch_size 16 --eval_step 2 --use_dora` on a 4090 with gradient accumulation set to 2 and max step to 20 resulted in the following observations: + +| | Without Optimization | With Optimization | +| :--: | :--: | :--: | +| train runtime (sec) | 359.7298 | **279.2676** | +| train samples per second | 1.779 | **2.292** | +| train steps per second | 0.056 | **0.072** | + +Moreover, it is possible to further increase runtime performance of DoRA by using the [`DoraCaching`] helper context.
This requires the model to be in `eval` mode: + +```py +from peft.helpers import DoraCaching + +model.eval() +with DoraCaching(): + output = model(inputs) +``` + +For [`meta-llama/Llama-3.1-8B`](https://huggingface.co/meta-llama/Llama-3.1-8B), the [DoRA caching benchmark script](https://github.com/huggingface/peft/blob/main/examples/dora_finetuning/dora-caching.py) shows that, compared to LoRA: + +- DoRA without caching requires 139% more time +- DoRA without caching requires 4% more memory +- DoRA with caching requires 17% more time +- DoRA with caching requires 41% more memory + +Caching can thus make inference with DoRA significantly faster but it also requires significantly more memory. Ideally, if the use case allows it, just merge the DoRA adapter to avoid both memory and runtime overhead. + +## Caveats + +- DoRA only supports embedding, linear, and Conv2d layers at the moment. +- DoRA introduces a bigger overhead than pure LoRA, so it is recommended to merge weights for inference, see [`LoraModel.merge_and_unload`]. +- DoRA should work with weights quantized with bitsandbytes ("QDoRA"). However, issues have been reported when using QDoRA with DeepSpeed Zero2. + diff --git a/docs/source/package_reference/lora_variant_monteclora.md b/docs/source/package_reference/lora_variant_monteclora.md new file mode 100644 index 0000000000..bc6a63b9b2 --- /dev/null +++ b/docs/source/package_reference/lora_variant_monteclora.md @@ -0,0 +1,51 @@ +# MonteCLoRA (Monte Carlo Low-Rank Adaptation) + +MonteCLoRA wraps a standard LoRA adapter with a small variational module that draws Monte Carlo samples of stochastic perturbations on top of the LoRA `A` matrix during training. Concretely, it learns variational parameters (a Wishart-based covariance, a per-sample multivariate-normal noise term, and a Dirichlet weighting over the samples) and adds the resulting averaged perturbation to `lora_A` at every forward pass.
A KL-divergence + entropy term is added to the training loss to keep these variational parameters anchored to a sensible prior. At inference time the sampler is disabled and MonteCLoRA behaves exactly like a regular LoRA adapter, so there is **no extra inference cost or extra parameters to merge**. For the full method see https://huggingface.co/papers/2411.04358. + +You may want to consider MonteCLoRA when: + +- You are fine-tuning on a small or noisy dataset and want stronger regularization than vanilla LoRA. The Monte Carlo averaging and the KL term together act as a Bayesian-style regularizer. +- You want better uncertainty calibration / robustness from your adapter without paying extra cost at inference time (the variational machinery is training-only). +- Vanilla LoRA is overfitting and lowering `r` or increasing `lora_dropout` is not enough. + +You probably do *not* need MonteCLoRA when you have a large, clean dataset and vanilla LoRA already trains stably — in that regime the extra variational parameters mostly add training overhead without much benefit. + +To enable MonteCLoRA, pass a `MontecloraConfig` to `LoraConfig`: + +```py +from peft import LoraConfig, MontecloraConfig + +monteclora_config = MontecloraConfig( + num_samples=8, # number of Monte Carlo samples per forward pass + sample_scaler=1e-4, # magnitude of the variational perturbation + kl_loss_weight=1e-5, # weight of the KL term added to the training loss +) +config = LoraConfig( + r=16, + lora_alpha=32, + target_modules=["q_proj", "v_proj"], + monteclora_config=monteclora_config, +) +``` + +During training you must add the variational regularization loss to the task loss. The simplest way is to call [`LoraModel._get_monteclora_loss`] on the underlying `LoraModel`: + +```py +task_loss = ... 
# standard loss returned by your model +monteclora_loss = model._get_monteclora_loss() # 0.0 if MonteCLoRA is not used +total_loss = task_loss + monteclora_loss +total_loss.backward() +``` + +If you train with the HF `Trainer`, you can simply mix in [`peft.helpers.MontecloraTrainerMixin`] which does this for you in `compute_loss`: + +```py +from transformers import Trainer +from peft.helpers import MontecloraTrainerMixin + + +class MontecloraTrainer(MontecloraTrainerMixin, Trainer): + pass +``` + +A complete working example is available at [`examples/monteclora_finetuning`](https://github.com/huggingface/peft/tree/main/examples/monteclora_finetuning). diff --git a/docs/source/package_reference/lora_variant_velora.md b/docs/source/package_reference/lora_variant_velora.md new file mode 100644 index 0000000000..8658108cd4 --- /dev/null +++ b/docs/source/package_reference/lora_variant_velora.md @@ -0,0 +1,49 @@ + + +### VeLoRA + +[VeLoRA](https://huggingface.co/papers/2405.17991) is a LoRA variant that reduces training memory by compressing the activations saved for the LoRA in the forward pass and then reconstructing them in the backwards pass to implement the update rules. In PEFT, VeLoRA is configured as a LoRA variant through the `velora_config` argument on [`LoraConfig`]. + +```py +from peft import LoraConfig, VeloraConfig + +config = LoraConfig( + target_modules=["q_proj", "v_proj"], + velora_config=VeloraConfig( + num_groups=64, + scale=0.2, + init_type="batch_average", + ), +) +``` + +VeLoRA is applied to every LoRA layer selected by `target_modules`. `num_groups` controls how the input activation depth is split before compression. If the activation depth is not evenly divisible by `num_groups`, VeLoRA pads the grouped representation internally and removes the padding after reconstruction. `scale` rescales the reconstructed activations during the backward pass, and `init_type` chooses how the projection is initialized. 
+ +Use `batch_average_once` to initialize the projection from the first training batch, `batch_average` to update it from every training forward pass, or `random` to initialize it immediately from a random normalized vector. + +Below are some results with the [MetaMathQA benchmark](https://github.com/huggingface/peft/tree/main/method_comparison/MetaMathQA). + +| Variant | Training Loss | Max Memory (GiB) | Tokens/sec | +|---|---:|---:|---:| +| LoRA | 0.5427 | 27.69 | 2366.2 | +| LoRA + GC | 0.5426 | 13.17 | 1671.8 | +| LoRA+VeLoRA | 0.5427 | 19.94 | 2057.6 | + +#### Caveats + +- VeLoRA is currently supported on standard LoRA linear layers only. + diff --git a/docs/source/package_reference/miss.md b/docs/source/package_reference/miss.md index f8324ee4d3..c2c321fc48 100644 --- a/docs/source/package_reference/miss.md +++ b/docs/source/package_reference/miss.md @@ -22,6 +22,15 @@ The abstract from the paper is: *Parameter-Efficient Fine-Tuning (PEFT) methods, particularly Low-Rank Adaptation (LoRA), effectively reduce the number of trainable parameters in Large Language Models (LLMs). However, as model scales continue to grow, the demand for computational resources remains a significant challenge. Existing LoRA variants often struggle to strike an optimal balance between adaptability (model performance and convergence speed) and efficiency (computational overhead, memory usage, and initialization time). This paper introduces MiSS (Matrix Shard Sharing), a novel PEFT approach that addresses this trade-off through a simple shard-sharing mechanism. MiSS leverages the insight that a low-rank adaptation can be achieved by decomposing the weight matrix into multiple fragment matrices and utilizing a shared, trainable common fragment. This method constructs the low-rank update matrix through the replication of these shared, partitioned shards. We also propose a hardware-efficient and broadly applicable implementation for MiSS. 
Extensive experiments conducted on a range of tasks, alongside a systematic analysis of computational performance, demonstrate MiSS's superiority. The results show that MiSS significantly outperforms standard LoRA and its prominent variants in both model performance metrics and computational efficiency, including initialization speed and training throughput. By effectively balancing expressive power and resource utilization, MiSS offers a compelling solution for efficiently adapting large-scale models.* +## Benchmark overview + + + ## When to use MiSS MiSS is a good choice when: @@ -82,10 +91,12 @@ model.print_trainable_parameters() For a full fine-tuning example including training and inference, see the [MiSS fine-tuning example](https://github.com/huggingface/peft/tree/main/examples/miss_finetuning). +# API + ## MissConfig [[autodoc]] tuners.miss.config.MissConfig ## MissModel -[[autodoc]] tuners.miss.model.MissModel \ No newline at end of file +[[autodoc]] tuners.miss.model.MissModel diff --git a/docs/source/package_reference/multitask_prompt_tuning.md b/docs/source/package_reference/multitask_prompt_tuning.md index 119739a3dc..ad5efa9103 100644 --- a/docs/source/package_reference/multitask_prompt_tuning.md +++ b/docs/source/package_reference/multitask_prompt_tuning.md @@ -22,10 +22,34 @@ The abstract from the paper is: *Prompt tuning, in which a base pretrained model is adapted to each task via conditioning on learned prompt vectors, has emerged as a promising approach for efficiently adapting large language models to multiple downstream tasks. However, existing methods typically learn soft prompt vectors from scratch, and it has not been clear how to exploit the rich cross-task knowledge with prompt vectors in a multitask learning setting. We propose multitask prompt tuning (MPT), which first learns a single transferable prompt by distilling knowledge from multiple task-specific source prompts. 
We then learn multiplicative low rank updates to this shared prompt to efficiently adapt it to each downstream target task. Extensive experiments on 23 NLP datasets demonstrate that our proposed approach outperforms the state-of-the-art methods, including the full finetuning baseline in some cases, despite only tuning 0.035% as many task-specific parameters*. +
+ +
+Multitask prompt tuning enables parameter-efficient transfer learning. + +MPT consists of two stages: + +1. source training - for each task, its soft prompt is decomposed into task-specific vectors. The task-specific vectors are multiplied together to form another matrix W, and the Hadamard product is used between W and a shared prompt matrix P to generate a task-specific prompt matrix. The task-specific prompts are distilled into a single prompt matrix that is shared across all tasks. This prompt is trained with multitask training. +2. target adaptation - to adapt the single prompt for a target task, a target prompt is initialized and expressed as the Hadamard product of the shared prompt matrix and the task-specific low-rank prompt matrix. + +
+ +
+Prompt decomposition. + +## Benchmark overview + +There is no benchmark for this method yet. Feel free to contribute an experiment +configuration but make sure to first create an issue +[here](https://github.com/huggingface/peft/issues). + + +# API + ## MultitaskPromptTuningConfig [[autodoc]] tuners.multitask_prompt_tuning.config.MultitaskPromptTuningConfig ## MultitaskPromptEmbedding -[[autodoc]] tuners.multitask_prompt_tuning.model.MultitaskPromptEmbedding \ No newline at end of file +[[autodoc]] tuners.multitask_prompt_tuning.model.MultitaskPromptEmbedding diff --git a/docs/source/package_reference/oft.md b/docs/source/package_reference/oft.md index 63909b202b..fcf3ee9050 100644 --- a/docs/source/package_reference/oft.md +++ b/docs/source/package_reference/oft.md @@ -16,12 +16,79 @@ rendered properly in your Markdown viewer. # OFT -[Orthogonal Finetuning (OFT)](https://hf.co/papers/2306.07280) is a method developed for adapting text-to-image diffusion models. It works by reparameterizing the pretrained weight matrices with its orthogonal matrix to preserve information in the pretrained model. To reduce the number of parameters, OFT introduces a block-diagonal structure in the orthogonal matrix. +
+ +
+Controlling Text-to-Image Diffusion by Orthogonal Finetuning + +[Orthogonal Finetuning (OFT)](https://hf.co/papers/2306.07280) (see also [OFTv2](https://huggingface.co/papers/2506.19847)) is a method developed for adapting text-to-image diffusion models. It works by reparameterizing the pretrained weight matrices with an orthogonal matrix to preserve information in the pretrained model. To reduce the number of parameters, OFT introduces a block-diagonal structure in the orthogonal matrix. The method primarily focuses on preserving a pretrained model's generative performance in the finetuned model. It tries to maintain the same cosine similarity ([hyperspherical energy](https://huggingface.co/papers/1805.09298)) between all pairwise neurons in a layer because this better captures the semantic information among neurons. This means OFT is more capable at preserving the subject and it is better for controllable generation (similar to [ControlNet](https://huggingface.co/docs/diffusers/using-diffusers/controlnet)). The abstract from the paper is: *Large text-to-image diffusion models have impressive capabilities in generating photorealistic images from text prompts. How to effectively guide or control these powerful models to perform different downstream tasks becomes an important open problem. To tackle this challenge, we introduce a principled finetuning method -- Orthogonal Finetuning (OFT), for adapting text-to-image diffusion models to downstream tasks. Unlike existing methods, OFT can provably preserve hyperspherical energy which characterizes the pairwise neuron relationship on the unit hypersphere. We find that this property is crucial for preserving the semantic generation ability of text-to-image diffusion models. To improve finetuning stability, we further propose Constrained Orthogonal Finetuning (COFT) which imposes an additional radius constraint to the hypersphere.
Specifically, we consider two important finetuning text-to-image tasks: subject-driven generation where the goal is to generate subject-specific images given a few images of a subject and a text prompt, and controllable generation where the goal is to enable the model to take in additional control signals. We empirically show that our OFT framework outperforms existing methods in generation quality and convergence speed*. +OFT preserves the hyperspherical energy by learning an orthogonal transformation for neurons to keep the cosine similarity between them unchanged, potentially leading to less forgetting of previously learnt knowledge. In practice, this means taking the matrix product of an orthogonal matrix with the pretrained weight matrix. However, to be parameter-efficient, the orthogonal matrix is represented as a block-diagonal matrix with rank `r` blocks. Whereas LoRA reduces the number of trainable parameters with low-rank structures, OFT reduces the number of trainable parameters with a sparse block-diagonal matrix structure. + +## Benchmark overview + + + +## Merge OFT weights into the base model + +Similar to LoRA, the weights learned by OFT can be integrated into the pretrained weight matrices using the [`~OFTModel.merge_and_unload`] function. This function merges the adapter weights with the base model which allows you to effectively use the newly merged model as a standalone model.
+ +## OFT Example Usage + +For using OFT for quantized finetuning with [TRL](https://github.com/huggingface/trl) for `SFT`, `PPO`, or `DPO` fine-tuning, follow the following outline: + +```py +from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig +from trl import SFTTrainer +from peft import OFTConfig + +if use_quantization: + bnb_config = BitsAndBytesConfig( + load_in_4bit=True, + bnb_4bit_quant_type="nf4", + bnb_4bit_compute_dtype=torch.bfloat16, + bnb_4bit_use_double_quant=True, + bnb_4bit_quant_storage=torch.bfloat16, + ) + +model = AutoModelForCausalLM.from_pretrained( + "model_name", + quantization_config=bnb_config +) +tokenizer = AutoTokenizer.from_pretrained("model_name") + +# Configure OFT +peft_config = OFTConfig( + oft_block_size=32, + use_cayley_neumann=True, + target_modules="all-linear", + bias="none", + task_type="CAUSAL_LM" +) + +trainer = SFTTrainer( + model=model, + train_dataset=ds['train'], + peft_config=peft_config, + processing_class=tokenizer, + args=training_arguments, + data_collator=collator, +) + +trainer.train() +``` + +# API + ## OFTConfig [[autodoc]] tuners.oft.config.OFTConfig diff --git a/docs/source/package_reference/osf.md b/docs/source/package_reference/osf.md index 266138589b..6cd43fad7f 100644 --- a/docs/source/package_reference/osf.md +++ b/docs/source/package_reference/osf.md @@ -105,7 +105,7 @@ config = OSFConfig( "gate_proj": 4 # Lower rank for gate projection } ) - + # Fractional preserved rank is supported (interpreted per-target as fraction * min_dim) config = OSFConfig(effective_rank=0.8) # preserve 80% of min_dim; train remaining 20% config = OSFConfig(rank_pattern={"q_proj": 0.5}) # preserve 50% on q_proj, others use global/default @@ -144,7 +144,7 @@ train_task(model, task_3_data) When training on a known sequence of n tasks, one effective strategy is to progressively allocate model capacity to balance learning new tasks while preserving previous knowledge: - **Task 1**: Use full capacity 
(train everything) -- **Task 2**: Freeze 1/n of model capacity, train remaining (n-1)/n capacity +- **Task 2**: Freeze 1/n of model capacity, train remaining (n-1)/n capacity - **Task 3**: Freeze 2/n of model capacity, train remaining (n-2)/n capacity - **Task n**: Freeze (n-1)/n of model capacity, use 1/n capacity for final task @@ -222,6 +222,17 @@ optimizer = torch.optim.AdamW([ ], lr=1e-4) ``` +## Benchmark overview + + + +# API + ## OSFConfig [[autodoc]] tuners.osf.config.OSFConfig diff --git a/docs/source/package_reference/p_tuning.md b/docs/source/package_reference/p_tuning.md index a35f7244c3..d529448dae 100644 --- a/docs/source/package_reference/p_tuning.md +++ b/docs/source/package_reference/p_tuning.md @@ -16,16 +16,55 @@ rendered properly in your Markdown viewer. # P-tuning -[P-tuning](https://hf.co/papers/2103.10385) adds trainable prompt embeddings to the input that is optimized by a prompt encoder to find a better prompt, eliminating the need to manually design prompts. The prompt tokens can be added anywhere in the input sequence, and p-tuning also introduces anchor tokens for improving performance. +
+ +
+Prompt tokens can be inserted anywhere in the input sequence, and they are optimized by a prompt encoder (image source). + +[P-tuning](https://hf.co/papers/2103.10385) is designed for natural language understanding (NLU) tasks and all language models. The abstract from the paper is: *While GPTs with traditional fine-tuning fail to achieve strong results on natural language understanding (NLU), we show that GPTs can be better than or comparable to similar-sized BERTs on NLU tasks with a novel method P-tuning -- which employs trainable continuous prompt embeddings. On the knowledge probing (LAMA) benchmark, the best GPT recovers 64\% (P@1) of world knowledge without any additional text provided during test time, which substantially improves the previous best by 20+ percentage points. On the SuperGlue benchmark, GPTs achieve comparable and sometimes better performance to similar-sized BERTs in supervised learning. Importantly, we find that P-tuning also improves BERTs' performance in both few-shot and supervised settings while largely reducing the need for prompt engineering. Consequently, P-tuning outperforms the state-of-the-art approaches on the few-shot SuperGlue benchmark.*. +The method adds trainable prompt embeddings to the input that is optimized by a prompt encoder to find a better prompt, eliminating the need to manually design prompts. The prompt tokens can be added anywhere in the input sequence, and p-tuning also introduces anchor tokens for improving performance. A prompt encoder (a bidirectional long-short term memory network or LSTM) is used to optimize the prompt parameters. 
Unlike prefix tuning: + +- the prompt tokens can be inserted anywhere in the input sequence, and it isn't restricted to only the beginning +- the prompt tokens are only added to the input instead of adding them to every layer of the model +- introducing *anchor* tokens can improve performance because they indicate characteristics of a component in the input sequence + +The paper's results suggest that P-tuning is more efficient than manually crafting prompts, and it enables GPT-like models to compete with BERT-like models on NLU tasks. + +## Usage + +Create a [`PromptEncoderConfig`] with the task type, the number of virtual tokens to add and learn, and the hidden size of the encoder for learning the prompt parameters. + +```py +from peft import PromptEncoderConfig, get_peft_model + +peft_config = PromptEncoderConfig(task_type="CAUSAL_LM", num_virtual_tokens=20, encoder_hidden_size=128) +model = get_peft_model(model, peft_config) +model.print_trainable_parameters() +"trainable params: 300,288 || all params: 559,514,880 || trainable%: 0.05366935013417338" +``` + +## Benchmark overview + + + + +# API + ## PromptEncoderConfig [[autodoc]] tuners.p_tuning.config.PromptEncoderConfig ## PromptEncoder -[[autodoc]] tuners.p_tuning.model.PromptEncoder \ No newline at end of file +[[autodoc]] tuners.p_tuning.model.PromptEncoder + diff --git a/docs/source/package_reference/peanut.md b/docs/source/package_reference/peanut.md index c40e95fb1e..f986d87897 100644 --- a/docs/source/package_reference/peanut.md +++ b/docs/source/package_reference/peanut.md @@ -43,6 +43,17 @@ The abstract from the paper is: > Fine-tuning large pre-trained foundation models often yields excellent downstream performance but is prohibitively expensive when updating all parameters. Parameter-efficient fine-tuning (PEFT) methods such as LoRA alleviate this by introducing lightweight update modules, yet they commonly rely on weight-agnostic linear approximations, limiting their expressiveness. 
In this work, we propose PEANuT, a novel PEFT framework that introduces weight-aware neural tweakers, compact neural modules that generate task-adaptive updates conditioned on frozen pre-trained weights. PEANuT provides a flexible yet efficient way to capture complex update patterns without full model tuning. We theoretically show that PEANuT achieves equivalent or greater expressivity than existing linear PEFT methods with comparable or fewer parameters. Extensive experiments across four benchmarks with over twenty datasets demonstrate that PEANuT consistently outperforms strong baselines in both NLP and vision tasks, while maintaining low computational overhead. +## Benchmark overview + + + +# API + ## PeanutConfig [[autodoc]] tuners.peanut.config.PeanutConfig diff --git a/docs/source/package_reference/poly.md b/docs/source/package_reference/poly.md index a4cf28ce56..3dd2a20c9b 100644 --- a/docs/source/package_reference/poly.md +++ b/docs/source/package_reference/poly.md @@ -35,6 +35,10 @@ The abstract from the paper is: +In case you want to try out routing without training first, you can check out the [Arrow LoRA variant](./lora#Arrow). + +# API + ## PolyConfig [[autodoc]] tuners.poly.config.PolyConfig diff --git a/docs/source/package_reference/prefix_tuning.md b/docs/source/package_reference/prefix_tuning.md index 9d722da219..06ec4c5245 100644 --- a/docs/source/package_reference/prefix_tuning.md +++ b/docs/source/package_reference/prefix_tuning.md @@ -16,14 +16,40 @@ rendered properly in your Markdown viewer. # Prefix tuning +
+ +
+Optimize the prefix parameters for each task (image source). + [Prefix tuning](https://hf.co/papers/2101.00190) prefixes a series of task-specific vectors to the input sequence that can be learned while keeping the pretrained model frozen. The prefix parameters are inserted in all of the model layers. +The abstract from the paper is: + +*Fine-tuning is the de facto way to leverage large pretrained language models to perform downstream tasks. However, it modifies all the language model parameters and therefore necessitates storing a full copy for each task. In this paper, we propose prefix-tuning, a lightweight alternative to fine-tuning for natural language generation tasks, which keeps language model parameters frozen, but optimizes a small continuous task-specific vector (called the prefix). Prefix-tuning draws inspiration from prompting, allowing subsequent tokens to attend to this prefix as if it were "virtual tokens". We apply prefix-tuning to GPT-2 for table-to-text generation and to BART for summarization. We find that by learning only 0.1\% of the parameters, prefix-tuning obtains comparable performance in the full data setting, outperforms fine-tuning in low-data settings, and extrapolates better to examples with topics unseen during training*. + **Note** For encoder-decoder models (seq2seq), the prefix is only applied to the decoder, which does not correspond to the paper specification (see e.g. Figure 2). Prefix tuning can still be fine-tuned on these model architectures but the performance could be sub-par; consider using other PEFT methods for encoder-decoder models. -## Possible Initialization +Prefix tuning is very similar to [prompt tuning](../package_reference/prompt_tuning). The main difference is that the prefix parameters are inserted in **all** of the model layers, whereas prompt tuning only adds the prompt parameters to the model input embeddings. 
The prefix parameters are also optimized by a separate feed-forward network (FFN) instead of training directly on the soft prompts, because training the soft prompts directly causes instability and hurts performance. The FFN is discarded after updating the soft prompts. + +As a result, the authors found that prefix tuning demonstrates comparable performance to fully finetuning a model, despite having 1000x fewer parameters, and it performs even better in low-data settings. + +## Basic Usage + +Create a [`PrefixTuningConfig`] with the task type and number of virtual tokens to add and learn. -By default, prefix tuning is randomly initialized. There's also the option to initialize the embeddings (or the -projection thereof) to be close to a no-op (initialized to zero, it will still shift the probability mass a bit). +```py +from peft import PrefixTuningConfig, get_peft_model + +peft_config = PrefixTuningConfig(task_type="CAUSAL_LM", num_virtual_tokens=20) +model = get_peft_model(model, peft_config) +model.print_trainable_parameters() +"trainable params: 983,040 || all params: 560,197,632 || trainable%: 0.1754809274167014" +``` + +## Possible Initializations + +By default, prefix tuning uses randomly initialized virtual tokens. There's also the option to initialize the vectors +to be close to a no-op (initialized to zero, it will still shift the probability mass a bit). This means that the KV-cache injected prefixes have less impact from the beginning and reduces the variance in training performance.
@@ -42,12 +68,27 @@ tok = AutoTokenizer.from_pretrained("gpt2") peft_cfg = PrefixTuningConfig(task_type="CAUSAL_LM", num_virtual_tokens=20, prefix_projection=False) model = get_peft_model(base, peft_cfg) +initialize_kv_prefix_from_text( + model, + tok, + text="...a long context with at least num_virtual_tokens tokens...", + use_chat_template=False, +) +from peft import PrefixTuningConfig, get_peft_model, initialize_kv_prefix_from_text + +base = AutoModelForCausalLM.from_pretrained("gpt2") +tok = AutoTokenizer.from_pretrained("gpt2") + +peft_cfg = PrefixTuningConfig(task_type="CAUSAL_LM", num_virtual_tokens=20, prefix_projection=False) +model = get_peft_model(base, peft_cfg) + initialize_kv_prefix_from_text( model, tok, text="...a long context with at least num_virtual_tokens tokens...", use_chat_template=False, ) + ``` Make sure the text is long enough to produce at least `num_virtual_tokens` tokens, otherwise initialization will fail. @@ -61,9 +102,18 @@ As a guideline: * if it is not possible to use an initialization text or you want to quickly check if prefix tuning is viable at all, use a zero init without projection -The abstract from the paper is: -*Fine-tuning is the de facto way to leverage large pretrained language models to perform downstream tasks. However, it modifies all the language model parameters and therefore necessitates storing a full copy for each task. In this paper, we propose prefix-tuning, a lightweight alternative to fine-tuning for natural language generation tasks, which keeps language model parameters frozen, but optimizes a small continuous task-specific vector (called the prefix). Prefix-tuning draws inspiration from prompting, allowing subsequent tokens to attend to this prefix as if it were "virtual tokens". We apply prefix-tuning to GPT-2 for table-to-text generation and to BART for summarization. 
We find that by learning only 0.1\% of the parameters, prefix-tuning obtains comparable performance in the full data setting, outperforms fine-tuning in low-data settings, and extrapolates better to examples with topics unseen during training*. +## Benchmark overview + + + + +# API ## PrefixTuningConfig diff --git a/docs/source/package_reference/prompt_tuning.md b/docs/source/package_reference/prompt_tuning.md index 61dbb6a2e9..e03be042fa 100644 --- a/docs/source/package_reference/prompt_tuning.md +++ b/docs/source/package_reference/prompt_tuning.md @@ -16,16 +16,59 @@ rendered properly in your Markdown viewer. # Prompt tuning -[Prompt tuning](https://hf.co/papers/2104.08691) adds task-specific prompts to the input, and these prompt parameters are updated independently of the pretrained model parameters which are frozen. +
+ +
+Only train and store a significantly smaller set of task-specific prompt parameters (image source). + +[Prompt tuning](https://hf.co/papers/2104.08691) adds a task-specific, virtual prompt to the input that consists of trainable vectors in the embedding space. The virtual token parameters are updated independently of the pretrained model parameters which are frozen. The abstract from the paper is: *In this work, we explore "prompt tuning", a simple yet effective mechanism for learning "soft prompts" to condition frozen language models to perform specific downstream tasks. Unlike the discrete text prompts used by GPT-3, soft prompts are learned through backpropagation and can be tuned to incorporate signal from any number of labeled examples. Our end-to-end learned approach outperforms GPT-3's "few-shot" learning by a large margin. More remarkably, through ablations on model size using T5, we show that prompt tuning becomes more competitive with scale: as models exceed billions of parameters, our method "closes the gap" and matches the strong performance of model tuning (where all model weights are tuned). This finding is especially relevant in that large models are costly to share and serve, and the ability to reuse one frozen model for multiple downstream tasks can ease this burden. Our method can be seen as a simplification of the recently proposed "prefix tuning" of Li and Liang (2021), and we provide a comparison to this and other similar approaches. Finally, we show that conditioning a frozen model with soft prompts confers benefits in robustness to domain transfer, as compared to full model tuning*. +In contrast to [prefix tuning](../package_reference/prefix_tuning), only the +input of the first layer receives the virtual tokens. 
+ +## Usage + +There are two decisions to make: how many virtual tokens are added to the +input of the model (`num_virtual_tokens`) - this will define how many +trainable parameters there will be - and how these tokens are initialized. + +Create a [`PromptTuningConfig`] with the task type, the initial prompt tuning text to train the model with, the number of virtual tokens to add and learn, and a tokenizer. + +```py +from peft import PromptTuningConfig, PromptTuningInit, get_peft_model + +prompt_tuning_init_text = "Classify if the tweet is a complaint or no complaint.\n" +peft_config = PromptTuningConfig( + task_type="CAUSAL_LM", + prompt_tuning_init=PromptTuningInit.TEXT, + num_virtual_tokens=len(tokenizer(prompt_tuning_init_text)["input_ids"]), + prompt_tuning_init_text=prompt_tuning_init_text, + tokenizer_name_or_path="bigscience/bloomz-560m", +) +model = get_peft_model(model, peft_config) +model.print_trainable_parameters() +"trainable params: 8,192 || all params: 559,222,784 || trainable%: 0.0014648902430985358" +``` + +## Benchmark overview + + + +# API + ## PromptTuningConfig [[autodoc]] tuners.prompt_tuning.config.PromptTuningConfig ## PromptEmbedding -[[autodoc]] tuners.prompt_tuning.model.PromptEmbedding \ No newline at end of file +[[autodoc]] tuners.prompt_tuning.model.PromptEmbedding diff --git a/docs/source/package_reference/psoft.md b/docs/source/package_reference/psoft.md index 4eea99a1cd..d5aaeed4d1 100644 --- a/docs/source/package_reference/psoft.md +++ b/docs/source/package_reference/psoft.md @@ -67,7 +67,7 @@ model = AutoModelForCausalLM.from_pretrained(model_id) # Configure PSOFT config = PsoftConfig( - r=32, # the dimension of trainable matrix R, + r=32, # the dimension of trainable matrix R, psoft_alpha=32, # scaling factor (typically set to r in PSOFT), target_modules=["q_proj", "v_proj"], # target attention projection layers ab_svd_init="psoft_init", # principal subspace initialization @@ -119,6 +119,16 @@ config = 
PsoftConfig(psoft_orth=True,psoft_mag_a=True,psoft_mag_b=True) 4. **SVD Initialization**: The `lowrank` option is more memory- and compute-efficient than `full`, making it more suitable for large models. 5. **Cayley–Neumann Approximation**: When the rank is large, enabling the Cayley–Neumann approximation can significantly improve computational efficiency, while the benefit is less pronounced for small ranks. In practice, a small number of Neumann series terms (typically `5`) usually provides a good balance between accuracy and efficiency. +## Benchmark overview + + + +# API ## PsoftConfig @@ -126,4 +136,4 @@ config = PsoftConfig(psoft_orth=True,psoft_mag_a=True,psoft_mag_b=True) ## PsoftModel -[[autodoc]] tuners.psoft.model.PsoftModel \ No newline at end of file +[[autodoc]] tuners.psoft.model.PsoftModel diff --git a/docs/source/package_reference/pvera.md b/docs/source/package_reference/pvera.md index 6ea72c7d25..d0c6ca4e93 100644 --- a/docs/source/package_reference/pvera.md +++ b/docs/source/package_reference/pvera.md @@ -31,10 +31,21 @@ The abstract from the paper is: > Large foundation models have emerged in the last years and are pushing performance boundaries for a variety of tasks. Training or even finetuning such models demands vast datasets and computational resources, which are often scarce and costly. Adaptation methods provide a computationally efficient solution to address these limitations by allowing such models to be finetuned on small amounts of data and computing power. This is achieved by appending new trainable modules to frozen backbones with only a fraction of the trainable parameters and fitting only these modules on novel tasks. Recently, the VeRA adapter was shown to excel in parameter-efficient adaptations by utilizing a pair of frozen random low-rank matrices shared across all layers. In this paper, we propose PVeRA, a probabilistic version of the VeRA adapter, which modifies the low-rank matrices of VeRA in a probabilistic manner. 
This modification naturally allows handling inherent ambiguities in the input and allows for different sampling configurations during training and testing. A comprehensive evaluation was performed on the VTAB-1k benchmark and seven adapters, with PVeRA outperforming VeRA and other adapters. +## Benchmark overview + + + +# API + ## PveraConfig [[autodoc]] tuners.pvera.config.PveraConfig ## PveraModel -[[autodoc]] tuners.pvera.model.PveraModel \ No newline at end of file +[[autodoc]] tuners.pvera.model.PveraModel diff --git a/docs/source/package_reference/randlora.md b/docs/source/package_reference/randlora.md index 930c400685..8140538423 100644 --- a/docs/source/package_reference/randlora.md +++ b/docs/source/package_reference/randlora.md @@ -14,14 +14,14 @@ rendered properly in your Markdown viewer. --> -# RandLora: Full-rank parameter-efficient fine-tuning of large models +# RandLora: Full-rank parameter-efficient fine-tuning of large models [RandLora](https://huggingface.co/papers/2502.00987) is a parameter-efficient fine-tuning technique that is similar to [LoRA](https://huggingface.co/papers/2106.09685) and [VeRA](https://huggingface.co/papers/2310.11454) but performs full rank updates to improve performance. RandLora can be particularly useful when adapting large model to hard tasks that require complex updates while preserving the parameter efficiency of LoRA. The full rank update of RandLora is achieved by linearly scaling random bases. The random bases are a collection of multiple low rank matrices such that the summation of their ranks if greater or equal to the full rank of the parameter matrices. The trainable parameters of RandLora are two diagonal matrices (vectors) that get multiplied with the right hand low rank random bases, in a similar way to VeRA's update. To maintain low memory usage, RandLora uses a custom function that prevents storing unnecessary bases in memory for backpropagation. 
RandLora presents the noteworthy difference that contrary to other LoRA-like PEFT algorithm, increasing RandLora's random base ranks increases the amount of trainable parameters. Because number of bases x bases rank is constant in RandLora, reducing the rank will increase the number of random bases, hence the number of base-specific trainable diagonal bases. Because reducing the rank of RandLora's random bases will increase their number, RandLora can become slower to train than LoRA for very small ranks where typically, ranks below 4 with result in a large training time increase. This does not affect inference though as the RandLora adapters can be merged into the pretrained weight matrices. -RandLora additionally supports training with sparse, ternary random bases (only containing -1, 0 and 1). These bases are as described in [Bingham et al.](https://cs-people.bu.edu/evimaria/cs565/kdd-rp.pdf) and [Ping et al.](https://hastie.su.domains/Papers/Ping/KDD06_rp.pdf) and could theoretically be used to reduce compute needs by performing aggregations instead of matrix multiplications to create the weight update. This is not currently supported. Although it does not currently reduce compute, using sparse random bases in RandLora can reduce overfitting in some cases. For users intersted in using sparse ternary bases, the `sparse` option is recommended over the `very_sparse` one that can reduce perfromance. +RandLora additionally supports training with sparse, ternary random bases (only containing -1, 0 and 1). These bases are as described in [Bingham et al.](https://cs-people.bu.edu/evimaria/cs565/kdd-rp.pdf) and [Ping et al.](https://hastie.su.domains/Papers/Ping/KDD06_rp.pdf) and could theoretically be used to reduce compute needs by performing aggregations instead of matrix multiplications to create the weight update. This is not currently supported. 
Although it does not currently reduce compute, using sparse random bases in RandLora can reduce overfitting in some cases. For users interested in using sparse ternary bases, the `sparse` option is recommended over the `very_sparse` one that can reduce performance. Similarly to VeRA, when saving the RandLora's parameters, it's possible to eschew storing the low rank matrices by setting `save_projection=False` on the `VeraConfig`. In that case, these matrices will be restored based on the fixed random seed from the `projection_prng_key` argument. This cuts down on the size of the checkpoint, but we cannot guarantee reproducibility on all devices and for all future versions of PyTorch. If you want to ensure reproducibility, set `save_projection=True` (which is the default). @@ -36,6 +36,17 @@ The abstract from the paper is: > Low-Rank Adaptation (LoRA) and its variants have shown impressive results in reducing the number of trainable parameters and memory requirements of large transformer networks while maintaining fine-tuning performance. The low-rank nature of the weight update inherently limits the representation power of fine-tuned models, however, thus potentially compromising performance on complex tasks. This raises a critical question: when a performance gap between LoRA and standard fine-tuning is observed, is it due to the reduced number of trainable parameters or the rank deficiency? This paper aims to answer this question by introducing RandLora, a parameter-efficient method that performs full-rank updates using a learned linear combinations of low-rank, non-trainable random matrices. Our method limits the number of trainable parameters by restricting optimization to diagonal scaling matrices applied to the fixed random matrices. This allows us to effectively overcome the low-rank limitations while maintaining parameter and memory efficiency during training. 
Through extensive experimentation across vision, language, and vision-language benchmarks, we systematically evaluate the limitations of LoRA and existing random basis methods. Our findings reveal that full-rank updates are beneficial across vision and language tasks individually, and even more so for vision-language tasks, where RandLora significantly reduces---and sometimes eliminates---the performance gap between standard fine-tuning and LoRA, demonstrating its efficacy. +## Benchmark overview + + + +# API + ## RandLoraConfig [[autodoc]] tuners.randlora.config.RandLoraConfig diff --git a/docs/source/package_reference/road.md b/docs/source/package_reference/road.md index fb951a91de..4a69c26008 100644 --- a/docs/source/package_reference/road.md +++ b/docs/source/package_reference/road.md @@ -20,7 +20,18 @@ rendered properly in your Markdown viewer. Finetuning with RoAd typically requires higher learning rate compared to LoRA or similar methods, around 1e-3. Currently RoAd only supports linear layers and it can be used on models quantized with bitsandbytes (4-bit or 8-bit). -For running inference with different RoAd adapters in the same batch see [Inference with different LoRA adapters in the same batch](../developer_guides/lora#inference-with-different-lora-adapters-in-the-same-batch). +For running inference with different RoAd adapters in the same batch see [Inference with different LoRA adapters in the same batch](lora#inference-with-different-lora-adapters-in-the-same-batch). + +## Benchmark overview + + + +# API ## RoadConfig diff --git a/docs/source/package_reference/shira.md b/docs/source/package_reference/shira.md index cbd869ddb4..05d12dd263 100644 --- a/docs/source/package_reference/shira.md +++ b/docs/source/package_reference/shira.md @@ -26,6 +26,17 @@ The abstract from the paper is: > Low Rank Adaptation (LoRA) has gained massive attention in the recent generative AI research. 
One of the main advantages of LoRA is its ability to be fused with pretrained models, adding no overhead during inference. However, from a mobile deployment standpoint, we can either avoid inference overhead in the fused mode but lose the ability to switch adapters rapidly, or suffer significant (up to 30% higher) inference latency while enabling rapid switching in the unfused mode. LoRA also exhibits concept-loss when multiple adapters are used concurrently. In this paper, we propose Sparse High Rank Adapters (SHiRA), a new paradigm which incurs no inference overhead, enables rapid switching, and significantly reduces concept-loss. Specifically, SHiRA can be trained by directly tuning only 1-2% of the base model weights while leaving others unchanged. This results in a highly sparse adapter which can be switched directly in the fused mode. We further provide theoretical and empirical insights on how high sparsity in SHiRA can aid multi-adapter fusion by reducing concept loss. Our extensive experiments on LVMs and LLMs demonstrate that finetuning only a small fraction of the parameters in the base model significantly outperforms LoRA while enabling both rapid switching and multi-adapter fusion. Finally, we provide a latency- and memory-efficient SHiRA implementation based on Parameter-Efficient Finetuning (PEFT) Library which trains at nearly the same speed as LoRA while consuming up to 16% lower peak GPU memory, thus making SHiRA easy to adopt for practical use cases. To demonstrate rapid switching benefits during inference, we show that loading SHiRA on a base model can be 5x-16x faster than LoRA fusion on a CPU. 
+## Benchmark overview + + + +# API + ## ShiraConfig [[autodoc]] tuners.shira.config.ShiraConfig diff --git a/docs/source/package_reference/tinylora.md b/docs/source/package_reference/tinylora.md index 3217951b1a..16bcaf2db7 100644 --- a/docs/source/package_reference/tinylora.md +++ b/docs/source/package_reference/tinylora.md @@ -32,6 +32,17 @@ The abstract from the paper is: > Recent research has shown that language models can learn to reason, often via reinforcement learning. Some work even trains low-rank parameterizations for reasoning, but conventional LoRA cannot scale below the model dimension. We question whether even rank=1 LoRA is necessary for learning to reason and propose TinyLoRA, a method for scaling low-rank adapters to sizes as small as one parameter. Within our new parameterization, we are able to train the 8B parameter size of Qwen2.5 to 91% accuracy on GSM8K with only 13 trained parameters in bf16 (26 total bytes). We find this trend holds in general: we are able to recover 90% of performance improvements while training 1000x fewer parameters across a suite of more difficult learning-to-reason benchmarks such as AIME, AMC, and MATH500. Notably, we are only able to achieve such strong performance with RL: models trained using SFT require 100-1000x larger updates to reach the same performance. +## Benchmark overview + + + +# API + ## TinyLoraConfig [[autodoc]] tuners.tinylora.config.TinyLoraConfig diff --git a/docs/source/package_reference/trainable_tokens.md b/docs/source/package_reference/trainable_tokens.md index adebde7357..42d22a3f74 100644 --- a/docs/source/package_reference/trainable_tokens.md +++ b/docs/source/package_reference/trainable_tokens.md @@ -31,7 +31,7 @@ these numbers a bit. Note that this method does not add tokens for you, you have to add tokens to the tokenizer yourself and resize the embedding matrix of the model accordingly. This method will only re-train the embeddings for the tokens you specify. 
-This method can also be used in conjunction with LoRA layers! See [the LoRA developer guide](../developer_guides/lora#efficiently-train-tokens-alongside-lora). +This method can also be used in conjunction with LoRA layers! See [the LoRA documentation](lora#efficiently-train-tokens-alongside-lora). > [!TIP] > Saving the model with [`~PeftModel.save_pretrained`] or retrieving the state dict using @@ -40,6 +40,17 @@ This method can also be used in conjunction with LoRA layers! See [the LoRA deve > `save_embedding_layers=False` when calling `save_pretrained`. This is safe to do as long as you don't modify the > embedding matrix through other means as well, as such changes will be not tracked by trainable tokens. +## Benchmark overview + + + +# API + ## TrainableTokensConfig [[autodoc]] tuners.trainable_tokens.config.TrainableTokensConfig diff --git a/docs/source/package_reference/vblora.md b/docs/source/package_reference/vblora.md index 02aaf10b87..5e791a950d 100644 --- a/docs/source/package_reference/vblora.md +++ b/docs/source/package_reference/vblora.md @@ -30,6 +30,17 @@ The abstract from the paper is: - VB-LoRA has two sets of training parameters: vector bank parameters and logit parameters. In practice, we found that logit parameters require a higher learning rate, while vector bank parameters require a lower learning rate. When using the AdamW optimizer, typical learning rates are 0.01 for logits and 0.001 for vector bank parameters. 
+## Benchmark overview + + + +# API + ## VBLoRAConfig [[autodoc]] tuners.vblora.config.VBLoRAConfig diff --git a/docs/source/package_reference/vera.md b/docs/source/package_reference/vera.md index f9ed281275..98f3f795db 100644 --- a/docs/source/package_reference/vera.md +++ b/docs/source/package_reference/vera.md @@ -30,6 +30,17 @@ The abstract from the paper is: > Low-rank adapation (LoRA) is a popular method that reduces the number of trainable parameters when finetuning large language models, but still faces acute storage challenges when scaling to even larger models or deploying numerous per-user or per-task adapted models. In this work, we present Vector-based Random Matrix Adaptation (VeRA), which significantly reduces the number of trainable parameters compared to LoRA, yet maintains the same performance. It achieves this by using a single pair of low-rank matrices shared across all layers and learning small scaling vectors instead. We demonstrate its effectiveness on the GLUE and E2E benchmarks, image classification tasks, and show its application in instruction-tuning of 7B and 13B language models. +## Benchmark overview + + + +# API + ## VeRAConfig [[autodoc]] tuners.vera.config.VeraConfig diff --git a/docs/source/package_reference/waveft.md b/docs/source/package_reference/waveft.md index 29837bc774..434b25b6d6 100644 --- a/docs/source/package_reference/waveft.md +++ b/docs/source/package_reference/waveft.md @@ -26,6 +26,17 @@ The abstract from the paper is: >Efficiently adapting large foundation models is critical, especially with tight compute and memory budgets. Parameter-Efficient Fine-Tuning (PEFT) methods such as LoRA offer limited granularity and effectiveness in few-parameter regimes. We propose Wavelet Fine-Tuning (WaveFT), a novel PEFT method that learns highly sparse updates in the wavelet domain of residual matrices. 
WaveFT allows precise control of trainable parameters, offering fine-grained capacity adjustment and excelling with remarkably low parameter count, potentially far fewer than LoRA’s minimum—ideal for extreme parameter-efficient scenarios. Evaluated on personalized text-to-image generation using Stable Diffusion XL as baseline, WaveFT significantly outperforms LoRA and other PEFT methods, especially at low parameter counts; achieving superior subject fidelity, prompt alignment, and image diversity. +## Benchmark overview + + + +# API + ## WaveFTConfig [[autodoc]] tuners.waveft.config.WaveFTConfig diff --git a/docs/source/package_reference/xlora.md b/docs/source/package_reference/xlora.md index f4710ab6fa..0ebfb744d0 100644 --- a/docs/source/package_reference/xlora.md +++ b/docs/source/package_reference/xlora.md @@ -24,6 +24,10 @@ The below graphic demonstrates how the scalings change for different prompts for ![Token-by-token scalings](https://github.com/EricLBuehler/xlora/raw/master/res/token_by_token_scalings.gif) +For each step, X-LoRA requires the base model to be run twice: first, to get hidden states without any LoRA adapters, and secondly, the hidden states are used to calculate scalings which are applied to the LoRA adapters and the model is run a second time. The output of the second run is the result of the model step. + +Ultimately, X-LoRA allows the model to reflect upon its knowledge because of the dual forward pass scheme, and dynamically reconfigure the architecture. + The abstract from the paper is: *We report a mixture of expert strategy to create fine-tuned large language models using a deep layer-wise token-level approach based on low-rank adaptation (LoRA). Starting with a set of pre-trained LoRA adapters, our gating strategy uses the hidden states to dynamically mix adapted layers, allowing the resulting X-LoRA model to draw upon different capabilities and create never-before-used deep layer-wise combinations to solve tasks. 
The design is inspired by the biological principles of universality and diversity, where neural network building blocks are reused in different hierarchical manifestations. Hence, the X-LoRA model can be easily implemented for any existing large language model (LLM) without a need for modifications of the underlying structure. We develop a tailored X-LoRA model that offers scientific capabilities including forward/inverse analysis tasks and enhanced reasoning capability, focused on biomaterial analysis, protein mechanics and design. The impact of this work include access to readily expandable and adaptable models with strong domain knowledge and the capability to integrate across areas of knowledge. Featuring experts in biology, mathematics, reasoning, bio-inspired materials, mechanics and materials, chemistry, protein biophysics, mechanics and quantum-mechanics based molecular properties, we conduct a series of physics-focused case studies. We examine knowledge recall, protein mechanics forward/inverse tasks, protein design, adversarial agentic modeling including ontological knowledge graph construction, as well as molecular design. The model is capable not only of making quantitative predictions of nanomechanical properties of proteins or quantum mechanical molecular properties, but also reasons over the results and correctly predicts likely mechanisms that explain distinct molecular behaviors.*. @@ -47,6 +51,8 @@ Please cite X-LoRA as: } ``` +# API + ## XLoraConfig [[autodoc]] tuners.xlora.config.XLoraConfig diff --git a/docs/source/quicktour.md b/docs/source/quicktour.md index 1f0a0a27be..0fea7f268b 100644 --- a/docs/source/quicktour.md +++ b/docs/source/quicktour.md @@ -18,48 +18,88 @@ rendered properly in your Markdown viewer. PEFT offers parameter-efficient methods for finetuning large pretrained models. 
The traditional paradigm is to finetune all of a model's parameters for each downstream task, but this is becoming exceedingly costly and impractical because of the enormous number of parameters in models today. Instead, it is more efficient to train a smaller number of prompt parameters or use a reparametrization method like low-rank adaptation (LoRA) to reduce the number of trainable parameters. +
+
+ PEFT can be thought of as a framework for adding trainable parameters to arbitrary places in existing models ("base models"). Specific PEFT methods arrange the trainable parameters in certain ways or modify the training process to achieve fine-tuning performance comparable to training all parameters of the base model. +
+
+
+ This quicktour will show you PEFT's main features and how you can train or run inference on large models that would typically be inaccessible on consumer devices. -## Train -Each PEFT method is defined by a [`PeftConfig`] class that stores all the important parameters for building a [`PeftModel`]. For example, to train with LoRA, load and create a [`LoraConfig`] class and specify the following parameters: +## PEFT configuration and model + +For any PEFT method, you'll need to create a configuration which contains all the parameters that specify how the PEFT method should be applied, most importantly which layers of the existing model to target with trainable parameters. Once the configuration is set up, pass it to the [`~peft.get_peft_model`] function along with the base model to create a trainable [`PeftModel`]. + +Let's use [LoRA](./package_reference/lora) as an example but only discuss common parameters - you might want to use one of the [many other PEFT methods](./methods/overview). +The configuration usually entails this: + +- `target_modules`: which modules of the base model to adapt +- `task_type` (default: `None`, see [available `TaskType`s](package_reference/peft_types#peft.TaskType)): the nature of the trained task; if provided may help to automatically save relevant layers alongside the adapter weights or warn you about incompatibilities +- `inference_mode` (default: `False`): whether you're using the model for inference or not -- `task_type`: the task to train for (sequence-to-sequence language modeling in this case) -- `inference_mode`: whether you're using the model for inference or not -- `r`: the dimension of the low-rank matrices -- `lora_alpha`: the scaling factor for the low-rank matrices -- `lora_dropout`: the dropout probability of the LoRA layers +Depending on the PEFT method you choose you will add specific parameters that, for example, determine the size of the update matrices. 
+Here's an example of a config you may encounter in the wild: ```python from peft import LoraConfig, TaskType -peft_config = LoraConfig(task_type=TaskType.SEQ_2_SEQ_LM, inference_mode=False, r=8, lora_alpha=32, lora_dropout=0.1) +peft_config = LoraConfig(target_modules=["q_proj"], task_type=TaskType.CAUSAL_LM, inference_mode=False, r=8, lora_alpha=32, lora_dropout=0.1) ``` > [!TIP] -> See the [`LoraConfig`] reference for more details about other parameters you can adjust, such as the modules to target or the bias type. +> See the [configuration guide](guides/peft_model_config) for more details on how the PEFT configuration works under the hood. -Once the [`LoraConfig`] is setup, create a [`PeftModel`] with the [`get_peft_model`] function. It takes a base model - which you can load from the Transformers library - and the [`LoraConfig`] containing the parameters for how to configure a model for training with LoRA. +Once the [`LoraConfig`] is set up, create a [`PeftModel`] with the [`get_peft_model`] function. It takes a base model - which you can (but don't have to) load from the Transformers library - and the [`LoraConfig`] containing the parameters for how to configure a model for training with LoRA. Load the base model you want to finetune. ```python -from transformers import AutoModelForSeq2SeqLM +from transformers import AutoModelForCausalLM -model = AutoModelForSeq2SeqLM.from_pretrained("bigscience/mt0-large") +model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-3.2-1B") ``` -Wrap the base model and `peft_config` with the [`get_peft_model`] function to create a [`PeftModel`]. To get a sense of the number of trainable parameters in your model, use the [`print_trainable_parameters`] method. +Now wrap the base model and `peft_config` with the [`get_peft_model`] function to create a [`PeftModel`]. + +
+
+

+ Wrapping means that PEFT replaces the targeted layers (here: all q_proj layers) with the adapter-specific layer for the target layer's type. + Since we're dealing with linear layers, it will be, in this case, a lora.Linear layer. Note that these changes are done in-place to + save memory, so your base model is now modified. +

+

+ Note that we've only specified q_proj but in actuality we are targeting all model.layers[:].self_attn.q_proj layers. This is + because PEFT searches for matching suffixes by default. Pass a string with a regular expression if you want to target more complex layer patterns. +

+
+
+
+ +
+
+
+

+ The base model's layer is wrapped and retained but not trained, while new, trainable weights are added and combined with it. + How these new weights are structured and combined with the weights of the base model is a good portion of what sets + the different PEFT methods apart. +

+
+
+ +To get a sense of the number of trainable parameters in your model, use the [`print_trainable_parameters`] method. ```python from peft import get_peft_model -model = get_peft_model(model, peft_config) -model.print_trainable_parameters() -"output: trainable params: 2359296 || all params: 1231940608 || trainable%: 0.19151053100118282" +peft_model = get_peft_model(model, peft_config) +peft_model.print_trainable_parameters() +"output: trainable params: 524,288 || all params: 1,236,338,688 || trainable%: 0.0424" ``` -Out of [bigscience/mt0-large's](https://huggingface.co/bigscience/mt0-large) 1.2B parameters, you're only training 0.19% of them! +Out of [meta-llama/Llama-3.2-1B's](https://huggingface.co/meta-llama/Llama-3.2-1B) 1B parameters, you're only training 0.04% of them! That is it 🎉! Now you can train the model with the Transformers [`~transformers.Trainer`], Accelerate, or any custom PyTorch training loop. @@ -67,7 +107,7 @@ For example, to train with the [`~transformers.Trainer`] class, setup a [`~trans ```py training_args = TrainingArguments( - output_dir="your-name/bigscience/mt0-large-lora", + output_dir="your-name/meta-llama/my-llama3.2-adapter", learning_rate=1e-3, per_device_train_batch_size=32, per_device_eval_batch_size=32, @@ -83,11 +123,10 @@ Pass the model, training arguments, dataset, tokenizer, and any other necessary ```py trainer = Trainer( - model=model, + model=peft_model, args=training_args, train_dataset=tokenized_datasets["train"], eval_dataset=tokenized_datasets["test"], - processing_class=tokenizer, data_collator=data_collator, compute_metrics=compute_metrics, ) @@ -97,10 +136,10 @@ trainer.train() ### Save model -After your model is finished training, you can save your model to a directory using the [`~transformers.PreTrainedModel.save_pretrained`] function. +After your model is finished training, you can save your model to a directory using the [`~PeftModel.save_pretrained`] function. 
```py -model.save_pretrained("output_dir") +peft_model.save_pretrained("output_dir") ``` You can also save your model to the Hub (make sure you're logged in to your Hugging Face account first) with the [`~transformers.PreTrainedModel.push_to_hub`] function. @@ -109,7 +148,7 @@ You can also save your model to the Hub (make sure you're logged in to your Hugg from huggingface_hub import notebook_login notebook_login() -model.push_to_hub("your-name/bigscience/mt0-large-lora") +peft_model.push_to_hub("your-name/my-llama3.2-adapter") ``` Both methods only save the extra PEFT weights that were trained, meaning it is super efficient to store, transfer, and load. For example, this [facebook/opt-350m](https://huggingface.co/ybelkada/opt-350m-lora) model trained with LoRA only contains two files: `adapter_config.json` and `adapter_model.safetensors`. The `adapter_model.safetensors` file is just 6.3MB! @@ -152,11 +191,35 @@ from peft import AutoPeftModel model = AutoPeftModel.from_pretrained("smangrul/openai-whisper-large-v2-LORA-colab") ``` +## Multiple adapters + +PEFT supports installing multiple adapters (of the same kind; in this document, that would be LoRA) on top of a base model. After you call `get_peft_model`, there is only one adapter, named `"default"`, but you can add as many additional adapters as you like by calling `peft_model.add_adapter(adapter_name=..., peft_config=...)`. + +
+
+

+ This works because the wrapped layer holds a separate set of trainable weights for each adapter name. Newly added adapters are not active or trainable by default. + You have to explicitly enable adapters by name before they are active. This allows you to quickly swap between adapters where task-specific knowledge is needed + or to serve different use cases on top of one model. +

+
+
+
+ +Just remember to call `peft_model.set_adapter()` with the new adapter's name afterwards to enable it. + +Quick example: + +```py +peft_model.add_adapter(adapter_name='new_adapter', peft_config=peft_config) +peft_model.set_adapter('new_adapter') +``` + ## Next steps Now that you've seen how to train a model with one of the PEFT methods, we encourage you to try out some of the other methods like prompt tuning. The steps are very similar to the ones shown in the quicktour: -1. prepare a [`PeftConfig`] for a PEFT method +1. prepare a [`PeftConfig`] for a PEFT method, e.g. a [`LoraConfig`] or some other config (see the [method overview](methods/overview)) 2. use the [`get_peft_model`] method to create a [`PeftModel`] from the configuration and base model Then you can train it however you like! To load a PEFT model for inference, you can use the [`AutoPeftModel`] class. diff --git a/docs/source/task_guides/ia3.md b/docs/source/task_guides/ia3.md deleted file mode 100644 index c23145f897..0000000000 --- a/docs/source/task_guides/ia3.md +++ /dev/null @@ -1,235 +0,0 @@ - - -# IA3 - -[IA3](../conceptual_guides/ia3) multiplies the model's activations (the keys and values in the self-attention and encoder-decoder attention blocks, and the intermediate activation of the position-wise feedforward network) by three learned vectors. This PEFT method introduces an even smaller number of trainable parameters than LoRA which introduces weight matrices instead of vectors. The original model's parameters are kept frozen and only these vectors are updated. As a result, it is faster, cheaper and more efficient to finetune for a new downstream task. - -This guide will show you how to train a sequence-to-sequence model with IA3 to *generate a sentiment* given some financial news. - -> [!TIP] -> Some familiarity with the general process of training a sequence-to-sequence would be really helpful and allow you to focus on how to apply IA3.
If you’re new, we recommend taking a look at the [Translation](https://huggingface.co/docs/transformers/tasks/translation) and [Summarization](https://huggingface.co/docs/transformers/tasks/summarization) guides first from the Transformers documentation. When you’re ready, come back and see how easy it is to drop PEFT in to your training! - -## Dataset - -You'll use the [zeroshot/twitter-financial-news-sentiment](https://huggingface.co/datasets/zeroshot/twitter-financial-news-sentiment) dataset. This dataset contains financial tweets labeled with sentiment (bearish, bullish, or neutral). Take a look at the [dataset viewer](https://huggingface.co/datasets/zeroshot/twitter-financial-news-sentiment/viewer) for a better idea of the data and sentences you'll be working with. - -Load the dataset with the [`~datasets.load_dataset`] function. This dataset only contains a train split, so use the [`~datasets.train_test_split`] function to create a train and validation split. Create a new `text_label` column so it is easier to understand what the `label` values `0`, `1`, and `2` mean. - -```py -from datasets import load_dataset - -ds = load_dataset("zeroshot/twitter-financial-news-sentiment") -ds = ds["train"].train_test_split(test_size=0.1) -ds["validation"] = ds["test"] -del ds["test"] - -classes = ds["train"].features["label"].names -ds = ds.map( - lambda x: {"text_label": [classes[label] for label in x["label"]]}, - batched=True, - num_proc=1, -) - -ds["train"][0] -{'text': 'Morrisons reports first sales rise in four years', - 'label': 1, - 'text_label': 'bullish'} -``` - -Load a tokenizer and create a preprocessing function that: - -1. tokenizes the inputs, pads and truncates the sequence to the `max_length` -2. apply the same tokenizer to the labels but with a shorter `max_length` that corresponds to the label -3. 
mask the padding tokens - -```py -from transformers import AutoTokenizer - -text_column = "text" -label_column = "text_label" -max_length = 128 - -tokenizer = AutoTokenizer.from_pretrained("bigscience/mt0-large") - -def preprocess_function(examples): - inputs = examples[text_column] - targets = examples[label_column] - model_inputs = tokenizer(inputs, max_length=max_length, padding="max_length", truncation=True, return_tensors="pt") - labels = tokenizer(targets, max_length=3, padding="max_length", truncation=True, return_tensors="pt") - labels = labels["input_ids"] - labels[labels == tokenizer.pad_token_id] = -100 - model_inputs["labels"] = labels - return model_inputs -``` - -Use the [`~datasets.Dataset.map`] function to apply the preprocessing function to the entire dataset. - -```py -processed_ds = ds.map( - preprocess_function, - batched=True, - num_proc=1, - remove_columns=ds["train"].column_names, - load_from_cache_file=False, - desc="Running tokenizer on dataset", -) -``` - -Create a training and evaluation [`DataLoader`](https://pytorch.org/docs/stable/data.html#torch.utils.data.DataLoader), and set `pin_memory=True` to speed up data transfer to the accelerator during training if your dataset samples are on a CPU. - -```py -from torch.utils.data import DataLoader -from transformers import default_data_collator - -train_ds = processed_ds["train"] -eval_ds = processed_ds["validation"] - -batch_size = 8 - -train_dataloader = DataLoader( - train_ds, shuffle=True, collate_fn=default_data_collator, batch_size=batch_size, pin_memory=True -) -eval_dataloader = DataLoader(eval_ds, collate_fn=default_data_collator, batch_size=batch_size, pin_memory=True) -``` - -## Model - -Now you can load a pretrained model to use as the base model for IA3. This guide uses the [bigscience/mt0-large](https://huggingface.co/bigscience/mt0-large) model, but you can use any sequence-to-sequence model you like. 
- -```py -from transformers import AutoModelForSeq2SeqLM - -model = AutoModelForSeq2SeqLM.from_pretrained("bigscience/mt0-large") -``` - -### PEFT configuration and model - -All PEFT methods need a configuration that contains and specifies all the parameters for how the PEFT method should be applied. Create an [`IA3Config`] with the task type and set the inference mode to `False`. You can find additional parameters for this configuration in the [API reference](../package_reference/ia3#ia3config). - -> [!TIP] -> Call the [`~PeftModel.print_trainable_parameters`] method to compare the number of trainable parameters of [`PeftModel`] versus the number of parameters in the base model! - -Once the configuration is setup, pass it to the [`get_peft_model`] function along with the base model to create a trainable [`PeftModel`]. - -```py -from peft import IA3Config, get_peft_model - -peft_config = IA3Config(task_type="SEQ_2_SEQ_LM") -model = get_peft_model(model, peft_config) -model.print_trainable_parameters() -"trainable params: 282,624 || all params: 1,229,863,936 || trainable%: 0.022980103060766553" -``` - -### Training - -Set up an optimizer and learning rate scheduler. - -```py -import torch -from transformers import get_linear_schedule_with_warmup - -lr = 8e-3 -num_epochs = 3 - -optimizer = torch.optim.AdamW(model.parameters(), lr=lr) -lr_scheduler = get_linear_schedule_with_warmup( - optimizer=optimizer, - num_warmup_steps=0, - num_training_steps=(len(train_dataloader) * num_epochs), -) -``` - -Move the model to the accelerator and create a training loop that reports the loss and perplexity for each epoch. 
- -```py -from tqdm import tqdm - -device = torch.accelerator.current_accelerator().type if hasattr(torch, "accelerator") else "cuda" -model = model.to(device) - -for epoch in range(num_epochs): - model.train() - total_loss = 0 - for step, batch in enumerate(tqdm(train_dataloader)): - batch = {k: v.to(device) for k, v in batch.items()} - outputs = model(**batch) - loss = outputs.loss - total_loss += loss.detach().float() - loss.backward() - optimizer.step() - lr_scheduler.step() - optimizer.zero_grad() - - model.eval() - eval_loss = 0 - eval_preds = [] - for step, batch in enumerate(tqdm(eval_dataloader)): - batch = {k: v.to(device) for k, v in batch.items()} - with torch.no_grad(): - outputs = model(**batch) - loss = outputs.loss - eval_loss += loss.detach().float() - eval_preds.extend( - tokenizer.batch_decode(torch.argmax(outputs.logits, -1).detach().cpu().numpy(), skip_special_tokens=True) - ) - - eval_epoch_loss = eval_loss / len(eval_dataloader) - eval_ppl = torch.exp(eval_epoch_loss) - train_epoch_loss = total_loss / len(train_dataloader) - train_ppl = torch.exp(train_epoch_loss) - print(f"{epoch=}: {train_ppl=} {train_epoch_loss=} {eval_ppl=} {eval_epoch_loss=}") -``` - -## Share your model - -After training is complete, you can upload your model to the Hub with the [`~transformers.PreTrainedModel.push_to_hub`] method. You'll need to login to your Hugging Face account first and enter your token when prompted. - -```py -from huggingface_hub import notebook_login - -account = -peft_model_id = f"{account}/mt0-large-ia3" -model.push_to_hub(peft_model_id) -``` - -## Inference - -To load the model for inference, use the [`~AutoPeftModelForSeq2SeqLM.from_pretrained`] method. Let's also load a sentence of financial news from the dataset to generate a sentiment for. 
- -```py -from peft import AutoPeftModelForSeq2SeqLM - -device = torch.accelerator.current_accelerator().type if hasattr(torch, "accelerator") else "cuda" - -model = AutoPeftModelForSeq2SeqLM.from_pretrained("/mt0-large-ia3").to(device) -tokenizer = AutoTokenizer.from_pretrained("bigscience/mt0-large") - -i = 15 -inputs = tokenizer(ds["validation"][text_column][i], return_tensors="pt") -print(ds["validation"][text_column][i]) -"The robust growth was the result of the inclusion of clothing chain Lindex in the Group in December 2007 ." -``` - -Call the [`~transformers.GenerationMixin.generate`] method to generate the predicted sentiment label. - -```py -with torch.no_grad(): - inputs = {k: v.to(device) for k, v in inputs.items()} - outputs = model.generate(input_ids=inputs["input_ids"], max_new_tokens=10) - print(tokenizer.batch_decode(outputs.detach().cpu().numpy(), skip_special_tokens=True)) -['positive'] -``` diff --git a/docs/source/task_guides/lora_based_methods.md b/docs/source/task_guides/lora_based_methods.md deleted file mode 100644 index 50be188848..0000000000 --- a/docs/source/task_guides/lora_based_methods.md +++ /dev/null @@ -1,344 +0,0 @@ - - -# LoRA methods - -A popular way to efficiently train large models is to insert (typically in the attention blocks) smaller trainable matrices that are a low-rank decomposition of the delta weight matrix to be learnt during finetuning. The pretrained model's original weight matrix is frozen and only the smaller matrices are updated during training. This reduces the number of trainable parameters, reducing memory usage and training time which can be very expensive for large models. - -There are several different ways to express the weight matrix as a low-rank decomposition, but [Low-Rank Adaptation (LoRA)](../conceptual_guides/adapter#low-rank-adaptation-lora) is the most common method. 
The PEFT library supports several other LoRA variants, such as [Low-Rank Hadamard Product (LoHa)](../conceptual_guides/adapter#low-rank-hadamard-product-loha), [Low-Rank Kronecker Product (LoKr)](../conceptual_guides/adapter#low-rank-kronecker-product-lokr), and [Adaptive Low-Rank Adaptation (AdaLoRA)](../conceptual_guides/adapter#adaptive-low-rank-adaptation-adalora). You can learn more about how these methods work conceptually in the [Adapters](../conceptual_guides/adapter) guide. If you're interested in applying these methods to other tasks and use cases like semantic segmentation, token classification, take a look at our [notebook collection](https://huggingface.co/collections/PEFT/notebooks-6573b28b33e5a4bf5b157fc1)! - -Additionally, PEFT supports the [X-LoRA](../conceptual_guides/adapter#mixture-of-lora-experts-x-lora) Mixture of LoRA Experts method. - -This guide will show you how to quickly train an image classification model - with a low-rank decomposition method - to identify the class of food shown in an image. - -> [!TIP] -> Some familiarity with the general process of training an image classification model would be really helpful and allow you to focus on the low-rank decomposition methods. If you're new, we recommend taking a look at the [Image classification](https://huggingface.co/docs/transformers/tasks/image_classification) guide first from the Transformers documentation. When you're ready, come back and see how easy it is to drop PEFT in to your training! - -Before you begin, make sure you have all the necessary libraries installed. - -```bash -pip install -q peft transformers datasets -``` - -## Dataset - -In this guide, you'll use the [Food-101](https://huggingface.co/datasets/food101) dataset which contains images of 101 food classes (take a look at the [dataset viewer](https://huggingface.co/datasets/food101/viewer/default/train) to get a better idea of what the dataset looks like). 
- -Load the dataset with the [`~datasets.load_dataset`] function. - -```py -from datasets import load_dataset - -ds = load_dataset("food101") -``` - -Each food class is labeled with an integer, so to make it easier to understand what these integers represent, you'll create a `label2id` and `id2label` dictionary to map the integer to its class label. - -```py -labels = ds["train"].features["label"].names -label2id, id2label = dict(), dict() -for i, label in enumerate(labels): - label2id[label] = i - id2label[i] = label - -id2label[2] -"baklava" -``` - -Load an image processor to properly resize and normalize the pixel values of the training and evaluation images. - -```py -from transformers import AutoImageProcessor - -image_processor = AutoImageProcessor.from_pretrained("google/vit-base-patch16-224-in21k") -``` - -You can also use the image processor to prepare some transformation functions for data augmentation and pixel scaling. - -```py -from torchvision.transforms import ( - CenterCrop, - Compose, - Normalize, - RandomHorizontalFlip, - RandomResizedCrop, - Resize, - ToTensor, -) - -normalize = Normalize(mean=image_processor.image_mean, std=image_processor.image_std) -train_transforms = Compose( - [ - RandomResizedCrop(image_processor.size["height"]), - RandomHorizontalFlip(), - ToTensor(), - normalize, - ] -) - -val_transforms = Compose( - [ - Resize(image_processor.size["height"]), - CenterCrop(image_processor.size["height"]), - ToTensor(), - normalize, - ] -) - -def preprocess_train(example_batch): - example_batch["pixel_values"] = [train_transforms(image.convert("RGB")) for image in example_batch["image"]] - return example_batch - -def preprocess_val(example_batch): - example_batch["pixel_values"] = [val_transforms(image.convert("RGB")) for image in example_batch["image"]] - return example_batch -``` - -Define the training and validation datasets, and use the [`~datasets.Dataset.set_transform`] function to apply the transformations on-the-fly. 
- -```py -train_ds = ds["train"] -val_ds = ds["validation"] - -train_ds.set_transform(preprocess_train) -val_ds.set_transform(preprocess_val) -``` - -Finally, you'll need a data collator to create a batch of training and evaluation data and convert the labels to `torch.tensor` objects. - -```py -import torch - -def collate_fn(examples): - pixel_values = torch.stack([example["pixel_values"] for example in examples]) - labels = torch.tensor([example["label"] for example in examples]) - return {"pixel_values": pixel_values, "labels": labels} -``` - -## Model - -Now let's load a pretrained model to use as the base model. This guide uses the [google/vit-base-patch16-224-in21k](https://huggingface.co/google/vit-base-patch16-224-in21k) model, but you can use any image classification model you want. Pass the `label2id` and `id2label` dictionaries to the model so it knows how to map the integer labels to their class labels, and you can optionally pass the `ignore_mismatched_sizes=True` parameter if you're finetuning a checkpoint that has already been finetuned. - -```py -from transformers import AutoModelForImageClassification, TrainingArguments, Trainer - -model = AutoModelForImageClassification.from_pretrained( - "google/vit-base-patch16-224-in21k", - label2id=label2id, - id2label=id2label, - ignore_mismatched_sizes=True, -) -``` - -### PEFT configuration and model - -Every PEFT method requires a configuration that holds all the parameters specifying how the PEFT method should be applied. Once the configuration is setup, pass it to the [`~peft.get_peft_model`] function along with the base model to create a trainable [`PeftModel`]. - -> [!TIP] -> Call the [`~PeftModel.print_trainable_parameters`] method to compare the number of parameters of [`PeftModel`] versus the number of parameters in the base model! - - - - -[LoRA](../conceptual_guides/adapter#low-rank-adaptation-lora) decomposes the weight update matrix into *two* smaller matrices. 
The size of these low-rank matrices is determined by its *rank* or `r`. A higher rank means the model has more parameters to train, but it also means the model has more learning capacity. You'll also want to specify the `target_modules` which determine where the smaller matrices are inserted. For this guide, you'll target the *query* and *value* matrices of the attention blocks. Other important parameters to set are `lora_alpha` (scaling factor), `bias` (whether `none`, `all` or only the LoRA bias parameters should be trained), and `modules_to_save` (the modules apart from the LoRA layers to be trained and saved). All of these parameters - and more - are found in the [`LoraConfig`]. - -```py -from peft import LoraConfig, get_peft_model - -config = LoraConfig( - r=16, - lora_alpha=16, - target_modules=["query", "value"], - lora_dropout=0.1, - bias="none", - modules_to_save=["classifier"], -) -model = get_peft_model(model, config) -model.print_trainable_parameters() -"trainable params: 667,493 || all params: 86,543,818 || trainable%: 0.7712775047664294" -``` - - - - -[LoHa](../conceptual_guides/adapter#low-rank-hadamard-product-loha) decomposes the weight update matrix into *four* smaller matrices and each pair of smaller matrices is combined with the Hadamard product. This allows the weight update matrix to keep the same number of trainable parameters when compared to LoRA, but with a higher rank (`r^2` for LoHA when compared to `2*r` for LoRA). The size of the smaller matrices is determined by its *rank* or `r`. You'll also want to specify the `target_modules` which determines where the smaller matrices are inserted. For this guide, you'll target the *query* and *value* matrices of the attention blocks. Other important parameters to set are `alpha` (scaling factor), and `modules_to_save` (the modules apart from the LoHa layers to be trained and saved). All of these parameters - and more - are found in the [`LoHaConfig`]. 
- -```py -from peft import LoHaConfig, get_peft_model - -config = LoHaConfig( - r=16, - alpha=16, - target_modules=["query", "value"], - module_dropout=0.1, - modules_to_save=["classifier"], -) -model = get_peft_model(model, config) -model.print_trainable_parameters() -"trainable params: 1,257,317 || all params: 87,133,642 || trainable%: 1.4429753779831676" -``` - - - - -[LoKr](../conceptual_guides/adapter#low-rank-kronecker-product-lokr) expresses the weight update matrix as a decomposition of a Kronecker product, creating a block matrix that is able to preserve the rank of the original weight matrix. The size of the smaller matrices are determined by its *rank* or `r`. You'll also want to specify the `target_modules` which determines where the smaller matrices are inserted. For this guide, you'll target the *query* and *value* matrices of the attention blocks. Other important parameters to set are `alpha` (scaling factor), and `modules_to_save` (the modules apart from the LoKr layers to be trained and saved). All of these parameters - and more - are found in the [`LoKrConfig`]. - -```py -from peft import LoKrConfig, get_peft_model - -config = LoKrConfig( - r=16, - alpha=16, - target_modules=["query", "value"], - module_dropout=0.1, - modules_to_save=["classifier"], -) -model = get_peft_model(model, config) -model.print_trainable_parameters() -"trainable params: 116,069 || all params: 87,172,042 || trainable%: 0.13314934162033282" -``` - - - - -[AdaLoRA](../conceptual_guides/adapter#adaptive-low-rank-adaptation-adalora) efficiently manages the LoRA parameter budget by assigning important weight matrices more parameters and pruning less important ones. In contrast, LoRA evenly distributes parameters across all modules. You can control the average desired *rank* or `r` of the matrices, and which modules to apply AdaLoRA to with `target_modules`. 
Other important parameters to set are `lora_alpha` (scaling factor), and `modules_to_save` (the modules apart from the AdaLoRA layers to be trained and saved). All of these parameters - and more - are found in the [`AdaLoraConfig`]. - -```py -from peft import AdaLoraConfig, get_peft_model - -config = AdaLoraConfig( - r=8, - init_r=12, - tinit=200, - tfinal=1000, - deltaT=10, - target_modules=["query", "value"], - modules_to_save=["classifier"], -) -model = get_peft_model(model, config) -model.print_trainable_parameters() -"trainable params: 520,325 || all params: 87,614,722 || trainable%: 0.5938785036606062" -``` - - - - -### Training - -For training, let's use the [`~transformers.Trainer`] class from Transformers. The [`Trainer`] contains a PyTorch training loop, and when you're ready, call [`~transformers.Trainer.train`] to start training. To customize the training run, configure the training hyperparameters in the [`~transformers.TrainingArguments`] class. With LoRA-like methods, you can afford to use a higher batch size and learning rate. - -> [!WARNING] -> AdaLoRA has an [`~AdaLoraModel.update_and_allocate`] method that should be called at each training step to update the parameter budget and mask, otherwise the adaptation step is not performed. This requires writing a custom training loop or subclassing the [`~transformers.Trainer`] to incorporate this method. As an example, take a look at this [custom training loop](https://github.com/huggingface/peft/blob/912ad41e96e03652cabf47522cd876076f7a0c4f/examples/conditional_generation/peft_adalora_seq2seq.py#L120). 
- -```py -from transformers import TrainingArguments, Trainer - -account = "stevhliu" -peft_model_id = f"{account}/google/vit-base-patch16-224-in21k-lora" -batch_size = 128 - -args = TrainingArguments( - peft_model_id, - remove_unused_columns=False, - eval_strategy="epoch", - save_strategy="epoch", - learning_rate=5e-3, - per_device_train_batch_size=batch_size, - gradient_accumulation_steps=4, - per_device_eval_batch_size=batch_size, - fp16=True, - num_train_epochs=5, - logging_steps=10, - load_best_model_at_end=True, - label_names=["labels"], -) -``` - -Begin training with [`~transformers.Trainer.train`]. - -```py -trainer = Trainer( - model, - args, - train_dataset=train_ds, - eval_dataset=val_ds, - processing_class=image_processor, - data_collator=collate_fn, -) -trainer.train() -``` - -## Share your model - -Once training is complete, you can upload your model to the Hub with the [`~transformers.PreTrainedModel.push_to_hub`] method. You’ll need to login to your Hugging Face account first and enter your token when prompted. - -```py -from huggingface_hub import notebook_login - -notebook_login() -``` - -Call [`~transformers.PreTrainedModel.push_to_hub`] to save your model to your repositoy. - -```py -model.push_to_hub(peft_model_id) -``` - -## Inference - -Let's load the model from the Hub and test it out on a food image. - -```py -from peft import PeftConfig, PeftModel -from transformers import AutoImageProcessor -from PIL import Image -import requests - -config = PeftConfig.from_pretrained("stevhliu/vit-base-patch16-224-in21k-lora") -model = AutoModelForImageClassification.from_pretrained( - config.base_model_name_or_path, - label2id=label2id, - id2label=id2label, - ignore_mismatched_sizes=True, -) -model = PeftModel.from_pretrained(model, "stevhliu/vit-base-patch16-224-in21k-lora") - -url = "https://huggingface.co/datasets/sayakpaul/sample-datasets/resolve/main/beignets.jpeg" -image = Image.open(requests.get(url, stream=True).raw) -image -``` - -
- -
- -Convert the image to RGB and return the underlying PyTorch tensors. - -```py -encoding = image_processor(image.convert("RGB"), return_tensors="pt") -``` - -Now run the model and return the predicted class! - -```py -with torch.no_grad(): - outputs = model(**encoding) - logits = outputs.logits - -predicted_class_idx = logits.argmax(-1).item() -print("Predicted class:", model.config.id2label[predicted_class_idx]) -"Predicted class: beignets" -``` diff --git a/docs/source/task_guides/prompt_based_methods.md b/docs/source/task_guides/prompt_based_methods.md deleted file mode 100644 index cc6262aebf..0000000000 --- a/docs/source/task_guides/prompt_based_methods.md +++ /dev/null @@ -1,302 +0,0 @@ - - -# Prompt-based methods - -A prompt can describe a task or provide an example of a task you want the model to learn. Instead of manually creating these prompts, soft prompting methods add learnable parameters to the input embeddings that can be optimized for a specific task while keeping the pretrained model's parameters frozen. This makes it both faster and easier to finetune large language models (LLMs) for new downstream tasks. - -The PEFT library supports several types of prompting methods (p-tuning, prefix tuning, prompt tuning) and you can learn more about how these methods work conceptually in the [Soft prompts](../conceptual_guides/prompting) guide. If you're interested in applying these methods to other tasks and use cases, take a look at our [notebook collection](https://huggingface.co/spaces/PEFT/soft-prompting)! - -This guide will show you how to train a causal language model - with a soft prompting method - to *generate a classification* for whether a tweet is a complaint or not. - -> [!TIP] -> Some familiarity with the general process of training a causal language model would be really helpful and allow you to focus on the soft prompting methods. 
If you're new, we recommend taking a look at the [Causal language modeling](https://huggingface.co/docs/transformers/tasks/language_modeling) guide first from the Transformers documentation. When you're ready, come back and see how easy it is to drop PEFT in to your training! - -Before you begin, make sure you have all the necessary libraries installed. - -```bash -pip install -q peft transformers datasets -``` - -## Dataset - -For this guide, you'll use the `twitter_complaints` subset of the [RAFT](https://huggingface.co/datasets/ought/raft) dataset. The `twitter_complaints` subset contains tweets labeled as `complaint` and `no complaint` and you can check out the [dataset viewer](https://huggingface.co/datasets/ought/raft/viewer/twitter_complaints) for a better idea of what the data looks like. - -Use the [`~datasets.load_dataset`] function to load the dataset and create a new `text_label` column so it is easier to understand what the `Label` values, `1` and `2` mean. - -```py -from datasets import load_dataset - -ds = load_dataset( - "parquet", - data_files={ - "train": "hf://datasets/ought/raft@refs/convert/parquet/twitter_complaints/train/0000.parquet", - "test": "hf://datasets/ought/raft@refs/convert/parquet/twitter_complaints/test/0000.parquet" - } -) - -classes = [k.replace("_", " ") for k in ds["train"].features["Label"].names] -ds = ds.map( - lambda x: {"text_label": [classes[label] for label in x["Label"]]}, - batched=True, - num_proc=1, -) -ds["train"][0] -{"Tweet text": "@HMRCcustomers No this is my first job", "ID": 0, "Label": 2, "text_label": "no complaint"} -``` - -Load a tokenizer, define the padding token to use, and determine the maximum length of the tokenized label. 
- -```py -from transformers import AutoTokenizer - -tokenizer = AutoTokenizer.from_pretrained("bigscience/bloomz-560m") -if tokenizer.pad_token_id is None: - tokenizer.pad_token_id = tokenizer.eos_token_id -target_max_length = max([len(tokenizer(class_label)["input_ids"]) for class_label in classes]) -print(target_max_length) -``` - -Create a preprocessing function that tokenizes the tweet text and labels, pad the inputs and labels in each batch, create an attention mask, and truncate sequences to the `max_length`. Then convert the `input_ids`, `attention_mask`, and `labels` to PyTorch tensors. - -```py -import torch - -max_length = 64 - -def preprocess_function(examples, text_column="Tweet text", label_column="text_label"): - batch_size = len(examples[text_column]) - inputs = [f"{text_column} : {x} Label : " for x in examples[text_column]] - targets = [str(x) for x in examples[label_column]] - model_inputs = tokenizer(inputs) - labels = tokenizer(targets) - classes = [k.replace("_", " ") for k in ds["train"].features["Label"].names] - for i in range(batch_size): - sample_input_ids = model_inputs["input_ids"][i] - label_input_ids = labels["input_ids"][i] - model_inputs["input_ids"][i] = [tokenizer.pad_token_id] * ( - max_length - len(sample_input_ids) - ) + sample_input_ids - model_inputs["attention_mask"][i] = [0] * (max_length - len(sample_input_ids)) + model_inputs[ - "attention_mask" - ][i] - labels["input_ids"][i] = [-100] * (max_length - len(label_input_ids)) + label_input_ids - model_inputs["input_ids"][i] = torch.tensor(model_inputs["input_ids"][i][:max_length]) - model_inputs["attention_mask"][i] = torch.tensor(model_inputs["attention_mask"][i][:max_length]) - labels["input_ids"][i] = torch.tensor(labels["input_ids"][i][:max_length]) - model_inputs["labels"] = labels["input_ids"] - return model_inputs -``` - -Apply the preprocessing function to the entire dataset with the [`~datasets.Dataset.map`] function, and remove the unprocessed columns because the 
model won't need them. - -```py -processed_ds = ds.map( - preprocess_function, - batched=True, - num_proc=1, - remove_columns=ds["train"].column_names, - load_from_cache_file=False, - desc="Running tokenizer on dataset", -) -``` - -Finally, create a training and evaluation [`DataLoader`](https://pytorch.org/docs/stable/data.html#torch.utils.data.DataLoader). You can set `pin_memory=True` to speed up the data transfer to the GPU during training if the samples in your dataset are on a CPU. - -```py -from torch.utils.data import DataLoader -from transformers import default_data_collator - -train_ds = processed_ds["train"] -eval_ds = processed_ds["test"] - -batch_size = 16 - -train_dataloader = DataLoader(train_ds, shuffle=True, collate_fn=default_data_collator, batch_size=batch_size, pin_memory=True) -eval_dataloader = DataLoader(eval_ds, collate_fn=default_data_collator, batch_size=batch_size, pin_memory=True) -``` - -## Model - -Now let's load a pretrained model to use as the base model for the soft prompt method. This guide uses the [bigscience/bloomz-560m](https://huggingface.co/bigscience/bloomz-560m) model, but you can use any causal language model you want. - -```py -from transformers import AutoModelForCausalLM - -model = AutoModelForCausalLM.from_pretrained("bigscience/bloomz-560m") -``` - -### PEFT configuration and model - -For any PEFT method, you'll need to create a configuration which contains all the parameters that specify how the PEFT method should be applied. Once the configuration is setup, pass it to the [`~peft.get_peft_model`] function along with the base model to create a trainable [`PeftModel`]. - -> [!TIP] -> Call the [`~PeftModel.print_trainable_parameters`] method to compare the number of trainable parameters of [`PeftModel`] versus the number of parameters in the base model! - - - - -[P-tuning](../conceptual_guides/prompting#p-tuning) adds a trainable embedding tensor where the prompt tokens can be added anywhere in the input sequence. 
Create a [`PromptEncoderConfig`] with the task type, the number of virtual tokens to add and learn, and the hidden size of the encoder for learning the prompt parameters. - -```py -from peft import PromptEncoderConfig, get_peft_model - -peft_config = PromptEncoderConfig(task_type="CAUSAL_LM", num_virtual_tokens=20, encoder_hidden_size=128) -model = get_peft_model(model, peft_config) -model.print_trainable_parameters() -"trainable params: 300,288 || all params: 559,514,880 || trainable%: 0.05366935013417338" -``` - - - - -[Prefix tuning](../conceptual_guides/prompting#prefix-tuning) adds task-specific parameters in all of the model layers, which are optimized by a separate feed-forward network. Create a [`PrefixTuningConfig`] with the task type and number of virtual tokens to add and learn. - -```py -from peft import PrefixTuningConfig, get_peft_model - -peft_config = PrefixTuningConfig(task_type="CAUSAL_LM", num_virtual_tokens=20) -model = get_peft_model(model, peft_config) -model.print_trainable_parameters() -"trainable params: 983,040 || all params: 560,197,632 || trainable%: 0.1754809274167014" -``` - - - - -[Prompt tuning](../conceptual_guides/prompting#prompt-tuning) formulates all tasks as a *generation* task and it adds a task-specific prompt to the input which is updated independently. The `prompt_tuning_init_text` parameter specifies how to finetune the model (in this case, it is classifying whether tweets are complaints or not). For the best results, the `prompt_tuning_init_text` should have the same number of tokens that should be predicted. To do this, you can set `num_virtual_tokens` to the number of tokens of the `prompt_tuning_init_text`. - -Create a [`PromptTuningConfig`] with the task type, the initial prompt tuning text to train the model with, the number of virtual tokens to add and learn, and a tokenizer. 
- -```py -from peft import PromptTuningConfig, PromptTuningInit, get_peft_model - -prompt_tuning_init_text = "Classify if the tweet is a complaint or no complaint.\n" -peft_config = PromptTuningConfig( - task_type="CAUSAL_LM", - prompt_tuning_init=PromptTuningInit.TEXT, - num_virtual_tokens=len(tokenizer(prompt_tuning_init_text)["input_ids"]), - prompt_tuning_init_text=prompt_tuning_init_text, - tokenizer_name_or_path="bigscience/bloomz-560m", -) -model = get_peft_model(model, peft_config) -model.print_trainable_parameters() -"trainable params: 8,192 || all params: 559,222,784 || trainable%: 0.0014648902430985358" -``` - - - - -### Training - -Set up an optimizer and learning rate scheduler. - -```py -from transformers import get_linear_schedule_with_warmup - -lr = 3e-2 -num_epochs = 50 - -optimizer = torch.optim.AdamW(model.parameters(), lr=lr) -lr_scheduler = get_linear_schedule_with_warmup( - optimizer=optimizer, - num_warmup_steps=0, - num_training_steps=(len(train_dataloader) * num_epochs), -) -``` - -Move the model to the GPU and create a training loop that reports the loss and perplexity for each epoch. 
- -```py -from tqdm import tqdm - -device = "cuda" -model = model.to(device) - -for epoch in range(num_epochs): - model.train() - total_loss = 0 - for step, batch in enumerate(tqdm(train_dataloader)): - batch = {k: v.to(device) for k, v in batch.items()} - outputs = model(**batch) - loss = outputs.loss - total_loss += loss.detach().float() - loss.backward() - optimizer.step() - lr_scheduler.step() - optimizer.zero_grad() - - model.eval() - eval_loss = 0 - - for step, batch in enumerate(tqdm(eval_dataloader)): - batch = {k: v.to(device) for k, v in batch.items()} - with torch.no_grad(): - outputs = model(**batch) - loss = outputs.loss - eval_loss += loss.detach().float() - - eval_epoch_loss = eval_loss / len(eval_dataloader) - eval_ppl = torch.exp(eval_epoch_loss) - train_epoch_loss = total_loss / len(train_dataloader) - train_ppl = torch.exp(train_epoch_loss) - print(f"{epoch=}: {train_ppl=} {train_epoch_loss=} {eval_ppl=} {eval_epoch_loss=}") -``` - -## Share your model - -Once training is complete, you can upload your model to the Hub with the [`~transformers.PreTrainedModel.push_to_hub`] method. You'll need to login to your Hugging Face account first and enter your token when prompted. - -```py -from huggingface_hub import notebook_login - -account = -peft_model_id = f"{account}/bloomz-560-m-peft-method" -model.push_to_hub(peft_model_id) -``` - -If you check the model file size in the repository, you’ll see that it is a lot smaller than a full sized model! - -
- -
For example, the adapter weights for a opt-350m model stored on the Hub are only ~6MB compared to the full model size which can be ~700MB.
-
- -## Inference - -Let's load the model for inference and test it out on a tweet! - -```py -from peft import AutoPeftModelForCausalLM - -model = AutoPeftModelForCausalLM.from_pretrained("peft_model_id").to("cuda") -tokenizer = AutoTokenizer.from_pretrained("bigscience/bloomz-560m") - -i = 15 -inputs = tokenizer(f'{text_column} : {ds["test"][i]["Tweet text"]} Label : ', return_tensors="pt") -print(ds["test"][i]["Tweet text"]) -"@NYTsupport i have complained a dozen times & yet my papers are still thrown FAR from my door. Why is this so hard to resolve?" -``` - -Call the [`~transformers.GenerationMixin.generate`] method to generate the predicted classification label. - -```py -with torch.no_grad(): - inputs = {k: v.to(device) for k, v in inputs.items()} - outputs = model.generate(input_ids=inputs["input_ids"], max_new_tokens=10) - print(tokenizer.batch_decode(outputs.detach().cpu().numpy(), skip_special_tokens=True)) -"['Tweet text : @NYTsupport i have complained a dozen times & yet my papers are still thrown FAR from my door. Why is this so hard to resolve? Label : complaint']" -``` diff --git a/src/peft/helpers.py b/src/peft/helpers.py index 9c497ed160..2541e638ff 100644 --- a/src/peft/helpers.py +++ b/src/peft/helpers.py @@ -298,7 +298,7 @@ class MontecloraTrainer(MontecloraTrainerMixin, Trainer): # Get PEFT model and train model = get_peft_model(base_model, lora_config) - trainer = MontecloraTrainer(model=model, args=training_args, ...) 
+ trainer = MontecloraTrainer(model=model, args=training_args) trainer.train() ``` """ diff --git a/src/peft/tuners/adamss/asa_callback.py b/src/peft/tuners/adamss/asa_callback.py index 0e9e2947a7..2adfe56cc5 100644 --- a/src/peft/tuners/adamss/asa_callback.py +++ b/src/peft/tuners/adamss/asa_callback.py @@ -66,7 +66,7 @@ class AdamssAsaCallback(TrainerCallback): trainer = Trainer( model=model, callbacks=[AdamssAsaCallback()], - ..., + # ..., ) trainer.train() ``` diff --git a/src/peft/tuners/boft/config.py b/src/peft/tuners/boft/config.py index 1715cc5bc6..9fdf4b7eb4 100644 --- a/src/peft/tuners/boft/config.py +++ b/src/peft/tuners/boft/config.py @@ -30,9 +30,19 @@ class BOFTConfig(PeftConfig): This is the configuration class to store the configuration of a [`BOFTModel`]. Args: - boft_block_size (`int`): BOFT block size across different layers. - boft_block_num (`int`): Number of BOFT blocks per injected layer. - boft_n_butterfly_factor (`int`): Number of butterfly factors across different layers. + boft_block_size (`int`): BOFT matrix block size across different layers, expressed in `int`. Bigger + block sizes result in more dense update matrices with more trainable parameters. Choose `boft_block_size` + to evenly divide most layers' input dimension (`in_features`), e.g., 4, 8, 16. Also, please only specify + either `boft_block_size` or `boft_block_num`, but not both simultaneously or leaving both to 0, because + `boft_block_size` x `boft_block_num` must equal the layer's input dimension. + boft_block_num (`int`): Number of BOFT blocks per injected layer. A bigger `boft_block_num` results in sparser + update matrices with **fewer** trainable parameters. **Note**, please choose `boft_block_num` to + evenly divide most layers' input dimension (`in_features`), e.g., 4, 8, 16. 
Only specify either + `boft_block_size` or `boft_block_num`, but not both simultaneously or leaving both to 0, because + `boft_block_size` x `boft_block_num` must equal the layer's input dimension. + boft_n_butterfly_factor (`int`): Number of butterfly factors across different layers. For + `boft_n_butterfly_factor=1`, BOFT is the same as vanilla OFT, for `boft_n_butterfly_factor=2`, the + effective block size of OFT becomes twice as big and the number of blocks is halved. target_modules (`Union[List[str],str]`): The names of the modules to apply the adapter to. exclude_modules (`Optional[Union[List[str], str]]`): The names of the modules to not apply the adapter. When passing a string, a regex match will be performed. diff --git a/src/peft/tuners/oft/config.py b/src/peft/tuners/oft/config.py index 9c62e1bece..bbe92af142 100644 --- a/src/peft/tuners/oft/config.py +++ b/src/peft/tuners/oft/config.py @@ -30,8 +30,22 @@ class OFTConfig(PeftConfig): This is the configuration class to store the configuration of a [`OFTModel`]. Args: - r (`int`): OFT rank, number of OFT blocks per injected layer. - oft_block_size (`int`): OFT block size across different layers. + r (`int`): + OFT rank, number of OFT blocks per injected layer. Bigger `r` results in more sparse update matrices with + fewer trainable parameters. You can only specify either `r` or `oft_block_size`, but not both + simultaneously, because `r` × `oft_block_size` = layer dimension. For simplicity, we let you specify either + `r` or `oft_block_size` and infer the other one. Default set to `r = 0`, the user is advised to set the + `oft_block_size` instead for better clarity. + oft_block_size (`int`): OFT block size across different layers. Bigger `oft_block_size` results in more dense + update matrices with more trainable parameters. Choose `oft_block_size` to evenly divide the layer's input + dimension (`in_features`), e.g., 4, 8, 16. 
You can only specify either `r` or `oft_block_size`, but not + both simultaneously, because `r` × `oft_block_size` = layer dimension. For simplicity, we let you specify + either `r` or `oft_block_size` and infer the other one. Default set to `oft_block_size = 32`. + use_cayley_neumann (bool): Specifies whether to use the Cayley-Neumann parameterization (efficient but + approximate) or the vanilla Cayley parameterization (exact but computationally expensive because of matrix + inverse). We recommend setting it to `True` for better efficiency, but performance may be slightly worse + because of the approximation error. Please test both settings (`True` and `False`) depending on your needs. + Default is `False`. module_dropout (`float`): The multiplicative dropout probability, by setting OFT blocks to identity during training, similar to the dropout layer in LoRA.