From 3f531da779e1e0a39c3fe05e2802c0787c70a24a Mon Sep 17 00:00:00 2001 From: AshNicolus Date: Mon, 8 Jun 2026 21:53:27 +0530 Subject: [PATCH 1/3] FIX AdaMSS save/load reproduction by making slice_pca SVD deterministic --- src/peft/tuners/adamss/utils.py | 20 ++++++++++++---- tests/test_adamss_asa.py | 41 ++++++++++++++++++++++++++++++++- 2 files changed, 55 insertions(+), 6 deletions(-) diff --git a/src/peft/tuners/adamss/utils.py b/src/peft/tuners/adamss/utils.py index 7e20c95c5c..58626e0eeb 100644 --- a/src/peft/tuners/adamss/utils.py +++ b/src/peft/tuners/adamss/utils.py @@ -38,11 +38,21 @@ def slice_pca(tensor, r, device, dtype=torch.float32): UU = torch.zeros(B, C, H, effective_r, dtype=dtype, device=device) VVT = torch.zeros(B, C, effective_r, W, dtype=dtype, device=device) - for i in range(B): - for j in range(C): - U, _, V = torch.svd_lowrank(tensor[i, j, :, :], q=effective_r, niter=2, M=None) - UU[i, j, :, :] = U[:, 0:effective_r] - VVT[i, j, :, :] = V[:, 0:effective_r].T + # torch.svd_lowrank draws a random projection internally, so its result (and hence the + # downstream clustering and scatter_index) depends on the global RNG state. Because the + # AdaMSS adapter is rebuilt from the base weights when an adapter is loaded, a different + # RNG state at load time would produce a different scatter_index than at save time and the + # reloaded adapter would not reproduce its outputs. Seed a forked RNG so the decomposition + # is deterministic and reproducible across save/load; this matches the fixed seed already + # used for KMeans in clustering_Z and leaves the global RNG stream untouched. + fork_devices = [device] if torch.device(device).type == "cuda" else [] + with torch.random.fork_rng(devices=fork_devices): + torch.manual_seed(0) + for i in range(B): + for j in range(C): + U, _, V = torch.svd_lowrank(tensor[i, j, :, :], q=effective_r, niter=2, M=None) + UU[i, j, :, :] = U[:, 0:effective_r] + VVT[i, j, :, :] = V[:, 0:effective_r].T # Return computed matrices (important: ensure callers receive VVT and UU) return VVT, UU diff --git a/tests/test_adamss_asa.py b/tests/test_adamss_asa.py index 9a305e4ba4..fea1ee62fe 100644 --- a/tests/test_adamss_asa.py +++ b/tests/test_adamss_asa.py @@ -7,10 +7,12 @@ - update_and_allocate: full ASA flow (accumulate → global mask → reset) """ +import copy + import torch from torch import nn -from peft import AdamssConfig, get_peft_model +from peft import AdamssConfig, PeftModel, get_peft_model from peft.tuners.adamss.layer import AdamssLayer @@ -270,3 +272,40 @@ def test_asa_disabled_is_noop(self): for layer in layers: for p in layer.adamss_A["default"]: assert p.requires_grad + + +class TestAdamssSaveLoad: + def test_save_load_reproduces_output(self, tmp_path): + # Regression test: a trained AdaMSS adapter must reproduce its outputs after + # save_pretrained/from_pretrained. The adapter is rebuilt from the base weights on load, + # so the SVD-based subspace allocation (and the resulting scatter_index used to place each + # subspace's contribution into the output) must be deterministic; otherwise the restored + # weights are scattered to the wrong output dimensions and the output changes. + torch.manual_seed(0) + base = SimpleMLP() + config = AdamssConfig(target_modules=["lin0", "lin1"], r=8, num_subspaces=4, subspace_rank=1) + model = get_peft_model(copy.deepcopy(base), config) + model.eval() + + # Simulate training so the adapter is non-trivial (AdaMSS initializes B=0, so an untrained + # adapter is a no-op and would reproduce trivially even if the mapping were wrong). + with torch.no_grad(): + for p in model.parameters(): + if p.requires_grad: + p.add_(torch.randn_like(p) * 0.1 + 0.05) + + x = torch.randn(4, 20) + with torch.no_grad(): + out_before = model(x) + + model.save_pretrained(tmp_path) + + # Change the global RNG state to mimic loading in a fresh process; the reconstructed + # subspace allocation must not depend on it. + torch.manual_seed(12345) + reloaded = PeftModel.from_pretrained(copy.deepcopy(base), tmp_path) + reloaded.eval() + with torch.no_grad(): + out_after = reloaded(x) + + assert torch.allclose(out_before, out_after, atol=1e-5, rtol=1e-4) From 9b7e1731ac912952466259abcf33333d1e5721d0 Mon Sep 17 00:00:00 2001 From: AshNicolus Date: Tue, 9 Jun 2026 16:15:15 +0530 Subject: [PATCH 2/3] Address review: configurable random_seed for AdaMSS/PSOFT SVD init; generic save/load roundtrip test --- src/peft/tuners/adamss/config.py | 12 ++++++++++ src/peft/tuners/adamss/layer.py | 2 +- src/peft/tuners/adamss/utils.py | 18 +++++++------- src/peft/tuners/psoft/config.py | 11 +++++++++ src/peft/tuners/psoft/layer.py | 15 ++++++++++-- tests/test_adamss_asa.py | 41 +------------------------------- tests/test_custom_models.py | 34 ++++++++++++++++++++++++++ 7 files changed, 82 insertions(+), 51 deletions(-) diff --git a/src/peft/tuners/adamss/config.py b/src/peft/tuners/adamss/config.py index 0a041c6651..641ff137f4 100644 --- a/src/peft/tuners/adamss/config.py +++ b/src/peft/tuners/adamss/config.py @@ -317,6 +317,18 @@ class AdamssConfig(PeftConfig): ) }, ) + random_seed: int = field( + default=0, + metadata={ + "help": ( + "Seed for the random projection used by the SVD (torch.svd_lowrank) during initialization. " + "AdaMSS rebuilds its subspace decomposition from the base weights when an adapter is loaded, so a " + "fixed seed is required for the reloaded adapter to reproduce its trained outputs. The seed is stored " + "in the adapter config and reused on load; change it to obtain a different (but still reproducible) " + "subspace allocation. Default: 0." + ) + }, + ) def __post_init__(self): self.peft_type = PeftType.ADAMSS diff --git a/src/peft/tuners/adamss/layer.py b/src/peft/tuners/adamss/layer.py index 2a0ba296b3..dc08b463c5 100644 --- a/src/peft/tuners/adamss/layer.py +++ b/src/peft/tuners/adamss/layer.py @@ -175,7 +175,7 @@ def update_layer( # Perform SVD decomposition with diagnostics in case of failure try: - res = slice_pca(weight_tensor, r, device, torch.float32) + res = slice_pca(weight_tensor, r, device, torch.float32, random_seed=config.random_seed) except Exception as e: raise RuntimeError( f"slice_pca raised an exception for layer {adapter_name} (shape={tuple(weight_tensor.shape)}, dtype={weight_tensor.dtype}, device={device}): {e}" diff --git a/src/peft/tuners/adamss/utils.py b/src/peft/tuners/adamss/utils.py index 58626e0eeb..4f237aa318 100644 --- a/src/peft/tuners/adamss/utils.py +++ b/src/peft/tuners/adamss/utils.py @@ -15,7 +15,7 @@ import torch -def slice_pca(tensor, r, device, dtype=torch.float32): +def slice_pca(tensor, r, device, dtype=torch.float32, random_seed=0): """ Perform slice-wise PCA (SVD) on 4D tensor. @@ -24,6 +24,8 @@ def slice_pca(tensor, r, device, dtype=torch.float32): r: rank for low-rank approximation device: computation device dtype: data type + random_seed: seed for the random projection used by ``torch.svd_lowrank``, so the decomposition is + deterministic and reproducible across save/load Returns: VVT: Right singular vectors (B, C, r, W) UU: Left singular vectors (B, C, H, r) @@ -39,15 +41,15 @@ def slice_pca(tensor, r, device, dtype=torch.float32): VVT = torch.zeros(B, C, effective_r, W, dtype=dtype, device=device) # torch.svd_lowrank draws a random projection internally, so its result (and hence the - # downstream clustering and scatter_index) depends on the global RNG state. Because the - # AdaMSS adapter is rebuilt from the base weights when an adapter is loaded, a different - # RNG state at load time would produce a different scatter_index than at save time and the - # reloaded adapter would not reproduce its outputs. Seed a forked RNG so the decomposition - # is deterministic and reproducible across save/load; this matches the fixed seed already - # used for KMeans in clustering_Z and leaves the global RNG stream untouched. + # downstream clustering and scatter_index) depends on the RNG state. Because the AdaMSS + # adapter is rebuilt from the base weights when it is loaded, a different RNG state at load + # time would produce a different scatter_index than at save time and the reloaded adapter + # would not reproduce its outputs. Seed a forked RNG with the configurable ``random_seed`` so + # the decomposition is deterministic and reproducible across save/load (torch.svd_lowrank does + # not accept a generator argument); fork_rng leaves the global RNG stream untouched. fork_devices = [device] if torch.device(device).type == "cuda" else [] with torch.random.fork_rng(devices=fork_devices): - torch.manual_seed(0) + torch.manual_seed(random_seed) for i in range(B): for j in range(C): U, _, V = torch.svd_lowrank(tensor[i, j, :, :], q=effective_r, niter=2, M=None) diff --git a/src/peft/tuners/psoft/config.py b/src/peft/tuners/psoft/config.py index b8f6a731d5..b034d5075f 100644 --- a/src/peft/tuners/psoft/config.py +++ b/src/peft/tuners/psoft/config.py @@ -159,6 +159,17 @@ class PsoftConfig(PeftConfig): "help": "Number of power iterations used by torch.svd_lowrank when psoft_svd='lowrank'. Only used when psoft_svd='lowrank'. " }, ) + random_seed: int = field( + default=0, + metadata={ + "help": ( + "Seed for the random projection used by torch.svd_lowrank when psoft_svd='lowrank'. PSOFT rebuilds " + "its SVD-based A/B initialization from the base weights when an adapter is loaded, so a fixed seed is " + "required for the reloaded adapter to reproduce its trained outputs. The seed is stored in the adapter " + "config and reused on load. Only used when psoft_svd='lowrank'. Default: 0." + ) + }, + ) psoft_orth: bool = field( default=True, metadata={ diff --git a/src/peft/tuners/psoft/layer.py b/src/peft/tuners/psoft/layer.py index 121c0a61d2..5ce45b2539 100644 --- a/src/peft/tuners/psoft/layer.py +++ b/src/peft/tuners/psoft/layer.py @@ -205,6 +205,7 @@ def __init__(self, base_layer: nn.Module, **kwargs) -> None: self.psoft_dropout = nn.ModuleDict({}) self.psoft_svd: dict[str, str] = {} self.psoft_svd_lowrank_niter: dict[str, int] = {} + self.random_seed: dict[str, int] = {} self.ab_svd_init: dict[str, Optional[str]] = {} # per-adapter trainable module @@ -251,6 +252,7 @@ def update_layer(self, adapter_name: str, config: PsoftConfig, **kwargs: Any) -> self.ab_svd_init[adapter_name] = config.ab_svd_init self.psoft_svd[adapter_name] = config.psoft_svd self.psoft_svd_lowrank_niter[adapter_name] = config.psoft_svd_lowrank_niter + self.random_seed[adapter_name] = config.random_seed self.psoft_R[adapter_name] = OrthLayer( size=r, @@ -290,6 +292,7 @@ def _build_psoft_ab_cache_buffers(self, adapter_name: str, init_type: str) -> No r, svd_mode=self.psoft_svd[adapter_name], niter=self.psoft_svd_lowrank_niter[adapter_name], + random_seed=self.random_seed[adapter_name], ) Sr_scaled = Sr / self.scaling[adapter_name] @@ -309,7 +312,7 @@ def _build_psoft_ab_cache_buffers(self, adapter_name: str, init_type: str) -> No self._set_psoft_ab_cache_buffers(adapter_name, A, B) - def _compute_svd_factors(self, weight: torch.Tensor, r: int, *, svd_mode: str, niter: int): + def _compute_svd_factors(self, weight: torch.Tensor, r: int, *, svd_mode: str, niter: int, random_seed: int = 0): # weight: (out, in) fp32 if svd_mode == "full": U, S, Vh = torch.linalg.svd(weight.data, full_matrices=False) @@ -317,7 +320,15 @@ def _compute_svd_factors(self, weight: torch.Tensor, r: int, *, svd_mode: str, n Sr = S[:r] # (r,) Uhr = Vh[:r, :] # (r, in) elif svd_mode == "lowrank": - U, S, V = svd_lowrank(weight.data, q=r, niter=niter) # V: (in, r) + # torch.svd_lowrank uses a random projection, so the A/B initialization it produces depends on the + # RNG state. Because that initialization is rebuilt from the base weights when an adapter is loaded, + # seed a forked RNG with the configurable random_seed to make it deterministic and reproducible across + # save/load (torch.svd_lowrank does not accept a generator argument). fork_rng leaves the global RNG + # stream untouched. + fork_devices = [weight.device] if weight.device.type == "cuda" else [] + with torch.random.fork_rng(devices=fork_devices): + torch.manual_seed(random_seed) + U, S, V = svd_lowrank(weight.data, q=r, niter=niter) # V: (in, r) Vr = U[:, :r] Sr = S[:r] Uhr = V[:, :r].t() # (r, in) diff --git a/tests/test_adamss_asa.py b/tests/test_adamss_asa.py index fea1ee62fe..9a305e4ba4 100644 --- a/tests/test_adamss_asa.py +++ b/tests/test_adamss_asa.py @@ -7,12 +7,10 @@ - update_and_allocate: full ASA flow (accumulate → global mask → reset) """ -import copy - import torch from torch import nn -from peft import AdamssConfig, PeftModel, get_peft_model +from peft import AdamssConfig, get_peft_model from peft.tuners.adamss.layer import AdamssLayer @@ -272,40 +270,3 @@ def test_asa_disabled_is_noop(self): for layer in layers: for p in layer.adamss_A["default"]: assert p.requires_grad - - -class TestAdamssSaveLoad: - def test_save_load_reproduces_output(self, tmp_path): - # Regression test: a trained AdaMSS adapter must reproduce its outputs after - # save_pretrained/from_pretrained. The adapter is rebuilt from the base weights on load, - # so the SVD-based subspace allocation (and the resulting scatter_index used to place each - # subspace's contribution into the output) must be deterministic; otherwise the restored - # weights are scattered to the wrong output dimensions and the output changes. - torch.manual_seed(0) - base = SimpleMLP() - config = AdamssConfig(target_modules=["lin0", "lin1"], r=8, num_subspaces=4, subspace_rank=1) - model = get_peft_model(copy.deepcopy(base), config) - model.eval() - - # Simulate training so the adapter is non-trivial (AdaMSS initializes B=0, so an untrained - # adapter is a no-op and would reproduce trivially even if the mapping were wrong). - with torch.no_grad(): - for p in model.parameters(): - if p.requires_grad: - p.add_(torch.randn_like(p) * 0.1 + 0.05) - - x = torch.randn(4, 20) - with torch.no_grad(): - out_before = model(x) - - model.save_pretrained(tmp_path) - - # Change the global RNG state to mimic loading in a fresh process; the reconstructed - # subspace allocation must not depend on it. - torch.manual_seed(12345) - reloaded = PeftModel.from_pretrained(copy.deepcopy(base), tmp_path) - reloaded.eval() - with torch.no_grad(): - out_after = reloaded(x) - - assert torch.allclose(out_before, out_after, atol=1e-5, rtol=1e-4) diff --git a/tests/test_custom_models.py b/tests/test_custom_models.py index a528171ca6..c96c41795f 100644 --- a/tests/test_custom_models.py +++ b/tests/test_custom_models.py @@ -2553,6 +2553,40 @@ def test_only_params_are_updated(self, test_name, model_id, config_cls, config_k else: assert torch.allclose(param_before, param_after, atol=tol, rtol=tol) + @pytest.mark.parametrize("test_name, model_id, config_cls, config_kwargs", TEST_CASES) + def test_save_load_roundtrip(self, test_name, model_id, config_cls, config_kwargs, tmp_path): + # An explicit test that when loading a trained model, the outputs from the forward pass remain the same + X = self.prepare_inputs_for_testing() + model = self.transformers_class.from_pretrained(model_id).to(self.torch_device) + with torch.inference_mode(): + output_base = model(**X) + + config_kwargs = set_init_weights_false(config_cls, config_kwargs) + config = config_cls( + base_model_name_or_path=model_id, + **config_kwargs, + ) + torch.manual_seed(0) + model = get_peft_model(model, config) + model.eval() + with torch.inference_mode(): + output_before = model(**X) + + # sanity check + atol, rtol = 1e-5, 1e-5 + assert not torch.allclose(output_base, output_before, atol=atol, rtol=rtol) + + model.save_pretrained(tmp_path) + del model + + model = self.transformers_class.from_pretrained(model_id).to(self.torch_device) + torch.manual_seed(54321) # ensure that the seed is different from what was used when get_peft_model was called + model = PeftModel.from_pretrained(model, tmp_path) + with torch.inference_mode(): + output_after = model(**X) + + assert torch.allclose(output_before, output_after, atol=atol, rtol=rtol) + @pytest.mark.parametrize("test_name, model_id, config_cls, config_kwargs", TEST_CASES) def test_parameters_after_loading_model(self, test_name, model_id, config_cls, config_kwargs): # An explicit test that when loading a trained model, the parameters are loaded correctly From 00659c6f443721fcf9d8a2134a3c93c13e8fceb0 Mon Sep 17 00:00:00 2001 From: AshNicolus Date: Tue, 9 Jun 2026 18:30:42 +0530 Subject: [PATCH 3/3] Trim verbose comments and config help text per review --- src/peft/tuners/adamss/config.py | 7 ++----- src/peft/tuners/adamss/utils.py | 13 +++++-------- src/peft/tuners/psoft/config.py | 7 +++---- src/peft/tuners/psoft/layer.py | 6 ++---- 4 files changed, 12 insertions(+), 21 deletions(-) diff --git a/src/peft/tuners/adamss/config.py b/src/peft/tuners/adamss/config.py index 641ff137f4..7a998fe066 100644 --- a/src/peft/tuners/adamss/config.py +++ b/src/peft/tuners/adamss/config.py @@ -321,11 +321,8 @@ class AdamssConfig(PeftConfig): default=0, metadata={ "help": ( - "Seed for the random projection used by the SVD (torch.svd_lowrank) during initialization. " - "AdaMSS rebuilds its subspace decomposition from the base weights when an adapter is loaded, so a " - "fixed seed is required for the reloaded adapter to reproduce its trained outputs. The seed is stored " - "in the adapter config and reused on load; change it to obtain a different (but still reproducible) " - "subspace allocation. Default: 0." + "Seed used to deterministically create and rebuild the adapter weights, so that a saved adapter " + "reproduces its outputs after loading. Default: 0." ) }, ) diff --git a/src/peft/tuners/adamss/utils.py b/src/peft/tuners/adamss/utils.py index 4f237aa318..e8600cbe6c 100644 --- a/src/peft/tuners/adamss/utils.py +++ b/src/peft/tuners/adamss/utils.py @@ -24,7 +24,7 @@ def slice_pca(tensor, r, device, dtype=torch.float32, random_seed=0): r: rank for low-rank approximation device: computation device dtype: data type - random_seed: seed for the random projection used by ``torch.svd_lowrank``, so the decomposition is + random_seed: seed for the random projection used by `torch.svd_lowrank`, so the decomposition is deterministic and reproducible across save/load Returns: @@ -40,13 +40,10 @@ def slice_pca(tensor, r, device, dtype=torch.float32, random_seed=0): UU = torch.zeros(B, C, H, effective_r, dtype=dtype, device=device) VVT = torch.zeros(B, C, effective_r, W, dtype=dtype, device=device) - # torch.svd_lowrank draws a random projection internally, so its result (and hence the - # downstream clustering and scatter_index) depends on the RNG state. Because the AdaMSS - # adapter is rebuilt from the base weights when it is loaded, a different RNG state at load - # time would produce a different scatter_index than at save time and the reloaded adapter - # would not reproduce its outputs. Seed a forked RNG with the configurable ``random_seed`` so - # the decomposition is deterministic and reproducible across save/load (torch.svd_lowrank does - # not accept a generator argument); fork_rng leaves the global RNG stream untouched. + # torch.svd_lowrank draws a random projection internally, so its result (and hence the downstream + # clustering and scatter_index) depends on the RNG state. Seed a forked RNG with the configurable + # random_seed so the result is deterministic (torch.svd_lowrank does not accept a generator argument); + # fork_rng leaves the global RNG stream untouched. fork_devices = [device] if torch.device(device).type == "cuda" else [] with torch.random.fork_rng(devices=fork_devices): torch.manual_seed(random_seed) diff --git a/src/peft/tuners/psoft/config.py b/src/peft/tuners/psoft/config.py index b034d5075f..41d4e3c93b 100644 --- a/src/peft/tuners/psoft/config.py +++ b/src/peft/tuners/psoft/config.py @@ -163,10 +163,9 @@ class PsoftConfig(PeftConfig): default=0, metadata={ "help": ( - "Seed for the random projection used by torch.svd_lowrank when psoft_svd='lowrank'. PSOFT rebuilds " - "its SVD-based A/B initialization from the base weights when an adapter is loaded, so a fixed seed is " - "required for the reloaded adapter to reproduce its trained outputs. The seed is stored in the adapter " - "config and reused on load. Only used when psoft_svd='lowrank'. Default: 0." + "Seed used to deterministically create and rebuild the adapter weights when psoft_svd='lowrank', so " + "that a saved adapter reproduces its outputs after loading. Only used when psoft_svd='lowrank'. " + "Default: 0." ) }, ) diff --git a/src/peft/tuners/psoft/layer.py b/src/peft/tuners/psoft/layer.py index 5ce45b2539..d2fa8fafbb 100644 --- a/src/peft/tuners/psoft/layer.py +++ b/src/peft/tuners/psoft/layer.py @@ -321,10 +321,8 @@ def _compute_svd_factors(self, weight: torch.Tensor, r: int, *, svd_mode: str, n Uhr = Vh[:r, :] # (r, in) elif svd_mode == "lowrank": # torch.svd_lowrank uses a random projection, so the A/B initialization it produces depends on the - # RNG state. Because that initialization is rebuilt from the base weights when an adapter is loaded, - # seed a forked RNG with the configurable random_seed to make it deterministic and reproducible across - # save/load (torch.svd_lowrank does not accept a generator argument). fork_rng leaves the global RNG - # stream untouched. + # RNG state. Seed a forked RNG with the configurable random_seed to make it deterministic + # (torch.svd_lowrank does not accept a generator argument); fork_rng leaves the global RNG untouched. fork_devices = [weight.device] if weight.device.type == "cuda" else [] with torch.random.fork_rng(devices=fork_devices): torch.manual_seed(random_seed)