diff --git a/src/peft/tuners/adamss/config.py b/src/peft/tuners/adamss/config.py index 0a041c6651..7a998fe066 100644 --- a/src/peft/tuners/adamss/config.py +++ b/src/peft/tuners/adamss/config.py @@ -317,6 +317,15 @@ class AdamssConfig(PeftConfig): ) }, ) + random_seed: int = field( + default=0, + metadata={ + "help": ( + "Seed used to deterministically create and rebuild the adapter weights, so that a saved adapter " + "reproduces its outputs after loading. Default: 0." + ) + }, + ) def __post_init__(self): self.peft_type = PeftType.ADAMSS diff --git a/src/peft/tuners/adamss/layer.py b/src/peft/tuners/adamss/layer.py index 2a0ba296b3..dc08b463c5 100644 --- a/src/peft/tuners/adamss/layer.py +++ b/src/peft/tuners/adamss/layer.py @@ -175,7 +175,7 @@ def update_layer( # Perform SVD decomposition with diagnostics in case of failure try: - res = slice_pca(weight_tensor, r, device, torch.float32) + res = slice_pca(weight_tensor, r, device, torch.float32, random_seed=config.random_seed) except Exception as e: raise RuntimeError( f"slice_pca raised an exception for layer {adapter_name} (shape={tuple(weight_tensor.shape)}, dtype={weight_tensor.dtype}, device={device}): {e}" diff --git a/src/peft/tuners/adamss/utils.py b/src/peft/tuners/adamss/utils.py index 7e20c95c5c..e8600cbe6c 100644 --- a/src/peft/tuners/adamss/utils.py +++ b/src/peft/tuners/adamss/utils.py @@ -15,7 +15,7 @@ import torch -def slice_pca(tensor, r, device, dtype=torch.float32): +def slice_pca(tensor, r, device, dtype=torch.float32, random_seed=0): """ Perform slice-wise PCA (SVD) on 4D tensor. @@ -24,6 +24,8 @@ def slice_pca(tensor, r, device, dtype=torch.float32): r: rank for low-rank approximation device: computation device dtype: data type + random_seed: seed for the random projection used by `torch.svd_lowrank`, so the decomposition is + deterministic and reproducible across save/load Returns: VVT: Right singular vectors (B, C, r, W) UU: Left singular vectors (B, C, H, r) @@ -38,11 +40,18 @@ def slice_pca(tensor, r, device, dtype=torch.float32): UU = torch.zeros(B, C, H, effective_r, dtype=dtype, device=device) VVT = torch.zeros(B, C, effective_r, W, dtype=dtype, device=device) - for i in range(B): - for j in range(C): - U, _, V = torch.svd_lowrank(tensor[i, j, :, :], q=effective_r, niter=2, M=None) - UU[i, j, :, :] = U[:, 0:effective_r] - VVT[i, j, :, :] = V[:, 0:effective_r].T + # torch.svd_lowrank draws a random projection internally, so its result (and hence the downstream + # clustering and scatter_index) depends on the RNG state. Seed a forked RNG with the configurable + # random_seed so the result is deterministic (torch.svd_lowrank does not accept a generator argument); + # fork_rng leaves the global RNG stream untouched. + fork_devices = [device] if torch.device(device).type == "cuda" else [] + with torch.random.fork_rng(devices=fork_devices): + torch.manual_seed(random_seed) + for i in range(B): + for j in range(C): + U, _, V = torch.svd_lowrank(tensor[i, j, :, :], q=effective_r, niter=2, M=None) + UU[i, j, :, :] = U[:, 0:effective_r] + VVT[i, j, :, :] = V[:, 0:effective_r].T # Return computed matrices (important: ensure callers receive VVT and UU) return VVT, UU diff --git a/src/peft/tuners/psoft/config.py b/src/peft/tuners/psoft/config.py index b8f6a731d5..41d4e3c93b 100644 --- a/src/peft/tuners/psoft/config.py +++ b/src/peft/tuners/psoft/config.py @@ -159,6 +159,16 @@ class PsoftConfig(PeftConfig): "help": "Number of power iterations used by torch.svd_lowrank when psoft_svd='lowrank'. Only used when psoft_svd='lowrank'. " }, ) + random_seed: int = field( + default=0, + metadata={ + "help": ( + "Seed used to deterministically create and rebuild the adapter weights when psoft_svd='lowrank', so " + "that a saved adapter reproduces its outputs after loading. Only used when psoft_svd='lowrank'. " + "Default: 0." + ) + }, + ) psoft_orth: bool = field( default=True, metadata={ diff --git a/src/peft/tuners/psoft/layer.py b/src/peft/tuners/psoft/layer.py index 121c0a61d2..d2fa8fafbb 100644 --- a/src/peft/tuners/psoft/layer.py +++ b/src/peft/tuners/psoft/layer.py @@ -205,6 +205,7 @@ def __init__(self, base_layer: nn.Module, **kwargs) -> None: self.psoft_dropout = nn.ModuleDict({}) self.psoft_svd: dict[str, str] = {} self.psoft_svd_lowrank_niter: dict[str, int] = {} + self.random_seed: dict[str, int] = {} self.ab_svd_init: dict[str, Optional[str]] = {} # per-adapter trainable module @@ -251,6 +252,7 @@ def update_layer(self, adapter_name: str, config: PsoftConfig, **kwargs: Any) -> self.ab_svd_init[adapter_name] = config.ab_svd_init self.psoft_svd[adapter_name] = config.psoft_svd self.psoft_svd_lowrank_niter[adapter_name] = config.psoft_svd_lowrank_niter + self.random_seed[adapter_name] = config.random_seed self.psoft_R[adapter_name] = OrthLayer( size=r, @@ -290,6 +292,7 @@ def _build_psoft_ab_cache_buffers(self, adapter_name: str, init_type: str) -> No r, svd_mode=self.psoft_svd[adapter_name], niter=self.psoft_svd_lowrank_niter[adapter_name], + random_seed=self.random_seed[adapter_name], ) Sr_scaled = Sr / self.scaling[adapter_name] @@ -309,7 +312,7 @@ def _build_psoft_ab_cache_buffers(self, adapter_name: str, init_type: str) -> No self._set_psoft_ab_cache_buffers(adapter_name, A, B) - def _compute_svd_factors(self, weight: torch.Tensor, r: int, *, svd_mode: str, niter: int): + def _compute_svd_factors(self, weight: torch.Tensor, r: int, *, svd_mode: str, niter: int, random_seed: int = 0): # weight: (out, in) fp32 if svd_mode == "full": U, S, Vh = torch.linalg.svd(weight.data, full_matrices=False) @@ -317,7 +320,13 @@ def _compute_svd_factors(self, weight: torch.Tensor, r: int, *, svd_mode: str, n Sr = S[:r] # (r,) Uhr = Vh[:r, :] # (r, in) elif svd_mode == "lowrank": - U, S, V = svd_lowrank(weight.data, q=r, niter=niter) # V: (in, r) + # torch.svd_lowrank uses a random projection, so the A/B initialization it produces depends on the + # RNG state. Seed a forked RNG with the configurable random_seed to make it deterministic + # (torch.svd_lowrank does not accept a generator argument); fork_rng leaves the global RNG untouched. + fork_devices = [weight.device] if weight.device.type == "cuda" else [] + with torch.random.fork_rng(devices=fork_devices): + torch.manual_seed(random_seed) + U, S, V = svd_lowrank(weight.data, q=r, niter=niter) # V: (in, r) Vr = U[:, :r] Sr = S[:r] Uhr = V[:, :r].t() # (r, in) diff --git a/tests/test_custom_models.py b/tests/test_custom_models.py index a528171ca6..c96c41795f 100644 --- a/tests/test_custom_models.py +++ b/tests/test_custom_models.py @@ -2553,6 +2553,40 @@ def test_only_params_are_updated(self, test_name, model_id, config_cls, config_k else: assert torch.allclose(param_before, param_after, atol=tol, rtol=tol) + @pytest.mark.parametrize("test_name, model_id, config_cls, config_kwargs", TEST_CASES) + def test_save_load_roundtrip(self, test_name, model_id, config_cls, config_kwargs, tmp_path): + # An explicit test that when loading a trained model, the outputs from the forward pass remain the same + X = self.prepare_inputs_for_testing() + model = self.transformers_class.from_pretrained(model_id).to(self.torch_device) + with torch.inference_mode(): + output_base = model(**X) + + config_kwargs = set_init_weights_false(config_cls, config_kwargs) + config = config_cls( + base_model_name_or_path=model_id, + **config_kwargs, + ) + torch.manual_seed(0) + model = get_peft_model(model, config) + model.eval() + with torch.inference_mode(): + output_before = model(**X) + + # sanity check + atol, rtol = 1e-5, 1e-5 + assert not torch.allclose(output_base, output_before, atol=atol, rtol=rtol) + + model.save_pretrained(tmp_path) + del model + + model = self.transformers_class.from_pretrained(model_id).to(self.torch_device) + torch.manual_seed(54321) # ensure that the seed is different from what was used when get_peft_model was called + model = PeftModel.from_pretrained(model, tmp_path) + with torch.inference_mode(): + output_after = model(**X) + + assert torch.allclose(output_before, output_after, atol=atol, rtol=rtol) + @pytest.mark.parametrize("test_name, model_id, config_cls, config_kwargs", TEST_CASES) def test_parameters_after_loading_model(self, test_name, model_id, config_cls, config_kwargs): # An explicit test that when loading a trained model, the parameters are loaded correctly