From 3f531da779e1e0a39c3fe05e2802c0787c70a24a Mon Sep 17 00:00:00 2001
From: AshNicolus <yashnema52@gmail.com>
Date: Mon, 8 Jun 2026 21:53:27 +0530
Subject: [PATCH 1/3] FIX AdaMSS save/load reproduction by making slice_pca SVD
 deterministic

---
 src/peft/tuners/adamss/utils.py | 20 ++++++++++++----
 tests/test_adamss_asa.py        | 41 ++++++++++++++++++++++++++++++++-
 2 files changed, 55 insertions(+), 6 deletions(-)

diff --git a/src/peft/tuners/adamss/utils.py b/src/peft/tuners/adamss/utils.py
index 7e20c95c5c..58626e0eeb 100644
--- a/src/peft/tuners/adamss/utils.py
+++ b/src/peft/tuners/adamss/utils.py
@@ -38,11 +38,21 @@ def slice_pca(tensor, r, device, dtype=torch.float32):
     UU = torch.zeros(B, C, H, effective_r, dtype=dtype, device=device)
     VVT = torch.zeros(B, C, effective_r, W, dtype=dtype, device=device)
 
-    for i in range(B):
-        for j in range(C):
-            U, _, V = torch.svd_lowrank(tensor[i, j, :, :], q=effective_r, niter=2, M=None)
-            UU[i, j, :, :] = U[:, 0:effective_r]
-            VVT[i, j, :, :] = V[:, 0:effective_r].T
+    # torch.svd_lowrank draws a random projection internally, so its result (and hence the
+    # downstream clustering and scatter_index) depends on the global RNG state. Because the
+    # AdaMSS adapter is rebuilt from the base weights when an adapter is loaded, a different
+    # RNG state at load time would produce a different scatter_index than at save time and the
+    # reloaded adapter would not reproduce its outputs. Seed a forked RNG so the decomposition
+    # is deterministic and reproducible across save/load; this matches the fixed seed already
+    # used for KMeans in clustering_Z and leaves the global RNG stream untouched.
+    fork_devices = [device] if torch.device(device).type == "cuda" else []
+    with torch.random.fork_rng(devices=fork_devices):
+        torch.manual_seed(0)
+        for i in range(B):
+            for j in range(C):
+                U, _, V = torch.svd_lowrank(tensor[i, j, :, :], q=effective_r, niter=2, M=None)
+                UU[i, j, :, :] = U[:, 0:effective_r]
+                VVT[i, j, :, :] = V[:, 0:effective_r].T
     # Return computed matrices (important: ensure callers receive VVT and UU)
     return VVT, UU
 
diff --git a/tests/test_adamss_asa.py b/tests/test_adamss_asa.py
index 9a305e4ba4..fea1ee62fe 100644
--- a/tests/test_adamss_asa.py
+++ b/tests/test_adamss_asa.py
@@ -7,10 +7,12 @@
 - update_and_allocate: full ASA flow (accumulate → global mask → reset)
 """
 
+import copy
+
 import torch
 from torch import nn
 
-from peft import AdamssConfig, get_peft_model
+from peft import AdamssConfig, PeftModel, get_peft_model
 from peft.tuners.adamss.layer import AdamssLayer
 
 
@@ -270,3 +272,40 @@ def test_asa_disabled_is_noop(self):
         for layer in layers:
             for p in layer.adamss_A["default"]:
                 assert p.requires_grad
+
+
+class TestAdamssSaveLoad:
+    def test_save_load_reproduces_output(self, tmp_path):
+        # Regression test: a trained AdaMSS adapter must reproduce its outputs after
+        # save_pretrained/from_pretrained. The adapter is rebuilt from the base weights on load,
+        # so the SVD-based subspace allocation (and the resulting scatter_index used to place each
+        # subspace's contribution into the output) must be deterministic; otherwise the restored
+        # weights are scattered to the wrong output dimensions and the output changes.
+        torch.manual_seed(0)
+        base = SimpleMLP()
+        config = AdamssConfig(target_modules=["lin0", "lin1"], r=8, num_subspaces=4, subspace_rank=1)
+        model = get_peft_model(copy.deepcopy(base), config)
+        model.eval()
+
+        # Simulate training so the adapter is non-trivial (AdaMSS initializes B=0, so an untrained
+        # adapter is a no-op and would reproduce trivially even if the mapping were wrong).
+        with torch.no_grad():
+            for p in model.parameters():
+                if p.requires_grad:
+                    p.add_(torch.randn_like(p) * 0.1 + 0.05)
+
+        x = torch.randn(4, 20)
+        with torch.no_grad():
+            out_before = model(x)
+
+        model.save_pretrained(tmp_path)
+
+        # Change the global RNG state to mimic loading in a fresh process; the reconstructed
+        # subspace allocation must not depend on it.
+        torch.manual_seed(12345)
+        reloaded = PeftModel.from_pretrained(copy.deepcopy(base), tmp_path)
+        reloaded.eval()
+        with torch.no_grad():
+            out_after = reloaded(x)
+
+        assert torch.allclose(out_before, out_after, atol=1e-5, rtol=1e-4)

From 9b7e1731ac912952466259abcf33333d1e5721d0 Mon Sep 17 00:00:00 2001
From: AshNicolus <yashnema52@gmail.com>
Date: Tue, 9 Jun 2026 16:15:15 +0530
Subject: [PATCH 2/3] Address review: configurable random_seed for AdaMSS/PSOFT
 SVD init; generic save/load roundtrip test

---
 src/peft/tuners/adamss/config.py | 12 ++++++++++
 src/peft/tuners/adamss/layer.py  |  2 +-
 src/peft/tuners/adamss/utils.py  | 18 +++++++-------
 src/peft/tuners/psoft/config.py  | 11 +++++++++
 src/peft/tuners/psoft/layer.py   | 15 ++++++++++--
 tests/test_adamss_asa.py         | 41 +-------------------------------
 tests/test_custom_models.py      | 34 ++++++++++++++++++++++++++
 7 files changed, 82 insertions(+), 51 deletions(-)

diff --git a/src/peft/tuners/adamss/config.py b/src/peft/tuners/adamss/config.py
index 0a041c6651..641ff137f4 100644
--- a/src/peft/tuners/adamss/config.py
+++ b/src/peft/tuners/adamss/config.py
@@ -317,6 +317,18 @@ class AdamssConfig(PeftConfig):
             )
         },
     )
+    random_seed: int = field(
+        default=0,
+        metadata={
+            "help": (
+                "Seed for the random projection used by the SVD (torch.svd_lowrank) during initialization. "
+                "AdaMSS rebuilds its subspace decomposition from the base weights when an adapter is loaded, so a "
+                "fixed seed is required for the reloaded adapter to reproduce its trained outputs. The seed is stored "
+                "in the adapter config and reused on load; change it to obtain a different (but still reproducible) "
+                "subspace allocation. Default: 0."
+            )
+        },
+    )
 
     def __post_init__(self):
         self.peft_type = PeftType.ADAMSS
diff --git a/src/peft/tuners/adamss/layer.py b/src/peft/tuners/adamss/layer.py
index 2a0ba296b3..dc08b463c5 100644
--- a/src/peft/tuners/adamss/layer.py
+++ b/src/peft/tuners/adamss/layer.py
@@ -175,7 +175,7 @@ def update_layer(
 
         # Perform SVD decomposition with diagnostics in case of failure
         try:
-            res = slice_pca(weight_tensor, r, device, torch.float32)
+            res = slice_pca(weight_tensor, r, device, torch.float32, random_seed=config.random_seed)
         except Exception as e:
             raise RuntimeError(
                 f"slice_pca raised an exception for layer {adapter_name} (shape={tuple(weight_tensor.shape)}, dtype={weight_tensor.dtype}, device={device}): {e}"
diff --git a/src/peft/tuners/adamss/utils.py b/src/peft/tuners/adamss/utils.py
index 58626e0eeb..4f237aa318 100644
--- a/src/peft/tuners/adamss/utils.py
+++ b/src/peft/tuners/adamss/utils.py
@@ -15,7 +15,7 @@
 import torch
 
 
-def slice_pca(tensor, r, device, dtype=torch.float32):
+def slice_pca(tensor, r, device, dtype=torch.float32, random_seed=0):
     """
     Perform slice-wise PCA (SVD) on 4D tensor.
 
@@ -24,6 +24,8 @@ def slice_pca(tensor, r, device, dtype=torch.float32):
         r: rank for low-rank approximation
         device: computation device
         dtype: data type
+        random_seed: seed for the random projection used by ``torch.svd_lowrank``, so the decomposition is
+            deterministic and reproducible across save/load
 
     Returns:
         VVT: Right singular vectors (B, C, r, W) UU: Left singular vectors (B, C, H, r)
@@ -39,15 +41,15 @@ def slice_pca(tensor, r, device, dtype=torch.float32):
     VVT = torch.zeros(B, C, effective_r, W, dtype=dtype, device=device)
 
     # torch.svd_lowrank draws a random projection internally, so its result (and hence the
-    # downstream clustering and scatter_index) depends on the global RNG state. Because the
-    # AdaMSS adapter is rebuilt from the base weights when an adapter is loaded, a different
-    # RNG state at load time would produce a different scatter_index than at save time and the
-    # reloaded adapter would not reproduce its outputs. Seed a forked RNG so the decomposition
-    # is deterministic and reproducible across save/load; this matches the fixed seed already
-    # used for KMeans in clustering_Z and leaves the global RNG stream untouched.
+    # downstream clustering and scatter_index) depends on the RNG state. Because the AdaMSS
+    # adapter is rebuilt from the base weights when it is loaded, a different RNG state at load
+    # time would produce a different scatter_index than at save time and the reloaded adapter
+    # would not reproduce its outputs. Seed a forked RNG with the configurable ``random_seed`` so
+    # the decomposition is deterministic and reproducible across save/load (torch.svd_lowrank does
+    # not accept a generator argument); fork_rng leaves the global RNG stream untouched.
     fork_devices = [device] if torch.device(device).type == "cuda" else []
     with torch.random.fork_rng(devices=fork_devices):
-        torch.manual_seed(0)
+        torch.manual_seed(random_seed)
         for i in range(B):
             for j in range(C):
                 U, _, V = torch.svd_lowrank(tensor[i, j, :, :], q=effective_r, niter=2, M=None)
diff --git a/src/peft/tuners/psoft/config.py b/src/peft/tuners/psoft/config.py
index b8f6a731d5..b034d5075f 100644
--- a/src/peft/tuners/psoft/config.py
+++ b/src/peft/tuners/psoft/config.py
@@ -159,6 +159,17 @@ class PsoftConfig(PeftConfig):
             "help": "Number of power iterations used by torch.svd_lowrank when psoft_svd='lowrank'. Only used when psoft_svd='lowrank'. "
         },
     )
+    random_seed: int = field(
+        default=0,
+        metadata={
+            "help": (
+                "Seed for the random projection used by torch.svd_lowrank when psoft_svd='lowrank'. PSOFT rebuilds "
+                "its SVD-based A/B initialization from the base weights when an adapter is loaded, so a fixed seed is "
+                "required for the reloaded adapter to reproduce its trained outputs. The seed is stored in the adapter "
+                "config and reused on load. Only used when psoft_svd='lowrank'. Default: 0."
+            )
+        },
+    )
     psoft_orth: bool = field(
         default=True,
         metadata={
diff --git a/src/peft/tuners/psoft/layer.py b/src/peft/tuners/psoft/layer.py
index 121c0a61d2..5ce45b2539 100644
--- a/src/peft/tuners/psoft/layer.py
+++ b/src/peft/tuners/psoft/layer.py
@@ -205,6 +205,7 @@ def __init__(self, base_layer: nn.Module, **kwargs) -> None:
         self.psoft_dropout = nn.ModuleDict({})
         self.psoft_svd: dict[str, str] = {}
         self.psoft_svd_lowrank_niter: dict[str, int] = {}
+        self.random_seed: dict[str, int] = {}
         self.ab_svd_init: dict[str, Optional[str]] = {}
 
         # per-adapter trainable module
@@ -251,6 +252,7 @@ def update_layer(self, adapter_name: str, config: PsoftConfig, **kwargs: Any) ->
         self.ab_svd_init[adapter_name] = config.ab_svd_init
         self.psoft_svd[adapter_name] = config.psoft_svd
         self.psoft_svd_lowrank_niter[adapter_name] = config.psoft_svd_lowrank_niter
+        self.random_seed[adapter_name] = config.random_seed
 
         self.psoft_R[adapter_name] = OrthLayer(
             size=r,
@@ -290,6 +292,7 @@ def _build_psoft_ab_cache_buffers(self, adapter_name: str, init_type: str) -> No
                 r,
                 svd_mode=self.psoft_svd[adapter_name],
                 niter=self.psoft_svd_lowrank_niter[adapter_name],
+                random_seed=self.random_seed[adapter_name],
             )
 
             Sr_scaled = Sr / self.scaling[adapter_name]
@@ -309,7 +312,7 @@ def _build_psoft_ab_cache_buffers(self, adapter_name: str, init_type: str) -> No
 
             self._set_psoft_ab_cache_buffers(adapter_name, A, B)
 
-    def _compute_svd_factors(self, weight: torch.Tensor, r: int, *, svd_mode: str, niter: int):
+    def _compute_svd_factors(self, weight: torch.Tensor, r: int, *, svd_mode: str, niter: int, random_seed: int = 0):
         # weight: (out, in) fp32
         if svd_mode == "full":
             U, S, Vh = torch.linalg.svd(weight.data, full_matrices=False)
@@ -317,7 +320,15 @@ def _compute_svd_factors(self, weight: torch.Tensor, r: int, *, svd_mode: str, n
             Sr = S[:r]  # (r,)
             Uhr = Vh[:r, :]  # (r, in)
         elif svd_mode == "lowrank":
-            U, S, V = svd_lowrank(weight.data, q=r, niter=niter)  # V: (in, r)
+            # torch.svd_lowrank uses a random projection, so the A/B initialization it produces depends on the
+            # RNG state. Because that initialization is rebuilt from the base weights when an adapter is loaded,
+            # seed a forked RNG with the configurable random_seed to make it deterministic and reproducible across
+            # save/load (torch.svd_lowrank does not accept a generator argument). fork_rng leaves the global RNG
+            # stream untouched.
+            fork_devices = [weight.device] if weight.device.type == "cuda" else []
+            with torch.random.fork_rng(devices=fork_devices):
+                torch.manual_seed(random_seed)
+                U, S, V = svd_lowrank(weight.data, q=r, niter=niter)  # V: (in, r)
             Vr = U[:, :r]
             Sr = S[:r]
             Uhr = V[:, :r].t()  # (r, in)
diff --git a/tests/test_adamss_asa.py b/tests/test_adamss_asa.py
index fea1ee62fe..9a305e4ba4 100644
--- a/tests/test_adamss_asa.py
+++ b/tests/test_adamss_asa.py
@@ -7,12 +7,10 @@
 - update_and_allocate: full ASA flow (accumulate → global mask → reset)
 """
 
-import copy
-
 import torch
 from torch import nn
 
-from peft import AdamssConfig, PeftModel, get_peft_model
+from peft import AdamssConfig, get_peft_model
 from peft.tuners.adamss.layer import AdamssLayer
 
 
@@ -272,40 +270,3 @@ def test_asa_disabled_is_noop(self):
         for layer in layers:
             for p in layer.adamss_A["default"]:
                 assert p.requires_grad
-
-
-class TestAdamssSaveLoad:
-    def test_save_load_reproduces_output(self, tmp_path):
-        # Regression test: a trained AdaMSS adapter must reproduce its outputs after
-        # save_pretrained/from_pretrained. The adapter is rebuilt from the base weights on load,
-        # so the SVD-based subspace allocation (and the resulting scatter_index used to place each
-        # subspace's contribution into the output) must be deterministic; otherwise the restored
-        # weights are scattered to the wrong output dimensions and the output changes.
-        torch.manual_seed(0)
-        base = SimpleMLP()
-        config = AdamssConfig(target_modules=["lin0", "lin1"], r=8, num_subspaces=4, subspace_rank=1)
-        model = get_peft_model(copy.deepcopy(base), config)
-        model.eval()
-
-        # Simulate training so the adapter is non-trivial (AdaMSS initializes B=0, so an untrained
-        # adapter is a no-op and would reproduce trivially even if the mapping were wrong).
-        with torch.no_grad():
-            for p in model.parameters():
-                if p.requires_grad:
-                    p.add_(torch.randn_like(p) * 0.1 + 0.05)
-
-        x = torch.randn(4, 20)
-        with torch.no_grad():
-            out_before = model(x)
-
-        model.save_pretrained(tmp_path)
-
-        # Change the global RNG state to mimic loading in a fresh process; the reconstructed
-        # subspace allocation must not depend on it.
-        torch.manual_seed(12345)
-        reloaded = PeftModel.from_pretrained(copy.deepcopy(base), tmp_path)
-        reloaded.eval()
-        with torch.no_grad():
-            out_after = reloaded(x)
-
-        assert torch.allclose(out_before, out_after, atol=1e-5, rtol=1e-4)
diff --git a/tests/test_custom_models.py b/tests/test_custom_models.py
index a528171ca6..c96c41795f 100644
--- a/tests/test_custom_models.py
+++ b/tests/test_custom_models.py
@@ -2553,6 +2553,40 @@ def test_only_params_are_updated(self, test_name, model_id, config_cls, config_k
             else:
                 assert torch.allclose(param_before, param_after, atol=tol, rtol=tol)
 
+    @pytest.mark.parametrize("test_name, model_id, config_cls, config_kwargs", TEST_CASES)
+    def test_save_load_roundtrip(self, test_name, model_id, config_cls, config_kwargs, tmp_path):
+        # Roundtrip check: a loaded adapter must reproduce the pre-save forward output; all passes run in eval mode
+        X = self.prepare_inputs_for_testing()
+        model = self.transformers_class.from_pretrained(model_id).to(self.torch_device).eval()
+        with torch.inference_mode():
+            output_base = model(**X)
+
+        config_kwargs = set_init_weights_false(config_cls, config_kwargs)
+        config = config_cls(
+            base_model_name_or_path=model_id,
+            **config_kwargs,
+        )
+        torch.manual_seed(0)
+        model = get_peft_model(model, config)
+        model.eval()
+        with torch.inference_mode():
+            output_before = model(**X)
+
+        # sanity check: the adapter must change the output, otherwise the comparison after loading is vacuous
+        atol, rtol = 1e-5, 1e-5
+        assert not torch.allclose(output_base, output_before, atol=atol, rtol=rtol)
+
+        model.save_pretrained(tmp_path)
+        del model
+
+        model = self.transformers_class.from_pretrained(model_id).to(self.torch_device)
+        torch.manual_seed(54321)  # ensure that the seed is different from what was used when get_peft_model was called
+        model = PeftModel.from_pretrained(model, tmp_path).eval()
+        with torch.inference_mode():
+            output_after = model(**X)
+
+        assert torch.allclose(output_before, output_after, atol=atol, rtol=rtol)
+
     @pytest.mark.parametrize("test_name, model_id, config_cls, config_kwargs", TEST_CASES)
     def test_parameters_after_loading_model(self, test_name, model_id, config_cls, config_kwargs):
         # An explicit test that when loading a trained model, the parameters are loaded correctly

From 00659c6f443721fcf9d8a2134a3c93c13e8fceb0 Mon Sep 17 00:00:00 2001
From: AshNicolus <yashnema52@gmail.com>
Date: Tue, 9 Jun 2026 18:30:42 +0530
Subject: [PATCH 3/3] Trim verbose comments and config help text per review

---
 src/peft/tuners/adamss/config.py |  7 ++-----
 src/peft/tuners/adamss/utils.py  | 13 +++++--------
 src/peft/tuners/psoft/config.py  |  7 +++----
 src/peft/tuners/psoft/layer.py   |  6 ++----
 4 files changed, 12 insertions(+), 21 deletions(-)

diff --git a/src/peft/tuners/adamss/config.py b/src/peft/tuners/adamss/config.py
index 641ff137f4..7a998fe066 100644
--- a/src/peft/tuners/adamss/config.py
+++ b/src/peft/tuners/adamss/config.py
@@ -321,11 +321,8 @@ class AdamssConfig(PeftConfig):
         default=0,
         metadata={
             "help": (
-                "Seed for the random projection used by the SVD (torch.svd_lowrank) during initialization. "
-                "AdaMSS rebuilds its subspace decomposition from the base weights when an adapter is loaded, so a "
-                "fixed seed is required for the reloaded adapter to reproduce its trained outputs. The seed is stored "
-                "in the adapter config and reused on load; change it to obtain a different (but still reproducible) "
-                "subspace allocation. Default: 0."
+                "Seed used to deterministically create and rebuild the adapter weights, so that a saved adapter "
+                "reproduces its outputs after loading. Default: 0."
             )
         },
     )
diff --git a/src/peft/tuners/adamss/utils.py b/src/peft/tuners/adamss/utils.py
index 4f237aa318..e8600cbe6c 100644
--- a/src/peft/tuners/adamss/utils.py
+++ b/src/peft/tuners/adamss/utils.py
@@ -24,7 +24,7 @@ def slice_pca(tensor, r, device, dtype=torch.float32, random_seed=0):
         r: rank for low-rank approximation
         device: computation device
         dtype: data type
-        random_seed: seed for the random projection used by ``torch.svd_lowrank``, so the decomposition is
+        random_seed: seed for the random projection used by `torch.svd_lowrank`, so the decomposition is
             deterministic and reproducible across save/load
 
     Returns:
@@ -40,13 +40,10 @@ def slice_pca(tensor, r, device, dtype=torch.float32, random_seed=0):
     UU = torch.zeros(B, C, H, effective_r, dtype=dtype, device=device)
     VVT = torch.zeros(B, C, effective_r, W, dtype=dtype, device=device)
 
-    # torch.svd_lowrank draws a random projection internally, so its result (and hence the
-    # downstream clustering and scatter_index) depends on the RNG state. Because the AdaMSS
-    # adapter is rebuilt from the base weights when it is loaded, a different RNG state at load
-    # time would produce a different scatter_index than at save time and the reloaded adapter
-    # would not reproduce its outputs. Seed a forked RNG with the configurable ``random_seed`` so
-    # the decomposition is deterministic and reproducible across save/load (torch.svd_lowrank does
-    # not accept a generator argument); fork_rng leaves the global RNG stream untouched.
+    # torch.svd_lowrank draws a random projection internally, so its result (and hence the downstream
+    # clustering and scatter_index) depends on the RNG state. Seed a forked RNG with random_seed so the result is
+    # deterministic (svd_lowrank takes no generator argument). fork_rng restores the CPU RNG and the devices in
+    # fork_devices on exit; torch.manual_seed also reseeds any other accelerator RNGs, which are not restored.
     fork_devices = [device] if torch.device(device).type == "cuda" else []
     with torch.random.fork_rng(devices=fork_devices):
         torch.manual_seed(random_seed)
diff --git a/src/peft/tuners/psoft/config.py b/src/peft/tuners/psoft/config.py
index b034d5075f..41d4e3c93b 100644
--- a/src/peft/tuners/psoft/config.py
+++ b/src/peft/tuners/psoft/config.py
@@ -163,10 +163,9 @@ class PsoftConfig(PeftConfig):
         default=0,
         metadata={
             "help": (
-                "Seed for the random projection used by torch.svd_lowrank when psoft_svd='lowrank'. PSOFT rebuilds "
-                "its SVD-based A/B initialization from the base weights when an adapter is loaded, so a fixed seed is "
-                "required for the reloaded adapter to reproduce its trained outputs. The seed is stored in the adapter "
-                "config and reused on load. Only used when psoft_svd='lowrank'. Default: 0."
+                "Seed used to deterministically create and rebuild the adapter weights when psoft_svd='lowrank', so "
+                "that a saved adapter reproduces its outputs after loading. Only used when psoft_svd='lowrank'. "
+                "Default: 0."
             )
         },
     )
diff --git a/src/peft/tuners/psoft/layer.py b/src/peft/tuners/psoft/layer.py
index 5ce45b2539..d2fa8fafbb 100644
--- a/src/peft/tuners/psoft/layer.py
+++ b/src/peft/tuners/psoft/layer.py
@@ -321,10 +321,8 @@ def _compute_svd_factors(self, weight: torch.Tensor, r: int, *, svd_mode: str, n
             Uhr = Vh[:r, :]  # (r, in)
         elif svd_mode == "lowrank":
             # torch.svd_lowrank uses a random projection, so the A/B initialization it produces depends on the
-            # RNG state. Because that initialization is rebuilt from the base weights when an adapter is loaded,
-            # seed a forked RNG with the configurable random_seed to make it deterministic and reproducible across
-            # save/load (torch.svd_lowrank does not accept a generator argument). fork_rng leaves the global RNG
-            # stream untouched.
+            # RNG state. Seed a forked RNG with random_seed to make it deterministic (svd_lowrank takes no generator
+            # argument). fork_rng restores only the CPU RNG and fork_devices; manual_seed also reseeds other devices.
             fork_devices = [weight.device] if weight.device.type == "cuda" else []
             with torch.random.fork_rng(devices=fork_devices):
                 torch.manual_seed(random_seed)