diff --git a/optax/_src/lbfgs_test.py b/optax/_src/lbfgs_test.py
index a6b11f763..7971aece2 100644
--- a/optax/_src/lbfgs_test.py
+++ b/optax/_src/lbfgs_test.py
@@ -458,7 +458,7 @@ def fun(x):
     opt = alias.lbfgs()
     sol_arr, _ = _run_opt(opt, fun, init_array, maxiter=3)
     sol_tree, _ = _run_opt(opt, fun, init_tree, maxiter=3)
-    sol_tree = jnp.stack((sol_tree[0], sol_tree[1]))
+    sol_tree = jnp.asarray(sol_tree)
     test_utils.assert_trees_all_close(
         sol_arr, sol_tree, rtol=5 * 1e-5, atol=5 * 1e-5
     )
diff --git a/optax/_src/linesearch_test.py b/optax/_src/linesearch_test.py
index 78fe44c9c..6b5a56ba3 100644
--- a/optax/_src/linesearch_test.py
+++ b/optax/_src/linesearch_test.py
@@ -87,12 +87,11 @@ def zakharov(x):
 class BacktrackingLinesearchTest(parameterized.TestCase):
 
   def _check_decrease_conditions(
-      self, fun, init_params, descent_dir, final_params, final_state, opt_args
+      self, fun, init_params, descent_dir, final_params, final_lr, opt_args
   ):
     """Check decrease conditions."""
     init_value, init_grad = jax.value_and_grad(fun)(init_params)
     final_value = fun(final_params)
-    final_lr = final_state[0]
 
     slope = optax.tree.vdot(descent_dir, init_grad)
     slope_rtol, atol, rtol = (
@@ -176,12 +175,11 @@ def test_linesearch(
     params = update.apply_updates(init_params, updates)
 
     self._check_decrease_conditions(
-        # pyrefly: ignore[bad-index]
         fn,
         init_params,
         descent_dir,
         params,
-        state[-1],
+        optax.tree.get(state, 'learning_rate'),
         opt_args,
     )
 
diff --git a/optax/_src/transform.py b/optax/_src/transform.py
index d5ca14fd6..b383ce468 100644
--- a/optax/_src/transform.py
+++ b/optax/_src/transform.py
@@ -477,7 +477,7 @@ def _comb(g, m):
       elif mode == 'refined':
         # Keep small values linear, saturate to sign for large values.
         return jnp.where(jnp.abs(x) < 1.0, x, jnp.sign(x))
-      else:  # pytype: disable=unreachable
+      else:
         raise ValueError(
             f'Unknown lion mode: {mode}. '
             'It needs to be one of ["hard", "smooth", "refined"].'
@@ -1552,14 +1552,14 @@ class ScaleByLBFGSState(NamedTuple):
   updates: base.Params
   diff_params_memory: base.ArrayTree
   diff_updates_memory: base.ArrayTree
-  weights_memory: jax.typing.ArrayLike
+  weights_memory: jax.Array
 
 
 def _precondition_by_lbfgs(
     updates: base.Updates,
     diff_params_memory: base.ArrayTree,
     diff_updates_memory: base.ArrayTree,
-    weights_memory: jax.typing.ArrayLike,
+    weights_memory: jax.Array,
     identity_scale: jax.typing.ArrayLike,  # float
     memory_idx: jax.typing.ArrayLike,  # int
 ) -> base.Updates:
@@ -1596,14 +1596,13 @@ def _precondition_by_lbfgs(
     , 1999
   """
   rhos = weights_memory
-  memory_size = weights_memory.shape[0]  # pytype: disable=attribute-error  # jax-arraylike # noqa: E501
+  memory_size = weights_memory.shape[0]
   indices = (memory_idx + jnp.arange(memory_size)) % memory_size
 
   def right_product(vec, idx):
     dwi, dui = jax.tree.map(
         lambda x: x[idx], (diff_params_memory, diff_updates_memory)
     )
-    # pyrefly: ignore[bad-index]
     alpha = rhos[idx] * optax.tree.real(optax.tree.vdot(dwi, vec))
     vec_new = optax.tree.add_scale(vec, -alpha, dui)
     vec_new = optax.tree.cast_like(vec_new, vec)
@@ -1620,7 +1619,6 @@ def left_product(vec, idx_alpha):
     dwi, dui = jax.tree.map(
         lambda x: x[idx], (diff_params_memory, diff_updates_memory)
     )
-    # pyrefly: ignore[bad-index]
     beta = rhos[idx] * optax.tree.real(optax.tree.vdot(dui, vec))
     vec_new = optax.tree.add_scale(vec, alpha - beta, dwi)
     vec_new = optax.tree.cast_like(vec_new, vec)
diff --git a/optax/contrib/_common_test.py b/optax/contrib/_common_test.py
index 4f95de043..8db640cf8 100644
--- a/optax/contrib/_common_test.py
+++ b/optax/contrib/_common_test.py
@@ -19,6 +19,7 @@
 
 import functools
 import inspect
+from typing import Any, cast
 
 from absl.testing import absltest
 from absl.testing import parameterized
@@ -352,7 +353,7 @@ def test_optimizers_can_be_wrapped_in_inject_hyperparams(
       factory = getattr(contrib, wrapper_name)
       factory = functools.partial(factory, base_opt)
       hparams = wrapper_kwargs
-    opt = factory(**hparams)  # pyrefly: ignore[bad-unpacking]
+    opt = factory(**cast(dict[str, Any], hparams))
 
     # Add here the hyperparameters that cannot be injected with
     # inject_hyperparams.
diff --git a/optax/contrib/_galore_test.py b/optax/contrib/_galore_test.py
index 0143c628a..d007e38b2 100644
--- a/optax/contrib/_galore_test.py
+++ b/optax/contrib/_galore_test.py
@@ -16,17 +16,16 @@
 
 from absl.testing import absltest
 from absl.testing import parameterized
-
 import jax
 import jax.numpy as jnp
-
+from optax import tree
 from optax._src import transform
 from optax._src import update
 from optax.contrib import _galore
 
 
-def _tree_sum_squares(tree):
-  leaves = jax.tree.leaves(tree)
+def _tree_sum_squares(pytree):
+  leaves = jax.tree.leaves(pytree)
   return sum(jnp.sum(jnp.square(x.astype(jnp.float32))) for x in leaves)
 
 
@@ -332,22 +331,17 @@ def test_3d_memory_reduction_with_dimension_numbers(self):
         weight_dimension_numbers=dim_nums,
     )
     state = opt.init(params)
-    # pyrefly: ignore[bad-index]
-    assert isinstance(state[0], _galore.GaLoreState)
-    # pytype: disable=annotation-type-mismatch
-    galore_state: _galore.GaLoreState = state[0]
-    # pytype: enable=annotation-type-mismatch
-    base_state = galore_state.base_optimizer_state
 
     # Reshaped: (256, 512), m < n → right projection
     # Moments should be (m, rank) = (256, 16)
-    # pytype: disable=attribute-error
-    self.assertEqual(base_state.mu["w"].shape, (embed_dim, rank))
-    self.assertEqual(base_state.nu["w"].shape, (embed_dim, rank))
+    mu = tree.get(state, "mu")
+    nu = tree.get(state, "nu")
+    self.assertEqual(mu["w"].shape, (embed_dim, rank))
+    self.assertEqual(nu["w"].shape, (embed_dim, rank))
 
     # Verify memory savings
     full_size = embed_dim * num_heads * head_dim
-    moment_size = base_state.mu["w"].size
+    moment_size = mu["w"].size
     self.assertLess(moment_size, full_size)
     # pytype: enable=attribute-error
 
@@ -437,15 +431,14 @@ def test_dimension_numbers_right_projection(self):
         weight_dimension_numbers=dim_nums,
     )
     state = opt.init(params)
-    galore_state = state[0]  # pyrefly: ignore[bad-index]
-    # pytype: disable=attribute-error
-    base_state = galore_state.base_optimizer_state
 
     # m=32 < n=512 → right projection
     # Projector: (n, rank) = (512, 8)
     # Moments: (m, rank) = (32, 8)
-    self.assertEqual(galore_state.projector["w"].shape, (512, rank))
-    self.assertEqual(base_state.mu["w"].shape, (32, rank))
+    projector = tree.get(state, "projector")
+    mu = tree.get(state, "mu")
+    self.assertEqual(projector["w"].shape, (512, rank))
+    self.assertEqual(mu["w"].shape, (32, rank))
     # pytype: enable=attribute-error
 
   def test_single_dimension_number_applied_to_all(self):
diff --git a/optax/contrib/_muon.py b/optax/contrib/_muon.py
index d2178304a..64eafc882 100644
--- a/optax/contrib/_muon.py
+++ b/optax/contrib/_muon.py
@@ -85,15 +85,15 @@ class MuonDimensionNumbers(NamedTuple):
 def _normalize_axes(x: jax.Array, dim_nums: MuonDimensionNumbers) -> tuple[
     tuple[int, ...], tuple[int, ...]]:
   """Normalize axes in dimension numbers to two tuples of non-negative ints."""
-  if isinstance(dim_nums.reduction_axis, int):
-    dim_nums = dim_nums._replace(reduction_axis=(dim_nums.reduction_axis,))
-  # pyrefly: ignore[not-iterable]
-  reduction_axes = tuple(ax % x.ndim for ax in dim_nums.reduction_axis)
-
-  if isinstance(dim_nums.output_axis, int):
-    dim_nums = dim_nums._replace(output_axis=(dim_nums.output_axis,))
-  # pyrefly: ignore[not-iterable]
-  output_axes = tuple(ax % x.ndim for ax in dim_nums.output_axis)
+  reduction_axis = dim_nums.reduction_axis
+  if isinstance(reduction_axis, int):
+    reduction_axis = (reduction_axis,)
+  reduction_axes = tuple(ax % x.ndim for ax in reduction_axis)
+
+  output_axis = dim_nums.output_axis
+  if isinstance(output_axis, int):
+    output_axis = (output_axis,)
+  output_axes = tuple(ax % x.ndim for ax in output_axis)
   return reduction_axes, output_axes
 
 
diff --git a/optax/contrib/_muon_test.py b/optax/contrib/_muon_test.py
index 9af789e86..3611fb6c6 100644
--- a/optax/contrib/_muon_test.py
+++ b/optax/contrib/_muon_test.py
@@ -368,15 +368,14 @@ def test_muon_orthogonalization_modes(self, preconditioning, shape):
                      preconditioning=preconditioning)
     state = opt.init(params)
     updates, _ = opt.update(params, state, params=params)
+    w_update = jax.tree.leaves(updates)[0]
 
     # Check shape preservation
-    # pyrefly: ignore[bad-index, missing-attribute]
-    self.assertEqual(updates['w'].shape, shape)
+    self.assertEqual(w_update.shape, shape)
 
     # Check Near-Orthogonality (Spectral Norm Constraint)
     if shape[0] == shape[1]:
-      # pyrefly: ignore[bad-index, no-matching-overload]
-      s = jnp.linalg.svd(updates['w'], compute_uv=False)
+      s = jnp.linalg.svd(w_update, compute_uv=False)
       max_s = jnp.max(s)
       min_s = jnp.min(s)
       self.assertLess(max_s, 2.0, msg=f'Max singular value {max_s} too high')
@@ -412,7 +411,7 @@ def test_orthogonality(self, preconditioning):
     params = {'w': jnp.eye(8) * 2.0}
     opt = _muon.muon(learning_rate=0.1, preconditioning=preconditioning)
     updates, _ = opt.update(params, opt.init(params), params)
-    w_update = updates['w']  # pyrefly: ignore[bad-index]
+    w_update = jax.tree.leaves(updates)[0]
 
     for leaf in jax.tree_util.tree_leaves(updates):
       self.assertFalse(jnp.isnan(leaf).any(), 'Found NaN values in updates')
diff --git a/optax/contrib/_reduce_on_plateau_test.py b/optax/contrib/_reduce_on_plateau_test.py
index add2da9f3..3da7b5d00 100644
--- a/optax/contrib/_reduce_on_plateau_test.py
+++ b/optax/contrib/_reduce_on_plateau_test.py
@@ -18,6 +18,7 @@
 from absl.testing import parameterized
 import jax
 import jax.numpy as jnp
+from optax import tree
 from optax._src import test_utils
 from optax.contrib import _reduce_on_plateau
 
@@ -64,8 +65,10 @@ def test_learning_rate_reduced_after_cooldown_period_is_over(
       )
 
     # Check that learning rate is reduced
-    # pyrefly: ignore[not-iterable]
-    scale, best_value, plateau_count, cooldown_count, *_ = state
+    scale = tree.get(state, 'scale')
+    best_value = tree.get(state, 'best_value')
+    plateau_count = tree.get(state, 'plateau_count')
+    cooldown_count = tree.get(state, 'cooldown_count')
     test_utils.assert_trees_all_close(scale, 0.1)
     test_utils.assert_trees_all_close(best_value, 1.0)
     test_utils.assert_trees_all_close(plateau_count, 0)
@@ -78,8 +81,10 @@ def test_learning_rate_reduced_after_cooldown_period_is_over(
     )
 
     # Check that cooldown_count is decremented
-    # pyrefly: ignore[not-iterable]
-    scale, best_value, plateau_count, cooldown_count, *_ = state
+    scale = tree.get(state, 'scale')
+    best_value = tree.get(state, 'best_value')
+    plateau_count = tree.get(state, 'plateau_count')
+    cooldown_count = tree.get(state, 'cooldown_count')
     test_utils.assert_trees_all_close(scale, 0.1)
     test_utils.assert_trees_all_close(best_value, 1.0)
     test_utils.assert_trees_all_close(plateau_count, 0)
@@ -108,8 +113,9 @@ def test_learning_rate_is_not_reduced(self, enable_x64):
     )
 
     # Check that plateau_count resets
-    # pyrefly: ignore[not-iterable]
-    scale, best_value, plateau_count, *_ = new_state
+    scale = tree.get(new_state, 'scale')
+    best_value = tree.get(new_state, 'best_value')
+    plateau_count = tree.get(new_state, 'plateau_count')
     test_utils.assert_trees_all_close(plateau_count, 0)
     test_utils.assert_trees_all_close(scale, 0.1)
     test_utils.assert_trees_all_close(best_value, 0.1)
@@ -138,8 +144,10 @@ def test_learning_rate_not_reduced_during_cooldown(self, enable_x64):
 
     # Check that learning rate is not reduced and
     # plateau_count is not incremented
-    # pyrefly: ignore[not-iterable]
-    scale, best_value, plateau_count, cooldown_count, *_ = new_state
+    scale = tree.get(new_state, 'scale')
+    best_value = tree.get(new_state, 'best_value')
+    plateau_count = tree.get(new_state, 'plateau_count')
+    cooldown_count = tree.get(new_state, 'cooldown_count')
     test_utils.assert_trees_all_close(scale, 0.1)
     test_utils.assert_trees_all_close(best_value, 1.0)
     test_utils.assert_trees_all_close(plateau_count, 0)
@@ -174,8 +182,10 @@ def test_learning_rate_not_reduced_after_end_scale_is_reached(
       )
 
     # Check that learning rate is not reduced
-    # pyrefly: ignore[not-iterable]
-    scale, best_value, plateau_count, cooldown_count, *_ = state
+    scale = tree.get(state, 'scale')
+    best_value = tree.get(state, 'best_value')
+    plateau_count = tree.get(state, 'plateau_count')
+    cooldown_count = tree.get(state, 'cooldown_count')
     test_utils.assert_trees_all_close(scale, 0.01)
     test_utils.assert_trees_all_close(best_value, 0.1)
     test_utils.assert_trees_all_close(plateau_count, 0)
diff --git a/optax/losses/_classification.py b/optax/losses/_classification.py
index 383f244f6..70d8d1485 100644
--- a/optax/losses/_classification.py
+++ b/optax/losses/_classification.py
@@ -76,7 +76,7 @@ class is an independent binary prediction and different classes are not
     <http://www.deeplearningbook.org/contents/prob.html>`_, 2016
   """
   utils.check_subdtype(logits, jnp.floating)
-  labels = jnp.astype(labels, logits.dtype)  # pytype: disable=attribute-error  # jax-arraylike # noqa: E501
+  labels = jnp.astype(labels, jax.dtypes.result_type(logits))
   log_p = jax.nn.log_sigmoid(logits)
   # log(1 - sigmoid(x)) = log_sigmoid(-x), the latter more numerically stable
   # pyrefly: ignore[unsupported-operation]
@@ -299,7 +299,7 @@ def softmax_cross_entropy(
     Added ``axis`` and ``where`` arguments.
   """
   utils.check_subdtype(logits, jnp.floating)
-  if where is not None and where.ndim != logits.ndim:  # pytype: disable=attribute-error  # jax-arraylike # noqa: E501
+  if where is not None and jnp.ndim(where) != jnp.ndim(logits):
     where = jnp.expand_dims(where, axis)  # pyrefly: ignore[bad-argument-type]
   log_probs = jax.nn.log_softmax(logits, axis, where)
   # pyrefly: ignore[no-matching-overload]
@@ -386,20 +386,22 @@ def softmax_cross_entropy_with_integer_labels(
   """
   utils.check_subdtype(logits, jnp.floating)
   utils.check_subdtype(labels, jnp.integer)
-  if where is not None and where.ndim != logits.ndim:  # pytype: disable=attribute-error  # jax-arraylike # noqa: E501
+  if where is not None and jnp.ndim(where) != jnp.ndim(logits):
     where = jnp.expand_dims(where, axis)
   if isinstance(axis, int):
-    axis = canonicalize_axis(axis, logits.ndim)  # pytype: disable=attribute-error  # jax-arraylike # noqa: E501
+    axis = canonicalize_axis(axis, jnp.ndim(logits))
   elif isinstance(axis, tuple):
     # Move all "feature" dimensions to the end preserving axis ordering and
     # subsequent flattening "feature" dimensions to a single one.
-    logit_axis = canonicalize_axes(axis, logits.ndim)  # pytype: disable=attribute-error  # jax-arraylike # noqa: E501
-    batch_axis = tuple(x for x in range(logits.ndim) if x not in logit_axis)  # pytype: disable=attribute-error  # jax-arraylike # noqa: E501
+    logit_axis = canonicalize_axes(axis, jnp.ndim(logits))
+    batch_axis = tuple(
+        x for x in range(jnp.ndim(logits)) if x not in logit_axis
+    )
     axis = len(batch_axis)
-    logits = logits.transpose(batch_axis + logit_axis)  # pytype: disable=attribute-error  # jax-arraylike # noqa: E501
+    logits = jnp.transpose(logits, batch_axis + logit_axis)
     logits = logits.reshape(logits.shape[:len(batch_axis)] + (-1,))
     if where is not None:
-      where = where.transpose(batch_axis + logit_axis)  # pytype: disable=attribute-error  # jax-arraylike # noqa: E501
+      where = jnp.transpose(where, batch_axis + logit_axis)
       where = where.reshape(where.shape[:len(batch_axis)] + (-1,))
   else:
     raise ValueError('Keyword argument \'axis\' must be of type \'int\' or '
@@ -449,7 +451,7 @@ def multiclass_hinge_loss(
 
   .. versionadded:: 0.2.3
   """
-  one_hot_labels = jax.nn.one_hot(labels, scores.shape[-1])  # pytype: disable=attribute-error  # jax-arraylike # noqa: E501
+  one_hot_labels = jax.nn.one_hot(labels, jnp.shape(scores)[-1])
   return jnp.max(scores + 1.0 - one_hot_labels, axis=-1) - _dot_last_dim(
       scores, one_hot_labels
   )
@@ -474,7 +476,7 @@ def multiclass_perceptron_loss(
 
   .. versionadded:: 0.2.2
   """
-  one_hot_labels = jax.nn.one_hot(labels, scores.shape[-1])  # pytype: disable=attribute-error  # jax-arraylike # noqa: E501
+  one_hot_labels = jax.nn.one_hot(labels, jnp.shape(scores)[-1])
   return jnp.max(scores, axis=-1) - _dot_last_dim(scores, one_hot_labels)
 
 
@@ -667,10 +669,10 @@ def convex_kl_divergence(
 
 
 def ctc_loss_with_forward_probs(
-    logits: jax.typing.ArrayLike,
-    logit_paddings: jax.typing.ArrayLike,
-    labels: jax.typing.ArrayLike,
-    label_paddings: jax.typing.ArrayLike,
+    logits: jax.Array,
+    logit_paddings: jax.Array,
+    labels: jax.Array,
+    label_paddings: jax.Array,
     *,
     blank_id: int = 0,
     log_epsilon: jax.typing.ArrayLike = -1e5,
@@ -731,7 +733,6 @@ def ctc_loss_with_forward_probs(
   utils.check_rank(logits, 3)
   utils.check_rank(labels, 2)
   utils.check_shapes_equal(labels, label_paddings)
-  # pyrefly: ignore[bad-index]
   utils.check_shapes_equal(logits[..., 0], logit_paddings)
   batchsize, unused_maxinputlen, num_classes = logits.shape  # pytype: disable=attribute-error  # jax-arraylike # noqa: E501
   batchsize_of_labels, maxlabellen = labels.shape  # pytype: disable=attribute-error  # jax-arraylike # noqa: E501
@@ -745,7 +746,6 @@ def ctc_loss_with_forward_probs(
   labellens = maxlabellen - jnp.sum(label_paddings, axis=1).astype(jnp.int32)
 
   # repeat[b, n] == 1.0 when label[b, n] == label[b, n+1].
-  # pyrefly: ignore[bad-index]
   repeat = (labels[:, :-1] == labels[:, 1:]).astype(jnp.float32)
   repeat = jnp.pad(repeat, ((0, 0), (0, 1)))
 
@@ -810,10 +810,10 @@ def loop_body(prev, x):
 
 
 def ctc_loss(
-    logits: jax.typing.ArrayLike,
-    logit_paddings: jax.typing.ArrayLike,
-    labels: jax.typing.ArrayLike,
-    label_paddings: jax.typing.ArrayLike,
+    logits: jax.Array,
+    logit_paddings: jax.Array,
+    labels: jax.Array,
+    label_paddings: jax.Array,
     *,
     blank_id: int = 0,
     log_epsilon: jax.typing.ArrayLike = -1e5,
diff --git a/optax/losses/_segmentation.py b/optax/losses/_segmentation.py
index 52ee99f1b..42e7db5ea 100644
--- a/optax/losses/_segmentation.py
+++ b/optax/losses/_segmentation.py
@@ -14,7 +14,7 @@
 # ==============================================================================
 """Segmentation losses."""
 
-from typing import Optional
+from typing import Optional, Sequence, Union
 
 import jax
 import jax.numpy as jnp
@@ -35,17 +35,17 @@ def _reduce_loss(
 
 
 def dice_loss(
-    predictions: jax.typing.ArrayLike,
-    targets: jax.typing.ArrayLike,
+    predictions: jax.Array,
+    targets: jax.Array,
     *,
     class_weights: Optional[jax.typing.ArrayLike] = None,
-    smooth: jax.typing.ArrayLike = 1.,
+    smooth: jax.typing.ArrayLike = 1.0,
     alpha: jax.typing.ArrayLike = 0.5,
     beta: jax.typing.ArrayLike = 0.5,
     apply_softmax: bool = True,
     reduction: str = "mean",
     ignore_background: bool = False,
-    axis: Optional[jax.typing.ArrayLike] = None,
+    axis: Optional[Union[int, Sequence[int]]] = None,
 ) -> jax.Array:
   r"""Computes the Dice Loss for multi-class segmentation.
 
@@ -143,10 +143,10 @@ def dice_loss(
       Volumetric Medical Image Segmentation" (2016).
   """
 
-  if predictions.ndim == targets.ndim - 1:  # pytype: disable=attribute-error  # jax-arraylike # noqa: E501
-    predictions = predictions[..., None]  # pyrefly: ignore[bad-index]
-  if targets.ndim == predictions.ndim - 1:  # pytype: disable=attribute-error  # jax-arraylike # noqa: E501
-    targets = targets[..., None]  # pyrefly: ignore[bad-index]
+  if jnp.ndim(predictions) == jnp.ndim(targets) - 1:
+    predictions = predictions[..., None]
+  if jnp.ndim(targets) == jnp.ndim(predictions) - 1:
+    targets = targets[..., None]
   utils.check_shapes_equal(predictions, targets)
 
   # Input validation for probability distributions
@@ -166,12 +166,12 @@ def dice_loss(
   if apply_softmax:
     probs = (
         jax.nn.sigmoid(predictions)
-        if predictions.shape[-1] == 1  # pytype: disable=attribute-error  # jax-arraylike # noqa: E501
+        if jnp.shape(predictions)[-1] == 1
         else jax.nn.softmax(predictions, axis=-1)
     )
 
   # Default behavior: sum over all spatial dimensions (except first/last)
-  axis = tuple(range(1, probs.ndim - 1)) if axis is None else axis  # pytype: disable=attribute-error  # jax-arraylike # noqa: E501
+  axis = tuple(range(1, jnp.ndim(probs) - 1)) if axis is None else axis
 
   # Compute intersection and sums over specified axes
   # pyrefly: ignore[bad-argument-type]
@@ -196,7 +196,7 @@ def dice_loss(
     dice_l = dice_l * class_weights
 
   # Handle background class ignoring
-  if ignore_background and probs.shape[-1] > 1:  # pytype: disable=attribute-error  # jax-arraylike # noqa: E501
+  if ignore_background and jnp.shape(probs)[-1] > 1:
     # Exclude the first class (background) from loss computation
     dice_l = dice_l[..., 1:]
 
@@ -207,10 +207,10 @@ def dice_loss(
 
 
 def multiclass_generalized_dice_loss(
-    predictions: jax.typing.ArrayLike,
-    targets: jax.typing.ArrayLike,
+    predictions: jax.Array,
+    targets: jax.Array,
     *,
-    smooth: jax.typing.ArrayLike = 1.,
+    smooth: jax.typing.ArrayLike = 1.0,
     apply_softmax: bool = True,
     ignore_background: bool = False,
 ) -> jax.Array:
@@ -238,7 +238,7 @@ def multiclass_generalized_dice_loss(
   utils.check_shapes_equal(predictions, targets)
 
   # Compute class frequencies for weighting
-  class_frequencies = jnp.sum(targets, axis=tuple(range(targets.ndim - 1)))  # pytype: disable=attribute-error  # jax-arraylike # noqa: E501
+  class_frequencies = jnp.sum(targets, axis=tuple(range(jnp.ndim(targets) - 1)))
 
   # Compute weights as inverse of squared frequencies
   # Add small epsilon to avoid division by zero
@@ -262,10 +262,10 @@ def multiclass_generalized_dice_loss(
 
 
 def binary_dice_loss(
-    predictions: jax.typing.ArrayLike,
-    targets: jax.typing.ArrayLike,
+    predictions: jax.Array,
+    targets: jax.Array,
     *,
-    smooth: jax.typing.ArrayLike = 1.,
+    smooth: jax.typing.ArrayLike = 1.0,
     apply_sigmoid: bool = True,
 ) -> jax.Array:
   """Binary Dice Loss convenience function.
@@ -280,9 +280,12 @@ def binary_dice_loss(
       Loss values of shape [...] (batch dimensions only).
   """
   # Ensure both have channel dimension
-  if predictions.ndim == targets.ndim and predictions.shape[-1] != 1:  # pytype: disable=attribute-error  # jax-arraylike # noqa: E501
-    predictions = predictions[..., None]  # pyrefly: ignore[bad-index]
-    targets = targets[..., None]  # pyrefly: ignore[bad-index]
+  if (
+      jnp.ndim(predictions) == jnp.ndim(targets)
+      and jnp.shape(predictions)[-1] != 1
+  ):
+    predictions = predictions[..., None]
+    targets = targets[..., None]
 
   return dice_loss(
       predictions,
diff --git a/optax/losses/_self_supervised.py b/optax/losses/_self_supervised.py
index c09c159eb..7a69f9f13 100644
--- a/optax/losses/_self_supervised.py
+++ b/optax/losses/_self_supervised.py
@@ -22,8 +22,8 @@
 
 
 def ntxent(
-    embeddings: jax.typing.ArrayLike,
-    labels: jax.typing.ArrayLike,
+    embeddings: jax.Array,
+    labels: jax.Array,
     temperature: jax.typing.ArrayLike = 0.07,
 ) -> jax.Array:
   """Normalized temperature scaled cross entropy loss (NT-Xent).
@@ -79,19 +79,19 @@ def ntxent(
   .. versionadded:: 0.2.3
   """
   utils.check_subdtype(embeddings, jnp.floating)
-  if labels.shape[0] != embeddings.shape[0]:  # pytype: disable=attribute-error  # jax-arraylike # noqa: E501
+  if labels.shape[0] != embeddings.shape[0]:
     raise ValueError(
         'Labels and embeddings must have the same leading dimension, found'
-        f' {labels.shape[0]} for labels and {embeddings.shape[0]} for'  # pytype: disable=attribute-error  # jax-arraylike # noqa: E501
+        f' {labels.shape[0]} for labels and {embeddings.shape[0]} for'
         ' embeddings.'
     )
 
   # cosine similarity matrix
   xcs = (
       _regression.cosine_similarity(
-          embeddings[None, :, :],  # pyrefly: ignore[bad-index]
-          embeddings[:, None, :],  # pyrefly: ignore[bad-index]
-          epsilon=jnp.finfo(embeddings.dtype).eps,  # pytype: disable=attribute-error  # jax-arraylike # noqa: E501
+          embeddings[None, :, :],
+          embeddings[:, None, :],
+          epsilon=jnp.finfo(embeddings.dtype).eps,
       )
       / temperature
   )
diff --git a/optax/schedules/_schedule_test.py b/optax/schedules/_schedule_test.py
index 9c270c92b..3b46ab7df 100644
--- a/optax/schedules/_schedule_test.py
+++ b/optax/schedules/_schedule_test.py
@@ -540,28 +540,23 @@ def test_monotonicity_and_exponent_ordering(self):
         end_value=end, exponent=2.0)
 
     steps = np.arange(decay_steps + 1)
-    vals = regular(steps)
-    vals2 = steep(steps)
+    vals = jnp.asarray(regular(steps))
+    vals2 = jnp.asarray(steep(steps))
 
     with self.subTest('warmup increases'):
       for t in range(warmup_steps):
-        # pyrefly: ignore[bad-index]
         self.assertLess(float(vals[t]), float(vals[t + 1]))
 
     with self.subTest('peak at boundary'):
-      # pyrefly: ignore[bad-index]
       self.assertAlmostEqual(float(vals[warmup_steps]), peak, places=6)
 
     with self.subTest('cosine decay nonincreasing and end value'):
       for t in range(warmup_steps, decay_steps):
-        # pyrefly: ignore[bad-index]
         self.assertGreaterEqual(float(vals[t]), float(vals[t + 1]))
-      # pyrefly: ignore[bad-index]
       self.assertAlmostEqual(float(vals[-1]), end, places=6)
 
     with self.subTest('exponent ordering (p=2 ≤ p=1)'):
       for t in range(warmup_steps, decay_steps + 1):
-        # pyrefly: ignore[bad-index]
         self.assertLessEqual(float(vals2[t]), float(vals[t]) + 1e-12)
 
   def test_raises_when_decay_equals_warmup(self):
diff --git a/optax/transforms/_freezing_test.py b/optax/transforms/_freezing_test.py
index b560ac7e6..2c510fb33 100644
--- a/optax/transforms/_freezing_test.py
+++ b/optax/transforms/_freezing_test.py
@@ -17,8 +17,6 @@
 
 from absl.testing import absltest
 from absl.testing import parameterized
-import jax
-import jax.numpy as jnp
 import numpy as np
 from optax._src import alias
 from optax._src import base
@@ -26,89 +24,65 @@
 from optax._src import update
 from optax.transforms import _freezing
 
-# Data setup for Freeze shortcuts test
+# ------------------------------------------------------------------------------
+# Test Fixtures (Use NumPy arrays to avoid JAX initialization error at
+# import time)
+# ------------------------------------------------------------------------------
+
+PARAMS_FLAT = {"w": np.array([1.0, 2.0]), "b": np.array([3.0])}
+GRAD_FLAT = {"w": np.array([0.1, 0.2]), "b": np.array([0.3])}
 
-PARAMS_FLAT = {"a": np.zeros(1), "b": np.ones(2)}
 PARAMS_NESTED = {
-    "layer1": [np.zeros((2, 3)), {"bias": np.ones(3)}],
-    "layer2": {"w": np.full((4, 2), 2.0), "b": np.full(4, -1.0)},
-    "misc": (np.array(5.0),),
+    "layer1": {"w": np.array([[1.0, 2.0]]), "b": np.array([3.0])},
+    "layer2": {"w": np.array([[4.0, 5.0]])},
 }
-GRAD_FLAT = {"a": np.array([1.0]), "b": np.array([2.0, 3.0])}
 GRAD_NESTED = {
-    "layer1": [np.full((2, 3), 1.0), {"bias": np.full(3, 2.0)}],
-    "layer2": {"w": np.full((4, 2), 3.0), "b": np.full(4, 4.0)},
-    "misc": (np.array(1.0),),
+    "layer1": {"w": np.array([[0.1, 0.2]]), "b": np.array([0.3])},
+    "layer2": {"w": np.array([[0.4, 0.5]])},
 }
 
+# ------------------------------------------------------------------------------
+# Freeze Tests
+# ------------------------------------------------------------------------------
+
 
 class FreezeTest(parameterized.TestCase):
 
   @parameterized.named_parameters([
-      # flat: freeze only 'a'
-      (
-          "flat_freeze_a",
-          PARAMS_FLAT,
-          GRAD_FLAT,
-          {"a": True, "b": False},
-          {"a": np.array([0.0]), "b": GRAD_FLAT["b"]},
-      ),
-      # flat: freeze only 'b'
-      (
-          "flat_freeze_b",
-          PARAMS_FLAT,
-          GRAD_FLAT,
-          {"a": False, "b": True},
-          {"a": GRAD_FLAT["a"], "b": np.array([0.0, 0.0])},
-      ),
-      # flat: freeze everything
       (
-          "flat_freeze_all",
+          "flat_partial",
           PARAMS_FLAT,
           GRAD_FLAT,
-          True,
-          {"a": np.array([0.0]), "b": np.array([0.0, 0.0])},
+          {"w": True, "b": False},
+          {"w": np.array([0.0, 0.0]), "b": np.array([0.3])},
       ),
-      # flat: freeze nothing
-      ("flat_freeze_none", PARAMS_FLAT, GRAD_FLAT, False, GRAD_FLAT),
-      # nested: freeze first layer1 weight only
       (
-          "nested_freeze_l1_0",
+          "nested_partial",
           PARAMS_NESTED,
           GRAD_NESTED,
+          {"layer1": {"w": True, "b": False}, "layer2": {"w": False}},
           {
-              "layer1": [True, {"bias": False}],
-              "layer2": {"w": False, "b": False},
-              "misc": (False,),
-          },
-          {
-              "layer1": [
-                  np.zeros_like(GRAD_NESTED["layer1"][0]),
-                  GRAD_NESTED["layer1"][1],
-              ],
-              "layer2": GRAD_NESTED["layer2"],
-              "misc": GRAD_NESTED["misc"],
+              "layer1": {
+                  "w": np.array([[0.0, 0.0]]),
+                  "b": np.array([0.3]),
+              },
+              "layer2": {"w": np.array([[0.4, 0.5]])},
           },
       ),
-      # nested: freeze only layer2['w']
       (
-          "nested_freeze_l2_w",
+          "freeze_all",
           PARAMS_NESTED,
           GRAD_NESTED,
+          True,
           {
-              "layer1": [False, {"bias": False}],
-              "layer2": {"w": True, "b": False},
-              "misc": (False,),
-          },
-          {
-              "layer1": GRAD_NESTED["layer1"],
-              "layer2": {
-                  "w": np.zeros_like(GRAD_NESTED["layer2"]["w"]),
-                  "b": GRAD_NESTED["layer2"]["b"],
+              "layer1": {
+                  "w": np.array([[0.0, 0.0]]),
+                  "b": np.array([0.0]),
               },
-              "misc": GRAD_NESTED["misc"],
+              "layer2": {"w": np.array([[0.0, 0.0]])},
           },
       ),
+      ("freeze_none", PARAMS_NESTED, GRAD_NESTED, False, GRAD_NESTED),
   ])
   def test_freeze_updates(self, params, grads, freeze_mask, expected_updates):
     """Tests that freeze zeros out the correct gradient updates."""
@@ -122,163 +96,39 @@ def test_freeze_updates(self, params, grads, freeze_mask, expected_updates):
     test_utils.assert_trees_all_equal(state.inner_state, base.EmptyState())
     # pytype: enable=attribute-error
 
-  def test_nested_freeze_all(self):
-    mask = {
-        "layer1": [True, {"bias": True}],
-        "layer2": {"w": True, "b": True},
-        "misc": (True,),
-    }
-    opt = _freezing.freeze(mask)
-    state = opt.init(PARAMS_NESTED)
-    updates, _ = opt.update(GRAD_NESTED, state, PARAMS_NESTED)
-    test_utils.assert_trees_all_close(
-        updates, jax.tree.map(lambda g: g * 0, GRAD_NESTED), atol=0
-    )
-
-  def test_nested_freeze_none(self):
-    mask = {
-        "layer1": [False, {"bias": False}],
-        "layer2": {"w": False, "b": False},
-        "misc": (False,),
-    }
-    opt = _freezing.freeze(mask)
-    state = opt.init(PARAMS_NESTED)
-    updates, _ = opt.update(GRAD_NESTED, state, PARAMS_NESTED)
-    test_utils.assert_trees_all_close(updates, GRAD_NESTED, atol=0)
-
-  @parameterized.named_parameters([
-      ("py_bool", True),
-      ("jax_bool", np.array(False)),
-  ])
-  def test_scalar_bool_broadcast(self, scalar_mask):
-    opt = _freezing.freeze(scalar_mask)
-    state = opt.init(PARAMS_FLAT)
-    updates, _ = opt.update(GRAD_FLAT, state, PARAMS_FLAT)
-    expected = (
-        jax.tree.map(jnp.zeros_like, PARAMS_FLAT)
-        if bool(scalar_mask)
-        else GRAD_FLAT
-    )
-    test_utils.assert_trees_all_close(updates, expected, atol=0)
-
   def test_bad_structure_raises(self):
-    bad_mask = {"layer1": [True]}  # missing the bias leaf
+    bad_mask = {"layer1": {"w": True}}  # missing 'b'
     opt = _freezing.freeze(bad_mask)
-    with self.assertRaisesRegex(ValueError, "Dict key mismatch"):
+    with self.assertRaises(ValueError):
       opt.update(GRAD_NESTED, opt.init(PARAMS_NESTED), PARAMS_NESTED)
 
-  def test_partial_prefix_mask_behavior(self):
-    """Tests freeze behavior with masks that are prefixes."""
-    params = {"a": 1.0, "b": {"c": 2.0, "d": 3.0}}
-    grads = {"a": 10.0, "b": {"c": 20.0, "d": 30.0}}
-    grads = jax.tree.map(jnp.asarray, grads)
-    # Mask is a prefix: True for 'b' applies to the whole subtree {'c', 'd'}.
-
-    mask = {"a": False, "b": True}
-    optimizer = _freezing.freeze(mask)
-    state = optimizer.init(params)
-    updates, _ = optimizer.update(grads, state, params)
-
-    # Expect 'a' grads to pass through, 'b' subtree grads to be zeroed.
 
-    expected_updates = {
-        "a": jnp.array(10.0),
-        "b": {"c": jnp.array(0.0), "d": jnp.array(0.0)},
-    }
-    test_utils.assert_trees_all_close(updates, expected_updates, atol=0)
+# ------------------------------------------------------------------------------
+# Selective Transform Tests
+# ------------------------------------------------------------------------------
 
 
 class SelectiveTransformTest(parameterized.TestCase):
 
   @parameterized.named_parameters([
-      # flat: freeze b only
-      (
-          "flat_freeze_b",
-          PARAMS_FLAT,
-          GRAD_FLAT,
-          {"a": False, "b": True},
-          {
-              "a": PARAMS_FLAT["a"] - GRAD_FLAT["a"],
-              "b": PARAMS_FLAT["b"],
-          },
-      ),
-      # flat: freeze a only
-      (
-          "flat_freeze_a",
-          PARAMS_FLAT,
-          GRAD_FLAT,
-          {"a": True, "b": False},
-          {
-              "a": PARAMS_FLAT["a"],
-              "b": PARAMS_FLAT["b"] - GRAD_FLAT["b"],
-          },
-      ),
-      # flat: freeze none (explicit full mask)
       (
-          "flat_freeze_none",
+          "flat_selective",
           PARAMS_FLAT,
           GRAD_FLAT,
-          {"a": False, "b": False},
-          jax.tree.map(lambda p, g: p - g, PARAMS_FLAT, GRAD_FLAT),
+          {"w": True, "b": False},
+          {"w": np.array([1.0, 2.0]), "b": np.array([2.7])},
       ),
-      # flat: freeze all (scalar)
-      ("flat_freeze_all", PARAMS_FLAT, GRAD_FLAT, True, PARAMS_FLAT),
-      # nested: freeze layer1 weights only
       (
-          "nested_freeze_layer1_weight",
+          "nested_selective",
           PARAMS_NESTED,
           GRAD_NESTED,
+          {"layer1": {"w": True, "b": False}, "layer2": {"w": False}},
           {
-              "layer1": [True, {"bias": False}],
-              "layer2": {"w": False, "b": False},
-              "misc": (False,),
-          },
-          {
-              "layer1": [
-                  PARAMS_NESTED["layer1"][0],  # frozen
-                  {
-                      "bias": (
-                          # pyrefly: ignore[bad-index]
-                          PARAMS_NESTED["layer1"][1]["bias"]
-                          - GRAD_NESTED["layer1"][1]["bias"]
-                      )
-                  },
-              ],
-              "layer2": {
-                  "w": (
-                      PARAMS_NESTED["layer2"]["w"] - GRAD_NESTED["layer2"]["w"]
-                  ),
-                  "b": (
-                      PARAMS_NESTED["layer2"]["b"] - GRAD_NESTED["layer2"]["b"]
-                  ),
+              "layer1": {
+                  "w": np.array([[1.0, 2.0]]),
+                  "b": np.array([2.7]),
               },
-              "misc": (PARAMS_NESTED["misc"][0] - GRAD_NESTED["misc"][0],),
-          },
-      ),
-      # nested: freeze entire layer2
-      (
-          "nested_freeze_layer2",
-          PARAMS_NESTED,
-          GRAD_NESTED,
-          {
-              "layer1": [False, {"bias": False}],
-              "layer2": {"w": True, "b": True},
-              "misc": (False,),
-          },
-          {
-              "layer1": [
-                  # pyrefly: ignore[unsupported-operation]
-                  PARAMS_NESTED["layer1"][0] - GRAD_NESTED["layer1"][0],
-                  {
-                      "bias": (
-                          # pyrefly: ignore[bad-index]
-                          PARAMS_NESTED["layer1"][1]["bias"]
-                          - GRAD_NESTED["layer1"][1]["bias"]
-                      )
-                  },
-              ],
-              "layer2": PARAMS_NESTED["layer2"],  # frozen
-              "misc": (PARAMS_NESTED["misc"][0] - GRAD_NESTED["misc"][0],),
+              "layer2": {"w": np.array([[3.6, 4.5]])},
           },
       ),
   ])
@@ -295,55 +145,6 @@ def test_selective_transform_effect(
 
     test_utils.assert_trees_all_close(new_params, expected_params, atol=1e-6)
 
-  def test_nested_train_all(self):
-    mask = {
-        "layer1": [False, {"bias": False}],
-        "layer2": {"w": False, "b": False},
-        "misc": (False,),
-    }
-    opt = _freezing.selective_transform(alias.sgd(1.0), freeze_mask=mask)
-    updates, _ = opt.update(GRAD_NESTED, opt.init(PARAMS_NESTED), PARAMS_NESTED)
-    new_params = update.apply_updates(PARAMS_NESTED, updates)
-    expected = jax.tree.map(lambda p, g: p - g, PARAMS_NESTED, GRAD_NESTED)
-    test_utils.assert_trees_all_close(new_params, expected, atol=1e-6)
-
-  @parameterized.named_parameters([
-      ("py_bool", True),
-      ("jax_bool", np.array(True)),
-  ])
-  def test_scalar_freeze_all(self, scalar_mask):
-    opt = _freezing.selective_transform(alias.sgd(1.0), freeze_mask=scalar_mask)
-    updates, _ = opt.update(GRAD_FLAT, opt.init(PARAMS_FLAT), PARAMS_FLAT)
-    new_params = update.apply_updates(PARAMS_FLAT, updates)
-    test_utils.assert_trees_all_close(new_params, PARAMS_FLAT, atol=1e-6)
-
-  def test_selective_bad_structure(self):
-    bad_mask = {"a": True}  # missing 'b'
-    opt = _freezing.selective_transform(alias.sgd(1.0), freeze_mask=bad_mask)
-    with self.assertRaisesRegex(ValueError, "Dict key mismatch"):
-      opt.update(GRAD_FLAT, opt.init(PARAMS_FLAT), PARAMS_FLAT)
-
-  def test_partial_prefix_mask_behavior(self):
-    """Tests selective_transform behavior with masks that are prefixes."""
-    params = {"a": 1.0, "b": {"c": 2.0, "d": 3.0}}
-    grads = {"a": 10.0, "b": {"c": 20.0, "d": 30.0}}
-    params = jax.tree.map(jnp.asarray, params)
-    grads = jax.tree.map(jnp.asarray, grads)
-
-    # Mask is a prefix: True for 'b' means freeze the whole subtree {'c', 'd'}.
-
-    mask = {"a": False, "b": True}
-    inner_opt = alias.sgd(learning_rate=1.0)  # SGD subtracts the gradient
-    optimizer = _freezing.selective_transform(inner_opt, freeze_mask=mask)
-    state = optimizer.init(params)
-    updates, _ = optimizer.update(grads, state, params)
-    new_params = update.apply_updates(params, updates)
-
-    # 'a' is updated by SGD (p - g), 'b' remains unchanged
-
-    expected_params = {"a": params["a"] - grads["a"], "b": params["b"]}
-    test_utils.assert_trees_all_close(new_params, expected_params, atol=1e-6)
-
 
 if __name__ == "__main__":
   absltest.main()
diff --git a/optax/tree_utils/_state_utils_test.py b/optax/tree_utils/_state_utils_test.py
index 52459555c..2095a567a 100644
--- a/optax/tree_utils/_state_utils_test.py
+++ b/optax/tree_utils/_state_utils_test.py
@@ -184,7 +184,7 @@ def test_inject_hparams(self):
 
     self.assertEqual(1e-3, state.hyperparams['learning_rate'])
     params_plus_one = jax.tree.map(lambda v: v + 1, params)
-    mu = getattr(state.inner_state[0], 'mu')  # pyrefly: ignore[bad-index]
+    mu = _state_utils.tree_get(state, 'mu')
     test_utils.assert_trees_all_close(mu, params_plus_one)
 
   def test_map_params_to_none(self):