Commit bd8b985
Update on "Use unfused SDPA for short sequences (q_len <= 128 or kv_len <= 128)"
ATT

Differential Revision: [D96044308](https://our.internmc.facebook.com/intern/diff/D96044308/)

[ghstack-poisoned]
2 parents 7914266 + cb3d6f5 commit bd8b985

49 files changed

Lines changed: 1218 additions & 235 deletions
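The dispatch rule named in the commit title (use unfused SDPA when q_len <= 128 or kv_len <= 128) can be sketched as below. This is an illustrative reconstruction, not code from the diff: the function names, the threshold constant, and the pure-Python single-head attention are all assumptions.

```python
import math

# Threshold taken from the commit title: q_len <= 128 or kv_len <= 128.
SHORT_SEQ_THRESHOLD = 128

def use_unfused_sdpa(q_len: int, kv_len: int) -> bool:
    # Short sequences take the step-by-step (unfused) path, where the
    # overhead of a fused kernel tends to outweigh its benefit.
    return q_len <= SHORT_SEQ_THRESHOLD or kv_len <= SHORT_SEQ_THRESHOLD

def unfused_sdpa(q, k, v):
    """Unfused scaled dot-product attention on lists of row vectors:
    explicit matmul -> softmax -> matmul, single head, no masking."""
    d = len(q[0])
    scale = 1.0 / math.sqrt(d)
    out = []
    for qi in q:
        # Scaled dot products of this query against every key.
        scores = [scale * sum(a * b for a, b in zip(qi, kj)) for kj in k]
        # Numerically stable softmax over the key axis.
        m = max(scores)
        exps = [math.exp(s - m) for s in scores]
        z = sum(exps)
        # Weighted sum of the value rows.
        out.append([
            sum(e / z * vj[t] for e, vj in zip(exps, v))
            for t in range(len(v[0]))
        ])
    return out
```

With identical keys the softmax weights are uniform, so the output is the mean of the value rows, which makes the sketch easy to sanity-check by hand.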


.ci/scripts/test_lora.sh

Lines changed: 4 additions & 4 deletions
@@ -139,12 +139,12 @@ EXPECTED_QUANT_PREFIX="<|im_start|>user Calculate 15% of 80?<|im_end|><|im_start
 Okay, so I need to calculate 15% of 80."
 EXPECTED_QUANT_LORA_PREFIX="
 <|im_start|>user Calculate 15% of 80?<|im_end|><|im_start|>assistant
-To calculate 15% of 80, we can multiply 80 by 15/100.
-80 * 15/100 = 12.
-So, 15% of 80 is 12.
+To calculate 15% of 80, we can multiply 80 by 15/100 and then simplify the fraction.
+So, 15% of 80 is equal to (80 * 15) / 100 = 1200 / 100 = 12.
 #### 12
 The answer is: 12<|im_end|>"
+
 # Export Quantized PTE, PTD file, no LoRA.
 # override base.lora_config=null to avoid creating a lora model
 # and loading lora weights.
@@ -204,7 +204,7 @@ fi
 NOW=$(date +"%H:%M:%S")
 echo "Test 4: Quantized, program-data separation lora. Starting to run llama runner at ${NOW}"
 # shellcheck source=/dev/null
-cmake-out/examples/models/llama/llama_main --model_path=qwen_lora_math_q.pte --data_paths="qwen_foundation_q.ptd,qwen_lora_math_q.ptd" --prompt="${PROMPT}" ${RUNTIME_ARGS} > result.txt
+cmake-out/examples/models/llama/llama_main --model_path=qwen_lora_math_q.pte --data_paths="qwen_foundation_q.ptd,qwen_lora_math_q.ptd" --prompt="${PROMPT}" ${RUNTIME_ARGS} --seq_len=104 > result.txt
 NOW=$(date +"%H:%M:%S")
 echo "Finished at ${NOW}"

.github/workflows/_test_backend.yml

Lines changed: 6 additions & 1 deletion
@@ -36,6 +36,11 @@ on:
       required: false
       type: string
       default: linux.4xlarge.memory
+    docker-image:
+      description: 'Docker image for Linux jobs'
+      required: false
+      type: string
+      default: ci-image:executorch-ubuntu-22.04-clang12

 jobs:
   test-backend-linux:
@@ -50,7 +55,7 @@ jobs:
     with:
       ref: ${{ inputs.ref }}
       runner: ${{ inputs.runner-linux }}
-      docker-image: ci-image:executorch-ubuntu-22.04-clang12
+      docker-image: ${{ inputs.docker-image }}
       submodules: recursive
       timeout: ${{ inputs.timeout }}
       upload-artifact: test-report-${{ matrix.flow }}-${{ matrix.suite }}

.github/workflows/build-wheels-aarch64-linux.yml

Lines changed: 0 additions & 2 deletions
@@ -9,8 +9,6 @@ on:
       - examples/**/*
       - pyproject.toml
       - setup.py
-  tags:
-    - ciflow/binaries/*
   push:
     branches:
       - nightly

.github/workflows/build-wheels-linux.yml

Lines changed: 0 additions & 2 deletions
@@ -9,8 +9,6 @@ on:
       - examples/**/*
       - pyproject.toml
       - setup.py
-  tags:
-    - ciflow/binaries/*
   push:
     branches:
       - nightly

.github/workflows/build-wheels-macos.yml

Lines changed: 0 additions & 2 deletions
@@ -9,8 +9,6 @@ on:
       - examples/**/*
       - pyproject.toml
       - setup.py
-  tags:
-    - ciflow/binaries/*
   push:
     branches:
       - nightly

.github/workflows/build-wheels-windows.yml

Lines changed: 0 additions & 2 deletions
@@ -8,8 +8,6 @@ on:
       - examples/**/*
       - pyproject.toml
       - setup.py
-  tags:
-    - ciflow/binaries/*
   push:
     branches:
       - nightly

.github/workflows/test-backend-arm.yml

Lines changed: 1 addition & 0 deletions
@@ -28,3 +28,4 @@ jobs:
     ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
     timeout: 120
     run-linux: true
+    docker-image: ci-image:executorch-ubuntu-22.04-arm-sdk

backends/arm/_passes/decompose_meandim_pass.py

Lines changed: 7 additions & 70 deletions
@@ -16,7 +16,6 @@
 )
 from executorch.backends.arm._passes.size_adjust_input_pass import SizeAdjustInputPass
 from executorch.backends.arm.constants import DQ_OPS, Q_OPS
-from executorch.exir.backend.utils import WhyNoPartitionReporter
 from executorch.exir.dialects._ops import ops as exir_ops
 from executorch.exir.pass_base import ExportPass

@@ -51,14 +50,6 @@ def get_dynamic_meandim_decomposition(op) -> tuple:
     raise RuntimeError(f"Can't get meandim decomposition for op {op}")


-def get_avgpool(op):
-    if op in (exir_ops.edge.aten.mean.dim, exir_ops.edge.aten.mean.default):
-        return exir_ops.edge.aten.avg_pool2d.default
-    if op in (torch.ops.aten.mean.dim, torch.ops.aten.mean.default):
-        return torch.ops.aten.avg_pool2d.default
-    raise RuntimeError(f"Can't get meandim decomposition for op {op}")
-
-
 def get_view(op):
     if op in (exir_ops.edge.aten.mean.dim, exir_ops.edge.aten.mean.default):
         return exir_ops.edge.aten.view_copy.default

@@ -79,23 +70,21 @@ def get_quantization(op):


 class DecomposeMeanDimPass(ArmPass):
-    """Decomposes a meandim into avg_pool and/or sum + mul (1/N).
-
-    ::
+    """Decomposes a meandim into sum + mul (1/N).

-        h, w -> avg_pool
-        n, c -> sum + mul(1/N)
+    Each reduction dimension is handled via REDUCE_SUM followed by
+    multiplication by 1/N, which works on any axis without layout
+    constraints (unlike AVG_POOL2D which only pools over spatial H×W).

     For rank < 4, the input is reshaped to 4D by padding with dim=1 from the
     left.

     Example:
         x = mean_dim(x, (0,2), keepdim=False) # x = (c,h,w)
     Becomes:
-        x = view_copy.default(x, new_shape=(1,c,h,w)) # Reshape to work with avg_pool
-        x = avg_pool2d.default(x, kernel=(1,w), stride=(1,1)) # Reduce w with avg_pool
-        x = sum.dim_IntList(x, dim=1, keepdims=True) # Reduce c with sum
-        x = mul.Tensor(x, 1/c) # Divide by number of channels to get mean
+        x = view_copy.default(x, new_shape=(1,c,h,w)) # Reshape to 4D
+        x = sum.dim_IntList(x, dim=(1,3), keepdims=True) # Reduce c,w with sum
+        x = mul.Tensor(x, 1/(c*w)) # Divide by number of elements to get mean
         x = view_copy.default(x, new_shape=(h)) # Squeeze dims since keepdims = False

     """

@@ -110,14 +99,6 @@ def __init__(self, graph_module, tosa_spec, *args, **kwargs):
         super().__init__(*args, **kwargs)
         self._graph_module = graph_module
         self._tosa_spec = tosa_spec
-        # Lazy import to avoid circular dependency with operator_support
-        from executorch.backends.arm.operator_support.pool_2d_support import (
-            AvgPool2dSupported,
-        )
-
-        self._avg_pool_checker = AvgPool2dSupported(
-            self._tosa_spec, WhyNoPartitionReporter()
-        )

     def call_operator(self, op, args, kwargs, meta, updated=False):
         if op not in (

@@ -168,12 +149,6 @@ def call_operator(self, op, args, kwargs, meta, updated=False):
         x = super().call_operator(view_op, (x, new_shape), {}, meta, True)
         x = self._maybe_insert_q_dq_after(x, meta)

-        # Reduce (h,w) dims by avg pool if possible
-        if not has_symbolic_reduce_dim:
-            x, dims_to_reduce = self._reduce_by_average_pool(
-                op, x, dims_to_reduce, meta
-            )
-
         # Reshape back to 5D if necessary
         if len(input_shape) > 4:
             original_dims = input_shape[:-3]

@@ -259,44 +234,6 @@ def _reduce_by_sum(self, op, input_node, dims, meta):

         return super().call_operator(mul_op, (sum, divisor), {}, meta, True)

-    def _reduce_by_average_pool(self, op, input_node, dims, meta):
-        dims_to_reduce_by_avgpool = [dim for dim in dims if dim >= 2]
-        if len(dims_to_reduce_by_avgpool) == 0:
-            return input_node, dims
-
-        dims_to_reduce_by_sum = [dim for dim in dims if dim < 2]
-
-        avgpool_op = get_avgpool(op)
-        input_shape = input_node.data.size()
-
-        stride = [1, 1]
-        if dims_to_reduce_by_avgpool in ([2, 3], [3, 2]):
-            kernel_size = [input_shape[2], input_shape[3]]
-        elif dims_to_reduce_by_avgpool == [3]:
-            kernel_size = [1, input_shape[3]]
-        elif dims_to_reduce_by_avgpool == [2]:
-            kernel_size = [input_shape[2], 1]
-        else:
-            raise RuntimeError(
-                f"Bad dims {dims_to_reduce_by_avgpool} for {op} decomposition of mean_dim."
-            )
-
-        args = (input_node, kernel_size, stride)
-
-        avg_pool_node = self._graph_module.graph.create_node(
-            "call_function", avgpool_op, args
-        )
-        is_supported = self._avg_pool_checker.is_node_tosa_supported(
-            avg_pool_node, self._tosa_spec
-        )
-
-        if is_supported:
-            out = super().call_operator(avgpool_op, args, {}, meta, True)
-            out = self._maybe_insert_q_dq_after(out, meta)
-            return out, dims_to_reduce_by_sum
-
-        return input_node, dims
-
     def _maybe_insert_q_dq_after(self, op, meta):
         """If the input node of op is a dequant node, insert a q-dq pair after
         op with identical quantization parameters.
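The rewritten docstring's claim — that REDUCE_SUM followed by mul(1/N) reproduces mean over arbitrary dims — can be checked numerically. A minimal sketch using NumPy as a stand-in for the aten ops (view_copy → reshape, sum.dim_IntList → sum, mul.Tensor → *); the shapes follow the docstring example, mean over dims (0, 2) of a (c, h, w) tensor.

```python
import numpy as np

c, h, w = 3, 4, 5
x = np.arange(c * h * w, dtype=np.float64).reshape(c, h, w)

# Reference: the op being decomposed.
expected = x.mean(axis=(0, 2))              # shape (h,)

# The decomposition from the docstring:
y = x.reshape(1, c, h, w)                   # pad rank to 4 from the left
y = y.sum(axis=(1, 3), keepdims=True)       # reduce c and w with sum
y = y * (1.0 / (c * w))                     # divide by number of reduced elements
y = y.reshape(h)                            # squeeze dims since keepdim=False

assert np.allclose(y, expected)
```

Because dims 0 and 2 of the (c, h, w) input map to dims 1 and 3 after padding to 4D, the single sum-and-scale covers both reduction axes at once — the property the pass now relies on instead of routing spatial dims through avg_pool2d.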

backends/arm/_passes/fold_qdq_with_annotated_qparams_pass.py

Lines changed: 63 additions & 35 deletions
@@ -40,6 +40,19 @@ def _get_special_dtype(qspec: QuantArgs) -> TosaSpecialDtype | None:
     return None


+def _merge_qparams(qspec_1: QuantArgs, qspec_2: QuantArgs) -> QuantArgs:
+    """Merge two QuantArgs when inputs are quantized differently.
+
+    Requires same dtype; picks the first's parameters by default.
+    """
+    if qspec_1.dtype != qspec_2.dtype:
+        raise RuntimeError(
+            f"Cannot merge qparams of different dtypes: {qspec_1.dtype} vs {qspec_2.dtype}"
+        )
+    return qspec_1
+
+
 def get_input_qparams(node: Node) -> dict[int, QuantArgs]:
     """Get the input quantization parameters from a node, set by the
     'FoldAndAnnotateQParamsPass'.
@@ -121,57 +134,72 @@ def __init__(
         super().__init__(*args, **kwargs)
         self.exported_program = exported_program

-    def fold_and_annotate_arg(
-        self, graph_module: GraphModule, node: Node, arg_list: list[Node], i: int
-    ) -> None:
-        input_qparams = None
-        nodes_to_remove = set()
+    def _extract_input_params(
+        self, arg_list: list[Node]
+    ) -> tuple[Optional[QuantArgs], set[Node]]:
+        input_qparams: Optional[QuantArgs] = None
+        nodes_to_remove: set[Node] = set()
         for arg in arg_list:
             if not isinstance(arg, Node):
-                return
-
-            arg_quant_params = None
+                return None, set()
+            arg_quant: Optional[QuantArgs] = None
             if arg.target in DQ_OPS:
                 args = arg.args
                 scales = args[1]
                 if (
-                    isinstance(args[1], Node)
+                    isinstance(scales, Node)
                     and self.exported_program is not None
-                    and is_param_node(self.exported_program, args[1])
+                    and is_param_node(self.exported_program, scales)
                 ):
-                    scales = get_param_tensor(self.exported_program, args[1])
+                    scales = get_param_tensor(self.exported_program, scales)
                 zps = args[2]
                 if (
-                    isinstance(args[2], Node)
+                    isinstance(zps, Node)
                     and self.exported_program is not None
-                    and is_param_node(self.exported_program, args[2])
+                    and is_param_node(self.exported_program, zps)
                 ):
-                    zps = get_param_tensor(self.exported_program, args[2])
-                arg_quant_params = QuantArgs.from_operator(
+                    zps = get_param_tensor(self.exported_program, zps)
+                arg_quant = QuantArgs.from_operator(
                     arg.target, (args[0], scales, zps, *args[3:])
                 )
-                # add arg to nodes_to_remove to fold the dq-node
                 nodes_to_remove.add(arg)
-            if input_qparams is not None and input_qparams != arg_quant_params:
-                # Two args are quantized differently
-                raise RuntimeError("Input qparams do not match")
-            input_qparams = arg_quant_params
-        if input_qparams is not None:
-            node.meta["input_qparams"][i] = input_qparams
-            for n in nodes_to_remove:
-                if n.target not in DQ_OPS:
-                    raise RuntimeError(
-                        f"Expected one of {DQ_OPS} dq_op, got {n.target}"
-                    )
+            if arg_quant is not None:
+                if input_qparams is None:
+                    input_qparams = arg_quant
+                elif input_qparams != arg_quant:
+                    input_qparams = _merge_qparams(input_qparams, arg_quant)
+        return input_qparams, nodes_to_remove
+
+    def _annotate_input_params(
+        self,
+        graph_module: GraphModule,
+        node: Node,
+        index: int,
+        input_qparams: QuantArgs,
+        nodes_to_remove: set[Node],
+    ) -> None:
+        node.meta["input_qparams"][index] = input_qparams
+
+        for dq in nodes_to_remove:
+            if dq.target not in DQ_OPS:
+                raise RuntimeError(f"Expected one of {DQ_OPS} dq_op, got {dq.target}")
+            node.replace_input_with(dq, cast(Node, dq.args[0]))
+            if not dq.users:
+                graph_module.graph.erase_node(dq)
+
+        special = _get_special_dtype(input_qparams)
+        if special:
+            node.all_input_nodes[index].meta[TosaSpecialDtype.meta_key()] = special

-                node.replace_input_with(n, cast(Node, n.args[0]))
-                if len(n.users) == 0:
-                    graph_module.graph.erase_node(n)
-            special_dtype = _get_special_dtype(input_qparams)
-            if special_dtype:
-                node.all_input_nodes[i].meta[
-                    TosaSpecialDtype.meta_key()
-                ] = special_dtype
+    def fold_and_annotate_arg(
+        self, graph_module: GraphModule, node: Node, arg_list: list[Node], i: int
+    ) -> None:
+        input_qparams, nodes_to_remove = self._extract_input_params(arg_list)
+        if input_qparams is None:
+            return
+        self._annotate_input_params(
+            graph_module, node, i, input_qparams, nodes_to_remove
+        )

     def _handle_control_flow_node(self, node: Node, graph_module: GraphModule):
         """Fold outmost quant nodes inside submodule.
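The behavioral change introduced by `_merge_qparams` — same-dtype mismatches now merge (first argument's parameters win) instead of raising, while a dtype mismatch still raises — can be sketched with a stand-in class. `FakeQuantArgs` and its fields are illustrative, not the real `QuantArgs`.

```python
from dataclasses import dataclass

@dataclass(frozen=True)
class FakeQuantArgs:
    # Illustrative stand-in for QuantArgs; the real class carries more fields.
    scale: float
    zero_point: int
    dtype: str

def merge_qparams(a: FakeQuantArgs, b: FakeQuantArgs) -> FakeQuantArgs:
    # Mirrors _merge_qparams in the diff: dtype mismatch still raises,
    # otherwise the first argument's parameters are kept.
    if a.dtype != b.dtype:
        raise RuntimeError(
            f"Cannot merge qparams of different dtypes: {a.dtype} vs {b.dtype}"
        )
    return a

merged = merge_qparams(
    FakeQuantArgs(0.1, 0, "int8"), FakeQuantArgs(0.2, 5, "int8")
)
assert merged.scale == 0.1  # first input's parameters win
```

Note the trade-off this encodes: differently quantized same-dtype inputs no longer abort the fold, but the second input's scale and zero point are silently discarded.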

backends/arm/_passes/normalize_while_initial_args_pass.py

Lines changed: 3 additions & 1 deletion
@@ -1,4 +1,4 @@
-# Copyright 2025 Arm Limited and/or its affiliates.
+# Copyright 2025-2026 Arm Limited and/or its affiliates.
 #
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
@@ -82,6 +82,8 @@ def _normalize_node(self, graph_module: GraphModule, node: Node) -> bool:
         new_carried = tuple(carried_inputs + additional_inputs)
         node.update_arg(2, new_carried)
         node.update_arg(3, ())
+        # annotate node so later keying of captured vs loop-carried args is possible
+        node.meta["additional_inputs"] = additional_inputs

         body_module_name = str(cast(Node, node.args[1]).target)
         body_module = cast(GraphModule, graph_module.get_submodule(body_module_name))  # type: ignore
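The stashed `node.meta["additional_inputs"]` makes the captured/carried split recoverable downstream: the pass appends captured inputs to the carried tuple, and a later consumer can peel them apart again by keying on the metadata. A toy sketch with plain tuples standing in for fx nodes; all names here are assumptions.

```python
# Simulate what the pass does to a while node's args.
meta = {}
carried_inputs = ("x", "counter")            # genuine loop-carried values
additional_inputs = ("weight", "bias")       # captured from the outer scope

new_carried = tuple(carried_inputs) + tuple(additional_inputs)
meta["additional_inputs"] = additional_inputs  # what the pass now records

# A later pass recovers the original partition from the metadata.
n_captured = len(meta["additional_inputs"])
loop_carried = new_carried[: len(new_carried) - n_captured]
captured = new_carried[len(new_carried) - n_captured:]

assert loop_carried == carried_inputs
assert captured == additional_inputs
```

Without the annotation, the flattened tuple alone gives no way to tell where the carried values end and the captured ones begin.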
