From de658f0ae8d8223c2f270245295db0bdf312c92f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ey=C3=BCp=20Can=20Akman?= <eyupcanakman@gmail.com>
Date: Fri, 26 Jun 2026 18:27:37 +0300
Subject: [PATCH] [converter] Fix negative axis in quantize/dequantize lowering

The quantize/dequantize lowering normalized a negative axis as
axis + rank - 1, which is off by one from the eager op's axis + rank.
A per-channel axis=-1 was therefore applied one dimension early
(rank - 2 instead of rank - 1), so the converted model applied the
per-channel scale and zero point along the wrong dimension. When that
dimension and the channel dimension have the same size, the result is
silently wrong: no shape error is raised.

Normalize a negative axis as axis + rank, matching the eager op. Add
per-channel negative-axis numerical tests for both ops. They use a
(2, 4, 4) shape with equal middle and channel dims so a wrong axis
surfaces as a numerical mismatch rather than a reshape error.
---
 coreai_torch/_custom_to_core.py |  4 +--
 tests/ops/test_custom_ops.py    | 44 +++++++++++++++++++++++++++++++++
 2 files changed, 46 insertions(+), 2 deletions(-)

diff --git a/coreai_torch/_custom_to_core.py b/coreai_torch/_custom_to_core.py
index 8e03944..af12e57 100644
--- a/coreai_torch/_custom_to_core.py
+++ b/coreai_torch/_custom_to_core.py
@@ -295,11 +295,11 @@ def _replace_quantize_or_dequantize(
         quant_elem_type = input_type.element_type  # dequantize: quant→float
         float_elem_type = result_elem_type
 
-    # Extract axis; normalize negative axis the same way the C++ lowering does.
+    # Extract axis; normalize a negative axis the same way the eager op does.
     axis_val = _get_optional_int_arg(node, axis_idx, default=0)
     input_rank = len(input_type.shape)
     if axis_val < 0:
-        axis_val = axis_val + input_rank - 1
+        axis_val = axis_val + input_rank
 
     axis = coreai.constant(np.array(axis_val, dtype=np.int32), loc=loc)
 
diff --git a/tests/ops/test_custom_ops.py b/tests/ops/test_custom_ops.py
index 84242db..29aa34a 100644
--- a/tests/ops/test_custom_ops.py
+++ b/tests/ops/test_custom_ops.py
@@ -411,6 +411,28 @@ def forward(self, x: Tensor) -> Tensor:
             prepare_program=inject_subbyte_tensors,
         )
 
+    async def test_per_channel_negative_axis_numerical(self) -> None:
+        """quantize with a per-channel scale on a negative axis matches eager."""
+
+        class Model(nn.Module):
+            def __init__(self) -> None:
+                super().__init__()
+                self.register_buffer(
+                    "scale", torch.tensor([0.1, 0.2, 0.3, 0.4], dtype=torch.float32)
+                )
+                self.register_buffer("zero_point", torch.zeros(4, dtype=torch.int8))
+
+            def forward(self, x: Tensor) -> Tensor:
+                return torch.ops.coreai.quantize(
+                    x, self.scale, torch.int8, zero_point=self.zero_point, axis=-1
+                )
+
+        model = Model()
+        x = torch.randn(2, 4, 4)
+        await validate_numerical_output(
+            model=model, x=x, prepare_program=inject_subbyte_tensors
+        )
+
 
 # ---------------------------------------------------------------------------
 # dequantize → coreai.dequantize
@@ -540,6 +562,28 @@ def forward(self, x: Tensor) -> Tensor:
             prepare_program=inject_subbyte_tensors,
         )
 
+    async def test_per_channel_negative_axis_numerical(self) -> None:
+        """dequantize with a per-channel scale on a negative axis matches eager."""
+
+        class Model(nn.Module):
+            def __init__(self) -> None:
+                super().__init__()
+                self.register_buffer(
+                    "scale", torch.tensor([0.1, 0.2, 0.3, 0.4], dtype=torch.float32)
+                )
+                self.register_buffer("zero_point", torch.zeros(4, dtype=torch.int8))
+
+            def forward(self, x: Tensor) -> Tensor:
+                return torch.ops.coreai.dequantize(
+                    x, self.scale, zero_point=self.zero_point, axis=-1
+                )
+
+        model = Model()
+        x = torch.randint(-128, 127, (2, 4, 4), dtype=torch.int8)
+        await validate_numerical_output(
+            model=model, x=x, prepare_program=inject_subbyte_tensors
+        )
+
 
 # ---------------------------------------------------------------------------
 # sparse_to_dense → coreai.build_sparse_with_bitmask + coreai.sparse_with_bitmask_to_dense