From bcca0ec2d029f20d0682b88bcba50176ff769bf2 Mon Sep 17 00:00:00 2001
From: Michael Rogenmoser <michael@rogenmoser.us>
Date: Fri, 10 Apr 2026 16:28:10 +0200
Subject: [PATCH 1/6] [ci] Add slang lint action

---
 .github/workflows/ci.yml | 27 +++++++++++++++++++++++++++
 1 file changed, 27 insertions(+)

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 6c5561823..68522b677 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -208,6 +208,33 @@ jobs:
 #  Compile stage  #
 ###################
 
+  compile-slang:
+    runs-on: ubuntu-22.04
+    permissions:
+      contents: read
+      checks: write
+      pull-requests: write
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v6
+        with:
+          submodules: recursive
+      - name: Download RTL submodules
+        run: |
+          make -C hardware checkout
+          make -C hardware apply-patches
+      - name: Generate flist
+        run: make -C hardware spyglass/tmp/files
+      - name: Run slang
+        uses: pulp-platform/pulp-actions/slang@v2.5.0 # update version as needed, not autoupdated
+        with:
+          token: ${{ secrets.GITHUB_TOKEN }}
+          slang-flags: >-
+            -f hardware/spyglass/tmp/files --top ara_soc_wrap
+            -Wextra -Wno-width-trunc -Wno-case-redundant-default -Wno-case-enum -Wno-incomplete-return -Wno-dup-import
+            --ignore-unknown-modules
+            --suppress-warnings .bender/...,hardware/deps/...
+
   compile-apps:
     runs-on: ubuntu-22.04
     strategy:

From af85614c7c27821c9ae85a1caedcbecf15d25c24 Mon Sep 17 00:00:00 2001
From: Michael Rogenmoser <michael@rogenmoser.us>
Date: Fri, 10 Apr 2026 12:24:45 +0200
Subject: [PATCH 2/6] [ci] Update ubuntu to latest

---
 .github/workflows/ci.yml | 36 ++++++++++++++++++------------------
 1 file changed, 18 insertions(+), 18 deletions(-)

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 68522b677..d30a7f4af 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -26,7 +26,7 @@ jobs:
 #####################
 
   tc-llvm:
-    runs-on: ubuntu-22.04
+    runs-on: ubuntu-latest
     steps:
     - uses: actions/checkout@v6
     - name: Recover the submodule commit hash
@@ -82,7 +82,7 @@ jobs:
         path: tc-llvm.tar
 
   tc-gcc:
-    runs-on: ubuntu-22.04
+    runs-on: ubuntu-latest
     steps:
     - uses: actions/checkout@v6
     - name: Recover the submodule commit hash
@@ -134,7 +134,7 @@ jobs:
         path: tc-gcc.tar
 
   tc-isa-sim:
-    runs-on: ubuntu-22.04
+    runs-on: ubuntu-latest
     steps:
     - uses: actions/checkout@v6
     - name: Recover the submodule commit hash
@@ -169,7 +169,7 @@ jobs:
         path: tc-isa-sim.tar
 
   tc-verilator:
-    runs-on: ubuntu-22.04
+    runs-on: ubuntu-latest
     steps:
     - uses: actions/checkout@v6
     - name: Recover the submodule commit hash
@@ -209,7 +209,7 @@ jobs:
 ###################
 
   compile-slang:
-    runs-on: ubuntu-22.04
+    runs-on: ubuntu-latest
     permissions:
       contents: read
       checks: write
@@ -236,7 +236,7 @@ jobs:
             --suppress-warnings .bender/...,hardware/deps/...
 
   compile-apps:
-    runs-on: ubuntu-22.04
+    runs-on: ubuntu-latest
     strategy:
       max-parallel: 1
       matrix:
@@ -268,7 +268,7 @@ jobs:
         path: apps/bin
 
   compile-riscv-tests:
-    runs-on: ubuntu-22.04
+    runs-on: ubuntu-latest
     strategy:
       max-parallel: 1
       matrix:
@@ -307,7 +307,7 @@ jobs:
         path: apps/bin
 
   compile-ara:
-    runs-on: ubuntu-22.04
+    runs-on: ubuntu-latest
     strategy:
       max-parallel: 1
       matrix:
@@ -341,7 +341,7 @@ jobs:
 ####################
 
   simulate:
-    runs-on: ubuntu-22.04
+    runs-on: ubuntu-latest
     strategy:
       max-parallel: 2
       matrix:
@@ -369,7 +369,7 @@ jobs:
 ########################
 
   riscv-tests-simv:
-    runs-on: ubuntu-22.04
+    runs-on: ubuntu-latest
     strategy:
       max-parallel: 1
       matrix:
@@ -392,7 +392,7 @@ jobs:
       run: config=${{ matrix.ara_config }} make -C hardware -j8 riscv_tests_simv
 
   riscv-tests-spike:
-    runs-on: ubuntu-22.04
+    runs-on: ubuntu-latest
     needs: ["tc-isa-sim", "compile-riscv-tests"]
     steps:
     - uses: actions/checkout@v6
@@ -431,7 +431,7 @@ jobs:
 ###################
 
   check-license:
-    runs-on: ubuntu-22.04
+    runs-on: ubuntu-latest
     steps:
     - uses: actions/checkout@v6
     - uses: actions/setup-python@v6
@@ -443,7 +443,7 @@ jobs:
       run: python scripts/licence-checker.py --config scripts/licence-checker.hjson hardware
 
   check-clang-format:
-    runs-on: ubuntu-22.04
+    runs-on: ubuntu-latest
     needs: ['tc-llvm']
     steps:
     - uses: actions/checkout@v6
@@ -469,7 +469,7 @@ jobs:
         exit $EXIT_STATUS
 
   check-trailing-whitespaces:
-    runs-on: ubuntu-22.04
+    runs-on: ubuntu-latest
     steps:
     - uses: actions/checkout@v6
       with:
@@ -497,7 +497,7 @@ jobs:
 #####################
 
   benchmark:
-    runs-on: ubuntu-22.04
+    runs-on: ubuntu-latest
     strategy:
       max-parallel: 1
       matrix:
@@ -543,7 +543,7 @@ jobs:
         path: benchmarks-${{ matrix.ara_config }}.tar
 
   roofline:
-    runs-on: ubuntu-22.04
+    runs-on: ubuntu-latest
     needs: benchmark
     steps:
     - uses: actions/checkout@v6
@@ -699,7 +699,7 @@ jobs:
 ####################
 
   clean-up:
-    runs-on: ubuntu-22.04
+    runs-on: ubuntu-latest
     if: always()
     needs: ["simulate", "riscv-tests-spike", "riscv-tests-simv"]
     steps:
@@ -715,7 +715,7 @@ jobs:
             riscv-tests-spike
 
   clean-up-compile-runs:
-    runs-on: ubuntu-22.04
+    runs-on: ubuntu-latest
     strategy:
       max-parallel: 1
       matrix:

From 7bdb7df1fdccc0f32bd4f5ca29e45f515a574185 Mon Sep 17 00:00:00 2001
From: Michael Rogenmoser <michael@rogenmoser.us>
Date: Fri, 10 Apr 2026 13:41:21 +0200
Subject: [PATCH 3/6] [hardware] Fix slang warnings

---
 hardware/spyglass/src/ara_soc_wrap.sv     | 12 +++-
 hardware/src/ara.sv                       |  2 +-
 hardware/src/ara_sequencer.sv             |  2 +-
 hardware/src/ara_soc.sv                   | 16 +++---
 hardware/src/ara_system.sv                |  2 +-
 hardware/src/ctrl_registers.sv            |  4 +-
 hardware/src/lane/lane.sv                 | 10 ++--
 hardware/src/lane/lane_sequencer.sv       | 68 +++++++++++------------
 hardware/src/lane/operand_queues_stage.sv |  2 +-
 hardware/src/lane/power_gating_generic.sv |  2 +-
 hardware/src/lane/simd_alu.sv             | 53 +++++++++---------
 hardware/src/lane/valu.sv                 |  2 +-
 hardware/src/lane/vmfpu.sv                | 45 ++++++++-------
 hardware/src/masku/masku.sv               |  4 +-
 hardware/src/sldu/p2_stride_gen.sv        |  8 +--
 hardware/src/sldu/sldu.sv                 |  4 +-
 hardware/src/vlsu/addrgen.sv              |  2 +-
 hardware/src/vlsu/vldu.sv                 |  8 ++-
 hardware/src/vlsu/vlsu.sv                 |  2 +-
 hardware/src/vlsu/vstu.sv                 |  2 +-
 20 files changed, 127 insertions(+), 123 deletions(-)

diff --git a/hardware/spyglass/src/ara_soc_wrap.sv b/hardware/spyglass/src/ara_soc_wrap.sv
index 988be30c3..e4660ea78 100644
--- a/hardware/spyglass/src/ara_soc_wrap.sv
+++ b/hardware/spyglass/src/ara_soc_wrap.sv
@@ -17,8 +17,6 @@ module ara_soc_wrap (
   localparam int unsigned AxiUserWidth = 1;
   localparam int unsigned AxiIdWidth   = 5;
 
-  logic clk_i, rst_ni;
-
   ara_soc #(
     .NrLanes     (NrLanes      ),
     .VLEN        (VLEN         ),
@@ -33,7 +31,15 @@ module ara_soc_wrap (
     .scan_data_i   (1'b0        ),
     .uart_prdata_i ('0          ),
     .uart_pready_i ('0          ),
-    .uart_pslverr_i('0          )
+    .uart_pslverr_i('0          ),
+    .exit_o(),
+    .hw_cnt_en_o(),
+    .scan_data_o(),
+    .uart_penable_o(),
+    .uart_pwrite_o(),
+    .uart_paddr_o(),
+    .uart_psel_o(),
+    .uart_pwdata_o()
   );
 
 endmodule
diff --git a/hardware/src/ara.sv b/hardware/src/ara.sv
index 4cb9a9506..7cff40045 100644
--- a/hardware/src/ara.sv
+++ b/hardware/src/ara.sv
@@ -491,7 +491,7 @@ module ara import ara_pkg::*; #(
 
   // Break path for acc_mmu_en. This signal can afford some additional latency
   // since vector mem ops take multiple cycles to reach the addrgen
-  `FF(acc_mmu_en_q, acc_mmu_en, '0, clk_i, rst_ni);
+  `FF(acc_mmu_en_q, acc_mmu_en, '0, clk_i, rst_ni)
 
   vlsu #(
     .NrLanes     (NrLanes     ),
diff --git a/hardware/src/ara_sequencer.sv b/hardware/src/ara_sequencer.sv
index 9cd0cc405..e7e7e3925 100644
--- a/hardware/src/ara_sequencer.sv
+++ b/hardware/src/ara_sequencer.sv
@@ -341,7 +341,7 @@ module ara_sequencer import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg::i
   logic running_mask_insn_d, running_mask_insn_q;
 
   logic lsu_current_burst_exception_q;
-  `FF(lsu_current_burst_exception_q, lsu_current_burst_exception_i, 1'b0, clk_i, rst_ni);
+  `FF(lsu_current_burst_exception_q, lsu_current_burst_exception_i, 1'b0, clk_i, rst_ni)
 
   // pe_req_ready_i comes from all the lanes
   // It is deasserted if the current request is stuck
diff --git a/hardware/src/ara_soc.sv b/hardware/src/ara_soc.sv
index 5ee64e17c..fb25622eb 100644
--- a/hardware/src/ara_soc.sv
+++ b/hardware/src/ara_soc.sv
@@ -259,7 +259,7 @@ module ara_soc import axi_pkg::*; import ara_pkg::*; #(
 `endif
 
   // One-cycle latency
-  `FF(l2_rvalid, l2_req, 1'b0);
+  `FF(l2_rvalid, l2_req, 1'b0)
 
   ////////////
   //  UART  //
@@ -482,17 +482,17 @@ module ara_soc import axi_pkg::*; import ara_pkg::*; #(
   localparam config_pkg::cva6_cfg_t CVA6AraConfig = build_config_pkg::build_config(CVA6AraConfig_user);
 
   // Define the exception type
-  `CVA6_TYPEDEF_EXCEPTION(exception_t, CVA6AraConfig);
+  `CVA6_TYPEDEF_EXCEPTION(exception_t, CVA6AraConfig)
 
   // Standard interface
-  `CVA6_INTF_TYPEDEF_ACC_REQ(accelerator_req_t, CVA6AraConfig, fpnew_pkg::roundmode_e);
-  `CVA6_INTF_TYPEDEF_ACC_RESP(accelerator_resp_t, CVA6AraConfig, exception_t);
+  `CVA6_INTF_TYPEDEF_ACC_REQ(accelerator_req_t, CVA6AraConfig, fpnew_pkg::roundmode_e)
+  `CVA6_INTF_TYPEDEF_ACC_RESP(accelerator_resp_t, CVA6AraConfig, exception_t)
   // MMU interface
-  `CVA6_INTF_TYPEDEF_MMU_REQ(acc_mmu_req_t, CVA6AraConfig);
-  `CVA6_INTF_TYPEDEF_MMU_RESP(acc_mmu_resp_t, CVA6AraConfig, exception_t);
+  `CVA6_INTF_TYPEDEF_MMU_REQ(acc_mmu_req_t, CVA6AraConfig)
+  `CVA6_INTF_TYPEDEF_MMU_RESP(acc_mmu_resp_t, CVA6AraConfig, exception_t)
   // Accelerator - CVA6's top-level interface
-  `CVA6_INTF_TYPEDEF_CVA6_TO_ACC(cva6_to_acc_t, accelerator_req_t, acc_mmu_resp_t);
-  `CVA6_INTF_TYPEDEF_ACC_TO_CVA6(acc_to_cva6_t, accelerator_resp_t, acc_mmu_req_t);
+  `CVA6_INTF_TYPEDEF_CVA6_TO_ACC(cva6_to_acc_t, accelerator_req_t, acc_mmu_resp_t)
+  `CVA6_INTF_TYPEDEF_ACC_TO_CVA6(acc_to_cva6_t, accelerator_resp_t, acc_mmu_req_t)
 
 `ifndef TARGET_GATESIM
   ara_system #(
diff --git a/hardware/src/ara_system.sv b/hardware/src/ara_system.sv
index 4accbd77c..6ecd5b3e8 100644
--- a/hardware/src/ara_system.sv
+++ b/hardware/src/ara_system.sv
@@ -96,7 +96,7 @@ module ara_system import axi_pkg::*; import ara_pkg::*; #(
 
   // Support max 8 cores, for now
   logic [63:0] hart_id;
-  assign hart_id = {'0, hart_id_i};
+  assign hart_id = 64'(hart_id_i);
 
   // Pack invalidation interface into acc interface
   acc_to_cva6_t acc_resp_pack;
diff --git a/hardware/src/ctrl_registers.sv b/hardware/src/ctrl_registers.sv
index 54eb9278c..397dbb98c 100644
--- a/hardware/src/ctrl_registers.sv
+++ b/hardware/src/ctrl_registers.sv
@@ -95,7 +95,7 @@ module ctrl_registers #(
     .reg_q_o    ({hw_cnt_en, event_trigger, dram_end_address, dram_base_address, exit})
   );
 
-  `FF(wr_active_q, wr_active_d, '0);
+  `FF(wr_active_q, wr_active_d, '0)
 
   /////////////////
   //   Signals   //
@@ -105,6 +105,6 @@ module ctrl_registers #(
   assign event_trigger_o  = event_trigger;
   assign dram_base_addr_o = dram_base_address;
   assign dram_end_addr_o  = dram_end_address;
-  assign exit_o           = {exit, logic'(|wr_active_q[7:0])};
+  assign exit_o           = {exit, |wr_active_q[7:0]};
 
 endmodule : ctrl_registers
diff --git a/hardware/src/lane/lane.sv b/hardware/src/lane/lane.sv
index c17d87fad..7d01a541e 100644
--- a/hardware/src/lane/lane.sv
+++ b/hardware/src/lane/lane.sv
@@ -222,12 +222,12 @@ module lane import ara_pkg::*; import rvv_pkg::*; #(
   logic                 [NrVInsn-1:0]         mfpu_vinsn_done;
   // Interface with the MaskB operand queue (VRGATHER/VCOMPRESS)
   logic                                       mask_b_cmd_pop_d, mask_b_cmd_pop_q;
-  `FF(mask_b_cmd_pop_q, mask_b_cmd_pop_d, 1'b0, clk_i, rst_ni);
+  `FF(mask_b_cmd_pop_q, mask_b_cmd_pop_d, 1'b0, clk_i, rst_ni)
 
 
   // Support for store exception flush
   logic lsu_ex_flush_op_req_d, lsu_ex_flush_op_req_q;
-  `FF(lsu_ex_flush_op_req_q, lsu_ex_flush_op_req_d, 1'b0, clk_i, rst_ni);
+  `FF(lsu_ex_flush_op_req_q, lsu_ex_flush_op_req_d, 1'b0, clk_i, rst_ni)
 
   // Additional signals to please Verilator's hierarchical verilation
   pe_req_t  pe_req;
@@ -310,7 +310,7 @@ module lane import ara_pkg::*; import rvv_pkg::*; #(
   logic                                       sldu_result_gnt_opqueues;
   // Support for store exception flush
   logic                                       lsu_ex_flush_op_queues_d, lsu_ex_flush_op_queues_q;
-  `FF(lsu_ex_flush_op_queues_q, lsu_ex_flush_op_queues_d, 1'b0, clk_i, rst_ni);
+  `FF(lsu_ex_flush_op_queues_q, lsu_ex_flush_op_queues_d, 1'b0, clk_i, rst_ni)
 
   operand_requester #(
     .NrLanes              (NrLanes              ),
@@ -609,8 +609,8 @@ module lane import ara_pkg::*; import rvv_pkg::*; #(
   );
 
   // Break timing path
-  `FF(vfu_operation_valid_q, vfu_operation_valid, 1'b0, clk_i, rst_ni);
-  `FF(vfu_operation_op_q, vfu_operation.op, VADD, clk_i, rst_ni);
+  `FF(vfu_operation_valid_q, vfu_operation_valid, 1'b0, clk_i, rst_ni)
+  `FF(vfu_operation_op_q, vfu_operation.op, VADD, clk_i, rst_ni)
 
   always_comb begin
     sldu_addrgen_sel_d = SLDU_SEL;
diff --git a/hardware/src/lane/lane_sequencer.sv b/hardware/src/lane/lane_sequencer.sv
index a9fa44e32..cdf77f1e0 100644
--- a/hardware/src/lane/lane_sequencer.sv
+++ b/hardware/src/lane/lane_sequencer.sv
@@ -52,7 +52,7 @@ module lane_sequencer import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg::
   `include "common_cells/registers.svh"
 
   // STU exception support
-  `FF(lsu_ex_flush_o, lsu_ex_flush_i, 1'b0, clk_i, rst_ni);
+  `FF(lsu_ex_flush_o, lsu_ex_flush_i, 1'b0, clk_i, rst_ni)
 
   ////////////////////////////
   //  Register the request  //
@@ -663,9 +663,9 @@ module lane_sequencer import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg::
             end
             VSLIDEDOWN: begin
               // Extra elements to ask, because of the stride
-              logic [$clog2(8*NrLanes)-1:0] extra_stride;
+              automatic logic [$clog2(8*NrLanes)-1:0] extra_stride;
               // Need one bit more than vl, since we will also add the stride contribution
-              logic [$bits(pe_req.vl):0] vl_tot;
+              automatic logic [$bits(pe_req.vl):0] vl_tot;
 
               // We need to trim full words from the start of the vector that are not used
               // as operands by the slide unit.
@@ -744,18 +744,16 @@ module lane_sequencer import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg::
           // todo:
 
           // Mask logical and integer comparisons
-          operand_request[AluA] = '{
-            id      : pe_req.id,
-            vs      : pe_req.vs1,
-            scale_vl: pe_req.scale_vl,
-            vtype   : pe_req.vtype,
-            vstart  : vfu_operation_d.vstart,
-            hazard  : pe_req.hazard_vs1 | pe_req.hazard_vd,
-            target_fu : ALU_SLDU,
-            conv      : OpQueueConversionNone,
-            cvt_resize: CVT_SAME,
-            default : '0
-          };
+          operand_request[AluA] = '0;
+          operand_request[AluA].id         = pe_req.id;
+          operand_request[AluA].vs         = pe_req.vs1;
+          operand_request[AluA].scale_vl   = pe_req.scale_vl;
+          operand_request[AluA].vtype      = pe_req.vtype;
+          operand_request[AluA].vstart     = vfu_operation_d.vstart;
+          operand_request[AluA].hazard     = pe_req.hazard_vs1 | pe_req.hazard_vd;
+          operand_request[AluA].target_fu  = ALU_SLDU;
+          operand_request[AluA].conv       = OpQueueConversionNone;
+          operand_request[AluA].cvt_resize = CVT_SAME;
           // Since this request goes outside of the lane, we might need to request an
           // extra operand regardless of whether it is valid in this lane or not.
 
@@ -858,18 +856,16 @@ module lane_sequencer import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg::
 
           // Vd register to provide correct mask undisturbed policy at bit-level
           // This is can be a mask or normal register
-          operand_request[MaskB] = '{
-            id      : pe_req.id,
-            vs      : pe_req.vd,
-            scale_vl: pe_req.scale_vl,
-            vtype   : pe_req.vtype,
-            vstart  : vfu_operation_d.vstart,
-            hazard  : pe_req.hazard_vd,
-            target_fu : ALU_SLDU,
-            conv      : OpQueueConversionNone,
-            cvt_resize: CVT_SAME,
-            default : '0
-          };
+          operand_request[MaskB] = '0;
+          operand_request[MaskB].id         = pe_req.id;
+          operand_request[MaskB].vs         = pe_req.vd;
+          operand_request[MaskB].scale_vl   = pe_req.scale_vl;
+          operand_request[MaskB].vtype      = pe_req.vtype;
+          operand_request[MaskB].vstart     = vfu_operation_d.vstart;
+          operand_request[MaskB].hazard     = pe_req.hazard_vd;
+          operand_request[MaskB].target_fu  = ALU_SLDU;
+          operand_request[MaskB].conv       = OpQueueConversionNone;
+          operand_request[MaskB].cvt_resize = CVT_SAME;
           // vl and eew depend on the real eew on which we are working on
           if (pe_req.op inside {VIOTA,VID}) begin
             // Non-mask layout
@@ -938,16 +934,14 @@ module lane_sequencer import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg::
     // VRGATHER and VCOMPRESS access the opreq with ad-hoc requests
     if (vrgat_state_q == REQUESTING) begin
       // Here, we are sure the MaskB operand_request is free
-      operand_request[MaskB] = '{
-        vs         : masku_vrgat_req_q.vs,
-        eew        : masku_vrgat_req_q.eew,
-        scale_vl   : 1'b0,
-        cvt_resize : pe_req.cvt_resize,
-        vl         : 1,
-        vstart     : masku_vrgat_req_q.idx,
-        hazard     : '0,
-        default    : '0
-      };
+      operand_request[MaskB] = '0;
+      operand_request[MaskB].vs         = masku_vrgat_req_q.vs;
+      operand_request[MaskB].eew        = masku_vrgat_req_q.eew;
+      operand_request[MaskB].scale_vl   = 1'b0;
+      operand_request[MaskB].cvt_resize = pe_req.cvt_resize;
+      operand_request[MaskB].vl         = 1;
+      operand_request[MaskB].vstart     = masku_vrgat_req_q.idx;
+      operand_request[MaskB].hazard     = '0;
       operand_request_push[MaskB] = masku_vrgat_req_ready_d;
     end
   end: sequencer
diff --git a/hardware/src/lane/operand_queues_stage.sv b/hardware/src/lane/operand_queues_stage.sv
index a0b750f1e..09369ba26 100644
--- a/hardware/src/lane/operand_queues_stage.sv
+++ b/hardware/src/lane/operand_queues_stage.sv
@@ -58,7 +58,7 @@ module operand_queues_stage import ara_pkg::*; import rvv_pkg::*; import cf_math
   `include "common_cells/registers.svh"
 
   // STU flush support
-  `FF(lsu_ex_flush_o, lsu_ex_flush_i, 1'b0, clk_i, rst_ni);
+  `FF(lsu_ex_flush_o, lsu_ex_flush_i, 1'b0, clk_i, rst_ni)
 
   ///////////
   //  ALU  //
diff --git a/hardware/src/lane/power_gating_generic.sv b/hardware/src/lane/power_gating_generic.sv
index 928e2625d..bd7797458 100644
--- a/hardware/src/lane/power_gating_generic.sv
+++ b/hardware/src/lane/power_gating_generic.sv
@@ -19,6 +19,6 @@ module power_gating_generic #(
 
   // Gate with an AND
   assign en_wide = en_i ? T'('1) : T'('0);
-  assign out_o   = T'(in_i & en_wide);
+  assign out_o   = in_i & en_wide;
 
 endmodule
diff --git a/hardware/src/lane/simd_alu.sv b/hardware/src/lane/simd_alu.sv
index b97016542..33a752388 100644
--- a/hardware/src/lane/simd_alu.sv
+++ b/hardware/src/lane/simd_alu.sv
@@ -55,7 +55,6 @@ module simd_alu import ara_pkg::*; import rvv_pkg::*; #(
   alu_sat_operand_t sat_sum, sat_sub;
   vxsat_t     vxsat;
   vxrm_t      vxrm;
-  logic       r;
 
   assign vxrm = vxrm_i;
   assign vxsat_o = vxsat;
@@ -183,7 +182,9 @@ module simd_alu import ara_pkg::*; import rvv_pkg::*; #(
                 res.w64[b]     = &vxsat.w64[b] ? (sum[63] ? {1'b0, {63{1'b1}}} : {1'b1, {63{1'b0}}} ) : sum[63:0];
               end
           endcase
-        VAADD, VAADDU: if (FixPtSupport == FixedPointEnable) unique case (vew_i)
+        VAADD, VAADDU: if (FixPtSupport == FixedPointEnable) begin
+          automatic logic r;
+          unique case (vew_i)
             EW8: for (int b = 0; b < 8; b++) begin
               automatic logic [ 8:0] sum = opa.w8 [b] + opb.w8 [b];
                 unique case (vxrm)
@@ -224,48 +225,48 @@ module simd_alu import ara_pkg::*; import rvv_pkg::*; #(
                 endcase
                 res.w64[b] = (op_i == VAADDU) ? sum[64:1] + r : {sum[63], sum[63:1]} + r;
               end
-          endcase
+          endcase end
         VADD, VADC, VMADC, VREDSUM, VWREDSUMU, VWREDSUM: unique case (vew_i)
             EW8: for (int b = 0; b < 8; b++) begin
                 automatic logic [ 8:0] sum = opa.w8 [b] + opb.w8 [b] +
-                logic'(op_i inside {VADC, VMADC} && mask_i[1*b] & ~vm_i);
+                8'(op_i inside {VADC, VMADC} && mask_i[1*b] & ~vm_i);
                 res.w8[b] = (op_i == VMADC) ? {6'b0, 1'b1, sum[8]} : sum[7:0];
               end
             EW16: for (int b = 0; b < 4; b++) begin
                 automatic logic [16:0] sum = opa.w16[b] + opb.w16[b] +
-                logic'(op_i inside {VADC, VMADC} && mask_i[2*b] & ~vm_i);
+                16'(op_i inside {VADC, VMADC} && mask_i[2*b] & ~vm_i);
                 res.w16[b] = (op_i == VMADC) ? {14'b0, 1'b1, sum[16]} : sum[15:0];
               end
             EW32: for (int b = 0; b < 2; b++) begin
                 automatic logic [32:0] sum = opa.w32[b] + opb.w32[b] +
-                logic'(op_i inside {VADC, VMADC} && mask_i[4*b] & ~vm_i);
+                32'(op_i inside {VADC, VMADC} && mask_i[4*b] & ~vm_i);
                 res.w32[b] = (op_i == VMADC) ? {30'b0, 1'b1, sum[32]} : sum[31:0];
               end
             EW64: for (int b = 0; b < 1; b++) begin
                 automatic logic [64:0] sum = opa.w64[b] + opb.w64[b] +
-                logic'(op_i inside {VADC, VMADC} && mask_i[8*b] & ~vm_i);
+                64'(op_i inside {VADC, VMADC} && mask_i[8*b] & ~vm_i);
                 res.w64[b] = (op_i == VMADC) ? {62'b0, 1'b1, sum[64]} : sum[63:0];
               end
           endcase
         VSUB, VSBC, VMSBC: unique case (vew_i)
             EW8: for (int b = 0; b < 8; b++) begin
                 automatic logic [ 8:0] sub = opb.w8 [b] - opa.w8 [b] -
-                logic'(op_i inside {VSBC, VMSBC} && mask_i[1*b] & ~vm_i);
+                8'(op_i inside {VSBC, VMSBC} && mask_i[1*b] & ~vm_i);
                 res.w8[b] = (op_i == VMSBC) ? {6'b0, 1'b1, sub[8]} : sub[7:0];
               end
             EW16: for (int b = 0; b < 4; b++) begin
                 automatic logic [16:0] sub = opb.w16[b] - opa.w16[b] -
-                logic'(op_i inside {VSBC, VMSBC} && mask_i[2*b] & ~vm_i);
+                16'(op_i inside {VSBC, VMSBC} && mask_i[2*b] & ~vm_i);
                 res.w16[b] = (op_i == VMSBC) ? {14'b0, 1'b1, sub[16]} : sub[15:0];
               end
             EW32: for (int b = 0; b < 2; b++) begin
                 automatic logic [32:0] sub = opb.w32[b] - opa.w32[b] -
-                logic'(op_i inside {VSBC, VMSBC} && mask_i[4*b] & ~vm_i);
+                32'(op_i inside {VSBC, VMSBC} && mask_i[4*b] & ~vm_i);
                 res.w32[b] = (op_i == VMSBC) ? {30'b0, 1'b1, sub[32]} : sub[31:0];
               end
             EW64: for (int b = 0; b < 1; b++) begin
                 automatic logic [64:0] sub = opb.w64[b] - opa.w64[b] -
-                logic'(op_i inside {VSBC, VMSBC} && mask_i[8*b] & ~vm_i);
+                64'(op_i inside {VSBC, VMSBC} && mask_i[8*b] & ~vm_i);
                 res.w64[b] = (op_i == VMSBC) ? {62'b0, 1'b1, sub[64]} : sub[63:0];
               end
           endcase
@@ -308,22 +309,24 @@ module simd_alu import ara_pkg::*; import rvv_pkg::*; #(
               automatic logic [16:0] sub = opb.w16[b] - opa.w16[b];
               vxsat.w16[b]   = (!opb.w16[b][15] & opa.w16[b][15] & sub[15]) |
                                (opb.w16[b][15] & !opa.w16[b][15] & !sub[15]);
-              res.w16[b]     = vxsat.w16[b] ? (opb.w16[b][15] ? 16'h8000 : 16'h7FFF) : sub[15:0];
+              res.w16[b]     = |vxsat.w16[b] ? (opb.w16[b][15] ? 16'h8000 : 16'h7FFF) : sub[15:0]; // OR-reduce: flag above is a 1-bit value zero-extended into the field
           end
           EW32: for (int b = 0; b < 2; b++) begin
               automatic logic [32:0] sub = opb.w32[b] - opa.w32[b];
               vxsat.w32[b]   = (!opb.w32[b][31] & opa.w32[b][31] & sub[31]) |
                                (opb.w32[b][31] & !opa.w32[b][31] & !sub[31]);
-              res.w32[b]     = vxsat.w32[b] ? (opb.w32[b][31] ? 32'h80000000 : 32'h7FFFFFFF) : sub[31:0];
+              res.w32[b]     = |vxsat.w32[b] ? (opb.w32[b][31] ? 32'h80000000 : 32'h7FFFFFFF) : sub[31:0];
           end
           EW64: for (int b = 0; b < 1; b++) begin
               automatic logic [64:0] sub = opb.w64[b] - opa.w64[b];
               vxsat.w64[b]   = (!opb.w64[b][63] & opa.w64[b][63] & sub[63]) |
                                (opb.w64[b][63] & !opa.w64[b][63] & !sub[63]);
-              res.w64[b]     = vxsat.w64[b] ? (opb.w64[b][63] ? 64'h8000000000000000 : 64'h7FFFFFFFFFFFFFFF) : sub[63:0];
+              res.w64[b]     = |vxsat.w64[b] ? (opb.w64[b][63] ? 64'h8000000000000000 : 64'h7FFFFFFFFFFFFFFF) : sub[63:0];
           end
           endcase
-        VASUB, VASUBU: if (FixPtSupport == FixedPointEnable) unique case (vew_i)
+        VASUB, VASUBU: if (FixPtSupport == FixedPointEnable) begin
+          automatic logic r;
+          unique case (vew_i)
             EW8: for (int b = 0; b < 8; b++) begin
                 automatic logic [ 8:0] sub = opb.w8 [b] - opa.w8 [b];
                 unique case (vxrm)
@@ -332,7 +335,7 @@ module simd_alu import ara_pkg::*; import rvv_pkg::*; #(
                   2'b10: r = 1'b0;
                   2'b11: r = !sub[1] & (sub[0]!=0);
                 endcase
-                res.w8[b] = (op_i == VASUBU) ? (sub[7:0] >> 1) + r : ($signed(sub[7:0]) >>> 1) + r;
+                res.w8[b] = (op_i == VASUBU) ? (sub[7:0] >> 1) + {7'b0, r} : $unsigned(($signed(sub[7:0]) >>> 1) + $signed({7'b0, r}));
               end
             EW16: for (int b = 0; b < 4; b++) begin
                 automatic logic [ 16:0] sub = opb.w16[b] - opa.w16[b];
@@ -342,7 +345,7 @@ module simd_alu import ara_pkg::*; import rvv_pkg::*; #(
                   2'b10: r = 1'b0;
                   2'b11: r = !sub[1] & (sub[0]!=0);
                 endcase
-                res.w16[b] = (op_i == VASUBU) ? (sub[15:0] >> 1) + r : ($signed(sub[15:0]) >>> 1) + r;
+                res.w16[b] = (op_i == VASUBU) ? (sub[15:0] >> 1) + {15'b0, r} : $unsigned(($signed(sub[15:0]) >>> 1) + $signed({15'b0, r}));
               end
             EW32: for (int b = 0; b < 2; b++) begin
                 automatic logic [ 32:0] sub = opb.w32[b] - opa.w32[b];
@@ -352,7 +355,7 @@ module simd_alu import ara_pkg::*; import rvv_pkg::*; #(
                   2'b10: r = 1'b0;
                   2'b11: r = !sub[1] & (sub[0]!=0);
                 endcase
-                res.w32[b] = (op_i == VASUBU) ? (sub[31:0] >> 1) + r : ($signed(sub[31:0]) >>> 1) + r;
+                res.w32[b] = (op_i == VASUBU) ? (sub[31:0] >> 1) + {31'b0, r} : $unsigned(($signed(sub[31:0]) >>> 1) + $signed({31'b0, r}));
               end
             EW64: for (int b = 0; b < 1; b++) begin
                 automatic logic [ 64:0] sub = opb.w64[b] - opa.w64[b];
@@ -362,9 +365,9 @@ module simd_alu import ara_pkg::*; import rvv_pkg::*; #(
                   2'b10: r = 1'b0;
                   2'b11: r = !sub[1] & (sub[0]!=0);
                 endcase
-                res.w64[b] = (op_i == VASUBU) ? (sub[63:0] >> 1) + r : ($signed(sub[63:0]) >>> 1) + r;
+                res.w64[b] = (op_i == VASUBU) ? (sub[63:0] >> 1) + {63'b0, r} : $unsigned(($signed(sub[63:0]) >>> 1) + $signed({63'b0, r}));
               end
-          endcase
+          endcase end
 
         // Shift instructions
         VSLL: unique case (vew_i)
@@ -443,19 +446,19 @@ module simd_alu import ara_pkg::*; import rvv_pkg::*; #(
         // Fixed point clip instructions
         VNCLIP: if (FixPtSupport == FixedPointEnable) unique case (vew_i)
             EW8 : for (int b = 0; b < 4; b++) begin
-                automatic logic [15:0] clip = $signed(opb.w16[b]) >>> opa.w16[b][3:0];
+                automatic logic [15:0] clip = $unsigned($signed(opb.w16[b]) >>> opa.w16[b][3:0]);
                 vxsat.w8[b]   = |clip[15:8];
-                res.w8 [2*b + narrowing_select_i] = ($signed(opb.w16[b]) >>> opa.w16[b][3:0]) + rm[b];
+                res.w8 [2*b + narrowing_select_i] = $unsigned(($signed(opb.w16[b]) >>> opa.w16[b][3:0]) + $signed({15'b0, rm[b]})); // zero-extend rm before $signed so a set rounding bit adds +1, not -1
               end
             EW16: for (int b = 0; b < 2; b++) begin
-                automatic logic [31:0] clip = $signed(opb.w32[b]) >>> opa.w32[b][4:0];
+                automatic logic [31:0] clip = $unsigned($signed(opb.w32[b]) >>> opa.w32[b][4:0]);
                 vxsat.w8[b]   = |clip[31:16];
-                res.w16[2*b + narrowing_select_i] = ($signed(opb.w32[b]) >>> opa.w32[b][4:0]) + rm[b];
+                res.w16[2*b + narrowing_select_i] = $unsigned(($signed(opb.w32[b]) >>> opa.w32[b][4:0]) + $signed({31'b0, rm[b]}));
               end
             EW32: for (int b = 0; b < 1; b++) begin
                 automatic logic [63:0] clip = $signed(opb.w64[b]) >>> opa.w64[b][5:0];
                 vxsat.w8[b]   = |clip[63:32];
-                res.w32[2*b + narrowing_select_i] = ($signed(opb.w64[b]) >>> opa.w64[b][5:0]) + rm[b];
+                res.w32[2*b + narrowing_select_i] = $unsigned(($signed(opb.w64[b]) >>> opa.w64[b][5:0]) + $signed({63'b0, rm[b]}));
               end
           endcase
         VNCLIPU: if (FixPtSupport == FixedPointEnable) unique case (vew_i)
diff --git a/hardware/src/lane/valu.sv b/hardware/src/lane/valu.sv
index 623f3c40a..561e38a9c 100644
--- a/hardware/src/lane/valu.sv
+++ b/hardware/src/lane/valu.sv
@@ -317,7 +317,7 @@ module valu import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg::idx_width;
 
   // The ALU has completed a reduction
   logic alu_red_complete_d;
-  `FF(alu_red_complete_o, alu_red_complete_d, 1'b0, clk_i, rst_ni);
+  `FF(alu_red_complete_o, alu_red_complete_d, 1'b0, clk_i, rst_ni)
 
   // Signal to indicate the state of the ALU
   typedef enum logic [2:0] {NO_REDUCTION, INTRA_LANE_REDUCTION, INTER_LANES_REDUCTION_RX, INTER_LANES_REDUCTION_TX, LN0_REDUCTION_COMMIT, SIMD_REDUCTION} alu_state_e;
diff --git a/hardware/src/lane/vmfpu.sv b/hardware/src/lane/vmfpu.sv
index c12598649..cdf8a7d48 100644
--- a/hardware/src/lane/vmfpu.sv
+++ b/hardware/src/lane/vmfpu.sv
@@ -280,10 +280,6 @@ module vmfpu import ara_pkg::*; import rvv_pkg::*; import fpnew_pkg::*;
   logic narrowing_select_in_d, narrowing_select_in_q;
   // Output selector, used to control the Result MUX and validate the results
   logic narrowing_select_out_d, narrowing_select_out_q;
-  // FPU SIMD result needs to be shuffled for narrowing instructions before commit
-  elen_t narrowing_shuffled_result;
-  // Helper signal to shuffle the narrowed result
-  logic [7:0] narrowing_shuffle_be;
 
   //////////////////
   //  Multiplier  //
@@ -337,17 +333,17 @@ module vmfpu import ara_pkg::*; import rvv_pkg::*; import fpnew_pkg::*;
                       ~vmul_simd_in_valid[vinsn_issue_q.vtype.vsew];
 
   `FFLARNC(vmul_simd_op_a_q, vinsn_issue_q.use_scalar_op ? scalar_op : mfpu_operand_i[0],
-    gate_ff_en, gate_ff_clr, '0, clk_i_gated, rst_ni);
+    gate_ff_en, gate_ff_clr, '0, clk_i_gated, rst_ni)
   `FFLARNC(vmul_simd_op_b_q, mfpu_operand_i[1],
-    gate_ff_en, gate_ff_clr, '0, clk_i_gated, rst_ni);
+    gate_ff_en, gate_ff_clr, '0, clk_i_gated, rst_ni)
   `FFLARNC(vmul_simd_op_c_q, mfpu_operand_i[2],
-    gate_ff_en, gate_ff_clr, '0, clk_i_gated, rst_ni);
+    gate_ff_en, gate_ff_clr, '0, clk_i_gated, rst_ni)
   `FFLARNC(vmul_simd_mask_q, mask_i,
-    gate_ff_en, gate_ff_clr, '0, clk_i_gated, rst_ni);
+    gate_ff_en, gate_ff_clr, '0, clk_i_gated, rst_ni)
   `FFLARNC(vmul_simd_op_q, vinsn_issue_q.op,
-    gate_ff_en, gate_ff_clr, ara_op_e'('0), clk_i_gated, rst_ni);
+    gate_ff_en, gate_ff_clr, ara_op_e'('0), clk_i_gated, rst_ni)
   `FFLARNC(vmul_simd_in_valid_q, vmul_simd_in_valid,
-    gate_ff_en, gate_ff_clr, '0, clk_i_gated, rst_ni);
+    gate_ff_en, gate_ff_clr, '0, clk_i_gated, rst_ni)
 
   for (genvar i = 0; i < 4; i++) begin
 `ifdef GF22
@@ -616,7 +612,7 @@ module vmfpu import ara_pkg::*; import rvv_pkg::*; import fpnew_pkg::*;
 
   // Inform the lane SLDU/ADDRGEN arbiter that this reduction is over
   logic fpu_red_complete_d;
-  `FF(fpu_red_complete_o, fpu_red_complete_d, 1'b0, clk_i, rst_ni);
+  `FF(fpu_red_complete_o, fpu_red_complete_d, 1'b0, clk_i, rst_ni)
 
   // Signal to indicate the state of the MFPU
   typedef enum logic [2:0] {
@@ -848,7 +844,7 @@ module vmfpu import ara_pkg::*; import rvv_pkg::*; import fpnew_pkg::*;
       EnableVectors: 1'b1,
       EnableNanBox : 1'b1,
       FpFmtMask    : {RVVF(FPUSupport), RVVD(FPUSupport), RVVH(FPUSupport), RVVB(FPUSupport), RVVHA(FPUSupport), RVVBA(FPUSupport)},
-      IntFmtMask   : {logic'(RVVB(FPUSupport) || RVVBA(FPUSupport)), 1'b1, 1'b1, 1'b1}
+      IntFmtMask   : {RVVB(FPUSupport) || RVVBA(FPUSupport), 1'b1, 1'b1, 1'b1}
     };
 
     // Implementation (number of registers etc)
@@ -1124,20 +1120,12 @@ module vmfpu import ara_pkg::*; import rvv_pkg::*; import fpnew_pkg::*;
 
     fpu_mask_t vfpu_flag_mask;
 
-    vf7_flag_out_e16 vfrec7_out_e16[4];
-    vf7_flag_out_e32 vfrec7_out_e32[2];
-    vf7_flag_out_e64 vfrec7_out_e64[1];
-
     status_t vfrec7_ex_flag, vfrsqrt7_ex_flag;
 
     roundmode_e fp_rm_process;
 
     elen_t [LatFNonComp:0]   operand_a_d, vfpu_flag_mask_d;
 
-    vf7_flag_out_e16 vfrsqrt7_out_e16[4];
-    vf7_flag_out_e32 vfrsqrt7_out_e32[2];
-    vf7_flag_out_e64 vfrsqrt7_out_e64[1];
-
     logic [15:0] lzc_e16;
     logic [9:0]  lzc_e32;
     logic [5:0]  lzc_e64;
@@ -1153,10 +1141,10 @@ module vmfpu import ara_pkg::*; import rvv_pkg::*; import fpnew_pkg::*;
       assign vfpu_flag_mask_d[0]= vfpu_simd_mask;
       for (genvar i = 0; i < LatFNonComp; i++) begin
 
-        `FF(operand_a_d[i+1], operand_a_d[i], '0, clk_i, rst_ni);
+        `FF(operand_a_d[i+1], operand_a_d[i], '0, clk_i, rst_ni)
 
-        `FF(vfpu_flag_mask_d[i+1], vfpu_flag_mask_d[i],'0,clk_i,rst_ni);
-        end
+        `FF(vfpu_flag_mask_d[i+1], vfpu_flag_mask_d[i],'0,clk_i,rst_ni)
+      end
 
       assign operand_a_delay = operand_a_d[LatFNonComp];
       assign vfpu_flag_mask  = vfpu_flag_mask_d[LatFNonComp];
@@ -1205,6 +1193,7 @@ module vmfpu import ara_pkg::*; import rvv_pkg::*; import fpnew_pkg::*;
         // vfrec7 (only supported on 16, 32, 64-bit)
         unique case (vinsn_processing_q.vtype.vsew)
           EW16: begin
+            automatic vf7_flag_out_e16 vfrec7_out_e16[4];
             for (int h = 0; h < 4; h++) vfrec7_out_e16[h] =
               vfrec7_fp16(vfpu_result[h*16 +: 10], operand_a_delay[h*16 +: 16], fp_rm_process);
 
@@ -1217,6 +1206,7 @@ module vmfpu import ara_pkg::*; import rvv_pkg::*; import fpnew_pkg::*;
                             | (vfrec7_out_e16[0].ex_flag & {5{vfpu_flag_mask[0]}});
           end
           EW32: begin
+            automatic vf7_flag_out_e32 vfrec7_out_e32[2];
             for (int w = 0; w < 2; w++) vfrec7_out_e32[w] =
               vfrec7_fp32(vfpu_result[w*32 +: 10], operand_a_delay[w*32 +: 32], fp_rm_process);
 
@@ -1226,6 +1216,7 @@ module vmfpu import ara_pkg::*; import rvv_pkg::*; import fpnew_pkg::*;
                             | (vfrec7_out_e32[0].ex_flag & {5{vfpu_flag_mask[0]}});
           end
           EW64: begin
+            automatic vf7_flag_out_e64 vfrec7_out_e64[1];
             for (int d = 0; d < 1; d++) vfrec7_out_e64[d] =
               vfrec7_fp64(vfpu_result[d*64 +: 10], operand_a_delay[d*64 +: 64], fp_rm_process);
 
@@ -1242,6 +1233,7 @@ module vmfpu import ara_pkg::*; import rvv_pkg::*; import fpnew_pkg::*;
        // vfrsqrt7 (only supported on 16, 32, 64-bit)
         unique case (vinsn_processing_q.vtype.vsew)
           EW16: begin
+            automatic vf7_flag_out_e16 vfrsqrt7_out_e16[4];
             for (int h = 0; h < 4; h++) vfrsqrt7_out_e16[h] =
               vfrsqrt7_fp16(vfpu_result[h*16 +: 10], operand_a_delay[h*16 +: 16], lzc_e16[h*4 +: 4]);
 
@@ -1254,6 +1246,7 @@ module vmfpu import ara_pkg::*; import rvv_pkg::*; import fpnew_pkg::*;
                              | (vfrsqrt7_out_e16[0].ex_flag & {5{vfpu_flag_mask[0]}});
           end
           EW32: begin
+            automatic vf7_flag_out_e32 vfrsqrt7_out_e32[2];
             for (int w = 0; w < 2; w++) vfrsqrt7_out_e32[w] =
               vfrsqrt7_fp32(vfpu_result[w*32 +: 10], operand_a_delay[w*32 +: 32], lzc_e32[w*5 +: 5]);
 
@@ -1263,6 +1256,7 @@ module vmfpu import ara_pkg::*; import rvv_pkg::*; import fpnew_pkg::*;
                              | (vfrsqrt7_out_e32[0].ex_flag & {5{vfpu_flag_mask[0]}});
           end
           EW64: begin
+            automatic vf7_flag_out_e64 vfrsqrt7_out_e64[1];
             for (int d = 0; d < 1; d++) vfrsqrt7_out_e64[d] =
               vfrsqrt7_fp64(vfpu_result[d*64 +: 10], operand_a_delay[d*64 +: 64], lzc_e64[d*6 +: 6]);
 
@@ -1490,6 +1484,11 @@ module vmfpu import ara_pkg::*; import rvv_pkg::*; import fpnew_pkg::*;
 
     case (mfpu_state_q)
       NO_REDUCTION: begin
+        // FPU SIMD result needs to be shuffled for narrowing instructions before commit
+        automatic elen_t narrowing_shuffled_result;
+        // Helper signal to shuffle the narrowed result
+        automatic logic [7:0] narrowing_shuffle_be;
+
         vfpu_tag_in = mask_i;
 
         // Sign injection
diff --git a/hardware/src/masku/masku.sv b/hardware/src/masku/masku.sv
index d2fdccdc2..7202d5ccc 100644
--- a/hardware/src/masku/masku.sv
+++ b/hardware/src/masku/masku.sv
@@ -803,9 +803,9 @@ module masku import ara_pkg::*; import rvv_pkg::*; #(
         // The vd source can have a different encoding (it gets deshuffled in the masku_operand stage)
         [VRGATHER:VCOMPRESS]: begin
           // Buffer for the current element
-          logic [NrLanes*DataWidth-1:0] vrgat_res;
+          automatic logic [NrLanes*DataWidth-1:0] vrgat_res;
           // Buffer for the current element
-          logic [DataWidth-1:0] vrgat_buf;
+          automatic logic [DataWidth-1:0] vrgat_buf;
 
           // Extract the correct elements
           vrgat_res = '1; // Default assignment
diff --git a/hardware/src/sldu/p2_stride_gen.sv b/hardware/src/sldu/p2_stride_gen.sv
index 7919e29da..e3703c70e 100644
--- a/hardware/src/sldu/p2_stride_gen.sv
+++ b/hardware/src/sldu/p2_stride_gen.sv
@@ -37,10 +37,10 @@ module p2_stride_gen import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg::i
   assign valid_o = ~next_stride_zero_q;
   assign spare_stride_d = next_stride;
 
-  `FFL(             popc_q,              popc_d, ff_en, '0);
-  `FFL(next_stride_first_q, next_stride_first_d, ff_en, '0);
-  `FFL( next_stride_zero_q,  next_stride_zero_d, ff_en, '0);
-  `FFL(     spare_stride_q,      spare_stride_d, ff_en, '0);
+  `FFL(             popc_q,              popc_d, ff_en, '0)
+  `FFL(next_stride_first_q, next_stride_first_d, ff_en, '0)
+  `FFL( next_stride_zero_q,  next_stride_zero_d, ff_en, '0)
+  `FFL(     spare_stride_q,      spare_stride_d, ff_en, '0)
 
   // Is the stride power of two?
   popcount #(
diff --git a/hardware/src/sldu/sldu.sv b/hardware/src/sldu/sldu.sv
index f582cdf4f..f493d10b1 100644
--- a/hardware/src/sldu/sldu.sv
+++ b/hardware/src/sldu/sldu.sv
@@ -612,7 +612,7 @@ module sldu import ara_pkg::*; import rvv_pkg::*; #(
           // Filled up a word to the VRF or finished the instruction
           if (out_pnt_d == NrLanes * 8 || issue_cnt_q <= byte_count) begin
             // Reset the pointer
-            out_pnt_d = vinsn_issue_q.vfu inside {VFU_Alu, VFU_MFpu} ? {'0, red_stride_cnt_d, 3'b0} : '0;
+            out_pnt_d = vinsn_issue_q.vfu inside {VFU_Alu, VFU_MFpu} ? {{idx_width(NrLanes*(StrbWidth-1)){1'b0}}, red_stride_cnt_d, 3'b0} : '0;
             // We used all the bits of the mask
             if (vinsn_issue_q.op inside {VSLIDEUP, VSLIDEDOWN})
               mask_ready_d = !vinsn_issue_q.vm;
@@ -754,7 +754,7 @@ module sldu import ara_pkg::*; import rvv_pkg::*; #(
           // Update the p2 stride
           p2_stride_gen_update_d = 1'b1;
           // Commit the final result
-          if (p2_stride_gen_popc_q == {'0, 1'b1} && result_queue_empty) begin
+          if (p2_stride_gen_popc_q == {{(idx_width(idx_width(8*NrLanes))-1){1'b0}}, 1'b1} && result_queue_empty) begin
             state_d = SLIDE_NP2_COMMIT;
             // Prepare the write pointer
             result_queue_write_pnt_d = NP2_RESULT_PNT;
diff --git a/hardware/src/vlsu/addrgen.sv b/hardware/src/vlsu/addrgen.sv
index 1ba67f650..b59487704 100644
--- a/hardware/src/vlsu/addrgen.sv
+++ b/hardware/src/vlsu/addrgen.sv
@@ -970,7 +970,7 @@ module addrgen import ara_pkg::*; import rvv_pkg::*; #(
                   // Generate an error
                   idx_op_error_d          = 1'b1;
                   // Forward next vstart info to the dispatcher
-                  addrgen_exception_vstart_d  = (addrgen_req.len - axi_addrgen_q.len) >> axi_addrgen_q.vew - 1;
+                  addrgen_exception_vstart_d  = (addrgen_req.len - axi_addrgen_q.len) >> (axi_addrgen_q.vew - 1);
                   addrgen_req_ready       = 1'b1;
                   axi_addrgen_state_d     = AXI_ADDRGEN_IDLE;
                 end : eew_misaligned_error
diff --git a/hardware/src/vlsu/vldu.sv b/hardware/src/vlsu/vldu.sv
index 37d7b8782..519ceb2a1 100644
--- a/hardware/src/vlsu/vldu.sv
+++ b/hardware/src/vlsu/vldu.sv
@@ -234,8 +234,6 @@ module vldu import ara_pkg::*; import rvv_pkg::*; #(
   logic [idx_width(AxiDataWidth/8):0]      axi_r_byte_pnt_d, axi_r_byte_pnt_q;
   // - A pointer to which byte in the full VRF word we are writing data into.
   logic [idx_width(DataWidth*NrLanes/8):0] vrf_word_byte_pnt_d, vrf_word_byte_pnt_q;
-  // - A pointer that indicates the start byte in the vrf word.
-  logic [$clog2(8*NrLanes)-1:0] vrf_word_start_byte;
 
   // A counter that follows the vrf_word_byte_pnt pointer, but without the vstart information
   // We can compare this counter witht the issue_cnt_bytes counter to find the last byte in
@@ -421,7 +419,7 @@ module vldu import ara_pkg::*; import rvv_pkg::*; #(
         vrf_word_byte_pnt_d   = '0;
         vrf_word_byte_cnt_d   = '0;
         // Account for the results that were issued
-        if (seq_word_wr_offset_q) begin
+        if (seq_word_wr_offset_q != '0) begin
           vrf_eff_write_bytes = (NrLanes * DataWidthB);
         end else begin
           // First payload of the vector instruction
@@ -464,6 +462,8 @@ module vldu import ara_pkg::*; import rvv_pkg::*; #(
 
         // Prepare for the next vector instruction
         if (vinsn_queue_d.issue_cnt != 0) begin : issue_cnt_bytes_update
+          // - A pointer that indicates the start byte in the vrf word.
+          automatic logic [$clog2(8*NrLanes)-1:0] vrf_word_start_byte;
           issue_cnt_bytes_d = (
                                 vinsn_queue_q.vinsn[vinsn_queue_d.issue_pnt].vl
                                 - vinsn_queue_q.vinsn[vinsn_queue_d.issue_pnt].vstart
@@ -649,6 +649,8 @@ module vldu import ara_pkg::*; import rvv_pkg::*; #(
 
       // New instruction with new vstart. Initialize the vrf byte ptr
       if (vinsn_queue_d.issue_cnt == '0) begin
+        // - A pointer that indicates the start byte in the vrf word.
+        automatic logic [$clog2(8*NrLanes)-1:0] vrf_word_start_byte;
         vrf_word_start_byte  = pe_req_i.vstart[$clog2(8*NrLanes)-1:0] << pe_req_i.vtype.vsew;
         vrf_word_byte_pnt_d  = {1'b0, vrf_word_start_byte[$clog2(8*NrLanes)-1:0]};
         vrf_word_byte_cnt_d  = '0;
diff --git a/hardware/src/vlsu/vlsu.sv b/hardware/src/vlsu/vlsu.sv
index 27397fa35..8cb33786e 100644
--- a/hardware/src/vlsu/vlsu.sv
+++ b/hardware/src/vlsu/vlsu.sv
@@ -109,7 +109,7 @@ module vlsu import ara_pkg::*; import rvv_pkg::*; #(
   logic stu_current_burst_exception, ldu_current_burst_exception;
   assign lsu_current_burst_exception_o = stu_current_burst_exception | ldu_current_burst_exception;
 
-  `FF(lsu_ex_flush_done_o, lsu_ex_flush_i, 1'b0, clk_i, rst_ni);
+  `FF(lsu_ex_flush_done_o, lsu_ex_flush_i, 1'b0, clk_i, rst_ni)
 
   ///////////////////
   //  Definitions  //
diff --git a/hardware/src/vlsu/vstu.sv b/hardware/src/vlsu/vstu.sv
index 845c81a37..811589139 100644
--- a/hardware/src/vlsu/vstu.sv
+++ b/hardware/src/vlsu/vstu.sv
@@ -227,7 +227,6 @@ module vstu import ara_pkg::*; import rvv_pkg::*; #(
   // When vstart > 0, the very first payload written to the VRF contains less than
   // (8 * NrLanes) bytes.
   logic [$clog2(8*NrLanes):0] first_payload_byte_d, first_payload_byte_q;
-  logic [$clog2(8*NrLanes):0] vrf_eff_write_bytes;
 
   // A counter that follows the vrf_word_byte_pnt pointer, but without the vstart information
   // We can compare this counter witht the issue_cnt_bytes counter to find the last byte in
@@ -367,6 +366,7 @@ module vstu import ara_pkg::*; import rvv_pkg::*; #(
 
         // We consumed a whole word from the lanes
         if (vrf_pnt_d == NrLanes*8 || vrf_cnt_d == issue_cnt_bytes_q) begin : vrf_word_done
+          automatic logic [$clog2(8*NrLanes):0] vrf_eff_write_bytes;
           // Reset the pointer in the VRF word
           vrf_pnt_d         = '0;
           vrf_cnt_d         = '0;

From 94578c5669ec9dbadf66ca576ee972e28e70e041 Mon Sep 17 00:00:00 2001
From: Michael Rogenmoser <michael@rogenmoser.us>
Date: Tue, 2 Jun 2026 02:04:33 +0200
Subject: [PATCH 4/6] [hardware] Keep latch-fix temporaries as named signals

PR review (hopang-0221): the automatic variables introduced to silence
slang's -Winferred-latch are not visible in waveforms, since they do not
exist as named signals in the design hierarchy.

Restore them as module-scope signals and break the inferred latches with
unconditional default assignments at the top of their always_comb blocks
instead. No functional change: each signal is always assigned before it
is read on every path that uses it, so the defaults only take effect on
paths that never read the signal and would otherwise infer a latch.

Affected: simd_alu (r), lane_sequencer (extra_stride/vl_tot),
masku (vrgat_res/vrgat_buf), vldu (vrf_word_start_byte),
vstu (vrf_eff_write_bytes), vmfpu (vfrec7/vfrsqrt7 scratch arrays,
narrowing_shuffled_result/narrowing_shuffle_be).

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 hardware/src/lane/lane_sequencer.sv | 15 +++++++++-----
 hardware/src/lane/simd_alu.sv       | 14 ++++++-------
 hardware/src/lane/vmfpu.sv          | 32 +++++++++++++++++++----------
 hardware/src/masku/masku.sv         | 15 +++++++++-----
 hardware/src/vlsu/vldu.sv           |  7 +++----
 hardware/src/vlsu/vstu.sv           |  3 ++-
 6 files changed, 52 insertions(+), 34 deletions(-)

diff --git a/hardware/src/lane/lane_sequencer.sv b/hardware/src/lane/lane_sequencer.sv
index cdf77f1e0..ceeb9c8e1 100644
--- a/hardware/src/lane/lane_sequencer.sv
+++ b/hardware/src/lane/lane_sequencer.sv
@@ -73,6 +73,12 @@ module lane_sequencer import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg::
   logic    pe_req_valid;
   logic    pe_req_ready;
 
+  // VSLIDEDOWN stride helpers (kept at module scope to remain visible in waveforms)
+  // Extra elements to ask, because of the stride
+  logic [$clog2(8*NrLanes)-1:0] extra_stride;
+  // Need one bit more than vl, since we will also add the stride contribution
+  logic [$bits(pe_req.vl):0]    vl_tot;
+
   fall_through_register #(
     .T(pe_req_t)
   ) i_pe_req_register (
@@ -279,6 +285,10 @@ module lane_sequencer import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg::
     operand_request    = '0;
     operand_request_push = '0;
 
+    // Default the slide unit stride helpers (avoids inferred latches)
+    extra_stride = '0;
+    vl_tot       = '0;
+
     // Make no requests to the lane's VFUs
     vfu_operation_d       = '0;
     vfu_operation_valid_d = 1'b0;
@@ -662,11 +672,6 @@ module lane_sequencer import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg::
               (pe_req.vl - pe_req.stride + NrLanes - 1) / NrLanes;
             end
             VSLIDEDOWN: begin
-              // Extra elements to ask, because of the stride
-              automatic logic [$clog2(8*NrLanes)-1:0] extra_stride;
-              // Need one bit more than vl, since we will also add the stride contribution
-              automatic logic [$bits(pe_req.vl):0] vl_tot;
-
               // We need to trim full words from the start of the vector that are not used
               // as operands by the slide unit.
               operand_request[SlideAddrGenA].vstart = pe_req.stride / NrLanes;
diff --git a/hardware/src/lane/simd_alu.sv b/hardware/src/lane/simd_alu.sv
index 33a752388..24ac835ed 100644
--- a/hardware/src/lane/simd_alu.sv
+++ b/hardware/src/lane/simd_alu.sv
@@ -55,6 +55,7 @@ module simd_alu import ara_pkg::*; import rvv_pkg::*; #(
   alu_sat_operand_t sat_sum, sat_sub;
   vxsat_t     vxsat;
   vxrm_t      vxrm;
+  logic       r;
 
   assign vxrm = vxrm_i;
   assign vxsat_o = vxsat;
@@ -113,6 +114,7 @@ module simd_alu import ara_pkg::*; import rvv_pkg::*; #(
     // Default assignment
     res       = '0;
     vxsat.w64 = '0;
+    r         = '0;
 
     if (valid_i)
       unique case (op_i)
@@ -182,9 +184,7 @@ module simd_alu import ara_pkg::*; import rvv_pkg::*; #(
                 res.w64[b]     = &vxsat.w64[b] ? (sum[63] ? {1'b0, {63{1'b1}}} : {1'b1, {63{1'b0}}} ) : sum[63:0];
               end
           endcase
-        VAADD, VAADDU: if (FixPtSupport == FixedPointEnable) begin
-          automatic logic r;
-          unique case (vew_i)
+        VAADD, VAADDU: if (FixPtSupport == FixedPointEnable) unique case (vew_i)
             EW8: for (int b = 0; b < 8; b++) begin
               automatic logic [ 8:0] sum = opa.w8 [b] + opb.w8 [b];
                 unique case (vxrm)
@@ -225,7 +225,7 @@ module simd_alu import ara_pkg::*; import rvv_pkg::*; #(
                 endcase
                 res.w64[b] = (op_i == VAADDU) ? sum[64:1] + r : {sum[63], sum[63:1]} + r;
               end
-          endcase end
+          endcase
         VADD, VADC, VMADC, VREDSUM, VWREDSUMU, VWREDSUM: unique case (vew_i)
             EW8: for (int b = 0; b < 8; b++) begin
                 automatic logic [ 8:0] sum = opa.w8 [b] + opb.w8 [b] +
@@ -324,9 +324,7 @@ module simd_alu import ara_pkg::*; import rvv_pkg::*; #(
               res.w64[b]     = &vxsat.w64[b] ? (opb.w64[b][63] ? 64'h8000000000000000 : 64'h7FFFFFFFFFFFFFFF) : sub[63:0];
           end
           endcase
-        VASUB, VASUBU: if (FixPtSupport == FixedPointEnable) begin
-        automatic logic r;
-        unique case (vew_i)
+        VASUB, VASUBU: if (FixPtSupport == FixedPointEnable) unique case (vew_i)
             EW8: for (int b = 0; b < 8; b++) begin
                 automatic logic [ 8:0] sub = opb.w8 [b] - opa.w8 [b];
                 unique case (vxrm)
@@ -367,7 +365,7 @@ module simd_alu import ara_pkg::*; import rvv_pkg::*; #(
                 endcase
                 res.w64[b] = (op_i == VASUBU) ? (sub[63:0] >> 1) + {63'b0, r} : $unsigned(($signed(sub[63:0]) >>> 1) + $signed({63'b0, r}));
               end
-          endcase end
+          endcase
 
         // Shift instructions
         VSLL: unique case (vew_i)
diff --git a/hardware/src/lane/vmfpu.sv b/hardware/src/lane/vmfpu.sv
index cdf8a7d48..4c296a3f0 100644
--- a/hardware/src/lane/vmfpu.sv
+++ b/hardware/src/lane/vmfpu.sv
@@ -280,6 +280,10 @@ module vmfpu import ara_pkg::*; import rvv_pkg::*; import fpnew_pkg::*;
   logic narrowing_select_in_d, narrowing_select_in_q;
   // Output selector, used to control the Result MUX and validate the results
   logic narrowing_select_out_d, narrowing_select_out_q;
+  // FPU SIMD result needs to be shuffled for narrowing instructions before commit
+  elen_t narrowing_shuffled_result;
+  // Helper signal to shuffle the narrowed result
+  logic [7:0] narrowing_shuffle_be;
 
   //////////////////
   //  Multiplier  //
@@ -1120,12 +1124,20 @@ module vmfpu import ara_pkg::*; import rvv_pkg::*; import fpnew_pkg::*;
 
     fpu_mask_t vfpu_flag_mask;
 
+    vf7_flag_out_e16 vfrec7_out_e16[4];
+    vf7_flag_out_e32 vfrec7_out_e32[2];
+    vf7_flag_out_e64 vfrec7_out_e64[1];
+
     status_t vfrec7_ex_flag, vfrsqrt7_ex_flag;
 
     roundmode_e fp_rm_process;
 
     elen_t [LatFNonComp:0]   operand_a_d, vfpu_flag_mask_d;
 
+    vf7_flag_out_e16 vfrsqrt7_out_e16[4];
+    vf7_flag_out_e32 vfrsqrt7_out_e32[2];
+    vf7_flag_out_e64 vfrsqrt7_out_e64[1];
+
     logic [15:0] lzc_e16;
     logic [9:0]  lzc_e32;
     logic [5:0]  lzc_e64;
@@ -1187,13 +1199,19 @@ module vmfpu import ara_pkg::*; import rvv_pkg::*; import fpnew_pkg::*;
     assign   fp_rm_process = vinsn_processing_q.fp_rm;
 
     always_comb begin: fpu_result_processing_p
+      // Default the vfrec7/vfrsqrt7 scratch arrays (avoids inferred latches)
+      vfrec7_out_e16   = '{default: '0};
+      vfrec7_out_e32   = '{default: '0};
+      vfrec7_out_e64   = '{default: '0};
+      vfrsqrt7_out_e16 = '{default: '0};
+      vfrsqrt7_out_e32 = '{default: '0};
+      vfrsqrt7_out_e64 = '{default: '0};
 
       if (FPExtSupport) begin
 
         // vfrec7 (only supported on 16, 32, 64-bit)
         unique case (vinsn_processing_q.vtype.vsew)
           EW16: begin
-            automatic vf7_flag_out_e16 vfrec7_out_e16[4];
             for (int h = 0; h < 4; h++) vfrec7_out_e16[h] =
               vfrec7_fp16(vfpu_result[h*16 +: 10], operand_a_delay[h*16 +: 16], fp_rm_process);
 
@@ -1206,7 +1224,6 @@ module vmfpu import ara_pkg::*; import rvv_pkg::*; import fpnew_pkg::*;
                             | (vfrec7_out_e16[0].ex_flag & {5{vfpu_flag_mask[0]}});
           end
           EW32: begin
-            automatic vf7_flag_out_e32 vfrec7_out_e32[2];
             for (int w = 0; w < 2; w++) vfrec7_out_e32[w] =
               vfrec7_fp32(vfpu_result[w*32 +: 10], operand_a_delay[w*32 +: 32], fp_rm_process);
 
@@ -1216,7 +1233,6 @@ module vmfpu import ara_pkg::*; import rvv_pkg::*; import fpnew_pkg::*;
                             | (vfrec7_out_e32[0].ex_flag & {5{vfpu_flag_mask[0]}});
           end
           EW64: begin
-            automatic vf7_flag_out_e64 vfrec7_out_e64[1];
             for (int d = 0; d < 1; d++) vfrec7_out_e64[d] =
               vfrec7_fp64(vfpu_result[d*64 +: 10], operand_a_delay[d*64 +: 64], fp_rm_process);
 
@@ -1233,7 +1249,6 @@ module vmfpu import ara_pkg::*; import rvv_pkg::*; import fpnew_pkg::*;
        // vfrsqrt7 (only supported on 16, 32, 64-bit)
         unique case (vinsn_processing_q.vtype.vsew)
           EW16: begin
-            automatic vf7_flag_out_e16 vfrsqrt7_out_e16[4];
             for (int h = 0; h < 4; h++) vfrsqrt7_out_e16[h] =
               vfrsqrt7_fp16(vfpu_result[h*16 +: 10], operand_a_delay[h*16 +: 16], lzc_e16[h*4 +: 4]);
 
@@ -1246,7 +1261,6 @@ module vmfpu import ara_pkg::*; import rvv_pkg::*; import fpnew_pkg::*;
                              | (vfrsqrt7_out_e16[0].ex_flag & {5{vfpu_flag_mask[0]}});
           end
           EW32: begin
-            automatic vf7_flag_out_e32 vfrsqrt7_out_e32[2];
             for (int w = 0; w < 2; w++) vfrsqrt7_out_e32[w] =
               vfrsqrt7_fp32(vfpu_result[w*32 +: 10], operand_a_delay[w*32 +: 32], lzc_e32[w*5 +: 5]);
 
@@ -1256,7 +1270,6 @@ module vmfpu import ara_pkg::*; import rvv_pkg::*; import fpnew_pkg::*;
                              | (vfrsqrt7_out_e32[0].ex_flag & {5{vfpu_flag_mask[0]}});
           end
           EW64: begin
-            automatic vf7_flag_out_e64 vfrsqrt7_out_e64[1];
             for (int d = 0; d < 1; d++) vfrsqrt7_out_e64[d] =
               vfrsqrt7_fp64(vfpu_result[d*64 +: 10], operand_a_delay[d*64 +: 64], lzc_e64[d*6 +: 6]);
 
@@ -1378,6 +1391,8 @@ module vmfpu import ara_pkg::*; import rvv_pkg::*; import fpnew_pkg::*;
 
     narrowing_select_in_d  = narrowing_select_in_q;
     narrowing_select_out_d = narrowing_select_out_q;
+    narrowing_shuffled_result = '0;
+    narrowing_shuffle_be      = '0;
 
     // Inform our status to the lane controller
     mfpu_ready_o      = !vinsn_queue_full;
@@ -1484,11 +1499,6 @@ module vmfpu import ara_pkg::*; import rvv_pkg::*; import fpnew_pkg::*;
 
     case (mfpu_state_q)
       NO_REDUCTION: begin
-        // FPU SIMD result needs to be shuffled for narrowing instructions before commit
-        automatic elen_t narrowing_shuffled_result;
-        // Helper signal to shuffle the narrowed result
-        automatic logic [7:0] narrowing_shuffle_be;
-
         vfpu_tag_in = mask_i;
 
         // Sign injection
diff --git a/hardware/src/masku/masku.sv b/hardware/src/masku/masku.sv
index 7202d5ccc..6a52c2462 100644
--- a/hardware/src/masku/masku.sv
+++ b/hardware/src/masku/masku.sv
@@ -673,6 +673,12 @@ module masku import ara_pkg::*; import rvv_pkg::*; #(
   // Information about which is the target FU of the request
   assign masku_operand_fu = (vinsn_issue.op inside {[VMFEQ:VMFGE]}) ? MaskFUMFpu : MaskFUAlu;
 
+  // VRGATHER/VCOMPRESS element buffers (kept at module scope to remain visible in waveforms)
+  // Buffer for the current element
+  logic [NrLanes*DataWidth-1:0] vrgat_res;
+  // Buffer for the current element
+  logic [DataWidth-1:0]         vrgat_buf;
+
   always_comb begin
     // Tail-agnostic bus
     alu_result          = '1;
@@ -688,6 +694,10 @@ module masku import ara_pkg::*; import rvv_pkg::*; #(
 
     vrgat_m_seq_bit = 1'b0;
 
+    // Default the VRGATHER/VCOMPRESS buffers (avoids inferred latches)
+    vrgat_res = '1;
+    vrgat_buf = '0;
+
     // The result mask should be created here since the output is a non-mask vector
     be_viota_seq_d = be_viota_seq_q;
 
@@ -802,11 +812,6 @@ module masku import ara_pkg::*; import rvv_pkg::*; #(
         // This operation writes vsew-bit elements with vtype.vsew encoding
         // The vd source can have a different encoding (it gets deshuffled in the masku_operand stage)
         [VRGATHER:VCOMPRESS]: begin
-          // Buffer for the current element
-          automatic logic [NrLanes*DataWidth-1:0] vrgat_res;
-          // Buffer for the current element
-          automatic logic [DataWidth-1:0] vrgat_buf;
-
           // Extract the correct elements
           vrgat_res = '1; // Default assignment
           vrgat_buf = masku_operand_vd_seq[vrgat_req_idx_q[idx_width(NrLanes*ELENB/8)-1:0] * 64 +: 64]; // Default assignment
diff --git a/hardware/src/vlsu/vldu.sv b/hardware/src/vlsu/vldu.sv
index 519ceb2a1..ae5997397 100644
--- a/hardware/src/vlsu/vldu.sv
+++ b/hardware/src/vlsu/vldu.sv
@@ -234,6 +234,8 @@ module vldu import ara_pkg::*; import rvv_pkg::*; #(
   logic [idx_width(AxiDataWidth/8):0]      axi_r_byte_pnt_d, axi_r_byte_pnt_q;
   // - A pointer to which byte in the full VRF word we are writing data into.
   logic [idx_width(DataWidth*NrLanes/8):0] vrf_word_byte_pnt_d, vrf_word_byte_pnt_q;
+  // - A pointer that indicates the start byte in the vrf word.
+  logic [$clog2(8*NrLanes)-1:0] vrf_word_start_byte;
 
   // A counter that follows the vrf_word_byte_pnt pointer, but without the vstart information
   // We can compare this counter witht the issue_cnt_bytes counter to find the last byte in
@@ -288,6 +290,7 @@ module vldu import ara_pkg::*; import rvv_pkg::*; #(
     seq_word_wr_offset_d = seq_word_wr_offset_q;
     first_payload_byte_d = first_payload_byte_q;
     vrf_word_byte_cnt_d  = vrf_word_byte_cnt_q;
+    vrf_word_start_byte  = '0;
 
     // Vector instructions currently running
     vinsn_running_d = vinsn_running_q & pe_vinsn_running_i;
@@ -462,8 +465,6 @@ module vldu import ara_pkg::*; import rvv_pkg::*; #(
 
         // Prepare for the next vector instruction
         if (vinsn_queue_d.issue_cnt != 0) begin : issue_cnt_bytes_update
-          // - A pointer that indicates the start byte in the vrf word.
-          automatic logic [$clog2(8*NrLanes)-1:0] vrf_word_start_byte;
           issue_cnt_bytes_d = (
                                 vinsn_queue_q.vinsn[vinsn_queue_d.issue_pnt].vl
                                 - vinsn_queue_q.vinsn[vinsn_queue_d.issue_pnt].vstart
@@ -649,8 +650,6 @@ module vldu import ara_pkg::*; import rvv_pkg::*; #(
 
       // New instruction with new vstart. Initialize the vrf byte ptr
       if (vinsn_queue_d.issue_cnt == '0) begin
-        // - A pointer that indicates the start byte in the vrf word.
-        automatic logic [$clog2(8*NrLanes)-1:0] vrf_word_start_byte;
         vrf_word_start_byte  = pe_req_i.vstart[$clog2(8*NrLanes)-1:0] << pe_req_i.vtype.vsew;
         vrf_word_byte_pnt_d  = {1'b0, vrf_word_start_byte[$clog2(8*NrLanes)-1:0]};
         vrf_word_byte_cnt_d  = '0;
diff --git a/hardware/src/vlsu/vstu.sv b/hardware/src/vlsu/vstu.sv
index 811589139..0f3ac84c0 100644
--- a/hardware/src/vlsu/vstu.sv
+++ b/hardware/src/vlsu/vstu.sv
@@ -227,6 +227,7 @@ module vstu import ara_pkg::*; import rvv_pkg::*; #(
   // When vstart > 0, the very first payload written to the VRF contains less than
   // (8 * NrLanes) bytes.
   logic [$clog2(8*NrLanes):0] first_payload_byte_d, first_payload_byte_q;
+  logic [$clog2(8*NrLanes):0] vrf_eff_write_bytes;
 
   // A counter that follows the vrf_word_byte_pnt pointer, but without the vstart information
   // We can compare this counter witht the issue_cnt_bytes counter to find the last byte in
@@ -249,6 +250,7 @@ module vstu import ara_pkg::*; import rvv_pkg::*; #(
     vinsn_valid_bytes = '0;
     axi_valid_bytes    = '0;
     valid_bytes = '0;
+    vrf_eff_write_bytes = '0;
 
     // Maintain state
     vinsn_queue_d = vinsn_queue_q;
@@ -366,7 +368,6 @@ module vstu import ara_pkg::*; import rvv_pkg::*; #(
 
         // We consumed a whole word from the lanes
         if (vrf_pnt_d == NrLanes*8 || vrf_cnt_d == issue_cnt_bytes_q) begin : vrf_word_done
-          automatic logic [$clog2(8*NrLanes):0] vrf_eff_write_bytes;
           // Reset the pointer in the VRF word
           vrf_pnt_d         = '0;
           vrf_cnt_d         = '0;

From 7af1b0c90517e7788705516395458cf4978f078a Mon Sep 17 00:00:00 2001
From: Michael Rogenmoser <michael@rogenmoser.us>
Date: Tue, 2 Jun 2026 02:09:53 +0200
Subject: [PATCH 5/6] [hardware] Fix VSSUB saturation check (OR-reduce, not
 AND)

PR review (hopang-0221): unlike VSSUBU, the VSSUB saturation flag is not
replicated across all bits of vxsat.w{16,32,64}[b] (only the LSB is set),
so the AND-reduction introduced for the slang -Wint-bool-conv fix is
always 0 and saturation never triggers. Use OR-reduction instead, which
matches the original non-zero test and clears the warning.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 hardware/src/lane/simd_alu.sv | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/hardware/src/lane/simd_alu.sv b/hardware/src/lane/simd_alu.sv
index 24ac835ed..48d22038b 100644
--- a/hardware/src/lane/simd_alu.sv
+++ b/hardware/src/lane/simd_alu.sv
@@ -309,19 +309,19 @@ module simd_alu import ara_pkg::*; import rvv_pkg::*; #(
               automatic logic [16:0] sub = opb.w16[b] - opa.w16[b];
               vxsat.w16[b]   = (!opb.w16[b][15] & opa.w16[b][15] & sub[15]) |
                                (opb.w16[b][15] & !opa.w16[b][15] & !sub[15]);
-              res.w16[b]     = &vxsat.w16[b] ? (opb.w16[b][15] ? 16'h8000 : 16'h7FFF) : sub[15:0];
+              res.w16[b]     = |vxsat.w16[b] ? (opb.w16[b][15] ? 16'h8000 : 16'h7FFF) : sub[15:0];
           end
           EW32: for (int b = 0; b < 2; b++) begin
               automatic logic [32:0] sub = opb.w32[b] - opa.w32[b];
               vxsat.w32[b]   = (!opb.w32[b][31] & opa.w32[b][31] & sub[31]) |
                                (opb.w32[b][31] & !opa.w32[b][31] & !sub[31]);
-              res.w32[b]     = &vxsat.w32[b] ? (opb.w32[b][31] ? 32'h80000000 : 32'h7FFFFFFF) : sub[31:0];
+              res.w32[b]     = |vxsat.w32[b] ? (opb.w32[b][31] ? 32'h80000000 : 32'h7FFFFFFF) : sub[31:0];
           end
           EW64: for (int b = 0; b < 1; b++) begin
               automatic logic [64:0] sub = opb.w64[b] - opa.w64[b];
               vxsat.w64[b]   = (!opb.w64[b][63] & opa.w64[b][63] & sub[63]) |
                                (opb.w64[b][63] & !opa.w64[b][63] & !sub[63]);
-              res.w64[b]     = &vxsat.w64[b] ? (opb.w64[b][63] ? 64'h8000000000000000 : 64'h7FFFFFFFFFFFFFFF) : sub[63:0];
+              res.w64[b]     = |vxsat.w64[b] ? (opb.w64[b][63] ? 64'h8000000000000000 : 64'h7FFFFFFFFFFFFFFF) : sub[63:0];
           end
           endcase
         VASUB, VASUBU: if (FixPtSupport == FixedPointEnable) unique case (vew_i)

From 6fe2091634c81cd1a46c6a5644b9869fc866a2dc Mon Sep 17 00:00:00 2001
From: Michael Rogenmoser <michael@rogenmoser.us>
Date: Tue, 2 Jun 2026 02:09:53 +0200
Subject: [PATCH 6/6] [hardware] Simplify sldu zero-padding for readability

PR review (hopang-0221): replace the verbose explicit zero-pad
replications (added to avoid an unsized '0 inside a concatenation) with
equivalent implicit zero-extension and a plain literal comparison.
out_pnt_d is wide enough to hold {red_stride_cnt_d, 3'b0}, and the popc
compare is against the value 1 - no functional change.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 hardware/src/sldu/sldu.sv | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/hardware/src/sldu/sldu.sv b/hardware/src/sldu/sldu.sv
index f493d10b1..14770641a 100644
--- a/hardware/src/sldu/sldu.sv
+++ b/hardware/src/sldu/sldu.sv
@@ -612,7 +612,7 @@ module sldu import ara_pkg::*; import rvv_pkg::*; #(
           // Filled up a word to the VRF or finished the instruction
           if (out_pnt_d == NrLanes * 8 || issue_cnt_q <= byte_count) begin
             // Reset the pointer
-            out_pnt_d = vinsn_issue_q.vfu inside {VFU_Alu, VFU_MFpu} ? {{idx_width(NrLanes*(StrbWidth-1)){1'b0}}, red_stride_cnt_d, 3'b0} : '0;
+            out_pnt_d = vinsn_issue_q.vfu inside {VFU_Alu, VFU_MFpu} ? {red_stride_cnt_d, 3'b0} : '0;
             // We used all the bits of the mask
             if (vinsn_issue_q.op inside {VSLIDEUP, VSLIDEDOWN})
               mask_ready_d = !vinsn_issue_q.vm;
@@ -754,7 +754,7 @@ module sldu import ara_pkg::*; import rvv_pkg::*; #(
           // Update the p2 stride
           p2_stride_gen_update_d = 1'b1;
           // Commit the final result
-          if (p2_stride_gen_popc_q == {{(idx_width(idx_width(8*NrLanes))-1){1'b0}}, 1'b1} && result_queue_empty) begin
+          if (p2_stride_gen_popc_q == 1 && result_queue_empty) begin
             state_d = SLIDE_NP2_COMMIT;
             // Prepare the write pointer
             result_queue_write_pnt_d = NP2_RESULT_PNT;