From fc2a433966a61bbbd6edde91f1d877cc9efe903e Mon Sep 17 00:00:00 2001
From: Matteo Perotti <mperotti@iis.ee.ethz.ch>
Date: Sun, 20 Nov 2022 20:17:50 +0100
Subject: [PATCH 1/8] [hardware] Handle WAW and WAR `vload` hazards in the
 `VLDU`

Before this commit, all the hazards (RAW, WAR, WAW) are handled
by the operand requesters that throttle access to source reg elements.
Even if the hazard is a WAR/WAW, the suboptimal but efficient way to
deal with it is to slow down the source reg fetch.
If an instruction does not have source regs, this cannot happen. For
example, load instructions. Therefore, all the instructions that do
not have vector source operands are stalled in the sequencer.
Loads are super common, and stalling in the main sequencer means
that all the instructions after the load are also stalled and cannot
start their execution.
Therefore, loads are now issued anyway, and the hazard check is done
inside the VLDU. The write-back request is masked until there are no
more hazards on that load instruction.
---
 hardware/src/ara.sv           | 13 ++++++++++++-
 hardware/src/ara_sequencer.sv | 28 +++++++++++++++++++++++++---
 hardware/src/vlsu/vldu.sv     | 16 +++++++++++++++-
 hardware/src/vlsu/vlsu.sv     |  6 ++++++
 4 files changed, 58 insertions(+), 5 deletions(-)

diff --git a/hardware/src/ara.sv b/hardware/src/ara.sv
index 7668fef06..15bfb7fa5 100644
--- a/hardware/src/ara.sv
+++ b/hardware/src/ara.sv
@@ -137,6 +137,10 @@ module ara import ara_pkg::*; #(
   logic [NrVInsn-1:0][NrVInsn-1:0] global_hazard_table;
   // Ready for lane 0 (scalar operand fwd)
   logic pe_scalar_resp_ready;
+  // VLDU Hazard checking
+  vid_t                         vldu_commit_id;
+  logic                         vldu_commit_id_valid;
+  logic                         vldu_hazard;
 
   // Mask unit operands
   elen_t     [NrLanes-1:0][NrMaskFUnits+2-1:0] masku_operand;
@@ -178,7 +182,11 @@ module ara import ara_pkg::*; #(
     // Interface with the address generator
     .addrgen_ack_i         (addrgen_ack              ),
     .addrgen_error_i       (addrgen_error            ),
-    .addrgen_error_vl_i    (addrgen_error_vl         )
+    .addrgen_error_vl_i    (addrgen_error_vl         ),
+    // Interface with the VLDU for hazard handling
+    .vldu_commit_id_i      (vldu_commit_id           ),
+    .vldu_commit_id_valid_i(vldu_commit_id_valid     ),
+    .vldu_hazard_o         (vldu_hazard              )
   );
 
   // Scalar move support
@@ -344,6 +352,9 @@ module ara import ara_pkg::*; #(
     .addrgen_ack_o              (addrgen_ack                                           ),
     .addrgen_error_o            (addrgen_error                                         ),
     .addrgen_error_vl_o         (addrgen_error_vl                                      ),
+    .commit_id_o                (vldu_commit_id                                        ),
+    .commit_id_valid_o          (vldu_commit_id_valid                                  ),
+    .hazard_i                   (vldu_hazard                                           ),
     // Interface with the Mask unit
     .mask_i                     (mask                                                  ),
     .mask_valid_i               (mask_valid                                            ),
diff --git a/hardware/src/ara_sequencer.sv b/hardware/src/ara_sequencer.sv
index 348c01107..8355a97de 100644
--- a/hardware/src/ara_sequencer.sv
+++ b/hardware/src/ara_sequencer.sv
@@ -41,7 +41,11 @@ module ara_sequencer import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg::i
     // Interface with the Address Generation
     input  logic                            addrgen_ack_i,
     input  logic                            addrgen_error_i,
-    input  vlen_t                           addrgen_error_vl_i
+    input  vlen_t                           addrgen_error_vl_i,
+    // Interface with the VLDU to handle load WAW and WAR hazards
+    input  vid_t                            vldu_commit_id_i,
+    input  logic                            vldu_commit_id_valid_i,
+    output logic                            vldu_hazard_o
   );
 
   ///////////////////////////////////
@@ -261,6 +265,9 @@ module ara_sequencer import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg::i
     write_list_d          = write_list_q;
     global_hazard_table_d = global_hazard_table_o;
 
+    // No hazard check requested
+    vldu_hazard_o = 1'b0;
+
     // Maintain request
     pe_req_d       = '0;
     pe_req_valid_d = 1'b0;
@@ -370,10 +377,13 @@ module ara_sequencer import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg::i
                                                 pe_req_d.hazard_vs1 | pe_req_d.hazard_vs2;
 
             // We only issue instructions that take no operands if they have no hazards.
+            // Exception to this rule: loads, as they are super common. WAW and WAR hazards
+            // on load instructions are handled in the VLDU.
             // Moreover, SLIDE instructions cannot be always chained
             // ToDo: optimize the case for vslide1down, vslide1up (wait 2 cycles, then chain)
-            if (!(|{ara_req_i.use_vs1, ara_req_i.use_vs2, ara_req_i.use_vd_op, !ara_req_i.vm}) &&
-                |{pe_req_d.hazard_vs1, pe_req_d.hazard_vs2, pe_req_d.hazard_vm, pe_req_d.hazard_vd} ||
+            if ((!(|{ara_req_i.use_vs1, ara_req_i.use_vs2, ara_req_i.use_vd_op, !ara_req_i.vm})              &&
+                |{pe_req_d.hazard_vs1, pe_req_d.hazard_vs2, pe_req_d.hazard_vm, pe_req_d.hazard_vd}          &&
+                !(is_load(pe_req_d.op)))                                                                     ||
                 (pe_req_d.op == VSLIDEUP && |{pe_req_d.hazard_vd, pe_req_d.hazard_vs1, pe_req_d.hazard_vs2}) ||
                 (pe_req_d.op == VSLIDEDOWN && |{pe_req_d.hazard_vs1, pe_req_d.hazard_vs2}))
             begin
@@ -453,6 +463,18 @@ module ara_sequencer import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg::i
       end
     endcase
 
+    // Load-related hazards handling
+    // Loads are masters on the x-bar to write the in-lane VRF. Nevertheless,
+    // they can have WAR or WAW dependencies. When there is a load in the load
+    // unit, its hazard bit is always checked and cleared here as soon as the
+    // dependency does not exist anymore. Whenever the hazard bit is set,
+    // the load cannot issue requests.
+    // It's safe to pipeline vldu_hazard_o if the timing is tight.
+    // (if so, add a sync signal)
+    if (vldu_commit_id_valid_i) begin
+      vldu_hazard_o = |global_hazard_table_o[vldu_commit_id_i];
+    end
+
     // Update the global hazard table
     for (int id = 0; id < NrVInsn; id++) global_hazard_table_d[id] &= vinsn_running_d;
   end : p_sequencer
diff --git a/hardware/src/vlsu/vldu.sv b/hardware/src/vlsu/vldu.sv
index 51042ed8e..61b26623a 100644
--- a/hardware/src/vlsu/vldu.sv
+++ b/hardware/src/vlsu/vldu.sv
@@ -33,6 +33,10 @@ module vldu import ara_pkg::*; import rvv_pkg::*; #(
     input  logic             [NrVInsn-1:0] pe_vinsn_running_i,
     output logic                           pe_req_ready_o,
     output pe_resp_t                       pe_resp_o,
+    // Hazard handling to main sequencer
+    output vid_t                           commit_id_o,
+    output logic                           commit_id_valid_o,
+    input  logic                           hazard_i,
     // Interface with the address generator
     input  addrgen_axi_req_t               axi_addrgen_req_i,
     input  logic                           axi_addrgen_req_valid_i,
@@ -101,6 +105,9 @@ module vldu import ara_pkg::*; import rvv_pkg::*; #(
   logic    vinsn_commit_valid;
   assign vinsn_commit       = vinsn_queue_q.vinsn[vinsn_queue_q.commit_pnt];
   assign vinsn_commit_valid = (vinsn_queue_q.commit_cnt != '0);
+  // To the main sequencer, for hazard checking
+  assign commit_id_valid_o = vinsn_commit_valid;
+  assign commit_id_o       = vinsn_commit.id;
 
   always_ff @(posedge clk_i or negedge rst_ni) begin
     if (!rst_ni) begin
@@ -354,7 +361,10 @@ module vldu import ara_pkg::*; import rvv_pkg::*; #(
     //////////////////////////////////
 
     for (int lane = 0; lane < NrLanes; lane++) begin: result_write
-      ldu_result_req_o[lane]   = result_queue_valid_q[result_queue_read_pnt_q][lane];
+      // Create a request only if there are no more hazards on vd (check vs1 since the info about
+      // hazard vd is also there)
+      ldu_result_req_o[lane]   = result_queue_valid_q[result_queue_read_pnt_q][lane] &&
+                                 !vinsn_commit.hazard_vs1;
       ldu_result_addr_o[lane]  = result_queue_q[result_queue_read_pnt_q][lane].addr;
       ldu_result_id_o[lane]    = result_queue_q[result_queue_read_pnt_q][lane].id;
       ldu_result_wdata_o[lane] = result_queue_q[result_queue_read_pnt_q][lane].wdata;
@@ -415,6 +425,10 @@ module vldu import ara_pkg::*; import rvv_pkg::*; #(
             vinsn_queue_d.commit_pnt].vtype.vsew);
     end
 
+    // Update the Vd hazard bit for the current instruction
+    // hazard_vs1, hazard_vs2, hazard_vm all contain the info about hazard_vd, so work on one of them (vs1)
+    if (commit_id_valid_o) vinsn_queue_d.vinsn[vinsn_queue_q.commit_pnt].hazard_vs1 &= {NrVInsn{hazard_i}};
+
     //////////////////////////////
     //  Accept new instruction  //
     //////////////////////////////
diff --git a/hardware/src/vlsu/vlsu.sv b/hardware/src/vlsu/vlsu.sv
index aa2e05283..448b53a87 100644
--- a/hardware/src/vlsu/vlsu.sv
+++ b/hardware/src/vlsu/vlsu.sv
@@ -44,6 +44,9 @@ module vlsu import ara_pkg::*; import rvv_pkg::*; #(
     output logic                    addrgen_ack_o,
     output logic                    addrgen_error_o,
     output vlen_t                   addrgen_error_vl_o,
+    output vid_t                    commit_id_o,
+    output logic                    commit_id_valid_o,
+    input  logic                    hazard_i,
     // Interface with the lanes
     // Store unit operands
     input  elen_t     [NrLanes-1:0] stu_operand_i,
@@ -172,6 +175,9 @@ module vlsu import ara_pkg::*; import rvv_pkg::*; #(
     .pe_vinsn_running_i     (pe_vinsn_running_i        ),
     .pe_req_ready_o         (pe_req_ready_o[OffsetLoad]),
     .pe_resp_o              (pe_resp_o[OffsetLoad]     ),
+    .commit_id_o            (commit_id_o               ),
+    .commit_id_valid_o      (commit_id_valid_o         ),
+    .hazard_i               (hazard_i                  ),
     // Interface with the address generator
     .axi_addrgen_req_i      (axi_addrgen_req           ),
     .axi_addrgen_req_valid_i(axi_addrgen_req_valid     ),

From e9a9da3c5c22bfd010ebb7aa6592974a8e3c7c10 Mon Sep 17 00:00:00 2001
From: Matteo Perotti <mperotti@iis.ee.ethz.ch>
Date: Sun, 20 Nov 2022 20:24:59 +0100
Subject: [PATCH 2/8] [hardware] :bug: Decouple cmdBuffer and dataBuffer depths
 in opQueues

---
 hardware/src/lane/operand_queue.sv        |  13 +--
 hardware/src/lane/operand_queues_stage.sv | 103 ++++++++++++----------
 2 files changed, 63 insertions(+), 53 deletions(-)

diff --git a/hardware/src/lane/operand_queue.sv b/hardware/src/lane/operand_queue.sv
index fe40a291b..72c8202e1 100644
--- a/hardware/src/lane/operand_queue.sv
+++ b/hardware/src/lane/operand_queue.sv
@@ -9,7 +9,8 @@
 // need it.
 
 module operand_queue import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg::idx_width; #(
-    parameter  int           unsigned BufferDepth    = 2,
+    parameter  int           unsigned CmdBufDepth    = 2,
+    parameter  int           unsigned DataBufDepth   = 2,
     parameter  int           unsigned NrSlaves       = 1,
     parameter  int           unsigned NrLanes        = 0,
     // Support for floating-point data types
@@ -52,7 +53,7 @@ module operand_queue import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg::i
   logic               cmd_pop;
 
   fifo_v3 #(
-    .DEPTH(BufferDepth        ),
+    .DEPTH(CmdBufDepth        ),
     .dtype(operand_queue_cmd_t)
   ) i_cmd_buffer (
     .clk_i     (clk_i                    ),
@@ -79,8 +80,8 @@ module operand_queue import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg::i
   logic  ibuf_pop;
 
   fifo_v3 #(
-    .DEPTH     (BufferDepth),
-    .DATA_WIDTH(DataWidth  )
+    .DEPTH     (DataBufDepth),
+    .DATA_WIDTH(DataWidth   )
   ) i_input_buffer (
     .clk_i     (clk_i          ),
     .rst_ni    (rst_ni         ),
@@ -98,7 +99,7 @@ module operand_queue import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg::i
 
   // We used a credit based system, to ensure that the FIFO is always
   // able to accept a request.
-  logic [idx_width(BufferDepth):0] ibuf_usage_d, ibuf_usage_q;
+  logic [idx_width(DataBufDepth):0] ibuf_usage_d, ibuf_usage_q;
 
   always_comb begin: p_ibuf_usage
     // Maintain state
@@ -110,7 +111,7 @@ module operand_queue import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg::i
     if (ibuf_pop) ibuf_usage_d -= 1;
 
     // Are we ready?
-    operand_queue_ready_o = (ibuf_usage_q != BufferDepth);
+    operand_queue_ready_o = (ibuf_usage_q != DataBufDepth);
   end
 
   always_ff @(posedge clk_i or negedge rst_ni) begin: p_ibuf_usage_ff
diff --git a/hardware/src/lane/operand_queues_stage.sv b/hardware/src/lane/operand_queues_stage.sv
index dab636d07..5ed714522 100644
--- a/hardware/src/lane/operand_queues_stage.sv
+++ b/hardware/src/lane/operand_queues_stage.sv
@@ -52,14 +52,15 @@ module operand_queues_stage import ara_pkg::*; import rvv_pkg::*; import cf_math
   ///////////
 
   operand_queue #(
-    .BufferDepth   (5         ),
-    .FPUSupport    (FPUSupport),
-    .NrLanes       (NrLanes   ),
-    .SupportIntExt2(1'b1      ),
-    .SupportIntExt4(1'b1      ),
-    .SupportIntExt8(1'b1      ),
-    .SupportReduct (1'b1      ),
-    .SupportNtrVal (1'b0      )
+    .CmdBufDepth   (ValuInsnQueueDepth),
+    .DataBufDepth  (5                 ),
+    .FPUSupport    (FPUSupport        ),
+    .NrLanes       (NrLanes           ),
+    .SupportIntExt2(1'b1              ),
+    .SupportIntExt4(1'b1              ),
+    .SupportIntExt8(1'b1              ),
+    .SupportReduct (1'b1              ),
+    .SupportNtrVal (1'b0              )
   ) i_operand_queue_alu_a (
     .clk_i                    (clk_i                          ),
     .rst_ni                   (rst_ni                         ),
@@ -77,14 +78,15 @@ module operand_queues_stage import ara_pkg::*; import rvv_pkg::*; import cf_math
   );
 
   operand_queue #(
-    .BufferDepth   (5         ),
-    .FPUSupport    (FPUSupport),
-    .NrLanes       (NrLanes   ),
-    .SupportIntExt2(1'b1      ),
-    .SupportIntExt4(1'b1      ),
-    .SupportIntExt8(1'b1      ),
-    .SupportReduct (1'b1      ),
-    .SupportNtrVal (1'b1      )
+    .CmdBufDepth   (ValuInsnQueueDepth),
+    .DataBufDepth  (5                 ),
+    .FPUSupport    (FPUSupport        ),
+    .NrLanes       (NrLanes           ),
+    .SupportIntExt2(1'b1              ),
+    .SupportIntExt4(1'b1              ),
+    .SupportIntExt8(1'b1              ),
+    .SupportReduct (1'b1              ),
+    .SupportNtrVal (1'b1              )
   ) i_operand_queue_alu_b (
     .clk_i                    (clk_i                          ),
     .rst_ni                   (rst_ni                         ),
@@ -106,12 +108,13 @@ module operand_queues_stage import ara_pkg::*; import rvv_pkg::*; import cf_math
   //////////////////////
 
   operand_queue #(
-    .BufferDepth   (5         ),
-    .FPUSupport    (FPUSupport),
-    .NrLanes       (NrLanes   ),
-    .SupportIntExt2(1'b1      ),
-    .SupportReduct (1'b1      ),
-    .SupportNtrVal (1'b0      )
+    .CmdBufDepth   (MfpuInsnQueueDepth ),
+    .DataBufDepth  (5                  ),
+    .FPUSupport    (FPUSupport         ),
+    .NrLanes       (NrLanes            ),
+    .SupportIntExt2(1'b1               ),
+    .SupportReduct (1'b1               ),
+    .SupportNtrVal (1'b0               )
   ) i_operand_queue_mfpu_a (
     .clk_i                    (clk_i                             ),
     .rst_ni                   (rst_ni                            ),
@@ -129,12 +132,13 @@ module operand_queues_stage import ara_pkg::*; import rvv_pkg::*; import cf_math
   );
 
   operand_queue #(
-    .BufferDepth   (5         ),
-    .FPUSupport    (FPUSupport),
-    .NrLanes       (NrLanes   ),
-    .SupportIntExt2(1'b1      ),
-    .SupportReduct (1'b1      ),
-    .SupportNtrVal (1'b1      )
+    .CmdBufDepth   (MfpuInsnQueueDepth ),
+    .DataBufDepth  (5                  ),
+    .FPUSupport    (FPUSupport         ),
+    .NrLanes       (NrLanes            ),
+    .SupportIntExt2(1'b1               ),
+    .SupportReduct (1'b1               ),
+    .SupportNtrVal (1'b1               )
   ) i_operand_queue_mfpu_b (
     .clk_i                    (clk_i                             ),
     .rst_ni                   (rst_ni                            ),
@@ -152,12 +156,13 @@ module operand_queues_stage import ara_pkg::*; import rvv_pkg::*; import cf_math
   );
 
   operand_queue #(
-    .BufferDepth   (5         ),
-    .FPUSupport    (FPUSupport),
-    .NrLanes       (NrLanes   ),
-    .SupportIntExt2(1'b1      ),
-    .SupportReduct (1'b1      ),
-    .SupportNtrVal (1'b1      )
+    .CmdBufDepth   (MfpuInsnQueueDepth ),
+    .DataBufDepth  (5                  ),
+    .FPUSupport    (FPUSupport         ),
+    .NrLanes       (NrLanes            ),
+    .SupportIntExt2(1'b1               ),
+    .SupportReduct (1'b1               ),
+    .SupportNtrVal (1'b1               )
   ) i_operand_queue_mfpu_c (
     .clk_i                    (clk_i                             ),
     .rst_ni                   (rst_ni                            ),
@@ -179,9 +184,10 @@ module operand_queues_stage import ara_pkg::*; import rvv_pkg::*; import cf_math
   ///////////////////////
 
   operand_queue #(
-    .BufferDepth(2         ),
-    .FPUSupport (FPUSupport),
-    .NrLanes    (NrLanes   )
+    .CmdBufDepth   (VstuInsnQueueDepth + MaskuInsnQueueDepth),
+    .DataBufDepth  (2                                       ),
+    .FPUSupport    (FPUSupport                              ),
+    .NrLanes       (NrLanes                                 )
   ) i_operand_queue_st_mask_a (
     .clk_i                    (clk_i                         ),
     .rst_ni                   (rst_ni                        ),
@@ -203,9 +209,10 @@ module operand_queues_stage import ara_pkg::*; import rvv_pkg::*; import cf_math
    ****************/
 
   operand_queue #(
-    .BufferDepth(2         ),
-    .FPUSupport (FPUSupport),
-    .NrLanes    (NrLanes   )
+    .CmdBufDepth   (VlduInsnQueueDepth),
+    .DataBufDepth  (2                 ),
+    .FPUSupport    (FPUSupport        ),
+    .NrLanes       (NrLanes           )
   ) i_operand_queue_slide_addrgen_a (
     .clk_i                    (clk_i                                         ),
     .rst_ni                   (rst_ni                                        ),
@@ -227,11 +234,12 @@ module operand_queues_stage import ara_pkg::*; import rvv_pkg::*; import cf_math
   /////////////////
 
   operand_queue #(
-    .BufferDepth(1         ),
-    .FPUSupport (FPUSupport),
-    .SupportIntExt2(1'b1),
-    .SupportIntExt4(1'b1),
-    .SupportIntExt8(1'b1),
+    .CmdBufDepth   (MaskuInsnQueueDepth),
+    .DataBufDepth  (1                  ),
+    .FPUSupport    (FPUSupport         ),
+    .SupportIntExt2(1'b1               ),
+    .SupportIntExt4(1'b1               ),
+    .SupportIntExt8(1'b1               ),
     .NrLanes    (NrLanes   )
   ) i_operand_queue_mask_b (
     .clk_i                    (clk_i                           ),
@@ -250,8 +258,9 @@ module operand_queues_stage import ara_pkg::*; import rvv_pkg::*; import cf_math
   );
 
   operand_queue #(
-    .BufferDepth(1         ),
-    .NrLanes    (NrLanes   )
+    .CmdBufDepth   (MaskuInsnQueueDepth),
+    .DataBufDepth  (1                  ),
+    .NrLanes       (NrLanes            )
   ) i_operand_queue_mask_m (
     .clk_i                    (clk_i                           ),
     .rst_ni                   (rst_ni                          ),

From 630ef8e812a4fc7be045b3d57ad9589e8a5ae96d Mon Sep 17 00:00:00 2001
From: Matteo Perotti <mperotti@iis.ee.ethz.ch>
Date: Sun, 20 Nov 2022 20:30:11 +0100
Subject: [PATCH 3/8] [hardware] Parametrize addrgen queue depth

---
 hardware/include/ara_pkg.sv  | 1 +
 hardware/src/vlsu/addrgen.sv | 4 ++--
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/hardware/include/ara_pkg.sv b/hardware/include/ara_pkg.sv
index 593967a7e..7b18e8597 100644
--- a/hardware/include/ara_pkg.sv
+++ b/hardware/include/ara_pkg.sv
@@ -86,6 +86,7 @@ package ara_pkg;
   localparam int unsigned ValuInsnQueueDepth = 4;
   localparam int unsigned VlduInsnQueueDepth = 4;
   localparam int unsigned VstuInsnQueueDepth = 4;
+  localparam int unsigned VaddrgenInsnQueueDepth = 4;
   localparam int unsigned SlduInsnQueueDepth = 2;
   localparam int unsigned NoneInsnQueueDepth = 1;
   // Ara supports MaskuInsnQueueDepth = 1 only.
diff --git a/hardware/src/vlsu/addrgen.sv b/hardware/src/vlsu/addrgen.sv
index def21df8e..2fbe05e55 100644
--- a/hardware/src/vlsu/addrgen.sv
+++ b/hardware/src/vlsu/addrgen.sv
@@ -89,8 +89,8 @@ module addrgen import ara_pkg::*; import rvv_pkg::*; #(
   logic             axi_addrgen_queue_empty;
 
   fifo_v3 #(
-    .DEPTH(4                ),
-    .dtype(addrgen_axi_req_t)
+    .DEPTH(VaddrgenInsnQueueDepth),
+    .dtype(addrgen_axi_req_t     )
   ) i_addrgen_req_queue (
     .clk_i     (clk_i                                                    ),
     .rst_ni    (rst_ni                                                   ),

From 4522f6cc168d05d44639b7e7007066fa39ae80ee Mon Sep 17 00:00:00 2001
From: Matteo Perotti <mperotti@iis.ee.ethz.ch>
Date: Mon, 21 Nov 2022 00:19:56 +0100
Subject: [PATCH 4/8] [hardware] Add support for Barber's Pole VRF layout

With Barber Pole layout, the PEs can almost always increment
the address by 1 when writing back new data into the VRF.
Only the Slide Unit needs some special treatment, as its
start address comes with an offset.
Remember that the VRF layout should also be consistent
across different LMUL settings, i.e., when LMUL > 1 and
we pass from reg N to reg N+1, we must also take into
account that reg N+1 has a different starting position
for element 0.
---
 hardware/include/ara_pkg.sv            |  5 --
 hardware/include/ara_vaddr.svh         | 80 ++++++++++++++++++++++++++
 hardware/src/ara.sv                    |  6 +-
 hardware/src/lane/lane.sv              |  3 +-
 hardware/src/lane/operand_requester.sv | 22 +++++--
 hardware/src/lane/valu.sv              | 35 ++++++++---
 hardware/src/lane/vector_fus_stage.sv  | 12 ++--
 hardware/src/lane/vmfpu.sv             | 47 +++++++++++----
 hardware/src/masku/masku.sv            | 39 ++++++++++---
 hardware/src/sldu/sldu.sv              | 41 ++++++++++---
 hardware/src/vlsu/vldu.sv              | 45 +++++++++++----
 hardware/src/vlsu/vlsu.sv              |  3 +-
 12 files changed, 265 insertions(+), 73 deletions(-)
 create mode 100644 hardware/include/ara_vaddr.svh

diff --git a/hardware/include/ara_pkg.sv b/hardware/include/ara_pkg.sv
index 7b18e8597..6fe041e70 100644
--- a/hardware/include/ara_pkg.sv
+++ b/hardware/include/ara_pkg.sv
@@ -878,11 +878,6 @@ package ara_pkg;
   // Each lane has eight VRF banks
   localparam int unsigned NrVRFBanksPerLane = 8;
 
-  // Find the starting address of a vector register vid
-  function automatic logic [63:0] vaddr(logic [4:0] vid, int NrLanes);
-    vaddr = vid * (VLENB / NrLanes / 8);
-  endfunction: vaddr
-
   // Differenciate between SLDU and ADDRGEN operands from opqueue
   typedef enum logic {
     ALU_SLDU     = 1'b0,
diff --git a/hardware/include/ara_vaddr.svh b/hardware/include/ara_vaddr.svh
new file mode 100644
index 000000000..3cd9f9ce8
--- /dev/null
+++ b/hardware/include/ara_vaddr.svh
@@ -0,0 +1,80 @@
+// Copyright 2021 ETH Zurich and University of Bologna.
+// Solderpad Hardware License, Version 0.51, see LICENSE for details.
+// SPDX-License-Identifier: SHL-0.51
+//
+// Author: Matteo Perotti <mperotti@iis.ee.ethz.ch>
+// Description:
+// Ara's functions to calculate VRF addresses. Not in the package
+// since the functions depend on `NrLanes`
+
+// All the functions to support a Barber-Pole VRF layout
+
+// Find the starting VRF address of a vector register vid
+function automatic vaddr_t vaddr(logic [4:0] vid, int NrLanes);
+  // This is not an adder, it's only wires.
+  // (this holds if VLENB / NrLanes >= NrVRFBanksPerLane^2)
+  vaddr = vid * (VLENB / NrLanes / NrVRFBanksPerLane) + vid[VaddrBankWidth-1:0];
+endfunction: vaddr
+
+// Return the physical address of the next element of a certain vector
+function automatic vaddr_t next_vaddr(vaddr_t vaddr, logic [4:0] vid);
+  // vaddr msbs -> byte index in a bank
+  logic [VaddrIdxWidth-1:VaddrBankWidth] index, old_index;
+  // vaddr lsbs -> bank index
+  logic [VaddrBankWidth-1:0] bank;
+
+  index = vaddr[VaddrIdxWidth-1:VaddrBankWidth];
+  bank  = vaddr[VaddrBankWidth-1:0];
+
+  old_index = index;
+
+  // Increment bank counter
+  bank += 1;
+  if (bank == vid[VaddrBankWidth-1:0])
+    // Wrap around
+    index += 1;
+
+  // If we change vreg, the start element position is +1 (LMUL > 1)
+  // This is important for B layout consistency among different LMUL
+  // or when inactive element policy is "undistrubed"
+  if (index[VaddrVregWidth] != old_index[VaddrVregWidth])
+    bank += 1;
+
+  return {index, bank};
+endfunction
+
+// Initialize with an offset (necessary with vslideup)
+function automatic vaddr_t vaddr_offset(vaddr_t vaddr, vaddr_t off, logic [4:0] vid);
+  // vaddr msbs -> byte index in a bank
+  logic [VaddrIdxWidth-1:VaddrBankWidth] index, old_index;
+  // vaddr lsbs -> bank index
+  logic [VaddrBankWidth-1:0] bank, old_bank;
+
+  index = vaddr[VaddrIdxWidth-1:VaddrBankWidth];
+  bank  = vaddr[VaddrBankWidth-1:0];
+
+  old_index = index;
+  old_bank  = bank;
+
+  // Increment bank counter
+  index += off[VaddrIdxWidth-1:VaddrBankWidth];
+  bank  += off[VaddrBankWidth-1:0];
+  // Support vstart != 0: don't hypothesize that old_bank == vid[VaddrBankWidth-1:0]
+  // Wrap around if we meet vid[VaddrBankWidth-1:0] during the addition
+  if (old_bank > vid[VaddrBankWidth-1:0]) begin
+    if (bank >= vid[VaddrBankWidth-1:0] && bank < old_bank)
+      // Wrap around
+      index += 1;
+  end else if (old_bank < vid[VaddrBankWidth-1:0]) begin
+    if (bank >= vid[VaddrBankWidth-1:0] || bank < old_bank)
+      // Wrap around
+      index += 1;
+  end
+
+  // If we change vreg, the start element position is +1
+  // for every reg passed (LMUL > 1). The max reg id delta is 7
+  // with LMUL == 8.
+  bank += index[VaddrVregWidth +: 3] - old_index[VaddrVregWidth +: 3];
+
+  return {index, bank};
+endfunction
diff --git a/hardware/src/ara.sv b/hardware/src/ara.sv
index 15bfb7fa5..350806979 100644
--- a/hardware/src/ara.sv
+++ b/hardware/src/ara.sv
@@ -388,8 +388,7 @@ module ara import ara_pkg::*; #(
   logic sldu_mask_ready;
 
   sldu #(
-    .NrLanes(NrLanes),
-    .vaddr_t(vaddr_t)
+    .NrLanes(NrLanes)
   ) i_sldu (
     .clk_i                   (clk_i                            ),
     .rst_ni                  (rst_ni                           ),
@@ -424,8 +423,7 @@ module ara import ara_pkg::*; #(
   /////////////////
 
   masku #(
-    .NrLanes(NrLanes),
-    .vaddr_t(vaddr_t)
+    .NrLanes(NrLanes)
   ) i_masku (
     .clk_i                   (clk_i                           ),
     .rst_ni                  (rst_ni                          ),
diff --git a/hardware/src/lane/lane.sv b/hardware/src/lane/lane.sv
index d12c71345..a786cabfe 100644
--- a/hardware/src/lane/lane.sv
+++ b/hardware/src/lane/lane.sv
@@ -191,8 +191,7 @@ module lane import ara_pkg::*; import rvv_pkg::*; #(
 
   operand_requester #(
     .NrBanks(NrVRFBanksPerLane),
-    .NrLanes(NrLanes          ),
-    .vaddr_t(vaddr_t          )
+    .NrLanes(NrLanes          )
   ) i_operand_requester (
     .clk_i                    (clk_i                   ),
     .rst_ni                   (rst_ni                  ),
diff --git a/hardware/src/lane/operand_requester.sv b/hardware/src/lane/operand_requester.sv
index 54590fbc3..ba9895268 100644
--- a/hardware/src/lane/operand_requester.sv
+++ b/hardware/src/lane/operand_requester.sv
@@ -9,11 +9,17 @@
 // queues. This stage also includes the VRF arbiter.
 
 module operand_requester import ara_pkg::*; import rvv_pkg::*; #(
-    parameter  int  unsigned NrLanes = 0,
-    parameter  int  unsigned NrBanks = 0,     // Number of banks in the vector register file
-    parameter  type          vaddr_t = logic, // Type used to address vector register file elements
+    parameter  int unsigned NrLanes         = 0,
+    parameter  int unsigned NrBanks         = 0, // Number of banks in the vector register file
+    // Type used to address vector register file elements
+    localparam int unsigned MaxVLenBPerLane = VLENB / NrLanes,      // In bytes
+    localparam int unsigned VRFBSizePerLane = MaxVLenBPerLane * 32, // In bytes
+    localparam int unsigned VaddrIdxWidth   = $clog2(VRFBSizePerLane),
+    localparam int unsigned VaddrBankWidth  = $clog2(NrVRFBanksPerLane),
+    localparam int unsigned VaddrVregWidth  = $clog2(MaxVLenBPerLane),
+    localparam type         vaddr_t         = logic [VaddrIdxWidth-1:0],
     // Dependant parameters. DO NOT CHANGE!
-    localparam type          strb_t  = logic[$bits(elen_t)/8-1:0]
+    localparam type          strb_t         = logic[$bits(elen_t)/8-1:0]
   ) (
     input  logic                                       clk_i,
     input  logic                                       rst_ni,
@@ -76,6 +82,8 @@ module operand_requester import ara_pkg::*; import rvv_pkg::*; #(
     output logic                                       ldu_result_final_gnt_o
   );
 
+  `include "../include/ara_vaddr.svh"
+
   import cf_math_pkg::idx_width;
 
   ////////////////////////
@@ -233,6 +241,8 @@ module operand_requester import ara_pkg::*; import rvv_pkg::*; #(
       vid_t id;
       // Address of the next element to be read
       vaddr_t addr;
+      // Source reg LSbs (useful for barber's pole)
+      logic [idx_width(NrBanks)-1:0] vs;
       // How many elements remain to be read
       vlen_t len;
       // Element width
@@ -316,6 +326,7 @@ module operand_requester import ara_pkg::*; import rvv_pkg::*; #(
               addr   : vaddr(operand_request_i[requester].vs, NrLanes) +
               (operand_request_i[requester].vstart >>
                 (int'(EW64) - int'(operand_request_i[requester].eew))),
+              vs     : operand_request_i[requester].vs[idx_width(NrBanks)-1:0],
               // For memory operations, the number of elements initially refers to the new EEW (vsew here),
               // but the requester must refer to the old EEW (eew here)
               // This reasoning cannot be applied also to widening instructions, which modify vsew
@@ -363,7 +374,7 @@ module operand_requester import ara_pkg::*; import rvv_pkg::*; #(
             // Received a grant.
             if (|operand_requester_gnt) begin
               // Bump the address pointer
-              requester_d.addr = requester_q.addr + 1'b1;
+              requester_d.addr = next_vaddr(requester_q.addr, requester_q.vs);
 
               // We read less than 64 bits worth of elements
               if (requester_q.len < (1 << (int'(EW64) - int'(requester_q.vew))))
@@ -405,6 +416,7 @@ module operand_requester import ara_pkg::*; import rvv_pkg::*; #(
                   addr : vaddr(operand_request_i[requester].vs, NrLanes) +
                   (operand_request_i[requester].vstart >>
                     (int'(EW64) - int'(operand_request_i[requester].eew))),
+                  vs     : operand_request_i[requester].vs[idx_width(NrBanks)-1:0],
                   len    : (operand_request_i[requester].scale_vl) ?
                              ((operand_request_i[requester].vl <<
                              operand_request_i[requester].vtype.vsew) >>
diff --git a/hardware/src/lane/valu.sv b/hardware/src/lane/valu.sv
index 386caca74..7cc93f3d8 100644
--- a/hardware/src/lane/valu.sv
+++ b/hardware/src/lane/valu.sv
@@ -8,15 +8,20 @@
 // in a SIMD fashion, always operating on 64 bits.
 
 module valu import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg::idx_width; #(
-    parameter  int  unsigned NrLanes      = 0,
+    parameter  int  unsigned NrLanes         = 0,
     // Support for fixed-point data types
-    parameter  logic         FixPtSupport = FixedPointEnable,
+    parameter  logic         FixPtSupport    = FixedPointEnable,
     // Type used to address vector register file elements
-    parameter  type          vaddr_t      = logic,
+    localparam int  unsigned MaxVLenBPerLane = VLENB / NrLanes,      // In bytes
+    localparam int  unsigned VRFBSizePerLane = MaxVLenBPerLane * 32, // In bytes
+    localparam int  unsigned VaddrIdxWidth   = $clog2(VRFBSizePerLane),
+    localparam int  unsigned VaddrBankWidth  = $clog2(NrVRFBanksPerLane),
+    localparam int  unsigned VaddrVregWidth  = $clog2(MaxVLenBPerLane),
+    localparam type          vaddr_t         = logic [VaddrIdxWidth-1:0],
     // Dependant parameters. DO NOT CHANGE!
-    localparam int  unsigned DataWidth    = $bits(elen_t),
-    localparam int  unsigned StrbWidth    = DataWidth/8,
-    localparam type          strb_t       = logic [StrbWidth-1:0]
+    localparam int  unsigned DataWidth       = $bits(elen_t),
+    localparam int  unsigned StrbWidth       = DataWidth/8,
+    localparam type          strb_t          = logic [StrbWidth-1:0]
   ) (
     input  logic                         clk_i,
     input  logic                         rst_ni,
@@ -55,6 +60,9 @@ module valu import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg::idx_width;
     output logic                         mask_ready_o
   );
 
+  // Include address-handling functions
+  `include "../../include/ara_vaddr.svh"
+
   import cf_math_pkg::idx_width;
 
   /////////////
@@ -137,6 +145,8 @@ module valu import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg::idx_width;
     logic mask;
   } payload_t;
 
+  vaddr_t addr_d, addr_q;
+
   // Result queue
   payload_t [ResultQueueDepth-1:0]            result_queue_d, result_queue_q;
   logic     [ResultQueueDepth-1:0]            result_queue_valid_d, result_queue_valid_q;
@@ -424,6 +434,7 @@ module valu import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg::idx_width;
     reduction_rx_cnt_d      = reduction_rx_cnt_q;
     sldu_transactions_cnt_d = sldu_transactions_cnt_q;
     red_hs_synch_d          = red_hs_synch_q;
+    addr_d                  = addr_q;
     alu_red_valid_o         = 1'b0;
     sldu_alu_ready_d        = 1'b0;
     simd_red_cnt_max_d      = simd_red_cnt_max_q;
@@ -474,8 +485,9 @@ module valu import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg::idx_width;
                 mask_ready_o = !vinsn_issue_q.vm;
 
               // Store the result in the result queue
+              addr_d = next_vaddr(addr_q, vinsn_issue_q.vd);
               result_queue_d[result_queue_write_pnt_q].wdata = result_queue_q[result_queue_write_pnt_q].wdata | valu_result;
-              result_queue_d[result_queue_write_pnt_q].addr  = vaddr(vinsn_issue_q.vd, NrLanes) + ((vinsn_issue_q.vl - issue_cnt_q) >> (int'(EW64) - vinsn_issue_q.vtype.vsew));
+              result_queue_d[result_queue_write_pnt_q].addr  = addr_q;
               result_queue_d[result_queue_write_pnt_q].id    = vinsn_issue_q.id;
               result_queue_d[result_queue_write_pnt_q].mask  = vinsn_issue_q.vfu == VFU_MaskUnit;
               if (!narrowing(vinsn_issue_q.op) || !narrowing_select_q)
@@ -531,6 +543,11 @@ module valu import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg::idx_width;
                 else
                   vinsn_queue_d.issue_pnt = vinsn_queue_q.issue_pnt + 1;
 
+                // Load the starting address of the next instruction at issue time.
+                // This unit is not pipelined, so the elements written into the
+                // result queue always belong to vinsn_issue_q
+                addr_d = vaddr(vinsn_queue_q.vinsn[vinsn_queue_d.issue_pnt].vd, NrLanes);
+
                 // Assign vector length for next instruction in the instruction queue
                 if (vinsn_queue_d.issue_cnt != 0) begin
                   if (!(vinsn_queue_q.vinsn[vinsn_queue_d.issue_pnt].op inside {[VMANDNOT:VMXNOR]}))
@@ -830,6 +847,8 @@ module valu import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg::idx_width;
         red_hs_synch_d = 1'b1; // Allow the first valid
 
         issue_cnt_d = vfu_operation_i.vl;
+        // Initialize the starting address for the next instruction
+        addr_d = vaddr(vfu_operation_i.vd, NrLanes);
         if (!(vfu_operation_i.op inside {[VMANDNOT:VMXNOR]}))
           issue_cnt_d = vfu_operation_i.vl;
         else begin
@@ -877,6 +896,7 @@ module valu import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg::idx_width;
       simd_red_cnt_max_q      <= '0;
       alu_red_ready_q         <= 1'b0;
       alu_vxsat_q             <= '0;
+      addr_q                  <= '0;
     end else begin
       issue_cnt_q             <= issue_cnt_d;
       commit_cnt_q            <= commit_cnt_d;
@@ -890,6 +910,7 @@ module valu import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg::idx_width;
       simd_red_cnt_max_q      <= simd_red_cnt_max_d;
       alu_red_ready_q         <= alu_red_ready_i;
       alu_vxsat_q             <= alu_vxsat_d;
+      addr_q                  <= addr_d;
     end
   end
 
diff --git a/hardware/src/lane/vector_fus_stage.sv b/hardware/src/lane/vector_fus_stage.sv
index 6eb28e7c2..42b6a347e 100644
--- a/hardware/src/lane/vector_fus_stage.sv
+++ b/hardware/src/lane/vector_fus_stage.sv
@@ -96,9 +96,8 @@ module vector_fus_stage import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg
   //////////////////
 
   valu #(
-    .NrLanes(NrLanes),
-    .FixPtSupport(FixPtSupport),
-    .vaddr_t(vaddr_t)
+    .NrLanes     (NrLanes     ),
+    .FixPtSupport(FixPtSupport)
   ) i_valu (
     .clk_i                (clk_i                          ),
     .rst_ni               (rst_ni                         ),
@@ -143,10 +142,9 @@ module vector_fus_stage import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg
   ///////////////////
 
   vmfpu #(
-    .NrLanes   (NrLanes   ),
-    .FPUSupport(FPUSupport),
-    .FixPtSupport(FixPtSupport),
-    .vaddr_t   (vaddr_t   )
+    .NrLanes     (NrLanes     ),
+    .FPUSupport  (FPUSupport  ),
+    .FixPtSupport(FixPtSupport)
   ) i_vmfpu (
     .clk_i                (clk_i                           ),
     .rst_ni               (rst_ni                          ),
diff --git a/hardware/src/lane/vmfpu.sv b/hardware/src/lane/vmfpu.sv
index c4ffc6d72..81c729864 100644
--- a/hardware/src/lane/vmfpu.sv
+++ b/hardware/src/lane/vmfpu.sv
@@ -9,17 +9,22 @@
 
 module vmfpu import ara_pkg::*; import rvv_pkg::*; import fpnew_pkg::*;
   import cf_math_pkg::idx_width; #(
-    parameter  int           unsigned NrLanes      = 0,
+    parameter  int           unsigned NrLanes         = 0,
     // Support for floating-point data types
-    parameter  fpu_support_e          FPUSupport   = FPUSupportHalfSingleDouble,
+    parameter  fpu_support_e          FPUSupport      = FPUSupportHalfSingleDouble,
     // Support for fixed-point data types
-    parameter  logic                  FixPtSupport = FixedPointEnable,
+    parameter  logic                  FixPtSupport    = FixedPointEnable,
     // Type used to address vector register file elements
-    parameter  type                   vaddr_t      = logic,
+    localparam int           unsigned MaxVLenBPerLane = VLENB / NrLanes,      // In bytes
+    localparam int           unsigned VRFBSizePerLane = MaxVLenBPerLane * 32, // In bytes
+    localparam int           unsigned VaddrIdxWidth   = $clog2(VRFBSizePerLane),
+    localparam int           unsigned VaddrBankWidth  = $clog2(NrVRFBanksPerLane),
+    localparam int           unsigned VaddrVregWidth  = $clog2(MaxVLenBPerLane),
+    localparam type          vaddr_t                  = logic [VaddrIdxWidth-1:0],
     // Dependant parameters. DO NOT CHANGE!
-    localparam int           unsigned DataWidth    = $bits(elen_t),
-    localparam int           unsigned StrbWidth    = DataWidth/8,
-    localparam type                   strb_t       = logic [DataWidth/8-1:0]
+    localparam int           unsigned DataWidth       = $bits(elen_t),
+    localparam int           unsigned StrbWidth       = DataWidth/8,
+    localparam type                   strb_t          = logic [DataWidth/8-1:0]
   ) (
     input  logic                         clk_i,
     input  logic                         rst_ni,
@@ -61,6 +66,9 @@ module vmfpu import ara_pkg::*; import rvv_pkg::*; import fpnew_pkg::*;
     output logic                         mask_ready_o
   );
 
+  // Include address-handling functions
+  `include "../../include/ara_vaddr.svh"
+
   ////////////////////////////////
   //  Vector instruction queue  //
   ////////////////////////////////
@@ -180,6 +188,8 @@ module vmfpu import ara_pkg::*; import rvv_pkg::*; import fpnew_pkg::*;
   //  Helper signals  //
   //////////////////////
 
+  vaddr_t addr_d, addr_q;
+
   logic vinsn_issue_mul, vinsn_issue_div, vinsn_issue_fpu;
 
   assign vinsn_issue_mul = vinsn_issue_q.op inside {[VMUL:VSMUL]};
@@ -1044,6 +1054,7 @@ module vmfpu import ara_pkg::*; import rvv_pkg::*; import fpnew_pkg::*;
     reduction_rx_cnt_d      = reduction_rx_cnt_q;
     sldu_transactions_cnt_d = sldu_transactions_cnt_q;
     red_hs_synch_d          = red_hs_synch_q;
+    addr_d                  = addr_q;
     mfpu_red_valid_o        = 1'b0;
     sldu_mfpu_ready_d       = 1'b0;
     simd_red_cnt_max_d      = simd_red_cnt_max_q;
@@ -1218,9 +1229,9 @@ module vmfpu import ara_pkg::*; import rvv_pkg::*; import fpnew_pkg::*;
           to_process_cnt_d = (narrowing(vinsn_processing_q.cvt_resize)) ? (to_process_cnt_q - processed_element_cnt_narrow) : (to_process_cnt_q - processed_element_cnt);
 
           // Store the result in the result queue
+          addr_d = next_vaddr(addr_q, vinsn_processing_q.vd);
           result_queue_d[result_queue_write_pnt_q].id    = vinsn_processing_q.id;
-          result_queue_d[result_queue_write_pnt_q].addr  = vaddr(vinsn_processing_q.vd, NrLanes) +
-            ((vinsn_processing_q.vl - to_process_cnt_q) >> (int'(EW64) - vinsn_processing_q.vtype.vsew));
+          result_queue_d[result_queue_write_pnt_q].addr  = addr_q;
           // FP narrowing instructions pack the result in two different cycles, and only some 16-bit slices are active
           if (narrowing(vinsn_processing_q.cvt_resize)) begin
             for (int b = 0; b < 4; b++) begin
@@ -1275,6 +1286,10 @@ module vmfpu import ara_pkg::*; import rvv_pkg::*; import fpnew_pkg::*;
 
             if (vinsn_queue_d.processing_cnt != 0) to_process_cnt_d =
               vinsn_queue_q.vinsn[vinsn_queue_d.processing_pnt].vl;
+
+            // Results produced in the following cycles belong to the next
+            // instruction, so restart from that instruction's base address
+            addr_d = vaddr(vinsn_queue_q.vinsn[vinsn_queue_d.processing_pnt].vd, NrLanes);
           end
         end
       end
@@ -1695,6 +1710,9 @@ module vmfpu import ara_pkg::*; import rvv_pkg::*; import fpnew_pkg::*;
           if (vinsn_queue_d.processing_cnt != 0) to_process_cnt_d =
             vinsn_queue_q.vinsn[vinsn_queue_d.processing_pnt].vl;
 
+          // Update the starting address for the next instruction
+          addr_d = vaddr(vinsn_queue_q.vinsn[vinsn_queue_d.processing_pnt].vd, NrLanes);
+
           // Bump issue counter and pointers
           vinsn_queue_d.issue_cnt -= 1;
           if (vinsn_queue_q.issue_pnt == VInsnQueueDepth-1) vinsn_queue_d.issue_pnt = '0;
@@ -1833,9 +1851,12 @@ module vmfpu import ara_pkg::*; import rvv_pkg::*; import fpnew_pkg::*;
         osum_issue_cnt_d        = '0;
         issue_cnt_d             = vfu_operation_i.vl;
       end
-      if (vinsn_queue_d.processing_cnt == '0) to_process_cnt_d = vfu_operation_i.vl;
-      if (vinsn_queue_d.commit_cnt == '0) commit_cnt_d =
-        is_reduction(vfu_operation_i.op) ? 1 : vfu_operation_i.vl;
+      if (vinsn_queue_d.processing_cnt == '0) begin
+        to_process_cnt_d = vfu_operation_i.vl;
+        // A new instruction to process; update the starting address
+        addr_d = vaddr(vfu_operation_i.vd, NrLanes);
+      end
+      if (vinsn_queue_d.commit_cnt == '0) commit_cnt_d = is_reduction(vfu_operation_i.op) ? 1 : vfu_operation_i.vl;
       // Floating-Point re-encoding for widening operations
       // Enabled only for the supported formats
       if (FPUSupport != FPUSupportNone) begin
@@ -1902,6 +1923,7 @@ module vmfpu import ara_pkg::*; import rvv_pkg::*; import fpnew_pkg::*;
       intra_op_rx_cnt_q       <= '0;
       osum_issue_cnt_q        <= '0;
       mfpu_vxsat_q            <= '0;
+      addr_q                  <= '0;
     end else begin
       issue_cnt_q             <= issue_cnt_d;
       to_process_cnt_q        <= to_process_cnt_d;
@@ -1925,6 +1947,7 @@ module vmfpu import ara_pkg::*; import rvv_pkg::*; import fpnew_pkg::*;
       intra_op_rx_cnt_q       <= intra_op_rx_cnt_d;
       osum_issue_cnt_q        <= osum_issue_cnt_d;
       mfpu_vxsat_q            <= mfpu_vxsat_d;
+      addr_q                  <= addr_d;
     end
   end
 
diff --git a/hardware/src/masku/masku.sv b/hardware/src/masku/masku.sv
index afea302f6..58cc11f1b 100644
--- a/hardware/src/masku/masku.sv
+++ b/hardware/src/masku/masku.sv
@@ -10,12 +10,18 @@
 // predicated instructions.
 
 module masku import ara_pkg::*; import rvv_pkg::*; #(
-    parameter  int  unsigned NrLanes = 0,
-    parameter  type          vaddr_t = logic, // Type used to address vector register file elements
+    parameter  int  unsigned NrLanes         = 0,
+    // Address of an element in the lane's VRF
+    localparam int  unsigned MaxVLenBPerLane = VLENB / NrLanes,      // In bytes
+    localparam int  unsigned VRFBSizePerLane = MaxVLenBPerLane * 32, // In bytes
+    localparam int  unsigned VaddrIdxWidth   = $clog2(VRFBSizePerLane),
+    localparam int  unsigned VaddrBankWidth  = $clog2(NrVRFBanksPerLane),
+    localparam int  unsigned VaddrVregWidth  = $clog2(MaxVLenBPerLane),
+    localparam type          vaddr_t         = logic [VaddrIdxWidth-1:0],
     // Dependant parameters. DO NOT CHANGE!
-    localparam int  unsigned DataWidth = $bits(elen_t), // Width of the lane datapath
-    localparam int  unsigned StrbWidth = DataWidth/8,
-    localparam type          strb_t    = logic [StrbWidth-1:0] // Byte-strobe type
+    localparam int  unsigned DataWidth       = $bits(elen_t), // Width of the lane datapath
+    localparam int  unsigned StrbWidth       = DataWidth/8,
+    localparam type          strb_t          = logic [StrbWidth-1:0] // Byte-strobe type
   ) (
     input  logic                                       clk_i,
     input  logic                                       rst_ni,
@@ -48,6 +54,9 @@ module masku import ara_pkg::*; import rvv_pkg::*; #(
     input  logic                                       sldu_mask_ready_i
   );
 
+  // Include address-handling functions
+  `include "../../include/ara_vaddr.svh"
+
   import cf_math_pkg::idx_width;
 
   ////////////////
@@ -142,6 +151,8 @@ module masku import ara_pkg::*; import rvv_pkg::*; #(
   // There is a mask queue per lane, holding the operands that were not
   // yet used by the corresponding lane.
 
+  vaddr_t addr_d, addr_q;
+
   // Mask queue
   strb_t [MaskQueueDepth-1:0][NrLanes-1:0] mask_queue_d, mask_queue_q;
   logic  [MaskQueueDepth-1:0][NrLanes-1:0] mask_queue_valid_d, mask_queue_valid_q;
@@ -647,6 +658,8 @@ module masku import ara_pkg::*; import rvv_pkg::*; #(
     popcount_d     = popcount_q;
     vfirst_count_d = vfirst_count_q;
 
+    addr_d         = addr_q;
+
     mask_queue_d           = mask_queue_q;
     mask_queue_valid_d     = mask_queue_valid_q;
     mask_queue_write_pnt_d = mask_queue_write_pnt_q;
@@ -732,6 +745,9 @@ module masku import ara_pkg::*; import rvv_pkg::*; #(
         else
           mask_queue_write_pnt_d = mask_queue_write_pnt_q + 1;
 
+        // Increment write-back address
+        addr_d = next_vaddr(addr_q, vinsn_issue.vd);
+
         // Account for the operands that were issued
         read_cnt_d = read_cnt_q - NrLanes * (1 << (int'(EW64) - vinsn_issue.vtype.vsew));
         if (read_cnt_q < NrLanes * (1 << (int'(EW64) - vinsn_issue.vtype.vsew)))
@@ -838,9 +854,8 @@ module masku import ara_pkg::*; import rvv_pkg::*; #(
             result_queue_d[result_queue_write_pnt_q][lane] = '{
               wdata: result_queue_q[result_queue_write_pnt_q][lane].wdata | alu_result[lane],
               be   : (vinsn_issue.op inside {[VMSBF:VID]}) ? '1 : be(element_cnt, vinsn_issue.vtype.vsew),
-              addr : (vinsn_issue.op inside {[VMSBF:VID]}) ? vaddr(vinsn_issue.vd, NrLanes) + ((vinsn_issue.vl - issue_cnt_q) >> (int'(EW64) - vinsn_issue.vtype.vsew)) : vaddr(vinsn_issue.vd, NrLanes) +
-                (((vinsn_issue.vl - issue_cnt_q) / NrLanes / DataWidth)),
-              id : vinsn_issue.id
+              addr : addr_q,
+              id   : vinsn_issue.id
             };
           end
 
@@ -848,6 +863,9 @@ module masku import ara_pkg::*; import rvv_pkg::*; #(
           if (vinsn_issue.op inside {[VMFEQ:VMSGTU], [VMSGT:VMSBC]}) begin
             vrf_pnt_d = vrf_pnt_q + (NrLanes << (int'(EW64) - vinsn_issue.vtype.vsew));
 
+            // Increment write-back address
+            addr_d = next_vaddr(addr_q, vinsn_issue.vd);
+
             // Filled-up a word, or finished execution
             if (vrf_pnt_d == DataWidth*NrLanes || vrf_pnt_d >= issue_cnt_q) begin
               result_queue_valid_d[result_queue_write_pnt_q] = {NrLanes{1'b1}};
@@ -1077,6 +1095,9 @@ module masku import ara_pkg::*; import rvv_pkg::*; #(
         issue_cnt_d = pe_req_i.vl;
         read_cnt_d  = pe_req_i.vl;
 
+        // Initialize the starting address of the next instruction
+        addr_d = vaddr(pe_req_i.vd, NrLanes);
+
         // Trim skipped words
         if (pe_req_i.op == VSLIDEUP) begin
           issue_cnt_d -= vlen_t'(trimmed_stride);
@@ -1131,6 +1152,7 @@ module masku import ara_pkg::*; import rvv_pkg::*; #(
       result_final_gnt_q <= '0;
       popcount_q         <= '0;
       vfirst_count_q     <= '0;
+      addr_q             <= '0;
     end else begin
       vinsn_running_q    <= vinsn_running_d;
       read_cnt_q         <= read_cnt_d;
@@ -1142,6 +1164,7 @@ module masku import ara_pkg::*; import rvv_pkg::*; #(
       result_final_gnt_q <= result_final_gnt_d;
       popcount_q         <= popcount_d;
       vfirst_count_q     <= vfirst_count_d;
+      addr_q             <= addr_d;
     end
   end
 
diff --git a/hardware/src/sldu/sldu.sv b/hardware/src/sldu/sldu.sv
index 9c06c3ac5..7439bed7a 100644
--- a/hardware/src/sldu/sldu.sv
+++ b/hardware/src/sldu/sldu.sv
@@ -8,12 +8,18 @@
 // instructions, which need access to the whole Vector Register File.
 
 module sldu import ara_pkg::*; import rvv_pkg::*; #(
-    parameter  int  unsigned NrLanes = 0,
-    parameter  type          vaddr_t = logic, // Type used to address vector register file elements
+    parameter  int  unsigned NrLanes         = 0,
+    // Address of an element in the lane's VRF
+    localparam int  unsigned MaxVLenBPerLane = VLENB / NrLanes,      // In bytes
+    localparam int  unsigned VRFBSizePerLane = MaxVLenBPerLane * 32, // In bytes
+    localparam int  unsigned VaddrIdxWidth   = $clog2(VRFBSizePerLane),
+    localparam int  unsigned VaddrBankWidth  = $clog2(NrVRFBanksPerLane),
+    localparam int  unsigned VaddrVregWidth  = $clog2(MaxVLenBPerLane),
+    localparam type          vaddr_t         = logic [VaddrIdxWidth-1:0],
     // Dependant parameters. DO NOT CHANGE!
-    localparam int  unsigned DataWidth = $bits(elen_t), // Width of the lane datapath
-    localparam int  unsigned StrbWidth = DataWidth/8,
-    localparam type          strb_t    = logic [StrbWidth-1:0] // Byte-strobe type
+    localparam int  unsigned DataWidth       = $bits(elen_t), // Width of the lane datapath
+    localparam int  unsigned StrbWidth       = DataWidth/8,
+    localparam type          strb_t          = logic [StrbWidth-1:0] // Byte-strobe type
   ) (
     input  logic                   clk_i,
     input  logic                   rst_ni,
@@ -46,6 +52,9 @@ module sldu import ara_pkg::*; import rvv_pkg::*; #(
 
   `include "common_cells/registers.svh"
 
+  // Include address-handling functions
+  `include "../../include/ara_vaddr.svh"
+
   import cf_math_pkg::idx_width;
 
   ////////////////////////////////
@@ -108,6 +117,8 @@ module sldu import ara_pkg::*; import rvv_pkg::*; #(
 
   localparam int unsigned ResultQueueDepth = 2;
 
+  vaddr_t addr_d, addr_q;
+
   // There is a result queue per lane, holding the results that were not
   // yet accepted by the corresponding lane.
   typedef struct packed {
@@ -220,6 +231,7 @@ module sldu import ara_pkg::*; import rvv_pkg::*; #(
     out_pnt_d     = out_pnt_q;
     vrf_pnt_d     = vrf_pnt_q;
     state_d       = state_q;
+    addr_d        = addr_q;
 
     result_queue_d           = result_queue_q;
     result_queue_valid_d     = result_queue_valid_q;
@@ -268,6 +280,9 @@ module sldu import ara_pkg::*; import rvv_pkg::*; #(
               // Start writing at the middle of the destination vector
               vrf_pnt_d = vinsn_issue_q.stride >> $clog2(8*NrLanes);
 
+              // Fix the starting address
+              addr_d = vaddr_offset(addr_q, vrf_pnt_d, vinsn_issue_q.vd);
+
               // Go to SLIDE_RUN_VSLIDE1UP_FIRST_WORD if this is a vslide1up instruction
               if (vinsn_issue_q.use_scalar_op)
                 state_d = SLIDE_RUN_VSLIDE1UP_FIRST_WORD;
@@ -349,8 +364,7 @@ module sldu import ara_pkg::*; import rvv_pkg::*; #(
           // Initialize id and addr fields of the result queue requests
           for (int lane = 0; lane < NrLanes; lane++) begin
             result_queue_d[result_queue_write_pnt_q][lane].id   = vinsn_issue_q.id;
-            result_queue_d[result_queue_write_pnt_q][lane].addr =
-              vaddr(vinsn_issue_q.vd, NrLanes) + vrf_pnt_q;
+            result_queue_d[result_queue_write_pnt_q][lane].addr = addr_q;
           end
 
           // Bump pointers (reductions always finish in one shot)
@@ -409,8 +423,8 @@ module sldu import ara_pkg::*; import rvv_pkg::*; #(
             if (vinsn_issue_q.op inside {VSLIDEUP, VSLIDEDOWN})
               mask_ready_o = !vinsn_issue_q.vm;
 
-            // Increment VRF address
-            vrf_pnt_d = vrf_pnt_q + 1;
+            // Increment write-back address
+            addr_d = vaddr_offset(addr_q, 1, vinsn_issue_q.vd);
 
             // Send result to the VRF
             result_queue_cnt_d += 1;
@@ -466,6 +480,8 @@ module sldu import ara_pkg::*; import rvv_pkg::*; #(
             // Increment vector instruction queue pointers and counters
             vinsn_queue_d.issue_pnt += 1;
             vinsn_queue_d.issue_cnt -= 1;
+
+            addr_d = vaddr(vinsn_queue_q.vinsn[vinsn_queue_d.issue_pnt].vd, NrLanes);
           end
         end
       end
@@ -500,6 +516,8 @@ module sldu import ara_pkg::*; import rvv_pkg::*; #(
           // Increment vector instruction queue pointers and counters
           vinsn_queue_d.issue_pnt += 1;
           vinsn_queue_d.issue_cnt -= 1;
+
+          addr_d = vaddr(vinsn_queue_q.vinsn[vinsn_queue_d.issue_pnt].vd, NrLanes);
         end
       end
       SLIDE_WAIT_OSUM: begin
@@ -607,6 +625,9 @@ module sldu import ara_pkg::*; import rvv_pkg::*; #(
         // VSLIDE1UP always writes at least 1 element
         if (pe_req_i.op == VSLIDEUP && !pe_req_i.use_scalar_op)
           issue_cnt_d -= vinsn_queue_d.vinsn[vinsn_queue_q.accept_pnt].stride;
+
+        // Initialize the starting address for the next instruction
+        addr_d = vaddr(pe_req_i.vd, NrLanes);
       end
       if (vinsn_queue_d.commit_cnt == '0) begin
         commit_cnt_d = pe_req_i.op inside {VSLIDEUP, VSLIDEDOWN}
@@ -638,6 +659,7 @@ module sldu import ara_pkg::*; import rvv_pkg::*; #(
       pe_resp_o          <= '0;
       result_final_gnt_q <= '0;
       red_stride_cnt_q   <= 1;
+      addr_q             <= '0;
     end else begin
       vinsn_running_q    <= vinsn_running_d;
       issue_cnt_q        <= issue_cnt_d;
@@ -649,6 +671,7 @@ module sldu import ara_pkg::*; import rvv_pkg::*; #(
       pe_resp_o          <= pe_resp;
       result_final_gnt_q <= result_final_gnt_d;
       red_stride_cnt_q   <= red_stride_cnt_d;
+      addr_q             <= addr_d;
     end
   end
 
diff --git a/hardware/src/vlsu/vldu.sv b/hardware/src/vlsu/vldu.sv
index 61b26623a..6f94d9ec1 100644
--- a/hardware/src/vlsu/vldu.sv
+++ b/hardware/src/vlsu/vldu.sv
@@ -8,16 +8,22 @@
 // upon receiving vector memory operations.
 
 module vldu import ara_pkg::*; import rvv_pkg::*; #(
-    parameter  int  unsigned NrLanes = 0,
-    parameter  type          vaddr_t = logic,  // Type used to address vector register file elements
+    parameter  int  unsigned NrLanes         = 0,
+    // Address of an element in the lane's VRF
+    localparam int  unsigned MaxVLenBPerLane = VLENB / NrLanes,      // In bytes
+    localparam int  unsigned VRFBSizePerLane = MaxVLenBPerLane * 32, // In bytes
+    localparam int  unsigned VaddrIdxWidth   = $clog2(VRFBSizePerLane),
+    localparam int  unsigned VaddrBankWidth  = $clog2(NrVRFBanksPerLane),
+    localparam int  unsigned VaddrVregWidth  = $clog2(MaxVLenBPerLane),
+    localparam type          vaddr_t         = logic [VaddrIdxWidth-1:0],
     // AXI Interface parameters
-    parameter  int  unsigned AxiDataWidth = 0,
-    parameter  int  unsigned AxiAddrWidth = 0,
-    parameter  type          axi_r_t      = logic,
+    parameter  int  unsigned AxiDataWidth    = 0,
+    parameter  int  unsigned AxiAddrWidth    = 0,
+    parameter  type          axi_r_t         = logic,
     // Dependant parameters. DO NOT CHANGE!
-    localparam int           DataWidth    = $bits(elen_t),
-    localparam type          strb_t       = logic[DataWidth/8-1:0],
-    localparam type          axi_addr_t   = logic [AxiAddrWidth-1:0]
+    localparam int           DataWidth       = $bits(elen_t),
+    localparam type          strb_t          = logic[DataWidth/8-1:0],
+    localparam type          axi_addr_t      = logic [AxiAddrWidth-1:0]
   ) (
     input  logic                           clk_i,
     input  logic                           rst_ni,
@@ -55,7 +61,11 @@ module vldu import ara_pkg::*; import rvv_pkg::*; #(
     output logic                           mask_ready_o
   );
 
+  // Include address-handling functions
+  `include "../../include/ara_vaddr.svh"
+
   import cf_math_pkg::idx_width;
+
   import axi_pkg::beat_lower_byte;
   import axi_pkg::beat_upper_byte;
   import axi_pkg::BURST_INCR;
@@ -125,6 +135,8 @@ module vldu import ara_pkg::*; import rvv_pkg::*; #(
 
   localparam int unsigned ResultQueueDepth = 2;
 
+  vaddr_t addr_d, addr_q;
+
   // There is a result queue per lane, holding the results that were not
   // yet accepted by the corresponding lane.
   typedef struct packed {
@@ -204,6 +216,7 @@ module vldu import ara_pkg::*; import rvv_pkg::*; #(
     vinsn_queue_d = vinsn_queue_q;
     issue_cnt_d   = issue_cnt_q;
     commit_cnt_d  = commit_cnt_q;
+    addr_d        = addr_q;
 
     len_d     = len_q;
     r_pnt_d   = r_pnt_q;
@@ -293,9 +306,7 @@ module vldu import ara_pkg::*; import rvv_pkg::*; #(
         // Initialize id and addr fields of the result queue requests
         for (int lane = 0; lane < NrLanes; lane++) begin
           result_queue_d[result_queue_write_pnt_q][lane].id   = vinsn_issue_q.id;
-          result_queue_d[result_queue_write_pnt_q][lane].addr = vaddr(vinsn_issue_q.vd, NrLanes) +
-            (((vinsn_issue_q.vl - (issue_cnt_q >> int'(vinsn_issue_q.vtype.vsew))) / NrLanes) >>
-            (int'(EW64) - int'(vinsn_issue_q.vtype.vsew)));
+          result_queue_d[result_queue_write_pnt_q][lane].addr = addr_q;
         end
       end
 
@@ -311,6 +322,9 @@ module vldu import ara_pkg::*; import rvv_pkg::*; #(
         // Trigger the request signal
         result_queue_valid_d[result_queue_write_pnt_q] = {NrLanes{1'b1}};
 
+        // Increase the address
+        addr_d = next_vaddr(addr_q, vinsn_issue_q.vd);
+
         // Acknowledge the mask operands
         mask_ready_o = !vinsn_issue_q.vm;
 
@@ -349,6 +363,9 @@ module vldu import ara_pkg::*; import rvv_pkg::*; #(
         else
           vinsn_queue_d.issue_pnt += 1;
 
+        // Load the starting write-back address of the next instruction
+        addr_d = vaddr(vinsn_queue_q.vinsn[vinsn_queue_d.issue_pnt].vd, NrLanes);
+
         // Prepare for the next vector instruction
         if (vinsn_queue_d.issue_cnt != 0)
           issue_cnt_d = vinsn_queue_q.vinsn[vinsn_queue_d.issue_pnt].vl << int'(vinsn_queue_q.vinsn[
@@ -439,8 +456,10 @@ module vldu import ara_pkg::*; import rvv_pkg::*; #(
       vinsn_running_d[pe_req_i.id]                  = 1'b1;
 
       // Initialize counters
-      if (vinsn_queue_d.issue_cnt == '0)
+      if (vinsn_queue_d.issue_cnt == '0) begin
         issue_cnt_d = pe_req_i.vl << int'(pe_req_i.vtype.vsew);
+        addr_d      = vaddr(pe_req_i.vd, NrLanes);
+      end
       if (vinsn_queue_d.commit_cnt == '0)
         commit_cnt_d = pe_req_i.vl << int'(pe_req_i.vtype.vsew);
 
@@ -461,6 +480,7 @@ module vldu import ara_pkg::*; import rvv_pkg::*; #(
       vrf_pnt_q          <= '0;
       pe_resp_o          <= '0;
       result_final_gnt_q <= '0;
+      addr_q             <= '0;
     end else begin
       vinsn_running_q    <= vinsn_running_d;
       issue_cnt_q        <= issue_cnt_d;
@@ -470,6 +490,7 @@ module vldu import ara_pkg::*; import rvv_pkg::*; #(
       vrf_pnt_q          <= vrf_pnt_d;
       pe_resp_o          <= pe_resp;
       result_final_gnt_q <= result_final_gnt_d;
+      addr_q             <= addr_d;
     end
   end
 
diff --git a/hardware/src/vlsu/vlsu.sv b/hardware/src/vlsu/vlsu.sv
index 448b53a87..c86b7ee15 100644
--- a/hardware/src/vlsu/vlsu.sv
+++ b/hardware/src/vlsu/vlsu.sv
@@ -158,8 +158,7 @@ module vlsu import ara_pkg::*; import rvv_pkg::*; #(
     .AxiAddrWidth(AxiAddrWidth),
     .AxiDataWidth(AxiDataWidth),
     .axi_r_t     (axi_r_t     ),
-    .NrLanes     (NrLanes     ),
-    .vaddr_t     (vaddr_t     )
+    .NrLanes     (NrLanes     )
   ) i_vldu (
     .clk_i                  (clk_i                     ),
     .rst_ni                 (rst_ni                    ),

From 40b408b5618848190446adf55b949e5d7734f4d1 Mon Sep 17 00:00:00 2001
From: Matteo Perotti <mperotti@iis.ee.ethz.ch>
Date: Mon, 5 Dec 2022 20:32:44 +0100
Subject: [PATCH 5/8] [hardware] Handle slide1x and widening hazards with a
 special protocol

Slide1Up/Down instructions were blocked in the main sequencer when they
had specific hazards. Now, these hazards are handled downstream: the
instruction stalls for 1 cycle and then continues with the usual protocol.
WAW hazards for widening instructions are also handled better now, by
discriminating between real widening instructions and reductions.
---
 hardware/include/ara_pkg.sv            |   5 +
 hardware/src/ara_dispatcher.sv         |  80 +++++++++-
 hardware/src/ara_sequencer.sv          |   7 +-
 hardware/src/lane/lane_sequencer.sv    | 208 +++++++++++++------------
 hardware/src/lane/operand_requester.sv |  93 +++++++----
 5 files changed, 254 insertions(+), 139 deletions(-)

diff --git a/hardware/include/ara_pkg.sv b/hardware/include/ara_pkg.sv
index 6fe041e70..6c45d77ff 100644
--- a/hardware/include/ara_pkg.sv
+++ b/hardware/include/ara_pkg.sv
@@ -300,6 +300,8 @@ package ara_pkg;
     logic wide_fp_imm;
     // Resizing of FP conversions
     resize_e cvt_resize;
+    // Widening and vslide1x instructions have different hazard stall policies
+    logic special_hazard;
 
     // Vector machine metadata
     vlen_t vl;
@@ -397,6 +399,8 @@ package ara_pkg;
     logic wide_fp_imm;
     // Resizing of FP conversions
     resize_e cvt_resize;
+    // Widening and vslide1x instructions have different hazard stall policies
+    logic special_hazard;
 
     // Vector machine metadata
     vlen_t vl;
@@ -894,6 +898,7 @@ package ara_pkg;
     logic scale_vl; // Rescale vl taking into account the new and old EEW
 
     resize_e cvt_resize;    // Resizing of FP conversions
+    logic special_hazard; // Widening and vslide1x instructions have different hazard stall policies
 
     logic is_reduct; // Is this a reduction?
 
diff --git a/hardware/src/ara_dispatcher.sv b/hardware/src/ara_dispatcher.sv
index 6fe3783dc..22c1cd5b1 100644
--- a/hardware/src/ara_dispatcher.sv
+++ b/hardware/src/ara_dispatcher.sv
@@ -681,6 +681,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #(
                     ara_req_d.conversion_vs1 = OpQueueReductionZExt;
                     ara_req_d.conversion_vs2 = OpQueueConversionZExt2;
                     ara_req_d.cvt_resize     = CVT_WIDE;
+                    ara_req_d.special_hazard = 1'b1;
                   end
                   6'b110001: begin
                     ara_req_d.op = ara_pkg::VWREDSUM;
@@ -690,6 +691,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #(
                     ara_req_d.conversion_vs1 = OpQueueReductionZExt;
                     ara_req_d.conversion_vs2 = OpQueueConversionSExt2;
                     ara_req_d.cvt_resize     = CVT_WIDE;
+                    ara_req_d.special_hazard = 1'b1;
                   end
                   default: illegal_insn = 1'b1;
                 endcase
@@ -1300,6 +1302,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #(
                         ara_req_d.conversion_vs2 = OpQueueConversionZExt8;
                         ara_req_d.eew_vs2        = eew_q[insn.varith_type.rs2];
                         ara_req_d.cvt_resize     = CVT_WIDE;
+                        ara_req_d.special_hazard = 1'b1;
 
                         // Invalid conversion
                         if (int'(vtype_q.vsew) < int'(EW64) ||
@@ -1310,6 +1313,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #(
                         ara_req_d.conversion_vs2 = OpQueueConversionSExt8;
                         ara_req_d.eew_vs2        = eew_q[insn.varith_type.rs2];
                         ara_req_d.cvt_resize     = CVT_WIDE;
+                        ara_req_d.special_hazard = 1'b1;
 
                         // Invalid conversion
                         if (int'(vtype_q.vsew) < int'(EW64) ||
@@ -1320,6 +1324,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #(
                         ara_req_d.conversion_vs2 = OpQueueConversionZExt4;
                         ara_req_d.eew_vs2        = prev_prev_ew(vtype_q.vsew);
                         ara_req_d.cvt_resize     = CVT_WIDE;
+                        ara_req_d.special_hazard = 1'b1;
 
                         // Invalid conversion
                         if (int'(vtype_q.vsew) < int'(EW32) ||
@@ -1329,6 +1334,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #(
                         ara_req_d.conversion_vs2 = OpQueueConversionSExt4;
                         ara_req_d.eew_vs2        = prev_prev_ew(vtype_q.vsew);
                         ara_req_d.cvt_resize     = CVT_WIDE;
+                        ara_req_d.special_hazard = 1'b1;
 
                         // Invalid conversion
                         if (int'(vtype_q.vsew) < int'(EW32) ||
@@ -1338,6 +1344,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #(
                         ara_req_d.conversion_vs2 = OpQueueConversionZExt2;
                         ara_req_d.eew_vs2        = vtype_q.vsew.prev();
                         ara_req_d.cvt_resize     = CVT_WIDE;
+                        ara_req_d.special_hazard = 1'b1;
 
                         // Invalid conversion
                         if (int'(vtype_q.vsew) < int'(EW16) || int'(vtype_q.vlmul) inside {LMUL_1_8})
@@ -1347,6 +1354,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #(
                         ara_req_d.conversion_vs2 = OpQueueConversionSExt2;
                         ara_req_d.eew_vs2        = vtype_q.vsew.prev();
                         ara_req_d.cvt_resize     = CVT_WIDE;
+                        ara_req_d.special_hazard = 1'b1;
 
                         // Invalid conversion
                         if (int'(vtype_q.vsew) < int'(EW16) || int'(vtype_q.vlmul) inside {LMUL_1_8})
@@ -1394,6 +1402,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #(
                     ara_req_d.conversion_vs1 = OpQueueConversionZExt2;
                     ara_req_d.conversion_vs2 = OpQueueConversionZExt2;
                     ara_req_d.cvt_resize     = CVT_WIDE;
+                    ara_req_d.special_hazard = 1'b1;
                   end
                   6'b110001: begin // VWADD
                     ara_req_d.op             = ara_pkg::VADD;
@@ -1402,6 +1411,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #(
                     ara_req_d.conversion_vs1 = OpQueueConversionSExt2;
                     ara_req_d.conversion_vs2 = OpQueueConversionSExt2;
                     ara_req_d.cvt_resize     = CVT_WIDE;
+                    ara_req_d.special_hazard = 1'b1;
                   end
                   6'b110010: begin // VWSUBU
                     ara_req_d.op             = ara_pkg::VSUB;
@@ -1410,6 +1420,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #(
                     ara_req_d.conversion_vs1 = OpQueueConversionZExt2;
                     ara_req_d.conversion_vs2 = OpQueueConversionZExt2;
                     ara_req_d.cvt_resize     = CVT_WIDE;
+                    ara_req_d.special_hazard = 1'b1;
                   end
                   6'b110011: begin // VWSUB
                     ara_req_d.op             = ara_pkg::VSUB;
@@ -1418,6 +1429,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #(
                     ara_req_d.conversion_vs1 = OpQueueConversionSExt2;
                     ara_req_d.conversion_vs2 = OpQueueConversionSExt2;
                     ara_req_d.cvt_resize     = CVT_WIDE;
+                    ara_req_d.special_hazard = 1'b1;
                   end
                   6'b110100: begin // VWADDU.W
                     ara_req_d.op             = ara_pkg::VADD;
@@ -1427,6 +1439,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #(
                     ara_req_d.conversion_vs1 = OpQueueConversionZExt2;
                     ara_req_d.eew_vs2        = vtype_q.vsew.next();
                     ara_req_d.cvt_resize     = CVT_WIDE;
+                    ara_req_d.special_hazard = 1'b1;
                   end
                   6'b110101: begin // VWADD.W
                     ara_req_d.op             = ara_pkg::VADD;
@@ -1436,6 +1449,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #(
                     ara_req_d.conversion_vs1 = OpQueueConversionSExt2;
                     ara_req_d.eew_vs2        = vtype_q.vsew.next();
                     ara_req_d.cvt_resize     = CVT_WIDE;
+                    ara_req_d.special_hazard = 1'b1;
                   end
                   6'b110110: begin // VWSUBU.W
                     ara_req_d.op             = ara_pkg::VSUB;
@@ -1445,6 +1459,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #(
                     ara_req_d.conversion_vs1 = OpQueueConversionZExt2;
                     ara_req_d.eew_vs2        = vtype_q.vsew.next();
                     ara_req_d.cvt_resize     = CVT_WIDE;
+                    ara_req_d.special_hazard = 1'b1;
                   end
                   6'b110111: begin // VWSUB.W
                     ara_req_d.op             = ara_pkg::VSUB;
@@ -1454,6 +1469,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #(
                     ara_req_d.conversion_vs1 = OpQueueConversionSExt2;
                     ara_req_d.eew_vs2        = vtype_q.vsew.next();
                     ara_req_d.cvt_resize     = CVT_WIDE;
+                    ara_req_d.special_hazard = 1'b1;
                   end
                   6'b111000: begin // VWMULU
                     ara_req_d.op             = ara_pkg::VMUL;
@@ -1462,6 +1478,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #(
                     ara_req_d.conversion_vs1 = OpQueueConversionZExt2;
                     ara_req_d.conversion_vs2 = OpQueueConversionZExt2;
                     ara_req_d.cvt_resize     = CVT_WIDE;
+                    ara_req_d.special_hazard = 1'b1;
                   end
                   6'b111010: begin // VWMULSU
                     ara_req_d.op             = ara_pkg::VMUL;
@@ -1470,6 +1487,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #(
                     ara_req_d.conversion_vs1 = OpQueueConversionZExt2;
                     ara_req_d.conversion_vs2 = OpQueueConversionSExt2;
                     ara_req_d.cvt_resize     = CVT_WIDE;
+                    ara_req_d.special_hazard = 1'b1;
                   end
                   6'b111011: begin // VWMUL
                     ara_req_d.op             = ara_pkg::VMUL;
@@ -1478,6 +1496,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #(
                     ara_req_d.conversion_vs1 = OpQueueConversionSExt2;
                     ara_req_d.conversion_vs2 = OpQueueConversionSExt2;
                     ara_req_d.cvt_resize     = CVT_WIDE;
+                    ara_req_d.special_hazard = 1'b1;
                   end
                   6'b111100: begin // VWMACCU
                     ara_req_d.op             = ara_pkg::VMACC;
@@ -1508,6 +1527,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #(
                     ara_req_d.conversion_vs2 = OpQueueConversionZExt2;
                     ara_req_d.eew_vd_op      = vtype_q.vsew.next();
                     ara_req_d.cvt_resize     = CVT_WIDE;
+                    ara_req_d.special_hazard = 1'b1;
                   end
                   default: illegal_insn = 1'b1;
                 endcase
@@ -1567,6 +1587,8 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #(
                     ara_req_d.eew_vs2 = vtype_q.vsew;
                     // Request will need reshuffling
                     ara_req_d.scale_vl = 1'b1;
+                    // Special hazard handling for this instruction
+                    ara_req_d.special_hazard = 1'b1;
                     // If stride > vl, the vslideup has no effects
                     if (|ara_req_d.stride[$bits(ara_req_d.stride)-1:$bits(vl_q)] ||
                       (vlen_t'(ara_req_d.stride) >= vl_q)) null_vslideup = 1'b1;
@@ -1577,6 +1599,8 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #(
                     ara_req_d.eew_vs2 = vtype_q.vsew;
                     // Request will need reshuffling
                     ara_req_d.scale_vl = 1'b1;
+                    // Special hazard handling for this instruction
+                    ara_req_d.special_hazard = 1'b1;
                   end
                   6'b010000: begin // VRXUNARY0
                     // vmv.s.x
@@ -1625,6 +1649,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #(
                     ara_req_d.conversion_vs1 = OpQueueConversionZExt2;
                     ara_req_d.conversion_vs2 = OpQueueConversionZExt2;
                     ara_req_d.cvt_resize     = CVT_WIDE;
+                    ara_req_d.special_hazard = 1'b1;
                   end
                   6'b110001: begin // VWADD
                     ara_req_d.op             = ara_pkg::VADD;
@@ -1633,6 +1658,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #(
                     ara_req_d.conversion_vs1 = OpQueueConversionSExt2;
                     ara_req_d.conversion_vs2 = OpQueueConversionSExt2;
                     ara_req_d.cvt_resize     = CVT_WIDE;
+                    ara_req_d.special_hazard = 1'b1;
                   end
                   6'b110010: begin // VWSUBU
                     ara_req_d.op             = ara_pkg::VSUB;
@@ -1641,6 +1667,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #(
                     ara_req_d.conversion_vs1 = OpQueueConversionZExt2;
                     ara_req_d.conversion_vs2 = OpQueueConversionZExt2;
                     ara_req_d.cvt_resize     = CVT_WIDE;
+                    ara_req_d.special_hazard = 1'b1;
                   end
                   6'b110011: begin // VWSUB
                     ara_req_d.op             = ara_pkg::VSUB;
@@ -1649,6 +1676,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #(
                     ara_req_d.conversion_vs1 = OpQueueConversionSExt2;
                     ara_req_d.conversion_vs2 = OpQueueConversionSExt2;
                     ara_req_d.cvt_resize     = CVT_WIDE;
+                    ara_req_d.special_hazard = 1'b1;
                   end
                   6'b110100: begin // VWADDU.W
                     ara_req_d.op             = ara_pkg::VADD;
@@ -1658,6 +1686,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #(
                     ara_req_d.conversion_vs1 = OpQueueConversionZExt2;
                     ara_req_d.eew_vs2        = vtype_q.vsew.next();
                     ara_req_d.cvt_resize     = CVT_WIDE;
+                    ara_req_d.special_hazard = 1'b1;
                   end
                   6'b110101: begin // VWADD.W
                     ara_req_d.op             = ara_pkg::VADD;
@@ -1667,6 +1696,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #(
                     ara_req_d.conversion_vs1 = OpQueueConversionSExt2;
                     ara_req_d.eew_vs2        = vtype_q.vsew.next();
                     ara_req_d.cvt_resize     = CVT_WIDE;
+                    ara_req_d.special_hazard = 1'b1;
                   end
                   6'b110110: begin // VWSUBU.W
                     ara_req_d.op             = ara_pkg::VSUB;
@@ -1676,6 +1706,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #(
                     ara_req_d.conversion_vs1 = OpQueueConversionZExt2;
                     ara_req_d.eew_vs2        = vtype_q.vsew.next();
                     ara_req_d.cvt_resize     = CVT_WIDE;
+                    ara_req_d.special_hazard = 1'b1;
                   end
                   6'b110111: begin // VWSUB.W
                     ara_req_d.op             = ara_pkg::VSUB;
@@ -1685,6 +1716,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #(
                     ara_req_d.conversion_vs1 = OpQueueConversionSExt2;
                     ara_req_d.eew_vs2        = vtype_q.vsew.next();
                     ara_req_d.cvt_resize     = CVT_WIDE;
+                    ara_req_d.special_hazard = 1'b1;
                   end
                   6'b111000: begin // VWMULU
                     ara_req_d.op             = ara_pkg::VMUL;
@@ -1693,6 +1725,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #(
                     ara_req_d.conversion_vs1 = OpQueueConversionZExt2;
                     ara_req_d.conversion_vs2 = OpQueueConversionZExt2;
                     ara_req_d.cvt_resize     = CVT_WIDE;
+                    ara_req_d.special_hazard = 1'b1;
                   end
                   6'b111010: begin // VWMULSU
                     ara_req_d.op             = ara_pkg::VMUL;
@@ -1701,6 +1734,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #(
                     ara_req_d.conversion_vs1 = OpQueueConversionZExt2;
                     ara_req_d.conversion_vs2 = OpQueueConversionSExt2;
                     ara_req_d.cvt_resize     = CVT_WIDE;
+                    ara_req_d.special_hazard = 1'b1;
                   end
                   6'b111011: begin // VWMUL
                     ara_req_d.op             = ara_pkg::VMUL;
@@ -1709,6 +1743,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #(
                     ara_req_d.conversion_vs1 = OpQueueConversionSExt2;
                     ara_req_d.conversion_vs2 = OpQueueConversionSExt2;
                     ara_req_d.cvt_resize     = CVT_WIDE;
+                    ara_req_d.special_hazard = 1'b1;
                   end
                   6'b111100: begin // VWMACCU
                     ara_req_d.op             = ara_pkg::VMACC;
@@ -1719,6 +1754,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #(
                     ara_req_d.conversion_vs2 = OpQueueConversionZExt2;
                     ara_req_d.eew_vd_op      = vtype_q.vsew.next();
                     ara_req_d.cvt_resize     = CVT_WIDE;
+                    ara_req_d.special_hazard = 1'b1;
                   end
                   6'b111101: begin // VWMACC
                     ara_req_d.op             = ara_pkg::VMACC;
@@ -1729,6 +1765,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #(
                     ara_req_d.conversion_vs2 = OpQueueConversionSExt2;
                     ara_req_d.eew_vd_op      = vtype_q.vsew.next();
                     ara_req_d.cvt_resize     = CVT_WIDE;
+                    ara_req_d.special_hazard = 1'b1;
                   end
                   6'b111110: begin // VWMACCUS
                     ara_req_d.op             = ara_pkg::VMACC;
@@ -1739,6 +1776,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #(
                     ara_req_d.conversion_vs2 = OpQueueConversionSExt2;
                     ara_req_d.eew_vd_op      = vtype_q.vsew.next();
                     ara_req_d.cvt_resize     = CVT_WIDE;
+                    ara_req_d.special_hazard = 1'b1;
                   end
                   6'b111111: begin // VWMACCSU
                     ara_req_d.op             = ara_pkg::VMACC;
@@ -1749,6 +1787,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #(
                     ara_req_d.conversion_vs2 = OpQueueConversionZExt2;
                     ara_req_d.eew_vd_op      = vtype_q.vsew.next();
                     ara_req_d.cvt_resize     = CVT_WIDE;
+                    ara_req_d.special_hazard = 1'b1;
                   end
                   default: illegal_insn = 1'b1;
                 endcase
@@ -1883,6 +1922,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #(
                         5'b01000: begin // Widening VFCVTXUF
                           ara_req_d.op             = VFCVTXUF;
                           ara_req_d.cvt_resize     = CVT_WIDE;
+                          ara_req_d.special_hazard = 1'b1;
                           ara_req_d.emul           = next_lmul(vtype_q.vlmul);
                           ara_req_d.vtype.vsew     = vtype_q.vsew.next();
                           ara_req_d.conversion_vs2 = OpQueueAdjustFPCvt;
@@ -1890,6 +1930,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #(
                         5'b01001: begin // Widening VFCVTXF
                           ara_req_d.op             = VFCVTXF;
                           ara_req_d.cvt_resize     = CVT_WIDE;
+                          ara_req_d.special_hazard = 1'b1;
                           ara_req_d.emul           = next_lmul(vtype_q.vlmul);
                           ara_req_d.vtype.vsew     = vtype_q.vsew.next();
                           ara_req_d.conversion_vs2 = OpQueueAdjustFPCvt;
@@ -1897,6 +1938,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #(
                         5'b01010: begin // Widening VFCVTFXU
                           ara_req_d.op             = VFCVTFXU;
                           ara_req_d.cvt_resize     = CVT_WIDE;
+                          ara_req_d.special_hazard = 1'b1;
                           ara_req_d.emul           = next_lmul(vtype_q.vlmul);
                           ara_req_d.vtype.vsew     = vtype_q.vsew.next();
                           ara_req_d.conversion_vs2 = OpQueueAdjustFPCvt;
@@ -1904,6 +1946,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #(
                         5'b01011: begin // Widening VFCVTFX
                           ara_req_d.op             = VFCVTFX;
                           ara_req_d.cvt_resize     = CVT_WIDE;
+                          ara_req_d.special_hazard = 1'b1;
                           ara_req_d.emul           = next_lmul(vtype_q.vlmul);
                           ara_req_d.vtype.vsew     = vtype_q.vsew.next();
                           ara_req_d.conversion_vs2 = OpQueueAdjustFPCvt;
@@ -1911,6 +1954,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #(
                         5'b01100: begin // Widening VFCVTFF
                           ara_req_d.op             = VFCVTFF;
                           ara_req_d.cvt_resize     = CVT_WIDE;
+                          ara_req_d.special_hazard = 1'b1;
                           ara_req_d.emul           = next_lmul(vtype_q.vlmul);
                           ara_req_d.vtype.vsew     = vtype_q.vsew.next();
                           ara_req_d.conversion_vs2 = OpQueueAdjustFPCvt;
@@ -1918,6 +1962,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #(
                         5'b01110: begin // Widening VFCVTRTZXUF
                           ara_req_d.op             = VFCVTRTZXUF;
                           ara_req_d.cvt_resize     = CVT_WIDE;
+                          ara_req_d.special_hazard = 1'b1;
                           ara_req_d.emul           = next_lmul(vtype_q.vlmul);
                           ara_req_d.vtype.vsew     = vtype_q.vsew.next();
                           ara_req_d.conversion_vs2 = OpQueueAdjustFPCvt;
@@ -1925,6 +1970,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #(
                         5'b01111: begin // Widening VFCVTRTZXF
                           ara_req_d.op             = VFCVTRTZXF;
                           ara_req_d.cvt_resize     = CVT_WIDE;
+                          ara_req_d.special_hazard = 1'b1;
                           ara_req_d.emul           = next_lmul(vtype_q.vlmul);
                           ara_req_d.vtype.vsew     = vtype_q.vsew.next();
                           ara_req_d.conversion_vs2 = OpQueueAdjustFPCvt;
@@ -2032,6 +2078,8 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #(
                       ara_req_d.vtype.vsew     = vtype_q.vsew.next();
                       ara_req_d.conversion_vs1 = OpQueueConversionWideFP2;
                       ara_req_d.conversion_vs2 = OpQueueConversionWideFP2;
+                      ara_req_d.cvt_resize     = CVT_WIDE;
+                      ara_req_d.special_hazard = 1'b1;
                     end
                     6'b110001: begin // VFWREDUSUM
                       ara_req_d.op             = ara_pkg::VFWREDUSUM;
@@ -2041,7 +2089,8 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #(
                       ara_req_d.conversion_vs1 = OpQueueReductionZExt;
                       ara_req_d.conversion_vs2 = OpQueueConversionWideFP2;
                       ara_req_d.eew_vs1        = vtype_q.vsew.next();
-                      ara_req_d.cvt_resize     = resize_e'(2'b00);
+                      ara_req_d.cvt_resize     = CVT_WIDE;
+                      ara_req_d.special_hazard = 1'b1;
                     end
                     6'b110010: begin // VFWSUB
                       ara_req_d.op             = ara_pkg::VFSUB;
@@ -2050,6 +2099,8 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #(
                       ara_req_d.vtype.vsew     = vtype_q.vsew.next();
                       ara_req_d.conversion_vs1 = OpQueueConversionWideFP2;
                       ara_req_d.conversion_vs2 = OpQueueConversionWideFP2;
+                      ara_req_d.cvt_resize     = CVT_WIDE;
+                      ara_req_d.special_hazard = 1'b1;
                     end
                     6'b110011: begin // VFWREDOSUM
                       ara_req_d.op             = ara_pkg::VFWREDOSUM;
@@ -2059,7 +2110,8 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #(
                       ara_req_d.conversion_vs1 = OpQueueReductionZExt;
                       ara_req_d.conversion_vs2 = OpQueueConversionWideFP2;
                       ara_req_d.eew_vs1        = vtype_q.vsew.next();
-                      ara_req_d.cvt_resize     = resize_e'(2'b00);
+                      ara_req_d.cvt_resize     = CVT_WIDE;
+                      ara_req_d.special_hazard = 1'b1;
                     end
                     6'b110100: begin // VFWADD.W
                       ara_req_d.op             = ara_pkg::VFADD;
@@ -2069,6 +2121,8 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #(
                       ara_req_d.vtype.vsew     = vtype_q.vsew.next();
                       ara_req_d.eew_vs2        = vtype_q.vsew.next();
                       ara_req_d.conversion_vs1 = OpQueueConversionWideFP2;
+                      ara_req_d.cvt_resize     = CVT_WIDE;
+                      ara_req_d.special_hazard = 1'b1;
                     end
                     6'b110110: begin // VFWSUB.W
                       ara_req_d.op             = ara_pkg::VFSUB;
@@ -2078,6 +2132,8 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #(
                       ara_req_d.vtype.vsew     = vtype_q.vsew.next();
                       ara_req_d.eew_vs2        = vtype_q.vsew.next();
                       ara_req_d.conversion_vs1 = OpQueueConversionWideFP2;
+                      ara_req_d.cvt_resize     = CVT_WIDE;
+                      ara_req_d.special_hazard = 1'b1;
                     end
                     6'b111000: begin // VFWMUL
                       ara_req_d.op             = ara_pkg::VFMUL;
@@ -2085,6 +2141,8 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #(
                       ara_req_d.vtype.vsew     = vtype_q.vsew.next();
                       ara_req_d.conversion_vs1 = OpQueueConversionWideFP2;
                       ara_req_d.conversion_vs2 = OpQueueConversionWideFP2;
+                      ara_req_d.cvt_resize     = CVT_WIDE;
+                      ara_req_d.special_hazard = 1'b1;
                     end
                     6'b111100: begin // VFWMACC
                       ara_req_d.op             = ara_pkg::VFMACC;
@@ -2094,6 +2152,8 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #(
                       ara_req_d.conversion_vs1 = OpQueueConversionWideFP2;
                       ara_req_d.conversion_vs2 = OpQueueConversionWideFP2;
                       ara_req_d.eew_vd_op      = vtype_q.vsew.next();
+                      ara_req_d.cvt_resize     = CVT_WIDE;
+                      ara_req_d.special_hazard = 1'b1;
                     end
                     6'b111101: begin // VFWNMACC
                       ara_req_d.op             = ara_pkg::VFNMACC;
@@ -2103,6 +2163,8 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #(
                       ara_req_d.conversion_vs1 = OpQueueConversionWideFP2;
                       ara_req_d.conversion_vs2 = OpQueueConversionWideFP2;
                       ara_req_d.eew_vd_op      = vtype_q.vsew.next();
+                      ara_req_d.cvt_resize     = CVT_WIDE;
+                      ara_req_d.special_hazard = 1'b1;
                     end
                     6'b111110: begin // VFWMSAC
                       ara_req_d.op             = ara_pkg::VFMSAC;
@@ -2112,6 +2174,8 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #(
                       ara_req_d.conversion_vs1 = OpQueueConversionWideFP2;
                       ara_req_d.conversion_vs2 = OpQueueConversionWideFP2;
                       ara_req_d.eew_vd_op      = vtype_q.vsew.next();
+                      ara_req_d.cvt_resize     = CVT_WIDE;
+                      ara_req_d.special_hazard = 1'b1;
                     end
                     6'b111111: begin // VFWNMSAC
                       ara_req_d.op             = ara_pkg::VFNMSAC;
@@ -2121,6 +2185,8 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #(
                       ara_req_d.conversion_vs1 = OpQueueConversionWideFP2;
                       ara_req_d.conversion_vs2 = OpQueueConversionWideFP2;
                       ara_req_d.eew_vd_op      = vtype_q.vsew.next();
+                      ara_req_d.cvt_resize     = CVT_WIDE;
+                      ara_req_d.special_hazard = 1'b1;
                     end
                     default: illegal_insn = 1'b1;
                   endcase
@@ -2217,6 +2283,8 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #(
                     ara_req_d.eew_vs2  = vtype_q.vsew;
                     // Request will need reshuffling
                     ara_req_d.scale_vl = 1'b1;
+                    // Special hazard handling for this instruction
+                    ara_req_d.special_hazard = 1'b1;
                     // If stride > vl, the vslideup has no effects
                     if (|ara_req_d.stride[$bits(ara_req_d.stride)-1:$bits(vl_q)] ||
                       (vlen_t'(ara_req_d.stride) >= vl_q)) null_vslideup = 1'b1;
@@ -2224,9 +2292,11 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #(
                     6'b001111: begin // vfslide1down
                       ara_req_d.op     = ara_pkg::VSLIDEDOWN;
                       ara_req_d.stride = 1;
-                    ara_req_d.eew_vs2  = vtype_q.vsew;
-                    // Request will need reshuffling
-                    ara_req_d.scale_vl = 1'b1;
+                      ara_req_d.eew_vs2  = vtype_q.vsew;
+                      // Request will need reshuffling
+                      ara_req_d.scale_vl = 1'b1;
+                      // Special hazard handling for this instruction
+                      ara_req_d.special_hazard = 1'b1;
                     end
                     6'b010000: begin // VRFUNARY0
                       // vmv.s.f
diff --git a/hardware/src/ara_sequencer.sv b/hardware/src/ara_sequencer.sv
index 8355a97de..539e3d2b3 100644
--- a/hardware/src/ara_sequencer.sv
+++ b/hardware/src/ara_sequencer.sv
@@ -361,6 +361,7 @@ module ara_sequencer import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg::i
               fp_rm         : ara_req_i.fp_rm,
               wide_fp_imm   : ara_req_i.wide_fp_imm,
               cvt_resize    : ara_req_i.cvt_resize,
+              special_hazard: ara_req_i.special_hazard,
               scale_vl      : ara_req_i.scale_vl,
               vl            : ara_req_i.vl,
               vstart        : ara_req_i.vstart,
@@ -384,8 +385,10 @@ module ara_sequencer import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg::i
             if ((!(|{ara_req_i.use_vs1, ara_req_i.use_vs2, ara_req_i.use_vd_op, !ara_req_i.vm})              &&
                 |{pe_req_d.hazard_vs1, pe_req_d.hazard_vs2, pe_req_d.hazard_vm, pe_req_d.hazard_vd}          &&
                 !(is_load(pe_req_d.op)))                                                                     ||
-                (pe_req_d.op == VSLIDEUP && |{pe_req_d.hazard_vd, pe_req_d.hazard_vs1, pe_req_d.hazard_vs2}) ||
-                (pe_req_d.op == VSLIDEDOWN && |{pe_req_d.hazard_vs1, pe_req_d.hazard_vs2}))
+                (pe_req_d.op == VSLIDEUP && !pe_req_d.use_scalar_op &&
+                |{pe_req_d.hazard_vd, pe_req_d.hazard_vs1, pe_req_d.hazard_vs2}) ||
+                (pe_req_d.op == VSLIDEDOWN && !pe_req_d.use_scalar_op &&
+                |{pe_req_d.hazard_vs1, pe_req_d.hazard_vs2}))
             begin
               ara_req_ready_o = 1'b0;
               pe_req_valid_d  = 1'b0;
diff --git a/hardware/src/lane/lane_sequencer.sv b/hardware/src/lane/lane_sequencer.sv
index 722bab7a5..cee688f18 100644
--- a/hardware/src/lane/lane_sequencer.sv
+++ b/hardware/src/lane/lane_sequencer.sv
@@ -240,42 +240,44 @@ module lane_sequencer import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg::
       unique case (pe_req.vfu)
         VFU_Alu: begin
           operand_request_i[AluA] = '{
-            id         : pe_req.id,
-            vs         : pe_req.vs1,
-            eew        : pe_req.eew_vs1,
+            id             : pe_req.id,
+            vs             : pe_req.vs1,
+            eew            : pe_req.eew_vs1,
             // If reductions and vl == 0, we must replace with neutral values
-            conv       : (vfu_operation_d.vl == '0) ? OpQueueReductionZExt : pe_req.conversion_vs1,
-            scale_vl   : pe_req.scale_vl,
-            cvt_resize : pe_req.cvt_resize,
-            vtype      : pe_req.vtype,
+            conv           : (vfu_operation_d.vl == '0) ? OpQueueReductionZExt : pe_req.conversion_vs1,
+            scale_vl       : pe_req.scale_vl,
+            cvt_resize     : pe_req.cvt_resize,
+            special_hazard : pe_req.special_hazard,
+            vtype          : pe_req.vtype,
             // In case of reduction, AluA opqueue will keep the scalar element
-            vl         : (pe_req.op inside {[VREDSUM:VWREDSUM]}) ? 1 : vfu_operation_d.vl,
-            vstart     : vfu_operation_d.vstart,
-            hazard     : pe_req.hazard_vs1 | pe_req.hazard_vd,
-            is_reduct  : pe_req.op inside {[VREDSUM:VWREDSUM]} ? 1'b1 : 0,
-            target_fu  : ALU_SLDU,
-            default    : '0
+            vl             : (pe_req.op inside {[VREDSUM:VWREDSUM]}) ? 1 : vfu_operation_d.vl,
+            vstart         : vfu_operation_d.vstart,
+            hazard         : pe_req.hazard_vs1 | pe_req.hazard_vd,
+            is_reduct      : pe_req.op inside {[VREDSUM:VWREDSUM]} ? 1'b1 : 0,
+            target_fu      : ALU_SLDU,
+            default        : '0
           };
           operand_request_push[AluA] = pe_req.use_vs1;
 
           operand_request_i[AluB] = '{
-            id         : pe_req.id,
-            vs         : pe_req.vs2,
-            eew        : pe_req.eew_vs2,
+            id             : pe_req.id,
+            vs             : pe_req.vs2,
+            eew            : pe_req.eew_vs2,
             // If reductions and vl == 0, we must replace with neutral values
-            conv       : (vfu_operation_d.vl == '0) ? OpQueueReductionZExt : pe_req.conversion_vs2,
-            scale_vl   : pe_req.scale_vl,
-            cvt_resize : pe_req.cvt_resize,
-            vtype      : pe_req.vtype,
+            conv           : (vfu_operation_d.vl == '0) ? OpQueueReductionZExt : pe_req.conversion_vs2,
+            scale_vl       : pe_req.scale_vl,
+            cvt_resize     : pe_req.cvt_resize,
+            special_hazard : pe_req.special_hazard,
+            vtype          : pe_req.vtype,
             // If reductions and vl == 0, we must replace the operands with neutral
             // values in the opqueues. So, vl must be 1 at least
-            vl         : (pe_req.op inside {[VREDSUM:VWREDSUM]} && vfu_operation_d.vl == '0)
-                         ? 1 : vfu_operation_d.vl,
-            vstart     : vfu_operation_d.vstart,
-            hazard     : pe_req.hazard_vs2 | pe_req.hazard_vd,
-            is_reduct  : pe_req.op inside {[VREDSUM:VWREDSUM]} ? 1'b1 : 0,
-            target_fu  : ALU_SLDU,
-            default    : '0
+            vl             : (pe_req.op inside {[VREDSUM:VWREDSUM]} && vfu_operation_d.vl == '0)
+                             ? 1 : vfu_operation_d.vl,
+            vstart         : vfu_operation_d.vstart,
+            hazard         : pe_req.hazard_vs2 | pe_req.hazard_vd,
+            is_reduct      : pe_req.op inside {[VREDSUM:VWREDSUM]} ? 1'b1 : 0,
+            target_fu      : ALU_SLDU,
+            default        : '0
           };
           operand_request_push[AluB] = pe_req.use_vs2;
 
@@ -298,66 +300,69 @@ module lane_sequencer import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg::
         end
         VFU_MFpu: begin
           operand_request_i[MulFPUA] = '{
-            id         : pe_req.id,
-            vs         : pe_req.vs1,
-            eew        : pe_req.eew_vs1,
+            id             : pe_req.id,
+            vs             : pe_req.vs1,
+            eew            : pe_req.eew_vs1,
             // If reductions and vl == 0, we must replace with neutral values
-            conv       : pe_req.conversion_vs1,
-            scale_vl   : pe_req.scale_vl,
-            cvt_resize : pe_req.cvt_resize,
-            vtype      : pe_req.vtype,
+            conv           : pe_req.conversion_vs1,
+            scale_vl       : pe_req.scale_vl,
+            cvt_resize     : pe_req.cvt_resize,
+            special_hazard : pe_req.special_hazard,
+            vtype          : pe_req.vtype,
             // If reductions and vl == 0, we must replace the operands with neutral
             // values in the opqueues. So, vl must be 1 at least
-            vl         : (pe_req.op inside {[VFREDUSUM:VFWREDOSUM]}) ? 1 : vfu_operation_d.vl,
-            vstart     : vfu_operation_d.vstart,
-            hazard     : pe_req.hazard_vs1 | pe_req.hazard_vd,
-            is_reduct  : pe_req.op inside {[VFREDUSUM:VFWREDOSUM]} ? 1'b1 : 0,
-            target_fu  : MFPU_ADDRGEN,
-            default    : '0
+            vl             : (pe_req.op inside {[VFREDUSUM:VFWREDOSUM]}) ? 1 : vfu_operation_d.vl,
+            vstart         : vfu_operation_d.vstart,
+            hazard         : pe_req.hazard_vs1 | pe_req.hazard_vd,
+            is_reduct      : pe_req.op inside {[VFREDUSUM:VFWREDOSUM]} ? 1'b1 : 0,
+            target_fu      : MFPU_ADDRGEN,
+            default        : '0
           };
           operand_request_push[MulFPUA] = pe_req.use_vs1;
 
           operand_request_i[MulFPUB] = '{
-            id         : pe_req.id,
-            vs         : pe_req.swap_vs2_vd_op ? pe_req.vd        : pe_req.vs2,
-            eew        : pe_req.swap_vs2_vd_op ? pe_req.eew_vd_op : pe_req.eew_vs2,
+            id               : pe_req.id,
+            vs               : pe_req.swap_vs2_vd_op ? pe_req.vd        : pe_req.vs2,
+            eew              : pe_req.swap_vs2_vd_op ? pe_req.eew_vd_op : pe_req.eew_vs2,
             // If reductions and vl == 0, we must replace with neutral values
-            conv       : pe_req.conversion_vs2,
-            scale_vl   : pe_req.scale_vl,
-            cvt_resize : pe_req.cvt_resize,
-            vtype      : pe_req.vtype,
+            conv             : pe_req.conversion_vs2,
+            scale_vl         : pe_req.scale_vl,
+            cvt_resize       : pe_req.cvt_resize,
+            special_hazard   : pe_req.special_hazard,
+            vtype            : pe_req.vtype,
             // If reductions and vl == 0, we must replace the operands with neutral
             // values in the opqueues. So, vl must be 1 at least
-            vl         : (pe_req.op inside {[VFREDUSUM:VFWREDOSUM]} && vfu_operation_d.vl == '0)
-                        ? 1 : vfu_operation_d.vl,
-            vstart     : vfu_operation_d.vstart,
-            hazard     : (pe_req.swap_vs2_vd_op ?
+            vl               : (pe_req.op inside {[VFREDUSUM:VFWREDOSUM]} && vfu_operation_d.vl == '0)
+                               ? 1 : vfu_operation_d.vl,
+            vstart           : vfu_operation_d.vstart,
+            hazard           : (pe_req.swap_vs2_vd_op ?
             pe_req.hazard_vd : (pe_req.hazard_vs2 | pe_req.hazard_vd)),
-            is_reduct  : pe_req.op inside {[VFREDUSUM:VFWREDOSUM]} ? 1'b1 : 0,
-            target_fu  : MFPU_ADDRGEN,
-            default: '0
+            is_reduct        : pe_req.op inside {[VFREDUSUM:VFWREDOSUM]} ? 1'b1 : 0,
+            target_fu        : MFPU_ADDRGEN,
+            default          : '0
           };
           operand_request_push[MulFPUB] = pe_req.swap_vs2_vd_op ?
           pe_req.use_vd_op : pe_req.use_vs2;
 
           operand_request_i[MulFPUC] = '{
-            id         : pe_req.id,
-            vs         : pe_req.swap_vs2_vd_op ? pe_req.vs2            : pe_req.vd,
-            eew        : pe_req.swap_vs2_vd_op ? pe_req.eew_vs2        : pe_req.eew_vd_op,
-            conv       : pe_req.swap_vs2_vd_op ? pe_req.conversion_vs2 : OpQueueConversionNone,
-            scale_vl   : pe_req.scale_vl,
-            cvt_resize : pe_req.cvt_resize,
+            id             : pe_req.id,
+            vs             : pe_req.swap_vs2_vd_op ? pe_req.vs2            : pe_req.vd,
+            eew            : pe_req.swap_vs2_vd_op ? pe_req.eew_vs2        : pe_req.eew_vd_op,
+            conv           : pe_req.swap_vs2_vd_op ? pe_req.conversion_vs2 : OpQueueConversionNone,
+            scale_vl       : pe_req.scale_vl,
+            cvt_resize     : pe_req.cvt_resize,
+            special_hazard : pe_req.special_hazard,
             // If reductions and vl == 0, we must replace the operands with neutral
             // values in the opqueues. So, vl must be 1 at least
-            vl         : (pe_req.op inside {[VFREDUSUM:VFWREDOSUM]} && vfu_operation_d.vl == '0)
-                        ? 1 : vfu_operation_d.vl,
-            vstart     : vfu_operation_d.vstart,
-            vtype      : pe_req.vtype,
-            hazard     : pe_req.swap_vs2_vd_op ?
+            vl             : (pe_req.op inside {[VFREDUSUM:VFWREDOSUM]} && vfu_operation_d.vl == '0)
+                            ? 1 : vfu_operation_d.vl,
+            vstart         : vfu_operation_d.vstart,
+            vtype          : pe_req.vtype,
+            hazard         : pe_req.swap_vs2_vd_op ?
             (pe_req.hazard_vs2 | pe_req.hazard_vd) : pe_req.hazard_vd,
-            is_reduct  : pe_req.op inside {[VFREDUSUM:VFWREDOSUM]} ? 1'b1 : 0,
-            target_fu  : MFPU_ADDRGEN,
-            default : '0
+            is_reduct      : pe_req.op inside {[VFREDUSUM:VFWREDOSUM]} ? 1'b1 : 0,
+            target_fu      : MFPU_ADDRGEN,
+            default        : '0
           };
           operand_request_push[MulFPUC] = pe_req.swap_vs2_vd_op ?
           pe_req.use_vs2 : pe_req.use_vd_op;
@@ -399,17 +404,18 @@ module lane_sequencer import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg::
 
           // Load indexed
           operand_request_i[SlideAddrGenA] = '{
-            id       : pe_req_i.id,
-            vs       : pe_req_i.vs2,
-            eew      : pe_req_i.eew_vs2,
-            conv     : pe_req_i.conversion_vs2,
-            target_fu: MFPU_ADDRGEN,
-            vl       : pe_req_i.vl / NrLanes,
-            scale_vl : pe_req_i.scale_vl,
-            vstart   : vfu_operation_d.vstart,
-            vtype    : pe_req_i.vtype,
-            hazard   : pe_req_i.hazard_vs2 | pe_req_i.hazard_vd,
-            default  : '0
+            id             : pe_req_i.id,
+            vs             : pe_req_i.vs2,
+            eew            : pe_req_i.eew_vs2,
+            conv           : pe_req_i.conversion_vs2,
+            target_fu      : MFPU_ADDRGEN,
+            special_hazard : pe_req.special_hazard,
+            vl             : pe_req_i.vl / NrLanes,
+            scale_vl       : pe_req_i.scale_vl,
+            vstart         : vfu_operation_d.vstart,
+            vtype          : pe_req_i.vtype,
+            hazard         : pe_req_i.hazard_vs2 | pe_req_i.hazard_vd,
+            default        : '0
           };
           // Since this request goes outside of the lane, we might need to request an
           // extra operand regardless of whether it is valid in this lane or not.
@@ -455,17 +461,18 @@ module lane_sequencer import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg::
 
           // Store indexed
           operand_request_i[SlideAddrGenA] = '{
-            id       : pe_req_i.id,
-            vs       : pe_req_i.vs2,
-            eew      : pe_req_i.eew_vs2,
-            conv     : pe_req_i.conversion_vs2,
-            target_fu: MFPU_ADDRGEN,
-            vl       : pe_req_i.vl / NrLanes,
-            scale_vl : pe_req_i.scale_vl,
-            vstart   : vfu_operation_d.vstart,
-            vtype    : pe_req_i.vtype,
-            hazard   : pe_req_i.hazard_vs2 | pe_req_i.hazard_vd,
-            default  : '0
+            id             : pe_req_i.id,
+            vs             : pe_req_i.vs2,
+            eew            : pe_req_i.eew_vs2,
+            conv           : pe_req_i.conversion_vs2,
+            target_fu      : MFPU_ADDRGEN,
+            special_hazard : pe_req.special_hazard,
+            vl             : pe_req_i.vl / NrLanes,
+            scale_vl       : pe_req_i.scale_vl,
+            vstart         : vfu_operation_d.vstart,
+            vtype          : pe_req_i.vtype,
+            hazard         : pe_req_i.hazard_vs2 | pe_req_i.hazard_vd,
+            default        : '0
           };
           // Since this request goes outside of the lane, we might need to request an
           // extra operand regardless of whether it is valid in this lane or not.
@@ -476,16 +483,17 @@ module lane_sequencer import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg::
 
         VFU_SlideUnit: begin
           operand_request_i[SlideAddrGenA] = '{
-            id       : pe_req.id,
-            vs       : pe_req.vs2,
-            eew      : pe_req.eew_vs2,
-            conv     : pe_req.conversion_vs2,
-            target_fu: ALU_SLDU,
-            scale_vl : pe_req.scale_vl,
-            vtype    : pe_req.vtype,
-            vstart   : vfu_operation_d.vstart,
-            hazard   : pe_req.hazard_vs2 | pe_req.hazard_vd,
-            default  : '0
+            id             : pe_req.id,
+            vs             : pe_req.vs2,
+            eew            : pe_req.eew_vs2,
+            conv           : pe_req.conversion_vs2,
+            target_fu      : ALU_SLDU,
+            special_hazard : pe_req.special_hazard,
+            scale_vl       : pe_req.scale_vl,
+            vtype          : pe_req.vtype,
+            vstart         : vfu_operation_d.vstart,
+            hazard         : pe_req.hazard_vs2 | pe_req.hazard_vd,
+            default        : '0
           };
           operand_request_push[SlideAddrGenA] = pe_req.use_vs2;
 
diff --git a/hardware/src/lane/operand_requester.sv b/hardware/src/lane/operand_requester.sv
index ba9895268..97ce49ad1 100644
--- a/hardware/src/lane/operand_requester.sv
+++ b/hardware/src/lane/operand_requester.sv
@@ -255,15 +255,27 @@ module operand_requester import ara_pkg::*; import rvv_pkg::*; #(
       // In case of a WAW with a previous instruction,
       // read once every two writes of the previous instruction
       logic is_widening;
+      // Does this instruction have a special hazard protocol?
+      logic special_hazard;
       // One-bit counters
       logic [NrVInsn-1:0] waw_hazard_counter;
     } requester_d, requester_q;
 
+    // Asserted if the SLDU requester is registering a new instruction
+    logic new_sldu_insn;
+    logic has_stalled_d, has_stalled_q;
 
     // Is there a hazard during this cycle?
+    // WAW hazards with widening instructions are special: wait for 2 writes instead of 1
+    // Slide1Up/Down with hazards should wait one cycle before being handled normally
     logic stall;
-    assign stall = |(requester_q.hazard & ~(vinsn_result_written_q &
-                   (~{NrVInsn{requester_q.is_widening}} | requester_q.waw_hazard_counter)));
+    assign stall = |(requester_q.hazard & ~(vinsn_result_written_q & ((~{NrVInsn{requester_q.is_widening}} &
+                     requester_q.special_hazard) | requester_q.waw_hazard_counter))) |
+                     (~has_stalled_q & requester_q.special_hazard & |requester_q.hazard);
+
+    // Tracks whether the requester has already stalled once; cleared when a new SLDU
+    // instruction is registered. This is needed for vslide1x stall handling
+    assign has_stalled_d = new_sldu_insn ? 1'b0 : (stall ? 1'b1 : has_stalled_q);
 
     // Did we get a grant?
     logic [NrBanks-1:0] operand_requester_gnt;
@@ -279,6 +291,8 @@ module operand_requester import ara_pkg::*; import rvv_pkg::*; #(
       state_d     = state_q;
       requester_d = requester_q;
 
+      new_sldu_insn = 1'b0;
+
       // Make no requests to the VRF
       operand_payload[requester] = '0;
       for (int bank = 0; bank < NrBanks; bank++) operand_req[bank][requester] = 1'b0;
@@ -298,6 +312,10 @@ module operand_requester import ara_pkg::*; import rvv_pkg::*; #(
             // Acknowledge the request
             operand_request_ready_o[requester] = 1'b1;
 
+            // New slide unit instruction incoming
+            if (requester == (NrOperandQueues + VFU_SlideUnit))
+              new_sldu_insn = 1'b1;
+
             // Send a command to the operand queue
             operand_queue_cmd_o[requester] = '{
               eew : operand_request_i[requester].eew,
@@ -322,23 +340,25 @@ module operand_requester import ara_pkg::*; import rvv_pkg::*; #(
 
             // Store the request
             requester_d = '{
-              id     : operand_request_i[requester].id,
-              addr   : vaddr(operand_request_i[requester].vs, NrLanes) +
-              (operand_request_i[requester].vstart >>
-                (int'(EW64) - int'(operand_request_i[requester].eew))),
-              vs     : operand_request_i[requester].vs[idx_width(NrBanks)-1:0],
+              id             : operand_request_i[requester].id,
+              addr           : vaddr(operand_request_i[requester].vs, NrLanes) +
+                                 (operand_request_i[requester].vstart >>
+                                 (int'(EW64) - int'(operand_request_i[requester].eew))),
+              vs             : operand_request_i[requester].vs[idx_width(NrBanks)-1:0],
               // For memory operations, the number of elements initially refers to the new EEW (vsew here),
               // but the requester must refer to the old EEW (eew here)
               // This reasoning cannot be applied also to widening instructions, which modify vsew
               // treating it as the EEW of vd
-              len         : (operand_request_i[requester].scale_vl) ?
-                              ((operand_request_i[requester].vl <<
-                              operand_request_i[requester].vtype.vsew) >>
-                              operand_request_i[requester].eew) :
-                              operand_request_i[requester].vl,
-              vew         : operand_request_i[requester].eew,
-              hazard      : operand_request_i[requester].hazard,
-              is_widening : operand_request_i[requester].cvt_resize == CVT_WIDE,
+              len            : (operand_request_i[requester].scale_vl) ?
+                                 ((operand_request_i[requester].vl <<
+                                 operand_request_i[requester].vtype.vsew) >>
+                                 operand_request_i[requester].eew) :
+                                 operand_request_i[requester].vl,
+              vew            : operand_request_i[requester].eew,
+              hazard         : operand_request_i[requester].hazard,
+              is_widening    : operand_request_i[requester].cvt_resize == CVT_WIDE &&
+                                 operand_request_i[requester].special_hazard,
+              special_hazard : operand_request_i[requester].special_hazard,
               default: '0
             };
             // The length should be at least one after the rescaling
@@ -392,6 +412,10 @@ module operand_requester import ara_pkg::*; import rvv_pkg::*; #(
                 // Acknowledge the request
                 operand_request_ready_o[requester] = 1'b1;
 
+                // New slide unit instruction incoming
+                if (requester == (NrOperandQueues + VFU_SlideUnit))
+                  new_sldu_insn = 1'b1;
+
                 // Send a command to the operand queue
                 operand_queue_cmd_o[requester] = '{
                   eew      : operand_request_i[requester].eew,
@@ -412,19 +436,22 @@ module operand_requester import ara_pkg::*; import rvv_pkg::*; #(
 
                 // Store the request
                 requester_d = '{
-                  id   : operand_request_i[requester].id,
-                  addr : vaddr(operand_request_i[requester].vs, NrLanes) +
-                  (operand_request_i[requester].vstart >>
-                    (int'(EW64) - int'(operand_request_i[requester].eew))),
-                  vs     : operand_request_i[requester].vs[idx_width(NrBanks)-1:0],
-                  len    : (operand_request_i[requester].scale_vl) ?
-                             ((operand_request_i[requester].vl <<
-                             operand_request_i[requester].vtype.vsew) >>
-                             operand_request_i[requester].eew) :
-                             operand_request_i[requester].vl,
-                  vew    : operand_request_i[requester].eew,
-                  hazard : operand_request_i[requester].hazard,
-                  default: '0
+                  id             : operand_request_i[requester].id,
+                  addr           : vaddr(operand_request_i[requester].vs, NrLanes) +
+                                     (operand_request_i[requester].vstart >>
+                                     (int'(EW64) - int'(operand_request_i[requester].eew))),
+                  vs             : operand_request_i[requester].vs[idx_width(NrBanks)-1:0],
+                  len            : (operand_request_i[requester].scale_vl) ?
+                                     ((operand_request_i[requester].vl <<
+                                     operand_request_i[requester].vtype.vsew) >>
+                                     operand_request_i[requester].eew) :
+                                     operand_request_i[requester].vl,
+                  vew            : operand_request_i[requester].eew,
+                  hazard         : operand_request_i[requester].hazard,
+                  is_widening    : operand_request_i[requester].cvt_resize == CVT_WIDE &&
+                                     operand_request_i[requester].special_hazard,
+                  special_hazard : operand_request_i[requester].special_hazard,
+                  default        : '0
                 };
                 // The length should be at least one after the rescaling
                 if (requester_d.len == '0)
@@ -440,11 +467,13 @@ module operand_requester import ara_pkg::*; import rvv_pkg::*; #(
 
     always_ff @(posedge clk_i or negedge rst_ni) begin
       if (!rst_ni) begin
-        state_q     <= IDLE;
-        requester_q <= '0;
+        state_q       <= IDLE;
+        requester_q   <= '0;
+        has_stalled_q <= 1'b0;
       end else begin
-        state_q     <= state_d;
-        requester_q <= requester_d;
+        state_q       <= state_d;
+        requester_q   <= requester_d;
+        has_stalled_q <= has_stalled_d;
       end
     end
   end : gen_operand_requester

From fd801c4768e8d819fcc302438694cc9c638b71df Mon Sep 17 00:00:00 2001
From: Matteo Perotti <mperotti@iis.ee.ethz.ch>
Date: Mon, 5 Dec 2022 20:38:22 +0100
Subject: [PATCH 6/8] [CHANGELOG] Update Changelog

---
 CHANGELOG.md | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 529361b68..40ca1055c 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -62,6 +62,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
  - `VLXE` and `VSXE` need to wait that the SlideAddrGenA opreq is free before being issued by the lane sequencer to the operand requester stage
  - Do not trap instructions with no operands in the main sequencer
  - Commit a reduction only after a grant from the VRF
+ - Decouple `cmdBuffer` and `dataBuffer` depth parameters in the operand queues
 
 ### Added
 
@@ -104,6 +105,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
  - Add support for vector mask population count and find first set bit instructions: `vcpop.m`, `vfirst.m`
  - Add Spyglass linting script
  - Add parametrized support for Fixed-Point math
+ - Add support for Barber's Pole VRF Layout
 
 ### Changed
 
@@ -134,6 +136,9 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
  - Adapt `fdotproduct` to `dotproduct` structure
  - Pre-calculate next-cycle `aligned_start_address` in `addrgen` for timing reasons
  - Add `is_reduct` signal to the operand queues, to gate the neutral value filling
+ - Handle WAW and WAR `vload` hazards in the `VLDU` without stalling the main sequencer
+ - Reductions are no longer treated as widening instructions with regard to WAW hazards in the operand requesters
+ - `slide1x` instructions are no longer stalled in the main sequencer; the hazard is handled downstream
 
 ## 2.2.0 - 2021-11-02
 

From a19965474412871fda95a0643948ec76517b9655 Mon Sep 17 00:00:00 2001
From: Matteo Perotti <mperotti@iis.ee.ethz.ch>
Date: Mon, 12 Dec 2022 15:40:27 +0100
Subject: [PATCH 7/8] [hardware] :bug: `vstart` should consider Barber's Pole
 layout

---
 hardware/src/lane/operand_requester.sv | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/hardware/src/lane/operand_requester.sv b/hardware/src/lane/operand_requester.sv
index 97ce49ad1..3e85a58b3 100644
--- a/hardware/src/lane/operand_requester.sv
+++ b/hardware/src/lane/operand_requester.sv
@@ -341,9 +341,9 @@ module operand_requester import ara_pkg::*; import rvv_pkg::*; #(
             // Store the request
             requester_d = '{
               id             : operand_request_i[requester].id,
-              addr           : vaddr(operand_request_i[requester].vs, NrLanes) +
-                                 (operand_request_i[requester].vstart >>
-                                 (int'(EW64) - int'(operand_request_i[requester].eew))),
+              addr           : vaddr_offset(vaddr(operand_request_i[requester].vs, NrLanes),
+                vaddr_t'(operand_request_i[requester].vstart >>
+                (int'(EW64) - int'(operand_request_i[requester].eew))), operand_request_i[requester].vs),
               vs             : operand_request_i[requester].vs[idx_width(NrBanks)-1:0],
               // For memory operations, the number of elements initially refers to the new EEW (vsew here),
               // but the requester must refer to the old EEW (eew here)
@@ -437,9 +437,9 @@ module operand_requester import ara_pkg::*; import rvv_pkg::*; #(
                 // Store the request
                 requester_d = '{
                   id             : operand_request_i[requester].id,
-                  addr           : vaddr(operand_request_i[requester].vs, NrLanes) +
-                                     (operand_request_i[requester].vstart >>
-                                     (int'(EW64) - int'(operand_request_i[requester].eew))),
+                  addr           : vaddr_offset(vaddr(operand_request_i[requester].vs, NrLanes),
+                    vaddr_t'(operand_request_i[requester].vstart >>
+                    (int'(EW64) - int'(operand_request_i[requester].eew))), operand_request_i[requester].vs),
                   vs             : operand_request_i[requester].vs[idx_width(NrBanks)-1:0],
                   len            : (operand_request_i[requester].scale_vl) ?
                                      ((operand_request_i[requester].vl <<

From 12e5768ce302bba2c5d82d42deceb590ff3f013f Mon Sep 17 00:00:00 2001
From: Matteo Perotti <mperotti@iis.ee.ethz.ch>
Date: Thu, 23 Mar 2023 13:42:46 +0100
Subject: [PATCH 8/8] DEBUG: retrigger the CI

---
 hardware/src/ara.sv | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/hardware/src/ara.sv b/hardware/src/ara.sv
index 350806979..4bc110a5d 100644
--- a/hardware/src/ara.sv
+++ b/hardware/src/ara.sv
@@ -242,9 +242,9 @@ module ara import ara_pkg::*; #(
 
   for (genvar lane = 0; lane < NrLanes; lane++) begin: gen_lanes
     lane #(
-      .NrLanes     (NrLanes     ),
-      .FPUSupport  (FPUSupport  ),
-      .FixPtSupport(FixPtSupport)
+      .NrLanes                         (NrLanes                             ),
+      .FPUSupport                      (FPUSupport                          ),
+      .FixPtSupport                    (FixPtSupport                        )
     ) i_lane (
       .clk_i                           (clk_i                               ),
       .rst_ni                          (rst_ni                              ),