From fc2a433966a61bbbd6edde91f1d877cc9efe903e Mon Sep 17 00:00:00 2001 From: Matteo Perotti Date: Sun, 20 Nov 2022 20:17:50 +0100 Subject: [PATCH 1/8] [hardware] Handle WAW and WAR `vload` hazards in the `VLDU` Before this commit, all the hazards (RAW, WAR, WAW) are handled by the operand requesters that throttle access to source reg elements. Even if the hazard is a WAR/WAW, the suboptimal but efficient way to deal with it is to slow down the source reg fetch. If an instruction does not have source regs, this cannot happen. For example, load instructions. Therefore, all the instructions that do not have vector source operands are stalled in the sequencer. Loads are super common, and stalling in the main sequencer means that all the instructions after the load are also stalled and cannot start their execution. Therefore, loads are now issued anyway, and the hazard check is done inside the VLDU. The write-back request is masked until there are no more hazards on that load instruction. --- hardware/src/ara.sv | 13 ++++++++++++- hardware/src/ara_sequencer.sv | 28 +++++++++++++++++++++++++--- hardware/src/vlsu/vldu.sv | 16 +++++++++++++++- hardware/src/vlsu/vlsu.sv | 6 ++++++ 4 files changed, 58 insertions(+), 5 deletions(-) diff --git a/hardware/src/ara.sv b/hardware/src/ara.sv index 7668fef06..15bfb7fa5 100644 --- a/hardware/src/ara.sv +++ b/hardware/src/ara.sv @@ -137,6 +137,10 @@ module ara import ara_pkg::*; #( logic [NrVInsn-1:0][NrVInsn-1:0] global_hazard_table; // Ready for lane 0 (scalar operand fwd) logic pe_scalar_resp_ready; + // VLDU Hazard checking + vid_t vldu_commit_id; + logic vldu_commit_id_valid; + logic vldu_hazard; // Mask unit operands elen_t [NrLanes-1:0][NrMaskFUnits+2-1:0] masku_operand; @@ -178,7 +182,11 @@ module ara import ara_pkg::*; #( // Interface with the address generator .addrgen_ack_i (addrgen_ack ), .addrgen_error_i (addrgen_error ), - .addrgen_error_vl_i (addrgen_error_vl ) + .addrgen_error_vl_i (addrgen_error_vl ), + // Interface 
with the VLDU for hazard handling + .vldu_commit_id_i (vldu_commit_id ), + .vldu_commit_id_valid_i(vldu_commit_id_valid ), + .vldu_hazard_o (vldu_hazard ) ); // Scalar move support @@ -344,6 +352,9 @@ module ara import ara_pkg::*; #( .addrgen_ack_o (addrgen_ack ), .addrgen_error_o (addrgen_error ), .addrgen_error_vl_o (addrgen_error_vl ), + .commit_id_o (vldu_commit_id ), + .commit_id_valid_o (vldu_commit_id_valid ), + .hazard_i (vldu_hazard ), // Interface with the Mask unit .mask_i (mask ), .mask_valid_i (mask_valid ), diff --git a/hardware/src/ara_sequencer.sv b/hardware/src/ara_sequencer.sv index 348c01107..8355a97de 100644 --- a/hardware/src/ara_sequencer.sv +++ b/hardware/src/ara_sequencer.sv @@ -41,7 +41,11 @@ module ara_sequencer import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg::i // Interface with the Address Generation input logic addrgen_ack_i, input logic addrgen_error_i, - input vlen_t addrgen_error_vl_i + input vlen_t addrgen_error_vl_i, + // Interface with the VLDU to handle load WAW and WAR hazards + input vid_t vldu_commit_id_i, + input logic vldu_commit_id_valid_i, + output logic vldu_hazard_o ); /////////////////////////////////// @@ -261,6 +265,9 @@ module ara_sequencer import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg::i write_list_d = write_list_q; global_hazard_table_d = global_hazard_table_o; + // No hazard check requested + vldu_hazard_o = 1'b0; + // Maintain request pe_req_d = '0; pe_req_valid_d = 1'b0; @@ -370,10 +377,13 @@ module ara_sequencer import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg::i pe_req_d.hazard_vs1 | pe_req_d.hazard_vs2; // We only issue instructions that take no operands if they have no hazards. + // Exception to this rule: loads, as they are super common. WAW and WAR hazards + // on load instructions are handled in the VLDU. 
// Moreover, SLIDE instructions cannot be always chained // ToDo: optimize the case for vslide1down, vslide1up (wait 2 cycles, then chain) - if (!(|{ara_req_i.use_vs1, ara_req_i.use_vs2, ara_req_i.use_vd_op, !ara_req_i.vm}) && - |{pe_req_d.hazard_vs1, pe_req_d.hazard_vs2, pe_req_d.hazard_vm, pe_req_d.hazard_vd} || + if ((!(|{ara_req_i.use_vs1, ara_req_i.use_vs2, ara_req_i.use_vd_op, !ara_req_i.vm}) && + |{pe_req_d.hazard_vs1, pe_req_d.hazard_vs2, pe_req_d.hazard_vm, pe_req_d.hazard_vd} && + !(is_load(pe_req_d.op))) || (pe_req_d.op == VSLIDEUP && |{pe_req_d.hazard_vd, pe_req_d.hazard_vs1, pe_req_d.hazard_vs2}) || (pe_req_d.op == VSLIDEDOWN && |{pe_req_d.hazard_vs1, pe_req_d.hazard_vs2})) begin @@ -453,6 +463,18 @@ module ara_sequencer import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg::i end endcase + // Load-related hazards handling + // Loads are masters on the x-bar to write the in-lane VRF. Nevertheless, + // they can have WAR or WAW dependencies. When there is a load in the load + // unit, its hazard bit is always checked and cleared here as soon as the + // dependency does not exist anymore. Whenever the hazard bit is set, + // the load cannot issue requests. + // It's safe to pipeline vldu_hazard_o if the timing is tight. 
+ // (if so, add a sync signal) + if (vldu_commit_id_valid_i) begin + vldu_hazard_o = |global_hazard_table_o[vldu_commit_id_i]; + end + // Update the global hazard table for (int id = 0; id < NrVInsn; id++) global_hazard_table_d[id] &= vinsn_running_d; end : p_sequencer diff --git a/hardware/src/vlsu/vldu.sv b/hardware/src/vlsu/vldu.sv index 51042ed8e..61b26623a 100644 --- a/hardware/src/vlsu/vldu.sv +++ b/hardware/src/vlsu/vldu.sv @@ -33,6 +33,10 @@ module vldu import ara_pkg::*; import rvv_pkg::*; #( input logic [NrVInsn-1:0] pe_vinsn_running_i, output logic pe_req_ready_o, output pe_resp_t pe_resp_o, + // Hazard handling to main sequencer + output vid_t commit_id_o, + output logic commit_id_valid_o, + input logic hazard_i, // Interface with the address generator input addrgen_axi_req_t axi_addrgen_req_i, input logic axi_addrgen_req_valid_i, @@ -101,6 +105,9 @@ module vldu import ara_pkg::*; import rvv_pkg::*; #( logic vinsn_commit_valid; assign vinsn_commit = vinsn_queue_q.vinsn[vinsn_queue_q.commit_pnt]; assign vinsn_commit_valid = (vinsn_queue_q.commit_cnt != '0); + // To the main sequencer, for hazard checking + assign commit_id_valid_o = vinsn_commit_valid; + assign commit_id_o = vinsn_commit.id; always_ff @(posedge clk_i or negedge rst_ni) begin if (!rst_ni) begin @@ -354,7 +361,10 @@ module vldu import ara_pkg::*; import rvv_pkg::*; #( ////////////////////////////////// for (int lane = 0; lane < NrLanes; lane++) begin: result_write - ldu_result_req_o[lane] = result_queue_valid_q[result_queue_read_pnt_q][lane]; + // Create a request only if there are no more hazards on vd (check vs1 since the info about + // hazard vd is also there) + ldu_result_req_o[lane] = result_queue_valid_q[result_queue_read_pnt_q][lane] && + !vinsn_commit.hazard_vs1; ldu_result_addr_o[lane] = result_queue_q[result_queue_read_pnt_q][lane].addr; ldu_result_id_o[lane] = result_queue_q[result_queue_read_pnt_q][lane].id; ldu_result_wdata_o[lane] = 
result_queue_q[result_queue_read_pnt_q][lane].wdata; @@ -415,6 +425,10 @@ module vldu import ara_pkg::*; import rvv_pkg::*; #( vinsn_queue_d.commit_pnt].vtype.vsew); end + // Update the Vd hazard bit for the current instruction + // hazard_vs1, hazard_vs2, hazard_vm all contain the info about hazard_vd, so work on one of them (vs1) + if (commit_id_valid_o) vinsn_queue_d.vinsn[vinsn_queue_q.commit_pnt].hazard_vs1 &= {NrVInsn{hazard_i}}; + ////////////////////////////// // Accept new instruction // ////////////////////////////// diff --git a/hardware/src/vlsu/vlsu.sv b/hardware/src/vlsu/vlsu.sv index aa2e05283..448b53a87 100644 --- a/hardware/src/vlsu/vlsu.sv +++ b/hardware/src/vlsu/vlsu.sv @@ -44,6 +44,9 @@ module vlsu import ara_pkg::*; import rvv_pkg::*; #( output logic addrgen_ack_o, output logic addrgen_error_o, output vlen_t addrgen_error_vl_o, + output vid_t commit_id_o, + output logic commit_id_valid_o, + input logic hazard_i, // Interface with the lanes // Store unit operands input elen_t [NrLanes-1:0] stu_operand_i, @@ -172,6 +175,9 @@ module vlsu import ara_pkg::*; import rvv_pkg::*; #( .pe_vinsn_running_i (pe_vinsn_running_i ), .pe_req_ready_o (pe_req_ready_o[OffsetLoad]), .pe_resp_o (pe_resp_o[OffsetLoad] ), + .commit_id_o (commit_id_o ), + .commit_id_valid_o (commit_id_valid_o ), + .hazard_i (hazard_i ), // Interface with the address generator .axi_addrgen_req_i (axi_addrgen_req ), .axi_addrgen_req_valid_i(axi_addrgen_req_valid ), From e9a9da3c5c22bfd010ebb7aa6592974a8e3c7c10 Mon Sep 17 00:00:00 2001 From: Matteo Perotti Date: Sun, 20 Nov 2022 20:24:59 +0100 Subject: [PATCH 2/8] [hardware] :bug: Decouple cmdBuffer and dataBuffer depths in opQueues --- hardware/src/lane/operand_queue.sv | 13 +-- hardware/src/lane/operand_queues_stage.sv | 103 ++++++++++++---------- 2 files changed, 63 insertions(+), 53 deletions(-) diff --git a/hardware/src/lane/operand_queue.sv b/hardware/src/lane/operand_queue.sv index fe40a291b..72c8202e1 100644 --- 
a/hardware/src/lane/operand_queue.sv +++ b/hardware/src/lane/operand_queue.sv @@ -9,7 +9,8 @@ // need it. module operand_queue import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg::idx_width; #( - parameter int unsigned BufferDepth = 2, + parameter int unsigned CmdBufDepth = 2, + parameter int unsigned DataBufDepth = 2, parameter int unsigned NrSlaves = 1, parameter int unsigned NrLanes = 0, // Support for floating-point data types @@ -52,7 +53,7 @@ module operand_queue import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg::i logic cmd_pop; fifo_v3 #( - .DEPTH(BufferDepth ), + .DEPTH(CmdBufDepth ), .dtype(operand_queue_cmd_t) ) i_cmd_buffer ( .clk_i (clk_i ), @@ -79,8 +80,8 @@ module operand_queue import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg::i logic ibuf_pop; fifo_v3 #( - .DEPTH (BufferDepth), - .DATA_WIDTH(DataWidth ) + .DEPTH (DataBufDepth), + .DATA_WIDTH(DataWidth ) ) i_input_buffer ( .clk_i (clk_i ), .rst_ni (rst_ni ), @@ -98,7 +99,7 @@ module operand_queue import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg::i // We used a credit based system, to ensure that the FIFO is always // able to accept a request. - logic [idx_width(BufferDepth):0] ibuf_usage_d, ibuf_usage_q; + logic [idx_width(DataBufDepth):0] ibuf_usage_d, ibuf_usage_q; always_comb begin: p_ibuf_usage // Maintain state @@ -110,7 +111,7 @@ module operand_queue import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg::i if (ibuf_pop) ibuf_usage_d -= 1; // Are we ready? 
- operand_queue_ready_o = (ibuf_usage_q != BufferDepth); + operand_queue_ready_o = (ibuf_usage_q != DataBufDepth); end always_ff @(posedge clk_i or negedge rst_ni) begin: p_ibuf_usage_ff diff --git a/hardware/src/lane/operand_queues_stage.sv b/hardware/src/lane/operand_queues_stage.sv index dab636d07..5ed714522 100644 --- a/hardware/src/lane/operand_queues_stage.sv +++ b/hardware/src/lane/operand_queues_stage.sv @@ -52,14 +52,15 @@ module operand_queues_stage import ara_pkg::*; import rvv_pkg::*; import cf_math /////////// operand_queue #( - .BufferDepth (5 ), - .FPUSupport (FPUSupport), - .NrLanes (NrLanes ), - .SupportIntExt2(1'b1 ), - .SupportIntExt4(1'b1 ), - .SupportIntExt8(1'b1 ), - .SupportReduct (1'b1 ), - .SupportNtrVal (1'b0 ) + .CmdBufDepth (ValuInsnQueueDepth), + .DataBufDepth (5 ), + .FPUSupport (FPUSupport ), + .NrLanes (NrLanes ), + .SupportIntExt2(1'b1 ), + .SupportIntExt4(1'b1 ), + .SupportIntExt8(1'b1 ), + .SupportReduct (1'b1 ), + .SupportNtrVal (1'b0 ) ) i_operand_queue_alu_a ( .clk_i (clk_i ), .rst_ni (rst_ni ), @@ -77,14 +78,15 @@ module operand_queues_stage import ara_pkg::*; import rvv_pkg::*; import cf_math ); operand_queue #( - .BufferDepth (5 ), - .FPUSupport (FPUSupport), - .NrLanes (NrLanes ), - .SupportIntExt2(1'b1 ), - .SupportIntExt4(1'b1 ), - .SupportIntExt8(1'b1 ), - .SupportReduct (1'b1 ), - .SupportNtrVal (1'b1 ) + .CmdBufDepth (ValuInsnQueueDepth), + .DataBufDepth (5 ), + .FPUSupport (FPUSupport ), + .NrLanes (NrLanes ), + .SupportIntExt2(1'b1 ), + .SupportIntExt4(1'b1 ), + .SupportIntExt8(1'b1 ), + .SupportReduct (1'b1 ), + .SupportNtrVal (1'b1 ) ) i_operand_queue_alu_b ( .clk_i (clk_i ), .rst_ni (rst_ni ), @@ -106,12 +108,13 @@ module operand_queues_stage import ara_pkg::*; import rvv_pkg::*; import cf_math ////////////////////// operand_queue #( - .BufferDepth (5 ), - .FPUSupport (FPUSupport), - .NrLanes (NrLanes ), - .SupportIntExt2(1'b1 ), - .SupportReduct (1'b1 ), - .SupportNtrVal (1'b0 ) + .CmdBufDepth (MfpuInsnQueueDepth 
), + .DataBufDepth (5 ), + .FPUSupport (FPUSupport ), + .NrLanes (NrLanes ), + .SupportIntExt2(1'b1 ), + .SupportReduct (1'b1 ), + .SupportNtrVal (1'b0 ) ) i_operand_queue_mfpu_a ( .clk_i (clk_i ), .rst_ni (rst_ni ), @@ -129,12 +132,13 @@ module operand_queues_stage import ara_pkg::*; import rvv_pkg::*; import cf_math ); operand_queue #( - .BufferDepth (5 ), - .FPUSupport (FPUSupport), - .NrLanes (NrLanes ), - .SupportIntExt2(1'b1 ), - .SupportReduct (1'b1 ), - .SupportNtrVal (1'b1 ) + .CmdBufDepth (MfpuInsnQueueDepth ), + .DataBufDepth (5 ), + .FPUSupport (FPUSupport ), + .NrLanes (NrLanes ), + .SupportIntExt2(1'b1 ), + .SupportReduct (1'b1 ), + .SupportNtrVal (1'b1 ) ) i_operand_queue_mfpu_b ( .clk_i (clk_i ), .rst_ni (rst_ni ), @@ -152,12 +156,13 @@ module operand_queues_stage import ara_pkg::*; import rvv_pkg::*; import cf_math ); operand_queue #( - .BufferDepth (5 ), - .FPUSupport (FPUSupport), - .NrLanes (NrLanes ), - .SupportIntExt2(1'b1 ), - .SupportReduct (1'b1 ), - .SupportNtrVal (1'b1 ) + .CmdBufDepth (MfpuInsnQueueDepth ), + .DataBufDepth (5 ), + .FPUSupport (FPUSupport ), + .NrLanes (NrLanes ), + .SupportIntExt2(1'b1 ), + .SupportReduct (1'b1 ), + .SupportNtrVal (1'b1 ) ) i_operand_queue_mfpu_c ( .clk_i (clk_i ), .rst_ni (rst_ni ), @@ -179,9 +184,10 @@ module operand_queues_stage import ara_pkg::*; import rvv_pkg::*; import cf_math /////////////////////// operand_queue #( - .BufferDepth(2 ), - .FPUSupport (FPUSupport), - .NrLanes (NrLanes ) + .CmdBufDepth (VstuInsnQueueDepth + MaskuInsnQueueDepth), + .DataBufDepth (2 ), + .FPUSupport (FPUSupport ), + .NrLanes (NrLanes ) ) i_operand_queue_st_mask_a ( .clk_i (clk_i ), .rst_ni (rst_ni ), @@ -203,9 +209,10 @@ module operand_queues_stage import ara_pkg::*; import rvv_pkg::*; import cf_math ****************/ operand_queue #( - .BufferDepth(2 ), - .FPUSupport (FPUSupport), - .NrLanes (NrLanes ) + .CmdBufDepth (VlduInsnQueueDepth), + .DataBufDepth (2 ), + .FPUSupport (FPUSupport ), + .NrLanes (NrLanes ) ) 
i_operand_queue_slide_addrgen_a ( .clk_i (clk_i ), .rst_ni (rst_ni ), @@ -227,11 +234,12 @@ module operand_queues_stage import ara_pkg::*; import rvv_pkg::*; import cf_math ///////////////// operand_queue #( - .BufferDepth(1 ), - .FPUSupport (FPUSupport), - .SupportIntExt2(1'b1), - .SupportIntExt4(1'b1), - .SupportIntExt8(1'b1), + .CmdBufDepth (MaskuInsnQueueDepth), + .DataBufDepth (1 ), + .FPUSupport (FPUSupport ), + .SupportIntExt2(1'b1 ), + .SupportIntExt4(1'b1 ), + .SupportIntExt8(1'b1 ), .NrLanes (NrLanes ) ) i_operand_queue_mask_b ( .clk_i (clk_i ), @@ -250,8 +258,9 @@ module operand_queues_stage import ara_pkg::*; import rvv_pkg::*; import cf_math ); operand_queue #( - .BufferDepth(1 ), - .NrLanes (NrLanes ) + .CmdBufDepth (MaskuInsnQueueDepth), + .DataBufDepth (1 ), + .NrLanes (NrLanes ) ) i_operand_queue_mask_m ( .clk_i (clk_i ), .rst_ni (rst_ni ), From 630ef8e812a4fc7be045b3d57ad9589e8a5ae96d Mon Sep 17 00:00:00 2001 From: Matteo Perotti Date: Sun, 20 Nov 2022 20:30:11 +0100 Subject: [PATCH 3/8] [hardware] Parametrize addrgen queue depth --- hardware/include/ara_pkg.sv | 1 + hardware/src/vlsu/addrgen.sv | 4 ++-- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/hardware/include/ara_pkg.sv b/hardware/include/ara_pkg.sv index 593967a7e..7b18e8597 100644 --- a/hardware/include/ara_pkg.sv +++ b/hardware/include/ara_pkg.sv @@ -86,6 +86,7 @@ package ara_pkg; localparam int unsigned ValuInsnQueueDepth = 4; localparam int unsigned VlduInsnQueueDepth = 4; localparam int unsigned VstuInsnQueueDepth = 4; + localparam int unsigned VaddrgenInsnQueueDepth = 4; localparam int unsigned SlduInsnQueueDepth = 2; localparam int unsigned NoneInsnQueueDepth = 1; // Ara supports MaskuInsnQueueDepth = 1 only. 
diff --git a/hardware/src/vlsu/addrgen.sv b/hardware/src/vlsu/addrgen.sv index def21df8e..2fbe05e55 100644 --- a/hardware/src/vlsu/addrgen.sv +++ b/hardware/src/vlsu/addrgen.sv @@ -89,8 +89,8 @@ module addrgen import ara_pkg::*; import rvv_pkg::*; #( logic axi_addrgen_queue_empty; fifo_v3 #( - .DEPTH(4 ), - .dtype(addrgen_axi_req_t) + .DEPTH(VaddrgenInsnQueueDepth), + .dtype(addrgen_axi_req_t ) ) i_addrgen_req_queue ( .clk_i (clk_i ), .rst_ni (rst_ni ), From 4522f6cc168d05d44639b7e7007066fa39ae80ee Mon Sep 17 00:00:00 2001 From: Matteo Perotti Date: Mon, 21 Nov 2022 00:19:56 +0100 Subject: [PATCH 4/8] [hardware] Add support for Barber's Pole VRF layout With Barber Pole layout, the PEs can almost always increment the address by 1 when writing back new data into the VRF. Only the Slide Unit has some special treatment, as its start address comes with an offset. Remember that the VRF layout should also be consistent among different LMUL settings, i.e. when LMUL > 1 and we pass from reg N to reg N+1, we must also take into account that reg N+1 has a different starting position for element 0. 
--- hardware/include/ara_pkg.sv | 5 -- hardware/include/ara_vaddr.svh | 80 ++++++++++++++++++++++++++ hardware/src/ara.sv | 6 +- hardware/src/lane/lane.sv | 3 +- hardware/src/lane/operand_requester.sv | 22 +++++-- hardware/src/lane/valu.sv | 35 ++++++++--- hardware/src/lane/vector_fus_stage.sv | 12 ++-- hardware/src/lane/vmfpu.sv | 47 +++++++++++---- hardware/src/masku/masku.sv | 39 ++++++++++--- hardware/src/sldu/sldu.sv | 41 ++++++++++--- hardware/src/vlsu/vldu.sv | 45 +++++++++++---- hardware/src/vlsu/vlsu.sv | 3 +- 12 files changed, 265 insertions(+), 73 deletions(-) create mode 100644 hardware/include/ara_vaddr.svh diff --git a/hardware/include/ara_pkg.sv b/hardware/include/ara_pkg.sv index 7b18e8597..6fe041e70 100644 --- a/hardware/include/ara_pkg.sv +++ b/hardware/include/ara_pkg.sv @@ -878,11 +878,6 @@ package ara_pkg; // Each lane has eight VRF banks localparam int unsigned NrVRFBanksPerLane = 8; - // Find the starting address of a vector register vid - function automatic logic [63:0] vaddr(logic [4:0] vid, int NrLanes); - vaddr = vid * (VLENB / NrLanes / 8); - endfunction: vaddr - // Differenciate between SLDU and ADDRGEN operands from opqueue typedef enum logic { ALU_SLDU = 1'b0, diff --git a/hardware/include/ara_vaddr.svh b/hardware/include/ara_vaddr.svh new file mode 100644 index 000000000..3cd9f9ce8 --- /dev/null +++ b/hardware/include/ara_vaddr.svh @@ -0,0 +1,80 @@ +// Copyright 2021 ETH Zurich and University of Bologna. +// Solderpad Hardware License, Version 0.51, see LICENSE for details. +// SPDX-License-Identifier: SHL-0.51 +// +// Author: Matteo Perotti +// Description: +// Ara's functions to calculate VRF addresses. Not in the package +// since the functions depend on `NrLanes` + +// All the functions to support a Barber-Pole VRF layout + +// Find the starting VRF address of a vector register vid +function automatic vaddr_t vaddr(logic [4:0] vid, int NrLanes); + // This is not an adder, it's only wires. 
+ // (this holds if VLENB / NrLanes >= NrVRFBanksPerLane^2) + vaddr = vid * (VLENB / NrLanes / NrVRFBanksPerLane) + vid[VaddrBankWidth-1:0]; +endfunction: vaddr + +// Return the physical address of the next element of a certain vector +function automatic vaddr_t next_vaddr(vaddr_t vaddr, logic [4:0] vid); + // vaddr msbs -> byte index in a bank + logic [VaddrIdxWidth-1:VaddrBankWidth] index, old_index; + // vaddr lsbs -> bank index + logic [VaddrBankWidth-1:0] bank; + + index = vaddr[VaddrIdxWidth-1:VaddrBankWidth]; + bank = vaddr[VaddrBankWidth-1:0]; + + old_index = index; + + // Increment bank counter + bank += 1; + if (bank == vid[VaddrBankWidth-1:0]) + // Wrap around + index += 1; + + // If we change vreg, the start element position is +1 (LMUL > 1) + // This is important for B layout consistency among different LMUL + // or when inactive element policy is "undisturbed" + if (index[VaddrVregWidth] != old_index[VaddrVregWidth]) + bank += 1; + + return {index, bank}; +endfunction + +// Initialize with an offset (necessary with vslideup) +function automatic vaddr_t vaddr_offset(vaddr_t vaddr, vaddr_t off, logic [4:0] vid); + // vaddr msbs -> byte index in a bank + logic [VaddrIdxWidth-1:VaddrBankWidth] index, old_index; + // vaddr lsbs -> bank index + logic [VaddrBankWidth-1:0] bank, old_bank; + + index = vaddr[VaddrIdxWidth-1:VaddrBankWidth]; + bank = vaddr[VaddrBankWidth-1:0]; + + old_index = index; + old_bank = bank; + + // Increment bank counter + index += off[VaddrIdxWidth-1:VaddrBankWidth]; + bank += off[VaddrBankWidth-1:0]; + // Support vstart != 0: don't hypothesize that old_bank == vid[VaddrBankWidth-1:0] + // Wrap around if we meet vid[VaddrBankWidth-1:0] during the addition + if (old_bank > vid[VaddrBankWidth-1:0]) begin + if (bank >= vid[VaddrBankWidth-1:0] && bank < old_bank) + // Wrap around + index += 1; + end else if (old_bank < vid[VaddrBankWidth-1:0]) begin + if (bank >= vid[VaddrBankWidth-1:0] || bank < old_bank) + // Wrap around + index += 1; + 
end + + // If we change vreg, the start element position is +1 + // for every reg passed (LMUL > 1). The max reg id delta is 7 + // with LMUL == 8. + bank += index[VaddrVregWidth +: 3] - old_index[VaddrVregWidth +: 3]; + + return {index, bank}; +endfunction diff --git a/hardware/src/ara.sv b/hardware/src/ara.sv index 15bfb7fa5..350806979 100644 --- a/hardware/src/ara.sv +++ b/hardware/src/ara.sv @@ -388,8 +388,7 @@ module ara import ara_pkg::*; #( logic sldu_mask_ready; sldu #( - .NrLanes(NrLanes), - .vaddr_t(vaddr_t) + .NrLanes(NrLanes) ) i_sldu ( .clk_i (clk_i ), .rst_ni (rst_ni ), @@ -424,8 +423,7 @@ module ara import ara_pkg::*; #( ///////////////// masku #( - .NrLanes(NrLanes), - .vaddr_t(vaddr_t) + .NrLanes(NrLanes) ) i_masku ( .clk_i (clk_i ), .rst_ni (rst_ni ), diff --git a/hardware/src/lane/lane.sv b/hardware/src/lane/lane.sv index d12c71345..a786cabfe 100644 --- a/hardware/src/lane/lane.sv +++ b/hardware/src/lane/lane.sv @@ -191,8 +191,7 @@ module lane import ara_pkg::*; import rvv_pkg::*; #( operand_requester #( .NrBanks(NrVRFBanksPerLane), - .NrLanes(NrLanes ), - .vaddr_t(vaddr_t ) + .NrLanes(NrLanes ) ) i_operand_requester ( .clk_i (clk_i ), .rst_ni (rst_ni ), diff --git a/hardware/src/lane/operand_requester.sv b/hardware/src/lane/operand_requester.sv index 54590fbc3..ba9895268 100644 --- a/hardware/src/lane/operand_requester.sv +++ b/hardware/src/lane/operand_requester.sv @@ -9,11 +9,17 @@ // queues. This stage also includes the VRF arbiter. 
module operand_requester import ara_pkg::*; import rvv_pkg::*; #( - parameter int unsigned NrLanes = 0, - parameter int unsigned NrBanks = 0, // Number of banks in the vector register file - parameter type vaddr_t = logic, // Type used to address vector register file elements + parameter int unsigned NrLanes = 0, + parameter int unsigned NrBanks = 0, // Number of banks in the vector register file + // Type used to address vector register file elements + localparam int unsigned MaxVLenBPerLane = VLENB / NrLanes, // In bytes + localparam int unsigned VRFBSizePerLane = MaxVLenBPerLane * 32, // In bytes + localparam int unsigned VaddrIdxWidth = $clog2(VRFBSizePerLane), + localparam int unsigned VaddrBankWidth = $clog2(NrVRFBanksPerLane), + localparam int unsigned VaddrVregWidth = $clog2(MaxVLenBPerLane), + localparam type vaddr_t = logic [VaddrIdxWidth-1:0], // Dependant parameters. DO NOT CHANGE! - localparam type strb_t = logic[$bits(elen_t)/8-1:0] + localparam type strb_t = logic[$bits(elen_t)/8-1:0] ) ( input logic clk_i, input logic rst_ni, @@ -76,6 +82,8 @@ module operand_requester import ara_pkg::*; import rvv_pkg::*; #( output logic ldu_result_final_gnt_o ); + `include "../include/ara_vaddr.svh" + import cf_math_pkg::idx_width; //////////////////////// @@ -233,6 +241,8 @@ module operand_requester import ara_pkg::*; import rvv_pkg::*; #( vid_t id; // Address of the next element to be read vaddr_t addr; + // Source reg LSbs (useful for barber's pole) + logic [idx_width(NrBanks)-1:0] vs; // How many elements remain to be read vlen_t len; // Element width @@ -316,6 +326,7 @@ module operand_requester import ara_pkg::*; import rvv_pkg::*; #( addr : vaddr(operand_request_i[requester].vs, NrLanes) + (operand_request_i[requester].vstart >> (int'(EW64) - int'(operand_request_i[requester].eew))), + vs : operand_request_i[requester].vs[idx_width(NrBanks)-1:0], // For memory operations, the number of elements initially refers to the new EEW (vsew here), // but the requester 
must refer to the old EEW (eew here) // This reasoning cannot be applied also to widening instructions, which modify vsew @@ -363,7 +374,7 @@ module operand_requester import ara_pkg::*; import rvv_pkg::*; #( // Received a grant. if (|operand_requester_gnt) begin // Bump the address pointer - requester_d.addr = requester_q.addr + 1'b1; + requester_d.addr = next_vaddr(requester_q.addr, requester_q.vs); // We read less than 64 bits worth of elements if (requester_q.len < (1 << (int'(EW64) - int'(requester_q.vew)))) @@ -405,6 +416,7 @@ module operand_requester import ara_pkg::*; import rvv_pkg::*; #( addr : vaddr(operand_request_i[requester].vs, NrLanes) + (operand_request_i[requester].vstart >> (int'(EW64) - int'(operand_request_i[requester].eew))), + vs : operand_request_i[requester].vs[idx_width(NrBanks)-1:0], len : (operand_request_i[requester].scale_vl) ? ((operand_request_i[requester].vl << operand_request_i[requester].vtype.vsew) >> diff --git a/hardware/src/lane/valu.sv b/hardware/src/lane/valu.sv index 386caca74..7cc93f3d8 100644 --- a/hardware/src/lane/valu.sv +++ b/hardware/src/lane/valu.sv @@ -8,15 +8,20 @@ // in a SIMD fashion, always operating on 64 bits. 
module valu import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg::idx_width; #( - parameter int unsigned NrLanes = 0, + parameter int unsigned NrLanes = 0, // Support for fixed-point data types - parameter logic FixPtSupport = FixedPointEnable, + parameter logic FixPtSupport = FixedPointEnable, // Type used to address vector register file elements - parameter type vaddr_t = logic, + localparam int unsigned MaxVLenBPerLane = VLENB / NrLanes, // In bytes + localparam int unsigned VRFBSizePerLane = MaxVLenBPerLane * 32, // In bytes + localparam int unsigned VaddrIdxWidth = $clog2(VRFBSizePerLane), + localparam int unsigned VaddrBankWidth = $clog2(NrVRFBanksPerLane), + localparam int unsigned VaddrVregWidth = $clog2(MaxVLenBPerLane), + localparam type vaddr_t = logic [VaddrIdxWidth-1:0], // Dependant parameters. DO NOT CHANGE! - localparam int unsigned DataWidth = $bits(elen_t), - localparam int unsigned StrbWidth = DataWidth/8, - localparam type strb_t = logic [StrbWidth-1:0] + localparam int unsigned DataWidth = $bits(elen_t), + localparam int unsigned StrbWidth = DataWidth/8, + localparam type strb_t = logic [StrbWidth-1:0] ) ( input logic clk_i, input logic rst_ni, @@ -55,6 +60,9 @@ module valu import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg::idx_width; output logic mask_ready_o ); + // Include address-handling functions + `include "../../include/ara_vaddr.svh" + import cf_math_pkg::idx_width; ///////////// @@ -137,6 +145,8 @@ module valu import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg::idx_width; logic mask; } payload_t; + vaddr_t addr_d, addr_q; + // Result queue payload_t [ResultQueueDepth-1:0] result_queue_d, result_queue_q; logic [ResultQueueDepth-1:0] result_queue_valid_d, result_queue_valid_q; @@ -424,6 +434,7 @@ module valu import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg::idx_width; reduction_rx_cnt_d = reduction_rx_cnt_q; sldu_transactions_cnt_d = sldu_transactions_cnt_q; red_hs_synch_d = red_hs_synch_q; + addr_d = addr_q; 
alu_red_valid_o = 1'b0; sldu_alu_ready_d = 1'b0; simd_red_cnt_max_d = simd_red_cnt_max_q; @@ -474,8 +485,9 @@ module valu import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg::idx_width; mask_ready_o = !vinsn_issue_q.vm; // Store the result in the result queue + addr_d = next_vaddr(addr_q, vinsn_issue_q.vd); result_queue_d[result_queue_write_pnt_q].wdata = result_queue_q[result_queue_write_pnt_q].wdata | valu_result; - result_queue_d[result_queue_write_pnt_q].addr = vaddr(vinsn_issue_q.vd, NrLanes) + ((vinsn_issue_q.vl - issue_cnt_q) >> (int'(EW64) - vinsn_issue_q.vtype.vsew)); + result_queue_d[result_queue_write_pnt_q].addr = addr_q; result_queue_d[result_queue_write_pnt_q].id = vinsn_issue_q.id; result_queue_d[result_queue_write_pnt_q].mask = vinsn_issue_q.vfu == VFU_MaskUnit; if (!narrowing(vinsn_issue_q.op) || !narrowing_select_q) @@ -531,6 +543,11 @@ module valu import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg::idx_width; else vinsn_queue_d.issue_pnt = vinsn_queue_q.issue_pnt + 1; + // Change starting address when we issue a new instruction + // Since this unit is not pipelined and elements written in the + // result queue belong to vinsn_issue_q + addr_d = vaddr(vinsn_queue_q.vinsn[vinsn_queue_d.issue_pnt].vd, NrLanes); + // Assign vector length for next instruction in the instruction queue if (vinsn_queue_d.issue_cnt != 0) begin if (!(vinsn_queue_q.vinsn[vinsn_queue_d.issue_pnt].op inside {[VMANDNOT:VMXNOR]})) @@ -830,6 +847,8 @@ module valu import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg::idx_width; red_hs_synch_d = 1'b1; // Allow the first valid issue_cnt_d = vfu_operation_i.vl; + // Initialize the starting address for the next instruction + addr_d = vaddr(vfu_operation_i.vd, NrLanes); if (!(vfu_operation_i.op inside {[VMANDNOT:VMXNOR]})) issue_cnt_d = vfu_operation_i.vl; else begin @@ -877,6 +896,7 @@ module valu import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg::idx_width; simd_red_cnt_max_q <= '0; alu_red_ready_q <= 1'b0; 
alu_vxsat_q <= '0; + addr_q <= '0; end else begin issue_cnt_q <= issue_cnt_d; commit_cnt_q <= commit_cnt_d; @@ -890,6 +910,7 @@ module valu import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg::idx_width; simd_red_cnt_max_q <= simd_red_cnt_max_d; alu_red_ready_q <= alu_red_ready_i; alu_vxsat_q <= alu_vxsat_d; + addr_q <= addr_d; end end diff --git a/hardware/src/lane/vector_fus_stage.sv b/hardware/src/lane/vector_fus_stage.sv index 6eb28e7c2..42b6a347e 100644 --- a/hardware/src/lane/vector_fus_stage.sv +++ b/hardware/src/lane/vector_fus_stage.sv @@ -96,9 +96,8 @@ module vector_fus_stage import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg ////////////////// valu #( - .NrLanes(NrLanes), - .FixPtSupport(FixPtSupport), - .vaddr_t(vaddr_t) + .NrLanes (NrLanes ), + .FixPtSupport(FixPtSupport) ) i_valu ( .clk_i (clk_i ), .rst_ni (rst_ni ), @@ -143,10 +142,9 @@ module vector_fus_stage import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg /////////////////// vmfpu #( - .NrLanes (NrLanes ), - .FPUSupport(FPUSupport), - .FixPtSupport(FixPtSupport), - .vaddr_t (vaddr_t ) + .NrLanes (NrLanes ), + .FPUSupport (FPUSupport ), + .FixPtSupport(FixPtSupport) ) i_vmfpu ( .clk_i (clk_i ), .rst_ni (rst_ni ), diff --git a/hardware/src/lane/vmfpu.sv b/hardware/src/lane/vmfpu.sv index c4ffc6d72..81c729864 100644 --- a/hardware/src/lane/vmfpu.sv +++ b/hardware/src/lane/vmfpu.sv @@ -9,17 +9,22 @@ module vmfpu import ara_pkg::*; import rvv_pkg::*; import fpnew_pkg::*; import cf_math_pkg::idx_width; #( - parameter int unsigned NrLanes = 0, + parameter int unsigned NrLanes = 0, // Support for floating-point data types - parameter fpu_support_e FPUSupport = FPUSupportHalfSingleDouble, + parameter fpu_support_e FPUSupport = FPUSupportHalfSingleDouble, // Support for fixed-point data types - parameter logic FixPtSupport = FixedPointEnable, + parameter logic FixPtSupport = FixedPointEnable, // Type used to address vector register file elements - parameter type vaddr_t = logic, + 
localparam int unsigned MaxVLenBPerLane = VLENB / NrLanes, // In bytes + localparam int unsigned VRFBSizePerLane = MaxVLenBPerLane * 32, // In bytes + localparam int unsigned VaddrIdxWidth = $clog2(VRFBSizePerLane), + localparam int unsigned VaddrBankWidth = $clog2(NrVRFBanksPerLane), + localparam int unsigned VaddrVregWidth = $clog2(MaxVLenBPerLane), + localparam type vaddr_t = logic [VaddrIdxWidth-1:0], // Dependant parameters. DO NOT CHANGE! - localparam int unsigned DataWidth = $bits(elen_t), - localparam int unsigned StrbWidth = DataWidth/8, - localparam type strb_t = logic [DataWidth/8-1:0] + localparam int unsigned DataWidth = $bits(elen_t), + localparam int unsigned StrbWidth = DataWidth/8, + localparam type strb_t = logic [DataWidth/8-1:0] ) ( input logic clk_i, input logic rst_ni, @@ -61,6 +66,9 @@ module vmfpu import ara_pkg::*; import rvv_pkg::*; import fpnew_pkg::*; output logic mask_ready_o ); + // Include address-handling functions + `include "../../include/ara_vaddr.svh" + //////////////////////////////// // Vector instruction queue // //////////////////////////////// @@ -180,6 +188,8 @@ module vmfpu import ara_pkg::*; import rvv_pkg::*; import fpnew_pkg::*; // Helper signals // ////////////////////// + vaddr_t addr_d, addr_q; + logic vinsn_issue_mul, vinsn_issue_div, vinsn_issue_fpu; assign vinsn_issue_mul = vinsn_issue_q.op inside {[VMUL:VSMUL]}; @@ -1044,6 +1054,7 @@ module vmfpu import ara_pkg::*; import rvv_pkg::*; import fpnew_pkg::*; reduction_rx_cnt_d = reduction_rx_cnt_q; sldu_transactions_cnt_d = sldu_transactions_cnt_q; red_hs_synch_d = red_hs_synch_q; + addr_d = addr_q; mfpu_red_valid_o = 1'b0; sldu_mfpu_ready_d = 1'b0; simd_red_cnt_max_d = simd_red_cnt_max_q; @@ -1218,9 +1229,9 @@ module vmfpu import ara_pkg::*; import rvv_pkg::*; import fpnew_pkg::*; to_process_cnt_d = (narrowing(vinsn_processing_q.cvt_resize)) ? 
(to_process_cnt_q - processed_element_cnt_narrow) : (to_process_cnt_q - processed_element_cnt); // Store the result in the result queue + addr_d = next_vaddr(addr_q, vinsn_processing_q.vd); result_queue_d[result_queue_write_pnt_q].id = vinsn_processing_q.id; - result_queue_d[result_queue_write_pnt_q].addr = vaddr(vinsn_processing_q.vd, NrLanes) + - ((vinsn_processing_q.vl - to_process_cnt_q) >> (int'(EW64) - vinsn_processing_q.vtype.vsew)); + result_queue_d[result_queue_write_pnt_q].addr = addr_q; // FP narrowing instructions pack the result in two different cycles, and only some 16-bit slices are active if (narrowing(vinsn_processing_q.cvt_resize)) begin for (int b = 0; b < 4; b++) begin @@ -1275,6 +1286,10 @@ module vmfpu import ara_pkg::*; import rvv_pkg::*; import fpnew_pkg::*; if (vinsn_queue_d.processing_cnt != 0) to_process_cnt_d = vinsn_queue_q.vinsn[vinsn_queue_d.processing_pnt].vl; + + // Update the address for the results of the next cycles since they belong + // to the next instruction + addr_d = vaddr(vinsn_queue_q.vinsn[vinsn_queue_d.processing_pnt].vd, NrLanes); end end end @@ -1695,6 +1710,9 @@ module vmfpu import ara_pkg::*; import rvv_pkg::*; import fpnew_pkg::*; if (vinsn_queue_d.processing_cnt != 0) to_process_cnt_d = vinsn_queue_q.vinsn[vinsn_queue_d.processing_pnt].vl; + // Update the starting address for the next instruction + addr_d = vaddr(vinsn_queue_q.vinsn[vinsn_queue_d.processing_pnt].vd, NrLanes); + // Bump issue counter and pointers vinsn_queue_d.issue_cnt -= 1; if (vinsn_queue_q.issue_pnt == VInsnQueueDepth-1) vinsn_queue_d.issue_pnt = '0; @@ -1833,9 +1851,12 @@ module vmfpu import ara_pkg::*; import rvv_pkg::*; import fpnew_pkg::*; osum_issue_cnt_d = '0; issue_cnt_d = vfu_operation_i.vl; end - if (vinsn_queue_d.processing_cnt == '0) to_process_cnt_d = vfu_operation_i.vl; - if (vinsn_queue_d.commit_cnt == '0) commit_cnt_d = - is_reduction(vfu_operation_i.op) ? 
1 : vfu_operation_i.vl; + if (vinsn_queue_d.processing_cnt == '0) begin + to_process_cnt_d = vfu_operation_i.vl; + // A new instruction to process; update the starting address + addr_d = vaddr(vfu_operation_i.vd, NrLanes); + end + if (vinsn_queue_d.commit_cnt == '0) commit_cnt_d = is_reduction(vfu_operation_i.op) ? 1 : vfu_operation_i.vl; // Floating-Point re-encoding for widening operations // Enabled only for the supported formats if (FPUSupport != FPUSupportNone) begin @@ -1902,6 +1923,7 @@ module vmfpu import ara_pkg::*; import rvv_pkg::*; import fpnew_pkg::*; intra_op_rx_cnt_q <= '0; osum_issue_cnt_q <= '0; mfpu_vxsat_q <= '0; + addr_q <= '0; end else begin issue_cnt_q <= issue_cnt_d; to_process_cnt_q <= to_process_cnt_d; @@ -1925,6 +1947,7 @@ module vmfpu import ara_pkg::*; import rvv_pkg::*; import fpnew_pkg::*; intra_op_rx_cnt_q <= intra_op_rx_cnt_d; osum_issue_cnt_q <= osum_issue_cnt_d; mfpu_vxsat_q <= mfpu_vxsat_d; + addr_q <= addr_d; end end diff --git a/hardware/src/masku/masku.sv b/hardware/src/masku/masku.sv index afea302f6..58cc11f1b 100644 --- a/hardware/src/masku/masku.sv +++ b/hardware/src/masku/masku.sv @@ -10,12 +10,18 @@ // predicated instructions. module masku import ara_pkg::*; import rvv_pkg::*; #( - parameter int unsigned NrLanes = 0, - parameter type vaddr_t = logic, // Type used to address vector register file elements + parameter int unsigned NrLanes = 0, + // Address of an element in the lane's VRF + localparam int unsigned MaxVLenBPerLane = VLENB / NrLanes, // In bytes + localparam int unsigned VRFBSizePerLane = MaxVLenBPerLane * 32, // In bytes + localparam int unsigned VaddrIdxWidth = $clog2(VRFBSizePerLane), + localparam int unsigned VaddrBankWidth = $clog2(NrVRFBanksPerLane), + localparam int unsigned VaddrVregWidth = $clog2(MaxVLenBPerLane), + localparam type vaddr_t = logic [VaddrIdxWidth-1:0], // Dependant parameters. DO NOT CHANGE! 
- localparam int unsigned DataWidth = $bits(elen_t), // Width of the lane datapath - localparam int unsigned StrbWidth = DataWidth/8, - localparam type strb_t = logic [StrbWidth-1:0] // Byte-strobe type + localparam int unsigned DataWidth = $bits(elen_t), // Width of the lane datapath + localparam int unsigned StrbWidth = DataWidth/8, + localparam type strb_t = logic [StrbWidth-1:0] // Byte-strobe type ) ( input logic clk_i, input logic rst_ni, @@ -48,6 +54,9 @@ module masku import ara_pkg::*; import rvv_pkg::*; #( input logic sldu_mask_ready_i ); + // Include address-handling functions + `include "../../include/ara_vaddr.svh" + import cf_math_pkg::idx_width; //////////////// @@ -142,6 +151,8 @@ module masku import ara_pkg::*; import rvv_pkg::*; #( // There is a mask queue per lane, holding the operands that were not // yet used by the corresponding lane. + vaddr_t addr_d, addr_q; + // Mask queue strb_t [MaskQueueDepth-1:0][NrLanes-1:0] mask_queue_d, mask_queue_q; logic [MaskQueueDepth-1:0][NrLanes-1:0] mask_queue_valid_d, mask_queue_valid_q; @@ -647,6 +658,8 @@ module masku import ara_pkg::*; import rvv_pkg::*; #( popcount_d = popcount_q; vfirst_count_d = vfirst_count_q; + addr_d = addr_q; + mask_queue_d = mask_queue_q; mask_queue_valid_d = mask_queue_valid_q; mask_queue_write_pnt_d = mask_queue_write_pnt_q; @@ -732,6 +745,9 @@ module masku import ara_pkg::*; import rvv_pkg::*; #( else mask_queue_write_pnt_d = mask_queue_write_pnt_q + 1; + // Increment write-back address + addr_d = next_vaddr(addr_q, vinsn_issue.vd); + // Account for the operands that were issued read_cnt_d = read_cnt_q - NrLanes * (1 << (int'(EW64) - vinsn_issue.vtype.vsew)); if (read_cnt_q < NrLanes * (1 << (int'(EW64) - vinsn_issue.vtype.vsew))) @@ -838,9 +854,8 @@ module masku import ara_pkg::*; import rvv_pkg::*; #( result_queue_d[result_queue_write_pnt_q][lane] = '{ wdata: result_queue_q[result_queue_write_pnt_q][lane].wdata | alu_result[lane], be : (vinsn_issue.op inside {[VMSBF:VID]}) ? 
'1 : be(element_cnt, vinsn_issue.vtype.vsew), - addr : (vinsn_issue.op inside {[VMSBF:VID]}) ? vaddr(vinsn_issue.vd, NrLanes) + ((vinsn_issue.vl - issue_cnt_q) >> (int'(EW64) - vinsn_issue.vtype.vsew)) : vaddr(vinsn_issue.vd, NrLanes) + - (((vinsn_issue.vl - issue_cnt_q) / NrLanes / DataWidth)), - id : vinsn_issue.id + addr : addr_q, + id : vinsn_issue.id }; end @@ -848,6 +863,9 @@ module masku import ara_pkg::*; import rvv_pkg::*; #( if (vinsn_issue.op inside {[VMFEQ:VMSGTU], [VMSGT:VMSBC]}) begin vrf_pnt_d = vrf_pnt_q + (NrLanes << (int'(EW64) - vinsn_issue.vtype.vsew)); + // Increment write-back address + addr_d = next_vaddr(addr_q, vinsn_issue.vd); + // Filled-up a word, or finished execution if (vrf_pnt_d == DataWidth*NrLanes || vrf_pnt_d >= issue_cnt_q) begin result_queue_valid_d[result_queue_write_pnt_q] = {NrLanes{1'b1}}; @@ -1077,6 +1095,9 @@ module masku import ara_pkg::*; import rvv_pkg::*; #( issue_cnt_d = pe_req_i.vl; read_cnt_d = pe_req_i.vl; + // Initialize the starting address of the next instruction + addr_d = vaddr(pe_req_i.vd, NrLanes); + // Trim skipped words if (pe_req_i.op == VSLIDEUP) begin issue_cnt_d -= vlen_t'(trimmed_stride); @@ -1131,6 +1152,7 @@ module masku import ara_pkg::*; import rvv_pkg::*; #( result_final_gnt_q <= '0; popcount_q <= '0; vfirst_count_q <= '0; + addr_q <= '0; end else begin vinsn_running_q <= vinsn_running_d; read_cnt_q <= read_cnt_d; @@ -1142,6 +1164,7 @@ module masku import ara_pkg::*; import rvv_pkg::*; #( result_final_gnt_q <= result_final_gnt_d; popcount_q <= popcount_d; vfirst_count_q <= vfirst_count_d; + addr_q <= addr_d; end end diff --git a/hardware/src/sldu/sldu.sv b/hardware/src/sldu/sldu.sv index 9c06c3ac5..7439bed7a 100644 --- a/hardware/src/sldu/sldu.sv +++ b/hardware/src/sldu/sldu.sv @@ -8,12 +8,18 @@ // instructions, which need access to the whole Vector Register File. 
module sldu import ara_pkg::*; import rvv_pkg::*; #( - parameter int unsigned NrLanes = 0, - parameter type vaddr_t = logic, // Type used to address vector register file elements + parameter int unsigned NrLanes = 0, + // Address of an element in the lane's VRF + localparam int unsigned MaxVLenBPerLane = VLENB / NrLanes, // In bytes + localparam int unsigned VRFBSizePerLane = MaxVLenBPerLane * 32, // In bytes + localparam int unsigned VaddrIdxWidth = $clog2(VRFBSizePerLane), + localparam int unsigned VaddrBankWidth = $clog2(NrVRFBanksPerLane), + localparam int unsigned VaddrVregWidth = $clog2(MaxVLenBPerLane), + localparam type vaddr_t = logic [VaddrIdxWidth-1:0], // Dependant parameters. DO NOT CHANGE! - localparam int unsigned DataWidth = $bits(elen_t), // Width of the lane datapath - localparam int unsigned StrbWidth = DataWidth/8, - localparam type strb_t = logic [StrbWidth-1:0] // Byte-strobe type + localparam int unsigned DataWidth = $bits(elen_t), // Width of the lane datapath + localparam int unsigned StrbWidth = DataWidth/8, + localparam type strb_t = logic [StrbWidth-1:0] // Byte-strobe type ) ( input logic clk_i, input logic rst_ni, @@ -46,6 +52,9 @@ module sldu import ara_pkg::*; import rvv_pkg::*; #( `include "common_cells/registers.svh" + // Include address-handling functions + `include "../../include/ara_vaddr.svh" + import cf_math_pkg::idx_width; //////////////////////////////// @@ -108,6 +117,8 @@ module sldu import ara_pkg::*; import rvv_pkg::*; #( localparam int unsigned ResultQueueDepth = 2; + vaddr_t addr_d, addr_q; + // There is a result queue per lane, holding the results that were not // yet accepted by the corresponding lane. 
typedef struct packed { @@ -220,6 +231,7 @@ module sldu import ara_pkg::*; import rvv_pkg::*; #( out_pnt_d = out_pnt_q; vrf_pnt_d = vrf_pnt_q; state_d = state_q; + addr_d = addr_q; result_queue_d = result_queue_q; result_queue_valid_d = result_queue_valid_q; @@ -268,6 +280,9 @@ module sldu import ara_pkg::*; import rvv_pkg::*; #( // Start writing at the middle of the destination vector vrf_pnt_d = vinsn_issue_q.stride >> $clog2(8*NrLanes); + // Fix the starting address + addr_d = vaddr_offset(addr_q, vrf_pnt_d, vinsn_issue_q.vd); + // Go to SLIDE_RUN_VSLIDE1UP_FIRST_WORD if this is a vslide1up instruction if (vinsn_issue_q.use_scalar_op) state_d = SLIDE_RUN_VSLIDE1UP_FIRST_WORD; @@ -349,8 +364,7 @@ module sldu import ara_pkg::*; import rvv_pkg::*; #( // Initialize id and addr fields of the result queue requests for (int lane = 0; lane < NrLanes; lane++) begin result_queue_d[result_queue_write_pnt_q][lane].id = vinsn_issue_q.id; - result_queue_d[result_queue_write_pnt_q][lane].addr = - vaddr(vinsn_issue_q.vd, NrLanes) + vrf_pnt_q; + result_queue_d[result_queue_write_pnt_q][lane].addr = addr_q; end // Bump pointers (reductions always finish in one shot) @@ -409,8 +423,8 @@ module sldu import ara_pkg::*; import rvv_pkg::*; #( if (vinsn_issue_q.op inside {VSLIDEUP, VSLIDEDOWN}) mask_ready_o = !vinsn_issue_q.vm; - // Increment VRF address - vrf_pnt_d = vrf_pnt_q + 1; + // Increment write-back address + addr_d = vaddr_offset(addr_q, 1, vinsn_issue_q.vd); // Send result to the VRF result_queue_cnt_d += 1; @@ -466,6 +480,8 @@ module sldu import ara_pkg::*; import rvv_pkg::*; #( // Increment vector instruction queue pointers and counters vinsn_queue_d.issue_pnt += 1; vinsn_queue_d.issue_cnt -= 1; + + addr_d = vaddr(vinsn_queue_q.vinsn[vinsn_queue_d.issue_pnt].vd, NrLanes); end end end @@ -500,6 +516,8 @@ module sldu import ara_pkg::*; import rvv_pkg::*; #( // Increment vector instruction queue pointers and counters vinsn_queue_d.issue_pnt += 1; vinsn_queue_d.issue_cnt -= 1; 
+ + addr_d = vaddr(vinsn_queue_q.vinsn[vinsn_queue_d.issue_pnt].vd, NrLanes); end end SLIDE_WAIT_OSUM: begin @@ -607,6 +625,9 @@ module sldu import ara_pkg::*; import rvv_pkg::*; #( // VSLIDE1UP always writes at least 1 element if (pe_req_i.op == VSLIDEUP && !pe_req_i.use_scalar_op) issue_cnt_d -= vinsn_queue_d.vinsn[vinsn_queue_q.accept_pnt].stride; + + // Initialize the starting address for the next instruction + addr_d = vaddr(pe_req_i.vd, NrLanes); end if (vinsn_queue_d.commit_cnt == '0) begin commit_cnt_d = pe_req_i.op inside {VSLIDEUP, VSLIDEDOWN} @@ -638,6 +659,7 @@ module sldu import ara_pkg::*; import rvv_pkg::*; #( pe_resp_o <= '0; result_final_gnt_q <= '0; red_stride_cnt_q <= 1; + addr_q <= '0; end else begin vinsn_running_q <= vinsn_running_d; issue_cnt_q <= issue_cnt_d; @@ -649,6 +671,7 @@ module sldu import ara_pkg::*; import rvv_pkg::*; #( pe_resp_o <= pe_resp; result_final_gnt_q <= result_final_gnt_d; red_stride_cnt_q <= red_stride_cnt_d; + addr_q <= addr_d; end end diff --git a/hardware/src/vlsu/vldu.sv b/hardware/src/vlsu/vldu.sv index 61b26623a..6f94d9ec1 100644 --- a/hardware/src/vlsu/vldu.sv +++ b/hardware/src/vlsu/vldu.sv @@ -8,16 +8,22 @@ // upon receiving vector memory operations. 
module vldu import ara_pkg::*; import rvv_pkg::*; #( - parameter int unsigned NrLanes = 0, - parameter type vaddr_t = logic, // Type used to address vector register file elements + parameter int unsigned NrLanes = 0, + // Address of an element in the lane's VRF + localparam int unsigned MaxVLenBPerLane = VLENB / NrLanes, // In bytes + localparam int unsigned VRFBSizePerLane = MaxVLenBPerLane * 32, // In bytes + localparam int unsigned VaddrIdxWidth = $clog2(VRFBSizePerLane), + localparam int unsigned VaddrBankWidth = $clog2(NrVRFBanksPerLane), + localparam int unsigned VaddrVregWidth = $clog2(MaxVLenBPerLane), + localparam type vaddr_t = logic [VaddrIdxWidth-1:0], // AXI Interface parameters - parameter int unsigned AxiDataWidth = 0, - parameter int unsigned AxiAddrWidth = 0, - parameter type axi_r_t = logic, + parameter int unsigned AxiDataWidth = 0, + parameter int unsigned AxiAddrWidth = 0, + parameter type axi_r_t = logic, // Dependant parameters. DO NOT CHANGE! - localparam int DataWidth = $bits(elen_t), - localparam type strb_t = logic[DataWidth/8-1:0], - localparam type axi_addr_t = logic [AxiAddrWidth-1:0] + localparam int DataWidth = $bits(elen_t), + localparam type strb_t = logic[DataWidth/8-1:0], + localparam type axi_addr_t = logic [AxiAddrWidth-1:0] ) ( input logic clk_i, input logic rst_ni, @@ -55,7 +61,11 @@ module vldu import ara_pkg::*; import rvv_pkg::*; #( output logic mask_ready_o ); + // Include address-handling functions + `include "../../include/ara_vaddr.svh" + import cf_math_pkg::idx_width; + import axi_pkg::beat_lower_byte; import axi_pkg::beat_upper_byte; import axi_pkg::BURST_INCR; @@ -125,6 +135,8 @@ module vldu import ara_pkg::*; import rvv_pkg::*; #( localparam int unsigned ResultQueueDepth = 2; + vaddr_t addr_d, addr_q; + // There is a result queue per lane, holding the results that were not // yet accepted by the corresponding lane. 
typedef struct packed { @@ -204,6 +216,7 @@ module vldu import ara_pkg::*; import rvv_pkg::*; #( vinsn_queue_d = vinsn_queue_q; issue_cnt_d = issue_cnt_q; commit_cnt_d = commit_cnt_q; + addr_d = addr_q; len_d = len_q; r_pnt_d = r_pnt_q; @@ -293,9 +306,7 @@ module vldu import ara_pkg::*; import rvv_pkg::*; #( // Initialize id and addr fields of the result queue requests for (int lane = 0; lane < NrLanes; lane++) begin result_queue_d[result_queue_write_pnt_q][lane].id = vinsn_issue_q.id; - result_queue_d[result_queue_write_pnt_q][lane].addr = vaddr(vinsn_issue_q.vd, NrLanes) + - (((vinsn_issue_q.vl - (issue_cnt_q >> int'(vinsn_issue_q.vtype.vsew))) / NrLanes) >> - (int'(EW64) - int'(vinsn_issue_q.vtype.vsew))); + result_queue_d[result_queue_write_pnt_q][lane].addr = addr_q; end end @@ -311,6 +322,9 @@ module vldu import ara_pkg::*; import rvv_pkg::*; #( // Trigger the request signal result_queue_valid_d[result_queue_write_pnt_q] = {NrLanes{1'b1}}; + // Increase the address + addr_d = next_vaddr(addr_q, vinsn_issue_q.vd); + // Acknowledge the mask operands mask_ready_o = !vinsn_issue_q.vm; @@ -349,6 +363,9 @@ module vldu import ara_pkg::*; import rvv_pkg::*; #( else vinsn_queue_d.issue_pnt += 1; + // Modify the next instruction's address + addr_d = vaddr(vinsn_queue_q.vinsn[vinsn_queue_d.issue_pnt].vd, NrLanes); + // Prepare for the next vector instruction if (vinsn_queue_d.issue_cnt != 0) issue_cnt_d = vinsn_queue_q.vinsn[vinsn_queue_d.issue_pnt].vl << int'(vinsn_queue_q.vinsn[ @@ -439,8 +456,10 @@ module vldu import ara_pkg::*; import rvv_pkg::*; #( vinsn_running_d[pe_req_i.id] = 1'b1; // Initialize counters - if (vinsn_queue_d.issue_cnt == '0) + if (vinsn_queue_d.issue_cnt == '0) begin issue_cnt_d = pe_req_i.vl << int'(pe_req_i.vtype.vsew); + addr_d = vaddr(pe_req_i.vd, NrLanes); + end if (vinsn_queue_d.commit_cnt == '0) commit_cnt_d = pe_req_i.vl << int'(pe_req_i.vtype.vsew); @@ -461,6 +480,7 @@ module vldu import ara_pkg::*; import rvv_pkg::*; #( vrf_pnt_q <= '0; 
pe_resp_o <= '0; result_final_gnt_q <= '0; + addr_q <= '0; end else begin vinsn_running_q <= vinsn_running_d; issue_cnt_q <= issue_cnt_d; @@ -470,6 +490,7 @@ module vldu import ara_pkg::*; import rvv_pkg::*; #( vrf_pnt_q <= vrf_pnt_d; pe_resp_o <= pe_resp; result_final_gnt_q <= result_final_gnt_d; + addr_q <= addr_d; end end diff --git a/hardware/src/vlsu/vlsu.sv b/hardware/src/vlsu/vlsu.sv index 448b53a87..c86b7ee15 100644 --- a/hardware/src/vlsu/vlsu.sv +++ b/hardware/src/vlsu/vlsu.sv @@ -158,8 +158,7 @@ module vlsu import ara_pkg::*; import rvv_pkg::*; #( .AxiAddrWidth(AxiAddrWidth), .AxiDataWidth(AxiDataWidth), .axi_r_t (axi_r_t ), - .NrLanes (NrLanes ), - .vaddr_t (vaddr_t ) + .NrLanes (NrLanes ) ) i_vldu ( .clk_i (clk_i ), .rst_ni (rst_ni ), From 40b408b5618848190446adf55b949e5d7734f4d1 Mon Sep 17 00:00:00 2001 From: Matteo Perotti Date: Mon, 5 Dec 2022 20:32:44 +0100 Subject: [PATCH 5/8] [hardware] Handle slide1x and widening hazards with a special protocol Slide1Up/Down were blocked in the main sequencer when they had specific hazards. Now, these hazards are handled downstream, waiting for 1 cycle of stall and then continuing with the usual protocol. WAW hazards for widening instructions are also handled better now, discriminating between real widening instructions and reductions. 
--- hardware/include/ara_pkg.sv | 5 + hardware/src/ara_dispatcher.sv | 80 +++++++++- hardware/src/ara_sequencer.sv | 7 +- hardware/src/lane/lane_sequencer.sv | 208 +++++++++++++------------ hardware/src/lane/operand_requester.sv | 93 +++++++---- 5 files changed, 254 insertions(+), 139 deletions(-) diff --git a/hardware/include/ara_pkg.sv b/hardware/include/ara_pkg.sv index 6fe041e70..6c45d77ff 100644 --- a/hardware/include/ara_pkg.sv +++ b/hardware/include/ara_pkg.sv @@ -300,6 +300,8 @@ package ara_pkg; logic wide_fp_imm; // Resizing of FP conversions resize_e cvt_resize; + // Widening and vslide1x instructions have different hazard stall policies + logic special_hazard; // Vector machine metadata vlen_t vl; @@ -397,6 +399,8 @@ package ara_pkg; logic wide_fp_imm; // Resizing of FP conversions resize_e cvt_resize; + // Widening and vslide1x instructions have different hazard stall policies + logic special_hazard; // Vector machine metadata vlen_t vl; @@ -894,6 +898,7 @@ package ara_pkg; logic scale_vl; // Rescale vl taking into account the new and old EEW resize_e cvt_resize; // Resizing of FP conversions + logic special_hazard; // Widening and vslide1x instructions have different hazard stall policies logic is_reduct; // Is this a reduction? 
diff --git a/hardware/src/ara_dispatcher.sv b/hardware/src/ara_dispatcher.sv index 6fe3783dc..22c1cd5b1 100644 --- a/hardware/src/ara_dispatcher.sv +++ b/hardware/src/ara_dispatcher.sv @@ -681,6 +681,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( ara_req_d.conversion_vs1 = OpQueueReductionZExt; ara_req_d.conversion_vs2 = OpQueueConversionZExt2; ara_req_d.cvt_resize = CVT_WIDE; + ara_req_d.special_hazard = 1'b1; end 6'b110001: begin ara_req_d.op = ara_pkg::VWREDSUM; @@ -690,6 +691,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( ara_req_d.conversion_vs1 = OpQueueReductionZExt; ara_req_d.conversion_vs2 = OpQueueConversionSExt2; ara_req_d.cvt_resize = CVT_WIDE; + ara_req_d.special_hazard = 1'b1; end default: illegal_insn = 1'b1; endcase @@ -1300,6 +1302,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( ara_req_d.conversion_vs2 = OpQueueConversionZExt8; ara_req_d.eew_vs2 = eew_q[insn.varith_type.rs2]; ara_req_d.cvt_resize = CVT_WIDE; + ara_req_d.special_hazard = 1'b1; // Invalid conversion if (int'(vtype_q.vsew) < int'(EW64) || @@ -1310,6 +1313,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( ara_req_d.conversion_vs2 = OpQueueConversionSExt8; ara_req_d.eew_vs2 = eew_q[insn.varith_type.rs2]; ara_req_d.cvt_resize = CVT_WIDE; + ara_req_d.special_hazard = 1'b1; // Invalid conversion if (int'(vtype_q.vsew) < int'(EW64) || @@ -1320,6 +1324,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( ara_req_d.conversion_vs2 = OpQueueConversionZExt4; ara_req_d.eew_vs2 = prev_prev_ew(vtype_q.vsew); ara_req_d.cvt_resize = CVT_WIDE; + ara_req_d.special_hazard = 1'b1; // Invalid conversion if (int'(vtype_q.vsew) < int'(EW32) || @@ -1329,6 +1334,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( ara_req_d.conversion_vs2 = OpQueueConversionSExt4; ara_req_d.eew_vs2 = prev_prev_ew(vtype_q.vsew); ara_req_d.cvt_resize = CVT_WIDE; + ara_req_d.special_hazard = 1'b1; // Invalid conversion 
if (int'(vtype_q.vsew) < int'(EW32) || @@ -1338,6 +1344,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( ara_req_d.conversion_vs2 = OpQueueConversionZExt2; ara_req_d.eew_vs2 = vtype_q.vsew.prev(); ara_req_d.cvt_resize = CVT_WIDE; + ara_req_d.special_hazard = 1'b1; // Invalid conversion if (int'(vtype_q.vsew) < int'(EW16) || int'(vtype_q.vlmul) inside {LMUL_1_8}) @@ -1347,6 +1354,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( ara_req_d.conversion_vs2 = OpQueueConversionSExt2; ara_req_d.eew_vs2 = vtype_q.vsew.prev(); ara_req_d.cvt_resize = CVT_WIDE; + ara_req_d.special_hazard = 1'b1; // Invalid conversion if (int'(vtype_q.vsew) < int'(EW16) || int'(vtype_q.vlmul) inside {LMUL_1_8}) @@ -1394,6 +1402,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( ara_req_d.conversion_vs1 = OpQueueConversionZExt2; ara_req_d.conversion_vs2 = OpQueueConversionZExt2; ara_req_d.cvt_resize = CVT_WIDE; + ara_req_d.special_hazard = 1'b1; end 6'b110001: begin // VWADD ara_req_d.op = ara_pkg::VADD; @@ -1402,6 +1411,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( ara_req_d.conversion_vs1 = OpQueueConversionSExt2; ara_req_d.conversion_vs2 = OpQueueConversionSExt2; ara_req_d.cvt_resize = CVT_WIDE; + ara_req_d.special_hazard = 1'b1; end 6'b110010: begin // VWSUBU ara_req_d.op = ara_pkg::VSUB; @@ -1410,6 +1420,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( ara_req_d.conversion_vs1 = OpQueueConversionZExt2; ara_req_d.conversion_vs2 = OpQueueConversionZExt2; ara_req_d.cvt_resize = CVT_WIDE; + ara_req_d.special_hazard = 1'b1; end 6'b110011: begin // VWSUB ara_req_d.op = ara_pkg::VSUB; @@ -1418,6 +1429,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( ara_req_d.conversion_vs1 = OpQueueConversionSExt2; ara_req_d.conversion_vs2 = OpQueueConversionSExt2; ara_req_d.cvt_resize = CVT_WIDE; + ara_req_d.special_hazard = 1'b1; end 6'b110100: begin // VWADDU.W ara_req_d.op = ara_pkg::VADD; @@ 
-1427,6 +1439,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( ara_req_d.conversion_vs1 = OpQueueConversionZExt2; ara_req_d.eew_vs2 = vtype_q.vsew.next(); ara_req_d.cvt_resize = CVT_WIDE; + ara_req_d.special_hazard = 1'b1; end 6'b110101: begin // VWADD.W ara_req_d.op = ara_pkg::VADD; @@ -1436,6 +1449,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( ara_req_d.conversion_vs1 = OpQueueConversionSExt2; ara_req_d.eew_vs2 = vtype_q.vsew.next(); ara_req_d.cvt_resize = CVT_WIDE; + ara_req_d.special_hazard = 1'b1; end 6'b110110: begin // VWSUBU.W ara_req_d.op = ara_pkg::VSUB; @@ -1445,6 +1459,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( ara_req_d.conversion_vs1 = OpQueueConversionZExt2; ara_req_d.eew_vs2 = vtype_q.vsew.next(); ara_req_d.cvt_resize = CVT_WIDE; + ara_req_d.special_hazard = 1'b1; end 6'b110111: begin // VWSUB.W ara_req_d.op = ara_pkg::VSUB; @@ -1454,6 +1469,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( ara_req_d.conversion_vs1 = OpQueueConversionSExt2; ara_req_d.eew_vs2 = vtype_q.vsew.next(); ara_req_d.cvt_resize = CVT_WIDE; + ara_req_d.special_hazard = 1'b1; end 6'b111000: begin // VWMULU ara_req_d.op = ara_pkg::VMUL; @@ -1462,6 +1478,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( ara_req_d.conversion_vs1 = OpQueueConversionZExt2; ara_req_d.conversion_vs2 = OpQueueConversionZExt2; ara_req_d.cvt_resize = CVT_WIDE; + ara_req_d.special_hazard = 1'b1; end 6'b111010: begin // VWMULSU ara_req_d.op = ara_pkg::VMUL; @@ -1470,6 +1487,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( ara_req_d.conversion_vs1 = OpQueueConversionZExt2; ara_req_d.conversion_vs2 = OpQueueConversionSExt2; ara_req_d.cvt_resize = CVT_WIDE; + ara_req_d.special_hazard = 1'b1; end 6'b111011: begin // VWMUL ara_req_d.op = ara_pkg::VMUL; @@ -1478,6 +1496,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( ara_req_d.conversion_vs1 = OpQueueConversionSExt2; 
ara_req_d.conversion_vs2 = OpQueueConversionSExt2; ara_req_d.cvt_resize = CVT_WIDE; + ara_req_d.special_hazard = 1'b1; end 6'b111100: begin // VWMACCU ara_req_d.op = ara_pkg::VMACC; @@ -1508,6 +1527,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( ara_req_d.conversion_vs2 = OpQueueConversionZExt2; ara_req_d.eew_vd_op = vtype_q.vsew.next(); ara_req_d.cvt_resize = CVT_WIDE; + ara_req_d.special_hazard = 1'b1; end default: illegal_insn = 1'b1; endcase @@ -1567,6 +1587,8 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( ara_req_d.eew_vs2 = vtype_q.vsew; // Request will need reshuffling ara_req_d.scale_vl = 1'b1; + // Special hazard handling for this instruction + ara_req_d.special_hazard = 1'b1; // If stride > vl, the vslideup has no effects if (|ara_req_d.stride[$bits(ara_req_d.stride)-1:$bits(vl_q)] || (vlen_t'(ara_req_d.stride) >= vl_q)) null_vslideup = 1'b1; @@ -1577,6 +1599,8 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( ara_req_d.eew_vs2 = vtype_q.vsew; // Request will need reshuffling ara_req_d.scale_vl = 1'b1; + // Special hazard handling for this instruction + ara_req_d.special_hazard = 1'b1; end 6'b010000: begin // VRXUNARY0 // vmv.s.x @@ -1625,6 +1649,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( ara_req_d.conversion_vs1 = OpQueueConversionZExt2; ara_req_d.conversion_vs2 = OpQueueConversionZExt2; ara_req_d.cvt_resize = CVT_WIDE; + ara_req_d.special_hazard = 1'b1; end 6'b110001: begin // VWADD ara_req_d.op = ara_pkg::VADD; @@ -1633,6 +1658,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( ara_req_d.conversion_vs1 = OpQueueConversionSExt2; ara_req_d.conversion_vs2 = OpQueueConversionSExt2; ara_req_d.cvt_resize = CVT_WIDE; + ara_req_d.special_hazard = 1'b1; end 6'b110010: begin // VWSUBU ara_req_d.op = ara_pkg::VSUB; @@ -1641,6 +1667,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( ara_req_d.conversion_vs1 = OpQueueConversionZExt2; 
ara_req_d.conversion_vs2 = OpQueueConversionZExt2; ara_req_d.cvt_resize = CVT_WIDE; + ara_req_d.special_hazard = 1'b1; end 6'b110011: begin // VWSUB ara_req_d.op = ara_pkg::VSUB; @@ -1649,6 +1676,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( ara_req_d.conversion_vs1 = OpQueueConversionSExt2; ara_req_d.conversion_vs2 = OpQueueConversionSExt2; ara_req_d.cvt_resize = CVT_WIDE; + ara_req_d.special_hazard = 1'b1; end 6'b110100: begin // VWADDU.W ara_req_d.op = ara_pkg::VADD; @@ -1658,6 +1686,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( ara_req_d.conversion_vs1 = OpQueueConversionZExt2; ara_req_d.eew_vs2 = vtype_q.vsew.next(); ara_req_d.cvt_resize = CVT_WIDE; + ara_req_d.special_hazard = 1'b1; end 6'b110101: begin // VWADD.W ara_req_d.op = ara_pkg::VADD; @@ -1667,6 +1696,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( ara_req_d.conversion_vs1 = OpQueueConversionSExt2; ara_req_d.eew_vs2 = vtype_q.vsew.next(); ara_req_d.cvt_resize = CVT_WIDE; + ara_req_d.special_hazard = 1'b1; end 6'b110110: begin // VWSUBU.W ara_req_d.op = ara_pkg::VSUB; @@ -1676,6 +1706,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( ara_req_d.conversion_vs1 = OpQueueConversionZExt2; ara_req_d.eew_vs2 = vtype_q.vsew.next(); ara_req_d.cvt_resize = CVT_WIDE; + ara_req_d.special_hazard = 1'b1; end 6'b110111: begin // VWSUB.W ara_req_d.op = ara_pkg::VSUB; @@ -1685,6 +1716,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( ara_req_d.conversion_vs1 = OpQueueConversionSExt2; ara_req_d.eew_vs2 = vtype_q.vsew.next(); ara_req_d.cvt_resize = CVT_WIDE; + ara_req_d.special_hazard = 1'b1; end 6'b111000: begin // VWMULU ara_req_d.op = ara_pkg::VMUL; @@ -1693,6 +1725,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( ara_req_d.conversion_vs1 = OpQueueConversionZExt2; ara_req_d.conversion_vs2 = OpQueueConversionZExt2; ara_req_d.cvt_resize = CVT_WIDE; + ara_req_d.special_hazard = 1'b1; end 6'b111010: 
begin // VWMULSU ara_req_d.op = ara_pkg::VMUL; @@ -1701,6 +1734,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( ara_req_d.conversion_vs1 = OpQueueConversionZExt2; ara_req_d.conversion_vs2 = OpQueueConversionSExt2; ara_req_d.cvt_resize = CVT_WIDE; + ara_req_d.special_hazard = 1'b1; end 6'b111011: begin // VWMUL ara_req_d.op = ara_pkg::VMUL; @@ -1709,6 +1743,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( ara_req_d.conversion_vs1 = OpQueueConversionSExt2; ara_req_d.conversion_vs2 = OpQueueConversionSExt2; ara_req_d.cvt_resize = CVT_WIDE; + ara_req_d.special_hazard = 1'b1; end 6'b111100: begin // VWMACCU ara_req_d.op = ara_pkg::VMACC; @@ -1719,6 +1754,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( ara_req_d.conversion_vs2 = OpQueueConversionZExt2; ara_req_d.eew_vd_op = vtype_q.vsew.next(); ara_req_d.cvt_resize = CVT_WIDE; + ara_req_d.special_hazard = 1'b1; end 6'b111101: begin // VWMACC ara_req_d.op = ara_pkg::VMACC; @@ -1729,6 +1765,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( ara_req_d.conversion_vs2 = OpQueueConversionSExt2; ara_req_d.eew_vd_op = vtype_q.vsew.next(); ara_req_d.cvt_resize = CVT_WIDE; + ara_req_d.special_hazard = 1'b1; end 6'b111110: begin // VWMACCUS ara_req_d.op = ara_pkg::VMACC; @@ -1739,6 +1776,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( ara_req_d.conversion_vs2 = OpQueueConversionSExt2; ara_req_d.eew_vd_op = vtype_q.vsew.next(); ara_req_d.cvt_resize = CVT_WIDE; + ara_req_d.special_hazard = 1'b1; end 6'b111111: begin // VWMACCSU ara_req_d.op = ara_pkg::VMACC; @@ -1749,6 +1787,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( ara_req_d.conversion_vs2 = OpQueueConversionZExt2; ara_req_d.eew_vd_op = vtype_q.vsew.next(); ara_req_d.cvt_resize = CVT_WIDE; + ara_req_d.special_hazard = 1'b1; end default: illegal_insn = 1'b1; endcase @@ -1883,6 +1922,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( 5'b01000: 
begin // Widening VFCVTXUF ara_req_d.op = VFCVTXUF; ara_req_d.cvt_resize = CVT_WIDE; + ara_req_d.special_hazard = 1'b1; ara_req_d.emul = next_lmul(vtype_q.vlmul); ara_req_d.vtype.vsew = vtype_q.vsew.next(); ara_req_d.conversion_vs2 = OpQueueAdjustFPCvt; @@ -1890,6 +1930,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( 5'b01001: begin // Widening VFCVTXF ara_req_d.op = VFCVTXF; ara_req_d.cvt_resize = CVT_WIDE; + ara_req_d.special_hazard = 1'b1; ara_req_d.emul = next_lmul(vtype_q.vlmul); ara_req_d.vtype.vsew = vtype_q.vsew.next(); ara_req_d.conversion_vs2 = OpQueueAdjustFPCvt; @@ -1897,6 +1938,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( 5'b01010: begin // Widening VFCVTFXU ara_req_d.op = VFCVTFXU; ara_req_d.cvt_resize = CVT_WIDE; + ara_req_d.special_hazard = 1'b1; ara_req_d.emul = next_lmul(vtype_q.vlmul); ara_req_d.vtype.vsew = vtype_q.vsew.next(); ara_req_d.conversion_vs2 = OpQueueAdjustFPCvt; @@ -1904,6 +1946,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( 5'b01011: begin // Widening VFCVTFX ara_req_d.op = VFCVTFX; ara_req_d.cvt_resize = CVT_WIDE; + ara_req_d.special_hazard = 1'b1; ara_req_d.emul = next_lmul(vtype_q.vlmul); ara_req_d.vtype.vsew = vtype_q.vsew.next(); ara_req_d.conversion_vs2 = OpQueueAdjustFPCvt; @@ -1911,6 +1954,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( 5'b01100: begin // Widening VFCVTFF ara_req_d.op = VFCVTFF; ara_req_d.cvt_resize = CVT_WIDE; + ara_req_d.special_hazard = 1'b1; ara_req_d.emul = next_lmul(vtype_q.vlmul); ara_req_d.vtype.vsew = vtype_q.vsew.next(); ara_req_d.conversion_vs2 = OpQueueAdjustFPCvt; @@ -1918,6 +1962,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( 5'b01110: begin // Widening VFCVTRTZXUF ara_req_d.op = VFCVTRTZXUF; ara_req_d.cvt_resize = CVT_WIDE; + ara_req_d.special_hazard = 1'b1; ara_req_d.emul = next_lmul(vtype_q.vlmul); ara_req_d.vtype.vsew = vtype_q.vsew.next(); ara_req_d.conversion_vs2 = OpQueueAdjustFPCvt; 
@@ -1925,6 +1970,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( 5'b01111: begin // Widening VFCVTRTZXF ara_req_d.op = VFCVTRTZXF; ara_req_d.cvt_resize = CVT_WIDE; + ara_req_d.special_hazard = 1'b1; ara_req_d.emul = next_lmul(vtype_q.vlmul); ara_req_d.vtype.vsew = vtype_q.vsew.next(); ara_req_d.conversion_vs2 = OpQueueAdjustFPCvt; @@ -2032,6 +2078,8 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( ara_req_d.vtype.vsew = vtype_q.vsew.next(); ara_req_d.conversion_vs1 = OpQueueConversionWideFP2; ara_req_d.conversion_vs2 = OpQueueConversionWideFP2; + ara_req_d.cvt_resize = CVT_WIDE; + ara_req_d.special_hazard = 1'b1; end 6'b110001: begin // VFWREDUSUM ara_req_d.op = ara_pkg::VFWREDUSUM; @@ -2041,7 +2089,8 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( ara_req_d.conversion_vs1 = OpQueueReductionZExt; ara_req_d.conversion_vs2 = OpQueueConversionWideFP2; ara_req_d.eew_vs1 = vtype_q.vsew.next(); - ara_req_d.cvt_resize = resize_e'(2'b00); + ara_req_d.cvt_resize = CVT_WIDE; + ara_req_d.special_hazard = 1'b1; end 6'b110010: begin // VFWSUB ara_req_d.op = ara_pkg::VFSUB; @@ -2050,6 +2099,8 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( ara_req_d.vtype.vsew = vtype_q.vsew.next(); ara_req_d.conversion_vs1 = OpQueueConversionWideFP2; ara_req_d.conversion_vs2 = OpQueueConversionWideFP2; + ara_req_d.cvt_resize = CVT_WIDE; + ara_req_d.special_hazard = 1'b1; end 6'b110011: begin // VFWREDOSUM ara_req_d.op = ara_pkg::VFWREDOSUM; @@ -2059,7 +2110,8 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( ara_req_d.conversion_vs1 = OpQueueReductionZExt; ara_req_d.conversion_vs2 = OpQueueConversionWideFP2; ara_req_d.eew_vs1 = vtype_q.vsew.next(); - ara_req_d.cvt_resize = resize_e'(2'b00); + ara_req_d.cvt_resize = CVT_WIDE; + ara_req_d.special_hazard = 1'b1; end 6'b110100: begin // VFWADD.W ara_req_d.op = ara_pkg::VFADD; @@ -2069,6 +2121,8 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( 
ara_req_d.vtype.vsew = vtype_q.vsew.next(); ara_req_d.eew_vs2 = vtype_q.vsew.next(); ara_req_d.conversion_vs1 = OpQueueConversionWideFP2; + ara_req_d.cvt_resize = CVT_WIDE; + ara_req_d.special_hazard = 1'b1; end 6'b110110: begin // VFWSUB.W ara_req_d.op = ara_pkg::VFSUB; @@ -2078,6 +2132,8 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( ara_req_d.vtype.vsew = vtype_q.vsew.next(); ara_req_d.eew_vs2 = vtype_q.vsew.next(); ara_req_d.conversion_vs1 = OpQueueConversionWideFP2; + ara_req_d.cvt_resize = CVT_WIDE; + ara_req_d.special_hazard = 1'b1; end 6'b111000: begin // VFWMUL ara_req_d.op = ara_pkg::VFMUL; @@ -2085,6 +2141,8 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( ara_req_d.vtype.vsew = vtype_q.vsew.next(); ara_req_d.conversion_vs1 = OpQueueConversionWideFP2; ara_req_d.conversion_vs2 = OpQueueConversionWideFP2; + ara_req_d.cvt_resize = CVT_WIDE; + ara_req_d.special_hazard = 1'b1; end 6'b111100: begin // VFWMACC ara_req_d.op = ara_pkg::VFMACC; @@ -2094,6 +2152,8 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( ara_req_d.conversion_vs1 = OpQueueConversionWideFP2; ara_req_d.conversion_vs2 = OpQueueConversionWideFP2; ara_req_d.eew_vd_op = vtype_q.vsew.next(); + ara_req_d.cvt_resize = CVT_WIDE; + ara_req_d.special_hazard = 1'b1; end 6'b111101: begin // VFWNMACC ara_req_d.op = ara_pkg::VFNMACC; @@ -2103,6 +2163,8 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( ara_req_d.conversion_vs1 = OpQueueConversionWideFP2; ara_req_d.conversion_vs2 = OpQueueConversionWideFP2; ara_req_d.eew_vd_op = vtype_q.vsew.next(); + ara_req_d.cvt_resize = CVT_WIDE; + ara_req_d.special_hazard = 1'b1; end 6'b111110: begin // VFWMSAC ara_req_d.op = ara_pkg::VFMSAC; @@ -2112,6 +2174,8 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( ara_req_d.conversion_vs1 = OpQueueConversionWideFP2; ara_req_d.conversion_vs2 = OpQueueConversionWideFP2; ara_req_d.eew_vd_op = vtype_q.vsew.next(); + ara_req_d.cvt_resize = 
CVT_WIDE; + ara_req_d.special_hazard = 1'b1; end 6'b111111: begin // VFWNMSAC ara_req_d.op = ara_pkg::VFNMSAC; @@ -2121,6 +2185,8 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( ara_req_d.conversion_vs1 = OpQueueConversionWideFP2; ara_req_d.conversion_vs2 = OpQueueConversionWideFP2; ara_req_d.eew_vd_op = vtype_q.vsew.next(); + ara_req_d.cvt_resize = CVT_WIDE; + ara_req_d.special_hazard = 1'b1; end default: illegal_insn = 1'b1; endcase @@ -2217,6 +2283,8 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( ara_req_d.eew_vs2 = vtype_q.vsew; // Request will need reshuffling ara_req_d.scale_vl = 1'b1; + // Special hazard handling for this instruction + ara_req_d.special_hazard = 1'b1; // If stride > vl, the vslideup has no effects if (|ara_req_d.stride[$bits(ara_req_d.stride)-1:$bits(vl_q)] || (vlen_t'(ara_req_d.stride) >= vl_q)) null_vslideup = 1'b1; @@ -2224,9 +2292,11 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( 6'b001111: begin // vfslide1down ara_req_d.op = ara_pkg::VSLIDEDOWN; ara_req_d.stride = 1; - ara_req_d.eew_vs2 = vtype_q.vsew; - // Request will need reshuffling - ara_req_d.scale_vl = 1'b1; + ara_req_d.eew_vs2 = vtype_q.vsew; + // Request will need reshuffling + ara_req_d.scale_vl = 1'b1; + // Special hazard handling for this instruction + ara_req_d.special_hazard = 1'b1; end 6'b010000: begin // VRFUNARY0 // vmv.s.f diff --git a/hardware/src/ara_sequencer.sv b/hardware/src/ara_sequencer.sv index 8355a97de..539e3d2b3 100644 --- a/hardware/src/ara_sequencer.sv +++ b/hardware/src/ara_sequencer.sv @@ -361,6 +361,7 @@ module ara_sequencer import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg::i fp_rm : ara_req_i.fp_rm, wide_fp_imm : ara_req_i.wide_fp_imm, cvt_resize : ara_req_i.cvt_resize, + special_hazard: ara_req_i.special_hazard, scale_vl : ara_req_i.scale_vl, vl : ara_req_i.vl, vstart : ara_req_i.vstart, @@ -384,8 +385,10 @@ module ara_sequencer import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg::i 
if ((!(|{ara_req_i.use_vs1, ara_req_i.use_vs2, ara_req_i.use_vd_op, !ara_req_i.vm}) && |{pe_req_d.hazard_vs1, pe_req_d.hazard_vs2, pe_req_d.hazard_vm, pe_req_d.hazard_vd} && !(is_load(pe_req_d.op))) || - (pe_req_d.op == VSLIDEUP && |{pe_req_d.hazard_vd, pe_req_d.hazard_vs1, pe_req_d.hazard_vs2}) || - (pe_req_d.op == VSLIDEDOWN && |{pe_req_d.hazard_vs1, pe_req_d.hazard_vs2})) + (pe_req_d.op == VSLIDEUP && !pe_req_d.use_scalar_op && + |{pe_req_d.hazard_vd, pe_req_d.hazard_vs1, pe_req_d.hazard_vs2}) || + (pe_req_d.op == VSLIDEDOWN && !pe_req_d.use_scalar_op && + |{pe_req_d.hazard_vs1, pe_req_d.hazard_vs2})) begin ara_req_ready_o = 1'b0; pe_req_valid_d = 1'b0; diff --git a/hardware/src/lane/lane_sequencer.sv b/hardware/src/lane/lane_sequencer.sv index 722bab7a5..cee688f18 100644 --- a/hardware/src/lane/lane_sequencer.sv +++ b/hardware/src/lane/lane_sequencer.sv @@ -240,42 +240,44 @@ module lane_sequencer import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg:: unique case (pe_req.vfu) VFU_Alu: begin operand_request_i[AluA] = '{ - id : pe_req.id, - vs : pe_req.vs1, - eew : pe_req.eew_vs1, + id : pe_req.id, + vs : pe_req.vs1, + eew : pe_req.eew_vs1, // If reductions and vl == 0, we must replace with neutral values - conv : (vfu_operation_d.vl == '0) ? OpQueueReductionZExt : pe_req.conversion_vs1, - scale_vl : pe_req.scale_vl, - cvt_resize : pe_req.cvt_resize, - vtype : pe_req.vtype, + conv : (vfu_operation_d.vl == '0) ? OpQueueReductionZExt : pe_req.conversion_vs1, + scale_vl : pe_req.scale_vl, + cvt_resize : pe_req.cvt_resize, + special_hazard : pe_req.special_hazard, + vtype : pe_req.vtype, // In case of reduction, AluA opqueue will keep the scalar element - vl : (pe_req.op inside {[VREDSUM:VWREDSUM]}) ? 1 : vfu_operation_d.vl, - vstart : vfu_operation_d.vstart, - hazard : pe_req.hazard_vs1 | pe_req.hazard_vd, - is_reduct : pe_req.op inside {[VREDSUM:VWREDSUM]} ? 1'b1 : 0, - target_fu : ALU_SLDU, - default : '0 + vl : (pe_req.op inside {[VREDSUM:VWREDSUM]}) ? 
1 : vfu_operation_d.vl, + vstart : vfu_operation_d.vstart, + hazard : pe_req.hazard_vs1 | pe_req.hazard_vd, + is_reduct : pe_req.op inside {[VREDSUM:VWREDSUM]} ? 1'b1 : 0, + target_fu : ALU_SLDU, + default : '0 }; operand_request_push[AluA] = pe_req.use_vs1; operand_request_i[AluB] = '{ - id : pe_req.id, - vs : pe_req.vs2, - eew : pe_req.eew_vs2, + id : pe_req.id, + vs : pe_req.vs2, + eew : pe_req.eew_vs2, // If reductions and vl == 0, we must replace with neutral values - conv : (vfu_operation_d.vl == '0) ? OpQueueReductionZExt : pe_req.conversion_vs2, - scale_vl : pe_req.scale_vl, - cvt_resize : pe_req.cvt_resize, - vtype : pe_req.vtype, + conv : (vfu_operation_d.vl == '0) ? OpQueueReductionZExt : pe_req.conversion_vs2, + scale_vl : pe_req.scale_vl, + cvt_resize : pe_req.cvt_resize, + special_hazard : pe_req.special_hazard, + vtype : pe_req.vtype, // If reductions and vl == 0, we must replace the operands with neutral // values in the opqueues. So, vl must be 1 at least - vl : (pe_req.op inside {[VREDSUM:VWREDSUM]} && vfu_operation_d.vl == '0) - ? 1 : vfu_operation_d.vl, - vstart : vfu_operation_d.vstart, - hazard : pe_req.hazard_vs2 | pe_req.hazard_vd, - is_reduct : pe_req.op inside {[VREDSUM:VWREDSUM]} ? 1'b1 : 0, - target_fu : ALU_SLDU, - default : '0 + vl : (pe_req.op inside {[VREDSUM:VWREDSUM]} && vfu_operation_d.vl == '0) + ? 1 : vfu_operation_d.vl, + vstart : vfu_operation_d.vstart, + hazard : pe_req.hazard_vs2 | pe_req.hazard_vd, + is_reduct : pe_req.op inside {[VREDSUM:VWREDSUM]} ? 
1'b1 : 0, + target_fu : ALU_SLDU, + default : '0 }; operand_request_push[AluB] = pe_req.use_vs2; @@ -298,66 +300,69 @@ module lane_sequencer import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg:: end VFU_MFpu: begin operand_request_i[MulFPUA] = '{ - id : pe_req.id, - vs : pe_req.vs1, - eew : pe_req.eew_vs1, + id : pe_req.id, + vs : pe_req.vs1, + eew : pe_req.eew_vs1, // If reductions and vl == 0, we must replace with neutral values - conv : pe_req.conversion_vs1, - scale_vl : pe_req.scale_vl, - cvt_resize : pe_req.cvt_resize, - vtype : pe_req.vtype, + conv : pe_req.conversion_vs1, + scale_vl : pe_req.scale_vl, + cvt_resize : pe_req.cvt_resize, + special_hazard : pe_req.special_hazard, + vtype : pe_req.vtype, // If reductions and vl == 0, we must replace the operands with neutral // values in the opqueues. So, vl must be 1 at least - vl : (pe_req.op inside {[VFREDUSUM:VFWREDOSUM]}) ? 1 : vfu_operation_d.vl, - vstart : vfu_operation_d.vstart, - hazard : pe_req.hazard_vs1 | pe_req.hazard_vd, - is_reduct : pe_req.op inside {[VFREDUSUM:VFWREDOSUM]} ? 1'b1 : 0, - target_fu : MFPU_ADDRGEN, - default : '0 + vl : (pe_req.op inside {[VFREDUSUM:VFWREDOSUM]}) ? 1 : vfu_operation_d.vl, + vstart : vfu_operation_d.vstart, + hazard : pe_req.hazard_vs1 | pe_req.hazard_vd, + is_reduct : pe_req.op inside {[VFREDUSUM:VFWREDOSUM]} ? 1'b1 : 0, + target_fu : MFPU_ADDRGEN, + default : '0 }; operand_request_push[MulFPUA] = pe_req.use_vs1; operand_request_i[MulFPUB] = '{ - id : pe_req.id, - vs : pe_req.swap_vs2_vd_op ? pe_req.vd : pe_req.vs2, - eew : pe_req.swap_vs2_vd_op ? pe_req.eew_vd_op : pe_req.eew_vs2, + id : pe_req.id, + vs : pe_req.swap_vs2_vd_op ? pe_req.vd : pe_req.vs2, + eew : pe_req.swap_vs2_vd_op ? 
pe_req.eew_vd_op : pe_req.eew_vs2, // If reductions and vl == 0, we must replace with neutral values - conv : pe_req.conversion_vs2, - scale_vl : pe_req.scale_vl, - cvt_resize : pe_req.cvt_resize, - vtype : pe_req.vtype, + conv : pe_req.conversion_vs2, + scale_vl : pe_req.scale_vl, + cvt_resize : pe_req.cvt_resize, + special_hazard : pe_req.special_hazard, + vtype : pe_req.vtype, // If reductions and vl == 0, we must replace the operands with neutral // values in the opqueues. So, vl must be 1 at least - vl : (pe_req.op inside {[VFREDUSUM:VFWREDOSUM]} && vfu_operation_d.vl == '0) - ? 1 : vfu_operation_d.vl, - vstart : vfu_operation_d.vstart, - hazard : (pe_req.swap_vs2_vd_op ? + vl : (pe_req.op inside {[VFREDUSUM:VFWREDOSUM]} && vfu_operation_d.vl == '0) + ? 1 : vfu_operation_d.vl, + vstart : vfu_operation_d.vstart, + hazard : (pe_req.swap_vs2_vd_op ? pe_req.hazard_vd : (pe_req.hazard_vs2 | pe_req.hazard_vd)), - is_reduct : pe_req.op inside {[VFREDUSUM:VFWREDOSUM]} ? 1'b1 : 0, - target_fu : MFPU_ADDRGEN, - default: '0 + is_reduct : pe_req.op inside {[VFREDUSUM:VFWREDOSUM]} ? 1'b1 : 0, + target_fu : MFPU_ADDRGEN, + default : '0 }; operand_request_push[MulFPUB] = pe_req.swap_vs2_vd_op ? pe_req.use_vd_op : pe_req.use_vs2; operand_request_i[MulFPUC] = '{ - id : pe_req.id, - vs : pe_req.swap_vs2_vd_op ? pe_req.vs2 : pe_req.vd, - eew : pe_req.swap_vs2_vd_op ? pe_req.eew_vs2 : pe_req.eew_vd_op, - conv : pe_req.swap_vs2_vd_op ? pe_req.conversion_vs2 : OpQueueConversionNone, - scale_vl : pe_req.scale_vl, - cvt_resize : pe_req.cvt_resize, + id : pe_req.id, + vs : pe_req.swap_vs2_vd_op ? pe_req.vs2 : pe_req.vd, + eew : pe_req.swap_vs2_vd_op ? pe_req.eew_vs2 : pe_req.eew_vd_op, + conv : pe_req.swap_vs2_vd_op ? pe_req.conversion_vs2 : OpQueueConversionNone, + scale_vl : pe_req.scale_vl, + cvt_resize : pe_req.cvt_resize, + special_hazard : pe_req.special_hazard, // If reductions and vl == 0, we must replace the operands with neutral // values in the opqueues. 
So, vl must be 1 at least - vl : (pe_req.op inside {[VFREDUSUM:VFWREDOSUM]} && vfu_operation_d.vl == '0) - ? 1 : vfu_operation_d.vl, - vstart : vfu_operation_d.vstart, - vtype : pe_req.vtype, - hazard : pe_req.swap_vs2_vd_op ? + vl : (pe_req.op inside {[VFREDUSUM:VFWREDOSUM]} && vfu_operation_d.vl == '0) + ? 1 : vfu_operation_d.vl, + vstart : vfu_operation_d.vstart, + vtype : pe_req.vtype, + hazard : pe_req.swap_vs2_vd_op ? (pe_req.hazard_vs2 | pe_req.hazard_vd) : pe_req.hazard_vd, - is_reduct : pe_req.op inside {[VFREDUSUM:VFWREDOSUM]} ? 1'b1 : 0, - target_fu : MFPU_ADDRGEN, - default : '0 + is_reduct : pe_req.op inside {[VFREDUSUM:VFWREDOSUM]} ? 1'b1 : 0, + target_fu : MFPU_ADDRGEN, + default : '0 }; operand_request_push[MulFPUC] = pe_req.swap_vs2_vd_op ? pe_req.use_vs2 : pe_req.use_vd_op; @@ -399,17 +404,18 @@ module lane_sequencer import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg:: // Load indexed operand_request_i[SlideAddrGenA] = '{ - id : pe_req_i.id, - vs : pe_req_i.vs2, - eew : pe_req_i.eew_vs2, - conv : pe_req_i.conversion_vs2, - target_fu: MFPU_ADDRGEN, - vl : pe_req_i.vl / NrLanes, - scale_vl : pe_req_i.scale_vl, - vstart : vfu_operation_d.vstart, - vtype : pe_req_i.vtype, - hazard : pe_req_i.hazard_vs2 | pe_req_i.hazard_vd, - default : '0 + id : pe_req_i.id, + vs : pe_req_i.vs2, + eew : pe_req_i.eew_vs2, + conv : pe_req_i.conversion_vs2, + target_fu : MFPU_ADDRGEN, + special_hazard : pe_req.special_hazard, + vl : pe_req_i.vl / NrLanes, + scale_vl : pe_req_i.scale_vl, + vstart : vfu_operation_d.vstart, + vtype : pe_req_i.vtype, + hazard : pe_req_i.hazard_vs2 | pe_req_i.hazard_vd, + default : '0 }; // Since this request goes outside of the lane, we might need to request an // extra operand regardless of whether it is valid in this lane or not. 
@@ -455,17 +461,18 @@ module lane_sequencer import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg:: // Store indexed operand_request_i[SlideAddrGenA] = '{ - id : pe_req_i.id, - vs : pe_req_i.vs2, - eew : pe_req_i.eew_vs2, - conv : pe_req_i.conversion_vs2, - target_fu: MFPU_ADDRGEN, - vl : pe_req_i.vl / NrLanes, - scale_vl : pe_req_i.scale_vl, - vstart : vfu_operation_d.vstart, - vtype : pe_req_i.vtype, - hazard : pe_req_i.hazard_vs2 | pe_req_i.hazard_vd, - default : '0 + id : pe_req_i.id, + vs : pe_req_i.vs2, + eew : pe_req_i.eew_vs2, + conv : pe_req_i.conversion_vs2, + target_fu : MFPU_ADDRGEN, + special_hazard : pe_req.special_hazard, + vl : pe_req_i.vl / NrLanes, + scale_vl : pe_req_i.scale_vl, + vstart : vfu_operation_d.vstart, + vtype : pe_req_i.vtype, + hazard : pe_req_i.hazard_vs2 | pe_req_i.hazard_vd, + default : '0 }; // Since this request goes outside of the lane, we might need to request an // extra operand regardless of whether it is valid in this lane or not. @@ -476,16 +483,17 @@ module lane_sequencer import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg:: VFU_SlideUnit: begin operand_request_i[SlideAddrGenA] = '{ - id : pe_req.id, - vs : pe_req.vs2, - eew : pe_req.eew_vs2, - conv : pe_req.conversion_vs2, - target_fu: ALU_SLDU, - scale_vl : pe_req.scale_vl, - vtype : pe_req.vtype, - vstart : vfu_operation_d.vstart, - hazard : pe_req.hazard_vs2 | pe_req.hazard_vd, - default : '0 + id : pe_req.id, + vs : pe_req.vs2, + eew : pe_req.eew_vs2, + conv : pe_req.conversion_vs2, + target_fu : ALU_SLDU, + special_hazard : pe_req.special_hazard, + scale_vl : pe_req.scale_vl, + vtype : pe_req.vtype, + vstart : vfu_operation_d.vstart, + hazard : pe_req.hazard_vs2 | pe_req.hazard_vd, + default : '0 }; operand_request_push[SlideAddrGenA] = pe_req.use_vs2; diff --git a/hardware/src/lane/operand_requester.sv b/hardware/src/lane/operand_requester.sv index ba9895268..97ce49ad1 100644 --- a/hardware/src/lane/operand_requester.sv +++ 
b/hardware/src/lane/operand_requester.sv @@ -255,15 +255,27 @@ module operand_requester import ara_pkg::*; import rvv_pkg::*; #( // In case of a WAW with a previous instruction, // read once every two writes of the previous instruction logic is_widening; + // Does this instruction have a special hazard protocol? + logic special_hazard; // One-bit counters logic [NrVInsn-1:0] waw_hazard_counter; } requester_d, requester_q; + // Asserted if the SLDU requester is registering a new instruction + logic new_sldu_insn; + logic has_stalled_d, has_stalled_q; // Is there a hazard during this cycle? + // WAW with widening instructions are special: wait for 2 writes instead of 1 + // Slide1Up/Down with hazards should wait one cycle before being handled normally logic stall; - assign stall = |(requester_q.hazard & ~(vinsn_result_written_q & - (~{NrVInsn{requester_q.is_widening}} | requester_q.waw_hazard_counter))); + assign stall = |(requester_q.hazard & ~(vinsn_result_written_q & ((~{NrVInsn{requester_q.is_widening}} & + requester_q.special_hazard) | requester_q.waw_hazard_counter))) | + (~has_stalled_q & requester_q.special_hazard & |requester_q.hazard); + + // For every instruction, it signals if the requester has already stalled once + // This is needed for vslide1x stall handling + assign has_stalled_d = new_sldu_insn ? 1'b0 : (stall ? 1'b1 : has_stalled_q); // Did we get a grant? 
logic [NrBanks-1:0] operand_requester_gnt; @@ -279,6 +291,8 @@ module operand_requester import ara_pkg::*; import rvv_pkg::*; #( state_d = state_q; requester_d = requester_q; + new_sldu_insn = 1'b0; + // Make no requests to the VRF operand_payload[requester] = '0; for (int bank = 0; bank < NrBanks; bank++) operand_req[bank][requester] = 1'b0; @@ -298,6 +312,10 @@ module operand_requester import ara_pkg::*; import rvv_pkg::*; #( // Acknowledge the request operand_request_ready_o[requester] = 1'b1; + // New slide unit instruction incoming + if (requester == (NrOperandQueues + VFU_SlideUnit)) + new_sldu_insn = 1'b1; + // Send a command to the operand queue operand_queue_cmd_o[requester] = '{ eew : operand_request_i[requester].eew, @@ -322,23 +340,25 @@ module operand_requester import ara_pkg::*; import rvv_pkg::*; #( // Store the request requester_d = '{ - id : operand_request_i[requester].id, - addr : vaddr(operand_request_i[requester].vs, NrLanes) + - (operand_request_i[requester].vstart >> - (int'(EW64) - int'(operand_request_i[requester].eew))), - vs : operand_request_i[requester].vs[idx_width(NrBanks)-1:0], + id : operand_request_i[requester].id, + addr : vaddr(operand_request_i[requester].vs, NrLanes) + + (operand_request_i[requester].vstart >> + (int'(EW64) - int'(operand_request_i[requester].eew))), + vs : operand_request_i[requester].vs[idx_width(NrBanks)-1:0], // For memory operations, the number of elements initially refers to the new EEW (vsew here), // but the requester must refer to the old EEW (eew here) // This reasoning cannot be applied also to widening instructions, which modify vsew // treating it as the EEW of vd - len : (operand_request_i[requester].scale_vl) ? 
- ((operand_request_i[requester].vl << - operand_request_i[requester].vtype.vsew) >> - operand_request_i[requester].eew) : - operand_request_i[requester].vl, - vew : operand_request_i[requester].eew, - hazard : operand_request_i[requester].hazard, - is_widening : operand_request_i[requester].cvt_resize == CVT_WIDE, + len : (operand_request_i[requester].scale_vl) ? + ((operand_request_i[requester].vl << + operand_request_i[requester].vtype.vsew) >> + operand_request_i[requester].eew) : + operand_request_i[requester].vl, + vew : operand_request_i[requester].eew, + hazard : operand_request_i[requester].hazard, + is_widening : operand_request_i[requester].cvt_resize == CVT_WIDE && + operand_request_i[requester].special_hazard, + special_hazard : operand_request_i[requester].special_hazard, default: '0 }; // The length should be at least one after the rescaling @@ -392,6 +412,10 @@ module operand_requester import ara_pkg::*; import rvv_pkg::*; #( // Acknowledge the request operand_request_ready_o[requester] = 1'b1; + // New slide unit instruction incoming + if (requester == (NrOperandQueues + VFU_SlideUnit)) + new_sldu_insn = 1'b1; + // Send a command to the operand queue operand_queue_cmd_o[requester] = '{ eew : operand_request_i[requester].eew, @@ -412,19 +436,22 @@ module operand_requester import ara_pkg::*; import rvv_pkg::*; #( // Store the request requester_d = '{ - id : operand_request_i[requester].id, - addr : vaddr(operand_request_i[requester].vs, NrLanes) + - (operand_request_i[requester].vstart >> - (int'(EW64) - int'(operand_request_i[requester].eew))), - vs : operand_request_i[requester].vs[idx_width(NrBanks)-1:0], - len : (operand_request_i[requester].scale_vl) ? 
- ((operand_request_i[requester].vl << - operand_request_i[requester].vtype.vsew) >> - operand_request_i[requester].eew) : - operand_request_i[requester].vl, - vew : operand_request_i[requester].eew, - hazard : operand_request_i[requester].hazard, - default: '0 + id : operand_request_i[requester].id, + addr : vaddr(operand_request_i[requester].vs, NrLanes) + + (operand_request_i[requester].vstart >> + (int'(EW64) - int'(operand_request_i[requester].eew))), + vs : operand_request_i[requester].vs[idx_width(NrBanks)-1:0], + len : (operand_request_i[requester].scale_vl) ? + ((operand_request_i[requester].vl << + operand_request_i[requester].vtype.vsew) >> + operand_request_i[requester].eew) : + operand_request_i[requester].vl, + vew : operand_request_i[requester].eew, + hazard : operand_request_i[requester].hazard, + is_widening : operand_request_i[requester].cvt_resize == CVT_WIDE && + operand_request_i[requester].special_hazard, + special_hazard : operand_request_i[requester].special_hazard, + default : '0 }; // The length should be at least one after the rescaling if (requester_d.len == '0) @@ -440,11 +467,13 @@ module operand_requester import ara_pkg::*; import rvv_pkg::*; #( always_ff @(posedge clk_i or negedge rst_ni) begin if (!rst_ni) begin - state_q <= IDLE; - requester_q <= '0; + state_q <= IDLE; + requester_q <= '0; + has_stalled_q <= 1'b0; end else begin - state_q <= state_d; - requester_q <= requester_d; + state_q <= state_d; + requester_q <= requester_d; + has_stalled_q <= has_stalled_d; end end end : gen_operand_requester From fd801c4768e8d819fcc302438694cc9c638b71df Mon Sep 17 00:00:00 2001 From: Matteo Perotti Date: Mon, 5 Dec 2022 20:38:22 +0100 Subject: [PATCH 6/8] [CHANGELOG] Update Changelog --- CHANGELOG.md | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 529361b68..40ca1055c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -62,6 +62,7 @@ and this project adheres to [Semantic 
Versioning](http://semver.org/spec/v2.0.0. - `VLXE` and `VSXE` need to wait that the SlideAddrGenA opreq is free before being issued by the lane sequencer to the operand requester stage - Do not trap instructions with no operands in the main sequencer - Commit a reduction only after a grant from the VRF + - Decouple `cmdBuffer` and `dataBuffer` depth parameters in the operand queues ### Added @@ -104,6 +105,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0. - Add support for vector mask population count and find first set bit instructions: `vcpop.m`, `vfirst.m` - Add Spyglass linting script - Add parametrized support for Fixed-Point math + - Add support for Barber's Pole VRF Layout ### Changed @@ -134,6 +136,9 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0. - Adapt `fdotproduct` to `dotproduct` structure - Pre-calculate next-cycle `aligned_start_address` in `addrgen` for timing reasons - Add `is_reduct` signal to the operand queues, to gate the neutral value filling + - Handle WAW and WAR `vload` hazards in the `VLDU` without stalling the main sequencer + - Reductions are no longer treated as widening instructions with respect to WAW hazards in the operand requesters + - `slide1x` instructions are no longer stalled in the main sequencer; their hazards are handled downstream ## 2.2.0 - 2021-11-02 From a19965474412871fda95a0643948ec76517b9655 Mon Sep 17 00:00:00 2001 From: Matteo Perotti Date: Mon, 12 Dec 2022 15:40:27 +0100 Subject: [PATCH 7/8] [hardware] :bug: `vstart` should consider Barber's Pole layout --- hardware/src/lane/operand_requester.sv | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/hardware/src/lane/operand_requester.sv b/hardware/src/lane/operand_requester.sv index 97ce49ad1..3e85a58b3 100644 --- a/hardware/src/lane/operand_requester.sv +++ b/hardware/src/lane/operand_requester.sv @@ -341,9 +341,9 @@ module operand_requester import ara_pkg::*; import
rvv_pkg::*; #( // Store the request requester_d = '{ id : operand_request_i[requester].id, - addr : vaddr(operand_request_i[requester].vs, NrLanes) + - (operand_request_i[requester].vstart >> - (int'(EW64) - int'(operand_request_i[requester].eew))), + addr : vaddr_offset(vaddr(operand_request_i[requester].vs, NrLanes), + vaddr_t'(operand_request_i[requester].vstart >> + (int'(EW64) - int'(operand_request_i[requester].eew))), operand_request_i[requester].vs), vs : operand_request_i[requester].vs[idx_width(NrBanks)-1:0], // For memory operations, the number of elements initially refers to the new EEW (vsew here), // but the requester must refer to the old EEW (eew here) @@ -437,9 +437,9 @@ module operand_requester import ara_pkg::*; import rvv_pkg::*; #( // Store the request requester_d = '{ id : operand_request_i[requester].id, - addr : vaddr(operand_request_i[requester].vs, NrLanes) + - (operand_request_i[requester].vstart >> - (int'(EW64) - int'(operand_request_i[requester].eew))), + addr : vaddr_offset(vaddr(operand_request_i[requester].vs, NrLanes), + vaddr_t'(operand_request_i[requester].vstart >> + (int'(EW64) - int'(operand_request_i[requester].eew))), operand_request_i[requester].vs), vs : operand_request_i[requester].vs[idx_width(NrBanks)-1:0], len : (operand_request_i[requester].scale_vl) ? 
((operand_request_i[requester].vl << From 12e5768ce302bba2c5d82d42deceb590ff3f013f Mon Sep 17 00:00:00 2001 From: Matteo Perotti Date: Thu, 23 Mar 2023 13:42:46 +0100 Subject: [PATCH 8/8] DEBUG: retrigger the CI --- hardware/src/ara.sv | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/hardware/src/ara.sv b/hardware/src/ara.sv index 350806979..4bc110a5d 100644 --- a/hardware/src/ara.sv +++ b/hardware/src/ara.sv @@ -242,9 +242,9 @@ module ara import ara_pkg::*; #( for (genvar lane = 0; lane < NrLanes; lane++) begin: gen_lanes lane #( - .NrLanes (NrLanes ), - .FPUSupport (FPUSupport ), - .FixPtSupport(FixPtSupport) + .NrLanes (NrLanes ), + .FPUSupport (FPUSupport ), + .FixPtSupport (FixPtSupport ) ) i_lane ( .clk_i (clk_i ), .rst_ni (rst_ni ),