diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 6c5561823..d30a7f4af 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -26,7 +26,7 @@ jobs: ##################### tc-llvm: - runs-on: ubuntu-22.04 + runs-on: ubuntu-latest steps: - uses: actions/checkout@v6 - name: Recover the submodule commit hash @@ -82,7 +82,7 @@ jobs: path: tc-llvm.tar tc-gcc: - runs-on: ubuntu-22.04 + runs-on: ubuntu-latest steps: - uses: actions/checkout@v6 - name: Recover the submodule commit hash @@ -134,7 +134,7 @@ jobs: path: tc-gcc.tar tc-isa-sim: - runs-on: ubuntu-22.04 + runs-on: ubuntu-latest steps: - uses: actions/checkout@v6 - name: Recover the submodule commit hash @@ -169,7 +169,7 @@ jobs: path: tc-isa-sim.tar tc-verilator: - runs-on: ubuntu-22.04 + runs-on: ubuntu-latest steps: - uses: actions/checkout@v6 - name: Recover the submodule commit hash @@ -208,8 +208,35 @@ jobs: # Compile stage # ################### + compile-slang: + runs-on: ubuntu-latest + permissions: + contents: read + checks: write + pull-requests: write + steps: + - name: Checkout + uses: actions/checkout@v6 + with: + submodules: recursive + - name: Download RTL submodules + run: | + make -C hardware checkout + make -C hardware apply-patches + - name: Generate flist + run: make -C hardware spyglass/tmp/files + - name: Run slang + uses: pulp-platform/pulp-actions/slang@v2.5.0 # update version as needed, not autoupdated + with: + token: ${{ secrets.GITHUB_TOKEN }} + slang-flags: >- + -f hardware/spyglass/tmp/files --top ara_soc_wrap + -Wextra -Wno-width-trunc -Wno-case-redundant-default -Wno-case-enum -Wno-incomplete-return -Wno-dup-import + --ignore-unknown-modules + --suppress-warnings .bender/...,hardware/deps/... + compile-apps: - runs-on: ubuntu-22.04 + runs-on: ubuntu-latest strategy: max-parallel: 1 matrix: @@ -241,7 +268,7 @@ jobs: path: apps/bin compile-riscv-tests: - runs-on: ubuntu-22.04 + runs-on: ubuntu-latest strategy: max-parallel: 1 matrix: @@ -280,7 +307,7 @@ jobs: path: apps/bin compile-ara: - runs-on: ubuntu-22.04 + runs-on: ubuntu-latest strategy: max-parallel: 1 matrix: @@ -314,7 +341,7 @@ jobs: #################### simulate: - runs-on: ubuntu-22.04 + runs-on: ubuntu-latest strategy: max-parallel: 2 matrix: @@ -342,7 +369,7 @@ jobs: ######################## riscv-tests-simv: - runs-on: ubuntu-22.04 + runs-on: ubuntu-latest strategy: max-parallel: 1 matrix: @@ -365,7 +392,7 @@ jobs: run: config=${{ matrix.ara_config }} make -C hardware -j8 riscv_tests_simv riscv-tests-spike: - runs-on: ubuntu-22.04 + runs-on: ubuntu-latest needs: ["tc-isa-sim", "compile-riscv-tests"] steps: - uses: actions/checkout@v6 @@ -404,7 +431,7 @@ jobs: ################### check-license: - runs-on: ubuntu-22.04 + runs-on: ubuntu-latest steps: - uses: actions/checkout@v6 - uses: actions/setup-python@v6 @@ -416,7 +443,7 @@ jobs: run: python scripts/licence-checker.py --config scripts/licence-checker.hjson hardware check-clang-format: - runs-on: ubuntu-22.04 + runs-on: ubuntu-latest needs: ['tc-llvm'] steps: - uses: actions/checkout@v6 @@ -442,7 +469,7 @@ jobs: exit $EXIT_STATUS check-trailing-whitespaces: - runs-on: ubuntu-22.04 + runs-on: ubuntu-latest steps: - uses: actions/checkout@v6 with: @@ -470,7 +497,7 @@ jobs: ##################### benchmark: - runs-on: ubuntu-22.04 + runs-on: ubuntu-latest strategy: max-parallel: 1 matrix: @@ -516,7 +543,7 @@ jobs: path: benchmarks-${{ matrix.ara_config }}.tar roofline: - runs-on: ubuntu-22.04 + runs-on: ubuntu-latest needs: benchmark steps: - uses: actions/checkout@v6 @@ -672,7 +699,7 @@ jobs: #################### clean-up: - runs-on: ubuntu-22.04 + runs-on: ubuntu-latest if: always() needs: ["simulate", "riscv-tests-spike", "riscv-tests-simv"] steps: @@ -688,7 +715,7 @@ jobs: riscv-tests-spike clean-up-compile-runs: - runs-on: ubuntu-22.04 + runs-on: ubuntu-latest strategy: max-parallel: 1 matrix: diff --git a/hardware/spyglass/src/ara_soc_wrap.sv b/hardware/spyglass/src/ara_soc_wrap.sv index 988be30c3..e4660ea78 100644 --- a/hardware/spyglass/src/ara_soc_wrap.sv +++ b/hardware/spyglass/src/ara_soc_wrap.sv @@ -17,8 +17,6 @@ module ara_soc_wrap ( localparam int unsigned AxiUserWidth = 1; localparam int unsigned AxiIdWidth = 5; - logic clk_i, rst_ni; - ara_soc #( .NrLanes (NrLanes ), .VLEN (VLEN ), @@ -33,7 +31,15 @@ module ara_soc_wrap ( .scan_data_i (1'b0 ), .uart_prdata_i ('0 ), .uart_pready_i ('0 ), - .uart_pslverr_i('0 ) + .uart_pslverr_i('0 ), + .exit_o(), + .hw_cnt_en_o(), + .scan_data_o(), + .uart_penable_o(), + .uart_pwrite_o(), + .uart_paddr_o(), + .uart_psel_o(), + .uart_pwdata_o() ); endmodule diff --git a/hardware/src/ara.sv b/hardware/src/ara.sv index 4cb9a9506..7cff40045 100644 --- a/hardware/src/ara.sv +++ b/hardware/src/ara.sv @@ -491,7 +491,7 @@ module ara import ara_pkg::*; #( // Break path for acc_mmu_en. This signal can afford some additional latency // since vector mem ops take multiple cycles to reach the addrgen - `FF(acc_mmu_en_q, acc_mmu_en, '0, clk_i, rst_ni); + `FF(acc_mmu_en_q, acc_mmu_en, '0, clk_i, rst_ni) vlsu #( .NrLanes (NrLanes ), diff --git a/hardware/src/ara_sequencer.sv b/hardware/src/ara_sequencer.sv index 9cd0cc405..e7e7e3925 100644 --- a/hardware/src/ara_sequencer.sv +++ b/hardware/src/ara_sequencer.sv @@ -341,7 +341,7 @@ module ara_sequencer import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg::i logic running_mask_insn_d, running_mask_insn_q; logic lsu_current_burst_exception_q; - `FF(lsu_current_burst_exception_q, lsu_current_burst_exception_i, 1'b0, clk_i, rst_ni); + `FF(lsu_current_burst_exception_q, lsu_current_burst_exception_i, 1'b0, clk_i, rst_ni) // pe_req_ready_i comes from all the lanes // It is deasserted if the current request is stuck diff --git a/hardware/src/ara_soc.sv b/hardware/src/ara_soc.sv index 5ee64e17c..fb25622eb 100644 --- a/hardware/src/ara_soc.sv +++ b/hardware/src/ara_soc.sv @@ -259,7 +259,7 @@ module ara_soc import axi_pkg::*; import ara_pkg::*; #( `endif // One-cycle latency - `FF(l2_rvalid, l2_req, 1'b0); + `FF(l2_rvalid, l2_req, 1'b0) //////////// // UART // @@ -482,17 +482,17 @@ module ara_soc import axi_pkg::*; import ara_pkg::*; #( localparam config_pkg::cva6_cfg_t CVA6AraConfig = build_config_pkg::build_config(CVA6AraConfig_user); // Define the exception type - `CVA6_TYPEDEF_EXCEPTION(exception_t, CVA6AraConfig); + `CVA6_TYPEDEF_EXCEPTION(exception_t, CVA6AraConfig) // Standard interface - `CVA6_INTF_TYPEDEF_ACC_REQ(accelerator_req_t, CVA6AraConfig, fpnew_pkg::roundmode_e); - `CVA6_INTF_TYPEDEF_ACC_RESP(accelerator_resp_t, CVA6AraConfig, exception_t); + `CVA6_INTF_TYPEDEF_ACC_REQ(accelerator_req_t, CVA6AraConfig, fpnew_pkg::roundmode_e) + `CVA6_INTF_TYPEDEF_ACC_RESP(accelerator_resp_t, CVA6AraConfig, exception_t) // MMU interface - `CVA6_INTF_TYPEDEF_MMU_REQ(acc_mmu_req_t, CVA6AraConfig); - `CVA6_INTF_TYPEDEF_MMU_RESP(acc_mmu_resp_t, CVA6AraConfig, exception_t); + `CVA6_INTF_TYPEDEF_MMU_REQ(acc_mmu_req_t, CVA6AraConfig) + `CVA6_INTF_TYPEDEF_MMU_RESP(acc_mmu_resp_t, CVA6AraConfig, exception_t) // Accelerator - CVA6's top-level interface - `CVA6_INTF_TYPEDEF_CVA6_TO_ACC(cva6_to_acc_t, accelerator_req_t, acc_mmu_resp_t); - `CVA6_INTF_TYPEDEF_ACC_TO_CVA6(acc_to_cva6_t, accelerator_resp_t, acc_mmu_req_t); + `CVA6_INTF_TYPEDEF_CVA6_TO_ACC(cva6_to_acc_t, accelerator_req_t, acc_mmu_resp_t) + `CVA6_INTF_TYPEDEF_ACC_TO_CVA6(acc_to_cva6_t, accelerator_resp_t, acc_mmu_req_t) `ifndef TARGET_GATESIM ara_system #( diff --git a/hardware/src/ara_system.sv b/hardware/src/ara_system.sv index 4accbd77c..6ecd5b3e8 100644 --- a/hardware/src/ara_system.sv +++ b/hardware/src/ara_system.sv @@ -96,7 +96,7 @@ module ara_system import axi_pkg::*; import ara_pkg::*; #( // Support max 8 cores, for now logic [63:0] hart_id; - assign hart_id = {'0, hart_id_i}; + assign hart_id = 64'(hart_id_i); // Pack invalidation interface into acc interface acc_to_cva6_t acc_resp_pack; diff --git a/hardware/src/ctrl_registers.sv b/hardware/src/ctrl_registers.sv index 54eb9278c..397dbb98c 100644 --- a/hardware/src/ctrl_registers.sv +++ b/hardware/src/ctrl_registers.sv @@ -95,7 +95,7 @@ module ctrl_registers #( .reg_q_o ({hw_cnt_en, event_trigger, dram_end_address, dram_base_address, exit}) ); - `FF(wr_active_q, wr_active_d, '0); + `FF(wr_active_q, wr_active_d, '0) ///////////////// // Signals // @@ -105,6 +105,6 @@ module ctrl_registers #( assign event_trigger_o = event_trigger; assign dram_base_addr_o = dram_base_address; assign dram_end_addr_o = dram_end_address; - assign exit_o = {exit, logic'(|wr_active_q[7:0])}; + assign exit_o = {exit, |wr_active_q[7:0]}; endmodule : ctrl_registers diff --git a/hardware/src/lane/lane.sv b/hardware/src/lane/lane.sv index c17d87fad..7d01a541e 100644 --- a/hardware/src/lane/lane.sv +++ b/hardware/src/lane/lane.sv @@ -222,12 +222,12 @@ module lane import ara_pkg::*; import rvv_pkg::*; #( logic [NrVInsn-1:0] mfpu_vinsn_done; // Interface with the MaskB operand queue (VRGATHER/VCOMPRESS) logic mask_b_cmd_pop_d, mask_b_cmd_pop_q; - `FF(mask_b_cmd_pop_q, mask_b_cmd_pop_d, 1'b0, clk_i, rst_ni); + `FF(mask_b_cmd_pop_q, mask_b_cmd_pop_d, 1'b0, clk_i, rst_ni) // Support for store exception flush logic lsu_ex_flush_op_req_d, lsu_ex_flush_op_req_q; - `FF(lsu_ex_flush_op_req_q, lsu_ex_flush_op_req_d, 1'b0, clk_i, rst_ni); + `FF(lsu_ex_flush_op_req_q, lsu_ex_flush_op_req_d, 1'b0, clk_i, rst_ni) // Additional signals to please Verilator's hierarchical verilation pe_req_t pe_req; @@ -310,7 +310,7 @@ module lane import ara_pkg::*; import rvv_pkg::*; #( logic sldu_result_gnt_opqueues; // Support for store exception flush logic lsu_ex_flush_op_queues_d, lsu_ex_flush_op_queues_q; - `FF(lsu_ex_flush_op_queues_q, lsu_ex_flush_op_queues_d, 1'b0, clk_i, rst_ni); + `FF(lsu_ex_flush_op_queues_q, lsu_ex_flush_op_queues_d, 1'b0, clk_i, rst_ni) operand_requester #( .NrLanes (NrLanes ), @@ -609,8 +609,8 @@ module lane import ara_pkg::*; import rvv_pkg::*; #( ); // Break timing path - `FF(vfu_operation_valid_q, vfu_operation_valid, 1'b0, clk_i, rst_ni); - `FF(vfu_operation_op_q, vfu_operation.op, VADD, clk_i, rst_ni); + `FF(vfu_operation_valid_q, vfu_operation_valid, 1'b0, clk_i, rst_ni) + `FF(vfu_operation_op_q, vfu_operation.op, VADD, clk_i, rst_ni) always_comb begin sldu_addrgen_sel_d = SLDU_SEL; diff --git a/hardware/src/lane/lane_sequencer.sv b/hardware/src/lane/lane_sequencer.sv index a9fa44e32..ceeb9c8e1 100644 --- a/hardware/src/lane/lane_sequencer.sv +++ b/hardware/src/lane/lane_sequencer.sv @@ -52,7 +52,7 @@ module lane_sequencer import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg:: `include "common_cells/registers.svh" // STU exception support - `FF(lsu_ex_flush_o, lsu_ex_flush_i, 1'b0, clk_i, rst_ni); + `FF(lsu_ex_flush_o, lsu_ex_flush_i, 1'b0, clk_i, rst_ni) //////////////////////////// // Register the request // @@ -73,6 +73,12 @@ module lane_sequencer import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg:: logic pe_req_valid; logic pe_req_ready; + // VSLIDEDOWN stride helpers (kept at module scope to remain visible in waveforms) + // Extra elements to ask, because of the stride + logic [$clog2(8*NrLanes)-1:0] extra_stride; + // Need one bit more than vl, since we will also add the stride contribution + logic [$bits(pe_req.vl):0] vl_tot; + fall_through_register #( .T(pe_req_t) ) i_pe_req_register ( @@ -279,6 +285,10 @@ module lane_sequencer import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg:: operand_request = '0; operand_request_push = '0; + // Default the slide unit stride helpers (avoids inferred latches) + extra_stride = '0; + vl_tot = '0; + // Make no requests to the lane's VFUs vfu_operation_d = '0; vfu_operation_valid_d = 1'b0; @@ -662,11 +672,6 @@ module lane_sequencer import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg:: (pe_req.vl - pe_req.stride + NrLanes - 1) / NrLanes; end VSLIDEDOWN: begin - // Extra elements to ask, because of the stride - logic [$clog2(8*NrLanes)-1:0] extra_stride; - // Need one bit more than vl, since we will also add the stride contribution - logic [$bits(pe_req.vl):0] vl_tot; - // We need to trim full words from the start of the vector that are not used // as operands by the slide unit. operand_request[SlideAddrGenA].vstart = pe_req.stride / NrLanes; @@ -744,18 +749,16 @@ module lane_sequencer import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg:: // todo: // Mask logical and integer comparisons - operand_request[AluA] = '{ - id : pe_req.id, - vs : pe_req.vs1, - scale_vl: pe_req.scale_vl, - vtype : pe_req.vtype, - vstart : vfu_operation_d.vstart, - hazard : pe_req.hazard_vs1 | pe_req.hazard_vd, - target_fu : ALU_SLDU, - conv : OpQueueConversionNone, - cvt_resize: CVT_SAME, - default : '0 - }; + operand_request[AluA] = '0; + operand_request[AluA].id = pe_req.id; + operand_request[AluA].vs = pe_req.vs1; + operand_request[AluA].scale_vl = pe_req.scale_vl; + operand_request[AluA].vtype = pe_req.vtype; + operand_request[AluA].vstart = vfu_operation_d.vstart; + operand_request[AluA].hazard = pe_req.hazard_vs1 | pe_req.hazard_vd; + operand_request[AluA].target_fu = ALU_SLDU; + operand_request[AluA].conv = OpQueueConversionNone; + operand_request[AluA].cvt_resize = CVT_SAME; // Since this request goes outside of the lane, we might need to request an // extra operand regardless of whether it is valid in this lane or not. @@ -858,18 +861,16 @@ module lane_sequencer import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg:: // Vd register to provide correct mask undisturbed policy at bit-level // This is can be a mask or normal register - operand_request[MaskB] = '{ - id : pe_req.id, - vs : pe_req.vd, - scale_vl: pe_req.scale_vl, - vtype : pe_req.vtype, - vstart : vfu_operation_d.vstart, - hazard : pe_req.hazard_vd, - target_fu : ALU_SLDU, - conv : OpQueueConversionNone, - cvt_resize: CVT_SAME, - default : '0 - }; + operand_request[MaskB] = '0; + operand_request[MaskB].id = pe_req.id; + operand_request[MaskB].vs = pe_req.vd; + operand_request[MaskB].scale_vl = pe_req.scale_vl; + operand_request[MaskB].vtype = pe_req.vtype; + operand_request[MaskB].vstart = vfu_operation_d.vstart; + operand_request[MaskB].hazard = pe_req.hazard_vd; + operand_request[MaskB].target_fu = ALU_SLDU; + operand_request[MaskB].conv = OpQueueConversionNone; + operand_request[MaskB].cvt_resize = CVT_SAME; // vl and eew depend on the real eew on which we are working on if (pe_req.op inside {VIOTA,VID}) begin // Non-mask layout @@ -938,16 +939,14 @@ module lane_sequencer import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg:: // VRGATHER and VCOMPRESS access the opreq with ad-hoc requests if (vrgat_state_q == REQUESTING) begin // Here, we are sure the MaskB operand_request is free - operand_request[MaskB] = '{ - vs : masku_vrgat_req_q.vs, - eew : masku_vrgat_req_q.eew, - scale_vl : 1'b0, - cvt_resize : pe_req.cvt_resize, - vl : 1, - vstart : masku_vrgat_req_q.idx, - hazard : '0, - default : '0 - }; + operand_request[MaskB] = '0; + operand_request[MaskB].vs = masku_vrgat_req_q.vs; + operand_request[MaskB].eew = masku_vrgat_req_q.eew; + operand_request[MaskB].scale_vl = 1'b0; + operand_request[MaskB].cvt_resize = pe_req.cvt_resize; + operand_request[MaskB].vl = 1; + operand_request[MaskB].vstart = masku_vrgat_req_q.idx; + operand_request[MaskB].hazard = '0; operand_request_push[MaskB] = masku_vrgat_req_ready_d; end end: sequencer diff --git a/hardware/src/lane/operand_queues_stage.sv b/hardware/src/lane/operand_queues_stage.sv index a0b750f1e..09369ba26 100644 --- a/hardware/src/lane/operand_queues_stage.sv +++ b/hardware/src/lane/operand_queues_stage.sv @@ -58,7 +58,7 @@ module operand_queues_stage import ara_pkg::*; import rvv_pkg::*; import cf_math `include "common_cells/registers.svh" // STU flush support - `FF(lsu_ex_flush_o, lsu_ex_flush_i, 1'b0, clk_i, rst_ni); + `FF(lsu_ex_flush_o, lsu_ex_flush_i, 1'b0, clk_i, rst_ni) /////////// // ALU // diff --git a/hardware/src/lane/power_gating_generic.sv b/hardware/src/lane/power_gating_generic.sv index 928e2625d..bd7797458 100644 --- a/hardware/src/lane/power_gating_generic.sv +++ b/hardware/src/lane/power_gating_generic.sv @@ -19,6 +19,6 @@ module power_gating_generic #( // Gate with an AND assign en_wide = en_i ? T'('1) : T'('0); - assign out_o = T'(in_i & en_wide); + assign out_o = in_i & en_wide; endmodule diff --git a/hardware/src/lane/simd_alu.sv b/hardware/src/lane/simd_alu.sv index b97016542..48d22038b 100644 --- a/hardware/src/lane/simd_alu.sv +++ b/hardware/src/lane/simd_alu.sv @@ -114,6 +114,7 @@ module simd_alu import ara_pkg::*; import rvv_pkg::*; #( // Default assignment res = '0; vxsat.w64 = '0; + r = '0; if (valid_i) unique case (op_i) @@ -228,44 +229,44 @@ module simd_alu import ara_pkg::*; import rvv_pkg::*; #( VADD, VADC, VMADC, VREDSUM, VWREDSUMU, VWREDSUM: unique case (vew_i) EW8: for (int b = 0; b < 8; b++) begin automatic logic [ 8:0] sum = opa.w8 [b] + opb.w8 [b] + - logic'(op_i inside {VADC, VMADC} && mask_i[1*b] & ~vm_i); + 8'(op_i inside {VADC, VMADC} && mask_i[1*b] & ~vm_i); res.w8[b] = (op_i == VMADC) ? {6'b0, 1'b1, sum[8]} : sum[7:0]; end EW16: for (int b = 0; b < 4; b++) begin automatic logic [16:0] sum = opa.w16[b] + opb.w16[b] + - logic'(op_i inside {VADC, VMADC} && mask_i[2*b] & ~vm_i); + 16'(op_i inside {VADC, VMADC} && mask_i[2*b] & ~vm_i); res.w16[b] = (op_i == VMADC) ? {14'b0, 1'b1, sum[16]} : sum[15:0]; end EW32: for (int b = 0; b < 2; b++) begin automatic logic [32:0] sum = opa.w32[b] + opb.w32[b] + - logic'(op_i inside {VADC, VMADC} && mask_i[4*b] & ~vm_i); + 32'(op_i inside {VADC, VMADC} && mask_i[4*b] & ~vm_i); res.w32[b] = (op_i == VMADC) ? {30'b0, 1'b1, sum[32]} : sum[31:0]; end EW64: for (int b = 0; b < 1; b++) begin automatic logic [64:0] sum = opa.w64[b] + opb.w64[b] + - logic'(op_i inside {VADC, VMADC} && mask_i[8*b] & ~vm_i); + 64'(op_i inside {VADC, VMADC} && mask_i[8*b] & ~vm_i); res.w64[b] = (op_i == VMADC) ? {62'b0, 1'b1, sum[64]} : sum[63:0]; end endcase VSUB, VSBC, VMSBC: unique case (vew_i) EW8: for (int b = 0; b < 8; b++) begin automatic logic [ 8:0] sub = opb.w8 [b] - opa.w8 [b] - - logic'(op_i inside {VSBC, VMSBC} && mask_i[1*b] & ~vm_i); + 8'(op_i inside {VSBC, VMSBC} && mask_i[1*b] & ~vm_i); res.w8[b] = (op_i == VMSBC) ? {6'b0, 1'b1, sub[8]} : sub[7:0]; end EW16: for (int b = 0; b < 4; b++) begin automatic logic [16:0] sub = opb.w16[b] - opa.w16[b] - - logic'(op_i inside {VSBC, VMSBC} && mask_i[2*b] & ~vm_i); + 16'(op_i inside {VSBC, VMSBC} && mask_i[2*b] & ~vm_i); res.w16[b] = (op_i == VMSBC) ? {14'b0, 1'b1, sub[16]} : sub[15:0]; end EW32: for (int b = 0; b < 2; b++) begin automatic logic [32:0] sub = opb.w32[b] - opa.w32[b] - - logic'(op_i inside {VSBC, VMSBC} && mask_i[4*b] & ~vm_i); + 32'(op_i inside {VSBC, VMSBC} && mask_i[4*b] & ~vm_i); res.w32[b] = (op_i == VMSBC) ? {30'b0, 1'b1, sub[32]} : sub[31:0]; end EW64: for (int b = 0; b < 1; b++) begin automatic logic [64:0] sub = opb.w64[b] - opa.w64[b] - - logic'(op_i inside {VSBC, VMSBC} && mask_i[8*b] & ~vm_i); + 64'(op_i inside {VSBC, VMSBC} && mask_i[8*b] & ~vm_i); res.w64[b] = (op_i == VMSBC) ? {62'b0, 1'b1, sub[64]} : sub[63:0]; end endcase @@ -308,19 +309,19 @@ module simd_alu import ara_pkg::*; import rvv_pkg::*; #( automatic logic [16:0] sub = opb.w16[b] - opa.w16[b]; vxsat.w16[b] = (!opb.w16[b][15] & opa.w16[b][15] & sub[15]) | (opb.w16[b][15] & !opa.w16[b][15] & !sub[15]); - res.w16[b] = vxsat.w16[b] ? (opb.w16[b][15] ? 16'h8000 : 16'h7FFF) : sub[15:0]; + res.w16[b] = |vxsat.w16[b] ? (opb.w16[b][15] ? 16'h8000 : 16'h7FFF) : sub[15:0]; end EW32: for (int b = 0; b < 2; b++) begin automatic logic [32:0] sub = opb.w32[b] - opa.w32[b]; vxsat.w32[b] = (!opb.w32[b][31] & opa.w32[b][31] & sub[31]) | (opb.w32[b][31] & !opa.w32[b][31] & !sub[31]); - res.w32[b] = vxsat.w32[b] ? (opb.w32[b][31] ? 32'h80000000 : 32'h7FFFFFFF) : sub[31:0]; + res.w32[b] = |vxsat.w32[b] ? (opb.w32[b][31] ? 32'h80000000 : 32'h7FFFFFFF) : sub[31:0]; end EW64: for (int b = 0; b < 1; b++) begin automatic logic [64:0] sub = opb.w64[b] - opa.w64[b]; vxsat.w64[b] = (!opb.w64[b][63] & opa.w64[b][63] & sub[63]) | (opb.w64[b][63] & !opa.w64[b][63] & !sub[63]); - res.w64[b] = vxsat.w64[b] ? (opb.w64[b][63] ? 64'h8000000000000000 : 64'h7FFFFFFFFFFFFFFF) : sub[63:0]; + res.w64[b] = |vxsat.w64[b] ? (opb.w64[b][63] ? 64'h8000000000000000 : 64'h7FFFFFFFFFFFFFFF) : sub[63:0]; end endcase VASUB, VASUBU: if (FixPtSupport == FixedPointEnable) unique case (vew_i) @@ -332,7 +333,7 @@ module simd_alu import ara_pkg::*; import rvv_pkg::*; #( 2'b10: r = 1'b0; 2'b11: r = !sub[1] & (sub[0]!=0); endcase - res.w8[b] = (op_i == VASUBU) ? (sub[7:0] >> 1) + r : ($signed(sub[7:0]) >>> 1) + r; + res.w8[b] = (op_i == VASUBU) ? (sub[7:0] >> 1) + {7'b0, r} : $unsigned(($signed(sub[7:0]) >>> 1) + $signed({7'b0, r})); end EW16: for (int b = 0; b < 4; b++) begin automatic logic [ 16:0] sub = opb.w16[b] - opa.w16[b]; @@ -342,7 +343,7 @@ module simd_alu import ara_pkg::*; import rvv_pkg::*; #( 2'b10: r = 1'b0; 2'b11: r = !sub[1] & (sub[0]!=0); endcase - res.w16[b] = (op_i == VASUBU) ? (sub[15:0] >> 1) + r : ($signed(sub[15:0]) >>> 1) + r; + res.w16[b] = (op_i == VASUBU) ? (sub[15:0] >> 1) + {15'b0, r} : $unsigned(($signed(sub[15:0]) >>> 1) + $signed({15'b0, r})); end EW32: for (int b = 0; b < 2; b++) begin automatic logic [ 32:0] sub = opb.w32[b] - opa.w32[b]; @@ -352,7 +353,7 @@ module simd_alu import ara_pkg::*; import rvv_pkg::*; #( 2'b10: r = 1'b0; 2'b11: r = !sub[1] & (sub[0]!=0); endcase - res.w32[b] = (op_i == VASUBU) ? (sub[31:0] >> 1) + r : ($signed(sub[31:0]) >>> 1) + r; + res.w32[b] = (op_i == VASUBU) ? (sub[31:0] >> 1) + {31'b0, r} : $unsigned(($signed(sub[31:0]) >>> 1) + $signed({31'b0, r})); end EW64: for (int b = 0; b < 1; b++) begin automatic logic [ 64:0] sub = opb.w64[b] - opa.w64[b]; @@ -362,7 +363,7 @@ module simd_alu import ara_pkg::*; import rvv_pkg::*; #( 2'b10: r = 1'b0; 2'b11: r = !sub[1] & (sub[0]!=0); endcase - res.w64[b] = (op_i == VASUBU) ? (sub[63:0] >> 1) + r : ($signed(sub[63:0]) >>> 1) + r; + res.w64[b] = (op_i == VASUBU) ? (sub[63:0] >> 1) + {63'b0, r} : $unsigned(($signed(sub[63:0]) >>> 1) + $signed({63'b0, r})); end endcase @@ -443,19 +444,19 @@ module simd_alu import ara_pkg::*; import rvv_pkg::*; #( // Fixed point clip instructions VNCLIP: if (FixPtSupport == FixedPointEnable) unique case (vew_i) EW8 : for (int b = 0; b < 4; b++) begin - automatic logic [15:0] clip = $signed(opb.w16[b]) >>> opa.w16[b][3:0]; + automatic logic [15:0] clip = $unsigned($signed(opb.w16[b]) >>> opa.w16[b][3:0]); vxsat.w8[b] = |clip[15:8]; - res.w8 [2*b + narrowing_select_i] = ($signed(opb.w16[b]) >>> opa.w16[b][3:0]) + rm[b]; + res.w8 [2*b + narrowing_select_i] = $unsigned(($signed(opb.w16[b]) >>> opa.w16[b][3:0]) + $signed(rm[b])); end EW16: for (int b = 0; b < 2; b++) begin - automatic logic [31:0] clip = $signed(opb.w32[b]) >>> opa.w32[b][4:0]; + automatic logic [31:0] clip = $unsigned($signed(opb.w32[b]) >>> opa.w32[b][4:0]); vxsat.w8[b] = |clip[31:16]; - res.w16[2*b + narrowing_select_i] = ($signed(opb.w32[b]) >>> opa.w32[b][4:0]) + rm[b]; + res.w16[2*b + narrowing_select_i] = $unsigned(($signed(opb.w32[b]) >>> opa.w32[b][4:0]) + $signed(rm[b])); end EW32: for (int b = 0; b < 1; b++) begin automatic logic [63:0] clip = $signed(opb.w64[b]) >>> opa.w64[b][5:0]; vxsat.w8[b] = |clip[63:32]; - res.w32[2*b + narrowing_select_i] = ($signed(opb.w64[b]) >>> opa.w64[b][5:0]) + rm[b]; + res.w32[2*b + narrowing_select_i] = $unsigned(($signed(opb.w64[b]) >>> opa.w64[b][5:0]) + $signed(rm[b])); end endcase VNCLIPU: if (FixPtSupport == FixedPointEnable) unique case (vew_i) diff --git a/hardware/src/lane/valu.sv b/hardware/src/lane/valu.sv index 623f3c40a..561e38a9c 100644 --- a/hardware/src/lane/valu.sv +++ b/hardware/src/lane/valu.sv @@ -317,7 +317,7 @@ module valu import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg::idx_width; // The ALU has completed a reduction logic alu_red_complete_d; - `FF(alu_red_complete_o, alu_red_complete_d, 1'b0, clk_i, rst_ni); + `FF(alu_red_complete_o, alu_red_complete_d, 1'b0, clk_i, rst_ni) // Signal to indicate the state of the ALU typedef enum logic [2:0] {NO_REDUCTION, INTRA_LANE_REDUCTION, INTER_LANES_REDUCTION_RX, INTER_LANES_REDUCTION_TX, LN0_REDUCTION_COMMIT, SIMD_REDUCTION} alu_state_e; diff --git a/hardware/src/lane/vmfpu.sv b/hardware/src/lane/vmfpu.sv index c12598649..4c296a3f0 100644 --- a/hardware/src/lane/vmfpu.sv +++ b/hardware/src/lane/vmfpu.sv @@ -337,17 +337,17 @@ module vmfpu import ara_pkg::*; import rvv_pkg::*; import fpnew_pkg::*; ~vmul_simd_in_valid[vinsn_issue_q.vtype.vsew]; `FFLARNC(vmul_simd_op_a_q, vinsn_issue_q.use_scalar_op ? scalar_op : mfpu_operand_i[0], - gate_ff_en, gate_ff_clr, '0, clk_i_gated, rst_ni); + gate_ff_en, gate_ff_clr, '0, clk_i_gated, rst_ni) `FFLARNC(vmul_simd_op_b_q, mfpu_operand_i[1], - gate_ff_en, gate_ff_clr, '0, clk_i_gated, rst_ni); + gate_ff_en, gate_ff_clr, '0, clk_i_gated, rst_ni) `FFLARNC(vmul_simd_op_c_q, mfpu_operand_i[2], - gate_ff_en, gate_ff_clr, '0, clk_i_gated, rst_ni); + gate_ff_en, gate_ff_clr, '0, clk_i_gated, rst_ni) `FFLARNC(vmul_simd_mask_q, mask_i, - gate_ff_en, gate_ff_clr, '0, clk_i_gated, rst_ni); + gate_ff_en, gate_ff_clr, '0, clk_i_gated, rst_ni) `FFLARNC(vmul_simd_op_q, vinsn_issue_q.op, - gate_ff_en, gate_ff_clr, ara_op_e'('0), clk_i_gated, rst_ni); + gate_ff_en, gate_ff_clr, ara_op_e'('0), clk_i_gated, rst_ni) `FFLARNC(vmul_simd_in_valid_q, vmul_simd_in_valid, - gate_ff_en, gate_ff_clr, '0, clk_i_gated, rst_ni); + gate_ff_en, gate_ff_clr, '0, clk_i_gated, rst_ni) for (genvar i = 0; i < 4; i++) begin `ifdef GF22 @@ -616,7 +616,7 @@ module vmfpu import ara_pkg::*; import rvv_pkg::*; import fpnew_pkg::*; // Inform the lane SLDU/ADDRGEN arbiter that this reduction is over logic fpu_red_complete_d; - `FF(fpu_red_complete_o, fpu_red_complete_d, 1'b0, clk_i, rst_ni); + `FF(fpu_red_complete_o, fpu_red_complete_d, 1'b0, clk_i, rst_ni) // Signal to indicate the state of the MFPU typedef enum logic [2:0] { @@ -848,7 +848,7 @@ module vmfpu import ara_pkg::*; import rvv_pkg::*; import fpnew_pkg::*; EnableVectors: 1'b1, EnableNanBox : 1'b1, FpFmtMask : {RVVF(FPUSupport), RVVD(FPUSupport), RVVH(FPUSupport), RVVB(FPUSupport), RVVHA(FPUSupport), RVVBA(FPUSupport)}, - IntFmtMask : {logic'(RVVB(FPUSupport) || RVVBA(FPUSupport)), 1'b1, 1'b1, 1'b1} + IntFmtMask : {RVVB(FPUSupport) || RVVBA(FPUSupport), 1'b1, 1'b1, 1'b1} }; // Implementation (number of registers etc) @@ -1153,10 +1153,10 @@ module vmfpu import ara_pkg::*; import rvv_pkg::*; import fpnew_pkg::*; assign vfpu_flag_mask_d[0]= vfpu_simd_mask; for (genvar i = 0; i < LatFNonComp; i++) begin - `FF(operand_a_d[i+1], operand_a_d[i], '0, clk_i, rst_ni); + `FF(operand_a_d[i+1], operand_a_d[i], '0, clk_i, rst_ni) - `FF(vfpu_flag_mask_d[i+1], vfpu_flag_mask_d[i],'0,clk_i,rst_ni); - end + `FF(vfpu_flag_mask_d[i+1], vfpu_flag_mask_d[i],'0,clk_i,rst_ni) + end assign operand_a_delay = operand_a_d[LatFNonComp]; assign vfpu_flag_mask = vfpu_flag_mask_d[LatFNonComp]; @@ -1199,6 +1199,13 @@ module vmfpu import ara_pkg::*; import rvv_pkg::*; import fpnew_pkg::*; assign fp_rm_process = vinsn_processing_q.fp_rm; always_comb begin: fpu_result_processing_p + // Default the vfrec7/vfrsqrt7 scratch arrays (avoids inferred latches) + vfrec7_out_e16 = '{default: '0}; + vfrec7_out_e32 = '{default: '0}; + vfrec7_out_e64 = '{default: '0}; + vfrsqrt7_out_e16 = '{default: '0}; + vfrsqrt7_out_e32 = '{default: '0}; + vfrsqrt7_out_e64 = '{default: '0}; if (FPExtSupport) begin @@ -1384,6 +1391,8 @@ module vmfpu import ara_pkg::*; import rvv_pkg::*; import fpnew_pkg::*; narrowing_select_in_d = narrowing_select_in_q; narrowing_select_out_d = narrowing_select_out_q; + narrowing_shuffled_result = '0; + narrowing_shuffle_be = '0; // Inform our status to the lane controller mfpu_ready_o = !vinsn_queue_full; diff --git a/hardware/src/masku/masku.sv b/hardware/src/masku/masku.sv index d2fdccdc2..6a52c2462 100644 --- a/hardware/src/masku/masku.sv +++ b/hardware/src/masku/masku.sv @@ -673,6 +673,12 @@ module masku import ara_pkg::*; import rvv_pkg::*; #( // Information about which is the target FU of the request assign masku_operand_fu = (vinsn_issue.op inside {[VMFEQ:VMFGE]}) ? MaskFUMFpu : MaskFUAlu; + // VRGATHER/VCOMPRESS element buffers (kept at module scope to remain visible in waveforms) + // Buffer for the current element + logic [NrLanes*DataWidth-1:0] vrgat_res; + // Buffer for the current element + logic [DataWidth-1:0] vrgat_buf; + always_comb begin // Tail-agnostic bus alu_result = '1; @@ -688,6 +694,10 @@ module masku import ara_pkg::*; import rvv_pkg::*; #( vrgat_m_seq_bit = 1'b0; + // Default the VRGATHER/VCOMPRESS buffers (avoids inferred latches) + vrgat_res = '1; + vrgat_buf = '0; + // The result mask should be created here since the output is a non-mask vector be_viota_seq_d = be_viota_seq_q; @@ -802,11 +812,6 @@ module masku import ara_pkg::*; import rvv_pkg::*; #( // This operation writes vsew-bit elements with vtype.vsew encoding // The vd source can have a different encoding (it gets deshuffled in the masku_operand stage) [VRGATHER:VCOMPRESS]: begin - // Buffer for the current element - logic [NrLanes*DataWidth-1:0] vrgat_res; - // Buffer for the current element - logic [DataWidth-1:0] vrgat_buf; - // Extract the correct elements vrgat_res = '1; // Default assignment vrgat_buf = masku_operand_vd_seq[vrgat_req_idx_q[idx_width(NrLanes*ELENB/8)-1:0] * 64 +: 64]; // Default assignment diff --git a/hardware/src/sldu/p2_stride_gen.sv b/hardware/src/sldu/p2_stride_gen.sv index 7919e29da..e3703c70e 100644 --- a/hardware/src/sldu/p2_stride_gen.sv +++ b/hardware/src/sldu/p2_stride_gen.sv @@ -37,10 +37,10 @@ module p2_stride_gen import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg::i assign valid_o = ~next_stride_zero_q; assign spare_stride_d = next_stride; - `FFL( popc_q, popc_d, ff_en, '0); - `FFL(next_stride_first_q, next_stride_first_d, ff_en, '0); - `FFL( next_stride_zero_q, next_stride_zero_d, ff_en, '0); - `FFL( spare_stride_q, spare_stride_d, ff_en, '0); + `FFL( popc_q, popc_d, ff_en, '0) + `FFL(next_stride_first_q, next_stride_first_d, ff_en, '0) + `FFL( next_stride_zero_q, next_stride_zero_d, ff_en, '0) + `FFL( spare_stride_q, spare_stride_d, ff_en, '0) // Is the stride power of two? popcount #( diff --git a/hardware/src/sldu/sldu.sv b/hardware/src/sldu/sldu.sv index f582cdf4f..14770641a 100644 --- a/hardware/src/sldu/sldu.sv +++ b/hardware/src/sldu/sldu.sv @@ -612,7 +612,7 @@ module sldu import ara_pkg::*; import rvv_pkg::*; #( // Filled up a word to the VRF or finished the instruction if (out_pnt_d == NrLanes * 8 || issue_cnt_q <= byte_count) begin // Reset the pointer - out_pnt_d = vinsn_issue_q.vfu inside {VFU_Alu, VFU_MFpu} ? {'0, red_stride_cnt_d, 3'b0} : '0; + out_pnt_d = vinsn_issue_q.vfu inside {VFU_Alu, VFU_MFpu} ? {red_stride_cnt_d, 3'b0} : '0; // We used all the bits of the mask if (vinsn_issue_q.op inside {VSLIDEUP, VSLIDEDOWN}) mask_ready_d = !vinsn_issue_q.vm; @@ -754,7 +754,7 @@ module sldu import ara_pkg::*; import rvv_pkg::*; #( // Update the p2 stride p2_stride_gen_update_d = 1'b1; // Commit the final result - if (p2_stride_gen_popc_q == {'0, 1'b1} && result_queue_empty) begin + if (p2_stride_gen_popc_q == 1 && result_queue_empty) begin state_d = SLIDE_NP2_COMMIT; // Prepare the write pointer result_queue_write_pnt_d = NP2_RESULT_PNT; diff --git a/hardware/src/vlsu/addrgen.sv b/hardware/src/vlsu/addrgen.sv index 1ba67f650..b59487704 100644 --- a/hardware/src/vlsu/addrgen.sv +++ b/hardware/src/vlsu/addrgen.sv @@ -970,7 +970,7 @@ module addrgen import ara_pkg::*; import rvv_pkg::*; #( // Generate an error idx_op_error_d = 1'b1; // Forward next vstart info to the dispatcher - addrgen_exception_vstart_d = (addrgen_req.len - axi_addrgen_q.len) >> axi_addrgen_q.vew - 1; + addrgen_exception_vstart_d = (addrgen_req.len - axi_addrgen_q.len) >> (axi_addrgen_q.vew - 1); addrgen_req_ready = 1'b1; axi_addrgen_state_d = AXI_ADDRGEN_IDLE; end : eew_misaligned_error diff --git a/hardware/src/vlsu/vldu.sv b/hardware/src/vlsu/vldu.sv index 37d7b8782..ae5997397 100644 --- a/hardware/src/vlsu/vldu.sv +++ b/hardware/src/vlsu/vldu.sv @@ -290,6 +290,7 @@ module vldu import ara_pkg::*; import rvv_pkg::*; #( seq_word_wr_offset_d = seq_word_wr_offset_q; first_payload_byte_d = first_payload_byte_q; vrf_word_byte_cnt_d = vrf_word_byte_cnt_q; + vrf_word_start_byte = '0; // Vector instructions currently running vinsn_running_d = vinsn_running_q & pe_vinsn_running_i; @@ -421,7 +422,7 @@ module vldu import ara_pkg::*; import rvv_pkg::*; #( vrf_word_byte_pnt_d = '0; vrf_word_byte_cnt_d = '0; // Account for the results that were issued - if (seq_word_wr_offset_q) begin + if (seq_word_wr_offset_q != '0) begin vrf_eff_write_bytes = (NrLanes * DataWidthB); end else begin // First payload of the vector instruction diff --git a/hardware/src/vlsu/vlsu.sv b/hardware/src/vlsu/vlsu.sv index 27397fa35..8cb33786e 100644 --- a/hardware/src/vlsu/vlsu.sv +++ b/hardware/src/vlsu/vlsu.sv @@ -109,7 +109,7 @@ module vlsu import ara_pkg::*; import rvv_pkg::*; #( logic stu_current_burst_exception, ldu_current_burst_exception; assign lsu_current_burst_exception_o = stu_current_burst_exception | ldu_current_burst_exception; - `FF(lsu_ex_flush_done_o, lsu_ex_flush_i, 1'b0, clk_i, rst_ni); + `FF(lsu_ex_flush_done_o, lsu_ex_flush_i, 1'b0, clk_i, rst_ni) /////////////////// // Definitions // diff --git a/hardware/src/vlsu/vstu.sv b/hardware/src/vlsu/vstu.sv index 845c81a37..0f3ac84c0 100644 --- a/hardware/src/vlsu/vstu.sv +++ b/hardware/src/vlsu/vstu.sv @@ -250,6 +250,7 @@ module vstu import ara_pkg::*; import rvv_pkg::*; #( vinsn_valid_bytes = '0; axi_valid_bytes = '0; valid_bytes = '0; + vrf_eff_write_bytes = '0; // Maintain state vinsn_queue_d = vinsn_queue_q;