From bcca0ec2d029f20d0682b88bcba50176ff769bf2 Mon Sep 17 00:00:00 2001 From: Michael Rogenmoser Date: Fri, 10 Apr 2026 16:28:10 +0200 Subject: [PATCH 1/6] [ci] Add slang lint action --- .github/workflows/ci.yml | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 6c5561823..68522b677 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -208,6 +208,33 @@ jobs: # Compile stage # ################### + compile-slang: + runs-on: ubuntu-22.04 + permissions: + contents: read + checks: write + pull-requests: write + steps: + - name: Checkout + uses: actions/checkout@v6 + with: + submodules: recursive + - name: Download RTL submodules + run: | + make -C hardware checkout + make -C hardware apply-patches + - name: Generate flist + run: make -C hardware spyglass/tmp/files + - name: Run slang + uses: pulp-platform/pulp-actions/slang@v2.5.0 # update version as needed, not autoupdated + with: + token: ${{ secrets.GITHUB_TOKEN }} + slang-flags: >- + -f hardware/spyglass/tmp/files --top ara_soc_wrap + -Wextra -Wno-width-trunc -Wno-case-redundant-default -Wno-case-enum -Wno-incomplete-return -Wno-dup-import + --ignore-unknown-modules + --suppress-warnings .bender/...,hardware/deps/... 
+ compile-apps: runs-on: ubuntu-22.04 strategy: From af85614c7c27821c9ae85a1caedcbecf15d25c24 Mon Sep 17 00:00:00 2001 From: Michael Rogenmoser Date: Fri, 10 Apr 2026 12:24:45 +0200 Subject: [PATCH 2/6] [ci] Update ubuntu to latest --- .github/workflows/ci.yml | 36 ++++++++++++++++++------------------ 1 file changed, 18 insertions(+), 18 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 68522b677..d30a7f4af 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -26,7 +26,7 @@ jobs: ##################### tc-llvm: - runs-on: ubuntu-22.04 + runs-on: ubuntu-latest steps: - uses: actions/checkout@v6 - name: Recover the submodule commit hash @@ -82,7 +82,7 @@ jobs: path: tc-llvm.tar tc-gcc: - runs-on: ubuntu-22.04 + runs-on: ubuntu-latest steps: - uses: actions/checkout@v6 - name: Recover the submodule commit hash @@ -134,7 +134,7 @@ jobs: path: tc-gcc.tar tc-isa-sim: - runs-on: ubuntu-22.04 + runs-on: ubuntu-latest steps: - uses: actions/checkout@v6 - name: Recover the submodule commit hash @@ -169,7 +169,7 @@ jobs: path: tc-isa-sim.tar tc-verilator: - runs-on: ubuntu-22.04 + runs-on: ubuntu-latest steps: - uses: actions/checkout@v6 - name: Recover the submodule commit hash @@ -209,7 +209,7 @@ jobs: ################### compile-slang: - runs-on: ubuntu-22.04 + runs-on: ubuntu-latest permissions: contents: read checks: write @@ -236,7 +236,7 @@ jobs: --suppress-warnings .bender/...,hardware/deps/... 
compile-apps: - runs-on: ubuntu-22.04 + runs-on: ubuntu-latest strategy: max-parallel: 1 matrix: @@ -268,7 +268,7 @@ jobs: path: apps/bin compile-riscv-tests: - runs-on: ubuntu-22.04 + runs-on: ubuntu-latest strategy: max-parallel: 1 matrix: @@ -307,7 +307,7 @@ jobs: path: apps/bin compile-ara: - runs-on: ubuntu-22.04 + runs-on: ubuntu-latest strategy: max-parallel: 1 matrix: @@ -341,7 +341,7 @@ jobs: #################### simulate: - runs-on: ubuntu-22.04 + runs-on: ubuntu-latest strategy: max-parallel: 2 matrix: @@ -369,7 +369,7 @@ jobs: ######################## riscv-tests-simv: - runs-on: ubuntu-22.04 + runs-on: ubuntu-latest strategy: max-parallel: 1 matrix: @@ -392,7 +392,7 @@ jobs: run: config=${{ matrix.ara_config }} make -C hardware -j8 riscv_tests_simv riscv-tests-spike: - runs-on: ubuntu-22.04 + runs-on: ubuntu-latest needs: ["tc-isa-sim", "compile-riscv-tests"] steps: - uses: actions/checkout@v6 @@ -431,7 +431,7 @@ jobs: ################### check-license: - runs-on: ubuntu-22.04 + runs-on: ubuntu-latest steps: - uses: actions/checkout@v6 - uses: actions/setup-python@v6 @@ -443,7 +443,7 @@ jobs: run: python scripts/licence-checker.py --config scripts/licence-checker.hjson hardware check-clang-format: - runs-on: ubuntu-22.04 + runs-on: ubuntu-latest needs: ['tc-llvm'] steps: - uses: actions/checkout@v6 @@ -469,7 +469,7 @@ jobs: exit $EXIT_STATUS check-trailing-whitespaces: - runs-on: ubuntu-22.04 + runs-on: ubuntu-latest steps: - uses: actions/checkout@v6 with: @@ -497,7 +497,7 @@ jobs: ##################### benchmark: - runs-on: ubuntu-22.04 + runs-on: ubuntu-latest strategy: max-parallel: 1 matrix: @@ -543,7 +543,7 @@ jobs: path: benchmarks-${{ matrix.ara_config }}.tar roofline: - runs-on: ubuntu-22.04 + runs-on: ubuntu-latest needs: benchmark steps: - uses: actions/checkout@v6 @@ -699,7 +699,7 @@ jobs: #################### clean-up: - runs-on: ubuntu-22.04 + runs-on: ubuntu-latest if: always() needs: ["simulate", "riscv-tests-spike", "riscv-tests-simv"] 
steps: @@ -715,7 +715,7 @@ jobs: riscv-tests-spike clean-up-compile-runs: - runs-on: ubuntu-22.04 + runs-on: ubuntu-latest strategy: max-parallel: 1 matrix: From 7bdb7df1fdccc0f32bd4f5ca29e45f515a574185 Mon Sep 17 00:00:00 2001 From: Michael Rogenmoser Date: Fri, 10 Apr 2026 13:41:21 +0200 Subject: [PATCH 3/6] [hardware] Fix slang warnings --- hardware/spyglass/src/ara_soc_wrap.sv | 12 +++- hardware/src/ara.sv | 2 +- hardware/src/ara_sequencer.sv | 2 +- hardware/src/ara_soc.sv | 16 +++--- hardware/src/ara_system.sv | 2 +- hardware/src/ctrl_registers.sv | 4 +- hardware/src/lane/lane.sv | 10 ++-- hardware/src/lane/lane_sequencer.sv | 68 +++++++++++------------ hardware/src/lane/operand_queues_stage.sv | 2 +- hardware/src/lane/power_gating_generic.sv | 2 +- hardware/src/lane/simd_alu.sv | 53 +++++++++--------- hardware/src/lane/valu.sv | 2 +- hardware/src/lane/vmfpu.sv | 45 ++++++++------- hardware/src/masku/masku.sv | 4 +- hardware/src/sldu/p2_stride_gen.sv | 8 +-- hardware/src/sldu/sldu.sv | 4 +- hardware/src/vlsu/addrgen.sv | 2 +- hardware/src/vlsu/vldu.sv | 8 ++- hardware/src/vlsu/vlsu.sv | 2 +- hardware/src/vlsu/vstu.sv | 2 +- 20 files changed, 127 insertions(+), 123 deletions(-) diff --git a/hardware/spyglass/src/ara_soc_wrap.sv b/hardware/spyglass/src/ara_soc_wrap.sv index 988be30c3..e4660ea78 100644 --- a/hardware/spyglass/src/ara_soc_wrap.sv +++ b/hardware/spyglass/src/ara_soc_wrap.sv @@ -17,8 +17,6 @@ module ara_soc_wrap ( localparam int unsigned AxiUserWidth = 1; localparam int unsigned AxiIdWidth = 5; - logic clk_i, rst_ni; - ara_soc #( .NrLanes (NrLanes ), .VLEN (VLEN ), @@ -33,7 +31,15 @@ module ara_soc_wrap ( .scan_data_i (1'b0 ), .uart_prdata_i ('0 ), .uart_pready_i ('0 ), - .uart_pslverr_i('0 ) + .uart_pslverr_i('0 ), + .exit_o(), + .hw_cnt_en_o(), + .scan_data_o(), + .uart_penable_o(), + .uart_pwrite_o(), + .uart_paddr_o(), + .uart_psel_o(), + .uart_pwdata_o() ); endmodule diff --git a/hardware/src/ara.sv b/hardware/src/ara.sv index 
4cb9a9506..7cff40045 100644 --- a/hardware/src/ara.sv +++ b/hardware/src/ara.sv @@ -491,7 +491,7 @@ module ara import ara_pkg::*; #( // Break path for acc_mmu_en. This signal can afford some additional latency // since vector mem ops take multiple cycles to reach the addrgen - `FF(acc_mmu_en_q, acc_mmu_en, '0, clk_i, rst_ni); + `FF(acc_mmu_en_q, acc_mmu_en, '0, clk_i, rst_ni) vlsu #( .NrLanes (NrLanes ), diff --git a/hardware/src/ara_sequencer.sv b/hardware/src/ara_sequencer.sv index 9cd0cc405..e7e7e3925 100644 --- a/hardware/src/ara_sequencer.sv +++ b/hardware/src/ara_sequencer.sv @@ -341,7 +341,7 @@ module ara_sequencer import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg::i logic running_mask_insn_d, running_mask_insn_q; logic lsu_current_burst_exception_q; - `FF(lsu_current_burst_exception_q, lsu_current_burst_exception_i, 1'b0, clk_i, rst_ni); + `FF(lsu_current_burst_exception_q, lsu_current_burst_exception_i, 1'b0, clk_i, rst_ni) // pe_req_ready_i comes from all the lanes // It is deasserted if the current request is stuck diff --git a/hardware/src/ara_soc.sv b/hardware/src/ara_soc.sv index 5ee64e17c..fb25622eb 100644 --- a/hardware/src/ara_soc.sv +++ b/hardware/src/ara_soc.sv @@ -259,7 +259,7 @@ module ara_soc import axi_pkg::*; import ara_pkg::*; #( `endif // One-cycle latency - `FF(l2_rvalid, l2_req, 1'b0); + `FF(l2_rvalid, l2_req, 1'b0) //////////// // UART // @@ -482,17 +482,17 @@ module ara_soc import axi_pkg::*; import ara_pkg::*; #( localparam config_pkg::cva6_cfg_t CVA6AraConfig = build_config_pkg::build_config(CVA6AraConfig_user); // Define the exception type - `CVA6_TYPEDEF_EXCEPTION(exception_t, CVA6AraConfig); + `CVA6_TYPEDEF_EXCEPTION(exception_t, CVA6AraConfig) // Standard interface - `CVA6_INTF_TYPEDEF_ACC_REQ(accelerator_req_t, CVA6AraConfig, fpnew_pkg::roundmode_e); - `CVA6_INTF_TYPEDEF_ACC_RESP(accelerator_resp_t, CVA6AraConfig, exception_t); + `CVA6_INTF_TYPEDEF_ACC_REQ(accelerator_req_t, CVA6AraConfig, fpnew_pkg::roundmode_e) + 
`CVA6_INTF_TYPEDEF_ACC_RESP(accelerator_resp_t, CVA6AraConfig, exception_t) // MMU interface - `CVA6_INTF_TYPEDEF_MMU_REQ(acc_mmu_req_t, CVA6AraConfig); - `CVA6_INTF_TYPEDEF_MMU_RESP(acc_mmu_resp_t, CVA6AraConfig, exception_t); + `CVA6_INTF_TYPEDEF_MMU_REQ(acc_mmu_req_t, CVA6AraConfig) + `CVA6_INTF_TYPEDEF_MMU_RESP(acc_mmu_resp_t, CVA6AraConfig, exception_t) // Accelerator - CVA6's top-level interface - `CVA6_INTF_TYPEDEF_CVA6_TO_ACC(cva6_to_acc_t, accelerator_req_t, acc_mmu_resp_t); - `CVA6_INTF_TYPEDEF_ACC_TO_CVA6(acc_to_cva6_t, accelerator_resp_t, acc_mmu_req_t); + `CVA6_INTF_TYPEDEF_CVA6_TO_ACC(cva6_to_acc_t, accelerator_req_t, acc_mmu_resp_t) + `CVA6_INTF_TYPEDEF_ACC_TO_CVA6(acc_to_cva6_t, accelerator_resp_t, acc_mmu_req_t) `ifndef TARGET_GATESIM ara_system #( diff --git a/hardware/src/ara_system.sv b/hardware/src/ara_system.sv index 4accbd77c..6ecd5b3e8 100644 --- a/hardware/src/ara_system.sv +++ b/hardware/src/ara_system.sv @@ -96,7 +96,7 @@ module ara_system import axi_pkg::*; import ara_pkg::*; #( // Support max 8 cores, for now logic [63:0] hart_id; - assign hart_id = {'0, hart_id_i}; + assign hart_id = 64'(hart_id_i); // Pack invalidation interface into acc interface acc_to_cva6_t acc_resp_pack; diff --git a/hardware/src/ctrl_registers.sv b/hardware/src/ctrl_registers.sv index 54eb9278c..397dbb98c 100644 --- a/hardware/src/ctrl_registers.sv +++ b/hardware/src/ctrl_registers.sv @@ -95,7 +95,7 @@ module ctrl_registers #( .reg_q_o ({hw_cnt_en, event_trigger, dram_end_address, dram_base_address, exit}) ); - `FF(wr_active_q, wr_active_d, '0); + `FF(wr_active_q, wr_active_d, '0) ///////////////// // Signals // @@ -105,6 +105,6 @@ module ctrl_registers #( assign event_trigger_o = event_trigger; assign dram_base_addr_o = dram_base_address; assign dram_end_addr_o = dram_end_address; - assign exit_o = {exit, logic'(|wr_active_q[7:0])}; + assign exit_o = {exit, |wr_active_q[7:0]}; endmodule : ctrl_registers diff --git a/hardware/src/lane/lane.sv 
b/hardware/src/lane/lane.sv index c17d87fad..7d01a541e 100644 --- a/hardware/src/lane/lane.sv +++ b/hardware/src/lane/lane.sv @@ -222,12 +222,12 @@ module lane import ara_pkg::*; import rvv_pkg::*; #( logic [NrVInsn-1:0] mfpu_vinsn_done; // Interface with the MaskB operand queue (VRGATHER/VCOMPRESS) logic mask_b_cmd_pop_d, mask_b_cmd_pop_q; - `FF(mask_b_cmd_pop_q, mask_b_cmd_pop_d, 1'b0, clk_i, rst_ni); + `FF(mask_b_cmd_pop_q, mask_b_cmd_pop_d, 1'b0, clk_i, rst_ni) // Support for store exception flush logic lsu_ex_flush_op_req_d, lsu_ex_flush_op_req_q; - `FF(lsu_ex_flush_op_req_q, lsu_ex_flush_op_req_d, 1'b0, clk_i, rst_ni); + `FF(lsu_ex_flush_op_req_q, lsu_ex_flush_op_req_d, 1'b0, clk_i, rst_ni) // Additional signals to please Verilator's hierarchical verilation pe_req_t pe_req; @@ -310,7 +310,7 @@ module lane import ara_pkg::*; import rvv_pkg::*; #( logic sldu_result_gnt_opqueues; // Support for store exception flush logic lsu_ex_flush_op_queues_d, lsu_ex_flush_op_queues_q; - `FF(lsu_ex_flush_op_queues_q, lsu_ex_flush_op_queues_d, 1'b0, clk_i, rst_ni); + `FF(lsu_ex_flush_op_queues_q, lsu_ex_flush_op_queues_d, 1'b0, clk_i, rst_ni) operand_requester #( .NrLanes (NrLanes ), @@ -609,8 +609,8 @@ module lane import ara_pkg::*; import rvv_pkg::*; #( ); // Break timing path - `FF(vfu_operation_valid_q, vfu_operation_valid, 1'b0, clk_i, rst_ni); - `FF(vfu_operation_op_q, vfu_operation.op, VADD, clk_i, rst_ni); + `FF(vfu_operation_valid_q, vfu_operation_valid, 1'b0, clk_i, rst_ni) + `FF(vfu_operation_op_q, vfu_operation.op, VADD, clk_i, rst_ni) always_comb begin sldu_addrgen_sel_d = SLDU_SEL; diff --git a/hardware/src/lane/lane_sequencer.sv b/hardware/src/lane/lane_sequencer.sv index a9fa44e32..cdf77f1e0 100644 --- a/hardware/src/lane/lane_sequencer.sv +++ b/hardware/src/lane/lane_sequencer.sv @@ -52,7 +52,7 @@ module lane_sequencer import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg:: `include "common_cells/registers.svh" // STU exception support - 
`FF(lsu_ex_flush_o, lsu_ex_flush_i, 1'b0, clk_i, rst_ni); + `FF(lsu_ex_flush_o, lsu_ex_flush_i, 1'b0, clk_i, rst_ni) //////////////////////////// // Register the request // @@ -663,9 +663,9 @@ module lane_sequencer import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg:: end VSLIDEDOWN: begin // Extra elements to ask, because of the stride - logic [$clog2(8*NrLanes)-1:0] extra_stride; + automatic logic [$clog2(8*NrLanes)-1:0] extra_stride; // Need one bit more than vl, since we will also add the stride contribution - logic [$bits(pe_req.vl):0] vl_tot; + automatic logic [$bits(pe_req.vl):0] vl_tot; // We need to trim full words from the start of the vector that are not used // as operands by the slide unit. @@ -744,18 +744,16 @@ module lane_sequencer import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg:: // todo: // Mask logical and integer comparisons - operand_request[AluA] = '{ - id : pe_req.id, - vs : pe_req.vs1, - scale_vl: pe_req.scale_vl, - vtype : pe_req.vtype, - vstart : vfu_operation_d.vstart, - hazard : pe_req.hazard_vs1 | pe_req.hazard_vd, - target_fu : ALU_SLDU, - conv : OpQueueConversionNone, - cvt_resize: CVT_SAME, - default : '0 - }; + operand_request[AluA] = '0; + operand_request[AluA].id = pe_req.id; + operand_request[AluA].vs = pe_req.vs1; + operand_request[AluA].scale_vl = pe_req.scale_vl; + operand_request[AluA].vtype = pe_req.vtype; + operand_request[AluA].vstart = vfu_operation_d.vstart; + operand_request[AluA].hazard = pe_req.hazard_vs1 | pe_req.hazard_vd; + operand_request[AluA].target_fu = ALU_SLDU; + operand_request[AluA].conv = OpQueueConversionNone; + operand_request[AluA].cvt_resize = CVT_SAME; // Since this request goes outside of the lane, we might need to request an // extra operand regardless of whether it is valid in this lane or not. 
@@ -858,18 +856,16 @@ module lane_sequencer import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg:: // Vd register to provide correct mask undisturbed policy at bit-level // This is can be a mask or normal register - operand_request[MaskB] = '{ - id : pe_req.id, - vs : pe_req.vd, - scale_vl: pe_req.scale_vl, - vtype : pe_req.vtype, - vstart : vfu_operation_d.vstart, - hazard : pe_req.hazard_vd, - target_fu : ALU_SLDU, - conv : OpQueueConversionNone, - cvt_resize: CVT_SAME, - default : '0 - }; + operand_request[MaskB] = '0; + operand_request[MaskB].id = pe_req.id; + operand_request[MaskB].vs = pe_req.vd; + operand_request[MaskB].scale_vl = pe_req.scale_vl; + operand_request[MaskB].vtype = pe_req.vtype; + operand_request[MaskB].vstart = vfu_operation_d.vstart; + operand_request[MaskB].hazard = pe_req.hazard_vd; + operand_request[MaskB].target_fu = ALU_SLDU; + operand_request[MaskB].conv = OpQueueConversionNone; + operand_request[MaskB].cvt_resize = CVT_SAME; // vl and eew depend on the real eew on which we are working on if (pe_req.op inside {VIOTA,VID}) begin // Non-mask layout @@ -938,16 +934,14 @@ module lane_sequencer import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg:: // VRGATHER and VCOMPRESS access the opreq with ad-hoc requests if (vrgat_state_q == REQUESTING) begin // Here, we are sure the MaskB operand_request is free - operand_request[MaskB] = '{ - vs : masku_vrgat_req_q.vs, - eew : masku_vrgat_req_q.eew, - scale_vl : 1'b0, - cvt_resize : pe_req.cvt_resize, - vl : 1, - vstart : masku_vrgat_req_q.idx, - hazard : '0, - default : '0 - }; + operand_request[MaskB] = '0; + operand_request[MaskB].vs = masku_vrgat_req_q.vs; + operand_request[MaskB].eew = masku_vrgat_req_q.eew; + operand_request[MaskB].scale_vl = 1'b0; + operand_request[MaskB].cvt_resize = pe_req.cvt_resize; + operand_request[MaskB].vl = 1; + operand_request[MaskB].vstart = masku_vrgat_req_q.idx; + operand_request[MaskB].hazard = '0; operand_request_push[MaskB] = masku_vrgat_req_ready_d; 
end end: sequencer diff --git a/hardware/src/lane/operand_queues_stage.sv b/hardware/src/lane/operand_queues_stage.sv index a0b750f1e..09369ba26 100644 --- a/hardware/src/lane/operand_queues_stage.sv +++ b/hardware/src/lane/operand_queues_stage.sv @@ -58,7 +58,7 @@ module operand_queues_stage import ara_pkg::*; import rvv_pkg::*; import cf_math `include "common_cells/registers.svh" // STU flush support - `FF(lsu_ex_flush_o, lsu_ex_flush_i, 1'b0, clk_i, rst_ni); + `FF(lsu_ex_flush_o, lsu_ex_flush_i, 1'b0, clk_i, rst_ni) /////////// // ALU // diff --git a/hardware/src/lane/power_gating_generic.sv b/hardware/src/lane/power_gating_generic.sv index 928e2625d..bd7797458 100644 --- a/hardware/src/lane/power_gating_generic.sv +++ b/hardware/src/lane/power_gating_generic.sv @@ -19,6 +19,6 @@ module power_gating_generic #( // Gate with an AND assign en_wide = en_i ? T'('1) : T'('0); - assign out_o = T'(in_i & en_wide); + assign out_o = in_i & en_wide; endmodule diff --git a/hardware/src/lane/simd_alu.sv b/hardware/src/lane/simd_alu.sv index b97016542..33a752388 100644 --- a/hardware/src/lane/simd_alu.sv +++ b/hardware/src/lane/simd_alu.sv @@ -55,7 +55,6 @@ module simd_alu import ara_pkg::*; import rvv_pkg::*; #( alu_sat_operand_t sat_sum, sat_sub; vxsat_t vxsat; vxrm_t vxrm; - logic r; assign vxrm = vxrm_i; assign vxsat_o = vxsat; @@ -183,7 +182,9 @@ module simd_alu import ara_pkg::*; import rvv_pkg::*; #( res.w64[b] = &vxsat.w64[b] ? (sum[63] ? {1'b0, {63{1'b1}}} : {1'b1, {63{1'b0}}} ) : sum[63:0]; end endcase - VAADD, VAADDU: if (FixPtSupport == FixedPointEnable) unique case (vew_i) + VAADD, VAADDU: if (FixPtSupport == FixedPointEnable) begin + automatic logic r; + unique case (vew_i) EW8: for (int b = 0; b < 8; b++) begin automatic logic [ 8:0] sum = opa.w8 [b] + opb.w8 [b]; unique case (vxrm) @@ -224,48 +225,48 @@ module simd_alu import ara_pkg::*; import rvv_pkg::*; #( endcase res.w64[b] = (op_i == VAADDU) ? 
sum[64:1] + r : {sum[63], sum[63:1]} + r; end - endcase + endcase end VADD, VADC, VMADC, VREDSUM, VWREDSUMU, VWREDSUM: unique case (vew_i) EW8: for (int b = 0; b < 8; b++) begin automatic logic [ 8:0] sum = opa.w8 [b] + opb.w8 [b] + - logic'(op_i inside {VADC, VMADC} && mask_i[1*b] & ~vm_i); + 8'(op_i inside {VADC, VMADC} && mask_i[1*b] & ~vm_i); res.w8[b] = (op_i == VMADC) ? {6'b0, 1'b1, sum[8]} : sum[7:0]; end EW16: for (int b = 0; b < 4; b++) begin automatic logic [16:0] sum = opa.w16[b] + opb.w16[b] + - logic'(op_i inside {VADC, VMADC} && mask_i[2*b] & ~vm_i); + 16'(op_i inside {VADC, VMADC} && mask_i[2*b] & ~vm_i); res.w16[b] = (op_i == VMADC) ? {14'b0, 1'b1, sum[16]} : sum[15:0]; end EW32: for (int b = 0; b < 2; b++) begin automatic logic [32:0] sum = opa.w32[b] + opb.w32[b] + - logic'(op_i inside {VADC, VMADC} && mask_i[4*b] & ~vm_i); + 32'(op_i inside {VADC, VMADC} && mask_i[4*b] & ~vm_i); res.w32[b] = (op_i == VMADC) ? {30'b0, 1'b1, sum[32]} : sum[31:0]; end EW64: for (int b = 0; b < 1; b++) begin automatic logic [64:0] sum = opa.w64[b] + opb.w64[b] + - logic'(op_i inside {VADC, VMADC} && mask_i[8*b] & ~vm_i); + 64'(op_i inside {VADC, VMADC} && mask_i[8*b] & ~vm_i); res.w64[b] = (op_i == VMADC) ? {62'b0, 1'b1, sum[64]} : sum[63:0]; end endcase VSUB, VSBC, VMSBC: unique case (vew_i) EW8: for (int b = 0; b < 8; b++) begin automatic logic [ 8:0] sub = opb.w8 [b] - opa.w8 [b] - - logic'(op_i inside {VSBC, VMSBC} && mask_i[1*b] & ~vm_i); + 8'(op_i inside {VSBC, VMSBC} && mask_i[1*b] & ~vm_i); res.w8[b] = (op_i == VMSBC) ? {6'b0, 1'b1, sub[8]} : sub[7:0]; end EW16: for (int b = 0; b < 4; b++) begin automatic logic [16:0] sub = opb.w16[b] - opa.w16[b] - - logic'(op_i inside {VSBC, VMSBC} && mask_i[2*b] & ~vm_i); + 16'(op_i inside {VSBC, VMSBC} && mask_i[2*b] & ~vm_i); res.w16[b] = (op_i == VMSBC) ? 
{14'b0, 1'b1, sub[16]} : sub[15:0]; end EW32: for (int b = 0; b < 2; b++) begin automatic logic [32:0] sub = opb.w32[b] - opa.w32[b] - - logic'(op_i inside {VSBC, VMSBC} && mask_i[4*b] & ~vm_i); + 32'(op_i inside {VSBC, VMSBC} && mask_i[4*b] & ~vm_i); res.w32[b] = (op_i == VMSBC) ? {30'b0, 1'b1, sub[32]} : sub[31:0]; end EW64: for (int b = 0; b < 1; b++) begin automatic logic [64:0] sub = opb.w64[b] - opa.w64[b] - - logic'(op_i inside {VSBC, VMSBC} && mask_i[8*b] & ~vm_i); + 64'(op_i inside {VSBC, VMSBC} && mask_i[8*b] & ~vm_i); res.w64[b] = (op_i == VMSBC) ? {62'b0, 1'b1, sub[64]} : sub[63:0]; end endcase @@ -308,22 +309,24 @@ module simd_alu import ara_pkg::*; import rvv_pkg::*; #( automatic logic [16:0] sub = opb.w16[b] - opa.w16[b]; vxsat.w16[b] = (!opb.w16[b][15] & opa.w16[b][15] & sub[15]) | (opb.w16[b][15] & !opa.w16[b][15] & !sub[15]); - res.w16[b] = vxsat.w16[b] ? (opb.w16[b][15] ? 16'h8000 : 16'h7FFF) : sub[15:0]; + res.w16[b] = |vxsat.w16[b] ? (opb.w16[b][15] ? 16'h8000 : 16'h7FFF) : sub[15:0]; /* OR-reduce: same truth value as the former implicit test of vxsat */ end EW32: for (int b = 0; b < 2; b++) begin automatic logic [32:0] sub = opb.w32[b] - opa.w32[b]; vxsat.w32[b] = (!opb.w32[b][31] & opa.w32[b][31] & sub[31]) | (opb.w32[b][31] & !opa.w32[b][31] & !sub[31]); - res.w32[b] = vxsat.w32[b] ? (opb.w32[b][31] ? 32'h80000000 : 32'h7FFFFFFF) : sub[31:0]; + res.w32[b] = |vxsat.w32[b] ? (opb.w32[b][31] ? 32'h80000000 : 32'h7FFFFFFF) : sub[31:0]; end EW64: for (int b = 0; b < 1; b++) begin automatic logic [64:0] sub = opb.w64[b] - opa.w64[b]; vxsat.w64[b] = (!opb.w64[b][63] & opa.w64[b][63] & sub[63]) | (opb.w64[b][63] & !opa.w64[b][63] & !sub[63]); - res.w64[b] = vxsat.w64[b] ? (opb.w64[b][63] ? 64'h8000000000000000 : 64'h7FFFFFFFFFFFFFFF) : sub[63:0]; + res.w64[b] = |vxsat.w64[b] ? (opb.w64[b][63] ? 
64'h8000000000000000 : 64'h7FFFFFFFFFFFFFFF) : sub[63:0]; end endcase - VASUB, VASUBU: if (FixPtSupport == FixedPointEnable) unique case (vew_i) + VASUB, VASUBU: if (FixPtSupport == FixedPointEnable) begin + automatic logic r; + unique case (vew_i) EW8: for (int b = 0; b < 8; b++) begin automatic logic [ 8:0] sub = opb.w8 [b] - opa.w8 [b]; unique case (vxrm) @@ -332,7 +335,7 @@ module simd_alu import ara_pkg::*; import rvv_pkg::*; #( 2'b10: r = 1'b0; 2'b11: r = !sub[1] & (sub[0]!=0); endcase - res.w8[b] = (op_i == VASUBU) ? (sub[7:0] >> 1) + r : ($signed(sub[7:0]) >>> 1) + r; + res.w8[b] = (op_i == VASUBU) ? (sub[7:0] >> 1) + {7'b0, r} : $unsigned(($signed(sub[7:0]) >>> 1) + $signed({7'b0, r})); end EW16: for (int b = 0; b < 4; b++) begin automatic logic [ 16:0] sub = opb.w16[b] - opa.w16[b]; @@ -342,7 +345,7 @@ module simd_alu import ara_pkg::*; import rvv_pkg::*; #( 2'b10: r = 1'b0; 2'b11: r = !sub[1] & (sub[0]!=0); endcase - res.w16[b] = (op_i == VASUBU) ? (sub[15:0] >> 1) + r : ($signed(sub[15:0]) >>> 1) + r; + res.w16[b] = (op_i == VASUBU) ? (sub[15:0] >> 1) + {15'b0, r} : $unsigned(($signed(sub[15:0]) >>> 1) + $signed({15'b0, r})); end EW32: for (int b = 0; b < 2; b++) begin automatic logic [ 32:0] sub = opb.w32[b] - opa.w32[b]; @@ -352,7 +355,7 @@ module simd_alu import ara_pkg::*; import rvv_pkg::*; #( 2'b10: r = 1'b0; 2'b11: r = !sub[1] & (sub[0]!=0); endcase - res.w32[b] = (op_i == VASUBU) ? (sub[31:0] >> 1) + r : ($signed(sub[31:0]) >>> 1) + r; + res.w32[b] = (op_i == VASUBU) ? (sub[31:0] >> 1) + {31'b0, r} : $unsigned(($signed(sub[31:0]) >>> 1) + $signed({31'b0, r})); end EW64: for (int b = 0; b < 1; b++) begin automatic logic [ 64:0] sub = opb.w64[b] - opa.w64[b]; @@ -362,9 +365,9 @@ module simd_alu import ara_pkg::*; import rvv_pkg::*; #( 2'b10: r = 1'b0; 2'b11: r = !sub[1] & (sub[0]!=0); endcase - res.w64[b] = (op_i == VASUBU) ? (sub[63:0] >> 1) + r : ($signed(sub[63:0]) >>> 1) + r; + res.w64[b] = (op_i == VASUBU) ? 
(sub[63:0] >> 1) + {63'b0, r} : $unsigned(($signed(sub[63:0]) >>> 1) + $signed({63'b0, r})); end - endcase + endcase end // Shift instructions VSLL: unique case (vew_i) @@ -443,19 +446,19 @@ module simd_alu import ara_pkg::*; import rvv_pkg::*; #( // Fixed point clip instructions VNCLIP: if (FixPtSupport == FixedPointEnable) unique case (vew_i) EW8 : for (int b = 0; b < 4; b++) begin - automatic logic [15:0] clip = $signed(opb.w16[b]) >>> opa.w16[b][3:0]; + automatic logic [15:0] clip = $unsigned($signed(opb.w16[b]) >>> opa.w16[b][3:0]); vxsat.w8[b] = |clip[15:8]; - res.w8 [2*b + narrowing_select_i] = ($signed(opb.w16[b]) >>> opa.w16[b][3:0]) + rm[b]; + res.w8 [2*b + narrowing_select_i] = $unsigned(($signed(opb.w16[b]) >>> opa.w16[b][3:0]) + $signed({1'b0, rm[b]})); /* zero-pad rm[b] so $signed never sign-extends the rounding increment */ end EW16: for (int b = 0; b < 2; b++) begin - automatic logic [31:0] clip = $signed(opb.w32[b]) >>> opa.w32[b][4:0]; + automatic logic [31:0] clip = $unsigned($signed(opb.w32[b]) >>> opa.w32[b][4:0]); vxsat.w8[b] = |clip[31:16]; - res.w16[2*b + narrowing_select_i] = ($signed(opb.w32[b]) >>> opa.w32[b][4:0]) + rm[b]; + res.w16[2*b + narrowing_select_i] = $unsigned(($signed(opb.w32[b]) >>> opa.w32[b][4:0]) + $signed({1'b0, rm[b]})); end EW32: for (int b = 0; b < 1; b++) begin automatic logic [63:0] clip = $signed(opb.w64[b]) >>> opa.w64[b][5:0]; vxsat.w8[b] = |clip[63:32]; - res.w32[2*b + narrowing_select_i] = ($signed(opb.w64[b]) >>> opa.w64[b][5:0]) + rm[b]; + res.w32[2*b + narrowing_select_i] = $unsigned(($signed(opb.w64[b]) >>> opa.w64[b][5:0]) + $signed({1'b0, rm[b]})); end endcase VNCLIPU: if (FixPtSupport == FixedPointEnable) unique case (vew_i) diff --git a/hardware/src/lane/valu.sv b/hardware/src/lane/valu.sv index 623f3c40a..561e38a9c 100644 --- a/hardware/src/lane/valu.sv +++ b/hardware/src/lane/valu.sv @@ -317,7 +317,7 @@ module valu import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg::idx_width; // The ALU has completed a reduction logic alu_red_complete_d; - `FF(alu_red_complete_o, alu_red_complete_d, 1'b0, 
clk_i, rst_ni); + `FF(alu_red_complete_o, alu_red_complete_d, 1'b0, clk_i, rst_ni) // Signal to indicate the state of the ALU typedef enum logic [2:0] {NO_REDUCTION, INTRA_LANE_REDUCTION, INTER_LANES_REDUCTION_RX, INTER_LANES_REDUCTION_TX, LN0_REDUCTION_COMMIT, SIMD_REDUCTION} alu_state_e; diff --git a/hardware/src/lane/vmfpu.sv b/hardware/src/lane/vmfpu.sv index c12598649..cdf8a7d48 100644 --- a/hardware/src/lane/vmfpu.sv +++ b/hardware/src/lane/vmfpu.sv @@ -280,10 +280,6 @@ module vmfpu import ara_pkg::*; import rvv_pkg::*; import fpnew_pkg::*; logic narrowing_select_in_d, narrowing_select_in_q; // Output selector, used to control the Result MUX and validate the results logic narrowing_select_out_d, narrowing_select_out_q; - // FPU SIMD result needs to be shuffled for narrowing instructions before commit - elen_t narrowing_shuffled_result; - // Helper signal to shuffle the narrowed result - logic [7:0] narrowing_shuffle_be; ////////////////// // Multiplier // @@ -337,17 +333,17 @@ module vmfpu import ara_pkg::*; import rvv_pkg::*; import fpnew_pkg::*; ~vmul_simd_in_valid[vinsn_issue_q.vtype.vsew]; `FFLARNC(vmul_simd_op_a_q, vinsn_issue_q.use_scalar_op ? 
scalar_op : mfpu_operand_i[0], - gate_ff_en, gate_ff_clr, '0, clk_i_gated, rst_ni); + gate_ff_en, gate_ff_clr, '0, clk_i_gated, rst_ni) `FFLARNC(vmul_simd_op_b_q, mfpu_operand_i[1], - gate_ff_en, gate_ff_clr, '0, clk_i_gated, rst_ni); + gate_ff_en, gate_ff_clr, '0, clk_i_gated, rst_ni) `FFLARNC(vmul_simd_op_c_q, mfpu_operand_i[2], - gate_ff_en, gate_ff_clr, '0, clk_i_gated, rst_ni); + gate_ff_en, gate_ff_clr, '0, clk_i_gated, rst_ni) `FFLARNC(vmul_simd_mask_q, mask_i, - gate_ff_en, gate_ff_clr, '0, clk_i_gated, rst_ni); + gate_ff_en, gate_ff_clr, '0, clk_i_gated, rst_ni) `FFLARNC(vmul_simd_op_q, vinsn_issue_q.op, - gate_ff_en, gate_ff_clr, ara_op_e'('0), clk_i_gated, rst_ni); + gate_ff_en, gate_ff_clr, ara_op_e'('0), clk_i_gated, rst_ni) `FFLARNC(vmul_simd_in_valid_q, vmul_simd_in_valid, - gate_ff_en, gate_ff_clr, '0, clk_i_gated, rst_ni); + gate_ff_en, gate_ff_clr, '0, clk_i_gated, rst_ni) for (genvar i = 0; i < 4; i++) begin `ifdef GF22 @@ -616,7 +612,7 @@ module vmfpu import ara_pkg::*; import rvv_pkg::*; import fpnew_pkg::*; // Inform the lane SLDU/ADDRGEN arbiter that this reduction is over logic fpu_red_complete_d; - `FF(fpu_red_complete_o, fpu_red_complete_d, 1'b0, clk_i, rst_ni); + `FF(fpu_red_complete_o, fpu_red_complete_d, 1'b0, clk_i, rst_ni) // Signal to indicate the state of the MFPU typedef enum logic [2:0] { @@ -848,7 +844,7 @@ module vmfpu import ara_pkg::*; import rvv_pkg::*; import fpnew_pkg::*; EnableVectors: 1'b1, EnableNanBox : 1'b1, FpFmtMask : {RVVF(FPUSupport), RVVD(FPUSupport), RVVH(FPUSupport), RVVB(FPUSupport), RVVHA(FPUSupport), RVVBA(FPUSupport)}, - IntFmtMask : {logic'(RVVB(FPUSupport) || RVVBA(FPUSupport)), 1'b1, 1'b1, 1'b1} + IntFmtMask : {RVVB(FPUSupport) || RVVBA(FPUSupport), 1'b1, 1'b1, 1'b1} }; // Implementation (number of registers etc) @@ -1124,20 +1120,12 @@ module vmfpu import ara_pkg::*; import rvv_pkg::*; import fpnew_pkg::*; fpu_mask_t vfpu_flag_mask; - vf7_flag_out_e16 vfrec7_out_e16[4]; - vf7_flag_out_e32 
vfrec7_out_e32[2]; - vf7_flag_out_e64 vfrec7_out_e64[1]; - status_t vfrec7_ex_flag, vfrsqrt7_ex_flag; roundmode_e fp_rm_process; elen_t [LatFNonComp:0] operand_a_d, vfpu_flag_mask_d; - vf7_flag_out_e16 vfrsqrt7_out_e16[4]; - vf7_flag_out_e32 vfrsqrt7_out_e32[2]; - vf7_flag_out_e64 vfrsqrt7_out_e64[1]; - logic [15:0] lzc_e16; logic [9:0] lzc_e32; logic [5:0] lzc_e64; @@ -1153,10 +1141,10 @@ module vmfpu import ara_pkg::*; import rvv_pkg::*; import fpnew_pkg::*; assign vfpu_flag_mask_d[0]= vfpu_simd_mask; for (genvar i = 0; i < LatFNonComp; i++) begin - `FF(operand_a_d[i+1], operand_a_d[i], '0, clk_i, rst_ni); + `FF(operand_a_d[i+1], operand_a_d[i], '0, clk_i, rst_ni) - `FF(vfpu_flag_mask_d[i+1], vfpu_flag_mask_d[i],'0,clk_i,rst_ni); - end + `FF(vfpu_flag_mask_d[i+1], vfpu_flag_mask_d[i],'0,clk_i,rst_ni) + end assign operand_a_delay = operand_a_d[LatFNonComp]; assign vfpu_flag_mask = vfpu_flag_mask_d[LatFNonComp]; @@ -1205,6 +1193,7 @@ module vmfpu import ara_pkg::*; import rvv_pkg::*; import fpnew_pkg::*; // vfrec7 (only supported on 16, 32, 64-bit) unique case (vinsn_processing_q.vtype.vsew) EW16: begin + automatic vf7_flag_out_e16 vfrec7_out_e16[4]; for (int h = 0; h < 4; h++) vfrec7_out_e16[h] = vfrec7_fp16(vfpu_result[h*16 +: 10], operand_a_delay[h*16 +: 16], fp_rm_process); @@ -1217,6 +1206,7 @@ module vmfpu import ara_pkg::*; import rvv_pkg::*; import fpnew_pkg::*; | (vfrec7_out_e16[0].ex_flag & {5{vfpu_flag_mask[0]}}); end EW32: begin + automatic vf7_flag_out_e32 vfrec7_out_e32[2]; for (int w = 0; w < 2; w++) vfrec7_out_e32[w] = vfrec7_fp32(vfpu_result[w*32 +: 10], operand_a_delay[w*32 +: 32], fp_rm_process); @@ -1226,6 +1216,7 @@ module vmfpu import ara_pkg::*; import rvv_pkg::*; import fpnew_pkg::*; | (vfrec7_out_e32[0].ex_flag & {5{vfpu_flag_mask[0]}}); end EW64: begin + automatic vf7_flag_out_e64 vfrec7_out_e64[1]; for (int d = 0; d < 1; d++) vfrec7_out_e64[d] = vfrec7_fp64(vfpu_result[d*64 +: 10], operand_a_delay[d*64 +: 64], fp_rm_process); @@ -1242,6 
+1233,7 @@ module vmfpu import ara_pkg::*; import rvv_pkg::*; import fpnew_pkg::*; // vfrsqrt7 (only supported on 16, 32, 64-bit) unique case (vinsn_processing_q.vtype.vsew) EW16: begin + automatic vf7_flag_out_e16 vfrsqrt7_out_e16[4]; for (int h = 0; h < 4; h++) vfrsqrt7_out_e16[h] = vfrsqrt7_fp16(vfpu_result[h*16 +: 10], operand_a_delay[h*16 +: 16], lzc_e16[h*4 +: 4]); @@ -1254,6 +1246,7 @@ module vmfpu import ara_pkg::*; import rvv_pkg::*; import fpnew_pkg::*; | (vfrsqrt7_out_e16[0].ex_flag & {5{vfpu_flag_mask[0]}}); end EW32: begin + automatic vf7_flag_out_e32 vfrsqrt7_out_e32[2]; for (int w = 0; w < 2; w++) vfrsqrt7_out_e32[w] = vfrsqrt7_fp32(vfpu_result[w*32 +: 10], operand_a_delay[w*32 +: 32], lzc_e32[w*5 +: 5]); @@ -1263,6 +1256,7 @@ module vmfpu import ara_pkg::*; import rvv_pkg::*; import fpnew_pkg::*; | (vfrsqrt7_out_e32[0].ex_flag & {5{vfpu_flag_mask[0]}}); end EW64: begin + automatic vf7_flag_out_e64 vfrsqrt7_out_e64[1]; for (int d = 0; d < 1; d++) vfrsqrt7_out_e64[d] = vfrsqrt7_fp64(vfpu_result[d*64 +: 10], operand_a_delay[d*64 +: 64], lzc_e64[d*6 +: 6]); @@ -1490,6 +1484,11 @@ module vmfpu import ara_pkg::*; import rvv_pkg::*; import fpnew_pkg::*; case (mfpu_state_q) NO_REDUCTION: begin + // FPU SIMD result needs to be shuffled for narrowing instructions before commit + automatic elen_t narrowing_shuffled_result; + // Helper signal to shuffle the narrowed result + automatic logic [7:0] narrowing_shuffle_be; + vfpu_tag_in = mask_i; // Sign injection diff --git a/hardware/src/masku/masku.sv b/hardware/src/masku/masku.sv index d2fdccdc2..7202d5ccc 100644 --- a/hardware/src/masku/masku.sv +++ b/hardware/src/masku/masku.sv @@ -803,9 +803,9 @@ module masku import ara_pkg::*; import rvv_pkg::*; #( // The vd source can have a different encoding (it gets deshuffled in the masku_operand stage) [VRGATHER:VCOMPRESS]: begin // Buffer for the current element - logic [NrLanes*DataWidth-1:0] vrgat_res; + automatic logic [NrLanes*DataWidth-1:0] vrgat_res; // Buffer 
for the current element - logic [DataWidth-1:0] vrgat_buf; + automatic logic [DataWidth-1:0] vrgat_buf; // Extract the correct elements vrgat_res = '1; // Default assignment diff --git a/hardware/src/sldu/p2_stride_gen.sv b/hardware/src/sldu/p2_stride_gen.sv index 7919e29da..e3703c70e 100644 --- a/hardware/src/sldu/p2_stride_gen.sv +++ b/hardware/src/sldu/p2_stride_gen.sv @@ -37,10 +37,10 @@ module p2_stride_gen import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg::i assign valid_o = ~next_stride_zero_q; assign spare_stride_d = next_stride; - `FFL( popc_q, popc_d, ff_en, '0); - `FFL(next_stride_first_q, next_stride_first_d, ff_en, '0); - `FFL( next_stride_zero_q, next_stride_zero_d, ff_en, '0); - `FFL( spare_stride_q, spare_stride_d, ff_en, '0); + `FFL( popc_q, popc_d, ff_en, '0) + `FFL(next_stride_first_q, next_stride_first_d, ff_en, '0) + `FFL( next_stride_zero_q, next_stride_zero_d, ff_en, '0) + `FFL( spare_stride_q, spare_stride_d, ff_en, '0) // Is the stride power of two? popcount #( diff --git a/hardware/src/sldu/sldu.sv b/hardware/src/sldu/sldu.sv index f582cdf4f..f493d10b1 100644 --- a/hardware/src/sldu/sldu.sv +++ b/hardware/src/sldu/sldu.sv @@ -612,7 +612,7 @@ module sldu import ara_pkg::*; import rvv_pkg::*; #( // Filled up a word to the VRF or finished the instruction if (out_pnt_d == NrLanes * 8 || issue_cnt_q <= byte_count) begin // Reset the pointer - out_pnt_d = vinsn_issue_q.vfu inside {VFU_Alu, VFU_MFpu} ? {'0, red_stride_cnt_d, 3'b0} : '0; + out_pnt_d = vinsn_issue_q.vfu inside {VFU_Alu, VFU_MFpu} ? 
{{idx_width(NrLanes*(StrbWidth-1)){1'b0}}, red_stride_cnt_d, 3'b0} : '0; // We used all the bits of the mask if (vinsn_issue_q.op inside {VSLIDEUP, VSLIDEDOWN}) mask_ready_d = !vinsn_issue_q.vm; @@ -754,7 +754,7 @@ module sldu import ara_pkg::*; import rvv_pkg::*; #( // Update the p2 stride p2_stride_gen_update_d = 1'b1; // Commit the final result - if (p2_stride_gen_popc_q == {'0, 1'b1} && result_queue_empty) begin + if (p2_stride_gen_popc_q == {{(idx_width(idx_width(8*NrLanes))-1){1'b0}}, 1'b1} && result_queue_empty) begin state_d = SLIDE_NP2_COMMIT; // Prepare the write pointer result_queue_write_pnt_d = NP2_RESULT_PNT; diff --git a/hardware/src/vlsu/addrgen.sv b/hardware/src/vlsu/addrgen.sv index 1ba67f650..b59487704 100644 --- a/hardware/src/vlsu/addrgen.sv +++ b/hardware/src/vlsu/addrgen.sv @@ -970,7 +970,7 @@ module addrgen import ara_pkg::*; import rvv_pkg::*; #( // Generate an error idx_op_error_d = 1'b1; // Forward next vstart info to the dispatcher - addrgen_exception_vstart_d = (addrgen_req.len - axi_addrgen_q.len) >> axi_addrgen_q.vew - 1; + addrgen_exception_vstart_d = (addrgen_req.len - axi_addrgen_q.len) >> (axi_addrgen_q.vew - 1); addrgen_req_ready = 1'b1; axi_addrgen_state_d = AXI_ADDRGEN_IDLE; end : eew_misaligned_error diff --git a/hardware/src/vlsu/vldu.sv b/hardware/src/vlsu/vldu.sv index 37d7b8782..519ceb2a1 100644 --- a/hardware/src/vlsu/vldu.sv +++ b/hardware/src/vlsu/vldu.sv @@ -234,8 +234,6 @@ module vldu import ara_pkg::*; import rvv_pkg::*; #( logic [idx_width(AxiDataWidth/8):0] axi_r_byte_pnt_d, axi_r_byte_pnt_q; // - A pointer to which byte in the full VRF word we are writing data into. logic [idx_width(DataWidth*NrLanes/8):0] vrf_word_byte_pnt_d, vrf_word_byte_pnt_q; - // - A pointer that indicates the start byte in the vrf word. 
- logic [$clog2(8*NrLanes)-1:0] vrf_word_start_byte; // A counter that follows the vrf_word_byte_pnt pointer, but without the vstart information // We can compare this counter witht the issue_cnt_bytes counter to find the last byte in @@ -421,7 +419,7 @@ module vldu import ara_pkg::*; import rvv_pkg::*; #( vrf_word_byte_pnt_d = '0; vrf_word_byte_cnt_d = '0; // Account for the results that were issued - if (seq_word_wr_offset_q) begin + if (seq_word_wr_offset_q != '0) begin vrf_eff_write_bytes = (NrLanes * DataWidthB); end else begin // First payload of the vector instruction @@ -464,6 +462,8 @@ module vldu import ara_pkg::*; import rvv_pkg::*; #( // Prepare for the next vector instruction if (vinsn_queue_d.issue_cnt != 0) begin : issue_cnt_bytes_update + // - A pointer that indicates the start byte in the vrf word. + automatic logic [$clog2(8*NrLanes)-1:0] vrf_word_start_byte; issue_cnt_bytes_d = ( vinsn_queue_q.vinsn[vinsn_queue_d.issue_pnt].vl - vinsn_queue_q.vinsn[vinsn_queue_d.issue_pnt].vstart @@ -649,6 +649,8 @@ module vldu import ara_pkg::*; import rvv_pkg::*; #( // New instruction with new vstart. Initialize the vrf byte ptr if (vinsn_queue_d.issue_cnt == '0) begin + // - A pointer that indicates the start byte in the vrf word. 
+ automatic logic [$clog2(8*NrLanes)-1:0] vrf_word_start_byte; vrf_word_start_byte = pe_req_i.vstart[$clog2(8*NrLanes)-1:0] << pe_req_i.vtype.vsew; vrf_word_byte_pnt_d = {1'b0, vrf_word_start_byte[$clog2(8*NrLanes)-1:0]}; vrf_word_byte_cnt_d = '0; diff --git a/hardware/src/vlsu/vlsu.sv b/hardware/src/vlsu/vlsu.sv index 27397fa35..8cb33786e 100644 --- a/hardware/src/vlsu/vlsu.sv +++ b/hardware/src/vlsu/vlsu.sv @@ -109,7 +109,7 @@ module vlsu import ara_pkg::*; import rvv_pkg::*; #( logic stu_current_burst_exception, ldu_current_burst_exception; assign lsu_current_burst_exception_o = stu_current_burst_exception | ldu_current_burst_exception; - `FF(lsu_ex_flush_done_o, lsu_ex_flush_i, 1'b0, clk_i, rst_ni); + `FF(lsu_ex_flush_done_o, lsu_ex_flush_i, 1'b0, clk_i, rst_ni) /////////////////// // Definitions // diff --git a/hardware/src/vlsu/vstu.sv b/hardware/src/vlsu/vstu.sv index 845c81a37..811589139 100644 --- a/hardware/src/vlsu/vstu.sv +++ b/hardware/src/vlsu/vstu.sv @@ -227,7 +227,6 @@ module vstu import ara_pkg::*; import rvv_pkg::*; #( // When vstart > 0, the very first payload written to the VRF contains less than // (8 * NrLanes) bytes. 
logic [$clog2(8*NrLanes):0] first_payload_byte_d, first_payload_byte_q; - logic [$clog2(8*NrLanes):0] vrf_eff_write_bytes; // A counter that follows the vrf_word_byte_pnt pointer, but without the vstart information // We can compare this counter witht the issue_cnt_bytes counter to find the last byte in @@ -367,6 +366,7 @@ module vstu import ara_pkg::*; import rvv_pkg::*; #( // We consumed a whole word from the lanes if (vrf_pnt_d == NrLanes*8 || vrf_cnt_d == issue_cnt_bytes_q) begin : vrf_word_done + automatic logic [$clog2(8*NrLanes):0] vrf_eff_write_bytes; // Reset the pointer in the VRF word vrf_pnt_d = '0; vrf_cnt_d = '0; From 94578c5669ec9dbadf66ca576ee972e28e70e041 Mon Sep 17 00:00:00 2001 From: Michael Rogenmoser Date: Tue, 2 Jun 2026 02:04:33 +0200 Subject: [PATCH 4/6] [hardware] Keep latch-fix temporaries as named signals PR review (hopang-0221): the automatic variables introduced to silence slang's -Winferred-latch are not visible in waveforms, since they do not exist as named signals in the design hierarchy. Restore them as module-scope signals and break the inferred latches with unconditional default assignments at the top of their always_comb blocks instead. No functional change: each of these is always assigned before it is read on the path that uses it, so the defaults only affect the otherwise-unread latch paths. Affected: simd_alu (r), lane_sequencer (extra_stride/vl_tot), masku (vrgat_res/vrgat_buf), vldu (vrf_word_start_byte), vstu (vrf_eff_write_bytes), vmfpu (vfrec7/vfrsqrt7 scratch arrays, narrowing_shuffled_result/narrowing_shuffle_be). 
Co-Authored-By: Claude Opus 4.8 (1M context) --- hardware/src/lane/lane_sequencer.sv | 15 +++++++++----- hardware/src/lane/simd_alu.sv | 14 ++++++------- hardware/src/lane/vmfpu.sv | 32 +++++++++++++++++++---------- hardware/src/masku/masku.sv | 15 +++++++++----- hardware/src/vlsu/vldu.sv | 7 +++---- hardware/src/vlsu/vstu.sv | 3 ++- 6 files changed, 52 insertions(+), 34 deletions(-) diff --git a/hardware/src/lane/lane_sequencer.sv b/hardware/src/lane/lane_sequencer.sv index cdf77f1e0..ceeb9c8e1 100644 --- a/hardware/src/lane/lane_sequencer.sv +++ b/hardware/src/lane/lane_sequencer.sv @@ -73,6 +73,12 @@ module lane_sequencer import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg:: logic pe_req_valid; logic pe_req_ready; + // VSLIDEDOWN stride helpers (kept at module scope to remain visible in waveforms) + // Extra elements to ask, because of the stride + logic [$clog2(8*NrLanes)-1:0] extra_stride; + // Need one bit more than vl, since we will also add the stride contribution + logic [$bits(pe_req.vl):0] vl_tot; + fall_through_register #( .T(pe_req_t) ) i_pe_req_register ( @@ -279,6 +285,10 @@ module lane_sequencer import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg:: operand_request = '0; operand_request_push = '0; + // Default the slide unit stride helpers (avoids inferred latches) + extra_stride = '0; + vl_tot = '0; + // Make no requests to the lane's VFUs vfu_operation_d = '0; vfu_operation_valid_d = 1'b0; @@ -662,11 +672,6 @@ module lane_sequencer import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg:: (pe_req.vl - pe_req.stride + NrLanes - 1) / NrLanes; end VSLIDEDOWN: begin - // Extra elements to ask, because of the stride - automatic logic [$clog2(8*NrLanes)-1:0] extra_stride; - // Need one bit more than vl, since we will also add the stride contribution - automatic logic [$bits(pe_req.vl):0] vl_tot; - // We need to trim full words from the start of the vector that are not used // as operands by the slide unit. 
operand_request[SlideAddrGenA].vstart = pe_req.stride / NrLanes; diff --git a/hardware/src/lane/simd_alu.sv b/hardware/src/lane/simd_alu.sv index 33a752388..24ac835ed 100644 --- a/hardware/src/lane/simd_alu.sv +++ b/hardware/src/lane/simd_alu.sv @@ -55,6 +55,7 @@ module simd_alu import ara_pkg::*; import rvv_pkg::*; #( alu_sat_operand_t sat_sum, sat_sub; vxsat_t vxsat; vxrm_t vxrm; + logic r; assign vxrm = vxrm_i; assign vxsat_o = vxsat; @@ -113,6 +114,7 @@ module simd_alu import ara_pkg::*; import rvv_pkg::*; #( // Default assignment res = '0; vxsat.w64 = '0; + r = '0; if (valid_i) unique case (op_i) @@ -182,9 +184,7 @@ module simd_alu import ara_pkg::*; import rvv_pkg::*; #( res.w64[b] = &vxsat.w64[b] ? (sum[63] ? {1'b0, {63{1'b1}}} : {1'b1, {63{1'b0}}} ) : sum[63:0]; end endcase - VAADD, VAADDU: if (FixPtSupport == FixedPointEnable) begin - automatic logic r; - unique case (vew_i) + VAADD, VAADDU: if (FixPtSupport == FixedPointEnable) unique case (vew_i) EW8: for (int b = 0; b < 8; b++) begin automatic logic [ 8:0] sum = opa.w8 [b] + opb.w8 [b]; unique case (vxrm) @@ -225,7 +225,7 @@ module simd_alu import ara_pkg::*; import rvv_pkg::*; #( endcase res.w64[b] = (op_i == VAADDU) ? sum[64:1] + r : {sum[63], sum[63:1]} + r; end - endcase end + endcase VADD, VADC, VMADC, VREDSUM, VWREDSUMU, VWREDSUM: unique case (vew_i) EW8: for (int b = 0; b < 8; b++) begin automatic logic [ 8:0] sum = opa.w8 [b] + opb.w8 [b] + @@ -324,9 +324,7 @@ module simd_alu import ara_pkg::*; import rvv_pkg::*; #( res.w64[b] = &vxsat.w64[b] ? (opb.w64[b][63] ? 
64'h8000000000000000 : 64'h7FFFFFFFFFFFFFFF) : sub[63:0]; end endcase - VASUB, VASUBU: if (FixPtSupport == FixedPointEnable) begin - automatic logic r; - unique case (vew_i) + VASUB, VASUBU: if (FixPtSupport == FixedPointEnable) unique case (vew_i) EW8: for (int b = 0; b < 8; b++) begin automatic logic [ 8:0] sub = opb.w8 [b] - opa.w8 [b]; unique case (vxrm) @@ -367,7 +365,7 @@ module simd_alu import ara_pkg::*; import rvv_pkg::*; #( endcase res.w64[b] = (op_i == VASUBU) ? (sub[63:0] >> 1) + {63'b0, r} : $unsigned(($signed(sub[63:0]) >>> 1) + $signed({63'b0, r})); end - endcase end + endcase // Shift instructions VSLL: unique case (vew_i) diff --git a/hardware/src/lane/vmfpu.sv b/hardware/src/lane/vmfpu.sv index cdf8a7d48..4c296a3f0 100644 --- a/hardware/src/lane/vmfpu.sv +++ b/hardware/src/lane/vmfpu.sv @@ -280,6 +280,10 @@ module vmfpu import ara_pkg::*; import rvv_pkg::*; import fpnew_pkg::*; logic narrowing_select_in_d, narrowing_select_in_q; // Output selector, used to control the Result MUX and validate the results logic narrowing_select_out_d, narrowing_select_out_q; + // FPU SIMD result needs to be shuffled for narrowing instructions before commit + elen_t narrowing_shuffled_result; + // Helper signal to shuffle the narrowed result + logic [7:0] narrowing_shuffle_be; ////////////////// // Multiplier // @@ -1120,12 +1124,20 @@ module vmfpu import ara_pkg::*; import rvv_pkg::*; import fpnew_pkg::*; fpu_mask_t vfpu_flag_mask; + vf7_flag_out_e16 vfrec7_out_e16[4]; + vf7_flag_out_e32 vfrec7_out_e32[2]; + vf7_flag_out_e64 vfrec7_out_e64[1]; + status_t vfrec7_ex_flag, vfrsqrt7_ex_flag; roundmode_e fp_rm_process; elen_t [LatFNonComp:0] operand_a_d, vfpu_flag_mask_d; + vf7_flag_out_e16 vfrsqrt7_out_e16[4]; + vf7_flag_out_e32 vfrsqrt7_out_e32[2]; + vf7_flag_out_e64 vfrsqrt7_out_e64[1]; + logic [15:0] lzc_e16; logic [9:0] lzc_e32; logic [5:0] lzc_e64; @@ -1187,13 +1199,19 @@ module vmfpu import ara_pkg::*; import rvv_pkg::*; import fpnew_pkg::*; assign fp_rm_process = 
vinsn_processing_q.fp_rm; always_comb begin: fpu_result_processing_p + // Default the vfrec7/vfrsqrt7 scratch arrays (avoids inferred latches) + vfrec7_out_e16 = '{default: '0}; + vfrec7_out_e32 = '{default: '0}; + vfrec7_out_e64 = '{default: '0}; + vfrsqrt7_out_e16 = '{default: '0}; + vfrsqrt7_out_e32 = '{default: '0}; + vfrsqrt7_out_e64 = '{default: '0}; if (FPExtSupport) begin // vfrec7 (only supported on 16, 32, 64-bit) unique case (vinsn_processing_q.vtype.vsew) EW16: begin - automatic vf7_flag_out_e16 vfrec7_out_e16[4]; for (int h = 0; h < 4; h++) vfrec7_out_e16[h] = vfrec7_fp16(vfpu_result[h*16 +: 10], operand_a_delay[h*16 +: 16], fp_rm_process); @@ -1206,7 +1224,6 @@ module vmfpu import ara_pkg::*; import rvv_pkg::*; import fpnew_pkg::*; | (vfrec7_out_e16[0].ex_flag & {5{vfpu_flag_mask[0]}}); end EW32: begin - automatic vf7_flag_out_e32 vfrec7_out_e32[2]; for (int w = 0; w < 2; w++) vfrec7_out_e32[w] = vfrec7_fp32(vfpu_result[w*32 +: 10], operand_a_delay[w*32 +: 32], fp_rm_process); @@ -1216,7 +1233,6 @@ module vmfpu import ara_pkg::*; import rvv_pkg::*; import fpnew_pkg::*; | (vfrec7_out_e32[0].ex_flag & {5{vfpu_flag_mask[0]}}); end EW64: begin - automatic vf7_flag_out_e64 vfrec7_out_e64[1]; for (int d = 0; d < 1; d++) vfrec7_out_e64[d] = vfrec7_fp64(vfpu_result[d*64 +: 10], operand_a_delay[d*64 +: 64], fp_rm_process); @@ -1233,7 +1249,6 @@ module vmfpu import ara_pkg::*; import rvv_pkg::*; import fpnew_pkg::*; // vfrsqrt7 (only supported on 16, 32, 64-bit) unique case (vinsn_processing_q.vtype.vsew) EW16: begin - automatic vf7_flag_out_e16 vfrsqrt7_out_e16[4]; for (int h = 0; h < 4; h++) vfrsqrt7_out_e16[h] = vfrsqrt7_fp16(vfpu_result[h*16 +: 10], operand_a_delay[h*16 +: 16], lzc_e16[h*4 +: 4]); @@ -1246,7 +1261,6 @@ module vmfpu import ara_pkg::*; import rvv_pkg::*; import fpnew_pkg::*; | (vfrsqrt7_out_e16[0].ex_flag & {5{vfpu_flag_mask[0]}}); end EW32: begin - automatic vf7_flag_out_e32 vfrsqrt7_out_e32[2]; for (int w = 0; w < 2; w++) 
vfrsqrt7_out_e32[w] = vfrsqrt7_fp32(vfpu_result[w*32 +: 10], operand_a_delay[w*32 +: 32], lzc_e32[w*5 +: 5]); @@ -1256,7 +1270,6 @@ module vmfpu import ara_pkg::*; import rvv_pkg::*; import fpnew_pkg::*; | (vfrsqrt7_out_e32[0].ex_flag & {5{vfpu_flag_mask[0]}}); end EW64: begin - automatic vf7_flag_out_e64 vfrsqrt7_out_e64[1]; for (int d = 0; d < 1; d++) vfrsqrt7_out_e64[d] = vfrsqrt7_fp64(vfpu_result[d*64 +: 10], operand_a_delay[d*64 +: 64], lzc_e64[d*6 +: 6]); @@ -1378,6 +1391,8 @@ module vmfpu import ara_pkg::*; import rvv_pkg::*; import fpnew_pkg::*; narrowing_select_in_d = narrowing_select_in_q; narrowing_select_out_d = narrowing_select_out_q; + narrowing_shuffled_result = '0; + narrowing_shuffle_be = '0; // Inform our status to the lane controller mfpu_ready_o = !vinsn_queue_full; @@ -1484,11 +1499,6 @@ module vmfpu import ara_pkg::*; import rvv_pkg::*; import fpnew_pkg::*; case (mfpu_state_q) NO_REDUCTION: begin - // FPU SIMD result needs to be shuffled for narrowing instructions before commit - automatic elen_t narrowing_shuffled_result; - // Helper signal to shuffle the narrowed result - automatic logic [7:0] narrowing_shuffle_be; - vfpu_tag_in = mask_i; // Sign injection diff --git a/hardware/src/masku/masku.sv b/hardware/src/masku/masku.sv index 7202d5ccc..6a52c2462 100644 --- a/hardware/src/masku/masku.sv +++ b/hardware/src/masku/masku.sv @@ -673,6 +673,12 @@ module masku import ara_pkg::*; import rvv_pkg::*; #( // Information about which is the target FU of the request assign masku_operand_fu = (vinsn_issue.op inside {[VMFEQ:VMFGE]}) ? 
MaskFUMFpu : MaskFUAlu; + // VRGATHER/VCOMPRESS element buffers (kept at module scope to remain visible in waveforms) + // Buffer for the current element + logic [NrLanes*DataWidth-1:0] vrgat_res; + // Buffer for the current element + logic [DataWidth-1:0] vrgat_buf; + always_comb begin // Tail-agnostic bus alu_result = '1; @@ -688,6 +694,10 @@ module masku import ara_pkg::*; import rvv_pkg::*; #( vrgat_m_seq_bit = 1'b0; + // Default the VRGATHER/VCOMPRESS buffers (avoids inferred latches) + vrgat_res = '1; + vrgat_buf = '0; + // The result mask should be created here since the output is a non-mask vector be_viota_seq_d = be_viota_seq_q; @@ -802,11 +812,6 @@ module masku import ara_pkg::*; import rvv_pkg::*; #( // This operation writes vsew-bit elements with vtype.vsew encoding // The vd source can have a different encoding (it gets deshuffled in the masku_operand stage) [VRGATHER:VCOMPRESS]: begin - // Buffer for the current element - automatic logic [NrLanes*DataWidth-1:0] vrgat_res; - // Buffer for the current element - automatic logic [DataWidth-1:0] vrgat_buf; - // Extract the correct elements vrgat_res = '1; // Default assignment vrgat_buf = masku_operand_vd_seq[vrgat_req_idx_q[idx_width(NrLanes*ELENB/8)-1:0] * 64 +: 64]; // Default assignment diff --git a/hardware/src/vlsu/vldu.sv b/hardware/src/vlsu/vldu.sv index 519ceb2a1..ae5997397 100644 --- a/hardware/src/vlsu/vldu.sv +++ b/hardware/src/vlsu/vldu.sv @@ -234,6 +234,8 @@ module vldu import ara_pkg::*; import rvv_pkg::*; #( logic [idx_width(AxiDataWidth/8):0] axi_r_byte_pnt_d, axi_r_byte_pnt_q; // - A pointer to which byte in the full VRF word we are writing data into. logic [idx_width(DataWidth*NrLanes/8):0] vrf_word_byte_pnt_d, vrf_word_byte_pnt_q; + // - A pointer that indicates the start byte in the vrf word. 
+ logic [$clog2(8*NrLanes)-1:0] vrf_word_start_byte; // A counter that follows the vrf_word_byte_pnt pointer, but without the vstart information // We can compare this counter witht the issue_cnt_bytes counter to find the last byte in @@ -288,6 +290,7 @@ module vldu import ara_pkg::*; import rvv_pkg::*; #( seq_word_wr_offset_d = seq_word_wr_offset_q; first_payload_byte_d = first_payload_byte_q; vrf_word_byte_cnt_d = vrf_word_byte_cnt_q; + vrf_word_start_byte = '0; // Vector instructions currently running vinsn_running_d = vinsn_running_q & pe_vinsn_running_i; @@ -462,8 +465,6 @@ module vldu import ara_pkg::*; import rvv_pkg::*; #( // Prepare for the next vector instruction if (vinsn_queue_d.issue_cnt != 0) begin : issue_cnt_bytes_update - // - A pointer that indicates the start byte in the vrf word. - automatic logic [$clog2(8*NrLanes)-1:0] vrf_word_start_byte; issue_cnt_bytes_d = ( vinsn_queue_q.vinsn[vinsn_queue_d.issue_pnt].vl - vinsn_queue_q.vinsn[vinsn_queue_d.issue_pnt].vstart @@ -649,8 +650,6 @@ module vldu import ara_pkg::*; import rvv_pkg::*; #( // New instruction with new vstart. Initialize the vrf byte ptr if (vinsn_queue_d.issue_cnt == '0) begin - // - A pointer that indicates the start byte in the vrf word. - automatic logic [$clog2(8*NrLanes)-1:0] vrf_word_start_byte; vrf_word_start_byte = pe_req_i.vstart[$clog2(8*NrLanes)-1:0] << pe_req_i.vtype.vsew; vrf_word_byte_pnt_d = {1'b0, vrf_word_start_byte[$clog2(8*NrLanes)-1:0]}; vrf_word_byte_cnt_d = '0; diff --git a/hardware/src/vlsu/vstu.sv b/hardware/src/vlsu/vstu.sv index 811589139..0f3ac84c0 100644 --- a/hardware/src/vlsu/vstu.sv +++ b/hardware/src/vlsu/vstu.sv @@ -227,6 +227,7 @@ module vstu import ara_pkg::*; import rvv_pkg::*; #( // When vstart > 0, the very first payload written to the VRF contains less than // (8 * NrLanes) bytes. 
logic [$clog2(8*NrLanes):0] first_payload_byte_d, first_payload_byte_q; + logic [$clog2(8*NrLanes):0] vrf_eff_write_bytes; // A counter that follows the vrf_word_byte_pnt pointer, but without the vstart information // We can compare this counter witht the issue_cnt_bytes counter to find the last byte in @@ -249,6 +250,7 @@ module vstu import ara_pkg::*; import rvv_pkg::*; #( vinsn_valid_bytes = '0; axi_valid_bytes = '0; valid_bytes = '0; + vrf_eff_write_bytes = '0; // Maintain state vinsn_queue_d = vinsn_queue_q; @@ -366,7 +368,6 @@ module vstu import ara_pkg::*; import rvv_pkg::*; #( // We consumed a whole word from the lanes if (vrf_pnt_d == NrLanes*8 || vrf_cnt_d == issue_cnt_bytes_q) begin : vrf_word_done - automatic logic [$clog2(8*NrLanes):0] vrf_eff_write_bytes; // Reset the pointer in the VRF word vrf_pnt_d = '0; vrf_cnt_d = '0; From 7af1b0c90517e7788705516395458cf4978f078a Mon Sep 17 00:00:00 2001 From: Michael Rogenmoser Date: Tue, 2 Jun 2026 02:09:53 +0200 Subject: [PATCH 5/6] [hardware] Fix VSSUB saturation check (OR-reduce, not AND) PR review (hopang-0221): unlike VSSUBU, the VSSUB saturation flag is not replicated across all bits of vxsat.w{16,32,64}[b] (only the LSB is set), so the AND-reduction introduced for the slang -Wint-bool-conv fix is always 0 and saturation never triggers. Use OR-reduction instead, which matches the original non-zero test and clears the warning. 
Co-Authored-By: Claude Opus 4.8 (1M context) --- hardware/src/lane/simd_alu.sv | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/hardware/src/lane/simd_alu.sv b/hardware/src/lane/simd_alu.sv index 24ac835ed..48d22038b 100644 --- a/hardware/src/lane/simd_alu.sv +++ b/hardware/src/lane/simd_alu.sv @@ -309,19 +309,19 @@ module simd_alu import ara_pkg::*; import rvv_pkg::*; #( automatic logic [16:0] sub = opb.w16[b] - opa.w16[b]; vxsat.w16[b] = (!opb.w16[b][15] & opa.w16[b][15] & sub[15]) | (opb.w16[b][15] & !opa.w16[b][15] & !sub[15]); - res.w16[b] = &vxsat.w16[b] ? (opb.w16[b][15] ? 16'h8000 : 16'h7FFF) : sub[15:0]; + res.w16[b] = |vxsat.w16[b] ? (opb.w16[b][15] ? 16'h8000 : 16'h7FFF) : sub[15:0]; end EW32: for (int b = 0; b < 2; b++) begin automatic logic [32:0] sub = opb.w32[b] - opa.w32[b]; vxsat.w32[b] = (!opb.w32[b][31] & opa.w32[b][31] & sub[31]) | (opb.w32[b][31] & !opa.w32[b][31] & !sub[31]); - res.w32[b] = &vxsat.w32[b] ? (opb.w32[b][31] ? 32'h80000000 : 32'h7FFFFFFF) : sub[31:0]; + res.w32[b] = |vxsat.w32[b] ? (opb.w32[b][31] ? 32'h80000000 : 32'h7FFFFFFF) : sub[31:0]; end EW64: for (int b = 0; b < 1; b++) begin automatic logic [64:0] sub = opb.w64[b] - opa.w64[b]; vxsat.w64[b] = (!opb.w64[b][63] & opa.w64[b][63] & sub[63]) | (opb.w64[b][63] & !opa.w64[b][63] & !sub[63]); - res.w64[b] = &vxsat.w64[b] ? (opb.w64[b][63] ? 64'h8000000000000000 : 64'h7FFFFFFFFFFFFFFF) : sub[63:0]; + res.w64[b] = |vxsat.w64[b] ? (opb.w64[b][63] ? 
64'h8000000000000000 : 64'h7FFFFFFFFFFFFFFF) : sub[63:0]; end endcase VASUB, VASUBU: if (FixPtSupport == FixedPointEnable) unique case (vew_i) From 6fe2091634c81cd1a46c6a5644b9869fc866a2dc Mon Sep 17 00:00:00 2001 From: Michael Rogenmoser Date: Tue, 2 Jun 2026 02:09:53 +0200 Subject: [PATCH 6/6] [hardware] Simplify sldu zero-padding for readability PR review (hopang-0221): replace the verbose explicit zero-pad replications (added to avoid an unsized '0 inside a concatenation) with equivalent implicit zero-extension and a plain literal comparison. out_pnt_d is wide enough to hold {red_stride_cnt_d, 3'b0}, and the popc compare is against the value 1 - no functional change. Co-Authored-By: Claude Opus 4.8 (1M context) --- hardware/src/sldu/sldu.sv | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/hardware/src/sldu/sldu.sv b/hardware/src/sldu/sldu.sv index f493d10b1..14770641a 100644 --- a/hardware/src/sldu/sldu.sv +++ b/hardware/src/sldu/sldu.sv @@ -612,7 +612,7 @@ module sldu import ara_pkg::*; import rvv_pkg::*; #( // Filled up a word to the VRF or finished the instruction if (out_pnt_d == NrLanes * 8 || issue_cnt_q <= byte_count) begin // Reset the pointer - out_pnt_d = vinsn_issue_q.vfu inside {VFU_Alu, VFU_MFpu} ? {{idx_width(NrLanes*(StrbWidth-1)){1'b0}}, red_stride_cnt_d, 3'b0} : '0; + out_pnt_d = vinsn_issue_q.vfu inside {VFU_Alu, VFU_MFpu} ? {red_stride_cnt_d, 3'b0} : '0; // We used all the bits of the mask if (vinsn_issue_q.op inside {VSLIDEUP, VSLIDEDOWN}) mask_ready_d = !vinsn_issue_q.vm; @@ -754,7 +754,7 @@ module sldu import ara_pkg::*; import rvv_pkg::*; #( // Update the p2 stride p2_stride_gen_update_d = 1'b1; // Commit the final result - if (p2_stride_gen_popc_q == {{(idx_width(idx_width(8*NrLanes))-1){1'b0}}, 1'b1} && result_queue_empty) begin + if (p2_stride_gen_popc_q == 1 && result_queue_empty) begin state_d = SLIDE_NP2_COMMIT; // Prepare the write pointer result_queue_write_pnt_d = NP2_RESULT_PNT;